datamule 1.2.0__py3-none-any.whl → 1.2.3__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.
- datamule/__init__.py +3 -1
- datamule/document/__init__.py +0 -0
- datamule/document/document.py +255 -0
- datamule/document/processing.py +604 -0
- datamule/document/table.py +260 -0
- datamule/package_updater.py +30 -0
- datamule/portfolio.py +5 -3
- datamule/sec/submissions/downloader.py +14 -37
- datamule/seclibrary/downloader.py +50 -9
- datamule/submission.py +111 -26
- {datamule-1.2.0.dist-info → datamule-1.2.3.dist-info}/METADATA +1 -1
- {datamule-1.2.0.dist-info → datamule-1.2.3.dist-info}/RECORD +14 -10
- datamule/document.py +0 -465
- {datamule-1.2.0.dist-info → datamule-1.2.3.dist-info}/WHEEL +0 -0
- {datamule-1.2.0.dist-info → datamule-1.2.3.dist-info}/top_level.txt +0 -0
datamule/submission.py
CHANGED
```diff
@@ -1,8 +1,10 @@
 from pathlib import Path
 import json
-from .document import Document
+from .document.document import Document
 from secsgml import parse_sgml_submission_into_memory
-
+import os
+import aiofiles
+
 
 class Submission:
     def __init__(self, path=None,sgml_content=None,keep_document_types=None):
```
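The import change reflects the split of the former `datamule/document.py` module into a `datamule/document/` package (`document.py`, `processing.py`, `table.py`; see the file list above). A minimal sketch of what that means for imports, assuming `datamule/__init__.py` still re-exports the class (its +3/-1 change suggests the public entry point was updated rather than dropped):

```python
# Old layout (<= 1.2.0): datamule/document.py was a single module.
# from datamule.document import Document

# New layout (1.2.3): the class lives one level deeper.
from datamule.document.document import Document

# If the top-level re-export exists, prefer it; it is insulated from
# internal layout changes like this one.
# from datamule import Document
```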
```diff
@@ -14,7 +16,13 @@ class Submission:
         if sgml_content is not None:
             self.path = None
             self.metadata, raw_documents = parse_sgml_submission_into_memory(sgml_content)
+
+            # code dupe
+            self.accession = self.metadata['accession-number']
+            self.filing_date= f"{self.metadata['filing-date'][:4]}-{self.metadata['filing-date'][4:6]}-{self.metadata['filing-date'][6:8]}"
+
             self.documents = []
+            filtered_metadata_documents = []
 
             for idx,doc in enumerate(self.metadata['documents']):
                 type = doc.get('type')
@@ -24,16 +32,25 @@ class Submission:
                     continue
                 filename = doc.get('filename')
                 extension = Path(filename).suffix
-                self.documents.append(Document(type=type, content=raw_documents[idx], extension=extension))
+                self.documents.append(Document(type=type, content=raw_documents[idx], extension=extension,filing_date=self.filing_date,accession=self.accession))
 
+                filtered_metadata_documents.append(doc)
+
+            self.metadata['documents'] = filtered_metadata_documents
 
         if path is not None:
             self.path = Path(path)
             metadata_path = self.path / 'metadata.json'
             with metadata_path.open('r') as f:
                 self.metadata = json.load(f)
+
+            # Code dupe
+            self.accession = self.metadata['accession-number']
+            self.filing_date= f"{self.metadata['filing-date'][:4]}-{self.metadata['filing-date'][4:6]}-{self.metadata['filing-date'][6:8]}"
 
 
+
+
     def document_type(self, document_type):
         # Convert single document type to list for consistent handling
         if isinstance(document_type, str):
```
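Both constructor branches now set `self.accession` and `self.filing_date` (the `# code dupe` comments are the author's own note about the duplication). The metadata's `filing-date` arrives in compact `YYYYMMDD` form, and the slicing rewrites it as ISO `YYYY-MM-DD`; a quick sketch with a hypothetical value:

```python
# Hypothetical metadata value; the slicing in __init__ assumes YYYYMMDD.
filing_date_raw = '20240115'

# Same slicing as above: year, month, day.
filing_date = f"{filing_date_raw[:4]}-{filing_date_raw[4:6]}-{filing_date_raw[6:8]}"
assert filing_date == '2024-01-15'
```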
```diff
@@ -54,10 +71,13 @@ class Submission:
                 document_path = self.path / filename
                 extension = document_path.suffix
 
-                with document_path.open('
+                with document_path.open('rb') as f:
                     content = f.read()
 
-
+                if extension in ['.htm','.html','.txt','.xml']:
+                    content = content.decode('utf-8', errors='replace')
+
+                yield Document(type=doc['type'], content=content, extension=extension,filing_date=self.filing_date,accession=self.accession,path=document_path)
             # if loaded from sgml_content
             else:
                 yield self.documents[idx]
@@ -78,10 +98,13 @@ class Submission:
 
                 # check if the file exists
                 if document_path.exists():
-                    with document_path.open('
+                    with document_path.open('rb') as f:
                         content = f.read()
 
-
+                    if extension in ['.htm','.html','.txt','.xml']:
+                        content = content.decode('utf-8', errors='replace')
+
+                    yield Document(type=doc['type'], content=content, extension=extension,filing_date=self.filing_date,accession=self.accession,path=document_path)
                 else:
                     print(f"Warning: File {document_path} does not exist likely due to keep types in downloading.")
 
```
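Files are now opened in binary mode and decoded only when the extension marks a text format, so binary members (images, PDFs) pass through untouched. The same pattern in isolation, with a hypothetical path:

```python
from pathlib import Path

document_path = Path('filings/0001234567-24-000001/primary-doc.html')  # hypothetical
content = document_path.read_bytes()  # equivalent to open('rb') + read()

# Decode only known text formats; errors='replace' keeps going past
# stray non-UTF-8 bytes instead of raising UnicodeDecodeError.
if document_path.suffix in ['.htm', '.html', '.txt', '.xml']:
    content = content.decode('utf-8', errors='replace')
```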
```diff
@@ -89,25 +112,87 @@ class Submission:
                 else:
                     yield self.documents[idx]
 
-    # keep documents by document type
-    def keep(self, document_type):
-        # Convert single document type to list for consistent handling
-        if isinstance(document_type, str):
-            document_types = [document_type]
-        else:
-            document_types = document_type
 
-
-
+
+
+    def save(self, output_dir="filings"):
+        file_dir = Path(output_dir) / str(self.accession)
+        file_dir.mkdir(parents=True, exist_ok=True)
+
+        metadata_path = file_dir / "metadata.json"
+        with open(metadata_path, 'w') as f:
+            json.dump(self.metadata, f, indent=4)
+
+        for idx, doc in enumerate(self.metadata['documents']):
+            try:
                 filename = doc.get('filename')
-
-
-
-
-
+                if filename is None:
+                    filename = f"{doc.get('sequence', idx)}.txt"
+            except (KeyError, IndexError):
+                filename = f"{idx}.txt"
+
+            doc_path = file_dir / filename
+
+            if self.path is not None:
+                if hasattr(self, 'documents') and self.documents:
+                    content = self.documents[idx].content
+                else:
+                    orig_doc_path = self.path / filename
+                    if orig_doc_path.exists():
+                        with open(orig_doc_path, 'r', encoding='utf-8', errors='replace') as f:
+                            content = f.read()
+                    else:
+                        print(f"Warning: File {orig_doc_path} does not exist, skipping.")
+                        continue
+            else:
+                content = self.documents[idx].content
+
+            if isinstance(content, bytes):
+                with open(doc_path, 'wb') as f:
+                    f.write(content)
+            else:
+                with open(doc_path, 'w', encoding='utf-8', errors='replace') as f:
+                    f.write(content)
+
+        return file_dir
 
-
-
-
-
-
+    async def save_async(self, output_dir="filings"):
+        file_dir = Path(output_dir) / str(self.accession)
+        os.makedirs(file_dir, exist_ok=True)
+
+        metadata_path = file_dir / "metadata.json"
+        async with aiofiles.open(metadata_path, 'w') as f:
+            await f.write(json.dumps(self.metadata, indent=4))
+
+        for idx, doc in enumerate(self.metadata['documents']):
+            try:
+                filename = doc.get('filename')
+                if filename is None:
+                    filename = f"{doc.get('sequence', idx)}.txt"
+            except (KeyError, IndexError):
+                filename = f"{idx}.txt"
+
+            doc_path = file_dir / filename
+
+            if self.path is not None:
+                if hasattr(self, 'documents') and self.documents:
+                    content = self.documents[idx].content
+                else:
+                    orig_doc_path = self.path / filename
+                    if orig_doc_path.exists():
+                        async with aiofiles.open(orig_doc_path, 'r', encoding='utf-8', errors='replace') as f:
+                            content = await f.read()
+                    else:
+                        print(f"Warning: File {orig_doc_path} does not exist, skipping.")
+                        continue
+            else:
+                content = self.documents[idx].content
+
+            if isinstance(content, bytes):
+                async with aiofiles.open(doc_path, 'wb') as f:
+                    await f.write(content)
+            else:
+                async with aiofiles.open(doc_path, 'w', encoding='utf-8', errors='replace') as f:
+                    await f.write(content)
+
+        return file_dir
```
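The old `keep()` type filter is removed; in its place, `save()` and `save_async()` write `metadata.json` plus each retained document under `<output_dir>/<accession>/` and return that directory. A usage sketch, assuming a submission built from a downloaded SGML file (the local filename and the top-level `Submission` import are assumptions, not confirmed by this diff):

```python
import asyncio
from datamule import Submission  # assumed top-level re-export

with open('0001234567-24-000001.sgml') as f:  # hypothetical local file
    sub = Submission(sgml_content=f.read())

# Synchronous: writes filings/<accession>/metadata.json and the documents,
# then returns the directory it created.
out_dir = sub.save(output_dir='filings')

# Async variant (aiofiles under the hood), handy when persisting many
# submissions concurrently.
asyncio.run(sub.save_async(output_dir='filings'))
```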
{datamule-1.2.0.dist-info → datamule-1.2.3.dist-info}/RECORD
CHANGED
```diff
@@ -1,11 +1,15 @@
-datamule/__init__.py,sha256=
+datamule/__init__.py,sha256=8KioESb9y0Xwy72WuTfsYZnnMFdCrRhSv8DW-kZ4-To,1066
 datamule/config.py,sha256=Y--CVv7JcgrjJkMOSLrvm2S8B9ost6RMSkGviP-MKtg,883
-datamule/document.py,sha256=CvuyazJ1qP8Ygpv49ikMc8DyGK7N-tApTU2Ccgv57q4,21556
 datamule/helper.py,sha256=xgOVnea-lUlQ5I-U0vYUp0VeKPNZehNhqjJvegA3lYE,3342
 datamule/index.py,sha256=0txvbzPcvY1GsdxA-wGdLzAByxSeE_1VyyBp9mZEQRM,2292
-datamule/
+datamule/package_updater.py,sha256=Z9zaa_y0Z5cknpRn8oPea3gg4kquFHfpfhduKKCZ6NU,958
+datamule/portfolio.py,sha256=so6j2KrkcZOToHIqkANAu3CC4QsfgaUN1zk9CrbRe1E,7225
 datamule/sheet.py,sha256=TvFqK9eAYuVoJ2uWdAlx5EN6vS9lke-aZf7FqtUiDBc,22304
-datamule/submission.py,sha256=
+datamule/submission.py,sha256=HXuFL6snLevGk7DGlvPbjcBOJuccAIxEPXnkA1TXX8Y,8121
+datamule/document/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datamule/document/document.py,sha256=BRnHPVt-vIT7EZTF-c-Ulv3N33xX9zE02Q9mKXVDeuY,9474
+datamule/document/processing.py,sha256=fw-1OWfbmZhG1R8XpJx_vcGwz3_djmk0FrblHAMPmwc,27476
+datamule/document/table.py,sha256=Sv9jTGiVhnWIY9nHaynUUixwbCrvbLsf0fdOnFR-NCY,10791
 datamule/mapping_dicts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/mapping_dicts/txt_mapping_dicts.py,sha256=DQPrGYbAPQxomRUtt4iiMGrwuF7BHc_LeFBQuYBzU9o,6311
 datamule/mapping_dicts/xml_mapping_dicts.py,sha256=Z22yDVwKYonUfM5foQP00dVDE8EHhhMKp0CLqVKV5OI,438
@@ -16,7 +20,7 @@ datamule/sec/infrastructure/submissions_metadata.py,sha256=f1KarzFSryKm0EV8DCDNs
 datamule/sec/rss/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/sec/rss/monitor.py,sha256=6r4EYaSlGu6VYErlj9zXJsIMLVie1cfacSZU-ESfuBI,18231
 datamule/sec/submissions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datamule/sec/submissions/downloader.py,sha256=
+datamule/sec/submissions/downloader.py,sha256=60wX2Yml1UCuxOtU0xMxqqeyHhrypCmlDQ0jZF-StJo,2665
 datamule/sec/submissions/eftsquery.py,sha256=mSZon8rlW8dxma7M49ZW5V02Fn-ENOdt9TNO6elBrhE,27983
 datamule/sec/submissions/monitor.py,sha256=Im2kgnUehhTgyY2Vq3uk07n4Vkj4PjII_SsRDi8ehAE,5384
 datamule/sec/submissions/streamer.py,sha256=EXyWNCD9N6mZmvm9lFSCFodF19zSQ8jfIbWPZNp0K5Y,11253
@@ -28,9 +32,9 @@ datamule/sec/xbrl/streamcompanyfacts.py,sha256=WyJIwuy5mNMXWpx_IkhFzDMe9MOfQ-vNk
 datamule/sec/xbrl/xbrlmonitor.py,sha256=TKFVfSyyUUfUgFQw4WxEVs4g8Nh-2C0tygNIRmTqW3Y,5848
 datamule/seclibrary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/seclibrary/bq.py,sha256=C8sb_rpXTvchprrFLcbRar4Qi0XWW25tnv1YsHSS5o4,18025
-datamule/seclibrary/downloader.py,sha256=
+datamule/seclibrary/downloader.py,sha256=fJztJ_sEfv2oHHbDff07DRlXLmztXnzt3Yvv5YaZgGk,13718
 datamule/seclibrary/query.py,sha256=qGuursTERRbOGfoDcYcpo4oWkW3PCBW6x1Qf1Puiak4,7352
-datamule-1.2.
-datamule-1.2.
-datamule-1.2.
-datamule-1.2.
+datamule-1.2.3.dist-info/METADATA,sha256=3gODk6YjozgMTYnjvXRX_pox_Otkq7tSDZY2LEl6MiU,512
+datamule-1.2.3.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+datamule-1.2.3.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
+datamule-1.2.3.dist-info/RECORD,,
```
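For reference, each RECORD row follows the wheel spec (PEP 376 / PEP 427): `path,sha256=<digest>,<size>`, where the digest is the unpadded urlsafe-base64 SHA-256 of the file's bytes. A sketch that reproduces a row from an unpacked wheel:

```python
import base64
import hashlib
from pathlib import Path

def record_row(path):
    """Build one wheel RECORD line: path,sha256=<digest>,<size>."""
    data = Path(path).read_bytes()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b'=')
    return f"{path},sha256={digest.decode('ascii')},{len(data)}"

# e.g. record_row('datamule/config.py') should reproduce the line above
# when run against an unpacked copy of the 1.2.3 wheel.
```

The digest repeated across the `__init__.py` entries, `47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0`, is simply the SHA-256 of an empty file, which is why every zero-byte `__init__.py` shares it.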