PyPI - datamule - Versions diffs - 1.4.4__tar.gz → 1.4.5__tar.gz - Mend

datamule 1.4.4.tar.gz → 1.4.5.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (68) hide show

{datamule-1.4.4 → datamule-1.4.5}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 1.4.4
+Version: 1.4.5
 Summary: Work with SEC submissions at scale.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman

{datamule-1.4.4 → datamule-1.4.5}/datamule/document/document.py RENAMED Viewed

@@ -12,6 +12,19 @@ from .processing import process_tabular_data
 from pathlib import Path
 import webbrowser
+def convert_bytes_keys(obj):
+    if isinstance(obj, dict):
+        return {
+            (k.decode('utf-8').lower() if isinstance(k, bytes) else k): convert_bytes_keys(v)
+            for k, v in obj.items()
+        }
+    elif isinstance(obj, list):
+        return [convert_bytes_keys(item) for item in obj]
+    elif isinstance(obj, bytes):
+        return obj.decode('utf-8').lower()
+    else:
+        return obj
 class Document:
     def __init__(self, type, content, extension,accession,filing_date,path=None):
@@ -19,7 +32,11 @@ class Document:
         extension = extension.lower()
         self.accession = accession
         self.filing_date = filing_date
-        self.content = content
+        if self.type == 'submission_metadata':
+            self.content = convert_bytes_keys(content)
+        else:
+            self.content = content
         if path is not None:
             self.path = path
@@ -157,8 +174,40 @@ class Document:
                 dict_appntc_html
             elif self.type == 'CB':
                 mapping_dict = dict_cb_html
+            elif self.type == 'DSTRBRPT':
+                mapping_dict = dict_dstrbrpt_html
+            elif self.type == 'N-18F1':
+                mapping_dict = dict_n18f1_html
+            elif self.type == 'N-CSRS':
+                mapping_dict = dict_ncsrs_html
+            elif self.type == 'NT-10K':
+                mapping_dict = dict_nt10k_html
+            elif self.type == 'NT-10Q':
+                mapping_dict = dict_nt10q_html
+            elif self.type == 'NT 20-F':
+                mapping_dict = dict_nt20f_html
+            elif self.type == 'NT-NCEN':
+                mapping_dict = dict_ntncen_html
+            elif self.type == 'NT-NCSR':
+                mapping_dict = dict_ntncsr_html
+            elif self.type == 'NTFNCEN':
+                mapping_dict = dict_ntfcen_html
+            elif self.type == 'NTFNCSR':
+                mapping_dict = dict_ntfncsr_html
+            elif self.type == 'EX-99.CERT':
+                mapping_dict = dict_ex99cert_html
+            elif self.type == 'SC 13E3':
+                mapping_dict = dict_sc13e3_html
+            elif self.type == 'SC 14D9':
+                mapping_dict = dict_sc14d9_html
+            elif self.type == 'SP 15D2':
+                mapping_dict = dict_sp15d2_html
             elif self.type == 'SD':
                 mapping_dict = dict_sd_html
+            elif self.type == 'T-3':
+                mapping_dict = dict_t3_html
             elif self.type in ['NT 10-K', 'NT 10-Q','NT 20-F']:
                 mapping_dict = dict_nt10k_html

{datamule-1.4.4 → datamule-1.4.5}/datamule/mapping_dicts/html_mapping_dicts.py RENAMED Viewed

@@ -26,6 +26,12 @@ dict_abs15g_html = {
 dict_nt10k_html = {
     ('part',r'^part\s*([ivx]+)') : 0,
 }
+dict_nt10q_html = dict_nt10k_html
+dict_nt20f_html = dict_nt10k_html
+dict_ntncen_html = dict_nt10k_html
+dict_ntncsr_html = dict_nt10k_html
+dict_ntfcen_html = dict_nt10k_html
+dict_ntfncsr_html = dict_nt10k_html
 dict_1kpartii_html = {
     ('item',r'^item\s*(\d+)') : 0,
@@ -72,4 +78,44 @@ dict_appntc_html = {('agency',r'^agency') : 0,
 dict_cb_html = {
     ('part', r'^part\s*([ivx]+)') : 0,
     ('item', r'^item\s*(\d+)') : 1,
-}
+}
+dict_dstrbrpt_html = dict_1kpartii_html
+dict_n18f1_html = {
+    ('notification of election', r'^notification of election') : 0,
+    ('signatures', r'^signatures?\.*$') : 0,
+}
+dict_ex99cert_html = {
+    ('item',r'^(\d+)') : 0,
+    ('letter',r'^\(?([a-z])') : 1,
+}
+dict_ncsrs_html = {
+    ('item',r'^(\d+)') : 0,
+    ('signatures',r'^signatures?\.*$') : 0,
+}
+dict_sc13e3_html = {
+    ('item', r'^item\s*(\d+)') : 0,
+    ('signatures', r'^signatures?\.*$') : 0,
+    ('letter', r'^\(?([a-z])') : 1,
+}
+dict_sc14d9_html = {
+    ('item', r'^item\s*(\d+)') : 0,
+    ('signatures', r'^signatures?\.*$') : 0,
+    ('annex', r'^annex') : 0,
+}
+dict_sp15d2_html = dict_10k_html
+dict_t3_html = {('general',r'^general'):0,
+                ('affiliations',r'^affiliations'):0,
+                ('management and control',r'^management and control'):0,
+                ('underwriters',r'^underwriters'):0,
+                ('capital securities',r'^capital securities'):0,
+                ('indenture securities',r'^indenture securities'):0,
+                ('signatures',r'^signatures?\.*$') : 0,
+                ('number',r'^(\d+)') : 1,}

{datamule-1.4.4 → datamule-1.4.5}/datamule/sec/submissions/downloader.py RENAMED Viewed

@@ -9,7 +9,7 @@ async def download_callback(hit, content, cik, accno, url, output_dir="filings",
     try:
         # Create a Submission object directly from the content
         # Note: the content needs to be decoded from bytes to string for the parser
-        submission = Submission(sgml_content=content.decode('utf-8', errors='replace'),
+        submission = Submission(sgml_content=content,
                                keep_document_types=keep_document_types)
         # Use the async save method to write the submission to disk

{datamule-1.4.4 → datamule-1.4.5}/datamule/seclibrary/downloader.py RENAMED Viewed

@@ -99,6 +99,7 @@ class Downloader:
                 self.downloader._run_coroutine(submission.save_async(output_dir=self.output_dir))
                 self.pbar.update(1)
             except Exception as e:
+                print(f"Exception {e} in {filename}")
                 accession_dir = os.path.join(self.output_dir, filename.split('.')[0])
                 if os.path.exists(accession_dir):
                     shutil.rmtree(accession_dir)
@@ -143,7 +144,7 @@ class Downloader:
             with dctx.stream_reader(input_buffer) as reader:
                 shutil.copyfileobj(reader, decompressed_content)
-            content = decompressed_content.getvalue().decode('utf-8')
+            content = decompressed_content.getvalue()
             processor.processing_queue.put((filename, content))
             return True
@@ -159,7 +160,7 @@ class Downloader:
     def save_regular_file(self, chunks, filename, output_dir, processor):
         try:
-            content = b''.join(chunks).decode('utf-8')
+            content = b''.join(chunks)
             processor.processing_queue.put((filename, content))
             return True

{datamule-1.4.4 → datamule-1.4.5}/datamule/submission.py RENAMED Viewed

@@ -1,7 +1,7 @@
 from pathlib import Path
 import json
 from .document.document import Document
-from secsgml import parse_sgml_submission_into_memory
+from secsgml import parse_sgml_content_into_memory
 import os
 import aiofiles
 import tempfile
@@ -79,9 +79,8 @@ class Submission:
         if sgml_content is not None:
             self.path = None
-            metadata, raw_documents = parse_sgml_submission_into_memory(sgml_content)
+            metadata, raw_documents = parse_sgml_content_into_memory(sgml_content)
             self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=None)
             # code dupe
             self.accession = self.metadata.content['accession-number']
             self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"
@@ -95,7 +94,9 @@ class Submission:
                 # Keep only specified types
                 if keep_document_types is not None and type not in keep_document_types:
                     continue
-                filename = doc.get('filename')
+                # write as txt if not declared
+                filename = doc.get('filename','.txt')
                 extension = Path(filename).suffix
                 self.documents.append(Document(type=type, content=raw_documents[idx], extension=extension,filing_date=self.filing_date,accession=self.accession))
@@ -190,12 +191,9 @@ class Submission:
             json.dump(self.metadata.content, f, indent=4)
         for idx, doc in enumerate(self.metadata.content['documents']):
-            try:
-                filename = doc.get('filename')
-                if filename is None:
-                    filename = f"{doc.get('sequence', idx)}.txt"
-            except (KeyError, IndexError):
-                filename = f"{idx}.txt"
+            filename = doc.get('filename')
+            if filename is None:
+                filename = f"{doc.get('sequence')}.txt"
             doc_path = file_dir / filename
@@ -231,12 +229,11 @@ class Submission:
             await f.write(json.dumps(self.metadata.content, indent=4))
         for idx, doc in enumerate(self.metadata.content['documents']):
-            try:
-                filename = doc.get('filename')
-                if filename is None:
-                    filename = f"{doc.get('sequence', idx)}.txt"
-            except (KeyError, IndexError):
-                filename = f"{idx}.txt"
+            filename = doc.get('filename')
+            # oh we need handling here for sequences case
+            if filename is None:
+                filename = doc['sequence'] + '.txt'
             doc_path = file_dir / filename

{datamule-1.4.4 → datamule-1.4.5}/datamule.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 1.4.4
+Version: 1.4.5
 Summary: Work with SEC submissions at scale.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman

{datamule-1.4.4 → datamule-1.4.5}/setup.py RENAMED Viewed

@@ -32,7 +32,7 @@ if not os.path.exists(file_path):
 setup(
     name="datamule",
     author="John Friedman",
-    version="1.4.4",
+    version="1.4.5",
     description="Work with SEC submissions at scale.",
     packages=find_packages(include=['datamule', 'datamule.*']),
     url="https://github.com/john-friedman/datamule-python",