datamule 1.2.2__py3-none-any.whl → 1.2.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamule/__init__.py +1 -0
- datamule/document/document.py +27 -14
- datamule/document/mappings/atsn.py +208 -0
- datamule/document/mappings/cfportal.py +346 -0
- datamule/document/mappings/d.py +125 -0
- datamule/document/mappings/ex102_abs.py +63 -0
- datamule/document/mappings/ex99a_sdr.py +1 -0
- datamule/document/mappings/ex99c_sdr.py +0 -0
- datamule/document/mappings/ex99g_sdr.py +0 -0
- datamule/document/mappings/ex99i_sdr.py +0 -0
- datamule/document/mappings/information_table.py +35 -0
- datamule/document/mappings/nmfp.py +275 -0
- datamule/document/mappings/npx.py +85 -0
- datamule/document/mappings/onefourtyfour.py +68 -0
- datamule/document/mappings/ownership.py +163 -0
- datamule/document/mappings/proxy_voting_record.py +17 -0
- datamule/document/mappings/sbs.py +0 -0
- datamule/document/mappings/sbsef.py +13 -0
- datamule/document/mappings/schedule13.py +117 -0
- datamule/document/mappings/sdr.py +63 -0
- datamule/document/mappings/submission_metadata.py +9 -0
- datamule/document/mappings/ta.py +0 -0
- datamule/document/mappings/thirteenfhr.py +72 -0
- datamule/document/mappings/twentyfivense.py +22 -0
- datamule/document/mappings/twentyfourf2nt.py +100 -0
- datamule/document/processing.py +170 -42
- datamule/document/table.py +60 -5
- datamule/helper.py +10 -1
- datamule/index.py +8 -10
- datamule/portfolio.py +17 -16
- datamule/sec/submissions/monitor.py +173 -120
- datamule/sec/submissions/textsearch.py +0 -4
- datamule/sec/xbrl/streamcompanyfacts.py +1 -1
- datamule/seclibrary/downloader.py +2 -2
- datamule/submission.py +92 -36
- {datamule-1.2.2.dist-info → datamule-1.2.9.dist-info}/METADATA +1 -2
- datamule-1.2.9.dist-info/RECORD +62 -0
- datamule/sec/rss/monitor.py +0 -416
- datamule-1.2.2.dist-info/RECORD +0 -40
- /datamule/{sec/rss → document/mappings}/__init__.py +0 -0
- {datamule-1.2.2.dist-info → datamule-1.2.9.dist-info}/WHEEL +0 -0
- {datamule-1.2.2.dist-info → datamule-1.2.9.dist-info}/top_level.txt +0 -0
datamule/document/processing.py
CHANGED
@@ -17,6 +17,17 @@ def process_tabular_data(self):
         tables = process_13fhr(self.data, self.accession)
     elif self.type in ["INFORMATION TABLE"]:
         tables = process_information_table(self.data, self.accession)
+    elif self.type in ["25-NSE", "25-NSE/A"]:
+        tables = process_25nse(self.data, self.accession)
+    # complete mark:
+    elif self.type in ["EX-102"]:
+        tables = process_ex102_abs(self.data, self.accession)
+    elif self.type in ["D","D/A"]:
+        tables = process_d(self.data, self.accession)
+    elif self.type in ["N-PX","N-PX/A"]:
+        tables = process_npx(self.data, self.accession)
+
+
     elif self.type in ["SBSEF","SBSEF/A","SBSEF-V","SBSEF-W"]:
         tables = process_sbsef(self.data, self.accession)
     elif self.type in ["SDR","SDR/A","SDR-W","SDR-A"]:
@@ -33,8 +44,7 @@ def process_tabular_data(self):
         tables = process_144(self.data, self.accession)
     elif self.type in ["24F-2NT", "24F-2NT/A"]:
         tables = process_24f2nt(self.data, self.accession)
-
-        tables = process_25nse(self.data, self.accession)
+
     elif self.type in ["ATS-N", "ATS-N/A"]:
         tables = process_ats(self.data, self.accession)
     # elif self.type in ["C","C-W","C-U","C-U-W","C/A","C/A-W",
@@ -42,8 +52,7 @@ def process_tabular_data(self):
     #     tables = process_c(self.data, self.accession)
     elif self.type in ["CFPORTAL","CFPORTAL/A","CFPORTAL-W"]:
         tables = process_cfportal(self.data, self.accession)
-
-        # tables = process_d(self.data, self.accession)
+
     # elif self.type in ["MA","MA-A","MA/A","MA-I","MA-I/A","MA-W"]:
     #     tables = process_ma(self.data, self.accession)
     # elif self.type in ["N-CEN","N-CEN/A"]:
@@ -53,8 +62,7 @@ def process_tabular_data(self):
     #     tables = process_nmfp(self.data, self.accession)
     # elif self.type in ["NPORT-P","NPORT-P/A"]:
     #     tables = process_nportp(self.data, self.accession)
-
-        tables = process_npx(self.data, self.accession)
+
     # elif self.type in ["TA-1","TA-1/A","TA-W","TA-2","TA-2/A"]:
     #     tables = process_ta(self.data, self.accession)
     elif self.type in ["X-17A-5","X-17A-5/A"]:
@@ -66,10 +74,11 @@ def process_tabular_data(self):
         tables = process_reg_a(self.data, self.accession)
     # elif self.type in ["SBSE","SBSE/A","SBSE-A","SBSE-A/A","SBSE-BD","SBSE-BD/A","SBSE-C","SBSE-W","SBSE-CCO-RPT","SBSE-CCO-RPT/A"]:
     #     tables = process_sbs(self.data, self.accession)
-
-        # tables = process_ex102_abs(self.data, self.accession)
+
     elif self.type == "PROXY VOTING RECORD":
         tables = process_proxy_voting_record(self.data, self.accession)
+    elif self.type == 'submission_metadata':
+        tables = process_submission_metadata(self.content, self.accession)
     else:
         warn(f"Processing for {self.type} is not implemented yet.")
         return []
@@ -95,6 +104,67 @@ def _flatten_dict(d, parent_key=''):
 
     return items
 
+# flattens in a different way
+def flatten_dict_to_rows(d, parent_key='', sep='_'):
+
+    if isinstance(d, list):
+        # If input is a list, flatten each item and return all rows
+        all_rows = []
+        for item in d:
+            all_rows.extend(flatten_dict_to_rows(item, parent_key, sep))
+        return all_rows
+
+    if not isinstance(d, dict):
+        # If input is a primitive value, return single row
+        return [{parent_key: d}] if parent_key else []
+
+    # Input is a dictionary
+    rows = [{}]
+
+    for k, v in d.items():
+        new_key = f"{parent_key}{sep}{k}" if parent_key else k
+
+        if isinstance(v, dict):
+            # Recursively flatten nested dictionaries
+            nested_rows = flatten_dict_to_rows(v, new_key, sep)
+            # Cross-product with existing rows
+            new_rows = []
+            for row in rows:
+                for nested_row in nested_rows:
+                    combined_row = row.copy()
+                    combined_row.update(nested_row)
+                    new_rows.append(combined_row)
+            rows = new_rows
+
+        elif isinstance(v, list):
+            # Handle lists - create multiple rows
+            if not v:  # Empty list
+                for row in rows:
+                    row[new_key] = ''
+            else:
+                new_rows = []
+                for row in rows:
+                    for list_item in v:
+                        new_row = row.copy()
+                        if isinstance(list_item, dict):
+                            # Recursively flatten dict items in list
+                            nested_rows = flatten_dict_to_rows(list_item, new_key, sep)
+                            for nested_row in nested_rows:
+                                combined_row = new_row.copy()
+                                combined_row.update(nested_row)
+                                new_rows.append(combined_row)
+                        else:
+                            # Primitive value in list
+                            new_row[new_key] = list_item
+                            new_rows.append(new_row)
+                rows = new_rows
+        else:
+            # Handle primitive values
+            for row in rows:
+                row[new_key] = v
+
+    return rows
+
 def process_ownership(data, accession):
     tables = []
     if 'ownershipDocument' not in data:
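
The new flatten_dict_to_rows differs from _flatten_dict in that lists fan out into one output row per element (a cross-product), rather than collapsing into a single flat record. A small worked example of that behavior; the payload is illustrative, and the import path follows the module layout shown in this diff:

# Illustrative input; field names are made up, not from a real filing.
from datamule.document.processing import flatten_dict_to_rows

record = {
    "issuer": {"name": "Acme Fund", "cik": "0001234567"},
    "relatedPersons": [
        {"name": "Alice", "role": "Director"},
        {"name": "Bob", "role": "Officer"},
    ],
}

rows = flatten_dict_to_rows(record)
# The nested dict flattens into issuer_name / issuer_cik on every row,
# and the two-element list produces two rows:
# [{'issuer_name': 'Acme Fund', 'issuer_cik': '0001234567',
#   'relatedPersons_name': 'Alice', 'relatedPersons_role': 'Director'},
#  {'issuer_name': 'Acme Fund', 'issuer_cik': '0001234567',
#   'relatedPersons_name': 'Bob', 'relatedPersons_role': 'Officer'}]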
@@ -346,33 +416,41 @@ def process_cfportal(data, accession):
 
     return tables
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-#
-
-
-
-
-
+def process_d(data, accession):
+    tables = []
+    groups = [('contactData', 'contact_data_d'),
+              ('notificationAddressList', 'notification_address_list_d'),
+              ('primaryIssuer', 'primary_issuer_d'),
+              ('issuerList', 'issuer_list_d'),
+              ('relatedPersonsList', 'related_persons_list_d'),
+              ('offeringData', 'offering_data_d'),
+              ]
+    for group,table_type in groups:
+        if group == 'relatedPersonList':
+            group_data = data['edgarSubmission'].pop('relatedPersonInfo', None)
+            data['edgarSubmission'].pop(group, None)
+        elif group == 'issuerList':
+            group_data = data['edgarSubmission'].pop('issuerList', None)
+        else:
+            group_data = data['edgarSubmission'].pop(group, None)
+
+        if group_data:
+            # Special handling ONLY for relatedPersonsList
+            if group in ['relatedPersonsList', 'issuerList','offeringData']:
+                # Use the new flatten_dict_to_rows ONLY for this key
+                flattened_rows = flatten_dict_to_rows(group_data)
+                if flattened_rows:
+                    tables.append(Table(flattened_rows, table_type, accession))
+            else:
+                # Everything else remains EXACTLY the same
+                tables.append(Table(_flatten_dict(group_data), table_type, accession))
+
+
+
+    metadata_table = Table(_flatten_dict(data['edgarSubmission']), 'metadata_d', accession)
+    tables.append(metadata_table)
 
-
+    return tables
 
 # def process_nmfp(data, accession):
 #     tables = []
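
process_d drains each group out of data['edgarSubmission'] with pop, so whatever keys remain afterward become the metadata_d table. One observation: the special-case comparison checks 'relatedPersonList' (singular "Person") while the groups list spells the key 'relatedPersonsList', so that branch appears unreachable and related persons fall through to the generic pop. The pop-per-group pattern, reduced to plain dicts with stand-in data:

# Stand-in dicts only; real Form D payloads carry many more fields.
submission = {
    "primaryIssuer": {"entityName": "Acme Ventures LLC"},
    "offeringData": {"industryGroupType": "Pooled Investment Fund"},
    "schemaVersion": "X0708",
}

groups = ["primaryIssuer", "offeringData"]
extracted = {group: submission.pop(group, None) for group in groups}

print(extracted["primaryIssuer"])  # {'entityName': 'Acme Ventures LLC'}
print(submission)                  # {'schemaVersion': 'X0708'} -> feeds metadata_d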
@@ -583,13 +661,39 @@ def process_reg_a(data, accession):
 
 # return tables
 
-
-
-
-
-#
-
-
+def process_ex102_abs(data, accession):
+    tables = []
+    data = safe_get(data, ['assetData', 'assets'])
+
+    # Create assets list: all items without their 'property' field
+    assets = [{k: v for k, v in item.items() if k != 'property'} for item in data]
+
+    # Create properties list in a more vectorized way
+    properties = []
+
+    # Handle dictionary properties
+    properties.extend([
+        item['property'] | {'assetNumber': item['assetNumber']}
+        for item in data
+        if 'property' in item and isinstance(item['property'], dict)
+    ])
+
+    # Handle list properties - flatten in one operation
+    properties.extend([
+        prop | {'assetNumber': item['assetNumber']}
+        for item in data
+        if 'property' in item and isinstance(item['property'], list)
+        for prop in item['property']
+        if isinstance(prop, dict)
+    ])
+
+    if assets:
+        tables.append(Table(_flatten_dict(assets), 'assets_ex102_absee', accession))
+
+    if properties:
+        tables.append(Table(_flatten_dict(properties), 'properties_ex102_absee', accession))
+
+    return tables
 
 # def process_ma(data, accession):
 #     tables = []
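
The property split above relies on the dict-merge operator (item['property'] | {...}), which requires Python 3.9+. A condensed sketch of the same split on made-up ABS-EE-style rows:

# Made-up rows; only the shape matters here.
data = [
    {"assetNumber": "1", "loanAmount": "250000",
     "property": {"state": "CA"}},                      # dict property
    {"assetNumber": "2", "loanAmount": "125000",
     "property": [{"state": "TX"}, {"state": "NM"}]},   # list property
]

# Assets keep everything except 'property'.
assets = [{k: v for k, v in item.items() if k != "property"} for item in data]

# Each property dict is tagged with its parent assetNumber via dict merge.
properties = [
    item["property"] | {"assetNumber": item["assetNumber"]}
    for item in data if isinstance(item.get("property"), dict)
] + [
    prop | {"assetNumber": item["assetNumber"]}
    for item in data if isinstance(item.get("property"), list)
    for prop in item["property"] if isinstance(prop, dict)
]

print(len(assets), len(properties))  # 2 3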
@@ -601,4 +705,28 @@ def process_reg_a(data, accession):
 # raise NotImplementedError("Need to implement the rest of the MA processing")
 
 # def process_ncen(data, accession):
-# raise NotImplementedError("Need to implement the N-CEN processing")
+# raise NotImplementedError("Need to implement the N-CEN processing")
+
+# WIP
+# Note: going to pause this for now, as I don't have a great way of putting this in a csv.
+def process_submission_metadata(data,accession):
+    tables = []
+    document_data = safe_get(data, ['documents'])
+    if document_data:
+        tables.append(Table(_flatten_dict(document_data), 'document_submission_metadata', accession))
+
+    reporting_owner_data = safe_get(data,['reporting-owner'])
+    if reporting_owner_data:
+        tables.append(Table(_flatten_dict(reporting_owner_data), 'reporting_owner_submission_metadata', accession))
+
+    issuer_data = safe_get(data,['issuer'])
+    if issuer_data:
+        tables.append(Table(_flatten_dict(issuer_data), 'issuer_submission_metadata', accession))
+
+    # # construct metadata
+    # accession-number date-of-filing-date-change, depositor-cik effectiveness-date
+
+    # # other tables
+    # depositor, securitizer
+
+    return tables
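
safe_get is called throughout these new functions but not defined in this file's hunks. Based on how it is used (a dict plus a key path, returning None when any key along the path is missing), a functionally equivalent sketch would be:

# Inferred from call sites; the real implementation lives elsewhere in datamule.
def safe_get(d, keys):
    for key in keys:
        if not isinstance(d, dict) or key not in d:
            return None
        d = d[key]
    return d

meta = {"documents": [{"type": "4", "sequence": "1"}]}
print(safe_get(meta, ["documents"]))        # [{'type': '4', 'sequence': '1'}]
print(safe_get(meta, ["reporting-owner"]))  # None -> that table is skipped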
datamule/document/table.py
CHANGED
@@ -18,7 +18,12 @@ from .mappings.thirteenfhr import *
 from .mappings.twentyfivense import *
 from .mappings.twentyfourf2nt import *
 from .mappings.information_table import *
+from .mappings.submission_metadata import *
+from .mappings.ex102_abs import *
+from .mappings.d import *
 
+from pathlib import Path
+import csv
 # need to check if mappings correctly create new columns
 class Table():
     def __init__(self, data, type,accession):
@@ -27,11 +32,18 @@ class Table():
         self.type = type
         self.data = data
         self.accession = accession
-        self.columns = self.determine_columns()
+        self.columns = self.determine_columns_complete()
+
+    def determine_columns_complete(self):
+        if not self.data:
+            return []
+        return list(set().union(*(row.keys() for row in self.data)))
+
 
     def determine_columns(self):
         if len(self.data) == 0:
             return []
+
         return self.data[0].keys()
 
     def add_column(self,column_name,value):
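
determine_columns_complete unions the keys of every row, so a field that appears in only some rows still gets a column; the older determine_columns only looked at the first row's keys. The core expression, on sample rows:

# Union of keys across all rows; order is arbitrary because it comes from a set.
rows = [
    {"cik": "123", "name": "Acme"},
    {"cik": "456", "ticker": "ACME"},
]
columns = list(set().union(*(row.keys() for row in rows)))
print(sorted(columns))  # ['cik', 'name', 'ticker']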
@@ -190,6 +202,17 @@ class Table():
         elif self.type == 'signature_schedule_13':
             mapping_dict = signature_schedule_13_dict
 
+        # D
+        elif self.type == 'issuer_list_d':
+            mapping_dict = issuer_list_d_dict
+        elif self.type == 'metadata_d':
+            mapping_dict = metadata_d_dict
+        elif self.type == 'offering_data_d':
+            mapping_dict = offering_data_d_dict
+        elif self.type == 'primary_issuer_d':
+            mapping_dict = primary_issuer_d_dict
+        elif self.type == 'related_persons_list_d':
+            mapping_dict = related_persons_d_dict
         # SDR
         elif self.type == 'sdr':
             mapping_dict = sdr_dict
@@ -227,6 +250,15 @@ class Table():
             mapping_dict = item_9_24f2nt_dict
         elif self.type == 'signature_info_schedule_a':
             mapping_dict = signature_24f2nt_dict
+        # ABS
+        elif self.type == 'assets_ex102_absee':
+            mapping_dict = assets_dict_ex102_abs
+        elif self.type =='properties_ex102_absee':
+            mapping_dict = properties_dict_ex102_abs
+        # submission metadata
+        elif self.type == 'document_submission_metadata':
+            mapping_dict = document_submission_metadata_dict
+
 
         else:
             mapping_dict = {}
@@ -245,9 +277,6 @@ class Table():
             for old_key, new_key in mapping_dict.items():
                 if old_key in row:
                     ordered_row[new_key] = row.pop(old_key)
-                else:
-                    # if the old key is not present, set the new key to None
-                    ordered_row[new_key] = None
 
             # Then add any remaining keys that weren't in the mapping
             for key, value in row.items():
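
After this change, a key listed in the mapping but absent from a row is simply skipped instead of being emitted as a None column; missing fields can instead be filled at write time (csv.DictWriter substitutes its restval, an empty string by default, for absent keys). The rename loop's behavior on a single row, with illustrative field names:

# Field names are illustrative only.
row = {"rptOwnerCik": "0001214156", "extra": "kept"}
mapping_dict = {"rptOwnerCik": "reporting_owner_cik",
                "rptOwnerName": "reporting_owner_name"}  # second key absent

ordered_row = {}
for old_key, new_key in mapping_dict.items():
    if old_key in row:
        ordered_row[new_key] = row.pop(old_key)   # renamed, ordered first
for key, value in row.items():
    ordered_row[key] = value                      # unmapped keys keep their names

print(ordered_row)  # {'reporting_owner_cik': '0001214156', 'extra': 'kept'}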
@@ -257,4 +286,30 @@ class Table():
             row.clear()
             row.update(ordered_row)
 
-
+        # Update the columns after mapping
+        columns = set(self.columns)
+        # remove the old columns that are now in the mapping
+        columns.difference_update(mapping_dict.keys())
+        # add the new columns from the mapping
+        columns.update(mapping_dict.values())
+        # add the accession column to the columns
+        columns.add('accession')
+
+        self.columns = list(columns)
+
+    def write_csv(self, output_file):
+        output_file = Path(output_file)
+        fieldnames = self.columns
+
+        # Check if the file already exists
+        if output_file.exists():
+            # Append to existing file without writing header
+            with open(output_file, 'a', newline='') as csvfile:
+                writer = csv.DictWriter(csvfile, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
+                writer.writerows(self.data)
+        else:
+            # Create new file with header
+            with open(output_file, 'w', newline='') as csvfile:
+                writer = csv.DictWriter(csvfile, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
+                writer.writeheader()
+                writer.writerows(self.data)
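
A usage sketch for the new write_csv, with Table constructed per the signature shown above: repeated calls append to the same file, and the header is only written when the file is first created.

from datamule.document.table import Table

table = Table([{"cik": "123", "name": "Acme"}], "metadata_d", "0001234567-24-000001")
table.write_csv("metadata_d.csv")  # creates the file and writes the header
table.write_csv("metadata_d.csv")  # appends rows without a second header

One caveat: the columns come from a set, so their order is not guaranteed stable across interpreter runs; appending to a CSV produced by an earlier run implicitly assumes the column order has not changed.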
datamule/helper.py
CHANGED
@@ -79,7 +79,16 @@ def _process_cik_and_metadata_filters(cik=None, ticker=None, **kwargs):
 
     # Convert ticker to CIK if provided
     if ticker is not None:
-
+        if isinstance(ticker, str):
+            ticker = [ticker]
+
+        ciks_from_ticker = []
+        for t in ticker:
+            ciks = get_cik_from_dataset('listed_filer_metadata', 'ticker', t)
+            if ciks:
+                ciks_from_ticker.extend(ciks)
+
+        cik = ciks
 
     # Normalize CIK format
     if cik is not None:
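
The ticker filter now accepts a bare string or a list, normalizing to a list before lookup. One thing to flag: the final assignment is cik = ciks, the result of the last lookup only, rather than the accumulated ciks_from_ticker, so with multiple tickers only the last one's CIKs appear to survive. A runnable sketch of the branch; get_cik_from_dataset's arguments are taken from the call above, but the stub body is illustrative:

def get_cik_from_dataset(dataset, field, value):
    # Stub: the real function reads the listed_filer_metadata dataset.
    fake = {"AAPL": ["0000320193"], "MSFT": ["0000789019"]}
    return fake.get(value, [])

ticker = "AAPL"
if isinstance(ticker, str):
    ticker = [ticker]                  # a bare string becomes a one-item list

ciks_from_ticker = []
for t in ticker:
    ciks = get_cik_from_dataset("listed_filer_metadata", "ticker", t)
    if ciks:
        ciks_from_ticker.extend(ciks)

print(ciks_from_ticker)                # ['0000320193']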
datamule/index.py
CHANGED
@@ -1,16 +1,16 @@
-
+
 from .sec.submissions.textsearch import query
-from .helper import _process_cik_and_metadata_filters
+from .helper import _process_cik_and_metadata_filters
+from pathlib import Path
 
 class Index:
-    def __init__(self
-
+    def __init__(self):
+        pass
 
     def search_submissions(
         self,
         text_query,
-
-        end_date=None,
+        filing_date=None,
         submission_type=None,
         cik=None,
         ticker=None,
@@ -47,16 +47,14 @@ class Index:
         # Execute the search query
         results = query(
             f'{text_query}',
-            filing_date=
+            filing_date=filing_date,
             requests_per_second=requests_per_second,
             quiet=quiet,
             submission_type=submission_type,
             **kwargs
         )
 
-
-        if self.path:
-            self._save_results(results, text_query)
+
 
         return results
 
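
With these changes, Index takes no constructor arguments and no longer saves results to a path; filing_date replaces the old start_date/end_date pair and is forwarded straight to query. A usage sketch; the (start, end) tuple form for filing_date is an assumption based on how date ranges are passed elsewhere in datamule:

from datamule.index import Index

index = Index()
results = index.search_submissions(
    "climate risk",
    submission_type="10-K",
    filing_date=("2023-01-01", "2023-12-31"),  # assumed (start, end) form
)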
datamule/portfolio.py
CHANGED
@@ -9,22 +9,28 @@ import os
 from .helper import _process_cik_and_metadata_filters
 from .seclibrary.downloader import download as seclibrary_download
 from .sec.xbrl.filter_xbrl import filter_xbrl
-from .sec.submissions.monitor import monitor
-from .sec.xbrl.xbrlmonitor import XBRLMonitor
+from .sec.submissions.monitor import Monitor
+#from .sec.xbrl.xbrlmonitor import XBRLMonitor
 
 
 class Portfolio:
     def __init__(self, path):
         self.path = Path(path)
+        self.api_key = None
         self.submissions = []
         self.submissions_loaded = False
         self.MAX_WORKERS = os.cpu_count() - 1
+
+        self.monitor = Monitor()
 
         if self.path.exists():
             self._load_submissions()
             self.submissions_loaded = True
         else:
             self.path.mkdir(parents=True, exist_ok=True)
+
+    def set_api_key(self, api_key):
+        self.api_key = api_key
 
     def _load_submissions(self):
         folders = [f for f in self.path.iterdir() if f.is_dir()]
@@ -132,6 +138,7 @@ class Portfolio:
         seclibrary_download(
             output_dir=self.path,
             cik=cik,
+            api_key=self.api_key,
             submission_type=submission_type,
             filing_date=filing_date,
             accession_numbers=self.accession_numbers if hasattr(self, 'accession_numbers') else None,
@@ -149,20 +156,18 @@ class Portfolio:
         )
 
         self.submissions_loaded = False
-    def monitor_submissions(self,data_callback=None,
-
+    def monitor_submissions(self, data_callback=None, interval_callback=None,
+                            polling_interval=1000, quiet=True, start_date=None,
+                            validation_interval=600000):
 
-        cik = _process_cik_and_metadata_filters(cik, ticker, **kwargs)
 
-        monitor(
+        self.monitor.monitor_submissions(
             data_callback=data_callback,
-
-            cik=cik,
-            submission_type=submission_type,
+            interval_callback=interval_callback,
             polling_interval=polling_interval,
-            requests_per_second=requests_per_second,
             quiet=quiet,
-            start_date=start_date
+            start_date=start_date,
+            validation_interval=validation_interval
         )
 
 
@@ -179,8 +184,4 @@ class Portfolio:
             document_types = [document_types]
 
         for submission in self.submissions:
-            yield from submission.document_type(document_types)
-
-    def keep(self,document_type):
-        for submission in self.__iter__():
-            submission.keep(document_type)
+            yield from submission.document_type(document_types)
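
A usage sketch tying the Portfolio changes together: the API key set here flows into the seclibrary download call, and monitoring now delegates to the Monitor instance created in __init__ (the new signature no longer takes cik or submission_type filters). The callback shape is an assumption, not taken from this diff:

from datamule.portfolio import Portfolio

portfolio = Portfolio("filings")         # creates the directory if absent
portfolio.set_api_key("your-api-key")    # placeholder value

def on_new_submissions(data):
    # Assumed callback shape: invoked with newly observed submissions.
    print(f"received: {data}")

portfolio.monitor_submissions(
    data_callback=on_new_submissions,
    polling_interval=1000,               # defaults from the new signature
    validation_interval=600000,
)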