datamule 1.2.4.tar.gz → 1.2.6.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. {datamule-1.2.4 → datamule-1.2.6}/PKG-INFO +1 -2
  2. {datamule-1.2.4 → datamule-1.2.6}/datamule/__init__.py +1 -0
  3. {datamule-1.2.4 → datamule-1.2.6}/datamule/document/document.py +26 -13
  4. {datamule-1.2.4 → datamule-1.2.6}/datamule/document/mappings/information_table.py +1 -0
  5. {datamule-1.2.4 → datamule-1.2.6}/datamule/document/mappings/ownership.py +1 -1
  6. datamule-1.2.6/datamule/document/mappings/proxy_voting_record.py +17 -0
  7. datamule-1.2.6/datamule/document/mappings/submission_metadata.py +9 -0
  8. datamule-1.2.6/datamule/document/mappings/thirteenfhr.py +72 -0
  9. {datamule-1.2.4 → datamule-1.2.6}/datamule/document/mappings/twentyfivense.py +1 -0
  10. {datamule-1.2.4 → datamule-1.2.6}/datamule/document/processing.py +35 -5
  11. {datamule-1.2.4 → datamule-1.2.6}/datamule/document/table.py +6 -1
  12. {datamule-1.2.4 → datamule-1.2.6}/datamule/helper.py +10 -1
  13. {datamule-1.2.4 → datamule-1.2.6}/datamule/index.py +8 -10
  14. {datamule-1.2.4 → datamule-1.2.6}/datamule/portfolio.py +17 -16
  15. datamule-1.2.6/datamule/sec/submissions/monitor.py +183 -0
  16. {datamule-1.2.4 → datamule-1.2.6}/datamule/sec/submissions/textsearch.py +0 -4
  17. {datamule-1.2.4 → datamule-1.2.6}/datamule/sec/xbrl/streamcompanyfacts.py +1 -1
  18. {datamule-1.2.4 → datamule-1.2.6}/datamule/seclibrary/downloader.py +2 -2
  19. {datamule-1.2.4 → datamule-1.2.6}/datamule/submission.py +80 -14
  20. {datamule-1.2.4 → datamule-1.2.6}/datamule.egg-info/PKG-INFO +1 -2
  21. {datamule-1.2.4 → datamule-1.2.6}/datamule.egg-info/SOURCES.txt +1 -2
  22. {datamule-1.2.4 → datamule-1.2.6}/datamule.egg-info/requires.txt +0 -1
  23. {datamule-1.2.4 → datamule-1.2.6}/setup.py +1 -2
  24. datamule-1.2.4/datamule/document/mappings/proxy_voting_record.py +0 -1
  25. datamule-1.2.4/datamule/document/mappings/thirteenfhr.py +0 -5
  26. datamule-1.2.4/datamule/sec/rss/monitor.py +0 -416
  27. datamule-1.2.4/datamule/sec/submissions/monitor.py +0 -130
  28. datamule-1.2.4/datamule/seclibrary/__init__.py +0 -0
  29. {datamule-1.2.4 → datamule-1.2.6}/datamule/config.py +0 -0
  30. {datamule-1.2.4 → datamule-1.2.6}/datamule/document/__init__.py +0 -0
  31. {datamule-1.2.4 → datamule-1.2.6}/datamule/document/mappings/__init__.py +0 -0
  32. {datamule-1.2.4 → datamule-1.2.6}/datamule/document/mappings/atsn.py +0 -0
  33. {datamule-1.2.4 → datamule-1.2.6}/datamule/document/mappings/cfportal.py +0 -0
  34. {datamule-1.2.4 → datamule-1.2.6}/datamule/document/mappings/ex99a_sdr.py +0 -0
  35. {datamule-1.2.4 → datamule-1.2.6}/datamule/document/mappings/ex99c_sdr.py +0 -0
  36. {datamule-1.2.4 → datamule-1.2.6}/datamule/document/mappings/ex99g_sdr.py +0 -0
  37. {datamule-1.2.4 → datamule-1.2.6}/datamule/document/mappings/ex99i_sdr.py +0 -0
  38. {datamule-1.2.4 → datamule-1.2.6}/datamule/document/mappings/nmfp.py +0 -0
  39. {datamule-1.2.4 → datamule-1.2.6}/datamule/document/mappings/npx.py +0 -0
  40. {datamule-1.2.4 → datamule-1.2.6}/datamule/document/mappings/onefourtyfour.py +0 -0
  41. {datamule-1.2.4 → datamule-1.2.6}/datamule/document/mappings/sbs.py +0 -0
  42. {datamule-1.2.4 → datamule-1.2.6}/datamule/document/mappings/sbsef.py +0 -0
  43. {datamule-1.2.4 → datamule-1.2.6}/datamule/document/mappings/schedule13.py +0 -0
  44. {datamule-1.2.4 → datamule-1.2.6}/datamule/document/mappings/sdr.py +0 -0
  45. {datamule-1.2.4 → datamule-1.2.6}/datamule/document/mappings/ta.py +0 -0
  46. {datamule-1.2.4 → datamule-1.2.6}/datamule/document/mappings/twentyfourf2nt.py +0 -0
  47. {datamule-1.2.4 → datamule-1.2.6}/datamule/mapping_dicts/__init__.py +0 -0
  48. {datamule-1.2.4 → datamule-1.2.6}/datamule/mapping_dicts/txt_mapping_dicts.py +0 -0
  49. {datamule-1.2.4 → datamule-1.2.6}/datamule/mapping_dicts/xml_mapping_dicts.py +0 -0
  50. {datamule-1.2.4 → datamule-1.2.6}/datamule/package_updater.py +0 -0
  51. {datamule-1.2.4 → datamule-1.2.6}/datamule/sec/__init__.py +0 -0
  52. {datamule-1.2.4 → datamule-1.2.6}/datamule/sec/infrastructure/__init__.py +0 -0
  53. {datamule-1.2.4 → datamule-1.2.6}/datamule/sec/infrastructure/submissions_metadata.py +0 -0
  54. {datamule-1.2.4/datamule/sec/rss → datamule-1.2.6/datamule/sec/submissions}/__init__.py +0 -0
  55. {datamule-1.2.4 → datamule-1.2.6}/datamule/sec/submissions/downloader.py +0 -0
  56. {datamule-1.2.4 → datamule-1.2.6}/datamule/sec/submissions/eftsquery.py +0 -0
  57. {datamule-1.2.4 → datamule-1.2.6}/datamule/sec/submissions/streamer.py +0 -0
  58. {datamule-1.2.4 → datamule-1.2.6}/datamule/sec/utils.py +0 -0
  59. {datamule-1.2.4/datamule/sec/submissions → datamule-1.2.6/datamule/sec/xbrl}/__init__.py +0 -0
  60. {datamule-1.2.4 → datamule-1.2.6}/datamule/sec/xbrl/downloadcompanyfacts.py +0 -0
  61. {datamule-1.2.4 → datamule-1.2.6}/datamule/sec/xbrl/filter_xbrl.py +0 -0
  62. {datamule-1.2.4 → datamule-1.2.6}/datamule/sec/xbrl/xbrlmonitor.py +0 -0
  63. {datamule-1.2.4/datamule/sec/xbrl → datamule-1.2.6/datamule/seclibrary}/__init__.py +0 -0
  64. {datamule-1.2.4 → datamule-1.2.6}/datamule/seclibrary/bq.py +0 -0
  65. {datamule-1.2.4 → datamule-1.2.6}/datamule/seclibrary/query.py +0 -0
  66. {datamule-1.2.4 → datamule-1.2.6}/datamule/sheet.py +0 -0
  67. {datamule-1.2.4 → datamule-1.2.6}/datamule.egg-info/dependency_links.txt +0 -0
  68. {datamule-1.2.4 → datamule-1.2.6}/datamule.egg-info/top_level.txt +0 -0
  69. {datamule-1.2.4 → datamule-1.2.6}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 1.2.4
+Version: 1.2.6
 Summary: Making it easier to use SEC filings.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman
@@ -10,7 +10,6 @@ Requires-Dist: tqdm
 Requires-Dist: requests
 Requires-Dist: nest_asyncio
 Requires-Dist: aiofiles
-Requires-Dist: polars
 Requires-Dist: setuptools
 Requires-Dist: selectolax
 Requires-Dist: pytz
@@ -8,6 +8,7 @@ from .index import Index
 from .package_updater import PackageUpdater
 
 
+
 # Keep the notebook environment setup
 def _is_notebook_env():
     """Check if the code is running in a Jupyter or Colab environment."""
@@ -118,10 +118,11 @@ class Document:
         # will deprecate this when we add html2dict
         elif self.extension in ['.htm', '.html', '.txt']:
 
-            if self.type == '10-K':
-                mapping_dict = dict_10k
-            elif self.type == '10-Q':
+
+            if self.type == '10-Q':
                 mapping_dict = dict_10q
+            elif self.type == '10-K':
+                mapping_dict = dict_10k
             elif self.type == '8-K':
                 mapping_dict = dict_8k
             elif self.type == 'SC 13D':
@@ -141,27 +142,39 @@
             json.dump(self.data, f, indent=2)
 
     def to_tabular(self):
-        if self.extension != '.xml':
+        if self.type == 'submission_metadata':
+            return process_tabular_data(self)
+        elif self.extension != '.xml':
             return []
-        self.parse()
-        return process_tabular_data(self)
+        else:
+            self.parse()
+            return process_tabular_data(self)
 
 
-    def write_csv(self, output_folder, accession_number=None):
+    def write_csv(self, output_folder):
+        output_folder = Path(output_folder)
+        output_folder.mkdir(exist_ok=True)
 
-        tables = self.to_tabular(accession_number)
+        tables = self.to_tabular()
 
         if not tables:
             return
 
         for table in tables:
             fieldnames = table.columns
-            output_filename = Path(output_folder) / f"{table.type}.csv"
+            output_filename = output_folder / f"{table.type}.csv"
+
+            # Check if the file already exists
+            if output_filename.exists():
 
-            with open(output_filename, 'w', newline='') as csvfile:
-                writer = csv.DictWriter(csvfile, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
-                writer.writeheader()
-                writer.writerows(table.data)
+                with open(output_filename, 'a', newline='') as csvfile:
+                    writer = csv.DictWriter(csvfile, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
+                    writer.writerows(table.data)
+            else:
+                with open(output_filename, 'w', newline='') as csvfile:
+                    writer = csv.DictWriter(csvfile, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
+                    writer.writeheader()
+                    writer.writerows(table.data)
 
 
     def _document_to_section_text(self, document_data, parent_key=''):
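
With 1.2.6, write_csv coerces output_folder to a Path, creates it if needed, and appends to an existing <table.type>.csv rather than overwriting it, so tables from many documents can accumulate in one folder. A usage sketch (assumes filings were already downloaded into 'filings', that Portfolio is exported from the package root, and that Portfolio.document_type is the iteration helper shown in the portfolio.py hunk further down):

    from datamule import Portfolio

    portfolio = Portfolio('filings')  # existing folder of downloaded filings
    for document in portfolio.document_type('INFORMATION TABLE'):
        document.write_csv('tables')  # repeat calls append rows to tables/<table.type>.csv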
@@ -1,3 +1,4 @@
+# Ready for mass testing
 
 # Information Table (13F-HR Securities) mapping
 information_table_dict = {
@@ -1,4 +1,4 @@
-# Mapping dictionaries for SEC filing table types based on actual field occurrences
+# Ready for mass testing
 
 # Non-derivative transaction ownership mapping
 non_derivative_transaction_ownership_dict = {
@@ -0,0 +1,17 @@
+proxy_voting_record_dict = {
+    'meetingDate': 'meetingDate',
+    'accession': 'accessionNumber',
+    'vote_voteRecord_managementRecommendation': 'managementRecommendation',
+    'sharesVoted': 'sharesVoted',  # Top-level sharesVoted
+    'vote_voteRecord_howVoted': 'howVoted',
+    'sharesOnLoan': 'sharesOnLoan',
+    'cusip': 'cusip',
+    'issuerName': 'issuerName',
+    'voteCategories_voteCategory_categoryType': 'categoryType',
+    'voteDescription': 'voteDescription',
+    'voteManager_otherManagers_otherManager': 'otherManager',
+    'vote_voteRecord_sharesVoted': 'recordSharesVoted',  # To distinguish from top-level sharesVoted
+    'isin': 'isin',
+    'voteSource': 'voteSource',
+    'voteSeries': 'voteSeries'
+}
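
These mapping dicts translate underscore-flattened XML paths into the column names used in the tabular output. A minimal standalone sketch of that renaming (rename_columns is a hypothetical illustration, not datamule API):

    proxy_mapping = {
        'vote_voteRecord_howVoted': 'howVoted',
        'vote_voteRecord_sharesVoted': 'recordSharesVoted',
    }

    def rename_columns(row, mapping):
        # Rename keys the mapping knows about; pass unmapped keys through unchanged.
        return {mapping.get(key, key): value for key, value in row.items()}

    row = {'vote_voteRecord_howVoted': 'FOR', 'vote_voteRecord_sharesVoted': '1200'}
    print(rename_columns(row, proxy_mapping))
    # {'howVoted': 'FOR', 'recordSharesVoted': '1200'}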
@@ -0,0 +1,9 @@
+# Note: submission_metadata is my designation, not the SEC's, for the header of the Submission tag
+
+document_submission_metadata_dict = {
+    'accession': 'accession',
+    'type': 'type',
+    'sequence': 'sequence',
+    'filename': 'filename',
+    'description': 'description'
+}
@@ -0,0 +1,72 @@
+# Ready for mass testing
+
+# 13F-HR (Institutional Investment Manager Holdings) mapping
+thirteenfhr_dict = {
+    # Cover Page Mapping
+    'formData_coverPage_reportCalendarOrQuarter': 'reportCalendarOrQuarter',
+    'formData_coverPage_filingManager_name': 'filingManagerName',
+    'formData_coverPage_filingManager_address_street1': 'filingManagerStreet1',
+    'formData_coverPage_filingManager_address_street2': 'filingManagerStreet2',
+    'formData_coverPage_filingManager_address_city': 'filingManagerCity',
+    'formData_coverPage_filingManager_address_stateOrCountry': 'filingManagerStateOrCountry',
+    'formData_coverPage_filingManager_address_zipCode': 'filingManagerZipCode',
+    'formData_coverPage_crdNumber': 'crdNumber',
+    'formData_coverPage_secFileNumber': 'secFileNumber',
+    'formData_coverPage_form13FFileNumber': 'form13FFileNumber',
+    'formData_coverPage_reportType': 'reportType',
+    'formData_coverPage_isAmendment': 'isAmendment',
+    'formData_coverPage_amendmentNo': 'amendmentNo',
+    'formData_coverPage_amendmentInfo_amendmentType': 'amendmentType',
+    'formData_coverPage_amendmentInfo_confDeniedExpired': 'confDeniedExpired',
+    'formData_coverPage_additionalInformation': 'additionalInformation',
+    'formData_coverPage_provideInfoForInstruction5': 'provideInfoForInstruction5',
+
+    # Other Managers Info Mapping
+    'formData_coverPage_otherManagersInfo_otherManager': 'otherManager',
+    'formData_coverPage_otherManagersInfo_otherManager_cik': 'otherManagerCik',
+    'formData_coverPage_otherManagersInfo_otherManager_name': 'otherManagerName',
+    'formData_coverPage_otherManagersInfo_otherManager_crdNumber': 'otherManagerCrdNumber',
+    'formData_coverPage_otherManagersInfo_otherManager_secFileNumber': 'otherManagerSecFileNumber',
+    'formData_coverPage_otherManagersInfo_otherManager_form13FFileNumber': 'otherManagerForm13FFileNumber',
+
+    # Summary Page Mapping
+    'formData_summaryPage_isConfidentialOmitted': 'isConfidentialOmitted',
+    'formData_summaryPage_otherIncludedManagersCount': 'otherIncludedManagersCount',
+    'formData_summaryPage_tableEntryTotal': 'tableEntryTotal',
+    'formData_summaryPage_tableValueTotal': 'tableValueTotal',
+
+    # Other Managers 2 Info Mapping
+    'formData_summaryPage_otherManagers2Info_otherManager2': 'otherManager2',
+    'formData_summaryPage_otherManagers2Info_otherManager2_sequenceNumber': 'otherManager2SequenceNumber',
+    'formData_summaryPage_otherManagers2Info_otherManager2_otherManager_cik': 'otherManager2Cik',
+    'formData_summaryPage_otherManagers2Info_otherManager2_otherManager_name': 'otherManager2Name',
+    'formData_summaryPage_otherManagers2Info_otherManager2_otherManager_crdNumber': 'otherManager2CrdNumber',
+    'formData_summaryPage_otherManagers2Info_otherManager2_otherManager_secFileNumber': 'otherManager2SecFileNumber',
+    'formData_summaryPage_otherManagers2Info_otherManager2_otherManager_form13FFileNumber': 'otherManager2Form13FFileNumber',
+
+    # Signature Block Mapping
+    'formData_signatureBlock_name': 'signatureName',
+    'formData_signatureBlock_title': 'signatureTitle',
+    'formData_signatureBlock_phone': 'signaturePhone',
+    'formData_signatureBlock_signature': 'signature',
+    'formData_signatureBlock_city': 'signatureCity',
+    'formData_signatureBlock_stateOrCountry': 'signatureStateOrCountry',
+    'formData_signatureBlock_signatureDate': 'signatureDate',
+
+    # Header Data Mapping
+    'headerData_filerInfo_periodOfReport': 'periodOfReport',
+    'headerData_filerInfo_filer_fileNumber': 'filerFileNumber',
+    'headerData_filerInfo_filer_credentials_cik': 'filerCik',
+    'headerData_filerInfo_filer_credentials_ccc': 'filerCcc',
+    'headerData_filerInfo_flags_confirmingCopyFlag': 'confirmingCopyFlag',
+    'headerData_filerInfo_flags_returnCopyFlag': 'returnCopyFlag',
+    'headerData_filerInfo_flags_overrideInternetFlag': 'overrideInternetFlag',
+    'headerData_filerInfo_denovoRequest': 'denovoRequest',
+    'headerData_filerInfo_liveTestFlag': 'liveTestFlag',
+    'headerData_submissionType': 'submissionType',
+
+    # Schema and Metadata Mapping
+    'schemaLocation': 'schemaLocation',
+    'schemaVersion': 'schemaVersion',
+    'accession': 'accessionNumber'
+}
@@ -1,3 +1,4 @@
+# Ready for mass testing
 # 25-NSE mapping
 twentyfive_nse_dict = {
     'descriptionClassSecurity': 'securityDescription',
@@ -17,6 +17,12 @@ def process_tabular_data(self):
         tables = process_13fhr(self.data, self.accession)
     elif self.type in ["INFORMATION TABLE"]:
         tables = process_information_table(self.data, self.accession)
+    elif self.type in ["25-NSE", "25-NSE/A"]:
+        tables = process_25nse(self.data, self.accession)
+    # complete mark:
+    elif self.type in ["N-PX","N-PX/A"]:
+        tables = process_npx(self.data, self.accession)
+
     elif self.type in ["SBSEF","SBSEF/A","SBSEF-V","SBSEF-W"]:
         tables = process_sbsef(self.data, self.accession)
     elif self.type in ["SDR","SDR/A","SDR-W","SDR-A"]:
@@ -33,8 +39,7 @@ def process_tabular_data(self):
         tables = process_144(self.data, self.accession)
     elif self.type in ["24F-2NT", "24F-2NT/A"]:
         tables = process_24f2nt(self.data, self.accession)
-    elif self.type in ["25-NSE", "25-NSE/A"]:
-        tables = process_25nse(self.data, self.accession)
+
     elif self.type in ["ATS-N", "ATS-N/A"]:
         tables = process_ats(self.data, self.accession)
     # elif self.type in ["C","C-W","C-U","C-U-W","C/A","C/A-W",
@@ -53,8 +58,7 @@ def process_tabular_data(self):
     #     tables = process_nmfp(self.data, self.accession)
     # elif self.type in ["NPORT-P","NPORT-P/A"]:
     #     tables = process_nportp(self.data, self.accession)
-    elif self.type in ["N-PX","N-PX/A"]:
-        tables = process_npx(self.data, self.accession)
+
     # elif self.type in ["TA-1","TA-1/A","TA-W","TA-2","TA-2/A"]:
     #     tables = process_ta(self.data, self.accession)
     elif self.type in ["X-17A-5","X-17A-5/A"]:
@@ -70,6 +74,8 @@ def process_tabular_data(self):
     #     tables = process_ex102_abs(self.data, self.accession)
     elif self.type == "PROXY VOTING RECORD":
         tables = process_proxy_voting_record(self.data, self.accession)
+    elif self.type == 'submission_metadata':
+        tables = process_submission_metadata(self.content, self.accession)
     else:
         warn(f"Processing for {self.type} is not implemented yet.")
         return []
@@ -601,4 +607,28 @@ def process_reg_a(data, accession):
 #     raise NotImplementedError("Need to implement the rest of the MA processing")
 
 # def process_ncen(data, accession):
-#     raise NotImplementedError("Need to implement the N-CEN processing")
+#     raise NotImplementedError("Need to implement the N-CEN processing")
+
+# WIP
+# Note: going to pause this for now, as I don't have a great way of putting this in a csv.
+def process_submission_metadata(data, accession):
+    tables = []
+    document_data = safe_get(data, ['documents'])
+    if document_data:
+        tables.append(Table(_flatten_dict(document_data), 'document_submission_metadata', accession))
+
+    reporting_owner_data = safe_get(data, ['reporting-owner'])
+    if reporting_owner_data:
+        tables.append(Table(_flatten_dict(reporting_owner_data), 'reporting_owner_submission_metadata', accession))
+
+    issuer_data = safe_get(data, ['issuer'])
+    if issuer_data:
+        tables.append(Table(_flatten_dict(issuer_data), 'issuer_submission_metadata', accession))
+
+    # # construct metadata
+    # accession-number date-of-filing-date-change, depositor-cik effectiveness-date
+
+    # # other tables
+    # depositor, securitizer
+
+    return tables
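
process_submission_metadata leans on the library's safe_get and _flatten_dict helpers. Simplified stand-ins (assumed behavior for illustration, not the actual implementations) showing what the processor expects:

    def safe_get(data, keys):
        # Walk nested dicts, returning None instead of raising KeyError.
        for key in keys:
            if not isinstance(data, dict) or key not in data:
                return None
            data = data[key]
        return data

    def _flatten_dict(d, parent_key=''):
        # Collapse nested dicts into underscore-joined keys: {'a': {'b': 1}} -> {'a_b': 1}.
        items = {}
        for key, value in d.items():
            new_key = f"{parent_key}_{key}" if parent_key else key
            if isinstance(value, dict):
                items.update(_flatten_dict(value, new_key))
            else:
                items[new_key] = value
        return items

    metadata = {'documents': {'type': '10-K', 'sequence': '1'}}
    print(_flatten_dict(safe_get(metadata, ['documents'])))
    # {'type': '10-K', 'sequence': '1'}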
@@ -18,7 +18,7 @@ from .mappings.thirteenfhr import *
 from .mappings.twentyfivense import *
 from .mappings.twentyfourf2nt import *
 from .mappings.information_table import *
-
+from .mappings.submission_metadata import *
 # need to check if mappings correctly create new columns
 class Table():
     def __init__(self, data, type, accession):
@@ -228,6 +228,11 @@ class Table():
         elif self.type == 'signature_info_schedule_a':
             mapping_dict = signature_24f2nt_dict
 
+        # submission metadata
+        elif self.type == 'document_submission_metadata':
+            mapping_dict = document_submission_metadata_dict
+
+
         else:
             mapping_dict = {}
 
@@ -79,7 +79,16 @@ def _process_cik_and_metadata_filters(cik=None, ticker=None, **kwargs):
 
     # Convert ticker to CIK if provided
     if ticker is not None:
-        cik = get_cik_from_dataset('listed_filer_metadata', 'ticker', ticker)
+        if isinstance(ticker, str):
+            ticker = [ticker]
+
+        ciks_from_ticker = []
+        for t in ticker:
+            ciks = get_cik_from_dataset('listed_filer_metadata', 'ticker', t)
+            if ciks:
+                ciks_from_ticker.extend(ciks)
+
+        cik = ciks_from_ticker
 
     # Normalize CIK format
     if cik is not None:
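
The practical effect: ticker filters may now be a single string or a list, with each ticker resolved to its CIK(s) via the listed_filer_metadata dataset. A hedged usage sketch (Portfolio.download_submissions shown as a typical caller; its exact signature is assumed here):

    from datamule import Portfolio

    portfolio = Portfolio('filings')
    # A plain string still works; a list now resolves every ticker to its CIK(s).
    portfolio.download_submissions(ticker=['AAPL', 'MSFT'], submission_type='10-K')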
@@ -1,16 +1,16 @@
-from pathlib import Path
+
 from .sec.submissions.textsearch import query
-from .helper import _process_cik_and_metadata_filters, load_package_dataset
+from .helper import _process_cik_and_metadata_filters
+from pathlib import Path
 
 class Index:
-    def __init__(self, path=None):
-        self.path = Path(path) if path else None
+    def __init__(self):
+        pass
 
     def search_submissions(
         self,
         text_query,
-        start_date=None,
-        end_date=None,
+        filing_date=None,
         submission_type=None,
         cik=None,
         ticker=None,
@@ -47,16 +47,14 @@ class Index:
         # Execute the search query
         results = query(
             f'{text_query}',
-            filing_date=(start_date, end_date),
+            filing_date=filing_date,
             requests_per_second=requests_per_second,
             quiet=quiet,
             submission_type=submission_type,
             **kwargs
         )
 
-        # Save results to path if specified
-        if self.path:
-            self._save_results(results, text_query)
+
 
         return results
 
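Index no longer takes a path or saves results itself, and the start_date/end_date pair collapses into a single filing_date argument that is forwarded unchanged to the EFTS query (a (start, end) tuple, judging by the removed code). A usage sketch under those assumptions:

    from datamule import Index

    index = Index()
    results = index.search_submissions(
        'climate risk',
        filing_date=('2024-01-01', '2024-06-30'),  # forwarded as-is to EFTS
        submission_type='10-K',
        quiet=True,
    )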
@@ -9,22 +9,28 @@ import os
 from .helper import _process_cik_and_metadata_filters
 from .seclibrary.downloader import download as seclibrary_download
 from .sec.xbrl.filter_xbrl import filter_xbrl
-from .sec.submissions.monitor import monitor
-from .sec.xbrl.xbrlmonitor import XBRLMonitor
+from .sec.submissions.monitor import Monitor
+#from .sec.xbrl.xbrlmonitor import XBRLMonitor
 
 
 class Portfolio:
     def __init__(self, path):
         self.path = Path(path)
+        self.api_key = None
         self.submissions = []
         self.submissions_loaded = False
         self.MAX_WORKERS = os.cpu_count() - 1
+
+        self.monitor = Monitor()
 
         if self.path.exists():
             self._load_submissions()
             self.submissions_loaded = True
         else:
             self.path.mkdir(parents=True, exist_ok=True)
+
+    def set_api_key(self, api_key):
+        self.api_key = api_key
 
     def _load_submissions(self):
         folders = [f for f in self.path.iterdir() if f.is_dir()]
@@ -132,6 +138,7 @@ class Portfolio:
         seclibrary_download(
             output_dir=self.path,
             cik=cik,
+            api_key=self.api_key,
             submission_type=submission_type,
             filing_date=filing_date,
             accession_numbers=self.accession_numbers if hasattr(self, 'accession_numbers') else None,
@@ -149,20 +156,18 @@ class Portfolio:
         )
 
         self.submissions_loaded = False
-    def monitor_submissions(self, data_callback=None, poll_callback=None, submission_type=None, cik=None,
-                            polling_interval=200, requests_per_second=5, quiet=False, start_date=None, ticker=None, **kwargs):
+    def monitor_submissions(self, data_callback=None, interval_callback=None,
+                            polling_interval=1000, quiet=True, start_date=None,
+                            validation_interval=600000):
 
-        cik = _process_cik_and_metadata_filters(cik, ticker, **kwargs)
 
-        monitor(
+        self.monitor.monitor_submissions(
             data_callback=data_callback,
-            poll_callback=poll_callback,
-            cik=cik,
-            submission_type=submission_type,
+            interval_callback=interval_callback,
             polling_interval=polling_interval,
-            requests_per_second=requests_per_second,
             quiet=quiet,
-            start_date=start_date
+            start_date=start_date,
+            validation_interval=validation_interval
         )
 
 
@@ -179,8 +184,4 @@ class Portfolio:
             document_types = [document_types]
 
         for submission in self.submissions:
-            yield from submission.document_type(document_types)
-
-    def keep(self, document_type):
-        for submission in self.__iter__():
-            submission.keep(document_type)
+            yield from submission.document_type(document_types)
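
monitor_submissions now delegates to the new Monitor class and drops per-call cik/ticker/submission_type filtering; data_callback receives batches of dicts with accession, submission_type, ciks, and filing_date keys (see the new monitor.py below). A sketch:

    from datamule import Portfolio

    def on_new_submissions(batch):
        # Each item: {'accession': int, 'submission_type': str, 'ciks': [...], 'filing_date': 'YYYY-MM-DD'}
        for item in batch:
            print(item['accession'], item['submission_type'], item['ciks'])

    portfolio = Portfolio('filings')
    portfolio.monitor_submissions(
        data_callback=on_new_submissions,
        polling_interval=1000,       # poll the RSS feed every second
        validation_interval=600000,  # reconcile against EFTS every ten minutes
    )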
@@ -0,0 +1,183 @@
+import time
+from collections import deque
+from datetime import datetime
+import xml.etree.ElementTree as ET
+import re
+import asyncio
+from ..utils import headers, PreciseRateLimiter
+from .eftsquery import EFTSQuery
+import aiohttp
+
+
+async def poll_rss(limiter):
+    base_url = 'https://www.sec.gov/cgi-bin/browse-edgar?count=100&action=getcurrent&output=rss'
+
+    # Create a session specifically for this RSS polling operation
+    async with aiohttp.ClientSession(headers=headers) as session:
+        # Use the rate limiter before making the request
+        async with limiter:
+            # Make the HTTP request with the session
+            async with session.get(base_url) as response:
+                content = await response.read()
+
+    # Process the content
+    content_str = content.decode('utf-8')
+    root = ET.fromstring(content_str)
+    namespace = {'atom': 'http://www.w3.org/2005/Atom'}
+    entries = root.findall('atom:entry', namespace)
+    grouped = {}
+
+    for entry in entries:
+        url = entry.find('atom:link', namespace).get('href')
+        accession = re.search(r'/(\d{10})-(\d{2})-(\d{6})', url)
+        accession = accession.group(1) + accession.group(2) + accession.group(3)
+        cik = re.search(r'/data/(\d+)/', url).group(1)
+
+        if accession not in grouped:
+            grouped[accession] = {'submission_type': '', 'ciks': set(), 'filing_date': ''}
+
+        grouped[accession]['ciks'].add(cik)
+        grouped[accession]['submission_type'] = entry.find('atom:category', namespace).get('term')
+        summary_text = entry.find('atom:summary', namespace).text
+        filing_date_match = re.search(r'Filed:</b>\s*(\d{4}-\d{2}-\d{2})', summary_text)
+        if filing_date_match:
+            grouped[accession]['filing_date'] = filing_date_match.group(1)
+
+    results = [{'accession': int(k.replace('-', '')), 'submission_type': v['submission_type'], 'ciks': list(v['ciks']), 'filing_date': v['filing_date']} for k, v in grouped.items()]
+    return results
+
+def clean_efts_hits(hits):
+    # clean hits
+    hits = [{'accession': int(hit['_source']['adsh'].replace('-', '')), 'filing_date': hit['_source']['file_date'], 'ciks': hit['_source']['ciks']} for hit in hits]
+    return hits
+
+class Monitor():
+    def __init__(self):
+        self.accessions = deque(maxlen=50000)
+        self.ratelimiters = {'sec.gov': PreciseRateLimiter(rate=5)}
+        self.efts_query = EFTSQuery(quiet=True)
+        self.efts_query.limiter = self.ratelimiters['sec.gov']
+
+    def set_domain_rate_limit(self, domain, rate):
+        self.ratelimiters[domain] = PreciseRateLimiter(rate=rate)
+        if domain == 'sec.gov':
+            self.efts_query.limiter = self.ratelimiters[domain]
+
+    async def _async_run_efts_query(self, **kwargs):
+        """Async helper method to run EFTS query without creating a new event loop"""
+        # Make sure to set quiet parameter if provided in kwargs
+        self.efts_query.quiet = kwargs.get('quiet', True)
+        return await self.efts_query.query(
+            cik=kwargs.get('cik'),
+            submission_type=kwargs.get('submission_type'),
+            filing_date=kwargs.get('filing_date'),
+            location=kwargs.get('location'),
+            callback=kwargs.get('callback'),
+            name=kwargs.get('name')
+        )
+
+    async def _async_monitor_submissions(self, data_callback=None, interval_callback=None,
+                                         polling_interval=1000, quiet=True, start_date=None,
+                                         validation_interval=60000):
+        """
+        Async implementation of monitor_submissions.
+        """
+
+        # Backfill if start_date is provided
+        if start_date is not None:
+            today_date = datetime.now().date().strftime('%Y-%m-%d')
+            if not quiet:
+                print(f"Backfilling from {start_date} to {today_date}")
+
+            hits = clean_efts_hits(await self._async_run_efts_query(
+                filing_date=(start_date, today_date),
+                quiet=quiet
+            ))
+
+            new_hits = self._filter_new_accessions(hits)
+            if not quiet:
+                print(f"New submissions found: {len(new_hits)}")
+            if new_hits and data_callback:
+                data_callback(new_hits)
+
+        last_polling_time = time.time()
+        last_validation_time = last_polling_time
+        current_time = last_polling_time
+
+        while True:
+            # RSS polling
+            if not quiet:
+                print(f"Polling RSS feed")
+            results = await poll_rss(self.ratelimiters['sec.gov'])
+            new_results = self._filter_new_accessions(results)
+            if new_results:
+                if not quiet:
+                    print(f"Found {len(new_results)} new submissions via RSS")
+                if data_callback:
+                    data_callback(new_results)
+
+            # EFTS validation
+            if validation_interval and (current_time - last_validation_time) >= validation_interval/1000:
+                # Get submissions from the last 24 hours for validation
+                today_date = datetime.now().strftime('%Y-%m-%d')
+                if not quiet:
+                    print(f"Validating submissions from {today_date}")
+
+                hits = clean_efts_hits(await self._async_run_efts_query(
+                    filing_date=(today_date, today_date),
+                    quiet=quiet
+                ))
+
+                new_hits = self._filter_new_accessions(hits)
+                if new_hits:
+                    if not quiet:
+                        print(f"Found {len(new_hits)} new submissions via EFTS validation")
+                    if data_callback:
+                        data_callback(new_hits)
+                last_polling_time = time.time()
+                last_validation_time = current_time
+
+            # Interval callback
+            if interval_callback:
+                interval_callback()
+
+            next_poll_time = last_polling_time + (polling_interval / 1000)
+            current_time = time.time()
+            time_to_sleep = max(0, next_poll_time - current_time)
+            await asyncio.sleep(time_to_sleep)
+            last_polling_time = next_poll_time
+
+
+    def monitor_submissions(self, data_callback=None, interval_callback=None,
+                            polling_interval=1000, quiet=True, start_date=None,
+                            validation_interval=60000):
+        """
+        Monitor SEC submissions using the EDGAR system.
+        :param data_callback: function to call with the data
+        :param interval_callback: function that executes between polls
+        :param polling_interval: interval between polls in milliseconds
+        :param quiet: if True, suppresses output
+        :param start_date: backfill start date in YYYY-MM-DD format
+        :param validation_interval: interval between validation in milliseconds
+
+        This function combines the speed of the RSS feed (fast, but misses some submissions) with the accuracy of the EFTS system.
+        """
+        # This is now a synchronous wrapper around the async implementation
+        return asyncio.run(self._async_monitor_submissions(
+            data_callback=data_callback,
+            interval_callback=interval_callback,
+            polling_interval=polling_interval,
+            quiet=quiet,
+            start_date=start_date,
+            validation_interval=validation_interval
+        ))
+
+    def _filter_new_accessions(self, items):
+        """Filter items to only include those with new accession numbers."""
+        new_items = []
+        for item in items:
+            accession = item['accession']
+            if accession not in self.accessions:
+                self.accessions.append(accession)
+                new_items.append(item)
+        return new_items
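
Monitor can also be used on its own, for example to backfill from a start date before live polling begins or to tune the sec.gov rate limit. A sketch grounded in the code above:

    from datamule.sec.submissions.monitor import Monitor

    monitor = Monitor()
    monitor.set_domain_rate_limit('sec.gov', 5)  # requests per second

    monitor.monitor_submissions(
        data_callback=lambda batch: print(f"{len(batch)} new submissions"),
        start_date='2025-01-01',    # one-time EFTS backfill before RSS polling starts
        validation_interval=60000,  # cross-check RSS against EFTS once a minute
        quiet=False,
    )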
@@ -1,8 +1,4 @@
 import asyncio
-import aiohttp
-from datetime import datetime
-from urllib.parse import urlencode
-from tqdm import tqdm
 from .eftsquery import EFTSQuery
 
 class TextSearchEFTSQuery(EFTSQuery):
@@ -2,7 +2,7 @@ import asyncio
 import aiohttp
 import json
 from tqdm import tqdm
-from ..utils import PreciseRateLimiter, RateMonitor, RetryException, headers
+from ..utils import PreciseRateLimiter, RateMonitor, headers
 
 async def fetch_company_facts(session, cik, rate_limiter, rate_monitor, pbar):
     # Format CIK with leading zeros to 10 digits