datamule 1.0.2__py3-none-any.whl → 1.0.6__py3-none-any.whl

Files changed (43)
  1. datamule/__init__.py +2 -13
  2. datamule/document.py +0 -1
  3. datamule/helper.py +85 -105
  4. datamule/portfolio.py +105 -29
  5. datamule/submission.py +0 -38
  6. {datamule-1.0.2.dist-info → datamule-1.0.6.dist-info}/METADATA +2 -8
  7. datamule-1.0.6.dist-info/RECORD +10 -0
  8. datamule/book/__init__.py +0 -0
  9. datamule/book/book.py +0 -34
  10. datamule/book/eftsquery.py +0 -127
  11. datamule/book/xbrl_retriever.py +0 -88
  12. datamule/data/company_former_names.csv +0 -8148
  13. datamule/data/company_metadata.csv +0 -10049
  14. datamule/data/company_tickers.csv +0 -9999
  15. datamule/data/sec-glossary.csv +0 -728
  16. datamule/data/xbrl_descriptions.csv +0 -10024
  17. datamule/downloader/downloader.py +0 -374
  18. datamule/downloader/premiumdownloader.py +0 -335
  19. datamule/mapping_dicts/txt_mapping_dicts.py +0 -232
  20. datamule/mapping_dicts/xml_mapping_dicts.py +0 -19
  21. datamule/monitor.py +0 -238
  22. datamule/mulebot/__init__.py +0 -1
  23. datamule/mulebot/helper.py +0 -35
  24. datamule/mulebot/mulebot.py +0 -130
  25. datamule/mulebot/mulebot_server/__init__.py +0 -1
  26. datamule/mulebot/mulebot_server/server.py +0 -87
  27. datamule/mulebot/mulebot_server/static/css/minimalist.css +0 -174
  28. datamule/mulebot/mulebot_server/static/scripts/artifacts.js +0 -68
  29. datamule/mulebot/mulebot_server/static/scripts/chat.js +0 -92
  30. datamule/mulebot/mulebot_server/static/scripts/filingArtifacts.js +0 -56
  31. datamule/mulebot/mulebot_server/static/scripts/listArtifacts.js +0 -15
  32. datamule/mulebot/mulebot_server/static/scripts/main.js +0 -57
  33. datamule/mulebot/mulebot_server/static/scripts/prefilledPrompt.js +0 -27
  34. datamule/mulebot/mulebot_server/static/scripts/suggestions.js +0 -47
  35. datamule/mulebot/mulebot_server/static/scripts/tableArtifacts.js +0 -129
  36. datamule/mulebot/mulebot_server/static/scripts/utils.js +0 -28
  37. datamule/mulebot/mulebot_server/templates/chat-minimalist.html +0 -91
  38. datamule/mulebot/search.py +0 -52
  39. datamule/mulebot/tools.py +0 -82
  40. datamule/packageupdater.py +0 -207
  41. datamule-1.0.2.dist-info/RECORD +0 -43
  42. {datamule-1.0.2.dist-info → datamule-1.0.6.dist-info}/WHEEL +0 -0
  43. {datamule-1.0.2.dist-info → datamule-1.0.6.dist-info}/top_level.txt +0 -0
datamule/__init__.py CHANGED
@@ -1,12 +1,7 @@
- from .downloader.downloader import Downloader
- from .downloader.premiumdownloader import PremiumDownloader
- from .monitor import Monitor
- from .packageupdater import PackageUpdater
  from .submission import Submission
  from .portfolio import Portfolio
  from .document import Document
- from secsgml import parse_sgml_submission
- from .helper import load_package_csv, load_package_dataset
+ from .helper import _load_package_csv, load_package_dataset
  from .config import Config


@@ -32,16 +27,10 @@ def _setup_notebook_env():
  _setup_notebook_env()

  __all__ = [
-     'Downloader',
-     'PremiumDownloader',
-     'load_package_csv',
+     '_load_package_csv',
      'load_package_dataset',
-     'Filing',
      'Portfolio',
-     'Monitor',
-     'PackageUpdater',
      'Submission',
      'Document',
-     'parse_sgml_submission',
      'Config'
  ]
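
The 1.0.6 wheel trims the public surface down to the six names left in __all__ above. A minimal sketch of what still imports cleanly, assuming the 1.0.6 wheel is installed:

    # Still exported in 1.0.6
    from datamule import Portfolio, Submission, Document, Config

    # Removed in 1.0.6 -- these 1.0.2-era imports should now fail with ImportError:
    # from datamule import Downloader, PremiumDownloader, Monitor, PackageUpdater, parse_sgml_submission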
datamule/document.py CHANGED
@@ -1,6 +1,5 @@
  import json
  import csv
- from .helper import convert_to_dashed_accession
  import re
  from doc2dict import xml2dict, txt2dict, dict2dict
  from doc2dict.mapping import flatten_hierarchy
datamule/helper.py CHANGED
@@ -1,123 +1,103 @@
- import requests
- import os
- from tqdm import tqdm
- import zipfile
- from pkg_resources import resource_filename
+ from functools import lru_cache
  import csv
- import re
+ from pathlib import Path

- # Unused in current implementation.
- def construct_primary_doc_url(cik, accession_number,primary_doc_url):
-     accession_number = accession_number.replace("-", "")
-     return f"https://www.sec.gov/Archives/edgar/data/{cik}/{accession_number}/{primary_doc_url}"
-
- # DONE
- def _download_from_dropbox(url, output_path):
-     headers = {'user-agent': 'Wget/1.16 (linux-gnu)'}
-     r = requests.get(url, stream=True, headers=headers)
-     total_size = int(r.headers.get('content-length', 0))
+ def _load_package_csv(name):
+     """Load CSV files from ~/.datamule/ directory"""
+     data_dir = Path.home() / ".datamule"
+     csv_path = data_dir / f"{name}.csv"

-     with open(output_path, 'wb') as f, tqdm(
-         desc="Downloading " + os.path.basename(output_path),
-         total=total_size,
-         unit='iB',
-         unit_scale=True,
-         unit_divisor=1024,
-     ) as progress_bar:
-         for chunk in r.iter_content(chunk_size=1024):
-             size = f.write(chunk)
-             progress_bar.update(size)
-
-     # Check if the downloaded file is a zip file
-     if zipfile.is_zipfile(output_path):
-         extract_path = os.path.dirname(output_path)
-         with zipfile.ZipFile(output_path, 'r') as zip_ref:
-             for file_info in zip_ref.infolist():
-                 extract_file_path = os.path.join(extract_path, file_info.filename)
-                 with zip_ref.open(file_info) as file_in_zip, \
-                      open(extract_file_path, 'wb') as output_file, \
-                      tqdm(total=file_info.file_size, unit='B', unit_scale=True,
-                           desc=f"Extracting {file_info.filename}") as pbar:
-                     while True:
-                         chunk = file_in_zip.read(8192)
-                         if not chunk:
-                             break
-                         output_file.write(chunk)
-                         pbar.update(len(chunk))
-
-         # Remove the zip file after extraction
-         os.remove(output_path)
-         print(f"Extracted contents to {extract_path}")
-     else:
-         print(f"Downloaded file is not a zip. Saved to {output_path}")
-
- # May generalize to load any package resource
- def load_package_csv(name):
-     """Load package CSV files"""
-     csv_path = resource_filename('datamule', f'data/{name}.csv')
-     company_tickers = []
+     data = []

      with open(csv_path, 'r') as csvfile:
          csv_reader = csv.DictReader(csvfile)
          for row in csv_reader:
-             company_tickers.append(row)
+             data.append(row)

-     return company_tickers
+     return data

  def load_package_dataset(dataset):
-     if dataset == 'company_tickers':
-         return load_package_csv('company_tickers')
-     elif dataset =='company_former_names':
-         return load_package_csv('company_former_names')
-     elif dataset =='company_metadata':
-         return load_package_csv('company_metadata')
-     elif dataset == 'sec_glossary':
-         return load_package_csv('sec-glossary')
-     elif dataset == 'xbrl_descriptions':
-         return load_package_csv('xbrl_descriptions')
+     if dataset =='listed_filer_metadata':
+         return _load_package_csv('listed_filer_metadata')

- # DONE
- def identifier_to_cik(ticker):
-     """Convert company tickers to CIK codes"""
-     company_tickers = load_package_csv('company_tickers')
-     if ticker:
-         if isinstance(ticker, list):
-             cik = []
-             for t in ticker:
-                 cik.extend([company['cik'] for company in company_tickers if t == company['ticker']])
-         else:
-             cik = [company['cik'] for company in company_tickers if ticker == company['ticker']]
-
-     if not cik:
-         raise ValueError("No matching companies found")
-
-     return cik
+ @lru_cache(maxsize=128)
+ def get_cik_from_dataset(dataset_name, key, value):
+     dataset = load_package_dataset(dataset_name)
+
+     if dataset_name == 'listed_filer_metadata' and key == 'ticker':
+         key = 'tickers'
+
+     result = []
+     for company in dataset:
+         if key in ['tickers', 'exchanges'] and dataset_name == 'listed_filer_metadata':
+             # Parse the string representation of list into an actual list
+             list_values = [i.strip() for i in company[key][1:-1].replace("'", "").replace('"', '').split(',')]
+             if str(value) in list_values:
+                 result.append(company['cik'])
+         elif str(value) == company[key]:
+             result.append(company['cik'])
+
+     return result


- def fix_filing_url(url):
-     match_suffix = re.search(r'/(\d{4})\.(.+?)$', url)
-     if match_suffix:
-         suffix_number = match_suffix.group(1)
-         file_ext = match_suffix.group(2)
-         match_accession = re.search(r'/(\d{18})/', url)
-         if match_accession:
-             accession_number = match_accession.group(1)
-             formatted_accession_number = f"{accession_number[:10]}-{accession_number[10:12]}-{accession_number[12:]}"
-             new_url = url.rsplit('/', 1)[0] + f'/{formatted_accession_number}-{suffix_number}.{file_ext}'
-             return new_url
-     return url

- def convert_to_dashed_accession(accession):
-     # Remove any existing dashes or whitespace
-     cleaned = ''.join(accession.split())
+ @lru_cache(maxsize=128)
+ def get_ciks_from_metadata_filters(**kwargs):
+     """Get CIKs from listed_filer_metadata.csv that match all provided filters."""

-     # Check if the cleaned string has 18 characters
-     if len(cleaned) != 18:
-         raise ValueError("Invalid accession number format. Expected 18 characters.")
+     # Start with None to get all CIKs from first filter
+     result_ciks = None

-     # Insert dashes at the correct positions
-     dashed = f"{cleaned[:10]}-{cleaned[10:12]}-{cleaned[12:]}"
+     # For each filter, get matching CIKs and keep intersection
+     for key, value in kwargs.items():
+         # Get CIKs for this filter
+         ciks = get_cik_from_dataset('listed_filer_metadata', key, value)
+         ciks = [int(cik) for cik in ciks]
+
+         # If this is the first filter, set as initial result
+         if result_ciks is None:
+             result_ciks = set(ciks)
+         # Otherwise, take intersection with previous results
+         else:
+             result_ciks &= set(ciks)
+
+         # If no matches left, we can exit early
+         if not result_ciks:
+             return []

-     return dashed
+     return list(result_ciks)
+
+
+ def _process_cik_and_metadata_filters(cik=None, ticker=None, **kwargs):
+     """
+     Helper method to process CIK, ticker, and metadata filters.
+     Returns a list of CIKs after processing.
+     """
+     # Input validation
+     if cik is not None and ticker is not None:
+         raise ValueError("Only one of cik or ticker should be provided, not both.")
+
+     # Convert ticker to CIK if provided
+     if ticker is not None:
+         cik = get_cik_from_dataset('listed_filer_metadata', 'ticker', ticker)
+
+     # Normalize CIK format
+     if cik is not None:
+         if isinstance(cik, str):
+             cik = [int(cik)]
+         elif isinstance(cik, int):
+             cik = [cik]
+         elif isinstance(cik, list):
+             cik = [int(x) for x in cik]
+
+     # Process metadata filters if provided
+     if kwargs:
+         metadata_ciks = get_ciks_from_metadata_filters(**kwargs)

- headers = {'User-Agent': 'John Smith johnsmith@gmail.com'}
+         if cik is not None:
+             cik = list(set(cik).intersection(metadata_ciks))
+         else:
+             cik = metadata_ciks
+
+     return cik
+
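
Taken together, the new helpers replace the bundled data CSVs with a single lookup against ~/.datamule/listed_filer_metadata.csv. A hedged usage sketch of _process_cik_and_metadata_filters, based only on the signatures above; the ticker value and the 'sic' column used as a metadata filter are illustrative assumptions, not confirmed column names:

    from datamule.helper import _process_cik_and_metadata_filters

    # Ticker is resolved to CIKs via the 'tickers' column of listed_filer_metadata.csv
    ciks = _process_cik_and_metadata_filters(ticker="AAPL")   # illustrative ticker

    # Extra keyword arguments are treated as metadata filters and intersected;
    # 'sic' is an assumed column name here, purely for illustration.
    ciks = _process_cik_and_metadata_filters(sic="7372")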
datamule/portfolio.py CHANGED
@@ -2,19 +2,29 @@ from pathlib import Path
  from tqdm import tqdm
  from concurrent.futures import ThreadPoolExecutor
  from .submission import Submission
- from .downloader.premiumdownloader import PremiumDownloader
- from .downloader.downloader import Downloader
+ from .sec.submissions.downloader import download as sec_download
+ from .sec.submissions.textsearch import filter_text
  from .config import Config
  import os
+ from .helper import _process_cik_and_metadata_filters
+ from .seclibrary.downloader import download as seclibrary_download
+ from .sec.xbrl.filter_xbrl import filter_xbrl
+ from .sec.submissions.monitor import monitor
+ from .sec.xbrl.xbrlmonitor import XBRLMonitor
+

  class Portfolio:
      def __init__(self, path):
          self.path = Path(path)
          self.submissions = []
+         self.submissions_loaded = False
          self.MAX_WORKERS = os.cpu_count() - 1

          if self.path.exists():
              self._load_submissions()
+             self.submissions_loaded = True
+         else:
+             self.path.mkdir(parents=True, exist_ok=True)

      def _load_submissions(self):
          folders = [f for f in self.path.iterdir() if f.is_dir()]
@@ -40,6 +50,8 @@ class Portfolio:

      def process_submissions(self, callback):
          """Process all submissions using a thread pool."""
+         if not self.submissions_loaded:
+             self._load_submissions()
          with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
              results = list(tqdm(
                  executor.map(callback, self.submissions),
@@ -50,6 +62,9 @@ class Portfolio:

      def process_documents(self, callback):
          """Process all documents using a thread pool."""
+         if not self.submissions_loaded:
+             self._load_submissions()
+
          documents = [doc for sub in self.submissions for doc in sub]

          with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
@@ -59,48 +74,109 @@ class Portfolio:
                  desc="Processing documents"
              ))
          return results
+
+     def filter_text(self, text_query, cik=None, ticker=None, submission_type=None, filing_date=None, **kwargs):
+         """
+         Filter text based on query and various parameters.
+         When called multiple times, takes the intersection of results.
+         Now supports metadata filters through kwargs.
+         """
+         # Process CIK and metadata filters
+         cik = _process_cik_and_metadata_filters(cik, ticker, **kwargs)
+
+         # Call the filter_text function with processed parameters
+         new_accession_numbers = filter_text(
+             text_query=text_query,
+             cik=cik,
+             submission_type=submission_type,
+             filing_date=filing_date
+         )
+
+         # If we already have accession numbers, take the intersection
+         if hasattr(self, 'accession_numbers') and self.accession_numbers:
+             self.accession_numbers = list(set(self.accession_numbers).intersection(new_accession_numbers))
+         else:
+             # First query, just set the accession numbers
+             self.accession_numbers = new_accession_numbers
+
+     def filter_xbrl(self, taxonomy, concept, unit, period, logic, value):
+         """
+         Filter XBRL data based on logic and value.
+         """
+         new_accession_numbers = filter_xbrl(
+             taxonomy=taxonomy,
+             concept=concept,
+             unit=unit,
+             period=period,
+             logic=logic,
+             value=value
+         )
+
+         # If we already have accession numbers, take the intersection
+         if hasattr(self, 'accession_numbers') and self.accession_numbers:
+             self.accession_numbers = list(set(self.accession_numbers).intersection(new_accession_numbers))
+         else:
+             # First query, just set the accession numbers
+             self.accession_numbers = new_accession_numbers

-     def download_submissions(self, cik=None, ticker=None, submission_type=None, filing_date=None, provider=None):
+     def download_submissions(self, cik=None, ticker=None, submission_type=None, filing_date=None, provider=None, **kwargs):
          if provider is None:
              config = Config()
              provider = config.get_default_source()

-         downloader = PremiumDownloader() if provider == 'datamule' else Downloader()
-         downloader.download_submissions(
-             output_dir=self.path,
+         # Process CIK and metadata filters
+         cik = _process_cik_and_metadata_filters(cik, ticker, **kwargs)
+
+         if provider == 'datamule':
+
+             seclibrary_download(
+                 output_dir=self.path,
+                 cik=cik,
+                 submission_type=submission_type,
+                 filing_date=filing_date,
+                 accession_numbers=self.accession_numbers if hasattr(self, 'accession_numbers') else None
+             )
+         else:
+             sec_download(
+                 output_dir=self.path,
+                 cik=cik,
+                 submission_type=submission_type,
+                 filing_date=filing_date,
+                 requests_per_second=5, # Revisit this later.
+                 accession_numbers=self.accession_numbers if hasattr(self, 'accession_numbers') else None
+             )
+
+         self.submissions_loaded = False
+     def monitor_submissions(self,data_callback=None, poll_callback=None, submission_type=None, cik=None,
+                             polling_interval=200, requests_per_second=5, quiet=False, start_date=None, ticker=None, **kwargs):
+
+         cik = _process_cik_and_metadata_filters(cik, ticker, **kwargs)
+
+         monitor(
+             data_callback=data_callback,
+             poll_callback=poll_callback,
              cik=cik,
-             ticker=ticker,
              submission_type=submission_type,
-             filing_date=filing_date
+             polling_interval=polling_interval,
+             requests_per_second=requests_per_second,
+             quiet=quiet,
+             start_date=start_date
          )
-
-         # Reload submissions after download
-         self._load_submissions()

+
+
+
      def __iter__(self):
+         if not self.submissions_loaded:
+             self._load_submissions()
          return iter(self.submissions)

      def document_type(self, document_types):
          """Filter documents by type(s)."""
+         if not self.submissions_loaded:
+             self._load_submissions()
          if isinstance(document_types, str):
              document_types = [document_types]

          for submission in self.submissions:
-             yield from submission.document_type(document_types)
-
-     def contains_string(self, pattern, document_types=None):
-         """Search for pattern in documents, with optional type filter."""
-         def check_document(document):
-             return document if document.contains_string(pattern) else None
-
-         # Get documents, filtered by type if specified
-         documents = list(self.document_type(document_types)) if document_types else [
-             doc for sub in self.submissions for doc in sub
-         ]
-
-         with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
-             results = executor.map(check_document, documents)
-
-         for doc in tqdm(results, total=len(documents), desc=f"Searching for '{pattern}'"):
-             if doc is not None:
-                 yield doc
+             yield from submission.document_type(document_types)
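
For review purposes, a hedged end-to-end sketch of the reworked Portfolio flow, based only on the method signatures added above; the directory name, query strings, dates, and XBRL parameter values are illustrative assumptions rather than documented formats:

    from datamule import Portfolio

    portfolio = Portfolio("filings")   # directory is created if it does not already exist

    # Each filter_* call narrows self.accession_numbers; repeated calls intersect.
    portfolio.filter_text("climate risk", submission_type="10-K",
                          filing_date=("2023-01-01", "2023-12-31"))
    portfolio.filter_xbrl(taxonomy="us-gaap", concept="ResearchAndDevelopmentExpense",
                          unit="USD", period="CY2023", logic=">", value=0)

    # Downloads honor the accumulated accession-number filter; provider falls back
    # to Config's default source when not given.
    portfolio.download_submissions(ticker="MSFT", submission_type="10-K")

    # Submissions are lazily reloaded on first use after a download.
    for submission in portfolio:
        for document in submission.document_type("10-K"):
            pass  # process each matching document here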
datamule/submission.py CHANGED
@@ -11,44 +11,6 @@ class Submission:
          metadata_path = self.path / 'metadata.json'
          with metadata_path.open('r') as f:
              self.metadata = json.load(f)
-
-     def keep(self, document_types):
-         """Keep files of specified document types, delete others
-         Args:
-             document_types: string or list of strings representing document types to keep
-         """
-         # Convert single string to list for consistent handling
-         if isinstance(document_types, str):
-             document_types = [document_types]
-
-         for doc in self.metadata['documents']:
-             filename = doc.get('filename')
-             if filename is None:
-                 continue
-
-             filepath = self.path / filename
-             # Delete if document type isn't in our keep list
-             if doc['type'] not in document_types and filepath.exists():
-                 filepath.unlink()
-
-     def drop(self, document_types):
-         """Delete files of specified document types, keep others
-         Args:
-             document_types: string or list of strings representing document types to drop
-         """
-         # Convert single string to list for consistent handling
-         if isinstance(document_types, str):
-             document_types = [document_types]
-
-         for doc in self.metadata['documents']:
-             filename = doc.get('filename')
-             if filename is None:
-                 continue
-
-             filepath = self.path / filename
-             # Delete if document type is in our drop list
-             if doc['type'] in document_types and filepath.exists():
-                 filepath.unlink()

      def document_type(self, document_type):
          # Convert single document type to list for consistent handling
{datamule-1.0.2.dist-info → datamule-1.0.6.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: datamule
- Version: 1.0.2
+ Version: 1.0.6
  Summary: Making it easier to use SEC filings.
  Home-page: https://github.com/john-friedman/datamule-python
  Author: John Friedman
@@ -17,11 +17,5 @@ Requires-Dist: pytz
  Requires-Dist: zstandard
  Requires-Dist: doc2dict
  Requires-Dist: secsgml
- Provides-Extra: all
- Requires-Dist: openai; extra == "all"
- Requires-Dist: flask; extra == "all"
- Provides-Extra: mulebot
- Requires-Dist: openai; extra == "mulebot"
- Provides-Extra: mulebot_server
- Requires-Dist: flask; extra == "mulebot-server"
+ Requires-Dist: lxml

datamule-1.0.6.dist-info/RECORD ADDED
@@ -0,0 +1,10 @@
+ datamule/__init__.py,sha256=0npnB3i2F7YB7etG315oDiCd-eMo-A6MP5LX2gQclHY,914
+ datamule/config.py,sha256=Y--CVv7JcgrjJkMOSLrvm2S8B9ost6RMSkGviP-MKtg,883
+ datamule/document.py,sha256=uohyX7pt_nSHOS1y02fOuwjqYewKD9HgIdBwCtOlKx8,10864
+ datamule/helper.py,sha256=xgOVnea-lUlQ5I-U0vYUp0VeKPNZehNhqjJvegA3lYE,3342
+ datamule/portfolio.py,sha256=JmZlTrom_g7FXKXxWp_CiQTyC7p6_cDP08G0kFUja48,6982
+ datamule/submission.py,sha256=JsxYlEz1Ywu6eC32OS15p4p-p8qB6SWd_rXuf2p5UfY,1247
+ datamule-1.0.6.dist-info/METADATA,sha256=n53ZBeKhntC3jX6su9jbKPr9WxSohgOvvLC7sIbYwhk,512
+ datamule-1.0.6.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+ datamule-1.0.6.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
+ datamule-1.0.6.dist-info/RECORD,,
datamule/book/__init__.py DELETED
File without changes
datamule/book/book.py DELETED
@@ -1,34 +0,0 @@
- # Streams data rather than downloading it.
- # additional functionality such as query by xbrl, and other db
- # also this is basically our experimental rework of portfolio w/o disturbing existing users
- # this is highly experimental and may not work as expected
- # only for datamule source
- # likely new bottleneck will be local parsing() - will be bypassed in future when we have parsed archive
- # wow parsed archive is going to be crazy fast - like every 10k in 1 minute.
-
- # example queries filter by sic = 7372, xbrl query = dei:operatingprofit > 0 in date range 2018-2019
-
- # hmm do we go for sql esq or not.
- # I think we do.
- # i think we remove cik, ticker, sic, etc and just have a query object
- # should be sql esq so users can use it easily w/o learnign new syntax
-
- # WHERE submission_type = '10-K'
- # AND us-gaap:ResearchAndDevelopmentExpense > 0
- # AND dei:debt_to_equity < 2
- # AND filing_date BETWEEN '2023-01-01' AND '2023-12-31'
- # AND CIK in (123, 456, 789)
- # AND SIC in (123, 456, 789)
- # AND ticker in ('AAPL', 'GOOGL', 'AMZN')
- # AND document_type = 'EX-99.1' # to select attachments
-
- from .eftsquery import EFTSQuery
-
-
- class Book():
-     def process_submissions(self,cik,ticker,sic,submission_type,document_type,date,
-                             xbrl_query={},
-                             metadata_callback=None,
-                             document_callback=None,):
-         # grabs data and processes it
-         pass
datamule/book/eftsquery.py DELETED
@@ -1,127 +0,0 @@
- import asyncio
- import aiohttp
- from tqdm import tqdm
- from datetime import datetime
- from urllib.parse import urlencode
- import time
-
- class PreciseRateLimiter:
-     def __init__(self, rate=10, interval=1.0):
-         self.rate = rate  # requests per interval
-         self.interval = interval  # in seconds
-         self.token_time = self.interval / self.rate  # time per token
-         self.last_time = time.time()
-         self.lock = asyncio.Lock()
-
-     async def acquire(self):
-         async with self.lock:
-             now = time.time()
-             wait_time = self.last_time + self.token_time - now
-             if wait_time > 0:
-                 await asyncio.sleep(wait_time)
-             self.last_time = time.time()
-             return True
-
- class EFTSQuery:
-     def __init__(self):
-         self.headers = {
-             'User-Agent': 'Your Name yourname@email.com',
-             'Accept-Encoding': 'gzip, deflate',
-             'Host': 'efts.sec.gov'
-         }
-         self.session = None
-         self.limiter = PreciseRateLimiter(10)
-
-     async def __aenter__(self):
-         if not self.session:
-             self.session = aiohttp.ClientSession(headers=self.headers)
-         return self
-
-     async def __aexit__(self, exc_type, exc_val, exc_tb):
-         if self.session:
-             await self.session.close()
-             self.session = None
-
-     async def _fetch_json(self, url):
-         await self.limiter.acquire()
-         try:
-             async with self.session.get(url) as response:
-                 if response.status == 429:
-                     await asyncio.sleep(61)
-                     return await self._fetch_json(url)
-                 return await response.json()
-         except Exception as e:
-             print(f"Error fetching {url}: {str(e)}")
-             return None
-
-     async def _get_accession_numbers(self, base_url):
-         data = await self._fetch_json(f"{base_url}&from=0&size=1")
-         if not data or 'hits' not in data:
-             return []
-
-         total_hits = data['hits']['total']['value']
-         if not total_hits:
-             return []
-
-         accession_numbers = []
-         start = 0
-         page_size = 100
-         batch_size = 10  # Number of concurrent requests
-
-         with tqdm(total=total_hits) as pbar:
-             while start < total_hits:
-                 tasks = []
-                 for i in range(batch_size):
-                     if start + i * page_size >= total_hits:
-                         break
-                     url = f"{base_url}&from={start + i * page_size}&size={page_size}"
-                     tasks.append(self._fetch_json(url))
-
-                 if not tasks:
-                     break
-
-                 results = await asyncio.gather(*tasks)
-
-                 for data in results:
-                     if data and 'hits' in data:
-                         hits = data['hits']['hits']
-                         batch_numbers = [
-                             f"{hit['_source']['ciks'][0]}/{hit['_id'].split(':')[0]}"
-                             for hit in hits
-                         ]
-                         accession_numbers.extend(batch_numbers)
-                         pbar.update(len(hits))
-
-                 start += batch_size * page_size
-
-         return accession_numbers
-
-     def query_efts(self, cik=None, ticker=None, submission_type=None, filing_date=None, search_text=None):
-         async def _download():
-             async with self as downloader:
-                 params = {}
-
-                 if cik:
-                     params['ciks'] = str(cik).zfill(10)
-
-                 if submission_type:
-                     params['forms'] = ','.join(submission_type) if isinstance(submission_type, list) else submission_type
-
-                 if isinstance(filing_date, list):
-                     dates = [(d, d) for d in filing_date]
-                 elif isinstance(filing_date, tuple):
-                     dates = [filing_date]
-                 else:
-                     date_str = filing_date if filing_date else f"2001-01-01,{datetime.now().strftime('%Y-%m-%d')}"
-                     start, end = date_str.split(',')
-                     dates = [(start, end)]
-
-                 params['startdt'], params['enddt'] = dates[0]
-
-                 if search_text:
-                     params['q'] = f'"{search_text}"'
-
-                 base_url = f"https://efts.sec.gov/LATEST/search-index?{urlencode(params, doseq=True)}"
-                 return await self._get_accession_numbers(base_url)
-
-         return asyncio.run(_download())
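
For reference, the removed EFTSQuery was driven through its query_efts wrapper shown above; a hedged reconstruction of that pre-1.0.6 usage, with an illustrative CIK, form type, date range, and search phrase (the class no longer ships in 1.0.6):

    from datamule.book.eftsquery import EFTSQuery   # 1.0.2 only; removed in 1.0.6

    query = EFTSQuery()
    accession_numbers = query.query_efts(
        cik=320193,                                  # illustrative CIK, zero-padded internally
        submission_type="10-K",
        filing_date=("2023-01-01", "2023-12-31"),    # tuple form: (start, end)
        search_text="supply chain",
    )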