datamule 1.0.8__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,228 @@
+ import asyncio
+ from urllib.parse import urlencode
+ from tqdm import tqdm
+ import re
+
+ from .eftsquery import EFTSQuery
+
+
+ # This is to fix some broken SEC URLs. There's a better way to do this, but this is a quick fix.
+ def fix_filing_url(url):
+     match_suffix = re.search(r'/(\d{4})\.(.+?)$', url)
+     if match_suffix:
+         suffix_number = match_suffix.group(1)
+         file_ext = match_suffix.group(2)
+         match_accession = re.search(r'/(\d{18})/', url)
+         if match_accession:
+             accession_number = match_accession.group(1)
+             formatted_accession_number = f"{accession_number[:10]}-{accession_number[10:12]}-{accession_number[12:]}"
+             new_url = url.rsplit('/', 1)[0] + f'/{formatted_accession_number}-{suffix_number}.{file_ext}'
+             return new_url
+     return url
+
+ class Streamer(EFTSQuery):
+     def __init__(self, requests_per_second=5.0, document_callback=None, accession_numbers=None):
+         super().__init__(requests_per_second=requests_per_second)
+         self.document_callback = document_callback
+         self.document_queue = asyncio.Queue()
+         self.download_in_progress = asyncio.Event()
+         self.query_paused = asyncio.Event()
+         self.document_pbar = None
+         self.document_workers = []
+         self.documents_processed = 0
+         self.total_documents = 0
+         self.accession_numbers = accession_numbers
+         self.skipped_documents = 0
+
+     async def _fetch_worker(self):
+         """Override the parent class worker to implement pause/resume"""
+         while True:
+             try:
+                 # Check if we should pause for document downloads.
+                 # Event.wait() returns immediately while the event is set, so
+                 # poll until document_download_callback clears query_paused.
+                 while self.query_paused.is_set():
+                     await asyncio.sleep(0.1)
+
+                 params, from_val, size_val, callback = await self.fetch_queue.get()
+
+                 url = f"{self.base_url}?{urlencode(params, doseq=True)}&from={from_val}&size={size_val}"
+
+                 try:
+                     data = await self._fetch_json(url)
+                     if 'hits' in data:
+                         hits = data['hits']['hits']
+                         if self.pbar:
+                             self.pbar.update(len(hits))
+                         if callback:
+                             await callback(hits)
+                     self.fetch_queue.task_done()
+                 except Exception as e:
+                     print(f"\nError fetching {url}: {str(e)}")
+                     self.fetch_queue.task_done()
+             except asyncio.CancelledError:
+                 break
+             except Exception as e:
+                 print(f"\nWorker error: {str(e)}")
+                 self.fetch_queue.task_done()
+
+     def _construct_submission_url(self, hit):
+         """Construct the URL for retrieving the actual submission"""
+         try:
+             # Extract CIK from the hit
+             cik = hit['_source']['ciks'][0]
+
+             # Extract accession number from _id (format: accno:file.txt)
+             accno_w_dash = hit['_id'].split(':')[0]
+             accno_no_dash = accno_w_dash.replace('-', '')
+
+             # Check if we should filter by accession numbers
+             if self.accession_numbers is not None and accno_w_dash not in self.accession_numbers:
+                 return None, None, None
+
+             # Construct the URL
+             url = f"https://www.sec.gov/Archives/edgar/data/{cik}/{accno_no_dash}/{accno_w_dash}.txt"
+             url = fix_filing_url(url)
+
+             return url, cik, accno_w_dash
+         except (KeyError, IndexError) as e:
+             print(f"Error constructing URL for hit: {hit}. Error: {str(e)}")
+             return None, None, None
+
+     async def _document_download_worker(self):
+         """Worker to download actual filing documents"""
+         while True:
+             try:
+                 hit, doc_url, cik, accno = await self.document_queue.get()
+
+                 try:
+                     # Use the same rate limiter as the EFTS queries
+                     async with self.limiter:
+                         async with self.session.get(doc_url) as response:
+                             response.raise_for_status()
+                             content = await response.read()
+
+                             # Update rate monitor
+                             await self.rate_monitor.add_request(len(content))
+
+                             # Call document callback with content in memory
+                             if self.document_callback:
+                                 await self.document_callback(hit, content, cik, accno, doc_url)
+
+                             # Update progress bar
+                             if self.document_pbar:
+                                 self.document_pbar.update(1)
+                             self.documents_processed += 1
+
+                     self.document_queue.task_done()
+                 except Exception as e:
+                     print(f"\nError streaming document {doc_url}: {str(e)}")
+                     self.document_queue.task_done()
+
+             except asyncio.CancelledError:
+                 break
+             except Exception as e:
+                 print(f"\nDocument worker error: {str(e)}")
+                 self.document_queue.task_done()
+
+     async def document_download_callback(self, hits):
+         """Callback to process EFTS query results and stream submissions"""
+         # Pause the EFTS query processing
+         self.query_paused.set()
+
+         # Signal that document download is in progress
+         self.download_in_progress.set()
+
+         # Create progress bar for documents if not exists
+         if not self.document_pbar:
+             self.document_pbar = tqdm(total=0, desc="Streaming submissions")
+
+         # Queue up the documents for download
+         for hit in hits:
+             doc_url, cik, accno = self._construct_submission_url(hit)
+             if doc_url:
+                 # Update document progress bar total
+                 self.document_pbar.total += 1
+                 self.total_documents += 1
+
+                 # Add to download queue
+                 await self.document_queue.put((hit, doc_url, cik, accno))
+             elif accno is None and self.accession_numbers is not None:
+                 # Document was skipped due to accession number filter
+                 self.skipped_documents += 1
+
+         # Wait for all documents to be downloaded
+         await self.document_queue.join()
+
+         # Resume EFTS query processing
+         self.query_paused.clear()
+
+         # Signal that document download is complete
+         self.download_in_progress.clear()
+
+     async def stream(self, cik=None, submission_type=None, filing_date=None):
+         """Main method to stream EFTS results and download documents"""
+         # Create document worker tasks
+         self.document_workers = [
+             asyncio.create_task(self._document_download_worker())
+             for _ in range(5)  # Same number as query workers
+         ]
+
+         # Reset counters
+         self.documents_processed = 0
+         self.total_documents = 0
+         self.skipped_documents = 0
+
+         # Run the main query with our document download callback
+         results = await self.query(cik, submission_type, filing_date, self.document_download_callback)
+
+         # Make sure all document downloads are complete
+         if self.download_in_progress.is_set():
+             print("Waiting for remaining document downloads to complete...")
+             await self.document_queue.join()
+
+         # Clean up document workers
+         for worker in self.document_workers:
+             worker.cancel()
+
+         await asyncio.gather(*self.document_workers, return_exceptions=True)
+
+         # Close document progress bar and don't show a new one
+         if self.document_pbar:
+             self.document_pbar.close()
+             self.document_pbar = None  # Set to None to prevent reuse
+
+         print(f"\n--- Streaming complete: {len(results)} EFTS results processed ---")
+         if self.accession_numbers is not None:
+             print(f"--- {self.documents_processed} documents downloaded, {self.skipped_documents} skipped due to accession number filter ---")
+         return results
+
+ def stream(cik=None, submission_type=None, filing_date=None,
+            requests_per_second=5.0, document_callback=None, accession_numbers=None):
+     """
+     Stream EFTS results and download documents into memory.
+
+     Parameters:
+     - cik: CIK number(s) to query for
+     - submission_type: Filing type(s) to query for
+     - filing_date: Date or date range to query for
+     - requests_per_second: Rate limit for SEC requests (combined EFTS and document downloads)
+     - document_callback: Callback function that receives (hit, content, cik, accno, url)
+     - accession_numbers: Optional list of accession numbers to filter by
+
+     Returns:
+     - List of all EFTS hits processed
+     """
+
+     # Reject an explicitly empty accession number filter
+     if accession_numbers == []:
+         raise ValueError("Applied filter resulted in empty accession numbers list")
+
+     async def run_stream():
+         streamer = Streamer(
+             requests_per_second=requests_per_second,
+             document_callback=document_callback,
+             accession_numbers=accession_numbers
+         )
+         return await streamer.stream(cik, submission_type, filing_date)
+
+     return asyncio.run(run_stream())
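For context, a minimal usage sketch of the module-level stream() helper above (not part of the package diff). The callback signature (hit, content, cik, accno, url) follows the docstring; the import of stream is omitted because this hunk does not show the file's path inside the package, and the CIK, form type, and dates are arbitrary examples.

    async def save_submission(hit, content, cik, accno, url):
        # content is the raw submission held in memory; persist it to disk
        with open(f"{accno}.txt", "wb") as f:
            f.write(content)

    results = stream(
        cik="320193",                      # arbitrary example CIK
        submission_type="10-K",
        filing_date=("2020-01-01", "2023-12-31"),
        requests_per_second=5.0,
        document_callback=save_submission,
    )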
@@ -0,0 +1,122 @@
+ import asyncio
+ import aiohttp
+ from datetime import datetime
+ from urllib.parse import urlencode
+ from tqdm import tqdm
+ from .eftsquery import EFTSQuery
+
+ class TextSearchEFTSQuery(EFTSQuery):
+     """
+     Extended EFTSQuery class that adds text search capabilities.
+     """
+     def __init__(self, text_query, requests_per_second=5.0):
+         super().__init__(requests_per_second=requests_per_second)
+         self.text_query = text_query
+
+     def _prepare_params(self, cik=None, submission_type=None, filing_date=None):
+         # Get base parameters from parent class
+         params = super()._prepare_params(cik, submission_type, filing_date)
+
+         # Add text query parameter
+         params['q'] = self.text_query
+
+         return params
+
+ async def extract_accession_numbers(hits):
+     """
+     Extract accession numbers from hits.
+
+     Parameters:
+     -----------
+     hits : list
+         List of hit objects from the EFTS API.
+
+     Returns:
+     --------
+     list
+         List of accession numbers extracted from the hits.
+     """
+     accession_numbers = []
+     for hit in hits:
+         if '_id' in hit:
+             # Extract accession number (part before the colon)
+             doc_id = hit['_id']
+             if ':' in doc_id:
+                 acc_no = doc_id.split(':')[0]
+                 accession_numbers.append(acc_no)
+     return accession_numbers
+
+ def query(text_query, cik=None, submission_type=None, filing_date=None, requests_per_second=5.0):
+     """
+     Search SEC filings for text and return the full search results.
+
+     Parameters:
+     -----------
+     text_query : str
+         The text to search for in filings. To search for an exact phrase, use double quotes.
+         Example: 'covid' or '"climate change"'
+     cik : str, list, optional
+         CIK number(s) to filter by. Will be zero-padded to 10 digits.
+     submission_type : str, list, optional
+         Filing type(s) to filter by (e.g., '10-K', '10-Q').
+         Defaults to '-0' for primary documents only.
+     filing_date : str, tuple, list, optional
+         Date or date range to filter by. Can be a single date string ('YYYY-MM-DD'),
+         a tuple of (start_date, end_date), or a list of dates.
+     requests_per_second : float, optional
+         Maximum number of requests per second to make to the SEC API.
+         Default is 5.0.
+
+     Returns:
+     --------
+     list
+         Complete search results with all hit data.
+     """
+     async def run_query():
+         query = TextSearchEFTSQuery(text_query, requests_per_second=requests_per_second)
+         return await query.query(cik, submission_type, filing_date)
+
+     return asyncio.run(run_query())
+
+ def filter_text(text_query, cik=None, submission_type=None, filing_date=None, requests_per_second=5.0):
+     """
+     Search SEC filings for text and return matching accession numbers.
+
+     Parameters:
+     -----------
+     text_query : str
+         The text to search for in filings. To search for an exact phrase, use double quotes.
+         Example: 'covid' or '"climate change"'
+     cik : str, list, optional
+         CIK number(s) to filter by. Will be zero-padded to 10 digits.
+     submission_type : str, list, optional
+         Filing type(s) to filter by (e.g., '10-K', '10-Q').
+         Defaults to '-0' for primary documents only.
+     filing_date : str, tuple, list, optional
+         Date or date range to filter by. Can be a single date string ('YYYY-MM-DD'),
+         a tuple of (start_date, end_date), or a list of dates.
+     requests_per_second : float, optional
+         Maximum number of requests per second to make to the SEC API.
+         Default is 5.0.
+
+     Returns:
+     --------
+     list
+         List of accession numbers (as strings) for filings that match the text query.
+     """
+     async def run_query():
+         query_obj = TextSearchEFTSQuery(text_query, requests_per_second=requests_per_second)
+
+         # Create a collector for accession numbers
+         all_acc_nos = []
+
+         async def collect_acc_nos(hits):
+             acc_nos = await extract_accession_numbers(hits)
+             all_acc_nos.extend(acc_nos)
+
+         # Run the query with our callback
+         await query_obj.query(cik, submission_type, filing_date, collect_acc_nos)
+
+         return all_acc_nos
+
+     return asyncio.run(run_query())
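A usage sketch for the two public helpers above (not part of the package diff; imports are omitted because this hunk does not show the file's path). Note that filter_text() returns dashed accession numbers, the same format the Streamer hunk earlier in this diff accepts through its accession_numbers filter.

    # Full hit objects for an exact-phrase search
    hits = query('"climate change"', submission_type="10-K",
                 filing_date=("2021-01-01", "2021-12-31"))

    # The same search reduced to accession numbers, e.g. '0001234567-21-000001'
    acc_nos = filter_text('"climate change"', submission_type="10-K",
                          filing_date=("2021-01-01", "2021-12-31"))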
datamule/sec/utils.py ADDED
@@ -0,0 +1,64 @@
+ import asyncio
+ import time
+ from collections import deque
+
+
+ class RetryException(Exception):
+     def __init__(self, url, retry_after=601):  # SEC rate limit is typically 10 minutes.
+         self.url = url
+         self.retry_after = retry_after
+
+ class PreciseRateLimiter:
+     def __init__(self, rate, interval=1.0):
+         self.rate = rate  # requests per interval
+         self.interval = interval  # in seconds
+         self.token_time = self.interval / self.rate  # time per token
+         self.last_time = time.time()
+         self.lock = asyncio.Lock()
+
+     async def acquire(self):
+         async with self.lock:
+             now = time.time()
+             wait_time = self.last_time + self.token_time - now
+             if wait_time > 0:
+                 await asyncio.sleep(wait_time)
+             self.last_time = time.time()
+             return True
+
+     async def __aenter__(self):
+         await self.acquire()
+         return self
+
+     async def __aexit__(self, exc_type, exc, tb):
+         pass
+
+ class RateMonitor:
+     def __init__(self, window_size=1.0):
+         self.window_size = window_size
+         self.requests = deque()
+         self._lock = asyncio.Lock()
+
+     async def add_request(self, size_bytes):
+         async with self._lock:
+             now = time.time()
+             self.requests.append((now, size_bytes))
+             while self.requests and self.requests[0][0] < now - self.window_size:
+                 self.requests.popleft()
+
+     def get_current_rates(self):
+         now = time.time()
+         while self.requests and self.requests[0][0] < now - self.window_size:
+             self.requests.popleft()
+
+         if not self.requests:
+             return 0, 0
+
+         request_count = len(self.requests)
+         byte_count = sum(size for _, size in self.requests)
+
+         requests_per_second = request_count / self.window_size
+         mb_per_second = (byte_count / 1024 / 1024) / self.window_size
+
+         return round(requests_per_second, 1), round(mb_per_second, 2)
+
+ headers = {'User-Agent': 'John Smith johnsmith@gmail.com'}
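A sketch of how these helpers compose (not part of the package diff). It assumes aiohttp as the HTTP client, matching its use elsewhere in this release, and that the module is importable as datamule.sec.utils per the file path above; the URL is only an example.

    import asyncio
    import aiohttp
    from datamule.sec.utils import PreciseRateLimiter, RateMonitor, headers

    async def fetch(url):
        limiter = PreciseRateLimiter(rate=5)   # ~5 requests per second
        monitor = RateMonitor(window_size=10.0)
        async with aiohttp.ClientSession() as session:
            async with limiter:                # spaces requests out by 1/rate seconds
                async with session.get(url, headers=headers) as response:
                    body = await response.read()
        await monitor.add_request(len(body))
        print(monitor.get_current_rates())     # -> (requests per second, MB per second)
        return body

    asyncio.run(fetch("https://www.sec.gov/files/company_tickers.json"))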
File without changes
@@ -0,0 +1,83 @@
+ import os
+ import csv
+ from pathlib import Path
+ from .streamcompanyfacts import stream_company_facts
+
+ def process_company_data(data, output_path):
+     # Check for errors in data
+     if data and 'error' in data:
+         print(f"Error processing CIK {data.get('cik')}: {data.get('error')}")
+         return False
+
+     # Define CSV output path
+     company_cik = data.get('cik')
+     csv_path = output_path / f"{company_cik}.csv"
+
+     with open(csv_path, 'w', newline='') as csvfile:
+         fieldnames = [
+             'cik', 'entity_name', 'namespace', 'concept_name',
+             'end_date', 'value', 'unit', 'accession_number',
+             'fiscal_year', 'fiscal_period', 'form_type',
+             'filed_date', 'frame'
+         ]
+
+         writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+         writer.writeheader()
+
+         entity_name = data.get('entityName')
+
+         # Process each namespace (dei, us-gaap, etc.)
+         for namespace, concepts in data.get('facts', {}).items():
+             # Process each concept in the namespace
+             for concept_name, concept_data in concepts.items():
+                 # Get units data (shares, USD, etc.)
+                 units = concept_data.get('units', {})
+
+                 # Process each unit type
+                 for unit_type, values in units.items():
+                     # Process each value (each filing/period)
+                     for value_data in values:
+                         # Create a row for the CSV
+                         row = {
+                             'cik': company_cik,
+                             'entity_name': entity_name,
+                             'namespace': namespace,
+                             'concept_name': concept_name,
+                             'end_date': value_data.get('end'),
+                             'value': value_data.get('val'),
+                             'unit': unit_type,
+                             'accession_number': value_data.get('accn'),
+                             'fiscal_year': value_data.get('fy'),
+                             'fiscal_period': value_data.get('fp'),
+                             'form_type': value_data.get('form'),
+                             'filed_date': value_data.get('filed'),
+                             'frame': value_data.get('frame')
+                         }
+                         writer.writerow(row)
+
+     return True
+
+ def download_company_facts(cik, output_dir, requests_per_second=5):
+     # Create output directory if it doesn't exist
+     output_path = Path(output_dir)
+     output_path.mkdir(parents=True, exist_ok=True)
+
+     # Handle both single CIK and list
+     if isinstance(cik, list):
+         # Define callback to process the data for each CIK
+         def callback(data):
+             process_company_data(data, output_path)
+
+         # Process all CIKs in parallel
+         results = stream_company_facts(
+             cik=cik,
+             requests_per_second=requests_per_second,
+             callback=callback
+         )
+
+         # Just return since the callback handles the processing
+         return True
+     else:
+         # Single CIK case
+         result = stream_company_facts(cik=cik, requests_per_second=requests_per_second)
+         return process_company_data(result, output_path)
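A usage sketch for download_company_facts() (not part of the package diff; the import is omitted because this hunk does not show the file's path, and the CIKs are arbitrary examples).

    # Writes one CSV per CIK into the output directory, e.g. facts/320193.csv,
    # with one row per reported fact (namespace, concept, unit, period, value, ...).
    download_company_facts(cik=[320193, 789019], output_dir="facts", requests_per_second=5)

    # The single-CIK form returns the boolean result of process_company_data()
    ok = download_company_facts(cik=320193, output_dir="facts")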
@@ -0,0 +1,39 @@
+ # simple implementation
+ import requests
+ from ..utils import headers
+
+ def fetch_frame(taxonomy, concept, unit, period):
+     url = f"https://data.sec.gov/api/xbrl/frames/{taxonomy}/{concept}/{unit}/{period}.json"
+     response = requests.get(url, headers=headers)
+     print(url)
+     print(response)
+     return response.json()
+
+
+ def filter_xbrl(taxonomy, concept, unit, period, logic, value):
+     response_data = fetch_frame(taxonomy, concept, unit, period)
+
+     if response_data is None:
+         raise ValueError("Unable to fetch XBRL data. Incorrect parameters?")
+
+     # input validation
+     value = int(value)
+
+     # Filter data based on logic and value
+     data = response_data['data']
+
+     if logic == '>':
+         return [row['accn'] for row in data if row['val'] > value]
+     elif logic == '<':
+         return [row['accn'] for row in data if row['val'] < value]
+     elif logic == '>=':
+         return [row['accn'] for row in data if row['val'] >= value]
+     elif logic == '<=':
+         return [row['accn'] for row in data if row['val'] <= value]
+     elif logic == '==':
+         return [row['accn'] for row in data if row['val'] == value]
+     elif logic == '!=':
+         return [row['accn'] for row in data if row['val'] != value]
+     else:
+         raise ValueError(f"Invalid logic operator: {logic}")
+
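A usage sketch for filter_xbrl() (not part of the package diff). The taxonomy, concept, unit, and period values follow the conventions of the SEC frames endpoint that fetch_frame() builds its URL from, and are only illustrative.

    # Accession numbers of filings reporting more than $5B of accounts payable
    # in the instantaneous frame for Q4 2019
    acc_nos = filter_xbrl(
        taxonomy="us-gaap",
        concept="AccountsPayableCurrent",
        unit="USD",
        period="CY2019Q4I",
        logic=">",
        value=5_000_000_000,
    )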
@@ -0,0 +1,93 @@
+ import asyncio
+ import aiohttp
+ import json
+ from tqdm import tqdm
+ from ..utils import PreciseRateLimiter, RateMonitor, RetryException, headers
+
+ async def fetch_company_facts(session, cik, rate_limiter, rate_monitor, pbar):
+     # Format CIK with leading zeros to 10 digits
+     formatted_cik = f"CIK{str(cik).zfill(10)}"
+     url = f"https://data.sec.gov/api/xbrl/companyfacts/{formatted_cik}.json"
+
+     try:
+         # Acquire rate limit token
+         await rate_limiter.acquire()
+
+         async with session.get(url, headers=headers) as response:
+             content_length = int(response.headers.get('Content-Length', 0))
+             await rate_monitor.add_request(content_length)
+
+             # Log current rates
+             req_rate, mb_rate = rate_monitor.get_current_rates()
+             pbar.set_postfix({"req/s": req_rate, "MB/s": mb_rate})
+
+             # Handle rate limiting
+             if response.status == 429:
+                 retry_after = int(response.headers.get('Retry-After', 601))
+                 pbar.set_description(f"Rate limited, retry after {retry_after}s")
+                 await asyncio.sleep(retry_after)
+                 pbar.set_description(f"Fetching CIK {cik}")
+                 return await fetch_company_facts(session, cik, rate_limiter, rate_monitor, pbar)
+
+             # Handle other errors
+             if response.status != 200:
+                 pbar.update(1)
+                 return {"error": f"HTTP {response.status}", "cik": cik}
+
+             data = await response.json()
+             pbar.update(1)
+             return data
+
+     except Exception as e:
+         pbar.update(1)
+         return {"error": str(e), "cik": cik}
+
+ async def stream_companyfacts(cik=None, requests_per_second=5, callback=None):
+     if cik is None:
+         return {"error": "No CIK provided. Please specify a CIK."}
+
+     # Handle both single CIK and list of CIKs
+     if not isinstance(cik, list):
+         cik_list = [cik]
+     else:
+         cik_list = cik
+
+     # Initialize rate limiter and monitor
+     rate_limiter = PreciseRateLimiter(rate=requests_per_second)
+     rate_monitor = RateMonitor(window_size=10.0)
+
+     # Create progress bar
+     pbar = tqdm(total=len(cik_list), desc="Fetching company facts")
+
+     results = []
+     async with aiohttp.ClientSession() as session:
+         # Create tasks for all CIKs
+         tasks = [
+             fetch_company_facts(session, cik_item, rate_limiter, rate_monitor, pbar)
+             for cik_item in cik_list
+         ]
+
+         # Process tasks as they complete
+         for completed_task in asyncio.as_completed(tasks):
+             data = await completed_task
+
+             # Call callback if provided
+             if callback and not (data and 'error' in data):
+                 callback(data)
+
+             results.append(data)
+
+     pbar.close()
+
+     # If single CIK was passed, return just that result
+     if len(cik_list) == 1:
+         return results[0]
+
+     # Otherwise return all results
+     return results
+
+ def stream_company_facts(cik=None, requests_per_second=5, callback=None):
+     loop = asyncio.get_event_loop()
+     return loop.run_until_complete(
+         stream_companyfacts(cik=cik, requests_per_second=requests_per_second, callback=callback)
+     )
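Finally, a usage sketch for the synchronous wrapper above (not part of the package diff; the import is omitted because this hunk does not show the file's path, and the CIKs are arbitrary examples). A single CIK returns one dict, a list returns a list of dicts, and the optional callback is invoked once per successful response.

    def show(data):
        print(data.get("entityName"), "-", len(data.get("facts", {})), "namespaces")

    facts = stream_company_facts(cik=320193)                           # one dict
    many = stream_company_facts(cik=[320193, 789019], callback=show)   # list of dicts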