datamule 2.0.7__tar.gz → 3.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. {datamule-2.0.7 → datamule-3.1.0}/PKG-INFO +3 -1
  2. {datamule-2.0.7 → datamule-3.1.0}/datamule/__init__.py +6 -3
  3. datamule-3.1.0/datamule/book/book.py +38 -0
  4. datamule-3.1.0/datamule/book/download_dataset_from_s3.py +106 -0
  5. datamule-3.1.0/datamule/book/s3transfer.py +263 -0
  6. datamule-3.1.0/datamule/datamule/datamule_lookup.py +86 -0
  7. datamule-3.1.0/datamule/datamule/datamule_mysql_rds.py +160 -0
  8. {datamule-2.0.7 → datamule-3.1.0}/datamule/datamule/downloader.py +85 -91
  9. {datamule-2.0.7 → datamule-3.1.0}/datamule/datamule/sec_connector.py +13 -11
  10. datamule-3.1.0/datamule/datamule/tar_downloader.py +650 -0
  11. datamule-3.1.0/datamule/datasets.py +53 -0
  12. datamule-3.1.0/datamule/document/document.py +599 -0
  13. datamule-3.1.0/datamule/filings_constructor/filings_constructor.py +86 -0
  14. {datamule-2.0.7 → datamule-3.1.0}/datamule/helper.py +0 -1
  15. {datamule-2.0.7 → datamule-3.1.0}/datamule/mapping_dicts/html_mapping_dicts.py +11 -4
  16. {datamule-2.0.7/datamule → datamule-3.1.0/datamule/portfolio}/portfolio.py +74 -41
  17. datamule-2.0.7/datamule/portfolio_compression_utils.py → datamule-3.1.0/datamule/portfolio/portfolio_compression_utils_legacy.py +2 -0
  18. datamule-3.1.0/datamule/providers/providers.py +7 -0
  19. datamule-3.1.0/datamule/sec/submissions/__init__.py +0 -0
  20. {datamule-2.0.7 → datamule-3.1.0}/datamule/sec/submissions/downloader.py +6 -2
  21. {datamule-2.0.7 → datamule-3.1.0}/datamule/sec/submissions/streamer.py +1 -1
  22. datamule-3.1.0/datamule/sec/xbrl/__init__.py +0 -0
  23. datamule-3.1.0/datamule/seclibrary/__init__.py +0 -0
  24. {datamule-2.0.7 → datamule-3.1.0}/datamule/seclibrary/bq.py +2 -0
  25. datamule-3.1.0/datamule/sheet/__init__.py +0 -0
  26. datamule-3.1.0/datamule/sheet/sheet.py +120 -0
  27. datamule-3.1.0/datamule/submission/__init__.py +0 -0
  28. {datamule-2.0.7/datamule → datamule-3.1.0/datamule/submission}/submission.py +177 -95
  29. datamule-3.1.0/datamule/submission/tar_submission.py +81 -0
  30. datamule-3.1.0/datamule/tables/__init__.py +0 -0
  31. datamule-3.1.0/datamule/tables/tables.py +199 -0
  32. datamule-3.1.0/datamule/tables/tables_informationtable.py +20 -0
  33. {datamule-2.0.7/datamule/document → datamule-3.1.0/datamule}/tables/tables_proxyvotingrecord.py +3 -2
  34. datamule-3.1.0/datamule/tags/__init__.py +0 -0
  35. datamule-3.1.0/datamule/tags/config.py +47 -0
  36. datamule-3.1.0/datamule/tags/dictionaries.py +117 -0
  37. datamule-3.1.0/datamule/tags/regex.py +105 -0
  38. datamule-3.1.0/datamule/tags/utils.py +178 -0
  39. datamule-3.1.0/datamule/utils/__init__.py +0 -0
  40. datamule-3.1.0/datamule/utils/compression.py +35 -0
  41. datamule-3.1.0/datamule/utils/convenience.py +4 -0
  42. datamule-3.1.0/datamule/utils/pdf.py +25 -0
  43. {datamule-2.0.7 → datamule-3.1.0}/datamule.egg-info/PKG-INFO +3 -1
  44. {datamule-2.0.7 → datamule-3.1.0}/datamule.egg-info/SOURCES.txt +39 -17
  45. {datamule-2.0.7 → datamule-3.1.0}/datamule.egg-info/requires.txt +2 -0
  46. {datamule-2.0.7 → datamule-3.1.0}/setup.py +4 -2
  47. datamule-2.0.7/datamule/datamule/datamule_lookup.py +0 -235
  48. datamule-2.0.7/datamule/datamule/datamule_mysql_rds.py +0 -295
  49. datamule-2.0.7/datamule/document/document.py +0 -388
  50. datamule-2.0.7/datamule/document/tables/tables.py +0 -129
  51. datamule-2.0.7/datamule/document/tables/tables_informationtable.py +0 -39
  52. datamule-2.0.7/datamule/mapping_dicts/txt_mapping_dicts.py +0 -234
  53. datamule-2.0.7/datamule/sheet.py +0 -706
  54. {datamule-2.0.7/datamule/datamule → datamule-3.1.0/datamule/book}/__init__.py +0 -0
  55. {datamule-2.0.7/datamule/document → datamule-3.1.0/datamule/cloud}/__init__.py +0 -0
  56. {datamule-2.0.7 → datamule-3.1.0}/datamule/config.py +0 -0
  57. {datamule-2.0.7 → datamule-3.1.0}/datamule/data/listed_filer_metadata.csv +0 -0
  58. {datamule-2.0.7/datamule/document/tables → datamule-3.1.0/datamule/datamule}/__init__.py +0 -0
  59. {datamule-2.0.7/datamule/mapping_dicts → datamule-3.1.0/datamule/document}/__init__.py +0 -0
  60. {datamule-2.0.7/datamule/sec → datamule-3.1.0/datamule/filings_constructor}/__init__.py +0 -0
  61. {datamule-2.0.7 → datamule-3.1.0}/datamule/index.py +0 -0
  62. {datamule-2.0.7/datamule/sec/infrastructure → datamule-3.1.0/datamule/mapping_dicts}/__init__.py +0 -0
  63. {datamule-2.0.7 → datamule-3.1.0}/datamule/mapping_dicts/xml_mapping_dicts.py +0 -0
  64. {datamule-2.0.7 → datamule-3.1.0}/datamule/package_updater.py +0 -0
  65. {datamule-2.0.7/datamule/sec/submissions → datamule-3.1.0/datamule/portfolio}/__init__.py +0 -0
  66. {datamule-2.0.7/datamule/sec/xbrl → datamule-3.1.0/datamule/providers}/__init__.py +0 -0
  67. {datamule-2.0.7/datamule/seclibrary → datamule-3.1.0/datamule/sec}/__init__.py +0 -0
  68. {datamule-2.0.7/datamule/utils → datamule-3.1.0/datamule/sec/infrastructure}/__init__.py +0 -0
  69. {datamule-2.0.7 → datamule-3.1.0}/datamule/sec/infrastructure/submissions_metadata.py +0 -0
  70. {datamule-2.0.7 → datamule-3.1.0}/datamule/sec/submissions/eftsquery.py +0 -0
  71. {datamule-2.0.7 → datamule-3.1.0}/datamule/sec/submissions/monitor.py +0 -0
  72. {datamule-2.0.7 → datamule-3.1.0}/datamule/sec/submissions/textsearch.py +0 -0
  73. {datamule-2.0.7 → datamule-3.1.0}/datamule/sec/utils.py +0 -0
  74. {datamule-2.0.7 → datamule-3.1.0}/datamule/sec/xbrl/downloadcompanyfacts.py +0 -0
  75. {datamule-2.0.7 → datamule-3.1.0}/datamule/sec/xbrl/filter_xbrl.py +0 -0
  76. {datamule-2.0.7 → datamule-3.1.0}/datamule/sec/xbrl/streamcompanyfacts.py +0 -0
  77. {datamule-2.0.7 → datamule-3.1.0}/datamule/sec/xbrl/xbrlmonitor.py +0 -0
  78. {datamule-2.0.7/datamule/document → datamule-3.1.0/datamule}/tables/tables_13fhr.py +0 -0
  79. {datamule-2.0.7/datamule/document → datamule-3.1.0/datamule}/tables/tables_25nse.py +0 -0
  80. {datamule-2.0.7/datamule/document → datamule-3.1.0/datamule}/tables/tables_npx.py +0 -0
  81. {datamule-2.0.7/datamule/document → datamule-3.1.0/datamule}/tables/tables_ownership.py +0 -0
  82. {datamule-2.0.7/datamule/document → datamule-3.1.0/datamule}/tables/tables_sbsef.py +0 -0
  83. {datamule-2.0.7/datamule/document → datamule-3.1.0/datamule}/tables/tables_sdr.py +0 -0
  84. {datamule-2.0.7/datamule/document → datamule-3.1.0/datamule}/tables/utils.py +0 -0
  85. {datamule-2.0.7 → datamule-3.1.0}/datamule/utils/construct_submissions_data.py +0 -0
  86. {datamule-2.0.7 → datamule-3.1.0}/datamule/utils/format_accession.py +0 -0
  87. {datamule-2.0.7 → datamule-3.1.0}/datamule.egg-info/dependency_links.txt +0 -0
  88. {datamule-2.0.7 → datamule-3.1.0}/datamule.egg-info/top_level.txt +0 -0
  89. {datamule-2.0.7 → datamule-3.1.0}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 2.0.7
+Version: 3.1.0
 Summary: Work with SEC submissions at scale.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman
@@ -19,3 +19,5 @@ Requires-Dist: secxbrl
 Requires-Dist: secsgml
 Requires-Dist: websocket-client
 Requires-Dist: company_fundamentals
+Requires-Dist: flashtext
+Requires-Dist: aioboto3
@@ -1,13 +1,14 @@
-from .submission import Submission
-from .portfolio import Portfolio
+from .submission.submission import Submission
+from .portfolio.portfolio import Portfolio
 from .document.document import Document
 from .helper import _load_package_csv, load_package_dataset
 from .config import Config
-from .sheet import Sheet
+from .sheet.sheet import Sheet
 from .index import Index
 from .package_updater import PackageUpdater
 from .utils.format_accession import format_accession
 from .utils.construct_submissions_data import construct_submissions_data
+from .book.book import Book
 
 
 # Keep the notebook environment setup
@@ -31,6 +32,8 @@ def _setup_notebook_env():
 # Set up notebook environment
 _setup_notebook_env()
 
+
+# TODO, is this load bearing?
 __all__ = [
     '_load_package_csv',
     'load_package_dataset',
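
Note that the top-level import surface is intended to stay the same across the module moves, since __init__.py re-exports the relocated classes under their old names and adds Book. A minimal, hedged sanity-check sketch (not part of the diff):

    # Sketch only: these imports are expected to keep working after the 2.0.7 -> 3.1.0
    # upgrade because datamule/__init__.py re-exports the moved modules, per the hunk above.
    from datamule import Portfolio, Submission, Sheet, Document, Book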
@@ -0,0 +1,38 @@
+import os
+from .s3transfer import s3_transfer as _s3_transfer
+from .download_dataset_from_s3 import download_dataset as _download_dataset
+
+class Book:
+    def __init__(self, api_key=None):
+        if api_key is not None:
+            self._api_key = api_key
+
+    @property
+    def api_key(self):
+        return getattr(self, '_api_key', None) or os.getenv('DATAMULE_API_KEY')
+
+    @api_key.setter
+    def api_key(self, value):
+        if not value:
+            raise ValueError("API key cannot be empty")
+        self._api_key = value
+
+    def s3_transfer(self, datamule_bucket, s3_credentials, max_workers=4,
+                    errors_json_filename='s3_transfer_errors.json', retry_errors=3,
+                    force_daily=True, cik=None, submission_type=None, filing_date=None,
+                    api_key=None, accession=None):
+
+        # Use provided key, or fall back to instance property
+        api_key = api_key or self.api_key
+
+        _s3_transfer(datamule_bucket=datamule_bucket, s3_credentials=s3_credentials,
+                     max_workers=max_workers, errors_json_filename=errors_json_filename,
+                     retry_errors=retry_errors, force_daily=force_daily, cik=cik,
+                     submission_type=submission_type, filing_date=filing_date,
+                     api_key=api_key, accession_number=accession)
+
+    def download_dataset(self, dataset, filename=None, api_key=None):
+        # Use provided key, or fall back to instance property
+        api_key = api_key or self.api_key
+
+        _download_dataset(dataset=dataset, filename=filename, api_key=api_key)
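
A hedged usage sketch for the new Book class added above (not part of the diff); it assumes a valid DATAMULE_API_KEY is set in the environment:

    import os
    from datamule import Book

    # The api_key property falls back to the DATAMULE_API_KEY environment variable
    # when no key was passed to the constructor or set explicitly.
    book = Book()
    assert book.api_key == os.getenv('DATAMULE_API_KEY')

    # Download a bulk dataset by its lowercase-underscore name; the output filename
    # is derived from the download URL unless one is supplied.
    book.download_dataset('sec_accessions')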
@@ -0,0 +1,106 @@
+import urllib.request
+import urllib.parse
+from tqdm import tqdm
+import json
+
+# Dataset name mapping - lowercase underscore to official name
+DATASET_NAME_MAP = {
+    'sec_accessions': 'SEC Accessions Master Index',
+    'sec_master_submissions': 'SEC Master Submissions Table',
+    'sec_accession_cik_table': 'SEC Accession CIK Table',
+    'sec_documents_table': 'SEC Documents Table',
+    'sec_submission_details_table': 'SEC Submissions Details Table',
+    'simple_xbrl_table': 'Simple XBRL Table',
+    'proxy_voting_records_table': 'Proxy Voting Records Table',
+    'institutional_holdings_table': 'Institutional Holdings Table',
+    'metadata_ownership_table': 'Insider Ownership Metadata Table',
+    'reporting_owner_ownership_table': 'Insider Reporting Owner Table',
+    'non_derivative_transaction_ownership_table': 'Insider Non-Derivative Transactions Table',
+    'non_derivative_holding_ownership_table': 'Insider Non-Derivative Holdings Table',
+    'derivative_transaction_ownership_table': 'Insider Derivative Transactions Table',
+    'derivative_holding_ownership_table': 'Insider Derivative Holdings Table',
+    'owner_signature_ownership_table': 'Insider Owner Signatures Table',
+}
+
+
+def download_dataset(dataset, api_key, filename=None):
+    """
+    Download a dataset from Datamule API
+
+    Args:
+        dataset: Dataset name (lowercase underscore format, e.g. 'sec_accessions')
+        api_key: Datamule API key
+        filename: Output filename (optional, extracted from URL if not provided)
+    """
+    # Map dataset name to official name
+    dataset_name = DATASET_NAME_MAP.get(dataset)
+    if not dataset_name:
+        raise ValueError(f"Unknown dataset: {dataset}")
+
+    # Get download URL from API
+    api_url = f"https://api.datamule.xyz/dataset/{urllib.parse.quote(dataset_name)}?api_key={api_key}"
+
+    # Create request with headers
+    req = urllib.request.Request(
+        api_url,
+        headers={
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+        }
+    )
+
+    try:
+        with urllib.request.urlopen(req) as response:
+            data = json.loads(response.read().decode())
+    except urllib.error.HTTPError as e:
+        error_body = e.read().decode()
+        raise Exception(f"API request failed: {error_body}")
+
+    if not data.get('success'):
+        raise Exception(f"API error: {data.get('error', 'Unknown error')}")
+
+    download_url = data['data']['download_url']
+    size_gb = data['data']['size_gb']
+
+    # Extract filename from URL if not provided
+    if filename is None:
+        # Parse the path parameter from the download URL
+        parsed = urllib.parse.urlparse(download_url)
+        query_params = urllib.parse.parse_qs(parsed.query)
+        path = query_params.get('path', [''])[0]
+        # Get the filename from the path (last part after /)
+        filename = urllib.parse.unquote(path.split('/')[-1])
+        if not filename:
+            filename = f"{dataset}.download"
+
+    # Download file with progress bar
+    print(f"Downloading {dataset} ({size_gb:.2f} GB)...")
+
+    # Create request with headers for download
+    download_req = urllib.request.Request(
+        download_url,
+        headers={
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+        }
+    )
+
+    try:
+        with urllib.request.urlopen(download_req) as response:
+            total_size = int(response.headers.get('Content-Length', 0))
+
+            with open(filename, 'wb') as f, tqdm(
+                total=total_size,
+                unit='B',
+                unit_scale=True,
+                desc=filename
+            ) as pbar:
+                while True:
+                    chunk = response.read(8192)
+                    if not chunk:
+                        break
+                    f.write(chunk)
+                    pbar.update(len(chunk))
+    except urllib.error.HTTPError as e:
+        error_body = e.read().decode()
+        raise Exception(f"Download failed: {error_body}")
+
+    print(f"Downloaded to {filename}")
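
Based on the parsing above, the dataset endpoint is expected to return a JSON body with a success flag plus a signed download URL and a size in GB. A hypothetical illustration of that shape, with invented values (only the keys read by download_dataset are shown):

    # Hypothetical response from https://api.datamule.xyz/dataset/<name>?api_key=...
    # download_dataset() only reads 'success', data['download_url'], and data['size_gb'].
    response_body = {
        'success': True,
        'data': {
            'download_url': 'https://example.com/files?path=datasets%2Fsec_accessions.csv',
            'size_gb': 1.23,
        },
    }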
@@ -0,0 +1,263 @@
+import asyncio
+import aiohttp
+import aioboto3
+import ssl
+import time
+import json
+from datetime import datetime, timedelta
+from urllib.parse import urlparse
+from tqdm import tqdm
+import logging
+from ..sheet.sheet import Sheet
+from ..utils.format_accession import format_accession
+
+# Set up logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+
+def generate_date_range(start_date_str, end_date_str):
+    start_date = datetime.strptime(start_date_str, '%Y-%m-%d')
+    end_date = datetime.strptime(end_date_str, '%Y-%m-%d')
+
+    dates = []
+    current_date = start_date
+
+    while current_date <= end_date:
+        dates.append(current_date.strftime('%Y-%m-%d'))
+        current_date += timedelta(days=1)
+
+    return dates
+
+
+def get_filings_sgml_r2_urls(submission_type=None, cik=None, datamule_api_key=None, filing_date=None, accession_number=None):
+    datamule_bucket_endpoint = 'https://sec-library.datamule.xyz/'
+    sheet = Sheet('s3transfer')
+    submissions = sheet.get_submissions(distinct=True, quiet=False, api_key=datamule_api_key,
+                                        submission_type=submission_type, cik=cik, columns=['accessionNumber'], filing_date=filing_date,
+                                        accession_number=accession_number)
+
+    accessions = [format_accession(sub['accessionNumber'], 'no-dash') for sub in submissions]
+
+    urls = [f"{datamule_bucket_endpoint}{accession}.sgml" for accession in accessions]
+
+    return urls
+
+
+class AsyncS3Transfer:
+    def __init__(self, s3_credentials, max_workers=100, chunk_size=2*1024*1024):
+        self.s3_credentials = s3_credentials
+        self.max_workers = max_workers
+        self.chunk_size = chunk_size
+
+    async def __aenter__(self):
+        # Create aiohttp session with optimized connector
+        connector = aiohttp.TCPConnector(
+            limit=self.max_workers,
+            force_close=False,
+            ssl=ssl.create_default_context(),
+            ttl_dns_cache=300,
+            keepalive_timeout=60
+        )
+
+        self.session = aiohttp.ClientSession(
+            connector=connector,
+            timeout=aiohttp.ClientTimeout(total=600),
+            headers={
+                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+                'Connection': 'keep-alive',
+                'Accept-Encoding': 'gzip, deflate, br'
+            }
+        )
+
+        # Create async boto3 client
+        if self.s3_credentials['s3_provider'] == 'aws':
+            session = aioboto3.Session()
+            self.s3_client = await session.client(
+                's3',
+                aws_access_key_id=self.s3_credentials['aws_access_key_id'],
+                aws_secret_access_key=self.s3_credentials['aws_secret_access_key'],
+                region_name=self.s3_credentials['region_name']
+            ).__aenter__()
+        else:
+            raise ValueError("S3 Provider not supported yet. Please use another provider or email johnfriedman@datamule.xyz to add support.")
+
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        if hasattr(self, 'session') and self.session:
+            await self.session.close()
+        if hasattr(self, 's3_client') and self.s3_client:
+            await self.s3_client.__aexit__(exc_type, exc_val, exc_tb)
+
+    async def transfer_single_file(self, semaphore, url, retry_errors=3):
+        """Transfer a single file with retry logic and preserve metadata"""
+        async with semaphore:
+            filename = urlparse(url).path.split('/')[-1]
+            s3_key = filename
+            bucket_name = self.s3_credentials['bucket_name']
+
+            last_error = None
+
+            for attempt in range(retry_errors + 1):
+                try:
+                    async with self.session.get(url) as response:
+                        if response.status == 200:
+                            # Capture source metadata from response headers
+                            content_length = response.headers.get('Content-Length')
+                            size_bytes = int(content_length) if content_length else 0
+                            content_type = response.headers.get('Content-Type', 'application/octet-stream')
+                            last_modified = response.headers.get('Last-Modified')
+
+                            # Read response content
+                            content = await response.read()
+
+                            # Prepare S3 upload parameters with preserved metadata
+                            upload_params = {
+                                'Bucket': bucket_name,
+                                'Key': s3_key,
+                                'Body': content,
+                                'ContentType': content_type,
+                                'StorageClass': 'STANDARD',
+                                'Metadata': {
+                                    'source-url': url,
+                                    'original-size': str(size_bytes),
+                                    'transfer-date': datetime.utcnow().isoformat()
+                                }
+                            }
+
+                            # Add last modified if available
+                            if last_modified:
+                                upload_params['Metadata']['original-last-modified'] = last_modified
+
+                            # Upload to S3 with metadata
+                            await self.s3_client.put_object(**upload_params)
+
+                            return {
+                                'success': True,
+                                'url': url,
+                                'message': f"Copied: {url} -> s3://{bucket_name}/{s3_key}",
+                                'size_bytes': size_bytes,
+                                's3_key': s3_key,
+                                'content_type': content_type,
+                                'last_modified': last_modified
+                            }
+                        else:
+                            raise aiohttp.ClientResponseError(
+                                request_info=response.request_info,
+                                history=response.history,
+                                status=response.status
+                            )
+
+                except Exception as e:
+                    print(e)
+                    last_error = e
+                    if attempt < retry_errors:
+                        await asyncio.sleep(2 ** attempt)  # Exponential backoff
+
+            # All attempts failed
+            return {
+                'success': False,
+                'url': url,
+                'error': str(last_error),
+                'message': f"Failed to copy {url} after {retry_errors + 1} attempts: {last_error}",
+                'size_bytes': 0
+            }
+
+    async def transfer_batch(self, urls, retry_errors=3):
+        """Transfer multiple files concurrently"""
+        semaphore = asyncio.Semaphore(self.max_workers)
+        failed_files = []
+        total_bytes = 0
+        start_time = time.time()
+
+        # Create tasks for all transfers
+        tasks = [
+            self.transfer_single_file(semaphore, url, retry_errors)
+            for url in urls
+        ]
+
+        # Process with progress bar
+        with tqdm(total=len(urls), desc="Transferring files", unit="file") as pbar:
+            for coro in asyncio.as_completed(tasks):
+                result = await coro
+
+                if result['success']:
+                    total_bytes += result.get('size_bytes', 0)
+                else:
+                    failed_files.append(result)
+
+                # Update progress bar with total GB transferred
+                total_gb = total_bytes / (1024 ** 3)
+                pbar.set_postfix({'Total': f'{total_gb:.2f} GB'})
+
+                pbar.update(1)
+
+        return failed_files, total_bytes
+
+
+async def async_transfer_cached_urls_to_s3(urls, s3_credentials, max_workers=4,
+                                           errors_json_filename='s3_transfer_errors.json',
+                                           retry_errors=3):
+    """Async version of transfer_cached_urls_to_s3"""
+    failed_files = []
+    total_bytes = 0
+
+    async with AsyncS3Transfer(s3_credentials, max_workers) as transfer:
+        failed_files, total_bytes = await transfer.transfer_batch(urls, retry_errors)
+
+    # Save errors to JSON if filename provided and there are errors
+    if errors_json_filename and failed_files:
+        with open(errors_json_filename, 'w') as f:
+            json.dump(failed_files, f, indent=2)
+        print(f"Saved {len(failed_files)} errors to {errors_json_filename}")
+
+    print(f"Transfer complete: {len(urls) - len(failed_files)}/{len(urls)} files successful")
+
+
+def transfer_cached_urls_to_s3(urls, s3_credentials, max_workers=4, errors_json_filename='s3_transfer_errors.json', retry_errors=3):
+    """Wrapper to run async transfer in sync context"""
+    asyncio.run(async_transfer_cached_urls_to_s3(urls, s3_credentials, max_workers, errors_json_filename, retry_errors))
+
+
+def s3_transfer(datamule_bucket, s3_credentials, max_workers=4, errors_json_filename='s3_transfer_errors.json', retry_errors=3,
+                force_daily=True, cik=None, submission_type=None, filing_date=None, datamule_api_key=None, accession_number=None):
+
+    if datamule_bucket in ['filings_sgml_r2', 'sec_filings_sgml_r2']:
+
+        if accession_number is not None:
+            if any(param is not None for param in [cik, submission_type, filing_date]):
+                raise ValueError('If accession is provided, then cik, type, and date must be None')
+            urls = get_filings_sgml_r2_urls(datamule_api_key=datamule_api_key, accession_number=accession_number)
+            transfer_cached_urls_to_s3(urls=urls, s3_credentials=s3_credentials, max_workers=max_workers, errors_json_filename=errors_json_filename, retry_errors=retry_errors)
+        else:
+            if not force_daily:
+                urls = get_filings_sgml_r2_urls(submission_type=submission_type, cik=cik, datamule_api_key=datamule_api_key,
+                                                filing_date=filing_date)
+                transfer_cached_urls_to_s3(urls=urls, s3_credentials=s3_credentials, max_workers=max_workers, errors_json_filename=errors_json_filename, retry_errors=retry_errors)
+            else:
+                if isinstance(filing_date, str):
+                    urls = get_filings_sgml_r2_urls(submission_type=submission_type, cik=cik, datamule_api_key=datamule_api_key,
+                                                    filing_date=filing_date)
+                    transfer_cached_urls_to_s3(urls=urls, s3_credentials=s3_credentials, max_workers=max_workers, errors_json_filename=errors_json_filename, retry_errors=retry_errors)
+                elif isinstance(filing_date, list):
+                    for date in filing_date:
+                        print(f"Transferring {date}")
+                        urls = get_filings_sgml_r2_urls(submission_type=submission_type, cik=cik, datamule_api_key=datamule_api_key,
+                                                        filing_date=date)
+                        transfer_cached_urls_to_s3(urls=urls, s3_credentials=s3_credentials, max_workers=max_workers, errors_json_filename=errors_json_filename, retry_errors=retry_errors)
+                elif isinstance(filing_date, tuple):
+                    dates = generate_date_range(filing_date[0], filing_date[1])
+                    for date in dates:
+                        print(f"Transferring {date}")
+                        urls = get_filings_sgml_r2_urls(submission_type=submission_type, cik=cik, datamule_api_key=datamule_api_key,
+                                                        filing_date=date)
+                        transfer_cached_urls_to_s3(urls=urls, s3_credentials=s3_credentials, max_workers=max_workers, errors_json_filename=errors_json_filename, retry_errors=retry_errors)
+                else:
+                    raise ValueError('filing_date can only be string, list, or (startdt,enddt)')
+
+    else:
+        raise ValueError('Datamule S3 bucket not found.')
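
A hedged sketch of the module-level s3_transfer entry point added above (not part of the diff); the import path follows from the file listing, and the bucket name and credentials below are placeholders:

    from datamule.book.s3transfer import s3_transfer

    # Keys mirror the s3_credentials fields read by AsyncS3Transfer.__aenter__ and
    # transfer_single_file; only the 'aws' provider is supported by this code.
    s3_credentials = {
        's3_provider': 'aws',
        'aws_access_key_id': 'PLACEHOLDER',
        'aws_secret_access_key': 'PLACEHOLDER',
        'region_name': 'us-east-1',
        'bucket_name': 'my-destination-bucket',
    }

    # Copy one day of SGML filings from the Datamule R2 bucket into the destination bucket.
    s3_transfer(datamule_bucket='filings_sgml_r2',
                s3_credentials=s3_credentials,
                submission_type='10-K',
                filing_date='2024-01-02',
                datamule_api_key='PLACEHOLDER')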
@@ -0,0 +1,86 @@
+from ..sheet.sheet import Sheet
+from ..utils.format_accession import format_accession
+
+from ..helper import _process_cik_and_metadata_filters
+
+def _filters(accession_numbers, filtered_accession_numbers=None, skip_accession_numbers=None):
+    """
+    Apply intersection and exclusion filters to accession numbers.
+
+    Args:
+        accession_numbers: List of accession numbers to filter
+        filtered_accession_numbers: If provided, only keep accessions in this list (intersection)
+        skip_accession_numbers: If provided, remove accessions in this list (exclusion)
+
+    Returns:
+        Filtered list of accession numbers
+    """
+
+    # Apply intersection filter if provided
+    if filtered_accession_numbers is not None:
+        filtered_accession_numbers = [format_accession(item, 'int') for item in filtered_accession_numbers]
+        filtered_set = set(filtered_accession_numbers)
+        accession_numbers = [acc for acc in accession_numbers if acc in filtered_set]
+
+    # Apply exclusion filter if provided
+    if skip_accession_numbers is not None:
+        skip_accession_numbers = [format_accession(item, 'int') for item in skip_accession_numbers]
+        skip_set = set(skip_accession_numbers)
+        accession_numbers = [acc for acc in accession_numbers if acc not in skip_set]
+
+    return accession_numbers
+
+
+def datamule_lookup(cik=None, ticker=None, submission_type=None, filing_date=None,
+                    report_date=None, detected_time=None,
+                    contains_xbrl=None, document_type=None, filename=None,
+                    sequence=None, quiet=False, api_key=None, filtered_accession_numbers=None,
+                    skip_accession_numbers=None, provider='datamule-tar', **kwargs):
+
+    lookup_args = {}
+
+    # Direct mappings
+    cik = _process_cik_and_metadata_filters(cik, ticker, **kwargs)
+    if cik is not None:
+        lookup_args['cik'] = cik
+
+    if submission_type is not None:
+        lookup_args['submissionType'] = submission_type
+
+    # Filing date - can be specific date(s) or range
+    if filing_date is not None:
+        lookup_args['filingDate'] = filing_date
+
+
+    # Report date - can be specific date(s) or range
+    if report_date is not None:
+        lookup_args['reportDate'] = report_date
+
+    if detected_time is not None:
+        lookup_args['detectedTime'] = detected_time
+
+    # XBRL flag
+    if contains_xbrl is not None:
+        lookup_args['containsXBRL'] = contains_xbrl
+
+    # Document-level filters
+    if document_type is not None:
+        lookup_args['documentType'] = document_type
+
+    if filename is not None:
+        lookup_args['filename'] = filename
+
+    if sequence is not None:
+        lookup_args['sequence'] = sequence
+
+    sheet = Sheet('')
+    if provider == 'datamule-sgml':
+        database = 'sgml-archive'
+    else:
+        database = 'tar-archive'
+    accessions = sheet.get_table(
+        database=database, **lookup_args
+    )
+    accessions = _filters(accession_numbers=accessions, filtered_accession_numbers=filtered_accession_numbers,
+                          skip_accession_numbers=skip_accession_numbers)
+    return accessions
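
A hedged sketch of calling the new lookup helper (not part of the diff); the import path follows from the file listing above, and all argument values are placeholders:

    from datamule.datamule.datamule_lookup import datamule_lookup

    # Resolve accession numbers from the tar archive, excluding ones already processed.
    accessions = datamule_lookup(ticker='PLACEHOLDER',
                                 submission_type='10-K',
                                 filing_date=('2023-01-01', '2023-12-31'),
                                 skip_accession_numbers=['0000000000-00-000000'],
                                 provider='datamule-tar')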