datamule 2.2.9.tar.gz → 2.3.2.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
This version of datamule has been flagged as potentially problematic.
- {datamule-2.2.9 → datamule-2.3.2}/PKG-INFO +2 -1
- {datamule-2.2.9 → datamule-2.3.2}/datamule/__init__.py +1 -0
- datamule-2.3.2/datamule/book/book.py +13 -0
- datamule-2.3.2/datamule/book/s3transfer.py +264 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/datamule/downloader.py +1 -1
- {datamule-2.2.9 → datamule-2.3.2}/datamule/document/document.py +50 -13
- {datamule-2.2.9 → datamule-2.3.2}/datamule/document/tables/tables.py +39 -11
- datamule-2.3.2/datamule/utils/__init__.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule.egg-info/PKG-INFO +2 -1
- {datamule-2.2.9 → datamule-2.3.2}/datamule.egg-info/SOURCES.txt +3 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule.egg-info/requires.txt +1 -0
- {datamule-2.2.9 → datamule-2.3.2}/setup.py +3 -2
- {datamule-2.2.9/datamule/datamule → datamule-2.3.2/datamule/book}/__init__.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/config.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/data/listed_filer_metadata.csv +0 -0
- {datamule-2.2.9/datamule/document → datamule-2.3.2/datamule/datamule}/__init__.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/datamule/datamule_lookup.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/datamule/datamule_mysql_rds.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/datamule/sec_connector.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/datasets.py +0 -0
- {datamule-2.2.9/datamule/document/tables → datamule-2.3.2/datamule/document}/__init__.py +0 -0
- {datamule-2.2.9/datamule/mapping_dicts → datamule-2.3.2/datamule/document/tables}/__init__.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/document/tables/tables_13fhr.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/document/tables/tables_25nse.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/document/tables/tables_informationtable.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/document/tables/tables_npx.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/document/tables/tables_ownership.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/document/tables/tables_proxyvotingrecord.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/document/tables/tables_sbsef.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/document/tables/tables_sdr.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/document/tables/utils.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/helper.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/index.py +0 -0
- {datamule-2.2.9/datamule/sec → datamule-2.3.2/datamule/mapping_dicts}/__init__.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/mapping_dicts/html_mapping_dicts.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/mapping_dicts/txt_mapping_dicts.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/mapping_dicts/xml_mapping_dicts.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/package_updater.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/portfolio.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/portfolio_compression_utils.py +0 -0
- {datamule-2.2.9/datamule/sec/infrastructure → datamule-2.3.2/datamule/sec}/__init__.py +0 -0
- {datamule-2.2.9/datamule/sec/submissions → datamule-2.3.2/datamule/sec/infrastructure}/__init__.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/sec/infrastructure/submissions_metadata.py +0 -0
- {datamule-2.2.9/datamule/sec/xbrl → datamule-2.3.2/datamule/sec/submissions}/__init__.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/sec/submissions/downloader.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/sec/submissions/eftsquery.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/sec/submissions/monitor.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/sec/submissions/streamer.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/sec/submissions/textsearch.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/sec/utils.py +0 -0
- {datamule-2.2.9/datamule/seclibrary → datamule-2.3.2/datamule/sec/xbrl}/__init__.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/sec/xbrl/downloadcompanyfacts.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/sec/xbrl/filter_xbrl.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/sec/xbrl/streamcompanyfacts.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/sec/xbrl/xbrlmonitor.py +0 -0
- {datamule-2.2.9/datamule/sentiment → datamule-2.3.2/datamule/seclibrary}/__init__.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/seclibrary/bq.py +0 -0
- {datamule-2.2.9/datamule/tags → datamule-2.3.2/datamule/sentiment}/__init__.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/sheet.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/submission.py +0 -0
- {datamule-2.2.9/datamule/utils → datamule-2.3.2/datamule/tags}/__init__.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/tags/config.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/tags/dictionaries.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/tags/regex.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/tags/utils.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/utils/construct_submissions_data.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/utils/format_accession.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/utils/pdf.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule.egg-info/dependency_links.txt +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule.egg-info/top_level.txt +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/setup.cfg +0 -0

{datamule-2.2.9 → datamule-2.3.2}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 2.2.9
+Version: 2.3.2
 Summary: Work with SEC submissions at scale.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman
@@ -20,3 +20,4 @@ Requires-Dist: secsgml
 Requires-Dist: websocket-client
 Requires-Dist: company_fundamentals
 Requires-Dist: flashtext
+Requires-Dist: aioboto3

{datamule-2.2.9 → datamule-2.3.2}/datamule/__init__.py

@@ -8,6 +8,7 @@ from .index import Index
 from .package_updater import PackageUpdater
 from .utils.format_accession import format_accession
 from .utils.construct_submissions_data import construct_submissions_data
+from .book.book import Book


 # Keep the notebook environment setup

datamule-2.3.2/datamule/book/book.py (new file)

@@ -0,0 +1,13 @@
+from .s3transfer import s3_transfer
+
+class Book:
+    def __init__(self):
+        pass
+
+    def s3_transfer(self, datamule_bucket, s3_credentials, max_workers=4, errors_json_filename='s3_transfer_errors.json', retry_errors=3,
+                    force_daily=True, cik=None, submission_type=None, filing_date=None, datamule_api_key=None,accession=None):
+
+        s3_transfer(datamule_bucket=datamule_bucket, s3_credentials=s3_credentials, max_workers=max_workers,
+                    errors_json_filename=errors_json_filename, retry_errors=retry_errors,
+                    force_daily=force_daily, cik=cik, submission_type=submission_type,
+                    filing_date=filing_date, datamule_api_key=datamule_api_key,accession_number=accession)
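
Book.s3_transfer is a thin wrapper around the module-level s3_transfer function, so a minimal usage sketch may help. This sketch is not part of the diff: the 'filings_sgml_r2' bucket label and the credential keys are taken from s3transfer.py below, and all values shown are placeholders.

# Hypothetical usage sketch (not from the package); all values are placeholders.
from datamule import Book   # Book is exported at the package top level as of 2.3.2

s3_credentials = {
    's3_provider': 'aws',                    # only 'aws' is handled in this release
    'aws_access_key_id': 'YOUR_KEY_ID',
    'aws_secret_access_key': 'YOUR_SECRET',
    'region_name': 'us-east-1',
    'bucket_name': 'my-destination-bucket',
}

Book().s3_transfer(
    datamule_bucket='filings_sgml_r2',       # the only bucket label recognized by s3_transfer
    s3_credentials=s3_credentials,
    submission_type='10-K',
    filing_date=('2024-01-02', '2024-01-03'),  # a tuple is treated as a (start, end) date range
    datamule_api_key='YOUR_DATAMULE_API_KEY',
)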

datamule-2.3.2/datamule/book/s3transfer.py (new file)

@@ -0,0 +1,264 @@
+import asyncio
+import aiohttp
+import aioboto3
+import ssl
+import time
+import json
+from datetime import datetime, timedelta
+from urllib.parse import urlparse
+from tqdm import tqdm
+import logging
+from ..sheet import Sheet
+from ..utils.format_accession import format_accession
+
+# Set up logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+
+def generate_date_range(start_date_str, end_date_str):
+    start_date = datetime.strptime(start_date_str, '%Y-%m-%d')
+    end_date = datetime.strptime(end_date_str, '%Y-%m-%d')
+
+    dates = []
+    current_date = start_date
+
+    while current_date <= end_date:
+        dates.append(current_date.strftime('%Y-%m-%d'))
+        current_date += timedelta(days=1)
+
+    return dates
+
+
+def get_filings_sgml_r2_urls(submission_type=None, cik=None, datamule_api_key=None, filing_date=None,accession_number=None):
+    datamule_bucket_endpoint = 'https://sec-library.datamule.xyz/'
+    sheet = Sheet('s3transfer')
+    submissions = sheet.get_submissions(distinct=True, quiet=False, api_key=datamule_api_key,
+                                        submission_type=submission_type, cik=cik, columns=['accessionNumber'], filing_date=filing_date,
+                                        accession_number=accession_number)
+
+    accessions = [format_accession(sub['accessionNumber'], 'no-dash') for sub in submissions]
+
+    urls = [f"{datamule_bucket_endpoint}{accession}.sgml" for accession in accessions]
+
+    return urls
+
+
+class AsyncS3Transfer:
+    def __init__(self, s3_credentials, max_workers=100, chunk_size=2*1024*1024):
+        self.s3_credentials = s3_credentials
+        self.max_workers = max_workers
+        self.chunk_size = chunk_size
+
+    async def __aenter__(self):
+        # Create aiohttp session with optimized connector
+        connector = aiohttp.TCPConnector(
+            limit=self.max_workers,
+            force_close=False,
+            ssl=ssl.create_default_context(),
+            ttl_dns_cache=300,
+            keepalive_timeout=60
+        )
+
+        self.session = aiohttp.ClientSession(
+            connector=connector,
+            timeout=aiohttp.ClientTimeout(total=600),
+            headers={
+                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+                'Connection': 'keep-alive',
+                'Accept-Encoding': 'gzip, deflate, br'
+            }
+        )
+
+        # Create async boto3 client
+        if self.s3_credentials['s3_provider'] == 'aws':
+            session = aioboto3.Session()
+            self.s3_client = await session.client(
+                's3',
+                aws_access_key_id=self.s3_credentials['aws_access_key_id'],
+                aws_secret_access_key=self.s3_credentials['aws_secret_access_key'],
+                region_name=self.s3_credentials['region_name']
+            ).__aenter__()
+        else:
+            raise ValueError("S3 Provider not supported yet. Please use another provider or email johnfriedman@datamule.xyz to add support.")
+
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        if hasattr(self, 'session') and self.session:
+            await self.session.close()
+        if hasattr(self, 's3_client') and self.s3_client:
+            await self.s3_client.__aexit__(exc_type, exc_val, exc_tb)
+
+    async def transfer_single_file(self, semaphore, url, retry_errors=3):
+        """Transfer a single file with retry logic and preserve metadata"""
+        async with semaphore:
+            filename = urlparse(url).path.split('/')[-1]
+            s3_key = filename
+            bucket_name = self.s3_credentials['bucket_name']
+
+            last_error = None
+
+            for attempt in range(retry_errors + 1):
+                try:
+                    async with self.session.get(url) as response:
+                        if response.status == 200:
+                            # Capture source metadata from response headers
+                            content_length = response.headers.get('Content-Length')
+                            size_bytes = int(content_length) if content_length else 0
+                            content_type = response.headers.get('Content-Type', 'application/octet-stream')
+                            last_modified = response.headers.get('Last-Modified')
+
+                            # Read response content
+                            content = await response.read()
+
+                            # Prepare S3 upload parameters with preserved metadata
+                            upload_params = {
+                                'Bucket': bucket_name,
+                                'Key': s3_key,
+                                'Body': content,
+                                'ContentType': content_type,
+                                'StorageClass': 'STANDARD',
+                                'Metadata': {
+                                    'source-url': url,
+                                    'original-size': str(size_bytes),
+                                    'transfer-date': datetime.utcnow().isoformat()
+                                }
+                            }
+
+                            # Add last modified if available
+                            if last_modified:
+                                upload_params['Metadata']['original-last-modified'] = last_modified
+
+                            # Upload to S3 with metadata
+                            await self.s3_client.put_object(**upload_params)
+
+                            return {
+                                'success': True,
+                                'url': url,
+                                'message': f"Copied: {url} -> s3://{bucket_name}/{s3_key}",
+                                'size_bytes': size_bytes,
+                                's3_key': s3_key,
+                                'content_type': content_type,
+                                'last_modified': last_modified
+                            }
+                        else:
+                            raise aiohttp.ClientResponseError(
+                                request_info=response.request_info,
+                                history=response.history,
+                                status=response.status
+                            )
+
+                except Exception as e:
+                    print(e)
+                    last_error = e
+                    if attempt < retry_errors:
+                        await asyncio.sleep(2 ** attempt)  # Exponential backoff
+
+            # All attempts failed
+            return {
+                'success': False,
+                'url': url,
+                'error': str(last_error),
+                'message': f"Failed to copy {url} after {retry_errors + 1} attempts: {last_error}",
+                'size_bytes': 0
+            }
+
+    async def transfer_batch(self, urls, retry_errors=3):
+        """Transfer multiple files concurrently"""
+        semaphore = asyncio.Semaphore(self.max_workers)
+        failed_files = []
+        total_bytes = 0
+        start_time = time.time()
+
+        # Create tasks for all transfers
+        tasks = [
+            self.transfer_single_file(semaphore, url, retry_errors)
+            for url in urls
+        ]
+
+        # Process with progress bar
+        with tqdm(total=len(urls), desc="Transferring files", unit="file") as pbar:
+            for coro in asyncio.as_completed(tasks):
+                result = await coro
+
+                if result['success']:
+                    total_bytes += result.get('size_bytes', 0)
+                else:
+                    failed_files.append(result)
+
+                # Update progress bar with total GB transferred
+                total_gb = total_bytes / (1024 ** 3)
+                pbar.set_postfix({'Total': f'{total_gb:.2f} GB'})
+
+                pbar.update(1)
+
+        return failed_files, total_bytes
+
+
+async def async_transfer_cached_urls_to_s3(urls, s3_credentials, max_workers=4,
+                                           errors_json_filename='s3_transfer_errors.json',
+                                           retry_errors=3):
+    """Async version of transfer_cached_urls_to_s3"""
+    failed_files = []
+    total_bytes = 0
+
+    async with AsyncS3Transfer(s3_credentials, max_workers) as transfer:
+        failed_files, total_bytes = await transfer.transfer_batch(urls, retry_errors)
+
+    # Save errors to JSON if filename provided and there are errors
+    if errors_json_filename and failed_files:
+        with open(errors_json_filename, 'w') as f:
+            json.dump(failed_files, f, indent=2)
+        print(f"Saved {len(failed_files)} errors to {errors_json_filename}")
+
+    print(f"Transfer complete: {len(urls) - len(failed_files)}/{len(urls)} files successful")
+
+
+def transfer_cached_urls_to_s3(urls, s3_credentials, max_workers=4, errors_json_filename='s3_transfer_errors.json', retry_errors=3):
+    """Wrapper to run async transfer in sync context"""
+    asyncio.run(async_transfer_cached_urls_to_s3(urls, s3_credentials, max_workers, errors_json_filename, retry_errors))
+
+
+def s3_transfer(datamule_bucket, s3_credentials, max_workers=4, errors_json_filename='s3_transfer_errors.json', retry_errors=3,
+                force_daily=True, cik=None, submission_type=None, filing_date=None, datamule_api_key=None,accession_number=None):
+
+    if datamule_bucket == 'filings_sgml_r2':
+
+
+        if accession_number is not None:
+            if any(param is not None for param in [cik, submission_type, filing_date]):
+                raise ValueError('If accession is provided, then cik, type, and date must be None')
+            urls = get_filings_sgml_r2_urls(datamule_api_key=datamule_api_key,accession_number=accession_number)
+            transfer_cached_urls_to_s3(urls=urls, s3_credentials=s3_credentials, max_workers=max_workers, errors_json_filename=errors_json_filename, retry_errors=retry_errors)
+        else:
+            if not force_daily:
+                urls = get_filings_sgml_r2_urls(submission_type=submission_type, cik=cik, datamule_api_key=datamule_api_key,
+                                                filing_date=filing_date)
+                transfer_cached_urls_to_s3(urls=urls, s3_credentials=s3_credentials, max_workers=max_workers, errors_json_filename=errors_json_filename, retry_errors=retry_errors)
+            else:
+                if isinstance(filing_date, str):
+                    urls = get_filings_sgml_r2_urls(submission_type=submission_type, cik=cik, datamule_api_key=datamule_api_key,
+                                                    filing_date=filing_date)
+                    transfer_cached_urls_to_s3(urls=urls, s3_credentials=s3_credentials, max_workers=max_workers, errors_json_filename=errors_json_filename, retry_errors=retry_errors)
+                elif isinstance(filing_date, list):
+                    for date in filing_date:
+                        print(f"Transferring {date}")
+                        urls = get_filings_sgml_r2_urls(submission_type=submission_type, cik=cik, datamule_api_key=datamule_api_key,
+                                                        filing_date=date)
+                        transfer_cached_urls_to_s3(urls=urls, s3_credentials=s3_credentials, max_workers=max_workers, errors_json_filename=errors_json_filename, retry_errors=retry_errors)
+                elif isinstance(filing_date, tuple):
+                    dates = generate_date_range(filing_date[0], filing_date[1])
+                    for date in dates:
+                        print(f"Transferring {date}")
+                        urls = get_filings_sgml_r2_urls(submission_type=submission_type, cik=cik, datamule_api_key=datamule_api_key,
+                                                        filing_date=date)
+                        transfer_cached_urls_to_s3(urls=urls, s3_credentials=s3_credentials, max_workers=max_workers, errors_json_filename=errors_json_filename, retry_errors=retry_errors)
+                else:
+                    raise ValueError('filing_date can only be string, list, or (startdt,enddt)')

+    else:
+        raise ValueError('Datamule S3 bucket not found.')
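
The force_daily branch of s3_transfer above dispatches on the type of filing_date: a string is a single day, a list is transferred date by date, and a (start, end) tuple is expanded with generate_date_range before the per-day transfers. A tiny sketch of that expansion (dates are placeholders):

# Sketch: how a (start, end) tuple is expanded before the per-day transfers.
from datamule.book.s3transfer import generate_date_range

print(generate_date_range('2024-01-02', '2024-01-05'))
# ['2024-01-02', '2024-01-03', '2024-01-04', '2024-01-05']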

{datamule-2.2.9 → datamule-2.3.2}/datamule/datamule/downloader.py

@@ -228,7 +228,7 @@ class Downloader:
         headers = {
             'Connection': 'keep-alive',
             'Accept-Encoding': 'gzip, deflate, br',
-            'Authorization': f'Bearer {api_key}'
+            #'Authorization': f'Bearer {api_key}'
         }

         async with session.get(url, headers=headers) as response:

{datamule-2.2.9 → datamule-2.3.2}/datamule/document/document.py

@@ -7,8 +7,6 @@ from doc2dict import html2dict, visualize_dict, get_title, unnest_dict, pdf2dict
 from ..mapping_dicts.txt_mapping_dicts import dict_10k, dict_10q, dict_8k, dict_13d, dict_13g
 from ..mapping_dicts.xml_mapping_dicts import dict_345
 from ..mapping_dicts.html_mapping_dicts import *
-from selectolax.parser import HTMLParser
-
 from pathlib import Path
 import webbrowser
 from secsgml.utils import bytes_to_str
@@ -294,7 +292,6 @@ class Document:
             return bool(re.search(pattern, self.content))
         return False

-    # Note: this method will be heavily modified in the future
     def parse(self):
         # check if we have already parsed the content
         if self._data:
@@ -384,6 +381,8 @@ class Document:
             dct = html2dict(content=self.content, mapping_dict=mapping_dict)
         elif self.extension in ['.txt']:
             dct = txt2dict(content=self.content, mapping_dict=mapping_dict)
+        elif self.extension == '.pdf':
+            dct = pdf2dict(content=self.content, mapping_dict=mapping_dict)
         else:
             dct = {}

@@ -391,10 +390,8 @@ class Document:
         elif self.extension == '.xml':
             if self.type in ['3', '4', '5', '3/A', '4/A', '5/A']:
                 mapping_dict = dict_345
-
             self._data = xml2dict(content=self.content, mapping_dict=mapping_dict)
-
-            self._data = pdf2dict(content=self.content, mapping_dict=mapping_dict)
+
         else:
             pass

@@ -409,6 +406,12 @@ class Document:

             if not isinstance(self._data, DataWithTags):
                 self._data = DataWithTags(self._data, self)
+        elif self.extension == '.xml':
+            if self._data is None:
+                self.parse()
+
+            if self._data is None:
+                self._data = {}

         return self._data

@@ -444,19 +447,46 @@ class Document:
             json.dump(self.data, f, indent=2)

     def parse_tables(self,must_exist_in_mapping=True):
-
-
+        """Must exist in mapping means columns must occur in mapping schema."""
+        if self.extension == '.xml':
+            tables = Tables(document_type = self.type, accession=self.accession)
+            tables.parse_tables(data=self.data,must_exist_in_mapping=must_exist_in_mapping)
+            self._tables = tables
+
+        elif self._data_bool:
+            tables = Tables(document_type = self.type, accession=self.accession)
+            data_tuples = self.data_tuples
+
+            for i, (id, type, content, level) in enumerate(data_tuples):
+                if type == "table" and i > 0:
+                    description = None
+
+                    # Look at previous element
+                    prev_id, prev_type, prev_content, prev_level = data_tuples[i-1]
+
+                    # Case 1: Same level + text content
+                    if prev_level == level and prev_type in ["text", "textsmall"]:
+                        description = prev_content
+
+                    # Case 2: Higher level (lower number) + title
+                    elif prev_level < level and prev_type == "title":
+                        description = prev_content
+
+                    # Case 3: No matching description - add table without description
+                    # (description remains None)
+
+                    tables.add_table(data=content, description=description, name="extracted_table")
+
+            self._tables = tables
+
         else:
-
-            data = self.data
-            tables = Tables(document_type = self.type, accession=self.accession, data=data,must_exist_in_mapping=must_exist_in_mapping)
-            self._tables = tables.tables
+            self._tables = []

     @property
     def tables(self):
         if self._tables is None:
             self.parse_tables()
-        return self._tables
+        return self._tables.tables


     def write_csv(self, output_folder):
@@ -547,6 +577,7 @@ class Document:
             webbrowser.open('file://' + temp_path)
         else:
             print(f"Cannot open files with extension {self.extension}")
+
     def get_section(self, title=None, title_regex=None,title_class=None, format='dict'):
         if self._data_bool:
             if not self.data:
@@ -557,3 +588,9 @@ class Document:
                 return [item[1] for item in result]
             else:
                 return [flatten_dict(item[1],format) for item in result]
+
+    # TODO
+    def get_tables(self,description_regex=None,name=None):
+        # make sure tables is initialized
+        self.tables
+        return self._tables.get_tables(description_regex=description_regex, name=name)
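
The table plumbing on Document changes shape here: parse_tables now builds a Tables container, the tables property returns that container's list, and a new get_tables method filters by the per-table description. A hedged sketch of the resulting call pattern (doc stands for an already-loaded Document and the regex is a placeholder):

# Sketch only: table access after the parse_tables/get_tables rework; doc is a placeholder Document.
doc.parse_tables()                           # builds a Tables container internally
all_tables = doc.tables                      # property now returns the container's list of Table objects
matches = doc.get_tables(description_regex=r'(?i)revenue')   # filter on the new description attribute
for table in matches:
    print(table)                             # Table.__str__ (added in tables.py below) renders name, description, and rows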

{datamule-2.2.9 → datamule-2.3.2}/datamule/document/tables/tables.py

@@ -6,8 +6,10 @@ from .tables_npx import config_npx
 from .tables_sbsef import config_sbsef
 from .tables_sdr import config_sdr
 from .tables_proxyvotingrecord import config_proxyvotingrecord
+from doc2dict.utils.format_dict import _format_table

 from .utils import safe_get, flatten_dict
+import re
 # will add filing date param later? or extension
 all_tables_dict = {
     '3' : config_ownership,
@@ -93,25 +95,30 @@ def apply_mapping(flattened_data, mapping_dict, accession, must_exist_in_mapping

 # should have table type, accession, data
 class Table:
-    def __init__(self,data,name,accession):
+    def __init__(self,data,name,accession,description = None):
         self.data = data
         self.name = name
         self.accession = accession
+        self.description = description
+
+    # TODO MADE IN A HURRY #
+    def __str__(self):
+        formatted_table = _format_table(self.data)
+        if isinstance(formatted_table, list):
+            table_str = '\n'.join(formatted_table)
+        else:
+            table_str = str(formatted_table)
+        return f"Table '{self.name}' ({self.accession}) - {len(self.data) if isinstance(self.data, list) else 'N/A'} rows\ndescription: {self.description if self.description else ''}\n{table_str}"


 class Tables():
-    def __init__(self,document_type,accession
+    def __init__(self,document_type,accession):
         self.document_type = document_type
         self.accession = accession
-        self.data = data
-
-        # to fill in
         self.tables = []

-
-
-    def parse_tables(self,must_exist_in_mapping=True):
-        # first select dict
+    def parse_tables(self,data,must_exist_in_mapping=True):
+        self.data = data

         try:
             tables_dict = all_tables_dict[self.document_type]
@@ -120,11 +127,32 @@ class Tables():

         # now get the dicts from the data
         data_dicts = seperate_data(tables_dict,self.data)
-
+
         # now flatten
         data_dicts = [(x,flatten_dict(y)) for x,y in data_dicts]

         for table_name, flattened_data in data_dicts:
             mapping_dict = tables_dict[table_name]['mapping']
             mapped_data = apply_mapping(flattened_data, mapping_dict, self.accession,must_exist_in_mapping)
-            self.tables.append(Table(mapped_data, table_name, self.accession))
+            self.tables.append(Table(mapped_data, table_name, self.accession))
+
+    def add_table(self,data,name,description=None):
+        self.tables.append(Table(data=data,name=name,accession=self.accession,description=description))
+
+    def get_tables(self, description_regex=None, name=None):
+        matching_tables = []
+
+        for table in self.tables:
+            # Check name match (exact match)
+            if name is not None:
+                if table.name == name:
+                    matching_tables.append(table)
+                    continue
+
+            # Check description regex match
+            if description_regex is not None and table.description is not None:
+                if re.search(description_regex, table.description):
+                    matching_tables.append(table)
+                    continue
+
+        return matching_tables
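
To make the new matching rules concrete: get_tables returns tables whose name matches exactly or whose description matches re.search, and tables with no description are skipped by the regex branch. A small standalone sketch; the document type, accession, and row data are invented for illustration:

# Standalone sketch of the new Tables API; all values below are invented.
from datamule.document.tables.tables import Tables

tables = Tables(document_type='10-K', accession='0000000000-24-000001')
tables.add_table(data=[['Revenue', '100']], name='extracted_table', description='Consolidated revenue')
tables.add_table(data=[['Assets', '500']], name='extracted_table')   # no description attached

tables.get_tables(name='extracted_table')          # exact name match: returns both tables
tables.get_tables(description_regex='revenue')     # re.search on description: returns only the first table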

{datamule-2.2.9 → datamule-2.3.2}/datamule.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 2.2.9
+Version: 2.3.2
 Summary: Work with SEC submissions at scale.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman
@@ -20,3 +20,4 @@ Requires-Dist: secsgml
 Requires-Dist: websocket-client
 Requires-Dist: company_fundamentals
 Requires-Dist: flashtext
+Requires-Dist: aioboto3

{datamule-2.2.9 → datamule-2.3.2}/datamule.egg-info/SOURCES.txt

@@ -14,6 +14,9 @@ datamule.egg-info/SOURCES.txt
 datamule.egg-info/dependency_links.txt
 datamule.egg-info/requires.txt
 datamule.egg-info/top_level.txt
+datamule/book/__init__.py
+datamule/book/book.py
+datamule/book/s3transfer.py
 datamule/data/listed_filer_metadata.csv
 datamule/datamule/__init__.py
 datamule/datamule/datamule_lookup.py

{datamule-2.2.9 → datamule-2.3.2}/setup.py

@@ -32,7 +32,7 @@ if not os.path.exists(file_path):
 setup(
     name="datamule",
     author="John Friedman",
-    version="2.2.9",
+    version="2.3.2",
     description="Work with SEC submissions at scale.",
     packages=find_packages(include=['datamule', 'datamule.*']),
     url="https://github.com/john-friedman/datamule-python",
@@ -52,7 +52,8 @@ setup(
         'secsgml',
         'websocket-client',
         'company_fundamentals',
-        'flashtext'
+        'flashtext',
+        'aioboto3'
     ],
     # Include the data directory in the package
     package_data={