datamule 2.0.7__tar.gz → 3.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. {datamule-2.0.7 → datamule-3.1.0}/PKG-INFO +3 -1
  2. {datamule-2.0.7 → datamule-3.1.0}/datamule/__init__.py +6 -3
  3. datamule-3.1.0/datamule/book/book.py +38 -0
  4. datamule-3.1.0/datamule/book/download_dataset_from_s3.py +106 -0
  5. datamule-3.1.0/datamule/book/s3transfer.py +263 -0
  6. datamule-3.1.0/datamule/datamule/datamule_lookup.py +86 -0
  7. datamule-3.1.0/datamule/datamule/datamule_mysql_rds.py +160 -0
  8. {datamule-2.0.7 → datamule-3.1.0}/datamule/datamule/downloader.py +85 -91
  9. {datamule-2.0.7 → datamule-3.1.0}/datamule/datamule/sec_connector.py +13 -11
  10. datamule-3.1.0/datamule/datamule/tar_downloader.py +650 -0
  11. datamule-3.1.0/datamule/datasets.py +53 -0
  12. datamule-3.1.0/datamule/document/document.py +599 -0
  13. datamule-3.1.0/datamule/filings_constructor/filings_constructor.py +86 -0
  14. {datamule-2.0.7 → datamule-3.1.0}/datamule/helper.py +0 -1
  15. {datamule-2.0.7 → datamule-3.1.0}/datamule/mapping_dicts/html_mapping_dicts.py +11 -4
  16. {datamule-2.0.7/datamule → datamule-3.1.0/datamule/portfolio}/portfolio.py +74 -41
  17. datamule-2.0.7/datamule/portfolio_compression_utils.py → datamule-3.1.0/datamule/portfolio/portfolio_compression_utils_legacy.py +2 -0
  18. datamule-3.1.0/datamule/providers/providers.py +7 -0
  19. datamule-3.1.0/datamule/sec/submissions/__init__.py +0 -0
  20. {datamule-2.0.7 → datamule-3.1.0}/datamule/sec/submissions/downloader.py +6 -2
  21. {datamule-2.0.7 → datamule-3.1.0}/datamule/sec/submissions/streamer.py +1 -1
  22. datamule-3.1.0/datamule/sec/xbrl/__init__.py +0 -0
  23. datamule-3.1.0/datamule/seclibrary/__init__.py +0 -0
  24. {datamule-2.0.7 → datamule-3.1.0}/datamule/seclibrary/bq.py +2 -0
  25. datamule-3.1.0/datamule/sheet/__init__.py +0 -0
  26. datamule-3.1.0/datamule/sheet/sheet.py +120 -0
  27. datamule-3.1.0/datamule/submission/__init__.py +0 -0
  28. {datamule-2.0.7/datamule → datamule-3.1.0/datamule/submission}/submission.py +177 -95
  29. datamule-3.1.0/datamule/submission/tar_submission.py +81 -0
  30. datamule-3.1.0/datamule/tables/__init__.py +0 -0
  31. datamule-3.1.0/datamule/tables/tables.py +199 -0
  32. datamule-3.1.0/datamule/tables/tables_informationtable.py +20 -0
  33. {datamule-2.0.7/datamule/document → datamule-3.1.0/datamule}/tables/tables_proxyvotingrecord.py +3 -2
  34. datamule-3.1.0/datamule/tags/__init__.py +0 -0
  35. datamule-3.1.0/datamule/tags/config.py +47 -0
  36. datamule-3.1.0/datamule/tags/dictionaries.py +117 -0
  37. datamule-3.1.0/datamule/tags/regex.py +105 -0
  38. datamule-3.1.0/datamule/tags/utils.py +178 -0
  39. datamule-3.1.0/datamule/utils/__init__.py +0 -0
  40. datamule-3.1.0/datamule/utils/compression.py +35 -0
  41. datamule-3.1.0/datamule/utils/convenience.py +4 -0
  42. datamule-3.1.0/datamule/utils/pdf.py +25 -0
  43. {datamule-2.0.7 → datamule-3.1.0}/datamule.egg-info/PKG-INFO +3 -1
  44. {datamule-2.0.7 → datamule-3.1.0}/datamule.egg-info/SOURCES.txt +39 -17
  45. {datamule-2.0.7 → datamule-3.1.0}/datamule.egg-info/requires.txt +2 -0
  46. {datamule-2.0.7 → datamule-3.1.0}/setup.py +4 -2
  47. datamule-2.0.7/datamule/datamule/datamule_lookup.py +0 -235
  48. datamule-2.0.7/datamule/datamule/datamule_mysql_rds.py +0 -295
  49. datamule-2.0.7/datamule/document/document.py +0 -388
  50. datamule-2.0.7/datamule/document/tables/tables.py +0 -129
  51. datamule-2.0.7/datamule/document/tables/tables_informationtable.py +0 -39
  52. datamule-2.0.7/datamule/mapping_dicts/txt_mapping_dicts.py +0 -234
  53. datamule-2.0.7/datamule/sheet.py +0 -706
  54. {datamule-2.0.7/datamule/datamule → datamule-3.1.0/datamule/book}/__init__.py +0 -0
  55. {datamule-2.0.7/datamule/document → datamule-3.1.0/datamule/cloud}/__init__.py +0 -0
  56. {datamule-2.0.7 → datamule-3.1.0}/datamule/config.py +0 -0
  57. {datamule-2.0.7 → datamule-3.1.0}/datamule/data/listed_filer_metadata.csv +0 -0
  58. {datamule-2.0.7/datamule/document/tables → datamule-3.1.0/datamule/datamule}/__init__.py +0 -0
  59. {datamule-2.0.7/datamule/mapping_dicts → datamule-3.1.0/datamule/document}/__init__.py +0 -0
  60. {datamule-2.0.7/datamule/sec → datamule-3.1.0/datamule/filings_constructor}/__init__.py +0 -0
  61. {datamule-2.0.7 → datamule-3.1.0}/datamule/index.py +0 -0
  62. {datamule-2.0.7/datamule/sec/infrastructure → datamule-3.1.0/datamule/mapping_dicts}/__init__.py +0 -0
  63. {datamule-2.0.7 → datamule-3.1.0}/datamule/mapping_dicts/xml_mapping_dicts.py +0 -0
  64. {datamule-2.0.7 → datamule-3.1.0}/datamule/package_updater.py +0 -0
  65. {datamule-2.0.7/datamule/sec/submissions → datamule-3.1.0/datamule/portfolio}/__init__.py +0 -0
  66. {datamule-2.0.7/datamule/sec/xbrl → datamule-3.1.0/datamule/providers}/__init__.py +0 -0
  67. {datamule-2.0.7/datamule/seclibrary → datamule-3.1.0/datamule/sec}/__init__.py +0 -0
  68. {datamule-2.0.7/datamule/utils → datamule-3.1.0/datamule/sec/infrastructure}/__init__.py +0 -0
  69. {datamule-2.0.7 → datamule-3.1.0}/datamule/sec/infrastructure/submissions_metadata.py +0 -0
  70. {datamule-2.0.7 → datamule-3.1.0}/datamule/sec/submissions/eftsquery.py +0 -0
  71. {datamule-2.0.7 → datamule-3.1.0}/datamule/sec/submissions/monitor.py +0 -0
  72. {datamule-2.0.7 → datamule-3.1.0}/datamule/sec/submissions/textsearch.py +0 -0
  73. {datamule-2.0.7 → datamule-3.1.0}/datamule/sec/utils.py +0 -0
  74. {datamule-2.0.7 → datamule-3.1.0}/datamule/sec/xbrl/downloadcompanyfacts.py +0 -0
  75. {datamule-2.0.7 → datamule-3.1.0}/datamule/sec/xbrl/filter_xbrl.py +0 -0
  76. {datamule-2.0.7 → datamule-3.1.0}/datamule/sec/xbrl/streamcompanyfacts.py +0 -0
  77. {datamule-2.0.7 → datamule-3.1.0}/datamule/sec/xbrl/xbrlmonitor.py +0 -0
  78. {datamule-2.0.7/datamule/document → datamule-3.1.0/datamule}/tables/tables_13fhr.py +0 -0
  79. {datamule-2.0.7/datamule/document → datamule-3.1.0/datamule}/tables/tables_25nse.py +0 -0
  80. {datamule-2.0.7/datamule/document → datamule-3.1.0/datamule}/tables/tables_npx.py +0 -0
  81. {datamule-2.0.7/datamule/document → datamule-3.1.0/datamule}/tables/tables_ownership.py +0 -0
  82. {datamule-2.0.7/datamule/document → datamule-3.1.0/datamule}/tables/tables_sbsef.py +0 -0
  83. {datamule-2.0.7/datamule/document → datamule-3.1.0/datamule}/tables/tables_sdr.py +0 -0
  84. {datamule-2.0.7/datamule/document → datamule-3.1.0/datamule}/tables/utils.py +0 -0
  85. {datamule-2.0.7 → datamule-3.1.0}/datamule/utils/construct_submissions_data.py +0 -0
  86. {datamule-2.0.7 → datamule-3.1.0}/datamule/utils/format_accession.py +0 -0
  87. {datamule-2.0.7 → datamule-3.1.0}/datamule.egg-info/dependency_links.txt +0 -0
  88. {datamule-2.0.7 → datamule-3.1.0}/datamule.egg-info/top_level.txt +0 -0
  89. {datamule-2.0.7 → datamule-3.1.0}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 2.0.7
+Version: 3.1.0
 Summary: Work with SEC submissions at scale.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman
@@ -19,3 +19,5 @@ Requires-Dist: secxbrl
 Requires-Dist: secsgml
 Requires-Dist: websocket-client
 Requires-Dist: company_fundamentals
+Requires-Dist: flashtext
+Requires-Dist: aioboto3
@@ -1,13 +1,14 @@
-from .submission import Submission
-from .portfolio import Portfolio
+from .submission.submission import Submission
+from .portfolio.portfolio import Portfolio
 from .document.document import Document
 from .helper import _load_package_csv, load_package_dataset
 from .config import Config
-from .sheet import Sheet
+from .sheet.sheet import Sheet
 from .index import Index
 from .package_updater import PackageUpdater
 from .utils.format_accession import format_accession
 from .utils.construct_submissions_data import construct_submissions_data
+from .book.book import Book
 
 
 # Keep the notebook environment setup
@@ -31,6 +32,8 @@ def _setup_notebook_env():
 # Set up notebook environment
 _setup_notebook_env()
 
+
+# TODO, is this load bearing?
 __all__ = [
     '_load_package_csv',
     'load_package_dataset',
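
Note that the top-level import surface is intended to stay the same across the module moves, since __init__.py re-exports the relocated classes under their old names and adds Book. A minimal, hedged sanity-check sketch (not part of the diff):

    # Sketch only: these imports are expected to keep working after the 2.0.7 -> 3.1.0
    # upgrade because datamule/__init__.py re-exports the moved modules, per the hunk above.
    from datamule import Portfolio, Submission, Sheet, Document, Book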
@@ -0,0 +1,38 @@
+import os
+from .s3transfer import s3_transfer as _s3_transfer
+from .download_dataset_from_s3 import download_dataset as _download_dataset
+
+class Book:
+    def __init__(self, api_key=None):
+        if api_key is not None:
+            self._api_key = api_key
+
+    @property
+    def api_key(self):
+        return getattr(self, '_api_key', None) or os.getenv('DATAMULE_API_KEY')
+
+    @api_key.setter
+    def api_key(self, value):
+        if not value:
+            raise ValueError("API key cannot be empty")
+        self._api_key = value
+
+    def s3_transfer(self, datamule_bucket, s3_credentials, max_workers=4,
+                    errors_json_filename='s3_transfer_errors.json', retry_errors=3,
+                    force_daily=True, cik=None, submission_type=None, filing_date=None,
+                    api_key=None, accession=None):
+
+        # Use provided key, or fall back to instance property
+        api_key = api_key or self.api_key
+
+        _s3_transfer(datamule_bucket=datamule_bucket, s3_credentials=s3_credentials,
+                     max_workers=max_workers, errors_json_filename=errors_json_filename,
+                     retry_errors=retry_errors, force_daily=force_daily, cik=cik,
+                     submission_type=submission_type, filing_date=filing_date,
+                     api_key=api_key, accession_number=accession)
+
+    def download_dataset(self, dataset, filename=None, api_key=None):
+        # Use provided key, or fall back to instance property
+        api_key = api_key or self.api_key
+
+        _download_dataset(dataset=dataset, filename=filename, api_key=api_key)
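
A hedged usage sketch for the new Book class added above (not part of the diff); it assumes a valid DATAMULE_API_KEY is set in the environment:

    import os
    from datamule import Book

    # The api_key property falls back to the DATAMULE_API_KEY environment variable
    # when no key was passed to the constructor or set explicitly.
    book = Book()
    assert book.api_key == os.getenv('DATAMULE_API_KEY')

    # Download a bulk dataset by its lowercase-underscore name; the output filename
    # is derived from the download URL unless one is supplied.
    book.download_dataset('sec_accessions')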
@@ -0,0 +1,106 @@
+import urllib.request
+import urllib.parse
+from tqdm import tqdm
+import json
+
+# Dataset name mapping - lowercase underscore to official name
+DATASET_NAME_MAP = {
+    'sec_accessions': 'SEC Accessions Master Index',
+    'sec_master_submissions': 'SEC Master Submissions Table',
+    'sec_accession_cik_table': 'SEC Accession CIK Table',
+    'sec_documents_table': 'SEC Documents Table',
+    'sec_submission_details_table': 'SEC Submissions Details Table',
+    'simple_xbrl_table': 'Simple XBRL Table',
+    'proxy_voting_records_table': 'Proxy Voting Records Table',
+    'institutional_holdings_table': 'Institutional Holdings Table',
+    'metadata_ownership_table': 'Insider Ownership Metadata Table',
+    'reporting_owner_ownership_table': 'Insider Reporting Owner Table',
+    'non_derivative_transaction_ownership_table': 'Insider Non-Derivative Transactions Table',
+    'non_derivative_holding_ownership_table': 'Insider Non-Derivative Holdings Table',
+    'derivative_transaction_ownership_table': 'Insider Derivative Transactions Table',
+    'derivative_holding_ownership_table': 'Insider Derivative Holdings Table',
+    'owner_signature_ownership_table': 'Insider Owner Signatures Table',
+}
+
+
+def download_dataset(dataset, api_key, filename=None):
+    """
+    Download a dataset from Datamule API
+
+    Args:
+        dataset: Dataset name (lowercase underscore format, e.g. 'sec_accessions')
+        api_key: Datamule API key
+        filename: Output filename (optional, extracted from URL if not provided)
+    """
+    # Map dataset name to official name
+    dataset_name = DATASET_NAME_MAP.get(dataset)
+    if not dataset_name:
+        raise ValueError(f"Unknown dataset: {dataset}")
+
+    # Get download URL from API
+    api_url = f"https://api.datamule.xyz/dataset/{urllib.parse.quote(dataset_name)}?api_key={api_key}"
+
+    # Create request with headers
+    req = urllib.request.Request(
+        api_url,
+        headers={
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+        }
+    )
+
+    try:
+        with urllib.request.urlopen(req) as response:
+            data = json.loads(response.read().decode())
+    except urllib.error.HTTPError as e:
+        error_body = e.read().decode()
+        raise Exception(f"API request failed: {error_body}")
+
+    if not data.get('success'):
+        raise Exception(f"API error: {data.get('error', 'Unknown error')}")
+
+    download_url = data['data']['download_url']
+    size_gb = data['data']['size_gb']
+
+    # Extract filename from URL if not provided
+    if filename is None:
+        # Parse the path parameter from the download URL
+        parsed = urllib.parse.urlparse(download_url)
+        query_params = urllib.parse.parse_qs(parsed.query)
+        path = query_params.get('path', [''])[0]
+        # Get the filename from the path (last part after /)
+        filename = urllib.parse.unquote(path.split('/')[-1])
+        if not filename:
+            filename = f"{dataset}.download"
+
+    # Download file with progress bar
+    print(f"Downloading {dataset} ({size_gb:.2f} GB)...")
+
+    # Create request with headers for download
+    download_req = urllib.request.Request(
+        download_url,
+        headers={
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+        }
+    )
+
+    try:
+        with urllib.request.urlopen(download_req) as response:
+            total_size = int(response.headers.get('Content-Length', 0))
+
+            with open(filename, 'wb') as f, tqdm(
+                total=total_size,
+                unit='B',
+                unit_scale=True,
+                desc=filename
+            ) as pbar:
+                while True:
+                    chunk = response.read(8192)
+                    if not chunk:
+                        break
+                    f.write(chunk)
+                    pbar.update(len(chunk))
+    except urllib.error.HTTPError as e:
+        error_body = e.read().decode()
+        raise Exception(f"Download failed: {error_body}")
+
+    print(f"Downloaded to {filename}")
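
Based on the parsing above, the dataset endpoint is expected to return a JSON body with a success flag plus a signed download URL and a size in GB. A hypothetical illustration of that shape, with invented values (only the keys read by download_dataset are shown):

    # Hypothetical response from https://api.datamule.xyz/dataset/<name>?api_key=...
    # download_dataset() only reads 'success', data['download_url'], and data['size_gb'].
    response_body = {
        'success': True,
        'data': {
            'download_url': 'https://example.com/files?path=datasets%2Fsec_accessions.csv',
            'size_gb': 1.23,
        },
    }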
@@ -0,0 +1,263 @@
+import asyncio
+import aiohttp
+import aioboto3
+import ssl
+import time
+import json
+from datetime import datetime, timedelta
+from urllib.parse import urlparse
+from tqdm import tqdm
+import logging
+from ..sheet.sheet import Sheet
+from ..utils.format_accession import format_accession
+
+# Set up logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+
+def generate_date_range(start_date_str, end_date_str):
+    start_date = datetime.strptime(start_date_str, '%Y-%m-%d')
+    end_date = datetime.strptime(end_date_str, '%Y-%m-%d')
+
+    dates = []
+    current_date = start_date
+
+    while current_date <= end_date:
+        dates.append(current_date.strftime('%Y-%m-%d'))
+        current_date += timedelta(days=1)
+
+    return dates
+
+
+def get_filings_sgml_r2_urls(submission_type=None, cik=None, datamule_api_key=None, filing_date=None, accession_number=None):
+    datamule_bucket_endpoint = 'https://sec-library.datamule.xyz/'
+    sheet = Sheet('s3transfer')
+    submissions = sheet.get_submissions(distinct=True, quiet=False, api_key=datamule_api_key,
+                                        submission_type=submission_type, cik=cik, columns=['accessionNumber'], filing_date=filing_date,
+                                        accession_number=accession_number)
+
+    accessions = [format_accession(sub['accessionNumber'], 'no-dash') for sub in submissions]
+
+    urls = [f"{datamule_bucket_endpoint}{accession}.sgml" for accession in accessions]
+
+    return urls
+
+
+class AsyncS3Transfer:
+    def __init__(self, s3_credentials, max_workers=100, chunk_size=2*1024*1024):
+        self.s3_credentials = s3_credentials
+        self.max_workers = max_workers
+        self.chunk_size = chunk_size
+
+    async def __aenter__(self):
+        # Create aiohttp session with optimized connector
+        connector = aiohttp.TCPConnector(
+            limit=self.max_workers,
+            force_close=False,
+            ssl=ssl.create_default_context(),
+            ttl_dns_cache=300,
+            keepalive_timeout=60
+        )
+
+        self.session = aiohttp.ClientSession(
+            connector=connector,
+            timeout=aiohttp.ClientTimeout(total=600),
+            headers={
+                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+                'Connection': 'keep-alive',
+                'Accept-Encoding': 'gzip, deflate, br'
+            }
+        )
+
+        # Create async boto3 client
+        if self.s3_credentials['s3_provider'] == 'aws':
+            session = aioboto3.Session()
+            self.s3_client = await session.client(
+                's3',
+                aws_access_key_id=self.s3_credentials['aws_access_key_id'],
+                aws_secret_access_key=self.s3_credentials['aws_secret_access_key'],
+                region_name=self.s3_credentials['region_name']
+            ).__aenter__()
+        else:
+            raise ValueError("S3 Provider not supported yet. Please use another provider or email johnfriedman@datamule.xyz to add support.")
+
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        if hasattr(self, 'session') and self.session:
+            await self.session.close()
+        if hasattr(self, 's3_client') and self.s3_client:
+            await self.s3_client.__aexit__(exc_type, exc_val, exc_tb)
+
+    async def transfer_single_file(self, semaphore, url, retry_errors=3):
+        """Transfer a single file with retry logic and preserve metadata"""
+        async with semaphore:
+            filename = urlparse(url).path.split('/')[-1]
+            s3_key = filename
+            bucket_name = self.s3_credentials['bucket_name']
+
+            last_error = None
+
+            for attempt in range(retry_errors + 1):
+                try:
+                    async with self.session.get(url) as response:
+                        if response.status == 200:
+                            # Capture source metadata from response headers
+                            content_length = response.headers.get('Content-Length')
+                            size_bytes = int(content_length) if content_length else 0
+                            content_type = response.headers.get('Content-Type', 'application/octet-stream')
+                            last_modified = response.headers.get('Last-Modified')
+
+                            # Read response content
+                            content = await response.read()
+
+                            # Prepare S3 upload parameters with preserved metadata
+                            upload_params = {
+                                'Bucket': bucket_name,
+                                'Key': s3_key,
+                                'Body': content,
+                                'ContentType': content_type,
+                                'StorageClass': 'STANDARD',
+                                'Metadata': {
+                                    'source-url': url,
+                                    'original-size': str(size_bytes),
+                                    'transfer-date': datetime.utcnow().isoformat()
+                                }
+                            }
+
+                            # Add last modified if available
+                            if last_modified:
+                                upload_params['Metadata']['original-last-modified'] = last_modified
+
+                            # Upload to S3 with metadata
+                            await self.s3_client.put_object(**upload_params)
+
+                            return {
+                                'success': True,
+                                'url': url,
+                                'message': f"Copied: {url} -> s3://{bucket_name}/{s3_key}",
+                                'size_bytes': size_bytes,
+                                's3_key': s3_key,
+                                'content_type': content_type,
+                                'last_modified': last_modified
+                            }
+                        else:
+                            raise aiohttp.ClientResponseError(
+                                request_info=response.request_info,
+                                history=response.history,
+                                status=response.status
+                            )
+
+                except Exception as e:
+                    print(e)
+                    last_error = e
+                    if attempt < retry_errors:
+                        await asyncio.sleep(2 ** attempt)  # Exponential backoff
+
+            # All attempts failed
+            return {
+                'success': False,
+                'url': url,
+                'error': str(last_error),
+                'message': f"Failed to copy {url} after {retry_errors + 1} attempts: {last_error}",
+                'size_bytes': 0
+            }
+
+    async def transfer_batch(self, urls, retry_errors=3):
+        """Transfer multiple files concurrently"""
+        semaphore = asyncio.Semaphore(self.max_workers)
+        failed_files = []
+        total_bytes = 0
+        start_time = time.time()
+
+        # Create tasks for all transfers
+        tasks = [
+            self.transfer_single_file(semaphore, url, retry_errors)
+            for url in urls
+        ]
+
+        # Process with progress bar
+        with tqdm(total=len(urls), desc="Transferring files", unit="file") as pbar:
+            for coro in asyncio.as_completed(tasks):
+                result = await coro
+
+                if result['success']:
+                    total_bytes += result.get('size_bytes', 0)
+                else:
+                    failed_files.append(result)
+
+                # Update progress bar with total GB transferred
+                total_gb = total_bytes / (1024 ** 3)
+                pbar.set_postfix({'Total': f'{total_gb:.2f} GB'})
+
+                pbar.update(1)
+
+        return failed_files, total_bytes
+
+
+async def async_transfer_cached_urls_to_s3(urls, s3_credentials, max_workers=4,
+                                           errors_json_filename='s3_transfer_errors.json',
+                                           retry_errors=3):
+    """Async version of transfer_cached_urls_to_s3"""
+    failed_files = []
+    total_bytes = 0
+
+    async with AsyncS3Transfer(s3_credentials, max_workers) as transfer:
+        failed_files, total_bytes = await transfer.transfer_batch(urls, retry_errors)
+
+    # Save errors to JSON if filename provided and there are errors
+    if errors_json_filename and failed_files:
+        with open(errors_json_filename, 'w') as f:
+            json.dump(failed_files, f, indent=2)
+        print(f"Saved {len(failed_files)} errors to {errors_json_filename}")
+
+    print(f"Transfer complete: {len(urls) - len(failed_files)}/{len(urls)} files successful")
+
+
+def transfer_cached_urls_to_s3(urls, s3_credentials, max_workers=4, errors_json_filename='s3_transfer_errors.json', retry_errors=3):
+    """Wrapper to run async transfer in sync context"""
+    asyncio.run(async_transfer_cached_urls_to_s3(urls, s3_credentials, max_workers, errors_json_filename, retry_errors))
+
+
+def s3_transfer(datamule_bucket, s3_credentials, max_workers=4, errors_json_filename='s3_transfer_errors.json', retry_errors=3,
+                force_daily=True, cik=None, submission_type=None, filing_date=None, datamule_api_key=None, accession_number=None):
+
+    if datamule_bucket in ['filings_sgml_r2', 'sec_filings_sgml_r2']:
+
+        if accession_number is not None:
+            if any(param is not None for param in [cik, submission_type, filing_date]):
+                raise ValueError('If accession is provided, then cik, type, and date must be None')
+            urls = get_filings_sgml_r2_urls(datamule_api_key=datamule_api_key, accession_number=accession_number)
+            transfer_cached_urls_to_s3(urls=urls, s3_credentials=s3_credentials, max_workers=max_workers, errors_json_filename=errors_json_filename, retry_errors=retry_errors)
+        else:
+            if not force_daily:
+                urls = get_filings_sgml_r2_urls(submission_type=submission_type, cik=cik, datamule_api_key=datamule_api_key,
+                                                filing_date=filing_date)
+                transfer_cached_urls_to_s3(urls=urls, s3_credentials=s3_credentials, max_workers=max_workers, errors_json_filename=errors_json_filename, retry_errors=retry_errors)
+            else:
+                if isinstance(filing_date, str):
+                    urls = get_filings_sgml_r2_urls(submission_type=submission_type, cik=cik, datamule_api_key=datamule_api_key,
+                                                    filing_date=filing_date)
+                    transfer_cached_urls_to_s3(urls=urls, s3_credentials=s3_credentials, max_workers=max_workers, errors_json_filename=errors_json_filename, retry_errors=retry_errors)
+                elif isinstance(filing_date, list):
+                    for date in filing_date:
+                        print(f"Transferring {date}")
+                        urls = get_filings_sgml_r2_urls(submission_type=submission_type, cik=cik, datamule_api_key=datamule_api_key,
+                                                        filing_date=date)
+                        transfer_cached_urls_to_s3(urls=urls, s3_credentials=s3_credentials, max_workers=max_workers, errors_json_filename=errors_json_filename, retry_errors=retry_errors)
+                elif isinstance(filing_date, tuple):
+                    dates = generate_date_range(filing_date[0], filing_date[1])
+                    for date in dates:
+                        print(f"Transferring {date}")
+                        urls = get_filings_sgml_r2_urls(submission_type=submission_type, cik=cik, datamule_api_key=datamule_api_key,
+                                                        filing_date=date)
+                        transfer_cached_urls_to_s3(urls=urls, s3_credentials=s3_credentials, max_workers=max_workers, errors_json_filename=errors_json_filename, retry_errors=retry_errors)
+                else:
+                    raise ValueError('filing_date can only be string, list, or (startdt,enddt)')
+
+    else:
+        raise ValueError('Datamule S3 bucket not found.')
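
A hedged sketch of the module-level s3_transfer entry point added above (not part of the diff); the import path follows from the file listing, and the bucket name and credentials below are placeholders:

    from datamule.book.s3transfer import s3_transfer

    # Keys mirror the s3_credentials fields read by AsyncS3Transfer.__aenter__ and
    # transfer_single_file; only the 'aws' provider is supported by this code.
    s3_credentials = {
        's3_provider': 'aws',
        'aws_access_key_id': 'PLACEHOLDER',
        'aws_secret_access_key': 'PLACEHOLDER',
        'region_name': 'us-east-1',
        'bucket_name': 'my-destination-bucket',
    }

    # Copy one day of SGML filings from the Datamule R2 bucket into the destination bucket.
    s3_transfer(datamule_bucket='filings_sgml_r2',
                s3_credentials=s3_credentials,
                submission_type='10-K',
                filing_date='2024-01-02',
                datamule_api_key='PLACEHOLDER')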
@@ -0,0 +1,86 @@
+from ..sheet.sheet import Sheet
+from ..utils.format_accession import format_accession
+
+from ..helper import _process_cik_and_metadata_filters
+
+def _filters(accession_numbers, filtered_accession_numbers=None, skip_accession_numbers=None):
+    """
+    Apply intersection and exclusion filters to accession numbers.
+
+    Args:
+        accession_numbers: List of accession numbers to filter
+        filtered_accession_numbers: If provided, only keep accessions in this list (intersection)
+        skip_accession_numbers: If provided, remove accessions in this list (exclusion)
+
+    Returns:
+        Filtered list of accession numbers
+    """
+
+    # Apply intersection filter if provided
+    if filtered_accession_numbers is not None:
+        filtered_accession_numbers = [format_accession(item, 'int') for item in filtered_accession_numbers]
+        filtered_set = set(filtered_accession_numbers)
+        accession_numbers = [acc for acc in accession_numbers if acc in filtered_set]
+
+    # Apply exclusion filter if provided
+    if skip_accession_numbers is not None:
+        skip_accession_numbers = [format_accession(item, 'int') for item in skip_accession_numbers]
+        skip_set = set(skip_accession_numbers)
+        accession_numbers = [acc for acc in accession_numbers if acc not in skip_set]
+
+    return accession_numbers
+
+
+def datamule_lookup(cik=None, ticker=None, submission_type=None, filing_date=None,
+                    report_date=None, detected_time=None,
+                    contains_xbrl=None, document_type=None, filename=None,
+                    sequence=None, quiet=False, api_key=None, filtered_accession_numbers=None,
+                    skip_accession_numbers=None, provider='datamule-tar', **kwargs):
+
+    lookup_args = {}
+
+    # Direct mappings
+    cik = _process_cik_and_metadata_filters(cik, ticker, **kwargs)
+    if cik is not None:
+        lookup_args['cik'] = cik
+
+    if submission_type is not None:
+        lookup_args['submissionType'] = submission_type
+
+    # Filing date - can be specific date(s) or range
+    if filing_date is not None:
+        lookup_args['filingDate'] = filing_date
+
+
+    # Report date - can be specific date(s) or range
+    if report_date is not None:
+        lookup_args['reportDate'] = report_date
+
+    if detected_time is not None:
+        lookup_args['detectedTime'] = detected_time
+
+    # XBRL flag
+    if contains_xbrl is not None:
+        lookup_args['containsXBRL'] = contains_xbrl
+
+    # Document-level filters
+    if document_type is not None:
+        lookup_args['documentType'] = document_type
+
+    if filename is not None:
+        lookup_args['filename'] = filename
+
+    if sequence is not None:
+        lookup_args['sequence'] = sequence
+
+    sheet = Sheet('')
+    if provider == 'datamule-sgml':
+        database = 'sgml-archive'
+    else:
+        database = 'tar-archive'
+    accessions = sheet.get_table(
+        database=database, **lookup_args
+    )
+    accessions = _filters(accession_numbers=accessions, filtered_accession_numbers=filtered_accession_numbers,
+                          skip_accession_numbers=skip_accession_numbers)
+    return accessions
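
A hedged sketch of calling the new lookup helper (not part of the diff); the import path follows from the file listing above, and all argument values are placeholders:

    from datamule.datamule.datamule_lookup import datamule_lookup

    # Resolve accession numbers from the tar archive, excluding ones already processed.
    accessions = datamule_lookup(ticker='PLACEHOLDER',
                                 submission_type='10-K',
                                 filing_date=('2023-01-01', '2023-12-31'),
                                 skip_accession_numbers=['0000000000-00-000000'],
                                 provider='datamule-tar')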