datamule-2.3.0-py3-none-any.whl → datamule-2.3.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of datamule has been flagged as possibly problematic.

datamule/__init__.py CHANGED
@@ -8,6 +8,7 @@ from .index import Index
  from .package_updater import PackageUpdater
  from .utils.format_accession import format_accession
  from .utils.construct_submissions_data import construct_submissions_data
+ from .book.book import Book


  # Keep the notebook environment setup
datamule/book/book.py ADDED
@@ -0,0 +1,13 @@
+ from .s3transfer import s3_transfer
+
+ class Book:
+     def __init__(self):
+         pass
+
+     def s3_transfer(self, datamule_bucket, s3_credentials, max_workers=4, errors_json_filename='s3_transfer_errors.json', retry_errors=3,
+                     force_daily=True, cik=None, submission_type=None, filing_date=None, datamule_api_key=None, accession=None):
+
+         s3_transfer(datamule_bucket=datamule_bucket, s3_credentials=s3_credentials, max_workers=max_workers,
+                     errors_json_filename=errors_json_filename, retry_errors=retry_errors,
+                     force_daily=force_daily, cik=cik, submission_type=submission_type,
+                     filing_date=filing_date, datamule_api_key=datamule_api_key, accession_number=accession)
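For orientation, a minimal usage sketch of the new Book entry point (not taken from the package docs; the credential keys mirror what AsyncS3Transfer in s3transfer.py below reads, and the key, region, and bucket values are placeholders):

from datamule import Book

# Hypothetical credentials dict; keys are the ones AsyncS3Transfer reads
s3_credentials = {
    's3_provider': 'aws',                      # only 'aws' is supported in this release
    'aws_access_key_id': 'YOUR_KEY_ID',        # placeholder
    'aws_secret_access_key': 'YOUR_SECRET',    # placeholder
    'region_name': 'us-east-1',                # placeholder
    'bucket_name': 'my-filings-bucket',        # placeholder destination bucket
}

Book().s3_transfer(
    datamule_bucket='filings_sgml_r2',         # the only source bucket s3_transfer accepts
    s3_credentials=s3_credentials,
    filing_date=('2024-01-01', '2024-01-03'),  # a tuple is expanded into one transfer per day
    datamule_api_key='YOUR_DATAMULE_API_KEY',  # placeholder
)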
datamule/book/s3transfer.py ADDED
@@ -0,0 +1,264 @@
+ import asyncio
+ import aiohttp
+ import aioboto3
+ import ssl
+ import time
+ import json
+ from datetime import datetime, timedelta
+ from urllib.parse import urlparse
+ from tqdm import tqdm
+ import logging
+ from ..sheet import Sheet
+ from ..utils.format_accession import format_accession
+
+ # Set up logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(levelname)s - %(message)s'
+ )
+ logger = logging.getLogger(__name__)
+
+
+ def generate_date_range(start_date_str, end_date_str):
+     start_date = datetime.strptime(start_date_str, '%Y-%m-%d')
+     end_date = datetime.strptime(end_date_str, '%Y-%m-%d')
+
+     dates = []
+     current_date = start_date
+
+     while current_date <= end_date:
+         dates.append(current_date.strftime('%Y-%m-%d'))
+         current_date += timedelta(days=1)
+
+     return dates
+
+
+ def get_filings_sgml_r2_urls(submission_type=None, cik=None, datamule_api_key=None, filing_date=None, accession_number=None):
+     datamule_bucket_endpoint = 'https://sec-library.datamule.xyz/'
+     sheet = Sheet('s3transfer')
+     submissions = sheet.get_submissions(distinct=True, quiet=False, api_key=datamule_api_key,
+                                         submission_type=submission_type, cik=cik, columns=['accessionNumber'], filing_date=filing_date,
+                                         accession_number=accession_number)
+
+     accessions = [format_accession(sub['accessionNumber'], 'no-dash') for sub in submissions]
+
+     urls = [f"{datamule_bucket_endpoint}{accession}.sgml" for accession in accessions]
+
+     return urls
+
+
+ class AsyncS3Transfer:
+     def __init__(self, s3_credentials, max_workers=100, chunk_size=2*1024*1024):
+         self.s3_credentials = s3_credentials
+         self.max_workers = max_workers
+         self.chunk_size = chunk_size
+
+     async def __aenter__(self):
+         # Create aiohttp session with optimized connector
+         connector = aiohttp.TCPConnector(
+             limit=self.max_workers,
+             force_close=False,
+             ssl=ssl.create_default_context(),
+             ttl_dns_cache=300,
+             keepalive_timeout=60
+         )
+
+         self.session = aiohttp.ClientSession(
+             connector=connector,
+             timeout=aiohttp.ClientTimeout(total=600),
+             headers={
+                 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+                 'Connection': 'keep-alive',
+                 'Accept-Encoding': 'gzip, deflate, br'
+             }
+         )
+
+         # Create async boto3 client
+         if self.s3_credentials['s3_provider'] == 'aws':
+             session = aioboto3.Session()
+             self.s3_client = await session.client(
+                 's3',
+                 aws_access_key_id=self.s3_credentials['aws_access_key_id'],
+                 aws_secret_access_key=self.s3_credentials['aws_secret_access_key'],
+                 region_name=self.s3_credentials['region_name']
+             ).__aenter__()
+         else:
+             raise ValueError("S3 Provider not supported yet. Please use another provider or email johnfriedman@datamule.xyz to add support.")
+
+         return self
+
+     async def __aexit__(self, exc_type, exc_val, exc_tb):
+         if hasattr(self, 'session') and self.session:
+             await self.session.close()
+         if hasattr(self, 's3_client') and self.s3_client:
+             await self.s3_client.__aexit__(exc_type, exc_val, exc_tb)
+
+     async def transfer_single_file(self, semaphore, url, retry_errors=3):
+         """Transfer a single file with retry logic and preserve metadata"""
+         async with semaphore:
+             filename = urlparse(url).path.split('/')[-1]
+             s3_key = filename
+             bucket_name = self.s3_credentials['bucket_name']
+
+             last_error = None
+
+             for attempt in range(retry_errors + 1):
+                 try:
+                     async with self.session.get(url) as response:
+                         if response.status == 200:
+                             # Capture source metadata from response headers
+                             content_length = response.headers.get('Content-Length')
+                             size_bytes = int(content_length) if content_length else 0
+                             content_type = response.headers.get('Content-Type', 'application/octet-stream')
+                             last_modified = response.headers.get('Last-Modified')
+
+                             # Read response content
+                             content = await response.read()
+
+                             # Prepare S3 upload parameters with preserved metadata
+                             upload_params = {
+                                 'Bucket': bucket_name,
+                                 'Key': s3_key,
+                                 'Body': content,
+                                 'ContentType': content_type,
+                                 'StorageClass': 'STANDARD',
+                                 'Metadata': {
+                                     'source-url': url,
+                                     'original-size': str(size_bytes),
+                                     'transfer-date': datetime.utcnow().isoformat()
+                                 }
+                             }
+
+                             # Add last modified if available
+                             if last_modified:
+                                 upload_params['Metadata']['original-last-modified'] = last_modified
+
+                             # Upload to S3 with metadata
+                             await self.s3_client.put_object(**upload_params)
+
+                             return {
+                                 'success': True,
+                                 'url': url,
+                                 'message': f"Copied: {url} -> s3://{bucket_name}/{s3_key}",
+                                 'size_bytes': size_bytes,
+                                 's3_key': s3_key,
+                                 'content_type': content_type,
+                                 'last_modified': last_modified
+                             }
+                         else:
+                             raise aiohttp.ClientResponseError(
+                                 request_info=response.request_info,
+                                 history=response.history,
+                                 status=response.status
+                             )
+
+                 except Exception as e:
+                     print(e)
+                     last_error = e
+                     if attempt < retry_errors:
+                         await asyncio.sleep(2 ** attempt)  # Exponential backoff
+
+             # All attempts failed
+             return {
+                 'success': False,
+                 'url': url,
+                 'error': str(last_error),
+                 'message': f"Failed to copy {url} after {retry_errors + 1} attempts: {last_error}",
+                 'size_bytes': 0
+             }
+
+     async def transfer_batch(self, urls, retry_errors=3):
+         """Transfer multiple files concurrently"""
+         semaphore = asyncio.Semaphore(self.max_workers)
+         failed_files = []
+         total_bytes = 0
+         start_time = time.time()
+
+         # Create tasks for all transfers
+         tasks = [
+             self.transfer_single_file(semaphore, url, retry_errors)
+             for url in urls
+         ]
+
+         # Process with progress bar
+         with tqdm(total=len(urls), desc="Transferring files", unit="file") as pbar:
+             for coro in asyncio.as_completed(tasks):
+                 result = await coro
+
+                 if result['success']:
+                     total_bytes += result.get('size_bytes', 0)
+                 else:
+                     failed_files.append(result)
+
+                 # Update progress bar with total GB transferred
+                 total_gb = total_bytes / (1024 ** 3)
+                 pbar.set_postfix({'Total': f'{total_gb:.2f} GB'})

+                 pbar.update(1)
+
+         return failed_files, total_bytes
+
+
+ async def async_transfer_cached_urls_to_s3(urls, s3_credentials, max_workers=4,
+                                            errors_json_filename='s3_transfer_errors.json',
+                                            retry_errors=3):
+     """Async version of transfer_cached_urls_to_s3"""
+     failed_files = []
+     total_bytes = 0
+
+     async with AsyncS3Transfer(s3_credentials, max_workers) as transfer:
+         failed_files, total_bytes = await transfer.transfer_batch(urls, retry_errors)
+
+     # Save errors to JSON if filename provided and there are errors
+     if errors_json_filename and failed_files:
+         with open(errors_json_filename, 'w') as f:
+             json.dump(failed_files, f, indent=2)
+         print(f"Saved {len(failed_files)} errors to {errors_json_filename}")
+
+     print(f"Transfer complete: {len(urls) - len(failed_files)}/{len(urls)} files successful")
+
+
+ def transfer_cached_urls_to_s3(urls, s3_credentials, max_workers=4, errors_json_filename='s3_transfer_errors.json', retry_errors=3):
+     """Wrapper to run async transfer in sync context"""
+     asyncio.run(async_transfer_cached_urls_to_s3(urls, s3_credentials, max_workers, errors_json_filename, retry_errors))
+
+
+ def s3_transfer(datamule_bucket, s3_credentials, max_workers=4, errors_json_filename='s3_transfer_errors.json', retry_errors=3,
+                 force_daily=True, cik=None, submission_type=None, filing_date=None, datamule_api_key=None, accession_number=None):
+
+     if datamule_bucket == 'filings_sgml_r2':
+
+         if accession_number is not None:
+             if any(param is not None for param in [cik, submission_type, filing_date]):
+                 raise ValueError('If accession is provided, then cik, type, and date must be None')
+             urls = get_filings_sgml_r2_urls(datamule_api_key=datamule_api_key, accession_number=accession_number)
+             transfer_cached_urls_to_s3(urls=urls, s3_credentials=s3_credentials, max_workers=max_workers, errors_json_filename=errors_json_filename, retry_errors=retry_errors)
+         else:
+             if not force_daily:
+                 urls = get_filings_sgml_r2_urls(submission_type=submission_type, cik=cik, datamule_api_key=datamule_api_key,
+                                                 filing_date=filing_date)
+                 transfer_cached_urls_to_s3(urls=urls, s3_credentials=s3_credentials, max_workers=max_workers, errors_json_filename=errors_json_filename, retry_errors=retry_errors)
+             else:
+                 if isinstance(filing_date, str):
+                     urls = get_filings_sgml_r2_urls(submission_type=submission_type, cik=cik, datamule_api_key=datamule_api_key,
+                                                     filing_date=filing_date)
+                     transfer_cached_urls_to_s3(urls=urls, s3_credentials=s3_credentials, max_workers=max_workers, errors_json_filename=errors_json_filename, retry_errors=retry_errors)
+                 elif isinstance(filing_date, list):
+                     for date in filing_date:
+                         print(f"Transferring {date}")
+                         urls = get_filings_sgml_r2_urls(submission_type=submission_type, cik=cik, datamule_api_key=datamule_api_key,
+                                                         filing_date=date)
+                         transfer_cached_urls_to_s3(urls=urls, s3_credentials=s3_credentials, max_workers=max_workers, errors_json_filename=errors_json_filename, retry_errors=retry_errors)
+                 elif isinstance(filing_date, tuple):
+                     dates = generate_date_range(filing_date[0], filing_date[1])
+                     for date in dates:
+                         print(f"Transferring {date}")
+                         urls = get_filings_sgml_r2_urls(submission_type=submission_type, cik=cik, datamule_api_key=datamule_api_key,
+                                                         filing_date=date)
+                         transfer_cached_urls_to_s3(urls=urls, s3_credentials=s3_credentials, max_workers=max_workers, errors_json_filename=errors_json_filename, retry_errors=retry_errors)
+                 else:
+                     raise ValueError('filing_date can only be string, list, or (startdt,enddt)')
+
+     else:
+         raise ValueError('Datamule S3 bucket not found.')
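Worth noting: with force_daily=True (the default), a (start, end) tuple is expanded into one transfer batch per day via generate_date_range. A quick illustration of that helper as defined above:

generate_date_range('2024-01-01', '2024-01-03')
# -> ['2024-01-01', '2024-01-02', '2024-01-03']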
datamule/datamule/downloader.py CHANGED
@@ -228,7 +228,7 @@ class Downloader:
          headers = {
              'Connection': 'keep-alive',
              'Accept-Encoding': 'gzip, deflate, br',
-             'Authorization': f'Bearer {api_key}'
+             #'Authorization': f'Bearer {api_key}'
          }

          async with session.get(url, headers=headers) as response:
datamule-2.3.0.dist-info/METADATA → datamule-2.3.2.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: datamule
- Version: 2.3.0
+ Version: 2.3.2
  Summary: Work with SEC submissions at scale.
  Home-page: https://github.com/john-friedman/datamule-python
  Author: John Friedman
@@ -20,4 +20,5 @@ Requires-Dist: secsgml
  Requires-Dist: websocket-client
  Requires-Dist: company-fundamentals
  Requires-Dist: flashtext
+ Requires-Dist: aioboto3
datamule-2.3.0.dist-info/RECORD → datamule-2.3.2.dist-info/RECORD RENAMED
@@ -1,4 +1,4 @@
- datamule/__init__.py,sha256=sY9rYx9z4LADjOLmwjL3BXssIzHs8MQM6gt9IWMS85U,1192
+ datamule/__init__.py,sha256=gsWTW0emwGtM-KVtwe2OICVmW7ImvLvP0SORULTPe-Y,1220
  datamule/config.py,sha256=Y--CVv7JcgrjJkMOSLrvm2S8B9ost6RMSkGviP-MKtg,883
  datamule/datasets.py,sha256=-2_5kTRS3mxlkKbXwBg8aiistYljLYRnZjDLZNhV8bk,1867
  datamule/helper.py,sha256=KqhAmTMdvATEh3I-O4xLcAcrHB9zXQERBuwzue7zyQw,3674
@@ -8,11 +8,14 @@ datamule/portfolio.py,sha256=0-E1ZSEjJ8hba7HxF8oCrRneNuF_KKISOY6K4dRg0Cg,12282
  datamule/portfolio_compression_utils.py,sha256=8OPYEN5zAdV1FiTxgVN3S7cTKs99Elv74bwgoIJP4QY,12654
  datamule/sheet.py,sha256=KD7yAgSB8BE-Z4GDuH58IV-2DJ673nMcEsrCyJbeYp8,10707
  datamule/submission.py,sha256=phHmi9ScjWHtVLjEoEdAO7RieUSKN5gPr0onfg5R8wE,16139
+ datamule/book/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ datamule/book/book.py,sha256=Vw33JHhmulNDWRN2AQpUQrf8wgVqqUYg5QJgbKhBNak,773
+ datamule/book/s3transfer.py,sha256=4Zpw5daAH05u1dppv2ARXG_VSBIdsHnlEWC9xZgBfZM,12590
  datamule/data/listed_filer_metadata.csv,sha256=dT9fQ8AC5P1-Udf_UF0ZkdXJ88jNxJb_tuhi5YYL1rc,2426827
  datamule/datamule/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datamule/datamule/datamule_lookup.py,sha256=e8djAg-ctSyHiKk7BjbtgugZ3p8roUjzsym5z3AihUg,9468
  datamule/datamule/datamule_mysql_rds.py,sha256=Q6_h24-SNECWK60RnM6UQjUIp5dhJmfn3SSKzTITB3o,12317
- datamule/datamule/downloader.py,sha256=B22ULAuYzclxxVCH4DsLWUIyFUC5Iep-Hl1W3RgCfeg,18580
+ datamule/datamule/downloader.py,sha256=Ss9mz0Jf5UAd-CZJ6oO96o9hN04xMQIF3-e1wahokdM,18581
  datamule/datamule/sec_connector.py,sha256=VwOaODpHoAWy8JIky6kLR1-orW_PB61RHw7pIGRpkow,3288
  datamule/document/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datamule/document/document.py,sha256=NrMqhY_u_X7gyvraxY0hzZEDJddqSJDgiHFzkaRTBVA,23102
@@ -58,7 +61,7 @@ datamule/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datamule/utils/construct_submissions_data.py,sha256=NB_hvfxlRXPyt4Fgc-5qA8vJRItkLhBedCSTaxwW7Jg,5887
  datamule/utils/format_accession.py,sha256=60RtqoNqoT9zSKVb1DeOv1gncJxzPTFMNW4SNOVmC_g,476
  datamule/utils/pdf.py,sha256=Z9xrdVhKex2YdvjYsaPaygRE_J6P_JNiUGkwflz2Hw0,735
- datamule-2.3.0.dist-info/METADATA,sha256=jUra4jM6LMxAS3IKnrF9urlK6ZI4ZAcl6yimnsD67pk,585
- datamule-2.3.0.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
- datamule-2.3.0.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
- datamule-2.3.0.dist-info/RECORD,,
+ datamule-2.3.2.dist-info/METADATA,sha256=Mn-oEWDcCGepxr663ugvpQibjGbDsYzlqg5CsP1Rgvs,609
+ datamule-2.3.2.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+ datamule-2.3.2.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
+ datamule-2.3.2.dist-info/RECORD,,