datamule 2.2.9.tar.gz → 2.3.2.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
This version of datamule has been flagged as potentially problematic.
- {datamule-2.2.9 → datamule-2.3.2}/PKG-INFO +2 -1
- {datamule-2.2.9 → datamule-2.3.2}/datamule/__init__.py +1 -0
- datamule-2.3.2/datamule/book/book.py +13 -0
- datamule-2.3.2/datamule/book/s3transfer.py +264 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/datamule/downloader.py +1 -1
- {datamule-2.2.9 → datamule-2.3.2}/datamule/document/document.py +50 -13
- {datamule-2.2.9 → datamule-2.3.2}/datamule/document/tables/tables.py +39 -11
- datamule-2.3.2/datamule/utils/__init__.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule.egg-info/PKG-INFO +2 -1
- {datamule-2.2.9 → datamule-2.3.2}/datamule.egg-info/SOURCES.txt +3 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule.egg-info/requires.txt +1 -0
- {datamule-2.2.9 → datamule-2.3.2}/setup.py +3 -2
- {datamule-2.2.9/datamule/datamule → datamule-2.3.2/datamule/book}/__init__.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/config.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/data/listed_filer_metadata.csv +0 -0
- {datamule-2.2.9/datamule/document → datamule-2.3.2/datamule/datamule}/__init__.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/datamule/datamule_lookup.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/datamule/datamule_mysql_rds.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/datamule/sec_connector.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/datasets.py +0 -0
- {datamule-2.2.9/datamule/document/tables → datamule-2.3.2/datamule/document}/__init__.py +0 -0
- {datamule-2.2.9/datamule/mapping_dicts → datamule-2.3.2/datamule/document/tables}/__init__.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/document/tables/tables_13fhr.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/document/tables/tables_25nse.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/document/tables/tables_informationtable.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/document/tables/tables_npx.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/document/tables/tables_ownership.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/document/tables/tables_proxyvotingrecord.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/document/tables/tables_sbsef.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/document/tables/tables_sdr.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/document/tables/utils.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/helper.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/index.py +0 -0
- {datamule-2.2.9/datamule/sec → datamule-2.3.2/datamule/mapping_dicts}/__init__.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/mapping_dicts/html_mapping_dicts.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/mapping_dicts/txt_mapping_dicts.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/mapping_dicts/xml_mapping_dicts.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/package_updater.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/portfolio.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/portfolio_compression_utils.py +0 -0
- {datamule-2.2.9/datamule/sec/infrastructure → datamule-2.3.2/datamule/sec}/__init__.py +0 -0
- {datamule-2.2.9/datamule/sec/submissions → datamule-2.3.2/datamule/sec/infrastructure}/__init__.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/sec/infrastructure/submissions_metadata.py +0 -0
- {datamule-2.2.9/datamule/sec/xbrl → datamule-2.3.2/datamule/sec/submissions}/__init__.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/sec/submissions/downloader.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/sec/submissions/eftsquery.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/sec/submissions/monitor.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/sec/submissions/streamer.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/sec/submissions/textsearch.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/sec/utils.py +0 -0
- {datamule-2.2.9/datamule/seclibrary → datamule-2.3.2/datamule/sec/xbrl}/__init__.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/sec/xbrl/downloadcompanyfacts.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/sec/xbrl/filter_xbrl.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/sec/xbrl/streamcompanyfacts.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/sec/xbrl/xbrlmonitor.py +0 -0
- {datamule-2.2.9/datamule/sentiment → datamule-2.3.2/datamule/seclibrary}/__init__.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/seclibrary/bq.py +0 -0
- {datamule-2.2.9/datamule/tags → datamule-2.3.2/datamule/sentiment}/__init__.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/sheet.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/submission.py +0 -0
- {datamule-2.2.9/datamule/utils → datamule-2.3.2/datamule/tags}/__init__.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/tags/config.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/tags/dictionaries.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/tags/regex.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/tags/utils.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/utils/construct_submissions_data.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/utils/format_accession.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule/utils/pdf.py +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule.egg-info/dependency_links.txt +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/datamule.egg-info/top_level.txt +0 -0
- {datamule-2.2.9 → datamule-2.3.2}/setup.cfg +0 -0

{datamule-2.2.9 → datamule-2.3.2}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 2.2.9
+Version: 2.3.2
 Summary: Work with SEC submissions at scale.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman
@@ -20,3 +20,4 @@ Requires-Dist: secsgml
 Requires-Dist: websocket-client
 Requires-Dist: company_fundamentals
 Requires-Dist: flashtext
+Requires-Dist: aioboto3

{datamule-2.2.9 → datamule-2.3.2}/datamule/__init__.py

@@ -8,6 +8,7 @@ from .index import Index
 from .package_updater import PackageUpdater
 from .utils.format_accession import format_accession
 from .utils.construct_submissions_data import construct_submissions_data
+from .book.book import Book


 # Keep the notebook environment setup

datamule-2.3.2/datamule/book/book.py (new file)

@@ -0,0 +1,13 @@
+from .s3transfer import s3_transfer
+
+class Book:
+    def __init__(self):
+        pass
+
+    def s3_transfer(self, datamule_bucket, s3_credentials, max_workers=4, errors_json_filename='s3_transfer_errors.json', retry_errors=3,
+                    force_daily=True, cik=None, submission_type=None, filing_date=None, datamule_api_key=None,accession=None):
+
+        s3_transfer(datamule_bucket=datamule_bucket, s3_credentials=s3_credentials, max_workers=max_workers,
+                    errors_json_filename=errors_json_filename, retry_errors=retry_errors,
+                    force_daily=force_daily, cik=cik, submission_type=submission_type,
+                    filing_date=filing_date, datamule_api_key=datamule_api_key,accession_number=accession)
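
Book.s3_transfer is a thin wrapper around the module-level s3_transfer function, so a minimal usage sketch may help. This sketch is not part of the diff: the 'filings_sgml_r2' bucket label and the credential keys are taken from s3transfer.py below, and all values shown are placeholders.

# Hypothetical usage sketch (not from the package); all values are placeholders.
from datamule import Book   # Book is exported at the package top level as of 2.3.2

s3_credentials = {
    's3_provider': 'aws',                    # only 'aws' is handled in this release
    'aws_access_key_id': 'YOUR_KEY_ID',
    'aws_secret_access_key': 'YOUR_SECRET',
    'region_name': 'us-east-1',
    'bucket_name': 'my-destination-bucket',
}

Book().s3_transfer(
    datamule_bucket='filings_sgml_r2',       # the only bucket label recognized by s3_transfer
    s3_credentials=s3_credentials,
    submission_type='10-K',
    filing_date=('2024-01-02', '2024-01-03'),  # a tuple is treated as a (start, end) date range
    datamule_api_key='YOUR_DATAMULE_API_KEY',
)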

datamule-2.3.2/datamule/book/s3transfer.py (new file)

@@ -0,0 +1,264 @@
+import asyncio
+import aiohttp
+import aioboto3
+import ssl
+import time
+import json
+from datetime import datetime, timedelta
+from urllib.parse import urlparse
+from tqdm import tqdm
+import logging
+from ..sheet import Sheet
+from ..utils.format_accession import format_accession
+
+# Set up logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+
+def generate_date_range(start_date_str, end_date_str):
+    start_date = datetime.strptime(start_date_str, '%Y-%m-%d')
+    end_date = datetime.strptime(end_date_str, '%Y-%m-%d')
+
+    dates = []
+    current_date = start_date
+
+    while current_date <= end_date:
+        dates.append(current_date.strftime('%Y-%m-%d'))
+        current_date += timedelta(days=1)
+
+    return dates
+
+
+def get_filings_sgml_r2_urls(submission_type=None, cik=None, datamule_api_key=None, filing_date=None,accession_number=None):
+    datamule_bucket_endpoint = 'https://sec-library.datamule.xyz/'
+    sheet = Sheet('s3transfer')
+    submissions = sheet.get_submissions(distinct=True, quiet=False, api_key=datamule_api_key,
+                                        submission_type=submission_type, cik=cik, columns=['accessionNumber'], filing_date=filing_date,
+                                        accession_number=accession_number)
+
+    accessions = [format_accession(sub['accessionNumber'], 'no-dash') for sub in submissions]
+
+    urls = [f"{datamule_bucket_endpoint}{accession}.sgml" for accession in accessions]
+
+    return urls
+
+
+class AsyncS3Transfer:
+    def __init__(self, s3_credentials, max_workers=100, chunk_size=2*1024*1024):
+        self.s3_credentials = s3_credentials
+        self.max_workers = max_workers
+        self.chunk_size = chunk_size
+
+    async def __aenter__(self):
+        # Create aiohttp session with optimized connector
+        connector = aiohttp.TCPConnector(
+            limit=self.max_workers,
+            force_close=False,
+            ssl=ssl.create_default_context(),
+            ttl_dns_cache=300,
+            keepalive_timeout=60
+        )
+
+        self.session = aiohttp.ClientSession(
+            connector=connector,
+            timeout=aiohttp.ClientTimeout(total=600),
+            headers={
+                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+                'Connection': 'keep-alive',
+                'Accept-Encoding': 'gzip, deflate, br'
+            }
+        )
+
+        # Create async boto3 client
+        if self.s3_credentials['s3_provider'] == 'aws':
+            session = aioboto3.Session()
+            self.s3_client = await session.client(
+                's3',
+                aws_access_key_id=self.s3_credentials['aws_access_key_id'],
+                aws_secret_access_key=self.s3_credentials['aws_secret_access_key'],
+                region_name=self.s3_credentials['region_name']
+            ).__aenter__()
+        else:
+            raise ValueError("S3 Provider not supported yet. Please use another provider or email johnfriedman@datamule.xyz to add support.")
+
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        if hasattr(self, 'session') and self.session:
+            await self.session.close()
+        if hasattr(self, 's3_client') and self.s3_client:
+            await self.s3_client.__aexit__(exc_type, exc_val, exc_tb)
+
+    async def transfer_single_file(self, semaphore, url, retry_errors=3):
+        """Transfer a single file with retry logic and preserve metadata"""
+        async with semaphore:
+            filename = urlparse(url).path.split('/')[-1]
+            s3_key = filename
+            bucket_name = self.s3_credentials['bucket_name']
+
+            last_error = None
+
+            for attempt in range(retry_errors + 1):
+                try:
+                    async with self.session.get(url) as response:
+                        if response.status == 200:
+                            # Capture source metadata from response headers
+                            content_length = response.headers.get('Content-Length')
+                            size_bytes = int(content_length) if content_length else 0
+                            content_type = response.headers.get('Content-Type', 'application/octet-stream')
+                            last_modified = response.headers.get('Last-Modified')
+
+                            # Read response content
+                            content = await response.read()
+
+                            # Prepare S3 upload parameters with preserved metadata
+                            upload_params = {
+                                'Bucket': bucket_name,
+                                'Key': s3_key,
+                                'Body': content,
+                                'ContentType': content_type,
+                                'StorageClass': 'STANDARD',
+                                'Metadata': {
+                                    'source-url': url,
+                                    'original-size': str(size_bytes),
+                                    'transfer-date': datetime.utcnow().isoformat()
+                                }
+                            }
+
+                            # Add last modified if available
+                            if last_modified:
+                                upload_params['Metadata']['original-last-modified'] = last_modified
+
+                            # Upload to S3 with metadata
+                            await self.s3_client.put_object(**upload_params)
+
+                            return {
+                                'success': True,
+                                'url': url,
+                                'message': f"Copied: {url} -> s3://{bucket_name}/{s3_key}",
+                                'size_bytes': size_bytes,
+                                's3_key': s3_key,
+                                'content_type': content_type,
+                                'last_modified': last_modified
+                            }
+                        else:
+                            raise aiohttp.ClientResponseError(
+                                request_info=response.request_info,
+                                history=response.history,
+                                status=response.status
+                            )
+
+                except Exception as e:
+                    print(e)
+                    last_error = e
+                    if attempt < retry_errors:
+                        await asyncio.sleep(2 ** attempt)  # Exponential backoff
+
+            # All attempts failed
+            return {
+                'success': False,
+                'url': url,
+                'error': str(last_error),
+                'message': f"Failed to copy {url} after {retry_errors + 1} attempts: {last_error}",
+                'size_bytes': 0
+            }
+
+    async def transfer_batch(self, urls, retry_errors=3):
+        """Transfer multiple files concurrently"""
+        semaphore = asyncio.Semaphore(self.max_workers)
+        failed_files = []
+        total_bytes = 0
+        start_time = time.time()
+
+        # Create tasks for all transfers
+        tasks = [
+            self.transfer_single_file(semaphore, url, retry_errors)
+            for url in urls
+        ]
+
+        # Process with progress bar
+        with tqdm(total=len(urls), desc="Transferring files", unit="file") as pbar:
+            for coro in asyncio.as_completed(tasks):
+                result = await coro
+
+                if result['success']:
+                    total_bytes += result.get('size_bytes', 0)
+                else:
+                    failed_files.append(result)
+
+                # Update progress bar with total GB transferred
+                total_gb = total_bytes / (1024 ** 3)
+                pbar.set_postfix({'Total': f'{total_gb:.2f} GB'})
+
+                pbar.update(1)
+
+        return failed_files, total_bytes
+
+
+async def async_transfer_cached_urls_to_s3(urls, s3_credentials, max_workers=4,
+                                           errors_json_filename='s3_transfer_errors.json',
+                                           retry_errors=3):
+    """Async version of transfer_cached_urls_to_s3"""
+    failed_files = []
+    total_bytes = 0
+
+    async with AsyncS3Transfer(s3_credentials, max_workers) as transfer:
+        failed_files, total_bytes = await transfer.transfer_batch(urls, retry_errors)
+
+    # Save errors to JSON if filename provided and there are errors
+    if errors_json_filename and failed_files:
+        with open(errors_json_filename, 'w') as f:
+            json.dump(failed_files, f, indent=2)
+        print(f"Saved {len(failed_files)} errors to {errors_json_filename}")
+
+    print(f"Transfer complete: {len(urls) - len(failed_files)}/{len(urls)} files successful")
+
+
+def transfer_cached_urls_to_s3(urls, s3_credentials, max_workers=4, errors_json_filename='s3_transfer_errors.json', retry_errors=3):
+    """Wrapper to run async transfer in sync context"""
+    asyncio.run(async_transfer_cached_urls_to_s3(urls, s3_credentials, max_workers, errors_json_filename, retry_errors))
+
+
+def s3_transfer(datamule_bucket, s3_credentials, max_workers=4, errors_json_filename='s3_transfer_errors.json', retry_errors=3,
+                force_daily=True, cik=None, submission_type=None, filing_date=None, datamule_api_key=None,accession_number=None):
+
+    if datamule_bucket == 'filings_sgml_r2':
+
+
+        if accession_number is not None:
+            if any(param is not None for param in [cik, submission_type, filing_date]):
+                raise ValueError('If accession is provided, then cik, type, and date must be None')
+            urls = get_filings_sgml_r2_urls(datamule_api_key=datamule_api_key,accession_number=accession_number)
+            transfer_cached_urls_to_s3(urls=urls, s3_credentials=s3_credentials, max_workers=max_workers, errors_json_filename=errors_json_filename, retry_errors=retry_errors)
+        else:
+            if not force_daily:
+                urls = get_filings_sgml_r2_urls(submission_type=submission_type, cik=cik, datamule_api_key=datamule_api_key,
+                                                filing_date=filing_date)
+                transfer_cached_urls_to_s3(urls=urls, s3_credentials=s3_credentials, max_workers=max_workers, errors_json_filename=errors_json_filename, retry_errors=retry_errors)
+            else:
+                if isinstance(filing_date, str):
+                    urls = get_filings_sgml_r2_urls(submission_type=submission_type, cik=cik, datamule_api_key=datamule_api_key,
+                                                    filing_date=filing_date)
+                    transfer_cached_urls_to_s3(urls=urls, s3_credentials=s3_credentials, max_workers=max_workers, errors_json_filename=errors_json_filename, retry_errors=retry_errors)
+                elif isinstance(filing_date, list):
+                    for date in filing_date:
+                        print(f"Transferring {date}")
+                        urls = get_filings_sgml_r2_urls(submission_type=submission_type, cik=cik, datamule_api_key=datamule_api_key,
+                                                        filing_date=date)
+                        transfer_cached_urls_to_s3(urls=urls, s3_credentials=s3_credentials, max_workers=max_workers, errors_json_filename=errors_json_filename, retry_errors=retry_errors)
+                elif isinstance(filing_date, tuple):
+                    dates = generate_date_range(filing_date[0], filing_date[1])
+                    for date in dates:
+                        print(f"Transferring {date}")
+                        urls = get_filings_sgml_r2_urls(submission_type=submission_type, cik=cik, datamule_api_key=datamule_api_key,
+                                                        filing_date=date)
+                        transfer_cached_urls_to_s3(urls=urls, s3_credentials=s3_credentials, max_workers=max_workers, errors_json_filename=errors_json_filename, retry_errors=retry_errors)
+                else:
+                    raise ValueError('filing_date can only be string, list, or (startdt,enddt)')

+    else:
+        raise ValueError('Datamule S3 bucket not found.')
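
The force_daily branch of s3_transfer above dispatches on the type of filing_date: a string is a single day, a list is transferred date by date, and a (start, end) tuple is expanded with generate_date_range before the per-day transfers. A tiny sketch of that expansion (dates are placeholders):

# Sketch: how a (start, end) tuple is expanded before the per-day transfers.
from datamule.book.s3transfer import generate_date_range

print(generate_date_range('2024-01-02', '2024-01-05'))
# ['2024-01-02', '2024-01-03', '2024-01-04', '2024-01-05']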

{datamule-2.2.9 → datamule-2.3.2}/datamule/datamule/downloader.py

@@ -228,7 +228,7 @@ class Downloader:
         headers = {
             'Connection': 'keep-alive',
             'Accept-Encoding': 'gzip, deflate, br',
-            'Authorization': f'Bearer {api_key}'
+            #'Authorization': f'Bearer {api_key}'
         }

         async with session.get(url, headers=headers) as response:

{datamule-2.2.9 → datamule-2.3.2}/datamule/document/document.py

@@ -7,8 +7,6 @@ from doc2dict import html2dict, visualize_dict, get_title, unnest_dict, pdf2dict
 from ..mapping_dicts.txt_mapping_dicts import dict_10k, dict_10q, dict_8k, dict_13d, dict_13g
 from ..mapping_dicts.xml_mapping_dicts import dict_345
 from ..mapping_dicts.html_mapping_dicts import *
-from selectolax.parser import HTMLParser
-
 from pathlib import Path
 import webbrowser
 from secsgml.utils import bytes_to_str
@@ -294,7 +292,6 @@ class Document:
             return bool(re.search(pattern, self.content))
         return False

-    # Note: this method will be heavily modified in the future
     def parse(self):
         # check if we have already parsed the content
         if self._data:
@@ -384,6 +381,8 @@ class Document:
             dct = html2dict(content=self.content, mapping_dict=mapping_dict)
         elif self.extension in ['.txt']:
             dct = txt2dict(content=self.content, mapping_dict=mapping_dict)
+        elif self.extension == '.pdf':
+            dct = pdf2dict(content=self.content, mapping_dict=mapping_dict)
         else:
             dct = {}

@@ -391,10 +390,8 @@ class Document:
         elif self.extension == '.xml':
             if self.type in ['3', '4', '5', '3/A', '4/A', '5/A']:
                 mapping_dict = dict_345
-
             self._data = xml2dict(content=self.content, mapping_dict=mapping_dict)
-
-            self._data = pdf2dict(content=self.content, mapping_dict=mapping_dict)
+
         else:
             pass

@@ -409,6 +406,12 @@ class Document:

             if not isinstance(self._data, DataWithTags):
                 self._data = DataWithTags(self._data, self)
+        elif self.extension == '.xml':
+            if self._data is None:
+                self.parse()
+
+            if self._data is None:
+                self._data = {}

         return self._data

@@ -444,19 +447,46 @@ class Document:
             json.dump(self.data, f, indent=2)

     def parse_tables(self,must_exist_in_mapping=True):
-
-
+        """Must exist in mapping means columns must occur in mapping schema."""
+        if self.extension == '.xml':
+            tables = Tables(document_type = self.type, accession=self.accession)
+            tables.parse_tables(data=self.data,must_exist_in_mapping=must_exist_in_mapping)
+            self._tables = tables
+
+        elif self._data_bool:
+            tables = Tables(document_type = self.type, accession=self.accession)
+            data_tuples = self.data_tuples
+
+            for i, (id, type, content, level) in enumerate(data_tuples):
+                if type == "table" and i > 0:
+                    description = None
+
+                    # Look at previous element
+                    prev_id, prev_type, prev_content, prev_level = data_tuples[i-1]
+
+                    # Case 1: Same level + text content
+                    if prev_level == level and prev_type in ["text", "textsmall"]:
+                        description = prev_content
+
+                    # Case 2: Higher level (lower number) + title
+                    elif prev_level < level and prev_type == "title":
+                        description = prev_content
+
+                    # Case 3: No matching description - add table without description
+                    # (description remains None)
+
+                    tables.add_table(data=content, description=description, name="extracted_table")
+
+            self._tables = tables
+
         else:
-
-            data = self.data
-            tables = Tables(document_type = self.type, accession=self.accession, data=data,must_exist_in_mapping=must_exist_in_mapping)
-            self._tables = tables.tables
+            self._tables = []

     @property
     def tables(self):
         if self._tables is None:
             self.parse_tables()
-        return self._tables
+        return self._tables.tables


     def write_csv(self, output_folder):
@@ -547,6 +577,7 @@ class Document:
             webbrowser.open('file://' + temp_path)
         else:
             print(f"Cannot open files with extension {self.extension}")
+
     def get_section(self, title=None, title_regex=None,title_class=None, format='dict'):
         if self._data_bool:
             if not self.data:
@@ -557,3 +588,9 @@ class Document:
                 return [item[1] for item in result]
             else:
                 return [flatten_dict(item[1],format) for item in result]
+
+    # TODO
+    def get_tables(self,description_regex=None,name=None):
+        # make sure tables is initialized
+        self.tables
+        return self._tables.get_tables(description_regex=description_regex, name=name)
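
The table plumbing on Document changes shape here: parse_tables now builds a Tables container, the tables property returns that container's list, and a new get_tables method filters by the per-table description. A hedged sketch of the resulting call pattern (doc stands for an already-loaded Document and the regex is a placeholder):

# Sketch only: table access after the parse_tables/get_tables rework; doc is a placeholder Document.
doc.parse_tables()                           # builds a Tables container internally
all_tables = doc.tables                      # property now returns the container's list of Table objects
matches = doc.get_tables(description_regex=r'(?i)revenue')   # filter on the new description attribute
for table in matches:
    print(table)                             # Table.__str__ (added in tables.py below) renders name, description, and rows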

{datamule-2.2.9 → datamule-2.3.2}/datamule/document/tables/tables.py

@@ -6,8 +6,10 @@ from .tables_npx import config_npx
 from .tables_sbsef import config_sbsef
 from .tables_sdr import config_sdr
 from .tables_proxyvotingrecord import config_proxyvotingrecord
+from doc2dict.utils.format_dict import _format_table

 from .utils import safe_get, flatten_dict
+import re
 # will add filing date param later? or extension
 all_tables_dict = {
     '3' : config_ownership,
@@ -93,25 +95,30 @@ def apply_mapping(flattened_data, mapping_dict, accession, must_exist_in_mapping

 # should have table type, accession, data
 class Table:
-    def __init__(self,data,name,accession):
+    def __init__(self,data,name,accession,description = None):
         self.data = data
         self.name = name
         self.accession = accession
+        self.description = description
+
+    # TODO MADE IN A HURRY #
+    def __str__(self):
+        formatted_table = _format_table(self.data)
+        if isinstance(formatted_table, list):
+            table_str = '\n'.join(formatted_table)
+        else:
+            table_str = str(formatted_table)
+        return f"Table '{self.name}' ({self.accession}) - {len(self.data) if isinstance(self.data, list) else 'N/A'} rows\ndescription: {self.description if self.description else ''}\n{table_str}"


 class Tables():
-    def __init__(self,document_type,accession
+    def __init__(self,document_type,accession):
         self.document_type = document_type
         self.accession = accession
-        self.data = data
-
-        # to fill in
         self.tables = []

-
-
-    def parse_tables(self,must_exist_in_mapping=True):
-        # first select dict
+    def parse_tables(self,data,must_exist_in_mapping=True):
+        self.data = data

         try:
             tables_dict = all_tables_dict[self.document_type]
@@ -120,11 +127,32 @@ class Tables():

         # now get the dicts from the data
         data_dicts = seperate_data(tables_dict,self.data)
-
+
         # now flatten
         data_dicts = [(x,flatten_dict(y)) for x,y in data_dicts]

         for table_name, flattened_data in data_dicts:
             mapping_dict = tables_dict[table_name]['mapping']
             mapped_data = apply_mapping(flattened_data, mapping_dict, self.accession,must_exist_in_mapping)
-            self.tables.append(Table(mapped_data, table_name, self.accession))
+            self.tables.append(Table(mapped_data, table_name, self.accession))
+
+    def add_table(self,data,name,description=None):
+        self.tables.append(Table(data=data,name=name,accession=self.accession,description=description))
+
+    def get_tables(self, description_regex=None, name=None):
+        matching_tables = []
+
+        for table in self.tables:
+            # Check name match (exact match)
+            if name is not None:
+                if table.name == name:
+                    matching_tables.append(table)
+                    continue
+
+            # Check description regex match
+            if description_regex is not None and table.description is not None:
+                if re.search(description_regex, table.description):
+                    matching_tables.append(table)
+                    continue
+
+        return matching_tables
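
To make the new matching rules concrete: get_tables returns tables whose name matches exactly or whose description matches re.search, and tables with no description are skipped by the regex branch. A small standalone sketch; the document type, accession, and row data are invented for illustration:

# Standalone sketch of the new Tables API; all values below are invented.
from datamule.document.tables.tables import Tables

tables = Tables(document_type='10-K', accession='0000000000-24-000001')
tables.add_table(data=[['Revenue', '100']], name='extracted_table', description='Consolidated revenue')
tables.add_table(data=[['Assets', '500']], name='extracted_table')   # no description attached

tables.get_tables(name='extracted_table')          # exact name match: returns both tables
tables.get_tables(description_regex='revenue')     # re.search on description: returns only the first table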

{datamule-2.2.9 → datamule-2.3.2}/datamule.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 2.2.9
+Version: 2.3.2
 Summary: Work with SEC submissions at scale.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman
@@ -20,3 +20,4 @@ Requires-Dist: secsgml
 Requires-Dist: websocket-client
 Requires-Dist: company_fundamentals
 Requires-Dist: flashtext
+Requires-Dist: aioboto3

{datamule-2.2.9 → datamule-2.3.2}/datamule.egg-info/SOURCES.txt

@@ -14,6 +14,9 @@ datamule.egg-info/SOURCES.txt
 datamule.egg-info/dependency_links.txt
 datamule.egg-info/requires.txt
 datamule.egg-info/top_level.txt
+datamule/book/__init__.py
+datamule/book/book.py
+datamule/book/s3transfer.py
 datamule/data/listed_filer_metadata.csv
 datamule/datamule/__init__.py
 datamule/datamule/datamule_lookup.py

{datamule-2.2.9 → datamule-2.3.2}/setup.py

@@ -32,7 +32,7 @@ if not os.path.exists(file_path):
 setup(
     name="datamule",
     author="John Friedman",
-    version="2.2.9",
+    version="2.3.2",
     description="Work with SEC submissions at scale.",
     packages=find_packages(include=['datamule', 'datamule.*']),
     url="https://github.com/john-friedman/datamule-python",
@@ -52,7 +52,8 @@ setup(
         'secsgml',
         'websocket-client',
         'company_fundamentals',
-        'flashtext'
+        'flashtext',
+        'aioboto3'
     ],
     # Include the data directory in the package
     package_data={