datamule 1.0.8__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamule/book/__init__.py +0 -0
- datamule/book/book.py +34 -0
- datamule/mapping_dicts/__init__.py +0 -0
- datamule/mapping_dicts/txt_mapping_dicts.py +234 -0
- datamule/mapping_dicts/xml_mapping_dicts.py +19 -0
- datamule/sec/__init__.py +0 -0
- datamule/sec/infrastructure/__init__.py +0 -0
- datamule/sec/infrastructure/submissions_metadata.py +407 -0
- datamule/sec/rss/__init__.py +0 -0
- datamule/sec/rss/monitor.py +416 -0
- datamule/sec/submissions/__init__.py +0 -0
- datamule/sec/submissions/downloader.py +70 -0
- datamule/sec/submissions/eftsquery.py +502 -0
- datamule/sec/submissions/monitor.py +126 -0
- datamule/sec/submissions/streamer.py +228 -0
- datamule/sec/submissions/textsearch.py +122 -0
- datamule/sec/utils.py +64 -0
- datamule/sec/xbrl/__init__.py +0 -0
- datamule/sec/xbrl/downloadcompanyfacts.py +83 -0
- datamule/sec/xbrl/filter_xbrl.py +39 -0
- datamule/sec/xbrl/streamcompanyfacts.py +93 -0
- datamule/sec/xbrl/xbrlmonitor.py +143 -0
- datamule/seclibrary/__init__.py +0 -0
- datamule/seclibrary/downloader.py +286 -0
- datamule/seclibrary/query.py +181 -0
- {datamule-1.0.8.dist-info → datamule-1.1.0.dist-info}/METADATA +1 -1
- datamule-1.1.0.dist-info/RECORD +35 -0
- datamule-1.0.8.dist-info/RECORD +0 -10
- {datamule-1.0.8.dist-info → datamule-1.1.0.dist-info}/WHEEL +0 -0
- {datamule-1.0.8.dist-info → datamule-1.1.0.dist-info}/top_level.txt +0 -0
datamule/sec/infrastructure/submissions_metadata.py
@@ -0,0 +1,407 @@
import zipfile
import os
import json
import csv
import gzip
import asyncio
import aiohttp
import tempfile
from tqdm import tqdm
from datetime import datetime
from ..utils import headers

async def download_sec_file(url, target_path):
    """Download submissions.zip from SEC website with progress bar."""

    async with aiohttp.ClientSession() as session:
        async with session.get(url, headers=headers) as response:
            if response.status != 200:
                raise Exception(f"Failed to download: HTTP {response.status}")

            file_size = int(response.headers.get('Content-Length', 0))

            with tqdm(total=file_size, unit='B', unit_scale=True, desc="Downloading SEC data") as progress_bar:
                with open(target_path, 'wb') as f:
                    chunk_size = 1024 * 1024  # 1MB chunks
                    downloaded = 0

                    async for chunk in response.content.iter_chunked(chunk_size):
                        f.write(chunk)
                        downloaded += len(chunk)
                        progress_bar.update(len(chunk))

    print(f"Download complete: {target_path}")
    return target_path

def extract_metadata(data):
    """Extract and flatten relevant company metadata from SEC submission data."""
    result = {}

    # Extract top-level fields, but exclude formerNames as it will be processed separately
    for key in ['cik', 'entityType', 'sic', 'sicDescription', 'ownerOrg',
                'insiderTransactionForOwnerExists', 'insiderTransactionForIssuerExists',
                'name', 'tickers', 'exchanges', 'ein', 'description', 'category', 'fiscalYearEnd',
                'stateOfIncorporation', 'stateOfIncorporationDescription', 'phone', 'flags']:
        result[key] = data.get(key)

    # Extract address fields
    if 'addresses' in data:
        for addr_type in ['mailing', 'business']:
            if addr_type in data['addresses']:
                addr = data['addresses'][addr_type]
                for field in ['street1', 'street2', 'city', 'stateOrCountry', 'zipCode', 'stateOrCountryDescription']:
                    result[f"{addr_type}_{field}"] = addr.get(field)

    # Add start_date field (will be populated later)
    result['start_date'] = ''

    return result

def extract_earliest_filing_date(data):
    """Extract the earliest filing date from the full JSON data."""
    earliest_date = None

    # Try to get dates from the filings.files array first
    if 'filings' in data and 'files' in data['filings'] and isinstance(data['filings']['files'], list):
        for file_info in data['filings']['files']:
            if isinstance(file_info, dict) and 'filingFrom' in file_info:
                file_date = file_info.get('filingFrom', '')
                if file_date and (earliest_date is None or file_date < earliest_date):
                    earliest_date = file_date

    # If no date found in files array, check filingDate array in filings.recent
    if earliest_date is None and 'filings' in data and 'recent' in data['filings']:
        if 'filingDate' in data['filings']['recent'] and isinstance(data['filings']['recent']['filingDate'], list):
            filing_dates = data['filings']['recent']['filingDate']
            for filing_date in filing_dates:
                if filing_date and (earliest_date is None or filing_date < earliest_date):
                    earliest_date = filing_date

    return earliest_date

def process_former_names(data, cik, current_name):
    """
    Process former names into a list of records.
    Returns former names records and the earliest company date.
    """
    former_names_records = []
    earliest_company_date = None

    # Process former names if present
    former_names = data.get('formerNames', [])

    # Track the latest end date to use for current name start date
    latest_end_date = None

    if former_names and isinstance(former_names, list):
        for former_name in former_names:
            if isinstance(former_name, dict):
                # Extract name, start date, and end date
                name = former_name.get('name', '')
                start_date = former_name.get('from', '')
                end_date = former_name.get('to', '')

                # Clean up date formats (remove time component)
                if start_date:
                    start_date = start_date.split('T')[0]
                    # Track earliest company date across all former names
                    if earliest_company_date is None or start_date < earliest_company_date:
                        earliest_company_date = start_date

                if end_date:
                    end_date = end_date.split('T')[0]
                    # Track latest end date
                    if not latest_end_date or end_date > latest_end_date:
                        latest_end_date = end_date

                # Create record for former name
                record = {
                    'name': name,
                    'start_date': start_date,
                    'end_date': end_date,
                    'cik': cik
                }

                former_names_records.append(record)

    # Find the earliest filing date for the company if no date found in former names
    if earliest_company_date is None:
        earliest_company_date = extract_earliest_filing_date(data)
        if earliest_company_date and 'T' in earliest_company_date:
            earliest_company_date = earliest_company_date.split('T')[0]

    # For the current name, if we don't have a start date from former names,
    # we'll use the earliest filing date
    if not latest_end_date:
        latest_end_date = earliest_company_date

    # Add current name record with start date as latest end date
    current_record = {
        'name': current_name,
        'start_date': latest_end_date if latest_end_date else '',
        'end_date': '',  # Current name has no end date
        'cik': cik
    }

    former_names_records.append(current_record)

    # Return both the records and the earliest company date (for metadata)
    return former_names_records, earliest_company_date

def write_metadata_to_csv(metadata_list, output_path):
    """Write metadata records to CSV and compress with gzip."""
    if not metadata_list:
        return

    # Add .gz extension if not already present
    if not output_path.endswith('.gz'):
        output_path = output_path + '.gz'

    # Get all possible field names across all records
    fieldnames = set()
    for metadata in metadata_list:
        fieldnames.update(metadata.keys())

    # Make sure 'name', 'cik', and 'start_date' come first
    fieldnames = ['name', 'cik', 'start_date'] + [f for f in sorted(fieldnames) if f not in ['name', 'cik', 'start_date']]

    # Write directly to gzipped CSV without using StringIO buffer
    with gzip.open(output_path, 'wt', encoding='utf-8', newline='') as gzfile:
        writer = csv.DictWriter(gzfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(metadata_list)

    print(f"Wrote {len(metadata_list)} records to {output_path}")

def write_names_to_csv(names_list, output_path):
    """Write name records to CSV and compress with gzip."""
    if not names_list:
        return

    # Add .gz extension if not already present
    if not output_path.endswith('.gz'):
        output_path = output_path + '.gz'

    # Names CSV has fixed columns
    fieldnames = ['name', 'start_date', 'end_date', 'cik']

    # Write directly to gzipped CSV
    with gzip.open(output_path, 'wt', encoding='utf-8', newline='') as gzfile:
        writer = csv.DictWriter(gzfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(names_list)

    print(f"Wrote {len(names_list)} records to {output_path}")

async def extract_and_process_metadata(output_dir, local_zip_path=None, sec_url="https://www.sec.gov/Archives/edgar/daily-index/bulkdata/submissions.zip", max_bytes=2000):
    """
    Extracts metadata from JSON files in a ZIP archive and writes to multiple CSV files.
    Can download the ZIP file from SEC to a temporary location if local_zip_path not provided.

    Args:
        output_dir (str): Directory for output CSV files
        local_zip_path (str, optional): Path to a local ZIP file. If None, downloads from SEC to temp
        sec_url (str): URL to download the SEC submissions ZIP file
        max_bytes (int): Maximum number of bytes to extract from each file

    Returns:
        dict: Statistics about processed files
    """
    # Ensure output directory exists
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Initialize collections for different types of data
    listed_metadata = []
    unlisted_metadata = []
    listed_names = []
    unlisted_names = []

    stats = {
        'total_processed': 0,
        'listed_companies': 0,
        'unlisted_companies': 0,
        'full_content_reads': 0
    }

    # Use provided ZIP file or download to temporary file
    if local_zip_path:
        # Use local file
        print(f"Using local ZIP file: {local_zip_path}")
        zip_path = local_zip_path
        temp_file = None
    else:
        # Download to temporary file
        print(f"Downloading from SEC to temporary file: {sec_url}")
        temp_file = tempfile.NamedTemporaryFile(suffix='.zip', delete=False)
        temp_file.close()  # Close the file so we can download to it
        zip_path = temp_file.name

        try:
            await download_sec_file(sec_url, zip_path)
        except Exception as e:
            # Clean up temp file if download fails
            if os.path.exists(zip_path):
                os.unlink(zip_path)
            raise e

    try:
        # Process the ZIP file
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            # Get list of files (excluding directories)
            files = [f for f in zip_ref.infolist() if not f.is_dir()]
            # Remove files that contain "submission" in their name
            files = [f for f in files if "submission" not in f.filename]
            # Remove placeholder.txt
            files = [f for f in files if "placeholder.txt" not in f.filename]

            # Create a progress bar
            with tqdm(total=len(files), desc="Extracting metadata", unit="file") as pbar:
                # Loop through all files in the ZIP archive
                for file_info in files:
                    try:
                        # Initially read just a small chunk of the file
                        with zip_ref.open(file_info.filename, 'r') as file:
                            partial_content_bytes = file.read(max_bytes)

                        # Convert to string
                        partial_content = partial_content_bytes.decode('utf-8', errors='replace')

                        # Truncate at "filings" and complete the JSON for initial parsing
                        filings_index = partial_content.find('"filings":')
                        if filings_index != -1:
                            # Get content up to "filings"
                            truncated_content = partial_content[:filings_index]
                            # Remove trailing comma if present
                            truncated_content = truncated_content.rstrip().rstrip(',')
                            # Add closing brace
                            partial_content = truncated_content + '}'
                        else:
                            # If "filings" not found, try to make valid JSON by adding closing brace
                            partial_content = partial_content.rstrip().rstrip(',') + '}'

                        try:
                            # Parse the partial JSON to check for former names
                            partial_json_data = json.loads(partial_content)

                            # Check if we need full content (no former names or former names is empty list)
                            former_names = partial_json_data.get('formerNames', [])
                            need_full_content = not former_names or len(former_names) == 0

                            # Initialize json_data with the partial data
                            json_data = partial_json_data

                            # If we need more data for filing dates, read the full file
                            if need_full_content:
                                stats['full_content_reads'] += 1

                                # Read the entire file content
                                with zip_ref.open(file_info.filename, 'r') as full_file:
                                    full_content_bytes = full_file.read()
                                    full_content = full_content_bytes.decode('utf-8', errors='replace')

                                try:
                                    # Parse the full JSON
                                    json_data = json.loads(full_content)
                                except json.JSONDecodeError:
                                    # If full content can't be parsed, stick with partial data
                                    print(f"Warning: Could not parse full content of {file_info.filename}, using partial data")

                            # Extract metadata (without former names)
                            metadata = extract_metadata(json_data)

                            # Get CIK and name for former names processing
                            cik = metadata.get('cik', '')
                            name = metadata.get('name', '')

                            # Process former names with the full json_data
                            # Now also returning the earliest company date
                            former_names_records, earliest_company_date = process_former_names(json_data, cik, name)

                            # Add the earliest company date to the metadata
                            metadata['start_date'] = earliest_company_date if earliest_company_date else ''

                            # Check if company is listed (has tickers)
                            tickers = metadata.get('tickers', [])
                            is_listed = tickers and isinstance(tickers, list) and len(tickers) > 0

                            # Add to appropriate collections
                            if is_listed:
                                listed_metadata.append(metadata)
                                listed_names.extend(former_names_records)
                                stats['listed_companies'] += 1
                            else:
                                unlisted_metadata.append(metadata)
                                unlisted_names.extend(former_names_records)
                                stats['unlisted_companies'] += 1

                            stats['total_processed'] += 1

                        except json.JSONDecodeError as je:
                            print(f"JSON parsing error in {file_info.filename}: {str(je)}")

                    except Exception as e:
                        # Handle any errors
                        print(f"Error processing {file_info.filename}: {str(e)}")

                    # Update the progress bar
                    pbar.update(1)

    finally:
        # Clean up temporary file if we created one
        if temp_file and os.path.exists(zip_path):
            print(f"Removing temporary file: {zip_path}")
            os.unlink(zip_path)

    # Define output file paths (without .gz extension, it will be added in the write functions)
    listed_metadata_path = os.path.join(output_dir, "listed_filer_metadata.csv")
    unlisted_metadata_path = os.path.join(output_dir, "unlisted_filer_metadata.csv")
    listed_names_path = os.path.join(output_dir, "listed_filer_names.csv")
    unlisted_names_path = os.path.join(output_dir, "unlisted_filer_names.csv")

    # Write listed metadata to CSV
    if listed_metadata:
        write_metadata_to_csv(listed_metadata, listed_metadata_path)

    # Write unlisted metadata to CSV
    if unlisted_metadata:
        write_metadata_to_csv(unlisted_metadata, unlisted_metadata_path)

    # Write listed names to CSV
    if listed_names:
        write_names_to_csv(listed_names, listed_names_path)

    # Write unlisted names to CSV
    if unlisted_names:
        write_names_to_csv(unlisted_names, unlisted_names_path)

    # Print summary
    print(f"\nTotal files processed: {stats['total_processed']}")
    print(f"Listed companies found: {stats['listed_companies']}")
    print(f"Unlisted companies found: {stats['unlisted_companies']}")
    print(f"Files requiring full content read: {stats['full_content_reads']}")
    print(f"Output files written to {output_dir}")

    return stats

# Convenience function to run the extractor
def process_submissions_metadata(output_dir, local_zip_path=None, sec_url="https://www.sec.gov/Archives/edgar/daily-index/bulkdata/submissions.zip", max_bytes=2000):
    """
    Convenience function to run the SEC Metadata Extractor.

    Args:
        output_dir (str): Directory for output CSV files
        local_zip_path (str, optional): Path to a local ZIP file. If None, downloads from SEC to temp
        sec_url (str): URL to download the SEC submissions ZIP file
        max_bytes (int): Maximum number of bytes to extract from each file

    Returns:
        dict: Statistics about processed files
    """
    return asyncio.run(extract_and_process_metadata(
        output_dir=output_dir,
        local_zip_path=local_zip_path,
        sec_url=sec_url,
        max_bytes=max_bytes
    ))