datamule 1.0.8__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,407 @@
+ import zipfile
+ import os
+ import json
+ import csv
+ import gzip
+ import asyncio
+ import aiohttp
+ import tempfile
+ from tqdm import tqdm
+ from datetime import datetime
+ from ..utils import headers
+
+ async def download_sec_file(url, target_path):
+     """Download submissions.zip from SEC website with progress bar."""
+
+     async with aiohttp.ClientSession() as session:
+         async with session.get(url, headers=headers) as response:
+             if response.status != 200:
+                 raise Exception(f"Failed to download: HTTP {response.status}")
+
+             file_size = int(response.headers.get('Content-Length', 0))
+
+             with tqdm(total=file_size, unit='B', unit_scale=True, desc="Downloading SEC data") as progress_bar:
+                 with open(target_path, 'wb') as f:
+                     chunk_size = 1024 * 1024  # 1MB chunks
+                     downloaded = 0
+
+                     async for chunk in response.content.iter_chunked(chunk_size):
+                         f.write(chunk)
+                         downloaded += len(chunk)
+                         progress_bar.update(len(chunk))
+
+     print(f"Download complete: {target_path}")
+     return target_path
+
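The request headers come from `..utils` inside the package. For experimenting with `download_sec_file` on its own, a minimal stand-in is sketched below; the `headers` value here is an assumption for illustration (SEC's fair-access guidance expects a descriptive User-Agent with contact details), not the dict datamule actually ships.

    import asyncio

    # Hypothetical stand-in for ..utils.headers -- illustration only.
    headers = {"User-Agent": "Example Research admin@example.com"}

    # Assuming download_sec_file is in scope, a one-off download would be:
    # asyncio.run(download_sec_file(
    #     "https://www.sec.gov/Archives/edgar/daily-index/bulkdata/submissions.zip",
    #     "submissions.zip",
    # ))
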
+ def extract_metadata(data):
+     """Extract and flatten relevant company metadata from SEC submission data."""
+     result = {}
+
+     # Extract top-level fields, but exclude formerNames as it will be processed separately
+     for key in ['cik', 'entityType', 'sic', 'sicDescription', 'ownerOrg',
+                 'insiderTransactionForOwnerExists', 'insiderTransactionForIssuerExists',
+                 'name', 'tickers', 'exchanges', 'ein', 'description', 'category', 'fiscalYearEnd', 'stateOfIncorporation',
+                 'stateOfIncorporationDescription', 'phone', 'flags']:
+         result[key] = data.get(key)
+
+     # Extract address fields
+     if 'addresses' in data:
+         for addr_type in ['mailing', 'business']:
+             if addr_type in data['addresses']:
+                 addr = data['addresses'][addr_type]
+                 for field in ['street1', 'street2', 'city', 'stateOrCountry', 'zipCode', 'stateOrCountryDescription']:
+                     result[f"{addr_type}_{field}"] = addr.get(field)
+
+     # Add start_date field (will be populated later)
+     result['start_date'] = ''
+
+     return result
+
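To make the flattening concrete, here is a small worked example with an invented submission fragment (field names mirror the keys the function reads; values are made up, and `extract_metadata` is assumed to be in scope):

    sample = {
        "cik": "0001234567",
        "name": "EXAMPLE COMPANY INC",
        "tickers": ["XMPL"],
        "addresses": {
            "business": {"city": "Springfield", "stateOrCountry": "IL"},
            "mailing": {"city": "Springfield", "stateOrCountry": "IL"},
        },
    }

    flat = extract_metadata(sample)
    # Nested address fields come out prefixed by address type, e.g.:
    #   flat["business_city"]          -> "Springfield"
    #   flat["mailing_stateOrCountry"] -> "IL"
    # Top-level keys missing from the input are simply None, and
    # flat["start_date"] is "" until the caller fills it in.
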
+ def extract_earliest_filing_date(data):
+     """Extract the earliest filing date from the full JSON data."""
+     earliest_date = None
+
+     # Try to get dates from the filings.files array first
+     if 'filings' in data and 'files' in data['filings'] and isinstance(data['filings']['files'], list):
+         for file_info in data['filings']['files']:
+             if isinstance(file_info, dict) and 'filingFrom' in file_info:
+                 file_date = file_info.get('filingFrom', '')
+                 if file_date and (earliest_date is None or file_date < earliest_date):
+                     earliest_date = file_date
+
+     # If no date found in files array, check filingDate array in filings.recent
+     if earliest_date is None and 'filings' in data and 'recent' in data['filings']:
+         if 'filingDate' in data['filings']['recent'] and isinstance(data['filings']['recent']['filingDate'], list):
+             filing_dates = data['filings']['recent']['filingDate']
+             for filing_date in filing_dates:
+                 if filing_date and (earliest_date is None or filing_date < earliest_date):
+                     earliest_date = filing_date
+
+     return earliest_date
+
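A quick illustration with an invented payload; the shapes follow what the function inspects (older history chunks under filings.files, recent dates under filings.recent.filingDate):

    data = {
        "filings": {
            "files": [
                {"name": "CIK0001234567-submissions-001.json",
                 "filingFrom": "1998-03-10", "filingTo": "2004-12-31"},
            ],
            "recent": {"filingDate": ["2024-02-01", "2023-11-14"]},
        }
    }
    print(extract_earliest_filing_date(data))  # -> "1998-03-10"; the files array wins when it has dates
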
+ def process_former_names(data, cik, current_name):
+     """
+     Process former names into a list of records.
+     Returns former names records and the earliest company date.
+     """
+     former_names_records = []
+     earliest_company_date = None
+
+     # Process former names if present
+     former_names = data.get('formerNames', [])
+
+     # Track the latest end date to use for current name start date
+     latest_end_date = None
+
+     if former_names and isinstance(former_names, list):
+         for former_name in former_names:
+             if isinstance(former_name, dict):
+                 # Extract name, start date, and end date
+                 name = former_name.get('name', '')
+                 start_date = former_name.get('from', '')
+                 end_date = former_name.get('to', '')
+
+                 # Clean up date formats (remove time component)
+                 if start_date:
+                     start_date = start_date.split('T')[0]
+                     # Track earliest company date across all former names
+                     if earliest_company_date is None or start_date < earliest_company_date:
+                         earliest_company_date = start_date
+
+                 if end_date:
+                     end_date = end_date.split('T')[0]
+                     # Track latest end date
+                     if not latest_end_date or end_date > latest_end_date:
+                         latest_end_date = end_date
+
+                 # Create record for former name
+                 record = {
+                     'name': name,
+                     'start_date': start_date,
+                     'end_date': end_date,
+                     'cik': cik
+                 }
+
+                 former_names_records.append(record)
+
+     # Find the earliest filing date for the company if no date found in former names
+     if earliest_company_date is None:
+         earliest_company_date = extract_earliest_filing_date(data)
+         if earliest_company_date and 'T' in earliest_company_date:
+             earliest_company_date = earliest_company_date.split('T')[0]
+
+     # For the current name, if we don't have a start date from former names,
+     # we'll use the earliest filing date
+     if not latest_end_date:
+         latest_end_date = earliest_company_date
+
+     # Add current name record with start date as latest end date
+     current_record = {
+         'name': current_name,
+         'start_date': latest_end_date if latest_end_date else '',
+         'end_date': '',  # Current name has no end date
+         'cik': cik
+     }
+
+     former_names_records.append(current_record)
+
+     # Return both the records and the earliest company date (for metadata)
+     return former_names_records, earliest_company_date
+
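A worked example of the name-history logic, again with invented values (assumes the functions above are in scope):

    data = {
        "formerNames": [
            {"name": "OLD NAME CORP",
             "from": "2001-05-01T00:00:00.000Z",
             "to": "2010-06-30T00:00:00.000Z"},
        ]
    }
    records, earliest = process_former_names(data, cik="1234567", current_name="NEW NAME CORP")
    # records ->
    #   {'name': 'OLD NAME CORP', 'start_date': '2001-05-01', 'end_date': '2010-06-30', 'cik': '1234567'}
    #   {'name': 'NEW NAME CORP', 'start_date': '2010-06-30', 'end_date': '', 'cik': '1234567'}
    # earliest == '2001-05-01', which the caller stores as metadata['start_date'].
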
+ def write_metadata_to_csv(metadata_list, output_path):
+     """Write metadata records to CSV and compress with gzip."""
+     if not metadata_list:
+         return
+
+     # Add .gz extension if not already present
+     if not output_path.endswith('.gz'):
+         output_path = output_path + '.gz'
+
+     # Get all possible field names across all records
+     fieldnames = set()
+     for metadata in metadata_list:
+         fieldnames.update(metadata.keys())
+
+     # Make sure 'name', 'cik', and 'start_date' come first
+     fieldnames = ['name', 'cik', 'start_date'] + [f for f in sorted(fieldnames) if f not in ['name', 'cik', 'start_date']]
+
+     # Write directly to gzipped CSV without using StringIO buffer
+     with gzip.open(output_path, 'wt', encoding='utf-8', newline='') as gzfile:
+         writer = csv.DictWriter(gzfile, fieldnames=fieldnames)
+         writer.writeheader()
+         writer.writerows(metadata_list)
+
+     print(f"Wrote {len(metadata_list)} records to {output_path}")
+
+ def write_names_to_csv(names_list, output_path):
+     """Write name records to CSV and compress with gzip."""
+     if not names_list:
+         return
+
+     # Add .gz extension if not already present
+     if not output_path.endswith('.gz'):
+         output_path = output_path + '.gz'
+
+     # Names CSV has fixed columns
+     fieldnames = ['name', 'start_date', 'end_date', 'cik']
+
+     # Write directly to gzipped CSV
+     with gzip.open(output_path, 'wt', encoding='utf-8', newline='') as gzfile:
+         writer = csv.DictWriter(gzfile, fieldnames=fieldnames)
+         writer.writeheader()
+         writer.writerows(names_list)
+
+     print(f"Wrote {len(names_list)} records to {output_path}")
+
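Both writers always emit gzip-compressed CSV with a .gz suffix. Reading one back needs nothing beyond the standard library; a small sketch (the output path below is illustrative):

    import csv
    import gzip

    with gzip.open("output/listed_filer_names.csv.gz", "rt", encoding="utf-8", newline="") as fh:
        rows = list(csv.DictReader(fh))
    print(rows[0].keys() if rows else "empty")  # name, start_date, end_date, cik
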
+ async def extract_and_process_metadata(output_dir, local_zip_path=None, sec_url="https://www.sec.gov/Archives/edgar/daily-index/bulkdata/submissions.zip", max_bytes=2000):
+     """
+     Extracts metadata from JSON files in a ZIP archive and writes to multiple CSV files.
+     Can download the ZIP file from SEC to a temporary location if local_zip_path not provided.
+
+     Args:
+         output_dir (str): Directory for output CSV files
+         local_zip_path (str, optional): Path to a local ZIP file. If None, downloads from SEC to temp
+         sec_url (str): URL to download the SEC submissions ZIP file
+         max_bytes (int): Maximum number of bytes to extract from each file
+
+     Returns:
+         dict: Statistics about processed files
+     """
+     # Ensure output directory exists
+     if not os.path.exists(output_dir):
+         os.makedirs(output_dir)
+
+     # Initialize collections for different types of data
+     listed_metadata = []
+     unlisted_metadata = []
+     listed_names = []
+     unlisted_names = []
+
+     stats = {
+         'total_processed': 0,
+         'listed_companies': 0,
+         'unlisted_companies': 0,
+         'full_content_reads': 0
+     }
+
+     # Use provided ZIP file or download to temporary file
+     if local_zip_path:
+         # Use local file
+         print(f"Using local ZIP file: {local_zip_path}")
+         zip_path = local_zip_path
+         temp_file = None
+     else:
+         # Download to temporary file
+         print(f"Downloading from SEC to temporary file: {sec_url}")
+         temp_file = tempfile.NamedTemporaryFile(suffix='.zip', delete=False)
+         temp_file.close()  # Close the file so we can download to it
+         zip_path = temp_file.name
+
+         try:
+             await download_sec_file(sec_url, zip_path)
+         except Exception as e:
+             # Clean up temp file if download fails
+             if os.path.exists(zip_path):
+                 os.unlink(zip_path)
+             raise e
+
+     try:
+         # Process the ZIP file
+         with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+             # Get list of files (excluding directories)
+             files = [f for f in zip_ref.infolist() if not f.is_dir()]
+             # Remove files that contain "submission" in their name
+             files = [f for f in files if "submission" not in f.filename]
+             # Remove placeholder.txt
+             files = [f for f in files if "placeholder.txt" not in f.filename]
+
+             # Create a progress bar
+             with tqdm(total=len(files), desc="Extracting metadata", unit="file") as pbar:
+                 # Loop through all files in the ZIP archive
+                 for file_info in files:
+                     try:
+                         # Initially read just a small chunk of the file
+                         with zip_ref.open(file_info.filename, 'r') as file:
+                             partial_content_bytes = file.read(max_bytes)
+
+                         # Convert to string
+                         partial_content = partial_content_bytes.decode('utf-8', errors='replace')
+
+                         # Truncate at "filings" and complete the JSON for initial parsing
+                         filings_index = partial_content.find('"filings":')
+                         if filings_index != -1:
+                             # Get content up to "filings"
+                             truncated_content = partial_content[:filings_index]
+                             # Remove trailing comma if present
+                             truncated_content = truncated_content.rstrip().rstrip(',')
+                             # Add closing brace
+                             partial_content = truncated_content + '}'
+                         else:
+                             # If "filings" not found, try to make valid JSON by adding closing brace
+                             partial_content = partial_content.rstrip().rstrip(',') + '}'
+
+                         try:
+                             # Parse the partial JSON to check for former names
+                             partial_json_data = json.loads(partial_content)
+
+                             # Check if we need full content (no former names or former names is empty list)
+                             former_names = partial_json_data.get('formerNames', [])
+                             need_full_content = not former_names or len(former_names) == 0
+
+                             # Initialize json_data with the partial data
+                             json_data = partial_json_data
+
+                             # If we need more data for filing dates, read the full file
+                             if need_full_content:
+                                 stats['full_content_reads'] += 1
+
+                                 # Read the entire file content
+                                 with zip_ref.open(file_info.filename, 'r') as full_file:
+                                     full_content_bytes = full_file.read()
+                                     full_content = full_content_bytes.decode('utf-8', errors='replace')
+
+                                 try:
+                                     # Parse the full JSON
+                                     json_data = json.loads(full_content)
+                                 except json.JSONDecodeError:
+                                     # If full content can't be parsed, stick with partial data
+                                     print(f"Warning: Could not parse full content of {file_info.filename}, using partial data")
+
+                             # Extract metadata (without former names)
+                             metadata = extract_metadata(json_data)
+
+                             # Get CIK and name for former names processing
+                             cik = metadata.get('cik', '')
+                             name = metadata.get('name', '')
+
+                             # Process former names with the full json_data
+                             # Now also returning the earliest company date
+                             former_names_records, earliest_company_date = process_former_names(json_data, cik, name)
+
+                             # Add the earliest company date to the metadata
+                             metadata['start_date'] = earliest_company_date if earliest_company_date else ''
+
+                             # Check if company is listed (has tickers)
+                             tickers = metadata.get('tickers', [])
+                             is_listed = tickers and isinstance(tickers, list) and len(tickers) > 0
+
+                             # Add to appropriate collections
+                             if is_listed:
+                                 listed_metadata.append(metadata)
+                                 listed_names.extend(former_names_records)
+                                 stats['listed_companies'] += 1
+                             else:
+                                 unlisted_metadata.append(metadata)
+                                 unlisted_names.extend(former_names_records)
+                                 stats['unlisted_companies'] += 1
+
+                             stats['total_processed'] += 1
+
+                         except json.JSONDecodeError as je:
+                             print(f"JSON parsing error in {file_info.filename}: {str(je)}")
+
+                     except Exception as e:
+                         # Handle any errors
+                         print(f"Error processing {file_info.filename}: {str(e)}")
+
+                     # Update the progress bar
+                     pbar.update(1)
+
+     finally:
+         # Clean up temporary file if we created one
+         if temp_file and os.path.exists(zip_path):
+             print(f"Removing temporary file: {zip_path}")
+             os.unlink(zip_path)
+
+     # Define output file paths (without .gz extension, it will be added in the write functions)
+     listed_metadata_path = os.path.join(output_dir, "listed_filer_metadata.csv")
+     unlisted_metadata_path = os.path.join(output_dir, "unlisted_filer_metadata.csv")
+     listed_names_path = os.path.join(output_dir, "listed_filer_names.csv")
+     unlisted_names_path = os.path.join(output_dir, "unlisted_filer_names.csv")
+
+     # Write listed metadata to CSV
+     if listed_metadata:
+         write_metadata_to_csv(listed_metadata, listed_metadata_path)
+
+     # Write unlisted metadata to CSV
+     if unlisted_metadata:
+         write_metadata_to_csv(unlisted_metadata, unlisted_metadata_path)
+
+     # Write listed names to CSV
+     if listed_names:
+         write_names_to_csv(listed_names, listed_names_path)
+
+     # Write unlisted names to CSV
+     if unlisted_names:
+         write_names_to_csv(unlisted_names, unlisted_names_path)
+
+     # Print summary
+     print(f"\nTotal files processed: {stats['total_processed']}")
+     print(f"Listed companies found: {stats['listed_companies']}")
+     print(f"Unlisted companies found: {stats['unlisted_companies']}")
+     print(f"Files requiring full content read: {stats['full_content_reads']}")
+     print(f"Output files written to {output_dir}")
+
+     return stats
+
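The partial-read trick above deserves a spelled-out example: each submissions JSON starts with the company metadata and only then the very large "filings" object, so cutting the first max_bytes just before `"filings":` and closing the brace usually yields parseable JSON without reading the whole file. A standalone illustration of that repair step, on a made-up prefix:

    import json

    # First bytes of an invented submissions file; the metadata precedes "filings".
    partial = '{"cik": "1234567", "name": "EXAMPLE CO", "tickers": ["XMPL"], "filings": {"recent": {"accessionNumber": ["0001'
    idx = partial.find('"filings":')
    repaired = partial[:idx].rstrip().rstrip(',') + '}'
    print(json.loads(repaired)["name"])  # EXAMPLE CO
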
+ # Convenience function to run the extractor
+ def process_submissions_metadata(output_dir, local_zip_path=None, sec_url="https://www.sec.gov/Archives/edgar/daily-index/bulkdata/submissions.zip", max_bytes=2000):
+     """
+     Convenience function to run the SEC Metadata Extractor.
+
+     Args:
+         output_dir (str): Directory for output CSV files
+         local_zip_path (str, optional): Path to a local ZIP file. If None, downloads from SEC to temp
+         sec_url (str): URL to download the SEC submissions ZIP file
+         max_bytes (int): Maximum number of bytes to extract from each file
+
+     Returns:
+         dict: Statistics about processed files
+     """
+     return asyncio.run(extract_and_process_metadata(
+         output_dir=output_dir,
+         local_zip_path=local_zip_path,
+         sec_url=sec_url,
+         max_bytes=max_bytes
+     ))
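
Typical usage of the convenience wrapper, assuming it is importable from wherever datamule exposes this module (the directory name below is illustrative; the bulk submissions.zip is a large download, so pass local_zip_path if you already have it on disk):

    stats = process_submissions_metadata(
        output_dir="sec_metadata_out",   # illustrative output directory
        local_zip_path=None,             # None -> download to a temporary file first
        max_bytes=2000,                  # default prefix size read from each JSON
    )
    print(stats)
    # {'total_processed': ..., 'listed_companies': ..., 'unlisted_companies': ..., 'full_content_reads': ...}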