datamule 1.0.8__py3-none-any.whl → 1.0.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
File without changes
datamule/book/book.py ADDED
@@ -0,0 +1,34 @@
+ from pathlib import Path
+ from ..helper import _process_cik_and_metadata_filters, load_package_dataset
+ from ..sec.xbrl.downloadcompanyfacts import download_company_facts
+
+ class Book:
+     def __init__(self, path):
+         self.path = Path(path)
+
+     def download_xbrl(
+         self,
+         cik=None,
+         ticker=None,
+         **kwargs
+     ):
+         # If no CIK or ticker specified, get all companies with tickers
+         if cik is None and ticker is None:
+             cik = [row['cik'] for row in load_package_dataset('company_tickers')]
+
+         # Normalize cik to list format
+         if isinstance(cik, (str, int)):
+             cik = [cik]
+
+         # Process CIK and metadata filters
+         cik_list = _process_cik_and_metadata_filters(cik, ticker, **kwargs)
+
+         # Download facts for all CIKs in parallel
+         download_company_facts(cik=cik_list, output_dir=self.path)
+
+     def query_345(self):
+         pass
+     def query_xbrl(self):
+         pass
+     def query_13fhr(self):
+         pass
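
A minimal usage sketch for the new Book class follows (not part of the diff). It assumes the datamule.book.book module shown above is importable as a subpackage; the output directory and CIK value are illustrative only.

# Minimal sketch, assuming the datamule.book subpackage is importable; path and CIK are illustrative.
from datamule.book.book import Book

book = Book("./xbrl_facts")  # directory where downloaded company facts will be written

# Download XBRL company facts for a single issuer by CIK (Apple's CIK used as an example).
book.download_xbrl(cik=320193)

# With no cik/ticker arguments, facts are downloaded for every company with a ticker.
# book.download_xbrl()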
File without changes
@@ -0,0 +1,234 @@
+ import copy
+
+ dict_sgml = {
+     "rules": {
+         "join_text": "\n",
+         "remove": [
+             {
+                 "pattern": r"^<PAGE>",
+             }
+         ],
+         "mappings": [
+             {
+                 "name": "table",
+                 "pattern": r"^<TABLE>",
+                 "end": r"^</TABLE>"
+             },
+             {
+                 "name": "caption",
+                 "pattern": r"^<CAPTION>",
+                 "end": r"^<S>",
+                 "keep_end": True
+             },
+             {
+                 "name": "footnote",
+                 "pattern": r"^<FN>",
+                 "end": r"^</FN>"
+             }
+         ]
+     }
+ }
+
+ item_pattern_mapping = r"^\n\n\s*(ITEM|Item)\s+(\d+[a-zA-Z]?|ONE|TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT|NINE|TEN|ELEVEN|TWELVE|THIRTEEN|FOURTEEN|FIFTEEN|SIXTEEN|[0-9]+[a-zA-Z]?)\.?"
+ item_pattern_mapping_8k = r"^\n\n\s*(ITEM|Item)\s+(\d+(?:\.\d+)?[a-zA-Z]?|ONE|TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT|NINE|TEN|ELEVEN|TWELVE|THIRTEEN|FOURTEEN|FIFTEEN|SIXTEEN|[0-9]+[a-zA-Z]?)\.?"
+ part_pattern_mapping = r"^\n\n\s*(PART|Part)\s+(?:I{1,3}|IV)\.?"
+
+ item_pattern_standardization = r"^\s*(?:ITEM|Item)\s+(\d+[a-zA-Z]?|ONE|TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT|NINE|TEN|ELEVEN|TWELVE|THIRTEEN|FOURTEEN|FIFTEEN|SIXTEEN|[0-9]+[a-zA-Z]?)\.?"
+ item_pattern_standardization_8k = r"^\s*(?:ITEM|Item)\s+(\d+(?:\.\d+)?[a-zA-Z]?|ONE|TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT|NINE|TEN|ELEVEN|TWELVE|THIRTEEN|FOURTEEN|FIFTEEN|SIXTEEN)\.?"
+ part_pattern_standardization = r"^\s*(?:PART|Part)\s+([IVX]+)"
+
+
+ dict_10k = copy.deepcopy(dict_sgml)
+ dict_10k["rules"]["mappings"].extend([
+     {
+         "type": "hierarchy",
+         "name": "part",
+         "pattern": part_pattern_mapping,
+         "hierarchy": 0
+     },
+     {
+         "type": "hierarchy",
+         "name": "item",
+         "pattern": item_pattern_mapping,
+         "hierarchy": 1
+     },
+ ])
+
+ # In the mapping dict:
+ dict_10k['transformations'] = [
+     {
+         "type": "standardize",
+         "match": {
+             "type": "part",
+             "text_pattern": part_pattern_standardization
+         },
+         "output": {
+             "format": "part{}",
+             "field": "text"  # Where to store the standardized value
+         }
+     },
+     {
+         "type": "standardize",
+         "match": {
+             "type": "item",
+             "text_pattern": item_pattern_standardization
+         },
+         "output": {
+             "format": "item{}",
+             "field": "text"  # Field where the standardized value is stored
+         }
+     },
+     {
+         "type": "merge_consecutive",
+         "match": {
+             "types": ["part", "item"]  # section types to check for merging
+         }
+     },
+     {
+         "type": "trim",
+         "match": {
+             "type": "item",
+             "expected": 1
+         },
+         "output": {
+             "type": "introduction",
+             "separator": "\n"
+         }
+     }
+
+ ]
+
+ dict_10q = copy.deepcopy(dict_sgml)
+ dict_10q["rules"]["mappings"].extend([
+     {
+         "type": "hierarchy",
+         "name": "part",
+         "pattern": part_pattern_mapping,
+         "hierarchy": 0
+     },
+     {
+         "type": "hierarchy",
+         "name": "item",
+         "pattern": item_pattern_mapping,
+         "hierarchy": 1
+     },
+ ])
+
+ # In the mapping dict:
+ dict_10q['transformations'] = [
+     {
+         "type": "standardize",
+         "match": {
+             "type": "part",
+             "text_pattern": part_pattern_standardization
+         },
+         "output": {
+             "format": "part{}",
+             "field": "text"  # Where to store the standardized value
+         }
+     },
+     {
+         "type": "standardize",
+         "match": {
+             "type": "item",
+             "text_pattern": item_pattern_standardization
+         },
+         "output": {
+             "format": "item{}",
+             "field": "text"  # Field where the standardized value is stored
+         }
+     },
+     {
+         "type": "merge_consecutive",
+         "match": {
+             "types": ["part", "item"]  # section types to check for merging
+         }
+     },
+     {
+         "type": "trim",
+         "match": {
+             "type": "item",
+             "expected": 2
+         },
+         "output": {
+             "type": "introduction",
+             "separator": "\n"
+         }
+     }
+
+ ]
+
+ dict_13d = copy.deepcopy(dict_sgml)
+ dict_13d["rules"]["mappings"].extend([
+     {
+         "type": "hierarchy",
+         "name": "item",
+         "pattern": item_pattern_mapping,
+         "hierarchy": 0
+     },
+ ])
+
+ dict_13d['transformations'] = [
+     {
+         "type": "standardize",
+         "match": {
+             "type": "item",
+             "text_pattern": item_pattern_standardization
+         },
+         "output": {
+             "format": "item{}",
+             "field": "text"  # Field where the standardized value is stored
+         }
+     },
+     {
+         "type": "merge_consecutive",
+         "match": {
+             "types": ["item"]  # section types to check for merging
+         }
+     }
+
+ ]
+
+ dict_13g = copy.deepcopy(dict_13d)
+
+ dict_8k = copy.deepcopy(dict_sgml)
+ dict_8k["rules"]["mappings"].extend([
+     {
+         "type": "hierarchy",
+         "name": "item",
+         "pattern": item_pattern_mapping_8k,
+         "hierarchy": 0
+     },
+ ])
+
+ dict_8k['transformations'] = [
+     {
+         "type": "standardize",
+         "match": {
+             "type": "item",
+             "text_pattern": item_pattern_standardization_8k
+         },
+         "output": {
+             "format": "item{}",
+             "field": "text"  # Field where the standardized value is stored
+         }
+     },
+     {
+         "type": "merge_consecutive",
+         "match": {
+             "types": ["item"]  # section types to check for merging
+         }
+     },
+     {
+         "type": "trim",
+         "match": {
+             "type": "item",
+             "expected": 1
+         },
+         "output": {
+             "type": "introduction",
+             "separator": "\n"
+         }
+     }
+
+ ]
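
The mapping dicts above are consumed by a parser that is not part of this diff, but the section-header regexes can be exercised on their own. A self-contained sketch (not part of the diff), using simplified copies of the patterns defined above:

import re

# Copied from the module above; the item pattern keeps only the numeric branch for brevity.
part_pattern_standardization = r"^\s*(?:PART|Part)\s+([IVX]+)"
item_pattern_standardization = r"^\s*(?:ITEM|Item)\s+(\d+[a-zA-Z]?)\.?"

sample_headers = [
    "PART II",
    "Item 7. Management's Discussion and Analysis",
    "ITEM 7A.",
]

for line in sample_headers:
    part = re.match(part_pattern_standardization, line)
    item = re.match(item_pattern_standardization, line)
    if part:
        print("part{}".format(part.group(1)))          # -> partII
    elif item:
        print("item{}".format(item.group(1).lower()))  # -> item7, item7a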
@@ -0,0 +1,19 @@
+ dict_345 = {
+     "transformations": [
+         {
+             "search": {
+                 "key": "footnoteId",
+                 "identifier": "@id"
+             },
+             "match": {
+                 "identifier": "@id",
+                 "content": "#text",
+                 "remove_after_use": True
+             },
+             "output": {
+                 "key": "footnote",
+                 "value": "content"
+             }
+         }
+     ]
+ }
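
An illustrative sketch (not part of the diff) of what the dict_345 transformation describes: resolving a footnoteId reference ("@id") to the text of the matching footnote ("#text") and storing it under a "footnote" key. The transformation engine that applies dict_345 is not included in this diff, and the data below is hypothetical.

# Hypothetical Form 3/4/5 fragments in an xmltodict-style layout.
footnote_defs = [
    {"@id": "F1", "#text": "Shares held indirectly by a family trust."},
]
footnotes_by_id = {fn["@id"]: fn["#text"] for fn in footnote_defs}

holding = {"transactionShares": {"value": "1000", "footnoteId": {"@id": "F1"}}}

# search: locate the "footnoteId" key and its "@id"; remove it after use
ref = holding["transactionShares"].pop("footnoteId")
# output: store the matched footnote's "#text" under the "footnote" key
holding["transactionShares"]["footnote"] = footnotes_by_id[ref["@id"]]

print(holding["transactionShares"])
# {'value': '1000', 'footnote': 'Shares held indirectly by a family trust.'}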
File without changes
File without changes
@@ -0,0 +1,386 @@
+ import zipfile
+ import os
+ import json
+ import csv
+ import gzip
+ import asyncio
+ import aiohttp
+ import tempfile
+ from tqdm import tqdm
+ from datetime import datetime
+ from ..utils import headers
+
+ async def download_sec_file(url, target_path):
+     """Download submissions.zip from SEC website with progress bar."""
+
+
+     async with aiohttp.ClientSession() as session:
+         async with session.get(url, headers=headers) as response:
+             if response.status != 200:
+                 raise Exception(f"Failed to download: HTTP {response.status}")
+
+             file_size = int(response.headers.get('Content-Length', 0))
+
+             with tqdm(total=file_size, unit='B', unit_scale=True, desc="Downloading SEC data") as progress_bar:
+                 with open(target_path, 'wb') as f:
+                     chunk_size = 1024 * 1024  # 1MB chunks
+                     downloaded = 0
+
+                     async for chunk in response.content.iter_chunked(chunk_size):
+                         f.write(chunk)
+                         downloaded += len(chunk)
+                         progress_bar.update(len(chunk))
+
+     print(f"Download complete: {target_path}")
+     return target_path
+
+ def extract_metadata(data):
+     """Extract and flatten relevant company metadata from SEC submission data."""
+     result = {}
+
+     # Extract top-level fields, but exclude formerNames as it will be processed separately
+     for key in ['cik', 'entityType', 'sic', 'sicDescription', 'ownerOrg',
+                 'insiderTransactionForOwnerExists', 'insiderTransactionForIssuerExists',
+                 'name', 'tickers', 'exchanges', 'ein', 'description', 'category', 'fiscalYearEnd', 'stateOfIncorporation',
+                 'stateOfIncorporationDescription', 'phone', 'flags']:
+         result[key] = data.get(key)
+
+     # Extract address fields
+     if 'addresses' in data:
+         for addr_type in ['mailing', 'business']:
+             if addr_type in data['addresses']:
+                 addr = data['addresses'][addr_type]
+                 for field in ['street1', 'street2', 'city', 'stateOrCountry', 'zipCode', 'stateOrCountryDescription']:
+                     result[f"{addr_type}_{field}"] = addr.get(field)
+
+     return result
+
+ def extract_earliest_filing_date(data):
+     """Extract the earliest filing date from the full JSON data."""
+     earliest_date = None
+
+     # Try to get dates from the filings.files array first
+     if 'filings' in data and 'files' in data['filings'] and isinstance(data['filings']['files'], list):
+         for file_info in data['filings']['files']:
+             if isinstance(file_info, dict) and 'filingFrom' in file_info:
+                 file_date = file_info.get('filingFrom', '')
+                 if file_date and (earliest_date is None or file_date < earliest_date):
+                     earliest_date = file_date
+
+     # If no date found in files array, check filingDate array in filings.recent
+     if earliest_date is None and 'filings' in data and 'recent' in data['filings']:
+         if 'filingDate' in data['filings']['recent'] and isinstance(data['filings']['recent']['filingDate'], list):
+             filing_dates = data['filings']['recent']['filingDate']
+             for filing_date in filing_dates:
+                 if filing_date and (earliest_date is None or filing_date < earliest_date):
+                     earliest_date = filing_date
+
+     return earliest_date
+
+ def process_former_names(data, cik, current_name):
+     """Process former names into a list of records."""
+     former_names_records = []
+
+     # Process former names if present
+     former_names = data.get('formerNames', [])
+
+     # Track the latest end date to use for current name start date
+     latest_end_date = None
+
+     if former_names and isinstance(former_names, list):
+         for former_name in former_names:
+             if isinstance(former_name, dict):
+                 # Extract name, start date, and end date
+                 name = former_name.get('name', '')
+                 start_date = former_name.get('from', '')
+                 end_date = former_name.get('to', '')
+
+                 # Clean up date formats (remove time component)
+                 if start_date:
+                     start_date = start_date.split('T')[0]
+                 if end_date:
+                     end_date = end_date.split('T')[0]
+                     # Track latest end date
+                     if not latest_end_date or end_date > latest_end_date:
+                         latest_end_date = end_date
+
+                 # Create record for former name
+                 record = {
+                     'name': name,
+                     'start_date': start_date,
+                     'end_date': end_date,
+                     'cik': cik
+                 }
+
+                 former_names_records.append(record)
+
+     # For the current name, if we don't have a start date from former names,
+     # we'll try to find the earliest filing date
+     if not latest_end_date:
+         latest_end_date = extract_earliest_filing_date(data)
+
+     # Add current name record with start date as latest end date
+     current_record = {
+         'name': current_name,
+         'start_date': latest_end_date if latest_end_date else '',
+         'end_date': '',  # Current name has no end date
+         'cik': cik
+     }
+
+     former_names_records.append(current_record)
+
+     return former_names_records
+
+ def write_metadata_to_csv(metadata_list, output_path):
+     """Write metadata records to CSV and compress with gzip."""
+     if not metadata_list:
+         return
+
+     # Add .gz extension if not already present
+     if not output_path.endswith('.gz'):
+         output_path = output_path + '.gz'
+
+     # Get all possible field names across all records
+     fieldnames = set()
+     for metadata in metadata_list:
+         fieldnames.update(metadata.keys())
+
+     # Make sure 'name' and 'cik' come first
+     fieldnames = ['name', 'cik'] + [f for f in sorted(fieldnames) if f not in ['name', 'cik']]
+
+     # Write directly to gzipped CSV without using StringIO buffer
+     with gzip.open(output_path, 'wt', encoding='utf-8', newline='') as gzfile:
+         writer = csv.DictWriter(gzfile, fieldnames=fieldnames)
+         writer.writeheader()
+         writer.writerows(metadata_list)
+
+     print(f"Wrote {len(metadata_list)} records to {output_path}")
+
+ def write_names_to_csv(names_list, output_path):
+     """Write name records to CSV and compress with gzip."""
+     if not names_list:
+         return
+
+     # Add .gz extension if not already present
+     if not output_path.endswith('.gz'):
+         output_path = output_path + '.gz'
+
+     # Names CSV has fixed columns
+     fieldnames = ['name', 'start_date', 'end_date', 'cik']
+
+     # Write directly to gzipped CSV
+     with gzip.open(output_path, 'wt', encoding='utf-8', newline='') as gzfile:
+         writer = csv.DictWriter(gzfile, fieldnames=fieldnames)
+         writer.writeheader()
+         writer.writerows(names_list)
+
+     print(f"Wrote {len(names_list)} records to {output_path}")
+
+ async def extract_and_process_metadata(output_dir, local_zip_path=None, sec_url="https://www.sec.gov/Archives/edgar/daily-index/bulkdata/submissions.zip", max_bytes=2000):
+     """
+     Extracts metadata from JSON files in a ZIP archive and writes to multiple CSV files.
+     Can download the ZIP file from SEC to a temporary location if local_zip_path not provided.
+
+     Args:
+         output_dir (str): Directory for output CSV files
+         local_zip_path (str, optional): Path to a local ZIP file. If None, downloads from SEC to temp
+         sec_url (str): URL to download the SEC submissions ZIP file
+         max_bytes (int): Maximum number of bytes to extract from each file
+
+     Returns:
+         dict: Statistics about processed files
+     """
+     # Ensure output directory exists
+     if not os.path.exists(output_dir):
+         os.makedirs(output_dir)
+
+     # Initialize collections for different types of data
+     listed_metadata = []
+     unlisted_metadata = []
+     listed_names = []
+     unlisted_names = []
+
+     stats = {
+         'total_processed': 0,
+         'listed_companies': 0,
+         'unlisted_companies': 0,
+         'full_content_reads': 0
+     }
+
+     # Use provided ZIP file or download to temporary file
+     if local_zip_path:
+         # Use local file
+         print(f"Using local ZIP file: {local_zip_path}")
+         zip_path = local_zip_path
+         temp_file = None
+     else:
+         # Download to temporary file
+         print(f"Downloading from SEC to temporary file: {sec_url}")
+         temp_file = tempfile.NamedTemporaryFile(suffix='.zip', delete=False)
+         temp_file.close()  # Close the file so we can download to it
+         zip_path = temp_file.name
+
+         try:
+             await download_sec_file(sec_url, zip_path)
+         except Exception as e:
+             # Clean up temp file if download fails
+             if os.path.exists(zip_path):
+                 os.unlink(zip_path)
+             raise e
+
+     try:
+         # Process the ZIP file
+         with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+             # Get list of files (excluding directories)
+             files = [f for f in zip_ref.infolist() if not f.is_dir()]
+             # Remove files that contain "submission" in their name
+             files = [f for f in files if "submission" not in f.filename]
+             # Remove placeholder.txt
+             files = [f for f in files if "placeholder.txt" not in f.filename]
+
+
+             # Create a progress bar
+             with tqdm(total=len(files), desc="Extracting metadata", unit="file") as pbar:
+                 # Loop through all files in the ZIP archive
+                 for file_info in files:
+                     try:
+                         # Initially read just a small chunk of the file
+                         with zip_ref.open(file_info.filename, 'r') as file:
+                             partial_content_bytes = file.read(max_bytes)
+
+                         # Convert to string
+                         partial_content = partial_content_bytes.decode('utf-8', errors='replace')
+
+                         # Truncate at "filings" and complete the JSON for initial parsing
+                         filings_index = partial_content.find('"filings":')
+                         if filings_index != -1:
+                             # Get content up to "filings"
+                             truncated_content = partial_content[:filings_index]
+                             # Remove trailing comma if present
+                             truncated_content = truncated_content.rstrip().rstrip(',')
+                             # Add closing brace
+                             partial_content = truncated_content + '}'
+                         else:
+                             # If "filings" not found, try to make valid JSON by adding closing brace
+                             partial_content = partial_content.rstrip().rstrip(',') + '}'
+
+                         try:
+                             # Parse the partial JSON to check for former names
+                             partial_json_data = json.loads(partial_content)
+
+                             # Check if we need full content (no former names or former names is empty list)
+                             former_names = partial_json_data.get('formerNames', [])
+                             need_full_content = not former_names or len(former_names) == 0
+
+                             # Initialize json_data with the partial data
+                             json_data = partial_json_data
+
+                             # If we need more data for filing dates, read the full file
+                             if need_full_content:
+                                 stats['full_content_reads'] += 1
+
+                                 # Read the entire file content
+                                 with zip_ref.open(file_info.filename, 'r') as full_file:
+                                     full_content_bytes = full_file.read()
+                                     full_content = full_content_bytes.decode('utf-8', errors='replace')
+
+                                 try:
+                                     # Parse the full JSON
+                                     json_data = json.loads(full_content)
+                                 except json.JSONDecodeError:
+                                     # If full content can't be parsed, stick with partial data
+                                     print(f"Warning: Could not parse full content of {file_info.filename}, using partial data")
+
+                             # Extract metadata (without former names)
+                             metadata = extract_metadata(json_data)
+
+                             # Get CIK and name for former names processing
+                             cik = metadata.get('cik', '')
+                             name = metadata.get('name', '')
+
+                             # Process former names with the full json_data
+                             former_names_records = process_former_names(json_data, cik, name)
+
+                             # Check if company is listed (has tickers)
+                             tickers = metadata.get('tickers', [])
+                             is_listed = tickers and isinstance(tickers, list) and len(tickers) > 0
+
+                             # Add to appropriate collections
+                             if is_listed:
+                                 listed_metadata.append(metadata)
+                                 listed_names.extend(former_names_records)
+                                 stats['listed_companies'] += 1
+                             else:
+                                 unlisted_metadata.append(metadata)
+                                 unlisted_names.extend(former_names_records)
+                                 stats['unlisted_companies'] += 1
+
+                             stats['total_processed'] += 1
+
+                         except json.JSONDecodeError as je:
+                             print(f"JSON parsing error in {file_info.filename}: {str(je)}")
+
+                     except Exception as e:
+                         # Handle any errors
+                         print(f"Error processing {file_info.filename}: {str(e)}")
+
+                     # Update the progress bar
+                     pbar.update(1)
+
+     finally:
+         # Clean up temporary file if we created one
+         if temp_file and os.path.exists(zip_path):
+             print(f"Removing temporary file: {zip_path}")
+             os.unlink(zip_path)
+
+     # Define output file paths (without .gz extension, it will be added in the write functions)
+     listed_metadata_path = os.path.join(output_dir, "listed_filer_metadata.csv")
+     unlisted_metadata_path = os.path.join(output_dir, "unlisted_filer_metadata.csv")
+     listed_names_path = os.path.join(output_dir, "listed_filer_names.csv")
+     unlisted_names_path = os.path.join(output_dir, "unlisted_filer_names.csv")
+
+     # Write listed metadata to CSV
+     if listed_metadata:
+         write_metadata_to_csv(listed_metadata, listed_metadata_path)
+
+     # Write unlisted metadata to CSV
+     if unlisted_metadata:
+         write_metadata_to_csv(unlisted_metadata, unlisted_metadata_path)
+
+     # Write listed names to CSV
+     if listed_names:
+         write_names_to_csv(listed_names, listed_names_path)
+
+     # Write unlisted names to CSV
+     if unlisted_names:
+         write_names_to_csv(unlisted_names, unlisted_names_path)
+
+     # Print summary
+     print(f"\nTotal files processed: {stats['total_processed']}")
+     print(f"Listed companies found: {stats['listed_companies']}")
+     print(f"Unlisted companies found: {stats['unlisted_companies']}")
+     print(f"Files requiring full content read: {stats['full_content_reads']}")
+     print(f"Output files written to {output_dir}")
+
+     return stats
+
+ # Convenience function to run the extractor
+ def process_submissions_metadata(output_dir, local_zip_path=None, sec_url="https://www.sec.gov/Archives/edgar/daily-index/bulkdata/submissions.zip", max_bytes=2000):
+     """
+     Convenience function to run the SEC Metadata Extractor.
+
+     Args:
+         output_dir (str): Directory for output CSV files
+         local_zip_path (str, optional): Path to a local ZIP file. If None, downloads from SEC to temp
+         sec_url (str): URL to download the SEC submissions ZIP file
+         max_bytes (int): Maximum number of bytes to extract from each file
+
+     Returns:
+         dict: Statistics about processed files
+     """
+     return asyncio.run(extract_and_process_metadata(
+         output_dir=output_dir,
+         local_zip_path=local_zip_path,
+         sec_url=sec_url,
+         max_bytes=max_bytes
+     ))
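
A usage sketch for the convenience wrapper defined above (not part of the diff). The module's import path is not shown in this diff, so the import below uses a hypothetical name; the file paths are illustrative, and downloading the full submissions archive from the SEC is a large transfer.

from submissions_metadata import process_submissions_metadata  # hypothetical module name; real path not shown in this diff

# Run against a local copy of submissions.zip; omit local_zip_path to download from the SEC instead.
stats = process_submissions_metadata(
    output_dir="./sec_metadata",        # gzipped CSVs are written here
    local_zip_path="./submissions.zip",
)
print(stats)  # {'total_processed': ..., 'listed_companies': ..., 'unlisted_companies': ..., 'full_content_reads': ...}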
File without changes