datamule 1.0.8__py3-none-any.whl → 1.0.9__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- datamule/book/__init__.py +0 -0
- datamule/book/book.py +34 -0
- datamule/mapping_dicts/__init__.py +0 -0
- datamule/mapping_dicts/txt_mapping_dicts.py +234 -0
- datamule/mapping_dicts/xml_mapping_dicts.py +19 -0
- datamule/sec/__init__.py +0 -0
- datamule/sec/infrastructure/__init__.py +0 -0
- datamule/sec/infrastructure/submissions_metadata.py +386 -0
- datamule/sec/rss/__init__.py +0 -0
- datamule/sec/rss/monitor.py +416 -0
- datamule/sec/submissions/__init__.py +0 -0
- datamule/sec/submissions/downloader.py +70 -0
- datamule/sec/submissions/eftsquery.py +502 -0
- datamule/sec/submissions/monitor.py +126 -0
- datamule/sec/submissions/streamer.py +228 -0
- datamule/sec/submissions/textsearch.py +122 -0
- datamule/sec/utils.py +64 -0
- datamule/sec/xbrl/__init__.py +0 -0
- datamule/sec/xbrl/downloadcompanyfacts.py +83 -0
- datamule/sec/xbrl/filter_xbrl.py +39 -0
- datamule/sec/xbrl/streamcompanyfacts.py +93 -0
- datamule/sec/xbrl/xbrlmonitor.py +143 -0
- datamule/seclibrary/__init__.py +0 -0
- datamule/seclibrary/downloader.py +286 -0
- datamule/seclibrary/query.py +181 -0
- {datamule-1.0.8.dist-info → datamule-1.0.9.dist-info}/METADATA +1 -1
- datamule-1.0.9.dist-info/RECORD +35 -0
- datamule-1.0.8.dist-info/RECORD +0 -10
- {datamule-1.0.8.dist-info → datamule-1.0.9.dist-info}/WHEEL +0 -0
- {datamule-1.0.8.dist-info → datamule-1.0.9.dist-info}/top_level.txt +0 -0
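The sections below reproduce the contents of each newly added module. As a quick orientation, here is a minimal import sketch (illustrative only, not shipped in the wheel or shown in this diff); it assumes datamule 1.0.9 and its runtime dependencies (aiohttp, tqdm, etc.) are installed.

# Illustrative smoke test of the new 1.0.9 module layout (not part of this diff).
from datamule.book.book import Book
from datamule.mapping_dicts.txt_mapping_dicts import dict_10k, dict_8k
from datamule.mapping_dicts.xml_mapping_dicts import dict_345
from datamule.sec.infrastructure.submissions_metadata import process_submissions_metadata

print(Book, sorted(dict_10k["rules"]), len(dict_345["transformations"]))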
datamule/book/__init__.py
ADDED
File without changes
datamule/book/book.py
ADDED
@@ -0,0 +1,34 @@
+from pathlib import Path
+from ..helper import _process_cik_and_metadata_filters, load_package_dataset
+from ..sec.xbrl.downloadcompanyfacts import download_company_facts
+
+class Book:
+    def __init__(self, path):
+        self.path = Path(path)
+
+    def download_xbrl(
+        self,
+        cik=None,
+        ticker=None,
+        **kwargs
+    ):
+        # If no CIK or ticker specified, get all companies with tickers
+        if cik is None and ticker is None:
+            cik = [row['cik'] for row in load_package_dataset('company_tickers')]
+
+        # Normalize cik to list format
+        if isinstance(cik, (str, int)):
+            cik = [cik]
+
+        # Process CIK and metadata filters
+        cik_list = _process_cik_and_metadata_filters(cik, ticker, **kwargs)
+
+        # Download facts for all CIKs in parallel
+        download_company_facts(cik=cik_list, output_dir=self.path)
+
+    def query_345():
+        pass
+    def query_xbrl():
+        pass
+    def query_13fhr():
+        pass
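A minimal usage sketch for the new Book class, based only on the code above: the constructor path becomes the output directory and download_xbrl fans out to download_company_facts. This is illustrative, assuming network access to the SEC and that _process_cik_and_metadata_filters resolves tickers as its name suggests; the query_* methods are still stubs in this release.

# Illustrative only (not part of the diff).
from datamule.book.book import Book

book = Book("./xbrl_facts")
book.download_xbrl(cik=320193)      # single CIK (Apple); a str or int is normalized to a list
book.download_xbrl(ticker="MSFT")   # by ticker, assuming the shared filter helper resolves it
# book.download_xbrl()              # no arguments: every company with a ticker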
datamule/mapping_dicts/__init__.py
ADDED
File without changes
datamule/mapping_dicts/txt_mapping_dicts.py
ADDED
@@ -0,0 +1,234 @@
+import copy
+
+dict_sgml = {
+    "rules": {
+        "join_text": "\n",
+        "remove": [
+            {
+                "pattern": r"^<PAGE>",
+            }
+        ],
+        "mappings": [
+            {
+                "name": "table",
+                "pattern": r"^<TABLE>",
+                "end": r"^</TABLE>"
+            },
+            {
+                "name": "caption",
+                "pattern": r"^<CAPTION>",
+                "end": r"^<S>",
+                "keep_end": True
+            },
+            {
+                "name": "footnote",
+                "pattern": r"^<FN>",
+                "end": r"^</FN>"
+            }
+        ]
+    }
+}
+
+item_pattern_mapping = r"^\n\n\s*(ITEM|Item)\s+(\d+[a-zA-Z]?|ONE|TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT|NINE|TEN|ELEVEN|TWELVE|THIRTEEN|FOURTEEN|FIFTEEN|SIXTEEN|[0-9]+[a-zA-Z]?)\.?"
+item_pattern_mapping_8k = r"^\n\n\s*(ITEM|Item)\s+(\d+(?:\.\d+)?[a-zA-Z]?|ONE|TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT|NINE|TEN|ELEVEN|TWELVE|THIRTEEN|FOURTEEN|FIFTEEN|SIXTEEN|[0-9]+[a-zA-Z]?)\.?"
+part_pattern_mapping = r"^\n\n\s*(PART|Part)\s+(?:I{1,3}|IV)\.?"
+
+item_pattern_standardization = r"^\s*(?:ITEM|Item)\s+(\d+[a-zA-Z]?|ONE|TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT|NINE|TEN|ELEVEN|TWELVE|THIRTEEN|FOURTEEN|FIFTEEN|SIXTEEN|[0-9]+[a-zA-Z]?)\.?"
+item_pattern_standardization_8k = r"^\s*(?:ITEM|Item)\s+(\d+(?:\.\d+)?[a-zA-Z]?|ONE|TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT|NINE|TEN|ELEVEN|TWELVE|THIRTEEN|FOURTEEN|FIFTEEN|SIXTEEN)\.?"
+part_pattern_standardization = r"^\s*(?:PART|Part)\s+([IVX]+)"
+
+
+dict_10k = copy.deepcopy(dict_sgml)
+dict_10k["rules"]["mappings"].extend([
+    {
+        "type": "hierarchy",
+        "name": "part",
+        "pattern": part_pattern_mapping,
+        "hierarchy": 0
+    },
+    {
+        "type": "hierarchy",
+        "name": "item",
+        "pattern": item_pattern_mapping,
+        "hierarchy": 1
+    },
+])
+
+# In the mapping dict:
+dict_10k['transformations'] = [
+    {
+        "type": "standardize",
+        "match": {
+            "type": "part",
+            "text_pattern": part_pattern_standardization
+        },
+        "output": {
+            "format": "part{}",
+            "field": "text"  # Where to store the standardized value
+        }
+    },
+    {
+        "type": "standardize",
+        "match": {
+            "type": "item",
+            "text_pattern": item_pattern_standardization
+        },
+        "output": {
+            "format": "item{}",
+            "field": "text"  # Could also be "text" or any other field name
+        }
+    },
+    {
+        "type": "merge_consecutive",
+        "match": {
+            "types": ["part", "item"]  # sections types to check for merging
+        }
+    },
+    {
+        "type": "trim",
+        "match": {
+            "type": "item",  # or "item"
+            "expected": 1
+        },
+        "output": {
+            "type": "introduction",
+            "separator": "\n"
+        }
+    }
+
+]
+
+dict_10q = copy.deepcopy(dict_sgml)
+dict_10q["rules"]["mappings"].extend([
+    {
+        "type": "hierarchy",
+        "name": "part",
+        "pattern": part_pattern_mapping,
+        "hierarchy": 0
+    },
+    {
+        "type": "hierarchy",
+        "name": "item",
+        "pattern": item_pattern_mapping,
+        "hierarchy": 1
+    },
+])
+
+# In the mapping dict:
+dict_10q['transformations'] = [
+    {
+        "type": "standardize",
+        "match": {
+            "type": "part",
+            "text_pattern": part_pattern_standardization
+        },
+        "output": {
+            "format": "part{}",
+            "field": "text"  # Where to store the standardized value
+        }
+    },
+    {
+        "type": "standardize",
+        "match": {
+            "type": "item",
+            "text_pattern": item_pattern_standardization
+        },
+        "output": {
+            "format": "item{}",
+            "field": "text"  # Could also be "text" or any other field name
+        }
+    },
+    {
+        "type": "merge_consecutive",
+        "match": {
+            "types": ["part", "item"]  # sections types to check for merging
+        }
+    },
+    {
+        "type": "trim",
+        "match": {
+            "type": "item",  # or "item"
+            "expected": 2
+        },
+        "output": {
+            "type": "introduction",
+            "separator": "\n"
+        }
+    }
+
+]
+
+dict_13d = copy.deepcopy(dict_sgml)
+dict_13d["rules"]["mappings"].extend([
+    {
+        "type": "hierarchy",
+        "name": "item",
+        "pattern": item_pattern_mapping,
+        "hierarchy": 0
+    },
+])
+
+dict_13d['transformations'] = [
+    {
+        "type": "standardize",
+        "match": {
+            "type": "item",
+            "text_pattern": item_pattern_standardization
+        },
+        "output": {
+            "format": "item{}",
+            "field": "text"  # Could also be "text" or any other field name
+        }
+    },
+    {
+        "type": "merge_consecutive",
+        "match": {
+            "types": ["item"]  # sections types to check for merging
+        }
+    }
+
+]
+
+dict_13g = copy.deepcopy(dict_13d)
+
+dict_8k = copy.deepcopy(dict_sgml)
+dict_8k["rules"]["mappings"].extend([
+    {
+        "type": "hierarchy",
+        "name": "item",
+        "pattern": item_pattern_mapping_8k,
+        "hierarchy": 0
+    },
+])
+
+dict_8k['transformations'] = [
+    {
+        "type": "standardize",
+        "match": {
+            "type": "item",
+            "text_pattern": item_pattern_standardization_8k
+        },
+        "output": {
+            "format": "item{}",
+            "field": "text"  # Could also be "text" or any other field name
+        }
+    },
+    {
+        "type": "merge_consecutive",
+        "match": {
+            "types": ["item"]  # sections types to check for merging
+        }
+    },
+    {
+        "type": "trim",
+        "match": {
+            "type": "item",  # or "item"
+            "expected": 1
+        },
+        "output": {
+            "type": "introduction",
+            "separator": "\n"
+        }
+    }
+
+]
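These dictionaries are configuration only; the parser that applies the rules and transformations lives elsewhere in the package and is not part of this diff. As a rough illustration, the sketch below uses a hypothetical helper (the form-type mapping is an assumption, not datamule API) to pick a dict per filing type and list the transformation steps it declares.

# Hypothetical helper, not part of datamule: inspect the mapping dicts.
from datamule.mapping_dicts.txt_mapping_dicts import (
    dict_10k, dict_10q, dict_8k, dict_13d, dict_13g,
)

_FORM_TO_DICT = {"10-K": dict_10k, "10-Q": dict_10q, "8-K": dict_8k,
                 "SC 13D": dict_13d, "SC 13G": dict_13g}

def describe_mapping(form_type):
    mapping = _FORM_TO_DICT[form_type]
    sections = [m["name"] for m in mapping["rules"]["mappings"]]
    steps = [t["type"] for t in mapping.get("transformations", [])]
    return sections, steps

# describe_mapping("10-K") ->
#   (['table', 'caption', 'footnote', 'part', 'item'],
#    ['standardize', 'standardize', 'merge_consecutive', 'trim'])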
datamule/mapping_dicts/xml_mapping_dicts.py
ADDED
@@ -0,0 +1,19 @@
+dict_345 = {
+    "transformations": [
+        {
+            "search": {
+                "key": "footnoteId",
+                "identifier": "@id"
+            },
+            "match": {
+                "identifier": "@id",
+                "content": "#text",
+                "remove_after_use": True
+            },
+            "output": {
+                "key": "footnote",
+                "value": "content"
+            }
+        }
+    ]
+}
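The "@id" and "#text" keys follow xmltodict-style conventions, so dict_345 reads as a recipe for inlining footnote text into the Form 3/4/5 nodes that reference it. The code that applies this config is not in this diff; the sketch below is only a guess at the intent, not datamule's implementation.

# Hypothetical illustration of the dict_345 intent (not the package's code):
# replace footnoteId references with the referenced footnote text.
def resolve_footnotes(doc, footnotes):
    # footnotes: list of {"@id": "F1", "#text": "..."} dicts from xmltodict
    by_id = {fn["@id"]: fn["#text"] for fn in footnotes}

    def walk(node):
        if isinstance(node, dict):
            ref = node.pop("footnoteId", None)            # "search" key
            if isinstance(ref, dict) and "@id" in ref:    # "match" identifier
                node["footnote"] = by_id.get(ref["@id"])  # "output" key
            for value in list(node.values()):
                walk(value)
        elif isinstance(node, list):
            for item in node:
                walk(item)

    walk(doc)
    return doc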
datamule/sec/__init__.py
ADDED
File without changes
datamule/sec/infrastructure/__init__.py
ADDED
File without changes
datamule/sec/infrastructure/submissions_metadata.py
ADDED
@@ -0,0 +1,386 @@
+import zipfile
+import os
+import json
+import csv
+import gzip
+import asyncio
+import aiohttp
+import tempfile
+from tqdm import tqdm
+from datetime import datetime
+from ..utils import headers
+
+async def download_sec_file(url, target_path):
+    """Download submissions.zip from SEC website with progress bar."""
+
+
+    async with aiohttp.ClientSession() as session:
+        async with session.get(url, headers=headers) as response:
+            if response.status != 200:
+                raise Exception(f"Failed to download: HTTP {response.status}")
+
+            file_size = int(response.headers.get('Content-Length', 0))
+
+            with tqdm(total=file_size, unit='B', unit_scale=True, desc="Downloading SEC data") as progress_bar:
+                with open(target_path, 'wb') as f:
+                    chunk_size = 1024 * 1024  # 1MB chunks
+                    downloaded = 0
+
+                    async for chunk in response.content.iter_chunked(chunk_size):
+                        f.write(chunk)
+                        downloaded += len(chunk)
+                        progress_bar.update(len(chunk))
+
+    print(f"Download complete: {target_path}")
+    return target_path
+
+def extract_metadata(data):
+    """Extract and flatten relevant company metadata from SEC submission data."""
+    result = {}
+
+    # Extract top-level fields, but exclude formerNames as it will be processed separately
+    for key in ['cik', 'entityType', 'sic', 'sicDescription', 'ownerOrg',
+                'insiderTransactionForOwnerExists', 'insiderTransactionForIssuerExists',
+                'name', 'tickers', 'exchanges', 'ein', 'description', 'category', 'fiscalYearEnd', 'stateOfIncorporation',
+                'stateOfIncorporationDescription', 'phone', 'flags']:
+        result[key] = data.get(key)
+
+    # Extract address fields
+    if 'addresses' in data:
+        for addr_type in ['mailing', 'business']:
+            if addr_type in data['addresses']:
+                addr = data['addresses'][addr_type]
+                for field in ['street1', 'street2', 'city', 'stateOrCountry', 'zipCode', 'stateOrCountryDescription']:
+                    result[f"{addr_type}_{field}"] = addr.get(field)
+
+    return result
+
+def extract_earliest_filing_date(data):
+    """Extract the earliest filing date from the full JSON data."""
+    earliest_date = None
+
+    # Try to get dates from the filings.files array first
+    if 'filings' in data and 'files' in data['filings'] and isinstance(data['filings']['files'], list):
+        for file_info in data['filings']['files']:
+            if isinstance(file_info, dict) and 'filingFrom' in file_info:
+                file_date = file_info.get('filingFrom', '')
+                if file_date and (earliest_date is None or file_date < earliest_date):
+                    earliest_date = file_date
+
+    # If no date found in files array, check filingDate array in filings.recent
+    if earliest_date is None and 'filings' in data and 'recent' in data['filings']:
+        if 'filingDate' in data['filings']['recent'] and isinstance(data['filings']['recent']['filingDate'], list):
+            filing_dates = data['filings']['recent']['filingDate']
+            for filing_date in filing_dates:
+                if filing_date and (earliest_date is None or filing_date < earliest_date):
+                    earliest_date = filing_date
+
+    return earliest_date
+
+def process_former_names(data, cik, current_name):
+    """Process former names into a list of records."""
+    former_names_records = []
+
+    # Process former names if present
+    former_names = data.get('formerNames', [])
+
+    # Track the latest end date to use for current name start date
+    latest_end_date = None
+
+    if former_names and isinstance(former_names, list):
+        for former_name in former_names:
+            if isinstance(former_name, dict):
+                # Extract name, start date, and end date
+                name = former_name.get('name', '')
+                start_date = former_name.get('from', '')
+                end_date = former_name.get('to', '')
+
+                # Clean up date formats (remove time component)
+                if start_date:
+                    start_date = start_date.split('T')[0]
+                if end_date:
+                    end_date = end_date.split('T')[0]
+                    # Track latest end date
+                    if not latest_end_date or end_date > latest_end_date:
+                        latest_end_date = end_date
+
+                # Create record for former name
+                record = {
+                    'name': name,
+                    'start_date': start_date,
+                    'end_date': end_date,
+                    'cik': cik
+                }
+
+                former_names_records.append(record)
+
+    # For the current name, if we don't have a start date from former names,
+    # we'll try to find the earliest filing date
+    if not latest_end_date:
+        latest_end_date = extract_earliest_filing_date(data)
+
+    # Add current name record with start date as latest end date
+    current_record = {
+        'name': current_name,
+        'start_date': latest_end_date if latest_end_date else '',
+        'end_date': '',  # Current name has no end date
+        'cik': cik
+    }
+
+    former_names_records.append(current_record)
+
+    return former_names_records
+
+def write_metadata_to_csv(metadata_list, output_path):
+    """Write metadata records to CSV and compress with gzip."""
+    if not metadata_list:
+        return
+
+    # Add .gz extension if not already present
+    if not output_path.endswith('.gz'):
+        output_path = output_path + '.gz'
+
+    # Get all possible field names across all records
+    fieldnames = set()
+    for metadata in metadata_list:
+        fieldnames.update(metadata.keys())
+
+    # Make sure 'name' and 'cik' come first
+    fieldnames = ['name', 'cik'] + [f for f in sorted(fieldnames) if f not in ['name', 'cik']]
+
+    # Write directly to gzipped CSV without using StringIO buffer
+    with gzip.open(output_path, 'wt', encoding='utf-8', newline='') as gzfile:
+        writer = csv.DictWriter(gzfile, fieldnames=fieldnames)
+        writer.writeheader()
+        writer.writerows(metadata_list)
+
+    print(f"Wrote {len(metadata_list)} records to {output_path}")
+
+def write_names_to_csv(names_list, output_path):
+    """Write name records to CSV and compress with gzip."""
+    if not names_list:
+        return
+
+    # Add .gz extension if not already present
+    if not output_path.endswith('.gz'):
+        output_path = output_path + '.gz'
+
+    # Names CSV has fixed columns
+    fieldnames = ['name', 'start_date', 'end_date', 'cik']
+
+    # Write directly to gzipped CSV
+    with gzip.open(output_path, 'wt', encoding='utf-8', newline='') as gzfile:
+        writer = csv.DictWriter(gzfile, fieldnames=fieldnames)
+        writer.writeheader()
+        writer.writerows(names_list)
+
+    print(f"Wrote {len(names_list)} records to {output_path}")
+
+async def extract_and_process_metadata(output_dir, local_zip_path=None, sec_url="https://www.sec.gov/Archives/edgar/daily-index/bulkdata/submissions.zip", max_bytes=2000):
+    """
+    Extracts metadata from JSON files in a ZIP archive and writes to multiple CSV files.
+    Can download the ZIP file from SEC to a temporary location if local_zip_path not provided.
+
+    Args:
+        output_dir (str): Directory for output CSV files
+        local_zip_path (str, optional): Path to a local ZIP file. If None, downloads from SEC to temp
+        sec_url (str): URL to download the SEC submissions ZIP file
+        max_bytes (int): Maximum number of bytes to extract from each file
+
+    Returns:
+        dict: Statistics about processed files
+    """
+    # Ensure output directory exists
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+
+    # Initialize collections for different types of data
+    listed_metadata = []
+    unlisted_metadata = []
+    listed_names = []
+    unlisted_names = []
+
+    stats = {
+        'total_processed': 0,
+        'listed_companies': 0,
+        'unlisted_companies': 0,
+        'full_content_reads': 0
+    }
+
+    # Use provided ZIP file or download to temporary file
+    if local_zip_path:
+        # Use local file
+        print(f"Using local ZIP file: {local_zip_path}")
+        zip_path = local_zip_path
+        temp_file = None
+    else:
+        # Download to temporary file
+        print(f"Downloading from SEC to temporary file: {sec_url}")
+        temp_file = tempfile.NamedTemporaryFile(suffix='.zip', delete=False)
+        temp_file.close()  # Close the file so we can download to it
+        zip_path = temp_file.name
+
+        try:
+            await download_sec_file(sec_url, zip_path)
+        except Exception as e:
+            # Clean up temp file if download fails
+            if os.path.exists(zip_path):
+                os.unlink(zip_path)
+            raise e
+
+    try:
+        # Process the ZIP file
+        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+            # Get list of files (excluding directories)
+            files = [f for f in zip_ref.infolist() if not f.is_dir()]
+            # Remove files that contain "submission" in their name
+            files = [f for f in files if "submission" not in f.filename]
+            # Remove placeholder.txt
+            files = [f for f in files if "placeholder.txt" not in f.filename]
+
+
+            # Create a progress bar
+            with tqdm(total=len(files), desc="Extracting metadata", unit="file") as pbar:
+                # Loop through all files in the ZIP archive
+                for file_info in files:
+                    try:
+                        # Initially read just a small chunk of the file
+                        with zip_ref.open(file_info.filename, 'r') as file:
+                            partial_content_bytes = file.read(max_bytes)
+
+                        # Convert to string
+                        partial_content = partial_content_bytes.decode('utf-8', errors='replace')
+
+                        # Truncate at "filings" and complete the JSON for initial parsing
+                        filings_index = partial_content.find('"filings":')
+                        if filings_index != -1:
+                            # Get content up to "filings"
+                            truncated_content = partial_content[:filings_index]
+                            # Remove trailing comma if present
+                            truncated_content = truncated_content.rstrip().rstrip(',')
+                            # Add closing brace
+                            partial_content = truncated_content + '}'
+                        else:
+                            # If "filings" not found, try to make valid JSON by adding closing brace
+                            partial_content = partial_content.rstrip().rstrip(',') + '}'
+
+                        try:
+                            # Parse the partial JSON to check for former names
+                            partial_json_data = json.loads(partial_content)
+
+                            # Check if we need full content (no former names or former names is empty list)
+                            former_names = partial_json_data.get('formerNames', [])
+                            need_full_content = not former_names or len(former_names) == 0
+
+                            # Initialize json_data with the partial data
+                            json_data = partial_json_data
+
+                            # If we need more data for filing dates, read the full file
+                            if need_full_content:
+                                stats['full_content_reads'] += 1
+
+                                # Read the entire file content
+                                with zip_ref.open(file_info.filename, 'r') as full_file:
+                                    full_content_bytes = full_file.read()
+                                    full_content = full_content_bytes.decode('utf-8', errors='replace')
+
+                                try:
+                                    # Parse the full JSON
+                                    json_data = json.loads(full_content)
+                                except json.JSONDecodeError:
+                                    # If full content can't be parsed, stick with partial data
+                                    print(f"Warning: Could not parse full content of {file_info.filename}, using partial data")
+
+                            # Extract metadata (without former names)
+                            metadata = extract_metadata(json_data)
+
+                            # Get CIK and name for former names processing
+                            cik = metadata.get('cik', '')
+                            name = metadata.get('name', '')
+
+                            # Process former names with the full json_data
+                            former_names_records = process_former_names(json_data, cik, name)
+
+                            # Check if company is listed (has tickers)
+                            tickers = metadata.get('tickers', [])
+                            is_listed = tickers and isinstance(tickers, list) and len(tickers) > 0
+
+                            # Add to appropriate collections
+                            if is_listed:
+                                listed_metadata.append(metadata)
+                                listed_names.extend(former_names_records)
+                                stats['listed_companies'] += 1
+                            else:
+                                unlisted_metadata.append(metadata)
+                                unlisted_names.extend(former_names_records)
+                                stats['unlisted_companies'] += 1
+
+                            stats['total_processed'] += 1
+
+                        except json.JSONDecodeError as je:
+                            print(f"JSON parsing error in {file_info.filename}: {str(je)}")
+
+                    except Exception as e:
+                        # Handle any errors
+                        print(f"Error processing {file_info.filename}: {str(e)}")
+
+                    # Update the progress bar
+                    pbar.update(1)
+
+    finally:
+        # Clean up temporary file if we created one
+        if temp_file and os.path.exists(zip_path):
+            print(f"Removing temporary file: {zip_path}")
+            os.unlink(zip_path)
+
+    # Define output file paths (without .gz extension, it will be added in the write functions)
+    listed_metadata_path = os.path.join(output_dir, "listed_filer_metadata.csv")
+    unlisted_metadata_path = os.path.join(output_dir, "unlisted_filer_metadata.csv")
+    listed_names_path = os.path.join(output_dir, "listed_filer_names.csv")
+    unlisted_names_path = os.path.join(output_dir, "unlisted_filer_names.csv")
+
+    # Write listed metadata to CSV
+    if listed_metadata:
+        write_metadata_to_csv(listed_metadata, listed_metadata_path)
+
+    # Write unlisted metadata to CSV
+    if unlisted_metadata:
+        write_metadata_to_csv(unlisted_metadata, unlisted_metadata_path)
+
+    # Write listed names to CSV
+    if listed_names:
+        write_names_to_csv(listed_names, listed_names_path)
+
+    # Write unlisted names to CSV
+    if unlisted_names:
+        write_names_to_csv(unlisted_names, unlisted_names_path)
+
+    # Print summary
+    print(f"\nTotal files processed: {stats['total_processed']}")
+    print(f"Listed companies found: {stats['listed_companies']}")
+    print(f"Unlisted companies found: {stats['unlisted_companies']}")
+    print(f"Files requiring full content read: {stats['full_content_reads']}")
+    print(f"Output files written to {output_dir}")
+
+    return stats
+
+# Convenience function to run the extractor
+def process_submissions_metadata(output_dir, local_zip_path=None, sec_url="https://www.sec.gov/Archives/edgar/daily-index/bulkdata/submissions.zip", max_bytes=2000):
+    """
+    Convenience function to run the SEC Metadata Extractor.
+
+    Args:
+        output_dir (str): Directory for output CSV files
+        local_zip_path (str, optional): Path to a local ZIP file. If None, downloads from SEC to temp
+        sec_url (str): URL to download the SEC submissions ZIP file
+        max_bytes (int): Maximum number of bytes to extract from each file
+
+    Returns:
+        dict: Statistics about processed files
+    """
+    return asyncio.run(extract_and_process_metadata(
+        output_dir=output_dir,
+        local_zip_path=local_zip_path,
+        sec_url=sec_url,
+        max_bytes=max_bytes
+    ))
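A short usage sketch for the new extractor, grounded in the signature above. The SEC submissions.zip bulk file is a very large download, so passing a previously downloaded copy via local_zip_path is usually preferable.

# Illustrative usage (not part of the diff). Writes four gzipped CSVs
# (listed/unlisted filer metadata and name histories) into output_dir.
from datamule.sec.infrastructure.submissions_metadata import process_submissions_metadata

stats = process_submissions_metadata(
    output_dir="sec_metadata",
    local_zip_path=None,   # or the path to an already-downloaded submissions.zip
)
print(stats)  # {'total_processed': ..., 'listed_companies': ..., ...}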
datamule/sec/rss/__init__.py
ADDED
File without changes