datamule 1.2.0__py3-none-any.whl → 1.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamule/__init__.py +3 -1
- datamule/document/__init__.py +0 -0
- datamule/document/document.py +255 -0
- datamule/document/processing.py +604 -0
- datamule/document/table.py +260 -0
- datamule/package_updater.py +31 -0
- datamule/portfolio.py +5 -3
- datamule/sec/submissions/downloader.py +14 -37
- datamule/seclibrary/downloader.py +50 -9
- datamule/submission.py +102 -7
- {datamule-1.2.0.dist-info → datamule-1.2.1.dist-info}/METADATA +1 -1
- {datamule-1.2.0.dist-info → datamule-1.2.1.dist-info}/RECORD +14 -10
- datamule/document.py +0 -465
- {datamule-1.2.0.dist-info → datamule-1.2.1.dist-info}/WHEEL +0 -0
- {datamule-1.2.0.dist-info → datamule-1.2.1.dist-info}/top_level.txt +0 -0
datamule/document/table.py ADDED
@@ -0,0 +1,260 @@
+from .mappings.atsn import *
+from .mappings.cfportal import *
+from .mappings.ex99a_sdr import *
+from .mappings.ex99c_sdr import *
+from .mappings.ex99g_sdr import *
+from .mappings.ex99i_sdr import *
+from .mappings.nmfp import *
+from .mappings.npx import *
+from .mappings.onefourtyfour import *
+from .mappings.ownership import *
+from .mappings.proxy_voting_record import *
+from .mappings.sbs import *
+from .mappings.sbsef import *
+from .mappings.schedule13 import *
+from .mappings.sdr import *
+from .mappings.ta import *
+from .mappings.thirteenfhr import *
+from .mappings.twentyfivense import *
+from .mappings.twentyfourf2nt import *
+from .mappings.information_table import *
+
+# need to check if mappings correctly create new columns
+class Table():
+    def __init__(self, data, type,accession):
+        if isinstance(data,dict):
+            data = [data]
+        self.type = type
+        self.data = data
+        self.accession = accession
+        self.columns = self.determine_columns()
+
+    def determine_columns(self):
+        if len(self.data) == 0:
+            return []
+        return self.data[0].keys()
+
+    def add_column(self,column_name,value):
+        for row in self.data:
+            row[column_name] = value
+
+    def map_data(self):
+        # Add the accession column to all rows first, ensuring it will be first
+        self.add_column('accession', self.accession)
+
+
+        # ATS-N, types: metadata_ats,cover_ats,part_one_ats,part_two_ats,part_three_ats,part_four_ats
+        if self.type == 'metadata_ats':
+            mapping_dict = metadata_ats_dict
+        elif self.type == 'cover_ats':
+            mapping_dict = cover_ats_dict
+        elif self.type == 'part_one_ats':
+            mapping_dict = part_one_ats_dict
+        elif self.type == 'part_two_ats':
+            mapping_dict = part_two_ats_dict
+        elif self.type == 'part_three_ats':
+            mapping_dict = part_three_ats_dict
+        elif self.type == 'part_four_ats':
+            mapping_dict = part_four_ats_dict
+        # CFPORTAL
+        elif self.type == 'metadata_cfportal':
+            mapping_dict = metadata_cfportal_dict
+        elif self.type == 'identifying_information_cfportal':
+            mapping_dict = identifying_information_cfportal_dict
+        elif self.type == 'form_of_organization_cfportal':
+            mapping_dict = form_of_organization_cfportal_dict
+        elif self.type == 'successions_cfportal':
+            mapping_dict = successions_cfportal_dict
+        elif self.type == 'control_relationships_cfportal':
+            mapping_dict = control_relationships_cfportal_dict
+        elif self.type == 'disclosure_answers_cfportal':
+            mapping_dict = disclosure_answers_cfportal_dict
+        elif self.type == 'non_securities_related_business_cfportal':
+            mapping_dict = non_securities_related_business_cfportal_dict
+        elif self.type == 'escrow_arrangements_cfportal':
+            mapping_dict = escrow_arrangements_cfportal_dict
+        elif self.type == 'execution_cfportal':
+            mapping_dict = execution_cfportal_dict
+        elif self.type == 'schedule_a_cfportal':
+            mapping_dict = schedule_a_cfportal_dict
+        elif self.type == 'schedule_b_cfportal':
+            mapping_dict = schedule_b_cfportal_dict
+        elif self.type == 'schedule_c_cfportal':
+            mapping_dict = schedule_c_cfportal_dict
+        elif self.type == 'schedule_d_cfportal':
+            mapping_dict = schedule_d_cfportal_dict
+        elif self.type == 'criminal_drip_info_cfportal':
+            mapping_dict = criminal_drip_info_cfportal_dict
+        elif self.type == 'regulatory_drip_info_cfportal':
+            mapping_dict = regulatory_drip_info_cfportal_dict
+        elif self.type == 'civil_judicial_drip_info_cfportal':
+            mapping_dict = civil_judicial_drip_info_cfportal_dict
+        elif self.type == 'bankruptcy_sipc_drip_info_cfportal':
+            mapping_dict = bankruptcy_sipc_drip_info_cfportal_dict
+        elif self.type == 'bond_drip_info_cfportal':
+            mapping_dict = bond_drip_info_cfportal_dict
+        elif self.type == 'judgement_drip_info_cfportal':
+            mapping_dict = judgement_drip_info_cfportal_dict
+
+        # SDR
+
+        # Information Table
+        elif self.type == 'information_table':
+            mapping_dict = information_table_dict
+
+        # NFMP
+        elif self.type == 'metadata_nmfp':
+            mapping_dict = metadata_nmfp_dict
+        elif self.type == 'general_information_nmfp':
+            mapping_dict = general_information_nmfp_dict
+        elif self.type == 'series_level_info_nmfp':
+            mapping_dict = series_level_info_nmfp_dict
+        elif self.type == 'class_level_info_nmfp':
+            mapping_dict = class_level_info_nmfp_dict
+        elif self.type == 'schedule_of_portfolio_securities_info_nmfp':
+            mapping_dict = schedule_of_portfolio_securities_info_nmfp_dict
+        elif self.type == 'signature_nmfp':
+            mapping_dict = signature_nmfp_dict
+
+        # NPX
+        elif self.type == 'npx':
+            mapping_dict = npx_dict
+
+        # 144
+        elif self.type == 'signatures_144':
+            mapping_dict = signatures_144_dict
+        elif self.type == 'securities_sold_in_past_3_months_144':
+            mapping_dict = securities_sold_in_past_3_months_144_dict
+        elif self.type == 'securities_to_be_sold_144':
+            mapping_dict = securities_to_be_sold_144_dict
+        elif self.type == 'securities_information_144':
+            mapping_dict = securities_information_144_dict
+        elif self.type == 'issuer_information_144':
+            mapping_dict = issuer_information_144_dict
+        elif self.type == 'metadata_144':
+            mapping_dict = metadata_144_dict
+
+        # Ownership
+        elif self.type == 'non_derivative_holding_ownership':
+            mapping_dict = non_derivative_holding_ownership_dict
+        elif self.type == 'non_derivative_transaction_ownership':
+            mapping_dict = non_derivative_transaction_ownership_dict
+        elif self.type == 'derivative_transaction_ownership':
+            mapping_dict = derivative_transaction_ownership_dict
+        elif self.type == 'derivative_holding_ownership':
+            mapping_dict = derivative_holding_ownership_dict
+        elif self.type == 'reporting_owner_ownership':
+            mapping_dict = reporting_owner_ownership_dict
+        elif self.type == 'metadata_ownership':
+            mapping_dict = metadata_ownership_dict
+        elif self.type == 'owner_signature_ownership':
+            mapping_dict = owner_signature_ownership_dict
+
+        # Proxy Voting Record
+        elif self.type == 'proxy_voting_record':
+            mapping_dict = proxy_voting_record_dict
+
+        # SBS
+
+        # SBSEF
+        elif self.type == 'sbsef':
+            mapping_dict = sbsef_dict
+
+        # Schedule 13
+        elif self.type == 'metadata_schedule_13':
+            mapping_dict = metadata_schedule_13_dict
+        elif self.type == 'cover_schedule_13':
+            mapping_dict = cover_schedule_13_dict
+        elif self.type == 'reporting_person_details_schedule_13':
+            mapping_dict = reporting_person_details_schedule_13_dict
+        elif self.type == 'item_1_schedule_13':
+            mapping_dict = item_1_schedule_13_dict
+        elif self.type == 'item_2_schedule_13':
+            mapping_dict = item_2_schedule_13_dict
+        elif self.type == 'item_3_schedule_13':
+            mapping_dict = item_3_schedule_13_dict
+        elif self.type == 'item_4_schedule_13':
+            mapping_dict = item_4_schedule_13_dict
+        elif self.type == 'item_5_schedule_13':
+            mapping_dict = item_5_schedule_13_dict
+        elif self.type == 'item_6_schedule_13':
+            mapping_dict = item_6_schedule_13_dict
+        elif self.type == 'item_7_schedule_13':
+            mapping_dict = item_7_schedule_13_dict
+        elif self.type == 'item_8_schedule_13':
+            mapping_dict = item_8_schedule_13_dict
+        elif self.type == 'item_9_schedule_13':
+            mapping_dict = item_9_schedule_13_dict
+        elif self.type == 'item_10_schedule_13':
+            mapping_dict = item_10_schedule_13_dict
+        elif self.type == 'signature_schedule_13':
+            mapping_dict = signature_schedule_13_dict
+
+        # SDR
+        elif self.type == 'sdr':
+            mapping_dict = sdr_dict
+
+        # TA
+
+        # 13F-HR
+        elif self.type == '13fhr':
+            mapping_dict = thirteenfhr_dict
+
+        # 25-NSE
+        elif self.type == '25nse':
+            mapping_dict = twentyfive_nse_dict
+
+        # 24F-2NT
+        elif self.type == 'metadata_24f_2nt':
+            mapping_dict = metadata_24f_2nt_dict
+        elif self.type == 'item_1_24f2nt':
+            mapping_dict = item_1_24f2nt_dict
+        elif self.type == 'item_2_24f2nt':
+            mapping_dict = item_2_24f2nt_dict
+        elif self.type == 'item_3_24f2nt':
+            mapping_dict = item_3_24f2nt_dict
+        elif self.type == 'item_4_24f2nt':
+            mapping_dict = item_4_24f2nt_dict
+        elif self.type == 'item_5_24f2nt':
+            mapping_dict = item_5_24f2nt_dict
+        elif self.type == 'item_6_24f2nt':
+            mapping_dict = item_6_24f2nt_dict
+        elif self.type == 'item_7_24f2nt':
+            mapping_dict = item_7_24f2nt_dict
+        elif self.type == 'item_8_24f2nt':
+            mapping_dict = item_8_24f2nt_dict
+        elif self.type == 'item_9_24f2nt':
+            mapping_dict = item_9_24f2nt_dict
+        elif self.type == 'signature_info_schedule_a':
+            mapping_dict = signature_24f2nt_dict
+
+        else:
+            mapping_dict = {}
+
+        # Update mapping dictionary to include accession at the beginning
+        # Create a new mapping with accession as the first key
+        new_mapping = {'accession': 'accession'}
+        # Add the rest of the mapping
+        new_mapping.update(mapping_dict)
+        mapping_dict = new_mapping
+
+        # apply the mapping to the data
+        for row in self.data:
+            ordered_row = {}
+            # First add all keys from the mapping dict in order
+            for old_key, new_key in mapping_dict.items():
+                if old_key in row:
+                    ordered_row[new_key] = row.pop(old_key)
+                else:
+                    # if the old key is not present, set the new key to None
+                    ordered_row[new_key] = None
+
+            # Then add any remaining keys that weren't in the mapping
+            for key, value in row.items():
+                ordered_row[key] = value
+
+            # Replace the original row with the ordered row
+            row.clear()
+            row.update(ordered_row)
+
+        self.determine_columns()
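
For orientation, here is a minimal usage sketch of the new Table class. It is illustrative only: the import path follows the module layout shown in this diff, and the row keys below are hypothetical placeholders rather than fields from a real filing.

    from datamule.document.table import Table  # module path as added in 1.2.1

    # hypothetical rows as parsed from a Form 3/4/5 ownership document
    rows = [{'transactionDate': '2025-01-02', 'transactionShares': '100'}]
    table = Table(rows, type='non_derivative_transaction_ownership', accession='0001234567-25-000001')
    table.map_data()  # prepends an 'accession' column, then renames/reorders keys via the ownership mapping dict

Passing a single dict instead of a list also works; the constructor wraps it in a list before processing.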
datamule/package_updater.py ADDED
@@ -0,0 +1,31 @@
+
+from pathlib import Path
+import urllib.request
+import gzip
+import shutil
+import os
+
+class PackageUpdater():
+    def __init__(self):
+        pass
+
+    def update_package_data():
+        # Create data directory in user's home
+        data_dir = Path.home() / ".datamule"
+        data_dir.mkdir(exist_ok=True)
+
+        # Download data file
+        file_url = "https://github.com/john-friedman/datamule-data/raw/master/data/filer_metadata/listed_filer_metadata.csv.gz"
+        file_path = data_dir / "listed_filer_metadata.csv"
+        temp_gz_path = data_dir / "listed_filer_metadata.csv.gz"
+
+        if not file_path.exists():
+            print(f"Downloading data to {data_dir}")
+            urllib.request.urlretrieve(file_url, temp_gz_path)
+
+            with gzip.open(temp_gz_path, 'rb') as f_in:
+                with open(file_path, 'wb') as f_out:
+                    shutil.copyfileobj(f_in, f_out)
+
+            os.remove(temp_gz_path)
+            print(f"Data downloaded to {file_path}")
datamule/portfolio.py CHANGED
@@ -119,7 +119,7 @@ class Portfolio:
         # First query, just set the accession numbers
         self.accession_numbers = new_accession_numbers
 
-    def download_submissions(self, cik=None, ticker=None, submission_type=None, filing_date=None, provider=None,requests_per_second=5, **kwargs):
+    def download_submissions(self, cik=None, ticker=None, submission_type=None, filing_date=None, provider=None,document_type=None,requests_per_second=5, **kwargs):
         if provider is None:
             config = Config()
             provider = config.get_default_source()
@@ -134,7 +134,8 @@ class Portfolio:
                 cik=cik,
                 submission_type=submission_type,
                 filing_date=filing_date,
-                accession_numbers=self.accession_numbers if hasattr(self, 'accession_numbers') else None
+                accession_numbers=self.accession_numbers if hasattr(self, 'accession_numbers') else None,
+                keep_document_types=document_type
             )
         else:
             sec_download(
@@ -143,7 +144,8 @@ class Portfolio:
                 submission_type=submission_type,
                 filing_date=filing_date,
                 requests_per_second=requests_per_second,
-                accession_numbers=self.accession_numbers if hasattr(self, 'accession_numbers') else None
+                accession_numbers=self.accession_numbers if hasattr(self, 'accession_numbers') else None,
+                keep_document_types=document_type
             )
 
         self.submissions_loaded = False
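
A hypothetical call pattern for the new document_type filter (a sketch only; the Portfolio constructor argument and the top-level export are assumed from earlier datamule releases, not from this diff):

    from datamule import Portfolio

    portfolio = Portfolio('apple_filings')  # hypothetical output directory
    portfolio.download_submissions(
        ticker='AAPL',
        submission_type='10-K',
        document_type=['10-K'],  # new in 1.2.1; forwarded to the downloaders as keep_document_types
    )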
datamule/sec/submissions/downloader.py CHANGED
@@ -1,35 +1,19 @@
 import os
 import json
 from .streamer import stream
-from secsgml import parse_sgml_submission_into_memory
 import aiofiles
+from ...submission import Submission
 
-async def download_callback(hit, content, cik, accno, url, output_dir="filings"):
+async def download_callback(hit, content, cik, accno, url, output_dir="filings", keep_document_types=None):
     """Save downloaded SEC submission to disk."""
     try:
-        #
-
+        # Create a Submission object directly from the content
+        # Note: the content needs to be decoded from bytes to string for the parser
+        submission = Submission(sgml_content=content.decode('utf-8', errors='replace'),
+                                keep_document_types=keep_document_types)
 
-        #
-        file_dir =
-        os.makedirs(file_dir, exist_ok=True)
-
-        # Save metadata
-        metadata_path = os.path.join(file_dir, "metadata.json")
-        async with aiofiles.open(metadata_path, 'w') as f:
-            await f.write(json.dumps(metadata, indent=4))
-
-        # Save all documents
-        for idx, _ in enumerate(metadata['documents']):
-            try:
-                filename = metadata['documents'][idx]['filename']
-            except (KeyError, IndexError):
-                filename = f"{metadata['documents'][idx].get('sequence', idx)}.txt"
-
-            # Use async file writing
-            doc_path = os.path.join(file_dir, filename)
-            async with aiofiles.open(doc_path, 'wb') as f:
-                await f.write(documents[idx])
+        # Use the async save method to write the submission to disk
+        file_dir = await submission.save_async(output_dir=output_dir)
 
         return file_dir
     except Exception as e:
@@ -37,7 +21,8 @@ async def download_callback(hit, content, cik, accno, url, output_dir="filings")
         return None
 
 def download(cik=None, submission_type=None, filing_date=None, location=None, name=None,
-             requests_per_second=5, output_dir="filings", accession_numbers=None,
+             requests_per_second=5, output_dir="filings", accession_numbers=None,
+             quiet=False, keep_document_types=None):
     """
     Download SEC EDGAR filings and extract their documents.
 
@@ -51,27 +36,19 @@ def download(cik=None, submission_type=None, filing_date=None, location=None, na
     - output_dir: Directory to save documents
     - accession_numbers: Optional list of accession numbers to filter by
    - quiet: Whether to suppress progress output
+    - keep_document_types: Optional list of document types to keep (e.g. ['10-K', 'EX-10.1'])
 
     Returns:
     - List of all document paths processed
-
-    Examples:
-    # Download filings by CIK
-    download(cik="1318605", submission_type="10-K")
-
-    # Download filings by company name
-    download(name="Tesla", submission_type="10-K")
-
-    # Download filings with location filter
-    download(name="Apple", location="CA", submission_type="10-K")
     """
-
     # Make sure output directory exists
     os.makedirs(output_dir, exist_ok=True)
 
     # Create a wrapper for the download_callback that includes the output_dir
     async def callback_wrapper(hit, content, cik, accno, url):
-        return await download_callback(hit, content, cik, accno, url,
+        return await download_callback(hit, content, cik, accno, url,
+                                       output_dir=output_dir,
+                                       keep_document_types=keep_document_types)
 
     # Call the stream function with our callback
     return stream(
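
A sketch of the updated EDGAR download entry point with the new keep_document_types filter (illustrative; the module path is taken from the file list above, and the CIK is the Tesla example removed from the old docstring):

    from datamule.sec.submissions.downloader import download

    # Keep only the primary 10-K document and a specific exhibit from each submission.
    download(
        cik='1318605',
        submission_type='10-K',
        output_dir='filings',
        keep_document_types=['10-K', 'EX-10.1'],
    )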
datamule/seclibrary/downloader.py CHANGED
@@ -16,17 +16,35 @@ from threading import Thread
 from secsgml import parse_sgml_submission
 from .query import query
 from os import cpu_count
+from ..submission import Submission
 
 class Downloader:
     def __init__(self, api_key=None):
         self.BASE_URL = "https://library.datamule.xyz/original/nc/"
         self.CHUNK_SIZE = 2 * 1024 * 1024
-        self.MAX_CONCURRENT_DOWNLOADS =
+        self.MAX_CONCURRENT_DOWNLOADS = 100
         self.MAX_DECOMPRESSION_WORKERS = cpu_count()
         self.MAX_PROCESSING_WORKERS = cpu_count()
         self.QUEUE_SIZE = 10
         if api_key is not None:
             self._api_key = api_key
+        # Create a shared event loop for async operations
+        self.loop = asyncio.new_event_loop()
+        # Create a thread to run the event loop
+        self.loop_thread = Thread(target=self._run_event_loop, daemon=True)
+        self.loop_thread.start()
+        # Create a queue for async tasks
+        self.async_queue = Queue()
+
+    def _run_event_loop(self):
+        """Run the event loop in a separate thread"""
+        asyncio.set_event_loop(self.loop)
+        self.loop.run_forever()
+
+    def _run_coroutine(self, coro):
+        """Run a coroutine in the event loop and return its result"""
+        future = asyncio.run_coroutine_threadsafe(coro, self.loop)
+        return future.result()
 
     @property
     def api_key(self):
@@ -55,7 +73,7 @@ class Downloader:
             print(f"Failed to log error to {error_file}: {str(e)}")
 
     class FileProcessor:
-        def __init__(self, output_dir, max_workers, queue_size, pbar, downloader):
+        def __init__(self, output_dir, max_workers, queue_size, pbar, downloader, keep_document_types=None):
             self.processing_queue = Queue(maxsize=queue_size)
             self.should_stop = False
             self.processing_workers = []
@@ -64,6 +82,7 @@ class Downloader:
             self.batch_size = 50
             self.pbar = pbar
             self.downloader = downloader
+            self.keep_document_types = keep_document_types
 
         def start_processing_workers(self):
             for _ in range(self.max_workers):
@@ -75,7 +94,9 @@ class Downloader:
         def _process_file(self, item):
             filename, content = item
             try:
-
+                submission = Submission(sgml_content=content, keep_document_types=self.keep_document_types)
+                # Use the shared event loop to run save_async
+                self.downloader._run_coroutine(submission.save_async(output_dir=self.output_dir))
                 self.pbar.update(1)
             except Exception as e:
                 accession_dir = os.path.join(self.output_dir, filename.split('.')[0])
@@ -189,11 +210,11 @@ class Downloader:
             except Exception as e:
                 self._log_error(output_dir, filename, str(e))
 
-    async def process_batch(self, urls, output_dir):
+    async def process_batch(self, urls, output_dir, keep_document_types=None):
         os.makedirs(output_dir, exist_ok=True)
 
         with tqdm(total=len(urls), desc="Processing files") as pbar:
-            processor = self.FileProcessor(output_dir, self.MAX_PROCESSING_WORKERS, self.QUEUE_SIZE, pbar, self)
+            processor = self.FileProcessor(output_dir, self.MAX_PROCESSING_WORKERS, self.QUEUE_SIZE, pbar, self, keep_document_types=keep_document_types)
             processor.start_processing_workers()
 
             semaphore = asyncio.Semaphore(self.MAX_CONCURRENT_DOWNLOADS)
@@ -216,7 +237,7 @@ class Downloader:
             processor.stop_workers()
             decompression_pool.shutdown()
 
-    def download(self, submission_type=None, cik=None, filing_date=None, output_dir="downloads", accession_numbers=None):
+    def download(self, submission_type=None, cik=None, filing_date=None, output_dir="downloads", accession_numbers=None, keep_document_types=None):
         """
         Query SEC filings and download/process them.
 
@@ -225,6 +246,8 @@ class Downloader:
         - cik: Company CIK number(s), string, int, or list
         - filing_date: Filing date(s), string, list, or tuple of (start_date, end_date)
         - output_dir: Directory to save downloaded files
+        - accession_numbers: List of specific accession numbers to download
+        - keep_document_types: List of document types to keep (e.g., ['10-K', 'EX-10.1'])
         """
         if self.api_key is None:
             raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key in constructor")
@@ -262,15 +285,32 @@ class Downloader:
         start_time = time.time()
 
         # Process the batch asynchronously
-        asyncio.run(self.process_batch(urls, output_dir))
+        asyncio.run(self.process_batch(urls, output_dir, keep_document_types=keep_document_types))
 
         # Calculate and display performance metrics
         elapsed_time = time.time() - start_time
         print(f"\nProcessing completed in {elapsed_time:.2f} seconds")
         print(f"Processing speed: {len(urls)/elapsed_time:.2f} files/second")
+
+    def __del__(self):
+        """Cleanup when the downloader is garbage collected"""
+        if hasattr(self, 'loop') and self.loop.is_running():
+            self.loop.call_soon_threadsafe(self.loop.stop)
 
 
-def download(submission_type=None, cik=None, filing_date=None, api_key=None, output_dir="downloads", accession_numbers=None):
+def download(submission_type=None, cik=None, filing_date=None, api_key=None, output_dir="downloads", accession_numbers=None, keep_document_types=None):
+    """
+    Query SEC filings and download/process them.
+
+    Parameters:
+    - submission_type: Filing type(s), string or list (e.g., '10-K', ['10-K', '10-Q'])
+    - cik: Company CIK number(s), string, int, or list
+    - filing_date: Filing date(s), string, list, or tuple of (start_date, end_date)
+    - api_key: API key for datamule service (optional if DATAMULE_API_KEY env var is set)
+    - output_dir: Directory to save downloaded files
+    - accession_numbers: List of specific accession numbers to download
+    - keep_document_types: List of document types to keep (e.g., ['10-K', 'EX-10.1'])
+    """
    if accession_numbers:
        accession_numbers = [int(str(x).replace('-', '')) for x in accession_numbers]
        # check if acc no is empty list
@@ -282,5 +322,6 @@ def download(submission_type=None, cik=None, filing_date=None, api_key=None, out
         cik=cik,
         filing_date=filing_date,
         output_dir=output_dir,
-        accession_numbers=accession_numbers
+        accession_numbers=accession_numbers,
+        keep_document_types=keep_document_types
     )