datamule 1.1.8__py3-none-any.whl → 1.2.1__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- datamule/__init__.py +3 -1
- datamule/document/__init__.py +0 -0
- datamule/document/document.py +255 -0
- datamule/document/processing.py +604 -0
- datamule/document/table.py +260 -0
- datamule/package_updater.py +31 -0
- datamule/portfolio.py +5 -3
- datamule/sec/submissions/downloader.py +14 -37
- datamule/seclibrary/bq.py +349 -12
- datamule/seclibrary/downloader.py +50 -9
- datamule/sheet.py +458 -34
- datamule/submission.py +102 -7
- {datamule-1.1.8.dist-info → datamule-1.2.1.dist-info}/METADATA +1 -1
- {datamule-1.1.8.dist-info → datamule-1.2.1.dist-info}/RECORD +16 -12
- {datamule-1.1.8.dist-info → datamule-1.2.1.dist-info}/WHEEL +1 -1
- datamule/document.py +0 -472
- {datamule-1.1.8.dist-info → datamule-1.2.1.dist-info}/top_level.txt +0 -0
datamule/document/table.py
ADDED
@@ -0,0 +1,260 @@
+from .mappings.atsn import *
+from .mappings.cfportal import *
+from .mappings.ex99a_sdr import *
+from .mappings.ex99c_sdr import *
+from .mappings.ex99g_sdr import *
+from .mappings.ex99i_sdr import *
+from .mappings.nmfp import *
+from .mappings.npx import *
+from .mappings.onefourtyfour import *
+from .mappings.ownership import *
+from .mappings.proxy_voting_record import *
+from .mappings.sbs import *
+from .mappings.sbsef import *
+from .mappings.schedule13 import *
+from .mappings.sdr import *
+from .mappings.ta import *
+from .mappings.thirteenfhr import *
+from .mappings.twentyfivense import *
+from .mappings.twentyfourf2nt import *
+from .mappings.information_table import *
+
+# need to check if mappings correctly create new columns
+class Table():
+    def __init__(self, data, type, accession):
+        if isinstance(data, dict):
+            data = [data]
+        self.type = type
+        self.data = data
+        self.accession = accession
+        self.columns = self.determine_columns()
+
+    def determine_columns(self):
+        if len(self.data) == 0:
+            return []
+        return self.data[0].keys()
+
+    def add_column(self, column_name, value):
+        for row in self.data:
+            row[column_name] = value
+
+    def map_data(self):
+        # Add the accession column to all rows first, ensuring it will be first
+        self.add_column('accession', self.accession)
+
+
+        # ATS-N, types: metadata_ats, cover_ats, part_one_ats, part_two_ats, part_three_ats, part_four_ats
+        if self.type == 'metadata_ats':
+            mapping_dict = metadata_ats_dict
+        elif self.type == 'cover_ats':
+            mapping_dict = cover_ats_dict
+        elif self.type == 'part_one_ats':
+            mapping_dict = part_one_ats_dict
+        elif self.type == 'part_two_ats':
+            mapping_dict = part_two_ats_dict
+        elif self.type == 'part_three_ats':
+            mapping_dict = part_three_ats_dict
+        elif self.type == 'part_four_ats':
+            mapping_dict = part_four_ats_dict
+        # CFPORTAL
+        elif self.type == 'metadata_cfportal':
+            mapping_dict = metadata_cfportal_dict
+        elif self.type == 'identifying_information_cfportal':
+            mapping_dict = identifying_information_cfportal_dict
+        elif self.type == 'form_of_organization_cfportal':
+            mapping_dict = form_of_organization_cfportal_dict
+        elif self.type == 'successions_cfportal':
+            mapping_dict = successions_cfportal_dict
+        elif self.type == 'control_relationships_cfportal':
+            mapping_dict = control_relationships_cfportal_dict
+        elif self.type == 'disclosure_answers_cfportal':
+            mapping_dict = disclosure_answers_cfportal_dict
+        elif self.type == 'non_securities_related_business_cfportal':
+            mapping_dict = non_securities_related_business_cfportal_dict
+        elif self.type == 'escrow_arrangements_cfportal':
+            mapping_dict = escrow_arrangements_cfportal_dict
+        elif self.type == 'execution_cfportal':
+            mapping_dict = execution_cfportal_dict
+        elif self.type == 'schedule_a_cfportal':
+            mapping_dict = schedule_a_cfportal_dict
+        elif self.type == 'schedule_b_cfportal':
+            mapping_dict = schedule_b_cfportal_dict
+        elif self.type == 'schedule_c_cfportal':
+            mapping_dict = schedule_c_cfportal_dict
+        elif self.type == 'schedule_d_cfportal':
+            mapping_dict = schedule_d_cfportal_dict
+        elif self.type == 'criminal_drip_info_cfportal':
+            mapping_dict = criminal_drip_info_cfportal_dict
+        elif self.type == 'regulatory_drip_info_cfportal':
+            mapping_dict = regulatory_drip_info_cfportal_dict
+        elif self.type == 'civil_judicial_drip_info_cfportal':
+            mapping_dict = civil_judicial_drip_info_cfportal_dict
+        elif self.type == 'bankruptcy_sipc_drip_info_cfportal':
+            mapping_dict = bankruptcy_sipc_drip_info_cfportal_dict
+        elif self.type == 'bond_drip_info_cfportal':
+            mapping_dict = bond_drip_info_cfportal_dict
+        elif self.type == 'judgement_drip_info_cfportal':
+            mapping_dict = judgement_drip_info_cfportal_dict
+
+        # SDR
+
+        # Information Table
+        elif self.type == 'information_table':
+            mapping_dict = information_table_dict
+
+        # NFMP
+        elif self.type == 'metadata_nmfp':
+            mapping_dict = metadata_nmfp_dict
+        elif self.type == 'general_information_nmfp':
+            mapping_dict = general_information_nmfp_dict
+        elif self.type == 'series_level_info_nmfp':
+            mapping_dict = series_level_info_nmfp_dict
+        elif self.type == 'class_level_info_nmfp':
+            mapping_dict = class_level_info_nmfp_dict
+        elif self.type == 'schedule_of_portfolio_securities_info_nmfp':
+            mapping_dict = schedule_of_portfolio_securities_info_nmfp_dict
+        elif self.type == 'signature_nmfp':
+            mapping_dict = signature_nmfp_dict
+
+        # NPX
+        elif self.type == 'npx':
+            mapping_dict = npx_dict
+
+        # 144
+        elif self.type == 'signatures_144':
+            mapping_dict = signatures_144_dict
+        elif self.type == 'securities_sold_in_past_3_months_144':
+            mapping_dict = securities_sold_in_past_3_months_144_dict
+        elif self.type == 'securities_to_be_sold_144':
+            mapping_dict = securities_to_be_sold_144_dict
+        elif self.type == 'securities_information_144':
+            mapping_dict = securities_information_144_dict
+        elif self.type == 'issuer_information_144':
+            mapping_dict = issuer_information_144_dict
+        elif self.type == 'metadata_144':
+            mapping_dict = metadata_144_dict
+
+        # Ownership
+        elif self.type == 'non_derivative_holding_ownership':
+            mapping_dict = non_derivative_holding_ownership_dict
+        elif self.type == 'non_derivative_transaction_ownership':
+            mapping_dict = non_derivative_transaction_ownership_dict
+        elif self.type == 'derivative_transaction_ownership':
+            mapping_dict = derivative_transaction_ownership_dict
+        elif self.type == 'derivative_holding_ownership':
+            mapping_dict = derivative_holding_ownership_dict
+        elif self.type == 'reporting_owner_ownership':
+            mapping_dict = reporting_owner_ownership_dict
+        elif self.type == 'metadata_ownership':
+            mapping_dict = metadata_ownership_dict
+        elif self.type == 'owner_signature_ownership':
+            mapping_dict = owner_signature_ownership_dict
+
+        # Proxy Voting Record
+        elif self.type == 'proxy_voting_record':
+            mapping_dict = proxy_voting_record_dict
+
+        # SBS
+
+        # SBSEF
+        elif self.type == 'sbsef':
+            mapping_dict = sbsef_dict
+
+        # Schedule 13
+        elif self.type == 'metadata_schedule_13':
+            mapping_dict = metadata_schedule_13_dict
+        elif self.type == 'cover_schedule_13':
+            mapping_dict = cover_schedule_13_dict
+        elif self.type == 'reporting_person_details_schedule_13':
+            mapping_dict = reporting_person_details_schedule_13_dict
+        elif self.type == 'item_1_schedule_13':
+            mapping_dict = item_1_schedule_13_dict
+        elif self.type == 'item_2_schedule_13':
+            mapping_dict = item_2_schedule_13_dict
+        elif self.type == 'item_3_schedule_13':
+            mapping_dict = item_3_schedule_13_dict
+        elif self.type == 'item_4_schedule_13':
+            mapping_dict = item_4_schedule_13_dict
+        elif self.type == 'item_5_schedule_13':
+            mapping_dict = item_5_schedule_13_dict
+        elif self.type == 'item_6_schedule_13':
+            mapping_dict = item_6_schedule_13_dict
+        elif self.type == 'item_7_schedule_13':
+            mapping_dict = item_7_schedule_13_dict
+        elif self.type == 'item_8_schedule_13':
+            mapping_dict = item_8_schedule_13_dict
+        elif self.type == 'item_9_schedule_13':
+            mapping_dict = item_9_schedule_13_dict
+        elif self.type == 'item_10_schedule_13':
+            mapping_dict = item_10_schedule_13_dict
+        elif self.type == 'signature_schedule_13':
+            mapping_dict = signature_schedule_13_dict
+
+        # SDR
+        elif self.type == 'sdr':
+            mapping_dict = sdr_dict
+
+        # TA
+
+        # 13F-HR
+        elif self.type == '13fhr':
+            mapping_dict = thirteenfhr_dict
+
+        # 25-NSE
+        elif self.type == '25nse':
+            mapping_dict = twentyfive_nse_dict
+
+        # 24F-2NT
+        elif self.type == 'metadata_24f_2nt':
+            mapping_dict = metadata_24f_2nt_dict
+        elif self.type == 'item_1_24f2nt':
+            mapping_dict = item_1_24f2nt_dict
+        elif self.type == 'item_2_24f2nt':
+            mapping_dict = item_2_24f2nt_dict
+        elif self.type == 'item_3_24f2nt':
+            mapping_dict = item_3_24f2nt_dict
+        elif self.type == 'item_4_24f2nt':
+            mapping_dict = item_4_24f2nt_dict
+        elif self.type == 'item_5_24f2nt':
+            mapping_dict = item_5_24f2nt_dict
+        elif self.type == 'item_6_24f2nt':
+            mapping_dict = item_6_24f2nt_dict
+        elif self.type == 'item_7_24f2nt':
+            mapping_dict = item_7_24f2nt_dict
+        elif self.type == 'item_8_24f2nt':
+            mapping_dict = item_8_24f2nt_dict
+        elif self.type == 'item_9_24f2nt':
+            mapping_dict = item_9_24f2nt_dict
+        elif self.type == 'signature_info_schedule_a':
+            mapping_dict = signature_24f2nt_dict
+
+        else:
+            mapping_dict = {}
+
+        # Update mapping dictionary to include accession at the beginning
+        # Create a new mapping with accession as the first key
+        new_mapping = {'accession': 'accession'}
+        # Add the rest of the mapping
+        new_mapping.update(mapping_dict)
+        mapping_dict = new_mapping
+
+        # apply the mapping to the data
+        for row in self.data:
+            ordered_row = {}
+            # First add all keys from the mapping dict in order
+            for old_key, new_key in mapping_dict.items():
+                if old_key in row:
+                    ordered_row[new_key] = row.pop(old_key)
+                else:
+                    # if the old key is not present, set the new key to None
+                    ordered_row[new_key] = None
+
+            # Then add any remaining keys that weren't in the mapping
+            for key, value in row.items():
+                ordered_row[key] = value
+
+            # Replace the original row with the ordered row
+            row.clear()
+            row.update(ordered_row)
+
+        self.determine_columns()
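For orientation, here is a minimal usage sketch of the new `Table` class, based only on the code above. The row keys and accession number are invented; the mapping dicts (e.g. `information_table_dict`) come from the wildcard `.mappings` imports.

```python
# Hypothetical usage sketch of the Table class added above; the row data
# and accession number are invented for illustration.
from datamule.document.table import Table

rows = [{'nameOfIssuer': 'EXAMPLE CORP', 'value': '1000'}]
table = Table(data=rows, type='information_table', accession='0000000000-24-000001')

# map_data() prepends an 'accession' key to every row, renames keys found in
# the matching mapping dict (mapped keys missing from a row become None), and
# appends any unmapped keys after the mapped ones.
table.map_data()
print(table.data[0])
```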
datamule/package_updater.py
ADDED
@@ -0,0 +1,31 @@
+
+from pathlib import Path
+import urllib.request
+import gzip
+import shutil
+import os
+
+class PackageUpdater():
+    def __init__(self):
+        pass
+
+    def update_package_data():
+        # Create data directory in user's home
+        data_dir = Path.home() / ".datamule"
+        data_dir.mkdir(exist_ok=True)
+
+        # Download data file
+        file_url = "https://github.com/john-friedman/datamule-data/raw/master/data/filer_metadata/listed_filer_metadata.csv.gz"
+        file_path = data_dir / "listed_filer_metadata.csv"
+        temp_gz_path = data_dir / "listed_filer_metadata.csv.gz"
+
+        if not file_path.exists():
+            print(f"Downloading data to {data_dir}")
+            urllib.request.urlretrieve(file_url, temp_gz_path)
+
+            with gzip.open(temp_gz_path, 'rb') as f_in:
+                with open(file_path, 'wb') as f_out:
+                    shutil.copyfileobj(f_in, f_out)
+
+            os.remove(temp_gz_path)
+            print(f"Data downloaded to {file_path}")
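One quirk worth flagging in the new `PackageUpdater`: `update_package_data` is defined without a `self` parameter, so it can only be invoked on the class itself, not on an instance:

```python
from datamule.package_updater import PackageUpdater

# Works: the function takes no parameters, so it must be called on the class.
PackageUpdater.update_package_data()

# Would raise TypeError, because the instance is passed as an unexpected
# positional argument to a zero-argument function:
# PackageUpdater().update_package_data()
```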
datamule/portfolio.py
CHANGED
@@ -119,7 +119,7 @@ class Portfolio:
         # First query, just set the accession numbers
         self.accession_numbers = new_accession_numbers
 
-    def download_submissions(self, cik=None, ticker=None, submission_type=None, filing_date=None, provider=None,requests_per_second=5, **kwargs):
+    def download_submissions(self, cik=None, ticker=None, submission_type=None, filing_date=None, provider=None,document_type=None,requests_per_second=5, **kwargs):
         if provider is None:
             config = Config()
             provider = config.get_default_source()
@@ -134,7 +134,8 @@ class Portfolio:
                 cik=cik,
                 submission_type=submission_type,
                 filing_date=filing_date,
-                accession_numbers=self.accession_numbers if hasattr(self, 'accession_numbers') else None
+                accession_numbers=self.accession_numbers if hasattr(self, 'accession_numbers') else None,
+                keep_document_types=document_type
             )
         else:
             sec_download(
@@ -143,7 +144,8 @@ class Portfolio:
                 submission_type=submission_type,
                 filing_date=filing_date,
                 requests_per_second=requests_per_second,
-                accession_numbers=self.accession_numbers if hasattr(self, 'accession_numbers') else None
+                accession_numbers=self.accession_numbers if hasattr(self, 'accession_numbers') else None,
+                keep_document_types=document_type
             )
 
         self.submissions_loaded = False
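The change above threads a new `document_type` argument through `Portfolio.download_submissions` to both providers as `keep_document_types`. A hedged sketch of the new call shape; the portfolio path, ticker, and form types are invented for illustration:

```python
from datamule import Portfolio

portfolio = Portfolio('my_filings')  # hypothetical portfolio directory

# document_type is forwarded as keep_document_types to the downloader, so
# only documents of the listed types are kept from each submission.
portfolio.download_submissions(
    ticker='TSLA',
    submission_type='10-K',
    document_type=['10-K'],
)
```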
datamule/sec/submissions/downloader.py
CHANGED
@@ -1,35 +1,19 @@
 import os
 import json
 from .streamer import stream
-from secsgml import parse_sgml_submission_into_memory
 import aiofiles
+from ...submission import Submission
 
-async def download_callback(hit, content, cik, accno, url, output_dir="filings"):
+async def download_callback(hit, content, cik, accno, url, output_dir="filings", keep_document_types=None):
     """Save downloaded SEC submission to disk."""
     try:
-        #
-
+        # Create a Submission object directly from the content
+        # Note: the content needs to be decoded from bytes to string for the parser
+        submission = Submission(sgml_content=content.decode('utf-8', errors='replace'),
+                                keep_document_types=keep_document_types)
 
-        #
-        file_dir =
-        os.makedirs(file_dir, exist_ok=True)
-
-        # Save metadata
-        metadata_path = os.path.join(file_dir, "metadata.json")
-        async with aiofiles.open(metadata_path, 'w') as f:
-            await f.write(json.dumps(metadata, indent=4))
-
-        # Save all documents
-        for idx, _ in enumerate(metadata['documents']):
-            try:
-                filename = metadata['documents'][idx]['filename']
-            except (KeyError, IndexError):
-                filename = f"{metadata['documents'][idx].get('sequence', idx)}.txt"
-
-            # Use async file writing
-            doc_path = os.path.join(file_dir, filename)
-            async with aiofiles.open(doc_path, 'wb') as f:
-                await f.write(documents[idx])
+        # Use the async save method to write the submission to disk
+        file_dir = await submission.save_async(output_dir=output_dir)
 
         return file_dir
     except Exception as e:
@@ -37,7 +21,8 @@ async def download_callback(hit, content, cik, accno, url, output_dir="filings")
         return None
 
 def download(cik=None, submission_type=None, filing_date=None, location=None, name=None,
-             requests_per_second=5, output_dir="filings", accession_numbers=None, quiet=False):
+             requests_per_second=5, output_dir="filings", accession_numbers=None,
+             quiet=False, keep_document_types=None):
     """
     Download SEC EDGAR filings and extract their documents.
 
@@ -51,27 +36,19 @@ def download(cik=None, submission_type=None, filing_date=None, location=None, na
     - output_dir: Directory to save documents
     - accession_numbers: Optional list of accession numbers to filter by
     - quiet: Whether to suppress progress output
+    - keep_document_types: Optional list of document types to keep (e.g. ['10-K', 'EX-10.1'])
 
     Returns:
     - List of all document paths processed
-
-    Examples:
-    # Download filings by CIK
-    download(cik="1318605", submission_type="10-K")
-
-    # Download filings by company name
-    download(name="Tesla", submission_type="10-K")
-
-    # Download filings with location filter
-    download(name="Apple", location="CA", submission_type="10-K")
     """
-
     # Make sure output directory exists
     os.makedirs(output_dir, exist_ok=True)
 
     # Create a wrapper for the download_callback that includes the output_dir
     async def callback_wrapper(hit, content, cik, accno, url):
-        return await download_callback(hit, content, cik, accno, url, output_dir=output_dir)
+        return await download_callback(hit, content, cik, accno, url,
+                                       output_dir=output_dir,
+                                       keep_document_types=keep_document_types)
 
     # Call the stream function with our callback
     return stream(
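End to end, `download` now passes `keep_document_types` through `callback_wrapper` into `download_callback`, where it reaches the `Submission` constructor, so filtered-out documents are never written to disk. A sketch of the new call shape, reusing the CIK from the removed docstring examples and the types from the new docstring line:

```python
from datamule.sec.submissions.downloader import download

# Illustrative values: the CIK comes from the old docstring examples, the
# document types from the new keep_document_types docstring line.
download(
    cik="1318605",
    submission_type="10-K",
    output_dir="filings",
    keep_document_types=["10-K", "EX-10.1"],
)
```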