datamule 1.4.3__py3-none-any.whl → 1.4.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamule/document/document.py +66 -1
- datamule/helper.py +3 -0
- datamule/mapping_dicts/html_mapping_dicts.py +77 -4
- datamule/sec/submissions/downloader.py +1 -1
- datamule/seclibrary/downloader.py +3 -2
- datamule/submission.py +13 -16
- {datamule-1.4.3.dist-info → datamule-1.4.5.dist-info}/METADATA +1 -1
- {datamule-1.4.3.dist-info → datamule-1.4.5.dist-info}/RECORD +10 -10
- {datamule-1.4.3.dist-info → datamule-1.4.5.dist-info}/WHEEL +0 -0
- {datamule-1.4.3.dist-info → datamule-1.4.5.dist-info}/top_level.txt +0 -0
datamule/document/document.py
CHANGED
@@ -12,6 +12,19 @@ from .processing import process_tabular_data
|
|
12
12
|
from pathlib import Path
|
13
13
|
import webbrowser
|
14
14
|
|
15
|
+
def convert_bytes_keys(obj):
    """Recursively decode ``bytes`` inside a nested dict/list structure.

    * ``dict``: every ``bytes`` key is UTF-8 decoded and lowercased; keys of
      any other type are kept as-is. Every value is converted recursively.
    * ``list``: every item is converted recursively.
    * ``bytes``: UTF-8 decoded and lowercased.
    * anything else (including ``str`` and ``tuple``): returned unchanged.

    Note that, despite the name, ``bytes`` *values* are decoded and
    lowercased too, not only keys.

    Raises:
        UnicodeDecodeError: if a ``bytes`` key or value is not valid UTF-8.
    """
    if isinstance(obj, dict):
        return {
            (k.decode('utf-8').lower() if isinstance(k, bytes) else k): convert_bytes_keys(v)
            for k, v in obj.items()
        }
    if isinstance(obj, list):
        return [convert_bytes_keys(item) for item in obj]
    if isinstance(obj, bytes):
        return obj.decode('utf-8').lower()
    return obj
|
27
|
+
|
15
28
|
class Document:
|
16
29
|
def __init__(self, type, content, extension,accession,filing_date,path=None):
|
17
30
|
|
@@ -19,7 +32,11 @@ class Document:
|
|
19
32
|
extension = extension.lower()
|
20
33
|
self.accession = accession
|
21
34
|
self.filing_date = filing_date
|
22
|
-
|
35
|
+
|
36
|
+
if self.type == 'submission_metadata':
|
37
|
+
self.content = convert_bytes_keys(content)
|
38
|
+
else:
|
39
|
+
self.content = content
|
23
40
|
|
24
41
|
if path is not None:
|
25
42
|
self.path = path
|
@@ -137,12 +154,60 @@ class Document:
|
|
137
154
|
mapping_dict = dict_10q_html
|
138
155
|
elif self.type == '20-F':
|
139
156
|
mapping_dict = dict_20f_html
|
157
|
+
elif self.type == '8-A12B':
|
158
|
+
mapping_dict = dict_8a12b_html
|
159
|
+
elif self.type == '8-A12G':
|
160
|
+
mapping_dict = dict_8a12g_html
|
140
161
|
elif self.type == '8-K':
|
141
162
|
mapping_dict = dict_8k_html
|
163
|
+
elif self.type == '8-K12B':
|
164
|
+
mapping_dict = dict_8k12b_html
|
165
|
+
elif self.type == '8-K12G3':
|
166
|
+
mapping_dict = dict_8k12g3_html
|
167
|
+
elif self.type == '8-K15D5':
|
168
|
+
mapping_dict = dict_8k15d5_html
|
142
169
|
elif self.type == 'ABS-15G':
|
143
170
|
mapping_dict = dict_abs15g_html
|
171
|
+
elif self.type == 'ABS-EE':
|
172
|
+
mapping_dict = dict_absee_html
|
173
|
+
elif self.type == 'APP NTC':
|
174
|
+
mapping_dict = dict_appntc_html  # section mapping used for 'APP NTC' documents
|
175
|
+
elif self.type == 'CB':
|
176
|
+
mapping_dict = dict_cb_html
|
177
|
+
elif self.type == 'DSTRBRPT':
|
178
|
+
mapping_dict = dict_dstrbrpt_html
|
179
|
+
elif self.type == 'N-18F1':
|
180
|
+
mapping_dict = dict_n18f1_html
|
181
|
+
elif self.type == 'N-CSRS':
|
182
|
+
mapping_dict = dict_ncsrs_html
|
183
|
+
elif self.type == 'NT-10K':
|
184
|
+
mapping_dict = dict_nt10k_html
|
185
|
+
elif self.type == 'NT-10Q':
|
186
|
+
mapping_dict = dict_nt10q_html
|
187
|
+
elif self.type == 'NT 20-F':
|
188
|
+
mapping_dict = dict_nt20f_html
|
189
|
+
elif self.type == 'NT-NCEN':
|
190
|
+
mapping_dict = dict_ntncen_html
|
191
|
+
elif self.type == 'NT-NCSR':
|
192
|
+
mapping_dict = dict_ntncsr_html
|
193
|
+
elif self.type == 'NTFNCEN':
|
194
|
+
mapping_dict = dict_ntfcen_html
|
195
|
+
elif self.type == 'NTFNCSR':
|
196
|
+
mapping_dict = dict_ntfncsr_html
|
197
|
+
elif self.type == 'EX-99.CERT':
|
198
|
+
mapping_dict = dict_ex99cert_html
|
199
|
+
elif self.type == 'SC 13E3':
|
200
|
+
mapping_dict = dict_sc13e3_html
|
201
|
+
elif self.type == 'SC 14D9':
|
202
|
+
mapping_dict = dict_sc14d9_html
|
203
|
+
elif self.type == 'SP 15D2':
|
204
|
+
mapping_dict = dict_sp15d2_html
|
205
|
+
|
144
206
|
elif self.type == 'SD':
|
145
207
|
mapping_dict = dict_sd_html
|
208
|
+
|
209
|
+
elif self.type == 'T-3':
|
210
|
+
mapping_dict = dict_t3_html
|
146
211
|
elif self.type in ['NT 10-K', 'NT 10-Q','NT 20-F']:
|
147
212
|
mapping_dict = dict_nt10k_html
|
148
213
|
|
datamule/helper.py
CHANGED
@@ -89,6 +89,9 @@ def _process_cik_and_metadata_filters(cik=None, ticker=None, **kwargs):
|
|
89
89
|
if ticker_ciks:
|
90
90
|
cik.extend(ticker_ciks)
|
91
91
|
|
92
|
+
if len(cik) == 0:
|
93
|
+
raise ValueError(f"No CIKs found for ticker: {ticker}")
|
94
|
+
|
92
95
|
# Normalize CIK format
|
93
96
|
if cik is not None:
|
94
97
|
if isinstance(cik, str):
|
@@ -26,15 +26,21 @@ dict_abs15g_html = {
|
|
26
26
|
dict_nt10k_html = {
|
27
27
|
('part',r'^part\s*([ivx]+)') : 0,
|
28
28
|
}
|
29
|
+
dict_nt10q_html = dict_nt10k_html
|
30
|
+
dict_nt20f_html = dict_nt10k_html
|
31
|
+
dict_ntncen_html = dict_nt10k_html
|
32
|
+
dict_ntncsr_html = dict_nt10k_html
|
33
|
+
dict_ntfcen_html = dict_nt10k_html
|
34
|
+
dict_ntfncsr_html = dict_nt10k_html
|
29
35
|
|
30
36
|
dict_1kpartii_html = {
|
31
|
-
('item',r'^item\s*(\d+)') :
|
37
|
+
('item',r'^item\s*(\d+)') : 0,
|
32
38
|
}
|
33
39
|
|
34
40
|
dict_1sa_html = dict_1kpartii_html
|
35
41
|
|
36
|
-
dict_1u_html = {('item',r'^item\s*(\d+)') :
|
37
|
-
('signatures',r'^signatures?\.*$') :
|
42
|
+
dict_1u_html = {('item',r'^item\s*(\d+)') : 0,
|
43
|
+
('signatures',r'^signatures?\.*$') : 0,}
|
38
44
|
|
39
45
|
dict_1012b_html = dict_1u_html
|
40
46
|
|
@@ -45,4 +51,71 @@ dict_20f_html = {
|
|
45
51
|
('item',r'^item\s*(\d+)\.?([a-z])?') : 1,
|
46
52
|
('letter',r'\d*\.?([a-z])') : 2,
|
47
53
|
('signatures',r'^signatures?\.*$') : 0,
|
48
|
-
}
|
54
|
+
}
|
55
|
+
|
56
|
+
dict_8a12b_html = dict_1kpartii_html
|
57
|
+
dict_8a12g_html = dict_1kpartii_html
|
58
|
+
|
59
|
+
dict_8k12b_html = dict_8k_html
|
60
|
+
|
61
|
+
dict_8k12g3_html = dict_8k_html
|
62
|
+
dict_8k15d5_html = dict_8k_html
|
63
|
+
|
64
|
+
dict_absee_html = {('item',r'^item\s*(\d+)') : 0,
|
65
|
+
('signatures',r'^signatures?\.*$') : 0,}
|
66
|
+
|
67
|
+
dict_appntc_html = {('agency',r'^agency') : 0,
|
68
|
+
('action',r'^action') : 0,
|
69
|
+
('summary',r'^summary of application') : 0,
|
70
|
+
('applicants',r'^applicants') : 0,
|
71
|
+
('filing',r'^filing dates') : 0,
|
72
|
+
('hearing',r'^hearing or notification of hearing') : 0,
|
73
|
+
('addresses',r'^addresses') : 0,
|
74
|
+
('further contact',r'^for further information contact') : 0,
|
75
|
+
('supplementary information',r'^supplementary information') : 0,
|
76
|
+
}
|
77
|
+
|
78
|
+
dict_cb_html = {
|
79
|
+
('part', r'^part\s*([ivx]+)') : 0,
|
80
|
+
('item', r'^item\s*(\d+)') : 1,
|
81
|
+
}
|
82
|
+
|
83
|
+
dict_dstrbrpt_html = dict_1kpartii_html
|
84
|
+
|
85
|
+
dict_n18f1_html = {
|
86
|
+
('notification of election', r'^notification of election') : 0,
|
87
|
+
('signatures', r'^signatures?\.*$') : 0,
|
88
|
+
}
|
89
|
+
|
90
|
+
dict_ex99cert_html = {
|
91
|
+
('item',r'^(\d+)') : 0,
|
92
|
+
('letter',r'^\(?([a-z])') : 1,
|
93
|
+
}
|
94
|
+
|
95
|
+
dict_ncsrs_html = {
|
96
|
+
('item',r'^(\d+)') : 0,
|
97
|
+
('signatures',r'^signatures?\.*$') : 0,
|
98
|
+
}
|
99
|
+
|
100
|
+
dict_sc13e3_html = {
|
101
|
+
('item', r'^item\s*(\d+)') : 0,
|
102
|
+
('signatures', r'^signatures?\.*$') : 0,
|
103
|
+
('letter', r'^\(?([a-z])') : 1,
|
104
|
+
}
|
105
|
+
|
106
|
+
dict_sc14d9_html = {
|
107
|
+
('item', r'^item\s*(\d+)') : 0,
|
108
|
+
('signatures', r'^signatures?\.*$') : 0,
|
109
|
+
('annex', r'^annex') : 0,
|
110
|
+
}
|
111
|
+
|
112
|
+
dict_sp15d2_html = dict_10k_html
|
113
|
+
|
114
|
+
dict_t3_html = {('general',r'^general'):0,
|
115
|
+
('affiliations',r'^affiliations'):0,
|
116
|
+
('management and control',r'^management and control'):0,
|
117
|
+
('underwriters',r'^underwriters'):0,
|
118
|
+
('capital securities',r'^capital securities'):0,
|
119
|
+
('indenture securities',r'^indenture securities'):0,
|
120
|
+
('signatures',r'^signatures?\.*$') : 0,
|
121
|
+
('number',r'^(\d+)') : 1,}
|
@@ -9,7 +9,7 @@ async def download_callback(hit, content, cik, accno, url, output_dir="filings",
|
|
9
9
|
try:
|
10
10
|
# Create a Submission object directly from the content
|
11
11
|
# Note: the content needs to be decoded from bytes to string for the parser
|
12
|
-
submission = Submission(sgml_content=content
|
12
|
+
submission = Submission(sgml_content=content,
|
13
13
|
keep_document_types=keep_document_types)
|
14
14
|
|
15
15
|
# Use the async save method to write the submission to disk
|
@@ -99,6 +99,7 @@ class Downloader:
|
|
99
99
|
self.downloader._run_coroutine(submission.save_async(output_dir=self.output_dir))
|
100
100
|
self.pbar.update(1)
|
101
101
|
except Exception as e:
|
102
|
+
print(f"Exception {e} in {filename}")
|
102
103
|
accession_dir = os.path.join(self.output_dir, filename.split('.')[0])
|
103
104
|
if os.path.exists(accession_dir):
|
104
105
|
shutil.rmtree(accession_dir)
|
@@ -143,7 +144,7 @@ class Downloader:
|
|
143
144
|
with dctx.stream_reader(input_buffer) as reader:
|
144
145
|
shutil.copyfileobj(reader, decompressed_content)
|
145
146
|
|
146
|
-
content = decompressed_content.getvalue()
|
147
|
+
content = decompressed_content.getvalue()
|
147
148
|
processor.processing_queue.put((filename, content))
|
148
149
|
return True
|
149
150
|
|
@@ -159,7 +160,7 @@ class Downloader:
|
|
159
160
|
|
160
161
|
def save_regular_file(self, chunks, filename, output_dir, processor):
|
161
162
|
try:
|
162
|
-
content = b''.join(chunks)
|
163
|
+
content = b''.join(chunks)
|
163
164
|
processor.processing_queue.put((filename, content))
|
164
165
|
return True
|
165
166
|
|
datamule/submission.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
from pathlib import Path
|
2
2
|
import json
|
3
3
|
from .document.document import Document
|
4
|
-
from secsgml import
|
4
|
+
from secsgml import parse_sgml_content_into_memory
|
5
5
|
import os
|
6
6
|
import aiofiles
|
7
7
|
import tempfile
|
@@ -79,9 +79,8 @@ class Submission:
|
|
79
79
|
|
80
80
|
if sgml_content is not None:
|
81
81
|
self.path = None
|
82
|
-
metadata, raw_documents =
|
82
|
+
metadata, raw_documents = parse_sgml_content_into_memory(sgml_content)
|
83
83
|
self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=None)
|
84
|
-
|
85
84
|
# code dupe
|
86
85
|
self.accession = self.metadata.content['accession-number']
|
87
86
|
self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"
|
@@ -95,7 +94,9 @@ class Submission:
|
|
95
94
|
# Keep only specified types
|
96
95
|
if keep_document_types is not None and type not in keep_document_types:
|
97
96
|
continue
|
98
|
-
|
97
|
+
|
98
|
+
# Treat the document as .txt when the metadata declares no filename.
# The fallback needs a stem: Path('.txt').suffix is '' (a leading-dot name
# has no suffix), whereas Path('document.txt').suffix is '.txt'.
filename = doc.get('filename', 'document.txt')
|
99
100
|
extension = Path(filename).suffix
|
100
101
|
self.documents.append(Document(type=type, content=raw_documents[idx], extension=extension,filing_date=self.filing_date,accession=self.accession))
|
101
102
|
|
@@ -190,12 +191,9 @@ class Submission:
|
|
190
191
|
json.dump(self.metadata.content, f, indent=4)
|
191
192
|
|
192
193
|
for idx, doc in enumerate(self.metadata.content['documents']):
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
filename = f"{doc.get('sequence', idx)}.txt"
|
197
|
-
except (KeyError, IndexError):
|
198
|
-
filename = f"{idx}.txt"
|
194
|
+
filename = doc.get('filename')
if filename is None:
    # No declared filename: name the file after the document's sequence,
    # falling back to its position in the list so that documents without a
    # sequence do not all collapse onto "None.txt".
    filename = f"{doc.get('sequence', idx)}.txt"
|
199
197
|
|
200
198
|
doc_path = file_dir / filename
|
201
199
|
|
@@ -231,12 +229,11 @@ class Submission:
|
|
231
229
|
await f.write(json.dumps(self.metadata.content, indent=4))
|
232
230
|
|
233
231
|
for idx, doc in enumerate(self.metadata.content['documents']):
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
filename = f"{idx}.txt"
|
232
|
+
filename = doc.get('filename')
if filename is None:
    # No declared filename: name the file after the document's sequence,
    # falling back to its position in the list. The f-string avoids the
    # KeyError (missing sequence) and TypeError (non-str sequence) that
    # indexing and concatenating with '.txt' would raise.
    filename = f"{doc.get('sequence', idx)}.txt"
|
236
|
+
|
240
237
|
|
241
238
|
doc_path = file_dir / filename
|
242
239
|
|
@@ -1,14 +1,14 @@
|
|
1
1
|
datamule/__init__.py,sha256=glzwBeGJEE6-TG7mRule9GH6L59XaIRR9T7ALcdpMus,1067
|
2
2
|
datamule/config.py,sha256=Y--CVv7JcgrjJkMOSLrvm2S8B9ost6RMSkGviP-MKtg,883
|
3
|
-
datamule/helper.py,sha256=
|
3
|
+
datamule/helper.py,sha256=KqhAmTMdvATEh3I-O4xLcAcrHB9zXQERBuwzue7zyQw,3674
|
4
4
|
datamule/index.py,sha256=_7Ox5hyF_7RWdblVFr5rNyv_ARwBP7VY4f703pk9qQ8,2074
|
5
5
|
datamule/package_updater.py,sha256=Z9zaa_y0Z5cknpRn8oPea3gg4kquFHfpfhduKKCZ6NU,958
|
6
6
|
datamule/portfolio.py,sha256=8fiK-vfZM5-NJSvOEsDR2YDb-2njjzFk6l7BiRyrzOM,7168
|
7
7
|
datamule/sheet.py,sha256=TvFqK9eAYuVoJ2uWdAlx5EN6vS9lke-aZf7FqtUiDBc,22304
|
8
|
-
datamule/submission.py,sha256=
|
8
|
+
datamule/submission.py,sha256=EtWdEnAyWLZdu69Dyzbs4qb5YL41HlExFGMjwEoMhsg,10904
|
9
9
|
datamule/data/listed_filer_metadata.csv,sha256=dT9fQ8AC5P1-Udf_UF0ZkdXJ88jNxJb_tuhi5YYL1rc,2426827
|
10
10
|
datamule/document/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
11
|
-
datamule/document/document.py,sha256=
|
11
|
+
datamule/document/document.py,sha256=3vX850H7rZH4H8BysitZDaLhT6WPJuIreoV1PSjACno,14301
|
12
12
|
datamule/document/processing.py,sha256=jDCEzBFDSQtq7nQxRScIsbALnFcvMPOkNkMUCa7mFxg,31921
|
13
13
|
datamule/document/table.py,sha256=73yUJKY82ap32jhLmZeTti-jQ_lyhcJGlGwyxLtgYOg,12944
|
14
14
|
datamule/document/mappings/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -36,7 +36,7 @@ datamule/document/mappings/thirteenfhr.py,sha256=XpYRIMPZnGLfEE4TqBI0BPXbyuq0xf3
|
|
36
36
|
datamule/document/mappings/twentyfivense.py,sha256=lKyj0ZBhkHX9gQJMTUPrQlxYFg3k-aBnWqtoS5bujZM,905
|
37
37
|
datamule/document/mappings/twentyfourf2nt.py,sha256=Q7RPT3JgJHjYdjMuaSyAxclt6QPT_LgCQloxp-ByDuI,4118
|
38
38
|
datamule/mapping_dicts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
39
|
-
datamule/mapping_dicts/html_mapping_dicts.py,sha256=
|
39
|
+
datamule/mapping_dicts/html_mapping_dicts.py,sha256=rf2cNLhNRuDpYVX_gNFAfR7fbjLb_Wbo114EScj53RU,3321
|
40
40
|
datamule/mapping_dicts/txt_mapping_dicts.py,sha256=DQPrGYbAPQxomRUtt4iiMGrwuF7BHc_LeFBQuYBzU9o,6311
|
41
41
|
datamule/mapping_dicts/xml_mapping_dicts.py,sha256=Z22yDVwKYonUfM5foQP00dVDE8EHhhMKp0CLqVKV5OI,438
|
42
42
|
datamule/sec/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -44,7 +44,7 @@ datamule/sec/utils.py,sha256=JUxwijJiqRMnRJNQzVUamyF5h9ZGc7RnO_zsLOIM73g,2079
|
|
44
44
|
datamule/sec/infrastructure/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
45
45
|
datamule/sec/infrastructure/submissions_metadata.py,sha256=f1KarzFSryKm0EV8DCDNsBw5Jv0Tx5aljiGUJkk7DRk,18745
|
46
46
|
datamule/sec/submissions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
47
|
-
datamule/sec/submissions/downloader.py,sha256=
|
47
|
+
datamule/sec/submissions/downloader.py,sha256=izaz559PtBCAWPWGzqUReloawJtXwnraclgXdzEOteI,2631
|
48
48
|
datamule/sec/submissions/eftsquery.py,sha256=mSZon8rlW8dxma7M49ZW5V02Fn-ENOdt9TNO6elBrhE,27983
|
49
49
|
datamule/sec/submissions/monitor.py,sha256=dZYuVCi_X82eYA8l_9cbnkRjiawz3K4U-FnCAyJcgk4,7892
|
50
50
|
datamule/sec/submissions/streamer.py,sha256=EXyWNCD9N6mZmvm9lFSCFodF19zSQ8jfIbWPZNp0K5Y,11253
|
@@ -56,9 +56,9 @@ datamule/sec/xbrl/streamcompanyfacts.py,sha256=Qq88PqW5_j1k3Aqrl0KRmKeF54D6Wbb6H
|
|
56
56
|
datamule/sec/xbrl/xbrlmonitor.py,sha256=TKFVfSyyUUfUgFQw4WxEVs4g8Nh-2C0tygNIRmTqW3Y,5848
|
57
57
|
datamule/seclibrary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
58
58
|
datamule/seclibrary/bq.py,sha256=C8sb_rpXTvchprrFLcbRar4Qi0XWW25tnv1YsHSS5o4,18025
|
59
|
-
datamule/seclibrary/downloader.py,sha256=
|
59
|
+
datamule/seclibrary/downloader.py,sha256=VIdaQq5wDcYWnqrv9t8J7z0KtdNRGK8ahfBsgvTfdQQ,13675
|
60
60
|
datamule/seclibrary/query.py,sha256=qGuursTERRbOGfoDcYcpo4oWkW3PCBW6x1Qf1Puiak4,7352
|
61
|
-
datamule-1.4.
|
62
|
-
datamule-1.4.
|
63
|
-
datamule-1.4.
|
64
|
-
datamule-1.4.
|
61
|
+
datamule-1.4.5.dist-info/METADATA,sha256=oKn8LWnuOVozqGyVe7ggRcCmoyjy7vwZbgZG7AKYWGY,469
|
62
|
+
datamule-1.4.5.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
|
63
|
+
datamule-1.4.5.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
|
64
|
+
datamule-1.4.5.dist-info/RECORD,,
|
File without changes
|
File without changes
|