datamule 1.4.4__py3-none-any.whl → 1.4.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamule/document/document.py +50 -1
- datamule/index.py +1 -1
- datamule/mapping_dicts/html_mapping_dicts.py +47 -1
- datamule/sec/submissions/downloader.py +1 -1
- datamule/sec/submissions/textsearch.py +5 -3
- datamule/seclibrary/downloader.py +3 -2
- datamule/submission.py +13 -16
- {datamule-1.4.4.dist-info → datamule-1.4.6.dist-info}/METADATA +1 -1
- {datamule-1.4.4.dist-info → datamule-1.4.6.dist-info}/RECORD +11 -11
- {datamule-1.4.4.dist-info → datamule-1.4.6.dist-info}/WHEEL +0 -0
- {datamule-1.4.4.dist-info → datamule-1.4.6.dist-info}/top_level.txt +0 -0
datamule/document/document.py
CHANGED
@@ -12,6 +12,19 @@ from .processing import process_tabular_data
|
|
12
12
|
from pathlib import Path
|
13
13
|
import webbrowser
|
14
14
|
|
15
|
+
def convert_bytes_keys(obj):
|
16
|
+
if isinstance(obj, dict):
|
17
|
+
return {
|
18
|
+
(k.decode('utf-8').lower() if isinstance(k, bytes) else k): convert_bytes_keys(v)
|
19
|
+
for k, v in obj.items()
|
20
|
+
}
|
21
|
+
elif isinstance(obj, list):
|
22
|
+
return [convert_bytes_keys(item) for item in obj]
|
23
|
+
elif isinstance(obj, bytes):
|
24
|
+
return obj.decode('utf-8').lower()
|
25
|
+
else:
|
26
|
+
return obj
|
27
|
+
|
15
28
|
class Document:
|
16
29
|
def __init__(self, type, content, extension,accession,filing_date,path=None):
|
17
30
|
|
@@ -19,7 +32,11 @@ class Document:
|
|
19
32
|
extension = extension.lower()
|
20
33
|
self.accession = accession
|
21
34
|
self.filing_date = filing_date
|
22
|
-
|
35
|
+
|
36
|
+
if self.type == 'submission_metadata':
|
37
|
+
self.content = convert_bytes_keys(content)
|
38
|
+
else:
|
39
|
+
self.content = content
|
23
40
|
|
24
41
|
if path is not None:
|
25
42
|
self.path = path
|
@@ -157,8 +174,40 @@ class Document:
|
|
157
174
|
dict_appntc_html
|
158
175
|
elif self.type == 'CB':
|
159
176
|
mapping_dict = dict_cb_html
|
177
|
+
elif self.type == 'DSTRBRPT':
|
178
|
+
mapping_dict = dict_dstrbrpt_html
|
179
|
+
elif self.type == 'N-18F1':
|
180
|
+
mapping_dict = dict_n18f1_html
|
181
|
+
elif self.type == 'N-CSRS':
|
182
|
+
mapping_dict = dict_ncsrs_html
|
183
|
+
elif self.type == 'NT-10K':
|
184
|
+
mapping_dict = dict_nt10k_html
|
185
|
+
elif self.type == 'NT-10Q':
|
186
|
+
mapping_dict = dict_nt10q_html
|
187
|
+
elif self.type == 'NT 20-F':
|
188
|
+
mapping_dict = dict_nt20f_html
|
189
|
+
elif self.type == 'NT-NCEN':
|
190
|
+
mapping_dict = dict_ntncen_html
|
191
|
+
elif self.type == 'NT-NCSR':
|
192
|
+
mapping_dict = dict_ntncsr_html
|
193
|
+
elif self.type == 'NTFNCEN':
|
194
|
+
mapping_dict = dict_ntfcen_html
|
195
|
+
elif self.type == 'NTFNCSR':
|
196
|
+
mapping_dict = dict_ntfncsr_html
|
197
|
+
elif self.type == 'EX-99.CERT':
|
198
|
+
mapping_dict = dict_ex99cert_html
|
199
|
+
elif self.type == 'SC 13E3':
|
200
|
+
mapping_dict = dict_sc13e3_html
|
201
|
+
elif self.type == 'SC 14D9':
|
202
|
+
mapping_dict = dict_sc14d9_html
|
203
|
+
elif self.type == 'SP 15D2':
|
204
|
+
mapping_dict = dict_sp15d2_html
|
205
|
+
|
160
206
|
elif self.type == 'SD':
|
161
207
|
mapping_dict = dict_sd_html
|
208
|
+
|
209
|
+
elif self.type == 'T-3':
|
210
|
+
mapping_dict = dict_t3_html
|
162
211
|
elif self.type in ['NT 10-K', 'NT 10-Q','NT 20-F']:
|
163
212
|
mapping_dict = dict_nt10k_html
|
164
213
|
|
datamule/index.py
CHANGED
@@ -26,6 +26,12 @@ dict_abs15g_html = {
|
|
26
26
|
dict_nt10k_html = {
|
27
27
|
('part',r'^part\s*([ivx]+)') : 0,
|
28
28
|
}
|
29
|
+
dict_nt10q_html = dict_nt10k_html
|
30
|
+
dict_nt20f_html = dict_nt10k_html
|
31
|
+
dict_ntncen_html = dict_nt10k_html
|
32
|
+
dict_ntncsr_html = dict_nt10k_html
|
33
|
+
dict_ntfcen_html = dict_nt10k_html
|
34
|
+
dict_ntfncsr_html = dict_nt10k_html
|
29
35
|
|
30
36
|
dict_1kpartii_html = {
|
31
37
|
('item',r'^item\s*(\d+)') : 0,
|
@@ -72,4 +78,44 @@ dict_appntc_html = {('agency',r'^agency') : 0,
|
|
72
78
|
dict_cb_html = {
|
73
79
|
('part', r'^part\s*([ivx]+)') : 0,
|
74
80
|
('item', r'^item\s*(\d+)') : 1,
|
75
|
-
}
|
81
|
+
}
|
82
|
+
|
83
|
+
dict_dstrbrpt_html = dict_1kpartii_html
|
84
|
+
|
85
|
+
dict_n18f1_html = {
|
86
|
+
('notification of election', r'^notification of election') : 0,
|
87
|
+
('signatures', r'^signatures?\.*$') : 0,
|
88
|
+
}
|
89
|
+
|
90
|
+
dict_ex99cert_html = {
|
91
|
+
('item',r'^(\d+)') : 0,
|
92
|
+
('letter',r'^\(?([a-z])') : 1,
|
93
|
+
}
|
94
|
+
|
95
|
+
dict_ncsrs_html = {
|
96
|
+
('item',r'^(\d+)') : 0,
|
97
|
+
('signatures',r'^signatures?\.*$') : 0,
|
98
|
+
}
|
99
|
+
|
100
|
+
dict_sc13e3_html = {
|
101
|
+
('item', r'^item\s*(\d+)') : 0,
|
102
|
+
('signatures', r'^signatures?\.*$') : 0,
|
103
|
+
('letter', r'^\(?([a-z])') : 1,
|
104
|
+
}
|
105
|
+
|
106
|
+
dict_sc14d9_html = {
|
107
|
+
('item', r'^item\s*(\d+)') : 0,
|
108
|
+
('signatures', r'^signatures?\.*$') : 0,
|
109
|
+
('annex', r'^annex') : 0,
|
110
|
+
}
|
111
|
+
|
112
|
+
dict_sp15d2_html = dict_10k_html
|
113
|
+
|
114
|
+
dict_t3_html = {('general',r'^general'):0,
|
115
|
+
('affiliations',r'^affiliations'):0,
|
116
|
+
('management and control',r'^management and control'):0,
|
117
|
+
('underwriters',r'^underwriters'):0,
|
118
|
+
('capital securities',r'^capital securities'):0,
|
119
|
+
('indenture securities',r'^indenture securities'):0,
|
120
|
+
('signatures',r'^signatures?\.*$') : 0,
|
121
|
+
('number',r'^(\d+)') : 1,}
|
@@ -9,7 +9,7 @@ async def download_callback(hit, content, cik, accno, url, output_dir="filings",
|
|
9
9
|
try:
|
10
10
|
# Create a Submission object directly from the content
|
11
11
|
# Note: the content needs to be decoded from bytes to string for the parser
|
12
|
-
submission = Submission(sgml_content=content
|
12
|
+
submission = Submission(sgml_content=content,
|
13
13
|
keep_document_types=keep_document_types)
|
14
14
|
|
15
15
|
# Use the async save method to write the submission to disk
|
@@ -7,14 +7,16 @@ class TextSearchEFTSQuery(EFTSQuery):
|
|
7
7
|
"""
|
8
8
|
def __init__(self, text_query, requests_per_second=5.0, quiet=False):
|
9
9
|
super().__init__(requests_per_second=requests_per_second, quiet=quiet)
|
10
|
-
|
10
|
+
if text_query is not None:
|
11
|
+
self.text_query = text_query
|
11
12
|
|
12
13
|
def _prepare_params(self, cik=None, submission_type=None, filing_date=None, location=None):
|
13
14
|
# Get base parameters from parent class
|
14
15
|
params = super()._prepare_params(cik, submission_type, filing_date, location)
|
15
16
|
|
16
17
|
# Add text query parameter
|
17
|
-
|
18
|
+
if self.text_query is not None:
|
19
|
+
params['q'] = self.text_query
|
18
20
|
|
19
21
|
return params
|
20
22
|
|
@@ -42,7 +44,7 @@ async def extract_accession_numbers(hits):
|
|
42
44
|
accession_numbers.append(acc_no)
|
43
45
|
return accession_numbers
|
44
46
|
|
45
|
-
def query(text_query, cik=None, submission_type=None, filing_date=None, location=None,
|
47
|
+
def query(text_query=None, cik=None, submission_type=None, filing_date=None, location=None,
|
46
48
|
name=None, requests_per_second=5.0, quiet=False):
|
47
49
|
"""
|
48
50
|
Search SEC filings for text and return the full search results.
|
@@ -99,6 +99,7 @@ class Downloader:
|
|
99
99
|
self.downloader._run_coroutine(submission.save_async(output_dir=self.output_dir))
|
100
100
|
self.pbar.update(1)
|
101
101
|
except Exception as e:
|
102
|
+
print(f"Exception {e} in {filename}")
|
102
103
|
accession_dir = os.path.join(self.output_dir, filename.split('.')[0])
|
103
104
|
if os.path.exists(accession_dir):
|
104
105
|
shutil.rmtree(accession_dir)
|
@@ -143,7 +144,7 @@ class Downloader:
|
|
143
144
|
with dctx.stream_reader(input_buffer) as reader:
|
144
145
|
shutil.copyfileobj(reader, decompressed_content)
|
145
146
|
|
146
|
-
content = decompressed_content.getvalue()
|
147
|
+
content = decompressed_content.getvalue()
|
147
148
|
processor.processing_queue.put((filename, content))
|
148
149
|
return True
|
149
150
|
|
@@ -159,7 +160,7 @@ class Downloader:
|
|
159
160
|
|
160
161
|
def save_regular_file(self, chunks, filename, output_dir, processor):
|
161
162
|
try:
|
162
|
-
content = b''.join(chunks)
|
163
|
+
content = b''.join(chunks)
|
163
164
|
processor.processing_queue.put((filename, content))
|
164
165
|
return True
|
165
166
|
|
datamule/submission.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
from pathlib import Path
|
2
2
|
import json
|
3
3
|
from .document.document import Document
|
4
|
-
from secsgml import
|
4
|
+
from secsgml import parse_sgml_content_into_memory
|
5
5
|
import os
|
6
6
|
import aiofiles
|
7
7
|
import tempfile
|
@@ -79,9 +79,8 @@ class Submission:
|
|
79
79
|
|
80
80
|
if sgml_content is not None:
|
81
81
|
self.path = None
|
82
|
-
metadata, raw_documents =
|
82
|
+
metadata, raw_documents = parse_sgml_content_into_memory(sgml_content)
|
83
83
|
self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=None)
|
84
|
-
|
85
84
|
# code dupe
|
86
85
|
self.accession = self.metadata.content['accession-number']
|
87
86
|
self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"
|
@@ -95,7 +94,9 @@ class Submission:
|
|
95
94
|
# Keep only specified types
|
96
95
|
if keep_document_types is not None and type not in keep_document_types:
|
97
96
|
continue
|
98
|
-
|
97
|
+
|
98
|
+
# write as txt if not declared
|
99
|
+
filename = doc.get('filename','.txt')
|
99
100
|
extension = Path(filename).suffix
|
100
101
|
self.documents.append(Document(type=type, content=raw_documents[idx], extension=extension,filing_date=self.filing_date,accession=self.accession))
|
101
102
|
|
@@ -190,12 +191,9 @@ class Submission:
|
|
190
191
|
json.dump(self.metadata.content, f, indent=4)
|
191
192
|
|
192
193
|
for idx, doc in enumerate(self.metadata.content['documents']):
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
filename = f"{doc.get('sequence', idx)}.txt"
|
197
|
-
except (KeyError, IndexError):
|
198
|
-
filename = f"{idx}.txt"
|
194
|
+
filename = doc.get('filename')
|
195
|
+
if filename is None:
|
196
|
+
filename = f"{doc.get('sequence')}.txt"
|
199
197
|
|
200
198
|
doc_path = file_dir / filename
|
201
199
|
|
@@ -231,12 +229,11 @@ class Submission:
|
|
231
229
|
await f.write(json.dumps(self.metadata.content, indent=4))
|
232
230
|
|
233
231
|
for idx, doc in enumerate(self.metadata.content['documents']):
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
filename = f"{idx}.txt"
|
232
|
+
filename = doc.get('filename')
|
233
|
+
# oh we need handling here for sequences case
|
234
|
+
if filename is None:
|
235
|
+
filename = doc['sequence'] + '.txt'
|
236
|
+
|
240
237
|
|
241
238
|
doc_path = file_dir / filename
|
242
239
|
|
@@ -1,14 +1,14 @@
|
|
1
1
|
datamule/__init__.py,sha256=glzwBeGJEE6-TG7mRule9GH6L59XaIRR9T7ALcdpMus,1067
|
2
2
|
datamule/config.py,sha256=Y--CVv7JcgrjJkMOSLrvm2S8B9ost6RMSkGviP-MKtg,883
|
3
3
|
datamule/helper.py,sha256=KqhAmTMdvATEh3I-O4xLcAcrHB9zXQERBuwzue7zyQw,3674
|
4
|
-
datamule/index.py,sha256=
|
4
|
+
datamule/index.py,sha256=Rrcna9FJV-Oh_K6O2IuUEIDmtay_7UZ4l4jgKCi7A7I,2079
|
5
5
|
datamule/package_updater.py,sha256=Z9zaa_y0Z5cknpRn8oPea3gg4kquFHfpfhduKKCZ6NU,958
|
6
6
|
datamule/portfolio.py,sha256=8fiK-vfZM5-NJSvOEsDR2YDb-2njjzFk6l7BiRyrzOM,7168
|
7
7
|
datamule/sheet.py,sha256=TvFqK9eAYuVoJ2uWdAlx5EN6vS9lke-aZf7FqtUiDBc,22304
|
8
|
-
datamule/submission.py,sha256=
|
8
|
+
datamule/submission.py,sha256=EtWdEnAyWLZdu69Dyzbs4qb5YL41HlExFGMjwEoMhsg,10904
|
9
9
|
datamule/data/listed_filer_metadata.csv,sha256=dT9fQ8AC5P1-Udf_UF0ZkdXJ88jNxJb_tuhi5YYL1rc,2426827
|
10
10
|
datamule/document/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
11
|
-
datamule/document/document.py,sha256=
|
11
|
+
datamule/document/document.py,sha256=3vX850H7rZH4H8BysitZDaLhT6WPJuIreoV1PSjACno,14301
|
12
12
|
datamule/document/processing.py,sha256=jDCEzBFDSQtq7nQxRScIsbALnFcvMPOkNkMUCa7mFxg,31921
|
13
13
|
datamule/document/table.py,sha256=73yUJKY82ap32jhLmZeTti-jQ_lyhcJGlGwyxLtgYOg,12944
|
14
14
|
datamule/document/mappings/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -36,7 +36,7 @@ datamule/document/mappings/thirteenfhr.py,sha256=XpYRIMPZnGLfEE4TqBI0BPXbyuq0xf3
|
|
36
36
|
datamule/document/mappings/twentyfivense.py,sha256=lKyj0ZBhkHX9gQJMTUPrQlxYFg3k-aBnWqtoS5bujZM,905
|
37
37
|
datamule/document/mappings/twentyfourf2nt.py,sha256=Q7RPT3JgJHjYdjMuaSyAxclt6QPT_LgCQloxp-ByDuI,4118
|
38
38
|
datamule/mapping_dicts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
39
|
-
datamule/mapping_dicts/html_mapping_dicts.py,sha256=
|
39
|
+
datamule/mapping_dicts/html_mapping_dicts.py,sha256=rf2cNLhNRuDpYVX_gNFAfR7fbjLb_Wbo114EScj53RU,3321
|
40
40
|
datamule/mapping_dicts/txt_mapping_dicts.py,sha256=DQPrGYbAPQxomRUtt4iiMGrwuF7BHc_LeFBQuYBzU9o,6311
|
41
41
|
datamule/mapping_dicts/xml_mapping_dicts.py,sha256=Z22yDVwKYonUfM5foQP00dVDE8EHhhMKp0CLqVKV5OI,438
|
42
42
|
datamule/sec/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -44,11 +44,11 @@ datamule/sec/utils.py,sha256=JUxwijJiqRMnRJNQzVUamyF5h9ZGc7RnO_zsLOIM73g,2079
|
|
44
44
|
datamule/sec/infrastructure/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
45
45
|
datamule/sec/infrastructure/submissions_metadata.py,sha256=f1KarzFSryKm0EV8DCDNsBw5Jv0Tx5aljiGUJkk7DRk,18745
|
46
46
|
datamule/sec/submissions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
47
|
-
datamule/sec/submissions/downloader.py,sha256=
|
47
|
+
datamule/sec/submissions/downloader.py,sha256=izaz559PtBCAWPWGzqUReloawJtXwnraclgXdzEOteI,2631
|
48
48
|
datamule/sec/submissions/eftsquery.py,sha256=mSZon8rlW8dxma7M49ZW5V02Fn-ENOdt9TNO6elBrhE,27983
|
49
49
|
datamule/sec/submissions/monitor.py,sha256=dZYuVCi_X82eYA8l_9cbnkRjiawz3K4U-FnCAyJcgk4,7892
|
50
50
|
datamule/sec/submissions/streamer.py,sha256=EXyWNCD9N6mZmvm9lFSCFodF19zSQ8jfIbWPZNp0K5Y,11253
|
51
|
-
datamule/sec/submissions/textsearch.py,sha256=
|
51
|
+
datamule/sec/submissions/textsearch.py,sha256=MKDXEz_VI_0ljl73_aw2lx4MVzJW5uDt8KxjvJBwPwM,5794
|
52
52
|
datamule/sec/xbrl/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
53
53
|
datamule/sec/xbrl/downloadcompanyfacts.py,sha256=rMWRiCF9ci_gNZMJ9MC2c_PGEd-yEthawQ0CtVwWTjM,3323
|
54
54
|
datamule/sec/xbrl/filter_xbrl.py,sha256=g9OT4zrNS0tiUJeBIwbCs_zMisOBkpFnMR3tV4Tr39Q,1316
|
@@ -56,9 +56,9 @@ datamule/sec/xbrl/streamcompanyfacts.py,sha256=Qq88PqW5_j1k3Aqrl0KRmKeF54D6Wbb6H
|
|
56
56
|
datamule/sec/xbrl/xbrlmonitor.py,sha256=TKFVfSyyUUfUgFQw4WxEVs4g8Nh-2C0tygNIRmTqW3Y,5848
|
57
57
|
datamule/seclibrary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
58
58
|
datamule/seclibrary/bq.py,sha256=C8sb_rpXTvchprrFLcbRar4Qi0XWW25tnv1YsHSS5o4,18025
|
59
|
-
datamule/seclibrary/downloader.py,sha256=
|
59
|
+
datamule/seclibrary/downloader.py,sha256=VIdaQq5wDcYWnqrv9t8J7z0KtdNRGK8ahfBsgvTfdQQ,13675
|
60
60
|
datamule/seclibrary/query.py,sha256=qGuursTERRbOGfoDcYcpo4oWkW3PCBW6x1Qf1Puiak4,7352
|
61
|
-
datamule-1.4.
|
62
|
-
datamule-1.4.
|
63
|
-
datamule-1.4.
|
64
|
-
datamule-1.4.
|
61
|
+
datamule-1.4.6.dist-info/METADATA,sha256=IxggkAHbjanZjnTtWGNOyRM68sztal4gQlUfa0shlXg,469
|
62
|
+
datamule-1.4.6.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
|
63
|
+
datamule-1.4.6.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
|
64
|
+
datamule-1.4.6.dist-info/RECORD,,
|
File without changes
|
File without changes
|