datamule 1.4.4__py3-none-any.whl → 1.4.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -12,6 +12,19 @@ from .processing import process_tabular_data
12
12
  from pathlib import Path
13
13
  import webbrowser
14
14
 
15
+ def convert_bytes_keys(obj):
16
+ if isinstance(obj, dict):
17
+ return {
18
+ (k.decode('utf-8').lower() if isinstance(k, bytes) else k): convert_bytes_keys(v)
19
+ for k, v in obj.items()
20
+ }
21
+ elif isinstance(obj, list):
22
+ return [convert_bytes_keys(item) for item in obj]
23
+ elif isinstance(obj, bytes):
24
+ return obj.decode('utf-8').lower()
25
+ else:
26
+ return obj
27
+
15
28
  class Document:
16
29
  def __init__(self, type, content, extension,accession,filing_date,path=None):
17
30
 
@@ -19,7 +32,11 @@ class Document:
19
32
  extension = extension.lower()
20
33
  self.accession = accession
21
34
  self.filing_date = filing_date
22
- self.content = content
35
+
36
+ if self.type == 'submission_metadata':
37
+ self.content = convert_bytes_keys(content)
38
+ else:
39
+ self.content = content
23
40
 
24
41
  if path is not None:
25
42
  self.path = path
@@ -157,8 +174,40 @@ class Document:
157
174
  dict_appntc_html
158
175
  elif self.type == 'CB':
159
176
  mapping_dict = dict_cb_html
177
+ elif self.type == 'DSTRBRPT':
178
+ mapping_dict = dict_dstrbrpt_html
179
+ elif self.type == 'N-18F1':
180
+ mapping_dict = dict_n18f1_html
181
+ elif self.type == 'N-CSRS':
182
+ mapping_dict = dict_ncsrs_html
183
+ elif self.type == 'NT-10K':
184
+ mapping_dict = dict_nt10k_html
185
+ elif self.type == 'NT-10Q':
186
+ mapping_dict = dict_nt10q_html
187
+ elif self.type == 'NT 20-F':
188
+ mapping_dict = dict_nt20f_html
189
+ elif self.type == 'NT-NCEN':
190
+ mapping_dict = dict_ntncen_html
191
+ elif self.type == 'NT-NCSR':
192
+ mapping_dict = dict_ntncsr_html
193
+ elif self.type == 'NTFNCEN':
194
+ mapping_dict = dict_ntfcen_html
195
+ elif self.type == 'NTFNCSR':
196
+ mapping_dict = dict_ntfncsr_html
197
+ elif self.type == 'EX-99.CERT':
198
+ mapping_dict = dict_ex99cert_html
199
+ elif self.type == 'SC 13E3':
200
+ mapping_dict = dict_sc13e3_html
201
+ elif self.type == 'SC 14D9':
202
+ mapping_dict = dict_sc14d9_html
203
+ elif self.type == 'SP 15D2':
204
+ mapping_dict = dict_sp15d2_html
205
+
160
206
  elif self.type == 'SD':
161
207
  mapping_dict = dict_sd_html
208
+
209
+ elif self.type == 'T-3':
210
+ mapping_dict = dict_t3_html
162
211
  elif self.type in ['NT 10-K', 'NT 10-Q','NT 20-F']:
163
212
  mapping_dict = dict_nt10k_html
164
213
 
@@ -26,6 +26,12 @@ dict_abs15g_html = {
26
26
  dict_nt10k_html = {
27
27
  ('part',r'^part\s*([ivx]+)') : 0,
28
28
  }
29
+ dict_nt10q_html = dict_nt10k_html
30
+ dict_nt20f_html = dict_nt10k_html
31
+ dict_ntncen_html = dict_nt10k_html
32
+ dict_ntncsr_html = dict_nt10k_html
33
+ dict_ntfcen_html = dict_nt10k_html
34
+ dict_ntfncsr_html = dict_nt10k_html
29
35
 
30
36
  dict_1kpartii_html = {
31
37
  ('item',r'^item\s*(\d+)') : 0,
@@ -72,4 +78,44 @@ dict_appntc_html = {('agency',r'^agency') : 0,
72
78
  dict_cb_html = {
73
79
  ('part', r'^part\s*([ivx]+)') : 0,
74
80
  ('item', r'^item\s*(\d+)') : 1,
75
- }
81
+ }
82
+
83
+ dict_dstrbrpt_html = dict_1kpartii_html
84
+
85
+ dict_n18f1_html = {
86
+ ('notification of election', r'^notification of election') : 0,
87
+ ('signatures', r'^signatures?\.*$') : 0,
88
+ }
89
+
90
+ dict_ex99cert_html = {
91
+ ('item',r'^(\d+)') : 0,
92
+ ('letter',r'^\(?([a-z])') : 1,
93
+ }
94
+
95
+ dict_ncsrs_html = {
96
+ ('item',r'^(\d+)') : 0,
97
+ ('signatures',r'^signatures?\.*$') : 0,
98
+ }
99
+
100
+ dict_sc13e3_html = {
101
+ ('item', r'^item\s*(\d+)') : 0,
102
+ ('signatures', r'^signatures?\.*$') : 0,
103
+ ('letter', r'^\(?([a-z])') : 1,
104
+ }
105
+
106
+ dict_sc14d9_html = {
107
+ ('item', r'^item\s*(\d+)') : 0,
108
+ ('signatures', r'^signatures?\.*$') : 0,
109
+ ('annex', r'^annex') : 0,
110
+ }
111
+
112
+ dict_sp15d2_html = dict_10k_html
113
+
114
+ dict_t3_html = {('general',r'^general'):0,
115
+ ('affiliations',r'^affiliations'):0,
116
+ ('management and control',r'^management and control'):0,
117
+ ('underwriters',r'^underwriters'):0,
118
+ ('capital securities',r'^capital securities'):0,
119
+ ('indenture securities',r'^indenture securities'):0,
120
+ ('signatures',r'^signatures?\.*$') : 0,
121
+ ('number',r'^(\d+)') : 1,}
@@ -9,7 +9,7 @@ async def download_callback(hit, content, cik, accno, url, output_dir="filings",
9
9
  try:
10
10
  # Create a Submission object directly from the content
11
11
  # Note: the content needs to be decoded from bytes to string for the parser
12
- submission = Submission(sgml_content=content.decode('utf-8', errors='replace'),
12
+ submission = Submission(sgml_content=content,
13
13
  keep_document_types=keep_document_types)
14
14
 
15
15
  # Use the async save method to write the submission to disk
@@ -99,6 +99,7 @@ class Downloader:
99
99
  self.downloader._run_coroutine(submission.save_async(output_dir=self.output_dir))
100
100
  self.pbar.update(1)
101
101
  except Exception as e:
102
+ print(f"Exception {e} in {filename}")
102
103
  accession_dir = os.path.join(self.output_dir, filename.split('.')[0])
103
104
  if os.path.exists(accession_dir):
104
105
  shutil.rmtree(accession_dir)
@@ -143,7 +144,7 @@ class Downloader:
143
144
  with dctx.stream_reader(input_buffer) as reader:
144
145
  shutil.copyfileobj(reader, decompressed_content)
145
146
 
146
- content = decompressed_content.getvalue().decode('utf-8')
147
+ content = decompressed_content.getvalue()
147
148
  processor.processing_queue.put((filename, content))
148
149
  return True
149
150
 
@@ -159,7 +160,7 @@ class Downloader:
159
160
 
160
161
  def save_regular_file(self, chunks, filename, output_dir, processor):
161
162
  try:
162
- content = b''.join(chunks).decode('utf-8')
163
+ content = b''.join(chunks)
163
164
  processor.processing_queue.put((filename, content))
164
165
  return True
165
166
 
datamule/submission.py CHANGED
@@ -1,7 +1,7 @@
1
1
  from pathlib import Path
2
2
  import json
3
3
  from .document.document import Document
4
- from secsgml import parse_sgml_submission_into_memory
4
+ from secsgml import parse_sgml_content_into_memory
5
5
  import os
6
6
  import aiofiles
7
7
  import tempfile
@@ -79,9 +79,8 @@ class Submission:
79
79
 
80
80
  if sgml_content is not None:
81
81
  self.path = None
82
- metadata, raw_documents = parse_sgml_submission_into_memory(sgml_content)
82
+ metadata, raw_documents = parse_sgml_content_into_memory(sgml_content)
83
83
  self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=None)
84
-
85
84
  # code dupe
86
85
  self.accession = self.metadata.content['accession-number']
87
86
  self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"
@@ -95,7 +94,9 @@ class Submission:
95
94
  # Keep only specified types
96
95
  if keep_document_types is not None and type not in keep_document_types:
97
96
  continue
98
- filename = doc.get('filename')
97
+
98
+ # write as txt if not declared
99
+ filename = doc.get('filename','.txt')
99
100
  extension = Path(filename).suffix
100
101
  self.documents.append(Document(type=type, content=raw_documents[idx], extension=extension,filing_date=self.filing_date,accession=self.accession))
101
102
 
@@ -190,12 +191,9 @@ class Submission:
190
191
  json.dump(self.metadata.content, f, indent=4)
191
192
 
192
193
  for idx, doc in enumerate(self.metadata.content['documents']):
193
- try:
194
- filename = doc.get('filename')
195
- if filename is None:
196
- filename = f"{doc.get('sequence', idx)}.txt"
197
- except (KeyError, IndexError):
198
- filename = f"{idx}.txt"
194
+ filename = doc.get('filename')
195
+ if filename is None:
196
+ filename = f"{doc.get('sequence')}.txt"
199
197
 
200
198
  doc_path = file_dir / filename
201
199
 
@@ -231,12 +229,11 @@ class Submission:
231
229
  await f.write(json.dumps(self.metadata.content, indent=4))
232
230
 
233
231
  for idx, doc in enumerate(self.metadata.content['documents']):
234
- try:
235
- filename = doc.get('filename')
236
- if filename is None:
237
- filename = f"{doc.get('sequence', idx)}.txt"
238
- except (KeyError, IndexError):
239
- filename = f"{idx}.txt"
232
+ filename = doc.get('filename')
233
+ # oh we need handling here for sequences case
234
+ if filename is None:
235
+ filename = doc['sequence'] + '.txt'
236
+
240
237
 
241
238
  doc_path = file_dir / filename
242
239
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamule
3
- Version: 1.4.4
3
+ Version: 1.4.5
4
4
  Summary: Work with SEC submissions at scale.
5
5
  Home-page: https://github.com/john-friedman/datamule-python
6
6
  Author: John Friedman
@@ -5,10 +5,10 @@ datamule/index.py,sha256=_7Ox5hyF_7RWdblVFr5rNyv_ARwBP7VY4f703pk9qQ8,2074
5
5
  datamule/package_updater.py,sha256=Z9zaa_y0Z5cknpRn8oPea3gg4kquFHfpfhduKKCZ6NU,958
6
6
  datamule/portfolio.py,sha256=8fiK-vfZM5-NJSvOEsDR2YDb-2njjzFk6l7BiRyrzOM,7168
7
7
  datamule/sheet.py,sha256=TvFqK9eAYuVoJ2uWdAlx5EN6vS9lke-aZf7FqtUiDBc,22304
8
- datamule/submission.py,sha256=Yh5nG3ioumhl6z30wJdIEmKjDDNSuo0r2xycZSIaeIg,11035
8
+ datamule/submission.py,sha256=EtWdEnAyWLZdu69Dyzbs4qb5YL41HlExFGMjwEoMhsg,10904
9
9
  datamule/data/listed_filer_metadata.csv,sha256=dT9fQ8AC5P1-Udf_UF0ZkdXJ88jNxJb_tuhi5YYL1rc,2426827
10
10
  datamule/document/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
- datamule/document/document.py,sha256=RQkeBl9VxyjP2_3BT2GyCiwuFrQM-YlJlqMq3SdeIdU,12438
11
+ datamule/document/document.py,sha256=3vX850H7rZH4H8BysitZDaLhT6WPJuIreoV1PSjACno,14301
12
12
  datamule/document/processing.py,sha256=jDCEzBFDSQtq7nQxRScIsbALnFcvMPOkNkMUCa7mFxg,31921
13
13
  datamule/document/table.py,sha256=73yUJKY82ap32jhLmZeTti-jQ_lyhcJGlGwyxLtgYOg,12944
14
14
  datamule/document/mappings/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -36,7 +36,7 @@ datamule/document/mappings/thirteenfhr.py,sha256=XpYRIMPZnGLfEE4TqBI0BPXbyuq0xf3
36
36
  datamule/document/mappings/twentyfivense.py,sha256=lKyj0ZBhkHX9gQJMTUPrQlxYFg3k-aBnWqtoS5bujZM,905
37
37
  datamule/document/mappings/twentyfourf2nt.py,sha256=Q7RPT3JgJHjYdjMuaSyAxclt6QPT_LgCQloxp-ByDuI,4118
38
38
  datamule/mapping_dicts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
39
- datamule/mapping_dicts/html_mapping_dicts.py,sha256=bh3aM0IXKA6v_LhFdO_6akUN7sfp8mhXydeDyVe1ht4,1985
39
+ datamule/mapping_dicts/html_mapping_dicts.py,sha256=rf2cNLhNRuDpYVX_gNFAfR7fbjLb_Wbo114EScj53RU,3321
40
40
  datamule/mapping_dicts/txt_mapping_dicts.py,sha256=DQPrGYbAPQxomRUtt4iiMGrwuF7BHc_LeFBQuYBzU9o,6311
41
41
  datamule/mapping_dicts/xml_mapping_dicts.py,sha256=Z22yDVwKYonUfM5foQP00dVDE8EHhhMKp0CLqVKV5OI,438
42
42
  datamule/sec/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -44,7 +44,7 @@ datamule/sec/utils.py,sha256=JUxwijJiqRMnRJNQzVUamyF5h9ZGc7RnO_zsLOIM73g,2079
44
44
  datamule/sec/infrastructure/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
45
45
  datamule/sec/infrastructure/submissions_metadata.py,sha256=f1KarzFSryKm0EV8DCDNsBw5Jv0Tx5aljiGUJkk7DRk,18745
46
46
  datamule/sec/submissions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
47
- datamule/sec/submissions/downloader.py,sha256=60wX2Yml1UCuxOtU0xMxqqeyHhrypCmlDQ0jZF-StJo,2665
47
+ datamule/sec/submissions/downloader.py,sha256=izaz559PtBCAWPWGzqUReloawJtXwnraclgXdzEOteI,2631
48
48
  datamule/sec/submissions/eftsquery.py,sha256=mSZon8rlW8dxma7M49ZW5V02Fn-ENOdt9TNO6elBrhE,27983
49
49
  datamule/sec/submissions/monitor.py,sha256=dZYuVCi_X82eYA8l_9cbnkRjiawz3K4U-FnCAyJcgk4,7892
50
50
  datamule/sec/submissions/streamer.py,sha256=EXyWNCD9N6mZmvm9lFSCFodF19zSQ8jfIbWPZNp0K5Y,11253
@@ -56,9 +56,9 @@ datamule/sec/xbrl/streamcompanyfacts.py,sha256=Qq88PqW5_j1k3Aqrl0KRmKeF54D6Wbb6H
56
56
  datamule/sec/xbrl/xbrlmonitor.py,sha256=TKFVfSyyUUfUgFQw4WxEVs4g8Nh-2C0tygNIRmTqW3Y,5848
57
57
  datamule/seclibrary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
58
58
  datamule/seclibrary/bq.py,sha256=C8sb_rpXTvchprrFLcbRar4Qi0XWW25tnv1YsHSS5o4,18025
59
- datamule/seclibrary/downloader.py,sha256=PIgz_7ASUTZOHcUZGcD1SmLaGSbq7xe7EiJT0Z7HU4M,13653
59
+ datamule/seclibrary/downloader.py,sha256=VIdaQq5wDcYWnqrv9t8J7z0KtdNRGK8ahfBsgvTfdQQ,13675
60
60
  datamule/seclibrary/query.py,sha256=qGuursTERRbOGfoDcYcpo4oWkW3PCBW6x1Qf1Puiak4,7352
61
- datamule-1.4.4.dist-info/METADATA,sha256=-_m15CLGbd7csE_aiXo9Q54wMCqFjW0VfN1QJKplQtY,469
62
- datamule-1.4.4.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
63
- datamule-1.4.4.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
64
- datamule-1.4.4.dist-info/RECORD,,
61
+ datamule-1.4.5.dist-info/METADATA,sha256=oKn8LWnuOVozqGyVe7ggRcCmoyjy7vwZbgZG7AKYWGY,469
62
+ datamule-1.4.5.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
63
+ datamule-1.4.5.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
64
+ datamule-1.4.5.dist-info/RECORD,,