datamule 1.4.4__tar.gz → 1.4.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. {datamule-1.4.4 → datamule-1.4.5}/PKG-INFO +1 -1
  2. {datamule-1.4.4 → datamule-1.4.5}/datamule/document/document.py +50 -1
  3. {datamule-1.4.4 → datamule-1.4.5}/datamule/mapping_dicts/html_mapping_dicts.py +47 -1
  4. {datamule-1.4.4 → datamule-1.4.5}/datamule/sec/submissions/downloader.py +1 -1
  5. {datamule-1.4.4 → datamule-1.4.5}/datamule/seclibrary/downloader.py +3 -2
  6. {datamule-1.4.4 → datamule-1.4.5}/datamule/submission.py +13 -16
  7. {datamule-1.4.4 → datamule-1.4.5}/datamule.egg-info/PKG-INFO +1 -1
  8. {datamule-1.4.4 → datamule-1.4.5}/setup.py +1 -1
  9. {datamule-1.4.4 → datamule-1.4.5}/datamule/__init__.py +0 -0
  10. {datamule-1.4.4 → datamule-1.4.5}/datamule/config.py +0 -0
  11. {datamule-1.4.4 → datamule-1.4.5}/datamule/data/listed_filer_metadata.csv +0 -0
  12. {datamule-1.4.4 → datamule-1.4.5}/datamule/document/__init__.py +0 -0
  13. {datamule-1.4.4 → datamule-1.4.5}/datamule/document/mappings/__init__.py +0 -0
  14. {datamule-1.4.4 → datamule-1.4.5}/datamule/document/mappings/atsn.py +0 -0
  15. {datamule-1.4.4 → datamule-1.4.5}/datamule/document/mappings/cfportal.py +0 -0
  16. {datamule-1.4.4 → datamule-1.4.5}/datamule/document/mappings/d.py +0 -0
  17. {datamule-1.4.4 → datamule-1.4.5}/datamule/document/mappings/ex102_abs.py +0 -0
  18. {datamule-1.4.4 → datamule-1.4.5}/datamule/document/mappings/ex99a_sdr.py +0 -0
  19. {datamule-1.4.4 → datamule-1.4.5}/datamule/document/mappings/ex99c_sdr.py +0 -0
  20. {datamule-1.4.4 → datamule-1.4.5}/datamule/document/mappings/ex99g_sdr.py +0 -0
  21. {datamule-1.4.4 → datamule-1.4.5}/datamule/document/mappings/ex99i_sdr.py +0 -0
  22. {datamule-1.4.4 → datamule-1.4.5}/datamule/document/mappings/information_table.py +0 -0
  23. {datamule-1.4.4 → datamule-1.4.5}/datamule/document/mappings/nmfp.py +0 -0
  24. {datamule-1.4.4 → datamule-1.4.5}/datamule/document/mappings/npx.py +0 -0
  25. {datamule-1.4.4 → datamule-1.4.5}/datamule/document/mappings/onefourtyfour.py +0 -0
  26. {datamule-1.4.4 → datamule-1.4.5}/datamule/document/mappings/ownership.py +0 -0
  27. {datamule-1.4.4 → datamule-1.4.5}/datamule/document/mappings/proxy_voting_record.py +0 -0
  28. {datamule-1.4.4 → datamule-1.4.5}/datamule/document/mappings/sbs.py +0 -0
  29. {datamule-1.4.4 → datamule-1.4.5}/datamule/document/mappings/sbsef.py +0 -0
  30. {datamule-1.4.4 → datamule-1.4.5}/datamule/document/mappings/schedule13.py +0 -0
  31. {datamule-1.4.4 → datamule-1.4.5}/datamule/document/mappings/sdr.py +0 -0
  32. {datamule-1.4.4 → datamule-1.4.5}/datamule/document/mappings/submission_metadata.py +0 -0
  33. {datamule-1.4.4 → datamule-1.4.5}/datamule/document/mappings/ta.py +0 -0
  34. {datamule-1.4.4 → datamule-1.4.5}/datamule/document/mappings/thirteenfhr.py +0 -0
  35. {datamule-1.4.4 → datamule-1.4.5}/datamule/document/mappings/twentyfivense.py +0 -0
  36. {datamule-1.4.4 → datamule-1.4.5}/datamule/document/mappings/twentyfourf2nt.py +0 -0
  37. {datamule-1.4.4 → datamule-1.4.5}/datamule/document/processing.py +0 -0
  38. {datamule-1.4.4 → datamule-1.4.5}/datamule/document/table.py +0 -0
  39. {datamule-1.4.4 → datamule-1.4.5}/datamule/helper.py +0 -0
  40. {datamule-1.4.4 → datamule-1.4.5}/datamule/index.py +0 -0
  41. {datamule-1.4.4 → datamule-1.4.5}/datamule/mapping_dicts/__init__.py +0 -0
  42. {datamule-1.4.4 → datamule-1.4.5}/datamule/mapping_dicts/txt_mapping_dicts.py +0 -0
  43. {datamule-1.4.4 → datamule-1.4.5}/datamule/mapping_dicts/xml_mapping_dicts.py +0 -0
  44. {datamule-1.4.4 → datamule-1.4.5}/datamule/package_updater.py +0 -0
  45. {datamule-1.4.4 → datamule-1.4.5}/datamule/portfolio.py +0 -0
  46. {datamule-1.4.4 → datamule-1.4.5}/datamule/sec/__init__.py +0 -0
  47. {datamule-1.4.4 → datamule-1.4.5}/datamule/sec/infrastructure/__init__.py +0 -0
  48. {datamule-1.4.4 → datamule-1.4.5}/datamule/sec/infrastructure/submissions_metadata.py +0 -0
  49. {datamule-1.4.4 → datamule-1.4.5}/datamule/sec/submissions/__init__.py +0 -0
  50. {datamule-1.4.4 → datamule-1.4.5}/datamule/sec/submissions/eftsquery.py +0 -0
  51. {datamule-1.4.4 → datamule-1.4.5}/datamule/sec/submissions/monitor.py +0 -0
  52. {datamule-1.4.4 → datamule-1.4.5}/datamule/sec/submissions/streamer.py +0 -0
  53. {datamule-1.4.4 → datamule-1.4.5}/datamule/sec/submissions/textsearch.py +0 -0
  54. {datamule-1.4.4 → datamule-1.4.5}/datamule/sec/utils.py +0 -0
  55. {datamule-1.4.4 → datamule-1.4.5}/datamule/sec/xbrl/__init__.py +0 -0
  56. {datamule-1.4.4 → datamule-1.4.5}/datamule/sec/xbrl/downloadcompanyfacts.py +0 -0
  57. {datamule-1.4.4 → datamule-1.4.5}/datamule/sec/xbrl/filter_xbrl.py +0 -0
  58. {datamule-1.4.4 → datamule-1.4.5}/datamule/sec/xbrl/streamcompanyfacts.py +0 -0
  59. {datamule-1.4.4 → datamule-1.4.5}/datamule/sec/xbrl/xbrlmonitor.py +0 -0
  60. {datamule-1.4.4 → datamule-1.4.5}/datamule/seclibrary/__init__.py +0 -0
  61. {datamule-1.4.4 → datamule-1.4.5}/datamule/seclibrary/bq.py +0 -0
  62. {datamule-1.4.4 → datamule-1.4.5}/datamule/seclibrary/query.py +0 -0
  63. {datamule-1.4.4 → datamule-1.4.5}/datamule/sheet.py +0 -0
  64. {datamule-1.4.4 → datamule-1.4.5}/datamule.egg-info/SOURCES.txt +0 -0
  65. {datamule-1.4.4 → datamule-1.4.5}/datamule.egg-info/dependency_links.txt +0 -0
  66. {datamule-1.4.4 → datamule-1.4.5}/datamule.egg-info/requires.txt +0 -0
  67. {datamule-1.4.4 → datamule-1.4.5}/datamule.egg-info/top_level.txt +0 -0
  68. {datamule-1.4.4 → datamule-1.4.5}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamule
3
- Version: 1.4.4
3
+ Version: 1.4.5
4
4
  Summary: Work with SEC submissions at scale.
5
5
  Home-page: https://github.com/john-friedman/datamule-python
6
6
  Author: John Friedman
@@ -12,6 +12,19 @@ from .processing import process_tabular_data
12
12
  from pathlib import Path
13
13
  import webbrowser
14
14
 
15
+ def convert_bytes_keys(obj):
16
+ if isinstance(obj, dict):
17
+ return {
18
+ (k.decode('utf-8').lower() if isinstance(k, bytes) else k): convert_bytes_keys(v)
19
+ for k, v in obj.items()
20
+ }
21
+ elif isinstance(obj, list):
22
+ return [convert_bytes_keys(item) for item in obj]
23
+ elif isinstance(obj, bytes):
24
+ return obj.decode('utf-8').lower()
25
+ else:
26
+ return obj
27
+
15
28
  class Document:
16
29
  def __init__(self, type, content, extension,accession,filing_date,path=None):
17
30
 
@@ -19,7 +32,11 @@ class Document:
19
32
  extension = extension.lower()
20
33
  self.accession = accession
21
34
  self.filing_date = filing_date
22
- self.content = content
35
+
36
+ if self.type == 'submission_metadata':
37
+ self.content = convert_bytes_keys(content)
38
+ else:
39
+ self.content = content
23
40
 
24
41
  if path is not None:
25
42
  self.path = path
@@ -157,8 +174,40 @@ class Document:
157
174
  dict_appntc_html
158
175
  elif self.type == 'CB':
159
176
  mapping_dict = dict_cb_html
177
+ elif self.type == 'DSTRBRPT':
178
+ mapping_dict = dict_dstrbrpt_html
179
+ elif self.type == 'N-18F1':
180
+ mapping_dict = dict_n18f1_html
181
+ elif self.type == 'N-CSRS':
182
+ mapping_dict = dict_ncsrs_html
183
+ elif self.type == 'NT-10K':
184
+ mapping_dict = dict_nt10k_html
185
+ elif self.type == 'NT-10Q':
186
+ mapping_dict = dict_nt10q_html
187
+ elif self.type == 'NT 20-F':
188
+ mapping_dict = dict_nt20f_html
189
+ elif self.type == 'NT-NCEN':
190
+ mapping_dict = dict_ntncen_html
191
+ elif self.type == 'NT-NCSR':
192
+ mapping_dict = dict_ntncsr_html
193
+ elif self.type == 'NTFNCEN':
194
+ mapping_dict = dict_ntfcen_html
195
+ elif self.type == 'NTFNCSR':
196
+ mapping_dict = dict_ntfncsr_html
197
+ elif self.type == 'EX-99.CERT':
198
+ mapping_dict = dict_ex99cert_html
199
+ elif self.type == 'SC 13E3':
200
+ mapping_dict = dict_sc13e3_html
201
+ elif self.type == 'SC 14D9':
202
+ mapping_dict = dict_sc14d9_html
203
+ elif self.type == 'SP 15D2':
204
+ mapping_dict = dict_sp15d2_html
205
+
160
206
  elif self.type == 'SD':
161
207
  mapping_dict = dict_sd_html
208
+
209
+ elif self.type == 'T-3':
210
+ mapping_dict = dict_t3_html
162
211
  elif self.type in ['NT 10-K', 'NT 10-Q','NT 20-F']:
163
212
  mapping_dict = dict_nt10k_html
164
213
 
@@ -26,6 +26,12 @@ dict_abs15g_html = {
26
26
  dict_nt10k_html = {
27
27
  ('part',r'^part\s*([ivx]+)') : 0,
28
28
  }
29
+ dict_nt10q_html = dict_nt10k_html
30
+ dict_nt20f_html = dict_nt10k_html
31
+ dict_ntncen_html = dict_nt10k_html
32
+ dict_ntncsr_html = dict_nt10k_html
33
+ dict_ntfcen_html = dict_nt10k_html
34
+ dict_ntfncsr_html = dict_nt10k_html
29
35
 
30
36
  dict_1kpartii_html = {
31
37
  ('item',r'^item\s*(\d+)') : 0,
@@ -72,4 +78,44 @@ dict_appntc_html = {('agency',r'^agency') : 0,
72
78
  dict_cb_html = {
73
79
  ('part', r'^part\s*([ivx]+)') : 0,
74
80
  ('item', r'^item\s*(\d+)') : 1,
75
- }
81
+ }
82
+
83
+ dict_dstrbrpt_html = dict_1kpartii_html
84
+
85
+ dict_n18f1_html = {
86
+ ('notification of election', r'^notification of election') : 0,
87
+ ('signatures', r'^signatures?\.*$') : 0,
88
+ }
89
+
90
+ dict_ex99cert_html = {
91
+ ('item',r'^(\d+)') : 0,
92
+ ('letter',r'^\(?([a-z])') : 1,
93
+ }
94
+
95
+ dict_ncsrs_html = {
96
+ ('item',r'^(\d+)') : 0,
97
+ ('signatures',r'^signatures?\.*$') : 0,
98
+ }
99
+
100
+ dict_sc13e3_html = {
101
+ ('item', r'^item\s*(\d+)') : 0,
102
+ ('signatures', r'^signatures?\.*$') : 0,
103
+ ('letter', r'^\(?([a-z])') : 1,
104
+ }
105
+
106
+ dict_sc14d9_html = {
107
+ ('item', r'^item\s*(\d+)') : 0,
108
+ ('signatures', r'^signatures?\.*$') : 0,
109
+ ('annex', r'^annex') : 0,
110
+ }
111
+
112
+ dict_sp15d2_html = dict_10k_html
113
+
114
+ dict_t3_html = {('general',r'^general'):0,
115
+ ('affiliations',r'^affiliations'):0,
116
+ ('management and control',r'^management and control'):0,
117
+ ('underwriters',r'^underwriters'):0,
118
+ ('capital securities',r'^capital securities'):0,
119
+ ('indenture securities',r'^indenture securities'):0,
120
+ ('signatures',r'^signatures?\.*$') : 0,
121
+ ('number',r'^(\d+)') : 1,}
@@ -9,7 +9,7 @@ async def download_callback(hit, content, cik, accno, url, output_dir="filings",
9
9
  try:
10
10
  # Create a Submission object directly from the content
11
11
  # Note: the content needs to be decoded from bytes to string for the parser
12
- submission = Submission(sgml_content=content.decode('utf-8', errors='replace'),
12
+ submission = Submission(sgml_content=content,
13
13
  keep_document_types=keep_document_types)
14
14
 
15
15
  # Use the async save method to write the submission to disk
@@ -99,6 +99,7 @@ class Downloader:
99
99
  self.downloader._run_coroutine(submission.save_async(output_dir=self.output_dir))
100
100
  self.pbar.update(1)
101
101
  except Exception as e:
102
+ print(f"Exception {e} in {filename}")
102
103
  accession_dir = os.path.join(self.output_dir, filename.split('.')[0])
103
104
  if os.path.exists(accession_dir):
104
105
  shutil.rmtree(accession_dir)
@@ -143,7 +144,7 @@ class Downloader:
143
144
  with dctx.stream_reader(input_buffer) as reader:
144
145
  shutil.copyfileobj(reader, decompressed_content)
145
146
 
146
- content = decompressed_content.getvalue().decode('utf-8')
147
+ content = decompressed_content.getvalue()
147
148
  processor.processing_queue.put((filename, content))
148
149
  return True
149
150
 
@@ -159,7 +160,7 @@ class Downloader:
159
160
 
160
161
  def save_regular_file(self, chunks, filename, output_dir, processor):
161
162
  try:
162
- content = b''.join(chunks).decode('utf-8')
163
+ content = b''.join(chunks)
163
164
  processor.processing_queue.put((filename, content))
164
165
  return True
165
166
 
@@ -1,7 +1,7 @@
1
1
  from pathlib import Path
2
2
  import json
3
3
  from .document.document import Document
4
- from secsgml import parse_sgml_submission_into_memory
4
+ from secsgml import parse_sgml_content_into_memory
5
5
  import os
6
6
  import aiofiles
7
7
  import tempfile
@@ -79,9 +79,8 @@ class Submission:
79
79
 
80
80
  if sgml_content is not None:
81
81
  self.path = None
82
- metadata, raw_documents = parse_sgml_submission_into_memory(sgml_content)
82
+ metadata, raw_documents = parse_sgml_content_into_memory(sgml_content)
83
83
  self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=None)
84
-
85
84
  # code dupe
86
85
  self.accession = self.metadata.content['accession-number']
87
86
  self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"
@@ -95,7 +94,9 @@ class Submission:
95
94
  # Keep only specified types
96
95
  if keep_document_types is not None and type not in keep_document_types:
97
96
  continue
98
- filename = doc.get('filename')
97
+
98
+ # write as txt if not declared
99
+ filename = doc.get('filename','.txt')
99
100
  extension = Path(filename).suffix
100
101
  self.documents.append(Document(type=type, content=raw_documents[idx], extension=extension,filing_date=self.filing_date,accession=self.accession))
101
102
 
@@ -190,12 +191,9 @@ class Submission:
190
191
  json.dump(self.metadata.content, f, indent=4)
191
192
 
192
193
  for idx, doc in enumerate(self.metadata.content['documents']):
193
- try:
194
- filename = doc.get('filename')
195
- if filename is None:
196
- filename = f"{doc.get('sequence', idx)}.txt"
197
- except (KeyError, IndexError):
198
- filename = f"{idx}.txt"
194
+ filename = doc.get('filename')
195
+ if filename is None:
196
+ filename = f"{doc.get('sequence')}.txt"
199
197
 
200
198
  doc_path = file_dir / filename
201
199
 
@@ -231,12 +229,11 @@ class Submission:
231
229
  await f.write(json.dumps(self.metadata.content, indent=4))
232
230
 
233
231
  for idx, doc in enumerate(self.metadata.content['documents']):
234
- try:
235
- filename = doc.get('filename')
236
- if filename is None:
237
- filename = f"{doc.get('sequence', idx)}.txt"
238
- except (KeyError, IndexError):
239
- filename = f"{idx}.txt"
232
+ filename = doc.get('filename')
233
+ # oh we need handling here for sequences case
234
+ if filename is None:
235
+ filename = doc['sequence'] + '.txt'
236
+
240
237
 
241
238
  doc_path = file_dir / filename
242
239
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamule
3
- Version: 1.4.4
3
+ Version: 1.4.5
4
4
  Summary: Work with SEC submissions at scale.
5
5
  Home-page: https://github.com/john-friedman/datamule-python
6
6
  Author: John Friedman
@@ -32,7 +32,7 @@ if not os.path.exists(file_path):
32
32
  setup(
33
33
  name="datamule",
34
34
  author="John Friedman",
35
- version="1.4.4",
35
+ version="1.4.5",
36
36
  description="Work with SEC submissions at scale.",
37
37
  packages=find_packages(include=['datamule', 'datamule.*']),
38
38
  url="https://github.com/john-friedman/datamule-python",
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes