datamule 1.4.0__py3-none-any.whl → 1.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,10 +3,10 @@ import csv
3
3
  import re
4
4
  from doc2dict import xml2dict, txt2dict, dict2dict
5
5
  from doc2dict.mapping import flatten_hierarchy
6
- from doc2dict import html2dict, visualize_dict, get_title, unnest_dict
6
+ from doc2dict import html2dict, visualize_dict, get_title, unnest_dict, pdf2dict
7
7
  from ..mapping_dicts.txt_mapping_dicts import dict_10k, dict_10q, dict_8k, dict_13d, dict_13g
8
8
  from ..mapping_dicts.xml_mapping_dicts import dict_345
9
- from ..mapping_dicts.html_mapping_dicts import dict_10k_html, dict_10q_html, dict_8k_html
9
+ from ..mapping_dicts.html_mapping_dicts import *
10
10
  from selectolax.parser import HTMLParser
11
11
  from .processing import process_tabular_data
12
12
  from pathlib import Path
@@ -120,12 +120,31 @@ class Document:
120
120
  self.data = {}
121
121
  self.data['document'] = dict2dict(txt2dict(content=content, mapping_dict=mapping_dict))
122
122
  elif self.extension in ['.htm', '.html']:
123
- if self.type == '10-K':
123
+
124
+ if self.type == '1-K':
125
+ mapping_dict = dict_1kpartii_html
126
+ elif self.type == '1-SA':
127
+ mapping_dict = dict_1sa_html
128
+ elif self.type == '1-U':
129
+ mapping_dict = dict_1u_html
130
+ elif self.type == '10-12B':
131
+ mapping_dict = dict_1012b_html
132
+ elif self.type == '10-D':
133
+ mapping_dict = dict_10d_html
134
+ elif self.type == '10-K':
124
135
  mapping_dict = dict_10k_html
125
136
  elif self.type == '10-Q':
126
137
  mapping_dict = dict_10q_html
138
+ elif self.type == '20-F':
139
+ mapping_dict = dict_20f_html
127
140
  elif self.type == '8-K':
128
141
  mapping_dict = dict_8k_html
142
+ elif self.type == 'ABS-15G':
143
+ mapping_dict = dict_abs15g_html
144
+ elif self.type == 'SD':
145
+ mapping_dict = dict_sd_html
146
+ elif self.type in ['NT 10-K', 'NT 10-Q','NT 20-F']:
147
+ mapping_dict = dict_nt10k_html
129
148
 
130
149
  dct = html2dict(content=self.content, mapping_dict=mapping_dict)
131
150
  self.data = dct
@@ -134,6 +153,8 @@ class Document:
134
153
  mapping_dict = dict_345
135
154
 
136
155
  self.data = xml2dict(content=self.content, mapping_dict=mapping_dict)
156
+ elif self.extension == '.pdf':
157
+ self.data = pdf2dict(content=self.content, mapping_dict=mapping_dict)
137
158
  else:
138
159
  pass
139
160
 
@@ -235,8 +256,7 @@ class Document:
235
256
  return result
236
257
 
237
258
 
238
- # this will all have to be changed. default will be to flatten everything
239
- # candidate for deletion
259
+ # TODO CHANGE THIS
240
260
  def __iter__(self):
241
261
  self.parse()
242
262
 
@@ -8,4 +8,41 @@ dict_10q_html = dict_10k_html
8
8
  dict_8k_html = {
9
9
  ('signatures',r'^signatures?\.*$') : 0,
10
10
  ('item',r'^item\s*(\d+\.\d+)') : 0,
11
+ }
12
+
13
+ dict_sd_html = {
14
+ ('signatures',r'^signatures?\.*$') : 0,
15
+ ('item',r'^item\s*(\d+\.\d+)') : 0,
16
+ }
17
+
18
+ dict_abs15g_html = {
19
+ ('part',r'^part\s*([ivx]+)') : 0,
20
+ ('signatures',r'^signatures?\.*$') : 0,
21
+ ('item',r'^item\s*(\d+\.\d+)') : 1,
22
+ }
23
+
24
+
25
+
26
+ dict_nt10k_html = {
27
+ ('part',r'^part\s*([ivx]+)') : 0,
28
+ }
29
+
30
+ dict_1kpartii_html = {
31
+ ('item',r'^item\s*(\d+)') : 1,
32
+ }
33
+
34
+ dict_1sa_html = dict_1kpartii_html
35
+
36
+ dict_1u_html = {('item',r'^item\s*(\d+)') : 1,
37
+ ('signatures',r'^signatures?\.*$') : 1,}
38
+
39
+ dict_1012b_html = dict_1u_html
40
+
41
+ dict_10d_html = dict_10k_html
42
+
43
+ dict_20f_html = {
44
+ ('part',r'^part\s*([ivx]+)') : 0,
45
+ ('item',r'^item\s*(\d+)\.?([a-z])?') : 1,
46
+ ('letter',r'\d*\.?([a-z])') : 2,
47
+ ('signatures',r'^signatures?\.*$') : 0,
11
48
  }
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamule
3
- Version: 1.4.0
3
+ Version: 1.4.3
4
4
  Summary: Work with SEC submissions at scale.
5
5
  Home-page: https://github.com/john-friedman/datamule-python
6
6
  Author: John Friedman
@@ -8,7 +8,7 @@ datamule/sheet.py,sha256=TvFqK9eAYuVoJ2uWdAlx5EN6vS9lke-aZf7FqtUiDBc,22304
8
8
  datamule/submission.py,sha256=Yh5nG3ioumhl6z30wJdIEmKjDDNSuo0r2xycZSIaeIg,11035
9
9
  datamule/data/listed_filer_metadata.csv,sha256=dT9fQ8AC5P1-Udf_UF0ZkdXJ88jNxJb_tuhi5YYL1rc,2426827
10
10
  datamule/document/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
- datamule/document/document.py,sha256=fQ7woCklxFwzABvjdFknFR8AFUwPuEEfuXZ_VE5q-7Y,10954
11
+ datamule/document/document.py,sha256=qjWHGyhvwrwZqYnIUiAqmzlqNtS_ybG6neFjCdCL1vA,11758
12
12
  datamule/document/processing.py,sha256=jDCEzBFDSQtq7nQxRScIsbALnFcvMPOkNkMUCa7mFxg,31921
13
13
  datamule/document/table.py,sha256=73yUJKY82ap32jhLmZeTti-jQ_lyhcJGlGwyxLtgYOg,12944
14
14
  datamule/document/mappings/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -36,7 +36,7 @@ datamule/document/mappings/thirteenfhr.py,sha256=XpYRIMPZnGLfEE4TqBI0BPXbyuq0xf3
36
36
  datamule/document/mappings/twentyfivense.py,sha256=lKyj0ZBhkHX9gQJMTUPrQlxYFg3k-aBnWqtoS5bujZM,905
37
37
  datamule/document/mappings/twentyfourf2nt.py,sha256=Q7RPT3JgJHjYdjMuaSyAxclt6QPT_LgCQloxp-ByDuI,4118
38
38
  datamule/mapping_dicts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
39
- datamule/mapping_dicts/html_mapping_dicts.py,sha256=Tfg7wX6EUUtsg_HD2KHH3Z6S3GR9FZc1pwL8kCGfRRU,282
39
+ datamule/mapping_dicts/html_mapping_dicts.py,sha256=RMHrjosQyS8KlDJnyMKfz65Y9bDo4mJpLDEtaiF0s_s,1046
40
40
  datamule/mapping_dicts/txt_mapping_dicts.py,sha256=DQPrGYbAPQxomRUtt4iiMGrwuF7BHc_LeFBQuYBzU9o,6311
41
41
  datamule/mapping_dicts/xml_mapping_dicts.py,sha256=Z22yDVwKYonUfM5foQP00dVDE8EHhhMKp0CLqVKV5OI,438
42
42
  datamule/sec/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -58,7 +58,7 @@ datamule/seclibrary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSu
58
58
  datamule/seclibrary/bq.py,sha256=C8sb_rpXTvchprrFLcbRar4Qi0XWW25tnv1YsHSS5o4,18025
59
59
  datamule/seclibrary/downloader.py,sha256=PIgz_7ASUTZOHcUZGcD1SmLaGSbq7xe7EiJT0Z7HU4M,13653
60
60
  datamule/seclibrary/query.py,sha256=qGuursTERRbOGfoDcYcpo4oWkW3PCBW6x1Qf1Puiak4,7352
61
- datamule-1.4.0.dist-info/METADATA,sha256=6H4Zzj37KnA56z-_XNAEv2NNpW3qKiEV_sul921KshM,469
62
- datamule-1.4.0.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
63
- datamule-1.4.0.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
64
- datamule-1.4.0.dist-info/RECORD,,
61
+ datamule-1.4.3.dist-info/METADATA,sha256=kH9lAMkixkxGm_dMQ7PrWAOThMjKKPEzIaz1qUC5mAc,469
62
+ datamule-1.4.3.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
63
+ datamule-1.4.3.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
64
+ datamule-1.4.3.dist-info/RECORD,,