datamule 1.4.0__tar.gz → 1.4.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datamule-1.4.0 → datamule-1.4.3}/PKG-INFO +1 -1
- {datamule-1.4.0 → datamule-1.4.3}/datamule/document/document.py +25 -5
- datamule-1.4.3/datamule/mapping_dicts/html_mapping_dicts.py +48 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule.egg-info/PKG-INFO +1 -1
- {datamule-1.4.0 → datamule-1.4.3}/setup.py +1 -1
- datamule-1.4.0/datamule/mapping_dicts/html_mapping_dicts.py +0 -11
- {datamule-1.4.0 → datamule-1.4.3}/datamule/__init__.py +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule/config.py +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule/data/listed_filer_metadata.csv +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule/document/__init__.py +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule/document/mappings/__init__.py +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule/document/mappings/atsn.py +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule/document/mappings/cfportal.py +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule/document/mappings/d.py +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule/document/mappings/ex102_abs.py +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule/document/mappings/ex99a_sdr.py +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule/document/mappings/ex99c_sdr.py +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule/document/mappings/ex99g_sdr.py +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule/document/mappings/ex99i_sdr.py +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule/document/mappings/information_table.py +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule/document/mappings/nmfp.py +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule/document/mappings/npx.py +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule/document/mappings/onefourtyfour.py +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule/document/mappings/ownership.py +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule/document/mappings/proxy_voting_record.py +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule/document/mappings/sbs.py +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule/document/mappings/sbsef.py +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule/document/mappings/schedule13.py +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule/document/mappings/sdr.py +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule/document/mappings/submission_metadata.py +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule/document/mappings/ta.py +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule/document/mappings/thirteenfhr.py +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule/document/mappings/twentyfivense.py +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule/document/mappings/twentyfourf2nt.py +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule/document/processing.py +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule/document/table.py +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule/helper.py +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule/index.py +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule/mapping_dicts/__init__.py +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule/mapping_dicts/txt_mapping_dicts.py +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule/mapping_dicts/xml_mapping_dicts.py +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule/package_updater.py +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule/portfolio.py +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule/sec/__init__.py +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule/sec/infrastructure/__init__.py +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule/sec/infrastructure/submissions_metadata.py +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule/sec/submissions/__init__.py +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule/sec/submissions/downloader.py +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule/sec/submissions/eftsquery.py +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule/sec/submissions/monitor.py +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule/sec/submissions/streamer.py +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule/sec/submissions/textsearch.py +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule/sec/utils.py +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule/sec/xbrl/__init__.py +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule/sec/xbrl/downloadcompanyfacts.py +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule/sec/xbrl/filter_xbrl.py +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule/sec/xbrl/streamcompanyfacts.py +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule/sec/xbrl/xbrlmonitor.py +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule/seclibrary/__init__.py +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule/seclibrary/bq.py +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule/seclibrary/downloader.py +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule/seclibrary/query.py +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule/sheet.py +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule/submission.py +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule.egg-info/SOURCES.txt +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule.egg-info/dependency_links.txt +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule.egg-info/requires.txt +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/datamule.egg-info/top_level.txt +0 -0
- {datamule-1.4.0 → datamule-1.4.3}/setup.cfg +0 -0
@@ -3,10 +3,10 @@ import csv
|
|
3
3
|
import re
|
4
4
|
from doc2dict import xml2dict, txt2dict, dict2dict
|
5
5
|
from doc2dict.mapping import flatten_hierarchy
|
6
|
-
from doc2dict import html2dict, visualize_dict, get_title, unnest_dict
|
6
|
+
from doc2dict import html2dict, visualize_dict, get_title, unnest_dict, pdf2dict
|
7
7
|
from ..mapping_dicts.txt_mapping_dicts import dict_10k, dict_10q, dict_8k, dict_13d, dict_13g
|
8
8
|
from ..mapping_dicts.xml_mapping_dicts import dict_345
|
9
|
-
from ..mapping_dicts.html_mapping_dicts import dict_10k_html, dict_10q_html, dict_8k_html
|
9
|
+
from ..mapping_dicts.html_mapping_dicts import *
|
10
10
|
from selectolax.parser import HTMLParser
|
11
11
|
from .processing import process_tabular_data
|
12
12
|
from pathlib import Path
|
@@ -120,12 +120,31 @@ class Document:
|
|
120
120
|
self.data = {}
|
121
121
|
self.data['document'] = dict2dict(txt2dict(content=content, mapping_dict=mapping_dict))
|
122
122
|
elif self.extension in ['.htm', '.html']:
|
123
|
-
|
123
|
+
|
124
|
+
if self.type == '1-K':
|
125
|
+
mapping_dict = dict_1kpartii_html
|
126
|
+
elif self.type == '1-SA':
|
127
|
+
mapping_dict = dict_1sa_html
|
128
|
+
elif self.type == '1-U':
|
129
|
+
mapping_dict = dict_1u_html
|
130
|
+
elif self.type == '10-12B':
|
131
|
+
mapping_dict = dict_1012b_html
|
132
|
+
elif self.type == '10-D':
|
133
|
+
mapping_dict = dict_10d_html
|
134
|
+
elif self.type == '10-K':
|
124
135
|
mapping_dict = dict_10k_html
|
125
136
|
elif self.type == '10-Q':
|
126
137
|
mapping_dict = dict_10q_html
|
138
|
+
elif self.type == '20-F':
|
139
|
+
mapping_dict = dict_20f_html
|
127
140
|
elif self.type == '8-K':
|
128
141
|
mapping_dict = dict_8k_html
|
142
|
+
elif self.type == 'ABS-15G':
|
143
|
+
mapping_dict = dict_abs15g_html
|
144
|
+
elif self.type == 'SD':
|
145
|
+
mapping_dict = dict_sd_html
|
146
|
+
elif self.type in ['NT 10-K', 'NT 10-Q','NT 20-F']:
|
147
|
+
mapping_dict = dict_nt10k_html
|
129
148
|
|
130
149
|
dct = html2dict(content=self.content, mapping_dict=mapping_dict)
|
131
150
|
self.data = dct
|
@@ -134,6 +153,8 @@ class Document:
|
|
134
153
|
mapping_dict = dict_345
|
135
154
|
|
136
155
|
self.data = xml2dict(content=self.content, mapping_dict=mapping_dict)
|
156
|
+
elif self.extension == '.pdf':
|
157
|
+
self.data = pdf2dict(content=self.content, mapping_dict=mapping_dict)
|
137
158
|
else:
|
138
159
|
pass
|
139
160
|
|
@@ -235,8 +256,7 @@ class Document:
|
|
235
256
|
return result
|
236
257
|
|
237
258
|
|
238
|
-
#
|
239
|
-
# candidate for deletion
|
259
|
+
# TODO CHANGE THIS
|
240
260
|
def __iter__(self):
|
241
261
|
self.parse()
|
242
262
|
|
@@ -0,0 +1,48 @@
|
|
1
|
+
dict_10k_html = {
|
2
|
+
('part',r'^part\s*([ivx]+)$') : 0,
|
3
|
+
('signatures',r'^signatures?\.*$') : 0,
|
4
|
+
('item',r'^item\s*(\d+)\.?([a-z])?') : 1,
|
5
|
+
}
|
6
|
+
dict_10q_html = dict_10k_html
|
7
|
+
|
8
|
+
dict_8k_html = {
|
9
|
+
('signatures',r'^signatures?\.*$') : 0,
|
10
|
+
('item',r'^item\s*(\d+\.\d+)') : 0,
|
11
|
+
}
|
12
|
+
|
13
|
+
dict_sd_html = {
|
14
|
+
('signatures',r'^signatures?\.*$') : 0,
|
15
|
+
('item',r'^item\s*(\d+\.\d+)') : 0,
|
16
|
+
}
|
17
|
+
|
18
|
+
dict_abs15g_html = {
|
19
|
+
('part',r'^part\s*([ivx]+)') : 0,
|
20
|
+
('signatures',r'^signatures?\.*$') : 0,
|
21
|
+
('item',r'^item\s*(\d+\.\d+)') : 1,
|
22
|
+
}
|
23
|
+
|
24
|
+
|
25
|
+
|
26
|
+
dict_nt10k_html = {
|
27
|
+
('part',r'^part\s*([ivx]+)') : 0,
|
28
|
+
}
|
29
|
+
|
30
|
+
dict_1kpartii_html = {
|
31
|
+
('item',r'^item\s*(\d+)') : 1,
|
32
|
+
}
|
33
|
+
|
34
|
+
dict_1sa_html = dict_1kpartii_html
|
35
|
+
|
36
|
+
dict_1u_html = {('item',r'^item\s*(\d+)') : 1,
|
37
|
+
('signatures',r'^signatures?\.*$') : 1,}
|
38
|
+
|
39
|
+
dict_1012b_html = dict_1u_html
|
40
|
+
|
41
|
+
dict_10d_html = dict_10k_html
|
42
|
+
|
43
|
+
dict_20f_html = {
|
44
|
+
('part',r'^part\s*([ivx]+)') : 0,
|
45
|
+
('item',r'^item\s*(\d+)\.?([a-z])?') : 1,
|
46
|
+
('letter',r'\d*\.?([a-z])') : 2,
|
47
|
+
('signatures',r'^signatures?\.*$') : 0,
|
48
|
+
}
|
@@ -32,7 +32,7 @@ if not os.path.exists(file_path):
|
|
32
32
|
setup(
|
33
33
|
name="datamule",
|
34
34
|
author="John Friedman",
|
35
|
-
version="1.4.0",
|
35
|
+
version="1.4.3",
|
36
36
|
description="Work with SEC submissions at scale.",
|
37
37
|
packages=find_packages(include=['datamule', 'datamule.*']),
|
38
38
|
url="https://github.com/john-friedman/datamule-python",
|
@@ -1,11 +0,0 @@
|
|
1
|
-
dict_10k_html = {
|
2
|
-
('part',r'^part\s*([ivx]+)$') : 0,
|
3
|
-
('signatures',r'^signatures?\.*$') : 0,
|
4
|
-
('item',r'^item\s*(\d+)\.?([a-z])?') : 1,
|
5
|
-
}
|
6
|
-
dict_10q_html = dict_10k_html
|
7
|
-
|
8
|
-
dict_8k_html = {
|
9
|
-
('signatures',r'^signatures?\.*$') : 0,
|
10
|
-
('item',r'^item\s*(\d+\.\d+)') : 0,
|
11
|
-
}
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|