datamule 1.3.1__tar.gz → 1.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datamule-1.3.1 → datamule-1.4.0}/PKG-INFO +1 -1
- {datamule-1.3.1 → datamule-1.4.0}/datamule/document/document.py +49 -21
- datamule-1.4.0/datamule/mapping_dicts/html_mapping_dicts.py +11 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule.egg-info/PKG-INFO +1 -1
- {datamule-1.3.1 → datamule-1.4.0}/datamule.egg-info/SOURCES.txt +1 -0
- {datamule-1.3.1 → datamule-1.4.0}/setup.py +1 -1
- {datamule-1.3.1 → datamule-1.4.0}/datamule/__init__.py +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule/config.py +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule/data/listed_filer_metadata.csv +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule/document/__init__.py +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule/document/mappings/__init__.py +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule/document/mappings/atsn.py +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule/document/mappings/cfportal.py +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule/document/mappings/d.py +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule/document/mappings/ex102_abs.py +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule/document/mappings/ex99a_sdr.py +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule/document/mappings/ex99c_sdr.py +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule/document/mappings/ex99g_sdr.py +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule/document/mappings/ex99i_sdr.py +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule/document/mappings/information_table.py +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule/document/mappings/nmfp.py +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule/document/mappings/npx.py +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule/document/mappings/onefourtyfour.py +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule/document/mappings/ownership.py +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule/document/mappings/proxy_voting_record.py +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule/document/mappings/sbs.py +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule/document/mappings/sbsef.py +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule/document/mappings/schedule13.py +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule/document/mappings/sdr.py +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule/document/mappings/submission_metadata.py +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule/document/mappings/ta.py +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule/document/mappings/thirteenfhr.py +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule/document/mappings/twentyfivense.py +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule/document/mappings/twentyfourf2nt.py +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule/document/processing.py +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule/document/table.py +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule/helper.py +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule/index.py +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule/mapping_dicts/__init__.py +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule/mapping_dicts/txt_mapping_dicts.py +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule/mapping_dicts/xml_mapping_dicts.py +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule/package_updater.py +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule/portfolio.py +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule/sec/__init__.py +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule/sec/infrastructure/__init__.py +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule/sec/infrastructure/submissions_metadata.py +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule/sec/submissions/__init__.py +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule/sec/submissions/downloader.py +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule/sec/submissions/eftsquery.py +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule/sec/submissions/monitor.py +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule/sec/submissions/streamer.py +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule/sec/submissions/textsearch.py +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule/sec/utils.py +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule/sec/xbrl/__init__.py +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule/sec/xbrl/downloadcompanyfacts.py +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule/sec/xbrl/filter_xbrl.py +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule/sec/xbrl/streamcompanyfacts.py +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule/sec/xbrl/xbrlmonitor.py +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule/seclibrary/__init__.py +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule/seclibrary/bq.py +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule/seclibrary/downloader.py +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule/seclibrary/query.py +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule/sheet.py +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule/submission.py +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule.egg-info/dependency_links.txt +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule.egg-info/requires.txt +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/datamule.egg-info/top_level.txt +0 -0
- {datamule-1.3.1 → datamule-1.4.0}/setup.cfg +0 -0
@@ -3,11 +3,14 @@ import csv
|
|
3
3
|
import re
|
4
4
|
from doc2dict import xml2dict, txt2dict, dict2dict
|
5
5
|
from doc2dict.mapping import flatten_hierarchy
|
6
|
+
from doc2dict import html2dict, visualize_dict, get_title, unnest_dict
|
6
7
|
from ..mapping_dicts.txt_mapping_dicts import dict_10k, dict_10q, dict_8k, dict_13d, dict_13g
|
7
8
|
from ..mapping_dicts.xml_mapping_dicts import dict_345
|
9
|
+
from ..mapping_dicts.html_mapping_dicts import dict_10k_html, dict_10q_html, dict_8k_html
|
8
10
|
from selectolax.parser import HTMLParser
|
9
11
|
from .processing import process_tabular_data
|
10
12
|
from pathlib import Path
|
13
|
+
import webbrowser
|
11
14
|
|
12
15
|
class Document:
|
13
16
|
def __init__(self, type, content, extension,accession,filing_date,path=None):
|
@@ -99,26 +102,10 @@ class Document:
|
|
99
102
|
if self.data:
|
100
103
|
return self.data
|
101
104
|
|
102
|
-
# preprocess content
|
103
|
-
if self.extension == '.txt':
|
104
|
-
self.content = self._preprocess_txt_content()
|
105
|
-
elif self.extension in ['.htm', '.html']:
|
106
|
-
self.content = self._preprocess_html_content()
|
107
|
-
|
108
105
|
mapping_dict = None
|
109
|
-
|
110
|
-
if self.extension == '.xml':
|
111
|
-
|
112
|
-
mapping_dict = dict_345
|
113
|
-
|
114
|
-
self.data = xml2dict(content=self.content, mapping_dict=mapping_dict)
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
# will deprecate this when we add html2dict
|
119
|
-
elif self.extension in ['.htm', '.html','.txt']:
|
120
|
-
|
121
|
-
|
106
|
+
|
107
|
+
if self.extension == '.txt':
|
108
|
+
content = self._preprocess_txt_content()
|
122
109
|
if self.type == '10-Q':
|
123
110
|
mapping_dict = dict_10q
|
124
111
|
elif self.type == '10-K':
|
@@ -131,8 +118,24 @@ class Document:
|
|
131
118
|
mapping_dict = dict_13g
|
132
119
|
|
133
120
|
self.data = {}
|
134
|
-
self.data['document'] = dict2dict(txt2dict(content=self.content, mapping_dict=mapping_dict))
|
135
|
-
|
121
|
+
self.data['document'] = dict2dict(txt2dict(content=content, mapping_dict=mapping_dict))
|
122
|
+
elif self.extension in ['.htm', '.html']:
|
123
|
+
if self.type == '10-K':
|
124
|
+
mapping_dict = dict_10k_html
|
125
|
+
elif self.type == '10-Q':
|
126
|
+
mapping_dict = dict_10q_html
|
127
|
+
elif self.type == '8-K':
|
128
|
+
mapping_dict = dict_8k_html
|
129
|
+
|
130
|
+
dct = html2dict(content=self.content, mapping_dict=mapping_dict)
|
131
|
+
self.data = dct
|
132
|
+
elif self.extension == '.xml':
|
133
|
+
if self.type in ['3', '4', '5', '3/A', '4/A', '5/A']:
|
134
|
+
mapping_dict = dict_345
|
135
|
+
|
136
|
+
self.data = xml2dict(content=self.content, mapping_dict=mapping_dict)
|
137
|
+
else:
|
138
|
+
pass
|
136
139
|
|
137
140
|
def write_json(self, output_filename=None):
|
138
141
|
if not self.data:
|
@@ -206,6 +209,31 @@ class Document:
|
|
206
209
|
})
|
207
210
|
|
208
211
|
return items
|
212
|
+
|
213
|
+
def visualize(self):
|
214
|
+
if not self.data:
|
215
|
+
self.parse()
|
216
|
+
|
217
|
+
if not self.data:
|
218
|
+
if self.extension in ['.jpg', '.png', '.pdf']:
|
219
|
+
webbrowser.open('file://' + str(self.path))
|
220
|
+
else:
|
221
|
+
pass
|
222
|
+
else:
|
223
|
+
visualize_dict(self.data)
|
224
|
+
|
225
|
+
def get_section(self, title, format='dict'):
|
226
|
+
if not self.data:
|
227
|
+
self.parse()
|
228
|
+
|
229
|
+
result = get_title(self.data,title)
|
230
|
+
|
231
|
+
if format == 'text':
|
232
|
+
result = [item[1] for item in result]
|
233
|
+
result = [unnest_dict(item) for item in result]
|
234
|
+
|
235
|
+
return result
|
236
|
+
|
209
237
|
|
210
238
|
# this will all have to be changed. default will be to flatten everything
|
211
239
|
# candidate for deletion
|
@@ -0,0 +1,11 @@
|
|
1
|
+
dict_10k_html = {
|
2
|
+
('part',r'^part\s*([ivx]+)$') : 0,
|
3
|
+
('signatures',r'^signatures?\.*$') : 0,
|
4
|
+
('item',r'^item\s*(\d+)\.?([a-z])?') : 1,
|
5
|
+
}
|
6
|
+
dict_10q_html = dict_10k_html
|
7
|
+
|
8
|
+
dict_8k_html = {
|
9
|
+
('signatures',r'^signatures?\.*$') : 0,
|
10
|
+
('item',r'^item\s*(\d+\.\d+)') : 0,
|
11
|
+
}
|
@@ -42,6 +42,7 @@ datamule/document/mappings/thirteenfhr.py
|
|
42
42
|
datamule/document/mappings/twentyfivense.py
|
43
43
|
datamule/document/mappings/twentyfourf2nt.py
|
44
44
|
datamule/mapping_dicts/__init__.py
|
45
|
+
datamule/mapping_dicts/html_mapping_dicts.py
|
45
46
|
datamule/mapping_dicts/txt_mapping_dicts.py
|
46
47
|
datamule/mapping_dicts/xml_mapping_dicts.py
|
47
48
|
datamule/sec/__init__.py
|
@@ -32,7 +32,7 @@ if not os.path.exists(file_path):
|
|
32
32
|
setup(
|
33
33
|
name="datamule",
|
34
34
|
author="John Friedman",
|
35
|
-
version="1.3.1",
|
35
|
+
version="1.4.0",
|
36
36
|
description="Work with SEC submissions at scale.",
|
37
37
|
packages=find_packages(include=['datamule', 'datamule.*']),
|
38
38
|
url="https://github.com/john-friedman/datamule-python",
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|