datamule 1.2.0.tar.gz → 1.2.1.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datamule-1.2.0 → datamule-1.2.1}/PKG-INFO +1 -1
- {datamule-1.2.0 → datamule-1.2.1}/datamule/__init__.py +3 -1
- datamule-1.2.1/datamule/document/document.py +255 -0
- datamule-1.2.1/datamule/document/processing.py +604 -0
- datamule-1.2.1/datamule/document/table.py +260 -0
- datamule-1.2.1/datamule/package_updater.py +31 -0
- {datamule-1.2.0 → datamule-1.2.1}/datamule/portfolio.py +5 -3
- {datamule-1.2.0 → datamule-1.2.1}/datamule/sec/submissions/downloader.py +14 -37
- datamule-1.2.1/datamule/seclibrary/__init__.py +0 -0
- {datamule-1.2.0 → datamule-1.2.1}/datamule/seclibrary/downloader.py +50 -9
- {datamule-1.2.0 → datamule-1.2.1}/datamule/submission.py +102 -7
- {datamule-1.2.0 → datamule-1.2.1}/datamule.egg-info/PKG-INFO +1 -1
- {datamule-1.2.0 → datamule-1.2.1}/datamule.egg-info/SOURCES.txt +5 -1
- {datamule-1.2.0 → datamule-1.2.1}/setup.py +2 -1
- datamule-1.2.0/datamule/document.py +0 -465
- {datamule-1.2.0 → datamule-1.2.1}/datamule/config.py +0 -0
- {datamule-1.2.0/datamule/mapping_dicts → datamule-1.2.1/datamule/document}/__init__.py +0 -0
- {datamule-1.2.0 → datamule-1.2.1}/datamule/helper.py +0 -0
- {datamule-1.2.0 → datamule-1.2.1}/datamule/index.py +0 -0
- {datamule-1.2.0/datamule/sec → datamule-1.2.1/datamule/mapping_dicts}/__init__.py +0 -0
- {datamule-1.2.0 → datamule-1.2.1}/datamule/mapping_dicts/txt_mapping_dicts.py +0 -0
- {datamule-1.2.0 → datamule-1.2.1}/datamule/mapping_dicts/xml_mapping_dicts.py +0 -0
- {datamule-1.2.0/datamule/sec/infrastructure → datamule-1.2.1/datamule/sec}/__init__.py +0 -0
- {datamule-1.2.0/datamule/sec/rss → datamule-1.2.1/datamule/sec/infrastructure}/__init__.py +0 -0
- {datamule-1.2.0 → datamule-1.2.1}/datamule/sec/infrastructure/submissions_metadata.py +0 -0
- {datamule-1.2.0/datamule/sec/submissions → datamule-1.2.1/datamule/sec/rss}/__init__.py +0 -0
- {datamule-1.2.0 → datamule-1.2.1}/datamule/sec/rss/monitor.py +0 -0
- {datamule-1.2.0/datamule/sec/xbrl → datamule-1.2.1/datamule/sec/submissions}/__init__.py +0 -0
- {datamule-1.2.0 → datamule-1.2.1}/datamule/sec/submissions/eftsquery.py +0 -0
- {datamule-1.2.0 → datamule-1.2.1}/datamule/sec/submissions/monitor.py +0 -0
- {datamule-1.2.0 → datamule-1.2.1}/datamule/sec/submissions/streamer.py +0 -0
- {datamule-1.2.0 → datamule-1.2.1}/datamule/sec/submissions/textsearch.py +0 -0
- {datamule-1.2.0 → datamule-1.2.1}/datamule/sec/utils.py +0 -0
- {datamule-1.2.0/datamule/seclibrary → datamule-1.2.1/datamule/sec/xbrl}/__init__.py +0 -0
- {datamule-1.2.0 → datamule-1.2.1}/datamule/sec/xbrl/downloadcompanyfacts.py +0 -0
- {datamule-1.2.0 → datamule-1.2.1}/datamule/sec/xbrl/filter_xbrl.py +0 -0
- {datamule-1.2.0 → datamule-1.2.1}/datamule/sec/xbrl/streamcompanyfacts.py +0 -0
- {datamule-1.2.0 → datamule-1.2.1}/datamule/sec/xbrl/xbrlmonitor.py +0 -0
- {datamule-1.2.0 → datamule-1.2.1}/datamule/seclibrary/bq.py +0 -0
- {datamule-1.2.0 → datamule-1.2.1}/datamule/seclibrary/query.py +0 -0
- {datamule-1.2.0 → datamule-1.2.1}/datamule/sheet.py +0 -0
- {datamule-1.2.0 → datamule-1.2.1}/datamule.egg-info/dependency_links.txt +0 -0
- {datamule-1.2.0 → datamule-1.2.1}/datamule.egg-info/requires.txt +0 -0
- {datamule-1.2.0 → datamule-1.2.1}/datamule.egg-info/top_level.txt +0 -0
- {datamule-1.2.0 → datamule-1.2.1}/setup.cfg +0 -0
```diff
--- datamule-1.2.0/datamule/__init__.py
+++ datamule-1.2.1/datamule/__init__.py
@@ -1,10 +1,11 @@
 from .submission import Submission
 from .portfolio import Portfolio
-from .document import Document
+from .document.document import Document
 from .helper import _load_package_csv, load_package_dataset
 from .config import Config
 from .sheet import Sheet
 from .index import Index
+from .package_updater import PackageUpdater
 
 
 # Keep the notebook environment setup
@@ -37,4 +38,5 @@ __all__ = [
     'Config',
     'Sheet',
     'Index',
+    'PackageUpdater',
 ]
```
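The practical effect of this change: `Document` now lives in the new `datamule.document` subpackage (see the file list above), but the top-level re-export keeps 1.2.0-style imports working, and `PackageUpdater` joins the public API. A minimal sketch of the import surface implied by the diff, assuming no other changes to the exported names:

```python
# Unchanged from 1.2.0: Document is still re-exported at the top level.
from datamule import Document

# New canonical location introduced in 1.2.1 (same class as above).
from datamule.document.document import Document

# New in 1.2.1, also added to __all__.
from datamule import PackageUpdater
```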
```diff
--- /dev/null
+++ datamule-1.2.1/datamule/document/document.py
@@ -0,0 +1,255 @@
+import json
+import csv
+import re
+from doc2dict import xml2dict, txt2dict, dict2dict
+from doc2dict.mapping import flatten_hierarchy
+from ..mapping_dicts.txt_mapping_dicts import dict_10k, dict_10q, dict_8k, dict_13d, dict_13g
+from ..mapping_dicts.xml_mapping_dicts import dict_345
+from selectolax.parser import HTMLParser
+from .processing import process_tabular_data
+from pathlib import Path
+
+class Document:
+    def __init__(self, type, content, extension, accession, filing_date, path=None):
+
+        self.type = type
+        extension = extension.lower()
+        self.accession = accession
+        self.filing_date = filing_date
+        self.content = content
+
+        if path is not None:
+            self.path = path
+
+        self.extension = extension
+        # this will be filled by parse()
+        self.data = None
+
+    # _load_text_content
+    def _preprocess_txt_content(self):
+        return self.content.translate(str.maketrans({
+            '\xa0': ' ', '\u2003': ' ',
+            '\u2018': "'", '\u2019': "'",
+            '\u201c': '"', '\u201d': '"'
+        }))
+
+    # will deprecate this when we add html2dict
+    def _preprocess_html_content(self):
+        parser = HTMLParser(self.content, detect_encoding=True, decode_errors='ignore')
+
+        # Remove hidden elements first
+        hidden_nodes = parser.css('[style*="display: none"], [style*="display:none"], .hidden, .hide, .d-none')
+        for node in hidden_nodes:
+            node.decompose()
+
+        blocks = {'p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'article', 'section', 'li', 'td'}
+        lines = []
+        current_line = []
+
+        def flush_line():
+            if current_line:
+                # Don't add spaces between adjacent spans
+                lines.append(''.join(current_line))
+                current_line.clear()
+
+        for node in parser.root.traverse(include_text=True):
+            if node.tag in ('script', 'style', 'css'):
+                continue
+
+            if node.tag in blocks:
+                flush_line()
+                lines.append('')
+
+            if node.text_content:
+                text = node.text_content.strip()
+                if text:
+                    if node.tag in blocks:
+                        flush_line()
+                        lines.append(text)
+                        lines.append('')
+                    else:
+                        # Only add space if nodes aren't directly adjacent
+                        if current_line and not current_line[-1].endswith(' '):
+                            if node.prev and node.prev.text_content:
+                                if node.parent != node.prev.parent or node.prev.next != node:
+                                    current_line.append(' ')
+                        current_line.append(text)
+
+        flush_line()
+
+        text = '\n'.join(lines)
+        while '\n\n\n' in text:
+            text = text.replace('\n\n\n', '\n\n')
+
+        return text.translate(str.maketrans({
+            '\xa0': ' ', '\u2003': ' ',
+            '\u2018': "'", '\u2019': "'",
+            '\u201c': '"', '\u201d': '"'
+        }))
+
+    def contains_string(self, pattern):
+        """Works for select files"""
+        if self.extension in ['.htm', '.html', '.txt', '.xml']:
+            return bool(re.search(pattern, self.content))
+        return False
+
+    # Note: this method will be heavily modified in the future
+    def parse(self):
+        # check if we have already parsed the content
+        if self.data:
+            return self.data
+
+        # preprocess content
+        if self.extension == '.txt':
+            self.content = self._preprocess_txt_content()
+        elif self.extension in ['.htm', '.html']:
+            self.content = self._preprocess_html_content()
+
+        mapping_dict = None
+
+        if self.extension == '.xml':
+            if self.type in ['3', '4', '5', '3/A', '4/A', '5/A']:
+                mapping_dict = dict_345
+
+            self.data = xml2dict(content=self.content, mapping_dict=mapping_dict)
+
+
+
+        # will deprecate this when we add html2dict
+        elif self.extension in ['.htm', '.html', '.txt']:
+
+            if self.type == '10-K':
+                mapping_dict = dict_10k
+            elif self.type == '10-Q':
+                mapping_dict = dict_10q
+            elif self.type == '8-K':
+                mapping_dict = dict_8k
+            elif self.type == 'SC 13D':
+                mapping_dict = dict_13d
+            elif self.type == 'SC 13G':
+                mapping_dict = dict_13g
+
+            self.data = {}
+            self.data['document'] = dict2dict(txt2dict(content=self.content, mapping_dict=mapping_dict))
+        return self.data
+
+    def write_json(self, output_filename=None):
+        if not self.data:
+            self.parse()
+
+        with open(output_filename, 'w', encoding='utf-8') as f:
+            json.dump(self.data, f, indent=2)
+
+    def to_tabular(self, accession_number=None):
+        if self.extension != '.xml':
+            return []
+        self.parse()
+        return process_tabular_data(self)
+
+
+    def write_csv(self, output_folder, accession_number=None):
+
+        tables = self.to_tabular(accession_number)
+
+        if not tables:
+            return
+
+        for table in tables:
+            fieldnames = table.columns
+            output_filename = Path(output_folder) / f"{table.type}.csv"
+
+            with open(output_filename, 'w', newline='') as csvfile:
+                writer = csv.DictWriter(csvfile, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
+                writer.writeheader()
+                writer.writerows(table.data)
+
+
+    def _document_to_section_text(self, document_data, parent_key=''):
+        items = []
+
+        if isinstance(document_data, dict):
+            for key, value in document_data.items():
+                # Build the section name
+                section = f"{parent_key}_{key}" if parent_key else key
+
+                # If the value is a dict, recurse
+                if isinstance(value, dict):
+                    items.extend(self._document_to_section_text(value, section))
+                # If it's a list, handle each item
+                elif isinstance(value, list):
+                    for i, item in enumerate(value):
+                        if isinstance(item, dict):
+                            items.extend(self._document_to_section_text(item, f"{section}_{i+1}"))
+                        else:
+                            items.append({
+                                'section': f"{section}_{i+1}",
+                                'text': str(item)
+                            })
+                # Base case - add the item
+                else:
+                    items.append({
+                        'section': section,
+                        'text': str(value)
+                    })
+
+        return items
+
+    # this will all have to be changed. default will be to flatten everything
+    # candidate for deletion
+    def __iter__(self):
+        self.parse()
+
+        # Let's remove XML iterable for now
+
+        # Handle text-based documents
+        if self.extension in ['.txt', '.htm', '.html']:
+            document_data = self.data
+            if not document_data:
+                return iter([])
+
+            # Find highest hierarchy level from mapping dict
+            highest_hierarchy = float('inf')
+            section_type = None
+
+            if self.type in ['10-K', '10-Q']:
+                mapping_dict = dict_10k if self.type == '10-K' else dict_10q
+            elif self.type == '8-K':
+                mapping_dict = dict_8k
+            elif self.type == 'SC 13D':
+                mapping_dict = dict_13d
+            elif self.type == 'SC 13G':
+                mapping_dict = dict_13g
+            else:
+                return iter([])
+
+            # Find section type with highest hierarchy number
+            highest_hierarchy = -1  # Start at -1 to find highest
+            for mapping in mapping_dict['rules']['mappings']:
+                if mapping.get('hierarchy') is not None:
+                    if mapping['hierarchy'] > highest_hierarchy:
+                        highest_hierarchy = mapping['hierarchy']
+                        section_type = mapping['name']
+
+            if not section_type:
+                return iter([])
+
+            # Extract sections of the identified type
+            def find_sections(data, target_type):
+                sections = []
+                if isinstance(data, dict):
+                    if data.get('type') == target_type:
+                        sections.append({
+                            'item': data.get('text', ''),
+                            'text': flatten_hierarchy(data.get('content', []))
+                        })
+                    for value in data.values():
+                        if isinstance(value, (dict, list)):
+                            sections.extend(find_sections(value, target_type))
+                elif isinstance(data, list):
+                    for item in data:
+                        sections.extend(find_sections(item, target_type))
+                return sections
+
+            return iter(find_sections(document_data, section_type))
+
+        return iter([])
```
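For orientation, a minimal usage sketch of the new `Document` class, based only on the constructor and methods shown in this hunk. The file name, accession number, and filing date are illustrative placeholders, not real values:

```python
from datamule import Document

# Placeholder inputs for illustration; real values would come from an SEC filing.
with open('sample_8k.htm', encoding='utf-8') as f:
    content = f.read()

doc = Document(
    type='8-K',
    content=content,
    extension='.htm',
    accession='0000000000-00-000000',
    filing_date='2024-01-01',
)

data = doc.parse()               # {'document': {...}} for .htm/.html/.txt inputs
doc.write_json('sample_8k.json')

# Iterating a parsed 8-K yields its highest-hierarchy sections,
# each a dict with 'item' and 'text' keys.
for section in doc:
    print(section['item'])
```

Note that tabular extraction (`to_tabular`/`write_csv`) applies only to `.xml` documents and delegates to the new `datamule/document/processing.py` module listed above.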