datamule 2.2.4__tar.gz → 2.2.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datamule-2.2.4 → datamule-2.2.6}/PKG-INFO +1 -1
- {datamule-2.2.4 → datamule-2.2.6}/datamule/document/document.py +33 -73
- {datamule-2.2.4 → datamule-2.2.6}/datamule/submission.py +10 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/tags/config.py +6 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/tags/utils.py +14 -8
- {datamule-2.2.4 → datamule-2.2.6}/datamule.egg-info/PKG-INFO +1 -1
- {datamule-2.2.4 → datamule-2.2.6}/setup.py +1 -1
- {datamule-2.2.4 → datamule-2.2.6}/datamule/__init__.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/config.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/data/listed_filer_metadata.csv +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/datamule/__init__.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/datamule/datamule_lookup.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/datamule/datamule_mysql_rds.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/datamule/downloader.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/datamule/sec_connector.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/datasets.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/document/__init__.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/document/tables/__init__.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/document/tables/tables.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/document/tables/tables_13fhr.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/document/tables/tables_25nse.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/document/tables/tables_informationtable.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/document/tables/tables_npx.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/document/tables/tables_ownership.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/document/tables/tables_proxyvotingrecord.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/document/tables/tables_sbsef.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/document/tables/tables_sdr.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/document/tables/utils.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/helper.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/index.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/mapping_dicts/__init__.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/mapping_dicts/html_mapping_dicts.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/mapping_dicts/txt_mapping_dicts.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/mapping_dicts/xml_mapping_dicts.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/package_updater.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/portfolio.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/portfolio_compression_utils.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/sec/__init__.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/sec/infrastructure/__init__.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/sec/infrastructure/submissions_metadata.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/sec/submissions/__init__.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/sec/submissions/downloader.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/sec/submissions/eftsquery.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/sec/submissions/monitor.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/sec/submissions/streamer.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/sec/submissions/textsearch.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/sec/utils.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/sec/xbrl/__init__.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/sec/xbrl/downloadcompanyfacts.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/sec/xbrl/filter_xbrl.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/sec/xbrl/streamcompanyfacts.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/sec/xbrl/xbrlmonitor.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/seclibrary/__init__.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/seclibrary/bq.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/sentiment/__init__.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/sheet.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/tags/__init__.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/tags/dictionaries.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/tags/regex.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/utils/__init__.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/utils/construct_submissions_data.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/utils/format_accession.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule.egg-info/SOURCES.txt +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule.egg-info/dependency_links.txt +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule.egg-info/requires.txt +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule.egg-info/top_level.txt +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/setup.cfg +0 -0

datamule/document/document.py
@@ -3,7 +3,7 @@ import csv
 import re
 from doc2dict import xml2dict, txt2dict, dict2dict
 from doc2dict.mapping import flatten_hierarchy
-from doc2dict import html2dict, visualize_dict, get_title, unnest_dict, pdf2dict
+from doc2dict import html2dict, visualize_dict, get_title, unnest_dict, pdf2dict, flatten_dict
 from ..mapping_dicts.txt_mapping_dicts import dict_10k, dict_10q, dict_8k, dict_13d, dict_13g
 from ..mapping_dicts.xml_mapping_dicts import dict_345
 from ..mapping_dicts.html_mapping_dicts import *

datamule/document/document.py
@@ -221,7 +221,6 @@ class Tags(TextAnalysisBase):
         if not hasattr(self, '_persons'):
             self._persons = []
             sources = self._get_text_sources()
-
             for source in sources:
                 if '8k_2024_persons' in self.processors:
                     results = get_full_names_dictionary_lookup(source['text'], self.processors['8k_2024_persons'])

datamule/document/document.py
@@ -289,71 +288,17 @@ class Document:
         self._data = None
         self._tables = None
         self._text = None
+        self._markdown = None
+
+        # booleans
+        self._data_bool = self.extension in ('.htm', '.html','.txt')
+        self._text_bool = self._data_bool
+        self._markdown_bool = self._data_bool
+        self._visualize_bool = self._data_bool
+        self._tables_bool = self.extension in ('.xml')



-    #_load_text_content
-    def _preprocess_txt_content(self):
-        self._text = self.content.decode().translate(str.maketrans({
-            '\xa0': ' ', '\u2003': ' ',
-            '\u2018': "'", '\u2019': "'",
-            '\u201c': '"', '\u201d': '"'
-        }))
-
-    # needs work
-    def _preprocess_html_content(self):
-        parser = HTMLParser(self.content,detect_encoding=True,decode_errors='ignore')
-
-        # Remove hidden elements first
-        hidden_nodes = parser.css('[style*="display: none"], [style*="display:none"], .hidden, .hide, .d-none')
-        for node in hidden_nodes:
-            node.decompose()
-
-        blocks = {'p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'article', 'section', 'li', 'td'}
-        lines = []
-        current_line = []
-
-        def flush_line():
-            if current_line:
-                # Don't add spaces between adjacent spans
-                lines.append(''.join(current_line))
-                current_line.clear()
-
-        for node in parser.root.traverse(include_text=True):
-            if node.tag in ('script', 'style', 'css'):
-                continue
-
-            if node.tag in blocks:
-                flush_line()
-                lines.append('')
-
-            if node.text_content:
-                text = node.text_content.strip()
-                if text:
-                    if node.tag in blocks:
-                        flush_line()
-                        lines.append(text)
-                        lines.append('')
-                    else:
-                        # Only add space if nodes aren't directly adjacent
-                        if current_line and not current_line[-1].endswith(' '):
-                            if node.prev and node.prev.text_content:
-                                if node.parent != node.prev.parent or node.prev.next != node:
-                                    current_line.append(' ')
-                        current_line.append(text)
-
-        flush_line()
-
-        text = '\n'.join(lines)
-        while '\n\n\n' in text:
-            text = text.replace('\n\n\n', '\n\n')
-
-        self._text = text.translate(str.maketrans({
-            '\xa0': ' ', '\u2003': ' ',
-            '\u2018': "'", '\u2019': "'",
-            '\u201c': '"', '\u201d': '"'
-        }))
-
     def contains_string(self, pattern):
         """Works for select files"""
         if self.extension in ['.htm', '.html', '.txt','.xml']:
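
The new boolean flags gate which derived views a Document exposes for a given file type. Below is a minimal stand-alone sketch of that gating; the helper is illustrative and not part of datamule. (Note that the original writes ('.xml') without a trailing comma, which is a plain string, so that membership check is technically a substring test; the sketch uses a one-element tuple.)

def capabilities(extension):
    # Mirrors the flags set in Document.__init__ as of 2.2.6
    data_like = extension in ('.htm', '.html', '.txt')
    return {
        'data': data_like,
        'text': data_like,
        'markdown': data_like,
        'visualize': data_like,
        'tables': extension in ('.xml',),
    }

print(capabilities('.htm'))  # data/text/markdown/visualize True, tables False
print(capabilities('.xml'))  # tables True, everything else False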

datamule/document/document.py
@@ -485,17 +430,21 @@ class Document:

     @property
     def text(self):
-        if self.
-        if self.
-
-
-            self._preprocess_txt_content() # Still sets self._text to plain string
-
-            # Convert the plain string to TextWithTags
-            plain_text = self._text
-            self._text = TextWithTags(plain_text, self)
+        if self._text_bool:
+            if self._text is None:
+                text = flatten_dict(self.data,'text')
+                self._text = TextWithTags(text, self)
         return self._text

+    @property
+    def markdown(self):
+        if self._markdown_bool:
+            if self._markdown is None:
+                self._markdown = flatten_dict(self.data,'markdown')
+
+        return self._markdown
+
+
     def write_json(self, output_filename=None):
         if not self.data:
             self.parse()
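
Both properties now follow the same gate-then-cache pattern and flatten the already-parsed self.data via doc2dict's flatten_dict instead of re-decoding raw bytes. A minimal stand-in illustrating the pattern (this toy class is not the datamule Document; the string join stands in for flatten_dict):

class LazyDoc:
    """Toy stand-in for the caching pattern used by Document.text / Document.markdown."""
    def __init__(self, extension, data):
        self.extension = extension
        self.data = data
        self._markdown = None
        self._markdown_bool = extension in ('.htm', '.html', '.txt')

    @property
    def markdown(self):
        if self._markdown_bool:
            if self._markdown is None:
                # the real property delegates to flatten_dict(self.data, 'markdown') here
                self._markdown = '\n'.join(str(v) for v in self.data.values())
        return self._markdown  # stays None for unsupported extensions, as in the hunk above

doc = LazyDoc('.htm', {'item1': 'Business', 'item1a': 'Risk Factors'})
print(doc.markdown)                   # computed on first access, cached afterwards
print(LazyDoc('.xml', {}).markdown)   # None: markdown is not generated for XML documents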

datamule/document/document.py
@@ -544,6 +493,17 @@ class Document:
             writer.writeheader()
             writer.writerows(table.data)

+    def reset_nlp(self):
+        """Reset all NLP analysis by creating fresh wrapper objects"""
+        # Reset data wrapper
+        if hasattr(self, '_data') and self._data is not None:
+            raw_data = dict(self._data)  # Extract the underlying dict
+            self._data = DataWithTags(raw_data, self)
+
+        # Reset text wrapper
+        if hasattr(self, '_text') and self._text is not None:
+            raw_text = str(self._text)  # Extract the underlying string
+            self._text = TextWithTags(raw_text, self)

     def _document_to_section_text(self, document_data, parent_key=''):
         items = []
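
reset_nlp() rebuilds the DataWithTags/TextWithTags wrappers from their underlying dict/str values, discarding any cached NLP annotations. The toy below assumes TextWithTags subclasses str (which is what str(self._text) in the hunk implies); it is an illustration, not the datamule class, and the tags attribute is hypothetical.

class TextWithTags(str):
    """Illustrative stand-in: a str subclass carrying per-run NLP annotations."""
    def __new__(cls, text, document=None):
        obj = super().__new__(cls, text)
        obj.document = document
        obj.tags = []  # hypothetical annotation store
        return obj

wrapped = TextWithTags("Item 5.02 Departure of Directors or Certain Officers")
wrapped.tags.append('stale-person-tag')

# Same move as reset_nlp(): extract the raw string, re-wrap it fresh
fresh = TextWithTags(str(wrapped), wrapped.document)
print(fresh == wrapped, fresh.tags)  # True [] -> identical text, annotations cleared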

datamule/submission.py
@@ -121,6 +121,16 @@ class Submission:
         self.accession = self.metadata.content['accession-number']
         self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"

+
+        # booleans
+        self._has_xbrl = any(
+            doc['type'] in ('EX-100.INS', 'EX-101.INS') or
+            doc.get('filename', '').endswith('_htm.xml')
+            for doc in self.metadata.content['documents']
+        )
+
+        self._has_fundamentals = self._has_xbrl
+
     def _load_document_by_index(self, idx):
         """Load a document by its index in the metadata documents list."""
         doc = self.metadata.content['documents'][idx]
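
The new _has_xbrl flag marks a submission as XBRL-bearing if any document is an EX-100.INS/EX-101.INS instance or has a filename ending in _htm.xml (the suffix used for inline-XBRL instance extracts). A standalone sketch of the same check, with hypothetical document metadata:

def has_xbrl(documents):
    # Same expression as the any(...) in Submission.__init__ above
    return any(
        doc['type'] in ('EX-100.INS', 'EX-101.INS') or
        doc.get('filename', '').endswith('_htm.xml')
        for doc in documents
    )

sample = [
    {'type': '10-K', 'filename': 'form10k.htm'},
    {'type': 'EX-101.INS', 'filename': 'abc-20231231.xml'},
]
print(has_xbrl(sample))                                        # True
print(has_xbrl([{'type': '8-K', 'filename': 'form8k.htm'}]))   # False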

datamule/tags/config.py
@@ -3,6 +3,12 @@ from .dictionaries import download_dictionary, load_dictionary
 _active_dictionaries = []
 _loaded_dictionaries = {}

+def clear_dictionaries():
+    """Remove all active dictionaries"""
+    global _active_dictionaries, _loaded_dictionaries
+    _active_dictionaries = []
+    _loaded_dictionaries = {}
+
 def set_dictionaries(dictionaries, overwrite=False):
     """Set active dictionaries and load them into memory"""
     global _active_dictionaries, _loaded_dictionaries
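
clear_dictionaries() complements the existing set_dictionaries() by resetting both module-level registries. A hedged usage sketch (the dictionary name is hypothetical; set_dictionaries takes a list per its signature in this file, and the direct module import path mirrors the file location shown above):

from datamule.tags.config import set_dictionaries, clear_dictionaries

set_dictionaries(['8k_2024_persons'])  # hypothetical dictionary name; loads it into module state
# ... run tagging that relies on the active dictionaries ...
clear_dictionaries()                   # new in 2.2.6: empties _active_dictionaries and _loaded_dictionaries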

datamule/tags/utils.py
@@ -67,25 +67,31 @@ def get_ticker_regex_dict():
     return dict(ticker_regex_list)

 # will change in future to accomodate other datasets
-def validate_full_name(full_name,keywords):
+def validate_full_name(full_name, keywords):
     if len(full_name) == 1:
         return False
-
-
+
+    # Clean punctuation before validation
+    cleaned_name = [word.rstrip(".,;:!?()[]") for word in full_name]
+
+    # Skip validation if cleaning removed everything
+    if not all(cleaned_name):
         return False
-
-
+
+    # Apply existing checks to cleaned words
+    if all(word.isupper() for word in cleaned_name):
         return False
-
+
+    # check if any number in word
+    if any(any(char.isdigit() for char in word) for word in cleaned_name):
         return False

     # add optional set lookups
     if keywords is not None:
         # return false if first word is not in keywords set
-        if
+        if cleaned_name[0] not in keywords:
             return False

-
     return True

 def get_full_names(text,keywords=None):
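
Reassembled from the new side of the hunk, the updated validator strips trailing punctuation before applying the existing all-caps, digit, and keyword checks. The example calls are illustrative and the keyword set is hypothetical:

def validate_full_name(full_name, keywords):
    if len(full_name) == 1:
        return False

    # Clean punctuation before validation
    cleaned_name = [word.rstrip(".,;:!?()[]") for word in full_name]

    # Skip validation if cleaning removed everything
    if not all(cleaned_name):
        return False

    if all(word.isupper() for word in cleaned_name):
        return False

    if any(any(char.isdigit() for char in word) for word in cleaned_name):
        return False

    if keywords is not None:
        if cleaned_name[0] not in keywords:
            return False

    return True

print(validate_full_name(['Mr.', 'Tim', 'Cook,'], keywords={'Mr'}))  # True: trailing punctuation no longer blocks the keyword check
print(validate_full_name(['IBM', 'CORP'], None))                     # False: all-caps tokens are rejected
print(validate_full_name(['John', '3rd'], None))                     # False: tokens containing digits are rejected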

setup.py
@@ -32,7 +32,7 @@ if not os.path.exists(file_path):
 setup(
     name="datamule",
     author="John Friedman",
-    version="2.2.4",
+    version="2.2.6",
     description="Work with SEC submissions at scale.",
     packages=find_packages(include=['datamule', 'datamule.*']),
     url="https://github.com/john-friedman/datamule-python",