datamule 2.2.3__tar.gz → 2.2.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datamule-2.2.3 → datamule-2.2.5}/PKG-INFO +1 -1
- {datamule-2.2.3 → datamule-2.2.5}/datamule/datamule/downloader.py +1 -1
- {datamule-2.2.3 → datamule-2.2.5}/datamule/document/document.py +17 -1
- {datamule-2.2.3 → datamule-2.2.5}/datamule/submission.py +10 -0
- {datamule-2.2.3 → datamule-2.2.5}/datamule/tags/config.py +6 -0
- {datamule-2.2.3 → datamule-2.2.5}/datamule/tags/utils.py +14 -8
- {datamule-2.2.3 → datamule-2.2.5}/datamule.egg-info/PKG-INFO +1 -1
- {datamule-2.2.3 → datamule-2.2.5}/setup.py +1 -1
- {datamule-2.2.3 → datamule-2.2.5}/datamule/__init__.py +0 -0
- {datamule-2.2.3 → datamule-2.2.5}/datamule/config.py +0 -0
- {datamule-2.2.3 → datamule-2.2.5}/datamule/data/listed_filer_metadata.csv +0 -0
- {datamule-2.2.3 → datamule-2.2.5}/datamule/datamule/__init__.py +0 -0
- {datamule-2.2.3 → datamule-2.2.5}/datamule/datamule/datamule_lookup.py +0 -0
- {datamule-2.2.3 → datamule-2.2.5}/datamule/datamule/datamule_mysql_rds.py +0 -0
- {datamule-2.2.3 → datamule-2.2.5}/datamule/datamule/sec_connector.py +0 -0
- {datamule-2.2.3 → datamule-2.2.5}/datamule/datasets.py +0 -0
- {datamule-2.2.3 → datamule-2.2.5}/datamule/document/__init__.py +0 -0
- {datamule-2.2.3 → datamule-2.2.5}/datamule/document/tables/__init__.py +0 -0
- {datamule-2.2.3 → datamule-2.2.5}/datamule/document/tables/tables.py +0 -0
- {datamule-2.2.3 → datamule-2.2.5}/datamule/document/tables/tables_13fhr.py +0 -0
- {datamule-2.2.3 → datamule-2.2.5}/datamule/document/tables/tables_25nse.py +0 -0
- {datamule-2.2.3 → datamule-2.2.5}/datamule/document/tables/tables_informationtable.py +0 -0
- {datamule-2.2.3 → datamule-2.2.5}/datamule/document/tables/tables_npx.py +0 -0
- {datamule-2.2.3 → datamule-2.2.5}/datamule/document/tables/tables_ownership.py +0 -0
- {datamule-2.2.3 → datamule-2.2.5}/datamule/document/tables/tables_proxyvotingrecord.py +0 -0
- {datamule-2.2.3 → datamule-2.2.5}/datamule/document/tables/tables_sbsef.py +0 -0
- {datamule-2.2.3 → datamule-2.2.5}/datamule/document/tables/tables_sdr.py +0 -0
- {datamule-2.2.3 → datamule-2.2.5}/datamule/document/tables/utils.py +0 -0
- {datamule-2.2.3 → datamule-2.2.5}/datamule/helper.py +0 -0
- {datamule-2.2.3 → datamule-2.2.5}/datamule/index.py +0 -0
- {datamule-2.2.3 → datamule-2.2.5}/datamule/mapping_dicts/__init__.py +0 -0
- {datamule-2.2.3 → datamule-2.2.5}/datamule/mapping_dicts/html_mapping_dicts.py +0 -0
- {datamule-2.2.3 → datamule-2.2.5}/datamule/mapping_dicts/txt_mapping_dicts.py +0 -0
- {datamule-2.2.3 → datamule-2.2.5}/datamule/mapping_dicts/xml_mapping_dicts.py +0 -0
- {datamule-2.2.3 → datamule-2.2.5}/datamule/package_updater.py +0 -0
- {datamule-2.2.3 → datamule-2.2.5}/datamule/portfolio.py +0 -0
- {datamule-2.2.3 → datamule-2.2.5}/datamule/portfolio_compression_utils.py +0 -0
- {datamule-2.2.3 → datamule-2.2.5}/datamule/sec/__init__.py +0 -0
- {datamule-2.2.3 → datamule-2.2.5}/datamule/sec/infrastructure/__init__.py +0 -0
- {datamule-2.2.3 → datamule-2.2.5}/datamule/sec/infrastructure/submissions_metadata.py +0 -0
- {datamule-2.2.3 → datamule-2.2.5}/datamule/sec/submissions/__init__.py +0 -0
- {datamule-2.2.3 → datamule-2.2.5}/datamule/sec/submissions/downloader.py +0 -0
- {datamule-2.2.3 → datamule-2.2.5}/datamule/sec/submissions/eftsquery.py +0 -0
- {datamule-2.2.3 → datamule-2.2.5}/datamule/sec/submissions/monitor.py +0 -0
- {datamule-2.2.3 → datamule-2.2.5}/datamule/sec/submissions/streamer.py +0 -0
- {datamule-2.2.3 → datamule-2.2.5}/datamule/sec/submissions/textsearch.py +0 -0
- {datamule-2.2.3 → datamule-2.2.5}/datamule/sec/utils.py +0 -0
- {datamule-2.2.3 → datamule-2.2.5}/datamule/sec/xbrl/__init__.py +0 -0
- {datamule-2.2.3 → datamule-2.2.5}/datamule/sec/xbrl/downloadcompanyfacts.py +0 -0
- {datamule-2.2.3 → datamule-2.2.5}/datamule/sec/xbrl/filter_xbrl.py +0 -0
- {datamule-2.2.3 → datamule-2.2.5}/datamule/sec/xbrl/streamcompanyfacts.py +0 -0
- {datamule-2.2.3 → datamule-2.2.5}/datamule/sec/xbrl/xbrlmonitor.py +0 -0
- {datamule-2.2.3 → datamule-2.2.5}/datamule/seclibrary/__init__.py +0 -0
- {datamule-2.2.3 → datamule-2.2.5}/datamule/seclibrary/bq.py +0 -0
- {datamule-2.2.3 → datamule-2.2.5}/datamule/sentiment/__init__.py +0 -0
- {datamule-2.2.3 → datamule-2.2.5}/datamule/sheet.py +0 -0
- {datamule-2.2.3 → datamule-2.2.5}/datamule/tags/__init__.py +0 -0
- {datamule-2.2.3 → datamule-2.2.5}/datamule/tags/dictionaries.py +0 -0
- {datamule-2.2.3 → datamule-2.2.5}/datamule/tags/regex.py +0 -0
- {datamule-2.2.3 → datamule-2.2.5}/datamule/utils/__init__.py +0 -0
- {datamule-2.2.3 → datamule-2.2.5}/datamule/utils/construct_submissions_data.py +0 -0
- {datamule-2.2.3 → datamule-2.2.5}/datamule/utils/format_accession.py +0 -0
- {datamule-2.2.3 → datamule-2.2.5}/datamule.egg-info/SOURCES.txt +0 -0
- {datamule-2.2.3 → datamule-2.2.5}/datamule.egg-info/dependency_links.txt +0 -0
- {datamule-2.2.3 → datamule-2.2.5}/datamule.egg-info/requires.txt +0 -0
- {datamule-2.2.3 → datamule-2.2.5}/datamule.egg-info/top_level.txt +0 -0
- {datamule-2.2.3 → datamule-2.2.5}/setup.cfg +0 -0
@@ -287,7 +287,7 @@ class Downloader:
|
|
287
287
|
keepalive_timeout=60
|
288
288
|
)
|
289
289
|
|
290
|
-
async with aiohttp.ClientSession(connector=connector, timeout=aiohttp.ClientTimeout(total=
|
290
|
+
async with aiohttp.ClientSession(connector=connector, timeout=aiohttp.ClientTimeout(total=600)) as session:
|
291
291
|
tasks = [
|
292
292
|
self.download_and_process(
|
293
293
|
session, url, semaphore, decompression_pool,
|
@@ -221,7 +221,6 @@ class Tags(TextAnalysisBase):
|
|
221
221
|
if not hasattr(self, '_persons'):
|
222
222
|
self._persons = []
|
223
223
|
sources = self._get_text_sources()
|
224
|
-
|
225
224
|
for source in sources:
|
226
225
|
if '8k_2024_persons' in self.processors:
|
227
226
|
results = get_full_names_dictionary_lookup(source['text'], self.processors['8k_2024_persons'])
|
@@ -289,6 +288,12 @@ class Document:
|
|
289
288
|
self._data = None
|
290
289
|
self._tables = None
|
291
290
|
self._text = None
|
291
|
+
|
292
|
+
# booleans
|
293
|
+
self._text_bool = self.extension in ('.htm', '.html','.txt')
|
294
|
+
self._data_bool = self.extension in ('.htm', '.html','.txt')
|
295
|
+
self._visualize_bool = self._data_bool
|
296
|
+
self._tables_bool = self.extension in ('.xml')
|
292
297
|
|
293
298
|
|
294
299
|
|
@@ -544,6 +549,17 @@ class Document:
|
|
544
549
|
writer.writeheader()
|
545
550
|
writer.writerows(table.data)
|
546
551
|
|
552
|
+
def reset_nlp(self):
|
553
|
+
"""Reset all NLP analysis by creating fresh wrapper objects"""
|
554
|
+
# Reset data wrapper
|
555
|
+
if hasattr(self, '_data') and self._data is not None:
|
556
|
+
raw_data = dict(self._data) # Extract the underlying dict
|
557
|
+
self._data = DataWithTags(raw_data, self)
|
558
|
+
|
559
|
+
# Reset text wrapper
|
560
|
+
if hasattr(self, '_text') and self._text is not None:
|
561
|
+
raw_text = str(self._text) # Extract the underlying string
|
562
|
+
self._text = TextWithTags(raw_text, self)
|
547
563
|
|
548
564
|
def _document_to_section_text(self, document_data, parent_key=''):
|
549
565
|
items = []
|
@@ -121,6 +121,16 @@ class Submission:
|
|
121
121
|
self.accession = self.metadata.content['accession-number']
|
122
122
|
self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"
|
123
123
|
|
124
|
+
|
125
|
+
# booleans
|
126
|
+
self._has_xbrl = any(
|
127
|
+
doc['type'] in ('EX-100.INS', 'EX-101.INS') or
|
128
|
+
doc.get('filename', '').endswith('_htm.xml')
|
129
|
+
for doc in self.metadata.content['documents']
|
130
|
+
)
|
131
|
+
|
132
|
+
self._has_fundamentals = self._has_xbrl
|
133
|
+
|
124
134
|
def _load_document_by_index(self, idx):
|
125
135
|
"""Load a document by its index in the metadata documents list."""
|
126
136
|
doc = self.metadata.content['documents'][idx]
|
@@ -3,6 +3,12 @@ from .dictionaries import download_dictionary, load_dictionary
|
|
3
3
|
_active_dictionaries = []
|
4
4
|
_loaded_dictionaries = {}
|
5
5
|
|
6
|
+
def clear_dictionaries():
|
7
|
+
"""Remove all active dictionaries"""
|
8
|
+
global _active_dictionaries, _loaded_dictionaries
|
9
|
+
_active_dictionaries = []
|
10
|
+
_loaded_dictionaries = {}
|
11
|
+
|
6
12
|
def set_dictionaries(dictionaries, overwrite=False):
|
7
13
|
"""Set active dictionaries and load them into memory"""
|
8
14
|
global _active_dictionaries, _loaded_dictionaries
|
@@ -67,25 +67,31 @@ def get_ticker_regex_dict():
|
|
67
67
|
return dict(ticker_regex_list)
|
68
68
|
|
69
69
|
# will change in future to accomodate other datasets
|
70
|
-
def validate_full_name(full_name,keywords):
|
70
|
+
def validate_full_name(full_name, keywords):
|
71
71
|
if len(full_name) == 1:
|
72
72
|
return False
|
73
|
-
|
74
|
-
|
73
|
+
|
74
|
+
# Clean punctuation before validation
|
75
|
+
cleaned_name = [word.rstrip(".,;:!?()[]") for word in full_name]
|
76
|
+
|
77
|
+
# Skip validation if cleaning removed everything
|
78
|
+
if not all(cleaned_name):
|
75
79
|
return False
|
76
|
-
|
77
|
-
|
80
|
+
|
81
|
+
# Apply existing checks to cleaned words
|
82
|
+
if all(word.isupper() for word in cleaned_name):
|
78
83
|
return False
|
79
|
-
|
84
|
+
|
85
|
+
# check if any number in word
|
86
|
+
if any(any(char.isdigit() for char in word) for word in cleaned_name):
|
80
87
|
return False
|
81
88
|
|
82
89
|
# add optional set lookups
|
83
90
|
if keywords is not None:
|
84
91
|
# return false if first word is not in keywords set
|
85
|
-
if
|
92
|
+
if cleaned_name[0] not in keywords:
|
86
93
|
return False
|
87
94
|
|
88
|
-
|
89
95
|
return True
|
90
96
|
|
91
97
|
def get_full_names(text,keywords=None):
|
@@ -32,7 +32,7 @@ if not os.path.exists(file_path):
|
|
32
32
|
setup(
|
33
33
|
name="datamule",
|
34
34
|
author="John Friedman",
|
35
|
-
version="2.2.3",
|
35
|
+
version="2.2.5",
|
36
36
|
description="Work with SEC submissions at scale.",
|
37
37
|
packages=find_packages(include=['datamule', 'datamule.*']),
|
38
38
|
url="https://github.com/john-friedman/datamule-python",
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|