datamule 2.2.0__tar.gz → 2.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datamule-2.2.0 → datamule-2.2.1}/PKG-INFO +1 -1
- {datamule-2.2.0 → datamule-2.2.1}/datamule/document/document.py +9 -5
- datamule-2.2.1/datamule/tags/config.py +33 -0
- {datamule-2.2.0 → datamule-2.2.1}/datamule/tags/utils.py +3 -7
- {datamule-2.2.0 → datamule-2.2.1}/datamule.egg-info/PKG-INFO +1 -1
- {datamule-2.2.0 → datamule-2.2.1}/setup.py +1 -1
- datamule-2.2.0/datamule/tags/config.py +0 -16
- {datamule-2.2.0 → datamule-2.2.1}/datamule/__init__.py +0 -0
- {datamule-2.2.0 → datamule-2.2.1}/datamule/config.py +0 -0
- {datamule-2.2.0 → datamule-2.2.1}/datamule/data/listed_filer_metadata.csv +0 -0
- {datamule-2.2.0 → datamule-2.2.1}/datamule/datamule/__init__.py +0 -0
- {datamule-2.2.0 → datamule-2.2.1}/datamule/datamule/datamule_lookup.py +0 -0
- {datamule-2.2.0 → datamule-2.2.1}/datamule/datamule/datamule_mysql_rds.py +0 -0
- {datamule-2.2.0 → datamule-2.2.1}/datamule/datamule/downloader.py +0 -0
- {datamule-2.2.0 → datamule-2.2.1}/datamule/datamule/sec_connector.py +0 -0
- {datamule-2.2.0 → datamule-2.2.1}/datamule/datasets.py +0 -0
- {datamule-2.2.0 → datamule-2.2.1}/datamule/document/__init__.py +0 -0
- {datamule-2.2.0 → datamule-2.2.1}/datamule/document/tables/__init__.py +0 -0
- {datamule-2.2.0 → datamule-2.2.1}/datamule/document/tables/tables.py +0 -0
- {datamule-2.2.0 → datamule-2.2.1}/datamule/document/tables/tables_13fhr.py +0 -0
- {datamule-2.2.0 → datamule-2.2.1}/datamule/document/tables/tables_25nse.py +0 -0
- {datamule-2.2.0 → datamule-2.2.1}/datamule/document/tables/tables_informationtable.py +0 -0
- {datamule-2.2.0 → datamule-2.2.1}/datamule/document/tables/tables_npx.py +0 -0
- {datamule-2.2.0 → datamule-2.2.1}/datamule/document/tables/tables_ownership.py +0 -0
- {datamule-2.2.0 → datamule-2.2.1}/datamule/document/tables/tables_proxyvotingrecord.py +0 -0
- {datamule-2.2.0 → datamule-2.2.1}/datamule/document/tables/tables_sbsef.py +0 -0
- {datamule-2.2.0 → datamule-2.2.1}/datamule/document/tables/tables_sdr.py +0 -0
- {datamule-2.2.0 → datamule-2.2.1}/datamule/document/tables/utils.py +0 -0
- {datamule-2.2.0 → datamule-2.2.1}/datamule/helper.py +0 -0
- {datamule-2.2.0 → datamule-2.2.1}/datamule/index.py +0 -0
- {datamule-2.2.0 → datamule-2.2.1}/datamule/mapping_dicts/__init__.py +0 -0
- {datamule-2.2.0 → datamule-2.2.1}/datamule/mapping_dicts/html_mapping_dicts.py +0 -0
- {datamule-2.2.0 → datamule-2.2.1}/datamule/mapping_dicts/txt_mapping_dicts.py +0 -0
- {datamule-2.2.0 → datamule-2.2.1}/datamule/mapping_dicts/xml_mapping_dicts.py +0 -0
- {datamule-2.2.0 → datamule-2.2.1}/datamule/package_updater.py +0 -0
- {datamule-2.2.0 → datamule-2.2.1}/datamule/portfolio.py +0 -0
- {datamule-2.2.0 → datamule-2.2.1}/datamule/portfolio_compression_utils.py +0 -0
- {datamule-2.2.0 → datamule-2.2.1}/datamule/sec/__init__.py +0 -0
- {datamule-2.2.0 → datamule-2.2.1}/datamule/sec/infrastructure/__init__.py +0 -0
- {datamule-2.2.0 → datamule-2.2.1}/datamule/sec/infrastructure/submissions_metadata.py +0 -0
- {datamule-2.2.0 → datamule-2.2.1}/datamule/sec/submissions/__init__.py +0 -0
- {datamule-2.2.0 → datamule-2.2.1}/datamule/sec/submissions/downloader.py +0 -0
- {datamule-2.2.0 → datamule-2.2.1}/datamule/sec/submissions/eftsquery.py +0 -0
- {datamule-2.2.0 → datamule-2.2.1}/datamule/sec/submissions/monitor.py +0 -0
- {datamule-2.2.0 → datamule-2.2.1}/datamule/sec/submissions/streamer.py +0 -0
- {datamule-2.2.0 → datamule-2.2.1}/datamule/sec/submissions/textsearch.py +0 -0
- {datamule-2.2.0 → datamule-2.2.1}/datamule/sec/utils.py +0 -0
- {datamule-2.2.0 → datamule-2.2.1}/datamule/sec/xbrl/__init__.py +0 -0
- {datamule-2.2.0 → datamule-2.2.1}/datamule/sec/xbrl/downloadcompanyfacts.py +0 -0
- {datamule-2.2.0 → datamule-2.2.1}/datamule/sec/xbrl/filter_xbrl.py +0 -0
- {datamule-2.2.0 → datamule-2.2.1}/datamule/sec/xbrl/streamcompanyfacts.py +0 -0
- {datamule-2.2.0 → datamule-2.2.1}/datamule/sec/xbrl/xbrlmonitor.py +0 -0
- {datamule-2.2.0 → datamule-2.2.1}/datamule/seclibrary/__init__.py +0 -0
- {datamule-2.2.0 → datamule-2.2.1}/datamule/seclibrary/bq.py +0 -0
- {datamule-2.2.0 → datamule-2.2.1}/datamule/sheet.py +0 -0
- {datamule-2.2.0 → datamule-2.2.1}/datamule/submission.py +0 -0
- {datamule-2.2.0 → datamule-2.2.1}/datamule/tags/__init__.py +0 -0
- {datamule-2.2.0 → datamule-2.2.1}/datamule/tags/regex.py +0 -0
- {datamule-2.2.0 → datamule-2.2.1}/datamule/utils/__init__.py +0 -0
- {datamule-2.2.0 → datamule-2.2.1}/datamule/utils/construct_submissions_data.py +0 -0
- {datamule-2.2.0 → datamule-2.2.1}/datamule/utils/dictionaries.py +0 -0
- {datamule-2.2.0 → datamule-2.2.1}/datamule/utils/format_accession.py +0 -0
- {datamule-2.2.0 → datamule-2.2.1}/datamule.egg-info/SOURCES.txt +0 -0
- {datamule-2.2.0 → datamule-2.2.1}/datamule.egg-info/dependency_links.txt +0 -0
- {datamule-2.2.0 → datamule-2.2.1}/datamule.egg-info/requires.txt +0 -0
- {datamule-2.2.0 → datamule-2.2.1}/datamule.egg-info/top_level.txt +0 -0
- {datamule-2.2.0 → datamule-2.2.1}/setup.cfg +0 -0
@@ -64,11 +64,15 @@ class Tags:
|
|
64
64
|
self.document = document
|
65
65
|
self._tickers = None
|
66
66
|
self.dictionaries = {}
|
67
|
+
self.processors = {}
|
67
68
|
|
68
|
-
# Load global dictionaries with their data
|
69
|
+
# Load global dictionaries with their data and processors
|
69
70
|
active_dicts = _active_dictionaries
|
70
71
|
for dict_name in active_dicts:
|
71
|
-
|
72
|
+
dict_info = _loaded_dictionaries[dict_name]
|
73
|
+
self.dictionaries[dict_name] = dict_info['data']
|
74
|
+
if dict_info['processor'] is not None:
|
75
|
+
self.processors[dict_name] = dict_info['processor']
|
72
76
|
|
73
77
|
|
74
78
|
def _check_support(self):
|
@@ -128,9 +132,9 @@ class Tags:
|
|
128
132
|
return None
|
129
133
|
|
130
134
|
if not hasattr(self, '_persons'):
|
131
|
-
if '8k_2024_persons' in self.dictionaries:
|
132
|
-
# Use
|
133
|
-
self._persons = get_full_names_dictionary_lookup(self.document.text, self.dictionaries['8k_2024_persons'])
|
135
|
+
if '8k_2024_persons' in self.processors:
|
136
|
+
# Use pre-built processor
|
137
|
+
self._persons = get_full_names_dictionary_lookup(self.document.text, self.processors['8k_2024_persons'])
|
134
138
|
elif 'ssa_baby_first_names' in self.dictionaries:
|
135
139
|
# Use regex with SSA names for validation
|
136
140
|
self._persons = get_full_names(self.document.text, self.dictionaries['ssa_baby_first_names'])
|
@@ -0,0 +1,33 @@
|
|
1
|
+
from ..utils.dictionaries import download_dictionary, load_dictionary
|
2
|
+
|
3
|
+
_active_dictionaries = []
|
4
|
+
_loaded_dictionaries = {}
|
5
|
+
|
6
|
+
def set_dictionaries(dictionaries, overwrite=False):
|
7
|
+
"""Set active dictionaries and load them into memory"""
|
8
|
+
global _active_dictionaries, _loaded_dictionaries
|
9
|
+
_active_dictionaries = dictionaries
|
10
|
+
_loaded_dictionaries = {}
|
11
|
+
|
12
|
+
for dict_name in dictionaries:
|
13
|
+
# Download if needed
|
14
|
+
download_dictionary(dict_name, overwrite=overwrite)
|
15
|
+
# Load raw data
|
16
|
+
raw_data = load_dictionary(dict_name)
|
17
|
+
|
18
|
+
# Create processor for dictionary lookup methods
|
19
|
+
if dict_name in ['8k_2024_persons']: # Add other dict names as needed
|
20
|
+
from flashtext import KeywordProcessor
|
21
|
+
processor = KeywordProcessor(case_sensitive=True)
|
22
|
+
for key in raw_data.keys():
|
23
|
+
processor.add_keyword(key, key)
|
24
|
+
|
25
|
+
_loaded_dictionaries[dict_name] = {
|
26
|
+
'data': raw_data,
|
27
|
+
'processor': processor
|
28
|
+
}
|
29
|
+
else:
|
30
|
+
_loaded_dictionaries[dict_name] = {
|
31
|
+
'data': raw_data,
|
32
|
+
'processor': None
|
33
|
+
}
|
@@ -134,14 +134,10 @@ def get_full_names(text,keywords=None):
|
|
134
134
|
return full_names
|
135
135
|
|
136
136
|
# add dictionary lookup based on precomputed lists
|
137
|
-
def get_full_names_dictionary_lookup(text, dictionary):
|
138
|
-
|
139
|
-
|
140
|
-
for key in dictionary.keys():
|
141
|
-
keyword_processor.add_keyword(key, key)
|
142
|
-
|
137
|
+
def get_full_names_dictionary_lookup(text, processor):
|
138
|
+
"""Use pre-built KeywordProcessor instead of creating new one"""
|
143
139
|
matches = []
|
144
|
-
keywords_found = keyword_processor.extract_keywords(text, span_info=True)
|
140
|
+
keywords_found = processor.extract_keywords(text, span_info=True)
|
145
141
|
|
146
142
|
for keyword, start_pos, end_pos in keywords_found:
|
147
143
|
matches.append((keyword, start_pos, end_pos))
|
@@ -32,7 +32,7 @@ if not os.path.exists(file_path):
|
|
32
32
|
setup(
|
33
33
|
name="datamule",
|
34
34
|
author="John Friedman",
|
35
|
-
version="2.2.0",
|
35
|
+
version="2.2.1",
|
36
36
|
description="Work with SEC submissions at scale.",
|
37
37
|
packages=find_packages(include=['datamule', 'datamule.*']),
|
38
38
|
url="https://github.com/john-friedman/datamule-python",
|
@@ -1,16 +0,0 @@
|
|
1
|
-
from ..utils.dictionaries import download_dictionary, load_dictionary
|
2
|
-
|
3
|
-
_active_dictionaries = []
|
4
|
-
_loaded_dictionaries = {}
|
5
|
-
|
6
|
-
def set_dictionaries(dictionaries, overwrite=False):
|
7
|
-
"""Set active dictionaries and load them into memory"""
|
8
|
-
global _active_dictionaries, _loaded_dictionaries
|
9
|
-
_active_dictionaries = dictionaries
|
10
|
-
_loaded_dictionaries = {}
|
11
|
-
|
12
|
-
for dict_name in dictionaries:
|
13
|
-
# Download if needed
|
14
|
-
download_dictionary(dict_name, overwrite=overwrite)
|
15
|
-
# Load into memory
|
16
|
-
_loaded_dictionaries[dict_name] = load_dictionary(dict_name)
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|