datamule 2.2.0__py3-none-any.whl → 2.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamule/document/document.py +9 -5
- datamule/tags/config.py +19 -2
- datamule/tags/utils.py +3 -7
- {datamule-2.2.0.dist-info → datamule-2.2.1.dist-info}/METADATA +1 -1
- {datamule-2.2.0.dist-info → datamule-2.2.1.dist-info}/RECORD +7 -7
- {datamule-2.2.0.dist-info → datamule-2.2.1.dist-info}/WHEEL +0 -0
- {datamule-2.2.0.dist-info → datamule-2.2.1.dist-info}/top_level.txt +0 -0
datamule/document/document.py
CHANGED
@@ -64,11 +64,15 @@ class Tags:
|
|
64
64
|
self.document = document
|
65
65
|
self._tickers = None
|
66
66
|
self.dictionaries = {}
|
67
|
+
self.processors = {}
|
67
68
|
|
68
|
-
# Load global dictionaries with their data
|
69
|
+
# Load global dictionaries with their data and processors
|
69
70
|
active_dicts = _active_dictionaries
|
70
71
|
for dict_name in active_dicts:
|
71
|
-
|
72
|
+
dict_info = _loaded_dictionaries[dict_name]
|
73
|
+
self.dictionaries[dict_name] = dict_info['data']
|
74
|
+
if dict_info['processor'] is not None:
|
75
|
+
self.processors[dict_name] = dict_info['processor']
|
72
76
|
|
73
77
|
|
74
78
|
def _check_support(self):
|
@@ -128,9 +132,9 @@ class Tags:
|
|
128
132
|
return None
|
129
133
|
|
130
134
|
if not hasattr(self, '_persons'):
|
131
|
-
if '8k_2024_persons' in self.
|
132
|
-
# Use
|
133
|
-
self._persons = get_full_names_dictionary_lookup(self.document.text, self.
|
135
|
+
if '8k_2024_persons' in self.processors:
|
136
|
+
# Use pre-built processor
|
137
|
+
self._persons = get_full_names_dictionary_lookup(self.document.text, self.processors['8k_2024_persons'])
|
134
138
|
elif 'ssa_baby_first_names' in self.dictionaries:
|
135
139
|
# Use regex with SSA names for validation
|
136
140
|
self._persons = get_full_names(self.document.text, self.dictionaries['ssa_baby_first_names'])
|
datamule/tags/config.py
CHANGED
@@ -12,5 +12,22 @@ def set_dictionaries(dictionaries, overwrite=False):
|
|
12
12
|
for dict_name in dictionaries:
|
13
13
|
# Download if needed
|
14
14
|
download_dictionary(dict_name, overwrite=overwrite)
|
15
|
-
# Load
|
16
|
-
|
15
|
+
# Load raw data
|
16
|
+
raw_data = load_dictionary(dict_name)
|
17
|
+
|
18
|
+
# Create processor for dictionary lookup methods
|
19
|
+
if dict_name in ['8k_2024_persons']: # Add other dict names as needed
|
20
|
+
from flashtext import KeywordProcessor
|
21
|
+
processor = KeywordProcessor(case_sensitive=True)
|
22
|
+
for key in raw_data.keys():
|
23
|
+
processor.add_keyword(key, key)
|
24
|
+
|
25
|
+
_loaded_dictionaries[dict_name] = {
|
26
|
+
'data': raw_data,
|
27
|
+
'processor': processor
|
28
|
+
}
|
29
|
+
else:
|
30
|
+
_loaded_dictionaries[dict_name] = {
|
31
|
+
'data': raw_data,
|
32
|
+
'processor': None
|
33
|
+
}
|
datamule/tags/utils.py
CHANGED
@@ -134,14 +134,10 @@ def get_full_names(text,keywords=None):
|
|
134
134
|
return full_names
|
135
135
|
|
136
136
|
# add dictionary lookup based on precomputed lists
|
137
|
-
def get_full_names_dictionary_lookup(text,
|
138
|
-
|
139
|
-
|
140
|
-
for key in dictionary.keys():
|
141
|
-
keyword_processor.add_keyword(key, key)
|
142
|
-
|
137
|
+
def get_full_names_dictionary_lookup(text, processor):
|
138
|
+
"""Use pre-built KeywordProcessor instead of creating new one"""
|
143
139
|
matches = []
|
144
|
-
keywords_found =
|
140
|
+
keywords_found = processor.extract_keywords(text, span_info=True)
|
145
141
|
|
146
142
|
for keyword, start_pos, end_pos in keywords_found:
|
147
143
|
matches.append((keyword, start_pos, end_pos))
|
@@ -15,7 +15,7 @@ datamule/datamule/datamule_mysql_rds.py,sha256=Q6_h24-SNECWK60RnM6UQjUIp5dhJmfn3
|
|
15
15
|
datamule/datamule/downloader.py,sha256=mVg1SApfij_9-dTpcm_YB26Bxc_Yq1FR8xv2k50MHqU,18579
|
16
16
|
datamule/datamule/sec_connector.py,sha256=VwOaODpHoAWy8JIky6kLR1-orW_PB61RHw7pIGRpkow,3288
|
17
17
|
datamule/document/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
18
|
-
datamule/document/document.py,sha256=
|
18
|
+
datamule/document/document.py,sha256=yiev4AYewjp8bPjWn9cuL43N2O11s9WUo4X2e7WUgiY,20628
|
19
19
|
datamule/document/tables/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
20
20
|
datamule/document/tables/tables.py,sha256=8riSAof6o-Gxoo0SkiQAE61fw8NmzDnEhJe6dATzmvA,4487
|
21
21
|
datamule/document/tables/tables_13fhr.py,sha256=-6tWcaTyNsb0XuW0WMBrYir9Zn1wLZL0laKxRYfPNyg,4265
|
@@ -49,14 +49,14 @@ datamule/sec/xbrl/xbrlmonitor.py,sha256=TKFVfSyyUUfUgFQw4WxEVs4g8Nh-2C0tygNIRmTq
|
|
49
49
|
datamule/seclibrary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
50
50
|
datamule/seclibrary/bq.py,sha256=C8sb_rpXTvchprrFLcbRar4Qi0XWW25tnv1YsHSS5o4,18025
|
51
51
|
datamule/tags/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
52
|
-
datamule/tags/config.py,sha256=
|
52
|
+
datamule/tags/config.py,sha256=RCYRw_voP2MrEx_iN7zjJiZ8YDa4QlzKPGpW5ZTij6U,1197
|
53
53
|
datamule/tags/regex.py,sha256=Zr1dlnb8OfecDkI2DFCI8DUBr9LI50fapQyBAYNEZrg,4487
|
54
|
-
datamule/tags/utils.py,sha256=
|
54
|
+
datamule/tags/utils.py,sha256=hQpQBVAJPmys1UKVS2mqc8Z5-qO_zma5ecFXvW9DXoo,5329
|
55
55
|
datamule/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
56
56
|
datamule/utils/construct_submissions_data.py,sha256=NB_hvfxlRXPyt4Fgc-5qA8vJRItkLhBedCSTaxwW7Jg,5887
|
57
57
|
datamule/utils/dictionaries.py,sha256=VImvQWlP8IohB76rDd83bZcT184LBOpOaXPOH46fA6Y,2795
|
58
58
|
datamule/utils/format_accession.py,sha256=60RtqoNqoT9zSKVb1DeOv1gncJxzPTFMNW4SNOVmC_g,476
|
59
|
-
datamule-2.2.
|
60
|
-
datamule-2.2.
|
61
|
-
datamule-2.2.
|
62
|
-
datamule-2.2.
|
59
|
+
datamule-2.2.1.dist-info/METADATA,sha256=aINGZMWV34SclEt-2Ij2d2848PJA7cLF6ZoBL2LwpfY,585
|
60
|
+
datamule-2.2.1.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
|
61
|
+
datamule-2.2.1.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
|
62
|
+
datamule-2.2.1.dist-info/RECORD,,
|
File without changes
|
File without changes
|