datamule 2.2.0__py3-none-any.whl → 2.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -64,11 +64,15 @@ class Tags:
64
64
  self.document = document
65
65
  self._tickers = None
66
66
  self.dictionaries = {}
67
+ self.processors = {}
67
68
 
68
- # Load global dictionaries with their data
69
+ # Load global dictionaries with their data and processors
69
70
  active_dicts = _active_dictionaries
70
71
  for dict_name in active_dicts:
71
- self.dictionaries[dict_name] = _loaded_dictionaries[dict_name]
72
+ dict_info = _loaded_dictionaries[dict_name]
73
+ self.dictionaries[dict_name] = dict_info['data']
74
+ if dict_info['processor'] is not None:
75
+ self.processors[dict_name] = dict_info['processor']
72
76
 
73
77
 
74
78
  def _check_support(self):
@@ -128,9 +132,9 @@ class Tags:
128
132
  return None
129
133
 
130
134
  if not hasattr(self, '_persons'):
131
- if '8k_2024_persons' in self.dictionaries:
132
- # Use FlashText dictionary lookup for 8K persons
133
- self._persons = get_full_names_dictionary_lookup(self.document.text, self.dictionaries['8k_2024_persons'])
135
+ if '8k_2024_persons' in self.processors:
136
+ # Use pre-built processor
137
+ self._persons = get_full_names_dictionary_lookup(self.document.text, self.processors['8k_2024_persons'])
134
138
  elif 'ssa_baby_first_names' in self.dictionaries:
135
139
  # Use regex with SSA names for validation
136
140
  self._persons = get_full_names(self.document.text, self.dictionaries['ssa_baby_first_names'])
datamule/tags/config.py CHANGED
@@ -12,5 +12,22 @@ def set_dictionaries(dictionaries, overwrite=False):
12
12
  for dict_name in dictionaries:
13
13
  # Download if needed
14
14
  download_dictionary(dict_name, overwrite=overwrite)
15
- # Load into memory
16
- _loaded_dictionaries[dict_name] = load_dictionary(dict_name)
15
+ # Load raw data
16
+ raw_data = load_dictionary(dict_name)
17
+
18
+ # Create processor for dictionary lookup methods
19
+ if dict_name in ['8k_2024_persons']: # Add other dict names as needed
20
+ from flashtext import KeywordProcessor
21
+ processor = KeywordProcessor(case_sensitive=True)
22
+ for key in raw_data.keys():
23
+ processor.add_keyword(key, key)
24
+
25
+ _loaded_dictionaries[dict_name] = {
26
+ 'data': raw_data,
27
+ 'processor': processor
28
+ }
29
+ else:
30
+ _loaded_dictionaries[dict_name] = {
31
+ 'data': raw_data,
32
+ 'processor': None
33
+ }
datamule/tags/utils.py CHANGED
@@ -134,14 +134,10 @@ def get_full_names(text,keywords=None):
134
134
  return full_names
135
135
 
136
136
  # add dictionary lookup based on precomputed lists
137
- def get_full_names_dictionary_lookup(text, dictionary):
138
- keyword_processor = KeywordProcessor(case_sensitive=True)
139
-
140
- for key in dictionary.keys():
141
- keyword_processor.add_keyword(key, key)
142
-
137
+ def get_full_names_dictionary_lookup(text, processor):
138
+ """Use pre-built KeywordProcessor instead of creating new one"""
143
139
  matches = []
144
- keywords_found = keyword_processor.extract_keywords(text, span_info=True)
140
+ keywords_found = processor.extract_keywords(text, span_info=True)
145
141
 
146
142
  for keyword, start_pos, end_pos in keywords_found:
147
143
  matches.append((keyword, start_pos, end_pos))
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamule
3
- Version: 2.2.0
3
+ Version: 2.2.1
4
4
  Summary: Work with SEC submissions at scale.
5
5
  Home-page: https://github.com/john-friedman/datamule-python
6
6
  Author: John Friedman
@@ -15,7 +15,7 @@ datamule/datamule/datamule_mysql_rds.py,sha256=Q6_h24-SNECWK60RnM6UQjUIp5dhJmfn3
15
15
  datamule/datamule/downloader.py,sha256=mVg1SApfij_9-dTpcm_YB26Bxc_Yq1FR8xv2k50MHqU,18579
16
16
  datamule/datamule/sec_connector.py,sha256=VwOaODpHoAWy8JIky6kLR1-orW_PB61RHw7pIGRpkow,3288
17
17
  datamule/document/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
- datamule/document/document.py,sha256=QjncYOIdf0Zf_0AONEOXu2KlPxMksGZzvwmHOpbM5N8,20450
18
+ datamule/document/document.py,sha256=yiev4AYewjp8bPjWn9cuL43N2O11s9WUo4X2e7WUgiY,20628
19
19
  datamule/document/tables/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
20
20
  datamule/document/tables/tables.py,sha256=8riSAof6o-Gxoo0SkiQAE61fw8NmzDnEhJe6dATzmvA,4487
21
21
  datamule/document/tables/tables_13fhr.py,sha256=-6tWcaTyNsb0XuW0WMBrYir9Zn1wLZL0laKxRYfPNyg,4265
@@ -49,14 +49,14 @@ datamule/sec/xbrl/xbrlmonitor.py,sha256=TKFVfSyyUUfUgFQw4WxEVs4g8Nh-2C0tygNIRmTq
49
49
  datamule/seclibrary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
50
50
  datamule/seclibrary/bq.py,sha256=C8sb_rpXTvchprrFLcbRar4Qi0XWW25tnv1YsHSS5o4,18025
51
51
  datamule/tags/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
52
- datamule/tags/config.py,sha256=JVdIkqu9rBEAadNLP-FiIbZ35TRORGIDCJvqDh0CuqE,585
52
+ datamule/tags/config.py,sha256=RCYRw_voP2MrEx_iN7zjJiZ8YDa4QlzKPGpW5ZTij6U,1197
53
53
  datamule/tags/regex.py,sha256=Zr1dlnb8OfecDkI2DFCI8DUBr9LI50fapQyBAYNEZrg,4487
54
- datamule/tags/utils.py,sha256=k5fyjMjJNh6gZjj491sw_9rnMqYIlHHDBathkDcHD0A,5423
54
+ datamule/tags/utils.py,sha256=hQpQBVAJPmys1UKVS2mqc8Z5-qO_zma5ecFXvW9DXoo,5329
55
55
  datamule/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
56
56
  datamule/utils/construct_submissions_data.py,sha256=NB_hvfxlRXPyt4Fgc-5qA8vJRItkLhBedCSTaxwW7Jg,5887
57
57
  datamule/utils/dictionaries.py,sha256=VImvQWlP8IohB76rDd83bZcT184LBOpOaXPOH46fA6Y,2795
58
58
  datamule/utils/format_accession.py,sha256=60RtqoNqoT9zSKVb1DeOv1gncJxzPTFMNW4SNOVmC_g,476
59
- datamule-2.2.0.dist-info/METADATA,sha256=fuT_ABK8D6LhEi1_TjtVnIKobXdafBPiMSGy3XCWyRo,585
60
- datamule-2.2.0.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
61
- datamule-2.2.0.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
62
- datamule-2.2.0.dist-info/RECORD,,
59
+ datamule-2.2.1.dist-info/METADATA,sha256=aINGZMWV34SclEt-2Ij2d2848PJA7cLF6ZoBL2LwpfY,585
60
+ datamule-2.2.1.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
61
+ datamule-2.2.1.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
62
+ datamule-2.2.1.dist-info/RECORD,,