datamule 2.2.1__py3-none-any.whl → 2.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamule/document/document.py +142 -58
- datamule/utils/dictionaries.py +8 -1
- {datamule-2.2.1.dist-info → datamule-2.2.2.dist-info}/METADATA +1 -1
- {datamule-2.2.1.dist-info → datamule-2.2.2.dist-info}/RECORD +6 -6
- {datamule-2.2.1.dist-info → datamule-2.2.2.dist-info}/WHEEL +0 -0
- {datamule-2.2.1.dist-info → datamule-2.2.2.dist-info}/top_level.txt +0 -0
datamule/document/document.py
CHANGED
@@ -13,11 +13,35 @@ from pathlib import Path
|
|
13
13
|
import webbrowser
|
14
14
|
from secsgml.utils import bytes_to_str
|
15
15
|
import tempfile
|
16
|
-
import warnings
|
17
16
|
from .tables.tables import Tables
|
18
17
|
|
19
18
|
from ..tags.utils import get_cusip_using_regex, get_isin_using_regex, get_figi_using_regex,get_all_tickers, get_full_names,get_full_names_dictionary_lookup
|
20
19
|
|
20
|
+
class DataWithTags(dict):
|
21
|
+
def __init__(self, data, document):
|
22
|
+
super().__init__(data)
|
23
|
+
self._document = document
|
24
|
+
self._tags = None
|
25
|
+
|
26
|
+
@property
|
27
|
+
def tags(self):
|
28
|
+
if self._tags is None:
|
29
|
+
self._tags = Tags(self._document, mode='data') # New fragment-based behavior
|
30
|
+
return self._tags
|
31
|
+
|
32
|
+
class TextWithTags(str):
|
33
|
+
def __new__(cls, content, document):
|
34
|
+
instance = str.__new__(cls, content)
|
35
|
+
instance._document = document
|
36
|
+
instance._tags = None
|
37
|
+
return instance
|
38
|
+
|
39
|
+
@property
|
40
|
+
def tags(self):
|
41
|
+
if self._tags is None:
|
42
|
+
self._tags = Tags(self._document, mode='text') # Original behavior
|
43
|
+
return self._tags
|
44
|
+
|
21
45
|
|
22
46
|
class Tickers:
|
23
47
|
def __init__(self, document):
|
@@ -27,11 +51,7 @@ class Tickers:
|
|
27
51
|
def _get_tickers_data(self):
|
28
52
|
"""Get all tickers data once and cache it"""
|
29
53
|
if self._tickers_data is None:
|
30
|
-
|
31
|
-
if self.document.extension not in ['.htm', '.html', '.txt']:
|
32
|
-
self._tickers_data = {}
|
33
|
-
else:
|
34
|
-
self._tickers_data = get_all_tickers(self.document.text)
|
54
|
+
self._tickers_data = get_all_tickers(self.document.text)
|
35
55
|
return self._tickers_data
|
36
56
|
|
37
57
|
def __getattr__(self, exchange_name):
|
@@ -58,13 +78,14 @@ class Tickers:
|
|
58
78
|
return str(data)
|
59
79
|
|
60
80
|
class Tags:
|
61
|
-
def __init__(self, document):
|
81
|
+
def __init__(self, document, mode='text'):
|
62
82
|
from ..tags.config import _active_dictionaries,_loaded_dictionaries
|
63
|
-
self.not_supported = document.extension not in ['.htm', '.html', '.txt']
|
64
83
|
self.document = document
|
84
|
+
self.mode = mode # 'text' or 'data'
|
65
85
|
self._tickers = None
|
66
86
|
self.dictionaries = {}
|
67
87
|
self.processors = {}
|
88
|
+
self._text_sources = None
|
68
89
|
|
69
90
|
# Load global dictionaries with their data and processors
|
70
91
|
active_dicts = _active_dictionaries
|
@@ -73,77 +94,131 @@ class Tags:
|
|
73
94
|
self.dictionaries[dict_name] = dict_info['data']
|
74
95
|
if dict_info['processor'] is not None:
|
75
96
|
self.processors[dict_name] = dict_info['processor']
|
76
|
-
|
77
97
|
|
78
|
-
def
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
98
|
+
def _get_text_sources(self):
|
99
|
+
"""Get text sources based on mode - either single text or multiple fragments"""
|
100
|
+
if self._text_sources is None:
|
101
|
+
if self.mode == 'text':
|
102
|
+
# Original behavior - single text source
|
103
|
+
self._text_sources = [{'id': None, 'text': str(self.document.text)}]
|
104
|
+
else: # mode == 'data'
|
105
|
+
# New behavior - multiple text fragments
|
106
|
+
self._text_sources = []
|
107
|
+
self._extract_text_fragments(self.document.data, '')
|
108
|
+
return self._text_sources
|
109
|
+
|
110
|
+
def _extract_text_fragments(self, data, parent_id=''):
|
111
|
+
"""Extract all text fragments with their document IDs from parsed data"""
|
112
|
+
if isinstance(data, dict):
|
113
|
+
for key, value in data.items():
|
114
|
+
if key in ["text", "title"] and isinstance(value, str):
|
115
|
+
# Use the current dictionary's parent key as the fragment ID
|
116
|
+
self._text_sources.append({
|
117
|
+
'id': parent_id,
|
118
|
+
'text': value
|
119
|
+
})
|
120
|
+
elif isinstance(value, (dict, list)):
|
121
|
+
# Pass the current key as the parent_id for the next level
|
122
|
+
self._extract_text_fragments(value, key)
|
123
|
+
elif isinstance(data, list):
|
124
|
+
for i, item in enumerate(data):
|
125
|
+
if isinstance(item, (dict, list)):
|
126
|
+
self._extract_text_fragments(item, parent_id)
|
127
|
+
|
128
|
+
def _format_results(self, results, fragment_id):
|
129
|
+
"""Format results based on mode"""
|
130
|
+
if self.mode == 'text':
|
131
|
+
# Original format: (match, start, end)
|
132
|
+
return results
|
133
|
+
else:
|
134
|
+
# New format: (match, fragment_id, start, end)
|
135
|
+
return [(match, fragment_id, start, end) for match, start, end in results]
|
83
136
|
|
84
137
|
@property
|
85
138
|
def cusips(self):
|
86
|
-
if not self
|
87
|
-
|
139
|
+
if not hasattr(self, '_cusips'):
|
140
|
+
self._cusips = []
|
141
|
+
sources = self._get_text_sources()
|
88
142
|
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
143
|
+
for source in sources:
|
144
|
+
if 'sc13dg_cusips' in self.dictionaries:
|
145
|
+
keywords = self.dictionaries['sc13dg_cusips']
|
146
|
+
results = get_cusip_using_regex(source['text'], keywords)
|
147
|
+
elif "13fhr_information_table_cusips" in self.dictionaries:
|
148
|
+
keywords = self.dictionaries['13fhr_information_table_cusips']
|
149
|
+
results = get_cusip_using_regex(source['text'], keywords)
|
150
|
+
else:
|
151
|
+
results = get_cusip_using_regex(source['text'])
|
152
|
+
|
153
|
+
# Format results based on mode
|
154
|
+
formatted_results = self._format_results(results, source['id'])
|
155
|
+
self._cusips.extend(formatted_results)
|
156
|
+
|
157
|
+
return self._cusips
|
96
158
|
|
97
159
|
@property
|
98
160
|
def isins(self):
|
99
|
-
if not self
|
100
|
-
|
161
|
+
if not hasattr(self, '_isins'):
|
162
|
+
self._isins = []
|
163
|
+
sources = self._get_text_sources()
|
101
164
|
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
165
|
+
for source in sources:
|
166
|
+
if 'npx_isins' in self.dictionaries:
|
167
|
+
keywords = self.dictionaries['npx_isins']
|
168
|
+
results = get_isin_using_regex(source['text'], keywords)
|
169
|
+
else:
|
170
|
+
results = get_isin_using_regex(source['text'])
|
171
|
+
|
172
|
+
formatted_results = self._format_results(results, source['id'])
|
173
|
+
self._isins.extend(formatted_results)
|
174
|
+
|
175
|
+
return self._isins
|
109
176
|
|
110
177
|
@property
|
111
178
|
def figis(self):
|
112
|
-
if not self
|
113
|
-
|
179
|
+
if not hasattr(self, '_figis'):
|
180
|
+
self._figis = []
|
181
|
+
sources = self._get_text_sources()
|
114
182
|
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
183
|
+
for source in sources:
|
184
|
+
if 'npx_figis' in self.dictionaries:
|
185
|
+
keywords = self.dictionaries['npx_figis']
|
186
|
+
results = get_figi_using_regex(source['text'], keywords)
|
187
|
+
else:
|
188
|
+
results = get_figi_using_regex(source['text'])
|
189
|
+
|
190
|
+
formatted_results = self._format_results(results, source['id'])
|
191
|
+
self._figis.extend(formatted_results)
|
192
|
+
|
193
|
+
return self._figis
|
122
194
|
|
123
195
|
@property
|
124
196
|
def tickers(self):
|
197
|
+
# Tickers work differently - they need the full document context
|
198
|
+
# Keep original behavior for now
|
125
199
|
if self._tickers is None:
|
126
200
|
self._tickers = Tickers(self.document)
|
127
201
|
return self._tickers
|
128
202
|
|
129
203
|
@property
|
130
204
|
def persons(self):
|
131
|
-
if not self._check_support():
|
132
|
-
return None
|
133
|
-
|
134
205
|
if not hasattr(self, '_persons'):
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
206
|
+
self._persons = []
|
207
|
+
sources = self._get_text_sources()
|
208
|
+
|
209
|
+
for source in sources:
|
210
|
+
if '8k_2024_persons' in self.processors:
|
211
|
+
results = get_full_names_dictionary_lookup(source['text'], self.processors['8k_2024_persons'])
|
212
|
+
elif 'ssa_baby_first_names' in self.dictionaries:
|
213
|
+
results = get_full_names(source['text'], self.dictionaries['ssa_baby_first_names'])
|
214
|
+
else:
|
215
|
+
results = get_full_names(source['text'])
|
216
|
+
|
217
|
+
formatted_results = self._format_results(results, source['id'])
|
218
|
+
self._persons.extend(formatted_results)
|
219
|
+
|
144
220
|
return self._persons
|
145
221
|
|
146
|
-
|
147
222
|
class Document:
|
148
223
|
def __init__(self, type, content, extension,accession,filing_date,path=None):
|
149
224
|
|
@@ -168,8 +243,6 @@ class Document:
|
|
168
243
|
self._tables = None
|
169
244
|
self._text = None
|
170
245
|
|
171
|
-
self.tags = Tags(self)
|
172
|
-
|
173
246
|
|
174
247
|
|
175
248
|
#_load_text_content
|
@@ -354,15 +427,26 @@ class Document:
|
|
354
427
|
def data(self):
|
355
428
|
if self._data is None:
|
356
429
|
self.parse()
|
430
|
+
|
431
|
+
if self._data is None:
|
432
|
+
self._data = {}
|
433
|
+
|
434
|
+
if not isinstance(self._data, DataWithTags):
|
435
|
+
self._data = DataWithTags(self._data, self)
|
436
|
+
|
357
437
|
return self._data
|
358
438
|
|
359
439
|
@property
|
360
440
|
def text(self):
|
361
441
|
if self._text is None:
|
362
442
|
if self.extension in ['.htm','.html']:
|
363
|
-
self._preprocess_html_content()
|
443
|
+
self._preprocess_html_content() # Still sets self._text to plain string
|
364
444
|
elif self.extension == '.txt':
|
365
|
-
self._preprocess_txt_content()
|
445
|
+
self._preprocess_txt_content() # Still sets self._text to plain string
|
446
|
+
|
447
|
+
# Convert the plain string to TextWithTags
|
448
|
+
plain_text = self._text
|
449
|
+
self._text = TextWithTags(plain_text, self)
|
366
450
|
return self._text
|
367
451
|
|
368
452
|
def write_json(self, output_filename=None):
|
datamule/utils/dictionaries.py
CHANGED
@@ -6,7 +6,8 @@ urls = {
|
|
6
6
|
"npx_figis" : "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/npx_figis.txt",
|
7
7
|
"npx_isins" : "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/npx_isins.txt",
|
8
8
|
"sc13dg_cusips" : "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/sc13dg_cusips.txt",
|
9
|
-
"8k_2024_persons" : "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/8k_2024_persons.json"
|
9
|
+
"8k_2024_persons" : "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/8k_2024_persons.json",
|
10
|
+
"13fhr_information_table_cusips" : "https://raw.githubusercontent.com/john-friedman/datamule-data/refs/heads/master/data/dictionaries/13fhr_information_table_cusips.txt"
|
10
11
|
}
|
11
12
|
|
12
13
|
|
@@ -67,6 +68,12 @@ def load_dictionary(name):
|
|
67
68
|
for line in f:
|
68
69
|
cusip_set.add(line.strip())
|
69
70
|
return cusip_set
|
71
|
+
elif name == "13fhr_information_table_cusips":
|
72
|
+
cusip_set = set()
|
73
|
+
with open(file_path, 'r', encoding='utf-8') as f:
|
74
|
+
for line in f:
|
75
|
+
cusip_set.add(line.strip())
|
76
|
+
return cusip_set
|
70
77
|
elif name == "8k_2024_persons":
|
71
78
|
|
72
79
|
with open(file_path, 'r', encoding='utf-8') as f:
|
@@ -15,7 +15,7 @@ datamule/datamule/datamule_mysql_rds.py,sha256=Q6_h24-SNECWK60RnM6UQjUIp5dhJmfn3
|
|
15
15
|
datamule/datamule/downloader.py,sha256=mVg1SApfij_9-dTpcm_YB26Bxc_Yq1FR8xv2k50MHqU,18579
|
16
16
|
datamule/datamule/sec_connector.py,sha256=VwOaODpHoAWy8JIky6kLR1-orW_PB61RHw7pIGRpkow,3288
|
17
17
|
datamule/document/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
18
|
-
datamule/document/document.py,sha256=
|
18
|
+
datamule/document/document.py,sha256=mpoWmK8K7B92ukXj4WZzFhYOwpoVop5DZYfj2Q-6FE8,24332
|
19
19
|
datamule/document/tables/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
20
20
|
datamule/document/tables/tables.py,sha256=8riSAof6o-Gxoo0SkiQAE61fw8NmzDnEhJe6dATzmvA,4487
|
21
21
|
datamule/document/tables/tables_13fhr.py,sha256=-6tWcaTyNsb0XuW0WMBrYir9Zn1wLZL0laKxRYfPNyg,4265
|
@@ -54,9 +54,9 @@ datamule/tags/regex.py,sha256=Zr1dlnb8OfecDkI2DFCI8DUBr9LI50fapQyBAYNEZrg,4487
|
|
54
54
|
datamule/tags/utils.py,sha256=hQpQBVAJPmys1UKVS2mqc8Z5-qO_zma5ecFXvW9DXoo,5329
|
55
55
|
datamule/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
56
56
|
datamule/utils/construct_submissions_data.py,sha256=NB_hvfxlRXPyt4Fgc-5qA8vJRItkLhBedCSTaxwW7Jg,5887
|
57
|
-
datamule/utils/dictionaries.py,sha256=
|
57
|
+
datamule/utils/dictionaries.py,sha256=1VwzuyDausEsvMIJRa2UD7SvtmlMRHmT_tFeaCY6eXo,3201
|
58
58
|
datamule/utils/format_accession.py,sha256=60RtqoNqoT9zSKVb1DeOv1gncJxzPTFMNW4SNOVmC_g,476
|
59
|
-
datamule-2.2.
|
60
|
-
datamule-2.2.
|
61
|
-
datamule-2.2.
|
62
|
-
datamule-2.2.
|
59
|
+
datamule-2.2.2.dist-info/METADATA,sha256=pVMWNBGvR-KNKCYOvfvcFa95srRzS3j_t-zuW6QiXQk,585
|
60
|
+
datamule-2.2.2.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
|
61
|
+
datamule-2.2.2.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
|
62
|
+
datamule-2.2.2.dist-info/RECORD,,
|
File without changes
|
File without changes
|