datamule 2.1.5__py3-none-any.whl → 2.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamule/document/document.py +171 -45
- datamule/sheet.py +1 -401
- datamule/tags/__init__.py +0 -0
- datamule/tags/config.py +16 -0
- datamule/tags/regex.py +105 -0
- datamule/tags/utils.py +149 -0
- datamule/utils/dictionaries.py +76 -0
- {datamule-2.1.5.dist-info → datamule-2.2.0.dist-info}/METADATA +2 -1
- {datamule-2.1.5.dist-info → datamule-2.2.0.dist-info}/RECORD +11 -6
- {datamule-2.1.5.dist-info → datamule-2.2.0.dist-info}/WHEEL +0 -0
- {datamule-2.1.5.dist-info → datamule-2.2.0.dist-info}/top_level.txt +0 -0
datamule/document/document.py
CHANGED
@@ -13,9 +13,133 @@ from pathlib import Path
 import webbrowser
 from secsgml.utils import bytes_to_str
 import tempfile
-
+import warnings
 from .tables.tables import Tables
 
+from ..tags.utils import get_cusip_using_regex, get_isin_using_regex, get_figi_using_regex,get_all_tickers, get_full_names,get_full_names_dictionary_lookup
+
+
+class Tickers:
+    def __init__(self, document):
+        self.document = document
+        self._tickers_data = None
+
+    def _get_tickers_data(self):
+        """Get all tickers data once and cache it"""
+        if self._tickers_data is None:
+            # Check if document extension is supported
+            if self.document.extension not in ['.htm', '.html', '.txt']:
+                self._tickers_data = {}
+            else:
+                self._tickers_data = get_all_tickers(self.document.text)
+        return self._tickers_data
+
+    def __getattr__(self, exchange_name):
+        data = self._get_tickers_data()
+
+        if exchange_name in data:
+            return data[exchange_name]
+
+        return []
+
+    def __bool__(self):
+        """Return True if any tickers were found"""
+        data = self._get_tickers_data()
+        return bool(data.get('all', []))
+
+    def __repr__(self):
+        """Show the full ticker data when printed or accessed directly"""
+        data = self._get_tickers_data()
+        return str(data)
+
+    def __str__(self):
+        """Show the full ticker data when printed"""
+        data = self._get_tickers_data()
+        return str(data)
+
+class Tags:
+    def __init__(self, document):
+        from ..tags.config import _active_dictionaries,_loaded_dictionaries
+        self.not_supported = document.extension not in ['.htm', '.html', '.txt']
+        self.document = document
+        self._tickers = None
+        self.dictionaries = {}
+
+        # Load global dictionaries with their data
+        active_dicts = _active_dictionaries
+        for dict_name in active_dicts:
+            self.dictionaries[dict_name] = _loaded_dictionaries[dict_name]
+
+
+    def _check_support(self):
+        if self.not_supported:
+            warnings.warn(f"Document extension '{self.document.extension}' is not supported. Supported formats: .htm, .html, .txt")
+            return False
+        return True
+
+    @property
+    def cusips(self):
+        if not self._check_support():
+            return None
+
+        if not hasattr(self, '_cusip'):
+            if 'sc13dg_cusips' in self.dictionaries:
+                keywords = self.dictionaries['sc13dg_cusips']
+                self._cusip = get_cusip_using_regex(self.document.text, keywords)
+            else:
+                self._cusip = get_cusip_using_regex(self.document.text)
+        return self._cusip
+
+    @property
+    def isins(self):
+        if not self._check_support():
+            return None
+
+        if not hasattr(self, '_isin'):
+            if 'npx_isins' in self.dictionaries:
+                keywords = self.dictionaries['npx_isins']
+                self._isin = get_isin_using_regex(self.document.text, keywords)
+            else:
+                self._isin = get_isin_using_regex(self.document.text)
+        return self._isin
+
+    @property
+    def figis(self):
+        if not self._check_support():
+            return None
+
+        if not hasattr(self, '_figi'):
+            if 'npx_figis' in self.dictionaries:
+                keywords = self.dictionaries['npx_figis']
+                self._figi = get_figi_using_regex(self.document.text, keywords)
+            else:
+                self._figi = get_figi_using_regex(self.document.text)
+        return self._figi
+
+    @property
+    def tickers(self):
+        if self._tickers is None:
+            self._tickers = Tickers(self.document)
+        return self._tickers
+
+    @property
+    def persons(self):
+        if not self._check_support():
+            return None
+
+        if not hasattr(self, '_persons'):
+            if '8k_2024_persons' in self.dictionaries:
+                # Use FlashText dictionary lookup for 8K persons
+                self._persons = get_full_names_dictionary_lookup(self.document.text, self.dictionaries['8k_2024_persons'])
+            elif 'ssa_baby_first_names' in self.dictionaries:
+                # Use regex with SSA names for validation
+                self._persons = get_full_names(self.document.text, self.dictionaries['ssa_baby_first_names'])
+            else:
+                # Fallback to regex without validation
+                self._persons = get_full_names(self.document.text)
+        return self._persons
+
+
 class Document:
     def __init__(self, type, content, extension,accession,filing_date,path=None):
 
@@ -34,10 +158,13 @@ class Document:
         self.path = path
 
         self.extension = extension
+
         # this will be filled by parsed
         self._data = None
         self._tables = None
         self._text = None
+
+        self.tags = Tags(self)
 
 
 
@@ -119,93 +246,92 @@ class Document:
 
         if self.extension == '.txt':
             content = self.text
-            if self.type
+            if self.type in ['10-Q', '10-Q/A']:
                 mapping_dict = dict_10q
-            elif self.type
+            elif self.type in ['10-K','10-K/A']:
                 mapping_dict = dict_10k
-            elif self.type
+            elif self.type in ['8-K', '8-K/A']:
                 mapping_dict = dict_8k
-            elif self.type
+            elif self.type in ['SC 13D', 'SC 13D/A']:
                 mapping_dict = dict_13d
-            elif self.type
+            elif self.type in ['SC 13G', 'SC 13G/A']:
                 mapping_dict = dict_13g
 
            self._data = {}
            self._data['document'] = dict2dict(txt2dict(content=content, mapping_dict=mapping_dict))
        elif self.extension in ['.htm', '.html']:
 
-            if self.type
+            if self.type in ['1-K', '1-K/A']:
                 mapping_dict = dict_1kpartii_html
-            elif self.type
+            elif self.type in ['1-SA', '1-SA/A']:
                 mapping_dict = dict_1sa_html
-            elif self.type
+            elif self.type in ['1-U', '1-U/A']:
                 mapping_dict = dict_1u_html
-            elif self.type
+            elif self.type in ['10-12B', '10-12B/A']:
                 mapping_dict = dict_1012b_html
-            elif self.type
+            elif self.type in ['10-D', '10-D/A']:
                 mapping_dict = dict_10d_html
-            elif self.type
+            elif self.type in ['10-K', '10-K/A']:
                 mapping_dict = dict_10k_html
-            elif self.type
+            elif self.type in ['10-Q', '10-Q/A']:
                 mapping_dict = dict_10q_html
-            elif self.type
+            elif self.type in ['20-F', '20-F/A']:
                 mapping_dict = dict_20f_html
-            elif self.type
+            elif self.type in ['8-A12B', '8-A12B/A']:
                 mapping_dict = dict_8a12b_html
-            elif self.type
+            elif self.type in ['8-A12G', '8-A12G/A']:
                 mapping_dict = dict_8a12g_html
-            elif self.type
+            elif self.type in ['8-K', '8-K/A']:
                 mapping_dict = dict_8k_html
-            elif self.type
+            elif self.type in ['8-K12B', '8-K12B/A']:
                 mapping_dict = dict_8k12b_html
-            elif self.type
+            elif self.type in ['8-K12G3', '8-K12G3/A']:
                 mapping_dict = dict_8k12g3_html
-            elif self.type
+            elif self.type in ['8-K15D5', '8-K15D5/A']:
                 mapping_dict = dict_8k15d5_html
-            elif self.type
+            elif self.type in ['ABS-15G', 'ABS-15G/A']:
                 mapping_dict = dict_abs15g_html
-            elif self.type
+            elif self.type in ['ABS-EE', 'ABS-EE/A']:
                 mapping_dict = dict_absee_html
-            elif self.type
-                dict_appntc_html
-            elif self.type
+            elif self.type in ['APP NTC', 'APP NTC/A']:
+                mapping_dict = dict_appntc_html
+            elif self.type in ['CB', 'CB/A']:
                 mapping_dict = dict_cb_html
-            elif self.type
+            elif self.type in ['DSTRBRPT', 'DSTRBRPT/A']:
                 mapping_dict = dict_dstrbrpt_html
-            elif self.type
+            elif self.type in ['N-18F1', 'N-18F1/A']:
                 mapping_dict = dict_n18f1_html
-            elif self.type
+            elif self.type in ['N-CSRS', 'N-CSRS/A']:
                 mapping_dict = dict_ncsrs_html
-            elif self.type
+            elif self.type in ['NT-10K', 'NT-10K/A']:
                 mapping_dict = dict_nt10k_html
-            elif self.type
+            elif self.type in ['NT-10Q', 'NT-10Q/A']:
                 mapping_dict = dict_nt10q_html
-            elif self.type
+            elif self.type in ['NT 20-F', 'NT 20-F/A']:
                 mapping_dict = dict_nt20f_html
-            elif self.type
+            elif self.type in ['NT-NCEN', 'NT-NCEN/A']:
                 mapping_dict = dict_ntncen_html
-            elif self.type
+            elif self.type in ['NT-NCSR', 'NT-NCSR/A']:
                 mapping_dict = dict_ntncsr_html
-            elif self.type
+            elif self.type in ['NTFNCEN', 'NTFNCEN/A']:
                 mapping_dict = dict_ntfcen_html
-            elif self.type
+            elif self.type in ['NTFNCSR', 'NTFNCSR/A']:
                 mapping_dict = dict_ntfncsr_html
-            elif self.type
+            elif self.type in ['EX-99.CERT', 'EX-99.CERT/A']:
                 mapping_dict = dict_ex99cert_html
-            elif self.type
+            elif self.type in ['SC 13E3', 'SC 13E3/A']:
                 mapping_dict = dict_sc13e3_html
-            elif self.type
+            elif self.type in ['SC 14D9', 'SC 14D9/A']:
                 mapping_dict = dict_sc14d9_html
-            elif self.type
+            elif self.type in ['SP 15D2', 'SP 15D2/A']:
                 mapping_dict = dict_sp15d2_html
-
-            elif self.type == 'SD':
+            elif self.type in ['SD', 'SD/A']:
                 mapping_dict = dict_sd_html
-            elif self.type
+            elif self.type in ['S-1', 'S-1/A']:
                 mapping_dict = dict_s1_html
-            elif self.type
+            elif self.type in ['T-3', 'T-3/A']:
                 mapping_dict = dict_t3_html
-            elif self.type in ['NT 10-K', 'NT 10-Q','NT 20-F']:
+            elif self.type in ['NT 10-K', 'NT 10-K/A', 'NT 10-Q', 'NT 10-Q/A', 'NT 20-F', 'NT 20-F/A']:
                 mapping_dict = dict_nt10k_html
 
            dct = html2dict(content=self.content, mapping_dict=mapping_dict)
@@ -233,7 +359,7 @@ class Document:
            self._preprocess_html_content()
        elif self.extension == '.txt':
            self._preprocess_txt_content()
-
+        return self._text
 
     def write_json(self, output_filename=None):
         if not self.data:
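Note: the diff above adds a tagging surface to every Document as `doc.tags`. A minimal usage sketch, assuming a hand-built Document (real code would normally obtain documents from a Portfolio or Submission; the content and accession values here are placeholders, and the content type must match what the text preprocessor expects):

    from datamule.document.document import Document

    # Hypothetical standalone document; all constructor values are placeholders.
    doc = Document(type='8-K', content='<html>...</html>', extension='.htm',
                   accession='0000000000-24-000000', filing_date='2024-01-02')

    print(doc.tags.cusips)           # [(cusip, start, end), ...], or None if unsupported
    print(doc.tags.tickers.nasdaq)   # per-exchange lists via Tickers.__getattr__
    print(bool(doc.tags.tickers))    # True if any exchange pattern matched

Results come from regex scans of `doc.text`, so they depend on how the document content is preprocessed.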
datamule/sheet.py
CHANGED
@@ -7,9 +7,6 @@ from .datamule.datamule_lookup import datamule_lookup
 from .datamule.datamule_mysql_rds import query_mysql_rds
 from company_fundamentals.utils import get_fundamental_mappings
 from company_fundamentals import construct_fundamentals
-# slated for deprecation?
-from .seclibrary.bq import get_information_table, get_345, get_proxy_voting_record
-
 class Sheet:
     def __init__(self, path):
         self.path = Path(path)
@@ -306,401 +303,4 @@ class Sheet:
         if verbose:
             print(f"Saved {len(data)} records to {filepath_obj}")
 
-
-    def download_information_table(
-        self,
-        filepath,
-        # Optional filtering parameters
-        columns=None,
-        name_of_issuer=None,
-        title_of_class=None,
-        cusip=None,
-        value=None,
-        ssh_prnamt=None,
-        ssh_prnamt_type=None,
-        investment_discretion=None,
-        voting_authority_sole=None,
-        voting_authority_shared=None,
-        voting_authority_none=None,
-        reporting_owner_cik=None,
-        put_call=None,
-        other_manager=None,
-        figi=None,
-        accession=None,
-        filing_date=None,
-
-        # API key handling
-        api_key=None,
-
-        # Additional options
-        print_cost=True,
-        verbose=False
-    ):
-        """
-        Query the SEC BigQuery API for 13F-HR information table data and save to CSV.
-
-        Parameters:
-        -----------
-        filepath : str
-            Path where to save the CSV file. If relative, it will be relative to the Sheet's path.
-
-        columns : List[str], optional
-            Specific columns to return. If None, all columns are returned.
-
-        # Filter parameters
-        name_of_issuer, title_of_class, etc. : Various filters that can be:
-            - str: Exact match
-            - List[str]: Match any in list
-            - tuple: (min, max) range for numeric/date fields
-
-        api_key : str, optional
-            SEC BigQuery API key. If None, looks for DATAMULE_API_KEY env variable.
-        print_cost : bool
-            Whether to print the query cost information
-        verbose : bool
-            Whether to print additional information about the query
-
-        Returns:
-        --------
-        List[Dict]
-            A list of dictionaries containing the query results
-
-        Raises:
-        -------
-        ValueError
-            If API key is missing or invalid
-        Exception
-            For API errors or other issues
-        """
-        # Get the data from the API
-        data = self.get_information_table(
-            columns=columns,
-            name_of_issuer=name_of_issuer,
-            title_of_class=title_of_class,
-            cusip=cusip,
-            value=value,
-            ssh_prnamt=ssh_prnamt,
-            ssh_prnamt_type=ssh_prnamt_type,
-            investment_discretion=investment_discretion,
-            voting_authority_sole=voting_authority_sole,
-            voting_authority_shared=voting_authority_shared,
-            voting_authority_none=voting_authority_none,
-            reporting_owner_cik=reporting_owner_cik,
-            put_call=put_call,
-            other_manager=other_manager,
-            figi=figi,
-            accession=accession,
-            filing_date=filing_date,
-            api_key=api_key,
-            print_cost=print_cost,
-            verbose=verbose
-        )
-
-        # Save to CSV using the helper method
-        return self._download_to_csv(data, filepath, verbose)
-
-    def download_345(
-        self,
-        filepath,
-        # Optional filtering parameters
-        columns=None,
-        is_derivative=None,
-        is_non_derivative=None,
-        security_title=None,
-        transaction_date=None,
-        document_type=None,
-        transaction_code=None,
-        equity_swap_involved=None,
-        transaction_timeliness=None,
-        transaction_shares=None,
-        transaction_price_per_share=None,
-        shares_owned_following_transaction=None,
-        ownership_type=None,
-        deemed_execution_date=None,
-        conversion_or_exercise_price=None,
-        exercise_date=None,
-        expiration_date=None,
-        underlying_security_title=None,
-        underlying_security_shares=None,
-        underlying_security_value=None,
-        accession=None,
-        reporting_owner_cik=None,
-        issuer_cik=None,
-        filing_date=None,
-
-        # API key handling
-        api_key=None,
-
-        # Additional options
-        print_cost=True,
-        verbose=False
-    ):
-        """
-        Query the SEC BigQuery API for Form 345 insider transaction data and save to CSV.
-
-        Parameters:
-        -----------
-        filepath : str
-            Path where to save the CSV file. If relative, it will be relative to the Sheet's path.
-
-        columns : List[str], optional
-            Specific columns to return. If None, all columns are returned.
-
-        # Filter parameters
-        is_derivative, security_title, etc. : Various filters that can be:
-            - str/bool: Exact match
-            - List[str]: Match any in list
-            - tuple: (min, max) range for numeric/date fields
-
-        reporting_owner_cik : str or List[str]
-            CIK(s) of the reporting insider(s). This is matched against an array in BigQuery.
-            Any match within the array will return the record.
-
-        issuer_cik : str or List[str]
-            CIK(s) of the company/companies
-
-        api_key : str, optional
-            SEC BigQuery API key. If None, looks for DATAMULE_API_KEY env variable.
-        print_cost : bool
-            Whether to print the query cost information
-        verbose : bool
-            Whether to print additional information about the query
-
-        Returns:
-        --------
-        List[Dict]
-            A list of dictionaries containing the query results
-
-        Raises:
-        -------
-        ValueError
-            If API key is missing or invalid
-        Exception
-            For API errors or other issues
-        """
-        # Get the data from the API
-        data = self.get_345(
-            columns=columns,
-            is_derivative=is_derivative,
-            is_non_derivative=is_non_derivative,
-            security_title=security_title,
-            transaction_date=transaction_date,
-            document_type=document_type,
-            transaction_code=transaction_code,
-            equity_swap_involved=equity_swap_involved,
-            transaction_timeliness=transaction_timeliness,
-            transaction_shares=transaction_shares,
-            transaction_price_per_share=transaction_price_per_share,
-            shares_owned_following_transaction=shares_owned_following_transaction,
-            ownership_type=ownership_type,
-            deemed_execution_date=deemed_execution_date,
-            conversion_or_exercise_price=conversion_or_exercise_price,
-            exercise_date=exercise_date,
-            expiration_date=expiration_date,
-            underlying_security_title=underlying_security_title,
-            underlying_security_shares=underlying_security_shares,
-            underlying_security_value=underlying_security_value,
-            accession=accession,
-            reporting_owner_cik=reporting_owner_cik,
-            issuer_cik=issuer_cik,
-            filing_date=filing_date,
-            api_key=api_key,
-            print_cost=print_cost,
-            verbose=verbose
-        )
-
-        # Save to CSV using the helper method
-        return self._download_to_csv(data, filepath, verbose)
-
-    def get_proxy_voting_record(
-        self,
-        # Optional filtering parameters
-        columns=None,
-        meeting_date=None,
-        isin=None,
-        cusip=None,
-        issuer_name=None,
-        vote_description=None,
-        shares_on_loan=None,
-        shares_voted=None,
-        vote_category=None,
-        vote_record=None,
-        vote_source=None,
-        how_voted=None,
-        figi=None,
-        management_recommendation=None,
-        accession=None,
-        reporting_owner_cik=None,
-        filing_date=None,
-
-        # API key handling
-        api_key=None,
-
-        # Additional options
-        print_cost=True,
-        verbose=False
-    ):
-        """
-        Query the SEC BigQuery API for NPX proxy voting record data.
-
-        Parameters:
-        -----------
-        columns : List[str], optional
-            Specific columns to return. If None, all columns are returned.
-
-        # Filter parameters
-        meeting_date, isin, cusip, etc. : Various filters that can be:
-            - str: Exact match
-            - List[str]: Match any in list
-            - tuple: (min, max) range for numeric/date fields
-
-        shares_on_loan, shares_voted : int/float or tuple
-            Numeric values or (min, max) range
-
-        filing_date : str or tuple
-            Date string in 'YYYY-MM-DD' format or (start_date, end_date) tuple
-
-        api_key : str, optional
-            SEC BigQuery API key. If None, looks for DATAMULE_API_KEY env variable.
-        print_cost : bool
-            Whether to print the query cost information
-        verbose : bool
-            Whether to print additional information about the query
-
-        Returns:
-        --------
-        List[Dict]
-            A list of dictionaries containing the query results
-
-        Raises:
-        -------
-        ValueError
-            If API key is missing or invalid
-        Exception
-            For API errors or other issues
-        """
-
-        return get_proxy_voting_record(
-            columns=columns,
-            meeting_date=meeting_date,
-            isin=isin,
-            cusip=cusip,
-            issuer_name=issuer_name,
-            vote_description=vote_description,
-            shares_on_loan=shares_on_loan,
-            shares_voted=shares_voted,
-            vote_category=vote_category,
-            vote_record=vote_record,
-            vote_source=vote_source,
-            how_voted=how_voted,
-            figi=figi,
-            management_recommendation=management_recommendation,
-            accession=accession,
-            reporting_owner_cik=reporting_owner_cik,
-            filing_date=filing_date,
-
-            # API key handling
-            api_key=api_key,
-
-            # Additional options
-            print_cost=print_cost,
-            verbose=verbose
-        )
-
-    def download_proxy_voting_record(
-        self,
-        filepath,
-        # Optional filtering parameters
-        columns=None,
-        meeting_date=None,
-        isin=None,
-        cusip=None,
-        issuer_name=None,
-        vote_description=None,
-        shares_on_loan=None,
-        shares_voted=None,
-        vote_category=None,
-        vote_record=None,
-        vote_source=None,
-        how_voted=None,
-        figi=None,
-        management_recommendation=None,
-        accession=None,
-        reporting_owner_cik=None,
-        filing_date=None,
-
-        # API key handling
-        api_key=None,
-
-        # Additional options
-        print_cost=True,
-        verbose=False
-    ):
-        """
-        Query the SEC BigQuery API for NPX proxy voting record data and save to CSV.
-
-        Parameters:
-        -----------
-        filepath : str
-            Path where to save the CSV file. If relative, it will be relative to the Sheet's path.
-
-        columns : List[str], optional
-            Specific columns to return. If None, all columns are returned.
-
-        # Filter parameters
-        meeting_date, isin, cusip, etc. : Various filters that can be:
-            - str: Exact match
-            - List[str]: Match any in list
-            - tuple: (min, max) range for numeric/date fields
-
-        shares_on_loan, shares_voted : int/float or tuple
-            Numeric values or (min, max) range
-
-        filing_date : str or tuple
-            Date string in 'YYYY-MM-DD' format or (start_date, end_date) tuple
-
-        api_key : str, optional
-            SEC BigQuery API key. If None, looks for DATAMULE_API_KEY env variable.
-        print_cost : bool
-            Whether to print the query cost information
-        verbose : bool
-            Whether to print additional information about the query
-
-        Returns:
-        --------
-        List[Dict]
-            A list of dictionaries containing the query results
-
-        Raises:
-        -------
-        ValueError
-            If API key is missing or invalid
-        Exception
-            For API errors or other issues
-        """
-        # Get the data from the API
-        data = self.get_proxy_voting_record(
-            columns=columns,
-            meeting_date=meeting_date,
-            isin=isin,
-            cusip=cusip,
-            issuer_name=issuer_name,
-            vote_description=vote_description,
-            shares_on_loan=shares_on_loan,
-            shares_voted=shares_voted,
-            vote_category=vote_category,
-            vote_record=vote_record,
-            vote_source=vote_source,
-            how_voted=how_voted,
-            figi=figi,
-            management_recommendation=management_recommendation,
-            accession=accession,
-            reporting_owner_cik=reporting_owner_cik,
-            filing_date=filing_date,
-            api_key=api_key,
-            print_cost=print_cost,
-            verbose=verbose
-        )
-
-        # Save to CSV using the helper method
-        return self._download_to_csv(data, filepath, verbose)
+
datamule/tags/__init__.py
File without changes
datamule/tags/config.py
ADDED
@@ -0,0 +1,16 @@
+from ..utils.dictionaries import download_dictionary, load_dictionary
+
+_active_dictionaries = []
+_loaded_dictionaries = {}
+
+def set_dictionaries(dictionaries, overwrite=False):
+    """Set active dictionaries and load them into memory"""
+    global _active_dictionaries, _loaded_dictionaries
+    _active_dictionaries = dictionaries
+    _loaded_dictionaries = {}
+
+    for dict_name in dictionaries:
+        # Download if needed
+        download_dictionary(dict_name, overwrite=overwrite)
+        # Load into memory
+        _loaded_dictionaries[dict_name] = load_dictionary(dict_name)
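Usage sketch: `set_dictionaries` is the module's entry point, and the names must be keys of the `urls` mapping in datamule/utils/dictionaries.py (added below). Tags objects created afterwards pick up the loaded data:

    from datamule.tags.config import set_dictionaries

    # Downloads any missing dictionary files, then loads each into the
    # module-level _loaded_dictionaries registry.
    set_dictionaries(['ssa_baby_first_names', 'sc13dg_cusips'])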
datamule/tags/regex.py
ADDED
@@ -0,0 +1,105 @@
+# Exchange ticker regexes with word boundaries
+nyse_regex = r"\b([A-Z]{1,4})(\.[A-Z]+)?\b"
+nasdaq_regex = r"\b([A-Z]{1,5})(\.[A-Z]+)?\b"
+nyse_american_regex = r"\b([A-Z]{1,5})(\.[A-Z]+)?\b"
+london_stock_exchange_regex = r"\b([A-Z]{3,4})(\.[A-Z]+)?\b"
+toronto_stock_exchange_regex = r"\b([A-Z]{1,5})(\.[A-Z]+)?\b"
+euronext_paris_regex = r"\b([A-Z]{2,12})(\.[A-Z]+)?\b"
+euronext_amsterdam_regex = r"\b([A-Z]{1,5})(\.[A-Z]+)?\b"
+euronext_brussels_regex = r"\b([A-Z]{1,5})(\.[A-Z]+)?\b"
+euronext_lisbon_regex = r"\b([A-Z]{3,5})(\.[A-Z]+)?\b"
+euronext_milan_regex = r"\b([A-Z]{2,5})(\.[A-Z]+)?\b"
+deutsche_borse_xetra_regex = r"\b([A-Z0-9]{3,6})(\.[A-Z]+)?\b"
+six_swiss_exchange_regex = r"\b([A-Z]{2,6})(\.[A-Z]+)?\b"
+tokyo_stock_exchange_regex = r"\b(\d{4})\b"
+hong_kong_stock_exchange_regex = r"\b(\d{4,5})\b"
+shanghai_stock_exchange_regex = r"\b(6\d{5})\b"
+shenzhen_stock_exchange_regex = r"\b([03]\d{5})\b"
+australian_securities_exchange_regex = r"\b([A-Z]{3})(\.[A-Z]+)?\b"
+singapore_exchange_regex = r"\b([A-Z]\d{2}[A-Z]?)(\.[A-Z]+)?\b"
+nse_bse_regex = r"\b([A-Z&]{1,10})(\.[A-Z]+)?\b"
+sao_paulo_b3_regex = r"\b([A-Z]{4}\d{1,2})(\.[A-Z]+)?\b"
+mexico_bmv_regex = r"\b([A-Z*]{1,7})(\.[A-Z]+)?\b"
+korea_exchange_regex = r"\b(\d{6})\b"
+taiwan_stock_exchange_regex = r"\b(\d{4})\b"
+johannesburg_stock_exchange_regex = r"\b([A-Z]{3})(\.[A-Z]+)?\b"
+tel_aviv_stock_exchange_regex = r"\b([A-Z]{4})(\.[A-Z]+)?\b"
+moscow_exchange_regex = r"\b([A-Z]{4})(\.[A-Z]+)?\b"
+istanbul_stock_exchange_regex = r"\b([A-Z]{5})(\.[A-Z]+)?\b"
+nasdaq_stockholm_regex = r"\b([A-Z]{3,4})( [A-Z])?(\.[A-Z]+)?\b"
+oslo_bors_regex = r"\b([A-Z]{3,5})(\.[A-Z]+)?\b"
+otc_markets_us_regex = r"\b([A-Z]{4,5})[FY]?(\.[A-Z]+)?\b"
+pink_sheets_regex = r"\b([A-Z]{4,5})(\.[A-Z]+)?\b"
+
+ticker_regex_list = [
+    ("nyse", r"\b([A-Z]{1,4})(\.[A-Z]+)?\b"),
+    ("nasdaq", r"\b([A-Z]{1,5})(\.[A-Z]+)?\b"),
+    ("nyse_american", r"\b([A-Z]{1,5})(\.[A-Z]+)?\b"),
+    ("london_stock_exchange", r"\b([A-Z]{3,4})(\.[A-Z]+)?\b"),
+    ("toronto_stock_exchange", r"\b([A-Z]{1,5})(\.[A-Z]+)?\b"),
+    ("euronext_paris", r"\b([A-Z]{2,12})(\.[A-Z]+)?\b"),
+    ("euronext_amsterdam", r"\b([A-Z]{1,5})(\.[A-Z]+)?\b"),
+    ("euronext_brussels", r"\b([A-Z]{1,5})(\.[A-Z]+)?\b"),
+    ("euronext_lisbon", r"\b([A-Z]{3,5})(\.[A-Z]+)?\b"),
+    ("euronext_milan", r"\b([A-Z]{2,5})(\.[A-Z]+)?\b"),
+    ("deutsche_borse_xetra", r"\b([A-Z0-9]{3,6})(\.[A-Z]+)?\b"),
+    ("six_swiss_exchange", r"\b([A-Z]{2,6})(\.[A-Z]+)?\b"),
+    ("tokyo_stock_exchange", r"\b(\d{4})\b"),
+    ("hong_kong_stock_exchange", r"\b(\d{4,5})\b"),
+    ("shanghai_stock_exchange", r"\b(6\d{5})\b"),
+    ("shenzhen_stock_exchange", r"\b([03]\d{5})\b"),
+    ("australian_securities_exchange", r"\b([A-Z]{3})(\.[A-Z]+)?\b"),
+    ("singapore_exchange", r"\b([A-Z]\d{2}[A-Z]?)(\.[A-Z]+)?\b"),
+    ("nse_bse", r"\b([A-Z&]{1,10})(\.[A-Z]+)?\b"),
+    ("sao_paulo_b3", r"\b([A-Z]{4}\d{1,2})(\.[A-Z]+)?\b"),
+    ("mexico_bmv", r"\b([A-Z*]{1,7})(\.[A-Z]+)?\b"),
+    ("korea_exchange", r"\b(\d{6})\b"),
+    ("taiwan_stock_exchange", r"\b(\d{4})\b"),
+    ("johannesburg_stock_exchange", r"\b([A-Z]{3})(\.[A-Z]+)?\b"),
+    ("tel_aviv_stock_exchange", r"\b([A-Z]{4})(\.[A-Z]+)?\b"),
+    ("moscow_exchange", r"\b([A-Z]{4})(\.[A-Z]+)?\b"),
+    ("istanbul_stock_exchange", r"\b([A-Z]{5})(\.[A-Z]+)?\b"),
+    ("nasdaq_stockholm", r"\b([A-Z]{3,4})( [A-Z])?(\.[A-Z]+)?\b"),
+    ("oslo_bors", r"\b([A-Z]{3,5})(\.[A-Z]+)?\b"),
+    ("otc_markets_us", r"\b([A-Z]{4,5})[FY]?(\.[A-Z]+)?\b"),
+    ("pink_sheets", r"\b([A-Z]{4,5})(\.[A-Z]+)?\b"),
+]
+# Security identifier regexes with word boundaries
+cusip_regex = r"\b[0-9A-Z]{8}[0-9]\b"
+isin_regex = r"\b[A-Z]{2}[0-9A-Z]{9}[0-9]\b"
+figi_regex = r"\b[A-Z]{2}G[A-Z0-9]{8}[0-9]\b"
+
+particles = {
+    # Dutch - single words only
+    'van', 'der', 'den', 'de',
+
+    # German - single words only
+    'von', 'zu', 'vom', 'zur', 'zum',
+
+    # Spanish - single words only
+    'de', 'del', 'y',
+
+    # Portuguese - single words only
+    'da', 'das', 'do', 'dos', 'e',
+
+    # French - single words only
+    'de', 'du', 'des', 'le', 'la', 'les', "d'",
+
+    # Italian - single words only
+    'da', 'di', 'del', 'della', 'delle', 'dei', 'degli', 'dello',
+
+    # Irish/Scottish
+    'mac', 'mc', 'o',
+
+    # Arabic
+    'al', 'el', 'ibn', 'bin', 'bint', 'abu',
+
+    # Other European
+    'af', 'av',  # Scandinavian
+    'ter',  # Dutch/Flemish
+    'op',  # Dutch
+    'aan',  # Dutch
+    'ten',  # Dutch
+    'het',  # Dutch
+    'in',  # Dutch
+}
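A quick self-contained check of the identifier patterns (Apple's published CUSIP and ISIN used as sample values):

    import re
    from datamule.tags.regex import cusip_regex, isin_regex

    text = "Apple Inc. common stock, CUSIP 037833100, ISIN US0378331005."
    print(re.findall(cusip_regex, text))  # ['037833100']
    print(re.findall(isin_regex, text))   # ['US0378331005']

Note the exchange ticker patterns are deliberately loose (e.g. any run of 1-5 capital letters for Nasdaq), so they will also match ordinary uppercase words; the Tickers class reports matches per exchange rather than validating them.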
datamule/tags/utils.py
ADDED
@@ -0,0 +1,149 @@
+import re
+from .regex import cusip_regex, isin_regex, figi_regex, ticker_regex_list
+from .regex import particles
+from flashtext import KeywordProcessor
+
+def get_cusip_using_regex(text,keywords=None):
+    matches = []
+    for match in re.finditer(cusip_regex, text):
+        if keywords is not None:
+            if match.group() in keywords:
+                matches.append((match.group(), match.start(), match.end()))
+        else:
+            matches.append((match.group(), match.start(), match.end()))
+    return matches
+
+def get_isin_using_regex(text,keywords=None):
+    matches = []
+    for match in re.finditer(isin_regex, text):
+        if keywords is not None:
+            if match.group() in keywords:
+                matches.append((match.group(), match.start(), match.end()))
+        else:
+            matches.append((match.group(), match.start(), match.end()))
+    return matches
+
+def get_figi_using_regex(text,keywords=None):
+    matches = []
+    for match in re.finditer(figi_regex, text):
+        if keywords is not None:
+            if match.group() in keywords:
+                matches.append((match.group(), match.start(), match.end()))
+        else:
+            matches.append((match.group(), match.start(), match.end()))
+    return matches
+
+def get_tickers_using_regex(text, regex_pattern):
+    """Extract tickers using the given regex pattern with position information"""
+    matches = []
+    for match in re.finditer(regex_pattern, text):
+        # Handle tuples from regex groups - take the first capture group
+        if match.groups():
+            ticker = match.group(1) if match.group(1) else match.group(0)
+        else:
+            ticker = match.group(0)
+        matches.append((ticker, match.start(), match.end()))
+    return matches
+
+def get_all_tickers(text):
+    """Get all tickers from all exchanges organized by exchange with position info"""
+    result = {}
+    all_tickers = []
+
+    for exchange_name, regex_pattern in ticker_regex_list:
+        tickers = get_tickers_using_regex(text, regex_pattern)
+        result[exchange_name] = tickers
+        all_tickers.extend(tickers)
+
+    # Remove duplicates while preserving order for 'all'
+    # Keep track of seen ticker values (first element of tuple)
+    seen = set()
+    result['all'] = [x for x in all_tickers if not (x[0] in seen or seen.add(x[0]))]
+
+    return result
+
+def get_ticker_regex_dict():
+    """Return ticker regex list as a dictionary for easy lookup"""
+    return dict(ticker_regex_list)
+
+# will change in future to accomodate other datasets
+def validate_full_name(full_name,keywords):
+    if len(full_name) == 1:
+        return False
+    # check all is upper
+    if all(word.isupper() for word in full_name):
+        return False
+    # check if any number in word
+    if any(any(char.isdigit() for char in word) for word in full_name):
+        return False
+    if any(any(char in ".,;:!?()[]" for char in word) for word in full_name):
+        return False
+
+    # add optional set lookups
+    if keywords is not None:
+        # return false if first word is not in keywords set
+        if full_name[0] not in keywords:
+            return False
+
+
+    return True
+
+def get_full_names(text,keywords=None):
+    words = text.split()
+    full_names = []
+    current_pos = None
+    word_start_positions = []
+
+    # Calculate word positions in the original text
+    pos = 0
+    for word in words:
+        start = text.find(word, pos)
+        word_start_positions.append(start)
+        pos = start + len(word)
+
+    for idx, word in enumerate(words):
+        if current_pos is None:
+            if word[0].isupper():
+                current_pos = idx
+        else:
+            if word[0].isupper() or word.lower() in particles:
+                continue
+            else:
+                full_name = words[current_pos:idx]
+                if validate_full_name(full_name,keywords):
+                    name_text = ' '.join(full_name)
+                    start_pos = word_start_positions[current_pos]
+                    # Calculate end position of the last word in the name
+                    last_word_idx = idx - 1
+                    end_pos = word_start_positions[last_word_idx] + len(words[last_word_idx])
+                    full_names.append((name_text, start_pos, end_pos))
+
+                current_pos = None
+
+    # handle last case - if we're still tracking a name when we reach the end
+    if current_pos is not None:
+        full_name = words[current_pos:]
+        if validate_full_name(full_name,keywords):
+            name_text = ' '.join(full_name)
+            start_pos = word_start_positions[current_pos]
+            # Calculate end position of the last word
+            last_word_idx = len(words) - 1
+            end_pos = word_start_positions[last_word_idx] + len(words[last_word_idx])
+            full_names.append((name_text, start_pos, end_pos))
+
+    return full_names
+
+# add dictionary lookup based on precomputed lists
+def get_full_names_dictionary_lookup(text, dictionary):
+    keyword_processor = KeywordProcessor(case_sensitive=True)
+
+    for key in dictionary.keys():
+        keyword_processor.add_keyword(key, key)
+
+    matches = []
+    keywords_found = keyword_processor.extract_keywords(text, span_info=True)
+
+    for keyword, start_pos, end_pos in keywords_found:
+        matches.append((keyword, start_pos, end_pos))
+
+    return matches
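Behavior sketch for the name extractor: it scans for runs of capitalized words, allows lowercase particles ('van', 'der', ...) inside a run, and returns (name, start, end) tuples with character offsets:

    from datamule.tags.utils import get_full_names

    text = "Directors John van der Berg and Mary O'Connor attended."
    # Without a keywords set, any capitalized run that passes
    # validate_full_name is returned, offsets included.
    print(get_full_names(text))

Note the first hit here would include the preceding capitalized word ("Directors John van der Berg"); supplying the SSA first-name set as keywords rejects such runs, since full_name[0] must then be a known first name. That is why the precomputed 8-K person list or SSA dictionary is preferred when loaded.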
datamule/utils/dictionaries.py
ADDED
@@ -0,0 +1,76 @@
+from pathlib import Path
+import urllib.request
+import json
+urls = {
+    "ssa_baby_first_names": "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/ssa_baby_first_names.txt",
+    "npx_figis" : "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/npx_figis.txt",
+    "npx_isins" : "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/npx_isins.txt",
+    "sc13dg_cusips" : "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/sc13dg_cusips.txt",
+    "8k_2024_persons" : "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/8k_2024_persons.json"
+}
+
+
+def download_dictionary(name,overwrite=False):
+    url = urls[name]
+
+    # Create dictionaries directory in datamule folder
+    dict_dir = Path.home() / ".datamule" / "dictionaries"
+    dict_dir.mkdir(parents=True, exist_ok=True)
+
+    # check if file exists first
+    if not overwrite:
+        filename = url.split('/')[-1]
+        file_path = dict_dir / filename
+        if file_path.exists():
+            return
+
+    # Extract filename from URL
+    filename = url.split('/')[-1]
+    file_path = dict_dir / filename
+
+    print(f"Downloading {name} dictionary to {file_path}")
+    urllib.request.urlretrieve(url, file_path)
+    return
+
+def load_dictionary(name):
+    # Get or download the dictionary file
+    dict_dir = Path.home() / ".datamule" / "dictionaries"
+    filename = urls[name].split('/')[-1]
+    file_path = dict_dir / filename
+
+    # Download if doesn't exist
+    if not file_path.exists():
+        download_dictionary(name)
+
+    # Load the dictionary based on name
+    if name == "ssa_baby_first_names":
+        names_set = set()
+        with open(file_path, 'r', encoding='utf-8') as f:
+            for line in f:
+                names_set.add(line.strip())
+        return names_set
+    elif name == "npx_figis":
+        figi_set = set()
+        with open(file_path, 'r', encoding='utf-8') as f:
+            for line in f:
+                figi_set.add(line.strip())
+        return figi_set
+    elif name == "npx_isins":
+        isin_set = set()
+        with open(file_path, 'r', encoding='utf-8') as f:
+            for line in f:
+                isin_set.add(line.strip())
+        return isin_set
+    elif name == "sc13dg_cusips":
+        cusip_set = set()
+        with open(file_path, 'r', encoding='utf-8') as f:
+            for line in f:
+                cusip_set.add(line.strip())
+        return cusip_set
+    elif name == "8k_2024_persons":
+
+        with open(file_path, 'r', encoding='utf-8') as f:
+            persons_list = json.load(f)
+        return persons_list
+    else:
+        raise ValueError("dictionary not found")
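Usage sketch: the first call downloads into ~/.datamule/dictionaries, later calls read the cached file (network access needed only once):

    from datamule.utils.dictionaries import load_dictionary

    # Returns a set of CUSIP strings for "sc13dg_cusips";
    # "8k_2024_persons" returns the parsed JSON instead.
    cusips = load_dictionary("sc13dg_cusips")
    print(len(cusips))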
{datamule-2.1.5.dist-info → datamule-2.2.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 2.1.5
+Version: 2.2.0
 Summary: Work with SEC submissions at scale.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman
@@ -19,4 +19,5 @@ Requires-Dist: secxbrl
 Requires-Dist: secsgml
 Requires-Dist: websocket-client
 Requires-Dist: company-fundamentals
+Requires-Dist: flashtext
 
{datamule-2.1.5.dist-info → datamule-2.2.0.dist-info}/RECORD
CHANGED
@@ -6,7 +6,7 @@ datamule/index.py,sha256=Rrcna9FJV-Oh_K6O2IuUEIDmtay_7UZ4l4jgKCi7A7I,2079
 datamule/package_updater.py,sha256=Z9zaa_y0Z5cknpRn8oPea3gg4kquFHfpfhduKKCZ6NU,958
 datamule/portfolio.py,sha256=0-E1ZSEjJ8hba7HxF8oCrRneNuF_KKISOY6K4dRg0Cg,12282
 datamule/portfolio_compression_utils.py,sha256=8OPYEN5zAdV1FiTxgVN3S7cTKs99Elv74bwgoIJP4QY,12654
-datamule/sheet.py,sha256=
+datamule/sheet.py,sha256=KD7yAgSB8BE-Z4GDuH58IV-2DJ673nMcEsrCyJbeYp8,10707
 datamule/submission.py,sha256=TdQDfFjOKXy2qAZcD6hc9kjDSxmuZLqk8WRhtMjjC-g,15822
 datamule/data/listed_filer_metadata.csv,sha256=dT9fQ8AC5P1-Udf_UF0ZkdXJ88jNxJb_tuhi5YYL1rc,2426827
 datamule/datamule/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -15,7 +15,7 @@ datamule/datamule/datamule_mysql_rds.py,sha256=Q6_h24-SNECWK60RnM6UQjUIp5dhJmfn3
 datamule/datamule/downloader.py,sha256=mVg1SApfij_9-dTpcm_YB26Bxc_Yq1FR8xv2k50MHqU,18579
 datamule/datamule/sec_connector.py,sha256=VwOaODpHoAWy8JIky6kLR1-orW_PB61RHw7pIGRpkow,3288
 datamule/document/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datamule/document/document.py,sha256=
+datamule/document/document.py,sha256=QjncYOIdf0Zf_0AONEOXu2KlPxMksGZzvwmHOpbM5N8,20450
 datamule/document/tables/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/document/tables/tables.py,sha256=8riSAof6o-Gxoo0SkiQAE61fw8NmzDnEhJe6dATzmvA,4487
 datamule/document/tables/tables_13fhr.py,sha256=-6tWcaTyNsb0XuW0WMBrYir9Zn1wLZL0laKxRYfPNyg,4265
@@ -48,10 +48,15 @@ datamule/sec/xbrl/streamcompanyfacts.py,sha256=Qq88PqW5_j1k3Aqrl0KRmKeF54D6Wbb6H
 datamule/sec/xbrl/xbrlmonitor.py,sha256=TKFVfSyyUUfUgFQw4WxEVs4g8Nh-2C0tygNIRmTqW3Y,5848
 datamule/seclibrary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/seclibrary/bq.py,sha256=C8sb_rpXTvchprrFLcbRar4Qi0XWW25tnv1YsHSS5o4,18025
+datamule/tags/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datamule/tags/config.py,sha256=JVdIkqu9rBEAadNLP-FiIbZ35TRORGIDCJvqDh0CuqE,585
+datamule/tags/regex.py,sha256=Zr1dlnb8OfecDkI2DFCI8DUBr9LI50fapQyBAYNEZrg,4487
+datamule/tags/utils.py,sha256=k5fyjMjJNh6gZjj491sw_9rnMqYIlHHDBathkDcHD0A,5423
 datamule/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/utils/construct_submissions_data.py,sha256=NB_hvfxlRXPyt4Fgc-5qA8vJRItkLhBedCSTaxwW7Jg,5887
+datamule/utils/dictionaries.py,sha256=VImvQWlP8IohB76rDd83bZcT184LBOpOaXPOH46fA6Y,2795
 datamule/utils/format_accession.py,sha256=60RtqoNqoT9zSKVb1DeOv1gncJxzPTFMNW4SNOVmC_g,476
-datamule-2.1.5.dist-info/METADATA,sha256=
-datamule-2.1.5.dist-info/WHEEL,sha256=
-datamule-2.1.5.dist-info/top_level.txt,sha256=
-datamule-2.1.5.dist-info/RECORD,,
+datamule-2.2.0.dist-info/METADATA,sha256=fuT_ABK8D6LhEi1_TjtVnIKobXdafBPiMSGy3XCWyRo,585
+datamule-2.2.0.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+datamule-2.2.0.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
+datamule-2.2.0.dist-info/RECORD,,
{datamule-2.1.5.dist-info → datamule-2.2.0.dist-info}/WHEEL
File without changes
{datamule-2.1.5.dist-info → datamule-2.2.0.dist-info}/top_level.txt
File without changes