datamule-2.1.6.tar.gz → datamule-2.2.1.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datamule-2.1.6 → datamule-2.2.1}/PKG-INFO +2 -1
- {datamule-2.1.6 → datamule-2.2.1}/datamule/document/document.py +132 -1
- datamule-2.2.1/datamule/tags/config.py +33 -0
- datamule-2.2.1/datamule/tags/regex.py +105 -0
- datamule-2.2.1/datamule/tags/utils.py +145 -0
- datamule-2.2.1/datamule/utils/__init__.py +0 -0
- datamule-2.2.1/datamule/utils/dictionaries.py +76 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule.egg-info/PKG-INFO +2 -1
- {datamule-2.1.6 → datamule-2.2.1}/datamule.egg-info/SOURCES.txt +5 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule.egg-info/requires.txt +1 -0
- {datamule-2.1.6 → datamule-2.2.1}/setup.py +3 -2
- {datamule-2.1.6 → datamule-2.2.1}/datamule/__init__.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/config.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/data/listed_filer_metadata.csv +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/datamule/__init__.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/datamule/datamule_lookup.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/datamule/datamule_mysql_rds.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/datamule/downloader.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/datamule/sec_connector.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/datasets.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/document/__init__.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/document/tables/__init__.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/document/tables/tables.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/document/tables/tables_13fhr.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/document/tables/tables_25nse.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/document/tables/tables_informationtable.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/document/tables/tables_npx.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/document/tables/tables_ownership.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/document/tables/tables_proxyvotingrecord.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/document/tables/tables_sbsef.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/document/tables/tables_sdr.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/document/tables/utils.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/helper.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/index.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/mapping_dicts/__init__.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/mapping_dicts/html_mapping_dicts.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/mapping_dicts/txt_mapping_dicts.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/mapping_dicts/xml_mapping_dicts.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/package_updater.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/portfolio.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/portfolio_compression_utils.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/sec/__init__.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/sec/infrastructure/__init__.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/sec/infrastructure/submissions_metadata.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/sec/submissions/__init__.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/sec/submissions/downloader.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/sec/submissions/eftsquery.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/sec/submissions/monitor.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/sec/submissions/streamer.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/sec/submissions/textsearch.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/sec/utils.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/sec/xbrl/__init__.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/sec/xbrl/downloadcompanyfacts.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/sec/xbrl/filter_xbrl.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/sec/xbrl/streamcompanyfacts.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/sec/xbrl/xbrlmonitor.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/seclibrary/__init__.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/seclibrary/bq.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/sheet.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/submission.py +0 -0
- {datamule-2.1.6/datamule/utils → datamule-2.2.1/datamule/tags}/__init__.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/utils/construct_submissions_data.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/utils/format_accession.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule.egg-info/dependency_links.txt +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule.egg-info/top_level.txt +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/setup.cfg +0 -0
```diff
--- datamule-2.1.6/PKG-INFO
+++ datamule-2.2.1/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 2.1.6
+Version: 2.2.1
 Summary: Work with SEC submissions at scale.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman
@@ -19,3 +19,4 @@ Requires-Dist: secxbrl
 Requires-Dist: secsgml
 Requires-Dist: websocket-client
 Requires-Dist: company_fundamentals
+Requires-Dist: flashtext
```
```diff
--- datamule-2.1.6/datamule/document/document.py
+++ datamule-2.2.1/datamule/document/document.py
@@ -13,9 +13,137 @@ from pathlib import Path
 import webbrowser
 from secsgml.utils import bytes_to_str
 import tempfile
-
+import warnings
 from .tables.tables import Tables
 
+from ..tags.utils import get_cusip_using_regex, get_isin_using_regex, get_figi_using_regex, get_all_tickers, get_full_names, get_full_names_dictionary_lookup
+
+
+class Tickers:
+    def __init__(self, document):
+        self.document = document
+        self._tickers_data = None
+
+    def _get_tickers_data(self):
+        """Get all tickers data once and cache it"""
+        if self._tickers_data is None:
+            # Check if document extension is supported
+            if self.document.extension not in ['.htm', '.html', '.txt']:
+                self._tickers_data = {}
+            else:
+                self._tickers_data = get_all_tickers(self.document.text)
+        return self._tickers_data
+
+    def __getattr__(self, exchange_name):
+        data = self._get_tickers_data()
+
+        if exchange_name in data:
+            return data[exchange_name]
+
+        return []
+
+    def __bool__(self):
+        """Return True if any tickers were found"""
+        data = self._get_tickers_data()
+        return bool(data.get('all', []))
+
+    def __repr__(self):
+        """Show the full ticker data when printed or accessed directly"""
+        data = self._get_tickers_data()
+        return str(data)
+
+    def __str__(self):
+        """Show the full ticker data when printed"""
+        data = self._get_tickers_data()
+        return str(data)
+
+class Tags:
+    def __init__(self, document):
+        from ..tags.config import _active_dictionaries, _loaded_dictionaries
+        self.not_supported = document.extension not in ['.htm', '.html', '.txt']
+        self.document = document
+        self._tickers = None
+        self.dictionaries = {}
+        self.processors = {}
+
+        # Load global dictionaries with their data and processors
+        active_dicts = _active_dictionaries
+        for dict_name in active_dicts:
+            dict_info = _loaded_dictionaries[dict_name]
+            self.dictionaries[dict_name] = dict_info['data']
+            if dict_info['processor'] is not None:
+                self.processors[dict_name] = dict_info['processor']
+
+
+    def _check_support(self):
+        if self.not_supported:
+            warnings.warn(f"Document extension '{self.document.extension}' is not supported. Supported formats: .htm, .html, .txt")
+            return False
+        return True
+
+    @property
+    def cusips(self):
+        if not self._check_support():
+            return None
+
+        if not hasattr(self, '_cusip'):
+            if 'sc13dg_cusips' in self.dictionaries:
+                keywords = self.dictionaries['sc13dg_cusips']
+                self._cusip = get_cusip_using_regex(self.document.text, keywords)
+            else:
+                self._cusip = get_cusip_using_regex(self.document.text)
+        return self._cusip
+
+    @property
+    def isins(self):
+        if not self._check_support():
+            return None
+
+        if not hasattr(self, '_isin'):
+            if 'npx_isins' in self.dictionaries:
+                keywords = self.dictionaries['npx_isins']
+                self._isin = get_isin_using_regex(self.document.text, keywords)
+            else:
+                self._isin = get_isin_using_regex(self.document.text)
+        return self._isin
+
+    @property
+    def figis(self):
+        if not self._check_support():
+            return None
+
+        if not hasattr(self, '_figi'):
+            if 'npx_figis' in self.dictionaries:
+                keywords = self.dictionaries['npx_figis']
+                self._figi = get_figi_using_regex(self.document.text, keywords)
+            else:
+                self._figi = get_figi_using_regex(self.document.text)
+        return self._figi
+
+    @property
+    def tickers(self):
+        if self._tickers is None:
+            self._tickers = Tickers(self.document)
+        return self._tickers
+
+    @property
+    def persons(self):
+        if not self._check_support():
+            return None
+
+        if not hasattr(self, '_persons'):
+            if '8k_2024_persons' in self.processors:
+                # Use pre-built processor
+                self._persons = get_full_names_dictionary_lookup(self.document.text, self.processors['8k_2024_persons'])
+            elif 'ssa_baby_first_names' in self.dictionaries:
+                # Use regex with SSA names for validation
+                self._persons = get_full_names(self.document.text, self.dictionaries['ssa_baby_first_names'])
+            else:
+                # Fallback to regex without validation
+                self._persons = get_full_names(self.document.text)
+        return self._persons
+
+
 class Document:
     def __init__(self, type, content, extension, accession, filing_date, path=None):
 
@@ -34,10 +162,13 @@ class Document:
         self.path = path
 
         self.extension = extension
+
         # this will be filled by parsed
         self._data = None
         self._tables = None
        self._text = None
+
+        self.tags = Tags(self)
 
 
 
```
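For orientation (not part of the diff): a minimal usage sketch of the new `Document.tags` attribute. The constructor signature and attribute names come from the hunks above; the filing text, accession number, and date are placeholders, and the sketch assumes `Document.text` decodes raw `.txt` content, which this diff does not show.

```python
from datamule.document.document import Document

doc = Document(
    type="SC 13D",
    content=b"Reporting person John Smith holds shares of Apple Inc. "
            b"CUSIP 037833100 trades as AAPL on NASDAQ.",
    extension=".txt",
    accession="0000000000-24-000000",  # placeholder accession number
    filing_date="2024-01-01",          # placeholder date
)

print(doc.tags.cusips)   # (match, start, end) tuples, e.g. ('037833100', ...)
print(doc.tags.persons)  # capitalized-run candidates, e.g. ('John Smith', ...)

# Tickers proxies exchange names to regex results; the patterns are loose,
# so 'nasdaq' returns AAPL but also any other 1-5 letter uppercase token.
print(doc.tags.tickers.nasdaq)
print(bool(doc.tags.tickers))  # True if any exchange pattern matched
```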
```diff
--- /dev/null
+++ datamule-2.2.1/datamule/tags/config.py
@@ -0,0 +1,33 @@
+from ..utils.dictionaries import download_dictionary, load_dictionary
+
+_active_dictionaries = []
+_loaded_dictionaries = {}
+
+def set_dictionaries(dictionaries, overwrite=False):
+    """Set active dictionaries and load them into memory"""
+    global _active_dictionaries, _loaded_dictionaries
+    _active_dictionaries = dictionaries
+    _loaded_dictionaries = {}
+
+    for dict_name in dictionaries:
+        # Download if needed
+        download_dictionary(dict_name, overwrite=overwrite)
+        # Load raw data
+        raw_data = load_dictionary(dict_name)
+
+        # Create processor for dictionary lookup methods
+        if dict_name in ['8k_2024_persons']:  # Add other dict names as needed
+            from flashtext import KeywordProcessor
+            processor = KeywordProcessor(case_sensitive=True)
+            for key in raw_data.keys():
+                processor.add_keyword(key, key)
+
+            _loaded_dictionaries[dict_name] = {
+                'data': raw_data,
+                'processor': processor
+            }
+        else:
+            _loaded_dictionaries[dict_name] = {
+                'data': raw_data,
+                'processor': None
+            }
```
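A short sketch of how the new `set_dictionaries` entry point appears intended to be used (the dictionary names are the keys of `urls` in `datamule/utils/dictionaries.py` below; the first call needs network access):

```python
from datamule.tags.config import set_dictionaries

# Downloads any missing lists to ~/.datamule/dictionaries, caches them in
# module-level state, and builds a flashtext KeywordProcessor for
# '8k_2024_persons'; Tags objects created afterwards pick all of this up.
set_dictionaries(['ssa_baby_first_names', 'sc13dg_cusips', '8k_2024_persons'])
```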
```diff
--- /dev/null
+++ datamule-2.2.1/datamule/tags/regex.py
@@ -0,0 +1,105 @@
+# Exchange ticker regexes with word boundaries
+nyse_regex = r"\b([A-Z]{1,4})(\.[A-Z]+)?\b"
+nasdaq_regex = r"\b([A-Z]{1,5})(\.[A-Z]+)?\b"
+nyse_american_regex = r"\b([A-Z]{1,5})(\.[A-Z]+)?\b"
+london_stock_exchange_regex = r"\b([A-Z]{3,4})(\.[A-Z]+)?\b"
+toronto_stock_exchange_regex = r"\b([A-Z]{1,5})(\.[A-Z]+)?\b"
+euronext_paris_regex = r"\b([A-Z]{2,12})(\.[A-Z]+)?\b"
+euronext_amsterdam_regex = r"\b([A-Z]{1,5})(\.[A-Z]+)?\b"
+euronext_brussels_regex = r"\b([A-Z]{1,5})(\.[A-Z]+)?\b"
+euronext_lisbon_regex = r"\b([A-Z]{3,5})(\.[A-Z]+)?\b"
+euronext_milan_regex = r"\b([A-Z]{2,5})(\.[A-Z]+)?\b"
+deutsche_borse_xetra_regex = r"\b([A-Z0-9]{3,6})(\.[A-Z]+)?\b"
+six_swiss_exchange_regex = r"\b([A-Z]{2,6})(\.[A-Z]+)?\b"
+tokyo_stock_exchange_regex = r"\b(\d{4})\b"
+hong_kong_stock_exchange_regex = r"\b(\d{4,5})\b"
+shanghai_stock_exchange_regex = r"\b(6\d{5})\b"
+shenzhen_stock_exchange_regex = r"\b([03]\d{5})\b"
+australian_securities_exchange_regex = r"\b([A-Z]{3})(\.[A-Z]+)?\b"
+singapore_exchange_regex = r"\b([A-Z]\d{2}[A-Z]?)(\.[A-Z]+)?\b"
+nse_bse_regex = r"\b([A-Z&]{1,10})(\.[A-Z]+)?\b"
+sao_paulo_b3_regex = r"\b([A-Z]{4}\d{1,2})(\.[A-Z]+)?\b"
+mexico_bmv_regex = r"\b([A-Z*]{1,7})(\.[A-Z]+)?\b"
+korea_exchange_regex = r"\b(\d{6})\b"
+taiwan_stock_exchange_regex = r"\b(\d{4})\b"
+johannesburg_stock_exchange_regex = r"\b([A-Z]{3})(\.[A-Z]+)?\b"
+tel_aviv_stock_exchange_regex = r"\b([A-Z]{4})(\.[A-Z]+)?\b"
+moscow_exchange_regex = r"\b([A-Z]{4})(\.[A-Z]+)?\b"
+istanbul_stock_exchange_regex = r"\b([A-Z]{5})(\.[A-Z]+)?\b"
+nasdaq_stockholm_regex = r"\b([A-Z]{3,4})( [A-Z])?(\.[A-Z]+)?\b"
+oslo_bors_regex = r"\b([A-Z]{3,5})(\.[A-Z]+)?\b"
+otc_markets_us_regex = r"\b([A-Z]{4,5})[FY]?(\.[A-Z]+)?\b"
+pink_sheets_regex = r"\b([A-Z]{4,5})(\.[A-Z]+)?\b"
+
+ticker_regex_list = [
+    ("nyse", r"\b([A-Z]{1,4})(\.[A-Z]+)?\b"),
+    ("nasdaq", r"\b([A-Z]{1,5})(\.[A-Z]+)?\b"),
+    ("nyse_american", r"\b([A-Z]{1,5})(\.[A-Z]+)?\b"),
+    ("london_stock_exchange", r"\b([A-Z]{3,4})(\.[A-Z]+)?\b"),
+    ("toronto_stock_exchange", r"\b([A-Z]{1,5})(\.[A-Z]+)?\b"),
+    ("euronext_paris", r"\b([A-Z]{2,12})(\.[A-Z]+)?\b"),
+    ("euronext_amsterdam", r"\b([A-Z]{1,5})(\.[A-Z]+)?\b"),
+    ("euronext_brussels", r"\b([A-Z]{1,5})(\.[A-Z]+)?\b"),
+    ("euronext_lisbon", r"\b([A-Z]{3,5})(\.[A-Z]+)?\b"),
+    ("euronext_milan", r"\b([A-Z]{2,5})(\.[A-Z]+)?\b"),
+    ("deutsche_borse_xetra", r"\b([A-Z0-9]{3,6})(\.[A-Z]+)?\b"),
+    ("six_swiss_exchange", r"\b([A-Z]{2,6})(\.[A-Z]+)?\b"),
+    ("tokyo_stock_exchange", r"\b(\d{4})\b"),
+    ("hong_kong_stock_exchange", r"\b(\d{4,5})\b"),
+    ("shanghai_stock_exchange", r"\b(6\d{5})\b"),
+    ("shenzhen_stock_exchange", r"\b([03]\d{5})\b"),
+    ("australian_securities_exchange", r"\b([A-Z]{3})(\.[A-Z]+)?\b"),
+    ("singapore_exchange", r"\b([A-Z]\d{2}[A-Z]?)(\.[A-Z]+)?\b"),
+    ("nse_bse", r"\b([A-Z&]{1,10})(\.[A-Z]+)?\b"),
+    ("sao_paulo_b3", r"\b([A-Z]{4}\d{1,2})(\.[A-Z]+)?\b"),
+    ("mexico_bmv", r"\b([A-Z*]{1,7})(\.[A-Z]+)?\b"),
+    ("korea_exchange", r"\b(\d{6})\b"),
+    ("taiwan_stock_exchange", r"\b(\d{4})\b"),
+    ("johannesburg_stock_exchange", r"\b([A-Z]{3})(\.[A-Z]+)?\b"),
+    ("tel_aviv_stock_exchange", r"\b([A-Z]{4})(\.[A-Z]+)?\b"),
+    ("moscow_exchange", r"\b([A-Z]{4})(\.[A-Z]+)?\b"),
+    ("istanbul_stock_exchange", r"\b([A-Z]{5})(\.[A-Z]+)?\b"),
+    ("nasdaq_stockholm", r"\b([A-Z]{3,4})( [A-Z])?(\.[A-Z]+)?\b"),
+    ("oslo_bors", r"\b([A-Z]{3,5})(\.[A-Z]+)?\b"),
+    ("otc_markets_us", r"\b([A-Z]{4,5})[FY]?(\.[A-Z]+)?\b"),
+    ("pink_sheets", r"\b([A-Z]{4,5})(\.[A-Z]+)?\b"),
+]
+# Security identifier regexes with word boundaries
+cusip_regex = r"\b[0-9A-Z]{8}[0-9]\b"
+isin_regex = r"\b[A-Z]{2}[0-9A-Z]{9}[0-9]\b"
+figi_regex = r"\b[A-Z]{2}G[A-Z0-9]{8}[0-9]\b"
+
+particles = {
+    # Dutch - single words only
+    'van', 'der', 'den', 'de',
+
+    # German - single words only
+    'von', 'zu', 'vom', 'zur', 'zum',
+
+    # Spanish - single words only
+    'de', 'del', 'y',
+
+    # Portuguese - single words only
+    'da', 'das', 'do', 'dos', 'e',
+
+    # French - single words only
+    'de', 'du', 'des', 'le', 'la', 'les', "d'",
+
+    # Italian - single words only
+    'da', 'di', 'del', 'della', 'delle', 'dei', 'degli', 'dello',
+
+    # Irish/Scottish
+    'mac', 'mc', 'o',
+
+    # Arabic
+    'al', 'el', 'ibn', 'bin', 'bint', 'abu',
+
+    # Other European
+    'af', 'av',  # Scandinavian
+    'ter',  # Dutch/Flemish
+    'op',  # Dutch
+    'aan',  # Dutch
+    'ten',  # Dutch
+    'het',  # Dutch
+    'in',  # Dutch
+}
```
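The identifier patterns are strict enough to check in isolation; the exchange patterns, by contrast, are deliberately loose (any one-to-four-letter uppercase token matches the NYSE shape) and depend on downstream keyword validation. A standalone check against Apple's public identifiers:

```python
import re

# Same patterns as in regex.py above.
cusip_regex = r"\b[0-9A-Z]{8}[0-9]\b"
isin_regex = r"\b[A-Z]{2}[0-9A-Z]{9}[0-9]\b"

text = "Apple Inc. (CUSIP 037833100, ISIN US0378331005)"
print(re.findall(cusip_regex, text))  # ['037833100']
print(re.findall(isin_regex, text))   # ['US0378331005']
```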
```diff
--- /dev/null
+++ datamule-2.2.1/datamule/tags/utils.py
@@ -0,0 +1,145 @@
+import re
+from .regex import cusip_regex, isin_regex, figi_regex, ticker_regex_list
+from .regex import particles
+from flashtext import KeywordProcessor
+
+def get_cusip_using_regex(text, keywords=None):
+    matches = []
+    for match in re.finditer(cusip_regex, text):
+        if keywords is not None:
+            if match.group() in keywords:
+                matches.append((match.group(), match.start(), match.end()))
+        else:
+            matches.append((match.group(), match.start(), match.end()))
+    return matches
+
+def get_isin_using_regex(text, keywords=None):
+    matches = []
+    for match in re.finditer(isin_regex, text):
+        if keywords is not None:
+            if match.group() in keywords:
+                matches.append((match.group(), match.start(), match.end()))
+        else:
+            matches.append((match.group(), match.start(), match.end()))
+    return matches
+
+def get_figi_using_regex(text, keywords=None):
+    matches = []
+    for match in re.finditer(figi_regex, text):
+        if keywords is not None:
+            if match.group() in keywords:
+                matches.append((match.group(), match.start(), match.end()))
+        else:
+            matches.append((match.group(), match.start(), match.end()))
+    return matches
+
+def get_tickers_using_regex(text, regex_pattern):
+    """Extract tickers using the given regex pattern with position information"""
+    matches = []
+    for match in re.finditer(regex_pattern, text):
+        # Handle tuples from regex groups - take the first capture group
+        if match.groups():
+            ticker = match.group(1) if match.group(1) else match.group(0)
+        else:
+            ticker = match.group(0)
+        matches.append((ticker, match.start(), match.end()))
+    return matches
+
+def get_all_tickers(text):
+    """Get all tickers from all exchanges organized by exchange with position info"""
+    result = {}
+    all_tickers = []
+
+    for exchange_name, regex_pattern in ticker_regex_list:
+        tickers = get_tickers_using_regex(text, regex_pattern)
+        result[exchange_name] = tickers
+        all_tickers.extend(tickers)
+
+    # Remove duplicates while preserving order for 'all'
+    # Keep track of seen ticker values (first element of tuple)
+    seen = set()
+    result['all'] = [x for x in all_tickers if not (x[0] in seen or seen.add(x[0]))]
+
+    return result
+
+def get_ticker_regex_dict():
+    """Return ticker regex list as a dictionary for easy lookup"""
+    return dict(ticker_regex_list)
+
+# will change in future to accommodate other datasets
+def validate_full_name(full_name, keywords):
+    if len(full_name) == 1:
+        return False
+    # check all is upper
+    if all(word.isupper() for word in full_name):
+        return False
+    # check if any number in word
+    if any(any(char.isdigit() for char in word) for word in full_name):
+        return False
+    if any(any(char in ".,;:!?()[]" for char in word) for word in full_name):
+        return False
+
+    # add optional set lookups
+    if keywords is not None:
+        # return false if first word is not in keywords set
+        if full_name[0] not in keywords:
+            return False
+
+    return True
+
+def get_full_names(text, keywords=None):
+    words = text.split()
+    full_names = []
+    current_pos = None
+    word_start_positions = []
+
+    # Calculate word positions in the original text
+    pos = 0
+    for word in words:
+        start = text.find(word, pos)
+        word_start_positions.append(start)
+        pos = start + len(word)
+
+    for idx, word in enumerate(words):
+        if current_pos is None:
+            if word[0].isupper():
+                current_pos = idx
+        else:
+            if word[0].isupper() or word.lower() in particles:
+                continue
+            else:
+                full_name = words[current_pos:idx]
+                if validate_full_name(full_name, keywords):
+                    name_text = ' '.join(full_name)
+                    start_pos = word_start_positions[current_pos]
+                    # Calculate end position of the last word in the name
+                    last_word_idx = idx - 1
+                    end_pos = word_start_positions[last_word_idx] + len(words[last_word_idx])
+                    full_names.append((name_text, start_pos, end_pos))
+
+                current_pos = None
+
+    # handle last case - if we're still tracking a name when we reach the end
+    if current_pos is not None:
+        full_name = words[current_pos:]
+        if validate_full_name(full_name, keywords):
+            name_text = ' '.join(full_name)
+            start_pos = word_start_positions[current_pos]
+            # Calculate end position of the last word
+            last_word_idx = len(words) - 1
+            end_pos = word_start_positions[last_word_idx] + len(words[last_word_idx])
+            full_names.append((name_text, start_pos, end_pos))
+
+    return full_names
+
+# add dictionary lookup based on precomputed lists
+def get_full_names_dictionary_lookup(text, processor):
+    """Use pre-built KeywordProcessor instead of creating new one"""
+    matches = []
+    keywords_found = processor.extract_keywords(text, span_info=True)
+
+    for keyword, start_pos, end_pos in keywords_found:
+        matches.append((keyword, start_pos, end_pos))
+
+    return matches
```
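A standalone sketch of `get_full_names` above: it groups runs of capitalized words (lowercase particles such as 'de' or 'van' may continue a run) and rejects single words or runs containing digits or punctuation via `validate_full_name`:

```python
from datamule.tags.utils import get_full_names

text = "Director John Smith met with analysts before the vote"
print(get_full_names(text))
# -> [('Director John Smith', 0, 19)]: (name, start, end) offsets into text
```

Note that a run must start with a capitalized word, so a bare "de la Cruz" is missed, and when a `keywords` set is supplied the run's first word must appear in it.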
```diff
--- /dev/null
+++ datamule-2.2.1/datamule/utils/dictionaries.py
@@ -0,0 +1,76 @@
+from pathlib import Path
+import urllib.request
+import json
+urls = {
+    "ssa_baby_first_names": "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/ssa_baby_first_names.txt",
+    "npx_figis": "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/npx_figis.txt",
+    "npx_isins": "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/npx_isins.txt",
+    "sc13dg_cusips": "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/sc13dg_cusips.txt",
+    "8k_2024_persons": "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/8k_2024_persons.json"
+}
+
+
+def download_dictionary(name, overwrite=False):
+    url = urls[name]
+
+    # Create dictionaries directory in datamule folder
+    dict_dir = Path.home() / ".datamule" / "dictionaries"
+    dict_dir.mkdir(parents=True, exist_ok=True)
+
+    # check if file exists first
+    if not overwrite:
+        filename = url.split('/')[-1]
+        file_path = dict_dir / filename
+        if file_path.exists():
+            return
+
+    # Extract filename from URL
+    filename = url.split('/')[-1]
+    file_path = dict_dir / filename
+
+    print(f"Downloading {name} dictionary to {file_path}")
+    urllib.request.urlretrieve(url, file_path)
+    return
+
+def load_dictionary(name):
+    # Get or download the dictionary file
+    dict_dir = Path.home() / ".datamule" / "dictionaries"
+    filename = urls[name].split('/')[-1]
+    file_path = dict_dir / filename
+
+    # Download if doesn't exist
+    if not file_path.exists():
+        download_dictionary(name)
+
+    # Load the dictionary based on name
+    if name == "ssa_baby_first_names":
+        names_set = set()
+        with open(file_path, 'r', encoding='utf-8') as f:
+            for line in f:
+                names_set.add(line.strip())
+        return names_set
+    elif name == "npx_figis":
+        figi_set = set()
+        with open(file_path, 'r', encoding='utf-8') as f:
+            for line in f:
+                figi_set.add(line.strip())
+        return figi_set
+    elif name == "npx_isins":
+        isin_set = set()
+        with open(file_path, 'r', encoding='utf-8') as f:
+            for line in f:
+                isin_set.add(line.strip())
+        return isin_set
+    elif name == "sc13dg_cusips":
+        cusip_set = set()
+        with open(file_path, 'r', encoding='utf-8') as f:
+            for line in f:
+                cusip_set.add(line.strip())
+        return cusip_set
+    elif name == "8k_2024_persons":
+
+        with open(file_path, 'r', encoding='utf-8') as f:
+            persons_list = json.load(f)
+        return persons_list
+    else:
+        raise ValueError("dictionary not found")
```
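The helpers can also be driven directly; a minimal sketch (the first call downloads from the john-friedman/datamule-data GitHub repository):

```python
from datamule.utils.dictionaries import download_dictionary, load_dictionary

download_dictionary("sc13dg_cusips")       # no-op if already cached locally
cusips = load_dictionary("sc13dg_cusips")  # text lists load as sets of strings
print(len(cusips), "CUSIPs loaded")
```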
```diff
--- datamule-2.1.6/datamule.egg-info/PKG-INFO
+++ datamule-2.2.1/datamule.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 2.1.6
+Version: 2.2.1
 Summary: Work with SEC submissions at scale.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman
@@ -19,3 +19,4 @@ Requires-Dist: secxbrl
 Requires-Dist: secsgml
 Requires-Dist: websocket-client
 Requires-Dist: company_fundamentals
+Requires-Dist: flashtext
```
```diff
--- datamule-2.1.6/datamule.egg-info/SOURCES.txt
+++ datamule-2.2.1/datamule.egg-info/SOURCES.txt
@@ -54,6 +54,11 @@ datamule/sec/xbrl/streamcompanyfacts.py
 datamule/sec/xbrl/xbrlmonitor.py
 datamule/seclibrary/__init__.py
 datamule/seclibrary/bq.py
+datamule/tags/__init__.py
+datamule/tags/config.py
+datamule/tags/regex.py
+datamule/tags/utils.py
 datamule/utils/__init__.py
 datamule/utils/construct_submissions_data.py
+datamule/utils/dictionaries.py
 datamule/utils/format_accession.py
```
```diff
--- datamule-2.1.6/setup.py
+++ datamule-2.2.1/setup.py
@@ -32,7 +32,7 @@ if not os.path.exists(file_path):
 setup(
     name="datamule",
     author="John Friedman",
-    version="2.1.6",
+    version="2.2.1",
     description="Work with SEC submissions at scale.",
     packages=find_packages(include=['datamule', 'datamule.*']),
     url="https://github.com/john-friedman/datamule-python",
@@ -51,7 +51,8 @@ setup(
         'secxbrl',
         'secsgml',
         'websocket-client',
-        'company_fundamentals'
+        'company_fundamentals',
+        'flashtext'
     ],
     # Include the data directory in the package
     package_data={
```