datamule 2.2.2__tar.gz → 2.2.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datamule-2.2.2 → datamule-2.2.4}/PKG-INFO +1 -1
- {datamule-2.2.2 → datamule-2.2.4}/datamule/datamule/downloader.py +1 -1
- {datamule-2.2.2 → datamule-2.2.4}/datamule/document/document.py +51 -4
- {datamule-2.2.2 → datamule-2.2.4}/datamule/tags/config.py +9 -1
- {datamule-2.2.2/datamule/utils → datamule-2.2.4/datamule/tags}/dictionaries.py +41 -7
- {datamule-2.2.2 → datamule-2.2.4}/datamule/tags/utils.py +28 -1
- datamule-2.2.4/datamule/utils/__init__.py +0 -0
- {datamule-2.2.2 → datamule-2.2.4}/datamule.egg-info/PKG-INFO +1 -1
- {datamule-2.2.2 → datamule-2.2.4}/datamule.egg-info/SOURCES.txt +2 -1
- {datamule-2.2.2 → datamule-2.2.4}/setup.py +1 -1
- {datamule-2.2.2 → datamule-2.2.4}/datamule/__init__.py +0 -0
- {datamule-2.2.2 → datamule-2.2.4}/datamule/config.py +0 -0
- {datamule-2.2.2 → datamule-2.2.4}/datamule/data/listed_filer_metadata.csv +0 -0
- {datamule-2.2.2 → datamule-2.2.4}/datamule/datamule/__init__.py +0 -0
- {datamule-2.2.2 → datamule-2.2.4}/datamule/datamule/datamule_lookup.py +0 -0
- {datamule-2.2.2 → datamule-2.2.4}/datamule/datamule/datamule_mysql_rds.py +0 -0
- {datamule-2.2.2 → datamule-2.2.4}/datamule/datamule/sec_connector.py +0 -0
- {datamule-2.2.2 → datamule-2.2.4}/datamule/datasets.py +0 -0
- {datamule-2.2.2 → datamule-2.2.4}/datamule/document/__init__.py +0 -0
- {datamule-2.2.2 → datamule-2.2.4}/datamule/document/tables/__init__.py +0 -0
- {datamule-2.2.2 → datamule-2.2.4}/datamule/document/tables/tables.py +0 -0
- {datamule-2.2.2 → datamule-2.2.4}/datamule/document/tables/tables_13fhr.py +0 -0
- {datamule-2.2.2 → datamule-2.2.4}/datamule/document/tables/tables_25nse.py +0 -0
- {datamule-2.2.2 → datamule-2.2.4}/datamule/document/tables/tables_informationtable.py +0 -0
- {datamule-2.2.2 → datamule-2.2.4}/datamule/document/tables/tables_npx.py +0 -0
- {datamule-2.2.2 → datamule-2.2.4}/datamule/document/tables/tables_ownership.py +0 -0
- {datamule-2.2.2 → datamule-2.2.4}/datamule/document/tables/tables_proxyvotingrecord.py +0 -0
- {datamule-2.2.2 → datamule-2.2.4}/datamule/document/tables/tables_sbsef.py +0 -0
- {datamule-2.2.2 → datamule-2.2.4}/datamule/document/tables/tables_sdr.py +0 -0
- {datamule-2.2.2 → datamule-2.2.4}/datamule/document/tables/utils.py +0 -0
- {datamule-2.2.2 → datamule-2.2.4}/datamule/helper.py +0 -0
- {datamule-2.2.2 → datamule-2.2.4}/datamule/index.py +0 -0
- {datamule-2.2.2 → datamule-2.2.4}/datamule/mapping_dicts/__init__.py +0 -0
- {datamule-2.2.2 → datamule-2.2.4}/datamule/mapping_dicts/html_mapping_dicts.py +0 -0
- {datamule-2.2.2 → datamule-2.2.4}/datamule/mapping_dicts/txt_mapping_dicts.py +0 -0
- {datamule-2.2.2 → datamule-2.2.4}/datamule/mapping_dicts/xml_mapping_dicts.py +0 -0
- {datamule-2.2.2 → datamule-2.2.4}/datamule/package_updater.py +0 -0
- {datamule-2.2.2 → datamule-2.2.4}/datamule/portfolio.py +0 -0
- {datamule-2.2.2 → datamule-2.2.4}/datamule/portfolio_compression_utils.py +0 -0
- {datamule-2.2.2 → datamule-2.2.4}/datamule/sec/__init__.py +0 -0
- {datamule-2.2.2 → datamule-2.2.4}/datamule/sec/infrastructure/__init__.py +0 -0
- {datamule-2.2.2 → datamule-2.2.4}/datamule/sec/infrastructure/submissions_metadata.py +0 -0
- {datamule-2.2.2 → datamule-2.2.4}/datamule/sec/submissions/__init__.py +0 -0
- {datamule-2.2.2 → datamule-2.2.4}/datamule/sec/submissions/downloader.py +0 -0
- {datamule-2.2.2 → datamule-2.2.4}/datamule/sec/submissions/eftsquery.py +0 -0
- {datamule-2.2.2 → datamule-2.2.4}/datamule/sec/submissions/monitor.py +0 -0
- {datamule-2.2.2 → datamule-2.2.4}/datamule/sec/submissions/streamer.py +0 -0
- {datamule-2.2.2 → datamule-2.2.4}/datamule/sec/submissions/textsearch.py +0 -0
- {datamule-2.2.2 → datamule-2.2.4}/datamule/sec/utils.py +0 -0
- {datamule-2.2.2 → datamule-2.2.4}/datamule/sec/xbrl/__init__.py +0 -0
- {datamule-2.2.2 → datamule-2.2.4}/datamule/sec/xbrl/downloadcompanyfacts.py +0 -0
- {datamule-2.2.2 → datamule-2.2.4}/datamule/sec/xbrl/filter_xbrl.py +0 -0
- {datamule-2.2.2 → datamule-2.2.4}/datamule/sec/xbrl/streamcompanyfacts.py +0 -0
- {datamule-2.2.2 → datamule-2.2.4}/datamule/sec/xbrl/xbrlmonitor.py +0 -0
- {datamule-2.2.2 → datamule-2.2.4}/datamule/seclibrary/__init__.py +0 -0
- {datamule-2.2.2 → datamule-2.2.4}/datamule/seclibrary/bq.py +0 -0
- {datamule-2.2.2/datamule/tags → datamule-2.2.4/datamule/sentiment}/__init__.py +0 -0
- {datamule-2.2.2 → datamule-2.2.4}/datamule/sheet.py +0 -0
- {datamule-2.2.2 → datamule-2.2.4}/datamule/submission.py +0 -0
- {datamule-2.2.2/datamule/utils → datamule-2.2.4/datamule/tags}/__init__.py +0 -0
- {datamule-2.2.2 → datamule-2.2.4}/datamule/tags/regex.py +0 -0
- {datamule-2.2.2 → datamule-2.2.4}/datamule/utils/construct_submissions_data.py +0 -0
- {datamule-2.2.2 → datamule-2.2.4}/datamule/utils/format_accession.py +0 -0
- {datamule-2.2.2 → datamule-2.2.4}/datamule.egg-info/dependency_links.txt +0 -0
- {datamule-2.2.2 → datamule-2.2.4}/datamule.egg-info/requires.txt +0 -0
- {datamule-2.2.2 → datamule-2.2.4}/datamule.egg-info/top_level.txt +0 -0
- {datamule-2.2.2 → datamule-2.2.4}/setup.cfg +0 -0
@@ -287,7 +287,7 @@ class Downloader:
|
|
287
287
|
keepalive_timeout=60
|
288
288
|
)
|
289
289
|
|
290
|
-
async with aiohttp.ClientSession(connector=connector, timeout=aiohttp.ClientTimeout(total=
|
290
|
+
async with aiohttp.ClientSession(connector=connector, timeout=aiohttp.ClientTimeout(total=600)) as session:
|
291
291
|
tasks = [
|
292
292
|
self.download_and_process(
|
293
293
|
session, url, semaphore, decompression_pool,
|
@@ -15,7 +15,7 @@ from secsgml.utils import bytes_to_str
|
|
15
15
|
import tempfile
|
16
16
|
from .tables.tables import Tables
|
17
17
|
|
18
|
-
from ..tags.utils import get_cusip_using_regex, get_isin_using_regex, get_figi_using_regex,get_all_tickers, get_full_names,get_full_names_dictionary_lookup
|
18
|
+
from ..tags.utils import get_cusip_using_regex, get_isin_using_regex, get_figi_using_regex,get_all_tickers, get_full_names,get_full_names_dictionary_lookup, analyze_lm_sentiment_fragment
|
19
19
|
|
20
20
|
class DataWithTags(dict):
|
21
21
|
def __init__(self, data, document):
|
@@ -29,6 +29,12 @@ class DataWithTags(dict):
|
|
29
29
|
self._tags = Tags(self._document, mode='data') # New fragment-based behavior
|
30
30
|
return self._tags
|
31
31
|
|
32
|
+
@property
|
33
|
+
def similarity(self):
|
34
|
+
if not hasattr(self, '_similarity'):
|
35
|
+
self._similarity = Similarity(self._document, mode='data')
|
36
|
+
return self._similarity
|
37
|
+
|
32
38
|
class TextWithTags(str):
|
33
39
|
def __new__(cls, content, document):
|
34
40
|
instance = str.__new__(cls, content)
|
@@ -42,6 +48,12 @@ class TextWithTags(str):
|
|
42
48
|
self._tags = Tags(self._document, mode='text') # Original behavior
|
43
49
|
return self._tags
|
44
50
|
|
51
|
+
@property
|
52
|
+
def similarity(self):
|
53
|
+
if not hasattr(self, '_similarity'):
|
54
|
+
self._similarity = Similarity(self._document, mode='text')
|
55
|
+
return self._similarity
|
56
|
+
|
45
57
|
|
46
58
|
class Tickers:
|
47
59
|
def __init__(self, document):
|
@@ -77,12 +89,11 @@ class Tickers:
|
|
77
89
|
data = self._get_tickers_data()
|
78
90
|
return str(data)
|
79
91
|
|
80
|
-
class Tags:
|
92
|
+
class TextAnalysisBase:
|
81
93
|
def __init__(self, document, mode='text'):
|
82
94
|
from ..tags.config import _active_dictionaries,_loaded_dictionaries
|
83
95
|
self.document = document
|
84
96
|
self.mode = mode # 'text' or 'data'
|
85
|
-
self._tickers = None
|
86
97
|
self.dictionaries = {}
|
87
98
|
self.processors = {}
|
88
99
|
self._text_sources = None
|
@@ -133,6 +144,11 @@ class Tags:
|
|
133
144
|
else:
|
134
145
|
# New format: (match, fragment_id, start, end)
|
135
146
|
return [(match, fragment_id, start, end) for match, start, end in results]
|
147
|
+
|
148
|
+
class Tags(TextAnalysisBase):
|
149
|
+
def __init__(self, document, mode='text'):
|
150
|
+
super().__init__(document, mode)
|
151
|
+
self._tickers = None
|
136
152
|
|
137
153
|
@property
|
138
154
|
def cusips(self):
|
@@ -218,7 +234,38 @@ class Tags:
|
|
218
234
|
self._persons.extend(formatted_results)
|
219
235
|
|
220
236
|
return self._persons
|
221
|
-
|
237
|
+
|
238
|
+
class Similarity(TextAnalysisBase):
|
239
|
+
@property
|
240
|
+
def loughran_mcdonald(self):
|
241
|
+
if not hasattr(self, '_loughran_mcdonald'):
|
242
|
+
self._loughran_mcdonald = []
|
243
|
+
sources = self._get_text_sources()
|
244
|
+
|
245
|
+
if 'loughran_mcdonald' in self.processors:
|
246
|
+
lm_processors = self.processors['loughran_mcdonald']
|
247
|
+
|
248
|
+
for source in sources:
|
249
|
+
results = analyze_lm_sentiment_fragment(source['text'], lm_processors)
|
250
|
+
|
251
|
+
if self.mode == 'text':
|
252
|
+
# Single result for whole document
|
253
|
+
self._loughran_mcdonald = results
|
254
|
+
break
|
255
|
+
else:
|
256
|
+
# Per-fragment results with fragment_id
|
257
|
+
fragment_result = {
|
258
|
+
'fragment_id': source['id'],
|
259
|
+
**results
|
260
|
+
}
|
261
|
+
self._loughran_mcdonald.append(fragment_result)
|
262
|
+
else:
|
263
|
+
# No processors available
|
264
|
+
self._loughran_mcdonald = [] if self.mode == 'data' else {}
|
265
|
+
|
266
|
+
return self._loughran_mcdonald
|
267
|
+
|
268
|
+
|
222
269
|
class Document:
|
223
270
|
def __init__(self, type, content, extension,accession,filing_date,path=None):
|
224
271
|
|
@@ -1,4 +1,4 @@
|
|
1
|
-
from ..utils.dictionaries import download_dictionary, load_dictionary
|
1
|
+
from .dictionaries import download_dictionary, load_dictionary
|
2
2
|
|
3
3
|
_active_dictionaries = []
|
4
4
|
_loaded_dictionaries = {}
|
@@ -26,6 +26,14 @@ def set_dictionaries(dictionaries, overwrite=False):
|
|
26
26
|
'data': raw_data,
|
27
27
|
'processor': processor
|
28
28
|
}
|
29
|
+
elif dict_name == 'loughran_mcdonald':
|
30
|
+
from .utils import create_lm_processors
|
31
|
+
processors = create_lm_processors(raw_data)
|
32
|
+
|
33
|
+
_loaded_dictionaries[dict_name] = {
|
34
|
+
'data': raw_data,
|
35
|
+
'processor': processors
|
36
|
+
}
|
29
37
|
else:
|
30
38
|
_loaded_dictionaries[dict_name] = {
|
31
39
|
'data': raw_data,
|
@@ -1,17 +1,19 @@
|
|
1
1
|
from pathlib import Path
|
2
2
|
import urllib.request
|
3
3
|
import json
|
4
|
+
import csv
|
4
5
|
urls = {
|
5
6
|
"ssa_baby_first_names": "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/ssa_baby_first_names.txt",
|
6
7
|
"npx_figis" : "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/npx_figis.txt",
|
7
8
|
"npx_isins" : "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/npx_isins.txt",
|
8
9
|
"sc13dg_cusips" : "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/sc13dg_cusips.txt",
|
9
10
|
"8k_2024_persons" : "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/8k_2024_persons.json",
|
10
|
-
"13fhr_information_table_cusips" : "https://raw.githubusercontent.com/john-friedman/datamule-data/refs/heads/master/data/dictionaries/13fhr_information_table_cusips.txt"
|
11
|
+
"13fhr_information_table_cusips" : "https://raw.githubusercontent.com/john-friedman/datamule-data/refs/heads/master/data/dictionaries/13fhr_information_table_cusips.txt",
|
12
|
+
"loughran_mcdonald" : "https://drive.usercontent.google.com/u/0/uc?id=1cfg_w3USlRFS97wo7XQmYnuzhpmzboAY&export=download"
|
11
13
|
}
|
12
14
|
|
13
15
|
|
14
|
-
def download_dictionary(name,overwrite=False):
|
16
|
+
def download_dictionary(name, overwrite=False):
|
15
17
|
url = urls[name]
|
16
18
|
|
17
19
|
# Create dictionaries directory in datamule folder
|
@@ -20,13 +22,19 @@ def download_dictionary(name,overwrite=False):
|
|
20
22
|
|
21
23
|
# check if file exists first
|
22
24
|
if not overwrite:
|
23
|
-
|
25
|
+
if name == "loughran_mcdonald":
|
26
|
+
filename = "loughran_mcdonald.csv"
|
27
|
+
else:
|
28
|
+
filename = url.split('/')[-1]
|
24
29
|
file_path = dict_dir / filename
|
25
30
|
if file_path.exists():
|
26
31
|
return
|
27
32
|
|
28
33
|
# Extract filename from URL
|
29
|
-
|
34
|
+
if name == "loughran_mcdonald":
|
35
|
+
filename = "loughran_mcdonald.csv"
|
36
|
+
else:
|
37
|
+
filename = url.split('/')[-1]
|
30
38
|
file_path = dict_dir / filename
|
31
39
|
|
32
40
|
print(f"Downloading {name} dictionary to {file_path}")
|
@@ -36,7 +44,11 @@ def download_dictionary(name,overwrite=False):
|
|
36
44
|
def load_dictionary(name):
|
37
45
|
# Get or download the dictionary file
|
38
46
|
dict_dir = Path.home() / ".datamule" / "dictionaries"
|
39
|
-
|
47
|
+
|
48
|
+
if name == "loughran_mcdonald":
|
49
|
+
filename = "loughran_mcdonald.csv"
|
50
|
+
else:
|
51
|
+
filename = urls[name].split('/')[-1]
|
40
52
|
file_path = dict_dir / filename
|
41
53
|
|
42
54
|
# Download if doesn't exist
|
@@ -75,9 +87,31 @@ def load_dictionary(name):
|
|
75
87
|
cusip_set.add(line.strip())
|
76
88
|
return cusip_set
|
77
89
|
elif name == "8k_2024_persons":
|
78
|
-
|
79
90
|
with open(file_path, 'r', encoding='utf-8') as f:
|
80
91
|
persons_list = json.load(f)
|
81
92
|
return persons_list
|
93
|
+
elif name == "loughran_mcdonald":
|
94
|
+
# Load the Loughran-McDonald dictionary using base Python CSV
|
95
|
+
lm_dict = {}
|
96
|
+
categories = ['Negative', 'Positive', 'Uncertainty', 'Litigious',
|
97
|
+
'Strong_Modal', 'Weak_Modal', 'Constraining']
|
98
|
+
|
99
|
+
# Initialize category sets
|
100
|
+
for category in categories:
|
101
|
+
lm_dict[category.lower()] = set()
|
102
|
+
|
103
|
+
with open(file_path, 'r', encoding='utf-8') as f:
|
104
|
+
reader = csv.DictReader(f)
|
105
|
+
for row in reader:
|
106
|
+
word = row['Word'].lower()
|
107
|
+
for category in categories:
|
108
|
+
value = row.get(category)
|
109
|
+
# Check if value exists and is not 0 (words added in specific years)
|
110
|
+
if value and str(value).strip() != '0':
|
111
|
+
lm_dict[category.lower()].add(word)
|
112
|
+
|
113
|
+
return lm_dict
|
82
114
|
else:
|
83
|
-
raise ValueError("dictionary not found")
|
115
|
+
raise ValueError("dictionary not found")
|
116
|
+
|
117
|
+
download_dictionary('loughran_mcdonald')
|
@@ -142,4 +142,31 @@ def get_full_names_dictionary_lookup(text, processor):
|
|
142
142
|
for keyword, start_pos, end_pos in keywords_found:
|
143
143
|
matches.append((keyword, start_pos, end_pos))
|
144
144
|
|
145
|
-
return matches
|
145
|
+
return matches
|
146
|
+
|
147
|
+
|
148
|
+
def create_lm_processors(lm_dict):
|
149
|
+
processors = {}
|
150
|
+
|
151
|
+
for category_key, word_set in lm_dict.items():
|
152
|
+
processor = KeywordProcessor(case_sensitive=False)
|
153
|
+
for word in word_set:
|
154
|
+
processor.add_keyword(word)
|
155
|
+
processors[category_key] = processor
|
156
|
+
|
157
|
+
return processors
|
158
|
+
|
159
|
+
def analyze_lm_sentiment_fragment(text, processors):
|
160
|
+
"""Analyze sentiment for a single text fragment"""
|
161
|
+
if not text or not text.strip():
|
162
|
+
return {}
|
163
|
+
|
164
|
+
word_count = len(text.split())
|
165
|
+
results = {}
|
166
|
+
|
167
|
+
for category, processor in processors.items():
|
168
|
+
matches = processor.extract_keywords(text.lower(), span_info=True)
|
169
|
+
results[category] = len(matches)
|
170
|
+
|
171
|
+
results['total_words'] = word_count
|
172
|
+
return results
|
File without changes
|
@@ -54,11 +54,12 @@ datamule/sec/xbrl/streamcompanyfacts.py
|
|
54
54
|
datamule/sec/xbrl/xbrlmonitor.py
|
55
55
|
datamule/seclibrary/__init__.py
|
56
56
|
datamule/seclibrary/bq.py
|
57
|
+
datamule/sentiment/__init__.py
|
57
58
|
datamule/tags/__init__.py
|
58
59
|
datamule/tags/config.py
|
60
|
+
datamule/tags/dictionaries.py
|
59
61
|
datamule/tags/regex.py
|
60
62
|
datamule/tags/utils.py
|
61
63
|
datamule/utils/__init__.py
|
62
64
|
datamule/utils/construct_submissions_data.py
|
63
|
-
datamule/utils/dictionaries.py
|
64
65
|
datamule/utils/format_accession.py
|
@@ -32,7 +32,7 @@ if not os.path.exists(file_path):
|
|
32
32
|
setup(
|
33
33
|
name="datamule",
|
34
34
|
author="John Friedman",
|
35
|
-
version="2.2.2",
|
35
|
+
version="2.2.4",
|
36
36
|
description="Work with SEC submissions at scale.",
|
37
37
|
packages=find_packages(include=['datamule', 'datamule.*']),
|
38
38
|
url="https://github.com/john-friedman/datamule-python",
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|