datamule 2.2.1__tar.gz → 2.2.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. {datamule-2.2.1 → datamule-2.2.3}/PKG-INFO +1 -1
  2. {datamule-2.2.1 → datamule-2.2.3}/datamule/document/document.py +192 -61
  3. {datamule-2.2.1 → datamule-2.2.3}/datamule/tags/config.py +9 -1
  4. {datamule-2.2.1/datamule/utils → datamule-2.2.3/datamule/tags}/dictionaries.py +48 -7
  5. {datamule-2.2.1 → datamule-2.2.3}/datamule/tags/utils.py +28 -1
  6. datamule-2.2.3/datamule/utils/__init__.py +0 -0
  7. {datamule-2.2.1 → datamule-2.2.3}/datamule.egg-info/PKG-INFO +1 -1
  8. {datamule-2.2.1 → datamule-2.2.3}/datamule.egg-info/SOURCES.txt +2 -1
  9. {datamule-2.2.1 → datamule-2.2.3}/setup.py +1 -1
  10. {datamule-2.2.1 → datamule-2.2.3}/datamule/__init__.py +0 -0
  11. {datamule-2.2.1 → datamule-2.2.3}/datamule/config.py +0 -0
  12. {datamule-2.2.1 → datamule-2.2.3}/datamule/data/listed_filer_metadata.csv +0 -0
  13. {datamule-2.2.1 → datamule-2.2.3}/datamule/datamule/__init__.py +0 -0
  14. {datamule-2.2.1 → datamule-2.2.3}/datamule/datamule/datamule_lookup.py +0 -0
  15. {datamule-2.2.1 → datamule-2.2.3}/datamule/datamule/datamule_mysql_rds.py +0 -0
  16. {datamule-2.2.1 → datamule-2.2.3}/datamule/datamule/downloader.py +0 -0
  17. {datamule-2.2.1 → datamule-2.2.3}/datamule/datamule/sec_connector.py +0 -0
  18. {datamule-2.2.1 → datamule-2.2.3}/datamule/datasets.py +0 -0
  19. {datamule-2.2.1 → datamule-2.2.3}/datamule/document/__init__.py +0 -0
  20. {datamule-2.2.1 → datamule-2.2.3}/datamule/document/tables/__init__.py +0 -0
  21. {datamule-2.2.1 → datamule-2.2.3}/datamule/document/tables/tables.py +0 -0
  22. {datamule-2.2.1 → datamule-2.2.3}/datamule/document/tables/tables_13fhr.py +0 -0
  23. {datamule-2.2.1 → datamule-2.2.3}/datamule/document/tables/tables_25nse.py +0 -0
  24. {datamule-2.2.1 → datamule-2.2.3}/datamule/document/tables/tables_informationtable.py +0 -0
  25. {datamule-2.2.1 → datamule-2.2.3}/datamule/document/tables/tables_npx.py +0 -0
  26. {datamule-2.2.1 → datamule-2.2.3}/datamule/document/tables/tables_ownership.py +0 -0
  27. {datamule-2.2.1 → datamule-2.2.3}/datamule/document/tables/tables_proxyvotingrecord.py +0 -0
  28. {datamule-2.2.1 → datamule-2.2.3}/datamule/document/tables/tables_sbsef.py +0 -0
  29. {datamule-2.2.1 → datamule-2.2.3}/datamule/document/tables/tables_sdr.py +0 -0
  30. {datamule-2.2.1 → datamule-2.2.3}/datamule/document/tables/utils.py +0 -0
  31. {datamule-2.2.1 → datamule-2.2.3}/datamule/helper.py +0 -0
  32. {datamule-2.2.1 → datamule-2.2.3}/datamule/index.py +0 -0
  33. {datamule-2.2.1 → datamule-2.2.3}/datamule/mapping_dicts/__init__.py +0 -0
  34. {datamule-2.2.1 → datamule-2.2.3}/datamule/mapping_dicts/html_mapping_dicts.py +0 -0
  35. {datamule-2.2.1 → datamule-2.2.3}/datamule/mapping_dicts/txt_mapping_dicts.py +0 -0
  36. {datamule-2.2.1 → datamule-2.2.3}/datamule/mapping_dicts/xml_mapping_dicts.py +0 -0
  37. {datamule-2.2.1 → datamule-2.2.3}/datamule/package_updater.py +0 -0
  38. {datamule-2.2.1 → datamule-2.2.3}/datamule/portfolio.py +0 -0
  39. {datamule-2.2.1 → datamule-2.2.3}/datamule/portfolio_compression_utils.py +0 -0
  40. {datamule-2.2.1 → datamule-2.2.3}/datamule/sec/__init__.py +0 -0
  41. {datamule-2.2.1 → datamule-2.2.3}/datamule/sec/infrastructure/__init__.py +0 -0
  42. {datamule-2.2.1 → datamule-2.2.3}/datamule/sec/infrastructure/submissions_metadata.py +0 -0
  43. {datamule-2.2.1 → datamule-2.2.3}/datamule/sec/submissions/__init__.py +0 -0
  44. {datamule-2.2.1 → datamule-2.2.3}/datamule/sec/submissions/downloader.py +0 -0
  45. {datamule-2.2.1 → datamule-2.2.3}/datamule/sec/submissions/eftsquery.py +0 -0
  46. {datamule-2.2.1 → datamule-2.2.3}/datamule/sec/submissions/monitor.py +0 -0
  47. {datamule-2.2.1 → datamule-2.2.3}/datamule/sec/submissions/streamer.py +0 -0
  48. {datamule-2.2.1 → datamule-2.2.3}/datamule/sec/submissions/textsearch.py +0 -0
  49. {datamule-2.2.1 → datamule-2.2.3}/datamule/sec/utils.py +0 -0
  50. {datamule-2.2.1 → datamule-2.2.3}/datamule/sec/xbrl/__init__.py +0 -0
  51. {datamule-2.2.1 → datamule-2.2.3}/datamule/sec/xbrl/downloadcompanyfacts.py +0 -0
  52. {datamule-2.2.1 → datamule-2.2.3}/datamule/sec/xbrl/filter_xbrl.py +0 -0
  53. {datamule-2.2.1 → datamule-2.2.3}/datamule/sec/xbrl/streamcompanyfacts.py +0 -0
  54. {datamule-2.2.1 → datamule-2.2.3}/datamule/sec/xbrl/xbrlmonitor.py +0 -0
  55. {datamule-2.2.1 → datamule-2.2.3}/datamule/seclibrary/__init__.py +0 -0
  56. {datamule-2.2.1 → datamule-2.2.3}/datamule/seclibrary/bq.py +0 -0
  57. {datamule-2.2.1/datamule/tags → datamule-2.2.3/datamule/sentiment}/__init__.py +0 -0
  58. {datamule-2.2.1 → datamule-2.2.3}/datamule/sheet.py +0 -0
  59. {datamule-2.2.1 → datamule-2.2.3}/datamule/submission.py +0 -0
  60. {datamule-2.2.1/datamule/utils → datamule-2.2.3/datamule/tags}/__init__.py +0 -0
  61. {datamule-2.2.1 → datamule-2.2.3}/datamule/tags/regex.py +0 -0
  62. {datamule-2.2.1 → datamule-2.2.3}/datamule/utils/construct_submissions_data.py +0 -0
  63. {datamule-2.2.1 → datamule-2.2.3}/datamule/utils/format_accession.py +0 -0
  64. {datamule-2.2.1 → datamule-2.2.3}/datamule.egg-info/dependency_links.txt +0 -0
  65. {datamule-2.2.1 → datamule-2.2.3}/datamule.egg-info/requires.txt +0 -0
  66. {datamule-2.2.1 → datamule-2.2.3}/datamule.egg-info/top_level.txt +0 -0
  67. {datamule-2.2.1 → datamule-2.2.3}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 2.2.1
+Version: 2.2.3
 Summary: Work with SEC submissions at scale.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman
@@ -13,11 +13,47 @@ from pathlib import Path
 import webbrowser
 from secsgml.utils import bytes_to_str
 import tempfile
-import warnings
 from .tables.tables import Tables
 
-from ..tags.utils import get_cusip_using_regex, get_isin_using_regex, get_figi_using_regex,get_all_tickers, get_full_names,get_full_names_dictionary_lookup
+from ..tags.utils import get_cusip_using_regex, get_isin_using_regex, get_figi_using_regex,get_all_tickers, get_full_names,get_full_names_dictionary_lookup, analyze_lm_sentiment_fragment
 
+class DataWithTags(dict):
+    def __init__(self, data, document):
+        super().__init__(data)
+        self._document = document
+        self._tags = None
+
+    @property
+    def tags(self):
+        if self._tags is None:
+            self._tags = Tags(self._document, mode='data') # New fragment-based behavior
+        return self._tags
+
+    @property
+    def similarity(self):
+        if not hasattr(self, '_similarity'):
+            self._similarity = Similarity(self._document, mode='data')
+        return self._similarity
+
+class TextWithTags(str):
+    def __new__(cls, content, document):
+        instance = str.__new__(cls, content)
+        instance._document = document
+        instance._tags = None
+        return instance
+
+    @property
+    def tags(self):
+        if self._tags is None:
+            self._tags = Tags(self._document, mode='text') # Original behavior
+        return self._tags
+
+    @property
+    def similarity(self):
+        if not hasattr(self, '_similarity'):
+            self._similarity = Similarity(self._document, mode='text')
+        return self._similarity
+
 
 class Tickers:
     def __init__(self, document):
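
Taken together, the two wrappers let Document.data stay a real dict and Document.text stay a real str while adding lazy, cached .tags and .similarity accessors. A minimal usage sketch, assuming a doc built with the Document.__init__ signature shown further down (all argument values hypothetical):

    doc = Document(type='8-K', content=raw_content, extension='.htm',
                   accession='0000000000-24-000000', filing_date='2024-01-02')

    text = doc.text              # TextWithTags: still a plain str
    text.tags                    # lazily builds Tags(doc, mode='text') and caches it
    text.tags is text.tags       # True: constructed once per instance

    data = doc.data              # DataWithTags: still a plain dict
    data.tags                    # lazily builds Tags(doc, mode='data')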
@@ -27,11 +63,7 @@ class Tickers:
     def _get_tickers_data(self):
         """Get all tickers data once and cache it"""
         if self._tickers_data is None:
-            # Check if document extension is supported
-            if self.document.extension not in ['.htm', '.html', '.txt']:
-                self._tickers_data = {}
-            else:
-                self._tickers_data = get_all_tickers(self.document.text)
+            self._tickers_data = get_all_tickers(self.document.text)
         return self._tickers_data
 
     def __getattr__(self, exchange_name):
@@ -57,14 +89,14 @@ class Tickers:
         data = self._get_tickers_data()
         return str(data)
 
-class Tags:
-    def __init__(self, document):
+class TextAnalysisBase:
+    def __init__(self, document, mode='text'):
         from ..tags.config import _active_dictionaries,_loaded_dictionaries
-        self.not_supported = document.extension not in ['.htm', '.html', '.txt']
         self.document = document
-        self._tickers = None
+        self.mode = mode # 'text' or 'data'
         self.dictionaries = {}
         self.processors = {}
+        self._text_sources = None
 
         # Load global dictionaries with their data and processors
         active_dicts = _active_dictionaries
@@ -73,76 +105,166 @@
             self.dictionaries[dict_name] = dict_info['data']
             if dict_info['processor'] is not None:
                 self.processors[dict_name] = dict_info['processor']
-
 
-    def _check_support(self):
-        if self.not_supported:
-            warnings.warn(f"Document extension '{self.document.extension}' is not supported. Supported formats: .htm, .html, .txt")
-            return False
-        return True
+    def _get_text_sources(self):
+        """Get text sources based on mode - either single text or multiple fragments"""
+        if self._text_sources is None:
+            if self.mode == 'text':
+                # Original behavior - single text source
+                self._text_sources = [{'id': None, 'text': str(self.document.text)}]
+            else: # mode == 'data'
+                # New behavior - multiple text fragments
+                self._text_sources = []
+                self._extract_text_fragments(self.document.data, '')
+        return self._text_sources
+
+    def _extract_text_fragments(self, data, parent_id=''):
+        """Extract all text fragments with their document IDs from parsed data"""
+        if isinstance(data, dict):
+            for key, value in data.items():
+                if key in ["text", "title"] and isinstance(value, str):
+                    # Use the current dictionary's parent key as the fragment ID
+                    self._text_sources.append({
+                        'id': parent_id,
+                        'text': value
+                    })
+                elif isinstance(value, (dict, list)):
+                    # Pass the current key as the parent_id for the next level
+                    self._extract_text_fragments(value, key)
+        elif isinstance(data, list):
+            for i, item in enumerate(data):
+                if isinstance(item, (dict, list)):
+                    self._extract_text_fragments(item, parent_id)
+
+    def _format_results(self, results, fragment_id):
+        """Format results based on mode"""
+        if self.mode == 'text':
+            # Original format: (match, start, end)
+            return results
+        else:
+            # New format: (match, fragment_id, start, end)
+            return [(match, fragment_id, start, end) for match, start, end in results]
+
+class Tags(TextAnalysisBase):
+    def __init__(self, document, mode='text'):
+        super().__init__(document, mode)
+        self._tickers = None
 
     @property
     def cusips(self):
-        if not self._check_support():
-            return None
+        if not hasattr(self, '_cusips'):
+            self._cusips = []
+            sources = self._get_text_sources()
 
-        if not hasattr(self, '_cusip'):
-            if 'sc13dg_cusips' in self.dictionaries:
-                keywords = self.dictionaries['sc13dg_cusips']
-                self._cusip = get_cusip_using_regex(self.document.text, keywords)
-            else:
-                self._cusip = get_cusip_using_regex(self.document.text)
-        return self._cusip
+            for source in sources:
+                if 'sc13dg_cusips' in self.dictionaries:
+                    keywords = self.dictionaries['sc13dg_cusips']
+                    results = get_cusip_using_regex(source['text'], keywords)
+                elif "13fhr_information_table_cusips" in self.dictionaries:
+                    keywords = self.dictionaries['13fhr_information_table_cusips']
+                    results = get_cusip_using_regex(source['text'], keywords)
+                else:
+                    results = get_cusip_using_regex(source['text'])
+
+                # Format results based on mode
+                formatted_results = self._format_results(results, source['id'])
+                self._cusips.extend(formatted_results)
+
+        return self._cusips
 
     @property
     def isins(self):
-        if not self._check_support():
-            return None
+        if not hasattr(self, '_isins'):
+            self._isins = []
+            sources = self._get_text_sources()
 
-        if not hasattr(self, '_isin'):
-            if 'npx_isins' in self.dictionaries:
-                keywords = self.dictionaries['npx_isins']
-                self._isin = get_isin_using_regex(self.document.text, keywords)
-            else:
-                self._isin = get_isin_using_regex(self.document.text)
-        return self._isin
+            for source in sources:
+                if 'npx_isins' in self.dictionaries:
+                    keywords = self.dictionaries['npx_isins']
+                    results = get_isin_using_regex(source['text'], keywords)
+                else:
+                    results = get_isin_using_regex(source['text'])
+
+                formatted_results = self._format_results(results, source['id'])
+                self._isins.extend(formatted_results)
+
+        return self._isins
 
     @property
     def figis(self):
-        if not self._check_support():
-            return None
+        if not hasattr(self, '_figis'):
+            self._figis = []
+            sources = self._get_text_sources()
 
-        if not hasattr(self, '_figi'):
-            if 'npx_figis' in self.dictionaries:
-                keywords = self.dictionaries['npx_figis']
-                self._figi = get_figi_using_regex(self.document.text, keywords)
-            else:
-                self._figi = get_figi_using_regex(self.document.text)
-        return self._figi
+            for source in sources:
+                if 'npx_figis' in self.dictionaries:
+                    keywords = self.dictionaries['npx_figis']
+                    results = get_figi_using_regex(source['text'], keywords)
+                else:
+                    results = get_figi_using_regex(source['text'])
+
+                formatted_results = self._format_results(results, source['id'])
+                self._figis.extend(formatted_results)
+
+        return self._figis
 
     @property
     def tickers(self):
+        # Tickers work differently - they need the full document context
+        # Keep original behavior for now
         if self._tickers is None:
             self._tickers = Tickers(self.document)
         return self._tickers
 
     @property
     def persons(self):
-        if not self._check_support():
-            return None
-
         if not hasattr(self, '_persons'):
-            if '8k_2024_persons' in self.processors:
-                # Use pre-built processor
-                self._persons = get_full_names_dictionary_lookup(self.document.text, self.processors['8k_2024_persons'])
-            elif 'ssa_baby_first_names' in self.dictionaries:
-                # Use regex with SSA names for validation
-                self._persons = get_full_names(self.document.text, self.dictionaries['ssa_baby_first_names'])
-            else:
-                # Fallback to regex without validation
-                self._persons = get_full_names(self.document.text)
+            self._persons = []
+            sources = self._get_text_sources()
+
+            for source in sources:
+                if '8k_2024_persons' in self.processors:
+                    results = get_full_names_dictionary_lookup(source['text'], self.processors['8k_2024_persons'])
+                elif 'ssa_baby_first_names' in self.dictionaries:
+                    results = get_full_names(source['text'], self.dictionaries['ssa_baby_first_names'])
+                else:
+                    results = get_full_names(source['text'])
+
+                formatted_results = self._format_results(results, source['id'])
+                self._persons.extend(formatted_results)
+
         return self._persons
-
+
+class Similarity(TextAnalysisBase):
+    @property
+    def loughran_mcdonald(self):
+        if not hasattr(self, '_loughran_mcdonald'):
+            self._loughran_mcdonald = []
+            sources = self._get_text_sources()
+
+            if 'loughran_mcdonald' in self.processors:
+                lm_processors = self.processors['loughran_mcdonald']
+
+                for source in sources:
+                    results = analyze_lm_sentiment_fragment(source['text'], lm_processors)
+
+                    if self.mode == 'text':
+                        # Single result for whole document
+                        self._loughran_mcdonald = results
+                        break
+                    else:
+                        # Per-fragment results with fragment_id
+                        fragment_result = {
+                            'fragment_id': source['id'],
+                            **results
+                        }
+                        self._loughran_mcdonald.append(fragment_result)
+            else:
+                # No processors available
+                self._loughran_mcdonald = [] if self.mode == 'data' else {}
+
+        return self._loughran_mcdonald
+
 
 class Document:
     def __init__(self, type, content, extension,accession,filing_date,path=None):
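
The practical effect of _format_results: 'text' mode keeps the original 3-tuples, while 'data' mode widens each hit with the fragment's parent key; Similarity.loughran_mcdonald likewise returns a single dict in 'text' mode and a list of per-fragment dicts in 'data' mode. An illustration with invented values (the CUSIP, offsets, fragment id 'item_1', and counts are not from any real filing):

    doc.text.tags.cusips
    # [('037833100', 1204, 1213)]          # (match, start, end)

    doc.data.tags.cusips
    # [('037833100', 'item_1', 88, 97)]    # (match, fragment_id, start, end)

    doc.data.similarity.loughran_mcdonald
    # [{'fragment_id': 'item_1', 'negative': 4, 'positive': 1,
    #   'uncertainty': 2, 'litigious': 0, 'strong_modal': 0,
    #   'weak_modal': 1, 'constraining': 0, 'total_words': 312}]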
@@ -168,8 +290,6 @@ class Document:
         self._tables = None
         self._text = None
 
-        self.tags = Tags(self)
-
 
 
     #_load_text_content
@@ -354,15 +474,26 @@
     def data(self):
         if self._data is None:
             self.parse()
+
+        if self._data is None:
+            self._data = {}
+
+        if not isinstance(self._data, DataWithTags):
+            self._data = DataWithTags(self._data, self)
+
         return self._data
 
     @property
     def text(self):
         if self._text is None:
             if self.extension in ['.htm','.html']:
-                self._preprocess_html_content()
+                self._preprocess_html_content() # Still sets self._text to plain string
             elif self.extension == '.txt':
-                self._preprocess_txt_content()
+                self._preprocess_txt_content() # Still sets self._text to plain string
+
+            # Convert the plain string to TextWithTags
+            plain_text = self._text
+            self._text = TextWithTags(plain_text, self)
         return self._text
 
     def write_json(self, output_filename=None):
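
Because the wrapping happens inside the properties, existing callers that treat doc.text as a str or doc.data as a dict are unaffected; only the new attributes are added. For example (keys and search string hypothetical):

    doc.text.lower().count('risk')   # ordinary str methods still work
    list(doc.data.keys())            # ordinary dict methods still work
    doc.text.tags.persons            # new: whole-document tagging
    doc.data.tags.persons            # new: per-fragment tagging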
@@ -1,4 +1,4 @@
-from ..utils.dictionaries import download_dictionary, load_dictionary
+from .dictionaries import download_dictionary, load_dictionary
 
 _active_dictionaries = []
 _loaded_dictionaries = {}
@@ -26,6 +26,14 @@ def set_dictionaries(dictionaries, overwrite=False):
                 'data': raw_data,
                 'processor': processor
             }
+        elif dict_name == 'loughran_mcdonald':
+            from .utils import create_lm_processors
+            processors = create_lm_processors(raw_data)
+
+            _loaded_dictionaries[dict_name] = {
+                'data': raw_data,
+                'processor': processors
+            }
         else:
             _loaded_dictionaries[dict_name] = {
                 'data': raw_data,
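
With this branch in place, activating the dictionary both downloads it (via load_dictionary) and eagerly builds the per-category keyword processors. A sketch of the activation call, using the set_dictionaries(dictionaries, overwrite=False) signature from the hunk header above:

    from datamule.tags.config import set_dictionaries

    # Fetches loughran_mcdonald.csv on first use, then stores
    # create_lm_processors(raw_data) under 'processor'.
    set_dictionaries(['loughran_mcdonald'])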
@@ -1,16 +1,19 @@
 from pathlib import Path
 import urllib.request
 import json
+import csv
 urls = {
     "ssa_baby_first_names": "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/ssa_baby_first_names.txt",
     "npx_figis" : "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/npx_figis.txt",
     "npx_isins" : "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/npx_isins.txt",
     "sc13dg_cusips" : "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/sc13dg_cusips.txt",
-    "8k_2024_persons" : "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/8k_2024_persons.json"
+    "8k_2024_persons" : "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/8k_2024_persons.json",
+    "13fhr_information_table_cusips" : "https://raw.githubusercontent.com/john-friedman/datamule-data/refs/heads/master/data/dictionaries/13fhr_information_table_cusips.txt",
+    "loughran_mcdonald" : "https://drive.usercontent.google.com/u/0/uc?id=1cfg_w3USlRFS97wo7XQmYnuzhpmzboAY&export=download"
 }
 
 
-def download_dictionary(name,overwrite=False):
+def download_dictionary(name, overwrite=False):
     url = urls[name]
 
     # Create dictionaries directory in datamule folder
@@ -19,13 +22,19 @@ def download_dictionary(name,overwrite=False):
 
     # check if file exists first
     if not overwrite:
-        filename = url.split('/')[-1]
+        if name == "loughran_mcdonald":
+            filename = "loughran_mcdonald.csv"
+        else:
+            filename = url.split('/')[-1]
         file_path = dict_dir / filename
         if file_path.exists():
             return
 
     # Extract filename from URL
-    filename = url.split('/')[-1]
+    if name == "loughran_mcdonald":
+        filename = "loughran_mcdonald.csv"
+    else:
+        filename = url.split('/')[-1]
     file_path = dict_dir / filename
 
     print(f"Downloading {name} dictionary to {file_path}")
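
The special case exists because the Google Drive URL keeps its payload in the query string, so the generic basename rule would yield an unusable filename:

    urls['loughran_mcdonald'].split('/')[-1]
    # 'uc?id=1cfg_w3USlRFS97wo7XQmYnuzhpmzboAY&export=download'
    # -> hence the pinned name 'loughran_mcdonald.csv'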
@@ -35,7 +44,11 @@ def download_dictionary(name,overwrite=False):
 def load_dictionary(name):
     # Get or download the dictionary file
     dict_dir = Path.home() / ".datamule" / "dictionaries"
-    filename = urls[name].split('/')[-1]
+
+    if name == "loughran_mcdonald":
+        filename = "loughran_mcdonald.csv"
+    else:
+        filename = urls[name].split('/')[-1]
     file_path = dict_dir / filename
 
     # Download if doesn't exist
@@ -67,10 +80,38 @@ def load_dictionary(name):
         for line in f:
             cusip_set.add(line.strip())
         return cusip_set
+    elif name == "13fhr_information_table_cusips":
+        cusip_set = set()
+        with open(file_path, 'r', encoding='utf-8') as f:
+            for line in f:
+                cusip_set.add(line.strip())
+        return cusip_set
     elif name == "8k_2024_persons":
-
         with open(file_path, 'r', encoding='utf-8') as f:
             persons_list = json.load(f)
         return persons_list
+    elif name == "loughran_mcdonald":
+        # Load the Loughran-McDonald dictionary using base Python CSV
+        lm_dict = {}
+        categories = ['Negative', 'Positive', 'Uncertainty', 'Litigious',
+                      'Strong_Modal', 'Weak_Modal', 'Constraining']
+
+        # Initialize category sets
+        for category in categories:
+            lm_dict[category.lower()] = set()
+
+        with open(file_path, 'r', encoding='utf-8') as f:
+            reader = csv.DictReader(f)
+            for row in reader:
+                word = row['Word'].lower()
+                for category in categories:
+                    value = row.get(category)
+                    # Check if value exists and is not 0 (words added in specific years)
+                    if value and str(value).strip() != '0':
+                        lm_dict[category.lower()].add(word)
+
+        return lm_dict
     else:
-        raise ValueError("dictionary not found")
+        raise ValueError("dictionary not found")
+
+download_dictionary('loughran_mcdonald')
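
The loader therefore returns lowercase category names mapped to plain word sets, keeping a word whenever its category column is non-empty and non-zero (the LM master file records the year a word joined a list). Shape of the result, with an illustrative membership check:

    lm_dict = load_dictionary('loughran_mcdonald')
    sorted(lm_dict)
    # ['constraining', 'litigious', 'negative', 'positive',
    #  'strong_modal', 'uncertainty', 'weak_modal']
    'loss' in lm_dict['negative']   # plain set membership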
@@ -142,4 +142,31 @@ def get_full_names_dictionary_lookup(text, processor):
     for keyword, start_pos, end_pos in keywords_found:
         matches.append((keyword, start_pos, end_pos))
 
-    return matches
+    return matches
+
+
+def create_lm_processors(lm_dict):
+    processors = {}
+
+    for category_key, word_set in lm_dict.items():
+        processor = KeywordProcessor(case_sensitive=False)
+        for word in word_set:
+            processor.add_keyword(word)
+        processors[category_key] = processor
+
+    return processors
+
+def analyze_lm_sentiment_fragment(text, processors):
+    """Analyze sentiment for a single text fragment"""
+    if not text or not text.strip():
+        return {}
+
+    word_count = len(text.split())
+    results = {}
+
+    for category, processor in processors.items():
+        matches = processor.extract_keywords(text.lower(), span_info=True)
+        results[category] = len(matches)
+
+    results['total_words'] = word_count
+    return results
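
End to end, create_lm_processors turns each category's word set into a keyword matcher and analyze_lm_sentiment_fragment counts hits per fragment. A self-contained toy run, assuming KeywordProcessor here is flashtext's (consistent with the case_sensitive and span_info arguments used above); the two-word lists stand in for the real LM dictionary:

    from flashtext import KeywordProcessor

    toy_lm = {'negative': {'loss', 'decline'}, 'positive': {'gain'}}
    processors = create_lm_processors(toy_lm)

    analyze_lm_sentiment_fragment("Revenue gain offset a one-time loss.", processors)
    # {'negative': 1, 'positive': 1, 'total_words': 6}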
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 2.2.1
+Version: 2.2.3
 Summary: Work with SEC submissions at scale.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman
@@ -54,11 +54,12 @@ datamule/sec/xbrl/streamcompanyfacts.py
 datamule/sec/xbrl/xbrlmonitor.py
 datamule/seclibrary/__init__.py
 datamule/seclibrary/bq.py
+datamule/sentiment/__init__.py
 datamule/tags/__init__.py
 datamule/tags/config.py
+datamule/tags/dictionaries.py
 datamule/tags/regex.py
 datamule/tags/utils.py
 datamule/utils/__init__.py
 datamule/utils/construct_submissions_data.py
-datamule/utils/dictionaries.py
 datamule/utils/format_accession.py
@@ -32,7 +32,7 @@ if not os.path.exists(file_path):
 setup(
     name="datamule",
     author="John Friedman",
-    version="2.2.1",
+    version="2.2.3",
     description="Work with SEC submissions at scale.",
     packages=find_packages(include=['datamule', 'datamule.*']),
     url="https://github.com/john-friedman/datamule-python",