datamule 2.2.2__tar.gz → 2.2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. {datamule-2.2.2 → datamule-2.2.4}/PKG-INFO +1 -1
  2. {datamule-2.2.2 → datamule-2.2.4}/datamule/datamule/downloader.py +1 -1
  3. {datamule-2.2.2 → datamule-2.2.4}/datamule/document/document.py +51 -4
  4. {datamule-2.2.2 → datamule-2.2.4}/datamule/tags/config.py +9 -1
  5. {datamule-2.2.2/datamule/utils → datamule-2.2.4/datamule/tags}/dictionaries.py +41 -7
  6. {datamule-2.2.2 → datamule-2.2.4}/datamule/tags/utils.py +28 -1
  7. datamule-2.2.4/datamule/utils/__init__.py +0 -0
  8. {datamule-2.2.2 → datamule-2.2.4}/datamule.egg-info/PKG-INFO +1 -1
  9. {datamule-2.2.2 → datamule-2.2.4}/datamule.egg-info/SOURCES.txt +2 -1
  10. {datamule-2.2.2 → datamule-2.2.4}/setup.py +1 -1
  11. {datamule-2.2.2 → datamule-2.2.4}/datamule/__init__.py +0 -0
  12. {datamule-2.2.2 → datamule-2.2.4}/datamule/config.py +0 -0
  13. {datamule-2.2.2 → datamule-2.2.4}/datamule/data/listed_filer_metadata.csv +0 -0
  14. {datamule-2.2.2 → datamule-2.2.4}/datamule/datamule/__init__.py +0 -0
  15. {datamule-2.2.2 → datamule-2.2.4}/datamule/datamule/datamule_lookup.py +0 -0
  16. {datamule-2.2.2 → datamule-2.2.4}/datamule/datamule/datamule_mysql_rds.py +0 -0
  17. {datamule-2.2.2 → datamule-2.2.4}/datamule/datamule/sec_connector.py +0 -0
  18. {datamule-2.2.2 → datamule-2.2.4}/datamule/datasets.py +0 -0
  19. {datamule-2.2.2 → datamule-2.2.4}/datamule/document/__init__.py +0 -0
  20. {datamule-2.2.2 → datamule-2.2.4}/datamule/document/tables/__init__.py +0 -0
  21. {datamule-2.2.2 → datamule-2.2.4}/datamule/document/tables/tables.py +0 -0
  22. {datamule-2.2.2 → datamule-2.2.4}/datamule/document/tables/tables_13fhr.py +0 -0
  23. {datamule-2.2.2 → datamule-2.2.4}/datamule/document/tables/tables_25nse.py +0 -0
  24. {datamule-2.2.2 → datamule-2.2.4}/datamule/document/tables/tables_informationtable.py +0 -0
  25. {datamule-2.2.2 → datamule-2.2.4}/datamule/document/tables/tables_npx.py +0 -0
  26. {datamule-2.2.2 → datamule-2.2.4}/datamule/document/tables/tables_ownership.py +0 -0
  27. {datamule-2.2.2 → datamule-2.2.4}/datamule/document/tables/tables_proxyvotingrecord.py +0 -0
  28. {datamule-2.2.2 → datamule-2.2.4}/datamule/document/tables/tables_sbsef.py +0 -0
  29. {datamule-2.2.2 → datamule-2.2.4}/datamule/document/tables/tables_sdr.py +0 -0
  30. {datamule-2.2.2 → datamule-2.2.4}/datamule/document/tables/utils.py +0 -0
  31. {datamule-2.2.2 → datamule-2.2.4}/datamule/helper.py +0 -0
  32. {datamule-2.2.2 → datamule-2.2.4}/datamule/index.py +0 -0
  33. {datamule-2.2.2 → datamule-2.2.4}/datamule/mapping_dicts/__init__.py +0 -0
  34. {datamule-2.2.2 → datamule-2.2.4}/datamule/mapping_dicts/html_mapping_dicts.py +0 -0
  35. {datamule-2.2.2 → datamule-2.2.4}/datamule/mapping_dicts/txt_mapping_dicts.py +0 -0
  36. {datamule-2.2.2 → datamule-2.2.4}/datamule/mapping_dicts/xml_mapping_dicts.py +0 -0
  37. {datamule-2.2.2 → datamule-2.2.4}/datamule/package_updater.py +0 -0
  38. {datamule-2.2.2 → datamule-2.2.4}/datamule/portfolio.py +0 -0
  39. {datamule-2.2.2 → datamule-2.2.4}/datamule/portfolio_compression_utils.py +0 -0
  40. {datamule-2.2.2 → datamule-2.2.4}/datamule/sec/__init__.py +0 -0
  41. {datamule-2.2.2 → datamule-2.2.4}/datamule/sec/infrastructure/__init__.py +0 -0
  42. {datamule-2.2.2 → datamule-2.2.4}/datamule/sec/infrastructure/submissions_metadata.py +0 -0
  43. {datamule-2.2.2 → datamule-2.2.4}/datamule/sec/submissions/__init__.py +0 -0
  44. {datamule-2.2.2 → datamule-2.2.4}/datamule/sec/submissions/downloader.py +0 -0
  45. {datamule-2.2.2 → datamule-2.2.4}/datamule/sec/submissions/eftsquery.py +0 -0
  46. {datamule-2.2.2 → datamule-2.2.4}/datamule/sec/submissions/monitor.py +0 -0
  47. {datamule-2.2.2 → datamule-2.2.4}/datamule/sec/submissions/streamer.py +0 -0
  48. {datamule-2.2.2 → datamule-2.2.4}/datamule/sec/submissions/textsearch.py +0 -0
  49. {datamule-2.2.2 → datamule-2.2.4}/datamule/sec/utils.py +0 -0
  50. {datamule-2.2.2 → datamule-2.2.4}/datamule/sec/xbrl/__init__.py +0 -0
  51. {datamule-2.2.2 → datamule-2.2.4}/datamule/sec/xbrl/downloadcompanyfacts.py +0 -0
  52. {datamule-2.2.2 → datamule-2.2.4}/datamule/sec/xbrl/filter_xbrl.py +0 -0
  53. {datamule-2.2.2 → datamule-2.2.4}/datamule/sec/xbrl/streamcompanyfacts.py +0 -0
  54. {datamule-2.2.2 → datamule-2.2.4}/datamule/sec/xbrl/xbrlmonitor.py +0 -0
  55. {datamule-2.2.2 → datamule-2.2.4}/datamule/seclibrary/__init__.py +0 -0
  56. {datamule-2.2.2 → datamule-2.2.4}/datamule/seclibrary/bq.py +0 -0
  57. {datamule-2.2.2/datamule/tags → datamule-2.2.4/datamule/sentiment}/__init__.py +0 -0
  58. {datamule-2.2.2 → datamule-2.2.4}/datamule/sheet.py +0 -0
  59. {datamule-2.2.2 → datamule-2.2.4}/datamule/submission.py +0 -0
  60. {datamule-2.2.2/datamule/utils → datamule-2.2.4/datamule/tags}/__init__.py +0 -0
  61. {datamule-2.2.2 → datamule-2.2.4}/datamule/tags/regex.py +0 -0
  62. {datamule-2.2.2 → datamule-2.2.4}/datamule/utils/construct_submissions_data.py +0 -0
  63. {datamule-2.2.2 → datamule-2.2.4}/datamule/utils/format_accession.py +0 -0
  64. {datamule-2.2.2 → datamule-2.2.4}/datamule.egg-info/dependency_links.txt +0 -0
  65. {datamule-2.2.2 → datamule-2.2.4}/datamule.egg-info/requires.txt +0 -0
  66. {datamule-2.2.2 → datamule-2.2.4}/datamule.egg-info/top_level.txt +0 -0
  67. {datamule-2.2.2 → datamule-2.2.4}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamule
3
- Version: 2.2.2
3
+ Version: 2.2.4
4
4
  Summary: Work with SEC submissions at scale.
5
5
  Home-page: https://github.com/john-friedman/datamule-python
6
6
  Author: John Friedman
@@ -287,7 +287,7 @@ class Downloader:
287
287
  keepalive_timeout=60
288
288
  )
289
289
 
290
- async with aiohttp.ClientSession(connector=connector, timeout=aiohttp.ClientTimeout(total=30)) as session:
290
+ async with aiohttp.ClientSession(connector=connector, timeout=aiohttp.ClientTimeout(total=600)) as session:
291
291
  tasks = [
292
292
  self.download_and_process(
293
293
  session, url, semaphore, decompression_pool,
@@ -15,7 +15,7 @@ from secsgml.utils import bytes_to_str
15
15
  import tempfile
16
16
  from .tables.tables import Tables
17
17
 
18
- from ..tags.utils import get_cusip_using_regex, get_isin_using_regex, get_figi_using_regex,get_all_tickers, get_full_names,get_full_names_dictionary_lookup
18
+ from ..tags.utils import get_cusip_using_regex, get_isin_using_regex, get_figi_using_regex,get_all_tickers, get_full_names,get_full_names_dictionary_lookup, analyze_lm_sentiment_fragment
19
19
 
20
20
  class DataWithTags(dict):
21
21
  def __init__(self, data, document):
@@ -29,6 +29,12 @@ class DataWithTags(dict):
29
29
  self._tags = Tags(self._document, mode='data') # New fragment-based behavior
30
30
  return self._tags
31
31
 
32
+ @property
33
+ def similarity(self):
34
+ if not hasattr(self, '_similarity'):
35
+ self._similarity = Similarity(self._document, mode='data')
36
+ return self._similarity
37
+
32
38
  class TextWithTags(str):
33
39
  def __new__(cls, content, document):
34
40
  instance = str.__new__(cls, content)
@@ -42,6 +48,12 @@ class TextWithTags(str):
42
48
  self._tags = Tags(self._document, mode='text') # Original behavior
43
49
  return self._tags
44
50
 
51
+ @property
52
+ def similarity(self):
53
+ if not hasattr(self, '_similarity'):
54
+ self._similarity = Similarity(self._document, mode='text')
55
+ return self._similarity
56
+
45
57
 
46
58
  class Tickers:
47
59
  def __init__(self, document):
@@ -77,12 +89,11 @@ class Tickers:
77
89
  data = self._get_tickers_data()
78
90
  return str(data)
79
91
 
80
- class Tags:
92
+ class TextAnalysisBase:
81
93
  def __init__(self, document, mode='text'):
82
94
  from ..tags.config import _active_dictionaries,_loaded_dictionaries
83
95
  self.document = document
84
96
  self.mode = mode # 'text' or 'data'
85
- self._tickers = None
86
97
  self.dictionaries = {}
87
98
  self.processors = {}
88
99
  self._text_sources = None
@@ -133,6 +144,11 @@ class Tags:
133
144
  else:
134
145
  # New format: (match, fragment_id, start, end)
135
146
  return [(match, fragment_id, start, end) for match, start, end in results]
147
+
148
+ class Tags(TextAnalysisBase):
149
+ def __init__(self, document, mode='text'):
150
+ super().__init__(document, mode)
151
+ self._tickers = None
136
152
 
137
153
  @property
138
154
  def cusips(self):
@@ -218,7 +234,38 @@ class Tags:
218
234
  self._persons.extend(formatted_results)
219
235
 
220
236
  return self._persons
221
-
237
+
238
+ class Similarity(TextAnalysisBase):
239
+ @property
240
+ def loughran_mcdonald(self):
241
+ if not hasattr(self, '_loughran_mcdonald'):
242
+ self._loughran_mcdonald = []
243
+ sources = self._get_text_sources()
244
+
245
+ if 'loughran_mcdonald' in self.processors:
246
+ lm_processors = self.processors['loughran_mcdonald']
247
+
248
+ for source in sources:
249
+ results = analyze_lm_sentiment_fragment(source['text'], lm_processors)
250
+
251
+ if self.mode == 'text':
252
+ # Single result for whole document
253
+ self._loughran_mcdonald = results
254
+ break
255
+ else:
256
+ # Per-fragment results with fragment_id
257
+ fragment_result = {
258
+ 'fragment_id': source['id'],
259
+ **results
260
+ }
261
+ self._loughran_mcdonald.append(fragment_result)
262
+ else:
263
+ # No processors available
264
+ self._loughran_mcdonald = [] if self.mode == 'data' else {}
265
+
266
+ return self._loughran_mcdonald
267
+
268
+
222
269
  class Document:
223
270
  def __init__(self, type, content, extension,accession,filing_date,path=None):
224
271
 
@@ -1,4 +1,4 @@
1
- from ..utils.dictionaries import download_dictionary, load_dictionary
1
+ from .dictionaries import download_dictionary, load_dictionary
2
2
 
3
3
  _active_dictionaries = []
4
4
  _loaded_dictionaries = {}
@@ -26,6 +26,14 @@ def set_dictionaries(dictionaries, overwrite=False):
26
26
  'data': raw_data,
27
27
  'processor': processor
28
28
  }
29
+ elif dict_name == 'loughran_mcdonald':
30
+ from .utils import create_lm_processors
31
+ processors = create_lm_processors(raw_data)
32
+
33
+ _loaded_dictionaries[dict_name] = {
34
+ 'data': raw_data,
35
+ 'processor': processors
36
+ }
29
37
  else:
30
38
  _loaded_dictionaries[dict_name] = {
31
39
  'data': raw_data,
@@ -1,17 +1,19 @@
1
1
  from pathlib import Path
2
2
  import urllib.request
3
3
  import json
4
+ import csv
4
5
  urls = {
5
6
  "ssa_baby_first_names": "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/ssa_baby_first_names.txt",
6
7
  "npx_figis" : "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/npx_figis.txt",
7
8
  "npx_isins" : "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/npx_isins.txt",
8
9
  "sc13dg_cusips" : "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/sc13dg_cusips.txt",
9
10
  "8k_2024_persons" : "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/8k_2024_persons.json",
10
- "13fhr_information_table_cusips" : "https://raw.githubusercontent.com/john-friedman/datamule-data/refs/heads/master/data/dictionaries/13fhr_information_table_cusips.txt"
11
+ "13fhr_information_table_cusips" : "https://raw.githubusercontent.com/john-friedman/datamule-data/refs/heads/master/data/dictionaries/13fhr_information_table_cusips.txt",
12
+ "loughran_mcdonald" : "https://drive.usercontent.google.com/u/0/uc?id=1cfg_w3USlRFS97wo7XQmYnuzhpmzboAY&export=download"
11
13
  }
12
14
 
13
15
 
14
- def download_dictionary(name,overwrite=False):
16
+ def download_dictionary(name, overwrite=False):
15
17
  url = urls[name]
16
18
 
17
19
  # Create dictionaries directory in datamule folder
@@ -20,13 +22,19 @@ def download_dictionary(name,overwrite=False):
20
22
 
21
23
  # check if file exists first
22
24
  if not overwrite:
23
- filename = url.split('/')[-1]
25
+ if name == "loughran_mcdonald":
26
+ filename = "loughran_mcdonald.csv"
27
+ else:
28
+ filename = url.split('/')[-1]
24
29
  file_path = dict_dir / filename
25
30
  if file_path.exists():
26
31
  return
27
32
 
28
33
  # Extract filename from URL
29
- filename = url.split('/')[-1]
34
+ if name == "loughran_mcdonald":
35
+ filename = "loughran_mcdonald.csv"
36
+ else:
37
+ filename = url.split('/')[-1]
30
38
  file_path = dict_dir / filename
31
39
 
32
40
  print(f"Downloading {name} dictionary to {file_path}")
@@ -36,7 +44,11 @@ def download_dictionary(name,overwrite=False):
36
44
  def load_dictionary(name):
37
45
  # Get or download the dictionary file
38
46
  dict_dir = Path.home() / ".datamule" / "dictionaries"
39
- filename = urls[name].split('/')[-1]
47
+
48
+ if name == "loughran_mcdonald":
49
+ filename = "loughran_mcdonald.csv"
50
+ else:
51
+ filename = urls[name].split('/')[-1]
40
52
  file_path = dict_dir / filename
41
53
 
42
54
  # Download if doesn't exist
@@ -75,9 +87,31 @@ def load_dictionary(name):
75
87
  cusip_set.add(line.strip())
76
88
  return cusip_set
77
89
  elif name == "8k_2024_persons":
78
-
79
90
  with open(file_path, 'r', encoding='utf-8') as f:
80
91
  persons_list = json.load(f)
81
92
  return persons_list
93
+ elif name == "loughran_mcdonald":
94
+ # Load the Loughran-McDonald dictionary using base Python CSV
95
+ lm_dict = {}
96
+ categories = ['Negative', 'Positive', 'Uncertainty', 'Litigious',
97
+ 'Strong_Modal', 'Weak_Modal', 'Constraining']
98
+
99
+ # Initialize category sets
100
+ for category in categories:
101
+ lm_dict[category.lower()] = set()
102
+
103
+ with open(file_path, 'r', encoding='utf-8') as f:
104
+ reader = csv.DictReader(f)
105
+ for row in reader:
106
+ word = row['Word'].lower()
107
+ for category in categories:
108
+ value = row.get(category)
109
+ # Check if value exists and is not 0 (words added in specific years)
110
+ if value and str(value).strip() != '0':
111
+ lm_dict[category.lower()].add(word)
112
+
113
+ return lm_dict
82
114
  else:
83
- raise ValueError("dictionary not found")
115
+ raise ValueError("dictionary not found")
116
+
117
+ download_dictionary('loughran_mcdonald')
@@ -142,4 +142,31 @@ def get_full_names_dictionary_lookup(text, processor):
142
142
  for keyword, start_pos, end_pos in keywords_found:
143
143
  matches.append((keyword, start_pos, end_pos))
144
144
 
145
- return matches
145
+ return matches
146
+
147
+
148
+ def create_lm_processors(lm_dict):
149
+ processors = {}
150
+
151
+ for category_key, word_set in lm_dict.items():
152
+ processor = KeywordProcessor(case_sensitive=False)
153
+ for word in word_set:
154
+ processor.add_keyword(word)
155
+ processors[category_key] = processor
156
+
157
+ return processors
158
+
159
+ def analyze_lm_sentiment_fragment(text, processors):
160
+ """Analyze sentiment for a single text fragment"""
161
+ if not text or not text.strip():
162
+ return {}
163
+
164
+ word_count = len(text.split())
165
+ results = {}
166
+
167
+ for category, processor in processors.items():
168
+ matches = processor.extract_keywords(text.lower(), span_info=True)
169
+ results[category] = len(matches)
170
+
171
+ results['total_words'] = word_count
172
+ return results
File without changes
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamule
3
- Version: 2.2.2
3
+ Version: 2.2.4
4
4
  Summary: Work with SEC submissions at scale.
5
5
  Home-page: https://github.com/john-friedman/datamule-python
6
6
  Author: John Friedman
@@ -54,11 +54,12 @@ datamule/sec/xbrl/streamcompanyfacts.py
54
54
  datamule/sec/xbrl/xbrlmonitor.py
55
55
  datamule/seclibrary/__init__.py
56
56
  datamule/seclibrary/bq.py
57
+ datamule/sentiment/__init__.py
57
58
  datamule/tags/__init__.py
58
59
  datamule/tags/config.py
60
+ datamule/tags/dictionaries.py
59
61
  datamule/tags/regex.py
60
62
  datamule/tags/utils.py
61
63
  datamule/utils/__init__.py
62
64
  datamule/utils/construct_submissions_data.py
63
- datamule/utils/dictionaries.py
64
65
  datamule/utils/format_accession.py
@@ -32,7 +32,7 @@ if not os.path.exists(file_path):
32
32
  setup(
33
33
  name="datamule",
34
34
  author="John Friedman",
35
- version="2.2.2",
35
+ version="2.2.4",
36
36
  description="Work with SEC submissions at scale.",
37
37
  packages=find_packages(include=['datamule', 'datamule.*']),
38
38
  url="https://github.com/john-friedman/datamule-python",
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes