datamule 2.2.0__tar.gz → 2.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. {datamule-2.2.0 → datamule-2.2.2}/PKG-INFO +1 -1
  2. {datamule-2.2.0 → datamule-2.2.2}/datamule/document/document.py +148 -60
  3. datamule-2.2.2/datamule/tags/config.py +33 -0
  4. {datamule-2.2.0 → datamule-2.2.2}/datamule/tags/utils.py +3 -7
  5. {datamule-2.2.0 → datamule-2.2.2}/datamule/utils/dictionaries.py +8 -1
  6. {datamule-2.2.0 → datamule-2.2.2}/datamule.egg-info/PKG-INFO +1 -1
  7. {datamule-2.2.0 → datamule-2.2.2}/setup.py +1 -1
  8. datamule-2.2.0/datamule/tags/config.py +0 -16
  9. {datamule-2.2.0 → datamule-2.2.2}/datamule/__init__.py +0 -0
  10. {datamule-2.2.0 → datamule-2.2.2}/datamule/config.py +0 -0
  11. {datamule-2.2.0 → datamule-2.2.2}/datamule/data/listed_filer_metadata.csv +0 -0
  12. {datamule-2.2.0 → datamule-2.2.2}/datamule/datamule/__init__.py +0 -0
  13. {datamule-2.2.0 → datamule-2.2.2}/datamule/datamule/datamule_lookup.py +0 -0
  14. {datamule-2.2.0 → datamule-2.2.2}/datamule/datamule/datamule_mysql_rds.py +0 -0
  15. {datamule-2.2.0 → datamule-2.2.2}/datamule/datamule/downloader.py +0 -0
  16. {datamule-2.2.0 → datamule-2.2.2}/datamule/datamule/sec_connector.py +0 -0
  17. {datamule-2.2.0 → datamule-2.2.2}/datamule/datasets.py +0 -0
  18. {datamule-2.2.0 → datamule-2.2.2}/datamule/document/__init__.py +0 -0
  19. {datamule-2.2.0 → datamule-2.2.2}/datamule/document/tables/__init__.py +0 -0
  20. {datamule-2.2.0 → datamule-2.2.2}/datamule/document/tables/tables.py +0 -0
  21. {datamule-2.2.0 → datamule-2.2.2}/datamule/document/tables/tables_13fhr.py +0 -0
  22. {datamule-2.2.0 → datamule-2.2.2}/datamule/document/tables/tables_25nse.py +0 -0
  23. {datamule-2.2.0 → datamule-2.2.2}/datamule/document/tables/tables_informationtable.py +0 -0
  24. {datamule-2.2.0 → datamule-2.2.2}/datamule/document/tables/tables_npx.py +0 -0
  25. {datamule-2.2.0 → datamule-2.2.2}/datamule/document/tables/tables_ownership.py +0 -0
  26. {datamule-2.2.0 → datamule-2.2.2}/datamule/document/tables/tables_proxyvotingrecord.py +0 -0
  27. {datamule-2.2.0 → datamule-2.2.2}/datamule/document/tables/tables_sbsef.py +0 -0
  28. {datamule-2.2.0 → datamule-2.2.2}/datamule/document/tables/tables_sdr.py +0 -0
  29. {datamule-2.2.0 → datamule-2.2.2}/datamule/document/tables/utils.py +0 -0
  30. {datamule-2.2.0 → datamule-2.2.2}/datamule/helper.py +0 -0
  31. {datamule-2.2.0 → datamule-2.2.2}/datamule/index.py +0 -0
  32. {datamule-2.2.0 → datamule-2.2.2}/datamule/mapping_dicts/__init__.py +0 -0
  33. {datamule-2.2.0 → datamule-2.2.2}/datamule/mapping_dicts/html_mapping_dicts.py +0 -0
  34. {datamule-2.2.0 → datamule-2.2.2}/datamule/mapping_dicts/txt_mapping_dicts.py +0 -0
  35. {datamule-2.2.0 → datamule-2.2.2}/datamule/mapping_dicts/xml_mapping_dicts.py +0 -0
  36. {datamule-2.2.0 → datamule-2.2.2}/datamule/package_updater.py +0 -0
  37. {datamule-2.2.0 → datamule-2.2.2}/datamule/portfolio.py +0 -0
  38. {datamule-2.2.0 → datamule-2.2.2}/datamule/portfolio_compression_utils.py +0 -0
  39. {datamule-2.2.0 → datamule-2.2.2}/datamule/sec/__init__.py +0 -0
  40. {datamule-2.2.0 → datamule-2.2.2}/datamule/sec/infrastructure/__init__.py +0 -0
  41. {datamule-2.2.0 → datamule-2.2.2}/datamule/sec/infrastructure/submissions_metadata.py +0 -0
  42. {datamule-2.2.0 → datamule-2.2.2}/datamule/sec/submissions/__init__.py +0 -0
  43. {datamule-2.2.0 → datamule-2.2.2}/datamule/sec/submissions/downloader.py +0 -0
  44. {datamule-2.2.0 → datamule-2.2.2}/datamule/sec/submissions/eftsquery.py +0 -0
  45. {datamule-2.2.0 → datamule-2.2.2}/datamule/sec/submissions/monitor.py +0 -0
  46. {datamule-2.2.0 → datamule-2.2.2}/datamule/sec/submissions/streamer.py +0 -0
  47. {datamule-2.2.0 → datamule-2.2.2}/datamule/sec/submissions/textsearch.py +0 -0
  48. {datamule-2.2.0 → datamule-2.2.2}/datamule/sec/utils.py +0 -0
  49. {datamule-2.2.0 → datamule-2.2.2}/datamule/sec/xbrl/__init__.py +0 -0
  50. {datamule-2.2.0 → datamule-2.2.2}/datamule/sec/xbrl/downloadcompanyfacts.py +0 -0
  51. {datamule-2.2.0 → datamule-2.2.2}/datamule/sec/xbrl/filter_xbrl.py +0 -0
  52. {datamule-2.2.0 → datamule-2.2.2}/datamule/sec/xbrl/streamcompanyfacts.py +0 -0
  53. {datamule-2.2.0 → datamule-2.2.2}/datamule/sec/xbrl/xbrlmonitor.py +0 -0
  54. {datamule-2.2.0 → datamule-2.2.2}/datamule/seclibrary/__init__.py +0 -0
  55. {datamule-2.2.0 → datamule-2.2.2}/datamule/seclibrary/bq.py +0 -0
  56. {datamule-2.2.0 → datamule-2.2.2}/datamule/sheet.py +0 -0
  57. {datamule-2.2.0 → datamule-2.2.2}/datamule/submission.py +0 -0
  58. {datamule-2.2.0 → datamule-2.2.2}/datamule/tags/__init__.py +0 -0
  59. {datamule-2.2.0 → datamule-2.2.2}/datamule/tags/regex.py +0 -0
  60. {datamule-2.2.0 → datamule-2.2.2}/datamule/utils/__init__.py +0 -0
  61. {datamule-2.2.0 → datamule-2.2.2}/datamule/utils/construct_submissions_data.py +0 -0
  62. {datamule-2.2.0 → datamule-2.2.2}/datamule/utils/format_accession.py +0 -0
  63. {datamule-2.2.0 → datamule-2.2.2}/datamule.egg-info/SOURCES.txt +0 -0
  64. {datamule-2.2.0 → datamule-2.2.2}/datamule.egg-info/dependency_links.txt +0 -0
  65. {datamule-2.2.0 → datamule-2.2.2}/datamule.egg-info/requires.txt +0 -0
  66. {datamule-2.2.0 → datamule-2.2.2}/datamule.egg-info/top_level.txt +0 -0
  67. {datamule-2.2.0 → datamule-2.2.2}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamule
3
- Version: 2.2.0
3
+ Version: 2.2.2
4
4
  Summary: Work with SEC submissions at scale.
5
5
  Home-page: https://github.com/john-friedman/datamule-python
6
6
  Author: John Friedman
@@ -13,11 +13,35 @@ from pathlib import Path
13
13
  import webbrowser
14
14
  from secsgml.utils import bytes_to_str
15
15
  import tempfile
16
- import warnings
17
16
  from .tables.tables import Tables
18
17
 
19
18
  from ..tags.utils import get_cusip_using_regex, get_isin_using_regex, get_figi_using_regex,get_all_tickers, get_full_names,get_full_names_dictionary_lookup
20
19
 
20
+ class DataWithTags(dict):
21
+ def __init__(self, data, document):
22
+ super().__init__(data)
23
+ self._document = document
24
+ self._tags = None
25
+
26
+ @property
27
+ def tags(self):
28
+ if self._tags is None:
29
+ self._tags = Tags(self._document, mode='data') # New fragment-based behavior
30
+ return self._tags
31
+
32
+ class TextWithTags(str):
33
+ def __new__(cls, content, document):
34
+ instance = str.__new__(cls, content)
35
+ instance._document = document
36
+ instance._tags = None
37
+ return instance
38
+
39
+ @property
40
+ def tags(self):
41
+ if self._tags is None:
42
+ self._tags = Tags(self._document, mode='text') # Original behavior
43
+ return self._tags
44
+
21
45
 
22
46
  class Tickers:
23
47
  def __init__(self, document):
@@ -27,11 +51,7 @@ class Tickers:
27
51
  def _get_tickers_data(self):
28
52
  """Get all tickers data once and cache it"""
29
53
  if self._tickers_data is None:
30
- # Check if document extension is supported
31
- if self.document.extension not in ['.htm', '.html', '.txt']:
32
- self._tickers_data = {}
33
- else:
34
- self._tickers_data = get_all_tickers(self.document.text)
54
+ self._tickers_data = get_all_tickers(self.document.text)
35
55
  return self._tickers_data
36
56
 
37
57
  def __getattr__(self, exchange_name):
@@ -58,88 +78,147 @@ class Tickers:
58
78
  return str(data)
59
79
 
60
80
  class Tags:
61
- def __init__(self, document):
81
+ def __init__(self, document, mode='text'):
62
82
  from ..tags.config import _active_dictionaries,_loaded_dictionaries
63
- self.not_supported = document.extension not in ['.htm', '.html', '.txt']
64
83
  self.document = document
84
+ self.mode = mode # 'text' or 'data'
65
85
  self._tickers = None
66
86
  self.dictionaries = {}
87
+ self.processors = {}
88
+ self._text_sources = None
67
89
 
68
- # Load global dictionaries with their data
90
+ # Load global dictionaries with their data and processors
69
91
  active_dicts = _active_dictionaries
70
92
  for dict_name in active_dicts:
71
- self.dictionaries[dict_name] = _loaded_dictionaries[dict_name]
72
-
93
+ dict_info = _loaded_dictionaries[dict_name]
94
+ self.dictionaries[dict_name] = dict_info['data']
95
+ if dict_info['processor'] is not None:
96
+ self.processors[dict_name] = dict_info['processor']
73
97
 
74
- def _check_support(self):
75
- if self.not_supported:
76
- warnings.warn(f"Document extension '{self.document.extension}' is not supported. Supported formats: .htm, .html, .txt")
77
- return False
78
- return True
98
+ def _get_text_sources(self):
99
+ """Get text sources based on mode - either single text or multiple fragments"""
100
+ if self._text_sources is None:
101
+ if self.mode == 'text':
102
+ # Original behavior - single text source
103
+ self._text_sources = [{'id': None, 'text': str(self.document.text)}]
104
+ else: # mode == 'data'
105
+ # New behavior - multiple text fragments
106
+ self._text_sources = []
107
+ self._extract_text_fragments(self.document.data, '')
108
+ return self._text_sources
109
+
110
+ def _extract_text_fragments(self, data, parent_id=''):
111
+ """Extract all text fragments with their document IDs from parsed data"""
112
+ if isinstance(data, dict):
113
+ for key, value in data.items():
114
+ if key in ["text", "title"] and isinstance(value, str):
115
+ # Use the current dictionary's parent key as the fragment ID
116
+ self._text_sources.append({
117
+ 'id': parent_id,
118
+ 'text': value
119
+ })
120
+ elif isinstance(value, (dict, list)):
121
+ # Pass the current key as the parent_id for the next level
122
+ self._extract_text_fragments(value, key)
123
+ elif isinstance(data, list):
124
+ for i, item in enumerate(data):
125
+ if isinstance(item, (dict, list)):
126
+ self._extract_text_fragments(item, parent_id)
127
+
128
+ def _format_results(self, results, fragment_id):
129
+ """Format results based on mode"""
130
+ if self.mode == 'text':
131
+ # Original format: (match, start, end)
132
+ return results
133
+ else:
134
+ # New format: (match, fragment_id, start, end)
135
+ return [(match, fragment_id, start, end) for match, start, end in results]
79
136
 
80
137
  @property
81
138
  def cusips(self):
82
- if not self._check_support():
83
- return None
139
+ if not hasattr(self, '_cusips'):
140
+ self._cusips = []
141
+ sources = self._get_text_sources()
84
142
 
85
- if not hasattr(self, '_cusip'):
86
- if 'sc13dg_cusips' in self.dictionaries:
87
- keywords = self.dictionaries['sc13dg_cusips']
88
- self._cusip = get_cusip_using_regex(self.document.text, keywords)
89
- else:
90
- self._cusip = get_cusip_using_regex(self.document.text)
91
- return self._cusip
143
+ for source in sources:
144
+ if 'sc13dg_cusips' in self.dictionaries:
145
+ keywords = self.dictionaries['sc13dg_cusips']
146
+ results = get_cusip_using_regex(source['text'], keywords)
147
+ elif "13fhr_information_table_cusips" in self.dictionaries:
148
+ keywords = self.dictionaries['13fhr_information_table_cusips']
149
+ results = get_cusip_using_regex(source['text'], keywords)
150
+ else:
151
+ results = get_cusip_using_regex(source['text'])
152
+
153
+ # Format results based on mode
154
+ formatted_results = self._format_results(results, source['id'])
155
+ self._cusips.extend(formatted_results)
156
+
157
+ return self._cusips
92
158
 
93
159
  @property
94
160
  def isins(self):
95
- if not self._check_support():
96
- return None
161
+ if not hasattr(self, '_isins'):
162
+ self._isins = []
163
+ sources = self._get_text_sources()
97
164
 
98
- if not hasattr(self, '_isin'):
99
- if 'npx_isins' in self.dictionaries:
100
- keywords = self.dictionaries['npx_isins']
101
- self._isin = get_isin_using_regex(self.document.text, keywords)
102
- else:
103
- self._isin = get_isin_using_regex(self.document.text)
104
- return self._isin
165
+ for source in sources:
166
+ if 'npx_isins' in self.dictionaries:
167
+ keywords = self.dictionaries['npx_isins']
168
+ results = get_isin_using_regex(source['text'], keywords)
169
+ else:
170
+ results = get_isin_using_regex(source['text'])
171
+
172
+ formatted_results = self._format_results(results, source['id'])
173
+ self._isins.extend(formatted_results)
174
+
175
+ return self._isins
105
176
 
106
177
  @property
107
178
  def figis(self):
108
- if not self._check_support():
109
- return None
179
+ if not hasattr(self, '_figis'):
180
+ self._figis = []
181
+ sources = self._get_text_sources()
110
182
 
111
- if not hasattr(self, '_figi'):
112
- if 'npx_figis' in self.dictionaries:
113
- keywords = self.dictionaries['npx_figis']
114
- self._figi = get_figi_using_regex(self.document.text, keywords)
115
- else:
116
- self._figi = get_figi_using_regex(self.document.text)
117
- return self._figi
183
+ for source in sources:
184
+ if 'npx_figis' in self.dictionaries:
185
+ keywords = self.dictionaries['npx_figis']
186
+ results = get_figi_using_regex(source['text'], keywords)
187
+ else:
188
+ results = get_figi_using_regex(source['text'])
189
+
190
+ formatted_results = self._format_results(results, source['id'])
191
+ self._figis.extend(formatted_results)
192
+
193
+ return self._figis
118
194
 
119
195
  @property
120
196
  def tickers(self):
197
+ # Tickers work differently - they need the full document context
198
+ # Keep original behavior for now
121
199
  if self._tickers is None:
122
200
  self._tickers = Tickers(self.document)
123
201
  return self._tickers
124
202
 
125
203
  @property
126
204
  def persons(self):
127
- if not self._check_support():
128
- return None
129
-
130
205
  if not hasattr(self, '_persons'):
131
- if '8k_2024_persons' in self.dictionaries:
132
- # Use FlashText dictionary lookup for 8K persons
133
- self._persons = get_full_names_dictionary_lookup(self.document.text, self.dictionaries['8k_2024_persons'])
134
- elif 'ssa_baby_first_names' in self.dictionaries:
135
- # Use regex with SSA names for validation
136
- self._persons = get_full_names(self.document.text, self.dictionaries['ssa_baby_first_names'])
137
- else:
138
- # Fallback to regex without validation
139
- self._persons = get_full_names(self.document.text)
206
+ self._persons = []
207
+ sources = self._get_text_sources()
208
+
209
+ for source in sources:
210
+ if '8k_2024_persons' in self.processors:
211
+ results = get_full_names_dictionary_lookup(source['text'], self.processors['8k_2024_persons'])
212
+ elif 'ssa_baby_first_names' in self.dictionaries:
213
+ results = get_full_names(source['text'], self.dictionaries['ssa_baby_first_names'])
214
+ else:
215
+ results = get_full_names(source['text'])
216
+
217
+ formatted_results = self._format_results(results, source['id'])
218
+ self._persons.extend(formatted_results)
219
+
140
220
  return self._persons
141
221
 
142
-
143
222
  class Document:
144
223
  def __init__(self, type, content, extension,accession,filing_date,path=None):
145
224
 
@@ -164,8 +243,6 @@ class Document:
164
243
  self._tables = None
165
244
  self._text = None
166
245
 
167
- self.tags = Tags(self)
168
-
169
246
 
170
247
 
171
248
  #_load_text_content
@@ -350,15 +427,26 @@ class Document:
350
427
  def data(self):
351
428
  if self._data is None:
352
429
  self.parse()
430
+
431
+ if self._data is None:
432
+ self._data = {}
433
+
434
+ if not isinstance(self._data, DataWithTags):
435
+ self._data = DataWithTags(self._data, self)
436
+
353
437
  return self._data
354
438
 
355
439
  @property
356
440
  def text(self):
357
441
  if self._text is None:
358
442
  if self.extension in ['.htm','.html']:
359
- self._preprocess_html_content()
443
+ self._preprocess_html_content() # Still sets self._text to plain string
360
444
  elif self.extension == '.txt':
361
- self._preprocess_txt_content()
445
+ self._preprocess_txt_content() # Still sets self._text to plain string
446
+
447
+ # Convert the plain string to TextWithTags
448
+ plain_text = self._text
449
+ self._text = TextWithTags(plain_text, self)
362
450
  return self._text
363
451
 
364
452
  def write_json(self, output_filename=None):
@@ -0,0 +1,33 @@
1
+ from ..utils.dictionaries import download_dictionary, load_dictionary
2
+
3
+ _active_dictionaries = []
4
+ _loaded_dictionaries = {}
5
+
6
+ def set_dictionaries(dictionaries, overwrite=False):
7
+ """Set active dictionaries and load them into memory"""
8
+ global _active_dictionaries, _loaded_dictionaries
9
+ _active_dictionaries = dictionaries
10
+ _loaded_dictionaries = {}
11
+
12
+ for dict_name in dictionaries:
13
+ # Download if needed
14
+ download_dictionary(dict_name, overwrite=overwrite)
15
+ # Load raw data
16
+ raw_data = load_dictionary(dict_name)
17
+
18
+ # Create processor for dictionary lookup methods
19
+ if dict_name in ['8k_2024_persons']: # Add other dict names as needed
20
+ from flashtext import KeywordProcessor
21
+ processor = KeywordProcessor(case_sensitive=True)
22
+ for key in raw_data.keys():
23
+ processor.add_keyword(key, key)
24
+
25
+ _loaded_dictionaries[dict_name] = {
26
+ 'data': raw_data,
27
+ 'processor': processor
28
+ }
29
+ else:
30
+ _loaded_dictionaries[dict_name] = {
31
+ 'data': raw_data,
32
+ 'processor': None
33
+ }
@@ -134,14 +134,10 @@ def get_full_names(text,keywords=None):
134
134
  return full_names
135
135
 
136
136
  # add dictionary lookup based on precomputed lists
137
- def get_full_names_dictionary_lookup(text, dictionary):
138
- keyword_processor = KeywordProcessor(case_sensitive=True)
139
-
140
- for key in dictionary.keys():
141
- keyword_processor.add_keyword(key, key)
142
-
137
+ def get_full_names_dictionary_lookup(text, processor):
138
+ """Use pre-built KeywordProcessor instead of creating new one"""
143
139
  matches = []
144
- keywords_found = keyword_processor.extract_keywords(text, span_info=True)
140
+ keywords_found = processor.extract_keywords(text, span_info=True)
145
141
 
146
142
  for keyword, start_pos, end_pos in keywords_found:
147
143
  matches.append((keyword, start_pos, end_pos))
@@ -6,7 +6,8 @@ urls = {
6
6
  "npx_figis" : "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/npx_figis.txt",
7
7
  "npx_isins" : "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/npx_isins.txt",
8
8
  "sc13dg_cusips" : "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/sc13dg_cusips.txt",
9
- "8k_2024_persons" : "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/8k_2024_persons.json"
9
+ "8k_2024_persons" : "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/8k_2024_persons.json",
10
+ "13fhr_information_table_cusips" : "https://raw.githubusercontent.com/john-friedman/datamule-data/refs/heads/master/data/dictionaries/13fhr_information_table_cusips.txt"
10
11
  }
11
12
 
12
13
 
@@ -67,6 +68,12 @@ def load_dictionary(name):
67
68
  for line in f:
68
69
  cusip_set.add(line.strip())
69
70
  return cusip_set
71
+ elif name == "13fhr_information_table_cusips":
72
+ cusip_set = set()
73
+ with open(file_path, 'r', encoding='utf-8') as f:
74
+ for line in f:
75
+ cusip_set.add(line.strip())
76
+ return cusip_set
70
77
  elif name == "8k_2024_persons":
71
78
 
72
79
  with open(file_path, 'r', encoding='utf-8') as f:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamule
3
- Version: 2.2.0
3
+ Version: 2.2.2
4
4
  Summary: Work with SEC submissions at scale.
5
5
  Home-page: https://github.com/john-friedman/datamule-python
6
6
  Author: John Friedman
@@ -32,7 +32,7 @@ if not os.path.exists(file_path):
32
32
  setup(
33
33
  name="datamule",
34
34
  author="John Friedman",
35
- version="2.2.0",
35
+ version="2.2.2",
36
36
  description="Work with SEC submissions at scale.",
37
37
  packages=find_packages(include=['datamule', 'datamule.*']),
38
38
  url="https://github.com/john-friedman/datamule-python",
@@ -1,16 +0,0 @@
1
- from ..utils.dictionaries import download_dictionary, load_dictionary
2
-
3
- _active_dictionaries = []
4
- _loaded_dictionaries = {}
5
-
6
- def set_dictionaries(dictionaries, overwrite=False):
7
- """Set active dictionaries and load them into memory"""
8
- global _active_dictionaries, _loaded_dictionaries
9
- _active_dictionaries = dictionaries
10
- _loaded_dictionaries = {}
11
-
12
- for dict_name in dictionaries:
13
- # Download if needed
14
- download_dictionary(dict_name, overwrite=overwrite)
15
- # Load into memory
16
- _loaded_dictionaries[dict_name] = load_dictionary(dict_name)
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes