datamule 2.2.1__py3-none-any.whl → 2.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -13,11 +13,35 @@ from pathlib import Path
13
13
  import webbrowser
14
14
  from secsgml.utils import bytes_to_str
15
15
  import tempfile
16
- import warnings
17
16
  from .tables.tables import Tables
18
17
 
19
18
  from ..tags.utils import get_cusip_using_regex, get_isin_using_regex, get_figi_using_regex,get_all_tickers, get_full_names,get_full_names_dictionary_lookup
20
19
 
20
+ class DataWithTags(dict):
21
+ def __init__(self, data, document):
22
+ super().__init__(data)
23
+ self._document = document
24
+ self._tags = None
25
+
26
+ @property
27
+ def tags(self):
28
+ if self._tags is None:
29
+ self._tags = Tags(self._document, mode='data') # New fragment-based behavior
30
+ return self._tags
31
+
32
+ class TextWithTags(str):
33
+ def __new__(cls, content, document):
34
+ instance = str.__new__(cls, content)
35
+ instance._document = document
36
+ instance._tags = None
37
+ return instance
38
+
39
+ @property
40
+ def tags(self):
41
+ if self._tags is None:
42
+ self._tags = Tags(self._document, mode='text') # Original behavior
43
+ return self._tags
44
+
21
45
 
22
46
  class Tickers:
23
47
  def __init__(self, document):
@@ -27,11 +51,7 @@ class Tickers:
27
51
  def _get_tickers_data(self):
28
52
  """Get all tickers data once and cache it"""
29
53
  if self._tickers_data is None:
30
- # Check if document extension is supported
31
- if self.document.extension not in ['.htm', '.html', '.txt']:
32
- self._tickers_data = {}
33
- else:
34
- self._tickers_data = get_all_tickers(self.document.text)
54
+ self._tickers_data = get_all_tickers(self.document.text)
35
55
  return self._tickers_data
36
56
 
37
57
  def __getattr__(self, exchange_name):
@@ -58,13 +78,14 @@ class Tickers:
58
78
  return str(data)
59
79
 
60
80
  class Tags:
61
- def __init__(self, document):
81
+ def __init__(self, document, mode='text'):
62
82
  from ..tags.config import _active_dictionaries,_loaded_dictionaries
63
- self.not_supported = document.extension not in ['.htm', '.html', '.txt']
64
83
  self.document = document
84
+ self.mode = mode # 'text' or 'data'
65
85
  self._tickers = None
66
86
  self.dictionaries = {}
67
87
  self.processors = {}
88
+ self._text_sources = None
68
89
 
69
90
  # Load global dictionaries with their data and processors
70
91
  active_dicts = _active_dictionaries
@@ -73,77 +94,131 @@ class Tags:
73
94
  self.dictionaries[dict_name] = dict_info['data']
74
95
  if dict_info['processor'] is not None:
75
96
  self.processors[dict_name] = dict_info['processor']
76
-
77
97
 
78
- def _check_support(self):
79
- if self.not_supported:
80
- warnings.warn(f"Document extension '{self.document.extension}' is not supported. Supported formats: .htm, .html, .txt")
81
- return False
82
- return True
98
+ def _get_text_sources(self):
99
+ """Get text sources based on mode - either single text or multiple fragments"""
100
+ if self._text_sources is None:
101
+ if self.mode == 'text':
102
+ # Original behavior - single text source
103
+ self._text_sources = [{'id': None, 'text': str(self.document.text)}]
104
+ else: # mode == 'data'
105
+ # New behavior - multiple text fragments
106
+ self._text_sources = []
107
+ self._extract_text_fragments(self.document.data, '')
108
+ return self._text_sources
109
+
110
+ def _extract_text_fragments(self, data, parent_id=''):
111
+ """Extract all text fragments with their document IDs from parsed data"""
112
+ if isinstance(data, dict):
113
+ for key, value in data.items():
114
+ if key in ["text", "title"] and isinstance(value, str):
115
+ # Use the current dictionary's parent key as the fragment ID
116
+ self._text_sources.append({
117
+ 'id': parent_id,
118
+ 'text': value
119
+ })
120
+ elif isinstance(value, (dict, list)):
121
+ # Pass the current key as the parent_id for the next level
122
+ self._extract_text_fragments(value, key)
123
+ elif isinstance(data, list):
124
+ for i, item in enumerate(data):
125
+ if isinstance(item, (dict, list)):
126
+ self._extract_text_fragments(item, parent_id)
127
+
128
+ def _format_results(self, results, fragment_id):
129
+ """Format results based on mode"""
130
+ if self.mode == 'text':
131
+ # Original format: (match, start, end)
132
+ return results
133
+ else:
134
+ # New format: (match, fragment_id, start, end)
135
+ return [(match, fragment_id, start, end) for match, start, end in results]
83
136
 
84
137
  @property
85
138
  def cusips(self):
86
- if not self._check_support():
87
- return None
139
+ if not hasattr(self, '_cusips'):
140
+ self._cusips = []
141
+ sources = self._get_text_sources()
88
142
 
89
- if not hasattr(self, '_cusip'):
90
- if 'sc13dg_cusips' in self.dictionaries:
91
- keywords = self.dictionaries['sc13dg_cusips']
92
- self._cusip = get_cusip_using_regex(self.document.text, keywords)
93
- else:
94
- self._cusip = get_cusip_using_regex(self.document.text)
95
- return self._cusip
143
+ for source in sources:
144
+ if 'sc13dg_cusips' in self.dictionaries:
145
+ keywords = self.dictionaries['sc13dg_cusips']
146
+ results = get_cusip_using_regex(source['text'], keywords)
147
+ elif "13fhr_information_table_cusips" in self.dictionaries:
148
+ keywords = self.dictionaries['13fhr_information_table_cusips']
149
+ results = get_cusip_using_regex(source['text'], keywords)
150
+ else:
151
+ results = get_cusip_using_regex(source['text'])
152
+
153
+ # Format results based on mode
154
+ formatted_results = self._format_results(results, source['id'])
155
+ self._cusips.extend(formatted_results)
156
+
157
+ return self._cusips
96
158
 
97
159
  @property
98
160
  def isins(self):
99
- if not self._check_support():
100
- return None
161
+ if not hasattr(self, '_isins'):
162
+ self._isins = []
163
+ sources = self._get_text_sources()
101
164
 
102
- if not hasattr(self, '_isin'):
103
- if 'npx_isins' in self.dictionaries:
104
- keywords = self.dictionaries['npx_isins']
105
- self._isin = get_isin_using_regex(self.document.text, keywords)
106
- else:
107
- self._isin = get_isin_using_regex(self.document.text)
108
- return self._isin
165
+ for source in sources:
166
+ if 'npx_isins' in self.dictionaries:
167
+ keywords = self.dictionaries['npx_isins']
168
+ results = get_isin_using_regex(source['text'], keywords)
169
+ else:
170
+ results = get_isin_using_regex(source['text'])
171
+
172
+ formatted_results = self._format_results(results, source['id'])
173
+ self._isins.extend(formatted_results)
174
+
175
+ return self._isins
109
176
 
110
177
  @property
111
178
  def figis(self):
112
- if not self._check_support():
113
- return None
179
+ if not hasattr(self, '_figis'):
180
+ self._figis = []
181
+ sources = self._get_text_sources()
114
182
 
115
- if not hasattr(self, '_figi'):
116
- if 'npx_figis' in self.dictionaries:
117
- keywords = self.dictionaries['npx_figis']
118
- self._figi = get_figi_using_regex(self.document.text, keywords)
119
- else:
120
- self._figi = get_figi_using_regex(self.document.text)
121
- return self._figi
183
+ for source in sources:
184
+ if 'npx_figis' in self.dictionaries:
185
+ keywords = self.dictionaries['npx_figis']
186
+ results = get_figi_using_regex(source['text'], keywords)
187
+ else:
188
+ results = get_figi_using_regex(source['text'])
189
+
190
+ formatted_results = self._format_results(results, source['id'])
191
+ self._figis.extend(formatted_results)
192
+
193
+ return self._figis
122
194
 
123
195
  @property
124
196
  def tickers(self):
197
+ # Tickers work differently - they need the full document context
198
+ # Keep original behavior for now
125
199
  if self._tickers is None:
126
200
  self._tickers = Tickers(self.document)
127
201
  return self._tickers
128
202
 
129
203
  @property
130
204
  def persons(self):
131
- if not self._check_support():
132
- return None
133
-
134
205
  if not hasattr(self, '_persons'):
135
- if '8k_2024_persons' in self.processors:
136
- # Use pre-built processor
137
- self._persons = get_full_names_dictionary_lookup(self.document.text, self.processors['8k_2024_persons'])
138
- elif 'ssa_baby_first_names' in self.dictionaries:
139
- # Use regex with SSA names for validation
140
- self._persons = get_full_names(self.document.text, self.dictionaries['ssa_baby_first_names'])
141
- else:
142
- # Fallback to regex without validation
143
- self._persons = get_full_names(self.document.text)
206
+ self._persons = []
207
+ sources = self._get_text_sources()
208
+
209
+ for source in sources:
210
+ if '8k_2024_persons' in self.processors:
211
+ results = get_full_names_dictionary_lookup(source['text'], self.processors['8k_2024_persons'])
212
+ elif 'ssa_baby_first_names' in self.dictionaries:
213
+ results = get_full_names(source['text'], self.dictionaries['ssa_baby_first_names'])
214
+ else:
215
+ results = get_full_names(source['text'])
216
+
217
+ formatted_results = self._format_results(results, source['id'])
218
+ self._persons.extend(formatted_results)
219
+
144
220
  return self._persons
145
221
 
146
-
147
222
  class Document:
148
223
  def __init__(self, type, content, extension,accession,filing_date,path=None):
149
224
 
@@ -168,8 +243,6 @@ class Document:
168
243
  self._tables = None
169
244
  self._text = None
170
245
 
171
- self.tags = Tags(self)
172
-
173
246
 
174
247
 
175
248
  #_load_text_content
@@ -354,15 +427,26 @@ class Document:
354
427
  def data(self):
355
428
  if self._data is None:
356
429
  self.parse()
430
+
431
+ if self._data is None:
432
+ self._data = {}
433
+
434
+ if not isinstance(self._data, DataWithTags):
435
+ self._data = DataWithTags(self._data, self)
436
+
357
437
  return self._data
358
438
 
359
439
  @property
360
440
  def text(self):
361
441
  if self._text is None:
362
442
  if self.extension in ['.htm','.html']:
363
- self._preprocess_html_content()
443
+ self._preprocess_html_content() # Still sets self._text to plain string
364
444
  elif self.extension == '.txt':
365
- self._preprocess_txt_content()
445
+ self._preprocess_txt_content() # Still sets self._text to plain string
446
+
447
+ # Convert the plain string to TextWithTags
448
+ plain_text = self._text
449
+ self._text = TextWithTags(plain_text, self)
366
450
  return self._text
367
451
 
368
452
  def write_json(self, output_filename=None):
@@ -6,7 +6,8 @@ urls = {
6
6
  "npx_figis" : "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/npx_figis.txt",
7
7
  "npx_isins" : "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/npx_isins.txt",
8
8
  "sc13dg_cusips" : "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/sc13dg_cusips.txt",
9
- "8k_2024_persons" : "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/8k_2024_persons.json"
9
+ "8k_2024_persons" : "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/8k_2024_persons.json",
10
+ "13fhr_information_table_cusips" : "https://raw.githubusercontent.com/john-friedman/datamule-data/refs/heads/master/data/dictionaries/13fhr_information_table_cusips.txt"
10
11
  }
11
12
 
12
13
 
@@ -67,6 +68,12 @@ def load_dictionary(name):
67
68
  for line in f:
68
69
  cusip_set.add(line.strip())
69
70
  return cusip_set
71
+ elif name == "13fhr_information_table_cusips":
72
+ cusip_set = set()
73
+ with open(file_path, 'r', encoding='utf-8') as f:
74
+ for line in f:
75
+ cusip_set.add(line.strip())
76
+ return cusip_set
70
77
  elif name == "8k_2024_persons":
71
78
 
72
79
  with open(file_path, 'r', encoding='utf-8') as f:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamule
3
- Version: 2.2.1
3
+ Version: 2.2.2
4
4
  Summary: Work with SEC submissions at scale.
5
5
  Home-page: https://github.com/john-friedman/datamule-python
6
6
  Author: John Friedman
@@ -15,7 +15,7 @@ datamule/datamule/datamule_mysql_rds.py,sha256=Q6_h24-SNECWK60RnM6UQjUIp5dhJmfn3
15
15
  datamule/datamule/downloader.py,sha256=mVg1SApfij_9-dTpcm_YB26Bxc_Yq1FR8xv2k50MHqU,18579
16
16
  datamule/datamule/sec_connector.py,sha256=VwOaODpHoAWy8JIky6kLR1-orW_PB61RHw7pIGRpkow,3288
17
17
  datamule/document/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
- datamule/document/document.py,sha256=yiev4AYewjp8bPjWn9cuL43N2O11s9WUo4X2e7WUgiY,20628
18
+ datamule/document/document.py,sha256=mpoWmK8K7B92ukXj4WZzFhYOwpoVop5DZYfj2Q-6FE8,24332
19
19
  datamule/document/tables/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
20
20
  datamule/document/tables/tables.py,sha256=8riSAof6o-Gxoo0SkiQAE61fw8NmzDnEhJe6dATzmvA,4487
21
21
  datamule/document/tables/tables_13fhr.py,sha256=-6tWcaTyNsb0XuW0WMBrYir9Zn1wLZL0laKxRYfPNyg,4265
@@ -54,9 +54,9 @@ datamule/tags/regex.py,sha256=Zr1dlnb8OfecDkI2DFCI8DUBr9LI50fapQyBAYNEZrg,4487
54
54
  datamule/tags/utils.py,sha256=hQpQBVAJPmys1UKVS2mqc8Z5-qO_zma5ecFXvW9DXoo,5329
55
55
  datamule/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
56
56
  datamule/utils/construct_submissions_data.py,sha256=NB_hvfxlRXPyt4Fgc-5qA8vJRItkLhBedCSTaxwW7Jg,5887
57
- datamule/utils/dictionaries.py,sha256=VImvQWlP8IohB76rDd83bZcT184LBOpOaXPOH46fA6Y,2795
57
+ datamule/utils/dictionaries.py,sha256=1VwzuyDausEsvMIJRa2UD7SvtmlMRHmT_tFeaCY6eXo,3201
58
58
  datamule/utils/format_accession.py,sha256=60RtqoNqoT9zSKVb1DeOv1gncJxzPTFMNW4SNOVmC_g,476
59
- datamule-2.2.1.dist-info/METADATA,sha256=aINGZMWV34SclEt-2Ij2d2848PJA7cLF6ZoBL2LwpfY,585
60
- datamule-2.2.1.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
61
- datamule-2.2.1.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
62
- datamule-2.2.1.dist-info/RECORD,,
59
+ datamule-2.2.2.dist-info/METADATA,sha256=pVMWNBGvR-KNKCYOvfvcFa95srRzS3j_t-zuW6QiXQk,585
60
+ datamule-2.2.2.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
61
+ datamule-2.2.2.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
62
+ datamule-2.2.2.dist-info/RECORD,,