datamule-2.2.4-py3-none-any.whl → datamule-2.2.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,7 +3,7 @@ import csv
3
3
  import re
4
4
  from doc2dict import xml2dict, txt2dict, dict2dict
5
5
  from doc2dict.mapping import flatten_hierarchy
6
- from doc2dict import html2dict, visualize_dict, get_title, unnest_dict, pdf2dict
6
+ from doc2dict import html2dict, visualize_dict, get_title, unnest_dict, pdf2dict, flatten_dict
7
7
  from ..mapping_dicts.txt_mapping_dicts import dict_10k, dict_10q, dict_8k, dict_13d, dict_13g
8
8
  from ..mapping_dicts.xml_mapping_dicts import dict_345
9
9
  from ..mapping_dicts.html_mapping_dicts import *
@@ -221,7 +221,6 @@ class Tags(TextAnalysisBase):
221
221
  if not hasattr(self, '_persons'):
222
222
  self._persons = []
223
223
  sources = self._get_text_sources()
224
-
225
224
  for source in sources:
226
225
  if '8k_2024_persons' in self.processors:
227
226
  results = get_full_names_dictionary_lookup(source['text'], self.processors['8k_2024_persons'])
@@ -289,71 +288,17 @@ class Document:
289
288
  self._data = None
290
289
  self._tables = None
291
290
  self._text = None
291
+ self._markdown = None
292
+
293
+ # booleans
294
+ self._data_bool = self.extension in ('.htm', '.html','.txt')
295
+ self._text_bool = self._data_bool
296
+ self._markdown_bool = self._data_bool
297
+ self._visualize_bool = self._data_bool
298
+ self._tables_bool = self.extension in ('.xml')
292
299
 
293
300
 
294
301
 
295
- #_load_text_content
296
- def _preprocess_txt_content(self):
297
- self._text = self.content.decode().translate(str.maketrans({
298
- '\xa0': ' ', '\u2003': ' ',
299
- '\u2018': "'", '\u2019': "'",
300
- '\u201c': '"', '\u201d': '"'
301
- }))
302
-
303
- # needs work
304
- def _preprocess_html_content(self):
305
- parser = HTMLParser(self.content,detect_encoding=True,decode_errors='ignore')
306
-
307
- # Remove hidden elements first
308
- hidden_nodes = parser.css('[style*="display: none"], [style*="display:none"], .hidden, .hide, .d-none')
309
- for node in hidden_nodes:
310
- node.decompose()
311
-
312
- blocks = {'p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'article', 'section', 'li', 'td'}
313
- lines = []
314
- current_line = []
315
-
316
- def flush_line():
317
- if current_line:
318
- # Don't add spaces between adjacent spans
319
- lines.append(''.join(current_line))
320
- current_line.clear()
321
-
322
- for node in parser.root.traverse(include_text=True):
323
- if node.tag in ('script', 'style', 'css'):
324
- continue
325
-
326
- if node.tag in blocks:
327
- flush_line()
328
- lines.append('')
329
-
330
- if node.text_content:
331
- text = node.text_content.strip()
332
- if text:
333
- if node.tag in blocks:
334
- flush_line()
335
- lines.append(text)
336
- lines.append('')
337
- else:
338
- # Only add space if nodes aren't directly adjacent
339
- if current_line and not current_line[-1].endswith(' '):
340
- if node.prev and node.prev.text_content:
341
- if node.parent != node.prev.parent or node.prev.next != node:
342
- current_line.append(' ')
343
- current_line.append(text)
344
-
345
- flush_line()
346
-
347
- text = '\n'.join(lines)
348
- while '\n\n\n' in text:
349
- text = text.replace('\n\n\n', '\n\n')
350
-
351
- self._text = text.translate(str.maketrans({
352
- '\xa0': ' ', '\u2003': ' ',
353
- '\u2018': "'", '\u2019': "'",
354
- '\u201c': '"', '\u201d': '"'
355
- }))
356
-
357
302
  def contains_string(self, pattern):
358
303
  """Works for select files"""
359
304
  if self.extension in ['.htm', '.html', '.txt','.xml']:
@@ -485,17 +430,21 @@ class Document:
485
430
 
486
431
  @property
487
432
  def text(self):
488
- if self._text is None:
489
- if self.extension in ['.htm','.html']:
490
- self._preprocess_html_content() # Still sets self._text to plain string
491
- elif self.extension == '.txt':
492
- self._preprocess_txt_content() # Still sets self._text to plain string
493
-
494
- # Convert the plain string to TextWithTags
495
- plain_text = self._text
496
- self._text = TextWithTags(plain_text, self)
433
+ if self._text_bool:
434
+ if self._text is None:
435
+ text = flatten_dict(self.data,'text')
436
+ self._text = TextWithTags(text, self)
497
437
  return self._text
498
438
 
439
+ @property
440
+ def markdown(self):
441
+ if self._markdown_bool:
442
+ if self._markdown is None:
443
+ self._markdown = flatten_dict(self.data,'markdown')
444
+
445
+ return self._markdown
446
+
447
+
499
448
  def write_json(self, output_filename=None):
500
449
  if not self.data:
501
450
  self.parse()
@@ -544,6 +493,17 @@ class Document:
544
493
  writer.writeheader()
545
494
  writer.writerows(table.data)
546
495
 
496
+ def reset_nlp(self):
497
+ """Reset all NLP analysis by creating fresh wrapper objects"""
498
+ # Reset data wrapper
499
+ if hasattr(self, '_data') and self._data is not None:
500
+ raw_data = dict(self._data) # Extract the underlying dict
501
+ self._data = DataWithTags(raw_data, self)
502
+
503
+ # Reset text wrapper
504
+ if hasattr(self, '_text') and self._text is not None:
505
+ raw_text = str(self._text) # Extract the underlying string
506
+ self._text = TextWithTags(raw_text, self)
547
507
 
548
508
  def _document_to_section_text(self, document_data, parent_key=''):
549
509
  items = []
datamule/submission.py CHANGED
@@ -121,6 +121,16 @@ class Submission:
121
121
  self.accession = self.metadata.content['accession-number']
122
122
  self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"
123
123
 
124
+
125
+ # booleans
126
+ self._has_xbrl = any(
127
+ doc['type'] in ('EX-100.INS', 'EX-101.INS') or
128
+ doc.get('filename', '').endswith('_htm.xml')
129
+ for doc in self.metadata.content['documents']
130
+ )
131
+
132
+ self._has_fundamentals = self._has_xbrl
133
+
124
134
  def _load_document_by_index(self, idx):
125
135
  """Load a document by its index in the metadata documents list."""
126
136
  doc = self.metadata.content['documents'][idx]
datamule/tags/config.py CHANGED
@@ -3,6 +3,12 @@ from .dictionaries import download_dictionary, load_dictionary
3
3
  _active_dictionaries = []
4
4
  _loaded_dictionaries = {}
5
5
 
6
+ def clear_dictionaries():
7
+ """Remove all active dictionaries"""
8
+ global _active_dictionaries, _loaded_dictionaries
9
+ _active_dictionaries = []
10
+ _loaded_dictionaries = {}
11
+
6
12
  def set_dictionaries(dictionaries, overwrite=False):
7
13
  """Set active dictionaries and load them into memory"""
8
14
  global _active_dictionaries, _loaded_dictionaries
datamule/tags/utils.py CHANGED
@@ -67,25 +67,31 @@ def get_ticker_regex_dict():
67
67
  return dict(ticker_regex_list)
68
68
 
69
69
  # will change in future to accommodate other datasets
70
- def validate_full_name(full_name,keywords):
70
+ def validate_full_name(full_name, keywords):
71
71
  if len(full_name) == 1:
72
72
  return False
73
- # check all is upper
74
- if all(word.isupper() for word in full_name):
73
+
74
+ # Clean punctuation before validation
75
+ cleaned_name = [word.rstrip(".,;:!?()[]") for word in full_name]
76
+
77
+ # Skip validation if cleaning removed everything
78
+ if not all(cleaned_name):
75
79
  return False
76
- # check if any number in word
77
- if any(any(char.isdigit() for char in word) for word in full_name):
80
+
81
+ # Apply existing checks to cleaned words
82
+ if all(word.isupper() for word in cleaned_name):
78
83
  return False
79
- if any(any(char in ".,;:!?()[]" for char in word) for word in full_name):
84
+
85
+ # check if any number in word
86
+ if any(any(char.isdigit() for char in word) for word in cleaned_name):
80
87
  return False
81
88
 
82
89
  # add optional set lookups
83
90
  if keywords is not None:
84
91
  # return false if first word is not in keywords set
85
- if full_name[0] not in keywords:
92
+ if cleaned_name[0] not in keywords:
86
93
  return False
87
94
 
88
-
89
95
  return True
90
96
 
91
97
  def get_full_names(text,keywords=None):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamule
3
- Version: 2.2.4
3
+ Version: 2.2.6
4
4
  Summary: Work with SEC submissions at scale.
5
5
  Home-page: https://github.com/john-friedman/datamule-python
6
6
  Author: John Friedman
@@ -7,7 +7,7 @@ datamule/package_updater.py,sha256=Z9zaa_y0Z5cknpRn8oPea3gg4kquFHfpfhduKKCZ6NU,9
7
7
  datamule/portfolio.py,sha256=0-E1ZSEjJ8hba7HxF8oCrRneNuF_KKISOY6K4dRg0Cg,12282
8
8
  datamule/portfolio_compression_utils.py,sha256=8OPYEN5zAdV1FiTxgVN3S7cTKs99Elv74bwgoIJP4QY,12654
9
9
  datamule/sheet.py,sha256=KD7yAgSB8BE-Z4GDuH58IV-2DJ673nMcEsrCyJbeYp8,10707
10
- datamule/submission.py,sha256=TdQDfFjOKXy2qAZcD6hc9kjDSxmuZLqk8WRhtMjjC-g,15822
10
+ datamule/submission.py,sha256=phHmi9ScjWHtVLjEoEdAO7RieUSKN5gPr0onfg5R8wE,16139
11
11
  datamule/data/listed_filer_metadata.csv,sha256=dT9fQ8AC5P1-Udf_UF0ZkdXJ88jNxJb_tuhi5YYL1rc,2426827
12
12
  datamule/datamule/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
13
13
  datamule/datamule/datamule_lookup.py,sha256=e8djAg-ctSyHiKk7BjbtgugZ3p8roUjzsym5z3AihUg,9468
@@ -15,7 +15,7 @@ datamule/datamule/datamule_mysql_rds.py,sha256=Q6_h24-SNECWK60RnM6UQjUIp5dhJmfn3
15
15
  datamule/datamule/downloader.py,sha256=B22ULAuYzclxxVCH4DsLWUIyFUC5Iep-Hl1W3RgCfeg,18580
16
16
  datamule/datamule/sec_connector.py,sha256=VwOaODpHoAWy8JIky6kLR1-orW_PB61RHw7pIGRpkow,3288
17
17
  datamule/document/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
- datamule/document/document.py,sha256=oOib-bFPZ0rsIk8WBgBVY73CwuU18MZDmXnAQ8fTVD8,26124
18
+ datamule/document/document.py,sha256=AuF5JSVjFHA2w5JoLq8zG1UOq906PvJNcp50Qia--fE,24521
19
19
  datamule/document/tables/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
20
20
  datamule/document/tables/tables.py,sha256=8riSAof6o-Gxoo0SkiQAE61fw8NmzDnEhJe6dATzmvA,4487
21
21
  datamule/document/tables/tables_13fhr.py,sha256=-6tWcaTyNsb0XuW0WMBrYir9Zn1wLZL0laKxRYfPNyg,4265
@@ -50,14 +50,14 @@ datamule/seclibrary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSu
50
50
  datamule/seclibrary/bq.py,sha256=C8sb_rpXTvchprrFLcbRar4Qi0XWW25tnv1YsHSS5o4,18025
51
51
  datamule/sentiment/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
52
52
  datamule/tags/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
53
- datamule/tags/config.py,sha256=w7386pyvnWYPNwgMVT_Nw5ivXibOeFuSuMEI7lRsGrk,1495
53
+ datamule/tags/config.py,sha256=rxawvOBDT2v72Aw-VkmnUOLsKSAIrZBrjz_E0hPU7MY,1677
54
54
  datamule/tags/dictionaries.py,sha256=1v2OoN1KnM3HbFHxATxe7LhVRoXe64ecRRgA3oak210,4587
55
55
  datamule/tags/regex.py,sha256=Zr1dlnb8OfecDkI2DFCI8DUBr9LI50fapQyBAYNEZrg,4487
56
- datamule/tags/utils.py,sha256=hexmz_3YnoPrC98A5DTz1xa8o58xZ1yKbzQYP1XiQts,6100
56
+ datamule/tags/utils.py,sha256=6B0jtwiFMQAU5mmdqWX_ZRa76uREY-DUBdM_ttt9cXk,6261
57
57
  datamule/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
58
58
  datamule/utils/construct_submissions_data.py,sha256=NB_hvfxlRXPyt4Fgc-5qA8vJRItkLhBedCSTaxwW7Jg,5887
59
59
  datamule/utils/format_accession.py,sha256=60RtqoNqoT9zSKVb1DeOv1gncJxzPTFMNW4SNOVmC_g,476
60
- datamule-2.2.4.dist-info/METADATA,sha256=SD47CDv1rjDKzI0GukLS7HEAEPN45RlQ5rqZauG1YJE,585
61
- datamule-2.2.4.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
62
- datamule-2.2.4.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
63
- datamule-2.2.4.dist-info/RECORD,,
60
+ datamule-2.2.6.dist-info/METADATA,sha256=lY7IAgOEQ9TUlWaKRhypyBfRIXS3jmr5q9sEHOgaYfg,585
61
+ datamule-2.2.6.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
62
+ datamule-2.2.6.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
63
+ datamule-2.2.6.dist-info/RECORD,,