datamule 2.2.3__py3-none-any.whl → 2.2.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -287,7 +287,7 @@ class Downloader:
287
287
  keepalive_timeout=60
288
288
  )
289
289
 
290
- async with aiohttp.ClientSession(connector=connector, timeout=aiohttp.ClientTimeout(total=30)) as session:
290
+ async with aiohttp.ClientSession(connector=connector, timeout=aiohttp.ClientTimeout(total=600)) as session:
291
291
  tasks = [
292
292
  self.download_and_process(
293
293
  session, url, semaphore, decompression_pool,
@@ -221,7 +221,6 @@ class Tags(TextAnalysisBase):
221
221
  if not hasattr(self, '_persons'):
222
222
  self._persons = []
223
223
  sources = self._get_text_sources()
224
-
225
224
  for source in sources:
226
225
  if '8k_2024_persons' in self.processors:
227
226
  results = get_full_names_dictionary_lookup(source['text'], self.processors['8k_2024_persons'])
@@ -289,6 +288,12 @@ class Document:
289
288
  self._data = None
290
289
  self._tables = None
291
290
  self._text = None
291
+
292
+ # booleans
293
+ self._text_bool = self.extension in ('.htm', '.html','.txt')
294
+ self._data_bool = self.extension in ('.htm', '.html','.txt')
295
+ self._visualize_bool = self._data_bool
296
+ self._tables_bool = self.extension in ('.xml')
292
297
 
293
298
 
294
299
 
@@ -544,6 +549,17 @@ class Document:
544
549
  writer.writeheader()
545
550
  writer.writerows(table.data)
546
551
 
552
+ def reset_nlp(self):
553
+ """Reset all NLP analysis by creating fresh wrapper objects"""
554
+ # Reset data wrapper
555
+ if hasattr(self, '_data') and self._data is not None:
556
+ raw_data = dict(self._data) # Extract the underlying dict
557
+ self._data = DataWithTags(raw_data, self)
558
+
559
+ # Reset text wrapper
560
+ if hasattr(self, '_text') and self._text is not None:
561
+ raw_text = str(self._text) # Extract the underlying string
562
+ self._text = TextWithTags(raw_text, self)
547
563
 
548
564
  def _document_to_section_text(self, document_data, parent_key=''):
549
565
  items = []
datamule/submission.py CHANGED
@@ -121,6 +121,16 @@ class Submission:
121
121
  self.accession = self.metadata.content['accession-number']
122
122
  self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"
123
123
 
124
+
125
+ # booleans
126
+ self._has_xbrl = any(
127
+ doc['type'] in ('EX-100.INS', 'EX-101.INS') or
128
+ doc.get('filename', '').endswith('_htm.xml')
129
+ for doc in self.metadata.content['documents']
130
+ )
131
+
132
+ self._has_fundamentals = self._has_xbrl
133
+
124
134
  def _load_document_by_index(self, idx):
125
135
  """Load a document by its index in the metadata documents list."""
126
136
  doc = self.metadata.content['documents'][idx]
datamule/tags/config.py CHANGED
@@ -3,6 +3,12 @@ from .dictionaries import download_dictionary, load_dictionary
3
3
  _active_dictionaries = []
4
4
  _loaded_dictionaries = {}
5
5
 
6
+ def clear_dictionaries():
7
+ """Remove all active dictionaries"""
8
+ global _active_dictionaries, _loaded_dictionaries
9
+ _active_dictionaries = []
10
+ _loaded_dictionaries = {}
11
+
6
12
  def set_dictionaries(dictionaries, overwrite=False):
7
13
  """Set active dictionaries and load them into memory"""
8
14
  global _active_dictionaries, _loaded_dictionaries
datamule/tags/utils.py CHANGED
@@ -67,25 +67,31 @@ def get_ticker_regex_dict():
67
67
  return dict(ticker_regex_list)
68
68
 
69
69
  # will change in future to accommodate other datasets
70
- def validate_full_name(full_name,keywords):
70
+ def validate_full_name(full_name, keywords):
71
71
  if len(full_name) == 1:
72
72
  return False
73
- # check all is upper
74
- if all(word.isupper() for word in full_name):
73
+
74
+ # Clean punctuation before validation
75
+ cleaned_name = [word.rstrip(".,;:!?()[]") for word in full_name]
76
+
77
+ # Skip validation if cleaning removed everything
78
+ if not all(cleaned_name):
75
79
  return False
76
- # check if any number in word
77
- if any(any(char.isdigit() for char in word) for word in full_name):
80
+
81
+ # Apply existing checks to cleaned words
82
+ if all(word.isupper() for word in cleaned_name):
78
83
  return False
79
- if any(any(char in ".,;:!?()[]" for char in word) for word in full_name):
84
+
85
+ # check if any number in word
86
+ if any(any(char.isdigit() for char in word) for word in cleaned_name):
80
87
  return False
81
88
 
82
89
  # add optional set lookups
83
90
  if keywords is not None:
84
91
  # return false if first word is not in keywords set
85
- if full_name[0] not in keywords:
92
+ if cleaned_name[0] not in keywords:
86
93
  return False
87
94
 
88
-
89
95
  return True
90
96
 
91
97
  def get_full_names(text,keywords=None):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamule
3
- Version: 2.2.3
3
+ Version: 2.2.5
4
4
  Summary: Work with SEC submissions at scale.
5
5
  Home-page: https://github.com/john-friedman/datamule-python
6
6
  Author: John Friedman
@@ -7,15 +7,15 @@ datamule/package_updater.py,sha256=Z9zaa_y0Z5cknpRn8oPea3gg4kquFHfpfhduKKCZ6NU,9
7
7
  datamule/portfolio.py,sha256=0-E1ZSEjJ8hba7HxF8oCrRneNuF_KKISOY6K4dRg0Cg,12282
8
8
  datamule/portfolio_compression_utils.py,sha256=8OPYEN5zAdV1FiTxgVN3S7cTKs99Elv74bwgoIJP4QY,12654
9
9
  datamule/sheet.py,sha256=KD7yAgSB8BE-Z4GDuH58IV-2DJ673nMcEsrCyJbeYp8,10707
10
- datamule/submission.py,sha256=TdQDfFjOKXy2qAZcD6hc9kjDSxmuZLqk8WRhtMjjC-g,15822
10
+ datamule/submission.py,sha256=phHmi9ScjWHtVLjEoEdAO7RieUSKN5gPr0onfg5R8wE,16139
11
11
  datamule/data/listed_filer_metadata.csv,sha256=dT9fQ8AC5P1-Udf_UF0ZkdXJ88jNxJb_tuhi5YYL1rc,2426827
12
12
  datamule/datamule/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
13
13
  datamule/datamule/datamule_lookup.py,sha256=e8djAg-ctSyHiKk7BjbtgugZ3p8roUjzsym5z3AihUg,9468
14
14
  datamule/datamule/datamule_mysql_rds.py,sha256=Q6_h24-SNECWK60RnM6UQjUIp5dhJmfn3SSKzTITB3o,12317
15
- datamule/datamule/downloader.py,sha256=mVg1SApfij_9-dTpcm_YB26Bxc_Yq1FR8xv2k50MHqU,18579
15
+ datamule/datamule/downloader.py,sha256=B22ULAuYzclxxVCH4DsLWUIyFUC5Iep-Hl1W3RgCfeg,18580
16
16
  datamule/datamule/sec_connector.py,sha256=VwOaODpHoAWy8JIky6kLR1-orW_PB61RHw7pIGRpkow,3288
17
17
  datamule/document/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
- datamule/document/document.py,sha256=oOib-bFPZ0rsIk8WBgBVY73CwuU18MZDmXnAQ8fTVD8,26124
18
+ datamule/document/document.py,sha256=msIMoLdxjcwdMv4ijwCMLutySk2-5BvGU266nWQkzg4,26909
19
19
  datamule/document/tables/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
20
20
  datamule/document/tables/tables.py,sha256=8riSAof6o-Gxoo0SkiQAE61fw8NmzDnEhJe6dATzmvA,4487
21
21
  datamule/document/tables/tables_13fhr.py,sha256=-6tWcaTyNsb0XuW0WMBrYir9Zn1wLZL0laKxRYfPNyg,4265
@@ -50,14 +50,14 @@ datamule/seclibrary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSu
50
50
  datamule/seclibrary/bq.py,sha256=C8sb_rpXTvchprrFLcbRar4Qi0XWW25tnv1YsHSS5o4,18025
51
51
  datamule/sentiment/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
52
52
  datamule/tags/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
53
- datamule/tags/config.py,sha256=w7386pyvnWYPNwgMVT_Nw5ivXibOeFuSuMEI7lRsGrk,1495
53
+ datamule/tags/config.py,sha256=rxawvOBDT2v72Aw-VkmnUOLsKSAIrZBrjz_E0hPU7MY,1677
54
54
  datamule/tags/dictionaries.py,sha256=1v2OoN1KnM3HbFHxATxe7LhVRoXe64ecRRgA3oak210,4587
55
55
  datamule/tags/regex.py,sha256=Zr1dlnb8OfecDkI2DFCI8DUBr9LI50fapQyBAYNEZrg,4487
56
- datamule/tags/utils.py,sha256=hexmz_3YnoPrC98A5DTz1xa8o58xZ1yKbzQYP1XiQts,6100
56
+ datamule/tags/utils.py,sha256=6B0jtwiFMQAU5mmdqWX_ZRa76uREY-DUBdM_ttt9cXk,6261
57
57
  datamule/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
58
58
  datamule/utils/construct_submissions_data.py,sha256=NB_hvfxlRXPyt4Fgc-5qA8vJRItkLhBedCSTaxwW7Jg,5887
59
59
  datamule/utils/format_accession.py,sha256=60RtqoNqoT9zSKVb1DeOv1gncJxzPTFMNW4SNOVmC_g,476
60
- datamule-2.2.3.dist-info/METADATA,sha256=cca85xqYigHxQbSRJPlOwyJ6pbVp-87YYk0wUBXcMr8,585
61
- datamule-2.2.3.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
62
- datamule-2.2.3.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
63
- datamule-2.2.3.dist-info/RECORD,,
60
+ datamule-2.2.5.dist-info/METADATA,sha256=Mm0hhgixEljkpYk__oV2nIUe9ceglvbdhJr0lgEZ_b0,585
61
+ datamule-2.2.5.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
62
+ datamule-2.2.5.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
63
+ datamule-2.2.5.dist-info/RECORD,,