datamule 2.1.2__py3-none-any.whl → 2.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -12,6 +12,7 @@ from selectolax.parser import HTMLParser
12
12
  from pathlib import Path
13
13
  import webbrowser
14
14
  from secsgml.utils import bytes_to_str
15
+ import tempfile
15
16
 
16
17
  from .tables.tables import Tables
17
18
 
@@ -36,18 +37,19 @@ class Document:
36
37
  # this will be filled by parsed
37
38
  self._data = None
38
39
  self._tables = None
40
+ self._text = None
39
41
 
40
42
 
41
43
 
42
44
  #_load_text_content
43
45
  def _preprocess_txt_content(self):
44
- return self.content.translate(str.maketrans({
46
+ self._text = self.content.decode().translate(str.maketrans({
45
47
  '\xa0': ' ', '\u2003': ' ',
46
48
  '\u2018': "'", '\u2019': "'",
47
49
  '\u201c': '"', '\u201d': '"'
48
50
  }))
49
51
 
50
- # will deprecate this when we add html2dict
52
+ # needs work
51
53
  def _preprocess_html_content(self):
52
54
  parser = HTMLParser(self.content,detect_encoding=True,decode_errors='ignore')
53
55
 
@@ -95,7 +97,7 @@ class Document:
95
97
  while '\n\n\n' in text:
96
98
  text = text.replace('\n\n\n', '\n\n')
97
99
 
98
- return text.translate(str.maketrans({
100
+ self._text = text.translate(str.maketrans({
99
101
  '\xa0': ' ', '\u2003': ' ',
100
102
  '\u2018': "'", '\u2019': "'",
101
103
  '\u201c': '"', '\u201d': '"'
@@ -116,7 +118,7 @@ class Document:
116
118
  mapping_dict = None
117
119
 
118
120
  if self.extension == '.txt':
119
- content = self._preprocess_txt_content()
121
+ content = self.text
120
122
  if self.type == '10-Q':
121
123
  mapping_dict = dict_10q
122
124
  elif self.type == '10-K':
@@ -224,6 +226,15 @@ class Document:
224
226
  self.parse()
225
227
  return self._data
226
228
 
229
+ @property
230
+ def text(self):
231
+ if self._text is None:
232
+ if self.extension in ['.htm','.html']:
233
+ self._preprocess_html_content()
234
+ elif self.extension == '.txt':
235
+ self._preprocess_txt_content()
236
+ return self._text
237
+
227
238
  def write_json(self, output_filename=None):
228
239
  if not self.data:
229
240
  self.parse()
@@ -308,13 +319,23 @@ class Document:
308
319
  self.parse()
309
320
 
310
321
  if not self.data:
311
- if self.extension in ['.jpg', '.png', '.pdf']:
312
- webbrowser.open('file://' + str(self.path))
313
- else:
314
- pass
322
+ pass
315
323
  else:
316
324
  visualize_dict(self.data)
317
325
 
326
+ # alpha feature
327
+ def open(self):
328
+ """Open the document. Experimental. Creates copy in temp, rather than use tar path for now."""
329
+ if self.extension in ['.htm', '.html','.txt','.jpg','.png', '.pdf']:
330
+ # Create a temporary file with the content and open it
331
+
332
+ with tempfile.NamedTemporaryFile(mode='wb', suffix=self.extension, delete=False) as f:
333
+ f.write(self.content)
334
+ temp_path = f.name
335
+ webbrowser.open('file://' + temp_path)
336
+ else:
337
+ print(f"Cannot open files with extension {self.extension}")
338
+
318
339
  def get_section(self, title=None, title_regex=None,title_class=None, format='dict'):
319
340
  if not self.data:
320
341
  self.parse()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamule
3
- Version: 2.1.2
3
+ Version: 2.1.3
4
4
  Summary: Work with SEC submissions at scale.
5
5
  Home-page: https://github.com/john-friedman/datamule-python
6
6
  Author: John Friedman
@@ -14,7 +14,7 @@ datamule/datamule/datamule_mysql_rds.py,sha256=Q6_h24-SNECWK60RnM6UQjUIp5dhJmfn3
14
14
  datamule/datamule/downloader.py,sha256=mVg1SApfij_9-dTpcm_YB26Bxc_Yq1FR8xv2k50MHqU,18579
15
15
  datamule/datamule/sec_connector.py,sha256=VwOaODpHoAWy8JIky6kLR1-orW_PB61RHw7pIGRpkow,3288
16
16
  datamule/document/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
- datamule/document/document.py,sha256=Q137zAfOm18ZDBRxT_u2s4adMdOXfW1ktEw3hsB-AMI,14571
17
+ datamule/document/document.py,sha256=lWFmRnX7UfcX2W-2bs5wuFjWDiuFphIlWRXoatAOMZ0,15328
18
18
  datamule/document/tables/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
19
19
  datamule/document/tables/tables.py,sha256=8riSAof6o-Gxoo0SkiQAE61fw8NmzDnEhJe6dATzmvA,4487
20
20
  datamule/document/tables/tables_13fhr.py,sha256=-6tWcaTyNsb0XuW0WMBrYir9Zn1wLZL0laKxRYfPNyg,4265
@@ -50,7 +50,7 @@ datamule/seclibrary/bq.py,sha256=C8sb_rpXTvchprrFLcbRar4Qi0XWW25tnv1YsHSS5o4,180
50
50
  datamule/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
51
51
  datamule/utils/construct_submissions_data.py,sha256=NB_hvfxlRXPyt4Fgc-5qA8vJRItkLhBedCSTaxwW7Jg,5887
52
52
  datamule/utils/format_accession.py,sha256=60RtqoNqoT9zSKVb1DeOv1gncJxzPTFMNW4SNOVmC_g,476
53
- datamule-2.1.2.dist-info/METADATA,sha256=sHCW3Up78hM0SW9WWKf5guhYjgEjjB6F0D6hy_CcNBM,560
54
- datamule-2.1.2.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
55
- datamule-2.1.2.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
56
- datamule-2.1.2.dist-info/RECORD,,
53
+ datamule-2.1.3.dist-info/METADATA,sha256=jE2eNeO223VJUDysOc9ENyiOIDbVsv5DEXdCnXrhFRA,560
54
+ datamule-2.1.3.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
55
+ datamule-2.1.3.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
56
+ datamule-2.1.3.dist-info/RECORD,,