datamule 2.1.1__py3-none-any.whl → 2.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -23,7 +23,11 @@ from ..utils.format_accession import format_accession
23
23
  # could be cleaned up
24
24
 
25
25
  # Set up logging
26
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
26
+ logging.basicConfig(
27
+ level=logging.INFO,
28
+ format='%(asctime)s - %(levelname)s - %(message)s',
29
+ handlers=logging.getLogger().handlers,
30
+ )
27
31
  logger = logging.getLogger(__name__)
28
32
 
29
33
 
@@ -12,6 +12,7 @@ from selectolax.parser import HTMLParser
12
12
  from pathlib import Path
13
13
  import webbrowser
14
14
  from secsgml.utils import bytes_to_str
15
+ import tempfile
15
16
 
16
17
  from .tables.tables import Tables
17
18
 
@@ -36,18 +37,19 @@ class Document:
36
37
  # this will be filled by parsed
37
38
  self._data = None
38
39
  self._tables = None
40
+ self._text = None
39
41
 
40
42
 
41
43
 
42
44
  #_load_text_content
43
45
  def _preprocess_txt_content(self):
44
- return self.content.translate(str.maketrans({
46
+ self._text = self.content.decode().translate(str.maketrans({
45
47
  '\xa0': ' ', '\u2003': ' ',
46
48
  '\u2018': "'", '\u2019': "'",
47
49
  '\u201c': '"', '\u201d': '"'
48
50
  }))
49
51
 
50
- # will deprecate this when we add html2dict
52
+ # needs work
51
53
  def _preprocess_html_content(self):
52
54
  parser = HTMLParser(self.content,detect_encoding=True,decode_errors='ignore')
53
55
 
@@ -95,7 +97,7 @@ class Document:
95
97
  while '\n\n\n' in text:
96
98
  text = text.replace('\n\n\n', '\n\n')
97
99
 
98
- return text.translate(str.maketrans({
100
+ self._text = text.translate(str.maketrans({
99
101
  '\xa0': ' ', '\u2003': ' ',
100
102
  '\u2018': "'", '\u2019': "'",
101
103
  '\u201c': '"', '\u201d': '"'
@@ -116,7 +118,7 @@ class Document:
116
118
  mapping_dict = None
117
119
 
118
120
  if self.extension == '.txt':
119
- content = self._preprocess_txt_content()
121
+ content = self.text
120
122
  if self.type == '10-Q':
121
123
  mapping_dict = dict_10q
122
124
  elif self.type == '10-K':
@@ -224,6 +226,15 @@ class Document:
224
226
  self.parse()
225
227
  return self._data
226
228
 
229
+ @property
230
+ def text(self):
231
+ if self._text is None:
232
+ if self.extension in ['.htm','.html']:
233
+ self._preprocess_html_content()
234
+ elif self.extension == '.txt':
235
+ self._preprocess_txt_content()
236
+ return self._text
237
+
227
238
  def write_json(self, output_filename=None):
228
239
  if not self.data:
229
240
  self.parse()
@@ -308,18 +319,28 @@ class Document:
308
319
  self.parse()
309
320
 
310
321
  if not self.data:
311
- if self.extension in ['.jpg', '.png', '.pdf']:
312
- webbrowser.open('file://' + str(self.path))
313
- else:
314
- pass
322
+ pass
315
323
  else:
316
324
  visualize_dict(self.data)
317
325
 
318
- def get_section(self, title, format='dict'):
326
+ # alpha feature
327
+ def open(self):
328
+ """Open the document. Experimental. Creates copy in temp, rather than use tar path for now."""
329
+ if self.extension in ['.htm', '.html','.txt','.jpg','.png', '.pdf']:
330
+ # Create a temporary file with the content and open it
331
+
332
+ with tempfile.NamedTemporaryFile(mode='wb', suffix=self.extension, delete=False) as f:
333
+ f.write(self.content)
334
+ temp_path = f.name
335
+ webbrowser.open('file://' + temp_path)
336
+ else:
337
+ print(f"Cannot open files with extension {self.extension}")
338
+
339
+ def get_section(self, title=None, title_regex=None,title_class=None, format='dict'):
319
340
  if not self.data:
320
341
  self.parse()
321
342
 
322
- result = get_title(self.data,title)
343
+ result = get_title(self.data,title=title,title_regex=title_regex,title_class=title_class)
323
344
 
324
345
  if format == 'text':
325
346
  result = [item[1] for item in result]
@@ -1,7 +1,7 @@
1
1
  dict_10k_html = {
2
2
  ('part',r'^part\s*([ivx]+)$') : 0,
3
3
  ('signatures',r'^signatures?\.*$') : 0,
4
- ('item',r'^item\s*(\d+)\.?([a-z])?') : 1,
4
+ ('item',r'^item\s*(\d+)\.?([a-z])?(?![a-z])') : 1,
5
5
  }
6
6
  dict_10q_html = dict_10k_html
7
7
 
@@ -48,7 +48,7 @@ dict_10d_html = dict_10k_html
48
48
 
49
49
  dict_20f_html = {
50
50
  ('part',r'^part\s*([ivx]+)') : 0,
51
- ('item',r'^item\s*(\d+)\.?([a-z])?') : 1,
51
+ ('item',r'^item\s*(\d+)\.?([a-z])?(?![a-z])') : 1,
52
52
  ('letter',r'\d*\.?([a-z])') : 2,
53
53
  ('signatures',r'^signatures?\.*$') : 0,
54
54
  }
datamule/submission.py CHANGED
@@ -163,8 +163,8 @@ class Submission:
163
163
  content = zstd.ZstdDecompressor().decompress(content)
164
164
 
165
165
  # Decode text files
166
- if extension in ['.htm', '.html', '.txt', '.xml']:
167
- content = content.decode('utf-8', errors='replace')
166
+ # if extension in ['.htm', '.html', '.txt', '.xml']:
167
+ # content = content.decode('utf-8', errors='replace')
168
168
 
169
169
  document_path = f"{self.batch_tar_path}::{self.accession_prefix}/{filename}"
170
170
 
@@ -197,8 +197,8 @@ class Submission:
197
197
  content = zstd.ZstdDecompressor().decompress(content)
198
198
 
199
199
  # Decode text files
200
- if extension in ['.htm', '.html', '.txt', '.xml']:
201
- content = content.decode('utf-8', errors='replace')
200
+ # if extension in ['.htm', '.html', '.txt', '.xml']:
201
+ # content = content.decode('utf-8', errors='replace')
202
202
 
203
203
  document_path = f"{self.path}::{actual_filename}"
204
204
 
@@ -219,8 +219,8 @@ class Submission:
219
219
  content = zstd.ZstdDecompressor().decompress(content)
220
220
 
221
221
  # Decode text files
222
- if extension in ['.htm', '.html', '.txt', '.xml']:
223
- content = content.decode('utf-8', errors='replace')
222
+ # if extension in ['.htm', '.html', '.txt', '.xml']:
223
+ # content = content.decode('utf-8', errors='replace')
224
224
 
225
225
  return Document(
226
226
  type=doc['type'],
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamule
3
- Version: 2.1.1
3
+ Version: 2.1.3
4
4
  Summary: Work with SEC submissions at scale.
5
5
  Home-page: https://github.com/john-friedman/datamule-python
6
6
  Author: John Friedman
@@ -6,15 +6,15 @@ datamule/package_updater.py,sha256=Z9zaa_y0Z5cknpRn8oPea3gg4kquFHfpfhduKKCZ6NU,9
6
6
  datamule/portfolio.py,sha256=YViG1JgJ9SFhg8N3tOOhBI8oc6Pmi2vwnHeHmlkC_5U,12119
7
7
  datamule/portfolio_compression_utils.py,sha256=8OPYEN5zAdV1FiTxgVN3S7cTKs99Elv74bwgoIJP4QY,12654
8
8
  datamule/sheet.py,sha256=Ws_YRtpvewLVioarngVMe8cgG_sp11MP9_goGbRaiWE,23952
9
- datamule/submission.py,sha256=7rtN3EEB50iU7E-B_i-e0JHY382EIgmDcrS1KRZ1mUc,15886
9
+ datamule/submission.py,sha256=piMtTyoMZrKmLBpjyi0BBFhlkugi_CEuyox7J-jnusQ,15898
10
10
  datamule/data/listed_filer_metadata.csv,sha256=dT9fQ8AC5P1-Udf_UF0ZkdXJ88jNxJb_tuhi5YYL1rc,2426827
11
11
  datamule/datamule/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
12
  datamule/datamule/datamule_lookup.py,sha256=e8djAg-ctSyHiKk7BjbtgugZ3p8roUjzsym5z3AihUg,9468
13
13
  datamule/datamule/datamule_mysql_rds.py,sha256=Q6_h24-SNECWK60RnM6UQjUIp5dhJmfn3SSKzTITB3o,12317
14
- datamule/datamule/downloader.py,sha256=aTyVUuIwynPtHB0Z9BvCasy9Ao5wfHptNAsjN-7yDTk,18525
14
+ datamule/datamule/downloader.py,sha256=mVg1SApfij_9-dTpcm_YB26Bxc_Yq1FR8xv2k50MHqU,18579
15
15
  datamule/datamule/sec_connector.py,sha256=VwOaODpHoAWy8JIky6kLR1-orW_PB61RHw7pIGRpkow,3288
16
16
  datamule/document/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
- datamule/document/document.py,sha256=OtfIvHRxdIXmNq5lOpBiLQw9P4o9OYScdpJxNEX51Yc,14477
17
+ datamule/document/document.py,sha256=lWFmRnX7UfcX2W-2bs5wuFjWDiuFphIlWRXoatAOMZ0,15328
18
18
  datamule/document/tables/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
19
19
  datamule/document/tables/tables.py,sha256=8riSAof6o-Gxoo0SkiQAE61fw8NmzDnEhJe6dATzmvA,4487
20
20
  datamule/document/tables/tables_13fhr.py,sha256=-6tWcaTyNsb0XuW0WMBrYir9Zn1wLZL0laKxRYfPNyg,4265
@@ -27,7 +27,7 @@ datamule/document/tables/tables_sbsef.py,sha256=X6VKVnAdWxn2TgRmaAd1WWlxPhcLPQ-5
27
27
  datamule/document/tables/tables_sdr.py,sha256=BwHRJvtijiYvNJ2lIc_30kct6VEmLimIzX28JjZBBqo,4924
28
28
  datamule/document/tables/utils.py,sha256=2-X_1NsiWj_XsD9djxCXwTeIVlg-ip78gG11xACJiDs,738
29
29
  datamule/mapping_dicts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
30
- datamule/mapping_dicts/html_mapping_dicts.py,sha256=G2PWB__FNg4VH9iFJFkflM0u-qOEtk67IWtGoqesb0k,5388
30
+ datamule/mapping_dicts/html_mapping_dicts.py,sha256=OmelylkccxcPcz6Qv28r6rjbwOhUGnz5Vmzy_BYC0Pg,5406
31
31
  datamule/mapping_dicts/txt_mapping_dicts.py,sha256=DQPrGYbAPQxomRUtt4iiMGrwuF7BHc_LeFBQuYBzU9o,6311
32
32
  datamule/mapping_dicts/xml_mapping_dicts.py,sha256=Z22yDVwKYonUfM5foQP00dVDE8EHhhMKp0CLqVKV5OI,438
33
33
  datamule/sec/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -50,7 +50,7 @@ datamule/seclibrary/bq.py,sha256=C8sb_rpXTvchprrFLcbRar4Qi0XWW25tnv1YsHSS5o4,180
50
50
  datamule/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
51
51
  datamule/utils/construct_submissions_data.py,sha256=NB_hvfxlRXPyt4Fgc-5qA8vJRItkLhBedCSTaxwW7Jg,5887
52
52
  datamule/utils/format_accession.py,sha256=60RtqoNqoT9zSKVb1DeOv1gncJxzPTFMNW4SNOVmC_g,476
53
- datamule-2.1.1.dist-info/METADATA,sha256=c7wijhGbi_7q3LNTf1HvZcGJtCgvvO43nxcdqj3tWhs,560
54
- datamule-2.1.1.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
55
- datamule-2.1.1.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
56
- datamule-2.1.1.dist-info/RECORD,,
53
+ datamule-2.1.3.dist-info/METADATA,sha256=jE2eNeO223VJUDysOc9ENyiOIDbVsv5DEXdCnXrhFRA,560
54
+ datamule-2.1.3.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
55
+ datamule-2.1.3.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
56
+ datamule-2.1.3.dist-info/RECORD,,