datamule 2.2.5__py3-none-any.whl → 2.2.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,7 +3,7 @@ import csv
3
3
  import re
4
4
  from doc2dict import xml2dict, txt2dict, dict2dict
5
5
  from doc2dict.mapping import flatten_hierarchy
6
- from doc2dict import html2dict, visualize_dict, get_title, unnest_dict, pdf2dict
6
+ from doc2dict import html2dict, visualize_dict, get_title, unnest_dict, pdf2dict, flatten_dict
7
7
  from ..mapping_dicts.txt_mapping_dicts import dict_10k, dict_10q, dict_8k, dict_13d, dict_13g
8
8
  from ..mapping_dicts.xml_mapping_dicts import dict_345
9
9
  from ..mapping_dicts.html_mapping_dicts import *
@@ -288,77 +288,17 @@ class Document:
288
288
  self._data = None
289
289
  self._tables = None
290
290
  self._text = None
291
+ self._markdown = None
291
292
 
292
293
  # booleans
293
- self._text_bool = self.extension in ('.htm', '.html','.txt')
294
294
  self._data_bool = self.extension in ('.htm', '.html','.txt')
295
+ self._text_bool = self._data_bool
296
+ self._markdown_bool = self._data_bool
295
297
  self._visualize_bool = self._data_bool
296
298
  self._tables_bool = self.extension in ('.xml')
297
299
 
298
300
 
299
301
 
300
- #_load_text_content
301
- def _preprocess_txt_content(self):
302
- self._text = self.content.decode().translate(str.maketrans({
303
- '\xa0': ' ', '\u2003': ' ',
304
- '\u2018': "'", '\u2019': "'",
305
- '\u201c': '"', '\u201d': '"'
306
- }))
307
-
308
- # needs work
309
- def _preprocess_html_content(self):
310
- parser = HTMLParser(self.content,detect_encoding=True,decode_errors='ignore')
311
-
312
- # Remove hidden elements first
313
- hidden_nodes = parser.css('[style*="display: none"], [style*="display:none"], .hidden, .hide, .d-none')
314
- for node in hidden_nodes:
315
- node.decompose()
316
-
317
- blocks = {'p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'article', 'section', 'li', 'td'}
318
- lines = []
319
- current_line = []
320
-
321
- def flush_line():
322
- if current_line:
323
- # Don't add spaces between adjacent spans
324
- lines.append(''.join(current_line))
325
- current_line.clear()
326
-
327
- for node in parser.root.traverse(include_text=True):
328
- if node.tag in ('script', 'style', 'css'):
329
- continue
330
-
331
- if node.tag in blocks:
332
- flush_line()
333
- lines.append('')
334
-
335
- if node.text_content:
336
- text = node.text_content.strip()
337
- if text:
338
- if node.tag in blocks:
339
- flush_line()
340
- lines.append(text)
341
- lines.append('')
342
- else:
343
- # Only add space if nodes aren't directly adjacent
344
- if current_line and not current_line[-1].endswith(' '):
345
- if node.prev and node.prev.text_content:
346
- if node.parent != node.prev.parent or node.prev.next != node:
347
- current_line.append(' ')
348
- current_line.append(text)
349
-
350
- flush_line()
351
-
352
- text = '\n'.join(lines)
353
- while '\n\n\n' in text:
354
- text = text.replace('\n\n\n', '\n\n')
355
-
356
- self._text = text.translate(str.maketrans({
357
- '\xa0': ' ', '\u2003': ' ',
358
- '\u2018': "'", '\u2019': "'",
359
- '\u201c': '"', '\u201d': '"'
360
- }))
361
-
362
302
  def contains_string(self, pattern):
363
303
  """Works for select files"""
364
304
  if self.extension in ['.htm', '.html', '.txt','.xml']:
@@ -490,17 +430,21 @@ class Document:
490
430
 
491
431
  @property
492
432
  def text(self):
493
- if self._text is None:
494
- if self.extension in ['.htm','.html']:
495
- self._preprocess_html_content() # Still sets self._text to plain string
496
- elif self.extension == '.txt':
497
- self._preprocess_txt_content() # Still sets self._text to plain string
498
-
499
- # Convert the plain string to TextWithTags
500
- plain_text = self._text
501
- self._text = TextWithTags(plain_text, self)
433
+ if self._text_bool:
434
+ if self._text is None:
435
+ text = flatten_dict(self.data,'text')
436
+ self._text = TextWithTags(text, self)
502
437
  return self._text
503
438
 
439
+ @property
440
+ def markdown(self):
441
+ if self._markdown_bool:
442
+ if self._markdown is None:
443
+ self._markdown = flatten_dict(self.data,'markdown')
444
+
445
+ return self._markdown
446
+
447
+
504
448
  def write_json(self, output_filename=None):
505
449
  if not self.data:
506
450
  self.parse()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamule
3
- Version: 2.2.5
3
+ Version: 2.2.6
4
4
  Summary: Work with SEC submissions at scale.
5
5
  Home-page: https://github.com/john-friedman/datamule-python
6
6
  Author: John Friedman
@@ -15,7 +15,7 @@ datamule/datamule/datamule_mysql_rds.py,sha256=Q6_h24-SNECWK60RnM6UQjUIp5dhJmfn3
15
15
  datamule/datamule/downloader.py,sha256=B22ULAuYzclxxVCH4DsLWUIyFUC5Iep-Hl1W3RgCfeg,18580
16
16
  datamule/datamule/sec_connector.py,sha256=VwOaODpHoAWy8JIky6kLR1-orW_PB61RHw7pIGRpkow,3288
17
17
  datamule/document/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
- datamule/document/document.py,sha256=msIMoLdxjcwdMv4ijwCMLutySk2-5BvGU266nWQkzg4,26909
18
+ datamule/document/document.py,sha256=AuF5JSVjFHA2w5JoLq8zG1UOq906PvJNcp50Qia--fE,24521
19
19
  datamule/document/tables/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
20
20
  datamule/document/tables/tables.py,sha256=8riSAof6o-Gxoo0SkiQAE61fw8NmzDnEhJe6dATzmvA,4487
21
21
  datamule/document/tables/tables_13fhr.py,sha256=-6tWcaTyNsb0XuW0WMBrYir9Zn1wLZL0laKxRYfPNyg,4265
@@ -57,7 +57,7 @@ datamule/tags/utils.py,sha256=6B0jtwiFMQAU5mmdqWX_ZRa76uREY-DUBdM_ttt9cXk,6261
57
57
  datamule/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
58
58
  datamule/utils/construct_submissions_data.py,sha256=NB_hvfxlRXPyt4Fgc-5qA8vJRItkLhBedCSTaxwW7Jg,5887
59
59
  datamule/utils/format_accession.py,sha256=60RtqoNqoT9zSKVb1DeOv1gncJxzPTFMNW4SNOVmC_g,476
60
- datamule-2.2.5.dist-info/METADATA,sha256=Mm0hhgixEljkpYk__oV2nIUe9ceglvbdhJr0lgEZ_b0,585
61
- datamule-2.2.5.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
62
- datamule-2.2.5.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
63
- datamule-2.2.5.dist-info/RECORD,,
60
+ datamule-2.2.6.dist-info/METADATA,sha256=lY7IAgOEQ9TUlWaKRhypyBfRIXS3jmr5q9sEHOgaYfg,585
61
+ datamule-2.2.6.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
62
+ datamule-2.2.6.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
63
+ datamule-2.2.6.dist-info/RECORD,,