datamule 2.2.5__tar.gz → 2.2.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. {datamule-2.2.5 → datamule-2.2.6}/PKG-INFO +1 -1
  2. {datamule-2.2.5 → datamule-2.2.6}/datamule/document/document.py +17 -73
  3. {datamule-2.2.5 → datamule-2.2.6}/datamule.egg-info/PKG-INFO +1 -1
  4. {datamule-2.2.5 → datamule-2.2.6}/setup.py +1 -1
  5. {datamule-2.2.5 → datamule-2.2.6}/datamule/__init__.py +0 -0
  6. {datamule-2.2.5 → datamule-2.2.6}/datamule/config.py +0 -0
  7. {datamule-2.2.5 → datamule-2.2.6}/datamule/data/listed_filer_metadata.csv +0 -0
  8. {datamule-2.2.5 → datamule-2.2.6}/datamule/datamule/__init__.py +0 -0
  9. {datamule-2.2.5 → datamule-2.2.6}/datamule/datamule/datamule_lookup.py +0 -0
  10. {datamule-2.2.5 → datamule-2.2.6}/datamule/datamule/datamule_mysql_rds.py +0 -0
  11. {datamule-2.2.5 → datamule-2.2.6}/datamule/datamule/downloader.py +0 -0
  12. {datamule-2.2.5 → datamule-2.2.6}/datamule/datamule/sec_connector.py +0 -0
  13. {datamule-2.2.5 → datamule-2.2.6}/datamule/datasets.py +0 -0
  14. {datamule-2.2.5 → datamule-2.2.6}/datamule/document/__init__.py +0 -0
  15. {datamule-2.2.5 → datamule-2.2.6}/datamule/document/tables/__init__.py +0 -0
  16. {datamule-2.2.5 → datamule-2.2.6}/datamule/document/tables/tables.py +0 -0
  17. {datamule-2.2.5 → datamule-2.2.6}/datamule/document/tables/tables_13fhr.py +0 -0
  18. {datamule-2.2.5 → datamule-2.2.6}/datamule/document/tables/tables_25nse.py +0 -0
  19. {datamule-2.2.5 → datamule-2.2.6}/datamule/document/tables/tables_informationtable.py +0 -0
  20. {datamule-2.2.5 → datamule-2.2.6}/datamule/document/tables/tables_npx.py +0 -0
  21. {datamule-2.2.5 → datamule-2.2.6}/datamule/document/tables/tables_ownership.py +0 -0
  22. {datamule-2.2.5 → datamule-2.2.6}/datamule/document/tables/tables_proxyvotingrecord.py +0 -0
  23. {datamule-2.2.5 → datamule-2.2.6}/datamule/document/tables/tables_sbsef.py +0 -0
  24. {datamule-2.2.5 → datamule-2.2.6}/datamule/document/tables/tables_sdr.py +0 -0
  25. {datamule-2.2.5 → datamule-2.2.6}/datamule/document/tables/utils.py +0 -0
  26. {datamule-2.2.5 → datamule-2.2.6}/datamule/helper.py +0 -0
  27. {datamule-2.2.5 → datamule-2.2.6}/datamule/index.py +0 -0
  28. {datamule-2.2.5 → datamule-2.2.6}/datamule/mapping_dicts/__init__.py +0 -0
  29. {datamule-2.2.5 → datamule-2.2.6}/datamule/mapping_dicts/html_mapping_dicts.py +0 -0
  30. {datamule-2.2.5 → datamule-2.2.6}/datamule/mapping_dicts/txt_mapping_dicts.py +0 -0
  31. {datamule-2.2.5 → datamule-2.2.6}/datamule/mapping_dicts/xml_mapping_dicts.py +0 -0
  32. {datamule-2.2.5 → datamule-2.2.6}/datamule/package_updater.py +0 -0
  33. {datamule-2.2.5 → datamule-2.2.6}/datamule/portfolio.py +0 -0
  34. {datamule-2.2.5 → datamule-2.2.6}/datamule/portfolio_compression_utils.py +0 -0
  35. {datamule-2.2.5 → datamule-2.2.6}/datamule/sec/__init__.py +0 -0
  36. {datamule-2.2.5 → datamule-2.2.6}/datamule/sec/infrastructure/__init__.py +0 -0
  37. {datamule-2.2.5 → datamule-2.2.6}/datamule/sec/infrastructure/submissions_metadata.py +0 -0
  38. {datamule-2.2.5 → datamule-2.2.6}/datamule/sec/submissions/__init__.py +0 -0
  39. {datamule-2.2.5 → datamule-2.2.6}/datamule/sec/submissions/downloader.py +0 -0
  40. {datamule-2.2.5 → datamule-2.2.6}/datamule/sec/submissions/eftsquery.py +0 -0
  41. {datamule-2.2.5 → datamule-2.2.6}/datamule/sec/submissions/monitor.py +0 -0
  42. {datamule-2.2.5 → datamule-2.2.6}/datamule/sec/submissions/streamer.py +0 -0
  43. {datamule-2.2.5 → datamule-2.2.6}/datamule/sec/submissions/textsearch.py +0 -0
  44. {datamule-2.2.5 → datamule-2.2.6}/datamule/sec/utils.py +0 -0
  45. {datamule-2.2.5 → datamule-2.2.6}/datamule/sec/xbrl/__init__.py +0 -0
  46. {datamule-2.2.5 → datamule-2.2.6}/datamule/sec/xbrl/downloadcompanyfacts.py +0 -0
  47. {datamule-2.2.5 → datamule-2.2.6}/datamule/sec/xbrl/filter_xbrl.py +0 -0
  48. {datamule-2.2.5 → datamule-2.2.6}/datamule/sec/xbrl/streamcompanyfacts.py +0 -0
  49. {datamule-2.2.5 → datamule-2.2.6}/datamule/sec/xbrl/xbrlmonitor.py +0 -0
  50. {datamule-2.2.5 → datamule-2.2.6}/datamule/seclibrary/__init__.py +0 -0
  51. {datamule-2.2.5 → datamule-2.2.6}/datamule/seclibrary/bq.py +0 -0
  52. {datamule-2.2.5 → datamule-2.2.6}/datamule/sentiment/__init__.py +0 -0
  53. {datamule-2.2.5 → datamule-2.2.6}/datamule/sheet.py +0 -0
  54. {datamule-2.2.5 → datamule-2.2.6}/datamule/submission.py +0 -0
  55. {datamule-2.2.5 → datamule-2.2.6}/datamule/tags/__init__.py +0 -0
  56. {datamule-2.2.5 → datamule-2.2.6}/datamule/tags/config.py +0 -0
  57. {datamule-2.2.5 → datamule-2.2.6}/datamule/tags/dictionaries.py +0 -0
  58. {datamule-2.2.5 → datamule-2.2.6}/datamule/tags/regex.py +0 -0
  59. {datamule-2.2.5 → datamule-2.2.6}/datamule/tags/utils.py +0 -0
  60. {datamule-2.2.5 → datamule-2.2.6}/datamule/utils/__init__.py +0 -0
  61. {datamule-2.2.5 → datamule-2.2.6}/datamule/utils/construct_submissions_data.py +0 -0
  62. {datamule-2.2.5 → datamule-2.2.6}/datamule/utils/format_accession.py +0 -0
  63. {datamule-2.2.5 → datamule-2.2.6}/datamule.egg-info/SOURCES.txt +0 -0
  64. {datamule-2.2.5 → datamule-2.2.6}/datamule.egg-info/dependency_links.txt +0 -0
  65. {datamule-2.2.5 → datamule-2.2.6}/datamule.egg-info/requires.txt +0 -0
  66. {datamule-2.2.5 → datamule-2.2.6}/datamule.egg-info/top_level.txt +0 -0
  67. {datamule-2.2.5 → datamule-2.2.6}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamule
3
- Version: 2.2.5
3
+ Version: 2.2.6
4
4
  Summary: Work with SEC submissions at scale.
5
5
  Home-page: https://github.com/john-friedman/datamule-python
6
6
  Author: John Friedman
@@ -3,7 +3,7 @@ import csv
3
3
  import re
4
4
  from doc2dict import xml2dict, txt2dict, dict2dict
5
5
  from doc2dict.mapping import flatten_hierarchy
6
- from doc2dict import html2dict, visualize_dict, get_title, unnest_dict, pdf2dict
6
+ from doc2dict import html2dict, visualize_dict, get_title, unnest_dict, pdf2dict, flatten_dict
7
7
  from ..mapping_dicts.txt_mapping_dicts import dict_10k, dict_10q, dict_8k, dict_13d, dict_13g
8
8
  from ..mapping_dicts.xml_mapping_dicts import dict_345
9
9
  from ..mapping_dicts.html_mapping_dicts import *
@@ -288,77 +288,17 @@ class Document:
288
288
  self._data = None
289
289
  self._tables = None
290
290
  self._text = None
291
+ self._markdown = None
291
292
 
292
293
  # booleans
293
- self._text_bool = self.extension in ('.htm', '.html','.txt')
294
294
  self._data_bool = self.extension in ('.htm', '.html','.txt')
295
+ self._text_bool = self._data_bool
296
+ self._markdown_bool = self._data_bool
295
297
  self._visualize_bool = self._data_bool
296
298
  self._tables_bool = self.extension in ('.xml')
297
299
 
298
300
 
299
301
 
300
- #_load_text_content
301
- def _preprocess_txt_content(self):
302
- self._text = self.content.decode().translate(str.maketrans({
303
- '\xa0': ' ', '\u2003': ' ',
304
- '\u2018': "'", '\u2019': "'",
305
- '\u201c': '"', '\u201d': '"'
306
- }))
307
-
308
- # needs work
309
- def _preprocess_html_content(self):
310
- parser = HTMLParser(self.content,detect_encoding=True,decode_errors='ignore')
311
-
312
- # Remove hidden elements first
313
- hidden_nodes = parser.css('[style*="display: none"], [style*="display:none"], .hidden, .hide, .d-none')
314
- for node in hidden_nodes:
315
- node.decompose()
316
-
317
- blocks = {'p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'article', 'section', 'li', 'td'}
318
- lines = []
319
- current_line = []
320
-
321
- def flush_line():
322
- if current_line:
323
- # Don't add spaces between adjacent spans
324
- lines.append(''.join(current_line))
325
- current_line.clear()
326
-
327
- for node in parser.root.traverse(include_text=True):
328
- if node.tag in ('script', 'style', 'css'):
329
- continue
330
-
331
- if node.tag in blocks:
332
- flush_line()
333
- lines.append('')
334
-
335
- if node.text_content:
336
- text = node.text_content.strip()
337
- if text:
338
- if node.tag in blocks:
339
- flush_line()
340
- lines.append(text)
341
- lines.append('')
342
- else:
343
- # Only add space if nodes aren't directly adjacent
344
- if current_line and not current_line[-1].endswith(' '):
345
- if node.prev and node.prev.text_content:
346
- if node.parent != node.prev.parent or node.prev.next != node:
347
- current_line.append(' ')
348
- current_line.append(text)
349
-
350
- flush_line()
351
-
352
- text = '\n'.join(lines)
353
- while '\n\n\n' in text:
354
- text = text.replace('\n\n\n', '\n\n')
355
-
356
- self._text = text.translate(str.maketrans({
357
- '\xa0': ' ', '\u2003': ' ',
358
- '\u2018': "'", '\u2019': "'",
359
- '\u201c': '"', '\u201d': '"'
360
- }))
361
-
362
302
  def contains_string(self, pattern):
363
303
  """Works for select files"""
364
304
  if self.extension in ['.htm', '.html', '.txt','.xml']:
@@ -490,17 +430,21 @@ class Document:
490
430
 
491
431
  @property
492
432
  def text(self):
493
- if self._text is None:
494
- if self.extension in ['.htm','.html']:
495
- self._preprocess_html_content() # Still sets self._text to plain string
496
- elif self.extension == '.txt':
497
- self._preprocess_txt_content() # Still sets self._text to plain string
498
-
499
- # Convert the plain string to TextWithTags
500
- plain_text = self._text
501
- self._text = TextWithTags(plain_text, self)
433
+ if self._text_bool:
434
+ if self._text is None:
435
+ text = flatten_dict(self.data,'text')
436
+ self._text = TextWithTags(text, self)
502
437
  return self._text
503
438
 
439
+ @property
440
+ def markdown(self):
441
+ if self._markdown_bool:
442
+ if self._markdown is None:
443
+ self._markdown = flatten_dict(self.data,'markdown')
444
+
445
+ return self._markdown
446
+
447
+
504
448
  def write_json(self, output_filename=None):
505
449
  if not self.data:
506
450
  self.parse()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamule
3
- Version: 2.2.5
3
+ Version: 2.2.6
4
4
  Summary: Work with SEC submissions at scale.
5
5
  Home-page: https://github.com/john-friedman/datamule-python
6
6
  Author: John Friedman
@@ -32,7 +32,7 @@ if not os.path.exists(file_path):
32
32
  setup(
33
33
  name="datamule",
34
34
  author="John Friedman",
35
- version="2.2.5",
35
+ version="2.2.6",
36
36
  description="Work with SEC submissions at scale.",
37
37
  packages=find_packages(include=['datamule', 'datamule.*']),
38
38
  url="https://github.com/john-friedman/datamule-python",
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes