datamule 2.2.5__tar.gz → 2.2.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. {datamule-2.2.5 → datamule-2.2.7}/PKG-INFO +1 -1
  2. {datamule-2.2.5 → datamule-2.2.7}/datamule/document/document.py +33 -90
  3. {datamule-2.2.5 → datamule-2.2.7}/datamule.egg-info/PKG-INFO +1 -1
  4. {datamule-2.2.5 → datamule-2.2.7}/setup.py +1 -1
  5. {datamule-2.2.5 → datamule-2.2.7}/datamule/__init__.py +0 -0
  6. {datamule-2.2.5 → datamule-2.2.7}/datamule/config.py +0 -0
  7. {datamule-2.2.5 → datamule-2.2.7}/datamule/data/listed_filer_metadata.csv +0 -0
  8. {datamule-2.2.5 → datamule-2.2.7}/datamule/datamule/__init__.py +0 -0
  9. {datamule-2.2.5 → datamule-2.2.7}/datamule/datamule/datamule_lookup.py +0 -0
  10. {datamule-2.2.5 → datamule-2.2.7}/datamule/datamule/datamule_mysql_rds.py +0 -0
  11. {datamule-2.2.5 → datamule-2.2.7}/datamule/datamule/downloader.py +0 -0
  12. {datamule-2.2.5 → datamule-2.2.7}/datamule/datamule/sec_connector.py +0 -0
  13. {datamule-2.2.5 → datamule-2.2.7}/datamule/datasets.py +0 -0
  14. {datamule-2.2.5 → datamule-2.2.7}/datamule/document/__init__.py +0 -0
  15. {datamule-2.2.5 → datamule-2.2.7}/datamule/document/tables/__init__.py +0 -0
  16. {datamule-2.2.5 → datamule-2.2.7}/datamule/document/tables/tables.py +0 -0
  17. {datamule-2.2.5 → datamule-2.2.7}/datamule/document/tables/tables_13fhr.py +0 -0
  18. {datamule-2.2.5 → datamule-2.2.7}/datamule/document/tables/tables_25nse.py +0 -0
  19. {datamule-2.2.5 → datamule-2.2.7}/datamule/document/tables/tables_informationtable.py +0 -0
  20. {datamule-2.2.5 → datamule-2.2.7}/datamule/document/tables/tables_npx.py +0 -0
  21. {datamule-2.2.5 → datamule-2.2.7}/datamule/document/tables/tables_ownership.py +0 -0
  22. {datamule-2.2.5 → datamule-2.2.7}/datamule/document/tables/tables_proxyvotingrecord.py +0 -0
  23. {datamule-2.2.5 → datamule-2.2.7}/datamule/document/tables/tables_sbsef.py +0 -0
  24. {datamule-2.2.5 → datamule-2.2.7}/datamule/document/tables/tables_sdr.py +0 -0
  25. {datamule-2.2.5 → datamule-2.2.7}/datamule/document/tables/utils.py +0 -0
  26. {datamule-2.2.5 → datamule-2.2.7}/datamule/helper.py +0 -0
  27. {datamule-2.2.5 → datamule-2.2.7}/datamule/index.py +0 -0
  28. {datamule-2.2.5 → datamule-2.2.7}/datamule/mapping_dicts/__init__.py +0 -0
  29. {datamule-2.2.5 → datamule-2.2.7}/datamule/mapping_dicts/html_mapping_dicts.py +0 -0
  30. {datamule-2.2.5 → datamule-2.2.7}/datamule/mapping_dicts/txt_mapping_dicts.py +0 -0
  31. {datamule-2.2.5 → datamule-2.2.7}/datamule/mapping_dicts/xml_mapping_dicts.py +0 -0
  32. {datamule-2.2.5 → datamule-2.2.7}/datamule/package_updater.py +0 -0
  33. {datamule-2.2.5 → datamule-2.2.7}/datamule/portfolio.py +0 -0
  34. {datamule-2.2.5 → datamule-2.2.7}/datamule/portfolio_compression_utils.py +0 -0
  35. {datamule-2.2.5 → datamule-2.2.7}/datamule/sec/__init__.py +0 -0
  36. {datamule-2.2.5 → datamule-2.2.7}/datamule/sec/infrastructure/__init__.py +0 -0
  37. {datamule-2.2.5 → datamule-2.2.7}/datamule/sec/infrastructure/submissions_metadata.py +0 -0
  38. {datamule-2.2.5 → datamule-2.2.7}/datamule/sec/submissions/__init__.py +0 -0
  39. {datamule-2.2.5 → datamule-2.2.7}/datamule/sec/submissions/downloader.py +0 -0
  40. {datamule-2.2.5 → datamule-2.2.7}/datamule/sec/submissions/eftsquery.py +0 -0
  41. {datamule-2.2.5 → datamule-2.2.7}/datamule/sec/submissions/monitor.py +0 -0
  42. {datamule-2.2.5 → datamule-2.2.7}/datamule/sec/submissions/streamer.py +0 -0
  43. {datamule-2.2.5 → datamule-2.2.7}/datamule/sec/submissions/textsearch.py +0 -0
  44. {datamule-2.2.5 → datamule-2.2.7}/datamule/sec/utils.py +0 -0
  45. {datamule-2.2.5 → datamule-2.2.7}/datamule/sec/xbrl/__init__.py +0 -0
  46. {datamule-2.2.5 → datamule-2.2.7}/datamule/sec/xbrl/downloadcompanyfacts.py +0 -0
  47. {datamule-2.2.5 → datamule-2.2.7}/datamule/sec/xbrl/filter_xbrl.py +0 -0
  48. {datamule-2.2.5 → datamule-2.2.7}/datamule/sec/xbrl/streamcompanyfacts.py +0 -0
  49. {datamule-2.2.5 → datamule-2.2.7}/datamule/sec/xbrl/xbrlmonitor.py +0 -0
  50. {datamule-2.2.5 → datamule-2.2.7}/datamule/seclibrary/__init__.py +0 -0
  51. {datamule-2.2.5 → datamule-2.2.7}/datamule/seclibrary/bq.py +0 -0
  52. {datamule-2.2.5 → datamule-2.2.7}/datamule/sentiment/__init__.py +0 -0
  53. {datamule-2.2.5 → datamule-2.2.7}/datamule/sheet.py +0 -0
  54. {datamule-2.2.5 → datamule-2.2.7}/datamule/submission.py +0 -0
  55. {datamule-2.2.5 → datamule-2.2.7}/datamule/tags/__init__.py +0 -0
  56. {datamule-2.2.5 → datamule-2.2.7}/datamule/tags/config.py +0 -0
  57. {datamule-2.2.5 → datamule-2.2.7}/datamule/tags/dictionaries.py +0 -0
  58. {datamule-2.2.5 → datamule-2.2.7}/datamule/tags/regex.py +0 -0
  59. {datamule-2.2.5 → datamule-2.2.7}/datamule/tags/utils.py +0 -0
  60. {datamule-2.2.5 → datamule-2.2.7}/datamule/utils/__init__.py +0 -0
  61. {datamule-2.2.5 → datamule-2.2.7}/datamule/utils/construct_submissions_data.py +0 -0
  62. {datamule-2.2.5 → datamule-2.2.7}/datamule/utils/format_accession.py +0 -0
  63. {datamule-2.2.5 → datamule-2.2.7}/datamule.egg-info/SOURCES.txt +0 -0
  64. {datamule-2.2.5 → datamule-2.2.7}/datamule.egg-info/dependency_links.txt +0 -0
  65. {datamule-2.2.5 → datamule-2.2.7}/datamule.egg-info/requires.txt +0 -0
  66. {datamule-2.2.5 → datamule-2.2.7}/datamule.egg-info/top_level.txt +0 -0
  67. {datamule-2.2.5 → datamule-2.2.7}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamule
3
- Version: 2.2.5
3
+ Version: 2.2.7
4
4
  Summary: Work with SEC submissions at scale.
5
5
  Home-page: https://github.com/john-friedman/datamule-python
6
6
  Author: John Friedman
@@ -3,7 +3,7 @@ import csv
3
3
  import re
4
4
  from doc2dict import xml2dict, txt2dict, dict2dict
5
5
  from doc2dict.mapping import flatten_hierarchy
6
- from doc2dict import html2dict, visualize_dict, get_title, unnest_dict, pdf2dict
6
+ from doc2dict import html2dict, visualize_dict, get_title, unnest_dict, pdf2dict, flatten_dict
7
7
  from ..mapping_dicts.txt_mapping_dicts import dict_10k, dict_10q, dict_8k, dict_13d, dict_13g
8
8
  from ..mapping_dicts.xml_mapping_dicts import dict_345
9
9
  from ..mapping_dicts.html_mapping_dicts import *
@@ -288,77 +288,17 @@ class Document:
288
288
  self._data = None
289
289
  self._tables = None
290
290
  self._text = None
291
+ self._markdown = None
291
292
 
292
293
  # booleans
293
- self._text_bool = self.extension in ('.htm', '.html','.txt')
294
294
  self._data_bool = self.extension in ('.htm', '.html','.txt')
295
+ self._text_bool = self._data_bool
296
+ self._markdown_bool = self._data_bool
295
297
  self._visualize_bool = self._data_bool
296
298
  self._tables_bool = self.extension in ('.xml')
297
299
 
298
300
 
299
301
 
300
- #_load_text_content
301
- def _preprocess_txt_content(self):
302
- self._text = self.content.decode().translate(str.maketrans({
303
- '\xa0': ' ', '\u2003': ' ',
304
- '\u2018': "'", '\u2019': "'",
305
- '\u201c': '"', '\u201d': '"'
306
- }))
307
-
308
- # needs work
309
- def _preprocess_html_content(self):
310
- parser = HTMLParser(self.content,detect_encoding=True,decode_errors='ignore')
311
-
312
- # Remove hidden elements first
313
- hidden_nodes = parser.css('[style*="display: none"], [style*="display:none"], .hidden, .hide, .d-none')
314
- for node in hidden_nodes:
315
- node.decompose()
316
-
317
- blocks = {'p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'article', 'section', 'li', 'td'}
318
- lines = []
319
- current_line = []
320
-
321
- def flush_line():
322
- if current_line:
323
- # Don't add spaces between adjacent spans
324
- lines.append(''.join(current_line))
325
- current_line.clear()
326
-
327
- for node in parser.root.traverse(include_text=True):
328
- if node.tag in ('script', 'style', 'css'):
329
- continue
330
-
331
- if node.tag in blocks:
332
- flush_line()
333
- lines.append('')
334
-
335
- if node.text_content:
336
- text = node.text_content.strip()
337
- if text:
338
- if node.tag in blocks:
339
- flush_line()
340
- lines.append(text)
341
- lines.append('')
342
- else:
343
- # Only add space if nodes aren't directly adjacent
344
- if current_line and not current_line[-1].endswith(' '):
345
- if node.prev and node.prev.text_content:
346
- if node.parent != node.prev.parent or node.prev.next != node:
347
- current_line.append(' ')
348
- current_line.append(text)
349
-
350
- flush_line()
351
-
352
- text = '\n'.join(lines)
353
- while '\n\n\n' in text:
354
- text = text.replace('\n\n\n', '\n\n')
355
-
356
- self._text = text.translate(str.maketrans({
357
- '\xa0': ' ', '\u2003': ' ',
358
- '\u2018': "'", '\u2019': "'",
359
- '\u201c': '"', '\u201d': '"'
360
- }))
361
-
362
302
  def contains_string(self, pattern):
363
303
  """Works for select files"""
364
304
  if self.extension in ['.htm', '.html', '.txt','.xml']:
@@ -477,30 +417,35 @@ class Document:
477
417
 
478
418
  @property
479
419
  def data(self):
480
- if self._data is None:
481
- self.parse()
420
+ if self._data_bool:
421
+ if self._data is None:
422
+ self.parse()
482
423
 
483
- if self._data is None:
484
- self._data = {}
485
-
486
- if not isinstance(self._data, DataWithTags):
487
- self._data = DataWithTags(self._data, self)
424
+ if self._data is None:
425
+ self._data = {}
426
+
427
+ if not isinstance(self._data, DataWithTags):
428
+ self._data = DataWithTags(self._data, self)
488
429
 
489
430
  return self._data
490
431
 
491
432
  @property
492
433
  def text(self):
493
- if self._text is None:
494
- if self.extension in ['.htm','.html']:
495
- self._preprocess_html_content() # Still sets self._text to plain string
496
- elif self.extension == '.txt':
497
- self._preprocess_txt_content() # Still sets self._text to plain string
498
-
499
- # Convert the plain string to TextWithTags
500
- plain_text = self._text
501
- self._text = TextWithTags(plain_text, self)
434
+ if self._text_bool:
435
+ if self._text is None:
436
+ text = flatten_dict(self.data,'text')
437
+ self._text = TextWithTags(text, self)
502
438
  return self._text
503
439
 
440
+ @property
441
+ def markdown(self):
442
+ if self._markdown_bool:
443
+ if self._markdown is None:
444
+ self._markdown = flatten_dict(self.data,'markdown')
445
+
446
+ return self._markdown
447
+
448
+
504
449
  def write_json(self, output_filename=None):
505
450
  if not self.data:
506
451
  self.parse()
@@ -612,18 +557,16 @@ class Document:
612
557
  webbrowser.open('file://' + temp_path)
613
558
  else:
614
559
  print(f"Cannot open files with extension {self.extension}")
615
-
616
560
  def get_section(self, title=None, title_regex=None,title_class=None, format='dict'):
617
- if not self.data:
618
- self.parse()
561
+ if self._data_bool:
562
+ if not self.data:
563
+ self.parse()
619
564
 
620
- result = get_title(self.data,title=title,title_regex=title_regex,title_class=title_class)
621
-
622
- if format == 'text':
623
- result = [item[1] for item in result]
624
- result = [unnest_dict(item) for item in result]
625
-
626
- return result
565
+ result = get_title(self.data,title=title,title_regex=title_regex,title_class=title_class)
566
+ if format == 'dict':
567
+ return [item[1] for item in result]
568
+ else:
569
+ return [flatten_dict(item[1],format) for item in result]
627
570
 
628
571
 
629
572
  # TODO CHANGE THIS
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamule
3
- Version: 2.2.5
3
+ Version: 2.2.7
4
4
  Summary: Work with SEC submissions at scale.
5
5
  Home-page: https://github.com/john-friedman/datamule-python
6
6
  Author: John Friedman
@@ -32,7 +32,7 @@ if not os.path.exists(file_path):
32
32
  setup(
33
33
  name="datamule",
34
34
  author="John Friedman",
35
- version="2.2.5",
35
+ version="2.2.7",
36
36
  description="Work with SEC submissions at scale.",
37
37
  packages=find_packages(include=['datamule', 'datamule.*']),
38
38
  url="https://github.com/john-friedman/datamule-python",
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes