datamule 2.2.9.tar.gz → 2.3.0.tar.gz

This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (68)
  1. {datamule-2.2.9 → datamule-2.3.0}/PKG-INFO +1 -1
  2. {datamule-2.2.9 → datamule-2.3.0}/datamule/document/document.py +50 -13
  3. {datamule-2.2.9 → datamule-2.3.0}/datamule/document/tables/tables.py +39 -11
  4. {datamule-2.2.9 → datamule-2.3.0}/datamule.egg-info/PKG-INFO +1 -1
  5. {datamule-2.2.9 → datamule-2.3.0}/setup.py +1 -1
  6. {datamule-2.2.9 → datamule-2.3.0}/datamule/__init__.py +0 -0
  7. {datamule-2.2.9 → datamule-2.3.0}/datamule/config.py +0 -0
  8. {datamule-2.2.9 → datamule-2.3.0}/datamule/data/listed_filer_metadata.csv +0 -0
  9. {datamule-2.2.9 → datamule-2.3.0}/datamule/datamule/__init__.py +0 -0
  10. {datamule-2.2.9 → datamule-2.3.0}/datamule/datamule/datamule_lookup.py +0 -0
  11. {datamule-2.2.9 → datamule-2.3.0}/datamule/datamule/datamule_mysql_rds.py +0 -0
  12. {datamule-2.2.9 → datamule-2.3.0}/datamule/datamule/downloader.py +0 -0
  13. {datamule-2.2.9 → datamule-2.3.0}/datamule/datamule/sec_connector.py +0 -0
  14. {datamule-2.2.9 → datamule-2.3.0}/datamule/datasets.py +0 -0
  15. {datamule-2.2.9 → datamule-2.3.0}/datamule/document/__init__.py +0 -0
  16. {datamule-2.2.9 → datamule-2.3.0}/datamule/document/tables/__init__.py +0 -0
  17. {datamule-2.2.9 → datamule-2.3.0}/datamule/document/tables/tables_13fhr.py +0 -0
  18. {datamule-2.2.9 → datamule-2.3.0}/datamule/document/tables/tables_25nse.py +0 -0
  19. {datamule-2.2.9 → datamule-2.3.0}/datamule/document/tables/tables_informationtable.py +0 -0
  20. {datamule-2.2.9 → datamule-2.3.0}/datamule/document/tables/tables_npx.py +0 -0
  21. {datamule-2.2.9 → datamule-2.3.0}/datamule/document/tables/tables_ownership.py +0 -0
  22. {datamule-2.2.9 → datamule-2.3.0}/datamule/document/tables/tables_proxyvotingrecord.py +0 -0
  23. {datamule-2.2.9 → datamule-2.3.0}/datamule/document/tables/tables_sbsef.py +0 -0
  24. {datamule-2.2.9 → datamule-2.3.0}/datamule/document/tables/tables_sdr.py +0 -0
  25. {datamule-2.2.9 → datamule-2.3.0}/datamule/document/tables/utils.py +0 -0
  26. {datamule-2.2.9 → datamule-2.3.0}/datamule/helper.py +0 -0
  27. {datamule-2.2.9 → datamule-2.3.0}/datamule/index.py +0 -0
  28. {datamule-2.2.9 → datamule-2.3.0}/datamule/mapping_dicts/__init__.py +0 -0
  29. {datamule-2.2.9 → datamule-2.3.0}/datamule/mapping_dicts/html_mapping_dicts.py +0 -0
  30. {datamule-2.2.9 → datamule-2.3.0}/datamule/mapping_dicts/txt_mapping_dicts.py +0 -0
  31. {datamule-2.2.9 → datamule-2.3.0}/datamule/mapping_dicts/xml_mapping_dicts.py +0 -0
  32. {datamule-2.2.9 → datamule-2.3.0}/datamule/package_updater.py +0 -0
  33. {datamule-2.2.9 → datamule-2.3.0}/datamule/portfolio.py +0 -0
  34. {datamule-2.2.9 → datamule-2.3.0}/datamule/portfolio_compression_utils.py +0 -0
  35. {datamule-2.2.9 → datamule-2.3.0}/datamule/sec/__init__.py +0 -0
  36. {datamule-2.2.9 → datamule-2.3.0}/datamule/sec/infrastructure/__init__.py +0 -0
  37. {datamule-2.2.9 → datamule-2.3.0}/datamule/sec/infrastructure/submissions_metadata.py +0 -0
  38. {datamule-2.2.9 → datamule-2.3.0}/datamule/sec/submissions/__init__.py +0 -0
  39. {datamule-2.2.9 → datamule-2.3.0}/datamule/sec/submissions/downloader.py +0 -0
  40. {datamule-2.2.9 → datamule-2.3.0}/datamule/sec/submissions/eftsquery.py +0 -0
  41. {datamule-2.2.9 → datamule-2.3.0}/datamule/sec/submissions/monitor.py +0 -0
  42. {datamule-2.2.9 → datamule-2.3.0}/datamule/sec/submissions/streamer.py +0 -0
  43. {datamule-2.2.9 → datamule-2.3.0}/datamule/sec/submissions/textsearch.py +0 -0
  44. {datamule-2.2.9 → datamule-2.3.0}/datamule/sec/utils.py +0 -0
  45. {datamule-2.2.9 → datamule-2.3.0}/datamule/sec/xbrl/__init__.py +0 -0
  46. {datamule-2.2.9 → datamule-2.3.0}/datamule/sec/xbrl/downloadcompanyfacts.py +0 -0
  47. {datamule-2.2.9 → datamule-2.3.0}/datamule/sec/xbrl/filter_xbrl.py +0 -0
  48. {datamule-2.2.9 → datamule-2.3.0}/datamule/sec/xbrl/streamcompanyfacts.py +0 -0
  49. {datamule-2.2.9 → datamule-2.3.0}/datamule/sec/xbrl/xbrlmonitor.py +0 -0
  50. {datamule-2.2.9 → datamule-2.3.0}/datamule/seclibrary/__init__.py +0 -0
  51. {datamule-2.2.9 → datamule-2.3.0}/datamule/seclibrary/bq.py +0 -0
  52. {datamule-2.2.9 → datamule-2.3.0}/datamule/sentiment/__init__.py +0 -0
  53. {datamule-2.2.9 → datamule-2.3.0}/datamule/sheet.py +0 -0
  54. {datamule-2.2.9 → datamule-2.3.0}/datamule/submission.py +0 -0
  55. {datamule-2.2.9 → datamule-2.3.0}/datamule/tags/__init__.py +0 -0
  56. {datamule-2.2.9 → datamule-2.3.0}/datamule/tags/config.py +0 -0
  57. {datamule-2.2.9 → datamule-2.3.0}/datamule/tags/dictionaries.py +0 -0
  58. {datamule-2.2.9 → datamule-2.3.0}/datamule/tags/regex.py +0 -0
  59. {datamule-2.2.9 → datamule-2.3.0}/datamule/tags/utils.py +0 -0
  60. {datamule-2.2.9 → datamule-2.3.0}/datamule/utils/__init__.py +0 -0
  61. {datamule-2.2.9 → datamule-2.3.0}/datamule/utils/construct_submissions_data.py +0 -0
  62. {datamule-2.2.9 → datamule-2.3.0}/datamule/utils/format_accession.py +0 -0
  63. {datamule-2.2.9 → datamule-2.3.0}/datamule/utils/pdf.py +0 -0
  64. {datamule-2.2.9 → datamule-2.3.0}/datamule.egg-info/SOURCES.txt +0 -0
  65. {datamule-2.2.9 → datamule-2.3.0}/datamule.egg-info/dependency_links.txt +0 -0
  66. {datamule-2.2.9 → datamule-2.3.0}/datamule.egg-info/requires.txt +0 -0
  67. {datamule-2.2.9 → datamule-2.3.0}/datamule.egg-info/top_level.txt +0 -0
  68. {datamule-2.2.9 → datamule-2.3.0}/setup.cfg +0 -0
{datamule-2.2.9 → datamule-2.3.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 2.2.9
+Version: 2.3.0
 Summary: Work with SEC submissions at scale.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman
{datamule-2.2.9 → datamule-2.3.0}/datamule/document/document.py

@@ -7,8 +7,6 @@ from doc2dict import html2dict, visualize_dict, get_title, unnest_dict, pdf2dict
 from ..mapping_dicts.txt_mapping_dicts import dict_10k, dict_10q, dict_8k, dict_13d, dict_13g
 from ..mapping_dicts.xml_mapping_dicts import dict_345
 from ..mapping_dicts.html_mapping_dicts import *
-from selectolax.parser import HTMLParser
-
 from pathlib import Path
 import webbrowser
 from secsgml.utils import bytes_to_str

@@ -294,7 +292,6 @@ class Document:
             return bool(re.search(pattern, self.content))
         return False
 
-    # Note: this method will be heavily modified in the future
     def parse(self):
         # check if we have already parsed the content
         if self._data:

@@ -384,6 +381,8 @@ class Document:
                 dct = html2dict(content=self.content, mapping_dict=mapping_dict)
             elif self.extension in ['.txt']:
                 dct = txt2dict(content=self.content, mapping_dict=mapping_dict)
+            elif self.extension == '.pdf':
+                dct = pdf2dict(content=self.content, mapping_dict=mapping_dict)
             else:
                 dct = {}
 
@@ -391,10 +390,8 @@ class Document:
         elif self.extension == '.xml':
             if self.type in ['3', '4', '5', '3/A', '4/A', '5/A']:
                 mapping_dict = dict_345
-
             self._data = xml2dict(content=self.content, mapping_dict=mapping_dict)
-        elif self.extension == '.pdf':
-            self._data = pdf2dict(content=self.content, mapping_dict=mapping_dict)
+
         else:
             pass
 
@@ -409,6 +406,12 @@ class Document:
 
             if not isinstance(self._data, DataWithTags):
                 self._data = DataWithTags(self._data, self)
+        elif self.extension == '.xml':
+            if self._data is None:
+                self.parse()
+
+            if self._data is None:
+                self._data = {}
 
         return self._data
 
@@ -444,19 +447,46 @@ class Document:
             json.dump(self.data, f, indent=2)
 
     def parse_tables(self,must_exist_in_mapping=True):
-        if self.extension != '.xml':
-            self._tables = []
+        """Must exist in mapping means columns must occur in mapping schema."""
+        if self.extension == '.xml':
+            tables = Tables(document_type = self.type, accession=self.accession)
+            tables.parse_tables(data=self.data,must_exist_in_mapping=must_exist_in_mapping)
+            self._tables = tables
+
+        elif self._data_bool:
+            tables = Tables(document_type = self.type, accession=self.accession)
+            data_tuples = self.data_tuples
+
+            for i, (id, type, content, level) in enumerate(data_tuples):
+                if type == "table" and i > 0:
+                    description = None
+
+                    # Look at previous element
+                    prev_id, prev_type, prev_content, prev_level = data_tuples[i-1]
+
+                    # Case 1: Same level + text content
+                    if prev_level == level and prev_type in ["text", "textsmall"]:
+                        description = prev_content
+
+                    # Case 2: Higher level (lower number) + title
+                    elif prev_level < level and prev_type == "title":
+                        description = prev_content
+
+                    # Case 3: No matching description - add table without description
+                    # (description remains None)
+
+                    tables.add_table(data=content, description=description, name="extracted_table")
+
+            self._tables = tables
+
         else:
-            # Use the property to trigger parsing if needed
-            data = self.data
-            tables = Tables(document_type = self.type, accession=self.accession, data=data,must_exist_in_mapping=must_exist_in_mapping)
-            self._tables = tables.tables
+            self._tables = []
 
     @property
     def tables(self):
         if self._tables is None:
             self.parse_tables()
-        return self._tables
+        return self._tables.tables
 
 
     def write_csv(self, output_folder):

@@ -547,6 +577,7 @@ class Document:
                 webbrowser.open('file://' + temp_path)
         else:
             print(f"Cannot open files with extension {self.extension}")
+
     def get_section(self, title=None, title_regex=None,title_class=None, format='dict'):
         if self._data_bool:
             if not self.data:

@@ -557,3 +588,9 @@ class Document:
                 return [item[1] for item in result]
             else:
                 return [flatten_dict(item[1],format) for item in result]
+
+    # TODO
+    def get_tables(self,description_regex=None,name=None):
+        # make sure tables is initialized
+        self.tables
+        return self._tables.get_tables(description_regex=description_regex, name=name)
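
The document.py changes above amount to a new table-extraction path: tables are no longer limited to XML filings, parse_tables() now builds a Tables container for parsed HTML/TXT/PDF content (attaching a best-effort description taken from the preceding same-level text or higher-level title), and the new Document.get_tables() filters the result. A minimal usage sketch; doc is assumed to be an already-loaded datamule Document for an HTML filing (e.g. obtained by iterating a Portfolio), and the regex is illustrative only:

# 'doc' is an assumed, already-loaded datamule Document for an HTML filing.
# Accessing .tables lazily runs parse_tables(), which as of 2.3.0 also extracts
# tables from parsed HTML/TXT/PDF content, not only from XML filings.
tables = doc.tables

# New in this release: filter by exact name or by a regex matched against the
# description captured from the element immediately preceding each table.
matches = doc.get_tables(description_regex=r'(?i)compensation')
for table in matches:
    print(table)  # Table.__str__ prints name, accession, row count, and the formatted rows
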
{datamule-2.2.9 → datamule-2.3.0}/datamule/document/tables/tables.py

@@ -6,8 +6,10 @@ from .tables_npx import config_npx
 from .tables_sbsef import config_sbsef
 from .tables_sdr import config_sdr
 from .tables_proxyvotingrecord import config_proxyvotingrecord
+from doc2dict.utils.format_dict import _format_table
 
 from .utils import safe_get, flatten_dict
+import re
 # will add filing date param later? or extension
 all_tables_dict = {
     '3' : config_ownership,

@@ -93,25 +95,30 @@ def apply_mapping(flattened_data, mapping_dict, accession, must_exist_in_mapping
 
 # should have table type, accession, data
 class Table:
-    def __init__(self,data,name,accession):
+    def __init__(self,data,name,accession,description = None):
         self.data = data
         self.name = name
         self.accession = accession
+        self.description = description
+
+    # TODO MADE IN A HURRY #
+    def __str__(self):
+        formatted_table = _format_table(self.data)
+        if isinstance(formatted_table, list):
+            table_str = '\n'.join(formatted_table)
+        else:
+            table_str = str(formatted_table)
+        return f"Table '{self.name}' ({self.accession}) - {len(self.data) if isinstance(self.data, list) else 'N/A'} rows\ndescription: {self.description if self.description else ''}\n{table_str}"
 
 
 class Tables():
-    def __init__(self,document_type,accession,data,must_exist_in_mapping=True):
+    def __init__(self,document_type,accession):
         self.document_type = document_type
         self.accession = accession
-        self.data = data
-
-        # to fill in
         self.tables = []
 
-        self.parse_tables(must_exist_in_mapping=must_exist_in_mapping)
-
-    def parse_tables(self,must_exist_in_mapping=True):
-        # first select dict
+    def parse_tables(self,data,must_exist_in_mapping=True):
+        self.data = data
 
         try:
             tables_dict = all_tables_dict[self.document_type]

@@ -120,11 +127,32 @@ class Tables():
 
         # now get the dicts from the data
         data_dicts = seperate_data(tables_dict,self.data)
-
+
         # now flatten
         data_dicts = [(x,flatten_dict(y)) for x,y in data_dicts]
 
         for table_name, flattened_data in data_dicts:
             mapping_dict = tables_dict[table_name]['mapping']
             mapped_data = apply_mapping(flattened_data, mapping_dict, self.accession,must_exist_in_mapping)
-            self.tables.append(Table(mapped_data, table_name, self.accession))
+            self.tables.append(Table(mapped_data, table_name, self.accession))
+
+    def add_table(self,data,name,description=None):
+        self.tables.append(Table(data=data,name=name,accession=self.accession,description=description))
+
+    def get_tables(self, description_regex=None, name=None):
+        matching_tables = []
+
+        for table in self.tables:
+            # Check name match (exact match)
+            if name is not None:
+                if table.name == name:
+                    matching_tables.append(table)
+                    continue
+
+            # Check description regex match
+            if description_regex is not None and table.description is not None:
+                if re.search(description_regex, table.description):
+                    matching_tables.append(table)
+                    continue
+
+        return matching_tables
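
For reference, the reworked Tables container separates construction from parsing and can hold manually added tables alongside the mapped XML tables. A small sketch of the new flow against the internal module; the accession number and row data are made up for illustration, and in normal use these objects are created by Document.parse_tables() rather than directly:

from datamule.document.tables.tables import Tables

# Construction no longer takes data; XML parsing is now an explicit second step.
tables = Tables(document_type='3', accession='0000000000-00-000000')
# tables.parse_tables(data=parsed_xml_dict)  # for XML filings, data comes from Document.data

# Tables pulled out of HTML/PDF content are attached via add_table(), carrying the
# optional description that get_tables() can later filter on.
tables.add_table(
    data=[{'column': 'value'}],   # illustrative row data
    name='extracted_table',
    description='Summary compensation table',
)

by_name = tables.get_tables(name='extracted_table')
by_description = tables.get_tables(description_regex='compensation')
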
{datamule-2.2.9 → datamule-2.3.0}/datamule.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 2.2.9
+Version: 2.3.0
 Summary: Work with SEC submissions at scale.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman
{datamule-2.2.9 → datamule-2.3.0}/setup.py

@@ -32,7 +32,7 @@ if not os.path.exists(file_path):
 setup(
     name="datamule",
     author="John Friedman",
-    version="2.2.9",
+    version="2.3.0",
     description="Work with SEC submissions at scale.",
     packages=find_packages(include=['datamule', 'datamule.*']),
     url="https://github.com/john-friedman/datamule-python",