datamule 2.2.8__py3-none-any.whl → 2.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,8 +7,6 @@ from doc2dict import html2dict, visualize_dict, get_title, unnest_dict, pdf2dict
7
7
  from ..mapping_dicts.txt_mapping_dicts import dict_10k, dict_10q, dict_8k, dict_13d, dict_13g
8
8
  from ..mapping_dicts.xml_mapping_dicts import dict_345
9
9
  from ..mapping_dicts.html_mapping_dicts import *
10
- from selectolax.parser import HTMLParser
11
-
12
10
  from pathlib import Path
13
11
  import webbrowser
14
12
  from secsgml.utils import bytes_to_str
@@ -294,30 +292,13 @@ class Document:
294
292
  return bool(re.search(pattern, self.content))
295
293
  return False
296
294
 
297
- # Note: this method will be heavily modified in the future
298
295
  def parse(self):
299
296
  # check if we have already parsed the content
300
297
  if self._data:
301
298
  return
302
299
 
303
300
  mapping_dict = None
304
-
305
- if self.extension == '.txt':
306
- content = self.text
307
- if self.type in ['10-Q', '10-Q/A']:
308
- mapping_dict = dict_10q
309
- elif self.type in ['10-K','10-K/A']:
310
- mapping_dict = dict_10k
311
- elif self.type in ['8-K', '8-K/A']:
312
- mapping_dict = dict_8k
313
- elif self.type in ['SC 13D', 'SC 13D/A']:
314
- mapping_dict = dict_13d
315
- elif self.type in ['SC 13G', 'SC 13G/A']:
316
- mapping_dict = dict_13g
317
-
318
- self._data = {}
319
- self._data['document'] = dict2dict(txt2dict(content=content, mapping_dict=mapping_dict))
320
- elif self.extension in ['.htm', '.html']:
301
+ if self._data_bool:
321
302
 
322
303
  if self.type in ['1-K', '1-K/A']:
323
304
  mapping_dict = dict_1kpartii_html
@@ -391,16 +372,26 @@ class Document:
391
372
  mapping_dict = dict_t3_html
392
373
  elif self.type in ['NT 10-K', 'NT 10-K/A', 'NT 10-Q', 'NT 10-Q/A', 'NT 20-F', 'NT 20-F/A']:
393
374
  mapping_dict = dict_nt10k_html
375
+ elif self.type in ['SC 13G', 'SC 13G/A']:
376
+ mapping_dict = dict_13g
377
+ elif self.type in ['SC 13D', 'SC 13D/A']:
378
+ mapping_dict = dict_13d
379
+
380
+ if self.extension in ['.htm','.html']:
381
+ dct = html2dict(content=self.content, mapping_dict=mapping_dict)
382
+ elif self.extension in ['.txt']:
383
+ dct = txt2dict(content=self.content, mapping_dict=mapping_dict)
384
+ elif self.extension == '.pdf':
385
+ dct = pdf2dict(content=self.content, mapping_dict=mapping_dict)
386
+ else:
387
+ dct = {}
394
388
 
395
- dct = html2dict(content=self.content, mapping_dict=mapping_dict)
396
389
  self._data = dct
397
390
  elif self.extension == '.xml':
398
391
  if self.type in ['3', '4', '5', '3/A', '4/A', '5/A']:
399
392
  mapping_dict = dict_345
400
-
401
393
  self._data = xml2dict(content=self.content, mapping_dict=mapping_dict)
402
- elif self.extension == '.pdf':
403
- self._data = pdf2dict(content=self.content, mapping_dict=mapping_dict)
394
+
404
395
  else:
405
396
  pass
406
397
 
@@ -415,6 +406,12 @@ class Document:
415
406
 
416
407
  if not isinstance(self._data, DataWithTags):
417
408
  self._data = DataWithTags(self._data, self)
409
+ elif self.extension == '.xml':
410
+ if self._data is None:
411
+ self.parse()
412
+
413
+ if self._data is None:
414
+ self._data = {}
418
415
 
419
416
  return self._data
420
417
 
@@ -450,19 +447,46 @@ class Document:
450
447
  json.dump(self.data, f, indent=2)
451
448
 
452
449
  def parse_tables(self,must_exist_in_mapping=True):
453
- if self.extension != '.xml':
454
- self._tables = []
450
+ """Must exist in mapping means columns must occur in mapping schema."""
451
+ if self.extension == '.xml':
452
+ tables = Tables(document_type = self.type, accession=self.accession)
453
+ tables.parse_tables(data=self.data,must_exist_in_mapping=must_exist_in_mapping)
454
+ self._tables = tables
455
+
456
+ elif self._data_bool:
457
+ tables = Tables(document_type = self.type, accession=self.accession)
458
+ data_tuples = self.data_tuples
459
+
460
+ for i, (id, type, content, level) in enumerate(data_tuples):
461
+ if type == "table" and i > 0:
462
+ description = None
463
+
464
+ # Look at previous element
465
+ prev_id, prev_type, prev_content, prev_level = data_tuples[i-1]
466
+
467
+ # Case 1: Same level + text content
468
+ if prev_level == level and prev_type in ["text", "textsmall"]:
469
+ description = prev_content
470
+
471
+ # Case 2: Higher level (lower number) + title
472
+ elif prev_level < level and prev_type == "title":
473
+ description = prev_content
474
+
475
+ # Case 3: No matching description - add table without description
476
+ # (description remains None)
477
+
478
+ tables.add_table(data=content, description=description, name="extracted_table")
479
+
480
+ self._tables = tables
481
+
455
482
  else:
456
- # Use the property to trigger parsing if needed
457
- data = self.data
458
- tables = Tables(document_type = self.type, accession=self.accession, data=data,must_exist_in_mapping=must_exist_in_mapping)
459
- self._tables = tables.tables
483
+ self._tables = []
460
484
 
461
485
  @property
462
486
  def tables(self):
463
487
  if self._tables is None:
464
488
  self.parse_tables()
465
- return self._tables
489
+ return self._tables.tables
466
490
 
467
491
 
468
492
  def write_csv(self, output_folder):
@@ -553,6 +577,7 @@ class Document:
553
577
  webbrowser.open('file://' + temp_path)
554
578
  else:
555
579
  print(f"Cannot open files with extension {self.extension}")
580
+
556
581
  def get_section(self, title=None, title_regex=None,title_class=None, format='dict'):
557
582
  if self._data_bool:
558
583
  if not self.data:
@@ -564,62 +589,8 @@ class Document:
564
589
  else:
565
590
  return [flatten_dict(item[1],format) for item in result]
566
591
 
567
-
568
- # TODO CHANGE THIS
569
- def __iter__(self):
570
- # Use the property to trigger parsing if needed
571
- document_data = self.data
572
-
573
- # Let's remove XML iterable for now
574
-
575
- # Handle text-based documents
576
- if self.extension in ['.txt', '.htm', '.html']:
577
- if not document_data:
578
- return iter([])
579
-
580
- # Find highest hierarchy level from mapping dict
581
- highest_hierarchy = float('inf')
582
- section_type = None
583
-
584
- if self.type in ['10-K', '10-Q']:
585
- mapping_dict = dict_10k if self.type == '10-K' else dict_10q
586
- elif self.type == '8-K':
587
- mapping_dict = dict_8k
588
- elif self.type == 'SC 13D':
589
- mapping_dict = dict_13d
590
- elif self.type == 'SC 13G':
591
- mapping_dict = dict_13g
592
- else:
593
- return iter([])
594
-
595
- # Find section type with highest hierarchy number
596
- highest_hierarchy = -1 # Start at -1 to find highest
597
- for mapping in mapping_dict['rules']['mappings']:
598
- if mapping.get('hierarchy') is not None:
599
- if mapping['hierarchy'] > highest_hierarchy:
600
- highest_hierarchy = mapping['hierarchy']
601
- section_type = mapping['name']
602
-
603
- if not section_type:
604
- return iter([])
605
-
606
- # Extract sections of the identified type
607
- def find_sections(data, target_type):
608
- sections = []
609
- if isinstance(data, dict):
610
- if data.get('type') == target_type:
611
- sections.append({
612
- 'item': data.get('text', ''),
613
- 'text': flatten_hierarchy(data.get('content', []))
614
- })
615
- for value in data.values():
616
- if isinstance(value, (dict, list)):
617
- sections.extend(find_sections(value, target_type))
618
- elif isinstance(data, list):
619
- for item in data:
620
- sections.extend(find_sections(item, target_type))
621
- return sections
622
-
623
- return iter(find_sections(document_data, section_type))
624
-
625
- return iter([])
592
+ # TODO
593
+ def get_tables(self,description_regex=None,name=None):
594
+ # make sure tables is initialized
595
+ self.tables
596
+ return self._tables.get_tables(description_regex=description_regex, name=name)
@@ -6,8 +6,10 @@ from .tables_npx import config_npx
6
6
  from .tables_sbsef import config_sbsef
7
7
  from .tables_sdr import config_sdr
8
8
  from .tables_proxyvotingrecord import config_proxyvotingrecord
9
+ from doc2dict.utils.format_dict import _format_table
9
10
 
10
11
  from .utils import safe_get, flatten_dict
12
+ import re
11
13
  # will add filing date param later? or extension
12
14
  all_tables_dict = {
13
15
  '3' : config_ownership,
@@ -93,25 +95,30 @@ def apply_mapping(flattened_data, mapping_dict, accession, must_exist_in_mapping
93
95
 
94
96
  # should have table type, accession, data
95
97
  class Table:
96
- def __init__(self,data,name,accession):
98
+ def __init__(self,data,name,accession,description = None):
97
99
  self.data = data
98
100
  self.name = name
99
101
  self.accession = accession
102
+ self.description = description
103
+
104
+ # TODO MADE IN A HURRY #
105
+ def __str__(self):
106
+ formatted_table = _format_table(self.data)
107
+ if isinstance(formatted_table, list):
108
+ table_str = '\n'.join(formatted_table)
109
+ else:
110
+ table_str = str(formatted_table)
111
+ return f"Table '{self.name}' ({self.accession}) - {len(self.data) if isinstance(self.data, list) else 'N/A'} rows\ndescription: {self.description if self.description else ''}\n{table_str}"
100
112
 
101
113
 
102
114
  class Tables():
103
- def __init__(self,document_type,accession,data,must_exist_in_mapping=True):
115
+ def __init__(self,document_type,accession):
104
116
  self.document_type = document_type
105
117
  self.accession = accession
106
- self.data = data
107
-
108
- # to fill in
109
118
  self.tables = []
110
119
 
111
- self.parse_tables(must_exist_in_mapping=must_exist_in_mapping)
112
-
113
- def parse_tables(self,must_exist_in_mapping=True):
114
- # first select dict
120
+ def parse_tables(self,data,must_exist_in_mapping=True):
121
+ self.data = data
115
122
 
116
123
  try:
117
124
  tables_dict = all_tables_dict[self.document_type]
@@ -120,11 +127,32 @@ class Tables():
120
127
 
121
128
  # now get the dicts from the data
122
129
  data_dicts = seperate_data(tables_dict,self.data)
123
-
130
+
124
131
  # now flatten
125
132
  data_dicts = [(x,flatten_dict(y)) for x,y in data_dicts]
126
133
 
127
134
  for table_name, flattened_data in data_dicts:
128
135
  mapping_dict = tables_dict[table_name]['mapping']
129
136
  mapped_data = apply_mapping(flattened_data, mapping_dict, self.accession,must_exist_in_mapping)
130
- self.tables.append(Table(mapped_data, table_name, self.accession))
137
+ self.tables.append(Table(mapped_data, table_name, self.accession))
138
+
139
+ def add_table(self,data,name,description=None):
140
+ self.tables.append(Table(data=data,name=name,accession=self.accession,description=description))
141
+
142
+ def get_tables(self, description_regex=None, name=None):
143
+ matching_tables = []
144
+
145
+ for table in self.tables:
146
+ # Check name match (exact match)
147
+ if name is not None:
148
+ if table.name == name:
149
+ matching_tables.append(table)
150
+ continue
151
+
152
+ # Check description regex match
153
+ if description_regex is not None and table.description is not None:
154
+ if re.search(description_regex, table.description):
155
+ matching_tables.append(table)
156
+ continue
157
+
158
+ return matching_tables
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamule
3
- Version: 2.2.8
3
+ Version: 2.3.0
4
4
  Summary: Work with SEC submissions at scale.
5
5
  Home-page: https://github.com/john-friedman/datamule-python
6
6
  Author: John Friedman
@@ -15,9 +15,9 @@ datamule/datamule/datamule_mysql_rds.py,sha256=Q6_h24-SNECWK60RnM6UQjUIp5dhJmfn3
15
15
  datamule/datamule/downloader.py,sha256=B22ULAuYzclxxVCH4DsLWUIyFUC5Iep-Hl1W3RgCfeg,18580
16
16
  datamule/datamule/sec_connector.py,sha256=VwOaODpHoAWy8JIky6kLR1-orW_PB61RHw7pIGRpkow,3288
17
17
  datamule/document/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
- datamule/document/document.py,sha256=d9Gv8_7zJVZhIVYtF3cLT_7MCtWZV1gn9_l3u8us7b0,24275
18
+ datamule/document/document.py,sha256=NrMqhY_u_X7gyvraxY0hzZEDJddqSJDgiHFzkaRTBVA,23102
19
19
  datamule/document/tables/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
20
- datamule/document/tables/tables.py,sha256=8riSAof6o-Gxoo0SkiQAE61fw8NmzDnEhJe6dATzmvA,4487
20
+ datamule/document/tables/tables.py,sha256=uEMDYg7c4iHjVtIjNQgCgZOGp6j9aFWVB05agpVsNOI,5727
21
21
  datamule/document/tables/tables_13fhr.py,sha256=-6tWcaTyNsb0XuW0WMBrYir9Zn1wLZL0laKxRYfPNyg,4265
22
22
  datamule/document/tables/tables_25nse.py,sha256=kpoOcIpra6i3Wx_6pUCj1fkx0wUbMhx7pc8yUkrBJb4,980
23
23
  datamule/document/tables/tables_informationtable.py,sha256=3yjuxYuLoBjRd6O0BNd0jQDmS1XUDjA6xp51Csq2cH8,649
@@ -58,7 +58,7 @@ datamule/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
58
58
  datamule/utils/construct_submissions_data.py,sha256=NB_hvfxlRXPyt4Fgc-5qA8vJRItkLhBedCSTaxwW7Jg,5887
59
59
  datamule/utils/format_accession.py,sha256=60RtqoNqoT9zSKVb1DeOv1gncJxzPTFMNW4SNOVmC_g,476
60
60
  datamule/utils/pdf.py,sha256=Z9xrdVhKex2YdvjYsaPaygRE_J6P_JNiUGkwflz2Hw0,735
61
- datamule-2.2.8.dist-info/METADATA,sha256=MfCW0SCjpYwtorAPr-540bS8VhJ5_4hEwSbDZN_b-Zo,585
62
- datamule-2.2.8.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
63
- datamule-2.2.8.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
64
- datamule-2.2.8.dist-info/RECORD,,
61
+ datamule-2.3.0.dist-info/METADATA,sha256=jUra4jM6LMxAS3IKnrF9urlK6ZI4ZAcl6yimnsD67pk,585
62
+ datamule-2.3.0.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
63
+ datamule-2.3.0.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
64
+ datamule-2.3.0.dist-info/RECORD,,