datamule 2.2.8__py3-none-any.whl → 2.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamule/document/document.py +61 -90
- datamule/document/tables/tables.py +39 -11
- {datamule-2.2.8.dist-info → datamule-2.3.0.dist-info}/METADATA +1 -1
- {datamule-2.2.8.dist-info → datamule-2.3.0.dist-info}/RECORD +6 -6
- {datamule-2.2.8.dist-info → datamule-2.3.0.dist-info}/WHEEL +0 -0
- {datamule-2.2.8.dist-info → datamule-2.3.0.dist-info}/top_level.txt +0 -0
datamule/document/document.py
CHANGED
@@ -7,8 +7,6 @@ from doc2dict import html2dict, visualize_dict, get_title, unnest_dict, pdf2dict
|
|
7
7
|
from ..mapping_dicts.txt_mapping_dicts import dict_10k, dict_10q, dict_8k, dict_13d, dict_13g
|
8
8
|
from ..mapping_dicts.xml_mapping_dicts import dict_345
|
9
9
|
from ..mapping_dicts.html_mapping_dicts import *
|
10
|
-
from selectolax.parser import HTMLParser
|
11
|
-
|
12
10
|
from pathlib import Path
|
13
11
|
import webbrowser
|
14
12
|
from secsgml.utils import bytes_to_str
|
@@ -294,30 +292,13 @@ class Document:
|
|
294
292
|
return bool(re.search(pattern, self.content))
|
295
293
|
return False
|
296
294
|
|
297
|
-
# Note: this method will be heavily modified in the future
|
298
295
|
def parse(self):
|
299
296
|
# check if we have already parsed the content
|
300
297
|
if self._data:
|
301
298
|
return
|
302
299
|
|
303
300
|
mapping_dict = None
|
304
|
-
|
305
|
-
if self.extension == '.txt':
|
306
|
-
content = self.text
|
307
|
-
if self.type in ['10-Q', '10-Q/A']:
|
308
|
-
mapping_dict = dict_10q
|
309
|
-
elif self.type in ['10-K','10-K/A']:
|
310
|
-
mapping_dict = dict_10k
|
311
|
-
elif self.type in ['8-K', '8-K/A']:
|
312
|
-
mapping_dict = dict_8k
|
313
|
-
elif self.type in ['SC 13D', 'SC 13D/A']:
|
314
|
-
mapping_dict = dict_13d
|
315
|
-
elif self.type in ['SC 13G', 'SC 13G/A']:
|
316
|
-
mapping_dict = dict_13g
|
317
|
-
|
318
|
-
self._data = {}
|
319
|
-
self._data['document'] = dict2dict(txt2dict(content=content, mapping_dict=mapping_dict))
|
320
|
-
elif self.extension in ['.htm', '.html']:
|
301
|
+
if self._data_bool:
|
321
302
|
|
322
303
|
if self.type in ['1-K', '1-K/A']:
|
323
304
|
mapping_dict = dict_1kpartii_html
|
@@ -391,16 +372,26 @@ class Document:
|
|
391
372
|
mapping_dict = dict_t3_html
|
392
373
|
elif self.type in ['NT 10-K', 'NT 10-K/A', 'NT 10-Q', 'NT 10-Q/A', 'NT 20-F', 'NT 20-F/A']:
|
393
374
|
mapping_dict = dict_nt10k_html
|
375
|
+
elif self.type in ['SC 13G', 'SC 13G/A']:
|
376
|
+
mapping_dict = dict_13g
|
377
|
+
elif self.type in ['SC 13D', 'SC 13D/A']:
|
378
|
+
mapping_dict = dict_13d
|
379
|
+
|
380
|
+
if self.extension in ['.htm','.html']:
|
381
|
+
dct = html2dict(content=self.content, mapping_dict=mapping_dict)
|
382
|
+
elif self.extension in ['.txt']:
|
383
|
+
dct = txt2dict(content=self.content, mapping_dict=mapping_dict)
|
384
|
+
elif self.extension == '.pdf':
|
385
|
+
dct = pdf2dict(content=self.content, mapping_dict=mapping_dict)
|
386
|
+
else:
|
387
|
+
dct = {}
|
394
388
|
|
395
|
-
dct = html2dict(content=self.content, mapping_dict=mapping_dict)
|
396
389
|
self._data = dct
|
397
390
|
elif self.extension == '.xml':
|
398
391
|
if self.type in ['3', '4', '5', '3/A', '4/A', '5/A']:
|
399
392
|
mapping_dict = dict_345
|
400
|
-
|
401
393
|
self._data = xml2dict(content=self.content, mapping_dict=mapping_dict)
|
402
|
-
|
403
|
-
self._data = pdf2dict(content=self.content, mapping_dict=mapping_dict)
|
394
|
+
|
404
395
|
else:
|
405
396
|
pass
|
406
397
|
|
@@ -415,6 +406,12 @@ class Document:
|
|
415
406
|
|
416
407
|
if not isinstance(self._data, DataWithTags):
|
417
408
|
self._data = DataWithTags(self._data, self)
|
409
|
+
elif self.extension == '.xml':
|
410
|
+
if self._data is None:
|
411
|
+
self.parse()
|
412
|
+
|
413
|
+
if self._data is None:
|
414
|
+
self._data = {}
|
418
415
|
|
419
416
|
return self._data
|
420
417
|
|
@@ -450,19 +447,46 @@ class Document:
|
|
450
447
|
json.dump(self.data, f, indent=2)
|
451
448
|
|
452
449
|
def parse_tables(self,must_exist_in_mapping=True):
|
453
|
-
|
454
|
-
|
450
|
+
"""Must exist in mapping means columns must occur in mapping schema."""
|
451
|
+
if self.extension == '.xml':
|
452
|
+
tables = Tables(document_type = self.type, accession=self.accession)
|
453
|
+
tables.parse_tables(data=self.data,must_exist_in_mapping=must_exist_in_mapping)
|
454
|
+
self._tables = tables
|
455
|
+
|
456
|
+
elif self._data_bool:
|
457
|
+
tables = Tables(document_type = self.type, accession=self.accession)
|
458
|
+
data_tuples = self.data_tuples
|
459
|
+
|
460
|
+
for i, (id, type, content, level) in enumerate(data_tuples):
|
461
|
+
if type == "table" and i > 0:
|
462
|
+
description = None
|
463
|
+
|
464
|
+
# Look at previous element
|
465
|
+
prev_id, prev_type, prev_content, prev_level = data_tuples[i-1]
|
466
|
+
|
467
|
+
# Case 1: Same level + text content
|
468
|
+
if prev_level == level and prev_type in ["text", "textsmall"]:
|
469
|
+
description = prev_content
|
470
|
+
|
471
|
+
# Case 2: Higher level (lower number) + title
|
472
|
+
elif prev_level < level and prev_type == "title":
|
473
|
+
description = prev_content
|
474
|
+
|
475
|
+
# Case 3: No matching description - add table without description
|
476
|
+
# (description remains None)
|
477
|
+
|
478
|
+
tables.add_table(data=content, description=description, name="extracted_table")
|
479
|
+
|
480
|
+
self._tables = tables
|
481
|
+
|
455
482
|
else:
|
456
|
-
|
457
|
-
data = self.data
|
458
|
-
tables = Tables(document_type = self.type, accession=self.accession, data=data,must_exist_in_mapping=must_exist_in_mapping)
|
459
|
-
self._tables = tables.tables
|
483
|
+
self._tables = []
|
460
484
|
|
461
485
|
@property
|
462
486
|
def tables(self):
|
463
487
|
if self._tables is None:
|
464
488
|
self.parse_tables()
|
465
|
-
return self._tables
|
489
|
+
return self._tables.tables
|
466
490
|
|
467
491
|
|
468
492
|
def write_csv(self, output_folder):
|
@@ -553,6 +577,7 @@ class Document:
|
|
553
577
|
webbrowser.open('file://' + temp_path)
|
554
578
|
else:
|
555
579
|
print(f"Cannot open files with extension {self.extension}")
|
580
|
+
|
556
581
|
def get_section(self, title=None, title_regex=None,title_class=None, format='dict'):
|
557
582
|
if self._data_bool:
|
558
583
|
if not self.data:
|
@@ -564,62 +589,8 @@ class Document:
|
|
564
589
|
else:
|
565
590
|
return [flatten_dict(item[1],format) for item in result]
|
566
591
|
|
567
|
-
|
568
|
-
|
569
|
-
|
570
|
-
|
571
|
-
|
572
|
-
|
573
|
-
# Let's remove XML iterable for now
|
574
|
-
|
575
|
-
# Handle text-based documents
|
576
|
-
if self.extension in ['.txt', '.htm', '.html']:
|
577
|
-
if not document_data:
|
578
|
-
return iter([])
|
579
|
-
|
580
|
-
# Find highest hierarchy level from mapping dict
|
581
|
-
highest_hierarchy = float('inf')
|
582
|
-
section_type = None
|
583
|
-
|
584
|
-
if self.type in ['10-K', '10-Q']:
|
585
|
-
mapping_dict = dict_10k if self.type == '10-K' else dict_10q
|
586
|
-
elif self.type == '8-K':
|
587
|
-
mapping_dict = dict_8k
|
588
|
-
elif self.type == 'SC 13D':
|
589
|
-
mapping_dict = dict_13d
|
590
|
-
elif self.type == 'SC 13G':
|
591
|
-
mapping_dict = dict_13g
|
592
|
-
else:
|
593
|
-
return iter([])
|
594
|
-
|
595
|
-
# Find section type with highest hierarchy number
|
596
|
-
highest_hierarchy = -1 # Start at -1 to find highest
|
597
|
-
for mapping in mapping_dict['rules']['mappings']:
|
598
|
-
if mapping.get('hierarchy') is not None:
|
599
|
-
if mapping['hierarchy'] > highest_hierarchy:
|
600
|
-
highest_hierarchy = mapping['hierarchy']
|
601
|
-
section_type = mapping['name']
|
602
|
-
|
603
|
-
if not section_type:
|
604
|
-
return iter([])
|
605
|
-
|
606
|
-
# Extract sections of the identified type
|
607
|
-
def find_sections(data, target_type):
|
608
|
-
sections = []
|
609
|
-
if isinstance(data, dict):
|
610
|
-
if data.get('type') == target_type:
|
611
|
-
sections.append({
|
612
|
-
'item': data.get('text', ''),
|
613
|
-
'text': flatten_hierarchy(data.get('content', []))
|
614
|
-
})
|
615
|
-
for value in data.values():
|
616
|
-
if isinstance(value, (dict, list)):
|
617
|
-
sections.extend(find_sections(value, target_type))
|
618
|
-
elif isinstance(data, list):
|
619
|
-
for item in data:
|
620
|
-
sections.extend(find_sections(item, target_type))
|
621
|
-
return sections
|
622
|
-
|
623
|
-
return iter(find_sections(document_data, section_type))
|
624
|
-
|
625
|
-
return iter([])
|
592
|
+
# TODO
|
593
|
+
def get_tables(self,description_regex=None,name=None):
|
594
|
+
# make sure tables is initialized
|
595
|
+
self.tables
|
596
|
+
return self._tables.get_tables(description_regex=description_regex, name=name)
|
@@ -6,8 +6,10 @@ from .tables_npx import config_npx
|
|
6
6
|
from .tables_sbsef import config_sbsef
|
7
7
|
from .tables_sdr import config_sdr
|
8
8
|
from .tables_proxyvotingrecord import config_proxyvotingrecord
|
9
|
+
from doc2dict.utils.format_dict import _format_table
|
9
10
|
|
10
11
|
from .utils import safe_get, flatten_dict
|
12
|
+
import re
|
11
13
|
# will add filing date param later? or extension
|
12
14
|
all_tables_dict = {
|
13
15
|
'3' : config_ownership,
|
@@ -93,25 +95,30 @@ def apply_mapping(flattened_data, mapping_dict, accession, must_exist_in_mapping
|
|
93
95
|
|
94
96
|
# should have table type, accession, data
|
95
97
|
class Table:
|
96
|
-
def __init__(self,data,name,accession):
|
98
|
+
def __init__(self,data,name,accession,description = None):
|
97
99
|
self.data = data
|
98
100
|
self.name = name
|
99
101
|
self.accession = accession
|
102
|
+
self.description = description
|
103
|
+
|
104
|
+
# TODO MADE IN A HURRY #
|
105
|
+
def __str__(self):
|
106
|
+
formatted_table = _format_table(self.data)
|
107
|
+
if isinstance(formatted_table, list):
|
108
|
+
table_str = '\n'.join(formatted_table)
|
109
|
+
else:
|
110
|
+
table_str = str(formatted_table)
|
111
|
+
return f"Table '{self.name}' ({self.accession}) - {len(self.data) if isinstance(self.data, list) else 'N/A'} rows\ndescription: {self.description if self.description else ''}\n{table_str}"
|
100
112
|
|
101
113
|
|
102
114
|
class Tables():
|
103
|
-
def __init__(self,document_type,accession
|
115
|
+
def __init__(self,document_type,accession):
|
104
116
|
self.document_type = document_type
|
105
117
|
self.accession = accession
|
106
|
-
self.data = data
|
107
|
-
|
108
|
-
# to fill in
|
109
118
|
self.tables = []
|
110
119
|
|
111
|
-
|
112
|
-
|
113
|
-
def parse_tables(self,must_exist_in_mapping=True):
|
114
|
-
# first select dict
|
120
|
+
def parse_tables(self,data,must_exist_in_mapping=True):
|
121
|
+
self.data = data
|
115
122
|
|
116
123
|
try:
|
117
124
|
tables_dict = all_tables_dict[self.document_type]
|
@@ -120,11 +127,32 @@ class Tables():
|
|
120
127
|
|
121
128
|
# now get the dicts from the data
|
122
129
|
data_dicts = seperate_data(tables_dict,self.data)
|
123
|
-
|
130
|
+
|
124
131
|
# now flatten
|
125
132
|
data_dicts = [(x,flatten_dict(y)) for x,y in data_dicts]
|
126
133
|
|
127
134
|
for table_name, flattened_data in data_dicts:
|
128
135
|
mapping_dict = tables_dict[table_name]['mapping']
|
129
136
|
mapped_data = apply_mapping(flattened_data, mapping_dict, self.accession,must_exist_in_mapping)
|
130
|
-
self.tables.append(Table(mapped_data, table_name, self.accession))
|
137
|
+
self.tables.append(Table(mapped_data, table_name, self.accession))
|
138
|
+
|
139
|
+
def add_table(self,data,name,description=None):
|
140
|
+
self.tables.append(Table(data=data,name=name,accession=self.accession,description=description))
|
141
|
+
|
142
|
+
def get_tables(self, description_regex=None, name=None):
|
143
|
+
matching_tables = []
|
144
|
+
|
145
|
+
for table in self.tables:
|
146
|
+
# Check name match (exact match)
|
147
|
+
if name is not None:
|
148
|
+
if table.name == name:
|
149
|
+
matching_tables.append(table)
|
150
|
+
continue
|
151
|
+
|
152
|
+
# Check description regex match
|
153
|
+
if description_regex is not None and table.description is not None:
|
154
|
+
if re.search(description_regex, table.description):
|
155
|
+
matching_tables.append(table)
|
156
|
+
continue
|
157
|
+
|
158
|
+
return matching_tables
|
@@ -15,9 +15,9 @@ datamule/datamule/datamule_mysql_rds.py,sha256=Q6_h24-SNECWK60RnM6UQjUIp5dhJmfn3
|
|
15
15
|
datamule/datamule/downloader.py,sha256=B22ULAuYzclxxVCH4DsLWUIyFUC5Iep-Hl1W3RgCfeg,18580
|
16
16
|
datamule/datamule/sec_connector.py,sha256=VwOaODpHoAWy8JIky6kLR1-orW_PB61RHw7pIGRpkow,3288
|
17
17
|
datamule/document/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
18
|
-
datamule/document/document.py,sha256=
|
18
|
+
datamule/document/document.py,sha256=NrMqhY_u_X7gyvraxY0hzZEDJddqSJDgiHFzkaRTBVA,23102
|
19
19
|
datamule/document/tables/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
20
|
-
datamule/document/tables/tables.py,sha256=
|
20
|
+
datamule/document/tables/tables.py,sha256=uEMDYg7c4iHjVtIjNQgCgZOGp6j9aFWVB05agpVsNOI,5727
|
21
21
|
datamule/document/tables/tables_13fhr.py,sha256=-6tWcaTyNsb0XuW0WMBrYir9Zn1wLZL0laKxRYfPNyg,4265
|
22
22
|
datamule/document/tables/tables_25nse.py,sha256=kpoOcIpra6i3Wx_6pUCj1fkx0wUbMhx7pc8yUkrBJb4,980
|
23
23
|
datamule/document/tables/tables_informationtable.py,sha256=3yjuxYuLoBjRd6O0BNd0jQDmS1XUDjA6xp51Csq2cH8,649
|
@@ -58,7 +58,7 @@ datamule/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
58
58
|
datamule/utils/construct_submissions_data.py,sha256=NB_hvfxlRXPyt4Fgc-5qA8vJRItkLhBedCSTaxwW7Jg,5887
|
59
59
|
datamule/utils/format_accession.py,sha256=60RtqoNqoT9zSKVb1DeOv1gncJxzPTFMNW4SNOVmC_g,476
|
60
60
|
datamule/utils/pdf.py,sha256=Z9xrdVhKex2YdvjYsaPaygRE_J6P_JNiUGkwflz2Hw0,735
|
61
|
-
datamule-2.
|
62
|
-
datamule-2.
|
63
|
-
datamule-2.
|
64
|
-
datamule-2.
|
61
|
+
datamule-2.3.0.dist-info/METADATA,sha256=jUra4jM6LMxAS3IKnrF9urlK6ZI4ZAcl6yimnsD67pk,585
|
62
|
+
datamule-2.3.0.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
|
63
|
+
datamule-2.3.0.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
|
64
|
+
datamule-2.3.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|