datamule-2.2.9-py3-none-any.whl → datamule-2.3.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamule/document/document.py +50 -13
- datamule/document/tables/tables.py +39 -11
- {datamule-2.2.9.dist-info → datamule-2.3.0.dist-info}/METADATA +1 -1
- {datamule-2.2.9.dist-info → datamule-2.3.0.dist-info}/RECORD +6 -6
- {datamule-2.2.9.dist-info → datamule-2.3.0.dist-info}/WHEEL +0 -0
- {datamule-2.2.9.dist-info → datamule-2.3.0.dist-info}/top_level.txt +0 -0
datamule/document/document.py
CHANGED
@@ -7,8 +7,6 @@ from doc2dict import html2dict, visualize_dict, get_title, unnest_dict, pdf2dict
|
|
7
7
|
from ..mapping_dicts.txt_mapping_dicts import dict_10k, dict_10q, dict_8k, dict_13d, dict_13g
|
8
8
|
from ..mapping_dicts.xml_mapping_dicts import dict_345
|
9
9
|
from ..mapping_dicts.html_mapping_dicts import *
|
10
|
-
from selectolax.parser import HTMLParser
|
11
|
-
|
12
10
|
from pathlib import Path
|
13
11
|
import webbrowser
|
14
12
|
from secsgml.utils import bytes_to_str
|
@@ -294,7 +292,6 @@ class Document:
|
|
294
292
|
return bool(re.search(pattern, self.content))
|
295
293
|
return False
|
296
294
|
|
297
|
-
# Note: this method will be heavily modified in the future
|
298
295
|
def parse(self):
|
299
296
|
# check if we have already parsed the content
|
300
297
|
if self._data:
|
@@ -384,6 +381,8 @@ class Document:
|
|
384
381
|
dct = html2dict(content=self.content, mapping_dict=mapping_dict)
|
385
382
|
elif self.extension in ['.txt']:
|
386
383
|
dct = txt2dict(content=self.content, mapping_dict=mapping_dict)
|
384
|
+
elif self.extension == '.pdf':
|
385
|
+
dct = pdf2dict(content=self.content, mapping_dict=mapping_dict)
|
387
386
|
else:
|
388
387
|
dct = {}
|
389
388
|
|
@@ -391,10 +390,8 @@ class Document:
|
|
391
390
|
elif self.extension == '.xml':
|
392
391
|
if self.type in ['3', '4', '5', '3/A', '4/A', '5/A']:
|
393
392
|
mapping_dict = dict_345
|
394
|
-
|
395
393
|
self._data = xml2dict(content=self.content, mapping_dict=mapping_dict)
|
396
|
-
|
397
|
-
self._data = pdf2dict(content=self.content, mapping_dict=mapping_dict)
|
394
|
+
|
398
395
|
else:
|
399
396
|
pass
|
400
397
|
|
@@ -409,6 +406,12 @@ class Document:
|
|
409
406
|
|
410
407
|
if not isinstance(self._data, DataWithTags):
|
411
408
|
self._data = DataWithTags(self._data, self)
|
409
|
+
elif self.extension == '.xml':
|
410
|
+
if self._data is None:
|
411
|
+
self.parse()
|
412
|
+
|
413
|
+
if self._data is None:
|
414
|
+
self._data = {}
|
412
415
|
|
413
416
|
return self._data
|
414
417
|
|
@@ -444,19 +447,46 @@ class Document:
|
|
444
447
|
json.dump(self.data, f, indent=2)
|
445
448
|
|
446
449
|
def parse_tables(self,must_exist_in_mapping=True):
|
447
|
-
|
448
|
-
|
450
|
+
"""Must exist in mapping means columns must occur in mapping schema."""
|
451
|
+
if self.extension == '.xml':
|
452
|
+
tables = Tables(document_type = self.type, accession=self.accession)
|
453
|
+
tables.parse_tables(data=self.data,must_exist_in_mapping=must_exist_in_mapping)
|
454
|
+
self._tables = tables
|
455
|
+
|
456
|
+
elif self._data_bool:
|
457
|
+
tables = Tables(document_type = self.type, accession=self.accession)
|
458
|
+
data_tuples = self.data_tuples
|
459
|
+
|
460
|
+
for i, (id, type, content, level) in enumerate(data_tuples):
|
461
|
+
if type == "table" and i > 0:
|
462
|
+
description = None
|
463
|
+
|
464
|
+
# Look at previous element
|
465
|
+
prev_id, prev_type, prev_content, prev_level = data_tuples[i-1]
|
466
|
+
|
467
|
+
# Case 1: Same level + text content
|
468
|
+
if prev_level == level and prev_type in ["text", "textsmall"]:
|
469
|
+
description = prev_content
|
470
|
+
|
471
|
+
# Case 2: Higher level (lower number) + title
|
472
|
+
elif prev_level < level and prev_type == "title":
|
473
|
+
description = prev_content
|
474
|
+
|
475
|
+
# Case 3: No matching description - add table without description
|
476
|
+
# (description remains None)
|
477
|
+
|
478
|
+
tables.add_table(data=content, description=description, name="extracted_table")
|
479
|
+
|
480
|
+
self._tables = tables
|
481
|
+
|
449
482
|
else:
|
450
|
-
|
451
|
-
data = self.data
|
452
|
-
tables = Tables(document_type = self.type, accession=self.accession, data=data,must_exist_in_mapping=must_exist_in_mapping)
|
453
|
-
self._tables = tables.tables
|
483
|
+
self._tables = []
|
454
484
|
|
455
485
|
@property
|
456
486
|
def tables(self):
|
457
487
|
if self._tables is None:
|
458
488
|
self.parse_tables()
|
459
|
-
return self._tables
|
489
|
+
return self._tables.tables
|
460
490
|
|
461
491
|
|
462
492
|
def write_csv(self, output_folder):
|
@@ -547,6 +577,7 @@ class Document:
|
|
547
577
|
webbrowser.open('file://' + temp_path)
|
548
578
|
else:
|
549
579
|
print(f"Cannot open files with extension {self.extension}")
|
580
|
+
|
550
581
|
def get_section(self, title=None, title_regex=None,title_class=None, format='dict'):
|
551
582
|
if self._data_bool:
|
552
583
|
if not self.data:
|
@@ -557,3 +588,9 @@ class Document:
|
|
557
588
|
return [item[1] for item in result]
|
558
589
|
else:
|
559
590
|
return [flatten_dict(item[1],format) for item in result]
|
591
|
+
|
592
|
+
# TODO
|
593
|
+
def get_tables(self,description_regex=None,name=None):
|
594
|
+
# make sure tables is initialized
|
595
|
+
self.tables
|
596
|
+
return self._tables.get_tables(description_regex=description_regex, name=name)
|
datamule/document/tables/tables.py
CHANGED
@@ -6,8 +6,10 @@ from .tables_npx import config_npx
|
|
6
6
|
from .tables_sbsef import config_sbsef
|
7
7
|
from .tables_sdr import config_sdr
|
8
8
|
from .tables_proxyvotingrecord import config_proxyvotingrecord
|
9
|
+
from doc2dict.utils.format_dict import _format_table
|
9
10
|
|
10
11
|
from .utils import safe_get, flatten_dict
|
12
|
+
import re
|
11
13
|
# will add filing date param later? or extension
|
12
14
|
all_tables_dict = {
|
13
15
|
'3' : config_ownership,
|
@@ -93,25 +95,30 @@ def apply_mapping(flattened_data, mapping_dict, accession, must_exist_in_mapping
|
|
93
95
|
|
94
96
|
# should have table type, accession, data
|
95
97
|
class Table:
|
96
|
-
def __init__(self,data,name,accession):
|
98
|
+
def __init__(self,data,name,accession,description = None):
|
97
99
|
self.data = data
|
98
100
|
self.name = name
|
99
101
|
self.accession = accession
|
102
|
+
self.description = description
|
103
|
+
|
104
|
+
# TODO MADE IN A HURRY #
|
105
|
+
def __str__(self):
|
106
|
+
formatted_table = _format_table(self.data)
|
107
|
+
if isinstance(formatted_table, list):
|
108
|
+
table_str = '\n'.join(formatted_table)
|
109
|
+
else:
|
110
|
+
table_str = str(formatted_table)
|
111
|
+
return f"Table '{self.name}' ({self.accession}) - {len(self.data) if isinstance(self.data, list) else 'N/A'} rows\ndescription: {self.description if self.description else ''}\n{table_str}"
|
100
112
|
|
101
113
|
|
102
114
|
class Tables():
|
103
|
-
def __init__(self,document_type,accession
|
115
|
+
def __init__(self,document_type,accession):
|
104
116
|
self.document_type = document_type
|
105
117
|
self.accession = accession
|
106
|
-
self.data = data
|
107
|
-
|
108
|
-
# to fill in
|
109
118
|
self.tables = []
|
110
119
|
|
111
|
-
|
112
|
-
|
113
|
-
def parse_tables(self,must_exist_in_mapping=True):
|
114
|
-
# first select dict
|
120
|
+
def parse_tables(self,data,must_exist_in_mapping=True):
|
121
|
+
self.data = data
|
115
122
|
|
116
123
|
try:
|
117
124
|
tables_dict = all_tables_dict[self.document_type]
|
@@ -120,11 +127,32 @@ class Tables():
|
|
120
127
|
|
121
128
|
# now get the dicts from the data
|
122
129
|
data_dicts = seperate_data(tables_dict,self.data)
|
123
|
-
|
130
|
+
|
124
131
|
# now flatten
|
125
132
|
data_dicts = [(x,flatten_dict(y)) for x,y in data_dicts]
|
126
133
|
|
127
134
|
for table_name, flattened_data in data_dicts:
|
128
135
|
mapping_dict = tables_dict[table_name]['mapping']
|
129
136
|
mapped_data = apply_mapping(flattened_data, mapping_dict, self.accession,must_exist_in_mapping)
|
130
|
-
self.tables.append(Table(mapped_data, table_name, self.accession))
|
137
|
+
self.tables.append(Table(mapped_data, table_name, self.accession))
|
138
|
+
|
139
|
+
def add_table(self,data,name,description=None):
|
140
|
+
self.tables.append(Table(data=data,name=name,accession=self.accession,description=description))
|
141
|
+
|
142
|
+
def get_tables(self, description_regex=None, name=None):
|
143
|
+
matching_tables = []
|
144
|
+
|
145
|
+
for table in self.tables:
|
146
|
+
# Check name match (exact match)
|
147
|
+
if name is not None:
|
148
|
+
if table.name == name:
|
149
|
+
matching_tables.append(table)
|
150
|
+
continue
|
151
|
+
|
152
|
+
# Check description regex match
|
153
|
+
if description_regex is not None and table.description is not None:
|
154
|
+
if re.search(description_regex, table.description):
|
155
|
+
matching_tables.append(table)
|
156
|
+
continue
|
157
|
+
|
158
|
+
return matching_tables
|
{datamule-2.2.9.dist-info → datamule-2.3.0.dist-info}/RECORD
CHANGED
@@ -15,9 +15,9 @@ datamule/datamule/datamule_mysql_rds.py,sha256=Q6_h24-SNECWK60RnM6UQjUIp5dhJmfn3
|
|
15
15
|
datamule/datamule/downloader.py,sha256=B22ULAuYzclxxVCH4DsLWUIyFUC5Iep-Hl1W3RgCfeg,18580
|
16
16
|
datamule/datamule/sec_connector.py,sha256=VwOaODpHoAWy8JIky6kLR1-orW_PB61RHw7pIGRpkow,3288
|
17
17
|
datamule/document/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
18
|
-
datamule/document/document.py,sha256=
|
18
|
+
datamule/document/document.py,sha256=NrMqhY_u_X7gyvraxY0hzZEDJddqSJDgiHFzkaRTBVA,23102
|
19
19
|
datamule/document/tables/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
20
|
-
datamule/document/tables/tables.py,sha256=
|
20
|
+
datamule/document/tables/tables.py,sha256=uEMDYg7c4iHjVtIjNQgCgZOGp6j9aFWVB05agpVsNOI,5727
|
21
21
|
datamule/document/tables/tables_13fhr.py,sha256=-6tWcaTyNsb0XuW0WMBrYir9Zn1wLZL0laKxRYfPNyg,4265
|
22
22
|
datamule/document/tables/tables_25nse.py,sha256=kpoOcIpra6i3Wx_6pUCj1fkx0wUbMhx7pc8yUkrBJb4,980
|
23
23
|
datamule/document/tables/tables_informationtable.py,sha256=3yjuxYuLoBjRd6O0BNd0jQDmS1XUDjA6xp51Csq2cH8,649
|
@@ -58,7 +58,7 @@ datamule/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
58
58
|
datamule/utils/construct_submissions_data.py,sha256=NB_hvfxlRXPyt4Fgc-5qA8vJRItkLhBedCSTaxwW7Jg,5887
|
59
59
|
datamule/utils/format_accession.py,sha256=60RtqoNqoT9zSKVb1DeOv1gncJxzPTFMNW4SNOVmC_g,476
|
60
60
|
datamule/utils/pdf.py,sha256=Z9xrdVhKex2YdvjYsaPaygRE_J6P_JNiUGkwflz2Hw0,735
|
61
|
-
datamule-2.
|
62
|
-
datamule-2.
|
63
|
-
datamule-2.
|
64
|
-
datamule-2.
|
61
|
+
datamule-2.3.0.dist-info/METADATA,sha256=jUra4jM6LMxAS3IKnrF9urlK6ZI4ZAcl6yimnsD67pk,585
|
62
|
+
datamule-2.3.0.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
|
63
|
+
datamule-2.3.0.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
|
64
|
+
datamule-2.3.0.dist-info/RECORD,,
|
{datamule-2.2.9.dist-info → datamule-2.3.0.dist-info}/WHEEL
File without changes
|
{datamule-2.2.9.dist-info → datamule-2.3.0.dist-info}/top_level.txt
File without changes
|