datamule 2.2.5__py3-none-any.whl → 2.2.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamule/document/document.py +33 -90
- {datamule-2.2.5.dist-info → datamule-2.2.7.dist-info}/METADATA +1 -1
- {datamule-2.2.5.dist-info → datamule-2.2.7.dist-info}/RECORD +5 -5
- {datamule-2.2.5.dist-info → datamule-2.2.7.dist-info}/WHEEL +0 -0
- {datamule-2.2.5.dist-info → datamule-2.2.7.dist-info}/top_level.txt +0 -0
datamule/document/document.py
CHANGED
@@ -3,7 +3,7 @@ import csv
|
|
3
3
|
import re
|
4
4
|
from doc2dict import xml2dict, txt2dict, dict2dict
|
5
5
|
from doc2dict.mapping import flatten_hierarchy
|
6
|
-
from doc2dict import html2dict, visualize_dict, get_title, unnest_dict, pdf2dict
|
6
|
+
from doc2dict import html2dict, visualize_dict, get_title, unnest_dict, pdf2dict, flatten_dict
|
7
7
|
from ..mapping_dicts.txt_mapping_dicts import dict_10k, dict_10q, dict_8k, dict_13d, dict_13g
|
8
8
|
from ..mapping_dicts.xml_mapping_dicts import dict_345
|
9
9
|
from ..mapping_dicts.html_mapping_dicts import *
|
@@ -288,77 +288,17 @@ class Document:
|
|
288
288
|
self._data = None
|
289
289
|
self._tables = None
|
290
290
|
self._text = None
|
291
|
+
self._markdown = None
|
291
292
|
|
292
293
|
# booleans
|
293
|
-
self._text_bool = self.extension in ('.htm', '.html','.txt')
|
294
294
|
self._data_bool = self.extension in ('.htm', '.html','.txt')
|
295
|
+
self._text_bool = self._data_bool
|
296
|
+
self._markdown_bool = self._data_bool
|
295
297
|
self._visualize_bool = self._data_bool
|
296
298
|
self._tables_bool = self.extension in ('.xml')
|
297
299
|
|
298
300
|
|
299
301
|
|
300
|
-
#_load_text_content
|
301
|
-
def _preprocess_txt_content(self):
|
302
|
-
self._text = self.content.decode().translate(str.maketrans({
|
303
|
-
'\xa0': ' ', '\u2003': ' ',
|
304
|
-
'\u2018': "'", '\u2019': "'",
|
305
|
-
'\u201c': '"', '\u201d': '"'
|
306
|
-
}))
|
307
|
-
|
308
|
-
# needs work
|
309
|
-
def _preprocess_html_content(self):
|
310
|
-
parser = HTMLParser(self.content,detect_encoding=True,decode_errors='ignore')
|
311
|
-
|
312
|
-
# Remove hidden elements first
|
313
|
-
hidden_nodes = parser.css('[style*="display: none"], [style*="display:none"], .hidden, .hide, .d-none')
|
314
|
-
for node in hidden_nodes:
|
315
|
-
node.decompose()
|
316
|
-
|
317
|
-
blocks = {'p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'article', 'section', 'li', 'td'}
|
318
|
-
lines = []
|
319
|
-
current_line = []
|
320
|
-
|
321
|
-
def flush_line():
|
322
|
-
if current_line:
|
323
|
-
# Don't add spaces between adjacent spans
|
324
|
-
lines.append(''.join(current_line))
|
325
|
-
current_line.clear()
|
326
|
-
|
327
|
-
for node in parser.root.traverse(include_text=True):
|
328
|
-
if node.tag in ('script', 'style', 'css'):
|
329
|
-
continue
|
330
|
-
|
331
|
-
if node.tag in blocks:
|
332
|
-
flush_line()
|
333
|
-
lines.append('')
|
334
|
-
|
335
|
-
if node.text_content:
|
336
|
-
text = node.text_content.strip()
|
337
|
-
if text:
|
338
|
-
if node.tag in blocks:
|
339
|
-
flush_line()
|
340
|
-
lines.append(text)
|
341
|
-
lines.append('')
|
342
|
-
else:
|
343
|
-
# Only add space if nodes aren't directly adjacent
|
344
|
-
if current_line and not current_line[-1].endswith(' '):
|
345
|
-
if node.prev and node.prev.text_content:
|
346
|
-
if node.parent != node.prev.parent or node.prev.next != node:
|
347
|
-
current_line.append(' ')
|
348
|
-
current_line.append(text)
|
349
|
-
|
350
|
-
flush_line()
|
351
|
-
|
352
|
-
text = '\n'.join(lines)
|
353
|
-
while '\n\n\n' in text:
|
354
|
-
text = text.replace('\n\n\n', '\n\n')
|
355
|
-
|
356
|
-
self._text = text.translate(str.maketrans({
|
357
|
-
'\xa0': ' ', '\u2003': ' ',
|
358
|
-
'\u2018': "'", '\u2019': "'",
|
359
|
-
'\u201c': '"', '\u201d': '"'
|
360
|
-
}))
|
361
|
-
|
362
302
|
def contains_string(self, pattern):
|
363
303
|
"""Works for select files"""
|
364
304
|
if self.extension in ['.htm', '.html', '.txt','.xml']:
|
@@ -477,30 +417,35 @@ class Document:
|
|
477
417
|
|
478
418
|
@property
|
479
419
|
def data(self):
|
480
|
-
if self.
|
481
|
-
self.
|
420
|
+
if self._data_bool:
|
421
|
+
if self._data is None:
|
422
|
+
self.parse()
|
482
423
|
|
483
|
-
|
484
|
-
|
485
|
-
|
486
|
-
|
487
|
-
|
424
|
+
if self._data is None:
|
425
|
+
self._data = {}
|
426
|
+
|
427
|
+
if not isinstance(self._data, DataWithTags):
|
428
|
+
self._data = DataWithTags(self._data, self)
|
488
429
|
|
489
430
|
return self._data
|
490
431
|
|
491
432
|
@property
|
492
433
|
def text(self):
|
493
|
-
if self.
|
494
|
-
if self.
|
495
|
-
|
496
|
-
|
497
|
-
self._preprocess_txt_content() # Still sets self._text to plain string
|
498
|
-
|
499
|
-
# Convert the plain string to TextWithTags
|
500
|
-
plain_text = self._text
|
501
|
-
self._text = TextWithTags(plain_text, self)
|
434
|
+
if self._text_bool:
|
435
|
+
if self._text is None:
|
436
|
+
text = flatten_dict(self.data,'text')
|
437
|
+
self._text = TextWithTags(text, self)
|
502
438
|
return self._text
|
503
439
|
|
440
|
+
@property
|
441
|
+
def markdown(self):
|
442
|
+
if self._markdown_bool:
|
443
|
+
if self._markdown is None:
|
444
|
+
self._markdown = flatten_dict(self.data,'markdown')
|
445
|
+
|
446
|
+
return self._markdown
|
447
|
+
|
448
|
+
|
504
449
|
def write_json(self, output_filename=None):
|
505
450
|
if not self.data:
|
506
451
|
self.parse()
|
@@ -612,18 +557,16 @@ class Document:
|
|
612
557
|
webbrowser.open('file://' + temp_path)
|
613
558
|
else:
|
614
559
|
print(f"Cannot open files with extension {self.extension}")
|
615
|
-
|
616
560
|
def get_section(self, title=None, title_regex=None,title_class=None, format='dict'):
|
617
|
-
if
|
618
|
-
self.
|
561
|
+
if self._data_bool:
|
562
|
+
if not self.data:
|
563
|
+
self.parse()
|
619
564
|
|
620
|
-
|
621
|
-
|
622
|
-
|
623
|
-
|
624
|
-
|
625
|
-
|
626
|
-
return result
|
565
|
+
result = get_title(self.data,title=title,title_regex=title_regex,title_class=title_class)
|
566
|
+
if format == 'dict':
|
567
|
+
return [item[1] for item in result]
|
568
|
+
else:
|
569
|
+
return [flatten_dict(item[1],format) for item in result]
|
627
570
|
|
628
571
|
|
629
572
|
# TODO CHANGE THIS
|
@@ -15,7 +15,7 @@ datamule/datamule/datamule_mysql_rds.py,sha256=Q6_h24-SNECWK60RnM6UQjUIp5dhJmfn3
|
|
15
15
|
datamule/datamule/downloader.py,sha256=B22ULAuYzclxxVCH4DsLWUIyFUC5Iep-Hl1W3RgCfeg,18580
|
16
16
|
datamule/datamule/sec_connector.py,sha256=VwOaODpHoAWy8JIky6kLR1-orW_PB61RHw7pIGRpkow,3288
|
17
17
|
datamule/document/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
18
|
-
datamule/document/document.py,sha256=
|
18
|
+
datamule/document/document.py,sha256=tFsNUMVeBvx_3Td5bKPMlEJGjyzQtac4tui8jk2PusE,24629
|
19
19
|
datamule/document/tables/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
20
20
|
datamule/document/tables/tables.py,sha256=8riSAof6o-Gxoo0SkiQAE61fw8NmzDnEhJe6dATzmvA,4487
|
21
21
|
datamule/document/tables/tables_13fhr.py,sha256=-6tWcaTyNsb0XuW0WMBrYir9Zn1wLZL0laKxRYfPNyg,4265
|
@@ -57,7 +57,7 @@ datamule/tags/utils.py,sha256=6B0jtwiFMQAU5mmdqWX_ZRa76uREY-DUBdM_ttt9cXk,6261
|
|
57
57
|
datamule/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
58
58
|
datamule/utils/construct_submissions_data.py,sha256=NB_hvfxlRXPyt4Fgc-5qA8vJRItkLhBedCSTaxwW7Jg,5887
|
59
59
|
datamule/utils/format_accession.py,sha256=60RtqoNqoT9zSKVb1DeOv1gncJxzPTFMNW4SNOVmC_g,476
|
60
|
-
datamule-2.2.
|
61
|
-
datamule-2.2.
|
62
|
-
datamule-2.2.
|
63
|
-
datamule-2.2.
|
60
|
+
datamule-2.2.7.dist-info/METADATA,sha256=WMBfQuS6vgKcVlP04FGpD0BWDMU2nRaMVU_lsFQd9T4,585
|
61
|
+
datamule-2.2.7.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
|
62
|
+
datamule-2.2.7.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
|
63
|
+
datamule-2.2.7.dist-info/RECORD,,
|
File without changes
|
File without changes
|