datamule 2.2.5__tar.gz → 2.2.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datamule-2.2.5 → datamule-2.2.6}/PKG-INFO +1 -1
- {datamule-2.2.5 → datamule-2.2.6}/datamule/document/document.py +17 -73
- {datamule-2.2.5 → datamule-2.2.6}/datamule.egg-info/PKG-INFO +1 -1
- {datamule-2.2.5 → datamule-2.2.6}/setup.py +1 -1
- {datamule-2.2.5 → datamule-2.2.6}/datamule/__init__.py +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule/config.py +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule/data/listed_filer_metadata.csv +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule/datamule/__init__.py +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule/datamule/datamule_lookup.py +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule/datamule/datamule_mysql_rds.py +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule/datamule/downloader.py +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule/datamule/sec_connector.py +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule/datasets.py +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule/document/__init__.py +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule/document/tables/__init__.py +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule/document/tables/tables.py +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule/document/tables/tables_13fhr.py +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule/document/tables/tables_25nse.py +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule/document/tables/tables_informationtable.py +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule/document/tables/tables_npx.py +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule/document/tables/tables_ownership.py +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule/document/tables/tables_proxyvotingrecord.py +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule/document/tables/tables_sbsef.py +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule/document/tables/tables_sdr.py +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule/document/tables/utils.py +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule/helper.py +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule/index.py +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule/mapping_dicts/__init__.py +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule/mapping_dicts/html_mapping_dicts.py +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule/mapping_dicts/txt_mapping_dicts.py +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule/mapping_dicts/xml_mapping_dicts.py +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule/package_updater.py +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule/portfolio.py +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule/portfolio_compression_utils.py +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule/sec/__init__.py +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule/sec/infrastructure/__init__.py +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule/sec/infrastructure/submissions_metadata.py +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule/sec/submissions/__init__.py +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule/sec/submissions/downloader.py +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule/sec/submissions/eftsquery.py +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule/sec/submissions/monitor.py +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule/sec/submissions/streamer.py +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule/sec/submissions/textsearch.py +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule/sec/utils.py +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule/sec/xbrl/__init__.py +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule/sec/xbrl/downloadcompanyfacts.py +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule/sec/xbrl/filter_xbrl.py +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule/sec/xbrl/streamcompanyfacts.py +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule/sec/xbrl/xbrlmonitor.py +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule/seclibrary/__init__.py +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule/seclibrary/bq.py +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule/sentiment/__init__.py +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule/sheet.py +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule/submission.py +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule/tags/__init__.py +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule/tags/config.py +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule/tags/dictionaries.py +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule/tags/regex.py +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule/tags/utils.py +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule/utils/__init__.py +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule/utils/construct_submissions_data.py +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule/utils/format_accession.py +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule.egg-info/SOURCES.txt +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule.egg-info/dependency_links.txt +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule.egg-info/requires.txt +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/datamule.egg-info/top_level.txt +0 -0
- {datamule-2.2.5 → datamule-2.2.6}/setup.cfg +0 -0
@@ -3,7 +3,7 @@ import csv
|
|
3
3
|
import re
|
4
4
|
from doc2dict import xml2dict, txt2dict, dict2dict
|
5
5
|
from doc2dict.mapping import flatten_hierarchy
|
6
|
-
from doc2dict import html2dict, visualize_dict, get_title, unnest_dict, pdf2dict
|
6
|
+
from doc2dict import html2dict, visualize_dict, get_title, unnest_dict, pdf2dict, flatten_dict
|
7
7
|
from ..mapping_dicts.txt_mapping_dicts import dict_10k, dict_10q, dict_8k, dict_13d, dict_13g
|
8
8
|
from ..mapping_dicts.xml_mapping_dicts import dict_345
|
9
9
|
from ..mapping_dicts.html_mapping_dicts import *
|
@@ -288,77 +288,17 @@ class Document:
|
|
288
288
|
self._data = None
|
289
289
|
self._tables = None
|
290
290
|
self._text = None
|
291
|
+
self._markdown = None
|
291
292
|
|
292
293
|
# booleans
|
293
|
-
self._text_bool = self.extension in ('.htm', '.html','.txt')
|
294
294
|
self._data_bool = self.extension in ('.htm', '.html','.txt')
|
295
|
+
self._text_bool = self._data_bool
|
296
|
+
self._markdown_bool = self._data_bool
|
295
297
|
self._visualize_bool = self._data_bool
|
296
298
|
self._tables_bool = self.extension in ('.xml')
|
297
299
|
|
298
300
|
|
299
301
|
|
300
|
-
#_load_text_content
|
301
|
-
def _preprocess_txt_content(self):
|
302
|
-
self._text = self.content.decode().translate(str.maketrans({
|
303
|
-
'\xa0': ' ', '\u2003': ' ',
|
304
|
-
'\u2018': "'", '\u2019': "'",
|
305
|
-
'\u201c': '"', '\u201d': '"'
|
306
|
-
}))
|
307
|
-
|
308
|
-
# needs work
|
309
|
-
def _preprocess_html_content(self):
|
310
|
-
parser = HTMLParser(self.content,detect_encoding=True,decode_errors='ignore')
|
311
|
-
|
312
|
-
# Remove hidden elements first
|
313
|
-
hidden_nodes = parser.css('[style*="display: none"], [style*="display:none"], .hidden, .hide, .d-none')
|
314
|
-
for node in hidden_nodes:
|
315
|
-
node.decompose()
|
316
|
-
|
317
|
-
blocks = {'p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'article', 'section', 'li', 'td'}
|
318
|
-
lines = []
|
319
|
-
current_line = []
|
320
|
-
|
321
|
-
def flush_line():
|
322
|
-
if current_line:
|
323
|
-
# Don't add spaces between adjacent spans
|
324
|
-
lines.append(''.join(current_line))
|
325
|
-
current_line.clear()
|
326
|
-
|
327
|
-
for node in parser.root.traverse(include_text=True):
|
328
|
-
if node.tag in ('script', 'style', 'css'):
|
329
|
-
continue
|
330
|
-
|
331
|
-
if node.tag in blocks:
|
332
|
-
flush_line()
|
333
|
-
lines.append('')
|
334
|
-
|
335
|
-
if node.text_content:
|
336
|
-
text = node.text_content.strip()
|
337
|
-
if text:
|
338
|
-
if node.tag in blocks:
|
339
|
-
flush_line()
|
340
|
-
lines.append(text)
|
341
|
-
lines.append('')
|
342
|
-
else:
|
343
|
-
# Only add space if nodes aren't directly adjacent
|
344
|
-
if current_line and not current_line[-1].endswith(' '):
|
345
|
-
if node.prev and node.prev.text_content:
|
346
|
-
if node.parent != node.prev.parent or node.prev.next != node:
|
347
|
-
current_line.append(' ')
|
348
|
-
current_line.append(text)
|
349
|
-
|
350
|
-
flush_line()
|
351
|
-
|
352
|
-
text = '\n'.join(lines)
|
353
|
-
while '\n\n\n' in text:
|
354
|
-
text = text.replace('\n\n\n', '\n\n')
|
355
|
-
|
356
|
-
self._text = text.translate(str.maketrans({
|
357
|
-
'\xa0': ' ', '\u2003': ' ',
|
358
|
-
'\u2018': "'", '\u2019': "'",
|
359
|
-
'\u201c': '"', '\u201d': '"'
|
360
|
-
}))
|
361
|
-
|
362
302
|
def contains_string(self, pattern):
|
363
303
|
"""Works for select files"""
|
364
304
|
if self.extension in ['.htm', '.html', '.txt','.xml']:
|
@@ -490,17 +430,21 @@ class Document:
|
|
490
430
|
|
491
431
|
@property
|
492
432
|
def text(self):
|
493
|
-
if self.
|
494
|
-
if self.
|
495
|
-
|
496
|
-
|
497
|
-
self._preprocess_txt_content() # Still sets self._text to plain string
|
498
|
-
|
499
|
-
# Convert the plain string to TextWithTags
|
500
|
-
plain_text = self._text
|
501
|
-
self._text = TextWithTags(plain_text, self)
|
433
|
+
if self._text_bool:
|
434
|
+
if self._text is None:
|
435
|
+
text = flatten_dict(self.data,'text')
|
436
|
+
self._text = TextWithTags(text, self)
|
502
437
|
return self._text
|
503
438
|
|
439
|
+
@property
|
440
|
+
def markdown(self):
|
441
|
+
if self._markdown_bool:
|
442
|
+
if self._markdown is None:
|
443
|
+
self._markdown = flatten_dict(self.data,'markdown')
|
444
|
+
|
445
|
+
return self._markdown
|
446
|
+
|
447
|
+
|
504
448
|
def write_json(self, output_filename=None):
|
505
449
|
if not self.data:
|
506
450
|
self.parse()
|
@@ -32,7 +32,7 @@ if not os.path.exists(file_path):
|
|
32
32
|
setup(
|
33
33
|
name="datamule",
|
34
34
|
author="John Friedman",
|
35
|
-
version="2.2.5",
|
35
|
+
version="2.2.6",
|
36
36
|
description="Work with SEC submissions at scale.",
|
37
37
|
packages=find_packages(include=['datamule', 'datamule.*']),
|
38
38
|
url="https://github.com/john-friedman/datamule-python",
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|