datamule 2.2.7__tar.gz → 2.2.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datamule-2.2.7 → datamule-2.2.8}/PKG-INFO +1 -1
- {datamule-2.2.7 → datamule-2.2.8}/datamule/document/document.py +19 -23
- datamule-2.2.8/datamule/utils/pdf.py +25 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule.egg-info/PKG-INFO +1 -1
- {datamule-2.2.7 → datamule-2.2.8}/datamule.egg-info/SOURCES.txt +2 -1
- {datamule-2.2.7 → datamule-2.2.8}/setup.py +1 -1
- {datamule-2.2.7 → datamule-2.2.8}/datamule/__init__.py +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule/config.py +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule/data/listed_filer_metadata.csv +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule/datamule/__init__.py +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule/datamule/datamule_lookup.py +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule/datamule/datamule_mysql_rds.py +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule/datamule/downloader.py +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule/datamule/sec_connector.py +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule/datasets.py +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule/document/__init__.py +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule/document/tables/__init__.py +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule/document/tables/tables.py +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule/document/tables/tables_13fhr.py +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule/document/tables/tables_25nse.py +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule/document/tables/tables_informationtable.py +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule/document/tables/tables_npx.py +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule/document/tables/tables_ownership.py +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule/document/tables/tables_proxyvotingrecord.py +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule/document/tables/tables_sbsef.py +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule/document/tables/tables_sdr.py +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule/document/tables/utils.py +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule/helper.py +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule/index.py +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule/mapping_dicts/__init__.py +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule/mapping_dicts/html_mapping_dicts.py +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule/mapping_dicts/txt_mapping_dicts.py +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule/mapping_dicts/xml_mapping_dicts.py +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule/package_updater.py +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule/portfolio.py +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule/portfolio_compression_utils.py +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule/sec/__init__.py +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule/sec/infrastructure/__init__.py +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule/sec/infrastructure/submissions_metadata.py +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule/sec/submissions/__init__.py +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule/sec/submissions/downloader.py +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule/sec/submissions/eftsquery.py +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule/sec/submissions/monitor.py +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule/sec/submissions/streamer.py +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule/sec/submissions/textsearch.py +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule/sec/utils.py +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule/sec/xbrl/__init__.py +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule/sec/xbrl/downloadcompanyfacts.py +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule/sec/xbrl/filter_xbrl.py +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule/sec/xbrl/streamcompanyfacts.py +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule/sec/xbrl/xbrlmonitor.py +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule/seclibrary/__init__.py +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule/seclibrary/bq.py +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule/sentiment/__init__.py +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule/sheet.py +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule/submission.py +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule/tags/__init__.py +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule/tags/config.py +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule/tags/dictionaries.py +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule/tags/regex.py +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule/tags/utils.py +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule/utils/__init__.py +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule/utils/construct_submissions_data.py +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule/utils/format_accession.py +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule.egg-info/dependency_links.txt +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule.egg-info/requires.txt +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/datamule.egg-info/top_level.txt +0 -0
- {datamule-2.2.7 → datamule-2.2.8}/setup.cfg +0 -0
@@ -16,6 +16,7 @@ import tempfile
|
|
16
16
|
from .tables.tables import Tables
|
17
17
|
|
18
18
|
from ..tags.utils import get_cusip_using_regex, get_isin_using_regex, get_figi_using_regex,get_all_tickers, get_full_names,get_full_names_dictionary_lookup, analyze_lm_sentiment_fragment
|
19
|
+
from ..utils.pdf import has_extractable_text
|
19
20
|
|
20
21
|
class DataWithTags(dict):
|
21
22
|
def __init__(self, data, document):
|
@@ -113,29 +114,9 @@ class TextAnalysisBase:
|
|
113
114
|
# Original behavior - single text source
|
114
115
|
self._text_sources = [{'id': None, 'text': str(self.document.text)}]
|
115
116
|
else: # mode == 'data'
|
116
|
-
|
117
|
-
self._text_sources = []
|
118
|
-
self._extract_text_fragments(self.document.data, '')
|
117
|
+
self._text_sources = [{'id':data_tuple[0],'text':data_tuple[2]} for data_tuple in self.document.data_tuples if data_tuple[1] in ['text','title','textsmall']]
|
119
118
|
return self._text_sources
|
120
119
|
|
121
|
-
def _extract_text_fragments(self, data, parent_id=''):
|
122
|
-
"""Extract all text fragments with their document IDs from parsed data"""
|
123
|
-
if isinstance(data, dict):
|
124
|
-
for key, value in data.items():
|
125
|
-
if key in ["text", "title"] and isinstance(value, str):
|
126
|
-
# Use the current dictionary's parent key as the fragment ID
|
127
|
-
self._text_sources.append({
|
128
|
-
'id': parent_id,
|
129
|
-
'text': value
|
130
|
-
})
|
131
|
-
elif isinstance(value, (dict, list)):
|
132
|
-
# Pass the current key as the parent_id for the next level
|
133
|
-
self._extract_text_fragments(value, key)
|
134
|
-
elif isinstance(data, list):
|
135
|
-
for i, item in enumerate(data):
|
136
|
-
if isinstance(item, (dict, list)):
|
137
|
-
self._extract_text_fragments(item, parent_id)
|
138
|
-
|
139
120
|
def _format_results(self, results, fragment_id):
|
140
121
|
"""Format results based on mode"""
|
141
122
|
if self.mode == 'text':
|
@@ -286,12 +267,20 @@ class Document:
|
|
286
267
|
|
287
268
|
# this will be filled by parsed
|
288
269
|
self._data = None
|
270
|
+
self._data_tuples = None
|
289
271
|
self._tables = None
|
290
272
|
self._text = None
|
291
273
|
self._markdown = None
|
292
274
|
|
293
275
|
# booleans
|
294
276
|
self._data_bool = self.extension in ('.htm', '.html','.txt')
|
277
|
+
|
278
|
+
# may slow things down?
|
279
|
+
if self.extension == '.pdf':
|
280
|
+
if has_extractable_text(pdf_bytes=self.content):
|
281
|
+
self._data_bool = True
|
282
|
+
|
283
|
+
self._data_tuples_bool = self._data_bool
|
295
284
|
self._text_bool = self._data_bool
|
296
285
|
self._markdown_bool = self._data_bool
|
297
286
|
self._visualize_bool = self._data_bool
|
@@ -429,11 +418,18 @@ class Document:
|
|
429
418
|
|
430
419
|
return self._data
|
431
420
|
|
421
|
+
@property
|
422
|
+
def data_tuples(self):
|
423
|
+
if self._data_bool:
|
424
|
+
if self._data_tuples is None:
|
425
|
+
self._data_tuples = unnest_dict(self.data)
|
426
|
+
return self._data_tuples
|
427
|
+
|
432
428
|
@property
|
433
429
|
def text(self):
|
434
430
|
if self._text_bool:
|
435
431
|
if self._text is None:
|
436
|
-
text = flatten_dict(self.
|
432
|
+
text = flatten_dict(tuples_list=self.data_tuples,format='text')
|
437
433
|
self._text = TextWithTags(text, self)
|
438
434
|
return self._text
|
439
435
|
|
@@ -441,7 +437,7 @@ class Document:
|
|
441
437
|
def markdown(self):
|
442
438
|
if self._markdown_bool:
|
443
439
|
if self._markdown is None:
|
444
|
-
self._markdown = flatten_dict(self.
|
440
|
+
self._markdown = flatten_dict(tuples_list=self.data_tuples,format='markdown')
|
445
441
|
|
446
442
|
return self._markdown
|
447
443
|
|
@@ -0,0 +1,25 @@
|
|
1
|
+
def has_extractable_text(pdf_bytes, search_range=50000):
|
2
|
+
"""
|
3
|
+
Check if PDF contains extractable text within first N bytes
|
4
|
+
Returns True if found in range, False otherwise
|
5
|
+
|
6
|
+
Args:
|
7
|
+
pdf_bytes: PDF content as bytes
|
8
|
+
search_range: Number of bytes to search from start (default 50KB)
|
9
|
+
"""
|
10
|
+
# Text indicators to search for
|
11
|
+
indicators = [
|
12
|
+
b'BT', # Begin text - most common
|
13
|
+
b'Tj', # Show text
|
14
|
+
b'TJ', # Show text with positioning
|
15
|
+
b'Tf', # Set font
|
16
|
+
]
|
17
|
+
|
18
|
+
# Search only within the specified range
|
19
|
+
search_data = pdf_bytes[:search_range]
|
20
|
+
|
21
|
+
for indicator in indicators:
|
22
|
+
if indicator in search_data:
|
23
|
+
return True
|
24
|
+
|
25
|
+
return False
|
@@ -32,7 +32,7 @@ if not os.path.exists(file_path):
|
|
32
32
|
setup(
|
33
33
|
name="datamule",
|
34
34
|
author="John Friedman",
|
35
|
-
version="2.2.7",
|
35
|
+
version="2.2.8",
|
36
36
|
description="Work with SEC submissions at scale.",
|
37
37
|
packages=find_packages(include=['datamule', 'datamule.*']),
|
38
38
|
url="https://github.com/john-friedman/datamule-python",
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|