datamule 2.2.6__py3-none-any.whl → 2.2.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamule/document/document.py +35 -40
- datamule/utils/pdf.py +25 -0
- {datamule-2.2.6.dist-info → datamule-2.2.8.dist-info}/METADATA +1 -1
- {datamule-2.2.6.dist-info → datamule-2.2.8.dist-info}/RECORD +6 -5
- {datamule-2.2.6.dist-info → datamule-2.2.8.dist-info}/WHEEL +0 -0
- {datamule-2.2.6.dist-info → datamule-2.2.8.dist-info}/top_level.txt +0 -0
datamule/document/document.py
CHANGED
@@ -16,6 +16,7 @@ import tempfile
|
|
16
16
|
from .tables.tables import Tables
|
17
17
|
|
18
18
|
from ..tags.utils import get_cusip_using_regex, get_isin_using_regex, get_figi_using_regex,get_all_tickers, get_full_names,get_full_names_dictionary_lookup, analyze_lm_sentiment_fragment
|
19
|
+
from ..utils.pdf import has_extractable_text
|
19
20
|
|
20
21
|
class DataWithTags(dict):
|
21
22
|
def __init__(self, data, document):
|
@@ -113,29 +114,9 @@ class TextAnalysisBase:
|
|
113
114
|
# Original behavior - single text source
|
114
115
|
self._text_sources = [{'id': None, 'text': str(self.document.text)}]
|
115
116
|
else: # mode == 'data'
|
116
|
-
|
117
|
-
self._text_sources = []
|
118
|
-
self._extract_text_fragments(self.document.data, '')
|
117
|
+
self._text_sources = [{'id':data_tuple[0],'text':data_tuple[2]} for data_tuple in self.document.data_tuples if data_tuple[1] in ['text','title','textsmall']]
|
119
118
|
return self._text_sources
|
120
119
|
|
121
|
-
def _extract_text_fragments(self, data, parent_id=''):
|
122
|
-
"""Extract all text fragments with their document IDs from parsed data"""
|
123
|
-
if isinstance(data, dict):
|
124
|
-
for key, value in data.items():
|
125
|
-
if key in ["text", "title"] and isinstance(value, str):
|
126
|
-
# Use the current dictionary's parent key as the fragment ID
|
127
|
-
self._text_sources.append({
|
128
|
-
'id': parent_id,
|
129
|
-
'text': value
|
130
|
-
})
|
131
|
-
elif isinstance(value, (dict, list)):
|
132
|
-
# Pass the current key as the parent_id for the next level
|
133
|
-
self._extract_text_fragments(value, key)
|
134
|
-
elif isinstance(data, list):
|
135
|
-
for i, item in enumerate(data):
|
136
|
-
if isinstance(item, (dict, list)):
|
137
|
-
self._extract_text_fragments(item, parent_id)
|
138
|
-
|
139
120
|
def _format_results(self, results, fragment_id):
|
140
121
|
"""Format results based on mode"""
|
141
122
|
if self.mode == 'text':
|
@@ -286,12 +267,20 @@ class Document:
|
|
286
267
|
|
287
268
|
# this will be filled by parsed
|
288
269
|
self._data = None
|
270
|
+
self._data_tuples = None
|
289
271
|
self._tables = None
|
290
272
|
self._text = None
|
291
273
|
self._markdown = None
|
292
274
|
|
293
275
|
# booleans
|
294
276
|
self._data_bool = self.extension in ('.htm', '.html','.txt')
|
277
|
+
|
278
|
+
# may slow things down?
|
279
|
+
if self.extension == '.pdf':
|
280
|
+
if has_extractable_text(pdf_bytes=self.content):
|
281
|
+
self._data_bool = True
|
282
|
+
|
283
|
+
self._data_tuples_bool = self._data_bool
|
295
284
|
self._text_bool = self._data_bool
|
296
285
|
self._markdown_bool = self._data_bool
|
297
286
|
self._visualize_bool = self._data_bool
|
@@ -417,22 +406,30 @@ class Document:
|
|
417
406
|
|
418
407
|
@property
|
419
408
|
def data(self):
|
420
|
-
if self.
|
421
|
-
self.
|
409
|
+
if self._data_bool:
|
410
|
+
if self._data is None:
|
411
|
+
self.parse()
|
422
412
|
|
423
|
-
|
424
|
-
|
425
|
-
|
426
|
-
|
427
|
-
|
413
|
+
if self._data is None:
|
414
|
+
self._data = {}
|
415
|
+
|
416
|
+
if not isinstance(self._data, DataWithTags):
|
417
|
+
self._data = DataWithTags(self._data, self)
|
428
418
|
|
429
419
|
return self._data
|
430
420
|
|
421
|
+
@property
|
422
|
+
def data_tuples(self):
|
423
|
+
if self._data_bool:
|
424
|
+
if self._data_tuples is None:
|
425
|
+
self._data_tuples = unnest_dict(self.data)
|
426
|
+
return self._data_tuples
|
427
|
+
|
431
428
|
@property
|
432
429
|
def text(self):
|
433
430
|
if self._text_bool:
|
434
431
|
if self._text is None:
|
435
|
-
text = flatten_dict(self.
|
432
|
+
text = flatten_dict(tuples_list=self.data_tuples,format='text')
|
436
433
|
self._text = TextWithTags(text, self)
|
437
434
|
return self._text
|
438
435
|
|
@@ -440,7 +437,7 @@ class Document:
|
|
440
437
|
def markdown(self):
|
441
438
|
if self._markdown_bool:
|
442
439
|
if self._markdown is None:
|
443
|
-
self._markdown = flatten_dict(self.
|
440
|
+
self._markdown = flatten_dict(tuples_list=self.data_tuples,format='markdown')
|
444
441
|
|
445
442
|
return self._markdown
|
446
443
|
|
@@ -556,18 +553,16 @@ class Document:
|
|
556
553
|
webbrowser.open('file://' + temp_path)
|
557
554
|
else:
|
558
555
|
print(f"Cannot open files with extension {self.extension}")
|
559
|
-
|
560
556
|
def get_section(self, title=None, title_regex=None,title_class=None, format='dict'):
|
561
|
-
if
|
562
|
-
self.
|
563
|
-
|
564
|
-
result = get_title(self.data,title=title,title_regex=title_regex,title_class=title_class)
|
557
|
+
if self._data_bool:
|
558
|
+
if not self.data:
|
559
|
+
self.parse()
|
565
560
|
|
566
|
-
|
567
|
-
|
568
|
-
|
569
|
-
|
570
|
-
|
561
|
+
result = get_title(self.data,title=title,title_regex=title_regex,title_class=title_class)
|
562
|
+
if format == 'dict':
|
563
|
+
return [item[1] for item in result]
|
564
|
+
else:
|
565
|
+
return [flatten_dict(item[1],format) for item in result]
|
571
566
|
|
572
567
|
|
573
568
|
# TODO CHANGE THIS
|
datamule/utils/pdf.py
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
def has_extractable_text(pdf_bytes, search_range=50000):
|
2
|
+
"""
|
3
|
+
Check if PDF contains extractable text within first N bytes
|
4
|
+
Returns True if found in range, False otherwise
|
5
|
+
|
6
|
+
Args:
|
7
|
+
pdf_bytes: PDF content as bytes
|
8
|
+
search_range: Number of bytes to search from start (default 50KB)
|
9
|
+
"""
|
10
|
+
# Text indicators to search for
|
11
|
+
indicators = [
|
12
|
+
b'BT', # Begin text - most common
|
13
|
+
b'Tj', # Show text
|
14
|
+
b'TJ', # Show text with positioning
|
15
|
+
b'Tf', # Set font
|
16
|
+
]
|
17
|
+
|
18
|
+
# Search only within the specified range
|
19
|
+
search_data = pdf_bytes[:search_range]
|
20
|
+
|
21
|
+
for indicator in indicators:
|
22
|
+
if indicator in search_data:
|
23
|
+
return True
|
24
|
+
|
25
|
+
return False
|
@@ -15,7 +15,7 @@ datamule/datamule/datamule_mysql_rds.py,sha256=Q6_h24-SNECWK60RnM6UQjUIp5dhJmfn3
|
|
15
15
|
datamule/datamule/downloader.py,sha256=B22ULAuYzclxxVCH4DsLWUIyFUC5Iep-Hl1W3RgCfeg,18580
|
16
16
|
datamule/datamule/sec_connector.py,sha256=VwOaODpHoAWy8JIky6kLR1-orW_PB61RHw7pIGRpkow,3288
|
17
17
|
datamule/document/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
18
|
-
datamule/document/document.py,sha256=
|
18
|
+
datamule/document/document.py,sha256=d9Gv8_7zJVZhIVYtF3cLT_7MCtWZV1gn9_l3u8us7b0,24275
|
19
19
|
datamule/document/tables/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
20
20
|
datamule/document/tables/tables.py,sha256=8riSAof6o-Gxoo0SkiQAE61fw8NmzDnEhJe6dATzmvA,4487
|
21
21
|
datamule/document/tables/tables_13fhr.py,sha256=-6tWcaTyNsb0XuW0WMBrYir9Zn1wLZL0laKxRYfPNyg,4265
|
@@ -57,7 +57,8 @@ datamule/tags/utils.py,sha256=6B0jtwiFMQAU5mmdqWX_ZRa76uREY-DUBdM_ttt9cXk,6261
|
|
57
57
|
datamule/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
58
58
|
datamule/utils/construct_submissions_data.py,sha256=NB_hvfxlRXPyt4Fgc-5qA8vJRItkLhBedCSTaxwW7Jg,5887
|
59
59
|
datamule/utils/format_accession.py,sha256=60RtqoNqoT9zSKVb1DeOv1gncJxzPTFMNW4SNOVmC_g,476
|
60
|
-
datamule
|
61
|
-
datamule-2.2.
|
62
|
-
datamule-2.2.
|
63
|
-
datamule-2.2.
|
60
|
+
datamule/utils/pdf.py,sha256=Z9xrdVhKex2YdvjYsaPaygRE_J6P_JNiUGkwflz2Hw0,735
|
61
|
+
datamule-2.2.8.dist-info/METADATA,sha256=MfCW0SCjpYwtorAPr-540bS8VhJ5_4hEwSbDZN_b-Zo,585
|
62
|
+
datamule-2.2.8.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
|
63
|
+
datamule-2.2.8.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
|
64
|
+
datamule-2.2.8.dist-info/RECORD,,
|
File without changes
|
File without changes
|