datamule 2.2.6__tar.gz → 2.2.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. {datamule-2.2.6 → datamule-2.2.8}/PKG-INFO +1 -1
  2. {datamule-2.2.6 → datamule-2.2.8}/datamule/document/document.py +35 -40
  3. datamule-2.2.8/datamule/utils/pdf.py +25 -0
  4. {datamule-2.2.6 → datamule-2.2.8}/datamule.egg-info/PKG-INFO +1 -1
  5. {datamule-2.2.6 → datamule-2.2.8}/datamule.egg-info/SOURCES.txt +2 -1
  6. {datamule-2.2.6 → datamule-2.2.8}/setup.py +1 -1
  7. {datamule-2.2.6 → datamule-2.2.8}/datamule/__init__.py +0 -0
  8. {datamule-2.2.6 → datamule-2.2.8}/datamule/config.py +0 -0
  9. {datamule-2.2.6 → datamule-2.2.8}/datamule/data/listed_filer_metadata.csv +0 -0
  10. {datamule-2.2.6 → datamule-2.2.8}/datamule/datamule/__init__.py +0 -0
  11. {datamule-2.2.6 → datamule-2.2.8}/datamule/datamule/datamule_lookup.py +0 -0
  12. {datamule-2.2.6 → datamule-2.2.8}/datamule/datamule/datamule_mysql_rds.py +0 -0
  13. {datamule-2.2.6 → datamule-2.2.8}/datamule/datamule/downloader.py +0 -0
  14. {datamule-2.2.6 → datamule-2.2.8}/datamule/datamule/sec_connector.py +0 -0
  15. {datamule-2.2.6 → datamule-2.2.8}/datamule/datasets.py +0 -0
  16. {datamule-2.2.6 → datamule-2.2.8}/datamule/document/__init__.py +0 -0
  17. {datamule-2.2.6 → datamule-2.2.8}/datamule/document/tables/__init__.py +0 -0
  18. {datamule-2.2.6 → datamule-2.2.8}/datamule/document/tables/tables.py +0 -0
  19. {datamule-2.2.6 → datamule-2.2.8}/datamule/document/tables/tables_13fhr.py +0 -0
  20. {datamule-2.2.6 → datamule-2.2.8}/datamule/document/tables/tables_25nse.py +0 -0
  21. {datamule-2.2.6 → datamule-2.2.8}/datamule/document/tables/tables_informationtable.py +0 -0
  22. {datamule-2.2.6 → datamule-2.2.8}/datamule/document/tables/tables_npx.py +0 -0
  23. {datamule-2.2.6 → datamule-2.2.8}/datamule/document/tables/tables_ownership.py +0 -0
  24. {datamule-2.2.6 → datamule-2.2.8}/datamule/document/tables/tables_proxyvotingrecord.py +0 -0
  25. {datamule-2.2.6 → datamule-2.2.8}/datamule/document/tables/tables_sbsef.py +0 -0
  26. {datamule-2.2.6 → datamule-2.2.8}/datamule/document/tables/tables_sdr.py +0 -0
  27. {datamule-2.2.6 → datamule-2.2.8}/datamule/document/tables/utils.py +0 -0
  28. {datamule-2.2.6 → datamule-2.2.8}/datamule/helper.py +0 -0
  29. {datamule-2.2.6 → datamule-2.2.8}/datamule/index.py +0 -0
  30. {datamule-2.2.6 → datamule-2.2.8}/datamule/mapping_dicts/__init__.py +0 -0
  31. {datamule-2.2.6 → datamule-2.2.8}/datamule/mapping_dicts/html_mapping_dicts.py +0 -0
  32. {datamule-2.2.6 → datamule-2.2.8}/datamule/mapping_dicts/txt_mapping_dicts.py +0 -0
  33. {datamule-2.2.6 → datamule-2.2.8}/datamule/mapping_dicts/xml_mapping_dicts.py +0 -0
  34. {datamule-2.2.6 → datamule-2.2.8}/datamule/package_updater.py +0 -0
  35. {datamule-2.2.6 → datamule-2.2.8}/datamule/portfolio.py +0 -0
  36. {datamule-2.2.6 → datamule-2.2.8}/datamule/portfolio_compression_utils.py +0 -0
  37. {datamule-2.2.6 → datamule-2.2.8}/datamule/sec/__init__.py +0 -0
  38. {datamule-2.2.6 → datamule-2.2.8}/datamule/sec/infrastructure/__init__.py +0 -0
  39. {datamule-2.2.6 → datamule-2.2.8}/datamule/sec/infrastructure/submissions_metadata.py +0 -0
  40. {datamule-2.2.6 → datamule-2.2.8}/datamule/sec/submissions/__init__.py +0 -0
  41. {datamule-2.2.6 → datamule-2.2.8}/datamule/sec/submissions/downloader.py +0 -0
  42. {datamule-2.2.6 → datamule-2.2.8}/datamule/sec/submissions/eftsquery.py +0 -0
  43. {datamule-2.2.6 → datamule-2.2.8}/datamule/sec/submissions/monitor.py +0 -0
  44. {datamule-2.2.6 → datamule-2.2.8}/datamule/sec/submissions/streamer.py +0 -0
  45. {datamule-2.2.6 → datamule-2.2.8}/datamule/sec/submissions/textsearch.py +0 -0
  46. {datamule-2.2.6 → datamule-2.2.8}/datamule/sec/utils.py +0 -0
  47. {datamule-2.2.6 → datamule-2.2.8}/datamule/sec/xbrl/__init__.py +0 -0
  48. {datamule-2.2.6 → datamule-2.2.8}/datamule/sec/xbrl/downloadcompanyfacts.py +0 -0
  49. {datamule-2.2.6 → datamule-2.2.8}/datamule/sec/xbrl/filter_xbrl.py +0 -0
  50. {datamule-2.2.6 → datamule-2.2.8}/datamule/sec/xbrl/streamcompanyfacts.py +0 -0
  51. {datamule-2.2.6 → datamule-2.2.8}/datamule/sec/xbrl/xbrlmonitor.py +0 -0
  52. {datamule-2.2.6 → datamule-2.2.8}/datamule/seclibrary/__init__.py +0 -0
  53. {datamule-2.2.6 → datamule-2.2.8}/datamule/seclibrary/bq.py +0 -0
  54. {datamule-2.2.6 → datamule-2.2.8}/datamule/sentiment/__init__.py +0 -0
  55. {datamule-2.2.6 → datamule-2.2.8}/datamule/sheet.py +0 -0
  56. {datamule-2.2.6 → datamule-2.2.8}/datamule/submission.py +0 -0
  57. {datamule-2.2.6 → datamule-2.2.8}/datamule/tags/__init__.py +0 -0
  58. {datamule-2.2.6 → datamule-2.2.8}/datamule/tags/config.py +0 -0
  59. {datamule-2.2.6 → datamule-2.2.8}/datamule/tags/dictionaries.py +0 -0
  60. {datamule-2.2.6 → datamule-2.2.8}/datamule/tags/regex.py +0 -0
  61. {datamule-2.2.6 → datamule-2.2.8}/datamule/tags/utils.py +0 -0
  62. {datamule-2.2.6 → datamule-2.2.8}/datamule/utils/__init__.py +0 -0
  63. {datamule-2.2.6 → datamule-2.2.8}/datamule/utils/construct_submissions_data.py +0 -0
  64. {datamule-2.2.6 → datamule-2.2.8}/datamule/utils/format_accession.py +0 -0
  65. {datamule-2.2.6 → datamule-2.2.8}/datamule.egg-info/dependency_links.txt +0 -0
  66. {datamule-2.2.6 → datamule-2.2.8}/datamule.egg-info/requires.txt +0 -0
  67. {datamule-2.2.6 → datamule-2.2.8}/datamule.egg-info/top_level.txt +0 -0
  68. {datamule-2.2.6 → datamule-2.2.8}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamule
3
- Version: 2.2.6
3
+ Version: 2.2.8
4
4
  Summary: Work with SEC submissions at scale.
5
5
  Home-page: https://github.com/john-friedman/datamule-python
6
6
  Author: John Friedman
@@ -16,6 +16,7 @@ import tempfile
16
16
  from .tables.tables import Tables
17
17
 
18
18
  from ..tags.utils import get_cusip_using_regex, get_isin_using_regex, get_figi_using_regex,get_all_tickers, get_full_names,get_full_names_dictionary_lookup, analyze_lm_sentiment_fragment
19
+ from ..utils.pdf import has_extractable_text
19
20
 
20
21
  class DataWithTags(dict):
21
22
  def __init__(self, data, document):
@@ -113,29 +114,9 @@ class TextAnalysisBase:
113
114
  # Original behavior - single text source
114
115
  self._text_sources = [{'id': None, 'text': str(self.document.text)}]
115
116
  else: # mode == 'data'
116
- # New behavior - multiple text fragments
117
- self._text_sources = []
118
- self._extract_text_fragments(self.document.data, '')
117
+ self._text_sources = [{'id':data_tuple[0],'text':data_tuple[2]} for data_tuple in self.document.data_tuples if data_tuple[1] in ['text','title','textsmall']]
119
118
  return self._text_sources
120
119
 
121
- def _extract_text_fragments(self, data, parent_id=''):
122
- """Extract all text fragments with their document IDs from parsed data"""
123
- if isinstance(data, dict):
124
- for key, value in data.items():
125
- if key in ["text", "title"] and isinstance(value, str):
126
- # Use the current dictionary's parent key as the fragment ID
127
- self._text_sources.append({
128
- 'id': parent_id,
129
- 'text': value
130
- })
131
- elif isinstance(value, (dict, list)):
132
- # Pass the current key as the parent_id for the next level
133
- self._extract_text_fragments(value, key)
134
- elif isinstance(data, list):
135
- for i, item in enumerate(data):
136
- if isinstance(item, (dict, list)):
137
- self._extract_text_fragments(item, parent_id)
138
-
139
120
  def _format_results(self, results, fragment_id):
140
121
  """Format results based on mode"""
141
122
  if self.mode == 'text':
@@ -286,12 +267,20 @@ class Document:
286
267
 
287
268
  # this will be filled by parsed
288
269
  self._data = None
270
+ self._data_tuples = None
289
271
  self._tables = None
290
272
  self._text = None
291
273
  self._markdown = None
292
274
 
293
275
  # booleans
294
276
  self._data_bool = self.extension in ('.htm', '.html','.txt')
277
+
278
+ # may slow things down?
279
+ if self.extension == '.pdf':
280
+ if has_extractable_text(pdf_bytes=self.content):
281
+ self._data_bool = True
282
+
283
+ self._data_tuples_bool = self._data_bool
295
284
  self._text_bool = self._data_bool
296
285
  self._markdown_bool = self._data_bool
297
286
  self._visualize_bool = self._data_bool
@@ -417,22 +406,30 @@ class Document:
417
406
 
418
407
  @property
419
408
  def data(self):
420
- if self._data is None:
421
- self.parse()
409
+ if self._data_bool:
410
+ if self._data is None:
411
+ self.parse()
422
412
 
423
- if self._data is None:
424
- self._data = {}
425
-
426
- if not isinstance(self._data, DataWithTags):
427
- self._data = DataWithTags(self._data, self)
413
+ if self._data is None:
414
+ self._data = {}
415
+
416
+ if not isinstance(self._data, DataWithTags):
417
+ self._data = DataWithTags(self._data, self)
428
418
 
429
419
  return self._data
430
420
 
421
+ @property
422
+ def data_tuples(self):
423
+ if self._data_bool:
424
+ if self._data_tuples is None:
425
+ self._data_tuples = unnest_dict(self.data)
426
+ return self._data_tuples
427
+
431
428
  @property
432
429
  def text(self):
433
430
  if self._text_bool:
434
431
  if self._text is None:
435
- text = flatten_dict(self.data,'text')
432
+ text = flatten_dict(tuples_list=self.data_tuples,format='text')
436
433
  self._text = TextWithTags(text, self)
437
434
  return self._text
438
435
 
@@ -440,7 +437,7 @@ class Document:
440
437
  def markdown(self):
441
438
  if self._markdown_bool:
442
439
  if self._markdown is None:
443
- self._markdown = flatten_dict(self.data,'markdown')
440
+ self._markdown = flatten_dict(tuples_list=self.data_tuples,format='markdown')
444
441
 
445
442
  return self._markdown
446
443
 
@@ -556,18 +553,16 @@ class Document:
556
553
  webbrowser.open('file://' + temp_path)
557
554
  else:
558
555
  print(f"Cannot open files with extension {self.extension}")
559
-
560
556
  def get_section(self, title=None, title_regex=None,title_class=None, format='dict'):
561
- if not self.data:
562
- self.parse()
563
-
564
- result = get_title(self.data,title=title,title_regex=title_regex,title_class=title_class)
557
+ if self._data_bool:
558
+ if not self.data:
559
+ self.parse()
565
560
 
566
- if format == 'text':
567
- result = [item[1] for item in result]
568
- result = [unnest_dict(item) for item in result]
569
-
570
- return result
561
+ result = get_title(self.data,title=title,title_regex=title_regex,title_class=title_class)
562
+ if format == 'dict':
563
+ return [item[1] for item in result]
564
+ else:
565
+ return [flatten_dict(item[1],format) for item in result]
571
566
 
572
567
 
573
568
  # TODO CHANGE THIS
@@ -0,0 +1,25 @@
1
+ def has_extractable_text(pdf_bytes, search_range=50000):
2
+ """
3
+ Check if PDF contains extractable text within first N bytes
4
+ Returns True if found in range, False otherwise
5
+
6
+ Args:
7
+ pdf_bytes: PDF content as bytes
8
+ search_range: Number of bytes to search from start (default 50KB)
9
+ """
10
+ # Text indicators to search for
11
+ indicators = [
12
+ b'BT', # Begin text - most common
13
+ b'Tj', # Show text
14
+ b'TJ', # Show text with positioning
15
+ b'Tf', # Set font
16
+ ]
17
+
18
+ # Search only within the specified range
19
+ search_data = pdf_bytes[:search_range]
20
+
21
+ for indicator in indicators:
22
+ if indicator in search_data:
23
+ return True
24
+
25
+ return False
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamule
3
- Version: 2.2.6
3
+ Version: 2.2.8
4
4
  Summary: Work with SEC submissions at scale.
5
5
  Home-page: https://github.com/john-friedman/datamule-python
6
6
  Author: John Friedman
@@ -62,4 +62,5 @@ datamule/tags/regex.py
62
62
  datamule/tags/utils.py
63
63
  datamule/utils/__init__.py
64
64
  datamule/utils/construct_submissions_data.py
65
- datamule/utils/format_accession.py
65
+ datamule/utils/format_accession.py
66
+ datamule/utils/pdf.py
@@ -32,7 +32,7 @@ if not os.path.exists(file_path):
32
32
  setup(
33
33
  name="datamule",
34
34
  author="John Friedman",
35
- version="2.2.6",
35
+ version="2.2.8",
36
36
  description="Work with SEC submissions at scale.",
37
37
  packages=find_packages(include=['datamule', 'datamule.*']),
38
38
  url="https://github.com/john-friedman/datamule-python",
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes