datamule 2.2.7__tar.gz → 2.2.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. {datamule-2.2.7 → datamule-2.2.8}/PKG-INFO +1 -1
  2. {datamule-2.2.7 → datamule-2.2.8}/datamule/document/document.py +19 -23
  3. datamule-2.2.8/datamule/utils/pdf.py +25 -0
  4. {datamule-2.2.7 → datamule-2.2.8}/datamule.egg-info/PKG-INFO +1 -1
  5. {datamule-2.2.7 → datamule-2.2.8}/datamule.egg-info/SOURCES.txt +2 -1
  6. {datamule-2.2.7 → datamule-2.2.8}/setup.py +1 -1
  7. {datamule-2.2.7 → datamule-2.2.8}/datamule/__init__.py +0 -0
  8. {datamule-2.2.7 → datamule-2.2.8}/datamule/config.py +0 -0
  9. {datamule-2.2.7 → datamule-2.2.8}/datamule/data/listed_filer_metadata.csv +0 -0
  10. {datamule-2.2.7 → datamule-2.2.8}/datamule/datamule/__init__.py +0 -0
  11. {datamule-2.2.7 → datamule-2.2.8}/datamule/datamule/datamule_lookup.py +0 -0
  12. {datamule-2.2.7 → datamule-2.2.8}/datamule/datamule/datamule_mysql_rds.py +0 -0
  13. {datamule-2.2.7 → datamule-2.2.8}/datamule/datamule/downloader.py +0 -0
  14. {datamule-2.2.7 → datamule-2.2.8}/datamule/datamule/sec_connector.py +0 -0
  15. {datamule-2.2.7 → datamule-2.2.8}/datamule/datasets.py +0 -0
  16. {datamule-2.2.7 → datamule-2.2.8}/datamule/document/__init__.py +0 -0
  17. {datamule-2.2.7 → datamule-2.2.8}/datamule/document/tables/__init__.py +0 -0
  18. {datamule-2.2.7 → datamule-2.2.8}/datamule/document/tables/tables.py +0 -0
  19. {datamule-2.2.7 → datamule-2.2.8}/datamule/document/tables/tables_13fhr.py +0 -0
  20. {datamule-2.2.7 → datamule-2.2.8}/datamule/document/tables/tables_25nse.py +0 -0
  21. {datamule-2.2.7 → datamule-2.2.8}/datamule/document/tables/tables_informationtable.py +0 -0
  22. {datamule-2.2.7 → datamule-2.2.8}/datamule/document/tables/tables_npx.py +0 -0
  23. {datamule-2.2.7 → datamule-2.2.8}/datamule/document/tables/tables_ownership.py +0 -0
  24. {datamule-2.2.7 → datamule-2.2.8}/datamule/document/tables/tables_proxyvotingrecord.py +0 -0
  25. {datamule-2.2.7 → datamule-2.2.8}/datamule/document/tables/tables_sbsef.py +0 -0
  26. {datamule-2.2.7 → datamule-2.2.8}/datamule/document/tables/tables_sdr.py +0 -0
  27. {datamule-2.2.7 → datamule-2.2.8}/datamule/document/tables/utils.py +0 -0
  28. {datamule-2.2.7 → datamule-2.2.8}/datamule/helper.py +0 -0
  29. {datamule-2.2.7 → datamule-2.2.8}/datamule/index.py +0 -0
  30. {datamule-2.2.7 → datamule-2.2.8}/datamule/mapping_dicts/__init__.py +0 -0
  31. {datamule-2.2.7 → datamule-2.2.8}/datamule/mapping_dicts/html_mapping_dicts.py +0 -0
  32. {datamule-2.2.7 → datamule-2.2.8}/datamule/mapping_dicts/txt_mapping_dicts.py +0 -0
  33. {datamule-2.2.7 → datamule-2.2.8}/datamule/mapping_dicts/xml_mapping_dicts.py +0 -0
  34. {datamule-2.2.7 → datamule-2.2.8}/datamule/package_updater.py +0 -0
  35. {datamule-2.2.7 → datamule-2.2.8}/datamule/portfolio.py +0 -0
  36. {datamule-2.2.7 → datamule-2.2.8}/datamule/portfolio_compression_utils.py +0 -0
  37. {datamule-2.2.7 → datamule-2.2.8}/datamule/sec/__init__.py +0 -0
  38. {datamule-2.2.7 → datamule-2.2.8}/datamule/sec/infrastructure/__init__.py +0 -0
  39. {datamule-2.2.7 → datamule-2.2.8}/datamule/sec/infrastructure/submissions_metadata.py +0 -0
  40. {datamule-2.2.7 → datamule-2.2.8}/datamule/sec/submissions/__init__.py +0 -0
  41. {datamule-2.2.7 → datamule-2.2.8}/datamule/sec/submissions/downloader.py +0 -0
  42. {datamule-2.2.7 → datamule-2.2.8}/datamule/sec/submissions/eftsquery.py +0 -0
  43. {datamule-2.2.7 → datamule-2.2.8}/datamule/sec/submissions/monitor.py +0 -0
  44. {datamule-2.2.7 → datamule-2.2.8}/datamule/sec/submissions/streamer.py +0 -0
  45. {datamule-2.2.7 → datamule-2.2.8}/datamule/sec/submissions/textsearch.py +0 -0
  46. {datamule-2.2.7 → datamule-2.2.8}/datamule/sec/utils.py +0 -0
  47. {datamule-2.2.7 → datamule-2.2.8}/datamule/sec/xbrl/__init__.py +0 -0
  48. {datamule-2.2.7 → datamule-2.2.8}/datamule/sec/xbrl/downloadcompanyfacts.py +0 -0
  49. {datamule-2.2.7 → datamule-2.2.8}/datamule/sec/xbrl/filter_xbrl.py +0 -0
  50. {datamule-2.2.7 → datamule-2.2.8}/datamule/sec/xbrl/streamcompanyfacts.py +0 -0
  51. {datamule-2.2.7 → datamule-2.2.8}/datamule/sec/xbrl/xbrlmonitor.py +0 -0
  52. {datamule-2.2.7 → datamule-2.2.8}/datamule/seclibrary/__init__.py +0 -0
  53. {datamule-2.2.7 → datamule-2.2.8}/datamule/seclibrary/bq.py +0 -0
  54. {datamule-2.2.7 → datamule-2.2.8}/datamule/sentiment/__init__.py +0 -0
  55. {datamule-2.2.7 → datamule-2.2.8}/datamule/sheet.py +0 -0
  56. {datamule-2.2.7 → datamule-2.2.8}/datamule/submission.py +0 -0
  57. {datamule-2.2.7 → datamule-2.2.8}/datamule/tags/__init__.py +0 -0
  58. {datamule-2.2.7 → datamule-2.2.8}/datamule/tags/config.py +0 -0
  59. {datamule-2.2.7 → datamule-2.2.8}/datamule/tags/dictionaries.py +0 -0
  60. {datamule-2.2.7 → datamule-2.2.8}/datamule/tags/regex.py +0 -0
  61. {datamule-2.2.7 → datamule-2.2.8}/datamule/tags/utils.py +0 -0
  62. {datamule-2.2.7 → datamule-2.2.8}/datamule/utils/__init__.py +0 -0
  63. {datamule-2.2.7 → datamule-2.2.8}/datamule/utils/construct_submissions_data.py +0 -0
  64. {datamule-2.2.7 → datamule-2.2.8}/datamule/utils/format_accession.py +0 -0
  65. {datamule-2.2.7 → datamule-2.2.8}/datamule.egg-info/dependency_links.txt +0 -0
  66. {datamule-2.2.7 → datamule-2.2.8}/datamule.egg-info/requires.txt +0 -0
  67. {datamule-2.2.7 → datamule-2.2.8}/datamule.egg-info/top_level.txt +0 -0
  68. {datamule-2.2.7 → datamule-2.2.8}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamule
3
- Version: 2.2.7
3
+ Version: 2.2.8
4
4
  Summary: Work with SEC submissions at scale.
5
5
  Home-page: https://github.com/john-friedman/datamule-python
6
6
  Author: John Friedman
@@ -16,6 +16,7 @@ import tempfile
16
16
  from .tables.tables import Tables
17
17
 
18
18
  from ..tags.utils import get_cusip_using_regex, get_isin_using_regex, get_figi_using_regex,get_all_tickers, get_full_names,get_full_names_dictionary_lookup, analyze_lm_sentiment_fragment
19
+ from ..utils.pdf import has_extractable_text
19
20
 
20
21
  class DataWithTags(dict):
21
22
  def __init__(self, data, document):
@@ -113,29 +114,9 @@ class TextAnalysisBase:
113
114
  # Original behavior - single text source
114
115
  self._text_sources = [{'id': None, 'text': str(self.document.text)}]
115
116
  else: # mode == 'data'
116
- # New behavior - multiple text fragments
117
- self._text_sources = []
118
- self._extract_text_fragments(self.document.data, '')
117
+ self._text_sources = [{'id':data_tuple[0],'text':data_tuple[2]} for data_tuple in self.document.data_tuples if data_tuple[1] in ['text','title','textsmall']]
119
118
  return self._text_sources
120
119
 
121
- def _extract_text_fragments(self, data, parent_id=''):
122
- """Extract all text fragments with their document IDs from parsed data"""
123
- if isinstance(data, dict):
124
- for key, value in data.items():
125
- if key in ["text", "title"] and isinstance(value, str):
126
- # Use the current dictionary's parent key as the fragment ID
127
- self._text_sources.append({
128
- 'id': parent_id,
129
- 'text': value
130
- })
131
- elif isinstance(value, (dict, list)):
132
- # Pass the current key as the parent_id for the next level
133
- self._extract_text_fragments(value, key)
134
- elif isinstance(data, list):
135
- for i, item in enumerate(data):
136
- if isinstance(item, (dict, list)):
137
- self._extract_text_fragments(item, parent_id)
138
-
139
120
  def _format_results(self, results, fragment_id):
140
121
  """Format results based on mode"""
141
122
  if self.mode == 'text':
@@ -286,12 +267,20 @@ class Document:
286
267
 
287
268
  # this will be filled by parsed
288
269
  self._data = None
270
+ self._data_tuples = None
289
271
  self._tables = None
290
272
  self._text = None
291
273
  self._markdown = None
292
274
 
293
275
  # booleans
294
276
  self._data_bool = self.extension in ('.htm', '.html','.txt')
277
+
278
+ # may slow things down?
279
+ if self.extension == '.pdf':
280
+ if has_extractable_text(pdf_bytes=self.content):
281
+ self._data_bool = True
282
+
283
+ self._data_tuples_bool = self._data_bool
295
284
  self._text_bool = self._data_bool
296
285
  self._markdown_bool = self._data_bool
297
286
  self._visualize_bool = self._data_bool
@@ -429,11 +418,18 @@ class Document:
429
418
 
430
419
  return self._data
431
420
 
421
+ @property
422
+ def data_tuples(self):
423
+ if self._data_bool:
424
+ if self._data_tuples is None:
425
+ self._data_tuples = unnest_dict(self.data)
426
+ return self._data_tuples
427
+
432
428
  @property
433
429
  def text(self):
434
430
  if self._text_bool:
435
431
  if self._text is None:
436
- text = flatten_dict(self.data,'text')
432
+ text = flatten_dict(tuples_list=self.data_tuples,format='text')
437
433
  self._text = TextWithTags(text, self)
438
434
  return self._text
439
435
 
@@ -441,7 +437,7 @@ class Document:
441
437
  def markdown(self):
442
438
  if self._markdown_bool:
443
439
  if self._markdown is None:
444
- self._markdown = flatten_dict(self.data,'markdown')
440
+ self._markdown = flatten_dict(tuples_list=self.data_tuples,format='markdown')
445
441
 
446
442
  return self._markdown
447
443
 
@@ -0,0 +1,25 @@
1
+ def has_extractable_text(pdf_bytes, search_range=50000):
2
+ """
3
+ Check if PDF contains extractable text within first N bytes
4
+ Returns True if found in range, False otherwise
5
+
6
+ Args:
7
+ pdf_bytes: PDF content as bytes
8
+ search_range: Number of bytes to search from start (default 50KB)
9
+ """
10
+ # Text indicators to search for
11
+ indicators = [
12
+ b'BT', # Begin text - most common
13
+ b'Tj', # Show text
14
+ b'TJ', # Show text with positioning
15
+ b'Tf', # Set font
16
+ ]
17
+
18
+ # Search only within the specified range
19
+ search_data = pdf_bytes[:search_range]
20
+
21
+ for indicator in indicators:
22
+ if indicator in search_data:
23
+ return True
24
+
25
+ return False
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamule
3
- Version: 2.2.7
3
+ Version: 2.2.8
4
4
  Summary: Work with SEC submissions at scale.
5
5
  Home-page: https://github.com/john-friedman/datamule-python
6
6
  Author: John Friedman
@@ -62,4 +62,5 @@ datamule/tags/regex.py
62
62
  datamule/tags/utils.py
63
63
  datamule/utils/__init__.py
64
64
  datamule/utils/construct_submissions_data.py
65
- datamule/utils/format_accession.py
65
+ datamule/utils/format_accession.py
66
+ datamule/utils/pdf.py
@@ -32,7 +32,7 @@ if not os.path.exists(file_path):
32
32
  setup(
33
33
  name="datamule",
34
34
  author="John Friedman",
35
- version="2.2.7",
35
+ version="2.2.8",
36
36
  description="Work with SEC submissions at scale.",
37
37
  packages=find_packages(include=['datamule', 'datamule.*']),
38
38
  url="https://github.com/john-friedman/datamule-python",
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes