PyPI - datamule - Versions diffs - 2.2.6__tar.gz → 2.2.8__tar.gz - Mend

datamule 2.2.6.tar.gz → 2.2.8.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (68) hide show

{datamule-2.2.6 → datamule-2.2.8}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 2.2.6
+Version: 2.2.8
 Summary: Work with SEC submissions at scale.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman

{datamule-2.2.6 → datamule-2.2.8}/datamule/document/document.py RENAMED Viewed

@@ -16,6 +16,7 @@ import tempfile
 from .tables.tables import Tables
 from ..tags.utils import get_cusip_using_regex, get_isin_using_regex, get_figi_using_regex,get_all_tickers, get_full_names,get_full_names_dictionary_lookup, analyze_lm_sentiment_fragment
+from ..utils.pdf import has_extractable_text
 class DataWithTags(dict):
     def __init__(self, data, document):
@@ -113,29 +114,9 @@ class TextAnalysisBase:
                 # Original behavior - single text source
                 self._text_sources = [{'id': None, 'text': str(self.document.text)}]
             else:  # mode == 'data'
-                # New behavior - multiple text fragments
-                self._text_sources = []
-                self._extract_text_fragments(self.document.data, '')
+                self._text_sources = [{'id':data_tuple[0],'text':data_tuple[2]} for data_tuple in self.document.data_tuples if data_tuple[1] in ['text','title','textsmall']]
         return self._text_sources
-    def _extract_text_fragments(self, data, parent_id=''):
-        """Extract all text fragments with their document IDs from parsed data"""
-        if isinstance(data, dict):
-            for key, value in data.items():
-                if key in ["text", "title"] and isinstance(value, str):
-                    # Use the current dictionary's parent key as the fragment ID
-                    self._text_sources.append({
-                        'id': parent_id,
-                        'text': value
-                    })
-                elif isinstance(value, (dict, list)):
-                    # Pass the current key as the parent_id for the next level
-                    self._extract_text_fragments(value, key)
-        elif isinstance(data, list):
-            for i, item in enumerate(data):
-                if isinstance(item, (dict, list)):
-                    self._extract_text_fragments(item, parent_id)
     def _format_results(self, results, fragment_id):
         """Format results based on mode"""
         if self.mode == 'text':
@@ -286,12 +267,20 @@ class Document:
         # this will be filled by parsed
         self._data = None
+        self._data_tuples = None
         self._tables = None
         self._text = None
         self._markdown = None
         # booleans
         self._data_bool = self.extension in ('.htm', '.html','.txt')
+        # may slow things down?
+        if self.extension == '.pdf':
+            if has_extractable_text(pdf_bytes=self.content):
+                self._data_bool = True
+        self._data_tuples_bool = self._data_bool
         self._text_bool = self._data_bool
         self._markdown_bool = self._data_bool
         self._visualize_bool = self._data_bool
@@ -417,22 +406,30 @@ class Document:
     @property
     def data(self):
-        if self._data is None:
-            self.parse()
+        if self._data_bool:
+            if self._data is None:
+                self.parse()
-        if self._data is None:
-            self._data = {}
-        if not isinstance(self._data, DataWithTags):
-            self._data = DataWithTags(self._data, self)
+            if self._data is None:
+                self._data = {}
+            if not isinstance(self._data, DataWithTags):
+                self._data = DataWithTags(self._data, self)
         return self._data
+    @property
+    def data_tuples(self):
+        if self._data_bool:
+            if self._data_tuples is None:
+                self._data_tuples = unnest_dict(self.data)
+        return self._data_tuples
     @property
     def text(self):
         if self._text_bool:
             if self._text is None:
-                text = flatten_dict(self.data,'text')
+                text = flatten_dict(tuples_list=self.data_tuples,format='text')
                 self._text = TextWithTags(text, self)
         return self._text
@@ -440,7 +437,7 @@ class Document:
     def markdown(self):
         if self._markdown_bool:
             if self._markdown is None:
-                self._markdown = flatten_dict(self.data,'markdown')
+                self._markdown = flatten_dict(tuples_list=self.data_tuples,format='markdown')
         return self._markdown
@@ -556,18 +553,16 @@ class Document:
             webbrowser.open('file://' + temp_path)
         else:
             print(f"Cannot open files with extension {self.extension}")
     def get_section(self, title=None, title_regex=None,title_class=None, format='dict'):
-        if not self.data:
-            self.parse()
-        result = get_title(self.data,title=title,title_regex=title_regex,title_class=title_class)
+        if self._data_bool:
+            if not self.data:
+                self.parse()
-        if format == 'text':
-            result = [item[1] for item in result]
-            result = [unnest_dict(item) for item in result]
-        return result
+            result = get_title(self.data,title=title,title_regex=title_regex,title_class=title_class)
+            if format == 'dict':
+                return [item[1] for item in result]
+            else:
+                return [flatten_dict(item[1],format) for item in result]
    # TODO CHANGE THIS

datamule-2.2.8/datamule/utils/pdf.py ADDED Viewed

@@ -0,0 +1,25 @@
+def has_extractable_text(pdf_bytes, search_range=50000):
+    """
+    Heuristically check whether a PDF appears to contain extractable text.
+
+    Scans the first ``search_range`` bytes of the raw PDF content for the
+    literal byte sequences of common text operators (``BT``, ``Tj``, ``TJ``,
+    ``Tf``) and reports whether any one of them occurs.
+
+    Args:
+        pdf_bytes: PDF content as bytes
+        search_range: Number of bytes to search from start (default 50000)
+
+    Returns:
+        True if any indicator is found within the searched range,
+        False otherwise.
+    """
+    # Text indicators to search for
+    # NOTE(review): these are matched as plain substrings of the raw bytes,
+    # so a two-byte sequence such as b'BT' can also occur by chance inside
+    # unrelated binary data, and operators inside compressed content streams
+    # would not appear literally - confirm the heuristic is accurate enough
+    # for callers that rely on it.
+    indicators = [
+        b'BT',  # Begin text - most common
+        b'Tj',  # Show text
+        b'TJ',  # Show text with positioning
+        b'Tf',  # Set font
+    ]
+    # Search only within the specified range (a prefix slice of the input),
+    # so text that first appears beyond search_range bytes is not detected.
+    search_data = pdf_bytes[:search_range]
+    # Return as soon as any single indicator is present in the prefix.
+    for indicator in indicators:
+        if indicator in search_data:
+            return True
+    return False

{datamule-2.2.6 → datamule-2.2.8}/datamule.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 2.2.6
+Version: 2.2.8
 Summary: Work with SEC submissions at scale.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman

{datamule-2.2.6 → datamule-2.2.8}/datamule.egg-info/SOURCES.txt RENAMED Viewed

@@ -62,4 +62,5 @@ datamule/tags/regex.py
 datamule/tags/utils.py
 datamule/utils/__init__.py
 datamule/utils/construct_submissions_data.py
-datamule/utils/format_accession.py
+datamule/utils/format_accession.py
+datamule/utils/pdf.py

{datamule-2.2.6 → datamule-2.2.8}/setup.py RENAMED Viewed

@@ -32,7 +32,7 @@ if not os.path.exists(file_path):
 setup(
     name="datamule",
     author="John Friedman",
-    version="2.2.6",
+    version="2.2.8",
     description="Work with SEC submissions at scale.",
     packages=find_packages(include=['datamule', 'datamule.*']),
     url="https://github.com/john-friedman/datamule-python",