datamule 2.2.9.tar.gz → 2.3.0.tar.gz

This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (68)
  1. {datamule-2.2.9 → datamule-2.3.0}/PKG-INFO +1 -1
  2. {datamule-2.2.9 → datamule-2.3.0}/datamule/document/document.py +50 -13
  3. {datamule-2.2.9 → datamule-2.3.0}/datamule/document/tables/tables.py +39 -11
  4. {datamule-2.2.9 → datamule-2.3.0}/datamule.egg-info/PKG-INFO +1 -1
  5. {datamule-2.2.9 → datamule-2.3.0}/setup.py +1 -1
  6. {datamule-2.2.9 → datamule-2.3.0}/datamule/__init__.py +0 -0
  7. {datamule-2.2.9 → datamule-2.3.0}/datamule/config.py +0 -0
  8. {datamule-2.2.9 → datamule-2.3.0}/datamule/data/listed_filer_metadata.csv +0 -0
  9. {datamule-2.2.9 → datamule-2.3.0}/datamule/datamule/__init__.py +0 -0
  10. {datamule-2.2.9 → datamule-2.3.0}/datamule/datamule/datamule_lookup.py +0 -0
  11. {datamule-2.2.9 → datamule-2.3.0}/datamule/datamule/datamule_mysql_rds.py +0 -0
  12. {datamule-2.2.9 → datamule-2.3.0}/datamule/datamule/downloader.py +0 -0
  13. {datamule-2.2.9 → datamule-2.3.0}/datamule/datamule/sec_connector.py +0 -0
  14. {datamule-2.2.9 → datamule-2.3.0}/datamule/datasets.py +0 -0
  15. {datamule-2.2.9 → datamule-2.3.0}/datamule/document/__init__.py +0 -0
  16. {datamule-2.2.9 → datamule-2.3.0}/datamule/document/tables/__init__.py +0 -0
  17. {datamule-2.2.9 → datamule-2.3.0}/datamule/document/tables/tables_13fhr.py +0 -0
  18. {datamule-2.2.9 → datamule-2.3.0}/datamule/document/tables/tables_25nse.py +0 -0
  19. {datamule-2.2.9 → datamule-2.3.0}/datamule/document/tables/tables_informationtable.py +0 -0
  20. {datamule-2.2.9 → datamule-2.3.0}/datamule/document/tables/tables_npx.py +0 -0
  21. {datamule-2.2.9 → datamule-2.3.0}/datamule/document/tables/tables_ownership.py +0 -0
  22. {datamule-2.2.9 → datamule-2.3.0}/datamule/document/tables/tables_proxyvotingrecord.py +0 -0
  23. {datamule-2.2.9 → datamule-2.3.0}/datamule/document/tables/tables_sbsef.py +0 -0
  24. {datamule-2.2.9 → datamule-2.3.0}/datamule/document/tables/tables_sdr.py +0 -0
  25. {datamule-2.2.9 → datamule-2.3.0}/datamule/document/tables/utils.py +0 -0
  26. {datamule-2.2.9 → datamule-2.3.0}/datamule/helper.py +0 -0
  27. {datamule-2.2.9 → datamule-2.3.0}/datamule/index.py +0 -0
  28. {datamule-2.2.9 → datamule-2.3.0}/datamule/mapping_dicts/__init__.py +0 -0
  29. {datamule-2.2.9 → datamule-2.3.0}/datamule/mapping_dicts/html_mapping_dicts.py +0 -0
  30. {datamule-2.2.9 → datamule-2.3.0}/datamule/mapping_dicts/txt_mapping_dicts.py +0 -0
  31. {datamule-2.2.9 → datamule-2.3.0}/datamule/mapping_dicts/xml_mapping_dicts.py +0 -0
  32. {datamule-2.2.9 → datamule-2.3.0}/datamule/package_updater.py +0 -0
  33. {datamule-2.2.9 → datamule-2.3.0}/datamule/portfolio.py +0 -0
  34. {datamule-2.2.9 → datamule-2.3.0}/datamule/portfolio_compression_utils.py +0 -0
  35. {datamule-2.2.9 → datamule-2.3.0}/datamule/sec/__init__.py +0 -0
  36. {datamule-2.2.9 → datamule-2.3.0}/datamule/sec/infrastructure/__init__.py +0 -0
  37. {datamule-2.2.9 → datamule-2.3.0}/datamule/sec/infrastructure/submissions_metadata.py +0 -0
  38. {datamule-2.2.9 → datamule-2.3.0}/datamule/sec/submissions/__init__.py +0 -0
  39. {datamule-2.2.9 → datamule-2.3.0}/datamule/sec/submissions/downloader.py +0 -0
  40. {datamule-2.2.9 → datamule-2.3.0}/datamule/sec/submissions/eftsquery.py +0 -0
  41. {datamule-2.2.9 → datamule-2.3.0}/datamule/sec/submissions/monitor.py +0 -0
  42. {datamule-2.2.9 → datamule-2.3.0}/datamule/sec/submissions/streamer.py +0 -0
  43. {datamule-2.2.9 → datamule-2.3.0}/datamule/sec/submissions/textsearch.py +0 -0
  44. {datamule-2.2.9 → datamule-2.3.0}/datamule/sec/utils.py +0 -0
  45. {datamule-2.2.9 → datamule-2.3.0}/datamule/sec/xbrl/__init__.py +0 -0
  46. {datamule-2.2.9 → datamule-2.3.0}/datamule/sec/xbrl/downloadcompanyfacts.py +0 -0
  47. {datamule-2.2.9 → datamule-2.3.0}/datamule/sec/xbrl/filter_xbrl.py +0 -0
  48. {datamule-2.2.9 → datamule-2.3.0}/datamule/sec/xbrl/streamcompanyfacts.py +0 -0
  49. {datamule-2.2.9 → datamule-2.3.0}/datamule/sec/xbrl/xbrlmonitor.py +0 -0
  50. {datamule-2.2.9 → datamule-2.3.0}/datamule/seclibrary/__init__.py +0 -0
  51. {datamule-2.2.9 → datamule-2.3.0}/datamule/seclibrary/bq.py +0 -0
  52. {datamule-2.2.9 → datamule-2.3.0}/datamule/sentiment/__init__.py +0 -0
  53. {datamule-2.2.9 → datamule-2.3.0}/datamule/sheet.py +0 -0
  54. {datamule-2.2.9 → datamule-2.3.0}/datamule/submission.py +0 -0
  55. {datamule-2.2.9 → datamule-2.3.0}/datamule/tags/__init__.py +0 -0
  56. {datamule-2.2.9 → datamule-2.3.0}/datamule/tags/config.py +0 -0
  57. {datamule-2.2.9 → datamule-2.3.0}/datamule/tags/dictionaries.py +0 -0
  58. {datamule-2.2.9 → datamule-2.3.0}/datamule/tags/regex.py +0 -0
  59. {datamule-2.2.9 → datamule-2.3.0}/datamule/tags/utils.py +0 -0
  60. {datamule-2.2.9 → datamule-2.3.0}/datamule/utils/__init__.py +0 -0
  61. {datamule-2.2.9 → datamule-2.3.0}/datamule/utils/construct_submissions_data.py +0 -0
  62. {datamule-2.2.9 → datamule-2.3.0}/datamule/utils/format_accession.py +0 -0
  63. {datamule-2.2.9 → datamule-2.3.0}/datamule/utils/pdf.py +0 -0
  64. {datamule-2.2.9 → datamule-2.3.0}/datamule.egg-info/SOURCES.txt +0 -0
  65. {datamule-2.2.9 → datamule-2.3.0}/datamule.egg-info/dependency_links.txt +0 -0
  66. {datamule-2.2.9 → datamule-2.3.0}/datamule.egg-info/requires.txt +0 -0
  67. {datamule-2.2.9 → datamule-2.3.0}/datamule.egg-info/top_level.txt +0 -0
  68. {datamule-2.2.9 → datamule-2.3.0}/setup.cfg +0 -0
{datamule-2.2.9 → datamule-2.3.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 2.2.9
+Version: 2.3.0
 Summary: Work with SEC submissions at scale.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman
{datamule-2.2.9 → datamule-2.3.0}/datamule/document/document.py

@@ -7,8 +7,6 @@ from doc2dict import html2dict, visualize_dict, get_title, unnest_dict, pdf2dict
 from ..mapping_dicts.txt_mapping_dicts import dict_10k, dict_10q, dict_8k, dict_13d, dict_13g
 from ..mapping_dicts.xml_mapping_dicts import dict_345
 from ..mapping_dicts.html_mapping_dicts import *
-from selectolax.parser import HTMLParser
-
 from pathlib import Path
 import webbrowser
 from secsgml.utils import bytes_to_str

@@ -294,7 +292,6 @@ class Document:
             return bool(re.search(pattern, self.content))
         return False
 
-    # Note: this method will be heavily modified in the future
     def parse(self):
         # check if we have already parsed the content
         if self._data:

@@ -384,6 +381,8 @@ class Document:
                 dct = html2dict(content=self.content, mapping_dict=mapping_dict)
             elif self.extension in ['.txt']:
                 dct = txt2dict(content=self.content, mapping_dict=mapping_dict)
+            elif self.extension == '.pdf':
+                dct = pdf2dict(content=self.content, mapping_dict=mapping_dict)
             else:
                 dct = {}
 
@@ -391,10 +390,8 @@ class Document:
         elif self.extension == '.xml':
             if self.type in ['3', '4', '5', '3/A', '4/A', '5/A']:
                 mapping_dict = dict_345
-
             self._data = xml2dict(content=self.content, mapping_dict=mapping_dict)
-        elif self.extension == '.pdf':
-            self._data = pdf2dict(content=self.content, mapping_dict=mapping_dict)
+
         else:
             pass
 
@@ -409,6 +406,12 @@ class Document:
 
             if not isinstance(self._data, DataWithTags):
                 self._data = DataWithTags(self._data, self)
+        elif self.extension == '.xml':
+            if self._data is None:
+                self.parse()
+
+            if self._data is None:
+                self._data = {}
 
         return self._data
 
@@ -444,19 +447,46 @@ class Document:
             json.dump(self.data, f, indent=2)
 
     def parse_tables(self,must_exist_in_mapping=True):
-        if self.extension != '.xml':
-            self._tables = []
+        """Must exist in mapping means columns must occur in mapping schema."""
+        if self.extension == '.xml':
+            tables = Tables(document_type = self.type, accession=self.accession)
+            tables.parse_tables(data=self.data,must_exist_in_mapping=must_exist_in_mapping)
+            self._tables = tables
+
+        elif self._data_bool:
+            tables = Tables(document_type = self.type, accession=self.accession)
+            data_tuples = self.data_tuples
+
+            for i, (id, type, content, level) in enumerate(data_tuples):
+                if type == "table" and i > 0:
+                    description = None
+
+                    # Look at previous element
+                    prev_id, prev_type, prev_content, prev_level = data_tuples[i-1]
+
+                    # Case 1: Same level + text content
+                    if prev_level == level and prev_type in ["text", "textsmall"]:
+                        description = prev_content
+
+                    # Case 2: Higher level (lower number) + title
+                    elif prev_level < level and prev_type == "title":
+                        description = prev_content
+
+                    # Case 3: No matching description - add table without description
+                    # (description remains None)
+
+                    tables.add_table(data=content, description=description, name="extracted_table")
+
+            self._tables = tables
+
         else:
-            # Use the property to trigger parsing if needed
-            data = self.data
-            tables = Tables(document_type = self.type, accession=self.accession, data=data,must_exist_in_mapping=must_exist_in_mapping)
-            self._tables = tables.tables
+            self._tables = []
 
     @property
     def tables(self):
         if self._tables is None:
             self.parse_tables()
-        return self._tables
+        return self._tables.tables
 
 
     def write_csv(self, output_folder):

@@ -547,6 +577,7 @@ class Document:
                 webbrowser.open('file://' + temp_path)
         else:
             print(f"Cannot open files with extension {self.extension}")
+
     def get_section(self, title=None, title_regex=None,title_class=None, format='dict'):
         if self._data_bool:
             if not self.data:

@@ -557,3 +588,9 @@ class Document:
                 return [item[1] for item in result]
             else:
                 return [flatten_dict(item[1],format) for item in result]
+
+    # TODO
+    def get_tables(self,description_regex=None,name=None):
+        # make sure tables is initialized
+        self.tables
+        return self._tables.get_tables(description_regex=description_regex, name=name)
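
The document.py changes above amount to a new table-extraction path: tables are no longer limited to XML filings, parse_tables() now builds a Tables container for parsed HTML/TXT/PDF content (attaching a best-effort description taken from the preceding same-level text or higher-level title), and the new Document.get_tables() filters the result. A minimal usage sketch; doc is assumed to be an already-loaded datamule Document for an HTML filing (e.g. obtained by iterating a Portfolio), and the regex is illustrative only:

# 'doc' is an assumed, already-loaded datamule Document for an HTML filing.
# Accessing .tables lazily runs parse_tables(), which as of 2.3.0 also extracts
# tables from parsed HTML/TXT/PDF content, not only from XML filings.
tables = doc.tables

# New in this release: filter by exact name or by a regex matched against the
# description captured from the element immediately preceding each table.
matches = doc.get_tables(description_regex=r'(?i)compensation')
for table in matches:
    print(table)  # Table.__str__ prints name, accession, row count, and the formatted rows
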
{datamule-2.2.9 → datamule-2.3.0}/datamule/document/tables/tables.py

@@ -6,8 +6,10 @@ from .tables_npx import config_npx
 from .tables_sbsef import config_sbsef
 from .tables_sdr import config_sdr
 from .tables_proxyvotingrecord import config_proxyvotingrecord
+from doc2dict.utils.format_dict import _format_table
 
 from .utils import safe_get, flatten_dict
+import re
 # will add filing date param later? or extension
 all_tables_dict = {
     '3' : config_ownership,

@@ -93,25 +95,30 @@ def apply_mapping(flattened_data, mapping_dict, accession, must_exist_in_mapping
 
 # should have table type, accession, data
 class Table:
-    def __init__(self,data,name,accession):
+    def __init__(self,data,name,accession,description = None):
         self.data = data
         self.name = name
         self.accession = accession
+        self.description = description
+
+    # TODO MADE IN A HURRY #
+    def __str__(self):
+        formatted_table = _format_table(self.data)
+        if isinstance(formatted_table, list):
+            table_str = '\n'.join(formatted_table)
+        else:
+            table_str = str(formatted_table)
+        return f"Table '{self.name}' ({self.accession}) - {len(self.data) if isinstance(self.data, list) else 'N/A'} rows\ndescription: {self.description if self.description else ''}\n{table_str}"
 
 
 class Tables():
-    def __init__(self,document_type,accession,data,must_exist_in_mapping=True):
+    def __init__(self,document_type,accession):
         self.document_type = document_type
         self.accession = accession
-        self.data = data
-
-        # to fill in
         self.tables = []
 
-        self.parse_tables(must_exist_in_mapping=must_exist_in_mapping)
-
-    def parse_tables(self,must_exist_in_mapping=True):
-        # first select dict
+    def parse_tables(self,data,must_exist_in_mapping=True):
+        self.data = data
 
         try:
             tables_dict = all_tables_dict[self.document_type]

@@ -120,11 +127,32 @@ class Tables():
 
         # now get the dicts from the data
         data_dicts = seperate_data(tables_dict,self.data)
-
+
         # now flatten
         data_dicts = [(x,flatten_dict(y)) for x,y in data_dicts]
 
         for table_name, flattened_data in data_dicts:
             mapping_dict = tables_dict[table_name]['mapping']
             mapped_data = apply_mapping(flattened_data, mapping_dict, self.accession,must_exist_in_mapping)
-            self.tables.append(Table(mapped_data, table_name, self.accession))
+            self.tables.append(Table(mapped_data, table_name, self.accession))
+
+    def add_table(self,data,name,description=None):
+        self.tables.append(Table(data=data,name=name,accession=self.accession,description=description))
+
+    def get_tables(self, description_regex=None, name=None):
+        matching_tables = []
+
+        for table in self.tables:
+            # Check name match (exact match)
+            if name is not None:
+                if table.name == name:
+                    matching_tables.append(table)
+                    continue
+
+            # Check description regex match
+            if description_regex is not None and table.description is not None:
+                if re.search(description_regex, table.description):
+                    matching_tables.append(table)
+                    continue
+
+        return matching_tables
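
For reference, the reworked Tables container separates construction from parsing and can hold manually added tables alongside the mapped XML tables. A small sketch of the new flow against the internal module; the accession number and row data are made up for illustration, and in normal use these objects are created by Document.parse_tables() rather than directly:

from datamule.document.tables.tables import Tables

# Construction no longer takes data; XML parsing is now an explicit second step.
tables = Tables(document_type='3', accession='0000000000-00-000000')
# tables.parse_tables(data=parsed_xml_dict)  # for XML filings, data comes from Document.data

# Tables pulled out of HTML/PDF content are attached via add_table(), carrying the
# optional description that get_tables() can later filter on.
tables.add_table(
    data=[{'column': 'value'}],   # illustrative row data
    name='extracted_table',
    description='Summary compensation table',
)

by_name = tables.get_tables(name='extracted_table')
by_description = tables.get_tables(description_regex='compensation')
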
{datamule-2.2.9 → datamule-2.3.0}/datamule.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 2.2.9
+Version: 2.3.0
 Summary: Work with SEC submissions at scale.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman
{datamule-2.2.9 → datamule-2.3.0}/setup.py

@@ -32,7 +32,7 @@ if not os.path.exists(file_path):
 setup(
     name="datamule",
     author="John Friedman",
-    version="2.2.9",
+    version="2.3.0",
     description="Work with SEC submissions at scale.",
     packages=find_packages(include=['datamule', 'datamule.*']),
     url="https://github.com/john-friedman/datamule-python",