PyPI - datamule - Versions diffs - 2.2.8__py3-none-any.whl → 2.3.0__py3-none-any.whl - Mend

datamule 2.2.8__py3-none-any.whl → 2.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

datamule/document/document.py CHANGED Viewed

@@ -7,8 +7,6 @@ from doc2dict import html2dict, visualize_dict, get_title, unnest_dict, pdf2dict
 from ..mapping_dicts.txt_mapping_dicts import dict_10k, dict_10q, dict_8k, dict_13d, dict_13g
 from ..mapping_dicts.xml_mapping_dicts import dict_345
 from ..mapping_dicts.html_mapping_dicts import *
-from selectolax.parser import HTMLParser
 from pathlib import Path
 import webbrowser
 from secsgml.utils import bytes_to_str
@@ -294,30 +292,13 @@ class Document:
             return bool(re.search(pattern, self.content))
         return False
-    # Note: this method will be heavily modified in the future
     def parse(self):
         # check if we have already parsed the content
         if self._data:
             return
         mapping_dict = None
-        if self.extension == '.txt':
-            content = self.text
-            if self.type in ['10-Q', '10-Q/A']:
-                mapping_dict = dict_10q
-            elif self.type in ['10-K','10-K/A']:
-                mapping_dict = dict_10k
-            elif self.type in ['8-K', '8-K/A']:
-                mapping_dict = dict_8k
-            elif self.type in ['SC 13D', 'SC 13D/A']:
-                mapping_dict = dict_13d
-            elif self.type in ['SC 13G', 'SC 13G/A']:
-                mapping_dict = dict_13g
-            self._data = {}
-            self._data['document'] = dict2dict(txt2dict(content=content, mapping_dict=mapping_dict))
-        elif self.extension in ['.htm', '.html']:
+        if self._data_bool:
             if self.type in ['1-K', '1-K/A']:
                 mapping_dict = dict_1kpartii_html
@@ -391,16 +372,26 @@ class Document:
                 mapping_dict = dict_t3_html
             elif self.type in ['NT 10-K', 'NT 10-K/A', 'NT 10-Q', 'NT 10-Q/A', 'NT 20-F', 'NT 20-F/A']:
                 mapping_dict = dict_nt10k_html
+            elif self.type in ['SC 13G', 'SC 13G/A']:
+                mapping_dict = dict_13g
+            elif self.type in ['SC 13D', 'SC 13D/A']:
+                mapping_dict = dict_13d
+            if self.extension in ['.htm','.html']:
+                dct = html2dict(content=self.content, mapping_dict=mapping_dict)
+            elif self.extension in ['.txt']:
+                dct = txt2dict(content=self.content, mapping_dict=mapping_dict)
+            elif self.extension == '.pdf':
+                dct = pdf2dict(content=self.content, mapping_dict=mapping_dict)
+            else:
+                dct = {}
-            dct = html2dict(content=self.content, mapping_dict=mapping_dict)
             self._data = dct
         elif self.extension == '.xml':
             if self.type in ['3', '4', '5', '3/A', '4/A', '5/A']:
                 mapping_dict = dict_345
             self._data = xml2dict(content=self.content, mapping_dict=mapping_dict)
-        elif self.extension == '.pdf':
-            self._data = pdf2dict(content=self.content, mapping_dict=mapping_dict)
         else:
             pass
@@ -415,6 +406,12 @@ class Document:
             if not isinstance(self._data, DataWithTags):
                 self._data = DataWithTags(self._data, self)
+        elif self.extension == '.xml':
+            if self._data is None:
+                self.parse()
+            if self._data is None:
+                self._data = {}
         return self._data
@@ -450,19 +447,46 @@ class Document:
             json.dump(self.data, f, indent=2)
     def parse_tables(self,must_exist_in_mapping=True):
-        if self.extension != '.xml':
-            self._tables = []
+        """Must exist in mapping means columns must occur in mapping schema."""
+        if self.extension == '.xml':
+            tables = Tables(document_type = self.type, accession=self.accession)
+            tables.parse_tables(data=self.data,must_exist_in_mapping=must_exist_in_mapping)
+            self._tables = tables
+        elif self._data_bool:
+            tables = Tables(document_type = self.type, accession=self.accession)
+            data_tuples = self.data_tuples
+            for i, (id, type, content, level) in enumerate(data_tuples):
+                if type == "table" and i > 0:
+                    description = None
+                    # Look at previous element
+                    prev_id, prev_type, prev_content, prev_level = data_tuples[i-1]
+                    # Case 1: Same level + text content
+                    if prev_level == level and prev_type in ["text", "textsmall"]:
+                        description = prev_content
+                    # Case 2: Higher level (lower number) + title
+                    elif prev_level < level and prev_type == "title":
+                        description = prev_content
+                    # Case 3: No matching description - add table without description
+                    # (description remains None)
+                    tables.add_table(data=content, description=description, name="extracted_table")
+            self._tables = tables
         else:
-            # Use the property to trigger parsing if needed
-            data = self.data
-            tables = Tables(document_type = self.type, accession=self.accession, data=data,must_exist_in_mapping=must_exist_in_mapping)
-            self._tables = tables.tables
+            self._tables = []
     @property
     def tables(self):
         if self._tables is None:
             self.parse_tables()
-        return self._tables
+        return self._tables.tables
     def write_csv(self, output_folder):
@@ -553,6 +577,7 @@ class Document:
             webbrowser.open('file://' + temp_path)
         else:
             print(f"Cannot open files with extension {self.extension}")
     def get_section(self, title=None, title_regex=None,title_class=None, format='dict'):
         if self._data_bool:
             if not self.data:
@@ -564,62 +589,8 @@ class Document:
             else:
                 return [flatten_dict(item[1],format) for item in result]
-   # TODO CHANGE THIS
-    def __iter__(self):
-        # Use the property to trigger parsing if needed
-        document_data = self.data
-        # Let's remove XML iterable for now
-        # Handle text-based documents
-        if self.extension in ['.txt', '.htm', '.html']:
-            if not document_data:
-                return iter([])
-            # Find highest hierarchy level from mapping dict
-            highest_hierarchy = float('inf')
-            section_type = None
-            if self.type in ['10-K', '10-Q']:
-                mapping_dict = dict_10k if self.type == '10-K' else dict_10q
-            elif self.type == '8-K':
-                mapping_dict = dict_8k
-            elif self.type == 'SC 13D':
-                mapping_dict = dict_13d
-            elif self.type == 'SC 13G':
-                mapping_dict = dict_13g
-            else:
-                return iter([])
-            # Find section type with highest hierarchy number
-            highest_hierarchy = -1  # Start at -1 to find highest
-            for mapping in mapping_dict['rules']['mappings']:
-                if mapping.get('hierarchy') is not None:
-                    if mapping['hierarchy'] > highest_hierarchy:
-                        highest_hierarchy = mapping['hierarchy']
-                        section_type = mapping['name']
-            if not section_type:
-                return iter([])
-            # Extract sections of the identified type
-            def find_sections(data, target_type):
-                sections = []
-                if isinstance(data, dict):
-                    if data.get('type') == target_type:
-                        sections.append({
-                            'item': data.get('text', ''),
-                            'text': flatten_hierarchy(data.get('content', []))
-                        })
-                    for value in data.values():
-                        if isinstance(value, (dict, list)):
-                            sections.extend(find_sections(value, target_type))
-                elif isinstance(data, list):
-                    for item in data:
-                        sections.extend(find_sections(item, target_type))
-                return sections
-            return iter(find_sections(document_data, section_type))
-        return iter([])
+    # TODO
+    def get_tables(self,description_regex=None,name=None):
+        # make sure tables is initialized
+        self.tables
+        return self._tables.get_tables(description_regex=description_regex, name=name)

datamule/document/tables/tables.py CHANGED Viewed

@@ -6,8 +6,10 @@ from .tables_npx import config_npx
 from .tables_sbsef import config_sbsef
 from .tables_sdr import config_sdr
 from .tables_proxyvotingrecord import config_proxyvotingrecord
+from doc2dict.utils.format_dict import _format_table
 from .utils import safe_get, flatten_dict
+import re
 # will add filing date param later? or extension
 all_tables_dict = {
     '3' : config_ownership,
@@ -93,25 +95,30 @@ def apply_mapping(flattened_data, mapping_dict, accession, must_exist_in_mapping
 # should have table type, accession, data
 class Table:
-    def __init__(self,data,name,accession):
+    def __init__(self,data,name,accession,description = None):
         self.data = data
         self.name = name
         self.accession = accession
+        self.description = description
+    # TODO MADE IN A HURRY #
+    def __str__(self):
+        formatted_table = _format_table(self.data)
+        if isinstance(formatted_table, list):
+            table_str = '\n'.join(formatted_table)
+        else:
+            table_str = str(formatted_table)
+        return f"Table '{self.name}' ({self.accession}) - {len(self.data) if isinstance(self.data, list) else 'N/A'} rows\ndescription: {self.description if self.description else ''}\n{table_str}"
 class Tables():
-    def __init__(self,document_type,accession,data,must_exist_in_mapping=True):
+    def __init__(self,document_type,accession):
         self.document_type = document_type
         self.accession = accession
-        self.data = data
-        # to fill in
         self.tables = []
-        self.parse_tables(must_exist_in_mapping=must_exist_in_mapping)
-    def parse_tables(self,must_exist_in_mapping=True):
-        # first select dict
+    def parse_tables(self,data,must_exist_in_mapping=True):
+        self.data = data
         try:
             tables_dict = all_tables_dict[self.document_type]
@@ -120,11 +127,32 @@ class Tables():
         # now get the dicts from the data
         data_dicts = seperate_data(tables_dict,self.data)
         # now flatten
         data_dicts = [(x,flatten_dict(y)) for x,y in data_dicts]
         for table_name, flattened_data in data_dicts:
             mapping_dict = tables_dict[table_name]['mapping']
             mapped_data = apply_mapping(flattened_data, mapping_dict, self.accession,must_exist_in_mapping)
-            self.tables.append(Table(mapped_data, table_name, self.accession))
+            self.tables.append(Table(mapped_data, table_name, self.accession))
+    def add_table(self,data,name,description=None):
+        self.tables.append(Table(data=data,name=name,accession=self.accession,description=description))
+    def get_tables(self, description_regex=None, name=None):
+        matching_tables = []
+        for table in self.tables:
+            # Check name match (exact match)
+            if name is not None:
+                if table.name == name:
+                    matching_tables.append(table)
+                    continue
+            # Check description regex match
+            if description_regex is not None and table.description is not None:
+                if re.search(description_regex, table.description):
+                    matching_tables.append(table)
+                    continue
+        return matching_tables

{datamule-2.2.8.dist-info → datamule-2.3.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 2.2.8
+Version: 2.3.0
 Summary: Work with SEC submissions at scale.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman

{datamule-2.2.8.dist-info → datamule-2.3.0.dist-info}/RECORD RENAMED Viewed

@@ -15,9 +15,9 @@ datamule/datamule/datamule_mysql_rds.py,sha256=Q6_h24-SNECWK60RnM6UQjUIp5dhJmfn3
 datamule/datamule/downloader.py,sha256=B22ULAuYzclxxVCH4DsLWUIyFUC5Iep-Hl1W3RgCfeg,18580
 datamule/datamule/sec_connector.py,sha256=VwOaODpHoAWy8JIky6kLR1-orW_PB61RHw7pIGRpkow,3288
 datamule/document/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datamule/document/document.py,sha256=d9Gv8_7zJVZhIVYtF3cLT_7MCtWZV1gn9_l3u8us7b0,24275
+datamule/document/document.py,sha256=NrMqhY_u_X7gyvraxY0hzZEDJddqSJDgiHFzkaRTBVA,23102
 datamule/document/tables/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datamule/document/tables/tables.py,sha256=8riSAof6o-Gxoo0SkiQAE61fw8NmzDnEhJe6dATzmvA,4487
+datamule/document/tables/tables.py,sha256=uEMDYg7c4iHjVtIjNQgCgZOGp6j9aFWVB05agpVsNOI,5727
 datamule/document/tables/tables_13fhr.py,sha256=-6tWcaTyNsb0XuW0WMBrYir9Zn1wLZL0laKxRYfPNyg,4265
 datamule/document/tables/tables_25nse.py,sha256=kpoOcIpra6i3Wx_6pUCj1fkx0wUbMhx7pc8yUkrBJb4,980
 datamule/document/tables/tables_informationtable.py,sha256=3yjuxYuLoBjRd6O0BNd0jQDmS1XUDjA6xp51Csq2cH8,649
@@ -58,7 +58,7 @@ datamule/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/utils/construct_submissions_data.py,sha256=NB_hvfxlRXPyt4Fgc-5qA8vJRItkLhBedCSTaxwW7Jg,5887
 datamule/utils/format_accession.py,sha256=60RtqoNqoT9zSKVb1DeOv1gncJxzPTFMNW4SNOVmC_g,476
 datamule/utils/pdf.py,sha256=Z9xrdVhKex2YdvjYsaPaygRE_J6P_JNiUGkwflz2Hw0,735
-datamule-2.2.8.dist-info/METADATA,sha256=MfCW0SCjpYwtorAPr-540bS8VhJ5_4hEwSbDZN_b-Zo,585
-datamule-2.2.8.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
-datamule-2.2.8.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
-datamule-2.2.8.dist-info/RECORD,,
+datamule-2.3.0.dist-info/METADATA,sha256=jUra4jM6LMxAS3IKnrF9urlK6ZI4ZAcl6yimnsD67pk,585
+datamule-2.3.0.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+datamule-2.3.0.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
+datamule-2.3.0.dist-info/RECORD,,

{datamule-2.2.8.dist-info → datamule-2.3.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{datamule-2.2.8.dist-info → datamule-2.3.0.dist-info}/top_level.txt RENAMED Viewed

File without changes

datamule 2.2.8__py3-none-any.whl → 2.3.0__py3-none-any.whl

datamule 2.2.8__py3-none-any.whl → 2.3.0__py3-none-any.whl