PyPI - content-extraction - Versions diffs - 0.4.1__py3-none-any.whl → 0.4.2__py3-none-any.whl - Mend

content-extraction 0.4.1__py3-none-any.whl → 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

content_extraction/file_handlers.py CHANGED Viewed

@@ -269,7 +269,7 @@ def process_file(input_path: str, output_dir: str, force_ext: str = '') -> str:
     parser = HTMLSectionParser()
     parsed_sections = parser.parse_sections(html_content)
     parsed_sections_output_file = output_dir_path / 'parsed_sections.json'
-    with open(parsed_sections_output_file) as f:
+    with open(parsed_sections_output_file, 'w') as f:
         f.write(json.dumps(parsed_sections))
     logger.info('[Processing File] Splitting parsed sections and creating JSON digest.')

content_extraction/semantic_chunk_html.py CHANGED Viewed

@@ -10,7 +10,7 @@ class HTMLSectionParser:
     """Fast parser for HTML that finds sections and splits content into subsections."""
     def __init__(self):
-        self.heading_tags = {"h1", "h2", "h3", "h4", "h5", "h6"}
+        self.heading_tags = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}
     def get_heading_level(self, element) -> int | None:
         """Extract heading level from an element."""
@@ -19,23 +19,21 @@ class HTMLSectionParser:
             return int(element.name[1])
         # Elements with role="heading" and aria-level
-        if element.get("role") == "heading":
-            aria_level = element.get("aria-level")
+        if element.get('role') == 'heading':
+            aria_level = element.get('aria-level')
             if aria_level and aria_level.isdigit():
                 return int(aria_level)
             # Default to level 1 if no aria-level specified
             return 1
         # Elements with aria-level (even without role="heading")
-        aria_level = element.get("aria-level")
+        aria_level = element.get('aria-level')
         if aria_level and aria_level.isdigit():
             return int(aria_level)
         return None
-    def extract_text_between_headings(
-        self, soup, start_element, end_element=None
-    ) -> str:
+    def extract_text_between_headings(self, soup, start_element, end_element=None) -> str:
         """Extract all content between two heading elements."""
         content_parts = []
         current = start_element.next_sibling
@@ -45,8 +43,8 @@ class HTMLSectionParser:
                 # Check if this is a heading element
                 if (
                     current.name in self.heading_tags
-                    or (current.get("role") == "heading")
-                    or current.get("aria-level", "").isdigit()
+                    or (current.get('role') == 'heading')
+                    or current.get('aria-level', '').isdigit()
                 ):
                     # Hit another heading, stop
                     break
@@ -59,14 +57,12 @@ class HTMLSectionParser:
                     break
                 content_parts.append(str(current))
-            elif (
-                hasattr(current, "string") and current.string and current.string.strip()
-            ):
+            elif hasattr(current, 'string') and current.string and current.string.strip():
                 # It's text content
                 content_parts.append(current.string)
             current = current.next_sibling
-        return "".join(content_parts).strip()
+        return ''.join(content_parts).strip()
     def _find_headings_in_element(self, element):
         """Find all heading elements within a given element."""
@@ -77,9 +73,7 @@ class HTMLSectionParser:
                 headings.append((child, level))
         return headings
-    def find_next_heading_at_level_or_higher(
-        self, soup, start_element, current_level: int
-    ):
+    def find_next_heading_at_level_or_higher(self, soup, start_element, current_level: int):
         """Find the next heading at the same level or higher."""
         current = start_element.next_sibling
@@ -94,7 +88,7 @@ class HTMLSectionParser:
     def parse_sections(self, html_content: str) -> list[dict[str, object]]:
         """Parse HTML and extract hierarchical sections."""
-        soup = BeautifulSoup(html_content, "lxml")
+        soup = BeautifulSoup(html_content, 'lxml')
         # Find all potential heading elements in document order
         headings = []
@@ -141,19 +135,17 @@ class HTMLSectionParser:
             if j < len(headings):
                 next_boundary = headings[j][0]
-            text_content = self.extract_text_between_headings(
-                soup, current_element, next_boundary
-            )
+            text_content = self.extract_text_between_headings(soup, current_element, next_boundary)
             # Build subsections recursively
             subsections = self._build_hierarchy(soup, subsection_headings)
             # Build the section dictionary
             section = {
-                "title": current_element.get_text().strip(),
-                "text": text_content,
-                "level": current_level,
-                "subsections": subsections,
+                'title': current_element.get_text().strip(),
+                'text': text_content,
+                'level': current_level,
+                'subsections': subsections,
             }
             result.append(section)

{content_extraction-0.4.1.dist-info → content_extraction-0.4.2.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: content_extraction
-Version: 0.4.1
+Version: 0.4.2
 Summary: Project dedicated to content extraction from unstructured files that contain some useful information.
 Requires-Python: >=3.12
 Description-Content-Type: text/markdown

{content_extraction-0.4.1.dist-info → content_extraction-0.4.2.dist-info}/RECORD RENAMED Viewed

@@ -1,17 +1,16 @@
 content_extraction/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 content_extraction/common_std_io.py,sha256=mSRaiI4OrnttEQ8Y92-LsJnAHEI3xLKnJvmXDHmkfWc,1547
 content_extraction/do_ocr.py,sha256=lrqwPYQlPuUHabirH_RzKbzHgYUPPpNeHDe_u4h9LEY,6886
-content_extraction/dspy_modules.py,sha256=0aAokJQNzczfowoUNK3BPMi_U18eXM9thHvciWaE5b0,732
 content_extraction/extract_from_pptx.py,sha256=IWd81sn7ZsyaQZdXP5Cgbk7GspcDYEjMnBkti-pTHQY,6572
-content_extraction/file_handlers.py,sha256=ppCi2A05Qns1I89jLu6gJyV2UidcY03DGjsZ8TkGXK8,11777
+content_extraction/file_handlers.py,sha256=I15c2dINQudsRY3wXsv0pNeNsXc8fm5PIZ7GkY4DfrM,11782
 content_extraction/fix_ocr.py,sha256=2xJ4c3VsGSy1l-qAukvhaV8QOp6yu5BY99Gb0DwamWQ,8009
 content_extraction/logging_config.py,sha256=GN1wuJJEspQ3z-FZIg134obsHweuiicZfz2an13a9_I,296
 content_extraction/parse_html.py,sha256=mOrZKXX59YcdWWhmbnoTnfXpwrg0znk38x0DMJIVes8,3137
 content_extraction/process.py,sha256=iLcmSjWhEg_DbgnftnVIfybIeLCuTEI57gasot0MtDk,1809
 content_extraction/process_document.sh,sha256=QbQOrV7isiEyxin1PBNGYmCbfVQ_eW-JgsbuQV4VB2o,1106
-content_extraction/semantic_chunk_html.py,sha256=iJPspKkrt95lL46JpC_9fgT8GfV8cz04TWEnU99rbBw,5786
+content_extraction/semantic_chunk_html.py,sha256=PpK2W2Fse5-SU7hBqE-JWEW_sT3cEPaKNRRD2orEs-k,5696
 content_extraction/split_and_create_digest.py,sha256=vW4lyeTlRzZcqJS15g8Xqq5IZB06unrUBnQV7RrFDmA,4342
-content_extraction-0.4.1.dist-info/METADATA,sha256=8V8OnWOkS4Nie974-BPfMfWTKH-CmTYo72yUmIjdJE8,6266
-content_extraction-0.4.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-content_extraction-0.4.1.dist-info/top_level.txt,sha256=a0I0EwSzsyd3p_aAENozn9i4I3aBn12XtrbqIvfzZec,19
-content_extraction-0.4.1.dist-info/RECORD,,
+content_extraction-0.4.2.dist-info/METADATA,sha256=glN_ZSgjFvvwhIp0X49yKSxJ1Av1EqjTZONDu_FMHOo,6266
+content_extraction-0.4.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+content_extraction-0.4.2.dist-info/top_level.txt,sha256=a0I0EwSzsyd3p_aAENozn9i4I3aBn12XtrbqIvfzZec,19
+content_extraction-0.4.2.dist-info/RECORD,,

content_extraction/dspy_modules.py DELETED Viewed

@@ -1,24 +0,0 @@
-import dspy
-lm = dspy.LM("openai/gpt-4o-mini", temperature=0.3, max_tokens=5000)
-dspy.configure(lm=lm)
-class CorrectHeadingLevelSignature(dspy.Signature):
-    """Correct heading levels. Main title should be H1, Chapter Titles H2, etc."""
-    headings: str = dspy.InputField(
-        description=r"String of headings extracted via OCR process, separated by \n"
-    )
-    corrected_headings: str = dspy.OutputField(
-        description="Headings with corrected level"
-    )
-class CorrectHeadingLevel(dspy.Module):
-    def __init__(self):
-        self.predictor = dspy.ChainOfThought(CorrectHeadingLevelSignature)
-    def forward(self, headings):
-        prediction = self.predictor(headings=headings)
-        return prediction

{content_extraction-0.4.1.dist-info → content_extraction-0.4.2.dist-info}/WHEEL RENAMED Viewed

File without changes

{content_extraction-0.4.1.dist-info → content_extraction-0.4.2.dist-info}/top_level.txt RENAMED Viewed

File without changes

content-extraction 0.4.1__py3-none-any.whl → 0.4.2__py3-none-any.whl

content-extraction 0.4.1__py3-none-any.whl → 0.4.2__py3-none-any.whl