content-extraction 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,164 @@
1
+ """
2
+ HTML Section Parser - Extracts hierarchical sections from HTML content
3
+ """
4
+
5
+ from bs4 import BeautifulSoup
6
+ from bs4.element import Tag
7
+
8
+
9
class HTMLSectionParser:
    """Fast parser for HTML that finds sections and splits content into subsections."""

    def __init__(self):
        # Standard HTML heading tag names.
        self.heading_tags = {"h1", "h2", "h3", "h4", "h5", "h6"}

    def get_heading_level(self, element) -> int | None:
        """Return the heading level of *element*, or None if it is not a heading.

        Resolution order:
        1. A numeric ``aria-level`` attribute always wins, so standard headings
           can be overridden (e.g. ``<h3 aria-level="1">`` is treated as level 1).
        2. Standard ``h1``-``h6`` tags use the digit from the tag name.
        3. ``role="heading"`` without an ``aria-level`` defaults to level 1.
        """
        # BUGFIX: check aria-level before the tag name so that it also
        # overrides h1-h6 headings, as documented in the package README
        # ("Aria Level Overrides"). Previously the tag name won.
        aria_level = element.get("aria-level")
        if aria_level and aria_level.isdigit():
            return int(aria_level)

        if element.name in self.heading_tags:
            return int(element.name[1])

        if element.get("role") == "heading":
            # Default to level 1 when no aria-level is specified.
            return 1

        return None

    def extract_text_between_headings(
        self, soup, start_element, end_element=None
    ) -> str:
        """Extract all content between two heading elements.

        Walks the siblings after *start_element*, collecting serialized HTML
        until *end_element*, another heading, or a container that itself holds
        headings is reached. *soup* is unused but kept for interface
        compatibility.
        """
        content_parts = []
        current = start_element.next_sibling

        while current and current != end_element:
            if isinstance(current, Tag):
                # A new heading starts the next section: stop here.
                if self.get_heading_level(current) is not None:
                    break
                # A container with headings inside is processed as
                # subsections, not as body text of this section.
                if self._find_headings_in_element(current):
                    break
                content_parts.append(str(current))
            elif (
                hasattr(current, "string") and current.string and current.string.strip()
            ):
                # Bare text node between tags.
                content_parts.append(current.string)
            current = current.next_sibling

        return "".join(content_parts).strip()

    def _find_headings_in_element(self, element):
        """Return (element, level) pairs for every heading nested in *element*."""
        return [
            (child, level)
            for child in element.find_all()
            if (level := self.get_heading_level(child)) is not None
        ]

    def find_next_heading_at_level_or_higher(
        self, soup, start_element, current_level: int
    ):
        """Return the next sibling heading at *current_level* or more important
        (numerically smaller) level, or None if there is none."""
        current = start_element.next_sibling

        while current:
            if isinstance(current, Tag):
                level = self.get_heading_level(current)
                if level is not None and level <= current_level:
                    return current
            current = current.next_sibling

        return None

    def parse_sections(self, html_content: str) -> list[dict[str, object]]:
        """Parse HTML and extract hierarchical sections.

        Returns a list of section dicts with keys ``title``, ``text``,
        ``level`` and ``subsections``; empty list when no headings are found.
        """
        soup = BeautifulSoup(html_content, "lxml")

        # All heading-like elements, in document order.
        headings = [
            (element, level)
            for element in soup.find_all()
            if (level := self.get_heading_level(element)) is not None
        ]

        if not headings:
            return []

        return self._build_hierarchy(soup, headings)

    def _build_hierarchy(self, soup, headings: list[tuple]) -> list[dict[str, object]]:
        """Build the nested section structure from (element, level) pairs."""
        if not headings:
            return []

        result = []
        i = 0

        while i < len(headings):
            current_element, current_level = headings[i]

            # Advance j past every heading that is deeper than the current
            # one; those become subsections of this section.
            j = i + 1
            while j < len(headings) and headings[j][1] > current_level:
                j += 1
            subsection_headings = headings[i + 1 : j]

            # The section body runs until the next heading at the same or a
            # more important level.
            next_boundary = headings[j][0] if j < len(headings) else None
            text_content = self.extract_text_between_headings(
                soup, current_element, next_boundary
            )

            result.append(
                {
                    "title": current_element.get_text().strip(),
                    "text": text_content,
                    "level": current_level,
                    "subsections": self._build_hierarchy(soup, subsection_headings),
                }
            )

            # Continue with the next section at the same or higher level.
            i = j

        return result
@@ -0,0 +1,134 @@
1
+ #!/usr/bin/env python3
2
+ import sys
3
+ import argparse
4
+ import hashlib
5
+ import json
6
+ import logging
7
+ from content_extraction.common_std_io import read_input, write_stream_of_obj
8
+ from dataclasses import dataclass, field, asdict
9
+
10
+ from .logging_config import setup_logging
11
+
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
@dataclass
class Node:
    """One section node as produced by the HTML section parser.

    Documents the expected input schema: {'title', 'text', 'level',
    'subsections'}. NOTE(review): this class is not referenced by the
    processing functions below, which operate on plain dicts — it appears to
    exist as schema documentation; confirm before removing.
    """

    title: str  # heading text of the section
    text: str   # HTML body of the section (headings excluded)
    level: int  # heading level (1-6, or a custom aria-level value)
    # Child sections. The default is an empty list, so None only occurs if a
    # caller passes it explicitly.
    subsections: list['Node'] | None = field(default_factory=list)
22
+
23
+
24
@dataclass
class SectionDigestNode:
    """Summary of a section: its title/text plus shortened immediate children."""

    title: str  # section heading text
    text: str   # full text for the node itself; shortened text for children
    # One level of children only; populated by generate_section_digest.
    subsections: list['SectionDigestNode'] = field(default_factory=list)
29
+
30
+
31
@dataclass
class ProcessResultNode:
    """One flattened output row: a node with its digest linkage."""

    digest_hash: str                 # BLAKE2b hex digest identifying this node
    parent_digest_hash: str | None   # parent's digest; None for top-level nodes
    title: str                       # section title
    text: str                        # section body text
    section_digest: SectionDigestNode  # digest of this node and immediate children
38
+
39
+
40
+ def shorten_text(text: str, max_elements: int = 2, subsections: list[dict] | None = None) -> str:
41
+ """Shorten text by splitting on lines and keeping at most max_elements, appending '...' if truncated."""
42
+ if max_elements == -1:
43
+ return text
44
+
45
+ if not text:
46
+ result = ''
47
+ for child in subsections or []:
48
+ result = '<p>Covered topics in this subsection:</p><ul>'
49
+ for child in subsections or []:
50
+ result += f'<li>{child.get("title")}</li>'
51
+ result += '</ul>'
52
+ return result
53
+
54
+ DELIM = ''
55
+ lines = text.splitlines()
56
+ if len(lines) <= max_elements:
57
+ if subsections:
58
+ lines.append('...')
59
+ return DELIM.join(lines)
60
+ shortened = lines[:max_elements]
61
+ shortened.append('...')
62
+ return DELIM.join(shortened)
63
+
64
+
65
def generate_section_digest(node: dict) -> SectionDigestNode:
    """Generate a section digest string for a node, including its title/text and immediate children."""
    own_text = node.get('text', '')
    digest = SectionDigestNode(title=node.get('title', ''), text=own_text)

    # Children keep one line of text when this node has its own text,
    # otherwise their full text (-1 disables shortening).
    child_budget = 1 if own_text else -1
    for child in node.get('subsections') or []:
        trimmed = shorten_text(
            child.get('text'), child_budget, child.get('subsections')
        )
        digest.subsections.append(
            SectionDigestNode(title=child.get('title'), text=trimmed)
        )
    return digest
77
+
78
+
79
def compute_digest_hash(section_digest: SectionDigestNode) -> str:
    """Compute a BLAKE2b hash of the section digest text as the node ID."""
    # The dataclass repr is the canonical serialization being hashed.
    serialized = str(section_digest).encode('utf-8')
    return hashlib.blake2b(serialized, digest_size=16).hexdigest()
85
+
86
+
87
def process_node(node: dict, parent_digest_hash: str | None = None) -> list[dict]:
    """
    Recursively process a node and its subsections, returning a flat list of nodes.
    """
    digest = generate_section_digest(node)
    node_id = compute_digest_hash(digest)

    # Plain keyword construction; asdict() recursively converts the nested
    # SectionDigestNode dataclasses into dicts for serialization.
    row = asdict(
        ProcessResultNode(
            digest_hash=node_id,
            parent_digest_hash=parent_digest_hash,
            title=node.get('title'),
            text=node.get('text'),
            section_digest=digest,
        )
    )

    flattened = [row]
    for child in node.get('subsections') or []:
        flattened.extend(process_node(child, parent_digest_hash=node_id))
    return flattened
107
+
108
+
109
def main():
    """CLI entry point: read hierarchical JSON, emit flattened JSONL rows."""
    parser = argparse.ArgumentParser(
        description=('Split hierarchical JSON into JSON Lines with node summaries and parent digests.')
    )
    parser.add_argument('input', nargs='?', help='Input JSON file (defaults to stdin)')
    parser.add_argument('-o', '--output', help='Output JSONL file (defaults to stdout)')
    args = parser.parse_args()

    setup_logging()

    logger.info(f'Processing input from {args.input or "stdin"}')
    data_list = json.loads(read_input(args.input))
    logger.info(f'Found {len(data_list)} top-level sections to process.')

    # Flatten every top-level section into one stream of rows.
    nodes: list[dict] = []
    for data in data_list:
        nodes.extend(process_node(data, parent_digest_hash=None))
    write_stream_of_obj(nodes, args.output)
    logger.info(f'Successfully processed and wrote {len(nodes)} nodes to {args.output or "stdout"}.')


if __name__ == '__main__':
    sys.exit(main())
@@ -0,0 +1,258 @@
1
+ Metadata-Version: 2.4
2
+ Name: content_extraction
3
+ Version: 0.1.0
4
+ Summary: Project dedicated to content extraction from unstructured files that contain some useful information.
5
+ Requires-Python: >=3.13
6
+ Description-Content-Type: text/markdown
7
+ Requires-Dist: beautifulsoup4>=4.13.4
8
+ Requires-Dist: lxml>=6.0.0
9
+ Requires-Dist: python-pptx>=1.0.2
10
+
11
+ # HTML Content Extraction Tool
12
+
13
+ A powerful command-line tool for extracting structured content from HTML documents. Converts HTML sections into hierarchical JSON data while preserving formatting, links, and semantic structure.
14
+
15
+ ## Features
16
+
17
+ - **Hierarchical Parsing**: Automatically detects heading levels and creates nested section structures
18
+ - **HTML Preservation**: Maintains original formatting, links, and semantic elements
19
+ - **Smart Element Filtering**: Includes meaningful content while filtering out irrelevant elements
20
+ - **Flexible Input/Output**: Read from files or stdin, output to files or stdout
21
+ - **Section Support**: Works with existing `<section>`, `<article>`, and `<main>` elements
22
+ - **Custom Headings**: Supports both standard headings (`h1`-`h6`) and custom headings with `aria-level`
23
+
24
+ ## Installation
25
+
26
+ ```bash
27
+ # Install dependencies
28
+ pip install beautifulsoup4 lxml
29
+
30
+ # Clone or download this repository
31
+ git clone <repository-url>
32
+ cd content-extraction
33
+ ```
34
+
35
+ ## Usage
36
+
37
+ ### Basic Usage
38
+
39
+ ```bash
40
+ # Parse HTML file and output to stdout
41
+ python main.py example.html
42
+
43
+ # Parse with pretty-printed JSON
44
+ python main.py --pretty example.html
45
+
46
+ # Save output to file
47
+ python main.py example.html -o output.json
48
+
49
+ # Read from stdin
50
+ cat example.html | python main.py --pretty
51
+
52
+ # Verbose mode with debug information
53
+ python main.py --verbose example.html
54
+ ```
55
+
56
+ ### Command Line Options
57
+
58
+ ```
59
+ usage: main.py [-h] [-o FILE] [--pretty] [-v] [--version] [input_file]
60
+
61
+ Extract structured content from HTML documents
62
+
63
+ positional arguments:
64
+ input_file Input HTML file (if not provided, reads from stdin)
65
+
66
+ options:
67
+ -h, --help show this help message and exit
68
+ -o, --output FILE Output JSON file (if not provided, writes to stdout)
69
+ --pretty Pretty-print JSON output with indentation
70
+ -v, --verbose Show verbose output and debug information
71
+ --version show program's version number and exit
72
+ ```
73
+
74
+ ## Output Format
75
+
76
+ The tool outputs JSON with the following structure:
77
+
78
+ ```json
79
+ {
80
+ "title": "Section Title",
81
+ "text": "<p>HTML content preserved</p>",
82
+ "level": 1,
83
+ "subsections": [
84
+ {
85
+ "title": "Subsection Title",
86
+ "text": "<p>Subsection content</p>",
87
+ "level": 2,
88
+ "subsections": []
89
+ }
90
+ ]
91
+ }
92
+ ```
93
+
94
+ ### Fields
95
+
96
+ - **`title`**: Text content of the highest-level heading in the section
97
+ - **`text`**: All content except headings, with HTML formatting preserved
98
+ - **`level`**: Heading level of the main heading (1-6 from the tag name, or a custom `aria-level` value)
99
+ - **`subsections`**: Array of nested subsections with the same structure
100
+
101
+ ## Examples
102
+
103
+ ### Simple Section
104
+
105
+ **Input HTML:**
106
+ ```html
107
+ <section>
108
+ <h2>Getting Started</h2>
109
+ <p>Welcome to our <a href="/api">API</a>!</p>
110
+ <ul>
111
+ <li>Step 1: Register</li>
112
+ <li>Step 2: Get API key</li>
113
+ </ul>
114
+ </section>
115
+ ```
116
+
117
+ **Output:**
118
+ ```json
119
+ {
120
+ "title": "Getting Started",
121
+ "text": "<p>Welcome to our <a href=\"/api\">API</a>!</p>\n<ul>\n<li>Step 1: Register</li>\n<li>Step 2: Get API key</li>\n</ul>",
122
+ "level": 2,
123
+ "subsections": []
124
+ }
125
+ ```
126
+
127
+ ### Nested Sections
128
+
129
+ **Input HTML:**
130
+ ```html
131
+ <main>
132
+ <h1>Documentation</h1>
133
+ <p>Introduction text.</p>
134
+ <h2>Installation</h2>
135
+ <p>Installation instructions.</p>
136
+ <h3>Requirements</h3>
137
+ <p>System requirements.</p>
138
+ <h2>Usage</h2>
139
+ <p>Usage examples.</p>
140
+ </main>
141
+ ```
142
+
143
+ **Output:**
144
+ ```json
145
+ {
146
+ "title": "Documentation",
147
+ "text": "<p>Introduction text.</p>",
148
+ "level": 1,
149
+ "subsections": [
150
+ {
151
+ "title": "Installation",
152
+ "text": "<p>Installation instructions.</p>",
153
+ "level": 2,
154
+ "subsections": [
155
+ {
156
+ "title": "Requirements",
157
+ "text": "<p>System requirements.</p>",
158
+ "level": 3,
159
+ "subsections": []
160
+ }
161
+ ]
162
+ },
163
+ {
164
+ "title": "Usage",
165
+ "text": "<p>Usage examples.</p>",
166
+ "level": 2,
167
+ "subsections": []
168
+ }
169
+ ]
170
+ }
171
+ ```
172
+
173
+ ## Supported HTML Elements
174
+
175
+ ### Included Elements
176
+ - Paragraphs (`<p>`)
177
+ - Lists (`<ul>`, `<ol>`, `<li>`)
178
+ - Links (`<a>`)
179
+ - Formatting (`<strong>`, `<em>`, `<code>`, etc.)
180
+ - Semantic elements (`<section>`, `<article>`, `<aside>`, etc.)
181
+ - Tables (`<table>`, `<tr>`, `<td>`, etc.)
182
+ - Media (`<img>`, `<figure>`)
183
+ - Code blocks (`<pre>`, `<code>`)
184
+ - Quotes (`<blockquote>`, `<q>`)
185
+ - All other content elements with meaningful text
186
+
187
+ ### Excluded Elements
188
+ - Headings (processed separately as section titles)
189
+ - Script and style tags
190
+ - Meta elements
191
+ - Empty elements
192
+ - Elements containing headings (processed as subsections)
193
+
194
+ ## Smart Root Element Detection
195
+
196
+ The tool automatically detects the best root element in this priority order:
197
+
198
+ 1. `<main>` - Primary content area
199
+ 2. `<article>` - Standalone article content
200
+ 3. `<section>` - Document section
201
+ 4. `<body>` - Document body
202
+ 5. First substantial `<div>` - Fallback for div-based layouts
203
+ 6. Entire document - Last resort
204
+
205
+ ## Advanced Features
206
+
207
+ ### Custom Headings
208
+ Supports custom headings with ARIA attributes:
209
+
210
+ ```html
211
+ <div role="heading" aria-level="2">Custom Heading</div>
212
+ ```
213
+
214
+ ### Aria Level Overrides
215
+ Standard headings can have their levels overridden:
216
+
217
+ ```html
218
+ <h3 aria-level="1">This is treated as level 1</h3>
219
+ ```
220
+
221
+ ### Mixed Content
222
+ Handles complex layouts with mixed content types:
223
+
224
+ ```html
225
+ <div>
226
+ <h1>Main Title</h1>
227
+ <p>Introduction</p>
228
+ <section>
229
+ <h2>Section in Section</h2>
230
+ <p>Section content</p>
231
+ </section>
232
+ <h2>Regular Heading</h2>
233
+ <p>Regular content</p>
234
+ </div>
235
+ ```
236
+
237
+ ## Testing
238
+
239
+ Run the test suite:
240
+
241
+ ```bash
242
+ python -m pytest tests/ -v
243
+ ```
244
+
245
+ The project includes comprehensive tests covering:
246
+ - Basic parsing functionality
247
+ - Heading level detection
248
+ - Content extraction
249
+ - Section handling
250
+ - Edge cases and error conditions
251
+
252
+ ## License
253
+
254
+ This project is open source. See LICENSE file for details.
255
+
256
+ ## Contributing
257
+
258
+ Contributions are welcome! Please submit pull requests with tests for any new features.
@@ -0,0 +1,15 @@
1
+ content_extraction/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ content_extraction/common_std_io.py,sha256=mSRaiI4OrnttEQ8Y92-LsJnAHEI3xLKnJvmXDHmkfWc,1547
3
+ content_extraction/do_ocr.py,sha256=lrqwPYQlPuUHabirH_RzKbzHgYUPPpNeHDe_u4h9LEY,6886
4
+ content_extraction/dspy_modules.py,sha256=0aAokJQNzczfowoUNK3BPMi_U18eXM9thHvciWaE5b0,732
5
+ content_extraction/extract_from_pptx.py,sha256=IWd81sn7ZsyaQZdXP5Cgbk7GspcDYEjMnBkti-pTHQY,6572
6
+ content_extraction/file_handlers.py,sha256=mO4HWiA_ZEKkV8KZP4fOz_nGnxDpghkqAhS0ADG9Oqk,11149
7
+ content_extraction/fix_ocr.py,sha256=2xJ4c3VsGSy1l-qAukvhaV8QOp6yu5BY99Gb0DwamWQ,8009
8
+ content_extraction/logging_config.py,sha256=GN1wuJJEspQ3z-FZIg134obsHweuiicZfz2an13a9_I,296
9
+ content_extraction/parse_html.py,sha256=mOrZKXX59YcdWWhmbnoTnfXpwrg0znk38x0DMJIVes8,3137
10
+ content_extraction/semantic_chunk_html.py,sha256=iJPspKkrt95lL46JpC_9fgT8GfV8cz04TWEnU99rbBw,5786
11
+ content_extraction/split_and_create_digest.py,sha256=bKZL9Axc74zLH_VrlNjd46ZiVTQQrAY5iNJCotO-8v8,4253
12
+ content_extraction-0.1.0.dist-info/METADATA,sha256=3dQRIhF8zxiifsp3Fxpo8BCKqvV9N3xtjyCAkNlwQ_I,6201
13
+ content_extraction-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
14
+ content_extraction-0.1.0.dist-info/top_level.txt,sha256=a0I0EwSzsyd3p_aAENozn9i4I3aBn12XtrbqIvfzZec,19
15
+ content_extraction-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (80.9.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1 @@
1
+ content_extraction