content-extraction 0.1.0 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- content_extraction/__init__.py +0 -0
- content_extraction/common_std_io.py +50 -0
- content_extraction/do_ocr.py +199 -0
- content_extraction/dspy_modules.py +24 -0
- content_extraction/extract_from_pptx.py +174 -0
- content_extraction/file_handlers.py +280 -0
- content_extraction/fix_ocr.py +245 -0
- content_extraction/logging_config.py +13 -0
- content_extraction/parse_html.py +117 -0
- content_extraction/semantic_chunk_html.py +164 -0
- content_extraction/split_and_create_digest.py +134 -0
- content_extraction-0.1.0.dist-info/METADATA +258 -0
- content_extraction-0.1.0.dist-info/RECORD +15 -0
- content_extraction-0.1.0.dist-info/WHEEL +5 -0
- content_extraction-0.1.0.dist-info/top_level.txt +1 -0
content_extraction/file_handlers.py
@@ -0,0 +1,280 @@
import os
import shutil
import subprocess
import tempfile
import mimetypes
import logging
from urllib.parse import urlparse

import requests

from content_extraction.extract_from_pptx import extract_content as extract_pptx_content
from content_extraction.semantic_chunk_html import HTMLSectionParser
from content_extraction.common_std_io import write_stream_of_obj
from content_extraction.split_and_create_digest import process_node


logger = logging.getLogger(__name__)


class FileHandlerError(Exception):
    """Custom exception for file handling errors."""


def _convert_with_pandoc(file_path: str, output_dir: str):
    """Helper function to run pandoc for different file types."""
    output_html_path = os.path.join(output_dir, 'index.html')
    try:
        subprocess.run(
            ['pandoc', file_path, '-s', '-o', output_html_path],
            check=True,
            capture_output=True,
            text=True,
            encoding='utf-8',
        )
        return output_html_path
    except FileNotFoundError:
        error_msg = 'Error: `pandoc` command not found. Please ensure pandoc is installed and in your PATH.'
        logger.error(error_msg)
        raise FileHandlerError(error_msg)
    except subprocess.CalledProcessError as e:
        logger.error(f'Error converting {file_path} to HTML: {e.stderr}')
        raise FileHandlerError(f'Pandoc conversion failed for {file_path}') from e


def process_pdf(file_path: str, output_dir: str):
    """
    Handles PDF files by running the main processing script.
    The script is expected to convert the PDF to HTML and place it as index.html
    in the output_dir.
    """
    logger.info(f'[Processing PDF file] started for: "{file_path}"')
    # This path assumes the script is located at src/scripts/process_document.sh
    script_path = os.path.join(os.path.dirname(__file__), '..', 'scripts', 'process_document.sh')
    output_html_path = os.path.join(output_dir, 'index.html')  # Define output_html_path
    logger.debug(f'[Processing PDF file] script path: "{script_path}"; output_html_path: "{output_html_path}"')

    if not os.path.exists(script_path):
        raise FileNotFoundError(f'Processing script not found at: {script_path}')

    # Ensure the script is executable
    if not os.access(script_path, os.X_OK):
        logger.warning(f'Script {script_path} is not executable. Attempting to set permissions.')
        try:
            os.chmod(script_path, 0o755)
        except OSError as e:
            raise FileHandlerError(f'Failed to set executable permissions for {script_path}: {e}')

    try:
        # The script is expected to take input_file and output_directory as arguments
        subprocess.run(
            [script_path, file_path, output_dir],
            check=True,  # Raise CalledProcessError if the command returns a non-zero exit code
            capture_output=True,  # Capture stdout and stderr
            text=True,  # Decode stdout/stderr as text
            encoding='utf-8',
        )
        if not os.path.exists(output_html_path):
            raise FileHandlerError(
                f'Processing script {script_path} completed, but did not produce the expected output file: {output_html_path}'
            )
    except subprocess.CalledProcessError as e:
        logger.error(f'Error processing PDF with script: {e.stderr}')
        raise FileHandlerError(f'PDF processing script failed for {file_path}') from e

    logger.info(f'[Processing PDF file] completed for: "{file_path}"')
    return output_html_path


def process_pptx(file_path: str, output_dir: str):
    """
    Handles PowerPoint files using the existing pptx extraction function.
    """
    logger.info(f'[Processing PPTX file] started for: "{file_path}"')
    html_out = extract_pptx_content(file_path, output_dir)
    if not html_out:
        raise FileHandlerError(f'Failed to extract content from {file_path}')

    # Standardize the output filename to index.html
    standard_path = os.path.join(output_dir, 'index.html')
    if os.path.abspath(html_out) != os.path.abspath(standard_path):
        shutil.move(html_out, standard_path)
    logger.info(f'[Processing PPTX file] completed for: "{file_path}"')
    return standard_path


def process_docx(file_path: str, output_dir: str):
    """Handles Word documents by converting them to HTML using pandoc."""
    logger.info(f'[Processing DOCX file] started for: "{file_path}"')
    result = _convert_with_pandoc(file_path, output_dir)
    logger.info(f'[Processing DOCX file] completed for: "{file_path}"')
    return result


def process_markdown(file_path: str, output_dir: str):
    """Handles Markdown files by converting them to HTML using pandoc."""
    logger.info(f'[Processing Markdown file] started for: "{file_path}"')
    result = _convert_with_pandoc(file_path, output_dir)
    logger.info(f'[Processing Markdown file] completed for: "{file_path}"')
    return result


def process_html(file_path: str, output_dir: str):
    """
    Handles HTML files by copying them to the output directory with the standard name.
    """
    logger.info(f'[Processing HTML file] started for: "{file_path}"')
    dest_path = os.path.join(output_dir, 'index.html')
    if os.path.abspath(file_path) != os.path.abspath(dest_path):
        shutil.move(file_path, dest_path)
    logger.info(f'[Processing HTML file] completed for: "{file_path}"')
    return dest_path


def handle_url(url: str, output_dir: str, force_ext: str = ''):
    """
    Handles a URL by determining the file type and using the most efficient
    processing method.
    """
    logger.info(f'[Processing URL] started for: "{url}"')
    file_ext = None

    if force_ext:
        file_ext = f'.{force_ext.lstrip(".")}'
    else:
        try:
            response = requests.head(url, timeout=15, allow_redirects=True)
            response.raise_for_status()
            content_type = response.headers.get('content-type')
            if content_type:
                mime_type = content_type.split(';')[0].strip()
                file_ext = mimetypes.guess_extension(mime_type)

            if not file_ext or file_ext in ['.bin']:
                parsed_url = urlparse(url)
                _, ext_from_url = os.path.splitext(parsed_url.path)
                if ext_from_url:
                    file_ext = ext_from_url

        except requests.RequestException as e:
            raise FileHandlerError(f'Failed to retrieve headers from URL {url}: {e}') from e

    if not file_ext or file_ext.lower() not in EXTENSION_HANDLERS:
        logger.warning(f'Could not determine a supported file type for {url}. Defaulting to HTML.')
        file_ext = '.html'

    # Download to a temporary file for all types except HTML, which is streamed.
    if file_ext == '.html':
        output_html_path = os.path.join(output_dir, 'index.html')
        try:
            with requests.get(url, stream=True, timeout=60) as r:
                r.raise_for_status()
                with open(output_html_path, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)
            return output_html_path
        except requests.RequestException as e:
            raise FileHandlerError(f'Failed to download HTML content from {url}: {e}')

    handler_func = EXTENSION_HANDLERS.get(file_ext.lower())
    if not handler_func:
        raise FileHandlerError(f"No handler found for file type '{file_ext}' from URL {url}")

    temp_file_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=file_ext) as temp_file:
            temp_file_path = temp_file.name

            with requests.get(url, stream=True, timeout=120) as r:
                r.raise_for_status()
                for chunk in r.iter_content(chunk_size=8192):
                    temp_file.write(chunk)
    except requests.RequestException as e:
        raise FileHandlerError(f'Failed to download content from {url}: {e}') from e

    logger.info(f'[Processing URL] completed for: "{url}"')
    return handler_func(temp_file_path, output_dir)


# Mapping of file extensions to handler functions
EXTENSION_HANDLERS = {
    '.pdf': process_pdf,
    '.pptx': process_pptx,
    '.docx': process_docx,
    '.md': process_markdown,
    '.html': process_html,
}


def get_handler(input_path: str, force_ext: str = ''):
    """
    Determines and returns the correct file handler function based on the input.
    """
    if input_path.startswith(('http://', 'https://')):
        return lambda output_dir: handle_url(input_path, output_dir, force_ext)

    if not os.path.exists(input_path):
        raise FileNotFoundError(f'Input file not found: {input_path}')

    _, ext = os.path.splitext(input_path)
    file_ext = f'.{force_ext.lstrip(".")}' if force_ext else ext

    if not file_ext:
        raise ValueError('File has no extension, and --force-ext was not provided.')

    handler_func = EXTENSION_HANDLERS.get(file_ext.lower())

    if not handler_func:
        logger.error(f'Unsupported file type: {file_ext}')
        raise ValueError(f'Unsupported file type: {file_ext}')

    return lambda output_dir: handler_func(input_path, output_dir)


def process_file(input_path: str, output_dir: str, force_ext: str = '') -> str:
    """
    Main entry point for processing a file or URL.
    It identifies the file type, runs the appropriate handler, and returns the path to the final processed HTML file.
    """
    os.makedirs(output_dir, exist_ok=True)
    logger.info(f'[Processing File] Retrieving correct parser for "{input_path}"')
    handler = get_handler(input_path, force_ext)
    try:
        final_html_path = handler(output_dir)
    except FileHandlerError as e:
        logger.error(
            f'[Processing File] Processing failed to produce an output file for "{input_path}"',
            extra={'error': str(e)},
        )
        raise

    if not final_html_path or not os.path.exists(final_html_path):
        raise FileHandlerError(f"[Processing File] Processing failed to produce an output file for '{input_path}'")

    logger.info(f'[Processing File] Reading generated HTML file in "{final_html_path}"')
    try:
        with open(final_html_path, 'r', encoding='utf-8') as f:
            html_content = f.read()
    except Exception as e:
        logger.error(
            f'[Processing File] Failed to read the generated HTML file at {final_html_path}',
            extra={'error': str(e)},
        )
        raise

    logger.info('[Processing File] Parsing HTML into sections.')
    parser = HTMLSectionParser()
    parsed_sections = parser.parse_sections(html_content)

    logger.info('[Processing File] Splitting parsed sections and creating JSON digest.')
    jsonl_output_path = os.path.join(output_dir, 'sections.jsonl')

    all_nodes = []
    if parsed_sections:
        for section in parsed_sections:
            all_nodes.extend(process_node(section, parent_digest_hash=None))

    write_stream_of_obj(all_nodes, jsonl_output_path)
    logger.info(f'[Processing File] Successfully created JSON digest at {jsonl_output_path}')

    return final_html_path
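For orientation, a minimal sketch of driving this module end to end. It assumes the package is installed and its external tools (`pandoc` for `.docx`/`.md`, the bundled `process_document.sh` for `.pdf`) are available; the file names and URL below are illustrative only, not part of the package.

```python
# Hypothetical usage sketch, not shipped with the package.
from content_extraction.file_handlers import process_file

# Convert a local DOCX to out/index.html, then chunk it into out/sections.jsonl.
html_path = process_file('report.docx', output_dir='out')
print(html_path)  # e.g. 'out/index.html'

# URLs work too; force_ext skips content-type sniffing and picks the handler directly.
process_file('https://example.com/slides', output_dir='out_url', force_ext='pptx')
```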
content_extraction/fix_ocr.py
@@ -0,0 +1,245 @@
#!/usr/bin/env python3
"""
Markdown File Formatting Automation Script

This script automates two specific formatting fixes in a Markdown file:
1. Adjusting heading levels based on numerical hierarchy.
2. Formatting the REFERENCES section with consistent spacing.

It operates as a command-line tool, reading from a file or stdin and
writing to a file or stdout, in a standard UNIX-like fashion.
"""

import re
import argparse
import difflib
from typing import Iterable
import sys
import logging

from content_extraction.common_std_io import write_output
from .logging_config import setup_logging


logger = logging.getLogger(__name__)


def adjust_headings(lines):
    """
    Adjusts heading levels to match numerical hierarchy and separates paragraphs.

    This function yields lines of text, ensuring appropriate spacing around
    headings and their associated paragraphs.

    Args:
        lines (list): A list of strings, where each string is a line from the input.

    Yields:
        str: The processed lines of text.
    """
    HEADING_PATTERN = re.compile(r'^(#*)\s+([A-Z]\s?(?:\.\s*\d+)*|\d+(?:\.\s*\d+)*)\s*(.*)$')
    ORDERED_LIST_PATTERN = re.compile(r'^\s*\d+\.\s')

    for line in lines:
        # If the line is a numbered list item, leave it as is.
        if ORDERED_LIST_PATTERN.match(line):
            yield line
            continue

        match = HEADING_PATTERN.match(line)

        if not match:
            stripped_line = line.lstrip()
            if stripped_line.startswith('#'):
                # Default non-matching headings to level 4
                text = stripped_line.lstrip('#').strip()
                yield ''
                yield '#### ' + text
                continue

            # Keep non-matching lines as is
            yield line
            continue

        # A candidate heading was found
        _, heading_number, heading_text = match.groups()

        # Determine correct markdown level
        parts = heading_number.split('.')
        desired_hashes_count = len(parts) + 1

        # Process heading text to separate title from a potential inline paragraph.
        heading_text_stripped = heading_text.strip()
        clean_title = heading_text_stripped
        paragraph = None

        if '.' in heading_text_stripped:
            title_parts = heading_text_stripped.split('.', 1)
            potential_paragraph = title_parts[1].strip()

            # If the part after the period has letters, it's a paragraph.
            if potential_paragraph and potential_paragraph[0].isalpha():
                clean_title = title_parts[0].strip() + '.'
                paragraph = potential_paragraph

        # Yield a blank line before the new heading for spacing
        yield ''
        # Construct and yield the new heading line
        new_heading_line = '#' * desired_hashes_count + ' ' + heading_number + ' ' + clean_title
        yield new_heading_line

        if paragraph:
            # Yield a blank line between heading and its paragraph, then the paragraph
            yield ''
            yield paragraph


def format_references(lines):
    """
    Format the REFERENCES section with consistent spacing.

    Args:
        lines (list): List of lines from the input file.

    Returns:
        list: Modified lines with properly formatted references.
    """
    modified_lines = []
    in_references_section = False

    for line in lines:
        # Check if we're entering the REFERENCES section
        if line.strip() == '# REFERENCES':
            in_references_section = True
            modified_lines.append(line)
            continue

        if in_references_section:
            stripped = line.strip()
            if stripped:  # If the line is not empty
                modified_lines.append(stripped)
                # Append one blank line after each non-empty line
                modified_lines.append('')
        else:
            # Not in references section
            modified_lines.append(line)

    return modified_lines


def process_science_paper(text_file_content: str, heading_file_content: str):
    """
    Process markdown content with both formatting fixes.
    """
    lines = text_file_content.splitlines()

    adjusted_lines_generator = adjust_headings(lines)

    # Consume the generator to pass a list to the next function
    formatted_lines = format_references(list(adjusted_lines_generator))

    # Join lines back into a single string with a trailing newline
    return '\n'.join(formatted_lines) + '\n'


def parse_ndiff(diff_lines: Iterable[str]) -> list[tuple[str, str]]:
    """
    Turn an ndiff iterable into a list of (old_line, new_line) patches.

    Only pairs up “- old” followed by “+ new” within the same hunk.
    """
    patches: list[tuple[str, str]] = []
    pending_old = None

    for line in diff_lines:
        if line.startswith('- '):
            pending_old = line[2:]
        elif line.startswith('+ ') and pending_old is not None:
            patches.append((pending_old, line[2:]))
            pending_old = None
        elif line.startswith(' ') or not line:
            patches.append((pending_old or '', ''))
            pending_old = None

    return patches


def apply_heading_patches(ocr_text: str, diff_lines: Iterable[str]) -> str:
    """
    Apply heading corrections from an ndiff iterable to the OCR text.

    For each (old, new) patch, replace the first exact match of old in the OCR
    text with new.
    """
    patches = parse_ndiff(diff_lines)
    lines = ocr_text.splitlines()

    for old_heading, new_heading in patches:
        for idx, line in enumerate(lines):
            if line == old_heading:
                lines[idx] = new_heading
                break

    return '\n'.join(lines)


def process_general_paper(text_file_content: str, heading_file_content: str) -> str:
    from content_extraction.dspy_modules import CorrectHeadingLevel

    heading_corrector = CorrectHeadingLevel()
    pred = heading_corrector(heading_file_content)
    corrected_headings = pred.corrected_headings
    with open('corrected_headings.txt', 'w') as f:
        f.write(corrected_headings)
    diff = difflib.ndiff(heading_file_content.splitlines(), corrected_headings.splitlines())
    fixed_text = apply_heading_patches(text_file_content, diff)
    return fixed_text


def main():
    """Main function to handle command line arguments and execute the script."""
    setup_logging()
    parser = argparse.ArgumentParser(
        description='Automate markdown file formatting fixes.',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s input.md              # Parse file, output to stdout
  %(prog)s -o output.md input.md # Parse file, save to file
  cat input.md | %(prog)s        # Parse from stdin
  %(prog)s --verbose input.md    # Show debug information
        """,
    )
    parser.add_argument('ocr_input_file', help='Path to input markdown file')
    parser.add_argument('headings_input_file', help='Path to markdown file with headings')
    parser.add_argument(
        '-o',
        '--output',
        help='Path to output markdown file (if not provided, writes to stdout)',
    )
    parser.add_argument(
        '--science_paper',
        action='store_true',
        help='Indicates that the input is a science paper. Parsing optimized for scientific papers.',
    )
    args = parser.parse_args()

    with open(args.ocr_input_file, 'r') as f:
        markdown_content = f.read()

    with open(args.headings_input_file, 'r') as f:
        headings_content = f.read()

    # Process the markdown content
    if args.science_paper:
        processed_content = process_science_paper(markdown_content, headings_content)
    else:
        processed_content = process_general_paper(markdown_content, headings_content)

    # Write output to file or stdout
    write_output(processed_content, args.output)


if __name__ == '__main__':
    sys.exit(main())
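To illustrate the patch flow used by `process_general_paper`: `difflib.ndiff` compares the original heading list against the corrected one, `parse_ndiff` pairs each `- old` line with its `+ new` line, and `apply_heading_patches` rewrites the first exact match of each old heading in the OCR text. A self-contained sketch with made-up headings (in the real pipeline the corrected list comes from the `CorrectHeadingLevel` dspy module):

```python
# Illustrative only; headings are invented for the demo.
import difflib
from content_extraction.fix_ocr import apply_heading_patches

ocr_text = '# 1 Introduction\nSome body text.\n# 2.1 Methods\nMore text.'
original = ['# 1 Introduction', '# 2.1 Methods']
corrected = ['## 1 Introduction', '### 2.1 Methods']

diff = difflib.ndiff(original, corrected)
print(apply_heading_patches(ocr_text, diff))
# ## 1 Introduction
# Some body text.
# ### 2.1 Methods
# More text.
```

Note that `parse_ndiff` silently skips ndiff's `? ` hint lines, since they match none of its three prefixes.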
content_extraction/logging_config.py
@@ -0,0 +1,13 @@
import logging
import sys


def setup_logging(level=logging.INFO):
    """
    Set up basic logging for the application.
    """
    logging.basicConfig(
        level=level,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        stream=sys.stdout,  # Log to stdout
    )
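Typical use, as a sketch: call `setup_logging` once at process start; module-level loggers created with `logging.getLogger(__name__)` then inherit the root configuration and write timestamped lines to stdout.

```python
# Minimal sketch of wiring up the package's logging.
import logging
from content_extraction.logging_config import setup_logging

setup_logging(level=logging.DEBUG)
logging.getLogger('demo').info('logs now go to stdout with timestamps')
```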
content_extraction/parse_html.py
@@ -0,0 +1,117 @@
#!/usr/bin/env python3
"""
HTML Content Extraction CLI

A command-line tool for extracting structured content from HTML documents.
Converts HTML sections into hierarchical JSON data with preserved formatting.

Usage:
    python main.py [options] [input_file]

Examples:
    # Read from stdin, output to stdout
    cat example.html | python main.py

    # Read from file, output to stdout
    python main.py input.html

    # Read from stdin, output to file
    python main.py -o output.json

    # Read from file, output to file
    python main.py input.html -o output.json

    # Pretty print JSON output
    python main.py --pretty input.html

    # Verbose mode with debug information
    python main.py --verbose input.html
"""

import sys
import argparse
import json
import logging

from content_extraction.common_std_io import read_input, write_output
from content_extraction.semantic_chunk_html import HTMLSectionParser
from .logging_config import setup_logging


logger = logging.getLogger(__name__)


def main():
    """Main CLI entry point."""
    parser = argparse.ArgumentParser(
        description='Extract structured content from HTML documents',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s input.html                # Parse file, output to stdout
  %(prog)s -o output.json input.html # Parse file, save to JSON
  cat input.html | %(prog)s          # Parse from stdin
  %(prog)s --pretty input.html       # Pretty-printed JSON output
  %(prog)s --verbose input.html      # Show debug information
        """,
    )

    parser.add_argument(
        'input_file',
        nargs='?',
        help='Input HTML file (if not provided, reads from stdin)',
    )

    parser.add_argument(
        '-o',
        '--output',
        metavar='FILE',
        help='Output JSON file (if not provided, writes to stdout)',
    )

    parser.add_argument(
        '--pretty',
        action='store_true',
        help='Pretty-print JSON output with indentation',
    )

    parser.add_argument(
        '-v',
        '--verbose',
        action='store_true',
        help='Show verbose output and debug information',
    )

    parser.add_argument('--version', action='version', version='%(prog)s 1.0.0')

    args = parser.parse_args()
    setup_logging(level=logging.DEBUG if args.verbose else logging.INFO)

    try:
        # Read input
        if args.input_file:
            logger.debug(f'Reading from file: {args.input_file}')
        else:
            logger.debug('Reading from stdin...')

        html_content = read_input(args.input_file)

        # Parse HTML
        parser = HTMLSectionParser()
        result = parser.parse_sections(html_content)

        # Write output, honoring --pretty
        write_output(json.dumps(result, indent=2 if args.pretty else None), args.output)

        logger.debug('Processing completed successfully')

    except KeyboardInterrupt:
        logger.warning('Operation cancelled by user')
        return 1
    except Exception:
        logger.error('An unexpected error occurred', exc_info=True)
        return 1


if __name__ == '__main__':
    sys.exit(main())
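The same flow is available without the CLI. A sketch; the exact shape of the returned section tree is defined in `semantic_chunk_html`, so the structure printed below is indicative rather than guaranteed:

```python
# Programmatic equivalent of `python main.py --pretty input.html`.
import json
from content_extraction.semantic_chunk_html import HTMLSectionParser

html = '<h1>Title</h1><p>Intro.</p><h2>Part 1</h2><p>Body.</p>'
sections = HTMLSectionParser().parse_sections(html)
print(json.dumps(sections, indent=2))  # hierarchical sections as JSON
```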