content-extraction 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- content_extraction/__init__.py +0 -0
- content_extraction/common_std_io.py +50 -0
- content_extraction/do_ocr.py +199 -0
- content_extraction/dspy_modules.py +24 -0
- content_extraction/extract_from_pptx.py +174 -0
- content_extraction/file_handlers.py +280 -0
- content_extraction/fix_ocr.py +245 -0
- content_extraction/logging_config.py +13 -0
- content_extraction/parse_html.py +117 -0
- content_extraction/semantic_chunk_html.py +164 -0
- content_extraction/split_and_create_digest.py +134 -0
- content_extraction-0.1.0.dist-info/METADATA +258 -0
- content_extraction-0.1.0.dist-info/RECORD +15 -0
- content_extraction-0.1.0.dist-info/WHEEL +5 -0
- content_extraction-0.1.0.dist-info/top_level.txt +1 -0
content_extraction/__init__.py: File without changes
content_extraction/common_std_io.py
@@ -0,0 +1,50 @@
+#!/usr/bin/env python3
+import sys
+import json
+import logging
+from typing import Iterable
+
+logger = logging.getLogger(__name__)
+
+
+def read_input(input_file: str | None = None) -> str:
+    """Read content from a file or stdin and return it as a string."""
+    try:
+        if input_file:
+            with open(input_file, 'r', encoding='utf-8') as f:
+                content = f.read()
+        else:
+            content = sys.stdin.read()
+    except Exception as e:
+        logger.error(f'Error reading input from {input_file or "stdin"}', exc_info=True)
+        raise RuntimeError(f'Error reading input: {e}')
+
+    return content
+
+
+def write_output(output: str, output_file: str | None = None):
+    try:
+        if output_file:
+            with open(output_file, 'w', encoding='utf-8') as f:
+                f.write(output)
+        else:
+            sys.stdout.write(output)
+    except IOError:
+        logger.error(f'Error writing to {output_file or "stdout"}', exc_info=True)
+        raise
+
+
+def write_stream_of_obj(obj_stream: Iterable[dict], output_file: str | None = None):
+    try:
+        if output_file:
+            with open(output_file, 'w', encoding='utf-8') as f:
+                for obj in obj_stream:
+                    f.write(json.dumps(obj))
+                    f.write('\n')
+        else:
+            for obj in obj_stream:
+                sys.stdout.write(json.dumps(obj))
+                sys.stdout.write('\n')
+    except IOError:
+        logger.error(f'Error writing stream to {output_file or "stdout"}', exc_info=True)
+        raise
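For orientation, a minimal usage sketch of these helpers (the chunks.jsonl path and the sample dicts are illustrative, not part of the package): write_stream_of_obj emits newline-delimited JSON, and read_input returns raw text that the caller parses.

import json

from content_extraction.common_std_io import read_input, write_stream_of_obj

# Stream an iterable of dicts to disk as newline-delimited JSON (one object per line).
chunks = ({'id': i, 'text': f'chunk {i}'} for i in range(3))
write_stream_of_obj(chunks, 'chunks.jsonl')

# read_input returns the raw text; parsing is left to the caller.
raw = read_input('chunks.jsonl')
objects = [json.loads(line) for line in raw.splitlines() if line]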
content_extraction/do_ocr.py
@@ -0,0 +1,199 @@
+import os
+import sys
+import json
+import logging
+import subprocess
+import base64
+import requests
+import argparse
+from urllib.parse import urlparse
+
+from content_extraction.logging_config import setup_logging
+
+
+logger = logging.getLogger(__name__)
+
+
+def set_env_vars():
+    x = {
+        'MODEL_ID': 'mistral-ocr-2505',
+        'PROJECT_ID': 'edu-course-companion',
+        'REGION': 'europe-west4',
+    }
+    for k, v in x.items():
+        os.environ[k] = v
+
+
+def authenticate_and_get_token() -> str | None:
+    process = subprocess.Popen('gcloud auth print-access-token', stdout=subprocess.PIPE, shell=True)
+    (access_token_bytes, err) = process.communicate()
+    if err:
+        logger.error(f'Error getting access token: {err.decode("utf-8")}')
+        return None
+    access_token = access_token_bytes.decode('utf-8').strip()
+    return access_token
+
+
+def build_url_to_model():
+    region = os.getenv('REGION')
+    project_id = os.getenv('PROJECT_ID')
+    model_id = os.getenv('MODEL_ID')
+
+    url = f'https://{region}-aiplatform.googleapis.com/v1/projects/{project_id}/locations/{region}/publishers/mistralai/models/{model_id}:rawPredict'
+    return url
+
+
+def file_to_base64_string(file_object):
+    """
+    Converts a Python file-like object to a base64 encoded string.
+
+    Args:
+        file_object: A file-like object opened in binary read mode (e.g., 'rb').
+
+    Returns:
+        A string containing the base64 encoded content of the file.
+    """
+    encoded_bytes = base64.b64encode(file_object.read())
+    encoded_string = encoded_bytes.decode('utf-8')
+    return encoded_string
+
+
+def build_data_url_from_file(filepath):
+    """Creates a data URL from a local file path."""
+    with open(filepath, 'rb') as file:
+        base64_pdf = file_to_base64_string(file)
+    # The API expects this specific format for data URLs
+    document_url = f'data:application/pdf;base64,{base64_pdf}'
+    return document_url
+
+
+def build_payload(document_url):
+    model_id = os.getenv('MODEL_ID')
+    payload = {
+        'model': model_id,
+        'document': {
+            'type': 'document_url',
+            'document_url': document_url,
+        },
+        'include_image_base64': True,  # Request image content
+    }
+    return payload
+
+
+def make_request(payload) -> dict | None:
+    logger.debug('[Authentication] started')
+    access_token = authenticate_and_get_token()
+    if access_token is None:
+        return None
+
+    logger.debug('[Authentication] successful')
+    headers = {
+        'Authorization': f'Bearer {access_token}',
+        'Accept': 'application/json',
+    }
+
+    url = build_url_to_model()
+    logger.debug(f'[Request] started using URL: "{url}"')
+
+    response = requests.post(url=url, headers=headers, json=payload)
+    if response.status_code == 200:
+        try:
+            response_dict = response.json()
+        except json.JSONDecodeError as e:
+            logger.error(f'[Request] Error decoding JSON: {e}', extra={'response': response.text})
+            return None
+    else:
+        logger.error(
+            f'Request failed with status code: {response.status_code}',
+            extra={'response': response.text},
+        )
+        return None
+    logger.debug(f'[Request] completed using URL: "{url}"')
+    return response_dict
+
+
+def save_response_to_disk(response_dict, output_dir):
+    logger.debug(f'[Saving to disk] started. Saving output to directory: {output_dir}.')
+    os.makedirs(output_dir, exist_ok=True)
+    for page in response_dict.get('pages', []):
+        logger.debug(f'[Saving to disk] started processing page {page["index"]}')
+        zfilled_index = str(page['index']).zfill(4)
+        page_filename = os.path.join(output_dir, f'page-{zfilled_index}.md')
+        with open(page_filename, 'w', encoding='utf-8') as f:
+            f.write(page['markdown'])
+        logger.debug(f'[Saving to disk] saved page "{page_filename}"')
+        logger.debug('[Saving to disk] started saving images')
+        for image in page.get('images', []):
+            logger.debug(f'[Saving to disk] started saving image {image["id"]}')
+            image_base64 = image['image_base64']
+            comma_index = image_base64.find(',')
+            if comma_index == -1:
+                logger.warning(f'Could not find comma in image_base64 for {image["id"]}, skipping.')
+                continue
+            encoded_image = image_base64[comma_index + 1:]
+            image_bytes = base64.b64decode(encoded_image)
+            # image id already has the extension
+            image_filename = os.path.join(output_dir, image['id'])
+            with open(image_filename, 'wb') as f:
+                f.write(image_bytes)
+            logger.debug(f'[Saving to disk] completed saving image {image["id"]}')
+        logger.debug(f'[Saving to disk] completed processing page {page["index"]}')
+    logger.debug('[Saving to disk] completed')
+
+
+def main():
+    """Main CLI entry point."""
+    setup_logging()
+    parser = argparse.ArgumentParser(
+        description='Extract text and images from a document using OCR.',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    parser.add_argument('input_source', help='Input file path or URL to a document.')
+    parser.add_argument(
+        '-o',
+        '--output',
+        metavar='DIRECTORY',
+        help='Output directory to save pages and images. Defaults to a directory named after the input file.',
+    )
+    args = parser.parse_args()
+
+    set_env_vars()
+
+    input_source = args.input_source
+    output_dir = args.output
+
+    # Determine default output directory if not provided
+    if not output_dir:
+        if input_source.startswith(('http://', 'https://')):
+            parsed_url = urlparse(input_source)
+            filename = os.path.basename(parsed_url.path)
+            output_dir = os.path.splitext(filename)[0] if filename else 'ocr_output'
+        else:
+            output_dir = os.path.splitext(os.path.basename(input_source))[0]
+
+    logger.info(f'[Processing: {input_source}] Started!')
+    logger.info(f'Output will be saved to: {output_dir}')
+
+    if input_source.startswith(('http://', 'https://')):
+        # If the input is a URL, pass it directly.
+        document_url = input_source
+    else:
+        # If the input is a local file, check for existence and create a data URL.
+        if not os.path.exists(input_source):
+            logger.error(f"Error: Input file not found at '{input_source}'")
+            return 1
+        document_url = build_data_url_from_file(input_source)
+
+    payload = build_payload(document_url)
+    response_dict = make_request(payload)
+
+    if not response_dict:
+        logger.error('Failed to get a valid response from the OCR service.')
+        return 1
+
+    save_response_to_disk(response_dict, output_dir)
+    logger.info(f'[Processing: {input_source}] Completed successfully!')
+
+
+if __name__ == '__main__':
+    sys.exit(main())
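The same functions can also be driven programmatically rather than through the CLI. A sketch under the assumption that gcloud credentials are available (make_request shells out to `gcloud auth print-access-token`); the file and directory names are illustrative:

from content_extraction.do_ocr import (
    set_env_vars,
    build_data_url_from_file,
    build_payload,
    make_request,
    save_response_to_disk,
)

set_env_vars()  # sets MODEL_ID, PROJECT_ID and REGION used to build the Vertex AI rawPredict URL

document_url = build_data_url_from_file('report.pdf')  # 'report.pdf' is an example path
payload = build_payload(document_url)
response = make_request(payload)  # returns None on authentication, HTTP, or JSON errors
if response is not None:
    save_response_to_disk(response, 'report')  # writes page-NNNN.md files plus any extracted images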
content_extraction/dspy_modules.py
@@ -0,0 +1,24 @@
+import dspy
+
+lm = dspy.LM("openai/gpt-4o-mini", temperature=0.3, max_tokens=5000)
+dspy.configure(lm=lm)
+
+
+class CorrectHeadingLevelSignature(dspy.Signature):
+    """Correct heading levels. Main title should be H1, Chapter Titles H2, etc."""
+
+    headings: str = dspy.InputField(
+        description=r"String of headings extracted via OCR process, separated by \n"
+    )
+    corrected_headings: str = dspy.OutputField(
+        description="Headings with corrected level"
+    )
+
+
+class CorrectHeadingLevel(dspy.Module):
+    def __init__(self):
+        super().__init__()
+        self.predictor = dspy.ChainOfThought(CorrectHeadingLevelSignature)
+
+    def forward(self, headings):
+        prediction = self.predictor(headings=headings)
+        return prediction
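A minimal sketch of invoking the module (assumes an OpenAI API key is available in the environment for the configured gpt-4o-mini LM; the sample headings are invented):

from content_extraction.dspy_modules import CorrectHeadingLevel

headings = '\n'.join([
    '## Introduction to Databases',       # main title mislabelled as H2 by OCR
    '# Chapter 1: The Relational Model',  # chapter title mislabelled as H1
    '### 1.1 Tables and Keys',
])

corrector = CorrectHeadingLevel()
prediction = corrector(headings=headings)  # dspy.Module dispatches __call__ to forward()
print(prediction.corrected_headings)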
content_extraction/extract_from_pptx.py
@@ -0,0 +1,174 @@
+import os
+import argparse
+import sys
+import logging
+from pptx import Presentation
+from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
+
+
+logger = logging.getLogger(__name__)
+
+
+def extract_content(pptx_path: str, output_dir: str) -> tuple[str, str] | tuple[None, None]:
+    """
+    Extracts text, tables, and images from a PPTX file and saves them
+    into an HTML file and an images directory.
+
+    Args:
+        pptx_path (str): The path to the input PowerPoint presentation.
+        output_dir (str): The directory where the output HTML and images will be saved.
+
+    Returns:
+        tuple[str, str] | tuple[None, None]: A tuple containing the path to the
+        output HTML file and the images directory, or (None, None) on failure.
+    """
+    logger.info(f'[Extracting content] started from {pptx_path} to {output_dir}')
+    images_dir = os.path.join(output_dir, 'images')
+    html_out_path = os.path.join(output_dir, 'index.html')
+    logger.debug(f'[Extracting content] Images directory: {images_dir}, HTML output path: {html_out_path}')
+
+    # Ensure output directories exist
+    os.makedirs(images_dir, exist_ok=True)
+
+    try:
+        prs = Presentation(pptx_path)
+    except Exception:
+        logger.error(f'[Extracting content] Could not open or parse {pptx_path}')
+        return None, None
+
+    html_lines = [
+        '<!DOCTYPE html>',
+        '<html lang="en">',
+        '<head>',
+        '    <meta charset="UTF-8">',
+        '    <title>Extracted PPTX Content</title>',
+        '</head>',
+        '<body>',
+    ]
+
+    image_counter = 0
+
+    logger.debug('[Extracting content] started processing slides')
+    for slide_idx, slide in enumerate(prs.slides, start=1):
+        logger.debug(f'[Extracting content] started processing slide {slide_idx}')
+        html_lines.append(f'  <section id="slide-{slide_idx}">')
+
+        # 1) Title (if any): check the slide's placeholders for a title shape
+        title_text = None
+        for shape in slide.shapes:
+            if shape.is_placeholder and shape.placeholder_format.type == PP_PLACEHOLDER.TITLE:
+                title_text = shape.text_frame.text.strip()
+                break
+        if title_text:
+            html_lines.append(f'    <h1>{title_text}</h1>')
+
+        # 2) Walk every shape
+        for shape in slide.shapes:
+            # -- TABLES --
+            if shape.shape_type == MSO_SHAPE_TYPE.TABLE:
+                html_lines.append('    <table border="1">')
+                table = shape.table
+                for row in table.rows:
+                    html_lines.append('      <tr>')
+                    for cell in row.cells:
+                        cell_txt = cell.text.replace('\n', '<br/>')
+                        html_lines.append(f'        <td>{cell_txt}</td>')
+                    html_lines.append('      </tr>')
+                html_lines.append('    </table>')
+
+            # -- IMAGES --
+            elif shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
+                image = shape.image
+                image_counter += 1
+                ext = image.ext  # e.g. 'png', 'jpeg'
+                img_name = f'slide{slide_idx}_img{image_counter}.{ext}'
+                img_path = os.path.join(images_dir, img_name)
+                with open(img_path, 'wb') as f:
+                    f.write(image.blob)
+                # Relative path for the src attribute
+                html_lines.append(f'    <img src="images/{img_name}" alt="Slide {slide_idx} image"/>')
+
+            # -- TEXT (including bullets) --
+            elif shape.has_text_frame:
+                # skip re-printing the title placeholder
+                if shape.is_placeholder and shape.placeholder_format.type in (
+                    PP_PLACEHOLDER.TITLE,
+                    PP_PLACEHOLDER.SUBTITLE,
+                ):
+                    continue
+
+                in_list = False
+                for para in shape.text_frame.paragraphs:
+                    text = ''.join(run.text for run in para.runs).strip()
+                    if not text:
+                        continue
+
+                    # any indent > 0 treat as a bullet
+                    if para.level > 0:
+                        if not in_list:
+                            html_lines.append('    <ul>')
+                            in_list = True
+                        html_lines.append(f'      <li>{text}</li>')
+                    else:
+                        if in_list:
+                            html_lines.append('    </ul>')
+                            in_list = False
+                        html_lines.append(f'    <p>{text}</p>')
+
+                if in_list:
+                    html_lines.append('    </ul>')
+
+        html_lines.append('  </section>')
+        logger.debug(f'[Extracting content] completed processing slide {slide_idx}')
+
+    logger.debug('[Extracting content] completed processing slides')
+    html_lines.extend(['</body>', '</html>'])
+
+    # Write out the final HTML file
+    logger.debug('[Extracting content] started saving HTML file')
+    with open(html_out_path, 'w', encoding='utf-8') as f:
+        f.write('\n'.join(html_lines))
+    logger.debug('[Extracting content] completed saving HTML file')
+
+    logger.info(f'[Extracting content] completed from {pptx_path}')
+    return html_out_path, images_dir
+
+
+def main():
+    """Main function to handle command line arguments and execute the script."""
+    parser = argparse.ArgumentParser(
+        description='Extract content from a PowerPoint (PPTX) file to HTML.',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  %(prog)s presentation.pptx                      # Outputs to 'output/' directory by default
+  %(prog)s presentation.pptx -o extracted_content # Outputs to 'extracted_content/' directory
+        """,
+    )
+    parser.add_argument('pptx_file', help='Path to the input PPTX file.')
+    parser.add_argument(
+        '-o',
+        '--output',
+        default='output',
+        help="Path to the output directory (if not provided, defaults to 'output').",
+    )
+    args = parser.parse_args()
+
+    if not os.path.exists(args.pptx_file):
+        logger.error(f'Input file not found at {args.pptx_file}')
+        return 1
+
+    html_out, images_out = extract_content(args.pptx_file, args.output)
+
+    if html_out and images_out:
+        logger.info(f'Successfully extracted content to {html_out} with images in {images_out}/')
+        return 0
+
+    logger.error(f'Extraction failed for {args.pptx_file}.')
+    return 1
+
+
+if __name__ == '__main__':
+    sys.exit(main())
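A short sketch of calling extract_content directly (the paths are illustrative); it returns the HTML path and the images directory, or (None, None) if the PPTX cannot be opened or parsed:

from content_extraction.extract_from_pptx import extract_content

html_path, images_dir = extract_content('slides.pptx', 'deck_out')
if html_path is None:
    raise SystemExit('PPTX could not be opened or parsed')
print(f'HTML written to {html_path}, images under {images_dir}/')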