PyPI - ocrcontext - Versions diffs - 0.1.0__py3-none-any.whl - Mend

ocrcontext 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34) hide show

ocrcontext/__init__.py +49 -0
ocrcontext/analyzer.py +198 -0
ocrcontext/config.py +49 -0
ocrcontext/engines/__init__.py +6 -0
ocrcontext/engines/base.py +45 -0
ocrcontext/engines/handwriting.py +103 -0
ocrcontext/engines/paddle.py +264 -0
ocrcontext/engines/pdf_text.py +126 -0
ocrcontext/engines/registry.py +67 -0
ocrcontext/engines/trocr.py +191 -0
ocrcontext/engines/vision.py +538 -0
ocrcontext/exceptions.py +45 -0
ocrcontext/llm/__init__.py +10 -0
ocrcontext/llm/drift.py +58 -0
ocrcontext/llm/extractor.py +63 -0
ocrcontext/llm/formatting.py +39 -0
ocrcontext/llm/literal_preserve.py +164 -0
ocrcontext/llm/prompts.py +157 -0
ocrcontext/llm/refiner.py +114 -0
ocrcontext/llm/schemas.py +99 -0
ocrcontext/pipeline.py +162 -0
ocrcontext/preprocessing/__init__.py +5 -0
ocrcontext/preprocessing/image.py +177 -0
ocrcontext/py.typed +0 -0
ocrcontext/quality.py +76 -0
ocrcontext/schemas.py +8 -0
ocrcontext/types.py +55 -0
ocrcontext/utils/__init__.py +1 -0
ocrcontext/utils/files.py +172 -0
ocrcontext/utils/lang.py +77 -0
ocrcontext-0.1.0.dist-info/METADATA +207 -0
ocrcontext-0.1.0.dist-info/RECORD +34 -0
ocrcontext-0.1.0.dist-info/WHEEL +4 -0
ocrcontext-0.1.0.dist-info/licenses/LICENSE +21 -0

ocrcontext/__init__.py ADDED Viewed

@@ -0,0 +1,49 @@
+"""ocrcontext — decoupled, LLM-agnostic document OCR + structured extraction.
+Quick start::
+    from ocrcontext import Analyzer
+    result = Analyzer().analyze("invoice.pdf")
+    print(result.text)
+With an injected LangChain model::
+    from langchain_openai import ChatOpenAI
+    from ocrcontext import Analyzer
+    from ocrcontext.schemas import Invoice
+    analyzer = Analyzer(llm=ChatOpenAI(model="gpt-4o"))
+    invoice = analyzer.extract("invoice.pdf", schema=Invoice)
+"""
+from __future__ import annotations
+from .analyzer import Analyzer
+from .config import AnalyzerConfig
+from .engines.registry import EngineRegistry
+from .exceptions import (
+    EngineError,
+    LLMNotConfiguredError,
+    MissingDependencyError,
+    NoTextDetectedError,
+    OcrContextError,
+    UnsupportedFileError,
+)
+from .types import OcrResult, RefinementMode
+# Package version string; the wheel this diff was taken from is ocrcontext-0.1.0.
+__version__ = "0.1.0"
+# Public names of the package: the Analyzer facade and its config, the engine
+# registry, the result/mode types, the exception classes imported above, and
+# the version string.
+__all__ = [
+    "Analyzer",
+    "AnalyzerConfig",
+    "EngineRegistry",
+    "OcrResult",
+    "RefinementMode",
+    "OcrContextError",
+    "MissingDependencyError",
+    "UnsupportedFileError",
+    "NoTextDetectedError",
+    "LLMNotConfiguredError",
+    "EngineError",
+    "__version__",
+]

ocrcontext/analyzer.py ADDED Viewed

@@ -0,0 +1,198 @@
+"""The public facade: instantiate, pass a document, get text or a Pydantic model.
+    from ocrcontext import Analyzer
+    result = Analyzer().analyze("invoice.pdf")
+    print(result.text)
+"""
+from __future__ import annotations
+from typing import TYPE_CHECKING, Optional, TypeVar
+from pydantic import BaseModel
+from .config import AnalyzerConfig
+from .engines.registry import EngineRegistry
+from .exceptions import LLMNotConfiguredError
+from .pipeline import Pipeline
+from .quality import handwriting_refinement_mode
+from .types import OcrResult, RefinementMode
+from .utils.files import Source
+if TYPE_CHECKING:
+    from langchain_core.language_models import BaseChatModel
+    from .llm.extractor import StructuredExtractor
+    from .llm.refiner import Refiner
+# Type variable bound to pydantic ``BaseModel`` so ``extract``/``extract_text``
+# are typed as returning an instance of the schema class that was passed in.
+TSchema = TypeVar("TSchema", bound=BaseModel)
+# ``OcrResult.text_source`` labels that ``Analyzer._infer_mode`` treats as handwriting.
+_HANDWRITING_SOURCES = {"vision_handwriting", "trocr_handwriting", "handwriting_ocr"}
+class Analyzer:
+    """High-level document analyzer.
+    Parameters
+    ----------
+    llm:
+        Optional LangChain ``BaseChatModel``. Required only for ``refine``/``extract``.
+        Bring your own provider (``langchain_openai.ChatOpenAI`` etc.).
+    lang:
+        Default document language code (e.g. ``"en"``, ``"tr"``).
+    config:
+        Advanced pipeline tuning. Overrides ``lang`` if both are set.
+    registry:
+        Shared engine registry (singleton model cache). Defaults to a process-wide
+        shared instance so PaddleOCR/TrOCR load at most once.
+    """
+    def __init__(
+        self,
+        llm: "Optional[BaseChatModel]" = None,
+        *,
+        lang: str = "en",
+        config: Optional[AnalyzerConfig] = None,
+        registry: Optional[EngineRegistry] = None,
+    ) -> None:
+        self._llm = llm
+        # ``lang`` only seeds the default config; an explicit ``config`` wins.
+        self.config = config if config is not None else AnalyzerConfig(lang=lang)
+        # Test against None rather than truthiness so an explicitly supplied
+        # registry is never silently swapped for the shared one.
+        self.registry = registry if registry is not None else EngineRegistry.shared()
+        self._pipeline = Pipeline(registry=self.registry, config=self.config)
+        # LLM helpers are built lazily (see ``_get_refiner``/``_get_extractor``).
+        self._refiner: "Refiner | None" = None
+        self._extractor: "StructuredExtractor | None" = None
+    # --- Public API ----------------------------------------------------------
+    def analyze(
+        self,
+        source: Source,
+        *,
+        handwriting: bool = False,
+        refine: Optional[bool] = None,
+        lang: Optional[str] = None,
+        mode: Optional[RefinementMode] = None,
+        filename: Optional[str] = None,
+    ) -> OcrResult:
+        """OCR a document (PDF/image) and optionally LLM-refine the text.
+        ``refine=None`` (default) refines only when an LLM is configured and the
+        text did not come from an exact digital PDF text layer.
+        ``mode`` selects the refinement mode; when omitted it is inferred from
+        the result's ``text_source``. When refinement changes the text, the
+        original OCR output is kept in ``result.raw_text`` and ``result.refined``
+        is set to ``True``.
+        Raises
+        ------
+        LLMNotConfiguredError
+            If ``refine=True`` but no LLM was supplied. Raised before OCR runs.
+        """
+        # Fail fast: otherwise the whole OCR pipeline would run first and
+        # ``_should_refine`` would then raise this very same error.
+        if refine is True and self._llm is None:
+            raise LLMNotConfiguredError("refine=True")
+        result = self._pipeline.run(
+            source, lang=lang, handwriting=handwriting, filename=filename
+        )
+        if self._should_refine(result, refine):
+            # ``is not None`` instead of ``or`` so an explicitly passed mode is
+            # always honoured, whatever its truth value.
+            chosen_mode = mode if mode is not None else self._infer_mode(result)
+            refined = self.refine(
+                result.text, language=lang or self.config.lang, mode=chosen_mode
+            )
+            # Only mark the result as refined when the LLM actually changed it.
+            if refined != result.text:
+                result.raw_text = result.text
+                result.text = refined
+                result.refined = True
+        return result
+    def extract(
+        self,
+        source: Source,
+        schema: type[TSchema],
+        *,
+        handwriting: bool = False,
+        refine: bool = False,
+        lang: Optional[str] = None,
+        system_prompt: Optional[str] = None,
+        filename: Optional[str] = None,
+    ) -> TSchema:
+        """OCR a document and extract a structured Pydantic model from it.
+        Refinement is OFF by default for extraction (the LLM extractor reads raw
+        OCR text directly, mirroring the original invoice pipeline).
+        Raises
+        ------
+        LLMNotConfiguredError
+            If no LLM was supplied. Raised before OCR runs.
+        """
+        # Extraction always needs an LLM, so check before paying for OCR. The
+        # message matches whichever check would have fired first previously.
+        if self._llm is None:
+            raise LLMNotConfiguredError(
+                "refine=True" if refine is True else "Structured extraction"
+            )
+        result = self.analyze(
+            source,
+            handwriting=handwriting,
+            refine=refine,
+            lang=lang,
+            filename=filename,
+        )
+        return self.extract_text(
+            result.text,
+            schema,
+            language=lang or self.config.lang,
+            system_prompt=system_prompt,
+        )
+    def extract_text(
+        self,
+        text: str,
+        schema: type[TSchema],
+        *,
+        language: Optional[str] = None,
+        system_prompt: Optional[str] = None,
+    ) -> TSchema:
+        """Extract a structured Pydantic model from already-OCR'd text.
+        Useful when you already have text (e.g. from a prior ``analyze`` call) and
+        want to avoid re-running OCR. Requires a configured LLM; otherwise
+        ``LLMNotConfiguredError`` is raised. ``language`` falls back to the
+        configured default language.
+        """
+        return self._get_extractor().extract(
+            text,
+            schema,
+            language=language or self.config.lang,
+            system_prompt=system_prompt,
+        )
+    def refine(
+        self,
+        text: str,
+        *,
+        language: Optional[str] = None,
+        mode: RefinementMode = RefinementMode.CONSERVATIVE,
+    ) -> str:
+        """Refine arbitrary OCR text directly (requires a configured LLM).
+        ``language`` falls back to the configured default language. Raises
+        ``LLMNotConfiguredError`` when no LLM was supplied.
+        """
+        return self._get_refiner().refine(
+            text, language=language or self.config.lang, mode=mode
+        )
+    # --- Internals -----------------------------------------------------------
+    def _should_refine(self, result: OcrResult, refine: Optional[bool]) -> bool:
+        """Decide whether ``analyze`` should run LLM refinement on ``result``."""
+        if refine is False:
+            return False
+        if refine is True:
+            # Kept as a safety net; ``analyze`` already checks this before OCR.
+            if self._llm is None:
+                raise LLMNotConfiguredError("refine=True")
+            return True
+        # refine is None -> auto
+        if self._llm is None or not self.config.refine_by_default:
+            return False
+        # Never "correct" an exact digital PDF text layer.
+        return result.text_source != "pdf_text_layer"
+    def _infer_mode(self, result: OcrResult) -> RefinementMode:
+        """Pick a refinement mode from the result's ``text_source`` label."""
+        if result.text_source in _HANDWRITING_SOURCES:
+            return handwriting_refinement_mode(result.text, result.has_dikw_structure)
+        # Reachable only when the caller forces ``refine=True`` on a text-layer PDF.
+        if result.text_source == "pdf_text_layer":
+            return RefinementMode.LAYOUT
+        return RefinementMode.CONSERVATIVE
+    def _get_refiner(self) -> "Refiner":
+        """Return the cached ``Refiner``, creating it on first use."""
+        if self._llm is None:
+            raise LLMNotConfiguredError("Refinement")
+        if self._refiner is None:
+            # Imported lazily so the LLM modules load only when actually used.
+            from .llm.refiner import Refiner
+            self._refiner = Refiner(self._llm)
+        return self._refiner
+    def _get_extractor(self) -> "StructuredExtractor":
+        """Return the cached ``StructuredExtractor``, creating it on first use."""
+        if self._llm is None:
+            raise LLMNotConfiguredError("Structured extraction")
+        if self._extractor is None:
+            # Imported lazily so the LLM modules load only when actually used.
+            from .llm.extractor import StructuredExtractor
+            self._extractor = StructuredExtractor(self._llm)
+        return self._extractor

ocrcontext/config.py ADDED Viewed

@@ -0,0 +1,49 @@
+"""Configuration for the analyzer / pipeline.
+All knobs mirror constants from the original Modal service so OCR behaviour is
+identical after decoupling.
+"""
+from __future__ import annotations
+from dataclasses import dataclass
+# PDF rasterization scale when falling back to image OCR (handwriting needs finer detail).
+# These module constants are the defaults for the matching AnalyzerConfig fields.
+OCR_PDF_RENDER_SCALE = 2.75
+OCR_PDF_RENDER_SCALE_HANDWRITING = 3.5
+# Minimum expected non-empty lines per page before the line-band fallback kicks in.
+MIN_EXPECTED_LINES_PER_PAGE = 3
+MIN_EXPECTED_LINES_HANDWRITING = 1
+@dataclass
+class AnalyzerConfig:
+    """Tunable settings for an :class:`~ocrcontext.analyzer.Analyzer`.
+    Defaults reproduce the production pipeline's behaviour.
+    Every field has a default, so ``AnalyzerConfig()`` is a valid configuration.
+    """
+    # Default document language (UI-style code, e.g. "en", "tr"). Mapped to a
+    # PaddleOCR model via ocrcontext.utils.lang.normalize_paddle_lang.
+    lang: str = "en"
+    # Prefer a digital PDF's embedded text layer over OCR when it is sufficient.
+    prefer_pdf_text_layer: bool = True
+    # PDF rasterization scales (printed text / handwriting).
+    pdf_render_scale: float = OCR_PDF_RENDER_SCALE
+    pdf_render_scale_handwriting: float = OCR_PDF_RENDER_SCALE_HANDWRITING
+    # Line-band fallback thresholds (printed text / handwriting).
+    min_lines_per_page: int = MIN_EXPECTED_LINES_PER_PAGE
+    min_lines_handwriting: int = MIN_EXPECTED_LINES_HANDWRITING
+    # When True, automatically retry with the handwriting engine if printed OCR
+    # returns insufficient text (mirrors the documents/process retry ladder).
+    auto_handwriting_fallback: bool = True
+    # Default refinement behaviour when Analyzer.analyze(refine=None):
+    #   - refine when an LLM is configured AND the text did not come from an exact
+    #     digital PDF text layer (which must not be "corrected").
+    # Setting this to False turns automatic refinement off; an explicit
+    # refine=True passed to analyze() still refines.
+    refine_by_default: bool = True

ocrcontext/engines/__init__.py ADDED Viewed

@@ -0,0 +1,6 @@
+"""OCR engines and the singleton model registry."""
+from .base import OcrEngine, PageOcr
+from .registry import EngineRegistry
+# Public names of the engines package: the engine abstractions and the registry.
+__all__ = ["OcrEngine", "PageOcr", "EngineRegistry"]

ocrcontext/engines/base.py ADDED Viewed

@@ -0,0 +1,45 @@
+"""Engine abstractions shared by all OCR backends."""
+from __future__ import annotations
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+@dataclass
+class PageOcr:
+    """OCR output for one page image: the recognized text plus optional metadata."""
+    text: str
+    scores: list[float] = field(default_factory=list)
+    # True when the handwriting engine judged the page to be a DIKW/pyramid diagram.
+    has_dikw_structure: bool = False
+    # Label of the engine that produced the text (e.g. "vision_handwriting",
+    # "trocr_handwriting"); None when the engine did not report one.
+    text_source: str | None = None
+    @property
+    def line_count(self) -> int:
+        """Number of lines in ``text`` that contain at least one non-whitespace character."""
+        return sum(1 for line in self.text.splitlines() if line.strip())
+class OcrEngine(ABC):
+    """Recognize text from a single page image on disk.
+    Engines are responsible for their own preprocessing and for cleaning up any
+    temporary files they create.
+    Subclasses must implement :meth:`recognize` and may override ``text_source``.
+    """
+    #: Default text_source label reported in OcrResult when this engine is used.
+    text_source: str = "ocr"
+    @abstractmethod
+    def recognize(
+        self,
+        img_path: str,
+        *,
+        lang: str = "en",
+        min_lines: int = 1,
+        handwriting: bool = False,
+    ) -> PageOcr:
+        """Recognize a single page image and return its text + per-token scores.
+        ``img_path`` is the path of the page image on disk and ``lang`` the
+        document language code. ``min_lines`` and ``handwriting`` are hints whose
+        exact use is engine-specific (NOTE(review): ``min_lines`` presumably is the
+        line-count threshold for the line-band fallback; confirm per engine).
+        """
+        raise NotImplementedError

ocrcontext/engines/handwriting.py ADDED Viewed

@@ -0,0 +1,103 @@
+"""Composite handwriting engine: Google Vision primary, TrOCR fallback.
+Mirrors ocr-service/modal_app.py::HandwritingOCRService per-page logic without
+the Modal class wrapper. Each sub-engine is loaded lazily on first use.
+"""
+from __future__ import annotations
+from ..preprocessing.image import preprocess_image_for_ocr
+from ..utils.files import cleanup_paths
+from .base import OcrEngine, PageOcr
+from .trocr import TrOCRHandwritingEngine, run_trocr_on_page
+from .vision import GoogleVisionHandwritingEngine, detect_dikw_structure, run_vision_on_page
+class HandwritingEngine(OcrEngine):
+    """Vision-first handwriting recognition with a TrOCR fallback per page.
+    Both sub-engines are created lazily on first use and cached on the instance.
+    """
+    text_source = "handwriting_ocr"
+    def __init__(self) -> None:
+        # Sub-engines stay unloaded until a page actually needs them.
+        self._vision: GoogleVisionHandwritingEngine | None = None
+        self._trocr: TrOCRHandwritingEngine | None = None
+    def _ensure_vision(self) -> GoogleVisionHandwritingEngine:
+        """Create, load and cache the Google Vision engine on first use."""
+        if self._vision is None:
+            engine = GoogleVisionHandwritingEngine()
+            engine.load()  # no-op disable if creds missing; raises only if pkg absent
+            # Cache only after load() succeeded so a failed load is retried next time.
+            self._vision = engine
+        return self._vision
+    def _ensure_trocr(self) -> TrOCRHandwritingEngine:
+        """Create, load, warm up and cache the TrOCR engine on first use."""
+        if self._trocr is None:
+            engine = TrOCRHandwritingEngine()
+            engine.load()
+            engine.warmup_inference()
+            self._trocr = engine
+        return self._trocr
+    def recognize(
+        self,
+        img_path: str,
+        *,
+        lang: str = "en",
+        min_lines: int = 1,
+        handwriting: bool = True,
+    ) -> PageOcr:
+        """Recognize one handwritten page, trying Vision first and TrOCR second.
+        The image is always preprocessed in handwriting mode; ``min_lines`` and
+        ``handwriting`` are accepted for interface compatibility and not used here.
+        Vision failures are swallowed on purpose (best effort) and trigger the
+        TrOCR fallback; TrOCR errors propagate to the caller. Temporary files
+        created by preprocessing are removed before returning.
+        Returns a :class:`PageOcr` whose ``text_source`` is ``"vision_handwriting"``,
+        ``"trocr_handwriting"`` or ``"handwriting_ocr"`` (neither engine produced text).
+        """
+        preprocessed: list[str] = []
+        try:
+            ocr_img_path = preprocess_image_for_ocr(img_path, handwriting=True)
+            # Track the file for cleanup only if preprocessing wrote a new one.
+            if ocr_img_path != img_path:
+                preprocessed.append(ocr_img_path)
+            page_text = ""
+            page_conf = 0.0
+            used_vision = False
+            used_trocr = False
+            has_dikw = False
+            # Vision is optional: load() leaves it disabled when no credentials exist.
+            try:
+                vision = self._ensure_vision()
+            except Exception:
+                vision = None
+            if vision is not None and vision.enabled:
+                try:
+                    page_text, page_conf = run_vision_on_page(
+                        vision, ocr_img_path, ocr_lang=lang
+                    )
+                    if page_text and page_text.strip():
+                        used_vision = True
+                        if vision.last_has_dikw_structure:
+                            has_dikw = True
+                    else:
+                        # Whitespace-only output is no text at all: clear it so
+                        # the TrOCR fallback below still runs.
+                        page_text = ""
+                except Exception:
+                    # Best effort: discard everything Vision produced, including
+                    # flags set before the failure, and fall through to TrOCR.
+                    page_text = ""
+                    used_vision = False
+                    has_dikw = False
+            if not page_text:
+                trocr = self._ensure_trocr()
+                page_text, page_conf = run_trocr_on_page(trocr, ocr_img_path)
+                if page_text:
+                    used_trocr = True
+            if used_trocr:
+                text_source = "trocr_handwriting"
+            elif used_vision:
+                text_source = "vision_handwriting"
+            else:
+                text_source = "handwriting_ocr"
+            # Vision may already have flagged the diagram; otherwise check the text.
+            if not has_dikw and page_text.strip():
+                has_dikw = detect_dikw_structure(page_text)
+            # A non-positive confidence is reported as "no score".
+            scores = [page_conf] if page_conf > 0 else []
+            return PageOcr(
+                text=page_text.strip(),
+                scores=scores,
+                has_dikw_structure=has_dikw,
+                text_source=text_source,
+            )
+        finally:
+            cleanup_paths(preprocessed)