PyPI - ocrcontext - Versions diffs - 0.1.0__py3-none-any.whl - Mend

ocrcontext 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34) hide show

ocrcontext/__init__.py +49 -0
ocrcontext/analyzer.py +198 -0
ocrcontext/config.py +49 -0
ocrcontext/engines/__init__.py +6 -0
ocrcontext/engines/base.py +45 -0
ocrcontext/engines/handwriting.py +103 -0
ocrcontext/engines/paddle.py +264 -0
ocrcontext/engines/pdf_text.py +126 -0
ocrcontext/engines/registry.py +67 -0
ocrcontext/engines/trocr.py +191 -0
ocrcontext/engines/vision.py +538 -0
ocrcontext/exceptions.py +45 -0
ocrcontext/llm/__init__.py +10 -0
ocrcontext/llm/drift.py +58 -0
ocrcontext/llm/extractor.py +63 -0
ocrcontext/llm/formatting.py +39 -0
ocrcontext/llm/literal_preserve.py +164 -0
ocrcontext/llm/prompts.py +157 -0
ocrcontext/llm/refiner.py +114 -0
ocrcontext/llm/schemas.py +99 -0
ocrcontext/pipeline.py +162 -0
ocrcontext/preprocessing/__init__.py +5 -0
ocrcontext/preprocessing/image.py +177 -0
ocrcontext/py.typed +0 -0
ocrcontext/quality.py +76 -0
ocrcontext/schemas.py +8 -0
ocrcontext/types.py +55 -0
ocrcontext/utils/__init__.py +1 -0
ocrcontext/utils/files.py +172 -0
ocrcontext/utils/lang.py +77 -0
ocrcontext-0.1.0.dist-info/METADATA +207 -0
ocrcontext-0.1.0.dist-info/RECORD +34 -0
ocrcontext-0.1.0.dist-info/WHEEL +4 -0
ocrcontext-0.1.0.dist-info/licenses/LICENSE +21 -0

ocrcontext/engines/paddle.py ADDED Viewed

@@ -0,0 +1,264 @@
+"""PaddleOCR engine for printed text and scanned documents.
+Ported from ocr-service/modal_app.py::OCRService — the lazy per-language model
+cache, multi-language *coverage-first* candidate selection, and the line-band
+recovery fallback are preserved exactly. The Modal/GPU plumbing is removed.
+"""
+from __future__ import annotations
+import os
+import sys
+from pathlib import Path
+from ..exceptions import EngineError, MissingDependencyError
+from ..preprocessing.image import preprocess_image_for_ocr, split_image_into_line_bands
+from ..utils.files import ascii_safe_dir, cleanup_paths, is_ascii
+from ..utils.lang import candidate_langs
+from .base import OcrEngine, PageOcr
+def _ensure_ascii_model_cache() -> None:
+    """Point PaddleX/HuggingFace model caches at an ASCII-safe path on Windows.
+    PaddlePaddle's C++ model loader cannot open files whose path contains
+    non-ASCII characters (e.g. a non-ASCII Windows username), failing with an
+    "attempting to parse an empty input" JSON error. Redirecting the cache to the
+    8.3 short path of the home directory aliases the very same files via ASCII.
+    Respects any cache env vars the user already set.
+    """
+    if sys.platform != "win32":
+        return
+    home = str(Path.home())
+    if is_ascii(home):
+        return
+    safe_home = ascii_safe_dir(home)
+    if not is_ascii(safe_home):
+        return  # no ASCII short path available; nothing we can safely do
+    os.environ.setdefault("PADDLE_PDX_CACHE_HOME", os.path.join(safe_home, ".paddlex"))
+    os.environ.setdefault("HF_HOME", os.path.join(safe_home, ".cache", "huggingface"))
+def _ensure_paddle_runtime_flags() -> None:
+    """Disable oneDNN/MKLDNN on CPU.
+    PaddlePaddle 3.x's new-IR (PIR) executor hits an unimplemented oneDNN op for
+    some PP-OCR models on CPU ("ConvertPirAttribute2RuntimeAttribute not
+    support"). Turning oneDNN off routes inference through the standard kernels.
+    Set as an env FLAG so it applies regardless of constructor support, and
+    respects any value the user already chose.
+    """
+    os.environ.setdefault("FLAGS_use_mkldnn", "0")
+def _extract_from_result(result):
+    """Normalize PaddleOCR / PaddleX result objects into (text, scores)."""
+    extracted_text = ""
+    extracted_scores: list[float] = []
+    if not result:
+        return extracted_text, extracted_scores
+    first_page_result = result[0] if isinstance(result, list) and len(result) > 0 else result
+    if hasattr(first_page_result, "keys") and "rec_texts" in first_page_result:
+        texts = first_page_result.get("rec_texts", [])
+        scores = first_page_result.get("rec_scores", [])
+        for i, text in enumerate(texts):
+            extracted_text += str(text) + "\n"
+            if i < len(scores):
+                extracted_scores.append(scores[i])
+    elif isinstance(first_page_result, list):
+        for line in first_page_result:
+            try:
+                if isinstance(line, (list, tuple)) and len(line) >= 2:
+                    if isinstance(line[1], (list, tuple)) and len(line[1]) >= 2:
+                        extracted_text += str(line[1][0]) + "\n"
+                        extracted_scores.append(line[1][1])
+            except Exception:
+                continue
+    return extracted_text, extracted_scores
+class PaddleEngine(OcrEngine):
+    """Lazy, per-language singleton-style PaddleOCR wrapper.
+    A single instance caches one PaddleOCR model per language code, so models are
+    loaded into memory at most once (resource-efficiency requirement).
+    """
+    text_source = "ocr"
+    def __init__(self) -> None:
+        self._ocr_by_lang: dict[str, object] = {}
+    def _get_ocr(self, paddle_lang: str):
+        """Lazy-load + cache a PaddleOCR model for a language (ported loader)."""
+        if paddle_lang in self._ocr_by_lang:
+            return self._ocr_by_lang[paddle_lang]
+        _ensure_ascii_model_cache()
+        _ensure_paddle_runtime_flags()
+        try:
+            from paddleocr import PaddleOCR
+        except ImportError as exc:  # pragma: no cover - exercised via install matrix
+            raise MissingDependencyError("paddleocr", "paddle") from exc
+        import logging
+        logging.getLogger("ppocr").setLevel(logging.ERROR)
+        requested = paddle_lang
+        ocr, errors = self._try_init(PaddleOCR, paddle_lang)
+        if ocr is None and paddle_lang != "en":
+            ocr, en_errors = self._try_init(PaddleOCR, "en")
+            errors.extend(en_errors)
+        if ocr is None:
+            detail = "; ".join(errors[-3:]) if errors else "no profiles attempted"
+            raise EngineError(
+                f"PaddleOCR could not be initialized for lang={paddle_lang!r}. "
+                f"Last errors: {detail}"
+            )
+        self._ocr_by_lang[requested] = ocr
+        return ocr
+    @staticmethod
+    def _try_init(PaddleOCR, lang: str):
+        """Try several constructor signatures (PaddleOCR 3.x and legacy 2.x).
+        Returns (engine_or_None, [error_strings]). The real exceptions are kept so
+        a total failure can be diagnosed instead of silently swallowed.
+        """
+        # PaddleOCR 3.x: disable the doc-orientation / unwarping sub-models (unneeded
+        # for plain OCR) and oneDNN (CPU PIR incompatibility). Each kwarg is tried and
+        # gracefully dropped if a given build rejects it.
+        base_3x = {
+            "lang": lang,
+            "use_doc_orientation_classify": False,
+            "use_doc_unwarping": False,
+            "use_textline_orientation": False,
+        }
+        profiles = [
+            {**base_3x, "enable_mkldnn": False},
+            base_3x,
+            {"lang": lang, "enable_mkldnn": False},
+            {"lang": lang},
+            # Legacy 2.x signature (use_angle_cls / show_log removed in 3.x).
+            {"use_angle_cls": True, "use_textline_orientation": True, "lang": lang,
+             "show_log": False},
+        ]
+        errors: list[str] = []
+        for kwargs in profiles:
+            try:
+                return PaddleOCR(**kwargs), errors
+            except Exception as exc:  # noqa: BLE001 - we record and try the next profile
+                errors.append(f"{type(exc).__name__}: {exc}")
+        return None, errors
+    @staticmethod
+    def _run_ocr(ocr_engine, path: str):
+        """Run recognition across PaddleOCR 2.x (.ocr) and 3.x (.predict)."""
+        predict = getattr(ocr_engine, "predict", None)
+        if callable(predict):
+            try:
+                return predict(path)
+            except Exception:
+                pass
+        return ocr_engine.ocr(path)
+    def recognize(
+        self,
+        img_path: str,
+        *,
+        lang: str = "en",
+        min_lines: int = 3,
+        handwriting: bool = False,
+    ) -> PageOcr:
+        langs = candidate_langs(lang)
+        preprocessed_paths: list[str] = []
+        try:
+            ocr_img_path = preprocess_image_for_ocr(img_path, handwriting=handwriting)
+            if ocr_img_path != img_path:
+                preprocessed_paths.append(ocr_img_path)
+            best_text = ""
+            best_scores: list[float] = []
+            best_line_count = 0
+            for lang_code in langs:
+                ocr_engine = self._get_ocr(lang_code)
+                result = self._run_ocr(ocr_engine, ocr_img_path)
+                candidate_text, candidate_scores = _extract_from_result(result)
+                candidate_line_count = len(
+                    [ln for ln in candidate_text.splitlines() if ln.strip()]
+                )
+                # Coverage-first selection (confidence ignored):
+                #   1) more non-empty lines wins
+                #   2) on a tie, longer non-whitespace text wins
+                if candidate_line_count > best_line_count or (
+                    candidate_line_count == best_line_count
+                    and len(candidate_text.strip()) > len(best_text.strip())
+                ):
+                    best_line_count = candidate_line_count
+                    best_text = candidate_text
+                    best_scores = candidate_scores
+            text = best_text
+            scores = list(best_scores)
+            # If full-image OCR still sees too few lines, run line-band fallback.
+            best_line_count = len([ln for ln in best_text.splitlines() if ln.strip()])
+            if best_line_count < min_lines:
+                text, scores = self._line_band_fallback(
+                    ocr_img_path, langs, base_text=best_text, base_scores=scores,
+                    best_line_count=best_line_count,
+                )
+            return PageOcr(text=text.strip(), scores=scores)
+        finally:
+            cleanup_paths(preprocessed_paths)
+    def _line_band_fallback(
+        self,
+        ocr_img_path: str,
+        langs: list[str],
+        *,
+        base_text: str,
+        base_scores: list[float],
+        best_line_count: int,
+    ) -> tuple[str, list[float]]:
+        band_paths = split_image_into_line_bands(ocr_img_path)
+        if not band_paths:
+            return base_text, base_scores
+        recovered_lines: list[str] = []
+        recovered_scores: list[float] = []
+        created_paths = [bp for _, bp in band_paths]
+        try:
+            for _, band_path in sorted(band_paths, key=lambda x: x[0]):
+                band_best_text = ""
+                band_best_len = 0
+                band_best_scores: list[float] = []
+                for lang_code in langs:
+                    ocr_engine = self._get_ocr(lang_code)
+                    result = self._run_ocr(ocr_engine, band_path)
+                    txt, sc = _extract_from_result(result)
+                    txt_len = len(txt.strip())
+                    if txt_len > band_best_len:
+                        band_best_len = txt_len
+                        band_best_text = txt
+                        band_best_scores = sc
+                line = " ".join(
+                    [p.strip() for p in band_best_text.splitlines() if p.strip()]
+                ).strip()
+                if line:
+                    recovered_lines.append(line)
+                    recovered_scores.extend(band_best_scores)
+        finally:
+            cleanup_paths(created_paths)
+        if len(recovered_lines) > best_line_count:
+            text = base_text.rstrip()
+            if text and not text.endswith("\n"):
+                text += "\n"
+            text += "\n".join(recovered_lines) + "\n"
+            return text, base_scores + recovered_scores
+        return base_text, base_scores

ocrcontext/engines/pdf_text.py ADDED Viewed

@@ -0,0 +1,126 @@
+"""Digital PDF text-layer extraction (no GPU / no OCR).
+Ported verbatim from ocr-service/modal_app.py. Used to skip OCR entirely when a
+PDF already carries an accurate text layer.
+"""
+from __future__ import annotations
+import re
+# PowerPoint / Google Slides PDFs often expose internal image names in the text layer.
+_PDF_IMAGE_ARTIFACT_RE = re.compile(
+    r"^[\w.\-]{1,120}\.(?:png|jpe?g|gif|webp|bmp|tiff?|svg)$",
+    re.IGNORECASE,
+)
+_PDF_KNOWN_ARTIFACTS = frozenset(
+    {
+        "preencoded.png",
+        "image.png",
+        "image1.png",
+        "image2.png",
+    }
+)
+def is_pdf_text_artifact(line: str) -> bool:
+    """Filter embedded image filenames leaked into PDF text extraction."""
+    s = (line or "").strip()
+    if not s:
+        return False
+    lower = s.lower()
+    if lower in _PDF_KNOWN_ARTIFACTS:
+        return True
+    if " " in s or "/" in s or "\\" in s:
+        return False
+    if _PDF_IMAGE_ARTIFACT_RE.match(s):
+        return True
+    return False
+def extract_pdf_text_preserve_layout(file_bytes: bytes) -> tuple[str, int]:
+    """Extract text from digital PDFs while preserving line order/layout."""
+    import fitz
+    pdf_document = fitz.open(stream=file_bytes, filetype="pdf")
+    page_count = len(pdf_document)
+    pages_output: list[str] = []
+    for page in pdf_document:
+        # Use block-level extraction to preserve paragraph breaks and reading order.
+        blocks = page.get_text("blocks")
+        if not blocks:
+            pages_output.append("")
+            continue
+        # block tuple: (x0, y0, x1, y1, text, block_no, block_type) - block_type 0=text, 1=image
+        text_blocks = [
+            b
+            for b in blocks
+            if len(b) >= 5
+            and (len(b) < 7 or b[6] == 0)
+            and isinstance(b[4], str)
+            and b[4].strip()
+        ]
+        text_blocks.sort(key=lambda b: (round(float(b[1]), 1), round(float(b[0]), 1)))
+        if not text_blocks:
+            pages_output.append("")
+            continue
+        page_lines: list[str] = []
+        prev_bottom = None
+        for block in text_blocks:
+            y0, y1 = float(block[1]), float(block[3])
+            block_text = block[4].replace("\r\n", "\n").replace("\r", "\n").strip()
+            if not block_text:
+                continue
+            # Insert paragraph gap if there is visible vertical space between blocks.
+            if prev_bottom is not None and (y0 - prev_bottom) > 8:
+                if page_lines and page_lines[-1] != "":
+                    page_lines.append("")
+            block_lines = [
+                ln.rstrip()
+                for ln in block_text.split("\n")
+                if ln.strip() and not is_pdf_text_artifact(ln)
+            ]
+            page_lines.extend(block_lines)
+            prev_bottom = y1
+        # Collapse accidental triple+ gaps while keeping intentional paragraph breaks.
+        compact_lines: list[str] = []
+        empty_streak = 0
+        for ln in page_lines:
+            if ln.strip() == "":
+                empty_streak += 1
+                if empty_streak <= 1:
+                    compact_lines.append("")
+            else:
+                empty_streak = 0
+                compact_lines.append(ln)
+        pages_output.append("\n".join(compact_lines).strip())
+    pdf_document.close()
+    full_text = ""
+    for idx, page_text in enumerate(pages_output):
+        if idx > 0:
+            full_text += f"\n\n--- Page {idx + 1} ---\n\n"
+        full_text += page_text
+    return full_text, page_count
+def has_sufficient_pdf_text(text: str) -> bool:
+    """True when the PDF text layer is rich enough to use instead of OCR."""
+    stripped = (text or "").strip()
+    if len(stripped) < 80:
+        return False
+    alnum_count = sum(ch.isalnum() for ch in stripped)
+    ratio = alnum_count / max(len(stripped), 1)
+    return ratio >= 0.25

ocrcontext/engines/registry.py ADDED Viewed

@@ -0,0 +1,67 @@
+"""Singleton registry for heavy OCR engines.
+PaddleOCR and TrOCR models are expensive to load. The registry guarantees each
+engine (and therefore each model) is instantiated at most once per process,
+satisfying the resource-efficiency requirement. Loading is lazy: an engine is
+only created the first time it is requested.
+Thread-safe so the same singleton is shared across threads (e.g. a web worker
+pool that wraps this library).
+"""
+from __future__ import annotations
+import threading
+from typing import TYPE_CHECKING
+if TYPE_CHECKING:
+    from .handwriting import HandwritingEngine
+    from .paddle import PaddleEngine
+class EngineRegistry:
+    """Process-wide lazy cache of OCR engines.
+    A default shared instance is exposed via :meth:`shared`, but callers may also
+    create isolated registries (useful for tests).
+    """
+    _shared: "EngineRegistry | None" = None
+    _shared_lock = threading.Lock()
+    def __init__(self) -> None:
+        self._lock = threading.Lock()
+        self._paddle: "PaddleEngine | None" = None
+        self._handwriting: "HandwritingEngine | None" = None
+    @classmethod
+    def shared(cls) -> "EngineRegistry":
+        if cls._shared is None:
+            with cls._shared_lock:
+                if cls._shared is None:
+                    cls._shared = cls()
+        return cls._shared
+    def paddle(self) -> "PaddleEngine":
+        if self._paddle is None:
+            with self._lock:
+                if self._paddle is None:
+                    from .paddle import PaddleEngine
+                    self._paddle = PaddleEngine()
+        return self._paddle
+    def handwriting(self) -> "HandwritingEngine":
+        if self._handwriting is None:
+            with self._lock:
+                if self._handwriting is None:
+                    from .handwriting import HandwritingEngine
+                    self._handwriting = HandwritingEngine()
+        return self._handwriting
+    def reset(self) -> None:
+        """Drop cached engines (frees model memory). Mainly for tests."""
+        with self._lock:
+            self._paddle = None
+            self._handwriting = None

ocrcontext/engines/trocr.py ADDED Viewed

@@ -0,0 +1,191 @@
+"""Microsoft TrOCR handwriting engine (line-by-line).
+Ported verbatim from ocr-service/handwriting_ocr.py. Used as the fallback when
+Google Vision is unavailable or returns nothing. ``transformers``/``torch`` are
+imported lazily (install the ``trocr`` extra).
+"""
+from __future__ import annotations
+from ..exceptions import MissingDependencyError
+from ..utils.files import cleanup_paths, new_temp_path
+# Default model id loaded by TrOCRHandwritingEngine.
+TROCR_MODEL_ID = "microsoft/trocr-base-handwritten"
+# Line crops shorter than this many pixels (before upscaling) are skipped.
+MIN_BAND_HEIGHT = 12
+# Passed as max_new_tokens to generate() for each recognized image.
+MAX_NEW_TOKENS = 128
+# Height in pixels that prepare_image_for_trocr scales every image to.
+TARGET_HEIGHT = 384
+# Width cap in pixels applied by prepare_image_for_trocr after height scaling.
+MAX_WIDTH = 1280
+def split_image_into_line_bands(img_path: str) -> list[tuple[int, str]]:
+    """Horizontal projection -> line crops for TrOCR (one line per image)."""
+    import cv2
+    bands: list[tuple[int, str]] = []
+    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
+    if img is None:
+        return bands
+    h, w = img.shape[:2]
+    if h < 80 or w < 80:
+        return bands
+    blur = cv2.GaussianBlur(img, (3, 3), 0)
+    bw = cv2.adaptiveThreshold(
+        blur, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 31, 12
+    )
+    import numpy as np
+    row_sum = np.sum(bw > 0, axis=1)
+    threshold = max(8, int(0.02 * w))
+    active_rows = row_sum > threshold
+    segments: list[tuple[int, int]] = []
+    start = None
+    for i, active in enumerate(active_rows):
+        if active and start is None:
+            start = i
+        elif not active and start is not None:
+            if i - start >= 10:
+                segments.append((start, i))
+            start = None
+    if start is not None and (len(active_rows) - start) >= 10:
+        segments.append((start, len(active_rows)))
+    if not segments:
+        return bands
+    merged: list[list[int]] = []
+    for s, e in segments:
+        if not merged:
+            merged.append([s, e])
+        elif s - merged[-1][1] <= 12:
+            merged[-1][1] = e
+        else:
+            merged.append([s, e])
+    for idx_band, (s, e) in enumerate(merged):
+        pad = 12
+        y0 = max(0, s - pad)
+        y1 = min(h, e + pad)
+        crop = img[y0:y1, :]
+        if crop.shape[0] < MIN_BAND_HEIGHT:
+            continue
+        upscaled = cv2.resize(crop, None, fx=2.5, fy=2.5, interpolation=cv2.INTER_CUBIC)
+        band_path = new_temp_path("png")
+        cv2.imwrite(band_path, upscaled)
+        bands.append((y0, band_path))
+    return bands
+def prepare_image_for_trocr(image):
+    """Resize to TrOCR-friendly dimensions (avoids ViT tensor errors on tiny/huge crops)."""
+    from PIL import Image
+    image = image.convert("RGB")
+    w, h = image.size
+    if h < 1 or w < 1:
+        return image
+    if h < 32 or w < 32:
+        scale = max(32 / w, 32 / h)
+        w, h = max(32, int(w * scale)), max(32, int(h * scale))
+        image = image.resize((w, h), Image.Resampling.LANCZOS)
+    if h != TARGET_HEIGHT:
+        new_w = max(32, int(w * (TARGET_HEIGHT / h)))
+        image = image.resize((new_w, TARGET_HEIGHT), Image.Resampling.LANCZOS)
+        w, h = image.size
+    if w > MAX_WIDTH:
+        image = image.resize((MAX_WIDTH, int(h * MAX_WIDTH / w)), Image.Resampling.LANCZOS)
+    return image
+class TrOCRHandwritingEngine:
+    def __init__(self, model_id: str = TROCR_MODEL_ID) -> None:
+        self.model_id = model_id
+        self._processor = None
+        self._model = None
+        self._device = None
+    def load(self) -> None:
+        try:
+            import torch
+            from transformers import TrOCRProcessor, VisionEncoderDecoderModel
+        except ImportError as exc:  # pragma: no cover - exercised via install matrix
+            raise MissingDependencyError("transformers", "trocr") from exc
+        self._device = "cuda" if torch.cuda.is_available() else "cpu"
+        self._processor = TrOCRProcessor.from_pretrained(self.model_id)
+        self._model = VisionEncoderDecoderModel.from_pretrained(self.model_id)
+        self._model.to(self._device)
+        self._model.eval()
+    def warmup_inference(self) -> None:
+        if self._processor is None or self._model is None:
+            return
+        from PIL import Image
+        dummy = Image.new("RGB", (384, 96), color=(255, 255, 255))
+        try:
+            _ = self.recognize_pil(dummy)
+        except Exception:
+            pass
+    def recognize_line_image_path(self, path: str) -> str:
+        from PIL import Image
+        image = Image.open(path).convert("RGB")
+        return self.recognize_pil(image)
+    def recognize_pil(self, image) -> str:
+        import torch
+        if self._processor is None or self._model is None:
+            raise RuntimeError("TrOCRHandwritingEngine.load() was not called")
+        image = prepare_image_for_trocr(image)
+        # Positional call matches HF docs; avoids kwarg edge cases in older processors.
+        pixel_values = self._processor(image, return_tensors="pt").pixel_values
+        pixel_values = pixel_values.to(self._device)
+        with torch.no_grad():
+            generated_ids = self._model.generate(
+                pixel_values,
+                max_new_tokens=MAX_NEW_TOKENS,
+                num_beams=4,
+                early_stopping=True,
+            )
+        text = self._processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+        return (text or "").strip()
+def run_trocr_on_page(engine: TrOCRHandwritingEngine, img_path: str) -> tuple[str, float]:
+    """OCR one page image with TrOCR line bands. Returns (text, pseudo_confidence 0..1)."""
+    bands = split_image_into_line_bands(img_path)
+    created: list[str] = []
+    lines: list[str] = []
+    try:
+        if not bands:
+            text = engine.recognize_line_image_path(img_path)
+            if text:
+                lines.append(text)
+        else:
+            for _, band_path in sorted(bands, key=lambda x: x[0]):
+                created.append(band_path)
+                line = engine.recognize_line_image_path(band_path)
+                line = " ".join(line.split())
+                if line:
+                    lines.append(line)
+    finally:
+        cleanup_paths(created)
+    full = "\n".join(lines).strip()
+    conf = min(1.0, len(full) / 200.0) if full else 0.0
+    return full, conf