ocrcontext 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ocrcontext/__init__.py ADDED
@@ -0,0 +1,49 @@
1
+ """ocrcontext — decoupled, LLM-agnostic document OCR + structured extraction.
2
+
3
+ Quick start::
4
+
5
+ from ocrcontext import Analyzer
6
+ result = Analyzer().analyze("invoice.pdf")
7
+ print(result.text)
8
+
9
+ With an injected LangChain model::
10
+
11
+ from langchain_openai import ChatOpenAI
12
+ from ocrcontext import Analyzer
13
+ from ocrcontext.schemas import Invoice
14
+
15
+ analyzer = Analyzer(llm=ChatOpenAI(model="gpt-4o"))
16
+ invoice = analyzer.extract("invoice.pdf", schema=Invoice)
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ from .analyzer import Analyzer
22
+ from .config import AnalyzerConfig
23
+ from .engines.registry import EngineRegistry
24
+ from .exceptions import (
25
+ EngineError,
26
+ LLMNotConfiguredError,
27
+ MissingDependencyError,
28
+ NoTextDetectedError,
29
+ OcrContextError,
30
+ UnsupportedFileError,
31
+ )
32
+ from .types import OcrResult, RefinementMode
33
+
34
+ __version__ = "0.1.0"
35
+
36
+ __all__ = [
37
+ "Analyzer",
38
+ "AnalyzerConfig",
39
+ "EngineRegistry",
40
+ "OcrResult",
41
+ "RefinementMode",
42
+ "OcrContextError",
43
+ "MissingDependencyError",
44
+ "UnsupportedFileError",
45
+ "NoTextDetectedError",
46
+ "LLMNotConfiguredError",
47
+ "EngineError",
48
+ "__version__",
49
+ ]
ocrcontext/analyzer.py ADDED
@@ -0,0 +1,198 @@
1
+ """The public facade: instantiate, pass a document, get text or a Pydantic model.
2
+
3
+ from ocrcontext import Analyzer
4
+ result = Analyzer().analyze("invoice.pdf")
5
+ print(result.text)
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from typing import TYPE_CHECKING, Optional, TypeVar
11
+
12
+ from pydantic import BaseModel
13
+
14
+ from .config import AnalyzerConfig
15
+ from .engines.registry import EngineRegistry
16
+ from .exceptions import LLMNotConfiguredError
17
+ from .pipeline import Pipeline
18
+ from .quality import handwriting_refinement_mode
19
+ from .types import OcrResult, RefinementMode
20
+ from .utils.files import Source
21
+
22
+ if TYPE_CHECKING:
23
+ from langchain_core.language_models import BaseChatModel
24
+
25
+ from .llm.extractor import StructuredExtractor
26
+ from .llm.refiner import Refiner
27
+
28
+ TSchema = TypeVar("TSchema", bound=BaseModel)
29
+
30
+ _HANDWRITING_SOURCES = {"vision_handwriting", "trocr_handwriting", "handwriting_ocr"}
31
+
32
+
class Analyzer:
    """High-level document analyzer: OCR a document, optionally refine or extract.

    Parameters
    ----------
    llm:
        Optional LangChain ``BaseChatModel``. Required only for ``refine``/``extract``.
        Bring your own provider (``langchain_openai.ChatOpenAI`` etc.).
    lang:
        Default document language code (e.g. ``"en"``, ``"tr"``). Ignored when
        ``config`` is supplied.
    config:
        Advanced pipeline tuning. Overrides ``lang`` if both are set.
    registry:
        Engine registry to use. Defaults to ``EngineRegistry.shared()``.
    """

    def __init__(
        self,
        llm: "Optional[BaseChatModel]" = None,
        *,
        lang: str = "en",
        config: Optional[AnalyzerConfig] = None,
        registry: Optional[EngineRegistry] = None,
    ) -> None:
        self._llm = llm
        # Compare against None explicitly: a caller-supplied object that happens
        # to evaluate falsy must not be silently swapped for the default.
        self.config = config if config is not None else AnalyzerConfig(lang=lang)
        self.registry = registry if registry is not None else EngineRegistry.shared()
        self._pipeline = Pipeline(registry=self.registry, config=self.config)
        # LLM helpers are built lazily on first use (see _get_refiner/_get_extractor).
        self._refiner: "Refiner | None" = None
        self._extractor: "StructuredExtractor | None" = None

    # --- Public API ----------------------------------------------------------

    def analyze(
        self,
        source: Source,
        *,
        handwriting: bool = False,
        refine: Optional[bool] = None,
        lang: Optional[str] = None,
        mode: Optional[RefinementMode] = None,
        filename: Optional[str] = None,
    ) -> OcrResult:
        """OCR a document (PDF/image) and optionally LLM-refine the text.

        ``refine=None`` (default) refines only when an LLM is configured,
        ``config.refine_by_default`` is set, and the text did not come from an
        exact digital PDF text layer. ``refine=True`` forces refinement and
        ``refine=False`` disables it.

        When refinement changes the text, the pre-refinement text is kept in
        ``result.raw_text`` and ``result.refined`` is set to ``True``.

        Raises
        ------
        LLMNotConfiguredError
            If ``refine`` is truthy but no LLM was supplied. Raised *before*
            OCR runs so the pipeline is not executed for a request that cannot
            be completed.
        """
        if refine and self._llm is None:
            raise LLMNotConfiguredError("refine=True")

        result = self._pipeline.run(
            source, lang=lang, handwriting=handwriting, filename=filename
        )

        if self._should_refine(result, refine):
            chosen_mode = mode or self._infer_mode(result)
            refined = self.refine(
                result.text, language=lang or self.config.lang, mode=chosen_mode
            )
            # Only record a refinement when the LLM actually changed something.
            if refined != result.text:
                result.raw_text = result.text
                result.text = refined
                result.refined = True

        return result

    def extract(
        self,
        source: Source,
        schema: type[TSchema],
        *,
        handwriting: bool = False,
        refine: bool = False,
        lang: Optional[str] = None,
        system_prompt: Optional[str] = None,
        filename: Optional[str] = None,
    ) -> TSchema:
        """OCR a document and extract a structured Pydantic model from it.

        Refinement is OFF by default for extraction (the LLM extractor reads raw
        OCR text directly, mirroring the original invoice pipeline).

        Raises
        ------
        LLMNotConfiguredError
            If no LLM was supplied. Extraction always needs one, so this is
            checked before OCR runs.
        """
        # Fail fast: without an extractor the OCR pass below would be wasted work.
        self._get_extractor()

        result = self.analyze(
            source,
            handwriting=handwriting,
            refine=refine,
            lang=lang,
            filename=filename,
        )
        return self.extract_text(
            result.text,
            schema,
            language=lang or self.config.lang,
            system_prompt=system_prompt,
        )

    def extract_text(
        self,
        text: str,
        schema: type[TSchema],
        *,
        language: Optional[str] = None,
        system_prompt: Optional[str] = None,
    ) -> TSchema:
        """Extract a structured Pydantic model from already-OCR'd text.

        Useful when you already have text (e.g. from a prior ``analyze`` call) and
        want to avoid re-running OCR. ``language`` falls back to ``config.lang``.

        Raises
        ------
        LLMNotConfiguredError
            If no LLM was supplied.
        """
        return self._get_extractor().extract(
            text,
            schema,
            language=language or self.config.lang,
            system_prompt=system_prompt,
        )

    def refine(
        self,
        text: str,
        *,
        language: Optional[str] = None,
        mode: RefinementMode = RefinementMode.CONSERVATIVE,
    ) -> str:
        """Refine arbitrary OCR text directly.

        ``language`` falls back to ``config.lang``.

        Raises
        ------
        LLMNotConfiguredError
            If no LLM was supplied.
        """
        return self._get_refiner().refine(
            text, language=language or self.config.lang, mode=mode
        )

    # --- Internals -----------------------------------------------------------

    def _should_refine(self, result: OcrResult, refine: Optional[bool]) -> bool:
        """Decide whether ``analyze`` should run the refiner on ``result``.

        An explicit ``refine`` value wins; ``None`` selects the automatic policy.
        """
        if refine is not None:
            # Explicit request. Truthiness (not identity) is used so that
            # non-``bool`` flags such as 0/1 are honoured instead of being
            # mistaken for "auto".
            if not refine:
                return False
            if self._llm is None:
                raise LLMNotConfiguredError("refine=True")
            return True
        # refine is None -> auto
        if self._llm is None or not self.config.refine_by_default:
            return False
        # Never "correct" an exact digital PDF text layer.
        return result.text_source != "pdf_text_layer"

    def _infer_mode(self, result: OcrResult) -> RefinementMode:
        """Pick a refinement mode from the text source recorded on ``result``."""
        if result.text_source in _HANDWRITING_SOURCES:
            return handwriting_refinement_mode(result.text, result.has_dikw_structure)
        # Only reachable for a PDF text layer when refinement was forced explicitly.
        if result.text_source == "pdf_text_layer":
            return RefinementMode.LAYOUT
        return RefinementMode.CONSERVATIVE

    def _get_refiner(self) -> "Refiner":
        """Return the cached ``Refiner``, creating it on first use."""
        if self._llm is None:
            raise LLMNotConfiguredError("Refinement")
        if self._refiner is None:
            # Imported lazily so the LLM stack is only loaded when needed.
            from .llm.refiner import Refiner

            self._refiner = Refiner(self._llm)
        return self._refiner

    def _get_extractor(self) -> "StructuredExtractor":
        """Return the cached ``StructuredExtractor``, creating it on first use."""
        if self._llm is None:
            raise LLMNotConfiguredError("Structured extraction")
        if self._extractor is None:
            # Imported lazily so the LLM stack is only loaded when needed.
            from .llm.extractor import StructuredExtractor

            self._extractor = StructuredExtractor(self._llm)
        return self._extractor
ocrcontext/config.py ADDED
@@ -0,0 +1,49 @@
1
+ """Configuration for the analyzer / pipeline.
2
+
3
+ All knobs mirror constants from the original Modal service so OCR behaviour is
4
+ identical after decoupling.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from dataclasses import dataclass
10
+
# Scale factors used when a PDF page is rasterized for image OCR; the
# handwriting variant renders larger because handwriting needs finer detail.
OCR_PDF_RENDER_SCALE = 2.75
OCR_PDF_RENDER_SCALE_HANDWRITING = 3.5

# Fewest non-empty lines a page is expected to yield; below this the
# line-band fallback kicks in.
MIN_EXPECTED_LINES_PER_PAGE = 3
MIN_EXPECTED_LINES_HANDWRITING = 1
18
+
19
+
@dataclass
class AnalyzerConfig:
    """Settings bundle for an :class:`~ocrcontext.analyzer.Analyzer`.

    The defaults reproduce the production pipeline's behaviour, so a bare
    ``AnalyzerConfig()`` is the reference configuration.
    """

    # Document language as a UI-style code ("en", "tr", ...). It is mapped to a
    # PaddleOCR model via ocrcontext.utils.lang.normalize_paddle_lang.
    lang: str = "en"

    # Use a digital PDF's embedded text layer instead of OCR when it is sufficient.
    prefer_pdf_text_layer: bool = True

    # Rasterization scales for PDF pages (printed / handwriting).
    pdf_render_scale: float = OCR_PDF_RENDER_SCALE
    pdf_render_scale_handwriting: float = OCR_PDF_RENDER_SCALE_HANDWRITING

    # Thresholds for the line-band fallback (printed / handwriting).
    min_lines_per_page: int = MIN_EXPECTED_LINES_PER_PAGE
    min_lines_handwriting: int = MIN_EXPECTED_LINES_HANDWRITING

    # Retry with the handwriting engine automatically when printed OCR returns
    # insufficient text (mirrors the documents/process retry ladder).
    auto_handwriting_fallback: bool = True

    # Policy for Analyzer.analyze(refine=None): refine when an LLM is configured
    # AND the text did not come from an exact digital PDF text layer (which must
    # not be "corrected").
    refine_by_default: bool = True
@@ -0,0 +1,6 @@
1
+ """OCR engines and the singleton model registry."""
2
+
3
+ from .base import OcrEngine, PageOcr
4
+ from .registry import EngineRegistry
5
+
6
+ __all__ = ["OcrEngine", "PageOcr", "EngineRegistry"]
@@ -0,0 +1,45 @@
1
+ """Engine abstractions shared by all OCR backends."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from abc import ABC, abstractmethod
6
+ from dataclasses import dataclass, field
7
+
8
+
@dataclass
class PageOcr:
    """OCR output for one page image: the text plus engine-supplied metadata."""

    text: str
    scores: list[float] = field(default_factory=list)
    # Raised by the handwriting engine for pages that look like a DIKW/pyramid diagram.
    has_dikw_structure: bool = False
    # Label reported by the engine for where the text came from
    # (e.g. "vision_handwriting", "trocr_handwriting").
    text_source: str | None = None

    @property
    def line_count(self) -> int:
        """Count the lines of ``text`` that contain non-whitespace characters."""
        return sum(1 for line in self.text.splitlines() if line.strip())
23
+
24
+
class OcrEngine(ABC):
    """Abstract base for OCR backends that read one page image from disk.

    Implementations own their preprocessing and must remove any temporary
    files they create.
    """

    #: Default text_source label reported in OcrResult when this engine is used.
    text_source: str = "ocr"

    @abstractmethod
    def recognize(
        self,
        img_path: str,
        *,
        lang: str = "en",
        min_lines: int = 1,
        handwriting: bool = False,
    ) -> "PageOcr":
        """Run recognition on one page image; return its text and per-token scores."""
        raise NotImplementedError
@@ -0,0 +1,103 @@
1
+ """Composite handwriting engine: Google Vision primary, TrOCR fallback.
2
+
3
+ Mirrors ocr-service/modal_app.py::HandwritingOCRService per-page logic without
4
+ the Modal class wrapper. Each sub-engine is loaded lazily on first use.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from ..preprocessing.image import preprocess_image_for_ocr
10
+ from ..utils.files import cleanup_paths
11
+ from .base import OcrEngine, PageOcr
12
+ from .trocr import TrOCRHandwritingEngine, run_trocr_on_page
13
+ from .vision import GoogleVisionHandwritingEngine, detect_dikw_structure, run_vision_on_page
14
+
15
+
class HandwritingEngine(OcrEngine):
    """Vision-first handwriting recognition with a TrOCR fallback per page.

    Both sub-engines are created lazily on first use and then reused.
    """

    text_source = "handwriting_ocr"

    def __init__(self) -> None:
        self._vision: GoogleVisionHandwritingEngine | None = None
        self._trocr: TrOCRHandwritingEngine | None = None

    def _ensure_vision(self) -> GoogleVisionHandwritingEngine:
        """Return the cached Vision engine, loading it on first use."""
        if self._vision is None:
            engine = GoogleVisionHandwritingEngine()
            engine.load()  # no-op disable if creds missing; raises only if pkg absent
            # Cache only after a successful load so a failed load is retried later.
            self._vision = engine
        return self._vision

    def _ensure_trocr(self) -> TrOCRHandwritingEngine:
        """Return the cached TrOCR engine, loading and warming it on first use."""
        if self._trocr is None:
            engine = TrOCRHandwritingEngine()
            engine.load()
            engine.warmup_inference()
            self._trocr = engine
        return self._trocr

    @staticmethod
    def _note_failure(stage: str) -> None:
        """Log a swallowed sub-engine failure at DEBUG level.

        Must be called from inside an ``except`` block so the active exception
        is attached to the log record.
        """
        import logging  # local import keeps this class self-contained

        logging.getLogger(__name__).debug(
            "%s failed; falling back", stage, exc_info=True
        )

    def _try_vision(self, ocr_img_path: str, lang: str) -> tuple[str, float, bool]:
        """Best-effort Vision pass returning ``(text, confidence, has_dikw)``.

        Vision is optional, so every failure mode (engine cannot be loaded,
        engine disabled, recognition error, blank result) yields
        ``("", 0.0, False)`` and lets the caller fall back to TrOCR.
        """
        try:
            vision = self._ensure_vision()
        except Exception:
            self._note_failure("Google Vision load")
            return "", 0.0, False

        # load() leaves the engine disabled when no credentials exist.
        if not vision.enabled:
            return "", 0.0, False

        try:
            text, conf = run_vision_on_page(vision, ocr_img_path, ocr_lang=lang)
            # Whitespace-only output is not a usable result: treat it as empty
            # so the TrOCR fallback still runs.
            text = (text or "").strip()
            if not text:
                return "", 0.0, False
            return text, conf, bool(vision.last_has_dikw_structure)
        except Exception:
            # Returning a fresh empty tuple guarantees no partial Vision state
            # (text, confidence, or "used" status) leaks into the final result.
            self._note_failure("Google Vision recognition")
            return "", 0.0, False

    def recognize(
        self,
        img_path: str,
        *,
        lang: str = "en",
        min_lines: int = 1,
        handwriting: bool = True,
    ) -> PageOcr:
        """Recognize handwriting on one page image.

        Tries Google Vision first and falls back to TrOCR when Vision is
        unavailable, fails, or yields no text. ``min_lines`` and ``handwriting``
        are accepted for interface compatibility with ``OcrEngine.recognize``
        and are not used here; preprocessing always runs in handwriting mode.

        Returns a ``PageOcr`` whose ``text_source`` is ``"vision_handwriting"``,
        ``"trocr_handwriting"``, or ``"handwriting_ocr"`` when neither engine
        produced text. Exceptions from preprocessing or TrOCR propagate.
        """
        preprocessed: list[str] = []
        try:
            ocr_img_path = preprocess_image_for_ocr(img_path, handwriting=True)
            # Only a newly created file is ours to delete; never the caller's input.
            if ocr_img_path != img_path:
                preprocessed.append(ocr_img_path)

            page_text, page_conf, has_dikw = self._try_vision(ocr_img_path, lang)
            used_vision = bool(page_text)
            used_trocr = False

            if not used_vision:
                trocr = self._ensure_trocr()
                page_text, page_conf = run_trocr_on_page(trocr, ocr_img_path)
                page_text = (page_text or "").strip()
                used_trocr = bool(page_text)

            if used_vision:
                text_source = "vision_handwriting"
            elif used_trocr:
                text_source = "trocr_handwriting"
            else:
                text_source = "handwriting_ocr"

            # Text-based detection is the fallback when the engine did not flag it.
            if not has_dikw and page_text:
                has_dikw = detect_dikw_structure(page_text)

            scores = [page_conf] if page_conf > 0 else []
            return PageOcr(
                text=page_text,
                scores=scores,
                has_dikw_structure=has_dikw,
                text_source=text_source,
            )
        finally:
            cleanup_paths(preprocessed)