PyPI - ocrcontext - Versions diffs - 0.1.0__py3-none-any.whl - Mend

ocrcontext 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34) hide show

ocrcontext/__init__.py +49 -0
ocrcontext/analyzer.py +198 -0
ocrcontext/config.py +49 -0
ocrcontext/engines/__init__.py +6 -0
ocrcontext/engines/base.py +45 -0
ocrcontext/engines/handwriting.py +103 -0
ocrcontext/engines/paddle.py +264 -0
ocrcontext/engines/pdf_text.py +126 -0
ocrcontext/engines/registry.py +67 -0
ocrcontext/engines/trocr.py +191 -0
ocrcontext/engines/vision.py +538 -0
ocrcontext/exceptions.py +45 -0
ocrcontext/llm/__init__.py +10 -0
ocrcontext/llm/drift.py +58 -0
ocrcontext/llm/extractor.py +63 -0
ocrcontext/llm/formatting.py +39 -0
ocrcontext/llm/literal_preserve.py +164 -0
ocrcontext/llm/prompts.py +157 -0
ocrcontext/llm/refiner.py +114 -0
ocrcontext/llm/schemas.py +99 -0
ocrcontext/pipeline.py +162 -0
ocrcontext/preprocessing/__init__.py +5 -0
ocrcontext/preprocessing/image.py +177 -0
ocrcontext/py.typed +0 -0
ocrcontext/quality.py +76 -0
ocrcontext/schemas.py +8 -0
ocrcontext/types.py +55 -0
ocrcontext/utils/__init__.py +1 -0
ocrcontext/utils/files.py +172 -0
ocrcontext/utils/lang.py +77 -0
ocrcontext-0.1.0.dist-info/METADATA +207 -0
ocrcontext-0.1.0.dist-info/RECORD +34 -0
ocrcontext-0.1.0.dist-info/WHEEL +4 -0
ocrcontext-0.1.0.dist-info/licenses/LICENSE +21 -0

ocrcontext/utils/lang.py ADDED Viewed

@@ -0,0 +1,77 @@
+"""Language code helpers.
+``normalize_paddle_lang`` and the language map are ported verbatim from
+``ocr-service/modal_app.py`` and ``lib/ocr/refine.ts`` respectively.
+"""
+from __future__ import annotations
+from typing import Optional
+# Mirrors languageMap in lib/ocr/refine.ts: lowercase UI language code -> the
+# human-readable English language name used inside the refinement prompts.
+LANGUAGE_MAP: dict[str, str] = {
+    "tr": "Turkish",
+    "en": "English",
+    "es": "Spanish",
+    "fr": "French",
+    "de": "German",
+    "it": "Italian",
+    "pt": "Portuguese",
+    "ru": "Russian",
+    "zh": "Chinese",
+    "ja": "Japanese",
+    "ko": "Korean",
+}
+def language_full_name(lang: Optional[str]) -> Optional[str]:
+    """Return the human-readable language name for a UI code, or the code itself."""
+    if not lang:
+        return None
+    return LANGUAGE_MAP.get(lang, lang)
+def normalize_paddle_lang(lang: Optional[str]) -> str:
+    """Map UI / document language codes to PaddleOCR recognition models.
+    Turkish is not a separate 'tr' pack in many PaddleOCR builds; 'latin' covers
+    Latin-script languages with a wider charset than 'en' alone.
+    Ported verbatim from ocr-service/modal_app.py::normalize_paddle_lang.
+    """
+    if not lang:
+        return "en"
+    code = str(lang).strip().lower()
+    if code in ("auto", "unknown"):
+        return "en"
+    # Turkish / similar Latin-extended -> latin model (better s, g, i, o, u than en-only)
+    if code in ("tr", "tur", "turkish"):
+        return "latin"
+    return {
+        "en": "en",
+        "english": "en",
+        "de": "german",
+        "german": "german",
+        "fr": "french",
+        "french": "french",
+        "es": "es",
+        "spanish": "es",
+        "pt": "portuguese",
+        "portuguese": "portuguese",
+        "it": "it",
+        "italian": "it",
+    }.get(code, code if len(code) <= 20 else "en")
+def candidate_langs(lang: Optional[str]) -> list[str]:
+    """Ordered, de-duplicated PaddleOCR model candidates: primary -> latin -> en.
+    Mirrors the candidate selection in OCRService.process.
+    """
+    primary = normalize_paddle_lang(lang)
+    out: list[str] = []
+    for code in (primary, "latin", "en"):
+        if code not in out:
+            out.append(code)
+    return out

ocrcontext-0.1.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,207 @@
+Metadata-Version: 2.4
+Name: ocrcontext
+Version: 0.1.0
+Summary: Decoupled, LLM-agnostic document OCR + structured extraction. Vision and LLM parsing in 3 lines of code.
+Project-URL: Homepage, https://github.com/bahadirkarsli/ocrcontext
+Project-URL: Repository, https://github.com/bahadirkarsli/ocrcontext
+Project-URL: Issues, https://github.com/bahadirkarsli/ocrcontext/issues
+Project-URL: Changelog, https://github.com/bahadirkarsli/ocrcontext/blob/main/CHANGELOG.md
+Author-email: Bahadır Karslı <bahadrkrsl@outlook.com>
+Maintainer-email: Bahadır Karslı <bahadrkrsl@outlook.com>
+License: MIT
+License-File: LICENSE
+Keywords: document-ai,langchain,ocr,paddleocr,pdf,structured-extraction
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Topic :: Scientific/Engineering :: Image Recognition
+Classifier: Topic :: Text Processing :: Linguistic
+Classifier: Typing :: Typed
+Requires-Python: >=3.10
+Requires-Dist: langchain-core>=0.3
+Requires-Dist: numpy>=1.24
+Requires-Dist: pillow>=9.0
+Requires-Dist: pydantic>=2.5
+Requires-Dist: pymupdf>=1.23
+Provides-Extra: all
+Requires-Dist: accelerate>=0.27; extra == 'all'
+Requires-Dist: google-cloud-vision>=3.8.1; extra == 'all'
+Requires-Dist: opencv-python-headless>=4.8; extra == 'all'
+Requires-Dist: paddleocr>=2.7.0.3; extra == 'all'
+Requires-Dist: paddlepaddle>=2.6; extra == 'all'
+Requires-Dist: sentencepiece>=0.1.99; extra == 'all'
+Requires-Dist: torch>=2.1; extra == 'all'
+Requires-Dist: torchvision>=0.16; extra == 'all'
+Requires-Dist: transformers>=4.40; extra == 'all'
+Provides-Extra: dev
+Requires-Dist: build>=1.2; extra == 'dev'
+Requires-Dist: pytest-cov>=4.0; extra == 'dev'
+Requires-Dist: pytest>=8.0; extra == 'dev'
+Requires-Dist: ruff>=0.5; extra == 'dev'
+Requires-Dist: twine>=5.0; extra == 'dev'
+Provides-Extra: paddle
+Requires-Dist: opencv-python-headless>=4.8; extra == 'paddle'
+Requires-Dist: paddleocr>=2.7.0.3; extra == 'paddle'
+Requires-Dist: paddlepaddle>=2.6; extra == 'paddle'
+Provides-Extra: trocr
+Requires-Dist: accelerate>=0.27; extra == 'trocr'
+Requires-Dist: opencv-python-headless>=4.8; extra == 'trocr'
+Requires-Dist: sentencepiece>=0.1.99; extra == 'trocr'
+Requires-Dist: torch>=2.1; extra == 'trocr'
+Requires-Dist: torchvision>=0.16; extra == 'trocr'
+Requires-Dist: transformers>=4.40; extra == 'trocr'
+Provides-Extra: vision
+Requires-Dist: google-cloud-vision>=3.8.1; extra == 'vision'
+Requires-Dist: opencv-python-headless>=4.8; extra == 'vision'
+Description-Content-Type: text/markdown
+# ocrcontext
+**Decoupled, LLM-agnostic document OCR + structured extraction.** Turn a PDF or
+image into clean text — or a typed Pydantic model — in three lines.
+`ocrcontext` is the extraction core of a document-analysis platform, lifted out
+of its web stack into a pure, pip-installable library. No FastAPI, no servers,
+no hardcoded model providers.
+```python
+from ocrcontext import Analyzer
+result = Analyzer().analyze("invoice.pdf")
+print(result.text)
+```
+## Why
+- **3-line DX** — instantiate, pass a file, get a result.
+- **LLM-agnostic** — inject any LangChain chat model (OpenAI, Anthropic, Ollama,
+  local). The only LangChain dependency is `langchain-core`; you bring the provider.
+- **Resource-efficient** — heavy OCR models (PaddleOCR, TrOCR) load lazily and
+  are cached as process-wide singletons, so they never reload per call.
+- **Lightweight base install** — engines are opt-in extras.
+## Install
+```bash
+pip install ocrcontext              # core only (PDF text layer + the API surface)
+pip install 'ocrcontext[paddle]'    # printed text + scanned PDFs (PaddleOCR)
+pip install 'ocrcontext[trocr]'     # handwriting fallback (Microsoft TrOCR)
+pip install 'ocrcontext[vision]'    # handwriting primary (Google Cloud Vision)
+pip install 'ocrcontext[all]'       # everything
+```
+Pick an LLM provider for refinement / extraction:
+```bash
+pip install langchain-openai        # or langchain-anthropic, langchain-ollama, ...
+```
+## Usage
+### Raw OCR (no LLM, no API key)
+```python
+from ocrcontext import Analyzer
+result = Analyzer().analyze("scan.png")
+print(result.text, result.confidence, result.pages, result.text_source)
+```
+### LLM-refined OCR
+Refinement fixes OCR errors **without** paraphrasing, translating, or inventing
+text. Emails/URLs/IBANs are frozen so the model can't "correct" them, and output
+that drifts too far from the source is rejected in favour of the raw text.
+```python
+from langchain_openai import ChatOpenAI
+from ocrcontext import Analyzer
+analyzer = Analyzer(llm=ChatOpenAI(model="gpt-4o"), lang="tr")
+result = analyzer.analyze("handwritten_note.jpg", handwriting=True)
+print(result.text)          # refined
+print(result.raw_text)      # original OCR, kept alongside
+```
+### Structured extraction
+```python
+from langchain_openai import ChatOpenAI
+from ocrcontext import Analyzer
+from ocrcontext.schemas import Invoice
+analyzer = Analyzer(llm=ChatOpenAI(model="gpt-4o-mini", temperature=0))
+invoice = analyzer.extract("invoice.pdf", schema=Invoice)   # -> Invoice instance
+print(invoice.total_amount, invoice.currency)
+```
+Define your own schema with plain Pydantic:
+```python
+from pydantic import BaseModel, Field
+class Receipt(BaseModel):
+    merchant: str | None = Field(None, description="Store name")
+    total: float | None = Field(None, description="Grand total")
+receipt = analyzer.extract("receipt.jpg", schema=Receipt)
+```
+### Same code, local model (no API key)
+```python
+from langchain_ollama import ChatOllama
+from ocrcontext import Analyzer
+analyzer = Analyzer(llm=ChatOllama(model="llama3.1"))
+print(analyzer.analyze("scan.png").text)
+```
+## How it routes a document
+1. **Digital PDF** → embedded text-layer extraction (exact text; LLM refine is
+   skipped so identifiers aren't altered).
+2. **Image / scanned PDF** → PaddleOCR with preprocessing (deskew, denoise,
+   CLAHE), multi-language *coverage-first* selection, and a line-band recovery
+   fallback.
+3. **Handwriting** (`handwriting=True`, or auto when printed OCR yields too
+   little text) → Google Vision primary, TrOCR fallback.
+4. **Optional LLM refine** → fidelity-first, literal-preserved, drift-guarded.
+5. **Optional `extract(schema=...)`** → typed Pydantic model.
+## Refinement modes
+`RefinementMode`: `conservative` (scans), `layout` (digital PDFs),
+`handwriting_prose`, `handwriting_layout`. The handwriting mode is auto-selected
+based on whether the text looks like a DIKW/pyramid diagram. Modes and prompts
+are ported verbatim from the production pipeline.
+## Configuration
+```python
+from ocrcontext import Analyzer, AnalyzerConfig
+cfg = AnalyzerConfig(
+    lang="tr",
+    prefer_pdf_text_layer=True,
+    auto_handwriting_fallback=True,
+)
+analyzer = Analyzer(llm=..., config=cfg)
+```
+## Development
+```bash
+pip install -e '.[dev]'
+pytest            # runs without GPU/network — engines and LLM are faked
+```
+## License
+MIT

ocrcontext-0.1.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,34 @@
+ocrcontext/__init__.py,sha256=Dv2UkpPlAQ-Oayh1akjdXmJ6keXvsrAM-TUk-jssoAs,1162
+ocrcontext/analyzer.py,sha256=dvlQumA0rU0J0pJwahpj4yXNcXkB5BvgkgxVDSH2jsU,6631
+ocrcontext/config.py,sha256=0j2XZVM7zwTb9QwSyB1FNPqNU1a6iOfT4Sg0LUmx5SE,1784
+ocrcontext/exceptions.py,sha256=1sCAv2i7gICuTCgy-z0dnUVBEwDz-1NG5a1_N7OgIcQ,1435
+ocrcontext/pipeline.py,sha256=R56X_y9Oev93dadLTvJkeMrdc8-elZnLxbSI2b2lIX0,5451
+ocrcontext/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ocrcontext/quality.py,sha256=CqIJNUdTpObuXIuybnDXwqlcPtvJH9r_zCqqDXrjIjU,2323
+ocrcontext/schemas.py,sha256=yW9wxiVj00aMdMAHbV27l3C-XC-a07YHrcWerrp0U8M,245
+ocrcontext/types.py,sha256=A81QEb2UA6IdGz7pdGDGWxxWnbM5yOyMxrQ5z31OyIU,1923
+ocrcontext/engines/__init__.py,sha256=CzOYVHco484KR3ioZoVNMkPAn01k1_rFoqS7pPH7wyw,181
+ocrcontext/engines/base.py,sha256=lr9RBeBVFB_t7XnCiPOShA6SaKUPqBw0CJkZzWdmDsQ,1312
+ocrcontext/engines/handwriting.py,sha256=_tiGYLqHC1COiK-tuhqOZgD2Z4GpHvHtDsdm5S1B04w,3537
+ocrcontext/engines/paddle.py,sha256=ZPLxPyRnvZ9gDBdafsNq_r1AI0U30VG6t36vOVLhnMI,10521
+ocrcontext/engines/pdf_text.py,sha256=_wPSwBbTmiz75_w0dvrVWe2pMZ--r79vbux0y73AvNs,3876
+ocrcontext/engines/registry.py,sha256=50JSUU-ic-zdEiAlhCXET8v_2lHQNdOm2bdprBtQhio,2133
+ocrcontext/engines/trocr.py,sha256=hltZch8zdm6mQrDCf2GdWyd9-eK0GLp6EY1sUNA69Jg,6160
+ocrcontext/engines/vision.py,sha256=OKwijPPUMxIJLOP-tGZwUFAyosLw8K_EOZ1HjviW4bA,16986
+ocrcontext/llm/__init__.py,sha256=I0qXc1lPzztunJ9L4PJwDb35ILmZXZFoEe7jqC68sHs,344
+ocrcontext/llm/drift.py,sha256=ZP3SXxFRehoTuhHkyWXx25-z5QM7zjRb9gu2hTf_wQo,2183
+ocrcontext/llm/extractor.py,sha256=qakUToNKvUKmzUieJ4B24WUZNETeFgLMeJKALT9oYx8,2154
+ocrcontext/llm/formatting.py,sha256=ffkZ8FXdu0SRsSYHV6PNCd7Ey4bKgr_lGtSLdv9qLNY,1339
+ocrcontext/llm/literal_preserve.py,sha256=VZw6ebHQqoTcz_56Dopd9zCh6wD2JUl2Ck0zte3nz10,5623
+ocrcontext/llm/prompts.py,sha256=XpIfiGtP3VtkV2kkdWQbfm3r7wUfgvEgpAUVDI4BXa8,7586
+ocrcontext/llm/refiner.py,sha256=gpqu5nHSenBfDx9Ft7b_4TYnnLkO9WxvDpjg_2MnDi0,4323
+ocrcontext/llm/schemas.py,sha256=Y4RrLmwVzw5wkSGVZhUaRIzsCdAfjob4K9jOAtx9rr8,4248
+ocrcontext/preprocessing/__init__.py,sha256=L_OHsKRcbGSOnILTPvJ2D5dgfmsedHiCEpUeIaOZzm0,226
+ocrcontext/preprocessing/image.py,sha256=L5KF1t_-kaNfL5ycjEYsXfrC7YDaPs9_bTgrdrRkWVw,5540
+ocrcontext/utils/__init__.py,sha256=2frdDgbVpf6ODK2JIhLus1Md8-6WhkIzB6V_KHrLcj4,60
+ocrcontext/utils/files.py,sha256=Wl2GkQf9TCuUthKs2ovO20geliL73cLbcaoWbw7xZtw,5474
+ocrcontext/utils/lang.py,sha256=v52hwYrHCJB6tfvrbgYKhSODSQ6amL1ACLnQOusfGSA,2245
+ocrcontext-0.1.0.dist-info/METADATA,sha256=g8wrTzxhl7bUUuUDgrGH-DrjrKURfvXwnOIzqc2aoCo,7257
+ocrcontext-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
+ocrcontext-0.1.0.dist-info/licenses/LICENSE,sha256=coVOBGbnFj0umrt9J48B_5gRJY3n67WyP-6SESmhyP8,1073
+ocrcontext-0.1.0.dist-info/RECORD,,

ocrcontext-0.1.0.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,4 @@
+Wheel-Version: 1.0
+Generator: hatchling 1.30.1
+Root-Is-Purelib: true
+Tag: py3-none-any

ocrcontext-0.1.0.dist-info/licenses/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 Bahadır Karslı
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.