ocrcontext 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,77 @@
1
+ """Language code helpers.
2
+
3
+ ``normalize_paddle_lang`` and the language map are ported verbatim from
4
+ ``ocr-service/modal_app.py`` and ``lib/ocr/refine.ts`` respectively.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from typing import Optional
10
+
11
# Mirrors languageMap in lib/ocr/refine.ts — UI code -> human-readable name used
# inside the refinement prompts.
LANGUAGE_MAP: dict[str, str] = {
    "tr": "Turkish",
    "en": "English",
    "es": "Spanish",
    "fr": "French",
    "de": "German",
    "it": "Italian",
    "pt": "Portuguese",
    "ru": "Russian",
    "zh": "Chinese",
    "ja": "Japanese",
    "ko": "Korean",
}


def language_full_name(lang: Optional[str]) -> Optional[str]:
    """Return the human-readable language name for a UI code, or the code itself.

    The lookup ignores letter case and surrounding whitespace, so ``"TR"`` and
    ``" tr "`` both resolve to ``"Turkish"``.

    Args:
        lang: UI language code such as ``"tr"``; may be ``None`` or empty.

    Returns:
        ``None`` when ``lang`` is falsy, the mapped name when the normalised
        code is a key of ``LANGUAGE_MAP``, otherwise ``lang`` exactly as given.
    """
    if not lang:
        return None
    # Normalise only the lookup key; unknown codes are handed back untouched so
    # callers still see the value they supplied.
    key = str(lang).strip().lower()
    return LANGUAGE_MAP.get(key, lang)
33
+
34
+
35
# Accepted UI codes / English language names -> PaddleOCR recognition model.
# Built once at import time instead of on every call.
_PADDLE_LANG_ALIASES: dict[str, str] = {
    # Turkish is routed to 'latin': its wider charset handles Latin-extended
    # letters better than the en-only model.
    "tr": "latin",
    "tur": "latin",
    "turkish": "latin",
    "en": "en",
    "english": "en",
    "de": "german",
    "german": "german",
    "fr": "french",
    "french": "french",
    "es": "es",
    "spanish": "es",
    "pt": "portuguese",
    "portuguese": "portuguese",
    "it": "it",
    "italian": "it",
    "ru": "ru",
    "russian": "ru",
    # PaddleOCR names these models differently from the ISO-style UI codes.
    "zh": "ch",
    "chinese": "ch",
    "ja": "japan",
    "japanese": "japan",
    "ko": "korean",
}


def normalize_paddle_lang(lang: Optional[str]) -> str:
    """Map UI / document language codes to PaddleOCR recognition models.

    Turkish is not a separate 'tr' pack in many PaddleOCR builds; 'latin' covers
    Latin-script languages with a wider charset than 'en' alone.

    Originally ported from ocr-service/modal_app.py::normalize_paddle_lang.
    This version differs from that port in two ways: blank input falls back to
    ``"en"`` instead of yielding an empty model name, and the Chinese, Japanese,
    Korean and Russian UI codes are mapped to PaddleOCR model names.

    Args:
        lang: Language code or English language name; case and surrounding
            whitespace are ignored. May be ``None`` or empty.

    Returns:
        The PaddleOCR model code. ``"en"`` is returned for missing, blank,
        ``"auto"`` or ``"unknown"`` input and for unrecognised values longer
        than 20 characters; other unrecognised values are returned normalised
        (stripped, lower-cased) so that codes PaddleOCR knows but this table
        does not list still pass through.
    """
    if not lang:
        return "en"
    code = str(lang).strip().lower()
    # `not code` catches whitespace-only input, which would otherwise be
    # returned as an empty model name.
    if not code or code in ("auto", "unknown"):
        return "en"
    alias = _PADDLE_LANG_ALIASES.get(code)
    if alias is not None:
        return alias
    # Length cap guards against arbitrary long strings being used as a model name.
    return code if len(code) <= 20 else "en"
65
+
66
+
67
def candidate_langs(lang: Optional[str]) -> list[str]:
    """Ordered, de-duplicated PaddleOCR model candidates: primary -> latin -> en.

    Mirrors the candidate selection in OCRService.process.

    Args:
        lang: UI / document language code; normalised via
            ``normalize_paddle_lang`` to obtain the primary candidate.

    Returns:
        A list starting with the primary model, followed by ``"latin"`` and
        ``"en"`` unless they are already present.
    """
    preferred = normalize_paddle_lang(lang)
    # dict.fromkeys keeps first-seen order, so duplicates collapse without
    # disturbing the primary -> latin -> en priority.
    return list(dict.fromkeys((preferred, "latin", "en")))
@@ -0,0 +1,207 @@
1
+ Metadata-Version: 2.4
2
+ Name: ocrcontext
3
+ Version: 0.1.0
4
+ Summary: Decoupled, LLM-agnostic document OCR + structured extraction. Vision and LLM parsing in 3 lines of code.
5
+ Project-URL: Homepage, https://github.com/bahadirkarsli/ocrcontext
6
+ Project-URL: Repository, https://github.com/bahadirkarsli/ocrcontext
7
+ Project-URL: Issues, https://github.com/bahadirkarsli/ocrcontext/issues
8
+ Project-URL: Changelog, https://github.com/bahadirkarsli/ocrcontext/blob/main/CHANGELOG.md
9
+ Author-email: Bahadır Karslı <bahadrkrsl@outlook.com>
10
+ Maintainer-email: Bahadır Karslı <bahadrkrsl@outlook.com>
11
+ License: MIT
12
+ License-File: LICENSE
13
+ Keywords: document-ai,langchain,ocr,paddleocr,pdf,structured-extraction
14
+ Classifier: Development Status :: 4 - Beta
15
+ Classifier: Intended Audience :: Developers
16
+ Classifier: License :: OSI Approved :: MIT License
17
+ Classifier: Operating System :: OS Independent
18
+ Classifier: Programming Language :: Python :: 3
19
+ Classifier: Programming Language :: Python :: 3.10
20
+ Classifier: Programming Language :: Python :: 3.11
21
+ Classifier: Programming Language :: Python :: 3.12
22
+ Classifier: Programming Language :: Python :: 3.13
23
+ Classifier: Topic :: Scientific/Engineering :: Image Recognition
24
+ Classifier: Topic :: Text Processing :: Linguistic
25
+ Classifier: Typing :: Typed
26
+ Requires-Python: >=3.10
27
+ Requires-Dist: langchain-core>=0.3
28
+ Requires-Dist: numpy>=1.24
29
+ Requires-Dist: pillow>=9.0
30
+ Requires-Dist: pydantic>=2.5
31
+ Requires-Dist: pymupdf>=1.23
32
+ Provides-Extra: all
33
+ Requires-Dist: accelerate>=0.27; extra == 'all'
34
+ Requires-Dist: google-cloud-vision>=3.8.1; extra == 'all'
35
+ Requires-Dist: opencv-python-headless>=4.8; extra == 'all'
36
+ Requires-Dist: paddleocr>=2.7.0.3; extra == 'all'
37
+ Requires-Dist: paddlepaddle>=2.6; extra == 'all'
38
+ Requires-Dist: sentencepiece>=0.1.99; extra == 'all'
39
+ Requires-Dist: torch>=2.1; extra == 'all'
40
+ Requires-Dist: torchvision>=0.16; extra == 'all'
41
+ Requires-Dist: transformers>=4.40; extra == 'all'
42
+ Provides-Extra: dev
43
+ Requires-Dist: build>=1.2; extra == 'dev'
44
+ Requires-Dist: pytest-cov>=4.0; extra == 'dev'
45
+ Requires-Dist: pytest>=8.0; extra == 'dev'
46
+ Requires-Dist: ruff>=0.5; extra == 'dev'
47
+ Requires-Dist: twine>=5.0; extra == 'dev'
48
+ Provides-Extra: paddle
49
+ Requires-Dist: opencv-python-headless>=4.8; extra == 'paddle'
50
+ Requires-Dist: paddleocr>=2.7.0.3; extra == 'paddle'
51
+ Requires-Dist: paddlepaddle>=2.6; extra == 'paddle'
52
+ Provides-Extra: trocr
53
+ Requires-Dist: accelerate>=0.27; extra == 'trocr'
54
+ Requires-Dist: opencv-python-headless>=4.8; extra == 'trocr'
55
+ Requires-Dist: sentencepiece>=0.1.99; extra == 'trocr'
56
+ Requires-Dist: torch>=2.1; extra == 'trocr'
57
+ Requires-Dist: torchvision>=0.16; extra == 'trocr'
58
+ Requires-Dist: transformers>=4.40; extra == 'trocr'
59
+ Provides-Extra: vision
60
+ Requires-Dist: google-cloud-vision>=3.8.1; extra == 'vision'
61
+ Requires-Dist: opencv-python-headless>=4.8; extra == 'vision'
62
+ Description-Content-Type: text/markdown
63
+
64
+ # ocrcontext
65
+
66
+ **Decoupled, LLM-agnostic document OCR + structured extraction.** Turn a PDF or
67
+ image into clean text — or a typed Pydantic model — in three lines.
68
+
69
+ `ocrcontext` is the extraction core of a document-analysis platform, lifted out
70
+ of its web stack into a pure, pip-installable library. No FastAPI, no servers,
71
+ no hardcoded model providers.
72
+
73
+ ```python
74
+ from ocrcontext import Analyzer
75
+
76
+ result = Analyzer().analyze("invoice.pdf")
77
+ print(result.text)
78
+ ```
79
+
80
+ ## Why
81
+
82
+ - **3-line DX** — instantiate, pass a file, get a result.
83
+ - **LLM-agnostic** — inject any LangChain chat model (OpenAI, Anthropic, Ollama,
84
+ local). Only `langchain-core` is required; you bring the provider.
85
+ - **Resource-efficient** — heavy OCR models (PaddleOCR, TrOCR) load lazily and
86
+ are cached as process-wide singletons, so they never reload per call.
87
+ - **Lightweight base install** — engines are opt-in extras.
88
+
89
+ ## Install
90
+
91
+ ```bash
92
+ pip install ocrcontext # core only (PDF text layer + the API surface)
93
+ pip install 'ocrcontext[paddle]' # printed text + scanned PDFs (PaddleOCR)
94
+ pip install 'ocrcontext[trocr]' # handwriting fallback (Microsoft TrOCR)
95
+ pip install 'ocrcontext[vision]' # handwriting primary (Google Cloud Vision)
96
+ pip install 'ocrcontext[all]' # everything
97
+ ```
98
+
99
+ Pick an LLM provider for refinement / extraction:
100
+
101
+ ```bash
102
+ pip install langchain-openai # or langchain-anthropic, langchain-ollama, ...
103
+ ```
104
+
105
+ ## Usage
106
+
107
+ ### Raw OCR (no LLM, no API key)
108
+
109
+ ```python
110
+ from ocrcontext import Analyzer
111
+
112
+ result = Analyzer().analyze("scan.png")
113
+ print(result.text, result.confidence, result.pages, result.text_source)
114
+ ```
115
+
116
+ ### LLM-refined OCR
117
+
118
+ Refinement fixes OCR errors **without** paraphrasing, translating, or inventing
119
+ text. Emails/URLs/IBANs are frozen so the model can't "correct" them, and output
120
+ that drifts too far from the source is rejected in favour of the raw text.
121
+
122
+ ```python
123
+ from langchain_openai import ChatOpenAI
124
+ from ocrcontext import Analyzer
125
+
126
+ analyzer = Analyzer(llm=ChatOpenAI(model="gpt-4o"), lang="tr")
127
+ result = analyzer.analyze("handwritten_note.jpg", handwriting=True)
128
+ print(result.text) # refined
129
+ print(result.raw_text) # original OCR, kept alongside
130
+ ```
131
+
132
+ ### Structured extraction
133
+
134
+ ```python
135
+ from langchain_openai import ChatOpenAI
136
+ from ocrcontext import Analyzer
137
+ from ocrcontext.schemas import Invoice
138
+
139
+ analyzer = Analyzer(llm=ChatOpenAI(model="gpt-4o-mini", temperature=0))
140
+ invoice = analyzer.extract("invoice.pdf", schema=Invoice) # -> Invoice instance
141
+ print(invoice.total_amount, invoice.currency)
142
+ ```
143
+
144
+ Define your own schema with plain Pydantic:
145
+
146
+ ```python
147
+ from pydantic import BaseModel, Field
148
+
149
+ class Receipt(BaseModel):
150
+ merchant: str | None = Field(None, description="Store name")
151
+ total: float | None = Field(None, description="Grand total")
152
+
153
+ receipt = analyzer.extract("receipt.jpg", schema=Receipt)
154
+ ```
155
+
156
+ ### Same code, local model (no API key)
157
+
158
+ ```python
159
+ from langchain_ollama import ChatOllama
160
+ from ocrcontext import Analyzer
161
+
162
+ analyzer = Analyzer(llm=ChatOllama(model="llama3.1"))
163
+ print(analyzer.analyze("scan.png").text)
164
+ ```
165
+
166
+ ## How it routes a document
167
+
168
+ 1. **Digital PDF** → embedded text-layer extraction (exact text; LLM refine is
169
+ skipped so identifiers aren't altered).
170
+ 2. **Image / scanned PDF** → PaddleOCR with preprocessing (deskew, denoise,
171
+ CLAHE), multi-language *coverage-first* selection, and a line-band recovery
172
+ fallback.
173
+ 3. **Handwriting** (`handwriting=True`, or auto when printed OCR yields too
174
+ little text) → Google Vision primary, TrOCR fallback.
175
+ 4. **Optional LLM refine** → fidelity-first, literal-preserved, drift-guarded.
176
+ 5. **Optional `extract(schema=...)`** → typed Pydantic model.
177
+
178
+ ## Refinement modes
179
+
180
+ `RefinementMode`: `conservative` (scans), `layout` (digital PDFs),
181
+ `handwriting_prose`, `handwriting_layout`. The handwriting mode is auto-selected
182
+ based on whether the text looks like a DIKW/pyramid diagram. Modes and prompts
183
+ are ported verbatim from the production pipeline.
184
+
185
+ ## Configuration
186
+
187
+ ```python
188
+ from ocrcontext import Analyzer, AnalyzerConfig
189
+
190
+ cfg = AnalyzerConfig(
191
+ lang="tr",
192
+ prefer_pdf_text_layer=True,
193
+ auto_handwriting_fallback=True,
194
+ )
195
+ analyzer = Analyzer(llm=..., config=cfg)
196
+ ```
197
+
198
+ ## Development
199
+
200
+ ```bash
201
+ pip install -e '.[dev]'
202
+ pytest # runs without GPU/network — engines and LLM are faked
203
+ ```
204
+
205
+ ## License
206
+
207
+ MIT
@@ -0,0 +1,34 @@
1
+ ocrcontext/__init__.py,sha256=Dv2UkpPlAQ-Oayh1akjdXmJ6keXvsrAM-TUk-jssoAs,1162
2
+ ocrcontext/analyzer.py,sha256=dvlQumA0rU0J0pJwahpj4yXNcXkB5BvgkgxVDSH2jsU,6631
3
+ ocrcontext/config.py,sha256=0j2XZVM7zwTb9QwSyB1FNPqNU1a6iOfT4Sg0LUmx5SE,1784
4
+ ocrcontext/exceptions.py,sha256=1sCAv2i7gICuTCgy-z0dnUVBEwDz-1NG5a1_N7OgIcQ,1435
5
+ ocrcontext/pipeline.py,sha256=R56X_y9Oev93dadLTvJkeMrdc8-elZnLxbSI2b2lIX0,5451
6
+ ocrcontext/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
+ ocrcontext/quality.py,sha256=CqIJNUdTpObuXIuybnDXwqlcPtvJH9r_zCqqDXrjIjU,2323
8
+ ocrcontext/schemas.py,sha256=yW9wxiVj00aMdMAHbV27l3C-XC-a07YHrcWerrp0U8M,245
9
+ ocrcontext/types.py,sha256=A81QEb2UA6IdGz7pdGDGWxxWnbM5yOyMxrQ5z31OyIU,1923
10
+ ocrcontext/engines/__init__.py,sha256=CzOYVHco484KR3ioZoVNMkPAn01k1_rFoqS7pPH7wyw,181
11
+ ocrcontext/engines/base.py,sha256=lr9RBeBVFB_t7XnCiPOShA6SaKUPqBw0CJkZzWdmDsQ,1312
12
+ ocrcontext/engines/handwriting.py,sha256=_tiGYLqHC1COiK-tuhqOZgD2Z4GpHvHtDsdm5S1B04w,3537
13
+ ocrcontext/engines/paddle.py,sha256=ZPLxPyRnvZ9gDBdafsNq_r1AI0U30VG6t36vOVLhnMI,10521
14
+ ocrcontext/engines/pdf_text.py,sha256=_wPSwBbTmiz75_w0dvrVWe2pMZ--r79vbux0y73AvNs,3876
15
+ ocrcontext/engines/registry.py,sha256=50JSUU-ic-zdEiAlhCXET8v_2lHQNdOm2bdprBtQhio,2133
16
+ ocrcontext/engines/trocr.py,sha256=hltZch8zdm6mQrDCf2GdWyd9-eK0GLp6EY1sUNA69Jg,6160
17
+ ocrcontext/engines/vision.py,sha256=OKwijPPUMxIJLOP-tGZwUFAyosLw8K_EOZ1HjviW4bA,16986
18
+ ocrcontext/llm/__init__.py,sha256=I0qXc1lPzztunJ9L4PJwDb35ILmZXZFoEe7jqC68sHs,344
19
+ ocrcontext/llm/drift.py,sha256=ZP3SXxFRehoTuhHkyWXx25-z5QM7zjRb9gu2hTf_wQo,2183
20
+ ocrcontext/llm/extractor.py,sha256=qakUToNKvUKmzUieJ4B24WUZNETeFgLMeJKALT9oYx8,2154
21
+ ocrcontext/llm/formatting.py,sha256=ffkZ8FXdu0SRsSYHV6PNCd7Ey4bKgr_lGtSLdv9qLNY,1339
22
+ ocrcontext/llm/literal_preserve.py,sha256=VZw6ebHQqoTcz_56Dopd9zCh6wD2JUl2Ck0zte3nz10,5623
23
+ ocrcontext/llm/prompts.py,sha256=XpIfiGtP3VtkV2kkdWQbfm3r7wUfgvEgpAUVDI4BXa8,7586
24
+ ocrcontext/llm/refiner.py,sha256=gpqu5nHSenBfDx9Ft7b_4TYnnLkO9WxvDpjg_2MnDi0,4323
25
+ ocrcontext/llm/schemas.py,sha256=Y4RrLmwVzw5wkSGVZhUaRIzsCdAfjob4K9jOAtx9rr8,4248
26
+ ocrcontext/preprocessing/__init__.py,sha256=L_OHsKRcbGSOnILTPvJ2D5dgfmsedHiCEpUeIaOZzm0,226
27
+ ocrcontext/preprocessing/image.py,sha256=L5KF1t_-kaNfL5ycjEYsXfrC7YDaPs9_bTgrdrRkWVw,5540
28
+ ocrcontext/utils/__init__.py,sha256=2frdDgbVpf6ODK2JIhLus1Md8-6WhkIzB6V_KHrLcj4,60
29
+ ocrcontext/utils/files.py,sha256=Wl2GkQf9TCuUthKs2ovO20geliL73cLbcaoWbw7xZtw,5474
30
+ ocrcontext/utils/lang.py,sha256=v52hwYrHCJB6tfvrbgYKhSODSQ6amL1ACLnQOusfGSA,2245
31
+ ocrcontext-0.1.0.dist-info/METADATA,sha256=g8wrTzxhl7bUUuUDgrGH-DrjrKURfvXwnOIzqc2aoCo,7257
32
+ ocrcontext-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
33
+ ocrcontext-0.1.0.dist-info/licenses/LICENSE,sha256=coVOBGbnFj0umrt9J48B_5gRJY3n67WyP-6SESmhyP8,1073
34
+ ocrcontext-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.30.1
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Bahadır Karslı
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.