ocrcontext 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. ocrcontext-0.1.0/.gitignore +27 -0
  2. ocrcontext-0.1.0/CHANGELOG.md +43 -0
  3. ocrcontext-0.1.0/LICENSE +21 -0
  4. ocrcontext-0.1.0/PKG-INFO +207 -0
  5. ocrcontext-0.1.0/README.md +144 -0
  6. ocrcontext-0.1.0/examples/01_quickstart.py +12 -0
  7. ocrcontext-0.1.0/examples/02_refine_openai.py +19 -0
  8. ocrcontext-0.1.0/examples/03_structured_invoice.py +32 -0
  9. ocrcontext-0.1.0/examples/04_local_ollama.py +15 -0
  10. ocrcontext-0.1.0/examples/image_smoke_test.py +107 -0
  11. ocrcontext-0.1.0/examples/pdf_smoke_test.py +170 -0
  12. ocrcontext-0.1.0/examples/structured_smoke_test.py +140 -0
  13. ocrcontext-0.1.0/pyproject.toml +108 -0
  14. ocrcontext-0.1.0/src/ocrcontext/__init__.py +49 -0
  15. ocrcontext-0.1.0/src/ocrcontext/analyzer.py +198 -0
  16. ocrcontext-0.1.0/src/ocrcontext/config.py +49 -0
  17. ocrcontext-0.1.0/src/ocrcontext/engines/__init__.py +6 -0
  18. ocrcontext-0.1.0/src/ocrcontext/engines/base.py +45 -0
  19. ocrcontext-0.1.0/src/ocrcontext/engines/handwriting.py +103 -0
  20. ocrcontext-0.1.0/src/ocrcontext/engines/paddle.py +264 -0
  21. ocrcontext-0.1.0/src/ocrcontext/engines/pdf_text.py +126 -0
  22. ocrcontext-0.1.0/src/ocrcontext/engines/registry.py +67 -0
  23. ocrcontext-0.1.0/src/ocrcontext/engines/trocr.py +191 -0
  24. ocrcontext-0.1.0/src/ocrcontext/engines/vision.py +538 -0
  25. ocrcontext-0.1.0/src/ocrcontext/exceptions.py +45 -0
  26. ocrcontext-0.1.0/src/ocrcontext/llm/__init__.py +10 -0
  27. ocrcontext-0.1.0/src/ocrcontext/llm/drift.py +58 -0
  28. ocrcontext-0.1.0/src/ocrcontext/llm/extractor.py +63 -0
  29. ocrcontext-0.1.0/src/ocrcontext/llm/formatting.py +39 -0
  30. ocrcontext-0.1.0/src/ocrcontext/llm/literal_preserve.py +164 -0
  31. ocrcontext-0.1.0/src/ocrcontext/llm/prompts.py +157 -0
  32. ocrcontext-0.1.0/src/ocrcontext/llm/refiner.py +114 -0
  33. ocrcontext-0.1.0/src/ocrcontext/llm/schemas.py +99 -0
  34. ocrcontext-0.1.0/src/ocrcontext/pipeline.py +162 -0
  35. ocrcontext-0.1.0/src/ocrcontext/preprocessing/__init__.py +5 -0
  36. ocrcontext-0.1.0/src/ocrcontext/preprocessing/image.py +177 -0
  37. ocrcontext-0.1.0/src/ocrcontext/py.typed +0 -0
  38. ocrcontext-0.1.0/src/ocrcontext/quality.py +76 -0
  39. ocrcontext-0.1.0/src/ocrcontext/schemas.py +8 -0
  40. ocrcontext-0.1.0/src/ocrcontext/types.py +55 -0
  41. ocrcontext-0.1.0/src/ocrcontext/utils/__init__.py +1 -0
  42. ocrcontext-0.1.0/src/ocrcontext/utils/files.py +172 -0
  43. ocrcontext-0.1.0/src/ocrcontext/utils/lang.py +77 -0
  44. ocrcontext-0.1.0/tests/__init__.py +0 -0
  45. ocrcontext-0.1.0/tests/conftest.py +74 -0
  46. ocrcontext-0.1.0/tests/test_literal_preserve.py +56 -0
  47. ocrcontext-0.1.0/tests/test_llm.py +68 -0
  48. ocrcontext-0.1.0/tests/test_pipeline_analyzer.py +96 -0
  49. ocrcontext-0.1.0/tests/test_text_helpers.py +93 -0
@@ -0,0 +1,27 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.egg-info/
5
+ .eggs/
6
+ build/
7
+ dist/
8
+ .pytest_cache/
9
+ .ruff_cache/
10
+ .coverage
11
+ htmlcov/
12
+
13
+ # Virtual envs
14
+ .venv/
15
+ venv/
16
+ env/
17
+
18
+ # Models / caches
19
+ .cache/
20
+ *.onnx
21
+ *.pdmodel
22
+ *.pdiparams
23
+
24
+ # OS / editor
25
+ .DS_Store
26
+ .idea/
27
+ .vscode/
@@ -0,0 +1,43 @@
1
+ # Changelog
2
+
3
+ All notable changes to **ocrcontext** are documented here.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [Unreleased]
9
+
10
+ ## [0.1.0] - 2026-06-25
11
+
12
+ Initial release — the document extraction core, decoupled from its web stack
13
+ into a standalone, LLM-agnostic library.
14
+
15
+ ### Added
16
+ - **`Analyzer` facade** — 3-line developer experience:
17
+ `Analyzer().analyze("file.pdf").text`.
18
+ - **Routing ladder** (`pipeline.py`):
19
+ - Digital PDFs → PyMuPDF text-layer extraction (no OCR); LLM refine is
20
+ auto-skipped so exact text/identifiers are never altered.
21
+ - Images / scanned PDFs → PaddleOCR with image preprocessing, multi-language
22
+ *coverage-first* candidate selection, and a line-band recovery fallback.
23
+ - Handwriting (explicit or auto on insufficient text) → Google Vision primary,
24
+ Microsoft TrOCR fallback.
25
+ - Multi-page documents joined with `--- Page N ---` separators.
26
+ - **Provider-agnostic LLM layer** — works with any LangChain `BaseChatModel`
27
+ (`langchain-openai`, `langchain-anthropic`, `langchain-ollama`, ...). Only
28
+ `langchain-core` is required at the core.
29
+ - `Refiner` — fidelity-first OCR refinement (4 modes) with literal/contact
30
+ preservation (`{{OCRLITn}}` masking) and drift/hallucination rejection.
31
+ - `StructuredExtractor` + `Analyzer.extract()` / `Analyzer.extract_text()` —
32
+ structured extraction into any Pydantic schema via `with_structured_output`.
33
+ - Built-in `Invoice` / `LineItem` schemas and prompt.
34
+ - **Resource efficiency** — `EngineRegistry` singleton caches PaddleOCR/TrOCR
35
+ engines (and per-language models) so they load at most once per process.
36
+ - **Windows robustness** — model cache and temp files are routed through ASCII
37
+ 8.3 short paths to survive non-ASCII usernames; oneDNN is disabled on CPU to
38
+ avoid the PaddlePaddle 3.x PIR/oneDNN `NotImplementedError`.
39
+ - **Packaging** — optional extras `[paddle]`, `[trocr]`, `[vision]`, `[all]`;
40
+ PEP 561 typed (`py.typed`); examples and a GPU/network-free test suite.
41
+
42
+ [Unreleased]: https://github.com/bahadirkarsli/ocrcontext/compare/v0.1.0...HEAD
43
+ [0.1.0]: https://github.com/bahadirkarsli/ocrcontext/releases/tag/v0.1.0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Bahadır Karslı
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,207 @@
1
+ Metadata-Version: 2.4
2
+ Name: ocrcontext
3
+ Version: 0.1.0
4
+ Summary: Decoupled, LLM-agnostic document OCR + structured extraction. Vision and LLM parsing in 3 lines of code.
5
+ Project-URL: Homepage, https://github.com/bahadirkarsli/ocrcontext
6
+ Project-URL: Repository, https://github.com/bahadirkarsli/ocrcontext
7
+ Project-URL: Issues, https://github.com/bahadirkarsli/ocrcontext/issues
8
+ Project-URL: Changelog, https://github.com/bahadirkarsli/ocrcontext/blob/main/CHANGELOG.md
9
+ Author-email: Bahadır Karslı <bahadrkrsl@outlook.com>
10
+ Maintainer-email: Bahadır Karslı <bahadrkrsl@outlook.com>
11
+ License: MIT
12
+ License-File: LICENSE
13
+ Keywords: document-ai,langchain,ocr,paddleocr,pdf,structured-extraction
14
+ Classifier: Development Status :: 4 - Beta
15
+ Classifier: Intended Audience :: Developers
16
+ Classifier: License :: OSI Approved :: MIT License
17
+ Classifier: Operating System :: OS Independent
18
+ Classifier: Programming Language :: Python :: 3
19
+ Classifier: Programming Language :: Python :: 3.10
20
+ Classifier: Programming Language :: Python :: 3.11
21
+ Classifier: Programming Language :: Python :: 3.12
22
+ Classifier: Programming Language :: Python :: 3.13
23
+ Classifier: Topic :: Scientific/Engineering :: Image Recognition
24
+ Classifier: Topic :: Text Processing :: Linguistic
25
+ Classifier: Typing :: Typed
26
+ Requires-Python: >=3.10
27
+ Requires-Dist: langchain-core>=0.3
28
+ Requires-Dist: numpy>=1.24
29
+ Requires-Dist: pillow>=9.0
30
+ Requires-Dist: pydantic>=2.5
31
+ Requires-Dist: pymupdf>=1.23
32
+ Provides-Extra: all
33
+ Requires-Dist: accelerate>=0.27; extra == 'all'
34
+ Requires-Dist: google-cloud-vision>=3.8.1; extra == 'all'
35
+ Requires-Dist: opencv-python-headless>=4.8; extra == 'all'
36
+ Requires-Dist: paddleocr>=2.7.0.3; extra == 'all'
37
+ Requires-Dist: paddlepaddle>=2.6; extra == 'all'
38
+ Requires-Dist: sentencepiece>=0.1.99; extra == 'all'
39
+ Requires-Dist: torch>=2.1; extra == 'all'
40
+ Requires-Dist: torchvision>=0.16; extra == 'all'
41
+ Requires-Dist: transformers>=4.40; extra == 'all'
42
+ Provides-Extra: dev
43
+ Requires-Dist: build>=1.2; extra == 'dev'
44
+ Requires-Dist: pytest-cov>=4.0; extra == 'dev'
45
+ Requires-Dist: pytest>=8.0; extra == 'dev'
46
+ Requires-Dist: ruff>=0.5; extra == 'dev'
47
+ Requires-Dist: twine>=5.0; extra == 'dev'
48
+ Provides-Extra: paddle
49
+ Requires-Dist: opencv-python-headless>=4.8; extra == 'paddle'
50
+ Requires-Dist: paddleocr>=2.7.0.3; extra == 'paddle'
51
+ Requires-Dist: paddlepaddle>=2.6; extra == 'paddle'
52
+ Provides-Extra: trocr
53
+ Requires-Dist: accelerate>=0.27; extra == 'trocr'
54
+ Requires-Dist: opencv-python-headless>=4.8; extra == 'trocr'
55
+ Requires-Dist: sentencepiece>=0.1.99; extra == 'trocr'
56
+ Requires-Dist: torch>=2.1; extra == 'trocr'
57
+ Requires-Dist: torchvision>=0.16; extra == 'trocr'
58
+ Requires-Dist: transformers>=4.40; extra == 'trocr'
59
+ Provides-Extra: vision
60
+ Requires-Dist: google-cloud-vision>=3.8.1; extra == 'vision'
61
+ Requires-Dist: opencv-python-headless>=4.8; extra == 'vision'
62
+ Description-Content-Type: text/markdown
63
+
64
+ # ocrcontext
65
+
66
+ **Decoupled, LLM-agnostic document OCR + structured extraction.** Turn a PDF or
67
+ image into clean text — or a typed Pydantic model — in three lines.
68
+
69
+ `ocrcontext` is the extraction core of a document-analysis platform, lifted out
70
+ of its web stack into a pure, pip-installable library. No FastAPI, no servers,
71
+ no hardcoded model providers.
72
+
73
+ ```python
74
+ from ocrcontext import Analyzer
75
+
76
+ result = Analyzer().analyze("invoice.pdf")
77
+ print(result.text)
78
+ ```
79
+
80
+ ## Why
81
+
82
+ - **3-line DX** — instantiate, pass a file, get a result.
83
+ - **LLM-agnostic** — inject any LangChain chat model (OpenAI, Anthropic, Ollama,
84
+ local). Only `langchain-core` is required; you bring the provider.
85
+ - **Resource-efficient** — heavy OCR models (PaddleOCR, TrOCR) load lazily and
86
+ are cached as process-wide singletons, so they never reload per call.
87
+ - **Lightweight base install** — engines are opt-in extras.
88
+
89
+ ## Install
90
+
91
+ ```bash
92
+ pip install ocrcontext # core only (PDF text layer + the API surface)
93
+ pip install 'ocrcontext[paddle]' # printed text + scanned PDFs (PaddleOCR)
94
+ pip install 'ocrcontext[trocr]' # handwriting fallback (Microsoft TrOCR)
95
+ pip install 'ocrcontext[vision]' # handwriting primary (Google Cloud Vision)
96
+ pip install 'ocrcontext[all]' # everything
97
+ ```
98
+
99
+ Pick an LLM provider for refinement / extraction:
100
+
101
+ ```bash
102
+ pip install langchain-openai # or langchain-anthropic, langchain-ollama, ...
103
+ ```
104
+
105
+ ## Usage
106
+
107
+ ### Raw OCR (no LLM, no API key)
108
+
109
+ ```python
110
+ from ocrcontext import Analyzer
111
+
112
+ result = Analyzer().analyze("scan.png")
113
+ print(result.text, result.confidence, result.pages, result.text_source)
114
+ ```
115
+
116
+ ### LLM-refined OCR
117
+
118
+ Refinement fixes OCR errors **without** paraphrasing, translating, or inventing
119
+ text. Emails/URLs/IBANs are frozen so the model can't "correct" them, and output
120
+ that drifts too far from the source is rejected in favour of the raw text.
121
+
122
+ ```python
123
+ from langchain_openai import ChatOpenAI
124
+ from ocrcontext import Analyzer
125
+
126
+ analyzer = Analyzer(llm=ChatOpenAI(model="gpt-4o"), lang="tr")
127
+ result = analyzer.analyze("handwritten_note.jpg", handwriting=True)
128
+ print(result.text) # refined
129
+ print(result.raw_text) # original OCR, kept alongside
130
+ ```
131
+
132
+ ### Structured extraction
133
+
134
+ ```python
135
+ from langchain_openai import ChatOpenAI
136
+ from ocrcontext import Analyzer
137
+ from ocrcontext.schemas import Invoice
138
+
139
+ analyzer = Analyzer(llm=ChatOpenAI(model="gpt-4o-mini", temperature=0))
140
+ invoice = analyzer.extract("invoice.pdf", schema=Invoice) # -> Invoice instance
141
+ print(invoice.total_amount, invoice.currency)
142
+ ```
143
+
144
+ Define your own schema with plain Pydantic:
145
+
146
+ ```python
147
+ from pydantic import BaseModel, Field
148
+
149
+ class Receipt(BaseModel):
150
+ merchant: str | None = Field(None, description="Store name")
151
+ total: float | None = Field(None, description="Grand total")
152
+
153
+ receipt = analyzer.extract("receipt.jpg", schema=Receipt)
154
+ ```
155
+
156
+ ### Same code, local model (no API key)
157
+
158
+ ```python
159
+ from langchain_ollama import ChatOllama
160
+ from ocrcontext import Analyzer
161
+
162
+ analyzer = Analyzer(llm=ChatOllama(model="llama3.1"))
163
+ print(analyzer.analyze("scan.png").text)
164
+ ```
165
+
166
+ ## How it routes a document
167
+
168
+ 1. **Digital PDF** → embedded text-layer extraction (exact text; LLM refine is
169
+ skipped so identifiers aren't altered).
170
+ 2. **Image / scanned PDF** → PaddleOCR with preprocessing (deskew, denoise,
171
+ CLAHE), multi-language *coverage-first* selection, and a line-band recovery
172
+ fallback.
173
+ 3. **Handwriting** (`handwriting=True`, or auto when printed OCR yields too
174
+ little text) → Google Vision primary, TrOCR fallback.
175
+ 4. **Optional LLM refine** → fidelity-first, literal-preserved, drift-guarded.
176
+ 5. **Optional `extract(schema=...)`** → typed Pydantic model.
177
+
178
+ ## Refinement modes
179
+
180
+ `RefinementMode`: `conservative` (scans), `layout` (digital PDFs),
181
+ `handwriting_prose`, `handwriting_layout`. The handwriting mode is auto-selected
182
+ based on whether the text looks like a DIKW/pyramid diagram. Modes and prompts
183
+ are ported verbatim from the production pipeline.
184
+
185
+ ## Configuration
186
+
187
+ ```python
188
+ from ocrcontext import Analyzer, AnalyzerConfig
189
+
190
+ cfg = AnalyzerConfig(
191
+ lang="tr",
192
+ prefer_pdf_text_layer=True,
193
+ auto_handwriting_fallback=True,
194
+ )
195
+ analyzer = Analyzer(llm=..., config=cfg)
196
+ ```
197
+
198
+ ## Development
199
+
200
+ ```bash
201
+ pip install -e '.[dev]'
202
+ pytest # runs without GPU/network — engines and LLM are faked
203
+ ```
204
+
205
+ ## License
206
+
207
+ MIT
@@ -0,0 +1,144 @@
1
+ # ocrcontext
2
+
3
+ **Decoupled, LLM-agnostic document OCR + structured extraction.** Turn a PDF or
4
+ image into clean text — or a typed Pydantic model — in three lines.
5
+
6
+ `ocrcontext` is the extraction core of a document-analysis platform, lifted out
7
+ of its web stack into a pure, pip-installable library. No FastAPI, no servers,
8
+ no hardcoded model providers.
9
+
10
+ ```python
11
+ from ocrcontext import Analyzer
12
+
13
+ result = Analyzer().analyze("invoice.pdf")
14
+ print(result.text)
15
+ ```
16
+
17
+ ## Why
18
+
19
+ - **3-line DX** — instantiate, pass a file, get a result.
20
+ - **LLM-agnostic** — inject any LangChain chat model (OpenAI, Anthropic, Ollama,
21
+ local). Only `langchain-core` is required; you bring the provider.
22
+ - **Resource-efficient** — heavy OCR models (PaddleOCR, TrOCR) load lazily and
23
+ are cached as process-wide singletons, so they never reload per call.
24
+ - **Lightweight base install** — engines are opt-in extras.
25
+
26
+ ## Install
27
+
28
+ ```bash
29
+ pip install ocrcontext # core only (PDF text layer + the API surface)
30
+ pip install 'ocrcontext[paddle]' # printed text + scanned PDFs (PaddleOCR)
31
+ pip install 'ocrcontext[trocr]' # handwriting fallback (Microsoft TrOCR)
32
+ pip install 'ocrcontext[vision]' # handwriting primary (Google Cloud Vision)
33
+ pip install 'ocrcontext[all]' # everything
34
+ ```
35
+
36
+ Pick an LLM provider for refinement / extraction:
37
+
38
+ ```bash
39
+ pip install langchain-openai # or langchain-anthropic, langchain-ollama, ...
40
+ ```
41
+
42
+ ## Usage
43
+
44
+ ### Raw OCR (no LLM, no API key)
45
+
46
+ ```python
47
+ from ocrcontext import Analyzer
48
+
49
+ result = Analyzer().analyze("scan.png")
50
+ print(result.text, result.confidence, result.pages, result.text_source)
51
+ ```
52
+
53
+ ### LLM-refined OCR
54
+
55
+ Refinement fixes OCR errors **without** paraphrasing, translating, or inventing
56
+ text. Emails/URLs/IBANs are frozen so the model can't "correct" them, and output
57
+ that drifts too far from the source is rejected in favour of the raw text.
58
+
59
+ ```python
60
+ from langchain_openai import ChatOpenAI
61
+ from ocrcontext import Analyzer
62
+
63
+ analyzer = Analyzer(llm=ChatOpenAI(model="gpt-4o"), lang="tr")
64
+ result = analyzer.analyze("handwritten_note.jpg", handwriting=True)
65
+ print(result.text) # refined
66
+ print(result.raw_text) # original OCR, kept alongside
67
+ ```
68
+
69
+ ### Structured extraction
70
+
71
+ ```python
72
+ from langchain_openai import ChatOpenAI
73
+ from ocrcontext import Analyzer
74
+ from ocrcontext.schemas import Invoice
75
+
76
+ analyzer = Analyzer(llm=ChatOpenAI(model="gpt-4o-mini", temperature=0))
77
+ invoice = analyzer.extract("invoice.pdf", schema=Invoice) # -> Invoice instance
78
+ print(invoice.total_amount, invoice.currency)
79
+ ```
80
+
81
+ Define your own schema with plain Pydantic:
82
+
83
+ ```python
84
+ from pydantic import BaseModel, Field
85
+
86
+ class Receipt(BaseModel):
87
+ merchant: str | None = Field(None, description="Store name")
88
+ total: float | None = Field(None, description="Grand total")
89
+
90
+ receipt = analyzer.extract("receipt.jpg", schema=Receipt)
91
+ ```
92
+
93
+ ### Same code, local model (no API key)
94
+
95
+ ```python
96
+ from langchain_ollama import ChatOllama
97
+ from ocrcontext import Analyzer
98
+
99
+ analyzer = Analyzer(llm=ChatOllama(model="llama3.1"))
100
+ print(analyzer.analyze("scan.png").text)
101
+ ```
102
+
103
+ ## How it routes a document
104
+
105
+ 1. **Digital PDF** → embedded text-layer extraction (exact text; LLM refine is
106
+ skipped so identifiers aren't altered).
107
+ 2. **Image / scanned PDF** → PaddleOCR with preprocessing (deskew, denoise,
108
+ CLAHE), multi-language *coverage-first* selection, and a line-band recovery
109
+ fallback.
110
+ 3. **Handwriting** (`handwriting=True`, or auto when printed OCR yields too
111
+ little text) → Google Vision primary, TrOCR fallback.
112
+ 4. **Optional LLM refine** → fidelity-first, literal-preserved, drift-guarded.
113
+ 5. **Optional `extract(schema=...)`** → typed Pydantic model.
114
+
115
+ ## Refinement modes
116
+
117
+ `RefinementMode`: `conservative` (scans), `layout` (digital PDFs),
118
+ `handwriting_prose`, `handwriting_layout`. The handwriting mode is auto-selected
119
+ based on whether the text looks like a DIKW/pyramid diagram. Modes and prompts
120
+ are ported verbatim from the production pipeline.
121
+
122
+ ## Configuration
123
+
124
+ ```python
125
+ from ocrcontext import Analyzer, AnalyzerConfig
126
+
127
+ cfg = AnalyzerConfig(
128
+ lang="tr",
129
+ prefer_pdf_text_layer=True,
130
+ auto_handwriting_fallback=True,
131
+ )
132
+ analyzer = Analyzer(llm=..., config=cfg)
133
+ ```
134
+
135
+ ## Development
136
+
137
+ ```bash
138
+ pip install -e '.[dev]'
139
+ pytest # runs without GPU/network — engines and LLM are faked
140
+ ```
141
+
142
+ ## License
143
+
144
+ MIT
@@ -0,0 +1,12 @@
1
+ """Raw OCR in 3 lines — no LLM, no API key required.
2
+
3
+ pip install 'ocrcontext[paddle]'
4
+ """
5
+
6
+ from ocrcontext import Analyzer
7
+
8
# Build an analyzer with default settings (no LLM passed) and run it on the sample file.
analyzer = Analyzer()
result = analyzer.analyze("invoice.pdf")
print(result.text)

# `result` is a Pydantic model; besides the text it carries extraction metadata.
metadata = ("source:", result.text_source, "| pages:", result.pages, "| conf:", result.confidence)
print(*metadata)
@@ -0,0 +1,19 @@
1
+ """LLM-refined OCR with OpenAI.
2
+
3
+ pip install 'ocrcontext[paddle]' langchain-openai
4
+ export OPENAI_API_KEY=sk-...
5
+ """
6
+
7
+ from langchain_openai import ChatOpenAI
8
+
9
+ from ocrcontext import Analyzer
10
+
11
# Any LangChain chat model can be injected; here it is OpenAI's gpt-4o.
chat_model = ChatOpenAI(model="gpt-4o")
analyzer = Analyzer(llm=chat_model, lang="tr")

# Leaving `refine` at its default (None) auto-refines OCR output, but an exact
# PDF text layer is never refined.
result = analyzer.analyze("handwritten_note.jpg", handwriting=True)

print("Refined:", result.refined)
print(result.text)

# Show the pre-refinement OCR text when the result carries one.
raw_ocr = result.raw_text
if raw_ocr:
    print("\n--- raw OCR (before refine) ---\n", raw_ocr)
@@ -0,0 +1,32 @@
1
+ """Structured extraction into a Pydantic model.
2
+
3
+ pip install 'ocrcontext[paddle]' langchain-openai
4
+ export OPENAI_API_KEY=sk-...
5
+ """
6
+
7
+ from langchain_openai import ChatOpenAI
8
+
9
+ from ocrcontext import Analyzer
10
+ from ocrcontext.schemas import Invoice
11
+
12
# temperature=0 is passed to the chat model used for extraction.
chat_model = ChatOpenAI(model="gpt-4o-mini", temperature=0)
analyzer = Analyzer(llm=chat_model, lang="tr")

# Extract straight into the built-in Invoice schema.
invoice = analyzer.extract("invoice.pdf", schema=Invoice)

print(invoice.supplier_name, invoice.total_amount, invoice.currency)
for line_item in invoice.line_items:
    summary = (
        f" - {line_item.description}: {line_item.quantity} x {line_item.unit_price}"
        f" = {line_item.total}"
    )
    print(summary)
19
+
20
+
21
+ # --- Or define your own schema -------------------------------------------------
22
+ from pydantic import BaseModel, Field # noqa: E402
23
+
24
+
25
class Receipt(BaseModel):
    # Example of a user-defined extraction schema. Every field is optional and
    # defaults to None; the descriptions are attached to the fields via Field().
    merchant: str | None = Field(default=None, description="Store / merchant name")
    date: str | None = Field(default=None, description="Purchase date, YYYY-MM-DD")
    total: float | None = Field(default=None, description="Grand total")
29
+
30
+
31
# Same extract() call as for Invoice, now with the custom Receipt schema.
receipt = analyzer.extract(
    "receipt.jpg",
    schema=Receipt,
)
print(receipt)
@@ -0,0 +1,15 @@
1
+ """LLM-agnostic: the exact same code with a local Ollama model — no API key.
2
+
3
+ pip install 'ocrcontext[paddle]' langchain-ollama
4
+ ollama pull llama3.1
5
+ """
6
+
7
+ from langchain_ollama import ChatOllama
8
+
9
+ from ocrcontext import Analyzer
10
+
11
# The only change from the OpenAI example is the injected chat model:
# ChatOpenAI -> ChatOllama. The rest of the code is identical.
local_model = ChatOllama(model="llama3.1")
analyzer = Analyzer(llm=local_model)

result = analyzer.analyze("scan.png")
print(result.text)
@@ -0,0 +1,107 @@
1
+ """End-to-end smoke test: raw OCR on a single image — no LLM, no schemas.
2
+
3
+ This exercises the *pure* PaddleOCR path:
4
+ load image -> preprocess -> candidate-language OCR -> coverage-first text.
5
+
6
+ Usage
7
+ -----
8
+ python examples/image_smoke_test.py # auto-find a sample image
9
+ python examples/image_smoke_test.py path/to/img.png # explicit path
10
+
11
+ Setup
12
+ -----
13
+ pip install -e '.[paddle]' # installs PaddleOCR + OpenCV
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import sys
19
+ import time
20
+ from pathlib import Path
21
+
22
+ from ocrcontext import Analyzer, AnalyzerConfig
23
+ from ocrcontext.exceptions import MissingDependencyError, UnsupportedFileError
24
+
25
# File stems and extensions probed (in this order) when no image path is given.
_SAMPLE_NAMES = [
    "sample",
    "test",
    "image",
    "ocr",
    "smoke",
]
_IMAGE_EXTS = [
    ".png",
    ".jpg",
    ".jpeg",
    ".bmp",
    ".tif",
    ".tiff",
    ".webp",
]
28
+
29
+
30
def _find_sample_image() -> Path | None:
    """Locate a sample image near this script.

    Searches this script's directory first, then its parent directory.
    Well-known sample names are tried before falling back to the
    alphabetically first file with an image extension.

    Returns:
        The first matching path, or ``None`` when nothing is found.
    """
    script_dir = Path(__file__).resolve().parent
    roots = (script_dir, script_dir.parent)

    # Pass 1: conventional sample names, probed lazily in
    # directory -> stem -> extension order.
    named_candidates = (
        root / f"{stem}{ext}"
        for root in roots
        for stem in _SAMPLE_NAMES
        for ext in _IMAGE_EXTS
    )
    named_hit = next((path for path in named_candidates if path.exists()), None)
    if named_hit is not None:
        return named_hit

    # Pass 2: any image at all; within one directory/extension pair take the
    # path that sorts first.
    for root in roots:
        for ext in _IMAGE_EXTS:
            first_match = min(root.glob(f"*{ext}"), default=None)
            if first_match is not None:
                return first_match
    return None
48
+
49
+
50
def main() -> int:
    """Run raw OCR on a single image and print the text with timing stats.

    The image path is taken from the first command-line argument; when none
    is given, a sample image is looked up automatically.

    Returns:
        Exit code: 0 when non-blank text was extracted; 1 when a dependency
        is missing or no text came back; 2 when no image could be located,
        the path does not exist, or the file type is unsupported.
    """
    cli_args = sys.argv[1:]
    if cli_args:
        image_path = Path(cli_args[0]).expanduser()
    else:
        image_path = _find_sample_image()
        if image_path is None:
            print(
                "No image given and none found automatically.\n"
                "Drop an image (e.g. sample.png) into the examples/ folder, or run:\n"
                " python examples/image_smoke_test.py path/to/your/image.png"
            )
            return 2
        print(f"[i] No path given — using discovered image: {image_path.name}")

    if not image_path.exists():
        print(f"[x] File not found: {image_path}")
        return 2

    print(f"[i] OCR target : {image_path}")
    print("[i] Engine : PaddleOCR (raw OCR, no LLM)\n")

    # Keep this a pure PaddleOCR run: with the handwriting fallback off, a
    # sparse image will not try to load the Vision/TrOCR extras.
    config = AnalyzerConfig(lang="en", auto_handwriting_fallback=False)
    analyzer = Analyzer(config=config)

    started = time.perf_counter()
    try:
        result = analyzer.analyze(image_path)
    except (MissingDependencyError, UnsupportedFileError) as exc:
        print(f"[x] {exc}")
        # Missing dependency -> 1, unsupported file -> 2.
        return 1 if isinstance(exc, MissingDependencyError) else 2
    else:
        elapsed = time.perf_counter() - started

    rule = "=" * 60
    print(rule)
    print("EXTRACTED TEXT")
    print(rule)
    print(result.text or "(no text detected)")
    print(rule)

    summary = (
        f"source={result.text_source} pages={result.pages} "
        f"confidence={result.confidence} chars={len(result.text)} "
        f"time={elapsed:.2f}s"
    )
    print(summary)

    # Time a second call on the same analyzer; it is expected to reuse the
    # already-loaded model and therefore be fast.
    started = time.perf_counter()
    analyzer.analyze(image_path)
    warm_elapsed = time.perf_counter() - started
    print(f"[i] 2nd run (warm model): {warm_elapsed:.2f}s")

    if result.text.strip():
        return 0
    return 1
104
+
105
+
106
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())