ocrcontext 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ocrcontext-0.1.0/.gitignore +27 -0
- ocrcontext-0.1.0/CHANGELOG.md +43 -0
- ocrcontext-0.1.0/LICENSE +21 -0
- ocrcontext-0.1.0/PKG-INFO +207 -0
- ocrcontext-0.1.0/README.md +144 -0
- ocrcontext-0.1.0/examples/01_quickstart.py +12 -0
- ocrcontext-0.1.0/examples/02_refine_openai.py +19 -0
- ocrcontext-0.1.0/examples/03_structured_invoice.py +32 -0
- ocrcontext-0.1.0/examples/04_local_ollama.py +15 -0
- ocrcontext-0.1.0/examples/image_smoke_test.py +107 -0
- ocrcontext-0.1.0/examples/pdf_smoke_test.py +170 -0
- ocrcontext-0.1.0/examples/structured_smoke_test.py +140 -0
- ocrcontext-0.1.0/pyproject.toml +108 -0
- ocrcontext-0.1.0/src/ocrcontext/__init__.py +49 -0
- ocrcontext-0.1.0/src/ocrcontext/analyzer.py +198 -0
- ocrcontext-0.1.0/src/ocrcontext/config.py +49 -0
- ocrcontext-0.1.0/src/ocrcontext/engines/__init__.py +6 -0
- ocrcontext-0.1.0/src/ocrcontext/engines/base.py +45 -0
- ocrcontext-0.1.0/src/ocrcontext/engines/handwriting.py +103 -0
- ocrcontext-0.1.0/src/ocrcontext/engines/paddle.py +264 -0
- ocrcontext-0.1.0/src/ocrcontext/engines/pdf_text.py +126 -0
- ocrcontext-0.1.0/src/ocrcontext/engines/registry.py +67 -0
- ocrcontext-0.1.0/src/ocrcontext/engines/trocr.py +191 -0
- ocrcontext-0.1.0/src/ocrcontext/engines/vision.py +538 -0
- ocrcontext-0.1.0/src/ocrcontext/exceptions.py +45 -0
- ocrcontext-0.1.0/src/ocrcontext/llm/__init__.py +10 -0
- ocrcontext-0.1.0/src/ocrcontext/llm/drift.py +58 -0
- ocrcontext-0.1.0/src/ocrcontext/llm/extractor.py +63 -0
- ocrcontext-0.1.0/src/ocrcontext/llm/formatting.py +39 -0
- ocrcontext-0.1.0/src/ocrcontext/llm/literal_preserve.py +164 -0
- ocrcontext-0.1.0/src/ocrcontext/llm/prompts.py +157 -0
- ocrcontext-0.1.0/src/ocrcontext/llm/refiner.py +114 -0
- ocrcontext-0.1.0/src/ocrcontext/llm/schemas.py +99 -0
- ocrcontext-0.1.0/src/ocrcontext/pipeline.py +162 -0
- ocrcontext-0.1.0/src/ocrcontext/preprocessing/__init__.py +5 -0
- ocrcontext-0.1.0/src/ocrcontext/preprocessing/image.py +177 -0
- ocrcontext-0.1.0/src/ocrcontext/py.typed +0 -0
- ocrcontext-0.1.0/src/ocrcontext/quality.py +76 -0
- ocrcontext-0.1.0/src/ocrcontext/schemas.py +8 -0
- ocrcontext-0.1.0/src/ocrcontext/types.py +55 -0
- ocrcontext-0.1.0/src/ocrcontext/utils/__init__.py +1 -0
- ocrcontext-0.1.0/src/ocrcontext/utils/files.py +172 -0
- ocrcontext-0.1.0/src/ocrcontext/utils/lang.py +77 -0
- ocrcontext-0.1.0/tests/__init__.py +0 -0
- ocrcontext-0.1.0/tests/conftest.py +74 -0
- ocrcontext-0.1.0/tests/test_literal_preserve.py +56 -0
- ocrcontext-0.1.0/tests/test_llm.py +68 -0
- ocrcontext-0.1.0/tests/test_pipeline_analyzer.py +96 -0
- ocrcontext-0.1.0/tests/test_text_helpers.py +93 -0
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*.egg-info/
|
|
5
|
+
.eggs/
|
|
6
|
+
build/
|
|
7
|
+
dist/
|
|
8
|
+
.pytest_cache/
|
|
9
|
+
.ruff_cache/
|
|
10
|
+
.coverage
|
|
11
|
+
htmlcov/
|
|
12
|
+
|
|
13
|
+
# Virtual envs
|
|
14
|
+
.venv/
|
|
15
|
+
venv/
|
|
16
|
+
env/
|
|
17
|
+
|
|
18
|
+
# Models / caches
|
|
19
|
+
.cache/
|
|
20
|
+
*.onnx
|
|
21
|
+
*.pdmodel
|
|
22
|
+
*.pdiparams
|
|
23
|
+
|
|
24
|
+
# OS / editor
|
|
25
|
+
.DS_Store
|
|
26
|
+
.idea/
|
|
27
|
+
.vscode/
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to **ocrcontext** are documented here.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [Unreleased]
|
|
9
|
+
|
|
10
|
+
## [0.1.0] - 2026-06-25
|
|
11
|
+
|
|
12
|
+
Initial release — the document extraction core, decoupled from its web stack
|
|
13
|
+
into a standalone, LLM-agnostic library.
|
|
14
|
+
|
|
15
|
+
### Added
|
|
16
|
+
- **`Analyzer` facade** — 3-line developer experience:
|
|
17
|
+
`Analyzer().analyze("file.pdf").text`.
|
|
18
|
+
- **Routing ladder** (`pipeline.py`):
|
|
19
|
+
- Digital PDFs → PyMuPDF text-layer extraction (no OCR); LLM refine is
|
|
20
|
+
auto-skipped so exact text/identifiers are never altered.
|
|
21
|
+
- Images / scanned PDFs → PaddleOCR with image preprocessing, multi-language
|
|
22
|
+
*coverage-first* candidate selection, and a line-band recovery fallback.
|
|
23
|
+
- Handwriting (explicit or auto on insufficient text) → Google Vision primary,
|
|
24
|
+
Microsoft TrOCR fallback.
|
|
25
|
+
- Multi-page documents joined with `--- Page N ---` separators.
|
|
26
|
+
- **Provider-agnostic LLM layer** — works with any LangChain `BaseChatModel`
|
|
27
|
+
(`langchain-openai`, `langchain-anthropic`, `langchain-ollama`, ...). Only
|
|
28
|
+
`langchain-core` is required at the core.
|
|
29
|
+
- `Refiner` — fidelity-first OCR refinement (4 modes) with literal/contact
|
|
30
|
+
preservation (`{{OCRLITn}}` masking) and drift/hallucination rejection.
|
|
31
|
+
- `StructuredExtractor` + `Analyzer.extract()` / `Analyzer.extract_text()` —
|
|
32
|
+
structured extraction into any Pydantic schema via `with_structured_output`.
|
|
33
|
+
- Built-in `Invoice` / `LineItem` schemas and prompt.
|
|
34
|
+
- **Resource efficiency** — `EngineRegistry` singleton caches PaddleOCR/TrOCR
|
|
35
|
+
engines (and per-language models) so they load at most once per process.
|
|
36
|
+
- **Windows robustness** — model cache and temp files are routed through ASCII
|
|
37
|
+
8.3 short paths to survive non-ASCII usernames; oneDNN is disabled on CPU to
|
|
38
|
+
avoid the PaddlePaddle 3.x PIR/oneDNN `NotImplementedError`.
|
|
39
|
+
- **Packaging** — optional extras `[paddle]`, `[trocr]`, `[vision]`, `[all]`;
|
|
40
|
+
PEP 561 typed (`py.typed`); examples and a GPU/network-free test suite.
|
|
41
|
+
|
|
42
|
+
[Unreleased]: https://github.com/bahadirkarsli/ocrcontext/compare/v0.1.0...HEAD
|
|
43
|
+
[0.1.0]: https://github.com/bahadirkarsli/ocrcontext/releases/tag/v0.1.0
|
ocrcontext-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Bahadır Karslı
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ocrcontext
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Decoupled, LLM-agnostic document OCR + structured extraction. Vision and LLM parsing in 3 lines of code.
|
|
5
|
+
Project-URL: Homepage, https://github.com/bahadirkarsli/ocrcontext
|
|
6
|
+
Project-URL: Repository, https://github.com/bahadirkarsli/ocrcontext
|
|
7
|
+
Project-URL: Issues, https://github.com/bahadirkarsli/ocrcontext/issues
|
|
8
|
+
Project-URL: Changelog, https://github.com/bahadirkarsli/ocrcontext/blob/main/CHANGELOG.md
|
|
9
|
+
Author-email: Bahadır Karslı <bahadrkrsl@outlook.com>
|
|
10
|
+
Maintainer-email: Bahadır Karslı <bahadrkrsl@outlook.com>
|
|
11
|
+
License: MIT
|
|
12
|
+
License-File: LICENSE
|
|
13
|
+
Keywords: document-ai,langchain,ocr,paddleocr,pdf,structured-extraction
|
|
14
|
+
Classifier: Development Status :: 4 - Beta
|
|
15
|
+
Classifier: Intended Audience :: Developers
|
|
16
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
17
|
+
Classifier: Operating System :: OS Independent
|
|
18
|
+
Classifier: Programming Language :: Python :: 3
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
23
|
+
Classifier: Topic :: Scientific/Engineering :: Image Recognition
|
|
24
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
25
|
+
Classifier: Typing :: Typed
|
|
26
|
+
Requires-Python: >=3.10
|
|
27
|
+
Requires-Dist: langchain-core>=0.3
|
|
28
|
+
Requires-Dist: numpy>=1.24
|
|
29
|
+
Requires-Dist: pillow>=9.0
|
|
30
|
+
Requires-Dist: pydantic>=2.5
|
|
31
|
+
Requires-Dist: pymupdf>=1.23
|
|
32
|
+
Provides-Extra: all
|
|
33
|
+
Requires-Dist: accelerate>=0.27; extra == 'all'
|
|
34
|
+
Requires-Dist: google-cloud-vision>=3.8.1; extra == 'all'
|
|
35
|
+
Requires-Dist: opencv-python-headless>=4.8; extra == 'all'
|
|
36
|
+
Requires-Dist: paddleocr>=2.7.0.3; extra == 'all'
|
|
37
|
+
Requires-Dist: paddlepaddle>=2.6; extra == 'all'
|
|
38
|
+
Requires-Dist: sentencepiece>=0.1.99; extra == 'all'
|
|
39
|
+
Requires-Dist: torch>=2.1; extra == 'all'
|
|
40
|
+
Requires-Dist: torchvision>=0.16; extra == 'all'
|
|
41
|
+
Requires-Dist: transformers>=4.40; extra == 'all'
|
|
42
|
+
Provides-Extra: dev
|
|
43
|
+
Requires-Dist: build>=1.2; extra == 'dev'
|
|
44
|
+
Requires-Dist: pytest-cov>=4.0; extra == 'dev'
|
|
45
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
46
|
+
Requires-Dist: ruff>=0.5; extra == 'dev'
|
|
47
|
+
Requires-Dist: twine>=5.0; extra == 'dev'
|
|
48
|
+
Provides-Extra: paddle
|
|
49
|
+
Requires-Dist: opencv-python-headless>=4.8; extra == 'paddle'
|
|
50
|
+
Requires-Dist: paddleocr>=2.7.0.3; extra == 'paddle'
|
|
51
|
+
Requires-Dist: paddlepaddle>=2.6; extra == 'paddle'
|
|
52
|
+
Provides-Extra: trocr
|
|
53
|
+
Requires-Dist: accelerate>=0.27; extra == 'trocr'
|
|
54
|
+
Requires-Dist: opencv-python-headless>=4.8; extra == 'trocr'
|
|
55
|
+
Requires-Dist: sentencepiece>=0.1.99; extra == 'trocr'
|
|
56
|
+
Requires-Dist: torch>=2.1; extra == 'trocr'
|
|
57
|
+
Requires-Dist: torchvision>=0.16; extra == 'trocr'
|
|
58
|
+
Requires-Dist: transformers>=4.40; extra == 'trocr'
|
|
59
|
+
Provides-Extra: vision
|
|
60
|
+
Requires-Dist: google-cloud-vision>=3.8.1; extra == 'vision'
|
|
61
|
+
Requires-Dist: opencv-python-headless>=4.8; extra == 'vision'
|
|
62
|
+
Description-Content-Type: text/markdown
|
|
63
|
+
|
|
64
|
+
# ocrcontext
|
|
65
|
+
|
|
66
|
+
**Decoupled, LLM-agnostic document OCR + structured extraction.** Turn a PDF or
|
|
67
|
+
image into clean text — or a typed Pydantic model — in three lines.
|
|
68
|
+
|
|
69
|
+
`ocrcontext` is the extraction core of a document-analysis platform, lifted out
|
|
70
|
+
of its web stack into a pure, pip-installable library. No FastAPI, no servers,
|
|
71
|
+
no hardcoded model providers.
|
|
72
|
+
|
|
73
|
+
```python
|
|
74
|
+
from ocrcontext import Analyzer
|
|
75
|
+
|
|
76
|
+
result = Analyzer().analyze("invoice.pdf")
|
|
77
|
+
print(result.text)
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
## Why
|
|
81
|
+
|
|
82
|
+
- **3-line DX** — instantiate, pass a file, get a result.
|
|
83
|
+
- **LLM-agnostic** — inject any LangChain chat model (OpenAI, Anthropic, Ollama,
|
|
84
|
+
local). Only `langchain-core` is required; you bring the provider.
|
|
85
|
+
- **Resource-efficient** — heavy OCR models (PaddleOCR, TrOCR) load lazily and
|
|
86
|
+
are cached as process-wide singletons, so they never reload per call.
|
|
87
|
+
- **Lightweight base install** — engines are opt-in extras.
|
|
88
|
+
|
|
89
|
+
## Install
|
|
90
|
+
|
|
91
|
+
```bash
|
|
92
|
+
pip install ocrcontext # core only (PDF text layer + the API surface)
|
|
93
|
+
pip install 'ocrcontext[paddle]' # printed text + scanned PDFs (PaddleOCR)
|
|
94
|
+
pip install 'ocrcontext[trocr]' # handwriting fallback (Microsoft TrOCR)
|
|
95
|
+
pip install 'ocrcontext[vision]' # handwriting primary (Google Cloud Vision)
|
|
96
|
+
pip install 'ocrcontext[all]' # everything
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
Pick an LLM provider for refinement / extraction:
|
|
100
|
+
|
|
101
|
+
```bash
|
|
102
|
+
pip install langchain-openai # or langchain-anthropic, langchain-ollama, ...
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
## Usage
|
|
106
|
+
|
|
107
|
+
### Raw OCR (no LLM, no API key)
|
|
108
|
+
|
|
109
|
+
```python
|
|
110
|
+
from ocrcontext import Analyzer
|
|
111
|
+
|
|
112
|
+
result = Analyzer().analyze("scan.png")
|
|
113
|
+
print(result.text, result.confidence, result.pages, result.text_source)
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
### LLM-refined OCR
|
|
117
|
+
|
|
118
|
+
Refinement fixes OCR errors **without** paraphrasing, translating, or inventing
|
|
119
|
+
text. Emails/URLs/IBANs are frozen so the model can't "correct" them, and output
|
|
120
|
+
that drifts too far from the source is rejected in favour of the raw text.
|
|
121
|
+
|
|
122
|
+
```python
|
|
123
|
+
from langchain_openai import ChatOpenAI
|
|
124
|
+
from ocrcontext import Analyzer
|
|
125
|
+
|
|
126
|
+
analyzer = Analyzer(llm=ChatOpenAI(model="gpt-4o"), lang="tr")
|
|
127
|
+
result = analyzer.analyze("handwritten_note.jpg", handwriting=True)
|
|
128
|
+
print(result.text) # refined
|
|
129
|
+
print(result.raw_text) # original OCR, kept alongside
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
### Structured extraction
|
|
133
|
+
|
|
134
|
+
```python
|
|
135
|
+
from langchain_openai import ChatOpenAI
|
|
136
|
+
from ocrcontext import Analyzer
|
|
137
|
+
from ocrcontext.schemas import Invoice
|
|
138
|
+
|
|
139
|
+
analyzer = Analyzer(llm=ChatOpenAI(model="gpt-4o-mini", temperature=0))
|
|
140
|
+
invoice = analyzer.extract("invoice.pdf", schema=Invoice) # -> Invoice instance
|
|
141
|
+
print(invoice.total_amount, invoice.currency)
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
Define your own schema with plain Pydantic:
|
|
145
|
+
|
|
146
|
+
```python
|
|
147
|
+
from pydantic import BaseModel, Field
|
|
148
|
+
|
|
149
|
+
class Receipt(BaseModel):
|
|
150
|
+
merchant: str | None = Field(None, description="Store name")
|
|
151
|
+
total: float | None = Field(None, description="Grand total")
|
|
152
|
+
|
|
153
|
+
receipt = analyzer.extract("receipt.jpg", schema=Receipt)
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
### Same code, local model (no API key)
|
|
157
|
+
|
|
158
|
+
```python
|
|
159
|
+
from langchain_ollama import ChatOllama
|
|
160
|
+
from ocrcontext import Analyzer
|
|
161
|
+
|
|
162
|
+
analyzer = Analyzer(llm=ChatOllama(model="llama3.1"))
|
|
163
|
+
print(analyzer.analyze("scan.png").text)
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
## How it routes a document
|
|
167
|
+
|
|
168
|
+
1. **Digital PDF** → embedded text-layer extraction (exact text; LLM refine is
|
|
169
|
+
skipped so identifiers aren't altered).
|
|
170
|
+
2. **Image / scanned PDF** → PaddleOCR with preprocessing (deskew, denoise,
|
|
171
|
+
CLAHE), multi-language *coverage-first* selection, and a line-band recovery
|
|
172
|
+
fallback.
|
|
173
|
+
3. **Handwriting** (`handwriting=True`, or auto when printed OCR yields too
|
|
174
|
+
little text) → Google Vision primary, TrOCR fallback.
|
|
175
|
+
4. **Optional LLM refine** → fidelity-first, literal-preserved, drift-guarded.
|
|
176
|
+
5. **Optional `extract(schema=...)`** → typed Pydantic model.
|
|
177
|
+
|
|
178
|
+
## Refinement modes
|
|
179
|
+
|
|
180
|
+
`RefinementMode`: `conservative` (scans), `layout` (digital PDFs),
|
|
181
|
+
`handwriting_prose`, `handwriting_layout`. The handwriting mode is auto-selected
|
|
182
|
+
based on whether the text looks like a DIKW/pyramid diagram. Modes and prompts
|
|
183
|
+
are ported verbatim from the production pipeline.
|
|
184
|
+
|
|
185
|
+
## Configuration
|
|
186
|
+
|
|
187
|
+
```python
|
|
188
|
+
from ocrcontext import Analyzer, AnalyzerConfig
|
|
189
|
+
|
|
190
|
+
cfg = AnalyzerConfig(
|
|
191
|
+
lang="tr",
|
|
192
|
+
prefer_pdf_text_layer=True,
|
|
193
|
+
auto_handwriting_fallback=True,
|
|
194
|
+
)
|
|
195
|
+
analyzer = Analyzer(llm=..., config=cfg)
|
|
196
|
+
```
|
|
197
|
+
|
|
198
|
+
## Development
|
|
199
|
+
|
|
200
|
+
```bash
|
|
201
|
+
pip install -e '.[dev]'
|
|
202
|
+
pytest # runs without GPU/network — engines and LLM are faked
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
## License
|
|
206
|
+
|
|
207
|
+
MIT
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
# ocrcontext
|
|
2
|
+
|
|
3
|
+
**Decoupled, LLM-agnostic document OCR + structured extraction.** Turn a PDF or
|
|
4
|
+
image into clean text — or a typed Pydantic model — in three lines.
|
|
5
|
+
|
|
6
|
+
`ocrcontext` is the extraction core of a document-analysis platform, lifted out
|
|
7
|
+
of its web stack into a pure, pip-installable library. No FastAPI, no servers,
|
|
8
|
+
no hardcoded model providers.
|
|
9
|
+
|
|
10
|
+
```python
|
|
11
|
+
from ocrcontext import Analyzer
|
|
12
|
+
|
|
13
|
+
result = Analyzer().analyze("invoice.pdf")
|
|
14
|
+
print(result.text)
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
## Why
|
|
18
|
+
|
|
19
|
+
- **3-line DX** — instantiate, pass a file, get a result.
|
|
20
|
+
- **LLM-agnostic** — inject any LangChain chat model (OpenAI, Anthropic, Ollama,
|
|
21
|
+
local). Only `langchain-core` is required; you bring the provider.
|
|
22
|
+
- **Resource-efficient** — heavy OCR models (PaddleOCR, TrOCR) load lazily and
|
|
23
|
+
are cached as process-wide singletons, so they never reload per call.
|
|
24
|
+
- **Lightweight base install** — engines are opt-in extras.
|
|
25
|
+
|
|
26
|
+
## Install
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
pip install ocrcontext # core only (PDF text layer + the API surface)
|
|
30
|
+
pip install 'ocrcontext[paddle]' # printed text + scanned PDFs (PaddleOCR)
|
|
31
|
+
pip install 'ocrcontext[trocr]' # handwriting fallback (Microsoft TrOCR)
|
|
32
|
+
pip install 'ocrcontext[vision]' # handwriting primary (Google Cloud Vision)
|
|
33
|
+
pip install 'ocrcontext[all]' # everything
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
Pick an LLM provider for refinement / extraction:
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
pip install langchain-openai # or langchain-anthropic, langchain-ollama, ...
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## Usage
|
|
43
|
+
|
|
44
|
+
### Raw OCR (no LLM, no API key)
|
|
45
|
+
|
|
46
|
+
```python
|
|
47
|
+
from ocrcontext import Analyzer
|
|
48
|
+
|
|
49
|
+
result = Analyzer().analyze("scan.png")
|
|
50
|
+
print(result.text, result.confidence, result.pages, result.text_source)
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
### LLM-refined OCR
|
|
54
|
+
|
|
55
|
+
Refinement fixes OCR errors **without** paraphrasing, translating, or inventing
|
|
56
|
+
text. Emails/URLs/IBANs are frozen so the model can't "correct" them, and output
|
|
57
|
+
that drifts too far from the source is rejected in favour of the raw text.
|
|
58
|
+
|
|
59
|
+
```python
|
|
60
|
+
from langchain_openai import ChatOpenAI
|
|
61
|
+
from ocrcontext import Analyzer
|
|
62
|
+
|
|
63
|
+
analyzer = Analyzer(llm=ChatOpenAI(model="gpt-4o"), lang="tr")
|
|
64
|
+
result = analyzer.analyze("handwritten_note.jpg", handwriting=True)
|
|
65
|
+
print(result.text) # refined
|
|
66
|
+
print(result.raw_text) # original OCR, kept alongside
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
### Structured extraction
|
|
70
|
+
|
|
71
|
+
```python
|
|
72
|
+
from langchain_openai import ChatOpenAI
|
|
73
|
+
from ocrcontext import Analyzer
|
|
74
|
+
from ocrcontext.schemas import Invoice
|
|
75
|
+
|
|
76
|
+
analyzer = Analyzer(llm=ChatOpenAI(model="gpt-4o-mini", temperature=0))
|
|
77
|
+
invoice = analyzer.extract("invoice.pdf", schema=Invoice) # -> Invoice instance
|
|
78
|
+
print(invoice.total_amount, invoice.currency)
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
Define your own schema with plain Pydantic:
|
|
82
|
+
|
|
83
|
+
```python
|
|
84
|
+
from pydantic import BaseModel, Field
|
|
85
|
+
|
|
86
|
+
class Receipt(BaseModel):
|
|
87
|
+
merchant: str | None = Field(None, description="Store name")
|
|
88
|
+
total: float | None = Field(None, description="Grand total")
|
|
89
|
+
|
|
90
|
+
receipt = analyzer.extract("receipt.jpg", schema=Receipt)
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
### Same code, local model (no API key)
|
|
94
|
+
|
|
95
|
+
```python
|
|
96
|
+
from langchain_ollama import ChatOllama
|
|
97
|
+
from ocrcontext import Analyzer
|
|
98
|
+
|
|
99
|
+
analyzer = Analyzer(llm=ChatOllama(model="llama3.1"))
|
|
100
|
+
print(analyzer.analyze("scan.png").text)
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
## How it routes a document
|
|
104
|
+
|
|
105
|
+
1. **Digital PDF** → embedded text-layer extraction (exact text; LLM refine is
|
|
106
|
+
skipped so identifiers aren't altered).
|
|
107
|
+
2. **Image / scanned PDF** → PaddleOCR with preprocessing (deskew, denoise,
|
|
108
|
+
CLAHE), multi-language *coverage-first* selection, and a line-band recovery
|
|
109
|
+
fallback.
|
|
110
|
+
3. **Handwriting** (`handwriting=True`, or auto when printed OCR yields too
|
|
111
|
+
little text) → Google Vision primary, TrOCR fallback.
|
|
112
|
+
4. **Optional LLM refine** → fidelity-first, literal-preserved, drift-guarded.
|
|
113
|
+
5. **Optional `extract(schema=...)`** → typed Pydantic model.
|
|
114
|
+
|
|
115
|
+
## Refinement modes
|
|
116
|
+
|
|
117
|
+
`RefinementMode`: `conservative` (scans), `layout` (digital PDFs),
|
|
118
|
+
`handwriting_prose`, `handwriting_layout`. The handwriting mode is auto-selected
|
|
119
|
+
based on whether the text looks like a DIKW/pyramid diagram. Modes and prompts
|
|
120
|
+
are ported verbatim from the production pipeline.
|
|
121
|
+
|
|
122
|
+
## Configuration
|
|
123
|
+
|
|
124
|
+
```python
|
|
125
|
+
from ocrcontext import Analyzer, AnalyzerConfig
|
|
126
|
+
|
|
127
|
+
cfg = AnalyzerConfig(
|
|
128
|
+
lang="tr",
|
|
129
|
+
prefer_pdf_text_layer=True,
|
|
130
|
+
auto_handwriting_fallback=True,
|
|
131
|
+
)
|
|
132
|
+
analyzer = Analyzer(llm=..., config=cfg)
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
## Development
|
|
136
|
+
|
|
137
|
+
```bash
|
|
138
|
+
pip install -e '.[dev]'
|
|
139
|
+
pytest # runs without GPU/network — engines and LLM are faked
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
## License
|
|
143
|
+
|
|
144
|
+
MIT
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""Raw OCR in 3 lines — no LLM, no API key required.
|
|
2
|
+
|
|
3
|
+
pip install 'ocrcontext[paddle]'
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from ocrcontext import Analyzer
|
|
7
|
+
|
|
8
|
+
result = Analyzer().analyze("invoice.pdf")
|
|
9
|
+
print(result.text)
|
|
10
|
+
|
|
11
|
+
# `result` is a Pydantic model with extra metadata:
|
|
12
|
+
print("source:", result.text_source, "| pages:", result.pages, "| conf:", result.confidence)
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""LLM-refined OCR with OpenAI.
|
|
2
|
+
|
|
3
|
+
pip install 'ocrcontext[paddle]' langchain-openai
|
|
4
|
+
export OPENAI_API_KEY=sk-...
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from langchain_openai import ChatOpenAI
|
|
8
|
+
|
|
9
|
+
from ocrcontext import Analyzer
|
|
10
|
+
|
|
11
|
+
analyzer = Analyzer(llm=ChatOpenAI(model="gpt-4o"), lang="tr")
|
|
12
|
+
|
|
13
|
+
# refine=None (default) auto-refines OCR output (but never an exact PDF text layer).
|
|
14
|
+
result = analyzer.analyze("handwritten_note.jpg", handwriting=True)
|
|
15
|
+
|
|
16
|
+
print("Refined:", result.refined)
|
|
17
|
+
print(result.text)
|
|
18
|
+
if result.raw_text:
|
|
19
|
+
print("\n--- raw OCR (before refine) ---\n", result.raw_text)
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
"""Structured extraction into a Pydantic model.
|
|
2
|
+
|
|
3
|
+
pip install 'ocrcontext[paddle]' langchain-openai
|
|
4
|
+
export OPENAI_API_KEY=sk-...
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from langchain_openai import ChatOpenAI
|
|
8
|
+
|
|
9
|
+
from ocrcontext import Analyzer
|
|
10
|
+
from ocrcontext.schemas import Invoice
|
|
11
|
+
|
|
12
|
+
analyzer = Analyzer(llm=ChatOpenAI(model="gpt-4o-mini", temperature=0), lang="tr")
|
|
13
|
+
|
|
14
|
+
invoice = analyzer.extract("invoice.pdf", schema=Invoice)
|
|
15
|
+
|
|
16
|
+
print(invoice.supplier_name, invoice.total_amount, invoice.currency)
|
|
17
|
+
for item in invoice.line_items:
|
|
18
|
+
print(f" - {item.description}: {item.quantity} x {item.unit_price} = {item.total}")
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
# --- Or define your own schema -------------------------------------------------
|
|
22
|
+
from pydantic import BaseModel, Field # noqa: E402
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class Receipt(BaseModel):
|
|
26
|
+
merchant: str | None = Field(None, description="Store / merchant name")
|
|
27
|
+
date: str | None = Field(None, description="Purchase date, YYYY-MM-DD")
|
|
28
|
+
total: float | None = Field(None, description="Grand total")
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
receipt = analyzer.extract("receipt.jpg", schema=Receipt)
|
|
32
|
+
print(receipt)
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""LLM-agnostic: the exact same code with a local Ollama model — no API key.
|
|
2
|
+
|
|
3
|
+
pip install 'ocrcontext[paddle]' langchain-ollama
|
|
4
|
+
ollama pull llama3.1
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from langchain_ollama import ChatOllama
|
|
8
|
+
|
|
9
|
+
from ocrcontext import Analyzer
|
|
10
|
+
|
|
11
|
+
# Swap ChatOpenAI -> ChatOllama; nothing else changes.
|
|
12
|
+
analyzer = Analyzer(llm=ChatOllama(model="llama3.1"))
|
|
13
|
+
|
|
14
|
+
result = analyzer.analyze("scan.png")
|
|
15
|
+
print(result.text)
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
"""End-to-end smoke test: raw OCR on a single image — no LLM, no schemas.
|
|
2
|
+
|
|
3
|
+
This exercises the *pure* PaddleOCR path:
|
|
4
|
+
load image -> preprocess -> candidate-language OCR -> coverage-first text.
|
|
5
|
+
|
|
6
|
+
Usage
|
|
7
|
+
-----
|
|
8
|
+
python examples/image_smoke_test.py # auto-find a sample image
|
|
9
|
+
python examples/image_smoke_test.py path/to/img.png # explicit path
|
|
10
|
+
|
|
11
|
+
Setup
|
|
12
|
+
-----
|
|
13
|
+
pip install -e '.[paddle]' # installs PaddleOCR + OpenCV
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import sys
|
|
19
|
+
import time
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
|
|
22
|
+
from ocrcontext import Analyzer, AnalyzerConfig
|
|
23
|
+
from ocrcontext.exceptions import MissingDependencyError, UnsupportedFileError
|
|
24
|
+
|
|
25
|
+
# Common sample names / extensions to look for when no path is given.
|
|
26
|
+
_SAMPLE_NAMES = ["sample", "test", "image", "ocr", "smoke"]
|
|
27
|
+
_IMAGE_EXTS = [".png", ".jpg", ".jpeg", ".bmp", ".tif", ".tiff", ".webp"]
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _find_sample_image() -> Path | None:
    """Locate a sample image to run the smoke test on.

    The directory containing this script is searched first, then its parent.
    Files named after a known sample stem (``_SAMPLE_NAMES``) combined with a
    known image extension (``_IMAGE_EXTS``) are preferred; failing that, the
    alphabetically first file carrying a known image extension is used.

    Only regular files are considered, so a directory that happens to be
    called e.g. ``sample.png`` is never returned as an OCR target.

    Returns:
        The path of the first match, or ``None`` when nothing suitable exists.
    """
    here = Path(__file__).resolve().parent
    search_dirs = [here, here.parent]  # this script's folder, then its parent

    # 1) Prefer files whose name hints they're samples.
    for directory in search_dirs:
        for stem in _SAMPLE_NAMES:
            for ext in _IMAGE_EXTS:
                candidate = directory / f"{stem}{ext}"
                if candidate.is_file():
                    return candidate

    # 2) Otherwise, the first image file we can find in those dirs.
    for directory in search_dirs:
        for ext in _IMAGE_EXTS:
            matches = sorted(p for p in directory.glob(f"*{ext}") if p.is_file())
            if matches:
                return matches[0]
    return None
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def main() -> int:
    """Run the raw-OCR smoke test on one image and return an exit code.

    The image is taken from the first command-line argument when present,
    otherwise it is discovered with ``_find_sample_image()``.

    Returns:
        ``0`` when OCR produced non-whitespace text.
        ``1`` when a dependency is missing or no text was detected.
        ``2`` when no image could be located, the file does not exist, or the
        file type is unsupported.
    """
    cli_args = sys.argv[1:]
    if cli_args:
        image_path = Path(cli_args[0]).expanduser()
    else:
        discovered = _find_sample_image()
        if discovered is None:
            print(
                "No image given and none found automatically.\n"
                "Drop an image (e.g. sample.png) into the examples/ folder, or run:\n"
                " python examples/image_smoke_test.py path/to/your/image.png"
            )
            return 2
        image_path = discovered
        print(f"[i] No path given — using discovered image: {image_path.name}")

    if not image_path.exists():
        print(f"[x] File not found: {image_path}")
        return 2

    print(f"[i] OCR target : {image_path}")
    print("[i] Engine : PaddleOCR (raw OCR, no LLM)\n")

    # Keep this run PaddleOCR-only: with the handwriting fallback switched off,
    # a sparse image will not try to load the Vision/TrOCR extras.
    config = AnalyzerConfig(lang="en", auto_handwriting_fallback=False)
    analyzer = Analyzer(config=config)

    try:
        started = time.perf_counter()
        result = analyzer.analyze(image_path)
        elapsed = time.perf_counter() - started
    except (MissingDependencyError, UnsupportedFileError) as exc:
        print(f"[x] {exc}")
        # Missing dependency -> 1, unsupported file -> 2.
        return 1 if isinstance(exc, MissingDependencyError) else 2

    rule = "=" * 60
    print(rule)
    print("EXTRACTED TEXT")
    print(rule)
    print(result.text or "(no text detected)")
    print(rule)
    print(
        f"source={result.text_source} pages={result.pages} "
        f"confidence={result.confidence} chars={len(result.text)} "
        f"time={elapsed:.2f}s"
    )

    # Time a second call on the same analyzer; it is expected to reuse the
    # already-loaded model and therefore finish faster.
    started = time.perf_counter()
    analyzer.analyze(image_path)
    print(f"[i] 2nd run (warm model): {time.perf_counter() - started:.2f}s")

    if result.text.strip():
        return 0
    return 1
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
if __name__ == "__main__":
|
|
107
|
+
raise SystemExit(main())
|