ocrcontext 0.1.0__tar.gz → 0.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ocrcontext-0.1.2/.gitignore +41 -0
- {ocrcontext-0.1.0 → ocrcontext-0.1.2}/CHANGELOG.md +37 -1
- ocrcontext-0.1.2/PKG-INFO +487 -0
- ocrcontext-0.1.2/README.md +420 -0
- {ocrcontext-0.1.0 → ocrcontext-0.1.2}/pyproject.toml +115 -108
- {ocrcontext-0.1.0 → ocrcontext-0.1.2}/src/ocrcontext/__init__.py +3 -1
- ocrcontext-0.1.2/src/ocrcontext/cli.py +188 -0
- {ocrcontext-0.1.0 → ocrcontext-0.1.2}/src/ocrcontext/engines/paddle.py +17 -10
- ocrcontext-0.1.2/src/ocrcontext/llm/schemas.py +292 -0
- ocrcontext-0.1.2/src/ocrcontext/loaders.py +84 -0
- ocrcontext-0.1.2/src/ocrcontext/schemas.py +43 -0
- {ocrcontext-0.1.0 → ocrcontext-0.1.2}/tests/conftest.py +15 -0
- ocrcontext-0.1.2/tests/test_cli.py +181 -0
- ocrcontext-0.1.2/tests/test_langchain_loader.py +90 -0
- ocrcontext-0.1.2/tests/test_schemas.py +192 -0
- ocrcontext-0.1.0/.gitignore +0 -27
- ocrcontext-0.1.0/PKG-INFO +0 -207
- ocrcontext-0.1.0/README.md +0 -144
- ocrcontext-0.1.0/src/ocrcontext/llm/schemas.py +0 -99
- ocrcontext-0.1.0/src/ocrcontext/schemas.py +0 -8
- {ocrcontext-0.1.0 → ocrcontext-0.1.2}/LICENSE +0 -0
- {ocrcontext-0.1.0 → ocrcontext-0.1.2}/examples/01_quickstart.py +0 -0
- {ocrcontext-0.1.0 → ocrcontext-0.1.2}/examples/02_refine_openai.py +0 -0
- {ocrcontext-0.1.0 → ocrcontext-0.1.2}/examples/03_structured_invoice.py +0 -0
- {ocrcontext-0.1.0 → ocrcontext-0.1.2}/examples/04_local_ollama.py +0 -0
- {ocrcontext-0.1.0 → ocrcontext-0.1.2}/examples/image_smoke_test.py +0 -0
- {ocrcontext-0.1.0 → ocrcontext-0.1.2}/examples/pdf_smoke_test.py +0 -0
- {ocrcontext-0.1.0 → ocrcontext-0.1.2}/examples/structured_smoke_test.py +0 -0
- {ocrcontext-0.1.0 → ocrcontext-0.1.2}/src/ocrcontext/analyzer.py +0 -0
- {ocrcontext-0.1.0 → ocrcontext-0.1.2}/src/ocrcontext/config.py +0 -0
- {ocrcontext-0.1.0 → ocrcontext-0.1.2}/src/ocrcontext/engines/__init__.py +0 -0
- {ocrcontext-0.1.0 → ocrcontext-0.1.2}/src/ocrcontext/engines/base.py +0 -0
- {ocrcontext-0.1.0 → ocrcontext-0.1.2}/src/ocrcontext/engines/handwriting.py +0 -0
- {ocrcontext-0.1.0 → ocrcontext-0.1.2}/src/ocrcontext/engines/pdf_text.py +0 -0
- {ocrcontext-0.1.0 → ocrcontext-0.1.2}/src/ocrcontext/engines/registry.py +0 -0
- {ocrcontext-0.1.0 → ocrcontext-0.1.2}/src/ocrcontext/engines/trocr.py +0 -0
- {ocrcontext-0.1.0 → ocrcontext-0.1.2}/src/ocrcontext/engines/vision.py +0 -0
- {ocrcontext-0.1.0 → ocrcontext-0.1.2}/src/ocrcontext/exceptions.py +0 -0
- {ocrcontext-0.1.0 → ocrcontext-0.1.2}/src/ocrcontext/llm/__init__.py +0 -0
- {ocrcontext-0.1.0 → ocrcontext-0.1.2}/src/ocrcontext/llm/drift.py +0 -0
- {ocrcontext-0.1.0 → ocrcontext-0.1.2}/src/ocrcontext/llm/extractor.py +0 -0
- {ocrcontext-0.1.0 → ocrcontext-0.1.2}/src/ocrcontext/llm/formatting.py +0 -0
- {ocrcontext-0.1.0 → ocrcontext-0.1.2}/src/ocrcontext/llm/literal_preserve.py +0 -0
- {ocrcontext-0.1.0 → ocrcontext-0.1.2}/src/ocrcontext/llm/prompts.py +0 -0
- {ocrcontext-0.1.0 → ocrcontext-0.1.2}/src/ocrcontext/llm/refiner.py +0 -0
- {ocrcontext-0.1.0 → ocrcontext-0.1.2}/src/ocrcontext/pipeline.py +0 -0
- {ocrcontext-0.1.0 → ocrcontext-0.1.2}/src/ocrcontext/preprocessing/__init__.py +0 -0
- {ocrcontext-0.1.0 → ocrcontext-0.1.2}/src/ocrcontext/preprocessing/image.py +0 -0
- {ocrcontext-0.1.0 → ocrcontext-0.1.2}/src/ocrcontext/py.typed +0 -0
- {ocrcontext-0.1.0 → ocrcontext-0.1.2}/src/ocrcontext/quality.py +0 -0
- {ocrcontext-0.1.0 → ocrcontext-0.1.2}/src/ocrcontext/types.py +0 -0
- {ocrcontext-0.1.0 → ocrcontext-0.1.2}/src/ocrcontext/utils/__init__.py +0 -0
- {ocrcontext-0.1.0 → ocrcontext-0.1.2}/src/ocrcontext/utils/files.py +0 -0
- {ocrcontext-0.1.0 → ocrcontext-0.1.2}/src/ocrcontext/utils/lang.py +0 -0
- {ocrcontext-0.1.0 → ocrcontext-0.1.2}/tests/__init__.py +0 -0
- {ocrcontext-0.1.0 → ocrcontext-0.1.2}/tests/test_literal_preserve.py +0 -0
- {ocrcontext-0.1.0 → ocrcontext-0.1.2}/tests/test_llm.py +0 -0
- {ocrcontext-0.1.0 → ocrcontext-0.1.2}/tests/test_pipeline_analyzer.py +0 -0
- {ocrcontext-0.1.0 → ocrcontext-0.1.2}/tests/test_text_helpers.py +0 -0
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*.egg-info/
|
|
5
|
+
.eggs/
|
|
6
|
+
build/
|
|
7
|
+
dist/
|
|
8
|
+
.pytest_cache/
|
|
9
|
+
.ruff_cache/
|
|
10
|
+
.coverage
|
|
11
|
+
htmlcov/
|
|
12
|
+
|
|
13
|
+
# Virtual envs
|
|
14
|
+
.venv/
|
|
15
|
+
venv/
|
|
16
|
+
env/
|
|
17
|
+
|
|
18
|
+
# Models / caches
|
|
19
|
+
.cache/
|
|
20
|
+
*.onnx
|
|
21
|
+
*.pdmodel
|
|
22
|
+
*.pdiparams
|
|
23
|
+
|
|
24
|
+
# OS / editor
|
|
25
|
+
.DS_Store
|
|
26
|
+
.idea/
|
|
27
|
+
.vscode/
|
|
28
|
+
|
|
29
|
+
# Claude session memory — never publish
|
|
30
|
+
CLAUDE.md
|
|
31
|
+
|
|
32
|
+
# Local sample / personal documents — keep them out of the public repo.
|
|
33
|
+
# (The example scripts auto-discover whatever you drop here.)
|
|
34
|
+
examples/*.pdf
|
|
35
|
+
examples/*.png
|
|
36
|
+
examples/*.jpg
|
|
37
|
+
examples/*.jpeg
|
|
38
|
+
examples/*.webp
|
|
39
|
+
examples/*.tif
|
|
40
|
+
examples/*.tiff
|
|
41
|
+
|
|
@@ -7,6 +7,40 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
9
|
|
|
10
|
+
## [0.1.2] - 2026-06-26
|
|
11
|
+
|
|
12
|
+
### Fixed
|
|
13
|
+
- CI: disable Rich markup mode in typer (`rich_markup_mode=None`) so help output
|
|
14
|
+
is plain text on all platforms — Rich's panel renderer produced ANSI escape
|
|
15
|
+
codes that CliRunner could not strip on Linux, causing `--help` tests to fail.
|
|
16
|
+
- Replace `typing.List` with built-in `list` in schemas for Python 3.12
|
|
17
|
+
compatibility and to avoid deprecation warnings.
|
|
18
|
+
|
|
19
|
+
## [0.1.1] - 2026-06-26
|
|
20
|
+
|
|
21
|
+
### Added
|
|
22
|
+
- **`OCRContextLoader`** — LangChain `BaseLoader` integration. Drop-in loader for
|
|
23
|
+
any LangChain pipeline: `OCRContextLoader("file.pdf").load()` returns a
|
|
24
|
+
`Document` with OCR text and metadata (`source`, `text_source`, `pages`,
|
|
25
|
+
`confidence`, `refined`).
|
|
26
|
+
- **Built-in extraction schemas** — four new ready-to-use Pydantic schemas with
|
|
27
|
+
system prompts, importable from `ocrcontext.schemas`:
|
|
28
|
+
- `Receipt` / `ReceiptItem` — store name, date, items, subtotal, tax, total,
|
|
29
|
+
payment method.
|
|
30
|
+
- `Contract` / `ContractParty` — parties, effective/expiry dates, value,
|
|
31
|
+
governing law, key obligations.
|
|
32
|
+
- `IdCard` — national_id / passport / driver_license / residence_permit with
|
|
33
|
+
ISO 8601 date normalisation and ISO 3166-1 nationality codes.
|
|
34
|
+
- `MedicalReport` / `Medication` — diagnosis, ICD codes, prescriptions, notes.
|
|
35
|
+
- **CLI** (`ocrcontext extract`) — terminal-first developer experience via the
|
|
36
|
+
new `[cli]` extra (`pip install "ocrcontext[cli]"`):
|
|
37
|
+
- `ocrcontext extract invoice.pdf` — plain OCR to stdout.
|
|
38
|
+
- `ocrcontext extract scan.pdf --schema receipt --output json` — structured
|
|
39
|
+
extraction as JSON.
|
|
40
|
+
- `--provider openai|anthropic|ollama|google --model <name>` — bring-your-own
|
|
41
|
+
LLM provider.
|
|
42
|
+
- `--handwriting`, `--lang`, `--refine auto|yes|no` flags.
|
|
43
|
+
|
|
10
44
|
## [0.1.0] - 2026-06-25
|
|
11
45
|
|
|
12
46
|
Initial release — the document extraction core, decoupled from its web stack
|
|
@@ -39,5 +73,7 @@ into a standalone, LLM-agnostic library.
|
|
|
39
73
|
- **Packaging** — optional extras `[paddle]`, `[trocr]`, `[vision]`, `[all]`;
|
|
40
74
|
PEP 561 typed (`py.typed`); examples and a GPU/network-free test suite.
|
|
41
75
|
|
|
42
|
-
[Unreleased]: https://github.com/bahadirkarsli/ocrcontext/compare/v0.1.
|
|
76
|
+
[Unreleased]: https://github.com/bahadirkarsli/ocrcontext/compare/v0.1.2...HEAD
|
|
77
|
+
[0.1.2]: https://github.com/bahadirkarsli/ocrcontext/compare/v0.1.1...v0.1.2
|
|
78
|
+
[0.1.1]: https://github.com/bahadirkarsli/ocrcontext/compare/v0.1.0...v0.1.1
|
|
43
79
|
[0.1.0]: https://github.com/bahadirkarsli/ocrcontext/releases/tag/v0.1.0
|
|
@@ -0,0 +1,487 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ocrcontext
|
|
3
|
+
Version: 0.1.2
|
|
4
|
+
Summary: Decoupled, LLM-agnostic document OCR + structured extraction. Vision and LLM parsing in 3 lines of code.
|
|
5
|
+
Project-URL: Homepage, https://github.com/BahadirKarsli/OCRContext
|
|
6
|
+
Project-URL: Repository, https://github.com/BahadirKarsli/OCRContext
|
|
7
|
+
Project-URL: Issues, https://github.com/BahadirKarsli/OCRContext/issues
|
|
8
|
+
Project-URL: Changelog, https://github.com/BahadirKarsli/OCRContext/blob/main/CHANGELOG.md
|
|
9
|
+
Author-email: Bahadır Karslı <bahadrkrsl@outlook.com>
|
|
10
|
+
Maintainer-email: Bahadır Karslı <bahadrkrsl@outlook.com>
|
|
11
|
+
License: MIT
|
|
12
|
+
License-File: LICENSE
|
|
13
|
+
Keywords: document-ai,langchain,ocr,paddleocr,pdf,structured-extraction
|
|
14
|
+
Classifier: Development Status :: 4 - Beta
|
|
15
|
+
Classifier: Intended Audience :: Developers
|
|
16
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
17
|
+
Classifier: Operating System :: OS Independent
|
|
18
|
+
Classifier: Programming Language :: Python :: 3
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
23
|
+
Classifier: Topic :: Scientific/Engineering :: Image Recognition
|
|
24
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
25
|
+
Classifier: Typing :: Typed
|
|
26
|
+
Requires-Python: >=3.10
|
|
27
|
+
Requires-Dist: langchain-core>=0.3
|
|
28
|
+
Requires-Dist: numpy>=1.24
|
|
29
|
+
Requires-Dist: pillow>=9.0
|
|
30
|
+
Requires-Dist: pydantic>=2.5
|
|
31
|
+
Requires-Dist: pymupdf>=1.23
|
|
32
|
+
Provides-Extra: all
|
|
33
|
+
Requires-Dist: accelerate>=0.27; extra == 'all'
|
|
34
|
+
Requires-Dist: google-cloud-vision>=3.8.1; extra == 'all'
|
|
35
|
+
Requires-Dist: opencv-python-headless>=4.8; extra == 'all'
|
|
36
|
+
Requires-Dist: paddleocr>=2.7.0.3; extra == 'all'
|
|
37
|
+
Requires-Dist: paddlepaddle>=2.6; extra == 'all'
|
|
38
|
+
Requires-Dist: sentencepiece>=0.1.99; extra == 'all'
|
|
39
|
+
Requires-Dist: torch>=2.1; extra == 'all'
|
|
40
|
+
Requires-Dist: torchvision>=0.16; extra == 'all'
|
|
41
|
+
Requires-Dist: transformers>=4.40; extra == 'all'
|
|
42
|
+
Requires-Dist: typer>=0.12; extra == 'all'
|
|
43
|
+
Provides-Extra: cli
|
|
44
|
+
Requires-Dist: typer>=0.12; extra == 'cli'
|
|
45
|
+
Provides-Extra: dev
|
|
46
|
+
Requires-Dist: build>=1.2; extra == 'dev'
|
|
47
|
+
Requires-Dist: pytest-cov>=4.0; extra == 'dev'
|
|
48
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
49
|
+
Requires-Dist: ruff>=0.5; extra == 'dev'
|
|
50
|
+
Requires-Dist: twine>=5.0; extra == 'dev'
|
|
51
|
+
Requires-Dist: typer>=0.12; extra == 'dev'
|
|
52
|
+
Provides-Extra: paddle
|
|
53
|
+
Requires-Dist: opencv-python-headless>=4.8; extra == 'paddle'
|
|
54
|
+
Requires-Dist: paddleocr>=2.7.0.3; extra == 'paddle'
|
|
55
|
+
Requires-Dist: paddlepaddle>=2.6; extra == 'paddle'
|
|
56
|
+
Provides-Extra: trocr
|
|
57
|
+
Requires-Dist: accelerate>=0.27; extra == 'trocr'
|
|
58
|
+
Requires-Dist: opencv-python-headless>=4.8; extra == 'trocr'
|
|
59
|
+
Requires-Dist: sentencepiece>=0.1.99; extra == 'trocr'
|
|
60
|
+
Requires-Dist: torch>=2.1; extra == 'trocr'
|
|
61
|
+
Requires-Dist: torchvision>=0.16; extra == 'trocr'
|
|
62
|
+
Requires-Dist: transformers>=4.40; extra == 'trocr'
|
|
63
|
+
Provides-Extra: vision
|
|
64
|
+
Requires-Dist: google-cloud-vision>=3.8.1; extra == 'vision'
|
|
65
|
+
Requires-Dist: opencv-python-headless>=4.8; extra == 'vision'
|
|
66
|
+
Description-Content-Type: text/markdown
|
|
67
|
+
|
|
68
|
+
<div align="center">
|
|
69
|
+
|
|
70
|
+
# OCR Context
|
|
71
|
+
|
|
72
|
+
**Turn any PDF or image into clean text — or a typed Pydantic model — in three lines.**
|
|
73
|
+
|
|
74
|
+
Decoupled, LLM-agnostic document OCR + structured extraction. No web server, no vendor lock-in.
|
|
75
|
+
|
|
76
|
+
[CI](https://github.com/BahadirKarsli/OCRContext/actions/workflows/ci.yml)
|
|
77
|
+
[PyPI version](https://pypi.org/project/ocrcontext/)
|
|
78
|
+
[Supported Python versions](https://pypi.org/project/ocrcontext/)
|
|
79
|
+
[License: MIT](LICENSE)
|
|
80
|
+
[Typed (PEP 561)](https://peps.python.org/pep-0561/)
|
|
81
|
+
|
|
82
|
+
</div>
|
|
83
|
+
|
|
84
|
+
```python
|
|
85
|
+
from ocrcontext import Analyzer
|
|
86
|
+
|
|
87
|
+
result = Analyzer().analyze("invoice.pdf")
|
|
88
|
+
print(result.text)
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
---
|
|
92
|
+
|
|
93
|
+
`ocrcontext` is the extraction core of a production document-analysis platform, lifted out of its FastAPI/Next.js stack into a pure, pip-installable library. It handles OCR engine routing, fidelity-first LLM cleanup, and schema-based structured extraction — and gets out of your way.
|
|
94
|
+
|
|
95
|
+
## Contents
|
|
96
|
+
|
|
97
|
+
- [Install](#install)
|
|
98
|
+
- [Quick start](#quick-start)
|
|
99
|
+
- [CLI](#cli)
|
|
100
|
+
- [LangChain integration](#langchain-integration)
|
|
101
|
+
- [Built-in schemas](#built-in-schemas)
|
|
102
|
+
- [How it routes a document](#how-it-routes-a-document)
|
|
103
|
+
- [Refinement modes](#refinement-modes)
|
|
104
|
+
- [Configuration](#configuration)
|
|
105
|
+
- [Development](#development)
|
|
106
|
+
- [License](#license)
|
|
107
|
+
|
|
108
|
+
---
|
|
109
|
+
|
|
110
|
+
## Install
|
|
111
|
+
|
|
112
|
+
Engines are opt-in so your base install stays small:
|
|
113
|
+
|
|
114
|
+
| Command | What you get |
|
|
115
|
+
|---|---|
|
|
116
|
+
| `pip install ocrcontext` | Digital PDFs only (PyMuPDF text-layer — no OCR, no GPU, no API key) |
|
|
117
|
+
| `pip install 'ocrcontext[paddle]'` | + printed images & scanned PDFs (PaddleOCR, CPU/GPU) |
|
|
118
|
+
| `pip install 'ocrcontext[trocr]'` | + handwriting fallback (Microsoft TrOCR) |
|
|
119
|
+
| `pip install 'ocrcontext[vision]'` | + handwriting primary (Google Cloud Vision) |
|
|
120
|
+
| `pip install 'ocrcontext[cli]'` | + terminal CLI (`ocrcontext extract`) |
|
|
121
|
+
| `pip install 'ocrcontext[all]'` | everything above |
|
|
122
|
+
|
|
123
|
+
Add an LLM provider for refinement and structured extraction:
|
|
124
|
+
|
|
125
|
+
```bash
|
|
126
|
+
pip install langchain-openai # or langchain-anthropic, langchain-ollama, ...
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
> **Images and scanned PDFs require `[paddle]`.** With a bare `pip install ocrcontext` (no extras), passing an image file to the analyzer raises an `EngineError` with a clear install hint.
|
|
130
|
+
|
|
131
|
+
### Google Cloud Vision (`[vision]`)
|
|
132
|
+
|
|
133
|
+
1. Enable the **Cloud Vision API** in [Google Cloud Console](https://console.cloud.google.com/)
|
|
134
|
+
2. Create a service account key (JSON) under IAM & Admin → Service Accounts → Keys
|
|
135
|
+
3. Export the path:
|
|
136
|
+
|
|
137
|
+
```bash
|
|
138
|
+
export GOOGLE_APPLICATION_CREDENTIALS="/path/to/key.json" # Linux/macOS
|
|
139
|
+
$env:GOOGLE_APPLICATION_CREDENTIALS = "C:\path\to\key.json" # PowerShell
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
---
|
|
143
|
+
|
|
144
|
+
## Quick start
|
|
145
|
+
|
|
146
|
+
### Digital PDF
|
|
147
|
+
|
|
148
|
+
```python
|
|
149
|
+
from ocrcontext import Analyzer
|
|
150
|
+
|
|
151
|
+
result = Analyzer().analyze("document.pdf")
|
|
152
|
+
print(result.text) # extracted text
|
|
153
|
+
print(result.pages) # page count
|
|
154
|
+
print(result.text_source) # "pdf_text_layer"
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
### Image / scanned PDF
|
|
158
|
+
|
|
159
|
+
```bash
|
|
160
|
+
pip install 'ocrcontext[paddle]'
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
```python
|
|
164
|
+
from ocrcontext import Analyzer
|
|
165
|
+
|
|
166
|
+
result = Analyzer().analyze("scan.png")
|
|
167
|
+
print(result.text, result.confidence)
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
### LLM-refined OCR
|
|
171
|
+
|
|
172
|
+
Refinement fixes character-level OCR errors without paraphrasing, translating, or inventing.
|
|
173
|
+
Emails, URLs, and IBANs are masked before the model sees them and restored verbatim after.
|
|
174
|
+
Output that drifts too far from the source is rejected in favour of the raw OCR text.
|
|
175
|
+
|
|
176
|
+
```bash
|
|
177
|
+
pip install 'ocrcontext[paddle]' langchain-openai
|
|
178
|
+
export OPENAI_API_KEY="sk-..."
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
```python
|
|
182
|
+
from langchain_openai import ChatOpenAI
|
|
183
|
+
from ocrcontext import Analyzer
|
|
184
|
+
|
|
185
|
+
analyzer = Analyzer(llm=ChatOpenAI(model="gpt-4o-mini"), lang="en")
|
|
186
|
+
result = analyzer.analyze("scan.jpg")
|
|
187
|
+
|
|
188
|
+
print(result.text) # refined
|
|
189
|
+
print(result.raw_text) # original OCR output
|
|
190
|
+
print(result.refined) # True
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
### Structured extraction
|
|
194
|
+
|
|
195
|
+
Hand the analyzer a Pydantic schema and get a populated instance back.
|
|
196
|
+
|
|
197
|
+
```python
|
|
198
|
+
from langchain_openai import ChatOpenAI
|
|
199
|
+
from ocrcontext import Analyzer
|
|
200
|
+
from ocrcontext.schemas import Invoice
|
|
201
|
+
|
|
202
|
+
analyzer = Analyzer(llm=ChatOpenAI(model="gpt-4o-mini", temperature=0))
|
|
203
|
+
invoice = analyzer.extract("invoice.pdf", schema=Invoice)
|
|
204
|
+
|
|
205
|
+
print(invoice.supplier_name, invoice.total_amount, invoice.currency)
|
|
206
|
+
for item in invoice.line_items:
|
|
207
|
+
print(item.description, item.quantity, item.unit_price)
|
|
208
|
+
```
|
|
209
|
+
|
|
210
|
+
Define your own schema — field descriptions are the prompt:
|
|
211
|
+
|
|
212
|
+
```python
|
|
213
|
+
from pydantic import BaseModel, Field
|
|
214
|
+
|
|
215
|
+
class ShippingLabel(BaseModel):
|
|
216
|
+
sender: str | None = Field(None, description="Sender full name and address")
|
|
217
|
+
recipient: str | None = Field(None, description="Recipient full name and address")
|
|
218
|
+
tracking_number: str | None = Field(None, description="Carrier tracking number")
|
|
219
|
+
|
|
220
|
+
label = analyzer.extract("label.jpg", schema=ShippingLabel)
|
|
221
|
+
```
|
|
222
|
+
|
|
223
|
+
### No API key? Use a local model
|
|
224
|
+
|
|
225
|
+
```python
|
|
226
|
+
from langchain_ollama import ChatOllama
|
|
227
|
+
from ocrcontext import Analyzer
|
|
228
|
+
|
|
229
|
+
analyzer = Analyzer(llm=ChatOllama(model="llama3.1"))
|
|
230
|
+
result = analyzer.analyze("scan.png")
|
|
231
|
+
print(result.text)
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
---
|
|
235
|
+
|
|
236
|
+
## CLI
|
|
237
|
+
|
|
238
|
+
Install the `[cli]` extra to use `ocrcontext` straight from the terminal — no Python script needed.
|
|
239
|
+
|
|
240
|
+
```bash
|
|
241
|
+
pip install 'ocrcontext[cli]'
|
|
242
|
+
```
|
|
243
|
+
|
|
244
|
+
**Extract plain text:**
|
|
245
|
+
|
|
246
|
+
```bash
|
|
247
|
+
ocrcontext extract invoice.pdf
|
|
248
|
+
ocrcontext extract scan.png --output json
|
|
249
|
+
```
|
|
250
|
+
|
|
251
|
+
**Extract structured data with a built-in schema:**
|
|
252
|
+
|
|
253
|
+
```bash
|
|
254
|
+
ocrcontext extract invoice.pdf --schema invoice
|
|
255
|
+
ocrcontext extract receipt.jpg --schema receipt
|
|
256
|
+
ocrcontext extract contract.pdf --schema contract
|
|
257
|
+
ocrcontext extract passport.jpg --schema idcard
|
|
258
|
+
ocrcontext extract lab_report.pdf --schema medical
|
|
259
|
+
```
|
|
260
|
+
|
|
261
|
+
**Choose your LLM provider:**
|
|
262
|
+
|
|
263
|
+
```bash
|
|
264
|
+
ocrcontext extract invoice.pdf --schema invoice \
|
|
265
|
+
--provider openai --model gpt-4o-mini
|
|
266
|
+
|
|
267
|
+
ocrcontext extract invoice.pdf --schema invoice \
|
|
268
|
+
--provider anthropic --model claude-haiku-4-5-20251001
|
|
269
|
+
|
|
270
|
+
ocrcontext extract invoice.pdf --schema invoice \
|
|
271
|
+
--provider ollama --model llama3.1
|
|
272
|
+
```
|
|
273
|
+
|
|
274
|
+
**All options:**
|
|
275
|
+
|
|
276
|
+
```
|
|
277
|
+
ocrcontext extract FILE [OPTIONS]
|
|
278
|
+
|
|
279
|
+
--schema -s invoice | receipt | contract | idcard | medical
|
|
280
|
+
--lang -l Language code (default: en)
|
|
281
|
+
--handwriting Force handwriting engine
|
|
282
|
+
--refine auto (default) | yes | no
|
|
283
|
+
--output -o text (default) | json
|
|
284
|
+
--provider -p openai | anthropic | ollama | google
|
|
285
|
+
--model -m Model name (default: gpt-4o-mini)
|
|
286
|
+
```
|
|
287
|
+
|
|
288
|
+
---
|
|
289
|
+
|
|
290
|
+
## LangChain integration
|
|
291
|
+
|
|
292
|
+
`OCRContextLoader` is a drop-in LangChain `BaseLoader`. It slots into any LangChain pipeline — RAG, document Q&A, chain-of-thought — without glue code.
|
|
293
|
+
|
|
294
|
+
```python
|
|
295
|
+
from ocrcontext.loaders import OCRContextLoader
|
|
296
|
+
|
|
297
|
+
# Plain OCR
|
|
298
|
+
loader = OCRContextLoader("contract.pdf")
|
|
299
|
+
docs = loader.load() # -> [Document(page_content="...", metadata={...})]
|
|
300
|
+
|
|
301
|
+
# With LLM refinement
|
|
302
|
+
from langchain_openai import ChatOpenAI
|
|
303
|
+
|
|
304
|
+
loader = OCRContextLoader(
|
|
305
|
+
"scan.pdf",
|
|
306
|
+
llm=ChatOpenAI(model="gpt-4o-mini"),
|
|
307
|
+
lang="en",
|
|
308
|
+
refine="yes",
|
|
309
|
+
)
|
|
310
|
+
docs = loader.load()
|
|
311
|
+
print(docs[0].page_content)
|
|
312
|
+
print(docs[0].metadata)
|
|
313
|
+
# {
|
|
314
|
+
# "source": "scan.pdf",
|
|
315
|
+
# "text_source": "ocr",
|
|
316
|
+
# "pages": 3,
|
|
317
|
+
# "confidence": 0.94,
|
|
318
|
+
# "refined": True,
|
|
319
|
+
# "raw_text": "..."
|
|
320
|
+
# }
|
|
321
|
+
```
|
|
322
|
+
|
|
323
|
+
**In a RAG pipeline:**
|
|
324
|
+
|
|
325
|
+
```python
|
|
326
|
+
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
|
|
327
|
+
from langchain_community.vectorstores import FAISS
|
|
328
|
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
329
|
+
from ocrcontext.loaders import OCRContextLoader
|
|
330
|
+
|
|
331
|
+
docs = OCRContextLoader("annual_report.pdf").load()
|
|
332
|
+
chunks = RecursiveCharacterTextSplitter(chunk_size=1000).split_documents(docs)
|
|
333
|
+
vectorstore = FAISS.from_documents(chunks, OpenAIEmbeddings())
|
|
334
|
+
```
|
|
335
|
+
|
|
336
|
+
---
|
|
337
|
+
|
|
338
|
+
## Built-in schemas
|
|
339
|
+
|
|
340
|
+
Five ready-to-use Pydantic schemas with system prompts, importable from `ocrcontext.schemas`.
|
|
341
|
+
Pass them directly to `analyzer.extract()` or the CLI `--schema` flag.
|
|
342
|
+
|
|
343
|
+
### Invoice
|
|
344
|
+
|
|
345
|
+
```python
|
|
346
|
+
from ocrcontext.schemas import Invoice
|
|
347
|
+
|
|
348
|
+
invoice = analyzer.extract("invoice.pdf", schema=Invoice)
|
|
349
|
+
# invoice.supplier_name, .invoice_number, .invoice_date, .total_amount,
|
|
350
|
+
# .currency, .tax_id, .tax_rate, .line_items (list[LineItem])
|
|
351
|
+
```
|
|
352
|
+
|
|
353
|
+
### Receipt
|
|
354
|
+
|
|
355
|
+
```python
|
|
356
|
+
from ocrcontext.schemas import Receipt
|
|
357
|
+
|
|
358
|
+
receipt = analyzer.extract("receipt.jpg", schema=Receipt)
|
|
359
|
+
# receipt.store_name, .date, .time, .total_amount, .tax_amount,
|
|
360
|
+
# .subtotal, .payment_method, .currency, .items (list[ReceiptItem])
|
|
361
|
+
```
|
|
362
|
+
|
|
363
|
+
### Contract
|
|
364
|
+
|
|
365
|
+
```python
|
|
366
|
+
from ocrcontext.schemas import Contract
|
|
367
|
+
|
|
368
|
+
contract = analyzer.extract("agreement.pdf", schema=Contract)
|
|
369
|
+
# contract.title, .effective_date, .expiration_date, .contract_value,
|
|
370
|
+
# .currency, .governing_law, .key_obligations,
|
|
371
|
+
# .parties (list[ContractParty] with .name, .role)
|
|
372
|
+
```
|
|
373
|
+
|
|
374
|
+
### IdCard
|
|
375
|
+
|
|
376
|
+
Supports four document types: `national_id`, `passport`, `driver_license`, and `residence_permit`.
|
|
377
|
+
|
|
378
|
+
```python
|
|
379
|
+
from ocrcontext.schemas import IdCard
|
|
380
|
+
|
|
381
|
+
card = analyzer.extract("passport.jpg", schema=IdCard)
|
|
382
|
+
# card.document_type, .full_name, .date_of_birth, .gender,
|
|
383
|
+
# .nationality, .document_number, .issue_date, .expiry_date,
|
|
384
|
+
# .issuing_authority, .address
|
|
385
|
+
```
|
|
386
|
+
|
|
387
|
+
### MedicalReport
|
|
388
|
+
|
|
389
|
+
```python
|
|
390
|
+
from ocrcontext.schemas import MedicalReport
|
|
391
|
+
|
|
392
|
+
report = analyzer.extract("lab_report.pdf", schema=MedicalReport)
|
|
393
|
+
# report.patient_name, .patient_dob, .report_date, .doctor_name,
|
|
394
|
+
# .institution, .diagnosis, .icd_codes (list[str]),
|
|
395
|
+
# .medications (list[Medication]), .notes
|
|
396
|
+
```
|
|
397
|
+
|
|
398
|
+
---
|
|
399
|
+
|
|
400
|
+
## How it routes a document
|
|
401
|
+
|
|
402
|
+
```
|
|
403
|
+
┌─────────────┐
|
|
404
|
+
document ───▶│ Analyzer │
|
|
405
|
+
└──────┬──────┘
|
|
406
|
+
▼
|
|
407
|
+
┌──────────────────────────────────────┐
|
|
408
|
+
│ 1. Digital PDF? │
|
|
409
|
+
│ └─▶ PyMuPDF text layer │
|
|
410
|
+
│ LLM refine auto-skipped │
|
|
411
|
+
│ │
|
|
412
|
+
│ 2. Image / scanned PDF? │
|
|
413
|
+
│ └─▶ PaddleOCR │
|
|
414
|
+
│ (preprocess → coverage-first │
|
|
415
|
+
│ → line-band fallback) │
|
|
416
|
+
│ │
|
|
417
|
+
│ 3. Handwriting (explicit or auto)? │
|
|
418
|
+
│ └─▶ Google Cloud Vision │
|
|
419
|
+
│ → TrOCR fallback │
|
|
420
|
+
│ │
|
|
421
|
+
│ 4. (optional) LLM refine │
|
|
422
|
+
│ fidelity-first · literal-safe │
|
|
423
|
+
│ │
|
|
424
|
+
│ 5. (optional) extract(schema) │
|
|
425
|
+
│ └─▶ typed Pydantic model │
|
|
426
|
+
└──────────────────────────────────────┘
|
|
427
|
+
```
|
|
428
|
+
|
|
429
|
+
Multi-page documents are joined with `--- Page N ---` separators.
|
|
430
|
+
Handwriting kicks in automatically when printed OCR returns too little text.
|
|
431
|
+
|
|
432
|
+
---
|
|
433
|
+
|
|
434
|
+
## Refinement modes
|
|
435
|
+
|
|
436
|
+
| Mode | When it's used |
|
|
437
|
+
|---|---|
|
|
438
|
+
| `conservative` | Scanned images — minimal char-level correction only |
|
|
439
|
+
| `layout` | Digital PDFs — reconstruct clean structure |
|
|
440
|
+
| `handwriting_layout` | Handwritten notes / lists / diagrams |
|
|
441
|
+
| `handwriting_prose` | Handwritten poems / paragraphs / letters |
|
|
442
|
+
|
|
443
|
+
Modes are auto-selected based on the document type and text content. The handwriting mode choice is driven by whether the text looks like a DIKW/pyramid diagram. All prompts are ported verbatim from the production pipeline.
|
|
444
|
+
|
|
445
|
+
Override manually:
|
|
446
|
+
|
|
447
|
+
```python
|
|
448
|
+
from ocrcontext import Analyzer, RefinementMode
|
|
449
|
+
|
|
450
|
+
result = analyzer.analyze("scan.png", mode=RefinementMode.CONSERVATIVE)
|
|
451
|
+
```
|
|
452
|
+
|
|
453
|
+
---
|
|
454
|
+
|
|
455
|
+
## Configuration
|
|
456
|
+
|
|
457
|
+
```python
|
|
458
|
+
from ocrcontext import Analyzer, AnalyzerConfig
|
|
459
|
+
|
|
460
|
+
cfg = AnalyzerConfig(
|
|
461
|
+
lang="tr", # default document language
|
|
462
|
+
prefer_pdf_text_layer=True, # skip OCR when a text layer exists
|
|
463
|
+
auto_handwriting_fallback=True, # retry with handwriting if OCR returns too little
|
|
464
|
+
refine_by_default=True, # auto-refine whenever an LLM is configured
|
|
465
|
+
)
|
|
466
|
+
analyzer = Analyzer(llm=..., config=cfg)
|
|
467
|
+
```
|
|
468
|
+
|
|
469
|
+
---
|
|
470
|
+
|
|
471
|
+
## Development
|
|
472
|
+
|
|
473
|
+
```bash
|
|
474
|
+
git clone https://github.com/BahadirKarsli/OCRContext
|
|
475
|
+
cd OCRContext
|
|
476
|
+
pip install -e '.[dev]'
|
|
477
|
+
pytest # runs without GPU or network — engines and LLM are faked
|
|
478
|
+
ruff check .
|
|
479
|
+
```
|
|
480
|
+
|
|
481
|
+
See [`examples/`](examples/) for runnable smoke tests (image OCR, structured extraction, PDF routing).
|
|
482
|
+
|
|
483
|
+
---
|
|
484
|
+
|
|
485
|
+
## License
|
|
486
|
+
|
|
487
|
+
[MIT](LICENSE) © Bahadır Karslı
|