ocrcontext 0.1.0__tar.gz → 0.1.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. ocrcontext-0.1.3/.gitignore +41 -0
  2. ocrcontext-0.1.3/CHANGELOG.md +102 -0
  3. ocrcontext-0.1.3/PKG-INFO +505 -0
  4. ocrcontext-0.1.3/README.md +450 -0
  5. {ocrcontext-0.1.0 → ocrcontext-0.1.3}/pyproject.toml +105 -108
  6. {ocrcontext-0.1.0 → ocrcontext-0.1.3}/src/ocrcontext/__init__.py +3 -1
  7. {ocrcontext-0.1.0 → ocrcontext-0.1.3}/src/ocrcontext/analyzer.py +209 -198
  8. ocrcontext-0.1.3/src/ocrcontext/cli.py +188 -0
  9. {ocrcontext-0.1.0 → ocrcontext-0.1.3}/src/ocrcontext/config.py +3 -1
  10. {ocrcontext-0.1.0 → ocrcontext-0.1.3}/src/ocrcontext/engines/handwriting.py +7 -34
  11. {ocrcontext-0.1.0 → ocrcontext-0.1.3}/src/ocrcontext/engines/paddle.py +274 -264
  12. {ocrcontext-0.1.0 → ocrcontext-0.1.3}/src/ocrcontext/engines/registry.py +68 -67
  13. ocrcontext-0.1.3/src/ocrcontext/llm/schemas.py +292 -0
  14. ocrcontext-0.1.3/src/ocrcontext/loaders.py +84 -0
  15. {ocrcontext-0.1.0 → ocrcontext-0.1.3}/src/ocrcontext/pipeline.py +5 -9
  16. ocrcontext-0.1.3/src/ocrcontext/schemas.py +43 -0
  17. {ocrcontext-0.1.0 → ocrcontext-0.1.3}/src/ocrcontext/types.py +0 -1
  18. {ocrcontext-0.1.0 → ocrcontext-0.1.3}/tests/conftest.py +15 -0
  19. ocrcontext-0.1.3/tests/test_cli.py +181 -0
  20. ocrcontext-0.1.3/tests/test_langchain_loader.py +90 -0
  21. {ocrcontext-0.1.0 → ocrcontext-0.1.3}/tests/test_pipeline_analyzer.py +14 -5
  22. ocrcontext-0.1.3/tests/test_schemas.py +192 -0
  23. ocrcontext-0.1.0/.gitignore +0 -27
  24. ocrcontext-0.1.0/CHANGELOG.md +0 -43
  25. ocrcontext-0.1.0/PKG-INFO +0 -207
  26. ocrcontext-0.1.0/README.md +0 -144
  27. ocrcontext-0.1.0/src/ocrcontext/llm/schemas.py +0 -99
  28. ocrcontext-0.1.0/src/ocrcontext/schemas.py +0 -8
  29. {ocrcontext-0.1.0 → ocrcontext-0.1.3}/LICENSE +0 -0
  30. {ocrcontext-0.1.0 → ocrcontext-0.1.3}/examples/01_quickstart.py +0 -0
  31. {ocrcontext-0.1.0 → ocrcontext-0.1.3}/examples/02_refine_openai.py +0 -0
  32. {ocrcontext-0.1.0 → ocrcontext-0.1.3}/examples/03_structured_invoice.py +0 -0
  33. {ocrcontext-0.1.0 → ocrcontext-0.1.3}/examples/04_local_ollama.py +0 -0
  34. {ocrcontext-0.1.0 → ocrcontext-0.1.3}/examples/image_smoke_test.py +0 -0
  35. {ocrcontext-0.1.0 → ocrcontext-0.1.3}/examples/pdf_smoke_test.py +0 -0
  36. {ocrcontext-0.1.0 → ocrcontext-0.1.3}/examples/structured_smoke_test.py +0 -0
  37. {ocrcontext-0.1.0 → ocrcontext-0.1.3}/src/ocrcontext/engines/__init__.py +0 -0
  38. {ocrcontext-0.1.0 → ocrcontext-0.1.3}/src/ocrcontext/engines/base.py +0 -0
  39. {ocrcontext-0.1.0 → ocrcontext-0.1.3}/src/ocrcontext/engines/pdf_text.py +0 -0
  40. {ocrcontext-0.1.0 → ocrcontext-0.1.3}/src/ocrcontext/engines/trocr.py +0 -0
  41. {ocrcontext-0.1.0 → ocrcontext-0.1.3}/src/ocrcontext/engines/vision.py +0 -0
  42. {ocrcontext-0.1.0 → ocrcontext-0.1.3}/src/ocrcontext/exceptions.py +0 -0
  43. {ocrcontext-0.1.0 → ocrcontext-0.1.3}/src/ocrcontext/llm/__init__.py +0 -0
  44. {ocrcontext-0.1.0 → ocrcontext-0.1.3}/src/ocrcontext/llm/drift.py +0 -0
  45. {ocrcontext-0.1.0 → ocrcontext-0.1.3}/src/ocrcontext/llm/extractor.py +0 -0
  46. {ocrcontext-0.1.0 → ocrcontext-0.1.3}/src/ocrcontext/llm/formatting.py +0 -0
  47. {ocrcontext-0.1.0 → ocrcontext-0.1.3}/src/ocrcontext/llm/literal_preserve.py +0 -0
  48. {ocrcontext-0.1.0 → ocrcontext-0.1.3}/src/ocrcontext/llm/prompts.py +0 -0
  49. {ocrcontext-0.1.0 → ocrcontext-0.1.3}/src/ocrcontext/llm/refiner.py +0 -0
  50. {ocrcontext-0.1.0 → ocrcontext-0.1.3}/src/ocrcontext/preprocessing/__init__.py +0 -0
  51. {ocrcontext-0.1.0 → ocrcontext-0.1.3}/src/ocrcontext/preprocessing/image.py +0 -0
  52. {ocrcontext-0.1.0 → ocrcontext-0.1.3}/src/ocrcontext/py.typed +0 -0
  53. {ocrcontext-0.1.0 → ocrcontext-0.1.3}/src/ocrcontext/quality.py +0 -0
  54. {ocrcontext-0.1.0 → ocrcontext-0.1.3}/src/ocrcontext/utils/__init__.py +0 -0
  55. {ocrcontext-0.1.0 → ocrcontext-0.1.3}/src/ocrcontext/utils/files.py +0 -0
  56. {ocrcontext-0.1.0 → ocrcontext-0.1.3}/src/ocrcontext/utils/lang.py +0 -0
  57. {ocrcontext-0.1.0 → ocrcontext-0.1.3}/tests/__init__.py +0 -0
  58. {ocrcontext-0.1.0 → ocrcontext-0.1.3}/tests/test_literal_preserve.py +0 -0
  59. {ocrcontext-0.1.0 → ocrcontext-0.1.3}/tests/test_llm.py +0 -0
  60. {ocrcontext-0.1.0 → ocrcontext-0.1.3}/tests/test_text_helpers.py +0 -0
@@ -0,0 +1,41 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.egg-info/
5
+ .eggs/
6
+ build/
7
+ dist/
8
+ .pytest_cache/
9
+ .ruff_cache/
10
+ .coverage
11
+ htmlcov/
12
+
13
+ # Virtual envs
14
+ .venv/
15
+ venv/
16
+ env/
17
+
18
+ # Models / caches
19
+ .cache/
20
+ *.onnx
21
+ *.pdmodel
22
+ *.pdiparams
23
+
24
+ # OS / editor
25
+ .DS_Store
26
+ .idea/
27
+ .vscode/
28
+
29
+ # Claude session memory — never publish
30
+ CLAUDE.md
31
+
32
+ # Local sample / personal documents — keep them out of the public repo.
33
+ # (The example scripts auto-discover whatever you drop here.)
34
+ examples/*.pdf
35
+ examples/*.png
36
+ examples/*.jpg
37
+ examples/*.jpeg
38
+ examples/*.webp
39
+ examples/*.tif
40
+ examples/*.tiff
41
+
@@ -0,0 +1,102 @@
1
+ # Changelog
2
+
3
+ All notable changes to **ocrcontext** are documented here.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [Unreleased]
9
+
10
+ ## [0.1.3] - 2026-06-27
11
+
12
+ ### Added
13
+ - **GPU acceleration** — `Analyzer(use_gpu=True)` routes PaddleOCR inference to a
14
+ CUDA-capable GPU. Requires the GPU build of PaddlePaddle (`pip install paddlepaddle-gpu`).
15
+ CPU remains the default (`use_gpu=False`) so existing code needs no changes.
16
+ The `use_gpu` flag is forwarded through `EngineRegistry` → `PaddleEngine` →
17
+ all `PaddleOCR` constructor profiles, including the version-pinned fallback ladder
18
+ (PP-OCRv6 → PP-OCRv5 → PP-OCRv4 → legacy 2.x).
19
+ - **Vision→Paddle fallback** — when `handwriting=True` and Google Vision returns
20
+ insufficient text (e.g. no credentials, unsupported language), PaddleOCR is tried
21
+ automatically. Users no longer need TrOCR for a handwriting safety net.
22
+
23
+ ### Changed
24
+ - **Removed TrOCR engine** — Microsoft TrOCR (`[trocr]` extra) is removed from the
25
+ project. PaddleOCR outperforms TrOCR on printed text; Google Vision outperforms it
26
+ on handwriting. The `[trocr]` extra and its heavy deps (torch, transformers, etc.)
27
+ are gone. The extras table is now `[paddle]`, `[vision]`, `[cli]`, `[all]`.
28
+ - **`auto_handwriting_fallback` default changed to `False`** — PaddleOCR is now the
29
+ sole default engine. Set `AnalyzerConfig(auto_handwriting_fallback=True)` to enable
30
+ automatic Vision retry on insufficient printed OCR output.
31
+
32
+ ## [0.1.2] - 2026-06-26
33
+
34
+ ### Fixed
35
+ - CI: disable Rich markup mode in typer (`rich_markup_mode=None`) so help output
36
+ is plain text on all platforms — Rich's panel renderer produced ANSI escape
37
+ codes that CliRunner could not strip on Linux, causing `--help` tests to fail.
38
+ - Replace `typing.List` with built-in `list` in schemas for Python 3.12
39
+ compatibility and to avoid deprecation warnings.
40
+
41
+ ## [0.1.1] - 2026-06-26
42
+
43
+ ### Added
44
+ - **`OCRContextLoader`** — LangChain `BaseLoader` integration. Drop-in loader for
45
+ any LangChain pipeline: `OCRContextLoader("file.pdf").load()` returns a list of
46
+ `Document` objects with OCR text and metadata (`source`, `text_source`, `pages`,
47
+ `confidence`, `refined`).
48
+ - **Built-in extraction schemas** — four new ready-to-use Pydantic schemas with
49
+ system prompts, importable from `ocrcontext.schemas`:
50
+ - `Receipt` / `ReceiptItem` — store name, date, items, subtotal, tax, total,
51
+ payment method.
52
+ - `Contract` / `ContractParty` — parties, effective/expiry dates, value,
53
+ governing law, key obligations.
54
+ - `IdCard` — national_id / passport / driver_license / residence_permit with
55
+ ISO 8601 date normalisation and ISO 3166-1 nationality codes.
56
+ - `MedicalReport` / `Medication` — diagnosis, ICD codes, prescriptions, notes.
57
+ - **CLI** (`ocrcontext extract`) — terminal-first developer experience via the
58
+ new `[cli]` extra (`pip install "ocrcontext[cli]"`):
59
+ - `ocrcontext extract invoice.pdf` — plain OCR to stdout.
60
+ - `ocrcontext extract scan.pdf --schema receipt --output json` — structured
61
+ extraction as JSON.
62
+ - `--provider openai|anthropic|ollama|google --model <name>` — bring-your-own
63
+ LLM provider.
64
+ - `--handwriting`, `--lang`, `--refine auto|yes|no` flags.
65
+
66
+ ## [0.1.0] - 2026-06-25
67
+
68
+ Initial release — the document extraction core, decoupled from its web stack
69
+ into a standalone, LLM-agnostic library.
70
+
71
+ ### Added
72
+ - **`Analyzer` facade** — 3-line developer experience:
73
+ `Analyzer().analyze("file.pdf").text`.
74
+ - **Routing ladder** (`pipeline.py`):
75
+ - Digital PDFs → PyMuPDF text-layer extraction (no OCR); LLM refine is
76
+ auto-skipped so exact text/identifiers are never altered.
77
+ - Images / scanned PDFs → PaddleOCR with image preprocessing, multi-language
78
+ *coverage-first* candidate selection, and a line-band recovery fallback.
79
+ - Handwriting (explicit or auto on insufficient text) → Google Vision primary,
80
+ Microsoft TrOCR fallback.
81
+ - Multi-page documents joined with `--- Page N ---` separators.
82
+ - **LLM-agnostic LLM layer** — works with any LangChain `BaseChatModel`
83
+ (`langchain-openai`, `langchain-anthropic`, `langchain-ollama`, ...). Only
84
+ `langchain-core` is required at the core.
85
+ - `Refiner` — fidelity-first OCR refinement (4 modes) with literal/contact
86
+ preservation (`{{OCRLITn}}` masking) and drift/hallucination rejection.
87
+ - `StructuredExtractor` + `Analyzer.extract()` / `Analyzer.extract_text()` —
88
+ structured extraction into any Pydantic schema via `with_structured_output`.
89
+ - Built-in `Invoice` / `LineItem` schemas and prompt.
90
+ - **Resource efficiency** — `EngineRegistry` singleton caches PaddleOCR/TrOCR
91
+ engines (and per-language models) so they load at most once per process.
92
+ - **Windows robustness** — model cache and temp files are routed through ASCII
93
+ 8.3 short paths to survive non-ASCII usernames; oneDNN is disabled on CPU to
94
+ avoid the PaddlePaddle 3.x PIR/oneDNN `NotImplementedError`.
95
+ - **Packaging** — optional extras `[paddle]`, `[trocr]`, `[vision]`, `[all]`;
96
+ PEP 561 typed (`py.typed`); examples and a GPU/network-free test suite.
97
+
98
+ [Unreleased]: https://github.com/bahadirkarsli/ocrcontext/compare/v0.1.3...HEAD
99
+ [0.1.3]: https://github.com/bahadirkarsli/ocrcontext/compare/v0.1.2...v0.1.3
100
+ [0.1.2]: https://github.com/bahadirkarsli/ocrcontext/compare/v0.1.1...v0.1.2
101
+ [0.1.1]: https://github.com/bahadirkarsli/ocrcontext/compare/v0.1.0...v0.1.1
102
+ [0.1.0]: https://github.com/bahadirkarsli/ocrcontext/releases/tag/v0.1.0
@@ -0,0 +1,505 @@
1
+ Metadata-Version: 2.4
2
+ Name: ocrcontext
3
+ Version: 0.1.3
4
+ Summary: Decoupled, LLM-agnostic document OCR + structured extraction. Vision and LLM parsing in 3 lines of code.
5
+ Project-URL: Homepage, https://github.com/BahadirKarsli/OCRContext
6
+ Project-URL: Repository, https://github.com/BahadirKarsli/OCRContext
7
+ Project-URL: Issues, https://github.com/BahadirKarsli/OCRContext/issues
8
+ Project-URL: Changelog, https://github.com/BahadirKarsli/OCRContext/blob/main/CHANGELOG.md
9
+ Author-email: Bahadır Karslı <bahadrkrsl@outlook.com>
10
+ Maintainer-email: Bahadır Karslı <bahadrkrsl@outlook.com>
11
+ License: MIT
12
+ License-File: LICENSE
13
+ Keywords: document-ai,langchain,ocr,paddleocr,pdf,structured-extraction
14
+ Classifier: Development Status :: 4 - Beta
15
+ Classifier: Intended Audience :: Developers
16
+ Classifier: License :: OSI Approved :: MIT License
17
+ Classifier: Operating System :: OS Independent
18
+ Classifier: Programming Language :: Python :: 3
19
+ Classifier: Programming Language :: Python :: 3.10
20
+ Classifier: Programming Language :: Python :: 3.11
21
+ Classifier: Programming Language :: Python :: 3.12
22
+ Classifier: Programming Language :: Python :: 3.13
23
+ Classifier: Topic :: Scientific/Engineering :: Image Recognition
24
+ Classifier: Topic :: Text Processing :: Linguistic
25
+ Classifier: Typing :: Typed
26
+ Requires-Python: >=3.10
27
+ Requires-Dist: langchain-core>=0.3
28
+ Requires-Dist: numpy>=1.24
29
+ Requires-Dist: pillow>=9.0
30
+ Requires-Dist: pydantic>=2.5
31
+ Requires-Dist: pymupdf>=1.23
32
+ Provides-Extra: all
33
+ Requires-Dist: google-cloud-vision>=3.8.1; extra == 'all'
34
+ Requires-Dist: opencv-python-headless>=4.8; extra == 'all'
35
+ Requires-Dist: paddleocr>=2.7.0.3; extra == 'all'
36
+ Requires-Dist: paddlepaddle>=2.6; extra == 'all'
37
+ Requires-Dist: typer>=0.12; extra == 'all'
38
+ Provides-Extra: cli
39
+ Requires-Dist: typer>=0.12; extra == 'cli'
40
+ Provides-Extra: dev
41
+ Requires-Dist: build>=1.2; extra == 'dev'
42
+ Requires-Dist: pytest-cov>=4.0; extra == 'dev'
43
+ Requires-Dist: pytest>=8.0; extra == 'dev'
44
+ Requires-Dist: ruff>=0.5; extra == 'dev'
45
+ Requires-Dist: twine>=5.0; extra == 'dev'
46
+ Requires-Dist: typer>=0.12; extra == 'dev'
47
+ Provides-Extra: paddle
48
+ Requires-Dist: opencv-python-headless>=4.8; extra == 'paddle'
49
+ Requires-Dist: paddleocr>=2.7.0.3; extra == 'paddle'
50
+ Requires-Dist: paddlepaddle>=2.6; extra == 'paddle'
51
+ Provides-Extra: vision
52
+ Requires-Dist: google-cloud-vision>=3.8.1; extra == 'vision'
53
+ Requires-Dist: opencv-python-headless>=4.8; extra == 'vision'
54
+ Description-Content-Type: text/markdown
55
+
56
+ <div align="center">
57
+
58
+ # OCR Context
59
+
60
+ **Turn any PDF or image into clean text — or a typed Pydantic model — in three lines.**
61
+
62
+ Decoupled, LLM-agnostic document OCR + structured extraction. No web server, no vendor lock-in.
63
+
64
+ [![CI](https://github.com/BahadirKarsli/OCRContext/actions/workflows/ci.yml/badge.svg)](https://github.com/BahadirKarsli/OCRContext/actions/workflows/ci.yml)
65
+ [![PyPI version](https://img.shields.io/pypi/v/ocrcontext.svg?color=blue)](https://pypi.org/project/ocrcontext/)
66
+ [![Python versions](https://img.shields.io/pypi/pyversions/ocrcontext.svg)](https://pypi.org/project/ocrcontext/)
67
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
68
+ [![Typed](https://img.shields.io/badge/typing-PEP%20561-blue.svg)](https://peps.python.org/pep-0561/)
69
+
70
+ </div>
71
+
72
+ **Try it in 30 seconds — no Python script needed:**
73
+
74
+ ```bash
75
+ pip install 'ocrcontext[paddle,cli]'
76
+ ocrcontext extract invoice.pdf
77
+ ocrcontext extract receipt.jpg --output json
78
+ ```
79
+
80
+ **Or use the Python API:**
81
+
82
+ ```python
83
+ from ocrcontext import Analyzer
84
+
85
+ result = Analyzer().analyze("invoice.pdf")
86
+ print(result.text)
87
+ ```
88
+
89
+ ---
90
+
91
+ `ocrcontext` is the extraction core of a production document-analysis platform, lifted out of its FastAPI/Next.js stack into a pure, pip-installable library. It handles OCR engine routing, fidelity-first LLM cleanup, and schema-based structured extraction — and gets out of your way.
92
+
93
+ ## Contents
94
+
95
+ - [Install](#install)
96
+ - [CLI](#cli)
97
+ - [Quick start (Python API)](#quick-start-python-api)
98
+ - [GPU acceleration](#gpu-acceleration)
99
+ - [LangChain integration](#langchain-integration)
100
+ - [Built-in schemas](#built-in-schemas)
101
+ - [How it routes a document](#how-it-routes-a-document)
102
+ - [Refinement modes](#refinement-modes)
103
+ - [Configuration](#configuration)
104
+ - [Development](#development)
105
+ - [License](#license)
106
+
107
+ ---
108
+
109
+ ## Install
110
+
111
+ Engines are opt-in so your base install stays small:
112
+
113
+ | Command | What you get |
114
+ |---|---|
115
+ | `pip install ocrcontext` | Digital PDFs only (PyMuPDF text-layer — no OCR, no GPU, no API key) |
116
+ | `pip install 'ocrcontext[paddle]'` | + printed images & scanned PDFs (PaddleOCR, CPU/GPU) |
117
+ | `pip install 'ocrcontext[vision]'` | + handwriting (Google Cloud Vision) |
118
+ | `pip install 'ocrcontext[cli]'` | + terminal CLI (`ocrcontext extract`) |
119
+ | `pip install 'ocrcontext[all]'` | everything above |
120
+
121
+ Add an LLM provider for refinement and structured extraction:
122
+
123
+ ```bash
124
+ pip install langchain-openai # or langchain-anthropic, langchain-ollama, ...
125
+ ```
126
+
127
+ > **Images and scanned PDFs require `[paddle]`.** Analyzing an image file with only the bare install (`pip install ocrcontext`) raises an `EngineError` with a clear install hint.
128
+
129
+ ### Google Cloud Vision (`[vision]`)
130
+
131
+ 1. Enable the **Cloud Vision API** in [Google Cloud Console](https://console.cloud.google.com/)
132
+ 2. Create a service account key (JSON) under IAM & Admin → Service Accounts → Keys
133
+ 3. Export the path:
134
+
135
+ ```bash
136
+ export GOOGLE_APPLICATION_CREDENTIALS="/path/to/key.json" # Linux/macOS
137
+ $env:GOOGLE_APPLICATION_CREDENTIALS = "C:\path\to\key.json" # PowerShell
138
+ ```
139
+
140
+ ---
141
+
142
+ ## CLI
143
+
144
+ Install the `[cli]` extra to use `ocrcontext` straight from the terminal — no Python script needed.
145
+
146
+ ```bash
147
+ pip install 'ocrcontext[paddle,cli]'
148
+ ```
149
+
150
+ **Extract plain text:**
151
+
152
+ ```bash
153
+ ocrcontext extract invoice.pdf
154
+ ocrcontext extract scan.png --output json
155
+ ```
156
+
157
+ **Extract structured data with a built-in schema:**
158
+
159
+ ```bash
160
+ ocrcontext extract invoice.pdf --schema invoice
161
+ ocrcontext extract receipt.jpg --schema receipt
162
+ ocrcontext extract contract.pdf --schema contract
163
+ ocrcontext extract passport.jpg --schema idcard
164
+ ocrcontext extract lab_report.pdf --schema medical
165
+ ```
166
+
167
+ **Choose your LLM provider:**
168
+
169
+ ```bash
170
+ ocrcontext extract invoice.pdf --schema invoice \
171
+ --provider openai --model gpt-4o-mini
172
+
173
+ ocrcontext extract invoice.pdf --schema invoice \
174
+ --provider anthropic --model claude-haiku-4-5-20251001
175
+
176
+ ocrcontext extract invoice.pdf --schema invoice \
177
+ --provider ollama --model llama3.1
178
+ ```
179
+
180
+ **All options:**
181
+
182
+ ```
183
+ ocrcontext extract FILE [OPTIONS]
184
+
185
+ --schema -s invoice | receipt | contract | idcard | medical
186
+ --lang -l Language code (default: en)
187
+ --handwriting Force handwriting engine
188
+ --refine auto (default) | yes | no
189
+ --output -o text (default) | json
190
+ --provider -p openai | anthropic | ollama | google
191
+ --model -m Model name (default: gpt-4o-mini)
192
+ ```
193
+
194
+ ---
195
+
196
+ ## Quick start (Python API)
197
+
198
+ ### Digital PDF
199
+
200
+ ```python
201
+ from ocrcontext import Analyzer
202
+
203
+ result = Analyzer().analyze("document.pdf")
204
+ print(result.text) # extracted text
205
+ print(result.pages) # page count
206
+ print(result.text_source) # "pdf_text_layer"
207
+ ```
208
+
209
+ ### Image / scanned PDF
210
+
211
+ ```bash
212
+ pip install 'ocrcontext[paddle]'
213
+ ```
214
+
215
+ ```python
216
+ from ocrcontext import Analyzer
217
+
218
+ result = Analyzer().analyze("scan.png")
219
+ print(result.text, result.confidence)
220
+ ```
221
+
222
+ ### GPU acceleration
223
+
224
+ If you have a CUDA-capable GPU, swap the CPU PaddlePaddle build for the GPU one and pass `use_gpu=True`:
225
+
226
+ ```bash
227
+ pip install 'ocrcontext[paddle]'
228
+ pip install paddlepaddle-gpu # replaces the CPU build; pick the wheel that matches your CUDA version
229
+ ```
230
+
231
+ ```python
232
+ from ocrcontext import Analyzer
233
+
234
+ analyzer = Analyzer(use_gpu=True)
235
+ result = analyzer.analyze("scan.png")
236
+ print(result.text, result.confidence)
237
+ ```
238
+
239
+ > PaddleOCR is typically 5–10× faster on GPU for large documents or batch workloads.
240
+ > CPU (`use_gpu=False`, the default) works out of the box with no extra steps.
241
+
242
+ ### LLM-refined OCR
243
+
244
+ Refinement fixes character-level OCR errors without paraphrasing, translating, or inventing.
245
+ Emails, URLs, and IBANs are masked before the model sees them and restored verbatim after.
246
+ Output that drifts too far from the source is rejected in favour of the raw OCR text.
247
+
248
+ ```bash
249
+ pip install 'ocrcontext[paddle]' langchain-openai
250
+ export OPENAI_API_KEY="sk-..."
251
+ ```
252
+
253
+ ```python
254
+ from langchain_openai import ChatOpenAI
255
+ from ocrcontext import Analyzer
256
+
257
+ analyzer = Analyzer(llm=ChatOpenAI(model="gpt-4o-mini"), lang="en")
258
+ result = analyzer.analyze("scan.jpg")
259
+
260
+ print(result.text) # refined
261
+ print(result.raw_text) # original OCR output
262
+ print(result.refined) # True
263
+ ```
264
+
265
+ ### Structured extraction
266
+
267
+ Hand the analyzer a Pydantic schema and get a populated instance back.
268
+
269
+ ```python
270
+ from langchain_openai import ChatOpenAI
271
+ from ocrcontext import Analyzer
272
+ from ocrcontext.schemas import Invoice
273
+
274
+ analyzer = Analyzer(llm=ChatOpenAI(model="gpt-4o-mini", temperature=0))
275
+ invoice = analyzer.extract("invoice.pdf", schema=Invoice)
276
+
277
+ print(invoice.supplier_name, invoice.total_amount, invoice.currency)
278
+ for item in invoice.line_items:
279
+     print(item.description, item.quantity, item.unit_price)
280
+ ```
281
+
282
+ Define your own schema — field descriptions are the prompt:
283
+
284
+ ```python
285
+ from pydantic import BaseModel, Field
286
+
287
+ class ShippingLabel(BaseModel):
288
+     sender: str | None = Field(None, description="Sender full name and address")
289
+     recipient: str | None = Field(None, description="Recipient full name and address")
290
+     tracking_number: str | None = Field(None, description="Carrier tracking number")
291
+
292
+ label = analyzer.extract("label.jpg", schema=ShippingLabel)
293
+ ```
294
+
295
+ ### No API key? Use a local model
296
+
297
+ ```python
298
+ from langchain_ollama import ChatOllama
299
+ from ocrcontext import Analyzer
300
+
301
+ analyzer = Analyzer(llm=ChatOllama(model="llama3.1"))
302
+ result = analyzer.analyze("scan.png")
303
+ print(result.text)
304
+ ```
305
+
306
+ ---
307
+
308
+ ## LangChain integration
309
+
310
+ `OCRContextLoader` is a drop-in LangChain `BaseLoader`. It slots into any LangChain pipeline — RAG, document Q&A, chain-of-thought — without glue code.
311
+
312
+ ```python
313
+ from ocrcontext.loaders import OCRContextLoader
314
+
315
+ # Plain OCR
316
+ loader = OCRContextLoader("contract.pdf")
317
+ docs = loader.load() # -> [Document(page_content="...", metadata={...})]
318
+
319
+ # With LLM refinement
320
+ from langchain_openai import ChatOpenAI
321
+
322
+ loader = OCRContextLoader(
323
+ "scan.pdf",
324
+ llm=ChatOpenAI(model="gpt-4o-mini"),
325
+ lang="en",
326
+ refine="yes",
327
+ )
328
+ docs = loader.load()
329
+ print(docs[0].page_content)
330
+ print(docs[0].metadata)
331
+ # {
332
+ # "source": "scan.pdf",
333
+ # "text_source": "ocr",
334
+ # "pages": 3,
335
+ # "confidence": 0.94,
336
+ # "refined": True,
337
+ # "raw_text": "..."
338
+ # }
339
+ ```
340
+
341
+ **In a RAG pipeline:**
342
+
343
+ ```python
344
+ from langchain_openai import ChatOpenAI, OpenAIEmbeddings
345
+ from langchain_community.vectorstores import FAISS
346
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
347
+ from ocrcontext.loaders import OCRContextLoader
348
+
349
+ docs = OCRContextLoader("annual_report.pdf").load()
350
+ chunks = RecursiveCharacterTextSplitter(chunk_size=1000).split_documents(docs)
351
+ vectorstore = FAISS.from_documents(chunks, OpenAIEmbeddings())
352
+ ```
353
+
354
+ ---
355
+
356
+ ## Built-in schemas
357
+
358
+ Five ready-to-use Pydantic schemas with system prompts, importable from `ocrcontext.schemas`.
359
+ Pass them directly to `analyzer.extract()` or the CLI `--schema` flag.
360
+
361
+ ### Invoice
362
+
363
+ ```python
364
+ from ocrcontext.schemas import Invoice
365
+
366
+ invoice = analyzer.extract("invoice.pdf", schema=Invoice)
367
+ # invoice.supplier_name, .invoice_number, .invoice_date, .total_amount,
368
+ # .currency, .tax_id, .tax_rate, .line_items (list[LineItem])
369
+ ```
370
+
371
+ ### Receipt
372
+
373
+ ```python
374
+ from ocrcontext.schemas import Receipt
375
+
376
+ receipt = analyzer.extract("receipt.jpg", schema=Receipt)
377
+ # receipt.store_name, .date, .time, .total_amount, .tax_amount,
378
+ # .subtotal, .payment_method, .currency, .items (list[ReceiptItem])
379
+ ```
380
+
381
+ ### Contract
382
+
383
+ ```python
384
+ from ocrcontext.schemas import Contract
385
+
386
+ contract = analyzer.extract("agreement.pdf", schema=Contract)
387
+ # contract.title, .effective_date, .expiration_date, .contract_value,
388
+ # .currency, .governing_law, .key_obligations,
389
+ # .parties (list[ContractParty] with .name, .role)
390
+ ```
391
+
392
+ ### IdCard
393
+
394
+ Supports national_id, passport, driver_license, residence_permit.
395
+
396
+ ```python
397
+ from ocrcontext.schemas import IdCard
398
+
399
+ card = analyzer.extract("passport.jpg", schema=IdCard)
400
+ # card.document_type, .full_name, .date_of_birth, .gender,
401
+ # .nationality, .document_number, .issue_date, .expiry_date,
402
+ # .issuing_authority, .address
403
+ ```
404
+
405
+ ### MedicalReport
406
+
407
+ ```python
408
+ from ocrcontext.schemas import MedicalReport
409
+
410
+ report = analyzer.extract("lab_report.pdf", schema=MedicalReport)
411
+ # report.patient_name, .patient_dob, .report_date, .doctor_name,
412
+ # .institution, .diagnosis, .icd_codes (list[str]),
413
+ # .medications (list[Medication]), .notes
414
+ ```
415
+
416
+ ---
417
+
418
+ ## How it routes a document
419
+
420
+ ```
421
+ ┌─────────────┐
422
+ document ───▶│ Analyzer │
423
+ └──────┬──────┘
424
+
425
+ ┌──────────────────────────────────────┐
426
+ │ 1. Digital PDF? │
427
+ │ └─▶ PyMuPDF text layer │
428
+ │ LLM refine auto-skipped │
429
+ │ │
430
+ │ 2. Image / scanned PDF? │
431
+ │ └─▶ PaddleOCR │
432
+ │ (preprocess → coverage-first │
433
+ │ → line-band fallback) │
434
+ │ │
435
+ │ 3. Handwriting (explicit or auto)? │
436
+ │ └─▶ Google Cloud Vision │
437
+ │ → PaddleOCR if Vision empty │
438
+ │ │
439
+ │ 4. (optional) LLM refine │
440
+ │ fidelity-first · literal-safe │
441
+ │ │
442
+ │ 5. (optional) extract(schema) │
443
+ │ └─▶ typed Pydantic model │
444
+ └──────────────────────────────────────┘
445
+ ```
446
+
447
+ Multi-page documents are joined with `--- Page N ---` separators.
448
+ Step 3 (handwriting) runs only when explicitly requested by default; set `auto_handwriting_fallback=True` to retry automatically with Google Vision when printed OCR output is insufficient.
449
+
450
+ ---
451
+
452
+ ## Refinement modes
453
+
454
+ | Mode | When it's used |
455
+ |---|---|
456
+ | `conservative` | Scanned images — minimal char-level correction only |
457
+ | `layout` | Digital PDFs — reconstruct clean structure |
458
+ | `handwriting_layout` | Handwritten notes / lists / diagrams |
459
+ | `handwriting_prose` | Handwritten poems / paragraphs / letters |
460
+
461
+ Modes are auto-selected based on the document type and text content. The handwriting mode choice is driven by whether the text looks like a DIKW/pyramid diagram. All prompts are ported verbatim from the production pipeline.
462
+
463
+ Override manually:
464
+
465
+ ```python
466
+ from ocrcontext import Analyzer, RefinementMode
467
+
468
+ result = analyzer.analyze("scan.png", mode=RefinementMode.CONSERVATIVE)
469
+ ```
470
+
471
+ ---
472
+
473
+ ## Configuration
474
+
475
+ ```python
476
+ from ocrcontext import Analyzer, AnalyzerConfig
477
+
478
+ cfg = AnalyzerConfig(
479
+ lang="tr", # default document language
480
+ prefer_pdf_text_layer=True, # skip OCR when a text layer exists
481
+ auto_handwriting_fallback=False, # keep PaddleOCR as sole engine (default); set True to enable Vision fallback
482
+ refine_by_default=True, # auto-refine whenever an LLM is configured
483
+ )
484
+ analyzer = Analyzer(llm=..., config=cfg, use_gpu=False) # set use_gpu=True for CUDA-capable devices
485
+ ```
486
+
487
+ ---
488
+
489
+ ## Development
490
+
491
+ ```bash
492
+ git clone https://github.com/BahadirKarsli/OCRContext
493
+ cd OCRContext
494
+ pip install -e '.[dev]'
495
+ pytest # runs without GPU or network — engines and LLM are faked
496
+ ruff check .
497
+ ```
498
+
499
+ See [`examples/`](examples/) for runnable smoke tests (image OCR, structured extraction, PDF routing).
500
+
501
+ ---
502
+
503
+ ## License
504
+
505
+ [MIT](LICENSE) © Bahadır Karslı