ocrcontext 0.1.2__tar.gz → 0.1.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. {ocrcontext-0.1.2 → ocrcontext-0.1.4}/CHANGELOG.md +103 -79
  2. {ocrcontext-0.1.2 → ocrcontext-0.1.4}/PKG-INFO +104 -75
  3. {ocrcontext-0.1.2 → ocrcontext-0.1.4}/README.md +461 -420
  4. {ocrcontext-0.1.2 → ocrcontext-0.1.4}/pyproject.toml +3 -13
  5. {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/__init__.py +1 -1
  6. {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/analyzer.py +209 -198
  7. {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/cli.py +84 -20
  8. {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/config.py +3 -1
  9. {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/engines/handwriting.py +7 -34
  10. {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/engines/paddle.py +278 -271
  11. {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/engines/registry.py +68 -67
  12. {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/llm/schemas.py +19 -18
  13. {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/pipeline.py +5 -9
  14. {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/types.py +0 -1
  15. {ocrcontext-0.1.2 → ocrcontext-0.1.4}/tests/test_cli.py +6 -3
  16. {ocrcontext-0.1.2 → ocrcontext-0.1.4}/tests/test_pipeline_analyzer.py +14 -5
  17. {ocrcontext-0.1.2 → ocrcontext-0.1.4}/.gitignore +0 -0
  18. {ocrcontext-0.1.2 → ocrcontext-0.1.4}/LICENSE +0 -0
  19. {ocrcontext-0.1.2 → ocrcontext-0.1.4}/examples/01_quickstart.py +0 -0
  20. {ocrcontext-0.1.2 → ocrcontext-0.1.4}/examples/02_refine_openai.py +0 -0
  21. {ocrcontext-0.1.2 → ocrcontext-0.1.4}/examples/03_structured_invoice.py +0 -0
  22. {ocrcontext-0.1.2 → ocrcontext-0.1.4}/examples/04_local_ollama.py +0 -0
  23. {ocrcontext-0.1.2 → ocrcontext-0.1.4}/examples/image_smoke_test.py +0 -0
  24. {ocrcontext-0.1.2 → ocrcontext-0.1.4}/examples/pdf_smoke_test.py +0 -0
  25. {ocrcontext-0.1.2 → ocrcontext-0.1.4}/examples/structured_smoke_test.py +0 -0
  26. {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/engines/__init__.py +0 -0
  27. {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/engines/base.py +0 -0
  28. {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/engines/pdf_text.py +0 -0
  29. {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/engines/trocr.py +0 -0
  30. {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/engines/vision.py +0 -0
  31. {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/exceptions.py +0 -0
  32. {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/llm/__init__.py +0 -0
  33. {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/llm/drift.py +0 -0
  34. {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/llm/extractor.py +0 -0
  35. {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/llm/formatting.py +0 -0
  36. {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/llm/literal_preserve.py +0 -0
  37. {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/llm/prompts.py +0 -0
  38. {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/llm/refiner.py +0 -0
  39. {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/loaders.py +0 -0
  40. {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/preprocessing/__init__.py +0 -0
  41. {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/preprocessing/image.py +0 -0
  42. {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/py.typed +0 -0
  43. {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/quality.py +0 -0
  44. {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/schemas.py +0 -0
  45. {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/utils/__init__.py +0 -0
  46. {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/utils/files.py +0 -0
  47. {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/utils/lang.py +0 -0
  48. {ocrcontext-0.1.2 → ocrcontext-0.1.4}/tests/__init__.py +0 -0
  49. {ocrcontext-0.1.2 → ocrcontext-0.1.4}/tests/conftest.py +0 -0
  50. {ocrcontext-0.1.2 → ocrcontext-0.1.4}/tests/test_langchain_loader.py +0 -0
  51. {ocrcontext-0.1.2 → ocrcontext-0.1.4}/tests/test_literal_preserve.py +0 -0
  52. {ocrcontext-0.1.2 → ocrcontext-0.1.4}/tests/test_llm.py +0 -0
  53. {ocrcontext-0.1.2 → ocrcontext-0.1.4}/tests/test_schemas.py +0 -0
  54. {ocrcontext-0.1.2 → ocrcontext-0.1.4}/tests/test_text_helpers.py +0 -0
@@ -1,79 +1,103 @@
1
- # Changelog
2
-
3
- All notable changes to **ocrcontext** are documented here.
4
-
5
- The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
- and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
-
8
- ## [Unreleased]
9
-
10
- ## [0.1.2] - 2026-06-26
11
-
12
- ### Fixed
13
- - CI: disable Rich markup mode in typer (`rich_markup_mode=None`) so help output
14
- is plain text on all platforms — Rich's panel renderer produced ANSI escape
15
- codes that CliRunner could not strip on Linux, causing `--help` tests to fail.
16
- - Replace `typing.List` with built-in `list` in schemas for Python 3.12
17
- compatibility and to avoid deprecation warnings.
18
-
19
- ## [0.1.1] - 2026-06-26
20
-
21
- ### Added
22
- - **`OCRContextLoader`** — LangChain `BaseLoader` integration. Drop-in loader for
23
- any LangChain pipeline: `OCRContextLoader("file.pdf").load()` returns a
24
- `Document` with OCR text and metadata (`source`, `text_source`, `pages`,
25
- `confidence`, `refined`).
26
- - **Built-in extraction schemas** — four new ready-to-use Pydantic schemas with
27
- system prompts, importable from `ocrcontext.schemas`:
28
- - `Receipt` / `ReceiptItem` — store name, date, items, subtotal, tax, total,
29
- payment method.
30
- - `Contract` / `ContractParty` — parties, effective/expiry dates, value,
31
- governing law, key obligations.
32
- - `IdCard` — national_id / passport / driver_license / residence_permit with
33
- ICD-standard date normalisation and ISO 3166-1 nationality codes.
34
- - `MedicalReport` / `Medication` — diagnosis, ICD codes, prescriptions, notes.
35
- - **CLI** (`ocrcontext extract`) — terminal-first developer experience via the
36
- new `[cli]` extra (`pip install "ocrcontext[cli]"`):
37
- - `ocrcontext extract invoice.pdf` — plain OCR to stdout.
38
- - `ocrcontext extract scan.pdf --schema receipt --output json` — structured
39
- extraction as JSON.
40
- - `--provider openai|anthropic|ollama|google --model <name>` — bring-your-own
41
- LLM provider.
42
- - `--handwriting`, `--lang`, `--refine auto|yes|no` flags.
43
-
44
- ## [0.1.0] - 2026-06-25
45
-
46
- Initial release — the document extraction core, decoupled from its web stack
47
- into a standalone, LLM-agnostic library.
48
-
49
- ### Added
50
- - **`Analyzer` facade** — 3-line developer experience:
51
- `Analyzer().analyze("file.pdf").text`.
52
- - **Routing ladder** (`pipeline.py`):
53
- - Digital PDFs → PyMuPDF text-layer extraction (no OCR); LLM refine is
54
- auto-skipped so exact text/identifiers are never altered.
55
- - Images / scanned PDFs → PaddleOCR with image preprocessing, multi-language
56
- *coverage-first* candidate selection, and a line-band recovery fallback.
57
- - Handwriting (explicit or auto on insufficient text) → Google Vision primary,
58
- Microsoft TrOCR fallback.
59
- - Multi-page documents joined with `--- Page N ---` separators.
60
- - **LLM-agnostic LLM layer** — works with any LangChain `BaseChatModel`
61
- (`langchain-openai`, `langchain-anthropic`, `langchain-ollama`, ...). Only
62
- `langchain-core` is required at the core.
63
- - `Refiner` — fidelity-first OCR refinement (4 modes) with literal/contact
64
- preservation (`{{OCRLITn}}` masking) and drift/hallucination rejection.
65
- - `StructuredExtractor` + `Analyzer.extract()` / `Analyzer.extract_text()` —
66
- structured extraction into any Pydantic schema via `with_structured_output`.
67
- - Built-in `Invoice` / `LineItem` schemas and prompt.
68
- - **Resource efficiency** — `EngineRegistry` singleton caches PaddleOCR/TrOCR
69
- engines (and per-language models) so they load at most once per process.
70
- - **Windows robustness** — model cache and temp files are routed through ASCII
71
- 8.3 short paths to survive non-ASCII usernames; oneDNN is disabled on CPU to
72
- avoid the PaddlePaddle 3.x PIR/oneDNN `NotImplementedError`.
73
- - **Packaging** — optional extras `[paddle]`, `[trocr]`, `[vision]`, `[all]`;
74
- PEP 561 typed (`py.typed`); examples and a GPU/network-free test suite.
75
-
76
- [Unreleased]: https://github.com/bahadirkarsli/ocrcontext/compare/v0.1.2...HEAD
77
- [0.1.2]: https://github.com/bahadirkarsli/ocrcontext/compare/v0.1.1...v0.1.2
78
- [0.1.1]: https://github.com/bahadirkarsli/ocrcontext/compare/v0.1.0...v0.1.1
79
- [0.1.0]: https://github.com/bahadirkarsli/ocrcontext/releases/tag/v0.1.0
1
+ # Changelog
2
+
3
+ All notable changes to **ocrcontext** are documented here.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [Unreleased]
9
+
10
+ ## [0.1.4] - 2026-06-27
11
+
12
+ ### Added
13
+ - **GPU acceleration** — `Analyzer(use_gpu=True)` routes PaddleOCR inference to a
14
+ CUDA-capable GPU. Requires the GPU build of PaddlePaddle (`pip install paddlepaddle-gpu`).
15
+ CPU remains the default (`use_gpu=False`) so existing code needs no changes.
16
+ The `use_gpu` flag is forwarded through `EngineRegistry` → `PaddleEngine` →
17
+ all `PaddleOCR` constructor profiles, including the version-pinned fallback ladder
18
+ (PP-OCRv6 → PP-OCRv5 → PP-OCRv4 → legacy 2.x).
19
+ - **Vision→Paddle fallback** — when `handwriting=True` and Google Vision returns
20
+ insufficient text (e.g. no credentials, unsupported language), PaddleOCR is tried
21
+ automatically. Users no longer need TrOCR for a handwriting safety net.
22
+
23
+ ### Changed
24
+ - **Removed TrOCR engine** — Microsoft TrOCR (`[trocr]` extra) is removed from the
25
+ project. PaddleOCR outperforms TrOCR on printed text; Google Vision outperforms it
26
+ on handwriting. The `[trocr]` extra and its heavy deps (torch, transformers, etc.)
27
+ are gone. The extras table is now `[paddle]`, `[vision]`, `[cli]`, `[all]`.
28
+ - **`auto_handwriting_fallback` default changed to `False`** — PaddleOCR is now the
29
+ sole default engine. Set `AnalyzerConfig(auto_handwriting_fallback=True)` to enable
30
+ automatic Vision retry on insufficient printed OCR output.
31
+
32
+ ## [0.1.2] - 2026-06-26
33
+
34
+ ### Fixed
35
+ - CI: disable Rich markup mode in typer (`rich_markup_mode=None`) so help output
36
+ is plain text on all platforms — Rich's panel renderer produced ANSI escape
37
+ codes that CliRunner could not strip on Linux, causing `--help` tests to fail.
38
+ - Replace `typing.List` with built-in `list` in schemas for Python 3.12
39
+ compatibility and to avoid deprecation warnings.
40
+
41
+ ## [0.1.1] - 2026-06-26
42
+
43
+ ### Added
44
+ - **`OCRContextLoader`** — LangChain `BaseLoader` integration. Drop-in loader for
45
+ any LangChain pipeline: `OCRContextLoader("file.pdf").load()` returns a
46
+ `Document` with OCR text and metadata (`source`, `text_source`, `pages`,
47
+ `confidence`, `refined`).
48
+ - **Built-in extraction schemas** — four new ready-to-use Pydantic schemas with
49
+ system prompts, importable from `ocrcontext.schemas`:
50
+ - `Receipt` / `ReceiptItem` — store name, date, items, subtotal, tax, total,
51
+ payment method.
52
+ - `Contract` / `ContractParty` — parties, effective/expiry dates, value,
53
+ governing law, key obligations.
54
+ - `IdCard` — national_id / passport / driver_license / residence_permit with
55
+ ICD-standard date normalisation and ISO 3166-1 nationality codes.
56
+ - `MedicalReport` / `Medication` — diagnosis, ICD codes, prescriptions, notes.
57
+ - **CLI** (`ocrcontext extract`) — terminal-first developer experience via the
58
+ new `[cli]` extra (`pip install "ocrcontext[cli]"`):
59
+ - `ocrcontext extract invoice.pdf` — plain OCR to stdout.
60
+ - `ocrcontext extract scan.pdf --schema receipt --output json` — structured
61
+ extraction as JSON.
62
+ - `--provider openai|anthropic|ollama|google --model <name>` — bring-your-own
63
+ LLM provider.
64
+ - `--handwriting`, `--lang`, `--refine auto|yes|no` flags.
65
+
66
+ ## [0.1.0] - 2026-06-25
67
+
68
+ Initial release — the document extraction core, decoupled from its web stack
69
+ into a standalone, LLM-agnostic library.
70
+
71
+ ### Added
72
+ - **`Analyzer` facade** — 3-line developer experience:
73
+ `Analyzer().analyze("file.pdf").text`.
74
+ - **Routing ladder** (`pipeline.py`):
75
+ - Digital PDFs → PyMuPDF text-layer extraction (no OCR); LLM refine is
76
+ auto-skipped so exact text/identifiers are never altered.
77
+ - Images / scanned PDFs → PaddleOCR with image preprocessing, multi-language
78
+ *coverage-first* candidate selection, and a line-band recovery fallback.
79
+ - Handwriting (explicit or auto on insufficient text) → Google Vision primary,
80
+ Microsoft TrOCR fallback.
81
+ - Multi-page documents joined with `--- Page N ---` separators.
82
+ - **LLM-agnostic LLM layer** — works with any LangChain `BaseChatModel`
83
+ (`langchain-openai`, `langchain-anthropic`, `langchain-ollama`, ...). Only
84
+ `langchain-core` is required at the core.
85
+ - `Refiner` — fidelity-first OCR refinement (4 modes) with literal/contact
86
+ preservation (`{{OCRLITn}}` masking) and drift/hallucination rejection.
87
+ - `StructuredExtractor` + `Analyzer.extract()` / `Analyzer.extract_text()` —
88
+ structured extraction into any Pydantic schema via `with_structured_output`.
89
+ - Built-in `Invoice` / `LineItem` schemas and prompt.
90
+ - **Resource efficiency** — `EngineRegistry` singleton caches PaddleOCR/TrOCR
91
+ engines (and per-language models) so they load at most once per process.
92
+ - **Windows robustness** — model cache and temp files are routed through ASCII
93
+ 8.3 short paths to survive non-ASCII usernames; oneDNN is disabled on CPU to
94
+ avoid the PaddlePaddle 3.x PIR/oneDNN `NotImplementedError`.
95
+ - **Packaging** — optional extras `[paddle]`, `[trocr]`, `[vision]`, `[all]`;
96
+ PEP 561 typed (`py.typed`); examples and a GPU/network-free test suite.
97
+
98
+ [Unreleased]: https://github.com/bahadirkarsli/ocrcontext/compare/v0.1.4...HEAD
99
+ [0.1.4]: https://github.com/bahadirkarsli/ocrcontext/compare/v0.1.3...v0.1.4
100
+ [0.1.3]: https://github.com/bahadirkarsli/ocrcontext/compare/v0.1.2...v0.1.3
101
+ [0.1.2]: https://github.com/bahadirkarsli/ocrcontext/compare/v0.1.1...v0.1.2
102
+ [0.1.1]: https://github.com/bahadirkarsli/ocrcontext/compare/v0.1.0...v0.1.1
103
+ [0.1.0]: https://github.com/bahadirkarsli/ocrcontext/releases/tag/v0.1.0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ocrcontext
3
- Version: 0.1.2
3
+ Version: 0.1.4
4
4
  Summary: Decoupled, LLM-agnostic document OCR + structured extraction. Vision and LLM parsing in 3 lines of code.
5
5
  Project-URL: Homepage, https://github.com/BahadirKarsli/OCRContext
6
6
  Project-URL: Repository, https://github.com/BahadirKarsli/OCRContext
@@ -30,15 +30,10 @@ Requires-Dist: pillow>=9.0
30
30
  Requires-Dist: pydantic>=2.5
31
31
  Requires-Dist: pymupdf>=1.23
32
32
  Provides-Extra: all
33
- Requires-Dist: accelerate>=0.27; extra == 'all'
34
33
  Requires-Dist: google-cloud-vision>=3.8.1; extra == 'all'
35
34
  Requires-Dist: opencv-python-headless>=4.8; extra == 'all'
36
35
  Requires-Dist: paddleocr>=2.7.0.3; extra == 'all'
37
36
  Requires-Dist: paddlepaddle>=2.6; extra == 'all'
38
- Requires-Dist: sentencepiece>=0.1.99; extra == 'all'
39
- Requires-Dist: torch>=2.1; extra == 'all'
40
- Requires-Dist: torchvision>=0.16; extra == 'all'
41
- Requires-Dist: transformers>=4.40; extra == 'all'
42
37
  Requires-Dist: typer>=0.12; extra == 'all'
43
38
  Provides-Extra: cli
44
39
  Requires-Dist: typer>=0.12; extra == 'cli'
@@ -53,13 +48,6 @@ Provides-Extra: paddle
53
48
  Requires-Dist: opencv-python-headless>=4.8; extra == 'paddle'
54
49
  Requires-Dist: paddleocr>=2.7.0.3; extra == 'paddle'
55
50
  Requires-Dist: paddlepaddle>=2.6; extra == 'paddle'
56
- Provides-Extra: trocr
57
- Requires-Dist: accelerate>=0.27; extra == 'trocr'
58
- Requires-Dist: opencv-python-headless>=4.8; extra == 'trocr'
59
- Requires-Dist: sentencepiece>=0.1.99; extra == 'trocr'
60
- Requires-Dist: torch>=2.1; extra == 'trocr'
61
- Requires-Dist: torchvision>=0.16; extra == 'trocr'
62
- Requires-Dist: transformers>=4.40; extra == 'trocr'
63
51
  Provides-Extra: vision
64
52
  Requires-Dist: google-cloud-vision>=3.8.1; extra == 'vision'
65
53
  Requires-Dist: opencv-python-headless>=4.8; extra == 'vision'
@@ -81,6 +69,16 @@ Decoupled, LLM-agnostic document OCR + structured extraction. No web server, no
81
69
 
82
70
  </div>
83
71
 
72
+ **Try it in 30 seconds — no Python script needed:**
73
+
74
+ ```bash
75
+ pip install 'ocrcontext[paddle,cli]'
76
+ ocrcontext extract invoice.pdf
77
+ ocrcontext extract receipt.jpg --output json
78
+ ```
79
+
80
+ **Or use the Python API:**
81
+
84
82
  ```python
85
83
  from ocrcontext import Analyzer
86
84
 
@@ -92,11 +90,23 @@ print(result.text)
92
90
 
93
91
  `ocrcontext` is the extraction core of a production document-analysis platform, lifted out of its FastAPI/Next.js stack into a pure, pip-installable library. It handles OCR engine routing, fidelity-first LLM cleanup, and schema-based structured extraction — and gets out of your way.
94
92
 
93
+ ## Demo
94
+
95
+ **Structured invoice extraction from an image:**
96
+
97
+ <img width="100%" alt="Invoice extraction demo" src="https://github.com/user-attachments/assets/8e77ab83-fff3-4929-9a54-7f4a75acc16f" />
98
+
99
+ **Digital PDF text extraction:**
100
+
101
+ <img width="100%" alt="PDF extraction demo" src="https://github.com/user-attachments/assets/84437bd0-9d24-4a2e-8e0c-0014c9e85820" />
102
+
95
103
  ## Contents
96
104
 
105
+ - [Demo](#demo)
97
106
  - [Install](#install)
98
- - [Quick start](#quick-start)
99
107
  - [CLI](#cli)
108
+ - [Quick start (Python API)](#quick-start-python-api)
109
+ - [GPU acceleration](#gpu-acceleration)
100
110
  - [LangChain integration](#langchain-integration)
101
111
  - [Built-in schemas](#built-in-schemas)
102
112
  - [How it routes a document](#how-it-routes-a-document)
@@ -115,8 +125,7 @@ Engines are opt-in so your base install stays small:
115
125
  |---|---|
116
126
  | `pip install ocrcontext` | Digital PDFs only (PyMuPDF text-layer — no OCR, no GPU, no API key) |
117
127
  | `pip install 'ocrcontext[paddle]'` | + printed images & scanned PDFs (PaddleOCR, CPU/GPU) |
118
- | `pip install 'ocrcontext[trocr]'` | + handwriting fallback (Microsoft TrOCR) |
119
- | `pip install 'ocrcontext[vision]'` | + handwriting primary (Google Cloud Vision) |
128
+ | `pip install 'ocrcontext[vision]'` | + handwriting (Google Cloud Vision) |
120
129
  | `pip install 'ocrcontext[cli]'` | + terminal CLI (`ocrcontext extract`) |
121
130
  | `pip install 'ocrcontext[all]'` | everything above |
122
131
 
@@ -141,7 +150,61 @@ $env:GOOGLE_APPLICATION_CREDENTIALS = "C:\path\to\key.json" # PowerShell
141
150
 
142
151
  ---
143
152
 
144
- ## Quick start
153
+ ## CLI
154
+
155
+ Install the `[cli]` extra to use `ocrcontext` straight from the terminal — no Python script needed.
156
+
157
+ ```bash
158
+ pip install 'ocrcontext[paddle,cli]'
159
+ ```
160
+
161
+ **Extract plain text:**
162
+
163
+ ```bash
164
+ ocrcontext extract invoice.pdf
165
+ ocrcontext extract scan.png --output json
166
+ ```
167
+
168
+ **Extract structured data with a built-in schema:**
169
+
170
+ ```bash
171
+ ocrcontext extract invoice.pdf --schema invoice
172
+ ocrcontext extract receipt.jpg --schema receipt
173
+ ocrcontext extract contract.pdf --schema contract
174
+ ocrcontext extract passport.jpg --schema idcard
175
+ ocrcontext extract lab_report.pdf --schema medical
176
+ ```
177
+
178
+ **Choose your LLM provider:**
179
+
180
+ ```bash
181
+ ocrcontext extract invoice.pdf --schema invoice \
182
+ --provider openai --model gpt-4o-mini
183
+
184
+ ocrcontext extract invoice.pdf --schema invoice \
185
+ --provider anthropic --model claude-haiku-4-5-20251001
186
+
187
+ ocrcontext extract invoice.pdf --schema invoice \
188
+ --provider ollama --model llama3.1
189
+ ```
190
+
191
+ **All options:**
192
+
193
+ ```
194
+ ocrcontext extract FILE [OPTIONS]
195
+
196
+ --schema -s invoice | receipt | contract | idcard | medical
197
+ --lang -l Language code (default: en)
198
+ --handwriting Force handwriting engine
199
+ --refine auto (default) | yes | no
200
+ --output -o text (default) | json
201
+ --provider -p openai | anthropic | ollama | google
202
+ --model -m Model name (default: gpt-4o-mini)
203
+ ```
204
+
205
+ ---
206
+
207
+ ## Quick start (Python API)
145
208
 
146
209
  ### Digital PDF
147
210
 
@@ -167,6 +230,26 @@ result = Analyzer().analyze("scan.png")
167
230
  print(result.text, result.confidence)
168
231
  ```
169
232
 
233
+ ### GPU acceleration
234
+
235
+ If you have a CUDA-capable GPU, swap the CPU PaddlePaddle build for the GPU one and pass `use_gpu=True`:
236
+
237
+ ```bash
238
+ pip install 'ocrcontext[paddle]'
239
+ pip install paddlepaddle-gpu # replaces the CPU build; pick the wheel that matches your CUDA version
240
+ ```
241
+
242
+ ```python
243
+ from ocrcontext import Analyzer
244
+
245
+ analyzer = Analyzer(use_gpu=True)
246
+ result = analyzer.analyze("scan.png")
247
+ print(result.text, result.confidence)
248
+ ```
249
+
250
+ > PaddleOCR is typically 5–10× faster on GPU for large documents or batch workloads.
251
+ > CPU (`use_gpu=False`, the default) works out of the box with no extra steps.
252
+
170
253
  ### LLM-refined OCR
171
254
 
172
255
  Refinement fixes character-level OCR errors without paraphrasing, translating, or inventing.
@@ -233,60 +316,6 @@ print(result.text)
233
316
 
234
317
  ---
235
318
 
236
- ## CLI
237
-
238
- Install the `[cli]` extra to use `ocrcontext` straight from the terminal — no Python script needed.
239
-
240
- ```bash
241
- pip install 'ocrcontext[cli]'
242
- ```
243
-
244
- **Extract plain text:**
245
-
246
- ```bash
247
- ocrcontext extract invoice.pdf
248
- ocrcontext extract scan.png --output json
249
- ```
250
-
251
- **Extract structured data with a built-in schema:**
252
-
253
- ```bash
254
- ocrcontext extract invoice.pdf --schema invoice
255
- ocrcontext extract receipt.jpg --schema receipt
256
- ocrcontext extract contract.pdf --schema contract
257
- ocrcontext extract passport.jpg --schema idcard
258
- ocrcontext extract lab_report.pdf --schema medical
259
- ```
260
-
261
- **Choose your LLM provider:**
262
-
263
- ```bash
264
- ocrcontext extract invoice.pdf --schema invoice \
265
- --provider openai --model gpt-4o-mini
266
-
267
- ocrcontext extract invoice.pdf --schema invoice \
268
- --provider anthropic --model claude-haiku-4-5-20251001
269
-
270
- ocrcontext extract invoice.pdf --schema invoice \
271
- --provider ollama --model llama3.1
272
- ```
273
-
274
- **All options:**
275
-
276
- ```
277
- ocrcontext extract FILE [OPTIONS]
278
-
279
- --schema -s invoice | receipt | contract | idcard | medical
280
- --lang -l Language code (default: en)
281
- --handwriting Force handwriting engine
282
- --refine auto (default) | yes | no
283
- --output -o text (default) | json
284
- --provider -p openai | anthropic | ollama | google
285
- --model -m Model name (default: gpt-4o-mini)
286
- ```
287
-
288
- ---
289
-
290
319
  ## LangChain integration
291
320
 
292
321
  `OCRContextLoader` is a drop-in LangChain `BaseLoader`. It slots into any LangChain pipeline — RAG, document Q&A, chain-of-thought — without glue code.
@@ -416,7 +445,7 @@ report = analyzer.extract("lab_report.pdf", schema=MedicalReport)
416
445
  │ │
417
446
  │ 3. Handwriting (explicit or auto)? │
418
447
  │ └─▶ Google Cloud Vision │
419
- │ → TrOCR fallback
448
+ │ → PaddleOCR if Vision empty
420
449
  │ │
421
450
  │ 4. (optional) LLM refine │
422
451
  │ fidelity-first · literal-safe │
@@ -427,7 +456,7 @@ report = analyzer.extract("lab_report.pdf", schema=MedicalReport)
427
456
  ```
428
457
 
429
458
  Multi-page documents are joined with `--- Page N ---` separators.
430
- Handwriting kicks in automatically when printed OCR returns too little text.
459
+ The handwriting step (3) is explicit-only by default; set `auto_handwriting_fallback=True` to retry with it automatically when printed OCR returns too little text.
431
460
 
432
461
  ---
433
462
 
@@ -460,10 +489,10 @@ from ocrcontext import Analyzer, AnalyzerConfig
460
489
  cfg = AnalyzerConfig(
461
490
  lang="tr", # default document language
462
491
  prefer_pdf_text_layer=True, # skip OCR when a text layer exists
463
- auto_handwriting_fallback=True, # retry with handwriting if OCR returns too little
492
+ auto_handwriting_fallback=False, # keep PaddleOCR as sole engine (default); set True to enable Vision fallback
464
493
  refine_by_default=True, # auto-refine whenever an LLM is configured
465
494
  )
466
- analyzer = Analyzer(llm=..., config=cfg)
495
+ analyzer = Analyzer(llm=..., config=cfg, use_gpu=False) # set use_gpu=True for CUDA-capable devices
467
496
  ```
468
497
 
469
498
  ---