ocrcontext 0.1.2__tar.gz → 0.1.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ocrcontext-0.1.2 → ocrcontext-0.1.4}/CHANGELOG.md +103 -79
- {ocrcontext-0.1.2 → ocrcontext-0.1.4}/PKG-INFO +104 -75
- {ocrcontext-0.1.2 → ocrcontext-0.1.4}/README.md +461 -420
- {ocrcontext-0.1.2 → ocrcontext-0.1.4}/pyproject.toml +3 -13
- {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/__init__.py +1 -1
- {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/analyzer.py +209 -198
- {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/cli.py +84 -20
- {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/config.py +3 -1
- {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/engines/handwriting.py +7 -34
- {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/engines/paddle.py +278 -271
- {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/engines/registry.py +68 -67
- {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/llm/schemas.py +19 -18
- {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/pipeline.py +5 -9
- {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/types.py +0 -1
- {ocrcontext-0.1.2 → ocrcontext-0.1.4}/tests/test_cli.py +6 -3
- {ocrcontext-0.1.2 → ocrcontext-0.1.4}/tests/test_pipeline_analyzer.py +14 -5
- {ocrcontext-0.1.2 → ocrcontext-0.1.4}/.gitignore +0 -0
- {ocrcontext-0.1.2 → ocrcontext-0.1.4}/LICENSE +0 -0
- {ocrcontext-0.1.2 → ocrcontext-0.1.4}/examples/01_quickstart.py +0 -0
- {ocrcontext-0.1.2 → ocrcontext-0.1.4}/examples/02_refine_openai.py +0 -0
- {ocrcontext-0.1.2 → ocrcontext-0.1.4}/examples/03_structured_invoice.py +0 -0
- {ocrcontext-0.1.2 → ocrcontext-0.1.4}/examples/04_local_ollama.py +0 -0
- {ocrcontext-0.1.2 → ocrcontext-0.1.4}/examples/image_smoke_test.py +0 -0
- {ocrcontext-0.1.2 → ocrcontext-0.1.4}/examples/pdf_smoke_test.py +0 -0
- {ocrcontext-0.1.2 → ocrcontext-0.1.4}/examples/structured_smoke_test.py +0 -0
- {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/engines/__init__.py +0 -0
- {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/engines/base.py +0 -0
- {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/engines/pdf_text.py +0 -0
- {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/engines/trocr.py +0 -0
- {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/engines/vision.py +0 -0
- {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/exceptions.py +0 -0
- {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/llm/__init__.py +0 -0
- {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/llm/drift.py +0 -0
- {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/llm/extractor.py +0 -0
- {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/llm/formatting.py +0 -0
- {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/llm/literal_preserve.py +0 -0
- {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/llm/prompts.py +0 -0
- {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/llm/refiner.py +0 -0
- {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/loaders.py +0 -0
- {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/preprocessing/__init__.py +0 -0
- {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/preprocessing/image.py +0 -0
- {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/py.typed +0 -0
- {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/quality.py +0 -0
- {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/schemas.py +0 -0
- {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/utils/__init__.py +0 -0
- {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/utils/files.py +0 -0
- {ocrcontext-0.1.2 → ocrcontext-0.1.4}/src/ocrcontext/utils/lang.py +0 -0
- {ocrcontext-0.1.2 → ocrcontext-0.1.4}/tests/__init__.py +0 -0
- {ocrcontext-0.1.2 → ocrcontext-0.1.4}/tests/conftest.py +0 -0
- {ocrcontext-0.1.2 → ocrcontext-0.1.4}/tests/test_langchain_loader.py +0 -0
- {ocrcontext-0.1.2 → ocrcontext-0.1.4}/tests/test_literal_preserve.py +0 -0
- {ocrcontext-0.1.2 → ocrcontext-0.1.4}/tests/test_llm.py +0 -0
- {ocrcontext-0.1.2 → ocrcontext-0.1.4}/tests/test_schemas.py +0 -0
- {ocrcontext-0.1.2 → ocrcontext-0.1.4}/tests/test_text_helpers.py +0 -0
|
@@ -1,79 +1,103 @@
|
|
|
1
|
-
# Changelog
|
|
2
|
-
|
|
3
|
-
All notable changes to **ocrcontext** are documented here.
|
|
4
|
-
|
|
5
|
-
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
|
-
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
-
|
|
8
|
-
## [Unreleased]
|
|
9
|
-
|
|
10
|
-
## [0.1.2] - 2026-06-26
|
|
11
|
-
|
|
12
|
-
### Fixed
|
|
13
|
-
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
-
|
|
51
|
-
|
|
52
|
-
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
-
|
|
60
|
-
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to **ocrcontext** are documented here.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [Unreleased]
|
|
9
|
+
|
|
10
|
+
## [0.1.4] - 2026-06-27
|
|
11
|
+
|
|
12
|
+
### Added
|
|
13
|
+
- **GPU acceleration** — `Analyzer(use_gpu=True)` routes PaddleOCR inference to a
|
|
14
|
+
CUDA-capable GPU. Requires the GPU build of PaddlePaddle (`pip install paddlepaddle-gpu`).
|
|
15
|
+
CPU remains the default (`use_gpu=False`) so existing code needs no changes.
|
|
16
|
+
The `use_gpu` flag is forwarded through `EngineRegistry` → `PaddleEngine` →
|
|
17
|
+
all `PaddleOCR` constructor profiles, including the version-pinned fallback ladder
|
|
18
|
+
(PP-OCRv6 → PP-OCRv5 → PP-OCRv4 → legacy 2.x).
|
|
19
|
+
- **Vision→Paddle fallback** — when `handwriting=True` and Google Vision returns
|
|
20
|
+
insufficient text (e.g. no credentials, unsupported language), PaddleOCR is tried
|
|
21
|
+
automatically. Users no longer need TrOCR for a handwriting safety net.
|
|
22
|
+
|
|
23
|
+
### Changed
|
|
24
|
+
- **Removed TrOCR engine** — Microsoft TrOCR (`[trocr]` extra) is removed from the
|
|
25
|
+
project. PaddleOCR outperforms TrOCR on printed text; Google Vision outperforms it
|
|
26
|
+
on handwriting. The `[trocr]` extra and its heavy deps (torch, transformers, etc.)
|
|
27
|
+
are gone. The extras table is now `[paddle]`, `[vision]`, `[cli]`, `[all]`.
|
|
28
|
+
- **`auto_handwriting_fallback` default changed to `False`** — PaddleOCR is now the
|
|
29
|
+
sole default engine. Set `AnalyzerConfig(auto_handwriting_fallback=True)` to enable
|
|
30
|
+
automatic Vision retry on insufficient printed OCR output.
|
|
31
|
+
|
|
32
|
+
## [0.1.2] - 2026-06-26
|
|
33
|
+
|
|
34
|
+
### Fixed
|
|
35
|
+
- CI: disable Rich markup mode in typer (`rich_markup_mode=None`) so help output
|
|
36
|
+
is plain text on all platforms — Rich's panel renderer produced ANSI escape
|
|
37
|
+
codes that CliRunner could not strip on Linux, causing `--help` tests to fail.
|
|
38
|
+
- Replace `typing.List` with built-in `list` in schemas for Python 3.12
|
|
39
|
+
compatibility and to avoid deprecation warnings.
|
|
40
|
+
|
|
41
|
+
## [0.1.1] - 2026-06-26
|
|
42
|
+
|
|
43
|
+
### Added
|
|
44
|
+
- **`OCRContextLoader`** — LangChain `BaseLoader` integration. Drop-in loader for
|
|
45
|
+
any LangChain pipeline: `OCRContextLoader("file.pdf").load()` returns a
|
|
46
|
+
`Document` with OCR text and metadata (`source`, `text_source`, `pages`,
|
|
47
|
+
`confidence`, `refined`).
|
|
48
|
+
- **Built-in extraction schemas** — four new ready-to-use Pydantic schemas with
|
|
49
|
+
system prompts, importable from `ocrcontext.schemas`:
|
|
50
|
+
- `Receipt` / `ReceiptItem` — store name, date, items, subtotal, tax, total,
|
|
51
|
+
payment method.
|
|
52
|
+
- `Contract` / `ContractParty` — parties, effective/expiry dates, value,
|
|
53
|
+
governing law, key obligations.
|
|
54
|
+
- `IdCard` — national_id / passport / driver_license / residence_permit with
|
|
55
|
+
ICD-standard date normalisation and ISO 3166-1 nationality codes.
|
|
56
|
+
- `MedicalReport` / `Medication` — diagnosis, ICD codes, prescriptions, notes.
|
|
57
|
+
- **CLI** (`ocrcontext extract`) — terminal-first developer experience via the
|
|
58
|
+
new `[cli]` extra (`pip install "ocrcontext[cli]"`):
|
|
59
|
+
- `ocrcontext extract invoice.pdf` — plain OCR to stdout.
|
|
60
|
+
- `ocrcontext extract scan.pdf --schema receipt --output json` — structured
|
|
61
|
+
extraction as JSON.
|
|
62
|
+
- `--provider openai|anthropic|ollama|google --model <name>` — bring-your-own
|
|
63
|
+
LLM provider.
|
|
64
|
+
- `--handwriting`, `--lang`, `--refine auto|yes|no` flags.
|
|
65
|
+
|
|
66
|
+
## [0.1.0] - 2026-06-25
|
|
67
|
+
|
|
68
|
+
Initial release — the document extraction core, decoupled from its web stack
|
|
69
|
+
into a standalone, LLM-agnostic library.
|
|
70
|
+
|
|
71
|
+
### Added
|
|
72
|
+
- **`Analyzer` facade** — 3-line developer experience:
|
|
73
|
+
`Analyzer().analyze("file.pdf").text`.
|
|
74
|
+
- **Routing ladder** (`pipeline.py`):
|
|
75
|
+
- Digital PDFs → PyMuPDF text-layer extraction (no OCR); LLM refine is
|
|
76
|
+
auto-skipped so exact text/identifiers are never altered.
|
|
77
|
+
- Images / scanned PDFs → PaddleOCR with image preprocessing, multi-language
|
|
78
|
+
*coverage-first* candidate selection, and a line-band recovery fallback.
|
|
79
|
+
- Handwriting (explicit or auto on insufficient text) → Google Vision primary,
|
|
80
|
+
Microsoft TrOCR fallback.
|
|
81
|
+
- Multi-page documents joined with `--- Page N ---` separators.
|
|
82
|
+
- **LLM-agnostic LLM layer** — works with any LangChain `BaseChatModel`
|
|
83
|
+
(`langchain-openai`, `langchain-anthropic`, `langchain-ollama`, ...). Only
|
|
84
|
+
`langchain-core` is required at the core.
|
|
85
|
+
- `Refiner` — fidelity-first OCR refinement (4 modes) with literal/contact
|
|
86
|
+
preservation (`{{OCRLITn}}` masking) and drift/hallucination rejection.
|
|
87
|
+
- `StructuredExtractor` + `Analyzer.extract()` / `Analyzer.extract_text()` —
|
|
88
|
+
structured extraction into any Pydantic schema via `with_structured_output`.
|
|
89
|
+
- Built-in `Invoice` / `LineItem` schemas and prompt.
|
|
90
|
+
- **Resource efficiency** — `EngineRegistry` singleton caches PaddleOCR/TrOCR
|
|
91
|
+
engines (and per-language models) so they load at most once per process.
|
|
92
|
+
- **Windows robustness** — model cache and temp files are routed through ASCII
|
|
93
|
+
8.3 short paths to survive non-ASCII usernames; oneDNN is disabled on CPU to
|
|
94
|
+
avoid the PaddlePaddle 3.x PIR/oneDNN `NotImplementedError`.
|
|
95
|
+
- **Packaging** — optional extras `[paddle]`, `[trocr]`, `[vision]`, `[all]`;
|
|
96
|
+
PEP 561 typed (`py.typed`); examples and a GPU/network-free test suite.
|
|
97
|
+
|
|
98
|
+
[Unreleased]: https://github.com/bahadirkarsli/ocrcontext/compare/v0.1.4...HEAD
|
|
99
|
+
[0.1.4]: https://github.com/bahadirkarsli/ocrcontext/compare/v0.1.3...v0.1.4
|
|
100
|
+
[0.1.3]: https://github.com/bahadirkarsli/ocrcontext/compare/v0.1.2...v0.1.3
|
|
101
|
+
[0.1.2]: https://github.com/bahadirkarsli/ocrcontext/compare/v0.1.1...v0.1.2
|
|
102
|
+
[0.1.1]: https://github.com/bahadirkarsli/ocrcontext/compare/v0.1.0...v0.1.1
|
|
103
|
+
[0.1.0]: https://github.com/bahadirkarsli/ocrcontext/releases/tag/v0.1.0
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ocrcontext
|
|
3
|
-
Version: 0.1.2
|
|
3
|
+
Version: 0.1.4
|
|
4
4
|
Summary: Decoupled, LLM-agnostic document OCR + structured extraction. Vision and LLM parsing in 3 lines of code.
|
|
5
5
|
Project-URL: Homepage, https://github.com/BahadirKarsli/OCRContext
|
|
6
6
|
Project-URL: Repository, https://github.com/BahadirKarsli/OCRContext
|
|
@@ -30,15 +30,10 @@ Requires-Dist: pillow>=9.0
|
|
|
30
30
|
Requires-Dist: pydantic>=2.5
|
|
31
31
|
Requires-Dist: pymupdf>=1.23
|
|
32
32
|
Provides-Extra: all
|
|
33
|
-
Requires-Dist: accelerate>=0.27; extra == 'all'
|
|
34
33
|
Requires-Dist: google-cloud-vision>=3.8.1; extra == 'all'
|
|
35
34
|
Requires-Dist: opencv-python-headless>=4.8; extra == 'all'
|
|
36
35
|
Requires-Dist: paddleocr>=2.7.0.3; extra == 'all'
|
|
37
36
|
Requires-Dist: paddlepaddle>=2.6; extra == 'all'
|
|
38
|
-
Requires-Dist: sentencepiece>=0.1.99; extra == 'all'
|
|
39
|
-
Requires-Dist: torch>=2.1; extra == 'all'
|
|
40
|
-
Requires-Dist: torchvision>=0.16; extra == 'all'
|
|
41
|
-
Requires-Dist: transformers>=4.40; extra == 'all'
|
|
42
37
|
Requires-Dist: typer>=0.12; extra == 'all'
|
|
43
38
|
Provides-Extra: cli
|
|
44
39
|
Requires-Dist: typer>=0.12; extra == 'cli'
|
|
@@ -53,13 +48,6 @@ Provides-Extra: paddle
|
|
|
53
48
|
Requires-Dist: opencv-python-headless>=4.8; extra == 'paddle'
|
|
54
49
|
Requires-Dist: paddleocr>=2.7.0.3; extra == 'paddle'
|
|
55
50
|
Requires-Dist: paddlepaddle>=2.6; extra == 'paddle'
|
|
56
|
-
Provides-Extra: trocr
|
|
57
|
-
Requires-Dist: accelerate>=0.27; extra == 'trocr'
|
|
58
|
-
Requires-Dist: opencv-python-headless>=4.8; extra == 'trocr'
|
|
59
|
-
Requires-Dist: sentencepiece>=0.1.99; extra == 'trocr'
|
|
60
|
-
Requires-Dist: torch>=2.1; extra == 'trocr'
|
|
61
|
-
Requires-Dist: torchvision>=0.16; extra == 'trocr'
|
|
62
|
-
Requires-Dist: transformers>=4.40; extra == 'trocr'
|
|
63
51
|
Provides-Extra: vision
|
|
64
52
|
Requires-Dist: google-cloud-vision>=3.8.1; extra == 'vision'
|
|
65
53
|
Requires-Dist: opencv-python-headless>=4.8; extra == 'vision'
|
|
@@ -81,6 +69,16 @@ Decoupled, LLM-agnostic document OCR + structured extraction. No web server, no
|
|
|
81
69
|
|
|
82
70
|
</div>
|
|
83
71
|
|
|
72
|
+
**Try it in 30 seconds — no Python script needed:**
|
|
73
|
+
|
|
74
|
+
```bash
|
|
75
|
+
pip install 'ocrcontext[paddle,cli]'
|
|
76
|
+
ocrcontext extract invoice.pdf
|
|
77
|
+
ocrcontext extract receipt.jpg --output json
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
**Or use the Python API:**
|
|
81
|
+
|
|
84
82
|
```python
|
|
85
83
|
from ocrcontext import Analyzer
|
|
86
84
|
|
|
@@ -92,11 +90,23 @@ print(result.text)
|
|
|
92
90
|
|
|
93
91
|
`ocrcontext` is the extraction core of a production document-analysis platform, lifted out of its FastAPI/Next.js stack into a pure, pip-installable library. It handles OCR engine routing, fidelity-first LLM cleanup, and schema-based structured extraction — and gets out of your way.
|
|
94
92
|
|
|
93
|
+
## Demo
|
|
94
|
+
|
|
95
|
+
**Structured invoice extraction from an image:**
|
|
96
|
+
|
|
97
|
+
<img width="100%" alt="Invoice extraction demo" src="https://github.com/user-attachments/assets/8e77ab83-fff3-4929-9a54-7f4a75acc16f" />
|
|
98
|
+
|
|
99
|
+
**Digital PDF text extraction:**
|
|
100
|
+
|
|
101
|
+
<img width="100%" alt="PDF extraction demo" src="https://github.com/user-attachments/assets/84437bd0-9d24-4a2e-8e0c-0014c9e85820" />
|
|
102
|
+
|
|
95
103
|
## Contents
|
|
96
104
|
|
|
105
|
+
- [Demo](#demo)
|
|
97
106
|
- [Install](#install)
|
|
98
|
-
- [Quick start](#quick-start)
|
|
99
107
|
- [CLI](#cli)
|
|
108
|
+
- [Quick start (Python API)](#quick-start-python-api)
|
|
109
|
+
- [GPU acceleration](#gpu-acceleration)
|
|
100
110
|
- [LangChain integration](#langchain-integration)
|
|
101
111
|
- [Built-in schemas](#built-in-schemas)
|
|
102
112
|
- [How it routes a document](#how-it-routes-a-document)
|
|
@@ -115,8 +125,7 @@ Engines are opt-in so your base install stays small:
|
|
|
115
125
|
|---|---|
|
|
116
126
|
| `pip install ocrcontext` | Digital PDFs only (PyMuPDF text-layer — no OCR, no GPU, no API key) |
|
|
117
127
|
| `pip install 'ocrcontext[paddle]'` | + printed images & scanned PDFs (PaddleOCR, CPU/GPU) |
|
|
118
|
-
| `pip install 'ocrcontext[
|
|
119
|
-
| `pip install 'ocrcontext[vision]'` | + handwriting primary (Google Cloud Vision) |
|
|
128
|
+
| `pip install 'ocrcontext[vision]'` | + handwriting (Google Cloud Vision) |
|
|
120
129
|
| `pip install 'ocrcontext[cli]'` | + terminal CLI (`ocrcontext extract`) |
|
|
121
130
|
| `pip install 'ocrcontext[all]'` | everything above |
|
|
122
131
|
|
|
@@ -141,7 +150,61 @@ $env:GOOGLE_APPLICATION_CREDENTIALS = "C:\path\to\key.json" # PowerShell
|
|
|
141
150
|
|
|
142
151
|
---
|
|
143
152
|
|
|
144
|
-
## Quick start
|
|
153
|
+
## CLI
|
|
154
|
+
|
|
155
|
+
Install the `[cli]` extra to use `ocrcontext` straight from the terminal — no Python script needed.
|
|
156
|
+
|
|
157
|
+
```bash
|
|
158
|
+
pip install 'ocrcontext[paddle,cli]'
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
**Extract plain text:**
|
|
162
|
+
|
|
163
|
+
```bash
|
|
164
|
+
ocrcontext extract invoice.pdf
|
|
165
|
+
ocrcontext extract scan.png --output json
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
**Extract structured data with a built-in schema:**
|
|
169
|
+
|
|
170
|
+
```bash
|
|
171
|
+
ocrcontext extract invoice.pdf --schema invoice
|
|
172
|
+
ocrcontext extract receipt.jpg --schema receipt
|
|
173
|
+
ocrcontext extract contract.pdf --schema contract
|
|
174
|
+
ocrcontext extract passport.jpg --schema idcard
|
|
175
|
+
ocrcontext extract lab_report.pdf --schema medical
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
**Choose your LLM provider:**
|
|
179
|
+
|
|
180
|
+
```bash
|
|
181
|
+
ocrcontext extract invoice.pdf --schema invoice \
|
|
182
|
+
--provider openai --model gpt-4o-mini
|
|
183
|
+
|
|
184
|
+
ocrcontext extract invoice.pdf --schema invoice \
|
|
185
|
+
--provider anthropic --model claude-haiku-4-5-20251001
|
|
186
|
+
|
|
187
|
+
ocrcontext extract invoice.pdf --schema invoice \
|
|
188
|
+
--provider ollama --model llama3.1
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
**All options:**
|
|
192
|
+
|
|
193
|
+
```
|
|
194
|
+
ocrcontext extract FILE [OPTIONS]
|
|
195
|
+
|
|
196
|
+
--schema -s invoice | receipt | contract | idcard | medical
|
|
197
|
+
--lang -l Language code (default: en)
|
|
198
|
+
--handwriting Force handwriting engine
|
|
199
|
+
--refine auto (default) | yes | no
|
|
200
|
+
--output -o text (default) | json
|
|
201
|
+
--provider -p openai | anthropic | ollama | google
|
|
202
|
+
--model -m Model name (default: gpt-4o-mini)
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
---
|
|
206
|
+
|
|
207
|
+
## Quick start (Python API)
|
|
145
208
|
|
|
146
209
|
### Digital PDF
|
|
147
210
|
|
|
@@ -167,6 +230,26 @@ result = Analyzer().analyze("scan.png")
|
|
|
167
230
|
print(result.text, result.confidence)
|
|
168
231
|
```
|
|
169
232
|
|
|
233
|
+
### GPU acceleration
|
|
234
|
+
|
|
235
|
+
If you have a CUDA-capable GPU, swap the CPU PaddlePaddle build for the GPU one and pass `use_gpu=True`:
|
|
236
|
+
|
|
237
|
+
```bash
|
|
238
|
+
pip install 'ocrcontext[paddle]'
|
|
239
|
+
pip install paddlepaddle-gpu # replaces the CPU build; pick the wheel that matches your CUDA version
|
|
240
|
+
```
|
|
241
|
+
|
|
242
|
+
```python
|
|
243
|
+
from ocrcontext import Analyzer
|
|
244
|
+
|
|
245
|
+
analyzer = Analyzer(use_gpu=True)
|
|
246
|
+
result = analyzer.analyze("scan.png")
|
|
247
|
+
print(result.text, result.confidence)
|
|
248
|
+
```
|
|
249
|
+
|
|
250
|
+
> PaddleOCR is typically 5–10× faster on GPU for large documents or batch workloads.
|
|
251
|
+
> CPU (`use_gpu=False`, the default) works out of the box with no extra steps.
|
|
252
|
+
|
|
170
253
|
### LLM-refined OCR
|
|
171
254
|
|
|
172
255
|
Refinement fixes character-level OCR errors without paraphrasing, translating, or inventing.
|
|
@@ -233,60 +316,6 @@ print(result.text)
|
|
|
233
316
|
|
|
234
317
|
---
|
|
235
318
|
|
|
236
|
-
## CLI
|
|
237
|
-
|
|
238
|
-
Install the `[cli]` extra to use `ocrcontext` straight from the terminal — no Python script needed.
|
|
239
|
-
|
|
240
|
-
```bash
|
|
241
|
-
pip install 'ocrcontext[cli]'
|
|
242
|
-
```
|
|
243
|
-
|
|
244
|
-
**Extract plain text:**
|
|
245
|
-
|
|
246
|
-
```bash
|
|
247
|
-
ocrcontext extract invoice.pdf
|
|
248
|
-
ocrcontext extract scan.png --output json
|
|
249
|
-
```
|
|
250
|
-
|
|
251
|
-
**Extract structured data with a built-in schema:**
|
|
252
|
-
|
|
253
|
-
```bash
|
|
254
|
-
ocrcontext extract invoice.pdf --schema invoice
|
|
255
|
-
ocrcontext extract receipt.jpg --schema receipt
|
|
256
|
-
ocrcontext extract contract.pdf --schema contract
|
|
257
|
-
ocrcontext extract passport.jpg --schema idcard
|
|
258
|
-
ocrcontext extract lab_report.pdf --schema medical
|
|
259
|
-
```
|
|
260
|
-
|
|
261
|
-
**Choose your LLM provider:**
|
|
262
|
-
|
|
263
|
-
```bash
|
|
264
|
-
ocrcontext extract invoice.pdf --schema invoice \
|
|
265
|
-
--provider openai --model gpt-4o-mini
|
|
266
|
-
|
|
267
|
-
ocrcontext extract invoice.pdf --schema invoice \
|
|
268
|
-
--provider anthropic --model claude-haiku-4-5-20251001
|
|
269
|
-
|
|
270
|
-
ocrcontext extract invoice.pdf --schema invoice \
|
|
271
|
-
--provider ollama --model llama3.1
|
|
272
|
-
```
|
|
273
|
-
|
|
274
|
-
**All options:**
|
|
275
|
-
|
|
276
|
-
```
|
|
277
|
-
ocrcontext extract FILE [OPTIONS]
|
|
278
|
-
|
|
279
|
-
--schema -s invoice | receipt | contract | idcard | medical
|
|
280
|
-
--lang -l Language code (default: en)
|
|
281
|
-
--handwriting Force handwriting engine
|
|
282
|
-
--refine auto (default) | yes | no
|
|
283
|
-
--output -o text (default) | json
|
|
284
|
-
--provider -p openai | anthropic | ollama | google
|
|
285
|
-
--model -m Model name (default: gpt-4o-mini)
|
|
286
|
-
```
|
|
287
|
-
|
|
288
|
-
---
|
|
289
|
-
|
|
290
319
|
## LangChain integration
|
|
291
320
|
|
|
292
321
|
`OCRContextLoader` is a drop-in LangChain `BaseLoader`. It slots into any LangChain pipeline — RAG, document Q&A, chain-of-thought — without glue code.
|
|
@@ -416,7 +445,7 @@ report = analyzer.extract("lab_report.pdf", schema=MedicalReport)
|
|
|
416
445
|
│ │
|
|
417
446
|
│ 3. Handwriting (explicit or auto)? │
|
|
418
447
|
│ └─▶ Google Cloud Vision │
|
|
419
|
-
│ →
|
|
448
|
+
│ → PaddleOCR if Vision empty │
|
|
420
449
|
│ │
|
|
421
450
|
│ 4. (optional) LLM refine │
|
|
422
451
|
│ fidelity-first · literal-safe │
|
|
@@ -427,7 +456,7 @@ report = analyzer.extract("lab_report.pdf", schema=MedicalReport)
|
|
|
427
456
|
```
|
|
428
457
|
|
|
429
458
|
Multi-page documents are joined with `--- Page N ---` separators.
|
|
430
|
-
Handwriting
|
|
459
|
+
Handwriting step 3 is explicit-only by default; set `auto_handwriting_fallback=True` to enable automatic retry.
|
|
431
460
|
|
|
432
461
|
---
|
|
433
462
|
|
|
@@ -460,10 +489,10 @@ from ocrcontext import Analyzer, AnalyzerConfig
|
|
|
460
489
|
cfg = AnalyzerConfig(
|
|
461
490
|
lang="tr", # default document language
|
|
462
491
|
prefer_pdf_text_layer=True, # skip OCR when a text layer exists
|
|
463
|
-
auto_handwriting_fallback=
|
|
492
|
+
auto_handwriting_fallback=False, # keep PaddleOCR as sole engine (default); set True to enable Vision fallback
|
|
464
493
|
refine_by_default=True, # auto-refine whenever an LLM is configured
|
|
465
494
|
)
|
|
466
|
-
analyzer = Analyzer(llm=..., config=cfg)
|
|
495
|
+
analyzer = Analyzer(llm=..., config=cfg, use_gpu=False) # set use_gpu=True for CUDA-capable devices
|
|
467
496
|
```
|
|
468
497
|
|
|
469
498
|
---
|