ocrcontext 0.1.0__tar.gz → 0.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. ocrcontext-0.1.2/.gitignore +41 -0
  2. {ocrcontext-0.1.0 → ocrcontext-0.1.2}/CHANGELOG.md +37 -1
  3. ocrcontext-0.1.2/PKG-INFO +487 -0
  4. ocrcontext-0.1.2/README.md +420 -0
  5. {ocrcontext-0.1.0 → ocrcontext-0.1.2}/pyproject.toml +115 -108
  6. {ocrcontext-0.1.0 → ocrcontext-0.1.2}/src/ocrcontext/__init__.py +3 -1
  7. ocrcontext-0.1.2/src/ocrcontext/cli.py +188 -0
  8. {ocrcontext-0.1.0 → ocrcontext-0.1.2}/src/ocrcontext/engines/paddle.py +17 -10
  9. ocrcontext-0.1.2/src/ocrcontext/llm/schemas.py +292 -0
  10. ocrcontext-0.1.2/src/ocrcontext/loaders.py +84 -0
  11. ocrcontext-0.1.2/src/ocrcontext/schemas.py +43 -0
  12. {ocrcontext-0.1.0 → ocrcontext-0.1.2}/tests/conftest.py +15 -0
  13. ocrcontext-0.1.2/tests/test_cli.py +181 -0
  14. ocrcontext-0.1.2/tests/test_langchain_loader.py +90 -0
  15. ocrcontext-0.1.2/tests/test_schemas.py +192 -0
  16. ocrcontext-0.1.0/.gitignore +0 -27
  17. ocrcontext-0.1.0/PKG-INFO +0 -207
  18. ocrcontext-0.1.0/README.md +0 -144
  19. ocrcontext-0.1.0/src/ocrcontext/llm/schemas.py +0 -99
  20. ocrcontext-0.1.0/src/ocrcontext/schemas.py +0 -8
  21. {ocrcontext-0.1.0 → ocrcontext-0.1.2}/LICENSE +0 -0
  22. {ocrcontext-0.1.0 → ocrcontext-0.1.2}/examples/01_quickstart.py +0 -0
  23. {ocrcontext-0.1.0 → ocrcontext-0.1.2}/examples/02_refine_openai.py +0 -0
  24. {ocrcontext-0.1.0 → ocrcontext-0.1.2}/examples/03_structured_invoice.py +0 -0
  25. {ocrcontext-0.1.0 → ocrcontext-0.1.2}/examples/04_local_ollama.py +0 -0
  26. {ocrcontext-0.1.0 → ocrcontext-0.1.2}/examples/image_smoke_test.py +0 -0
  27. {ocrcontext-0.1.0 → ocrcontext-0.1.2}/examples/pdf_smoke_test.py +0 -0
  28. {ocrcontext-0.1.0 → ocrcontext-0.1.2}/examples/structured_smoke_test.py +0 -0
  29. {ocrcontext-0.1.0 → ocrcontext-0.1.2}/src/ocrcontext/analyzer.py +0 -0
  30. {ocrcontext-0.1.0 → ocrcontext-0.1.2}/src/ocrcontext/config.py +0 -0
  31. {ocrcontext-0.1.0 → ocrcontext-0.1.2}/src/ocrcontext/engines/__init__.py +0 -0
  32. {ocrcontext-0.1.0 → ocrcontext-0.1.2}/src/ocrcontext/engines/base.py +0 -0
  33. {ocrcontext-0.1.0 → ocrcontext-0.1.2}/src/ocrcontext/engines/handwriting.py +0 -0
  34. {ocrcontext-0.1.0 → ocrcontext-0.1.2}/src/ocrcontext/engines/pdf_text.py +0 -0
  35. {ocrcontext-0.1.0 → ocrcontext-0.1.2}/src/ocrcontext/engines/registry.py +0 -0
  36. {ocrcontext-0.1.0 → ocrcontext-0.1.2}/src/ocrcontext/engines/trocr.py +0 -0
  37. {ocrcontext-0.1.0 → ocrcontext-0.1.2}/src/ocrcontext/engines/vision.py +0 -0
  38. {ocrcontext-0.1.0 → ocrcontext-0.1.2}/src/ocrcontext/exceptions.py +0 -0
  39. {ocrcontext-0.1.0 → ocrcontext-0.1.2}/src/ocrcontext/llm/__init__.py +0 -0
  40. {ocrcontext-0.1.0 → ocrcontext-0.1.2}/src/ocrcontext/llm/drift.py +0 -0
  41. {ocrcontext-0.1.0 → ocrcontext-0.1.2}/src/ocrcontext/llm/extractor.py +0 -0
  42. {ocrcontext-0.1.0 → ocrcontext-0.1.2}/src/ocrcontext/llm/formatting.py +0 -0
  43. {ocrcontext-0.1.0 → ocrcontext-0.1.2}/src/ocrcontext/llm/literal_preserve.py +0 -0
  44. {ocrcontext-0.1.0 → ocrcontext-0.1.2}/src/ocrcontext/llm/prompts.py +0 -0
  45. {ocrcontext-0.1.0 → ocrcontext-0.1.2}/src/ocrcontext/llm/refiner.py +0 -0
  46. {ocrcontext-0.1.0 → ocrcontext-0.1.2}/src/ocrcontext/pipeline.py +0 -0
  47. {ocrcontext-0.1.0 → ocrcontext-0.1.2}/src/ocrcontext/preprocessing/__init__.py +0 -0
  48. {ocrcontext-0.1.0 → ocrcontext-0.1.2}/src/ocrcontext/preprocessing/image.py +0 -0
  49. {ocrcontext-0.1.0 → ocrcontext-0.1.2}/src/ocrcontext/py.typed +0 -0
  50. {ocrcontext-0.1.0 → ocrcontext-0.1.2}/src/ocrcontext/quality.py +0 -0
  51. {ocrcontext-0.1.0 → ocrcontext-0.1.2}/src/ocrcontext/types.py +0 -0
  52. {ocrcontext-0.1.0 → ocrcontext-0.1.2}/src/ocrcontext/utils/__init__.py +0 -0
  53. {ocrcontext-0.1.0 → ocrcontext-0.1.2}/src/ocrcontext/utils/files.py +0 -0
  54. {ocrcontext-0.1.0 → ocrcontext-0.1.2}/src/ocrcontext/utils/lang.py +0 -0
  55. {ocrcontext-0.1.0 → ocrcontext-0.1.2}/tests/__init__.py +0 -0
  56. {ocrcontext-0.1.0 → ocrcontext-0.1.2}/tests/test_literal_preserve.py +0 -0
  57. {ocrcontext-0.1.0 → ocrcontext-0.1.2}/tests/test_llm.py +0 -0
  58. {ocrcontext-0.1.0 → ocrcontext-0.1.2}/tests/test_pipeline_analyzer.py +0 -0
  59. {ocrcontext-0.1.0 → ocrcontext-0.1.2}/tests/test_text_helpers.py +0 -0
@@ -0,0 +1,41 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.egg-info/
5
+ .eggs/
6
+ build/
7
+ dist/
8
+ .pytest_cache/
9
+ .ruff_cache/
10
+ .coverage
11
+ htmlcov/
12
+
13
+ # Virtual envs
14
+ .venv/
15
+ venv/
16
+ env/
17
+
18
+ # Models / caches
19
+ .cache/
20
+ *.onnx
21
+ *.pdmodel
22
+ *.pdiparams
23
+
24
+ # OS / editor
25
+ .DS_Store
26
+ .idea/
27
+ .vscode/
28
+
29
+ # Claude session memory — never publish
30
+ CLAUDE.md
31
+
32
+ # Local sample / personal documents — keep them out of the public repo.
33
+ # (The example scripts auto-discover whatever you drop here.)
34
+ examples/*.pdf
35
+ examples/*.png
36
+ examples/*.jpg
37
+ examples/*.jpeg
38
+ examples/*.webp
39
+ examples/*.tif
40
+ examples/*.tiff
41
+
@@ -7,6 +7,40 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [0.1.2] - 2026-06-26
11
+
12
+ ### Fixed
13
+ - CI: disable Rich markup mode in typer (`rich_markup_mode=None`) so help output
14
+ is plain text on all platforms — Rich's panel renderer produced ANSI escape
15
+ codes that CliRunner could not strip on Linux, causing `--help` tests to fail.
16
+ - Replace `typing.List` with built-in `list` in schemas for Python 3.12
17
+ compatibility and to avoid deprecation warnings.
18
+
19
+ ## [0.1.1] - 2026-06-26
20
+
21
+ ### Added
22
+ - **`OCRContextLoader`** — LangChain `BaseLoader` integration. Drop-in loader for
23
+ any LangChain pipeline: `OCRContextLoader("file.pdf").load()` returns a
24
+ `Document` with OCR text and metadata (`source`, `text_source`, `pages`,
25
+ `confidence`, `refined`).
26
+ - **Built-in extraction schemas** — four new ready-to-use Pydantic schemas with
27
+ system prompts, importable from `ocrcontext.schemas`:
28
+ - `Receipt` / `ReceiptItem` — store name, date, items, subtotal, tax, total,
29
+ payment method.
30
+ - `Contract` / `ContractParty` — parties, effective/expiry dates, value,
31
+ governing law, key obligations.
32
+ - `IdCard` — national_id / passport / driver_license / residence_permit with
33
+ ISO 8601 date normalisation and ISO 3166-1 nationality codes.
34
+ - `MedicalReport` / `Medication` — diagnosis, ICD codes, prescriptions, notes.
35
+ - **CLI** (`ocrcontext extract`) — terminal-first developer experience via the
36
+ new `[cli]` extra (`pip install "ocrcontext[cli]"`):
37
+ - `ocrcontext extract invoice.pdf` — plain OCR to stdout.
38
+ - `ocrcontext extract scan.pdf --schema receipt --output json` — structured
39
+ extraction as JSON.
40
+ - `--provider openai|anthropic|ollama|google --model <name>` — bring-your-own
41
+ LLM provider.
42
+ - `--handwriting`, `--lang`, `--refine auto|yes|no` flags.
43
+
10
44
  ## [0.1.0] - 2026-06-25
11
45
 
12
46
  Initial release — the document extraction core, decoupled from its web stack
@@ -39,5 +73,7 @@ into a standalone, LLM-agnostic library.
39
73
  - **Packaging** — optional extras `[paddle]`, `[trocr]`, `[vision]`, `[all]`;
40
74
  PEP 561 typed (`py.typed`); examples and a GPU/network-free test suite.
41
75
 
42
- [Unreleased]: https://github.com/bahadirkarsli/ocrcontext/compare/v0.1.0...HEAD
76
+ [Unreleased]: https://github.com/bahadirkarsli/ocrcontext/compare/v0.1.2...HEAD
77
+ [0.1.2]: https://github.com/bahadirkarsli/ocrcontext/compare/v0.1.1...v0.1.2
78
+ [0.1.1]: https://github.com/bahadirkarsli/ocrcontext/compare/v0.1.0...v0.1.1
43
79
  [0.1.0]: https://github.com/bahadirkarsli/ocrcontext/releases/tag/v0.1.0
@@ -0,0 +1,487 @@
1
+ Metadata-Version: 2.4
2
+ Name: ocrcontext
3
+ Version: 0.1.2
4
+ Summary: Decoupled, LLM-agnostic document OCR + structured extraction. Vision and LLM parsing in 3 lines of code.
5
+ Project-URL: Homepage, https://github.com/BahadirKarsli/OCRContext
6
+ Project-URL: Repository, https://github.com/BahadirKarsli/OCRContext
7
+ Project-URL: Issues, https://github.com/BahadirKarsli/OCRContext/issues
8
+ Project-URL: Changelog, https://github.com/BahadirKarsli/OCRContext/blob/main/CHANGELOG.md
9
+ Author-email: Bahadır Karslı <bahadrkrsl@outlook.com>
10
+ Maintainer-email: Bahadır Karslı <bahadrkrsl@outlook.com>
11
+ License: MIT
12
+ License-File: LICENSE
13
+ Keywords: document-ai,langchain,ocr,paddleocr,pdf,structured-extraction
14
+ Classifier: Development Status :: 4 - Beta
15
+ Classifier: Intended Audience :: Developers
16
+ Classifier: License :: OSI Approved :: MIT License
17
+ Classifier: Operating System :: OS Independent
18
+ Classifier: Programming Language :: Python :: 3
19
+ Classifier: Programming Language :: Python :: 3.10
20
+ Classifier: Programming Language :: Python :: 3.11
21
+ Classifier: Programming Language :: Python :: 3.12
22
+ Classifier: Programming Language :: Python :: 3.13
23
+ Classifier: Topic :: Scientific/Engineering :: Image Recognition
24
+ Classifier: Topic :: Text Processing :: Linguistic
25
+ Classifier: Typing :: Typed
26
+ Requires-Python: >=3.10
27
+ Requires-Dist: langchain-core>=0.3
28
+ Requires-Dist: numpy>=1.24
29
+ Requires-Dist: pillow>=9.0
30
+ Requires-Dist: pydantic>=2.5
31
+ Requires-Dist: pymupdf>=1.23
32
+ Provides-Extra: all
33
+ Requires-Dist: accelerate>=0.27; extra == 'all'
34
+ Requires-Dist: google-cloud-vision>=3.8.1; extra == 'all'
35
+ Requires-Dist: opencv-python-headless>=4.8; extra == 'all'
36
+ Requires-Dist: paddleocr>=2.7.0.3; extra == 'all'
37
+ Requires-Dist: paddlepaddle>=2.6; extra == 'all'
38
+ Requires-Dist: sentencepiece>=0.1.99; extra == 'all'
39
+ Requires-Dist: torch>=2.1; extra == 'all'
40
+ Requires-Dist: torchvision>=0.16; extra == 'all'
41
+ Requires-Dist: transformers>=4.40; extra == 'all'
42
+ Requires-Dist: typer>=0.12; extra == 'all'
43
+ Provides-Extra: cli
44
+ Requires-Dist: typer>=0.12; extra == 'cli'
45
+ Provides-Extra: dev
46
+ Requires-Dist: build>=1.2; extra == 'dev'
47
+ Requires-Dist: pytest-cov>=4.0; extra == 'dev'
48
+ Requires-Dist: pytest>=8.0; extra == 'dev'
49
+ Requires-Dist: ruff>=0.5; extra == 'dev'
50
+ Requires-Dist: twine>=5.0; extra == 'dev'
51
+ Requires-Dist: typer>=0.12; extra == 'dev'
52
+ Provides-Extra: paddle
53
+ Requires-Dist: opencv-python-headless>=4.8; extra == 'paddle'
54
+ Requires-Dist: paddleocr>=2.7.0.3; extra == 'paddle'
55
+ Requires-Dist: paddlepaddle>=2.6; extra == 'paddle'
56
+ Provides-Extra: trocr
57
+ Requires-Dist: accelerate>=0.27; extra == 'trocr'
58
+ Requires-Dist: opencv-python-headless>=4.8; extra == 'trocr'
59
+ Requires-Dist: sentencepiece>=0.1.99; extra == 'trocr'
60
+ Requires-Dist: torch>=2.1; extra == 'trocr'
61
+ Requires-Dist: torchvision>=0.16; extra == 'trocr'
62
+ Requires-Dist: transformers>=4.40; extra == 'trocr'
63
+ Provides-Extra: vision
64
+ Requires-Dist: google-cloud-vision>=3.8.1; extra == 'vision'
65
+ Requires-Dist: opencv-python-headless>=4.8; extra == 'vision'
66
+ Description-Content-Type: text/markdown
67
+
68
+ <div align="center">
69
+
70
+ # OCR Context
71
+
72
+ **Turn any PDF or image into clean text — or a typed Pydantic model — in three lines.**
73
+
74
+ Decoupled, LLM-agnostic document OCR + structured extraction. No web server, no vendor lock-in.
75
+
76
+ [![CI](https://github.com/BahadirKarsli/OCRContext/actions/workflows/ci.yml/badge.svg)](https://github.com/BahadirKarsli/OCRContext/actions/workflows/ci.yml)
77
+ [![PyPI version](https://img.shields.io/pypi/v/ocrcontext.svg?color=blue)](https://pypi.org/project/ocrcontext/)
78
+ [![Python versions](https://img.shields.io/pypi/pyversions/ocrcontext.svg)](https://pypi.org/project/ocrcontext/)
79
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
80
+ [![Typed](https://img.shields.io/badge/typing-PEP%20561-blue.svg)](https://peps.python.org/pep-0561/)
81
+
82
+ </div>
83
+
84
+ ```python
85
+ from ocrcontext import Analyzer
86
+
87
+ result = Analyzer().analyze("invoice.pdf")
88
+ print(result.text)
89
+ ```
90
+
91
+ ---
92
+
93
+ `ocrcontext` is the extraction core of a production document-analysis platform, lifted out of its FastAPI/Next.js stack into a pure, pip-installable library. It handles OCR engine routing, fidelity-first LLM cleanup, and schema-based structured extraction — and gets out of your way.
94
+
95
+ ## Contents
96
+
97
+ - [Install](#install)
98
+ - [Quick start](#quick-start)
99
+ - [CLI](#cli)
100
+ - [LangChain integration](#langchain-integration)
101
+ - [Built-in schemas](#built-in-schemas)
102
+ - [How it routes a document](#how-it-routes-a-document)
103
+ - [Refinement modes](#refinement-modes)
104
+ - [Configuration](#configuration)
105
+ - [Development](#development)
106
+ - [License](#license)
107
+
108
+ ---
109
+
110
+ ## Install
111
+
112
+ Engines are opt-in so your base install stays small:
113
+
114
+ | Command | What you get |
115
+ |---|---|
116
+ | `pip install ocrcontext` | Digital PDFs only (PyMuPDF text-layer — no OCR, no GPU, no API key) |
117
+ | `pip install 'ocrcontext[paddle]'` | + printed images & scanned PDFs (PaddleOCR, CPU/GPU) |
118
+ | `pip install 'ocrcontext[trocr]'` | + handwriting fallback (Microsoft TrOCR) |
119
+ | `pip install 'ocrcontext[vision]'` | + handwriting primary (Google Cloud Vision) |
120
+ | `pip install 'ocrcontext[cli]'` | + terminal CLI (`ocrcontext extract`) |
121
+ | `pip install 'ocrcontext[all]'` | everything above |
122
+
123
+ Add an LLM provider for refinement and structured extraction:
124
+
125
+ ```bash
126
+ pip install langchain-openai # or langchain-anthropic, langchain-ollama, ...
127
+ ```
128
+
129
+ > **Images and scanned PDFs require `[paddle]`.** With only the base install (`pip install ocrcontext`), passing an image file to the analyzer raises an `EngineError` with a clear install hint.
130
+
131
+ ### Google Cloud Vision (`[vision]`)
132
+
133
+ 1. Enable the **Cloud Vision API** in [Google Cloud Console](https://console.cloud.google.com/)
134
+ 2. Create a service account key (JSON) under IAM & Admin → Service Accounts → Keys
135
+ 3. Export the path:
136
+
137
+ ```bash
138
+ export GOOGLE_APPLICATION_CREDENTIALS="/path/to/key.json" # Linux/macOS
139
+ $env:GOOGLE_APPLICATION_CREDENTIALS = "C:\path\to\key.json" # PowerShell
140
+ ```
141
+
142
+ ---
143
+
144
+ ## Quick start
145
+
146
+ ### Digital PDF
147
+
148
+ ```python
149
+ from ocrcontext import Analyzer
150
+
151
+ result = Analyzer().analyze("document.pdf")
152
+ print(result.text) # extracted text
153
+ print(result.pages) # page count
154
+ print(result.text_source) # "pdf_text_layer"
155
+ ```
156
+
157
+ ### Image / scanned PDF
158
+
159
+ ```bash
160
+ pip install 'ocrcontext[paddle]'
161
+ ```
162
+
163
+ ```python
164
+ from ocrcontext import Analyzer
165
+
166
+ result = Analyzer().analyze("scan.png")
167
+ print(result.text, result.confidence)
168
+ ```
169
+
170
+ ### LLM-refined OCR
171
+
172
+ Refinement fixes character-level OCR errors without paraphrasing, translating, or inventing.
173
+ Emails, URLs, and IBANs are masked before the model sees them and restored verbatim after.
174
+ Output that drifts too far from the source is rejected in favour of the raw OCR text.
175
+
176
+ ```bash
177
+ pip install 'ocrcontext[paddle]' langchain-openai
178
+ export OPENAI_API_KEY="sk-..."
179
+ ```
180
+
181
+ ```python
182
+ from langchain_openai import ChatOpenAI
183
+ from ocrcontext import Analyzer
184
+
185
+ analyzer = Analyzer(llm=ChatOpenAI(model="gpt-4o-mini"), lang="en")
186
+ result = analyzer.analyze("scan.jpg")
187
+
188
+ print(result.text) # refined
189
+ print(result.raw_text) # original OCR output
190
+ print(result.refined) # True
191
+ ```
192
+
193
+ ### Structured extraction
194
+
195
+ Hand the analyzer a Pydantic schema and get a populated instance back.
196
+
197
+ ```python
198
+ from langchain_openai import ChatOpenAI
199
+ from ocrcontext import Analyzer
200
+ from ocrcontext.schemas import Invoice
201
+
202
+ analyzer = Analyzer(llm=ChatOpenAI(model="gpt-4o-mini", temperature=0))
203
+ invoice = analyzer.extract("invoice.pdf", schema=Invoice)
204
+
205
+ print(invoice.supplier_name, invoice.total_amount, invoice.currency)
206
+ for item in invoice.line_items:
207
+ print(item.description, item.quantity, item.unit_price)
208
+ ```
209
+
210
+ Define your own schema — field descriptions are the prompt:
211
+
212
+ ```python
213
+ from pydantic import BaseModel, Field
214
+
215
+ class ShippingLabel(BaseModel):
216
+ sender: str | None = Field(None, description="Sender full name and address")
217
+ recipient: str | None = Field(None, description="Recipient full name and address")
218
+ tracking_number: str | None = Field(None, description="Carrier tracking number")
219
+
220
+ label = analyzer.extract("label.jpg", schema=ShippingLabel)
221
+ ```
222
+
223
+ ### No API key? Use a local model
224
+
225
+ ```python
226
+ from langchain_ollama import ChatOllama
227
+ from ocrcontext import Analyzer
228
+
229
+ analyzer = Analyzer(llm=ChatOllama(model="llama3.1"))
230
+ result = analyzer.analyze("scan.png")
231
+ print(result.text)
232
+ ```
233
+
234
+ ---
235
+
236
+ ## CLI
237
+
238
+ Install the `[cli]` extra to use `ocrcontext` straight from the terminal — no Python script needed.
239
+
240
+ ```bash
241
+ pip install 'ocrcontext[cli]'
242
+ ```
243
+
244
+ **Extract plain text:**
245
+
246
+ ```bash
247
+ ocrcontext extract invoice.pdf
248
+ ocrcontext extract scan.png --output json
249
+ ```
250
+
251
+ **Extract structured data with a built-in schema:**
252
+
253
+ ```bash
254
+ ocrcontext extract invoice.pdf --schema invoice
255
+ ocrcontext extract receipt.jpg --schema receipt
256
+ ocrcontext extract contract.pdf --schema contract
257
+ ocrcontext extract passport.jpg --schema idcard
258
+ ocrcontext extract lab_report.pdf --schema medical
259
+ ```
260
+
261
+ **Choose your LLM provider:**
262
+
263
+ ```bash
264
+ ocrcontext extract invoice.pdf --schema invoice \
265
+ --provider openai --model gpt-4o-mini
266
+
267
+ ocrcontext extract invoice.pdf --schema invoice \
268
+ --provider anthropic --model claude-haiku-4-5-20251001
269
+
270
+ ocrcontext extract invoice.pdf --schema invoice \
271
+ --provider ollama --model llama3.1
272
+ ```
273
+
274
+ **All options:**
275
+
276
+ ```
277
+ ocrcontext extract FILE [OPTIONS]
278
+
279
+ --schema -s invoice | receipt | contract | idcard | medical
280
+ --lang -l Language code (default: en)
281
+ --handwriting Force handwriting engine
282
+ --refine auto (default) | yes | no
283
+ --output -o text (default) | json
284
+ --provider -p openai | anthropic | ollama | google
285
+ --model -m Model name (default: gpt-4o-mini)
286
+ ```
287
+
288
+ ---
289
+
290
+ ## LangChain integration
291
+
292
+ `OCRContextLoader` is a drop-in LangChain `BaseLoader`. It slots into any LangChain pipeline — RAG, document Q&A, chain-of-thought — without glue code.
293
+
294
+ ```python
295
+ from ocrcontext.loaders import OCRContextLoader
296
+
297
+ # Plain OCR
298
+ loader = OCRContextLoader("contract.pdf")
299
+ docs = loader.load() # -> [Document(page_content="...", metadata={...})]
300
+
301
+ # With LLM refinement
302
+ from langchain_openai import ChatOpenAI
303
+
304
+ loader = OCRContextLoader(
305
+ "scan.pdf",
306
+ llm=ChatOpenAI(model="gpt-4o-mini"),
307
+ lang="en",
308
+ refine="yes",
309
+ )
310
+ docs = loader.load()
311
+ print(docs[0].page_content)
312
+ print(docs[0].metadata)
313
+ # {
314
+ # "source": "scan.pdf",
315
+ # "text_source": "ocr",
316
+ # "pages": 3,
317
+ # "confidence": 0.94,
318
+ # "refined": True,
319
+ # "raw_text": "..."
320
+ # }
321
+ ```
322
+
323
+ **In a RAG pipeline:**
324
+
325
+ ```python
326
+ from langchain_openai import ChatOpenAI, OpenAIEmbeddings
327
+ from langchain_community.vectorstores import FAISS
328
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
329
+ from ocrcontext.loaders import OCRContextLoader
330
+
331
+ docs = OCRContextLoader("annual_report.pdf").load()
332
+ chunks = RecursiveCharacterTextSplitter(chunk_size=1000).split_documents(docs)
333
+ vectorstore = FAISS.from_documents(chunks, OpenAIEmbeddings())
334
+ ```
335
+
336
+ ---
337
+
338
+ ## Built-in schemas
339
+
340
+ Five ready-to-use Pydantic schemas with system prompts, importable from `ocrcontext.schemas`.
341
+ Pass them directly to `analyzer.extract()` or the CLI `--schema` flag.
342
+
343
+ ### Invoice
344
+
345
+ ```python
346
+ from ocrcontext.schemas import Invoice
347
+
348
+ invoice = analyzer.extract("invoice.pdf", schema=Invoice)
349
+ # invoice.supplier_name, .invoice_number, .invoice_date, .total_amount,
350
+ # .currency, .tax_id, .tax_rate, .line_items (list[LineItem])
351
+ ```
352
+
353
+ ### Receipt
354
+
355
+ ```python
356
+ from ocrcontext.schemas import Receipt
357
+
358
+ receipt = analyzer.extract("receipt.jpg", schema=Receipt)
359
+ # receipt.store_name, .date, .time, .total_amount, .tax_amount,
360
+ # .subtotal, .payment_method, .currency, .items (list[ReceiptItem])
361
+ ```
362
+
363
+ ### Contract
364
+
365
+ ```python
366
+ from ocrcontext.schemas import Contract
367
+
368
+ contract = analyzer.extract("agreement.pdf", schema=Contract)
369
+ # contract.title, .effective_date, .expiration_date, .contract_value,
370
+ # .currency, .governing_law, .key_obligations,
371
+ # .parties (list[ContractParty] with .name, .role)
372
+ ```
373
+
374
+ ### IdCard
375
+
376
+ Supports national_id, passport, driver_license, residence_permit.
377
+
378
+ ```python
379
+ from ocrcontext.schemas import IdCard
380
+
381
+ card = analyzer.extract("passport.jpg", schema=IdCard)
382
+ # card.document_type, .full_name, .date_of_birth, .gender,
383
+ # .nationality, .document_number, .issue_date, .expiry_date,
384
+ # .issuing_authority, .address
385
+ ```
386
+
387
+ ### MedicalReport
388
+
389
+ ```python
390
+ from ocrcontext.schemas import MedicalReport
391
+
392
+ report = analyzer.extract("lab_report.pdf", schema=MedicalReport)
393
+ # report.patient_name, .patient_dob, .report_date, .doctor_name,
394
+ # .institution, .diagnosis, .icd_codes (list[str]),
395
+ # .medications (list[Medication]), .notes
396
+ ```
397
+
398
+ ---
399
+
400
+ ## How it routes a document
401
+
402
+ ```
403
+ ┌─────────────┐
404
+ document ───▶│ Analyzer │
405
+ └──────┬──────┘
406
+
407
+ ┌──────────────────────────────────────┐
408
+ │ 1. Digital PDF? │
409
+ │ └─▶ PyMuPDF text layer │
410
+ │ LLM refine auto-skipped │
411
+ │ │
412
+ │ 2. Image / scanned PDF? │
413
+ │ └─▶ PaddleOCR │
414
+ │ (preprocess → coverage-first │
415
+ │ → line-band fallback) │
416
+ │ │
417
+ │ 3. Handwriting (explicit or auto)? │
418
+ │ └─▶ Google Cloud Vision │
419
+ │ → TrOCR fallback │
420
+ │ │
421
+ │ 4. (optional) LLM refine │
422
+ │ fidelity-first · literal-safe │
423
+ │ │
424
+ │ 5. (optional) extract(schema) │
425
+ │ └─▶ typed Pydantic model │
426
+ └──────────────────────────────────────┘
427
+ ```
428
+
429
+ Multi-page documents are joined with `--- Page N ---` separators.
430
+ Handwriting kicks in automatically when printed OCR returns too little text.
431
+
432
+ ---
433
+
434
+ ## Refinement modes
435
+
436
+ | Mode | When it's used |
437
+ |---|---|
438
+ | `conservative` | Scanned images — minimal char-level correction only |
439
+ | `layout` | Digital PDFs — reconstruct clean structure |
440
+ | `handwriting_layout` | Handwritten notes / lists / diagrams |
441
+ | `handwriting_prose` | Handwritten poems / paragraphs / letters |
442
+
443
+ Modes are auto-selected based on the document type and text content. The handwriting mode choice is driven by whether the text looks like a DIKW/pyramid diagram. All prompts are ported verbatim from the production pipeline.
444
+
445
+ Override manually:
446
+
447
+ ```python
448
+ from ocrcontext import Analyzer, RefinementMode
449
+
450
+ result = analyzer.analyze("scan.png", mode=RefinementMode.CONSERVATIVE)
451
+ ```
452
+
453
+ ---
454
+
455
+ ## Configuration
456
+
457
+ ```python
458
+ from ocrcontext import Analyzer, AnalyzerConfig
459
+
460
+ cfg = AnalyzerConfig(
461
+ lang="tr", # default document language
462
+ prefer_pdf_text_layer=True, # skip OCR when a text layer exists
463
+ auto_handwriting_fallback=True, # retry with handwriting if OCR returns too little
464
+ refine_by_default=True, # auto-refine whenever an LLM is configured
465
+ )
466
+ analyzer = Analyzer(llm=..., config=cfg)
467
+ ```
468
+
469
+ ---
470
+
471
+ ## Development
472
+
473
+ ```bash
474
+ git clone https://github.com/BahadirKarsli/OCRContext
475
+ cd OCRContext
476
+ pip install -e '.[dev]'
477
+ pytest # runs without GPU or network — engines and LLM are faked
478
+ ruff check .
479
+ ```
480
+
481
+ See [`examples/`](examples/) for runnable smoke tests (image OCR, structured extraction, PDF routing).
482
+
483
+ ---
484
+
485
+ ## License
486
+
487
+ [MIT](LICENSE) © Bahadır Karslı