ocrcontext 0.1.3__tar.gz → 0.1.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ocrcontext-0.1.3 → ocrcontext-0.1.4}/CHANGELOG.md +3 -2
- {ocrcontext-0.1.3 → ocrcontext-0.1.4}/PKG-INFO +12 -1
- {ocrcontext-0.1.3 → ocrcontext-0.1.4}/README.md +11 -0
- {ocrcontext-0.1.3 → ocrcontext-0.1.4}/pyproject.toml +1 -1
- {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/cli.py +84 -20
- {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/engines/paddle.py +8 -4
- {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/llm/schemas.py +19 -18
- {ocrcontext-0.1.3 → ocrcontext-0.1.4}/tests/test_cli.py +6 -3
- {ocrcontext-0.1.3 → ocrcontext-0.1.4}/.gitignore +0 -0
- {ocrcontext-0.1.3 → ocrcontext-0.1.4}/LICENSE +0 -0
- {ocrcontext-0.1.3 → ocrcontext-0.1.4}/examples/01_quickstart.py +0 -0
- {ocrcontext-0.1.3 → ocrcontext-0.1.4}/examples/02_refine_openai.py +0 -0
- {ocrcontext-0.1.3 → ocrcontext-0.1.4}/examples/03_structured_invoice.py +0 -0
- {ocrcontext-0.1.3 → ocrcontext-0.1.4}/examples/04_local_ollama.py +0 -0
- {ocrcontext-0.1.3 → ocrcontext-0.1.4}/examples/image_smoke_test.py +0 -0
- {ocrcontext-0.1.3 → ocrcontext-0.1.4}/examples/pdf_smoke_test.py +0 -0
- {ocrcontext-0.1.3 → ocrcontext-0.1.4}/examples/structured_smoke_test.py +0 -0
- {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/__init__.py +0 -0
- {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/analyzer.py +0 -0
- {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/config.py +0 -0
- {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/engines/__init__.py +0 -0
- {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/engines/base.py +0 -0
- {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/engines/handwriting.py +0 -0
- {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/engines/pdf_text.py +0 -0
- {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/engines/registry.py +0 -0
- {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/engines/trocr.py +0 -0
- {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/engines/vision.py +0 -0
- {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/exceptions.py +0 -0
- {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/llm/__init__.py +0 -0
- {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/llm/drift.py +0 -0
- {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/llm/extractor.py +0 -0
- {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/llm/formatting.py +0 -0
- {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/llm/literal_preserve.py +0 -0
- {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/llm/prompts.py +0 -0
- {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/llm/refiner.py +0 -0
- {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/loaders.py +0 -0
- {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/pipeline.py +0 -0
- {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/preprocessing/__init__.py +0 -0
- {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/preprocessing/image.py +0 -0
- {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/py.typed +0 -0
- {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/quality.py +0 -0
- {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/schemas.py +0 -0
- {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/types.py +0 -0
- {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/utils/__init__.py +0 -0
- {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/utils/files.py +0 -0
- {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/utils/lang.py +0 -0
- {ocrcontext-0.1.3 → ocrcontext-0.1.4}/tests/__init__.py +0 -0
- {ocrcontext-0.1.3 → ocrcontext-0.1.4}/tests/conftest.py +0 -0
- {ocrcontext-0.1.3 → ocrcontext-0.1.4}/tests/test_langchain_loader.py +0 -0
- {ocrcontext-0.1.3 → ocrcontext-0.1.4}/tests/test_literal_preserve.py +0 -0
- {ocrcontext-0.1.3 → ocrcontext-0.1.4}/tests/test_llm.py +0 -0
- {ocrcontext-0.1.3 → ocrcontext-0.1.4}/tests/test_pipeline_analyzer.py +0 -0
- {ocrcontext-0.1.3 → ocrcontext-0.1.4}/tests/test_schemas.py +0 -0
- {ocrcontext-0.1.3 → ocrcontext-0.1.4}/tests/test_text_helpers.py +0 -0
|
@@ -7,7 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
9
|
|
|
10
|
-
## [0.1.
|
|
10
|
+
## [0.1.4] - 2026-06-27
|
|
11
11
|
|
|
12
12
|
### Added
|
|
13
13
|
- **GPU acceleration** — `Analyzer(use_gpu=True)` routes PaddleOCR inference to a
|
|
@@ -95,7 +95,8 @@ into a standalone, LLM-agnostic library.
|
|
|
95
95
|
- **Packaging** — optional extras `[paddle]`, `[trocr]`, `[vision]`, `[all]`;
|
|
96
96
|
PEP 561 typed (`py.typed`); examples and a GPU/network-free test suite.
|
|
97
97
|
|
|
98
|
-
[Unreleased]: https://github.com/bahadirkarsli/ocrcontext/compare/v0.1.3...HEAD
|
|
98
|
+
[Unreleased]: https://github.com/bahadirkarsli/ocrcontext/compare/v0.1.4...HEAD
|
|
99
|
+
[0.1.4]: https://github.com/bahadirkarsli/ocrcontext/compare/v0.1.3...v0.1.4
|
|
99
100
|
[0.1.3]: https://github.com/bahadirkarsli/ocrcontext/compare/v0.1.2...v0.1.3
|
|
100
101
|
[0.1.2]: https://github.com/bahadirkarsli/ocrcontext/compare/v0.1.1...v0.1.2
|
|
101
102
|
[0.1.1]: https://github.com/bahadirkarsli/ocrcontext/compare/v0.1.0...v0.1.1
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ocrcontext
|
|
3
|
-
Version: 0.1.3
|
|
3
|
+
Version: 0.1.4
|
|
4
4
|
Summary: Decoupled, LLM-agnostic document OCR + structured extraction. Vision and LLM parsing in 3 lines of code.
|
|
5
5
|
Project-URL: Homepage, https://github.com/BahadirKarsli/OCRContext
|
|
6
6
|
Project-URL: Repository, https://github.com/BahadirKarsli/OCRContext
|
|
@@ -90,8 +90,19 @@ print(result.text)
|
|
|
90
90
|
|
|
91
91
|
`ocrcontext` is the extraction core of a production document-analysis platform, lifted out of its FastAPI/Next.js stack into a pure, pip-installable library. It handles OCR engine routing, fidelity-first LLM cleanup, and schema-based structured extraction — and gets out of your way.
|
|
92
92
|
|
|
93
|
+
## Demo
|
|
94
|
+
|
|
95
|
+
**Structured invoice extraction from an image:**
|
|
96
|
+
|
|
97
|
+
<img width="100%" alt="Invoice extraction demo" src="https://github.com/user-attachments/assets/8e77ab83-fff3-4929-9a54-7f4a75acc16f" />
|
|
98
|
+
|
|
99
|
+
**Digital PDF text extraction:**
|
|
100
|
+
|
|
101
|
+
<img width="100%" alt="PDF extraction demo" src="https://github.com/user-attachments/assets/84437bd0-9d24-4a2e-8e0c-0014c9e85820" />
|
|
102
|
+
|
|
93
103
|
## Contents
|
|
94
104
|
|
|
105
|
+
- [Demo](#demo)
|
|
95
106
|
- [Install](#install)
|
|
96
107
|
- [CLI](#cli)
|
|
97
108
|
- [Quick start (Python API)](#quick-start-python-api)
|
|
@@ -35,8 +35,19 @@ print(result.text)
|
|
|
35
35
|
|
|
36
36
|
`ocrcontext` is the extraction core of a production document-analysis platform, lifted out of its FastAPI/Next.js stack into a pure, pip-installable library. It handles OCR engine routing, fidelity-first LLM cleanup, and schema-based structured extraction — and gets out of your way.
|
|
37
37
|
|
|
38
|
+
## Demo
|
|
39
|
+
|
|
40
|
+
**Structured invoice extraction from an image:**
|
|
41
|
+
|
|
42
|
+
<img width="100%" alt="Invoice extraction demo" src="https://github.com/user-attachments/assets/8e77ab83-fff3-4929-9a54-7f4a75acc16f" />
|
|
43
|
+
|
|
44
|
+
**Digital PDF text extraction:**
|
|
45
|
+
|
|
46
|
+
<img width="100%" alt="PDF extraction demo" src="https://github.com/user-attachments/assets/84437bd0-9d24-4a2e-8e0c-0014c9e85820" />
|
|
47
|
+
|
|
38
48
|
## Contents
|
|
39
49
|
|
|
50
|
+
- [Demo](#demo)
|
|
40
51
|
- [Install](#install)
|
|
41
52
|
- [CLI](#cli)
|
|
42
53
|
- [Quick start (Python API)](#quick-start-python-api)
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "ocrcontext"
|
|
7
|
-
version = "0.1.3"
|
|
7
|
+
version = "0.1.4"
|
|
8
8
|
description = "Decoupled, LLM-agnostic document OCR + structured extraction. Vision and LLM parsing in 3 lines of code."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = { text = "MIT" }
|
|
@@ -11,6 +11,7 @@ Then run:
|
|
|
11
11
|
|
|
12
12
|
from __future__ import annotations
|
|
13
13
|
|
|
14
|
+
import os
|
|
14
15
|
import sys
|
|
15
16
|
from pathlib import Path
|
|
16
17
|
from typing import Optional
|
|
@@ -25,6 +26,7 @@ except ImportError: # pragma: no cover
|
|
|
25
26
|
|
|
26
27
|
from .analyzer import Analyzer
|
|
27
28
|
from .config import AnalyzerConfig
|
|
29
|
+
from .types import OcrResult
|
|
28
30
|
from .schemas import (
|
|
29
31
|
Contract,
|
|
30
32
|
IdCard,
|
|
@@ -33,6 +35,61 @@ from .schemas import (
|
|
|
33
35
|
Receipt,
|
|
34
36
|
)
|
|
35
37
|
|
|
38
|
+
def _suppress_paddle_noise() -> None:
|
|
39
|
+
import logging
|
|
40
|
+
import warnings
|
|
41
|
+
|
|
42
|
+
# Set env vars BEFORE any paddle/paddlex import so they see the right paths.
|
|
43
|
+
# _ensure_ascii_model_cache() in paddle.py does the same but only when the
|
|
44
|
+
# engine lazy-loads; calling it here guarantees it runs first.
|
|
45
|
+
from .engines.paddle import _ensure_ascii_model_cache, _ensure_paddle_runtime_flags
|
|
46
|
+
_ensure_ascii_model_cache()
|
|
47
|
+
_ensure_paddle_runtime_flags()
|
|
48
|
+
|
|
49
|
+
os.environ.setdefault("GLOG_minloglevel", "3")
|
|
50
|
+
|
|
51
|
+
# Silence Python-level loggers (no paddlex import — that would defeat the purpose).
|
|
52
|
+
null = logging.NullHandler()
|
|
53
|
+
for name in ("ppocr", "paddlex", "paddle", "paddle.utils", "paddle.fluid"):
|
|
54
|
+
lg = logging.getLogger(name)
|
|
55
|
+
lg.setLevel(logging.ERROR)
|
|
56
|
+
lg.handlers = [null]
|
|
57
|
+
lg.propagate = False
|
|
58
|
+
|
|
59
|
+
# Root-level filter catches sub-loggers that bypass the above (e.g. paddlex.utils.*).
|
|
60
|
+
class _NoiseFilter(logging.Filter):
|
|
61
|
+
_NOISE = ("Could not find files", "ccache", "oneDNN", "mkldnn")
|
|
62
|
+
def filter(self, record: logging.LogRecord) -> bool:
|
|
63
|
+
return not any(t in record.getMessage() for t in self._NOISE)
|
|
64
|
+
|
|
65
|
+
logging.getLogger().addFilter(_NoiseFilter())
|
|
66
|
+
|
|
67
|
+
warnings.filterwarnings("ignore", category=UserWarning, module="paddle")
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _route_label(result: OcrResult, file_path: Path) -> str:
|
|
73
|
+
src = result.text_source
|
|
74
|
+
if src == "pdf_text_layer":
|
|
75
|
+
return "DIGITAL PDF -> text layer"
|
|
76
|
+
if src == "ocr":
|
|
77
|
+
return "SCANNED PDF -> rasterize + PaddleOCR" if file_path.suffix.lower() == ".pdf" else "IMAGE -> PaddleOCR"
|
|
78
|
+
if src == "vision_handwriting":
|
|
79
|
+
return "HANDWRITING -> Google Vision"
|
|
80
|
+
if src == "handwriting_ocr":
|
|
81
|
+
return "HANDWRITING -> PaddleOCR"
|
|
82
|
+
return src
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def _info(msg: str) -> None:
|
|
86
|
+
typer.echo(f"[i] {msg}", err=True)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def _ok(msg: str) -> None:
|
|
90
|
+
typer.echo(f"[OK] {msg}", err=True)
|
|
91
|
+
|
|
92
|
+
|
|
36
93
|
app = typer.Typer(
|
|
37
94
|
name="ocrcontext",
|
|
38
95
|
help="OCR a document and optionally extract structured data.",
|
|
@@ -129,12 +186,13 @@ def extract(
|
|
|
129
186
|
) -> None:
|
|
130
187
|
"""OCR a document and optionally extract structured data."""
|
|
131
188
|
|
|
189
|
+
_suppress_paddle_noise()
|
|
190
|
+
|
|
132
191
|
file_path = Path(file)
|
|
133
192
|
if not file_path.exists():
|
|
134
193
|
typer.echo(f"[ERROR] File not found: {file}", err=True)
|
|
135
194
|
raise typer.Exit(code=1)
|
|
136
195
|
|
|
137
|
-
# Validate --schema value early for a clear error message.
|
|
138
196
|
if schema is not None and schema not in _SCHEMAS:
|
|
139
197
|
typer.echo(
|
|
140
198
|
f"[ERROR] Unknown schema '{schema}'. "
|
|
@@ -148,36 +206,42 @@ def extract(
|
|
|
148
206
|
raise typer.Exit(code=1)
|
|
149
207
|
|
|
150
208
|
refine_flag = _parse_refine(refine)
|
|
151
|
-
|
|
152
|
-
# Build LLM only when needed.
|
|
153
209
|
needs_llm = schema is not None or refine_flag is True
|
|
154
210
|
llm = _build_llm(provider, model) if needs_llm else None
|
|
155
211
|
|
|
156
|
-
analyzer = Analyzer(
|
|
157
|
-
llm=llm,
|
|
158
|
-
config=AnalyzerConfig(lang=lang),
|
|
159
|
-
)
|
|
212
|
+
analyzer = Analyzer(llm=llm, config=AnalyzerConfig(lang=lang))
|
|
160
213
|
|
|
161
214
|
try:
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
schema=schema_cls,
|
|
167
|
-
handwriting=handwriting,
|
|
168
|
-
refine=refine_flag or False,
|
|
169
|
-
)
|
|
170
|
-
typer.echo(result.model_dump_json(indent=2))
|
|
171
|
-
else:
|
|
172
|
-
result = analyzer.analyze(
|
|
215
|
+
_info(f"file: {file_path.name}")
|
|
216
|
+
_info("OCR...")
|
|
217
|
+
|
|
218
|
+
ocr_result = analyzer.analyze(
|
|
173
219
|
file_path,
|
|
174
220
|
handwriting=handwriting,
|
|
175
221
|
refine=refine_flag,
|
|
176
222
|
)
|
|
223
|
+
|
|
224
|
+
conf = f"confidence: {ocr_result.confidence:.0%}" if ocr_result.confidence < 1.0 else "exact"
|
|
225
|
+
_ok(f"route: {_route_label(ocr_result, file_path)} ({conf})")
|
|
226
|
+
|
|
227
|
+
if ocr_result.refined:
|
|
228
|
+
_ok("refine: APPLIED")
|
|
229
|
+
|
|
230
|
+
if schema is not None:
|
|
231
|
+
schema_cls = _SCHEMAS[schema]
|
|
232
|
+
_info(f"extract: {schema} schema...")
|
|
233
|
+
structured = analyzer.extract_text(
|
|
234
|
+
ocr_result.text,
|
|
235
|
+
schema_cls,
|
|
236
|
+
language=lang,
|
|
237
|
+
)
|
|
238
|
+
_ok(f"extract: {schema} [OK]")
|
|
239
|
+
typer.echo(structured.model_dump_json(indent=2))
|
|
240
|
+
else:
|
|
177
241
|
if output == "json":
|
|
178
|
-
typer.echo(
|
|
242
|
+
typer.echo(ocr_result.model_dump_json(indent=2))
|
|
179
243
|
else:
|
|
180
|
-
typer.echo(
|
|
244
|
+
typer.echo(ocr_result.text)
|
|
181
245
|
|
|
182
246
|
except Exception as exc: # noqa: BLE001
|
|
183
247
|
typer.echo(f"[ERROR] {exc}", err=True)
|
|
@@ -104,6 +104,7 @@ class PaddleEngine(OcrEngine):
|
|
|
104
104
|
import logging
|
|
105
105
|
|
|
106
106
|
logging.getLogger("ppocr").setLevel(logging.ERROR)
|
|
107
|
+
logging.getLogger("paddlex").setLevel(logging.ERROR)
|
|
107
108
|
requested = paddle_lang
|
|
108
109
|
ocr, errors = self._try_init(PaddleOCR, paddle_lang, use_gpu=self._use_gpu)
|
|
109
110
|
if ocr is None and paddle_lang != "en":
|
|
@@ -129,9 +130,12 @@ class PaddleEngine(OcrEngine):
|
|
|
129
130
|
# Shared 3.x flags: disable sub-models unneeded for plain OCR.
|
|
130
131
|
# enable_mkldnn is forced False on CPU to avoid PaddlePaddle 3.x PIR bug;
|
|
131
132
|
# on GPU it's irrelevant (MKLDNN is CPU-only) but harmless to keep False.
|
|
133
|
+
# use_gpu is only injected when True — some PaddleOCR 3.x builds reject it
|
|
134
|
+
# as an unknown argument even when set to False.
|
|
135
|
+
gpu_kwargs = {"use_gpu": True} if use_gpu else {}
|
|
132
136
|
base_3x = {
|
|
133
137
|
"lang": lang,
|
|
134
|
-
|
|
138
|
+
**gpu_kwargs,
|
|
135
139
|
"use_doc_orientation_classify": False,
|
|
136
140
|
"use_doc_unwarping": False,
|
|
137
141
|
"use_textline_orientation": False,
|
|
@@ -147,10 +151,10 @@ class PaddleEngine(OcrEngine):
|
|
|
147
151
|
# 3.x default — version determined by installed package, no pin
|
|
148
152
|
base_3x,
|
|
149
153
|
# Minimal 3.x (for builds that reject the sub-model flags)
|
|
150
|
-
{"lang": lang,
|
|
151
|
-
{"lang": lang,
|
|
154
|
+
{"lang": lang, **gpu_kwargs, "enable_mkldnn": False},
|
|
155
|
+
{"lang": lang, **gpu_kwargs},
|
|
152
156
|
# Legacy 2.x (use_angle_cls; use_doc_* / show_log don't exist in 2.x)
|
|
153
|
-
{"use_angle_cls": True, "lang": lang,
|
|
157
|
+
{"use_angle_cls": True, "lang": lang, **gpu_kwargs},
|
|
154
158
|
]
|
|
155
159
|
errors: list[str] = []
|
|
156
160
|
for kwargs in profiles:
|
|
@@ -22,7 +22,7 @@ class LineItem(BaseModel):
|
|
|
22
22
|
"Default 1 only if neither is available."
|
|
23
23
|
),
|
|
24
24
|
)
|
|
25
|
-
unit: Optional[str] = Field(None, description="Unit
|
|
25
|
+
unit: Optional[str] = Field(None, description="Unit of measure as written on the document (e.g. pcs, kg, hrs). Null if not present.")
|
|
26
26
|
unit_price: Optional[float] = Field(None, description="Price per unit.")
|
|
27
27
|
tax_rate: Optional[str] = Field(
|
|
28
28
|
None, description="Tax percentage (e.g., 20, 10, 0) or pattern."
|
|
@@ -34,9 +34,9 @@ class Invoice(BaseModel):
|
|
|
34
34
|
supplier_name: Optional[str] = Field(None, description="Name of the vendor/supplier.")
|
|
35
35
|
invoice_date: Optional[str] = Field(None, description="Format YYYY-MM-DD.")
|
|
36
36
|
invoice_number: Optional[str] = Field(None, description="The invoice ID/number.")
|
|
37
|
-
tax_id: Optional[str] = Field(None, description="Tax ID /
|
|
37
|
+
tax_id: Optional[str] = Field(None, description="Tax ID / VAT registration number.")
|
|
38
38
|
tax_rate: Optional[str] = Field(
|
|
39
|
-
None, description="e.g. '
|
|
39
|
+
None, description="Tax/VAT rate as written on the document (e.g. 'VAT 20%', 'GST 10%')."
|
|
40
40
|
)
|
|
41
41
|
currency: Optional[str] = Field(None, description="Currency code (TRY, USD, EUR, etc.).")
|
|
42
42
|
total_amount: Optional[float] = Field(None, description="Final total amount (numeric).")
|
|
@@ -63,37 +63,38 @@ class Invoice(BaseModel):
|
|
|
63
63
|
return self
|
|
64
64
|
|
|
65
65
|
|
|
66
|
-
# Verbatim system prompt from app/api/invoices/process/route.ts.
|
|
67
66
|
INVOICE_EXTRACTION_PROMPT = """You are an expert invoice data extraction assistant.
|
|
68
67
|
|
|
69
68
|
CRITICAL RULES:
|
|
70
|
-
1. **
|
|
71
|
-
|
|
72
|
-
- If language is 'tr' (Turkish), intelligently fix missing Turkish characters.
|
|
69
|
+
1. **OCR REPAIR**: The text may come from OCR and may have missing or garbled characters.
|
|
70
|
+
Use context to infer the correct value — do not invent values that are not on the document.
|
|
73
71
|
|
|
74
72
|
2. **NUMBER PARSING**:
|
|
75
|
-
- Be
|
|
76
|
-
-
|
|
77
|
-
- NEVER confuse a quantity (e.g.,
|
|
73
|
+
- Be careful with comma (,) and dot (.) as thousand separators vs decimal points.
|
|
74
|
+
- European format: '1.200,50' = 1200.50. US/UK format: '1,200.50' = 1200.50.
|
|
75
|
+
- NEVER confuse a quantity (e.g., 2) with a unit price (e.g., 45.00).
|
|
78
76
|
|
|
79
77
|
3. **CURRENCY DETECTION**:
|
|
80
|
-
- Look for symbols:
|
|
81
|
-
-
|
|
78
|
+
- Look for symbols or codes on the document: $, USD, €, EUR, £, GBP, ₺, TRY, etc.
|
|
79
|
+
- Use ONLY what is explicitly stated. Do not default to any currency.
|
|
82
80
|
|
|
83
|
-
|
|
81
|
+
4. **UNITS**: Copy the unit exactly as written on the document (pcs, kg, hrs, m², etc.).
|
|
82
|
+
If no unit is shown, use null — never invent one.
|
|
83
|
+
|
|
84
|
+
Extract the following fields if present:
|
|
84
85
|
- 'supplier_name': Name of the vendor/supplier.
|
|
85
86
|
- 'invoice_date': Format YYYY-MM-DD.
|
|
86
87
|
- 'invoice_number': The invoice ID/number.
|
|
87
|
-
- 'tax_id': Tax ID
|
|
88
|
-
- 'tax_rate':
|
|
89
|
-
- 'currency':
|
|
88
|
+
- 'tax_id': Tax ID or VAT registration number.
|
|
89
|
+
- 'tax_rate': Tax/VAT rate as written (e.g. 'VAT 20%', 'GST 10%').
|
|
90
|
+
- 'currency': ISO currency code (USD, EUR, GBP, TRY, etc.).
|
|
90
91
|
- 'total_amount': Final total amount (numeric).
|
|
91
92
|
- 'line_items': An array of items/services. Each item should have:
|
|
92
93
|
- 'description': Product/Service name.
|
|
93
94
|
- 'quantity': Numeric quantity. If missing, calculate it as total / unit_price. Default 1 only if neither is available.
|
|
94
|
-
- 'unit': Unit
|
|
95
|
+
- 'unit': Unit of measure exactly as written on the document. Null if not present.
|
|
95
96
|
- 'unit_price': Price per unit.
|
|
96
|
-
- 'tax_rate': Tax percentage (e.g., 20, 10, 0) or
|
|
97
|
+
- 'tax_rate': Tax percentage (e.g., 20, 10, 0) or null.
|
|
97
98
|
- 'total': Total price for this line.
|
|
98
99
|
|
|
99
100
|
Return ONLY a valid JSON object. If a field is not found, use null."""
|
|
@@ -48,6 +48,9 @@ def _patch_analyzer(monkeypatch, ocr_text: str = "hello world", structured: dict
|
|
|
48
48
|
def extract(self, *args, schema=None, **kwargs):
|
|
49
49
|
return schema(**(structured or {}))
|
|
50
50
|
|
|
51
|
+
def extract_text(self, text, schema, **kwargs):
|
|
52
|
+
return schema(**(structured or {}))
|
|
53
|
+
|
|
51
54
|
monkeypatch.setattr(cli_mod, "Analyzer", _FakeAnalyzer)
|
|
52
55
|
monkeypatch.setattr(cli_mod, "_build_llm", lambda provider, model: None)
|
|
53
56
|
|
|
@@ -99,7 +102,7 @@ def test_extract_json_output(ascii_tmp, monkeypatch):
|
|
|
99
102
|
result = runner.invoke(app, ["extract", str(png), "--output", "json"])
|
|
100
103
|
assert result.exit_code == 0
|
|
101
104
|
import json
|
|
102
|
-
data = json.loads(result.output)
|
|
105
|
+
data = json.loads(result.output[result.output.index("{"):])
|
|
103
106
|
assert data["text"] == "some text"
|
|
104
107
|
assert data["text_source"] == "ocr"
|
|
105
108
|
|
|
@@ -119,7 +122,7 @@ def test_extract_invoice_schema(ascii_tmp, monkeypatch):
|
|
|
119
122
|
result = runner.invoke(app, ["extract", str(png), "--schema", "invoice"])
|
|
120
123
|
assert result.exit_code == 0
|
|
121
124
|
import json
|
|
122
|
-
data = json.loads(result.output)
|
|
125
|
+
data = json.loads(result.output[result.output.index("{"):])
|
|
123
126
|
assert data["supplier_name"] == "ACME"
|
|
124
127
|
assert data["total_amount"] == 250.0
|
|
125
128
|
|
|
@@ -131,7 +134,7 @@ def test_extract_receipt_schema(ascii_tmp, monkeypatch):
|
|
|
131
134
|
result = runner.invoke(app, ["extract", str(png), "--schema", "receipt"])
|
|
132
135
|
assert result.exit_code == 0
|
|
133
136
|
import json
|
|
134
|
-
data = json.loads(result.output)
|
|
137
|
+
data = json.loads(result.output[result.output.index("{"):])
|
|
135
138
|
assert data["store_name"] == "Migros"
|
|
136
139
|
|
|
137
140
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|