ocrcontext 0.1.3__tar.gz → 0.1.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. {ocrcontext-0.1.3 → ocrcontext-0.1.5}/CHANGELOG.md +12 -2
  2. {ocrcontext-0.1.3 → ocrcontext-0.1.5}/PKG-INFO +12 -1
  3. {ocrcontext-0.1.3 → ocrcontext-0.1.5}/README.md +11 -0
  4. {ocrcontext-0.1.3 → ocrcontext-0.1.5}/pyproject.toml +1 -1
  5. {ocrcontext-0.1.3 → ocrcontext-0.1.5}/src/ocrcontext/cli.py +109 -20
  6. {ocrcontext-0.1.3 → ocrcontext-0.1.5}/src/ocrcontext/engines/paddle.py +8 -4
  7. {ocrcontext-0.1.3 → ocrcontext-0.1.5}/src/ocrcontext/llm/schemas.py +19 -18
  8. {ocrcontext-0.1.3 → ocrcontext-0.1.5}/tests/test_cli.py +6 -3
  9. {ocrcontext-0.1.3 → ocrcontext-0.1.5}/.gitignore +0 -0
  10. {ocrcontext-0.1.3 → ocrcontext-0.1.5}/LICENSE +0 -0
  11. {ocrcontext-0.1.3 → ocrcontext-0.1.5}/examples/01_quickstart.py +0 -0
  12. {ocrcontext-0.1.3 → ocrcontext-0.1.5}/examples/02_refine_openai.py +0 -0
  13. {ocrcontext-0.1.3 → ocrcontext-0.1.5}/examples/03_structured_invoice.py +0 -0
  14. {ocrcontext-0.1.3 → ocrcontext-0.1.5}/examples/04_local_ollama.py +0 -0
  15. {ocrcontext-0.1.3 → ocrcontext-0.1.5}/examples/image_smoke_test.py +0 -0
  16. {ocrcontext-0.1.3 → ocrcontext-0.1.5}/examples/pdf_smoke_test.py +0 -0
  17. {ocrcontext-0.1.3 → ocrcontext-0.1.5}/examples/structured_smoke_test.py +0 -0
  18. {ocrcontext-0.1.3 → ocrcontext-0.1.5}/src/ocrcontext/__init__.py +0 -0
  19. {ocrcontext-0.1.3 → ocrcontext-0.1.5}/src/ocrcontext/analyzer.py +0 -0
  20. {ocrcontext-0.1.3 → ocrcontext-0.1.5}/src/ocrcontext/config.py +0 -0
  21. {ocrcontext-0.1.3 → ocrcontext-0.1.5}/src/ocrcontext/engines/__init__.py +0 -0
  22. {ocrcontext-0.1.3 → ocrcontext-0.1.5}/src/ocrcontext/engines/base.py +0 -0
  23. {ocrcontext-0.1.3 → ocrcontext-0.1.5}/src/ocrcontext/engines/handwriting.py +0 -0
  24. {ocrcontext-0.1.3 → ocrcontext-0.1.5}/src/ocrcontext/engines/pdf_text.py +0 -0
  25. {ocrcontext-0.1.3 → ocrcontext-0.1.5}/src/ocrcontext/engines/registry.py +0 -0
  26. {ocrcontext-0.1.3 → ocrcontext-0.1.5}/src/ocrcontext/engines/trocr.py +0 -0
  27. {ocrcontext-0.1.3 → ocrcontext-0.1.5}/src/ocrcontext/engines/vision.py +0 -0
  28. {ocrcontext-0.1.3 → ocrcontext-0.1.5}/src/ocrcontext/exceptions.py +0 -0
  29. {ocrcontext-0.1.3 → ocrcontext-0.1.5}/src/ocrcontext/llm/__init__.py +0 -0
  30. {ocrcontext-0.1.3 → ocrcontext-0.1.5}/src/ocrcontext/llm/drift.py +0 -0
  31. {ocrcontext-0.1.3 → ocrcontext-0.1.5}/src/ocrcontext/llm/extractor.py +0 -0
  32. {ocrcontext-0.1.3 → ocrcontext-0.1.5}/src/ocrcontext/llm/formatting.py +0 -0
  33. {ocrcontext-0.1.3 → ocrcontext-0.1.5}/src/ocrcontext/llm/literal_preserve.py +0 -0
  34. {ocrcontext-0.1.3 → ocrcontext-0.1.5}/src/ocrcontext/llm/prompts.py +0 -0
  35. {ocrcontext-0.1.3 → ocrcontext-0.1.5}/src/ocrcontext/llm/refiner.py +0 -0
  36. {ocrcontext-0.1.3 → ocrcontext-0.1.5}/src/ocrcontext/loaders.py +0 -0
  37. {ocrcontext-0.1.3 → ocrcontext-0.1.5}/src/ocrcontext/pipeline.py +0 -0
  38. {ocrcontext-0.1.3 → ocrcontext-0.1.5}/src/ocrcontext/preprocessing/__init__.py +0 -0
  39. {ocrcontext-0.1.3 → ocrcontext-0.1.5}/src/ocrcontext/preprocessing/image.py +0 -0
  40. {ocrcontext-0.1.3 → ocrcontext-0.1.5}/src/ocrcontext/py.typed +0 -0
  41. {ocrcontext-0.1.3 → ocrcontext-0.1.5}/src/ocrcontext/quality.py +0 -0
  42. {ocrcontext-0.1.3 → ocrcontext-0.1.5}/src/ocrcontext/schemas.py +0 -0
  43. {ocrcontext-0.1.3 → ocrcontext-0.1.5}/src/ocrcontext/types.py +0 -0
  44. {ocrcontext-0.1.3 → ocrcontext-0.1.5}/src/ocrcontext/utils/__init__.py +0 -0
  45. {ocrcontext-0.1.3 → ocrcontext-0.1.5}/src/ocrcontext/utils/files.py +0 -0
  46. {ocrcontext-0.1.3 → ocrcontext-0.1.5}/src/ocrcontext/utils/lang.py +0 -0
  47. {ocrcontext-0.1.3 → ocrcontext-0.1.5}/tests/__init__.py +0 -0
  48. {ocrcontext-0.1.3 → ocrcontext-0.1.5}/tests/conftest.py +0 -0
  49. {ocrcontext-0.1.3 → ocrcontext-0.1.5}/tests/test_langchain_loader.py +0 -0
  50. {ocrcontext-0.1.3 → ocrcontext-0.1.5}/tests/test_literal_preserve.py +0 -0
  51. {ocrcontext-0.1.3 → ocrcontext-0.1.5}/tests/test_llm.py +0 -0
  52. {ocrcontext-0.1.3 → ocrcontext-0.1.5}/tests/test_pipeline_analyzer.py +0 -0
  53. {ocrcontext-0.1.3 → ocrcontext-0.1.5}/tests/test_schemas.py +0 -0
  54. {ocrcontext-0.1.3 → ocrcontext-0.1.5}/tests/test_text_helpers.py +0 -0
@@ -7,7 +7,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
- ## [0.1.3] - 2026-06-27
10
+ ## [0.1.5] - 2026-06-27
11
+
12
+ ### Fixed
13
+ - CLI now shows a clear error message when an LLM provider API key is missing
14
+ instead of a raw traceback (e.g. `OPENAI_API_KEY` not set).
15
+ - CLI prints a first-run warning before the OCR step when PaddleOCR models
16
+ have not been downloaded yet, so users know the ~90 MB download is expected.
17
+
18
+ ## [0.1.4] - 2026-06-27
11
19
 
12
20
  ### Added
13
21
  - **GPU acceleration** — `Analyzer(use_gpu=True)` routes PaddleOCR inference to a
@@ -95,7 +103,9 @@ into a standalone, LLM-agnostic library.
95
103
  - **Packaging** — optional extras `[paddle]`, `[trocr]`, `[vision]`, `[all]`;
96
104
  PEP 561 typed (`py.typed`); examples and a GPU/network-free test suite.
97
105
 
98
- [Unreleased]: https://github.com/bahadirkarsli/ocrcontext/compare/v0.1.3...HEAD
106
+ [Unreleased]: https://github.com/bahadirkarsli/ocrcontext/compare/v0.1.5...HEAD
107
+ [0.1.5]: https://github.com/bahadirkarsli/ocrcontext/compare/v0.1.4...v0.1.5
108
+ [0.1.4]: https://github.com/bahadirkarsli/ocrcontext/compare/v0.1.3...v0.1.4
99
109
  [0.1.3]: https://github.com/bahadirkarsli/ocrcontext/compare/v0.1.2...v0.1.3
100
110
  [0.1.2]: https://github.com/bahadirkarsli/ocrcontext/compare/v0.1.1...v0.1.2
101
111
  [0.1.1]: https://github.com/bahadirkarsli/ocrcontext/compare/v0.1.0...v0.1.1
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ocrcontext
3
- Version: 0.1.3
3
+ Version: 0.1.5
4
4
  Summary: Decoupled, LLM-agnostic document OCR + structured extraction. Vision and LLM parsing in 3 lines of code.
5
5
  Project-URL: Homepage, https://github.com/BahadirKarsli/OCRContext
6
6
  Project-URL: Repository, https://github.com/BahadirKarsli/OCRContext
@@ -90,8 +90,19 @@ print(result.text)
90
90
 
91
91
  `ocrcontext` is the extraction core of a production document-analysis platform, lifted out of its FastAPI/Next.js stack into a pure, pip-installable library. It handles OCR engine routing, fidelity-first LLM cleanup, and schema-based structured extraction — and gets out of your way.
92
92
 
93
+ ## Demo
94
+
95
+ **Structured invoice extraction from an image:**
96
+
97
+ <img width="100%" alt="Invoice extraction demo" src="https://github.com/user-attachments/assets/8e77ab83-fff3-4929-9a54-7f4a75acc16f" />
98
+
99
+ **Digital PDF text extraction:**
100
+
101
+ <img width="100%" alt="PDF extraction demo" src="https://github.com/user-attachments/assets/84437bd0-9d24-4a2e-8e0c-0014c9e85820" />
102
+
93
103
  ## Contents
94
104
 
105
+ - [Demo](#demo)
95
106
  - [Install](#install)
96
107
  - [CLI](#cli)
97
108
  - [Quick start (Python API)](#quick-start-python-api)
@@ -35,8 +35,19 @@ print(result.text)
35
35
 
36
36
  `ocrcontext` is the extraction core of a production document-analysis platform, lifted out of its FastAPI/Next.js stack into a pure, pip-installable library. It handles OCR engine routing, fidelity-first LLM cleanup, and schema-based structured extraction — and gets out of your way.
37
37
 
38
+ ## Demo
39
+
40
+ **Structured invoice extraction from an image:**
41
+
42
+ <img width="100%" alt="Invoice extraction demo" src="https://github.com/user-attachments/assets/8e77ab83-fff3-4929-9a54-7f4a75acc16f" />
43
+
44
+ **Digital PDF text extraction:**
45
+
46
+ <img width="100%" alt="PDF extraction demo" src="https://github.com/user-attachments/assets/84437bd0-9d24-4a2e-8e0c-0014c9e85820" />
47
+
38
48
  ## Contents
39
49
 
50
+ - [Demo](#demo)
40
51
  - [Install](#install)
41
52
  - [CLI](#cli)
42
53
  - [Quick start (Python API)](#quick-start-python-api)
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "ocrcontext"
7
- version = "0.1.3"
7
+ version = "0.1.5"
8
8
  description = "Decoupled, LLM-agnostic document OCR + structured extraction. Vision and LLM parsing in 3 lines of code."
9
9
  readme = "README.md"
10
10
  license = { text = "MIT" }
@@ -11,6 +11,7 @@ Then run:
11
11
 
12
12
  from __future__ import annotations
13
13
 
14
+ import os
14
15
  import sys
15
16
  from pathlib import Path
16
17
  from typing import Optional
@@ -25,6 +26,7 @@ except ImportError: # pragma: no cover
25
26
 
26
27
  from .analyzer import Analyzer
27
28
  from .config import AnalyzerConfig
29
+ from .types import OcrResult
28
30
  from .schemas import (
29
31
  Contract,
30
32
  IdCard,
@@ -33,6 +35,61 @@ from .schemas import (
33
35
  Receipt,
34
36
  )
35
37
 
38
+ def _suppress_paddle_noise() -> None:
39
+ import logging
40
+ import warnings
41
+
42
+ # Set env vars BEFORE any paddle/paddlex import so they see the right paths.
43
+ # _ensure_ascii_model_cache() in paddle.py does the same but only when the
44
+ # engine lazy-loads; calling it here guarantees it runs first.
45
+ from .engines.paddle import _ensure_ascii_model_cache, _ensure_paddle_runtime_flags
46
+ _ensure_ascii_model_cache()
47
+ _ensure_paddle_runtime_flags()
48
+
49
+ os.environ.setdefault("GLOG_minloglevel", "3")
50
+
51
+ # Silence Python-level loggers (no paddlex import — that would defeat the purpose).
52
+ null = logging.NullHandler()
53
+ for name in ("ppocr", "paddlex", "paddle", "paddle.utils", "paddle.fluid"):
54
+ lg = logging.getLogger(name)
55
+ lg.setLevel(logging.ERROR)
56
+ lg.handlers = [null]
57
+ lg.propagate = False
58
+
59
+ # Root-level filter catches sub-loggers that bypass the above (e.g. paddlex.utils.*).
60
+ class _NoiseFilter(logging.Filter):
61
+ _NOISE = ("Could not find files", "ccache", "oneDNN", "mkldnn")
62
+ def filter(self, record: logging.LogRecord) -> bool:
63
+ return not any(t in record.getMessage() for t in self._NOISE)
64
+
65
+ logging.getLogger().addFilter(_NoiseFilter())
66
+
67
+ warnings.filterwarnings("ignore", category=UserWarning, module="paddle")
68
+
69
+
70
+
71
+
72
+ def _route_label(result: OcrResult, file_path: Path) -> str:
73
+ src = result.text_source
74
+ if src == "pdf_text_layer":
75
+ return "DIGITAL PDF -> text layer"
76
+ if src == "ocr":
77
+ return "SCANNED PDF -> rasterize + PaddleOCR" if file_path.suffix.lower() == ".pdf" else "IMAGE -> PaddleOCR"
78
+ if src == "vision_handwriting":
79
+ return "HANDWRITING -> Google Vision"
80
+ if src == "handwriting_ocr":
81
+ return "HANDWRITING -> PaddleOCR"
82
+ return src
83
+
84
+
85
+ def _info(msg: str) -> None:
86
+ typer.echo(f"[i] {msg}", err=True)
87
+
88
+
89
+ def _ok(msg: str) -> None:
90
+ typer.echo(f"[OK] {msg}", err=True)
91
+
92
+
36
93
  app = typer.Typer(
37
94
  name="ocrcontext",
38
95
  help="OCR a document and optionally extract structured data.",
@@ -59,6 +116,13 @@ _SCHEMA_NAMES = list(_SCHEMAS)
59
116
 
60
117
  def _build_llm(provider: str, model: str):
61
118
  """Dynamically import the right LangChain provider class."""
119
+ _API_KEY_HINTS = {
120
+ "openai": ("OPENAI_API_KEY", "platform.openai.com/api-keys"),
121
+ "anthropic": ("ANTHROPIC_API_KEY", "console.anthropic.com/settings/keys"),
122
+ "google": ("GOOGLE_API_KEY", "aistudio.google.com/apikey"),
123
+ "ollama": (None, None),
124
+ }
125
+
62
126
  try:
63
127
  if provider == "openai":
64
128
  from langchain_openai import ChatOpenAI # type: ignore[import-untyped]
@@ -79,6 +143,19 @@ def _build_llm(provider: str, model: str):
79
143
  err=True,
80
144
  )
81
145
  raise typer.Exit(code=1)
146
+ except Exception as exc:
147
+ msg = str(exc)
148
+ if "api_key" in msg.lower() or "credentials" in msg.lower() or "auth" in msg.lower():
149
+ env_var, url = _API_KEY_HINTS.get(provider, (None, None))
150
+ hint = f"Set it with: $env:{env_var} = \"...\"" if env_var else ""
151
+ url_hint = f"\nGet a key at: {url}" if url else ""
152
+ typer.echo(
153
+ f"[ERROR] No API key found for '{provider}'.\n{hint}{url_hint}",
154
+ err=True,
155
+ )
156
+ else:
157
+ typer.echo(f"[ERROR] Failed to initialize '{provider}': {exc}", err=True)
158
+ raise typer.Exit(code=1)
82
159
 
83
160
  typer.echo(
84
161
  f"[ERROR] Unknown provider '{provider}'. "
@@ -129,12 +206,13 @@ def extract(
129
206
  ) -> None:
130
207
  """OCR a document and optionally extract structured data."""
131
208
 
209
+ _suppress_paddle_noise()
210
+
132
211
  file_path = Path(file)
133
212
  if not file_path.exists():
134
213
  typer.echo(f"[ERROR] File not found: {file}", err=True)
135
214
  raise typer.Exit(code=1)
136
215
 
137
- # Validate --schema value early for a clear error message.
138
216
  if schema is not None and schema not in _SCHEMAS:
139
217
  typer.echo(
140
218
  f"[ERROR] Unknown schema '{schema}'. "
@@ -148,36 +226,47 @@ def extract(
148
226
  raise typer.Exit(code=1)
149
227
 
150
228
  refine_flag = _parse_refine(refine)
151
-
152
- # Build LLM only when needed.
153
229
  needs_llm = schema is not None or refine_flag is True
154
230
  llm = _build_llm(provider, model) if needs_llm else None
155
231
 
156
- analyzer = Analyzer(
157
- llm=llm,
158
- config=AnalyzerConfig(lang=lang),
159
- )
232
+ analyzer = Analyzer(llm=llm, config=AnalyzerConfig(lang=lang))
160
233
 
161
234
  try:
162
- if schema is not None:
163
- schema_cls = _SCHEMAS[schema]
164
- result = analyzer.extract(
165
- file_path,
166
- schema=schema_cls,
167
- handwriting=handwriting,
168
- refine=refine_flag or False,
169
- )
170
- typer.echo(result.model_dump_json(indent=2))
171
- else:
172
- result = analyzer.analyze(
235
+ _info(f"file: {file_path.name}")
236
+
237
+ paddlex_cache = Path(os.environ.get("PADDLE_PDX_CACHE_HOME", Path.home() / ".paddlex"))
238
+ if not (paddlex_cache / "official_models").exists():
239
+ _info("first run: downloading OCR model (~90 MB), this may take a minute...")
240
+
241
+ _info("OCR...")
242
+
243
+ ocr_result = analyzer.analyze(
173
244
  file_path,
174
245
  handwriting=handwriting,
175
246
  refine=refine_flag,
176
247
  )
248
+
249
+ conf = f"confidence: {ocr_result.confidence:.0%}" if ocr_result.confidence < 1.0 else "exact"
250
+ _ok(f"route: {_route_label(ocr_result, file_path)} ({conf})")
251
+
252
+ if ocr_result.refined:
253
+ _ok("refine: APPLIED")
254
+
255
+ if schema is not None:
256
+ schema_cls = _SCHEMAS[schema]
257
+ _info(f"extract: {schema} schema...")
258
+ structured = analyzer.extract_text(
259
+ ocr_result.text,
260
+ schema_cls,
261
+ language=lang,
262
+ )
263
+ _ok(f"extract: {schema} [OK]")
264
+ typer.echo(structured.model_dump_json(indent=2))
265
+ else:
177
266
  if output == "json":
178
- typer.echo(result.model_dump_json(indent=2))
267
+ typer.echo(ocr_result.model_dump_json(indent=2))
179
268
  else:
180
- typer.echo(result.text)
269
+ typer.echo(ocr_result.text)
181
270
 
182
271
  except Exception as exc: # noqa: BLE001
183
272
  typer.echo(f"[ERROR] {exc}", err=True)
@@ -104,6 +104,7 @@ class PaddleEngine(OcrEngine):
104
104
  import logging
105
105
 
106
106
  logging.getLogger("ppocr").setLevel(logging.ERROR)
107
+ logging.getLogger("paddlex").setLevel(logging.ERROR)
107
108
  requested = paddle_lang
108
109
  ocr, errors = self._try_init(PaddleOCR, paddle_lang, use_gpu=self._use_gpu)
109
110
  if ocr is None and paddle_lang != "en":
@@ -129,9 +130,12 @@ class PaddleEngine(OcrEngine):
129
130
  # Shared 3.x flags: disable sub-models unneeded for plain OCR.
130
131
  # enable_mkldnn is forced False on CPU to avoid PaddlePaddle 3.x PIR bug;
131
132
  # on GPU it's irrelevant (MKLDNN is CPU-only) but harmless to keep False.
133
+ # use_gpu is only injected when True — some PaddleOCR 3.x builds reject it
134
+ # as an unknown argument even when set to False.
135
+ gpu_kwargs = {"use_gpu": True} if use_gpu else {}
132
136
  base_3x = {
133
137
  "lang": lang,
134
- "use_gpu": use_gpu,
138
+ **gpu_kwargs,
135
139
  "use_doc_orientation_classify": False,
136
140
  "use_doc_unwarping": False,
137
141
  "use_textline_orientation": False,
@@ -147,10 +151,10 @@ class PaddleEngine(OcrEngine):
147
151
  # 3.x default — version determined by installed package, no pin
148
152
  base_3x,
149
153
  # Minimal 3.x (for builds that reject the sub-model flags)
150
- {"lang": lang, "use_gpu": use_gpu, "enable_mkldnn": False},
151
- {"lang": lang, "use_gpu": use_gpu},
154
+ {"lang": lang, **gpu_kwargs, "enable_mkldnn": False},
155
+ {"lang": lang, **gpu_kwargs},
152
156
  # Legacy 2.x (use_angle_cls; use_doc_* / show_log don't exist in 2.x)
153
- {"use_angle_cls": True, "lang": lang, "use_gpu": use_gpu},
157
+ {"use_angle_cls": True, "lang": lang, **gpu_kwargs},
154
158
  ]
155
159
  errors: list[str] = []
156
160
  for kwargs in profiles:
@@ -22,7 +22,7 @@ class LineItem(BaseModel):
22
22
  "Default 1 only if neither is available."
23
23
  ),
24
24
  )
25
- unit: Optional[str] = Field(None, description="Unit type (Adet, Kg, Saat, etc.).")
25
+ unit: Optional[str] = Field(None, description="Unit of measure as written on the document (e.g. pcs, kg, hrs). Null if not present.")
26
26
  unit_price: Optional[float] = Field(None, description="Price per unit.")
27
27
  tax_rate: Optional[str] = Field(
28
28
  None, description="Tax percentage (e.g., 20, 10, 0) or pattern."
@@ -34,9 +34,9 @@ class Invoice(BaseModel):
34
34
  supplier_name: Optional[str] = Field(None, description="Name of the vendor/supplier.")
35
35
  invoice_date: Optional[str] = Field(None, description="Format YYYY-MM-DD.")
36
36
  invoice_number: Optional[str] = Field(None, description="The invoice ID/number.")
37
- tax_id: Optional[str] = Field(None, description="Tax ID / VKN / TCKN.")
37
+ tax_id: Optional[str] = Field(None, description="Tax ID / VAT registration number.")
38
38
  tax_rate: Optional[str] = Field(
39
- None, description="e.g. 'KDV %20' when KDV is 20%."
39
+ None, description="Tax/VAT rate as written on the document (e.g. 'VAT 20%', 'GST 10%')."
40
40
  )
41
41
  currency: Optional[str] = Field(None, description="Currency code (TRY, USD, EUR, etc.).")
42
42
  total_amount: Optional[float] = Field(None, description="Final total amount (numeric).")
@@ -63,37 +63,38 @@ class Invoice(BaseModel):
63
63
  return self
64
64
 
65
65
 
66
- # Verbatim system prompt from app/api/invoices/process/route.ts.
67
66
  INVOICE_EXTRACTION_PROMPT = """You are an expert invoice data extraction assistant.
68
67
 
69
68
  CRITICAL RULES:
70
- 1. **LANGUAGE REPAIR**:
71
- - The text may come from OCR and may have missing characters.
72
- - If language is 'tr' (Turkish), intelligently fix missing Turkish characters.
69
+ 1. **OCR REPAIR**: The text may come from OCR and may have missing or garbled characters.
70
+ Use context to infer the correct value; do not invent values that are not on the document.
73
71
 
74
72
  2. **NUMBER PARSING**:
75
- - Be extremely careful with comma (,) and dot (.).
76
- - In Turkish/European invoices, '1.200,50' means One Thousand Two Hundred and 50 cents.
77
- - NEVER confuse a quantity (e.g., 500) with a price (e.g. 5,00).
73
+ - Be careful with comma (,) and dot (.) as thousand separators vs decimal points.
74
+ - European format: '1.200,50' = 1200.50. US/UK format: '1,200.50' = 1200.50.
75
+ - NEVER confuse a quantity (e.g., 2) with a unit price (e.g., 45.00).
78
76
 
79
77
  3. **CURRENCY DETECTION**:
80
- - Look for symbols: ₺, TL, TRY, USD, $, EUR, €.
81
- - Prioritize 'TRY' / 'TL' unless explicitly stated otherwise.
78
+ - Look for symbols or codes on the document: $, USD, €, EUR, £, GBP, ₺, TRY, etc.
79
+ - Use ONLY what is explicitly stated. Do not default to any currency.
82
80
 
83
- Extract the following fields if it exists:
81
+ 4. **UNITS**: Copy the unit exactly as written on the document (pcs, kg, hrs, m², etc.).
82
+ If no unit is shown, use null — never invent one.
83
+
84
+ Extract the following fields if present:
84
85
  - 'supplier_name': Name of the vendor/supplier.
85
86
  - 'invoice_date': Format YYYY-MM-DD.
86
87
  - 'invoice_number': The invoice ID/number.
87
- - 'tax_id': Tax ID / VKN / TCKN.
88
- - 'tax_rate': It can be like 'KDV' and for example if it is 'KDV' and it is %20, write it as 'KDV %20' in excel.
89
- - 'currency': Currency code (TRY, USD, EUR, etc.).
88
+ - 'tax_id': Tax ID or VAT registration number.
89
+ - 'tax_rate': Tax/VAT rate as written (e.g. 'VAT 20%', 'GST 10%').
90
+ - 'currency': ISO currency code (USD, EUR, GBP, TRY, etc.).
90
91
  - 'total_amount': Final total amount (numeric).
91
92
  - 'line_items': An array of items/services. Each item should have:
92
93
  - 'description': Product/Service name.
93
94
  - 'quantity': Numeric quantity. If missing, calculate it as total / unit_price. Default 1 only if neither is available.
94
- - 'unit': Unit type (Adet, Kg, Saat, etc.).
95
+ - 'unit': Unit of measure exactly as written on the document. Null if not present.
95
96
  - 'unit_price': Price per unit.
96
- - 'tax_rate': Tax percentage (e.g., 20, 10, 0) or pattern.
97
+ - 'tax_rate': Tax percentage (e.g., 20, 10, 0) or null.
97
98
  - 'total': Total price for this line.
98
99
 
99
100
  Return ONLY a valid JSON object. If a field is not found, use null."""
@@ -48,6 +48,9 @@ def _patch_analyzer(monkeypatch, ocr_text: str = "hello world", structured: dict
48
48
  def extract(self, *args, schema=None, **kwargs):
49
49
  return schema(**(structured or {}))
50
50
 
51
+ def extract_text(self, text, schema, **kwargs):
52
+ return schema(**(structured or {}))
53
+
51
54
  monkeypatch.setattr(cli_mod, "Analyzer", _FakeAnalyzer)
52
55
  monkeypatch.setattr(cli_mod, "_build_llm", lambda provider, model: None)
53
56
 
@@ -99,7 +102,7 @@ def test_extract_json_output(ascii_tmp, monkeypatch):
99
102
  result = runner.invoke(app, ["extract", str(png), "--output", "json"])
100
103
  assert result.exit_code == 0
101
104
  import json
102
- data = json.loads(result.output)
105
+ data = json.loads(result.output[result.output.index("{"):])
103
106
  assert data["text"] == "some text"
104
107
  assert data["text_source"] == "ocr"
105
108
 
@@ -119,7 +122,7 @@ def test_extract_invoice_schema(ascii_tmp, monkeypatch):
119
122
  result = runner.invoke(app, ["extract", str(png), "--schema", "invoice"])
120
123
  assert result.exit_code == 0
121
124
  import json
122
- data = json.loads(result.output)
125
+ data = json.loads(result.output[result.output.index("{"):])
123
126
  assert data["supplier_name"] == "ACME"
124
127
  assert data["total_amount"] == 250.0
125
128
 
@@ -131,7 +134,7 @@ def test_extract_receipt_schema(ascii_tmp, monkeypatch):
131
134
  result = runner.invoke(app, ["extract", str(png), "--schema", "receipt"])
132
135
  assert result.exit_code == 0
133
136
  import json
134
- data = json.loads(result.output)
137
+ data = json.loads(result.output[result.output.index("{"):])
135
138
  assert data["store_name"] == "Migros"
136
139
 
137
140
 
File without changes
File without changes
File without changes
File without changes
File without changes