ocrcontext 0.1.3__tar.gz → 0.1.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. {ocrcontext-0.1.3 → ocrcontext-0.1.4}/CHANGELOG.md +3 -2
  2. {ocrcontext-0.1.3 → ocrcontext-0.1.4}/PKG-INFO +12 -1
  3. {ocrcontext-0.1.3 → ocrcontext-0.1.4}/README.md +11 -0
  4. {ocrcontext-0.1.3 → ocrcontext-0.1.4}/pyproject.toml +1 -1
  5. {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/cli.py +84 -20
  6. {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/engines/paddle.py +8 -4
  7. {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/llm/schemas.py +19 -18
  8. {ocrcontext-0.1.3 → ocrcontext-0.1.4}/tests/test_cli.py +6 -3
  9. {ocrcontext-0.1.3 → ocrcontext-0.1.4}/.gitignore +0 -0
  10. {ocrcontext-0.1.3 → ocrcontext-0.1.4}/LICENSE +0 -0
  11. {ocrcontext-0.1.3 → ocrcontext-0.1.4}/examples/01_quickstart.py +0 -0
  12. {ocrcontext-0.1.3 → ocrcontext-0.1.4}/examples/02_refine_openai.py +0 -0
  13. {ocrcontext-0.1.3 → ocrcontext-0.1.4}/examples/03_structured_invoice.py +0 -0
  14. {ocrcontext-0.1.3 → ocrcontext-0.1.4}/examples/04_local_ollama.py +0 -0
  15. {ocrcontext-0.1.3 → ocrcontext-0.1.4}/examples/image_smoke_test.py +0 -0
  16. {ocrcontext-0.1.3 → ocrcontext-0.1.4}/examples/pdf_smoke_test.py +0 -0
  17. {ocrcontext-0.1.3 → ocrcontext-0.1.4}/examples/structured_smoke_test.py +0 -0
  18. {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/__init__.py +0 -0
  19. {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/analyzer.py +0 -0
  20. {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/config.py +0 -0
  21. {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/engines/__init__.py +0 -0
  22. {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/engines/base.py +0 -0
  23. {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/engines/handwriting.py +0 -0
  24. {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/engines/pdf_text.py +0 -0
  25. {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/engines/registry.py +0 -0
  26. {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/engines/trocr.py +0 -0
  27. {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/engines/vision.py +0 -0
  28. {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/exceptions.py +0 -0
  29. {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/llm/__init__.py +0 -0
  30. {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/llm/drift.py +0 -0
  31. {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/llm/extractor.py +0 -0
  32. {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/llm/formatting.py +0 -0
  33. {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/llm/literal_preserve.py +0 -0
  34. {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/llm/prompts.py +0 -0
  35. {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/llm/refiner.py +0 -0
  36. {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/loaders.py +0 -0
  37. {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/pipeline.py +0 -0
  38. {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/preprocessing/__init__.py +0 -0
  39. {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/preprocessing/image.py +0 -0
  40. {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/py.typed +0 -0
  41. {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/quality.py +0 -0
  42. {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/schemas.py +0 -0
  43. {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/types.py +0 -0
  44. {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/utils/__init__.py +0 -0
  45. {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/utils/files.py +0 -0
  46. {ocrcontext-0.1.3 → ocrcontext-0.1.4}/src/ocrcontext/utils/lang.py +0 -0
  47. {ocrcontext-0.1.3 → ocrcontext-0.1.4}/tests/__init__.py +0 -0
  48. {ocrcontext-0.1.3 → ocrcontext-0.1.4}/tests/conftest.py +0 -0
  49. {ocrcontext-0.1.3 → ocrcontext-0.1.4}/tests/test_langchain_loader.py +0 -0
  50. {ocrcontext-0.1.3 → ocrcontext-0.1.4}/tests/test_literal_preserve.py +0 -0
  51. {ocrcontext-0.1.3 → ocrcontext-0.1.4}/tests/test_llm.py +0 -0
  52. {ocrcontext-0.1.3 → ocrcontext-0.1.4}/tests/test_pipeline_analyzer.py +0 -0
  53. {ocrcontext-0.1.3 → ocrcontext-0.1.4}/tests/test_schemas.py +0 -0
  54. {ocrcontext-0.1.3 → ocrcontext-0.1.4}/tests/test_text_helpers.py +0 -0
@@ -7,7 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
- ## [0.1.3] - 2026-06-27
10
+ ## [0.1.4] - 2026-06-27
11
11
 
12
12
  ### Added
13
13
  - **GPU acceleration** — `Analyzer(use_gpu=True)` routes PaddleOCR inference to a
@@ -95,7 +95,8 @@ into a standalone, LLM-agnostic library.
95
95
  - **Packaging** — optional extras `[paddle]`, `[trocr]`, `[vision]`, `[all]`;
96
96
  PEP 561 typed (`py.typed`); examples and a GPU/network-free test suite.
97
97
 
98
- [Unreleased]: https://github.com/bahadirkarsli/ocrcontext/compare/v0.1.3...HEAD
98
+ [Unreleased]: https://github.com/bahadirkarsli/ocrcontext/compare/v0.1.4...HEAD
99
+ [0.1.4]: https://github.com/bahadirkarsli/ocrcontext/compare/v0.1.3...v0.1.4
99
100
  [0.1.3]: https://github.com/bahadirkarsli/ocrcontext/compare/v0.1.2...v0.1.3
100
101
  [0.1.2]: https://github.com/bahadirkarsli/ocrcontext/compare/v0.1.1...v0.1.2
101
102
  [0.1.1]: https://github.com/bahadirkarsli/ocrcontext/compare/v0.1.0...v0.1.1
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ocrcontext
3
- Version: 0.1.3
3
+ Version: 0.1.4
4
4
  Summary: Decoupled, LLM-agnostic document OCR + structured extraction. Vision and LLM parsing in 3 lines of code.
5
5
  Project-URL: Homepage, https://github.com/BahadirKarsli/OCRContext
6
6
  Project-URL: Repository, https://github.com/BahadirKarsli/OCRContext
@@ -90,8 +90,19 @@ print(result.text)
90
90
 
91
91
  `ocrcontext` is the extraction core of a production document-analysis platform, lifted out of its FastAPI/Next.js stack into a pure, pip-installable library. It handles OCR engine routing, fidelity-first LLM cleanup, and schema-based structured extraction — and gets out of your way.
92
92
 
93
+ ## Demo
94
+
95
+ **Structured invoice extraction from an image:**
96
+
97
+ <img width="100%" alt="Invoice extraction demo" src="https://github.com/user-attachments/assets/8e77ab83-fff3-4929-9a54-7f4a75acc16f" />
98
+
99
+ **Digital PDF text extraction:**
100
+
101
+ <img width="100%" alt="PDF extraction demo" src="https://github.com/user-attachments/assets/84437bd0-9d24-4a2e-8e0c-0014c9e85820" />
102
+
93
103
  ## Contents
94
104
 
105
+ - [Demo](#demo)
95
106
  - [Install](#install)
96
107
  - [CLI](#cli)
97
108
  - [Quick start (Python API)](#quick-start-python-api)
@@ -35,8 +35,19 @@ print(result.text)
35
35
 
36
36
  `ocrcontext` is the extraction core of a production document-analysis platform, lifted out of its FastAPI/Next.js stack into a pure, pip-installable library. It handles OCR engine routing, fidelity-first LLM cleanup, and schema-based structured extraction — and gets out of your way.
37
37
 
38
+ ## Demo
39
+
40
+ **Structured invoice extraction from an image:**
41
+
42
+ <img width="100%" alt="Invoice extraction demo" src="https://github.com/user-attachments/assets/8e77ab83-fff3-4929-9a54-7f4a75acc16f" />
43
+
44
+ **Digital PDF text extraction:**
45
+
46
+ <img width="100%" alt="PDF extraction demo" src="https://github.com/user-attachments/assets/84437bd0-9d24-4a2e-8e0c-0014c9e85820" />
47
+
38
48
  ## Contents
39
49
 
50
+ - [Demo](#demo)
40
51
  - [Install](#install)
41
52
  - [CLI](#cli)
42
53
  - [Quick start (Python API)](#quick-start-python-api)
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "ocrcontext"
7
- version = "0.1.3"
7
+ version = "0.1.4"
8
8
  description = "Decoupled, LLM-agnostic document OCR + structured extraction. Vision and LLM parsing in 3 lines of code."
9
9
  readme = "README.md"
10
10
  license = { text = "MIT" }
@@ -11,6 +11,7 @@ Then run:
11
11
 
12
12
  from __future__ import annotations
13
13
 
14
+ import os
14
15
  import sys
15
16
  from pathlib import Path
16
17
  from typing import Optional
@@ -25,6 +26,7 @@ except ImportError: # pragma: no cover
25
26
 
26
27
  from .analyzer import Analyzer
27
28
  from .config import AnalyzerConfig
29
+ from .types import OcrResult
28
30
  from .schemas import (
29
31
  Contract,
30
32
  IdCard,
@@ -33,6 +35,61 @@ from .schemas import (
33
35
  Receipt,
34
36
  )
35
37
 
38
+ def _suppress_paddle_noise() -> None:
39
+ import logging
40
+ import warnings
41
+
42
+ # Set env vars BEFORE any paddle/paddlex import so they see the right paths.
43
+ # _ensure_ascii_model_cache() in paddle.py does the same but only when the
44
+ # engine lazy-loads; calling it here guarantees it runs first.
45
+ from .engines.paddle import _ensure_ascii_model_cache, _ensure_paddle_runtime_flags
46
+ _ensure_ascii_model_cache()
47
+ _ensure_paddle_runtime_flags()
48
+
49
+ os.environ.setdefault("GLOG_minloglevel", "3")
50
+
51
+ # Silence Python-level loggers (no paddlex import — that would defeat the purpose).
52
+ null = logging.NullHandler()
53
+ for name in ("ppocr", "paddlex", "paddle", "paddle.utils", "paddle.fluid"):
54
+ lg = logging.getLogger(name)
55
+ lg.setLevel(logging.ERROR)
56
+ lg.handlers = [null]
57
+ lg.propagate = False
58
+
59
+ # Root-level filter catches sub-loggers that bypass the above (e.g. paddlex.utils.*).
60
+ class _NoiseFilter(logging.Filter):
61
+ _NOISE = ("Could not find files", "ccache", "oneDNN", "mkldnn")
62
+ def filter(self, record: logging.LogRecord) -> bool:
63
+ return not any(t in record.getMessage() for t in self._NOISE)
64
+
65
+ logging.getLogger().addFilter(_NoiseFilter())
66
+
67
+ warnings.filterwarnings("ignore", category=UserWarning, module="paddle")
68
+
69
+
70
+
71
+
72
+ def _route_label(result: OcrResult, file_path: Path) -> str:
73
+ src = result.text_source
74
+ if src == "pdf_text_layer":
75
+ return "DIGITAL PDF -> text layer"
76
+ if src == "ocr":
77
+ return "SCANNED PDF -> rasterize + PaddleOCR" if file_path.suffix.lower() == ".pdf" else "IMAGE -> PaddleOCR"
78
+ if src == "vision_handwriting":
79
+ return "HANDWRITING -> Google Vision"
80
+ if src == "handwriting_ocr":
81
+ return "HANDWRITING -> PaddleOCR"
82
+ return src
83
+
84
+
85
+ def _info(msg: str) -> None:
86
+ typer.echo(f"[i] {msg}", err=True)
87
+
88
+
89
+ def _ok(msg: str) -> None:
90
+ typer.echo(f"[OK] {msg}", err=True)
91
+
92
+
36
93
  app = typer.Typer(
37
94
  name="ocrcontext",
38
95
  help="OCR a document and optionally extract structured data.",
@@ -129,12 +186,13 @@ def extract(
129
186
  ) -> None:
130
187
  """OCR a document and optionally extract structured data."""
131
188
 
189
+ _suppress_paddle_noise()
190
+
132
191
  file_path = Path(file)
133
192
  if not file_path.exists():
134
193
  typer.echo(f"[ERROR] File not found: {file}", err=True)
135
194
  raise typer.Exit(code=1)
136
195
 
137
- # Validate --schema value early for a clear error message.
138
196
  if schema is not None and schema not in _SCHEMAS:
139
197
  typer.echo(
140
198
  f"[ERROR] Unknown schema '{schema}'. "
@@ -148,36 +206,42 @@ def extract(
148
206
  raise typer.Exit(code=1)
149
207
 
150
208
  refine_flag = _parse_refine(refine)
151
-
152
- # Build LLM only when needed.
153
209
  needs_llm = schema is not None or refine_flag is True
154
210
  llm = _build_llm(provider, model) if needs_llm else None
155
211
 
156
- analyzer = Analyzer(
157
- llm=llm,
158
- config=AnalyzerConfig(lang=lang),
159
- )
212
+ analyzer = Analyzer(llm=llm, config=AnalyzerConfig(lang=lang))
160
213
 
161
214
  try:
162
- if schema is not None:
163
- schema_cls = _SCHEMAS[schema]
164
- result = analyzer.extract(
165
- file_path,
166
- schema=schema_cls,
167
- handwriting=handwriting,
168
- refine=refine_flag or False,
169
- )
170
- typer.echo(result.model_dump_json(indent=2))
171
- else:
172
- result = analyzer.analyze(
215
+ _info(f"file: {file_path.name}")
216
+ _info("OCR...")
217
+
218
+ ocr_result = analyzer.analyze(
173
219
  file_path,
174
220
  handwriting=handwriting,
175
221
  refine=refine_flag,
176
222
  )
223
+
224
+ conf = f"confidence: {ocr_result.confidence:.0%}" if ocr_result.confidence < 1.0 else "exact"
225
+ _ok(f"route: {_route_label(ocr_result, file_path)} ({conf})")
226
+
227
+ if ocr_result.refined:
228
+ _ok("refine: APPLIED")
229
+
230
+ if schema is not None:
231
+ schema_cls = _SCHEMAS[schema]
232
+ _info(f"extract: {schema} schema...")
233
+ structured = analyzer.extract_text(
234
+ ocr_result.text,
235
+ schema_cls,
236
+ language=lang,
237
+ )
238
+ _ok(f"extract: {schema} [OK]")
239
+ typer.echo(structured.model_dump_json(indent=2))
240
+ else:
177
241
  if output == "json":
178
- typer.echo(result.model_dump_json(indent=2))
242
+ typer.echo(ocr_result.model_dump_json(indent=2))
179
243
  else:
180
- typer.echo(result.text)
244
+ typer.echo(ocr_result.text)
181
245
 
182
246
  except Exception as exc: # noqa: BLE001
183
247
  typer.echo(f"[ERROR] {exc}", err=True)
@@ -104,6 +104,7 @@ class PaddleEngine(OcrEngine):
104
104
  import logging
105
105
 
106
106
  logging.getLogger("ppocr").setLevel(logging.ERROR)
107
+ logging.getLogger("paddlex").setLevel(logging.ERROR)
107
108
  requested = paddle_lang
108
109
  ocr, errors = self._try_init(PaddleOCR, paddle_lang, use_gpu=self._use_gpu)
109
110
  if ocr is None and paddle_lang != "en":
@@ -129,9 +130,12 @@ class PaddleEngine(OcrEngine):
129
130
  # Shared 3.x flags: disable sub-models unneeded for plain OCR.
130
131
  # enable_mkldnn is forced False on CPU to avoid PaddlePaddle 3.x PIR bug;
131
132
  # on GPU it's irrelevant (MKLDNN is CPU-only) but harmless to keep False.
133
+ # use_gpu is only injected when True — some PaddleOCR 3.x builds reject it
134
+ # as an unknown argument even when set to False.
135
+ gpu_kwargs = {"use_gpu": True} if use_gpu else {}
132
136
  base_3x = {
133
137
  "lang": lang,
134
- "use_gpu": use_gpu,
138
+ **gpu_kwargs,
135
139
  "use_doc_orientation_classify": False,
136
140
  "use_doc_unwarping": False,
137
141
  "use_textline_orientation": False,
@@ -147,10 +151,10 @@ class PaddleEngine(OcrEngine):
147
151
  # 3.x default — version determined by installed package, no pin
148
152
  base_3x,
149
153
  # Minimal 3.x (for builds that reject the sub-model flags)
150
- {"lang": lang, "use_gpu": use_gpu, "enable_mkldnn": False},
151
- {"lang": lang, "use_gpu": use_gpu},
154
+ {"lang": lang, **gpu_kwargs, "enable_mkldnn": False},
155
+ {"lang": lang, **gpu_kwargs},
152
156
  # Legacy 2.x (use_angle_cls; use_doc_* / show_log don't exist in 2.x)
153
- {"use_angle_cls": True, "lang": lang, "use_gpu": use_gpu},
157
+ {"use_angle_cls": True, "lang": lang, **gpu_kwargs},
154
158
  ]
155
159
  errors: list[str] = []
156
160
  for kwargs in profiles:
@@ -22,7 +22,7 @@ class LineItem(BaseModel):
22
22
  "Default 1 only if neither is available."
23
23
  ),
24
24
  )
25
- unit: Optional[str] = Field(None, description="Unit type (Adet, Kg, Saat, etc.).")
25
+ unit: Optional[str] = Field(None, description="Unit of measure as written on the document (e.g. pcs, kg, hrs). Null if not present.")
26
26
  unit_price: Optional[float] = Field(None, description="Price per unit.")
27
27
  tax_rate: Optional[str] = Field(
28
28
  None, description="Tax percentage (e.g., 20, 10, 0) or pattern."
@@ -34,9 +34,9 @@ class Invoice(BaseModel):
34
34
  supplier_name: Optional[str] = Field(None, description="Name of the vendor/supplier.")
35
35
  invoice_date: Optional[str] = Field(None, description="Format YYYY-MM-DD.")
36
36
  invoice_number: Optional[str] = Field(None, description="The invoice ID/number.")
37
- tax_id: Optional[str] = Field(None, description="Tax ID / VKN / TCKN.")
37
+ tax_id: Optional[str] = Field(None, description="Tax ID / VAT registration number.")
38
38
  tax_rate: Optional[str] = Field(
39
- None, description="e.g. 'KDV %20' when KDV is 20%."
39
+ None, description="Tax/VAT rate as written on the document (e.g. 'VAT 20%', 'GST 10%')."
40
40
  )
41
41
  currency: Optional[str] = Field(None, description="Currency code (TRY, USD, EUR, etc.).")
42
42
  total_amount: Optional[float] = Field(None, description="Final total amount (numeric).")
@@ -63,37 +63,38 @@ class Invoice(BaseModel):
63
63
  return self
64
64
 
65
65
 
66
- # Verbatim system prompt from app/api/invoices/process/route.ts.
67
66
  INVOICE_EXTRACTION_PROMPT = """You are an expert invoice data extraction assistant.
68
67
 
69
68
  CRITICAL RULES:
70
- 1. **LANGUAGE REPAIR**:
71
- - The text may come from OCR and may have missing characters.
72
- - If language is 'tr' (Turkish), intelligently fix missing Turkish characters.
69
+ 1. **OCR REPAIR**: The text may come from OCR and may have missing or garbled characters.
70
+ Use context to infer the correct value; do not invent values that are not on the document.
73
71
 
74
72
  2. **NUMBER PARSING**:
75
- - Be extremely careful with comma (,) and dot (.).
76
- - In Turkish/European invoices, '1.200,50' means One Thousand Two Hundred and 50 cents.
77
- - NEVER confuse a quantity (e.g., 500) with a price (e.g. 5,00).
73
+ - Be careful with comma (,) and dot (.) as thousand separators vs decimal points.
74
+ - European format: '1.200,50' = 1200.50. US/UK format: '1,200.50' = 1200.50.
75
+ - NEVER confuse a quantity (e.g., 2) with a unit price (e.g., 45.00).
78
76
 
79
77
  3. **CURRENCY DETECTION**:
80
- - Look for symbols: ₺, TL, TRY, USD, $, EUR, €.
81
- - Prioritize 'TRY' / 'TL' unless explicitly stated otherwise.
78
+ - Look for symbols or codes on the document: $, USD, €, EUR, £, GBP, ₺, TRY, etc.
79
+ - Use ONLY what is explicitly stated. Do not default to any currency.
82
80
 
83
- Extract the following fields if it exists:
81
+ 4. **UNITS**: Copy the unit exactly as written on the document (pcs, kg, hrs, m², etc.).
82
+ If no unit is shown, use null — never invent one.
83
+
84
+ Extract the following fields if present:
84
85
  - 'supplier_name': Name of the vendor/supplier.
85
86
  - 'invoice_date': Format YYYY-MM-DD.
86
87
  - 'invoice_number': The invoice ID/number.
87
- - 'tax_id': Tax ID / VKN / TCKN.
88
- - 'tax_rate': It can be like 'KDV' and for example if it is 'KDV' and it is %20, write it as 'KDV %20' in excel.
89
- - 'currency': Currency code (TRY, USD, EUR, etc.).
88
+ - 'tax_id': Tax ID or VAT registration number.
89
+ - 'tax_rate': Tax/VAT rate as written (e.g. 'VAT 20%', 'GST 10%').
90
+ - 'currency': ISO currency code (USD, EUR, GBP, TRY, etc.).
90
91
  - 'total_amount': Final total amount (numeric).
91
92
  - 'line_items': An array of items/services. Each item should have:
92
93
  - 'description': Product/Service name.
93
94
  - 'quantity': Numeric quantity. If missing, calculate it as total / unit_price. Default 1 only if neither is available.
94
- - 'unit': Unit type (Adet, Kg, Saat, etc.).
95
+ - 'unit': Unit of measure exactly as written on the document. Null if not present.
95
96
  - 'unit_price': Price per unit.
96
- - 'tax_rate': Tax percentage (e.g., 20, 10, 0) or pattern.
97
+ - 'tax_rate': Tax percentage (e.g., 20, 10, 0) or null.
97
98
  - 'total': Total price for this line.
98
99
 
99
100
  Return ONLY a valid JSON object. If a field is not found, use null."""
@@ -48,6 +48,9 @@ def _patch_analyzer(monkeypatch, ocr_text: str = "hello world", structured: dict
48
48
  def extract(self, *args, schema=None, **kwargs):
49
49
  return schema(**(structured or {}))
50
50
 
51
+ def extract_text(self, text, schema, **kwargs):
52
+ return schema(**(structured or {}))
53
+
51
54
  monkeypatch.setattr(cli_mod, "Analyzer", _FakeAnalyzer)
52
55
  monkeypatch.setattr(cli_mod, "_build_llm", lambda provider, model: None)
53
56
 
@@ -99,7 +102,7 @@ def test_extract_json_output(ascii_tmp, monkeypatch):
99
102
  result = runner.invoke(app, ["extract", str(png), "--output", "json"])
100
103
  assert result.exit_code == 0
101
104
  import json
102
- data = json.loads(result.output)
105
+ data = json.loads(result.output[result.output.index("{"):])
103
106
  assert data["text"] == "some text"
104
107
  assert data["text_source"] == "ocr"
105
108
 
@@ -119,7 +122,7 @@ def test_extract_invoice_schema(ascii_tmp, monkeypatch):
119
122
  result = runner.invoke(app, ["extract", str(png), "--schema", "invoice"])
120
123
  assert result.exit_code == 0
121
124
  import json
122
- data = json.loads(result.output)
125
+ data = json.loads(result.output[result.output.index("{"):])
123
126
  assert data["supplier_name"] == "ACME"
124
127
  assert data["total_amount"] == 250.0
125
128
 
@@ -131,7 +134,7 @@ def test_extract_receipt_schema(ascii_tmp, monkeypatch):
131
134
  result = runner.invoke(app, ["extract", str(png), "--schema", "receipt"])
132
135
  assert result.exit_code == 0
133
136
  import json
134
- data = json.loads(result.output)
137
+ data = json.loads(result.output[result.output.index("{"):])
135
138
  assert data["store_name"] == "Migros"
136
139
 
137
140
 
File without changes
File without changes
File without changes
File without changes
File without changes