ocrcontext 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,63 @@
1
+ """LLM-agnostic structured extraction via LangChain's ``with_structured_output``.
2
+
3
+ Give it any Pydantic schema and a chat model; get a populated model instance
4
+ back. The Invoice schema in :mod:`ocrcontext.llm.schemas` is auto-detected so it
5
+ uses the verbatim invoice prompt, but any schema works.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from typing import TYPE_CHECKING, TypeVar
11
+
12
+ from pydantic import BaseModel
13
+
14
+ from .schemas import INVOICE_EXTRACTION_PROMPT, Invoice
15
+
16
+ if TYPE_CHECKING:
17
+ from langchain_core.language_models import BaseChatModel
18
+
19
+ TSchema = TypeVar("TSchema", bound=BaseModel)
20
+
21
# System prompt used when the caller supplies none and the schema has no
# dedicated prompt. The adjacent literals are joined by the parser, so every
# piece except the last carries its own trailing space.
_GENERIC_PROMPT = (
    "You are an expert document data extraction assistant. "
    "The text may come from OCR and may contain scanning errors and missing characters. "
    "Extract the requested fields faithfully from the document text. "
    "Do not invent values: if a field is not present in the text, leave it null/empty. "
    "Preserve the document's original language for textual fields and do not translate."
)
28
+
29
+
30
class StructuredExtractor:
    """Populate a Pydantic schema from document text via an injected chat model.

    The only capability required of the model is ``with_structured_output``;
    which provider backs it is the caller's choice.
    """

    def __init__(self, llm: "BaseChatModel") -> None:
        """Store the chat model that every ``extract`` call will use."""
        self._llm = llm

    def extract(
        self,
        text: str,
        schema: type[TSchema],
        *,
        language: str = "auto",
        system_prompt: str | None = None,
    ) -> TSchema:
        """Run one structured-extraction call and return the model's result.

        Args:
            text: Document text to extract from (typically OCR output).
            schema: Pydantic model class describing the fields to extract.
            language: Language hint placed verbatim into the user message.
            system_prompt: Replacement system prompt. ``None`` or an empty
                string selects the default prompt for ``schema``.

        Returns:
            Whatever the structured-output runnable returns for ``schema``.
        """
        # Imported at call time; the module-level langchain_core import is
        # guarded by TYPE_CHECKING.
        from langchain_core.messages import HumanMessage, SystemMessage

        if system_prompt:
            system_text = system_prompt
        else:
            system_text = self._default_prompt(schema)

        user_text = (
            f"Language Context: {language}\n"
            f"Extract detailed data from this document text:\n\n{text}"
        )

        extraction_chain = self._llm.with_structured_output(schema)
        # with_structured_output returns an instance of `schema`.
        return extraction_chain.invoke(  # type: ignore[return-value]
            [SystemMessage(content=system_text), HumanMessage(content=user_text)]
        )

    @staticmethod
    def _default_prompt(schema: type[BaseModel]) -> str:
        """Return the invoice prompt for ``Invoice`` itself, otherwise the generic one."""
        # Identity check: only the exact Invoice class selects the invoice prompt.
        return INVOICE_EXTRACTION_PROMPT if schema is Invoice else _GENERIC_PROMPT
@@ -0,0 +1,39 @@
1
+ """Plain-text formatting, ported from lib/ocr/plain-text-format.ts.
2
+
3
+ Strip Markdown syntax so the model's structured output stays clean plain text
4
+ while keeping layout (blank lines, headings, bullets).
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import re
10
+
11
# Line-level markers: each is anchored at the start of a single line and is
# applied to the lines one at a time.
_HEADING_RE = re.compile(r"^\s{0,3}#{1,6}\s+")
_BLOCKQUOTE_RE = re.compile(r"^(\s{0,3})>\s?")
_BULLET_RE = re.compile(r"^(\s*)[-*+]\s+")
# Whole-text markers: applied to the re-joined text, in the order listed in
# strip_markdown_formatting.
_CODE_FENCE_RE = re.compile(r"```[^\n`]*\n?")
_BOLD_RE = re.compile(r"\*\*([^*\n]+)\*\*")
_UNDERSCORE_BOLD_RE = re.compile(r"__([^_\n]+)__")
_INLINE_CODE_RE = re.compile(r"`([^`\n]+)`")
_TRAILING_WS_RE = re.compile(r"[ \t]+$", re.MULTILINE)
_EXTRA_BLANKS_RE = re.compile(r"\n{3,}")


def strip_markdown_formatting(text: str, *, convert_bullets: bool = False) -> str:
    """Remove Markdown syntax from ``text`` while keeping its line layout.

    Heading and blockquote markers are dropped from every line. Code fences,
    ``**bold**`` / ``__bold__`` markers and inline-code backticks are then
    removed from the whole text, trailing spaces/tabs are trimmed from each
    line, and runs of three or more newlines collapse to two.

    Args:
        text: Text that may contain Markdown.
        convert_bullets: When true, a leading ``-``, ``*`` or ``+`` list marker
            becomes ``• `` (indentation kept); otherwise markers are left alone.

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
    """

    def _strip_line_markers(raw_line: str) -> str:
        without_heading = _HEADING_RE.sub("", raw_line)
        without_quote = _BLOCKQUOTE_RE.sub(r"\1", without_heading)
        if not convert_bullets:
            return without_quote
        return _BULLET_RE.sub(r"\1• ", without_quote)

    cleaned = "\n".join(_strip_line_markers(raw_line) for raw_line in text.split("\n"))

    # Order matters: fences go first so their backticks are not seen as inline code.
    whole_text_passes = (
        (_CODE_FENCE_RE, ""),
        (_BOLD_RE, r"\1"),
        (_UNDERSCORE_BOLD_RE, r"\1"),
        (_INLINE_CODE_RE, r"\1"),
        (_TRAILING_WS_RE, ""),
        (_EXTRA_BLANKS_RE, "\n\n"),
    )
    for pattern, replacement in whole_text_passes:
        cleaned = pattern.sub(replacement, cleaned)

    return cleaned.strip()
@@ -0,0 +1,164 @@
1
+ """Literal / contact-data preservation, ported from lib/ocr/literal-preserve.ts.
2
+
3
+ Emails, URLs, IBANs and card numbers are masked to ``{{OCRLITn}}`` placeholders
4
+ before the LLM sees the text and restored verbatim afterwards, so the model
5
+ cannot "fix" identifiers (e.g. bahadrkrsl@... -> bahadirkarsli@...).
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import re
11
+ from dataclasses import dataclass
12
+
13
+ # Placeholders injected before LLM refine; restored verbatim after.
14
def _token_for(index: int) -> str:
    """Return the ``{{OCRLITn}}`` placeholder text for position ``index``."""
    return "{{OCRLIT" + format(index) + "}}"
16
+
17
+
18
# E-mail address: local part and domain each begin and end with an
# alphanumeric, and the domain ends in a dot plus at least two letters.
EMAIL_PATTERN = re.compile(
    r"[a-zA-Z0-9](?:[a-zA-Z0-9._%+-]*[a-zA-Z0-9])?"
    r"@[a-zA-Z0-9](?:[a-zA-Z0-9.-]*[a-zA-Z0-9])?\.[a-zA-Z]{2,}"
)
# http:// or https:// URL, running until whitespace, a quote, an angle
# bracket, a closing bracket/paren/brace or a plus sign.
_HTTP_URL_PATTERN = re.compile(r"https?://[^\s<>\"'\])}+]+", re.IGNORECASE)
# Scheme-less address starting with "www.".
_WWW_URL_PATTERN = re.compile(
    r"www\.[a-zA-Z0-9][a-zA-Z0-9.-]*\.[a-zA-Z]{2,}[^\s<>\"']*", re.IGNORECASE
)
# IBAN-shaped token: two capitals, two digits, then 11-30 capitals/digits.
_IBAN_PATTERN = re.compile(r"\b[A-Z]{2}\d{2}[A-Z0-9]{11,30}\b")
# Sixteen digits in four groups, optionally separated by whitespace or a hyphen.
_CARD_NUMBER_PATTERN = re.compile(r"\b\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b")

# Every pattern whose matches are masked before the text reaches the model.
_LITERAL_PATTERNS: list[re.Pattern[str]] = [
    EMAIL_PATTERN,
    _HTTP_URL_PATTERN,
    _WWW_URL_PATTERN,
    _IBAN_PATTERN,
    _CARD_NUMBER_PATTERN,
]
30
+
31
+
32
@dataclass
class MaskResult:
    """Masked text together with the literal values that were taken out of it."""

    # Text in which protected literals have been swapped for placeholders.
    masked_text: str
    # Removed values, in placeholder order: entry n belongs to {{OCRLITn}}.
    literals: list[str]
36
+
37
+
38
def preprocess_literal_text(text: str) -> str:
    """Re-join address fragments that were split around an ``@`` sign.

    Two gap shapes are closed, in this order: a gap containing a newline before
    the ``@`` (any whitespace after it), then whitespace on both sides of the
    ``@``. Whitespace on only one side, without a newline before, is left alone.
    """
    local_part = r"([a-zA-Z0-9._%+-]+)"
    domain_start = r"([a-zA-Z0-9][a-zA-Z0-9.-]*)"
    gap_shapes = (
        r"\s*\n\s*@\s*",  # line break between local part and "@"
        r"\s+@\s+",  # whitespace on both sides of "@"
    )
    for gap in gap_shapes:
        text = re.sub(local_part + gap + domain_start, r"\1@\2", text)
    return text
47
+
48
+
49
def extract_emails(text: str) -> list[str]:
    """Return every e-mail address found in ``text``, in order of appearance.

    The text is first passed through ``preprocess_literal_text`` so addresses
    split around ``@`` are matched as well.
    """
    # EMAIL_PATTERN has no capturing groups, so findall yields the full matches.
    return EMAIL_PATTERN.findall(preprocess_literal_text(text))
52
+
53
+
54
def _levenshtein(a: str, b: str) -> int:
    """Return the edit distance (insert / delete / substitute) between ``a`` and ``b``."""
    # Only the previous row of the distance table is needed to fill the next one.
    previous_row = list(range(len(b) + 1))
    for row_index, char_a in enumerate(a, start=1):
        current_row = [row_index]
        for col_index, char_b in enumerate(b, start=1):
            deletion = previous_row[col_index] + 1
            insertion = current_row[col_index - 1] + 1
            substitution = previous_row[col_index - 1] + (0 if char_a == char_b else 1)
            current_row.append(min(deletion, insertion, substitution))
        previous_row = current_row
    return previous_row[-1]
66
+
67
+
68
def _is_likely_same_email(candidate: str, original: str) -> bool:
    """Decide whether ``candidate`` looks like a respelling of ``original``.

    Both must contain exactly one ``@``, have a non-empty local part and share
    the same domain (compared case-insensitively). Beyond an exact match, the
    lower-cased local parts may differ by an edit distance of at most
    ``max(2, 35% of the original local part's length)``.
    """
    cand_lower = candidate.lower()
    orig_lower = original.lower()
    if cand_lower.count("@") != 1 or orig_lower.count("@") != 1:
        return False

    cand_local, _, cand_domain = cand_lower.partition("@")
    orig_local, _, orig_domain = orig_lower.partition("@")
    if not (cand_local and orig_local and cand_domain == orig_domain):
        return False

    if candidate == original:
        return True

    tolerance = max(2, int(len(orig_local) * 0.35))
    return _levenshtein(cand_local, orig_local) <= tolerance
81
+
82
+
83
def enforce_original_literals(original_text: str, refined_text: str) -> str:
    """Force the OCR/original email spelling back if the model rewrote it.

    For every address found in ``original_text``, each address in
    ``refined_text`` on exactly the same domain is compared with it; when
    ``_is_likely_same_email`` judges it a respelling, it is replaced by the
    original spelling. Other addresses are left as they are.

    Args:
        original_text: Text before refinement; the source of trusted spellings.
        refined_text: Text returned by the model.

    Returns:
        ``refined_text`` with likely-rewritten addresses restored, or
        ``refined_text`` unchanged when the original contains no address.
    """
    originals = extract_emails(original_text)
    if not originals:
        return refined_text

    output = refined_text
    for orig in originals:
        parts = orig.split("@")
        if len(parts) != 2 or not parts[1]:
            continue
        domain = parts[1]

        domain_re = re.compile(
            r"[a-zA-Z0-9](?:[a-zA-Z0-9._%+-]*[a-zA-Z0-9])?@"
            + re.escape(domain)
            # The match must end where the domain ends. Without this guard an
            # original on "mail.co" would match the first part of an address on
            # "mail.com" and splice the original into that unrelated address.
            + r"(?![a-zA-Z0-9-]|\.[a-zA-Z0-9])",
            re.IGNORECASE,
        )

        # `orig` is bound as a default so the callback keeps this iteration's value.
        def _replace(match: re.Match[str], _orig: str = orig) -> str:
            found = match.group(0)
            return _orig if _is_likely_same_email(found, _orig) else found

        output = domain_re.sub(_replace, output)

    return output
112
+
113
+
114
def _collect_non_overlapping_spans(text: str) -> list[tuple[int, int, str]]:
    """Return ``(start, end, value)`` for protected literals, without overlaps.

    Matches of every pattern in ``_LITERAL_PATTERNS`` are gathered and ordered
    by start position; a match that begins inside an already accepted span is
    discarded.
    """
    # start asc; at equal start, the longer span wins (so it is kept, shorter dropped).
    candidates = sorted(
        (
            (found.start(), found.end(), found.group(0))
            for pattern in _LITERAL_PATTERNS
            for found in pattern.finditer(text)
        ),
        key=lambda span: (span[0], -span[1]),
    )

    accepted: list[tuple[int, int, str]] = []
    claimed_until = 0  # end offset of the last accepted span
    for candidate in candidates:
        if candidate[0] >= claimed_until:
            accepted.append(candidate)
            claimed_until = candidate[1]
    return accepted
128
+
129
+
130
def mask_protected_literals(text: str) -> MaskResult:
    """Swap every protected literal in ``text`` for a numbered placeholder.

    The text is normalised with ``preprocess_literal_text`` first, so the
    returned ``masked_text`` is based on that normalised form. Literals are
    numbered in order of appearance.
    """
    source = preprocess_literal_text(text)
    pieces: list[str] = []
    literals: list[str] = []
    position = 0
    for start, end, value in _collect_non_overlapping_spans(source):
        pieces.append(source[position:start])
        # The placeholder number is the literal's index in `literals`.
        pieces.append(_token_for(len(literals)))
        literals.append(value)
        position = end
    pieces.append(source[position:])
    return MaskResult(masked_text="".join(pieces), literals=literals)
143
+
144
+
145
# Matches a placeholder as emitted ({{OCRLIT3}}) and the variants a model may
# produce: different letter case, or spaces inside the braces / before the index.
_PLACEHOLDER_RE = re.compile(r"\{\{\s*OCRLIT\s*([0-9]+)\s*\}\}", re.IGNORECASE)


def unmask_protected_literals(text: str, literals: list[str]) -> str:
    """Put the original literals back in place of their placeholders.

    All placeholders are resolved in a single pass, so:

    * exact and slightly mangled forms of the same placeholder are both restored;
    * a restored literal is never rescanned for placeholder-looking text;
    * literals are inserted verbatim - backslashes in them are not treated as
      regex replacement escapes.

    Args:
        text: Model output containing ``{{OCRLITn}}`` placeholders.
        literals: Values to restore; index ``n`` replaces ``{{OCRLITn}}``.

    Returns:
        ``text`` with known placeholders replaced. A placeholder whose index
        has no entry in ``literals`` is left unchanged.
    """
    # Keyed by the decimal text of the index, so only the canonical spelling of
    # a number (no leading zeros) resolves.
    by_index = {str(position): literal for position, literal in enumerate(literals)}

    def _restore(match: re.Match[str]) -> str:
        return by_index.get(match.group(1), match.group(0))

    return _PLACEHOLDER_RE.sub(_restore, text)
155
+
156
+
157
+ LITERAL_PRESERVE_PROMPT = """
158
+ LITERAL / CONTACT DATA (CRITICAL):
159
+ - Tokens like {{OCRLIT0}}, {{OCRLIT1}}, ... are frozen placeholders for emails, URLs, IBANs, and similar identifiers.
160
+ - Copy every {{OCRLITn}} token EXACTLY — same spelling, same characters, same position in the sentence.
161
+ - NEVER "fix", complete, or guess emails/usernames (e.g. do NOT change bahadrkrsl@outlook.com to bahadirkarsli@outlook.com).
162
+ - NEVER invent @ symbols or domains. If a placeholder is present, output it unchanged.
163
+ - Apply OCR fixes only to normal words around these placeholders, not to the placeholders themselves.
164
+ """
@@ -0,0 +1,157 @@
1
+ """Refinement prompts, ported VERBATIM from lib/ocr/refine.ts.
2
+
3
+ These prompts were heavily tuned for fidelity; do not paraphrase them. Model
4
+ selection (gpt-4.1 vs gpt-4o) is intentionally dropped — the chat model is
5
+ injected by the caller. Only the per-mode temperature recommendation is kept.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from ..types import RefinementMode
11
+ from ..utils.lang import language_full_name
12
+ from .literal_preserve import LITERAL_PRESERVE_PROMPT
13
+
14
+
15
def refine_temperature(mode: RefinementMode) -> float:
    """Return the recommended sampling temperature for ``mode``.

    ``0.0`` for the conservative mode and both handwriting modes; ``0.1`` for
    every other mode.
    """
    deterministic_modes = (
        RefinementMode.CONSERVATIVE,
        RefinementMode.HANDWRITING_PROSE,
        RefinementMode.HANDWRITING_LAYOUT,
    )
    return 0.0 if mode in deterministic_modes else 0.1
24
+
25
+
26
+ _LAYOUT = """
27
+ LAYOUT MODE (for digital PDFs):
28
+ - Reconstruct clean document structure in plain text.
29
+ - Keep clear section separation with blank lines between paragraphs/sections.
30
+ - Preserve list structure as plain text list items (no markdown markers unless present in source).
31
+ - Preserve the original reading order and do not merge unrelated sections.
32
+ """
33
+
34
+ _HANDWRITING_LAYOUT = """
35
+ HANDWRITING LAYOUT MODE (for handwritten notes, lists, tables and diagrams of any topic):
36
+ - FIDELITY FIRST: stay faithful to what is actually written. Your job is to fix OCR errors, NOT to
37
+ improve the text. Do NOT paraphrase, summarize, complete unfinished sentences, smooth awkward
38
+ phrasing, or add connecting words. A slightly awkward but faithful transcription is the goal.
39
+ - Fix only clear OCR errors: missing diacritics, visually confused letters, and words the OCR split
40
+ or joined. If a word is plausible as written, keep it exactly.
41
+ - If a word or phrase is illegible, give your closest LITERAL character reading. NEVER replace it
42
+ with a fluent but made-up sentence, and never add facts that are not in the source. It is better
43
+ to leave a rough/partial phrase than to invent a clean one.
44
+ - Remove lines that are ONLY margin ruler numbers (e.g. lone "23", "2213").
45
+ - PRESERVE all of the source's content: keep every heading, label, list item, definition,
46
+ and arrow ("→") line. Never drop a line. If you are unsure about a line, keep it.
47
+ - PRESERVE THE ORIGINAL ORDER: keep lines and sentences in the exact order they appear in the source.
48
+ Do NOT reorder, move, or merge sentences to make the text flow better. If the source order looks
49
+ odd, leave it odd — do not "fix" it.
50
+ - Keep numbered lists and definition paragraphs as they are; fix only OCR errors in them.
51
+ - LAYOUT: only tidy spacing — keep the source's own line and section breaks, put each existing heading
52
+ on its own line, leave ONE blank line between existing sections, and use "• " for bullets that are
53
+ already bullets. Do NOT restructure content. Keep it PLAIN TEXT: no Markdown symbols (#, *, _,
54
+ backticks, >) and no code fences.
55
+ - Do NOT translate.
56
+ """
57
+
58
+ _HANDWRITING_PROSE = """
59
+ HANDWRITING PROSE MODE (poems, paragraphs, letters, notes — no sensitive data):
60
+ - FIDELITY FIRST: fix OCR errors but stay faithful to what is written. Do NOT paraphrase, rewrite
61
+ style, complete unfinished sentences, or add new words/sentences/ideas.
62
+ - Fix OCR errors: missing diacritics, visually confused letters, and words the OCR split or joined
63
+ (e.g. "düşünmedin sen" → "düşünmediysen"; split a wrongly-joined word). If a word is plausible as
64
+ written, keep it exactly — do NOT swap it for a synonym or a "better" word.
65
+ - Fix line breaks the OCR got wrong and keep real verse/paragraph breaks. Remove duplicate words and
66
+ stray margin numbers caused by OCR.
67
+ - Keep lines and sentences in their original order; do NOT reorder or move content.
68
+ - If a word is illegible, give your closest LITERAL reading; never invent a fluent replacement.
69
+ - Keep signatures, author names, and titles; only fix obvious typos in them.
70
+ - Do NOT translate. Keep the original language.
71
+ """
72
+
73
+ _CONSERVATIVE = """
74
+ CONSERVATIVE MODE (for OCR images/scans):
75
+ - Perform minimal, character-level OCR correction only.
76
+ - Do NOT replace a valid-looking word with a different semantic word.
77
+ - If uncertain, keep the original token exactly as-is.
78
+ - Do NOT infer missing entities (names, places, brands, email local-parts) from context.
79
+ - Preserve line order and keep output close to source line-by-line.
80
+ """
81
+
82
+ _MODE_INSTRUCTIONS = {
83
+ RefinementMode.LAYOUT: _LAYOUT,
84
+ RefinementMode.HANDWRITING_LAYOUT: _HANDWRITING_LAYOUT,
85
+ RefinementMode.HANDWRITING_PROSE: _HANDWRITING_PROSE,
86
+ RefinementMode.CONSERVATIVE: _CONSERVATIVE,
87
+ }
88
+
89
+ _SYSTEM_HANDWRITING_LAYOUT = (
90
+ "You are a world-class OCR post-processor for handwritten notes. "
91
+ "Fix OCR errors and tidy the layout, but stay faithful to what is written: "
92
+ "never paraphrase, complete, or invent text you cannot read. "
93
+ "Never alter frozen {{OCRLITn}} placeholders. Output plain text only."
94
+ )
95
+ _SYSTEM_HANDWRITING_PROSE = (
96
+ "You are a world-class OCR post-processor for handwritten prose and poetry. "
97
+ "Fix misread words and broken line breaks, but stay faithful: never paraphrase, "
98
+ "complete, or invent content, and never translate. "
99
+ "Never alter frozen {{OCRLITn}} placeholders. Output plain text only."
100
+ )
101
+ _SYSTEM_DEFAULT = (
102
+ "You are a world-class OCR post-processor. Fix OCR noise in normal prose only. "
103
+ "Never alter frozen {{OCRLITn}} placeholders (emails, URLs, banking IDs). "
104
+ "Never guess or complete email addresses or usernames. Output plain text only."
105
+ )
106
+
107
+
108
+ def build_refinement_prompt(
109
+ masked_text: str, language: str, mode: RefinementMode
110
+ ) -> tuple[str, str]:
111
+ """Return ``(system, user)`` prompt strings for the given mode."""
112
+ full_language = language_full_name(language) if language else None
113
+ if full_language and full_language != "auto":
114
+ language_prompt = (
115
+ f"The text is in {full_language}. Preserve the original language and only fix "
116
+ f"OCR errors using {full_language} spelling rules."
117
+ )
118
+ else:
119
+ language_prompt = (
120
+ "Preserve the original language of the text. Do not translate or change the language."
121
+ )
122
+
123
+ mode_instructions = _MODE_INSTRUCTIONS[mode]
124
+
125
+ user = f"""
126
+ You are an expert OCR post-processing AI. Your ONLY task is to reconstruct the original text from OCR output that contains scanning errors.
127
+ Never add new information and never remove existing information.
128
+ {mode_instructions}
129
+ {LITERAL_PRESERVE_PROMPT}
130
+
131
+ UNDERSTANDING OCR ERRORS:
132
+ OCR engines make SYSTEMATIC errors — they don't understand language, they only recognize shapes.
133
+
134
+ A) DIACRITIC STRIPPING (Turkish, French, German, Spanish, etc.)
135
+ B) VISUALLY SIMILAR CHARACTER CONFUSION (0↔O, rn→m, ...)
136
+ C) TRUNCATION & MISSING CHARACTERS
137
+ D) WORD BOUNDARY ERRORS
138
+
139
+ YOUR TASK:
140
+ 1. Fix OCR errors in regular words using sentence context.
141
+ 2. DO NOT translate. DO NOT change the language. DO NOT add or remove content.
142
+ 3. DO NOT add commentary. Output ONLY the corrected plain text.
143
+ 4. Output PLAIN TEXT only. Do NOT use Markdown syntax (#, *, _, backticks, >) and do NOT wrap the output in code fences.
144
+ 5. {language_prompt}
145
+
146
+ Input Text:
147
+ {masked_text}
148
+ """
149
+
150
+ if mode == RefinementMode.HANDWRITING_LAYOUT:
151
+ system = _SYSTEM_HANDWRITING_LAYOUT
152
+ elif mode == RefinementMode.HANDWRITING_PROSE:
153
+ system = _SYSTEM_HANDWRITING_PROSE
154
+ else:
155
+ system = _SYSTEM_DEFAULT
156
+
157
+ return system, user
@@ -0,0 +1,114 @@
1
+ """LLM-agnostic OCR refinement, ported from lib/ocr/refine.ts::refineOcrText.
2
+
3
+ Works with any LangChain ``BaseChatModel``. The fidelity pipeline is preserved:
4
+ mask literals -> prompt -> invoke -> unmask -> enforce literals -> strip markdown
5
+ -> drift/hallucination rejection.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import logging
11
+ from typing import TYPE_CHECKING
12
+
13
+ from ..types import RefinementMode
14
+ from .drift import refine_hallucinated_length, refinement_drifted
15
+ from .formatting import strip_markdown_formatting
16
+ from .literal_preserve import (
17
+ enforce_original_literals,
18
+ mask_protected_literals,
19
+ unmask_protected_literals,
20
+ )
21
+ from .prompts import build_refinement_prompt, refine_temperature
22
+
23
+ if TYPE_CHECKING:
24
+ from langchain_core.language_models import BaseChatModel
25
+
26
+ logger = logging.getLogger("ocrcontext.refine")
27
+
28
+
29
class Refiner:
    """Post-OCR text refinement driven by an injected LangChain chat model."""

    def __init__(self, llm: "BaseChatModel", *, apply_temperature: bool = True) -> None:
        """Store the model and the temperature policy.

        Args:
            llm: Chat model used for every refinement call.
            apply_temperature: When true, the mode's recommended temperature is
                bound to the model call, with a plain call as fallback.
        """
        self._llm = llm
        # When True, bind the mode's recommended temperature to the model call.
        # Falls back gracefully for providers that reject the kwarg.
        self._apply_temperature = apply_temperature

    def refine(
        self,
        text: str,
        language: str = "auto",
        mode: RefinementMode = RefinementMode.CONSERVATIVE,
    ) -> str:
        """Refine OCR ``text`` and return the cleaned result.

        Pipeline: mask literals, build the prompt, call the model, unmask,
        re-impose original e-mail spellings, strip Markdown, then run the
        acceptance check for the mode. When the check rejects the output, the
        untouched input ``text`` is returned and a warning is logged.
        """
        masked = mask_protected_literals(text)
        system, user = build_refinement_prompt(masked.masked_text, language, mode)

        reply = self._invoke(system, user, refine_temperature(mode))
        if not reply:
            # An empty reply means "no change": continue with the masked input.
            reply = masked.masked_text

        restored = enforce_original_literals(
            text, unmask_protected_literals(reply, masked.literals)
        )
        cleaned = strip_markdown_formatting(
            restored,
            convert_bullets=mode in (RefinementMode.HANDWRITING_LAYOUT, RefinementMode.LAYOUT),
        )

        is_handwriting = mode in (
            RefinementMode.HANDWRITING_PROSE,
            RefinementMode.HANDWRITING_LAYOUT,
        )
        if not is_handwriting:
            if not refinement_drifted(text, cleaned):
                return cleaned
            logger.warning(
                "Output drifted too far from source; keeping original OCR text "
                "(mode=%s, original_lines=%d, refined_lines=%d)",
                mode.value,
                len(text.split("\n")),
                len(cleaned.split("\n")),
            )
            return text

        # Handwritten notes/prose: trust word + layout fixes; reject only wholesale
        # hallucination (size bears little resemblance to source).
        if not cleaned.strip():
            return text
        if not refine_hallucinated_length(text, cleaned):
            return cleaned
        logger.warning(
            "Handwriting output length diverged too far; keeping original OCR text "
            "(mode=%s)",
            mode.value,
        )
        return text

    def _invoke(self, system: str, user: str, temperature: float) -> str:
        """Send the two-message conversation to the model and return its text.

        With ``apply_temperature`` enabled, the call is first attempted on a
        temperature-bound copy of the model; any exception from that attempt is
        logged at debug level and the call is repeated on the plain model.
        """
        from langchain_core.messages import HumanMessage, SystemMessage

        conversation = [SystemMessage(content=system), HumanMessage(content=user)]

        if not self._apply_temperature:
            return _message_text(self._llm.invoke(conversation))

        try:
            tempered = self._llm.bind(temperature=temperature)
            return _message_text(tempered.invoke(conversation))
        except Exception:
            # Provider may not accept a temperature kwarg — fall back to plain invoke.
            logger.debug("temperature bind failed; retrying without it", exc_info=True)

        return _message_text(self._llm.invoke(conversation))
99
+
100
+
101
def _message_text(response) -> str:
    """Reduce a model response to a plain string.

    The ``content`` attribute is used when present, otherwise ``response``
    itself. A string is returned as is. A list is treated as content blocks:
    string blocks and the ``"text"`` value of dict blocks are concatenated,
    anything else in the list is skipped. Any other value goes through ``str``.
    """
    payload = getattr(response, "content", response)
    if isinstance(payload, str):
        return payload
    if not isinstance(payload, list):
        return str(payload)

    # Some providers return a list of content blocks.
    def _fragments():
        for block in payload:
            if isinstance(block, str):
                yield block
            elif isinstance(block, dict) and "text" in block:
                yield str(block["text"])

    return "".join(_fragments())
@@ -0,0 +1,99 @@
1
+ """Built-in extraction schemas.
2
+
3
+ The Invoice schema + extraction prompt are ported from
4
+ app/api/invoices/process/route.ts, including the quantity back-fill rule.
5
+ These double as ready-to-use schemas and as worked examples for users defining
6
+ their own Pydantic schemas.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from typing import Optional
12
+
13
+ from pydantic import BaseModel, Field, model_validator
14
+
15
+
16
class LineItem(BaseModel):
    # One invoice row. Every field is optional and defaults to None. The
    # `description` strings are part of the schema handed to the model, so they
    # are kept word for word. (Plain comments are used here rather than a class
    # docstring so nothing extra is added to that schema.)
    description: Optional[str] = Field(default=None, description="Product/Service name.")
    quantity: Optional[float] = Field(
        default=None,
        description=(
            "Numeric quantity. If missing, calculate it as total / unit_price. "
            "Default 1 only if neither is available."
        ),
    )
    unit: Optional[str] = Field(default=None, description="Unit type (Adet, Kg, Saat, etc.).")
    unit_price: Optional[float] = Field(default=None, description="Price per unit.")
    # Kept as text: the value may be a bare percentage or a pattern.
    tax_rate: Optional[str] = Field(
        default=None,
        description="Tax percentage (e.g., 20, 10, 0) or pattern.",
    )
    total: Optional[float] = Field(default=None, description="Total price for this line.")
31
+
32
+
33
class Invoice(BaseModel):
    # Header-level invoice fields plus the list of rows. The `description`
    # strings are part of the schema handed to the model and are kept word for
    # word. (Plain comments are used here rather than a class docstring so
    # nothing extra is added to that schema.)
    supplier_name: Optional[str] = Field(
        default=None, description="Name of the vendor/supplier."
    )
    invoice_date: Optional[str] = Field(default=None, description="Format YYYY-MM-DD.")
    invoice_number: Optional[str] = Field(
        default=None, description="The invoice ID/number."
    )
    tax_id: Optional[str] = Field(default=None, description="Tax ID / VKN / TCKN.")
    tax_rate: Optional[str] = Field(
        default=None,
        description="e.g. 'KDV %20' when KDV is 20%.",
    )
    currency: Optional[str] = Field(
        default=None, description="Currency code (TRY, USD, EUR, etc.)."
    )
    total_amount: Optional[float] = Field(
        default=None, description="Final total amount (numeric)."
    )
    line_items: list[LineItem] = Field(
        default_factory=list,
        description="Array of items/services.",
    )

    @model_validator(mode="after")
    def _backfill_line_item_quantities(self) -> "Invoice":
        """Recompute a missing or placeholder quantity as ``total / unit_price``.

        A row is touched only when both ``unit_price`` and ``total`` are set,
        ``unit_price`` is positive, the quantity is ``None`` or ``1``, and the
        computed quantity differs from 1 by more than 0.01. The new quantity is
        rounded to two decimals.
        """
        for item in self.line_items:
            if item.unit_price is None or item.total is None:
                continue
            try:
                price = float(item.unit_price)
                line_total = float(item.total)
            except (TypeError, ValueError):
                continue
            # Written as "not >" so a NaN price is skipped as well.
            if not price > 0:
                continue

            derived_quantity = line_total / price
            quantity_is_placeholder = item.quantity is None or item.quantity == 1
            if quantity_is_placeholder and abs(derived_quantity - 1) > 0.01:
                item.quantity = round(derived_quantity, 2)
        return self
64
+
65
+
66
+ # Verbatim system prompt from app/api/invoices/process/route.ts.
67
+ INVOICE_EXTRACTION_PROMPT = """You are an expert invoice data extraction assistant.
68
+
69
+ CRITICAL RULES:
70
+ 1. **LANGUAGE REPAIR**:
71
+ - The text may come from OCR and may have missing characters.
72
+ - If language is 'tr' (Turkish), intelligently fix missing Turkish characters.
73
+
74
+ 2. **NUMBER PARSING**:
75
+ - Be extremely careful with comma (,) and dot (.).
76
+ - In Turkish/European invoices, '1.200,50' means One Thousand Two Hundred and 50 cents.
77
+ - NEVER confuse a quantity (e.g., 500) with a price (e.g. 5,00).
78
+
79
+ 3. **CURRENCY DETECTION**:
80
+ - Look for symbols: ₺, TL, TRY, USD, $, EUR, €.
81
+ - Prioritize 'TRY' / 'TL' unless explicitly stated otherwise.
82
+
83
+ Extract the following fields if it exists:
84
+ - 'supplier_name': Name of the vendor/supplier.
85
+ - 'invoice_date': Format YYYY-MM-DD.
86
+ - 'invoice_number': The invoice ID/number.
87
+ - 'tax_id': Tax ID / VKN / TCKN.
88
+ - 'tax_rate': It can be like 'KDV' and for example if it is 'KDV' and it is %20, write it as 'KDV %20' in excel.
89
+ - 'currency': Currency code (TRY, USD, EUR, etc.).
90
+ - 'total_amount': Final total amount (numeric).
91
+ - 'line_items': An array of items/services. Each item should have:
92
+ - 'description': Product/Service name.
93
+ - 'quantity': Numeric quantity. If missing, calculate it as total / unit_price. Default 1 only if neither is available.
94
+ - 'unit': Unit type (Adet, Kg, Saat, etc.).
95
+ - 'unit_price': Price per unit.
96
+ - 'tax_rate': Tax percentage (e.g., 20, 10, 0) or pattern.
97
+ - 'total': Total price for this line.
98
+
99
+ Return ONLY a valid JSON object. If a field is not found, use null."""