ocrcontext 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ocrcontext/__init__.py +49 -0
- ocrcontext/analyzer.py +198 -0
- ocrcontext/config.py +49 -0
- ocrcontext/engines/__init__.py +6 -0
- ocrcontext/engines/base.py +45 -0
- ocrcontext/engines/handwriting.py +103 -0
- ocrcontext/engines/paddle.py +264 -0
- ocrcontext/engines/pdf_text.py +126 -0
- ocrcontext/engines/registry.py +67 -0
- ocrcontext/engines/trocr.py +191 -0
- ocrcontext/engines/vision.py +538 -0
- ocrcontext/exceptions.py +45 -0
- ocrcontext/llm/__init__.py +10 -0
- ocrcontext/llm/drift.py +58 -0
- ocrcontext/llm/extractor.py +63 -0
- ocrcontext/llm/formatting.py +39 -0
- ocrcontext/llm/literal_preserve.py +164 -0
- ocrcontext/llm/prompts.py +157 -0
- ocrcontext/llm/refiner.py +114 -0
- ocrcontext/llm/schemas.py +99 -0
- ocrcontext/pipeline.py +162 -0
- ocrcontext/preprocessing/__init__.py +5 -0
- ocrcontext/preprocessing/image.py +177 -0
- ocrcontext/py.typed +0 -0
- ocrcontext/quality.py +76 -0
- ocrcontext/schemas.py +8 -0
- ocrcontext/types.py +55 -0
- ocrcontext/utils/__init__.py +1 -0
- ocrcontext/utils/files.py +172 -0
- ocrcontext/utils/lang.py +77 -0
- ocrcontext-0.1.0.dist-info/METADATA +207 -0
- ocrcontext-0.1.0.dist-info/RECORD +34 -0
- ocrcontext-0.1.0.dist-info/WHEEL +4 -0
- ocrcontext-0.1.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""LLM-agnostic structured extraction via LangChain's ``with_structured_output``.
|
|
2
|
+
|
|
3
|
+
Give it any Pydantic schema and a chat model; get a populated model instance
|
|
4
|
+
back. The Invoice schema in :mod:`ocrcontext.llm.schemas` is auto-detected so it
|
|
5
|
+
uses the verbatim invoice prompt, but any schema works.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from typing import TYPE_CHECKING, TypeVar
|
|
11
|
+
|
|
12
|
+
from pydantic import BaseModel
|
|
13
|
+
|
|
14
|
+
from .schemas import INVOICE_EXTRACTION_PROMPT, Invoice
|
|
15
|
+
|
|
16
|
+
if TYPE_CHECKING:
|
|
17
|
+
from langchain_core.language_models import BaseChatModel
|
|
18
|
+
|
|
19
|
+
# Type variable for the schema class given to ``StructuredExtractor.extract``;
# bound to ``BaseModel`` so the annotated return type is the schema passed in.
TSchema = TypeVar("TSchema", bound=BaseModel)
|
|
20
|
+
|
|
21
|
+
_GENERIC_PROMPT = (
|
|
22
|
+
"You are an expert document data extraction assistant. The text may come from "
|
|
23
|
+
"OCR and may contain scanning errors and missing characters. Extract the "
|
|
24
|
+
"requested fields faithfully from the document text. Do not invent values: if a "
|
|
25
|
+
"field is not present in the text, leave it null/empty. Preserve the document's "
|
|
26
|
+
"original language for textual fields and do not translate."
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class StructuredExtractor:
    """Fill a Pydantic schema from document text through an injected chat model.

    The class only assembles the system/user messages; the model itself is
    wrapped with ``with_structured_output(schema)`` on every call.
    """

    def __init__(self, llm: "BaseChatModel") -> None:
        # The model is kept as given; nothing is bound or configured here.
        self._llm = llm

    def extract(
        self,
        text: str,
        schema: type[TSchema],
        *,
        language: str = "auto",
        system_prompt: str | None = None,
    ) -> TSchema:
        """Run structured extraction of ``schema`` over ``text``.

        Args:
            text: Document text, placed verbatim at the end of the user message.
            schema: Pydantic model class handed to ``with_structured_output``.
            language: Value written after ``Language Context:`` in the user message.
            system_prompt: Replaces the default system prompt when non-empty;
                ``None`` and ``""`` both select the default for ``schema``.

        Returns:
            Whatever the structured-output runnable returns for the two messages.
        """
        from langchain_core.messages import HumanMessage, SystemMessage

        if system_prompt:
            instructions = system_prompt
        else:
            instructions = self._default_prompt(schema)

        request = (
            f"Language Context: {language}\n"
            f"Extract detailed data from this document text:\n\n{text}"
        )

        runnable = self._llm.with_structured_output(schema)
        # with_structured_output returns an instance of `schema`.
        return runnable.invoke(  # type: ignore[return-value]
            [SystemMessage(content=instructions), HumanMessage(content=request)]
        )

    @staticmethod
    def _default_prompt(schema: type[BaseModel]) -> str:
        """Pick the system prompt: the invoice prompt for exactly ``Invoice``, else generic."""
        # Identity check: subclasses of Invoice fall through to the generic prompt.
        return INVOICE_EXTRACTION_PROMPT if schema is Invoice else _GENERIC_PROMPT
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""Plain-text formatting, ported from lib/ocr/plain-text-format.ts.
|
|
2
|
+
|
|
3
|
+
Strip Markdown syntax so the model's structured output stays clean plain text
|
|
4
|
+
while keeping layout (blank lines, headings, bullets).
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import re
|
|
10
|
+
|
|
11
|
+
# Line-anchored patterns: each is applied to one line at a time.
_HEADING_RE = re.compile(r"^\s{0,3}#{1,6}\s+")
_BLOCKQUOTE_RE = re.compile(r"^(\s{0,3})>\s?")
_BULLET_RE = re.compile(r"^(\s*)[-*+]\s+")
# Whole-text patterns: applied once the lines have been joined again.
_CODE_FENCE_RE = re.compile(r"```[^\n`]*\n?")
_BOLD_RE = re.compile(r"\*\*([^*\n]+)\*\*")
_UNDERSCORE_BOLD_RE = re.compile(r"__([^_\n]+)__")
_INLINE_CODE_RE = re.compile(r"`([^`\n]+)`")
_TRAILING_WS_RE = re.compile(r"[ \t]+$", re.MULTILINE)
_EXTRA_BLANKS_RE = re.compile(r"\n{3,}")


def strip_markdown_formatting(text: str, *, convert_bullets: bool = False) -> str:
    """Return ``text`` with Markdown markers removed and its layout kept.

    Per line, heading hashes and one leading ``>`` are dropped; with
    ``convert_bullets`` a leading ``-``/``*``/``+`` marker becomes ``"• "``.
    Then, over the whole text: code-fence markers are removed, ``**bold**``,
    ``__bold__`` and inline-code spans are unwrapped, trailing spaces/tabs are
    trimmed, runs of three or more newlines collapse to two, and the result
    is stripped.
    """

    def _plain_line(raw: str) -> str:
        # Heading first, then blockquote, then (optionally) the bullet marker.
        unquoted = _BLOCKQUOTE_RE.sub(r"\1", _HEADING_RE.sub("", raw))
        if not convert_bullets:
            return unquoted
        return _BULLET_RE.sub(r"\1• ", unquoted)

    plain = "\n".join(map(_plain_line, text.split("\n")))

    # Order matters: fences go before inline code so ``` is not read as a span.
    whole_text_passes = (
        (_CODE_FENCE_RE, ""),
        (_BOLD_RE, r"\1"),
        (_UNDERSCORE_BOLD_RE, r"\1"),
        (_INLINE_CODE_RE, r"\1"),
        (_TRAILING_WS_RE, ""),
        (_EXTRA_BLANKS_RE, "\n\n"),
    )
    for pattern, replacement in whole_text_passes:
        plain = pattern.sub(replacement, plain)

    return plain.strip()
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
"""Literal / contact-data preservation, ported from lib/ocr/literal-preserve.ts.
|
|
2
|
+
|
|
3
|
+
Emails, URLs, IBANs and card numbers are masked to ``{{OCRLITn}}`` placeholders
|
|
4
|
+
before the LLM sees the text and restored verbatim afterwards, so the model
|
|
5
|
+
cannot "fix" identifiers (e.g. bahadrkrsl@... -> bahadirkarsli@...).
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import re
|
|
11
|
+
from dataclasses import dataclass
|
|
12
|
+
|
|
13
|
+
# Placeholders injected before LLM refine; restored verbatim after.
|
|
14
|
+
def _token_for(index: int) -> str:
|
|
15
|
+
return f"{{{{OCRLIT{index}}}}}"
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
EMAIL_PATTERN = re.compile(
|
|
19
|
+
r"[a-zA-Z0-9](?:[a-zA-Z0-9._%+-]*[a-zA-Z0-9])?"
|
|
20
|
+
r"@[a-zA-Z0-9](?:[a-zA-Z0-9.-]*[a-zA-Z0-9])?\.[a-zA-Z]{2,}"
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
_LITERAL_PATTERNS: list[re.Pattern[str]] = [
|
|
24
|
+
EMAIL_PATTERN,
|
|
25
|
+
re.compile(r"https?://[^\s<>\"'\])}+]+", re.IGNORECASE),
|
|
26
|
+
re.compile(r"www\.[a-zA-Z0-9][a-zA-Z0-9.-]*\.[a-zA-Z]{2,}[^\s<>\"']*", re.IGNORECASE),
|
|
27
|
+
re.compile(r"\b[A-Z]{2}\d{2}[A-Z0-9]{11,30}\b"),
|
|
28
|
+
re.compile(r"\b\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b"),
|
|
29
|
+
]
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass
class MaskResult:
    """Outcome of masking: the placeholder-bearing text plus the removed literals."""

    # Text in which each protected literal has been replaced by a placeholder token.
    masked_text: str
    # Removed literals; the position in this list is the number in the token.
    literals: list[str]
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
# "local <newline> @ domain": the "@" has slipped onto the following line.
_AT_AFTER_NEWLINE_RE = re.compile(
    r"([a-zA-Z0-9._%+-]+)\s*\n\s*@\s*([a-zA-Z0-9][a-zA-Z0-9.-]*)"
)
# "local @ domain": whitespace on both sides of the "@".
_AT_SPACED_RE = re.compile(r"([a-zA-Z0-9._%+-]+)\s+@\s+([a-zA-Z0-9][a-zA-Z0-9.-]*)")


def preprocess_literal_text(text: str) -> str:
    """Rejoin addresses that OCR split with whitespace or a newline around ``@``.

    Both passes rewrite ``<local> @ <domain>`` to ``<local>@<domain>``; the
    newline form is handled first, then the form with whitespace on both sides.
    """
    rejoined = _AT_AFTER_NEWLINE_RE.sub(r"\1@\2", text)
    return _AT_SPACED_RE.sub(r"\1@\2", rejoined)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def extract_emails(text: str) -> list[str]:
    """List every address found in ``text`` after split addresses are rejoined.

    Matches are returned in order of appearance; duplicates are kept.
    """
    rejoined = preprocess_literal_text(text)
    # EMAIL_PATTERN has no capturing groups, so findall yields whole matches.
    return EMAIL_PATTERN.findall(rejoined)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _levenshtein(a: str, b: str) -> int:
|
|
55
|
+
m, n = len(a), len(b)
|
|
56
|
+
dp = [[0] * (n + 1) for _ in range(m + 1)]
|
|
57
|
+
for i in range(m + 1):
|
|
58
|
+
dp[i][0] = i
|
|
59
|
+
for j in range(n + 1):
|
|
60
|
+
dp[0][j] = j
|
|
61
|
+
for i in range(1, m + 1):
|
|
62
|
+
for j in range(1, n + 1):
|
|
63
|
+
cost = 0 if a[i - 1] == b[j - 1] else 1
|
|
64
|
+
dp[i][j] = min(dp[i - 1][j] + 1, dp[i][j - 1] + 1, dp[i - 1][j - 1] + cost)
|
|
65
|
+
return dp[m][n]
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _is_likely_same_email(candidate: str, original: str) -> bool:
|
|
69
|
+
c = candidate.lower().split("@")
|
|
70
|
+
o = original.lower().split("@")
|
|
71
|
+
if len(c) != 2 or len(o) != 2:
|
|
72
|
+
return False
|
|
73
|
+
c_local, c_domain = c
|
|
74
|
+
o_local, o_domain = o
|
|
75
|
+
if not c_local or not o_local or c_domain != o_domain:
|
|
76
|
+
return False
|
|
77
|
+
if candidate == original:
|
|
78
|
+
return True
|
|
79
|
+
max_dist = max(2, int(len(o_local) * 0.35))
|
|
80
|
+
return _levenshtein(c_local, o_local) <= max_dist
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def enforce_original_literals(original_text: str, refined_text: str) -> str:
    """Force the OCR/original email spelling back if the model rewrote it.

    For every address found in ``original_text``, each address on the same
    domain in ``refined_text`` is compared with it and, when it looks like a
    rewritten form of that original, replaced by the original spelling.

    Two guards keep distinct addresses from being merged:

    * an address in the refined text that is itself one of the original
      addresses is never rewritten (two similar originals on one domain, such
      as ``anna@x.com`` and ``anne@x.com``, would otherwise collapse into one);
    * the domain must end where the match ends, so an original on ``acme.co``
      does not rewrite the front of an address on ``acme.com``.

    Args:
        original_text: OCR text the addresses are taken from.
        refined_text: Model output to be corrected.

    Returns:
        ``refined_text`` with rewritten addresses restored; returned unchanged
        when ``original_text`` contains no address.
    """
    originals = extract_emails(original_text)
    if not originals:
        return refined_text

    known = frozenset(originals)
    output = refined_text
    # dict.fromkeys drops repeated addresses while keeping first-seen order.
    for orig in dict.fromkeys(originals):
        parts = orig.split("@")
        if len(parts) != 2:
            continue
        domain = parts[1]
        if not domain:
            continue
        domain_re = re.compile(
            r"[a-zA-Z0-9](?:[a-zA-Z0-9._%+-]*[a-zA-Z0-9])?@"
            + re.escape(domain)
            # The domain must not continue: no label character next, and no
            # dot followed by a further label.
            + r"(?![a-zA-Z0-9-]|\.[a-zA-Z0-9])",
            re.IGNORECASE,
        )

        # ``orig`` is bound as a default so the callback does not depend on
        # the loop variable at call time.
        def _replace(match: re.Match[str], _orig: str = orig) -> str:
            found = match.group(0)
            if found in known:
                # Present verbatim in the source text (includes found == _orig).
                return found
            return _orig if _is_likely_same_email(found, _orig) else found

        output = domain_re.sub(_replace, output)

    return output
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def _collect_non_overlapping_spans(text: str) -> list[tuple[int, int, str]]:
    """Return ``(start, end, matched_text)`` for protected literals, without overlaps.

    Matches from every pattern in ``_LITERAL_PATTERNS`` are pooled and ordered
    by start offset. A span is kept only if it begins at or after the end of
    the previously kept span.
    """
    candidates = [
        (found.start(), found.end(), found.group(0))
        for pattern in _LITERAL_PATTERNS
        for found in pattern.finditer(text)
    ]

    # start asc; at equal start, the longer span wins (so it is kept, shorter dropped).
    candidates.sort(key=lambda span: (span[0], -span[1]))

    kept: list[tuple[int, int, str]] = []
    for candidate in candidates:
        if kept and candidate[0] < kept[-1][1]:
            # Starts inside the span kept last: drop it.
            continue
        kept.append(candidate)
    return kept
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def mask_protected_literals(text: str) -> MaskResult:
    """Replace every protected literal in ``text`` with a numbered placeholder.

    The text is first passed through ``preprocess_literal_text``; spans and
    the returned ``masked_text`` refer to that preprocessed text. Literals
    are numbered in order of appearance, matching their index in
    ``MaskResult.literals``.
    """
    source = preprocess_literal_text(text)

    pieces: list[str] = []
    frozen: list[str] = []
    position = 0
    for begin, finish, literal in _collect_non_overlapping_spans(source):
        pieces.append(source[position:begin])
        # The token number is the index this literal is about to get.
        pieces.append(_token_for(len(frozen)))
        frozen.append(literal)
        position = finish
    pieces.append(source[position:])

    return MaskResult(masked_text="".join(pieces), literals=frozen)
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
# Exact ``{{OCRLITn}}`` tokens and the variants a model tends to emit: other
# letter case and whitespace inside the braces or around the number.
_PLACEHOLDER_RE = re.compile(r"\{\{\s*OCRLIT\s*([0-9]+)\s*\}\}", re.IGNORECASE)


def unmask_protected_literals(text: str, literals: list[str]) -> str:
    """Put the original literals back in place of their placeholders.

    All placeholders are restored in one pass over ``text``:

    * each literal is inserted verbatim, so backslashes in a literal (for
      example in a URL) are not read as ``re`` replacement escapes;
    * restored text is never scanned again, so a literal that happens to
      contain placeholder-like text is left as it is;
    * a loosely formatted placeholder is restored even when the exact token
      for the same literal also occurs in ``text``.

    Args:
        text: Model output containing placeholders.
        literals: Literals in masking order; index ``n`` belongs to ``{{OCRLITn}}``.

    Returns:
        ``text`` with every recognised placeholder replaced. Placeholders
        whose number has no literal, or is written with leading zeros, are
        left untouched.
    """

    def _restore(match: re.Match[str]) -> str:
        digits = match.group(1)
        index = int(digits)
        # str(index) != digits rejects forms such as "01": masking never emits them.
        if str(index) != digits or index >= len(literals):
            return match.group(0)
        return literals[index]

    return _PLACEHOLDER_RE.sub(_restore, text)
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
# Prompt section telling the model to leave {{OCRLITn}} placeholders alone.
# This is a plain (non-f) string: the doubled braces are literal text.
LITERAL_PRESERVE_PROMPT = """
LITERAL / CONTACT DATA (CRITICAL):
- Tokens like {{OCRLIT0}}, {{OCRLIT1}}, ... are frozen placeholders for emails, URLs, IBANs, and similar identifiers.
- Copy every {{OCRLITn}} token EXACTLY — same spelling, same characters, same position in the sentence.
- NEVER "fix", complete, or guess emails/usernames (e.g. do NOT change bahadrkrsl@outlook.com to bahadirkarsli@outlook.com).
- NEVER invent @ symbols or domains. If a placeholder is present, output it unchanged.
- Apply OCR fixes only to normal words around these placeholders, not to the placeholders themselves.
"""
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
"""Refinement prompts, ported VERBATIM from lib/ocr/refine.ts.
|
|
2
|
+
|
|
3
|
+
These prompts were heavily tuned for fidelity; do not paraphrase them. Model
|
|
4
|
+
selection (gpt-4.1 vs gpt-4o) is intentionally dropped — the chat model is
|
|
5
|
+
injected by the caller. Only the per-mode temperature recommendation is kept.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from ..types import RefinementMode
|
|
11
|
+
from ..utils.lang import language_full_name
|
|
12
|
+
from .literal_preserve import LITERAL_PRESERVE_PROMPT
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def refine_temperature(mode: RefinementMode) -> float:
    """Return the recommended sampling temperature for ``mode``.

    The conservative mode and both handwriting modes get ``0.0``; every other
    mode gets ``0.1``.
    """
    deterministic_modes = (
        RefinementMode.CONSERVATIVE,
        RefinementMode.HANDWRITING_PROSE,
        RefinementMode.HANDWRITING_LAYOUT,
    )
    return 0.0 if mode in deterministic_modes else 0.1
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
_LAYOUT = """
|
|
27
|
+
LAYOUT MODE (for digital PDFs):
|
|
28
|
+
- Reconstruct clean document structure in plain text.
|
|
29
|
+
- Keep clear section separation with blank lines between paragraphs/sections.
|
|
30
|
+
- Preserve list structure as plain text list items (no markdown markers unless present in source).
|
|
31
|
+
- Preserve the original reading order and do not merge unrelated sections.
|
|
32
|
+
"""
|
|
33
|
+
|
|
34
|
+
_HANDWRITING_LAYOUT = """
|
|
35
|
+
HANDWRITING LAYOUT MODE (for handwritten notes, lists, tables and diagrams of any topic):
|
|
36
|
+
- FIDELITY FIRST: stay faithful to what is actually written. Your job is to fix OCR errors, NOT to
|
|
37
|
+
improve the text. Do NOT paraphrase, summarize, complete unfinished sentences, smooth awkward
|
|
38
|
+
phrasing, or add connecting words. A slightly awkward but faithful transcription is the goal.
|
|
39
|
+
- Fix only clear OCR errors: missing diacritics, visually confused letters, and words the OCR split
|
|
40
|
+
or joined. If a word is plausible as written, keep it exactly.
|
|
41
|
+
- If a word or phrase is illegible, give your closest LITERAL character reading. NEVER replace it
|
|
42
|
+
with a fluent but made-up sentence, and never add facts that are not in the source. It is better
|
|
43
|
+
to leave a rough/partial phrase than to invent a clean one.
|
|
44
|
+
- Remove lines that are ONLY margin ruler numbers (e.g. lone "23", "2213").
|
|
45
|
+
- PRESERVE all of the source's content: keep every heading, label, list item, definition,
|
|
46
|
+
and arrow ("→") line. Never drop a line. If you are unsure about a line, keep it.
|
|
47
|
+
- PRESERVE THE ORIGINAL ORDER: keep lines and sentences in the exact order they appear in the source.
|
|
48
|
+
Do NOT reorder, move, or merge sentences to make the text flow better. If the source order looks
|
|
49
|
+
odd, leave it odd — do not "fix" it.
|
|
50
|
+
- Keep numbered lists and definition paragraphs as they are; fix only OCR errors in them.
|
|
51
|
+
- LAYOUT: only tidy spacing — keep the source's own line and section breaks, put each existing heading
|
|
52
|
+
on its own line, leave ONE blank line between existing sections, and use "• " for bullets that are
|
|
53
|
+
already bullets. Do NOT restructure content. Keep it PLAIN TEXT: no Markdown symbols (#, *, _,
|
|
54
|
+
backticks, >) and no code fences.
|
|
55
|
+
- Do NOT translate.
|
|
56
|
+
"""
|
|
57
|
+
|
|
58
|
+
_HANDWRITING_PROSE = """
|
|
59
|
+
HANDWRITING PROSE MODE (poems, paragraphs, letters, notes — no sensitive data):
|
|
60
|
+
- FIDELITY FIRST: fix OCR errors but stay faithful to what is written. Do NOT paraphrase, rewrite
|
|
61
|
+
style, complete unfinished sentences, or add new words/sentences/ideas.
|
|
62
|
+
- Fix OCR errors: missing diacritics, visually confused letters, and words the OCR split or joined
|
|
63
|
+
(e.g. "düşünmedin sen" → "düşünmediysen"; split a wrongly-joined word). If a word is plausible as
|
|
64
|
+
written, keep it exactly — do NOT swap it for a synonym or a "better" word.
|
|
65
|
+
- Fix line breaks the OCR got wrong and keep real verse/paragraph breaks. Remove duplicate words and
|
|
66
|
+
stray margin numbers caused by OCR.
|
|
67
|
+
- Keep lines and sentences in their original order; do NOT reorder or move content.
|
|
68
|
+
- If a word is illegible, give your closest LITERAL reading; never invent a fluent replacement.
|
|
69
|
+
- Keep signatures, author names, and titles; only fix obvious typos in them.
|
|
70
|
+
- Do NOT translate. Keep the original language.
|
|
71
|
+
"""
|
|
72
|
+
|
|
73
|
+
_CONSERVATIVE = """
|
|
74
|
+
CONSERVATIVE MODE (for OCR images/scans):
|
|
75
|
+
- Perform minimal, character-level OCR correction only.
|
|
76
|
+
- Do NOT replace a valid-looking word with a different semantic word.
|
|
77
|
+
- If uncertain, keep the original token exactly as-is.
|
|
78
|
+
- Do NOT infer missing entities (names, places, brands, email local-parts) from context.
|
|
79
|
+
- Preserve line order and keep output close to source line-by-line.
|
|
80
|
+
"""
|
|
81
|
+
|
|
82
|
+
# Instruction section per refinement mode. build_refinement_prompt indexes
# this mapping directly, so a mode missing here raises KeyError there.
_MODE_INSTRUCTIONS = {
    RefinementMode.LAYOUT: _LAYOUT,
    RefinementMode.HANDWRITING_LAYOUT: _HANDWRITING_LAYOUT,
    RefinementMode.HANDWRITING_PROSE: _HANDWRITING_PROSE,
    RefinementMode.CONSERVATIVE: _CONSERVATIVE,
}
|
|
88
|
+
|
|
89
|
+
_SYSTEM_HANDWRITING_LAYOUT = (
|
|
90
|
+
"You are a world-class OCR post-processor for handwritten notes. "
|
|
91
|
+
"Fix OCR errors and tidy the layout, but stay faithful to what is written: "
|
|
92
|
+
"never paraphrase, complete, or invent text you cannot read. "
|
|
93
|
+
"Never alter frozen {{OCRLITn}} placeholders. Output plain text only."
|
|
94
|
+
)
|
|
95
|
+
_SYSTEM_HANDWRITING_PROSE = (
|
|
96
|
+
"You are a world-class OCR post-processor for handwritten prose and poetry. "
|
|
97
|
+
"Fix misread words and broken line breaks, but stay faithful: never paraphrase, "
|
|
98
|
+
"complete, or invent content, and never translate. "
|
|
99
|
+
"Never alter frozen {{OCRLITn}} placeholders. Output plain text only."
|
|
100
|
+
)
|
|
101
|
+
_SYSTEM_DEFAULT = (
|
|
102
|
+
"You are a world-class OCR post-processor. Fix OCR noise in normal prose only. "
|
|
103
|
+
"Never alter frozen {{OCRLITn}} placeholders (emails, URLs, banking IDs). "
|
|
104
|
+
"Never guess or complete email addresses or usernames. Output plain text only."
|
|
105
|
+
)
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def build_refinement_prompt(
    masked_text: str, language: str, mode: RefinementMode
) -> tuple[str, str]:
    """Build the ``(system, user)`` prompt pair for one refinement call.

    Args:
        masked_text: OCR text with protected literals already replaced by
            placeholders; placed at the end of the user prompt.
        language: Language identifier passed to ``language_full_name``. An
            empty value, or a resolved name of ``"auto"``, selects the generic
            "preserve the original language" instruction.
        mode: Chooses the instruction section and the system message.

    Returns:
        ``(system, user)`` prompt strings.

    Raises:
        KeyError: If ``mode`` has no entry in ``_MODE_INSTRUCTIONS``.
    """
    language_name = language_full_name(language) if language else None
    if not language_name or language_name == "auto":
        language_prompt = (
            "Preserve the original language of the text. Do not translate or change the language."
        )
    else:
        language_prompt = (
            f"The text is in {language_name}. Preserve the original language and only fix "
            f"OCR errors using {language_name} spelling rules."
        )

    mode_instructions = _MODE_INSTRUCTIONS[mode]

    # The template text starts at column 0 on purpose: it is sent as written.
    user = f"""
You are an expert OCR post-processing AI. Your ONLY task is to reconstruct the original text from OCR output that contains scanning errors.
Never add new information and never remove existing information.
{mode_instructions}
{LITERAL_PRESERVE_PROMPT}

UNDERSTANDING OCR ERRORS:
OCR engines make SYSTEMATIC errors — they don't understand language, they only recognize shapes.

A) DIACRITIC STRIPPING (Turkish, French, German, Spanish, etc.)
B) VISUALLY SIMILAR CHARACTER CONFUSION (0↔O, rn→m, ...)
C) TRUNCATION & MISSING CHARACTERS
D) WORD BOUNDARY ERRORS

YOUR TASK:
1. Fix OCR errors in regular words using sentence context.
2. DO NOT translate. DO NOT change the language. DO NOT add or remove content.
3. DO NOT add commentary. Output ONLY the corrected plain text.
4. Output PLAIN TEXT only. Do NOT use Markdown syntax (#, *, _, backticks, >) and do NOT wrap the output in code fences.
5. {language_prompt}

Input Text:
{masked_text}
"""

    # Only the two handwriting modes have a dedicated system message.
    handwriting_systems = {
        RefinementMode.HANDWRITING_LAYOUT: _SYSTEM_HANDWRITING_LAYOUT,
        RefinementMode.HANDWRITING_PROSE: _SYSTEM_HANDWRITING_PROSE,
    }
    return handwriting_systems.get(mode, _SYSTEM_DEFAULT), user
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
"""LLM-agnostic OCR refinement, ported from lib/ocr/refine.ts::refineOcrText.
|
|
2
|
+
|
|
3
|
+
Works with any LangChain ``BaseChatModel``. The fidelity pipeline is preserved:
|
|
4
|
+
mask literals -> prompt -> invoke -> unmask -> enforce literals -> strip markdown
|
|
5
|
+
-> drift/hallucination rejection.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import logging
|
|
11
|
+
from typing import TYPE_CHECKING
|
|
12
|
+
|
|
13
|
+
from ..types import RefinementMode
|
|
14
|
+
from .drift import refine_hallucinated_length, refinement_drifted
|
|
15
|
+
from .formatting import strip_markdown_formatting
|
|
16
|
+
from .literal_preserve import (
|
|
17
|
+
enforce_original_literals,
|
|
18
|
+
mask_protected_literals,
|
|
19
|
+
unmask_protected_literals,
|
|
20
|
+
)
|
|
21
|
+
from .prompts import build_refinement_prompt, refine_temperature
|
|
22
|
+
|
|
23
|
+
if TYPE_CHECKING:
|
|
24
|
+
from langchain_core.language_models import BaseChatModel
|
|
25
|
+
|
|
26
|
+
# Module logger under the fixed name "ocrcontext.refine" (not ``__name__``).
logger = logging.getLogger("ocrcontext.refine")
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class Refiner:
    """Post-OCR refinement using an injected LangChain chat model."""

    def __init__(self, llm: "BaseChatModel", *, apply_temperature: bool = True) -> None:
        """Store the chat model and the temperature policy.

        Args:
            llm: Chat model used for every refinement call.
            apply_temperature: When True, bind the mode's recommended
                temperature to the model call, falling back to a plain call
                if that attempt raises.
        """
        self._llm = llm
        self._apply_temperature = apply_temperature

    def refine(
        self,
        text: str,
        language: str = "auto",
        mode: RefinementMode = RefinementMode.CONSERVATIVE,
    ) -> str:
        """Refine OCR ``text``; return ``text`` unchanged on drift or empty output.

        Steps: mask protected literals, build the prompt, call the model,
        restore the literals, force original email spellings back, strip
        Markdown, then apply the rejection check for the mode.

        Args:
            text: Raw OCR text.
            language: Language identifier forwarded to the prompt builder.
            mode: Refinement mode; selects prompt, temperature, bullet
                conversion and the rejection check.

        Returns:
            The cleaned text, or the original ``text`` when the input is
            blank, the cleaned output is empty, or the check rejects it.
        """
        # Blank input has nothing to correct; do not spend a model call on it.
        if not text.strip():
            return text

        mask = mask_protected_literals(text)
        system, user = build_refinement_prompt(mask.masked_text, language, mode)

        raw = self._invoke(system, user, refine_temperature(mode))
        # An empty reply falls back to the masked input so unmasking still runs.
        refined = raw or mask.masked_text

        unmasked = unmask_protected_literals(refined, mask.literals)
        literal_safe = enforce_original_literals(text, unmasked)

        convert_bullets = mode in (RefinementMode.HANDWRITING_LAYOUT, RefinementMode.LAYOUT)
        cleaned = strip_markdown_formatting(literal_safe, convert_bullets=convert_bullets)

        # Applies to every mode: a reply made only of whitespace or Markdown
        # markers must never replace real OCR text.
        if not cleaned.strip():
            logger.warning(
                "Refined output was empty; keeping original OCR text (mode=%s)",
                mode.value,
            )
            return text

        # Handwritten notes/prose: trust word + layout fixes; reject only wholesale
        # hallucination (size bears little resemblance to source).
        if mode in (RefinementMode.HANDWRITING_PROSE, RefinementMode.HANDWRITING_LAYOUT):
            if refine_hallucinated_length(text, cleaned):
                logger.warning(
                    "Handwriting output length diverged too far; keeping original OCR text "
                    "(mode=%s)",
                    mode.value,
                )
                return text
            return cleaned

        if refinement_drifted(text, cleaned):
            logger.warning(
                "Output drifted too far from source; keeping original OCR text "
                "(mode=%s, original_lines=%d, refined_lines=%d)",
                mode.value,
                len(text.split("\n")),
                len(cleaned.split("\n")),
            )
            return text

        return cleaned

    def _invoke(self, system: str, user: str, temperature: float) -> str:
        """Send the two messages to the model and return the reply text.

        With ``apply_temperature`` set, the call is first attempted with
        ``temperature`` bound. If that attempt raises, the call is made once
        more without it; an error from that second call propagates.
        """
        from langchain_core.messages import HumanMessage, SystemMessage

        messages = [SystemMessage(content=system), HumanMessage(content=user)]

        if self._apply_temperature:
            try:
                bound = self._llm.bind(temperature=temperature)
                response = bound.invoke(messages)
                return _message_text(response)
            except Exception:
                # Provider may not accept a temperature kwarg — fall back to plain invoke.
                # NOTE: any other failure of the first attempt (not only a
                # rejected kwarg) also lands here and is retried once.
                logger.debug("temperature bind failed; retrying without it", exc_info=True)

        response = self._llm.invoke(messages)
        return _message_text(response)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def _message_text(response) -> str:
|
|
102
|
+
content = getattr(response, "content", response)
|
|
103
|
+
if isinstance(content, str):
|
|
104
|
+
return content
|
|
105
|
+
# Some providers return a list of content blocks.
|
|
106
|
+
if isinstance(content, list):
|
|
107
|
+
parts = []
|
|
108
|
+
for block in content:
|
|
109
|
+
if isinstance(block, str):
|
|
110
|
+
parts.append(block)
|
|
111
|
+
elif isinstance(block, dict) and "text" in block:
|
|
112
|
+
parts.append(str(block["text"]))
|
|
113
|
+
return "".join(parts)
|
|
114
|
+
return str(content)
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
"""Built-in extraction schemas.
|
|
2
|
+
|
|
3
|
+
The Invoice schema + extraction prompt are ported from
|
|
4
|
+
app/api/invoices/process/route.ts, including the quantity back-fill rule.
|
|
5
|
+
These double as ready-to-use schemas and as worked examples for users defining
|
|
6
|
+
their own Pydantic schemas.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from typing import Optional
|
|
12
|
+
|
|
13
|
+
from pydantic import BaseModel, Field, model_validator
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# One invoice row. Every field defaults to None so a row can be partially
# filled. No class docstring on purpose: Pydantic would publish it as the
# model's schema description.
class LineItem(BaseModel):
    description: Optional[str] = Field(default=None, description="Product/Service name.")
    quantity: Optional[float] = Field(
        default=None,
        description=(
            "Numeric quantity. If missing, calculate it as total / unit_price. "
            "Default 1 only if neither is available."
        ),
    )
    unit: Optional[str] = Field(default=None, description="Unit type (Adet, Kg, Saat, etc.).")
    unit_price: Optional[float] = Field(default=None, description="Price per unit.")
    tax_rate: Optional[str] = Field(
        default=None,
        description="Tax percentage (e.g., 20, 10, 0) or pattern.",
    )
    total: Optional[float] = Field(default=None, description="Total price for this line.")
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
# Invoice header fields plus its rows. Scalar fields default to None and
# line_items to an empty list. No class docstring on purpose: Pydantic would
# publish it as the model's schema description.
class Invoice(BaseModel):
    supplier_name: Optional[str] = Field(
        default=None, description="Name of the vendor/supplier."
    )
    invoice_date: Optional[str] = Field(default=None, description="Format YYYY-MM-DD.")
    invoice_number: Optional[str] = Field(
        default=None, description="The invoice ID/number."
    )
    tax_id: Optional[str] = Field(default=None, description="Tax ID / VKN / TCKN.")
    tax_rate: Optional[str] = Field(
        default=None,
        description="e.g. 'KDV %20' when KDV is 20%.",
    )
    currency: Optional[str] = Field(
        default=None, description="Currency code (TRY, USD, EUR, etc.)."
    )
    total_amount: Optional[float] = Field(
        default=None, description="Final total amount (numeric)."
    )
    line_items: list[LineItem] = Field(
        default_factory=list,
        description="Array of items/services.",
    )

    @model_validator(mode="after")
    def _backfill_line_item_quantities(self) -> "Invoice":
        """Replace a missing or placeholder quantity with ``total / unit_price``.

        A row is touched only when it has both a unit price and a total, the
        unit price is positive, its quantity is ``None`` or ``1``, and the
        implied quantity differs from 1 by more than 0.01. The new quantity
        is rounded to two decimals.
        """
        for row in self.line_items:
            if row.unit_price is None or row.total is None:
                continue
            try:
                price = float(row.unit_price)
                line_total = float(row.total)
            except (TypeError, ValueError):
                continue
            # Written as "not >" so a NaN price is skipped as well.
            if not price > 0:
                continue

            implied_quantity = line_total / price
            current = row.quantity
            is_placeholder = current is None or current == 1
            if is_placeholder and abs(implied_quantity - 1) > 0.01:
                row.quantity = round(implied_quantity, 2)
        return self
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
# Verbatim system prompt from app/api/invoices/process/route.ts.
|
|
67
|
+
# System prompt selected by StructuredExtractor when the schema is Invoice.
# Kept verbatim: its wording (including the grammar) is sent to the model.
INVOICE_EXTRACTION_PROMPT = """You are an expert invoice data extraction assistant.

CRITICAL RULES:
1. **LANGUAGE REPAIR**:
- The text may come from OCR and may have missing characters.
- If language is 'tr' (Turkish), intelligently fix missing Turkish characters.

2. **NUMBER PARSING**:
- Be extremely careful with comma (,) and dot (.).
- In Turkish/European invoices, '1.200,50' means One Thousand Two Hundred and 50 cents.
- NEVER confuse a quantity (e.g., 500) with a price (e.g. 5,00).

3. **CURRENCY DETECTION**:
- Look for symbols: ₺, TL, TRY, USD, $, EUR, €.
- Prioritize 'TRY' / 'TL' unless explicitly stated otherwise.

Extract the following fields if it exists:
- 'supplier_name': Name of the vendor/supplier.
- 'invoice_date': Format YYYY-MM-DD.
- 'invoice_number': The invoice ID/number.
- 'tax_id': Tax ID / VKN / TCKN.
- 'tax_rate': It can be like 'KDV' and for example if it is 'KDV' and it is %20, write it as 'KDV %20' in excel.
- 'currency': Currency code (TRY, USD, EUR, etc.).
- 'total_amount': Final total amount (numeric).
- 'line_items': An array of items/services. Each item should have:
- 'description': Product/Service name.
- 'quantity': Numeric quantity. If missing, calculate it as total / unit_price. Default 1 only if neither is available.
- 'unit': Unit type (Adet, Kg, Saat, etc.).
- 'unit_price': Price per unit.
- 'tax_rate': Tax percentage (e.g., 20, 10, 0) or pattern.
- 'total': Total price for this line.

Return ONLY a valid JSON object. If a field is not found, use null."""
|