ocr-postprocess 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. ocr_postprocess/__init__.py +33 -0
  2. ocr_postprocess/classifier.py +63 -0
  3. ocr_postprocess/cli.py +130 -0
  4. ocr_postprocess/engine/__init__.py +0 -0
  5. ocr_postprocess/engine/denoiser.py +134 -0
  6. ocr_postprocess/engine/extractor_stage.py +107 -0
  7. ocr_postprocess/engine/normalizer.py +128 -0
  8. ocr_postprocess/engine/reconciler.py +170 -0
  9. ocr_postprocess/engine/reconstructor.py +469 -0
  10. ocr_postprocess/engine/transform_stage.py +89 -0
  11. ocr_postprocess/exceptions.py +30 -0
  12. ocr_postprocess/extractors/__init__.py +0 -0
  13. ocr_postprocess/extractors/base.py +103 -0
  14. ocr_postprocess/extractors/helpers.py +63 -0
  15. ocr_postprocess/extractors/label_anchor/__init__.py +0 -0
  16. ocr_postprocess/extractors/label_anchor/line_after_label.py +53 -0
  17. ocr_postprocess/extractors/label_anchor/regex_after_label.py +75 -0
  18. ocr_postprocess/extractors/label_anchor/text_until_next_label.py +79 -0
  19. ocr_postprocess/extractors/label_anchor/value_between_labels.py +65 -0
  20. ocr_postprocess/extractors/label_anchor/value_in_same_line.py +60 -0
  21. ocr_postprocess/extractors/pattern/__init__.py +0 -0
  22. ocr_postprocess/extractors/pattern/cccd.py +120 -0
  23. ocr_postprocess/extractors/pattern/cmnd.py +38 -0
  24. ocr_postprocess/extractors/pattern/currency_vnd.py +48 -0
  25. ocr_postprocess/extractors/pattern/date.py +89 -0
  26. ocr_postprocess/extractors/pattern/email.py +38 -0
  27. ocr_postprocess/extractors/pattern/gender_vn.py +48 -0
  28. ocr_postprocess/extractors/pattern/phone_vn.py +83 -0
  29. ocr_postprocess/extractors/pattern/plate_vn.py +39 -0
  30. ocr_postprocess/extractors/pattern/tax_code.py +53 -0
  31. ocr_postprocess/extractors/registry.py +45 -0
  32. ocr_postprocess/extractors/structured/__init__.py +0 -0
  33. ocr_postprocess/extractors/structured/mrz_cccd.py +111 -0
  34. ocr_postprocess/extractors/universal.py +39 -0
  35. ocr_postprocess/models.py +131 -0
  36. ocr_postprocess/pipeline.py +179 -0
  37. ocr_postprocess/profiles/__init__.py +0 -0
  38. ocr_postprocess/profiles/_generic.yml +13 -0
  39. ocr_postprocess/profiles/cccd_2024.yml +113 -0
  40. ocr_postprocess/profiles/dang_kiem.yml +105 -0
  41. ocr_postprocess/profiles/loader.py +63 -0
  42. ocr_postprocess/profiles/matcher.py +71 -0
  43. ocr_postprocess/profiles/schema.py +197 -0
  44. ocr_postprocess/py.typed +0 -0
  45. ocr_postprocess/renderer/__init__.py +0 -0
  46. ocr_postprocess/renderer/json_renderer.py +59 -0
  47. ocr_postprocess/renderer/llm.py +41 -0
  48. ocr_postprocess/renderer/markdown.py +172 -0
  49. ocr_postprocess/scorer.py +78 -0
  50. ocr_postprocess/transformer.py +304 -0
  51. ocr_postprocess-0.1.0.dist-info/METADATA +189 -0
  52. ocr_postprocess-0.1.0.dist-info/RECORD +55 -0
  53. ocr_postprocess-0.1.0.dist-info/WHEEL +5 -0
  54. ocr_postprocess-0.1.0.dist-info/entry_points.txt +2 -0
  55. ocr_postprocess-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,89 @@
1
+ """Stage 6 — Transform: apply transform ops and compute derived fields."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from typing import Any
7
+
8
+ from ocr_postprocess.models import Candidate, PipelineContext
9
+ from ocr_postprocess.transformer import apply_transforms, compute_field
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
+ def transform_stage(ctx: PipelineContext) -> None:
15
+ """Pipeline stage 6: transform candidate values and compute derived fields."""
16
+ profile = ctx.profile
17
+ if not profile:
18
+ return
19
+
20
+ # Build field lookup
21
+ field_map = {f.key: f for f in profile.fields}
22
+
23
+ # Apply transforms to existing candidates
24
+ transform_failures = 0
25
+ for cand in ctx.candidates:
26
+ field = field_map.get(cand.key)
27
+ if field is None or not field.transform:
28
+ continue
29
+ try:
30
+ transformed = apply_transforms(cand.value, field.transform)
31
+ cand.value = transformed
32
+ except Exception as exc:
33
+ transform_failures += 1
34
+ logger.warning("Transform failed for '%s': %s", cand.key, exc)
35
+ cand.notes.append(f"transform_error: {exc}")
36
+ cand.needs_llm_review = True
37
+
38
+ # Compute derived fields (topological order via profile.compute)
39
+ existing_values: dict[str, Any] = {c.key: c.value for c in ctx.candidates}
40
+ # Separate confidence map — dep VALUES are not confidences (values may be > 1)
41
+ existing_confidences: dict[str, float] = {c.key: c.confidence for c in ctx.candidates}
42
+
43
+ for compute_def in profile.compute:
44
+ key = compute_def.get("name") or compute_def.get("key", "")
45
+ expr = compute_def.get("expr") or compute_def.get("compute", "")
46
+ if not key or not expr:
47
+ continue
48
+
49
+ deps = compute_def.get("deps", [])
50
+ dep_values = {d: existing_values.get(d) for d in deps}
51
+
52
+ # Skip if any dep is None
53
+ if any(v is None for v in dep_values.values()):
54
+ logger.debug("Skipping compute '%s': missing deps %s", key, deps)
55
+ continue
56
+
57
+ try:
58
+ result = compute_field(expr, dep_values)
59
+ existing_values[key] = result
60
+ ctx.candidates.append(
61
+ Candidate(
62
+ key=key,
63
+ value=result,
64
+ raw=expr,
65
+ extractor="computed",
66
+ sources=["computed"] + [f"dep:{d}" for d in deps],
67
+ # Confidence = min confidence of deps (from actual candidate confidences),
68
+ # or 0.9 if there are no deps.
69
+ confidence=(
70
+ min(existing_confidences.get(d, 0.8) for d in deps) if deps else 0.9
71
+ ),
72
+ )
73
+ )
74
+ except Exception as exc:
75
+ logger.warning("Compute '%s' failed: %s", key, exc)
76
+ ctx.warnings.append(f"compute_error: {key} — {exc}")
77
+
78
+ computed_count = sum(1 for c in ctx.candidates if c.extractor == "computed")
79
+ if transform_failures:
80
+ logger.warning(
81
+ "Transform stage: %d transform failure(s), %d compute field(s) added",
82
+ transform_failures,
83
+ computed_count,
84
+ )
85
+ else:
86
+ logger.debug(
87
+ "Transform stage: %d compute field(s) added, 0 failures",
88
+ computed_count,
89
+ )
@@ -0,0 +1,30 @@
1
+ """Custom exceptions for ocr_postprocess."""
2
+
3
+
4
+ class OcrPostprocessError(Exception):
5
+ """Base exception for all ocr_postprocess errors."""
6
+
7
+
8
+ class ProfileNotFoundError(OcrPostprocessError):
9
+ """Profile id not found in registry."""
10
+
11
+
12
+ class ProfileValidationError(OcrPostprocessError):
13
+ """YAML profile failed schema validation."""
14
+
15
+ def __init__(self, message: str, field_path: str | None = None) -> None:
16
+ self.field_path = field_path
17
+ detail = f"[{field_path}] {message}" if field_path else message
18
+ super().__init__(detail)
19
+
20
+
21
+ class ExtractorNotFoundError(OcrPostprocessError):
22
+ """Extractor name not registered."""
23
+
24
+
25
+ class TransformError(OcrPostprocessError):
26
+ """Transform op or compute expression failed."""
27
+
28
+
29
+ class CyclicComputeError(OcrPostprocessError):
30
+ """Cyclic dependency detected in compute fields."""
File without changes
@@ -0,0 +1,103 @@
1
+ """Extractor abstract base classes."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from abc import ABC, abstractmethod
6
+ from typing import TYPE_CHECKING, Literal
7
+
8
+ from ocr_postprocess.models import Candidate, PipelineContext
9
+
10
+ if TYPE_CHECKING:
11
+ from ocr_postprocess.profiles.schema import FieldDef
12
+
13
+
14
class Extractor(ABC):
    """Abstract base for all extractors."""

    # Registry-facing identifier and extractor category.
    name: str
    kind: "Literal['pattern', 'label_anchor', 'structured']"

    @abstractmethod
    def extract(
        self,
        ctx: "PipelineContext",
        field: "FieldDef | None" = None,
    ) -> "list[Candidate]":
        """Produce candidates from the pipeline context."""
        ...


class PatternExtractor(Extractor):
    """Extractor that scans the full text with a regex pattern."""

    kind: "Literal['pattern', 'label_anchor', 'structured']" = "pattern"
    pattern: str = ""

    def validate(self, value: str) -> bool:
        """Hook for extra validation beyond the regex match."""
        return True

    def confidence_for(self, value: str, match: object) -> float:
        """Hook returning the confidence score for a match."""
        return 0.9

    def extract(
        self,
        ctx: "PipelineContext",
        field: "FieldDef | None" = None,
    ) -> "list[Candidate]":
        """Default pattern scan: one candidate per validated regex match."""
        # Import locally to avoid circular imports at package init time
        import regex as re

        if not self.pattern:
            return []

        out: "list[Candidate]" = []
        text = ctx.normalized_text or ctx.raw_text

        for m in re.compile(self.pattern).finditer(text):
            value = m.group(0)
            if not self.validate(value):
                continue
            out.append(
                Candidate(
                    key=field.key if field else self.name,
                    value=value,
                    raw=value,
                    extractor=self.name,
                    sources=[f"pattern:{self.name}"],
                    span=(m.start(), m.end()),
                    confidence=self.confidence_for(value, m),
                )
            )
        return out


class LabelAnchorExtractor(Extractor):
    """Extractor that uses label positions to locate values."""

    kind: "Literal['pattern', 'label_anchor', 'structured']" = "label_anchor"

    def extract(
        self,
        ctx: "PipelineContext",
        field: "FieldDef | None" = None,
    ) -> "list[Candidate]":
        # Base implementation yields nothing; subclasses override.
        return []


class StructuredExtractor(Extractor):
    """Extractor that parses structured regions (MRZ, tables)."""

    kind: "Literal['pattern', 'label_anchor', 'structured']" = "structured"

    def extract(
        self,
        ctx: "PipelineContext",
        field: "FieldDef | None" = None,
    ) -> "list[Candidate]":
        # Base implementation yields nothing; subclasses override.
        return []
@@ -0,0 +1,63 @@
1
+ """Shared helpers for extractors: label lookup and fuzzy matching."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+
7
+ from rapidfuzz import fuzz
8
+
9
+ from ocr_postprocess.models import LabelHit, PipelineContext
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
def fuzzy_match_label(text: str, label: str, threshold: float = 0.8) -> float:
    """Fuzzy-compare *label* against *text*.

    Returns the rapidfuzz ``partial_ratio`` score scaled to [0, 1] when it
    meets ``threshold``; otherwise returns 0.0.
    """
    ratio = fuzz.partial_ratio(label.lower(), text.lower()) / 100.0
    if ratio < threshold:
        return 0.0
    return ratio
18
+
19
+
20
def find_label(
    ctx: PipelineContext,
    aliases: list[str],
    fuzzy: bool = True,
    threshold: float = 0.8,
) -> list[LabelHit]:
    """Collect every LabelHit in ctx.label_index matching any alias.

    Index keys are compared exactly first; when ``fuzzy`` is enabled,
    non-exact keys fall back to a fuzzy score gated by ``threshold``.
    Duplicate hits (same section/line/char position) are collapsed.
    """
    results: list[LabelHit] = []
    visited: set[tuple[str, int, int]] = set()

    for alias in aliases:
        alias_lc = alias.lower()
        for key, key_hits in ctx.label_index.items():
            key_lc = key.lower()
            if key_lc == alias_lc:
                score = 1.0
            elif not fuzzy:
                continue
            else:
                score = fuzzy_match_label(key_lc, alias_lc, threshold)
                if score == 0.0:
                    continue
            for hit in key_hits:
                # When fuzzy matching is disabled, skip hits that were themselves
                # added to the index via a fuzzy (non-exact) text search.
                if not fuzzy and hit.fuzzy_score < 1.0:
                    continue
                pos = (hit.section_id, hit.line_index, hit.char_start)
                if pos in visited:
                    continue
                visited.add(pos)
                # Clone with the score for this alias and the alias recorded.
                results.append(
                    hit.model_copy(
                        update={
                            "fuzzy_score": score,
                            "aliases_matched": hit.aliases_matched + [alias],
                        }
                    )
                )

    return results
File without changes
@@ -0,0 +1,53 @@
1
+ """Label-anchor extractor: value on the line immediately after the label."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from ocr_postprocess.extractors.base import LabelAnchorExtractor
6
+ from ocr_postprocess.extractors.helpers import find_label
7
+ from ocr_postprocess.extractors.registry import register
8
+ from ocr_postprocess.models import Candidate, PipelineContext
9
+
10
+
11
@register("line_after_label")
class LineAfterLabelExtractor(LabelAnchorExtractor):
    """Extract the value found on the line directly below a matched label."""

    def extract(self, ctx: PipelineContext, field=None) -> list[Candidate]:
        """Return one candidate per label hit whose next line is non-empty."""
        if not field or not field.aliases:
            return []

        results: list[Candidate] = []
        for hit in find_label(ctx, field.aliases, fuzzy=field.fuzzy_label):
            section = next((s for s in ctx.sections if s.id == hit.section_id), None)
            if section is None:
                continue

            ordered = sorted(section.lines, key=lambda ln: ln.index)
            pos = next(
                (i for i, ln in enumerate(ordered) if ln.index == hit.line_index), None
            )
            # Need a line strictly after the label line.
            if pos is None or pos + 1 >= len(ordered):
                continue

            follower = ordered[pos + 1]
            value = follower.text.strip()
            if not value:
                continue

            results.append(
                Candidate(
                    key=field.key,
                    value=value,
                    raw=value,
                    extractor="line_after_label",
                    sources=["label_anchor"],
                    section_id=hit.section_id,
                    line_index=follower.index,
                    confidence=0.7 * hit.fuzzy_score,
                )
            )

        return results
@@ -0,0 +1,75 @@
1
+ """Label-anchor extractor: apply regex after finding a label."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import regex as re
6
+
7
+ from ocr_postprocess.extractors.base import LabelAnchorExtractor
8
+ from ocr_postprocess.extractors.helpers import find_label
9
+ from ocr_postprocess.extractors.registry import register
10
+ from ocr_postprocess.models import Candidate, PipelineContext
11
+
12
+
13
@register("regex_after_label")
class RegexAfterLabelExtractor(LabelAnchorExtractor):
    """Apply a regex to text after a label; group(1) is the extracted value."""

    def extract(self, ctx: PipelineContext, field=None) -> list[Candidate]:
        """Return candidates for every label hit whose regex matches.

        Searches the remainder of the label line first; if that fails and
        ``field.next_lines`` > 0, scans up to that many following lines.
        """
        if not field or not field.aliases or not field.pattern:
            return []

        compiled = re.compile(field.pattern)
        hits = find_label(ctx, field.aliases, fuzzy=field.fuzzy_label)
        candidates: list[Candidate] = []

        for hit in hits:
            section = next((s for s in ctx.sections if s.id == hit.section_id), None)
            if section is None:
                continue
            line = next((ln for ln in section.lines if ln.index == hit.line_index), None)
            if line is None:
                continue

            # Search the tail of the label line; offsets in the searched text
            # are relative to hit.char_end within that line.
            m = compiled.search(line.text[hit.char_end :])
            matched_line_index = hit.line_index
            offset = hit.char_end  # char offset of the searched text in its line

            # If no match on label line, search subsequent lines (up to field.next_lines).
            next_lines = getattr(field, "next_lines", 0)
            if not m and next_lines > 0:
                lines_sorted = sorted(section.lines, key=lambda ln: ln.index)
                label_pos = next(
                    (i for i, ln in enumerate(lines_sorted) if ln.index == hit.line_index),
                    None,
                )
                if label_pos is not None:
                    for extra_ln in lines_sorted[label_pos + 1 : label_pos + 1 + next_lines]:
                        m = compiled.search(extra_ln.text)
                        if m:
                            # BUGFIX: span/line_index must describe the line the
                            # match was actually found on — previously they kept
                            # the label line's index and char offsets.
                            matched_line_index = extra_ln.index
                            offset = 0
                            break

            if not m:
                continue

            value = m.group(1) if m.lastindex and m.lastindex >= 1 else m.group(0)
            value = value.strip()
            if not value:
                continue

            conf = 0.7 * hit.fuzzy_score
            candidates.append(
                Candidate(
                    key=field.key,
                    value=value,
                    raw=value,
                    extractor="regex_after_label",
                    sources=["label_anchor"],
                    section_id=hit.section_id,
                    line_index=matched_line_index,
                    span=(offset + m.start(), offset + m.end()),
                    confidence=conf,
                )
            )

        return candidates
@@ -0,0 +1,79 @@
1
+ """Label-anchor extractor: multi-line text until next known label (stop_labels)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from ocr_postprocess.extractors.base import LabelAnchorExtractor
6
+ from ocr_postprocess.extractors.helpers import find_label
7
+ from ocr_postprocess.extractors.registry import register
8
+ from ocr_postprocess.models import Candidate, PipelineContext
9
+
10
+
11
@register("text_until_next_label")
class TextUntilNextLabelExtractor(LabelAnchorExtractor):
    """Gather multi-line text from a label up to the next stop label."""

    def extract(self, ctx: PipelineContext, field=None) -> list[Candidate]:
        """Join the label-line tail plus following lines until a stop label."""
        if not field or not field.aliases:
            return []

        stop_aliases: list[str] = field.stop_labels or []
        # No explicit stops: treat every other field's aliases as implicit stops.
        if ctx.profile and not stop_aliases:
            for other in ctx.profile.fields:
                if other.key != field.key:
                    stop_aliases.extend(other.aliases)

        stops_per_section: dict[str, list[int]] = {}
        if stop_aliases:
            for s_hit in find_label(ctx, stop_aliases, fuzzy=False):
                stops_per_section.setdefault(s_hit.section_id, []).append(s_hit.line_index)

        results: list[Candidate] = []
        for hit in find_label(ctx, field.aliases, fuzzy=field.fuzzy_label):
            section = next((s for s in ctx.sections if s.id == hit.section_id), None)
            if section is None:
                continue

            stop_lines = set(stops_per_section.get(hit.section_id, []))
            ordered = sorted(section.lines, key=lambda ln: ln.index)
            pos = next(
                (i for i, ln in enumerate(ordered) if ln.index == hit.line_index), None
            )
            if pos is None:
                continue

            pieces: list[str] = []
            # The tail of the label line itself counts as the first piece.
            tail = ordered[pos].text[hit.char_end :].strip().lstrip(":: ")
            if tail:
                pieces.append(tail)

            # Collect following lines until the first stop-label line.
            for ln in ordered[pos + 1 :]:
                if ln.index in stop_lines:
                    break
                stripped = ln.text.strip()
                if stripped:
                    pieces.append(stripped)

            value = " ".join(pieces).strip()
            if not value:
                continue

            results.append(
                Candidate(
                    key=field.key,
                    value=value,
                    raw=value,
                    extractor="text_until_next_label",
                    sources=["label_anchor"],
                    section_id=hit.section_id,
                    line_index=hit.line_index,
                    confidence=0.7 * hit.fuzzy_score,
                )
            )

        return results
@@ -0,0 +1,65 @@
1
+ """Label-anchor extractor: value between two labels on the same line."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import regex as re
6
+
7
+ from ocr_postprocess.extractors.base import LabelAnchorExtractor
8
+ from ocr_postprocess.extractors.helpers import find_label
9
+ from ocr_postprocess.extractors.registry import register
10
+ from ocr_postprocess.models import Candidate, PipelineContext
11
+
12
+
13
@register("value_between_labels")
class ValueBetweenLabelsExtractor(LabelAnchorExtractor):
    """Extract value between two consecutive labels on the same line."""

    def extract(self, ctx: PipelineContext, field=None) -> list[Candidate]:
        """One candidate per hit: text after the label, cut at the next label."""
        if not field or not field.aliases:
            return []

        # Hoisted out of the per-hit loop: the alias pool never changes.
        all_aliases: list[str] = []
        if ctx.profile:
            for f in ctx.profile.fields:
                all_aliases.extend(f.aliases)

        hits = find_label(ctx, field.aliases, fuzzy=field.fuzzy_label)
        candidates: list[Candidate] = []

        for hit in hits:
            section = next((s for s in ctx.sections if s.id == hit.section_id), None)
            if section is None:
                continue
            line = next((ln for ln in section.lines if ln.index == hit.line_index), None)
            if line is None:
                continue

            raw_after = line.text[hit.char_end :]
            after = raw_after.strip().lstrip(":: ")
            # BUGFIX: `after` is a stripped view, so offsets within it are
            # shifted relative to the line; anchor the span at the value's
            # real position instead of hit.char_end.
            after_start = hit.char_end + (raw_after.find(after) if after else 0)

            # Value ends at the start of the next known label (or end of text).
            end_pos = len(after)
            for alias in all_aliases:
                m = re.search(re.escape(alias), after, re.IGNORECASE)
                if m and m.start() < end_pos:
                    end_pos = m.start()

            value = after[:end_pos].strip()
            if not value:
                continue

            conf = 0.7 * hit.fuzzy_score
            candidates.append(
                Candidate(
                    key=field.key,
                    value=value,
                    raw=line.text,
                    extractor="value_between_labels",
                    sources=["label_anchor"],
                    section_id=hit.section_id,
                    line_index=hit.line_index,
                    span=(after_start, after_start + end_pos),
                    confidence=conf,
                )
            )

        return candidates
@@ -0,0 +1,60 @@
1
+ """Label-anchor extractor: value on the same line as a label."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import regex as re
6
+
7
+ from ocr_postprocess.extractors.base import LabelAnchorExtractor
8
+ from ocr_postprocess.extractors.helpers import find_label
9
+ from ocr_postprocess.extractors.registry import register
10
+ from ocr_postprocess.models import Candidate, PipelineContext
11
+
12
+ _COLON_SPLIT = re.compile(r"[::]\s*")
13
+
14
+
15
@register("value_in_same_line")
class ValueInSameLineExtractor(LabelAnchorExtractor):
    """Extract value appearing after a label on the same line."""

    def extract(self, ctx: PipelineContext, field=None) -> list[Candidate]:
        """One candidate per label hit with a non-empty same-line value."""
        if not field or not field.aliases:
            return []

        hits = find_label(ctx, field.aliases, fuzzy=field.fuzzy_label)
        candidates: list[Candidate] = []

        for hit in hits:
            # Find the section and line
            section = next((s for s in ctx.sections if s.id == hit.section_id), None)
            if section is None:
                continue
            line = next((ln for ln in section.lines if ln.index == hit.line_index), None)
            if line is None:
                continue

            # Text after label end on the same line, separators peeled off.
            after = line.text[hit.char_end :].strip().lstrip(":: ")
            after = _COLON_SPLIT.split(after, 1)[-1].strip()

            # Stop before the next label-like pattern (2+ spaces or a tab).
            after = re.split(r"(?:\s{2,}|\t)", after)[0].strip()

            if not after:
                continue

            # BUGFIX: the span previously anchored at hit.char_end, ignoring
            # the whitespace/colons stripped above; locate the value's true
            # offset in the line so the span covers the emitted text.
            value_start = line.text.find(after, hit.char_end)
            if value_start == -1:
                value_start = hit.char_end  # defensive fallback to old anchor

            conf = 0.7 * hit.fuzzy_score
            candidates.append(
                Candidate(
                    key=field.key,
                    value=after,
                    raw=line.text,
                    extractor="value_in_same_line",
                    sources=["label_anchor"],
                    section_id=hit.section_id,
                    line_index=hit.line_index,
                    span=(value_start, value_start + len(after)),
                    confidence=conf,
                )
            )

        return candidates
File without changes