ocr-postprocess 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ocr_postprocess/__init__.py +33 -0
- ocr_postprocess/classifier.py +63 -0
- ocr_postprocess/cli.py +130 -0
- ocr_postprocess/engine/__init__.py +0 -0
- ocr_postprocess/engine/denoiser.py +134 -0
- ocr_postprocess/engine/extractor_stage.py +107 -0
- ocr_postprocess/engine/normalizer.py +128 -0
- ocr_postprocess/engine/reconciler.py +170 -0
- ocr_postprocess/engine/reconstructor.py +469 -0
- ocr_postprocess/engine/transform_stage.py +89 -0
- ocr_postprocess/exceptions.py +30 -0
- ocr_postprocess/extractors/__init__.py +0 -0
- ocr_postprocess/extractors/base.py +103 -0
- ocr_postprocess/extractors/helpers.py +63 -0
- ocr_postprocess/extractors/label_anchor/__init__.py +0 -0
- ocr_postprocess/extractors/label_anchor/line_after_label.py +53 -0
- ocr_postprocess/extractors/label_anchor/regex_after_label.py +75 -0
- ocr_postprocess/extractors/label_anchor/text_until_next_label.py +79 -0
- ocr_postprocess/extractors/label_anchor/value_between_labels.py +65 -0
- ocr_postprocess/extractors/label_anchor/value_in_same_line.py +60 -0
- ocr_postprocess/extractors/pattern/__init__.py +0 -0
- ocr_postprocess/extractors/pattern/cccd.py +120 -0
- ocr_postprocess/extractors/pattern/cmnd.py +38 -0
- ocr_postprocess/extractors/pattern/currency_vnd.py +48 -0
- ocr_postprocess/extractors/pattern/date.py +89 -0
- ocr_postprocess/extractors/pattern/email.py +38 -0
- ocr_postprocess/extractors/pattern/gender_vn.py +48 -0
- ocr_postprocess/extractors/pattern/phone_vn.py +83 -0
- ocr_postprocess/extractors/pattern/plate_vn.py +39 -0
- ocr_postprocess/extractors/pattern/tax_code.py +53 -0
- ocr_postprocess/extractors/registry.py +45 -0
- ocr_postprocess/extractors/structured/__init__.py +0 -0
- ocr_postprocess/extractors/structured/mrz_cccd.py +111 -0
- ocr_postprocess/extractors/universal.py +39 -0
- ocr_postprocess/models.py +131 -0
- ocr_postprocess/pipeline.py +179 -0
- ocr_postprocess/profiles/__init__.py +0 -0
- ocr_postprocess/profiles/_generic.yml +13 -0
- ocr_postprocess/profiles/cccd_2024.yml +113 -0
- ocr_postprocess/profiles/dang_kiem.yml +105 -0
- ocr_postprocess/profiles/loader.py +63 -0
- ocr_postprocess/profiles/matcher.py +71 -0
- ocr_postprocess/profiles/schema.py +197 -0
- ocr_postprocess/py.typed +0 -0
- ocr_postprocess/renderer/__init__.py +0 -0
- ocr_postprocess/renderer/json_renderer.py +59 -0
- ocr_postprocess/renderer/llm.py +41 -0
- ocr_postprocess/renderer/markdown.py +172 -0
- ocr_postprocess/scorer.py +78 -0
- ocr_postprocess/transformer.py +304 -0
- ocr_postprocess-0.1.0.dist-info/METADATA +189 -0
- ocr_postprocess-0.1.0.dist-info/RECORD +55 -0
- ocr_postprocess-0.1.0.dist-info/WHEEL +5 -0
- ocr_postprocess-0.1.0.dist-info/entry_points.txt +2 -0
- ocr_postprocess-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
"""Stage 7 — Reconciler: merge multi-source candidates, cross-validate."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
import unicodedata
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from ocr_postprocess.models import Candidate, CrossCheck, PipelineContext
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
# Priority order for conflict resolution (higher index = higher trust)
|
|
14
|
+
_SOURCE_PRIORITY = [
|
|
15
|
+
"label_anchor",
|
|
16
|
+
"pattern",
|
|
17
|
+
"pattern_with_checksum",
|
|
18
|
+
"structured",
|
|
19
|
+
"constant",
|
|
20
|
+
"computed",
|
|
21
|
+
]
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _source_priority(sources: list[str]) -> int:
|
|
25
|
+
best = 0
|
|
26
|
+
for src in sources:
|
|
27
|
+
for i, key in enumerate(_SOURCE_PRIORITY):
|
|
28
|
+
if key in src:
|
|
29
|
+
best = max(best, i)
|
|
30
|
+
return best
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _normalize_for_compare(value: Any) -> str:
|
|
34
|
+
"""Normalise value to a comparable string."""
|
|
35
|
+
if value is None:
|
|
36
|
+
return ""
|
|
37
|
+
s = str(value).strip().lower()
|
|
38
|
+
# Remove diacritics
|
|
39
|
+
nfkd = unicodedata.normalize("NFKD", s)
|
|
40
|
+
return "".join(c for c in nfkd if not unicodedata.combining(c))
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def reconcile_stage(ctx: PipelineContext) -> None:
    """Pipeline stage 7: reconcile and merge candidates.

    Groups ``ctx.candidates`` by field key, merges agreeing candidates with a
    confidence boost, resolves conflicts by source priority (flagging them),
    records a :class:`CrossCheck` for every multi-candidate field, injects
    constant fields declared in the profile, and warns on missing required
    fields.  Mutates ``ctx.candidates``, ``ctx.cross_checks`` and
    ``ctx.warnings`` in place; returns nothing.
    """
    profile = ctx.profile

    # Group by key
    grouped: dict[str, list[Candidate]] = {}
    for cand in ctx.candidates:
        grouped.setdefault(cand.key, []).append(cand)

    merged: list[Candidate] = []
    cross_checks: list[CrossCheck] = []

    for key, candidates in grouped.items():
        if not candidates:
            continue

        # Single candidate: nothing to reconcile, keep as-is.
        if len(candidates) == 1:
            merged.append(candidates[0])
            continue

        # Normalize values for comparison (empty strings are ignored, so a
        # field where every candidate normalises to "" counts as agreement).
        norm_values = [_normalize_for_compare(c.value) for c in candidates]
        unique_values = set(v for v in norm_values if v)

        # Union of source tags across all candidates for this key.
        all_sources: list[str] = []
        for c in candidates:
            all_sources.extend(c.sources)

        if len(unique_values) <= 1:
            # All agree — boost confidence.  The formula shrinks the gap to
            # 1.0 by 30%: e.g. 0.8 -> 0.86, 0.5 -> 0.65; capped at 1.0.
            best = max(candidates, key=lambda c: c.confidence)
            boosted_conf = min(1.0, 1.0 - (1.0 - best.confidence) * 0.7)
            merged.append(
                best.model_copy(
                    update={
                        "confidence": boosted_conf,
                        # dict.fromkeys keeps first-seen order while deduping.
                        "sources": list(dict.fromkeys(all_sources)),
                    }
                )
            )
        else:
            # Conflict — choose by source priority, flag conflict.
            # Priority first, confidence as tie-breaker.
            best = max(candidates, key=lambda c: (_source_priority(c.sources), c.confidence))
            # Internal detail (for logs): full extractor=value pairs
            detail = " vs ".join(f"{c.extractor}='{c.value}'" for c in candidates)
            # Human-readable: unique values only
            seen_vals: list[str] = []
            for c in candidates:
                v = str(c.value)
                if v not in seen_vals:
                    seen_vals.append(v)
            # Prefer the field's first alias as the display label when the
            # profile defines one; fall back to the raw key.
            field_label = key
            if ctx.profile:
                fdef = next((f for f in ctx.profile.fields if f.key == key), None)
                if fdef and fdef.aliases:
                    field_label = fdef.aliases[0]
            readable_vals = ", ".join(f'"{v}"' for v in seen_vals)
            # NOTE: user-facing warning text is intentionally in Vietnamese.
            warning_msg = f"Conflict: {field_label} — nhiều giá trị khác nhau: {readable_vals}"
            logger.warning("Conflict for field '%s': %s", key, detail)
            ctx.warnings.append(warning_msg)
            merged.append(
                best.model_copy(
                    update={
                        "conflict": True,
                        "sources": list(dict.fromkeys(all_sources)),
                        "notes": best.notes + [f"Conflict: {detail}"],
                    }
                )
            )

        # Cross-check: compare candidates from different extractors.
        # NOTE: `detail` is only bound in the conflict branch above; it is
        # referenced below only when all_agree is False, i.e. exactly when
        # that branch ran in this same iteration.
        if len(candidates) >= 2:
            all_agree = len(unique_values) <= 1
            # Collect unique original (non-normalized) values for human-readable display
            seen: list[str] = []
            for c in candidates:
                v = str(c.value)
                if v not in seen:
                    seen.append(v)
            # Build value_sources: for each unique value, pick best-confidence candidate as source
            vs_map: dict[str, dict] = {}
            for c in candidates:
                v = str(c.value)
                if v not in vs_map or c.confidence > vs_map[v]["confidence"]:
                    vs_map[v] = {
                        "value": v,
                        "extractor": c.extractor,
                        "confidence": c.confidence,
                        "raw": c.raw or "",
                        "line_index": c.line_index,
                    }
            cross_checks.append(
                CrossCheck(
                    field_key=key,
                    sources=list(dict.fromkeys(all_sources)),
                    matched=all_agree,
                    detail=f"{norm_values[0]!r}" if all_agree else detail,
                    values=seen,
                    value_sources=list(vs_map.values()),
                )
            )

    # Inject constant fields from profile (only when no extractor produced
    # a value for that key).
    if profile:
        existing_keys = {c.key for c in merged}
        for field in profile.fields:
            if field.constant is not None and field.key not in existing_keys:
                merged.append(
                    Candidate(
                        key=field.key,
                        value=field.constant,
                        raw=str(field.constant),
                        extractor="constant",
                        sources=["constant"],
                        confidence=1.0,
                    )
                )

        # Warn on missing required fields
        for field in profile.fields:
            if field.required and not any(c.key == field.key for c in merged):
                msg = f"Required field missing: {field.key}"
                ctx.warnings.append(msg)
                logger.warning(msg)

    ctx.candidates = merged
    ctx.cross_checks = cross_checks
    logger.debug("Reconciler: %d candidates, %d cross-checks", len(merged), len(cross_checks))
|
|
@@ -0,0 +1,469 @@
|
|
|
1
|
+
"""Stage 4 — LineReconstructor: 6 sub-steps to rebuild document structure."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
import unicodedata
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
import regex as re
|
|
10
|
+
from rapidfuzz import fuzz
|
|
11
|
+
|
|
12
|
+
from ocr_postprocess.models import LabelHit, Line, PipelineContext, Section
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
# ---------------------------------------------------------------------------
|
|
18
|
+
# Helpers
|
|
19
|
+
# ---------------------------------------------------------------------------
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _normalize_label(text: str) -> str:
|
|
23
|
+
"""Lowercase + strip diacritics for fuzzy label comparison."""
|
|
24
|
+
nfkd = unicodedata.normalize("NFKD", text)
|
|
25
|
+
return "".join(c for c in nfkd if not unicodedata.combining(c)).lower().strip()
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _all_aliases(profile: Any) -> list[str]:
|
|
29
|
+
"""Return all field aliases from profile (flat list)."""
|
|
30
|
+
aliases = []
|
|
31
|
+
if not profile:
|
|
32
|
+
return aliases
|
|
33
|
+
for field in profile.fields:
|
|
34
|
+
aliases.extend(field.aliases)
|
|
35
|
+
return aliases
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
# ---------------------------------------------------------------------------
|
|
39
|
+
# Sub-step (a) — Section splitter
|
|
40
|
+
# ---------------------------------------------------------------------------
|
|
41
|
+
|
|
42
|
+
_ALL_CAPS_LINE = re.compile(r"^[A-ZÀÁÂÃÈÉÊÌÍÒÓÔÕÙÚĂĐĨŨƠƯẠ-Ỵ0-9 \-/()]{8,}$")
|
|
43
|
+
_NUMBERED_HEADING = re.compile(r"^\d+\.\s+[A-ZÀÁÂÃÈÉÊÌÍÒÓÔÕÙÚĂĐĨŨƠƯẠ-Ỵ]")
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _step_a_section_split(
|
|
47
|
+
text: str, section_defs: list[Any]
|
|
48
|
+
) -> list[tuple[str, str | None, list[str]]]:
|
|
49
|
+
"""Split text into sections.
|
|
50
|
+
|
|
51
|
+
Returns list of (section_id, title, lines).
|
|
52
|
+
"""
|
|
53
|
+
lines = text.split("\n")
|
|
54
|
+
|
|
55
|
+
# Build compiled markers from profile
|
|
56
|
+
markers: list[tuple[str, str, bool]] = [] # (id, pattern, is_regex)
|
|
57
|
+
for sdef in section_defs:
|
|
58
|
+
for start in sdef.start:
|
|
59
|
+
markers.append((sdef.id, start, sdef.is_regex))
|
|
60
|
+
|
|
61
|
+
if not markers:
|
|
62
|
+
return [("_default", None, lines)]
|
|
63
|
+
|
|
64
|
+
sections: list[tuple[str, str | None, list[str]]] = []
|
|
65
|
+
current_id = "_default"
|
|
66
|
+
current_title: str | None = None
|
|
67
|
+
current_lines: list[str] = []
|
|
68
|
+
|
|
69
|
+
for line in lines:
|
|
70
|
+
matched_id = None
|
|
71
|
+
stripped = line.strip()
|
|
72
|
+
|
|
73
|
+
for sec_id, pattern, is_regex in markers:
|
|
74
|
+
if is_regex:
|
|
75
|
+
if re.search(pattern, stripped):
|
|
76
|
+
matched_id = sec_id
|
|
77
|
+
break
|
|
78
|
+
else:
|
|
79
|
+
if pattern.lower() in stripped.lower():
|
|
80
|
+
matched_id = sec_id
|
|
81
|
+
break
|
|
82
|
+
|
|
83
|
+
# Fallback heuristic
|
|
84
|
+
if matched_id is None and len(stripped) >= 8:
|
|
85
|
+
if _ALL_CAPS_LINE.match(stripped) or _NUMBERED_HEADING.match(stripped):
|
|
86
|
+
matched_id = re.sub(r"\s+", "_", stripped[:20]).upper()
|
|
87
|
+
|
|
88
|
+
if matched_id is not None and matched_id != current_id:
|
|
89
|
+
if current_lines:
|
|
90
|
+
sections.append((current_id, current_title, current_lines))
|
|
91
|
+
current_id = matched_id
|
|
92
|
+
current_title = stripped
|
|
93
|
+
current_lines = []
|
|
94
|
+
else:
|
|
95
|
+
current_lines.append(line)
|
|
96
|
+
|
|
97
|
+
if current_lines or not sections:
|
|
98
|
+
sections.append((current_id, current_title, current_lines))
|
|
99
|
+
|
|
100
|
+
return sections
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
# ---------------------------------------------------------------------------
|
|
104
|
+
# Sub-step (b) — Bilingual label normalizer
|
|
105
|
+
# ---------------------------------------------------------------------------
|
|
106
|
+
|
|
107
|
+
# "VN label / English label[:]" and "VN label (English label)[:]" detectors.
_BILINGUAL_SLASH = re.compile(r"(.+?)\s*/\s*([A-Za-z].+?)(\s*:)?$")
_BILINGUAL_PAREN = re.compile(r"(.+?)\s*\(([A-Za-z][^)]+)\)(\s*:)?$")


def _step_b_build_bilingual_map(
    lines: list[str],
    bilingual_pairs: list[list[str]],
) -> dict[str, str]:
    """Build VN↔EN alias map from explicit YAML pairs and auto-detected patterns.

    Keys are normalized (lowercased, diacritics stripped) label forms; values
    are the paired label's original spelling.  Both directions are stored.
    """
    alias_map: dict[str, str] = {}

    def _link(first: str, second: str) -> None:
        # Register both directions, keyed on the normalized form.
        alias_map[_normalize_label(first)] = second
        alias_map[_normalize_label(second)] = first

    # Explicit pairs from YAML (malformed entries without exactly 2 items
    # are ignored).
    for pair in bilingual_pairs:
        if len(pair) == 2:
            _link(pair[0], pair[1])

    # Auto-detect from text: slash form first, paren form as fallback.
    for line in lines:
        stripped = line.strip().rstrip(":")
        match = _BILINGUAL_SLASH.match(stripped) or _BILINGUAL_PAREN.match(stripped)
        if match:
            _link(match.group(1).strip(), match.group(2).strip())

    return alias_map
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
# ---------------------------------------------------------------------------
|
|
139
|
+
# Sub-step (c) — Multi-label-line splitter
|
|
140
|
+
# ---------------------------------------------------------------------------
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def _step_c_split_multi_label_lines(
    lines: list[str],
    known_labels: list[str],
    threshold: float = 0.8,
    min_count: int = 2,
    alias_map: dict[str, str] | None = None,
) -> list[str]:
    """Split lines containing multiple labels into separate lines.

    Bilingual label lines (e.g. "Họ và tên / Full name:") are NOT split because
    both labels refer to the same semantic field — splitting them causes downstream
    extractors to see the English translation as the field value.
    """
    if not known_labels:
        return lines

    _alias_map = alias_map or {}

    def _is_bilingual_pair(labels: list[str]) -> bool:
        """Return True if every label in *labels* is a bilingual alias of another label."""
        # NOTE(review): despite the docstring, this returns True as soon as
        # ANY pair of labels is alias-linked, not only when every label is.
        if len(labels) < 2:
            return False
        norm = [_normalize_label(la) for la in labels]
        for a in norm:
            for b in norm:
                if a != b and _alias_map.get(a) and _normalize_label(_alias_map[a]) == b:
                    return True
        return False

    result: list[str] = []

    for line in lines:
        # Find positions of known labels in this line
        positions: list[tuple[int, int, str]] = []  # (start, end, label)
        line_lower = line.lower()

        # Exact (case-insensitive) occurrences of every known label.
        for label in known_labels:
            label_lower = label.lower()
            for m in re.finditer(re.escape(label_lower), line_lower):
                positions.append((m.start(), m.end(), label))

        # Also fuzzy-match labels: try every 1..4-word window against every
        # label (O(words x labels) rapidfuzz calls — only on lines > 10 chars).
        if len(line) > 10:
            words = line.split()
            for i in range(len(words)):
                for j in range(i + 1, min(i + 5, len(words) + 1)):
                    fragment = " ".join(words[i:j])
                    for label in known_labels:
                        score = fuzz.ratio(fragment.lower(), label.lower()) / 100.0
                        # Exact matches were already collected above, so only
                        # near-misses are added here.
                        if score >= threshold and (fragment.lower() != label.lower()):
                            # NOTE(review): find() locates the FIRST occurrence
                            # of the fragment, which may not be this window's
                            # position when the fragment repeats in the line.
                            start = line.lower().find(fragment.lower())
                            if start >= 0:
                                positions.append((start, start + len(fragment), label))

        # Remove duplicates and overlaps, sort by position
        positions.sort(key=lambda x: x[0])
        filtered: list[tuple[int, int, str]] = []
        last_end = -1
        for start, end, label in positions:
            if start >= last_end:
                filtered.append((start, end, label))
                last_end = end

        found_labels = [la for _, _, la in filtered]

        if len(filtered) >= min_count and not _is_bilingual_pair(found_labels):
            # Split line at each label position
            parts: list[str] = []
            for idx, (start, end, label) in enumerate(filtered):
                # Value = text between this label's end and next label's start
                next_start = filtered[idx + 1][0] if idx + 1 < len(filtered) else len(line)
                value = line[end:next_start].strip().lstrip(": ")
                parts.append(f"{label}: {value}" if value else label)
            result.extend(parts)
        else:
            result.append(line)

    return result
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
# ---------------------------------------------------------------------------
|
|
224
|
+
# Sub-step (d) — Wrap rejoin
|
|
225
|
+
# ---------------------------------------------------------------------------
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
def _step_d_wrap_rejoin(lines: list[str]) -> list[str]:
|
|
229
|
+
"""Rejoin lines that were wrapped mid-sentence by OCR."""
|
|
230
|
+
result: list[str] = []
|
|
231
|
+
i = 0
|
|
232
|
+
while i < len(lines):
|
|
233
|
+
line = lines[i]
|
|
234
|
+
stripped = line.strip()
|
|
235
|
+
|
|
236
|
+
if not stripped:
|
|
237
|
+
result.append(line)
|
|
238
|
+
i += 1
|
|
239
|
+
continue
|
|
240
|
+
|
|
241
|
+
# Line ends with colon: next line is value
|
|
242
|
+
if stripped.endswith(":") and i + 1 < len(lines):
|
|
243
|
+
next_line = lines[i + 1].strip()
|
|
244
|
+
if next_line and not next_line.endswith(":"):
|
|
245
|
+
result.append(f"{stripped} {next_line}")
|
|
246
|
+
i += 2
|
|
247
|
+
continue
|
|
248
|
+
|
|
249
|
+
# Continuation: next line starts with lowercase (wrapped)
|
|
250
|
+
if i + 1 < len(lines) and not stripped.endswith((".", "!", "?")):
|
|
251
|
+
next_stripped = lines[i + 1].strip()
|
|
252
|
+
if next_stripped and next_stripped[0].islower():
|
|
253
|
+
result.append(f"{stripped} {next_stripped}")
|
|
254
|
+
i += 2
|
|
255
|
+
continue
|
|
256
|
+
|
|
257
|
+
result.append(line)
|
|
258
|
+
i += 1
|
|
259
|
+
|
|
260
|
+
return result
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
# ---------------------------------------------------------------------------
|
|
264
|
+
# Sub-step (e) — EN-paren-rejoin
|
|
265
|
+
# ---------------------------------------------------------------------------
|
|
266
|
+
|
|
267
|
+
_EN_PAREN_LINE = re.compile(r"^\s*\(([A-Za-z][^)]*)\)\s*$")
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
def _step_e_en_paren_rejoin(lines: list[str]) -> list[str]:
|
|
271
|
+
"""Merge EN-only paren lines as aliases on the label above."""
|
|
272
|
+
result: list[str] = []
|
|
273
|
+
for i, line in enumerate(lines):
|
|
274
|
+
m = _EN_PAREN_LINE.match(line)
|
|
275
|
+
if m and result:
|
|
276
|
+
# Append as alias to last line
|
|
277
|
+
result[-1] = f"{result[-1].rstrip()} / {m.group(1)}"
|
|
278
|
+
else:
|
|
279
|
+
result.append(line)
|
|
280
|
+
return result
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
# ---------------------------------------------------------------------------
|
|
284
|
+
# Sub-step (f) — Label-boundary splitter
|
|
285
|
+
# ---------------------------------------------------------------------------
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
def _step_f_label_boundary_split(
|
|
289
|
+
lines: list[str], known_labels: list[str], max_iters: int = 5
|
|
290
|
+
) -> list[str]:
|
|
291
|
+
"""Split lines where a label is glued directly after a value."""
|
|
292
|
+
if not known_labels:
|
|
293
|
+
return lines
|
|
294
|
+
|
|
295
|
+
for _ in range(max_iters):
|
|
296
|
+
changed = False
|
|
297
|
+
new_lines: list[str] = []
|
|
298
|
+
for line in lines:
|
|
299
|
+
split_done = False
|
|
300
|
+
for label in known_labels:
|
|
301
|
+
# Pattern: value immediately followed by label (no space separator)
|
|
302
|
+
escaped = re.escape(label)
|
|
303
|
+
pattern = re.compile(rf"(\S)({escaped})", re.IGNORECASE)
|
|
304
|
+
m = pattern.search(line)
|
|
305
|
+
if m and m.start() > 0:
|
|
306
|
+
before = line[: m.start() + 1].strip()
|
|
307
|
+
after = line[m.start() + 1 :].strip()
|
|
308
|
+
if before and after:
|
|
309
|
+
new_lines.append(before)
|
|
310
|
+
new_lines.append(after)
|
|
311
|
+
split_done = True
|
|
312
|
+
changed = True
|
|
313
|
+
break
|
|
314
|
+
if not split_done:
|
|
315
|
+
new_lines.append(line)
|
|
316
|
+
lines = new_lines
|
|
317
|
+
if not changed:
|
|
318
|
+
break
|
|
319
|
+
|
|
320
|
+
return lines
|
|
321
|
+
|
|
322
|
+
|
|
323
|
+
# ---------------------------------------------------------------------------
|
|
324
|
+
# Label index builder
|
|
325
|
+
# ---------------------------------------------------------------------------
|
|
326
|
+
|
|
327
|
+
|
|
328
|
+
def _build_label_index(
    sections: list[Section],
    known_labels: list[str],
    alias_map: dict[str, str],
    threshold: float = 0.8,
) -> dict[str, list[LabelHit]]:
    """Build label_index: lowercased label → list of LabelHit.

    Every line of every section is tested against every known label: first by
    exact (case-insensitive) substring, then by rapidfuzz partial_ratio when
    no exact occurrence exists.  Hits are also indexed under the label's
    bilingual alias, if any.
    """
    label_index: dict[str, list[LabelHit]] = {}

    for section in sections:
        for line in section.lines:
            for label in known_labels:
                label_lower = label.lower()
                text_lower = line.text.lower()
                start = text_lower.find(label_lower)
                if start < 0:
                    # Fuzzy search; below-threshold scores skip this label.
                    score = fuzz.partial_ratio(label_lower, text_lower) / 100.0
                    if score < threshold:
                        continue
                    # Fuzzy hits cannot be located precisely, so the span is
                    # the entire line.
                    start = 0
                    end = len(line.text)
                else:
                    end = start + len(label)
                    score = 1.0

                hit = LabelHit(
                    label=label,
                    aliases_matched=[label],
                    section_id=section.id,
                    line_index=line.index,
                    char_start=start,
                    char_end=end,
                    fuzzy_score=score,
                )
                label_index.setdefault(label_lower, []).append(hit)

                # Also index alias — note the SAME hit object is shared
                # between the label key and the alias key.
                if label_lower in alias_map:
                    alias = alias_map[label_lower]
                    label_index.setdefault(alias.lower(), []).append(hit)

    return label_index
|
|
371
|
+
|
|
372
|
+
|
|
373
|
+
# ---------------------------------------------------------------------------
|
|
374
|
+
# Main reconstruct stage
|
|
375
|
+
# ---------------------------------------------------------------------------
|
|
376
|
+
|
|
377
|
+
|
|
378
|
+
def reconstruct_stage(ctx: PipelineContext) -> None:
    """Pipeline stage 4: LineReconstructor.

    Runs the enabled sub-steps (a)–(f) over ``ctx.normalized_text`` (falling
    back to ``ctx.raw_text``), populating ``ctx.sections`` and
    ``ctx.label_index``.  Each sub-step (b)–(f) is individually shielded: a
    failure is logged and that step is skipped, never aborting the stage.
    """
    profile = ctx.profile
    rcfg = profile.reconstruct if profile else None
    section_defs = profile.sections if profile else []
    bilingual_pairs = rcfg.bilingual_pairs if rcfg else []
    fuzzy_threshold = rcfg.fuzzy_threshold if rcfg else 0.8
    # All six steps run by default when the profile doesn't restrict them.
    enabled = set(rcfg.enabled_steps if rcfg else ["a", "b", "c", "d", "e", "f"])

    text = ctx.normalized_text or ctx.raw_text

    # (a) Section split
    raw_sections = (
        _step_a_section_split(text, section_defs)
        if "a" in enabled
        else [("_default", None, text.split("\n"))]
    )

    sections: list[Section] = []
    all_aliases = _all_aliases(profile)
    # Accumulates across sections: aliases found in earlier sections are
    # available to later ones.
    alias_map: dict[str, str] = {}

    for sec_id, sec_title, sec_lines in raw_sections:
        lines = sec_lines

        # (b) Bilingual map
        if "b" in enabled:
            try:
                alias_map.update(_step_b_build_bilingual_map(lines, bilingual_pairs))
            except Exception:
                logger.exception("Reconstructor step-b failed (bilingual map); skipping")

        # Build full label set: profile aliases + bilingual aliases
        known_labels = list(set(all_aliases + list(alias_map.keys()) + list(alias_map.values())))
        known_labels = [la for la in known_labels if la.strip()]

        # (c) Multi-label-line split
        if "c" in enabled:
            try:
                min_count = rcfg.multi_label_min_count if rcfg else 2
                lines = _step_c_split_multi_label_lines(
                    lines, known_labels, fuzzy_threshold, min_count, alias_map=alias_map
                )
            except Exception:
                logger.exception("Reconstructor step-c failed (label split); skipping")

        # (d) Wrap rejoin
        if "d" in enabled:
            try:
                lines = _step_d_wrap_rejoin(lines)
            except Exception:
                logger.exception("Reconstructor step-d failed (wrap rejoin); skipping")

        # (e) EN-paren-rejoin
        if "e" in enabled:
            try:
                lines = _step_e_en_paren_rejoin(lines)
            except Exception:
                logger.exception("Reconstructor step-e failed (paren rejoin); skipping")

        # (f) Label-boundary split
        if "f" in enabled:
            try:
                lines = _step_f_label_boundary_split(lines, known_labels)
            except Exception:
                logger.exception("Reconstructor step-f failed (boundary split); skipping")

        # NOTE: Line.index is the pre-filter enumerate position, so indices
        # keep gaps where blank lines were dropped.
        section = Section(
            id=sec_id,
            title=sec_title,
            lines=[
                Line(index=i, text=line)
                for i, line in enumerate(lines)
                if line.strip()  # skip blank lines
            ],
        )
        sections.append(section)

    ctx.sections = sections

    # Build label_index using the full known_labels (profile aliases + auto-detected bilingual
    # labels), so stop_labels and extractors can reference any label found in the text.
    full_known_labels = list(set(all_aliases + list(alias_map.keys()) + list(alias_map.values())))
    full_known_labels = [la for la in full_known_labels if la.strip()]
    ctx.label_index = _build_label_index(sections, full_known_labels, alias_map, fuzzy_threshold)
    total_lines = sum(len(s.lines) for s in sections)
    logger.debug(
        "Reconstructor: %d section(s), %d line(s), %d label entries",
        len(sections),
        total_lines,
        len(ctx.label_index),
    )
|