ocr-postprocess 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. ocr_postprocess/__init__.py +33 -0
  2. ocr_postprocess/classifier.py +63 -0
  3. ocr_postprocess/cli.py +130 -0
  4. ocr_postprocess/engine/__init__.py +0 -0
  5. ocr_postprocess/engine/denoiser.py +134 -0
  6. ocr_postprocess/engine/extractor_stage.py +107 -0
  7. ocr_postprocess/engine/normalizer.py +128 -0
  8. ocr_postprocess/engine/reconciler.py +170 -0
  9. ocr_postprocess/engine/reconstructor.py +469 -0
  10. ocr_postprocess/engine/transform_stage.py +89 -0
  11. ocr_postprocess/exceptions.py +30 -0
  12. ocr_postprocess/extractors/__init__.py +0 -0
  13. ocr_postprocess/extractors/base.py +103 -0
  14. ocr_postprocess/extractors/helpers.py +63 -0
  15. ocr_postprocess/extractors/label_anchor/__init__.py +0 -0
  16. ocr_postprocess/extractors/label_anchor/line_after_label.py +53 -0
  17. ocr_postprocess/extractors/label_anchor/regex_after_label.py +75 -0
  18. ocr_postprocess/extractors/label_anchor/text_until_next_label.py +79 -0
  19. ocr_postprocess/extractors/label_anchor/value_between_labels.py +65 -0
  20. ocr_postprocess/extractors/label_anchor/value_in_same_line.py +60 -0
  21. ocr_postprocess/extractors/pattern/__init__.py +0 -0
  22. ocr_postprocess/extractors/pattern/cccd.py +120 -0
  23. ocr_postprocess/extractors/pattern/cmnd.py +38 -0
  24. ocr_postprocess/extractors/pattern/currency_vnd.py +48 -0
  25. ocr_postprocess/extractors/pattern/date.py +89 -0
  26. ocr_postprocess/extractors/pattern/email.py +38 -0
  27. ocr_postprocess/extractors/pattern/gender_vn.py +48 -0
  28. ocr_postprocess/extractors/pattern/phone_vn.py +83 -0
  29. ocr_postprocess/extractors/pattern/plate_vn.py +39 -0
  30. ocr_postprocess/extractors/pattern/tax_code.py +53 -0
  31. ocr_postprocess/extractors/registry.py +45 -0
  32. ocr_postprocess/extractors/structured/__init__.py +0 -0
  33. ocr_postprocess/extractors/structured/mrz_cccd.py +111 -0
  34. ocr_postprocess/extractors/universal.py +39 -0
  35. ocr_postprocess/models.py +131 -0
  36. ocr_postprocess/pipeline.py +179 -0
  37. ocr_postprocess/profiles/__init__.py +0 -0
  38. ocr_postprocess/profiles/_generic.yml +13 -0
  39. ocr_postprocess/profiles/cccd_2024.yml +113 -0
  40. ocr_postprocess/profiles/dang_kiem.yml +105 -0
  41. ocr_postprocess/profiles/loader.py +63 -0
  42. ocr_postprocess/profiles/matcher.py +71 -0
  43. ocr_postprocess/profiles/schema.py +197 -0
  44. ocr_postprocess/py.typed +0 -0
  45. ocr_postprocess/renderer/__init__.py +0 -0
  46. ocr_postprocess/renderer/json_renderer.py +59 -0
  47. ocr_postprocess/renderer/llm.py +41 -0
  48. ocr_postprocess/renderer/markdown.py +172 -0
  49. ocr_postprocess/scorer.py +78 -0
  50. ocr_postprocess/transformer.py +304 -0
  51. ocr_postprocess-0.1.0.dist-info/METADATA +189 -0
  52. ocr_postprocess-0.1.0.dist-info/RECORD +55 -0
  53. ocr_postprocess-0.1.0.dist-info/WHEEL +5 -0
  54. ocr_postprocess-0.1.0.dist-info/entry_points.txt +2 -0
  55. ocr_postprocess-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,170 @@
1
+ """Stage 7 — Reconciler: merge multi-source candidates, cross-validate."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ import unicodedata
7
+ from typing import Any
8
+
9
+ from ocr_postprocess.models import Candidate, CrossCheck, PipelineContext
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+ # Priority order for conflict resolution (higher index = higher trust)
14
+ _SOURCE_PRIORITY = [
15
+ "label_anchor",
16
+ "pattern",
17
+ "pattern_with_checksum",
18
+ "structured",
19
+ "constant",
20
+ "computed",
21
+ ]
22
+
23
+
24
+ def _source_priority(sources: list[str]) -> int:
25
+ best = 0
26
+ for src in sources:
27
+ for i, key in enumerate(_SOURCE_PRIORITY):
28
+ if key in src:
29
+ best = max(best, i)
30
+ return best
31
+
32
+
33
+ def _normalize_for_compare(value: Any) -> str:
34
+ """Normalise value to a comparable string."""
35
+ if value is None:
36
+ return ""
37
+ s = str(value).strip().lower()
38
+ # Remove diacritics
39
+ nfkd = unicodedata.normalize("NFKD", s)
40
+ return "".join(c for c in nfkd if not unicodedata.combining(c))
41
+
42
+
43
def reconcile_stage(ctx: PipelineContext) -> None:
    """Pipeline stage 7: reconcile and merge candidates.

    Groups ``ctx.candidates`` by field key and, for each group:

    * a single candidate is kept as-is;
    * if all candidates agree (after diacritic-insensitive normalisation),
      the highest-confidence one is kept with a confidence boost;
    * if values disagree, the winner is chosen by source priority then
      confidence, flagged ``conflict=True``, and a warning is recorded.

    Every multi-candidate group also yields a ``CrossCheck`` record.  Then
    profile ``constant`` fields are injected for keys not yet present, and
    missing ``required`` fields produce warnings.  Results are written back
    to ``ctx.candidates`` and ``ctx.cross_checks`` in place.
    """
    profile = ctx.profile

    # Group by key
    grouped: dict[str, list[Candidate]] = {}
    for cand in ctx.candidates:
        grouped.setdefault(cand.key, []).append(cand)

    merged: list[Candidate] = []
    cross_checks: list[CrossCheck] = []

    for key, candidates in grouped.items():
        if not candidates:
            continue

        if len(candidates) == 1:
            merged.append(candidates[0])
            continue

        # Normalize values for comparison (empty strings are ignored, so a
        # group whose values all normalise to "" counts as agreement)
        norm_values = [_normalize_for_compare(c.value) for c in candidates]
        unique_values = set(v for v in norm_values if v)

        all_sources = []
        for c in candidates:
            all_sources.extend(c.sources)

        if len(unique_values) <= 1:
            # All agree — boost confidence by shrinking the remaining
            # uncertainty (1 - conf) to 70% of its previous size
            best = max(candidates, key=lambda c: c.confidence)
            boosted_conf = min(1.0, 1.0 - (1.0 - best.confidence) * 0.7)
            merged.append(
                best.model_copy(
                    update={
                        "confidence": boosted_conf,
                        # dict.fromkeys dedupes while preserving order
                        "sources": list(dict.fromkeys(all_sources)),
                    }
                )
            )
        else:
            # Conflict — choose by source priority, flag conflict
            best = max(candidates, key=lambda c: (_source_priority(c.sources), c.confidence))
            # Internal detail (for logs): full extractor=value pairs
            detail = " vs ".join(f"{c.extractor}='{c.value}'" for c in candidates)
            # Human-readable: unique values only
            seen_vals: list[str] = []
            for c in candidates:
                v = str(c.value)
                if v not in seen_vals:
                    seen_vals.append(v)
            # Prefer the field's first alias (its display label) over the raw key
            field_label = key
            if ctx.profile:
                fdef = next((f for f in ctx.profile.fields if f.key == key), None)
                if fdef and fdef.aliases:
                    field_label = fdef.aliases[0]
            readable_vals = ", ".join(f'"{v}"' for v in seen_vals)
            # User-facing warning text is intentionally Vietnamese
            warning_msg = f"Conflict: {field_label} — nhiều giá trị khác nhau: {readable_vals}"
            logger.warning("Conflict for field '%s': %s", key, detail)
            ctx.warnings.append(warning_msg)
            merged.append(
                best.model_copy(
                    update={
                        "conflict": True,
                        "sources": list(dict.fromkeys(all_sources)),
                        "notes": best.notes + [f"Conflict: {detail}"],
                    }
                )
            )

        # Cross-check: compare candidates from different extractors
        # (always true here — the single-candidate case continue'd above)
        if len(candidates) >= 2:
            all_agree = len(unique_values) <= 1
            # Collect unique original (non-normalized) values for human-readable display
            seen: list[str] = []
            for c in candidates:
                v = str(c.value)
                if v not in seen:
                    seen.append(v)
            # Build value_sources: for each unique value, pick best-confidence candidate as source
            vs_map: dict[str, dict] = {}
            for c in candidates:
                v = str(c.value)
                if v not in vs_map or c.confidence > vs_map[v]["confidence"]:
                    vs_map[v] = {
                        "value": v,
                        "extractor": c.extractor,
                        "confidence": c.confidence,
                        "raw": c.raw or "",
                        "line_index": c.line_index,
                    }
            cross_checks.append(
                CrossCheck(
                    field_key=key,
                    sources=list(dict.fromkeys(all_sources)),
                    matched=all_agree,
                    # `detail` is only bound in the conflict branch, but the
                    # conditional never evaluates it when all_agree is True
                    detail=f"{norm_values[0]!r}" if all_agree else detail,
                    values=seen,
                    value_sources=list(vs_map.values()),
                )
            )

    # Inject constant fields from profile
    if profile:
        existing_keys = {c.key for c in merged}
        for field in profile.fields:
            if field.constant is not None and field.key not in existing_keys:
                merged.append(
                    Candidate(
                        key=field.key,
                        value=field.constant,
                        raw=str(field.constant),
                        extractor="constant",
                        sources=["constant"],
                        confidence=1.0,
                    )
                )

        # Warn on missing required fields
        for field in profile.fields:
            if field.required and not any(c.key == field.key for c in merged):
                msg = f"Required field missing: {field.key}"
                ctx.warnings.append(msg)
                logger.warning(msg)

    ctx.candidates = merged
    ctx.cross_checks = cross_checks
    logger.debug("Reconciler: %d candidates, %d cross-checks", len(merged), len(cross_checks))
@@ -0,0 +1,469 @@
1
+ """Stage 4 — LineReconstructor: 6 sub-steps to rebuild document structure."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ import unicodedata
7
+ from typing import Any
8
+
9
+ import regex as re
10
+ from rapidfuzz import fuzz
11
+
12
+ from ocr_postprocess.models import LabelHit, Line, PipelineContext, Section
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
+ # ---------------------------------------------------------------------------
18
+ # Helpers
19
+ # ---------------------------------------------------------------------------
20
+
21
+
22
+ def _normalize_label(text: str) -> str:
23
+ """Lowercase + strip diacritics for fuzzy label comparison."""
24
+ nfkd = unicodedata.normalize("NFKD", text)
25
+ return "".join(c for c in nfkd if not unicodedata.combining(c)).lower().strip()
26
+
27
+
28
+ def _all_aliases(profile: Any) -> list[str]:
29
+ """Return all field aliases from profile (flat list)."""
30
+ aliases = []
31
+ if not profile:
32
+ return aliases
33
+ for field in profile.fields:
34
+ aliases.extend(field.aliases)
35
+ return aliases
36
+
37
+
38
+ # ---------------------------------------------------------------------------
39
+ # Sub-step (a) — Section splitter
40
+ # ---------------------------------------------------------------------------
41
+
42
+ _ALL_CAPS_LINE = re.compile(r"^[A-ZÀÁÂÃÈÉÊÌÍÒÓÔÕÙÚĂĐĨŨƠƯẠ-Ỵ0-9 \-/()]{8,}$")
43
+ _NUMBERED_HEADING = re.compile(r"^\d+\.\s+[A-ZÀÁÂÃÈÉÊÌÍÒÓÔÕÙÚĂĐĨŨƠƯẠ-Ỵ]")
44
+
45
+
46
+ def _step_a_section_split(
47
+ text: str, section_defs: list[Any]
48
+ ) -> list[tuple[str, str | None, list[str]]]:
49
+ """Split text into sections.
50
+
51
+ Returns list of (section_id, title, lines).
52
+ """
53
+ lines = text.split("\n")
54
+
55
+ # Build compiled markers from profile
56
+ markers: list[tuple[str, str, bool]] = [] # (id, pattern, is_regex)
57
+ for sdef in section_defs:
58
+ for start in sdef.start:
59
+ markers.append((sdef.id, start, sdef.is_regex))
60
+
61
+ if not markers:
62
+ return [("_default", None, lines)]
63
+
64
+ sections: list[tuple[str, str | None, list[str]]] = []
65
+ current_id = "_default"
66
+ current_title: str | None = None
67
+ current_lines: list[str] = []
68
+
69
+ for line in lines:
70
+ matched_id = None
71
+ stripped = line.strip()
72
+
73
+ for sec_id, pattern, is_regex in markers:
74
+ if is_regex:
75
+ if re.search(pattern, stripped):
76
+ matched_id = sec_id
77
+ break
78
+ else:
79
+ if pattern.lower() in stripped.lower():
80
+ matched_id = sec_id
81
+ break
82
+
83
+ # Fallback heuristic
84
+ if matched_id is None and len(stripped) >= 8:
85
+ if _ALL_CAPS_LINE.match(stripped) or _NUMBERED_HEADING.match(stripped):
86
+ matched_id = re.sub(r"\s+", "_", stripped[:20]).upper()
87
+
88
+ if matched_id is not None and matched_id != current_id:
89
+ if current_lines:
90
+ sections.append((current_id, current_title, current_lines))
91
+ current_id = matched_id
92
+ current_title = stripped
93
+ current_lines = []
94
+ else:
95
+ current_lines.append(line)
96
+
97
+ if current_lines or not sections:
98
+ sections.append((current_id, current_title, current_lines))
99
+
100
+ return sections
101
+
102
+
103
+ # ---------------------------------------------------------------------------
104
+ # Sub-step (b) — Bilingual label normalizer
105
+ # ---------------------------------------------------------------------------
106
+
107
# "Label VN / Label EN[:]" and "Label VN (Label EN)[:]" bilingual shapes;
# the second half must start with an ASCII letter (the English part).
_BILINGUAL_SLASH = re.compile(r"(.+?)\s*/\s*([A-Za-z].+?)(\s*:)?$")
_BILINGUAL_PAREN = re.compile(r"(.+?)\s*\(([A-Za-z][^)]+)\)(\s*:)?$")


def _step_b_build_bilingual_map(
    lines: list[str],
    bilingual_pairs: list[list[str]],
) -> dict[str, str]:
    """Build a VN↔EN alias map (normalized label → counterpart label).

    Two-element pairs from the profile YAML are registered first; slash/paren
    bilingual lines auto-detected in *lines* are added afterwards and may
    overwrite a YAML entry that shares the same normalized key.
    """
    alias_map: dict[str, str] = {}

    def register(first: str, second: str) -> None:
        # Index each direction under its diacritic-free lowercase key.
        alias_map[_normalize_label(first)] = second
        alias_map[_normalize_label(second)] = first

    # Explicit pairs from YAML
    for pair in bilingual_pairs:
        if len(pair) == 2:
            register(pair[0], pair[1])

    # Auto-detect from text (first matching shape per line wins)
    for raw in lines:
        candidate = raw.strip().rstrip(":")
        for pattern in (_BILINGUAL_SLASH, _BILINGUAL_PAREN):
            match = pattern.match(candidate)
            if match:
                register(match.group(1).strip(), match.group(2).strip())
                break

    return alias_map
136
+
137
+
138
+ # ---------------------------------------------------------------------------
139
+ # Sub-step (c) — Multi-label-line splitter
140
+ # ---------------------------------------------------------------------------
141
+
142
+
143
def _step_c_split_multi_label_lines(
    lines: list[str],
    known_labels: list[str],
    threshold: float = 0.8,
    min_count: int = 2,
    alias_map: dict[str, str] | None = None,
) -> list[str]:
    """Split lines containing multiple labels into separate lines.

    Labels are located by exact (case-insensitive) substring search, plus a
    fuzzy pass over fragments of 1–4 consecutive words scored with
    ``fuzz.ratio`` against each label (exact-equal fragments are excluded —
    the exact pass already found them).  A line is split only when at least
    *min_count* non-overlapping hits remain after filtering.

    Bilingual label lines (e.g. "Họ và tên / Full name:") are NOT split because
    both labels refer to the same semantic field — splitting them causes downstream
    extractors to see the English translation as the field value.
    """
    if not known_labels:
        return lines

    _alias_map = alias_map or {}

    def _is_bilingual_pair(labels: list[str]) -> bool:
        """Return True if every label in *labels* is a bilingual alias of another label."""
        if len(labels) < 2:
            return False
        norm = [_normalize_label(la) for la in labels]
        for a in norm:
            for b in norm:
                if a != b and _alias_map.get(a) and _normalize_label(_alias_map[a]) == b:
                    return True
        return False

    result: list[str] = []

    for line in lines:
        # Find positions of known labels in this line
        positions: list[tuple[int, int, str]] = []  # (start, end, label)
        line_lower = line.lower()

        # Exact pass: every case-insensitive occurrence of every label.
        for label in known_labels:
            label_lower = label.lower()
            for m in re.finditer(re.escape(label_lower), line_lower):
                positions.append((m.start(), m.end(), label))

        # Also fuzzy-match labels (only on lines long enough to hold a value)
        if len(line) > 10:
            words = line.split()
            for i in range(len(words)):
                for j in range(i + 1, min(i + 5, len(words) + 1)):
                    fragment = " ".join(words[i:j])
                    for label in known_labels:
                        score = fuzz.ratio(fragment.lower(), label.lower()) / 100.0
                        if score >= threshold and (fragment.lower() != label.lower()):
                            # NOTE(review): find() returns the FIRST occurrence,
                            # which may not be this fragment's own position when
                            # the fragment text repeats earlier in the line.
                            start = line.lower().find(fragment.lower())
                            if start >= 0:
                                positions.append((start, start + len(fragment), label))

        # Remove duplicates and overlaps, sort by position: the earliest hit
        # at a given offset wins; later hits overlapping it are dropped.
        positions.sort(key=lambda x: x[0])
        filtered: list[tuple[int, int, str]] = []
        last_end = -1
        for start, end, label in positions:
            if start >= last_end:
                filtered.append((start, end, label))
                last_end = end

        found_labels = [la for _, _, la in filtered]

        if len(filtered) >= min_count and not _is_bilingual_pair(found_labels):
            # Split line at each label position
            parts: list[str] = []
            for idx, (start, end, label) in enumerate(filtered):
                # Value = text between this label's end and next label's start
                next_start = filtered[idx + 1][0] if idx + 1 < len(filtered) else len(line)
                value = line[end:next_start].strip().lstrip(": ")
                parts.append(f"{label}: {value}" if value else label)
            result.extend(parts)
        else:
            result.append(line)

    return result
221
+
222
+
223
+ # ---------------------------------------------------------------------------
224
+ # Sub-step (d) — Wrap rejoin
225
+ # ---------------------------------------------------------------------------
226
+
227
+
228
+ def _step_d_wrap_rejoin(lines: list[str]) -> list[str]:
229
+ """Rejoin lines that were wrapped mid-sentence by OCR."""
230
+ result: list[str] = []
231
+ i = 0
232
+ while i < len(lines):
233
+ line = lines[i]
234
+ stripped = line.strip()
235
+
236
+ if not stripped:
237
+ result.append(line)
238
+ i += 1
239
+ continue
240
+
241
+ # Line ends with colon: next line is value
242
+ if stripped.endswith(":") and i + 1 < len(lines):
243
+ next_line = lines[i + 1].strip()
244
+ if next_line and not next_line.endswith(":"):
245
+ result.append(f"{stripped} {next_line}")
246
+ i += 2
247
+ continue
248
+
249
+ # Continuation: next line starts with lowercase (wrapped)
250
+ if i + 1 < len(lines) and not stripped.endswith((".", "!", "?")):
251
+ next_stripped = lines[i + 1].strip()
252
+ if next_stripped and next_stripped[0].islower():
253
+ result.append(f"{stripped} {next_stripped}")
254
+ i += 2
255
+ continue
256
+
257
+ result.append(line)
258
+ i += 1
259
+
260
+ return result
261
+
262
+
263
+ # ---------------------------------------------------------------------------
264
+ # Sub-step (e) — EN-paren-rejoin
265
+ # ---------------------------------------------------------------------------
266
+
267
+ _EN_PAREN_LINE = re.compile(r"^\s*\(([A-Za-z][^)]*)\)\s*$")
268
+
269
+
270
+ def _step_e_en_paren_rejoin(lines: list[str]) -> list[str]:
271
+ """Merge EN-only paren lines as aliases on the label above."""
272
+ result: list[str] = []
273
+ for i, line in enumerate(lines):
274
+ m = _EN_PAREN_LINE.match(line)
275
+ if m and result:
276
+ # Append as alias to last line
277
+ result[-1] = f"{result[-1].rstrip()} / {m.group(1)}"
278
+ else:
279
+ result.append(line)
280
+ return result
281
+
282
+
283
+ # ---------------------------------------------------------------------------
284
+ # Sub-step (f) — Label-boundary splitter
285
+ # ---------------------------------------------------------------------------
286
+
287
+
288
+ def _step_f_label_boundary_split(
289
+ lines: list[str], known_labels: list[str], max_iters: int = 5
290
+ ) -> list[str]:
291
+ """Split lines where a label is glued directly after a value."""
292
+ if not known_labels:
293
+ return lines
294
+
295
+ for _ in range(max_iters):
296
+ changed = False
297
+ new_lines: list[str] = []
298
+ for line in lines:
299
+ split_done = False
300
+ for label in known_labels:
301
+ # Pattern: value immediately followed by label (no space separator)
302
+ escaped = re.escape(label)
303
+ pattern = re.compile(rf"(\S)({escaped})", re.IGNORECASE)
304
+ m = pattern.search(line)
305
+ if m and m.start() > 0:
306
+ before = line[: m.start() + 1].strip()
307
+ after = line[m.start() + 1 :].strip()
308
+ if before and after:
309
+ new_lines.append(before)
310
+ new_lines.append(after)
311
+ split_done = True
312
+ changed = True
313
+ break
314
+ if not split_done:
315
+ new_lines.append(line)
316
+ lines = new_lines
317
+ if not changed:
318
+ break
319
+
320
+ return lines
321
+
322
+
323
+ # ---------------------------------------------------------------------------
324
+ # Label index builder
325
+ # ---------------------------------------------------------------------------
326
+
327
+
328
def _build_label_index(
    sections: list[Section],
    known_labels: list[str],
    alias_map: dict[str, str],
    threshold: float = 0.8,
) -> dict[str, list[LabelHit]]:
    """Build label_index: lowercased label → list of LabelHit.

    For every line in every section, each known label is first searched for
    as an exact (case-insensitive) substring; failing that, a fuzzy match
    via ``fuzz.partial_ratio`` is accepted when its score reaches
    *threshold*.

    Notes:
        * On a fuzzy match the exact character span is unknown, so the hit
          claims the whole line (char_start=0, char_end=len(text)).
        * When a label has a bilingual alias, the *same* LabelHit object is
          also indexed under the alias key (shared reference, not a copy).
    """
    label_index: dict[str, list[LabelHit]] = {}

    for section in sections:
        for line in section.lines:
            for label in known_labels:
                label_lower = label.lower()
                text_lower = line.text.lower()
                start = text_lower.find(label_lower)
                if start < 0:
                    # Fuzzy search
                    score = fuzz.partial_ratio(label_lower, text_lower) / 100.0
                    if score < threshold:
                        continue
                    # Span unknown for fuzzy hits — claim the entire line.
                    start = 0
                    end = len(line.text)
                else:
                    end = start + len(label)
                    score = 1.0

                hit = LabelHit(
                    label=label,
                    aliases_matched=[label],
                    section_id=section.id,
                    line_index=line.index,
                    char_start=start,
                    char_end=end,
                    fuzzy_score=score,
                )
                label_index.setdefault(label_lower, []).append(hit)

                # Also index alias
                if label_lower in alias_map:
                    alias = alias_map[label_lower]
                    label_index.setdefault(alias.lower(), []).append(hit)

    return label_index
371
+
372
+
373
+ # ---------------------------------------------------------------------------
374
+ # Main reconstruct stage
375
+ # ---------------------------------------------------------------------------
376
+
377
+
378
def reconstruct_stage(ctx: PipelineContext) -> None:
    """Pipeline stage 4: LineReconstructor.

    Rebuilds document structure from the normalized OCR text via six
    optional sub-steps (profile ``reconstruct.enabled_steps`` selects which
    run; all six by default):

        (a) split text into sections,
        (b) build a bilingual VN↔EN label alias map,
        (c) split lines containing multiple labels,
        (d) rejoin lines wrapped mid-sentence,
        (e) fold EN-only "(…)" lines into the label above,
        (f) split values glued directly onto a following label.

    Results are written to ``ctx.sections`` and ``ctx.label_index``.  Each
    sub-step is best-effort: a failure is logged via ``logger.exception``
    and that step is skipped, so one bad heuristic cannot abort the stage.
    """
    profile = ctx.profile
    rcfg = profile.reconstruct if profile else None
    section_defs = profile.sections if profile else []
    bilingual_pairs = rcfg.bilingual_pairs if rcfg else []
    fuzzy_threshold = rcfg.fuzzy_threshold if rcfg else 0.8
    # Without a profile config, every sub-step is enabled.
    enabled = set(rcfg.enabled_steps if rcfg else ["a", "b", "c", "d", "e", "f"])

    text = ctx.normalized_text or ctx.raw_text

    # (a) Section split
    raw_sections = (
        _step_a_section_split(text, section_defs)
        if "a" in enabled
        else [("_default", None, text.split("\n"))]
    )

    sections: list[Section] = []
    all_aliases = _all_aliases(profile)
    # Accumulates across sections: bilingual aliases found in earlier
    # sections also apply to later ones.
    alias_map: dict[str, str] = {}

    for sec_id, sec_title, sec_lines in raw_sections:
        lines = sec_lines

        # (b) Bilingual map
        if "b" in enabled:
            try:
                alias_map.update(_step_b_build_bilingual_map(lines, bilingual_pairs))
            except Exception:
                logger.exception("Reconstructor step-b failed (bilingual map); skipping")

        # Build full label set: profile aliases + bilingual aliases
        known_labels = list(set(all_aliases + list(alias_map.keys()) + list(alias_map.values())))
        known_labels = [la for la in known_labels if la.strip()]

        # (c) Multi-label-line split
        if "c" in enabled:
            try:
                min_count = rcfg.multi_label_min_count if rcfg else 2
                lines = _step_c_split_multi_label_lines(
                    lines, known_labels, fuzzy_threshold, min_count, alias_map=alias_map
                )
            except Exception:
                logger.exception("Reconstructor step-c failed (label split); skipping")

        # (d) Wrap rejoin
        if "d" in enabled:
            try:
                lines = _step_d_wrap_rejoin(lines)
            except Exception:
                logger.exception("Reconstructor step-d failed (wrap rejoin); skipping")

        # (e) EN-paren-rejoin
        if "e" in enabled:
            try:
                lines = _step_e_en_paren_rejoin(lines)
            except Exception:
                logger.exception("Reconstructor step-e failed (paren rejoin); skipping")

        # (f) Label-boundary split
        if "f" in enabled:
            try:
                lines = _step_f_label_boundary_split(lines, known_labels)
            except Exception:
                logger.exception("Reconstructor step-f failed (boundary split); skipping")

        # Line.index counts positions before blank-line removal, so indices
        # within a section may have gaps.
        section = Section(
            id=sec_id,
            title=sec_title,
            lines=[
                Line(index=i, text=line)
                for i, line in enumerate(lines)
                if line.strip()  # skip blank lines
            ],
        )
        sections.append(section)

    ctx.sections = sections

    # Build label_index using the full known_labels (profile aliases + auto-detected bilingual
    # labels), so stop_labels and extractors can reference any label found in the text.
    full_known_labels = list(set(all_aliases + list(alias_map.keys()) + list(alias_map.values())))
    full_known_labels = [la for la in full_known_labels if la.strip()]
    ctx.label_index = _build_label_index(sections, full_known_labels, alias_map, fuzzy_threshold)
    total_lines = sum(len(s.lines) for s in sections)
    logger.debug(
        "Reconstructor: %d section(s), %d line(s), %d label entries",
        len(sections),
        total_lines,
        len(ctx.label_index),
    )