ocr-postprocess 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. ocr_postprocess/__init__.py +33 -0
  2. ocr_postprocess/classifier.py +63 -0
  3. ocr_postprocess/cli.py +130 -0
  4. ocr_postprocess/engine/__init__.py +0 -0
  5. ocr_postprocess/engine/denoiser.py +134 -0
  6. ocr_postprocess/engine/extractor_stage.py +107 -0
  7. ocr_postprocess/engine/normalizer.py +128 -0
  8. ocr_postprocess/engine/reconciler.py +170 -0
  9. ocr_postprocess/engine/reconstructor.py +469 -0
  10. ocr_postprocess/engine/transform_stage.py +89 -0
  11. ocr_postprocess/exceptions.py +30 -0
  12. ocr_postprocess/extractors/__init__.py +0 -0
  13. ocr_postprocess/extractors/base.py +103 -0
  14. ocr_postprocess/extractors/helpers.py +63 -0
  15. ocr_postprocess/extractors/label_anchor/__init__.py +0 -0
  16. ocr_postprocess/extractors/label_anchor/line_after_label.py +53 -0
  17. ocr_postprocess/extractors/label_anchor/regex_after_label.py +75 -0
  18. ocr_postprocess/extractors/label_anchor/text_until_next_label.py +79 -0
  19. ocr_postprocess/extractors/label_anchor/value_between_labels.py +65 -0
  20. ocr_postprocess/extractors/label_anchor/value_in_same_line.py +60 -0
  21. ocr_postprocess/extractors/pattern/__init__.py +0 -0
  22. ocr_postprocess/extractors/pattern/cccd.py +120 -0
  23. ocr_postprocess/extractors/pattern/cmnd.py +38 -0
  24. ocr_postprocess/extractors/pattern/currency_vnd.py +48 -0
  25. ocr_postprocess/extractors/pattern/date.py +89 -0
  26. ocr_postprocess/extractors/pattern/email.py +38 -0
  27. ocr_postprocess/extractors/pattern/gender_vn.py +48 -0
  28. ocr_postprocess/extractors/pattern/phone_vn.py +83 -0
  29. ocr_postprocess/extractors/pattern/plate_vn.py +39 -0
  30. ocr_postprocess/extractors/pattern/tax_code.py +53 -0
  31. ocr_postprocess/extractors/registry.py +45 -0
  32. ocr_postprocess/extractors/structured/__init__.py +0 -0
  33. ocr_postprocess/extractors/structured/mrz_cccd.py +111 -0
  34. ocr_postprocess/extractors/universal.py +39 -0
  35. ocr_postprocess/models.py +131 -0
  36. ocr_postprocess/pipeline.py +179 -0
  37. ocr_postprocess/profiles/__init__.py +0 -0
  38. ocr_postprocess/profiles/_generic.yml +13 -0
  39. ocr_postprocess/profiles/cccd_2024.yml +113 -0
  40. ocr_postprocess/profiles/dang_kiem.yml +105 -0
  41. ocr_postprocess/profiles/loader.py +63 -0
  42. ocr_postprocess/profiles/matcher.py +71 -0
  43. ocr_postprocess/profiles/schema.py +197 -0
  44. ocr_postprocess/py.typed +0 -0
  45. ocr_postprocess/renderer/__init__.py +0 -0
  46. ocr_postprocess/renderer/json_renderer.py +59 -0
  47. ocr_postprocess/renderer/llm.py +41 -0
  48. ocr_postprocess/renderer/markdown.py +172 -0
  49. ocr_postprocess/scorer.py +78 -0
  50. ocr_postprocess/transformer.py +304 -0
  51. ocr_postprocess-0.1.0.dist-info/METADATA +189 -0
  52. ocr_postprocess-0.1.0.dist-info/RECORD +55 -0
  53. ocr_postprocess-0.1.0.dist-info/WHEEL +5 -0
  54. ocr_postprocess-0.1.0.dist-info/entry_points.txt +2 -0
  55. ocr_postprocess-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,33 @@
1
+ """ocr_postprocess — OCR post-processing pipeline."""
2
+
3
+ __version__ = "0.1.0"
4
+
5
+ from ocr_postprocess.exceptions import (
6
+ CyclicComputeError,
7
+ ExtractorNotFoundError,
8
+ OcrPostprocessError,
9
+ ProfileNotFoundError,
10
+ ProfileValidationError,
11
+ TransformError,
12
+ )
13
+ from ocr_postprocess.models import PipelineContext, ProcessedDocument
14
+ from ocr_postprocess.pipeline import Pipeline
15
+ from ocr_postprocess.renderer.llm import render_llm_markdown
16
+
17
+ __all__ = [
18
+ # Core
19
+ "Pipeline",
20
+ "ProcessedDocument",
21
+ "PipelineContext",
22
+ # Renderers
23
+ "render_llm_markdown",
24
+ # Exceptions — import these to catch errors without knowing internal paths
25
+ "OcrPostprocessError",
26
+ "ProfileNotFoundError",
27
+ "ProfileValidationError",
28
+ "ExtractorNotFoundError",
29
+ "TransformError",
30
+ "CyclicComputeError",
31
+ # Version
32
+ "__version__",
33
+ ]
@@ -0,0 +1,63 @@
1
+ """Stage 2 — Classifier: select best matching DocumentProfile."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+
7
+ from ocr_postprocess.models import PipelineContext
8
+ from ocr_postprocess.profiles.matcher import evaluate
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
def classify_stage(ctx: PipelineContext) -> None:
    """Pipeline stage 2: classify document and load matching profile.

    Scores every non-fallback profile's ``classify`` rules against the
    normalized text, stores the winning profile on ``ctx.profile`` and its
    score on ``ctx.classification_score``. Falls back to the ``_generic``
    profile when the best score is below 0.5 or no profile scored at all.
    """
    # Profiles are stashed on the context by the pipeline; a missing key
    # means nothing was loaded.
    profiles: dict = ctx.__dict__.get("_profiles", {})

    if not profiles:
        logger.warning("No profiles loaded; using generic fallback")
        ctx.classification_score = 0.0
        return

    text = ctx.normalized_text or ctx.raw_text
    best_id: str | None = None
    best_score = 0.0
    second_score = 0.0

    scores: list[tuple[float, str]] = []
    for pid, profile in profiles.items():
        if pid.startswith("_"):
            continue  # skip generic fallback in main scoring
        try:
            score = evaluate(profile.classify, text)
        except Exception:
            # A broken classify rule must not abort the whole pipeline.
            logger.exception("Error evaluating classify for profile '%s'", pid)
            score = 0.0
        scores.append((score, pid))

    scores.sort(reverse=True)

    if scores:
        best_score, best_id = scores[0]
        second_score = scores[1][0] if len(scores) > 1 else 0.0

        if best_score < 0.5:
            # Fall back to _generic
            best_id = "_generic"
            best_score = 0.0
            logger.info("Classification score too low; using _generic")
        elif len(scores) > 1 and best_score - second_score < 0.1:
            # Winner selected anyway; the warning flags a near-tie for review.
            logger.warning(
                "Ambiguous classification: '%s'=%.2f vs '%s'=%.2f",
                best_id,
                best_score,
                scores[1][1],
                second_score,
            )
    else:
        best_id = "_generic"

    profile_obj = profiles.get(best_id)
    if profile_obj is None:
        # Happens when the "_generic" fallback profile was never loaded;
        # downstream stages treat a None profile as "skip".
        logger.warning("Selected profile '%s' is not loaded; ctx.profile set to None", best_id)
    ctx.profile = profile_obj
    ctx.classification_score = best_score
    logger.info("Classified as '%s' (score=%.3f)", best_id, best_score)
ocr_postprocess/cli.py ADDED
@@ -0,0 +1,130 @@
1
+ """Typer CLI for ocr_postprocess."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ import sys
7
+ from pathlib import Path
8
+ from typing import Optional
9
+
10
+ import typer
11
+
12
# Typer application object: entry point for the ``ocrpp`` console script.
app = typer.Typer(name="ocrpp", help="OCR post-processing pipeline CLI.", add_completion=False)

# Accepted ``--log-level`` names mapped to stdlib logging levels; unknown
# names fall back to INFO in _setup_logging.
_LOG_LEVELS = {
    "debug": logging.DEBUG,
    "info": logging.INFO,
    "warning": logging.WARNING,
    "error": logging.ERROR,
}
20
+
21
+
22
def _setup_logging(level: str) -> None:
    """Configure the root logger from a level name (unknown names → INFO)."""
    resolved = _LOG_LEVELS.get(level.lower(), logging.INFO)
    logging.basicConfig(
        format="%(levelname)s %(name)s: %(message)s",
        level=resolved,
    )
28
+
29
+
30
@app.command()
def classify(
    input_file: Optional[Path] = typer.Argument(
        None, help="Path to raw text file. Reads stdin if omitted."
    ),
    profiles_dir: Path = typer.Option(
        Path("profiles"), "--profiles", "-p", help="Profiles directory."
    ),
    log_level: str = typer.Option("info", "--log-level", "-l"),
) -> None:
    """Classify a document and print the best matching profile ID."""
    _setup_logging(log_level)
    # Imported lazily so `--help` stays fast.
    from ocr_postprocess.pipeline import Pipeline

    text = _read_input(input_file)
    best_id, best_score = Pipeline.from_default(profiles_dir=str(profiles_dir)).classify(text)
    typer.echo(f"{best_id}\t{best_score:.4f}")
48
+
49
+
50
@app.command()
def process(
    input_file: Optional[Path] = typer.Argument(
        None, help="Path to raw text file. Reads stdin if omitted."
    ),
    profiles_dir: Path = typer.Option(
        Path("profiles"), "--profiles", "-p", help="Profiles directory."
    ),
    output_format: str = typer.Option(
        "json", "--format", "-f", help="Output format: json, markdown, or llm."
    ),
    debug: bool = typer.Option(False, "--debug", "-d", help="Include debug trace."),
    log_level: str = typer.Option("info", "--log-level", "-l"),
) -> None:
    """Process a document and print extracted fields."""
    _setup_logging(log_level)
    # Imported lazily so `--help` stays fast.
    from ocr_postprocess.pipeline import Pipeline
    from ocr_postprocess.renderer.json_renderer import to_json
    from ocr_postprocess.renderer.llm import render_llm_markdown
    from ocr_postprocess.renderer.markdown import render_markdown

    text = _read_input(input_file)
    doc = Pipeline.from_default(profiles_dir=str(profiles_dir)).process(text, debug=debug)

    # Any unrecognized format falls back to JSON, same as before.
    renderers = {"markdown": render_markdown, "llm": render_llm_markdown}
    render = renderers.get(output_format, to_json)
    typer.echo(render(doc))
81
+
82
+
83
+ @app.command("validate-profile")
84
+ def validate_profile(
85
+ profile_file: Path = typer.Argument(..., help="Path to YAML profile file."),
86
+ log_level: str = typer.Option("info", "--log-level", "-l"),
87
+ ) -> None:
88
+ """Validate a YAML profile file and print errors if any."""
89
+ _setup_logging(log_level)
90
+ from ocr_postprocess.profiles.loader import load_profile
91
+
92
+ try:
93
+ profile = load_profile(profile_file)
94
+ typer.echo(f"OK: {profile.id} (v{profile.version})")
95
+ except Exception as exc:
96
+ typer.echo(f"ERROR: {exc}", err=True)
97
+ raise typer.Exit(code=1)
98
+
99
+
100
+ @app.command("dump-canonical")
101
+ def dump_canonical(
102
+ input_file: Optional[Path] = typer.Argument(None, help="Path to raw text file."),
103
+ profiles_dir: Path = typer.Option(
104
+ Path("profiles"), "--profiles", "-p", help="Profiles directory."
105
+ ),
106
+ log_level: str = typer.Option("info", "--log-level", "-l"),
107
+ ) -> None:
108
+ """Process document and dump canonical JSON to stdout."""
109
+ _setup_logging(log_level)
110
+ from ocr_postprocess.pipeline import Pipeline
111
+ from ocr_postprocess.renderer.json_renderer import to_json
112
+
113
+ raw = _read_input(input_file)
114
+ pipeline = Pipeline.from_default(profiles_dir=str(profiles_dir))
115
+ doc = pipeline.process(raw)
116
+ typer.echo(to_json(doc))
117
+
118
+
119
def _read_input(path: Optional[Path]) -> str:
    """Read text from *path*, or from stdin when *path* is None.

    Exits with code 1 (and a clean error message) when the file cannot be
    read or when no input is available at all.
    """
    if path is not None:
        try:
            return path.read_text(encoding="utf-8")
        except OSError as exc:
            # A missing/unreadable file previously surfaced as a raw
            # traceback; report it as a normal CLI error instead.
            typer.echo(f"Error: cannot read {path}: {exc}", err=True)
            raise typer.Exit(code=1) from exc
    if not sys.stdin.isatty():
        return sys.stdin.read()
    typer.echo("Error: no input provided", err=True)
    raise typer.Exit(code=1)
127
+
128
+
129
+ if __name__ == "__main__":
130
+ app()
File added without content changes (empty file)
@@ -0,0 +1,134 @@
1
+ """Stage 3 — Denoiser: remove boilerplate lines and patterns."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+
7
+ import regex as re
8
+
9
+ from ocr_postprocess.models import PipelineContext
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+ # Built-in rules (always applied regardless of profile config)
14
+ _BUILTIN_DROP_PATTERNS = [
15
+ re.compile(
16
+ r"^[\W_]{3,}$"
17
+ ), # lines consisting only of non-alphanumeric chars (e.g. "---", "***")
18
+ re.compile(r"^\s{0,2}\S\s{0,2}$"), # single character with optional surrounding spaces
19
+ ]
20
+
21
+
22
def _is_builtin_noise(line: str) -> bool:
    """Return True if *line* matches a built-in noise rule.

    A line is noise when, after stripping, it is shorter than 3 chars,
    consists only of non-alphanumeric characters, or is a lone character.
    """
    candidate = line.strip()
    if len(candidate) < 3:
        # Very short lines carry no usable content.
        return True
    # Inlined equivalents of _BUILTIN_DROP_PATTERNS.
    punct_only = re.fullmatch(r"^[\W_]{3,}$", candidate)
    lone_char = re.fullmatch(r"^\s{0,2}\S\s{0,2}$", candidate)
    return bool(punct_only or lone_char)
31
+
32
+
33
def denoise(
    text: str,
    drop_line_patterns: list[str] | None = None,
    drop_inline_patterns: list[str] | None = None,
    mask_patterns: list[dict[str, str]] | None = None,
    collapse_repeats: bool = False,
    protected_substrings: set[str] | None = None,
) -> str:
    """Remove noise from text, preserving line count (empty string for dropped lines)."""
    drop_res = [re.compile(p) for p in drop_line_patterns or []]
    inline_res = [re.compile(p) for p in drop_inline_patterns or []]
    mask_res = [
        (re.compile(m["pattern"]), m.get("replacement", "")) for m in mask_patterns or []
    ]

    def is_protected(raw: str) -> bool:
        # A line containing any protected label must never be dropped.
        return bool(protected_substrings) and any(s in raw for s in protected_substrings)

    out: list[str] = []
    previous: str | None = None

    for raw in text.split("\n"):
        # 1. Consecutive duplicate non-blank lines collapse to "".
        #    `previous` is deliberately left untouched here so a run of
        #    three identical lines keeps only the first.
        if collapse_repeats and raw.strip() and raw == previous:
            out.append("")
            continue

        stripped = raw.strip()

        # 2. Built-in noise, unless the line holds a protected label.
        if _is_builtin_noise(stripped) and not is_protected(raw):
            out.append("")
            previous = raw
            continue

        # 3. Profile drop_lines rules; a protected label vetoes the drop.
        if any(rule.search(stripped) for rule in drop_res) and not is_protected(raw):
            out.append("")
            previous = raw
            continue

        # 4. Inline drop patterns, then 5. mask patterns.
        cleaned = raw
        for rule in inline_res:
            cleaned = rule.sub("", cleaned)
        for rule, replacement in mask_res:
            cleaned = rule.sub(replacement, cleaned)

        out.append(cleaned)
        previous = cleaned

    return "\n".join(out)
93
+
94
+
95
def denoise_stage(ctx: PipelineContext) -> None:
    """Pipeline stage 3: denoise text.

    Reads ``ctx.normalized_text`` (falling back to ``ctx.raw_text``) and
    rewrites ``ctx.normalized_text`` via :func:`denoise`, driven by the
    active profile's ``noise`` configuration when a profile is loaded.
    """
    profile = ctx.profile
    noise = profile.noise if profile else None

    # Field aliases act as protected labels: lines containing them are never
    # dropped, so label-anchored extractors keep their anchors.
    protected: set[str] = set()
    if profile:
        for field in profile.fields:
            for alias in field.aliases:
                protected.add(alias)

    # Build drop_line_patterns from DropLinesRule
    drop_line_patterns: list[str] = []
    if noise and noise.drop_lines:
        dl = noise.drop_lines
        drop_line_patterns.extend(dl.regex)
        # contains_any patterns → convert to regex (literal match via escape)
        for phrase in dl.contains_any:
            drop_line_patterns.append(re.escape(phrase))

    text = ctx.normalized_text or ctx.raw_text
    before_lines = [ln for ln in text.splitlines() if ln.strip()]
    # NOTE(review): denoise() indexes each mask entry as m["pattern"] — this
    # assumes noise.mask_patterns elements are mappings, not model objects;
    # confirm against the profile schema.
    ctx.normalized_text = denoise(
        text,
        drop_line_patterns=drop_line_patterns,
        drop_inline_patterns=list(noise.drop_patterns) if noise else [],
        mask_patterns=list(noise.mask_patterns) if noise else [],
        collapse_repeats=noise.collapse_repeats if noise else False,
        protected_substrings=protected,
    )
    after_lines = [ln for ln in ctx.normalized_text.splitlines() if ln.strip()]
    dropped = len(before_lines) - len(after_lines)
    logger.debug(
        "Denoiser: %d non-empty lines → %d (dropped %d, %d drop-patterns, %d mask-patterns)",
        len(before_lines),
        len(after_lines),
        dropped,
        len(drop_line_patterns),
        len(noise.mask_patterns) if noise else 0,
    )
@@ -0,0 +1,107 @@
1
+ """Stage 5 — Extractor dispatcher."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+
7
+ from ocr_postprocess.models import Candidate, PipelineContext
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
def extract_stage(ctx: PipelineContext) -> None:
    """Pipeline stage 5: run extractors for all profile fields.

    Collects :class:`Candidate` objects — first from the profile's structured
    extractors, then from each field's configured extractor — and stores the
    result on ``ctx.candidates``. Individual extractor failures are logged
    and counted but never abort the stage.
    """
    from ocr_postprocess.extractors import registry

    profile = ctx.profile
    if not profile:
        logger.warning("No profile loaded; skipping extraction")
        return

    # Auto-import all registered extractors so the registry is populated.
    _ensure_extractors_imported()

    candidates: list[Candidate] = []

    # Structured extractors first (they run document-wide, with field=None).
    failed_count = 0
    for struct_ref in profile.structured_extractors:
        try:
            extractor = registry.get_instance(struct_ref.name)
            cands = extractor.extract(ctx, field=None)
            candidates.extend(cands)
        except Exception:
            failed_count += 1
            logger.exception("Structured extractor '%s' failed", struct_ref.name)

    # Per-field extractors
    for field in profile.fields:
        if field.constant is not None:
            # Constant — skip extraction, reconciler will inject
            continue

        if field.compute is not None:
            # Compute — skip extraction, transform stage handles
            continue

        if not field.extractor:
            continue

        try:
            extractor = registry.get_instance(field.extractor)
            cands = extractor.extract(ctx, field=field)
            # Stamp every candidate with this field's key before collecting.
            for c in cands:
                c.key = field.key
            candidates.extend(cands)
        except Exception:
            failed_count += 1
            logger.exception("Extractor '%s' for field '%s' failed", field.extractor, field.key)

        # Default fallback: when nothing was extracted for this key and the
        # field declares a default, inject a low-confidence candidate that
        # is flagged for LLM review.
        if not any(c.key == field.key for c in candidates) and field.default is not None:
            candidates.append(
                Candidate(
                    key=field.key,
                    value=field.default,
                    raw=str(field.default),
                    extractor="default",
                    sources=["default"],
                    confidence=0.3,
                    needs_llm_review=True,
                )
            )

    ctx.candidates = candidates
    if failed_count:
        logger.warning(
            "Extractor stage: %d candidate(s) from %d field(s); %d extractor(s) failed",
            len(candidates),
            len(profile.fields),
            failed_count,
        )
    else:
        logger.debug(
            "Extractor stage: %d candidate(s) from %d field(s)",
            len(candidates),
            len(profile.fields),
        )
+ )
88
+
89
+
90
def _ensure_extractors_imported() -> None:
    """Import all extractor submodules so @register decorators run."""
    import importlib
    import pkgutil

    import ocr_postprocess.extractors.label_anchor as la_pkg
    import ocr_postprocess.extractors.pattern as pat_pkg
    import ocr_postprocess.extractors.structured as st_pkg

    for package in (pat_pkg, la_pkg, st_pkg):
        for _finder, module_name, _ispkg in pkgutil.iter_modules(package.__path__):
            qualified = f"{package.__name__}.{module_name}"
            try:
                importlib.import_module(qualified)
            except Exception:
                # Best-effort: a broken extractor module must not kill the stage.
                logger.debug("Could not import %s", qualified)
@@ -0,0 +1,128 @@
1
+ """Stage 1 — Normalizer: Unicode, whitespace, OCR fixes, dedup repeats."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ import unicodedata
7
+
8
+ import regex as re
9
+
10
+ from ocr_postprocess.models import PipelineContext
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
# Maps common OCR letter-for-digit misreads: O→0, l→1, S→5.
# Used by _fix_ocr_in_digit_tokens on digit-like tokens only.
_OCR_DIGIT_MAP = str.maketrans("OlS", "015")
16
+
17
+
18
def _fix_ocr_in_digit_tokens(text: str) -> str:
    """Replace common OCR letter mistakes inside all-digit-like tokens.

    Tokens of length >= 2 made of digits plus the confusable letters O/l/S
    are translated (O→0, l→1, S→5), but only when the translated form is
    fully numeric while the original was not — pure-digit tokens and normal
    words are left untouched.
    """
    # Inlined equivalent of the module-level _OCR_DIGIT_MAP table.
    table = str.maketrans("OlS", "015")

    def repair(match: re.Match) -> str:
        token = match.group(0)
        translated = token.translate(table)
        return translated if translated.isdigit() and not token.isdigit() else token

    return re.sub(r"\b[0-9OlS]{2,}\b", repair, text)
30
+
31
+
32
+ def _dedup_adjacent_repeats(text: str, min_len: int = 5, max_len: int = 30) -> str:
33
+ """Remove directly adjacent repeated substrings on the same line."""
34
+ lines = text.split("\n")
35
+ result = []
36
+ for line in lines:
37
+ if len(line) < min_len * 2:
38
+ result.append(line)
39
+ continue
40
+ # Try windows from largest to smallest
41
+ changed = True
42
+ while changed:
43
+ changed = False
44
+ for length in range(min(max_len, len(line) // 2), min_len - 1, -1):
45
+ i = 0
46
+ while i + length * 2 <= len(line):
47
+ chunk = line[i : i + length]
48
+ if line[i + length : i + length * 2] == chunk:
49
+ # Remove the first occurrence
50
+ line = line[:i] + line[i + length :]
51
+ changed = True
52
+ break
53
+ i += 1
54
+ if changed:
55
+ break
56
+ result.append(line)
57
+ return "\n".join(result)
58
+
59
+
60
def normalize(
    text: str,
    ocr_fixes: bool = True,
    dedup_repeats: bool = True,
    preserve_multi_space: bool = True,
) -> str:
    """Apply full normalization pipeline to raw OCR text.

    Steps: Unicode NFC → line-ending normalization → control-char stripping →
    per-line whitespace cleanup → smart-quote replacement → optional OCR
    digit fixes → optional adjacent-repeat dedup.

    Args:
        text: Raw OCR text.
        ocr_fixes: Fix common letter-for-digit OCR misreads in numeric tokens.
        dedup_repeats: Remove adjacent duplicated substrings on each line.
        preserve_multi_space: Keep runs of spaces (useful for column layouts).
    """
    # 1. Unicode NFC
    text = unicodedata.normalize("NFC", text)

    # 2. Normalize line endings FIRST. The control-char strip below removes
    #    bare "\r", which previously destroyed old-Mac ("\r") line breaks
    #    before they could be converted to "\n" (e.g. "a\rb" became "ab").
    text = text.replace("\r\n", "\n").replace("\r", "\n")

    # 3. Strip control chars (keep \n \t)
    text = "".join(
        ch for ch in text if ch in ("\n", "\t") or (ord(ch) >= 0x20 and ord(ch) != 0x7F)
    )
    text = text.replace("\t", " ")

    lines = text.split("\n")
    normalized_lines = []
    for line in lines:
        # Trim trailing space
        line = line.rstrip()
        # Collapse multiple spaces (unless preserve_multi_space)
        if not preserve_multi_space:
            line = re.sub(r" {2,}", " ", line)
        normalized_lines.append(line)
    text = "\n".join(normalized_lines)

    # 4. Smart quotes → ASCII equivalents
    text = (
        text.replace("\u2018", "'")
        .replace("\u2019", "'")
        .replace("\u201c", '"')
        .replace("\u201d", '"')
    )

    # 5. OCR digit fixes
    if ocr_fixes:
        text = _fix_ocr_in_digit_tokens(text)

    # 6. Dedup adjacent repeats
    if dedup_repeats:
        text = _dedup_adjacent_repeats(text)

    return text
105
+
106
+
107
def normalize_stage(ctx: PipelineContext) -> None:
    """Pipeline stage 1: normalize raw text into ``ctx.normalized_text``."""
    profile = ctx.profile
    # NOTE(review): profile.normalize is consumed as a mapping via .get()
    # below — assumes it is a dict, not a model object; confirm against the
    # profile schema.
    cfg = {}
    if profile and profile.normalize:
        cfg = profile.normalize

    ctx.normalized_text = normalize(
        ctx.raw_text,
        ocr_fixes=cfg.get("ocr_fixes", True),
        dedup_repeats=cfg.get("dedup_repeats", True),
        preserve_multi_space=cfg.get("preserve_multi_space", True),
    )
    # Line counts for the debug log: newline count + 1.
    in_lines = ctx.raw_text.count("\n") + 1
    out_lines = ctx.normalized_text.count("\n") + 1
    logger.debug(
        "Normalizer: %d → %d chars, %d → %d lines",
        len(ctx.raw_text),
        len(ctx.normalized_text),
        in_lines,
        out_lines,
    )