ocr-postprocess 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ocr_postprocess/__init__.py +33 -0
- ocr_postprocess/classifier.py +63 -0
- ocr_postprocess/cli.py +130 -0
- ocr_postprocess/engine/__init__.py +0 -0
- ocr_postprocess/engine/denoiser.py +134 -0
- ocr_postprocess/engine/extractor_stage.py +107 -0
- ocr_postprocess/engine/normalizer.py +128 -0
- ocr_postprocess/engine/reconciler.py +170 -0
- ocr_postprocess/engine/reconstructor.py +469 -0
- ocr_postprocess/engine/transform_stage.py +89 -0
- ocr_postprocess/exceptions.py +30 -0
- ocr_postprocess/extractors/__init__.py +0 -0
- ocr_postprocess/extractors/base.py +103 -0
- ocr_postprocess/extractors/helpers.py +63 -0
- ocr_postprocess/extractors/label_anchor/__init__.py +0 -0
- ocr_postprocess/extractors/label_anchor/line_after_label.py +53 -0
- ocr_postprocess/extractors/label_anchor/regex_after_label.py +75 -0
- ocr_postprocess/extractors/label_anchor/text_until_next_label.py +79 -0
- ocr_postprocess/extractors/label_anchor/value_between_labels.py +65 -0
- ocr_postprocess/extractors/label_anchor/value_in_same_line.py +60 -0
- ocr_postprocess/extractors/pattern/__init__.py +0 -0
- ocr_postprocess/extractors/pattern/cccd.py +120 -0
- ocr_postprocess/extractors/pattern/cmnd.py +38 -0
- ocr_postprocess/extractors/pattern/currency_vnd.py +48 -0
- ocr_postprocess/extractors/pattern/date.py +89 -0
- ocr_postprocess/extractors/pattern/email.py +38 -0
- ocr_postprocess/extractors/pattern/gender_vn.py +48 -0
- ocr_postprocess/extractors/pattern/phone_vn.py +83 -0
- ocr_postprocess/extractors/pattern/plate_vn.py +39 -0
- ocr_postprocess/extractors/pattern/tax_code.py +53 -0
- ocr_postprocess/extractors/registry.py +45 -0
- ocr_postprocess/extractors/structured/__init__.py +0 -0
- ocr_postprocess/extractors/structured/mrz_cccd.py +111 -0
- ocr_postprocess/extractors/universal.py +39 -0
- ocr_postprocess/models.py +131 -0
- ocr_postprocess/pipeline.py +179 -0
- ocr_postprocess/profiles/__init__.py +0 -0
- ocr_postprocess/profiles/_generic.yml +13 -0
- ocr_postprocess/profiles/cccd_2024.yml +113 -0
- ocr_postprocess/profiles/dang_kiem.yml +105 -0
- ocr_postprocess/profiles/loader.py +63 -0
- ocr_postprocess/profiles/matcher.py +71 -0
- ocr_postprocess/profiles/schema.py +197 -0
- ocr_postprocess/py.typed +0 -0
- ocr_postprocess/renderer/__init__.py +0 -0
- ocr_postprocess/renderer/json_renderer.py +59 -0
- ocr_postprocess/renderer/llm.py +41 -0
- ocr_postprocess/renderer/markdown.py +172 -0
- ocr_postprocess/scorer.py +78 -0
- ocr_postprocess/transformer.py +304 -0
- ocr_postprocess-0.1.0.dist-info/METADATA +189 -0
- ocr_postprocess-0.1.0.dist-info/RECORD +55 -0
- ocr_postprocess-0.1.0.dist-info/WHEEL +5 -0
- ocr_postprocess-0.1.0.dist-info/entry_points.txt +2 -0
- ocr_postprocess-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""ocr_postprocess — OCR post-processing pipeline."""
|
|
2
|
+
|
|
3
|
+
__version__ = "0.1.0"
|
|
4
|
+
|
|
5
|
+
from ocr_postprocess.exceptions import (
|
|
6
|
+
CyclicComputeError,
|
|
7
|
+
ExtractorNotFoundError,
|
|
8
|
+
OcrPostprocessError,
|
|
9
|
+
ProfileNotFoundError,
|
|
10
|
+
ProfileValidationError,
|
|
11
|
+
TransformError,
|
|
12
|
+
)
|
|
13
|
+
from ocr_postprocess.models import PipelineContext, ProcessedDocument
|
|
14
|
+
from ocr_postprocess.pipeline import Pipeline
|
|
15
|
+
from ocr_postprocess.renderer.llm import render_llm_markdown
|
|
16
|
+
|
|
17
|
+
__all__ = [
|
|
18
|
+
# Core
|
|
19
|
+
"Pipeline",
|
|
20
|
+
"ProcessedDocument",
|
|
21
|
+
"PipelineContext",
|
|
22
|
+
# Renderers
|
|
23
|
+
"render_llm_markdown",
|
|
24
|
+
# Exceptions — import these to catch errors without knowing internal paths
|
|
25
|
+
"OcrPostprocessError",
|
|
26
|
+
"ProfileNotFoundError",
|
|
27
|
+
"ProfileValidationError",
|
|
28
|
+
"ExtractorNotFoundError",
|
|
29
|
+
"TransformError",
|
|
30
|
+
"CyclicComputeError",
|
|
31
|
+
# Version
|
|
32
|
+
"__version__",
|
|
33
|
+
]
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""Stage 2 — Classifier: select best matching DocumentProfile."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
|
|
7
|
+
from ocr_postprocess.models import PipelineContext
|
|
8
|
+
from ocr_postprocess.profiles.matcher import evaluate
|
|
9
|
+
|
|
10
|
+
logger = logging.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def classify_stage(ctx: PipelineContext) -> None:
    """Pipeline stage 2: classify document and load matching profile."""
    registered: dict = ctx.__dict__.get("_profiles", {})

    if not registered:
        logger.warning("No profiles loaded; using generic fallback")
        ctx.classification_score = 0.0
        return

    body = ctx.normalized_text or ctx.raw_text

    # Score every competing profile; an evaluation failure counts as 0.0.
    ranked: list[tuple[float, str]] = []
    for pid, candidate in registered.items():
        if pid.startswith("_"):
            continue  # underscore-prefixed profiles (e.g. _generic) never compete
        try:
            value = evaluate(candidate.classify, body)
        except Exception:
            logger.exception("Error evaluating classify for profile '%s'", pid)
            value = 0.0
        ranked.append((value, pid))

    # Highest score first; ties broken by profile id (tuple ordering).
    ranked.sort(reverse=True)

    winner: str | None = None
    top = 0.0
    runner_up = 0.0

    if not ranked:
        winner = "_generic"
    else:
        top, winner = ranked[0]
        runner_up = ranked[1][0] if len(ranked) > 1 else 0.0

        if top < 0.5:
            # Fall back to _generic when no profile is convincing.
            winner = "_generic"
            top = 0.0
            logger.info("Classification score too low; using _generic")
        elif len(ranked) > 1 and top - runner_up < 0.1:
            logger.warning(
                "Ambiguous classification: '%s'=%.2f vs '%s'=%.2f",
                winner,
                top,
                ranked[1][1],
                runner_up,
            )

    ctx.profile = registered.get(winner)
    ctx.classification_score = top
    logger.info("Classified as '%s' (score=%.3f)", winner, top)
|
ocr_postprocess/cli.py
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
"""Typer CLI for ocr_postprocess."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
import sys
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Optional
|
|
9
|
+
|
|
10
|
+
import typer
|
|
11
|
+
|
|
12
|
+
app = typer.Typer(name="ocrpp", help="OCR post-processing pipeline CLI.", add_completion=False)
|
|
13
|
+
|
|
14
|
+
_LOG_LEVELS = {
    "debug": logging.DEBUG,
    "info": logging.INFO,
    "warning": logging.WARNING,
    "error": logging.ERROR,
}


def _setup_logging(level: str) -> None:
    """Configure the root logger; unrecognised level names fall back to INFO."""
    resolved = _LOG_LEVELS.get(level.lower(), logging.INFO)
    logging.basicConfig(
        format="%(levelname)s %(name)s: %(message)s",
        level=resolved,
    )
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@app.command()
def classify(
    input_file: Optional[Path] = typer.Argument(
        None, help="Path to raw text file. Reads stdin if omitted."
    ),
    profiles_dir: Path = typer.Option(
        Path("profiles"), "--profiles", "-p", help="Profiles directory."
    ),
    log_level: str = typer.Option("info", "--log-level", "-l"),
) -> None:
    """Classify a document and print the best matching profile ID."""
    _setup_logging(log_level)
    # Deferred import — presumably to keep bare CLI startup (e.g. --help) fast.
    from ocr_postprocess.pipeline import Pipeline

    text = _read_input(input_file)
    runner = Pipeline.from_default(profiles_dir=str(profiles_dir))
    profile_id, score = runner.classify(text)
    typer.echo(f"{profile_id}\t{score:.4f}")
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@app.command()
def process(
    input_file: Optional[Path] = typer.Argument(
        None, help="Path to raw text file. Reads stdin if omitted."
    ),
    profiles_dir: Path = typer.Option(
        Path("profiles"), "--profiles", "-p", help="Profiles directory."
    ),
    output_format: str = typer.Option(
        "json", "--format", "-f", help="Output format: json, markdown, or llm."
    ),
    debug: bool = typer.Option(False, "--debug", "-d", help="Include debug trace."),
    log_level: str = typer.Option("info", "--log-level", "-l"),
) -> None:
    """Process a document and print extracted fields."""
    _setup_logging(log_level)
    from ocr_postprocess.pipeline import Pipeline
    from ocr_postprocess.renderer.json_renderer import to_json
    from ocr_postprocess.renderer.llm import render_llm_markdown
    from ocr_postprocess.renderer.markdown import render_markdown

    text = _read_input(input_file)
    doc = Pipeline.from_default(profiles_dir=str(profiles_dir)).process(text, debug=debug)

    # Dispatch on the requested format; any unknown value falls back to JSON,
    # matching the original if/elif chain's else branch.
    renderers = {
        "markdown": render_markdown,
        "llm": render_llm_markdown,
    }
    render = renderers.get(output_format, to_json)
    typer.echo(render(doc))
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
@app.command("validate-profile")
def validate_profile(
    profile_file: Path = typer.Argument(..., help="Path to YAML profile file."),
    log_level: str = typer.Option("info", "--log-level", "-l"),
) -> None:
    """Validate a YAML profile file and print errors if any.

    Prints ``OK: <id> (v<version>)`` on success; on any load/validation
    failure prints ``ERROR: ...`` to stderr and exits with code 1.
    """
    _setup_logging(log_level)
    from ocr_postprocess.profiles.loader import load_profile

    try:
        profile = load_profile(profile_file)
    except Exception as exc:
        typer.echo(f"ERROR: {exc}", err=True)
        # Chain the original exception (PEP 3134 / ruff B904) so the real
        # cause is preserved instead of a bare "During handling..." context.
        raise typer.Exit(code=1) from exc
    else:
        typer.echo(f"OK: {profile.id} (v{profile.version})")
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
@app.command("dump-canonical")
def dump_canonical(
    input_file: Optional[Path] = typer.Argument(None, help="Path to raw text file."),
    profiles_dir: Path = typer.Option(
        Path("profiles"), "--profiles", "-p", help="Profiles directory."
    ),
    log_level: str = typer.Option("info", "--log-level", "-l"),
) -> None:
    """Process document and dump canonical JSON to stdout."""
    _setup_logging(log_level)
    from ocr_postprocess.pipeline import Pipeline
    from ocr_postprocess.renderer.json_renderer import to_json

    text = _read_input(input_file)
    result = Pipeline.from_default(profiles_dir=str(profiles_dir)).process(text)
    typer.echo(to_json(result))
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def _read_input(path: Optional[Path]) -> str:
    """Return text from *path*, or from piped stdin when *path* is None.

    Exits the CLI with code 1 when no file is given and stdin is a TTY
    (i.e. nothing was piped in).
    """
    if path is None:
        if sys.stdin.isatty():
            typer.echo("Error: no input provided", err=True)
            raise typer.Exit(code=1)
        return sys.stdin.read()
    return path.read_text(encoding="utf-8")
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
if __name__ == "__main__":
|
|
130
|
+
app()
|
|
File without changes
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
"""Stage 3 — Denoiser: remove boilerplate lines and patterns."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
|
|
7
|
+
import regex as re
|
|
8
|
+
|
|
9
|
+
from ocr_postprocess.models import PipelineContext
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
# Built-in rules (always applied regardless of profile config)
_BUILTIN_DROP_PATTERNS = [
    # Lines made up entirely of punctuation/underscores, e.g. "---" or "***".
    re.compile(r"^[\W_]{3,}$"),
    # A single visible character with at most two spaces on either side.
    re.compile(r"^\s{0,2}\S\s{0,2}$"),
]


def _is_builtin_noise(line: str) -> bool:
    """Return True if line matches built-in noise rules."""
    core = line.strip()
    # Anything shorter than three characters is treated as noise outright.
    if len(core) < 3:
        return True
    return any(rule.fullmatch(core) for rule in _BUILTIN_DROP_PATTERNS)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def denoise(
    text: str,
    drop_line_patterns: list[str] | None = None,
    drop_inline_patterns: list[str] | None = None,
    mask_patterns: list[dict[str, str]] | None = None,
    collapse_repeats: bool = False,
    protected_substrings: set[str] | None = None,
) -> str:
    """Remove noise from text, preserving line count (empty string for dropped lines).

    Args:
        text: Input text, processed line by line (split on newline).
        drop_line_patterns: Regexes; a line whose stripped form matches one
            (via ``search``) is blanked out.
        drop_inline_patterns: Regexes whose matches are deleted within a line.
        mask_patterns: Dicts with a ``"pattern"`` regex and an optional
            ``"replacement"`` (defaults to the empty string), substituted
            inside each surviving line.
        collapse_repeats: Blank out a non-blank line identical to the
            previous input line.
        protected_substrings: A line containing any of these substrings is
            exempt from the built-in and drop-line rules (but still passes
            through the inline and mask substitutions).

    Returns:
        Text with the same number of lines as the input; dropped lines
        become "" so downstream line offsets stay valid.
    """
    compiled_drop = [re.compile(p) for p in (drop_line_patterns or [])]
    compiled_inline = [re.compile(p) for p in (drop_inline_patterns or [])]
    compiled_mask = [
        (re.compile(m["pattern"]), m.get("replacement", "")) for m in (mask_patterns or [])
    ]

    lines = text.split("\n")
    prev_line: str | None = None
    result: list[str] = []

    for line in lines:
        # 1. Consecutive duplicate — prev_line already equals line here, so
        # it is deliberately not re-assigned before continuing.
        if collapse_repeats and line == prev_line and line.strip():
            result.append("")
            continue

        # 2. Built-in noise (skip if line contains protected label)
        stripped = line.strip()
        if _is_builtin_noise(stripped):
            if protected_substrings and any(p in line for p in protected_substrings):
                pass  # keep — fall through to the remaining rules
            else:
                result.append("")
                prev_line = line
                continue

        # 3. Profile drop_lines rules. A protected line breaks out with
        # `dropped` still False, i.e. it survives.
        dropped = False
        for pat in compiled_drop:
            if pat.search(stripped):
                if protected_substrings and any(p in line for p in protected_substrings):
                    break
                dropped = True
                break
        if dropped:
            result.append("")
            prev_line = line
            continue

        # 4. Inline drop patterns
        for pat in compiled_inline:
            line = pat.sub("", line)

        # 5. Mask patterns
        for pat, repl in compiled_mask:
            line = pat.sub(repl, line)

        result.append(line)
        prev_line = line

    return "\n".join(result)
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def denoise_stage(ctx: PipelineContext) -> None:
    """Pipeline stage 3: denoise text."""
    profile = ctx.profile
    noise = profile.noise if profile else None

    # Field labels must survive denoising so anchor extractors can still
    # locate them afterwards.
    protected: set[str] = set()
    if profile:
        for field in profile.fields:
            protected.update(field.aliases)

    # Flatten the DropLinesRule into a single list of regex strings.
    line_patterns: list[str] = []
    if noise and noise.drop_lines:
        rule = noise.drop_lines
        line_patterns.extend(rule.regex)
        # contains_any phrases are matched literally, hence re.escape.
        line_patterns.extend(re.escape(phrase) for phrase in rule.contains_any)

    source = ctx.normalized_text or ctx.raw_text
    nonempty_before = sum(1 for ln in source.splitlines() if ln.strip())
    ctx.normalized_text = denoise(
        source,
        drop_line_patterns=line_patterns,
        drop_inline_patterns=list(noise.drop_patterns) if noise else [],
        mask_patterns=list(noise.mask_patterns) if noise else [],
        collapse_repeats=noise.collapse_repeats if noise else False,
        protected_substrings=protected,
    )
    nonempty_after = sum(1 for ln in ctx.normalized_text.splitlines() if ln.strip())
    logger.debug(
        "Denoiser: %d non-empty lines → %d (dropped %d, %d drop-patterns, %d mask-patterns)",
        nonempty_before,
        nonempty_after,
        nonempty_before - nonempty_after,
        len(line_patterns),
        len(noise.mask_patterns) if noise else 0,
    )
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
"""Stage 5 — Extractor dispatcher."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
|
|
7
|
+
from ocr_postprocess.models import Candidate, PipelineContext
|
|
8
|
+
|
|
9
|
+
logger = logging.getLogger(__name__)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def extract_stage(ctx: PipelineContext) -> None:
    """Pipeline stage 5: run extractors for all profile fields.

    Populates ``ctx.candidates`` with every Candidate produced by the
    profile's structured extractors and per-field extractors. Individual
    extractor failures are logged and counted but never abort the stage.
    """
    # Deferred import — presumably to avoid a circular import at module
    # load time; confirm against the package's import graph.
    from ocr_postprocess.extractors import registry

    profile = ctx.profile
    if not profile:
        logger.warning("No profile loaded; skipping extraction")
        return

    # Auto-import all registered extractors
    _ensure_extractors_imported()

    candidates: list[Candidate] = []

    # Structured extractors first: they run once per document (field=None)
    # and may emit candidates for several keys at once.
    failed_count = 0
    for struct_ref in profile.structured_extractors:
        try:
            extractor = registry.get_instance(struct_ref.name)
            cands = extractor.extract(ctx, field=None)
            candidates.extend(cands)
        except Exception:
            failed_count += 1
            logger.exception("Structured extractor '%s' failed", struct_ref.name)

    # Per-field extractors
    for field in profile.fields:
        if field.constant is not None:
            # Constant — skip extraction, reconciler will inject
            continue

        if field.compute is not None:
            # Compute — skip extraction, transform stage handles
            continue

        if not field.extractor:
            # NOTE(review): extractor-less fields are skipped before the
            # default fallback below, so their `default` (if any) is never
            # injected here — confirm this is intentional.
            continue

        try:
            extractor = registry.get_instance(field.extractor)
            cands = extractor.extract(ctx, field=field)
            # Stamp the field key on every candidate so the reconciler can
            # group them regardless of what the extractor itself set.
            for c in cands:
                c.key = field.key
            candidates.extend(cands)
        except Exception:
            failed_count += 1
            logger.exception("Extractor '%s' for field '%s' failed", field.extractor, field.key)

        # Default fallback — used when neither this field's extractor nor a
        # structured extractor produced a candidate for this key.
        if not any(c.key == field.key for c in candidates) and field.default is not None:
            candidates.append(
                Candidate(
                    key=field.key,
                    value=field.default,
                    raw=str(field.default),
                    extractor="default",
                    sources=["default"],
                    confidence=0.3,  # low: value was not observed in the text
                    needs_llm_review=True,
                )
            )

    ctx.candidates = candidates
    if failed_count:
        logger.warning(
            "Extractor stage: %d candidate(s) from %d field(s); %d extractor(s) failed",
            len(candidates),
            len(profile.fields),
            failed_count,
        )
    else:
        logger.debug(
            "Extractor stage: %d candidate(s) from %d field(s)",
            len(candidates),
            len(profile.fields),
        )
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def _ensure_extractors_imported() -> None:
    """Import all extractor submodules so @register decorators run."""
    import importlib
    import pkgutil

    import ocr_postprocess.extractors.label_anchor as la_pkg
    import ocr_postprocess.extractors.pattern as pat_pkg
    import ocr_postprocess.extractors.structured as st_pkg

    # Walk each extractor package and import every submodule. A failed
    # import is only logged: that extractor simply stays unregistered.
    for package in (pat_pkg, la_pkg, st_pkg):
        prefix = package.__name__
        for _finder, modname, _ispkg in pkgutil.iter_modules(package.__path__):
            qualified = f"{prefix}.{modname}"
            try:
                importlib.import_module(qualified)
            except Exception:
                logger.debug("Could not import %s", qualified)
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
"""Stage 1 — Normalizer: Unicode, whitespace, OCR fixes, dedup repeats."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
import unicodedata
|
|
7
|
+
|
|
8
|
+
import regex as re
|
|
9
|
+
|
|
10
|
+
from ocr_postprocess.models import PipelineContext
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
# Maps common OCR letter-for-digit misreads: O→0, l→1, S→5
_OCR_DIGIT_MAP = str.maketrans("OlS", "015")


def _fix_ocr_in_digit_tokens(text: str) -> str:
    """Replace common OCR letter mistakes inside all-digit-like tokens."""

    def repair(match: re.Match) -> str:
        token = match.group(0)
        translated = token.translate(_OCR_DIGIT_MAP)
        # Rewrite only when the token was not already a clean number but
        # becomes one after substitution — real digit runs stay untouched.
        if not token.isdigit() and translated.isdigit():
            return translated
        return token

    return re.sub(r"\b[0-9OlS]{2,}\b", repair, text)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _dedup_adjacent_repeats(text: str, min_len: int = 5, max_len: int = 30) -> str:
    """Remove directly adjacent repeated substrings on the same line."""
    cleaned: list[str] = []
    for line in text.split("\n"):
        # A line too short to hold two copies of the smallest window is kept.
        if len(line) < min_len * 2:
            cleaned.append(line)
            continue
        # Collapse the first adjacent repeat found, preferring the largest
        # window, then rescan from scratch until nothing changes.
        shrunk = True
        while shrunk:
            shrunk = False
            for width in range(min(max_len, len(line) // 2), min_len - 1, -1):
                pos = 0
                while pos + width * 2 <= len(line):
                    if line[pos : pos + width] == line[pos + width : pos + width * 2]:
                        # Drop the first copy; the second remains in place.
                        line = line[:pos] + line[pos + width :]
                        shrunk = True
                        break
                    pos += 1
                if shrunk:
                    break
        cleaned.append(line)
    return "\n".join(cleaned)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def normalize(
    text: str,
    ocr_fixes: bool = True,
    dedup_repeats: bool = True,
    preserve_multi_space: bool = True,
) -> str:
    """Apply full normalization pipeline to raw OCR text."""
    # 1. Unicode NFC composition.
    text = unicodedata.normalize("NFC", text)

    # 2. Drop control characters, keeping newline and tab.
    # NOTE(review): this also removes "\r", so the "\r\n" replace below is
    # effectively a no-op — CRLF still collapses to LF either way.
    kept = [ch for ch in text if ch in ("\n", "\t") or (ord(ch) >= 0x20 and ord(ch) != 0x7F)]
    text = "".join(kept)

    # 3. Whitespace: unify line endings, turn tabs into single spaces.
    text = text.replace("\r\n", "\n").replace("\r", "\n").replace("\t", " ")

    processed = []
    for raw_line in text.split("\n"):
        cleaned = raw_line.rstrip()
        # Multi-space runs may carry column structure, hence the opt-out.
        if not preserve_multi_space:
            cleaned = re.sub(r" {2,}", " ", cleaned)
        processed.append(cleaned)
    text = "\n".join(processed)

    # 4. Straighten typographic quotes.
    for fancy, plain in (("\u2018", "'"), ("\u2019", "'"), ("\u201c", '"'), ("\u201d", '"')):
        text = text.replace(fancy, plain)

    # 5. OCR digit fixes.
    if ocr_fixes:
        text = _fix_ocr_in_digit_tokens(text)

    # 6. Dedup adjacent repeats.
    if dedup_repeats:
        text = _dedup_adjacent_repeats(text)

    return text
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def normalize_stage(ctx: PipelineContext) -> None:
    """Pipeline stage 1: normalize raw text."""
    profile = ctx.profile
    # Profile-supplied overrides; absent keys fall back to the defaults.
    cfg = profile.normalize if (profile and profile.normalize) else {}

    ctx.normalized_text = normalize(
        ctx.raw_text,
        ocr_fixes=cfg.get("ocr_fixes", True),
        dedup_repeats=cfg.get("dedup_repeats", True),
        preserve_multi_space=cfg.get("preserve_multi_space", True),
    )
    logger.debug(
        "Normalizer: %d → %d chars, %d → %d lines",
        len(ctx.raw_text),
        len(ctx.normalized_text),
        ctx.raw_text.count("\n") + 1,
        ctx.normalized_text.count("\n") + 1,
    )
|