ocr-postprocess 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. ocr_postprocess/__init__.py +33 -0
  2. ocr_postprocess/classifier.py +63 -0
  3. ocr_postprocess/cli.py +130 -0
  4. ocr_postprocess/engine/__init__.py +0 -0
  5. ocr_postprocess/engine/denoiser.py +134 -0
  6. ocr_postprocess/engine/extractor_stage.py +107 -0
  7. ocr_postprocess/engine/normalizer.py +128 -0
  8. ocr_postprocess/engine/reconciler.py +170 -0
  9. ocr_postprocess/engine/reconstructor.py +469 -0
  10. ocr_postprocess/engine/transform_stage.py +89 -0
  11. ocr_postprocess/exceptions.py +30 -0
  12. ocr_postprocess/extractors/__init__.py +0 -0
  13. ocr_postprocess/extractors/base.py +103 -0
  14. ocr_postprocess/extractors/helpers.py +63 -0
  15. ocr_postprocess/extractors/label_anchor/__init__.py +0 -0
  16. ocr_postprocess/extractors/label_anchor/line_after_label.py +53 -0
  17. ocr_postprocess/extractors/label_anchor/regex_after_label.py +75 -0
  18. ocr_postprocess/extractors/label_anchor/text_until_next_label.py +79 -0
  19. ocr_postprocess/extractors/label_anchor/value_between_labels.py +65 -0
  20. ocr_postprocess/extractors/label_anchor/value_in_same_line.py +60 -0
  21. ocr_postprocess/extractors/pattern/__init__.py +0 -0
  22. ocr_postprocess/extractors/pattern/cccd.py +120 -0
  23. ocr_postprocess/extractors/pattern/cmnd.py +38 -0
  24. ocr_postprocess/extractors/pattern/currency_vnd.py +48 -0
  25. ocr_postprocess/extractors/pattern/date.py +89 -0
  26. ocr_postprocess/extractors/pattern/email.py +38 -0
  27. ocr_postprocess/extractors/pattern/gender_vn.py +48 -0
  28. ocr_postprocess/extractors/pattern/phone_vn.py +83 -0
  29. ocr_postprocess/extractors/pattern/plate_vn.py +39 -0
  30. ocr_postprocess/extractors/pattern/tax_code.py +53 -0
  31. ocr_postprocess/extractors/registry.py +45 -0
  32. ocr_postprocess/extractors/structured/__init__.py +0 -0
  33. ocr_postprocess/extractors/structured/mrz_cccd.py +111 -0
  34. ocr_postprocess/extractors/universal.py +39 -0
  35. ocr_postprocess/models.py +131 -0
  36. ocr_postprocess/pipeline.py +179 -0
  37. ocr_postprocess/profiles/__init__.py +0 -0
  38. ocr_postprocess/profiles/_generic.yml +13 -0
  39. ocr_postprocess/profiles/cccd_2024.yml +113 -0
  40. ocr_postprocess/profiles/dang_kiem.yml +105 -0
  41. ocr_postprocess/profiles/loader.py +63 -0
  42. ocr_postprocess/profiles/matcher.py +71 -0
  43. ocr_postprocess/profiles/schema.py +197 -0
  44. ocr_postprocess/py.typed +0 -0
  45. ocr_postprocess/renderer/__init__.py +0 -0
  46. ocr_postprocess/renderer/json_renderer.py +59 -0
  47. ocr_postprocess/renderer/llm.py +41 -0
  48. ocr_postprocess/renderer/markdown.py +172 -0
  49. ocr_postprocess/scorer.py +78 -0
  50. ocr_postprocess/transformer.py +304 -0
  51. ocr_postprocess-0.1.0.dist-info/METADATA +189 -0
  52. ocr_postprocess-0.1.0.dist-info/RECORD +55 -0
  53. ocr_postprocess-0.1.0.dist-info/WHEEL +5 -0
  54. ocr_postprocess-0.1.0.dist-info/entry_points.txt +2 -0
  55. ocr_postprocess-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,120 @@
1
+ """Pattern extractor: CCCD (Căn cước công dân) 12-digit numbers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import regex as re
6
+
7
+ from ocr_postprocess.extractors.base import PatternExtractor
8
+ from ocr_postprocess.extractors.registry import register
9
+ from ocr_postprocess.models import Candidate, PipelineContext
10
+
11
+ # Province codes: the first 3 digits of a CCCD number are the province/city code.
12
+ # Digits 1-3 encode: province code (001-096 range)
13
+ # Valid province codes in Vietnam
14
# The 63 three-digit prefixes accepted by _validate_cccd, kept as strings so
# that leading zeros survive and the prefix can be compared without int().
_VALID_PROVINCE_CODES = {
    "001", "002", "004", "006", "008", "010", "011", "012", "014",
    "015", "017", "019", "020", "022", "024", "025", "026", "027",
    "030", "031", "033", "034", "035", "036", "037", "038", "040",
    "042", "044", "045", "046", "048", "049", "051", "052", "054",
    "056", "058", "060", "062", "064", "066", "067", "068", "070",
    "072", "074", "075", "077", "079", "080", "082", "083", "084",
    "086", "087", "089", "091", "092", "093", "094", "095", "096",
}

# Exactly twelve digits delimited by word boundaries; group 1 is the number.
_PATTERN = re.compile(r"\b(\d{12})\b")
81
+
82
+
83
def _validate_cccd(number: str) -> tuple[bool, str]:
    """Check the shape and province prefix of a CCCD number.

    Returns:
        ``(True, "ok")`` when *number* is 12 digits and its first three
        characters are in ``_VALID_PROVINCE_CODES``; otherwise
        ``(False, reason)`` with a short human-readable reason.
    """
    well_formed = len(number) == 12 and number.isdigit()
    if not well_formed:
        return False, "not 12 digits"

    prefix = number[:3]
    if prefix in _VALID_PROVINCE_CODES:
        return True, "ok"
    return False, f"invalid province code {prefix}"
91
+
92
+
93
@register("cccd")
class CccdExtractor(PatternExtractor):
    """Find 12-digit CCCD numbers and grade them by province-code validity."""

    pattern = r"\b\d{12}\b"

    def extract(self, ctx: PipelineContext, field=None) -> list[Candidate]:
        """Return one candidate per standalone 12-digit run in the text.

        The text searched is ``ctx.normalized_text``, falling back to
        ``ctx.raw_text`` when the former is falsy. Numbers whose prefix passes
        ``_validate_cccd`` get confidence 0.95 and an extra
        ``"pattern:cccd:checksum"`` source; the rest get 0.7 and a note
        carrying the rejection reason. The candidate key is ``field.key``
        when *field* is truthy, else ``"cccd"``.
        """
        haystack = ctx.normalized_text or ctx.raw_text
        found: list[Candidate] = []

        for hit in _PATTERN.finditer(haystack):
            digits = hit.group(0)
            is_valid, why = _validate_cccd(digits)
            if is_valid:
                score = 0.95
                provenance = ["pattern:cccd", "pattern:cccd:checksum"]
                remarks = []
            else:
                score = 0.7
                provenance = ["pattern:cccd"]
                remarks = [f"province code validation: {why}"]

            found.append(
                Candidate(
                    key=field.key if field else "cccd",
                    value=digits,
                    raw=digits,
                    extractor="cccd",
                    sources=provenance,
                    span=hit.span(),
                    confidence=score,
                    notes=remarks,
                )
            )
        return found
@@ -0,0 +1,38 @@
1
+ """Pattern extractor: CMND (9-digit old ID card)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import regex as re
6
+
7
+ from ocr_postprocess.extractors.base import PatternExtractor
8
+ from ocr_postprocess.extractors.registry import register
9
+ from ocr_postprocess.models import Candidate, PipelineContext
10
+
11
# Exactly nine digits delimited by word boundaries.
_PATTERN_9 = re.compile(r"\b(\d{9})\b")


@register("cmnd")
class CmndExtractor(PatternExtractor):
    """Find 9-digit CMND (old-style Vietnamese ID card) numbers."""

    pattern = r"\b\d{9}\b"

    def extract(self, ctx: PipelineContext, field=None) -> list[Candidate]:
        """Return one candidate (confidence 0.85) per standalone 9-digit run.

        Searches ``ctx.normalized_text``, or ``ctx.raw_text`` when that is
        falsy. The candidate key is ``field.key`` when *field* is truthy,
        else ``"cmnd"``.
        """
        haystack = ctx.normalized_text or ctx.raw_text
        out_key = field.key if field else "cmnd"
        return [
            Candidate(
                key=out_key,
                value=hit.group(0),
                raw=hit.group(0),
                extractor="cmnd",
                sources=["pattern:cmnd"],
                span=hit.span(),
                confidence=0.85,
            )
            for hit in _PATTERN_9.finditer(haystack)
        ]
@@ -0,0 +1,48 @@
1
+ """Pattern extractor: Vietnamese currency (VND)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import regex as re
6
+
7
+ from ocr_postprocess.extractors.base import PatternExtractor
8
+ from ocr_postprocess.extractors.registry import register
9
+ from ocr_postprocess.models import Candidate, PipelineContext
10
+
11
# A VND amount: digits with optional '.'/',' separators, optional whitespace,
# then a currency unit. The amount must START with a digit; the previous
# `[\d.,]+` also accepted punctuation-only runs, so a sentence-final ". đ"
# was reported as an amount that could not be parsed.
_PATTERN = re.compile(r"\d[\d.,]*\s*(?:đ|đồng|VND|VNĐ)\b", re.IGNORECASE)
12
+
13
+
14
def _parse_amount(raw: str) -> int | None:
    """Turn a matched amount such as ``"1.000.000 đ"`` into an integer.

    Unit letters and whitespace are removed first, then ``.`` and ``,`` are
    dropped (both are treated as grouping separators). Returns ``None`` when
    what remains is not a valid integer literal.
    """
    without_unit = re.sub(r"[đồng VNĐvnd\s]", "", raw, flags=re.IGNORECASE)
    digits_only = without_unit.replace(".", "").replace(",", "")
    try:
        amount = int(digits_only)
    except (ValueError, TypeError):
        return None
    else:
        return amount
21
+
22
+
23
@register("currency_vnd")
class CurrencyVndExtractor(PatternExtractor):
    """Extract Vietnamese Dong currency amounts."""

    pattern = r"[\d.,]+\s*(?:đ|đồng|VND|VNĐ)\b"

    def extract(self, ctx: PipelineContext, field=None) -> list[Candidate]:
        """Return one candidate per currency match that parses to an integer.

        Searches ``ctx.normalized_text``, or ``ctx.raw_text`` when that is
        falsy. Each candidate carries the integer amount as ``value``, the
        matched text as ``raw`` and confidence 0.9. The candidate key is
        ``field.key`` when *field* is truthy, else ``"currency_vnd"``.

        Matches for which ``_parse_amount`` returns ``None`` are skipped.
        Previously they were emitted with ``value=None`` at full confidence.
        """
        text = ctx.normalized_text or ctx.raw_text
        candidates: list[Candidate] = []
        key = field.key if field else "currency_vnd"

        for m in _PATTERN.finditer(text):
            raw = m.group(0)
            amount = _parse_amount(raw)
            if amount is None:
                # Nothing numeric could be recovered; a value-less candidate
                # would only add noise downstream.
                continue
            candidates.append(
                Candidate(
                    key=key,
                    value=amount,
                    raw=raw,
                    extractor="currency_vnd",
                    sources=["pattern:currency_vnd"],
                    span=(m.start(), m.end()),
                    confidence=0.9,
                )
            )
        return candidates
@@ -0,0 +1,89 @@
1
+ """Pattern extractor: dates in Vietnamese and ISO formats."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+
7
+ import regex as re
8
+ from dateutil import parser as dateutil_parser
9
+
10
+ from ocr_postprocess.extractors.base import PatternExtractor
11
+ from ocr_postprocess.extractors.registry import register
12
+ from ocr_postprocess.models import Candidate, PipelineContext
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
# Day-first numeric dates with '/', '-' or '.' separators and a 4-digit year.
# Groups: (day, month, year).
_SLASH = re.compile(r"\b(\d{1,2})[/\-.](\d{1,2})[/\-.](\d{4})\b")

# Year-first dates restricted to years 19xx/20xx.
# Groups: (year, month, day).
_ISO = re.compile(r"\b((?:19|20)\d{2})-(\d{2})-(\d{2})\b")

# Spelled-out Vietnamese form "ngày DD tháng MM năm YYYY", case-insensitive.
# Groups: (day, month, year).
_VN = re.compile(
    r"ngày\s+(\d{1,2})\s+tháng\s+(\d{1,2})\s+năm\s+((?:19|20)\d{2})",
    flags=re.IGNORECASE,
)
25
+
26
+
27
def _parse_date(day: str, month: str, year: str) -> str | None:
    """Build an ISO ``YYYY-MM-DD`` string from separate date components.

    The components are joined year-first and handed to ``dateutil`` with
    ``dayfirst=False``. Returns ``None`` when the parser raises
    ``ValueError`` or ``TypeError``.
    """
    try:
        parsed = dateutil_parser.parse(f"{year}-{month}-{day}", dayfirst=False)
    except (ValueError, TypeError):
        return None
    else:
        return parsed.date().isoformat()
33
+
34
+
35
@register("date")
class DateExtractor(PatternExtractor):
    """Extract dates in Vietnamese and international formats."""

    pattern = r"\b\d{1,2}[/\-.]\d{1,2}[/\-.]\d{4}\b"

    def extract(self, ctx: PipelineContext, field=None) -> list[Candidate]:
        """Return one candidate per valid date found in the context text.

        Searches ``ctx.normalized_text``, or ``ctx.raw_text`` when that is
        falsy, with three patterns in order: day-first numeric, ISO
        year-first, and the spelled-out Vietnamese form. Every match is
        validated and normalised by ``_parse_date``; matches it rejects are
        dropped. Candidate ``value`` is the ISO date string. The candidate
        key is ``field.key`` when *field* is truthy, else ``"date"``.

        ISO matches used to be emitted verbatim without validation, so an
        impossible date such as ``2024-13-45`` was reported at 0.95
        confidence. They now go through the same check as the other forms.
        """
        text = ctx.normalized_text or ctx.raw_text
        key = field.key if field else "date"
        candidates: list[Candidate] = []
        seen_spans: set[tuple[int, int]] = set()

        # (compiled pattern, group numbers for (day, month, year), confidence)
        plans = (
            (_SLASH, (1, 2, 3), 0.95),
            (_ISO, (3, 2, 1), 0.95),
            (_VN, (1, 2, 3), 0.9),
        )

        for compiled, (day_g, month_g, year_g), confidence in plans:
            for m in compiled.finditer(text):
                span = (m.start(), m.end())
                if span in seen_spans:
                    # Same text already claimed by an earlier pattern.
                    continue
                seen_spans.add(span)

                iso = _parse_date(m.group(day_g), m.group(month_g), m.group(year_g))
                if not iso:
                    continue
                candidates.append(
                    Candidate(
                        key=key,
                        value=iso,
                        raw=m.group(0),
                        extractor="date",
                        sources=["pattern:date"],
                        span=span,
                        confidence=confidence,
                    )
                )

        return candidates
@@ -0,0 +1,38 @@
1
+ """Pattern extractor: email addresses."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import regex as re
6
+
7
+ from ocr_postprocess.extractors.base import PatternExtractor
8
+ from ocr_postprocess.extractors.registry import register
9
+ from ocr_postprocess.models import Candidate, PipelineContext
10
+
11
# local-part "@" domain, where the domain is one or more dot-separated labels.
# Each label must contain at least one non-dot character, so a sentence-final
# period ("write to a@b.com.") is no longer swallowed into the address.
_PATTERN = re.compile(r"[\w.+\-]+@[\w\-]+(?:\.[\w\-]+)+")
12
+
13
+
14
@register("email")
class EmailExtractor(PatternExtractor):
    """Find email addresses."""

    pattern = r"[\w.+\-]+@[\w\-]+\.[\w.\-]+"

    def extract(self, ctx: PipelineContext, field=None) -> list[Candidate]:
        """Return one candidate (confidence 0.95) per address-like match.

        Searches ``ctx.normalized_text``, or ``ctx.raw_text`` when that is
        falsy. ``value`` is the lower-cased address; ``raw`` keeps the text
        as matched. The candidate key is ``field.key`` when *field* is
        truthy, else ``"email"``.
        """
        haystack = ctx.normalized_text or ctx.raw_text
        results: list[Candidate] = []
        out_key = field.key if field else "email"

        for hit in _PATTERN.finditer(haystack):
            address = hit.group(0)
            entry = Candidate(
                key=out_key,
                value=address.lower(),
                raw=address,
                extractor="email",
                sources=["pattern:email"],
                span=hit.span(),
                confidence=0.95,
            )
            results.append(entry)
        return results
@@ -0,0 +1,48 @@
1
+ """Pattern extractor: Vietnamese gender."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import regex as re
6
+
7
+ from ocr_postprocess.extractors.base import PatternExtractor
8
+ from ocr_postprocess.extractors.registry import register
9
+ from ocr_postprocess.models import Candidate, PipelineContext
10
+
11
# Gender tokens as whole words (case-sensitive). The negative lookbehind
# rejects a token directly preceded by "Việt " / "việt ", so the country name
# "Việt Nam" is not reported as the gender "Nam".
_PATTERN = re.compile(r"(?<![Vv]iệt\s)\b(Nam|Nữ|Male|Female|[MF])\b")

# Lower-cased matched token -> canonical Vietnamese label.
_NORMALIZE: dict[str, str] = {
    "nam": "Nam",
    "male": "Nam",
    "m": "Nam",
    "nữ": "Nữ",
    "female": "Nữ",
    "f": "Nữ",
}
21
+
22
+
23
@register("gender_vn")
class GenderVnExtractor(PatternExtractor):
    """Find gender tokens written in Vietnamese or English."""

    pattern = r"\b(Nam|Nữ|Male|Female|[MF])\b"

    def extract(self, ctx: PipelineContext, field=None) -> list[Candidate]:
        """Return one candidate (confidence 0.9) per gender token.

        Searches ``ctx.normalized_text``, or ``ctx.raw_text`` when that is
        falsy. ``value`` is the token mapped through ``_NORMALIZE`` (looked
        up lower-cased; the token itself is kept when it has no entry);
        ``raw`` is the token as matched. The candidate key is ``field.key``
        when *field* is truthy, else ``"gender_vn"``.
        """
        haystack = ctx.normalized_text or ctx.raw_text
        results: list[Candidate] = []
        out_key = field.key if field else "gender_vn"

        for hit in _PATTERN.finditer(haystack):
            token = hit.group(0)
            canonical = _NORMALIZE.get(token.lower(), token)
            results.append(
                Candidate(
                    key=out_key,
                    value=canonical,
                    raw=token,
                    extractor="gender_vn",
                    sources=["pattern:gender_vn"],
                    span=hit.span(),
                    confidence=0.9,
                )
            )
        return results
@@ -0,0 +1,83 @@
1
+ """Pattern extractor: Vietnamese phone numbers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import regex as re
6
+
7
+ from ocr_postprocess.extractors.base import PatternExtractor
8
+ from ocr_postprocess.extractors.registry import register
9
+ from ocr_postprocess.models import Candidate, PipelineContext
10
+
11
# "+84" or a national "0", a two-digit mobile prefix, then seven digits.
# The national "0" must not be preceded by another digit: without that guard
# the tail of a longer digit run (for example a 12-digit ID number) was
# reported as a phone number.
_PATTERN = re.compile(
    r"(?:\+84|(?<!\d)0)(?:3[2-9]|5[6-9]|7[06-9]|8[1-9]|9[0-9])\d{7}\b"
)

# Three-digit national prefixes that earn the higher confidence score.
_VALID_PREFIXES = {
    "032", "033", "034", "035", "036", "037", "038", "039",  # Viettel
    "056", "058",  # Vietnamobile
    "070", "076", "077", "078", "079",  # Mobifone
    "081", "082", "083", "084", "085", "086", "089",  # Vinaphone / others
    "090", "091", "092", "093", "094",
    "095", "096", "097", "098", "099",
}
47
+
48
+
49
def _normalize_phone(raw: str) -> str:
    """Rewrite the international ``+84`` prefix as a national leading ``0``.

    Strings that do not start with ``+84`` are returned unchanged.
    """
    country_prefix = "+84"
    is_international = raw.startswith(country_prefix)
    return "0" + raw[len(country_prefix):] if is_international else raw
54
+
55
+
56
@register("phone_vn")
class PhoneVnExtractor(PatternExtractor):
    """Find Vietnamese mobile phone numbers."""

    pattern = r"(?:\+84|0)(?:3[2-9]|5[6-9]|7[06-9]|8[1-9]|9[0-9])\d{7}"

    def extract(self, ctx: PipelineContext, field=None) -> list[Candidate]:
        """Return one candidate per phone-number match.

        Searches ``ctx.normalized_text``, or ``ctx.raw_text`` when that is
        falsy. ``value`` is the number in national form (see
        ``_normalize_phone``); ``raw`` is the text as matched. Confidence is
        0.95 when the first three digits of the national form are in
        ``_VALID_PREFIXES`` and 0.8 otherwise. The candidate key is
        ``field.key`` when *field* is truthy, else ``"phone_vn"``.
        """
        haystack = ctx.normalized_text or ctx.raw_text
        results: list[Candidate] = []
        out_key = field.key if field else "phone_vn"

        for hit in _PATTERN.finditer(haystack):
            matched = hit.group(0)
            national = _normalize_phone(matched)
            if national[:3] in _VALID_PREFIXES:
                score = 0.95
            else:
                score = 0.8
            results.append(
                Candidate(
                    key=out_key,
                    value=national,
                    raw=matched,
                    extractor="phone_vn",
                    sources=["pattern:phone_vn"],
                    span=hit.span(),
                    confidence=score,
                )
            )
        return results
@@ -0,0 +1,39 @@
1
+ """Pattern extractor: Vietnamese vehicle plate numbers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import regex as re
6
+
7
+ from ocr_postprocess.extractors.base import PatternExtractor
8
+ from ocr_postprocess.extractors.registry import register
9
+ from ocr_postprocess.models import Candidate, PipelineContext
10
+
11
# e.g. 30A-12345, 15H-087.71, 51F1-12345
# Two-digit locality code, a series of 1-2 letters optionally followed by one
# digit (the "F1" form), an optional '-' or space, 3-5 digits, and an optional
# '.NN' / '-NN' tail. The optional series digit was missing before, so the
# 51F1-12345 example above did not match.
_PATTERN = re.compile(r"\b\d{2}[A-Z]{1,2}\d?[-\s]?\d{3,5}(?:[.\-]\d{1,2})?\b")
13
+
14
+
15
@register("plate_vn")
class PlateVnExtractor(PatternExtractor):
    """Find Vietnamese vehicle plate numbers."""

    pattern = r"\b\d{2}[A-Z]{1,2}[-\s]?\d{3,5}(?:[.\-]\d{1,2})?\b"

    def extract(self, ctx: PipelineContext, field=None) -> list[Candidate]:
        """Return one candidate (confidence 0.9) per plate-like match.

        Searches ``ctx.normalized_text``, or ``ctx.raw_text`` when that is
        falsy. Both ``value`` and ``raw`` hold the matched text with
        surrounding whitespace stripped. The candidate key is ``field.key``
        when *field* is truthy, else ``"plate_vn"``.
        """
        haystack = ctx.normalized_text or ctx.raw_text
        results: list[Candidate] = []
        out_key = field.key if field else "plate_vn"

        for hit in _PATTERN.finditer(haystack):
            plate = hit.group(0).strip()
            entry = Candidate(
                key=out_key,
                value=plate,
                raw=plate,
                extractor="plate_vn",
                sources=["pattern:plate_vn"],
                span=hit.span(),
                confidence=0.9,
            )
            results.append(entry)
        return results
@@ -0,0 +1,53 @@
1
+ """Pattern extractor: Vietnamese tax codes."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import regex as re
6
+
7
+ from ocr_postprocess.extractors.base import PatternExtractor
8
+ from ocr_postprocess.extractors.registry import register
9
+ from ocr_postprocess.models import Candidate, PipelineContext
10
+
11
# Ten digits, optionally followed by "-" and a three-digit suffix.
# Group 1 is the 10-digit code, group 2 the suffix (or None).
_PATTERN = re.compile(r"\b(\d{10})(?:-(\d{3}))?\b")


def _validate_tax_code(code: str) -> bool:
    """Validate 10-digit tax code checksum (TT 105/2020/TT-BTC).

    The first nine digits are multiplied by fixed weights and summed; the
    expected check digit is ``10 - (sum % 11)``, with 10 mapped to 0. Returns
    ``True`` only when *code* is exactly ten digits and its last digit equals
    that expected value.
    """
    if not (len(code) == 10 and code.isdigit()):
        return False

    *leading, last = (int(ch) for ch in code)
    weighted = sum(
        digit * weight
        for digit, weight in zip(leading, (31, 29, 23, 19, 17, 13, 7, 5, 3))
    )
    expected = 10 - weighted % 11
    return (0 if expected == 10 else expected) == last
24
+
25
+
26
@register("tax_code")
class TaxCodeExtractor(PatternExtractor):
    """Find Vietnamese tax codes and grade them by checksum."""

    pattern = r"\b\d{10}(?:-\d{3})?\b"

    def extract(self, ctx: PipelineContext, field=None) -> list[Candidate]:
        """Return one candidate per tax-code match.

        Searches ``ctx.normalized_text``, or ``ctx.raw_text`` when that is
        falsy. ``value`` and ``raw`` are the 10-digit code, plus ``-NNN``
        when a suffix was matched. Only the 10-digit part is checked by
        ``_validate_tax_code``: a pass gives confidence 0.95 and an extra
        ``"pattern_with_checksum"`` source, a fail gives 0.7. The candidate
        key is ``field.key`` when *field* is truthy, else ``"tax_code"``.
        """
        haystack = ctx.normalized_text or ctx.raw_text
        results: list[Candidate] = []
        out_key = field.key if field else "tax_code"

        for hit in _PATTERN.finditer(haystack):
            base, branch = hit.group(1), hit.group(2)
            full_code = base if not branch else f"{base}-{branch}"

            provenance = ["pattern:tax_code"]
            score = 0.7
            if _validate_tax_code(base):
                provenance.append("pattern_with_checksum")
                score = 0.95

            results.append(
                Candidate(
                    key=out_key,
                    value=full_code,
                    raw=full_code,
                    extractor="tax_code",
                    sources=provenance,
                    span=hit.span(),
                    confidence=score,
                )
            )
        return results
@@ -0,0 +1,45 @@
1
+ """Extractor registry."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from typing import TYPE_CHECKING
7
+
8
+ from ocr_postprocess.exceptions import ExtractorNotFoundError
9
+
10
+ if TYPE_CHECKING:
11
+ from ocr_postprocess.extractors.base import Extractor
12
+
13
logger = logging.getLogger(__name__)

# Registered extractor classes, keyed by the name passed to register().
_REGISTRY: dict[str, type[Extractor]] = {}


def register(name: str):
    """Build a class decorator that files the class in the registry as *name*.

    The decorated class is stored in ``_REGISTRY[name]``, gets its ``name``
    attribute set to *name*, and is returned unchanged otherwise. Registering
    over an existing name replaces the entry and logs a warning.
    """

    def _bind(extractor_cls):
        already_present = name in _REGISTRY
        if already_present:
            logger.warning("Overwriting extractor registration: %s", name)
        _REGISTRY[name] = extractor_cls
        setattr(extractor_cls, "name", name)
        return extractor_cls

    return _bind
29
+
30
+
31
def get(name: str) -> type[Extractor]:
    """Look up the extractor class registered under *name*.

    Raises:
        ExtractorNotFoundError: if nothing is registered under *name*.
    """
    if name in _REGISTRY:
        return _REGISTRY[name]
    raise ExtractorNotFoundError(f"Extractor '{name}' not registered")
36
+
37
+
38
def get_instance(name: str) -> Extractor:
    """Instantiate the extractor registered under *name* with no arguments.

    Any error raised by ``get`` for an unknown name propagates unchanged.
    """
    extractor_cls = get(name)
    return extractor_cls()
41
+
42
+
43
def all_names() -> list[str]:
    """List every registered extractor name as a new list."""
    return list(_REGISTRY)
File without changes