resolvekit 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. resolvekit/README.md +134 -0
  2. resolvekit/__init__.py +67 -0
  3. resolvekit/api/README.md +165 -0
  4. resolvekit/api/__init__.py +10 -0
  5. resolvekit/api/convenience.py +53 -0
  6. resolvekit/api/resolver.py +457 -0
  7. resolvekit/builders/README.md +173 -0
  8. resolvekit/builders/__init__.py +0 -0
  9. resolvekit/calibration/README.md +351 -0
  10. resolvekit/calibration/__init__.py +12 -0
  11. resolvekit/calibration/calibrator.py +184 -0
  12. resolvekit/calibration/features.py +139 -0
  13. resolvekit/calibration/models.py +78 -0
  14. resolvekit/cli/README.md +215 -0
  15. resolvekit/cli/__init__.py +0 -0
  16. resolvekit/cli/main.py +18 -0
  17. resolvekit/config.py +128 -0
  18. resolvekit/constants.py +252 -0
  19. resolvekit/constraints/README.md +102 -0
  20. resolvekit/constraints/__init__.py +17 -0
  21. resolvekit/constraints/constraint_engine.py +111 -0
  22. resolvekit/constraints/hierarchy_validator.py +148 -0
  23. resolvekit/constraints/membership_validator.py +60 -0
  24. resolvekit/constraints/protocols.py +33 -0
  25. resolvekit/constraints/temporal_validator.py +43 -0
  26. resolvekit/constraints/type_validator.py +42 -0
  27. resolvekit/data/README.md +165 -0
  28. resolvekit/data/__init__.py +14 -0
  29. resolvekit/data/alias_repository.py +206 -0
  30. resolvekit/data/code_repository.py +85 -0
  31. resolvekit/data/context_filters.py +49 -0
  32. resolvekit/data/db_manager.py +196 -0
  33. resolvekit/data/entity_repository.py +466 -0
  34. resolvekit/data/membership_repository.py +107 -0
  35. resolvekit/data/query_builder.py +177 -0
  36. resolvekit/data/schema.py +122 -0
  37. resolvekit/disambiguation/README.md +72 -0
  38. resolvekit/disambiguation/__init__.py +0 -0
  39. resolvekit/extraction/README.md +204 -0
  40. resolvekit/extraction/__init__.py +0 -0
  41. resolvekit/matchers/README.md +77 -0
  42. resolvekit/matchers/__init__.py +65 -0
  43. resolvekit/matchers/alias_exact.py +65 -0
  44. resolvekit/matchers/canonical_name.py +62 -0
  45. resolvekit/matchers/cascade.py +127 -0
  46. resolvekit/matchers/code_validators.py +250 -0
  47. resolvekit/matchers/exact_code.py +177 -0
  48. resolvekit/matchers/fts_matcher.py +106 -0
  49. resolvekit/matchers/fuzzy_matcher.py +142 -0
  50. resolvekit/matchers/priorities.py +174 -0
  51. resolvekit/matchers/protocols.py +75 -0
  52. resolvekit/normalization/README.md +192 -0
  53. resolvekit/normalization/__init__.py +8 -0
  54. resolvekit/normalization/normalizer.py +164 -0
  55. resolvekit/overlays/README.md +226 -0
  56. resolvekit/overlays/__init__.py +0 -0
  57. resolvekit/types.py +534 -0
  58. resolvekit/utils/README.md +188 -0
  59. resolvekit/utils/__init__.py +48 -0
  60. resolvekit/utils/cache.py +109 -0
  61. resolvekit/utils/dates.py +339 -0
  62. resolvekit/utils/errors.py +145 -0
  63. resolvekit/utils/files.py +366 -0
  64. resolvekit/utils/logging.py +219 -0
  65. resolvekit/utils/text.py +475 -0
  66. resolvekit/utils/validation.py +301 -0
  67. resolvekit-0.0.1.dist-info/METADATA +36 -0
  68. resolvekit-0.0.1.dist-info/RECORD +70 -0
  69. resolvekit-0.0.1.dist-info/WHEEL +4 -0
  70. resolvekit-0.0.1.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,77 @@
1
+ # Matchers Module
2
+
3
+ ## Purpose
4
+
5
+ The matchers module implements the candidate generation cascade - a series of increasingly sophisticated matching strategies applied in sequence until candidates are found.
6
+
7
+ ## Components
8
+
9
+ ### Matcher Classes
10
+
11
+ 1. **ExactCodeMatcher** (`exact_code.py`)
12
+ - Direct lookup of standardized codes (ISO, M49, NUTS, OCHA P-codes, DCIDs)
13
+ - Validates code format before lookup
14
+ - Highest priority matcher (deterministic)
15
+
16
+ 2. **CanonicalNameMatcher** (`canonical_name.py`)
17
+ - Exact match against normalized canonical entity names
18
+ - Uses hash table for O(1) lookup
19
+ - Second-highest priority
20
+
21
+ 3. **AliasExactMatcher** (`alias_exact.py`)
22
+ - Exact match against normalized aliases
23
+ - Handles multilingual aliases
24
+ - Returns all matching entities (may be multiple)
25
+
26
+ 4. **FTSMatcher** (`fts_matcher.py`)
27
+ - Full-text search using SQLite FTS5
28
+ - Returns top-K candidates ranked by BM25-like score
29
+ - Queries across base pack + overlay FTS indexes
30
+
31
+ 5. **FuzzyMatcher** (`fuzzy_matcher.py`)
32
+ - Bounded fuzzy matching on FTS candidates
33
+ - Uses edit distance, trigram similarity, Jaccard coefficient
34
+ - Reduces top-K to top-K' highest-scoring candidates
35
+
36
+ 6. **SemanticMatcher** (`semantic_matcher.py`)
37
+ - Optional semantic similarity via embeddings
38
+ - Only invoked on ambiguous cases (low margin or registry hit)
39
+ - Requires `resolvekit[semantic]` extra
40
+
41
+ ### Shared Components
42
+
43
+ - `base.py`: Abstract base class for all matchers
44
+ - `cascade.py`: Orchestrates matcher execution in priority order
45
+ - `candidate.py`: Candidate data structures and scoring
46
+
47
+ ## Matcher Cascade Flow
48
+
49
+ ```
50
+ Input Query
51
+
52
+ [1. ExactCodeMatcher] → Found? → Return
53
+ ↓ Not found
54
+ [2. CanonicalNameMatcher] → Found? → Return
55
+ ↓ Not found
56
+ [3. AliasExactMatcher] → Found? → Return candidates
57
+ ↓ Not found
58
+ [4. FTSMatcher] → top-K candidates (K=50)
59
+
60
+ [5. FuzzyMatcher] → refine to top-K' (K'=12)
61
+
62
+ [6. SemanticMatcher] → re-rank if ambiguous (optional)
63
+
64
+ Return ranked candidates
65
+ ```
66
+
67
+ ## Design Principles
68
+
69
+ 1. **Fail-fast**: Early matchers return immediately on high-confidence matches
70
+ 2. **Bounded computation**: Limit fuzzy/semantic work to top-K candidates
71
+ 3. **Pluggable**: Easy to add custom matchers
72
+ 4. **Feature extraction**: Each matcher emits features for calibration
73
+
74
+ ## Implementation Priority
75
+
76
+ **Phase A** - Core resolver (items 1-5)
77
+ **Phase E** - Ambiguity subsystem (item 6)
@@ -0,0 +1,65 @@
1
+ """Matcher cascade module for entity resolution."""
2
+
3
+ from typing import TYPE_CHECKING
4
+
5
+ # Lazy imports to avoid circular dependency with data.code_repository
6
+ # The code_repository needs code_validators, but if we eagerly import
7
+ # cascade/exact_code here, they will try to import code_repository
8
+ if TYPE_CHECKING:
9
+ from resolvekit.matchers.alias_exact import AliasExactMatcher
10
+ from resolvekit.matchers.canonical_name import CanonicalNameMatcher
11
+ from resolvekit.matchers.cascade import MatcherCascade
12
+ from resolvekit.matchers.exact_code import ExactCodeMatcher
13
+ from resolvekit.matchers.fts_matcher import FTSMatcher
14
+ from resolvekit.matchers.fuzzy_matcher import FuzzyMatcher
15
+
16
+ # These are safe to import eagerly as they don't depend on data layer
17
+ from resolvekit.matchers.code_validators import CODE_VALIDATORS, get_validator
18
+ from resolvekit.matchers.priorities import InferencePriority
19
+ from resolvekit.matchers.protocols import CodeValidatorProtocol, MatcherProtocol
20
+ from resolvekit.types import Candidate, MatchContext
21
+
22
+ __all__ = [
23
+ "CODE_VALIDATORS",
24
+ "AliasExactMatcher",
25
+ "Candidate",
26
+ "CanonicalNameMatcher",
27
+ "CodeValidatorProtocol",
28
+ "ExactCodeMatcher",
29
+ "FTSMatcher",
30
+ "FuzzyMatcher",
31
+ "InferencePriority",
32
+ "MatchContext",
33
+ "MatcherCascade",
34
+ "MatcherProtocol",
35
+ "get_validator",
36
+ ]
37
+
38
+
39
+ def __getattr__(name: str) -> object:
40
+ """Lazy load matcher classes to avoid circular imports."""
41
+ if name == "AliasExactMatcher":
42
+ from resolvekit.matchers.alias_exact import AliasExactMatcher
43
+
44
+ return AliasExactMatcher
45
+ elif name == "CanonicalNameMatcher":
46
+ from resolvekit.matchers.canonical_name import CanonicalNameMatcher
47
+
48
+ return CanonicalNameMatcher
49
+ elif name == "MatcherCascade":
50
+ from resolvekit.matchers.cascade import MatcherCascade
51
+
52
+ return MatcherCascade
53
+ elif name == "ExactCodeMatcher":
54
+ from resolvekit.matchers.exact_code import ExactCodeMatcher
55
+
56
+ return ExactCodeMatcher
57
+ elif name == "FTSMatcher":
58
+ from resolvekit.matchers.fts_matcher import FTSMatcher
59
+
60
+ return FTSMatcher
61
+ elif name == "FuzzyMatcher":
62
+ from resolvekit.matchers.fuzzy_matcher import FuzzyMatcher
63
+
64
+ return FuzzyMatcher
65
+ raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
@@ -0,0 +1,65 @@
1
+ """Exact alias matcher for normalized alias lookups."""
2
+
3
+ from resolvekit.data.alias_repository import AliasRepository
4
+ from resolvekit.types import Candidate, MatchContext, MatcherType
5
+
6
+
7
class AliasExactMatcher:
    """
    Exact-lookup matcher over normalized aliases.

    One alias may be shared by several entities, so a single query can
    produce more than one candidate. Every hit is scored 1.0.
    """

    def __init__(self, alias_repo: AliasRepository):
        """
        Store the repository used for alias lookups.

        Args:
            alias_repo: Alias repository queried by ``match``
        """
        self.alias_repo = alias_repo

    def match(
        self,
        query: str,
        normalized_query: str,
        limit: int = 10,
        context: MatchContext | None = None,
    ) -> list[Candidate]:
        """
        Return a candidate for each entity whose normalized alias equals the query.

        Args:
            query: Original query string (not used by this matcher)
            normalized_query: Normalized query string used for the lookup
            limit: Maximum number of candidates to return
            context: Optional filtering context forwarded to the repository

        Returns:
            Up to ``limit`` candidates; empty list when nothing matches
        """
        hits = self.alias_repo.find_exact_normalized(normalized_query, context)
        if not hits:
            return []

        # Each repository hit is an (entity, alias-that-matched) pair.
        found = [
            Candidate(
                entity=entity,
                score=1.0,
                matcher_type=MatcherType.ALIAS_EXACT,
                features={"alias_exact": True, "matched_alias": alias},
                matched_alias=alias,
            )
            for entity, alias in hits
        ]
        return found[:limit]
@@ -0,0 +1,62 @@
1
+ """Canonical name matcher for exact name lookups."""
2
+
3
+ from resolvekit.data.entity_repository import EntityRepository
4
+ from resolvekit.normalization.normalizer import TextNormalizer
5
+ from resolvekit.types import Candidate, MatchContext, MatcherType
6
+
7
+
8
class CanonicalNameMatcher:
    """
    Exact-lookup matcher over normalized canonical entity names.

    Yields at most one candidate, always scored 1.0.
    """

    def __init__(self, entity_repo: EntityRepository, normalizer: TextNormalizer):
        """
        Store the collaborators.

        Args:
            entity_repo: Entity repository queried by ``match``
            normalizer: Text normalizer; kept on the instance, ``match`` itself
                relies on the already-normalized query it is given
        """
        self.entity_repo = entity_repo
        self.normalizer = normalizer

    def match(
        self,
        query: str,
        normalized_query: str,
        limit: int = 10,
        context: MatchContext | None = None,
    ) -> list[Candidate]:
        """
        Look up a single entity by its normalized canonical name.

        Args:
            query: Original query string (not used by this matcher)
            normalized_query: Normalized query string used for the lookup
            limit: Accepted for interface parity; the result has 0 or 1 items
            context: Optional filtering context forwarded to the repository

        Returns:
            One-element list when an entity is found, otherwise an empty list
        """
        entity = self.entity_repo.find_by_canonical_name(normalized_query, context)
        if entity:
            hit = Candidate(
                entity=entity,
                score=1.0,
                matcher_type=MatcherType.CANONICAL_NAME,
                features={"canonical_exact": True},
                matched_alias=None,
            )
            return [hit]
        return []
@@ -0,0 +1,127 @@
1
+ """Matcher cascade orchestrator with tier-based short-circuiting."""
2
+
3
+ from resolvekit.matchers.alias_exact import AliasExactMatcher
4
+ from resolvekit.matchers.canonical_name import CanonicalNameMatcher
5
+ from resolvekit.matchers.exact_code import ExactCodeMatcher
6
+ from resolvekit.matchers.fts_matcher import FTSMatcher
7
+ from resolvekit.matchers.fuzzy_matcher import FuzzyMatcher
8
+ from resolvekit.normalization.normalizer import TextNormalizer
9
+ from resolvekit.types import Candidate, CodeSystem, MatchContext
10
+
11
+
12
class MatcherCascade:
    """
    Orchestrates matcher execution with tier-based short-circuiting.

    Cascade Tiers:
        Tier 1 (Deterministic): ExactCode, CanonicalName -> STOP if found
        Tier 2 (Exact Multi): AliasExact -> STOP if any alias matches
            (a single hit is returned as-is, several hits are capped at top_k)
        Tier 3 (Fuzzy): FTS + Fuzzy -> Return top-K

    Performance Target:
        - Tier 1: p50 < 10ms (most queries)
        - Tier 3: p95 < 50ms (complex queries)
    """

    # Queries longer than this are truncated before matching (DoS protection).
    _MAX_QUERY_LENGTH = 1000
    # Number of FTS candidates handed to the fuzzy matcher for refinement.
    _FTS_CANDIDATE_LIMIT = 50

    def __init__(
        self,
        exact_code: ExactCodeMatcher,
        canonical_name: CanonicalNameMatcher,
        alias_exact: AliasExactMatcher,
        fts: FTSMatcher,
        fuzzy: FuzzyMatcher,
        normalizer: TextNormalizer,
        code_priority: list[CodeSystem] | None = None,
    ):
        """
        Initialize cascade with all matchers.

        Args:
            exact_code: Tier 1 exact code matcher
            canonical_name: Tier 1 canonical name matcher
            alias_exact: Tier 2 alias exact matcher
            fts: Tier 3 FTS matcher
            fuzzy: Tier 3 fuzzy matcher
            normalizer: Text normalizer
            code_priority: Optional code system inference priority.
                If provided, it is assigned to ``exact_code.priority``,
                i.e. the passed-in matcher object is modified.
                Use InferencePriority presets (humanitarian, world_bank, etc.)
                or provide a custom list of CodeSystem values.
        """
        self.exact_code = exact_code
        self.canonical_name = canonical_name
        self.alias_exact = alias_exact
        self.fts = fts
        self.fuzzy = fuzzy
        self.normalizer = normalizer

        # Apply code priority if provided (cascade-level configuration)
        if code_priority is not None:
            self.exact_code.priority = code_priority

    def resolve(  # noqa: PLR0911
        self, query: str, context: MatchContext | None = None, top_k: int = 12
    ) -> list[Candidate]:
        """
        Execute cascade with tier-based short-circuiting.

        Multiple returns are intentional - each tier has early exit conditions
        for optimal performance (short-circuiting pattern).

        Args:
            query: Query string to resolve
            context: Optional filtering context
            top_k: Maximum candidates to return from the alias and fuzzy tiers

        Returns:
            Candidates from the first tier that produced any; empty list when
            the query is blank, normalizes to nothing, or no tier matches
        """
        # Fast reject for empty/whitespace queries
        if not query or not query.strip():
            return []

        # Truncate long queries (DoS protection)
        if len(query) > self._MAX_QUERY_LENGTH:
            query = query[: self._MAX_QUERY_LENGTH]

        # Normalize once for all matchers
        normalized = self.normalizer.normalize(query)

        # Empty after normalization? (e.g., only punctuation)
        if not normalized:
            return []

        # TIER 1: Deterministic exact matches (STOP if found)

        # Try exact code first
        candidates = self.exact_code.match(query, normalized, context=context)
        if candidates:
            return candidates

        # Try canonical name
        candidates = self.canonical_name.match(query, normalized, context=context)
        if candidates:
            return candidates

        # TIER 2: Exact alias (may be multiple; the cascade stops either way)
        # Forward the caller's cap so the matcher's own default limit does not
        # silently truncate results when top_k is larger. At least two are
        # requested so an ambiguous alias is still seen as ambiguous when
        # top_k is 0 or 1.
        candidates = self.alias_exact.match(
            query, normalized, limit=max(top_k, 2), context=context
        )
        if len(candidates) == 1:
            return candidates  # Unambiguous
        if len(candidates) > 1:
            # Ambiguous - return multiple for the caller to decide between
            return candidates[:top_k]

        # TIER 3: FTS + Fuzzy (ranked candidates)
        fts_candidates = self.fts.match(
            query, normalized, limit=self._FTS_CANDIDATE_LIMIT, context=context
        )
        if not fts_candidates:
            return []  # No matches

        # Fuzzy refines the FTS pool down to top_k
        return self.fuzzy.match(
            query, normalized, limit=top_k, fts_candidates=fts_candidates
        )
@@ -0,0 +1,250 @@
1
+ """Code system validators for format validation and normalization."""
2
+
3
+ import re
4
+ from typing import TYPE_CHECKING
5
+
6
+ from resolvekit.types import CodeSystem
7
+
8
+ if TYPE_CHECKING:
9
+ from resolvekit.matchers.protocols import CodeValidatorProtocol
10
+
11
+
12
+ class ISO2Validator:
13
+ """Validator for ISO 3166-1 alpha-2 codes."""
14
+
15
+ def validate(self, value: str) -> tuple[bool, str | None]:
16
+ """Validate ISO2 format (2 letters)."""
17
+ if len(value) != 2 or not value.isalpha():
18
+ return False, "ISO2 must be 2 letters"
19
+ return True, None
20
+
21
+ def normalize(self, value: str) -> str:
22
+ """Normalize to uppercase."""
23
+ return value.upper()
24
+
25
+
26
+ class ISO3Validator:
27
+ """Validator for ISO 3166-1 alpha-3 codes."""
28
+
29
+ def validate(self, value: str) -> tuple[bool, str | None]:
30
+ """Validate ISO3 format (3 letters)."""
31
+ if len(value) != 3 or not value.isalpha():
32
+ return False, "ISO3 must be 3 letters"
33
+ return True, None
34
+
35
+ def normalize(self, value: str) -> str:
36
+ """Normalize to uppercase."""
37
+ return value.upper()
38
+
39
+
40
+ class ISONumericValidator:
41
+ """Validator for ISO 3166-1 numeric codes."""
42
+
43
+ def validate(self, value: str) -> tuple[bool, str | None]:
44
+ """Validate ISO numeric format (3 digits)."""
45
+ if len(value) != 3 or not value.isdigit():
46
+ return False, "ISO numeric must be 3 digits"
47
+ return True, None
48
+
49
+ def normalize(self, value: str) -> str:
50
+ """Normalize preserves digits."""
51
+ return value
52
+
53
+
54
+ class DCIDValidator:
55
+ """Validator for Data Commons IDs."""
56
+
57
+ def validate(self, value: str) -> tuple[bool, str | None]:
58
+ """Validate DCID format (prefix/identifier)."""
59
+ if "/" not in value:
60
+ return False, "DCID must contain '/'"
61
+ return True, None
62
+
63
+ def normalize(self, value: str) -> str:
64
+ """DCIDs are case-sensitive, no normalization."""
65
+ return value
66
+
67
+
68
+ class M49Validator:
69
+ """Validator for UN M49 region codes."""
70
+
71
+ def validate(self, value: str) -> tuple[bool, str | None]:
72
+ """Validate M49 format (3 digits)."""
73
+ if len(value) != 3 or not value.isdigit():
74
+ return False, "M49 must be 3 digits"
75
+ return True, None
76
+
77
+ def normalize(self, value: str) -> str:
78
+ """Normalize preserves digits."""
79
+ return value
80
+
81
+
82
+ class ISO31662Validator:
83
+ """Validator for ISO 3166-2 subdivision codes."""
84
+
85
+ _PATTERN = re.compile(r"^[A-Z]{2}-[A-Z0-9]{1,3}$")
86
+
87
+ def validate(self, value: str) -> tuple[bool, str | None]:
88
+ """Validate ISO 3166-2 format (CC-SSS)."""
89
+ if not self._PATTERN.match(value.upper()):
90
+ return False, "ISO3166-2 must be format CC-SSS (e.g., US-CA)"
91
+ return True, None
92
+
93
+ def normalize(self, value: str) -> str:
94
+ """Normalize to uppercase."""
95
+ return value.upper()
96
+
97
+
98
+ class NUTSValidator:
99
+ """Validator for EU NUTS codes."""
100
+
101
+ _PATTERN = re.compile(r"^[A-Z]{2}[0-9A-Z]{0,3}$")
102
+
103
+ def validate(self, value: str) -> tuple[bool, str | None]:
104
+ """Validate NUTS format (CC[0-9A-Z]{0,3})."""
105
+ if not self._PATTERN.match(value.upper()):
106
+ return False, "NUTS must be format CC[0-9A-Z]{0,3} (e.g., DE3, FR10)"
107
+ return True, None
108
+
109
+ def normalize(self, value: str) -> str:
110
+ """Normalize to uppercase."""
111
+ return value.upper()
112
+
113
+
114
+ class LAUValidator:
115
+ """Validator for EU LAU codes."""
116
+
117
+ def validate(self, value: str) -> tuple[bool, str | None]:
118
+ """Validate LAU format (variable by country)."""
119
+ # LAU format is variable, accept any non-empty string
120
+ if not value or not value.strip():
121
+ return False, "LAU code cannot be empty"
122
+ return True, None
123
+
124
+ def normalize(self, value: str) -> str:
125
+ """Normalize to uppercase."""
126
+ return value.upper()
127
+
128
+
129
+ class PCodeValidator:
130
+ """Validator for OCHA P-codes."""
131
+
132
+ def validate(self, value: str) -> tuple[bool, str | None]:
133
+ """Validate P-code format (variable format)."""
134
+ # P-codes have variable format, accept any non-empty alphanumeric
135
+ if not value or not value.strip():
136
+ return False, "P-code cannot be empty"
137
+ return True, None
138
+
139
+ def normalize(self, value: str) -> str:
140
+ """Normalize to uppercase."""
141
+ return value.upper()
142
+
143
+
144
+ class WikidataValidator:
145
+ """Validator for Wikidata QIDs."""
146
+
147
+ _PATTERN = re.compile(r"^Q[0-9]+$")
148
+
149
+ def validate(self, value: str) -> tuple[bool, str | None]:
150
+ """Validate Wikidata format (Q[0-9]+)."""
151
+ if not self._PATTERN.match(value.upper()):
152
+ return False, "Wikidata QID must be format Q[0-9]+ (e.g., Q30)"
153
+ return True, None
154
+
155
+ def normalize(self, value: str) -> str:
156
+ """Normalize to uppercase."""
157
+ return value.upper()
158
+
159
+
160
+ class GeoNamesValidator:
161
+ """Validator for GeoNames IDs."""
162
+
163
+ def validate(self, value: str) -> tuple[bool, str | None]:
164
+ """Validate GeoNames format (numeric ID)."""
165
+ if not value.isdigit():
166
+ return False, "GeoNames ID must be numeric"
167
+ return True, None
168
+
169
+ def normalize(self, value: str) -> str:
170
+ """Normalize preserves digits."""
171
+ return value
172
+
173
+
174
+ class WorldBankValidator:
175
+ """Validator for World Bank country codes."""
176
+
177
+ def validate(self, value: str) -> tuple[bool, str | None]:
178
+ """Validate World Bank format (2-3 letters)."""
179
+ if len(value) not in (2, 3) or not value.isalpha():
180
+ return False, "World Bank code must be 2-3 letters"
181
+ return True, None
182
+
183
+ def normalize(self, value: str) -> str:
184
+ """Normalize to uppercase."""
185
+ return value.upper()
186
+
187
+
188
+ class DACValidator:
189
+ """Validator for OECD DAC codes."""
190
+
191
+ def validate(self, value: str) -> tuple[bool, str | None]:
192
+ """Validate DAC format (numeric)."""
193
+ if not value.isdigit():
194
+ return False, "DAC code must be numeric"
195
+ return True, None
196
+
197
+ def normalize(self, value: str) -> str:
198
+ """Normalize preserves digits."""
199
+ return value
200
+
201
+
202
+ class FIPSValidator:
203
+ """Validator for FIPS codes (deprecated but still used)."""
204
+
205
+ def validate(self, value: str) -> tuple[bool, str | None]:
206
+ """Validate FIPS format (2 letters)."""
207
+ if len(value) != 2 or not value.isalpha():
208
+ return False, "FIPS must be 2 letters"
209
+ return True, None
210
+
211
+ def normalize(self, value: str) -> str:
212
+ """Normalize to uppercase."""
213
+ return value.upper()
214
+
215
+
216
+ # Simple constant dict - no registration ceremony
217
+ # Note: Using object type here to avoid circular import with protocols
218
+ # All validators implement CodeValidatorProtocol structurally
219
# One shared validator instance per supported code system, created once at
# import time in the order listed below.
CODE_VALIDATORS: dict[CodeSystem, object] = {
    system: validator_cls()
    for system, validator_cls in (
        (CodeSystem.ISO2, ISO2Validator),
        (CodeSystem.ISO3, ISO3Validator),
        (CodeSystem.ISO_NUMERIC, ISONumericValidator),
        (CodeSystem.DCID, DCIDValidator),
        (CodeSystem.M49, M49Validator),
        (CodeSystem.ISO3166_2, ISO31662Validator),
        (CodeSystem.NUTS, NUTSValidator),
        (CodeSystem.LAU, LAUValidator),
        (CodeSystem.PCODE, PCodeValidator),
        (CodeSystem.WIKIDATA, WikidataValidator),
        (CodeSystem.GEONAMES, GeoNamesValidator),
        (CodeSystem.WB, WorldBankValidator),
        (CodeSystem.DAC, DACValidator),
        (CodeSystem.FIPS, FIPSValidator),
    )
}
235
+
236
+
237
def get_validator(system: CodeSystem) -> "CodeValidatorProtocol":
    """
    Look up the validator registered for a code system.

    Args:
        system: Code system enum value

    Returns:
        The shared validator instance stored in ``CODE_VALIDATORS``

    Raises:
        KeyError: If ``system`` has no entry in ``CODE_VALIDATORS``
    """
    validator = CODE_VALIDATORS[system]
    return validator  # type: ignore[return-value]