resolvekit 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- resolvekit/README.md +134 -0
- resolvekit/__init__.py +67 -0
- resolvekit/api/README.md +165 -0
- resolvekit/api/__init__.py +10 -0
- resolvekit/api/convenience.py +53 -0
- resolvekit/api/resolver.py +457 -0
- resolvekit/builders/README.md +173 -0
- resolvekit/builders/__init__.py +0 -0
- resolvekit/calibration/README.md +351 -0
- resolvekit/calibration/__init__.py +12 -0
- resolvekit/calibration/calibrator.py +184 -0
- resolvekit/calibration/features.py +139 -0
- resolvekit/calibration/models.py +78 -0
- resolvekit/cli/README.md +215 -0
- resolvekit/cli/__init__.py +0 -0
- resolvekit/cli/main.py +18 -0
- resolvekit/config.py +128 -0
- resolvekit/constants.py +252 -0
- resolvekit/constraints/README.md +102 -0
- resolvekit/constraints/__init__.py +17 -0
- resolvekit/constraints/constraint_engine.py +111 -0
- resolvekit/constraints/hierarchy_validator.py +148 -0
- resolvekit/constraints/membership_validator.py +60 -0
- resolvekit/constraints/protocols.py +33 -0
- resolvekit/constraints/temporal_validator.py +43 -0
- resolvekit/constraints/type_validator.py +42 -0
- resolvekit/data/README.md +165 -0
- resolvekit/data/__init__.py +14 -0
- resolvekit/data/alias_repository.py +206 -0
- resolvekit/data/code_repository.py +85 -0
- resolvekit/data/context_filters.py +49 -0
- resolvekit/data/db_manager.py +196 -0
- resolvekit/data/entity_repository.py +466 -0
- resolvekit/data/membership_repository.py +107 -0
- resolvekit/data/query_builder.py +177 -0
- resolvekit/data/schema.py +122 -0
- resolvekit/disambiguation/README.md +72 -0
- resolvekit/disambiguation/__init__.py +0 -0
- resolvekit/extraction/README.md +204 -0
- resolvekit/extraction/__init__.py +0 -0
- resolvekit/matchers/README.md +77 -0
- resolvekit/matchers/__init__.py +65 -0
- resolvekit/matchers/alias_exact.py +65 -0
- resolvekit/matchers/canonical_name.py +62 -0
- resolvekit/matchers/cascade.py +127 -0
- resolvekit/matchers/code_validators.py +250 -0
- resolvekit/matchers/exact_code.py +177 -0
- resolvekit/matchers/fts_matcher.py +106 -0
- resolvekit/matchers/fuzzy_matcher.py +142 -0
- resolvekit/matchers/priorities.py +174 -0
- resolvekit/matchers/protocols.py +75 -0
- resolvekit/normalization/README.md +192 -0
- resolvekit/normalization/__init__.py +8 -0
- resolvekit/normalization/normalizer.py +164 -0
- resolvekit/overlays/README.md +226 -0
- resolvekit/overlays/__init__.py +0 -0
- resolvekit/types.py +534 -0
- resolvekit/utils/README.md +188 -0
- resolvekit/utils/__init__.py +48 -0
- resolvekit/utils/cache.py +109 -0
- resolvekit/utils/dates.py +339 -0
- resolvekit/utils/errors.py +145 -0
- resolvekit/utils/files.py +366 -0
- resolvekit/utils/logging.py +219 -0
- resolvekit/utils/text.py +475 -0
- resolvekit/utils/validation.py +301 -0
- resolvekit-0.0.1.dist-info/METADATA +36 -0
- resolvekit-0.0.1.dist-info/RECORD +70 -0
- resolvekit-0.0.1.dist-info/WHEEL +4 -0
- resolvekit-0.0.1.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
# Matchers Module
|
|
2
|
+
|
|
3
|
+
## Purpose
|
|
4
|
+
|
|
5
|
+
The matchers module implements the candidate generation cascade - a series of increasingly sophisticated matching strategies applied in sequence until candidates are found.
|
|
6
|
+
|
|
7
|
+
## Components
|
|
8
|
+
|
|
9
|
+
### Matcher Classes
|
|
10
|
+
|
|
11
|
+
1. **ExactCodeMatcher** (`exact_code.py`)
|
|
12
|
+
- Direct lookup of standardized codes (ISO, M49, NUTS, OCHA P-codes, DCIDs)
|
|
13
|
+
- Validates code format before lookup
|
|
14
|
+
- Highest priority matcher (deterministic)
|
|
15
|
+
|
|
16
|
+
2. **CanonicalNameMatcher** (`canonical_name.py`)
|
|
17
|
+
- Exact match against normalized canonical entity names
|
|
18
|
+
- Uses hash table for O(1) lookup
|
|
19
|
+
- Second-highest priority
|
|
20
|
+
|
|
21
|
+
3. **AliasExactMatcher** (`alias_exact.py`)
|
|
22
|
+
- Exact match against normalized aliases
|
|
23
|
+
- Handles multilingual aliases
|
|
24
|
+
- Returns all matching entities (may be multiple)
|
|
25
|
+
|
|
26
|
+
4. **FTSMatcher** (`fts_matcher.py`)
|
|
27
|
+
- Full-text search using SQLite FTS5
|
|
28
|
+
- Returns top-K candidates ranked by BM25-like score
|
|
29
|
+
- Queries across base pack + overlay FTS indexes
|
|
30
|
+
|
|
31
|
+
5. **FuzzyMatcher** (`fuzzy_matcher.py`)
|
|
32
|
+
- Bounded fuzzy matching on FTS candidates
|
|
33
|
+
- Uses edit distance, trigram similarity, Jaccard coefficient
|
|
34
|
+
- Reduces top-K to top-K' highest-scoring candidates
|
|
35
|
+
|
|
36
|
+
6. **SemanticMatcher** (`semantic_matcher.py`; planned, not shipped in this release)
|
|
37
|
+
- Optional semantic similarity via embeddings
|
|
38
|
+
- Only invoked on ambiguous cases (low margin or registry hit)
|
|
39
|
+
- Requires the `resolvekit[semantic]` extra
|
|
40
|
+
|
|
41
|
+
### Shared Components
|
|
42
|
+
|
|
43
|
+
- `protocols.py`: Protocols (`MatcherProtocol`, `CodeValidatorProtocol`) that matchers and code validators implement
|
|
44
|
+
- `cascade.py`: Orchestrates matcher execution in priority order
|
|
45
|
+
- `Candidate` (defined in `resolvekit/types.py`): Candidate data structures and scoring
|
|
46
|
+
|
|
47
|
+
## Matcher Cascade Flow
|
|
48
|
+
|
|
49
|
+
```
|
|
50
|
+
Input Query
|
|
51
|
+
↓
|
|
52
|
+
[1. ExactCodeMatcher] → Found? → Return
|
|
53
|
+
↓ Not found
|
|
54
|
+
[2. CanonicalNameMatcher] → Found? → Return
|
|
55
|
+
↓ Not found
|
|
56
|
+
[3. AliasExactMatcher] → Found? → Return candidates
|
|
57
|
+
↓ Not found
|
|
58
|
+
[4. FTSMatcher] → top-K candidates (K=50)
|
|
59
|
+
↓
|
|
60
|
+
[5. FuzzyMatcher] → refine to top-K' (K'=12)
|
|
61
|
+
↓
|
|
62
|
+
[6. SemanticMatcher] → re-rank if ambiguous (optional)
|
|
63
|
+
↓
|
|
64
|
+
Return ranked candidates
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## Design Principles
|
|
68
|
+
|
|
69
|
+
1. **Fail-fast**: Early matchers return immediately on high-confidence matches
|
|
70
|
+
2. **Bounded computation**: Limit fuzzy/semantic work to top-K candidates
|
|
71
|
+
3. **Pluggable**: Easy to add custom matchers
|
|
72
|
+
4. **Feature extraction**: Each matcher emits features for calibration
|
|
73
|
+
|
|
74
|
+
## Implementation Priority
|
|
75
|
+
|
|
76
|
+
**Phase A** - Core resolver (items 1-5)
|
|
77
|
+
**Phase E** - Ambiguity subsystem (item 6)
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
"""Matcher cascade module for entity resolution."""
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING
|
|
4
|
+
|
|
5
|
+
# Lazy imports to avoid circular dependency with data.code_repository
|
|
6
|
+
# The code_repository needs code_validators, but if we eagerly import
|
|
7
|
+
# cascade/exact_code here, they will try to import code_repository
|
|
8
|
+
if TYPE_CHECKING:
|
|
9
|
+
from resolvekit.matchers.alias_exact import AliasExactMatcher
|
|
10
|
+
from resolvekit.matchers.canonical_name import CanonicalNameMatcher
|
|
11
|
+
from resolvekit.matchers.cascade import MatcherCascade
|
|
12
|
+
from resolvekit.matchers.exact_code import ExactCodeMatcher
|
|
13
|
+
from resolvekit.matchers.fts_matcher import FTSMatcher
|
|
14
|
+
from resolvekit.matchers.fuzzy_matcher import FuzzyMatcher
|
|
15
|
+
|
|
16
|
+
# These are safe to import eagerly as they don't depend on data layer
|
|
17
|
+
from resolvekit.matchers.code_validators import CODE_VALIDATORS, get_validator
|
|
18
|
+
from resolvekit.matchers.priorities import InferencePriority
|
|
19
|
+
from resolvekit.matchers.protocols import CodeValidatorProtocol, MatcherProtocol
|
|
20
|
+
from resolvekit.types import Candidate, MatchContext
|
|
21
|
+
|
|
22
|
+
# Public API of the matchers package. The matcher classes listed here
# (AliasExactMatcher, CanonicalNameMatcher, ExactCodeMatcher, FTSMatcher,
# FuzzyMatcher, MatcherCascade) are only imported under TYPE_CHECKING above;
# at runtime they are resolved on demand by the module-level __getattr__.
__all__ = [
    "CODE_VALIDATORS",
    "AliasExactMatcher",
    "Candidate",
    "CanonicalNameMatcher",
    "CodeValidatorProtocol",
    "ExactCodeMatcher",
    "FTSMatcher",
    "FuzzyMatcher",
    "InferencePriority",
    "MatchContext",
    "MatcherCascade",
    "MatcherProtocol",
    "get_validator",
]
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
# Lazily exported matcher classes: public name -> submodule that defines it.
_LAZY_EXPORTS: dict[str, str] = {
    "AliasExactMatcher": "resolvekit.matchers.alias_exact",
    "CanonicalNameMatcher": "resolvekit.matchers.canonical_name",
    "MatcherCascade": "resolvekit.matchers.cascade",
    "ExactCodeMatcher": "resolvekit.matchers.exact_code",
    "FTSMatcher": "resolvekit.matchers.fts_matcher",
    "FuzzyMatcher": "resolvekit.matchers.fuzzy_matcher",
}


def __getattr__(name: str) -> object:
    """
    Lazy load matcher classes to avoid circular imports.

    Called by Python only when ``name`` is not already a module attribute
    (PEP 562). The resolved class is stored in the module globals so later
    accesses are plain attribute lookups and do not re-enter this hook.

    Args:
        name: Attribute requested on this module

    Returns:
        The matcher class registered under ``name``

    Raises:
        AttributeError: If ``name`` is not a lazily exported matcher
    """
    module_path = _LAZY_EXPORTS.get(name)
    if module_path is None:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    # Imported locally so the hook is self-contained.
    from importlib import import_module

    resolved = getattr(import_module(module_path), name)
    # Cache on the module: __getattr__ is only consulted for missing names.
    globals()[name] = resolved
    return resolved
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
"""Exact alias matcher for normalized alias lookups."""
|
|
2
|
+
|
|
3
|
+
from resolvekit.data.alias_repository import AliasRepository
|
|
4
|
+
from resolvekit.types import Candidate, MatchContext, MatcherType
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class AliasExactMatcher:
    """
    Exact matcher on normalized aliases.

    Tier 2 matcher: the same alias can be registered for several entities,
    so the result may contain more than one candidate. How an ambiguous
    result is handled is decided by the caller.
    """

    def __init__(self, alias_repo: AliasRepository):
        """
        Store the repository used for alias lookups.

        Args:
            alias_repo: Alias repository for lookups
        """
        self.alias_repo = alias_repo

    def match(
        self,
        query: str,
        normalized_query: str,
        limit: int = 10,
        context: MatchContext | None = None,
    ) -> list[Candidate]:
        """
        Return one candidate per entity whose normalized alias equals the query.

        Args:
            query: Original query string (unused)
            normalized_query: Normalized query string
            limit: Maximum candidates to return
            context: Optional filtering context

        Returns:
            Candidates with score 1.0, truncated to ``limit``; empty list
            when the repository reports no match
        """
        hits = self.alias_repo.find_exact_normalized(normalized_query, context)

        # A falsy lookup result means "no match": iterate over nothing.
        found = [
            Candidate(
                entity=entity,
                score=1.0,
                matcher_type=MatcherType.ALIAS_EXACT,
                features={"alias_exact": True, "matched_alias": alias},
                matched_alias=alias,
            )
            for entity, alias in (hits or ())
        ]

        return found[:limit]
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
"""Canonical name matcher for exact name lookups."""
|
|
2
|
+
|
|
3
|
+
from resolvekit.data.entity_repository import EntityRepository
|
|
4
|
+
from resolvekit.normalization.normalizer import TextNormalizer
|
|
5
|
+
from resolvekit.types import Candidate, MatchContext, MatcherType
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class CanonicalNameMatcher:
    """
    Exact matcher on normalized canonical entity names.

    Tier 1 matcher: deterministic, yields at most one candidate.
    """

    def __init__(self, entity_repo: EntityRepository, normalizer: TextNormalizer):
        """
        Store the collaborators.

        Args:
            entity_repo: Entity repository for lookups
            normalizer: Text normalizer (kept for consistency; the query
                passed to ``match`` is already normalized)
        """
        self.normalizer = normalizer
        self.entity_repo = entity_repo

    def match(
        self,
        query: str,
        normalized_query: str,
        limit: int = 10,
        context: MatchContext | None = None,
    ) -> list[Candidate]:
        """
        Look up a single entity by its normalized canonical name.

        Args:
            query: Original query string (unused)
            normalized_query: Normalized query string
            limit: Maximum candidates (unused, always returns 0 or 1)
            context: Optional filtering context

        Returns:
            One-element list with a score-1.0 candidate when the repository
            finds an entity, otherwise an empty list
        """
        found = self.entity_repo.find_by_canonical_name(normalized_query, context)

        if found:
            hit = Candidate(
                entity=found,
                score=1.0,
                matcher_type=MatcherType.CANONICAL_NAME,
                features={"canonical_exact": True},
                matched_alias=None,
            )
            return [hit]

        return []
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
"""Matcher cascade orchestrator with tier-based short-circuiting."""
|
|
2
|
+
|
|
3
|
+
from resolvekit.matchers.alias_exact import AliasExactMatcher
|
|
4
|
+
from resolvekit.matchers.canonical_name import CanonicalNameMatcher
|
|
5
|
+
from resolvekit.matchers.exact_code import ExactCodeMatcher
|
|
6
|
+
from resolvekit.matchers.fts_matcher import FTSMatcher
|
|
7
|
+
from resolvekit.matchers.fuzzy_matcher import FuzzyMatcher
|
|
8
|
+
from resolvekit.normalization.normalizer import TextNormalizer
|
|
9
|
+
from resolvekit.types import Candidate, CodeSystem, MatchContext
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class MatcherCascade:
    """
    Orchestrates matcher execution with tier-based short-circuiting.

    Cascade Tiers:
        Tier 1 (Deterministic): ExactCode, CanonicalName -> STOP if found
        Tier 2 (Exact Multi): AliasExact -> STOP if any alias matches
            (several matches are returned together for calibration to rank)
        Tier 3 (Fuzzy): FTS + Fuzzy -> Return top-K

    Performance Target:
        - Tier 1: p50 < 10ms (most queries)
        - Tier 3: p95 < 50ms (complex queries)
    """

    # Queries longer than this are truncated before matching (DoS protection).
    MAX_QUERY_LENGTH = 1000

    # Minimum number of FTS candidates handed to the fuzzy matcher.
    FTS_POOL_SIZE = 50

    def __init__(
        self,
        exact_code: ExactCodeMatcher,
        canonical_name: CanonicalNameMatcher,
        alias_exact: AliasExactMatcher,
        fts: FTSMatcher,
        fuzzy: FuzzyMatcher,
        normalizer: TextNormalizer,
        code_priority: list[CodeSystem] | None = None,
    ):
        """
        Initialize cascade with all matchers.

        Args:
            exact_code: Tier 1 exact code matcher
            canonical_name: Tier 1 canonical name matcher
            alias_exact: Tier 2 alias exact matcher
            fts: Tier 3 FTS matcher
            fuzzy: Tier 3 fuzzy matcher
            normalizer: Text normalizer
            code_priority: Optional code system inference priority.
                If provided, overrides the exact_code matcher's priority.
                Use InferencePriority presets (humanitarian, world_bank, etc.)
                or provide a custom list of CodeSystem values.
        """
        self.exact_code = exact_code
        self.canonical_name = canonical_name
        self.alias_exact = alias_exact
        self.fts = fts
        self.fuzzy = fuzzy
        self.normalizer = normalizer

        # Cascade-level configuration: note this mutates the matcher passed in.
        if code_priority is not None:
            self.exact_code.priority = code_priority

    def resolve(  # noqa: PLR0911
        self, query: str, context: MatchContext | None = None, top_k: int = 12
    ) -> list[Candidate]:
        """
        Execute cascade with tier-based short-circuiting.

        Multiple returns are intentional - each tier has early exit conditions
        for optimal performance (short-circuiting pattern).

        Args:
            query: Query string to resolve
            context: Optional filtering context
            top_k: Maximum candidates to return; non-positive values yield
                an empty list

        Returns:
            Candidates from the first tier that produced any, at most
            ``top_k`` for tiers 2 and 3 (tier 1 returns its match as-is)
        """
        # Nothing can be returned, and negative slices would misbehave.
        if top_k <= 0:
            return []

        # Fast reject for empty/whitespace queries
        if not query or not query.strip():
            return []

        # Truncate long queries (DoS protection)
        query = query[: self.MAX_QUERY_LENGTH]

        # Normalize once for all matchers
        normalized = self.normalizer.normalize(query)

        # Empty after normalization? (e.g., only punctuation)
        if not normalized:
            return []

        # TIER 1: deterministic exact matches, exact code first. STOP if found.
        for tier_one in (self.exact_code, self.canonical_name):
            candidates = tier_one.match(query, normalized, context=context)
            if candidates:
                return candidates

        # TIER 2: exact alias. One hit is unambiguous; several hits are
        # returned together so calibration can decide. Ask for top_k so the
        # matcher's own default limit does not cap the result below top_k.
        candidates = self.alias_exact.match(
            query, normalized, limit=top_k, context=context
        )
        if candidates:
            return candidates[:top_k]

        # TIER 3: FTS builds the candidate pool; never smaller than top_k,
        # otherwise a large top_k would be silently capped at the pool size.
        fts_candidates = self.fts.match(
            query,
            normalized,
            limit=max(self.FTS_POOL_SIZE, top_k),
            context=context,
        )

        if not fts_candidates:
            return []  # No matches

        # Fuzzy refines the pool down to top-K
        return self.fuzzy.match(
            query, normalized, limit=top_k, fts_candidates=fts_candidates
        )
|
|
@@ -0,0 +1,250 @@
|
|
|
1
|
+
"""Code system validators for format validation and normalization."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from typing import TYPE_CHECKING
|
|
5
|
+
|
|
6
|
+
from resolvekit.types import CodeSystem
|
|
7
|
+
|
|
8
|
+
if TYPE_CHECKING:
|
|
9
|
+
from resolvekit.matchers.protocols import CodeValidatorProtocol
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class ISO2Validator:
|
|
13
|
+
"""Validator for ISO 3166-1 alpha-2 codes."""
|
|
14
|
+
|
|
15
|
+
def validate(self, value: str) -> tuple[bool, str | None]:
|
|
16
|
+
"""Validate ISO2 format (2 letters)."""
|
|
17
|
+
if len(value) != 2 or not value.isalpha():
|
|
18
|
+
return False, "ISO2 must be 2 letters"
|
|
19
|
+
return True, None
|
|
20
|
+
|
|
21
|
+
def normalize(self, value: str) -> str:
|
|
22
|
+
"""Normalize to uppercase."""
|
|
23
|
+
return value.upper()
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class ISO3Validator:
|
|
27
|
+
"""Validator for ISO 3166-1 alpha-3 codes."""
|
|
28
|
+
|
|
29
|
+
def validate(self, value: str) -> tuple[bool, str | None]:
|
|
30
|
+
"""Validate ISO3 format (3 letters)."""
|
|
31
|
+
if len(value) != 3 or not value.isalpha():
|
|
32
|
+
return False, "ISO3 must be 3 letters"
|
|
33
|
+
return True, None
|
|
34
|
+
|
|
35
|
+
def normalize(self, value: str) -> str:
|
|
36
|
+
"""Normalize to uppercase."""
|
|
37
|
+
return value.upper()
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class ISONumericValidator:
|
|
41
|
+
"""Validator for ISO 3166-1 numeric codes."""
|
|
42
|
+
|
|
43
|
+
def validate(self, value: str) -> tuple[bool, str | None]:
|
|
44
|
+
"""Validate ISO numeric format (3 digits)."""
|
|
45
|
+
if len(value) != 3 or not value.isdigit():
|
|
46
|
+
return False, "ISO numeric must be 3 digits"
|
|
47
|
+
return True, None
|
|
48
|
+
|
|
49
|
+
def normalize(self, value: str) -> str:
|
|
50
|
+
"""Normalize preserves digits."""
|
|
51
|
+
return value
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class DCIDValidator:
|
|
55
|
+
"""Validator for Data Commons IDs."""
|
|
56
|
+
|
|
57
|
+
def validate(self, value: str) -> tuple[bool, str | None]:
|
|
58
|
+
"""Validate DCID format (prefix/identifier)."""
|
|
59
|
+
if "/" not in value:
|
|
60
|
+
return False, "DCID must contain '/'"
|
|
61
|
+
return True, None
|
|
62
|
+
|
|
63
|
+
def normalize(self, value: str) -> str:
|
|
64
|
+
"""DCIDs are case-sensitive, no normalization."""
|
|
65
|
+
return value
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class M49Validator:
|
|
69
|
+
"""Validator for UN M49 region codes."""
|
|
70
|
+
|
|
71
|
+
def validate(self, value: str) -> tuple[bool, str | None]:
|
|
72
|
+
"""Validate M49 format (3 digits)."""
|
|
73
|
+
if len(value) != 3 or not value.isdigit():
|
|
74
|
+
return False, "M49 must be 3 digits"
|
|
75
|
+
return True, None
|
|
76
|
+
|
|
77
|
+
def normalize(self, value: str) -> str:
|
|
78
|
+
"""Normalize preserves digits."""
|
|
79
|
+
return value
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
class ISO31662Validator:
|
|
83
|
+
"""Validator for ISO 3166-2 subdivision codes."""
|
|
84
|
+
|
|
85
|
+
_PATTERN = re.compile(r"^[A-Z]{2}-[A-Z0-9]{1,3}$")
|
|
86
|
+
|
|
87
|
+
def validate(self, value: str) -> tuple[bool, str | None]:
|
|
88
|
+
"""Validate ISO 3166-2 format (CC-SSS)."""
|
|
89
|
+
if not self._PATTERN.match(value.upper()):
|
|
90
|
+
return False, "ISO3166-2 must be format CC-SSS (e.g., US-CA)"
|
|
91
|
+
return True, None
|
|
92
|
+
|
|
93
|
+
def normalize(self, value: str) -> str:
|
|
94
|
+
"""Normalize to uppercase."""
|
|
95
|
+
return value.upper()
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
class NUTSValidator:
|
|
99
|
+
"""Validator for EU NUTS codes."""
|
|
100
|
+
|
|
101
|
+
_PATTERN = re.compile(r"^[A-Z]{2}[0-9A-Z]{0,3}$")
|
|
102
|
+
|
|
103
|
+
def validate(self, value: str) -> tuple[bool, str | None]:
|
|
104
|
+
"""Validate NUTS format (CC[0-9A-Z]{0,3})."""
|
|
105
|
+
if not self._PATTERN.match(value.upper()):
|
|
106
|
+
return False, "NUTS must be format CC[0-9A-Z]{0,3} (e.g., DE3, FR10)"
|
|
107
|
+
return True, None
|
|
108
|
+
|
|
109
|
+
def normalize(self, value: str) -> str:
|
|
110
|
+
"""Normalize to uppercase."""
|
|
111
|
+
return value.upper()
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
class LAUValidator:
|
|
115
|
+
"""Validator for EU LAU codes."""
|
|
116
|
+
|
|
117
|
+
def validate(self, value: str) -> tuple[bool, str | None]:
|
|
118
|
+
"""Validate LAU format (variable by country)."""
|
|
119
|
+
# LAU format is variable, accept any non-empty string
|
|
120
|
+
if not value or not value.strip():
|
|
121
|
+
return False, "LAU code cannot be empty"
|
|
122
|
+
return True, None
|
|
123
|
+
|
|
124
|
+
def normalize(self, value: str) -> str:
|
|
125
|
+
"""Normalize to uppercase."""
|
|
126
|
+
return value.upper()
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
class PCodeValidator:
|
|
130
|
+
"""Validator for OCHA P-codes."""
|
|
131
|
+
|
|
132
|
+
def validate(self, value: str) -> tuple[bool, str | None]:
|
|
133
|
+
"""Validate P-code format (variable format)."""
|
|
134
|
+
# P-codes have variable format, accept any non-empty alphanumeric
|
|
135
|
+
if not value or not value.strip():
|
|
136
|
+
return False, "P-code cannot be empty"
|
|
137
|
+
return True, None
|
|
138
|
+
|
|
139
|
+
def normalize(self, value: str) -> str:
|
|
140
|
+
"""Normalize to uppercase."""
|
|
141
|
+
return value.upper()
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
class WikidataValidator:
|
|
145
|
+
"""Validator for Wikidata QIDs."""
|
|
146
|
+
|
|
147
|
+
_PATTERN = re.compile(r"^Q[0-9]+$")
|
|
148
|
+
|
|
149
|
+
def validate(self, value: str) -> tuple[bool, str | None]:
|
|
150
|
+
"""Validate Wikidata format (Q[0-9]+)."""
|
|
151
|
+
if not self._PATTERN.match(value.upper()):
|
|
152
|
+
return False, "Wikidata QID must be format Q[0-9]+ (e.g., Q30)"
|
|
153
|
+
return True, None
|
|
154
|
+
|
|
155
|
+
def normalize(self, value: str) -> str:
|
|
156
|
+
"""Normalize to uppercase."""
|
|
157
|
+
return value.upper()
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
class GeoNamesValidator:
|
|
161
|
+
"""Validator for GeoNames IDs."""
|
|
162
|
+
|
|
163
|
+
def validate(self, value: str) -> tuple[bool, str | None]:
|
|
164
|
+
"""Validate GeoNames format (numeric ID)."""
|
|
165
|
+
if not value.isdigit():
|
|
166
|
+
return False, "GeoNames ID must be numeric"
|
|
167
|
+
return True, None
|
|
168
|
+
|
|
169
|
+
def normalize(self, value: str) -> str:
|
|
170
|
+
"""Normalize preserves digits."""
|
|
171
|
+
return value
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
class WorldBankValidator:
|
|
175
|
+
"""Validator for World Bank country codes."""
|
|
176
|
+
|
|
177
|
+
def validate(self, value: str) -> tuple[bool, str | None]:
|
|
178
|
+
"""Validate World Bank format (2-3 letters)."""
|
|
179
|
+
if len(value) not in (2, 3) or not value.isalpha():
|
|
180
|
+
return False, "World Bank code must be 2-3 letters"
|
|
181
|
+
return True, None
|
|
182
|
+
|
|
183
|
+
def normalize(self, value: str) -> str:
|
|
184
|
+
"""Normalize to uppercase."""
|
|
185
|
+
return value.upper()
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
class DACValidator:
|
|
189
|
+
"""Validator for OECD DAC codes."""
|
|
190
|
+
|
|
191
|
+
def validate(self, value: str) -> tuple[bool, str | None]:
|
|
192
|
+
"""Validate DAC format (numeric)."""
|
|
193
|
+
if not value.isdigit():
|
|
194
|
+
return False, "DAC code must be numeric"
|
|
195
|
+
return True, None
|
|
196
|
+
|
|
197
|
+
def normalize(self, value: str) -> str:
|
|
198
|
+
"""Normalize preserves digits."""
|
|
199
|
+
return value
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
class FIPSValidator:
|
|
203
|
+
"""Validator for FIPS codes (deprecated but still used)."""
|
|
204
|
+
|
|
205
|
+
def validate(self, value: str) -> tuple[bool, str | None]:
|
|
206
|
+
"""Validate FIPS format (2 letters)."""
|
|
207
|
+
if len(value) != 2 or not value.isalpha():
|
|
208
|
+
return False, "FIPS must be 2 letters"
|
|
209
|
+
return True, None
|
|
210
|
+
|
|
211
|
+
def normalize(self, value: str) -> str:
|
|
212
|
+
"""Normalize to uppercase."""
|
|
213
|
+
return value.upper()
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
# Registry mapping each supported CodeSystem to one shared validator instance.
# Simple constant dict - no registration ceremony.
# Note: Using object type here to avoid circular import with protocols.
# All validators implement CodeValidatorProtocol structurally; each class
# above provides `validate` and `normalize`.
CODE_VALIDATORS: dict[CodeSystem, object] = {
    CodeSystem.ISO2: ISO2Validator(),
    CodeSystem.ISO3: ISO3Validator(),
    CodeSystem.ISO_NUMERIC: ISONumericValidator(),
    CodeSystem.DCID: DCIDValidator(),
    CodeSystem.M49: M49Validator(),
    CodeSystem.ISO3166_2: ISO31662Validator(),
    CodeSystem.NUTS: NUTSValidator(),
    CodeSystem.LAU: LAUValidator(),
    CodeSystem.PCODE: PCodeValidator(),
    CodeSystem.WIKIDATA: WikidataValidator(),
    CodeSystem.GEONAMES: GeoNamesValidator(),
    CodeSystem.WB: WorldBankValidator(),
    CodeSystem.DAC: DACValidator(),
    CodeSystem.FIPS: FIPSValidator(),
}
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def get_validator(system: CodeSystem) -> "CodeValidatorProtocol":
    """
    Look up the validator registered for a code system.

    Args:
        system: Code system enum value

    Returns:
        The shared validator instance stored in CODE_VALIDATORS

    Raises:
        KeyError: If code system not supported
    """
    # Plain indexing on purpose: an unknown system surfaces as KeyError.
    validator = CODE_VALIDATORS[system]
    return validator  # type: ignore[return-value]
|