resolvekit 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. resolvekit/README.md +134 -0
  2. resolvekit/__init__.py +67 -0
  3. resolvekit/api/README.md +165 -0
  4. resolvekit/api/__init__.py +10 -0
  5. resolvekit/api/convenience.py +53 -0
  6. resolvekit/api/resolver.py +457 -0
  7. resolvekit/builders/README.md +173 -0
  8. resolvekit/builders/__init__.py +0 -0
  9. resolvekit/calibration/README.md +351 -0
  10. resolvekit/calibration/__init__.py +12 -0
  11. resolvekit/calibration/calibrator.py +184 -0
  12. resolvekit/calibration/features.py +139 -0
  13. resolvekit/calibration/models.py +78 -0
  14. resolvekit/cli/README.md +215 -0
  15. resolvekit/cli/__init__.py +0 -0
  16. resolvekit/cli/main.py +18 -0
  17. resolvekit/config.py +128 -0
  18. resolvekit/constants.py +252 -0
  19. resolvekit/constraints/README.md +102 -0
  20. resolvekit/constraints/__init__.py +17 -0
  21. resolvekit/constraints/constraint_engine.py +111 -0
  22. resolvekit/constraints/hierarchy_validator.py +148 -0
  23. resolvekit/constraints/membership_validator.py +60 -0
  24. resolvekit/constraints/protocols.py +33 -0
  25. resolvekit/constraints/temporal_validator.py +43 -0
  26. resolvekit/constraints/type_validator.py +42 -0
  27. resolvekit/data/README.md +165 -0
  28. resolvekit/data/__init__.py +14 -0
  29. resolvekit/data/alias_repository.py +206 -0
  30. resolvekit/data/code_repository.py +85 -0
  31. resolvekit/data/context_filters.py +49 -0
  32. resolvekit/data/db_manager.py +196 -0
  33. resolvekit/data/entity_repository.py +466 -0
  34. resolvekit/data/membership_repository.py +107 -0
  35. resolvekit/data/query_builder.py +177 -0
  36. resolvekit/data/schema.py +122 -0
  37. resolvekit/disambiguation/README.md +72 -0
  38. resolvekit/disambiguation/__init__.py +0 -0
  39. resolvekit/extraction/README.md +204 -0
  40. resolvekit/extraction/__init__.py +0 -0
  41. resolvekit/matchers/README.md +77 -0
  42. resolvekit/matchers/__init__.py +65 -0
  43. resolvekit/matchers/alias_exact.py +65 -0
  44. resolvekit/matchers/canonical_name.py +62 -0
  45. resolvekit/matchers/cascade.py +127 -0
  46. resolvekit/matchers/code_validators.py +250 -0
  47. resolvekit/matchers/exact_code.py +177 -0
  48. resolvekit/matchers/fts_matcher.py +106 -0
  49. resolvekit/matchers/fuzzy_matcher.py +142 -0
  50. resolvekit/matchers/priorities.py +174 -0
  51. resolvekit/matchers/protocols.py +75 -0
  52. resolvekit/normalization/README.md +192 -0
  53. resolvekit/normalization/__init__.py +8 -0
  54. resolvekit/normalization/normalizer.py +164 -0
  55. resolvekit/overlays/README.md +226 -0
  56. resolvekit/overlays/__init__.py +0 -0
  57. resolvekit/types.py +534 -0
  58. resolvekit/utils/README.md +188 -0
  59. resolvekit/utils/__init__.py +48 -0
  60. resolvekit/utils/cache.py +109 -0
  61. resolvekit/utils/dates.py +339 -0
  62. resolvekit/utils/errors.py +145 -0
  63. resolvekit/utils/files.py +366 -0
  64. resolvekit/utils/logging.py +219 -0
  65. resolvekit/utils/text.py +475 -0
  66. resolvekit/utils/validation.py +301 -0
  67. resolvekit-0.0.1.dist-info/METADATA +36 -0
  68. resolvekit-0.0.1.dist-info/RECORD +70 -0
  69. resolvekit-0.0.1.dist-info/WHEEL +4 -0
  70. resolvekit-0.0.1.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,177 @@
1
+ """Exact code matcher for deterministic code lookups."""
2
+
3
+ from resolvekit.data.code_repository import CodeRepository
4
+ from resolvekit.matchers.code_validators import get_validator
5
+ from resolvekit.matchers.priorities import InferencePriority
6
+ from resolvekit.types import Candidate, CodeSystem, MatchContext, MatcherType
7
+
8
+
9
class ExactCodeMatcher:
    """
    Resolve a query that is an identifier code through an exact lookup.

    Described by the package as the Tier 1 matcher: the result is
    deterministic and contains at most one candidate.

    Supported query shapes:
    - Explicit code system: "iso2:US", "dcid:country/USA"
    - Inferred code system: "US", "840" (systems are tried in priority order)
    """

    def __init__(
        self,
        code_repo: CodeRepository,
        priority_order: list[CodeSystem] | None = None,
    ):
        """
        Initialize matcher.

        Args:
            code_repo: Code repository used for validation and lookup
            priority_order: Optional priority order for code system inference.
                If None (or empty), uses InferencePriority.default()
        """
        self.code_repo = code_repo
        # `or` means an empty list also falls back to the default ordering.
        self.priority = priority_order or InferencePriority.default()

    def match(
        self,
        query: str,
        normalized_query: str,
        limit: int = 10,
        context: MatchContext | None = None,
    ) -> list[Candidate]:
        """
        Try to parse query as code and lookup.

        Args:
            query: Original query string
            normalized_query: Normalized query (unused for code matching)
            limit: Maximum candidates (unused, always returns 0 or 1)
            context: Optional filtering context, forwarded to the repository

        Returns:
            List with single candidate if match found, empty list otherwise
        """
        # Parse code: "system:value" or just "value"
        systems, value = self._parse_code(query)

        if not systems or value is None:
            return []

        # Try each candidate system in priority order until lookup succeeds
        for system in systems:
            # Skip systems whose format check rejects the value
            is_valid, _ = self.code_repo.validate_code(system, value)
            if not is_valid:
                continue

            entity = self.code_repo.find_by_code(system, value, context)
            if entity:
                # First hit wins: exact code matches always score 1.0
                return [
                    Candidate(
                        entity=entity,
                        score=1.0,
                        matcher_type=MatcherType.EXACT_CODE,
                        features={"exact_code": True, "code_system": system.value},
                        matched_alias=None,
                    )
                ]

        # No match found in any candidate system
        return []

    def _parse_code(self, query: str) -> tuple[list[CodeSystem], str | None]:
        """
        Parse query as code with optional system prefix.

        Formats:
        - "system:value" -> explicit system (single-item list)
        - "value" -> infer systems from format (priority-ordered list)

        Args:
            query: Query string

        Returns:
            Tuple of (code_systems_list, code_value) or ([], None) if unparseable
        """
        # Split on the first ":" only, so values may themselves contain ":".
        system_str, separator, value = query.partition(":")
        if separator:
            try:
                return [CodeSystem(system_str.lower())], value
            except ValueError:
                # Prefix is not a known code system
                return [], None

        # Infer systems from format (returns priority-ordered list)
        systems = self._infer_code_system(query)
        if not systems:
            # Explicit branch: a bare conditional expression after the comma
            # would bind only to the second tuple element and yield
            # ([], ([], None)) instead of ([], None).
            return [], None
        return systems, query

    def _infer_code_system(self, value: str) -> list[CodeSystem]:
        """
        Infer candidate code systems from the format of a bare value.

        Strategy:
        1. Fast path: cheap string checks for distinctive patterns.
           If one matches, return it as the only candidate.
        2. Priority iteration: collect every system in self.priority whose
           validator accepts the value, preserving priority order.

        Args:
            value: Code value

        Returns:
            List of candidate code systems in priority order (empty if none match)
        """
        fast_path_result = self._fast_path_inference(value)
        if fast_path_result is not None:
            return [fast_path_result]

        candidates = []
        for system in self.priority:
            # DCID and Wikidata are decided exclusively by the fast path
            if system in {CodeSystem.DCID, CodeSystem.WIKIDATA}:
                continue

            validator = get_validator(system)
            is_valid, _ = validator.validate(value)
            if is_valid:
                candidates.append(system)

        return candidates

    def _fast_path_inference(self, value: str) -> CodeSystem | None:
        """
        Detect distinctive code patterns with plain string operations.

        Checks, in order:
        - contains "/" -> DCID (e.g., "country/USA", "geoId/06")
        - "Q"/"q" followed only by digits -> Wikidata (e.g., "Q30")
        - alphabetic 2-3 character prefix before the first "-" -> ISO 3166-2
          (e.g., "US-CA")

        Args:
            value: Code value

        Returns:
            Code system if a distinctive pattern is detected, None otherwise
        """
        if "/" in value:
            return CodeSystem.DCID

        if len(value) > 1 and value[0].upper() == "Q" and value[1:].isdigit():
            return CodeSystem.WIKIDATA

        if "-" in value:
            parts = value.split("-", 1)
            if len(parts) == 2 and 2 <= len(parts[0]) <= 3 and parts[0].isalpha():
                return CodeSystem.ISO3166_2

        return None
@@ -0,0 +1,106 @@
1
+ """FTS matcher using SQLite FTS5 for text search."""
2
+
3
+ from resolvekit.data.alias_repository import AliasRepository
4
+ from resolvekit.types import Candidate, MatchContext, MatcherType
5
+
6
+
7
class FTSMatcher:
    """
    Full-text search matcher backed by the alias repository.

    Described by the package as a Tier 3 matcher returning the top-K ranked
    candidates. The search itself is delegated to
    ``AliasRepository.search_fts``; the requested limit is passed straight
    through to it rather than applied in Python.
    """

    def __init__(self, alias_repo: AliasRepository):
        """
        Initialize matcher.

        Args:
            alias_repo: Alias repository that executes the FTS queries
        """
        self.alias_repo = alias_repo

    def match(
        self,
        query: str,
        normalized_query: str,
        limit: int = 50,
        context: MatchContext | None = None,
    ) -> list[Candidate]:
        """
        Run a full-text search and wrap every hit in a Candidate.

        Args:
            query: Original query string (unused)
            normalized_query: Normalized query string sent to the repository
            limit: Maximum candidates to request (default 50)
            context: Optional filtering context, forwarded to the repository

        Returns:
            One candidate per hit, in the order the repository returned them;
            an empty list when there are no hits
        """
        hits = self.alias_repo.search_fts(
            query=normalized_query, limit=limit, context=context
        )
        if not hits:
            return []

        hit_count = len(hits)
        # Each hit is an (entity, raw BM25 score, rank) triple.
        return [
            Candidate(
                entity=entity,
                score=self._normalize_bm25(raw_score, position, hit_count),
                matcher_type=MatcherType.FTS,
                features={"fts_score": raw_score, "fts_rank": position},
                matched_alias=None,
            )
            for entity, raw_score, position in hits
        ]

    def _normalize_bm25(self, bm25_score: float, rank: int, total: int) -> float:
        """
        Map a hit onto a fixed 0-1 score based on its rank tier.

        Placeholder heuristic: only ``rank`` is consulted. ``bm25_score`` and
        ``total`` are accepted but currently ignored.

        Args:
            bm25_score: Raw BM25 score (ignored)
            rank: Rank position of the hit
            total: Total number of results (ignored)

        Returns:
            0.9 for rank 1, 0.8 up to rank 3, 0.7 up to rank 10,
            0.6 up to rank 20, otherwise 0.5
        """
        # Rank 1 is an equality test, not an upper bound, so it stays outside
        # the threshold table below.
        if rank == 1:
            return 0.9

        for upper_bound, tier_score in ((3, 0.8), (10, 0.7), (20, 0.6)):
            if rank <= upper_bound:
                return tier_score

        return 0.5
@@ -0,0 +1,142 @@
1
+ """Fuzzy matcher for refining FTS results with similarity scoring."""
2
+
3
+ from rapidfuzz import fuzz
4
+
5
+ from resolvekit.normalization.normalizer import TextNormalizer
6
+ from resolvekit.types import Candidate, MatchContext
7
+
8
+
9
class FuzzyMatcher:
    """
    Bounded fuzzy matching on FTS candidates.

    Described by the package as a Tier 3 matcher that refines FTS results and
    returns the top-K'. It scores only the candidates handed to it, using
    rapidfuzz's ``fuzz.ratio`` plus a trigram Jaccard similarity.
    """

    def __init__(self, normalizer: TextNormalizer):
        """
        Initialize matcher.

        Args:
            normalizer: Text normalizer applied to canonical names before
                they are compared with the normalized query
        """
        self.normalizer = normalizer

    def match(
        self,
        query: str,
        normalized_query: str,
        limit: int = 12,
        context: MatchContext | None = None,
        fts_candidates: list[Candidate] | None = None,
    ) -> list[Candidate]:
        """
        Refine FTS candidates with fuzzy scoring.

        Combined score: 0.6 * edit similarity + 0.4 * trigram Jaccard.
        The candidates are updated in place: the three fuzzy features are
        added to ``features`` and ``score`` is raised to the fuzzy score when
        that is higher than the incoming score.

        Args:
            query: Original query string (unused)
            normalized_query: Normalized query string
            limit: Maximum candidates to return (top-K')
            context: Optional filtering context (unused)
            fts_candidates: FTS candidates to refine

        Returns:
            The refined candidates ordered by score (descending), truncated
            to ``limit``; an empty list when no candidates were supplied
        """
        if not fts_candidates:
            return []

        # The query side is identical for every candidate: build it once.
        query_trigrams = self._get_trigrams(normalized_query)

        scored = []
        for candidate in fts_candidates:
            # Normalize the name once and reuse it for both similarity measures.
            normalized_name = self.normalizer.normalize(
                candidate.entity.canonical_name
            )

            # rapidfuzz.fuzz.ratio returns 0-100; rescale to 0-1
            edit_score = fuzz.ratio(normalized_query, normalized_name) / 100.0

            trigram_score = self._jaccard(
                query_trigrams, self._get_trigrams(normalized_name)
            )

            fuzzy_score = 0.6 * edit_score + 0.4 * trigram_score

            candidate.features.update(
                {
                    "edit_similarity": edit_score,
                    "trigram_jaccard": trigram_score,
                    "fuzzy_score": fuzzy_score,
                }
            )

            # Never lower a score the candidate already carries
            candidate.score = max(candidate.score, fuzzy_score)

            scored.append(candidate)

        scored.sort(key=lambda c: c.score, reverse=True)
        return scored[:limit]

    def _trigram_jaccard(self, s1: str, s2: str) -> float:
        """
        Compute trigram Jaccard similarity.

        Trigrams are 3-character sequences.
        Jaccard = |intersection| / |union|

        Args:
            s1: First string, used as given
            s2: Second string, normalized with ``self.normalizer`` first

        Returns:
            Jaccard similarity (0-1); 0.0 when either string has no trigrams
        """
        return self._jaccard(
            self._get_trigrams(s1),
            self._get_trigrams(self.normalizer.normalize(s2)),
        )

    @staticmethod
    def _jaccard(left: set[str], right: set[str]) -> float:
        """
        Return |left & right| / |left | right|, or 0.0 if either set is empty.

        Args:
            left: First trigram set
            right: Second trigram set

        Returns:
            Jaccard similarity (0-1)
        """
        if not left or not right:
            return 0.0
        return len(left & right) / len(left | right)

    def _get_trigrams(self, s: str) -> set[str]:
        """
        Generate trigrams from string.

        Examples:
            "abc" -> {"abc"}
            "abcd" -> {"abc", "bcd"}

        Args:
            s: Input string

        Returns:
            Set of trigrams (empty for strings shorter than 3 characters)
        """
        if len(s) < 3:
            return set()

        return {s[i : i + 3] for i in range(len(s) - 2)}
@@ -0,0 +1,174 @@
1
+ """Code system inference priority presets for different domains."""
2
+
3
+ from resolvekit.types import CodeSystem
4
+
5
+
6
class InferencePriority:
    """
    Named presets for the order in which code systems are tried.

    A bare code can be valid in more than one system (the package's example:
    a 3-letter code such as "USA" could be ISO3 or World Bank), so the order
    decides which interpretation is attempted first. Every preset returns a
    freshly built list containing the same fourteen systems in a different
    order.

    Usage:
        # Use a preset
        cascade = MatcherCascade(..., code_priority=InferencePriority.humanitarian())

        # Or customize
        custom_priority = [CodeSystem.WB, CodeSystem.ISO2, CodeSystem.ISO3]
        cascade = MatcherCascade(..., code_priority=custom_priority)
    """

    @staticmethod
    def default() -> list[CodeSystem]:
        """
        General-purpose priority order.

        Order: distinctive patterns (DCID, Wikidata), then common standards
        (ISO codes, M49), then domain-specific systems, with FIPS last.
        Intended as the fallback when no domain context is specified.
        """
        cs = CodeSystem
        distinctive = [cs.DCID, cs.WIKIDATA]
        common_standards = [cs.ISO2, cs.ISO_NUMERIC, cs.M49, cs.ISO3, cs.ISO3166_2]
        domain_specific = [
            cs.GEONAMES,
            cs.WB,
            cs.DAC,
            cs.NUTS,
            cs.LAU,
            cs.PCODE,
            cs.FIPS,
        ]
        return distinctive + common_standards + domain_specific

    @staticmethod
    def humanitarian() -> list[CodeSystem]:
        """
        Humanitarian/OCHA context priority.

        P-codes and M49 are moved directly behind the distinctive patterns,
        ahead of the ISO codes. Intended for UN OCHA data, humanitarian
        response datasets and emergency coordination systems.
        """
        cs = CodeSystem
        distinctive = [cs.DCID, cs.WIKIDATA]
        promoted = [cs.PCODE, cs.M49]
        remainder = [
            cs.ISO2,
            cs.ISO3,
            cs.ISO_NUMERIC,
            cs.ISO3166_2,
            cs.GEONAMES,
            cs.WB,
            cs.DAC,
            cs.NUTS,
            cs.LAU,
            cs.FIPS,
        ]
        return distinctive + promoted + remainder

    @staticmethod
    def world_bank() -> list[CodeSystem]:
        """
        World Bank context priority.

        World Bank and OECD DAC codes are moved directly behind the
        distinctive patterns, ahead of the ISO codes. Intended for World Bank
        datasets, development indicators and OECD DAC data.
        """
        cs = CodeSystem
        distinctive = [cs.DCID, cs.WIKIDATA]
        promoted = [cs.WB, cs.DAC]
        remainder = [
            cs.ISO2,
            cs.ISO3,
            cs.ISO_NUMERIC,
            cs.M49,
            cs.ISO3166_2,
            cs.GEONAMES,
            cs.PCODE,
            cs.NUTS,
            cs.LAU,
            cs.FIPS,
        ]
        return distinctive + promoted + remainder

    @staticmethod
    def european_union() -> list[CodeSystem]:
        """
        European Union statistical context priority.

        NUTS and LAU codes are moved directly behind the distinctive
        patterns, ahead of the ISO codes. Intended for Eurostat datasets, EU
        regional statistics and NUTS/LAU hierarchies.
        """
        cs = CodeSystem
        distinctive = [cs.DCID, cs.WIKIDATA]
        promoted = [cs.NUTS, cs.LAU]
        remainder = [
            cs.ISO2,
            cs.ISO3,
            cs.ISO3166_2,
            cs.ISO_NUMERIC,
            cs.M49,
            cs.GEONAMES,
            cs.WB,
            cs.DAC,
            cs.PCODE,
            cs.FIPS,
        ]
        return distinctive + promoted + remainder

    @staticmethod
    def academic() -> list[CodeSystem]:
        """
        Academic/research context priority.

        Wikidata leads (ahead of DCID) and GeoNames follows immediately,
        before the ISO codes. Intended for research datasets, Wikidata-based
        knowledge graphs, GeoNames geographic data and academic publications.
        """
        cs = CodeSystem
        promoted = [cs.WIKIDATA, cs.DCID, cs.GEONAMES]
        remainder = [
            cs.ISO2,
            cs.ISO3,
            cs.ISO_NUMERIC,
            cs.M49,
            cs.ISO3166_2,
            cs.WB,
            cs.DAC,
            cs.NUTS,
            cs.LAU,
            cs.PCODE,
            cs.FIPS,
        ]
        return promoted + remainder
@@ -0,0 +1,75 @@
1
+ """Protocols for matcher implementations."""
2
+
3
+ from typing import Protocol
4
+
5
+ from resolvekit.types import Candidate, MatchContext
6
+
7
+
8
class MatcherProtocol(Protocol):
    """
    Structural interface shared by the matchers.

    No inheritance is required: a class conforms simply by providing a
    ``match()`` method with a compatible signature.
    """

    def match(
        self,
        query: str,
        normalized_query: str,
        limit: int = 10,
        context: MatchContext | None = None,
    ) -> list[Candidate]:
        """
        Return the candidates that match a query.

        Args:
            query: Query string as originally supplied
            normalized_query: The same query after text normalization
            limit: Upper bound on the number of candidates returned
            context: Optional filtering context

        Returns:
            Candidates ordered from highest to lowest score
        """
        ...
37
+
38
+
39
class CodeValidatorProtocol(Protocol):
    """
    Structural interface for per-code-system validators.

    One implementation is expected per code system (ISO2, ISO3, M49, ...);
    each can check a code's format and bring it into canonical form.
    """

    def validate(self, value: str) -> tuple[bool, str | None]:
        """
        Check whether a value is well-formed for this code system.

        Args:
            value: Code value to check

        Returns:
            ``(is_valid, error_message)``; ``error_message`` is None when the
            value is valid
        """
        ...

    def normalize(self, value: str) -> str:
        """
        Convert a code to its canonical spelling.

        Examples:
            - "us" -> "US" (ISO2)
            - "fra" -> "FRA" (ISO3)
            - "Q123" -> "Q123" (Wikidata, case-sensitive)

        Args:
            value: Code value to convert

        Returns:
            The canonical form of the code
        """
        ...