resolvekit 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. resolvekit/README.md +134 -0
  2. resolvekit/__init__.py +67 -0
  3. resolvekit/api/README.md +165 -0
  4. resolvekit/api/__init__.py +10 -0
  5. resolvekit/api/convenience.py +53 -0
  6. resolvekit/api/resolver.py +457 -0
  7. resolvekit/builders/README.md +173 -0
  8. resolvekit/builders/__init__.py +0 -0
  9. resolvekit/calibration/README.md +351 -0
  10. resolvekit/calibration/__init__.py +12 -0
  11. resolvekit/calibration/calibrator.py +184 -0
  12. resolvekit/calibration/features.py +139 -0
  13. resolvekit/calibration/models.py +78 -0
  14. resolvekit/cli/README.md +215 -0
  15. resolvekit/cli/__init__.py +0 -0
  16. resolvekit/cli/main.py +18 -0
  17. resolvekit/config.py +128 -0
  18. resolvekit/constants.py +252 -0
  19. resolvekit/constraints/README.md +102 -0
  20. resolvekit/constraints/__init__.py +17 -0
  21. resolvekit/constraints/constraint_engine.py +111 -0
  22. resolvekit/constraints/hierarchy_validator.py +148 -0
  23. resolvekit/constraints/membership_validator.py +60 -0
  24. resolvekit/constraints/protocols.py +33 -0
  25. resolvekit/constraints/temporal_validator.py +43 -0
  26. resolvekit/constraints/type_validator.py +42 -0
  27. resolvekit/data/README.md +165 -0
  28. resolvekit/data/__init__.py +14 -0
  29. resolvekit/data/alias_repository.py +206 -0
  30. resolvekit/data/code_repository.py +85 -0
  31. resolvekit/data/context_filters.py +49 -0
  32. resolvekit/data/db_manager.py +196 -0
  33. resolvekit/data/entity_repository.py +466 -0
  34. resolvekit/data/membership_repository.py +107 -0
  35. resolvekit/data/query_builder.py +177 -0
  36. resolvekit/data/schema.py +122 -0
  37. resolvekit/disambiguation/README.md +72 -0
  38. resolvekit/disambiguation/__init__.py +0 -0
  39. resolvekit/extraction/README.md +204 -0
  40. resolvekit/extraction/__init__.py +0 -0
  41. resolvekit/matchers/README.md +77 -0
  42. resolvekit/matchers/__init__.py +65 -0
  43. resolvekit/matchers/alias_exact.py +65 -0
  44. resolvekit/matchers/canonical_name.py +62 -0
  45. resolvekit/matchers/cascade.py +127 -0
  46. resolvekit/matchers/code_validators.py +250 -0
  47. resolvekit/matchers/exact_code.py +177 -0
  48. resolvekit/matchers/fts_matcher.py +106 -0
  49. resolvekit/matchers/fuzzy_matcher.py +142 -0
  50. resolvekit/matchers/priorities.py +174 -0
  51. resolvekit/matchers/protocols.py +75 -0
  52. resolvekit/normalization/README.md +192 -0
  53. resolvekit/normalization/__init__.py +8 -0
  54. resolvekit/normalization/normalizer.py +164 -0
  55. resolvekit/overlays/README.md +226 -0
  56. resolvekit/overlays/__init__.py +0 -0
  57. resolvekit/types.py +534 -0
  58. resolvekit/utils/README.md +188 -0
  59. resolvekit/utils/__init__.py +48 -0
  60. resolvekit/utils/cache.py +109 -0
  61. resolvekit/utils/dates.py +339 -0
  62. resolvekit/utils/errors.py +145 -0
  63. resolvekit/utils/files.py +366 -0
  64. resolvekit/utils/logging.py +219 -0
  65. resolvekit/utils/text.py +475 -0
  66. resolvekit/utils/validation.py +301 -0
  67. resolvekit-0.0.1.dist-info/METADATA +36 -0
  68. resolvekit-0.0.1.dist-info/RECORD +70 -0
  69. resolvekit-0.0.1.dist-info/WHEEL +4 -0
  70. resolvekit-0.0.1.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,351 @@
1
+ # Calibration Module
2
+
3
+ ## Purpose
4
+
5
+ The calibration module converts raw matcher scores into calibrated confidence probabilities that reflect actual precision, enabling reliable threshold-based decision making.
6
+
7
+ ## Components
8
+
9
+ ### Core Components
10
+
11
+ **Currently Implemented:**
12
+
13
+ 1. **CalibrationModel** (`models.py`)
14
+ - Pydantic schema for calibration model configuration
15
+ - JSON loading and validation
16
+ - Model metadata (ECE, training date, notes)
17
+
18
+ 2. **Feature Extractor** (`features.py`)
19
+ - Extracts standardized features from candidates
20
+ - Converts matcher-specific features to normalized values
21
+ - Handles missing features with sentinel values
22
+ - Optional numpy vectorization for batch processing
23
+
24
+ 3. **Calibrator** (`calibrator.py`)
25
+ - Applies calibration models to candidates
26
+ - Returns probability P(correct match | features)
27
+ - Supports logistic regression models
28
+ - Heuristic fallback when no model available
29
+ - Batch processing with optional numpy acceleration
30
+
31
+ **Planned for Future Phases:**
32
+
33
+ 4. **Model Trainer** (`trainer.py` - not yet implemented)
34
+ - Trains calibration models on labeled data
35
+ - Evaluates calibration quality (ECE - Expected Calibration Error)
36
+ - Exports models to JSON for data packs
37
+
38
+ 5. **Score Fusion** (`fusion.py` - not yet implemented)
39
+ - Combines scores from multiple matchers
40
+ - Handles missing features (when matchers not invoked)
41
+ - Normalizes scores to comparable scales
42
+
43
+ ## Feature Set
44
+
45
+ Features extracted per (query, candidate) pair:
46
+
47
+ ### Match Type Features (binary)
48
+ - `f_exact_code`: Matched via exact code lookup
49
+ - `f_canonical_exact`: Matched via canonical name
50
+ - `f_alias_exact`: Matched via exact alias
51
+ - `f_alias_type_*`: One-hot encoding of alias type (canonical/endonym/exonym/abbr/code)
52
+
53
+ ### Similarity Features (continuous)
54
+ - `f_fts_score`: BM25-like score from FTS
55
+ - `f_fts_rank_inv`: Inverse rank (1/rank)
56
+ - `f_edit_similarity`: Similarity derived from normalized edit distance
57
+ - `f_trigram_jaccard`: Trigram Jaccard similarity
58
+
59
+ ### Constraint Features (binary)
60
+ - `f_parent_valid`: Passes parent constraint
61
+ - `f_type_valid`: Passes type constraint
62
+ - `f_date_valid`: Passes temporal constraint
63
+
64
+ ### Semantic Features (optional)
65
+ - `f_sem_used`: Semantic matcher was invoked
66
+ - `f_sem_sim`: Cosine similarity from embeddings (0-1)
67
+
68
+ ### Context Features
69
+ - `f_ambiguity_flag`: Query in ambiguity registry
70
+ - `f_region_hint_match`: Candidate matches region hint
71
+
72
+ ## Calibration Model
73
+
74
+ ### Logistic Regression (default)
75
+
76
+ ```python
77
+ import numpy as np
78
+
79
+ def calibrate_logistic(features: np.ndarray, weights: np.ndarray, bias: float) -> float:
80
+ """Compute calibrated probability via logistic regression."""
81
+ logit = np.dot(weights, features) + bias
82
+ probability = 1 / (1 + np.exp(-logit))
83
+ return probability
84
+ ```
85
+
86
+ ### Model Format (JSON)
87
+
88
+ ```json
89
+ {
90
+ "type": "logistic",
91
+ "features": ["f_exact_code", "f_canonical_exact", ...],
92
+ "weights": [2.3, 1.6, 0.9, ...],
93
+ "bias": -1.4,
94
+ "ece": 0.032,
95
+ "trained_on": "2025-10-15",
96
+ "notes": "Global calibration; overlays may append."
97
+ }
98
+ ```
99
+
100
+ ## Calibration Quality Metrics
101
+
102
+ - **ECE (Expected Calibration Error)**: Measures deviation between predicted probabilities and observed accuracy
103
+ - **Brier Score**: Mean squared error of probabilistic predictions
104
+ - **Reliability Diagram**: Plots predicted vs observed probabilities in bins
105
+
106
+ Target: ECE < 0.05 (well-calibrated)
107
+
108
+ ## Design Principles
109
+
110
+ 1. **Interpretable**: Use simple models (logistic) for explainability
111
+ 2. **Calibrated**: Probabilities should match empirical accuracy
112
+ 3. **Versioned**: Models versioned with data packs
113
+ 4. **Updatable**: Overlays can fine-tune calibration with delta weights
114
+
115
+ ## Usage Examples
116
+
117
+ ### Basic Usage - Heuristic Fallback
118
+
119
+ When no calibration model is available, the calibrator uses rule-based heuristics:
120
+
121
+ ```python
122
+ from resolvekit.calibration import Calibrator
123
+ from resolvekit.types import Candidate, Entity, EntityType, MatcherType
124
+
125
+ # Create calibrator without model (uses heuristic)
126
+ calibrator = Calibrator(model=None)
127
+
128
+ # Example candidate from exact code matcher
129
+ candidate = Candidate(
130
+ entity=Entity(
131
+ dcid="country/USA",
132
+ canonical_name="United States",
133
+ entity_type=EntityType.COUNTRY,
134
+ ),
135
+ score=1.0,
136
+ matcher_type=MatcherType.EXACT_CODE,
137
+ features={"exact_code": True, "code_system": "iso3"},
138
+ )
139
+
140
+ # Calibrate to get confidence probability
141
+ confidence = calibrator.calibrate(candidate)
142
+ print(f"Confidence: {confidence}") # 0.95 for exact code matches
143
+ ```
144
+
145
+ ### Loading and Using a Trained Model
146
+
147
+ Load a calibration model from a data pack and use it for calibration:
148
+
149
+ ```python
150
+ from pathlib import Path
151
+ from resolvekit.calibration import Calibrator, load_calibration_model
152
+
153
+ # Load model from JSON file in data pack
154
+ model_path = Path("data/base/calibration_model.json")
155
+ model = load_calibration_model(model_path)
156
+
157
+ if model:
158
+ print(f"Loaded model with {len(model.features)} features")
159
+ print(f"ECE: {model.ece}") # Expected Calibration Error
160
+
161
+ # Create calibrator with model
162
+ calibrator = Calibrator(model=model)
163
+
164
+ # Calibrate candidates using logistic regression
165
+ confidence = calibrator.calibrate(candidate)
166
+ print(f"Model-calibrated confidence: {confidence}")
167
+ else:
168
+ print("Model not found, falling back to heuristic")
169
+ calibrator = Calibrator(model=None)
170
+ ```
171
+
172
+ ### Batch Processing for Efficiency
173
+
174
+ Process multiple candidates efficiently using batch calibration:
175
+
176
+ ```python
177
+ from resolvekit.calibration import Calibrator
178
+
179
+ # Prepare multiple candidates
180
+ candidates = [
181
+ Candidate(
182
+ entity=Entity(dcid="country/USA", canonical_name="United States",
183
+ entity_type=EntityType.COUNTRY),
184
+ score=1.0,
185
+ matcher_type=MatcherType.EXACT_CODE,
186
+ features={"exact_code": True},
187
+ ),
188
+ Candidate(
189
+ entity=Entity(dcid="country/FRA", canonical_name="France",
190
+ entity_type=EntityType.COUNTRY),
191
+ score=0.9,
192
+ matcher_type=MatcherType.CANONICAL_NAME,
193
+ features={"canonical_exact": True},
194
+ ),
195
+ Candidate(
196
+ entity=Entity(dcid="country/DEU", canonical_name="Germany",
197
+ entity_type=EntityType.COUNTRY),
198
+ score=0.75,
199
+ matcher_type=MatcherType.FUZZY,
200
+ features={"fuzzy_score": 0.78, "fts_score": 0.82},
201
+ ),
202
+ ]
203
+
204
+ calibrator = Calibrator(model=None)
205
+
206
+ # Batch calibration (heuristic mode uses a Python loop; numpy vectorization applies only when a model is loaded)
207
+ confidences = calibrator.calibrate_batch(candidates)
208
+
209
+ # Results correspond to input order
210
+ for candidate, confidence in zip(candidates, confidences):
211
+ print(f"{candidate.entity.canonical_name}: {confidence:.2f}")
212
+ # Output:
213
+ # United States: 0.95
214
+ # France: 0.90
215
+ # Germany: 0.73
216
+ ```
217
+
218
+ ### Feature Extraction
219
+
220
+ Extract standardized features from candidates for analysis or custom calibration:
221
+
222
+ ```python
223
+ from resolvekit.calibration import FeatureExtractor
224
+
225
+ extractor = FeatureExtractor()
226
+
227
+ # Extract features from a candidate
228
+ features = extractor.extract(candidate)
229
+
230
+ # Features dict contains standardized feature values
231
+ print(features)
232
+ # {
233
+ # 'f_exact_code': 1.0,
234
+ # 'f_canonical_exact': 0.0,
235
+ # 'f_alias_exact': 0.0,
236
+ # 'f_fts_score': -1.0, # Sentinel value (not used)
237
+ # 'f_fts_rank_inv': -1.0,
238
+ # 'f_edit_similarity': -1.0,
239
+ # 'f_trigram_jaccard': -1.0,
240
+ # 'f_fuzzy_score': -1.0,
241
+ # 'f_parent_valid': -1.0, # Sentinel value (constraint not checked)
242
+ # 'f_parent_depth': -1.0,
243
+ # 'f_type_valid': -1.0,
244
+ # 'f_date_valid': -1.0,
245
+ # 'f_membership_valid': -1.0,
246
+ # }
247
+
248
+ # Extract features from multiple candidates
249
+ feature_dicts = extractor.extract_batch(candidates)
250
+ # Returns list of dicts (or numpy array if numpy available)
251
+ ```
252
+
253
+ ### Creating a Calibration Model
254
+
255
+ Define a custom calibration model programmatically:
256
+
257
+ ```python
258
+ from resolvekit.calibration import CalibrationModel, Calibrator
259
+ from datetime import date
260
+
261
+ # Create a simple logistic regression model
262
+ model = CalibrationModel(
263
+ type="logistic",
264
+ features=["f_exact_code", "f_canonical_exact", "f_fts_score"],
265
+ weights=[4.0, 3.5, 1.5],
266
+ bias=-2.0,
267
+ ece=0.025,
268
+ trained_on=date(2025, 10, 15),
269
+ notes="Custom calibration model",
270
+ )
271
+
272
+ # Use the model
273
+ calibrator = Calibrator(model=model)
274
+ confidence = calibrator.calibrate(candidate)
275
+ ```
276
+
277
+ ### Integration with Resolver Pipeline
278
+
279
+ Typical usage within the resolver cascade:
280
+
281
+ ```python
282
+ from resolvekit.calibration import Calibrator, load_calibration_model
283
+ from pathlib import Path
284
+
285
+ # Initialize calibrator at resolver startup
286
+ model = load_calibration_model(Path("data/base/calibration_model.json"))
287
+ calibrator = Calibrator(model=model)
288
+
289
+ # During resolution, matchers populate candidate.features
290
+ # Then calibrator converts to confidence probabilities
291
+
292
+ def resolve(query: str) -> list[tuple[Candidate, float]]:
293
+ # 1. Matchers generate candidates with features
294
+ candidates = run_matcher_cascade(query)
295
+
296
+ # 2. Calibrator converts to confidence probabilities
297
+ confidences = calibrator.calibrate_batch(candidates)
298
+
299
+ # 3. Return ranked results
300
+ results = list(zip(candidates, confidences))
301
+ results.sort(key=lambda x: x[1], reverse=True) # Sort by confidence
302
+
303
+ return results
304
+ ```
305
+
306
+ ### Heuristic Scoring Rules
307
+
308
+ When using heuristic mode (no model), confidence scores follow these rules:
309
+
310
+ - **Tier 1 - Exact Matches** (0.85-0.95):
311
+ - Exact code match: 0.95
312
+ - Canonical name exact: 0.90
313
+ - Alias exact: 0.85
314
+
315
+ - **Tier 2 - Fuzzy Matches** (0.5-0.8):
316
+ - Base: 0.5 + (fuzzy_score × 0.3)
317
+ - Example: fuzzy_score=0.78 → confidence=0.734
318
+
319
+ - **Tier 3 - FTS Only** (0.3-0.6):
320
+ - Base: 0.3 + (fts_score × 0.3)
321
+ - Example: fts_score=0.7 → confidence=0.51
322
+
323
+ - **Fallback**: Returns raw matcher score if no features match
324
+
325
+ ### Performance Considerations
326
+
327
+ ```python
328
+ # Batch processing is faster for multiple candidates
329
+ import time
330
+
331
+ # Single-item processing
332
+ start = time.time()
333
+ for candidate in candidates:
334
+ confidence = calibrator.calibrate(candidate)
335
+ single_time = time.time() - start
336
+
337
+ # Batch processing (with numpy)
338
+ start = time.time()
339
+ confidences = calibrator.calibrate_batch(candidates)
340
+ batch_time = time.time() - start
341
+
342
+ print(f"Single: {single_time:.4f}s")
343
+ print(f"Batch: {batch_time:.4f}s")
344
+ print(f"Speedup: {single_time / batch_time:.1f}x")
345
+ # With numpy: 5-10x speedup for 100+ candidates
346
+ # Without numpy: Similar performance (both use Python loop)
347
+ ```
348
+
349
+ ## Implementation Priority
350
+
351
+ **Phase A** - Core resolver
@@ -0,0 +1,12 @@
1
+ """Calibration module for converting scores to probabilities."""
2
+
3
+ from resolvekit.calibration.calibrator import Calibrator
4
+ from resolvekit.calibration.features import FeatureExtractor
5
+ from resolvekit.calibration.models import CalibrationModel, load_calibration_model
6
+
7
+ __all__ = [
8
+ "CalibrationModel",
9
+ "Calibrator",
10
+ "FeatureExtractor",
11
+ "load_calibration_model",
12
+ ]
@@ -0,0 +1,184 @@
1
+ """Calibration service for converting scores to probabilities."""
2
+
3
+ import math
4
+
5
+ from resolvekit.calibration.features import FeatureExtractor
6
+ from resolvekit.calibration.models import CalibrationModel
7
+ from resolvekit.types import Candidate
8
+
9
+ try:
10
+ import numpy as np
11
+
12
+ HAS_NUMPY = True
13
+ except ImportError:
14
+ HAS_NUMPY = False
15
+
16
+
17
+ class Calibrator:
18
+ """
19
+ Applies calibration models to candidates.
20
+
21
+ Supports logistic regression models or heuristic fallback when no model available.
22
+ """
23
+
24
+ def __init__(self, model: CalibrationModel | None = None):
25
+ """
26
+ Initialize calibrator.
27
+
28
+ Args:
29
+ model: Optional calibration model. If None, uses heuristic fallback.
30
+ """
31
+ self.model = model
32
+ self.feature_extractor = FeatureExtractor()
33
+
34
+ def calibrate(self, candidate: Candidate) -> float:
35
+ """
36
+ Calibrate single candidate to confidence probability.
37
+
38
+ Args:
39
+ candidate: Candidate to calibrate
40
+
41
+ Returns:
42
+ Calibrated confidence in [0.0, 1.0]
43
+ """
44
+ if self.model:
45
+ return self._apply_logistic(candidate)
46
+ else:
47
+ # Use heuristic fallback
48
+ return self._apply_heuristic(candidate)
49
+
50
+ def _apply_heuristic(self, candidate: Candidate) -> float:
51
+ """
52
+ Apply rule-based heuristic scoring.
53
+
54
+ Tier 1: Exact matches (0.85-0.95)
55
+ Tier 2: Fuzzy matches (0.5-0.8)
56
+ Tier 3: FTS only (0.3-0.6)
57
+ Fallback: Raw score
58
+
59
+ Args:
60
+ candidate: Candidate to score
61
+
62
+ Returns:
63
+ Heuristic confidence
64
+ """
65
+ features = candidate.features
66
+
67
+ # Tier 1: Exact matches
68
+ if features.get("exact_code"):
69
+ return 0.95
70
+ if features.get("canonical_exact"):
71
+ return 0.90
72
+ if features.get("alias_exact"):
73
+ return 0.85
74
+
75
+ # Tier 2: Fuzzy matches
76
+ fuzzy_score = features.get("fuzzy_score")
77
+ if fuzzy_score is not None and fuzzy_score >= 0:
78
+ return float(0.5 + (fuzzy_score * 0.3))
79
+
80
+ # Tier 3: FTS only
81
+ fts_score = features.get("fts_score")
82
+ if fts_score is not None and fts_score >= 0:
83
+ return float(0.3 + (min(fts_score, 1.0) * 0.3))
84
+
85
+ # Fallback: raw matcher score
86
+ return candidate.score
87
+
88
+ def _apply_logistic(self, candidate: Candidate) -> float:
89
+ """
90
+ Apply logistic regression model: sigmoid(w·x + b).
91
+
92
+ Uses Python math.exp (works without numpy).
93
+
94
+ Args:
95
+ candidate: Candidate to calibrate
96
+
97
+ Returns:
98
+ Calibrated probability in [0.0, 1.0]
99
+ """
100
+ # Extract features in model's feature order
101
+ features_dict = self.feature_extractor.extract(candidate)
102
+ feature_vector = [features_dict.get(f, -1.0) for f in self.model.features]
103
+
104
+ # Compute logit: w·x + b
105
+ logit = (
106
+ sum(w * x for w, x in zip(self.model.weights, feature_vector, strict=False))
107
+ + self.model.bias
108
+ )
109
+
110
+ # Clamp extreme values to prevent overflow
111
+ logit = max(min(logit, 20.0), -20.0)
112
+
113
+ # Apply sigmoid: 1 / (1 + exp(-logit))
114
+ return 1.0 / (1.0 + math.exp(-logit))
115
+
116
+ def calibrate_batch(self, candidates: list[Candidate]) -> list[float]:
117
+ """
118
+ Calibrate multiple candidates efficiently.
119
+
120
+ Uses vectorized numpy operations when available (fast path),
121
+ falls back to Python loop otherwise (slow path).
122
+
123
+ Args:
124
+ candidates: List of candidates to calibrate
125
+
126
+ Returns:
127
+ List of confidence scores [0.0-1.0] in same order
128
+ """
129
+ if not candidates:
130
+ return []
131
+
132
+ if self.model and HAS_NUMPY:
133
+ # Fast path: vectorized with numpy
134
+ return self._calibrate_batch_vectorized(candidates)
135
+ else:
136
+ # Slow path: Python loop (no numpy or heuristic mode)
137
+ return [self.calibrate(c) for c in candidates]
138
+
139
+ def _calibrate_batch_vectorized(self, candidates: list[Candidate]) -> list[float]:
140
+ """
141
+ Vectorized calibration (requires numpy).
142
+
143
+ Args:
144
+ candidates: List of candidates
145
+
146
+ Returns:
147
+ List of confidence scores
148
+ """
149
+ # Extract features as numpy array (n_candidates, n_features)
150
+ feature_matrix = self.feature_extractor.extract_batch(candidates)
151
+
152
+ # Build feature matrix in model's feature order
153
+ # Handle unknown features (not in FEATURE_SCHEMA) by using sentinel -1.0
154
+ feature_names = list(self.feature_extractor.FEATURE_SCHEMA.keys())
155
+
156
+ # Build column indices for known features, None for unknown
157
+ model_feature_indices = []
158
+ for f in self.model.features:
159
+ try:
160
+ model_feature_indices.append(feature_names.index(f))
161
+ except ValueError:
162
+ # Feature not in schema - will use sentinel
163
+ model_feature_indices.append(None)
164
+
165
+ # Build feature matrix for model features (n_candidates, n_model_features)
166
+ x = np.zeros((feature_matrix.shape[0], len(self.model.features)))
167
+ for i, idx in enumerate(model_feature_indices):
168
+ if idx is not None:
169
+ x[:, i] = feature_matrix[:, idx]
170
+ else:
171
+ # Unknown feature - use sentinel -1.0
172
+ x[:, i] = -1.0
173
+
174
+ # Vectorized logistic: sigmoid(x @ w + b)
175
+ weights = np.array(self.model.weights)
176
+ logits = x @ weights + self.model.bias
177
+
178
+ # Clamp to prevent overflow
179
+ logits = np.clip(logits, -20.0, 20.0)
180
+
181
+ # Sigmoid
182
+ probabilities = 1.0 / (1.0 + np.exp(-logits))
183
+
184
+ return probabilities.tolist()
@@ -0,0 +1,139 @@
1
+ """Feature extraction for calibration."""
2
+
3
+ from typing import ClassVar
4
+
5
+ from resolvekit.types import Candidate
6
+
7
+ # numpy is optional; HAS_NUMPY records whether the import succeeded
8
+ try:
9
+ import numpy as np
10
+
11
+ HAS_NUMPY = True
12
+ except ImportError:
13
+ HAS_NUMPY = False
14
+
15
+
16
class FeatureExtractor:
    """
    Extracts standardized features from candidates for calibration.

    Reads from candidate.features dict (populated by matchers and constraints)
    and converts to normalized feature vector with sentinel values for missing data.
    """

    # Feature schema: feature_name -> expected type.
    # The key order defines the column order of the matrix built by
    # extract_batch and the key order of the dict returned by extract.
    FEATURE_SCHEMA: ClassVar[dict[str, type]] = {
        # Match type features (boolean -> 1.0 / 0.0)
        "f_exact_code": bool,
        "f_canonical_exact": bool,
        "f_alias_exact": bool,
        # Similarity features (float or None -> float with sentinel -1.0)
        "f_fts_score": float,
        "f_fts_rank_inv": float,  # Derived from fts_rank
        "f_edit_similarity": float,
        "f_trigram_jaccard": float,
        "f_fuzzy_score": float,
        # Constraint features (three-state: -1.0 not checked, 0.0 failed, 1.0 passed)
        "f_parent_valid": bool,
        "f_parent_depth": float,  # depth / 3 capped at 1.0, or -1.0 if missing
        "f_type_valid": bool,
        "f_date_valid": bool,
        "f_membership_valid": bool,
    }

    def extract(self, candidate: Candidate) -> dict[str, float]:
        """
        Extract standardized features from candidate.

        Args:
            candidate: Candidate with features dict from matchers/constraints

        Returns:
            Dict of feature_name -> float value, keyed in FEATURE_SCHEMA order.
            Missing match-type features use 0.0 (False).
            Missing similarity features use sentinel -1.0.
            Missing constraint features use -1.0 (not checked).
        """
        raw = candidate.features

        # FTS rank inverse (1/rank, or -1.0 if missing or not positive)
        fts_rank = raw.get("fts_rank")
        fts_rank_inv = 1.0 / fts_rank if fts_rank and fts_rank > 0 else -1.0

        # Parent depth normalized (depth / 3, capped at 1.0, or -1.0 if not checked)
        parent_depth = raw.get("parent_depth")
        if parent_depth is None:
            parent_depth_norm = -1.0  # Not checked
        else:
            parent_depth_norm = min(parent_depth / 3.0, 1.0)

        # Built in FEATURE_SCHEMA order so every dict shares one key order.
        return {
            # Match type features (boolean -> 1.0 or 0.0)
            "f_exact_code": 1.0 if raw.get("exact_code") else 0.0,
            "f_canonical_exact": 1.0 if raw.get("canonical_exact") else 0.0,
            "f_alias_exact": 1.0 if raw.get("alias_exact") else 0.0,
            # Similarity features (with sentinel -1.0 for missing)
            "f_fts_score": self._get_float(raw, "fts_score", -1.0),
            "f_fts_rank_inv": fts_rank_inv,
            "f_edit_similarity": self._get_float(raw, "edit_similarity", -1.0),
            "f_trigram_jaccard": self._get_float(raw, "trigram_jaccard", -1.0),
            "f_fuzzy_score": self._get_float(raw, "fuzzy_score", -1.0),
            # Constraint features (three-state: -1.0 / 0.0 / 1.0)
            "f_parent_valid": self._get_constraint_feature(raw, "parent_valid"),
            "f_parent_depth": parent_depth_norm,
            "f_type_valid": self._get_constraint_feature(raw, "type_valid"),
            "f_date_valid": self._get_constraint_feature(raw, "date_valid"),
            "f_membership_valid": self._get_constraint_feature(
                raw, "membership_valid"
            ),
        }

    def _get_float(self, raw: dict, key: str, default: float) -> float:
        """Get float value or default if missing/None."""
        value = raw.get(key)
        return float(value) if value is not None else default

    def _get_constraint_feature(self, raw: dict, key: str) -> float:
        """
        Get constraint feature with three-state logic.

        Returns:
            -1.0: Constraint not checked (key missing or value is None)
            0.0: Constraint checked and failed (value is falsy)
            1.0: Constraint checked and passed (value is truthy)
        """
        value = raw.get(key)
        if value is None:
            return -1.0  # Not checked
        return 1.0 if value else 0.0  # Checked: True→1.0, False→0.0

    def extract_batch(
        self, candidates: list[Candidate]
    ) -> "np.ndarray | list[dict[str, float]]":
        """
        Extract features from multiple candidates.

        Args:
            candidates: List of candidates

        Returns:
            If numpy available: numpy array of shape (n_candidates, n_features),
            with columns in FEATURE_SCHEMA order; an empty input yields shape
            (0, n_features).
            If numpy unavailable: list of feature dicts
        """
        if not candidates:
            if not HAS_NUMPY:
                return []
            # Keep the array 2-D so column indexing still works on empty input.
            return np.empty((0, len(self.FEATURE_SCHEMA)))

        # Extract features for all candidates
        feature_dicts = [self.extract(c) for c in candidates]

        if not HAS_NUMPY:
            return feature_dicts

        # Convert to numpy array, selecting values by name in schema order
        feature_names = list(self.FEATURE_SCHEMA)
        return np.array(
            [[fd[fname] for fname in feature_names] for fd in feature_dicts],
            dtype=float,
        )