resolvekit 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- resolvekit/README.md +134 -0
- resolvekit/__init__.py +67 -0
- resolvekit/api/README.md +165 -0
- resolvekit/api/__init__.py +10 -0
- resolvekit/api/convenience.py +53 -0
- resolvekit/api/resolver.py +457 -0
- resolvekit/builders/README.md +173 -0
- resolvekit/builders/__init__.py +0 -0
- resolvekit/calibration/README.md +351 -0
- resolvekit/calibration/__init__.py +12 -0
- resolvekit/calibration/calibrator.py +184 -0
- resolvekit/calibration/features.py +139 -0
- resolvekit/calibration/models.py +78 -0
- resolvekit/cli/README.md +215 -0
- resolvekit/cli/__init__.py +0 -0
- resolvekit/cli/main.py +18 -0
- resolvekit/config.py +128 -0
- resolvekit/constants.py +252 -0
- resolvekit/constraints/README.md +102 -0
- resolvekit/constraints/__init__.py +17 -0
- resolvekit/constraints/constraint_engine.py +111 -0
- resolvekit/constraints/hierarchy_validator.py +148 -0
- resolvekit/constraints/membership_validator.py +60 -0
- resolvekit/constraints/protocols.py +33 -0
- resolvekit/constraints/temporal_validator.py +43 -0
- resolvekit/constraints/type_validator.py +42 -0
- resolvekit/data/README.md +165 -0
- resolvekit/data/__init__.py +14 -0
- resolvekit/data/alias_repository.py +206 -0
- resolvekit/data/code_repository.py +85 -0
- resolvekit/data/context_filters.py +49 -0
- resolvekit/data/db_manager.py +196 -0
- resolvekit/data/entity_repository.py +466 -0
- resolvekit/data/membership_repository.py +107 -0
- resolvekit/data/query_builder.py +177 -0
- resolvekit/data/schema.py +122 -0
- resolvekit/disambiguation/README.md +72 -0
- resolvekit/disambiguation/__init__.py +0 -0
- resolvekit/extraction/README.md +204 -0
- resolvekit/extraction/__init__.py +0 -0
- resolvekit/matchers/README.md +77 -0
- resolvekit/matchers/__init__.py +65 -0
- resolvekit/matchers/alias_exact.py +65 -0
- resolvekit/matchers/canonical_name.py +62 -0
- resolvekit/matchers/cascade.py +127 -0
- resolvekit/matchers/code_validators.py +250 -0
- resolvekit/matchers/exact_code.py +177 -0
- resolvekit/matchers/fts_matcher.py +106 -0
- resolvekit/matchers/fuzzy_matcher.py +142 -0
- resolvekit/matchers/priorities.py +174 -0
- resolvekit/matchers/protocols.py +75 -0
- resolvekit/normalization/README.md +192 -0
- resolvekit/normalization/__init__.py +8 -0
- resolvekit/normalization/normalizer.py +164 -0
- resolvekit/overlays/README.md +226 -0
- resolvekit/overlays/__init__.py +0 -0
- resolvekit/types.py +534 -0
- resolvekit/utils/README.md +188 -0
- resolvekit/utils/__init__.py +48 -0
- resolvekit/utils/cache.py +109 -0
- resolvekit/utils/dates.py +339 -0
- resolvekit/utils/errors.py +145 -0
- resolvekit/utils/files.py +366 -0
- resolvekit/utils/logging.py +219 -0
- resolvekit/utils/text.py +475 -0
- resolvekit/utils/validation.py +301 -0
- resolvekit-0.0.1.dist-info/METADATA +36 -0
- resolvekit-0.0.1.dist-info/RECORD +70 -0
- resolvekit-0.0.1.dist-info/WHEEL +4 -0
- resolvekit-0.0.1.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,351 @@
|
|
|
1
|
+
# Calibration Module
|
|
2
|
+
|
|
3
|
+
## Purpose
|
|
4
|
+
|
|
5
|
+
The calibration module converts raw matcher scores into calibrated confidence probabilities that reflect actual precision, enabling reliable threshold-based decision making.
|
|
6
|
+
|
|
7
|
+
## Components
|
|
8
|
+
|
|
9
|
+
### Core Components
|
|
10
|
+
|
|
11
|
+
**Currently Implemented:**
|
|
12
|
+
|
|
13
|
+
1. **CalibrationModel** (`models.py`)
|
|
14
|
+
- Pydantic schema for calibration model configuration
|
|
15
|
+
- JSON loading and validation
|
|
16
|
+
- Model metadata (ECE, training date, notes)
|
|
17
|
+
|
|
18
|
+
2. **Feature Extractor** (`features.py`)
|
|
19
|
+
- Extracts standardized features from candidates
|
|
20
|
+
- Converts matcher-specific features to normalized values
|
|
21
|
+
- Handles missing features with sentinel values
|
|
22
|
+
- Optional numpy vectorization for batch processing
|
|
23
|
+
|
|
24
|
+
3. **Calibrator** (`calibrator.py`)
|
|
25
|
+
- Applies calibration models to candidates
|
|
26
|
+
- Returns probability P(correct match | features)
|
|
27
|
+
- Supports logistic regression models
|
|
28
|
+
- Heuristic fallback when no model available
|
|
29
|
+
- Batch processing with optional numpy acceleration
|
|
30
|
+
|
|
31
|
+
**Planned for Future Phases:**
|
|
32
|
+
|
|
33
|
+
4. **Model Trainer** (`trainer.py` - not yet implemented)
|
|
34
|
+
- Trains calibration models on labeled data
|
|
35
|
+
- Evaluates calibration quality (ECE - Expected Calibration Error)
|
|
36
|
+
- Exports models to JSON for data packs
|
|
37
|
+
|
|
38
|
+
5. **Score Fusion** (`fusion.py` - not yet implemented)
|
|
39
|
+
- Combines scores from multiple matchers
|
|
40
|
+
- Handles missing features (when matchers not invoked)
|
|
41
|
+
- Normalizes scores to comparable scales
|
|
42
|
+
|
|
43
|
+
## Feature Set
|
|
44
|
+
|
|
45
|
+
Features extracted per (query, candidate) pair:
|
|
46
|
+
|
|
47
|
+
### Match Type Features (binary)
|
|
48
|
+
- `f_exact_code`: Matched via exact code lookup
|
|
49
|
+
- `f_canonical_exact`: Matched via canonical name
|
|
50
|
+
- `f_alias_exact`: Matched via exact alias
|
|
51
|
+
- `f_alias_type_*`: One-hot encoding of alias type (canonical/endonym/exonym/abbr/code)
|
|
52
|
+
|
|
53
|
+
### Similarity Features (continuous)
|
|
54
|
+
- `f_fts_score`: BM25-like score from FTS
|
|
55
|
+
- `f_fts_rank_inv`: Inverse rank (1/rank)
|
|
56
|
+
- `f_edit_similarity`: Normalized edit similarity (exposed by `FeatureExtractor` as `f_edit_similarity`)
|
|
57
|
+
- `f_trigram_jaccard`: Trigram Jaccard similarity
|
|
58
|
+
|
|
59
|
+
### Constraint Features (binary)
|
|
60
|
+
- `f_parent_valid`: Passes parent constraint
|
|
61
|
+
- `f_type_valid`: Passes type constraint
|
|
62
|
+
- `f_date_valid`: Passes temporal constraint
|
|
63
|
+
|
|
64
|
+
### Semantic Features (optional)
|
|
65
|
+
- `f_sem_used`: Semantic matcher was invoked
|
|
66
|
+
- `f_sem_sim`: Cosine similarity from embeddings (0-1)
|
|
67
|
+
|
|
68
|
+
### Context Features
|
|
69
|
+
- `f_ambiguity_flag`: Query in ambiguity registry
|
|
70
|
+
- `f_region_hint_match`: Candidate matches region hint
|
|
71
|
+
|
|
72
|
+
## Calibration Model
|
|
73
|
+
|
|
74
|
+
### Logistic Regression (default)
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
import numpy as np
|
|
78
|
+
|
|
79
|
+
def calibrate_logistic(features: np.ndarray, weights: np.ndarray, bias: float) -> float:
    """Compute calibrated probability via logistic regression.

    Args:
        features: Feature vector, in the same order as ``weights``.
        weights: One weight per feature.
        bias: Intercept added to the weighted sum.

    Returns:
        ``sigmoid(weights . features + bias)`` as a plain Python float.
    """
    logit = np.dot(weights, features) + bias
    # Clamp before exponentiating: np.exp overflows for very large arguments,
    # and the sigmoid is already saturated well before +/-20.
    logit = np.clip(logit, -20.0, 20.0)
    return float(1.0 / (1.0 + np.exp(-logit)))
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
### Model Format (JSON)
|
|
87
|
+
|
|
88
|
+
```json
|
|
89
|
+
{
|
|
90
|
+
"type": "logistic",
|
|
91
|
+
"features": ["f_exact_code", "f_canonical_exact", ...],
|
|
92
|
+
"weights": [2.3, 1.6, 0.9, ...],
|
|
93
|
+
"bias": -1.4,
|
|
94
|
+
"ece": 0.032,
|
|
95
|
+
"trained_on": "2025-10-15",
|
|
96
|
+
"notes": "Global calibration; overlays may append."
|
|
97
|
+
}
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
## Calibration Quality Metrics
|
|
101
|
+
|
|
102
|
+
- **ECE (Expected Calibration Error)**: Measures deviation between predicted probabilities and observed accuracy
|
|
103
|
+
- **Brier Score**: Mean squared error of probabilistic predictions
|
|
104
|
+
- **Reliability Diagram**: Plots predicted vs observed probabilities in bins
|
|
105
|
+
|
|
106
|
+
Target: ECE < 0.05 (well-calibrated)
|
|
107
|
+
|
|
108
|
+
## Design Principles
|
|
109
|
+
|
|
110
|
+
1. **Interpretable**: Use simple models (logistic) for explainability
|
|
111
|
+
2. **Calibrated**: Probabilities should match empirical accuracy
|
|
112
|
+
3. **Versioned**: Models versioned with data packs
|
|
113
|
+
4. **Updatable**: Overlays can fine-tune calibration with delta weights
|
|
114
|
+
|
|
115
|
+
## Usage Examples
|
|
116
|
+
|
|
117
|
+
### Basic Usage - Heuristic Fallback
|
|
118
|
+
|
|
119
|
+
When no calibration model is available, the calibrator uses rule-based heuristics:
|
|
120
|
+
|
|
121
|
+
```python
|
|
122
|
+
from resolvekit.calibration import Calibrator
|
|
123
|
+
from resolvekit.types import Candidate, Entity, EntityType, MatcherType
|
|
124
|
+
|
|
125
|
+
# Create calibrator without model (uses heuristic)
|
|
126
|
+
calibrator = Calibrator(model=None)
|
|
127
|
+
|
|
128
|
+
# Example candidate from exact code matcher
|
|
129
|
+
candidate = Candidate(
|
|
130
|
+
entity=Entity(
|
|
131
|
+
dcid="country/USA",
|
|
132
|
+
canonical_name="United States",
|
|
133
|
+
entity_type=EntityType.COUNTRY,
|
|
134
|
+
),
|
|
135
|
+
score=1.0,
|
|
136
|
+
matcher_type=MatcherType.EXACT_CODE,
|
|
137
|
+
features={"exact_code": True, "code_system": "iso3"},
|
|
138
|
+
)
|
|
139
|
+
|
|
140
|
+
# Calibrate to get confidence probability
|
|
141
|
+
confidence = calibrator.calibrate(candidate)
|
|
142
|
+
print(f"Confidence: {confidence}") # 0.95 for exact code matches
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
### Loading and Using a Trained Model
|
|
146
|
+
|
|
147
|
+
Load a calibration model from a data pack and use it for calibration:
|
|
148
|
+
|
|
149
|
+
```python
|
|
150
|
+
from pathlib import Path
|
|
151
|
+
from resolvekit.calibration import Calibrator, load_calibration_model
|
|
152
|
+
|
|
153
|
+
# Load model from JSON file in data pack
|
|
154
|
+
model_path = Path("data/base/calibration_model.json")
|
|
155
|
+
model = load_calibration_model(model_path)
|
|
156
|
+
|
|
157
|
+
if model:
|
|
158
|
+
print(f"Loaded model with {len(model.features)} features")
|
|
159
|
+
print(f"ECE: {model.ece}") # Expected Calibration Error
|
|
160
|
+
|
|
161
|
+
# Create calibrator with model
|
|
162
|
+
calibrator = Calibrator(model=model)
|
|
163
|
+
|
|
164
|
+
# Calibrate candidates using logistic regression
|
|
165
|
+
confidence = calibrator.calibrate(candidate)
|
|
166
|
+
print(f"Model-calibrated confidence: {confidence}")
|
|
167
|
+
else:
|
|
168
|
+
print("Model not found, falling back to heuristic")
|
|
169
|
+
calibrator = Calibrator(model=None)
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
### Batch Processing for Efficiency
|
|
173
|
+
|
|
174
|
+
Process multiple candidates efficiently using batch calibration:
|
|
175
|
+
|
|
176
|
+
```python
|
|
177
|
+
from resolvekit.calibration import Calibrator
|
|
178
|
+
|
|
179
|
+
# Prepare multiple candidates
|
|
180
|
+
candidates = [
|
|
181
|
+
Candidate(
|
|
182
|
+
entity=Entity(dcid="country/USA", canonical_name="United States",
|
|
183
|
+
entity_type=EntityType.COUNTRY),
|
|
184
|
+
score=1.0,
|
|
185
|
+
matcher_type=MatcherType.EXACT_CODE,
|
|
186
|
+
features={"exact_code": True},
|
|
187
|
+
),
|
|
188
|
+
Candidate(
|
|
189
|
+
entity=Entity(dcid="country/FRA", canonical_name="France",
|
|
190
|
+
entity_type=EntityType.COUNTRY),
|
|
191
|
+
score=0.9,
|
|
192
|
+
matcher_type=MatcherType.CANONICAL_NAME,
|
|
193
|
+
features={"canonical_exact": True},
|
|
194
|
+
),
|
|
195
|
+
Candidate(
|
|
196
|
+
entity=Entity(dcid="country/DEU", canonical_name="Germany",
|
|
197
|
+
entity_type=EntityType.COUNTRY),
|
|
198
|
+
score=0.75,
|
|
199
|
+
matcher_type=MatcherType.FUZZY,
|
|
200
|
+
features={"fuzzy_score": 0.78, "fts_score": 0.82},
|
|
201
|
+
),
|
|
202
|
+
]
|
|
203
|
+
|
|
204
|
+
calibrator = Calibrator(model=None)
|
|
205
|
+
|
|
206
|
+
# Batch calibration (numpy vectorization is used only when a model is loaded; heuristic mode loops in Python)
|
|
207
|
+
confidences = calibrator.calibrate_batch(candidates)
|
|
208
|
+
|
|
209
|
+
# Results correspond to input order
|
|
210
|
+
for candidate, confidence in zip(candidates, confidences):
|
|
211
|
+
print(f"{candidate.entity.canonical_name}: {confidence:.2f}")
|
|
212
|
+
# Output:
|
|
213
|
+
# United States: 0.95
|
|
214
|
+
# France: 0.90
|
|
215
|
+
# Germany: 0.73
|
|
216
|
+
```
|
|
217
|
+
|
|
218
|
+
### Feature Extraction
|
|
219
|
+
|
|
220
|
+
Extract standardized features from candidates for analysis or custom calibration:
|
|
221
|
+
|
|
222
|
+
```python
|
|
223
|
+
from resolvekit.calibration import FeatureExtractor
|
|
224
|
+
|
|
225
|
+
extractor = FeatureExtractor()
|
|
226
|
+
|
|
227
|
+
# Extract features from a candidate
|
|
228
|
+
features = extractor.extract(candidate)
|
|
229
|
+
|
|
230
|
+
# Features dict contains standardized feature values
|
|
231
|
+
print(features)
|
|
232
|
+
# {
|
|
233
|
+
# 'f_exact_code': 1.0,
|
|
234
|
+
# 'f_canonical_exact': 0.0,
|
|
235
|
+
# 'f_alias_exact': 0.0,
|
|
236
|
+
# 'f_fts_score': -1.0, # Sentinel value (not used)
|
|
237
|
+
# 'f_fts_rank_inv': -1.0,
|
|
238
|
+
# 'f_edit_similarity': -1.0,
|
|
239
|
+
# 'f_trigram_jaccard': -1.0,
|
|
240
|
+
# 'f_fuzzy_score': -1.0,
|
|
241
|
+
# 'f_parent_valid': -1.0,  # Sentinel value (constraint not checked)
|
|
242
|
+
# 'f_parent_depth': -1.0,
|
|
243
|
+
# 'f_type_valid': -1.0,
|
|
244
|
+
# 'f_date_valid': -1.0,
|
|
245
|
+
# 'f_membership_valid': -1.0,
|
|
246
|
+
# }
|
|
247
|
+
|
|
248
|
+
# Extract features from multiple candidates
|
|
249
|
+
feature_dicts = extractor.extract_batch(candidates)
|
|
250
|
+
# Returns list of dicts (or numpy array if numpy available)
|
|
251
|
+
```
|
|
252
|
+
|
|
253
|
+
### Creating a Calibration Model
|
|
254
|
+
|
|
255
|
+
Define a custom calibration model programmatically:
|
|
256
|
+
|
|
257
|
+
```python
|
|
258
|
+
from resolvekit.calibration import CalibrationModel
|
|
259
|
+
from datetime import date
|
|
260
|
+
|
|
261
|
+
# Create a simple logistic regression model
|
|
262
|
+
model = CalibrationModel(
|
|
263
|
+
type="logistic",
|
|
264
|
+
features=["f_exact_code", "f_canonical_exact", "f_fts_score"],
|
|
265
|
+
weights=[4.0, 3.5, 1.5],
|
|
266
|
+
bias=-2.0,
|
|
267
|
+
ece=0.025,
|
|
268
|
+
trained_on=date(2025, 10, 15),
|
|
269
|
+
notes="Custom calibration model",
|
|
270
|
+
)
|
|
271
|
+
|
|
272
|
+
# Use the model
|
|
273
|
+
calibrator = Calibrator(model=model)
|
|
274
|
+
confidence = calibrator.calibrate(candidate)
|
|
275
|
+
```
|
|
276
|
+
|
|
277
|
+
### Integration with Resolver Pipeline
|
|
278
|
+
|
|
279
|
+
Typical usage within the resolver cascade:
|
|
280
|
+
|
|
281
|
+
```python
|
|
282
|
+
from resolvekit.calibration import Calibrator, load_calibration_model
|
|
283
|
+
from pathlib import Path
|
|
284
|
+
|
|
285
|
+
# Initialize calibrator at resolver startup
|
|
286
|
+
model = load_calibration_model(Path("data/base/calibration_model.json"))
|
|
287
|
+
calibrator = Calibrator(model=model)
|
|
288
|
+
|
|
289
|
+
# During resolution, matchers populate candidate.features
|
|
290
|
+
# Then calibrator converts to confidence probabilities
|
|
291
|
+
|
|
292
|
+
def resolve(query: str) -> list[tuple[Candidate, float]]:
    """Resolve ``query`` into (candidate, confidence) pairs, highest confidence first."""
    # Step 1: the matcher cascade yields candidates with their features filled in.
    matched = run_matcher_cascade(query)

    # Step 2: the calibrator maps each candidate to a confidence, in input order.
    scores = calibrator.calibrate_batch(matched)

    # Step 3: pair each candidate with its confidence and rank descending.
    return sorted(zip(matched, scores), key=lambda pair: pair[1], reverse=True)
|
|
304
|
+
```
|
|
305
|
+
|
|
306
|
+
### Heuristic Scoring Rules
|
|
307
|
+
|
|
308
|
+
When using heuristic mode (no model), confidence scores follow these rules:
|
|
309
|
+
|
|
310
|
+
- **Tier 1 - Exact Matches** (0.85-0.95):
|
|
311
|
+
- Exact code match: 0.95
|
|
312
|
+
- Canonical name exact: 0.90
|
|
313
|
+
- Alias exact: 0.85
|
|
314
|
+
|
|
315
|
+
- **Tier 2 - Fuzzy Matches** (0.5-0.8):
|
|
316
|
+
- Base: 0.5 + (fuzzy_score × 0.3)
|
|
317
|
+
- Example: fuzzy_score=0.78 → confidence=0.734
|
|
318
|
+
|
|
319
|
+
- **Tier 3 - FTS Only** (0.3-0.6):
|
|
320
|
+
- Base: 0.3 + (fts_score × 0.3)
|
|
321
|
+
- Example: fts_score=0.7 → confidence=0.51
|
|
322
|
+
|
|
323
|
+
- **Fallback**: Returns raw matcher score if no features match
|
|
324
|
+
|
|
325
|
+
### Performance Considerations
|
|
326
|
+
|
|
327
|
+
```python
|
|
328
|
+
# Batch processing is faster for multiple candidates
|
|
329
|
+
import time
|
|
330
|
+
|
|
331
|
+
# Single-item processing
|
|
332
|
+
start = time.time()
|
|
333
|
+
for candidate in candidates:
|
|
334
|
+
confidence = calibrator.calibrate(candidate)
|
|
335
|
+
single_time = time.time() - start
|
|
336
|
+
|
|
337
|
+
# Batch processing (with numpy)
|
|
338
|
+
start = time.time()
|
|
339
|
+
confidences = calibrator.calibrate_batch(candidates)
|
|
340
|
+
batch_time = time.time() - start
|
|
341
|
+
|
|
342
|
+
print(f"Single: {single_time:.4f}s")
|
|
343
|
+
print(f"Batch: {batch_time:.4f}s")
|
|
344
|
+
print(f"Speedup: {single_time / batch_time:.1f}x")
|
|
345
|
+
# With numpy and a loaded model: 5-10x speedup for 100+ candidates
|
|
346
|
+
# Without numpy, or in heuristic mode (no model): similar performance (both use a Python loop)
|
|
347
|
+
```
|
|
348
|
+
|
|
349
|
+
## Implementation Priority
|
|
350
|
+
|
|
351
|
+
**Phase A** - Core resolver
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""Calibration module for converting scores to probabilities."""
|
|
2
|
+
|
|
3
|
+
from resolvekit.calibration.calibrator import Calibrator
|
|
4
|
+
from resolvekit.calibration.features import FeatureExtractor
|
|
5
|
+
from resolvekit.calibration.models import CalibrationModel, load_calibration_model
|
|
6
|
+
|
|
7
|
+
__all__ = [
|
|
8
|
+
"CalibrationModel",
|
|
9
|
+
"Calibrator",
|
|
10
|
+
"FeatureExtractor",
|
|
11
|
+
"load_calibration_model",
|
|
12
|
+
]
|
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
"""Calibration service for converting scores to probabilities."""
|
|
2
|
+
|
|
3
|
+
import math
|
|
4
|
+
|
|
5
|
+
from resolvekit.calibration.features import FeatureExtractor
|
|
6
|
+
from resolvekit.calibration.models import CalibrationModel
|
|
7
|
+
from resolvekit.types import Candidate
|
|
8
|
+
|
|
9
|
+
try:
|
|
10
|
+
import numpy as np
|
|
11
|
+
|
|
12
|
+
HAS_NUMPY = True
|
|
13
|
+
except ImportError:
|
|
14
|
+
HAS_NUMPY = False
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class Calibrator:
|
|
18
|
+
"""
|
|
19
|
+
Applies calibration models to candidates.
|
|
20
|
+
|
|
21
|
+
Supports logistic regression models or heuristic fallback when no model available.
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
def __init__(self, model: CalibrationModel | None = None):
|
|
25
|
+
"""
|
|
26
|
+
Initialize calibrator.
|
|
27
|
+
|
|
28
|
+
Args:
|
|
29
|
+
model: Optional calibration model. If None, uses heuristic fallback.
|
|
30
|
+
"""
|
|
31
|
+
self.model = model
|
|
32
|
+
self.feature_extractor = FeatureExtractor()
|
|
33
|
+
|
|
34
|
+
def calibrate(self, candidate: Candidate) -> float:
|
|
35
|
+
"""
|
|
36
|
+
Calibrate single candidate to confidence probability.
|
|
37
|
+
|
|
38
|
+
Args:
|
|
39
|
+
candidate: Candidate to calibrate
|
|
40
|
+
|
|
41
|
+
Returns:
|
|
42
|
+
Calibrated confidence in [0.0, 1.0]
|
|
43
|
+
"""
|
|
44
|
+
if self.model:
|
|
45
|
+
return self._apply_logistic(candidate)
|
|
46
|
+
else:
|
|
47
|
+
# Use heuristic fallback
|
|
48
|
+
return self._apply_heuristic(candidate)
|
|
49
|
+
|
|
50
|
+
def _apply_heuristic(self, candidate: Candidate) -> float:
|
|
51
|
+
"""
|
|
52
|
+
Apply rule-based heuristic scoring.
|
|
53
|
+
|
|
54
|
+
Tier 1: Exact matches (0.85-0.95)
|
|
55
|
+
Tier 2: Fuzzy matches (0.5-0.8)
|
|
56
|
+
Tier 3: FTS only (0.3-0.6)
|
|
57
|
+
Fallback: Raw score
|
|
58
|
+
|
|
59
|
+
Args:
|
|
60
|
+
candidate: Candidate to score
|
|
61
|
+
|
|
62
|
+
Returns:
|
|
63
|
+
Heuristic confidence
|
|
64
|
+
"""
|
|
65
|
+
features = candidate.features
|
|
66
|
+
|
|
67
|
+
# Tier 1: Exact matches
|
|
68
|
+
if features.get("exact_code"):
|
|
69
|
+
return 0.95
|
|
70
|
+
if features.get("canonical_exact"):
|
|
71
|
+
return 0.90
|
|
72
|
+
if features.get("alias_exact"):
|
|
73
|
+
return 0.85
|
|
74
|
+
|
|
75
|
+
# Tier 2: Fuzzy matches
|
|
76
|
+
fuzzy_score = features.get("fuzzy_score")
|
|
77
|
+
if fuzzy_score is not None and fuzzy_score >= 0:
|
|
78
|
+
return float(0.5 + (fuzzy_score * 0.3))
|
|
79
|
+
|
|
80
|
+
# Tier 3: FTS only
|
|
81
|
+
fts_score = features.get("fts_score")
|
|
82
|
+
if fts_score is not None and fts_score >= 0:
|
|
83
|
+
return float(0.3 + (min(fts_score, 1.0) * 0.3))
|
|
84
|
+
|
|
85
|
+
# Fallback: raw matcher score
|
|
86
|
+
return candidate.score
|
|
87
|
+
|
|
88
|
+
def _apply_logistic(self, candidate: Candidate) -> float:
|
|
89
|
+
"""
|
|
90
|
+
Apply logistic regression model: sigmoid(w·x + b).
|
|
91
|
+
|
|
92
|
+
Uses Python math.exp (works without numpy).
|
|
93
|
+
|
|
94
|
+
Args:
|
|
95
|
+
candidate: Candidate to calibrate
|
|
96
|
+
|
|
97
|
+
Returns:
|
|
98
|
+
Calibrated probability in [0.0, 1.0]
|
|
99
|
+
"""
|
|
100
|
+
# Extract features in model's feature order
|
|
101
|
+
features_dict = self.feature_extractor.extract(candidate)
|
|
102
|
+
feature_vector = [features_dict.get(f, -1.0) for f in self.model.features]
|
|
103
|
+
|
|
104
|
+
# Compute logit: w·x + b
|
|
105
|
+
logit = (
|
|
106
|
+
sum(w * x for w, x in zip(self.model.weights, feature_vector, strict=False))
|
|
107
|
+
+ self.model.bias
|
|
108
|
+
)
|
|
109
|
+
|
|
110
|
+
# Clamp extreme values to prevent overflow
|
|
111
|
+
logit = max(min(logit, 20.0), -20.0)
|
|
112
|
+
|
|
113
|
+
# Apply sigmoid: 1 / (1 + exp(-logit))
|
|
114
|
+
return 1.0 / (1.0 + math.exp(-logit))
|
|
115
|
+
|
|
116
|
+
def calibrate_batch(self, candidates: list[Candidate]) -> list[float]:
|
|
117
|
+
"""
|
|
118
|
+
Calibrate multiple candidates efficiently.
|
|
119
|
+
|
|
120
|
+
Uses vectorized numpy operations when available (fast path),
|
|
121
|
+
falls back to Python loop otherwise (slow path).
|
|
122
|
+
|
|
123
|
+
Args:
|
|
124
|
+
candidates: List of candidates to calibrate
|
|
125
|
+
|
|
126
|
+
Returns:
|
|
127
|
+
List of confidence scores [0.0-1.0] in same order
|
|
128
|
+
"""
|
|
129
|
+
if not candidates:
|
|
130
|
+
return []
|
|
131
|
+
|
|
132
|
+
if self.model and HAS_NUMPY:
|
|
133
|
+
# Fast path: vectorized with numpy
|
|
134
|
+
return self._calibrate_batch_vectorized(candidates)
|
|
135
|
+
else:
|
|
136
|
+
# Slow path: Python loop (no numpy or heuristic mode)
|
|
137
|
+
return [self.calibrate(c) for c in candidates]
|
|
138
|
+
|
|
139
|
+
def _calibrate_batch_vectorized(self, candidates: list[Candidate]) -> list[float]:
|
|
140
|
+
"""
|
|
141
|
+
Vectorized calibration (requires numpy).
|
|
142
|
+
|
|
143
|
+
Args:
|
|
144
|
+
candidates: List of candidates
|
|
145
|
+
|
|
146
|
+
Returns:
|
|
147
|
+
List of confidence scores
|
|
148
|
+
"""
|
|
149
|
+
# Extract features as numpy array (n_candidates, n_features)
|
|
150
|
+
feature_matrix = self.feature_extractor.extract_batch(candidates)
|
|
151
|
+
|
|
152
|
+
# Build feature matrix in model's feature order
|
|
153
|
+
# Handle unknown features (not in FEATURE_SCHEMA) by using sentinel -1.0
|
|
154
|
+
feature_names = list(self.feature_extractor.FEATURE_SCHEMA.keys())
|
|
155
|
+
|
|
156
|
+
# Build column indices for known features, None for unknown
|
|
157
|
+
model_feature_indices = []
|
|
158
|
+
for f in self.model.features:
|
|
159
|
+
try:
|
|
160
|
+
model_feature_indices.append(feature_names.index(f))
|
|
161
|
+
except ValueError:
|
|
162
|
+
# Feature not in schema - will use sentinel
|
|
163
|
+
model_feature_indices.append(None)
|
|
164
|
+
|
|
165
|
+
# Build feature matrix for model features (n_candidates, n_model_features)
|
|
166
|
+
x = np.zeros((feature_matrix.shape[0], len(self.model.features)))
|
|
167
|
+
for i, idx in enumerate(model_feature_indices):
|
|
168
|
+
if idx is not None:
|
|
169
|
+
x[:, i] = feature_matrix[:, idx]
|
|
170
|
+
else:
|
|
171
|
+
# Unknown feature - use sentinel -1.0
|
|
172
|
+
x[:, i] = -1.0
|
|
173
|
+
|
|
174
|
+
# Vectorized logistic: sigmoid(x @ w + b)
|
|
175
|
+
weights = np.array(self.model.weights)
|
|
176
|
+
logits = x @ weights + self.model.bias
|
|
177
|
+
|
|
178
|
+
# Clamp to prevent overflow
|
|
179
|
+
logits = np.clip(logits, -20.0, 20.0)
|
|
180
|
+
|
|
181
|
+
# Sigmoid
|
|
182
|
+
probabilities = 1.0 / (1.0 + np.exp(-logits))
|
|
183
|
+
|
|
184
|
+
return probabilities.tolist()
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
"""Feature extraction for calibration."""
|
|
2
|
+
|
|
3
|
+
from typing import ClassVar
|
|
4
|
+
|
|
5
|
+
from resolvekit.types import Candidate
|
|
6
|
+
|
|
7
|
+
# numpy is optional: HAS_NUMPY records whether the vectorized code paths can be used
|
|
8
|
+
try:
|
|
9
|
+
import numpy as np
|
|
10
|
+
|
|
11
|
+
HAS_NUMPY = True
|
|
12
|
+
except ImportError:
|
|
13
|
+
HAS_NUMPY = False
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class FeatureExtractor:
    """
    Extracts standardized features from candidates for calibration.

    Reads from candidate.features dict (populated by matchers and constraints)
    and converts to normalized feature vector with sentinel values for missing data.
    """

    # Feature schema: feature_name -> expected type of the raw value.
    # The key order also defines the column order of extract_batch() matrices
    # and the key order of extract() results.
    FEATURE_SCHEMA: ClassVar[dict[str, type]] = {
        # Match type features (boolean -> 1.0 / 0.0)
        "f_exact_code": bool,
        "f_canonical_exact": bool,
        "f_alias_exact": bool,
        # Similarity features (float or None -> float with sentinel)
        "f_fts_score": float,
        "f_fts_rank_inv": float,  # Derived from fts_rank
        "f_edit_similarity": float,
        "f_trigram_jaccard": float,
        "f_fuzzy_score": float,
        # Constraint features (three-state: -1.0 not checked, 0.0 failed, 1.0 passed)
        "f_parent_valid": bool,
        "f_parent_depth": float,  # Normalized 0-1, or -1.0 if not checked
        "f_type_valid": bool,
        "f_date_valid": bool,
        "f_membership_valid": bool,
    }

    # Sentinel emitted when a numeric or constraint feature is absent.
    _MISSING = -1.0
    # Parent depth is divided by this value and clamped to [0, 1].
    _MAX_PARENT_DEPTH = 3.0

    def extract(self, candidate: Candidate) -> dict[str, float]:
        """
        Extract standardized features from candidate.

        Args:
            candidate: Candidate with features dict from matchers/constraints

        Returns:
            Dict of feature_name -> float value, keyed in FEATURE_SCHEMA order.
            Missing match-type flags use 0.0 (False).
            Missing similarity features use sentinel -1.0.
            Constraint flags are three-state: -1.0 (not checked), 0.0, 1.0.
        """
        raw = candidate.features

        # FTS rank inverse (1/rank, or sentinel if missing or not positive)
        fts_rank = raw.get("fts_rank")
        fts_rank_inv = 1.0 / fts_rank if fts_rank and fts_rank > 0 else self._MISSING

        # Parent depth normalized to [0, 1]. The lower clamp keeps an invalid
        # negative depth from producing -1.0, the "not checked" sentinel.
        parent_depth = raw.get("parent_depth")
        if parent_depth is None:
            parent_depth_norm = self._MISSING  # Not checked
        else:
            parent_depth_norm = min(
                max(parent_depth / self._MAX_PARENT_DEPTH, 0.0), 1.0
            )

        # Built in FEATURE_SCHEMA order so dict order matches matrix columns.
        features: dict[str, float] = {
            # Match type features (boolean -> 1.0 or 0.0)
            "f_exact_code": 1.0 if raw.get("exact_code") else 0.0,
            "f_canonical_exact": 1.0 if raw.get("canonical_exact") else 0.0,
            "f_alias_exact": 1.0 if raw.get("alias_exact") else 0.0,
            # Similarity features (with sentinel -1.0 for missing)
            "f_fts_score": self._get_float(raw, "fts_score", self._MISSING),
            "f_fts_rank_inv": fts_rank_inv,
            "f_edit_similarity": self._get_float(raw, "edit_similarity", self._MISSING),
            "f_trigram_jaccard": self._get_float(raw, "trigram_jaccard", self._MISSING),
            "f_fuzzy_score": self._get_float(raw, "fuzzy_score", self._MISSING),
            # Constraint features (three-state)
            "f_parent_valid": self._get_constraint_feature(raw, "parent_valid"),
            "f_parent_depth": parent_depth_norm,
            "f_type_valid": self._get_constraint_feature(raw, "type_valid"),
            "f_date_valid": self._get_constraint_feature(raw, "date_valid"),
            "f_membership_valid": self._get_constraint_feature(raw, "membership_valid"),
        }
        return features

    def _get_float(self, raw: dict, key: str, default: float) -> float:
        """Get float value or default if missing/None."""
        value = raw.get(key)
        return float(value) if value is not None else default

    def _get_constraint_feature(self, raw: dict, key: str) -> float:
        """
        Get constraint feature with three-state logic.

        Returns:
            -1.0: Constraint not checked (key missing or value is None)
            0.0: Constraint checked and failed (value is falsy)
            1.0: Constraint checked and passed (value is truthy)
        """
        value = raw.get(key)
        if value is None:
            return self._MISSING  # Not checked
        return 1.0 if value else 0.0  # Checked: True→1.0, False→0.0

    def extract_batch(
        self, candidates: list[Candidate]
    ) -> "np.ndarray | list[dict[str, float]]":
        """
        Extract features from multiple candidates.

        Args:
            candidates: List of candidates

        Returns:
            If numpy available: numpy array of shape (n_candidates, n_features),
            with columns in FEATURE_SCHEMA order (shape (0, n_features) when
            candidates is empty).
            If numpy unavailable: list of feature dicts
        """
        feature_names = list(self.FEATURE_SCHEMA)

        if not candidates:
            # Keep the 2-D shape for an empty batch so column indexing still works.
            return np.empty((0, len(feature_names))) if HAS_NUMPY else []

        # Extract features for all candidates
        feature_dicts = [self.extract(c) for c in candidates]

        if not HAS_NUMPY:
            return feature_dicts

        # Convert to numpy array, one row per candidate, columns in schema order
        return np.array(
            [[fd[fname] for fname in feature_names] for fd in feature_dicts]
        )
|