resolvekit 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- resolvekit/README.md +134 -0
- resolvekit/__init__.py +67 -0
- resolvekit/api/README.md +165 -0
- resolvekit/api/__init__.py +10 -0
- resolvekit/api/convenience.py +53 -0
- resolvekit/api/resolver.py +457 -0
- resolvekit/builders/README.md +173 -0
- resolvekit/builders/__init__.py +0 -0
- resolvekit/calibration/README.md +351 -0
- resolvekit/calibration/__init__.py +12 -0
- resolvekit/calibration/calibrator.py +184 -0
- resolvekit/calibration/features.py +139 -0
- resolvekit/calibration/models.py +78 -0
- resolvekit/cli/README.md +215 -0
- resolvekit/cli/__init__.py +0 -0
- resolvekit/cli/main.py +18 -0
- resolvekit/config.py +128 -0
- resolvekit/constants.py +252 -0
- resolvekit/constraints/README.md +102 -0
- resolvekit/constraints/__init__.py +17 -0
- resolvekit/constraints/constraint_engine.py +111 -0
- resolvekit/constraints/hierarchy_validator.py +148 -0
- resolvekit/constraints/membership_validator.py +60 -0
- resolvekit/constraints/protocols.py +33 -0
- resolvekit/constraints/temporal_validator.py +43 -0
- resolvekit/constraints/type_validator.py +42 -0
- resolvekit/data/README.md +165 -0
- resolvekit/data/__init__.py +14 -0
- resolvekit/data/alias_repository.py +206 -0
- resolvekit/data/code_repository.py +85 -0
- resolvekit/data/context_filters.py +49 -0
- resolvekit/data/db_manager.py +196 -0
- resolvekit/data/entity_repository.py +466 -0
- resolvekit/data/membership_repository.py +107 -0
- resolvekit/data/query_builder.py +177 -0
- resolvekit/data/schema.py +122 -0
- resolvekit/disambiguation/README.md +72 -0
- resolvekit/disambiguation/__init__.py +0 -0
- resolvekit/extraction/README.md +204 -0
- resolvekit/extraction/__init__.py +0 -0
- resolvekit/matchers/README.md +77 -0
- resolvekit/matchers/__init__.py +65 -0
- resolvekit/matchers/alias_exact.py +65 -0
- resolvekit/matchers/canonical_name.py +62 -0
- resolvekit/matchers/cascade.py +127 -0
- resolvekit/matchers/code_validators.py +250 -0
- resolvekit/matchers/exact_code.py +177 -0
- resolvekit/matchers/fts_matcher.py +106 -0
- resolvekit/matchers/fuzzy_matcher.py +142 -0
- resolvekit/matchers/priorities.py +174 -0
- resolvekit/matchers/protocols.py +75 -0
- resolvekit/normalization/README.md +192 -0
- resolvekit/normalization/__init__.py +8 -0
- resolvekit/normalization/normalizer.py +164 -0
- resolvekit/overlays/README.md +226 -0
- resolvekit/overlays/__init__.py +0 -0
- resolvekit/types.py +534 -0
- resolvekit/utils/README.md +188 -0
- resolvekit/utils/__init__.py +48 -0
- resolvekit/utils/cache.py +109 -0
- resolvekit/utils/dates.py +339 -0
- resolvekit/utils/errors.py +145 -0
- resolvekit/utils/files.py +366 -0
- resolvekit/utils/logging.py +219 -0
- resolvekit/utils/text.py +475 -0
- resolvekit/utils/validation.py +301 -0
- resolvekit-0.0.1.dist-info/METADATA +36 -0
- resolvekit-0.0.1.dist-info/RECORD +70 -0
- resolvekit-0.0.1.dist-info/WHEEL +4 -0
- resolvekit-0.0.1.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,457 @@
|
|
|
1
|
+
"""Main Resolver API."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import TYPE_CHECKING, Any
|
|
5
|
+
|
|
6
|
+
from resolvekit.calibration.calibrator import Calibrator
|
|
7
|
+
from resolvekit.calibration.models import load_calibration_model
|
|
8
|
+
from resolvekit.constraints.constraint_engine import ConstraintEngine
|
|
9
|
+
from resolvekit.data.alias_repository import AliasRepository
|
|
10
|
+
from resolvekit.data.code_repository import CodeRepository
|
|
11
|
+
from resolvekit.data.db_manager import DatabaseManager
|
|
12
|
+
from resolvekit.data.entity_repository import EntityRepository
|
|
13
|
+
from resolvekit.data.membership_repository import MembershipRepository
|
|
14
|
+
from resolvekit.matchers.alias_exact import AliasExactMatcher
|
|
15
|
+
from resolvekit.matchers.canonical_name import CanonicalNameMatcher
|
|
16
|
+
from resolvekit.matchers.cascade import MatcherCascade
|
|
17
|
+
from resolvekit.matchers.exact_code import ExactCodeMatcher
|
|
18
|
+
from resolvekit.matchers.fts_matcher import FTSMatcher
|
|
19
|
+
from resolvekit.matchers.fuzzy_matcher import FuzzyMatcher
|
|
20
|
+
from resolvekit.normalization.normalizer import TextNormalizer
|
|
21
|
+
from resolvekit.types import (
|
|
22
|
+
Candidate,
|
|
23
|
+
EntityType,
|
|
24
|
+
Explanation,
|
|
25
|
+
ExplanationMode,
|
|
26
|
+
MatchContext,
|
|
27
|
+
Resolution,
|
|
28
|
+
)
|
|
29
|
+
from resolvekit.utils.dates import parse_date
|
|
30
|
+
|
|
31
|
+
if TYPE_CHECKING:
|
|
32
|
+
import pandas as pd
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class Resolver:
|
|
36
|
+
"""
|
|
37
|
+
Main entity resolution API.
|
|
38
|
+
|
|
39
|
+
Orchestrates matchers, constraints, and calibration to resolve
|
|
40
|
+
entity strings to canonical entities with confidence scores.
|
|
41
|
+
"""
|
|
42
|
+
|
|
43
|
+
def __init__(
    self,
    *,
    data_pack: str | Path | None = None,
    overlays: list[str | Path] | None = None,
    min_confidence: float = 0.5,
    explanation_mode: ExplanationMode = ExplanationMode.STANDARD,
    top_k: int = 12,
    calibration_model: str | Path | None = None,
):
    """
    Initialize resolver with data pack and configuration.

    Opens the database connection and then wires up the repositories,
    matchers, matcher cascade, constraint engine and calibrator. If any
    step after the connection has been opened raises, the connection is
    closed again before the exception propagates: the caller never
    receives an instance in that case and so could not close it itself.

    Args:
        data_pack: Optional path to data pack. If None, the placeholder
            path ``data.db`` is used (auto-discovery is still a TODO).
        overlays: Optional list of overlay data packs (user/org
            customizations). An empty list is treated like None.
        min_confidence: Minimum calibrated confidence the top candidate
            needs for ``resolve()`` to return its entity (default 0.5)
        explanation_mode: Level of detail in explanations
        top_k: Maximum candidates requested from the matcher cascade
        calibration_model: Optional path to calibration model. When it is
            omitted (or falsy) the calibrator is built without a model.
    """
    self.min_confidence = min_confidence
    self.explanation_mode = explanation_mode
    self.top_k = top_k

    # Initialize normalizer (needed by repositories)
    self.normalizer = TextNormalizer()

    # Initialize database and repositories
    # TODO: Auto-discover default data pack if None
    if data_pack is not None:
        data_pack_path = (
            Path(data_pack) if isinstance(data_pack, str) else data_pack
        )
    else:
        # TODO: Auto-discover default data pack
        data_pack_path = Path("data.db")  # Temporary placeholder

    # Convert overlays to Path objects
    overlay_paths: list[Path] | None = None
    if overlays:
        overlay_paths = [
            Path(overlay) if isinstance(overlay, str) else overlay
            for overlay in overlays
        ]

    self.db_manager = DatabaseManager(data_pack_path, overlays=overlay_paths)
    self.db_manager.connect()

    # Everything below runs with an open connection. Release it on any
    # failure so a half-built resolver does not leak the connection.
    try:
        self.entity_repo = EntityRepository(self.db_manager)
        self.alias_repo = AliasRepository(self.db_manager, self.normalizer)
        self.code_repo = CodeRepository(self.db_manager, self.entity_repo)
        self.membership_repo = MembershipRepository(self.db_manager)

        # Initialize matchers
        self.exact_code = ExactCodeMatcher(self.code_repo)
        self.canonical_name = CanonicalNameMatcher(
            self.entity_repo, self.normalizer
        )
        self.alias_exact = AliasExactMatcher(self.alias_repo)
        self.fts = FTSMatcher(self.alias_repo)
        self.fuzzy = FuzzyMatcher(self.normalizer)

        # Initialize cascade
        self.cascade = MatcherCascade(
            exact_code=self.exact_code,
            canonical_name=self.canonical_name,
            alias_exact=self.alias_exact,
            fts=self.fts,
            fuzzy=self.fuzzy,
            normalizer=self.normalizer,
        )

        # Initialize constraint engine
        self.constraint_engine = ConstraintEngine(
            entity_repo=self.entity_repo,
            membership_repo=self.membership_repo,
        )

        # Initialize calibrator
        if calibration_model:
            calibration_model_path = (
                Path(calibration_model)
                if isinstance(calibration_model, str)
                else calibration_model
            )
            model = load_calibration_model(calibration_model_path)
            self.calibrator = Calibrator(model)
        else:
            # Heuristic mode (no model)
            self.calibrator = Calibrator()
    except BaseException:
        self.db_manager.close()
        raise
|
|
131
|
+
|
|
132
|
+
def resolve(
    self,
    query: str,
    context: MatchContext | None = None,
) -> Resolution:
    """
    Resolve a single entity query.

    The query runs through the matcher cascade, the constraint engine and
    the calibrator. Candidates are then ranked by calibrated score, and the
    best one is returned as the entity only if its score reaches
    ``self.min_confidence``. The reported confidence is always the best
    score, even when the entity is withheld.

    Args:
        query: Entity string to resolve
        context: Optional match context for filtering/disambiguation

    Returns:
        Resolution holding the chosen entity (or None), its confidence, the
        entities of up to five following candidates, and an explanation
        (None in MINIMAL mode).
    """
    # Stage 1: cascade (normalization + matching)
    matched = self.cascade.resolve(query, context=context, top_k=self.top_k)
    if not matched:
        no_match_explanation = None
        if self.explanation_mode != ExplanationMode.MINIMAL:
            no_match_explanation = self._build_explanation_empty(query)
        return Resolution(
            entity=None,
            confidence=0.0,
            alternatives=[],
            explanation=no_match_explanation,
        )

    # Stage 2: constraint engine (KG/temporal validation)
    surviving = self.constraint_engine.apply_constraints(matched, context)
    if not surviving:
        filtered_explanation = None
        if self.explanation_mode != ExplanationMode.MINIMAL:
            filtered_explanation = self._build_explanation_filtered(query)
        return Resolution(
            entity=None,
            confidence=0.0,
            alternatives=[],
            explanation=filtered_explanation,
        )

    # Stage 3: calibration (score -> confidence probability)
    scores = self.calibrator.calibrate_batch(surviving)

    # Stage 4: rank (candidate, score) pairs, highest score first
    ranked = list(zip(surviving, scores, strict=False))
    ranked.sort(key=lambda pair: pair[1], reverse=True)

    # Stage 5: apply the confidence threshold to the best pair
    best, best_confidence = ranked[0]
    if best_confidence < self.min_confidence:
        chosen = None
    else:
        chosen = best.entity

    # Stage 6: entities of the candidates ranked directly after the best
    runners_up = [candidate.entity for candidate, _ in ranked[1:6]]

    # Stage 7: explanation for the ranked result
    details = self._build_explanation(best, ranked, query)

    return Resolution(
        entity=chosen,
        confidence=best_confidence,
        alternatives=runners_up,
        explanation=details,
    )
|
|
205
|
+
|
|
206
|
+
def _build_explanation_empty(self, query: str) -> Explanation | None:
    """Explain a query for which no candidates were found (None in MINIMAL mode)."""
    minimal = self.explanation_mode == ExplanationMode.MINIMAL
    return (
        None
        if minimal
        else Explanation(
            stages=["cascade"],
            trace={"query": query, "reason": "no_candidates"},
        )
    )
|
|
214
|
+
|
|
215
|
+
def _build_explanation_filtered(self, query: str) -> Explanation | None:
    """Explain a query whose candidates were all removed by constraints (None in MINIMAL mode)."""
    minimal = self.explanation_mode == ExplanationMode.MINIMAL
    return (
        None
        if minimal
        else Explanation(
            stages=["cascade", "constraints"],
            trace={"query": query, "reason": "filtered_by_constraints"},
        )
    )
|
|
223
|
+
|
|
224
|
+
def _build_explanation(
    self,
    top_candidate: Candidate,
    sorted_candidates: list[tuple[Candidate, float]],
    query: str,
) -> Explanation | None:
    """
    Assemble the explanation for a ranked result, according to the mode.

    Args:
        top_candidate: Best-ranked candidate
        sorted_candidates: Every (candidate, score) pair, best first
        query: Original query

    Returns:
        None in MINIMAL mode. Otherwise an Explanation carrying the top
        candidate's matcher and a copy of its features, plus the three
        candidates ranked after it. In FULL mode it also carries a trace
        listing every candidate.
    """
    mode = self.explanation_mode
    if mode == ExplanationMode.MINIMAL:
        return None

    # STANDARD mode: summary of the winner and the next three candidates
    matcher = top_candidate.matcher_type
    feature_snapshot = top_candidate.features.copy()
    following = [pair[0] for pair in sorted_candidates[1:4]]
    result = Explanation(
        matcher_used=matcher,
        features=feature_snapshot,
        stages=["cascade", "constraints", "calibration"],
        candidates=following,
    )

    # FULL mode: attach a per-candidate trace as well
    if mode == ExplanationMode.FULL:
        rows = []
        for candidate, score in sorted_candidates:
            rows.append(
                {
                    "dcid": candidate.entity.dcid,
                    "name": candidate.entity.canonical_name,
                    "score": float(score),
                    "matcher": candidate.matcher_type.value,
                }
            )
        result.trace = {
            "query": query,
            "total_candidates": len(sorted_candidates),
            "all_candidates": rows,
        }

    return result
|
|
269
|
+
|
|
270
|
+
def resolve_many(
|
|
271
|
+
self,
|
|
272
|
+
queries: list[str],
|
|
273
|
+
context: MatchContext | list[MatchContext | None] | None = None,
|
|
274
|
+
) -> list[Resolution]:
|
|
275
|
+
"""
|
|
276
|
+
Resolve multiple queries.
|
|
277
|
+
|
|
278
|
+
Args:
|
|
279
|
+
queries: List of query strings
|
|
280
|
+
context: Optional context - applies to all queries if MatchContext,
|
|
281
|
+
or per-query if list (must match length of queries)
|
|
282
|
+
|
|
283
|
+
Returns:
|
|
284
|
+
List of Resolution objects (same order as queries)
|
|
285
|
+
"""
|
|
286
|
+
# Validate per-query context length
|
|
287
|
+
if isinstance(context, list):
|
|
288
|
+
if len(context) != len(queries):
|
|
289
|
+
raise ValueError(
|
|
290
|
+
f"context list length ({len(context)}) must match queries length ({len(queries)})"
|
|
291
|
+
)
|
|
292
|
+
# Per-query contexts - resolve individually
|
|
293
|
+
return [
|
|
294
|
+
self.resolve(query, ctx)
|
|
295
|
+
for query, ctx in zip(queries, context, strict=True)
|
|
296
|
+
]
|
|
297
|
+
|
|
298
|
+
# Shared context - deduplicate queries for efficiency
|
|
299
|
+
unique_queries = list(dict.fromkeys(queries))
|
|
300
|
+
|
|
301
|
+
# Resolve only unique queries
|
|
302
|
+
unique_resolutions = {
|
|
303
|
+
query: self.resolve(query, context) for query in unique_queries
|
|
304
|
+
}
|
|
305
|
+
|
|
306
|
+
# Map results back to original query order
|
|
307
|
+
return [unique_resolutions[query] for query in queries]
|
|
308
|
+
|
|
309
|
+
def __enter__(self) -> "Resolver":
|
|
310
|
+
"""Context manager entry."""
|
|
311
|
+
return self
|
|
312
|
+
|
|
313
|
+
def __exit__(self, *args: object) -> None:
|
|
314
|
+
"""Context manager exit - clean up resources."""
|
|
315
|
+
# Close database connections
|
|
316
|
+
if hasattr(self, "db_manager"):
|
|
317
|
+
self.db_manager.close()
|
|
318
|
+
|
|
319
|
+
def resolve_dataframe( # noqa: PLR0912
|
|
320
|
+
self,
|
|
321
|
+
df: "pd.DataFrame",
|
|
322
|
+
query_column: str,
|
|
323
|
+
resolve_to: str | list[str] = "dcid",
|
|
324
|
+
*,
|
|
325
|
+
context_columns: dict[str, str] | None = None,
|
|
326
|
+
context: MatchContext | None = None,
|
|
327
|
+
include_confidence: bool = True,
|
|
328
|
+
output_prefix: str = "",
|
|
329
|
+
) -> "pd.DataFrame":
|
|
330
|
+
"""
|
|
331
|
+
Resolve entities from DataFrame column to specified code system(s).
|
|
332
|
+
|
|
333
|
+
Args:
|
|
334
|
+
df: Input DataFrame
|
|
335
|
+
query_column: Column name containing entity strings to resolve
|
|
336
|
+
resolve_to: What to resolve to (default: "dcid")
|
|
337
|
+
context_columns: Map MatchContext fields to DataFrame columns
|
|
338
|
+
context: Single MatchContext for all rows
|
|
339
|
+
include_confidence: Add confidence score column
|
|
340
|
+
output_prefix: Prefix for output columns
|
|
341
|
+
|
|
342
|
+
Returns:
|
|
343
|
+
DataFrame with resolution results added as columns
|
|
344
|
+
"""
|
|
345
|
+
# Import pandas only when needed
|
|
346
|
+
try:
|
|
347
|
+
import pandas as pd
|
|
348
|
+
except ImportError as e:
|
|
349
|
+
raise ImportError("pandas is required for resolve_dataframe(). ") from e
|
|
350
|
+
|
|
351
|
+
# Validate inputs
|
|
352
|
+
if query_column not in df.columns:
|
|
353
|
+
raise ValueError(f"Column '{query_column}' not found in DataFrame")
|
|
354
|
+
|
|
355
|
+
# Normalize resolve_to to list
|
|
356
|
+
if isinstance(resolve_to, str):
|
|
357
|
+
resolve_to = [resolve_to]
|
|
358
|
+
|
|
359
|
+
# Deduplication optimization
|
|
360
|
+
if context_columns is None:
|
|
361
|
+
# Simple case: shared context, dedupe on query only
|
|
362
|
+
unique_queries = df[query_column].unique().tolist()
|
|
363
|
+
unique_resolutions = self.resolve_many(unique_queries, context=context)
|
|
364
|
+
|
|
365
|
+
# Create lookup dict
|
|
366
|
+
lookup = dict(zip(unique_queries, unique_resolutions, strict=True))
|
|
367
|
+
|
|
368
|
+
# Map back to all rows
|
|
369
|
+
resolutions = [lookup[q] for q in df[query_column]]
|
|
370
|
+
else:
|
|
371
|
+
# Build per-row contexts from context_columns
|
|
372
|
+
queries = df[query_column].tolist()
|
|
373
|
+
|
|
374
|
+
# Validate context columns exist
|
|
375
|
+
for field_name, col_name in context_columns.items():
|
|
376
|
+
if col_name not in df.columns:
|
|
377
|
+
raise ValueError(
|
|
378
|
+
f"Context column '{col_name}' (for field '{field_name}') not found in DataFrame"
|
|
379
|
+
)
|
|
380
|
+
|
|
381
|
+
# Build MatchContext for each row
|
|
382
|
+
# Extract only the columns we need and convert to list of dicts
|
|
383
|
+
context_df = df[list(context_columns.values())]
|
|
384
|
+
records = context_df.to_dict("records")
|
|
385
|
+
|
|
386
|
+
# Map column names to field names for lookup
|
|
387
|
+
col_to_field = {
|
|
388
|
+
col_name: field_name for field_name, col_name in context_columns.items()
|
|
389
|
+
}
|
|
390
|
+
|
|
391
|
+
# Build contexts from records
|
|
392
|
+
contexts = []
|
|
393
|
+
for record in records:
|
|
394
|
+
context_kwargs: dict[str, Any] = {}
|
|
395
|
+
for col_name_raw, value in record.items():
|
|
396
|
+
col_name = str(col_name_raw) # Convert Hashable to str
|
|
397
|
+
if pd.isna(value):
|
|
398
|
+
continue
|
|
399
|
+
|
|
400
|
+
field_name = col_to_field[col_name]
|
|
401
|
+
|
|
402
|
+
# Type conversions
|
|
403
|
+
if field_name == "entity_type":
|
|
404
|
+
context_kwargs[field_name] = EntityType(value)
|
|
405
|
+
elif field_name == "as_of":
|
|
406
|
+
context_kwargs[field_name] = parse_date(value)
|
|
407
|
+
else:
|
|
408
|
+
context_kwargs[field_name] = value
|
|
409
|
+
|
|
410
|
+
contexts.append(
|
|
411
|
+
MatchContext(**context_kwargs) if context_kwargs else None
|
|
412
|
+
)
|
|
413
|
+
|
|
414
|
+
resolutions = self.resolve_many(queries, context=contexts)
|
|
415
|
+
|
|
416
|
+
# Build output DataFrame
|
|
417
|
+
return self._build_output_dataframe(
|
|
418
|
+
df, resolutions, resolve_to, include_confidence, output_prefix
|
|
419
|
+
)
|
|
420
|
+
|
|
421
|
+
def _build_output_dataframe(
|
|
422
|
+
self,
|
|
423
|
+
df: "pd.DataFrame",
|
|
424
|
+
resolutions: list[Resolution],
|
|
425
|
+
resolve_to: list[str],
|
|
426
|
+
include_confidence: bool,
|
|
427
|
+
output_prefix: str,
|
|
428
|
+
) -> "pd.DataFrame":
|
|
429
|
+
"""Build output DataFrame with resolution results."""
|
|
430
|
+
|
|
431
|
+
# Copy input DataFrame
|
|
432
|
+
result_df = df.copy()
|
|
433
|
+
|
|
434
|
+
# Extract requested fields from resolutions
|
|
435
|
+
for field in resolve_to:
|
|
436
|
+
col_name = f"{output_prefix}{field}"
|
|
437
|
+
values: list[str | None] = []
|
|
438
|
+
|
|
439
|
+
for res in resolutions:
|
|
440
|
+
if res.entity is None:
|
|
441
|
+
values.append(None)
|
|
442
|
+
elif field == "dcid":
|
|
443
|
+
values.append(res.entity.dcid)
|
|
444
|
+
elif field == "canonical_name":
|
|
445
|
+
values.append(res.entity.canonical_name)
|
|
446
|
+
else:
|
|
447
|
+
# Code system (iso3, m49, etc.)
|
|
448
|
+
values.append(res.entity.codes.get(field))
|
|
449
|
+
|
|
450
|
+
result_df[col_name] = values
|
|
451
|
+
|
|
452
|
+
# Add confidence column
|
|
453
|
+
if include_confidence:
|
|
454
|
+
conf_col = f"{output_prefix}confidence"
|
|
455
|
+
result_df[conf_col] = [r.confidence for r in resolutions]
|
|
456
|
+
|
|
457
|
+
return result_df
|
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
# Builders Module
|
|
2
|
+
|
|
3
|
+
## Purpose
|
|
4
|
+
|
|
5
|
+
The builders module contains tools for building data packs from source data, including ETL pipelines, validation, and packaging.
|
|
6
|
+
|
|
7
|
+
## Components
|
|
8
|
+
|
|
9
|
+
### Core Builders
|
|
10
|
+
|
|
11
|
+
1. **Pack Builder** (`pack_builder.py`)
|
|
12
|
+
- Orchestrates full data pack build process
|
|
13
|
+
- Runs extract, transform, validate, enrich, package stages
|
|
14
|
+
- Generates manifests and checksums
|
|
15
|
+
|
|
16
|
+
2. **Schema Builder** (`schema_builder.py`)
|
|
17
|
+
- Creates SQLite databases with proper schema
|
|
18
|
+
- Builds FTS5 indexes
|
|
19
|
+
- Applies optimizations (PRAGMA, indexes)
|
|
20
|
+
|
|
21
|
+
3. **Calibration Trainer** (`calibration_trainer.py`)
|
|
22
|
+
- Trains calibration models on labeled data
|
|
23
|
+
- Evaluates model quality (ECE, Brier score)
|
|
24
|
+
- Exports models to JSON
|
|
25
|
+
|
|
26
|
+
4. **Sidecar Builder** (`sidecar_builder.py`)
|
|
27
|
+
- Builds ambiguity sidecar (HNSW index)
|
|
28
|
+
- Curates ambiguous aliases
|
|
29
|
+
- Generates embeddings and quantizes them (Phase E)
|
|
30
|
+
|
|
31
|
+
### ETL Pipeline
|
|
32
|
+
|
|
33
|
+
- `extractors.py`: Extract data from sources (APIs, dumps, files)
|
|
34
|
+
- `transformers.py`: Normalize, align, resolve conflicts
|
|
35
|
+
- `validators.py`: Referential integrity, format validation
|
|
36
|
+
- `enrichers.py`: Derive hierarchies, compute statistics
|
|
37
|
+
- `packagers.py`: Create archives, generate metadata
|
|
38
|
+
|
|
39
|
+
### Data Sources
|
|
40
|
+
|
|
41
|
+
- `sources/`: Source-specific extractors
|
|
42
|
+
- `data_commons.py`: Data Commons entity extraction
|
|
43
|
+
- `iso.py`: ISO 3166 codes
|
|
44
|
+
- `wikidata.py`: Wikidata aliases and temporal data
|
|
45
|
+
- `geonames.py`: GeoNames data
|
|
46
|
+
- `custom.py`: Custom data ingestion
|
|
47
|
+
|
|
48
|
+
### Quality Assurance
|
|
49
|
+
|
|
50
|
+
- `quality_checks.py`: Data quality validation
|
|
51
|
+
- `coverage_metrics.py`: Coverage analysis
|
|
52
|
+
- `conflict_resolver.py`: Handle source conflicts
|
|
53
|
+
- `deduplicator.py`: Entity deduplication
|
|
54
|
+
|
|
55
|
+
## Build Pipeline Stages
|
|
56
|
+
|
|
57
|
+
### 1. Extract
|
|
58
|
+
|
|
59
|
+
```python
|
|
60
|
+
# Extract from multiple sources
|
|
61
|
+
entities_dc = extract_data_commons()
|
|
62
|
+
entities_iso = extract_iso_codes()
|
|
63
|
+
entities_wd = extract_wikidata()
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
### 2. Transform
|
|
67
|
+
|
|
68
|
+
```python
|
|
69
|
+
# Normalize and align entities
|
|
70
|
+
entities = align_entities(entities_dc, entities_iso, entities_wd)
|
|
71
|
+
entities = normalize_text(entities)
|
|
72
|
+
entities = resolve_conflicts(entities, precedence_rules)
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
### 3. Validate
|
|
76
|
+
|
|
77
|
+
```python
|
|
78
|
+
# Validate data quality
|
|
79
|
+
check_referential_integrity(entities)
|
|
80
|
+
check_code_formats(entities)
|
|
81
|
+
check_temporal_consistency(entities)
|
|
82
|
+
flag_suspicious_data(entities)
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
### 4. Enrich
|
|
86
|
+
|
|
87
|
+
```python
|
|
88
|
+
# Compute derived data
|
|
89
|
+
compute_hierarchy_paths(entities)
|
|
90
|
+
build_fts_indexes(entities)
|
|
91
|
+
generate_statistics(entities)
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
### 5. Package
|
|
95
|
+
|
|
96
|
+
```python
|
|
97
|
+
# Create data pack
|
|
98
|
+
export_to_sqlite(entities, "base.sqlite")
|
|
99
|
+
build_manifest(pack_info)
|
|
100
|
+
compute_checksums(files)
|
|
101
|
+
create_archive("resolvekit-data-1.2.0.tar.gz")
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
## CLI for Building
|
|
105
|
+
|
|
106
|
+
```bash
|
|
107
|
+
# Build full data pack
|
|
108
|
+
resolvekit-build pack \
|
|
109
|
+
--sources wikidata,iso,datacommons \
|
|
110
|
+
--output packs/1.2.0/ \
|
|
111
|
+
--validate
|
|
112
|
+
|
|
113
|
+
# Build overlay from CSV
|
|
114
|
+
resolvekit-build overlay \
|
|
115
|
+
--input custom_aliases.csv \
|
|
116
|
+
--name my-custom-pack \
|
|
117
|
+
--output overlays/
|
|
118
|
+
|
|
119
|
+
# Train calibration model
|
|
120
|
+
resolvekit-build calibration \
|
|
121
|
+
--training-data labeled_pairs.csv \
|
|
122
|
+
--output calibration.json
|
|
123
|
+
|
|
124
|
+
# Build ambiguity sidecar
|
|
125
|
+
resolvekit-build sidecar \
|
|
126
|
+
--registry ambiguity_registry.csv \
|
|
127
|
+
--base base.sqlite \
|
|
128
|
+
--output ambiguity.hnsw
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
## Manifest Generation
|
|
132
|
+
|
|
133
|
+
```python
|
|
134
|
+
manifest = {
|
|
135
|
+
"pack_name": "resolvekit-core-countries",
|
|
136
|
+
"version": "1.2.0",
|
|
137
|
+
"build_time": datetime.now(UTC).isoformat(),
|
|
138
|
+
"schema_hash": compute_schema_hash(),
|
|
139
|
+
"sources": [
|
|
140
|
+
{"name": "DataCommons", "snapshot": "2025-09-15"},
|
|
141
|
+
{"name": "ISO3166", "snapshot": "2025-09-01"}
|
|
142
|
+
],
|
|
143
|
+
"components": {
|
|
144
|
+
"base_sqlite": "base.sqlite",
|
|
145
|
+
"fts_built": True,
|
|
146
|
+
"ambiguity_registry": True,
|
|
147
|
+
"calibration": "calibration.json"
|
|
148
|
+
},
|
|
149
|
+
"checksums": {
|
|
150
|
+
"base.sqlite": compute_sha256("base.sqlite"),
|
|
151
|
+
"calibration.json": compute_sha256("calibration.json")
|
|
152
|
+
},
|
|
153
|
+
"license": "CC-BY 4.0 (data), Apache-2.0 (code)",
|
|
154
|
+
"compat": {
|
|
155
|
+
"resolver_min": "0.9.0",
|
|
156
|
+
"resolver_max": "<2.0.0"
|
|
157
|
+
}
|
|
158
|
+
}
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
## Design Principles
|
|
162
|
+
|
|
163
|
+
1. **Automated**: CI/CD pipeline for regular builds
|
|
164
|
+
2. **Reproducible**: Same sources → same output
|
|
165
|
+
3. **Validated**: Quality gates prevent bad data
|
|
166
|
+
4. **Versioned**: Semantic versioning for packs
|
|
167
|
+
|
|
168
|
+
## Implementation Priority
|
|
169
|
+
|
|
170
|
+
**Phase A** - Basic schema builder
|
|
171
|
+
**Phase C** - Overlay builders
|
|
172
|
+
**Phase D** - Full pack building and distribution
|
|
173
|
+
**Phase E** - Sidecar builder
|
|
File without changes
|