resolvekit 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. resolvekit/README.md +134 -0
  2. resolvekit/__init__.py +67 -0
  3. resolvekit/api/README.md +165 -0
  4. resolvekit/api/__init__.py +10 -0
  5. resolvekit/api/convenience.py +53 -0
  6. resolvekit/api/resolver.py +457 -0
  7. resolvekit/builders/README.md +173 -0
  8. resolvekit/builders/__init__.py +0 -0
  9. resolvekit/calibration/README.md +351 -0
  10. resolvekit/calibration/__init__.py +12 -0
  11. resolvekit/calibration/calibrator.py +184 -0
  12. resolvekit/calibration/features.py +139 -0
  13. resolvekit/calibration/models.py +78 -0
  14. resolvekit/cli/README.md +215 -0
  15. resolvekit/cli/__init__.py +0 -0
  16. resolvekit/cli/main.py +18 -0
  17. resolvekit/config.py +128 -0
  18. resolvekit/constants.py +252 -0
  19. resolvekit/constraints/README.md +102 -0
  20. resolvekit/constraints/__init__.py +17 -0
  21. resolvekit/constraints/constraint_engine.py +111 -0
  22. resolvekit/constraints/hierarchy_validator.py +148 -0
  23. resolvekit/constraints/membership_validator.py +60 -0
  24. resolvekit/constraints/protocols.py +33 -0
  25. resolvekit/constraints/temporal_validator.py +43 -0
  26. resolvekit/constraints/type_validator.py +42 -0
  27. resolvekit/data/README.md +165 -0
  28. resolvekit/data/__init__.py +14 -0
  29. resolvekit/data/alias_repository.py +206 -0
  30. resolvekit/data/code_repository.py +85 -0
  31. resolvekit/data/context_filters.py +49 -0
  32. resolvekit/data/db_manager.py +196 -0
  33. resolvekit/data/entity_repository.py +466 -0
  34. resolvekit/data/membership_repository.py +107 -0
  35. resolvekit/data/query_builder.py +177 -0
  36. resolvekit/data/schema.py +122 -0
  37. resolvekit/disambiguation/README.md +72 -0
  38. resolvekit/disambiguation/__init__.py +0 -0
  39. resolvekit/extraction/README.md +204 -0
  40. resolvekit/extraction/__init__.py +0 -0
  41. resolvekit/matchers/README.md +77 -0
  42. resolvekit/matchers/__init__.py +65 -0
  43. resolvekit/matchers/alias_exact.py +65 -0
  44. resolvekit/matchers/canonical_name.py +62 -0
  45. resolvekit/matchers/cascade.py +127 -0
  46. resolvekit/matchers/code_validators.py +250 -0
  47. resolvekit/matchers/exact_code.py +177 -0
  48. resolvekit/matchers/fts_matcher.py +106 -0
  49. resolvekit/matchers/fuzzy_matcher.py +142 -0
  50. resolvekit/matchers/priorities.py +174 -0
  51. resolvekit/matchers/protocols.py +75 -0
  52. resolvekit/normalization/README.md +192 -0
  53. resolvekit/normalization/__init__.py +8 -0
  54. resolvekit/normalization/normalizer.py +164 -0
  55. resolvekit/overlays/README.md +226 -0
  56. resolvekit/overlays/__init__.py +0 -0
  57. resolvekit/types.py +534 -0
  58. resolvekit/utils/README.md +188 -0
  59. resolvekit/utils/__init__.py +48 -0
  60. resolvekit/utils/cache.py +109 -0
  61. resolvekit/utils/dates.py +339 -0
  62. resolvekit/utils/errors.py +145 -0
  63. resolvekit/utils/files.py +366 -0
  64. resolvekit/utils/logging.py +219 -0
  65. resolvekit/utils/text.py +475 -0
  66. resolvekit/utils/validation.py +301 -0
  67. resolvekit-0.0.1.dist-info/METADATA +36 -0
  68. resolvekit-0.0.1.dist-info/RECORD +70 -0
  69. resolvekit-0.0.1.dist-info/WHEEL +4 -0
  70. resolvekit-0.0.1.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,457 @@
1
+ """Main Resolver API."""
2
+
3
+ from pathlib import Path
4
+ from typing import TYPE_CHECKING, Any
5
+
6
+ from resolvekit.calibration.calibrator import Calibrator
7
+ from resolvekit.calibration.models import load_calibration_model
8
+ from resolvekit.constraints.constraint_engine import ConstraintEngine
9
+ from resolvekit.data.alias_repository import AliasRepository
10
+ from resolvekit.data.code_repository import CodeRepository
11
+ from resolvekit.data.db_manager import DatabaseManager
12
+ from resolvekit.data.entity_repository import EntityRepository
13
+ from resolvekit.data.membership_repository import MembershipRepository
14
+ from resolvekit.matchers.alias_exact import AliasExactMatcher
15
+ from resolvekit.matchers.canonical_name import CanonicalNameMatcher
16
+ from resolvekit.matchers.cascade import MatcherCascade
17
+ from resolvekit.matchers.exact_code import ExactCodeMatcher
18
+ from resolvekit.matchers.fts_matcher import FTSMatcher
19
+ from resolvekit.matchers.fuzzy_matcher import FuzzyMatcher
20
+ from resolvekit.normalization.normalizer import TextNormalizer
21
+ from resolvekit.types import (
22
+ Candidate,
23
+ EntityType,
24
+ Explanation,
25
+ ExplanationMode,
26
+ MatchContext,
27
+ Resolution,
28
+ )
29
+ from resolvekit.utils.dates import parse_date
30
+
31
+ if TYPE_CHECKING:
32
+ import pandas as pd
33
+
34
+
35
class Resolver:
    """
    Main entity resolution API.

    Orchestrates matchers, constraints, and calibration to resolve
    entity strings to canonical entities with confidence scores.

    Constructing a resolver connects its ``DatabaseManager``. Release the
    connection with :meth:`close`, or use the resolver as a context manager.
    """

    def __init__(
        self,
        *,
        data_pack: str | Path | None = None,
        overlays: list[str | Path] | None = None,
        min_confidence: float = 0.5,
        explanation_mode: ExplanationMode = ExplanationMode.STANDARD,
        top_k: int = 12,
        calibration_model: str | Path | None = None,
    ):
        """
        Initialize resolver with data pack and configuration.

        Args:
            data_pack: Optional path to data pack. If None, the placeholder
                path ``data.db`` is used until auto-discovery is implemented.
            overlays: Optional list of overlay data packs (user/org customizations)
            min_confidence: Minimum confidence threshold (default 0.5)
            explanation_mode: Level of detail in explanations
            top_k: Maximum candidates to consider
            calibration_model: Optional path to calibration model. When
                omitted (or empty) the calibrator is built without a model.
        """
        self.min_confidence = min_confidence
        self.explanation_mode = explanation_mode
        self.top_k = top_k

        # Initialize normalizer (needed by repositories)
        self.normalizer = TextNormalizer()

        # Build the calibrator first: it does not depend on the database, so a
        # bad model path fails before any connection has been opened.
        if calibration_model:
            model = load_calibration_model(Path(calibration_model))
            self.calibrator = Calibrator(model)
        else:
            # Heuristic mode (no model)
            self.calibrator = Calibrator()

        # TODO: Auto-discover default data pack if None
        if data_pack is not None:
            data_pack_path = Path(data_pack)
        else:
            data_pack_path = Path("data.db")  # Temporary placeholder

        # An empty or missing overlay list is passed on as None.
        overlay_paths: list[Path] | None = None
        if overlays:
            overlay_paths = [Path(overlay) for overlay in overlays]

        self.db_manager = DatabaseManager(data_pack_path, overlays=overlay_paths)
        self.db_manager.connect()
        try:
            self._init_pipeline()
        except BaseException:
            # Do not leak the freshly opened connection if wiring fails.
            self.db_manager.close()
            raise

    def _init_pipeline(self) -> None:
        """Create the repositories, matchers, cascade and constraint engine."""
        # Repositories
        self.entity_repo = EntityRepository(self.db_manager)
        self.alias_repo = AliasRepository(self.db_manager, self.normalizer)
        self.code_repo = CodeRepository(self.db_manager, self.entity_repo)
        self.membership_repo = MembershipRepository(self.db_manager)

        # Matchers
        self.exact_code = ExactCodeMatcher(self.code_repo)
        self.canonical_name = CanonicalNameMatcher(self.entity_repo, self.normalizer)
        self.alias_exact = AliasExactMatcher(self.alias_repo)
        self.fts = FTSMatcher(self.alias_repo)
        self.fuzzy = FuzzyMatcher(self.normalizer)

        # Cascade
        self.cascade = MatcherCascade(
            exact_code=self.exact_code,
            canonical_name=self.canonical_name,
            alias_exact=self.alias_exact,
            fts=self.fts,
            fuzzy=self.fuzzy,
            normalizer=self.normalizer,
        )

        # Constraint engine
        self.constraint_engine = ConstraintEngine(
            entity_repo=self.entity_repo,
            membership_repo=self.membership_repo,
        )

    def resolve(
        self,
        query: str,
        context: MatchContext | None = None,
    ) -> Resolution:
        """
        Resolve single entity query.

        Args:
            query: Entity string to resolve
            context: Optional match context for filtering/disambiguation

        Returns:
            Resolution result. ``entity`` is None when nothing matched, when
            every candidate was removed by constraints, or when the best
            calibrated confidence is below ``min_confidence``. In the last
            case ``confidence`` and ``alternatives`` are still populated.

        Raises:
            ValueError: If the calibrator returns a different number of
                scores than there are candidates.
        """
        # Stage 1: Cascade (normalization + matching)
        candidates = self.cascade.resolve(query, context=context, top_k=self.top_k)

        # No candidates found (the helper returns None in MINIMAL mode)
        if not candidates:
            return Resolution(
                entity=None,
                confidence=0.0,
                alternatives=[],
                explanation=self._build_explanation_empty(query),
            )

        # Stage 2: Constraint engine (KG/temporal validation)
        candidates = self.constraint_engine.apply_constraints(candidates, context)

        # All filtered by constraints (the helper returns None in MINIMAL mode)
        if not candidates:
            return Resolution(
                entity=None,
                confidence=0.0,
                alternatives=[],
                explanation=self._build_explanation_filtered(query),
            )

        # Stage 3: Calibration (score → confidence probability)
        calibrated_scores = self.calibrator.calibrate_batch(candidates)

        # Stage 4: Sort by calibrated confidence. strict=True so that a
        # score/candidate length mismatch fails loudly instead of silently
        # dropping candidates.
        sorted_candidates = sorted(
            zip(candidates, calibrated_scores, strict=True),
            key=lambda pair: pair[1],
            reverse=True,
        )

        # Stage 5: Threshold filtering
        top_candidate, top_confidence = sorted_candidates[0]
        entity = None if top_confidence < self.min_confidence else top_candidate.entity

        # Stage 6: Build alternatives (up to five next-best candidates)
        alternatives = [c.entity for c, _ in sorted_candidates[1:6]]

        # Stage 7: Build explanation
        explanation = self._build_explanation(
            top_candidate,
            sorted_candidates,
            query,
        )

        return Resolution(
            entity=entity,
            confidence=top_confidence,
            alternatives=alternatives,
            explanation=explanation,
        )

    def _build_explanation_empty(self, query: str) -> Explanation | None:
        """Build explanation for empty results (None in MINIMAL mode)."""
        if self.explanation_mode == ExplanationMode.MINIMAL:
            return None
        return Explanation(
            stages=["cascade"],
            trace={"query": query, "reason": "no_candidates"},
        )

    def _build_explanation_filtered(self, query: str) -> Explanation | None:
        """Build explanation for constraint-filtered results (None in MINIMAL mode)."""
        if self.explanation_mode == ExplanationMode.MINIMAL:
            return None
        return Explanation(
            stages=["cascade", "constraints"],
            trace={"query": query, "reason": "filtered_by_constraints"},
        )

    def _build_explanation(
        self,
        top_candidate: Candidate,
        sorted_candidates: list[tuple[Candidate, float]],
        query: str,
    ) -> Explanation | None:
        """
        Build explanation based on mode.

        Args:
            top_candidate: Top candidate
            sorted_candidates: All candidates with scores, best first
            query: Original query

        Returns:
            Explanation or None (for MINIMAL mode). FULL mode additionally
            attaches a trace listing every candidate with its score.
        """
        if self.explanation_mode == ExplanationMode.MINIMAL:
            return None

        # STANDARD mode
        explanation = Explanation(
            matcher_used=top_candidate.matcher_type,
            # Copy so the explanation does not share the candidate's mapping.
            features=top_candidate.features.copy(),
            stages=["cascade", "constraints", "calibration"],
            candidates=[c for c, _ in sorted_candidates[1:4]],  # Next 3 as alternatives
        )

        # FULL mode - add trace
        if self.explanation_mode == ExplanationMode.FULL:
            explanation.trace = {
                "query": query,
                "total_candidates": len(sorted_candidates),
                "all_candidates": [
                    {
                        "dcid": c.entity.dcid,
                        "name": c.entity.canonical_name,
                        "score": float(score),
                        "matcher": c.matcher_type.value,
                    }
                    for c, score in sorted_candidates
                ],
            }

        return explanation

    def resolve_many(
        self,
        queries: list[str],
        context: MatchContext | list[MatchContext | None] | None = None,
    ) -> list[Resolution]:
        """
        Resolve multiple queries.

        Args:
            queries: List of query strings
            context: Optional context - applies to all queries if MatchContext,
                or per-query if list (must match length of queries)

        Returns:
            List of Resolution objects (same order as queries). With a shared
            context, repeated queries are resolved once and share the same
            Resolution object.

        Raises:
            ValueError: If ``context`` is a list whose length differs from
                ``queries``.
        """
        # Validate per-query context length
        if isinstance(context, list):
            if len(context) != len(queries):
                raise ValueError(
                    f"context list length ({len(context)}) must match queries length ({len(queries)})"
                )
            # Per-query contexts - resolve individually
            return [
                self.resolve(query, ctx)
                for query, ctx in zip(queries, context, strict=True)
            ]

        # Shared context - resolve each distinct query once (first-seen order)
        unique_resolutions = {
            query: self.resolve(query, context) for query in dict.fromkeys(queries)
        }

        # Map results back to original query order
        return [unique_resolutions[query] for query in queries]

    def close(self) -> None:
        """Close the database connection held by this resolver."""
        # db_manager is missing if __init__ failed before creating it.
        if hasattr(self, "db_manager"):
            self.db_manager.close()

    def __enter__(self) -> "Resolver":
        """Context manager entry."""
        return self

    def __exit__(self, *args: object) -> None:
        """Context manager exit - clean up resources."""
        self.close()

    def resolve_dataframe(  # noqa: PLR0912
        self,
        df: "pd.DataFrame",
        query_column: str,
        resolve_to: str | list[str] = "dcid",
        *,
        context_columns: dict[str, str] | None = None,
        context: MatchContext | None = None,
        include_confidence: bool = True,
        output_prefix: str = "",
    ) -> "pd.DataFrame":
        """
        Resolve entities from DataFrame column to specified code system(s).

        Args:
            df: Input DataFrame
            query_column: Column name containing entity strings to resolve
            resolve_to: What to resolve to (default: "dcid")
            context_columns: Map MatchContext fields to DataFrame columns.
                Missing cell values are skipped; a row with no usable values
                is resolved without a context.
            context: Single MatchContext for all rows. Only used when
                ``context_columns`` is None.
            include_confidence: Add confidence score column
            output_prefix: Prefix for output columns

        Returns:
            DataFrame with resolution results added as columns

        Raises:
            ImportError: If pandas is not installed.
            ValueError: If ``query_column`` or a context column is not in ``df``.
        """
        # Import pandas only when needed
        try:
            import pandas as pd
        except ImportError as e:
            raise ImportError("pandas is required for resolve_dataframe(). ") from e

        # Validate inputs
        if query_column not in df.columns:
            raise ValueError(f"Column '{query_column}' not found in DataFrame")

        # Normalize resolve_to to list
        if isinstance(resolve_to, str):
            resolve_to = [resolve_to]

        queries = df[query_column].tolist()

        if context_columns is None:
            # Shared context: resolve_many already resolves each distinct
            # query once and returns results in row order.
            resolutions = self.resolve_many(queries, context=context)
        else:
            # Validate context columns exist
            for field_name, col_name in context_columns.items():
                if col_name not in df.columns:
                    raise ValueError(
                        f"Context column '{col_name}' (for field '{field_name}') not found in DataFrame"
                    )

            # Read each mapped column once, keyed by MatchContext field, so
            # two fields may safely read from the same column.
            field_values = {
                field_name: df[col_name].tolist()
                for field_name, col_name in context_columns.items()
            }

            # Build one MatchContext (or None) per row
            contexts: list[MatchContext | None] = []
            for row_idx in range(len(queries)):
                context_kwargs: dict[str, Any] = {}
                for field_name, column in field_values.items():
                    value = column[row_idx]
                    if pd.isna(value):
                        continue
                    context_kwargs[field_name] = self._coerce_context_value(
                        field_name, value
                    )
                contexts.append(
                    MatchContext(**context_kwargs) if context_kwargs else None
                )

            resolutions = self.resolve_many(queries, context=contexts)

        # Build output DataFrame
        return self._build_output_dataframe(
            df, resolutions, resolve_to, include_confidence, output_prefix
        )

    @staticmethod
    def _coerce_context_value(field_name: str, value: Any) -> Any:
        """Convert a raw cell value to the type used for a MatchContext field."""
        if field_name == "entity_type":
            return EntityType(value)
        if field_name == "as_of":
            return parse_date(value)
        return value

    def _build_output_dataframe(
        self,
        df: "pd.DataFrame",
        resolutions: list[Resolution],
        resolve_to: list[str],
        include_confidence: bool,
        output_prefix: str,
    ) -> "pd.DataFrame":
        """
        Build output DataFrame with resolution results.

        Args:
            df: Input DataFrame (left unmodified; a copy is returned)
            resolutions: One Resolution per row of ``df``, in row order
            resolve_to: Fields to emit: "dcid", "canonical_name", or a key
                looked up in the entity's ``codes`` mapping
            include_confidence: Whether to add the confidence column
            output_prefix: Prefix applied to every added column name

        Returns:
            Copy of ``df`` with one column per requested field, plus an
            optional ``confidence`` column.
        """
        # Copy input DataFrame
        result_df = df.copy()

        # Extract requested fields from resolutions
        for field in resolve_to:
            col_name = f"{output_prefix}{field}"
            values: list[str | None] = []

            for res in resolutions:
                if res.entity is None:
                    # Unresolved rows get None in every output field.
                    values.append(None)
                elif field == "dcid":
                    values.append(res.entity.dcid)
                elif field == "canonical_name":
                    values.append(res.entity.canonical_name)
                else:
                    # Code system (iso3, m49, etc.)
                    values.append(res.entity.codes.get(field))

            result_df[col_name] = values

        # Add confidence column
        if include_confidence:
            conf_col = f"{output_prefix}confidence"
            result_df[conf_col] = [r.confidence for r in resolutions]

        return result_df
@@ -0,0 +1,173 @@
1
+ # Builders Module
2
+
3
+ ## Purpose
4
+
5
+ The builders module contains tools for building data packs from source data, including ETL pipelines, validation, and packaging.
6
+
7
+ ## Components
8
+
9
+ ### Core Builders
10
+
11
+ 1. **Pack Builder** (`pack_builder.py`)
12
+ - Orchestrates full data pack build process
13
+ - Runs extract, transform, validate, enrich, package stages
14
+ - Generates manifests and checksums
15
+
16
+ 2. **Schema Builder** (`schema_builder.py`)
17
+ - Creates SQLite databases with proper schema
18
+ - Builds FTS5 indexes
19
+ - Applies optimizations (PRAGMA, indexes)
20
+
21
+ 3. **Calibration Trainer** (`calibration_trainer.py`)
22
+ - Trains calibration models on labeled data
23
+ - Evaluates model quality (ECE, Brier score)
24
+ - Exports models to JSON
25
+
26
+ 4. **Sidecar Builder** (`sidecar_builder.py`)
27
+ - Builds ambiguity sidecar (HNSW index)
28
+ - Curates ambiguous aliases
29
+ - Generates embeddings and quantizes them (Phase E)
30
+
31
+ ### ETL Pipeline
32
+
33
+ - `extractors.py`: Extract data from sources (APIs, dumps, files)
34
+ - `transformers.py`: Normalize, align, resolve conflicts
35
+ - `validators.py`: Referential integrity, format validation
36
+ - `enrichers.py`: Derive hierarchies, compute statistics
37
+ - `packagers.py`: Create archives, generate metadata
38
+
39
+ ### Data Sources
40
+
41
+ - `sources/`: Source-specific extractors
42
+ - `data_commons.py`: Data Commons entity extraction
43
+ - `iso.py`: ISO 3166 codes
44
+ - `wikidata.py`: Wikidata aliases and temporal data
45
+ - `geonames.py`: GeoNames data
46
+ - `custom.py`: Custom data ingestion
47
+
48
+ ### Quality Assurance
49
+
50
+ - `quality_checks.py`: Data quality validation
51
+ - `coverage_metrics.py`: Coverage analysis
52
+ - `conflict_resolver.py`: Handle source conflicts
53
+ - `deduplicator.py`: Entity deduplication
54
+
55
+ ## Build Pipeline Stages
56
+
57
+ ### 1. Extract
58
+
59
+ ```python
60
+ # Extract from multiple sources
61
+ entities_dc = extract_data_commons()
62
+ entities_iso = extract_iso_codes()
63
+ entities_wd = extract_wikidata()
64
+ ```
65
+
66
+ ### 2. Transform
67
+
68
+ ```python
69
+ # Normalize and align entities
70
+ entities = align_entities(entities_dc, entities_iso, entities_wd)
71
+ entities = normalize_text(entities)
72
+ entities = resolve_conflicts(entities, precedence_rules)
73
+ ```
74
+
75
+ ### 3. Validate
76
+
77
+ ```python
78
+ # Validate data quality
79
+ check_referential_integrity(entities)
80
+ check_code_formats(entities)
81
+ check_temporal_consistency(entities)
82
+ flag_suspicious_data(entities)
83
+ ```
84
+
85
+ ### 4. Enrich
86
+
87
+ ```python
88
+ # Compute derived data
89
+ compute_hierarchy_paths(entities)
90
+ build_fts_indexes(entities)
91
+ generate_statistics(entities)
92
+ ```
93
+
94
+ ### 5. Package
95
+
96
+ ```python
97
+ # Create data pack
98
+ export_to_sqlite(entities, "base.sqlite")
99
+ build_manifest(pack_info)
100
+ compute_checksums(files)
101
+ create_archive("resolvekit-data-1.2.0.tar.gz")
102
+ ```
103
+
104
+ ## CLI for Building
105
+
106
+ ```bash
107
+ # Build full data pack
108
+ resolvekit-build pack \
109
+ --sources wikidata,iso,datacommons \
110
+ --output packs/1.2.0/ \
111
+ --validate
112
+
113
+ # Build overlay from CSV
114
+ resolvekit-build overlay \
115
+ --input custom_aliases.csv \
116
+ --name my-custom-pack \
117
+ --output overlays/
118
+
119
+ # Train calibration model
120
+ resolvekit-build calibration \
121
+ --training-data labeled_pairs.csv \
122
+ --output calibration.json
123
+
124
+ # Build ambiguity sidecar
125
+ resolvekit-build sidecar \
126
+ --registry ambiguity_registry.csv \
127
+ --base base.sqlite \
128
+ --output ambiguity.hnsw
129
+ ```
130
+
131
+ ## Manifest Generation
132
+
133
+ ```python
134
+ manifest = {
135
+ "pack_name": "resolvekit-core-countries",
136
+ "version": "1.2.0",
137
+ "build_time": datetime.now(UTC).isoformat(),
138
+ "schema_hash": compute_schema_hash(),
139
+ "sources": [
140
+ {"name": "DataCommons", "snapshot": "2025-09-15"},
141
+ {"name": "ISO3166", "snapshot": "2025-09-01"}
142
+ ],
143
+ "components": {
144
+ "base_sqlite": "base.sqlite",
145
+ "fts_built": True,
146
+ "ambiguity_registry": True,
147
+ "calibration": "calibration.json"
148
+ },
149
+ "checksums": {
150
+ "base.sqlite": compute_sha256("base.sqlite"),
151
+ "calibration.json": compute_sha256("calibration.json")
152
+ },
153
+ "license": "CC-BY 4.0 (data), Apache-2.0 (code)",
154
+ "compat": {
155
+ "resolver_min": "0.9.0",
156
+ "resolver_max": "<2.0.0"
157
+ }
158
+ }
159
+ ```
160
+
161
+ ## Design Principles
162
+
163
+ 1. **Automated**: CI/CD pipeline for regular builds
164
+ 2. **Reproducible**: Same sources → same output
165
+ 3. **Validated**: Quality gates prevent bad data
166
+ 4. **Versioned**: Semantic versioning for packs
167
+
168
+ ## Implementation Priority
169
+
170
+ - **Phase A** - Basic schema builder
171
+ - **Phase C** - Overlay builders
172
+ - **Phase D** - Full pack building and distribution
173
+ - **Phase E** - Sidecar builder
File without changes