resolvekit 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. resolvekit/README.md +134 -0
  2. resolvekit/__init__.py +67 -0
  3. resolvekit/api/README.md +165 -0
  4. resolvekit/api/__init__.py +10 -0
  5. resolvekit/api/convenience.py +53 -0
  6. resolvekit/api/resolver.py +457 -0
  7. resolvekit/builders/README.md +173 -0
  8. resolvekit/builders/__init__.py +0 -0
  9. resolvekit/calibration/README.md +351 -0
  10. resolvekit/calibration/__init__.py +12 -0
  11. resolvekit/calibration/calibrator.py +184 -0
  12. resolvekit/calibration/features.py +139 -0
  13. resolvekit/calibration/models.py +78 -0
  14. resolvekit/cli/README.md +215 -0
  15. resolvekit/cli/__init__.py +0 -0
  16. resolvekit/cli/main.py +18 -0
  17. resolvekit/config.py +128 -0
  18. resolvekit/constants.py +252 -0
  19. resolvekit/constraints/README.md +102 -0
  20. resolvekit/constraints/__init__.py +17 -0
  21. resolvekit/constraints/constraint_engine.py +111 -0
  22. resolvekit/constraints/hierarchy_validator.py +148 -0
  23. resolvekit/constraints/membership_validator.py +60 -0
  24. resolvekit/constraints/protocols.py +33 -0
  25. resolvekit/constraints/temporal_validator.py +43 -0
  26. resolvekit/constraints/type_validator.py +42 -0
  27. resolvekit/data/README.md +165 -0
  28. resolvekit/data/__init__.py +14 -0
  29. resolvekit/data/alias_repository.py +206 -0
  30. resolvekit/data/code_repository.py +85 -0
  31. resolvekit/data/context_filters.py +49 -0
  32. resolvekit/data/db_manager.py +196 -0
  33. resolvekit/data/entity_repository.py +466 -0
  34. resolvekit/data/membership_repository.py +107 -0
  35. resolvekit/data/query_builder.py +177 -0
  36. resolvekit/data/schema.py +122 -0
  37. resolvekit/disambiguation/README.md +72 -0
  38. resolvekit/disambiguation/__init__.py +0 -0
  39. resolvekit/extraction/README.md +204 -0
  40. resolvekit/extraction/__init__.py +0 -0
  41. resolvekit/matchers/README.md +77 -0
  42. resolvekit/matchers/__init__.py +65 -0
  43. resolvekit/matchers/alias_exact.py +65 -0
  44. resolvekit/matchers/canonical_name.py +62 -0
  45. resolvekit/matchers/cascade.py +127 -0
  46. resolvekit/matchers/code_validators.py +250 -0
  47. resolvekit/matchers/exact_code.py +177 -0
  48. resolvekit/matchers/fts_matcher.py +106 -0
  49. resolvekit/matchers/fuzzy_matcher.py +142 -0
  50. resolvekit/matchers/priorities.py +174 -0
  51. resolvekit/matchers/protocols.py +75 -0
  52. resolvekit/normalization/README.md +192 -0
  53. resolvekit/normalization/__init__.py +8 -0
  54. resolvekit/normalization/normalizer.py +164 -0
  55. resolvekit/overlays/README.md +226 -0
  56. resolvekit/overlays/__init__.py +0 -0
  57. resolvekit/types.py +534 -0
  58. resolvekit/utils/README.md +188 -0
  59. resolvekit/utils/__init__.py +48 -0
  60. resolvekit/utils/cache.py +109 -0
  61. resolvekit/utils/dates.py +339 -0
  62. resolvekit/utils/errors.py +145 -0
  63. resolvekit/utils/files.py +366 -0
  64. resolvekit/utils/logging.py +219 -0
  65. resolvekit/utils/text.py +475 -0
  66. resolvekit/utils/validation.py +301 -0
  67. resolvekit-0.0.1.dist-info/METADATA +36 -0
  68. resolvekit-0.0.1.dist-info/RECORD +70 -0
  69. resolvekit-0.0.1.dist-info/WHEEL +4 -0
  70. resolvekit-0.0.1.dist-info/entry_points.txt +3 -0
resolvekit/types.py ADDED
@@ -0,0 +1,534 @@
1
+ """Type definitions for resolvekit."""
2
+
3
+ from datetime import date
4
+ from enum import StrEnum
5
+ from typing import Any
6
+
7
+ from pydantic import BaseModel, ConfigDict, Field, field_serializer, field_validator
8
+ from sqlalchemy import Column
9
+ from sqlalchemy import String as SQLAString
10
+ from sqlmodel import Field as SQLModelField
11
+ from sqlmodel import SQLModel
12
+
13
+ # ==============================================================================
14
+ # Enums (for type safety and SQLModel compatibility)
15
+ # ==============================================================================
16
+
17
+
18
+ class EntityType(StrEnum):
19
+ """Entity type classifications."""
20
+
21
+ COUNTRY = "country"
22
+ ADMIN1 = "admin1"
23
+ ADMIN2 = "admin2"
24
+ ADMIN3 = "admin3"
25
+ ADMIN4 = "admin4"
26
+ CITY = "city"
27
+ ORGANIZATION = "organization"
28
+ GROUP = "group"
29
+ REGION = "region"
30
+ OTHER = "other"
31
+
32
+
33
+ class AliasType(StrEnum):
34
+ """Alias type classifications."""
35
+
36
+ CANONICAL = "canonical" # Official canonical name
37
+ ENDONYM = "endonym" # Native name in local language
38
+ EXONYM = "exonym" # Foreign name
39
+ ABBR = "abbr" # Abbreviation
40
+ CODE = "code" # Code identifier
41
+ HISTORICAL = "historical" # Historical name (no longer current)
42
+ COLLOQUIAL = "colloquial" # Informal/colloquial name
43
+
44
+
45
+ class CodeSystem(StrEnum):
46
+ """Code system identifiers."""
47
+
48
+ DCID = "dcid" # Data Commons ID (canonical)
49
+ ISO2 = "iso2" # ISO 3166-1 alpha-2
50
+ ISO3 = "iso3" # ISO 3166-1 alpha-3
51
+ ISO_NUMERIC = "iso_numeric" # ISO 3166-1 numeric
52
+ ISO3166_2 = "iso3166_2" # ISO 3166-2 subdivision codes
53
+ M49 = "m49" # UN M49 codes
54
+ NUTS = "nuts" # EU NUTS codes
55
+ LAU = "lau" # EU LAU codes
56
+ PCODE = "pcode" # OCHA P-codes
57
+ WIKIDATA = "wikidata" # Wikidata QID
58
+ GEONAMES = "geonames" # GeoNames ID
59
+ WB = "wb" # World Bank codes
60
+ DAC = "dac" # OECD DAC codes
61
+ FIPS = "fips" # FIPS codes (deprecated but still used)
62
+
63
+
64
+ class MatcherType(StrEnum):
65
+ """Matcher type classifications."""
66
+
67
+ EXACT_CODE = "exact_code"
68
+ CANONICAL_NAME = "canonical_name"
69
+ ALIAS_EXACT = "alias_exact"
70
+ FTS = "fts"
71
+ FUZZY = "fuzzy"
72
+ SEMANTIC = "semantic"
73
+
74
+
75
+ class OutputFormat(StrEnum):
76
+ """Output format options."""
77
+
78
+ HUMAN = "human"
79
+ JSON = "json"
80
+ CSV = "csv"
81
+ TABLE = "table"
82
+
83
+
84
+ class ExplanationMode(StrEnum):
85
+ """Explanation detail level."""
86
+
87
+ MINIMAL = "minimal" # Just confidence scores, no explanation object
88
+ STANDARD = "standard" # Matched features + top alternatives
89
+ FULL = "full" # Complete debug trace with all stages
90
+
91
+
92
class EntityRow(SQLModel, table=True):
    """Row of the ``entities`` table: stored columns only, no computed fields."""

    __tablename__ = "entities"

    # Have the model work with enum values (not enum names).
    model_config = {"use_enum_values": True}

    dcid: str = SQLModelField(
        primary_key=True,
        min_length=1,
        description="Data Commons ID",
    )
    canonical_name: str = SQLModelField(
        min_length=1,
        description="Canonical entity name",
    )
    normalized_canonical: str = SQLModelField(
        min_length=1,
        index=True,
        description="Normalized canonical name for efficient lookups",
    )
    # Backed by a plain string column rather than a database enum type.
    entity_type: EntityType = SQLModelField(sa_column=Column(SQLAString))
    # Self-referencing foreign key into this same table.
    parent_dcid: str | None = SQLModelField(
        default=None,
        foreign_key="entities.dcid",
        description="Parent entity DCID",
    )
    centroid_lat: float | None = SQLModelField(
        default=None,
        ge=-90,
        le=90,
        description="Latitude",
    )
    centroid_lon: float | None = SQLModelField(
        default=None,
        ge=-180,
        le=180,
        description="Longitude",
    )
    valid_from: date | None = None
    valid_until: date | None = None

    @field_validator("valid_from", "valid_until", mode="before")
    @classmethod
    def parse_date(cls, v: Any) -> Any:
        """Convert an ISO-format string into a ``date``.

        Runs before field validation. Strings are parsed with
        ``datetime.fromisoformat`` and reduced to their date part; any
        other input (``None``, ``date`` objects, anything else) is
        handed back untouched.
        """
        if isinstance(v, str):
            from datetime import datetime

            parsed = datetime.fromisoformat(v)
            return parsed.date()
        return v

    @field_serializer("entity_type", when_used="always")
    def serialize_entity_type(self, value: EntityType) -> str:
        """Emit the enum's string value; non-enum input passes through as is."""
        if isinstance(value, EntityType):
            return value.value
        return value
142
+
143
+
144
class Entity(BaseModel):
    """A resolved entity together with its codes and provenance."""

    model_config = ConfigDict(frozen=False)

    dcid: str = Field(..., min_length=1, description="Data Commons ID")
    canonical_name: str = Field(..., min_length=1, description="Canonical entity name")
    entity_type: EntityType
    codes: dict[str, str] = Field(default_factory=dict, description="Code mappings")
    parent_dcid: str | None = Field(default=None, description="Parent entity DCID")
    centroid_lat: float | None = Field(
        default=None, ge=-90, le=90, description="Latitude"
    )
    centroid_lon: float | None = Field(
        default=None, ge=-180, le=180, description="Longitude"
    )
    valid_from: date | None = Field(default=None, description="Validity start date")
    valid_until: date | None = Field(default=None, description="Validity end date")
    provenance: dict[str, str] = Field(
        default_factory=dict, description="Data source provenance"
    )

    @field_validator("valid_from", "valid_until", mode="before")
    @classmethod
    def parse_date(cls, v: Any) -> Any:
        """Convert an ISO-format string into a ``date``.

        Runs before field validation. Only ``str`` input is converted
        (via ``datetime.fromisoformat(...).date()``); every other value
        is returned unchanged.
        """
        if isinstance(v, str):
            from datetime import datetime

            parsed = datetime.fromisoformat(v)
            return parsed.date()
        return v

    def __repr__(self) -> str:
        """Short debug form showing the DCID and canonical name."""
        return f"Entity(dcid='{self.dcid}', name='{self.canonical_name}')"

    def __str__(self) -> str:
        """Multi-line summary: name, DCID, type, up to five codes, parent."""
        out = [
            f"{self.canonical_name}",
            f" DCID: {self.dcid}",
            f" Type: {self.entity_type.value}",
        ]

        if self.codes:
            out.append(" Codes:")
            # Only the first five (system, code) pairs in sorted order are shown.
            shown = sorted(self.codes.items())[:5]
            out.extend(f" {system}: {code}" for system, code in shown)

        if self.parent_dcid:
            out.append(f" Parent: {self.parent_dcid}")

        return "\n".join(out)
195
+
196
+
197
class AliasRow(SQLModel, table=True):
    """Row of the ``aliases`` table."""

    __tablename__ = "aliases"

    # Have the model work with enum values (not enum names).
    model_config = {"use_enum_values": True}

    # The key defaults to None, so the annotation must admit None; it is
    # presumably filled in by the database on insert. The column itself is
    # still a primary key.
    alias_id: int | None = SQLModelField(
        default=None, primary_key=True, ge=0, description="Unique alias ID"
    )
    entity_dcid: str = SQLModelField(
        foreign_key="entities.dcid", min_length=1, description="Associated entity DCID"
    )
    alias_text: str = SQLModelField(min_length=1, description="Original alias text")
    alias_norm: str = SQLModelField(
        min_length=1, index=True, description="Normalized alias text"
    )
    language: str | None = SQLModelField(
        default=None, max_length=2, description="ISO 639-1 language code"
    )
    # Backed by a plain string column rather than a database enum type.
    alias_type: AliasType = SQLModelField(
        default=AliasType.EXONYM, sa_column=Column(SQLAString)
    )
    valid_from: date | None = None
    valid_until: date | None = None
    source: str | None = None
    alias_uid: str = SQLModelField(
        min_length=1, unique=True, description="Unique alias identifier"
    )

    @field_validator("valid_from", "valid_until", mode="before")
    @classmethod
    def parse_date(cls, v: Any) -> Any:
        """Convert an ISO-format string into a ``date``.

        Runs before field validation. ``None`` and ``date`` instances are
        returned as they are; strings are parsed with
        ``datetime.fromisoformat`` and reduced to their date part; any
        other value is passed through for the normal validation to judge.
        """
        if v is None or isinstance(v, date):
            return v
        if isinstance(v, str):
            from datetime import datetime

            return datetime.fromisoformat(v).date()
        return v

    @field_serializer("alias_type", when_used="always")
    def serialize_alias_type(self, value: AliasType) -> str:
        """Emit the enum's string value; non-enum input passes through as is."""
        return value.value if isinstance(value, AliasType) else value
244
+
245
+
246
class Alias(BaseModel):
    """An entity alias as used by business logic (not the database row)."""

    model_config = ConfigDict(frozen=False)

    alias_id: int = Field(..., ge=0)
    entity_dcid: str = Field(..., min_length=1)
    alias_text: str = Field(..., min_length=1)
    alias_norm: str = Field(..., min_length=1)
    # Two lowercase letters when present.
    language: str | None = Field(None, pattern=r"^[a-z]{2}$")
    # The default must be the enum member, not the bare string "exonym":
    # pydantic does not validate defaults unless validate_default is
    # enabled, so a string default would leave a plain ``str`` here and
    # break ``.value`` / ``isinstance(..., AliasType)`` on defaulted
    # instances.
    alias_type: AliasType = AliasType.EXONYM
    valid_from: date | None = None
    valid_until: date | None = None
    source: str | None = None
260
+
261
+
262
class MatchContext(BaseModel):
    """Optional filters that narrow matching; every field defaults to ``None``."""

    entity_type: EntityType | None = None
    parent_dcid: str | None = None
    as_of: date | None = None
    # Restricts matching by group membership.
    group_dcid: str | None = None
269
+
270
+
271
class Candidate(BaseModel):
    """
    One possible resolution, carrying the features used for calibration.

    Attributes:
        entity: The entity that was matched.
        score: Raw score from the matcher, within 0.0-1.0.
        matcher_type: The matcher that produced this candidate.
        features: Dictionary of features fed to calibration.
        matched_alias: Alias text that matched, when applicable.

    Feature keys commonly present:
        - exact_code: bool - matched through a code lookup
        - canonical_exact: bool - exact match on the canonical name
        - alias_exact: bool - exact match on an alias
        - fts_rank: int | None - rank within the FTS results
        - fts_score: float | None - BM25 score from FTS
        - edit_similarity: float | None - edit distance (0-1)
        - trigram_jaccard: float | None - Jaccard similarity over trigrams
        - fuzzy_score: float | None - combined fuzzy score
        - code_system: str | None - code system that matched
        - matched_alias: str | None - alias that matched
    """

    model_config = ConfigDict(frozen=False, arbitrary_types_allowed=True)

    entity: Entity
    score: float = Field(..., ge=0.0, le=1.0, description="Raw matcher score (0.0-1.0)")
    matcher_type: MatcherType
    features: dict[str, Any] = Field(
        default_factory=dict,
        description="Feature dictionary for calibration",
    )
    matched_alias: str | None = Field(
        default=None,
        description="The specific alias text that matched",
    )

    @property
    def dcid(self) -> str:
        """DCID of the wrapped entity."""
        matched = self.entity
        return matched.dcid
311
+
312
+
313
class Explanation(BaseModel):
    """Record of how a resolution was reached."""

    model_config = ConfigDict(frozen=False)

    # Fields for STANDARD mode.
    matcher_used: MatcherType | None = None
    features: dict[str, Any] = Field(default_factory=dict)

    # Remaining detail fields; each starts out empty.
    stages: list[str] = Field(default_factory=list)
    candidates: list[Candidate] = Field(default_factory=list)
    rules_applied: list[str] = Field(default_factory=list)
    calibration: dict[str, float] = Field(default_factory=dict)
    trace: dict = Field(default_factory=dict)

    def __repr__(self) -> str:
        """Short debug form: the matcher and the number of features."""
        feature_count = len(self.features)
        return f"Explanation(matcher={self.matcher_used}, features={feature_count})"

    def __str__(self) -> str:
        """Render a scorecard: matcher, up to five features, up to five
        candidates, and a one-line trace summary."""
        report = ["Resolution Scorecard:", "=" * 50]

        if self.matcher_used:
            report.append(f"Matcher: {self.matcher_used.value}")

        if self.features:
            report.append("\nKey Features:")
            # Only the first five features, in insertion order, are shown.
            for key, value in list(self.features.items())[:5]:
                if isinstance(value, bool):
                    mark = "✓" if value else "✗"
                    rendered = f" {key}: {mark}"
                elif isinstance(value, float):
                    rendered = f" {key}: {value:.3f}"
                else:
                    rendered = f" {key}: {value}"
                report.append(rendered)

        if self.candidates:
            # The header reports the full count; the list is capped at five.
            report.append(f"\nAlternatives ({len(self.candidates)}):")
            report.extend(
                f" {i}. {cand.entity.canonical_name} (score: {cand.score:.3f})"
                for i, cand in enumerate(self.candidates[:5], 1)
            )

        # FULL mode: summarize the trace.
        if self.trace:
            report.append(f"\nTrace: {len(self.trace)} stages recorded")

        return "\n".join(report)
366
+
367
+
368
class Resolution(BaseModel):
    """Outcome of resolving an input to an entity."""

    model_config = ConfigDict(frozen=False)

    entity: Entity | None = None
    confidence: float = Field(..., ge=0.0, le=1.0, description="Calibrated confidence")
    alternatives: list[Entity] = Field(default_factory=list)
    explanation: Explanation | None = None
    conflicts: list[dict] | None = None

    @property
    def success(self) -> bool:
        """``True`` when an entity was resolved, ``False`` when ``entity`` is ``None``."""
        return self.entity is not None

    def __repr__(self) -> str:
        """Short debug form: entity DCID (or None), confidence, alternative count."""
        resolved_id = self.entity.dcid if self.entity else None
        return (
            f"Resolution(entity={resolved_id}, "
            f"confidence={self.confidence:.3f}, "
            f"alternatives={len(self.alternatives)})"
        )

    def __str__(self) -> str:
        """Multi-line summary, or a one-line "no match" message on failure."""
        if not self.success:
            return f"No match found (confidence: {self.confidence:.3f})"

        out = [
            f"Resolved to: {self.entity.canonical_name}",
            f" DCID: {self.entity.dcid}",
            f" Confidence: {self.confidence:.3f}",
        ]

        if self.alternatives:
            # The count is for all alternatives; only the first three are listed.
            out.append(f" Alternatives: {len(self.alternatives)}")
            out.extend(
                f" {i}. {alt.canonical_name} ({alt.dcid})"
                for i, alt in enumerate(self.alternatives[:3], 1)
            )

        if self.explanation and self.explanation.matcher_used:
            out.append(f" Matched via: {self.explanation.matcher_used.value}")

        return "\n".join(out)
412
+
413
+
414
class ExtractedEntity(BaseModel):
    """An entity mention pulled out of a piece of text."""

    model_config = ConfigDict(frozen=False)

    text: str = Field(..., min_length=1)
    span: tuple[int, int]
    dcid: str = Field(..., min_length=1)
    canonical_name: str = Field(..., min_length=1)
    entity_type: EntityType
    confidence: float = Field(..., ge=0.0, le=1.0)
    context: str | None = None
    method: str | None = None

    @field_validator("span")
    @classmethod
    def validate_span(cls, v):
        """Check that ``span`` is a ``(start, end)`` pair with start <= end.

        An empty span (start == end) is accepted.

        Raises:
            ValueError: If ``v`` does not have exactly two items, or if
                the start is greater than the end.
        """
        if len(v) != 2:
            raise ValueError("span must be a tuple of (start, end)")
        start, end = v
        if start > end:
            raise ValueError("span start must be <= end")
        return v
437
+
438
+
439
class CodeRow(SQLModel, table=True):
    """Row of the ``codes`` table: one code mapping for an entity.

    ``entity_dcid`` and ``code_system`` are both declared as primary-key
    columns.
    """

    __tablename__ = "codes"

    # Have the model work with enum values (not enum names).
    model_config = {"use_enum_values": True}

    entity_dcid: str = SQLModelField(
        foreign_key="entities.dcid",
        primary_key=True,
        min_length=1,
        description="Associated entity DCID",
    )
    # Backed by a plain string column rather than a database enum type.
    code_system: CodeSystem = SQLModelField(
        sa_column=Column(SQLAString, primary_key=True),
        description="Code system identifier",
    )
    code_value: str = SQLModelField(min_length=1, description="Code value")
    valid_from: date | None = None
    valid_until: date | None = None
    source: str | None = None

    @field_validator("valid_from", "valid_until", mode="before")
    @classmethod
    def parse_date(cls, v: Any) -> Any:
        """Convert an ISO-format string into a ``date``.

        Runs before field validation. Only ``str`` input is converted
        (via ``datetime.fromisoformat(...).date()``); every other value
        is returned unchanged.
        """
        if isinstance(v, str):
            from datetime import datetime

            parsed = datetime.fromisoformat(v)
            return parsed.date()
        return v

    @field_serializer("code_system", when_used="always")
    def serialize_code_system(self, value: CodeSystem) -> str:
        """Emit the enum's string value; non-enum input passes through as is."""
        if isinstance(value, CodeSystem):
            return value.value
        return value
478
+
479
+
480
class MembershipRow(SQLModel, table=True):
    """Row of the ``memberships`` table: an entity's membership in a group."""

    __tablename__ = "memberships"

    # The key defaults to None, so the annotation must admit None; it is
    # presumably filled in by the database on insert. The column itself is
    # still a primary key.
    id: int | None = SQLModelField(
        default=None, primary_key=True, ge=0, description="Unique membership ID"
    )
    # Both ends of the membership reference rows of the entities table.
    entity_dcid: str = SQLModelField(
        foreign_key="entities.dcid", min_length=1, description="Member entity DCID"
    )
    group_dcid: str = SQLModelField(
        foreign_key="entities.dcid", min_length=1, description="Group entity DCID"
    )
    valid_from: date
    valid_until: date | None = None
    source: str | None = None

    @field_validator("valid_from", "valid_until", mode="before")
    @classmethod
    def parse_date(cls, v: Any) -> Any:
        """Convert an ISO-format string into a ``date``.

        Runs before field validation. ``None`` and ``date`` instances are
        returned as they are; strings are parsed with
        ``datetime.fromisoformat`` and reduced to their date part; any
        other value is passed through for the normal validation to judge.
        """
        if v is None or isinstance(v, date):
            return v
        if isinstance(v, str):
            from datetime import datetime

            return datetime.fromisoformat(v).date()
        return v
509
+
510
+
511
class Membership(BaseModel):
    """Group membership as used by business logic (not the database row)."""

    model_config = ConfigDict(frozen=False)

    entity_dcid: str = Field(..., min_length=1)
    group_dcid: str = Field(..., min_length=1)
    valid_from: date
    valid_until: date | None = None
    source: str | None = None

    @field_validator("valid_until")
    @classmethod
    def validate_dates(cls, v, info):
        """Require ``valid_until`` to fall strictly after ``valid_from``.

        The check is skipped when ``valid_until`` is ``None`` or when
        ``valid_from`` is absent from the already-validated data.

        Raises:
            ValueError: If ``valid_until`` is on or before ``valid_from``.
        """
        if v is None or "valid_from" not in info.data:
            return v
        if v <= info.data["valid_from"]:
            raise ValueError("valid_until must be after valid_from")
        return v

    def is_valid_at(self, at: date) -> bool:
        """Report whether the membership covers the date ``at``.

        The start date is inclusive and the end date exclusive; a missing
        ``valid_until`` leaves the membership open-ended.
        """
        if at < self.valid_from:
            return False
        if self.valid_until and at >= self.valid_until:
            return False
        return True