resolvekit 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- resolvekit/README.md +134 -0
- resolvekit/__init__.py +67 -0
- resolvekit/api/README.md +165 -0
- resolvekit/api/__init__.py +10 -0
- resolvekit/api/convenience.py +53 -0
- resolvekit/api/resolver.py +457 -0
- resolvekit/builders/README.md +173 -0
- resolvekit/builders/__init__.py +0 -0
- resolvekit/calibration/README.md +351 -0
- resolvekit/calibration/__init__.py +12 -0
- resolvekit/calibration/calibrator.py +184 -0
- resolvekit/calibration/features.py +139 -0
- resolvekit/calibration/models.py +78 -0
- resolvekit/cli/README.md +215 -0
- resolvekit/cli/__init__.py +0 -0
- resolvekit/cli/main.py +18 -0
- resolvekit/config.py +128 -0
- resolvekit/constants.py +252 -0
- resolvekit/constraints/README.md +102 -0
- resolvekit/constraints/__init__.py +17 -0
- resolvekit/constraints/constraint_engine.py +111 -0
- resolvekit/constraints/hierarchy_validator.py +148 -0
- resolvekit/constraints/membership_validator.py +60 -0
- resolvekit/constraints/protocols.py +33 -0
- resolvekit/constraints/temporal_validator.py +43 -0
- resolvekit/constraints/type_validator.py +42 -0
- resolvekit/data/README.md +165 -0
- resolvekit/data/__init__.py +14 -0
- resolvekit/data/alias_repository.py +206 -0
- resolvekit/data/code_repository.py +85 -0
- resolvekit/data/context_filters.py +49 -0
- resolvekit/data/db_manager.py +196 -0
- resolvekit/data/entity_repository.py +466 -0
- resolvekit/data/membership_repository.py +107 -0
- resolvekit/data/query_builder.py +177 -0
- resolvekit/data/schema.py +122 -0
- resolvekit/disambiguation/README.md +72 -0
- resolvekit/disambiguation/__init__.py +0 -0
- resolvekit/extraction/README.md +204 -0
- resolvekit/extraction/__init__.py +0 -0
- resolvekit/matchers/README.md +77 -0
- resolvekit/matchers/__init__.py +65 -0
- resolvekit/matchers/alias_exact.py +65 -0
- resolvekit/matchers/canonical_name.py +62 -0
- resolvekit/matchers/cascade.py +127 -0
- resolvekit/matchers/code_validators.py +250 -0
- resolvekit/matchers/exact_code.py +177 -0
- resolvekit/matchers/fts_matcher.py +106 -0
- resolvekit/matchers/fuzzy_matcher.py +142 -0
- resolvekit/matchers/priorities.py +174 -0
- resolvekit/matchers/protocols.py +75 -0
- resolvekit/normalization/README.md +192 -0
- resolvekit/normalization/__init__.py +8 -0
- resolvekit/normalization/normalizer.py +164 -0
- resolvekit/overlays/README.md +226 -0
- resolvekit/overlays/__init__.py +0 -0
- resolvekit/types.py +534 -0
- resolvekit/utils/README.md +188 -0
- resolvekit/utils/__init__.py +48 -0
- resolvekit/utils/cache.py +109 -0
- resolvekit/utils/dates.py +339 -0
- resolvekit/utils/errors.py +145 -0
- resolvekit/utils/files.py +366 -0
- resolvekit/utils/logging.py +219 -0
- resolvekit/utils/text.py +475 -0
- resolvekit/utils/validation.py +301 -0
- resolvekit-0.0.1.dist-info/METADATA +36 -0
- resolvekit-0.0.1.dist-info/RECORD +70 -0
- resolvekit-0.0.1.dist-info/WHEEL +4 -0
- resolvekit-0.0.1.dist-info/entry_points.txt +3 -0
resolvekit/types.py
ADDED
|
@@ -0,0 +1,534 @@
|
|
|
1
|
+
"""Type definitions for resolvekit."""
|
|
2
|
+
|
|
3
|
+
from datetime import date
|
|
4
|
+
from enum import StrEnum
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from pydantic import BaseModel, ConfigDict, Field, field_serializer, field_validator
|
|
8
|
+
from sqlalchemy import Column
|
|
9
|
+
from sqlalchemy import String as SQLAString
|
|
10
|
+
from sqlmodel import Field as SQLModelField
|
|
11
|
+
from sqlmodel import SQLModel
|
|
12
|
+
|
|
13
|
+
# ==============================================================================
|
|
14
|
+
# Enums (for type safety and SQLModel compatibility)
|
|
15
|
+
# ==============================================================================
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class EntityType(StrEnum):
|
|
19
|
+
"""Entity type classifications."""
|
|
20
|
+
|
|
21
|
+
COUNTRY = "country"
|
|
22
|
+
ADMIN1 = "admin1"
|
|
23
|
+
ADMIN2 = "admin2"
|
|
24
|
+
ADMIN3 = "admin3"
|
|
25
|
+
ADMIN4 = "admin4"
|
|
26
|
+
CITY = "city"
|
|
27
|
+
ORGANIZATION = "organization"
|
|
28
|
+
GROUP = "group"
|
|
29
|
+
REGION = "region"
|
|
30
|
+
OTHER = "other"
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class AliasType(StrEnum):
|
|
34
|
+
"""Alias type classifications."""
|
|
35
|
+
|
|
36
|
+
CANONICAL = "canonical" # Official canonical name
|
|
37
|
+
ENDONYM = "endonym" # Native name in local language
|
|
38
|
+
EXONYM = "exonym" # Foreign name
|
|
39
|
+
ABBR = "abbr" # Abbreviation
|
|
40
|
+
CODE = "code" # Code identifier
|
|
41
|
+
HISTORICAL = "historical" # Historical name (no longer current)
|
|
42
|
+
COLLOQUIAL = "colloquial" # Informal/colloquial name
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class CodeSystem(StrEnum):
|
|
46
|
+
"""Code system identifiers."""
|
|
47
|
+
|
|
48
|
+
DCID = "dcid" # Data Commons ID (canonical)
|
|
49
|
+
ISO2 = "iso2" # ISO 3166-1 alpha-2
|
|
50
|
+
ISO3 = "iso3" # ISO 3166-1 alpha-3
|
|
51
|
+
ISO_NUMERIC = "iso_numeric" # ISO 3166-1 numeric
|
|
52
|
+
ISO3166_2 = "iso3166_2" # ISO 3166-2 subdivision codes
|
|
53
|
+
M49 = "m49" # UN M49 codes
|
|
54
|
+
NUTS = "nuts" # EU NUTS codes
|
|
55
|
+
LAU = "lau" # EU LAU codes
|
|
56
|
+
PCODE = "pcode" # OCHA P-codes
|
|
57
|
+
WIKIDATA = "wikidata" # Wikidata QID
|
|
58
|
+
GEONAMES = "geonames" # GeoNames ID
|
|
59
|
+
WB = "wb" # World Bank codes
|
|
60
|
+
DAC = "dac" # OECD DAC codes
|
|
61
|
+
FIPS = "fips" # FIPS codes (deprecated but still used)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class MatcherType(StrEnum):
|
|
65
|
+
"""Matcher type classifications."""
|
|
66
|
+
|
|
67
|
+
EXACT_CODE = "exact_code"
|
|
68
|
+
CANONICAL_NAME = "canonical_name"
|
|
69
|
+
ALIAS_EXACT = "alias_exact"
|
|
70
|
+
FTS = "fts"
|
|
71
|
+
FUZZY = "fuzzy"
|
|
72
|
+
SEMANTIC = "semantic"
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
class OutputFormat(StrEnum):
|
|
76
|
+
"""Output format options."""
|
|
77
|
+
|
|
78
|
+
HUMAN = "human"
|
|
79
|
+
JSON = "json"
|
|
80
|
+
CSV = "csv"
|
|
81
|
+
TABLE = "table"
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
class ExplanationMode(StrEnum):
|
|
85
|
+
"""Explanation detail level."""
|
|
86
|
+
|
|
87
|
+
MINIMAL = "minimal" # Just confidence scores, no explanation object
|
|
88
|
+
STANDARD = "standard" # Matched features + top alternatives
|
|
89
|
+
FULL = "full" # Complete debug trace with all stages
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
class EntityRow(SQLModel, table=True):
    """SQLModel table model for the ``entities`` table.

    Carries only the stored columns of an entity (no computed fields).
    """

    __tablename__ = "entities"

    # Keep enum values (not enum names) on the model for database storage.
    model_config = {"use_enum_values": True}

    dcid: str = SQLModelField(
        description="Data Commons ID",
        min_length=1,
        primary_key=True,
    )
    canonical_name: str = SQLModelField(
        description="Canonical entity name",
        min_length=1,
    )
    normalized_canonical: str = SQLModelField(
        description="Normalized canonical name for efficient lookups",
        min_length=1,
        index=True,
    )
    # Backed by an explicit string column.
    entity_type: EntityType = SQLModelField(sa_column=Column(SQLAString))
    parent_dcid: str | None = SQLModelField(
        description="Parent entity DCID",
        default=None,
        foreign_key="entities.dcid",
    )
    centroid_lat: float | None = SQLModelField(
        description="Latitude",
        default=None,
        ge=-90,
        le=90,
    )
    centroid_lon: float | None = SQLModelField(
        description="Longitude",
        default=None,
        ge=-180,
        le=180,
    )
    valid_from: date | None = None
    valid_until: date | None = None

    @field_validator("valid_from", "valid_until", mode="before")
    @classmethod
    def parse_date(cls, v):
        """Turn an ISO-format string into a ``date``; return any other input as-is."""
        if isinstance(v, str):
            from datetime import datetime

            return datetime.fromisoformat(v).date()
        # None, date instances and every other type are handed back untouched.
        return v

    @field_serializer("entity_type", when_used="always")
    def serialize_entity_type(self, value: EntityType) -> str:
        """Emit the plain string value of ``entity_type`` when serializing."""
        if isinstance(value, EntityType):
            return value.value
        # Already a plain value (e.g. a str), so pass it through.
        return value
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
class Entity(BaseModel):
    """A resolved entity: identifier, name, type, codes and optional metadata."""

    model_config = ConfigDict(frozen=False)

    dcid: str = Field(..., description="Data Commons ID", min_length=1)
    canonical_name: str = Field(..., description="Canonical entity name", min_length=1)
    entity_type: EntityType
    codes: dict[str, str] = Field(description="Code mappings", default_factory=dict)
    parent_dcid: str | None = Field(default=None, description="Parent entity DCID")
    centroid_lat: float | None = Field(
        default=None, description="Latitude", ge=-90, le=90
    )
    centroid_lon: float | None = Field(
        default=None, description="Longitude", ge=-180, le=180
    )
    valid_from: date | None = Field(default=None, description="Validity start date")
    valid_until: date | None = Field(default=None, description="Validity end date")
    provenance: dict[str, str] = Field(
        description="Data source provenance", default_factory=dict
    )

    @field_validator("valid_from", "valid_until", mode="before")
    @classmethod
    def parse_date(cls, v):
        """Turn an ISO-format string into a ``date``; return any other input as-is."""
        if isinstance(v, str):
            from datetime import datetime

            return datetime.fromisoformat(v).date()
        # None, date instances and every other type are handed back untouched.
        return v

    def __repr__(self) -> str:
        """Return a compact debug form showing the DCID and canonical name."""
        dcid, name = self.dcid, self.canonical_name
        return f"Entity(dcid='{dcid}', name='{name}')"

    def __str__(self) -> str:
        """Return a multi-line summary: name, DCID, type, up to five codes, parent."""
        summary = [
            f"{self.canonical_name}",
            f" DCID: {self.dcid}",
            f" Type: {self.entity_type.value}",
        ]

        if self.codes:
            summary.append(" Codes:")
            # Codes are listed in sorted order and capped at the first five.
            summary.extend(
                f" {system}: {code}"
                for system, code in sorted(self.codes.items())[:5]
            )

        if self.parent_dcid:
            summary.append(f" Parent: {self.parent_dcid}")

        return "\n".join(summary)
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
class AliasRow(SQLModel, table=True):
    """SQLModel table model for the ``aliases`` table."""

    __tablename__ = "aliases"

    # Keep enum values (not enum names) on the model for database storage.
    model_config = {"use_enum_values": True}

    # Annotated ``int | None`` because the default is None: the value is absent
    # until one is assigned. Being the primary key, the column is unchanged.
    alias_id: int | None = SQLModelField(
        default=None, primary_key=True, ge=0, description="Unique alias ID"
    )
    entity_dcid: str = SQLModelField(
        foreign_key="entities.dcid", min_length=1, description="Associated entity DCID"
    )
    alias_text: str = SQLModelField(min_length=1, description="Original alias text")
    alias_norm: str = SQLModelField(
        min_length=1, index=True, description="Normalized alias text"
    )
    language: str | None = SQLModelField(
        default=None, max_length=2, description="ISO 639-1 language code"
    )
    # Backed by an explicit string column.
    alias_type: AliasType = SQLModelField(
        default=AliasType.EXONYM, sa_column=Column(SQLAString)
    )
    valid_from: date | None = None
    valid_until: date | None = None
    source: str | None = None
    alias_uid: str = SQLModelField(
        min_length=1, unique=True, description="Unique alias identifier"
    )

    @field_validator("valid_from", "valid_until", mode="before")
    @classmethod
    def parse_date(cls, v):
        """Turn an ISO-format string into a ``date``; return any other input as-is."""
        if isinstance(v, str):
            from datetime import datetime

            return datetime.fromisoformat(v).date()
        # None, date instances and every other type are handed back untouched.
        return v

    @field_serializer("alias_type", when_used="always")
    def serialize_alias_type(self, value: AliasType) -> str:
        """Emit the plain string value of ``alias_type`` when serializing."""
        if isinstance(value, AliasType):
            return value.value
        # Already a plain value (e.g. a str), so pass it through.
        return value
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
class Alias(BaseModel):
    """An entity alias (business logic model).

    ``language``, when given, must be exactly two lowercase ASCII letters.
    """

    model_config = ConfigDict(frozen=False)

    alias_id: int = Field(..., ge=0)
    entity_dcid: str = Field(..., min_length=1)
    alias_text: str = Field(..., min_length=1)
    alias_norm: str = Field(..., min_length=1)
    language: str | None = Field(None, pattern=r"^[a-z]{2}$")
    # Default to the enum member rather than the raw string "exonym": pydantic
    # does not validate defaults unless asked to, so a raw string would be kept
    # as a plain ``str`` and ``alias.alias_type.value`` would fail. The member
    # still compares equal to "exonym" because AliasType is a StrEnum.
    alias_type: AliasType = AliasType.EXONYM
    valid_from: date | None = None
    valid_until: date | None = None
    source: str | None = None
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
class MatchContext(BaseModel):
    """Optional filters that narrow matching.

    Every field defaults to ``None``.
    """

    entity_type: EntityType | None = None
    parent_dcid: str | None = None
    as_of: date | None = None
    # Filter by group membership
    group_dcid: str | None = None
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
class Candidate(BaseModel):
    """
    One possible resolution, carrying the features used for calibration.

    Attributes:
        entity: Matched entity
        score: Raw matcher score (0.0-1.0)
        matcher_type: Which matcher produced this candidate
        features: Feature dictionary for calibration
        matched_alias: The specific alias text that matched (if applicable)

    Keys commonly found in ``features``:
        - exact_code (bool): matched via code lookup
        - canonical_exact (bool): exact canonical name match
        - alias_exact (bool): exact alias match
        - fts_rank (int | None): FTS result rank
        - fts_score (float | None): FTS BM25 score
        - edit_similarity (float | None): edit distance (0-1)
        - trigram_jaccard (float | None): trigram Jaccard similarity
        - fuzzy_score (float | None): combined fuzzy score
        - code_system (str | None): which code system matched
        - matched_alias (str | None): which alias matched
    """

    model_config = ConfigDict(frozen=False, arbitrary_types_allowed=True)

    entity: Entity
    score: float = Field(..., description="Raw matcher score (0.0-1.0)", ge=0.0, le=1.0)
    matcher_type: MatcherType
    features: dict[str, Any] = Field(
        description="Feature dictionary for calibration", default_factory=dict
    )
    matched_alias: str | None = Field(
        default=None, description="The specific alias text that matched"
    )

    @property
    def dcid(self) -> str:
        """Shortcut for the DCID of the wrapped entity."""
        matched = self.entity
        return matched.dcid
|
|
311
|
+
|
|
312
|
+
|
|
313
|
+
class Explanation(BaseModel):
    """Record of how a resolution was reached, printable as a scorecard."""

    model_config = ConfigDict(frozen=False)

    # STANDARD mode fields
    matcher_used: MatcherType | None = None
    features: dict[str, Any] = Field(default_factory=dict)

    # Remaining fields; each starts out empty.
    stages: list[str] = Field(default_factory=list)
    candidates: list[Candidate] = Field(default_factory=list)
    rules_applied: list[str] = Field(default_factory=list)
    calibration: dict[str, float] = Field(default_factory=dict)
    trace: dict = Field(default_factory=dict)

    def __repr__(self) -> str:
        """Return a compact debug form: the matcher and the number of features."""
        feature_count = len(self.features)
        return f"Explanation(matcher={self.matcher_used}, features={feature_count})"

    def __str__(self) -> str:
        """Return the scorecard text.

        Shows the matcher (if set), up to five features, up to five
        candidates, and a one-line trace summary when a trace is present.
        """
        report = ["Resolution Scorecard:", "=" * 50]

        if self.matcher_used:
            report.append(f"Matcher: {self.matcher_used.value}")

        # Show key features
        if self.features:
            report.append("\nKey Features:")
            for name, val in list(self.features.items())[:5]:
                # bool is tested first because bool is a subclass of int.
                if isinstance(val, bool):
                    shown = "✓" if val else "✗"
                elif isinstance(val, float):
                    shown = f"{val:.3f}"
                else:
                    shown = f"{val}"
                report.append(f" {name}: {shown}")

        # Show alternatives (candidates)
        if self.candidates:
            report.append(f"\nAlternatives ({len(self.candidates)}):")
            report.extend(
                f" {rank}. {option.entity.canonical_name} (score: {option.score:.3f})"
                for rank, option in enumerate(self.candidates[:5], 1)
            )

        # FULL mode: show trace summary
        if self.trace:
            report.append(f"\nTrace: {len(self.trace)} stages recorded")

        return "\n".join(report)
|
|
366
|
+
|
|
367
|
+
|
|
368
|
+
class Resolution(BaseModel):
    """Outcome of resolving an input: the chosen entity (if any) plus context."""

    model_config = ConfigDict(frozen=False)

    entity: Entity | None = None
    confidence: float = Field(..., description="Calibrated confidence", ge=0.0, le=1.0)
    alternatives: list[Entity] = Field(default_factory=list)
    explanation: Explanation | None = None
    conflicts: list[dict] | None = None

    @property
    def success(self) -> bool:
        """True when an entity was resolved, i.e. ``entity`` is not None."""
        return self.entity is not None

    def __repr__(self) -> str:
        """Return a compact debug form: entity DCID, confidence, alternative count."""
        resolved = self.entity.dcid if self.entity else None
        return (
            f"Resolution(entity={resolved}, "
            f"confidence={self.confidence:.3f}, "
            f"alternatives={len(self.alternatives)})"
        )

    def __str__(self) -> str:
        """Return a readable summary, or a 'no match' line when nothing resolved."""
        if not self.success:
            return f"No match found (confidence: {self.confidence:.3f})"

        out = [
            f"Resolved to: {self.entity.canonical_name}",
            f" DCID: {self.entity.dcid}",
            f" Confidence: {self.confidence:.3f}",
        ]

        if self.alternatives:
            out.append(f" Alternatives: {len(self.alternatives)}")
            # Only the first three alternatives are listed individually.
            out.extend(
                f" {rank}. {other.canonical_name} ({other.dcid})"
                for rank, other in enumerate(self.alternatives[:3], 1)
            )

        explanation = self.explanation
        if explanation and explanation.matcher_used:
            out.append(f" Matched via: {explanation.matcher_used.value}")

        return "\n".join(out)
|
|
412
|
+
|
|
413
|
+
|
|
414
|
+
class ExtractedEntity(BaseModel):
    """An entity found in a piece of text, with its span and confidence."""

    model_config = ConfigDict(frozen=False)

    text: str = Field(..., min_length=1)
    span: tuple[int, int]
    dcid: str = Field(..., min_length=1)
    canonical_name: str = Field(..., min_length=1)
    entity_type: EntityType
    confidence: float = Field(..., ge=0.0, le=1.0)
    context: str | None = None
    method: str | None = None

    @field_validator("span")
    @classmethod
    def validate_span(cls, v):
        """Check that the span has two items and that start <= end.

        An empty span (start == end) is accepted.

        Raises:
            ValueError: if the span does not have two items, or start > end.
        """
        has_two_items = len(v) == 2
        if not has_two_items:
            raise ValueError("span must be a tuple of (start, end)")

        start, end = v
        if start > end:
            raise ValueError("span start must be <= end")
        return v
|
|
437
|
+
|
|
438
|
+
|
|
439
|
+
class CodeRow(SQLModel, table=True):
    """SQLModel table model for the ``codes`` table.

    ``entity_dcid`` and ``code_system`` are both marked as primary-key
    columns.
    """

    __tablename__ = "codes"

    # Keep enum values (not enum names) on the model for database storage.
    model_config = {"use_enum_values": True}

    entity_dcid: str = SQLModelField(
        description="Associated entity DCID",
        min_length=1,
        primary_key=True,
        foreign_key="entities.dcid",
    )
    # Backed by an explicit string column that is part of the primary key.
    code_system: CodeSystem = SQLModelField(
        description="Code system identifier",
        sa_column=Column(SQLAString, primary_key=True),
    )
    code_value: str = SQLModelField(description="Code value", min_length=1)
    valid_from: date | None = None
    valid_until: date | None = None
    source: str | None = None

    @field_validator("valid_from", "valid_until", mode="before")
    @classmethod
    def parse_date(cls, v):
        """Turn an ISO-format string into a ``date``; return any other input as-is."""
        if isinstance(v, str):
            from datetime import datetime

            return datetime.fromisoformat(v).date()
        # None, date instances and every other type are handed back untouched.
        return v

    @field_serializer("code_system", when_used="always")
    def serialize_code_system(self, value: CodeSystem) -> str:
        """Emit the plain string value of ``code_system`` when serializing."""
        if isinstance(value, CodeSystem):
            return value.value
        # Already a plain value (e.g. a str), so pass it through.
        return value
|
|
478
|
+
|
|
479
|
+
|
|
480
|
+
class MembershipRow(SQLModel, table=True):
    """SQLModel table model for the ``memberships`` table.

    ``valid_from`` is required; ``valid_until`` and ``source`` are optional.
    """

    __tablename__ = "memberships"

    # Annotated ``int | None`` because the default is None: the value is absent
    # until one is assigned. Being the primary key, the column is unchanged.
    id: int | None = SQLModelField(
        default=None, primary_key=True, ge=0, description="Unique membership ID"
    )
    entity_dcid: str = SQLModelField(
        foreign_key="entities.dcid", min_length=1, description="Member entity DCID"
    )
    group_dcid: str = SQLModelField(
        foreign_key="entities.dcid", min_length=1, description="Group entity DCID"
    )
    valid_from: date
    valid_until: date | None = None
    source: str | None = None

    @field_validator("valid_from", "valid_until", mode="before")
    @classmethod
    def parse_date(cls, v):
        """Turn an ISO-format string into a ``date``; return any other input as-is."""
        if isinstance(v, str):
            from datetime import datetime

            return datetime.fromisoformat(v).date()
        # None, date instances and every other type are handed back untouched.
        return v
|
|
509
|
+
|
|
510
|
+
|
|
511
|
+
class Membership(BaseModel):
    """Group membership record (business logic model).

    The membership covers the half-open interval ``[valid_from, valid_until)``;
    with no ``valid_until`` it has no end.
    """

    model_config = ConfigDict(frozen=False)

    entity_dcid: str = Field(..., min_length=1)
    group_dcid: str = Field(..., min_length=1)
    valid_from: date
    valid_until: date | None = None
    source: str | None = None

    @field_validator("valid_until")
    @classmethod
    def validate_dates(cls, v, info):
        """Reject a ``valid_until`` that is not strictly after ``valid_from``.

        Raises:
            ValueError: if ``valid_until`` is on or before ``valid_from``.
        """
        # Nothing to compare when there is no end date, or when valid_from is
        # not present in the already-validated data.
        if v is None or "valid_from" not in info.data:
            return v
        if v <= info.data["valid_from"]:
            raise ValueError("valid_until must be after valid_from")
        return v

    def is_valid_at(self, at: date) -> bool:
        """Return True if ``at`` is on/after ``valid_from`` and before ``valid_until``."""
        if at < self.valid_from:
            return False
        if self.valid_until is None:
            # Open-ended membership.
            return True
        # The end date itself is excluded.
        return at < self.valid_until
|