resolvekit 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- resolvekit/README.md +134 -0
- resolvekit/__init__.py +67 -0
- resolvekit/api/README.md +165 -0
- resolvekit/api/__init__.py +10 -0
- resolvekit/api/convenience.py +53 -0
- resolvekit/api/resolver.py +457 -0
- resolvekit/builders/README.md +173 -0
- resolvekit/builders/__init__.py +0 -0
- resolvekit/calibration/README.md +351 -0
- resolvekit/calibration/__init__.py +12 -0
- resolvekit/calibration/calibrator.py +184 -0
- resolvekit/calibration/features.py +139 -0
- resolvekit/calibration/models.py +78 -0
- resolvekit/cli/README.md +215 -0
- resolvekit/cli/__init__.py +0 -0
- resolvekit/cli/main.py +18 -0
- resolvekit/config.py +128 -0
- resolvekit/constants.py +252 -0
- resolvekit/constraints/README.md +102 -0
- resolvekit/constraints/__init__.py +17 -0
- resolvekit/constraints/constraint_engine.py +111 -0
- resolvekit/constraints/hierarchy_validator.py +148 -0
- resolvekit/constraints/membership_validator.py +60 -0
- resolvekit/constraints/protocols.py +33 -0
- resolvekit/constraints/temporal_validator.py +43 -0
- resolvekit/constraints/type_validator.py +42 -0
- resolvekit/data/README.md +165 -0
- resolvekit/data/__init__.py +14 -0
- resolvekit/data/alias_repository.py +206 -0
- resolvekit/data/code_repository.py +85 -0
- resolvekit/data/context_filters.py +49 -0
- resolvekit/data/db_manager.py +196 -0
- resolvekit/data/entity_repository.py +466 -0
- resolvekit/data/membership_repository.py +107 -0
- resolvekit/data/query_builder.py +177 -0
- resolvekit/data/schema.py +122 -0
- resolvekit/disambiguation/README.md +72 -0
- resolvekit/disambiguation/__init__.py +0 -0
- resolvekit/extraction/README.md +204 -0
- resolvekit/extraction/__init__.py +0 -0
- resolvekit/matchers/README.md +77 -0
- resolvekit/matchers/__init__.py +65 -0
- resolvekit/matchers/alias_exact.py +65 -0
- resolvekit/matchers/canonical_name.py +62 -0
- resolvekit/matchers/cascade.py +127 -0
- resolvekit/matchers/code_validators.py +250 -0
- resolvekit/matchers/exact_code.py +177 -0
- resolvekit/matchers/fts_matcher.py +106 -0
- resolvekit/matchers/fuzzy_matcher.py +142 -0
- resolvekit/matchers/priorities.py +174 -0
- resolvekit/matchers/protocols.py +75 -0
- resolvekit/normalization/README.md +192 -0
- resolvekit/normalization/__init__.py +8 -0
- resolvekit/normalization/normalizer.py +164 -0
- resolvekit/overlays/README.md +226 -0
- resolvekit/overlays/__init__.py +0 -0
- resolvekit/types.py +534 -0
- resolvekit/utils/README.md +188 -0
- resolvekit/utils/__init__.py +48 -0
- resolvekit/utils/cache.py +109 -0
- resolvekit/utils/dates.py +339 -0
- resolvekit/utils/errors.py +145 -0
- resolvekit/utils/files.py +366 -0
- resolvekit/utils/logging.py +219 -0
- resolvekit/utils/text.py +475 -0
- resolvekit/utils/validation.py +301 -0
- resolvekit-0.0.1.dist-info/METADATA +36 -0
- resolvekit-0.0.1.dist-info/RECORD +70 -0
- resolvekit-0.0.1.dist-info/WHEEL +4 -0
- resolvekit-0.0.1.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
"""Database connection manager."""
|
|
2
|
+
|
|
3
|
+
from contextlib import AbstractContextManager, suppress
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import TYPE_CHECKING, Any
|
|
6
|
+
|
|
7
|
+
from sqlalchemy import create_engine, text
|
|
8
|
+
from sqlalchemy.engine import Engine
|
|
9
|
+
from sqlalchemy.pool import StaticPool
|
|
10
|
+
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
from sqlalchemy.engine import Connection
|
|
13
|
+
|
|
14
|
+
from resolvekit.utils.logging import get_logger
|
|
15
|
+
|
|
16
|
+
# Module-level logger; used by DatabaseManager.close() to report detach errors.
logger = get_logger(__name__)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class DatabaseManager:
|
|
20
|
+
"""
|
|
21
|
+
Manages SQLite database connections with overlay support.
|
|
22
|
+
|
|
23
|
+
Handles:
|
|
24
|
+
- Connection creation with SQLAlchemy
|
|
25
|
+
- Performance PRAGMA application
|
|
26
|
+
- Overlay database attachment
|
|
27
|
+
- Transaction management
|
|
28
|
+
"""
|
|
29
|
+
|
|
30
|
+
def __init__(
|
|
31
|
+
self,
|
|
32
|
+
base_path: Path,
|
|
33
|
+
overlays: list[Path] | None = None,
|
|
34
|
+
read_only: bool = True,
|
|
35
|
+
):
|
|
36
|
+
"""
|
|
37
|
+
Initialize database manager.
|
|
38
|
+
|
|
39
|
+
Args:
|
|
40
|
+
base_path: Path to base database file
|
|
41
|
+
overlays: Optional list of overlay databases (max 5, ordered by precedence)
|
|
42
|
+
read_only: Whether database is read-only
|
|
43
|
+
|
|
44
|
+
Raises:
|
|
45
|
+
ValueError: If database paths are not unique or too many overlays
|
|
46
|
+
"""
|
|
47
|
+
# Validate overlay count
|
|
48
|
+
if overlays and len(overlays) > 5:
|
|
49
|
+
raise ValueError(
|
|
50
|
+
f"Maximum 5 overlays supported, got {len(overlays)}. "
|
|
51
|
+
"Too many overlays can impact performance."
|
|
52
|
+
)
|
|
53
|
+
|
|
54
|
+
# Validate that all paths are unique
|
|
55
|
+
paths = [base_path]
|
|
56
|
+
if overlays:
|
|
57
|
+
paths.extend(overlays)
|
|
58
|
+
|
|
59
|
+
# Resolve paths to catch symbolic links and relative paths
|
|
60
|
+
resolved_paths = [p.resolve() for p in paths]
|
|
61
|
+
|
|
62
|
+
if len(resolved_paths) != len(set(resolved_paths)):
|
|
63
|
+
raise ValueError(
|
|
64
|
+
f"Database paths must be unique. Got duplicates in: {paths}"
|
|
65
|
+
)
|
|
66
|
+
|
|
67
|
+
self.base_path = base_path
|
|
68
|
+
self.overlay_paths = overlays or []
|
|
69
|
+
self.read_only = read_only
|
|
70
|
+
self.overlays: list[tuple[str, int]] = []
|
|
71
|
+
self.engine: Engine | None = None
|
|
72
|
+
|
|
73
|
+
def connect(self) -> None:
|
|
74
|
+
"""Initialize connection with pragmas and overlay attachment."""
|
|
75
|
+
# Create SQLAlchemy engine
|
|
76
|
+
uri = f"sqlite:///{self.base_path}"
|
|
77
|
+
self.engine = create_engine(
|
|
78
|
+
uri,
|
|
79
|
+
connect_args={"check_same_thread": False},
|
|
80
|
+
poolclass=StaticPool,
|
|
81
|
+
echo=False,
|
|
82
|
+
)
|
|
83
|
+
|
|
84
|
+
# Apply pragmas
|
|
85
|
+
self._apply_pragmas()
|
|
86
|
+
|
|
87
|
+
# Attach overlays
|
|
88
|
+
self._attach_overlays()
|
|
89
|
+
|
|
90
|
+
def _apply_pragmas(self) -> None:
|
|
91
|
+
"""Apply performance pragmas from CLAUDE.md."""
|
|
92
|
+
if not self.engine:
|
|
93
|
+
raise RuntimeError("Engine not initialized")
|
|
94
|
+
|
|
95
|
+
pragmas = [
|
|
96
|
+
"PRAGMA foreign_keys=ON",
|
|
97
|
+
"PRAGMA journal_mode=OFF",
|
|
98
|
+
"PRAGMA synchronous=OFF",
|
|
99
|
+
"PRAGMA temp_store=MEMORY",
|
|
100
|
+
"PRAGMA mmap_size=268435456",
|
|
101
|
+
"PRAGMA cache_size=-100000",
|
|
102
|
+
]
|
|
103
|
+
|
|
104
|
+
# Add read-only enforcement if requested
|
|
105
|
+
if self.read_only:
|
|
106
|
+
pragmas.append("PRAGMA query_only=ON")
|
|
107
|
+
|
|
108
|
+
with self.engine.connect() as conn:
|
|
109
|
+
for pragma in pragmas:
|
|
110
|
+
conn.execute(text(pragma))
|
|
111
|
+
conn.commit()
|
|
112
|
+
|
|
113
|
+
def _attach_overlays(self) -> None:
|
|
114
|
+
"""
|
|
115
|
+
Attach overlay databases with precedence tracking.
|
|
116
|
+
|
|
117
|
+
Precedence is assigned based on list order:
|
|
118
|
+
- First overlay: precedence 100 (highest)
|
|
119
|
+
- Second overlay: precedence 99
|
|
120
|
+
- Third overlay: precedence 98
|
|
121
|
+
- etc.
|
|
122
|
+
"""
|
|
123
|
+
if not self.engine:
|
|
124
|
+
raise RuntimeError("Engine not initialized")
|
|
125
|
+
|
|
126
|
+
# Clear overlays list to prevent duplicates on reconnect
|
|
127
|
+
self.overlays.clear()
|
|
128
|
+
|
|
129
|
+
with self.engine.connect() as conn:
|
|
130
|
+
# Attach overlays in order, assigning descending precedence
|
|
131
|
+
for idx, overlay_path in enumerate(self.overlay_paths):
|
|
132
|
+
schema_name = f"overlay_{idx}"
|
|
133
|
+
precedence = 100 - idx # First=100, second=99, etc.
|
|
134
|
+
|
|
135
|
+
conn.execute(text(f"ATTACH DATABASE '{overlay_path}' AS {schema_name}"))
|
|
136
|
+
self.overlays.append((schema_name, precedence))
|
|
137
|
+
|
|
138
|
+
conn.commit()
|
|
139
|
+
|
|
140
|
+
def execute(self, query: str, params: dict[str, Any] | None = None) -> Any:
|
|
141
|
+
"""
|
|
142
|
+
Execute a query.
|
|
143
|
+
|
|
144
|
+
Args:
|
|
145
|
+
query: SQL query string
|
|
146
|
+
params: Optional parameters dict
|
|
147
|
+
|
|
148
|
+
Returns:
|
|
149
|
+
List of Row objects for SELECT queries, empty list for non-SELECT
|
|
150
|
+
"""
|
|
151
|
+
if not self.engine:
|
|
152
|
+
raise RuntimeError("Database not connected. Call connect() first.")
|
|
153
|
+
|
|
154
|
+
with self.engine.connect() as conn:
|
|
155
|
+
result = conn.execute(text(query), params or {})
|
|
156
|
+
conn.commit()
|
|
157
|
+
# Consume results before connection closes (if query returns rows)
|
|
158
|
+
if result.returns_rows:
|
|
159
|
+
return result.fetchall()
|
|
160
|
+
return []
|
|
161
|
+
|
|
162
|
+
def transaction(self) -> AbstractContextManager["Connection"]:
|
|
163
|
+
"""
|
|
164
|
+
Get transaction context manager.
|
|
165
|
+
|
|
166
|
+
Returns:
|
|
167
|
+
Transaction context manager
|
|
168
|
+
"""
|
|
169
|
+
if not self.engine:
|
|
170
|
+
raise RuntimeError("Database not connected. Call connect() first.")
|
|
171
|
+
|
|
172
|
+
return self.engine.begin()
|
|
173
|
+
|
|
174
|
+
def close(self) -> None:
|
|
175
|
+
"""Close database connection and detach overlays."""
|
|
176
|
+
if self.engine:
|
|
177
|
+
# Detach overlays before disposing engine
|
|
178
|
+
if self.overlays:
|
|
179
|
+
try:
|
|
180
|
+
with self.engine.connect() as conn:
|
|
181
|
+
for overlay_name, _ in self.overlays:
|
|
182
|
+
conn.execute(text(f"DETACH DATABASE {overlay_name}"))
|
|
183
|
+
conn.commit()
|
|
184
|
+
except Exception as e:
|
|
185
|
+
# Log but don't fail - connection may already be closed
|
|
186
|
+
logger.debug(f"Error detaching overlays during close: {e}")
|
|
187
|
+
finally:
|
|
188
|
+
self.overlays.clear()
|
|
189
|
+
|
|
190
|
+
self.engine.dispose()
|
|
191
|
+
self.engine = None
|
|
192
|
+
|
|
193
|
+
def __del__(self) -> None:
|
|
194
|
+
"""Cleanup: Close database connection when object is garbage collected."""
|
|
195
|
+
with suppress(Exception):
|
|
196
|
+
self.close()
|
|
@@ -0,0 +1,466 @@
|
|
|
1
|
+
"""Entity repository."""
|
|
2
|
+
|
|
3
|
+
from datetime import date
|
|
4
|
+
from typing import Any, ClassVar, overload
|
|
5
|
+
|
|
6
|
+
from resolvekit.data.context_filters import ContextFilterBuilder
|
|
7
|
+
from resolvekit.data.db_manager import DatabaseManager
|
|
8
|
+
from resolvekit.data.query_builder import QueryBuilder
|
|
9
|
+
from resolvekit.types import Entity, EntityRow, MatchContext
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class EntityRepository:
    """
    Repository for entity operations.

    All lookups read from the base database (schema ``main``) and from every
    overlay listed in ``db_manager.overlays``; where several sources return a
    row, the one with the highest precedence is preferred.
    """

    # Derive column list from EntityRow model (single source of truth)
    ENTITY_COLUMNS: ClassVar[list[str]] = list(EntityRow.model_fields.keys())

    def __init__(self, db_manager: DatabaseManager):
        """
        Initialize repository.

        Args:
            db_manager: Database manager instance
        """
        self.db = db_manager
        self.query_builder = QueryBuilder(db_manager)

    @overload
    def find_by_dcid(
        self,
        dcid: str,
        as_of: date | None = None,
        include_codes: bool = False,
    ) -> Entity | None: ...

    @overload
    def find_by_dcid(
        self,
        dcid: list[str],
        as_of: date | None = None,
        include_codes: bool = False,
    ) -> dict[str, Entity]: ...

    def find_by_dcid(
        self,
        dcid: str | list[str],
        as_of: date | None = None,
        include_codes: bool = False,
    ) -> Entity | None | dict[str, Entity]:
        """
        Find entity by DCID.

        Args:
            dcid: Single DCID or list of DCIDs
            as_of: Optional date for temporal filtering
            include_codes: Whether to include code mappings

        Returns:
            For a single DCID: the entity, or None if not found.
            For a list: dict mapping DCID to entity (missing DCIDs omitted).
        """
        if isinstance(dcid, str):
            return self._find_single_by_dcid(dcid, as_of, include_codes)
        return self._find_batch_by_dcid(dcid, as_of, include_codes)

    def _find_single_by_dcid(
        self,
        dcid: str,
        as_of: date | None = None,
        include_codes: bool = False,
    ) -> Entity | None:
        """Find single entity by DCID; returns None when no row matches."""
        sql, params = self.query_builder.build_union_query(
            table="entities",
            columns=self.ENTITY_COLUMNS,
            where_clause="dcid = :dcid",
            params={"dcid": dcid},
            unique_key="dcid",
            as_of=as_of,
        )

        rows = self.db.execute(sql, params)
        if not rows:
            return None

        # NOTE(review): takes the first row; presumably the union query
        # already resolves overlay precedence per unique_key - confirm in
        # QueryBuilder.build_union_query.
        entity = self._row_to_entity(rows[0])

        # Load codes if requested
        if include_codes:
            self._load_codes_for_entities([entity])

        return entity

    def _find_batch_by_dcid(
        self,
        dcids: list[str],
        as_of: date | None = None,
        include_codes: bool = False,
    ) -> dict[str, Entity]:
        """Find multiple entities by DCID; returns a dict keyed by DCID."""
        if not dcids:
            return {}

        # Build IN clause with one named bind parameter per DCID
        placeholders = ", ".join(f":dcid_{i}" for i in range(len(dcids)))
        in_params = {f"dcid_{i}": value for i, value in enumerate(dcids)}

        sql, params = self.query_builder.build_union_query(
            table="entities",
            columns=self.ENTITY_COLUMNS,
            where_clause=f"dcid IN ({placeholders})",
            params=in_params,
            unique_key="dcid",
            as_of=as_of,
        )

        entities: dict[str, Entity] = {}
        for row in self.db.execute(sql, params):
            entity = self._row_to_entity(row)
            entities[entity.dcid] = entity

        # Load codes if requested
        if include_codes and entities:
            self._load_codes_for_entities(list(entities.values()))

        return entities

    @overload
    def find_by_code(
        self, code_system: str, code_value: str, context: MatchContext | None = None
    ) -> Entity | None: ...

    @overload
    def find_by_code(
        self,
        code_system: str,
        code_value: list[str],
        context: MatchContext | None = None,
    ) -> dict[str, Entity]: ...

    def find_by_code(
        self,
        code_system: str,
        code_value: str | list[str],
        context: MatchContext | None = None,
    ) -> Entity | None | dict[str, Entity]:
        """
        Find entity by code.

        Args:
            code_system: Code system (e.g., "iso2", "iso3")
            code_value: Single code or list of codes
            context: Optional filtering context

        Returns:
            For a single code: the entity, or None if not found.
            For a list: dict mapping code value to entity.
        """
        if isinstance(code_value, str):
            return self._find_single_by_code(code_system, code_value, context)
        return self._find_batch_by_code(code_system, code_value, context)

    @staticmethod
    def _wrap_with_context_filters(
        sql: str, params: dict[str, Any], context: MatchContext | None
    ) -> str:
        """
        Wrap a union query in an outer SELECT that applies context filters.

        Returns ``sql`` unchanged when no context is given. ``params`` is
        handed to ``ContextFilterBuilder.build_filters`` (presumably so it can
        add the bind values its filters need - confirm there).
        """
        if not context:
            return sql

        # Build context filters (no table prefix - subquery columns don't have aliases)
        temporal_filter, type_filter, parent_filter = (
            ContextFilterBuilder.build_filters(context, params, table_prefix="")
        )
        # Wrap the union query and apply filters
        return f"""
            SELECT * FROM ({sql}) AS filtered
            WHERE 1=1
            {temporal_filter}
            {type_filter}
            {parent_filter}
        """

    def _find_single_by_code(
        self, code_system: str, code_value: str, context: MatchContext | None = None
    ) -> Entity | None:
        """Find single entity by code; returns None when no row matches."""
        sql, params = self.query_builder.build_code_lookup_union(
            code_system=code_system,
            code_values=code_value,
            entity_columns=self.ENTITY_COLUMNS,
            include_code_value=False,
        )

        # Apply context filters if provided
        sql = self._wrap_with_context_filters(sql, params, context)

        # Order by precedence DESC to respect overlay precedence
        sql += "\nORDER BY precedence DESC LIMIT 1"

        rows = self.db.execute(sql, params)
        if not rows:
            return None
        return self._row_to_entity(rows[0])

    def _find_batch_by_code(
        self,
        code_system: str,
        code_values: list[str],
        context: MatchContext | None = None,
    ) -> dict[str, Entity]:
        """Find multiple entities by code; returns a dict keyed by code value."""
        if not code_values:
            return {}

        sql, params = self.query_builder.build_code_lookup_union(
            code_system=code_system,
            code_values=code_values,
            entity_columns=self.ENTITY_COLUMNS,
            include_code_value=True,
        )

        # Apply context filters if provided
        sql = self._wrap_with_context_filters(sql, params, context)

        # Order by code_value and precedence to group overlays together
        sql += "\nORDER BY code_value, precedence DESC"

        # Deduplicate by code_value. Rows arrive highest-precedence first
        # within each code_value, so only the first row per code is kept.
        entities: dict[str, Entity] = {}
        for row in self.db.execute(sql, params):
            code_val = row.code_value
            if code_val not in entities:
                entities[code_val] = self._row_to_entity(row)

        return entities

    def get_parent(self, dcid: str, as_of: date | None = None) -> Entity | None:
        """
        Get parent entity.

        Runs a single query that joins the child row to its parent row across
        every combination of base database and overlay, then keeps the
        highest-precedence match.

        Args:
            dcid: Entity DCID
            as_of: Optional date for temporal filtering

        Returns:
            Parent entity or None
        """
        # Select the parent's columns; the result columns keep the bare names.
        entity_cols = ", ".join(f"parent.{col}" for col in self.ENTITY_COLUMNS)

        # Build temporal filter if needed
        temporal_filter = ""
        params: dict[str, Any] = {"dcid": dcid}
        if as_of is not None:
            # Apply temporal filter to BOTH child and parent to ensure correct historical row selection
            temporal_filter = """
                AND (child.valid_from IS NULL OR child.valid_from <= :as_of)
                AND (child.valid_until IS NULL OR child.valid_until >= :as_of)
                AND (parent.valid_from IS NULL OR parent.valid_from <= :as_of)
                AND (parent.valid_until IS NULL OR parent.valid_until >= :as_of)
            """
            params["as_of"] = as_of.isoformat()

        def parent_select(
            child_schema: str, parent_schema: str, precedence: int, source_db: str
        ) -> str:
            # One child->parent join between the two given schemas.
            return f"""
                SELECT {entity_cols}, {precedence} AS precedence, '{source_db}' AS source_db
                FROM {child_schema}.entities child
                INNER JOIN {parent_schema}.entities parent ON child.parent_dcid = parent.dcid
                WHERE child.dcid = :dcid{temporal_filter}
            """

        # Query from main database
        selects = [parent_select("main", "main", 0, "main")]

        # Add overlays if present (join child from any source with parent from any source)
        # NOTE(review): the overlay/overlay and main-child/overlay-parent rows
        # share the same precedence, so a tie between them is resolved in
        # whatever order SQLite returns - confirm this is acceptable.
        for schema_name, precedence in self.db.overlays:
            # Child in overlay, parent in overlay
            selects.append(
                parent_select(schema_name, schema_name, precedence, schema_name)
            )
            # Child in overlay, parent in main (half the overlay's precedence)
            selects.append(
                parent_select(schema_name, "main", precedence // 2, "mixed")
            )
            # Child in main, parent in overlay (respect overlay precedence)
            selects.append(
                parent_select("main", schema_name, precedence, schema_name)
            )

        # Order by precedence to get highest precedence result
        sql = "\nUNION ALL\n".join(selects) + "\nORDER BY precedence DESC LIMIT 1"

        rows = self.db.execute(sql, params)
        if not rows:
            return None
        return self._row_to_entity(rows[0])

    def get_children(self, dcid: str, as_of: date | None = None) -> list[Entity]:
        """
        Get child entities.

        Args:
            dcid: Parent entity DCID
            as_of: Optional date for temporal filtering

        Returns:
            List of child entities (empty if there are none)
        """
        sql, params = self.query_builder.build_union_query(
            table="entities",
            columns=self.ENTITY_COLUMNS,
            where_clause="parent_dcid = :parent_dcid",
            params={"parent_dcid": dcid},
            unique_key="dcid",
            as_of=as_of,
        )

        return [self._row_to_entity(row) for row in self.db.execute(sql, params)]

    def _row_to_entity(self, row: Any) -> Entity:
        """
        Convert database row to Entity model via Pydantic validation.

        Only the attributes named in ``ENTITY_COLUMNS`` are read from ``row``;
        extra columns such as ``precedence`` are ignored. The returned entity
        starts with empty ``codes`` and ``provenance``.
        """
        # Convert SQLAlchemy Row to dict using only entity columns
        row_dict = {col: getattr(row, col) for col in self.ENTITY_COLUMNS}

        # Validate through EntityRow model
        entity_row = EntityRow.model_validate(row_dict)

        # Convert EntityRow to Entity (adding computed fields)
        return Entity(
            **entity_row.model_dump(),
            codes={},
            provenance={},
        )

    def find_by_canonical_name(
        self, normalized_canonical: str, context: MatchContext | None = None
    ) -> Entity | None:
        """
        Find entity by normalized canonical name.

        Uses the indexed normalized_canonical column for efficient lookups.

        Args:
            normalized_canonical: Normalized canonical name
            context: Optional filtering context

        Returns:
            Entity if found, None otherwise
        """
        params: dict[str, Any] = {"normalized": normalized_canonical}

        # Build context filters using shared utility
        # NOTE(review): unlike the code lookups, build_filters is called here
        # even when context is None - presumably it then yields empty
        # filters; confirm in ContextFilterBuilder.
        temporal_filter, type_filter, parent_filter = (
            ContextFilterBuilder.build_filters(context, params, table_prefix="")
        )

        entity_cols = ", ".join(self.ENTITY_COLUMNS)

        # Query from main database using indexed normalized_canonical column
        sql = f"""
            SELECT {entity_cols}, 0 AS precedence
            FROM main.entities
            WHERE normalized_canonical = :normalized
            {temporal_filter}
            {type_filter}
            {parent_filter}
        """

        # Add overlays if present
        for schema_name, precedence in self.db.overlays:
            sql += f"""
            UNION ALL
            SELECT {entity_cols}, {precedence} AS precedence
            FROM {schema_name}.entities
            WHERE normalized_canonical = :normalized
            {temporal_filter}
            {type_filter}
            {parent_filter}
            """

        # Order by precedence to get highest precedence result
        sql += "\nORDER BY precedence DESC LIMIT 1"

        rows = self.db.execute(sql, params)
        if not rows:
            return None
        return self._row_to_entity(rows[0])

    def _load_codes_for_entities(self, entities: list[Entity]) -> None:
        """
        Load codes for entities and populate their codes dict in place.

        Codes are read from the base database and every overlay. When the
        same (entity, code_system) pair appears in several sources, the value
        from the highest-precedence source is kept. Entities without any code
        rows are left untouched.

        Args:
            entities: List of entities to load codes for
        """
        if not entities:
            return

        # Get all DCIDs and build one named bind parameter per DCID
        dcids = [e.dcid for e in entities]
        placeholders = ", ".join(f":dcid_{i}" for i in range(len(dcids)))
        params = {f"dcid_{i}": dcid for i, dcid in enumerate(dcids)}

        # Base database has precedence 0; overlays carry their own precedence.
        sources = [("main", 0), *self.db.overlays]
        sql = "\nUNION ALL\n".join(
            f"""
            SELECT entity_dcid, code_system, code_value, {precedence} AS precedence
            FROM {schema_name}.codes
            WHERE entity_dcid IN ({placeholders})
            """
            for schema_name, precedence in sources
        )
        # Lowest precedence first: rows are applied in order below, so a
        # higher-precedence source overwrites a lower one (last write wins).
        sql += "\nORDER BY precedence ASC"

        # Group codes by entity_dcid
        codes_by_dcid: dict[str, dict[str, str]] = {}
        for row in self.db.execute(sql, params):
            codes_by_dcid.setdefault(row.entity_dcid, {})[row.code_system] = (
                row.code_value
            )

        # Populate codes in entities
        for entity in entities:
            if entity.dcid in codes_by_dcid:
                entity.codes = codes_by_dcid[entity.dcid]
|