norm_toolkit 1.3.0__tar.gz → 1.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {norm_toolkit-1.3.0 → norm_toolkit-1.5.0}/PKG-INFO +1 -1
- {norm_toolkit-1.3.0 → norm_toolkit-1.5.0}/pyproject.toml +1 -1
- norm_toolkit-1.5.0/src/norm_toolkit/normalizer_cache.py +163 -0
- {norm_toolkit-1.3.0 → norm_toolkit-1.5.0}/src/norm_toolkit/normalizer_postgres.py +123 -21
- {norm_toolkit-1.3.0 → norm_toolkit-1.5.0}/README.md +0 -0
- {norm_toolkit-1.3.0 → norm_toolkit-1.5.0}/src/norm_toolkit/__init__.py +0 -0
- {norm_toolkit-1.3.0 → norm_toolkit-1.5.0}/src/norm_toolkit/build_merged.py +0 -0
- {norm_toolkit-1.3.0 → norm_toolkit-1.5.0}/src/norm_toolkit/build_ontology.py +0 -0
- {norm_toolkit-1.3.0 → norm_toolkit-1.5.0}/src/norm_toolkit/build_umls.py +0 -0
- {norm_toolkit-1.3.0 → norm_toolkit-1.5.0}/src/norm_toolkit/constants.py +0 -0
- {norm_toolkit-1.3.0 → norm_toolkit-1.5.0}/src/norm_toolkit/models.py +0 -0
- {norm_toolkit-1.3.0 → norm_toolkit-1.5.0}/src/norm_toolkit/normalizer.py +0 -0
- {norm_toolkit-1.3.0 → norm_toolkit-1.5.0}/src/norm_toolkit/utils.py +0 -0
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
"""
|
|
2
|
+
LRU cache for normalized string lookup results.
|
|
3
|
+
|
|
4
|
+
Caches at the normalized string level to avoid repeated DB round trips
|
|
5
|
+
for the same normalized forms.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import hashlib
|
|
11
|
+
from collections import OrderedDict
|
|
12
|
+
from dataclasses import dataclass
|
|
13
|
+
from typing import Any
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass(frozen=True)
class CacheKey:
    """
    Hashable, immutable identity of one cached lookup.

    Two lookups share a cache entry only when every field below compares
    equal. Sequence-valued parameters are stored as tuples (or None) so the
    instance can be used as a dictionary key.
    """

    # Hex digest computed over the sorted normalized strings of the query.
    nstrs_hash: str
    # Maximum number of results requested.
    top_k: int
    # Preferred term types, or None when not supplied.
    prefer_ttys: tuple[str, ...] | None
    # Sources to include exclusively, or None when not supplied.
    filter_sources: tuple[str, ...] | None
    # Sources to exclude, or None when not supplied.
    exclude_sources: tuple[str, ...] | None
    # Whether partial matching is enabled.
    allow_partial: bool
    # Minimum coverage threshold.
    min_coverage: float
    # Minimum word hits required, or None when not supplied.
    min_word_hits: int | None
    # Weight given to coverage in scoring.
    coverage_weight: int
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class NormalizerCache:
    """
    LRU cache for normalized string lookup results.

    Caches the fully enriched hits for a given tuple of normalized strings
    and query parameters. Entries live in an OrderedDict whose first item is
    always the least recently used one, so eviction is O(1).

    A ``maxsize`` of zero or less disables storage: ``set`` becomes a no-op
    and every ``get`` is counted as a miss.
    """

    def __init__(self, maxsize: int = 10000) -> None:
        """
        Initialize the cache.

        Args:
            maxsize: Maximum number of entries to cache. When exceeded,
                the least recently used entries are evicted. Values <= 0
                disable caching.
        """
        self._cache: OrderedDict[CacheKey, list[dict[str, Any]]] = OrderedDict()
        self._maxsize = maxsize
        self._hits = 0
        self._misses = 0

    @staticmethod
    def make_key(
        nstrs: tuple[str, ...],
        *,
        top_k: int,
        prefer_ttys: list[str] | None,
        filter_sources: list[str] | None,
        exclude_sources: list[str] | None,
        allow_partial: bool,
        min_coverage: float,
        min_word_hits: int | None,
        coverage_weight: int,
    ) -> CacheKey:
        """
        Create a cache key from normalized strings and query parameters.

        Args:
            nstrs: Tuple of normalized strings for the query
            top_k: Maximum number of results
            prefer_ttys: Preferred term types
            filter_sources: Include only these sources
            exclude_sources: Exclude these sources
            allow_partial: Whether partial matching is enabled
            min_coverage: Minimum coverage threshold
            min_word_hits: Minimum word hits required
            coverage_weight: Weight for coverage in scoring

        Returns:
            Immutable CacheKey instance. Empty lists and None both map to
            None in the key's sequence-valued fields.
        """
        # Hash the normalized strings for compact storage. Sorting makes the
        # digest independent of the order in which the strings were produced;
        # NUL is used as the separator between strings.
        nstrs_str = "\0".join(sorted(nstrs))
        nstrs_hash = hashlib.md5(nstrs_str.encode(), usedforsecurity=False).hexdigest()

        return CacheKey(
            nstrs_hash=nstrs_hash,
            top_k=top_k,
            prefer_ttys=tuple(prefer_ttys) if prefer_ttys else None,
            filter_sources=tuple(filter_sources) if filter_sources else None,
            exclude_sources=tuple(exclude_sources) if exclude_sources else None,
            allow_partial=allow_partial,
            min_coverage=min_coverage,
            min_word_hits=min_word_hits,
            coverage_weight=coverage_weight,
        )

    def get(self, key: CacheKey) -> list[dict[str, Any]] | None:
        """
        Get cached hits for a key.

        Args:
            key: Cache key to look up

        Returns:
            Cached hits list if found, None if not in cache. The returned
            list is the stored object itself, not a copy.
        """
        try:
            hits = self._cache[key]
        except KeyError:
            self._misses += 1
            return None
        # Mark as most recently used.
        self._cache.move_to_end(key)
        self._hits += 1
        return hits

    def set(self, key: CacheKey, hits: list[dict[str, Any]]) -> None:
        """
        Store hits in the cache.

        Args:
            key: Cache key
            hits: List of hit dictionaries to cache
        """
        if self._maxsize <= 0:
            # Nothing may be stored; evicting from an empty mapping would
            # raise KeyError, so bail out before touching it.
            return
        if key in self._cache:
            self._cache.move_to_end(key)
        elif len(self._cache) >= self._maxsize:
            # Remove oldest item (LRU eviction) to make room.
            self._cache.popitem(last=False)
        self._cache[key] = hits

    def clear(self) -> None:
        """Clear all cached entries and reset the hit/miss counters."""
        self._cache.clear()
        self._hits = 0
        self._misses = 0

    @property
    def size(self) -> int:
        """Current number of cached entries."""
        return len(self._cache)

    @property
    def hit_rate(self) -> float:
        """Cache hit rate (0.0 to 1.0); 0.0 when no lookups were made."""
        total = self._hits + self._misses
        return self._hits / total if total > 0 else 0.0

    def stats(self) -> dict[str, Any]:
        """
        Get cache statistics.

        Returns:
            Dict with size, maxsize, hits, misses, and hit_rate
        """
        return {
            "size": self.size,
            "maxsize": self._maxsize,
            "hits": self._hits,
            "misses": self._misses,
            "hit_rate": self.hit_rate,
        }
|
|
@@ -32,6 +32,7 @@ from norm_toolkit.constants import (
|
|
|
32
32
|
TYPES_TABLE,
|
|
33
33
|
)
|
|
34
34
|
from norm_toolkit.models import ConceptInfo, SemanticType
|
|
35
|
+
from norm_toolkit.normalizer_cache import NormalizerCache
|
|
35
36
|
|
|
36
37
|
|
|
37
38
|
class PostgresNormalizer:
|
|
@@ -47,6 +48,8 @@ class PostgresNormalizer:
|
|
|
47
48
|
engine: AsyncEngine,
|
|
48
49
|
schema: str = "public",
|
|
49
50
|
owned_resource: Any | None = None,
|
|
51
|
+
cache_maxsize: int = 10000,
|
|
52
|
+
enable_cache: bool = True,
|
|
50
53
|
) -> None:
|
|
51
54
|
"""
|
|
52
55
|
Initialize the normalizer with an SQLAlchemy AsyncEngine.
|
|
@@ -56,6 +59,8 @@ class PostgresNormalizer:
|
|
|
56
59
|
schema: PostgreSQL schema where tables are located (default: "public")
|
|
57
60
|
owned_resource: Optional resource with async close() method to clean up
|
|
58
61
|
when this normalizer is closed (e.g., AlloyDB AsyncConnector)
|
|
62
|
+
cache_maxsize: Maximum number of entries in the normalized string cache
|
|
63
|
+
enable_cache: Whether to enable caching of normalized string lookups
|
|
59
64
|
|
|
60
65
|
Note:
|
|
61
66
|
After creating the normalizer, call `await normalizer.initialize()`
|
|
@@ -70,6 +75,11 @@ class PostgresNormalizer:
|
|
|
70
75
|
self._has_stt = False
|
|
71
76
|
self._initialized = False
|
|
72
77
|
|
|
78
|
+
# Initialize cache
|
|
79
|
+
self._cache: NormalizerCache | None = (
|
|
80
|
+
NormalizerCache(maxsize=cache_maxsize) if enable_cache else None
|
|
81
|
+
)
|
|
82
|
+
|
|
73
83
|
# Build qualified table names
|
|
74
84
|
prefix = f"{schema}." if schema else ""
|
|
75
85
|
self._ns_table = f"{prefix}{NS_TABLE}"
|
|
@@ -147,8 +157,8 @@ class PostgresNormalizer:
|
|
|
147
157
|
if prefer_ttys is None:
|
|
148
158
|
prefer_ttys = DEFAULT_PREFER_TTYS
|
|
149
159
|
|
|
150
|
-
# Build normalized string map
|
|
151
|
-
q_to_nstrs: dict[str,
|
|
160
|
+
# Build normalized string map (use tuple for hashable cache keys)
|
|
161
|
+
q_to_nstrs: dict[str, tuple[str, ...]] = {}
|
|
152
162
|
for s in strings:
|
|
153
163
|
nstrs = list(lvg_normalize(s) or [])
|
|
154
164
|
# Add normalized forms of synonyms
|
|
@@ -156,23 +166,87 @@ class PostgresNormalizer:
|
|
|
156
166
|
for syn in synonyms[s]:
|
|
157
167
|
syn_nstrs = list(lvg_normalize(syn) or [])
|
|
158
168
|
nstrs.extend(syn_nstrs)
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
169
|
+
# Deduplicate while preserving order, then convert to tuple
|
|
170
|
+
q_to_nstrs[s] = tuple(dict.fromkeys(nstrs))
|
|
171
|
+
|
|
172
|
+
# Check cache for each input
|
|
173
|
+
cached_hits: dict[str, list[dict[str, Any]]] = {}
|
|
174
|
+
uncached_queries: list[str] = []
|
|
175
|
+
uncached_q_to_nstrs: dict[str, tuple[str, ...]] = {}
|
|
176
|
+
|
|
177
|
+
for q, nstrs in q_to_nstrs.items():
|
|
178
|
+
if not nstrs:
|
|
179
|
+
# No normalized strings, empty result
|
|
180
|
+
cached_hits[q] = []
|
|
181
|
+
continue
|
|
182
|
+
|
|
183
|
+
if self._cache is not None:
|
|
184
|
+
cache_key = NormalizerCache.make_key(
|
|
185
|
+
nstrs,
|
|
186
|
+
top_k=top_k,
|
|
187
|
+
prefer_ttys=prefer_ttys,
|
|
188
|
+
filter_sources=filter_sources,
|
|
189
|
+
exclude_sources=exclude_sources,
|
|
190
|
+
allow_partial=allow_partial,
|
|
191
|
+
min_coverage=min_coverage,
|
|
192
|
+
min_word_hits=min_word_hits,
|
|
193
|
+
coverage_weight=coverage_weight,
|
|
194
|
+
)
|
|
195
|
+
cached = self._cache.get(cache_key)
|
|
196
|
+
if cached is not None:
|
|
197
|
+
cached_hits[q] = cached
|
|
198
|
+
continue
|
|
199
|
+
|
|
200
|
+
uncached_queries.append(q)
|
|
201
|
+
uncached_q_to_nstrs[q] = nstrs
|
|
202
|
+
|
|
203
|
+
# Query DB for uncached entries
|
|
204
|
+
if uncached_q_to_nstrs:
|
|
205
|
+
# Convert tuples back to lists for _lookup
|
|
206
|
+
uncached_q_to_nstrs_list: dict[str, list[str]] = {
|
|
207
|
+
q: list(nstrs) for q, nstrs in uncached_q_to_nstrs.items()
|
|
208
|
+
}
|
|
209
|
+
|
|
210
|
+
fresh_result = await self._lookup(
|
|
211
|
+
q_to_nstrs=uncached_q_to_nstrs_list,
|
|
212
|
+
all_queries=uncached_queries,
|
|
213
|
+
prefer_ttys=prefer_ttys,
|
|
214
|
+
filter_sources=filter_sources,
|
|
215
|
+
exclude_sources=exclude_sources,
|
|
216
|
+
top_k=top_k,
|
|
217
|
+
allow_partial=allow_partial,
|
|
218
|
+
min_coverage=min_coverage,
|
|
219
|
+
min_word_hits=min_word_hits,
|
|
220
|
+
coverage_weight=coverage_weight,
|
|
221
|
+
)
|
|
173
222
|
|
|
174
|
-
|
|
175
|
-
|
|
223
|
+
# Enrich fresh results
|
|
224
|
+
fresh_result = await self._enrich_hits_with_concept_info(fresh_result, prefer_ttys)
|
|
225
|
+
|
|
226
|
+
# Cache fresh results and add to cached_hits
|
|
227
|
+
for row in fresh_result.iter_rows(named=True):
|
|
228
|
+
q = row["input_string"]
|
|
229
|
+
hits = row["hits"] or []
|
|
230
|
+
cached_hits[q] = hits
|
|
231
|
+
|
|
232
|
+
if self._cache is not None:
|
|
233
|
+
nstrs = uncached_q_to_nstrs[q]
|
|
234
|
+
cache_key = NormalizerCache.make_key(
|
|
235
|
+
nstrs,
|
|
236
|
+
top_k=top_k,
|
|
237
|
+
prefer_ttys=prefer_ttys,
|
|
238
|
+
filter_sources=filter_sources,
|
|
239
|
+
exclude_sources=exclude_sources,
|
|
240
|
+
allow_partial=allow_partial,
|
|
241
|
+
min_coverage=min_coverage,
|
|
242
|
+
min_word_hits=min_word_hits,
|
|
243
|
+
coverage_weight=coverage_weight,
|
|
244
|
+
)
|
|
245
|
+
self._cache.set(cache_key, hits)
|
|
246
|
+
|
|
247
|
+
# Build final result in original order
|
|
248
|
+
result_data = [{"input_string": s, "hits": cached_hits.get(s, [])} for s in strings]
|
|
249
|
+
result = pl.DataFrame(result_data).cast({"hits": pl.List(HIT_STRUCT_TYPE)})
|
|
176
250
|
|
|
177
251
|
# Add synonyms column if synonyms were provided
|
|
178
252
|
if synonyms:
|
|
@@ -843,6 +917,7 @@ ORDER BY t.concept_id, t.type_tree, t.type_id;
|
|
|
843
917
|
concept_id: str,
|
|
844
918
|
max_depth: int | None = 10,
|
|
845
919
|
filter_sources: list[str] | None = None,
|
|
920
|
+
max_ids: int | None = None,
|
|
846
921
|
) -> list[str]:
|
|
847
922
|
"""
|
|
848
923
|
Get all narrower (descendant) concept IDs using recursive traversal.
|
|
@@ -853,9 +928,11 @@ ORDER BY t.concept_id, t.type_tree, t.type_id;
|
|
|
853
928
|
concept_id: Starting concept ID (broader term)
|
|
854
929
|
max_depth: Maximum depth to traverse (1 = direct children only, None = all descendants)
|
|
855
930
|
filter_sources: Only follow edges from these sources (e.g., ["SNOMEDCT_US"])
|
|
931
|
+
max_ids: Maximum number of concept IDs to return (None = no limit)
|
|
856
932
|
|
|
857
933
|
Returns:
|
|
858
|
-
List of descendant concept IDs
|
|
934
|
+
List of descendant concept IDs ordered by depth (shallowest first),
|
|
935
|
+
excludes the starting concept
|
|
859
936
|
"""
|
|
860
937
|
await self._ensure_initialized()
|
|
861
938
|
|
|
@@ -875,10 +952,16 @@ ORDER BY t.concept_id, t.type_tree, t.type_id;
|
|
|
875
952
|
sources_sql = ", ".join(src_placeholders)
|
|
876
953
|
source_filter = f" AND e.source IN ({sources_sql})"
|
|
877
954
|
|
|
955
|
+
# Build optional LIMIT clause
|
|
956
|
+
limit_clause = ""
|
|
957
|
+
if max_ids is not None:
|
|
958
|
+
params["max_ids"] = max_ids
|
|
959
|
+
limit_clause = "\nLIMIT :max_ids"
|
|
960
|
+
|
|
878
961
|
# PostgreSQL recursive CTE with named parameters
|
|
879
962
|
# Use CAST() instead of :: to avoid conflicts with SQLAlchemy named params
|
|
880
963
|
# UNION (not UNION ALL) deduplicates on (concept_id, depth) during recursion
|
|
881
|
-
#
|
|
964
|
+
# GROUP BY with MIN(depth) gets shortest path depth for each concept
|
|
882
965
|
query = f"""
|
|
883
966
|
WITH RECURSIVE walk(concept_id, depth) AS (
|
|
884
967
|
SELECT CAST(:concept_id AS VARCHAR), 0
|
|
@@ -890,9 +973,11 @@ WITH RECURSIVE walk(concept_id, depth) AS (
|
|
|
890
973
|
JOIN {self._edges_table} e ON e.parent_id = w.concept_id
|
|
891
974
|
WHERE (CAST(:max_depth AS INTEGER) IS NULL OR w.depth < :max_depth){source_filter}
|
|
892
975
|
)
|
|
893
|
-
SELECT
|
|
976
|
+
SELECT concept_id, MIN(depth) AS min_depth
|
|
894
977
|
FROM walk
|
|
895
978
|
WHERE concept_id != :concept_id
|
|
979
|
+
GROUP BY concept_id
|
|
980
|
+
ORDER BY min_depth, concept_id{limit_clause}
|
|
896
981
|
"""
|
|
897
982
|
|
|
898
983
|
async with self._engine.connect() as conn:
|
|
@@ -901,6 +986,23 @@ WHERE concept_id != :concept_id
|
|
|
901
986
|
|
|
902
987
|
return [r["concept_id"] for r in rows]
|
|
903
988
|
|
|
989
|
+
def cache_stats(self) -> dict[str, Any] | None:
    """
    Report statistics of the lookup cache.

    Returns:
        Whatever the cache's ``stats()`` method returns (size, maxsize,
        hits, misses, and hit_rate), or None if caching is disabled.
    """
    cache = self._cache
    return cache.stats() if cache is not None else None
|
|
1000
|
+
|
|
1001
|
+
def clear_cache(self) -> None:
    """Empty the lookup cache; does nothing when caching is disabled."""
    cache = self._cache
    if cache is None:
        return
    cache.clear()
|
|
1005
|
+
|
|
904
1006
|
async def close(self) -> None:
|
|
905
1007
|
"""
|
|
906
1008
|
Close the engine and any owned resources.
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|