norm_toolkit 1.3.0__tar.gz → 1.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {norm_toolkit-1.3.0 → norm_toolkit-1.4.0}/PKG-INFO +1 -1
- {norm_toolkit-1.3.0 → norm_toolkit-1.4.0}/pyproject.toml +1 -1
- norm_toolkit-1.4.0/src/norm_toolkit/normalizer_cache.py +163 -0
- {norm_toolkit-1.3.0 → norm_toolkit-1.4.0}/src/norm_toolkit/normalizer_postgres.py +109 -18
- {norm_toolkit-1.3.0 → norm_toolkit-1.4.0}/README.md +0 -0
- {norm_toolkit-1.3.0 → norm_toolkit-1.4.0}/src/norm_toolkit/__init__.py +0 -0
- {norm_toolkit-1.3.0 → norm_toolkit-1.4.0}/src/norm_toolkit/build_merged.py +0 -0
- {norm_toolkit-1.3.0 → norm_toolkit-1.4.0}/src/norm_toolkit/build_ontology.py +0 -0
- {norm_toolkit-1.3.0 → norm_toolkit-1.4.0}/src/norm_toolkit/build_umls.py +0 -0
- {norm_toolkit-1.3.0 → norm_toolkit-1.4.0}/src/norm_toolkit/constants.py +0 -0
- {norm_toolkit-1.3.0 → norm_toolkit-1.4.0}/src/norm_toolkit/models.py +0 -0
- {norm_toolkit-1.3.0 → norm_toolkit-1.4.0}/src/norm_toolkit/normalizer.py +0 -0
- {norm_toolkit-1.3.0 → norm_toolkit-1.4.0}/src/norm_toolkit/utils.py +0 -0
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
"""
|
|
2
|
+
LRU cache for normalized string lookup results.
|
|
3
|
+
|
|
4
|
+
Caches at the normalized string level to avoid repeated DB round trips
|
|
5
|
+
for the same normalized forms.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import hashlib
|
|
11
|
+
from collections import OrderedDict
|
|
12
|
+
from dataclasses import dataclass
|
|
13
|
+
from typing import Any
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass(frozen=True)
class CacheKey:
    """
    Hashable identity of one cached lookup.

    Two lookups share a cache entry only when every field below compares
    equal. The dataclass is frozen so instances can be used as dict keys.
    """

    # Hash of sorted normalized strings
    nstrs_hash: str
    # Maximum number of results
    top_k: int
    # Preferred term types (None when not supplied)
    prefer_ttys: tuple[str, ...] | None
    # Include only these sources (None when not supplied)
    filter_sources: tuple[str, ...] | None
    # Exclude these sources (None when not supplied)
    exclude_sources: tuple[str, ...] | None
    # Whether partial matching is enabled
    allow_partial: bool
    # Minimum coverage threshold
    min_coverage: float
    # Minimum word hits required
    min_word_hits: int | None
    # Weight for coverage in scoring
    coverage_weight: int
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class NormalizerCache:
    """
    LRU cache for normalized string lookup results.

    Caches the fully enriched hits for a given tuple of normalized strings
    and query parameters. Entries live in an OrderedDict ordered from least
    to most recently used, so eviction pops from the front in O(1).
    """

    def __init__(self, maxsize: int = 10000) -> None:
        """
        Initialize the cache.

        Args:
            maxsize: Maximum number of entries to cache. When exceeded,
                the least recently used entries are evicted. A value of
                zero or less means nothing is ever stored.
        """
        self._cache: OrderedDict[CacheKey, list[dict[str, Any]]] = OrderedDict()
        self._maxsize = maxsize
        self._hits = 0
        self._misses = 0

    @staticmethod
    def make_key(
        nstrs: tuple[str, ...],
        *,
        top_k: int,
        prefer_ttys: list[str] | None,
        filter_sources: list[str] | None,
        exclude_sources: list[str] | None,
        allow_partial: bool,
        min_coverage: float,
        min_word_hits: int | None,
        coverage_weight: int,
    ) -> CacheKey:
        """
        Create a cache key from normalized strings and query parameters.

        Args:
            nstrs: Tuple of normalized strings for the query
            top_k: Maximum number of results
            prefer_ttys: Preferred term types
            filter_sources: Include only these sources
            exclude_sources: Exclude these sources
            allow_partial: Whether partial matching is enabled
            min_coverage: Minimum coverage threshold
            min_word_hits: Minimum word hits required
            coverage_weight: Weight for coverage in scoring

        Returns:
            Immutable CacheKey instance
        """
        # Hash the normalized strings for compact storage. Sorting first
        # makes the digest independent of the order the strings arrive in.
        nstrs_str = "\0".join(sorted(nstrs))
        nstrs_hash = hashlib.md5(nstrs_str.encode(), usedforsecurity=False).hexdigest()

        # Lists are converted to tuples so the key stays hashable. Note that
        # an empty list and None both become None here.
        return CacheKey(
            nstrs_hash=nstrs_hash,
            top_k=top_k,
            prefer_ttys=tuple(prefer_ttys) if prefer_ttys else None,
            filter_sources=tuple(filter_sources) if filter_sources else None,
            exclude_sources=tuple(exclude_sources) if exclude_sources else None,
            allow_partial=allow_partial,
            min_coverage=min_coverage,
            min_word_hits=min_word_hits,
            coverage_weight=coverage_weight,
        )

    def get(self, key: CacheKey) -> list[dict[str, Any]] | None:
        """
        Get cached hits for a key.

        A successful lookup marks the entry as most recently used and
        increments the hit counter; a failed lookup increments the miss
        counter.

        Args:
            key: Cache key to look up

        Returns:
            Cached hits list if found, None if not in cache. The returned
            list is the stored object itself, not a copy.
        """
        try:
            hits = self._cache[key]
        except KeyError:
            self._misses += 1
            return None

        # Move to end (most recently used)
        self._cache.move_to_end(key)
        self._hits += 1
        return hits

    def set(self, key: CacheKey, hits: list[dict[str, Any]]) -> None:
        """
        Store hits in the cache.

        Overwriting an existing key refreshes its recency. Inserting a new
        key into a full cache first evicts the least recently used entry.

        Args:
            key: Cache key
            hits: List of hit dictionaries to cache
        """
        if self._maxsize <= 0:
            # A non-positive capacity can hold nothing. Returning here also
            # avoids popitem() raising KeyError on an empty OrderedDict.
            return

        if key in self._cache:
            self._cache.move_to_end(key)
        elif len(self._cache) >= self._maxsize:
            # Remove oldest item (LRU eviction)
            self._cache.popitem(last=False)
        self._cache[key] = hits

    def clear(self) -> None:
        """Clear all cached entries and reset the hit/miss counters."""
        self._cache.clear()
        self._hits = 0
        self._misses = 0

    @property
    def size(self) -> int:
        """Current number of cached entries."""
        return len(self._cache)

    @property
    def hit_rate(self) -> float:
        """Cache hit rate (0.0 to 1.0); 0.0 when no lookups have happened."""
        total = self._hits + self._misses
        return self._hits / total if total > 0 else 0.0

    def stats(self) -> dict[str, Any]:
        """
        Get cache statistics.

        Returns:
            Dict with size, maxsize, hits, misses, and hit_rate
        """
        return {
            "size": self.size,
            "maxsize": self._maxsize,
            "hits": self._hits,
            "misses": self._misses,
            "hit_rate": self.hit_rate,
        }
|
|
@@ -32,6 +32,7 @@ from norm_toolkit.constants import (
|
|
|
32
32
|
TYPES_TABLE,
|
|
33
33
|
)
|
|
34
34
|
from norm_toolkit.models import ConceptInfo, SemanticType
|
|
35
|
+
from norm_toolkit.normalizer_cache import NormalizerCache
|
|
35
36
|
|
|
36
37
|
|
|
37
38
|
class PostgresNormalizer:
|
|
@@ -47,6 +48,8 @@ class PostgresNormalizer:
|
|
|
47
48
|
engine: AsyncEngine,
|
|
48
49
|
schema: str = "public",
|
|
49
50
|
owned_resource: Any | None = None,
|
|
51
|
+
cache_maxsize: int = 10000,
|
|
52
|
+
enable_cache: bool = True,
|
|
50
53
|
) -> None:
|
|
51
54
|
"""
|
|
52
55
|
Initialize the normalizer with an SQLAlchemy AsyncEngine.
|
|
@@ -56,6 +59,8 @@ class PostgresNormalizer:
|
|
|
56
59
|
schema: PostgreSQL schema where tables are located (default: "public")
|
|
57
60
|
owned_resource: Optional resource with async close() method to clean up
|
|
58
61
|
when this normalizer is closed (e.g., AlloyDB AsyncConnector)
|
|
62
|
+
cache_maxsize: Maximum number of entries in the normalized string cache
|
|
63
|
+
enable_cache: Whether to enable caching of normalized string lookups
|
|
59
64
|
|
|
60
65
|
Note:
|
|
61
66
|
After creating the normalizer, call `await normalizer.initialize()`
|
|
@@ -70,6 +75,11 @@ class PostgresNormalizer:
|
|
|
70
75
|
self._has_stt = False
|
|
71
76
|
self._initialized = False
|
|
72
77
|
|
|
78
|
+
# Initialize cache
|
|
79
|
+
self._cache: NormalizerCache | None = (
|
|
80
|
+
NormalizerCache(maxsize=cache_maxsize) if enable_cache else None
|
|
81
|
+
)
|
|
82
|
+
|
|
73
83
|
# Build qualified table names
|
|
74
84
|
prefix = f"{schema}." if schema else ""
|
|
75
85
|
self._ns_table = f"{prefix}{NS_TABLE}"
|
|
@@ -147,8 +157,8 @@ class PostgresNormalizer:
|
|
|
147
157
|
if prefer_ttys is None:
|
|
148
158
|
prefer_ttys = DEFAULT_PREFER_TTYS
|
|
149
159
|
|
|
150
|
-
# Build normalized string map
|
|
151
|
-
q_to_nstrs: dict[str,
|
|
160
|
+
# Build normalized string map (use tuple for hashable cache keys)
|
|
161
|
+
q_to_nstrs: dict[str, tuple[str, ...]] = {}
|
|
152
162
|
for s in strings:
|
|
153
163
|
nstrs = list(lvg_normalize(s) or [])
|
|
154
164
|
# Add normalized forms of synonyms
|
|
@@ -156,23 +166,87 @@ class PostgresNormalizer:
|
|
|
156
166
|
for syn in synonyms[s]:
|
|
157
167
|
syn_nstrs = list(lvg_normalize(syn) or [])
|
|
158
168
|
nstrs.extend(syn_nstrs)
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
169
|
+
# Deduplicate while preserving order, then convert to tuple
|
|
170
|
+
q_to_nstrs[s] = tuple(dict.fromkeys(nstrs))
|
|
171
|
+
|
|
172
|
+
# Check cache for each input
|
|
173
|
+
cached_hits: dict[str, list[dict[str, Any]]] = {}
|
|
174
|
+
uncached_queries: list[str] = []
|
|
175
|
+
uncached_q_to_nstrs: dict[str, tuple[str, ...]] = {}
|
|
176
|
+
|
|
177
|
+
for q, nstrs in q_to_nstrs.items():
|
|
178
|
+
if not nstrs:
|
|
179
|
+
# No normalized strings, empty result
|
|
180
|
+
cached_hits[q] = []
|
|
181
|
+
continue
|
|
182
|
+
|
|
183
|
+
if self._cache is not None:
|
|
184
|
+
cache_key = NormalizerCache.make_key(
|
|
185
|
+
nstrs,
|
|
186
|
+
top_k=top_k,
|
|
187
|
+
prefer_ttys=prefer_ttys,
|
|
188
|
+
filter_sources=filter_sources,
|
|
189
|
+
exclude_sources=exclude_sources,
|
|
190
|
+
allow_partial=allow_partial,
|
|
191
|
+
min_coverage=min_coverage,
|
|
192
|
+
min_word_hits=min_word_hits,
|
|
193
|
+
coverage_weight=coverage_weight,
|
|
194
|
+
)
|
|
195
|
+
cached = self._cache.get(cache_key)
|
|
196
|
+
if cached is not None:
|
|
197
|
+
cached_hits[q] = cached
|
|
198
|
+
continue
|
|
199
|
+
|
|
200
|
+
uncached_queries.append(q)
|
|
201
|
+
uncached_q_to_nstrs[q] = nstrs
|
|
202
|
+
|
|
203
|
+
# Query DB for uncached entries
|
|
204
|
+
if uncached_q_to_nstrs:
|
|
205
|
+
# Convert tuples back to lists for _lookup
|
|
206
|
+
uncached_q_to_nstrs_list: dict[str, list[str]] = {
|
|
207
|
+
q: list(nstrs) for q, nstrs in uncached_q_to_nstrs.items()
|
|
208
|
+
}
|
|
209
|
+
|
|
210
|
+
fresh_result = await self._lookup(
|
|
211
|
+
q_to_nstrs=uncached_q_to_nstrs_list,
|
|
212
|
+
all_queries=uncached_queries,
|
|
213
|
+
prefer_ttys=prefer_ttys,
|
|
214
|
+
filter_sources=filter_sources,
|
|
215
|
+
exclude_sources=exclude_sources,
|
|
216
|
+
top_k=top_k,
|
|
217
|
+
allow_partial=allow_partial,
|
|
218
|
+
min_coverage=min_coverage,
|
|
219
|
+
min_word_hits=min_word_hits,
|
|
220
|
+
coverage_weight=coverage_weight,
|
|
221
|
+
)
|
|
173
222
|
|
|
174
|
-
|
|
175
|
-
|
|
223
|
+
# Enrich fresh results
|
|
224
|
+
fresh_result = await self._enrich_hits_with_concept_info(fresh_result, prefer_ttys)
|
|
225
|
+
|
|
226
|
+
# Cache fresh results and add to cached_hits
|
|
227
|
+
for row in fresh_result.iter_rows(named=True):
|
|
228
|
+
q = row["input_string"]
|
|
229
|
+
hits = row["hits"] or []
|
|
230
|
+
cached_hits[q] = hits
|
|
231
|
+
|
|
232
|
+
if self._cache is not None:
|
|
233
|
+
nstrs = uncached_q_to_nstrs[q]
|
|
234
|
+
cache_key = NormalizerCache.make_key(
|
|
235
|
+
nstrs,
|
|
236
|
+
top_k=top_k,
|
|
237
|
+
prefer_ttys=prefer_ttys,
|
|
238
|
+
filter_sources=filter_sources,
|
|
239
|
+
exclude_sources=exclude_sources,
|
|
240
|
+
allow_partial=allow_partial,
|
|
241
|
+
min_coverage=min_coverage,
|
|
242
|
+
min_word_hits=min_word_hits,
|
|
243
|
+
coverage_weight=coverage_weight,
|
|
244
|
+
)
|
|
245
|
+
self._cache.set(cache_key, hits)
|
|
246
|
+
|
|
247
|
+
# Build final result in original order
|
|
248
|
+
result_data = [{"input_string": s, "hits": cached_hits.get(s, [])} for s in strings]
|
|
249
|
+
result = pl.DataFrame(result_data).cast({"hits": pl.List(HIT_STRUCT_TYPE)})
|
|
176
250
|
|
|
177
251
|
# Add synonyms column if synonyms were provided
|
|
178
252
|
if synonyms:
|
|
@@ -901,6 +975,23 @@ WHERE concept_id != :concept_id
|
|
|
901
975
|
|
|
902
976
|
return [r["concept_id"] for r in rows]
|
|
903
977
|
|
|
978
|
+
def cache_stats(self) -> dict[str, Any] | None:
|
|
979
|
+
"""
|
|
980
|
+
Get cache statistics.
|
|
981
|
+
|
|
982
|
+
Returns:
|
|
983
|
+
Dict with size, maxsize, hits, misses, and hit_rate,
|
|
984
|
+
or None if caching is disabled.
|
|
985
|
+
"""
|
|
986
|
+
if self._cache is None:
|
|
987
|
+
return None
|
|
988
|
+
return self._cache.stats()
|
|
989
|
+
|
|
990
|
+
def clear_cache(self) -> None:
|
|
991
|
+
"""Clear all cached entries."""
|
|
992
|
+
if self._cache is not None:
|
|
993
|
+
self._cache.clear()
|
|
994
|
+
|
|
904
995
|
async def close(self) -> None:
|
|
905
996
|
"""
|
|
906
997
|
Close the engine and any owned resources.
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|