PyPI - norm_toolkit - Versions diffs - 1.2.0__tar.gz → 1.4.0__tar.gz - Mend

norm_toolkit 1.2.0 (tar.gz) → 1.4.0 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

{norm_toolkit-1.2.0 → norm_toolkit-1.4.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: norm_toolkit
-Version: 1.2.0
+Version: 1.4.0
 Summary: Toolkit to normalize text to UMLS / ontologies
 Author: Haydn Jones
 Author-email: Haydn Jones <haydnjonest@gmail.com>

{norm_toolkit-1.2.0 → norm_toolkit-1.4.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "norm_toolkit"
-version = "1.2.0"
+version = "1.4.0"
 description = "Toolkit to normalize text to UMLS / ontologies"
 readme = "README.md"
 authors = [{ name = "Haydn Jones", email = "haydnjonest@gmail.com" }]

{norm_toolkit-1.2.0 → norm_toolkit-1.4.0}/src/norm_toolkit/constants.py RENAMED Viewed

@@ -38,6 +38,9 @@ HIT_STRUCT_TYPE = pl.Struct(
         "score": pl.Int64,
         "total_score": pl.Int64,
         "match_type": pl.Utf8,
+        "pref_name": pl.Utf8,
+        "description": pl.Utf8,
+        "synonyms": pl.List(pl.Utf8),
     }
 )

norm_toolkit-1.4.0/src/norm_toolkit/normalizer_cache.py ADDED Viewed

@@ -0,0 +1,163 @@
+"""
+LRU cache for normalized string lookup results.
+Caches at the normalized string level to avoid repeated DB round trips
+for the same normalized forms.
+"""
+from __future__ import annotations
+import hashlib
+from collections import OrderedDict
+from dataclasses import dataclass
+from typing import Any
+@dataclass(frozen=True)
+class CacheKey:
+    """
+    Frozen, hashable bundle identifying one cached lookup.
+    Combines a digest of the normalized strings with every query parameter,
+    so two keys compare equal only when all of these fields match.
+    """
+    # Digest of the normalized strings, joined in sorted order
+    nstrs_hash: str
+    top_k: int
+    # Sequence-valued parameters are held as tuples to keep the key hashable
+    prefer_ttys: tuple[str, ...] | None
+    filter_sources: tuple[str, ...] | None
+    exclude_sources: tuple[str, ...] | None
+    # Scalar matching / scoring parameters
+    allow_partial: bool
+    min_coverage: float
+    min_word_hits: int | None
+    coverage_weight: int
+class NormalizerCache:
+    """
+    LRU cache for normalized string lookup results.
+    Maps a CacheKey (digest of the normalized strings plus the query
+    parameters) to the list of hit dictionaries stored for it. Entries are
+    kept in an OrderedDict ordered from least to most recently used, so
+    eviction removes the entry at the front.
+    """
+    def __init__(self, maxsize: int = 10000) -> None:
+        """
+        Initialize the cache.
+        Args:
+            maxsize: Maximum number of entries to cache. When exceeded,
+                the least recently used entries are evicted. A value of
+                zero or less means nothing is retained: set() is a no-op.
+        """
+        self._cache: OrderedDict[CacheKey, list[dict[str, Any]]] = OrderedDict()
+        self._maxsize = maxsize
+        self._hits = 0
+        self._misses = 0
+    @staticmethod
+    def make_key(
+        nstrs: tuple[str, ...],
+        *,
+        top_k: int,
+        prefer_ttys: list[str] | None,
+        filter_sources: list[str] | None,
+        exclude_sources: list[str] | None,
+        allow_partial: bool,
+        min_coverage: float,
+        min_word_hits: int | None,
+        coverage_weight: int,
+    ) -> CacheKey:
+        """
+        Create a cache key from normalized strings and query parameters.
+        Args:
+            nstrs: Tuple of normalized strings for the query
+            top_k: Maximum number of results
+            prefer_ttys: Preferred term types
+            filter_sources: Include only these sources
+            exclude_sources: Exclude these sources
+            allow_partial: Whether partial matching is enabled
+            min_coverage: Minimum coverage threshold
+            min_word_hits: Minimum word hits required
+            coverage_weight: Weight for coverage in scoring
+        Returns:
+            Immutable CacheKey instance. Empty and None list arguments both
+            map to None in the key.
+        """
+        # Sort before joining so the digest does not depend on input order;
+        # md5 is used only as a compact fingerprint, not for security
+        nstrs_str = "\0".join(sorted(nstrs))
+        nstrs_hash = hashlib.md5(nstrs_str.encode(), usedforsecurity=False).hexdigest()
+        return CacheKey(
+            nstrs_hash=nstrs_hash,
+            top_k=top_k,
+            prefer_ttys=tuple(prefer_ttys) if prefer_ttys else None,
+            filter_sources=tuple(filter_sources) if filter_sources else None,
+            exclude_sources=tuple(exclude_sources) if exclude_sources else None,
+            allow_partial=allow_partial,
+            min_coverage=min_coverage,
+            min_word_hits=min_word_hits,
+            coverage_weight=coverage_weight,
+        )
+    def get(self, key: CacheKey) -> list[dict[str, Any]] | None:
+        """
+        Get cached hits for a key and record a hit or a miss.
+        Args:
+            key: Cache key to look up
+        Returns:
+            Cached hits list if found, None if not in cache. The returned
+            list is the stored object itself, not a copy.
+        """
+        try:
+            # Marks the entry as most recently used; raises KeyError when absent
+            self._cache.move_to_end(key)
+        except KeyError:
+            self._misses += 1
+            return None
+        self._hits += 1
+        return self._cache[key]
+    def set(self, key: CacheKey, hits: list[dict[str, Any]]) -> None:
+        """
+        Store hits in the cache, evicting the least recently used entry if full.
+        Args:
+            key: Cache key
+            hits: List of hit dictionaries to cache
+        """
+        if self._maxsize <= 0:
+            # Nothing may be retained; evicting from an empty OrderedDict
+            # would raise KeyError, so bail out before touching it
+            return
+        if key in self._cache:
+            # Refresh recency of an existing entry before overwriting it
+            self._cache.move_to_end(key)
+        elif len(self._cache) >= self._maxsize:
+            # Remove oldest item (LRU eviction)
+            self._cache.popitem(last=False)
+        self._cache[key] = hits
+    def clear(self) -> None:
+        """Clear all cached entries and reset the hit/miss counters."""
+        self._cache.clear()
+        self._hits = 0
+        self._misses = 0
+    @property
+    def size(self) -> int:
+        """Current number of cached entries."""
+        return len(self._cache)
+    @property
+    def hit_rate(self) -> float:
+        """Cache hit rate (0.0 to 1.0); 0.0 before any lookup was made."""
+        total = self._hits + self._misses
+        return self._hits / total if total > 0 else 0.0
+    def stats(self) -> dict[str, Any]:
+        """
+        Get cache statistics.
+        Returns:
+            Dict with size, maxsize, hits, misses, and hit_rate
+        """
+        return {
+            "size": self.size,
+            "maxsize": self._maxsize,
+            "hits": self._hits,
+            "misses": self._misses,
+            "hit_rate": self.hit_rate,
+        }

{norm_toolkit-1.2.0 → norm_toolkit-1.4.0}/src/norm_toolkit/normalizer_postgres.py RENAMED Viewed

@@ -32,6 +32,7 @@ from norm_toolkit.constants import (
     TYPES_TABLE,
 )
 from norm_toolkit.models import ConceptInfo, SemanticType
+from norm_toolkit.normalizer_cache import NormalizerCache
 class PostgresNormalizer:
@@ -47,6 +48,8 @@ class PostgresNormalizer:
         engine: AsyncEngine,
         schema: str = "public",
         owned_resource: Any | None = None,
+        cache_maxsize: int = 10000,
+        enable_cache: bool = True,
     ) -> None:
         """
         Initialize the normalizer with an SQLAlchemy AsyncEngine.
@@ -56,6 +59,8 @@ class PostgresNormalizer:
             schema: PostgreSQL schema where tables are located (default: "public")
             owned_resource: Optional resource with async close() method to clean up
                 when this normalizer is closed (e.g., AlloyDB AsyncConnector)
+            cache_maxsize: Maximum number of entries in the normalized string cache
+            enable_cache: Whether to enable caching of normalized string lookups
         Note:
             After creating the normalizer, call `await normalizer.initialize()`
@@ -70,6 +75,11 @@ class PostgresNormalizer:
         self._has_stt = False
         self._initialized = False
+        # Initialize cache
+        self._cache: NormalizerCache | None = (
+            NormalizerCache(maxsize=cache_maxsize) if enable_cache else None
+        )
         # Build qualified table names
         prefix = f"{schema}." if schema else ""
         self._ns_table = f"{prefix}{NS_TABLE}"
@@ -147,8 +157,8 @@ class PostgresNormalizer:
         if prefer_ttys is None:
             prefer_ttys = DEFAULT_PREFER_TTYS
-        # Build normalized string map
-        q_to_nstrs: dict[str, list[str]] = {}
+        # Build normalized string map (use tuple for hashable cache keys)
+        q_to_nstrs: dict[str, tuple[str, ...]] = {}
         for s in strings:
             nstrs = list(lvg_normalize(s) or [])
             # Add normalized forms of synonyms
@@ -156,25 +166,92 @@ class PostgresNormalizer:
                 for syn in synonyms[s]:
                     syn_nstrs = list(lvg_normalize(syn) or [])
                     nstrs.extend(syn_nstrs)
-            q_to_nstrs[s] = nstrs
-        result = await self._lookup(
-            q_to_nstrs=q_to_nstrs,
-            all_queries=list(strings),
-            prefer_ttys=prefer_ttys,
-            filter_sources=filter_sources,
-            exclude_sources=exclude_sources,
-            top_k=top_k,
-            allow_partial=allow_partial,
-            min_coverage=min_coverage,
-            min_word_hits=min_word_hits,
-            coverage_weight=coverage_weight,
-        )
+            # Deduplicate while preserving order, then convert to tuple
+            q_to_nstrs[s] = tuple(dict.fromkeys(nstrs))
+        # Check cache for each input
+        cached_hits: dict[str, list[dict[str, Any]]] = {}
+        uncached_queries: list[str] = []
+        uncached_q_to_nstrs: dict[str, tuple[str, ...]] = {}
+        for q, nstrs in q_to_nstrs.items():
+            if not nstrs:
+                # No normalized strings, empty result
+                cached_hits[q] = []
+                continue
+            if self._cache is not None:
+                cache_key = NormalizerCache.make_key(
+                    nstrs,
+                    top_k=top_k,
+                    prefer_ttys=prefer_ttys,
+                    filter_sources=filter_sources,
+                    exclude_sources=exclude_sources,
+                    allow_partial=allow_partial,
+                    min_coverage=min_coverage,
+                    min_word_hits=min_word_hits,
+                    coverage_weight=coverage_weight,
+                )
+                cached = self._cache.get(cache_key)
+                if cached is not None:
+                    cached_hits[q] = cached
+                    continue
+            uncached_queries.append(q)
+            uncached_q_to_nstrs[q] = nstrs
+        # Query DB for uncached entries
+        if uncached_q_to_nstrs:
+            # Convert tuples back to lists for _lookup
+            uncached_q_to_nstrs_list: dict[str, list[str]] = {
+                q: list(nstrs) for q, nstrs in uncached_q_to_nstrs.items()
+            }
+            fresh_result = await self._lookup(
+                q_to_nstrs=uncached_q_to_nstrs_list,
+                all_queries=uncached_queries,
+                prefer_ttys=prefer_ttys,
+                filter_sources=filter_sources,
+                exclude_sources=exclude_sources,
+                top_k=top_k,
+                allow_partial=allow_partial,
+                min_coverage=min_coverage,
+                min_word_hits=min_word_hits,
+                coverage_weight=coverage_weight,
+            )
+            # Enrich fresh results
+            fresh_result = await self._enrich_hits_with_concept_info(fresh_result, prefer_ttys)
+            # Cache fresh results and add to cached_hits
+            for row in fresh_result.iter_rows(named=True):
+                q = row["input_string"]
+                hits = row["hits"] or []
+                cached_hits[q] = hits
+                if self._cache is not None:
+                    nstrs = uncached_q_to_nstrs[q]
+                    cache_key = NormalizerCache.make_key(
+                        nstrs,
+                        top_k=top_k,
+                        prefer_ttys=prefer_ttys,
+                        filter_sources=filter_sources,
+                        exclude_sources=exclude_sources,
+                        allow_partial=allow_partial,
+                        min_coverage=min_coverage,
+                        min_word_hits=min_word_hits,
+                        coverage_weight=coverage_weight,
+                    )
+                    self._cache.set(cache_key, hits)
+        # Build final result in original order
+        result_data = [{"input_string": s, "hits": cached_hits.get(s, [])} for s in strings]
+        result = pl.DataFrame(result_data).cast({"hits": pl.List(HIT_STRUCT_TYPE)})
         # Add synonyms column if synonyms were provided
         if synonyms:
             syn_list = [list(synonyms.get(s, [])) for s in strings]
-            result = result.with_columns(pl.Series("synonyms", syn_list))
+            result = result.with_columns(pl.Series("input_synonyms", syn_list))
         return result
@@ -476,6 +553,58 @@ LEFT JOIN agg ON agg.Q = aq.Q;
         return pl.DataFrame(data).cast({"hits": pl.List(HIT_STRUCT_TYPE)})
+    async def _enrich_hits_with_concept_info(
+        self,
+        result: pl.DataFrame,
+        prefer_ttys: list[str] | None,
+    ) -> pl.DataFrame:
+        """
+        Enrich hits with pref_name, description, and synonyms from concept_info.
+        Args:
+            result: DataFrame with "input_string" and "hits" columns
+            prefer_ttys: Passed through unchanged to concept_info()
+        Returns:
+            New DataFrame with one row per input row, where every hit gains
+            "pref_name", "description" and "synonyms", cast to HIT_STRUCT_TYPE.
+            Hits without a matching concept get None / None / []. Null hits
+            are dropped.
+        """
+        rows = list(result.iter_rows(named=True))
+        # Collect all unique concept ids referenced by any hit
+        all_concept_ids: set[str] = {
+            hit["global_identifier"]
+            for row in rows
+            for hit in row["hits"] or []
+            if hit and "global_identifier" in hit
+        }
+        # Skip the concept_info call when there is nothing to look up; every
+        # hit then falls through to the empty defaults below
+        if all_concept_ids:
+            concept_infos = await self.concept_info(list(all_concept_ids), prefer_ttys=prefer_ttys)
+        else:
+            concept_infos = {}
+        enriched_data = []
+        for row in rows:
+            enriched_hits = []
+            for hit in row["hits"] or []:
+                if hit is None:
+                    # Null hits carry no data; the id scan above ignores them too
+                    continue
+                cid = hit.get("global_identifier")
+                if cid and cid in concept_infos:
+                    info = concept_infos[cid]
+                    extra = {
+                        "pref_name": info.preferred_name,
+                        "description": info.description,
+                        "synonyms": info.synonyms or [],
+                    }
+                else:
+                    extra = {"pref_name": None, "description": None, "synonyms": []}
+                enriched_hits.append({**hit, **extra})
+            enriched_data.append({"input_string": row["input_string"], "hits": enriched_hits})
+        return pl.DataFrame(enriched_data).cast({"hits": pl.List(HIT_STRUCT_TYPE)})
     async def concept_info(
         self,
         concept_ids: Sequence[str],
@@ -846,6 +975,23 @@ WHERE concept_id != :concept_id
         return [r["concept_id"] for r in rows]
+    def cache_stats(self) -> dict[str, Any] | None:
+        """
+        Report statistics of the lookup cache.
+        Returns:
+            The dictionary produced by the cache's stats() method,
+            or None when no cache is attached to this normalizer.
+        """
+        cache = self._cache
+        return cache.stats() if cache is not None else None
+    def clear_cache(self) -> None:
+        """Empty the lookup cache; does nothing when no cache is attached."""
+        cache = self._cache
+        if cache is None:
+            return
+        cache.clear()
     async def close(self) -> None:
         """
         Close the engine and any owned resources.