PyPI - norm_toolkit - Versions diffs - 1.3.0__tar.gz → 1.5.0__tar.gz - Mend

norm_toolkit 1.3.0__tar.gz → 1.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

{norm_toolkit-1.3.0 → norm_toolkit-1.5.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: norm_toolkit
-Version: 1.3.0
+Version: 1.5.0
 Summary: Toolkit to normalize text to UMLS / ontologies
 Author: Haydn Jones
 Author-email: Haydn Jones <haydnjonest@gmail.com>

{norm_toolkit-1.3.0 → norm_toolkit-1.5.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "norm_toolkit"
-version = "1.3.0"
+version = "1.5.0"
 description = "Toolkit to normalize text to UMLS / ontologies"
 readme = "README.md"
 authors = [{ name = "Haydn Jones", email = "haydnjonest@gmail.com" }]

norm_toolkit-1.5.0/src/norm_toolkit/normalizer_cache.py ADDED Viewed

@@ -0,0 +1,163 @@
+"""
+LRU cache for normalized string lookup results.
+Caches at the normalized string level to avoid repeated DB round trips
+for the same normalized forms.
+"""
+from __future__ import annotations
+import hashlib
+from collections import OrderedDict
+from dataclasses import dataclass
+from typing import Any
+@dataclass(frozen=True)
+class CacheKey:
+    """
+    Immutable cache key for normalized string lookup results.
+    Bundles a digest of the query's normalized strings with the lookup
+    parameters listed below, so results computed under different settings
+    are stored under different keys. The dataclass is frozen, so instances
+    are hashable and can be used as dictionary keys.
+    """
+    nstrs_hash: str  # Digest of the sorted normalized strings (built by NormalizerCache.make_key)
+    top_k: int  # Maximum number of results
+    prefer_ttys: tuple[str, ...] | None  # Preferred term types
+    filter_sources: tuple[str, ...] | None  # Include only these sources
+    exclude_sources: tuple[str, ...] | None  # Exclude these sources
+    allow_partial: bool  # Whether partial matching is enabled
+    min_coverage: float  # Minimum coverage threshold
+    min_word_hits: int | None  # Minimum word hits required
+    coverage_weight: int  # Weight for coverage in scoring
+class NormalizerCache:
+    """
+    LRU cache for normalized string lookup results.
+    Caches the fully enriched hits for a given tuple of normalized strings
+    and query parameters. Entries live in an OrderedDict kept in recency
+    order: the least recently used entry is at the front (evicted first)
+    and the most recently used entry is at the back.
+    """
+    def __init__(self, maxsize: int = 10000) -> None:
+        """
+        Initialize the cache.
+        Args:
+            maxsize: Maximum number of entries to cache. When exceeded,
+                the least recently used entries are evicted. A value of
+                zero or less means nothing is stored: set() is a no-op
+                and every get() counts as a miss.
+        """
+        self._cache: OrderedDict[CacheKey, list[dict[str, Any]]] = OrderedDict()
+        self._maxsize = maxsize
+        self._hits = 0
+        self._misses = 0
+    @staticmethod
+    def make_key(
+        nstrs: tuple[str, ...],
+        *,
+        top_k: int,
+        prefer_ttys: list[str] | None,
+        filter_sources: list[str] | None,
+        exclude_sources: list[str] | None,
+        allow_partial: bool,
+        min_coverage: float,
+        min_word_hits: int | None,
+        coverage_weight: int,
+    ) -> CacheKey:
+        """
+        Create a cache key from normalized strings and query parameters.
+        Args:
+            nstrs: Tuple of normalized strings for the query
+            top_k: Maximum number of results
+            prefer_ttys: Preferred term types
+            filter_sources: Include only these sources
+            exclude_sources: Exclude these sources
+            allow_partial: Whether partial matching is enabled
+            min_coverage: Minimum coverage threshold
+            min_word_hits: Minimum word hits required
+            coverage_weight: Weight for coverage in scoring
+        Returns:
+            Immutable CacheKey instance. The list parameters are stored as
+            tuples in the order given; a missing or empty list is stored
+            as None.
+        """
+        # Hash the normalized strings for compact storage. Sorting first makes
+        # the digest independent of the order in which the strings arrive.
+        nstrs_str = "\0".join(sorted(nstrs))
+        # MD5 is used only as a fingerprint here, not for security
+        nstrs_hash = hashlib.md5(nstrs_str.encode(), usedforsecurity=False).hexdigest()
+        return CacheKey(
+            nstrs_hash=nstrs_hash,
+            top_k=top_k,
+            prefer_ttys=tuple(prefer_ttys) if prefer_ttys else None,
+            filter_sources=tuple(filter_sources) if filter_sources else None,
+            exclude_sources=tuple(exclude_sources) if exclude_sources else None,
+            allow_partial=allow_partial,
+            min_coverage=min_coverage,
+            min_word_hits=min_word_hits,
+            coverage_weight=coverage_weight,
+        )
+    def get(self, key: CacheKey) -> list[dict[str, Any]] | None:
+        """
+        Get cached hits for a key.
+        Updates the hit/miss counters and, on a hit, marks the entry as
+        most recently used.
+        Args:
+            key: Cache key to look up
+        Returns:
+            Cached hits list if found, None if not in cache. The stored
+            list object itself is returned (no copy is made), so mutating
+            it changes what later lookups see.
+        """
+        try:
+            hits = self._cache[key]
+        except KeyError:
+            self._misses += 1
+            return None
+        # Move to end (most recently used) so this entry is evicted last
+        self._cache.move_to_end(key)
+        self._hits += 1
+        return hits
+    def set(self, key: CacheKey, hits: list[dict[str, Any]]) -> None:
+        """
+        Store hits in the cache.
+        An existing key is overwritten and becomes the most recently used
+        entry. A new key evicts the least recently used entry first when
+        the cache is already full.
+        Args:
+            key: Cache key
+            hits: List of hit dictionaries to cache
+        """
+        if self._maxsize <= 0:
+            # No capacity: storing is impossible, and evicting from the
+            # empty OrderedDict would raise KeyError.
+            return
+        if key in self._cache:
+            self._cache.move_to_end(key)
+        elif len(self._cache) >= self._maxsize:
+            # Remove oldest item (LRU eviction)
+            self._cache.popitem(last=False)
+        self._cache[key] = hits
+    def clear(self) -> None:
+        """Clear all cached entries and reset the hit/miss counters."""
+        self._cache.clear()
+        self._hits = 0
+        self._misses = 0
+    @property
+    def size(self) -> int:
+        """Current number of cached entries."""
+        return len(self._cache)
+    @property
+    def hit_rate(self) -> float:
+        """Cache hit rate (0.0 to 1.0); 0.0 before any lookup has happened."""
+        total = self._hits + self._misses
+        return self._hits / total if total > 0 else 0.0
+    def stats(self) -> dict[str, Any]:
+        """
+        Get cache statistics.
+        Returns:
+            Dict with size, maxsize, hits, misses, and hit_rate
+        """
+        return {
+            "size": self.size,
+            "maxsize": self._maxsize,
+            "hits": self._hits,
+            "misses": self._misses,
+            "hit_rate": self.hit_rate,
+        }

{norm_toolkit-1.3.0 → norm_toolkit-1.5.0}/src/norm_toolkit/normalizer_postgres.py RENAMED Viewed

@@ -32,6 +32,7 @@ from norm_toolkit.constants import (
     TYPES_TABLE,
 )
 from norm_toolkit.models import ConceptInfo, SemanticType
+from norm_toolkit.normalizer_cache import NormalizerCache
 class PostgresNormalizer:
@@ -47,6 +48,8 @@ class PostgresNormalizer:
         engine: AsyncEngine,
         schema: str = "public",
         owned_resource: Any | None = None,
+        cache_maxsize: int = 10000,
+        enable_cache: bool = True,
     ) -> None:
         """
         Initialize the normalizer with an SQLAlchemy AsyncEngine.
@@ -56,6 +59,8 @@ class PostgresNormalizer:
             schema: PostgreSQL schema where tables are located (default: "public")
             owned_resource: Optional resource with async close() method to clean up
                 when this normalizer is closed (e.g., AlloyDB AsyncConnector)
+            cache_maxsize: Maximum number of entries in the normalized string cache
+            enable_cache: Whether to enable caching of normalized string lookups
         Note:
             After creating the normalizer, call `await normalizer.initialize()`
@@ -70,6 +75,11 @@ class PostgresNormalizer:
         self._has_stt = False
         self._initialized = False
+        # Initialize cache
+        self._cache: NormalizerCache | None = (
+            NormalizerCache(maxsize=cache_maxsize) if enable_cache else None
+        )
         # Build qualified table names
         prefix = f"{schema}." if schema else ""
         self._ns_table = f"{prefix}{NS_TABLE}"
@@ -147,8 +157,8 @@ class PostgresNormalizer:
         if prefer_ttys is None:
             prefer_ttys = DEFAULT_PREFER_TTYS
-        # Build normalized string map
-        q_to_nstrs: dict[str, list[str]] = {}
+        # Build normalized string map (use tuple for hashable cache keys)
+        q_to_nstrs: dict[str, tuple[str, ...]] = {}
         for s in strings:
             nstrs = list(lvg_normalize(s) or [])
             # Add normalized forms of synonyms
@@ -156,23 +166,87 @@ class PostgresNormalizer:
                 for syn in synonyms[s]:
                     syn_nstrs = list(lvg_normalize(syn) or [])
                     nstrs.extend(syn_nstrs)
-            q_to_nstrs[s] = nstrs
-        result = await self._lookup(
-            q_to_nstrs=q_to_nstrs,
-            all_queries=list(strings),
-            prefer_ttys=prefer_ttys,
-            filter_sources=filter_sources,
-            exclude_sources=exclude_sources,
-            top_k=top_k,
-            allow_partial=allow_partial,
-            min_coverage=min_coverage,
-            min_word_hits=min_word_hits,
-            coverage_weight=coverage_weight,
-        )
+            # Deduplicate while preserving order, then convert to tuple
+            q_to_nstrs[s] = tuple(dict.fromkeys(nstrs))
+        # Check cache for each input
+        cached_hits: dict[str, list[dict[str, Any]]] = {}
+        uncached_queries: list[str] = []
+        uncached_q_to_nstrs: dict[str, tuple[str, ...]] = {}
+        for q, nstrs in q_to_nstrs.items():
+            if not nstrs:
+                # No normalized strings, empty result
+                cached_hits[q] = []
+                continue
+            if self._cache is not None:
+                cache_key = NormalizerCache.make_key(
+                    nstrs,
+                    top_k=top_k,
+                    prefer_ttys=prefer_ttys,
+                    filter_sources=filter_sources,
+                    exclude_sources=exclude_sources,
+                    allow_partial=allow_partial,
+                    min_coverage=min_coverage,
+                    min_word_hits=min_word_hits,
+                    coverage_weight=coverage_weight,
+                )
+                cached = self._cache.get(cache_key)
+                if cached is not None:
+                    cached_hits[q] = cached
+                    continue
+            uncached_queries.append(q)
+            uncached_q_to_nstrs[q] = nstrs
+        # Query DB for uncached entries
+        if uncached_q_to_nstrs:
+            # Convert tuples back to lists for _lookup
+            uncached_q_to_nstrs_list: dict[str, list[str]] = {
+                q: list(nstrs) for q, nstrs in uncached_q_to_nstrs.items()
+            }
+            fresh_result = await self._lookup(
+                q_to_nstrs=uncached_q_to_nstrs_list,
+                all_queries=uncached_queries,
+                prefer_ttys=prefer_ttys,
+                filter_sources=filter_sources,
+                exclude_sources=exclude_sources,
+                top_k=top_k,
+                allow_partial=allow_partial,
+                min_coverage=min_coverage,
+                min_word_hits=min_word_hits,
+                coverage_weight=coverage_weight,
+            )
-        # Enrich hits with concept info (pref_name, description, synonyms)
-        result = await self._enrich_hits_with_concept_info(result, prefer_ttys)
+            # Enrich fresh results
+            fresh_result = await self._enrich_hits_with_concept_info(fresh_result, prefer_ttys)
+            # Cache fresh results and add to cached_hits
+            for row in fresh_result.iter_rows(named=True):
+                q = row["input_string"]
+                hits = row["hits"] or []
+                cached_hits[q] = hits
+                if self._cache is not None:
+                    nstrs = uncached_q_to_nstrs[q]
+                    cache_key = NormalizerCache.make_key(
+                        nstrs,
+                        top_k=top_k,
+                        prefer_ttys=prefer_ttys,
+                        filter_sources=filter_sources,
+                        exclude_sources=exclude_sources,
+                        allow_partial=allow_partial,
+                        min_coverage=min_coverage,
+                        min_word_hits=min_word_hits,
+                        coverage_weight=coverage_weight,
+                    )
+                    self._cache.set(cache_key, hits)
+        # Build final result in original order
+        result_data = [{"input_string": s, "hits": cached_hits.get(s, [])} for s in strings]
+        result = pl.DataFrame(result_data).cast({"hits": pl.List(HIT_STRUCT_TYPE)})
         # Add synonyms column if synonyms were provided
         if synonyms:
@@ -843,6 +917,7 @@ ORDER BY t.concept_id, t.type_tree, t.type_id;
         concept_id: str,
         max_depth: int | None = 10,
         filter_sources: list[str] | None = None,
+        max_ids: int | None = None,
     ) -> list[str]:
         """
         Get all narrower (descendant) concept IDs using recursive traversal.
@@ -853,9 +928,11 @@ ORDER BY t.concept_id, t.type_tree, t.type_id;
             concept_id: Starting concept ID (broader term)
             max_depth: Maximum depth to traverse (1 = direct children only, None = all descendants)
             filter_sources: Only follow edges from these sources (e.g., ["SNOMEDCT_US"])
+            max_ids: Maximum number of concept IDs to return (None = no limit)
         Returns:
-            List of descendant concept IDs (excludes the starting concept)
+            List of descendant concept IDs ordered by depth (shallowest first),
+            excludes the starting concept
         """
         await self._ensure_initialized()
@@ -875,10 +952,16 @@ ORDER BY t.concept_id, t.type_tree, t.type_id;
             sources_sql = ", ".join(src_placeholders)
             source_filter = f" AND e.source IN ({sources_sql})"
+        # Build optional LIMIT clause
+        limit_clause = ""
+        if max_ids is not None:
+            params["max_ids"] = max_ids
+            limit_clause = "\nLIMIT :max_ids"
         # PostgreSQL recursive CTE with named parameters
         # Use CAST() instead of :: to avoid conflicts with SQLAlchemy named params
         # UNION (not UNION ALL) deduplicates on (concept_id, depth) during recursion
-        # DISTINCT in output needed since same concept can be reached at different depths
+        # GROUP BY with MIN(depth) gets shortest path depth for each concept
         query = f"""
 WITH RECURSIVE walk(concept_id, depth) AS (
     SELECT CAST(:concept_id AS VARCHAR), 0
@@ -890,9 +973,11 @@ WITH RECURSIVE walk(concept_id, depth) AS (
     JOIN {self._edges_table} e ON e.parent_id = w.concept_id
     WHERE (CAST(:max_depth AS INTEGER) IS NULL OR w.depth < :max_depth){source_filter}
 )
-SELECT DISTINCT concept_id
+SELECT concept_id, MIN(depth) AS min_depth
 FROM walk
 WHERE concept_id != :concept_id
+GROUP BY concept_id
+ORDER BY min_depth, concept_id{limit_clause}
 """
         async with self._engine.connect() as conn:
@@ -901,6 +986,23 @@ WHERE concept_id != :concept_id
         return [r["concept_id"] for r in rows]
+    def cache_stats(self) -> dict[str, Any] | None:
+        """
+        Report statistics for the normalized string cache.
+        Returns:
+            The dict produced by the cache's stats() method (size, maxsize,
+            hits, misses, and hit_rate) when caching is enabled,
+            or None if caching is disabled.
+        """
+        cache = self._cache
+        return cache.stats() if cache is not None else None
+    def clear_cache(self) -> None:
+        """Drop every cached entry; does nothing when caching is disabled."""
+        cache = self._cache
+        if cache is None:
+            return
+        cache.clear()
     async def close(self) -> None:
         """
         Close the engine and any owned resources.

{norm_toolkit-1.3.0 → norm_toolkit-1.5.0}/README.md RENAMED Viewed

File without changes

{norm_toolkit-1.3.0 → norm_toolkit-1.5.0}/src/norm_toolkit/__init__.py RENAMED Viewed

File without changes

{norm_toolkit-1.3.0 → norm_toolkit-1.5.0}/src/norm_toolkit/build_merged.py RENAMED Viewed

File without changes

{norm_toolkit-1.3.0 → norm_toolkit-1.5.0}/src/norm_toolkit/build_ontology.py RENAMED Viewed

File without changes

{norm_toolkit-1.3.0 → norm_toolkit-1.5.0}/src/norm_toolkit/build_umls.py RENAMED Viewed

File without changes

{norm_toolkit-1.3.0 → norm_toolkit-1.5.0}/src/norm_toolkit/constants.py RENAMED Viewed

File without changes

{norm_toolkit-1.3.0 → norm_toolkit-1.5.0}/src/norm_toolkit/models.py RENAMED Viewed

File without changes

{norm_toolkit-1.3.0 → norm_toolkit-1.5.0}/src/norm_toolkit/normalizer.py RENAMED Viewed

File without changes

{norm_toolkit-1.3.0 → norm_toolkit-1.5.0}/src/norm_toolkit/utils.py RENAMED Viewed

File without changes

norm_toolkit 1.3.0__tar.gz → 1.5.0__tar.gz

norm_toolkit 1.3.0__tar.gz → 1.5.0__tar.gz