PyPI - norm_toolkit - Versions diffs - 1.6.0__tar.gz → 1.7.0__tar.gz - Mend

norm_toolkit 1.6.0__tar.gz → 1.7.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

{norm_toolkit-1.6.0 → norm_toolkit-1.7.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: norm_toolkit
-Version: 1.6.0
+Version: 1.7.0
 Summary: Toolkit to normalize text to UMLS / ontologies
 Author: Haydn Jones
 Author-email: Haydn Jones <haydnjonest@gmail.com>

{norm_toolkit-1.6.0 → norm_toolkit-1.7.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "norm_toolkit"
-version = "1.6.0"
+version = "1.7.0"
 description = "Toolkit to normalize text to UMLS / ontologies"
 readme = "README.md"
 authors = [{ name = "Haydn Jones", email = "haydnjonest@gmail.com" }]
@@ -24,6 +24,8 @@ dev = [
     "pytest>=8.3",
     "rdkit>=2025.9.3",
     "ruff>=0.6.9",
+    "fire>=0.7.1",
+    "joblib>=1.5.3",
 ]
 [build-system]

{norm_toolkit-1.6.0 → norm_toolkit-1.7.0}/src/norm_toolkit/normalizer.py RENAMED Viewed

@@ -33,7 +33,7 @@ from norm_toolkit.normalizer_utils import (
     build_definitions_sql,
     build_hits_agg_expr,
     build_lookup_sql,
-    build_normalized_string_map,
+    build_normalized_query_map,
     build_ontology_filter_clauses,
     build_pref_join,
     build_query_rows,
@@ -206,7 +206,7 @@ class DuckDBNormalizer:
     def normalize(
         self,
         strings: Sequence[str],
-        synonyms: Mapping[str, Sequence[str]] | None = None,
+        synonyms: Sequence[Sequence[str] | None] | None = None,
         top_k: int | None = 25,
         ont_top_k: int | None = None,
         prefer_ttys: list[str] | None = None,
@@ -222,10 +222,10 @@ class DuckDBNormalizer:
         Args:
             strings: Input strings to normalize
-            synonyms: Optional mapping of input strings to their synonyms.
-                Synonyms are normalized and used alongside the main string
-                to improve matching. Results are still keyed by the original
-                input string.
+            synonyms: Optional list of synonym lists aligned with `strings`
+                (same length required). Synonyms are normalized and used
+                alongside the main string to improve matching. Results are
+                still keyed by the original input string.
             top_k: Maximum number of results per query (mutually exclusive with ont_top_k)
             ont_top_k: Maximum number of results per ontology (mutually exclusive with top_k)
             prefer_ttys: Term types to prefer (e.g., ["PT", "MH"])
@@ -244,12 +244,15 @@ class DuckDBNormalizer:
         if prefer_ttys is None:
             prefer_ttys = DEFAULT_PREFER_TTYS
-        # Build normalized string map
-        q_to_nstrs = build_normalized_string_map(strings, synonyms)
+        strings_list = list(strings)
+        query_keys = [f"q{i}" for i in range(len(strings_list))] if synonyms is not None else strings_list
+        # Build normalized string map with per-entry keys
+        q_to_nstrs, syn_list = build_normalized_query_map(strings_list, synonyms, query_keys=query_keys)
         result = self._lookup(
             q_to_nstrs=q_to_nstrs,
-            all_queries=list(strings),
+            all_queries=query_keys,
             prefer_ttys=prefer_ttys,
             filter_ontologies=filter_ontologies,
             exclude_ontologies=exclude_ontologies,
@@ -261,9 +264,11 @@ class DuckDBNormalizer:
             coverage_weight=coverage_weight,
         )
+        result = result.with_columns(pl.Series("input_string", strings_list))
         # Add synonyms column if synonyms were provided
-        if synonyms:
-            syn_list = [list(synonyms.get(s, [])) for s in strings]
+        if synonyms is not None:
+            syn_list = syn_list if syn_list is not None else [[] for _ in strings_list]
             result = result.with_columns(pl.Series("synonyms", syn_list))
         return result

{norm_toolkit-1.6.0 → norm_toolkit-1.7.0}/src/norm_toolkit/normalizer_cache.py RENAMED Viewed

@@ -1,5 +1,5 @@
 """
-LRU cache for normalized string lookup results.
+LRU caches for normalized string lookups and entity expansion results.
 Caches at the normalized string level to avoid repeated DB round trips
 for the same normalized forms.
@@ -10,7 +10,10 @@ from __future__ import annotations
 import hashlib
 from collections import OrderedDict
 from dataclasses import dataclass
-from typing import Any
+from typing import Any, Generic, TypeVar
+K = TypeVar("K")
+V = TypeVar("V")
 @dataclass(frozen=True)
@@ -29,12 +32,21 @@ class CacheKey:
     coverage_weight: int
-class NormalizerCache:
+@dataclass(frozen=True)
+class ExpansionCacheKey:
+    """Immutable cache key for entity expansion results."""
+    concept_id: str
+    max_depth: int | None
+    filter_ontologies: tuple[str, ...] | None
+    max_ids: int | None
+class LRUCache(Generic[K, V]):
     """
-    LRU cache for normalized string lookup results.
+    LRU cache with basic hit/miss statistics.
-    Caches the fully enriched hits for a given tuple of normalized strings
-    and query parameters. Uses an OrderedDict for O(1) LRU eviction.
+    Uses an OrderedDict for O(1) LRU eviction.
     """
     def __init__(self, maxsize: int = 10000) -> None:
@@ -45,11 +57,86 @@ class NormalizerCache:
             maxsize: Maximum number of entries to cache. When exceeded,
                 the least recently used entries are evicted.
         """
-        self._cache: OrderedDict[CacheKey, list[dict[str, Any]]] = OrderedDict()
+        self._cache: OrderedDict[K, V] = OrderedDict()
         self._maxsize = maxsize
         self._hits = 0
         self._misses = 0
+    def get(self, key: K) -> V | None:
+        """
+        Get cached value for a key.
+        Args:
+            key: Cache key to look up
+        Returns:
+            Cached value if found, None if not in cache
+        """
+        if key in self._cache:
+            # Move to end (most recently used)
+            self._cache.move_to_end(key)
+            self._hits += 1
+            return self._cache[key]
+        self._misses += 1
+        return None
+    def set(self, key: K, value: V) -> None:
+        """
+        Store a value in the cache.
+        Args:
+            key: Cache key
+            value: Value to cache
+        """
+        if key in self._cache:
+            self._cache.move_to_end(key)
+        else:
+            if len(self._cache) >= self._maxsize:
+                # Remove oldest item (LRU eviction)
+                self._cache.popitem(last=False)
+        self._cache[key] = value
+    def clear(self) -> None:
+        """Clear all cached entries."""
+        self._cache.clear()
+        self._hits = 0
+        self._misses = 0
+    @property
+    def size(self) -> int:
+        """Current number of cached entries."""
+        return len(self._cache)
+    @property
+    def hit_rate(self) -> float:
+        """Cache hit rate (0.0 to 1.0)."""
+        total = self._hits + self._misses
+        return self._hits / total if total > 0 else 0.0
+    def stats(self) -> dict[str, Any]:
+        """
+        Get cache statistics.
+        Returns:
+            Dict with size, maxsize, hits, misses, and hit_rate
+        """
+        return {
+            "size": self.size,
+            "maxsize": self._maxsize,
+            "hits": self._hits,
+            "misses": self._misses,
+            "hit_rate": self.hit_rate,
+        }
+class NormalizerCache(LRUCache[CacheKey, list[dict[str, Any]]]):
+    """
+    LRU cache for normalized string lookup results.
+    Caches the fully enriched hits for a given tuple of normalized strings
+    and query parameters.
+    """
     @staticmethod
     def make_key(
         nstrs: tuple[str, ...],
@@ -100,68 +187,37 @@ class NormalizerCache:
             coverage_weight=coverage_weight,
         )
-    def get(self, key: CacheKey) -> list[dict[str, Any]] | None:
-        """
-        Get cached hits for a key.
-        Args:
-            key: Cache key to look up
+class ExpansionCache(LRUCache[ExpansionCacheKey, list[str]]):
+    """
+    LRU cache for entity expansion results.
-        Returns:
-            Cached hits list if found, None if not in cache
-        """
-        if key in self._cache:
-            # Move to end (most recently used)
-            self._cache.move_to_end(key)
-            self._hits += 1
-            return self._cache[key]
-        self._misses += 1
-        return None
+    Caches expanded concept IDs for a given concept and traversal parameters.
+    """
-    def set(self, key: CacheKey, hits: list[dict[str, Any]]) -> None:
+    @staticmethod
+    def make_key(
+        concept_id: str,
+        *,
+        max_depth: int | None,
+        filter_ontologies: list[str] | None,
+        max_ids: int | None,
+    ) -> ExpansionCacheKey:
         """
-        Store hits in the cache.
+        Create a cache key from entity expansion parameters.
         Args:
-            key: Cache key
-            hits: List of hit dictionaries to cache
-        """
-        if key in self._cache:
-            self._cache.move_to_end(key)
-        else:
-            if len(self._cache) >= self._maxsize:
-                # Remove oldest item (LRU eviction)
-                self._cache.popitem(last=False)
-        self._cache[key] = hits
-    def clear(self) -> None:
-        """Clear all cached entries."""
-        self._cache.clear()
-        self._hits = 0
-        self._misses = 0
-    @property
-    def size(self) -> int:
-        """Current number of cached entries."""
-        return len(self._cache)
-    @property
-    def hit_rate(self) -> float:
-        """Cache hit rate (0.0 to 1.0)."""
-        total = self._hits + self._misses
-        return self._hits / total if total > 0 else 0.0
-    def stats(self) -> dict[str, Any]:
-        """
-        Get cache statistics.
+            concept_id: Starting concept ID
+            max_depth: Maximum depth to traverse
+            filter_ontologies: Ontologies to include
+            max_ids: Maximum number of IDs to return
         Returns:
-            Dict with size, maxsize, hits, misses, and hit_rate
+            Immutable ExpansionCacheKey instance
         """
-        return {
-            "size": self.size,
-            "maxsize": self._maxsize,
-            "hits": self._hits,
-            "misses": self._misses,
-            "hit_rate": self.hit_rate,
-        }
+        return ExpansionCacheKey(
+            concept_id=concept_id,
+            max_depth=max_depth,
+            filter_ontologies=tuple(filter_ontologies) if filter_ontologies else None,
+            max_ids=max_ids,
+        )

{norm_toolkit-1.6.0 → norm_toolkit-1.7.0}/src/norm_toolkit/normalizer_postgres.py RENAMED Viewed

@@ -25,7 +25,7 @@ from norm_toolkit.constants import (
     TYPES_TABLE,
 )
 from norm_toolkit.models import ConceptInfo
-from norm_toolkit.normalizer_cache import NormalizerCache
+from norm_toolkit.normalizer_cache import ExpansionCache, NormalizerCache
 from norm_toolkit.normalizer_utils import (
     apply_concept_name_rows,
     apply_definition_rows,
@@ -34,7 +34,7 @@ from norm_toolkit.normalizer_utils import (
     build_definitions_sql,
     build_hits_agg_expr,
     build_lookup_sql,
-    build_normalized_string_map,
+    build_normalized_query_map,
     build_ontology_filter_clauses,
     build_pref_join,
     build_query_rows,
@@ -94,8 +94,8 @@ class PostgresNormalizer:
             schema: PostgreSQL schema where tables are located (default: "public")
             owned_resource: Optional resource with async close() method to clean up
                 when this normalizer is closed (e.g., AlloyDB AsyncConnector)
-            cache_maxsize: Maximum number of entries in the normalized string cache
-            enable_cache: Whether to enable caching of normalized string lookups
+            cache_maxsize: Maximum number of entries in each cache
+            enable_cache: Whether to enable caching for normalization and expansion
         Note:
             After creating the normalizer, call `await normalizer.initialize()`
@@ -110,8 +110,9 @@ class PostgresNormalizer:
         self._has_stt = False
         self._initialized = False
-        # Initialize cache
+        # Initialize caches
         self._cache: NormalizerCache | None = NormalizerCache(maxsize=cache_maxsize) if enable_cache else None
+        self._expansion_cache: ExpansionCache | None = ExpansionCache(maxsize=cache_maxsize) if enable_cache else None
         # Build qualified table names
         prefix = f"{schema}." if schema else ""
@@ -158,7 +159,7 @@ class PostgresNormalizer:
     async def normalize(
         self,
         strings: Sequence[str],
-        synonyms: Mapping[str, Sequence[str]] | None = None,
+        synonyms: Sequence[Sequence[str] | None] | None = None,
         top_k: int | None = 25,
         ont_top_k: int | None = None,
         prefer_ttys: list[str] | None = None,
@@ -174,10 +175,10 @@ class PostgresNormalizer:
         Args:
             strings: Input strings to normalize
-            synonyms: Optional mapping of input strings to their synonyms.
-                Synonyms are normalized and used alongside the main string
-                to improve matching. Results are still keyed by the original
-                input string.
+            synonyms: Optional list of synonym lists aligned with `strings`
+                (same length required). Synonyms are normalized and used
+                alongside the main string to improve matching. Results are
+                still keyed by the original input string.
             top_k: Maximum number of results per query (mutually exclusive with ont_top_k)
             ont_top_k: Maximum number of results per ontology (mutually exclusive with top_k)
             prefer_ttys: Term types to prefer (e.g., ["PT", "MH"])
@@ -204,6 +205,9 @@ class PostgresNormalizer:
         if ont_top_k is not None:
             ont_top_k = max(1, int(ont_top_k))
+        strings_list = list(strings)
+        query_keys = [f"q{i}" for i in range(len(strings_list))] if synonyms is not None else strings_list
         def make_cache_key(nstrs: tuple[str, ...]) -> Any:
             return NormalizerCache.make_key(
                 nstrs,
@@ -218,8 +222,8 @@ class PostgresNormalizer:
                 coverage_weight=coverage_weight,
             )
-        # Build normalized string map (use tuple for hashable cache keys)
-        q_to_nstrs = build_normalized_string_map(strings, synonyms)
+        # Build normalized string map with per-entry keys (tuples for cache keys)
+        q_to_nstrs, syn_list = build_normalized_query_map(strings_list, synonyms, query_keys=query_keys)
         # Check cache for each input
         cached_hits: dict[str, list[dict[str, Any]]] = {}
@@ -271,12 +275,15 @@ class PostgresNormalizer:
                     self._cache.set(cache_key, hits)
         # Build final result in original order
-        result_data = [{"input_string": s, "hits": cached_hits.get(s, [])} for s in strings]
+        result_data = [
+            {"input_string": strings_list[i], "hits": cached_hits.get(query_keys[i], [])}
+            for i in range(len(strings_list))
+        ]
         result = pl.DataFrame(result_data).cast({"hits": pl.List(HIT_STRUCT_TYPE)})
         # Add synonyms column if synonyms were provided
-        if synonyms:
-            syn_list = [list(synonyms.get(s, [])) for s in strings]
+        if synonyms is not None:
+            syn_list = syn_list if syn_list is not None else [[] for _ in strings_list]
             result = result.with_columns(pl.Series("synonyms", syn_list))
         return result
@@ -319,8 +326,7 @@ class PostgresNormalizer:
         qwords_values = sql_params.add_rows(qword_rows) if qword_rows else ""
         allq_values = ", ".join(
-            f"({sql_params.add(q)}, {sql_params.add_cast(i, 'INTEGER')})"
-            for i, q in enumerate(all_queries)
+            f"({sql_params.add(q)}, {sql_params.add_cast(i, 'INTEGER')})" for i, q in enumerate(all_queries)
         )
         # Build preference clauses (parameterized to prevent SQL injection)
@@ -601,6 +607,18 @@ class PostgresNormalizer:
         if not self._has_edges:
             return []
+        cache_key = None
+        if self._expansion_cache is not None:
+            cache_key = ExpansionCache.make_key(
+                concept_id,
+                max_depth=max_depth,
+                filter_ontologies=filter_ontologies,
+                max_ids=max_ids,
+            )
+            cached = self._expansion_cache.get(cache_key)
+            if cached is not None:
+                return cached
         params: dict[str, Any] = {"concept_id": concept_id, "max_depth": max_depth}
         # Build ontology filter clause
@@ -646,11 +664,14 @@ class PostgresNormalizer:
         rows = await self._fetch_rows(query, params)
-        return [r["concept_id"] for r in rows]
+        result = [r["concept_id"] for r in rows]
+        if self._expansion_cache is not None and cache_key is not None:
+            self._expansion_cache.set(cache_key, result)
+        return result
     def cache_stats(self) -> dict[str, Any] | None:
         """
-        Get cache statistics.
+        Get normalization cache statistics.
         Returns:
             Dict with size, maxsize, hits, misses, and hit_rate,
@@ -660,10 +681,24 @@ class PostgresNormalizer:
             return None
         return self._cache.stats()
+    def expansion_cache_stats(self) -> dict[str, Any] | None:
+        """
+        Get entity expansion cache statistics.
+        Returns:
+            Dict with size, maxsize, hits, misses, and hit_rate,
+            or None if caching is disabled.
+        """
+        if self._expansion_cache is None:
+            return None
+        return self._expansion_cache.stats()
     def clear_cache(self) -> None:
         """Clear all cached entries."""
         if self._cache is not None:
             self._cache.clear()
+        if self._expansion_cache is not None:
+            self._expansion_cache.clear()
     async def close(self) -> None:
         """

{norm_toolkit-1.6.0 → norm_toolkit-1.7.0}/src/norm_toolkit/normalizer_utils.py RENAMED Viewed

@@ -21,24 +21,81 @@ from norm_toolkit.constants import (
 from norm_toolkit.models import ConceptInfo, SemanticType
+def _coerce_synonyms_list(
+    strings: Sequence[str],
+    synonyms: Sequence[Sequence[str] | None] | None,
+) -> list[list[str]] | None:
+    if synonyms is None:
+        return None
+    if not isinstance(synonyms, Sequence) or isinstance(synonyms, (str, bytes)):
+        raise TypeError("synonyms must be a sequence of sequences aligned with strings")
+    if len(synonyms) != len(strings):
+        raise ValueError("synonyms must have the same length as strings")
+    out: list[list[str]] = []
+    for i, syns in enumerate(synonyms):
+        if syns is None:
+            out.append([])
+            continue
+        if not isinstance(syns, Sequence) or isinstance(syns, (str, bytes)):
+            raise ValueError(f"synonyms[{i}] must be a sequence of strings")
+        out.append(list(syns))
+    return out
 def build_normalized_string_map(
     strings: Sequence[str],
-    synonyms: Mapping[str, Sequence[str]] | None = None,
+    synonyms: Sequence[Sequence[str] | None] | None = None,
 ) -> dict[str, tuple[str, ...]]:
     """
     Build a mapping of input string -> normalized string variants.
     Normalized variants are deduplicated while preserving order.
+    Duplicate input strings will collapse to the last entry.
+    Synonyms must be aligned with `strings` when provided.
     """
+    synonyms_list = _coerce_synonyms_list(strings, synonyms)
+    syns_iter = synonyms_list if synonyms_list is not None else [None] * len(strings)
     q_to_nstrs: dict[str, tuple[str, ...]] = {}
-    for s in strings:
+    for s, syns in zip(strings, syns_iter):
         nstrs = list(lvg_normalize(s) or [])
-        for syn in (synonyms or {}).get(s, []):
-            nstrs.extend(lvg_normalize(syn) or [])
+        if syns:
+            for syn in syns:
+                nstrs.extend(lvg_normalize(syn) or [])
         q_to_nstrs[s] = tuple(dict.fromkeys(nstrs))
     return q_to_nstrs
+def build_normalized_query_map(
+    strings: Sequence[str],
+    synonyms: Sequence[Sequence[str] | None] | None = None,
+    *,
+    query_keys: Sequence[str] | None = None,
+) -> tuple[dict[str, tuple[str, ...]], list[list[str]] | None]:
+    """
+    Build a mapping of query key -> normalized string variants.
+    Normalized variants are deduplicated while preserving order.
+    Synonyms must be aligned with `strings` when provided.
+    """
+    if query_keys is None:
+        query_keys = list(strings)
+    if len(query_keys) != len(strings):
+        raise ValueError("query_keys must have the same length as strings")
+    synonyms_list = _coerce_synonyms_list(strings, synonyms)
+    syns_iter = synonyms_list if synonyms_list is not None else [None] * len(strings)
+    q_to_nstrs: dict[str, tuple[str, ...]] = {}
+    for key, s, syns in zip(query_keys, strings, syns_iter):
+        nstrs = list(lvg_normalize(s) or [])
+        if syns:
+            for syn in syns:
+                nstrs.extend(lvg_normalize(syn) or [])
+        q_to_nstrs[key] = tuple(dict.fromkeys(nstrs))
+    return q_to_nstrs, synonyms_list
 def build_query_rows(
     q_to_nstrs: Mapping[str, Sequence[str]],
     *,

{norm_toolkit-1.6.0 → norm_toolkit-1.7.0}/README.md RENAMED Viewed

File without changes

{norm_toolkit-1.6.0 → norm_toolkit-1.7.0}/src/norm_toolkit/__init__.py RENAMED Viewed

File without changes

{norm_toolkit-1.6.0 → norm_toolkit-1.7.0}/src/norm_toolkit/build_merged.py RENAMED Viewed

File without changes

{norm_toolkit-1.6.0 → norm_toolkit-1.7.0}/src/norm_toolkit/build_ontology.py RENAMED Viewed

File without changes

{norm_toolkit-1.6.0 → norm_toolkit-1.7.0}/src/norm_toolkit/build_umls.py RENAMED Viewed

File without changes

{norm_toolkit-1.6.0 → norm_toolkit-1.7.0}/src/norm_toolkit/constants.py RENAMED Viewed

File without changes

{norm_toolkit-1.6.0 → norm_toolkit-1.7.0}/src/norm_toolkit/models.py RENAMED Viewed

File without changes

{norm_toolkit-1.6.0 → norm_toolkit-1.7.0}/src/norm_toolkit/utils.py RENAMED Viewed

File without changes

norm_toolkit 1.6.0__tar.gz → 1.7.0__tar.gz

norm_toolkit 1.6.0__tar.gz → 1.7.0__tar.gz