PyPI - norm_toolkit - Versions diffs - 1.1.0__tar.gz → 1.3.0__tar.gz - Mend

norm_toolkit 1.1.0 (tar.gz) → 1.3.0 (tar.gz)

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (12) [hide | show]

{norm_toolkit-1.1.0 → norm_toolkit-1.3.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: norm_toolkit
-Version: 1.1.0
+Version: 1.3.0
 Summary: Toolkit to normalize text to UMLS / ontologies
 Author: Haydn Jones
 Author-email: Haydn Jones <haydnjonest@gmail.com>

{norm_toolkit-1.1.0 → norm_toolkit-1.3.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "norm_toolkit"
-version = "1.1.0"
+version = "1.3.0"
 description = "Toolkit to normalize text to UMLS / ontologies"
 readme = "README.md"
 authors = [{ name = "Haydn Jones", email = "haydnjonest@gmail.com" }]

{norm_toolkit-1.1.0 → norm_toolkit-1.3.0}/src/norm_toolkit/constants.py RENAMED Viewed

@@ -38,6 +38,9 @@ HIT_STRUCT_TYPE = pl.Struct(
         "score": pl.Int64,
         "total_score": pl.Int64,
         "match_type": pl.Utf8,
+        "pref_name": pl.Utf8,
+        "description": pl.Utf8,
+        "synonyms": pl.List(pl.Utf8),
     }
 )

{norm_toolkit-1.1.0 → norm_toolkit-1.3.0}/src/norm_toolkit/normalizer.py RENAMED Viewed

@@ -324,6 +324,7 @@ LEFT JOIN agg ON agg.Q = aq.Q;
     def normalize(
         self,
         strings: Sequence[str],
+        synonyms: Mapping[str, Sequence[str]] | None = None,
         top_k: int = 25,
         prefer_ttys: list[str] | None = None,
         filter_sources: list[str] | None = None,
@@ -338,6 +339,10 @@ LEFT JOIN agg ON agg.Q = aq.Q;
         Args:
             strings: Input strings to normalize
+            synonyms: Optional mapping of input strings to their synonyms.
+                Synonyms are normalized and used alongside the main string
+                to improve matching. Results are still keyed by the original
+                input string.
             top_k: Maximum number of results per query
             prefer_ttys: Term types to prefer (e.g., ["PT", "MH"])
             filter_sources: Restrict to these sources (include only)
@@ -348,7 +353,8 @@ LEFT JOIN agg ON agg.Q = aq.Q;
             coverage_weight: Weight for coverage in scoring
         Returns:
-            DataFrame with columns: input_string, hits (list of match structs)
+            DataFrame with columns: input_string, hits (list of match structs),
+            and synonyms (list of strings) if synonyms were provided.
         """
         # Apply defaults
         if prefer_ttys is None:
@@ -358,9 +364,14 @@ LEFT JOIN agg ON agg.Q = aq.Q;
         q_to_nstrs: dict[str, list[str]] = {}
         for s in strings:
             nstrs = list(lvg_normalize(s) or [])
+            # Add normalized forms of synonyms
+            if synonyms and s in synonyms:
+                for syn in synonyms[s]:
+                    syn_nstrs = list(lvg_normalize(syn) or [])
+                    nstrs.extend(syn_nstrs)
             q_to_nstrs[s] = nstrs
-        return self._lookup(
+        result = self._lookup(
             q_to_nstrs=q_to_nstrs,
             all_queries=list(strings),
             prefer_ttys=prefer_ttys,
@@ -373,6 +384,13 @@ LEFT JOIN agg ON agg.Q = aq.Q;
             coverage_weight=coverage_weight,
         )
+        # Add synonyms column if synonyms were provided
+        if synonyms:
+            syn_list = [list(synonyms.get(s, [])) for s in strings]
+            result = result.with_columns(pl.Series("synonyms", syn_list))
+        return result
     def concept_info(
         self,
         concept_ids: Sequence[str],

{norm_toolkit-1.1.0 → norm_toolkit-1.3.0}/src/norm_toolkit/normalizer_postgres.py RENAMED Viewed

@@ -110,6 +110,7 @@ class PostgresNormalizer:
     async def normalize(
         self,
         strings: Sequence[str],
+        synonyms: Mapping[str, Sequence[str]] | None = None,
         top_k: int = 25,
         prefer_ttys: list[str] | None = None,
         filter_sources: list[str] | None = None,
@@ -124,6 +125,10 @@ class PostgresNormalizer:
         Args:
             strings: Input strings to normalize
+            synonyms: Optional mapping of input strings to their synonyms.
+                Synonyms are normalized and used alongside the main string
+                to improve matching. Results are still keyed by the original
+                input string.
             top_k: Maximum number of results per query
             prefer_ttys: Term types to prefer (e.g., ["PT", "MH"])
             filter_sources: Restrict to these sources (include only)
@@ -134,7 +139,8 @@ class PostgresNormalizer:
             coverage_weight: Weight for coverage in scoring
         Returns:
-            DataFrame with columns: input_string, hits (list of match structs)
+            DataFrame with columns: input_string, hits (list of match structs),
+            and synonyms (list of strings) if synonyms were provided.
         """
         await self._ensure_initialized()
@@ -145,9 +151,14 @@ class PostgresNormalizer:
         q_to_nstrs: dict[str, list[str]] = {}
         for s in strings:
             nstrs = list(lvg_normalize(s) or [])
+            # Add normalized forms of synonyms
+            if synonyms and s in synonyms:
+                for syn in synonyms[s]:
+                    syn_nstrs = list(lvg_normalize(syn) or [])
+                    nstrs.extend(syn_nstrs)
             q_to_nstrs[s] = nstrs
-        return await self._lookup(
+        result = await self._lookup(
             q_to_nstrs=q_to_nstrs,
             all_queries=list(strings),
             prefer_ttys=prefer_ttys,
@@ -160,6 +171,16 @@ class PostgresNormalizer:
             coverage_weight=coverage_weight,
         )
+        # Enrich hits with concept info (pref_name, description, synonyms)
+        result = await self._enrich_hits_with_concept_info(result, prefer_ttys)
+        # Add synonyms column if synonyms were provided
+        if synonyms:
+            syn_list = [list(synonyms.get(s, [])) for s in strings]
+            result = result.with_columns(pl.Series("input_synonyms", syn_list))
+        return result
     async def _lookup(
         self,
         q_to_nstrs: Mapping[str, Sequence[str]],
@@ -458,6 +479,58 @@ LEFT JOIN agg ON agg.Q = aq.Q;
         return pl.DataFrame(data).cast({"hits": pl.List(HIT_STRUCT_TYPE)})
+    async def _enrich_hits_with_concept_info(
+        self,
+        result: pl.DataFrame,
+        prefer_ttys: list[str] | None,
+    ) -> pl.DataFrame:
+        """Enrich hits with pref_name, description, and synonyms from concept_info."""
+        # Collect all unique concept_ids from hits
+        all_concept_ids: set[str] = set()
+        for hits in result["hits"].to_list():
+            if hits:
+                for hit in hits:
+                    if hit and "global_identifier" in hit:
+                        all_concept_ids.add(hit["global_identifier"])
+        if not all_concept_ids:
+            # No concepts to enrich, just add empty fields
+            enriched_data = []
+            for row in result.iter_rows(named=True):
+                enriched_hits = []
+                for hit in row["hits"] or []:
+                    enriched_hit = dict(hit)
+                    enriched_hit["pref_name"] = None
+                    enriched_hit["description"] = None
+                    enriched_hit["synonyms"] = []
+                    enriched_hits.append(enriched_hit)
+                enriched_data.append({"input_string": row["input_string"], "hits": enriched_hits})
+            return pl.DataFrame(enriched_data).cast({"hits": pl.List(HIT_STRUCT_TYPE)})
+        # Get concept info for all concepts
+        concept_infos = await self.concept_info(list(all_concept_ids), prefer_ttys=prefer_ttys)
+        # Enrich each hit
+        enriched_data = []
+        for row in result.iter_rows(named=True):
+            enriched_hits = []
+            for hit in row["hits"] or []:
+                enriched_hit = dict(hit)
+                cid = hit.get("global_identifier")
+                if cid and cid in concept_infos:
+                    info = concept_infos[cid]
+                    enriched_hit["pref_name"] = info.preferred_name
+                    enriched_hit["description"] = info.description
+                    enriched_hit["synonyms"] = info.synonyms or []
+                else:
+                    enriched_hit["pref_name"] = None
+                    enriched_hit["description"] = None
+                    enriched_hit["synonyms"] = []
+                enriched_hits.append(enriched_hit)
+            enriched_data.append({"input_string": row["input_string"], "hits": enriched_hits})
+        return pl.DataFrame(enriched_data).cast({"hits": pl.List(HIT_STRUCT_TYPE)})
     async def concept_info(
         self,
         concept_ids: Sequence[str],
@@ -804,11 +877,13 @@ ORDER BY t.concept_id, t.type_tree, t.type_id;
         # PostgreSQL recursive CTE with named parameters
         # Use CAST() instead of :: to avoid conflicts with SQLAlchemy named params
+        # UNION (not UNION ALL) deduplicates on (concept_id, depth) during recursion
+        # DISTINCT in output needed since same concept can be reached at different depths
         query = f"""
 WITH RECURSIVE walk(concept_id, depth) AS (
     SELECT CAST(:concept_id AS VARCHAR), 0
-    UNION ALL
+    UNION
     SELECT e.child_id, w.depth + 1
     FROM walk w

{norm_toolkit-1.1.0 → norm_toolkit-1.3.0}/README.md RENAMED Viewed

File without changes

{norm_toolkit-1.1.0 → norm_toolkit-1.3.0}/src/norm_toolkit/__init__.py RENAMED Viewed

File without changes

{norm_toolkit-1.1.0 → norm_toolkit-1.3.0}/src/norm_toolkit/build_merged.py RENAMED Viewed

File without changes

{norm_toolkit-1.1.0 → norm_toolkit-1.3.0}/src/norm_toolkit/build_ontology.py RENAMED Viewed

File without changes

{norm_toolkit-1.1.0 → norm_toolkit-1.3.0}/src/norm_toolkit/build_umls.py RENAMED Viewed

File without changes

{norm_toolkit-1.1.0 → norm_toolkit-1.3.0}/src/norm_toolkit/models.py RENAMED Viewed

File without changes

{norm_toolkit-1.1.0 → norm_toolkit-1.3.0}/src/norm_toolkit/utils.py RENAMED Viewed

File without changes

norm_toolkit 1.1.0__tar.gz → 1.3.0__tar.gz

norm_toolkit 1.1.0 (tar.gz) → 1.3.0 (tar.gz)