PyPI - norm_toolkit - Versions diffs - 1.1.0__tar.gz → 1.2.0__tar.gz - Mend

norm_toolkit 1.1.0.tar.gz → 1.2.0.tar.gz

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those package versions as they appear in their respective public registries.

Files changed (12) hide show

{norm_toolkit-1.1.0 → norm_toolkit-1.2.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: norm_toolkit
-Version: 1.1.0
+Version: 1.2.0
 Summary: Toolkit to normalize text to UMLS / ontologies
 Author: Haydn Jones
 Author-email: Haydn Jones <haydnjonest@gmail.com>

{norm_toolkit-1.1.0 → norm_toolkit-1.2.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "norm_toolkit"
-version = "1.1.0"
+version = "1.2.0"
 description = "Toolkit to normalize text to UMLS / ontologies"
 readme = "README.md"
 authors = [{ name = "Haydn Jones", email = "haydnjonest@gmail.com" }]

{norm_toolkit-1.1.0 → norm_toolkit-1.2.0}/src/norm_toolkit/normalizer.py RENAMED Viewed

@@ -324,6 +324,7 @@ LEFT JOIN agg ON agg.Q = aq.Q;
     def normalize(
         self,
         strings: Sequence[str],
+        synonyms: Mapping[str, Sequence[str]] | None = None,
         top_k: int = 25,
         prefer_ttys: list[str] | None = None,
         filter_sources: list[str] | None = None,
@@ -338,6 +339,10 @@ LEFT JOIN agg ON agg.Q = aq.Q;
         Args:
             strings: Input strings to normalize
+            synonyms: Optional mapping of input strings to their synonyms.
+                Synonyms are normalized and used alongside the main string
+                to improve matching. Results are still keyed by the original
+                input string.
             top_k: Maximum number of results per query
             prefer_ttys: Term types to prefer (e.g., ["PT", "MH"])
             filter_sources: Restrict to these sources (include only)
@@ -348,7 +353,8 @@ LEFT JOIN agg ON agg.Q = aq.Q;
             coverage_weight: Weight for coverage in scoring
         Returns:
-            DataFrame with columns: input_string, hits (list of match structs)
+            DataFrame with columns: input_string, hits (list of match structs),
+            and synonyms (list of strings) if synonyms were provided.
         """
         # Apply defaults
         if prefer_ttys is None:
@@ -358,9 +364,14 @@ LEFT JOIN agg ON agg.Q = aq.Q;
         q_to_nstrs: dict[str, list[str]] = {}
         for s in strings:
             nstrs = list(lvg_normalize(s) or [])
+            # Add normalized forms of synonyms
+            if synonyms and s in synonyms:
+                for syn in synonyms[s]:
+                    syn_nstrs = list(lvg_normalize(syn) or [])
+                    nstrs.extend(syn_nstrs)
             q_to_nstrs[s] = nstrs
-        return self._lookup(
+        result = self._lookup(
             q_to_nstrs=q_to_nstrs,
             all_queries=list(strings),
             prefer_ttys=prefer_ttys,
@@ -373,6 +384,13 @@ LEFT JOIN agg ON agg.Q = aq.Q;
             coverage_weight=coverage_weight,
         )
+        # Add synonyms column if synonyms were provided
+        if synonyms:
+            syn_list = [list(synonyms.get(s, [])) for s in strings]
+            result = result.with_columns(pl.Series("synonyms", syn_list))
+        return result
     def concept_info(
         self,
         concept_ids: Sequence[str],

{norm_toolkit-1.1.0 → norm_toolkit-1.2.0}/src/norm_toolkit/normalizer_postgres.py RENAMED Viewed

@@ -110,6 +110,7 @@ class PostgresNormalizer:
     async def normalize(
         self,
         strings: Sequence[str],
+        synonyms: Mapping[str, Sequence[str]] | None = None,
         top_k: int = 25,
         prefer_ttys: list[str] | None = None,
         filter_sources: list[str] | None = None,
@@ -124,6 +125,10 @@ class PostgresNormalizer:
         Args:
             strings: Input strings to normalize
+            synonyms: Optional mapping of input strings to their synonyms.
+                Synonyms are normalized and used alongside the main string
+                to improve matching. Results are still keyed by the original
+                input string.
             top_k: Maximum number of results per query
             prefer_ttys: Term types to prefer (e.g., ["PT", "MH"])
             filter_sources: Restrict to these sources (include only)
@@ -134,7 +139,8 @@ class PostgresNormalizer:
             coverage_weight: Weight for coverage in scoring
         Returns:
-            DataFrame with columns: input_string, hits (list of match structs)
+            DataFrame with columns: input_string, hits (list of match structs),
+            and synonyms (list of strings) if synonyms were provided.
         """
         await self._ensure_initialized()
@@ -145,9 +151,14 @@ class PostgresNormalizer:
         q_to_nstrs: dict[str, list[str]] = {}
         for s in strings:
             nstrs = list(lvg_normalize(s) or [])
+            # Add normalized forms of synonyms
+            if synonyms and s in synonyms:
+                for syn in synonyms[s]:
+                    syn_nstrs = list(lvg_normalize(syn) or [])
+                    nstrs.extend(syn_nstrs)
             q_to_nstrs[s] = nstrs
-        return await self._lookup(
+        result = await self._lookup(
             q_to_nstrs=q_to_nstrs,
             all_queries=list(strings),
             prefer_ttys=prefer_ttys,
@@ -160,6 +171,13 @@ class PostgresNormalizer:
             coverage_weight=coverage_weight,
         )
+        # Add synonyms column if synonyms were provided
+        if synonyms:
+            syn_list = [list(synonyms.get(s, [])) for s in strings]
+            result = result.with_columns(pl.Series("synonyms", syn_list))
+        return result
     async def _lookup(
         self,
         q_to_nstrs: Mapping[str, Sequence[str]],
@@ -804,11 +822,13 @@ ORDER BY t.concept_id, t.type_tree, t.type_id;
         # PostgreSQL recursive CTE with named parameters
         # Use CAST() instead of :: to avoid conflicts with SQLAlchemy named params
+        # UNION (not UNION ALL) deduplicates on (concept_id, depth) during recursion
+        # DISTINCT in output needed since same concept can be reached at different depths
         query = f"""
 WITH RECURSIVE walk(concept_id, depth) AS (
     SELECT CAST(:concept_id AS VARCHAR), 0
-    UNION ALL
+    UNION
     SELECT e.child_id, w.depth + 1
     FROM walk w

{norm_toolkit-1.1.0 → norm_toolkit-1.2.0}/README.md RENAMED Viewed

File without changes

{norm_toolkit-1.1.0 → norm_toolkit-1.2.0}/src/norm_toolkit/__init__.py RENAMED Viewed

File without changes

{norm_toolkit-1.1.0 → norm_toolkit-1.2.0}/src/norm_toolkit/build_merged.py RENAMED Viewed

File without changes

{norm_toolkit-1.1.0 → norm_toolkit-1.2.0}/src/norm_toolkit/build_ontology.py RENAMED Viewed

File without changes

{norm_toolkit-1.1.0 → norm_toolkit-1.2.0}/src/norm_toolkit/build_umls.py RENAMED Viewed

File without changes

{norm_toolkit-1.1.0 → norm_toolkit-1.2.0}/src/norm_toolkit/constants.py RENAMED Viewed

File without changes

{norm_toolkit-1.1.0 → norm_toolkit-1.2.0}/src/norm_toolkit/models.py RENAMED Viewed

File without changes

{norm_toolkit-1.1.0 → norm_toolkit-1.2.0}/src/norm_toolkit/utils.py RENAMED Viewed

File without changes

norm_toolkit 1.1.0__tar.gz → 1.2.0__tar.gz

norm_toolkit 1.1.0.tar.gz → 1.2.0.tar.gz