PyPI - norm_toolkit - Versions diffs - 1.2.0__tar.gz → 1.3.0__tar.gz - Mend

norm_toolkit 1.2.0.tar.gz → 1.3.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

{norm_toolkit-1.2.0 → norm_toolkit-1.3.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: norm_toolkit
-Version: 1.2.0
+Version: 1.3.0
 Summary: Toolkit to normalize text to UMLS / ontologies
 Author: Haydn Jones
 Author-email: Haydn Jones <haydnjonest@gmail.com>

{norm_toolkit-1.2.0 → norm_toolkit-1.3.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "norm_toolkit"
-version = "1.2.0"
+version = "1.3.0"
 description = "Toolkit to normalize text to UMLS / ontologies"
 readme = "README.md"
 authors = [{ name = "Haydn Jones", email = "haydnjonest@gmail.com" }]

{norm_toolkit-1.2.0 → norm_toolkit-1.3.0}/src/norm_toolkit/constants.py RENAMED Viewed

@@ -38,6 +38,9 @@ HIT_STRUCT_TYPE = pl.Struct(
         "score": pl.Int64,
         "total_score": pl.Int64,
         "match_type": pl.Utf8,
+        "pref_name": pl.Utf8,
+        "description": pl.Utf8,
+        "synonyms": pl.List(pl.Utf8),
     }
 )

{norm_toolkit-1.2.0 → norm_toolkit-1.3.0}/src/norm_toolkit/normalizer_postgres.py RENAMED Viewed

@@ -171,10 +171,13 @@ class PostgresNormalizer:
             coverage_weight=coverage_weight,
         )
+        # Enrich hits with concept info (pref_name, description, synonyms)
+        result = await self._enrich_hits_with_concept_info(result, prefer_ttys)
         # Add synonyms column if synonyms were provided
         if synonyms:
             syn_list = [list(synonyms.get(s, [])) for s in strings]
-            result = result.with_columns(pl.Series("synonyms", syn_list))
+            result = result.with_columns(pl.Series("input_synonyms", syn_list))
         return result
@@ -476,6 +479,58 @@ LEFT JOIN agg ON agg.Q = aq.Q;
         return pl.DataFrame(data).cast({"hits": pl.List(HIT_STRUCT_TYPE)})
+    async def _enrich_hits_with_concept_info(
+        self,
+        result: pl.DataFrame,
+        prefer_ttys: list[str] | None,
+    ) -> pl.DataFrame:
+        """Enrich hits with pref_name, description, and synonyms from concept_info.
+
+        Args:
+            result: Frame with an ``input_string`` column and a ``hits`` column
+                holding, per row, a list of hit dicts (or null).
+            prefer_ttys: Forwarded unchanged to ``concept_info``.
+
+        Returns:
+            A new frame containing only ``input_string`` and ``hits``, with
+            ``hits`` cast to ``pl.List(HIT_STRUCT_TYPE)``. Every hit carries
+            ``pref_name``, ``description`` and ``synonyms``; hits whose
+            ``global_identifier`` is missing or not returned by ``concept_info``
+            get ``None``, ``None`` and ``[]`` respectively.
+        """
+        # Collect the distinct, non-empty concept ids referenced by any hit.
+        # Null/empty identifiers are skipped: they can never match a lookup
+        # result below, so there is no point sending them to concept_info.
+        all_concept_ids: set[str] = {
+            hit["global_identifier"]
+            for hits in result["hits"].to_list()
+            for hit in hits or []
+            if hit and hit.get("global_identifier")
+        }
+        # Only query when there is something to look up; with an empty mapping
+        # every hit falls through to the "no info" defaults below.
+        concept_infos = (
+            await self.concept_info(list(all_concept_ids), prefer_ttys=prefer_ttys)
+            if all_concept_ids
+            else {}
+        )
+        # Rebuild each row's hits with the three extra fields filled in.
+        enriched_data = []
+        for row in result.iter_rows(named=True):
+            enriched_hits = []
+            for hit in row["hits"] or []:
+                enriched_hit = dict(hit)
+                cid = hit.get("global_identifier")
+                if cid and cid in concept_infos:
+                    info = concept_infos[cid]
+                    enriched_hit["pref_name"] = info.preferred_name
+                    enriched_hit["description"] = info.description
+                    enriched_hit["synonyms"] = info.synonyms or []
+                else:
+                    enriched_hit["pref_name"] = None
+                    enriched_hit["description"] = None
+                    enriched_hit["synonyms"] = []
+                enriched_hits.append(enriched_hit)
+            enriched_data.append({"input_string": row["input_string"], "hits": enriched_hits})
+        return pl.DataFrame(enriched_data).cast({"hits": pl.List(HIT_STRUCT_TYPE)})
     async def concept_info(
         self,
         concept_ids: Sequence[str],