PyPI - norm_toolkit - Versions diffs - 1.9.0__tar.gz → 1.9.2__tar.gz - Mend

norm_toolkit 1.9.0 (tar.gz) → 1.9.2 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) [hide | show]

{norm_toolkit-1.9.0 → norm_toolkit-1.9.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: norm_toolkit
-Version: 1.9.0
+Version: 1.9.2
 Summary: Toolkit to normalize text to UMLS / ontologies
 Author: Haydn Jones
 Author-email: Haydn Jones <haydnjonest@gmail.com>

{norm_toolkit-1.9.0 → norm_toolkit-1.9.2}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "norm_toolkit"
-version = "1.9.0"
+version = "1.9.2"
 description = "Toolkit to normalize text to UMLS / ontologies"
 readme = "README.md"
 authors = [{ name = "Haydn Jones", email = "haydnjonest@gmail.com" }]

{norm_toolkit-1.9.0 → norm_toolkit-1.9.2}/src/norm_toolkit/v2/clickhouse_backend.py RENAMED Viewed

@@ -38,6 +38,7 @@ from .normalizer_utils import (
     build_lookup_hit_columns,
     build_lookup_scored_cte,
     build_narrower_concepts_sql,
+    build_narrower_identifiers_map_sql,
     build_ontology_filter_clauses,
     build_query_rows,
     build_semantic_types_sql,
@@ -542,6 +543,29 @@ class ClickHouseNormalizer(BaseNormalizer):
         )
         return [row[0] for row in _query_rows(self.client, sql)]
+    def _narrower_identifier_map(
+        self,
+        concept_ids: Sequence[int],
+        max_depth: int | None,
+        filter_ontologies: list[str] | None,
+    ) -> pl.DataFrame:
+        schema = pl.Schema({"root_id": pl.UInt64, "gid": pl.UInt64, "identifier": pl.Utf8})
+        unique_ids = list(dict.fromkeys(concept_ids))
+        if not unique_ids:
+            return pl.DataFrame(schema=schema)
+        roots_source_sql = _values_table_sql((("concept_id", "UInt64"),), [(gid,) for gid in unique_ids])
+        sql = build_narrower_identifiers_map_sql(
+            edges_table=self._table_ref(EDGES_TABLE),
+            concepts_table=self._table_ref(CONCEPTS_TABLE),
+            roots_source_sql=roots_source_sql,
+            max_depth=max_depth,
+            ontology_in_list=_values_as_in_list(filter_ontologies) if filter_ontologies else None,
+        )
+        rows = _query_rows(self.client, sql)
+        if not rows:
+            return pl.DataFrame(schema=schema)
+        return pl.DataFrame(rows, schema=list(schema.keys()), orient="row").cast(schema)
     def close(self) -> None:
         self.client.close()

{norm_toolkit-1.9.0 → norm_toolkit-1.9.2}/src/norm_toolkit/v2/duckdb_backend.py RENAMED Viewed

@@ -28,6 +28,7 @@ from .normalizer_utils import (
     build_lookup_hit_columns,
     build_lookup_scored_cte,
     build_narrower_concepts_sql,
+    build_narrower_identifiers_map_sql,
     build_ontology_filter_clauses,
     build_query_rows,
     build_semantic_types_sql,
@@ -262,6 +263,28 @@ class DuckDBNormalizer(BaseNormalizer):
         )
         return [row[0] for row in self.con.execute(sql).fetchall()]
+    def _narrower_identifier_map(
+        self,
+        concept_ids: Sequence[int],
+        max_depth: int | None,
+        filter_ontologies: list[str] | None,
+    ) -> pl.DataFrame:
+        schema = pl.Schema({"root_id": CONCEPT_ID_DTYPE, "gid": CONCEPT_ID_DTYPE, "identifier": pl.Utf8})
+        if not concept_ids:
+            return pl.DataFrame(schema=schema)
+        roots_df = pl.DataFrame(
+            {"concept_id": list(dict.fromkeys(concept_ids))}, schema={"concept_id": CONCEPT_ID_DTYPE}
+        )
+        sql = build_narrower_identifiers_map_sql(
+            edges_table=EDGES_TABLE,
+            concepts_table=CONCEPTS_TABLE,
+            roots_source_sql="SELECT concept_id FROM idroots",
+            max_depth=max_depth,
+            ontology_in_list=self._values_as_in_list(filter_ontologies) if filter_ontologies else None,
+        )
+        with self._registered_arrow_table("idroots", roots_df):
+            return self.con.execute(sql).pl().cast(schema)
     def close(self) -> None:
         """Close the database connection."""
         self.con.close()

{norm_toolkit-1.9.0 → norm_toolkit-1.9.2}/src/norm_toolkit/v2/normalizer_base.py RENAMED Viewed

@@ -16,7 +16,7 @@ from .normalizer_utils import (
     build_normalized_query_map,
     canonicalize_semantic_type_ids,
 )
-from .schema import CONCEPT_ID_DTYPE, HIT_STRUCT_TYPE, ConceptInfo, SemanticType
+from .schema import CONCEPT_ID_DTYPE, HIT_STRUCT_TYPE, NARROWER_STRUCT_TYPE, ConceptInfo, SemanticType
 _HIT_FIELD_NAMES = tuple(HIT_STRUCT_TYPE.to_schema())
@@ -63,6 +63,20 @@ class BaseNormalizer(ABC):
     ) -> Iterable[int]:
         """Walk the hierarchy edges and return descendant concept IDs (excluding the root)."""
+    @abstractmethod
+    def _narrower_identifier_map(
+        self,
+        concept_ids: Sequence[int],
+        max_depth: int | None,
+        filter_ontologies: list[str] | None,
+    ) -> pl.DataFrame:
+        """Walk the edges from many roots at once, resolving identifiers in-query.
+        Returns a frame with `root_id`, `gid`, and `identifier` columns mapping
+        each root gid to its descendants' gids and source-local identifiers
+        (excluding the root).
+        """
     @abstractmethod
     def _concept_id_scope(self, concept_ids: Sequence[int]) -> AbstractContextManager[None]:
         """Create a scoped idmap table/view for concept metadata lookups."""
@@ -117,6 +131,9 @@ class BaseNormalizer(ABC):
         filter_tui_descendants_of: list[str] | None = None,
         allow_partial: bool = True,
         enrich_hits: bool = True,
+        expand_narrower: bool = False,
+        expand_max_depth: int | None = None,
+        expand_filter_ontologies: list[str] | None = None,
     ) -> pl.DataFrame:
         """
         Normalize input strings to ranked concepts.
@@ -138,6 +155,15 @@ class BaseNormalizer(ABC):
             enrich_hits: Populate pref_name, description, and synonyms for each hit.
                 Disable this for faster bulk ID/rank normalization; metadata fields
                 remain null in the returned hit structs.
+            expand_narrower: For each hit, walk the edges hierarchy and populate
+                the hit's `narrower` field with the source-local identifiers of
+                its descendant concepts (across all ontologies). Left null when
+                disabled or when the database has no edges.
+            expand_max_depth: Maximum hierarchy depth to expand (1 = direct
+                children only, None = all descendants). Only used when
+                expand_narrower is True.
+            expand_filter_ontologies: Only follow edges from these ontologies when
+                expanding. None follows every ontology's edges.
         Returns:
             DataFrame with columns: input_string, hits (list of match structs),
@@ -168,6 +194,9 @@ class BaseNormalizer(ABC):
         if enrich_hits:
             result = self._enrich_hits_with_concept_info(result)
+        if expand_narrower and self._has_edges:
+            result = self._expand_hits_with_narrower(result, expand_max_depth, expand_filter_ontologies)
         result = result.with_columns(pl.Series("input_string", strings_list))
         if synonyms is not None:
             result = result.with_columns(pl.Series("synonyms", syn_list, dtype=pl.List(pl.Utf8)))
@@ -241,6 +270,68 @@ class BaseNormalizer(ABC):
             return info_df.cast(schema)
+    def _expand_hits_with_narrower(
+        self,
+        result: pl.DataFrame,
+        max_depth: int | None,
+        filter_ontologies: list[str] | None,
+    ) -> pl.DataFrame:
+        """Populate each hit's `narrower` field with descendant identifiers."""
+        if result.is_empty():
+            return result
+        base = result.with_row_index("__row_idx")
+        hit_rows = (
+            base.explode("hits")
+            .unnest("hits")
+            .filter(pl.col("gid").is_not_null())
+            .with_columns(pl.int_range(pl.len()).over("__row_idx").alias("__hit_idx"))
+        )
+        if hit_rows.is_empty():
+            return result
+        root_ids = hit_rows.get_column("gid").unique(maintain_order=True).to_list()
+        narrower_df = self._narrower_identifiers_frame(root_ids, max_depth, filter_ontologies)
+        expanded_hit_rows = (
+            hit_rows.drop("narrower")
+            .join(narrower_df, on="gid", how="left")
+            .with_columns(pl.col("narrower").fill_null([]).cast(pl.List(NARROWER_STRUCT_TYPE)))
+            .sort(["__row_idx", "__hit_idx"])
+        )
+        expanded_hits = expanded_hit_rows.group_by("__row_idx", maintain_order=True).agg(
+            pl.struct(_HIT_FIELD_NAMES).alias("hits")
+        )
+        return (
+            base.select("__row_idx", "input_string")
+            .join(expanded_hits, on="__row_idx", how="left")
+            .sort("__row_idx")
+            .drop("__row_idx")
+            .with_columns(pl.col("hits").fill_null([]).cast(pl.List(HIT_STRUCT_TYPE)))
+        )
+    def _narrower_identifiers_frame(
+        self,
+        root_ids: Sequence[int],
+        max_depth: int | None,
+        filter_ontologies: list[str] | None,
+    ) -> pl.DataFrame:
+        """Return (gid, narrower) rows mapping each root gid to descendant concepts."""
+        empty = pl.DataFrame(schema={"gid": CONCEPT_ID_DTYPE, "narrower": pl.List(NARROWER_STRUCT_TYPE)})
+        if not root_ids:
+            return empty
+        map_df = self._narrower_identifier_map(list(root_ids), max_depth, filter_ontologies)
+        if map_df.is_empty():
+            return empty
+        return (
+            map_df.sort("root_id", "identifier")
+            .group_by("root_id", maintain_order=True)
+            .agg(pl.struct("gid", "identifier").alias("narrower"))
+            .rename({"root_id": "gid"})
+            .with_columns(pl.col("gid").cast(CONCEPT_ID_DTYPE))
+        )
     def _resolve_identifiers(self, identifiers: Sequence[str]) -> dict[str, int]:
         """
         Map source-local identifiers to internal gids via the concepts table.

{norm_toolkit-1.9.0 → norm_toolkit-1.9.2}/src/norm_toolkit/v2/normalizer_utils.py RENAMED Viewed

@@ -544,3 +544,46 @@ def build_narrower_concepts_sql(
         WHERE concept_id != {root_literal}
         """
     ).strip()
+def build_narrower_identifiers_map_sql(
+    *,
+    edges_table: str,
+    concepts_table: str,
+    roots_source_sql: str,
+    max_depth: int | None,
+    ontology_in_list: str | None,
+) -> str:
+    """
+    Build a recursive descendant walk seeded from many roots at once, resolving
+    descendants to their source-local identifiers in the same query.
+    `roots_source_sql` must be a relation yielding a `concept_id` column for the
+    starting concepts. The result has (root_id, gid, identifier) rows mapping each
+    root gid to its descendants' gids and identifiers, excluding the root itself.
+    """
+    recurse_filters: list[str] = []
+    if max_depth is not None:
+        recurse_filters.append(f"walk.depth < {int(max_depth)}")
+    if ontology_in_list:
+        recurse_filters.append(f"e.ontology IN ({ontology_in_list})")
+    recurse_where = f"WHERE {' AND '.join(recurse_filters)}" if recurse_filters else ""
+    return dedent(
+        f"""
+        WITH RECURSIVE walk AS (
+            SELECT concept_id AS root_id, concept_id, 0 AS depth
+            FROM ({roots_source_sql}) AS roots
+            UNION ALL
+            SELECT walk.root_id, e.child_id AS concept_id, walk.depth + 1 AS depth
+            FROM walk
+            JOIN {edges_table} AS e
+                ON e.parent_id = walk.concept_id
+            {recurse_where}
+        )
+        SELECT DISTINCT walk.root_id, walk.concept_id AS gid, c.identifier
+        FROM walk
+        JOIN {concepts_table} AS c
+            ON c.concept_id = walk.concept_id
+        WHERE walk.concept_id != walk.root_id
+        """
+    ).strip()

{norm_toolkit-1.9.0 → norm_toolkit-1.9.2}/src/norm_toolkit/v2/schema.py RENAMED Viewed

@@ -174,9 +174,20 @@ ONTOLOGY_PREF_STT = "PF"
 DEFAULT_ONTOLOGY_PREF_RANK = 3
 DEFAULT_ONTOLOGY_SYNONYM_RANK = 1
+# Struct for a single expanded descendant concept, carrying both the internal
+# dense gid and the source-local identifier.
+NARROWER_STRUCT_TYPE = pl.Struct(
+    {
+        "gid": CONCEPT_ID_DTYPE,
+        "identifier": pl.Utf8,
+    }
+)
 # Polars struct type for normalized hits. The first nine fields are produced by
 # the lookup query (see LOOKUP_HIT_FIELDS in normalizer_utils); pref_name,
-# description, and synonyms are filled afterwards by hit enrichment.
+# description, and synonyms are filled afterwards by hit enrichment. `narrower`
+# is filled only when normalize(expand_narrower=True) walks the edges hierarchy;
+# it stays null otherwise.
 HIT_STRUCT_TYPE = pl.Struct(
     {
         "gid": CONCEPT_ID_DTYPE,
@@ -191,6 +202,7 @@ HIT_STRUCT_TYPE = pl.Struct(
         "pref_name": pl.Utf8,
         "description": pl.Utf8,
         "synonyms": pl.List(pl.Utf8),
+        "narrower": pl.List(NARROWER_STRUCT_TYPE),
     }
 )