PyPI - norm_toolkit - Versions diffs - 1.4.0.tar.gz → 1.6.0.tar.gz - Mend

norm_toolkit 1.4.0.tar.gz → 1.6.0.tar.gz

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (16) hide show

{norm_toolkit-1.4.0 → norm_toolkit-1.6.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: norm_toolkit
-Version: 1.4.0
+Version: 1.6.0
 Summary: Toolkit to normalize text to UMLS / ontologies
 Author: Haydn Jones
 Author-email: Haydn Jones <haydnjonest@gmail.com>

{norm_toolkit-1.4.0 → norm_toolkit-1.6.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "norm_toolkit"
-version = "1.4.0"
+version = "1.6.0"
 description = "Toolkit to normalize text to UMLS / ontologies"
 readme = "README.md"
 authors = [{ name = "Haydn Jones", email = "haydnjonest@gmail.com" }]

{norm_toolkit-1.4.0 → norm_toolkit-1.6.0}/src/norm_toolkit/build_merged.py RENAMED Viewed

@@ -2,15 +2,16 @@
 Merged database builder for unified normalizer.
 Builds a single DuckDB database containing both UMLS and ontology data,
-allowing simultaneous normalization across all sources.
+allowing simultaneous normalization across all ontologies.
 Tables created:
 - ns: Normalized string index (nstr -> concept_id, name_id)
-- nw: Normalized word index (nwd -> concept_id, string_id, source)
+- nw: Normalized word index (nwd -> concept_id, string_id, source, ontology)
 - atoms: All atoms with unified schema
 - concepts: Concept metadata
 - types: Semantic types (UMLS only)
 - defs: Definitions from all sources
+- edges: Hierarchy edges from all sources
 """
 from __future__ import annotations
@@ -49,7 +50,7 @@ def build_merged_duckdb(
         db_path: Output DuckDB database path
         meta_dir: Directory containing UMLS META RRF files (optional)
         ontology_dfs: List of Polars DataFrames with ontology data (optional)
-        edges_df: Hierarchy edges for ontologies (parent_id, child_id, source columns)
+        edges_df: Hierarchy edges for ontologies (parent_id, child_id, source/ontology columns)
         filter_concepts_df: Optional DataFrame with 'global_identifier' column to filter
             which concepts to include. Only concepts matching these IDs will be included
             (applies to both UMLS CUIs and ontology global_identifiers).
@@ -69,14 +70,16 @@ def build_merged_duckdb(
         - pref_name: str - Preferred name
         - synonyms: list[str] - Display synonyms
         - description: str | None - Definition
-        - source: str - Source ontology name
+        - source: str - Source ontology name (used to populate ontology)
+          (or provide an ontology column directly)
         - pref_name_norm: str - Normalized preferred name
         - synonyms_norm: list[str] - Normalized synonyms
     Edges DataFrame columns (if edges_df provided):
         - parent_id: str - Parent concept ID (broader term)
         - child_id: str - Child concept ID (narrower term)
-        - source: str - Source ontology name
+        - source: str - Source ontology name (used to populate ontology)
+          (or provide an ontology column directly)
     """
     if not meta_dir and not ontology_dfs:
         raise ValueError("At least one of meta_dir or ontology_dfs must be provided")
@@ -133,6 +136,9 @@ def build_merged_duckdb(
         )
         print(f"  Loaded {len(onto_atoms):,} ontology atoms")
+    if edges_df is not None:
+        edges_df = _normalize_edges_df(edges_df)
     # ==========================================================================
     # Merge and write tables
     # ==========================================================================
@@ -207,7 +213,7 @@ def build_merged_duckdb(
     # NW index - partial match lookup
     con.execute("CREATE INDEX IF NOT EXISTS idx_nw_nwd ON nw(nwd);")
-    con.execute("CREATE INDEX IF NOT EXISTS idx_nw_nwd_source ON nw(nwd, source);")
+    con.execute("CREATE INDEX IF NOT EXISTS idx_nw_nwd_ontology ON nw(nwd, ontology);")
     # Atoms - join acceleration
     con.execute("CREATE INDEX IF NOT EXISTS idx_atoms_concept_name ON atoms(concept_id, name_id);")
@@ -227,7 +233,7 @@ def build_merged_duckdb(
     if len(merged_edges) > 0:
         # Edges (hierarchy traversal)
         con.execute("CREATE INDEX IF NOT EXISTS idx_edges_parent ON edges(parent_id);")
-        con.execute("CREATE INDEX IF NOT EXISTS idx_edges_parent_source ON edges(parent_id, source);")
+        con.execute("CREATE INDEX IF NOT EXISTS idx_edges_parent_ontology ON edges(parent_id, ontology);")
     # ==========================================================================
     # Finalize
@@ -251,6 +257,21 @@ def _write_table(con: duckdb.DuckDBPyConnection, df: pl.DataFrame, table_name: s
         con.unregister("_tmp")
+def _normalize_edges_df(edges_df: pl.DataFrame) -> pl.DataFrame:
+    """Ensure edges_df has source/ontology columns aligned with merged schema."""
+    if "ontology" not in edges_df.columns:
+        if "source" not in edges_df.columns:
+            raise ValueError("edges_df must include a 'source' or 'ontology' column")
+        edges_df = edges_df.with_columns(pl.col("source").alias("ontology"))
+    edges_df = edges_df.with_columns(
+        pl.lit(None).cast(pl.Utf8).alias("source"),
+        pl.col("ontology").cast(pl.Utf8),
+    )
+    return edges_df.select("parent_id", "child_id", "source", "ontology")
 def _load_umls_data(
     con: duckdb.DuckDBPyConnection,
     meta_dir: str,
@@ -321,7 +342,7 @@ def _load_umls_data(
         cui_filter_clause_ns = " AND CUI IN (SELECT CUI FROM _cui_filter)"
         cui_filter_clause_nw = " AND nw.CUI IN (SELECT CUI FROM _cui_filter)"
         cui_filter_clause_sty = " WHERE CUI IN (SELECT CUI FROM _cui_filter)"
-        cui_filter_clause_def = " AND CUI IN (SELECT CUI FROM _cui_filter)"
+        cui_filter_clause_def = " AND d.CUI IN (SELECT CUI FROM _cui_filter)"
     # Build enriched atoms (English, non-suppressed, with pre-joined rank)
     # Normalize UMLS ranks to 0-10 scale to be comparable with ontology ranks (1-3)
@@ -334,6 +355,7 @@ def _load_umls_data(
             mc.CUI AS identifier,
             mc.STR AS str,
             mc.SAB AS source,
+            'UMLS' AS ontology,
             mc.TTY AS name_type,
             mc.ISPREF AS ispref,
             mc.STT AS stt,
@@ -355,14 +377,15 @@ def _load_umls_data(
         WHERE 1=1{cui_filter_clause_ns}
     """).pl()
-    # Build NW index (word -> concept, string, source)
+    # Build NW index (word -> concept, string, source, ontology)
     # Note: UMLS mrxnw_eng doesn't have source, so we join to get it
     nw_df = con.execute(f"""
         SELECT DISTINCT
             nw.NWD AS nwd,
             nw.CUI AS concept_id,
             nw.SUI AS string_id,
-            mc.SAB AS source
+            mc.SAB AS source,
+            'UMLS' AS ontology
         FROM _mrxnw_eng nw
         JOIN _mrconso mc ON mc.CUI = nw.CUI AND mc.SUI = nw.SUI
         WHERE mc.LAT = 'ENG' AND mc.SUPPRESS = 'N'{cui_filter_clause_nw}
@@ -376,6 +399,7 @@ def _load_umls_data(
             mc.CUI AS concept_id,
             mc.CUI AS identifier,
             NULL::VARCHAR AS source,
+            'UMLS' AS ontology,
             NULL::VARCHAR AS pref_name,
             NULL::VARCHAR AS description
         FROM _mrconso mc
@@ -392,15 +416,18 @@ def _load_umls_data(
         FROM _mrsty{cui_filter_clause_sty}
     """).pl()
-    # Build definitions
+    # Build definitions (English-only via MRCONSO language)
     defs_df = con.execute(f"""
         SELECT
-            CUI AS concept_id,
-            SAB AS source,
-            DEF AS def_text
-        FROM _mrdef
-        WHERE COALESCE(SUPPRESS, 'N') = 'N'
-          AND DEF IS NOT NULL AND DEF <> ''{cui_filter_clause_def}
+            d.CUI AS concept_id,
+            d.SAB AS source,
+            'UMLS' AS ontology,
+            d.DEF AS def_text
+        FROM _mrdef d
+        JOIN _mrconso mc ON mc.AUI = d.AUI
+        WHERE mc.LAT = 'ENG'
+          AND COALESCE(d.SUPPRESS, 'N') = 'N'
+          AND d.DEF IS NOT NULL AND d.DEF <> ''{cui_filter_clause_def}
     """).pl()
     # Build edges from MRREL (hierarchy relationships)
@@ -410,13 +437,14 @@ def _load_umls_data(
     edges_df: pl.DataFrame | None = None
     if has_mrrel:
         edges_df = con.execute("""
-            SELECT DISTINCT parent_id, child_id, source
+            SELECT DISTINCT parent_id, child_id, source, ontology
             FROM (
                 -- CHD (child) and RN (narrower): CUI1 -> CUI2
                 SELECT
                     CUI1 AS parent_id,
                     CUI2 AS child_id,
-                    SAB AS source
+                    SAB AS source,
+                    'UMLS' AS ontology
                 FROM _mrrel
                 WHERE REL IN ('CHD', 'RN')
                   AND COALESCE(SUPPRESS, 'N') = 'N'
@@ -427,7 +455,8 @@ def _load_umls_data(
                 SELECT
                     CUI2 AS parent_id,
                     CUI1 AS child_id,
-                    SAB AS source
+                    SAB AS source,
+                    'UMLS' AS ontology
                 FROM _mrrel
                 WHERE REL IN ('PAR', 'RB')
                   AND COALESCE(SUPPRESS, 'N') = 'N'
@@ -468,6 +497,16 @@ def _load_ontology_data(
     if filter_ids is not None:
         combined = combined.filter(pl.col("global_identifier").is_in(filter_ids))
+    if "ontology" not in combined.columns:
+        if "source" not in combined.columns:
+            raise ValueError("ontology_dfs must include a 'source' or 'ontology' column")
+        combined = combined.with_columns(pl.col("source").alias("ontology"))
+    combined = combined.with_columns(
+        pl.lit(None).cast(pl.Utf8).alias("source"),
+        pl.col("ontology").cast(pl.Utf8),
+    )
     # Normalize columns
     combined = combined.with_columns(
         pl.col("synonyms").cast(pl.List(pl.Utf8)).fill_null([]),
@@ -479,9 +518,10 @@ def _load_ontology_data(
         pl.col("global_identifier").alias("concept_id"),
         "identifier",
         "source",
+        "ontology",
         "pref_name",
         "description",
-    ).unique(subset=["concept_id", "source"])
+    ).unique(subset=["concept_id", "ontology"])
     # Build definitions from descriptions
     defs_df = (
@@ -489,9 +529,10 @@ def _load_ontology_data(
         .select(
             pl.col("global_identifier").alias("concept_id"),
             "source",
+            "ontology",
             pl.col("description").alias("def_text"),
         )
-        .unique(subset=["concept_id", "source"])
+        .unique(subset=["concept_id", "ontology"])
     )
     # Build atoms: preferred names
@@ -499,6 +540,7 @@ def _load_ontology_data(
         pl.col("global_identifier").alias("concept_id"),
         "identifier",
         "source",
+        "ontology",
         pl.lit("pref").alias("name_type"),
         pl.lit("Y").alias("ispref"),
         pl.lit(None).cast(pl.Utf8).alias("stt"),  # NULL for ontology
@@ -515,6 +557,7 @@ def _load_ontology_data(
             pl.col("global_identifier").alias("concept_id"),
             "identifier",
             "source",
+            "ontology",
             pl.lit("syn").alias("name_type"),
             pl.lit("N").alias("ispref"),
             pl.lit(None).cast(pl.Utf8).alias("stt"),
@@ -527,7 +570,7 @@ def _load_ontology_data(
     # Combine and deduplicate atoms
     names_df = pl.concat([pref_df, syn_df], how="vertical").unique(
-        subset=["concept_id", "source", "name_type", "nstr", "str"]
+        subset=["concept_id", "ontology", "name_type", "nstr", "str"]
     )
     # Generate name_id and string_id
@@ -543,7 +586,17 @@ def _load_ontology_data(
     # Reorder columns to match merged schema (drop nstr since it's in ns_df)
     atoms_df = atoms_df.select(
-        "concept_id", "name_id", "string_id", "identifier", "str", "source", "name_type", "ispref", "stt", "rank"
+        "concept_id",
+        "name_id",
+        "string_id",
+        "identifier",
+        "str",
+        "source",
+        "ontology",
+        "name_type",
+        "ispref",
+        "stt",
+        "rank",
     )
     # Build NW index (word-level)
@@ -560,8 +613,9 @@ def _load_ontology_data(
             "concept_id",
             "string_id",
             "source",
+            "ontology",
         )
-        .unique(subset=["nwd", "concept_id", "string_id", "source"])
+        .unique(subset=["nwd", "concept_id", "string_id", "ontology"])
     )
     return atoms_df, ns_df, nw_df, concepts_df, defs_df

{norm_toolkit-1.4.0 → norm_toolkit-1.6.0}/src/norm_toolkit/build_ontology.py RENAMED Viewed

@@ -34,7 +34,7 @@ def build_ontology_duckdb(
         - pref_name: str - Preferred/canonical name
         - synonyms: list[str] - Display synonyms
         - description: str | None - Concept definition
-        - source: str - Source ontology (e.g., "CHEBI")
+        - source: str - Source ontology name (used to populate ontology)
         - pref_name_norm: str - LVG-normalized preferred name
         - synonyms_norm: list[str] - LVG-normalized synonyms

{norm_toolkit-1.4.0 → norm_toolkit-1.6.0}/src/norm_toolkit/constants.py RENAMED Viewed

@@ -34,6 +34,7 @@ HIT_STRUCT_TYPE = pl.Struct(
         "nstr": pl.Utf8,
         "name": pl.Utf8,
         "source": pl.Utf8,
+        "ontology": pl.Utf8,
         "name_type": pl.Utf8,
         "score": pl.Int64,
         "total_score": pl.Int64,
@@ -44,7 +45,7 @@ HIT_STRUCT_TYPE = pl.Struct(
     }
 )
-# Schema for ontology DataFrames (input to build_ontology_duckdb)
+# Schema for ontology DataFrames (input to build_ontology_duckdb); source populates ontology.
 ONTOLOGY_DF_SCHEMA = {
     "global_identifier": pl.Utf8,
     "identifier": pl.Utf8,

{norm_toolkit-1.4.0 → norm_toolkit-1.6.0}/src/norm_toolkit/models.py RENAMED Viewed

@@ -26,11 +26,12 @@ class ConceptInfo(BaseModel):
     concept_id: str  # CUI or global_id
     identifier: str | None  # Source-specific ID (CUI for UMLS, e.g. "15377" for CHEBI)
-    source: str | None  # SAB or source
+    source: str | None  # UMLS SAB (if available)
+    ontology: str | None  # Ontology name (e.g., "UMLS", "CHEBI")
     preferred_name: str | None
     name_type: str | None  # TTY or name_type
     description: str | None
-    def_source: str | None  # Source of definition (UMLS only)
+    def_source: str | None  # UMLS source of definition (SAB, if available)
     synonyms: list[str]
     semantic_types: list[SemanticType]  # Empty for ontology

norm_toolkit 1.4.0.tar.gz → 1.6.0.tar.gz

norm_toolkit 1.4.0.tar.gz → 1.6.0.tar.gz