PyPI - norm_toolkit - Versions diffs - 1.7.0__tar.gz → 1.9.0__tar.gz - Mend

norm_toolkit 1.7.0__tar.gz → 1.9.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

norm_toolkit-1.9.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,53 @@
+Metadata-Version: 2.3
+Name: norm_toolkit
+Version: 1.9.0
+Summary: Toolkit to normalize text to UMLS / ontologies
+Author: Haydn Jones
+Author-email: Haydn Jones <haydnjonest@gmail.com>
+Requires-Dist: asyncpg>=0.29.0
+Requires-Dist: clickhouse-connect>=1.0.0
+Requires-Dist: duckdb>=1.5.0
+Requires-Dist: lvg-norm>=1.3.0
+Requires-Dist: polars[rt64]>=1.36.1
+Requires-Dist: pyarrow>=20.0.0
+Requires-Dist: pydantic>=2.12.5
+Requires-Dist: python-dotenv>=1.2.2
+Requires-Dist: sqlalchemy>=2.0.0
+Requires-Dist: tqdm>=4.67.1
+Requires-Python: >=3.12
+Description-Content-Type: text/markdown
+## ClickHouse backend
+The DuckDB builder remains the source of truth. Build a DuckDB file with
+`build_merged_duckdb`, then upload its canonical tables into ClickHouse:
+```bash
+uv run python scripts/upload_clickhouse.py data/dbs_final/SmallMolecule.duckdb --database normalization
+```
+The upload shows a progress bar for each copied table; pass `--no-progress` to
+silence it.
+Connection settings are read from `.env` with `python-dotenv` and use the
+official `clickhouse-connect` client. Set `CH_HTTP`, for example
+`http://host:8123/normalization`; `CH_USER` and `CH_PASSWORD` may be supplied
+separately and override URL credentials.
+Use the ClickHouse backend from Python:
+```python
+from norm_toolkit.v2 import ClickHouseNormalizer
+normalizer = ClickHouseNormalizer(database="normalization")
+result = normalizer.normalize(["aspirin"], top_k=5)
+```
+You can also pass a DSN in code:
+```python
+normalizer = ClickHouseNormalizer(
+    dsn="http://host:8123/normalization",
+    database="normalization",
+)
+```

norm_toolkit-1.9.0/README.md ADDED Viewed

@@ -0,0 +1,34 @@
+## ClickHouse backend
+The DuckDB builder remains the source of truth. Build a DuckDB file with
+`build_merged_duckdb`, then upload its canonical tables into ClickHouse:
+```bash
+uv run python scripts/upload_clickhouse.py data/dbs_final/SmallMolecule.duckdb --database normalization
+```
+The upload shows a progress bar for each copied table; pass `--no-progress` to
+silence it.
+Connection settings are read from `.env` with `python-dotenv` and use the
+official `clickhouse-connect` client. Set `CH_HTTP`, for example
+`http://host:8123/normalization`; `CH_USER` and `CH_PASSWORD` may be supplied
+separately and override URL credentials.
+Use the ClickHouse backend from Python:
+```python
+from norm_toolkit.v2 import ClickHouseNormalizer
+normalizer = ClickHouseNormalizer(database="normalization")
+result = normalizer.normalize(["aspirin"], top_k=5)
+```
+You can also pass a DSN in code:
+```python
+normalizer = ClickHouseNormalizer(
+    dsn="http://host:8123/normalization",
+    database="normalization",
+)
+```

{norm_toolkit-1.7.0 → norm_toolkit-1.9.0}/pyproject.toml RENAMED Viewed

@@ -1,45 +1,51 @@
 [project]
 name = "norm_toolkit"
-version = "1.7.0"
+version = "1.9.0"
 description = "Toolkit to normalize text to UMLS / ontologies"
 readme = "README.md"
 authors = [{ name = "Haydn Jones", email = "haydnjonest@gmail.com" }]
 requires-python = ">=3.12"
 dependencies = [
     "asyncpg>=0.29.0",
-    "duckdb>=1.4.3",
-    "lvg-norm>=1.1.0",
+    "clickhouse-connect>=1.0.0",
+    "duckdb>=1.5.0",
+    "lvg-norm>=1.3.0",
     "polars[rt64]>=1.36.1",
     "pyarrow>=20.0.0",
     "pydantic>=2.12.5",
+    "python-dotenv>=1.2.2",
     "sqlalchemy>=2.0.0",
     "tqdm>=4.67.1",
 ]
 [dependency-groups]
 dev = [
-    "datasets>=4.4.1",
-    "dotenv>=0.9.9",
     "ipython>=9.8.0",
     "pytest>=8.3",
-    "rdkit>=2025.9.3",
     "ruff>=0.6.9",
     "fire>=0.7.1",
-    "joblib>=1.5.3",
+    "ipykernel>=7.2.0",
+    "ipywidgets>=8.1.8",
 ]
 [build-system]
-requires = ["uv_build>=0.9.11,<0.10.0"]
+requires = ["uv_build>=0.9.11,<0.13.0"]
 build-backend = "uv_build"
 [tool.ruff]
 line-length = 120
 indent-width = 4
-target-version = "py313"
+target-version = "py312"
+# v1 is a frozen vendored snapshot of the 1.8.0 release; don't lint/format it.
+extend-exclude = ["src/norm_toolkit/v1"]
+[tool.ty.src]
+# v1 is a frozen vendored snapshot of the 1.8.0 release; don't type-check it.
+exclude = ["src/norm_toolkit/v1"]
 [tool.ruff.lint]
 select = ["E", "F", "UP", "B", "SIM", "I", "FURB"]
-ignore = ["B905", "E501", "SIM108", "SIM103"]
+ignore = ["E501", "SIM108", "SIM103"]
 fixable = ["ALL"]
 [tool.ruff.format]

norm_toolkit-1.9.0/src/norm_toolkit/__init__.py ADDED Viewed

@@ -0,0 +1,18 @@
+"""
+norm_toolkit — biomedical text normalization to UMLS / custom ontologies.
+This package is versioned into two subpackages:
+- ``norm_toolkit.v1``: the 1.8.0 API (DuckDB + PostgreSQL backends)
+- ``norm_toolkit.v2``: the current API (DuckDB + ClickHouse backends)
+The top-level namespace re-exports the **v1** API for backwards compatibility,
+so ``from norm_toolkit import DuckDBNormalizer`` resolves to ``norm_toolkit.v1``.
+New code should import explicitly from ``norm_toolkit.v2`` (or ``norm_toolkit.v1``).
+"""
+from norm_toolkit import v1, v2
+from norm_toolkit.v1 import *  # noqa: F403  (default API == v1)
+from norm_toolkit.v1 import __all__ as _v1_all
+__all__ = ["v1", "v2", *_v1_all]

{norm_toolkit-1.7.0/src/norm_toolkit → norm_toolkit-1.9.0/src/norm_toolkit/v1}/__init__.py RENAMED Viewed

@@ -21,14 +21,14 @@ Data models:
 - SemanticType: Semantic type info (UMLS only)
 """
-from norm_toolkit.build_merged import build_merged_duckdb
-from norm_toolkit.build_ontology import build_ontology_duckdb
-from norm_toolkit.build_umls import build_umls_duckdb
-from norm_toolkit.constants import ONTOLOGY_DF_SCHEMA
-from norm_toolkit.models import ConceptInfo, SemanticType
-from norm_toolkit.normalizer import DuckDBNormalizer
-from norm_toolkit.normalizer_postgres import PostgresNormalizer
-from norm_toolkit.utils import prepare_ontology_df, push_to_postgres
+from .build_merged import build_merged_duckdb
+from .build_ontology import build_ontology_duckdb
+from .build_umls import build_umls_duckdb
+from .constants import ONTOLOGY_DF_SCHEMA
+from .models import ConceptInfo, SemanticType
+from .normalizer import DuckDBNormalizer
+from .normalizer_postgres import PostgresNormalizer
+from .utils import prepare_ontology_df, push_to_postgres
 __all__ = [
     # Build functions

{norm_toolkit-1.7.0/src/norm_toolkit → norm_toolkit-1.9.0/src/norm_toolkit/v1}/build_ontology.py RENAMED Viewed

@@ -8,7 +8,7 @@ from __future__ import annotations
 import polars as pl
-from norm_toolkit.build_merged import build_merged_duckdb
+from .build_merged import build_merged_duckdb
 def build_ontology_duckdb(

{norm_toolkit-1.7.0/src/norm_toolkit → norm_toolkit-1.9.0/src/norm_toolkit/v1}/build_umls.py RENAMED Viewed

@@ -6,7 +6,7 @@ This is a convenience wrapper around build_merged_duckdb for UMLS-only builds.
 from __future__ import annotations
-from norm_toolkit.build_merged import build_merged_duckdb
+from .build_merged import build_merged_duckdb
 def build_umls_duckdb(

{norm_toolkit-1.7.0/src/norm_toolkit → norm_toolkit-1.9.0/src/norm_toolkit/v1}/normalizer.py RENAMED Viewed

@@ -14,7 +14,7 @@ from textwrap import dedent
 import duckdb
 import polars as pl
-from norm_toolkit.constants import (
+from .constants import (
     ATOMS_TABLE,
     DEFAULT_PREFER_TTYS,
     DEFS_TABLE,
@@ -24,8 +24,8 @@ from norm_toolkit.constants import (
     NW_TABLE,
     TYPES_TABLE,
 )
-from norm_toolkit.models import ConceptInfo
-from norm_toolkit.normalizer_utils import (
+from .models import ConceptInfo
+from .normalizer_utils import (
     apply_concept_name_rows,
     apply_definition_rows,
     apply_semantic_type_rows,

{norm_toolkit-1.7.0/src/norm_toolkit → norm_toolkit-1.9.0/src/norm_toolkit/v1}/normalizer_postgres.py RENAMED Viewed

@@ -14,7 +14,7 @@ import polars as pl
 from sqlalchemy import RowMapping, text
 from sqlalchemy.ext.asyncio import AsyncEngine
-from norm_toolkit.constants import (
+from .constants import (
     ATOMS_TABLE,
     DEFAULT_PREFER_TTYS,
     DEFS_TABLE,
@@ -24,9 +24,9 @@ from norm_toolkit.constants import (
     NW_TABLE,
     TYPES_TABLE,
 )
-from norm_toolkit.models import ConceptInfo
-from norm_toolkit.normalizer_cache import ExpansionCache, NormalizerCache
-from norm_toolkit.normalizer_utils import (
+from .models import ConceptInfo
+from .normalizer_cache import ExpansionCache, NormalizerCache
+from .normalizer_utils import (
     apply_concept_name_rows,
     apply_definition_rows,
     apply_semantic_type_rows,
@@ -602,72 +602,134 @@ class PostgresNormalizer:
             List of descendant concept IDs ordered by depth (shallowest first),
             excludes the starting concept
         """
+        results = await self.get_narrower_concepts_many(
+            [concept_id],
+            max_depth=max_depth,
+            filter_ontologies=filter_ontologies,
+            max_ids=max_ids,
+        )
+        return results.get(concept_id, [])
+    async def get_narrower_concepts_many(
+        self,
+        concept_ids: Sequence[str],
+        max_depth: int | None = 10,
+        filter_ontologies: list[str] | None = None,
+        max_ids: int | None = None,
+    ) -> dict[str, list[str]]:
+        """
+        Get narrower (descendant) concept IDs for many roots in one query.
+        Uses the hierarchy edges to walk down the tree/DAG from each root concept.
+        Args:
+            concept_ids: Starting concept IDs (broader terms)
+            max_depth: Maximum depth to traverse (1 = direct children only, None = all descendants)
+            filter_ontologies: Only follow edges from these ontologies (e.g., ["UMLS", "CHEBI"])
+            max_ids: Maximum number of concept IDs to return (None = no limit)
+        Returns:
+            Dict mapping each concept ID to descendant IDs ordered by depth
+            (shallowest first), excluding the starting concept.
+        """
         await self._ensure_initialized()
-        if not self._has_edges:
-            return []
+        if not self._has_edges or not concept_ids:
+            return {cid: [] for cid in concept_ids}
+        id_list = list(dict.fromkeys(concept_ids))
+        res: dict[str, list[str]] = {}
+        missing: list[str] = []
+        cache_keys: dict[str, Any] = {}
-        cache_key = None
         if self._expansion_cache is not None:
-            cache_key = ExpansionCache.make_key(
-                concept_id,
-                max_depth=max_depth,
-                filter_ontologies=filter_ontologies,
-                max_ids=max_ids,
-            )
-            cached = self._expansion_cache.get(cache_key)
-            if cached is not None:
-                return cached
+            for cid in id_list:
+                cache_key = ExpansionCache.make_key(
+                    cid,
+                    max_depth=max_depth,
+                    filter_ontologies=filter_ontologies,
+                    max_ids=max_ids,
+                )
+                cache_keys[cid] = cache_key
+                cached = self._expansion_cache.get(cache_key)
+                if cached is not None:
+                    res[cid] = cached
+                else:
+                    res[cid] = []
+                    missing.append(cid)
+        else:
+            for cid in id_list:
+                res[cid] = []
+            missing = id_list
-        params: dict[str, Any] = {"concept_id": concept_id, "max_depth": max_depth}
+        if not missing:
+            return res
+        sql_params = _SqlParams()
+        idmap_values = sql_params.add_single_column_values(missing)
+        params = sql_params.params
+        params["max_depth"] = max_depth
-        # Build ontology filter clause
         ontology_filter = ""
         if filter_ontologies:
-            ont_placeholders = []
-            for i, ont in enumerate(filter_ontologies):
-                key = f"ont{i}"
-                params[key] = ont
-                ont_placeholders.append(f":{key}")
-            ontologies_sql = ", ".join(ont_placeholders)
+            ontologies_sql = sql_params.add_values(filter_ontologies)
             ontology_filter = f" AND e.ontology IN ({ontologies_sql})"
-        # Build optional LIMIT clause
-        limit_clause = ""
-        if max_ids is not None:
+        if max_ids is None:
+            select_sql = """
+            SELECT root_id, concept_id, MIN(depth) AS min_depth
+            FROM walk
+            WHERE concept_id != root_id
+            GROUP BY root_id, concept_id
+            ORDER BY root_id, min_depth, concept_id
+            """
+        else:
             params["max_ids"] = max_ids
-            limit_clause = "\nLIMIT :max_ids"
+            select_sql = """
+            SELECT root_id, concept_id, min_depth
+            FROM (
+                SELECT root_id, concept_id, min_depth,
+                    ROW_NUMBER() OVER (PARTITION BY root_id ORDER BY min_depth, concept_id) AS rn
+                FROM (
+                    SELECT root_id, concept_id, MIN(depth) AS min_depth
+                    FROM walk
+                    WHERE concept_id != root_id
+                    GROUP BY root_id, concept_id
+                ) base
+            ) ranked
+            WHERE rn <= :max_ids
+            ORDER BY root_id, min_depth, concept_id
+            """
-        # PostgreSQL recursive CTE with named parameters
-        # Use CAST() instead of :: to avoid conflicts with SQLAlchemy named params
-        # UNION (not UNION ALL) deduplicates on (concept_id, depth) during recursion
-        # GROUP BY with MIN(depth) gets shortest path depth for each concept
         query = dedent(
             f"""
-            WITH RECURSIVE walk(concept_id, depth) AS (
-                SELECT CAST(:concept_id AS VARCHAR), 0
+            WITH RECURSIVE idmap(root_id) AS (VALUES {idmap_values}),
+            walk(root_id, concept_id, depth) AS (
+                SELECT root_id, root_id, 0
+                FROM idmap
                 UNION
-                SELECT e.child_id, w.depth + 1
+                SELECT w.root_id, e.child_id, w.depth + 1
                 FROM walk w
                 JOIN {self._edges_table} e ON e.parent_id = w.concept_id
                 WHERE (CAST(:max_depth AS INTEGER) IS NULL OR w.depth < :max_depth){ontology_filter}
             )
-            SELECT concept_id, MIN(depth) AS min_depth
-            FROM walk
-            WHERE concept_id != :concept_id
-            GROUP BY concept_id
-            ORDER BY min_depth, concept_id{limit_clause}
+            {select_sql}
             """
         )
         rows = await self._fetch_rows(query, params)
-        result = [r["concept_id"] for r in rows]
-        if self._expansion_cache is not None and cache_key is not None:
-            self._expansion_cache.set(cache_key, result)
-        return result
+        for row in rows:
+            res[row["root_id"]].append(row["concept_id"])
+        if self._expansion_cache is not None:
+            for cid in missing:
+                self._expansion_cache.set(cache_keys[cid], res[cid])
+        return res
     def cache_stats(self) -> dict[str, Any] | None:
         """

{norm_toolkit-1.7.0/src/norm_toolkit → norm_toolkit-1.9.0/src/norm_toolkit/v1}/normalizer_utils.py RENAMED Viewed

@@ -11,14 +11,14 @@ from typing import Any, Literal
 from lvg_norm import lvg_normalize
 from sqlalchemy import RowMapping
-from norm_toolkit.constants import (
+from .constants import (
     EXACT_BUMP,
     ISPREF_WEIGHT,
     RANK_MULTIPLIER,
     STT_WEIGHT,
     TTY_WEIGHT,
 )
-from norm_toolkit.models import ConceptInfo, SemanticType
+from .models import ConceptInfo, SemanticType
 def _coerce_synonyms_list(

{norm_toolkit-1.7.0/src/norm_toolkit → norm_toolkit-1.9.0/src/norm_toolkit/v1}/utils.py RENAMED Viewed

@@ -11,7 +11,7 @@ import polars as pl
 from lvg_norm import lvg_normalize
 from tqdm import tqdm
-from norm_toolkit.constants import (
+from .constants import (
     ATOMS_TABLE,
     CONCEPTS_TABLE,
     DEFS_TABLE,

norm_toolkit-1.9.0/src/norm_toolkit/v2/__init__.py ADDED Viewed

@@ -0,0 +1,44 @@
+"""
+Unified normalization package.
+Provides normalizer implementations that work with UMLS, ontology,
+and merged databases using a standardized schema.
+Build functions:
+- build_merged_duckdb: Build a DuckDB database from UMLS and/or ontology data
+Normalizers:
+- DuckDBNormalizer: High-throughput sync normalizer for DuckDB (batch processing)
+- ClickHouseNormalizer: Normalizer backed by ClickHouse tables uploaded from a DuckDB build
+ClickHouse:
+- upload_duckdb_to_clickhouse: Upload a norm_toolkit DuckDB database into ClickHouse
+- clickhouse_client_from_env: Build a ClickHouse client from .env / environment settings
+Data models:
+- ConceptInfo: Unified concept metadata
+- SemanticType: Semantic type info (UMLS only)
+"""
+from .build_merged import build_merged_duckdb
+from .clickhouse_backend import ClickHouseNormalizer
+from .clickhouse_common import clickhouse_client_from_env
+from .clickhouse_upload import upload_duckdb_to_clickhouse
+from .duckdb_backend import DuckDBNormalizer
+from .schema import ONTOLOGY_DF_SCHEMA, ConceptInfo, SemanticType
+__all__ = [
+    # Build functions
+    "build_merged_duckdb",
+    "upload_duckdb_to_clickhouse",
+    # Normalizers
+    "ClickHouseNormalizer",
+    "DuckDBNormalizer",
+    # ClickHouse
+    "clickhouse_client_from_env",
+    # Models
+    "ConceptInfo",
+    "SemanticType",
+    # Schemas
+    "ONTOLOGY_DF_SCHEMA",
+]

norm_toolkit 1.7.0__tar.gz → 1.9.0__tar.gz

norm_toolkit 1.7.0__tar.gz → 1.9.0__tar.gz