norm_toolkit 1.7.0__tar.gz → 1.9.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. norm_toolkit-1.9.0/PKG-INFO +53 -0
  2. norm_toolkit-1.9.0/README.md +34 -0
  3. {norm_toolkit-1.7.0 → norm_toolkit-1.9.0}/pyproject.toml +16 -10
  4. norm_toolkit-1.9.0/src/norm_toolkit/__init__.py +18 -0
  5. {norm_toolkit-1.7.0/src/norm_toolkit → norm_toolkit-1.9.0/src/norm_toolkit/v1}/__init__.py +8 -8
  6. {norm_toolkit-1.7.0/src/norm_toolkit → norm_toolkit-1.9.0/src/norm_toolkit/v1}/build_ontology.py +1 -1
  7. {norm_toolkit-1.7.0/src/norm_toolkit → norm_toolkit-1.9.0/src/norm_toolkit/v1}/build_umls.py +1 -1
  8. {norm_toolkit-1.7.0/src/norm_toolkit → norm_toolkit-1.9.0/src/norm_toolkit/v1}/normalizer.py +3 -3
  9. {norm_toolkit-1.7.0/src/norm_toolkit → norm_toolkit-1.9.0/src/norm_toolkit/v1}/normalizer_postgres.py +106 -44
  10. {norm_toolkit-1.7.0/src/norm_toolkit → norm_toolkit-1.9.0/src/norm_toolkit/v1}/normalizer_utils.py +2 -2
  11. {norm_toolkit-1.7.0/src/norm_toolkit → norm_toolkit-1.9.0/src/norm_toolkit/v1}/utils.py +1 -1
  12. norm_toolkit-1.9.0/src/norm_toolkit/v2/__init__.py +44 -0
  13. norm_toolkit-1.9.0/src/norm_toolkit/v2/build_merged.py +1249 -0
  14. norm_toolkit-1.9.0/src/norm_toolkit/v2/clickhouse_backend.py +768 -0
  15. norm_toolkit-1.9.0/src/norm_toolkit/v2/clickhouse_common.py +84 -0
  16. norm_toolkit-1.9.0/src/norm_toolkit/v2/clickhouse_upload.py +148 -0
  17. norm_toolkit-1.9.0/src/norm_toolkit/v2/duckdb_backend.py +413 -0
  18. norm_toolkit-1.9.0/src/norm_toolkit/v2/normalizer_base.py +396 -0
  19. norm_toolkit-1.9.0/src/norm_toolkit/v2/normalizer_utils.py +546 -0
  20. norm_toolkit-1.9.0/src/norm_toolkit/v2/schema.py +239 -0
  21. norm_toolkit-1.7.0/PKG-INFO +0 -17
  22. norm_toolkit-1.7.0/README.md +0 -0
  23. {norm_toolkit-1.7.0/src/norm_toolkit → norm_toolkit-1.9.0/src/norm_toolkit/v1}/build_merged.py +0 -0
  24. {norm_toolkit-1.7.0/src/norm_toolkit → norm_toolkit-1.9.0/src/norm_toolkit/v1}/constants.py +0 -0
  25. {norm_toolkit-1.7.0/src/norm_toolkit → norm_toolkit-1.9.0/src/norm_toolkit/v1}/models.py +0 -0
  26. {norm_toolkit-1.7.0/src/norm_toolkit → norm_toolkit-1.9.0/src/norm_toolkit/v1}/normalizer_cache.py +0 -0
@@ -0,0 +1,53 @@
1
+ Metadata-Version: 2.3
2
+ Name: norm_toolkit
3
+ Version: 1.9.0
4
+ Summary: Toolkit to normalize text to UMLS / ontologies
5
+ Author: Haydn Jones
6
+ Author-email: Haydn Jones <haydnjonest@gmail.com>
7
+ Requires-Dist: asyncpg>=0.29.0
8
+ Requires-Dist: clickhouse-connect>=1.0.0
9
+ Requires-Dist: duckdb>=1.5.0
10
+ Requires-Dist: lvg-norm>=1.3.0
11
+ Requires-Dist: polars[rt64]>=1.36.1
12
+ Requires-Dist: pyarrow>=20.0.0
13
+ Requires-Dist: pydantic>=2.12.5
14
+ Requires-Dist: python-dotenv>=1.2.2
15
+ Requires-Dist: sqlalchemy>=2.0.0
16
+ Requires-Dist: tqdm>=4.67.1
17
+ Requires-Python: >=3.12
18
+ Description-Content-Type: text/markdown
19
+
20
+ ## ClickHouse backend
21
+
22
+ The DuckDB builder remains the source of truth. Build a DuckDB file with
23
+ `build_merged_duckdb`, then upload its canonical tables into ClickHouse:
24
+
25
+ ```bash
26
+ uv run python scripts/upload_clickhouse.py data/dbs_final/SmallMolecule.duckdb --database normalization
27
+ ```
28
+
29
+ The upload shows a progress bar for each copied table; pass `--no-progress` to
30
+ silence it.
31
+
32
+ Connection settings are read from `.env` with `python-dotenv` and use the
33
+ official `clickhouse-connect` client. Set `CH_HTTP`, for example
34
+ `http://host:8123/normalization`; `CH_USER` and `CH_PASSWORD` may be supplied
35
+ separately and override URL credentials.
36
+
37
+ Use the ClickHouse backend from Python:
38
+
39
+ ```python
40
+ from norm_toolkit.v2 import ClickHouseNormalizer
41
+
42
+ normalizer = ClickHouseNormalizer(database="normalization")
43
+ result = normalizer.normalize(["aspirin"], top_k=5)
44
+ ```
45
+
46
+ You can also pass a DSN in code:
47
+
48
+ ```python
49
+ normalizer = ClickHouseNormalizer(
50
+ dsn="http://host:8123/normalization",
51
+ database="normalization",
52
+ )
53
+ ```
@@ -0,0 +1,34 @@
1
+ ## ClickHouse backend
2
+
3
+ The DuckDB builder remains the source of truth. Build a DuckDB file with
4
+ `build_merged_duckdb`, then upload its canonical tables into ClickHouse:
5
+
6
+ ```bash
7
+ uv run python scripts/upload_clickhouse.py data/dbs_final/SmallMolecule.duckdb --database normalization
8
+ ```
9
+
10
+ The upload shows a progress bar for each copied table; pass `--no-progress` to
11
+ silence it.
12
+
13
+ Connection settings are read from `.env` with `python-dotenv` and use the
14
+ official `clickhouse-connect` client. Set `CH_HTTP`, for example
15
+ `http://host:8123/normalization`; `CH_USER` and `CH_PASSWORD` may be supplied
16
+ separately and override URL credentials.
17
+
18
+ Use the ClickHouse backend from Python:
19
+
20
+ ```python
21
+ from norm_toolkit.v2 import ClickHouseNormalizer
22
+
23
+ normalizer = ClickHouseNormalizer(database="normalization")
24
+ result = normalizer.normalize(["aspirin"], top_k=5)
25
+ ```
26
+
27
+ You can also pass a DSN in code:
28
+
29
+ ```python
30
+ normalizer = ClickHouseNormalizer(
31
+ dsn="http://host:8123/normalization",
32
+ database="normalization",
33
+ )
34
+ ```
@@ -1,45 +1,51 @@
1
1
  [project]
2
2
  name = "norm_toolkit"
3
- version = "1.7.0"
3
+ version = "1.9.0"
4
4
  description = "Toolkit to normalize text to UMLS / ontologies"
5
5
  readme = "README.md"
6
6
  authors = [{ name = "Haydn Jones", email = "haydnjonest@gmail.com" }]
7
7
  requires-python = ">=3.12"
8
8
  dependencies = [
9
9
  "asyncpg>=0.29.0",
10
- "duckdb>=1.4.3",
11
- "lvg-norm>=1.1.0",
10
+ "clickhouse-connect>=1.0.0",
11
+ "duckdb>=1.5.0",
12
+ "lvg-norm>=1.3.0",
12
13
  "polars[rt64]>=1.36.1",
13
14
  "pyarrow>=20.0.0",
14
15
  "pydantic>=2.12.5",
16
+ "python-dotenv>=1.2.2",
15
17
  "sqlalchemy>=2.0.0",
16
18
  "tqdm>=4.67.1",
17
19
  ]
18
20
 
19
21
  [dependency-groups]
20
22
  dev = [
21
- "datasets>=4.4.1",
22
- "dotenv>=0.9.9",
23
23
  "ipython>=9.8.0",
24
24
  "pytest>=8.3",
25
- "rdkit>=2025.9.3",
26
25
  "ruff>=0.6.9",
27
26
  "fire>=0.7.1",
28
- "joblib>=1.5.3",
27
+ "ipykernel>=7.2.0",
28
+ "ipywidgets>=8.1.8",
29
29
  ]
30
30
 
31
31
  [build-system]
32
- requires = ["uv_build>=0.9.11,<0.10.0"]
32
+ requires = ["uv_build>=0.9.11,<0.13.0"]
33
33
  build-backend = "uv_build"
34
34
 
35
35
  [tool.ruff]
36
36
  line-length = 120
37
37
  indent-width = 4
38
- target-version = "py313"
38
+ target-version = "py312"
39
+ # v1 is a frozen vendored snapshot of the 1.8.0 release; don't lint/format it.
40
+ extend-exclude = ["src/norm_toolkit/v1"]
41
+
42
+ [tool.ty.src]
43
+ # v1 is a frozen vendored snapshot of the 1.8.0 release; don't type-check it.
44
+ exclude = ["src/norm_toolkit/v1"]
39
45
 
40
46
  [tool.ruff.lint]
41
47
  select = ["E", "F", "UP", "B", "SIM", "I", "FURB"]
42
- ignore = ["B905", "E501", "SIM108", "SIM103"]
48
+ ignore = ["E501", "SIM108", "SIM103"]
43
49
  fixable = ["ALL"]
44
50
 
45
51
  [tool.ruff.format]
@@ -0,0 +1,18 @@
1
+ """
2
+ norm_toolkit — biomedical text normalization to UMLS / custom ontologies.
3
+
4
+ This package is versioned into two subpackages:
5
+
6
+ - ``norm_toolkit.v1``: the 1.8.0 API (DuckDB + PostgreSQL backends)
7
+ - ``norm_toolkit.v2``: the current API (DuckDB + ClickHouse backends)
8
+
9
+ The top-level namespace re-exports the **v1** API for backwards compatibility,
10
+ so ``from norm_toolkit import DuckDBNormalizer`` resolves to ``norm_toolkit.v1``.
11
+ New code should import explicitly from ``norm_toolkit.v2`` (or ``norm_toolkit.v1``).
12
+ """
13
+
14
+ from norm_toolkit import v1, v2
15
+ from norm_toolkit.v1 import * # noqa: F403 (default API == v1)
16
+ from norm_toolkit.v1 import __all__ as _v1_all
17
+
18
+ __all__ = ["v1", "v2", *_v1_all]
@@ -21,14 +21,14 @@ Data models:
21
21
  - SemanticType: Semantic type info (UMLS only)
22
22
  """
23
23
 
24
- from norm_toolkit.build_merged import build_merged_duckdb
25
- from norm_toolkit.build_ontology import build_ontology_duckdb
26
- from norm_toolkit.build_umls import build_umls_duckdb
27
- from norm_toolkit.constants import ONTOLOGY_DF_SCHEMA
28
- from norm_toolkit.models import ConceptInfo, SemanticType
29
- from norm_toolkit.normalizer import DuckDBNormalizer
30
- from norm_toolkit.normalizer_postgres import PostgresNormalizer
31
- from norm_toolkit.utils import prepare_ontology_df, push_to_postgres
24
+ from .build_merged import build_merged_duckdb
25
+ from .build_ontology import build_ontology_duckdb
26
+ from .build_umls import build_umls_duckdb
27
+ from .constants import ONTOLOGY_DF_SCHEMA
28
+ from .models import ConceptInfo, SemanticType
29
+ from .normalizer import DuckDBNormalizer
30
+ from .normalizer_postgres import PostgresNormalizer
31
+ from .utils import prepare_ontology_df, push_to_postgres
32
32
 
33
33
  __all__ = [
34
34
  # Build functions
@@ -8,7 +8,7 @@ from __future__ import annotations
8
8
 
9
9
  import polars as pl
10
10
 
11
- from norm_toolkit.build_merged import build_merged_duckdb
11
+ from .build_merged import build_merged_duckdb
12
12
 
13
13
 
14
14
  def build_ontology_duckdb(
@@ -6,7 +6,7 @@ This is a convenience wrapper around build_merged_duckdb for UMLS-only builds.
6
6
 
7
7
  from __future__ import annotations
8
8
 
9
- from norm_toolkit.build_merged import build_merged_duckdb
9
+ from .build_merged import build_merged_duckdb
10
10
 
11
11
 
12
12
  def build_umls_duckdb(
@@ -14,7 +14,7 @@ from textwrap import dedent
14
14
  import duckdb
15
15
  import polars as pl
16
16
 
17
- from norm_toolkit.constants import (
17
+ from .constants import (
18
18
  ATOMS_TABLE,
19
19
  DEFAULT_PREFER_TTYS,
20
20
  DEFS_TABLE,
@@ -24,8 +24,8 @@ from norm_toolkit.constants import (
24
24
  NW_TABLE,
25
25
  TYPES_TABLE,
26
26
  )
27
- from norm_toolkit.models import ConceptInfo
28
- from norm_toolkit.normalizer_utils import (
27
+ from .models import ConceptInfo
28
+ from .normalizer_utils import (
29
29
  apply_concept_name_rows,
30
30
  apply_definition_rows,
31
31
  apply_semantic_type_rows,
@@ -14,7 +14,7 @@ import polars as pl
14
14
  from sqlalchemy import RowMapping, text
15
15
  from sqlalchemy.ext.asyncio import AsyncEngine
16
16
 
17
- from norm_toolkit.constants import (
17
+ from .constants import (
18
18
  ATOMS_TABLE,
19
19
  DEFAULT_PREFER_TTYS,
20
20
  DEFS_TABLE,
@@ -24,9 +24,9 @@ from norm_toolkit.constants import (
24
24
  NW_TABLE,
25
25
  TYPES_TABLE,
26
26
  )
27
- from norm_toolkit.models import ConceptInfo
28
- from norm_toolkit.normalizer_cache import ExpansionCache, NormalizerCache
29
- from norm_toolkit.normalizer_utils import (
27
+ from .models import ConceptInfo
28
+ from .normalizer_cache import ExpansionCache, NormalizerCache
29
+ from .normalizer_utils import (
30
30
  apply_concept_name_rows,
31
31
  apply_definition_rows,
32
32
  apply_semantic_type_rows,
@@ -602,72 +602,134 @@ class PostgresNormalizer:
602
602
  List of descendant concept IDs ordered by depth (shallowest first),
603
603
  excludes the starting concept
604
604
  """
605
+ results = await self.get_narrower_concepts_many(
606
+ [concept_id],
607
+ max_depth=max_depth,
608
+ filter_ontologies=filter_ontologies,
609
+ max_ids=max_ids,
610
+ )
611
+ return results.get(concept_id, [])
612
+
613
+ async def get_narrower_concepts_many(
614
+ self,
615
+ concept_ids: Sequence[str],
616
+ max_depth: int | None = 10,
617
+ filter_ontologies: list[str] | None = None,
618
+ max_ids: int | None = None,
619
+ ) -> dict[str, list[str]]:
620
+ """
621
+ Get narrower (descendant) concept IDs for many roots in one query.
622
+
623
+ Uses the hierarchy edges to walk down the tree/DAG from each root concept.
624
+
625
+ Args:
626
+ concept_ids: Starting concept IDs (broader terms)
627
+ max_depth: Maximum depth to traverse (1 = direct children only, None = all descendants)
628
+ filter_ontologies: Only follow edges from these ontologies (e.g., ["UMLS", "CHEBI"])
629
+ max_ids: Maximum number of descendant IDs to return per root concept (None = no limit)
630
+
631
+ Returns:
632
+ Dict mapping each concept ID to descendant IDs ordered by depth
633
+ (shallowest first), excluding the starting concept.
634
+ """
605
635
  await self._ensure_initialized()
606
636
 
607
- if not self._has_edges:
608
- return []
637
+ if not self._has_edges or not concept_ids:
638
+ return {cid: [] for cid in concept_ids}
639
+
640
+ id_list = list(dict.fromkeys(concept_ids))
641
+
642
+ res: dict[str, list[str]] = {}
643
+ missing: list[str] = []
644
+ cache_keys: dict[str, Any] = {}
609
645
 
610
- cache_key = None
611
646
  if self._expansion_cache is not None:
612
- cache_key = ExpansionCache.make_key(
613
- concept_id,
614
- max_depth=max_depth,
615
- filter_ontologies=filter_ontologies,
616
- max_ids=max_ids,
617
- )
618
- cached = self._expansion_cache.get(cache_key)
619
- if cached is not None:
620
- return cached
647
+ for cid in id_list:
648
+ cache_key = ExpansionCache.make_key(
649
+ cid,
650
+ max_depth=max_depth,
651
+ filter_ontologies=filter_ontologies,
652
+ max_ids=max_ids,
653
+ )
654
+ cache_keys[cid] = cache_key
655
+ cached = self._expansion_cache.get(cache_key)
656
+ if cached is not None:
657
+ res[cid] = cached
658
+ else:
659
+ res[cid] = []
660
+ missing.append(cid)
661
+ else:
662
+ for cid in id_list:
663
+ res[cid] = []
664
+ missing = id_list
621
665
 
622
- params: dict[str, Any] = {"concept_id": concept_id, "max_depth": max_depth}
666
+ if not missing:
667
+ return res
668
+
669
+ sql_params = _SqlParams()
670
+ idmap_values = sql_params.add_single_column_values(missing)
671
+ params = sql_params.params
672
+ params["max_depth"] = max_depth
623
673
 
624
- # Build ontology filter clause
625
674
  ontology_filter = ""
626
675
  if filter_ontologies:
627
- ont_placeholders = []
628
- for i, ont in enumerate(filter_ontologies):
629
- key = f"ont{i}"
630
- params[key] = ont
631
- ont_placeholders.append(f":{key}")
632
- ontologies_sql = ", ".join(ont_placeholders)
676
+ ontologies_sql = sql_params.add_values(filter_ontologies)
633
677
  ontology_filter = f" AND e.ontology IN ({ontologies_sql})"
634
678
 
635
- # Build optional LIMIT clause
636
- limit_clause = ""
637
- if max_ids is not None:
679
+ if max_ids is None:
680
+ select_sql = """
681
+ SELECT root_id, concept_id, MIN(depth) AS min_depth
682
+ FROM walk
683
+ WHERE concept_id != root_id
684
+ GROUP BY root_id, concept_id
685
+ ORDER BY root_id, min_depth, concept_id
686
+ """
687
+ else:
638
688
  params["max_ids"] = max_ids
639
- limit_clause = "\nLIMIT :max_ids"
689
+ select_sql = """
690
+ SELECT root_id, concept_id, min_depth
691
+ FROM (
692
+ SELECT root_id, concept_id, min_depth,
693
+ ROW_NUMBER() OVER (PARTITION BY root_id ORDER BY min_depth, concept_id) AS rn
694
+ FROM (
695
+ SELECT root_id, concept_id, MIN(depth) AS min_depth
696
+ FROM walk
697
+ WHERE concept_id != root_id
698
+ GROUP BY root_id, concept_id
699
+ ) base
700
+ ) ranked
701
+ WHERE rn <= :max_ids
702
+ ORDER BY root_id, min_depth, concept_id
703
+ """
640
704
 
641
- # PostgreSQL recursive CTE with named parameters
642
- # Use CAST() instead of :: to avoid conflicts with SQLAlchemy named params
643
- # UNION (not UNION ALL) deduplicates on (concept_id, depth) during recursion
644
- # GROUP BY with MIN(depth) gets shortest path depth for each concept
645
705
  query = dedent(
646
706
  f"""
647
- WITH RECURSIVE walk(concept_id, depth) AS (
648
- SELECT CAST(:concept_id AS VARCHAR), 0
707
+ WITH RECURSIVE idmap(root_id) AS (VALUES {idmap_values}),
708
+ walk(root_id, concept_id, depth) AS (
709
+ SELECT root_id, root_id, 0
710
+ FROM idmap
649
711
 
650
712
  UNION
651
713
 
652
- SELECT e.child_id, w.depth + 1
714
+ SELECT w.root_id, e.child_id, w.depth + 1
653
715
  FROM walk w
654
716
  JOIN {self._edges_table} e ON e.parent_id = w.concept_id
655
717
  WHERE (CAST(:max_depth AS INTEGER) IS NULL OR w.depth < :max_depth){ontology_filter}
656
718
  )
657
- SELECT concept_id, MIN(depth) AS min_depth
658
- FROM walk
659
- WHERE concept_id != :concept_id
660
- GROUP BY concept_id
661
- ORDER BY min_depth, concept_id{limit_clause}
719
+ {select_sql}
662
720
  """
663
721
  )
664
722
 
665
723
  rows = await self._fetch_rows(query, params)
666
724
 
667
- result = [r["concept_id"] for r in rows]
668
- if self._expansion_cache is not None and cache_key is not None:
669
- self._expansion_cache.set(cache_key, result)
670
- return result
725
+ for row in rows:
726
+ res[row["root_id"]].append(row["concept_id"])
727
+
728
+ if self._expansion_cache is not None:
729
+ for cid in missing:
730
+ self._expansion_cache.set(cache_keys[cid], res[cid])
731
+
732
+ return res
671
733
 
672
734
  def cache_stats(self) -> dict[str, Any] | None:
673
735
  """
@@ -11,14 +11,14 @@ from typing import Any, Literal
11
11
  from lvg_norm import lvg_normalize
12
12
  from sqlalchemy import RowMapping
13
13
 
14
- from norm_toolkit.constants import (
14
+ from .constants import (
15
15
  EXACT_BUMP,
16
16
  ISPREF_WEIGHT,
17
17
  RANK_MULTIPLIER,
18
18
  STT_WEIGHT,
19
19
  TTY_WEIGHT,
20
20
  )
21
- from norm_toolkit.models import ConceptInfo, SemanticType
21
+ from .models import ConceptInfo, SemanticType
22
22
 
23
23
 
24
24
  def _coerce_synonyms_list(
@@ -11,7 +11,7 @@ import polars as pl
11
11
  from lvg_norm import lvg_normalize
12
12
  from tqdm import tqdm
13
13
 
14
- from norm_toolkit.constants import (
14
+ from .constants import (
15
15
  ATOMS_TABLE,
16
16
  CONCEPTS_TABLE,
17
17
  DEFS_TABLE,
@@ -0,0 +1,44 @@
1
+ """
2
+ Unified normalization package.
3
+
4
+ Provides normalizer implementations that work with UMLS, ontology,
5
+ and merged databases using a standardized schema.
6
+
7
+ Build functions:
8
+ - build_merged_duckdb: Build a DuckDB database from UMLS and/or ontology data
9
+
10
+ Normalizers:
11
+ - DuckDBNormalizer: High-throughput sync normalizer for DuckDB (batch processing)
12
+ - ClickHouseNormalizer: Normalizer backed by ClickHouse tables uploaded from a DuckDB build
13
+
14
+ ClickHouse:
15
+ - upload_duckdb_to_clickhouse: Upload a norm_toolkit DuckDB database into ClickHouse
16
+ - clickhouse_client_from_env: Build a ClickHouse client from .env / environment settings
17
+
18
+ Data models:
19
+ - ConceptInfo: Unified concept metadata
20
+ - SemanticType: Semantic type info (UMLS only)
21
+ """
22
+
23
+ from .build_merged import build_merged_duckdb
24
+ from .clickhouse_backend import ClickHouseNormalizer
25
+ from .clickhouse_common import clickhouse_client_from_env
26
+ from .clickhouse_upload import upload_duckdb_to_clickhouse
27
+ from .duckdb_backend import DuckDBNormalizer
28
+ from .schema import ONTOLOGY_DF_SCHEMA, ConceptInfo, SemanticType
29
+
30
+ __all__ = [
31
+ # Build functions
32
+ "build_merged_duckdb",
33
+ "upload_duckdb_to_clickhouse",
34
+ # Normalizers
35
+ "ClickHouseNormalizer",
36
+ "DuckDBNormalizer",
37
+ # ClickHouse
38
+ "clickhouse_client_from_env",
39
+ # Models
40
+ "ConceptInfo",
41
+ "SemanticType",
42
+ # Schemas
43
+ "ONTOLOGY_DF_SCHEMA",
44
+ ]