norm_toolkit 1.8.0__tar.gz → 1.9.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. norm_toolkit-1.9.0/PKG-INFO +53 -0
  2. norm_toolkit-1.9.0/README.md +34 -0
  3. {norm_toolkit-1.8.0 → norm_toolkit-1.9.0}/pyproject.toml +16 -10
  4. norm_toolkit-1.9.0/src/norm_toolkit/__init__.py +18 -0
  5. {norm_toolkit-1.8.0/src/norm_toolkit → norm_toolkit-1.9.0/src/norm_toolkit/v1}/__init__.py +8 -8
  6. {norm_toolkit-1.8.0/src/norm_toolkit → norm_toolkit-1.9.0/src/norm_toolkit/v1}/build_ontology.py +1 -1
  7. {norm_toolkit-1.8.0/src/norm_toolkit → norm_toolkit-1.9.0/src/norm_toolkit/v1}/build_umls.py +1 -1
  8. {norm_toolkit-1.8.0/src/norm_toolkit → norm_toolkit-1.9.0/src/norm_toolkit/v1}/normalizer.py +3 -3
  9. {norm_toolkit-1.8.0/src/norm_toolkit → norm_toolkit-1.9.0/src/norm_toolkit/v1}/normalizer_postgres.py +4 -4
  10. {norm_toolkit-1.8.0/src/norm_toolkit → norm_toolkit-1.9.0/src/norm_toolkit/v1}/normalizer_utils.py +2 -2
  11. {norm_toolkit-1.8.0/src/norm_toolkit → norm_toolkit-1.9.0/src/norm_toolkit/v1}/utils.py +1 -1
  12. norm_toolkit-1.9.0/src/norm_toolkit/v2/__init__.py +44 -0
  13. norm_toolkit-1.9.0/src/norm_toolkit/v2/build_merged.py +1249 -0
  14. norm_toolkit-1.9.0/src/norm_toolkit/v2/clickhouse_backend.py +768 -0
  15. norm_toolkit-1.9.0/src/norm_toolkit/v2/clickhouse_common.py +84 -0
  16. norm_toolkit-1.9.0/src/norm_toolkit/v2/clickhouse_upload.py +148 -0
  17. norm_toolkit-1.9.0/src/norm_toolkit/v2/duckdb_backend.py +413 -0
  18. norm_toolkit-1.9.0/src/norm_toolkit/v2/normalizer_base.py +396 -0
  19. norm_toolkit-1.9.0/src/norm_toolkit/v2/normalizer_utils.py +546 -0
  20. norm_toolkit-1.9.0/src/norm_toolkit/v2/schema.py +239 -0
  21. norm_toolkit-1.8.0/PKG-INFO +0 -17
  22. norm_toolkit-1.8.0/README.md +0 -0
  23. {norm_toolkit-1.8.0/src/norm_toolkit → norm_toolkit-1.9.0/src/norm_toolkit/v1}/build_merged.py +0 -0
  24. {norm_toolkit-1.8.0/src/norm_toolkit → norm_toolkit-1.9.0/src/norm_toolkit/v1}/constants.py +0 -0
  25. {norm_toolkit-1.8.0/src/norm_toolkit → norm_toolkit-1.9.0/src/norm_toolkit/v1}/models.py +0 -0
  26. {norm_toolkit-1.8.0/src/norm_toolkit → norm_toolkit-1.9.0/src/norm_toolkit/v1}/normalizer_cache.py +0 -0
@@ -0,0 +1,53 @@
1
+ Metadata-Version: 2.3
2
+ Name: norm_toolkit
3
+ Version: 1.9.0
4
+ Summary: Toolkit to normalize text to UMLS / ontologies
5
+ Author: Haydn Jones
6
+ Author-email: Haydn Jones <haydnjonest@gmail.com>
7
+ Requires-Dist: asyncpg>=0.29.0
8
+ Requires-Dist: clickhouse-connect>=1.0.0
9
+ Requires-Dist: duckdb>=1.5.0
10
+ Requires-Dist: lvg-norm>=1.3.0
11
+ Requires-Dist: polars[rt64]>=1.36.1
12
+ Requires-Dist: pyarrow>=20.0.0
13
+ Requires-Dist: pydantic>=2.12.5
14
+ Requires-Dist: python-dotenv>=1.2.2
15
+ Requires-Dist: sqlalchemy>=2.0.0
16
+ Requires-Dist: tqdm>=4.67.1
17
+ Requires-Python: >=3.12
18
+ Description-Content-Type: text/markdown
19
+
20
+ ## ClickHouse backend
21
+
22
+ The DuckDB builder remains the source of truth. Build a DuckDB file with
23
+ `build_merged_duckdb`, then upload its canonical tables into ClickHouse:
24
+
25
+ ```bash
26
+ uv run python scripts/upload_clickhouse.py data/dbs_final/SmallMolecule.duckdb --database normalization
27
+ ```
28
+
29
+ The upload shows a progress bar for each copied table; pass `--no-progress` to
30
+ silence it.
31
+
32
+ Connection settings are read from `.env` with `python-dotenv`; connections use the
33
+ official `clickhouse-connect` client. Set `CH_HTTP`, for example
34
+ `http://host:8123/normalization`; `CH_USER` and `CH_PASSWORD` may be supplied
35
+ separately and override URL credentials.
36
+
37
+ Use the ClickHouse backend from Python:
38
+
39
+ ```python
40
+ from norm_toolkit.v2 import ClickHouseNormalizer
41
+
42
+ normalizer = ClickHouseNormalizer(database="normalization")
43
+ result = normalizer.normalize(["aspirin"], top_k=5)
44
+ ```
45
+
46
+ You can also pass a DSN in code:
47
+
48
+ ```python
49
+ normalizer = ClickHouseNormalizer(
50
+ dsn="http://host:8123/normalization",
51
+ database="normalization",
52
+ )
53
+ ```
@@ -0,0 +1,34 @@
1
+ ## ClickHouse backend
2
+
3
+ The DuckDB builder remains the source of truth. Build a DuckDB file with
4
+ `build_merged_duckdb`, then upload its canonical tables into ClickHouse:
5
+
6
+ ```bash
7
+ uv run python scripts/upload_clickhouse.py data/dbs_final/SmallMolecule.duckdb --database normalization
8
+ ```
9
+
10
+ The upload shows a progress bar for each copied table; pass `--no-progress` to
11
+ silence it.
12
+
13
+ Connection settings are read from `.env` with `python-dotenv`; connections use the
14
+ official `clickhouse-connect` client. Set `CH_HTTP`, for example
15
+ `http://host:8123/normalization`; `CH_USER` and `CH_PASSWORD` may be supplied
16
+ separately and override URL credentials.
17
+
18
+ Use the ClickHouse backend from Python:
19
+
20
+ ```python
21
+ from norm_toolkit.v2 import ClickHouseNormalizer
22
+
23
+ normalizer = ClickHouseNormalizer(database="normalization")
24
+ result = normalizer.normalize(["aspirin"], top_k=5)
25
+ ```
26
+
27
+ You can also pass a DSN in code:
28
+
29
+ ```python
30
+ normalizer = ClickHouseNormalizer(
31
+ dsn="http://host:8123/normalization",
32
+ database="normalization",
33
+ )
34
+ ```
@@ -1,45 +1,51 @@
1
1
  [project]
2
2
  name = "norm_toolkit"
3
- version = "1.8.0"
3
+ version = "1.9.0"
4
4
  description = "Toolkit to normalize text to UMLS / ontologies"
5
5
  readme = "README.md"
6
6
  authors = [{ name = "Haydn Jones", email = "haydnjonest@gmail.com" }]
7
7
  requires-python = ">=3.12"
8
8
  dependencies = [
9
9
  "asyncpg>=0.29.0",
10
- "duckdb>=1.4.3",
11
- "lvg-norm>=1.1.0",
10
+ "clickhouse-connect>=1.0.0",
11
+ "duckdb>=1.5.0",
12
+ "lvg-norm>=1.3.0",
12
13
  "polars[rt64]>=1.36.1",
13
14
  "pyarrow>=20.0.0",
14
15
  "pydantic>=2.12.5",
16
+ "python-dotenv>=1.2.2",
15
17
  "sqlalchemy>=2.0.0",
16
18
  "tqdm>=4.67.1",
17
19
  ]
18
20
 
19
21
  [dependency-groups]
20
22
  dev = [
21
- "datasets>=4.4.1",
22
- "dotenv>=0.9.9",
23
23
  "ipython>=9.8.0",
24
24
  "pytest>=8.3",
25
- "rdkit>=2025.9.3",
26
25
  "ruff>=0.6.9",
27
26
  "fire>=0.7.1",
28
- "joblib>=1.5.3",
27
+ "ipykernel>=7.2.0",
28
+ "ipywidgets>=8.1.8",
29
29
  ]
30
30
 
31
31
  [build-system]
32
- requires = ["uv_build>=0.9.11,<0.10.0"]
32
+ requires = ["uv_build>=0.9.11,<0.13.0"]
33
33
  build-backend = "uv_build"
34
34
 
35
35
  [tool.ruff]
36
36
  line-length = 120
37
37
  indent-width = 4
38
- target-version = "py313"
38
+ target-version = "py312"
39
+ # v1 is a frozen vendored snapshot of the 1.8.0 release; don't lint/format it.
40
+ extend-exclude = ["src/norm_toolkit/v1"]
41
+
42
+ [tool.ty.src]
43
+ # v1 is a frozen vendored snapshot of the 1.8.0 release; don't type-check it.
44
+ exclude = ["src/norm_toolkit/v1"]
39
45
 
40
46
  [tool.ruff.lint]
41
47
  select = ["E", "F", "UP", "B", "SIM", "I", "FURB"]
42
- ignore = ["B905", "E501", "SIM108", "SIM103"]
48
+ ignore = ["E501", "SIM108", "SIM103"]
43
49
  fixable = ["ALL"]
44
50
 
45
51
  [tool.ruff.format]
@@ -0,0 +1,18 @@
1
+ """
2
+ norm_toolkit — biomedical text normalization to UMLS / custom ontologies.
3
+
4
+ This package is versioned into two subpackages:
5
+
6
+ - ``norm_toolkit.v1``: the 1.8.0 API (DuckDB + PostgreSQL backends)
7
+ - ``norm_toolkit.v2``: the current API (DuckDB + ClickHouse backends)
8
+
9
+ The top-level namespace re-exports the **v1** API for backwards compatibility,
10
+ so ``from norm_toolkit import DuckDBNormalizer`` resolves to ``norm_toolkit.v1``.
11
+ New code should import explicitly from ``norm_toolkit.v2`` (or ``norm_toolkit.v1``).
12
+ """
13
+
14
+ from norm_toolkit import v1, v2
15
+ from norm_toolkit.v1 import * # noqa: F403 (default API == v1)
16
+ from norm_toolkit.v1 import __all__ as _v1_all
17
+
18
+ __all__ = ["v1", "v2", *_v1_all]
@@ -21,14 +21,14 @@ Data models:
21
21
  - SemanticType: Semantic type info (UMLS only)
22
22
  """
23
23
 
24
- from norm_toolkit.build_merged import build_merged_duckdb
25
- from norm_toolkit.build_ontology import build_ontology_duckdb
26
- from norm_toolkit.build_umls import build_umls_duckdb
27
- from norm_toolkit.constants import ONTOLOGY_DF_SCHEMA
28
- from norm_toolkit.models import ConceptInfo, SemanticType
29
- from norm_toolkit.normalizer import DuckDBNormalizer
30
- from norm_toolkit.normalizer_postgres import PostgresNormalizer
31
- from norm_toolkit.utils import prepare_ontology_df, push_to_postgres
24
+ from .build_merged import build_merged_duckdb
25
+ from .build_ontology import build_ontology_duckdb
26
+ from .build_umls import build_umls_duckdb
27
+ from .constants import ONTOLOGY_DF_SCHEMA
28
+ from .models import ConceptInfo, SemanticType
29
+ from .normalizer import DuckDBNormalizer
30
+ from .normalizer_postgres import PostgresNormalizer
31
+ from .utils import prepare_ontology_df, push_to_postgres
32
32
 
33
33
  __all__ = [
34
34
  # Build functions
@@ -8,7 +8,7 @@ from __future__ import annotations
8
8
 
9
9
  import polars as pl
10
10
 
11
- from norm_toolkit.build_merged import build_merged_duckdb
11
+ from .build_merged import build_merged_duckdb
12
12
 
13
13
 
14
14
  def build_ontology_duckdb(
@@ -6,7 +6,7 @@ This is a convenience wrapper around build_merged_duckdb for UMLS-only builds.
6
6
 
7
7
  from __future__ import annotations
8
8
 
9
- from norm_toolkit.build_merged import build_merged_duckdb
9
+ from .build_merged import build_merged_duckdb
10
10
 
11
11
 
12
12
  def build_umls_duckdb(
@@ -14,7 +14,7 @@ from textwrap import dedent
14
14
  import duckdb
15
15
  import polars as pl
16
16
 
17
- from norm_toolkit.constants import (
17
+ from .constants import (
18
18
  ATOMS_TABLE,
19
19
  DEFAULT_PREFER_TTYS,
20
20
  DEFS_TABLE,
@@ -24,8 +24,8 @@ from norm_toolkit.constants import (
24
24
  NW_TABLE,
25
25
  TYPES_TABLE,
26
26
  )
27
- from norm_toolkit.models import ConceptInfo
28
- from norm_toolkit.normalizer_utils import (
27
+ from .models import ConceptInfo
28
+ from .normalizer_utils import (
29
29
  apply_concept_name_rows,
30
30
  apply_definition_rows,
31
31
  apply_semantic_type_rows,
@@ -14,7 +14,7 @@ import polars as pl
14
14
  from sqlalchemy import RowMapping, text
15
15
  from sqlalchemy.ext.asyncio import AsyncEngine
16
16
 
17
- from norm_toolkit.constants import (
17
+ from .constants import (
18
18
  ATOMS_TABLE,
19
19
  DEFAULT_PREFER_TTYS,
20
20
  DEFS_TABLE,
@@ -24,9 +24,9 @@ from norm_toolkit.constants import (
24
24
  NW_TABLE,
25
25
  TYPES_TABLE,
26
26
  )
27
- from norm_toolkit.models import ConceptInfo
28
- from norm_toolkit.normalizer_cache import ExpansionCache, NormalizerCache
29
- from norm_toolkit.normalizer_utils import (
27
+ from .models import ConceptInfo
28
+ from .normalizer_cache import ExpansionCache, NormalizerCache
29
+ from .normalizer_utils import (
30
30
  apply_concept_name_rows,
31
31
  apply_definition_rows,
32
32
  apply_semantic_type_rows,
@@ -11,14 +11,14 @@ from typing import Any, Literal
11
11
  from lvg_norm import lvg_normalize
12
12
  from sqlalchemy import RowMapping
13
13
 
14
- from norm_toolkit.constants import (
14
+ from .constants import (
15
15
  EXACT_BUMP,
16
16
  ISPREF_WEIGHT,
17
17
  RANK_MULTIPLIER,
18
18
  STT_WEIGHT,
19
19
  TTY_WEIGHT,
20
20
  )
21
- from norm_toolkit.models import ConceptInfo, SemanticType
21
+ from .models import ConceptInfo, SemanticType
22
22
 
23
23
 
24
24
  def _coerce_synonyms_list(
@@ -11,7 +11,7 @@ import polars as pl
11
11
  from lvg_norm import lvg_normalize
12
12
  from tqdm import tqdm
13
13
 
14
- from norm_toolkit.constants import (
14
+ from .constants import (
15
15
  ATOMS_TABLE,
16
16
  CONCEPTS_TABLE,
17
17
  DEFS_TABLE,
@@ -0,0 +1,44 @@
1
+ """
2
+ Unified normalization package.
3
+
4
+ Provides normalizer implementations that work with UMLS, ontology,
5
+ and merged databases using a standardized schema.
6
+
7
+ Build functions:
8
+ - build_merged_duckdb: Build a DuckDB database from UMLS and/or ontology data
9
+
10
+ Normalizers:
11
+ - DuckDBNormalizer: High-throughput sync normalizer for DuckDB (batch processing)
12
+ - ClickHouseNormalizer: Normalizer backed by ClickHouse tables uploaded from a DuckDB build
13
+
14
+ ClickHouse:
15
+ - upload_duckdb_to_clickhouse: Upload a norm_toolkit DuckDB database into ClickHouse
16
+ - clickhouse_client_from_env: Build a ClickHouse client from .env / environment settings
17
+
18
+ Data models:
19
+ - ConceptInfo: Unified concept metadata
20
+ - SemanticType: Semantic type info (UMLS only)
21
+ """
22
+
23
+ from .build_merged import build_merged_duckdb
24
+ from .clickhouse_backend import ClickHouseNormalizer
25
+ from .clickhouse_common import clickhouse_client_from_env
26
+ from .clickhouse_upload import upload_duckdb_to_clickhouse
27
+ from .duckdb_backend import DuckDBNormalizer
28
+ from .schema import ONTOLOGY_DF_SCHEMA, ConceptInfo, SemanticType
29
+
30
+ __all__ = [
31
+ # Build functions
32
+ "build_merged_duckdb",
33
+ "upload_duckdb_to_clickhouse",
34
+ # Normalizers
35
+ "ClickHouseNormalizer",
36
+ "DuckDBNormalizer",
37
+ # ClickHouse
38
+ "clickhouse_client_from_env",
39
+ # Models
40
+ "ConceptInfo",
41
+ "SemanticType",
42
+ # Schemas
43
+ "ONTOLOGY_DF_SCHEMA",
44
+ ]