norm_toolkit 1.8.0__tar.gz → 1.9.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- norm_toolkit-1.9.1/PKG-INFO +53 -0
- norm_toolkit-1.9.1/README.md +34 -0
- {norm_toolkit-1.8.0 → norm_toolkit-1.9.1}/pyproject.toml +16 -10
- norm_toolkit-1.9.1/src/norm_toolkit/__init__.py +18 -0
- {norm_toolkit-1.8.0/src/norm_toolkit → norm_toolkit-1.9.1/src/norm_toolkit/v1}/__init__.py +8 -8
- {norm_toolkit-1.8.0/src/norm_toolkit → norm_toolkit-1.9.1/src/norm_toolkit/v1}/build_ontology.py +1 -1
- {norm_toolkit-1.8.0/src/norm_toolkit → norm_toolkit-1.9.1/src/norm_toolkit/v1}/build_umls.py +1 -1
- {norm_toolkit-1.8.0/src/norm_toolkit → norm_toolkit-1.9.1/src/norm_toolkit/v1}/normalizer.py +3 -3
- {norm_toolkit-1.8.0/src/norm_toolkit → norm_toolkit-1.9.1/src/norm_toolkit/v1}/normalizer_postgres.py +4 -4
- {norm_toolkit-1.8.0/src/norm_toolkit → norm_toolkit-1.9.1/src/norm_toolkit/v1}/normalizer_utils.py +2 -2
- {norm_toolkit-1.8.0/src/norm_toolkit → norm_toolkit-1.9.1/src/norm_toolkit/v1}/utils.py +1 -1
- norm_toolkit-1.9.1/src/norm_toolkit/v2/__init__.py +44 -0
- norm_toolkit-1.9.1/src/norm_toolkit/v2/build_merged.py +1249 -0
- norm_toolkit-1.9.1/src/norm_toolkit/v2/clickhouse_backend.py +792 -0
- norm_toolkit-1.9.1/src/norm_toolkit/v2/clickhouse_common.py +84 -0
- norm_toolkit-1.9.1/src/norm_toolkit/v2/clickhouse_upload.py +148 -0
- norm_toolkit-1.9.1/src/norm_toolkit/v2/duckdb_backend.py +436 -0
- norm_toolkit-1.9.1/src/norm_toolkit/v2/normalizer_base.py +485 -0
- norm_toolkit-1.9.1/src/norm_toolkit/v2/normalizer_utils.py +589 -0
- norm_toolkit-1.9.1/src/norm_toolkit/v2/schema.py +242 -0
- norm_toolkit-1.8.0/PKG-INFO +0 -17
- norm_toolkit-1.8.0/README.md +0 -0
- {norm_toolkit-1.8.0/src/norm_toolkit → norm_toolkit-1.9.1/src/norm_toolkit/v1}/build_merged.py +0 -0
- {norm_toolkit-1.8.0/src/norm_toolkit → norm_toolkit-1.9.1/src/norm_toolkit/v1}/constants.py +0 -0
- {norm_toolkit-1.8.0/src/norm_toolkit → norm_toolkit-1.9.1/src/norm_toolkit/v1}/models.py +0 -0
- {norm_toolkit-1.8.0/src/norm_toolkit → norm_toolkit-1.9.1/src/norm_toolkit/v1}/normalizer_cache.py +0 -0
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: norm_toolkit
|
|
3
|
+
Version: 1.9.1
|
|
4
|
+
Summary: Toolkit to normalize text to UMLS / ontologies
|
|
5
|
+
Author: Haydn Jones
|
|
6
|
+
Author-email: Haydn Jones <haydnjonest@gmail.com>
|
|
7
|
+
Requires-Dist: asyncpg>=0.29.0
|
|
8
|
+
Requires-Dist: clickhouse-connect>=1.0.0
|
|
9
|
+
Requires-Dist: duckdb>=1.5.0
|
|
10
|
+
Requires-Dist: lvg-norm>=1.3.0
|
|
11
|
+
Requires-Dist: polars[rt64]>=1.36.1
|
|
12
|
+
Requires-Dist: pyarrow>=20.0.0
|
|
13
|
+
Requires-Dist: pydantic>=2.12.5
|
|
14
|
+
Requires-Dist: python-dotenv>=1.2.2
|
|
15
|
+
Requires-Dist: sqlalchemy>=2.0.0
|
|
16
|
+
Requires-Dist: tqdm>=4.67.1
|
|
17
|
+
Requires-Python: >=3.12
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
|
|
20
|
+
## ClickHouse backend
|
|
21
|
+
|
|
22
|
+
The DuckDB builder remains the source of truth. Build a DuckDB file with
|
|
23
|
+
`build_merged_duckdb`, then upload its canonical tables into ClickHouse:
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
uv run python scripts/upload_clickhouse.py data/dbs_final/SmallMolecule.duckdb --database normalization
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
The upload shows a progress bar for each copied table; pass `--no-progress` to
|
|
30
|
+
silence it.
|
|
31
|
+
|
|
32
|
+
Connection settings are read from `.env` with `python-dotenv` and use the
|
|
33
|
+
official `clickhouse-connect` client. Set `CH_HTTP`, for example
|
|
34
|
+
`http://host:8123/normalization`; `CH_USER` and `CH_PASSWORD` may be supplied
|
|
35
|
+
separately and override URL credentials.
|
|
36
|
+
|
|
37
|
+
Use the ClickHouse backend from Python:
|
|
38
|
+
|
|
39
|
+
```python
|
|
40
|
+
from norm_toolkit.v2 import ClickHouseNormalizer
|
|
41
|
+
|
|
42
|
+
normalizer = ClickHouseNormalizer(database="normalization")
|
|
43
|
+
result = normalizer.normalize(["aspirin"], top_k=5)
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
You can also pass a DSN in code:
|
|
47
|
+
|
|
48
|
+
```python
|
|
49
|
+
normalizer = ClickHouseNormalizer(
|
|
50
|
+
dsn="http://host:8123/normalization",
|
|
51
|
+
database="normalization",
|
|
52
|
+
)
|
|
53
|
+
```
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
## ClickHouse backend
|
|
2
|
+
|
|
3
|
+
The DuckDB builder remains the source of truth. Build a DuckDB file with
|
|
4
|
+
`build_merged_duckdb`, then upload its canonical tables into ClickHouse:
|
|
5
|
+
|
|
6
|
+
```bash
|
|
7
|
+
uv run python scripts/upload_clickhouse.py data/dbs_final/SmallMolecule.duckdb --database normalization
|
|
8
|
+
```
|
|
9
|
+
|
|
10
|
+
The upload shows a progress bar for each copied table; pass `--no-progress` to
|
|
11
|
+
silence it.
|
|
12
|
+
|
|
13
|
+
Connection settings are read from `.env` with `python-dotenv` and use the
|
|
14
|
+
official `clickhouse-connect` client. Set `CH_HTTP`, for example
|
|
15
|
+
`http://host:8123/normalization`; `CH_USER` and `CH_PASSWORD` may be supplied
|
|
16
|
+
separately and override URL credentials.
|
|
17
|
+
|
|
18
|
+
Use the ClickHouse backend from Python:
|
|
19
|
+
|
|
20
|
+
```python
|
|
21
|
+
from norm_toolkit.v2 import ClickHouseNormalizer
|
|
22
|
+
|
|
23
|
+
normalizer = ClickHouseNormalizer(database="normalization")
|
|
24
|
+
result = normalizer.normalize(["aspirin"], top_k=5)
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
You can also pass a DSN in code:
|
|
28
|
+
|
|
29
|
+
```python
|
|
30
|
+
normalizer = ClickHouseNormalizer(
|
|
31
|
+
dsn="http://host:8123/normalization",
|
|
32
|
+
database="normalization",
|
|
33
|
+
)
|
|
34
|
+
```
|
|
@@ -1,45 +1,51 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "norm_toolkit"
|
|
3
|
-
version = "1.8.0"
|
|
3
|
+
version = "1.9.1"
|
|
4
4
|
description = "Toolkit to normalize text to UMLS / ontologies"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
authors = [{ name = "Haydn Jones", email = "haydnjonest@gmail.com" }]
|
|
7
7
|
requires-python = ">=3.12"
|
|
8
8
|
dependencies = [
|
|
9
9
|
"asyncpg>=0.29.0",
|
|
10
|
-
"
|
|
11
|
-
"
|
|
10
|
+
"clickhouse-connect>=1.0.0",
|
|
11
|
+
"duckdb>=1.5.0",
|
|
12
|
+
"lvg-norm>=1.3.0",
|
|
12
13
|
"polars[rt64]>=1.36.1",
|
|
13
14
|
"pyarrow>=20.0.0",
|
|
14
15
|
"pydantic>=2.12.5",
|
|
16
|
+
"python-dotenv>=1.2.2",
|
|
15
17
|
"sqlalchemy>=2.0.0",
|
|
16
18
|
"tqdm>=4.67.1",
|
|
17
19
|
]
|
|
18
20
|
|
|
19
21
|
[dependency-groups]
|
|
20
22
|
dev = [
|
|
21
|
-
"datasets>=4.4.1",
|
|
22
|
-
"dotenv>=0.9.9",
|
|
23
23
|
"ipython>=9.8.0",
|
|
24
24
|
"pytest>=8.3",
|
|
25
|
-
"rdkit>=2025.9.3",
|
|
26
25
|
"ruff>=0.6.9",
|
|
27
26
|
"fire>=0.7.1",
|
|
28
|
-
"
|
|
27
|
+
"ipykernel>=7.2.0",
|
|
28
|
+
"ipywidgets>=8.1.8",
|
|
29
29
|
]
|
|
30
30
|
|
|
31
31
|
[build-system]
|
|
32
|
-
requires = ["uv_build>=0.9.11,<0.
|
|
32
|
+
requires = ["uv_build>=0.9.11,<0.13.0"]
|
|
33
33
|
build-backend = "uv_build"
|
|
34
34
|
|
|
35
35
|
[tool.ruff]
|
|
36
36
|
line-length = 120
|
|
37
37
|
indent-width = 4
|
|
38
|
-
target-version = "
|
|
38
|
+
target-version = "py312"
|
|
39
|
+
# v1 is a frozen vendored snapshot of the 1.8.0 release; don't lint/format it.
|
|
40
|
+
extend-exclude = ["src/norm_toolkit/v1"]
|
|
41
|
+
|
|
42
|
+
[tool.ty.src]
|
|
43
|
+
# v1 is a frozen vendored snapshot of the 1.8.0 release; don't type-check it.
|
|
44
|
+
exclude = ["src/norm_toolkit/v1"]
|
|
39
45
|
|
|
40
46
|
[tool.ruff.lint]
|
|
41
47
|
select = ["E", "F", "UP", "B", "SIM", "I", "FURB"]
|
|
42
|
-
ignore = ["
|
|
48
|
+
ignore = ["E501", "SIM108", "SIM103"]
|
|
43
49
|
fixable = ["ALL"]
|
|
44
50
|
|
|
45
51
|
[tool.ruff.format]
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""
|
|
2
|
+
norm_toolkit — biomedical text normalization to UMLS / custom ontologies.
|
|
3
|
+
|
|
4
|
+
This package is versioned into two subpackages:
|
|
5
|
+
|
|
6
|
+
- ``norm_toolkit.v1``: the 1.8.0 API (DuckDB + PostgreSQL backends)
|
|
7
|
+
- ``norm_toolkit.v2``: the current API (DuckDB + ClickHouse backends)
|
|
8
|
+
|
|
9
|
+
The top-level namespace re-exports the **v1** API for backwards compatibility,
|
|
10
|
+
so ``from norm_toolkit import DuckDBNormalizer`` resolves to ``norm_toolkit.v1``.
|
|
11
|
+
New code should import explicitly from ``norm_toolkit.v2`` (or ``norm_toolkit.v1``).
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from norm_toolkit import v1, v2
|
|
15
|
+
from norm_toolkit.v1 import * # noqa: F403 (default API == v1)
|
|
16
|
+
from norm_toolkit.v1 import __all__ as _v1_all
|
|
17
|
+
|
|
18
|
+
__all__ = ["v1", "v2", *_v1_all]
|
|
@@ -21,14 +21,14 @@ Data models:
|
|
|
21
21
|
- SemanticType: Semantic type info (UMLS only)
|
|
22
22
|
"""
|
|
23
23
|
|
|
24
|
-
from
|
|
25
|
-
from
|
|
26
|
-
from
|
|
27
|
-
from
|
|
28
|
-
from
|
|
29
|
-
from
|
|
30
|
-
from
|
|
31
|
-
from
|
|
24
|
+
from .build_merged import build_merged_duckdb
|
|
25
|
+
from .build_ontology import build_ontology_duckdb
|
|
26
|
+
from .build_umls import build_umls_duckdb
|
|
27
|
+
from .constants import ONTOLOGY_DF_SCHEMA
|
|
28
|
+
from .models import ConceptInfo, SemanticType
|
|
29
|
+
from .normalizer import DuckDBNormalizer
|
|
30
|
+
from .normalizer_postgres import PostgresNormalizer
|
|
31
|
+
from .utils import prepare_ontology_df, push_to_postgres
|
|
32
32
|
|
|
33
33
|
__all__ = [
|
|
34
34
|
# Build functions
|
{norm_toolkit-1.8.0/src/norm_toolkit → norm_toolkit-1.9.1/src/norm_toolkit/v1}/normalizer.py
RENAMED
|
@@ -14,7 +14,7 @@ from textwrap import dedent
|
|
|
14
14
|
import duckdb
|
|
15
15
|
import polars as pl
|
|
16
16
|
|
|
17
|
-
from norm_toolkit.constants import (
|
|
17
|
+
from .constants import (
|
|
18
18
|
ATOMS_TABLE,
|
|
19
19
|
DEFAULT_PREFER_TTYS,
|
|
20
20
|
DEFS_TABLE,
|
|
@@ -24,8 +24,8 @@ from norm_toolkit.constants import (
|
|
|
24
24
|
NW_TABLE,
|
|
25
25
|
TYPES_TABLE,
|
|
26
26
|
)
|
|
27
|
-
from
|
|
28
|
-
from
|
|
27
|
+
from .models import ConceptInfo
|
|
28
|
+
from .normalizer_utils import (
|
|
29
29
|
apply_concept_name_rows,
|
|
30
30
|
apply_definition_rows,
|
|
31
31
|
apply_semantic_type_rows,
|
|
@@ -14,7 +14,7 @@ import polars as pl
|
|
|
14
14
|
from sqlalchemy import RowMapping, text
|
|
15
15
|
from sqlalchemy.ext.asyncio import AsyncEngine
|
|
16
16
|
|
|
17
|
-
from norm_toolkit.constants import (
|
|
17
|
+
from .constants import (
|
|
18
18
|
ATOMS_TABLE,
|
|
19
19
|
DEFAULT_PREFER_TTYS,
|
|
20
20
|
DEFS_TABLE,
|
|
@@ -24,9 +24,9 @@ from norm_toolkit.constants import (
|
|
|
24
24
|
NW_TABLE,
|
|
25
25
|
TYPES_TABLE,
|
|
26
26
|
)
|
|
27
|
-
from
|
|
28
|
-
from
|
|
29
|
-
from
|
|
27
|
+
from .models import ConceptInfo
|
|
28
|
+
from .normalizer_cache import ExpansionCache, NormalizerCache
|
|
29
|
+
from .normalizer_utils import (
|
|
30
30
|
apply_concept_name_rows,
|
|
31
31
|
apply_definition_rows,
|
|
32
32
|
apply_semantic_type_rows,
|
{norm_toolkit-1.8.0/src/norm_toolkit → norm_toolkit-1.9.1/src/norm_toolkit/v1}/normalizer_utils.py
RENAMED
|
@@ -11,14 +11,14 @@ from typing import Any, Literal
|
|
|
11
11
|
from lvg_norm import lvg_normalize
|
|
12
12
|
from sqlalchemy import RowMapping
|
|
13
13
|
|
|
14
|
-
from
|
|
14
|
+
from .constants import (
|
|
15
15
|
EXACT_BUMP,
|
|
16
16
|
ISPREF_WEIGHT,
|
|
17
17
|
RANK_MULTIPLIER,
|
|
18
18
|
STT_WEIGHT,
|
|
19
19
|
TTY_WEIGHT,
|
|
20
20
|
)
|
|
21
|
-
from
|
|
21
|
+
from .models import ConceptInfo, SemanticType
|
|
22
22
|
|
|
23
23
|
|
|
24
24
|
def _coerce_synonyms_list(
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Unified normalization package.
|
|
3
|
+
|
|
4
|
+
Provides normalizer implementations that work with UMLS, ontology,
|
|
5
|
+
and merged databases using a standardized schema.
|
|
6
|
+
|
|
7
|
+
Build functions:
|
|
8
|
+
- build_merged_duckdb: Build a DuckDB database from UMLS and/or ontology data
|
|
9
|
+
|
|
10
|
+
Normalizers:
|
|
11
|
+
- DuckDBNormalizer: High-throughput sync normalizer for DuckDB (batch processing)
|
|
12
|
+
- ClickHouseNormalizer: Normalizer backed by ClickHouse tables uploaded from a DuckDB build
|
|
13
|
+
|
|
14
|
+
ClickHouse:
|
|
15
|
+
- upload_duckdb_to_clickhouse: Upload a norm_toolkit DuckDB database into ClickHouse
|
|
16
|
+
- clickhouse_client_from_env: Build a ClickHouse client from .env / environment settings
|
|
17
|
+
|
|
18
|
+
Data models:
|
|
19
|
+
- ConceptInfo: Unified concept metadata
|
|
20
|
+
- SemanticType: Semantic type info (UMLS only)
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
from .build_merged import build_merged_duckdb
|
|
24
|
+
from .clickhouse_backend import ClickHouseNormalizer
|
|
25
|
+
from .clickhouse_common import clickhouse_client_from_env
|
|
26
|
+
from .clickhouse_upload import upload_duckdb_to_clickhouse
|
|
27
|
+
from .duckdb_backend import DuckDBNormalizer
|
|
28
|
+
from .schema import ONTOLOGY_DF_SCHEMA, ConceptInfo, SemanticType
|
|
29
|
+
|
|
30
|
+
__all__ = [
|
|
31
|
+
# Build functions
|
|
32
|
+
"build_merged_duckdb",
|
|
33
|
+
"upload_duckdb_to_clickhouse",
|
|
34
|
+
# Normalizers
|
|
35
|
+
"ClickHouseNormalizer",
|
|
36
|
+
"DuckDBNormalizer",
|
|
37
|
+
# ClickHouse
|
|
38
|
+
"clickhouse_client_from_env",
|
|
39
|
+
# Models
|
|
40
|
+
"ConceptInfo",
|
|
41
|
+
"SemanticType",
|
|
42
|
+
# Schemas
|
|
43
|
+
"ONTOLOGY_DF_SCHEMA",
|
|
44
|
+
]
|