norm_toolkit 1.7.0__tar.gz → 1.9.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- norm_toolkit-1.9.0/PKG-INFO +53 -0
- norm_toolkit-1.9.0/README.md +34 -0
- {norm_toolkit-1.7.0 → norm_toolkit-1.9.0}/pyproject.toml +16 -10
- norm_toolkit-1.9.0/src/norm_toolkit/__init__.py +18 -0
- {norm_toolkit-1.7.0/src/norm_toolkit → norm_toolkit-1.9.0/src/norm_toolkit/v1}/__init__.py +8 -8
- {norm_toolkit-1.7.0/src/norm_toolkit → norm_toolkit-1.9.0/src/norm_toolkit/v1}/build_ontology.py +1 -1
- {norm_toolkit-1.7.0/src/norm_toolkit → norm_toolkit-1.9.0/src/norm_toolkit/v1}/build_umls.py +1 -1
- {norm_toolkit-1.7.0/src/norm_toolkit → norm_toolkit-1.9.0/src/norm_toolkit/v1}/normalizer.py +3 -3
- {norm_toolkit-1.7.0/src/norm_toolkit → norm_toolkit-1.9.0/src/norm_toolkit/v1}/normalizer_postgres.py +106 -44
- {norm_toolkit-1.7.0/src/norm_toolkit → norm_toolkit-1.9.0/src/norm_toolkit/v1}/normalizer_utils.py +2 -2
- {norm_toolkit-1.7.0/src/norm_toolkit → norm_toolkit-1.9.0/src/norm_toolkit/v1}/utils.py +1 -1
- norm_toolkit-1.9.0/src/norm_toolkit/v2/__init__.py +44 -0
- norm_toolkit-1.9.0/src/norm_toolkit/v2/build_merged.py +1249 -0
- norm_toolkit-1.9.0/src/norm_toolkit/v2/clickhouse_backend.py +768 -0
- norm_toolkit-1.9.0/src/norm_toolkit/v2/clickhouse_common.py +84 -0
- norm_toolkit-1.9.0/src/norm_toolkit/v2/clickhouse_upload.py +148 -0
- norm_toolkit-1.9.0/src/norm_toolkit/v2/duckdb_backend.py +413 -0
- norm_toolkit-1.9.0/src/norm_toolkit/v2/normalizer_base.py +396 -0
- norm_toolkit-1.9.0/src/norm_toolkit/v2/normalizer_utils.py +546 -0
- norm_toolkit-1.9.0/src/norm_toolkit/v2/schema.py +239 -0
- norm_toolkit-1.7.0/PKG-INFO +0 -17
- norm_toolkit-1.7.0/README.md +0 -0
- {norm_toolkit-1.7.0/src/norm_toolkit → norm_toolkit-1.9.0/src/norm_toolkit/v1}/build_merged.py +0 -0
- {norm_toolkit-1.7.0/src/norm_toolkit → norm_toolkit-1.9.0/src/norm_toolkit/v1}/constants.py +0 -0
- {norm_toolkit-1.7.0/src/norm_toolkit → norm_toolkit-1.9.0/src/norm_toolkit/v1}/models.py +0 -0
- {norm_toolkit-1.7.0/src/norm_toolkit → norm_toolkit-1.9.0/src/norm_toolkit/v1}/normalizer_cache.py +0 -0
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: norm_toolkit
|
|
3
|
+
Version: 1.9.0
|
|
4
|
+
Summary: Toolkit to normalize text to UMLS / ontologies
|
|
5
|
+
Author: Haydn Jones
|
|
6
|
+
Author-email: Haydn Jones <haydnjonest@gmail.com>
|
|
7
|
+
Requires-Dist: asyncpg>=0.29.0
|
|
8
|
+
Requires-Dist: clickhouse-connect>=1.0.0
|
|
9
|
+
Requires-Dist: duckdb>=1.5.0
|
|
10
|
+
Requires-Dist: lvg-norm>=1.3.0
|
|
11
|
+
Requires-Dist: polars[rt64]>=1.36.1
|
|
12
|
+
Requires-Dist: pyarrow>=20.0.0
|
|
13
|
+
Requires-Dist: pydantic>=2.12.5
|
|
14
|
+
Requires-Dist: python-dotenv>=1.2.2
|
|
15
|
+
Requires-Dist: sqlalchemy>=2.0.0
|
|
16
|
+
Requires-Dist: tqdm>=4.67.1
|
|
17
|
+
Requires-Python: >=3.12
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
|
|
20
|
+
## ClickHouse backend
|
|
21
|
+
|
|
22
|
+
The DuckDB builder remains the source of truth. Build a DuckDB file with
|
|
23
|
+
`build_merged_duckdb`, then upload its canonical tables into ClickHouse:
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
uv run python scripts/upload_clickhouse.py data/dbs_final/SmallMolecule.duckdb --database normalization
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
The upload shows a progress bar for each copied table; pass `--no-progress` to
|
|
30
|
+
silence it.
|
|
31
|
+
|
|
32
|
+
Connection settings are read from `.env` with `python-dotenv` and use the
|
|
33
|
+
official `clickhouse-connect` client. Set `CH_HTTP`, for example
|
|
34
|
+
`http://host:8123/normalization`; `CH_USER` and `CH_PASSWORD` may be supplied
|
|
35
|
+
separately and override URL credentials.
|
|
36
|
+
|
|
37
|
+
Use the ClickHouse backend from Python:
|
|
38
|
+
|
|
39
|
+
```python
|
|
40
|
+
from norm_toolkit.v2 import ClickHouseNormalizer
|
|
41
|
+
|
|
42
|
+
normalizer = ClickHouseNormalizer(database="normalization")
|
|
43
|
+
result = normalizer.normalize(["aspirin"], top_k=5)
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
You can also pass a DSN in code:
|
|
47
|
+
|
|
48
|
+
```python
|
|
49
|
+
normalizer = ClickHouseNormalizer(
|
|
50
|
+
dsn="http://host:8123/normalization",
|
|
51
|
+
database="normalization",
|
|
52
|
+
)
|
|
53
|
+
```
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
## ClickHouse backend
|
|
2
|
+
|
|
3
|
+
The DuckDB builder remains the source of truth. Build a DuckDB file with
|
|
4
|
+
`build_merged_duckdb`, then upload its canonical tables into ClickHouse:
|
|
5
|
+
|
|
6
|
+
```bash
|
|
7
|
+
uv run python scripts/upload_clickhouse.py data/dbs_final/SmallMolecule.duckdb --database normalization
|
|
8
|
+
```
|
|
9
|
+
|
|
10
|
+
The upload shows a progress bar for each copied table; pass `--no-progress` to
|
|
11
|
+
silence it.
|
|
12
|
+
|
|
13
|
+
Connection settings are read from `.env` with `python-dotenv` and use the
|
|
14
|
+
official `clickhouse-connect` client. Set `CH_HTTP`, for example
|
|
15
|
+
`http://host:8123/normalization`; `CH_USER` and `CH_PASSWORD` may be supplied
|
|
16
|
+
separately and override URL credentials.
|
|
17
|
+
|
|
18
|
+
Use the ClickHouse backend from Python:
|
|
19
|
+
|
|
20
|
+
```python
|
|
21
|
+
from norm_toolkit.v2 import ClickHouseNormalizer
|
|
22
|
+
|
|
23
|
+
normalizer = ClickHouseNormalizer(database="normalization")
|
|
24
|
+
result = normalizer.normalize(["aspirin"], top_k=5)
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
You can also pass a DSN in code:
|
|
28
|
+
|
|
29
|
+
```python
|
|
30
|
+
normalizer = ClickHouseNormalizer(
|
|
31
|
+
dsn="http://host:8123/normalization",
|
|
32
|
+
database="normalization",
|
|
33
|
+
)
|
|
34
|
+
```
|
|
@@ -1,45 +1,51 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "norm_toolkit"
|
|
3
|
-
version = "1.7.0"
|
|
3
|
+
version = "1.9.0"
|
|
4
4
|
description = "Toolkit to normalize text to UMLS / ontologies"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
authors = [{ name = "Haydn Jones", email = "haydnjonest@gmail.com" }]
|
|
7
7
|
requires-python = ">=3.12"
|
|
8
8
|
dependencies = [
|
|
9
9
|
"asyncpg>=0.29.0",
|
|
10
|
-
"
|
|
11
|
-
"
|
|
10
|
+
"clickhouse-connect>=1.0.0",
|
|
11
|
+
"duckdb>=1.5.0",
|
|
12
|
+
"lvg-norm>=1.3.0",
|
|
12
13
|
"polars[rt64]>=1.36.1",
|
|
13
14
|
"pyarrow>=20.0.0",
|
|
14
15
|
"pydantic>=2.12.5",
|
|
16
|
+
"python-dotenv>=1.2.2",
|
|
15
17
|
"sqlalchemy>=2.0.0",
|
|
16
18
|
"tqdm>=4.67.1",
|
|
17
19
|
]
|
|
18
20
|
|
|
19
21
|
[dependency-groups]
|
|
20
22
|
dev = [
|
|
21
|
-
"datasets>=4.4.1",
|
|
22
|
-
"dotenv>=0.9.9",
|
|
23
23
|
"ipython>=9.8.0",
|
|
24
24
|
"pytest>=8.3",
|
|
25
|
-
"rdkit>=2025.9.3",
|
|
26
25
|
"ruff>=0.6.9",
|
|
27
26
|
"fire>=0.7.1",
|
|
28
|
-
"
|
|
27
|
+
"ipykernel>=7.2.0",
|
|
28
|
+
"ipywidgets>=8.1.8",
|
|
29
29
|
]
|
|
30
30
|
|
|
31
31
|
[build-system]
|
|
32
|
-
requires = ["uv_build>=0.9.11,<0.
|
|
32
|
+
requires = ["uv_build>=0.9.11,<0.13.0"]
|
|
33
33
|
build-backend = "uv_build"
|
|
34
34
|
|
|
35
35
|
[tool.ruff]
|
|
36
36
|
line-length = 120
|
|
37
37
|
indent-width = 4
|
|
38
|
-
target-version = "
|
|
38
|
+
target-version = "py312"
|
|
39
|
+
# v1 is a frozen vendored snapshot of the 1.8.0 release; don't lint/format it.
|
|
40
|
+
extend-exclude = ["src/norm_toolkit/v1"]
|
|
41
|
+
|
|
42
|
+
[tool.ty.src]
|
|
43
|
+
# v1 is a frozen vendored snapshot of the 1.8.0 release; don't type-check it.
|
|
44
|
+
exclude = ["src/norm_toolkit/v1"]
|
|
39
45
|
|
|
40
46
|
[tool.ruff.lint]
|
|
41
47
|
select = ["E", "F", "UP", "B", "SIM", "I", "FURB"]
|
|
42
|
-
ignore = ["
|
|
48
|
+
ignore = ["E501", "SIM108", "SIM103"]
|
|
43
49
|
fixable = ["ALL"]
|
|
44
50
|
|
|
45
51
|
[tool.ruff.format]
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""
|
|
2
|
+
norm_toolkit — biomedical text normalization to UMLS / custom ontologies.
|
|
3
|
+
|
|
4
|
+
This package is versioned into two subpackages:
|
|
5
|
+
|
|
6
|
+
- ``norm_toolkit.v1``: the 1.8.0 API (DuckDB + PostgreSQL backends)
|
|
7
|
+
- ``norm_toolkit.v2``: the current API (DuckDB + ClickHouse backends)
|
|
8
|
+
|
|
9
|
+
The top-level namespace re-exports the **v1** API for backwards compatibility,
|
|
10
|
+
so ``from norm_toolkit import DuckDBNormalizer`` resolves to ``norm_toolkit.v1``.
|
|
11
|
+
New code should import explicitly from ``norm_toolkit.v2`` (or ``norm_toolkit.v1``).
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from norm_toolkit import v1, v2
|
|
15
|
+
from norm_toolkit.v1 import * # noqa: F403 (default API == v1)
|
|
16
|
+
from norm_toolkit.v1 import __all__ as _v1_all
|
|
17
|
+
|
|
18
|
+
__all__ = ["v1", "v2", *_v1_all]
|
|
@@ -21,14 +21,14 @@ Data models:
|
|
|
21
21
|
- SemanticType: Semantic type info (UMLS only)
|
|
22
22
|
"""
|
|
23
23
|
|
|
24
|
-
from
|
|
25
|
-
from
|
|
26
|
-
from
|
|
27
|
-
from
|
|
28
|
-
from
|
|
29
|
-
from
|
|
30
|
-
from
|
|
31
|
-
from
|
|
24
|
+
from .build_merged import build_merged_duckdb
|
|
25
|
+
from .build_ontology import build_ontology_duckdb
|
|
26
|
+
from .build_umls import build_umls_duckdb
|
|
27
|
+
from .constants import ONTOLOGY_DF_SCHEMA
|
|
28
|
+
from .models import ConceptInfo, SemanticType
|
|
29
|
+
from .normalizer import DuckDBNormalizer
|
|
30
|
+
from .normalizer_postgres import PostgresNormalizer
|
|
31
|
+
from .utils import prepare_ontology_df, push_to_postgres
|
|
32
32
|
|
|
33
33
|
__all__ = [
|
|
34
34
|
# Build functions
|
{norm_toolkit-1.7.0/src/norm_toolkit → norm_toolkit-1.9.0/src/norm_toolkit/v1}/normalizer.py
RENAMED
|
@@ -14,7 +14,7 @@ from textwrap import dedent
|
|
|
14
14
|
import duckdb
|
|
15
15
|
import polars as pl
|
|
16
16
|
|
|
17
|
-
from norm_toolkit.constants import (
|
|
17
|
+
from .constants import (
|
|
18
18
|
ATOMS_TABLE,
|
|
19
19
|
DEFAULT_PREFER_TTYS,
|
|
20
20
|
DEFS_TABLE,
|
|
@@ -24,8 +24,8 @@ from norm_toolkit.constants import (
|
|
|
24
24
|
NW_TABLE,
|
|
25
25
|
TYPES_TABLE,
|
|
26
26
|
)
|
|
27
|
-
from norm_toolkit.models import ConceptInfo
|
|
28
|
-
from norm_toolkit.normalizer_utils import (
|
|
27
|
+
from .models import ConceptInfo
|
|
28
|
+
from .normalizer_utils import (
|
|
29
29
|
apply_concept_name_rows,
|
|
30
30
|
apply_definition_rows,
|
|
31
31
|
apply_semantic_type_rows,
|
|
@@ -14,7 +14,7 @@ import polars as pl
|
|
|
14
14
|
from sqlalchemy import RowMapping, text
|
|
15
15
|
from sqlalchemy.ext.asyncio import AsyncEngine
|
|
16
16
|
|
|
17
|
-
from norm_toolkit.constants import (
|
|
17
|
+
from .constants import (
|
|
18
18
|
ATOMS_TABLE,
|
|
19
19
|
DEFAULT_PREFER_TTYS,
|
|
20
20
|
DEFS_TABLE,
|
|
@@ -24,9 +24,9 @@ from norm_toolkit.constants import (
|
|
|
24
24
|
NW_TABLE,
|
|
25
25
|
TYPES_TABLE,
|
|
26
26
|
)
|
|
27
|
-
from norm_toolkit.models import ConceptInfo
|
|
28
|
-
from norm_toolkit.normalizer_cache import ExpansionCache, NormalizerCache
|
|
29
|
-
from norm_toolkit.normalizer_utils import (
|
|
27
|
+
from .models import ConceptInfo
|
|
28
|
+
from .normalizer_cache import ExpansionCache, NormalizerCache
|
|
29
|
+
from .normalizer_utils import (
|
|
30
30
|
apply_concept_name_rows,
|
|
31
31
|
apply_definition_rows,
|
|
32
32
|
apply_semantic_type_rows,
|
|
@@ -602,72 +602,134 @@ class PostgresNormalizer:
|
|
|
602
602
|
List of descendant concept IDs ordered by depth (shallowest first),
|
|
603
603
|
excludes the starting concept
|
|
604
604
|
"""
|
|
605
|
+
results = await self.get_narrower_concepts_many(
|
|
606
|
+
[concept_id],
|
|
607
|
+
max_depth=max_depth,
|
|
608
|
+
filter_ontologies=filter_ontologies,
|
|
609
|
+
max_ids=max_ids,
|
|
610
|
+
)
|
|
611
|
+
return results.get(concept_id, [])
|
|
612
|
+
|
|
613
|
+
async def get_narrower_concepts_many(
|
|
614
|
+
self,
|
|
615
|
+
concept_ids: Sequence[str],
|
|
616
|
+
max_depth: int | None = 10,
|
|
617
|
+
filter_ontologies: list[str] | None = None,
|
|
618
|
+
max_ids: int | None = None,
|
|
619
|
+
) -> dict[str, list[str]]:
|
|
620
|
+
"""
|
|
621
|
+
Get narrower (descendant) concept IDs for many roots in one query.
|
|
622
|
+
|
|
623
|
+
Uses the hierarchy edges to walk down the tree/DAG from each root concept.
|
|
624
|
+
|
|
625
|
+
Args:
|
|
626
|
+
concept_ids: Starting concept IDs (broader terms)
|
|
627
|
+
max_depth: Maximum depth to traverse (1 = direct children only, None = all descendants)
|
|
628
|
+
filter_ontologies: Only follow edges from these ontologies (e.g., ["UMLS", "CHEBI"])
|
|
629
|
+
max_ids: Maximum number of concept IDs to return (None = no limit)
|
|
630
|
+
|
|
631
|
+
Returns:
|
|
632
|
+
Dict mapping each concept ID to descendant IDs ordered by depth
|
|
633
|
+
(shallowest first), excluding the starting concept.
|
|
634
|
+
"""
|
|
605
635
|
await self._ensure_initialized()
|
|
606
636
|
|
|
607
|
-
if not self._has_edges:
|
|
608
|
-
return []
|
|
637
|
+
if not self._has_edges or not concept_ids:
|
|
638
|
+
return {cid: [] for cid in concept_ids}
|
|
639
|
+
|
|
640
|
+
id_list = list(dict.fromkeys(concept_ids))
|
|
641
|
+
|
|
642
|
+
res: dict[str, list[str]] = {}
|
|
643
|
+
missing: list[str] = []
|
|
644
|
+
cache_keys: dict[str, Any] = {}
|
|
609
645
|
|
|
610
|
-
cache_key = None
|
|
611
646
|
if self._expansion_cache is not None:
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
647
|
+
for cid in id_list:
|
|
648
|
+
cache_key = ExpansionCache.make_key(
|
|
649
|
+
cid,
|
|
650
|
+
max_depth=max_depth,
|
|
651
|
+
filter_ontologies=filter_ontologies,
|
|
652
|
+
max_ids=max_ids,
|
|
653
|
+
)
|
|
654
|
+
cache_keys[cid] = cache_key
|
|
655
|
+
cached = self._expansion_cache.get(cache_key)
|
|
656
|
+
if cached is not None:
|
|
657
|
+
res[cid] = cached
|
|
658
|
+
else:
|
|
659
|
+
res[cid] = []
|
|
660
|
+
missing.append(cid)
|
|
661
|
+
else:
|
|
662
|
+
for cid in id_list:
|
|
663
|
+
res[cid] = []
|
|
664
|
+
missing = id_list
|
|
621
665
|
|
|
622
|
-
|
|
666
|
+
if not missing:
|
|
667
|
+
return res
|
|
668
|
+
|
|
669
|
+
sql_params = _SqlParams()
|
|
670
|
+
idmap_values = sql_params.add_single_column_values(missing)
|
|
671
|
+
params = sql_params.params
|
|
672
|
+
params["max_depth"] = max_depth
|
|
623
673
|
|
|
624
|
-
# Build ontology filter clause
|
|
625
674
|
ontology_filter = ""
|
|
626
675
|
if filter_ontologies:
|
|
627
|
-
|
|
628
|
-
for i, ont in enumerate(filter_ontologies):
|
|
629
|
-
key = f"ont{i}"
|
|
630
|
-
params[key] = ont
|
|
631
|
-
ont_placeholders.append(f":{key}")
|
|
632
|
-
ontologies_sql = ", ".join(ont_placeholders)
|
|
676
|
+
ontologies_sql = sql_params.add_values(filter_ontologies)
|
|
633
677
|
ontology_filter = f" AND e.ontology IN ({ontologies_sql})"
|
|
634
678
|
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
679
|
+
if max_ids is None:
|
|
680
|
+
select_sql = """
|
|
681
|
+
SELECT root_id, concept_id, MIN(depth) AS min_depth
|
|
682
|
+
FROM walk
|
|
683
|
+
WHERE concept_id != root_id
|
|
684
|
+
GROUP BY root_id, concept_id
|
|
685
|
+
ORDER BY root_id, min_depth, concept_id
|
|
686
|
+
"""
|
|
687
|
+
else:
|
|
638
688
|
params["max_ids"] = max_ids
|
|
639
|
-
|
|
689
|
+
select_sql = """
|
|
690
|
+
SELECT root_id, concept_id, min_depth
|
|
691
|
+
FROM (
|
|
692
|
+
SELECT root_id, concept_id, min_depth,
|
|
693
|
+
ROW_NUMBER() OVER (PARTITION BY root_id ORDER BY min_depth, concept_id) AS rn
|
|
694
|
+
FROM (
|
|
695
|
+
SELECT root_id, concept_id, MIN(depth) AS min_depth
|
|
696
|
+
FROM walk
|
|
697
|
+
WHERE concept_id != root_id
|
|
698
|
+
GROUP BY root_id, concept_id
|
|
699
|
+
) base
|
|
700
|
+
) ranked
|
|
701
|
+
WHERE rn <= :max_ids
|
|
702
|
+
ORDER BY root_id, min_depth, concept_id
|
|
703
|
+
"""
|
|
640
704
|
|
|
641
|
-
# PostgreSQL recursive CTE with named parameters
|
|
642
|
-
# Use CAST() instead of :: to avoid conflicts with SQLAlchemy named params
|
|
643
|
-
# UNION (not UNION ALL) deduplicates on (concept_id, depth) during recursion
|
|
644
|
-
# GROUP BY with MIN(depth) gets shortest path depth for each concept
|
|
645
705
|
query = dedent(
|
|
646
706
|
f"""
|
|
647
|
-
WITH RECURSIVE
|
|
648
|
-
|
|
707
|
+
WITH RECURSIVE idmap(root_id) AS (VALUES {idmap_values}),
|
|
708
|
+
walk(root_id, concept_id, depth) AS (
|
|
709
|
+
SELECT root_id, root_id, 0
|
|
710
|
+
FROM idmap
|
|
649
711
|
|
|
650
712
|
UNION
|
|
651
713
|
|
|
652
|
-
SELECT e.child_id, w.depth + 1
|
|
714
|
+
SELECT w.root_id, e.child_id, w.depth + 1
|
|
653
715
|
FROM walk w
|
|
654
716
|
JOIN {self._edges_table} e ON e.parent_id = w.concept_id
|
|
655
717
|
WHERE (CAST(:max_depth AS INTEGER) IS NULL OR w.depth < :max_depth){ontology_filter}
|
|
656
718
|
)
|
|
657
|
-
|
|
658
|
-
FROM walk
|
|
659
|
-
WHERE concept_id != :concept_id
|
|
660
|
-
GROUP BY concept_id
|
|
661
|
-
ORDER BY min_depth, concept_id{limit_clause}
|
|
719
|
+
{select_sql}
|
|
662
720
|
"""
|
|
663
721
|
)
|
|
664
722
|
|
|
665
723
|
rows = await self._fetch_rows(query, params)
|
|
666
724
|
|
|
667
|
-
|
|
668
|
-
|
|
669
|
-
|
|
670
|
-
|
|
725
|
+
for row in rows:
|
|
726
|
+
res[row["root_id"]].append(row["concept_id"])
|
|
727
|
+
|
|
728
|
+
if self._expansion_cache is not None:
|
|
729
|
+
for cid in missing:
|
|
730
|
+
self._expansion_cache.set(cache_keys[cid], res[cid])
|
|
731
|
+
|
|
732
|
+
return res
|
|
671
733
|
|
|
672
734
|
def cache_stats(self) -> dict[str, Any] | None:
|
|
673
735
|
"""
|
{norm_toolkit-1.7.0/src/norm_toolkit → norm_toolkit-1.9.0/src/norm_toolkit/v1}/normalizer_utils.py
RENAMED
|
@@ -11,14 +11,14 @@ from typing import Any, Literal
|
|
|
11
11
|
from lvg_norm import lvg_normalize
|
|
12
12
|
from sqlalchemy import RowMapping
|
|
13
13
|
|
|
14
|
-
from norm_toolkit.constants import (
|
|
14
|
+
from .constants import (
|
|
15
15
|
EXACT_BUMP,
|
|
16
16
|
ISPREF_WEIGHT,
|
|
17
17
|
RANK_MULTIPLIER,
|
|
18
18
|
STT_WEIGHT,
|
|
19
19
|
TTY_WEIGHT,
|
|
20
20
|
)
|
|
21
|
-
from norm_toolkit.models import ConceptInfo, SemanticType
|
|
21
|
+
from .models import ConceptInfo, SemanticType
|
|
22
22
|
|
|
23
23
|
|
|
24
24
|
def _coerce_synonyms_list(
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Unified normalization package.
|
|
3
|
+
|
|
4
|
+
Provides normalizer implementations that work with UMLS, ontology,
|
|
5
|
+
and merged databases using a standardized schema.
|
|
6
|
+
|
|
7
|
+
Build functions:
|
|
8
|
+
- build_merged_duckdb: Build a DuckDB database from UMLS and/or ontology data
|
|
9
|
+
|
|
10
|
+
Normalizers:
|
|
11
|
+
- DuckDBNormalizer: High-throughput sync normalizer for DuckDB (batch processing)
|
|
12
|
+
- ClickHouseNormalizer: Normalizer backed by ClickHouse tables uploaded from a DuckDB build
|
|
13
|
+
|
|
14
|
+
ClickHouse:
|
|
15
|
+
- upload_duckdb_to_clickhouse: Upload a norm_toolkit DuckDB database into ClickHouse
|
|
16
|
+
- clickhouse_client_from_env: Build a ClickHouse client from .env / environment settings
|
|
17
|
+
|
|
18
|
+
Data models:
|
|
19
|
+
- ConceptInfo: Unified concept metadata
|
|
20
|
+
- SemanticType: Semantic type info (UMLS only)
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
from .build_merged import build_merged_duckdb
|
|
24
|
+
from .clickhouse_backend import ClickHouseNormalizer
|
|
25
|
+
from .clickhouse_common import clickhouse_client_from_env
|
|
26
|
+
from .clickhouse_upload import upload_duckdb_to_clickhouse
|
|
27
|
+
from .duckdb_backend import DuckDBNormalizer
|
|
28
|
+
from .schema import ONTOLOGY_DF_SCHEMA, ConceptInfo, SemanticType
|
|
29
|
+
|
|
30
|
+
__all__ = [
|
|
31
|
+
# Build functions
|
|
32
|
+
"build_merged_duckdb",
|
|
33
|
+
"upload_duckdb_to_clickhouse",
|
|
34
|
+
# Normalizers
|
|
35
|
+
"ClickHouseNormalizer",
|
|
36
|
+
"DuckDBNormalizer",
|
|
37
|
+
# ClickHouse
|
|
38
|
+
"clickhouse_client_from_env",
|
|
39
|
+
# Models
|
|
40
|
+
"ConceptInfo",
|
|
41
|
+
"SemanticType",
|
|
42
|
+
# Schemas
|
|
43
|
+
"ONTOLOGY_DF_SCHEMA",
|
|
44
|
+
]
|