norm_toolkit 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- norm_toolkit/__init__.py +49 -0
- norm_toolkit/build_merged.py +567 -0
- norm_toolkit/build_ontology.py +51 -0
- norm_toolkit/build_umls.py +41 -0
- norm_toolkit/constants.py +112 -0
- norm_toolkit/models.py +40 -0
- norm_toolkit/normalizer.py +679 -0
- norm_toolkit/normalizer_postgres.py +840 -0
- norm_toolkit/utils.py +213 -0
- norm_toolkit-1.0.0.dist-info/METADATA +16 -0
- norm_toolkit-1.0.0.dist-info/RECORD +12 -0
- norm_toolkit-1.0.0.dist-info/WHEEL +4 -0
norm_toolkit/utils.py
ADDED
|
@@ -0,0 +1,213 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Utility functions for norm_toolkit.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import contextlib
|
|
8
|
+
|
|
9
|
+
import duckdb
|
|
10
|
+
import polars as pl
|
|
11
|
+
from lvg_norm import lvg_normalize
|
|
12
|
+
from tqdm import tqdm
|
|
13
|
+
|
|
14
|
+
from norm_toolkit.constants import (
|
|
15
|
+
ATOMS_TABLE,
|
|
16
|
+
CONCEPTS_TABLE,
|
|
17
|
+
DEFS_TABLE,
|
|
18
|
+
EDGES_TABLE,
|
|
19
|
+
NS_TABLE,
|
|
20
|
+
NW_TABLE,
|
|
21
|
+
ONTOLOGY_DF_SCHEMA,
|
|
22
|
+
TYPES_TABLE,
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
# All tables in the normalizer schema
|
|
26
|
+
# Used by push_to_postgres as the default table list when the caller passes tables=None.
ALL_TABLES = [
    NS_TABLE,
    NW_TABLE,
    ATOMS_TABLE,
    CONCEPTS_TABLE,
    TYPES_TABLE,
    DEFS_TABLE,
    EDGES_TABLE,
]
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def prepare_ontology_df(
    df: pl.DataFrame,
    name_col: str = "name",
    source_col: str = "source",
    dedupe: bool = True,
) -> pl.DataFrame:
    """
    Prepare a simple name/source DataFrame for use with build_ontology_duckdb.

    Takes a minimal DataFrame with names and sources and adds all required columns
    for the ontology builder: global_identifier, identifier, pref_name, synonyms,
    description, pref_name_norm, and synonyms_norm.

    Args:
        df: Input DataFrame with at least name and source columns
        name_col: Name of the column containing concept names (default: "name")
        source_col: Name of the column containing source identifiers (default: "source")
        dedupe: Whether to deduplicate by (pref_name_norm, source) (default: True).
            The first occurrence in input order is kept and row order is preserved,
            so the generated identifiers are reproducible across runs.

    Returns:
        DataFrame with all required columns for build_ontology_duckdb:
        - global_identifier: Unique ID (e.g., "SOURCE:0", "SOURCE:1", ...)
        - identifier: Row index (after deduplication) as string
        - source: Source ontology name
        - pref_name: Original name
        - description: None (null)
        - pref_name_norm: First normalized form from lvg_normalize, or the
          lowercased name ("" for null/empty names) when normalization yields nothing
        - synonyms: Empty list
        - synonyms_norm: Additional normalized forms (if lvg_normalize returns multiple)

    Example:
        >>> df = pl.DataFrame({
        ...     "name": ["Aspirin", "Ibuprofen"],
        ...     "source": ["DRUG", "DRUG"]
        ... })
        >>> onto_df = prepare_ontology_df(df)
        >>> build_ontology_duckdb(onto_df, "drugs.duckdb")
    """
    # Normalize names and split into pref_name_norm + synonyms_norm
    pref_norms: list[str] = []
    syn_norms: list[list[str]] = []
    for name in tqdm(df[name_col].to_list()):
        # Null names are not handed to lvg_normalize; they take the fallback path below.
        norms = [] if name is None else list(lvg_normalize(name) or [])
        if norms:
            pref_norms.append(norms[0])
            syn_norms.append(norms[1:])
        else:
            # Fallback: use lowercase if normalization fails
            pref_norms.append(name.lower() if name else "")
            syn_norms.append([])

    df = pl.DataFrame(
        {
            "global_identifier": None,
            "identifier": None,
            "source": df[source_col],
            "pref_name": df[name_col],
            "description": None,
            "pref_name_norm": pref_norms,
            "synonyms": None,
            "synonyms_norm": syn_norms,
        },
        schema=ONTOLOGY_DF_SCHEMA,
    )
    if dedupe:
        # Identifiers are derived from row position below, so the dedupe must be
        # order-stable; the default (keep="any", maintain_order=False) is not.
        df = df.unique(["pref_name_norm", "source"], keep="first", maintain_order=True)

    df = df.with_columns(
        pl.row_index("identifier").cast(pl.Utf8),
        pl.col("synonyms").fill_null(pl.lit([])),
    ).with_columns(
        pl.concat_str([pl.col("source"), pl.col("identifier")], separator=":").alias("global_identifier"),
    )

    return df
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def push_to_postgres(
    duckdb_path: str,
    postgres_dsn: str,
    schema: str = "public",
    tables: list[str] | None = None,
    drop_existing: bool = True,
    create_indexes: bool = True,
) -> None:
    """
    Push normalizer tables from a DuckDB database to PostgreSQL.

    Uses DuckDB's postgres extension for efficient bulk transfer. Requested
    tables that do not exist in the DuckDB ``main`` schema are skipped; if none
    exist, a message is printed and nothing is pushed.

    Args:
        duckdb_path: Path to source DuckDB database (opened read-only)
        postgres_dsn: PostgreSQL connection string (e.g., "postgresql://user:pass@host:5432/db")
        schema: PostgreSQL schema to create tables in (default: "public")
        tables: List of tables to push (default: all normalizer tables)
        drop_existing: Drop existing tables before creating (default: True)
        create_indexes: Create indexes after pushing data (default: True)

    Example:
        >>> # Build DuckDB first
        >>> build_ontology_duckdb(onto_df, "my_ontology.duckdb")
        >>> # Push to PostgreSQL
        >>> push_to_postgres(
        ...     "my_ontology.duckdb",
        ...     "postgresql://user:pass@localhost:5432/normdb"
        ... )
    """
    if tables is None:
        tables = ALL_TABLES

    # The DSN is embedded in a single-quoted SQL literal; double any single quote
    # (e.g. inside a password) so it cannot terminate the literal early.
    escaped_dsn = postgres_dsn.replace("'", "''")

    con = duckdb.connect(duckdb_path, read_only=True)

    try:
        # Install and load postgres extension
        con.execute("INSTALL postgres; LOAD postgres;")

        # Attach PostgreSQL database in read-write mode
        con.execute(f"ATTACH '{escaped_dsn}' AS pg (TYPE POSTGRES, READ_WRITE)")

        # Get list of tables that actually exist in the DuckDB database
        existing_tables = {
            row[0]
            for row in con.execute(
                "SELECT table_name FROM information_schema.tables WHERE table_schema = 'main'"
            ).fetchall()
        }
        tables_to_push = [t for t in tables if t in existing_tables]

        if not tables_to_push:
            print("No tables found to push")
            return

        # Push each table
        # NOTE(review): schema and table names are interpolated unquoted, so they
        # must be plain SQL identifiers supplied by trusted code.
        for table in tqdm(tables_to_push, desc="Pushing tables"):
            qualified_name = f"pg.{schema}.{table}" if schema else f"pg.{table}"

            if drop_existing:
                con.execute(f"DROP TABLE IF EXISTS {qualified_name}")

            # Copy table to PostgreSQL
            con.execute(f"CREATE TABLE {qualified_name} AS SELECT * FROM {table}")

        # Create indexes for query performance
        if create_indexes:
            _create_postgres_indexes(con, schema, tables_to_push)

    finally:
        con.close()
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def _create_postgres_indexes(con: duckdb.DuckDBPyConnection, schema: str, tables: list[str]) -> None:
    """
    Create indexes on PostgreSQL tables for query performance.

    Index creation is best-effort: a failure for one index is reported as a
    RuntimeWarning and the remaining indexes are still attempted.

    Args:
        con: DuckDB connection on which the PostgreSQL database is attached as ``pg``
        schema: PostgreSQL schema holding the tables; a falsy value omits the schema qualifier
        tables: Tables that were pushed; index definitions for other tables are skipped
    """
    # Function-scope import keeps this block self-contained within the module.
    import warnings

    schema_prefix = f"{schema}." if schema else ""

    # (table, index name, indexed column list)
    index_definitions = [
        # ns table - exact string lookup
        (NS_TABLE, "ns_nstr_idx", "nstr"),
        (NS_TABLE, "ns_concept_idx", "concept_id"),
        # nw table - word lookup
        (NW_TABLE, "nw_nwd_idx", "nwd"),
        (NW_TABLE, "nw_concept_idx", "concept_id"),
        # atoms table - joins
        (ATOMS_TABLE, "atoms_concept_idx", "concept_id"),
        (ATOMS_TABLE, "atoms_name_idx", "concept_id, name_id"),
        (ATOMS_TABLE, "atoms_string_idx", "string_id"),
        # concepts table
        (CONCEPTS_TABLE, "concepts_pk_idx", "concept_id"),
        # types table
        (TYPES_TABLE, "types_concept_idx", "concept_id"),
        # defs table
        (DEFS_TABLE, "defs_concept_idx", "concept_id"),
        # edges table - hierarchy traversal
        (EDGES_TABLE, "edges_parent_idx", "parent_id"),
        (EDGES_TABLE, "edges_child_idx", "child_id"),
    ]

    for table, idx_name, columns in index_definitions:
        if table not in tables:
            continue
        statement = f"CREATE INDEX IF NOT EXISTS {idx_name} ON pg.{schema_prefix}{table} ({columns})"
        # Index creation might fail (e.g. the column doesn't exist); stay
        # best-effort, but surface the failure instead of hiding it.
        try:
            con.execute(statement)
        except duckdb.Error as exc:
            warnings.warn(
                f"Could not create index {idx_name} on {table}: {exc}",
                RuntimeWarning,
                stacklevel=2,
            )
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: norm_toolkit
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Toolkit to normalize text to UMLS / ontologies
|
|
5
|
+
Author: Haydn Jones
|
|
6
|
+
Author-email: Haydn Jones <haydnjonest@gmail.com>
|
|
7
|
+
Requires-Dist: asyncpg>=0.29.0
|
|
8
|
+
Requires-Dist: duckdb>=1.4.3
|
|
9
|
+
Requires-Dist: lvg-norm>=1.1.0
|
|
10
|
+
Requires-Dist: polars[rt64]>=1.36.1
|
|
11
|
+
Requires-Dist: pyarrow>=22.0.0
|
|
12
|
+
Requires-Dist: pydantic>=2.12.5
|
|
13
|
+
Requires-Dist: tqdm>=4.67.1
|
|
14
|
+
Requires-Python: >=3.12
|
|
15
|
+
Description-Content-Type: text/markdown
|
|
16
|
+
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
norm_toolkit/__init__.py,sha256=aK4bn-0rSnzjvebWUCj6KUhpp1RMCC8t2Op5REJRsOg,1624
|
|
2
|
+
norm_toolkit/build_merged.py,sha256=eYmqNbqEBg4ZXA3cNLIdZxMNHqrxHerXWAi2EtBjUdk,22804
|
|
3
|
+
norm_toolkit/build_ontology.py,sha256=OU6BnV5fItfE6PXUQQFVAmHta9soZOo2EKO95Ciao1k,1586
|
|
4
|
+
norm_toolkit/build_umls.py,sha256=S3ygzr5kNGAJf8MIUu0KfaUKHQdA09tC7K-pL3cR-F0,1102
|
|
5
|
+
norm_toolkit/constants.py,sha256=hkPTL1fV_my70MUFHi4lFTGe1BvCBz8CJTeTATba1uU,2883
|
|
6
|
+
norm_toolkit/models.py,sha256=RY9fvyTiTt14oz7ArVRCRPWWTjZxRC_2FCVUQHhe1mY,1049
|
|
7
|
+
norm_toolkit/normalizer.py,sha256=0njuvRq6A3gTKv9LLWQXdPmTOQGzIetyMAMLrIDRO_k,21616
|
|
8
|
+
norm_toolkit/normalizer_postgres.py,sha256=IeY-RQye9qgoj7Vs-m--l67zvXSRZqAz_ae_feQNOzE,27587
|
|
9
|
+
norm_toolkit/utils.py,sha256=zjwoHVrbBqRqcKfBMs7Fmn1kPYAYD6qupNlxIMGNyOA,7318
|
|
10
|
+
norm_toolkit-1.0.0.dist-info/WHEEL,sha256=xDCZ-UyfvkGuEHPeI7BcJzYKIZzdqN8A8o1M5Om8IyA,79
|
|
11
|
+
norm_toolkit-1.0.0.dist-info/METADATA,sha256=nrv2zzjo_pT8SflWDo2QfPBLf6tzCmX0QUHf4H0hszc,465
|
|
12
|
+
norm_toolkit-1.0.0.dist-info/RECORD,,
|