norm_toolkit 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
norm_toolkit/utils.py ADDED
@@ -0,0 +1,213 @@
1
+ """
2
+ Utility functions for norm_toolkit.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ import contextlib
8
+
9
+ import duckdb
10
+ import polars as pl
11
+ from lvg_norm import lvg_normalize
12
+ from tqdm import tqdm
13
+
14
+ from norm_toolkit.constants import (
15
+ ATOMS_TABLE,
16
+ CONCEPTS_TABLE,
17
+ DEFS_TABLE,
18
+ EDGES_TABLE,
19
+ NS_TABLE,
20
+ NW_TABLE,
21
+ ONTOLOGY_DF_SCHEMA,
22
+ TYPES_TABLE,
23
+ )
24
+
25
# Every table that makes up the normalizer schema. push_to_postgres falls back
# to this list when the caller does not name the tables to transfer.
ALL_TABLES = [
    NS_TABLE,
    NW_TABLE,
    ATOMS_TABLE,
    CONCEPTS_TABLE,
    TYPES_TABLE,
    DEFS_TABLE,
    EDGES_TABLE,
]
27
+
28
+
29
def prepare_ontology_df(
    df: pl.DataFrame,
    name_col: str = "name",
    source_col: str = "source",
    dedupe: bool = True,
) -> pl.DataFrame:
    """
    Prepare a simple name/source DataFrame for use with build_ontology_duckdb.

    Takes a minimal DataFrame with names and sources and adds all required columns
    for the ontology builder: global_identifier, identifier, pref_name, synonyms,
    description, pref_name_norm, and synonyms_norm.

    Args:
        df: Input DataFrame with at least name and source columns
        name_col: Name of the column containing concept names (default: "name")
        source_col: Name of the column containing source identifiers (default: "source")
        dedupe: Whether to deduplicate by normalized preferred name and source
            (default: True). The first occurrence of each pair is kept and the
            input order is preserved, so the generated identifiers are
            reproducible from one run to the next.

    Returns:
        DataFrame built with ONTOLOGY_DF_SCHEMA, containing:
        - global_identifier: "<source>:<identifier>" (e.g., "SOURCE:0", "SOURCE:1", ...)
        - identifier: Row index (after optional deduplication) as string
        - source: Source ontology name
        - pref_name: Original name
        - description: None (null)
        - pref_name_norm: First normalized form from lvg_normalize, or the
          lowercased name when no normalized form is available
        - synonyms: Empty list
        - synonyms_norm: Additional normalized forms (if lvg_normalize returns multiple)

    Example:
        >>> df = pl.DataFrame({
        ...     "name": ["Aspirin", "Ibuprofen"],
        ...     "source": ["DRUG", "DRUG"]
        ... })
        >>> onto_df = prepare_ontology_df(df)
        >>> build_ontology_duckdb(onto_df, "drugs.duckdb")
    """
    # Normalize every name, splitting the result into the preferred normalized
    # form and any remaining forms (kept as normalized synonyms).
    pref_norms: list[str] = []
    extra_norms: list[list[str]] = []
    for name in tqdm(df[name_col].to_list()):
        # A null name cannot be normalized; send it straight to the fallback
        # instead of handing None to lvg_normalize.
        forms = list(lvg_normalize(name) or []) if name is not None else []
        if forms:
            pref_norms.append(forms[0])
            extra_norms.append(forms[1:])
        else:
            # Fallback: use lowercase if normalization yields nothing
            pref_norms.append(name.lower() if name else "")
            extra_norms.append([])

    onto_df = pl.DataFrame(
        {
            # Placeholders; the real identifiers are assigned after deduplication.
            "global_identifier": None,
            "identifier": None,
            "source": df[source_col],
            "pref_name": df[name_col],
            "description": None,
            "pref_name_norm": pref_norms,
            "synonyms": None,
            "synonyms_norm": extra_norms,
        },
        schema=ONTOLOGY_DF_SCHEMA,
    )

    if dedupe:
        # keep="first" + maintain_order=True makes the surviving rows (and thus
        # the row-index based identifiers below) deterministic.
        onto_df = onto_df.unique(
            ["pref_name_norm", "source"],
            keep="first",
            maintain_order=True,
        )

    onto_df = onto_df.with_columns(
        pl.row_index("identifier").cast(pl.Utf8),
        pl.col("synonyms").fill_null(pl.lit([])),
    ).with_columns(
        pl.concat_str([pl.col("source"), pl.col("identifier")], separator=":").alias("global_identifier"),
    )

    return onto_df
107
+
108
+
109
def _quote_identifier(name: str) -> str:
    """Return *name* as a double-quoted SQL identifier, doubling embedded quotes."""
    return '"' + name.replace('"', '""') + '"'


def push_to_postgres(
    duckdb_path: str,
    postgres_dsn: str,
    schema: str = "public",
    tables: list[str] | None = None,
    drop_existing: bool = True,
    create_indexes: bool = True,
) -> None:
    """
    Push normalizer tables from a DuckDB database to PostgreSQL.

    Uses DuckDB's postgres extension for efficient bulk transfer. Only the
    requested tables that actually exist in the DuckDB ``main`` schema are
    transferred; if none exist, a message is printed and nothing is written.

    Args:
        duckdb_path: Path to source DuckDB database
        postgres_dsn: PostgreSQL connection string (e.g., "postgresql://user:pass@host:5432/db")
        schema: PostgreSQL schema to create tables in (default: "public").
            An empty string targets the attached database without a schema qualifier.
        tables: List of tables to push (default: all normalizer tables)
        drop_existing: Drop existing tables before creating (default: True)
        create_indexes: Create indexes after pushing data (default: True)

    Example:
        >>> # Build DuckDB first
        >>> build_ontology_duckdb(onto_df, "my_ontology.duckdb")
        >>> # Push to PostgreSQL
        >>> push_to_postgres(
        ...     "my_ontology.duckdb",
        ...     "postgresql://user:pass@localhost:5432/normdb"
        ... )
    """
    if tables is None:
        tables = ALL_TABLES

    # ATTACH receives the DSN as a SQL string literal, so a single quote inside
    # it (for instance in a password) must be doubled to keep the statement valid.
    dsn_literal = postgres_dsn.replace("'", "''")

    # Quote schema/table identifiers so reserved words or unusual characters
    # cannot break (or alter) the generated statements.
    target_prefix = f"pg.{_quote_identifier(schema)}." if schema else "pg."

    # closing() guarantees the connection is released on every exit path.
    with contextlib.closing(duckdb.connect(duckdb_path, read_only=True)) as con:
        # Install and load postgres extension
        con.execute("INSTALL postgres; LOAD postgres;")

        # Attach PostgreSQL database in read-write mode
        con.execute(f"ATTACH '{dsn_literal}' AS pg (TYPE POSTGRES, READ_WRITE)")

        # Restrict the request to tables that actually exist in the DuckDB database
        existing_tables = {
            row[0]
            for row in con.execute(
                "SELECT table_name FROM information_schema.tables WHERE table_schema = 'main'"
            ).fetchall()
        }
        tables_to_push = [t for t in tables if t in existing_tables]

        if not tables_to_push:
            print("No tables found to push")
            return

        # Push each table
        for table in tqdm(tables_to_push, desc="Pushing tables"):
            source_name = _quote_identifier(table)
            qualified_name = f"{target_prefix}{source_name}"

            if drop_existing:
                con.execute(f"DROP TABLE IF EXISTS {qualified_name}")

            # Copy table to PostgreSQL
            con.execute(f"CREATE TABLE {qualified_name} AS SELECT * FROM {source_name}")

        # Create indexes for query performance
        if create_indexes:
            _create_postgres_indexes(con, schema, tables_to_push)
180
+
181
+
182
def _create_postgres_indexes(con: duckdb.DuckDBPyConnection, schema: str, tables: list[str]) -> None:
    """
    Create indexes on PostgreSQL tables for query performance.

    Index creation is best-effort: a failure for one index is logged as a
    warning and does not stop the remaining indexes from being attempted.

    Args:
        con: DuckDB connection on which the PostgreSQL database is attached as ``pg``
        schema: PostgreSQL schema holding the tables; an empty string omits the
            schema qualifier
        tables: Names of the tables that were pushed; index definitions for
            any other table are skipped
    """
    import logging  # function-scope import keeps the module's import block untouched

    log = logging.getLogger(__name__)

    def quote(identifier: str) -> str:
        # Double-quote an identifier, doubling any embedded double quotes.
        return '"' + identifier.replace('"', '""') + '"'

    schema_prefix = f"{quote(schema)}." if schema else ""

    # (table, index name, indexed column list)
    index_definitions = [
        # ns table - exact string lookup
        (NS_TABLE, "ns_nstr_idx", "nstr"),
        (NS_TABLE, "ns_concept_idx", "concept_id"),
        # nw table - word lookup
        (NW_TABLE, "nw_nwd_idx", "nwd"),
        (NW_TABLE, "nw_concept_idx", "concept_id"),
        # atoms table - joins
        (ATOMS_TABLE, "atoms_concept_idx", "concept_id"),
        (ATOMS_TABLE, "atoms_name_idx", "concept_id, name_id"),
        (ATOMS_TABLE, "atoms_string_idx", "string_id"),
        # concepts table
        (CONCEPTS_TABLE, "concepts_pk_idx", "concept_id"),
        # types table
        (TYPES_TABLE, "types_concept_idx", "concept_id"),
        # defs table
        (DEFS_TABLE, "defs_concept_idx", "concept_id"),
        # edges table - hierarchy traversal
        (EDGES_TABLE, "edges_parent_idx", "parent_id"),
        (EDGES_TABLE, "edges_child_idx", "child_id"),
    ]

    pushed = set(tables)
    for table, idx_name, columns in index_definitions:
        if table not in pushed:
            continue
        target = f"pg.{schema_prefix}{quote(table)}"
        # Index creation might fail if table is empty or column doesn't exist;
        # keep going, but leave a trace instead of swallowing the error silently.
        try:
            con.execute(f"CREATE INDEX IF NOT EXISTS {idx_name} ON {target} ({columns})")
        except Exception as exc:
            log.warning("Skipping index %s on %s (%s): %s", idx_name, target, columns, exc)
@@ -0,0 +1,16 @@
1
+ Metadata-Version: 2.3
2
+ Name: norm_toolkit
3
+ Version: 1.0.0
4
+ Summary: Toolkit to normalize text to UMLS / ontologies
5
+ Author: Haydn Jones
6
+ Author-email: Haydn Jones <haydnjonest@gmail.com>
7
+ Requires-Dist: asyncpg>=0.29.0
8
+ Requires-Dist: duckdb>=1.4.3
9
+ Requires-Dist: lvg-norm>=1.1.0
10
+ Requires-Dist: polars[rt64]>=1.36.1
11
+ Requires-Dist: pyarrow>=22.0.0
12
+ Requires-Dist: pydantic>=2.12.5
13
+ Requires-Dist: tqdm>=4.67.1
14
+ Requires-Python: >=3.12
15
+ Description-Content-Type: text/markdown
16
+
@@ -0,0 +1,12 @@
1
+ norm_toolkit/__init__.py,sha256=aK4bn-0rSnzjvebWUCj6KUhpp1RMCC8t2Op5REJRsOg,1624
2
+ norm_toolkit/build_merged.py,sha256=eYmqNbqEBg4ZXA3cNLIdZxMNHqrxHerXWAi2EtBjUdk,22804
3
+ norm_toolkit/build_ontology.py,sha256=OU6BnV5fItfE6PXUQQFVAmHta9soZOo2EKO95Ciao1k,1586
4
+ norm_toolkit/build_umls.py,sha256=S3ygzr5kNGAJf8MIUu0KfaUKHQdA09tC7K-pL3cR-F0,1102
5
+ norm_toolkit/constants.py,sha256=hkPTL1fV_my70MUFHi4lFTGe1BvCBz8CJTeTATba1uU,2883
6
+ norm_toolkit/models.py,sha256=RY9fvyTiTt14oz7ArVRCRPWWTjZxRC_2FCVUQHhe1mY,1049
7
+ norm_toolkit/normalizer.py,sha256=0njuvRq6A3gTKv9LLWQXdPmTOQGzIetyMAMLrIDRO_k,21616
8
+ norm_toolkit/normalizer_postgres.py,sha256=IeY-RQye9qgoj7Vs-m--l67zvXSRZqAz_ae_feQNOzE,27587
9
+ norm_toolkit/utils.py,sha256=zjwoHVrbBqRqcKfBMs7Fmn1kPYAYD6qupNlxIMGNyOA,7318
10
+ norm_toolkit-1.0.0.dist-info/WHEEL,sha256=xDCZ-UyfvkGuEHPeI7BcJzYKIZzdqN8A8o1M5Om8IyA,79
11
+ norm_toolkit-1.0.0.dist-info/METADATA,sha256=nrv2zzjo_pT8SflWDo2QfPBLf6tzCmX0QUHf4H0hszc,465
12
+ norm_toolkit-1.0.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: uv 0.9.17
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any