norm_toolkit 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,16 @@
1
+ Metadata-Version: 2.3
2
+ Name: norm_toolkit
3
+ Version: 1.0.0
4
+ Summary: Toolkit to normalize text to UMLS / ontologies
5
+ Author: Haydn Jones
6
+ Author-email: Haydn Jones <haydnjonest@gmail.com>
7
+ Requires-Dist: asyncpg>=0.29.0
8
+ Requires-Dist: duckdb>=1.4.3
9
+ Requires-Dist: lvg-norm>=1.1.0
10
+ Requires-Dist: polars[rt64]>=1.36.1
11
+ Requires-Dist: pyarrow>=22.0.0
12
+ Requires-Dist: pydantic>=2.12.5
13
+ Requires-Dist: tqdm>=4.67.1
14
+ Requires-Python: >=3.12
15
+ Description-Content-Type: text/markdown
16
+
File without changes
@@ -0,0 +1,44 @@
1
+ [project]
2
+ name = "norm_toolkit"
3
+ version = "1.0.0"
4
+ description = "Toolkit to normalize text to UMLS / ontologies"
5
+ readme = "README.md"
6
+ authors = [{ name = "Haydn Jones", email = "haydnjonest@gmail.com" }]
7
+ requires-python = ">=3.12"
8
+ dependencies = [
9
+ "asyncpg>=0.29.0",
10
+ "duckdb>=1.4.3",
11
+ "lvg-norm>=1.1.0",
12
+ "polars[rt64]>=1.36.1",
13
+ "pyarrow>=22.0.0",
14
+ "pydantic>=2.12.5",
15
+ "tqdm>=4.67.1",
16
+ ]
17
+
18
+ [dependency-groups]
19
+ dev = [
20
+ "datasets>=4.4.1",
21
+ "dotenv>=0.9.9",
22
+ "ipython>=9.8.0",
23
+ "pytest>=8.3",
24
+ "rdkit>=2025.9.3",
25
+ "ruff>=0.6.9",
26
+ ]
27
+
28
+ [build-system]
29
+ requires = ["uv_build>=0.9.11,<0.10.0"]
30
+ build-backend = "uv_build"
31
+
32
+ [tool.ruff]
33
+ line-length = 120
34
+ indent-width = 4
35
+ target-version = "py313"
36
+
37
+ [tool.ruff.lint]
38
+ select = ["E", "F", "UP", "B", "SIM", "I", "FURB"]
39
+ ignore = ["B905", "E501", "SIM108", "SIM103"]
40
+ fixable = ["ALL"]
41
+
42
+ [tool.ruff.format]
43
+ quote-style = "double"
44
+ indent-style = "space"
@@ -0,0 +1,49 @@
1
+ """
2
+ Unified normalization package.
3
+
4
+ Provides normalizer implementations that work with UMLS, ontology,
5
+ and merged databases using a standardized schema.
6
+
7
+ Build functions:
8
+ - build_umls_duckdb: Build UMLS database from Metathesaurus RRF files
9
+ - build_ontology_duckdb: Build ontology database from Polars DataFrame
10
+ - build_merged_duckdb: Build merged UMLS + ontology database
11
+
12
+ All build functions output the same schema, so you can use DuckDBNormalizer
13
+ or PostgresNormalizer with any database built by any of the build functions.
14
+
15
+ Normalizers:
16
+ - DuckDBNormalizer: High-throughput sync normalizer for DuckDB (batch processing)
17
+ - PostgresNormalizer: Async normalizer for PostgreSQL via asyncpg (small batches)
18
+
19
+ Data models:
20
+ - ConceptInfo: Unified concept metadata
21
+ - SemanticType: Semantic type info (UMLS only)
22
+ """
23
+
24
+ from norm_toolkit.build_merged import build_merged_duckdb
25
+ from norm_toolkit.build_ontology import build_ontology_duckdb
26
+ from norm_toolkit.build_umls import build_umls_duckdb
27
+ from norm_toolkit.constants import ONTOLOGY_DF_SCHEMA
28
+ from norm_toolkit.models import ConceptInfo, SemanticType
29
+ from norm_toolkit.normalizer import DuckDBNormalizer
30
+ from norm_toolkit.normalizer_postgres import PostgresNormalizer
31
+ from norm_toolkit.utils import prepare_ontology_df, push_to_postgres
32
+
33
+ __all__ = [
34
+ # Build functions
35
+ "build_umls_duckdb",
36
+ "build_ontology_duckdb",
37
+ "build_merged_duckdb",
38
+ # Normalizers
39
+ "DuckDBNormalizer",
40
+ "PostgresNormalizer",
41
+ # Models
42
+ "ConceptInfo",
43
+ "SemanticType",
44
+ # Schemas
45
+ "ONTOLOGY_DF_SCHEMA",
46
+ # Utils
47
+ "prepare_ontology_df",
48
+ "push_to_postgres",
49
+ ]
@@ -0,0 +1,567 @@
1
+ """
2
+ Merged database builder for unified normalizer.
3
+
4
+ Builds a single DuckDB database containing both UMLS and ontology data,
5
+ allowing simultaneous normalization across all sources.
6
+
7
+ Tables created:
8
+ - ns: Normalized string index (nstr -> concept_id, name_id)
9
+ - nw: Normalized word index (nwd -> concept_id, string_id, source)
10
+ - atoms: All atoms with unified schema
11
+ - concepts: Concept metadata
12
+ - types: Semantic types (UMLS only)
13
+ - defs: Definitions from all sources
+ - edges: Hierarchy edges (parent_id, child_id, source)
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ from pathlib import Path
19
+
20
+ import duckdb
21
+ import polars as pl
22
+
23
+ # UMLS RRF column definitions
24
+ # fmt: off
25
# Column names of the headerless, pipe-delimited UMLS RRF files.
# Order is significant: the UMLS loader in this module creates one VARCHAR
# column per entry, in this order, and COPYs each file into it.
MRCONSO_COLS = [
    "CUI", "LAT", "TS", "LUI", "STT", "SUI", "ISPREF", "AUI", "SAUI",
    "SCUI", "SDUI", "SAB", "TTY", "CODE", "STR", "SRL", "SUPPRESS", "CVF",
]
MRXNS_ENG_COLS = [
    "LAT", "NSTR", "CUI", "LUI", "SUI",
]
MRXNW_ENG_COLS = [
    "LAT", "NWD", "CUI", "LUI", "SUI",
]
MRSTY_COLS = [
    "CUI", "TUI", "STN", "STY", "ATUI", "CVF",
]
MRRANK_COLS = [
    "RANK", "SAB", "TTY", "SUPPRESS",
]
MRDEF_COLS = [
    "CUI", "AUI", "ATUI", "SATUI", "SAB", "DEF", "SUPPRESS", "CVF",
]
MRREL_COLS = [
    "CUI1", "AUI1", "STYPE1", "REL", "CUI2", "AUI2", "STYPE2", "RELA",
    "RUI", "SRUI", "SAB", "SL", "RG", "DIR", "SUPPRESS", "CVF",
]
32
+ # fmt: on
33
+
34
+
35
def build_merged_duckdb(
    db_path: str,
    meta_dir: str | None = None,
    ontology_dfs: list[pl.DataFrame] | None = None,
    edges_df: pl.DataFrame | None = None,
    filter_concepts_df: pl.DataFrame | None = None,
    threads: int = 8,
    pref_rank: int = 3,
    syn_rank: int = 1,
) -> None:
    """
    Build merged DuckDB database from UMLS and/or ontology sources.

    Existing ``atoms``, ``ns``, ``nw``, ``concepts``, ``types``, ``defs`` and
    ``edges`` tables in ``db_path`` are dropped and rewritten, then indexed.
    The connection is always closed, even when a build step fails.

    Args:
        db_path: Output DuckDB database path
        meta_dir: Directory containing UMLS META RRF files (optional)
        ontology_dfs: List of Polars DataFrames with ontology data (optional)
        edges_df: Hierarchy edges for ontologies (parent_id, child_id, source columns)
        filter_concepts_df: Optional DataFrame with 'global_identifier' column to filter
            which concepts to include. Only concepts matching these IDs will be included
            (applies to both UMLS CUIs and ontology global_identifiers).
        threads: Number of DuckDB threads to use
        pref_rank: Scoring weight for ontology preferred names (default: 3)
        syn_rank: Scoring weight for ontology synonyms (default: 1)

    Raises:
        ValueError: If neither meta_dir nor ontology_dfs is provided, or if
            filter_concepts_df lacks a 'global_identifier' column.

    At least one of meta_dir or ontology_dfs must be provided.

    UMLS RRF files required (if meta_dir provided):
    - MRCONSO.RRF, MRXNS_ENG.RRF, MRXNW_ENG.RRF, MRSTY.RRF, MRRANK.RRF, MRDEF.RRF
    - MRREL.RRF (optional, for hierarchy traversal)

    Ontology DataFrame columns required (if ontology_dfs provided):
    - global_identifier: str - Unique concept ID
    - identifier: str - Source-specific ID
    - pref_name: str - Preferred name
    - synonyms: list[str] - Display synonyms
    - description: str | None - Definition
    - source: str - Source ontology name
    - pref_name_norm: str - Normalized preferred name
    - synonyms_norm: list[str] - Normalized synonyms

    Edges DataFrame columns (if edges_df provided):
    - parent_id: str - Parent concept ID (broader term)
    - child_id: str - Child concept ID (narrower term)
    - source: str - Source ontology name
    """
    if not meta_dir and not ontology_dfs:
        raise ValueError("At least one of meta_dir or ontology_dfs must be provided")

    # Extract filter set if provided
    filter_ids: set[str] | None = None
    if filter_concepts_df is not None:
        if "global_identifier" not in filter_concepts_df.columns:
            raise ValueError("filter_concepts_df must have a 'global_identifier' column")
        filter_ids = set(filter_concepts_df["global_identifier"].drop_nulls().to_list())
        print(f"Filtering to {len(filter_ids):,} concepts")

    # Ensure output directory exists
    Path(db_path).parent.mkdir(parents=True, exist_ok=True)

    con = duckdb.connect(db_path)
    try:
        con.execute(f"PRAGMA threads={threads}")

        # ----------------------------------------------------------------------
        # Process UMLS data (if provided)
        # ----------------------------------------------------------------------
        umls_atoms: pl.DataFrame | None = None
        umls_ns: pl.DataFrame | None = None
        umls_nw: pl.DataFrame | None = None
        umls_concepts: pl.DataFrame | None = None
        umls_types: pl.DataFrame | None = None
        umls_defs: pl.DataFrame | None = None
        umls_edges: pl.DataFrame | None = None

        if meta_dir:
            print("Loading UMLS data...")
            umls_atoms, umls_ns, umls_nw, umls_concepts, umls_types, umls_defs, umls_edges = _load_umls_data(
                con, meta_dir, filter_ids
            )
            print(f" Loaded {len(umls_atoms):,} UMLS atoms")
            if umls_edges is not None:
                print(f" Loaded {len(umls_edges):,} UMLS hierarchy edges")

        # ----------------------------------------------------------------------
        # Process ontology data (if provided)
        # ----------------------------------------------------------------------
        onto_atoms: pl.DataFrame | None = None
        onto_ns: pl.DataFrame | None = None
        onto_nw: pl.DataFrame | None = None
        onto_concepts: pl.DataFrame | None = None
        onto_defs: pl.DataFrame | None = None

        if ontology_dfs:
            print("Loading ontology data...")
            onto_atoms, onto_ns, onto_nw, onto_concepts, onto_defs = _load_ontology_data(
                ontology_dfs, pref_rank, syn_rank, filter_ids
            )
            print(f" Loaded {len(onto_atoms):,} ontology atoms")

        # ----------------------------------------------------------------------
        # Merge and write tables
        # ----------------------------------------------------------------------
        print("Writing merged tables...")

        # Drop existing tables so a rebuild into an existing file starts clean
        for tbl in ("atoms", "ns", "nw", "concepts", "types", "defs", "edges"):
            con.execute(f"DROP TABLE IF EXISTS {tbl};")

        def _emit(table: str, frame: pl.DataFrame) -> int:
            """Write one merged frame, report its row count, and return that count."""
            _write_table(con, frame, table)
            print(f" {table}: {len(frame):,} rows")
            return len(frame)

        # Each frame is merged right before it is written so that only one
        # merged copy is alive at a time.
        _emit("atoms", _merge_frames(umls_atoms, onto_atoms))
        _emit("ns", _merge_frames(umls_ns, onto_ns))
        _emit("nw", _merge_frames(umls_nw, onto_nw))
        _emit("concepts", _merge_frames(umls_concepts, onto_concepts))

        # Types (UMLS only); an explicitly typed empty frame keeps the table
        # shape stable for ontology-only builds.
        if umls_types is not None:
            merged_types = umls_types
        else:
            merged_types = pl.DataFrame(
                schema={"concept_id": pl.Utf8, "type_id": pl.Utf8, "type_name": pl.Utf8, "type_tree": pl.Utf8}
            )
        _emit("types", merged_types)

        _emit("defs", _merge_frames(umls_defs, onto_defs))

        # Note: edges are NOT filtered by filter_concepts_df - we keep the full hierarchy
        edge_rows = _emit("edges", _merge_frames(umls_edges, edges_df))

        # ----------------------------------------------------------------------
        # Create indexes
        # ----------------------------------------------------------------------
        print("Creating indexes...")

        index_statements = [
            # NS index - exact match lookup
            "CREATE INDEX IF NOT EXISTS idx_ns_nstr ON ns(nstr);",
            # NW index - partial match lookup
            "CREATE INDEX IF NOT EXISTS idx_nw_nwd ON nw(nwd);",
            "CREATE INDEX IF NOT EXISTS idx_nw_nwd_source ON nw(nwd, source);",
            # Atoms - join acceleration
            "CREATE INDEX IF NOT EXISTS idx_atoms_concept_name ON atoms(concept_id, name_id);",
            "CREATE INDEX IF NOT EXISTS idx_atoms_string ON atoms(string_id);",
            # Concepts - metadata lookup
            "CREATE INDEX IF NOT EXISTS idx_concepts_id ON concepts(concept_id);",
            # Types - TUI filtering and hierarchy expansion
            "CREATE INDEX IF NOT EXISTS idx_types_concept ON types(concept_id);",
            "CREATE INDEX IF NOT EXISTS idx_types_type ON types(type_id);",
            "CREATE INDEX IF NOT EXISTS idx_types_tree ON types(type_tree);",
            # Definitions
            "CREATE INDEX IF NOT EXISTS idx_defs_concept ON defs(concept_id);",
        ]
        if edge_rows > 0:
            # Edges (hierarchy traversal); skipped when the edges table is the
            # column-less placeholder written for "no hierarchy data".
            index_statements.append("CREATE INDEX IF NOT EXISTS idx_edges_parent ON edges(parent_id);")
            index_statements.append("CREATE INDEX IF NOT EXISTS idx_edges_parent_source ON edges(parent_id, source);")
        for statement in index_statements:
            con.execute(statement)

        # ----------------------------------------------------------------------
        # Finalize
        # ----------------------------------------------------------------------
        con.execute("ANALYZE;")
    finally:
        # Always release the database file, including on a failed build.
        con.close()

    # Report the size only after the connection is closed, so the figure
    # covers the indexes and everything flushed to the file on close.
    size_bytes = Path(db_path).stat().st_size
    size_gb = size_bytes / (1024**3)
    print(f" Database size: {size_gb:.2f} GB")

    print(f"Built merged DuckDB at {db_path}")


def _merge_frames(*frames: pl.DataFrame | None) -> pl.DataFrame:
    """Vertically concatenate the frames that are not None; return an empty frame if none are."""
    present = [frame for frame in frames if frame is not None]
    return pl.concat(present, how="vertical") if present else pl.DataFrame()
240
+
241
+
242
def _write_table(con: duckdb.DuckDBPyConnection, df: pl.DataFrame, table_name: str) -> None:
    """
    Write a Polars DataFrame to DuckDB as a new table.

    Args:
        con: Open DuckDB connection
        df: Frame to persist
        table_name: Name of the table to create (must not already exist)

    An empty frame cannot go through the Arrow path below with a useful
    schema, so it becomes a table with one VARCHAR column per frame column,
    or a single placeholder ``dummy`` column when the frame has no columns.
    """
    if len(df) == 0:
        if df.columns:
            # Quote the names so that a column named like a SQL keyword is still valid DDL.
            cols = ", ".join('"{}" VARCHAR'.format(c.replace('"', '""')) for c in df.columns)
        else:
            cols = "dummy VARCHAR"
        con.execute(f"CREATE TABLE {table_name} ({cols});")
        return

    con.register("_tmp", df.to_arrow())
    try:
        con.execute(f"CREATE TABLE {table_name} AS SELECT * FROM _tmp;")
    finally:
        # Drop the temporary view even if CREATE TABLE fails, so the
        # connection is not left holding a reference to the Arrow data.
        con.unregister("_tmp")
252
+
253
+
254
def _load_umls_data(
    con: duckdb.DuckDBPyConnection,
    meta_dir: str,
    filter_ids: set[str] | None = None,
) -> tuple[pl.DataFrame, pl.DataFrame, pl.DataFrame, pl.DataFrame, pl.DataFrame, pl.DataFrame, pl.DataFrame | None]:
    """
    Load UMLS data and transform to unified schema.

    The RRF files are staged in TEMP tables on ``con``; those tables and the
    registered filter view are removed again before returning, including when
    a step fails.

    Args:
        con: DuckDB connection
        meta_dir: Directory containing UMLS META RRF files
        filter_ids: Optional set of CUIs to filter to (only include these concepts)

    Returns: (atoms, ns, nw, concepts, types, defs, edges)
        ``edges`` is None when MRREL.RRF is not present in ``meta_dir``.

    Raises:
        FileNotFoundError: If one of the required RRF files is missing.
    """
    meta = Path(meta_dir)

    # (file stem, staging table, column layout) for every required file
    required = (
        ("MRCONSO", "_mrconso", MRCONSO_COLS),
        ("MRXNS_ENG", "_mrxns_eng", MRXNS_ENG_COLS),
        ("MRXNW_ENG", "_mrxnw_eng", MRXNW_ENG_COLS),
        ("MRSTY", "_mrsty", MRSTY_COLS),
        ("MRRANK", "_mrrank", MRRANK_COLS),
        ("MRDEF", "_mrdef", MRDEF_COLS),
    )

    # MRREL is optional (for hierarchy traversal)
    mrrel_path = meta / "MRREL.RRF"
    has_mrrel = mrrel_path.exists()

    # Validate everything up front so nothing is staged for an incomplete release.
    for name, _table, _cols in required:
        path = meta / f"{name}.RRF"
        if not path.exists():
            raise FileNotFoundError(f"Missing {name}: {path}")

    filter_registered = False
    try:
        # Load raw RRF tables into temporary DuckDB tables
        for name, table, cols in required:
            _copy_rrf(con, table, cols, meta / f"{name}.RRF")
        if has_mrrel:
            _copy_rrf(con, "_mrrel", MRREL_COLS, mrrel_path)

        # Convert RANK to integer
        con.execute("ALTER TABLE _mrrank ALTER COLUMN RANK TYPE INTEGER;")

        # Register filter table if provided
        cui_filter_clause = ""
        cui_filter_clause_ns = ""
        cui_filter_clause_nw = ""
        cui_filter_clause_sty = ""
        cui_filter_clause_def = ""
        if filter_ids is not None:
            cui_values = list(filter_ids)
            if cui_values:
                filter_df = pl.DataFrame({"CUI": cui_values})
            else:
                # An empty list would infer a Null-typed column; pin it to a
                # string column so the IN (...) comparison stays well-typed.
                filter_df = pl.DataFrame(schema={"CUI": pl.Utf8})
            con.register("_cui_filter", filter_df.to_arrow())
            filter_registered = True
            cui_filter_clause = " AND mc.CUI IN (SELECT CUI FROM _cui_filter)"
            cui_filter_clause_ns = " AND CUI IN (SELECT CUI FROM _cui_filter)"
            cui_filter_clause_nw = " AND nw.CUI IN (SELECT CUI FROM _cui_filter)"
            cui_filter_clause_sty = " WHERE CUI IN (SELECT CUI FROM _cui_filter)"
            cui_filter_clause_def = " AND CUI IN (SELECT CUI FROM _cui_filter)"

        # Build enriched atoms (English, non-suppressed, with pre-joined rank)
        # Normalize UMLS ranks to 0-10 scale to be comparable with ontology ranks (1-3)
        # MRRANK values typically range 0-1000+, so divide by 100
        atoms_df = con.execute(f"""
            SELECT
                mc.CUI AS concept_id,
                mc.LUI AS name_id,
                mc.SUI AS string_id,
                mc.CUI AS identifier,
                mc.STR AS str,
                mc.SAB AS source,
                mc.TTY AS name_type,
                mc.ISPREF AS ispref,
                mc.STT AS stt,
                ROUND(COALESCE(mr.RANK, 0) / 100.0)::INTEGER AS rank
            FROM _mrconso mc
            LEFT JOIN _mrrank mr ON mr.SAB = mc.SAB AND mr.TTY = mc.TTY
            WHERE mc.LAT = 'ENG'
              AND mc.SUPPRESS = 'N'
              AND COALESCE(mr.SUPPRESS, 'N') = 'N'{cui_filter_clause}
        """).pl()

        # Build NS index (normalized string -> concept, name)
        ns_df = con.execute(f"""
            SELECT DISTINCT
                NSTR AS nstr,
                CUI AS concept_id,
                LUI AS name_id
            FROM _mrxns_eng
            WHERE 1=1{cui_filter_clause_ns}
        """).pl()

        # Build NW index (word -> concept, string, source)
        # Note: UMLS mrxnw_eng doesn't have source, so we join to get it
        nw_df = con.execute(f"""
            SELECT DISTINCT
                nw.NWD AS nwd,
                nw.CUI AS concept_id,
                nw.SUI AS string_id,
                mc.SAB AS source
            FROM _mrxnw_eng nw
            JOIN _mrconso mc ON mc.CUI = nw.CUI AND mc.SUI = nw.SUI
            WHERE mc.LAT = 'ENG' AND mc.SUPPRESS = 'N'{cui_filter_clause_nw}
        """).pl()

        # Build concepts (distinct CUIs)
        # Note: We don't pre-compute pref_name here; concept_info() handles that
        # Cast NULLs to VARCHAR to match ontology schema for concat
        concepts_df = con.execute(f"""
            SELECT DISTINCT
                mc.CUI AS concept_id,
                mc.CUI AS identifier,
                NULL::VARCHAR AS source,
                NULL::VARCHAR AS pref_name,
                NULL::VARCHAR AS description
            FROM _mrconso mc
            WHERE mc.LAT = 'ENG' AND mc.SUPPRESS = 'N'{cui_filter_clause}
        """).pl()

        # Build types (semantic types)
        types_df = con.execute(f"""
            SELECT DISTINCT
                CUI AS concept_id,
                TUI AS type_id,
                STY AS type_name,
                STN AS type_tree
            FROM _mrsty{cui_filter_clause_sty}
        """).pl()

        # Build definitions
        defs_df = con.execute(f"""
            SELECT
                CUI AS concept_id,
                SAB AS source,
                DEF AS def_text
            FROM _mrdef
            WHERE COALESCE(SUPPRESS, 'N') = 'N'
              AND DEF IS NOT NULL AND DEF <> ''{cui_filter_clause_def}
        """).pl()

        # Build edges from MRREL (hierarchy relationships)
        # CHD/RN: CUI1 is parent of CUI2 (direct child/narrower)
        # PAR/RB: CUI2 is parent of CUI1 (reversed - CUI1 is child/narrower)
        # Note: edges are NOT filtered by filter_ids - we keep the full hierarchy
        edges_df: pl.DataFrame | None = None
        if has_mrrel:
            edges_df = con.execute("""
                SELECT DISTINCT parent_id, child_id, source
                FROM (
                    -- CHD (child) and RN (narrower): CUI1 -> CUI2
                    SELECT
                        CUI1 AS parent_id,
                        CUI2 AS child_id,
                        SAB AS source
                    FROM _mrrel
                    WHERE REL IN ('CHD', 'RN')
                      AND COALESCE(SUPPRESS, 'N') = 'N'

                    UNION

                    -- PAR (parent) and RB (broader): CUI2 -> CUI1 (reversed)
                    SELECT
                        CUI2 AS parent_id,
                        CUI1 AS child_id,
                        SAB AS source
                    FROM _mrrel
                    WHERE REL IN ('PAR', 'RB')
                      AND COALESCE(SUPPRESS, 'N') = 'N'
                ) combined
                WHERE parent_id <> child_id
            """).pl()
    finally:
        # Clean up temp tables, whether the load succeeded or not, so a
        # failed load does not leave staging data on the connection.
        for tbl in ("_mrconso", "_mrxns_eng", "_mrxnw_eng", "_mrsty", "_mrrank", "_mrdef", "_mrrel"):
            con.execute(f"DROP TABLE IF EXISTS {tbl};")
        if filter_registered:
            con.unregister("_cui_filter")

    return atoms_df, ns_df, nw_df, concepts_df, types_df, defs_df, edges_df


def _copy_rrf(con: duckdb.DuckDBPyConnection, table: str, columns: list[str], path: Path) -> None:
    """Create TEMP table ``table`` with one VARCHAR column per name and COPY the RRF file into it."""
    column_defs = ", ".join(f"{col} VARCHAR" for col in columns)
    con.execute(f"CREATE TEMP TABLE {table}({column_defs});")
    # The path is embedded in a single-quoted SQL literal; double any quote it contains.
    sql_path = str(path).replace("'", "''")
    con.execute(f"COPY {table} FROM '{sql_path}' (DELIMITER '|', HEADER false, QUOTE '', ESCAPE '');")
445
+
446
+
447
def _load_ontology_data(
    ontology_dfs: list[pl.DataFrame],
    pref_rank: int,
    syn_rank: int,
    filter_ids: set[str] | None = None,
) -> tuple[pl.DataFrame, pl.DataFrame, pl.DataFrame, pl.DataFrame, pl.DataFrame]:
    """
    Turn ontology DataFrames into the unified table frames.

    Args:
        ontology_dfs: Ontology frames to stack; they are concatenated vertically
        pref_rank: Rank value stamped on preferred-name atoms
        syn_rank: Rank value stamped on synonym atoms
        filter_ids: When given, only rows whose global_identifier is in this set are kept

    Returns: (atoms, ns, nw, concepts, defs)
    """

    def _non_blank(column: str) -> pl.Expr:
        """Predicate: the column is neither NULL nor the empty string."""
        return pl.col(column).is_not_null() & (pl.col(column) != "")

    def _name_rows(frame: pl.DataFrame, name_type: str, ispref: str, nstr_col: str, str_col: str, rank: int):
        """Project ``frame`` onto the shared atom layout for one kind of name."""
        return frame.select(
            pl.col("global_identifier").alias("concept_id"),
            "identifier",
            "source",
            pl.lit(name_type).alias("name_type"),
            pl.lit(ispref).alias("ispref"),
            pl.lit(None).cast(pl.Utf8).alias("stt"),  # ontologies carry no STT value
            pl.col(nstr_col).alias("nstr"),
            pl.col(str_col).alias("str"),
            pl.lit(rank).alias("rank"),
        ).filter(_non_blank("nstr"))

    # Stack every ontology, optionally narrowing to the requested concepts
    stacked = pl.concat(ontology_dfs, how="vertical")
    if filter_ids is not None:
        stacked = stacked.filter(pl.col("global_identifier").is_in(filter_ids))

    # Missing synonym lists become empty lists of strings
    stacked = stacked.with_columns(
        pl.col("synonyms").cast(pl.List(pl.Utf8)).fill_null([]),
        pl.col("synonyms_norm").cast(pl.List(pl.Utf8)).fill_null([]),
    )

    # Concepts: one row per (concept, source)
    concepts_df = stacked.select(
        pl.col("global_identifier").alias("concept_id"),
        "identifier",
        "source",
        "pref_name",
        "description",
    ).unique(subset=["concept_id", "source"])

    # Definitions: non-blank descriptions, one per (concept, source)
    described = stacked.filter(_non_blank("description"))
    defs_df = described.select(
        pl.col("global_identifier").alias("concept_id"),
        "source",
        pl.col("description").alias("def_text"),
    ).unique(subset=["concept_id", "source"])

    # Atoms: preferred names, then synonyms.
    # Synonyms use synonyms_norm for both nstr and str because the display and
    # normalized lists cannot be aligned element by element.
    pref_df = _name_rows(stacked, "pref", "Y", "pref_name_norm", "pref_name", pref_rank)
    syn_df = _name_rows(stacked.explode("synonyms_norm"), "syn", "N", "synonyms_norm", "synonyms_norm", syn_rank)

    names_df = pl.concat([pref_df, syn_df], how="vertical").unique(
        subset=["concept_id", "source", "name_type", "nstr", "str"]
    )

    # name_id groups variants sharing a normalized form within a concept;
    # string_id is per display string within a concept.
    name_id_expr = pl.concat_str([pl.col("concept_id"), pl.lit("|"), pl.col("nstr")]).hash().cast(pl.Utf8)
    string_id_expr = pl.concat_str([pl.col("concept_id"), pl.lit("|"), pl.col("str")]).hash().cast(pl.Utf8)

    identified = names_df.with_columns(
        name_id_expr.alias("name_id"),
        string_id_expr.alias("string_id"),
    )

    # NS index is taken while nstr is still attached to the rows
    ns_df = identified.select("nstr", "concept_id", "name_id").unique(subset=["nstr", "concept_id", "name_id"])

    # Atoms in the merged column order (nstr lives in the NS index only)
    atoms_df = identified.select(
        "concept_id", "name_id", "string_id", "identifier", "str", "source", "name_type", "ispref", "stt", "rank"
    )

    # NW index: one row per space-separated token of the normalized string
    tokenized = names_df.with_columns(string_id_expr.alias("string_id")).with_columns(
        pl.col("nstr").fill_null("").str.strip_chars().str.split(" ").alias("tokens")
    )
    word_rows = tokenized.explode("tokens").filter(pl.col("tokens") != "")
    nw_df = word_rows.select(
        pl.col("tokens").alias("nwd"),
        "concept_id",
        "string_id",
        "source",
    ).unique(subset=["nwd", "concept_id", "string_id", "source"])

    return atoms_df, ns_df, nw_df, concepts_df, defs_df