norm_toolkit 1.4.0__tar.gz → 1.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {norm_toolkit-1.4.0 → norm_toolkit-1.6.0}/PKG-INFO +1 -1
- {norm_toolkit-1.4.0 → norm_toolkit-1.6.0}/pyproject.toml +1 -1
- {norm_toolkit-1.4.0 → norm_toolkit-1.6.0}/src/norm_toolkit/build_merged.py +79 -25
- {norm_toolkit-1.4.0 → norm_toolkit-1.6.0}/src/norm_toolkit/build_ontology.py +1 -1
- {norm_toolkit-1.4.0 → norm_toolkit-1.6.0}/src/norm_toolkit/constants.py +2 -1
- {norm_toolkit-1.4.0 → norm_toolkit-1.6.0}/src/norm_toolkit/models.py +3 -2
- norm_toolkit-1.6.0/src/norm_toolkit/normalizer.py +450 -0
- {norm_toolkit-1.4.0 → norm_toolkit-1.6.0}/src/norm_toolkit/normalizer_cache.py +15 -11
- norm_toolkit-1.6.0/src/norm_toolkit/normalizer_postgres.py +677 -0
- norm_toolkit-1.6.0/src/norm_toolkit/normalizer_utils.py +581 -0
- {norm_toolkit-1.4.0 → norm_toolkit-1.6.0}/src/norm_toolkit/utils.py +6 -3
- norm_toolkit-1.4.0/src/norm_toolkit/normalizer.py +0 -697
- norm_toolkit-1.4.0/src/norm_toolkit/normalizer_postgres.py +0 -1004
- {norm_toolkit-1.4.0 → norm_toolkit-1.6.0}/README.md +0 -0
- {norm_toolkit-1.4.0 → norm_toolkit-1.6.0}/src/norm_toolkit/__init__.py +0 -0
- {norm_toolkit-1.4.0 → norm_toolkit-1.6.0}/src/norm_toolkit/build_umls.py +0 -0
|
@@ -2,15 +2,16 @@
|
|
|
2
2
|
Merged database builder for unified normalizer.
|
|
3
3
|
|
|
4
4
|
Builds a single DuckDB database containing both UMLS and ontology data,
|
|
5
|
-
allowing simultaneous normalization across all
|
|
5
|
+
allowing simultaneous normalization across all ontologies.
|
|
6
6
|
|
|
7
7
|
Tables created:
|
|
8
8
|
- ns: Normalized string index (nstr -> concept_id, name_id)
|
|
9
|
-
- nw: Normalized word index (nwd -> concept_id, string_id, source)
|
|
9
|
+
- nw: Normalized word index (nwd -> concept_id, string_id, source, ontology)
|
|
10
10
|
- atoms: All atoms with unified schema
|
|
11
11
|
- concepts: Concept metadata
|
|
12
12
|
- types: Semantic types (UMLS only)
|
|
13
13
|
- defs: Definitions from all sources
|
|
14
|
+
- edges: Hierarchy edges from all sources
|
|
14
15
|
"""
|
|
15
16
|
|
|
16
17
|
from __future__ import annotations
|
|
@@ -49,7 +50,7 @@ def build_merged_duckdb(
|
|
|
49
50
|
db_path: Output DuckDB database path
|
|
50
51
|
meta_dir: Directory containing UMLS META RRF files (optional)
|
|
51
52
|
ontology_dfs: List of Polars DataFrames with ontology data (optional)
|
|
52
|
-
edges_df: Hierarchy edges for ontologies (parent_id, child_id, source columns)
|
|
53
|
+
edges_df: Hierarchy edges for ontologies (parent_id, child_id, source/ontology columns)
|
|
53
54
|
filter_concepts_df: Optional DataFrame with 'global_identifier' column to filter
|
|
54
55
|
which concepts to include. Only concepts matching these IDs will be included
|
|
55
56
|
(applies to both UMLS CUIs and ontology global_identifiers).
|
|
@@ -69,14 +70,16 @@ def build_merged_duckdb(
|
|
|
69
70
|
- pref_name: str - Preferred name
|
|
70
71
|
- synonyms: list[str] - Display synonyms
|
|
71
72
|
- description: str | None - Definition
|
|
72
|
-
- source: str - Source ontology name
|
|
73
|
+
- source: str - Source ontology name (used to populate ontology)
|
|
74
|
+
(or provide an ontology column directly)
|
|
73
75
|
- pref_name_norm: str - Normalized preferred name
|
|
74
76
|
- synonyms_norm: list[str] - Normalized synonyms
|
|
75
77
|
|
|
76
78
|
Edges DataFrame columns (if edges_df provided):
|
|
77
79
|
- parent_id: str - Parent concept ID (broader term)
|
|
78
80
|
- child_id: str - Child concept ID (narrower term)
|
|
79
|
-
- source: str - Source ontology name
|
|
81
|
+
- source: str - Source ontology name (used to populate ontology)
|
|
82
|
+
(or provide an ontology column directly)
|
|
80
83
|
"""
|
|
81
84
|
if not meta_dir and not ontology_dfs:
|
|
82
85
|
raise ValueError("At least one of meta_dir or ontology_dfs must be provided")
|
|
@@ -133,6 +136,9 @@ def build_merged_duckdb(
|
|
|
133
136
|
)
|
|
134
137
|
print(f" Loaded {len(onto_atoms):,} ontology atoms")
|
|
135
138
|
|
|
139
|
+
if edges_df is not None:
|
|
140
|
+
edges_df = _normalize_edges_df(edges_df)
|
|
141
|
+
|
|
136
142
|
# ==========================================================================
|
|
137
143
|
# Merge and write tables
|
|
138
144
|
# ==========================================================================
|
|
@@ -207,7 +213,7 @@ def build_merged_duckdb(
|
|
|
207
213
|
|
|
208
214
|
# NW index - partial match lookup
|
|
209
215
|
con.execute("CREATE INDEX IF NOT EXISTS idx_nw_nwd ON nw(nwd);")
|
|
210
|
-
con.execute("CREATE INDEX IF NOT EXISTS
|
|
216
|
+
con.execute("CREATE INDEX IF NOT EXISTS idx_nw_nwd_ontology ON nw(nwd, ontology);")
|
|
211
217
|
|
|
212
218
|
# Atoms - join acceleration
|
|
213
219
|
con.execute("CREATE INDEX IF NOT EXISTS idx_atoms_concept_name ON atoms(concept_id, name_id);")
|
|
@@ -227,7 +233,7 @@ def build_merged_duckdb(
|
|
|
227
233
|
if len(merged_edges) > 0:
|
|
228
234
|
# Edges (hierarchy traversal)
|
|
229
235
|
con.execute("CREATE INDEX IF NOT EXISTS idx_edges_parent ON edges(parent_id);")
|
|
230
|
-
con.execute("CREATE INDEX IF NOT EXISTS
|
|
236
|
+
con.execute("CREATE INDEX IF NOT EXISTS idx_edges_parent_ontology ON edges(parent_id, ontology);")
|
|
231
237
|
|
|
232
238
|
# ==========================================================================
|
|
233
239
|
# Finalize
|
|
@@ -251,6 +257,21 @@ def _write_table(con: duckdb.DuckDBPyConnection, df: pl.DataFrame, table_name: s
|
|
|
251
257
|
con.unregister("_tmp")
|
|
252
258
|
|
|
253
259
|
|
|
260
|
+
def _normalize_edges_df(edges_df: pl.DataFrame) -> pl.DataFrame:
|
|
261
|
+
"""Ensure edges_df has source/ontology columns aligned with merged schema."""
|
|
262
|
+
if "ontology" not in edges_df.columns:
|
|
263
|
+
if "source" not in edges_df.columns:
|
|
264
|
+
raise ValueError("edges_df must include a 'source' or 'ontology' column")
|
|
265
|
+
edges_df = edges_df.with_columns(pl.col("source").alias("ontology"))
|
|
266
|
+
|
|
267
|
+
edges_df = edges_df.with_columns(
|
|
268
|
+
pl.lit(None).cast(pl.Utf8).alias("source"),
|
|
269
|
+
pl.col("ontology").cast(pl.Utf8),
|
|
270
|
+
)
|
|
271
|
+
|
|
272
|
+
return edges_df.select("parent_id", "child_id", "source", "ontology")
|
|
273
|
+
|
|
274
|
+
|
|
254
275
|
def _load_umls_data(
|
|
255
276
|
con: duckdb.DuckDBPyConnection,
|
|
256
277
|
meta_dir: str,
|
|
@@ -321,7 +342,7 @@ def _load_umls_data(
|
|
|
321
342
|
cui_filter_clause_ns = " AND CUI IN (SELECT CUI FROM _cui_filter)"
|
|
322
343
|
cui_filter_clause_nw = " AND nw.CUI IN (SELECT CUI FROM _cui_filter)"
|
|
323
344
|
cui_filter_clause_sty = " WHERE CUI IN (SELECT CUI FROM _cui_filter)"
|
|
324
|
-
cui_filter_clause_def = " AND CUI IN (SELECT CUI FROM _cui_filter)"
|
|
345
|
+
cui_filter_clause_def = " AND d.CUI IN (SELECT CUI FROM _cui_filter)"
|
|
325
346
|
|
|
326
347
|
# Build enriched atoms (English, non-suppressed, with pre-joined rank)
|
|
327
348
|
# Normalize UMLS ranks to 0-10 scale to be comparable with ontology ranks (1-3)
|
|
@@ -334,6 +355,7 @@ def _load_umls_data(
|
|
|
334
355
|
mc.CUI AS identifier,
|
|
335
356
|
mc.STR AS str,
|
|
336
357
|
mc.SAB AS source,
|
|
358
|
+
'UMLS' AS ontology,
|
|
337
359
|
mc.TTY AS name_type,
|
|
338
360
|
mc.ISPREF AS ispref,
|
|
339
361
|
mc.STT AS stt,
|
|
@@ -355,14 +377,15 @@ def _load_umls_data(
|
|
|
355
377
|
WHERE 1=1{cui_filter_clause_ns}
|
|
356
378
|
""").pl()
|
|
357
379
|
|
|
358
|
-
# Build NW index (word -> concept, string, source)
|
|
380
|
+
# Build NW index (word -> concept, string, source, ontology)
|
|
359
381
|
# Note: UMLS mrxnw_eng doesn't have source, so we join to get it
|
|
360
382
|
nw_df = con.execute(f"""
|
|
361
383
|
SELECT DISTINCT
|
|
362
384
|
nw.NWD AS nwd,
|
|
363
385
|
nw.CUI AS concept_id,
|
|
364
386
|
nw.SUI AS string_id,
|
|
365
|
-
mc.SAB AS source
|
|
387
|
+
mc.SAB AS source,
|
|
388
|
+
'UMLS' AS ontology
|
|
366
389
|
FROM _mrxnw_eng nw
|
|
367
390
|
JOIN _mrconso mc ON mc.CUI = nw.CUI AND mc.SUI = nw.SUI
|
|
368
391
|
WHERE mc.LAT = 'ENG' AND mc.SUPPRESS = 'N'{cui_filter_clause_nw}
|
|
@@ -376,6 +399,7 @@ def _load_umls_data(
|
|
|
376
399
|
mc.CUI AS concept_id,
|
|
377
400
|
mc.CUI AS identifier,
|
|
378
401
|
NULL::VARCHAR AS source,
|
|
402
|
+
'UMLS' AS ontology,
|
|
379
403
|
NULL::VARCHAR AS pref_name,
|
|
380
404
|
NULL::VARCHAR AS description
|
|
381
405
|
FROM _mrconso mc
|
|
@@ -392,15 +416,18 @@ def _load_umls_data(
|
|
|
392
416
|
FROM _mrsty{cui_filter_clause_sty}
|
|
393
417
|
""").pl()
|
|
394
418
|
|
|
395
|
-
# Build definitions
|
|
419
|
+
# Build definitions (English-only via MRCONSO language)
|
|
396
420
|
defs_df = con.execute(f"""
|
|
397
421
|
SELECT
|
|
398
|
-
CUI AS concept_id,
|
|
399
|
-
SAB AS source,
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
422
|
+
d.CUI AS concept_id,
|
|
423
|
+
d.SAB AS source,
|
|
424
|
+
'UMLS' AS ontology,
|
|
425
|
+
d.DEF AS def_text
|
|
426
|
+
FROM _mrdef d
|
|
427
|
+
JOIN _mrconso mc ON mc.AUI = d.AUI
|
|
428
|
+
WHERE mc.LAT = 'ENG'
|
|
429
|
+
AND COALESCE(d.SUPPRESS, 'N') = 'N'
|
|
430
|
+
AND d.DEF IS NOT NULL AND d.DEF <> ''{cui_filter_clause_def}
|
|
404
431
|
""").pl()
|
|
405
432
|
|
|
406
433
|
# Build edges from MRREL (hierarchy relationships)
|
|
@@ -410,13 +437,14 @@ def _load_umls_data(
|
|
|
410
437
|
edges_df: pl.DataFrame | None = None
|
|
411
438
|
if has_mrrel:
|
|
412
439
|
edges_df = con.execute("""
|
|
413
|
-
SELECT DISTINCT parent_id, child_id, source
|
|
440
|
+
SELECT DISTINCT parent_id, child_id, source, ontology
|
|
414
441
|
FROM (
|
|
415
442
|
-- CHD (child) and RN (narrower): CUI1 -> CUI2
|
|
416
443
|
SELECT
|
|
417
444
|
CUI1 AS parent_id,
|
|
418
445
|
CUI2 AS child_id,
|
|
419
|
-
SAB AS source
|
|
446
|
+
SAB AS source,
|
|
447
|
+
'UMLS' AS ontology
|
|
420
448
|
FROM _mrrel
|
|
421
449
|
WHERE REL IN ('CHD', 'RN')
|
|
422
450
|
AND COALESCE(SUPPRESS, 'N') = 'N'
|
|
@@ -427,7 +455,8 @@ def _load_umls_data(
|
|
|
427
455
|
SELECT
|
|
428
456
|
CUI2 AS parent_id,
|
|
429
457
|
CUI1 AS child_id,
|
|
430
|
-
SAB AS source
|
|
458
|
+
SAB AS source,
|
|
459
|
+
'UMLS' AS ontology
|
|
431
460
|
FROM _mrrel
|
|
432
461
|
WHERE REL IN ('PAR', 'RB')
|
|
433
462
|
AND COALESCE(SUPPRESS, 'N') = 'N'
|
|
@@ -468,6 +497,16 @@ def _load_ontology_data(
|
|
|
468
497
|
if filter_ids is not None:
|
|
469
498
|
combined = combined.filter(pl.col("global_identifier").is_in(filter_ids))
|
|
470
499
|
|
|
500
|
+
if "ontology" not in combined.columns:
|
|
501
|
+
if "source" not in combined.columns:
|
|
502
|
+
raise ValueError("ontology_dfs must include a 'source' or 'ontology' column")
|
|
503
|
+
combined = combined.with_columns(pl.col("source").alias("ontology"))
|
|
504
|
+
|
|
505
|
+
combined = combined.with_columns(
|
|
506
|
+
pl.lit(None).cast(pl.Utf8).alias("source"),
|
|
507
|
+
pl.col("ontology").cast(pl.Utf8),
|
|
508
|
+
)
|
|
509
|
+
|
|
471
510
|
# Normalize columns
|
|
472
511
|
combined = combined.with_columns(
|
|
473
512
|
pl.col("synonyms").cast(pl.List(pl.Utf8)).fill_null([]),
|
|
@@ -479,9 +518,10 @@ def _load_ontology_data(
|
|
|
479
518
|
pl.col("global_identifier").alias("concept_id"),
|
|
480
519
|
"identifier",
|
|
481
520
|
"source",
|
|
521
|
+
"ontology",
|
|
482
522
|
"pref_name",
|
|
483
523
|
"description",
|
|
484
|
-
).unique(subset=["concept_id", "
|
|
524
|
+
).unique(subset=["concept_id", "ontology"])
|
|
485
525
|
|
|
486
526
|
# Build definitions from descriptions
|
|
487
527
|
defs_df = (
|
|
@@ -489,9 +529,10 @@ def _load_ontology_data(
|
|
|
489
529
|
.select(
|
|
490
530
|
pl.col("global_identifier").alias("concept_id"),
|
|
491
531
|
"source",
|
|
532
|
+
"ontology",
|
|
492
533
|
pl.col("description").alias("def_text"),
|
|
493
534
|
)
|
|
494
|
-
.unique(subset=["concept_id", "
|
|
535
|
+
.unique(subset=["concept_id", "ontology"])
|
|
495
536
|
)
|
|
496
537
|
|
|
497
538
|
# Build atoms: preferred names
|
|
@@ -499,6 +540,7 @@ def _load_ontology_data(
|
|
|
499
540
|
pl.col("global_identifier").alias("concept_id"),
|
|
500
541
|
"identifier",
|
|
501
542
|
"source",
|
|
543
|
+
"ontology",
|
|
502
544
|
pl.lit("pref").alias("name_type"),
|
|
503
545
|
pl.lit("Y").alias("ispref"),
|
|
504
546
|
pl.lit(None).cast(pl.Utf8).alias("stt"), # NULL for ontology
|
|
@@ -515,6 +557,7 @@ def _load_ontology_data(
|
|
|
515
557
|
pl.col("global_identifier").alias("concept_id"),
|
|
516
558
|
"identifier",
|
|
517
559
|
"source",
|
|
560
|
+
"ontology",
|
|
518
561
|
pl.lit("syn").alias("name_type"),
|
|
519
562
|
pl.lit("N").alias("ispref"),
|
|
520
563
|
pl.lit(None).cast(pl.Utf8).alias("stt"),
|
|
@@ -527,7 +570,7 @@ def _load_ontology_data(
|
|
|
527
570
|
|
|
528
571
|
# Combine and deduplicate atoms
|
|
529
572
|
names_df = pl.concat([pref_df, syn_df], how="vertical").unique(
|
|
530
|
-
subset=["concept_id", "
|
|
573
|
+
subset=["concept_id", "ontology", "name_type", "nstr", "str"]
|
|
531
574
|
)
|
|
532
575
|
|
|
533
576
|
# Generate name_id and string_id
|
|
@@ -543,7 +586,17 @@ def _load_ontology_data(
|
|
|
543
586
|
|
|
544
587
|
# Reorder columns to match merged schema (drop nstr since it's in ns_df)
|
|
545
588
|
atoms_df = atoms_df.select(
|
|
546
|
-
"concept_id",
|
|
589
|
+
"concept_id",
|
|
590
|
+
"name_id",
|
|
591
|
+
"string_id",
|
|
592
|
+
"identifier",
|
|
593
|
+
"str",
|
|
594
|
+
"source",
|
|
595
|
+
"ontology",
|
|
596
|
+
"name_type",
|
|
597
|
+
"ispref",
|
|
598
|
+
"stt",
|
|
599
|
+
"rank",
|
|
547
600
|
)
|
|
548
601
|
|
|
549
602
|
# Build NW index (word-level)
|
|
@@ -560,8 +613,9 @@ def _load_ontology_data(
|
|
|
560
613
|
"concept_id",
|
|
561
614
|
"string_id",
|
|
562
615
|
"source",
|
|
616
|
+
"ontology",
|
|
563
617
|
)
|
|
564
|
-
.unique(subset=["nwd", "concept_id", "string_id", "
|
|
618
|
+
.unique(subset=["nwd", "concept_id", "string_id", "ontology"])
|
|
565
619
|
)
|
|
566
620
|
|
|
567
621
|
return atoms_df, ns_df, nw_df, concepts_df, defs_df
|
|
@@ -34,7 +34,7 @@ def build_ontology_duckdb(
|
|
|
34
34
|
- pref_name: str - Preferred/canonical name
|
|
35
35
|
- synonyms: list[str] - Display synonyms
|
|
36
36
|
- description: str | None - Concept definition
|
|
37
|
-
- source: str - Source ontology (
|
|
37
|
+
- source: str - Source ontology name (used to populate ontology)
|
|
38
38
|
- pref_name_norm: str - LVG-normalized preferred name
|
|
39
39
|
- synonyms_norm: list[str] - LVG-normalized synonyms
|
|
40
40
|
|
|
@@ -34,6 +34,7 @@ HIT_STRUCT_TYPE = pl.Struct(
|
|
|
34
34
|
"nstr": pl.Utf8,
|
|
35
35
|
"name": pl.Utf8,
|
|
36
36
|
"source": pl.Utf8,
|
|
37
|
+
"ontology": pl.Utf8,
|
|
37
38
|
"name_type": pl.Utf8,
|
|
38
39
|
"score": pl.Int64,
|
|
39
40
|
"total_score": pl.Int64,
|
|
@@ -44,7 +45,7 @@ HIT_STRUCT_TYPE = pl.Struct(
|
|
|
44
45
|
}
|
|
45
46
|
)
|
|
46
47
|
|
|
47
|
-
# Schema for ontology DataFrames (input to build_ontology_duckdb)
|
|
48
|
+
# Schema for ontology DataFrames (input to build_ontology_duckdb); source populates ontology.
|
|
48
49
|
ONTOLOGY_DF_SCHEMA = {
|
|
49
50
|
"global_identifier": pl.Utf8,
|
|
50
51
|
"identifier": pl.Utf8,
|
|
@@ -26,11 +26,12 @@ class ConceptInfo(BaseModel):
|
|
|
26
26
|
|
|
27
27
|
concept_id: str # CUI or global_id
|
|
28
28
|
identifier: str | None # Source-specific ID (CUI for UMLS, e.g. "15377" for CHEBI)
|
|
29
|
-
source: str | None # SAB
|
|
29
|
+
source: str | None # UMLS SAB (if available)
|
|
30
|
+
ontology: str | None # Ontology name (e.g., "UMLS", "CHEBI")
|
|
30
31
|
preferred_name: str | None
|
|
31
32
|
name_type: str | None # TTY or name_type
|
|
32
33
|
description: str | None
|
|
33
|
-
def_source: str | None #
|
|
34
|
+
def_source: str | None # UMLS source of definition (SAB, if available)
|
|
34
35
|
synonyms: list[str]
|
|
35
36
|
semantic_types: list[SemanticType] # Empty for ontology
|
|
36
37
|
|