norm_toolkit 1.4.0__tar.gz → 1.6.0__tar.gz

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: norm_toolkit
3
- Version: 1.4.0
3
+ Version: 1.6.0
4
4
  Summary: Toolkit to normalize text to UMLS / ontologies
5
5
  Author: Haydn Jones
6
6
  Author-email: Haydn Jones <haydnjonest@gmail.com>
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "norm_toolkit"
3
- version = "1.4.0"
3
+ version = "1.6.0"
4
4
  description = "Toolkit to normalize text to UMLS / ontologies"
5
5
  readme = "README.md"
6
6
  authors = [{ name = "Haydn Jones", email = "haydnjonest@gmail.com" }]
@@ -2,15 +2,16 @@
2
2
  Merged database builder for unified normalizer.
3
3
 
4
4
  Builds a single DuckDB database containing both UMLS and ontology data,
5
- allowing simultaneous normalization across all sources.
5
+ allowing simultaneous normalization across all ontologies.
6
6
 
7
7
  Tables created:
8
8
  - ns: Normalized string index (nstr -> concept_id, name_id)
9
- - nw: Normalized word index (nwd -> concept_id, string_id, source)
9
+ - nw: Normalized word index (nwd -> concept_id, string_id, source, ontology)
10
10
  - atoms: All atoms with unified schema
11
11
  - concepts: Concept metadata
12
12
  - types: Semantic types (UMLS only)
13
13
  - defs: Definitions from all sources
14
+ - edges: Hierarchy edges from all sources
14
15
  """
15
16
 
16
17
  from __future__ import annotations
@@ -49,7 +50,7 @@ def build_merged_duckdb(
49
50
  db_path: Output DuckDB database path
50
51
  meta_dir: Directory containing UMLS META RRF files (optional)
51
52
  ontology_dfs: List of Polars DataFrames with ontology data (optional)
52
- edges_df: Hierarchy edges for ontologies (parent_id, child_id, source columns)
53
+ edges_df: Hierarchy edges for ontologies (parent_id, child_id, source/ontology columns)
53
54
  filter_concepts_df: Optional DataFrame with 'global_identifier' column to filter
54
55
  which concepts to include. Only concepts matching these IDs will be included
55
56
  (applies to both UMLS CUIs and ontology global_identifiers).
@@ -69,14 +70,16 @@ def build_merged_duckdb(
69
70
  - pref_name: str - Preferred name
70
71
  - synonyms: list[str] - Display synonyms
71
72
  - description: str | None - Definition
72
- - source: str - Source ontology name
73
+ - source: str - Source ontology name (used to populate ontology)
74
+ (or provide an ontology column directly)
73
75
  - pref_name_norm: str - Normalized preferred name
74
76
  - synonyms_norm: list[str] - Normalized synonyms
75
77
 
76
78
  Edges DataFrame columns (if edges_df provided):
77
79
  - parent_id: str - Parent concept ID (broader term)
78
80
  - child_id: str - Child concept ID (narrower term)
79
- - source: str - Source ontology name
81
+ - source: str - Source ontology name (used to populate ontology)
82
+ (or provide an ontology column directly)
80
83
  """
81
84
  if not meta_dir and not ontology_dfs:
82
85
  raise ValueError("At least one of meta_dir or ontology_dfs must be provided")
@@ -133,6 +136,9 @@ def build_merged_duckdb(
133
136
  )
134
137
  print(f" Loaded {len(onto_atoms):,} ontology atoms")
135
138
 
139
+ if edges_df is not None:
140
+ edges_df = _normalize_edges_df(edges_df)
141
+
136
142
  # ==========================================================================
137
143
  # Merge and write tables
138
144
  # ==========================================================================
@@ -207,7 +213,7 @@ def build_merged_duckdb(
207
213
 
208
214
  # NW index - partial match lookup
209
215
  con.execute("CREATE INDEX IF NOT EXISTS idx_nw_nwd ON nw(nwd);")
210
- con.execute("CREATE INDEX IF NOT EXISTS idx_nw_nwd_source ON nw(nwd, source);")
216
+ con.execute("CREATE INDEX IF NOT EXISTS idx_nw_nwd_ontology ON nw(nwd, ontology);")
211
217
 
212
218
  # Atoms - join acceleration
213
219
  con.execute("CREATE INDEX IF NOT EXISTS idx_atoms_concept_name ON atoms(concept_id, name_id);")
@@ -227,7 +233,7 @@ def build_merged_duckdb(
227
233
  if len(merged_edges) > 0:
228
234
  # Edges (hierarchy traversal)
229
235
  con.execute("CREATE INDEX IF NOT EXISTS idx_edges_parent ON edges(parent_id);")
230
- con.execute("CREATE INDEX IF NOT EXISTS idx_edges_parent_source ON edges(parent_id, source);")
236
+ con.execute("CREATE INDEX IF NOT EXISTS idx_edges_parent_ontology ON edges(parent_id, ontology);")
231
237
 
232
238
  # ==========================================================================
233
239
  # Finalize
@@ -251,6 +257,21 @@ def _write_table(con: duckdb.DuckDBPyConnection, df: pl.DataFrame, table_name: s
251
257
  con.unregister("_tmp")
252
258
 
253
259
 
260
+ def _normalize_edges_df(edges_df: pl.DataFrame) -> pl.DataFrame:
261
+ """Ensure edges_df has source/ontology columns aligned with merged schema."""
262
+ if "ontology" not in edges_df.columns:
263
+ if "source" not in edges_df.columns:
264
+ raise ValueError("edges_df must include a 'source' or 'ontology' column")
265
+ edges_df = edges_df.with_columns(pl.col("source").alias("ontology"))
266
+
267
+ edges_df = edges_df.with_columns(
268
+ pl.lit(None).cast(pl.Utf8).alias("source"),
269
+ pl.col("ontology").cast(pl.Utf8),
270
+ )
271
+
272
+ return edges_df.select("parent_id", "child_id", "source", "ontology")
273
+
274
+
254
275
  def _load_umls_data(
255
276
  con: duckdb.DuckDBPyConnection,
256
277
  meta_dir: str,
@@ -321,7 +342,7 @@ def _load_umls_data(
321
342
  cui_filter_clause_ns = " AND CUI IN (SELECT CUI FROM _cui_filter)"
322
343
  cui_filter_clause_nw = " AND nw.CUI IN (SELECT CUI FROM _cui_filter)"
323
344
  cui_filter_clause_sty = " WHERE CUI IN (SELECT CUI FROM _cui_filter)"
324
- cui_filter_clause_def = " AND CUI IN (SELECT CUI FROM _cui_filter)"
345
+ cui_filter_clause_def = " AND d.CUI IN (SELECT CUI FROM _cui_filter)"
325
346
 
326
347
  # Build enriched atoms (English, non-suppressed, with pre-joined rank)
327
348
  # Normalize UMLS ranks to 0-10 scale to be comparable with ontology ranks (1-3)
@@ -334,6 +355,7 @@ def _load_umls_data(
334
355
  mc.CUI AS identifier,
335
356
  mc.STR AS str,
336
357
  mc.SAB AS source,
358
+ 'UMLS' AS ontology,
337
359
  mc.TTY AS name_type,
338
360
  mc.ISPREF AS ispref,
339
361
  mc.STT AS stt,
@@ -355,14 +377,15 @@ def _load_umls_data(
355
377
  WHERE 1=1{cui_filter_clause_ns}
356
378
  """).pl()
357
379
 
358
- # Build NW index (word -> concept, string, source)
380
+ # Build NW index (word -> concept, string, source, ontology)
359
381
  # Note: UMLS mrxnw_eng doesn't have source, so we join to get it
360
382
  nw_df = con.execute(f"""
361
383
  SELECT DISTINCT
362
384
  nw.NWD AS nwd,
363
385
  nw.CUI AS concept_id,
364
386
  nw.SUI AS string_id,
365
- mc.SAB AS source
387
+ mc.SAB AS source,
388
+ 'UMLS' AS ontology
366
389
  FROM _mrxnw_eng nw
367
390
  JOIN _mrconso mc ON mc.CUI = nw.CUI AND mc.SUI = nw.SUI
368
391
  WHERE mc.LAT = 'ENG' AND mc.SUPPRESS = 'N'{cui_filter_clause_nw}
@@ -376,6 +399,7 @@ def _load_umls_data(
376
399
  mc.CUI AS concept_id,
377
400
  mc.CUI AS identifier,
378
401
  NULL::VARCHAR AS source,
402
+ 'UMLS' AS ontology,
379
403
  NULL::VARCHAR AS pref_name,
380
404
  NULL::VARCHAR AS description
381
405
  FROM _mrconso mc
@@ -392,15 +416,18 @@ def _load_umls_data(
392
416
  FROM _mrsty{cui_filter_clause_sty}
393
417
  """).pl()
394
418
 
395
- # Build definitions
419
+ # Build definitions (English-only via MRCONSO language)
396
420
  defs_df = con.execute(f"""
397
421
  SELECT
398
- CUI AS concept_id,
399
- SAB AS source,
400
- DEF AS def_text
401
- FROM _mrdef
402
- WHERE COALESCE(SUPPRESS, 'N') = 'N'
403
- AND DEF IS NOT NULL AND DEF <> ''{cui_filter_clause_def}
422
+ d.CUI AS concept_id,
423
+ d.SAB AS source,
424
+ 'UMLS' AS ontology,
425
+ d.DEF AS def_text
426
+ FROM _mrdef d
427
+ JOIN _mrconso mc ON mc.AUI = d.AUI
428
+ WHERE mc.LAT = 'ENG'
429
+ AND COALESCE(d.SUPPRESS, 'N') = 'N'
430
+ AND d.DEF IS NOT NULL AND d.DEF <> ''{cui_filter_clause_def}
404
431
  """).pl()
405
432
 
406
433
  # Build edges from MRREL (hierarchy relationships)
@@ -410,13 +437,14 @@ def _load_umls_data(
410
437
  edges_df: pl.DataFrame | None = None
411
438
  if has_mrrel:
412
439
  edges_df = con.execute("""
413
- SELECT DISTINCT parent_id, child_id, source
440
+ SELECT DISTINCT parent_id, child_id, source, ontology
414
441
  FROM (
415
442
  -- CHD (child) and RN (narrower): CUI1 -> CUI2
416
443
  SELECT
417
444
  CUI1 AS parent_id,
418
445
  CUI2 AS child_id,
419
- SAB AS source
446
+ SAB AS source,
447
+ 'UMLS' AS ontology
420
448
  FROM _mrrel
421
449
  WHERE REL IN ('CHD', 'RN')
422
450
  AND COALESCE(SUPPRESS, 'N') = 'N'
@@ -427,7 +455,8 @@ def _load_umls_data(
427
455
  SELECT
428
456
  CUI2 AS parent_id,
429
457
  CUI1 AS child_id,
430
- SAB AS source
458
+ SAB AS source,
459
+ 'UMLS' AS ontology
431
460
  FROM _mrrel
432
461
  WHERE REL IN ('PAR', 'RB')
433
462
  AND COALESCE(SUPPRESS, 'N') = 'N'
@@ -468,6 +497,16 @@ def _load_ontology_data(
468
497
  if filter_ids is not None:
469
498
  combined = combined.filter(pl.col("global_identifier").is_in(filter_ids))
470
499
 
500
+ if "ontology" not in combined.columns:
501
+ if "source" not in combined.columns:
502
+ raise ValueError("ontology_dfs must include a 'source' or 'ontology' column")
503
+ combined = combined.with_columns(pl.col("source").alias("ontology"))
504
+
505
+ combined = combined.with_columns(
506
+ pl.lit(None).cast(pl.Utf8).alias("source"),
507
+ pl.col("ontology").cast(pl.Utf8),
508
+ )
509
+
471
510
  # Normalize columns
472
511
  combined = combined.with_columns(
473
512
  pl.col("synonyms").cast(pl.List(pl.Utf8)).fill_null([]),
@@ -479,9 +518,10 @@ def _load_ontology_data(
479
518
  pl.col("global_identifier").alias("concept_id"),
480
519
  "identifier",
481
520
  "source",
521
+ "ontology",
482
522
  "pref_name",
483
523
  "description",
484
- ).unique(subset=["concept_id", "source"])
524
+ ).unique(subset=["concept_id", "ontology"])
485
525
 
486
526
  # Build definitions from descriptions
487
527
  defs_df = (
@@ -489,9 +529,10 @@ def _load_ontology_data(
489
529
  .select(
490
530
  pl.col("global_identifier").alias("concept_id"),
491
531
  "source",
532
+ "ontology",
492
533
  pl.col("description").alias("def_text"),
493
534
  )
494
- .unique(subset=["concept_id", "source"])
535
+ .unique(subset=["concept_id", "ontology"])
495
536
  )
496
537
 
497
538
  # Build atoms: preferred names
@@ -499,6 +540,7 @@ def _load_ontology_data(
499
540
  pl.col("global_identifier").alias("concept_id"),
500
541
  "identifier",
501
542
  "source",
543
+ "ontology",
502
544
  pl.lit("pref").alias("name_type"),
503
545
  pl.lit("Y").alias("ispref"),
504
546
  pl.lit(None).cast(pl.Utf8).alias("stt"), # NULL for ontology
@@ -515,6 +557,7 @@ def _load_ontology_data(
515
557
  pl.col("global_identifier").alias("concept_id"),
516
558
  "identifier",
517
559
  "source",
560
+ "ontology",
518
561
  pl.lit("syn").alias("name_type"),
519
562
  pl.lit("N").alias("ispref"),
520
563
  pl.lit(None).cast(pl.Utf8).alias("stt"),
@@ -527,7 +570,7 @@ def _load_ontology_data(
527
570
 
528
571
  # Combine and deduplicate atoms
529
572
  names_df = pl.concat([pref_df, syn_df], how="vertical").unique(
530
- subset=["concept_id", "source", "name_type", "nstr", "str"]
573
+ subset=["concept_id", "ontology", "name_type", "nstr", "str"]
531
574
  )
532
575
 
533
576
  # Generate name_id and string_id
@@ -543,7 +586,17 @@ def _load_ontology_data(
543
586
 
544
587
  # Reorder columns to match merged schema (drop nstr since it's in ns_df)
545
588
  atoms_df = atoms_df.select(
546
- "concept_id", "name_id", "string_id", "identifier", "str", "source", "name_type", "ispref", "stt", "rank"
589
+ "concept_id",
590
+ "name_id",
591
+ "string_id",
592
+ "identifier",
593
+ "str",
594
+ "source",
595
+ "ontology",
596
+ "name_type",
597
+ "ispref",
598
+ "stt",
599
+ "rank",
547
600
  )
548
601
 
549
602
  # Build NW index (word-level)
@@ -560,8 +613,9 @@ def _load_ontology_data(
560
613
  "concept_id",
561
614
  "string_id",
562
615
  "source",
616
+ "ontology",
563
617
  )
564
- .unique(subset=["nwd", "concept_id", "string_id", "source"])
618
+ .unique(subset=["nwd", "concept_id", "string_id", "ontology"])
565
619
  )
566
620
 
567
621
  return atoms_df, ns_df, nw_df, concepts_df, defs_df
@@ -34,7 +34,7 @@ def build_ontology_duckdb(
34
34
  - pref_name: str - Preferred/canonical name
35
35
  - synonyms: list[str] - Display synonyms
36
36
  - description: str | None - Concept definition
37
- - source: str - Source ontology (e.g., "CHEBI")
37
+ - source: str - Source ontology name (used to populate ontology)
38
38
  - pref_name_norm: str - LVG-normalized preferred name
39
39
  - synonyms_norm: list[str] - LVG-normalized synonyms
40
40
 
@@ -34,6 +34,7 @@ HIT_STRUCT_TYPE = pl.Struct(
34
34
  "nstr": pl.Utf8,
35
35
  "name": pl.Utf8,
36
36
  "source": pl.Utf8,
37
+ "ontology": pl.Utf8,
37
38
  "name_type": pl.Utf8,
38
39
  "score": pl.Int64,
39
40
  "total_score": pl.Int64,
@@ -44,7 +45,7 @@ HIT_STRUCT_TYPE = pl.Struct(
44
45
  }
45
46
  )
46
47
 
47
- # Schema for ontology DataFrames (input to build_ontology_duckdb)
48
+ # Schema for ontology DataFrames (input to build_ontology_duckdb); source populates ontology.
48
49
  ONTOLOGY_DF_SCHEMA = {
49
50
  "global_identifier": pl.Utf8,
50
51
  "identifier": pl.Utf8,
@@ -26,11 +26,12 @@ class ConceptInfo(BaseModel):
26
26
 
27
27
  concept_id: str # CUI or global_id
28
28
  identifier: str | None # Source-specific ID (CUI for UMLS, e.g. "15377" for CHEBI)
29
- source: str | None # SAB or source
29
+ source: str | None # UMLS SAB (if available)
30
+ ontology: str | None # Ontology name (e.g., "UMLS", "CHEBI")
30
31
  preferred_name: str | None
31
32
  name_type: str | None # TTY or name_type
32
33
  description: str | None
33
- def_source: str | None # Source of definition (UMLS only)
34
+ def_source: str | None # UMLS source of definition (SAB, if available)
34
35
  synonyms: list[str]
35
36
  semantic_types: list[SemanticType] # Empty for ontology
36
37