norm_toolkit 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- norm_toolkit/__init__.py +49 -0
- norm_toolkit/build_merged.py +567 -0
- norm_toolkit/build_ontology.py +51 -0
- norm_toolkit/build_umls.py +41 -0
- norm_toolkit/constants.py +112 -0
- norm_toolkit/models.py +40 -0
- norm_toolkit/normalizer.py +679 -0
- norm_toolkit/normalizer_postgres.py +840 -0
- norm_toolkit/utils.py +213 -0
- norm_toolkit-1.0.0.dist-info/METADATA +16 -0
- norm_toolkit-1.0.0.dist-info/RECORD +12 -0
- norm_toolkit-1.0.0.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""
|
|
2
|
+
UMLS database builder for unified normalizer.
|
|
3
|
+
|
|
4
|
+
This is a convenience wrapper around build_merged_duckdb for UMLS-only builds.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from norm_toolkit.build_merged import build_merged_duckdb
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def build_umls_duckdb(meta_dir: str, db_path: str, threads: int = 20) -> None:
    """
    Create a DuckDB database populated from UMLS data alone.

    Thin wrapper that forwards to ``build_merged_duckdb`` with
    ``ontology_dfs=None``; the resulting database has the same schema as a
    merged build but contains only UMLS data.

    Args:
        meta_dir: Directory containing the UMLS META RRF files
        db_path: Path of the output DuckDB database
        threads: Number of DuckDB threads to use

    Required RRF files:
        - MRCONSO.RRF: Concept names and atoms
        - MRXNS_ENG.RRF: Normalized strings (English)
        - MRXNW_ENG.RRF: Normalized words (English)
        - MRSTY.RRF: Semantic types
        - MRRANK.RRF: Source vocabulary rankings
        - MRDEF.RRF: Definitions
    """
    # Passing no ontology DataFrames is what makes this a UMLS-only build.
    build_merged_duckdb(
        meta_dir=meta_dir,
        db_path=db_path,
        threads=threads,
        ontology_dfs=None,
    )
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
from enum import StrEnum
|
|
2
|
+
|
|
3
|
+
import polars as pl
|
|
4
|
+
|
|
5
|
+
# =============================================================================
# Database Schema Constants
# =============================================================================
# Names of the tables that make up the normalizer database.

# Normalized string index
NS_TABLE = "ns"
# Normalized word index
NW_TABLE = "nw"
# Atom details
ATOMS_TABLE = "atoms"
# Concept metadata
CONCEPTS_TABLE = "concepts"
# Semantic types (UMLS only)
TYPES_TABLE = "types"
# Definitions
DEFS_TABLE = "defs"
# Hierarchy edges
EDGES_TABLE = "edges"
|
|
16
|
+
|
|
17
|
+
# =============================================================================
# Scoring Constants
# =============================================================================
# Integer weights used when scoring hits.
# NOTE(review): how these are combined is defined by the normalizer modules,
# not here - confirm there before changing the relative magnitudes.

EXACT_BUMP = 1000
RANK_MULTIPLIER = 100
ISPREF_WEIGHT = 10
STT_WEIGHT = 5
TTY_WEIGHT = 1

# Default list of preferred term types (TTY codes).
DEFAULT_PREFER_TTYS = ["MH", "PT", "PN"]
|
|
28
|
+
|
|
29
|
+
# Field name -> dtype mapping for a normalized hit, in struct field order.
_HIT_FIELDS = {
    # Identifiers
    "global_identifier": pl.Utf8,
    "identifier": pl.Utf8,
    # Matched and display strings
    "nstr": pl.Utf8,
    "name": pl.Utf8,
    # Provenance
    "source": pl.Utf8,
    "name_type": pl.Utf8,
    # Scores
    "score": pl.Int64,
    "total_score": pl.Int64,
    # Kind of match that produced the hit
    "match_type": pl.Utf8,
}

# Polars struct type for normalized hits
HIT_STRUCT_TYPE = pl.Struct(_HIT_FIELDS)
|
|
43
|
+
|
|
44
|
+
# Column name -> Polars dtype for the ontology DataFrames passed to
# build_ontology_duckdb.
ONTOLOGY_DF_SCHEMA = {
    # Identification
    "global_identifier": pl.Utf8,
    "identifier": pl.Utf8,
    "source": pl.Utf8,
    # Preferred name and free-text description
    "pref_name": pl.Utf8,
    "description": pl.Utf8,
    "pref_name_norm": pl.Utf8,
    # Synonyms; the *_norm column is presumably aligned element-wise with
    # "synonyms" - TODO confirm against the builder.
    "synonyms": pl.List(pl.Utf8),
    "synonyms_norm": pl.List(pl.Utf8),
}
|
|
55
|
+
|
|
56
|
+
# =============================================================================
|
|
57
|
+
# Entity Type Constants
|
|
58
|
+
# =============================================================================
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class EntityType(StrEnum):
|
|
62
|
+
PROTEIN_GENEFAMILY = "Protein/GeneFamily"
|
|
63
|
+
DISEASE = "Disease"
|
|
64
|
+
CELLTYPE = "CellType"
|
|
65
|
+
GOTERM = "GOTerm"
|
|
66
|
+
ANATOMY = "Anatomy"
|
|
67
|
+
PHENOTYPE = "Phenotype"
|
|
68
|
+
SMALLMOLECULECLASS = "SmallMoleculeClass"
|
|
69
|
+
ORGANISM = "Organism"
|
|
70
|
+
ASSAY_RESULT = "Assay/Result"
|
|
71
|
+
PATHWAY = "Pathway"
|
|
72
|
+
CELLLINE = "CellLine"
|
|
73
|
+
GENE = "Gene"
|
|
74
|
+
PROTEIN = "Protein"
|
|
75
|
+
GENEVARIANT = "GeneVariant"
|
|
76
|
+
SMALLMOLECULE = "SmallMolecule"
|
|
77
|
+
CLINICALTRIAL = "ClinicalTrial"
|
|
78
|
+
PEPTIDE = "Peptide"
|
|
79
|
+
ANTIBODY = "Antibody"
|
|
80
|
+
RNA = "RNA"
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
# Groupings of EntityType members. The four sets are pairwise disjoint and
# together contain every EntityType member.
# NOTE(review): the set names suggest how each type is resolved (UMLS,
# ontology, manual, unknown) - confirm against the code that consumes them.

UMLS_ENTITY_TYPES = {
    EntityType.PROTEIN_GENEFAMILY,
    EntityType.DISEASE,
    EntityType.CELLTYPE,
    EntityType.GOTERM,
    EntityType.ANATOMY,
    EntityType.PHENOTYPE,
}

ONT_ENTITY_TYPES = {
    EntityType.SMALLMOLECULECLASS,
    EntityType.ORGANISM,
    EntityType.ASSAY_RESULT,
    EntityType.PATHWAY,
    EntityType.CELLLINE,
}

MANUAL_ENTITY_TYPES = {
    EntityType.GENE,
    EntityType.PROTEIN,
    EntityType.GENEVARIANT,
    EntityType.SMALLMOLECULE,
    EntityType.CLINICALTRIAL,
}

UNK_ENTITY_TYPES = {
    EntityType.PEPTIDE,
    EntityType.ANTIBODY,
    EntityType.RNA,
}
|
norm_toolkit/models.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Data models for unified normalizer.
|
|
3
|
+
|
|
4
|
+
Provides unified Pydantic models that work for both UMLS and ontology concepts.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from pydantic import BaseModel
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class SemanticType(BaseModel):
    """A single semantic type attached to a concept (UMLS only)."""

    # TUI
    type_id: str
    # STY
    type_name: str
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class ConceptInfo(BaseModel):
    """
    Concept record shared by UMLS and ontology databases.

    ``concept_id`` holds a CUI for UMLS concepts and a global_id for ontology
    concepts. For ontology databases, ``semantic_types`` will be empty.
    """

    # CUI or global_id
    concept_id: str
    # Source-specific ID (CUI for UMLS, e.g. "15377" for CHEBI)
    identifier: str | None
    # SAB or source
    source: str | None
    preferred_name: str | None
    # TTY or name_type
    name_type: str | None
    description: str | None
    # Source of definition (UMLS only)
    def_source: str | None
    synonyms: list[str]
    # Empty for ontology
    semantic_types: list[SemanticType]
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
# Backward-compatibility aliases: both legacy names refer to the same unified
# ConceptInfo class.
OntologyConceptInfo = UMLSConceptInfo = ConceptInfo
|