norm_toolkit 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,41 @@
1
+ """
2
+ UMLS database builder for unified normalizer.
3
+
4
+ This is a convenience wrapper around build_merged_duckdb for UMLS-only builds.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from norm_toolkit.build_merged import build_merged_duckdb
10
+
11
+
12
def build_umls_duckdb(
    meta_dir: str,
    db_path: str,
    threads: int = 20,
) -> None:
    """
    Create a DuckDB database populated from UMLS data only.

    All work is delegated to ``build_merged_duckdb``; this wrapper simply
    passes ``ontology_dfs=None`` so that no ontology DataFrames are supplied.
    The resulting database therefore shares the schema produced by
    ``build_merged_duckdb`` while holding UMLS content alone.

    Args:
        meta_dir: Directory that holds the UMLS META RRF files.
        db_path: Path of the DuckDB database to write.
        threads: Number of DuckDB threads, forwarded unchanged.

    Required RRF files:
        - MRCONSO.RRF: Concept names and atoms
        - MRXNS_ENG.RRF: Normalized strings (English)
        - MRXNW_ENG.RRF: Normalized words (English)
        - MRSTY.RRF: Semantic types
        - MRRANK.RRF: Source vocabulary rankings
        - MRDEF.RRF: Definitions
    """
    # Everything is passed by keyword, so the call does not depend on the
    # positional parameter order of build_merged_duckdb.
    build_merged_duckdb(
        meta_dir=meta_dir,
        db_path=db_path,
        threads=threads,
        ontology_dfs=None,  # UMLS-only: no ontology input
    )
@@ -0,0 +1,112 @@
1
+ from enum import StrEnum
2
+
3
+ import polars as pl
4
+
5
+ # =============================================================================
6
+ # Database Schema Constants
7
+ # =============================================================================
8
+
9
# Table names of the database schema, one constant per table.

# Normalized string index
NS_TABLE = "ns"

# Normalized word index
NW_TABLE = "nw"

# Atom details
ATOMS_TABLE = "atoms"

# Concept metadata
CONCEPTS_TABLE = "concepts"

# Semantic types (UMLS only)
TYPES_TABLE = "types"

# Definitions
DEFS_TABLE = "defs"

# Hierarchy edges
EDGES_TABLE = "edges"
16
+
17
+ # =============================================================================
18
+ # Scoring Constants
19
+ # =============================================================================
20
+
21
# Integer scoring constants.
# NOTE(review): how these values are combined into a score is decided by the
# scoring code, which is not part of this module - confirm there before tuning.
RANK_MULTIPLIER = 100
ISPREF_WEIGHT = 10
STT_WEIGHT = 5
TTY_WEIGHT = 1
EXACT_BUMP = 1_000

# Default list of preferred TTY codes.
# NOTE(review): this is a mutable list shared by every importer; callers
# should copy it rather than modify it in place.
DEFAULT_PREFER_TTYS = [
    "MH",
    "PT",
    "PN",
]
28
+
29
# Polars struct dtype for normalized hits.
# Field order is part of the dtype, so keep the entries in this order.
HIT_STRUCT_TYPE = pl.Struct(
    {
        # String-valued fields
        "global_identifier": pl.Utf8,
        "identifier": pl.Utf8,
        "nstr": pl.Utf8,
        "name": pl.Utf8,
        "source": pl.Utf8,
        "name_type": pl.Utf8,
        # Integer-valued score fields
        "score": pl.Int64,
        "total_score": pl.Int64,
        # String-valued field
        "match_type": pl.Utf8,
    }
)
43
+
44
# Column-name -> Polars dtype mapping for ontology DataFrames
# (input to build_ontology_duckdb).
ONTOLOGY_DF_SCHEMA = {
    # Scalar string columns
    "global_identifier": pl.Utf8,
    "identifier": pl.Utf8,
    "source": pl.Utf8,
    "pref_name": pl.Utf8,
    "description": pl.Utf8,
    "pref_name_norm": pl.Utf8,
    # List-of-string columns
    "synonyms": pl.List(pl.Utf8),
    "synonyms_norm": pl.List(pl.Utf8),
}
55
+
56
+ # =============================================================================
57
+ # Entity Type Constants
58
+ # =============================================================================
59
+
60
+
61
+ class EntityType(StrEnum):
62
+ PROTEIN_GENEFAMILY = "Protein/GeneFamily"
63
+ DISEASE = "Disease"
64
+ CELLTYPE = "CellType"
65
+ GOTERM = "GOTerm"
66
+ ANATOMY = "Anatomy"
67
+ PHENOTYPE = "Phenotype"
68
+ SMALLMOLECULECLASS = "SmallMoleculeClass"
69
+ ORGANISM = "Organism"
70
+ ASSAY_RESULT = "Assay/Result"
71
+ PATHWAY = "Pathway"
72
+ CELLLINE = "CellLine"
73
+ GENE = "Gene"
74
+ PROTEIN = "Protein"
75
+ GENEVARIANT = "GeneVariant"
76
+ SMALLMOLECULE = "SmallMolecule"
77
+ CLINICALTRIAL = "ClinicalTrial"
78
+ PEPTIDE = "Peptide"
79
+ ANTIBODY = "Antibody"
80
+ RNA = "RNA"
81
+
82
+
83
# Four disjoint groupings of EntityType members; together they list every
# member of the enum exactly once.
# NOTE(review): the group names suggest which backend handles each type
# (UMLS, ontology, manual, unknown) - confirm against the consuming code.

UMLS_ENTITY_TYPES = {
    EntityType.PROTEIN_GENEFAMILY,
    EntityType.DISEASE,
    EntityType.CELLTYPE,
    EntityType.GOTERM,
    EntityType.ANATOMY,
    EntityType.PHENOTYPE,
}

ONT_ENTITY_TYPES = {
    EntityType.SMALLMOLECULECLASS,
    EntityType.ORGANISM,
    EntityType.ASSAY_RESULT,
    EntityType.PATHWAY,
    EntityType.CELLLINE,
}

MANUAL_ENTITY_TYPES = {
    EntityType.GENE,
    EntityType.PROTEIN,
    EntityType.GENEVARIANT,
    EntityType.SMALLMOLECULE,
    EntityType.CLINICALTRIAL,
}

UNK_ENTITY_TYPES = {
    EntityType.PEPTIDE,
    EntityType.ANTIBODY,
    EntityType.RNA,
}
norm_toolkit/models.py ADDED
@@ -0,0 +1,40 @@
1
+ """
2
+ Data models for unified normalizer.
3
+
4
+ Provides unified Pydantic models that work for both UMLS and ontology concepts.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from pydantic import BaseModel
10
+
11
+
12
class SemanticType(BaseModel):
    """A semantic type attached to a concept (UMLS only)."""

    # Type identifier (TUI)
    type_id: str
    # Type name (STY)
    type_name: str
17
+
18
+
19
class ConceptInfo(BaseModel):
    """
    One record shape shared by UMLS and ontology concepts.

    UMLS concepts are identified by CUI and ontology concepts by global_id;
    both are represented by this model. For ontology databases
    ``semantic_types`` is an empty list.
    """

    # CUI or global_id
    concept_id: str
    # Source-specific ID (CUI for UMLS, e.g. "15377" for CHEBI)
    identifier: str | None
    # SAB or source
    source: str | None
    preferred_name: str | None
    # TTY or name_type
    name_type: str | None
    description: str | None
    # Source of definition (UMLS only)
    def_source: str | None
    synonyms: list[str]
    # Empty for ontology
    semantic_types: list[SemanticType]
36
+
37
+
38
# Backward-compatible names: both are plain aliases of ConceptInfo, i.e. they
# refer to the very same class object rather than to subclasses.
OntologyConceptInfo = ConceptInfo
UMLSConceptInfo = ConceptInfo