valuesets 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of valuesets might be problematic. Click here for more details.

Files changed (248) hide show
  1. valuesets/__init__.py +7 -0
  2. valuesets/_version.py +8 -0
  3. valuesets/datamodel/valuesets.py +13796 -0
  4. valuesets/datamodel/valuesets_dataclass.py +24503 -0
  5. valuesets/datamodel/valuesets_pydantic.py +13796 -0
  6. valuesets/enums/__init__.py +590 -0
  7. valuesets/enums/academic/__init__.py +1 -0
  8. valuesets/enums/academic/research.py +559 -0
  9. valuesets/enums/analytical_chemistry/__init__.py +1 -0
  10. valuesets/enums/analytical_chemistry/mass_spectrometry.py +198 -0
  11. valuesets/enums/bio/__init__.py +1 -0
  12. valuesets/enums/bio/biological_colors.py +238 -0
  13. valuesets/enums/bio/cell_cycle.py +180 -0
  14. valuesets/enums/bio/currency_chemicals.py +52 -0
  15. valuesets/enums/bio/developmental_stages.py +103 -0
  16. valuesets/enums/bio/genome_features.py +182 -0
  17. valuesets/enums/bio/genomics.py +91 -0
  18. valuesets/enums/bio/go_aspect.py +32 -0
  19. valuesets/enums/bio/go_causality.py +58 -0
  20. valuesets/enums/bio/go_evidence.py +129 -0
  21. valuesets/enums/bio/human_developmental_stages.py +62 -0
  22. valuesets/enums/bio/insdc_geographic_locations.py +591 -0
  23. valuesets/enums/bio/insdc_missing_values.py +49 -0
  24. valuesets/enums/bio/lipid_categories.py +67 -0
  25. valuesets/enums/bio/mouse_developmental_stages.py +62 -0
  26. valuesets/enums/bio/plant_biology.py +86 -0
  27. valuesets/enums/bio/plant_developmental_stages.py +54 -0
  28. valuesets/enums/bio/plant_sex.py +81 -0
  29. valuesets/enums/bio/protein_evidence.py +61 -0
  30. valuesets/enums/bio/proteomics_standards.py +123 -0
  31. valuesets/enums/bio/psi_mi.py +306 -0
  32. valuesets/enums/bio/relationship_to_oxygen.py +37 -0
  33. valuesets/enums/bio/sequence_alphabets.py +449 -0
  34. valuesets/enums/bio/sequence_chemistry.py +357 -0
  35. valuesets/enums/bio/sequencing_platforms.py +302 -0
  36. valuesets/enums/bio/structural_biology.py +320 -0
  37. valuesets/enums/bio/taxonomy.py +238 -0
  38. valuesets/enums/bio/trophic_levels.py +85 -0
  39. valuesets/enums/bio/uniprot_species.py +344 -0
  40. valuesets/enums/bio/viral_genome_types.py +47 -0
  41. valuesets/enums/bioprocessing/__init__.py +1 -0
  42. valuesets/enums/bioprocessing/scale_up.py +249 -0
  43. valuesets/enums/business/__init__.py +1 -0
  44. valuesets/enums/business/human_resources.py +275 -0
  45. valuesets/enums/business/industry_classifications.py +181 -0
  46. valuesets/enums/business/management_operations.py +228 -0
  47. valuesets/enums/business/organizational_structures.py +236 -0
  48. valuesets/enums/business/quality_management.py +181 -0
  49. valuesets/enums/business/supply_chain.py +232 -0
  50. valuesets/enums/chemistry/__init__.py +1 -0
  51. valuesets/enums/chemistry/chemical_entities.py +315 -0
  52. valuesets/enums/chemistry/reaction_directionality.py +65 -0
  53. valuesets/enums/chemistry/reactions.py +256 -0
  54. valuesets/enums/clinical/__init__.py +1 -0
  55. valuesets/enums/clinical/nih_demographics.py +177 -0
  56. valuesets/enums/clinical/phenopackets.py +254 -0
  57. valuesets/enums/common_value_sets.py +8791 -0
  58. valuesets/enums/computing/__init__.py +1 -0
  59. valuesets/enums/computing/file_formats.py +294 -0
  60. valuesets/enums/computing/maturity_levels.py +196 -0
  61. valuesets/enums/computing/mime_types.py +227 -0
  62. valuesets/enums/confidence_levels.py +168 -0
  63. valuesets/enums/contributor.py +30 -0
  64. valuesets/enums/core.py +42 -0
  65. valuesets/enums/data/__init__.py +1 -0
  66. valuesets/enums/data/data_absent_reason.py +53 -0
  67. valuesets/enums/data_science/__init__.py +1 -0
  68. valuesets/enums/data_science/binary_classification.py +87 -0
  69. valuesets/enums/data_science/emotion_classification.py +66 -0
  70. valuesets/enums/data_science/priority_severity.py +73 -0
  71. valuesets/enums/data_science/quality_control.py +46 -0
  72. valuesets/enums/data_science/sentiment_analysis.py +50 -0
  73. valuesets/enums/data_science/text_classification.py +97 -0
  74. valuesets/enums/demographics.py +206 -0
  75. valuesets/enums/ecological_interactions.py +151 -0
  76. valuesets/enums/energy/__init__.py +1 -0
  77. valuesets/enums/energy/energy.py +343 -0
  78. valuesets/enums/energy/fossil_fuels.py +29 -0
  79. valuesets/enums/energy/nuclear/__init__.py +1 -0
  80. valuesets/enums/energy/nuclear/nuclear_facilities.py +195 -0
  81. valuesets/enums/energy/nuclear/nuclear_fuel_cycle.py +96 -0
  82. valuesets/enums/energy/nuclear/nuclear_fuels.py +175 -0
  83. valuesets/enums/energy/nuclear/nuclear_operations.py +191 -0
  84. valuesets/enums/energy/nuclear/nuclear_regulatory.py +188 -0
  85. valuesets/enums/energy/nuclear/nuclear_safety.py +164 -0
  86. valuesets/enums/energy/nuclear/nuclear_waste.py +158 -0
  87. valuesets/enums/energy/nuclear/reactor_types.py +163 -0
  88. valuesets/enums/environmental_health/__init__.py +1 -0
  89. valuesets/enums/environmental_health/exposures.py +265 -0
  90. valuesets/enums/geography/__init__.py +1 -0
  91. valuesets/enums/geography/geographic_codes.py +741 -0
  92. valuesets/enums/health/__init__.py +12 -0
  93. valuesets/enums/health/vaccination.py +98 -0
  94. valuesets/enums/health.py +36 -0
  95. valuesets/enums/health_base.py +36 -0
  96. valuesets/enums/healthcare.py +45 -0
  97. valuesets/enums/industry/__init__.py +1 -0
  98. valuesets/enums/industry/extractive_industry.py +94 -0
  99. valuesets/enums/industry/mining.py +388 -0
  100. valuesets/enums/industry/safety_colors.py +201 -0
  101. valuesets/enums/investigation.py +27 -0
  102. valuesets/enums/materials_science/__init__.py +1 -0
  103. valuesets/enums/materials_science/characterization_methods.py +112 -0
  104. valuesets/enums/materials_science/crystal_structures.py +76 -0
  105. valuesets/enums/materials_science/material_properties.py +119 -0
  106. valuesets/enums/materials_science/material_types.py +104 -0
  107. valuesets/enums/materials_science/pigments_dyes.py +198 -0
  108. valuesets/enums/materials_science/synthesis_methods.py +109 -0
  109. valuesets/enums/medical/__init__.py +1 -0
  110. valuesets/enums/medical/clinical.py +277 -0
  111. valuesets/enums/medical/neuroimaging.py +119 -0
  112. valuesets/enums/mining_processing.py +302 -0
  113. valuesets/enums/physics/__init__.py +1 -0
  114. valuesets/enums/physics/states_of_matter.py +46 -0
  115. valuesets/enums/social/__init__.py +1 -0
  116. valuesets/enums/social/person_status.py +29 -0
  117. valuesets/enums/spatial/__init__.py +1 -0
  118. valuesets/enums/spatial/spatial_qualifiers.py +246 -0
  119. valuesets/enums/statistics/__init__.py +5 -0
  120. valuesets/enums/statistics/prediction_outcomes.py +31 -0
  121. valuesets/enums/statistics.py +31 -0
  122. valuesets/enums/time/__init__.py +1 -0
  123. valuesets/enums/time/temporal.py +254 -0
  124. valuesets/enums/units/__init__.py +1 -0
  125. valuesets/enums/units/measurements.py +310 -0
  126. valuesets/enums/visual/__init__.py +1 -0
  127. valuesets/enums/visual/colors.py +376 -0
  128. valuesets/generators/__init__.py +19 -0
  129. valuesets/generators/auto_slot_injector.py +280 -0
  130. valuesets/generators/enhanced_pydantic_generator.py +100 -0
  131. valuesets/generators/enum_slot_generator.py +201 -0
  132. valuesets/generators/modular_rich_generator.py +353 -0
  133. valuesets/generators/prefix_standardizer.py +198 -0
  134. valuesets/generators/rich_enum.py +127 -0
  135. valuesets/generators/rich_pydantic_generator.py +310 -0
  136. valuesets/generators/smart_slot_syncer.py +428 -0
  137. valuesets/generators/sssom_generator.py +394 -0
  138. valuesets/merged/merged_hierarchy.yaml +21649 -0
  139. valuesets/schema/README.md +3 -0
  140. valuesets/schema/academic/research.yaml +911 -0
  141. valuesets/schema/analytical_chemistry/mass_spectrometry.yaml +206 -0
  142. valuesets/schema/bio/bio_entities.yaml +364 -0
  143. valuesets/schema/bio/biological_colors.yaml +434 -0
  144. valuesets/schema/bio/cell_cycle.yaml +309 -0
  145. valuesets/schema/bio/currency_chemicals.yaml +70 -0
  146. valuesets/schema/bio/developmental_stages.yaml +226 -0
  147. valuesets/schema/bio/genome_features.yaml +342 -0
  148. valuesets/schema/bio/genomics.yaml +101 -0
  149. valuesets/schema/bio/go_aspect.yaml +39 -0
  150. valuesets/schema/bio/go_causality.yaml +119 -0
  151. valuesets/schema/bio/go_evidence.yaml +215 -0
  152. valuesets/schema/bio/insdc_geographic_locations.yaml +911 -0
  153. valuesets/schema/bio/insdc_missing_values.yaml +85 -0
  154. valuesets/schema/bio/lipid_categories.yaml +72 -0
  155. valuesets/schema/bio/plant_biology.yaml +125 -0
  156. valuesets/schema/bio/plant_developmental_stages.yaml +77 -0
  157. valuesets/schema/bio/plant_sex.yaml +108 -0
  158. valuesets/schema/bio/protein_evidence.yaml +63 -0
  159. valuesets/schema/bio/proteomics_standards.yaml +116 -0
  160. valuesets/schema/bio/psi_mi.yaml +400 -0
  161. valuesets/schema/bio/relationship_to_oxygen.yaml +46 -0
  162. valuesets/schema/bio/sequence_alphabets.yaml +1168 -0
  163. valuesets/schema/bio/sequence_chemistry.yaml +477 -0
  164. valuesets/schema/bio/sequencing_platforms.yaml +515 -0
  165. valuesets/schema/bio/structural_biology.yaml +428 -0
  166. valuesets/schema/bio/taxonomy.yaml +453 -0
  167. valuesets/schema/bio/trophic_levels.yaml +118 -0
  168. valuesets/schema/bio/uniprot_species.yaml +1209 -0
  169. valuesets/schema/bio/viral_genome_types.yaml +99 -0
  170. valuesets/schema/bioprocessing/scale_up.yaml +458 -0
  171. valuesets/schema/business/human_resources.yaml +752 -0
  172. valuesets/schema/business/industry_classifications.yaml +448 -0
  173. valuesets/schema/business/management_operations.yaml +602 -0
  174. valuesets/schema/business/organizational_structures.yaml +645 -0
  175. valuesets/schema/business/quality_management.yaml +502 -0
  176. valuesets/schema/business/supply_chain.yaml +688 -0
  177. valuesets/schema/chemistry/chemical_entities.yaml +639 -0
  178. valuesets/schema/chemistry/reaction_directionality.yaml +60 -0
  179. valuesets/schema/chemistry/reactions.yaml +442 -0
  180. valuesets/schema/clinical/nih_demographics.yaml +285 -0
  181. valuesets/schema/clinical/phenopackets.yaml +429 -0
  182. valuesets/schema/computing/file_formats.yaml +631 -0
  183. valuesets/schema/computing/maturity_levels.yaml +229 -0
  184. valuesets/schema/computing/mime_types.yaml +266 -0
  185. valuesets/schema/confidence_levels.yaml +206 -0
  186. valuesets/schema/contributor.yaml +30 -0
  187. valuesets/schema/core.yaml +55 -0
  188. valuesets/schema/data/data_absent_reason.yaml +82 -0
  189. valuesets/schema/data_science/binary_classification.yaml +125 -0
  190. valuesets/schema/data_science/emotion_classification.yaml +109 -0
  191. valuesets/schema/data_science/priority_severity.yaml +122 -0
  192. valuesets/schema/data_science/quality_control.yaml +68 -0
  193. valuesets/schema/data_science/sentiment_analysis.yaml +81 -0
  194. valuesets/schema/data_science/text_classification.yaml +135 -0
  195. valuesets/schema/demographics.yaml +238 -0
  196. valuesets/schema/ecological_interactions.yaml +298 -0
  197. valuesets/schema/energy/energy.yaml +595 -0
  198. valuesets/schema/energy/fossil_fuels.yaml +28 -0
  199. valuesets/schema/energy/nuclear/nuclear_facilities.yaml +463 -0
  200. valuesets/schema/energy/nuclear/nuclear_fuel_cycle.yaml +82 -0
  201. valuesets/schema/energy/nuclear/nuclear_fuels.yaml +421 -0
  202. valuesets/schema/energy/nuclear/nuclear_operations.yaml +480 -0
  203. valuesets/schema/energy/nuclear/nuclear_regulatory.yaml +200 -0
  204. valuesets/schema/energy/nuclear/nuclear_safety.yaml +352 -0
  205. valuesets/schema/energy/nuclear/nuclear_waste.yaml +332 -0
  206. valuesets/schema/energy/nuclear/reactor_types.yaml +394 -0
  207. valuesets/schema/environmental_health/exposures.yaml +355 -0
  208. valuesets/schema/generated_slots.yaml +1828 -0
  209. valuesets/schema/geography/geographic_codes.yaml +1018 -0
  210. valuesets/schema/health/vaccination.yaml +102 -0
  211. valuesets/schema/health.yaml +38 -0
  212. valuesets/schema/healthcare.yaml +53 -0
  213. valuesets/schema/industry/extractive_industry.yaml +89 -0
  214. valuesets/schema/industry/mining.yaml +888 -0
  215. valuesets/schema/industry/safety_colors.yaml +375 -0
  216. valuesets/schema/investigation.yaml +64 -0
  217. valuesets/schema/materials_science/characterization_methods.yaml +193 -0
  218. valuesets/schema/materials_science/crystal_structures.yaml +138 -0
  219. valuesets/schema/materials_science/material_properties.yaml +135 -0
  220. valuesets/schema/materials_science/material_types.yaml +151 -0
  221. valuesets/schema/materials_science/pigments_dyes.yaml +465 -0
  222. valuesets/schema/materials_science/synthesis_methods.yaml +186 -0
  223. valuesets/schema/medical/clinical.yaml +610 -0
  224. valuesets/schema/medical/neuroimaging.yaml +325 -0
  225. valuesets/schema/mining_processing.yaml +295 -0
  226. valuesets/schema/physics/states_of_matter.yaml +46 -0
  227. valuesets/schema/slot_mixins.yaml +143 -0
  228. valuesets/schema/social/person_status.yaml +28 -0
  229. valuesets/schema/spatial/spatial_qualifiers.yaml +466 -0
  230. valuesets/schema/statistics/prediction_outcomes.yaml +26 -0
  231. valuesets/schema/statistics.yaml +34 -0
  232. valuesets/schema/time/temporal.yaml +435 -0
  233. valuesets/schema/types.yaml +15 -0
  234. valuesets/schema/units/measurements.yaml +675 -0
  235. valuesets/schema/valuesets.yaml +100 -0
  236. valuesets/schema/visual/colors.yaml +778 -0
  237. valuesets/utils/__init__.py +6 -0
  238. valuesets/utils/comparison.py +102 -0
  239. valuesets/utils/expand_dynamic_enums.py +414 -0
  240. valuesets/utils/mapping_utils.py +236 -0
  241. valuesets/validators/__init__.py +11 -0
  242. valuesets/validators/enum_evaluator.py +669 -0
  243. valuesets/validators/oak_config.yaml +70 -0
  244. valuesets/validators/validate_with_ols.py +241 -0
  245. valuesets-0.3.1.dist-info/METADATA +395 -0
  246. valuesets-0.3.1.dist-info/RECORD +248 -0
  247. valuesets-0.3.1.dist-info/WHEEL +4 -0
  248. valuesets-0.3.1.dist-info/licenses/LICENSE +201 -0
@@ -0,0 +1,669 @@
1
+ """
2
+ Enum evaluator for validating ontology mappings in LinkML schemas.
3
+
4
+ This module validates that ontology term mappings (meanings) in enum definitions
5
+ match the expected labels from the ontology.
6
+
7
+ Uses OAK (Ontology Access Kit) as the abstraction layer for all ontology access.
8
+ """
9
+
10
+ import re
11
+ import logging
12
+ import sys
13
+ import os
14
+ import warnings
15
+ import csv
16
+ import yaml
17
+ from datetime import datetime
18
+ from pathlib import Path
19
+ from typing import List, Optional, Dict, Set
20
+ from pydantic import BaseModel, Field, ConfigDict
21
+ from linkml_runtime.utils.schemaview import SchemaView
22
+ from linkml_runtime.linkml_model import EnumDefinition, PermissibleValue
23
+
24
# Not referenced anywhere in the visible part of this module.
# NOTE(review): presumably a cap for some lookup or report size - confirm
# against the rest of the file before relying on it.
LIMIT = 300

# oaklib is an optional dependency: record whether it could be imported so the
# evaluator and the CLI can degrade gracefully instead of failing at import.
# When the import fails, `get_adapter` is left undefined.
try:
    from oaklib import get_adapter
    HAS_OAK = True
except ImportError:
    HAS_OAK = False

# Configures the root logger as a side effect of importing this module;
# main() reconfigures it with force=True depending on --verbose.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
34
+
35
+
36
class ValidationConfig(BaseModel):
    """Settings that control how enum ontology mappings are validated.

    Unknown keyword arguments are rejected (``extra="forbid"``).
    """

    model_config = ConfigDict(extra="forbid")

    # Selector handed to OAK; the bare "sqlite:obo:" value switches the
    # evaluator to one adapter per CURIE prefix, created on demand.
    oak_adapter_string: str = Field(
        default="sqlite:obo:",
        description="OAK adapter string (e.g., sqlite:obo:, ols:, bioportal:)",
    )

    # When set, label mismatches are reported as errors rather than warnings.
    strict_mode: bool = Field(
        default=False,
        description="Treat warnings as errors",
    )

    # Enables the in-memory label cache kept by the evaluator.
    cache_labels: bool = Field(
        default=True,
        description="Cache ontology labels to avoid redundant lookups",
    )

    # Optional override; the evaluator falls back to the oak_config.yaml that
    # sits next to its module when this is left unset.
    oak_config_path: Optional[Path] = Field(
        default=None,
        description="Path to OAK configuration YAML file",
    )

    # Root directory under which one sub-directory per prefix is created.
    cache_dir: Path = Field(
        default=Path("cache"),
        description="Directory for storing cached terms",
    )
60
+
61
+
62
class ValidationIssue(BaseModel):
    """One finding produced while checking a permissible value.

    Unknown keyword arguments are rejected (``extra="forbid"``).
    """

    model_config = ConfigDict(extra="forbid")

    # Location of the finding: the enum and the permissible value inside it.
    enum_name: str
    value_name: str

    # Restricted by the regex to exactly ERROR, WARNING or INFO.
    severity: str = Field(pattern="^(ERROR|WARNING|INFO)$")

    # Human-readable description of the finding.
    message: str

    # Optional context: the mapped CURIE and the labels that were compared.
    meaning: Optional[str] = None
    expected_label: Optional[str] = None
    actual_label: Optional[str] = None
73
+
74
+
75
class ValidationResult(BaseModel):
    """Accumulated outcome of validating one schema.

    Holds the issues that were raised plus running counters of how many
    enums, permissible values and ontology mappings were inspected.
    """

    model_config = ConfigDict(extra="forbid")

    # Schema the result belongs to, when known.
    schema_path: Optional[Path] = None
    # Findings in the order they were recorded.
    issues: List[ValidationIssue] = Field(default_factory=list)
    # Running totals maintained by the evaluator.
    total_enums_checked: int = 0
    total_values_checked: int = 0
    total_mappings_checked: int = 0

    def has_errors(self) -> bool:
        """Return True when at least one recorded issue has severity ERROR."""
        return "ERROR" in (issue.severity for issue in self.issues)

    def has_warnings(self) -> bool:
        """Return True when at least one recorded issue has severity WARNING."""
        return "WARNING" in (issue.severity for issue in self.issues)

    def print_summary(self):
        """Print the counters and the number of issues per severity to stdout."""

        def tally(level: str) -> int:
            # Number of recorded issues carrying exactly this severity.
            return sum(1 for issue in self.issues if issue.severity == level)

        report_lines = (
            "\nValidation Summary:",
            f" Enums checked: {self.total_enums_checked}",
            f" Values checked: {self.total_values_checked}",
            f" Mappings checked: {self.total_mappings_checked}",
            f" Errors: {tally('ERROR')}",
            f" Warnings: {tally('WARNING')}",
            f" Info: {tally('INFO')}",
        )
        for line in report_lines:
            print(line)
107
+
108
+
109
+ class EnumEvaluator:
110
+ """Evaluator for validating ontology mappings in enums."""
111
+
112
def __init__(self, config: Optional[ValidationConfig] = None):
    """
    Set up evaluator state, load the OAK prefix config and prepare adapters.

    Args:
        config: Validation configuration; a default ValidationConfig is
            created when omitted.
    """
    if not config:
        config = ValidationConfig()
    self.config = config

    # Plain bookkeeping containers; these must exist before any lookup runs.
    self._label_cache = {} if config.cache_labels else None  # curie -> label
    self._per_prefix_adapters = {}  # prefix (or '_default') -> adapter / None
    self._prefix_caches = {}  # prefix -> {curie: label} mirror of disk cache
    self._warned_prefixes = set()  # prefixes for which no adapter was found

    # Needs self.config only; maps lower-cased prefix -> adapter string.
    self._oak_config = self._load_oak_config()
    # Needs self.config and self._per_prefix_adapters.
    self._initialize_oak()
126
+
127
def _load_oak_config(self) -> Dict[str, str]:
    """Load the per-prefix OAK adapter mapping from a YAML file.

    The file is ``self.config.oak_config_path`` when set, otherwise the
    ``oak_config.yaml`` located next to this module. Adapters are read from
    the top-level ``ontology_adapters`` key.

    Returns:
        Mapping of lower-cased prefix to adapter string. A value may be
        empty or None, which other methods treat as "skip validation for
        this prefix". An empty dict is returned when the file is missing,
        empty, has no ``ontology_adapters`` section, or cannot be parsed.
    """
    # Default to the config file shipped next to this module.
    config_path = self.config.oak_config_path or Path(__file__).parent / "oak_config.yaml"

    if not config_path.exists():
        logger.warning(f"OAK config file not found: {config_path}")
        return {}

    try:
        # YAML is UTF-8 by specification; do not depend on the locale default.
        with open(config_path, 'r', encoding='utf-8') as f:
            # safe_load returns None for an empty document.
            config_data = yaml.safe_load(f) or {}
        # A present-but-null section is treated like a missing one.
        adapters = config_data.get('ontology_adapters') or {}
        # Lower-case keys for case-insensitive lookup. str() keeps a key that
        # YAML parsed as a non-string from discarding the whole config.
        return {str(k).lower(): v for k, v in adapters.items()}
    except Exception as e:
        # Best effort: validation proceeds without per-prefix configuration.
        logger.warning(f"Could not load OAK config: {e}")
        return {}
147
+
148
def _get_cache_file(self, prefix: str) -> Path:
    """Return the path of the term cache CSV for *prefix*.

    The file lives at ``<cache_dir>/<prefix lower-cased>/terms.csv``. The
    containing directory is created as a side effect; the file itself is not.
    """
    prefix_dir = self.config.cache_dir / prefix.lower()
    prefix_dir.mkdir(parents=True, exist_ok=True)
    return prefix_dir.joinpath("terms.csv")
153
+
154
def _load_cache(self, prefix: str) -> Dict[str, str]:
    """Read the on-disk term cache for *prefix*.

    The first row of the CSV is treated as a header and skipped. Every
    following row with at least two columns contributes ``row[0] -> row[1]``
    (CURIE -> label); shorter rows are ignored.

    Returns:
        Mapping of CURIE to cached label. Empty when the file does not
        exist or is empty. If reading fails part-way, the entries read so
        far are returned and a warning is logged.
    """
    cache_file = self._get_cache_file(prefix)
    cache: Dict[str, str] = {}

    if not cache_file.exists():
        return cache

    try:
        with open(cache_file, 'r', newline='') as f:
            reader = csv.reader(f)
            # Skip the header. The default keeps a zero-byte file from
            # raising StopIteration and being reported as a load failure.
            next(reader, None)
            for row in reader:
                if len(row) >= 2:
                    cache[row[0]] = row[1]  # curie -> label
    except Exception as e:
        logger.warning(f"Could not load cache for {prefix}: {e}")

    return cache
171
+
172
def _save_to_cache(self, prefix: str, curie: str, label: Optional[str]):
    """Append one term to the on-disk cache for *prefix*, once per CURIE.

    Nothing is written for prefixes that are not listed in the OAK config,
    or when the CURIE already appears in the first column of the file.
    Rows have the columns ``curie, label, retrieved_at``; a missing label is
    stored as an empty string and the timestamp is ``datetime.now()`` in ISO
    format. The header row is written whenever the file is new or empty.

    Failures are logged and never raised: the cache is best effort.

    Args:
        prefix: CURIE prefix; decides which cache file is used.
        curie: Term identifier to record.
        label: Retrieved label, or None when the lookup produced nothing.
    """
    if prefix.lower() not in self._oak_config:
        return  # Only cache for configured prefixes

    cache_file = self._get_cache_file(prefix)

    # A header is needed for a new file and for an existing zero-byte file.
    # Without it the loader would discard the first data row as the header.
    needs_header = True
    if cache_file.exists():
        try:
            with open(cache_file, 'r', newline='') as f:
                reader = csv.reader(f)
                needs_header = next(reader, None) is None
                # Stop scanning at the first matching CURIE.
                if any(row and row[0] == curie for row in reader):
                    return
        except Exception as e:
            # Unreadable existing file: still try to append the row, but do
            # not inject a header into the middle of unknown content.
            logger.debug(f"Could not read cache for {prefix}: {e}")
            needs_header = False

    try:
        with open(cache_file, 'a', newline='') as f:
            writer = csv.writer(f)
            if needs_header:
                writer.writerow(['curie', 'label', 'retrieved_at'])
            writer.writerow([curie, label or '', datetime.now().isoformat()])
    except Exception as e:
        logger.warning(f"Could not save to cache for {prefix}: {e}")
211
+
212
def _initialize_oak(self):
    """Prepare OAK access according to the configured adapter string.

    Does nothing beyond logging when oaklib is unavailable or when the
    adapter string is the bare ``sqlite:obo:`` selector, in which case
    adapters are created per prefix on demand. For any other adapter string
    a single adapter is created and stored under the ``'_default'`` key; a
    failure to create it is logged and leaves no default adapter.
    """
    if not HAS_OAK:
        logger.warning("OAK is not installed. Install with: pip install oaklib")
        return

    selector = self.config.oak_adapter_string

    # Dynamic mode: no shared adapter, one per ontology is built lazily.
    if selector == "sqlite:obo:":
        logger.info("Using dynamic SemSQL adapter selection based on CURIE prefix")
        return

    # Any other selector (ols:, bioportal:, ...) maps to one shared adapter.
    try:
        default_adapter = get_adapter(selector)
    except Exception as e:
        logger.warning(f"Could not initialize OAK adapter: {e}")
    else:
        self._per_prefix_adapters['_default'] = default_adapter
        logger.info(f"Initialized OAK adapter: {selector}")
230
+
231
def get_ontology_label(self, curie: str) -> Optional[str]:
    """
    Get the label for an ontology term using OAK.

    Lookup order:
      1. the in-memory cache (when ``cache_labels`` is enabled);
      2. the on-disk cache for prefixes listed in the OAK config, loaded
         once per prefix on first use (also only when ``cache_labels`` is
         enabled, so disabling caching always forces a fresh lookup);
      3. an OAK adapter: the configured one for the prefix, a dynamically
         created ``sqlite:obo:<prefix>`` adapter, or the single default
         adapter, depending on configuration.

    The outcome of step 3 is remembered in memory and, for configured
    prefixes, appended to the on-disk cache.

    Args:
        curie: Term identifier of the form ``PREFIX:LOCAL_ID``.

    Returns:
        The term label, or None when the CURIE has no prefix, validation is
        switched off for the prefix (empty adapter string), no adapter is
        available, or the lookup fails.
    """
    # In-memory cache first; it also remembers misses as None.
    if self._label_cache is not None and curie in self._label_cache:
        return self._label_cache[curie]

    # Parse the CURIE to get the prefix.
    prefix = curie.split(":")[0] if ":" in curie else None
    if not prefix:
        return None

    prefix_lower = prefix.lower()
    configured = prefix_lower in self._oak_config
    adapter_string = self._oak_config[prefix_lower] if configured else None

    # An empty adapter string in the config means "skip validation entirely".
    if configured and not adapter_string:
        logger.debug(f"Skipping validation for {prefix} (empty adapter string in config)")
        self._per_prefix_adapters[prefix_lower] = None
        return None

    # On-disk cache for configured prefixes, loaded once per prefix.
    if configured and self._label_cache is not None:
        if prefix_lower not in self._prefix_caches:
            self._prefix_caches[prefix_lower] = self._load_cache(prefix)
        cached = self._prefix_caches[prefix_lower].get(curie)
        # An empty cached label records an earlier failed lookup; retry it
        # instead of trusting it, so a transient failure is not permanent.
        if cached:
            self._label_cache[curie] = cached
            return cached

    label = None

    if configured:
        # Adapter explicitly configured for this prefix.
        if prefix_lower not in self._per_prefix_adapters:
            try:
                self._per_prefix_adapters[prefix_lower] = get_adapter(adapter_string)
                logger.info(f"Created configured adapter for {prefix} ontology")
            except Exception as e:
                logger.warning(f"Could not create configured adapter for {prefix}: {e}")
                self._per_prefix_adapters[prefix_lower] = None
        adapter = self._per_prefix_adapters.get(prefix_lower)
    elif self.config.oak_adapter_string == "sqlite:obo:":
        # Dynamic mode: create a per-ontology adapter on demand.
        if prefix_lower not in self._per_prefix_adapters:
            try:
                adapter_string = f"sqlite:obo:{prefix_lower}"
                self._per_prefix_adapters[prefix_lower] = get_adapter(adapter_string)
                logger.info(f"Created adapter for {prefix} ontology")
            except Exception as e:
                logger.debug(f"Could not create adapter for {prefix}: {e}")
                # Track the unknown prefix for end-of-run reporting.
                self._warned_prefixes.add(prefix_lower)
                self._per_prefix_adapters[prefix_lower] = None
        adapter = self._per_prefix_adapters.get(prefix_lower)
    else:
        # Single shared adapter for other configurations (ols:, ...).
        adapter = self._per_prefix_adapters.get('_default')

    # Get the label.
    if adapter:
        try:
            label = adapter.label(curie)
        except Exception as e:
            logger.debug(f"Could not get label for {curie}: {e}")

    # Cache the result (including a miss) in memory.
    if self._label_cache is not None:
        self._label_cache[curie] = label

    # Persist for configured prefixes and keep the loaded mirror in sync.
    if configured:
        self._save_to_cache(prefix, curie, label)
        if prefix_lower in self._prefix_caches:
            self._prefix_caches[prefix_lower][curie] = label or ''

    return label
317
+
318
def is_prefix_configured(self, prefix: str) -> bool:
    """Tell whether *prefix* has a non-empty adapter string in the OAK config.

    The lookup is case-insensitive. A prefix that is listed with an empty
    adapter string counts as not configured.
    """
    adapter_string = self._oak_config.get(prefix.lower())
    return bool(adapter_string)
323
+
324
def normalize_string(self, s: str) -> str:
    """
    Normalize a string for label comparison.

    Every run of characters that are not letters or digits (punctuation,
    underscores, whitespace) becomes a single space; the result is stripped
    and lower-cased. Letters and digits are recognised in any script, so
    labels that differ only in a non-ASCII letter (for example a Greek
    prefix) stay distinct instead of collapsing to the same text.

    Args:
        s: Text to normalize; None or an empty string is accepted.

    Returns:
        The normalized text, or "" for empty input.
    """
    if not s:
        return ""
    # \W is Unicode-aware for str patterns and also covers whitespace;
    # '_' is added because it counts as a word character.
    s = re.sub(r'[\W_]+', ' ', s)
    return s.strip().lower()
336
+
337
def extract_aliases(self, pv: PermissibleValue, value_name: str) -> Set[str]:
    """
    Collect every name under which a permissible value may legitimately appear.

    Sources, in the order they are inspected:
    - the value name itself
    - ``pv.title``
    - ``pv.aliases``
    - the ``literal_form`` of each entry in ``pv.structured_aliases``
    - the annotations ``label``, ``display_name``, ``preferred_name`` and
      ``synonym`` (their ``.value`` when present, otherwise the annotation
      itself converted to a string)

    Empty or missing sources are skipped.
    """
    names = {value_name}

    if pv.title:
        names.add(pv.title)
    if pv.aliases:
        names.update(pv.aliases)

    # structured_aliases may be absent on the object altogether.
    for entry in getattr(pv, 'structured_aliases', None) or ():
        literal = getattr(entry, 'literal_form', None)
        if literal:
            names.add(literal)

    if pv.annotations:
        for key in ('label', 'display_name', 'preferred_name', 'synonym'):
            if key not in pv.annotations:
                continue
            val = pv.annotations[key]
            if not val:
                continue
            # Annotation objects carry the text in .value; plain values do not.
            names.add(str(val.value) if hasattr(val, 'value') else str(val))

    return names
372
+
373
def validate_enum(self, enum_def: EnumDefinition, enum_name: str) -> List[ValidationIssue]:
    """
    Check every permissible value of one enum that carries a ``meaning``.

    For each mapped value the ontology label is fetched and compared, after
    normalization, with the names returned by ``extract_aliases``.

    Severity rules:
    - label could not be retrieved: ERROR for configured prefixes, else INFO
    - label does not match: ERROR in strict mode or for configured
      prefixes, else WARNING

    Values without a meaning, and values whose prefix has an empty adapter
    string in the OAK config, are skipped.

    Returns:
        The issues found, in iteration order; empty when nothing is wrong.
    """
    issues = []

    if not enum_def.permissible_values:
        return issues

    for value_name, pv in enum_def.permissible_values.items():
        meaning = pv.meaning
        if not meaning:
            continue  # no ontology mapping to check

        # The prefix is everything before the first colon, if there is one.
        head, colon, _ = meaning.partition(":")
        prefix = head if colon else None

        # Prefix listed with an empty adapter string: validation disabled.
        if prefix and prefix.lower() in self._oak_config and not self._oak_config[prefix.lower()]:
            logger.debug(f"Skipping validation for {meaning} (empty adapter string in config)")
            continue

        actual_label = self.get_ontology_label(meaning)
        expected_labels = self.extract_aliases(pv, value_name)

        if actual_label is None:
            # Lookup failed: strict for configured prefixes, lenient otherwise.
            if prefix and self.is_prefix_configured(prefix):
                severity = "ERROR"
                message = f"Could not retrieve label for configured ontology term {meaning}"
            else:
                severity = "INFO"
                message = f"Could not retrieve label for {meaning}"
            issues.append(
                ValidationIssue(
                    enum_name=enum_name,
                    value_name=value_name,
                    severity=severity,
                    message=message,
                    meaning=meaning
                )
            )
            continue

        normalized_expected = {self.normalize_string(label) for label in expected_labels}
        normalized_actual = self.normalize_string(actual_label) if actual_label else None
        if normalized_actual in normalized_expected:
            continue  # label agrees with one of the accepted names

        # Mismatch: escalate in strict mode or for configured prefixes.
        escalate = self.config.strict_mode or (prefix and self.is_prefix_configured(prefix))
        issues.append(
            ValidationIssue(
                enum_name=enum_name,
                value_name=value_name,
                severity="ERROR" if escalate else "WARNING",
                message=f"Ontology label mismatch: expected one of {expected_labels}, got '{actual_label}'",
                meaning=meaning,
                expected_label=value_name,
                actual_label=actual_label
            )
        )

    return issues
442
+
443
def validate_schema(self, schema_path: Path) -> ValidationResult:
    """
    Validate every enum found in the schema at *schema_path*.

    The schema is loaded through SchemaView. For each enum the counters on
    the result are updated and the issues from ``validate_enum`` appended.
    Any exception raised along the way is logged and recorded as a single
    ERROR issue with the placeholder names ``<schema>`` / ``<error>``;
    counters and issues gathered before the failure are kept.

    Returns:
        The populated ValidationResult.
    """
    result = ValidationResult(schema_path=schema_path)

    try:
        view = SchemaView(str(schema_path))

        for enum_name, enum_def in view.all_enums().items():
            result.total_enums_checked += 1

            values = enum_def.permissible_values
            if values:
                result.total_values_checked += len(values)
                # Only values with a meaning count as ontology mappings.
                result.total_mappings_checked += sum(
                    1 for pv in values.values() if pv.meaning
                )

            result.issues.extend(self.validate_enum(enum_def, enum_name))

    except Exception as e:
        logger.error(f"Error validating schema {schema_path}: {e}")
        result.issues.append(
            ValidationIssue(
                enum_name="<schema>",
                value_name="<error>",
                severity="ERROR",
                message=f"Failed to validate schema: {e}",
                meaning=None
            )
        )

    return result
481
+
482
def report_unknown_prefixes(self) -> None:
    """Print the prefixes for which no adapter could be created, if any.

    Prefixes are listed in sorted order together with the adapter string
    that could be added to oak_config.yaml. Prints nothing when no unknown
    prefix was recorded.
    """
    if not self._warned_prefixes:
        return

    print("\n📋 Unknown ontology prefixes encountered:")
    print(" Consider adding these to oak_config.yaml if they are valid ontologies:")
    for name in sorted(self._warned_prefixes):
        print(f" • {name.upper()}: sqlite:obo:{name}")
    print(" Or remove the 'meaning:' mappings if these are not valid ontology terms.")
490
+
491
+
492
def _print_error_entries(entries):
    """Print the concise ``ERRORS:`` listing.

    Args:
        entries: Sequence of ``(schema_name, issue)`` pairs; every issue is
            printed, followed by a CURIE hint when it has a ``meaning``.
    """
    print("ERRORS:")
    for schema_name, issue in entries:
        print(f" • {schema_name}:{issue.enum_name}.{issue.value_name}: {issue.message}")
        if issue.meaning:
            print(f" Fix: Check CURIE {issue.meaning}")


def _print_warning_entries(entries, limit):
    """Print at most ``limit`` warnings, then a count of the ones left out.

    Args:
        entries: List of ``(schema_name, issue)`` pairs. A ``schema_name`` of
            ``None`` omits the file prefix (single-file mode).
        limit: Maximum number of warnings to print.
    """
    for schema_name, issue in entries[:limit]:
        location = f"{schema_name}:" if schema_name is not None else ""
        id_info = f" [{issue.meaning}]" if issue.meaning else ""
        print(f" • {location}{issue.enum_name}.{issue.value_name}{id_info}: {issue.message}")
    if len(entries) > limit:
        print(f" ... and {len(entries) - limit} more warnings")


def main(argv=None):
    """Main function for CLI usage.

    Validates a single schema file, or every ``*.yaml`` file below a
    directory (paths containing ``linkml_model`` are skipped), and prints a
    report whose detail depends on ``--verbose``.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the
            default) makes argparse read ``sys.argv[1:]``, which is the
            behavior of the previous zero-argument signature.

    Returns:
        ``0`` on success; ``1`` when OAK is unavailable, the path is neither
        a file nor a directory, errors were found, or warnings were found
        while ``--strict`` is set.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Validate LinkML enum ontology mappings")
    parser.add_argument("path", type=Path, help="Path to schema file or directory")
    parser.add_argument("--adapter", default="sqlite:obo:",
                        help="OAK adapter string (e.g., sqlite:obo:, sqlite:obo:merged, ols:, bioportal:)")
    parser.add_argument("--strict", action="store_true", help="Treat warnings as errors")
    parser.add_argument("--no-cache", action="store_true", help="Disable label caching")
    parser.add_argument("-v", "--verbose", action="store_true", help="Verbose output with detailed information")

    args = parser.parse_args(argv)

    # Build configuration
    config = ValidationConfig(
        oak_adapter_string=args.adapter,
        strict_mode=args.strict,
        cache_labels=not args.no_cache
    )

    # Configure logging based on verbose flag
    if args.verbose:
        logging.basicConfig(level=logging.INFO, force=True)
    else:
        # Suppress all logging output in non-verbose mode
        logging.basicConfig(level=logging.CRITICAL, force=True)
        # Also suppress oaklib and other library logging
        for logger_name in ['oaklib', 'root', 'pystow', 'linkml_runtime', 'urllib3', 'httpx', 'httpcore']:
            logging.getLogger(logger_name).setLevel(logging.CRITICAL)

        # Suppress pystow progress bars
        import os
        os.environ['PYSTOW_NO_PROGRESS'] = '1'

    # Check for OAK before building the evaluator so the install hint is
    # shown even if constructing the evaluator would need OAK.
    if not HAS_OAK:
        print("Error: OAK is not installed. Please install with: pip install oaklib")
        return 1

    evaluator = EnumEvaluator(config)

    # In strict mode warnings fail the run, so they must be listed; the note
    # makes clear why a warnings-only run exits non-zero.
    strict_note = " (treated as errors by --strict)" if args.strict else ""

    if args.path.is_file():
        result = evaluator.validate_schema(args.path)

        if not result.has_errors() and not result.has_warnings():
            if args.verbose:
                result.print_summary()
            else:
                print("✅")  # Just a checkmark for success

            # Report unknown prefixes even on success
            evaluator.report_unknown_prefixes()
            return 0

        if args.verbose:
            result.print_summary()
            # Show all issues in verbose mode
            for issue in result.issues:
                print(f"\n{issue.severity}: {issue.enum_name}.{issue.value_name}")
                print(f" {issue.message}")
                if issue.meaning:
                    print(f" CURIE: {issue.meaning}")
        else:
            # Concise output for non-verbose mode
            error_entries = [(args.path.name, i) for i in result.issues if i.severity == "ERROR"]
            warning_entries = [(None, i) for i in result.issues if i.severity == "WARNING"]

            if error_entries:
                print(f"❌ Validation failed with {len(error_entries)} error(s)\n")
                _print_error_entries(error_entries)

            if warning_entries:
                print(f"\n⚠️ {len(warning_entries)} warning(s){strict_note}:")
                # NOTE(review): LIMIT is not defined in the visible part of
                # this file; presumably a module-level constant (directory
                # mode below hard-codes 100) - confirm it exists and matches.
                _print_warning_entries(warning_entries, LIMIT)

        evaluator.report_unknown_prefixes()
        return 1 if result.has_errors() or (args.strict and result.has_warnings()) else 0

    if args.path.is_dir():
        all_results = []
        schema_files = sorted(f for f in args.path.rglob("*.yaml")
                              if "linkml_model" not in str(f))

        if args.verbose:
            print(f"🔍 Validating {len(schema_files)} schema files...\n")

        for schema_file in schema_files:
            if args.verbose:
                print(f"Validating {schema_file.name}...")

            result = evaluator.validate_schema(schema_file)
            result.schema_path = schema_file  # Store path for error reporting
            all_results.append(result)

            if args.verbose:
                result.print_summary()

        # Flatten once; the totals and the listings below share these lists.
        error_entries = [(r.schema_path.name, i)
                         for r in all_results for i in r.issues if i.severity == "ERROR"]
        warning_entries = [(r.schema_path.name, i)
                           for r in all_results for i in r.issues if i.severity == "WARNING"]
        total_errors = len(error_entries)
        total_warnings = len(warning_entries)

        if total_errors == 0 and total_warnings == 0:
            if args.verbose:
                print(f"\n{'='*60}")
                print(f"✅ All {len(schema_files)} schemas validated successfully!")
            else:
                print("✅")  # Just a checkmark for complete success

            # Report unknown prefixes even on success
            evaluator.report_unknown_prefixes()
            return 0

        if args.verbose:
            print(f"\n{'='*60}")
            print(f"Overall: {total_errors} errors, {total_warnings} warnings in {len(schema_files)} files")
        else:
            if total_errors > 0:
                print(f"❌ Validation failed with {total_errors} error(s) in {sum(1 for r in all_results if r.has_errors())} file(s)\n")
                _print_error_entries(error_entries)

            if total_warnings > 0:
                print(f"\n⚠️ {total_warnings} warning(s) in {sum(1 for r in all_results if r.has_warnings())} file(s){strict_note}")
                _print_warning_entries(warning_entries, 100)

        evaluator.report_unknown_prefixes()
        return 1 if total_errors > 0 or (args.strict and total_warnings > 0) else 0

    print(f"Error: {args.path} is not a file or directory")
    return 1
665
+
666
+
667
if __name__ == "__main__":
    # Script entry point: main()'s return value becomes the exit status.
    import sys

    exit_status = main()
    sys.exit(exit_status)