valuesets-0.3.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of valuesets might be problematic. Click here for more details.

Files changed (248)
  1. valuesets/__init__.py +7 -0
  2. valuesets/_version.py +8 -0
  3. valuesets/datamodel/valuesets.py +13796 -0
  4. valuesets/datamodel/valuesets_dataclass.py +24503 -0
  5. valuesets/datamodel/valuesets_pydantic.py +13796 -0
  6. valuesets/enums/__init__.py +590 -0
  7. valuesets/enums/academic/__init__.py +1 -0
  8. valuesets/enums/academic/research.py +559 -0
  9. valuesets/enums/analytical_chemistry/__init__.py +1 -0
  10. valuesets/enums/analytical_chemistry/mass_spectrometry.py +198 -0
  11. valuesets/enums/bio/__init__.py +1 -0
  12. valuesets/enums/bio/biological_colors.py +238 -0
  13. valuesets/enums/bio/cell_cycle.py +180 -0
  14. valuesets/enums/bio/currency_chemicals.py +52 -0
  15. valuesets/enums/bio/developmental_stages.py +103 -0
  16. valuesets/enums/bio/genome_features.py +182 -0
  17. valuesets/enums/bio/genomics.py +91 -0
  18. valuesets/enums/bio/go_aspect.py +32 -0
  19. valuesets/enums/bio/go_causality.py +58 -0
  20. valuesets/enums/bio/go_evidence.py +129 -0
  21. valuesets/enums/bio/human_developmental_stages.py +62 -0
  22. valuesets/enums/bio/insdc_geographic_locations.py +591 -0
  23. valuesets/enums/bio/insdc_missing_values.py +49 -0
  24. valuesets/enums/bio/lipid_categories.py +67 -0
  25. valuesets/enums/bio/mouse_developmental_stages.py +62 -0
  26. valuesets/enums/bio/plant_biology.py +86 -0
  27. valuesets/enums/bio/plant_developmental_stages.py +54 -0
  28. valuesets/enums/bio/plant_sex.py +81 -0
  29. valuesets/enums/bio/protein_evidence.py +61 -0
  30. valuesets/enums/bio/proteomics_standards.py +123 -0
  31. valuesets/enums/bio/psi_mi.py +306 -0
  32. valuesets/enums/bio/relationship_to_oxygen.py +37 -0
  33. valuesets/enums/bio/sequence_alphabets.py +449 -0
  34. valuesets/enums/bio/sequence_chemistry.py +357 -0
  35. valuesets/enums/bio/sequencing_platforms.py +302 -0
  36. valuesets/enums/bio/structural_biology.py +320 -0
  37. valuesets/enums/bio/taxonomy.py +238 -0
  38. valuesets/enums/bio/trophic_levels.py +85 -0
  39. valuesets/enums/bio/uniprot_species.py +344 -0
  40. valuesets/enums/bio/viral_genome_types.py +47 -0
  41. valuesets/enums/bioprocessing/__init__.py +1 -0
  42. valuesets/enums/bioprocessing/scale_up.py +249 -0
  43. valuesets/enums/business/__init__.py +1 -0
  44. valuesets/enums/business/human_resources.py +275 -0
  45. valuesets/enums/business/industry_classifications.py +181 -0
  46. valuesets/enums/business/management_operations.py +228 -0
  47. valuesets/enums/business/organizational_structures.py +236 -0
  48. valuesets/enums/business/quality_management.py +181 -0
  49. valuesets/enums/business/supply_chain.py +232 -0
  50. valuesets/enums/chemistry/__init__.py +1 -0
  51. valuesets/enums/chemistry/chemical_entities.py +315 -0
  52. valuesets/enums/chemistry/reaction_directionality.py +65 -0
  53. valuesets/enums/chemistry/reactions.py +256 -0
  54. valuesets/enums/clinical/__init__.py +1 -0
  55. valuesets/enums/clinical/nih_demographics.py +177 -0
  56. valuesets/enums/clinical/phenopackets.py +254 -0
  57. valuesets/enums/common_value_sets.py +8791 -0
  58. valuesets/enums/computing/__init__.py +1 -0
  59. valuesets/enums/computing/file_formats.py +294 -0
  60. valuesets/enums/computing/maturity_levels.py +196 -0
  61. valuesets/enums/computing/mime_types.py +227 -0
  62. valuesets/enums/confidence_levels.py +168 -0
  63. valuesets/enums/contributor.py +30 -0
  64. valuesets/enums/core.py +42 -0
  65. valuesets/enums/data/__init__.py +1 -0
  66. valuesets/enums/data/data_absent_reason.py +53 -0
  67. valuesets/enums/data_science/__init__.py +1 -0
  68. valuesets/enums/data_science/binary_classification.py +87 -0
  69. valuesets/enums/data_science/emotion_classification.py +66 -0
  70. valuesets/enums/data_science/priority_severity.py +73 -0
  71. valuesets/enums/data_science/quality_control.py +46 -0
  72. valuesets/enums/data_science/sentiment_analysis.py +50 -0
  73. valuesets/enums/data_science/text_classification.py +97 -0
  74. valuesets/enums/demographics.py +206 -0
  75. valuesets/enums/ecological_interactions.py +151 -0
  76. valuesets/enums/energy/__init__.py +1 -0
  77. valuesets/enums/energy/energy.py +343 -0
  78. valuesets/enums/energy/fossil_fuels.py +29 -0
  79. valuesets/enums/energy/nuclear/__init__.py +1 -0
  80. valuesets/enums/energy/nuclear/nuclear_facilities.py +195 -0
  81. valuesets/enums/energy/nuclear/nuclear_fuel_cycle.py +96 -0
  82. valuesets/enums/energy/nuclear/nuclear_fuels.py +175 -0
  83. valuesets/enums/energy/nuclear/nuclear_operations.py +191 -0
  84. valuesets/enums/energy/nuclear/nuclear_regulatory.py +188 -0
  85. valuesets/enums/energy/nuclear/nuclear_safety.py +164 -0
  86. valuesets/enums/energy/nuclear/nuclear_waste.py +158 -0
  87. valuesets/enums/energy/nuclear/reactor_types.py +163 -0
  88. valuesets/enums/environmental_health/__init__.py +1 -0
  89. valuesets/enums/environmental_health/exposures.py +265 -0
  90. valuesets/enums/geography/__init__.py +1 -0
  91. valuesets/enums/geography/geographic_codes.py +741 -0
  92. valuesets/enums/health/__init__.py +12 -0
  93. valuesets/enums/health/vaccination.py +98 -0
  94. valuesets/enums/health.py +36 -0
  95. valuesets/enums/health_base.py +36 -0
  96. valuesets/enums/healthcare.py +45 -0
  97. valuesets/enums/industry/__init__.py +1 -0
  98. valuesets/enums/industry/extractive_industry.py +94 -0
  99. valuesets/enums/industry/mining.py +388 -0
  100. valuesets/enums/industry/safety_colors.py +201 -0
  101. valuesets/enums/investigation.py +27 -0
  102. valuesets/enums/materials_science/__init__.py +1 -0
  103. valuesets/enums/materials_science/characterization_methods.py +112 -0
  104. valuesets/enums/materials_science/crystal_structures.py +76 -0
  105. valuesets/enums/materials_science/material_properties.py +119 -0
  106. valuesets/enums/materials_science/material_types.py +104 -0
  107. valuesets/enums/materials_science/pigments_dyes.py +198 -0
  108. valuesets/enums/materials_science/synthesis_methods.py +109 -0
  109. valuesets/enums/medical/__init__.py +1 -0
  110. valuesets/enums/medical/clinical.py +277 -0
  111. valuesets/enums/medical/neuroimaging.py +119 -0
  112. valuesets/enums/mining_processing.py +302 -0
  113. valuesets/enums/physics/__init__.py +1 -0
  114. valuesets/enums/physics/states_of_matter.py +46 -0
  115. valuesets/enums/social/__init__.py +1 -0
  116. valuesets/enums/social/person_status.py +29 -0
  117. valuesets/enums/spatial/__init__.py +1 -0
  118. valuesets/enums/spatial/spatial_qualifiers.py +246 -0
  119. valuesets/enums/statistics/__init__.py +5 -0
  120. valuesets/enums/statistics/prediction_outcomes.py +31 -0
  121. valuesets/enums/statistics.py +31 -0
  122. valuesets/enums/time/__init__.py +1 -0
  123. valuesets/enums/time/temporal.py +254 -0
  124. valuesets/enums/units/__init__.py +1 -0
  125. valuesets/enums/units/measurements.py +310 -0
  126. valuesets/enums/visual/__init__.py +1 -0
  127. valuesets/enums/visual/colors.py +376 -0
  128. valuesets/generators/__init__.py +19 -0
  129. valuesets/generators/auto_slot_injector.py +280 -0
  130. valuesets/generators/enhanced_pydantic_generator.py +100 -0
  131. valuesets/generators/enum_slot_generator.py +201 -0
  132. valuesets/generators/modular_rich_generator.py +353 -0
  133. valuesets/generators/prefix_standardizer.py +198 -0
  134. valuesets/generators/rich_enum.py +127 -0
  135. valuesets/generators/rich_pydantic_generator.py +310 -0
  136. valuesets/generators/smart_slot_syncer.py +428 -0
  137. valuesets/generators/sssom_generator.py +394 -0
  138. valuesets/merged/merged_hierarchy.yaml +21649 -0
  139. valuesets/schema/README.md +3 -0
  140. valuesets/schema/academic/research.yaml +911 -0
  141. valuesets/schema/analytical_chemistry/mass_spectrometry.yaml +206 -0
  142. valuesets/schema/bio/bio_entities.yaml +364 -0
  143. valuesets/schema/bio/biological_colors.yaml +434 -0
  144. valuesets/schema/bio/cell_cycle.yaml +309 -0
  145. valuesets/schema/bio/currency_chemicals.yaml +70 -0
  146. valuesets/schema/bio/developmental_stages.yaml +226 -0
  147. valuesets/schema/bio/genome_features.yaml +342 -0
  148. valuesets/schema/bio/genomics.yaml +101 -0
  149. valuesets/schema/bio/go_aspect.yaml +39 -0
  150. valuesets/schema/bio/go_causality.yaml +119 -0
  151. valuesets/schema/bio/go_evidence.yaml +215 -0
  152. valuesets/schema/bio/insdc_geographic_locations.yaml +911 -0
  153. valuesets/schema/bio/insdc_missing_values.yaml +85 -0
  154. valuesets/schema/bio/lipid_categories.yaml +72 -0
  155. valuesets/schema/bio/plant_biology.yaml +125 -0
  156. valuesets/schema/bio/plant_developmental_stages.yaml +77 -0
  157. valuesets/schema/bio/plant_sex.yaml +108 -0
  158. valuesets/schema/bio/protein_evidence.yaml +63 -0
  159. valuesets/schema/bio/proteomics_standards.yaml +116 -0
  160. valuesets/schema/bio/psi_mi.yaml +400 -0
  161. valuesets/schema/bio/relationship_to_oxygen.yaml +46 -0
  162. valuesets/schema/bio/sequence_alphabets.yaml +1168 -0
  163. valuesets/schema/bio/sequence_chemistry.yaml +477 -0
  164. valuesets/schema/bio/sequencing_platforms.yaml +515 -0
  165. valuesets/schema/bio/structural_biology.yaml +428 -0
  166. valuesets/schema/bio/taxonomy.yaml +453 -0
  167. valuesets/schema/bio/trophic_levels.yaml +118 -0
  168. valuesets/schema/bio/uniprot_species.yaml +1209 -0
  169. valuesets/schema/bio/viral_genome_types.yaml +99 -0
  170. valuesets/schema/bioprocessing/scale_up.yaml +458 -0
  171. valuesets/schema/business/human_resources.yaml +752 -0
  172. valuesets/schema/business/industry_classifications.yaml +448 -0
  173. valuesets/schema/business/management_operations.yaml +602 -0
  174. valuesets/schema/business/organizational_structures.yaml +645 -0
  175. valuesets/schema/business/quality_management.yaml +502 -0
  176. valuesets/schema/business/supply_chain.yaml +688 -0
  177. valuesets/schema/chemistry/chemical_entities.yaml +639 -0
  178. valuesets/schema/chemistry/reaction_directionality.yaml +60 -0
  179. valuesets/schema/chemistry/reactions.yaml +442 -0
  180. valuesets/schema/clinical/nih_demographics.yaml +285 -0
  181. valuesets/schema/clinical/phenopackets.yaml +429 -0
  182. valuesets/schema/computing/file_formats.yaml +631 -0
  183. valuesets/schema/computing/maturity_levels.yaml +229 -0
  184. valuesets/schema/computing/mime_types.yaml +266 -0
  185. valuesets/schema/confidence_levels.yaml +206 -0
  186. valuesets/schema/contributor.yaml +30 -0
  187. valuesets/schema/core.yaml +55 -0
  188. valuesets/schema/data/data_absent_reason.yaml +82 -0
  189. valuesets/schema/data_science/binary_classification.yaml +125 -0
  190. valuesets/schema/data_science/emotion_classification.yaml +109 -0
  191. valuesets/schema/data_science/priority_severity.yaml +122 -0
  192. valuesets/schema/data_science/quality_control.yaml +68 -0
  193. valuesets/schema/data_science/sentiment_analysis.yaml +81 -0
  194. valuesets/schema/data_science/text_classification.yaml +135 -0
  195. valuesets/schema/demographics.yaml +238 -0
  196. valuesets/schema/ecological_interactions.yaml +298 -0
  197. valuesets/schema/energy/energy.yaml +595 -0
  198. valuesets/schema/energy/fossil_fuels.yaml +28 -0
  199. valuesets/schema/energy/nuclear/nuclear_facilities.yaml +463 -0
  200. valuesets/schema/energy/nuclear/nuclear_fuel_cycle.yaml +82 -0
  201. valuesets/schema/energy/nuclear/nuclear_fuels.yaml +421 -0
  202. valuesets/schema/energy/nuclear/nuclear_operations.yaml +480 -0
  203. valuesets/schema/energy/nuclear/nuclear_regulatory.yaml +200 -0
  204. valuesets/schema/energy/nuclear/nuclear_safety.yaml +352 -0
  205. valuesets/schema/energy/nuclear/nuclear_waste.yaml +332 -0
  206. valuesets/schema/energy/nuclear/reactor_types.yaml +394 -0
  207. valuesets/schema/environmental_health/exposures.yaml +355 -0
  208. valuesets/schema/generated_slots.yaml +1828 -0
  209. valuesets/schema/geography/geographic_codes.yaml +1018 -0
  210. valuesets/schema/health/vaccination.yaml +102 -0
  211. valuesets/schema/health.yaml +38 -0
  212. valuesets/schema/healthcare.yaml +53 -0
  213. valuesets/schema/industry/extractive_industry.yaml +89 -0
  214. valuesets/schema/industry/mining.yaml +888 -0
  215. valuesets/schema/industry/safety_colors.yaml +375 -0
  216. valuesets/schema/investigation.yaml +64 -0
  217. valuesets/schema/materials_science/characterization_methods.yaml +193 -0
  218. valuesets/schema/materials_science/crystal_structures.yaml +138 -0
  219. valuesets/schema/materials_science/material_properties.yaml +135 -0
  220. valuesets/schema/materials_science/material_types.yaml +151 -0
  221. valuesets/schema/materials_science/pigments_dyes.yaml +465 -0
  222. valuesets/schema/materials_science/synthesis_methods.yaml +186 -0
  223. valuesets/schema/medical/clinical.yaml +610 -0
  224. valuesets/schema/medical/neuroimaging.yaml +325 -0
  225. valuesets/schema/mining_processing.yaml +295 -0
  226. valuesets/schema/physics/states_of_matter.yaml +46 -0
  227. valuesets/schema/slot_mixins.yaml +143 -0
  228. valuesets/schema/social/person_status.yaml +28 -0
  229. valuesets/schema/spatial/spatial_qualifiers.yaml +466 -0
  230. valuesets/schema/statistics/prediction_outcomes.yaml +26 -0
  231. valuesets/schema/statistics.yaml +34 -0
  232. valuesets/schema/time/temporal.yaml +435 -0
  233. valuesets/schema/types.yaml +15 -0
  234. valuesets/schema/units/measurements.yaml +675 -0
  235. valuesets/schema/valuesets.yaml +100 -0
  236. valuesets/schema/visual/colors.yaml +778 -0
  237. valuesets/utils/__init__.py +6 -0
  238. valuesets/utils/comparison.py +102 -0
  239. valuesets/utils/expand_dynamic_enums.py +414 -0
  240. valuesets/utils/mapping_utils.py +236 -0
  241. valuesets/validators/__init__.py +11 -0
  242. valuesets/validators/enum_evaluator.py +669 -0
  243. valuesets/validators/oak_config.yaml +70 -0
  244. valuesets/validators/validate_with_ols.py +241 -0
  245. valuesets-0.3.1.dist-info/METADATA +395 -0
  246. valuesets-0.3.1.dist-info/RECORD +248 -0
  247. valuesets-0.3.1.dist-info/WHEEL +4 -0
  248. valuesets-0.3.1.dist-info/licenses/LICENSE +201 -0
@@ -0,0 +1,6 @@
1
+ """Utilities for working with common value sets."""
2
+
3
+ from .comparison import same_meaning_as
4
+ from .expand_dynamic_enums import DynamicEnumExpander
5
+
6
+ __all__ = ["same_meaning_as", "DynamicEnumExpander"]
@@ -0,0 +1,102 @@
1
+ """Comparison utilities for enum values."""
2
+
3
+ from enum import Enum
4
+ from typing import Any, Optional, Union
5
+
6
+
7
def same_meaning_as(enum1: Any, enum2: Any) -> Optional[bool]:
    """
    Tell whether two enum members point at the same semantic meaning.

    Only the ``meaning`` attribute of each member is consulted. Rich enums
    generated from LinkML schemas carry the ontology term CURIE there; plain
    Python enums have no such attribute, in which case no verdict is given.

    Args:
        enum1: First enum value to compare
        enum2: Second enum value to compare

    Returns:
        True if both members carry a meaning and the meanings are equal
        False if both members carry a meaning and the meanings differ
        None if either argument is not an Enum member or lacks a meaning

    Examples:
        >>> from enum import Enum
        >>> class Fruit(Enum):
        ...     APPLE = "apple"
        ...     PEAR = "pear"
        ...     def __init__(self, value):
        ...         self.meaning = "ONTOLOGY:123" if value == "apple" else "ONTOLOGY:456"
        >>> class Produce(Enum):
        ...     ITEM1 = "item1"
        ...     ITEM2 = "item2"
        ...     def __init__(self, value):
        ...         self.meaning = "ONTOLOGY:123" if value == "item1" else "ONTOLOGY:789"
        >>> class Plain(Enum):
        ...     VALUE1 = "val1"
        ...     VALUE2 = "val2"

        Members of different enums that share a meaning:

        >>> same_meaning_as(Fruit.APPLE, Produce.ITEM1)
        True

        Members whose meanings differ:

        >>> same_meaning_as(Fruit.APPLE, Fruit.PEAR)
        False

        A member compared with itself:

        >>> same_meaning_as(Fruit.APPLE, Fruit.APPLE)
        True

        No verdict when a meaning is missing on either side:

        >>> same_meaning_as(Plain.VALUE1, Plain.VALUE2) is None
        True
        >>> same_meaning_as(Fruit.APPLE, Plain.VALUE1) is None
        True

        No verdict for non-enum or None arguments:

        >>> same_meaning_as("not_enum", Fruit.APPLE) is None
        True
        >>> same_meaning_as(None, Fruit.APPLE) is None
        True
    """
    # Anything that is not an Enum member (None included) cannot be compared.
    for candidate in (enum1, enum2):
        if not isinstance(candidate, Enum):
            return None

    first_meaning = getattr(enum1, 'meaning', None)
    second_meaning = getattr(enum2, 'meaning', None)

    # A verdict is only possible when both sides actually carry a meaning.
    if first_meaning is not None and second_meaning is not None:
        return first_meaning == second_meaning
    return None


if __name__ == "__main__":
    import doctest
    doctest.testmod(verbose=True)
@@ -0,0 +1,414 @@
1
+ #!/usr/bin/env python
2
+ """
3
+ Utility script to expand all dynamic enums from LinkML schemas using OAK's vskit.
4
+
5
+ This script:
6
+ 1. Scans all schema files for dynamic enum definitions
7
+ 2. Uses OAK's vskit expand_in_place to expand each dynamic enum
8
+ 3. Saves the expanded enums to a parallel directory structure under src/valuesets/expanded/
9
+ """
10
+
11
+ import yaml
12
+ from pathlib import Path
13
+ from typing import Dict, Any, Optional, Tuple
14
+ from concurrent.futures import ThreadPoolExecutor, as_completed
15
+ import logging
16
+ import tempfile
17
+ from oaklib.utilities.subsets.value_set_expander import ValueSetExpander
18
+ from copy import deepcopy
19
+
20
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
class DynamicEnumExpander:
    """Expands dynamic enums from LinkML schemas using OAK's vskit.

    A "dynamic" enum is any enum definition that carries a ``reachable_from``
    mapping. Each one is expanded through OAK's ``ValueSetExpander`` and the
    result is written as a standalone LinkML schema file under ``output_dir``,
    mirroring the directory layout of ``schema_dir``.
    """

    # CURIE prefix of the first source node -> ontology handle used when a
    # ``reachable_from`` block does not name its ``source_ontology``.
    _PREFIX_TO_ONTOLOGY = {
        'OBI:': 'obo:obi',
        'NCBITaxon:': 'obo:ncbitaxon',
        'MONDO:': 'obo:mondo',
        'HP:': 'obo:hp',
        'UBERON:': 'obo:uberon',
        'CL:': 'obo:cl',
        'PO:': 'obo:po',
        'PATO:': 'obo:pato',
        'CHEBI:': 'obo:chebi',
        'GO:': 'obo:go',
    }

    # Ontology used when nothing can be inferred from the source nodes.
    _DEFAULT_ONTOLOGY = 'obo:mondo'

    def __init__(self, schema_dir: Path, output_dir: Path):
        """
        Initialize the expander.

        Creates ``output_dir`` (including parents) if it does not exist yet.

        Args:
            schema_dir: Directory containing LinkML schema files
            output_dir: Directory where expanded enums will be saved
        """
        self.schema_dir = Path(schema_dir)
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Cache of value set expanders, keyed by the ontology string
        self.expanders = {}

    def get_expander(self, ontology: str) -> Optional["ValueSetExpander"]:
        """
        Get or create a ValueSetExpander for an ontology.

        Args:
            ontology: Ontology handle; ``obo:<id>`` is mapped to the adapter
                spec ``sqlite:obo:<id>``, anything else is passed through
                unchanged.

        Returns:
            The cached or newly created expander, or None if creation failed
            (failures are logged and not cached).
        """
        if ontology not in self.expanders:
            try:
                # Use OBO format for standard ontologies
                if ontology.startswith('obo:'):
                    ontology_id = ontology[len('obo:'):]
                    adapter_spec = f"sqlite:obo:{ontology_id}"
                else:
                    adapter_spec = ontology

                logger.info(f"Creating expander for {ontology}: {adapter_spec}")
                self.expanders[ontology] = ValueSetExpander(resource=adapter_spec)
            except Exception as e:
                logger.error(f"Failed to create expander for {ontology}: {e}")
                import traceback
                logger.error(f"Traceback: {traceback.format_exc()}")
                return None
        return self.expanders[ontology]

    @classmethod
    def _infer_ontology(cls, node: str) -> str:
        """Map a source node CURIE to an ontology handle via its prefix."""
        for prefix, ontology in cls._PREFIX_TO_ONTOLOGY.items():
            if node.startswith(prefix):
                return ontology
        return cls._DEFAULT_ONTOLOGY

    @staticmethod
    def _new_temp_path(suffix: str, registry: list) -> str:
        """Create an empty temp file, record it in ``registry``, return its path."""
        with tempfile.NamedTemporaryFile(mode='w', suffix=suffix, delete=False) as handle:
            # Register before anything else can fail so cleanup always sees it.
            registry.append(handle.name)
        return handle.name

    @staticmethod
    def _read_expanded_enum(path: str, enum_name: str) -> Optional[Dict[str, Any]]:
        """Load an expanded schema file and return the named enum, or None if absent."""
        # YAML streams are UTF-8 by specification; do not depend on the locale.
        with open(path, 'r', encoding='utf-8') as f:
            expanded_data = yaml.safe_load(f)
        if expanded_data and 'enums' in expanded_data and enum_name in expanded_data['enums']:
            return expanded_data['enums'][enum_name]
        return None

    def expand_dynamic_enum(self, enum_name: str, enum_def: Dict[str, Any],
                            source_file: Path,
                            source_schema: Optional[Dict[str, Any]] = None) -> Optional[Dict[str, Any]]:
        """
        Expand a single dynamic enum definition using OAK's expand_in_place.

        The enum is written into a minimal temporary schema, expanded with
        labels as permissible-value keys, and read back. If the label-keyed
        output cannot be parsed as YAML, the expansion is retried with term
        IDs as keys. All temporary files are removed before returning.

        Args:
            enum_name: Name of the enum
            enum_def: Enum definition from schema
            source_file: Path to the source schema file
            source_schema: The full source schema (for extracting prefixes)

        Returns:
            Dict with ``description``, ``permissible_values`` and a ``_source``
            provenance block, or None if the enum is not dynamic, no expander
            could be created, or the expansion failed (failures are logged).
        """
        if 'reachable_from' not in enum_def:
            return None

        reachable = enum_def['reachable_from']
        if not isinstance(reachable, dict):
            logger.error(f"Failed to expand {enum_name}: reachable_from is not a mapping")
            return None

        source_ontology = reachable.get('source_ontology')

        # If no source_ontology specified, try to infer from source_nodes
        if not source_ontology:
            source_nodes = reachable.get('source_nodes', [])
            if source_nodes:
                first_node = source_nodes[0] if isinstance(source_nodes, list) else source_nodes
                source_ontology = self._infer_ontology(first_node)
                logger.info(f" Inferred source_ontology={source_ontology} from node {first_node}")

        if not source_ontology:
            source_ontology = self._DEFAULT_ONTOLOGY  # Ultimate fallback

        logger.info(f"Expanding {enum_name} from {source_file}")
        logger.info(f" Source: {source_ontology}")

        expander = self.get_expander(source_ontology)
        if not expander:
            logger.error(f"Could not get expander for {source_ontology}")
            return None

        # Every temp file created below is registered here and removed in the
        # ``finally`` clause, whichever step fails.
        temp_paths = []
        try:
            # Work on a copy so the caller's schema is never mutated
            expanded_enum = deepcopy(enum_def)
            expanded_enum['reachable_from']['source_ontology'] = source_ontology

            # Create a minimal schema with just this enum
            schema_dict = {
                'id': f'https://example.org/temp/{enum_name}',
                'name': f'temp_schema_{enum_name}',
                'enums': {
                    enum_name: expanded_enum
                }
            }

            tmp_path = self._new_temp_path('.yaml', temp_paths)
            with open(tmp_path, 'w', encoding='utf-8') as tmp:
                yaml.safe_dump(schema_dict, tmp)

            # First attempt: use the term label as the permissible value text
            out_path = self._new_temp_path('_expanded.yaml', temp_paths)
            expander.expand_in_place(
                tmp_path,
                value_set_names=[enum_name],
                output_path=out_path,
                pv_syntax="{label}"
            )

            try:
                expanded_enum_def = self._read_expanded_enum(out_path, enum_name)
                if expanded_enum_def is None:
                    logger.warning(f"No expanded enum found for {enum_name}")
            except yaml.YAMLError as e:
                # Labels can contain characters that break the emitted YAML;
                # term IDs should be YAML-safe, so retry with those as keys.
                logger.warning(f"YAML parsing failed for {enum_name}, retrying with ID-based keys: {e}")
                safe_path = self._new_temp_path('_expanded_safe.yaml', temp_paths)
                expander.expand_in_place(
                    tmp_path,
                    value_set_names=[enum_name],
                    output_path=safe_path,
                    pv_syntax="{id}"
                )
                expanded_enum_def = self._read_expanded_enum(safe_path, enum_name)

            if expanded_enum_def is None:
                expanded_enum_def = {'permissible_values': {}}

            # Extract just the parts we need
            permissible_values = expanded_enum_def.get('permissible_values', {})
            result = {
                'description': expanded_enum_def.get('description', f'Expanded from {enum_name}'),
                'permissible_values': permissible_values,
                '_source': {
                    'enum_name': enum_name,
                    'source_file': str(source_file),
                    'source_ontology': source_ontology,
                    'reachable_from': reachable,
                    'total_terms': len(permissible_values),
                    'source_schema': source_schema  # Include source schema for prefix extraction
                }
            }

            logger.info(f" Expanded to {result['_source']['total_terms']} permissible values")
            return result

        except Exception as e:
            logger.error(f"Failed to expand {enum_name}: {e}")
            return None
        finally:
            for temp_path in temp_paths:
                Path(temp_path).unlink(missing_ok=True)

    def find_dynamic_enums(self) -> Dict[Path, Tuple[Dict[str, Any], Dict[str, Any]]]:
        """
        Find all dynamic enum definitions in schema files.

        Recursively scans ``schema_dir`` for ``*.yaml`` files. Files that
        cannot be read or parsed are logged and skipped. Enum entries that
        are not mappings (for example an empty stub) are ignored individually
        instead of discarding the whole file.

        Returns:
            Dictionary mapping file paths to tuples of (dynamic_enums, full_schema)
        """
        dynamic_enums = {}

        for yaml_file in self.schema_dir.rglob('*.yaml'):
            try:
                with open(yaml_file, 'r', encoding='utf-8') as f:
                    schema = yaml.safe_load(f)
            except Exception as e:
                logger.warning(f"Could not parse {yaml_file}: {e}")
                continue

            if not isinstance(schema, dict):
                continue
            enums = schema.get('enums')
            if not isinstance(enums, dict):
                continue

            file_dynamic_enums = {
                enum_name: enum_def
                for enum_name, enum_def in enums.items()
                if isinstance(enum_def, dict) and 'reachable_from' in enum_def
            }

            if file_dynamic_enums:
                dynamic_enums[yaml_file] = (file_dynamic_enums, schema)
                logger.info(f"Found {len(file_dynamic_enums)} dynamic enums in {yaml_file}")

        return dynamic_enums

    def save_expanded_enum(self, enum_name: str, expanded_enum: Dict[str, Any],
                           source_file: Path) -> Path:
        """
        Save an expanded enum to the output directory as a valid LinkML schema YAML.

        The file is written to ``output_dir/<source_file's directory relative
        to schema_dir>/<enum_name>.yaml``, encoded as UTF-8.

        Args:
            enum_name: Name of the enum
            expanded_enum: Expanded enum data
            source_file: Original source file path

        Returns:
            Path to the saved file

        Raises:
            ValueError: If ``source_file`` is not located under ``schema_dir``.
        """
        # Create parallel directory structure
        relative_path = source_file.relative_to(self.schema_dir)
        output_file = self.output_dir / relative_path.parent / f"{enum_name}.yaml"
        output_file.parent.mkdir(parents=True, exist_ok=True)

        # Get prefixes from source schema if available; tolerate null entries
        source_schema = expanded_enum.get('_source', {}).get('source_schema') or {}
        prefixes = dict(source_schema.get('prefixes') or {})

        # Ensure basic prefixes are present
        prefixes.setdefault('linkml', 'https://w3id.org/linkml/')
        prefixes.setdefault('valuesets', 'https://w3id.org/valuesets/')

        # Get default prefix from source or use valuesets
        default_prefix = source_schema.get('default_prefix', 'valuesets')

        # Create a valid LinkML schema with the enum
        schema_yaml = {
            'id': f'https://w3id.org/valuesets/expanded/{enum_name}',
            'name': f'{enum_name}_expanded',
            'description': f'Expanded value set for {enum_name}',
            'imports': ['linkml:types'],
            'prefixes': prefixes,
            'default_prefix': default_prefix,
            'enums': {
                enum_name: {
                    'description': expanded_enum['description'],
                    'permissible_values': expanded_enum['permissible_values']
                }
            }
        }

        # allow_unicode=True emits raw non-ASCII characters, so the file must
        # be opened as UTF-8 rather than with the platform default encoding.
        with open(output_file, 'w', encoding='utf-8') as f:
            yaml.safe_dump(schema_yaml, f, default_flow_style=False, sort_keys=False,
                           allow_unicode=True, width=120)

        logger.info(f"Saved expanded enum to {output_file}")
        return output_file

    def expand_all(self, max_workers: int = 4):
        """
        Expand all dynamic enums found in the schema directory.

        Expansions run on a thread pool; each successful result is saved with
        ``save_expanded_enum``. A summary of the run is written to
        ``expansion_summary.yaml`` in the output directory.

        Args:
            max_workers: Maximum number of parallel workers
        """
        # Find all dynamic enums
        dynamic_enums = self.find_dynamic_enums()

        if not dynamic_enums:
            logger.info("No dynamic enums found")
            return

        logger.info(f"Found dynamic enums in {len(dynamic_enums)} files")

        # Create a flat list of tasks
        tasks = [
            (enum_name, enum_def, file_path, schema)
            for file_path, (enums, schema) in dynamic_enums.items()
            for enum_name, enum_def in enums.items()
        ]

        total_enums = len(tasks)
        processed = 0
        failed = 0

        # Process in parallel
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = {
                executor.submit(self.expand_dynamic_enum, enum_name, enum_def, file_path, schema):
                    (enum_name, file_path)
                for enum_name, enum_def, file_path, schema in tasks
            }

            for future in as_completed(futures):
                enum_name, file_path = futures[future]
                try:
                    # as_completed only yields finished futures, so this never blocks
                    expanded_enum = future.result()
                    if expanded_enum:
                        self.save_expanded_enum(enum_name, expanded_enum, file_path)
                        processed += 1
                    else:
                        failed += 1
                except Exception as e:
                    logger.error(f"Failed to process {enum_name}: {e}")
                    failed += 1

        logger.info(f"Expansion complete: {processed} successful, {failed} failed out of {total_enums} total")

        # Create summary file
        summary = {
            'total_enums': total_enums,
            'processed': processed,
            'failed': failed,
            'source_files': [str(f) for f in dynamic_enums]
        }

        summary_file = self.output_dir / 'expansion_summary.yaml'
        with open(summary_file, 'w', encoding='utf-8') as f:
            yaml.safe_dump(summary, f, default_flow_style=False, sort_keys=False)

        logger.info(f"Summary saved to {summary_file}")
379
+
380
+
381
def main():
    """Command-line entry point: parse the options and run a full expansion."""
    import argparse

    cli = argparse.ArgumentParser(
        description='Expand dynamic enums from LinkML schemas using OAK'
    )

    # (flag, add_argument keyword arguments), registered in this order
    option_table = (
        ('--schema-dir', {
            'type': Path,
            'default': Path('src/valuesets/schema'),
            'help': 'Directory containing LinkML schema files',
        }),
        ('--output-dir', {
            'type': Path,
            'default': Path('src/valuesets/expanded'),
            'help': 'Output directory for expanded enums (default: src/valuesets/expanded)',
        }),
        ('--workers', {
            'type': int,
            'default': 4,
            'help': 'Number of parallel workers',
        }),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)

    options = cli.parse_args()

    runner = DynamicEnumExpander(options.schema_dir, options.output_dir)
    runner.expand_all(max_workers=options.workers)


if __name__ == '__main__':
    main()