valuesets 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of valuesets might be problematic. Click here for more details.
- valuesets/__init__.py +7 -0
- valuesets/_version.py +8 -0
- valuesets/datamodel/valuesets.py +13796 -0
- valuesets/datamodel/valuesets_dataclass.py +24503 -0
- valuesets/datamodel/valuesets_pydantic.py +13796 -0
- valuesets/enums/__init__.py +590 -0
- valuesets/enums/academic/__init__.py +1 -0
- valuesets/enums/academic/research.py +559 -0
- valuesets/enums/analytical_chemistry/__init__.py +1 -0
- valuesets/enums/analytical_chemistry/mass_spectrometry.py +198 -0
- valuesets/enums/bio/__init__.py +1 -0
- valuesets/enums/bio/biological_colors.py +238 -0
- valuesets/enums/bio/cell_cycle.py +180 -0
- valuesets/enums/bio/currency_chemicals.py +52 -0
- valuesets/enums/bio/developmental_stages.py +103 -0
- valuesets/enums/bio/genome_features.py +182 -0
- valuesets/enums/bio/genomics.py +91 -0
- valuesets/enums/bio/go_aspect.py +32 -0
- valuesets/enums/bio/go_causality.py +58 -0
- valuesets/enums/bio/go_evidence.py +129 -0
- valuesets/enums/bio/human_developmental_stages.py +62 -0
- valuesets/enums/bio/insdc_geographic_locations.py +591 -0
- valuesets/enums/bio/insdc_missing_values.py +49 -0
- valuesets/enums/bio/lipid_categories.py +67 -0
- valuesets/enums/bio/mouse_developmental_stages.py +62 -0
- valuesets/enums/bio/plant_biology.py +86 -0
- valuesets/enums/bio/plant_developmental_stages.py +54 -0
- valuesets/enums/bio/plant_sex.py +81 -0
- valuesets/enums/bio/protein_evidence.py +61 -0
- valuesets/enums/bio/proteomics_standards.py +123 -0
- valuesets/enums/bio/psi_mi.py +306 -0
- valuesets/enums/bio/relationship_to_oxygen.py +37 -0
- valuesets/enums/bio/sequence_alphabets.py +449 -0
- valuesets/enums/bio/sequence_chemistry.py +357 -0
- valuesets/enums/bio/sequencing_platforms.py +302 -0
- valuesets/enums/bio/structural_biology.py +320 -0
- valuesets/enums/bio/taxonomy.py +238 -0
- valuesets/enums/bio/trophic_levels.py +85 -0
- valuesets/enums/bio/uniprot_species.py +344 -0
- valuesets/enums/bio/viral_genome_types.py +47 -0
- valuesets/enums/bioprocessing/__init__.py +1 -0
- valuesets/enums/bioprocessing/scale_up.py +249 -0
- valuesets/enums/business/__init__.py +1 -0
- valuesets/enums/business/human_resources.py +275 -0
- valuesets/enums/business/industry_classifications.py +181 -0
- valuesets/enums/business/management_operations.py +228 -0
- valuesets/enums/business/organizational_structures.py +236 -0
- valuesets/enums/business/quality_management.py +181 -0
- valuesets/enums/business/supply_chain.py +232 -0
- valuesets/enums/chemistry/__init__.py +1 -0
- valuesets/enums/chemistry/chemical_entities.py +315 -0
- valuesets/enums/chemistry/reaction_directionality.py +65 -0
- valuesets/enums/chemistry/reactions.py +256 -0
- valuesets/enums/clinical/__init__.py +1 -0
- valuesets/enums/clinical/nih_demographics.py +177 -0
- valuesets/enums/clinical/phenopackets.py +254 -0
- valuesets/enums/common_value_sets.py +8791 -0
- valuesets/enums/computing/__init__.py +1 -0
- valuesets/enums/computing/file_formats.py +294 -0
- valuesets/enums/computing/maturity_levels.py +196 -0
- valuesets/enums/computing/mime_types.py +227 -0
- valuesets/enums/confidence_levels.py +168 -0
- valuesets/enums/contributor.py +30 -0
- valuesets/enums/core.py +42 -0
- valuesets/enums/data/__init__.py +1 -0
- valuesets/enums/data/data_absent_reason.py +53 -0
- valuesets/enums/data_science/__init__.py +1 -0
- valuesets/enums/data_science/binary_classification.py +87 -0
- valuesets/enums/data_science/emotion_classification.py +66 -0
- valuesets/enums/data_science/priority_severity.py +73 -0
- valuesets/enums/data_science/quality_control.py +46 -0
- valuesets/enums/data_science/sentiment_analysis.py +50 -0
- valuesets/enums/data_science/text_classification.py +97 -0
- valuesets/enums/demographics.py +206 -0
- valuesets/enums/ecological_interactions.py +151 -0
- valuesets/enums/energy/__init__.py +1 -0
- valuesets/enums/energy/energy.py +343 -0
- valuesets/enums/energy/fossil_fuels.py +29 -0
- valuesets/enums/energy/nuclear/__init__.py +1 -0
- valuesets/enums/energy/nuclear/nuclear_facilities.py +195 -0
- valuesets/enums/energy/nuclear/nuclear_fuel_cycle.py +96 -0
- valuesets/enums/energy/nuclear/nuclear_fuels.py +175 -0
- valuesets/enums/energy/nuclear/nuclear_operations.py +191 -0
- valuesets/enums/energy/nuclear/nuclear_regulatory.py +188 -0
- valuesets/enums/energy/nuclear/nuclear_safety.py +164 -0
- valuesets/enums/energy/nuclear/nuclear_waste.py +158 -0
- valuesets/enums/energy/nuclear/reactor_types.py +163 -0
- valuesets/enums/environmental_health/__init__.py +1 -0
- valuesets/enums/environmental_health/exposures.py +265 -0
- valuesets/enums/geography/__init__.py +1 -0
- valuesets/enums/geography/geographic_codes.py +741 -0
- valuesets/enums/health/__init__.py +12 -0
- valuesets/enums/health/vaccination.py +98 -0
- valuesets/enums/health.py +36 -0
- valuesets/enums/health_base.py +36 -0
- valuesets/enums/healthcare.py +45 -0
- valuesets/enums/industry/__init__.py +1 -0
- valuesets/enums/industry/extractive_industry.py +94 -0
- valuesets/enums/industry/mining.py +388 -0
- valuesets/enums/industry/safety_colors.py +201 -0
- valuesets/enums/investigation.py +27 -0
- valuesets/enums/materials_science/__init__.py +1 -0
- valuesets/enums/materials_science/characterization_methods.py +112 -0
- valuesets/enums/materials_science/crystal_structures.py +76 -0
- valuesets/enums/materials_science/material_properties.py +119 -0
- valuesets/enums/materials_science/material_types.py +104 -0
- valuesets/enums/materials_science/pigments_dyes.py +198 -0
- valuesets/enums/materials_science/synthesis_methods.py +109 -0
- valuesets/enums/medical/__init__.py +1 -0
- valuesets/enums/medical/clinical.py +277 -0
- valuesets/enums/medical/neuroimaging.py +119 -0
- valuesets/enums/mining_processing.py +302 -0
- valuesets/enums/physics/__init__.py +1 -0
- valuesets/enums/physics/states_of_matter.py +46 -0
- valuesets/enums/social/__init__.py +1 -0
- valuesets/enums/social/person_status.py +29 -0
- valuesets/enums/spatial/__init__.py +1 -0
- valuesets/enums/spatial/spatial_qualifiers.py +246 -0
- valuesets/enums/statistics/__init__.py +5 -0
- valuesets/enums/statistics/prediction_outcomes.py +31 -0
- valuesets/enums/statistics.py +31 -0
- valuesets/enums/time/__init__.py +1 -0
- valuesets/enums/time/temporal.py +254 -0
- valuesets/enums/units/__init__.py +1 -0
- valuesets/enums/units/measurements.py +310 -0
- valuesets/enums/visual/__init__.py +1 -0
- valuesets/enums/visual/colors.py +376 -0
- valuesets/generators/__init__.py +19 -0
- valuesets/generators/auto_slot_injector.py +280 -0
- valuesets/generators/enhanced_pydantic_generator.py +100 -0
- valuesets/generators/enum_slot_generator.py +201 -0
- valuesets/generators/modular_rich_generator.py +353 -0
- valuesets/generators/prefix_standardizer.py +198 -0
- valuesets/generators/rich_enum.py +127 -0
- valuesets/generators/rich_pydantic_generator.py +310 -0
- valuesets/generators/smart_slot_syncer.py +428 -0
- valuesets/generators/sssom_generator.py +394 -0
- valuesets/merged/merged_hierarchy.yaml +21649 -0
- valuesets/schema/README.md +3 -0
- valuesets/schema/academic/research.yaml +911 -0
- valuesets/schema/analytical_chemistry/mass_spectrometry.yaml +206 -0
- valuesets/schema/bio/bio_entities.yaml +364 -0
- valuesets/schema/bio/biological_colors.yaml +434 -0
- valuesets/schema/bio/cell_cycle.yaml +309 -0
- valuesets/schema/bio/currency_chemicals.yaml +70 -0
- valuesets/schema/bio/developmental_stages.yaml +226 -0
- valuesets/schema/bio/genome_features.yaml +342 -0
- valuesets/schema/bio/genomics.yaml +101 -0
- valuesets/schema/bio/go_aspect.yaml +39 -0
- valuesets/schema/bio/go_causality.yaml +119 -0
- valuesets/schema/bio/go_evidence.yaml +215 -0
- valuesets/schema/bio/insdc_geographic_locations.yaml +911 -0
- valuesets/schema/bio/insdc_missing_values.yaml +85 -0
- valuesets/schema/bio/lipid_categories.yaml +72 -0
- valuesets/schema/bio/plant_biology.yaml +125 -0
- valuesets/schema/bio/plant_developmental_stages.yaml +77 -0
- valuesets/schema/bio/plant_sex.yaml +108 -0
- valuesets/schema/bio/protein_evidence.yaml +63 -0
- valuesets/schema/bio/proteomics_standards.yaml +116 -0
- valuesets/schema/bio/psi_mi.yaml +400 -0
- valuesets/schema/bio/relationship_to_oxygen.yaml +46 -0
- valuesets/schema/bio/sequence_alphabets.yaml +1168 -0
- valuesets/schema/bio/sequence_chemistry.yaml +477 -0
- valuesets/schema/bio/sequencing_platforms.yaml +515 -0
- valuesets/schema/bio/structural_biology.yaml +428 -0
- valuesets/schema/bio/taxonomy.yaml +453 -0
- valuesets/schema/bio/trophic_levels.yaml +118 -0
- valuesets/schema/bio/uniprot_species.yaml +1209 -0
- valuesets/schema/bio/viral_genome_types.yaml +99 -0
- valuesets/schema/bioprocessing/scale_up.yaml +458 -0
- valuesets/schema/business/human_resources.yaml +752 -0
- valuesets/schema/business/industry_classifications.yaml +448 -0
- valuesets/schema/business/management_operations.yaml +602 -0
- valuesets/schema/business/organizational_structures.yaml +645 -0
- valuesets/schema/business/quality_management.yaml +502 -0
- valuesets/schema/business/supply_chain.yaml +688 -0
- valuesets/schema/chemistry/chemical_entities.yaml +639 -0
- valuesets/schema/chemistry/reaction_directionality.yaml +60 -0
- valuesets/schema/chemistry/reactions.yaml +442 -0
- valuesets/schema/clinical/nih_demographics.yaml +285 -0
- valuesets/schema/clinical/phenopackets.yaml +429 -0
- valuesets/schema/computing/file_formats.yaml +631 -0
- valuesets/schema/computing/maturity_levels.yaml +229 -0
- valuesets/schema/computing/mime_types.yaml +266 -0
- valuesets/schema/confidence_levels.yaml +206 -0
- valuesets/schema/contributor.yaml +30 -0
- valuesets/schema/core.yaml +55 -0
- valuesets/schema/data/data_absent_reason.yaml +82 -0
- valuesets/schema/data_science/binary_classification.yaml +125 -0
- valuesets/schema/data_science/emotion_classification.yaml +109 -0
- valuesets/schema/data_science/priority_severity.yaml +122 -0
- valuesets/schema/data_science/quality_control.yaml +68 -0
- valuesets/schema/data_science/sentiment_analysis.yaml +81 -0
- valuesets/schema/data_science/text_classification.yaml +135 -0
- valuesets/schema/demographics.yaml +238 -0
- valuesets/schema/ecological_interactions.yaml +298 -0
- valuesets/schema/energy/energy.yaml +595 -0
- valuesets/schema/energy/fossil_fuels.yaml +28 -0
- valuesets/schema/energy/nuclear/nuclear_facilities.yaml +463 -0
- valuesets/schema/energy/nuclear/nuclear_fuel_cycle.yaml +82 -0
- valuesets/schema/energy/nuclear/nuclear_fuels.yaml +421 -0
- valuesets/schema/energy/nuclear/nuclear_operations.yaml +480 -0
- valuesets/schema/energy/nuclear/nuclear_regulatory.yaml +200 -0
- valuesets/schema/energy/nuclear/nuclear_safety.yaml +352 -0
- valuesets/schema/energy/nuclear/nuclear_waste.yaml +332 -0
- valuesets/schema/energy/nuclear/reactor_types.yaml +394 -0
- valuesets/schema/environmental_health/exposures.yaml +355 -0
- valuesets/schema/generated_slots.yaml +1828 -0
- valuesets/schema/geography/geographic_codes.yaml +1018 -0
- valuesets/schema/health/vaccination.yaml +102 -0
- valuesets/schema/health.yaml +38 -0
- valuesets/schema/healthcare.yaml +53 -0
- valuesets/schema/industry/extractive_industry.yaml +89 -0
- valuesets/schema/industry/mining.yaml +888 -0
- valuesets/schema/industry/safety_colors.yaml +375 -0
- valuesets/schema/investigation.yaml +64 -0
- valuesets/schema/materials_science/characterization_methods.yaml +193 -0
- valuesets/schema/materials_science/crystal_structures.yaml +138 -0
- valuesets/schema/materials_science/material_properties.yaml +135 -0
- valuesets/schema/materials_science/material_types.yaml +151 -0
- valuesets/schema/materials_science/pigments_dyes.yaml +465 -0
- valuesets/schema/materials_science/synthesis_methods.yaml +186 -0
- valuesets/schema/medical/clinical.yaml +610 -0
- valuesets/schema/medical/neuroimaging.yaml +325 -0
- valuesets/schema/mining_processing.yaml +295 -0
- valuesets/schema/physics/states_of_matter.yaml +46 -0
- valuesets/schema/slot_mixins.yaml +143 -0
- valuesets/schema/social/person_status.yaml +28 -0
- valuesets/schema/spatial/spatial_qualifiers.yaml +466 -0
- valuesets/schema/statistics/prediction_outcomes.yaml +26 -0
- valuesets/schema/statistics.yaml +34 -0
- valuesets/schema/time/temporal.yaml +435 -0
- valuesets/schema/types.yaml +15 -0
- valuesets/schema/units/measurements.yaml +675 -0
- valuesets/schema/valuesets.yaml +100 -0
- valuesets/schema/visual/colors.yaml +778 -0
- valuesets/utils/__init__.py +6 -0
- valuesets/utils/comparison.py +102 -0
- valuesets/utils/expand_dynamic_enums.py +414 -0
- valuesets/utils/mapping_utils.py +236 -0
- valuesets/validators/__init__.py +11 -0
- valuesets/validators/enum_evaluator.py +669 -0
- valuesets/validators/oak_config.yaml +70 -0
- valuesets/validators/validate_with_ols.py +241 -0
- valuesets-0.3.1.dist-info/METADATA +395 -0
- valuesets-0.3.1.dist-info/RECORD +248 -0
- valuesets-0.3.1.dist-info/WHEEL +4 -0
- valuesets-0.3.1.dist-info/licenses/LICENSE +201 -0
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
"""Comparison utilities for enum values."""
|
|
2
|
+
|
|
3
|
+
from enum import Enum
|
|
4
|
+
from typing import Any, Optional, Union
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def same_meaning_as(enum1: Any, enum2: Any) -> Optional[bool]:
    """
    Report whether two enum members carry the same ``meaning``.

    The ``meaning`` attribute of each argument is looked up and the two values
    are compared with ``==``. For rich enums generated from LinkML schemas the
    ``meaning`` is the ontology term CURIE. Standard Python enums have no such
    attribute, so the comparison is undefined for them and ``None`` is returned.

    Args:
        enum1: First enum member to compare.
        enum2: Second enum member to compare.

    Returns:
        ``True`` if both members have a ``meaning`` and the meanings are equal.
        ``False`` if both members have a ``meaning`` but the meanings differ.
        ``None`` if either argument is ``None`` or is not an ``Enum`` member,
        or if either member's ``meaning`` is missing or ``None``.

    Examples:
        >>> from enum import Enum

        Two unrelated enums whose members are tagged with ontology terms:

        >>> class Tissue(Enum):
        ...     LIVER = "liver"
        ...     HEART = "heart"
        ...     def __init__(self, value):
        ...         self.meaning = {"liver": "ONTOLOGY:123", "heart": "ONTOLOGY:456"}[value]
        >>> class Organ(Enum):
        ...     HEPATIC = "hepatic"
        ...     RENAL = "renal"
        ...     def __init__(self, value):
        ...         self.meaning = {"hepatic": "ONTOLOGY:123", "renal": "ONTOLOGY:789"}[value]

        Members of different enums that share a term:

        >>> same_meaning_as(Tissue.LIVER, Organ.HEPATIC)
        True

        Members with different terms, and a member compared with itself:

        >>> same_meaning_as(Tissue.LIVER, Tissue.HEART)
        False
        >>> same_meaning_as(Tissue.LIVER, Tissue.LIVER)
        True

        A standard enum has no ``meaning``, so nothing can be decided:

        >>> class Plain(Enum):
        ...     LIVER = "liver"
        ...     HEART = "heart"
        >>> same_meaning_as(Plain.LIVER, Plain.HEART) is None
        True
        >>> same_meaning_as(Tissue.LIVER, Plain.LIVER) is None
        True

        Non-enum and ``None`` arguments are likewise undecidable:

        >>> same_meaning_as("not_enum", Tissue.LIVER) is None
        True
        >>> same_meaning_as(None, Tissue.LIVER) is None
        True
    """
    operands = (enum1, enum2)

    # None and any other non-Enum object fail this check together.
    if not all(isinstance(member, Enum) for member in operands):
        return None

    # A missing attribute and an explicit None are treated alike.
    left, right = (getattr(member, 'meaning', None) for member in operands)
    if left is None or right is None:
        return None

    return left == right
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
if __name__ == '__main__':
    # Executed as a script: run every doctest found in this module and
    # print each example as it is checked.
    from doctest import testmod

    testmod(verbose=True)
|
|
@@ -0,0 +1,414 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
"""
|
|
3
|
+
Utility script to expand all dynamic enums from LinkML schemas using OAK's vskit.
|
|
4
|
+
|
|
5
|
+
This script:
|
|
6
|
+
1. Scans all schema files for dynamic enum definitions
|
|
7
|
+
2. Uses OAK's vskit expand_in_place to expand each dynamic enum
|
|
8
|
+
3. Saves the expanded enums to a parallel directory structure under src/valuesets/expanded/
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import yaml
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from typing import Dict, Any, Optional, Tuple
|
|
14
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
15
|
+
import logging
|
|
16
|
+
import tempfile
|
|
17
|
+
from oaklib.utilities.subsets.value_set_expander import ValueSetExpander
|
|
18
|
+
from copy import deepcopy
|
|
19
|
+
|
|
20
|
+
# Configure the root logger at import time: INFO and above, each record
# prefixed with a timestamp and its level name.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Logger named after this module; used by everything below.
logger = logging.getLogger(__name__)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class DynamicEnumExpander:
    """Expands dynamic enums from LinkML schemas using OAK's vskit.

    An enum is treated as "dynamic" when its definition contains a
    ``reachable_from`` key. Each such enum is expanded through a
    ``ValueSetExpander`` and written out as a standalone schema file in a
    directory tree that mirrors the layout of the schema directory.
    """

    # CURIE prefix -> ontology selector, consulted only when ``reachable_from``
    # does not name a ``source_ontology`` itself.
    _PREFIX_TO_ONTOLOGY = {
        'OBI:': 'obo:obi',
        'NCBITaxon:': 'obo:ncbitaxon',
        'MONDO:': 'obo:mondo',
        'HP:': 'obo:hp',
        'UBERON:': 'obo:uberon',
        'CL:': 'obo:cl',
        'PO:': 'obo:po',
        'PATO:': 'obo:pato',
        'CHEBI:': 'obo:chebi',
        'GO:': 'obo:go',
    }

    # Ontology used when neither the schema nor the node prefix identifies one.
    _FALLBACK_ONTOLOGY = 'obo:mondo'

    def __init__(self, schema_dir: Path, output_dir: Path):
        """
        Initialize the expander.

        The output directory (and any missing parents) is created immediately.

        Args:
            schema_dir: Directory containing LinkML schema files
            output_dir: Directory where expanded enums will be saved
        """
        self.schema_dir = Path(schema_dir)
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Cache of value set expanders, keyed by the ontology string requested.
        self.expanders = {}

    def get_expander(self, ontology: str) -> Optional["ValueSetExpander"]:
        """Get or create a ValueSetExpander for an ontology.

        Args:
            ontology: Ontology selector. A value starting with ``obo:`` is
                rewritten to a ``sqlite:obo:<id>`` resource; anything else is
                passed through unchanged.

        Returns:
            The cached or newly created expander, or ``None`` if creating it
            raised (the error and traceback are logged).
        """
        # NOTE(review): this cache is a plain dict and expand_all() calls this
        # method from worker threads, so simultaneous first requests for one
        # ontology may each build an expander - confirm whether that matters.
        if ontology not in self.expanders:
            try:
                # Use OBO format for standard ontologies
                if ontology.startswith('obo:'):
                    ontology_id = ontology.replace('obo:', '')
                    adapter_spec = f"sqlite:obo:{ontology_id}"
                else:
                    adapter_spec = ontology

                logger.info(f"Creating expander for {ontology}: {adapter_spec}")
                self.expanders[ontology] = ValueSetExpander(resource=adapter_spec)
            except Exception as e:
                logger.error(f"Failed to create expander for {ontology}: {e}")
                import traceback
                logger.error(f"Traceback: {traceback.format_exc()}")
                return None
        return self.expanders[ontology]

    @classmethod
    def _infer_source_ontology(cls, first_node: str) -> str:
        """Map a node CURIE to an ontology selector by prefix, else the fallback."""
        for prefix, ontology in cls._PREFIX_TO_ONTOLOGY.items():
            if first_node.startswith(prefix):
                return ontology
        return cls._FALLBACK_ONTOLOGY

    @staticmethod
    def _new_temp_path(suffix: str) -> str:
        """Create an empty named temp file and return its path; the caller deletes it."""
        with tempfile.NamedTemporaryFile(mode='w', suffix=suffix, delete=False) as handle:
            return handle.name

    @staticmethod
    def _read_expanded_enum(path: str, enum_name: str, *, warn_if_missing: bool) -> Any:
        """Load an expanded schema file and return the definition stored for ``enum_name``.

        Returns an empty-``permissible_values`` placeholder when the file has
        no entry for the enum. ``yaml.YAMLError`` from parsing propagates.
        """
        with open(path, 'r') as f:
            expanded_data = yaml.safe_load(f)

        if expanded_data and 'enums' in expanded_data and enum_name in expanded_data['enums']:
            return expanded_data['enums'][enum_name]

        if warn_if_missing:
            logger.warning(f"No expanded enum found for {enum_name}")
        return {'permissible_values': {}}

    def expand_dynamic_enum(self, enum_name: str, enum_def: Dict[str, Any],
                            source_file: Path,
                            source_schema: Optional[Dict[str, Any]] = None) -> Optional[Dict[str, Any]]:
        """
        Expand a single dynamic enum definition using OAK's expand_in_place.

        The enum is copied into a minimal one-enum schema, written to a
        temporary file and expanded with label-based permissible-value keys.
        If the expanded output cannot be parsed as YAML, the expansion is
        retried once with ID-based keys. All temporary files are removed
        before returning, including when a step fails part-way.

        Args:
            enum_name: Name of the enum
            enum_def: Enum definition from schema
            source_file: Path to the source schema file
            source_schema: The full source schema (for extracting prefixes)

        Returns:
            A dict with ``description``, ``permissible_values`` and a
            ``_source`` provenance record, or ``None`` when the definition has
            no ``reachable_from``, no expander could be obtained, or the
            expansion raised (the error is logged).
        """
        if 'reachable_from' not in enum_def:
            return None

        reachable = enum_def['reachable_from']
        source_ontology = reachable.get('source_ontology')

        # If no source_ontology specified, try to infer from source_nodes
        if not source_ontology:
            source_nodes = reachable.get('source_nodes', [])
            if source_nodes:
                first_node = source_nodes[0] if isinstance(source_nodes, list) else source_nodes
                source_ontology = self._infer_source_ontology(first_node)
                logger.info(f" Inferred source_ontology={source_ontology} from node {first_node}")

        if not source_ontology:
            source_ontology = self._FALLBACK_ONTOLOGY  # Ultimate fallback

        logger.info(f"Expanding {enum_name} from {source_file}")
        logger.info(f" Source: {source_ontology}")

        expander = self.get_expander(source_ontology)
        if not expander:
            logger.error(f"Could not get expander for {source_ontology}")
            return None

        try:
            # Work on a copy so the caller's definition is never mutated.
            expanded_enum = deepcopy(enum_def)
            expanded_enum['reachable_from']['source_ontology'] = source_ontology

            # Minimal schema containing just this enum
            schema_dict = {
                'id': f'https://example.org/temp/{enum_name}',
                'name': f'temp_schema_{enum_name}',
                'enums': {
                    enum_name: expanded_enum
                }
            }

            # Every temp file is registered here as soon as it exists, so the
            # finally clause removes it whichever later step fails.
            temp_paths = []
            try:
                with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as tmp:
                    temp_paths.append(tmp.name)
                    yaml.safe_dump(schema_dict, tmp)
                tmp_path = temp_paths[0]

                out_path = self._new_temp_path('_expanded.yaml')
                temp_paths.append(out_path)

                # First attempt: use the term label as the permissible-value text.
                expander.expand_in_place(
                    tmp_path,
                    value_set_names=[enum_name],
                    output_path=out_path,
                    pv_syntax="{label}"
                )

                try:
                    expanded_enum_def = self._read_expanded_enum(
                        out_path, enum_name, warn_if_missing=True)
                except yaml.YAMLError as e:
                    # Retry with ID-based keys, which should be YAML-safe.
                    logger.warning(f"YAML parsing failed for {enum_name}, retrying with ID-based keys: {e}")
                    Path(out_path).unlink(missing_ok=True)

                    out_path2 = self._new_temp_path('_expanded_safe.yaml')
                    temp_paths.append(out_path2)

                    expander.expand_in_place(
                        tmp_path,
                        value_set_names=[enum_name],
                        output_path=out_path2,
                        pv_syntax="{id}"
                    )
                    expanded_enum_def = self._read_expanded_enum(
                        out_path2, enum_name, warn_if_missing=False)
            finally:
                for temp_path in temp_paths:
                    Path(temp_path).unlink(missing_ok=True)

            # Extract just the parts we need
            permissible_values = expanded_enum_def.get('permissible_values', {})
            result = {
                'description': expanded_enum_def.get('description', f'Expanded from {enum_name}'),
                'permissible_values': permissible_values,
                '_source': {
                    'enum_name': enum_name,
                    'source_file': str(source_file),
                    'source_ontology': source_ontology,
                    'reachable_from': reachable,
                    'total_terms': len(permissible_values),
                    'source_schema': source_schema  # Kept for prefix extraction when saving
                }
            }

            logger.info(f" Expanded to {result['_source']['total_terms']} permissible values")
            return result

        except Exception as e:
            logger.error(f"Failed to expand {enum_name}: {e}")
            return None

    def find_dynamic_enums(self) -> Dict[Path, Tuple[Dict[str, Any], Dict[str, Any]]]:
        """
        Find all dynamic enum definitions in schema files.

        Every ``*.yaml`` file under the schema directory (recursively) is
        loaded. Files that fail to load are logged and skipped. Enum entries
        that are not mappings (for example an enum declared with an empty
        body) are ignored individually rather than discarding the whole file.

        Returns:
            Dictionary mapping file paths to tuples of (dynamic_enums, full_schema)
        """
        dynamic_enums = {}

        for yaml_file in self.schema_dir.rglob('*.yaml'):
            try:
                # Read explicitly as UTF-8 rather than the locale default.
                with open(yaml_file, 'r', encoding='utf-8') as f:
                    schema = yaml.safe_load(f)

                if not isinstance(schema, dict):
                    continue

                enums = schema.get('enums')
                if not isinstance(enums, dict):
                    continue

                file_dynamic_enums = {
                    enum_name: enum_def
                    for enum_name, enum_def in enums.items()
                    if isinstance(enum_def, dict) and 'reachable_from' in enum_def
                }

                if file_dynamic_enums:
                    dynamic_enums[yaml_file] = (file_dynamic_enums, schema)
                    logger.info(f"Found {len(file_dynamic_enums)} dynamic enums in {yaml_file}")

            except Exception as e:
                logger.warning(f"Could not parse {yaml_file}: {e}")

        return dynamic_enums

    def save_expanded_enum(self, enum_name: str, expanded_enum: Dict[str, Any],
                           source_file: Path) -> Path:
        """
        Save an expanded enum to the output directory as a valid LinkML schema YAML.

        The file is written to ``<output_dir>/<source file's directory relative
        to schema_dir>/<enum_name>.yaml``, encoded as UTF-8.

        Args:
            enum_name: Name of the enum
            expanded_enum: Expanded enum data; must contain ``description``
                and ``permissible_values``
            source_file: Original source file path; must be located under
                the schema directory

        Returns:
            Path to the saved file
        """
        # Create parallel directory structure
        relative_path = source_file.relative_to(self.schema_dir)
        output_file = self.output_dir / relative_path.parent / f"{enum_name}.yaml"
        output_file.parent.mkdir(parents=True, exist_ok=True)

        # Get prefixes from source schema if available; a missing or null
        # schema / prefixes entry simply yields an empty starting map.
        source_schema = expanded_enum.get('_source', {}).get('source_schema') or {}
        prefixes = dict(source_schema.get('prefixes') or {})

        # Ensure basic prefixes are present without overriding the source's own.
        prefixes.setdefault('linkml', 'https://w3id.org/linkml/')
        prefixes.setdefault('valuesets', 'https://w3id.org/valuesets/')

        # Get default prefix from source or use valuesets
        default_prefix = source_schema.get('default_prefix', 'valuesets')

        # Create a valid LinkML schema with the enum
        schema_yaml = {
            'id': f'https://w3id.org/valuesets/expanded/{enum_name}',
            'name': f'{enum_name}_expanded',
            'description': f'Expanded value set for {enum_name}',
            'imports': ['linkml:types'],
            'prefixes': prefixes,
            'default_prefix': default_prefix,
            'enums': {
                enum_name: {
                    'description': expanded_enum['description'],
                    'permissible_values': expanded_enum['permissible_values']
                }
            }
        }

        # allow_unicode=True emits non-ASCII characters verbatim, so the file
        # must be opened as UTF-8; the locale default may be unable to encode
        # them. safe_dump avoids Python object tags.
        with open(output_file, 'w', encoding='utf-8') as f:
            yaml.safe_dump(schema_yaml, f, default_flow_style=False, sort_keys=False,
                           allow_unicode=True, width=120)

        logger.info(f"Saved expanded enum to {output_file}")
        return output_file

    def expand_all(self, max_workers: int = 4):
        """
        Expand all dynamic enums found in the schema directory.

        Each enum is expanded on a thread pool and saved as soon as its
        expansion finishes. A summary of the run is written to
        ``expansion_summary.yaml`` in the output directory. Nothing is written
        when no dynamic enums are found.

        Args:
            max_workers: Maximum number of parallel workers
        """
        dynamic_enums = self.find_dynamic_enums()

        if not dynamic_enums:
            logger.info("No dynamic enums found")
            return

        logger.info(f"Found dynamic enums in {len(dynamic_enums)} files")

        total_enums = sum(len(enums_and_schema[0]) for enums_and_schema in dynamic_enums.values())
        processed = 0
        failed = 0

        # Flatten to one task per enum
        tasks = []
        for file_path, (enums, schema) in dynamic_enums.items():
            for enum_name, enum_def in enums.items():
                tasks.append((enum_name, enum_def, file_path, schema))

        # Process in parallel
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = {}
            for enum_name, enum_def, file_path, schema in tasks:
                future = executor.submit(
                    self.expand_dynamic_enum,
                    enum_name,
                    enum_def,
                    file_path,
                    schema
                )
                futures[future] = (enum_name, file_path)

            for future in as_completed(futures):
                enum_name, file_path = futures[future]
                try:
                    # as_completed only yields finished futures, so result()
                    # returns at once and needs no timeout.
                    expanded_enum = future.result()
                    if expanded_enum:
                        self.save_expanded_enum(enum_name, expanded_enum, file_path)
                        processed += 1
                    else:
                        failed += 1
                except Exception as e:
                    logger.error(f"Failed to process {enum_name}: {e}")
                    failed += 1

        logger.info(f"Expansion complete: {processed} successful, {failed} failed out of {total_enums} total")

        # Create summary file
        summary = {
            'total_enums': total_enums,
            'processed': processed,
            'failed': failed,
            'source_files': [str(f) for f in dynamic_enums.keys()]
        }

        summary_file = self.output_dir / 'expansion_summary.yaml'
        with open(summary_file, 'w', encoding='utf-8') as f:
            yaml.safe_dump(summary, f, default_flow_style=False, sort_keys=False)

        logger.info(f"Summary saved to {summary_file}")
|
|
379
|
+
|
|
380
|
+
|
|
381
|
+
def main():
    """Command-line entry point: parse the options and expand every dynamic enum."""
    from argparse import ArgumentParser

    cli = ArgumentParser(description='Expand dynamic enums from LinkML schemas using OAK')

    # Where to read schemas from and where to write the expanded enums.
    cli.add_argument('--schema-dir', type=Path, default=Path('src/valuesets/schema'),
                     help='Directory containing LinkML schema files')
    cli.add_argument('--output-dir', type=Path, default=Path('src/valuesets/expanded'),
                     help='Output directory for expanded enums (default: src/valuesets/expanded)')

    # Size of the thread pool used for the expansion.
    cli.add_argument('--workers', type=int, default=4,
                     help='Number of parallel workers')

    options = cli.parse_args()

    runner = DynamicEnumExpander(options.schema_dir, options.output_dir)
    runner.expand_all(max_workers=options.workers)
|
|
411
|
+
|
|
412
|
+
|
|
413
|
+
if __name__ == "__main__":
    # Run the command-line interface only when executed as a script,
    # never as a side effect of importing this module.
    main()
|