valuesets 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of valuesets might be problematic. Click here for more details.
- valuesets/__init__.py +7 -0
- valuesets/_version.py +8 -0
- valuesets/datamodel/valuesets.py +13796 -0
- valuesets/datamodel/valuesets_dataclass.py +24503 -0
- valuesets/datamodel/valuesets_pydantic.py +13796 -0
- valuesets/enums/__init__.py +590 -0
- valuesets/enums/academic/__init__.py +1 -0
- valuesets/enums/academic/research.py +559 -0
- valuesets/enums/analytical_chemistry/__init__.py +1 -0
- valuesets/enums/analytical_chemistry/mass_spectrometry.py +198 -0
- valuesets/enums/bio/__init__.py +1 -0
- valuesets/enums/bio/biological_colors.py +238 -0
- valuesets/enums/bio/cell_cycle.py +180 -0
- valuesets/enums/bio/currency_chemicals.py +52 -0
- valuesets/enums/bio/developmental_stages.py +103 -0
- valuesets/enums/bio/genome_features.py +182 -0
- valuesets/enums/bio/genomics.py +91 -0
- valuesets/enums/bio/go_aspect.py +32 -0
- valuesets/enums/bio/go_causality.py +58 -0
- valuesets/enums/bio/go_evidence.py +129 -0
- valuesets/enums/bio/human_developmental_stages.py +62 -0
- valuesets/enums/bio/insdc_geographic_locations.py +591 -0
- valuesets/enums/bio/insdc_missing_values.py +49 -0
- valuesets/enums/bio/lipid_categories.py +67 -0
- valuesets/enums/bio/mouse_developmental_stages.py +62 -0
- valuesets/enums/bio/plant_biology.py +86 -0
- valuesets/enums/bio/plant_developmental_stages.py +54 -0
- valuesets/enums/bio/plant_sex.py +81 -0
- valuesets/enums/bio/protein_evidence.py +61 -0
- valuesets/enums/bio/proteomics_standards.py +123 -0
- valuesets/enums/bio/psi_mi.py +306 -0
- valuesets/enums/bio/relationship_to_oxygen.py +37 -0
- valuesets/enums/bio/sequence_alphabets.py +449 -0
- valuesets/enums/bio/sequence_chemistry.py +357 -0
- valuesets/enums/bio/sequencing_platforms.py +302 -0
- valuesets/enums/bio/structural_biology.py +320 -0
- valuesets/enums/bio/taxonomy.py +238 -0
- valuesets/enums/bio/trophic_levels.py +85 -0
- valuesets/enums/bio/uniprot_species.py +344 -0
- valuesets/enums/bio/viral_genome_types.py +47 -0
- valuesets/enums/bioprocessing/__init__.py +1 -0
- valuesets/enums/bioprocessing/scale_up.py +249 -0
- valuesets/enums/business/__init__.py +1 -0
- valuesets/enums/business/human_resources.py +275 -0
- valuesets/enums/business/industry_classifications.py +181 -0
- valuesets/enums/business/management_operations.py +228 -0
- valuesets/enums/business/organizational_structures.py +236 -0
- valuesets/enums/business/quality_management.py +181 -0
- valuesets/enums/business/supply_chain.py +232 -0
- valuesets/enums/chemistry/__init__.py +1 -0
- valuesets/enums/chemistry/chemical_entities.py +315 -0
- valuesets/enums/chemistry/reaction_directionality.py +65 -0
- valuesets/enums/chemistry/reactions.py +256 -0
- valuesets/enums/clinical/__init__.py +1 -0
- valuesets/enums/clinical/nih_demographics.py +177 -0
- valuesets/enums/clinical/phenopackets.py +254 -0
- valuesets/enums/common_value_sets.py +8791 -0
- valuesets/enums/computing/__init__.py +1 -0
- valuesets/enums/computing/file_formats.py +294 -0
- valuesets/enums/computing/maturity_levels.py +196 -0
- valuesets/enums/computing/mime_types.py +227 -0
- valuesets/enums/confidence_levels.py +168 -0
- valuesets/enums/contributor.py +30 -0
- valuesets/enums/core.py +42 -0
- valuesets/enums/data/__init__.py +1 -0
- valuesets/enums/data/data_absent_reason.py +53 -0
- valuesets/enums/data_science/__init__.py +1 -0
- valuesets/enums/data_science/binary_classification.py +87 -0
- valuesets/enums/data_science/emotion_classification.py +66 -0
- valuesets/enums/data_science/priority_severity.py +73 -0
- valuesets/enums/data_science/quality_control.py +46 -0
- valuesets/enums/data_science/sentiment_analysis.py +50 -0
- valuesets/enums/data_science/text_classification.py +97 -0
- valuesets/enums/demographics.py +206 -0
- valuesets/enums/ecological_interactions.py +151 -0
- valuesets/enums/energy/__init__.py +1 -0
- valuesets/enums/energy/energy.py +343 -0
- valuesets/enums/energy/fossil_fuels.py +29 -0
- valuesets/enums/energy/nuclear/__init__.py +1 -0
- valuesets/enums/energy/nuclear/nuclear_facilities.py +195 -0
- valuesets/enums/energy/nuclear/nuclear_fuel_cycle.py +96 -0
- valuesets/enums/energy/nuclear/nuclear_fuels.py +175 -0
- valuesets/enums/energy/nuclear/nuclear_operations.py +191 -0
- valuesets/enums/energy/nuclear/nuclear_regulatory.py +188 -0
- valuesets/enums/energy/nuclear/nuclear_safety.py +164 -0
- valuesets/enums/energy/nuclear/nuclear_waste.py +158 -0
- valuesets/enums/energy/nuclear/reactor_types.py +163 -0
- valuesets/enums/environmental_health/__init__.py +1 -0
- valuesets/enums/environmental_health/exposures.py +265 -0
- valuesets/enums/geography/__init__.py +1 -0
- valuesets/enums/geography/geographic_codes.py +741 -0
- valuesets/enums/health/__init__.py +12 -0
- valuesets/enums/health/vaccination.py +98 -0
- valuesets/enums/health.py +36 -0
- valuesets/enums/health_base.py +36 -0
- valuesets/enums/healthcare.py +45 -0
- valuesets/enums/industry/__init__.py +1 -0
- valuesets/enums/industry/extractive_industry.py +94 -0
- valuesets/enums/industry/mining.py +388 -0
- valuesets/enums/industry/safety_colors.py +201 -0
- valuesets/enums/investigation.py +27 -0
- valuesets/enums/materials_science/__init__.py +1 -0
- valuesets/enums/materials_science/characterization_methods.py +112 -0
- valuesets/enums/materials_science/crystal_structures.py +76 -0
- valuesets/enums/materials_science/material_properties.py +119 -0
- valuesets/enums/materials_science/material_types.py +104 -0
- valuesets/enums/materials_science/pigments_dyes.py +198 -0
- valuesets/enums/materials_science/synthesis_methods.py +109 -0
- valuesets/enums/medical/__init__.py +1 -0
- valuesets/enums/medical/clinical.py +277 -0
- valuesets/enums/medical/neuroimaging.py +119 -0
- valuesets/enums/mining_processing.py +302 -0
- valuesets/enums/physics/__init__.py +1 -0
- valuesets/enums/physics/states_of_matter.py +46 -0
- valuesets/enums/social/__init__.py +1 -0
- valuesets/enums/social/person_status.py +29 -0
- valuesets/enums/spatial/__init__.py +1 -0
- valuesets/enums/spatial/spatial_qualifiers.py +246 -0
- valuesets/enums/statistics/__init__.py +5 -0
- valuesets/enums/statistics/prediction_outcomes.py +31 -0
- valuesets/enums/statistics.py +31 -0
- valuesets/enums/time/__init__.py +1 -0
- valuesets/enums/time/temporal.py +254 -0
- valuesets/enums/units/__init__.py +1 -0
- valuesets/enums/units/measurements.py +310 -0
- valuesets/enums/visual/__init__.py +1 -0
- valuesets/enums/visual/colors.py +376 -0
- valuesets/generators/__init__.py +19 -0
- valuesets/generators/auto_slot_injector.py +280 -0
- valuesets/generators/enhanced_pydantic_generator.py +100 -0
- valuesets/generators/enum_slot_generator.py +201 -0
- valuesets/generators/modular_rich_generator.py +353 -0
- valuesets/generators/prefix_standardizer.py +198 -0
- valuesets/generators/rich_enum.py +127 -0
- valuesets/generators/rich_pydantic_generator.py +310 -0
- valuesets/generators/smart_slot_syncer.py +428 -0
- valuesets/generators/sssom_generator.py +394 -0
- valuesets/merged/merged_hierarchy.yaml +21649 -0
- valuesets/schema/README.md +3 -0
- valuesets/schema/academic/research.yaml +911 -0
- valuesets/schema/analytical_chemistry/mass_spectrometry.yaml +206 -0
- valuesets/schema/bio/bio_entities.yaml +364 -0
- valuesets/schema/bio/biological_colors.yaml +434 -0
- valuesets/schema/bio/cell_cycle.yaml +309 -0
- valuesets/schema/bio/currency_chemicals.yaml +70 -0
- valuesets/schema/bio/developmental_stages.yaml +226 -0
- valuesets/schema/bio/genome_features.yaml +342 -0
- valuesets/schema/bio/genomics.yaml +101 -0
- valuesets/schema/bio/go_aspect.yaml +39 -0
- valuesets/schema/bio/go_causality.yaml +119 -0
- valuesets/schema/bio/go_evidence.yaml +215 -0
- valuesets/schema/bio/insdc_geographic_locations.yaml +911 -0
- valuesets/schema/bio/insdc_missing_values.yaml +85 -0
- valuesets/schema/bio/lipid_categories.yaml +72 -0
- valuesets/schema/bio/plant_biology.yaml +125 -0
- valuesets/schema/bio/plant_developmental_stages.yaml +77 -0
- valuesets/schema/bio/plant_sex.yaml +108 -0
- valuesets/schema/bio/protein_evidence.yaml +63 -0
- valuesets/schema/bio/proteomics_standards.yaml +116 -0
- valuesets/schema/bio/psi_mi.yaml +400 -0
- valuesets/schema/bio/relationship_to_oxygen.yaml +46 -0
- valuesets/schema/bio/sequence_alphabets.yaml +1168 -0
- valuesets/schema/bio/sequence_chemistry.yaml +477 -0
- valuesets/schema/bio/sequencing_platforms.yaml +515 -0
- valuesets/schema/bio/structural_biology.yaml +428 -0
- valuesets/schema/bio/taxonomy.yaml +453 -0
- valuesets/schema/bio/trophic_levels.yaml +118 -0
- valuesets/schema/bio/uniprot_species.yaml +1209 -0
- valuesets/schema/bio/viral_genome_types.yaml +99 -0
- valuesets/schema/bioprocessing/scale_up.yaml +458 -0
- valuesets/schema/business/human_resources.yaml +752 -0
- valuesets/schema/business/industry_classifications.yaml +448 -0
- valuesets/schema/business/management_operations.yaml +602 -0
- valuesets/schema/business/organizational_structures.yaml +645 -0
- valuesets/schema/business/quality_management.yaml +502 -0
- valuesets/schema/business/supply_chain.yaml +688 -0
- valuesets/schema/chemistry/chemical_entities.yaml +639 -0
- valuesets/schema/chemistry/reaction_directionality.yaml +60 -0
- valuesets/schema/chemistry/reactions.yaml +442 -0
- valuesets/schema/clinical/nih_demographics.yaml +285 -0
- valuesets/schema/clinical/phenopackets.yaml +429 -0
- valuesets/schema/computing/file_formats.yaml +631 -0
- valuesets/schema/computing/maturity_levels.yaml +229 -0
- valuesets/schema/computing/mime_types.yaml +266 -0
- valuesets/schema/confidence_levels.yaml +206 -0
- valuesets/schema/contributor.yaml +30 -0
- valuesets/schema/core.yaml +55 -0
- valuesets/schema/data/data_absent_reason.yaml +82 -0
- valuesets/schema/data_science/binary_classification.yaml +125 -0
- valuesets/schema/data_science/emotion_classification.yaml +109 -0
- valuesets/schema/data_science/priority_severity.yaml +122 -0
- valuesets/schema/data_science/quality_control.yaml +68 -0
- valuesets/schema/data_science/sentiment_analysis.yaml +81 -0
- valuesets/schema/data_science/text_classification.yaml +135 -0
- valuesets/schema/demographics.yaml +238 -0
- valuesets/schema/ecological_interactions.yaml +298 -0
- valuesets/schema/energy/energy.yaml +595 -0
- valuesets/schema/energy/fossil_fuels.yaml +28 -0
- valuesets/schema/energy/nuclear/nuclear_facilities.yaml +463 -0
- valuesets/schema/energy/nuclear/nuclear_fuel_cycle.yaml +82 -0
- valuesets/schema/energy/nuclear/nuclear_fuels.yaml +421 -0
- valuesets/schema/energy/nuclear/nuclear_operations.yaml +480 -0
- valuesets/schema/energy/nuclear/nuclear_regulatory.yaml +200 -0
- valuesets/schema/energy/nuclear/nuclear_safety.yaml +352 -0
- valuesets/schema/energy/nuclear/nuclear_waste.yaml +332 -0
- valuesets/schema/energy/nuclear/reactor_types.yaml +394 -0
- valuesets/schema/environmental_health/exposures.yaml +355 -0
- valuesets/schema/generated_slots.yaml +1828 -0
- valuesets/schema/geography/geographic_codes.yaml +1018 -0
- valuesets/schema/health/vaccination.yaml +102 -0
- valuesets/schema/health.yaml +38 -0
- valuesets/schema/healthcare.yaml +53 -0
- valuesets/schema/industry/extractive_industry.yaml +89 -0
- valuesets/schema/industry/mining.yaml +888 -0
- valuesets/schema/industry/safety_colors.yaml +375 -0
- valuesets/schema/investigation.yaml +64 -0
- valuesets/schema/materials_science/characterization_methods.yaml +193 -0
- valuesets/schema/materials_science/crystal_structures.yaml +138 -0
- valuesets/schema/materials_science/material_properties.yaml +135 -0
- valuesets/schema/materials_science/material_types.yaml +151 -0
- valuesets/schema/materials_science/pigments_dyes.yaml +465 -0
- valuesets/schema/materials_science/synthesis_methods.yaml +186 -0
- valuesets/schema/medical/clinical.yaml +610 -0
- valuesets/schema/medical/neuroimaging.yaml +325 -0
- valuesets/schema/mining_processing.yaml +295 -0
- valuesets/schema/physics/states_of_matter.yaml +46 -0
- valuesets/schema/slot_mixins.yaml +143 -0
- valuesets/schema/social/person_status.yaml +28 -0
- valuesets/schema/spatial/spatial_qualifiers.yaml +466 -0
- valuesets/schema/statistics/prediction_outcomes.yaml +26 -0
- valuesets/schema/statistics.yaml +34 -0
- valuesets/schema/time/temporal.yaml +435 -0
- valuesets/schema/types.yaml +15 -0
- valuesets/schema/units/measurements.yaml +675 -0
- valuesets/schema/valuesets.yaml +100 -0
- valuesets/schema/visual/colors.yaml +778 -0
- valuesets/utils/__init__.py +6 -0
- valuesets/utils/comparison.py +102 -0
- valuesets/utils/expand_dynamic_enums.py +414 -0
- valuesets/utils/mapping_utils.py +236 -0
- valuesets/validators/__init__.py +11 -0
- valuesets/validators/enum_evaluator.py +669 -0
- valuesets/validators/oak_config.yaml +70 -0
- valuesets/validators/validate_with_ols.py +241 -0
- valuesets-0.3.1.dist-info/METADATA +395 -0
- valuesets-0.3.1.dist-info/RECORD +248 -0
- valuesets-0.3.1.dist-info/WHEEL +4 -0
- valuesets-0.3.1.dist-info/licenses/LICENSE +201 -0
|
@@ -0,0 +1,236 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Shared utilities for extracting and processing mappings from LinkML schemas.
|
|
3
|
+
|
|
4
|
+
This module provides reusable functions for extracting all types of mappings
|
|
5
|
+
from PermissibleValue objects, which can be used by both the SSSOM generator
|
|
6
|
+
and the enum validator.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from typing import List, Dict, Tuple, Optional, Any, Union
|
|
10
|
+
from linkml_runtime.linkml_model import PermissibleValue
|
|
11
|
+
import logging
|
|
12
|
+
|
|
13
|
+
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
# SKOS predicate used for each kind of mapping found on a permissible value.
# Keys are the field names the mappings are read from; 'meaning' is reported
# with the same predicate as 'exact_mappings'.
MAPPING_PREDICATES = {
    'meaning': 'skos:exactMatch',
    'exact_mappings': 'skos:exactMatch',
    'close_mappings': 'skos:closeMatch',
    'narrow_mappings': 'skos:narrowMatch',
    'broad_mappings': 'skos:broadMatch',
    'related_mappings': 'skos:relatedMatch',
}
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _clean_mapping_values(value: Any) -> List[str]:
    """Return the non-empty string entries of a mapping field value.

    A bare string is treated as one mapping rather than being iterated
    character by character; falsy input yields an empty list.
    """
    if not value:
        return []
    if isinstance(value, str):
        return [value]
    return [item for item in value if item and isinstance(item, str)]


def extract_all_mappings(
    pv: PermissibleValue,
    include_meaning: bool = True,
    include_annotations: bool = True
) -> List[Tuple[str, str, Optional[str]]]:
    """
    Extract all mappings from a PermissibleValue.

    Mappings are emitted in a fixed order: ``meaning``, ``exact_mappings``,
    ``close_mappings``, ``narrow_mappings``, ``broad_mappings``, then
    ``related_mappings`` found in annotations, then the ``related_mappings``
    attribute itself. Duplicates are not removed here.

    Args:
        pv: The PermissibleValue object to extract mappings from. Only
            attribute access is used, and missing attributes are skipped.
        include_meaning: Whether to include the 'meaning' field
        include_annotations: Whether to check annotations for related_mappings

    Returns:
        List of tuples: (object_id, predicate, comment). The comment is
        "From annotations" for annotation-sourced mappings and None otherwise.

    Example:
        >>> pv = PermissibleValue(
        ...     meaning="NCIT:C12345",
        ...     exact_mappings=["IAO:0000013", "FABIO:Article"],
        ...     close_mappings=["MESH:D012345"]
        ... )
        >>> mappings = extract_all_mappings(pv)
        >>> # Returns: [
        >>> #   ("NCIT:C12345", "skos:exactMatch", None),
        >>> #   ("IAO:0000013", "skos:exactMatch", None),
        >>> #   ("FABIO:Article", "skos:exactMatch", None),
        >>> #   ("MESH:D012345", "skos:closeMatch", None)
        >>> # ]
    """
    mappings: List[Tuple[str, str, Optional[str]]] = []

    # The single-valued 'meaning' field comes first.
    if include_meaning:
        meaning = getattr(pv, 'meaning', None)
        if meaning:
            mappings.append((meaning, MAPPING_PREDICATES['meaning'], None))

    # The four list-valued mapping fields share one extraction rule.
    for field in ('exact_mappings', 'close_mappings',
                  'narrow_mappings', 'broad_mappings'):
        predicate = MAPPING_PREDICATES[field]
        mappings.extend(
            (mapping, predicate, None)
            for mapping in _clean_mapping_values(getattr(pv, field, None))
        )

    related_predicate = MAPPING_PREDICATES['related_mappings']

    # related_mappings may also be recorded as an annotation.
    if include_annotations:
        annotations = getattr(pv, 'annotations', None)
        if annotations:
            related = annotations.get('related_mappings')
            # Unwrap annotation objects that carry their payload in `.value`;
            # plain lists and strings have no such attribute and pass through.
            # NOTE(review): presumably this matches linkml_runtime's
            # Annotation class - confirm against the installed version.
            related = getattr(related, 'value', related)
            # Only a list or a single string is accepted from annotations.
            if isinstance(related, (list, str)):
                mappings.extend(
                    (mapping, related_predicate, "From annotations")
                    for mapping in _clean_mapping_values(related)
                )

    # Also check for the related_mappings field directly (if it exists).
    mappings.extend(
        (mapping, related_predicate, None)
        for mapping in _clean_mapping_values(
            getattr(pv, 'related_mappings', None)
        )
    )

    return mappings
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def get_mapping_statistics(pv: PermissibleValue) -> Dict[str, int]:
    """
    Get statistics about mappings in a PermissibleValue.

    Only attributes of ``pv`` are inspected; related mappings stored under
    annotations are not counted here. List entries are counted as-is,
    without filtering out empty or non-string items.

    Args:
        pv: The PermissibleValue object

    Returns:
        Dictionary with counts for each mapping type: 'meaning' (0 or 1),
        one entry per ``*_mappings`` field, and 'total' as their sum.
    """
    list_fields = (
        'exact_mappings',
        'close_mappings',
        'narrow_mappings',
        'broad_mappings',
        'related_mappings',
    )

    stats = {
        'meaning': 0,
        'exact_mappings': 0,
        'close_mappings': 0,
        'narrow_mappings': 0,
        'broad_mappings': 0,
        'related_mappings': 0,
        'total': 0,
    }

    if getattr(pv, 'meaning', None):
        stats['meaning'] = 1

    for field in list_fields:
        value = getattr(pv, field, None)
        if not value:
            continue
        # A list contributes its length; any other truthy value counts once.
        stats[field] = len(value) if isinstance(value, list) else 1

    # 'total' is excluded from its own sum.
    stats['total'] = sum(
        count for name, count in stats.items() if name != 'total'
    )
    return stats
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def validate_curie_format(curie: str) -> bool:
    """
    Validate that a string is in CURIE format (prefix:local_id).

    The check is purely syntactic: exactly one colon, a non-empty prefix made
    of alphanumerics and underscores (with at least one alphanumeric), and a
    non-empty local id of any characters. Absolute URIs such as
    ``http://example.org/x`` also satisfy these rules.

    Args:
        curie: String to validate

    Returns:
        True if valid CURIE format, False otherwise
    """
    # Non-strings and empty strings are never CURIEs.
    if not isinstance(curie, str) or not curie:
        return False

    # Exactly one separator is required.
    if curie.count(':') != 1:
        return False

    prefix, _, local_id = curie.partition(':')
    if not prefix or not local_id:
        return False

    # Basic validation - prefix should be alphanumeric (allowing underscores);
    # local_id can contain more characters.
    return prefix.replace('_', '').isalnum()
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def extract_ontology_prefix(curie: str) -> Optional[str]:
    """
    Extract the ontology prefix from a CURIE.

    Args:
        curie: CURIE string (e.g., "NCIT:C12345")

    Returns:
        The prefix part (the text before the colon), or None if the input
        does not pass validate_curie_format
    """
    if validate_curie_format(curie):
        # Validation guarantees exactly one colon, so this is the full prefix.
        return curie.partition(':')[0]
    return None
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
def group_mappings_by_ontology(
    mappings: List[Tuple[str, str, Optional[str]]]
) -> Dict[str, List[Tuple[str, str, Optional[str]]]]:
    """
    Group mappings by their ontology prefix.

    Mappings whose object id is not a valid CURIE are collected under the
    key 'OTHER'. A CURIE whose prefix is literally ``OTHER`` lands in that
    same bucket. Input order is preserved within each group.

    Args:
        mappings: List of mapping tuples from extract_all_mappings

    Returns:
        Dictionary with ontology prefixes as keys and lists of mappings as values
    """
    grouped: Dict[str, List[Tuple[str, str, Optional[str]]]] = {}

    for object_id, predicate, comment in mappings:
        # Non-CURIE mappings have no prefix and fall back to 'OTHER'.
        bucket = extract_ontology_prefix(object_id) or 'OTHER'
        grouped.setdefault(bucket, []).append((object_id, predicate, comment))

    return grouped
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def deduplicate_mappings(
    mappings: List[Tuple[str, str, Optional[str]]]
) -> List[Tuple[str, str, Optional[str]]]:
    """
    Remove duplicate mappings, keeping the first occurrence.

    Two mappings are duplicates when they share both object id and predicate;
    the comment is ignored for comparison, so the first occurrence's comment
    is the one retained.

    Args:
        mappings: List of mapping tuples

    Returns:
        Deduplicated list of mappings, in their original order
    """
    seen = set()
    unique: List[Tuple[str, str, Optional[str]]] = []

    for object_id, predicate, comment in mappings:
        # Use object_id and predicate as the key for deduplication.
        key = (object_id, predicate)
        if key in seen:
            continue
        seen.add(key)
        unique.append((object_id, predicate, comment))

    return unique
|