esgvoc 2.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- esgvoc/__init__.py +3 -0
- esgvoc/api/__init__.py +91 -0
- esgvoc/api/data_descriptors/EMD_models/__init__.py +66 -0
- esgvoc/api/data_descriptors/EMD_models/arrangement.py +21 -0
- esgvoc/api/data_descriptors/EMD_models/calendar.py +5 -0
- esgvoc/api/data_descriptors/EMD_models/cell_variable_type.py +20 -0
- esgvoc/api/data_descriptors/EMD_models/component_type.py +5 -0
- esgvoc/api/data_descriptors/EMD_models/coordinate.py +52 -0
- esgvoc/api/data_descriptors/EMD_models/grid_mapping.py +19 -0
- esgvoc/api/data_descriptors/EMD_models/grid_region.py +19 -0
- esgvoc/api/data_descriptors/EMD_models/grid_type.py +19 -0
- esgvoc/api/data_descriptors/EMD_models/horizontal_computational_grid.py +56 -0
- esgvoc/api/data_descriptors/EMD_models/horizontal_grid_cells.py +230 -0
- esgvoc/api/data_descriptors/EMD_models/horizontal_subgrid.py +41 -0
- esgvoc/api/data_descriptors/EMD_models/horizontal_units.py +5 -0
- esgvoc/api/data_descriptors/EMD_models/model.py +139 -0
- esgvoc/api/data_descriptors/EMD_models/model_component.py +115 -0
- esgvoc/api/data_descriptors/EMD_models/reference.py +61 -0
- esgvoc/api/data_descriptors/EMD_models/resolution.py +48 -0
- esgvoc/api/data_descriptors/EMD_models/temporal_refinement.py +19 -0
- esgvoc/api/data_descriptors/EMD_models/truncation_method.py +17 -0
- esgvoc/api/data_descriptors/EMD_models/vertical_computational_grid.py +91 -0
- esgvoc/api/data_descriptors/EMD_models/vertical_coordinate.py +5 -0
- esgvoc/api/data_descriptors/EMD_models/vertical_units.py +19 -0
- esgvoc/api/data_descriptors/__init__.py +159 -0
- esgvoc/api/data_descriptors/activity.py +72 -0
- esgvoc/api/data_descriptors/archive.py +5 -0
- esgvoc/api/data_descriptors/area_label.py +30 -0
- esgvoc/api/data_descriptors/branded_suffix.py +30 -0
- esgvoc/api/data_descriptors/branded_variable.py +21 -0
- esgvoc/api/data_descriptors/citation_url.py +5 -0
- esgvoc/api/data_descriptors/contact.py +5 -0
- esgvoc/api/data_descriptors/conventions.py +28 -0
- esgvoc/api/data_descriptors/creation_date.py +18 -0
- esgvoc/api/data_descriptors/data_descriptor.py +127 -0
- esgvoc/api/data_descriptors/data_specs_version.py +25 -0
- esgvoc/api/data_descriptors/date.py +5 -0
- esgvoc/api/data_descriptors/directory_date.py +22 -0
- esgvoc/api/data_descriptors/drs_specs.py +38 -0
- esgvoc/api/data_descriptors/experiment.py +215 -0
- esgvoc/api/data_descriptors/forcing_index.py +21 -0
- esgvoc/api/data_descriptors/frequency.py +48 -0
- esgvoc/api/data_descriptors/further_info_url.py +5 -0
- esgvoc/api/data_descriptors/grid.py +43 -0
- esgvoc/api/data_descriptors/horizontal_label.py +20 -0
- esgvoc/api/data_descriptors/initialization_index.py +27 -0
- esgvoc/api/data_descriptors/institution.py +80 -0
- esgvoc/api/data_descriptors/known_branded_variable.py +75 -0
- esgvoc/api/data_descriptors/license.py +31 -0
- esgvoc/api/data_descriptors/member_id.py +9 -0
- esgvoc/api/data_descriptors/mip_era.py +26 -0
- esgvoc/api/data_descriptors/model_component.py +32 -0
- esgvoc/api/data_descriptors/models_test/models.py +17 -0
- esgvoc/api/data_descriptors/nominal_resolution.py +50 -0
- esgvoc/api/data_descriptors/obs_type.py +5 -0
- esgvoc/api/data_descriptors/organisation.py +22 -0
- esgvoc/api/data_descriptors/physics_index.py +21 -0
- esgvoc/api/data_descriptors/product.py +16 -0
- esgvoc/api/data_descriptors/publication_status.py +5 -0
- esgvoc/api/data_descriptors/realization_index.py +24 -0
- esgvoc/api/data_descriptors/realm.py +16 -0
- esgvoc/api/data_descriptors/regex.py +5 -0
- esgvoc/api/data_descriptors/region.py +35 -0
- esgvoc/api/data_descriptors/resolution.py +7 -0
- esgvoc/api/data_descriptors/source.py +120 -0
- esgvoc/api/data_descriptors/source_type.py +5 -0
- esgvoc/api/data_descriptors/sub_experiment.py +5 -0
- esgvoc/api/data_descriptors/table.py +28 -0
- esgvoc/api/data_descriptors/temporal_label.py +20 -0
- esgvoc/api/data_descriptors/time_range.py +17 -0
- esgvoc/api/data_descriptors/title.py +5 -0
- esgvoc/api/data_descriptors/tracking_id.py +67 -0
- esgvoc/api/data_descriptors/variable.py +56 -0
- esgvoc/api/data_descriptors/variant_label.py +25 -0
- esgvoc/api/data_descriptors/vertical_label.py +20 -0
- esgvoc/api/project_specs.py +143 -0
- esgvoc/api/projects.py +1253 -0
- esgvoc/api/py.typed +0 -0
- esgvoc/api/pydantic_handler.py +146 -0
- esgvoc/api/report.py +127 -0
- esgvoc/api/search.py +171 -0
- esgvoc/api/universe.py +434 -0
- esgvoc/apps/__init__.py +6 -0
- esgvoc/apps/cmor_tables/__init__.py +7 -0
- esgvoc/apps/cmor_tables/cvs_table.py +948 -0
- esgvoc/apps/drs/__init__.py +0 -0
- esgvoc/apps/drs/constants.py +2 -0
- esgvoc/apps/drs/generator.py +429 -0
- esgvoc/apps/drs/report.py +540 -0
- esgvoc/apps/drs/validator.py +312 -0
- esgvoc/apps/ga/__init__.py +104 -0
- esgvoc/apps/ga/example_usage.py +315 -0
- esgvoc/apps/ga/models/__init__.py +47 -0
- esgvoc/apps/ga/models/netcdf_header.py +306 -0
- esgvoc/apps/ga/models/validator.py +491 -0
- esgvoc/apps/ga/test_ga.py +161 -0
- esgvoc/apps/ga/validator.py +277 -0
- esgvoc/apps/jsg/json_schema_generator.py +341 -0
- esgvoc/apps/jsg/templates/template.jinja +241 -0
- esgvoc/apps/test_cv/README.md +214 -0
- esgvoc/apps/test_cv/__init__.py +0 -0
- esgvoc/apps/test_cv/cv_tester.py +1611 -0
- esgvoc/apps/test_cv/example_usage.py +216 -0
- esgvoc/apps/vr/__init__.py +12 -0
- esgvoc/apps/vr/build_variable_registry.py +71 -0
- esgvoc/apps/vr/example_usage.py +60 -0
- esgvoc/apps/vr/vr_app.py +333 -0
- esgvoc/cli/clean.py +304 -0
- esgvoc/cli/cmor.py +46 -0
- esgvoc/cli/config.py +1300 -0
- esgvoc/cli/drs.py +267 -0
- esgvoc/cli/find.py +138 -0
- esgvoc/cli/get.py +155 -0
- esgvoc/cli/install.py +41 -0
- esgvoc/cli/main.py +60 -0
- esgvoc/cli/offline.py +269 -0
- esgvoc/cli/status.py +79 -0
- esgvoc/cli/test_cv.py +258 -0
- esgvoc/cli/valid.py +147 -0
- esgvoc/core/constants.py +17 -0
- esgvoc/core/convert.py +0 -0
- esgvoc/core/data_handler.py +206 -0
- esgvoc/core/db/__init__.py +3 -0
- esgvoc/core/db/connection.py +40 -0
- esgvoc/core/db/models/mixins.py +25 -0
- esgvoc/core/db/models/project.py +102 -0
- esgvoc/core/db/models/universe.py +98 -0
- esgvoc/core/db/project_ingestion.py +231 -0
- esgvoc/core/db/universe_ingestion.py +172 -0
- esgvoc/core/exceptions.py +33 -0
- esgvoc/core/logging_handler.py +26 -0
- esgvoc/core/repo_fetcher.py +345 -0
- esgvoc/core/service/__init__.py +41 -0
- esgvoc/core/service/configuration/config_manager.py +196 -0
- esgvoc/core/service/configuration/setting.py +363 -0
- esgvoc/core/service/data_merger.py +634 -0
- esgvoc/core/service/esg_voc.py +77 -0
- esgvoc/core/service/resolver_config.py +56 -0
- esgvoc/core/service/state.py +324 -0
- esgvoc/core/service/string_heuristics.py +98 -0
- esgvoc/core/service/term_cache.py +108 -0
- esgvoc/core/service/uri_resolver.py +133 -0
- esgvoc-2.0.2.dist-info/METADATA +82 -0
- esgvoc-2.0.2.dist-info/RECORD +147 -0
- esgvoc-2.0.2.dist-info/WHEEL +4 -0
- esgvoc-2.0.2.dist-info/entry_points.txt +2 -0
- esgvoc-2.0.2.dist-info/licenses/LICENSE.txt +519 -0
|
@@ -0,0 +1,634 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from typing import Dict, List, Set
|
|
3
|
+
|
|
4
|
+
from esgvoc.core.data_handler import JsonLdResource
|
|
5
|
+
from esgvoc.core.service.resolver_config import ResolverConfig
|
|
6
|
+
from esgvoc.core.service.string_heuristics import StringHeuristics
|
|
7
|
+
from esgvoc.core.service.term_cache import TermCache
|
|
8
|
+
from esgvoc.core.service.uri_resolver import URIResolver
|
|
9
|
+
|
|
10
|
+
logger = logging.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def merge_dicts(base: list, override: list) -> dict:
    """
    Merge two JSON-LD dictionaries, with override taking precedence.

    This performs a shallow merge where:
    1. Start with base data (parent/universe defaults)
    2. Overlay override data (custom/project-specific), which wins on conflicts
    3. Skip the @id field from both (handled separately in JSON-LD)

    Args:
        base: List containing the base/parent dictionary (expects [dict]).
            An empty list is treated as an empty dictionary.
        override: List containing the override/child dictionary (expects [dict]).
            An empty list is treated as an empty dictionary.

    Returns:
        A new dictionary; neither input dictionary is modified. Keys keep the
        order of ``base`` first, followed by keys only present in ``override``.

    Example:
        >>> base = [{"name": "Base", "value": 1}]
        >>> override = [{"value": 2, "new": "field"}]
        >>> merge_dicts(base, override)
        {'name': 'Base', 'value': 2, 'new': 'field'}

    Note:
        Takes lists for backward compatibility with JSON-LD processing,
        but only uses the first element of each.
    """
    base_data = base[0] if base else {}
    override_data = override[0] if override else {}

    # Base first (fills in defaults), then override (takes precedence).
    merged = {key: value for key, value in base_data.items() if key != "@id"}
    merged.update((key, value) for key, value in override_data.items() if key != "@id")
    return merged
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def merge(uri: str) -> Dict:
    """
    Load the JSON-LD resource at ``uri`` and return its final merged form.

    A DataMerger with default settings follows the resource's @id chain;
    the last element of the progressive merge list is the fully merged result.

    Args:
        uri: Location of the JSON-LD document to load.

    Returns:
        The last dictionary produced by ``DataMerger.merge_linked_json()``.
    """
    resource = JsonLdResource(uri=uri)
    merge_steps = DataMerger(data=resource).merge_linked_json()
    return merge_steps[-1]
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def resolve_nested_ids_in_dict(data: dict, merger: "DataMerger") -> dict:
    """
    Functional shortcut for ``merger.resolve_nested_ids(data)``.

    Args:
        data: The dictionary containing potential @id references
        merger: The DataMerger instance that performs the resolution

    Returns:
        Whatever ``merger.resolve_nested_ids`` returns for ``data``
    """
    resolved = merger.resolve_nested_ids(data)
    return resolved
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class DataMerger:
|
|
70
|
+
"""
|
|
71
|
+
Merge and resolve JSON-LD data with support for @id references.
|
|
72
|
+
|
|
73
|
+
This class handles:
|
|
74
|
+
1. Merging linked JSON-LD documents (following @id chains)
|
|
75
|
+
2. Resolving nested @id references to full objects
|
|
76
|
+
3. Converting between remote URIs and local paths
|
|
77
|
+
"""
|
|
78
|
+
|
|
79
|
+
def __init__(
|
|
80
|
+
self,
|
|
81
|
+
data: JsonLdResource,
|
|
82
|
+
allowed_base_uris: Set[str] | None = None,
|
|
83
|
+
locally_available: Dict[str, str] | None = None,
|
|
84
|
+
config: ResolverConfig | None = None,
|
|
85
|
+
):
|
|
86
|
+
"""
|
|
87
|
+
Initialize the DataMerger.
|
|
88
|
+
|
|
89
|
+
Args:
|
|
90
|
+
data: The JSON-LD resource to process
|
|
91
|
+
allowed_base_uris: Set of base URIs that are allowed to be resolved.
|
|
92
|
+
Defaults to {"https://espri-mod.github.io/mip-cmor-tables"}
|
|
93
|
+
for backward compatibility.
|
|
94
|
+
locally_available: Mapping from remote base URIs to local directory paths.
|
|
95
|
+
Defaults to empty dict for backward compatibility.
|
|
96
|
+
config: Configuration for resolution behavior. If None, uses defaults.
|
|
97
|
+
"""
|
|
98
|
+
self.data = data
|
|
99
|
+
|
|
100
|
+
# Backward compatibility: use default CMIP URI if none provided
|
|
101
|
+
if allowed_base_uris is None:
|
|
102
|
+
allowed_base_uris = {"https://espri-mod.github.io/mip-cmor-tables"}
|
|
103
|
+
self.allowed_base_uris = allowed_base_uris
|
|
104
|
+
|
|
105
|
+
# Fix mutable default anti-pattern
|
|
106
|
+
if locally_available is None:
|
|
107
|
+
locally_available = {}
|
|
108
|
+
self.locally_available = locally_available
|
|
109
|
+
|
|
110
|
+
# Initialize configuration and helpers
|
|
111
|
+
self.config = config or ResolverConfig()
|
|
112
|
+
self.uri_resolver = URIResolver(self.locally_available)
|
|
113
|
+
self.string_heuristics = StringHeuristics(
|
|
114
|
+
max_length=self.config.max_string_length, exclude_patterns=self.config.exclude_patterns
|
|
115
|
+
)
|
|
116
|
+
self.term_cache = TermCache(max_size=self.config.cache_size, enabled=self.config.enable_caching)
|
|
117
|
+
|
|
118
|
+
def _should_resolve(self, uri: str) -> bool:
|
|
119
|
+
"""Check if a given URI should be resolved based on allowed URIs."""
|
|
120
|
+
return any(uri.startswith(base) for base in self.allowed_base_uris)
|
|
121
|
+
|
|
122
|
+
def _get_resolve_mode(self, key: str) -> str:
|
|
123
|
+
"""
|
|
124
|
+
Get the resolve mode for a field from the context.
|
|
125
|
+
|
|
126
|
+
Args:
|
|
127
|
+
key: The field name to check
|
|
128
|
+
|
|
129
|
+
Returns:
|
|
130
|
+
"full" (default), "shallow", or "reference"
|
|
131
|
+
"""
|
|
132
|
+
if not hasattr(self.data, "context"):
|
|
133
|
+
return "full"
|
|
134
|
+
|
|
135
|
+
context = self.data.context
|
|
136
|
+
|
|
137
|
+
# Check for esgvoc_resolve_modes at the root level (outside @context)
|
|
138
|
+
if isinstance(context, dict) and "esgvoc_resolve_modes" in context:
|
|
139
|
+
resolve_modes = context["esgvoc_resolve_modes"]
|
|
140
|
+
if isinstance(resolve_modes, dict) and key in resolve_modes:
|
|
141
|
+
return resolve_modes[key]
|
|
142
|
+
|
|
143
|
+
return "full" # Default: full resolution
|
|
144
|
+
|
|
145
|
+
def _get_next_id(self, data: dict, current_uri: str = None) -> str | None:
|
|
146
|
+
"""
|
|
147
|
+
Extract the next @id from the data if it is a valid customization reference.
|
|
148
|
+
|
|
149
|
+
Args:
|
|
150
|
+
data: The expanded JSON-LD data
|
|
151
|
+
current_uri: The URI of the current resource (to avoid self-reference)
|
|
152
|
+
|
|
153
|
+
Returns:
|
|
154
|
+
The next URI to fetch and merge, or None if no valid reference exists
|
|
155
|
+
"""
|
|
156
|
+
if isinstance(data, list):
|
|
157
|
+
data = data[0]
|
|
158
|
+
if "@id" in data and self._should_resolve(data["@id"]):
|
|
159
|
+
result = self.uri_resolver.ensure_json_extension(data["@id"])
|
|
160
|
+
|
|
161
|
+
# Don't follow the reference if it points to the same resource
|
|
162
|
+
if current_uri and result == current_uri:
|
|
163
|
+
return None
|
|
164
|
+
|
|
165
|
+
return result
|
|
166
|
+
return None
|
|
167
|
+
|
|
168
|
+
def merge_linked_json(self) -> List[Dict]:
|
|
169
|
+
"""Fetch and merge data recursively, returning a list of progressively merged Data json instances."""
|
|
170
|
+
# Start with the original json object
|
|
171
|
+
result_list = [self.data.json_dict]
|
|
172
|
+
visited = set() # Track visited URIs (remote URIs) to prevent cycles
|
|
173
|
+
current_expanded = self.data.expanded[0]
|
|
174
|
+
current_json = self.data.json_dict
|
|
175
|
+
current_remote_uri = None # Track the remote URI of the current resource
|
|
176
|
+
|
|
177
|
+
while True:
|
|
178
|
+
# Get the next @id to follow, passing the current remote URI to avoid self-reference
|
|
179
|
+
next_id = self._get_next_id(current_expanded, current_remote_uri)
|
|
180
|
+
if not next_id or next_id in visited or not self._should_resolve(next_id):
|
|
181
|
+
break
|
|
182
|
+
visited.add(next_id)
|
|
183
|
+
current_remote_uri = next_id # Save for next iteration
|
|
184
|
+
|
|
185
|
+
# Fetch and merge the next customization
|
|
186
|
+
# Convert remote URI to local path if available
|
|
187
|
+
next_id_local = self.uri_resolver.to_local_path(next_id)
|
|
188
|
+
|
|
189
|
+
next_data_instance = JsonLdResource(uri=next_id_local)
|
|
190
|
+
merged_json_data = merge_dicts([next_data_instance.json_dict], [current_json])
|
|
191
|
+
|
|
192
|
+
# Add the merged instance to the result list
|
|
193
|
+
result_list.append(merged_json_data)
|
|
194
|
+
|
|
195
|
+
# For the next iteration, use the expanded data from the newly loaded resource
|
|
196
|
+
# (NOT from the merged data, as merge is about overlaying, not chaining references)
|
|
197
|
+
current_expanded = next_data_instance.expanded[0]
|
|
198
|
+
current_json = merged_json_data
|
|
199
|
+
return result_list
|
|
200
|
+
|
|
201
|
+
def resolve_nested_ids(
|
|
202
|
+
self,
|
|
203
|
+
data,
|
|
204
|
+
expanded_data=None,
|
|
205
|
+
visited: Set[str] = None,
|
|
206
|
+
_is_root_call: bool = True,
|
|
207
|
+
resolve_mode: str = "full",
|
|
208
|
+
_current_property: str | None = None,
|
|
209
|
+
) -> dict | list:
|
|
210
|
+
"""
|
|
211
|
+
Recursively resolve all @id references in nested structures.
|
|
212
|
+
|
|
213
|
+
Uses the expanded JSON-LD to find full URIs, fetches referenced terms,
|
|
214
|
+
and replaces references with full objects.
|
|
215
|
+
|
|
216
|
+
Args:
|
|
217
|
+
data: The compact JSON data to process (dict, list, or primitive)
|
|
218
|
+
expanded_data: The expanded JSON-LD version (with full URIs)
|
|
219
|
+
visited: Set of URIs already visited to prevent circular references
|
|
220
|
+
_is_root_call: Internal flag to detect the top-level call
|
|
221
|
+
resolve_mode: Resolution mode - "full", "shallow", or "reference"
|
|
222
|
+
- "full": Resolve and recurse (default)
|
|
223
|
+
- "shallow": Resolve but don't recurse into resolved object
|
|
224
|
+
- "reference": Keep as string, validate it exists
|
|
225
|
+
_current_property: Internal tracking of which property is being resolved (for better error messages)
|
|
226
|
+
|
|
227
|
+
Returns:
|
|
228
|
+
The data structure with all @id references resolved
|
|
229
|
+
"""
|
|
230
|
+
if visited is None:
|
|
231
|
+
visited = set()
|
|
232
|
+
|
|
233
|
+
# On first call only, get the expanded data if not provided
|
|
234
|
+
if expanded_data is None and _is_root_call:
|
|
235
|
+
expanded_data = self.data.expanded
|
|
236
|
+
if isinstance(expanded_data, list) and len(expanded_data) > 0:
|
|
237
|
+
expanded_data = expanded_data[0]
|
|
238
|
+
|
|
239
|
+
# Handle the case where expanded_data is a list with a single dict
|
|
240
|
+
# ONLY on the root call - not for nested list processing!
|
|
241
|
+
if _is_root_call and isinstance(expanded_data, list) and len(expanded_data) == 1:
|
|
242
|
+
expanded_data = expanded_data[0]
|
|
243
|
+
|
|
244
|
+
if isinstance(data, dict):
|
|
245
|
+
# Check if this dict is a simple @id reference (like {"@id": "hadgem3_gc31_atmos_100km"})
|
|
246
|
+
if "@id" in data and len(data) == 1:
|
|
247
|
+
id_value = data["@id"]
|
|
248
|
+
|
|
249
|
+
try:
|
|
250
|
+
# The expanded_data should have the full URI
|
|
251
|
+
uri = expanded_data.get("@id", id_value) if isinstance(expanded_data, dict) else id_value
|
|
252
|
+
|
|
253
|
+
# Only resolve if it's in our allowed URIs
|
|
254
|
+
if not self._should_resolve(uri):
|
|
255
|
+
return data
|
|
256
|
+
|
|
257
|
+
# Ensure it has .json extension
|
|
258
|
+
uri = self.uri_resolver.ensure_json_extension(uri)
|
|
259
|
+
|
|
260
|
+
# Prevent circular references (only within the current resolution chain)
|
|
261
|
+
if uri in visited:
|
|
262
|
+
logger.warning(f"Circular reference detected: {uri}")
|
|
263
|
+
return data
|
|
264
|
+
|
|
265
|
+
# Add to visited for this branch only
|
|
266
|
+
new_visited = visited.copy()
|
|
267
|
+
new_visited.add(uri)
|
|
268
|
+
|
|
269
|
+
# Convert remote URI to local path
|
|
270
|
+
local_uri = self.uri_resolver.to_local_path(uri)
|
|
271
|
+
|
|
272
|
+
# Create a temporary resource for the nested term
|
|
273
|
+
temp_resource = JsonLdResource(uri=local_uri)
|
|
274
|
+
|
|
275
|
+
# Create a DataMerger for this nested term to get project+universe merge
|
|
276
|
+
nested_merger = DataMerger(
|
|
277
|
+
data=temp_resource,
|
|
278
|
+
allowed_base_uris=self.allowed_base_uris,
|
|
279
|
+
locally_available=self.locally_available,
|
|
280
|
+
config=self.config,
|
|
281
|
+
)
|
|
282
|
+
|
|
283
|
+
# Get the merged project+universe data
|
|
284
|
+
merge_results = nested_merger.merge_linked_json()
|
|
285
|
+
resolved = merge_results[-1] # Final merged result
|
|
286
|
+
|
|
287
|
+
# Get proper expansion for the merged term
|
|
288
|
+
temp_expanded = temp_resource.expanded
|
|
289
|
+
if isinstance(temp_expanded, list) and len(temp_expanded) > 0:
|
|
290
|
+
temp_expanded = temp_expanded[0]
|
|
291
|
+
|
|
292
|
+
# Recursively resolve any nested references in the merged data
|
|
293
|
+
# Pass the expanded data for this specific term
|
|
294
|
+
return self.resolve_nested_ids(resolved, temp_expanded, new_visited, _is_root_call=False)
|
|
295
|
+
|
|
296
|
+
except Exception as e:
|
|
297
|
+
logger.error(f"Failed to resolve reference {id_value}: {e}")
|
|
298
|
+
return data
|
|
299
|
+
|
|
300
|
+
# Otherwise, recursively process all values in the dict
|
|
301
|
+
result = {}
|
|
302
|
+
for key, value in data.items():
|
|
303
|
+
# Find corresponding expanded value
|
|
304
|
+
# Map compact key to expanded key (e.g., "model_components" -> "http://schema.org/model_components")
|
|
305
|
+
# Also handle JSON-LD keywords: "id" -> "@id", "type" -> "@type"
|
|
306
|
+
expanded_key = key
|
|
307
|
+
if isinstance(expanded_data, dict):
|
|
308
|
+
# First check for JSON-LD keyword mappings
|
|
309
|
+
if key == "id":
|
|
310
|
+
expanded_key = "@id"
|
|
311
|
+
elif key == "type":
|
|
312
|
+
expanded_key = "@type"
|
|
313
|
+
else:
|
|
314
|
+
# Try to find the key in expanded data
|
|
315
|
+
# It might be under a full URI
|
|
316
|
+
for exp_key in expanded_data.keys():
|
|
317
|
+
# Check for exact match or if the URI contains the key
|
|
318
|
+
# URIs may have trailing slashes: https://.../activity/
|
|
319
|
+
if (
|
|
320
|
+
exp_key == key
|
|
321
|
+
or exp_key.endswith("/" + key)
|
|
322
|
+
or exp_key.endswith("/" + key + "/")
|
|
323
|
+
or exp_key.endswith("#" + key)
|
|
324
|
+
):
|
|
325
|
+
expanded_key = exp_key
|
|
326
|
+
break
|
|
327
|
+
|
|
328
|
+
# If not found, check the context to see if this key has a different @id
|
|
329
|
+
# (e.g., required_model_components has @id of source_type/)
|
|
330
|
+
if expanded_key == key and hasattr(self.data, "context"):
|
|
331
|
+
context = self.data.context
|
|
332
|
+
if isinstance(context, dict) and "@context" in context:
|
|
333
|
+
context = context["@context"]
|
|
334
|
+
if isinstance(context, dict) and key in context:
|
|
335
|
+
term_def = context[key]
|
|
336
|
+
if isinstance(term_def, dict) and "@id" in term_def:
|
|
337
|
+
# The @id value should match a key in expanded_data
|
|
338
|
+
id_value = term_def["@id"]
|
|
339
|
+
# Try with and without trailing slash
|
|
340
|
+
if id_value in expanded_data:
|
|
341
|
+
expanded_key = id_value
|
|
342
|
+
elif id_value.rstrip("/") + "/" in expanded_data:
|
|
343
|
+
expanded_key = id_value.rstrip("/") + "/"
|
|
344
|
+
elif id_value.rstrip("/") in expanded_data:
|
|
345
|
+
expanded_key = id_value.rstrip("/")
|
|
346
|
+
|
|
347
|
+
expanded_value = expanded_data.get(expanded_key) if isinstance(expanded_data, dict) else None
|
|
348
|
+
|
|
349
|
+
# Check if this field has a @resolve mode in the context
|
|
350
|
+
field_resolve_mode = self._get_resolve_mode(key)
|
|
351
|
+
|
|
352
|
+
resolved = self.resolve_nested_ids(
|
|
353
|
+
value,
|
|
354
|
+
expanded_value,
|
|
355
|
+
visited,
|
|
356
|
+
_is_root_call=False,
|
|
357
|
+
resolve_mode=field_resolve_mode,
|
|
358
|
+
_current_property=key,
|
|
359
|
+
)
|
|
360
|
+
result[key] = resolved
|
|
361
|
+
return result
|
|
362
|
+
|
|
363
|
+
elif isinstance(data, list) and isinstance(expanded_data, list):
|
|
364
|
+
# Recursively process each item in the list with corresponding expanded item
|
|
365
|
+
result = []
|
|
366
|
+
for i, item in enumerate(data):
|
|
367
|
+
expanded_item = expanded_data[i] if i < len(expanded_data) else None
|
|
368
|
+
# Pass visited set and resolve_mode to prevent circular references across list items
|
|
369
|
+
resolved_item = self.resolve_nested_ids(
|
|
370
|
+
item,
|
|
371
|
+
expanded_item,
|
|
372
|
+
visited,
|
|
373
|
+
_is_root_call=False,
|
|
374
|
+
resolve_mode=resolve_mode,
|
|
375
|
+
_current_property=_current_property,
|
|
376
|
+
)
|
|
377
|
+
result.append(resolved_item)
|
|
378
|
+
return result
|
|
379
|
+
|
|
380
|
+
elif isinstance(data, list):
|
|
381
|
+
# List but no corresponding expanded list, process without expanded data
|
|
382
|
+
# Each list item gets its own visited set
|
|
383
|
+
return [
|
|
384
|
+
self.resolve_nested_ids(
|
|
385
|
+
item,
|
|
386
|
+
None,
|
|
387
|
+
set(),
|
|
388
|
+
_is_root_call=False,
|
|
389
|
+
resolve_mode=resolve_mode,
|
|
390
|
+
_current_property=_current_property,
|
|
391
|
+
)
|
|
392
|
+
for item in data
|
|
393
|
+
]
|
|
394
|
+
|
|
395
|
+
else:
|
|
396
|
+
# Primitive values - but check if they're ID references
|
|
397
|
+
# If the compact form is a string but expanded form is {"@id": "..."},
|
|
398
|
+
# it's an ID reference that needs resolving
|
|
399
|
+
|
|
400
|
+
# JSON-LD expansion often wraps values in arrays, unwrap single-element arrays
|
|
401
|
+
if isinstance(expanded_data, list) and len(expanded_data) == 1:
|
|
402
|
+
expanded_data = expanded_data[0]
|
|
403
|
+
|
|
404
|
+
if isinstance(data, str) and isinstance(expanded_data, dict):
|
|
405
|
+
# Skip empty or whitespace-only strings
|
|
406
|
+
if not data or not data.strip():
|
|
407
|
+
return data
|
|
408
|
+
|
|
409
|
+
# Skip if it's a @value (literal string, not a reference)
|
|
410
|
+
if self.string_heuristics.should_skip_literal(expanded_data):
|
|
411
|
+
return data
|
|
412
|
+
|
|
413
|
+
if not self.string_heuristics.has_id_in_expanded(expanded_data):
|
|
414
|
+
return data
|
|
415
|
+
|
|
416
|
+
uri = expanded_data["@id"]
|
|
417
|
+
|
|
418
|
+
# Check resolve_mode FIRST before any expensive operations
|
|
419
|
+
if resolve_mode == "reference":
|
|
420
|
+
# "reference" mode: just validate the ID exists, keep as string
|
|
421
|
+
uri_to_check = self.uri_resolver.ensure_json_extension(uri)
|
|
422
|
+
if not self.uri_resolver.exists(uri_to_check):
|
|
423
|
+
property_msg = f" in property '{_current_property}'" if _current_property else ""
|
|
424
|
+
logger.warning(
|
|
425
|
+
f"Reference validation failed: ID '{data}' does not exist at {uri_to_check}{property_msg}"
|
|
426
|
+
)
|
|
427
|
+
return data # Keep as string regardless
|
|
428
|
+
|
|
429
|
+
# Use string heuristics to determine if this should be resolved
|
|
430
|
+
if not self.string_heuristics.is_resolvable(data):
|
|
431
|
+
return data
|
|
432
|
+
|
|
433
|
+
# Only resolve if it's in our allowed URIs
|
|
434
|
+
if not self._should_resolve(uri):
|
|
435
|
+
return data
|
|
436
|
+
|
|
437
|
+
# Check if recursion depth is too deep (prevent infinite loops)
|
|
438
|
+
if len(visited) > self.config.max_depth:
|
|
439
|
+
if self.config.log_depth_warnings:
|
|
440
|
+
logger.warning(
|
|
441
|
+
f"Max depth ({self.config.max_depth}) exceeded. Visited {len(visited)} URIs. Current: {uri}"
|
|
442
|
+
)
|
|
443
|
+
return data
|
|
444
|
+
|
|
445
|
+
# Ensure it has .json extension
|
|
446
|
+
uri = self.uri_resolver.ensure_json_extension(uri)
|
|
447
|
+
|
|
448
|
+
# Prevent circular references
|
|
449
|
+
if uri in visited:
|
|
450
|
+
logger.debug(f"Circular reference detected: {uri}")
|
|
451
|
+
return data
|
|
452
|
+
|
|
453
|
+
# Check if the file exists before trying to resolve
|
|
454
|
+
# Don't resolve strings that are just enum values or simple identifiers
|
|
455
|
+
# Only resolve if it looks like a real component/grid reference
|
|
456
|
+
try:
|
|
457
|
+
# Convert remote URI to local path
|
|
458
|
+
local_uri = self.uri_resolver.to_local_path(uri)
|
|
459
|
+
|
|
460
|
+
# Check if file exists - if not, it's probably not a resolvable reference
|
|
461
|
+
if not self.uri_resolver.exists(uri):
|
|
462
|
+
property_msg = f" Property: '{_current_property}'\n" if _current_property else ""
|
|
463
|
+
logger.warning(
|
|
464
|
+
f"Cannot resolve ID reference: File not found\n"
|
|
465
|
+
f" Current term: {self.data.uri}\n"
|
|
466
|
+
f"{property_msg}"
|
|
467
|
+
f" String value: '{data}'\n"
|
|
468
|
+
f" Expected URI: {uri}\n"
|
|
469
|
+
f" Local path tried: {local_uri}\n"
|
|
470
|
+
f" → Keeping as unresolved string"
|
|
471
|
+
)
|
|
472
|
+
return data
|
|
473
|
+
except (OSError, IOError) as e:
|
|
474
|
+
property_msg = f" Property: '{_current_property}'\n" if _current_property else ""
|
|
475
|
+
logger.warning(
|
|
476
|
+
f"Cannot resolve ID reference: Error checking file existence\n"
|
|
477
|
+
f" Current term: {self.data.uri}\n"
|
|
478
|
+
f"{property_msg}"
|
|
479
|
+
f" String value: '{data}'\n"
|
|
480
|
+
f" Expected URI: {uri}\n"
|
|
481
|
+
f" Error: {e}\n"
|
|
482
|
+
f" → Keeping as unresolved string"
|
|
483
|
+
)
|
|
484
|
+
return data
|
|
485
|
+
|
|
486
|
+
# Add to visited for this branch only
|
|
487
|
+
new_visited = visited.copy()
|
|
488
|
+
new_visited.add(uri)
|
|
489
|
+
|
|
490
|
+
try:
|
|
491
|
+
# Create a temporary resource for the nested term
|
|
492
|
+
temp_resource = JsonLdResource(uri=local_uri)
|
|
493
|
+
|
|
494
|
+
# Create a DataMerger for this nested term to get project+universe merge
|
|
495
|
+
nested_merger = DataMerger(
|
|
496
|
+
data=temp_resource,
|
|
497
|
+
allowed_base_uris=self.allowed_base_uris,
|
|
498
|
+
locally_available=self.locally_available,
|
|
499
|
+
config=self.config,
|
|
500
|
+
)
|
|
501
|
+
|
|
502
|
+
# Get the merged project+universe data
|
|
503
|
+
merge_results = nested_merger.merge_linked_json()
|
|
504
|
+
resolved = merge_results[-1] # Final merged result
|
|
505
|
+
|
|
506
|
+
logger.info(
|
|
507
|
+
f"Successfully resolved ID reference\n"
|
|
508
|
+
f" Current term: {self.data.uri}\n"
|
|
509
|
+
f" String value: '{data}'\n"
|
|
510
|
+
f" Resolved to: {uri}\n"
|
|
511
|
+
f" Mode: {resolve_mode}\n"
|
|
512
|
+
f" → Replacing with {'shallow' if resolve_mode == 'shallow' else 'full'} object"
|
|
513
|
+
)
|
|
514
|
+
|
|
515
|
+
# Get proper expansion for the merged term
|
|
516
|
+
temp_expanded = temp_resource.expanded
|
|
517
|
+
if isinstance(temp_expanded, list) and len(temp_expanded) > 0:
|
|
518
|
+
temp_expanded = temp_expanded[0]
|
|
519
|
+
|
|
520
|
+
# Handle resolution based on mode
|
|
521
|
+
if resolve_mode == "shallow":
|
|
522
|
+
# "shallow" mode: return the merged object but DON'T resolve its nested IDs
|
|
523
|
+
return resolved
|
|
524
|
+
else: # "full"
|
|
525
|
+
# "full" mode: recursively resolve any nested references in the merged data
|
|
526
|
+
return nested_merger.resolve_nested_ids(
|
|
527
|
+
resolved, temp_expanded, new_visited, _is_root_call=False, resolve_mode="full"
|
|
528
|
+
)
|
|
529
|
+
|
|
530
|
+
except Exception as e:
|
|
531
|
+
property_msg = f" Property: '{_current_property}'\n" if _current_property else ""
|
|
532
|
+
logger.warning(
|
|
533
|
+
f"Cannot resolve ID reference: Exception during resolution\n"
|
|
534
|
+
f" Current term: {self.data.uri}\n"
|
|
535
|
+
f"{property_msg}"
|
|
536
|
+
f" String value: '{data}'\n"
|
|
537
|
+
f" Expected URI: {uri}\n"
|
|
538
|
+
f" Error: {e}\n"
|
|
539
|
+
f" → Keeping as unresolved string"
|
|
540
|
+
)
|
|
541
|
+
return data
|
|
542
|
+
|
|
543
|
+
# Regular primitive values are returned as-is
|
|
544
|
+
return data
|
|
545
|
+
|
|
546
|
+
def resolve_merged_ids(self, merged_data: dict, context_base_path: str | None = None) -> dict:
|
|
547
|
+
"""
|
|
548
|
+
Resolve nested IDs in merged data by re-expanding it with proper context.
|
|
549
|
+
|
|
550
|
+
This is needed because merged data may contain fields from the parent term
|
|
551
|
+
that aren't in the original term's context.
|
|
552
|
+
|
|
553
|
+
Args:
|
|
554
|
+
merged_data: The merged dictionary from merge_linked_json()
|
|
555
|
+
context_base_path: Base path containing context directories. If None,
|
|
556
|
+
attempts to infer from locally_available mappings.
|
|
557
|
+
|
|
558
|
+
Returns:
|
|
559
|
+
Dictionary with all nested IDs resolved to full objects
|
|
560
|
+
"""
|
|
561
|
+
import json
|
|
562
|
+
import tempfile
|
|
563
|
+
from pathlib import Path
|
|
564
|
+
|
|
565
|
+
# Determine the base path for context
|
|
566
|
+
if context_base_path is None:
|
|
567
|
+
# Try to infer from locally_available - use first available path
|
|
568
|
+
# Preferring the universe path
|
|
569
|
+
if "https://esgvoc.ipsl.fr/resource/universe" in self.locally_available:
|
|
570
|
+
context_base_path = self.locally_available["https://esgvoc.ipsl.fr/resource/universe"]
|
|
571
|
+
elif self.locally_available:
|
|
572
|
+
# Use first available local path
|
|
573
|
+
context_base_path = next(iter(self.locally_available.values()))
|
|
574
|
+
else:
|
|
575
|
+
# No local paths available, fallback to regular resolution
|
|
576
|
+
return self.resolve_nested_ids(merged_data)
|
|
577
|
+
|
|
578
|
+
# Find the data descriptor directory from merged_data type
|
|
579
|
+
data_descriptor = merged_data.get("type", "")
|
|
580
|
+
if not data_descriptor:
|
|
581
|
+
return self.resolve_nested_ids(merged_data)
|
|
582
|
+
|
|
583
|
+
context_dir = Path(context_base_path) / data_descriptor
|
|
584
|
+
|
|
585
|
+
if not context_dir.exists():
|
|
586
|
+
# Fallback if directory doesn't exist
|
|
587
|
+
return self.resolve_nested_ids(merged_data)
|
|
588
|
+
|
|
589
|
+
# Create temp file in the universe data descriptor directory
|
|
590
|
+
# This ensures JsonLdResource picks up the correct context
|
|
591
|
+
with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False, dir=str(context_dir)) as tmp:
|
|
592
|
+
json.dump(merged_data, tmp)
|
|
593
|
+
tmp_path = tmp.name
|
|
594
|
+
|
|
595
|
+
try:
|
|
596
|
+
# Create new resource with proper context expansion
|
|
597
|
+
merged_resource = JsonLdResource(uri=tmp_path)
|
|
598
|
+
merged_expanded = merged_resource.expanded
|
|
599
|
+
if isinstance(merged_expanded, list) and len(merged_expanded) > 0:
|
|
600
|
+
merged_expanded = merged_expanded[0]
|
|
601
|
+
|
|
602
|
+
# Temporarily update self.data to use merged resource's context
|
|
603
|
+
# so that _get_resolve_mode() uses the correct esgvoc_resolve_modes
|
|
604
|
+
original_data = self.data
|
|
605
|
+
self.data = merged_resource
|
|
606
|
+
|
|
607
|
+
try:
|
|
608
|
+
# Resolve with correct expansion and context
|
|
609
|
+
return self.resolve_nested_ids(merged_data, expanded_data=merged_expanded)
|
|
610
|
+
finally:
|
|
611
|
+
# Restore original data
|
|
612
|
+
self.data = original_data
|
|
613
|
+
finally:
|
|
614
|
+
Path(tmp_path).unlink()
|
|
615
|
+
|
|
616
|
+
|
|
617
|
+
if __name__ == "__main__":
    import warnings

    # Silence all warnings for manual runs of this module.
    warnings.simplefilter("ignore")

    # Disabled manual checks, kept for reference.
    #
    # Check 1: expand the project term institution_id "ipsl" and merge it
    # with the institution "ipsl" term:
    # proj_ipsl = JsonLdResource(uri = "https://espri-mod.github.io/CMIP6Plus_CVs/institution_id/ipsl.json")
    # allowed_uris = {"https://espri-mod.github.io/CMIP6Plus_CVs/","https://espri-mod.github.io/mip-cmor-tables/"}
    # mdm = DataMerger(data =proj_ipsl, allowed_base_uris = allowed_uris)
    # json_list = mdm.merge_linked_json()
    # pprint([res for res in json_list])
    #
    # Check 2: same merge starting from a locally cached copy:
    # a = JsonLdResource(uri = ".cache/repos/CMIP6Plus_CVs/institution_id/ipsl.json")
    # mdm = DataMerger(data=a)
    # print(mdm.merge_linked_json())
|