esgvoc 2.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147) hide show
  1. esgvoc/__init__.py +3 -0
  2. esgvoc/api/__init__.py +91 -0
  3. esgvoc/api/data_descriptors/EMD_models/__init__.py +66 -0
  4. esgvoc/api/data_descriptors/EMD_models/arrangement.py +21 -0
  5. esgvoc/api/data_descriptors/EMD_models/calendar.py +5 -0
  6. esgvoc/api/data_descriptors/EMD_models/cell_variable_type.py +20 -0
  7. esgvoc/api/data_descriptors/EMD_models/component_type.py +5 -0
  8. esgvoc/api/data_descriptors/EMD_models/coordinate.py +52 -0
  9. esgvoc/api/data_descriptors/EMD_models/grid_mapping.py +19 -0
  10. esgvoc/api/data_descriptors/EMD_models/grid_region.py +19 -0
  11. esgvoc/api/data_descriptors/EMD_models/grid_type.py +19 -0
  12. esgvoc/api/data_descriptors/EMD_models/horizontal_computational_grid.py +56 -0
  13. esgvoc/api/data_descriptors/EMD_models/horizontal_grid_cells.py +230 -0
  14. esgvoc/api/data_descriptors/EMD_models/horizontal_subgrid.py +41 -0
  15. esgvoc/api/data_descriptors/EMD_models/horizontal_units.py +5 -0
  16. esgvoc/api/data_descriptors/EMD_models/model.py +139 -0
  17. esgvoc/api/data_descriptors/EMD_models/model_component.py +115 -0
  18. esgvoc/api/data_descriptors/EMD_models/reference.py +61 -0
  19. esgvoc/api/data_descriptors/EMD_models/resolution.py +48 -0
  20. esgvoc/api/data_descriptors/EMD_models/temporal_refinement.py +19 -0
  21. esgvoc/api/data_descriptors/EMD_models/truncation_method.py +17 -0
  22. esgvoc/api/data_descriptors/EMD_models/vertical_computational_grid.py +91 -0
  23. esgvoc/api/data_descriptors/EMD_models/vertical_coordinate.py +5 -0
  24. esgvoc/api/data_descriptors/EMD_models/vertical_units.py +19 -0
  25. esgvoc/api/data_descriptors/__init__.py +159 -0
  26. esgvoc/api/data_descriptors/activity.py +72 -0
  27. esgvoc/api/data_descriptors/archive.py +5 -0
  28. esgvoc/api/data_descriptors/area_label.py +30 -0
  29. esgvoc/api/data_descriptors/branded_suffix.py +30 -0
  30. esgvoc/api/data_descriptors/branded_variable.py +21 -0
  31. esgvoc/api/data_descriptors/citation_url.py +5 -0
  32. esgvoc/api/data_descriptors/contact.py +5 -0
  33. esgvoc/api/data_descriptors/conventions.py +28 -0
  34. esgvoc/api/data_descriptors/creation_date.py +18 -0
  35. esgvoc/api/data_descriptors/data_descriptor.py +127 -0
  36. esgvoc/api/data_descriptors/data_specs_version.py +25 -0
  37. esgvoc/api/data_descriptors/date.py +5 -0
  38. esgvoc/api/data_descriptors/directory_date.py +22 -0
  39. esgvoc/api/data_descriptors/drs_specs.py +38 -0
  40. esgvoc/api/data_descriptors/experiment.py +215 -0
  41. esgvoc/api/data_descriptors/forcing_index.py +21 -0
  42. esgvoc/api/data_descriptors/frequency.py +48 -0
  43. esgvoc/api/data_descriptors/further_info_url.py +5 -0
  44. esgvoc/api/data_descriptors/grid.py +43 -0
  45. esgvoc/api/data_descriptors/horizontal_label.py +20 -0
  46. esgvoc/api/data_descriptors/initialization_index.py +27 -0
  47. esgvoc/api/data_descriptors/institution.py +80 -0
  48. esgvoc/api/data_descriptors/known_branded_variable.py +75 -0
  49. esgvoc/api/data_descriptors/license.py +31 -0
  50. esgvoc/api/data_descriptors/member_id.py +9 -0
  51. esgvoc/api/data_descriptors/mip_era.py +26 -0
  52. esgvoc/api/data_descriptors/model_component.py +32 -0
  53. esgvoc/api/data_descriptors/models_test/models.py +17 -0
  54. esgvoc/api/data_descriptors/nominal_resolution.py +50 -0
  55. esgvoc/api/data_descriptors/obs_type.py +5 -0
  56. esgvoc/api/data_descriptors/organisation.py +22 -0
  57. esgvoc/api/data_descriptors/physics_index.py +21 -0
  58. esgvoc/api/data_descriptors/product.py +16 -0
  59. esgvoc/api/data_descriptors/publication_status.py +5 -0
  60. esgvoc/api/data_descriptors/realization_index.py +24 -0
  61. esgvoc/api/data_descriptors/realm.py +16 -0
  62. esgvoc/api/data_descriptors/regex.py +5 -0
  63. esgvoc/api/data_descriptors/region.py +35 -0
  64. esgvoc/api/data_descriptors/resolution.py +7 -0
  65. esgvoc/api/data_descriptors/source.py +120 -0
  66. esgvoc/api/data_descriptors/source_type.py +5 -0
  67. esgvoc/api/data_descriptors/sub_experiment.py +5 -0
  68. esgvoc/api/data_descriptors/table.py +28 -0
  69. esgvoc/api/data_descriptors/temporal_label.py +20 -0
  70. esgvoc/api/data_descriptors/time_range.py +17 -0
  71. esgvoc/api/data_descriptors/title.py +5 -0
  72. esgvoc/api/data_descriptors/tracking_id.py +67 -0
  73. esgvoc/api/data_descriptors/variable.py +56 -0
  74. esgvoc/api/data_descriptors/variant_label.py +25 -0
  75. esgvoc/api/data_descriptors/vertical_label.py +20 -0
  76. esgvoc/api/project_specs.py +143 -0
  77. esgvoc/api/projects.py +1253 -0
  78. esgvoc/api/py.typed +0 -0
  79. esgvoc/api/pydantic_handler.py +146 -0
  80. esgvoc/api/report.py +127 -0
  81. esgvoc/api/search.py +171 -0
  82. esgvoc/api/universe.py +434 -0
  83. esgvoc/apps/__init__.py +6 -0
  84. esgvoc/apps/cmor_tables/__init__.py +7 -0
  85. esgvoc/apps/cmor_tables/cvs_table.py +948 -0
  86. esgvoc/apps/drs/__init__.py +0 -0
  87. esgvoc/apps/drs/constants.py +2 -0
  88. esgvoc/apps/drs/generator.py +429 -0
  89. esgvoc/apps/drs/report.py +540 -0
  90. esgvoc/apps/drs/validator.py +312 -0
  91. esgvoc/apps/ga/__init__.py +104 -0
  92. esgvoc/apps/ga/example_usage.py +315 -0
  93. esgvoc/apps/ga/models/__init__.py +47 -0
  94. esgvoc/apps/ga/models/netcdf_header.py +306 -0
  95. esgvoc/apps/ga/models/validator.py +491 -0
  96. esgvoc/apps/ga/test_ga.py +161 -0
  97. esgvoc/apps/ga/validator.py +277 -0
  98. esgvoc/apps/jsg/json_schema_generator.py +341 -0
  99. esgvoc/apps/jsg/templates/template.jinja +241 -0
  100. esgvoc/apps/test_cv/README.md +214 -0
  101. esgvoc/apps/test_cv/__init__.py +0 -0
  102. esgvoc/apps/test_cv/cv_tester.py +1611 -0
  103. esgvoc/apps/test_cv/example_usage.py +216 -0
  104. esgvoc/apps/vr/__init__.py +12 -0
  105. esgvoc/apps/vr/build_variable_registry.py +71 -0
  106. esgvoc/apps/vr/example_usage.py +60 -0
  107. esgvoc/apps/vr/vr_app.py +333 -0
  108. esgvoc/cli/clean.py +304 -0
  109. esgvoc/cli/cmor.py +46 -0
  110. esgvoc/cli/config.py +1300 -0
  111. esgvoc/cli/drs.py +267 -0
  112. esgvoc/cli/find.py +138 -0
  113. esgvoc/cli/get.py +155 -0
  114. esgvoc/cli/install.py +41 -0
  115. esgvoc/cli/main.py +60 -0
  116. esgvoc/cli/offline.py +269 -0
  117. esgvoc/cli/status.py +79 -0
  118. esgvoc/cli/test_cv.py +258 -0
  119. esgvoc/cli/valid.py +147 -0
  120. esgvoc/core/constants.py +17 -0
  121. esgvoc/core/convert.py +0 -0
  122. esgvoc/core/data_handler.py +206 -0
  123. esgvoc/core/db/__init__.py +3 -0
  124. esgvoc/core/db/connection.py +40 -0
  125. esgvoc/core/db/models/mixins.py +25 -0
  126. esgvoc/core/db/models/project.py +102 -0
  127. esgvoc/core/db/models/universe.py +98 -0
  128. esgvoc/core/db/project_ingestion.py +231 -0
  129. esgvoc/core/db/universe_ingestion.py +172 -0
  130. esgvoc/core/exceptions.py +33 -0
  131. esgvoc/core/logging_handler.py +26 -0
  132. esgvoc/core/repo_fetcher.py +345 -0
  133. esgvoc/core/service/__init__.py +41 -0
  134. esgvoc/core/service/configuration/config_manager.py +196 -0
  135. esgvoc/core/service/configuration/setting.py +363 -0
  136. esgvoc/core/service/data_merger.py +634 -0
  137. esgvoc/core/service/esg_voc.py +77 -0
  138. esgvoc/core/service/resolver_config.py +56 -0
  139. esgvoc/core/service/state.py +324 -0
  140. esgvoc/core/service/string_heuristics.py +98 -0
  141. esgvoc/core/service/term_cache.py +108 -0
  142. esgvoc/core/service/uri_resolver.py +133 -0
  143. esgvoc-2.0.2.dist-info/METADATA +82 -0
  144. esgvoc-2.0.2.dist-info/RECORD +147 -0
  145. esgvoc-2.0.2.dist-info/WHEEL +4 -0
  146. esgvoc-2.0.2.dist-info/entry_points.txt +2 -0
  147. esgvoc-2.0.2.dist-info/licenses/LICENSE.txt +519 -0
@@ -0,0 +1,634 @@
1
+ import logging
2
+ from typing import Dict, List, Set
3
+
4
+ from esgvoc.core.data_handler import JsonLdResource
5
+ from esgvoc.core.service.resolver_config import ResolverConfig
6
+ from esgvoc.core.service.string_heuristics import StringHeuristics
7
+ from esgvoc.core.service.term_cache import TermCache
8
+ from esgvoc.core.service.uri_resolver import URIResolver
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
def merge_dicts(base: list, override: list) -> dict:
    """
    Merge two JSON-LD dictionaries, with override taking precedence.

    This performs a shallow merge where:
    1. Start with base data (parent/universe defaults)
    2. Overlay with override data (custom/project-specific), which wins on
       every key present in both
    3. Skip @id field from both (handled separately in JSON-LD)

    Keys keep the position of their first appearance (base keys first, then
    keys that only exist in override).

    Args:
        base: List containing the base/parent dictionary (expects [dict])
        override: List containing the override/child dictionary (expects [dict])

    Returns:
        Merged dictionary with override values taking precedence

    Example:
        >>> base = [{"name": "Base", "value": 1}]
        >>> override = [{"value": 2, "new": "field"}]
        >>> merge_dicts(base, override)
        {'name': 'Base', 'value': 2, 'new': 'field'}

    Note:
        Takes lists for backward compatibility with JSON-LD processing,
        but only uses the first element of each. An empty list is treated
        as an empty dictionary.
    """
    # An empty wrapper list contributes nothing instead of raising IndexError
    base_data = base[0] if base else {}
    override_data = override[0] if override else {}

    # Merge strategy: base first (fills in defaults), then override (takes precedence)
    merged = {
        **{k: v for k, v in base_data.items() if k != "@id"},
        **{k: v for k, v in override_data.items() if k != "@id"},
    }
    return merged
48
+
49
+
50
def merge(uri: str) -> Dict:
    """Load the JSON-LD resource at ``uri`` and return the last entry of its merge chain."""
    resource = JsonLdResource(uri=uri)
    merge_chain = DataMerger(data=resource).merge_linked_json()
    # The chain is ordered from the original document to the fully merged one
    return merge_chain[-1]
53
+
54
+
55
def resolve_nested_ids_in_dict(data: dict, merger: "DataMerger") -> dict:
    """
    Convenience wrapper that delegates to ``merger.resolve_nested_ids``.

    Args:
        data: The dictionary containing potential @id references
        merger: The DataMerger instance whose ``resolve_nested_ids`` is called

    Returns:
        Whatever ``merger.resolve_nested_ids(data)`` returns
    """
    resolved = merger.resolve_nested_ids(data)
    return resolved
67
+
68
+
69
class DataMerger:
    """
    Merge and resolve JSON-LD data with support for @id references.

    This class handles:
    1. Merging linked JSON-LD documents (following @id chains)
    2. Resolving nested @id references to full objects
    3. Converting between remote URIs and local paths
    """

    def __init__(
        self,
        data: JsonLdResource,
        allowed_base_uris: Set[str] | None = None,
        locally_available: Dict[str, str] | None = None,
        config: ResolverConfig | None = None,
    ):
        """
        Initialize the DataMerger.

        Args:
            data: The JSON-LD resource to process
            allowed_base_uris: Set of base URIs that are allowed to be resolved.
                Defaults to {"https://espri-mod.github.io/mip-cmor-tables"}
                for backward compatibility.
            locally_available: Mapping from remote base URIs to local directory paths.
                Defaults to empty dict for backward compatibility.
            config: Configuration for resolution behavior. If None, uses defaults.
        """
        self.data = data

        # Backward compatibility: use default CMIP URI if none provided
        if allowed_base_uris is None:
            allowed_base_uris = {"https://espri-mod.github.io/mip-cmor-tables"}
        self.allowed_base_uris = allowed_base_uris

        # Fix mutable default anti-pattern
        if locally_available is None:
            locally_available = {}
        self.locally_available = locally_available

        # Initialize configuration and helpers
        self.config = config or ResolverConfig()
        self.uri_resolver = URIResolver(self.locally_available)
        self.string_heuristics = StringHeuristics(
            max_length=self.config.max_string_length, exclude_patterns=self.config.exclude_patterns
        )
        self.term_cache = TermCache(max_size=self.config.cache_size, enabled=self.config.enable_caching)

    def _should_resolve(self, uri: str) -> bool:
        """Check if a given URI should be resolved based on allowed URIs."""
        return any(uri.startswith(base) for base in self.allowed_base_uris)

    def _get_resolve_mode(self, key: str) -> str:
        """
        Get the resolve mode for a field from the context.

        Args:
            key: The field name to check

        Returns:
            The value stored under ``esgvoc_resolve_modes[key]`` in the resource
            context when present ("full", "shallow", or "reference" are the
            modes handled by resolve_nested_ids), otherwise "full".
        """
        if not hasattr(self.data, "context"):
            return "full"

        context = self.data.context

        # Check for esgvoc_resolve_modes at the root level (outside @context)
        if isinstance(context, dict) and "esgvoc_resolve_modes" in context:
            resolve_modes = context["esgvoc_resolve_modes"]
            if isinstance(resolve_modes, dict) and key in resolve_modes:
                return resolve_modes[key]

        return "full"  # Default: full resolution

    def _get_next_id(self, data: dict, current_uri: str | None = None) -> str | None:
        """
        Extract the next @id from the data if it is a valid customization reference.

        Args:
            data: The expanded JSON-LD data (a dict, or a list whose first element is used)
            current_uri: The URI of the current resource (to avoid self-reference)

        Returns:
            The next URI to fetch and merge, or None if no valid reference exists
        """
        if isinstance(data, list):
            data = data[0]
        if "@id" in data and self._should_resolve(data["@id"]):
            result = self.uri_resolver.ensure_json_extension(data["@id"])

            # Don't follow the reference if it points to the same resource
            if current_uri and result == current_uri:
                return None

            return result
        return None

    def merge_linked_json(self) -> List[Dict]:
        """
        Fetch and merge data recursively, returning a list of progressively merged Data json instances.

        Returns:
            A list whose first element is the original ``json_dict`` of the
            resource; each following element is the result of merging the next
            linked resource (used as base) with the previous element (used as
            override). The last element is the final merged result.
        """
        # Start with the original json object
        result_list = [self.data.json_dict]
        visited = set()  # Track visited URIs (remote URIs) to prevent cycles
        current_expanded = self.data.expanded[0]
        current_json = self.data.json_dict
        current_remote_uri = None  # Track the remote URI of the current resource

        while True:
            # Get the next @id to follow, passing the current remote URI to avoid self-reference
            next_id = self._get_next_id(current_expanded, current_remote_uri)
            if not next_id or next_id in visited or not self._should_resolve(next_id):
                break
            visited.add(next_id)
            current_remote_uri = next_id  # Save for next iteration

            # Fetch and merge the next customization
            # Convert remote URI to local path if available
            next_id_local = self.uri_resolver.to_local_path(next_id)

            next_data_instance = JsonLdResource(uri=next_id_local)
            merged_json_data = merge_dicts([next_data_instance.json_dict], [current_json])

            # Add the merged instance to the result list
            result_list.append(merged_json_data)

            # For the next iteration, use the expanded data from the newly loaded resource
            # (NOT from the merged data, as merge is about overlaying, not chaining references)
            current_expanded = next_data_instance.expanded[0]
            current_json = merged_json_data
        return result_list

    def resolve_nested_ids(
        self,
        data,
        expanded_data=None,
        visited: Set[str] | None = None,
        _is_root_call: bool = True,
        resolve_mode: str = "full",
        _current_property: str | None = None,
    ) -> dict | list:
        """
        Recursively resolve all @id references in nested structures.

        Uses the expanded JSON-LD to find full URIs, fetches referenced terms,
        and replaces references with full objects.

        Args:
            data: The compact JSON data to process (dict, list, or primitive)
            expanded_data: The expanded JSON-LD version (with full URIs)
            visited: Set of URIs already visited to prevent circular references.
                It is never mutated in place; each resolution step works on a copy,
                so the set always describes the current resolution chain.
            _is_root_call: Internal flag to detect the top-level call
            resolve_mode: Resolution mode - "full", "shallow", or "reference"
                - "full": Resolve and recurse (default)
                - "shallow": Resolve but don't recurse into resolved object
                - "reference": Keep as string, validate it exists
            _current_property: Internal tracking of which property is being resolved (for better error messages)

        Returns:
            The data structure with all @id references resolved. References that
            cannot be resolved (not allowed, missing, circular, or failing) are
            left unchanged.
        """
        if visited is None:
            visited = set()

        # On first call only, get the expanded data if not provided
        if expanded_data is None and _is_root_call:
            expanded_data = self.data.expanded
            if isinstance(expanded_data, list) and len(expanded_data) > 0:
                expanded_data = expanded_data[0]

        # Handle the case where expanded_data is a list with a single dict
        # ONLY on the root call - not for nested list processing!
        if _is_root_call and isinstance(expanded_data, list) and len(expanded_data) == 1:
            expanded_data = expanded_data[0]

        if isinstance(data, dict):
            # Check if this dict is a simple @id reference (like {"@id": "hadgem3_gc31_atmos_100km"})
            if "@id" in data and len(data) == 1:
                id_value = data["@id"]

                try:
                    # The expanded_data should have the full URI
                    uri = expanded_data.get("@id", id_value) if isinstance(expanded_data, dict) else id_value

                    # Only resolve if it's in our allowed URIs
                    if not self._should_resolve(uri):
                        return data

                    # Ensure it has .json extension
                    uri = self.uri_resolver.ensure_json_extension(uri)

                    # Prevent circular references (only within the current resolution chain)
                    if uri in visited:
                        logger.warning(f"Circular reference detected: {uri}")
                        return data

                    # Add to visited for this branch only
                    new_visited = visited.copy()
                    new_visited.add(uri)

                    # Convert remote URI to local path
                    local_uri = self.uri_resolver.to_local_path(uri)

                    # Create a temporary resource for the nested term
                    temp_resource = JsonLdResource(uri=local_uri)

                    # Create a DataMerger for this nested term to get project+universe merge
                    nested_merger = DataMerger(
                        data=temp_resource,
                        allowed_base_uris=self.allowed_base_uris,
                        locally_available=self.locally_available,
                        config=self.config,
                    )

                    # Get the merged project+universe data
                    merge_results = nested_merger.merge_linked_json()
                    resolved = merge_results[-1]  # Final merged result

                    # Get proper expansion for the merged term
                    temp_expanded = temp_resource.expanded
                    if isinstance(temp_expanded, list) and len(temp_expanded) > 0:
                        temp_expanded = temp_expanded[0]

                    # Recursively resolve any nested references in the merged data
                    # Pass the expanded data for this specific term
                    # NOTE(review): this recurses through `self` (parent context) while the
                    # string-reference branch below recurses through `nested_merger` - confirm
                    # whether the difference is intentional.
                    return self.resolve_nested_ids(resolved, temp_expanded, new_visited, _is_root_call=False)

                except Exception as e:
                    logger.error(f"Failed to resolve reference {id_value}: {e}")
                    return data

            # Otherwise, recursively process all values in the dict
            result = {}
            for key, value in data.items():
                # Find corresponding expanded value
                # Map compact key to expanded key (e.g., "model_components" -> "http://schema.org/model_components")
                # Also handle JSON-LD keywords: "id" -> "@id", "type" -> "@type"
                expanded_key = key
                if isinstance(expanded_data, dict):
                    # First check for JSON-LD keyword mappings
                    if key == "id":
                        expanded_key = "@id"
                    elif key == "type":
                        expanded_key = "@type"
                    else:
                        # Try to find the key in expanded data
                        # It might be under a full URI
                        for exp_key in expanded_data.keys():
                            # Check for exact match or if the URI contains the key
                            # URIs may have trailing slashes: https://.../activity/
                            if (
                                exp_key == key
                                or exp_key.endswith("/" + key)
                                or exp_key.endswith("/" + key + "/")
                                or exp_key.endswith("#" + key)
                            ):
                                expanded_key = exp_key
                                break

                        # If not found, check the context to see if this key has a different @id
                        # (e.g., required_model_components has @id of source_type/)
                        if expanded_key == key and hasattr(self.data, "context"):
                            context = self.data.context
                            if isinstance(context, dict) and "@context" in context:
                                context = context["@context"]
                            if isinstance(context, dict) and key in context:
                                term_def = context[key]
                                if isinstance(term_def, dict) and "@id" in term_def:
                                    # The @id value should match a key in expanded_data
                                    id_value = term_def["@id"]
                                    # Try with and without trailing slash
                                    if id_value in expanded_data:
                                        expanded_key = id_value
                                    elif id_value.rstrip("/") + "/" in expanded_data:
                                        expanded_key = id_value.rstrip("/") + "/"
                                    elif id_value.rstrip("/") in expanded_data:
                                        expanded_key = id_value.rstrip("/")

                expanded_value = expanded_data.get(expanded_key) if isinstance(expanded_data, dict) else None

                # Check if this field has a @resolve mode in the context
                field_resolve_mode = self._get_resolve_mode(key)

                resolved = self.resolve_nested_ids(
                    value,
                    expanded_value,
                    visited,
                    _is_root_call=False,
                    resolve_mode=field_resolve_mode,
                    _current_property=key,
                )
                result[key] = resolved
            return result

        elif isinstance(data, list) and isinstance(expanded_data, list):
            # Recursively process each item in the list with corresponding expanded item
            result = []
            for i, item in enumerate(data):
                expanded_item = expanded_data[i] if i < len(expanded_data) else None
                # Pass visited set and resolve_mode to prevent circular references across list items
                resolved_item = self.resolve_nested_ids(
                    item,
                    expanded_item,
                    visited,
                    _is_root_call=False,
                    resolve_mode=resolve_mode,
                    _current_property=_current_property,
                )
                result.append(resolved_item)
            return result

        elif isinstance(data, list):
            # List but no corresponding expanded list, process without expanded data.
            # Keep the current resolution chain: `visited` is never mutated in place
            # (branches copy before adding), so sharing it cannot leak state between
            # items, while replacing it with an empty set would disable both the
            # circular-reference check and the max-depth guard below this list.
            return [
                self.resolve_nested_ids(
                    item,
                    None,
                    visited,
                    _is_root_call=False,
                    resolve_mode=resolve_mode,
                    _current_property=_current_property,
                )
                for item in data
            ]

        else:
            # Primitive values - but check if they're ID references
            # If the compact form is a string but expanded form is {"@id": "..."},
            # it's an ID reference that needs resolving

            # JSON-LD expansion often wraps values in arrays, unwrap single-element arrays
            if isinstance(expanded_data, list) and len(expanded_data) == 1:
                expanded_data = expanded_data[0]

            if isinstance(data, str) and isinstance(expanded_data, dict):
                # Skip empty or whitespace-only strings
                if not data or not data.strip():
                    return data

                # Skip if it's a @value (literal string, not a reference)
                if self.string_heuristics.should_skip_literal(expanded_data):
                    return data

                if not self.string_heuristics.has_id_in_expanded(expanded_data):
                    return data

                uri = expanded_data["@id"]

                # Check resolve_mode FIRST before any expensive operations
                if resolve_mode == "reference":
                    # "reference" mode: just validate the ID exists, keep as string
                    uri_to_check = self.uri_resolver.ensure_json_extension(uri)
                    if not self.uri_resolver.exists(uri_to_check):
                        property_msg = f" in property '{_current_property}'" if _current_property else ""
                        logger.warning(
                            f"Reference validation failed: ID '{data}' does not exist at {uri_to_check}{property_msg}"
                        )
                    return data  # Keep as string regardless

                # Use string heuristics to determine if this should be resolved
                if not self.string_heuristics.is_resolvable(data):
                    return data

                # Only resolve if it's in our allowed URIs
                if not self._should_resolve(uri):
                    return data

                # Check if recursion depth is too deep (prevent infinite loops)
                if len(visited) > self.config.max_depth:
                    if self.config.log_depth_warnings:
                        logger.warning(
                            f"Max depth ({self.config.max_depth}) exceeded. Visited {len(visited)} URIs. Current: {uri}"
                        )
                    return data

                # Ensure it has .json extension
                uri = self.uri_resolver.ensure_json_extension(uri)

                # Prevent circular references
                if uri in visited:
                    logger.debug(f"Circular reference detected: {uri}")
                    return data

                # Check if the file exists before trying to resolve
                # Don't resolve strings that are just enum values or simple identifiers
                # Only resolve if it looks like a real component/grid reference
                try:
                    # Convert remote URI to local path
                    local_uri = self.uri_resolver.to_local_path(uri)

                    # Check if file exists - if not, it's probably not a resolvable reference
                    if not self.uri_resolver.exists(uri):
                        property_msg = f" Property: '{_current_property}'\n" if _current_property else ""
                        logger.warning(
                            f"Cannot resolve ID reference: File not found\n"
                            f" Current term: {self.data.uri}\n"
                            f"{property_msg}"
                            f" String value: '{data}'\n"
                            f" Expected URI: {uri}\n"
                            f" Local path tried: {local_uri}\n"
                            f" → Keeping as unresolved string"
                        )
                        return data
                except (OSError, IOError) as e:
                    property_msg = f" Property: '{_current_property}'\n" if _current_property else ""
                    logger.warning(
                        f"Cannot resolve ID reference: Error checking file existence\n"
                        f" Current term: {self.data.uri}\n"
                        f"{property_msg}"
                        f" String value: '{data}'\n"
                        f" Expected URI: {uri}\n"
                        f" Error: {e}\n"
                        f" → Keeping as unresolved string"
                    )
                    return data

                # Add to visited for this branch only
                new_visited = visited.copy()
                new_visited.add(uri)

                try:
                    # Create a temporary resource for the nested term
                    temp_resource = JsonLdResource(uri=local_uri)

                    # Create a DataMerger for this nested term to get project+universe merge
                    nested_merger = DataMerger(
                        data=temp_resource,
                        allowed_base_uris=self.allowed_base_uris,
                        locally_available=self.locally_available,
                        config=self.config,
                    )

                    # Get the merged project+universe data
                    merge_results = nested_merger.merge_linked_json()
                    resolved = merge_results[-1]  # Final merged result

                    logger.info(
                        f"Successfully resolved ID reference\n"
                        f" Current term: {self.data.uri}\n"
                        f" String value: '{data}'\n"
                        f" Resolved to: {uri}\n"
                        f" Mode: {resolve_mode}\n"
                        f" → Replacing with {'shallow' if resolve_mode == 'shallow' else 'full'} object"
                    )

                    # Get proper expansion for the merged term
                    temp_expanded = temp_resource.expanded
                    if isinstance(temp_expanded, list) and len(temp_expanded) > 0:
                        temp_expanded = temp_expanded[0]

                    # Handle resolution based on mode
                    if resolve_mode == "shallow":
                        # "shallow" mode: return the merged object but DON'T resolve its nested IDs
                        return resolved
                    else:  # "full"
                        # "full" mode: recursively resolve any nested references in the merged data
                        return nested_merger.resolve_nested_ids(
                            resolved, temp_expanded, new_visited, _is_root_call=False, resolve_mode="full"
                        )

                except Exception as e:
                    property_msg = f" Property: '{_current_property}'\n" if _current_property else ""
                    logger.warning(
                        f"Cannot resolve ID reference: Exception during resolution\n"
                        f" Current term: {self.data.uri}\n"
                        f"{property_msg}"
                        f" String value: '{data}'\n"
                        f" Expected URI: {uri}\n"
                        f" Error: {e}\n"
                        f" → Keeping as unresolved string"
                    )
                    return data

            # Regular primitive values are returned as-is
            return data

    def resolve_merged_ids(self, merged_data: dict, context_base_path: str | None = None) -> dict:
        """
        Resolve nested IDs in merged data by re-expanding it with proper context.

        This is needed because merged data may contain fields from the parent term
        that aren't in the original term's context.

        The merged data is written to a temporary ``.json`` file inside
        ``<context_base_path>/<type>`` so that it can be loaded as a JsonLdResource
        from that directory; the file is removed before returning, including when
        writing or resolving fails. While resolving, ``self.data`` is temporarily
        replaced by the re-expanded resource and restored afterwards.

        Args:
            merged_data: The merged dictionary from merge_linked_json()
            context_base_path: Base path containing context directories. If None,
                attempts to infer from locally_available mappings.

        Returns:
            Dictionary with all nested IDs resolved to full objects
        """
        import json
        import tempfile
        from pathlib import Path

        # Determine the base path for context
        if context_base_path is None:
            # Try to infer from locally_available - use first available path
            # Preferring the universe path
            if "https://esgvoc.ipsl.fr/resource/universe" in self.locally_available:
                context_base_path = self.locally_available["https://esgvoc.ipsl.fr/resource/universe"]
            elif self.locally_available:
                # Use first available local path
                context_base_path = next(iter(self.locally_available.values()))
            else:
                # No local paths available, fallback to regular resolution
                return self.resolve_nested_ids(merged_data)

        # Find the data descriptor directory from merged_data type
        data_descriptor = merged_data.get("type", "")
        if not data_descriptor:
            return self.resolve_nested_ids(merged_data)

        context_dir = Path(context_base_path) / data_descriptor

        if not context_dir.exists():
            # Fallback if directory doesn't exist
            return self.resolve_nested_ids(merged_data)

        tmp_path = None
        try:
            # Create temp file in the universe data descriptor directory
            # This ensures JsonLdResource picks up the correct context
            with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False, dir=str(context_dir)) as tmp:
                # Remember the path before writing so a failed dump is still cleaned up
                tmp_path = tmp.name
                json.dump(merged_data, tmp)

            # Create new resource with proper context expansion
            merged_resource = JsonLdResource(uri=tmp_path)
            merged_expanded = merged_resource.expanded
            if isinstance(merged_expanded, list) and len(merged_expanded) > 0:
                merged_expanded = merged_expanded[0]

            # Temporarily update self.data to use merged resource's context
            # so that _get_resolve_mode() uses the correct esgvoc_resolve_modes
            original_data = self.data
            self.data = merged_resource

            try:
                # Resolve with correct expansion and context
                return self.resolve_nested_ids(merged_data, expanded_data=merged_expanded)
            finally:
                # Restore original data
                self.data = original_data
        finally:
            # delete=False means the file survives the `with`; remove it ourselves
            if tmp_path is not None:
                Path(tmp_path).unlink(missing_ok=True)
615
+
616
+
617
if __name__ == "__main__":
    import warnings

    # Silence every warning when the module is run directly for manual experiments.
    warnings.simplefilter(action="ignore")

    # Manual experiment 1: expand institution_id "ipsl" and merge it with institution "ipsl".
    # proj_ipsl = JsonLdResource(uri = "https://espri-mod.github.io/CMIP6Plus_CVs/institution_id/ipsl.json")
    # allowed_uris = {"https://espri-mod.github.io/CMIP6Plus_CVs/","https://espri-mod.github.io/mip-cmor-tables/"}
    # mdm = DataMerger(data =proj_ipsl, allowed_base_uris = allowed_uris)
    # json_list = mdm.merge_linked_json()
    #
    # pprint([res for res in json_list])

    # Manual experiment 2: same merge starting from a locally cached copy.
    # a = JsonLdResource(uri = ".cache/repos/CMIP6Plus_CVs/institution_id/ipsl.json")
    # mdm = DataMerger(data=a)
    # print(mdm.merge_linked_json())