esgvoc 2.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147) hide show
  1. esgvoc/__init__.py +3 -0
  2. esgvoc/api/__init__.py +91 -0
  3. esgvoc/api/data_descriptors/EMD_models/__init__.py +66 -0
  4. esgvoc/api/data_descriptors/EMD_models/arrangement.py +21 -0
  5. esgvoc/api/data_descriptors/EMD_models/calendar.py +5 -0
  6. esgvoc/api/data_descriptors/EMD_models/cell_variable_type.py +20 -0
  7. esgvoc/api/data_descriptors/EMD_models/component_type.py +5 -0
  8. esgvoc/api/data_descriptors/EMD_models/coordinate.py +52 -0
  9. esgvoc/api/data_descriptors/EMD_models/grid_mapping.py +19 -0
  10. esgvoc/api/data_descriptors/EMD_models/grid_region.py +19 -0
  11. esgvoc/api/data_descriptors/EMD_models/grid_type.py +19 -0
  12. esgvoc/api/data_descriptors/EMD_models/horizontal_computational_grid.py +56 -0
  13. esgvoc/api/data_descriptors/EMD_models/horizontal_grid_cells.py +230 -0
  14. esgvoc/api/data_descriptors/EMD_models/horizontal_subgrid.py +41 -0
  15. esgvoc/api/data_descriptors/EMD_models/horizontal_units.py +5 -0
  16. esgvoc/api/data_descriptors/EMD_models/model.py +139 -0
  17. esgvoc/api/data_descriptors/EMD_models/model_component.py +115 -0
  18. esgvoc/api/data_descriptors/EMD_models/reference.py +61 -0
  19. esgvoc/api/data_descriptors/EMD_models/resolution.py +48 -0
  20. esgvoc/api/data_descriptors/EMD_models/temporal_refinement.py +19 -0
  21. esgvoc/api/data_descriptors/EMD_models/truncation_method.py +17 -0
  22. esgvoc/api/data_descriptors/EMD_models/vertical_computational_grid.py +91 -0
  23. esgvoc/api/data_descriptors/EMD_models/vertical_coordinate.py +5 -0
  24. esgvoc/api/data_descriptors/EMD_models/vertical_units.py +19 -0
  25. esgvoc/api/data_descriptors/__init__.py +159 -0
  26. esgvoc/api/data_descriptors/activity.py +72 -0
  27. esgvoc/api/data_descriptors/archive.py +5 -0
  28. esgvoc/api/data_descriptors/area_label.py +30 -0
  29. esgvoc/api/data_descriptors/branded_suffix.py +30 -0
  30. esgvoc/api/data_descriptors/branded_variable.py +21 -0
  31. esgvoc/api/data_descriptors/citation_url.py +5 -0
  32. esgvoc/api/data_descriptors/contact.py +5 -0
  33. esgvoc/api/data_descriptors/conventions.py +28 -0
  34. esgvoc/api/data_descriptors/creation_date.py +18 -0
  35. esgvoc/api/data_descriptors/data_descriptor.py +127 -0
  36. esgvoc/api/data_descriptors/data_specs_version.py +25 -0
  37. esgvoc/api/data_descriptors/date.py +5 -0
  38. esgvoc/api/data_descriptors/directory_date.py +22 -0
  39. esgvoc/api/data_descriptors/drs_specs.py +38 -0
  40. esgvoc/api/data_descriptors/experiment.py +215 -0
  41. esgvoc/api/data_descriptors/forcing_index.py +21 -0
  42. esgvoc/api/data_descriptors/frequency.py +48 -0
  43. esgvoc/api/data_descriptors/further_info_url.py +5 -0
  44. esgvoc/api/data_descriptors/grid.py +43 -0
  45. esgvoc/api/data_descriptors/horizontal_label.py +20 -0
  46. esgvoc/api/data_descriptors/initialization_index.py +27 -0
  47. esgvoc/api/data_descriptors/institution.py +80 -0
  48. esgvoc/api/data_descriptors/known_branded_variable.py +75 -0
  49. esgvoc/api/data_descriptors/license.py +31 -0
  50. esgvoc/api/data_descriptors/member_id.py +9 -0
  51. esgvoc/api/data_descriptors/mip_era.py +26 -0
  52. esgvoc/api/data_descriptors/model_component.py +32 -0
  53. esgvoc/api/data_descriptors/models_test/models.py +17 -0
  54. esgvoc/api/data_descriptors/nominal_resolution.py +50 -0
  55. esgvoc/api/data_descriptors/obs_type.py +5 -0
  56. esgvoc/api/data_descriptors/organisation.py +22 -0
  57. esgvoc/api/data_descriptors/physics_index.py +21 -0
  58. esgvoc/api/data_descriptors/product.py +16 -0
  59. esgvoc/api/data_descriptors/publication_status.py +5 -0
  60. esgvoc/api/data_descriptors/realization_index.py +24 -0
  61. esgvoc/api/data_descriptors/realm.py +16 -0
  62. esgvoc/api/data_descriptors/regex.py +5 -0
  63. esgvoc/api/data_descriptors/region.py +35 -0
  64. esgvoc/api/data_descriptors/resolution.py +7 -0
  65. esgvoc/api/data_descriptors/source.py +120 -0
  66. esgvoc/api/data_descriptors/source_type.py +5 -0
  67. esgvoc/api/data_descriptors/sub_experiment.py +5 -0
  68. esgvoc/api/data_descriptors/table.py +28 -0
  69. esgvoc/api/data_descriptors/temporal_label.py +20 -0
  70. esgvoc/api/data_descriptors/time_range.py +17 -0
  71. esgvoc/api/data_descriptors/title.py +5 -0
  72. esgvoc/api/data_descriptors/tracking_id.py +67 -0
  73. esgvoc/api/data_descriptors/variable.py +56 -0
  74. esgvoc/api/data_descriptors/variant_label.py +25 -0
  75. esgvoc/api/data_descriptors/vertical_label.py +20 -0
  76. esgvoc/api/project_specs.py +143 -0
  77. esgvoc/api/projects.py +1253 -0
  78. esgvoc/api/py.typed +0 -0
  79. esgvoc/api/pydantic_handler.py +146 -0
  80. esgvoc/api/report.py +127 -0
  81. esgvoc/api/search.py +171 -0
  82. esgvoc/api/universe.py +434 -0
  83. esgvoc/apps/__init__.py +6 -0
  84. esgvoc/apps/cmor_tables/__init__.py +7 -0
  85. esgvoc/apps/cmor_tables/cvs_table.py +948 -0
  86. esgvoc/apps/drs/__init__.py +0 -0
  87. esgvoc/apps/drs/constants.py +2 -0
  88. esgvoc/apps/drs/generator.py +429 -0
  89. esgvoc/apps/drs/report.py +540 -0
  90. esgvoc/apps/drs/validator.py +312 -0
  91. esgvoc/apps/ga/__init__.py +104 -0
  92. esgvoc/apps/ga/example_usage.py +315 -0
  93. esgvoc/apps/ga/models/__init__.py +47 -0
  94. esgvoc/apps/ga/models/netcdf_header.py +306 -0
  95. esgvoc/apps/ga/models/validator.py +491 -0
  96. esgvoc/apps/ga/test_ga.py +161 -0
  97. esgvoc/apps/ga/validator.py +277 -0
  98. esgvoc/apps/jsg/json_schema_generator.py +341 -0
  99. esgvoc/apps/jsg/templates/template.jinja +241 -0
  100. esgvoc/apps/test_cv/README.md +214 -0
  101. esgvoc/apps/test_cv/__init__.py +0 -0
  102. esgvoc/apps/test_cv/cv_tester.py +1611 -0
  103. esgvoc/apps/test_cv/example_usage.py +216 -0
  104. esgvoc/apps/vr/__init__.py +12 -0
  105. esgvoc/apps/vr/build_variable_registry.py +71 -0
  106. esgvoc/apps/vr/example_usage.py +60 -0
  107. esgvoc/apps/vr/vr_app.py +333 -0
  108. esgvoc/cli/clean.py +304 -0
  109. esgvoc/cli/cmor.py +46 -0
  110. esgvoc/cli/config.py +1300 -0
  111. esgvoc/cli/drs.py +267 -0
  112. esgvoc/cli/find.py +138 -0
  113. esgvoc/cli/get.py +155 -0
  114. esgvoc/cli/install.py +41 -0
  115. esgvoc/cli/main.py +60 -0
  116. esgvoc/cli/offline.py +269 -0
  117. esgvoc/cli/status.py +79 -0
  118. esgvoc/cli/test_cv.py +258 -0
  119. esgvoc/cli/valid.py +147 -0
  120. esgvoc/core/constants.py +17 -0
  121. esgvoc/core/convert.py +0 -0
  122. esgvoc/core/data_handler.py +206 -0
  123. esgvoc/core/db/__init__.py +3 -0
  124. esgvoc/core/db/connection.py +40 -0
  125. esgvoc/core/db/models/mixins.py +25 -0
  126. esgvoc/core/db/models/project.py +102 -0
  127. esgvoc/core/db/models/universe.py +98 -0
  128. esgvoc/core/db/project_ingestion.py +231 -0
  129. esgvoc/core/db/universe_ingestion.py +172 -0
  130. esgvoc/core/exceptions.py +33 -0
  131. esgvoc/core/logging_handler.py +26 -0
  132. esgvoc/core/repo_fetcher.py +345 -0
  133. esgvoc/core/service/__init__.py +41 -0
  134. esgvoc/core/service/configuration/config_manager.py +196 -0
  135. esgvoc/core/service/configuration/setting.py +363 -0
  136. esgvoc/core/service/data_merger.py +634 -0
  137. esgvoc/core/service/esg_voc.py +77 -0
  138. esgvoc/core/service/resolver_config.py +56 -0
  139. esgvoc/core/service/state.py +324 -0
  140. esgvoc/core/service/string_heuristics.py +98 -0
  141. esgvoc/core/service/term_cache.py +108 -0
  142. esgvoc/core/service/uri_resolver.py +133 -0
  143. esgvoc-2.0.2.dist-info/METADATA +82 -0
  144. esgvoc-2.0.2.dist-info/RECORD +147 -0
  145. esgvoc-2.0.2.dist-info/WHEEL +4 -0
  146. esgvoc-2.0.2.dist-info/entry_points.txt +2 -0
  147. esgvoc-2.0.2.dist-info/licenses/LICENSE.txt +519 -0
@@ -0,0 +1,948 @@
1
+ """
2
+ Support for generating CMOR CVs tables
3
+
4
+ Note: this really shouldn't be in esgvoc.
5
+ It should be in CMOR, as CMOR knows the structure it needs,
6
+ not esgvoc. Anyway, can do that later.
7
+ """
8
+
9
+ import itertools
10
+ import re
11
+ from functools import partial
12
+ from typing import Any, TypeAlias
13
+
14
+ from pydantic import BaseModel, ConfigDict, HttpUrl
15
+
16
+ import esgvoc.api as ev_api
17
+
18
+ AllowedDict: TypeAlias = dict[str, Any]
19
+ """
20
+ Dictionary (key-value pairs). The keys define the allowed values for the given attribute.
21
+
22
+ The values can be anything,
23
+ they generally provide extra information about the meaning of the keys.
24
+ """
25
+
26
+ RegularExpressionValidators: TypeAlias = list[str]
27
+ """
28
+ List of values which are assumed to be regular expressions
29
+
30
+ Attribute values provided by teams are then validated
31
+ against these regular expressions.
32
+ """
33
+
34
+
35
class CMORDRSDefinition(BaseModel):
    """
    Data reference syntax (DRS) section of a CMOR CVs table

    Pairs a template with a worked example,
    once for directory paths and once for filename paths.
    """

    directory_path_example: str
    """
    A directory path that conforms to this DRS
    """

    directory_path_template: str
    """
    Template from which directory paths are generated
    """

    filename_path_example: str
    """
    A filename path that conforms to this DRS
    """

    filename_path_template: str
    """
    Template from which filename paths are generated
    """
59
+
60
+
61
class CMORExperimentDefinition(BaseModel):
    """
    Definition of a single experiment, in the form CMOR expects
    """

    activity_id: list[str]
    """
    Activity ID(s) this experiment belongs to
    """

    # NOTE: two further fields are currently disabled (both `RegularExpressionValidators`):
    # - `required_model_components`: model components required to run this experiment
    # - `additional_allowed_model_components`: extra model components
    #   that can be included when running this experiment

    description: str
    """
    Description of the experiment
    """

    experiment: str
    """
    Description of the experiment (duplicates `description`)
    """

    # TODO: check if we should switch to timestamps
    start_year: int | None
    """Year in which the experiment starts"""

    end_year: int | None
    """Year in which the experiment ends"""

    min_number_yrs_per_sim: int | None
    """Minimum number of simulated years required"""

    experiment_id: str
    """
    ID of the experiment
    """

    # NOTE: `host_collection` (host collection of this experiment)
    # is not a thing anymore, hence it has been removed from this model.

    parent_activity_id: list[str]
    """Activity ID of this experiment's parent"""

    parent_experiment_id: list[str]
    """Experiment ID of this experiment's parent"""

    tier: int
    """
    Priority tier of this experiment

    Smaller numbers mean higher priority, so tier 1 is the highest priority.
    """
124
+
125
+
126
class CMORFrequencyDefinition(BaseModel):
    """
    Definition of a single frequency, in the form CMOR expects
    """

    approx_interval: float
    """
    Approximate interval, expressed in days
    """

    description: str
    """
    Description of the frequency
    """
140
+
141
+
142
class CMORSpecificLicenseDefinition(BaseModel):
    """
    Definition of one specific license, in the form CMOR expects
    """

    license_type: str
    """
    The type of license
    """

    license_url: HttpUrl
    """
    URL at which the license is described
    """
156
+
157
+
158
class CMORLicenseDefinition(BaseModel):
    """
    License section of a CMOR CVs table
    """

    license_id: dict[str, CMORSpecificLicenseDefinition]
    """
    The supported licenses, keyed by license ID
    """

    # (rightfully) not in esgvoc
    license_template: str
    """
    Template from which license strings are written
    """
173
+
174
+
175
class CMORModelComponentDefintion(BaseModel):
    """
    Definition of a single model component, in the form CMOR expects
    """

    description: str
    """Description of the component"""

    native_nominal_resolution: str
    """The component's native nominal resolution"""
185
+
186
+
187
class CMORSourceDefinition(BaseModel):
    """
    Definition of a single source, in the form CMOR expects

    What 'source' means varies somewhat between projects;
    for CMIP phases it is the model that provided the simulation.
    """

    # NOTE: the following fields have been dropped from this model:
    # - `activity_participation` (`RegularExpressionValidators`,
    #   activities in which this source has participated):
    #   not thought to be used or relevant
    # - `cohort` (`RegularExpressionValidators`, cohort to which this source belongs):
    #   unclear what it means (TODO: clarify what this means)

    institution_id: RegularExpressionValidators
    """
    Institution ID(s) of this source
    """

    label: str
    """
    Label used for this source ID

    TODO: check, does this mean in graphs/plots?
    """

    label_extended: str
    """
    Extended label used for this source ID

    TODO: check, does this mean in graphs/plots?
    """

    model_component: dict[str, CMORModelComponentDefintion]
    """
    The model components that make up this source
    """

    # NOTE: `release_year` (`int | None`, release year of the model/source,
    # `None` if the release concept does not apply to this source)
    # has been dropped as not relevant.

    source: str
    """
    Information about the source

    Combines the source name with information about each model component.
    """

    source_id: str
    """
    The source ID of `self`
    """
252
+
253
+
254
def convert_none_value_to_empty_string(v: Any) -> Any:
    """
    Replace `None` with an empty string

    Any value other than `None` (including other falsy values such as `0` or `[]`)
    is returned unchanged.
    """
    if v is None:
        return ""

    return v
256
+
257
+
258
def remove_none_values_from_dict(inv: dict[str, Any]) -> dict[str, Any]:
    """
    Return a copy of `inv` in which every `None` value is replaced by `""`

    The replacement is applied recursively through nested dictionaries and lists,
    including dictionaries and lists that are themselves elements of a list.
    Values of any other type are passed through unchanged.
    The input is not modified.

    Parameters
    ----------
    inv
        Dictionary to clean

    Returns
    -------
    :
        New dictionary with the same keys and no `None` values
    """

    def _clean(value: Any) -> Any:
        # Replace None at any nesting depth; rebuild containers rather than mutate them.
        if value is None:
            return ""

        if isinstance(value, dict):
            return {k: _clean(v) for k, v in value.items()}

        if isinstance(value, list):
            return [_clean(v) for v in value]

        return value

    return {k: _clean(v) for k, v in inv.items()}
271
+
272
+
273
class CMORCVsTable(BaseModel):
    """
    Representation of the JSON table required by CMOR for CVs

    CMOR also takes in variable tables,
    as well as a user input table.
    This model doesn't consider those tables
    or their interactions with this table at the moment.

    Unknown fields are rejected (`extra="forbid"`).
    """

    model_config = ConfigDict(extra="forbid")

    DRS: CMORDRSDefinition
    """
    CMOR definition of the data reference syntax
    """

    # NOTE: `archive_id` (`AllowedDict`, allowed values of `archive_id`)
    # is not a required global attribute, hence it has been dropped.

    activity_id: AllowedDict
    """
    Allowed values of `activity_id`
    """

    area_label: AllowedDict
    """
    Allowed values of `area_label`
    """

    branding_suffix: str
    """
    Template for branding suffix
    """

    creation_date: RegularExpressionValidators
    """
    Allowed patterns for `creation_date`
    """

    data_specs_version: str
    """
    Allowed value of `data_specs_version`
    """

    drs_specs: AllowedDict
    """
    Allowed values of `drs_specs`
    """

    experiment_id: dict[str, CMORExperimentDefinition]
    """
    CMOR-style experiment definitions
    """

    forcing_index: RegularExpressionValidators
    """
    Allowed patterns for `forcing_index`
    """

    frequency: AllowedDict
    """
    Allowed values of `frequency`
    """

    grid_label: AllowedDict
    """
    Allowed values of `grid_label`
    """

    horizontal_label: AllowedDict
    """
    Allowed values of `horizontal_label`
    """

    initialization_index: RegularExpressionValidators
    """
    Allowed patterns for `initialization_index`
    """

    institution_id: AllowedDict
    """
    Allowed values of `institution_id`
    """

    license: CMORLicenseDefinition
    """
    CMOR-style license definition
    """

    mip_era: str
    """
    Allowed value of `mip_era`
    """

    nominal_resolution: RegularExpressionValidators
    """
    Allowed values of `nominal_resolution`
    """

    physics_index: RegularExpressionValidators
    """
    Allowed patterns for `physics_index`
    """

    product: AllowedDict
    """
    Allowed values of `product`
    """

    realization_index: RegularExpressionValidators
    """
    Allowed patterns for `realization_index`
    """

    realm: AllowedDict
    """
    Allowed values of `realm`
    """

    region: AllowedDict
    """
    Allowed values of `region`
    """

    required_global_attributes: list[str]
    """
    Required global attributes
    """

    source_id: dict[str, CMORSourceDefinition]
    """
    CMOR-style source definitions
    """

    temporal_label: AllowedDict
    """
    Allowed values of `temporal_label`
    """

    tracking_id: RegularExpressionValidators
    """
    Allowed patterns for `tracking_id`
    """

    variant_label: RegularExpressionValidators
    """
    Allowed patterns for `variant_label`
    """

    vertical_label: AllowedDict
    """
    Allowed values of `vertical_label`
    """

    def to_cvs_json(self, top_level_key: str = "CV") -> dict[str, dict[str, Any]]:
        """
        Convert to the JSON-compatible dictionary used for the CVs table

        Parameters
        ----------
        top_level_key
            Key under which the whole table is nested

        Returns
        -------
        :
            Dictionary with the single key `top_level_key`.
            Its value is the JSON-mode dump of `self`,
            passed through `remove_none_values_from_dict`.
        """
        md = self.model_dump(mode="json")

        # NOTE: renaming of keys (e.g. hyphenising the keys of the DRS section)
        # is intentionally not done here.
        # It was unclear why it would apply to some keys and not others,
        # which makes reasoning hard.

        md_no_none = remove_none_values_from_dict(md)

        return {top_level_key: md_no_none}
450
+
451
+
452
def get_project_attribute_property(
    attribute_value: str, attribute_to_match: str, ev_project: ev_api.project_specs.ProjectSpecs
) -> ev_api.project_specs.AttributeProperty:
    """
    Find the first entry of `ev_project.attr_specs` with a matching attribute

    Parameters
    ----------
    attribute_value
        Value the attribute must be equal to

    attribute_to_match
        Name of the attribute to compare on each entry (e.g. `"field_name"`)

    ev_project
        Project specifications to search

    Returns
    -------
    :
        First entry for which `attribute_to_match` equals `attribute_value`

    Raises
    ------
    KeyError
        No entry matches (including when `attr_specs` is empty)
    """
    # Dedicated sentinel so that "no match" cannot be confused with any real entry
    not_found = object()
    matches = (spec for spec in ev_project.attr_specs if getattr(spec, attribute_to_match) == attribute_value)

    found = next(matches, not_found)
    if found is not_found:
        msg = f"Nothing in attr_specs had {attribute_to_match} equal to {attribute_value}"
        raise KeyError(msg)

    return found
464
+
465
+
466
def get_allowed_dict_for_attribute(attribute_name: str, ev_project: ev_api.project_specs.ProjectSpecs) -> AllowedDict:
    """
    Get the allowed values of an attribute, mapped to their descriptions

    The attribute is looked up by its `field_name` in the project's attribute specs,
    then every term in the attribute's source collection contributes
    one `drs_name -> description` entry.
    """
    spec = get_project_attribute_property(attribute_name, "field_name", ev_project)
    terms = ev_api.get_all_terms_in_collection(ev_project.project_id, spec.source_collection)

    allowed: AllowedDict = {}
    for term in terms:
        key = term.drs_name
        allowed[key] = term.description

    return allowed
480
+
481
+
482
def convert_python_regex_to_cmor_regex(inv: str) -> list[str]:
    """
    Convert a Python regular expression into the form(s) written to the CMOR CVs table

    Alternation is not carried over: every bracketed alternation group, e.g. `(a|b)`,
    is expanded, so one input pattern can result in several output patterns
    (one per combination of alternatives).
    After that, named capturing groups are unwrapped and a handful of
    Python constructs are rewritten (`{`, `}`, `(`, `)` are backslash-escaped,
    `\\d` becomes `[[:digit:]]`, `+` becomes `\\{1,\\}` and `?` becomes `\\{0,1\\}`).

    Not ideal that we have to do this ourselves, but there is no obvious alternative:
    it doesn't make sense to use POSIX regex in the CV JSON
    because then esgvoc's Python API won't work.

    Parameters
    ----------
    inv
        Python regular expression

    Returns
    -------
    :
        Converted regular expressions, one per combination of alternatives

    Raises
    ------
    AssertionError
        `inv` contains `|` but no bracketed alternation group could be found
    """
    if "|" in inv:
        or_sections = re.findall(r"\([^|(]*\|[^)]*\)", inv)
        if not or_sections:
            raise AssertionError(inv)

        # For each alternation group, the list of (group text, single alternative) pairs
        substitution_components = [
            [(or_section, alternative.strip("()")) for alternative in or_section.split("|")]
            for or_section in or_sections
        ]

        to_substitute = []
        for substitution_set in itertools.product(*substitution_components):
            filled = inv
            for old, new in substitution_set:
                # Replace one occurrence at a time.
                # The groups are processed left to right, so if the same group
                # appears more than once, each occurrence gets its own alternative
                # rather than all occurrences being overwritten by the first one.
                filled = filled.replace(old, new, 1)

            to_substitute.append(filled)

    else:
        to_substitute = [inv]

    res = []
    for start in to_substitute:
        # Get rid of Python style capturing groups.
        # Super brittle, might break if there are brackets inside the caught expression.
        # We'll have to fix as we find problems, regex is annoyingly complicated.
        tmp = re.sub(r"\(\?P\<[^>]*\>([^)]*)\)", r"\1", start)

        # Other things we seem to have to change.
        # The brace/bracket escaping has to come first so that the interval
        # expressions inserted for `+` and `?` below are not escaped a second time.
        tmp = tmp.replace("{", r"\{")
        tmp = tmp.replace("}", r"\}")
        tmp = tmp.replace("(", r"\(")
        tmp = tmp.replace(")", r"\)")
        tmp = tmp.replace(r"\d", "[[:digit:]]")
        tmp = tmp.replace("+", r"\{1,\}")
        # `?` means zero or one, hence the upper bound of 1
        tmp = tmp.replace("?", r"\{0,1\}")

        res.append(tmp)

    return res
531
+
532
+
533
def get_regular_expression_validator_for_attribute(
    attribute_property: ev_api.project_specs.AttributeProperty,
    ev_project: ev_api.project_specs.ProjectSpecs,
) -> RegularExpressionValidators:
    """
    Get the CMOR-style regular expressions that validate an attribute

    The `regex` of every term in the attribute's source collection is converted with
    `convert_python_regex_to_cmor_regex` and the results are concatenated,
    in term order, into a single flat list.
    """
    terms = ev_api.get_all_terms_in_collection(ev_project.project_id, attribute_property.source_collection)
    converted_per_term = (convert_python_regex_to_cmor_regex(term.regex) for term in terms)

    return list(itertools.chain.from_iterable(converted_per_term))
545
+
546
+
547
def get_template_for_composite_attribute(attribute_name: str, ev_project: ev_api.project_specs.ProjectSpecs) -> str:
    """
    Build the CMOR template string for an attribute composed of other attributes

    The attribute's source collection has to hold a single term.
    Each part of that term is mapped to the field name of the attribute
    whose source collection equals the part's `type`,
    and the result is the concatenation of `<field_name>` placeholders.
    The separator itself is not written into the template.

    Raises
    ------
    AssertionError
        The source collection holds more than one term

    KeyError
        A part's `type` does not match the source collection of any attribute

    NotImplementedError
        The term's separator is anything other than `"-"`
    """
    spec = get_project_attribute_property(attribute_name, "field_name", ev_project)
    candidates = ev_api.get_all_terms_in_collection(ev_project.project_id, spec.source_collection)
    if len(candidates) > 1:
        raise AssertionError(candidates)

    # The name `term` is load-bearing: it is echoed by the `=` specifiers in the message below
    term = candidates[0]

    placeholders = [
        f"<{get_project_attribute_property(part.type, 'source_collection', ev_project).field_name}>"
        for part in term.parts
    ]

    if term.separator != "-":
        msg = f"CMOR only supports '-' as a separator, received {term.separator=} for {term=}"
        raise NotImplementedError(msg)

    return "".join(placeholders)
571
+
572
+
573
def get_single_allowed_value_for_attribute(attribute_name: str, ev_project: ev_api.project_specs.ProjectSpecs) -> str:
    """
    Get the one allowed value of an attribute that only permits a single value

    The attribute is looked up by its `field_name`;
    the `drs_name` of the first term in its source collection is returned.

    Raises
    ------
    AssertionError
        The source collection holds more than one term
    """
    spec = get_project_attribute_property(attribute_name, "field_name", ev_project)
    candidates = ev_api.get_all_terms_in_collection(ev_project.project_id, spec.source_collection)
    if len(candidates) > 1:
        raise AssertionError(candidates)

    return candidates[0].drs_name
588
+
589
+
590
def get_cmor_license_definition(
    source_collection: str, ev_project: ev_api.project_specs.ProjectSpecs
) -> CMORLicenseDefinition:
    """
    Build the license section of the CMOR CVs table

    Every term of `source_collection` becomes one supported license,
    keyed by its `drs_name`, with the term's `description` as license type
    and the term's `url` as license URL.
    The license template is hard-coded here.
    """
    license_template = (
        "<license_id>; CMIP7 data produced by <institution_id> "
        "is licensed under a <license_type> License (<license_url>). "
        "Consult [TODO terms of use link] for terms of use governing CMIP7 output, "
        "including citation requirements and proper acknowledgment. "
        "The data producers and data providers make no warranty, "
        "either express or implied, including, but not limited to, "
        "warranties of merchantability and fitness for a particular purpose. "
        "All liabilities arising from the supply of the information "
        "(including any liability arising in negligence) "
        "are excluded to the fullest extent permitted by law."
    )

    supported_licenses = {}
    for term in ev_api.get_all_terms_in_collection(ev_project.project_id, source_collection):
        key = term.drs_name
        supported_licenses[key] = CMORSpecificLicenseDefinition(
            license_type=term.description,
            license_url=term.url,
        )

    return CMORLicenseDefinition(license_id=supported_licenses, license_template=license_template)
620
+
621
+
622
def get_approx_interval(interval: float, units: str) -> float:
    """
    Convert an interval to an approximate number of days

    Parameters
    ----------
    interval
        Size of the interval, in `units`

    units
        Units of `interval`.
        `"month"` is a special case that is always treated as 30 days.
        Everything else is converted with `pint`.

    Returns
    -------
    :
        Interval in days

    Raises
    ------
    ImportError
        `units` is not `"month"` and the optional dependency `pint` is not installed
    """
    if units == "month":
        # Special case, month is 30 days.
        # Handled before importing pint so that this path
        # does not need the optional dependency.
        return interval * 30.0

    try:
        import pint

        ur = pint.get_application_registry()
    except ImportError as exc:
        msg = "Missing optional dependency `pint`, please install"
        raise ImportError(msg) from exc

    return ur.Quantity(interval, units).to("day").m
638
+
639
+
640
def get_cmor_experiment_id_definitions(
    source_collection: str, ev_project: ev_api.project_specs.ProjectSpecs
) -> dict[str, CMORExperimentDefinition]:
    """
    Build the CMOR experiment definitions for every term of `source_collection`

    The result is keyed by each term's `drs_name`.
    The term's activity is resolved via `get_term_in_project`,
    whereas the parent activity and parent experiment are read directly from the term
    (an empty list is used when the term has no parent).
    Start and end years are taken from the term's timestamps when those are set.
    """
    project_id = ev_project.project_id
    experiments = ev_api.get_all_terms_in_collection(project_id, source_collection)

    definitions = {}
    for experiment in experiments:
        activity = ev_api.get_term_in_project(project_id, experiment.activity)

        start = experiment.start_timestamp
        end = experiment.end_timestamp
        parent_activity = experiment.parent_activity
        parent_experiment = experiment.parent_experiment

        definition = CMORExperimentDefinition(
            activity_id=[activity.drs_name],
            description=experiment.description,
            experiment=experiment.description,
            # Falsy timestamps (e.g. `None`) are passed through as they are
            start_year=start.year if start else start,
            end_year=end.year if end else end,
            min_number_yrs_per_sim=experiment.min_number_yrs_per_sim,
            experiment_id=experiment.drs_name,
            parent_activity_id=[parent_activity.drs_name] if parent_activity else [],
            parent_experiment_id=[parent_experiment.drs_name] if parent_experiment else [],
            tier=experiment.tier,
        )
        definitions[experiment.drs_name] = definition

    return definitions
664
+
665
+
666
def get_cmor_nominal_resolution_defintions(
    source_collection: str, ev_project: ev_api.project_specs.ProjectSpecs
) -> list[str]:
    """
    Build the allowed nominal resolution strings for the CMOR CVs table

    The magnitude and units of every term of `source_collection` are converted to km.
    Whole numbers are written without decimals (e.g. `"100 km"`),
    everything else with one decimal place (e.g. `"2.5 km"`).
    The returned list is sorted as strings, i.e. not numerically.

    Raises
    ------
    ImportError
        The optional dependency `pint` is not installed
    """
    try:
        import pint

        ur = pint.get_application_registry()
    except ImportError as exc:
        msg = "Missing optional dependency `pint`, please install"
        raise ImportError(msg) from exc

    labels = []
    for term in ev_api.get_all_terms_in_collection(ev_project.project_id, source_collection):
        size_km = ur.Quantity(term.magnitude, term.units).to("km").m
        is_whole_number = int(size_km) == size_km
        template = "{:.0f} km" if is_whole_number else "{:.1f} km"
        labels.append(template.format(size_km))

    labels.sort()

    return labels
689
+
690
+
691
def get_cmor_source_id_definitions(
    source_collection: str, ev_project: ev_api.project_specs.ProjectSpecs
) -> dict[str, CMORSourceDefinition]:
    """
    Build the CMOR source definitions for every term of `source_collection`

    The result is keyed by each term's `drs_name`.
    Institution IDs are the `drs_name` of each of the term's contributors,
    resolved via `get_term_in_project`.

    Raises
    ------
    NotImplementedError
        A term has at least one model component (handling these is not implemented yet)
    """
    project_id = ev_project.project_id
    sources = ev_api.get_all_terms_in_collection(project_id, source_collection)

    definitions = {}
    for src in sources:
        components = {}
        for component in src.model_components:
            raise NotImplementedError(component)

        # First line is the source name, followed by one line per model component
        source_lines = [f"{src.drs_name}:"]
        source_lines.extend(f"{name}: {comp.description}" for name, comp in components.items())

        definitions[src.drs_name] = CMORSourceDefinition(
            institution_id=[
                ev_api.get_term_in_project(project_id, contributor).drs_name for contributor in src.contributors
            ],
            label=src.label,
            label_extended=src.label_extended,
            model_component=components,
            source="\n".join(source_lines),
            source_id=src.drs_name,
        )

    return definitions
714
+
715
+
716
def get_cmor_frequency_definitions(
    source_collection: str, ev_project: ev_api.project_specs.ProjectSpecs
) -> dict[str, CMORFrequencyDefinition | str]:
    """
    Build the frequency section of the CMOR CVs table

    Parameters
    ----------
    source_collection
        Collection that holds the frequency terms

    ev_project
        Project specifications

    Returns
    -------
    :
        Mapping from each term's `drs_name` to its definition.
        Terms with a truthy `interval` map to a `CMORFrequencyDefinition`
        (with the interval converted to days by `get_approx_interval`).
        Terms without one map to the plain string `"fixed (time invariant) field"`,
        which is why the values are not all of the same type.
    """
    terms = ev_api.get_all_terms_in_collection(ev_project.project_id, source_collection)

    res: dict[str, CMORFrequencyDefinition | str] = {}
    for term in terms:
        key = term.drs_name
        if term.interval:
            res[key] = CMORFrequencyDefinition(
                description=term.description,
                approx_interval=get_approx_interval(term.interval, units=term.units),
            )
        else:
            # I'm still not convinced that it wouldn't be simpler to use the same schema for all types
            res[key] = "fixed (time invariant) field"

    return res
733
+
734
+
735
+ def get_cmor_drs_definition(ev_project: ev_api.project_specs.ProjectSpecs) -> CMORDRSDefinition:
736
+ # Creating a valid example is quite hard because of the coupling between elements.
737
+ # Try and anticipate those here.
738
+ # Note that a perfect way to do this is beyond me right now.
739
+ # grid region
740
+ activity_example = ev_api.get_term_in_collection(ev_project.project_id, "activity", "cmip")
741
+ experiment_example = ev_api.get_term_in_collection(
742
+ ev_project.project_id, "experiment", activity_example.experiments[0]
743
+ )
744
+
745
+ institution_example = ev_api.get_all_terms_in_collection(ev_project.project_id, "organisation")[0]
746
+ sources = ev_api.get_all_terms_in_collection(ev_project.project_id, "source")
747
+ for source in sources:
748
+ if institution_example.id in source.contributors:
749
+ source_example = source
750
+ break
751
+ else:
752
+ msg = f"No example source found for {institution_example.id}"
753
+ raise AssertionError(msg)
754
+
755
+ grid_example = ev_api.get_all_terms_in_collection(ev_project.project_id, "grid")[0]
756
+ region_example = ev_api.get_term_in_collection(ev_project.project_id, "region", grid_example.region)
757
+
758
+ frequency_example = "mon"
759
+ time_range_example = "185001-202112"
760
+
761
+ # Creating example regexp terms on the fly also doesn't work
762
+ variant_label_example = "r1i1p1f1"
763
+ branded_suffix_example = "tavg-h2m-hxy-u"
764
+
765
+ directory_path_template_l = []
766
+ directory_path_example_l = []
767
+ for part in ev_project.drs_specs["directory"].parts:
768
+ if not part.is_required:
769
+ raise NotImplementedError
770
+
771
+ if part.source_collection == "directory_date":
772
+ # Maybe should be using catalogue specs rather than attr specs?
773
+ # Hard-coded CMOR weirdness
774
+ directory_path_template_l.append("<version>")
775
+ directory_path_example_l.append("20251104")
776
+
777
+ continue
778
+
779
+ project_attribute_property = get_project_attribute_property(
780
+ attribute_value=part.source_collection, attribute_to_match="source_collection", ev_project=ev_project
781
+ )
782
+ directory_path_template_l.append(f"<{project_attribute_property.field_name}>")
783
+
784
+ if part.source_collection == "activity":
785
+ directory_path_example_l.append(activity_example.drs_name)
786
+ elif part.source_collection == "experiment":
787
+ directory_path_example_l.append(experiment_example.drs_name)
788
+ elif part.source_collection == "frequency":
789
+ directory_path_example_l.append(frequency_example)
790
+ elif part.source_collection == "institution":
791
+ directory_path_example_l.append(institution_example.drs_name)
792
+ elif part.source_collection == "source":
793
+ directory_path_example_l.append(source_example.drs_name)
794
+ elif part.source_collection == "grid":
795
+ directory_path_example_l.append(grid_example.drs_name)
796
+ elif part.source_collection == "region":
797
+ directory_path_example_l.append(region_example.drs_name)
798
+ elif part.source_collection == "variant_label":
799
+ # Urgh
800
+ directory_path_example_l.append(variant_label_example)
801
+ elif part.source_collection == "branded_suffix":
802
+ # Urgh
803
+ directory_path_example_l.append(branded_suffix_example)
804
+ else:
805
+ example_drs_name = ev_api.get_all_terms_in_collection(ev_project.project_id, part.source_collection)[
806
+ 0
807
+ ].drs_name
808
+ directory_path_example_l.append(example_drs_name)
809
+
810
+ directory_path_template = ev_project.drs_specs["directory"].separator.join(directory_path_template_l)
811
+ directory_path_example = ev_project.drs_specs["directory"].separator.join(directory_path_example_l)
812
+
813
+ filename_path_template_l = []
814
+ filename_path_example_l = []
815
+ for i, part in enumerate(ev_project.drs_specs["file_name"].parts):
816
+ if i > 0:
817
+ prefix = ev_project.drs_specs["file_name"].separator
818
+ else:
819
+ prefix = ""
820
+
821
+ if part.source_collection == "time_range":
822
+ # Maybe should be using catalogue specs rather than attr specs?
823
+ # Hard-coded CMOR weirdness
824
+ cmor_placeholder = "timeRange"
825
+ example_value = time_range_example
826
+
827
+ else:
828
+ project_attribute_property = get_project_attribute_property(
829
+ attribute_value=part.source_collection, attribute_to_match="source_collection", ev_project=ev_project
830
+ )
831
+ cmor_placeholder = project_attribute_property.field_name
832
+
833
+ if part.source_collection == "experiment":
834
+ example_value = experiment_example.drs_name
835
+ elif part.source_collection == "frequency":
836
+ example_value = frequency_example
837
+ elif part.source_collection == "source":
838
+ example_value = source_example.drs_name
839
+ elif part.source_collection == "grid":
840
+ example_value = grid_example.drs_name
841
+ elif part.source_collection == "region":
842
+ example_value = region_example.drs_name
843
+ elif part.source_collection == "variant_label":
844
+ # Urgh
845
+ example_value = variant_label_example
846
+ elif part.source_collection == "branded_suffix":
847
+ # Urgh
848
+ example_value = branded_suffix_example
849
+ else:
850
+ example_value = ev_api.get_all_terms_in_collection(ev_project.project_id, part.source_collection)[
851
+ 0
852
+ ].drs_name
853
+
854
+ if part.is_required:
855
+ filename_path_template_l.append(f"{prefix}<{cmor_placeholder}>")
856
+ else:
857
+ filename_path_template_l.append(f"[{prefix}<{cmor_placeholder}>]")
858
+
859
+ filename_path_example_l.append(f"{prefix}{example_value}")
860
+
861
+ filename_path_template_excl_ext = "".join(filename_path_template_l)
862
+ filename_path_template = f"{filename_path_template_excl_ext}.nc"
863
+ filename_path_example_excl_ext = "".join(filename_path_example_l)
864
+ filename_path_example = f"{filename_path_example_excl_ext}.nc"
865
+
866
+ res = CMORDRSDefinition(
867
+ directory_path_example=directory_path_example,
868
+ directory_path_template=directory_path_template,
869
+ filename_path_example=filename_path_example,
870
+ filename_path_template=filename_path_template,
871
+ )
872
+
873
+ return res
874
+
875
+
876
def _get_cvs_value_from_descriptor_class(attr_property, ev_project):
    """
    Build the CVs table value for an attribute that has no dedicated handling.

    The pydantic class registered for the attribute's source collection is
    looked up and the value is produced according to which data descriptor
    base class it derives from (plain, pattern or composite term).

    Raises
    ------
    NotImplementedError
        If the pydantic class derives from none of the three supported
        data descriptor base classes.
    """
    term_class = ev_api.pydantic_handler.get_pydantic_class(attr_property.source_collection)
    descriptors = ev_api.data_descriptors.data_descriptor

    # The checks are tried in this fixed order: plain, pattern, composite.
    if issubclass(term_class, descriptors.PlainTermDataDescriptor):
        return get_allowed_dict_for_attribute(attr_property.field_name, ev_project)

    if issubclass(term_class, descriptors.PatternTermDataDescriptor):
        return get_regular_expression_validator_for_attribute(attr_property, ev_project)

    if issubclass(term_class, descriptors.CompositeTermDataDescriptor):
        return get_template_for_composite_attribute(attr_property.field_name, ev_project)

    raise NotImplementedError(term_class)


def generate_cvs_table(project: str) -> CMORCVsTable:
    """
    Generate the CMOR CVs table for a project.

    The project is fetched through ``ev_api.projects.get_project`` and each
    entry of its ``attr_specs`` is turned into one keyword argument of
    :class:`CMORCVsTable`. The field names of required attributes are also
    collected under ``required_global_attributes``. Finally, the DRS
    definition is added under the ``DRS`` key.

    Parameters
    ----------
    project
        Identifier of the project, passed as-is to
        ``ev_api.projects.get_project``.

    Returns
    -------
    :
        The table initialised from the collected keyword arguments.

    Raises
    ------
    NotImplementedError
        If an attribute without dedicated handling uses a pydantic class
        that is not a plain, pattern or composite term data descriptor.
    """
    ev_project = ev_api.projects.get_project(project)

    table_kwargs = {"required_global_attributes": []}
    for attr_spec in ev_project.attr_specs:
        # Required attributes are recorded even when they are skipped below.
        if attr_spec.is_required:
            table_kwargs["required_global_attributes"].append(attr_spec.field_name)

        field_name = attr_spec.field_name

        # Logic: https://github.com/WCRP-CMIP/CMIP7-CVs/issues/271#issuecomment-3286291815
        if field_name in ("Conventions", "branded_variable", "variable_id"):
            # Not handled in CMOR tables
            continue

        # Unless overridden below, the table key is the attribute's field name.
        table_key = field_name

        if field_name in ("data_specs_version", "mip_era"):
            # Special single value entries
            table_value = get_single_allowed_value_for_attribute(field_name, ev_project)

        elif field_name == "license_id":
            # Stored under a different key than the attribute's field name
            table_key = "license"
            table_value = get_cmor_license_definition(attr_spec.source_collection, ev_project)

        elif field_name == "frequency":
            table_value = get_cmor_frequency_definitions(attr_spec.source_collection, ev_project)

        elif field_name == "experiment_id":
            table_value = get_cmor_experiment_id_definitions(attr_spec.source_collection, ev_project)

        elif field_name == "nominal_resolution":
            # NOTE(review): unlike the sibling branches, this passes the field
            # name rather than the source collection - confirm against the helper.
            table_value = get_cmor_nominal_resolution_defintions(field_name, ev_project)

        elif field_name == "source_id":
            table_value = get_cmor_source_id_definitions(attr_spec.source_collection, ev_project)

        elif field_name in ("activity_id",):
            # Hard-code for now
            # TODO: figure out how to unpack typing.Annotated
            table_value = get_allowed_dict_for_attribute(field_name, ev_project)

        else:
            table_value = _get_cvs_value_from_descriptor_class(attr_spec, ev_project)

        table_kwargs[table_key] = table_value

    table_kwargs["DRS"] = get_cmor_drs_definition(ev_project)

    return CMORCVsTable(**table_kwargs)