deep-code 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deep_code/__init__.py +24 -0
- deep_code/cli/__init__.py +3 -0
- deep_code/cli/main.py +21 -0
- deep_code/cli/publish.py +21 -0
- deep_code/constants.py +31 -0
- deep_code/tests/tools/__init__.py +3 -0
- deep_code/tests/tools/test_publish.py +108 -0
- deep_code/tests/utils/__init__.py +3 -0
- deep_code/tests/utils/test_dataset_stac_generator.py +277 -0
- deep_code/tests/utils/test_github_automation.py +171 -0
- deep_code/tests/utils/test_ogc_api_record.py +243 -0
- deep_code/tests/utils/test_ogc_record_generator.py +60 -0
- deep_code/tests/utils/test_osc_extension.py +117 -0
- deep_code/tools/__init__.py +3 -0
- deep_code/tools/check.py +4 -0
- deep_code/tools/new.py +5 -0
- deep_code/tools/publish.py +419 -0
- deep_code/tools/register.py +0 -0
- deep_code/tools/setup_ci.py +1 -0
- deep_code/tools/test.py +2 -0
- deep_code/utils/__init__.py +3 -0
- deep_code/utils/dataset_stac_generator.py +547 -0
- deep_code/utils/github_automation.py +145 -0
- deep_code/utils/helper.py +14 -0
- deep_code/utils/ogc_api_record.py +268 -0
- deep_code/utils/ogc_record_generator.py +69 -0
- deep_code/utils/osc_extension.py +189 -0
- deep_code/version.py +22 -0
- deep_code-0.0.1.dist-info/LICENSE +21 -0
- deep_code-0.0.1.dist-info/METADATA +154 -0
- deep_code-0.0.1.dist-info/RECORD +34 -0
- deep_code-0.0.1.dist-info/WHEEL +5 -0
- deep_code-0.0.1.dist-info/entry_points.txt +2 -0
- deep_code-0.0.1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,547 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# Copyright (c) 2025 by Brockmann Consult GmbH
|
|
3
|
+
# Permissions are hereby granted under the terms of the MIT License:
|
|
4
|
+
# https://opensource.org/licenses/MIT.
|
|
5
|
+
|
|
6
|
+
import logging
|
|
7
|
+
import os
|
|
8
|
+
from datetime import datetime, timezone
|
|
9
|
+
|
|
10
|
+
import pandas as pd
|
|
11
|
+
from pystac import Catalog, Collection, Extent, Link, SpatialExtent, TemporalExtent
|
|
12
|
+
from xcube.core.store import new_data_store
|
|
13
|
+
|
|
14
|
+
from deep_code.constants import (
|
|
15
|
+
DEEPESDL_COLLECTION_SELF_HREF,
|
|
16
|
+
OSC_THEME_SCHEME,
|
|
17
|
+
PRODUCT_BASE_CATALOG_SELF_HREF,
|
|
18
|
+
VARIABLE_BASE_CATALOG_SELF_HREF,
|
|
19
|
+
)
|
|
20
|
+
from deep_code.utils.ogc_api_record import Theme, ThemeConcept
|
|
21
|
+
from deep_code.utils.osc_extension import OscExtension
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class OscDatasetStacGenerator:
    """Generates OSC STAC Collections for a product from Zarr datasets.

    Args:
        dataset_id: ID of the Zarr dataset.
        collection_id: Unique identifier for the STAC collection.
        access_link: Public access link to the dataset. Defaults to the
            public DeepESDL S3 bucket path for ``dataset_id``.
        documentation_link: Link to dataset documentation.
        osc_status: Status of the dataset (e.g., "ongoing").
        osc_region: Geographical region associated with the dataset.
        osc_themes: List of themes related to the dataset (e.g., ["climate"]).
        osc_missions: List of satellite missions associated with the dataset.
        cf_params: CF metadata parameters for the dataset (list of dicts).
    """

    def __init__(
        self,
        dataset_id: str,
        collection_id: str,
        access_link: str | None = None,
        documentation_link: str | None = None,
        osc_status: str = "ongoing",
        osc_region: str = "Global",
        osc_themes: list[str] | None = None,
        osc_missions: list[str] | None = None,
        # BUG FIX: annotation was list[dict[str]], which is an invalid
        # generic (dict takes two type parameters or none).
        cf_params: list[dict] | None = None,
    ):
        self.dataset_id = dataset_id
        self.collection_id = collection_id
        self.access_link = access_link or f"s3://deep-esdl-public/{dataset_id}"
        self.documentation_link = documentation_link
        self.osc_status = osc_status
        self.osc_region = osc_region
        self.osc_themes = osc_themes or []
        self.osc_missions = osc_missions or []
        # BUG FIX: the fallback used to be {} although cf_params is a list;
        # an empty list keeps the attribute's type consistent (both are
        # falsy, so downstream `if self.cf_params:` behavior is unchanged).
        self.cf_params = cf_params or []
        self.logger = logging.getLogger(__name__)
        # Opening the dataset hits S3 — see _open_dataset for the fallback
        # between public and authenticated stores.
        self.dataset = self._open_dataset()
        self.variables_metadata = self.get_variables_metadata()
def _open_dataset(self):
    """Open the dataset using a S3 store as a xarray Dataset.

    Tries each store configuration in order: first the public anonymous
    DeepESDL bucket, then an authenticated store configured from the
    S3_USER_STORAGE_* environment variables.

    Returns:
        The opened dataset from the first configuration that succeeds.

    Raises:
        ValueError: if the dataset cannot be opened with any configuration;
            the message lists all tried configurations and the last error.
    """

    store_configs = [
        {
            "description": "Public store",
            "params": {
                "storage_type": "s3",
                "root": "deep-esdl-public",
                "storage_options": {"anon": True},
            },
        },
        {
            "description": "Authenticated store",
            "params": {
                "storage_type": "s3",
                # NOTE(review): these env vars may be unset (None); the
                # attempt then fails and falls through to the error below.
                "root": os.environ.get("S3_USER_STORAGE_BUCKET"),
                "storage_options": {
                    "anon": False,
                    "key": os.environ.get("S3_USER_STORAGE_KEY"),
                    "secret": os.environ.get("S3_USER_STORAGE_SECRET"),
                },
            },
        },
    ]

    # Iterate through configurations and attempt to open the dataset
    last_exception = None
    tried_configurations = []
    for config in store_configs:
        tried_configurations.append(config["description"])
        try:
            self.logger.info(
                f"Attempting to open dataset with configuration: "
                f"{config['description']}"
            )
            store = new_data_store(
                config["params"]["storage_type"],
                root=config["params"]["root"],
                storage_options=config["params"]["storage_options"],
            )
            dataset = store.open_data(self.dataset_id)
            self.logger.info(
                f"Successfully opened dataset with configuration: "
                f"{config['description']}"
            )
            return dataset
        except Exception as e:
            # Broad catch is deliberate: any store failure should fall
            # through to the next configuration rather than abort.
            self.logger.error(
                f"Failed to open dataset with configuration: "
                f"{config['description']}. Error: {e}"
            )
            last_exception = e

    raise ValueError(
        f"Failed to open Zarr dataset with ID {self.dataset_id}. "
        f"Tried configurations: {', '.join(tried_configurations)}. "
        f"Last error: {last_exception}"
    )
def _get_spatial_extent(self) -> SpatialExtent:
    """Extract spatial extent from the dataset.

    Supports regular grids named ('lon', 'lat') or ('longitude',
    'latitude'), and irregular grids named ('x', 'y'). The bounding box is
    [x_min, y_min, x_max, y_max].

    Returns:
        SpatialExtent with a single bounding box covering the dataset.

    Raises:
        ValueError: if no recognized coordinate pair is present.
    """
    # The three copy-pasted branches of the original collapsed into one
    # loop; checked in the same order, so behavior is unchanged.
    coordinate_pairs = [("lon", "lat"), ("longitude", "latitude"), ("x", "y")]
    for x_name, y_name in coordinate_pairs:
        if {x_name, y_name}.issubset(self.dataset.coords):
            x = self.dataset[x_name]
            y = self.dataset[y_name]
            return SpatialExtent(
                [[float(x.min()), float(y.min()), float(x.max()), float(y.max())]]
            )
    # BUG FIX: the original message omitted the supported
    # 'longitude'/'latitude' pair.
    raise ValueError(
        "Dataset does not have recognized spatial coordinates "
        "('lon'/'lat', 'longitude'/'latitude', or 'x'/'y')."
    )
def _get_temporal_extent(self) -> TemporalExtent:
    """Extract temporal extent from the dataset.

    Returns:
        TemporalExtent spanning the min and max of the 'time' coordinate.

    Raises:
        ValueError: if the dataset lacks a 'time' coordinate, or the time
            bounds cannot be parsed.
    """
    # Guard clause instead of the original if/else nesting.
    if "time" not in self.dataset.coords:
        raise ValueError("Dataset does not have a 'time' coordinate.")
    try:
        # Convert the time bounds to datetime objects
        time_min = pd.to_datetime(self.dataset.time.min().values).to_pydatetime()
        time_max = pd.to_datetime(self.dataset.time.max().values).to_pydatetime()
        return TemporalExtent([[time_min, time_max]])
    except Exception as e:
        # BUG FIX: chain the original exception so the root cause stays
        # visible in tracebacks.
        raise ValueError(f"Failed to parse temporal extent: {e}") from e
@staticmethod
def _normalize_name(name: str | None) -> str | None:
    """Normalize a name to kebab-case.

    Spaces and underscores become hyphens and the result is lowercased.
    Returns None for a falsy input (None or empty string).
    """
    if not name:
        return None
    return name.replace(" ", "-").replace("_", "-").lower()
def _get_general_metadata(self) -> dict:
    """Collect collection-level metadata from the dataset's attributes."""
    fallback = "No description available."
    description = self.dataset.attrs.get("description", fallback)
    return {"description": description}
def extract_metadata_for_variable(self, variable_data) -> dict:
    """Extract OSC-relevant metadata for a single variable.

    The variable ID is the normalized CF standard_name when present,
    otherwise the normalized variable name. The description falls back to
    the long_name attribute.

    Args:
        variable_data: A dataset variable exposing ``attrs`` and ``name``.

    Returns:
        Dict with 'variable_id', 'description', and 'gcmd_keyword_url'.
    """
    attrs = variable_data.attrs
    long_name = attrs.get("long_name")
    raw_id = attrs.get("standard_name") or variable_data.name
    return {
        "variable_id": self._normalize_name(raw_id),
        "description": attrs.get("description", long_name),
        "gcmd_keyword_url": attrs.get("gcmd_keyword_url"),
    }
def get_variable_ids(self) -> list[str]:
    """Return the variable IDs of all data variables, minus grid helpers.

    'crs' and 'spatial_ref' are not real data variables; note that
    'spatial_ref' has been normalized to 'spatial-ref' by the time the
    IDs are built, so that is the form excluded here.
    """
    excluded = {"crs", "spatial-ref"}
    return [var_id for var_id in self.variables_metadata if var_id not in excluded]
def get_variables_metadata(self) -> dict[str, dict]:
    """Map each variable's normalized ID to its extracted metadata."""
    metadata_by_id: dict[str, dict] = {}
    for variable in self.dataset.data_vars.values():
        meta = self.extract_metadata_for_variable(variable)
        metadata_by_id[meta.get("variable_id")] = meta
    return metadata_by_id
def _add_gcmd_link_to_var_catalog(
    self, var_catalog: Catalog, var_metadata: dict
) -> None:
    """
    Checks for a GCMD keyword URL in var_metadata, adds a 'via' link to the
    catalog pointing to the GCMD Keyword Viewer.

    If no URL is present in the metadata, the user is prompted
    interactively on stdin (this method is therefore not suitable for
    unattended runs).

    Args:
        var_catalog: The PySTAC Catalog to which we want to add the link.
        var_metadata: Dictionary containing metadata about the variable,
            including 'gcmd_keyword_url'.
    """
    variable_id = var_metadata.get("variable_id")
    gcmd_keyword_url = var_metadata.get("gcmd_keyword_url")
    if not gcmd_keyword_url:
        # BUG FIX: the original nested double quotes inside a double-quoted
        # f-string (f"... {var_metadata.get("variable_id")} ..."), which is
        # a SyntaxError on Python < 3.12. Hoisting the lookup avoids the
        # nesting entirely; the prompt text is unchanged.
        gcmd_keyword_url = input(
            f"Enter GCMD keyword URL or a similar url for {variable_id}: "
        ).strip()
    var_catalog.add_link(
        Link(
            rel="via",
            target=gcmd_keyword_url,
            title="Description",
            media_type="text/html",
        )
    )
    self.logger.info(
        f"Added GCMD link for {variable_id} catalog {gcmd_keyword_url}."
    )
def build_variable_catalog(self, var_metadata) -> Catalog:
    """Build an OSC STAC Catalog for the variables in the dataset.

    The catalog carries the OSC themes (both as a 'themes' extra field and
    as 'related' links), a 'via' link to the GCMD keyword definition, and
    root/child/parent links into the Open Science Catalog layout.

    Args:
        var_metadata: Metadata dict for one variable, as produced by
            extract_metadata_for_variable.

    Returns:
        A pystac.Catalog object.
    """
    var_id = var_metadata.get("variable_id")
    concepts = [{"id": theme} for theme in self.osc_themes]

    themes = [
        {
            "scheme": "https://github.com/stac-extensions/osc#theme",
            "concepts": concepts,
        }
    ]

    now_iso = datetime.now(timezone.utc).isoformat()

    # Create a PySTAC Catalog object
    var_catalog = Catalog(
        id=var_id,
        description=var_metadata.get("description"),
        title=self.format_string(var_id),
        stac_extensions=[
            "https://stac-extensions.github.io/themes/v1.0.0/schema.json"
        ],
    )

    var_catalog.stac_version = "1.0.0"
    var_catalog.extra_fields["updated"] = now_iso
    var_catalog.keywords = []

    # Add the 'themes' block required by the themes STAC extension
    var_catalog.extra_fields["themes"] = themes

    # Replace pystac's auto-generated root link with the OSC catalog root.
    var_catalog.remove_links("root")
    # Add relevant links
    var_catalog.add_link(
        Link(
            rel="root",
            target="../../catalog.json",
            media_type="application/json",
            title="Open Science Catalog",
        )
    )

    # 'child' link: points to the product (or one of its collections)
    # using this variable
    var_catalog.add_link(
        Link(
            rel="child",
            target=f"../../products/{self.collection_id}/collection.json",
            media_type="application/json",
            title=self.collection_id,
        )
    )

    # 'parent' link: back up to the variables overview
    var_catalog.add_link(
        Link(
            rel="parent",
            target="../catalog.json",
            media_type="application/json",
            title="Variables",
        )
    )
    # Add gcmd link for the variable definition (may prompt on stdin)
    self._add_gcmd_link_to_var_catalog(var_catalog, var_metadata)

    self.add_themes_as_related_links_var_catalog(var_catalog)

    self_href = (
        f"https://esa-earthcode.github.io/open-science-catalog-metadata/variables"
        f"/{var_id}/catalog.json"
    )
    # 'self' link: the direct URL where this JSON is hosted
    var_catalog.set_self_href(self_href)

    return var_catalog
def update_product_base_catalog(self, product_catalog_path) -> Catalog:
    """Register this product as a child of the base product catalog.

    Args:
        product_catalog_path: Path to the base products catalog.json.

    Returns:
        The updated pystac.Catalog.
    """
    base_catalog = Catalog.from_file(product_catalog_path)
    child_link = Link(
        rel="child",
        target=f"./{self.collection_id}/collection.json",
        media_type="application/json",
        title=self.collection_id,
    )
    base_catalog.add_link(child_link)
    # 'self' link: the direct URL where this JSON is hosted
    base_catalog.set_self_href(PRODUCT_BASE_CATALOG_SELF_HREF)
    return base_catalog
def update_variable_base_catalog(
    self, variable_base_catalog_path, variable_ids
) -> Catalog:
    """Register each variable as a child of the base variable catalog.

    Args:
        variable_base_catalog_path: Path to the base variables catalog.json.
        variable_ids: Normalized variable IDs to link as children.

    Returns:
        The updated pystac.Catalog.
    """
    base_catalog = Catalog.from_file(variable_base_catalog_path)
    for var_id in variable_ids:
        child_link = Link(
            rel="child",
            target=f"./{var_id}/catalog.json",
            media_type="application/json",
            title=self.format_string(var_id),
        )
        base_catalog.add_link(child_link)
    # 'self' link: the direct URL where this JSON is hosted
    base_catalog.set_self_href(VARIABLE_BASE_CATALOG_SELF_HREF)
    return base_catalog
def add_themes_as_related_links_var_catalog(self, var_catalog):
    """Attach one 'related' link per OSC theme to the given catalog."""
    for theme in self.osc_themes:
        theme_link = Link(
            rel="related",
            target=f"../../themes/{theme}/catalog.json",
            media_type="application/json",
            title=f"Theme: {self.format_string(theme)}",
        )
        var_catalog.add_link(theme_link)
def update_deepesdl_collection(self, deepesdl_collection_full_path):
    """Link this product and its themes into the DeepESDL project collection.

    Args:
        deepesdl_collection_full_path: Path to the DeepESDL collection.json.

    Returns:
        The updated pystac.Collection.
    """
    deepesdl = Collection.from_file(deepesdl_collection_full_path)
    # Child link to this product's collection.
    deepesdl.add_link(
        Link(
            rel="child",
            target=f"../../products/{self.collection_id}/collection.json",
            media_type="application/json",
            title=self.collection_id,
        )
    )
    # One 'related' link per OSC theme.
    for theme in self.osc_themes:
        deepesdl.add_link(
            Link(
                rel="related",
                target=f"../../themes/{theme}/catalog.json",
                media_type="application/json",
                title=f"Theme: {self.format_string(theme)}"
            )
        )
    deepesdl.set_self_href(DEEPESDL_COLLECTION_SELF_HREF)
    return deepesdl
def update_existing_variable_catalog(self, var_file_path, var_id) -> Catalog:
    """Refresh an existing variable catalog for this product.

    Bumps the 'updated' timestamp, links this product as a child, re-adds
    the theme links, and resets the self href.

    Args:
        var_file_path: Path to the existing variable catalog.json.
        var_id: Normalized variable ID (used in the self href).

    Returns:
        The updated pystac.Catalog.
    """
    catalog = Catalog.from_file(var_file_path)
    catalog.extra_fields["updated"] = datetime.now(timezone.utc).isoformat()

    # add 'child' link as the product
    catalog.add_link(
        Link(
            rel="child",
            target=f"../../products/{self.collection_id}/collection.json",
            media_type="application/json",
            title=self.collection_id,
        )
    )
    self.add_themes_as_related_links_var_catalog(catalog)
    # 'self' link: the direct URL where this JSON is hosted
    catalog.set_self_href(
        f"https://esa-earthcode.github.io/open-science-catalog-metadata/variables"
        f"/{var_id}/catalog.json"
    )
    return catalog
@staticmethod
|
|
421
|
+
def format_string(s: str) -> str:
|
|
422
|
+
# Strip leading/trailing spaces/underscores and replace underscores with spaces
|
|
423
|
+
words = s.strip(" _").replace("_", " ").replace("-", " ").split()
|
|
424
|
+
# Capitalize each word and join them with a space
|
|
425
|
+
return " ".join(word.capitalize() for word in words)
|
|
426
|
+
|
|
427
|
+
@staticmethod
def build_theme(osc_themes: list[str]) -> Theme:
    """Wrap theme ID strings into a Theme under the OSC theme scheme."""
    wrapped = [ThemeConcept(id=name) for name in osc_themes]
    return Theme(concepts=wrapped, scheme=OSC_THEME_SCHEME)
def build_dataset_stac_collection(self) -> Collection:
    """Build an OSC STAC Collection for the dataset.

    Gathers spatial/temporal extents and variable metadata from the open
    dataset, applies the OSC extension fields, wires all OSC catalog links
    (root/parent/via/related), and validates the extension before
    returning.

    Returns:
        A pystac.Collection object.

    Raises:
        ValueError: if metadata extraction or OSC extension validation
            fails.
    """
    try:
        spatial_extent = self._get_spatial_extent()
        temporal_extent = self._get_temporal_extent()
        variables = self.get_variable_ids()
        general_metadata = self._get_general_metadata()
    except ValueError as e:
        raise ValueError(f"Metadata extraction failed: {e}")

    # Build base STAC Collection
    collection = Collection(
        id=self.collection_id,
        description=general_metadata.get("description", "No description provided."),
        extent=Extent(spatial=spatial_extent, temporal=temporal_extent),
    )

    # Add OSC extension metadata
    osc_extension = OscExtension.add_to(collection)
    # osc_project and osc_type are fixed constant values
    osc_extension.osc_project = "deep-earth-system-data-lab"
    osc_extension.osc_type = "product"
    osc_extension.osc_status = self.osc_status
    osc_extension.osc_region = self.osc_region
    osc_extension.osc_variables = variables
    osc_extension.osc_missions = self.osc_missions
    if self.cf_params:
        osc_extension.cf_parameter = self.cf_params
    else:
        # Fall back to a single CF parameter named after the collection.
        osc_extension.cf_parameter = [{"name": self.collection_id}]

    # Add creation and update timestamps for the collection
    now_iso = datetime.now(timezone.utc).isoformat()
    collection.extra_fields["created"] = now_iso
    collection.extra_fields["updated"] = now_iso
    collection.title = self.collection_id

    # Remove any existing root link and re-add it properly
    collection.remove_links("root")
    collection.add_link(
        Link(
            rel="root",
            target="../../catalog.json",
            media_type="application/json",
            title="Open Science Catalog",
        )
    )
    collection.add_link(Link(rel="via", target=self.access_link, title="Access"))
    if self.documentation_link:
        collection.add_link(
            Link(rel="via", target=self.documentation_link, title="Documentation")
        )
    collection.add_link(
        Link(
            rel="parent",
            target="../catalog.json",
            media_type="application/json",
            title="Products",
        )
    )

    # Add variables ref
    for var in variables:
        collection.add_link(
            Link(
                rel="related",
                target=f"../../variables/{var}/catalog.json",
                media_type="application/json",
                title="Variable: " + self.format_string(var),
            )
        )

    self_href = (
        "https://esa-earthcode.github.io/"
        f"open-science-catalog-metadata/products/{self.collection_id}/collection.json"
    )
    collection.set_self_href(self_href)

    # align with themes instead of osc:themes
    if self.osc_themes:
        theme_obj = self.build_theme(self.osc_themes)
        # NOTE(review): theme_obj is a Theme instance placed directly into
        # extra_fields — confirm it serializes to JSON downstream (the
        # GitHub automation layer passes a custom `serialize` default).
        collection.extra_fields["themes"] = [theme_obj]

    for theme in self.osc_themes:
        formatted_theme = self.format_string(theme)
        collection.add_link(
            Link(
                rel="related",
                target=f"../../themes/{theme}/catalog.json",
                media_type="application/json",
                title=f"Theme: {formatted_theme}",
            )
        )

    collection.add_link(
        Link(
            rel="related",
            target="../../projects/deep-earth-system-data-lab/collection.json",
            media_type="application/json",
            title="Project: DeepESDL"
        )
    )

    # Validate OSC extension fields
    try:
        osc_extension.validate_extension()
    except ValueError as e:
        raise ValueError(f"OSC Extension validation failed: {e}")

    return collection
@@ -0,0 +1,145 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
|
|
3
|
+
# Copyright (c) 2025 by Brockmann Consult GmbH
|
|
4
|
+
# Permissions are hereby granted under the terms of the MIT License:
|
|
5
|
+
# https://opensource.org/licenses/MIT.
|
|
6
|
+
|
|
7
|
+
import json
|
|
8
|
+
import logging
|
|
9
|
+
import os
|
|
10
|
+
import subprocess
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
import requests
|
|
14
|
+
|
|
15
|
+
from deep_code.utils.helper import serialize
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class GitHubAutomation:
    """Automates GitHub operations needed to create a Pull Request.

    Args:
        username: GitHub username.
        token: Personal access token for GitHub.
        repo_owner: Owner of the repository to fork.
        repo_name: Name of the repository to fork.
    """

    def __init__(self, username: str, token: str, repo_owner: str, repo_name: str):
        self.username = username
        self.token = token
        self.repo_owner = repo_owner
        self.repo_name = repo_name
        # URL of the upstream (base) repository.
        self.base_repo_url = f"https://github.com/{repo_owner}/{repo_name}.git"
        # Push URL of the user's fork; embeds the token for authentication,
        # so this string must never be logged.
        self.fork_repo_url = (
            f"https://{username}:{token}@github.com/{username}/{repo_name}.git"
        )
        # Fixed working directory for the local clone, in the user's home.
        self.local_clone_dir = os.path.join(os.path.expanduser("~"), "temp_repo")
def fork_repository(self):
    """Fork the base repository into the user's GitHub account.

    Raises:
        requests.HTTPError: if the GitHub API rejects the fork request.
    """
    logging.info("Forking repository...")
    url = f"https://api.github.com/repos/{self.repo_owner}/{self.repo_name}/forks"
    headers = {"Authorization": f"token {self.token}"}
    # BUG FIX: the original call had no timeout, so a stalled connection
    # could hang the tool forever.
    response = requests.post(url, headers=headers, timeout=30)
    response.raise_for_status()
    logging.info(f"Repository forked to {self.username}/{self.repo_name}")
def clone_sync_repository(self):
    """Clone the forked repository locally if it doesn't exist, or pull updates if it does.

    Side effect: when pulling, the process working directory is changed to
    the clone directory; later methods (create_branch, add_file, ...) also
    chdir there themselves.

    Raises:
        RuntimeError: if the git clone or git pull fails.
    """
    logging.info("Checking local repository...")
    if not os.path.exists(self.local_clone_dir):
        logging.info("Cloning forked repository...")
        try:
            subprocess.run(
                ["git", "clone", self.fork_repo_url, self.local_clone_dir],
                check=True,
            )
            logging.info(f"Repository cloned to {self.local_clone_dir}")
        except subprocess.CalledProcessError as e:
            raise RuntimeError(f"Failed to clone repository: {e}")
    else:
        logging.info("Local repository already exists. Pulling latest changes...")
        try:
            os.chdir(self.local_clone_dir)
            subprocess.run(["git", "pull"], check=True)
            logging.info("Repository updated with latest changes.")
        except subprocess.CalledProcessError as e:
            raise RuntimeError(f"Failed to pull latest changes: {e}")
def create_branch(self, branch_name: str):
    """Create and switch to a new branch in the local clone.

    Raises:
        RuntimeError: if `git checkout -b` fails.
    """
    logging.info(f"Creating new branch: {branch_name}...")
    os.chdir(self.local_clone_dir)
    checkout_cmd = ["git", "checkout", "-b", branch_name]
    try:
        subprocess.run(checkout_cmd, check=True)
    except subprocess.CalledProcessError as e:
        raise RuntimeError(f"Failed Creating branch: '{branch_name}': {e}")
def add_file(self, file_path: str, content):
    """Write *content* as JSON at *file_path* inside the clone and stage it.

    Args:
        file_path: Path relative to the repository root; parent directories
            are created as needed.
        content: A JSON-serializable value, or an object exposing
            ``to_dict()`` (e.g. a pystac Catalog/Collection).

    Raises:
        TypeError: if content (after to_dict) is not a basic JSON type.
        RuntimeError: if JSON serialization or `git add` fails.
    """
    logging.info(f"Adding new file: {file_path}...")
    # git add below must run with the cwd inside the repository.
    os.chdir(self.local_clone_dir)
    full_path = Path(self.local_clone_dir) / file_path
    full_path.parent.mkdir(parents=True, exist_ok=True)
    # Ensure content is serializable
    if hasattr(content, "to_dict"):
        content = content.to_dict()
    if not isinstance(content, (dict, list, str, int, float, bool, type(None))):
        raise TypeError(f"Cannot serialize content of type {type(content)}")
    try:
        # `serialize` handles project types that json can't encode natively.
        json_content = json.dumps(
            content, indent=2, ensure_ascii=False, default=serialize
        )
    except TypeError as e:
        raise RuntimeError(f"JSON serialization failed: {e}")
    with open(full_path, "w", encoding="utf-8") as f:
        f.write(json_content)
    try:
        subprocess.run(["git", "add", str(full_path)], check=True)
    except subprocess.CalledProcessError as e:
        raise RuntimeError(f"Failed to add file '{file_path}': {e}")
def commit_and_push(self, branch_name: str, commit_message: str):
    """Commit staged changes and push the branch to the fork ('origin').

    Raises:
        RuntimeError: if the commit or push fails.
    """
    logging.info("Committing and pushing changes...")
    os.chdir(self.local_clone_dir)
    git_commands = [
        ["git", "commit", "-m", commit_message],
        ["git", "push", "-u", "origin", branch_name],
    ]
    try:
        for command in git_commands:
            subprocess.run(command, check=True)
    except subprocess.CalledProcessError as e:
        raise RuntimeError(f"Failed to commit and push: {e}")
def create_pull_request(
    self, branch_name: str, pr_title: str, pr_body: str, base_branch: str = "main"
):
    """Create a pull request from the forked repository to the base repository.

    Args:
        branch_name: Branch on the fork containing the changes.
        pr_title: Pull request title.
        pr_body: Pull request description.
        base_branch: Target branch in the base repository.

    Returns:
        The HTML URL of the created pull request.

    Raises:
        requests.HTTPError: if the GitHub API rejects the request.
    """
    logging.info("Creating a pull request...")
    os.chdir(self.local_clone_dir)
    url = f"https://api.github.com/repos/{self.repo_owner}/{self.repo_name}/pulls"
    headers = {"Authorization": f"token {self.token}"}
    data = {
        "title": pr_title,
        "head": f"{self.username}:{branch_name}",
        "base": base_branch,
        "body": pr_body,
    }
    # BUG FIX: the original call had no timeout, so a stalled connection
    # could hang the tool forever.
    response = requests.post(url, headers=headers, json=data, timeout=30)
    response.raise_for_status()
    pr_url = response.json()["html_url"]
    logging.info(f"Pull request created: {pr_url}")
    # Return the URL so callers can surface it (previously returned None —
    # backward compatible for callers that ignore the return value).
    return pr_url
def clean_up(self):
    """Remove the local clone directory.

    Raises:
        RuntimeError: if the directory cannot be removed.
    """
    import shutil  # local import: only needed for cleanup

    logging.info("Cleaning up local repository...")
    # Step out of the clone before deleting it so the cwd stays valid
    # (earlier methods chdir into the clone).
    os.chdir("..")
    try:
        # BUG FIX: the original ran `rm -rf` via subprocess WITHOUT
        # check=True, so the except branch was dead code and failures went
        # unnoticed; `rm` is also not portable to Windows. shutil.rmtree is
        # portable and raises OSError on failure.
        shutil.rmtree(self.local_clone_dir)
    except OSError as e:
        raise RuntimeError(f"Failed to clean-up local repository: {e}") from e
def file_exists(self, file_path) -> bool:
    """Return True if *file_path* exists as a regular file in the clone."""
    candidate = Path(self.local_clone_dir) / file_path
    found = os.path.isfile(candidate)
    logging.debug(f"Checking existence of {candidate}: {found}")
    return found