eotdl 2024.10.7__py3-none-any.whl → 2025.3.25__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- eotdl/__init__.py +1 -1
- eotdl/access/search.py +0 -2
- eotdl/access/sentinelhub/parameters.py +1 -1
- eotdl/cli.py +2 -2
- eotdl/commands/datasets.py +28 -31
- eotdl/commands/models.py +27 -30
- eotdl/commands/stac.py +57 -0
- eotdl/curation/__init__.py +0 -8
- eotdl/curation/stac/__init__.py +1 -8
- eotdl/curation/stac/api.py +58 -0
- eotdl/curation/stac/stac.py +31 -341
- eotdl/datasets/__init__.py +1 -1
- eotdl/datasets/ingest.py +28 -159
- eotdl/datasets/retrieve.py +0 -9
- eotdl/datasets/stage.py +64 -0
- eotdl/files/__init__.py +0 -2
- eotdl/files/ingest.bck +178 -0
- eotdl/files/ingest.py +229 -164
- eotdl/{datasets → files}/metadata.py +16 -17
- eotdl/models/__init__.py +1 -1
- eotdl/models/ingest.py +28 -159
- eotdl/models/stage.py +60 -0
- eotdl/repos/APIRepo.py +1 -1
- eotdl/repos/DatasetsAPIRepo.py +56 -43
- eotdl/repos/FilesAPIRepo.py +260 -167
- eotdl/repos/STACAPIRepo.py +40 -0
- eotdl/repos/__init__.py +1 -0
- eotdl/tools/geo_utils.py +7 -2
- {eotdl-2024.10.7.dist-info → eotdl-2025.3.25.dist-info}/METADATA +5 -4
- eotdl-2025.3.25.dist-info/RECORD +65 -0
- {eotdl-2024.10.7.dist-info → eotdl-2025.3.25.dist-info}/WHEEL +1 -1
- eotdl/curation/stac/assets.py +0 -110
- eotdl/curation/stac/dataframe.py +0 -172
- eotdl/curation/stac/dataframe_bck.py +0 -253
- eotdl/curation/stac/dataframe_labeling.py +0 -63
- eotdl/curation/stac/extensions/__init__.py +0 -23
- eotdl/curation/stac/extensions/base.py +0 -30
- eotdl/curation/stac/extensions/dem.py +0 -18
- eotdl/curation/stac/extensions/eo.py +0 -117
- eotdl/curation/stac/extensions/label/__init__.py +0 -7
- eotdl/curation/stac/extensions/label/base.py +0 -136
- eotdl/curation/stac/extensions/label/image_name_labeler.py +0 -203
- eotdl/curation/stac/extensions/label/scaneo.py +0 -219
- eotdl/curation/stac/extensions/ml_dataset.py +0 -648
- eotdl/curation/stac/extensions/projection.py +0 -44
- eotdl/curation/stac/extensions/raster.py +0 -53
- eotdl/curation/stac/extensions/sar.py +0 -55
- eotdl/curation/stac/extent.py +0 -158
- eotdl/curation/stac/parsers.py +0 -61
- eotdl/datasets/download.py +0 -104
- eotdl/files/list_files.py +0 -13
- eotdl/models/download.py +0 -101
- eotdl/models/metadata.py +0 -43
- eotdl/wrappers/utils.py +0 -35
- eotdl-2024.10.7.dist-info/RECORD +0 -82
- {eotdl-2024.10.7.dist-info → eotdl-2025.3.25.dist-info}/entry_points.txt +0 -0
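Taken together, the listing shows the shape of the 2025.3.25 refactor: the `download` modules are replaced by `stage` modules, the hand-rolled STAC curation stack (parsers, extensions, extent, dataframes) is dropped in favor of stac-geoparquet catalogs, and new `stac` commands plus a `STACAPIRepo` appear. A minimal sketch of the resulting workflow, assuming `ingest_dataset` is re-exported from `eotdl.datasets` (the `__init__.py` change suggests this) and using hypothetical names and paths:

```python
# Hedged sketch of the post-refactor workflow; names/paths are placeholders.
from eotdl.datasets import ingest_dataset
from eotdl.datasets.stage import stage_dataset

# Ingest a local folder (README.md metadata) or a STAC catalog (catalog.json at root)
ingest_dataset("data/my-dataset")

# "Stage" (formerly "download") a dataset: fetches the geoparquet catalog,
# and optionally every asset it references
local_path = stage_dataset("my-dataset", assets=True)
```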
eotdl/curation/stac/stac.py
CHANGED
@@ -1,343 +1,33 @@
-
-
-
-
-import random
-from datetime import datetime
-from typing import Union, Optional
-from os.path import join, basename, dirname
-
-import pandas as pd
-import pystac
-import rasterio
+import pyarrow.parquet as pq
+import stac_geoparquet
+import json
 from tqdm import tqdm
-
-
-from .parsers import STACIdParser, StructuredParser
-from .assets import STACAssetGenerator
-from .dataframe_labeling import LabelingStrategy, UnlabeledStrategy
-from ...tools import (
-    format_time_acquired,
-    cut_images,
-    get_item_metadata,
-    get_all_images_in_path,
-)
-from .extensions import (
-    type_stac_extensions_dict,
-    SUPPORTED_EXTENSIONS,
-)
-from .extent import get_collection_extent
-
-
-class STACGenerator:
-    """
-    STAC generator class
-    """
-
-    def __init__(
-        self,
-        image_format: str = "tiff",
-        catalog_type: pystac.CatalogType = pystac.CatalogType.SELF_CONTAINED,
-        item_parser: STACIdParser = StructuredParser,
-        assets_generator: STACAssetGenerator = STACAssetGenerator,
-        labeling_strategy: LabelingStrategy = UnlabeledStrategy,
-    ) -> None:
-        """
-        Initialize the STAC generator
-
-        :param image_format: image format of the assets
-        :param catalog_type: type of the catalog
-        :param item_parser: parser to get the item ID
-        :param assets_generator: generator to generate the assets
-        :param labeling_strategy: strategy to label the images
-        """
-        self._image_format = image_format
-        self._catalog_type = catalog_type
-        self._item_parser = item_parser()
-        self._assets_generator = assets_generator()
-        self._labeling_strategy = labeling_strategy()
-        self._extensions_dict: dict = type_stac_extensions_dict
-        self._stac_dataframe = pd.DataFrame()
-
-    def generate_stac_metadata(
-        self,
-        stac_id: str,
-        description: str,
-        stac_dataframe: pd.DataFrame = None,
-        output_folder: str = "stac",
-        **kwargs,
-    ) -> None:
-        """
-        Generate STAC metadata for a given directory containing the assets to generate metadata
-
-        :param id: id of the catalog
-        :param description: description of the catalog
-        :param stac_dataframe: dataframe with the STAC metadata of a given directory containing the assets to generate metadata
-        :param output_folder: output folder to write the catalog to
-        """
-        self._stac_dataframe = (
-            stac_dataframe if self._stac_dataframe.empty else self._stac_dataframe
-        )
-        if self._stac_dataframe.empty:
-            raise ValueError("No STAC dataframe provided")
-
-        # Create an empty catalog
-        catalog = pystac.Catalog(id=stac_id, description=description, **kwargs)
-
-        # Add the collections to the catalog
-        collections = self._stac_dataframe.collection.unique()
-        for collection_path in collections:
-            # Generate the collection
-            collection = self.generate_stac_collection(collection_path)
-            # Add the collection to the catalog
-            catalog.add_child(collection)
-
-        # Check there have been generate all the items from the images
-        items_count = 0
-        for collection in catalog.get_children():
-            items = list(
-                set([item.id for item in collection.get_items(recursive=True)])
-            )
-            items_count += len(items)
-        if len(self._stac_dataframe) != items_count:
-            raise pystac.STACError(
-                "Not all the STAC items have been generated, please check the Item parser or the STAC dataframe. If you are using the StructuredParser, check that the images are in the correct folder structure."
-            )
-
-        # Add the catalog to the root directory
-        catalog.normalize_hrefs(output_folder)
-
-        # Validate the catalog
-        print("Validating and saving catalog...")
-        try:
-            pystac.validation.validate(catalog)
-            catalog.save(catalog_type=self._catalog_type)
-            print("Success!")
-        except pystac.STACValidationError as e:
-            print(f"Catalog validation error: {e}")
-            return
-
-    def get_stac_dataframe(
-        self,
-        path: str,
-        collections: Optional[Union[str, dict]] = "source",
-        bands: Optional[dict] = None,
-        extensions: Optional[dict] = None,
-        sample: Optional[int] = None,
-    ) -> pd.DataFrame:
-        """
-        Get a dataframe with the STAC metadata of a given directory containing the assets to generate metadata
-
-        :param path: path to the root directory
-        :param collections: dictionary with the collections
-        :param bands: dictionary with the bands
-        :param extensions: dictionary with the extensions
-        """
-        images = get_all_images_in_path(path, self._image_format)
-        if len(images) == 0:
-            raise ValueError(
-                "No images found in the given path with the given extension. Please check the path and the extension"
-            )
-
-        if self._assets_generator.type == "Extracted":
-            images = cut_images(images)
-
-        if sample:
-            try:
-                images = random.sample(images, sample)
-            except ValueError:
-                raise ValueError(
-                    f"Sample size must be smaller than the number of images ({len(images)}). May be there are no images found in the given path with the given extension"
-                )
-
-        labels, ixs = self._labeling_strategy.get_images_labels(images)
-        bands_values = self._get_items_list_from_dict(labels, bands)
-        extensions_values = self._get_items_list_from_dict(labels, extensions)
-
-        if collections == "source":
-            # List of path with the same value repeated as many times as the number of images
-            collections_values = [join(path, "source") for i in range(len(images))]
-        elif collections == "*":
-            collections_values = [
-                join(path, basename(dirname(image))) for image in images
-            ]
-        else:
-            try:
-                collections_values = [
-                    join(path, value)
-                    for value in self._get_items_list_from_dict(labels, collections)
-                ]
-            except TypeError:
-                raise pystac.STACError(
-                    "There is an error generating the collections. Please check the collections dictionary"
-                )
-
-        df = pd.DataFrame(
-            {
-                "image": images,
-                "label": labels,
-                "ix": ixs,
-                "collection": collections_values,
-                "extensions": extensions_values,
-                "bands": bands_values,
-            }
-        )
-
-        self._stac_dataframe = df
-
-        return df
-
-    def _get_items_list_from_dict(self, labels: list, items: dict) -> list:
-        """
-        Get a list of items from a dictionary
-
-        :param labels: list of labels
-        :param items: dictionary with the items
-        """
-        if not items:
-            # Create list of None with the same length as the labels list
-            return [None for _ in labels]
-        items_list = []
-        for label in labels:
-            if label in items.keys():
-                items_list.append(items[label])
-            else:
-                items_list.append(None)
-
-        return items_list
-
-    def generate_stac_collection(self, collection_path: str) -> pystac.Collection:
-        """
-        Generate a STAC collection from a directory containing the assets to generate metadata
-
-        :param collection_path: path to the collection
-        """
-        # Get the images of the collection, as they are needed to obtain the collection extent
-        collection_images = self._stac_dataframe[
-            self._stac_dataframe["collection"] == collection_path
-        ]["image"]
-        # Get the collection extent
-        extent = get_collection_extent(collection_images)
-        # Create the collection
-        collection_id = basename(collection_path)
-        collection = pystac.Collection(
-            id=collection_id, description="Collection", extent=extent
-        )
-
-        print(f"Generating {collection_id} collection...")
-        for image in tqdm(collection_images):
-            # Create the item
-            item = self.create_stac_item(image)
-            # Add the item to the collection
-            collection.add_item(item)
-
-        # Return the collection
-        return collection
-
-    def create_stac_item(self, raster_path: str) -> pystac.Item:
-        """
-        Create a STAC item from a directory containing the raster files and the metadata.json file
-
-        :param raster_path: path to the raster file
-        """
-        # Check if there is any metadata file in the directory associated to the raster file
-        metadata = get_item_metadata(raster_path)
-
-        # Obtain the bounding box from the raster
-        with rasterio.open(raster_path) as ds:
-            bounds = ds.bounds
-            dst_crs = "EPSG:4326"
-            try:
-                left, bottom, right, top = rasterio.warp.transform_bounds(
-                    ds.crs, dst_crs, *bounds
-                )
-            except rasterio.errors.CRSError:
-                # If the raster has no crs, set the bounding box to 0
-                left, bottom, right, top = 0, 0, 0, 0
-
-        # Create bbox
-        bbox = [left, bottom, right, top]
-
-        # Create geojson feature
-        # If the bounding box has no values, set the geometry to None
-        geom = mapping(
-            Polygon([[left, bottom], [left, top], [right, top], [right, bottom]])
-        )
-
-        # Initialize pySTAC item parameters
-        params = {}
-        params["properties"] = {}
-
-        # Obtain the date acquired
-        start_time, end_time = None, None
-        if (
-            metadata
-            and metadata["acquisition-date"]
-            and metadata["type"] not in ("dem", "DEM")
-        ):
-            time_acquired = format_time_acquired(metadata["acquisition-date"])
-        else:
-            # Check if the type of the data is DEM
-            if metadata and metadata["type"] and metadata["type"] in ("dem", "DEM"):
-                time_acquired = None
-                start_time = datetime.strptime("2011-01-01", "%Y-%m-%d")
-                end_time = datetime.strptime("2015-01-07", "%Y-%m-%d")
-                params["start_datetime"] = start_time
-                params["end_datetime"] = end_time
-            else:
-                # Set unknown date
-                time_acquired = datetime.strptime("2000-01-01", "%Y-%m-%d")
-
-        # Obtain the item ID. The approach depends on the item parser
-        item_id = self._item_parser.get_item_id(raster_path)
-        # Add the item ID to the dataframe, to be able to get it later
-        self._stac_dataframe.loc[
-            self._stac_dataframe["image"] == raster_path, "id"
-        ] = item_id
-
-        # Instantiate pystac item
-        item = pystac.Item(
-            id=item_id, geometry=geom, bbox=bbox, datetime=time_acquired, **params
-        )
-
-        # Get the item info, from the raster path
-        item_info = self._stac_dataframe[self._stac_dataframe["image"] == raster_path]
-        # Get the extensions of the item
-        extensions = item_info["extensions"].values
-        extensions = extensions[0] if extensions else None
-
-        # Add the required extensions to the item
-        if extensions:
-            if isinstance(extensions, str):
-                extensions = [extensions]
-            for extension in extensions:
-                if extension not in SUPPORTED_EXTENSIONS:
-                    raise ValueError(f"Extension {extension} not supported")
-                else:
-                    extension_obj = self._extensions_dict[extension]
-                    extension_obj.add_extension_to_object(item, item_info)
-
-        # Add the assets to the item
-        assets = self._assets_generator.extract_assets(item_info)
-        if not assets:
-            # If there are not assets using the selected generator, try with the default
-            assets = STACAssetGenerator.extract_assets(item_info)
+import pystac
+from datetime import datetime

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+def create_stac_catalog(parquet_catalog_path, stac_catalog = None):
+    # parse items and add to collection
+    table = pq.read_table(parquet_catalog_path)
+    items = []
+    for item in tqdm(stac_geoparquet.arrow.stac_table_to_items(table), total=len(table)):
+        item = pystac.Item.from_dict(item)
+        item.validate()
+        # collection.add_item(item)
+        if stac_catalog is not None:
+            stac_catalog.add_item(item)
+        else:
+            items.append(item)
+        # path = "data/stac/" + item["id"] + ".json"
+        # os.makedirs(os.path.dirname(path), exist_ok=True)
+        # with open(path, "w") as f:
+        #     json.dump(item, f)
+        # # save item
+        # os.makedirs(path, exist_ok=True)
+        # _path = path + '/' + item.id + ".json"
+        # os.makedirs(os.path.dirname(_path), exist_ok=True)
+        # with open(_path, "w") as f:
+        #     json.dump(item.to_dict(), f)
+    # save catalog
+    if stac_catalog is not None:
+        return stac_catalog
+    return items
eotdl/datasets/__init__.py
CHANGED
eotdl/datasets/ingest.py
CHANGED
@@ -1,167 +1,36 @@
 from pathlib import Path
-import yaml
-from tqdm import tqdm
-import json
-import frontmatter
-
-from ..auth import with_auth
-from .metadata import Metadata
-from ..repos import DatasetsAPIRepo, FilesAPIRepo
-from ..files import ingest_files, create_new_version
-from ..curation.stac import STACDataFrame
-from ..shared import calculate_checksum
-from .update import update_dataset
-from .metadata import generate_metadata
-
-
-def ingest_dataset(
-    path,
-    verbose=False,
-    logger=print,
-    force_metadata_update=False,
-    sync_metadata=False,
-):
-    path = Path(path)
-    if not path.is_dir():
-        raise Exception("Path must be a folder")
-    if "catalog.json" in [f.name for f in path.iterdir()]:
-        return ingest_stac(path / "catalog.json", logger)
-    return ingest_folder(path, verbose, logger, force_metadata_update, sync_metadata)

+from ..repos import DatasetsAPIRepo
+from ..files.ingest import prep_ingest_stac, prep_ingest_folder, ingest

 def retrieve_dataset(metadata, user):
-
-
-
-
-
-
-
-
-
-
-
-
-    return data
-
+    repo = DatasetsAPIRepo()
+    data, error = repo.retrieve_dataset(metadata.name)
+    # print(data, error)
+    if data and data["uid"] != user["uid"]:
+        raise Exception("Dataset already exists.")
+    if error and error == "Dataset doesn't exist":
+        # create dataset
+        data, error = repo.create_dataset(metadata.dict(), user)
+        # print(data, error)
+        if error:
+            raise Exception(error)
+    return data

-
-
-
-
-
-
-    sync_metadata=False,
-    user=None,
-):
-    repo = DatasetsAPIRepo()
-    try:
-        readme = frontmatter.load(folder.joinpath("README.md"))
-        metadata, content = readme.metadata, readme.content
-        metadata = Metadata(**metadata)
-    # except FileNotFoundError:
-    #     # load metadata (legacy)
-    #     metadata = (
-    #         yaml.safe_load(open(folder.joinpath("metadata.yml"), "r").read()) or {}
-    #     )
-    #     metadata = Metadata(**metadata)
-    #     content = None
-    except Exception as e:
-        print(str(e))
-        raise Exception("Error loading metadata")
-    # retrieve dataset (create if doesn't exist)
-    dataset = retrieve_dataset(metadata, user)
-
-    update_metadata = True
-    if "description" in dataset:
-        # do not do this if the dataset is new, only if it already exists
-        update_metadata = check_metadata(
-            dataset, metadata, content, force_metadata_update, sync_metadata, folder
-        )
-    if update_metadata:
-        update_dataset(dataset["id"], metadata, content, user)
-    return ingest_files(
-        repo, dataset["id"], folder, verbose, logger, user, endpoint="datasets"
-    )
-
-
-def check_metadata(
-    dataset, metadata, content, force_metadata_update, sync_metadata, folder
+def ingest_dataset(
+    path,
+    verbose=False,
+    logger=print,
+    force_metadata_update=False,
+    sync_metadata=False,
 ):
-
-
-
-
-
-
-
-
-    if not force_metadata_update and not sync_metadata:
-        raise Exception(
-            "The provided metadata is not consistent with the current metadata. Use -f to force metadata update or -s to sync your local metadata."
-        )
-    if force_metadata_update:
-        return True
-    if sync_metadata:
-        generate_metadata(str(folder), dataset)
-        return False
-    return False
-
-
-def retrieve_stac_dataset(dataset_name, user):
-    repo = DatasetsAPIRepo()
-    data, error = repo.retrieve_dataset(dataset_name)
-    # print(data, error)
-    if data and data["uid"] != user["uid"]:
-        raise Exception("Dataset already exists.")
-    if error and error == "Dataset doesn't exist":
-        # create dataset
-        data, error = repo.create_stac_dataset(dataset_name, user)
-        # print(data, error)
-        if error:
-            raise Exception(error)
-        data["id"] = data["dataset_id"]
-    return data["id"]
+    path = Path(path)
+    if not path.is_dir():
+        raise Exception("Path must be a folder")
+    if "catalog.json" in [f.name for f in path.iterdir()]:
+        prep_ingest_stac(path, logger)
+    else:
+        prep_ingest_folder(path, verbose, logger, force_metadata_update, sync_metadata)
+    return ingest(path, DatasetsAPIRepo(), retrieve_dataset, 'datasets')


-@with_auth
-def ingest_stac(stac_catalog, logger=None, user=None):
-    repo, files_repo = DatasetsAPIRepo(), FilesAPIRepo()
-    # load catalog
-    logger("Loading STAC catalog...")
-    df = STACDataFrame.from_stac_file(stac_catalog)
-    catalog = df[df["type"] == "Catalog"]
-    assert len(catalog) == 1, "STAC catalog must have exactly one root catalog"
-    dataset_name = catalog.id.iloc[0]
-    # retrieve dataset (create if doesn't exist)
-    dataset_id = retrieve_stac_dataset(dataset_name, user)
-    # create new version
-    version = create_new_version(repo, dataset_id, user)
-    logger("New version created, version: " + str(version))
-    df2 = df.dropna(subset=["assets"])
-    for row in tqdm(df2.iterrows(), total=len(df2)):
-        try:
-            for k, v in row[1]["assets"].items():
-                data, error = files_repo.ingest_file(
-                    v["href"],
-                    dataset_id,
-                    user,
-                    calculate_checksum(v["href"]),  # is always absolute?
-                    "datasets",
-                    version,
-                )
-                if error:
-                    raise Exception(error)
-                file_url = f"{repo.url}datasets/{data['dataset_id']}/download/{data['filename']}"
-                df.loc[row[0], "assets"][k]["href"] = file_url
-        except Exception as e:
-            logger(f"Error uploading asset {row[0]}: {e}")
-            break
-    # ingest the STAC catalog into geodb
-    logger("Ingesting STAC catalog...")
-    data, error = repo.ingest_stac(json.loads(df.to_json()), dataset_id, user)
-    if error:
-        # TODO: delete all assets that were uploaded
-        raise Exception(error)
-    logger("Done")
-    return
eotdl/datasets/retrieve.py
CHANGED
@@ -25,12 +25,3 @@ def retrieve_dataset_files(dataset_id, version):
     raise Exception(error)
     return data

-
-# def list_datasets(pattern=None):
-#     datasets = retrieve_datasets()
-#     if pattern:
-#         regex = re.compile(rf".*{re.escape(pattern)}.*", re.IGNORECASE)
-#         names = list(datasets.keys())
-#         valid = [name for name in names if regex.search(name)]
-#         return {name: datasets[name] for name in valid}
-#     return datasets
eotdl/datasets/stage.py
ADDED
@@ -0,0 +1,64 @@
+import os
+from pathlib import Path
+from tqdm import tqdm
+import geopandas as gpd
+
+from ..auth import with_auth
+from .retrieve import retrieve_dataset
+from ..repos import FilesAPIRepo
+
+@with_auth
+def stage_dataset(
+    dataset_name,
+    version=None,
+    path=None,
+    logger=print,
+    assets=False,
+    force=False,
+    verbose=False,
+    user=None,
+    file=None,
+):
+    dataset = retrieve_dataset(dataset_name)
+    if version is None:
+        version = sorted([v['version_id'] for v in dataset["versions"]])[-1]
+    else:
+        assert version in [
+            v["version_id"] for v in dataset["versions"]
+        ], f"Version {version} not found"
+    download_base_path = os.getenv(
+        "EOTDL_DOWNLOAD_PATH", str(Path.home()) + "/.cache/eotdl/datasets"
+    )
+    if path is None:
+        download_path = download_base_path + "/" + dataset_name
+    else:
+        download_path = path + "/" + dataset_name
+    # check if dataset already exists
+    if os.path.exists(download_path) and not force:
+        os.makedirs(download_path, exist_ok=True)
+        # raise Exception(
+        #     f"Dataset `{dataset['name']} v{str(version)}` already exists at {download_path}. To force download, use force=True or -f in the CLI."
+        # )
+        raise Exception(
+            f"Dataset `{dataset['name']}` already exists at {download_path}. To force download, use force=True or -f in the CLI."
+        )
+
+    # stage metadata
+    repo = FilesAPIRepo()
+    catalog_path = repo.stage_file(dataset["id"], f"catalog.v{version}.parquet", user, download_path)
+
+    # TODO: stage README.md
+
+    if assets:
+        gdf = gpd.read_parquet(catalog_path)
+        for _, row in tqdm(gdf.iterrows(), total=len(gdf), desc="Staging assets"):
+            for k, v in row["assets"].items():
+                stage_dataset_file(v["href"], download_path)
+
+    return download_path
+
+
+@with_auth
+def stage_dataset_file(file_url, path, user):
+    repo = FilesAPIRepo()
+    return repo.stage_file_url(file_url, path, user)
eotdl/files/__init__.py
CHANGED