eotdl 2024.10.7__py3-none-any.whl → 2025.3.25__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. eotdl/__init__.py +1 -1
  2. eotdl/access/search.py +0 -2
  3. eotdl/access/sentinelhub/parameters.py +1 -1
  4. eotdl/cli.py +2 -2
  5. eotdl/commands/datasets.py +28 -31
  6. eotdl/commands/models.py +27 -30
  7. eotdl/commands/stac.py +57 -0
  8. eotdl/curation/__init__.py +0 -8
  9. eotdl/curation/stac/__init__.py +1 -8
  10. eotdl/curation/stac/api.py +58 -0
  11. eotdl/curation/stac/stac.py +31 -341
  12. eotdl/datasets/__init__.py +1 -1
  13. eotdl/datasets/ingest.py +28 -159
  14. eotdl/datasets/retrieve.py +0 -9
  15. eotdl/datasets/stage.py +64 -0
  16. eotdl/files/__init__.py +0 -2
  17. eotdl/files/ingest.bck +178 -0
  18. eotdl/files/ingest.py +229 -164
  19. eotdl/{datasets → files}/metadata.py +16 -17
  20. eotdl/models/__init__.py +1 -1
  21. eotdl/models/ingest.py +28 -159
  22. eotdl/models/stage.py +60 -0
  23. eotdl/repos/APIRepo.py +1 -1
  24. eotdl/repos/DatasetsAPIRepo.py +56 -43
  25. eotdl/repos/FilesAPIRepo.py +260 -167
  26. eotdl/repos/STACAPIRepo.py +40 -0
  27. eotdl/repos/__init__.py +1 -0
  28. eotdl/tools/geo_utils.py +7 -2
  29. {eotdl-2024.10.7.dist-info → eotdl-2025.3.25.dist-info}/METADATA +5 -4
  30. eotdl-2025.3.25.dist-info/RECORD +65 -0
  31. {eotdl-2024.10.7.dist-info → eotdl-2025.3.25.dist-info}/WHEEL +1 -1
  32. eotdl/curation/stac/assets.py +0 -110
  33. eotdl/curation/stac/dataframe.py +0 -172
  34. eotdl/curation/stac/dataframe_bck.py +0 -253
  35. eotdl/curation/stac/dataframe_labeling.py +0 -63
  36. eotdl/curation/stac/extensions/__init__.py +0 -23
  37. eotdl/curation/stac/extensions/base.py +0 -30
  38. eotdl/curation/stac/extensions/dem.py +0 -18
  39. eotdl/curation/stac/extensions/eo.py +0 -117
  40. eotdl/curation/stac/extensions/label/__init__.py +0 -7
  41. eotdl/curation/stac/extensions/label/base.py +0 -136
  42. eotdl/curation/stac/extensions/label/image_name_labeler.py +0 -203
  43. eotdl/curation/stac/extensions/label/scaneo.py +0 -219
  44. eotdl/curation/stac/extensions/ml_dataset.py +0 -648
  45. eotdl/curation/stac/extensions/projection.py +0 -44
  46. eotdl/curation/stac/extensions/raster.py +0 -53
  47. eotdl/curation/stac/extensions/sar.py +0 -55
  48. eotdl/curation/stac/extent.py +0 -158
  49. eotdl/curation/stac/parsers.py +0 -61
  50. eotdl/datasets/download.py +0 -104
  51. eotdl/files/list_files.py +0 -13
  52. eotdl/models/download.py +0 -101
  53. eotdl/models/metadata.py +0 -43
  54. eotdl/wrappers/utils.py +0 -35
  55. eotdl-2024.10.7.dist-info/RECORD +0 -82
  56. {eotdl-2024.10.7.dist-info → eotdl-2025.3.25.dist-info}/entry_points.txt +0 -0
eotdl/curation/stac/stac.py CHANGED
@@ -1,343 +1,33 @@
- """
- Module for generating STAC metadata
- """
-
- import random
- from datetime import datetime
- from typing import Union, Optional
- from os.path import join, basename, dirname
-
- import pandas as pd
- import pystac
- import rasterio
+ import pyarrow.parquet as pq
+ import stac_geoparquet
+ import json
  from tqdm import tqdm
- from shapely.geometry import Polygon, mapping
-
- from .parsers import STACIdParser, StructuredParser
- from .assets import STACAssetGenerator
- from .dataframe_labeling import LabelingStrategy, UnlabeledStrategy
- from ...tools import (
-     format_time_acquired,
-     cut_images,
-     get_item_metadata,
-     get_all_images_in_path,
- )
- from .extensions import (
-     type_stac_extensions_dict,
-     SUPPORTED_EXTENSIONS,
- )
- from .extent import get_collection_extent
-
-
- class STACGenerator:
-     """
-     STAC generator class
-     """
-
-     def __init__(
-         self,
-         image_format: str = "tiff",
-         catalog_type: pystac.CatalogType = pystac.CatalogType.SELF_CONTAINED,
-         item_parser: STACIdParser = StructuredParser,
-         assets_generator: STACAssetGenerator = STACAssetGenerator,
-         labeling_strategy: LabelingStrategy = UnlabeledStrategy,
-     ) -> None:
-         """
-         Initialize the STAC generator
-
-         :param image_format: image format of the assets
-         :param catalog_type: type of the catalog
-         :param item_parser: parser to get the item ID
-         :param assets_generator: generator to generate the assets
-         :param labeling_strategy: strategy to label the images
-         """
-         self._image_format = image_format
-         self._catalog_type = catalog_type
-         self._item_parser = item_parser()
-         self._assets_generator = assets_generator()
-         self._labeling_strategy = labeling_strategy()
-         self._extensions_dict: dict = type_stac_extensions_dict
-         self._stac_dataframe = pd.DataFrame()
-
-     def generate_stac_metadata(
-         self,
-         stac_id: str,
-         description: str,
-         stac_dataframe: pd.DataFrame = None,
-         output_folder: str = "stac",
-         **kwargs,
-     ) -> None:
-         """
-         Generate STAC metadata for a given directory containing the assets to generate metadata
-
-         :param id: id of the catalog
-         :param description: description of the catalog
-         :param stac_dataframe: dataframe with the STAC metadata of a given directory containing the assets to generate metadata
-         :param output_folder: output folder to write the catalog to
-         """
-         self._stac_dataframe = (
-             stac_dataframe if self._stac_dataframe.empty else self._stac_dataframe
-         )
-         if self._stac_dataframe.empty:
-             raise ValueError("No STAC dataframe provided")
-
-         # Create an empty catalog
-         catalog = pystac.Catalog(id=stac_id, description=description, **kwargs)
-
-         # Add the collections to the catalog
-         collections = self._stac_dataframe.collection.unique()
-         for collection_path in collections:
-             # Generate the collection
-             collection = self.generate_stac_collection(collection_path)
-             # Add the collection to the catalog
-             catalog.add_child(collection)
-
-         # Check there have been generate all the items from the images
-         items_count = 0
-         for collection in catalog.get_children():
-             items = list(
-                 set([item.id for item in collection.get_items(recursive=True)])
-             )
-             items_count += len(items)
-         if len(self._stac_dataframe) != items_count:
-             raise pystac.STACError(
-                 "Not all the STAC items have been generated, please check the Item parser or the STAC dataframe. If you are using the StructuredParser, check that the images are in the correct folder structure."
-             )
-
-         # Add the catalog to the root directory
-         catalog.normalize_hrefs(output_folder)
-
-         # Validate the catalog
-         print("Validating and saving catalog...")
-         try:
-             pystac.validation.validate(catalog)
-             catalog.save(catalog_type=self._catalog_type)
-             print("Success!")
-         except pystac.STACValidationError as e:
-             print(f"Catalog validation error: {e}")
-             return
-
-     def get_stac_dataframe(
-         self,
-         path: str,
-         collections: Optional[Union[str, dict]] = "source",
-         bands: Optional[dict] = None,
-         extensions: Optional[dict] = None,
-         sample: Optional[int] = None,
-     ) -> pd.DataFrame:
-         """
-         Get a dataframe with the STAC metadata of a given directory containing the assets to generate metadata
-
-         :param path: path to the root directory
-         :param collections: dictionary with the collections
-         :param bands: dictionary with the bands
-         :param extensions: dictionary with the extensions
-         """
-         images = get_all_images_in_path(path, self._image_format)
-         if len(images) == 0:
-             raise ValueError(
-                 "No images found in the given path with the given extension. Please check the path and the extension"
-             )
-
-         if self._assets_generator.type == "Extracted":
-             images = cut_images(images)
-
-         if sample:
-             try:
-                 images = random.sample(images, sample)
-             except ValueError:
-                 raise ValueError(
-                     f"Sample size must be smaller than the number of images ({len(images)}). May be there are no images found in the given path with the given extension"
-                 )
-
-         labels, ixs = self._labeling_strategy.get_images_labels(images)
-         bands_values = self._get_items_list_from_dict(labels, bands)
-         extensions_values = self._get_items_list_from_dict(labels, extensions)
-
-         if collections == "source":
-             # List of path with the same value repeated as many times as the number of images
-             collections_values = [join(path, "source") for i in range(len(images))]
-         elif collections == "*":
-             collections_values = [
-                 join(path, basename(dirname(image))) for image in images
-             ]
-         else:
-             try:
-                 collections_values = [
-                     join(path, value)
-                     for value in self._get_items_list_from_dict(labels, collections)
-                 ]
-             except TypeError:
-                 raise pystac.STACError(
-                     "There is an error generating the collections. Please check the collections dictionary"
-                 )
-
-         df = pd.DataFrame(
-             {
-                 "image": images,
-                 "label": labels,
-                 "ix": ixs,
-                 "collection": collections_values,
-                 "extensions": extensions_values,
-                 "bands": bands_values,
-             }
-         )
-
-         self._stac_dataframe = df
-
-         return df
-
-     def _get_items_list_from_dict(self, labels: list, items: dict) -> list:
-         """
-         Get a list of items from a dictionary
-
-         :param labels: list of labels
-         :param items: dictionary with the items
-         """
-         if not items:
-             # Create list of None with the same length as the labels list
-             return [None for _ in labels]
-         items_list = []
-         for label in labels:
-             if label in items.keys():
-                 items_list.append(items[label])
-             else:
-                 items_list.append(None)
-
-         return items_list
-
-     def generate_stac_collection(self, collection_path: str) -> pystac.Collection:
-         """
-         Generate a STAC collection from a directory containing the assets to generate metadata
-
-         :param collection_path: path to the collection
-         """
-         # Get the images of the collection, as they are needed to obtain the collection extent
-         collection_images = self._stac_dataframe[
-             self._stac_dataframe["collection"] == collection_path
-         ]["image"]
-         # Get the collection extent
-         extent = get_collection_extent(collection_images)
-         # Create the collection
-         collection_id = basename(collection_path)
-         collection = pystac.Collection(
-             id=collection_id, description="Collection", extent=extent
-         )
-
-         print(f"Generating {collection_id} collection...")
-         for image in tqdm(collection_images):
-             # Create the item
-             item = self.create_stac_item(image)
-             # Add the item to the collection
-             collection.add_item(item)
-
-         # Return the collection
-         return collection
-
-     def create_stac_item(self, raster_path: str) -> pystac.Item:
-         """
-         Create a STAC item from a directory containing the raster files and the metadata.json file
-
-         :param raster_path: path to the raster file
-         """
-         # Check if there is any metadata file in the directory associated to the raster file
-         metadata = get_item_metadata(raster_path)
-
-         # Obtain the bounding box from the raster
-         with rasterio.open(raster_path) as ds:
-             bounds = ds.bounds
-             dst_crs = "EPSG:4326"
-             try:
-                 left, bottom, right, top = rasterio.warp.transform_bounds(
-                     ds.crs, dst_crs, *bounds
-                 )
-             except rasterio.errors.CRSError:
-                 # If the raster has no crs, set the bounding box to 0
-                 left, bottom, right, top = 0, 0, 0, 0
-
-         # Create bbox
-         bbox = [left, bottom, right, top]
-
-         # Create geojson feature
-         # If the bounding box has no values, set the geometry to None
-         geom = mapping(
-             Polygon([[left, bottom], [left, top], [right, top], [right, bottom]])
-         )
-
-         # Initialize pySTAC item parameters
-         params = {}
-         params["properties"] = {}
-
-         # Obtain the date acquired
-         start_time, end_time = None, None
-         if (
-             metadata
-             and metadata["acquisition-date"]
-             and metadata["type"] not in ("dem", "DEM")
-         ):
-             time_acquired = format_time_acquired(metadata["acquisition-date"])
-         else:
-             # Check if the type of the data is DEM
-             if metadata and metadata["type"] and metadata["type"] in ("dem", "DEM"):
-                 time_acquired = None
-                 start_time = datetime.strptime("2011-01-01", "%Y-%m-%d")
-                 end_time = datetime.strptime("2015-01-07", "%Y-%m-%d")
-                 params["start_datetime"] = start_time
-                 params["end_datetime"] = end_time
-             else:
-                 # Set unknown date
-                 time_acquired = datetime.strptime("2000-01-01", "%Y-%m-%d")
-
-         # Obtain the item ID. The approach depends on the item parser
-         item_id = self._item_parser.get_item_id(raster_path)
-         # Add the item ID to the dataframe, to be able to get it later
-         self._stac_dataframe.loc[
-             self._stac_dataframe["image"] == raster_path, "id"
-         ] = item_id
-
-         # Instantiate pystac item
-         item = pystac.Item(
-             id=item_id, geometry=geom, bbox=bbox, datetime=time_acquired, **params
-         )
-
-         # Get the item info, from the raster path
-         item_info = self._stac_dataframe[self._stac_dataframe["image"] == raster_path]
-         # Get the extensions of the item
-         extensions = item_info["extensions"].values
-         extensions = extensions[0] if extensions else None
-
-         # Add the required extensions to the item
-         if extensions:
-             if isinstance(extensions, str):
-                 extensions = [extensions]
-             for extension in extensions:
-                 if extension not in SUPPORTED_EXTENSIONS:
-                     raise ValueError(f"Extension {extension} not supported")
-                 else:
-                     extension_obj = self._extensions_dict[extension]
-                     extension_obj.add_extension_to_object(item, item_info)
-
-         # Add the assets to the item
-         assets = self._assets_generator.extract_assets(item_info)
-         if not assets:
-             # If there are not assets using the selected generator, try with the default
-             assets = STACAssetGenerator.extract_assets(item_info)
+ import pystac
+ from datetime import datetime

-         # Add the assets to the item
-         if assets:
-             for asset in assets:
-                 if isinstance(asset, pystac.Asset):
-                     item.add_asset(asset.title, asset)
-                     # Add the required extensions to the asset if required
-                     if extensions:
-                         if isinstance(extensions, str):
-                             extensions = [extensions]
-                         for extension in extensions:
-                             if extension not in SUPPORTED_EXTENSIONS:
-                                 raise ValueError(f"Extension {extension} not supported")
-                             else:
-                                 extension_obj = self._extensions_dict[extension]
-                                 extension_obj.add_extension_to_object(asset, item_info)
-         item.set_self_href(join(dirname(raster_path), f"{item_id}.json"))
-         item.make_asset_hrefs_relative()
-         return item
+ def create_stac_catalog(parquet_catalog_path, stac_catalog = None):
+     # parse items and add to collection
+     table = pq.read_table(parquet_catalog_path)
+     items = []
+     for item in tqdm(stac_geoparquet.arrow.stac_table_to_items(table), total=len(table)):
+         item = pystac.Item.from_dict(item)
+         item.validate()
+         # collection.add_item(item)
+         if stac_catalog is not None:
+             stac_catalog.add_item(item)
+         else:
+             items.append(item)
+         # path = "data/stac/" + item["id"] + ".json"
+         # os.makedirs(os.path.dirname(path), exist_ok=True)
+         # with open(path, "w") as f:
+         #     json.dump(item, f)
+         # # save item
+         # os.makedirs(path, exist_ok=True)
+         # _path = path + '/' + item.id + ".json"
+         # os.makedirs(os.path.dirname(_path), exist_ok=True)
+         # with open(_path, "w") as f:
+         #     json.dump(item.to_dict(), f)
+     # save catalog
+     if stac_catalog is not None:
+         return stac_catalog
+     return items
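The replacement flow builds STAC items from a GeoParquet catalog via stac_geoparquet instead of walking image folders. A minimal usage sketch, assuming create_stac_catalog is importable from eotdl.curation.stac.stac and that a local catalog.v1.parquet file exists (both names are illustrative, not part of the diff):

    import pystac
    from eotdl.curation.stac.stac import create_stac_catalog

    # collect validated pystac.Item objects from the GeoParquet catalog
    items = create_stac_catalog("catalog.v1.parquet")

    # or add them directly to an existing pystac catalog
    catalog = pystac.Catalog(id="my-dataset", description="example catalog")
    catalog = create_stac_catalog("catalog.v1.parquet", stac_catalog=catalog)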
eotdl/datasets/__init__.py CHANGED
@@ -1,3 +1,3 @@
  from .retrieve import retrieve_datasets, retrieve_dataset, retrieve_dataset_files
  from .ingest import ingest_dataset
- from .download import download_dataset, download_file_url
+ from .stage import stage_dataset, stage_dataset_file
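The public datasets API now exposes stage_dataset and stage_dataset_file in place of download_dataset and download_file_url. A minimal sketch of the renamed entry point (the dataset name is illustrative):

    from eotdl.datasets import stage_dataset

    # stages the dataset's STAC GeoParquet catalog into the local cache and returns the path
    local_path = stage_dataset("EuroSAT-RGB")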
eotdl/datasets/ingest.py CHANGED
@@ -1,167 +1,36 @@
  from pathlib import Path
- import yaml
- from tqdm import tqdm
- import json
- import frontmatter
-
- from ..auth import with_auth
- from .metadata import Metadata
- from ..repos import DatasetsAPIRepo, FilesAPIRepo
- from ..files import ingest_files, create_new_version
- from ..curation.stac import STACDataFrame
- from ..shared import calculate_checksum
- from .update import update_dataset
- from .metadata import generate_metadata
-
-
- def ingest_dataset(
-     path,
-     verbose=False,
-     logger=print,
-     force_metadata_update=False,
-     sync_metadata=False,
- ):
-     path = Path(path)
-     if not path.is_dir():
-         raise Exception("Path must be a folder")
-     if "catalog.json" in [f.name for f in path.iterdir()]:
-         return ingest_stac(path / "catalog.json", logger)
-     return ingest_folder(path, verbose, logger, force_metadata_update, sync_metadata)

+ from ..repos import DatasetsAPIRepo
+ from ..files.ingest import prep_ingest_stac, prep_ingest_folder, ingest

  def retrieve_dataset(metadata, user):
-     repo = DatasetsAPIRepo()
-     data, error = repo.retrieve_dataset(metadata.name)
-     # print(data, error)
-     if data and data["uid"] != user["uid"]:
-         raise Exception("Dataset already exists.")
-     if error and error == "Dataset doesn't exist":
-         # create dataset
-         data, error = repo.create_dataset(metadata.dict(), user)
-         # print(data, error)
-         if error:
-             raise Exception(error)
-     data["id"] = data["dataset_id"]
-     return data
-
+     repo = DatasetsAPIRepo()
+     data, error = repo.retrieve_dataset(metadata.name)
+     # print(data, error)
+     if data and data["uid"] != user["uid"]:
+         raise Exception("Dataset already exists.")
+     if error and error == "Dataset doesn't exist":
+         # create dataset
+         data, error = repo.create_dataset(metadata.dict(), user)
+         # print(data, error)
+         if error:
+             raise Exception(error)
+     return data

- @with_auth
- def ingest_folder(
-     folder,
-     verbose=False,
-     logger=print,
-     force_metadata_update=False,
-     sync_metadata=False,
-     user=None,
- ):
-     repo = DatasetsAPIRepo()
-     try:
-         readme = frontmatter.load(folder.joinpath("README.md"))
-         metadata, content = readme.metadata, readme.content
-         metadata = Metadata(**metadata)
-     # except FileNotFoundError:
-     #     # load metadata (legacy)
-     #     metadata = (
-     #         yaml.safe_load(open(folder.joinpath("metadata.yml"), "r").read()) or {}
-     #     )
-     #     metadata = Metadata(**metadata)
-     #     content = None
-     except Exception as e:
-         print(str(e))
-         raise Exception("Error loading metadata")
-     # retrieve dataset (create if doesn't exist)
-     dataset = retrieve_dataset(metadata, user)
-
-     update_metadata = True
-     if "description" in dataset:
-         # do not do this if the dataset is new, only if it already exists
-         update_metadata = check_metadata(
-             dataset, metadata, content, force_metadata_update, sync_metadata, folder
-         )
-     if update_metadata:
-         update_dataset(dataset["id"], metadata, content, user)
-     return ingest_files(
-         repo, dataset["id"], folder, verbose, logger, user, endpoint="datasets"
-     )
-
-
- def check_metadata(
-     dataset, metadata, content, force_metadata_update, sync_metadata, folder
+ def ingest_dataset(
+     path,
+     verbose=False,
+     logger=print,
+     force_metadata_update=False,
+     sync_metadata=False,
  ):
-     if (
-         dataset["name"] != metadata.name
-         or dataset["description"] != content
-         or dataset["authors"] != metadata.authors
-         or dataset["source"] != metadata.source
-         or dataset["license"] != metadata.license
-         or dataset["thumbnail"] != metadata.thumbnail
-     ):
-         if not force_metadata_update and not sync_metadata:
-             raise Exception(
-                 "The provided metadata is not consistent with the current metadata. Use -f to force metadata update or -s to sync your local metadata."
-             )
-         if force_metadata_update:
-             return True
-         if sync_metadata:
-             generate_metadata(str(folder), dataset)
-             return False
-     return False
-
-
- def retrieve_stac_dataset(dataset_name, user):
-     repo = DatasetsAPIRepo()
-     data, error = repo.retrieve_dataset(dataset_name)
-     # print(data, error)
-     if data and data["uid"] != user["uid"]:
-         raise Exception("Dataset already exists.")
-     if error and error == "Dataset doesn't exist":
-         # create dataset
-         data, error = repo.create_stac_dataset(dataset_name, user)
-         # print(data, error)
-         if error:
-             raise Exception(error)
-     data["id"] = data["dataset_id"]
-     return data["id"]
+     path = Path(path)
+     if not path.is_dir():
+         raise Exception("Path must be a folder")
+     if "catalog.json" in [f.name for f in path.iterdir()]:
+         prep_ingest_stac(path, logger)
+     else:
+         prep_ingest_folder(path, verbose, logger, force_metadata_update, sync_metadata)
+     return ingest(path, DatasetsAPIRepo(), retrieve_dataset, 'datasets')


- @with_auth
- def ingest_stac(stac_catalog, logger=None, user=None):
-     repo, files_repo = DatasetsAPIRepo(), FilesAPIRepo()
-     # load catalog
-     logger("Loading STAC catalog...")
-     df = STACDataFrame.from_stac_file(stac_catalog)
-     catalog = df[df["type"] == "Catalog"]
-     assert len(catalog) == 1, "STAC catalog must have exactly one root catalog"
-     dataset_name = catalog.id.iloc[0]
-     # retrieve dataset (create if doesn't exist)
-     dataset_id = retrieve_stac_dataset(dataset_name, user)
-     # create new version
-     version = create_new_version(repo, dataset_id, user)
-     logger("New version created, version: " + str(version))
-     df2 = df.dropna(subset=["assets"])
-     for row in tqdm(df2.iterrows(), total=len(df2)):
-         try:
-             for k, v in row[1]["assets"].items():
-                 data, error = files_repo.ingest_file(
-                     v["href"],
-                     dataset_id,
-                     user,
-                     calculate_checksum(v["href"]), # is always absolute?
-                     "datasets",
-                     version,
-                 )
-                 if error:
-                     raise Exception(error)
-                 file_url = f"{repo.url}datasets/{data['dataset_id']}/download/{data['filename']}"
-                 df.loc[row[0], "assets"][k]["href"] = file_url
-         except Exception as e:
-             logger(f"Error uploading asset {row[0]}: {e}")
-             break
-     # ingest the STAC catalog into geodb
-     logger("Ingesting STAC catalog...")
-     data, error = repo.ingest_stac(json.loads(df.to_json()), dataset_id, user)
-     if error:
-         # TODO: delete all assets that were uploaded
-         raise Exception(error)
-     logger("Done")
-     return
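Ingestion is now a thin wrapper: the path is prepared with prep_ingest_stac (when a catalog.json is present) or prep_ingest_folder, and the upload itself is delegated to the shared eotdl.files.ingest.ingest helper. A minimal sketch, assuming a local folder laid out as the library expects (folder name and flags are illustrative):

    from eotdl.datasets import ingest_dataset

    # folder containing either metadata for the dataset or a STAC catalog.json at its root
    ingest_dataset("data/my-dataset", verbose=True)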
eotdl/datasets/retrieve.py CHANGED
@@ -25,12 +25,3 @@ def retrieve_dataset_files(dataset_id, version):
          raise Exception(error)
      return data

-
- # def list_datasets(pattern=None):
- #     datasets = retrieve_datasets()
- #     if pattern:
- #         regex = re.compile(rf".*{re.escape(pattern)}.*", re.IGNORECASE)
- #         names = list(datasets.keys())
- #         valid = [name for name in names if regex.search(name)]
- #         return {name: datasets[name] for name in valid}
- #     return datasets
eotdl/datasets/stage.py ADDED
@@ -0,0 +1,64 @@
+ import os
+ from pathlib import Path
+ from tqdm import tqdm
+ import geopandas as gpd
+
+ from ..auth import with_auth
+ from .retrieve import retrieve_dataset
+ from ..repos import FilesAPIRepo
+
+ @with_auth
+ def stage_dataset(
+     dataset_name,
+     version=None,
+     path=None,
+     logger=print,
+     assets=False,
+     force=False,
+     verbose=False,
+     user=None,
+     file=None,
+ ):
+     dataset = retrieve_dataset(dataset_name)
+     if version is None:
+         version = sorted([v['version_id'] for v in dataset["versions"]])[-1]
+     else:
+         assert version in [
+             v["version_id"] for v in dataset["versions"]
+         ], f"Version {version} not found"
+     download_base_path = os.getenv(
+         "EOTDL_DOWNLOAD_PATH", str(Path.home()) + "/.cache/eotdl/datasets"
+     )
+     if path is None:
+         download_path = download_base_path + "/" + dataset_name
+     else:
+         download_path = path + "/" + dataset_name
+     # check if dataset already exists
+     if os.path.exists(download_path) and not force:
+         os.makedirs(download_path, exist_ok=True)
+         # raise Exception(
+         #     f"Dataset `{dataset['name']} v{str(version)}` already exists at {download_path}. To force download, use force=True or -f in the CLI."
+         # )
+         raise Exception(
+             f"Dataset `{dataset['name']}` already exists at {download_path}. To force download, use force=True or -f in the CLI."
+         )
+
+     # stage metadata
+     repo = FilesAPIRepo()
+     catalog_path = repo.stage_file(dataset["id"], f"catalog.v{version}.parquet", user, download_path)
+
+     # TODO: stage README.md
+
+     if assets:
+         gdf = gpd.read_parquet(catalog_path)
+         for _, row in tqdm(gdf.iterrows(), total=len(gdf), desc="Staging assets"):
+             for k, v in row["assets"].items():
+                 stage_dataset_file(v["href"], download_path)
+
+     return download_path
+
+
+ @with_auth
+ def stage_dataset_file(file_url, path, user):
+     repo = FilesAPIRepo()
+     return repo.stage_file_url(file_url, path, user)
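By default stage_dataset only pulls the catalog.v&lt;version&gt;.parquet file; assets are staged either with assets=True or on demand from the staged catalog. A sketch of the manual route, assuming a version 1 catalog and the asset layout shown above (dataset name and version are illustrative):

    import geopandas as gpd
    from eotdl.datasets import stage_dataset, stage_dataset_file

    path = stage_dataset("EuroSAT-RGB")                  # stages the catalog only
    gdf = gpd.read_parquet(path + "/catalog.v1.parquet") # version suffix depends on the dataset
    for _, row in gdf.iterrows():
        for name, asset in row["assets"].items():
            stage_dataset_file(asset["href"], path)      # fetch individual assets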
eotdl/files/__init__.py CHANGED
@@ -1,2 +0,0 @@
- from .ingest import ingest_files, create_new_version
- from .list_files import list_files