eotdl 2023.11.2.post5-py3-none-any.whl → 2023.11.3.post2-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
Files changed (58)
  1. eotdl/__init__.py +1 -1
  2. eotdl/access/__init__.py +6 -3
  3. eotdl/access/airbus/__init__.py +5 -1
  4. eotdl/access/airbus/client.py +356 -338
  5. eotdl/access/airbus/parameters.py +19 -4
  6. eotdl/access/airbus/utils.py +26 -21
  7. eotdl/access/download.py +30 -14
  8. eotdl/access/search.py +17 -6
  9. eotdl/access/sentinelhub/__init__.py +5 -1
  10. eotdl/access/sentinelhub/client.py +57 -54
  11. eotdl/access/sentinelhub/evalscripts.py +38 -39
  12. eotdl/access/sentinelhub/parameters.py +43 -23
  13. eotdl/access/sentinelhub/utils.py +38 -28
  14. eotdl/auth/errors.py +2 -1
  15. eotdl/commands/auth.py +3 -3
  16. eotdl/curation/__init__.py +5 -1
  17. eotdl/curation/stac/__init__.py +5 -1
  18. eotdl/curation/stac/assets.py +55 -32
  19. eotdl/curation/stac/dataframe.py +20 -14
  20. eotdl/curation/stac/dataframe_bck.py +2 -2
  21. eotdl/curation/stac/dataframe_labeling.py +15 -12
  22. eotdl/curation/stac/extensions/__init__.py +6 -2
  23. eotdl/curation/stac/extensions/base.py +8 -4
  24. eotdl/curation/stac/extensions/dem.py +6 -3
  25. eotdl/curation/stac/extensions/eo.py +10 -6
  26. eotdl/curation/stac/extensions/label/__init__.py +5 -1
  27. eotdl/curation/stac/extensions/label/base.py +40 -26
  28. eotdl/curation/stac/extensions/label/image_name_labeler.py +64 -43
  29. eotdl/curation/stac/extensions/label/scaneo.py +59 -56
  30. eotdl/curation/stac/extensions/ml_dataset.py +154 -56
  31. eotdl/curation/stac/extensions/projection.py +11 -9
  32. eotdl/curation/stac/extensions/raster.py +22 -14
  33. eotdl/curation/stac/extensions/sar.py +12 -7
  34. eotdl/curation/stac/extent.py +67 -40
  35. eotdl/curation/stac/parsers.py +18 -10
  36. eotdl/curation/stac/stac.py +81 -62
  37. eotdl/datasets/__init__.py +1 -1
  38. eotdl/datasets/download.py +42 -55
  39. eotdl/datasets/ingest.py +68 -11
  40. eotdl/files/__init__.py +1 -1
  41. eotdl/files/ingest.py +3 -1
  42. eotdl/models/download.py +1 -1
  43. eotdl/repos/AuthAPIRepo.py +0 -1
  44. eotdl/repos/DatasetsAPIRepo.py +22 -146
  45. eotdl/repos/FilesAPIRepo.py +7 -92
  46. eotdl/repos/ModelsAPIRepo.py +0 -1
  47. eotdl/tools/__init__.py +5 -1
  48. eotdl/tools/geo_utils.py +78 -48
  49. eotdl/tools/metadata.py +13 -11
  50. eotdl/tools/paths.py +14 -14
  51. eotdl/tools/stac.py +36 -31
  52. eotdl/tools/time_utils.py +53 -26
  53. eotdl/tools/tools.py +84 -50
  54. {eotdl-2023.11.2.post5.dist-info → eotdl-2023.11.3.post2.dist-info}/METADATA +5 -3
  55. eotdl-2023.11.3.post2.dist-info/RECORD +84 -0
  56. eotdl-2023.11.2.post5.dist-info/RECORD +0 -84
  57. {eotdl-2023.11.2.post5.dist-info → eotdl-2023.11.3.post2.dist-info}/WHEEL +0 -0
  58. {eotdl-2023.11.2.post5.dist-info → eotdl-2023.11.3.post2.dist-info}/entry_points.txt +0 -0
eotdl/datasets/ingest.py CHANGED
@@ -1,18 +1,22 @@
  from pathlib import Path
  import yaml
+ from tqdm import tqdm
+ import json

  from ..auth import with_auth
  from .metadata import Metadata
- from ..repos import DatasetsAPIRepo
- from ..files import ingest_files
+ from ..repos import DatasetsAPIRepo, FilesAPIRepo
+ from ..files import ingest_files, create_new_version
+ from ..curation.stac import STACDataFrame
+ from ..shared import calculate_checksum


  def ingest_dataset(path, verbose=False, logger=print):
      path = Path(path)
      if not path.is_dir():
          raise Exception("Path must be a folder")
-     # if "catalog.json" in [f.name for f in path.iterdir()]:
-     #     return ingest_stac(path / "catalog.json", logger)
+     if "catalog.json" in [f.name for f in path.iterdir()]:
+         return ingest_stac(path / "catalog.json", logger)
      return ingest_folder(path, verbose, logger)


@@ -46,10 +50,63 @@ def ingest_folder(folder, verbose=False, logger=print, user=None):
      )


- # @with_auth
- # def ingest_stac(stac_catalog, logger=None, user=None):
- #     api_repo = APIRepo()
- #     ingest = IngestSTAC(api_repo, ingest_file, logger)
- #     inputs = ingest.Inputs(stac_catalog=stac_catalog, user=user)
- #     outputs = ingest(inputs)
- #     return outputs.dataset
+ def retrieve_stac_dataset(dataset_name, user):
+     repo = DatasetsAPIRepo()
+     data, error = repo.retrieve_dataset(dataset_name)
+     # print(data, error)
+     if data and data["uid"] != user["sub"]:
+         raise Exception("Dataset already exists.")
+     if error and error == "Dataset doesn't exist":
+         # create dataset
+         data, error = repo.create_stac_dataset(dataset_name, user["id_token"])
+         # print(data, error)
+         if error:
+             raise Exception(error)
+         data["id"] = data["dataset_id"]
+     return data["id"]
+
+
+ @with_auth
+ def ingest_stac(stac_catalog, logger=None, user=None):
+     repo, files_repo = DatasetsAPIRepo(), FilesAPIRepo()
+     # load catalog
+     logger("Loading STAC catalog...")
+     df = STACDataFrame.from_stac_file(stac_catalog)
+     catalog = df[df["type"] == "Catalog"]
+     assert len(catalog) == 1, "STAC catalog must have exactly one root catalog"
+     dataset_name = catalog.id.iloc[0]
+     # retrieve dataset (create if doesn't exist)
+     dataset_id = retrieve_stac_dataset(dataset_name, user)
+     # create new version
+     version = create_new_version(repo, dataset_id, user)
+     logger("New version created, version: " + str(version))
+     df2 = df.dropna(subset=["assets"])
+     for row in tqdm(df2.iterrows(), total=len(df2)):
+         # for asset in df.assets.dropna().values[:10]:
+         try:
+             for k, v in row[1]["assets"].items():
+                 data, error = files_repo.ingest_file(
+                     v["href"],
+                     dataset_id,
+                     user["id_token"],
+                     calculate_checksum(v["href"]), # is always absolute?
+                     "datasets",
+                     version,
+                 )
+                 if error:
+                     raise Exception(error)
+                 file_url = f"{repo.url}datasets/{data['dataset_id']}/download/{data['filename']}"
+                 df.loc[row[0], "assets"][k]["href"] = file_url
+         except Exception as e:
+             logger(f"Error uploading asset {row[0]}: {e}")
+             break
+     # ingest the STAC catalog into geodb
+     logger("Ingesting STAC catalog...")
+     data, error = repo.ingest_stac(
+         json.loads(df.to_json()), dataset_id, user["id_token"]
+     )
+     if error:
+         # TODO: delete all assets that were uploaded
+         raise Exception(error)
+     logger("Done")
+     return
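With the catalog.json branch re-enabled above, pointing ingest_dataset at a folder that contains a root STAC catalog is now routed to the new ingest_stac function instead of ingest_folder. A minimal usage sketch, assuming ingest_dataset is still re-exported from eotdl.datasets and using a hypothetical folder layout:

from eotdl.datasets import ingest_dataset

# Hypothetical layout:
#   data/my-dataset/catalog.json   <- root STAC catalog (exactly one "Catalog" row)
#   data/my-dataset/source/...     <- items and assets referenced by the catalog
ingest_dataset("data/my-dataset", verbose=True)
# Because catalog.json is present, ingest_stac() uploads every asset through
# FilesAPIRepo.ingest_file, rewrites each asset href to its download URL,
# and finally pushes the catalog with DatasetsAPIRepo.ingest_stac.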
eotdl/files/__init__.py CHANGED
@@ -1 +1 @@
- from .ingest import ingest_files
+ from .ingest import ingest_files, create_new_version
eotdl/files/ingest.py CHANGED
@@ -8,7 +8,6 @@ import os

  from ..repos import FilesAPIRepo
  from ..shared import calculate_checksum
- from ..shared import calculate_checksum


  def retrieve_files(folder):
@@ -64,6 +63,7 @@ def generate_files_lists(
  ):
      files_repo = FilesAPIRepo()
      current_files, error = files_repo.retrieve_files(dataset_or_model_id, endpoint)
+     print(current_files)
      # print(len(current_files), len(items) - len(current_files))
      # print(current_files, error)
      if error:
@@ -82,6 +82,7 @@ def generate_files_lists(
              large_files.append(data)
          else:
              upload_files.append(data)
+     # TODO: should ingest new version if files removed
      if len(upload_files) == 0 and len(large_files) == 0:
          raise Exception("No new files to upload")
      return upload_files, existing_files, large_files
@@ -98,6 +99,7 @@ def ingest_files(repo, dataset_or_model_id, folder, verbose, logger, user, endpo
      files_repo = FilesAPIRepo()
      logger(f"Uploading directory {folder}...")
      items = retrieve_files(folder)
+     print(items)
      # retrieve files
      upload_files, existing_files, large_files = generate_files_lists(
          items, folder, dataset_or_model_id, endpoint, logger
eotdl/models/download.py CHANGED
@@ -76,7 +76,7 @@ def download_model(
          # if calculate_checksum(dst_path) != checksum:
          #     logger(f"Checksum for {file} does not match")
          if verbose:
-             logger(f"Done")
+             logger("Done")
          return "/".join(dst_path.split("/")[:-1])
      else:
          raise NotImplementedError("Downloading a STAC model is not implemented")
eotdl/repos/AuthAPIRepo.py CHANGED
@@ -1,5 +1,4 @@
  import requests
- import os

  from .APIRepo import APIRepo

eotdl/repos/DatasetsAPIRepo.py CHANGED
@@ -1,5 +1,5 @@
  import requests
- import os
+ import geopandas as gpd

  from ..repos import APIRepo

@@ -39,126 +39,29 @@ class DatasetsAPIRepo(APIRepo):
          )
          return self.format_response(response)

-     # def create_stac_dataset(self, name, id_token):
-     #     response = requests.post(
-     #         self.url + "datasets/stac",
-     #         json={"name": name},
-     #         headers={"Authorization": "Bearer " + id_token},
-     #     )
-     #     if response.status_code == 200:
-     #         return response.json(), None
-     #     return None, response.json()["detail"]
-
-     # def download_file(self, dataset, dataset_id, file, id_token, path):
-     #     url = self.url + "datasets/" + dataset_id + "/download/" + file
-     #     return self.download_file_url(url, path, id_token, progress=True)
-
-     # def download_file_url(self, url, path, id_token, progress=False):
-     #     headers = {"Authorization": "Bearer " + id_token}
-     #     filename = url.split("/")[-1]
-     #     os.makedirs(path, exist_ok=True)
-     #     path = f"{path}/{filename}"
-     #     with requests.get(url, headers=headers, stream=True) as r:
-     #         r.raise_for_status()
-     #         total_size = int(r.headers.get("content-length", 0))
-     #         block_size = 1024 * 1024 * 10
-     #         if progress:
-     #             progress_bar = tqdm(
-     #                 total=total_size, unit="iB", unit_scale=True, unit_divisor=1024
-     #             )
-     #         with open(path, "wb") as f:
-     #             for chunk in r.iter_content(block_size):
-     #                 if progress:
-     #                     progress_bar.update(len(chunk))
-     #                 if chunk:
-     #                     f.write(chunk)
-     #         if progress:
-     #             progress_bar.close()
-     #     return path
-
-     # def ingest_file_url(self, file, dataset, id_token):
-     #     reponse = requests.post(
-     #         self.url + f"datasets/{dataset}/url",
-     #         json={"url": file},
-     #         headers={"Authorization": "Bearer " + id_token},
-     #     )
-     #     if reponse.status_code != 200:
-     #         return None, reponse.json()["detail"]
-     #     return reponse.json(), None
-
-     # def read_in_chunks(self, file_object, CHUNK_SIZE):
-     #     while True:
-     #         data = file_object.read(CHUNK_SIZE)
-     #         if not data:
-     #             break
-     #         yield data
-
-     # def prepare_large_upload(self, file, dataset_id, checksum, id_token):
-     #     filename = Path(file).name
-     #     response = requests.post(
-     #         self.url + f"datasets/{dataset_id}/uploadId",
-     #         json={"name": filename, "checksum": checksum},
-     #         headers={"Authorization": "Bearer " + id_token},
-     #     )
-     #     if response.status_code != 200:
-     #         raise Exception(response.json()["detail"])
-     #     data = response.json()
-     #     upload_id, parts = (
-     #         data["upload_id"],
-     #         data["parts"] if "parts" in data else [],
-     #     )
-     #     return upload_id, parts
-
-     # def get_chunk_size(self, content_size):
-     #     # adapt chunk size to content size to avoid S3 limits (10000 parts, 500MB per part, 5TB per object)
-     #     chunk_size = 1024 * 1024 * 10 # 10 MB (up to 100 GB, 10000 parts)
-     #     if content_size >= 1024 * 1024 * 1024 * 100: # 100 GB
-     #         chunk_size = 1024 * 1024 * 100 # 100 MB (up to 1 TB, 10000 parts)
-     #     elif content_size >= 1024 * 1024 * 1024 * 1000: # 1 TB
-     #         chunk_size = 1024 * 1024 * 500 # 0.5 GB (up to 5 TB, 10000 parts)
-     #     return chunk_size
+     def create_stac_dataset(self, name, id_token):
+         response = requests.post(
+             self.url + "datasets/stac",
+             json={"name": name},
+             headers={"Authorization": "Bearer " + id_token},
+         )
+         return self.format_response(response)

-     # def ingest_large_dataset(self, file, upload_id, id_token, parts):
-     #     content_path = os.path.abspath(file)
-     #     content_size = os.stat(content_path).st_size
-     #     chunk_size = self.get_chunk_size(content_size)
-     #     total_chunks = content_size // chunk_size
-     #     # upload chunks sequentially
-     #     pbar = tqdm(
-     #         self.read_in_chunks(open(content_path, "rb"), chunk_size),
-     #         total=total_chunks,
-     #     )
-     #     index = 0
-     #     for chunk in pbar:
-     #         part = index // chunk_size + 1
-     #         offset = index + len(chunk)
-     #         index = offset
-     #         if part not in parts:
-     #             checksum = hashlib.md5(chunk).hexdigest()
-     #             response = requests.post(
-     #                 self.url + "datasets/chunk/" + upload_id,
-     #                 files={"file": chunk},
-     #                 data={"part_number": part, "checksum": checksum},
-     #                 headers={"Authorization": "Bearer " + id_token},
-     #             )
-     #             if response.status_code != 200:
-     #                 raise Exception(response.json()["detail"])
-     #         pbar.set_description(
-     #             "{:.2f}/{:.2f} MB".format(
-     #                 offset / 1024 / 1024, content_size / 1024 / 1024
-     #             )
-     #         )
-     #     pbar.close()
-     #     return
+     def ingest_stac(self, stac_json, dataset_id, id_token):
+         response = requests.put(
+             self.url + f"datasets/stac/{dataset_id}",
+             json={"stac": stac_json},
+             headers={"Authorization": "Bearer " + id_token},
+         )
+         return self.format_response(response)

-     # def complete_upload(self, id_token, upload_id):
-     #     r = requests.post(
-     #         self.url + "datasets/complete/" + upload_id,
-     #         headers={"Authorization": "Bearer " + id_token},
-     #     )
-     #     if r.status_code != 200:
-     #         return None, r.json()["detail"]
-     #     return r.json(), None
+     def download_stac(self, dataset_id, id_token):
+         url = self.url + "datasets/" + dataset_id + "/download"
+         headers = {"Authorization": "Bearer " + id_token}
+         response = requests.get(url, headers=headers)
+         if response.status_code != 200:
+             return None, response.json()["detail"]
+         return gpd.GeoDataFrame.from_features(response.json()["features"]), None

      # def update_dataset(self, name, path, id_token, checksum):
      #     # check that dataset exists
@@ -220,30 +123,3 @@ class DatasetsAPIRepo(APIRepo):
      #     if r.status_code != 200:
      #         return None, r.json()["detail"]
      #     return r.json(), None
-
-     # def delete_file(self, dataset_id, file_name, id_token):
-     #     response = requests.delete(
-     #         self.url + "datasets/" + dataset_id + "/file/" + file_name,
-     #         headers={"Authorization": "Bearer " + id_token},
-     #     )
-     #     if response.status_code != 200:
-     #         return None, response.json()["detail"]
-     #     return response.json(), None
-
-     # def ingest_stac(self, stac_json, dataset_id, id_token):
-     #     reponse = requests.put(
-     #         self.url + f"datasets/stac/{dataset_id}",
-     #         json={"stac": stac_json},
-     #         headers={"Authorization": "Bearer " + id_token},
-     #     )
-     #     if reponse.status_code != 200:
-     #         return None, reponse.json()["detail"]
-     #     return reponse.json(), None
-
-     # def download_stac(self, dataset_id, id_token):
-     #     url = self.url + "datasets/" + dataset_id + "/download"
-     #     headers = {"Authorization": "Bearer " + id_token}
-     #     response = requests.get(url, headers=headers)
-     #     if response.status_code != 200:
-     #         return None, response.json()["detail"]
-     #     return gpd.GeoDataFrame.from_features(response.json()["features"]), None
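The create_stac_dataset, ingest_stac and download_stac methods above replace their commented-out predecessors: the first two now return through format_response, i.e. the repo's usual (data, error) pair as used in eotdl/datasets/ingest.py, while download_stac still builds a GeoDataFrame from the returned feature collection. A hedged sketch of a caller, with the dataset name, id_token and the session handling as placeholders:

from eotdl.repos import DatasetsAPIRepo

repo = DatasetsAPIRepo()

# register an (empty) STAC dataset; (data, error) follows the format_response convention
data, error = repo.create_stac_dataset("my-stac-dataset", id_token)
if error:
    raise Exception(error)

# later, pull the ingested catalog back as a GeoDataFrame
gdf, error = repo.download_stac(data["dataset_id"], id_token)
if error:
    raise Exception(error)
print(gdf.head())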
eotdl/repos/FilesAPIRepo.py CHANGED
@@ -19,7 +19,7 @@ class FilesAPIRepo(APIRepo):
          endpoint,
          version=None,
      ):
-         url = self.url + f"{endpoint}/{dataset_or_model_id}"
+         url = self.url + f"{endpoint}/{dataset_or_model_id}/batch"
          if version is not None:
              url += "?version=" + str(version)
          reponse = requests.post(
@@ -49,21 +49,15 @@ class FilesAPIRepo(APIRepo):
          return self.format_response(reponse)

      def ingest_file(
-         self,
-         file,
-         dataset_or_model_id,
-         version,
-         parent,
-         id_token,
-         checksum,
-         endpoint,
+         self, file, dataset_or_model_id, id_token, checksum, endpoint, version=None
      ):
+         url = self.url + f"{endpoint}/{dataset_or_model_id}"
+         if version is not None:
+             url += "?version=" + str(version)
          reponse = requests.post(
-             self.url + f"{endpoint}/{dataset_or_model_id}",
+             url,
              files={"file": open(file, "rb")},
-             data={"checksum": checksum, "version": version, "parent": parent}
-             if checksum
-             else None,
+             data={"checksum": checksum},
              headers={"Authorization": "Bearer " + id_token},
          )
          return self.format_response(reponse)
@@ -205,67 +199,6 @@ class FilesAPIRepo(APIRepo):
          )
          return self.format_response(r)

-     # def update_dataset(self, name, path, id_token, checksum):
-     #     # check that dataset exists
-     #     data, error = self.retrieve_dataset(name)
-     #     if error:
-     #         return None, error
-     #     # first call to get upload id
-     #     dataset_id = data["id"]
-     #     url = self.url + f"datasets/chunk/{dataset_id}?checksum={checksum}"
-     #     response = requests.get(url, headers={"Authorization": "Bearer " + id_token})
-     #     if response.status_code != 200:
-     #         return None, response.json()["detail"]
-     #     data = response.json()
-     #     _, upload_id, parts = data["dataset_id"], data["upload_id"], data["parts"]
-     #     # assert dataset_id is None
-     #     content_path = os.path.abspath(path)
-     #     content_size = os.stat(content_path).st_size
-     #     url = self.url + "datasets/chunk"
-     #     chunk_size = 1024 * 1024 * 100 # 100 MiB
-     #     total_chunks = content_size // chunk_size
-     #     headers = {
-     #         "Authorization": "Bearer " + id_token,
-     #         "Upload-Id": upload_id,
-     #         "Dataset-Id": dataset_id,
-     #     }
-     #     # upload chunks sequentially
-     #     pbar = tqdm(
-     #         self.read_in_chunks(open(content_path, "rb"), chunk_size),
-     #         total=total_chunks,
-     #     )
-     #     index = 0
-     #     for chunk in pbar:
-     #         offset = index + len(chunk)
-     #         part = index // chunk_size + 1
-     #         index = offset
-     #         if part not in parts:
-     #             headers["Part-Number"] = str(part)
-     #             file = {"file": chunk}
-     #             r = requests.post(url, files=file, headers=headers)
-     #             if r.status_code != 200:
-     #                 return None, r.json()["detail"]
-     #         pbar.set_description(
-     #             "{:.2f}/{:.2f} MB".format(
-     #                 offset / 1024 / 1024, content_size / 1024 / 1024
-     #             )
-     #         )
-     #     pbar.close()
-     #     # complete upload
-     #     url = self.url + "datasets/complete"
-     #     r = requests.post(
-     #         url,
-     #         json={"checksum": checksum},
-     #         headers={
-     #             "Authorization": "Bearer " + id_token,
-     #             "Upload-Id": upload_id,
-     #             "Dataset-Id": dataset_id,
-     #         },
-     #     )
-     #     if r.status_code != 200:
-     #         return None, r.json()["detail"]
-     #     return r.json(), None
-
      # def delete_file(self, dataset_id, file_name, id_token):
      #     response = requests.delete(
      #         self.url + "datasets/" + dataset_id + "/file/" + file_name,
@@ -274,21 +207,3 @@ class FilesAPIRepo(APIRepo):
      #     if response.status_code != 200:
      #         return None, response.json()["detail"]
      #     return response.json(), None
-
-     # def ingest_stac(self, stac_json, dataset_id, id_token):
-     #     reponse = requests.put(
-     #         self.url + f"datasets/stac/{dataset_id}",
-     #         json={"stac": stac_json},
-     #         headers={"Authorization": "Bearer " + id_token},
-     #     )
-     #     if reponse.status_code != 200:
-     #         return None, reponse.json()["detail"]
-     #     return reponse.json(), None
-
-     # def download_stac(self, dataset_id, id_token):
-     #     url = self.url + "datasets/" + dataset_id + "/download"
-     #     headers = {"Authorization": "Bearer " + id_token}
-     #     response = requests.get(url, headers=headers)
-     #     if response.status_code != 200:
-     #         return None, response.json()["detail"]
-     #     return gpd.GeoDataFrame.from_features(response.json()["features"]), None
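The new ingest_file signature matches the call made from ingest_stac above: id_token, checksum and endpoint are now passed positionally, and version becomes an optional query parameter instead of form data (the parent field is dropped). A minimal sketch of a direct call, where the file path, dataset_id, id_token and version are placeholders:

from eotdl.repos import FilesAPIRepo
from eotdl.shared import calculate_checksum

files_repo = FilesAPIRepo()
data, error = files_repo.ingest_file(
    "data/my-dataset/source/B01.tif",                      # local asset path (hypothetical)
    dataset_id,                                            # id of the target dataset
    id_token,                                              # token from the authenticated session
    calculate_checksum("data/my-dataset/source/B01.tif"),  # checksum sent as form data
    "datasets",                                            # endpoint, as used by ingest_stac
    version,                                               # optional; appended as ?version=N
)
if error:
    raise Exception(error)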
eotdl/repos/ModelsAPIRepo.py CHANGED
@@ -1,5 +1,4 @@
  import requests
- import os

  from ..repos import APIRepo

eotdl/tools/__init__.py CHANGED
@@ -1,6 +1,10 @@
+ """
+ Tools module for eotdl package.
+ """
+
  from .stac import *
  from .tools import *
  from .geo_utils import *
  from .time_utils import *
  from .metadata import *
- from .paths import *
+ from .paths import *