eotdl 2023.6.14.post10__py3-none-any.whl → 2023.7.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,11 +6,13 @@ from concurrent.futures import ThreadPoolExecutor
  import time
  import multiprocessing
  import hashlib
+ import geopandas as gpd


  class APIRepo:
      def __init__(self, url=os.getenv("EOTDL_API_URL", "https://api.eotdl.com/")):
          self.url = url
+         # print(self.url)

      def login(self):
          return requests.get(self.url + "auth/login")
@@ -22,6 +24,16 @@ class APIRepo:
          response = requests.get(self.url + "auth/logout")
          return response.json()["logout_url"]

+     def create_dataset(self, metadata, id_token):
+         response = requests.post(
+             self.url + "datasets",
+             json=metadata,
+             headers={"Authorization": "Bearer " + id_token},
+         )
+         if response.status_code == 200:
+             return response.json(), None
+         return None, response.json()["detail"]
+
      def retrieve_datasets(self):
          return requests.get(self.url + "datasets").json()

@@ -31,12 +43,10 @@ class APIRepo:
              return response.json(), None
          return None, response.json()["detail"]

-     def download_dataset(self, dataset_id, id_token, path):
-         url = self.url + "datasets/" + dataset_id + "/download"
+     def download_file(self, dataset, dataset_id, file, id_token, path):
+         url = self.url + "datasets/" + dataset_id + "/download/" + file
          headers = {"Authorization": "Bearer " + id_token}
-         if path is None:
-             path = str(Path.home()) + "/.eotdl/datasets"
-             os.makedirs(path, exist_ok=True)
+         path = f"{path}/{file}"
          with requests.get(url, headers=headers, stream=True) as r:
              r.raise_for_status()
              total_size = int(r.headers.get("content-length", 0))
@@ -44,10 +54,6 @@ class APIRepo:
              progress_bar = tqdm(
                  total=total_size, unit="iB", unit_scale=True, unit_divisor=1024
              )
-             filename = r.headers.get("content-disposition").split("filename=")[1][1:-1]
-             path = f"{path}/{filename}"
-             if os.path.exists(path):
-                 raise Exception("File already exists")
              with open(path, "wb") as f:
                  for chunk in r.iter_content(block_size):
                      progress_bar.update(len(chunk))
@@ -56,6 +62,27 @@ class APIRepo:
              progress_bar.close()
              return path

+     def ingest_file(self, file, dataset_id, id_token, checksum=None):
+         reponse = requests.post(
+             self.url + "datasets/" + dataset_id,
+             files={"file": open(file, "rb")},
+             data={"checksum": checksum} if checksum else None,
+             headers={"Authorization": "Bearer " + id_token},
+         )
+         if reponse.status_code != 200:
+             return None, reponse.json()["detail"]
+         return reponse.json(), None
+
+     def ingest_file_url(self, file, dataset, id_token):
+         reponse = requests.post(
+             self.url + "datasets/url",
+             json={"dataset": dataset, "url": file},
+             headers={"Authorization": "Bearer " + id_token},
+         )
+         if reponse.status_code != 200:
+             return None, reponse.json()["detail"]
+         return reponse.json(), None
+
      def read_in_chunks(self, file_object, CHUNK_SIZE):
          while True:
              data = file_object.read(CHUNK_SIZE)
@@ -63,18 +90,21 @@ class APIRepo:
                  break
              yield data

-     def prepare_large_upload(self, name, id_token, checksum):
-         url = self.url + "datasets/chunk?name=" + name + "&checksum=" + checksum
-         response = requests.get(url, headers={"Authorization": "Bearer " + id_token})
+     def prepare_large_upload(self, file, dataset_id, checksum, id_token):
+         filename = Path(file).name
+         response = requests.post(
+             self.url + f"datasets/{dataset_id}/uploadId",
+             json={"name": filename, "checksum": checksum},
+             headers={"Authorization": "Bearer " + id_token},
+         )
          if response.status_code != 200:
              raise Exception(response.json()["detail"])
          data = response.json()
-         dataset_id, upload_id, parts = (
-             data["dataset_id"],
+         upload_id, parts = (
              data["upload_id"],
              data["parts"] if "parts" in data else [],
          )
-         return dataset_id, upload_id, parts
+         return upload_id, parts

      def get_chunk_size(self, content_size):
          # adapt chunk size to content size to avoid S3 limits (10000 parts, 500MB per part, 5TB per object)
@@ -85,37 +115,31 @@ class APIRepo:
          chunk_size = 1024 * 1024 * 500 # 0.5 GB (up to 5 TB, 10000 parts)
          return chunk_size

-     def ingest_large_dataset(self, path, upload_id, dataset_id, id_token, parts):
-         content_path = os.path.abspath(path)
+     def ingest_large_dataset(self, file, upload_id, id_token, parts):
+         content_path = os.path.abspath(file)
          content_size = os.stat(content_path).st_size
          chunk_size = self.get_chunk_size(content_size)
          total_chunks = content_size // chunk_size
-         url = self.url + "datasets/chunk"
-         headers = {
-             "Authorization": "Bearer " + id_token,
-             "Upload-Id": upload_id,
-             "Dataset-Id": dataset_id,
-         }
          # upload chunks sequentially
          pbar = tqdm(
              self.read_in_chunks(open(content_path, "rb"), chunk_size),
              total=total_chunks,
          )
          index = 0
-         parts_checkusms = []
          for chunk in pbar:
              part = index // chunk_size + 1
              offset = index + len(chunk)
              index = offset
              if part not in parts:
-                 headers["Part-Number"] = str(part)
                  checksum = hashlib.md5(chunk).hexdigest()
-                 parts_checkusms.append(checksum)
-                 headers["Checksum"] = checksum
-                 file = {"file": chunk}
-                 r = requests.post(url, files=file, headers=headers)
-                 if r.status_code != 200:
-                     return None, r.json()["detail"]
+                 response = requests.post(
+                     self.url + "datasets/chunk/" + upload_id,
+                     files={"file": chunk},
+                     data={"part_number": part, "checksum": checksum},
+                     headers={"Authorization": "Bearer " + id_token},
+                 )
+                 if response.status_code != 200:
+                     raise Exception(response.json()["detail"])
              pbar.set_description(
                  "{:.2f}/{:.2f} MB".format(
                      offset / 1024 / 1024, content_size / 1024 / 1024
@@ -124,16 +148,10 @@ class APIRepo:
          pbar.close()
          return

-     def complete_upload(self, name, id_token, upload_id, dataset_id, checksum):
-         url = self.url + "datasets/complete"
+     def complete_upload(self, id_token, upload_id):
          r = requests.post(
-             url,
-             json={"name": name, "checksum": checksum},
-             headers={
-                 "Authorization": "Bearer " + id_token,
-                 "Upload-Id": upload_id,
-                 "Dataset-Id": dataset_id,
-             },
+             self.url + "datasets/complete/" + upload_id,
+             headers={"Authorization": "Bearer " + id_token},
          )
          if r.status_code != 200:
              return None, r.json()["detail"]
@@ -200,63 +218,29 @@ class APIRepo:
              return None, r.json()["detail"]
          return r.json(), None

-     def ingest_large_dataset_parallel(
-         self,
-         path,
-         upload_id,
-         dataset_id,
-         id_token,
-         parts,
-         threads,
-     ):
-         # Create thread pool executor
-         max_workers = threads if threads > 0 else multiprocessing.cpu_count()
-         executor = ThreadPoolExecutor(max_workers=max_workers)
-
-         # Divide file into chunks and create tasks for each chunk
-         offset = 0
-         tasks = []
-         content_path = os.path.abspath(path)
-         content_size = os.stat(content_path).st_size
-         chunk_size = self.get_chunk_size(content_size)
-         total_chunks = content_size // chunk_size
-         while offset < content_size:
-             chunk_end = min(offset + chunk_size, content_size)
-             part = str(offset // chunk_size + 1)
-             if part not in parts:
-                 tasks.append((offset, chunk_end, part))
-             offset = chunk_end
-
-         # Define the function that will upload each chunk
-         def upload_chunk(start, end, part):
-             # print(f"Uploading chunk {start} - {end}", part)
-             with open(content_path, "rb") as f:
-                 f.seek(start)
-                 chunk = f.read(end - start)
-             checksum = hashlib.md5(chunk).hexdigest()
-             response = requests.post(
-                 self.url + "datasets/chunk",
-                 files={"file": chunk},
-                 headers={
-                     "Authorization": "Bearer " + id_token,
-                     "Upload-Id": upload_id,
-                     "Dataset-Id": dataset_id,
-                     "Checksum": checksum,
-                     "Part-Number": str(part),
-                 },
-             )
-             if response.status_code != 200:
-                 print(f"Failed to upload chunk {start} - {end}")
-             return response
+     def delete_file(self, dataset_id, file_name, id_token):
+         response = requests.delete(
+             self.url + "datasets/" + dataset_id + "/file/" + file_name,
+             headers={"Authorization": "Bearer " + id_token},
+         )
+         if response.status_code != 200:
+             return None, response.json()["detail"]
+         return response.json(), None

-         # Submit each task to the executor
-         with tqdm(total=total_chunks) as pbar:
-             futures = []
-             for task in tasks:
-                 future = executor.submit(upload_chunk, *task)
-                 future.add_done_callback(lambda p: pbar.update())
-                 futures.append(future)
+     def ingest_stac(self, stac_json, dataset, id_token):
+         reponse = requests.post(
+             self.url + "datasets/stac",
+             json={"dataset": dataset, "stac": stac_json},
+             headers={"Authorization": "Bearer " + id_token},
+         )
+         if reponse.status_code != 200:
+             return None, reponse.json()["detail"]
+         return reponse.json(), None

-         # Wait for all tasks to complete
-         for future in futures:
-             future.result()
+     def download_stac(self, dataset_id, id_token):
+         url = self.url + "datasets/" + dataset_id + "/download"
+         headers = {"Authorization": "Bearer " + id_token}
+         response = requests.get(url, headers=headers)
+         if response.status_code != 200:
+             return None, response.json()["detail"]
+         return gpd.GeoDataFrame.from_features(response.json()["features"]), None
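Taken together, these APIRepo changes move the multipart upload from custom headers (Upload-Id, Dataset-Id, Part-Number, Checksum) to URL-scoped endpoints (datasets/{dataset_id}/uploadId, datasets/chunk/{upload_id}, datasets/complete/{upload_id}) and drop the thread-pool variant. A minimal sketch of how a client might chain the new methods; the import locations, token and dataset id below are illustrative assumptions, not values taken from this diff:

# Sketch only: chaining the reworked multipart-upload methods shown above.
from eotdl.src.repos import APIRepo               # assumed module path; not shown in this diff
from eotdl.src.utils import calculate_checksum    # SHA-1 helper, see eotdl/src/utils.py below

repo = APIRepo()
file_path = "large_scene.tif"                     # hypothetical local file
checksum = calculate_checksum(file_path)

# 1. create (or resume) a multipart upload and learn which parts already exist
upload_id, parts = repo.prepare_large_upload(file_path, "my-dataset-id", checksum, "my-id-token")

# 2. stream the file chunk by chunk; parts already on the server are skipped
repo.ingest_large_dataset(file_path, upload_id, "my-id-token", parts)

# 3. ask the API to assemble the uploaded parts
data, error = repo.complete_upload("my-id-token", upload_id)
if error:
    raise Exception(error)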
@@ -1,27 +1,79 @@
  from pydantic import BaseModel
  from ....src.utils import calculate_checksum
+ from ....curation.stac import STACDataFrame
+ from pathlib import Path
+ import os


  class DownloadDataset:
-     def __init__(self, repo, logger):
+     def __init__(self, repo, retrieve_dataset, logger):
          self.repo = repo
+         self.retrieve_dataset = retrieve_dataset
          self.logger = logger if logger else print

      class Inputs(BaseModel):
          dataset: str
+         file: str = None
          path: str = None
          user: dict
-         checksum: str

      class Outputs(BaseModel):
          dst_path: str

-     def __call__(self, inputs: Inputs) -> Outputs:
-         dst_path = self.repo.download_dataset(
-             inputs.dataset, inputs.user["id_token"], inputs.path
+     def download(self, dataset, dataset_id, file, checksum, path, user):
+         self.logger(f"Downloading {file}")
+         dst_path = self.repo.download_file(
+             dataset, dataset_id, file, user["id_token"], path
          )
-         checksum = calculate_checksum(dst_path)
-         self.logger(f"Checksum: {checksum}")
-         if inputs.checksum != checksum:
-             self.logger("Checksums do not match")
-         return self.Outputs(dst_path=dst_path)
+         if calculate_checksum(dst_path) != checksum:
+             self.logger(f"Checksum for {file} does not match")
+         self.logger(f"Done")
+         return dst_path
+
+     def __call__(self, inputs: Inputs) -> Outputs:
+         dataset = self.retrieve_dataset(inputs.dataset)
+         if inputs.path is None:
+             download_path = str(Path.home()) + "/.eotdl/datasets/" + inputs.dataset
+         else:
+             download_path = inputs.path + "/" + inputs.dataset
+         os.makedirs(download_path, exist_ok=True)
+         if dataset["quality"] == 0:
+             if inputs.file:
+                 files = [f for f in dataset["files"] if f["name"] == inputs.file]
+                 if not files:
+                     raise Exception(f"File {inputs.file} not found")
+                 if len(files) > 1:
+                     raise Exception(f"Multiple files with name {inputs.file} found")
+                 dst_path = self.download(
+                     inputs.dataset,
+                     dataset["id"],
+                     inputs.file,
+                     files[0]["checksum"],
+                     download_path,
+                     inputs.user,
+                 )
+                 return self.Outputs(dst_path=dst_path)
+             for file in dataset["files"]:
+                 dst_path = self.download(
+                     inputs.dataset,
+                     dataset["id"],
+                     file["name"],
+                     file["checksum"],
+                     download_path,
+                     inputs.user,
+                 )
+             return self.Outputs(dst_path="/".join(dst_path.split("/")[:-1]))
+         else:
+             gdf, error = self.repo.download_stac(
+                 dataset["id"],
+                 inputs.user["id_token"],
+             )
+             if error:
+                 raise Exception(error)
+             df = STACDataFrame(gdf)
+             # df.geometry = df.geometry.apply(lambda x: Polygon() if x is None else x)
+             path = inputs.path
+             if path is None:
+                 path = str(Path.home()) + "/.eotdl/datasets/" + dataset["name"]
+             df.to_stac(path)
+             return self.Outputs(dst_path=path)
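The download use case now receives a retrieve_dataset callable, fetches the dataset record first, and then either downloads each listed file with checksum verification (quality 0) or rebuilds a STAC catalog from the GeoDataFrame returned by download_stac. A rough sketch of wiring it by hand; module locations, dataset name and credentials are assumptions for illustration:

# Sketch only: hand-wiring DownloadDataset; the CLI normally performs this composition.
from eotdl.src.repos import APIRepo                        # assumed module path
from eotdl.src.usecases.datasets import DownloadDataset    # assumed module path

repo = APIRepo()

def retrieve_dataset(name):
    # per this diff, APIRepo.retrieve_dataset returns a (data, error) tuple
    data, error = repo.retrieve_dataset(name)
    if error:
        raise Exception(error)
    return data

download = DownloadDataset(repo, retrieve_dataset, print)
outputs = download(
    DownloadDataset.Inputs(
        dataset="my-dataset",                  # placeholder dataset name
        user={"id_token": "my-id-token"},      # placeholder credentials from the login flow
    )
)
print(outputs.dst_path)  # defaults under ~/.eotdl/datasets/my-dataset when path is None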
@@ -0,0 +1,30 @@
+ from pydantic import BaseModel
+ from ....src.utils import calculate_checksum
+
+
+ class DownloadFile:
+     def __init__(self, repo, retrieve_dataset, logger):
+         self.repo = repo
+         self.retrieve_dataset = retrieve_dataset
+         self.logger = logger if logger else print
+
+     class Inputs(BaseModel):
+         dataset: str
+         file: str
+         path: str = None
+         user: dict
+         checksum: str
+
+     class Outputs(BaseModel):
+         dst_path: str
+
+     def __call__(self, inputs: Inputs) -> Outputs:
+         dataset = self.retrieve_dataset(inputs.dataset)
+         dst_path = self.repo.download_file(
+             inputs.dataset, inputs.file, inputs.user["id_token"], inputs.path
+         )
+         checksum = calculate_checksum(dst_path)
+         self.logger(f"Checksum: {checksum}")
+         if dataset["checksum"] != checksum:
+             self.logger("Checksums do not match")
+         return self.Outputs(dst_path=dst_path)
@@ -0,0 +1,60 @@
+ from pydantic import BaseModel
+ import os
+ import typing
+
+ from ....src.utils import calculate_checksum
+
+
+ class IngestFile:
+     def __init__(self, repo, allowed_extensions, logger):
+         self.repo = repo
+         self.allowed_extensions = allowed_extensions
+         self.logger = logger if logger else print
+
+     class Inputs(BaseModel):
+         file: typing.Any
+         dataset_id: str
+         user: dict
+
+     class Outputs(BaseModel):
+         data: dict
+
+     def __call__(self, inputs: Inputs) -> Outputs:
+         # validate file extension
+         extension = os.path.splitext(inputs.file)[1]
+         if extension not in self.allowed_extensions:
+             raise Exception(
+                 f"Only {', '.join(self.allowed_extensions)} files are allowed"
+             )
+         id_token = inputs.user["id_token"]
+         self.logger(f"Uploading file {inputs.file}...")
+         # if inputs.file.startswith("http://") or inputs.file.startswith("https://"):
+         #     data, error = self.repo.ingest_file_url(
+         #         inputs.file, inputs.metadata.name, id_token
+         #     )
+         # else:
+         self.logger("Computing checksum...")
+         checksum = calculate_checksum(inputs.file)
+         self.logger(checksum)
+         self.logger("Ingesting file...")
+         filesize = os.path.getsize(inputs.file)
+         # ingest small file
+         if filesize < 1024 * 1024 * 16:  # 16 MB
+             data, error = self.repo.ingest_file(
+                 inputs.file, inputs.dataset_id, id_token, checksum
+             )
+             if error:
+                 raise Exception(error)
+             self.logger("Done")
+             return self.Outputs(data=data)
+         # ingest large file
+         upload_id, parts = self.repo.prepare_large_upload(
+             inputs.file, inputs.dataset_id, checksum, id_token
+         )
+         self.repo.ingest_large_dataset(inputs.file, upload_id, id_token, parts)
+         self.logger("\nCompleting upload...")
+         data, error = self.repo.complete_upload(id_token, upload_id)
+         if error:
+             raise Exception(error)
+         self.logger("Done")
+         return self.Outputs(data=data)
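IngestFile validates the extension, computes the SHA-1 checksum, then chooses between a single-request upload (files under 16 MB) and the multipart flow shown earlier. A brief sketch of calling it directly; the extension list, paths and credentials below are illustrative assumptions:

# Sketch only: driving the IngestFile use case directly; values are placeholders.
from eotdl.src.repos import APIRepo                     # assumed module path
from eotdl.src.usecases.datasets import IngestFile      # assumed module path

ingest = IngestFile(
    repo=APIRepo(),
    allowed_extensions=[".zip", ".tar", ".csv"],        # illustrative; the real list lives in the CLI layer
    logger=print,
)
outputs = ingest(
    IngestFile.Inputs(
        file="data/labels.csv",                         # must carry one of the allowed extensions
        dataset_id="my-dataset-id",                     # placeholder id, e.g. from create_dataset
        user={"id_token": "my-id-token"},
    )
)
print(outputs.data)  # small files go up in one request; larger ones follow the chunked flow above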
@@ -0,0 +1,98 @@
+ from pydantic import BaseModel
+ import os
+ from pathlib import Path
+ import yaml
+ from ...models import Metadata
+
+
+ class IngestFolder:
+     def __init__(self, repo, ingest_file, allowed_extensions, logger):
+         self.repo = repo
+         self.ingest_file = ingest_file
+         self.allowed_extensions = allowed_extensions
+         self.logger = logger if logger else print
+
+     class Inputs(BaseModel):
+         folder: Path
+         user: dict
+         force: bool = False
+         delete: bool = False
+
+     class Outputs(BaseModel):
+         dataset: dict
+
+     def __call__(self, inputs: Inputs) -> Outputs:
+         # validate folder
+         self.logger("Uploading directory (only files, not recursive)")
+         items = list(inputs.folder.glob("*"))
+         filtered_items = [item for item in items if item.is_file()]
+         filtered_items = [
+             item for item in filtered_items if item.suffix in self.allowed_extensions
+         ]
+         if len(filtered_items) == 0:
+             raise Exception("No files found in directory")
+         if len(filtered_items) > 10:
+             raise Exception("Too many files in directory, limited to 10")
+         if "metadata.yml" not in [item.name for item in filtered_items]:
+             raise Exception("metadata.yml not found in directory")
+         # load metadata
+         metadata = yaml.safe_load(
+             open(inputs.folder.joinpath("metadata.yml"), "r").read()
+         )
+         metadata = Metadata(**metadata)
+         # remove metadata.yml from files
+         filtered_items = [
+             item for item in filtered_items if item.name != "metadata.yml"
+         ]
+         # create dataset
+         data, error = self.repo.create_dataset(metadata.dict(), inputs.user["id_token"])
+         # dataset may already exists, but if user is owner continue ingesting files
+         current_files = []
+         if error:
+             data, error2 = self.repo.retrieve_dataset(metadata.name)
+             if error2:
+                 raise Exception(error)
+             if data["uid"] != inputs.user["sub"]:
+                 raise Exception("Dataset already exists.")
+             data["dataset_id"] = data["id"]
+             current_files = [item["name"] for item in data["files"]]
+             if len(current_files) > 0 and not inputs.force:
+                 self.logger(
+                     "The following files already exist and will not be uploaded (use --f to force re-upload):"
+                 )
+                 for item in current_files:
+                     self.logger(f"{item}")
+                 # TODO: delete current_files that are not in filtered_items if --delete
+                 hanged_files = [
+                     file
+                     for file in current_files
+                     if file not in [item.name for item in filtered_items]
+                 ]
+                 if len(hanged_files) > 0:
+                     self.logger(
+                         "The following files are no longer in your dataset (use --d to delete):"
+                     )
+                     for item in hanged_files:
+                         self.logger(f"{item}")
+                         if inputs.delete:
+                             self.logger(f"Deleting file {item}...")
+                             _, error = self.repo.delete_file(
+                                 data["dataset_id"], item, inputs.user["id_token"]
+                             )
+                             if error:
+                                 self.logger(error)
+                             else:
+                                 self.logger("Done")
+                 filtered_items = [
+                     item for item in filtered_items if item.name not in current_files
+                 ]
+         dataset_id = data["dataset_id"]
+         # upload files
+         if len(filtered_items) == 0:
+             raise Exception("No files to upload")
+         self.logger("The following files will be uploaded:")
+         for item in filtered_items:
+             self.logger(f"{item.name}")
+         for item in filtered_items:
+             data = self.ingest_file(item, dataset_id, logger=self.logger)
+         return self.Outputs(dataset=data)
@@ -0,0 +1,42 @@
+ from pydantic import BaseModel
+ from ....curation.stac import STACDataFrame
+ import json
+
+
+ class IngestSTAC:
+     def __init__(self, repo, ingest_file, allowed_extensions):
+         self.repo = repo
+         self.ingest_file = ingest_file
+         self.allowed_extensions = allowed_extensions
+
+     class Inputs(BaseModel):
+         stac_catalog: str
+         dataset: str
+         user: dict
+
+     class Outputs(BaseModel):
+         dataset: dict
+
+     def __call__(self, inputs: Inputs) -> Outputs:
+         # load the STAC catalog as a STACsetFrame
+         df = STACDataFrame.from_stac_file(inputs.stac_catalog)
+         # upload all assets to EOTDL
+         for row in df.dropna(subset=["assets"]).iterrows():
+             # for asset in df.assets.dropna().values[:10]:
+             try:
+                 for k, v in row[1]["assets"].items():
+                     data = self.ingest_file(
+                         v["href"],
+                         inputs.dataset,
+                         allowed_extensions=self.allowed_extensions + [".tif", ".tiff"],
+                     )
+                     file_url = f"{self.repo.url}datasets/{data['dataset_id']}/download/{data['file_name']}"
+                     df.loc[row[0], "assets"][k]["href"] = file_url
+             except Exception as e:
+                 break
+         data, error = self.repo.ingest_stac(
+             json.loads(df.to_json()), inputs.dataset, inputs.user["id_token"]
+         )
+         if error:
+             raise Exception(error)
+         return self.Outputs(dataset=data)
@@ -1,7 +1,8 @@
  from pydantic import BaseModel
  from typing import List

- class RetrieveDatasets():
+
+ class RetrieveDatasets:
      def __init__(self, repo):
          self.repo = repo

@@ -9,9 +10,9 @@ class RetrieveDatasets():
          pass

      class Outputs(BaseModel):
-         datasets: List[str]
+         datasets: dict

      def __call__(self, inputs: Inputs) -> Outputs:
          data = self.repo.retrieve_datasets()
-         datasets = [d['name'] for d in data]
-         return self.Outputs(datasets=datasets)
+         datasets = {d["name"]: [f["name"] for f in d["files"]] for d in data}
+         return self.Outputs(datasets=datasets)
@@ -3,4 +3,6 @@ from .IngestDataset import IngestDataset
  from .IngestLargeDataset import IngestLargeDataset
  from .RetrieveDataset import RetrieveDataset
  from .RetrieveDatasets import RetrieveDatasets
- from .UpdateDataset import UpdateDataset
+ from .IngestFile import IngestFile
+ from .IngestFolder import IngestFolder
+ from .IngestSTAC import IngestSTAC
eotdl/src/utils.py CHANGED
@@ -1,17 +1,17 @@
  import hashlib


- def calculate_checksum(file_path):
-     hasher = hashlib.md5()
-     with open(file_path, "rb") as f:
-         for chunk in iter(lambda: f.read(4096), b""):
-             hasher.update(chunk)
-     return hasher.hexdigest()
+ # def calculate_checksum(file_path):
+ #     hasher = hashlib.md5()
+ #     with open(file_path, "rb") as f:
+ #         for chunk in iter(lambda: f.read(4096), b""):
+ #             hasher.update(chunk)
+ #     return hasher.hexdigest()


- # def calculate_checksum(file_path):
- #     sha1_hash = hashlib.sha1()
- #     with open(file_path, "rb") as file:
- #         for chunk in iter(lambda: file.read(4096), b""):
- #             sha1_hash.update(chunk)
- #     return sha1_hash.hexdigest()
+ def calculate_checksum(file_path):
+     sha1_hash = hashlib.sha1()
+     with open(file_path, "rb") as file:
+         for chunk in iter(lambda: file.read(4096), b""):
+             sha1_hash.update(chunk)
+     return sha1_hash.hexdigest()
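With this change calculate_checksum produces SHA-1 digests instead of MD5, so the per-file checksums reported by the API and verified after download are 40-character hex strings. A tiny usage sketch; the file name and expected digest are placeholders:

# Sketch only: client-side verification with the new SHA-1 helper.
from eotdl.src.utils import calculate_checksum   # path as in the file above

expected = "da39a3ee5e6b4b0d3255bfef95601890afd80709"   # placeholder digest reported by the API
if calculate_checksum("downloaded_file.tif") != expected:
    print("Checksum for downloaded_file.tif does not match")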
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: eotdl
- Version: 2023.6.14.post10
+ Version: 2023.7.19
  Summary: Earth Observation Training Data Lab
  License: MIT
  Author: EarthPulse