eotdl 2023.6.14.post10__py3-none-any.whl → 2023.7.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- eotdl/cli.py +0 -1
- eotdl/commands/datasets.py +32 -43
- eotdl/curation/stac/__init__.py +1 -1
- eotdl/curation/stac/dataframe.py +5 -114
- eotdl/curation/stac/dataframe_bck.py +253 -0
- eotdl/datasets/__init__.py +1 -2
- eotdl/datasets/download.py +4 -9
- eotdl/datasets/ingest.py +37 -17
- eotdl/src/models/__init__.py +1 -0
- eotdl/src/models/metadata.py +16 -0
- eotdl/src/repos/APIRepo.py +82 -98
- eotdl/src/usecases/datasets/DownloadDataset.py +62 -10
- eotdl/src/usecases/datasets/DownloadFile.py +30 -0
- eotdl/src/usecases/datasets/IngestFile.py +60 -0
- eotdl/src/usecases/datasets/IngestFolder.py +98 -0
- eotdl/src/usecases/datasets/IngestSTAC.py +42 -0
- eotdl/src/usecases/datasets/RetrieveDatasets.py +5 -4
- eotdl/src/usecases/datasets/__init__.py +3 -1
- eotdl/src/utils.py +12 -12
- {eotdl-2023.6.14.post10.dist-info → eotdl-2023.7.19.dist-info}/METADATA +1 -1
- {eotdl-2023.6.14.post10.dist-info → eotdl-2023.7.19.dist-info}/RECORD +23 -18
- eotdl/datasets/update.py +0 -12
- eotdl/src/usecases/datasets/UpdateDataset.py +0 -32
- {eotdl-2023.6.14.post10.dist-info → eotdl-2023.7.19.dist-info}/WHEEL +0 -0
- {eotdl-2023.6.14.post10.dist-info → eotdl-2023.7.19.dist-info}/entry_points.txt +0 -0
eotdl/src/repos/APIRepo.py
CHANGED
@@ -6,11 +6,13 @@ from concurrent.futures import ThreadPoolExecutor
 import time
 import multiprocessing
 import hashlib
+import geopandas as gpd
 
 
 class APIRepo:
     def __init__(self, url=os.getenv("EOTDL_API_URL", "https://api.eotdl.com/")):
         self.url = url
+        # print(self.url)
 
     def login(self):
         return requests.get(self.url + "auth/login")
@@ -22,6 +24,16 @@ class APIRepo:
         response = requests.get(self.url + "auth/logout")
         return response.json()["logout_url"]
 
+    def create_dataset(self, metadata, id_token):
+        response = requests.post(
+            self.url + "datasets",
+            json=metadata,
+            headers={"Authorization": "Bearer " + id_token},
+        )
+        if response.status_code == 200:
+            return response.json(), None
+        return None, response.json()["detail"]
+
     def retrieve_datasets(self):
         return requests.get(self.url + "datasets").json()
 
@@ -31,12 +43,10 @@ class APIRepo:
             return response.json(), None
         return None, response.json()["detail"]
 
-    def
-        url = self.url + "datasets/" + dataset_id + "/download"
+    def download_file(self, dataset, dataset_id, file, id_token, path):
+        url = self.url + "datasets/" + dataset_id + "/download/" + file
         headers = {"Authorization": "Bearer " + id_token}
-
-        path = str(Path.home()) + "/.eotdl/datasets"
-        os.makedirs(path, exist_ok=True)
+        path = f"{path}/{file}"
         with requests.get(url, headers=headers, stream=True) as r:
             r.raise_for_status()
             total_size = int(r.headers.get("content-length", 0))
@@ -44,10 +54,6 @@ class APIRepo:
             progress_bar = tqdm(
                 total=total_size, unit="iB", unit_scale=True, unit_divisor=1024
             )
-            filename = r.headers.get("content-disposition").split("filename=")[1][1:-1]
-            path = f"{path}/{filename}"
-            if os.path.exists(path):
-                raise Exception("File already exists")
             with open(path, "wb") as f:
                 for chunk in r.iter_content(block_size):
                     progress_bar.update(len(chunk))
@@ -56,6 +62,27 @@ class APIRepo:
             progress_bar.close()
             return path
 
+    def ingest_file(self, file, dataset_id, id_token, checksum=None):
+        reponse = requests.post(
+            self.url + "datasets/" + dataset_id,
+            files={"file": open(file, "rb")},
+            data={"checksum": checksum} if checksum else None,
+            headers={"Authorization": "Bearer " + id_token},
+        )
+        if reponse.status_code != 200:
+            return None, reponse.json()["detail"]
+        return reponse.json(), None
+
+    def ingest_file_url(self, file, dataset, id_token):
+        reponse = requests.post(
+            self.url + "datasets/url",
+            json={"dataset": dataset, "url": file},
+            headers={"Authorization": "Bearer " + id_token},
+        )
+        if reponse.status_code != 200:
+            return None, reponse.json()["detail"]
+        return reponse.json(), None
+
     def read_in_chunks(self, file_object, CHUNK_SIZE):
         while True:
             data = file_object.read(CHUNK_SIZE)
@@ -63,18 +90,21 @@ class APIRepo:
                 break
             yield data
 
-    def prepare_large_upload(self,
-
-        response = requests.
+    def prepare_large_upload(self, file, dataset_id, checksum, id_token):
+        filename = Path(file).name
+        response = requests.post(
+            self.url + f"datasets/{dataset_id}/uploadId",
+            json={"name": filename, "checksum": checksum},
+            headers={"Authorization": "Bearer " + id_token},
+        )
         if response.status_code != 200:
             raise Exception(response.json()["detail"])
         data = response.json()
-
-            data["dataset_id"],
+        upload_id, parts = (
            data["upload_id"],
             data["parts"] if "parts" in data else [],
         )
-        return
+        return upload_id, parts
 
     def get_chunk_size(self, content_size):
         # adapt chunk size to content size to avoid S3 limits (10000 parts, 500MB per part, 5TB per object)
@@ -85,37 +115,31 @@ class APIRepo:
             chunk_size = 1024 * 1024 * 500  # 0.5 GB (up to 5 TB, 10000 parts)
         return chunk_size
 
-    def ingest_large_dataset(self,
-        content_path = os.path.abspath(
+    def ingest_large_dataset(self, file, upload_id, id_token, parts):
+        content_path = os.path.abspath(file)
         content_size = os.stat(content_path).st_size
         chunk_size = self.get_chunk_size(content_size)
         total_chunks = content_size // chunk_size
-        url = self.url + "datasets/chunk"
-        headers = {
-            "Authorization": "Bearer " + id_token,
-            "Upload-Id": upload_id,
-            "Dataset-Id": dataset_id,
-        }
         # upload chunks sequentially
         pbar = tqdm(
             self.read_in_chunks(open(content_path, "rb"), chunk_size),
             total=total_chunks,
         )
         index = 0
-        parts_checkusms = []
         for chunk in pbar:
             part = index // chunk_size + 1
             offset = index + len(chunk)
             index = offset
             if part not in parts:
-                headers["Part-Number"] = str(part)
                 checksum = hashlib.md5(chunk).hexdigest()
-
-
-
-
-
-
+                response = requests.post(
+                    self.url + "datasets/chunk/" + upload_id,
+                    files={"file": chunk},
+                    data={"part_number": part, "checksum": checksum},
+                    headers={"Authorization": "Bearer " + id_token},
+                )
+                if response.status_code != 200:
+                    raise Exception(response.json()["detail"])
             pbar.set_description(
                 "{:.2f}/{:.2f} MB".format(
                     offset / 1024 / 1024, content_size / 1024 / 1024
@@ -124,16 +148,10 @@ class APIRepo:
         pbar.close()
         return
 
-    def complete_upload(self,
-        url = self.url + "datasets/complete"
+    def complete_upload(self, id_token, upload_id):
         r = requests.post(
-            url,
-
-            headers={
-                "Authorization": "Bearer " + id_token,
-                "Upload-Id": upload_id,
-                "Dataset-Id": dataset_id,
-            },
+            self.url + "datasets/complete/" + upload_id,
+            headers={"Authorization": "Bearer " + id_token},
         )
         if r.status_code != 200:
             return None, r.json()["detail"]
@@ -200,63 +218,29 @@ class APIRepo:
             return None, r.json()["detail"]
         return r.json(), None
 
-    def
-
-
-
-
-
-
-
-    ):
-        # Create thread pool executor
-        max_workers = threads if threads > 0 else multiprocessing.cpu_count()
-        executor = ThreadPoolExecutor(max_workers=max_workers)
-
-        # Divide file into chunks and create tasks for each chunk
-        offset = 0
-        tasks = []
-        content_path = os.path.abspath(path)
-        content_size = os.stat(content_path).st_size
-        chunk_size = self.get_chunk_size(content_size)
-        total_chunks = content_size // chunk_size
-        while offset < content_size:
-            chunk_end = min(offset + chunk_size, content_size)
-            part = str(offset // chunk_size + 1)
-            if part not in parts:
-                tasks.append((offset, chunk_end, part))
-            offset = chunk_end
-
-        # Define the function that will upload each chunk
-        def upload_chunk(start, end, part):
-            # print(f"Uploading chunk {start} - {end}", part)
-            with open(content_path, "rb") as f:
-                f.seek(start)
-                chunk = f.read(end - start)
-            checksum = hashlib.md5(chunk).hexdigest()
-            response = requests.post(
-                self.url + "datasets/chunk",
-                files={"file": chunk},
-                headers={
-                    "Authorization": "Bearer " + id_token,
-                    "Upload-Id": upload_id,
-                    "Dataset-Id": dataset_id,
-                    "Checksum": checksum,
-                    "Part-Number": str(part),
-                },
-            )
-            if response.status_code != 200:
-                print(f"Failed to upload chunk {start} - {end}")
-            return response
+    def delete_file(self, dataset_id, file_name, id_token):
+        response = requests.delete(
+            self.url + "datasets/" + dataset_id + "/file/" + file_name,
+            headers={"Authorization": "Bearer " + id_token},
+        )
+        if response.status_code != 200:
+            return None, response.json()["detail"]
+        return response.json(), None
 
-
-
-
-
-
-
-
+    def ingest_stac(self, stac_json, dataset, id_token):
+        reponse = requests.post(
+            self.url + "datasets/stac",
+            json={"dataset": dataset, "stac": stac_json},
+            headers={"Authorization": "Bearer " + id_token},
+        )
+        if reponse.status_code != 200:
+            return None, reponse.json()["detail"]
+        return reponse.json(), None
 
-
-
-
+    def download_stac(self, dataset_id, id_token):
+        url = self.url + "datasets/" + dataset_id + "/download"
+        headers = {"Authorization": "Bearer " + id_token}
+        response = requests.get(url, headers=headers)
+        if response.status_code != 200:
+            return None, response.json()["detail"]
+        return gpd.GeoDataFrame.from_features(response.json()["features"]), None
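Taken together, the new upload endpoints compose into a single flow: request an upload id, stream the file in chunks, then complete the upload. Below is a minimal sketch of that flow using the methods from the diff above; the token, dataset id, file path and checksum are placeholders, and error handling is elided.

# Sketch only: method names and argument order come from the APIRepo diff above;
# the values below are placeholders, not real credentials or ids.
from eotdl.src.repos.APIRepo import APIRepo

repo = APIRepo()  # defaults to https://api.eotdl.com/
id_token = "<id-token>"          # obtained through the auth/login flow
dataset_id = "<dataset-id>"      # e.g. returned by repo.create_dataset(...)
file = "path/to/large_file.zip"
checksum = "<sha1>"              # see calculate_checksum in eotdl/src/utils.py

# 1. ask the API for an upload id (plus any parts already uploaded, for resuming)
upload_id, parts = repo.prepare_large_upload(file, dataset_id, checksum, id_token)
# 2. POST each chunk to datasets/chunk/<upload_id>, skipping parts already present
repo.ingest_large_dataset(file, upload_id, id_token, parts)
# 3. ask the API to assemble the parts into the final file
data, error = repo.complete_upload(id_token, upload_id)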
eotdl/src/usecases/datasets/DownloadDataset.py
CHANGED
@@ -1,27 +1,79 @@
 from pydantic import BaseModel
 from ....src.utils import calculate_checksum
+from ....curation.stac import STACDataFrame
+from pathlib import Path
+import os
 
 
 class DownloadDataset:
-    def __init__(self, repo, logger):
+    def __init__(self, repo, retrieve_dataset, logger):
         self.repo = repo
+        self.retrieve_dataset = retrieve_dataset
         self.logger = logger if logger else print
 
     class Inputs(BaseModel):
         dataset: str
+        file: str = None
         path: str = None
         user: dict
-        checksum: str
 
     class Outputs(BaseModel):
         dst_path: str
 
-    def
-
-
+    def download(self, dataset, dataset_id, file, checksum, path, user):
+        self.logger(f"Downloading {file}")
+        dst_path = self.repo.download_file(
+            dataset, dataset_id, file, user["id_token"], path
         )
-
-
-
-
-
+        if calculate_checksum(dst_path) != checksum:
+            self.logger(f"Checksum for {file} does not match")
+        self.logger(f"Done")
+        return dst_path
+
+    def __call__(self, inputs: Inputs) -> Outputs:
+        dataset = self.retrieve_dataset(inputs.dataset)
+        if inputs.path is None:
+            download_path = str(Path.home()) + "/.eotdl/datasets/" + inputs.dataset
+        else:
+            download_path = inputs.path + "/" + inputs.dataset
+        os.makedirs(download_path, exist_ok=True)
+        if dataset["quality"] == 0:
+            if inputs.file:
+                files = [f for f in dataset["files"] if f["name"] == inputs.file]
+                if not files:
+                    raise Exception(f"File {inputs.file} not found")
+                if len(files) > 1:
+                    raise Exception(f"Multiple files with name {inputs.file} found")
+                dst_path = self.download(
+                    inputs.dataset,
+                    dataset["id"],
+                    inputs.file,
+                    files[0]["checksum"],
+                    download_path,
+                    inputs.user,
+                )
+                return self.Outputs(dst_path=dst_path)
+            for file in dataset["files"]:
+                dst_path = self.download(
+                    inputs.dataset,
+                    dataset["id"],
+                    file["name"],
+                    file["checksum"],
+                    download_path,
+                    inputs.user,
+                )
+            return self.Outputs(dst_path="/".join(dst_path.split("/")[:-1]))
+        else:
+            gdf, error = self.repo.download_stac(
+                dataset["id"],
+                inputs.user["id_token"],
+            )
+            if error:
+                raise Exception(error)
+            df = STACDataFrame(gdf)
+            # df.geometry = df.geometry.apply(lambda x: Polygon() if x is None else x)
+            path = inputs.path
+            if path is None:
+                path = str(Path.home()) + "/.eotdl/datasets/" + dataset["name"]
+            df.to_stac(path)
+            return self.Outputs(dst_path=path)
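The use case now branches on dataset["quality"]: Q0 datasets are downloaded file by file (optionally a single file), while STAC datasets are fetched as a GeoDataFrame and rebuilt locally with STACDataFrame.to_stac. A sketch of how it might be driven follows; the retrieve_dataset callable and the user dict are hypothetical stand-ins for the wiring done in eotdl/datasets/download.py, which this diff section does not show.

# Sketch only: the constructor and Inputs fields come from the diff above;
# retrieve_dataset below is a placeholder returning the dataset dict shape
# that __call__ expects ("id", "name", "quality", "files").
from eotdl.src.repos.APIRepo import APIRepo
from eotdl.src.usecases.datasets.DownloadDataset import DownloadDataset

repo = APIRepo()
retrieve_dataset = lambda name: {
    "id": "<dataset-id>",
    "name": name,
    "quality": 0,
    "files": [{"name": "data.zip", "checksum": "<sha1>"}],
}
download = DownloadDataset(repo, retrieve_dataset, logger=print)

outputs = download(DownloadDataset.Inputs(
    dataset="my-dataset",
    file=None,   # new optional field: set it to download a single file
    path=None,   # defaults to ~/.eotdl/datasets/<dataset>
    user={"id_token": "<id-token>"},
))
print(outputs.dst_path)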
eotdl/src/usecases/datasets/DownloadFile.py
ADDED
@@ -0,0 +1,30 @@
+from pydantic import BaseModel
+from ....src.utils import calculate_checksum
+
+
+class DownloadFile:
+    def __init__(self, repo, retrieve_dataset, logger):
+        self.repo = repo
+        self.retrieve_dataset = retrieve_dataset
+        self.logger = logger if logger else print
+
+    class Inputs(BaseModel):
+        dataset: str
+        file: str
+        path: str = None
+        user: dict
+        checksum: str
+
+    class Outputs(BaseModel):
+        dst_path: str
+
+    def __call__(self, inputs: Inputs) -> Outputs:
+        dataset = self.retrieve_dataset(inputs.dataset)
+        dst_path = self.repo.download_file(
+            inputs.dataset, inputs.file, inputs.user["id_token"], inputs.path
+        )
+        checksum = calculate_checksum(dst_path)
+        self.logger(f"Checksum: {checksum}")
+        if dataset["checksum"] != checksum:
+            self.logger("Checksums do not match")
+        return self.Outputs(dst_path=dst_path)
eotdl/src/usecases/datasets/IngestFile.py
ADDED
@@ -0,0 +1,60 @@
+from pydantic import BaseModel
+import os
+import typing
+
+from ....src.utils import calculate_checksum
+
+
+class IngestFile:
+    def __init__(self, repo, allowed_extensions, logger):
+        self.repo = repo
+        self.allowed_extensions = allowed_extensions
+        self.logger = logger if logger else print
+
+    class Inputs(BaseModel):
+        file: typing.Any
+        dataset_id: str
+        user: dict
+
+    class Outputs(BaseModel):
+        data: dict
+
+    def __call__(self, inputs: Inputs) -> Outputs:
+        # validate file extension
+        extension = os.path.splitext(inputs.file)[1]
+        if extension not in self.allowed_extensions:
+            raise Exception(
+                f"Only {', '.join(self.allowed_extensions)} files are allowed"
+            )
+        id_token = inputs.user["id_token"]
+        self.logger(f"Uploading file {inputs.file}...")
+        # if inputs.file.startswith("http://") or inputs.file.startswith("https://"):
+        #     data, error = self.repo.ingest_file_url(
+        #         inputs.file, inputs.metadata.name, id_token
+        #     )
+        # else:
+        self.logger("Computing checksum...")
+        checksum = calculate_checksum(inputs.file)
+        self.logger(checksum)
+        self.logger("Ingesting file...")
+        filesize = os.path.getsize(inputs.file)
+        # ingest small file
+        if filesize < 1024 * 1024 * 16:  # 16 MB
+            data, error = self.repo.ingest_file(
+                inputs.file, inputs.dataset_id, id_token, checksum
+            )
+            if error:
+                raise Exception(error)
+            self.logger("Done")
+            return self.Outputs(data=data)
+        # ingest large file
+        upload_id, parts = self.repo.prepare_large_upload(
+            inputs.file, inputs.dataset_id, checksum, id_token
+        )
+        self.repo.ingest_large_dataset(inputs.file, upload_id, id_token, parts)
+        self.logger("\nCompleting upload...")
+        data, error = self.repo.complete_upload(id_token, upload_id)
+        if error:
+            raise Exception(error)
+        self.logger("Done")
+        return self.Outputs(data=data)
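IngestFile picks the upload path from the file size: anything under 16 MB goes up in a single multipart POST via APIRepo.ingest_file, while larger files go through the chunked uploadId/chunk/complete flow sketched earlier. A minimal sketch of wiring and calling the use case; the extension list, file path and credentials are placeholders, since the real wiring lives in the commands and datasets modules not shown here.

# Sketch only: the constructor and Inputs fields come from the diff above;
# allowed_extensions, the file path and the token are placeholders.
from eotdl.src.repos.APIRepo import APIRepo
from eotdl.src.usecases.datasets.IngestFile import IngestFile

repo = APIRepo()
ingest = IngestFile(repo, allowed_extensions=[".zip", ".tif"], logger=print)

outputs = ingest(IngestFile.Inputs(
    file="path/to/file.tif",
    dataset_id="<dataset-id>",
    user={"id_token": "<id-token>"},
))
print(outputs.data)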
eotdl/src/usecases/datasets/IngestFolder.py
ADDED
@@ -0,0 +1,98 @@
+from pydantic import BaseModel
+import os
+from pathlib import Path
+import yaml
+from ...models import Metadata
+
+
+class IngestFolder:
+    def __init__(self, repo, ingest_file, allowed_extensions, logger):
+        self.repo = repo
+        self.ingest_file = ingest_file
+        self.allowed_extensions = allowed_extensions
+        self.logger = logger if logger else print
+
+    class Inputs(BaseModel):
+        folder: Path
+        user: dict
+        force: bool = False
+        delete: bool = False
+
+    class Outputs(BaseModel):
+        dataset: dict
+
+    def __call__(self, inputs: Inputs) -> Outputs:
+        # validate folder
+        self.logger("Uploading directory (only files, not recursive)")
+        items = list(inputs.folder.glob("*"))
+        filtered_items = [item for item in items if item.is_file()]
+        filtered_items = [
+            item for item in filtered_items if item.suffix in self.allowed_extensions
+        ]
+        if len(filtered_items) == 0:
+            raise Exception("No files found in directory")
+        if len(filtered_items) > 10:
+            raise Exception("Too many files in directory, limited to 10")
+        if "metadata.yml" not in [item.name for item in filtered_items]:
+            raise Exception("metadata.yml not found in directory")
+        # load metadata
+        metadata = yaml.safe_load(
+            open(inputs.folder.joinpath("metadata.yml"), "r").read()
+        )
+        metadata = Metadata(**metadata)
+        # remove metadata.yml from files
+        filtered_items = [
+            item for item in filtered_items if item.name != "metadata.yml"
+        ]
+        # create dataset
+        data, error = self.repo.create_dataset(metadata.dict(), inputs.user["id_token"])
+        # dataset may already exists, but if user is owner continue ingesting files
+        current_files = []
+        if error:
+            data, error2 = self.repo.retrieve_dataset(metadata.name)
+            if error2:
+                raise Exception(error)
+            if data["uid"] != inputs.user["sub"]:
+                raise Exception("Dataset already exists.")
+            data["dataset_id"] = data["id"]
+            current_files = [item["name"] for item in data["files"]]
+            if len(current_files) > 0 and not inputs.force:
+                self.logger(
+                    "The following files already exist and will not be uploaded (use --f to force re-upload):"
+                )
+                for item in current_files:
+                    self.logger(f"{item}")
+                # TODO: delete current_files that are not in filtered_items if --delete
+                hanged_files = [
+                    file
+                    for file in current_files
+                    if file not in [item.name for item in filtered_items]
+                ]
+                if len(hanged_files) > 0:
+                    self.logger(
+                        "The following files are no longer in your dataset (use --d to delete):"
+                    )
+                    for item in hanged_files:
+                        self.logger(f"{item}")
+                        if inputs.delete:
+                            self.logger(f"Deleting file {item}...")
+                            _, error = self.repo.delete_file(
+                                data["dataset_id"], item, inputs.user["id_token"]
+                            )
+                            if error:
+                                self.logger(error)
+                            else:
+                                self.logger("Done")
+                filtered_items = [
+                    item for item in filtered_items if item.name not in current_files
+                ]
+        dataset_id = data["dataset_id"]
+        # upload files
+        if len(filtered_items) == 0:
+            raise Exception("No files to upload")
+        self.logger("The following files will be uploaded:")
+        for item in filtered_items:
+            self.logger(f"{item.name}")
+        for item in filtered_items:
+            data = self.ingest_file(item, dataset_id, logger=self.logger)
+        return self.Outputs(dataset=data)
eotdl/src/usecases/datasets/IngestSTAC.py
ADDED
@@ -0,0 +1,42 @@
+from pydantic import BaseModel
+from ....curation.stac import STACDataFrame
+import json
+
+
+class IngestSTAC:
+    def __init__(self, repo, ingest_file, allowed_extensions):
+        self.repo = repo
+        self.ingest_file = ingest_file
+        self.allowed_extensions = allowed_extensions
+
+    class Inputs(BaseModel):
+        stac_catalog: str
+        dataset: str
+        user: dict
+
+    class Outputs(BaseModel):
+        dataset: dict
+
+    def __call__(self, inputs: Inputs) -> Outputs:
+        # load the STAC catalog as a STACsetFrame
+        df = STACDataFrame.from_stac_file(inputs.stac_catalog)
+        # upload all assets to EOTDL
+        for row in df.dropna(subset=["assets"]).iterrows():
+            # for asset in df.assets.dropna().values[:10]:
+            try:
+                for k, v in row[1]["assets"].items():
+                    data = self.ingest_file(
+                        v["href"],
+                        inputs.dataset,
+                        allowed_extensions=self.allowed_extensions + [".tif", ".tiff"],
+                    )
+                    file_url = f"{self.repo.url}datasets/{data['dataset_id']}/download/{data['file_name']}"
+                    df.loc[row[0], "assets"][k]["href"] = file_url
+            except Exception as e:
+                break
+        data, error = self.repo.ingest_stac(
+            json.loads(df.to_json()), inputs.dataset, inputs.user["id_token"]
+        )
+        if error:
+            raise Exception(error)
+        return self.Outputs(dataset=data)
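IngestSTAC walks every asset in the catalog, ingests the file behind each href, rewrites the href to the corresponding EOTDL download URL, and finally pushes the STAC metadata itself through datasets/stac. A sketch of driving it; the ingest_file callable below is a hypothetical stand-in for the wrapper the package wires in (its signature and expected return keys are taken from the loop above).

# Sketch only: the IngestSTAC constructor and Inputs fields come from the diff above.
from eotdl.src.repos.APIRepo import APIRepo
from eotdl.src.usecases.datasets.IngestSTAC import IngestSTAC

def my_ingest_file(href, dataset, allowed_extensions=None):
    # Placeholder for the package's file-ingestion wrapper: it must upload the
    # asset and return a dict with at least "dataset_id" and "file_name",
    # which IngestSTAC uses to build the rewritten href.
    raise NotImplementedError

repo = APIRepo()
ingest_stac = IngestSTAC(repo, ingest_file=my_ingest_file, allowed_extensions=[".zip"])
outputs = ingest_stac(IngestSTAC.Inputs(
    stac_catalog="path/to/catalog.json",
    dataset="my-dataset",
    user={"id_token": "<id-token>"},
))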
eotdl/src/usecases/datasets/RetrieveDatasets.py
CHANGED
@@ -1,7 +1,8 @@
 from pydantic import BaseModel
 from typing import List
 
-
+
+class RetrieveDatasets:
     def __init__(self, repo):
         self.repo = repo
 
@@ -9,9 +10,9 @@ class RetrieveDatasets():
         pass
 
     class Outputs(BaseModel):
-        datasets:
+        datasets: dict
 
     def __call__(self, inputs: Inputs) -> Outputs:
         data = self.repo.retrieve_datasets()
-        datasets =
-        return self.Outputs(datasets=datasets)
+        datasets = {d["name"]: [f["name"] for f in d["files"]] for d in data}
+        return self.Outputs(datasets=datasets)
eotdl/src/usecases/datasets/__init__.py
CHANGED
@@ -3,4 +3,6 @@ from .IngestDataset import IngestDataset
 from .IngestLargeDataset import IngestLargeDataset
 from .RetrieveDataset import RetrieveDataset
 from .RetrieveDatasets import RetrieveDatasets
-from .
+from .IngestFile import IngestFile
+from .IngestFolder import IngestFolder
+from .IngestSTAC import IngestSTAC
eotdl/src/utils.py
CHANGED
@@ -1,17 +1,17 @@
 import hashlib
 
 
-def calculate_checksum(file_path):
-
-
-
-
-
+# def calculate_checksum(file_path):
+#     hasher = hashlib.md5()
+#     with open(file_path, "rb") as f:
+#         for chunk in iter(lambda: f.read(4096), b""):
+#             hasher.update(chunk)
+#     return hasher.hexdigest()
 
 
-
-
-
-
-
-
+def calculate_checksum(file_path):
+    sha1_hash = hashlib.sha1()
+    with open(file_path, "rb") as file:
+        for chunk in iter(lambda: file.read(4096), b""):
+            sha1_hash.update(chunk)
+    return sha1_hash.hexdigest()