eotdl 2023.7.19.post4-py3-none-any.whl → 2023.9.14.post2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- eotdl/commands/datasets.py +15 -29
- eotdl/curation/__init__.py +5 -5
- eotdl/curation/formatters.py +0 -2
- eotdl/curation/metadata.py +34 -9
- eotdl/curation/stac/assets.py +127 -0
- eotdl/curation/stac/dataframe.py +8 -4
- eotdl/curation/stac/extensions.py +295 -46
- eotdl/curation/stac/extent.py +130 -0
- eotdl/curation/stac/ml_dataset.py +509 -0
- eotdl/curation/stac/parsers.py +2 -0
- eotdl/curation/stac/stac.py +309 -286
- eotdl/curation/stac/utils.py +47 -1
- eotdl/datasets/__init__.py +2 -2
- eotdl/datasets/download.py +16 -3
- eotdl/datasets/ingest.py +21 -10
- eotdl/datasets/retrieve.py +10 -2
- eotdl/src/repos/APIRepo.py +40 -17
- eotdl/src/repos/AuthRepo.py +3 -3
- eotdl/src/usecases/auth/IsLogged.py +5 -3
- eotdl/src/usecases/datasets/DownloadDataset.py +35 -6
- eotdl/src/usecases/datasets/DownloadFileURL.py +22 -0
- eotdl/src/usecases/datasets/IngestFile.py +48 -28
- eotdl/src/usecases/datasets/IngestSTAC.py +43 -8
- eotdl/src/usecases/datasets/RetrieveDatasets.py +3 -2
- eotdl/src/usecases/datasets/__init__.py +1 -0
- eotdl/tools/sen12floods/tools.py +3 -3
- eotdl/tools/stac.py +8 -2
- {eotdl-2023.7.19.post4.dist-info → eotdl-2023.9.14.post2.dist-info}/METADATA +2 -1
- {eotdl-2023.7.19.post4.dist-info → eotdl-2023.9.14.post2.dist-info}/RECORD +31 -27
- {eotdl-2023.7.19.post4.dist-info → eotdl-2023.9.14.post2.dist-info}/WHEEL +1 -1
- {eotdl-2023.7.19.post4.dist-info → eotdl-2023.9.14.post2.dist-info}/entry_points.txt +0 -0
eotdl/curation/stac/utils.py
CHANGED
@@ -3,11 +3,13 @@ STAC utils
 """
 
 import pystac
+import json
 
+from os.path import dirname, join, exists
+from os import listdir
 from datetime import datetime
 from dateutil import parser
 from pandas import isna
-from numpy import nan
 from typing import Union
 
 
@@ -84,3 +86,47 @@ def get_all_children(obj: pystac.STACObject) -> list:
         children.append(item.to_dict())
 
     return children
+
+
+def cut_images(images_list: Union[list, tuple]) -> list:
+    """
+    """
+    dirnames = list()
+    images = list()
+
+    for image in images_list:
+        dir = dirname(image)
+        if dir not in dirnames:
+            dirnames.append(dir)
+            images.append(image)
+
+    return images
+
+
+def get_item_metadata(raster_path: str) -> str:
+    """
+    Get the metadata JSON file of a given directory, associated to a raster file
+
+    :param raster_path: path to the raster file
+    """
+    # Get the directory of the raster file
+    raster_dir_path = dirname(raster_path)
+    # Get the metadata JSON file
+    # Check if there is a metadata.json file in the directory
+    if 'metadata.json' in listdir(raster_dir_path):
+        metadata_json = join(raster_dir_path, 'metadata.json')
+    else:
+        # If there is no metadata.json file in the directory, check if there is
+        # a json file with the same name as the raster file
+        raster_name = raster_path.split('/')[-1]
+        raster_name = raster_name.split('.')[0]
+        metadata_json = join(raster_dir_path, f'{raster_name}.json')
+        if not exists(metadata_json):
+            # If there is no metadata.json file in the directory, return None
+            return None
+
+    # Open the metadata.json file and return it
+    with open(metadata_json, 'r') as f:
+        metadata = json.load(f)
+
+    return metadata
eotdl/datasets/__init__.py
CHANGED
@@ -1,3 +1,3 @@
-from .ingest import
-from .download import download_dataset
+from .ingest import ingest_dataset
+from .download import download_dataset, download_file_url
 from .retrieve import retrieve_datasets, retrieve_dataset, list_datasets
eotdl/datasets/download.py
CHANGED
@@ -1,13 +1,26 @@
 from ..src.repos import APIRepo
-from ..src.usecases.datasets import DownloadDataset,
+from ..src.usecases.datasets import DownloadDataset, DownloadFileURL
 from .retrieve import retrieve_dataset
 from ..auth import with_auth
 
 
 @with_auth
-def download_dataset(
+def download_dataset(
+    dataset, file=None, path=None, logger=None, assets=False, force=False, user=None
+):
     api_repo = APIRepo()
     download = DownloadDataset(api_repo, retrieve_dataset, logger)
-    inputs = download.Inputs(
+    inputs = download.Inputs(
+        dataset=dataset, file=file, path=path, user=user, assets=assets, force=force
+    )
+    outputs = download(inputs)
+    return outputs.dst_path
+
+
+@with_auth
+def download_file_url(url, path, progress=True, logger=None, user=None):
+    api_repo = APIRepo()
+    download = DownloadFileURL(api_repo, logger, progress)
+    inputs = DownloadFileURL.Inputs(url=url, path=path, user=user)
     outputs = download(inputs)
     return outputs.dst_path
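For context, a sketch of how the updated download API might be called from user code; the dataset name and URL below are placeholders, not real entries:

    from eotdl.datasets import download_dataset, download_file_url

    # new flags in this release: force a re-download, and fetch assets for STAC datasets
    dst = download_dataset("EuroSAT-RGB", path="data", assets=True, force=True)

    # download a single file by URL through the authenticated API
    file_path = download_file_url("https://api.eotdl.com/datasets/<id>/download/<file>", "data")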
eotdl/datasets/ingest.py
CHANGED
@@ -1,4 +1,4 @@
-import
+from pathlib import Path
 
 from ..src.repos import APIRepo
 from ..src.usecases.datasets import IngestFile, IngestFolder, IngestSTAC
@@ -12,24 +12,35 @@ allowed_extensions = [
     ".csv",
     ".txt",
     ".json",
+    ".geojson",
     ".pdf",
     ".md",
     ".yml",
 ]
 
 
-def
-
-
+def ingest_dataset(path, f=False, d=False, logger=print):
+    path = Path(path)
+    if not path.is_dir():
+        raise Exception("Path must be a folder")
+    if "catalog.json" in [f.name for f in path.iterdir()]:
+        return ingest_stac(path / "catalog.json", logger)
+    return ingest_folder(path, f, d, logger)
 
 
 @with_auth
 def ingest_file(
-    file,
+    file,
+    dataset_id,
+    logger=None,
+    allowed_extensions=allowed_extensions,
+    verbose=True,
+    root=None,
+    user=None,
 ):
     api_repo = APIRepo()
-    ingest = IngestFile(api_repo, allowed_extensions, logger)
-    inputs = ingest.Inputs(file=file, dataset_id=dataset_id, user=user)
+    ingest = IngestFile(api_repo, allowed_extensions, logger, verbose)
+    inputs = ingest.Inputs(file=file, dataset_id=dataset_id, user=user, root=root)
    outputs = ingest(inputs)
    return outputs.data
 
@@ -44,9 +55,9 @@ def ingest_folder(folder, force, delete, logger=None, user=None):
 
 
 @with_auth
-def ingest_stac(stac_catalog,
+def ingest_stac(stac_catalog, logger=None, user=None):
     api_repo = APIRepo()
-    ingest = IngestSTAC(api_repo, ingest_file, allowed_extensions)
-    inputs = ingest.Inputs(stac_catalog=stac_catalog,
+    ingest = IngestSTAC(api_repo, ingest_file, allowed_extensions, logger)
+    inputs = ingest.Inputs(stac_catalog=stac_catalog, user=user)
     outputs = ingest(inputs)
     return outputs.dataset
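A usage sketch for the new ingest_dataset entry point (the folder paths are placeholders): a folder containing a catalog.json is routed to the STAC ingestion path, anything else goes through the plain folder path:

    from eotdl.datasets import ingest_dataset

    # plain folder of files, forced re-ingest
    ingest_dataset("data/my-dataset", f=True)

    # folder with a STAC catalog.json, ingested as a STAC dataset
    ingest_dataset("data/my-stac-dataset")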
eotdl/datasets/retrieve.py
CHANGED
@@ -1,9 +1,17 @@
+import re
+
 from ..src.repos import APIRepo
 from ..src.usecases.datasets import RetrieveDatasets, RetrieveDataset
 
 
-def list_datasets():
-
+def list_datasets(pattern=None):
+    datasets = retrieve_datasets()
+    if pattern:
+        regex = re.compile(rf".*{re.escape(pattern)}.*", re.IGNORECASE)
+        names = list(datasets.keys())
+        valid = [name for name in names if regex.search(name)]
+        return {name: datasets[name] for name in valid}
+    return datasets
 
 
 def retrieve_datasets():
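The pattern argument added to list_datasets applies a case-insensitive substring match to dataset names. A small sketch (the pattern is illustrative):

    from eotdl.datasets import list_datasets

    all_datasets = list_datasets()
    sentinel_datasets = list_datasets(pattern="sentinel")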
eotdl/src/repos/APIRepo.py
CHANGED
@@ -2,9 +2,6 @@ import requests
 from tqdm import tqdm
 from pathlib import Path
 import os
-from concurrent.futures import ThreadPoolExecutor
-import time
-import multiprocessing
 import hashlib
 import geopandas as gpd
 
@@ -12,7 +9,6 @@ import geopandas as gpd
 class APIRepo:
     def __init__(self, url=os.getenv("EOTDL_API_URL", "https://api.eotdl.com/")):
         self.url = url
-        # print(self.url)
 
     def login(self):
         return requests.get(self.url + "auth/login")
@@ -24,9 +20,18 @@ class APIRepo:
         response = requests.get(self.url + "auth/logout")
         return response.json()["logout_url"]
 
+    def retrieve_credentials(self, id_token):
+        response = requests.get(
+            self.url + "auth/credentials",
+            headers={"Authorization": "Bearer " + id_token},
+        )
+        if response.status_code == 200:
+            return response.json(), None
+        return None, response.json()["detail"]
+
     def create_dataset(self, metadata, id_token):
         response = requests.post(
-            self.url + "datasets",
+            self.url + "datasets/q0",
             json=metadata,
             headers={"Authorization": "Bearer " + id_token},
         )
@@ -34,6 +39,16 @@ class APIRepo:
             return response.json(), None
         return None, response.json()["detail"]
 
+    def create_stac_dataset(self, name, id_token):
+        response = requests.post(
+            self.url + "datasets/stac",
+            json={"name": name},
+            headers={"Authorization": "Bearer " + id_token},
+        )
+        if response.status_code == 200:
+            return response.json(), None
+        return None, response.json()["detail"]
+
     def retrieve_datasets(self):
         return requests.get(self.url + "datasets").json()
 
@@ -45,21 +60,29 @@ class APIRepo:
 
     def download_file(self, dataset, dataset_id, file, id_token, path):
         url = self.url + "datasets/" + dataset_id + "/download/" + file
+        return self.download_file_url(url, path, id_token, progress=True)
+
+    def download_file_url(self, url, path, id_token, progress=False):
         headers = {"Authorization": "Bearer " + id_token}
-
+        filename = url.split("/")[-1]
+        os.makedirs(path, exist_ok=True)
+        path = f"{path}/{filename}"
         with requests.get(url, headers=headers, stream=True) as r:
             r.raise_for_status()
             total_size = int(r.headers.get("content-length", 0))
             block_size = 1024 * 1024 * 10
-
-
-
+            if progress:
+                progress_bar = tqdm(
+                    total=total_size, unit="iB", unit_scale=True, unit_divisor=1024
+                )
             with open(path, "wb") as f:
                 for chunk in r.iter_content(block_size):
-
+                    if progress:
+                        progress_bar.update(len(chunk))
                    if chunk:
                        f.write(chunk)
-
+            if progress:
+                progress_bar.close()
         return path
 
     def ingest_file(self, file, dataset_id, id_token, checksum=None):
@@ -75,8 +98,8 @@ class APIRepo:
 
     def ingest_file_url(self, file, dataset, id_token):
         reponse = requests.post(
-            self.url + "datasets/url",
-            json={"
+            self.url + f"datasets/{dataset}/url",
+            json={"url": file},
             headers={"Authorization": "Bearer " + id_token},
         )
         if reponse.status_code != 200:
@@ -227,10 +250,10 @@ class APIRepo:
             return None, response.json()["detail"]
         return response.json(), None
 
-    def ingest_stac(self, stac_json,
-        reponse = requests.
-            self.url + "datasets/stac",
-            json={"
+    def ingest_stac(self, stac_json, dataset_id, id_token):
+        reponse = requests.put(
+            self.url + f"datasets/stac/{dataset_id}",
+            json={"stac": stac_json},
             headers={"Authorization": "Bearer " + id_token},
         )
         if reponse.status_code != 200:
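The new APIRepo.download_file_url streams the response to disk in 10 MB chunks and only creates a tqdm progress bar when progress=True. A sketch of calling it directly (token and URL are placeholders; normally the use-case classes call this for you):

    from eotdl.src.repos import APIRepo

    repo = APIRepo()  # reads EOTDL_API_URL, defaults to https://api.eotdl.com/
    local_file = repo.download_file_url(
        "https://api.eotdl.com/datasets/<id>/download/<file>",
        path="downloads",
        id_token="<token>",
        progress=True,
    )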
eotdl/src/repos/AuthRepo.py
CHANGED
@@ -7,11 +7,11 @@ import jwt
 class AuthRepo:
     def __init__(self):
         self.algorithms = ["RS256"]
-        self.
-        self.
+        self.base_path = str(Path.home()) + "/.cache/eotdl/"
+        os.makedirs(self.base_path, exist_ok=True)
+        self.creds_path = self.base_path + "creds.json"
 
     def save_creds(self, data):
-        os.makedirs(self.home + "/.eotdl", exist_ok=True)
         with open(self.creds_path, "w") as f:
             json.dump(data, f)
         return self.creds_path
eotdl/src/usecases/auth/IsLogged.py
CHANGED
@@ -1,15 +1,17 @@
 from pydantic import BaseModel
+from typing import Union
+
 
 class IsLogged:
     def __init__(self, repo):
         self.repo = repo
 
     class Inputs(BaseModel):
-        pass
+        pass
 
     class Outputs(BaseModel):
-        user: dict
+        user: Union[dict, None]
 
     def __call__(self, inputs: Inputs) -> Outputs:
         user = self.repo.load_creds()
-        return self.Outputs(user=user)
+        return self.Outputs(user=user)
eotdl/src/usecases/datasets/DownloadDataset.py
CHANGED
@@ -1,8 +1,11 @@
 from pydantic import BaseModel
-from ....src.utils import calculate_checksum
-from ....curation.stac import STACDataFrame
 from pathlib import Path
 import os
+from typing import Union
+from tqdm import tqdm
+
+from ....curation.stac import STACDataFrame
+from ....src.utils import calculate_checksum
 
 
 class DownloadDataset:
@@ -13,9 +16,11 @@ class DownloadDataset:
 
     class Inputs(BaseModel):
         dataset: str
-        file: str = None
-        path: str = None
+        file: Union[str, None] = None
+        path: Union[str, None] = None
         user: dict
+        assets: bool = False
+        force: bool = False
 
     class Outputs(BaseModel):
         dst_path: str
@@ -32,11 +37,20 @@ class DownloadDataset:
 
     def __call__(self, inputs: Inputs) -> Outputs:
         dataset = self.retrieve_dataset(inputs.dataset)
+        download_base_path = os.getenv(
+            "EOTDL_DOWNLOAD_PATH", str(Path.home()) + "/.cache/eotdl/datasets"
+        )
         if inputs.path is None:
-            download_path =
+            download_path = download_base_path + "/" + inputs.dataset
         else:
             download_path = inputs.path + "/" + inputs.dataset
         os.makedirs(download_path, exist_ok=True)
+        # check if dataset already exists
+        if os.path.exists(download_path) and not inputs.force:
+            raise Exception(
+                f"Dataset {inputs.dataset} already exists at {download_path}. To force download, use force=True or -f in the CLI."
+            )
+
         if dataset["quality"] == 0:
             if inputs.file:
                 files = [f for f in dataset["files"] if f["name"] == inputs.file]
@@ -64,6 +78,7 @@ class DownloadDataset:
                )
             return self.Outputs(dst_path="/".join(dst_path.split("/")[:-1]))
         else:
+            self.logger("Downloading STAC metadata...")
             gdf, error = self.repo.download_stac(
                 dataset["id"],
                 inputs.user["id_token"],
@@ -74,6 +89,20 @@ class DownloadDataset:
             # df.geometry = df.geometry.apply(lambda x: Polygon() if x is None else x)
             path = inputs.path
             if path is None:
-                path =
+                path = download_base_path + "/" + dataset["name"]
             df.to_stac(path)
+            # download assets
+            if inputs.assets:
+                self.logger("Downloading assets...")
+                df = df.dropna(subset=["assets"])
+                for row in tqdm(df.iterrows(), total=len(df)):
+                    id = row[1]["stac_id"]
+                    # print(row[1]["links"])
+                    for k, v in row[1]["assets"].items():
+                        href = v["href"]
+                        self.repo.download_file_url(
+                            href, f"{path}/assets/{id}", inputs.user["id_token"]
+                        )
+            else:
+                self.logger("To download assets, set assets=True or -a in the CLI.")
             return self.Outputs(dst_path=path)
eotdl/src/usecases/datasets/DownloadFileURL.py
ADDED
@@ -0,0 +1,22 @@
+from pydantic import BaseModel
+
+
+class DownloadFileURL:
+    def __init__(self, repo, logger, progress=True):
+        self.repo = repo
+        self.logger = logger if logger else print
+        self.progress = progress
+
+    class Inputs(BaseModel):
+        url: str
+        path: str = None
+        user: dict
+
+    class Outputs(BaseModel):
+        dst_path: str
+
+    def __call__(self, inputs: Inputs) -> Outputs:
+        dst_path = self.repo.download_file_url(
+            inputs.url, inputs.path, inputs.user["id_token"], progress=self.progress
+        )
+        return self.Outputs(dst_path=dst_path)
eotdl/src/usecases/datasets/IngestFile.py
CHANGED
@@ -1,20 +1,24 @@
 from pydantic import BaseModel
 import os
 import typing
+from pathlib import Path
+from glob import glob
 
 from ....src.utils import calculate_checksum
 
 
 class IngestFile:
-    def __init__(self, repo, allowed_extensions, logger):
+    def __init__(self, repo, allowed_extensions, logger, verbose=True):
         self.repo = repo
         self.allowed_extensions = allowed_extensions
         self.logger = logger if logger else print
+        self.verbose = verbose
 
     class Inputs(BaseModel):
         file: typing.Any
         dataset_id: str
         user: dict
+        root: typing.Optional[Path] = None
 
     class Outputs(BaseModel):
         data: dict
@@ -27,34 +31,50 @@ class IngestFile:
                 f"Only {', '.join(self.allowed_extensions)} files are allowed"
             )
         id_token = inputs.user["id_token"]
-        self.
-
-
-
-
-        # else:
-        self.logger("Computing checksum...")
-        checksum = calculate_checksum(inputs.file)
-        self.logger(checksum)
-        self.logger("Ingesting file...")
-        filesize = os.path.getsize(inputs.file)
-        # ingest small file
-        if filesize < 1024 * 1024 * 16:  # 16 MB
-            data, error = self.repo.ingest_file(
-                inputs.file, inputs.dataset_id, id_token, checksum
+        if self.verbose:
+            self.logger(f"Uploading file {inputs.file}...")
+        if inputs.file.startswith("http://") or inputs.file.startswith("https://"):
+            data, error = self.repo.ingest_file_url(
+                inputs.file, inputs.dataset_id, id_token
            )
-
-
-
-
-
-
-
-
-
-
+        else:
+            file_path = Path(inputs.file)
+            if not file_path.is_absolute():
+                file_path = glob(
+                    str(inputs.root) + "/**/" + os.path.basename(file_path),
+                    recursive=True,
+                )
+                if len(file_path) == 0:
+                    raise Exception(f"File {inputs.file} not found")
+                elif len(file_path) > 1:
+                    raise Exception(f"Multiple files found for {inputs.file}")
+                file_path = file_path[0]
+            if self.verbose:
+                self.logger("Computing checksum...")
+            checksum = calculate_checksum(file_path)
+            if self.verbose:
+                self.logger("Ingesting file...")
+            filesize = os.path.getsize(file_path)
+            # ingest small file
+            if filesize < 1024 * 1024 * 16:  # 16 MB
+                data, error = self.repo.ingest_file(
+                    file_path, inputs.dataset_id, id_token, checksum
+                )
+                if error:
+                    raise Exception(error)
+                if self.verbose:
+                    self.logger("Done")
+                return self.Outputs(data=data)
+            # ingest large file
+            upload_id, parts = self.repo.prepare_large_upload(
+                file_path, inputs.dataset_id, checksum, id_token
+            )
+            self.repo.ingest_large_dataset(file_path, upload_id, id_token, parts)
+            if self.verbose:
+                self.logger("\nCompleting upload...")
+            data, error = self.repo.complete_upload(id_token, upload_id)
         if error:
             raise Exception(error)
-        self.
+        if self.verbose:
+            self.logger("Done")
         return self.Outputs(data=data)
eotdl/src/usecases/datasets/IngestSTAC.py
CHANGED
@@ -1,42 +1,77 @@
 from pydantic import BaseModel
-from ....curation.stac import STACDataFrame
 import json
+from pathlib import Path
+from tqdm import tqdm
+
+from ....curation.stac import STACDataFrame
 
 
 class IngestSTAC:
-    def __init__(self, repo, ingest_file, allowed_extensions):
+    def __init__(self, repo, ingest_file, allowed_extensions, logger):
         self.repo = repo
         self.ingest_file = ingest_file
         self.allowed_extensions = allowed_extensions
+        self.logger = logger if logger else print
 
     class Inputs(BaseModel):
-        stac_catalog:
-        dataset: str
+        stac_catalog: Path
         user: dict
 
     class Outputs(BaseModel):
         dataset: dict
 
     def __call__(self, inputs: Inputs) -> Outputs:
+        # retrieve the user's geodb credentials
+        # creds, error = self.repo.retrieve_credentials(inputs.user["id_token"])
+        # self.validate_credentials(creds)
         # load the STAC catalog as a STACsetFrame
+        self.logger("Loading STAC catalog...")
         df = STACDataFrame.from_stac_file(inputs.stac_catalog)
+        catalog = df[df["type"] == "Catalog"]
+        assert len(catalog) == 1, "STAC catalog must have exactly one root catalog"
+        dataset_name = catalog.id.iloc[0]
+        # create dataset
+        data, error = self.repo.create_stac_dataset(
+            dataset_name, inputs.user["id_token"]
+        )
+        if error:
+            data, error2 = self.repo.retrieve_dataset(dataset_name)
+            if error2:
+                raise Exception(error)
+            if data["uid"] != inputs.user["sub"]:
+                raise Exception("Dataset already exists.")
+            dataset_id = data["id"]
+            # TODO: put size to 0 or else will add up
+        else:
+            dataset_id = data["dataset_id"]
+        # TODO: check that we can ingest in geodb
         # upload all assets to EOTDL
-
+        self.logger("Uploading assets...")
+        df2 = df.dropna(subset=["assets"])
+        for row in tqdm(df2.iterrows(), total=len(df2)):
            # for asset in df.assets.dropna().values[:10]:
            try:
                for k, v in row[1]["assets"].items():
                    data = self.ingest_file(
                        v["href"],
-
-
+                        dataset_id,
+                        self.logger,
+                        self.allowed_extensions + [".tif", ".tiff", ".jpg"],
+                        verbose=False,
+                        root=inputs.stac_catalog.parent.parent,  # will this always be the case in STAC?
                    )
                    file_url = f"{self.repo.url}datasets/{data['dataset_id']}/download/{data['file_name']}"
                    df.loc[row[0], "assets"][k]["href"] = file_url
            except Exception as e:
+                self.logger(f"Error uploading asset {row[0]}: {e}")
                break
+        # ingest the STAC catalog into geodb
+        self.logger("Ingesting STAC catalog...")
         data, error = self.repo.ingest_stac(
-            json.loads(df.to_json()),
+            json.loads(df.to_json()), dataset_id, inputs.user["id_token"]
        )
        if error:
+            # TODO: delete all assets that were uploaded
            raise Exception(error)
+        self.logger("Done")
        return self.Outputs(dataset=data)
eotdl/src/usecases/datasets/RetrieveDatasets.py
CHANGED
@@ -10,9 +10,10 @@ class RetrieveDatasets:
         pass
 
     class Outputs(BaseModel):
-        datasets:
+        datasets: List[str]
 
     def __call__(self, inputs: Inputs) -> Outputs:
         data = self.repo.retrieve_datasets()
-        datasets = {d["name"]: [f["name"] for f in d["files"]] for d in data}
+        # datasets = {d["name"]: [f["name"] for f in d["files"]] for d in data}
+        datasets = [d["name"] for d in data]
         return self.Outputs(datasets=datasets)
eotdl/tools/sen12floods/tools.py
CHANGED
@@ -22,7 +22,7 @@ def get_images_by_location(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
         - images_count: the count of available images of each location.
         - images_dates: list with the dates of the available images of each location.
     """
-    uniques_location_id = gdf['
+    uniques_location_id = gdf['scene_id'].unique()   # List of unique location ids
     uniques_location_id.sort()
 
     images_count_list, images_dates_list = [], []
@@ -30,11 +30,11 @@ def get_images_by_location(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
     # Iterate the unique location ids, count the number of images per location and generate
     # a list with the dates of every image in a location
     for location_id in uniques_location_id:
-        dates = gdf[gdf['
+        dates = gdf[gdf['scene_id'] == location_id]['datetime']
         images_count_list.append(dates.count())
         images_dates_list.append(dates.tolist())
 
-    data = {'
+    data = {'scene_id': uniques_location_id, 'dates_count': images_count_list, 'dates_list': images_dates_list}
     gdf_dates_per_aoi = gpd.GeoDataFrame.from_dict(data)
 
     return gdf_dates_per_aoi
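After the rename to scene_id, get_images_by_location groups a GeoDataFrame of SEN12Floods items by scene and returns per-scene image counts and date lists. A sketch, assuming an input GeoDataFrame with scene_id and datetime columns (the file name is a placeholder):

    import geopandas as gpd
    from eotdl.tools.sen12floods.tools import get_images_by_location

    gdf = gpd.read_file("sen12floods_items.geojson")
    per_scene = get_images_by_location(gdf)
    # resulting columns: scene_id, dates_count, dates_list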
|