eotdl 2023.7.19.post3__py3-none-any.whl → 2023.9.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,11 +3,13 @@ STAC utils
 """
 
 import pystac
+import json
 
+from os.path import dirname, join, exists
+from os import listdir
 from datetime import datetime
 from dateutil import parser
 from pandas import isna
-from numpy import nan
 from typing import Union
 
 
@@ -84,3 +86,47 @@ def get_all_children(obj: pystac.STACObject) -> list:
         children.append(item.to_dict())
 
     return children
+
+
+def cut_images(images_list: Union[list, tuple]) -> list:
+    """
+    """
+    dirnames = list()
+    images = list()
+
+    for image in images_list:
+        dir = dirname(image)
+        if dir not in dirnames:
+            dirnames.append(dir)
+            images.append(image)
+
+    return images
+
+
+def get_item_metadata(raster_path: str) -> str:
+    """
+    Get the metadata JSON file of a given directory, associated with a raster file
+
+    :param raster_path: path to the raster file
+    """
+    # Get the directory of the raster file
+    raster_dir_path = dirname(raster_path)
+    # Get the metadata JSON file
+    # Check if there is a metadata.json file in the directory
+    if 'metadata.json' in listdir(raster_dir_path):
+        metadata_json = join(raster_dir_path, 'metadata.json')
+    else:
+        # If there is no metadata.json file in the directory, check if there is
+        # a json file with the same name as the raster file
+        raster_name = raster_path.split('/')[-1]
+        raster_name = raster_name.split('.')[0]
+        metadata_json = join(raster_dir_path, f'{raster_name}.json')
+        if not exists(metadata_json):
+            # If there is no metadata.json file in the directory, return None
+            return None
+
+    # Open the metadata.json file and return it
+    with open(metadata_json, 'r') as f:
+        metadata = json.load(f)
+
+    return metadata
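A minimal usage sketch of the two helpers added above (import path and file layout are assumed): cut_images keeps one image per directory, and get_item_metadata looks for a metadata.json or a <raster_name>.json sidecar next to the raster, returning the parsed JSON or None.

    from eotdl.curation.stac.utils import cut_images, get_item_metadata  # import path assumed

    scenes = ["data/scene_1/B04.tif", "data/scene_1/B08.tif", "data/scene_2/B04.tif"]
    cut_images(scenes)            # -> ["data/scene_1/B04.tif", "data/scene_2/B04.tif"]
    get_item_metadata(scenes[0])  # parsed metadata.json / B04.json, or None if neither exists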
@@ -1,3 +1,3 @@
-from .ingest import ingest_file, ingest_folder, ingest_q1, ingest_stac
-from .download import download_dataset
+from .ingest import ingest_dataset
+from .download import download_dataset, download_file_url
 from .retrieve import retrieve_datasets, retrieve_dataset, list_datasets
@@ -1,13 +1,26 @@
 from ..src.repos import APIRepo
-from ..src.usecases.datasets import DownloadDataset, DownloadFile
+from ..src.usecases.datasets import DownloadDataset, DownloadFileURL
 from .retrieve import retrieve_dataset
 from ..auth import with_auth
 
 
 @with_auth
-def download_dataset(dataset, file, path=None, logger=None, user=None):
+def download_dataset(
+    dataset, file=None, path=None, logger=None, assets=False, force=False, user=None
+):
     api_repo = APIRepo()
     download = DownloadDataset(api_repo, retrieve_dataset, logger)
-    inputs = download.Inputs(dataset=dataset, file=file, path=path, user=user)
+    inputs = download.Inputs(
+        dataset=dataset, file=file, path=path, user=user, assets=assets, force=force
+    )
+    outputs = download(inputs)
+    return outputs.dst_path
+
+
+@with_auth
+def download_file_url(url, path, progress=True, logger=None, user=None):
+    api_repo = APIRepo()
+    download = DownloadFileURL(api_repo, logger, progress)
+    inputs = DownloadFileURL.Inputs(url=url, path=path, user=user)
    outputs = download(inputs)
    return outputs.dst_path
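A sketch of the reworked download API under these new signatures; the dataset name and URL are hypothetical, and authentication is injected by the with_auth decorator.

    from eotdl.datasets import download_dataset, download_file_url

    path = download_dataset("MyDataset", assets=True, force=True)  # dataset name assumed
    download_file_url("https://api.eotdl.com/datasets/<id>/download/<file>", path="data")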
eotdl/datasets/ingest.py CHANGED
@@ -1,4 +1,4 @@
-import os
+from pathlib import Path
 
 from ..src.repos import APIRepo
 from ..src.usecases.datasets import IngestFile, IngestFolder, IngestSTAC
@@ -8,28 +8,39 @@ from ..auth import with_auth
 allowed_extensions = [
     ".zip",
     ".tar",
-    ".tar.gz",
+    ".gz",
     ".csv",
     ".txt",
     ".json",
+    ".geojson",
     ".pdf",
     ".md",
     ".yml",
 ]
 
 
-def ingest_q1(dataset, stac_catalog):
-    print("hola")
-    return
+def ingest_dataset(path, f=False, d=False, logger=print):
+    path = Path(path)
+    if not path.is_dir():
+        raise Exception("Path must be a folder")
+    if "catalog.json" in [f.name for f in path.iterdir()]:
+        return ingest_stac(path / "catalog.json", logger)
+    return ingest_folder(path, f, d, logger)
 
 
 @with_auth
 def ingest_file(
-    file, dataset_id, logger=None, allowed_extensions=allowed_extensions, user=None
+    file,
+    dataset_id,
+    logger=None,
+    allowed_extensions=allowed_extensions,
+    verbose=True,
+    root=None,
+    user=None,
 ):
     api_repo = APIRepo()
-    ingest = IngestFile(api_repo, allowed_extensions, logger)
-    inputs = ingest.Inputs(file=file, dataset_id=dataset_id, user=user)
+    ingest = IngestFile(api_repo, allowed_extensions, logger, verbose)
+    inputs = ingest.Inputs(file=file, dataset_id=dataset_id, user=user, root=root)
     outputs = ingest(inputs)
     return outputs.data
 
@@ -44,9 +55,9 @@ def ingest_folder(folder, force, delete, logger=None, user=None):
 
 
 @with_auth
-def ingest_stac(stac_catalog, dataset, logger=None, user=None):
+def ingest_stac(stac_catalog, logger=None, user=None):
     api_repo = APIRepo()
-    ingest = IngestSTAC(api_repo, ingest_file, allowed_extensions)
-    inputs = ingest.Inputs(stac_catalog=stac_catalog, dataset=dataset, user=user)
+    ingest = IngestSTAC(api_repo, ingest_file, allowed_extensions, logger)
+    inputs = ingest.Inputs(stac_catalog=stac_catalog, user=user)
     outputs = ingest(inputs)
     return outputs.dataset
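With these changes, ingest_dataset is the single entry point: it routes to ingest_stac when the folder contains a catalog.json and to ingest_folder otherwise. A sketch with hypothetical folder paths:

    from eotdl.datasets import ingest_dataset

    ingest_dataset("data/stac_dataset")          # has catalog.json -> ingest_stac
    ingest_dataset("data/plain_folder", f=True)  # no catalog.json -> ingest_folder (force)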
@@ -1,9 +1,17 @@
+import re
+
 from ..src.repos import APIRepo
 from ..src.usecases.datasets import RetrieveDatasets, RetrieveDataset
 
 
-def list_datasets():
-    return retrieve_datasets()
+def list_datasets(pattern=None):
+    datasets = retrieve_datasets()
+    if pattern:
+        regex = re.compile(rf".*{re.escape(pattern)}.*", re.IGNORECASE)
+        names = list(datasets.keys())
+        valid = [name for name in names if regex.search(name)]
+        return {name: datasets[name] for name in valid}
+    return datasets
 
 
 def retrieve_datasets():
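list_datasets now accepts an optional pattern that is compiled into a case-insensitive regex and matched against dataset names; a sketch of the intended call pattern:

    from eotdl.datasets import list_datasets

    list_datasets()            # all datasets
    list_datasets("sentinel")  # only names containing "sentinel", case-insensitive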
@@ -2,17 +2,14 @@ import requests
 from tqdm import tqdm
 from pathlib import Path
 import os
-from concurrent.futures import ThreadPoolExecutor
-import time
-import multiprocessing
 import hashlib
 import geopandas as gpd
 
 
 class APIRepo:
-    def __init__(self, url=os.getenv("EOTDL_API_URL", "https://api.eotdl.com/")):
+    # def __init__(self, url=os.getenv("EOTDL_API_URL", "https://api.eotdl.com/")):
+    def __init__(self, url=os.getenv("EOTDL_API_URL", "http://localhost:8010/")):
         self.url = url
-        # print(self.url)
 
     def login(self):
         return requests.get(self.url + "auth/login")
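Since the base URL is still read from the EOTDL_API_URL environment variable, the hardcoded localhost default can be overridden without code changes; a sketch (the variable must be set before the module is imported, because the default argument is evaluated at import time):

    import os
    os.environ["EOTDL_API_URL"] = "https://api.eotdl.com/"  # set before importing eotdl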
@@ -24,9 +21,18 @@ class APIRepo:
         response = requests.get(self.url + "auth/logout")
         return response.json()["logout_url"]
 
+    def retrieve_credentials(self, id_token):
+        response = requests.get(
+            self.url + "auth/credentials",
+            headers={"Authorization": "Bearer " + id_token},
+        )
+        if response.status_code == 200:
+            return response.json(), None
+        return None, response.json()["detail"]
+
     def create_dataset(self, metadata, id_token):
         response = requests.post(
-            self.url + "datasets",
+            self.url + "datasets/q0",
             json=metadata,
             headers={"Authorization": "Bearer " + id_token},
         )
@@ -34,6 +40,16 @@
             return response.json(), None
         return None, response.json()["detail"]
 
+    def create_stac_dataset(self, name, id_token):
+        response = requests.post(
+            self.url + "datasets/stac",
+            json={"name": name},
+            headers={"Authorization": "Bearer " + id_token},
+        )
+        if response.status_code == 200:
+            return response.json(), None
+        return None, response.json()["detail"]
+
     def retrieve_datasets(self):
         return requests.get(self.url + "datasets").json()
 
@@ -45,21 +61,29 @@
 
     def download_file(self, dataset, dataset_id, file, id_token, path):
         url = self.url + "datasets/" + dataset_id + "/download/" + file
+        return self.download_file_url(url, path, id_token, progress=True)
+
+    def download_file_url(self, url, path, id_token, progress=False):
         headers = {"Authorization": "Bearer " + id_token}
-        path = f"{path}/{file}"
+        filename = url.split("/")[-1]
+        os.makedirs(path, exist_ok=True)
+        path = f"{path}/{filename}"
         with requests.get(url, headers=headers, stream=True) as r:
             r.raise_for_status()
             total_size = int(r.headers.get("content-length", 0))
             block_size = 1024 * 1024 * 10
-            progress_bar = tqdm(
-                total=total_size, unit="iB", unit_scale=True, unit_divisor=1024
-            )
+            if progress:
+                progress_bar = tqdm(
+                    total=total_size, unit="iB", unit_scale=True, unit_divisor=1024
+                )
             with open(path, "wb") as f:
                 for chunk in r.iter_content(block_size):
-                    progress_bar.update(len(chunk))
+                    if progress:
+                        progress_bar.update(len(chunk))
                     if chunk:
                         f.write(chunk)
-            progress_bar.close()
+            if progress:
+                progress_bar.close()
         return path
 
     def ingest_file(self, file, dataset_id, id_token, checksum=None):
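download_file is now a thin wrapper around download_file_url, which derives the file name from the last URL segment and creates the target directory. A direct call might look like this (URL and token are hypothetical):

    repo = APIRepo()
    dst = repo.download_file_url(
        "https://api.eotdl.com/datasets/<id>/download/data.zip", "downloads", id_token, progress=True
    )  # -> "downloads/data.zip"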
@@ -75,8 +99,8 @@
 
     def ingest_file_url(self, file, dataset, id_token):
         reponse = requests.post(
-            self.url + "datasets/url",
-            json={"dataset": dataset, "url": file},
+            self.url + f"datasets/{dataset}/url",
+            json={"url": file},
             headers={"Authorization": "Bearer " + id_token},
         )
         if reponse.status_code != 200:
@@ -227,10 +251,10 @@
             return None, response.json()["detail"]
         return response.json(), None
 
-    def ingest_stac(self, stac_json, dataset, id_token):
-        reponse = requests.post(
-            self.url + "datasets/stac",
-            json={"dataset": dataset, "stac": stac_json},
+    def ingest_stac(self, stac_json, dataset_id, id_token):
+        reponse = requests.put(
+            self.url + f"datasets/stac/{dataset_id}",
+            json={"stac": stac_json},
             headers={"Authorization": "Bearer " + id_token},
         )
         if reponse.status_code != 200:
@@ -7,11 +7,11 @@ import jwt
 class AuthRepo:
     def __init__(self):
         self.algorithms = ["RS256"]
-        self.home = str(Path.home())
-        self.creds_path = self.home + "/.eotdl/creds.json"
+        self.base_path = str(Path.home()) + "/.cache/eotdl/"
+        os.makedirs(self.base_path, exist_ok=True)
+        self.creds_path = self.base_path + "creds.json"
 
     def save_creds(self, data):
-        os.makedirs(self.home + "/.eotdl", exist_ok=True)
         with open(self.creds_path, "w") as f:
             json.dump(data, f)
         return self.creds_path
@@ -1,15 +1,17 @@
 from pydantic import BaseModel
+from typing import Union
+
 
 class IsLogged:
     def __init__(self, repo):
         self.repo = repo
 
     class Inputs(BaseModel):
-        pass
+        pass
 
     class Outputs(BaseModel):
-        user: dict = None
+        user: Union[dict, None]
 
     def __call__(self, inputs: Inputs) -> Outputs:
         user = self.repo.load_creds()
-        return self.Outputs(user=user)
+        return self.Outputs(user=user)
@@ -1,8 +1,11 @@
 from pydantic import BaseModel
-from ....src.utils import calculate_checksum
-from ....curation.stac import STACDataFrame
 from pathlib import Path
 import os
+from typing import Union
+from tqdm import tqdm
+
+from ....curation.stac import STACDataFrame
+from ....src.utils import calculate_checksum
 
 
 class DownloadDataset:
@@ -13,9 +16,11 @@ class DownloadDataset:
 
     class Inputs(BaseModel):
         dataset: str
-        file: str = None
-        path: str = None
+        file: Union[str, None] = None
+        path: Union[str, None] = None
         user: dict
+        assets: bool = False
+        force: bool = False
 
     class Outputs(BaseModel):
         dst_path: str
@@ -32,11 +37,20 @@
 
     def __call__(self, inputs: Inputs) -> Outputs:
         dataset = self.retrieve_dataset(inputs.dataset)
+        download_base_path = os.getenv(
+            "EOTDL_DOWNLOAD_PATH", str(Path.home()) + "/.cache/eotdl/datasets"
+        )
         if inputs.path is None:
-            download_path = str(Path.home()) + "/.eotdl/datasets/" + inputs.dataset
+            download_path = download_base_path + "/" + inputs.dataset
         else:
             download_path = inputs.path + "/" + inputs.dataset
         os.makedirs(download_path, exist_ok=True)
+        # check if dataset already exists
+        if os.path.exists(download_path) and not inputs.force:
+            raise Exception(
+                f"Dataset {inputs.dataset} already exists at {download_path}. To force download, use force=True or -f in the CLI."
+            )
+
         if dataset["quality"] == 0:
             if inputs.file:
                 files = [f for f in dataset["files"] if f["name"] == inputs.file]
@@ -64,6 +78,7 @@
             )
             return self.Outputs(dst_path="/".join(dst_path.split("/")[:-1]))
         else:
+            self.logger("Downloading STAC metadata...")
             gdf, error = self.repo.download_stac(
                 dataset["id"],
                 inputs.user["id_token"],
@@ -74,6 +89,20 @@
             # df.geometry = df.geometry.apply(lambda x: Polygon() if x is None else x)
             path = inputs.path
             if path is None:
-                path = str(Path.home()) + "/.eotdl/datasets/" + dataset["name"]
+                path = download_base_path + "/" + dataset["name"]
             df.to_stac(path)
+            # download assets
+            if inputs.assets:
+                self.logger("Downloading assets...")
+                df = df.dropna(subset=["assets"])
+                for row in tqdm(df.iterrows(), total=len(df)):
+                    id = row[1]["stac_id"]
+                    # print(row[1]["links"])
+                    for k, v in row[1]["assets"].items():
+                        href = v["href"]
+                        self.repo.download_file_url(
+                            href, f"{path}/assets/{id}", inputs.user["id_token"]
+                        )
+            else:
+                self.logger("To download assets, set assets=True or -a in the CLI.")
             return self.Outputs(dst_path=path)
@@ -0,0 +1,22 @@
+from pydantic import BaseModel
+
+
+class DownloadFileURL:
+    def __init__(self, repo, logger, progress=True):
+        self.repo = repo
+        self.logger = logger if logger else print
+        self.progress = progress
+
+    class Inputs(BaseModel):
+        url: str
+        path: str = None
+        user: dict
+
+    class Outputs(BaseModel):
+        dst_path: str
+
+    def __call__(self, inputs: Inputs) -> Outputs:
+        dst_path = self.repo.download_file_url(
+            inputs.url, inputs.path, inputs.user["id_token"], progress=self.progress
+        )
+        return self.Outputs(dst_path=dst_path)
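The use case is a thin pass-through to APIRepo.download_file_url; a wiring sketch, assuming user is the credentials dict injected by with_auth and url points at a downloadable file:

    download = DownloadFileURL(APIRepo(), logger=print, progress=False)
    outputs = download(DownloadFileURL.Inputs(url=url, path="data", user=user))
    print(outputs.dst_path)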
@@ -1,20 +1,24 @@
 from pydantic import BaseModel
 import os
 import typing
+from pathlib import Path
+from glob import glob
 
 from ....src.utils import calculate_checksum
 
 
 class IngestFile:
-    def __init__(self, repo, allowed_extensions, logger):
+    def __init__(self, repo, allowed_extensions, logger, verbose=True):
         self.repo = repo
         self.allowed_extensions = allowed_extensions
         self.logger = logger if logger else print
+        self.verbose = verbose
 
     class Inputs(BaseModel):
         file: typing.Any
         dataset_id: str
         user: dict
+        root: typing.Optional[Path] = None
 
     class Outputs(BaseModel):
         data: dict
@@ -27,34 +31,50 @@ class IngestFile:
                 f"Only {', '.join(self.allowed_extensions)} files are allowed"
             )
         id_token = inputs.user["id_token"]
-        self.logger(f"Uploading file {inputs.file}...")
-        # if inputs.file.startswith("http://") or inputs.file.startswith("https://"):
-        #     data, error = self.repo.ingest_file_url(
-        #         inputs.file, inputs.metadata.name, id_token
-        #     )
-        # else:
-        self.logger("Computing checksum...")
-        checksum = calculate_checksum(inputs.file)
-        self.logger(checksum)
-        self.logger("Ingesting file...")
-        filesize = os.path.getsize(inputs.file)
-        # ingest small file
-        if filesize < 1024 * 1024 * 16:  # 16 MB
-            data, error = self.repo.ingest_file(
-                inputs.file, inputs.dataset_id, id_token, checksum
+        if self.verbose:
+            self.logger(f"Uploading file {inputs.file}...")
+        if inputs.file.startswith("http://") or inputs.file.startswith("https://"):
+            data, error = self.repo.ingest_file_url(
+                inputs.file, inputs.dataset_id, id_token
             )
-            if error:
-                raise Exception(error)
-            self.logger("Done")
-            return self.Outputs(data=data)
-        # ingest large file
-        upload_id, parts = self.repo.prepare_large_upload(
-            inputs.file, inputs.dataset_id, checksum, id_token
-        )
-        self.repo.ingest_large_dataset(inputs.file, upload_id, id_token, parts)
-        self.logger("\nCompleting upload...")
-        data, error = self.repo.complete_upload(id_token, upload_id)
+        else:
+            file_path = Path(inputs.file)
+            if not file_path.is_absolute():
+                file_path = glob(
+                    str(inputs.root) + "/**/" + os.path.basename(file_path),
+                    recursive=True,
+                )
+                if len(file_path) == 0:
+                    raise Exception(f"File {inputs.file} not found")
+                elif len(file_path) > 1:
+                    raise Exception(f"Multiple files found for {inputs.file}")
+                file_path = file_path[0]
+            if self.verbose:
+                self.logger("Computing checksum...")
+            checksum = calculate_checksum(file_path)
+            if self.verbose:
+                self.logger("Ingesting file...")
+            filesize = os.path.getsize(file_path)
+            # ingest small file
+            if filesize < 1024 * 1024 * 16:  # 16 MB
+                data, error = self.repo.ingest_file(
+                    file_path, inputs.dataset_id, id_token, checksum
+                )
+                if error:
+                    raise Exception(error)
+                if self.verbose:
+                    self.logger("Done")
+                return self.Outputs(data=data)
+            # ingest large file
+            upload_id, parts = self.repo.prepare_large_upload(
+                file_path, inputs.dataset_id, checksum, id_token
+            )
+            self.repo.ingest_large_dataset(file_path, upload_id, id_token, parts)
+            if self.verbose:
+                self.logger("\nCompleting upload...")
+            data, error = self.repo.complete_upload(id_token, upload_id)
         if error:
             raise Exception(error)
-        self.logger("Done")
+        if self.verbose:
+            self.logger("Done")
         return self.Outputs(data=data)
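When a relative path is passed together with root, the file is resolved by a recursive glob on its basename, and anything other than exactly one match raises an exception. The resolution step in isolation (paths hypothetical):

    from glob import glob
    import os

    matches = glob(str(root) + "/**/" + os.path.basename("images/B04.tif"), recursive=True)
    # len(matches) == 0 or > 1 raises; otherwise matches[0] is ingested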
@@ -1,42 +1,77 @@
 from pydantic import BaseModel
-from ....curation.stac import STACDataFrame
 import json
+from pathlib import Path
+from tqdm import tqdm
+
+from ....curation.stac import STACDataFrame
 
 
 class IngestSTAC:
-    def __init__(self, repo, ingest_file, allowed_extensions):
+    def __init__(self, repo, ingest_file, allowed_extensions, logger):
         self.repo = repo
         self.ingest_file = ingest_file
         self.allowed_extensions = allowed_extensions
+        self.logger = logger if logger else print
 
     class Inputs(BaseModel):
-        stac_catalog: str
-        dataset: str
+        stac_catalog: Path
         user: dict
 
     class Outputs(BaseModel):
         dataset: dict
 
     def __call__(self, inputs: Inputs) -> Outputs:
+        # retrieve the user's geodb credentials
+        # creds, error = self.repo.retrieve_credentials(inputs.user["id_token"])
+        # self.validate_credentials(creds)
         # load the STAC catalog as a STACsetFrame
+        self.logger("Loading STAC catalog...")
         df = STACDataFrame.from_stac_file(inputs.stac_catalog)
+        catalog = df[df["type"] == "Catalog"]
+        assert len(catalog) == 1, "STAC catalog must have exactly one root catalog"
+        dataset_name = catalog.id.iloc[0]
+        # create dataset
+        data, error = self.repo.create_stac_dataset(
+            dataset_name, inputs.user["id_token"]
+        )
+        if error:
+            data, error2 = self.repo.retrieve_dataset(dataset_name)
+            if error2:
+                raise Exception(error)
+            if data["uid"] != inputs.user["sub"]:
+                raise Exception("Dataset already exists.")
+            dataset_id = data["id"]
+            # TODO: put size to 0 or else will add up
+        else:
+            dataset_id = data["dataset_id"]
+        # TODO: check that we can ingest in geodb
         # upload all assets to EOTDL
-        for row in df.dropna(subset=["assets"]).iterrows():
+        self.logger("Uploading assets...")
+        df2 = df.dropna(subset=["assets"])
+        for row in tqdm(df2.iterrows(), total=len(df2)):
             # for asset in df.assets.dropna().values[:10]:
             try:
                 for k, v in row[1]["assets"].items():
                     data = self.ingest_file(
                         v["href"],
-                        inputs.dataset,
-                        allowed_extensions=self.allowed_extensions + [".tif", ".tiff"],
+                        dataset_id,
+                        self.logger,
+                        self.allowed_extensions + [".tif", ".tiff", ".jpg"],
+                        verbose=False,
+                        root=inputs.stac_catalog.parent.parent,  # will this always be the case with STAC?
                     )
                     file_url = f"{self.repo.url}datasets/{data['dataset_id']}/download/{data['file_name']}"
                     df.loc[row[0], "assets"][k]["href"] = file_url
             except Exception as e:
+                self.logger(f"Error uploading asset {row[0]}: {e}")
                 break
+        # ingest the STAC catalog into geodb
+        self.logger("Ingesting STAC catalog...")
         data, error = self.repo.ingest_stac(
-            json.loads(df.to_json()), inputs.dataset, inputs.user["id_token"]
+            json.loads(df.to_json()), dataset_id, inputs.user["id_token"]
         )
         if error:
+            # TODO: delete all assets that were uploaded
             raise Exception(error)
+        self.logger("Done")
         return self.Outputs(dataset=data)
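The resulting flow is: create (or reuse) a STAC dataset named after the root catalog, upload every asset via ingest_file, rewrite each asset href to its EOTDL download URL, then PUT the whole catalog as JSON to datasets/stac/{dataset_id}. A condensed sketch of the repository calls involved (names taken from the code above):

    data, error = repo.create_stac_dataset(dataset_name, id_token)
    # ... upload assets and rewrite hrefs in the STACDataFrame ...
    data, error = repo.ingest_stac(json.loads(df.to_json()), dataset_id, id_token)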
@@ -10,9 +10,10 @@ class RetrieveDatasets:
         pass
 
     class Outputs(BaseModel):
-        datasets: dict
+        datasets: List[str]
 
     def __call__(self, inputs: Inputs) -> Outputs:
         data = self.repo.retrieve_datasets()
-        datasets = {d["name"]: [f["name"] for f in d["files"]] for d in data}
+        # datasets = {d["name"]: [f["name"] for f in d["files"]] for d in data}
+        datasets = [d["name"] for d in data]
         return self.Outputs(datasets=datasets)
@@ -1,4 +1,5 @@
 from .DownloadDataset import DownloadDataset
+from .DownloadFileURL import DownloadFileURL
 from .IngestDataset import IngestDataset
 from .IngestLargeDataset import IngestLargeDataset
 from .RetrieveDataset import RetrieveDataset