eotdl 2023.6.14.post10.tar.gz → 2023.7.19.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/PKG-INFO +1 -1
  2. {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/cli.py +0 -1
  3. eotdl-2023.7.19/eotdl/commands/datasets.py +67 -0
  4. {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/curation/stac/__init__.py +1 -1
  5. eotdl-2023.7.19/eotdl/curation/stac/dataframe.py +144 -0
  6. eotdl-2023.7.19/eotdl/datasets/__init__.py +3 -0
  7. eotdl-2023.7.19/eotdl/datasets/download.py +13 -0
  8. eotdl-2023.7.19/eotdl/datasets/ingest.py +52 -0
  9. eotdl-2023.7.19/eotdl/src/models/__init__.py +1 -0
  10. eotdl-2023.7.19/eotdl/src/models/metadata.py +16 -0
  11. {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/src/repos/APIRepo.py +82 -98
  12. eotdl-2023.7.19/eotdl/src/usecases/datasets/DownloadDataset.py +79 -0
  13. eotdl-2023.6.14.post10/eotdl/src/usecases/datasets/DownloadDataset.py → eotdl-2023.7.19/eotdl/src/usecases/datasets/DownloadFile.py +8 -5
  14. eotdl-2023.7.19/eotdl/src/usecases/datasets/IngestFile.py +60 -0
  15. eotdl-2023.7.19/eotdl/src/usecases/datasets/IngestFolder.py +98 -0
  16. eotdl-2023.7.19/eotdl/src/usecases/datasets/IngestSTAC.py +42 -0
  17. {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/src/usecases/datasets/RetrieveDatasets.py +5 -4
  18. {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/src/usecases/datasets/__init__.py +3 -1
  19. eotdl-2023.7.19/eotdl/src/utils.py +17 -0
  20. {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/pyproject.toml +2 -1
  21. {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/setup.py +2 -1
  22. eotdl-2023.6.14.post10/eotdl/commands/datasets.py +0 -78
  23. eotdl-2023.6.14.post10/eotdl/datasets/__init__.py +0 -4
  24. eotdl-2023.6.14.post10/eotdl/datasets/download.py +0 -18
  25. eotdl-2023.6.14.post10/eotdl/datasets/ingest.py +0 -32
  26. eotdl-2023.6.14.post10/eotdl/datasets/update.py +0 -12
  27. eotdl-2023.6.14.post10/eotdl/src/usecases/datasets/UpdateDataset.py +0 -32
  28. eotdl-2023.6.14.post10/eotdl/src/utils.py +0 -17
  29. {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/README.md +0 -0
  30. {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/__init__.py +0 -0
  31. {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/access/__init__.py +0 -0
  32. {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/access/parameters.py +0 -0
  33. {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/access/sentinelhub/__init__.py +0 -0
  34. {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/access/sentinelhub/client.py +0 -0
  35. {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/access/sentinelhub/utils.py +0 -0
  36. {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/auth/__init__.py +0 -0
  37. {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/auth/main.py +0 -0
  38. {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/commands/__init__.py +0 -0
  39. {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/commands/auth.py +0 -0
  40. {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/curation/__init__.py +0 -0
  41. {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/curation/formatters.py +0 -0
  42. {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/curation/metadata.py +0 -0
  43. eotdl-2023.6.14.post10/eotdl/curation/stac/dataframe.py → eotdl-2023.7.19/eotdl/curation/stac/dataframe_bck.py +0 -0
  44. {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/curation/stac/extensions.py +0 -0
  45. {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/curation/stac/parsers.py +0 -0
  46. {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/curation/stac/stac.py +0 -0
  47. {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/curation/stac/utils.py +0 -0
  48. {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/datasets/retrieve.py +0 -0
  49. {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/hello.py +0 -0
  50. {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/src/__init__.py +0 -0
  51. {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/src/errors/__init__.py +0 -0
  52. {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/src/errors/auth.py +0 -0
  53. {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/src/repos/AuthRepo.py +0 -0
  54. {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/src/repos/__init__.py +0 -0
  55. {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/src/usecases/__init__.py +0 -0
  56. {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/src/usecases/auth/Auth.py +0 -0
  57. {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/src/usecases/auth/IsLogged.py +0 -0
  58. {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/src/usecases/auth/Logout.py +0 -0
  59. {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/src/usecases/auth/__init__.py +0 -0
  60. {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/src/usecases/datasets/IngestDataset.py +0 -0
  61. {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/src/usecases/datasets/IngestLargeDataset.py +0 -0
  62. {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/src/usecases/datasets/IngestLargeDatasetParallel.py +0 -0
  63. {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/src/usecases/datasets/RetrieveDataset.py +0 -0
  64. {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/tools/__init__.py +0 -0
  65. {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/tools/sen12floods/__init__.py +0 -0
  66. {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/tools/sen12floods/tools.py +0 -0
  67. {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/tools/stac.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: eotdl
- Version: 2023.6.14.post10
+ Version: 2023.7.19
  Summary: Earth Observation Training Data Lab
  License: MIT
  Author: EarthPulse
@@ -6,6 +6,5 @@ app = typer.Typer()
  app.add_typer(auth.app, name="auth")
  app.add_typer(datasets.app, name="datasets")

-
  if __name__ == "__main__":
      app()
@@ -0,0 +1,67 @@
+ import typer
+ from pathlib import Path
+
+ from ..datasets import (
+     retrieve_datasets,
+     download_dataset,
+     ingest_folder,
+     ingest_stac,
+ )
+
+ app = typer.Typer()
+
+
+ @app.command()
+ def ingest(
+     path: Path,
+     f: bool = typer.Option(False, "--f", help="Force ingest even if file exists"),
+     d: bool = typer.Option(False, "--d", help="Delete files not in the dataset"),
+ ):
+     """
+     Ingest a dataset
+
+     path: Path to folder with the dataset
+     """
+     try:
+         if not path.is_dir():
+             typer.echo("Path must be a folder")
+             return
+         if "catalog.json" in [f.name for f in path.iterdir()]:
+             ingest_stac(str(path) + "/catalog.json", typer.echo)
+         else:
+             ingest_folder(path, f, d, typer.echo)
+     except Exception as e:
+         typer.echo(e)
+
+
+ @app.command()
+ def list():
+     """
+     List all datasets and files
+     """
+     datasets = retrieve_datasets()
+     typer.echo(datasets)
+
+
+ @app.command()
+ def get(
+     dataset: str,
+     path: str = None,
+     file: str = None,
+ ):
+     """
+     Download a dataset
+
+     dataset: Name of the dataset
+     file: Name of the file to download (optional, if not provided, the whole dataset will be downloaded)
+     path: Path to download the dataset to (optional, if not provided, the dataset will be downloaded to ~/.eotdl/datasets)
+     """
+     try:
+         dst_path = download_dataset(dataset, file, path, typer.echo)
+         typer.echo(f"Data available at {dst_path}")
+     except Exception as e:
+         typer.echo(e)
+
+
+ if __name__ == "__main__":
+     app()
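
Note: the new `datasets` command group is a thin wrapper over the functions re-exported by `eotdl.datasets`. A minimal sketch of the equivalent library calls (the dataset name is a placeholder, and an `eotdl auth login` session is assumed):

from eotdl.datasets import retrieve_datasets, download_dataset

# same as `eotdl datasets list`
print(retrieve_datasets())

# same as `eotdl datasets get EuroSAT` ("EuroSAT" is a hypothetical dataset name);
# with file=None and path=None the whole dataset goes to ~/.eotdl/datasets/<name>
dst_path = download_dataset("EuroSAT", None, None, print)
print(f"Data available at {dst_path}")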
@@ -1,4 +1,4 @@
  # from .stac import STACGenerator
  # from .utils import format_time_acquired
  # from .parsers import STACIdParser, StructuredParser, UnestructuredParser
- # from .dataframe import STACDataFrame, read_stac
+ from .dataframe import STACDataFrame, read_stac
@@ -0,0 +1,144 @@
+ """
+ Module for the STAC dataframe
+ """
+
+ import pandas as pd
+ import geopandas as gpd
+ import pystac
+ import json
+ from geomet import wkt
+ from os.path import join
+ from os import makedirs
+ from typing import Union
+ from math import isnan
+ from .utils import convert_df_geom_to_shape, get_all_children
+
+
+ class STACDataFrame(gpd.GeoDataFrame):
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+
+     @classmethod
+     def from_stac_file(self, stac_file):
+         """
+         Create a STACDataFrame from a STAC file
+         """
+         return read_stac(stac_file)
+
+     def to_stac(self, path):
+         """
+         Create a STAC catalog and children from a STACDataFrame
+         """
+         df = self.copy()
+
+         if "id" in df.columns and "stac_id" in df.columns:
+             id_column = "stac_id"
+             stac_id_exists = True
+         else:
+             id_column = "id"
+             stac_id_exists = False
+
+         # First, create the catalog and its folder, if exists
+         catalog_df = df[df["type"] == "Catalog"]
+
+         if catalog_df.empty:
+             makedirs(path, exist_ok=True)
+         else:
+             for index, row in catalog_df.iterrows():
+                 root_output_folder = path + "/" + row[id_column]
+                 makedirs(root_output_folder, exist_ok=True)
+                 row_json = row.to_dict()
+
+                 # Curate the json row
+                 row_json = self.curate_json_row(row_json, stac_id_exists)
+
+                 with open(join(root_output_folder, f"catalog.json"), "w") as f:
+                     json.dump(row_json, f)
+
+         # Second, create the collections and their folders, if exist
+         collections = dict()
+         collections_df = df[df["type"] == "Collection"]
+         for index, row in collections_df.iterrows():
+             stac_output_folder = join(root_output_folder, row[id_column])
+             collections[row[id_column]] = stac_output_folder
+             makedirs(stac_output_folder, exist_ok=True)
+             row_json = row.to_dict()
+
+             # Curate the json row
+             row_json = self.curate_json_row(row_json, stac_id_exists)
+
+             with open(join(stac_output_folder, f"collection.json"), "w") as f:
+                 json.dump(row_json, f)
+
+         # Then, create the items and their folders, if exist
+         features_df = df[df["type"] == "Feature"]
+         for index, row in features_df.iterrows():
+             collection = row["collection"]
+             stac_output_folder = join(collections[collection], row[id_column])
+
+             # Convert the geometry from WKT back to geojson
+             row["geometry"] = row["geometry"].wkt
+             row["geometry"] = wkt.loads(row["geometry"])
+             makedirs(stac_output_folder, exist_ok=True)
+             row_json = row.to_dict()
+
+             # Curate the json row
+             row_json = self.curate_json_row(row_json, stac_id_exists)
+
+             with open(join(stac_output_folder, f'{row_json["id"]}.json'), "w") as f:
+                 json.dump(row_json, f)
+
+     def curate_json_row(self, row: dict, stac_id_exists: bool) -> dict:
+         """
+         Curate the json row of a STACDataFrame, in order to generate a valid STAC file
+
+         :param row: row of a STACDataFrame
+         :param stac_id_exists: if the stac_id column exists
+         """
+         keys_to_remove = list()
+
+         # Remove the created_at and modified_at columns, if the STACDataFrame comes from GeoDB
+         for i in "created_at", "modified_at":
+             if i in row.keys():
+                 keys_to_remove.append(i)
+
+         # Rename the stac_id column to id, to avoid conflicts with the id column
+         if stac_id_exists:
+             row["id"] = row["stac_id"]
+             del row["stac_id"]
+
+         # Remove the NaN values and empty strings
+         for k, v in row.items():
+             if (isinstance(v, float) and isnan(v)) or v == "":
+                 keys_to_remove.append(k)
+         for key in keys_to_remove:
+             del row[key]
+         del row["geometry"]
+
+         return row
+
+
+ def read_stac(
+     stac_file: Union[pystac.Catalog, pystac.Collection, str],
+     geometry_column: str = "geometry",
+ ) -> STACDataFrame:
+     """
+     Read a STAC file and return a STACDataFrame
+
+     :param stac_file: STAC file to read
+     :param geometry_column: name of the geometry column
+     """
+     if isinstance(stac_file, str):
+         stac_file = pystac.read_file(stac_file)
+     children = get_all_children(stac_file)
+
+     # Convert Dataframe to STACDataFrame
+     dataframe = pd.DataFrame(children)
+     dataframe[geometry_column] = dataframe.apply(convert_df_geom_to_shape, axis=1)
+     stac_dataframe = STACDataFrame(
+         dataframe,
+         crs="EPSG:4326",
+         geometry=gpd.GeoSeries.from_wkt(dataframe[geometry_column]),
+     )
+
+     return stac_dataframe
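
Note: a rough round-trip sketch for the new dataframe module (assumes `pystac`, `geopandas` and `geomet` are installed and that `catalog.json` is a valid local STAC catalog; both paths are placeholders):

from eotdl.curation.stac import STACDataFrame, read_stac

# load a STAC catalog and all of its children into a GeoDataFrame subclass
df = read_stac("catalog.json")

# ...filter or inspect df like any GeoDataFrame...

# write it back out as catalog.json / collection.json / <item id>.json files
df.to_stac("exported_catalog")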
@@ -0,0 +1,3 @@
+ from .ingest import ingest_file, ingest_folder, ingest_q1, ingest_stac
+ from .download import download_dataset
+ from .retrieve import retrieve_datasets, retrieve_dataset, list_datasets
@@ -0,0 +1,13 @@
+ from ..src.repos import APIRepo
+ from ..src.usecases.datasets import DownloadDataset, DownloadFile
+ from .retrieve import retrieve_dataset
+ from ..auth import with_auth
+
+
+ @with_auth
+ def download_dataset(dataset, file, path=None, logger=None, user=None):
+     api_repo = APIRepo()
+     download = DownloadDataset(api_repo, retrieve_dataset, logger)
+     inputs = download.Inputs(dataset=dataset, file=file, path=path, user=user)
+     outputs = download(inputs)
+     return outputs.dst_path
@@ -0,0 +1,52 @@
+ import os
+
+ from ..src.repos import APIRepo
+ from ..src.usecases.datasets import IngestFile, IngestFolder, IngestSTAC
+ from ..auth import with_auth
+
+
+ allowed_extensions = [
+     ".zip",
+     ".tar",
+     ".tar.gz",
+     ".csv",
+     ".txt",
+     ".json",
+     ".pdf",
+     ".md",
+     ".yml",
+ ]
+
+
+ def ingest_q1(dataset, stac_catalog):
+     print("hola")
+     return
+
+
+ @with_auth
+ def ingest_file(
+     file, dataset_id, logger=None, allowed_extensions=allowed_extensions, user=None
+ ):
+     api_repo = APIRepo()
+     ingest = IngestFile(api_repo, allowed_extensions, logger)
+     inputs = ingest.Inputs(file=file, dataset_id=dataset_id, user=user)
+     outputs = ingest(inputs)
+     return outputs.data
+
+
+ @with_auth
+ def ingest_folder(folder, force, delete, logger=None, user=None):
+     api_repo = APIRepo()
+     ingest = IngestFolder(api_repo, ingest_file, allowed_extensions, logger)
+     inputs = ingest.Inputs(folder=folder, user=user, force=force, delete=delete)
+     outputs = ingest(inputs)
+     return outputs.dataset
+
+
+ @with_auth
+ def ingest_stac(stac_catalog, dataset, logger=None, user=None):
+     api_repo = APIRepo()
+     ingest = IngestSTAC(api_repo, ingest_file, allowed_extensions)
+     inputs = ingest.Inputs(stac_catalog=stac_catalog, dataset=dataset, user=user)
+     outputs = ingest(inputs)
+     return outputs.dataset
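
Note: a hedged sketch of the programmatic ingestion path these wrappers enable (the folder name is a placeholder; `with_auth` injects the `user` argument after `eotdl auth login`):

from pathlib import Path
from eotdl.datasets import ingest_folder, ingest_stac

# upload every file with an allowed extension from a local folder
# (force/delete mirror the --f/--d CLI flags)
dataset = ingest_folder(Path("my-dataset"), False, False, print)

# for STAC datasets, ingest_stac takes the catalog path and the dataset name
# dataset = ingest_stac("my-dataset/catalog.json", "my-dataset", print)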
@@ -0,0 +1 @@
+ from .metadata import Metadata
@@ -0,0 +1,16 @@
+ from pydantic import BaseModel, validator
+ from typing import List
+
+
+ class Metadata(BaseModel):
+     authors: List[str]
+     license: str
+     source: str
+     name: str
+
+     # validate source is a URL
+     @validator("source")
+     def source_is_url(cls, v):
+         if not v.startswith("http") and not v.startswith("https"):
+             raise ValueError("source must be a URL")
+         return v
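
Note: the new Metadata model rejects any `source` that is not an http(s) URL; a short illustrative sketch (field values are made up):

from eotdl.src.models import Metadata

Metadata(
    authors=["EarthPulse"],
    license="MIT",
    source="https://www.eotdl.com",  # any http(s) URL passes the validator
    name="example-dataset",
)

try:
    Metadata(authors=["EarthPulse"], license="MIT", source="not-a-url", name="bad")
except ValueError as e:
    print(e)  # validation error mentions "source must be a URL"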
@@ -6,11 +6,13 @@ from concurrent.futures import ThreadPoolExecutor
  import time
  import multiprocessing
  import hashlib
+ import geopandas as gpd


  class APIRepo:
      def __init__(self, url=os.getenv("EOTDL_API_URL", "https://api.eotdl.com/")):
          self.url = url
+         # print(self.url)

      def login(self):
          return requests.get(self.url + "auth/login")
@@ -22,6 +24,16 @@ class APIRepo:
          response = requests.get(self.url + "auth/logout")
          return response.json()["logout_url"]

+     def create_dataset(self, metadata, id_token):
+         response = requests.post(
+             self.url + "datasets",
+             json=metadata,
+             headers={"Authorization": "Bearer " + id_token},
+         )
+         if response.status_code == 200:
+             return response.json(), None
+         return None, response.json()["detail"]
+
      def retrieve_datasets(self):
          return requests.get(self.url + "datasets").json()

@@ -31,12 +43,10 @@
              return response.json(), None
          return None, response.json()["detail"]

-     def download_dataset(self, dataset_id, id_token, path):
-         url = self.url + "datasets/" + dataset_id + "/download"
+     def download_file(self, dataset, dataset_id, file, id_token, path):
+         url = self.url + "datasets/" + dataset_id + "/download/" + file
          headers = {"Authorization": "Bearer " + id_token}
-         if path is None:
-             path = str(Path.home()) + "/.eotdl/datasets"
-         os.makedirs(path, exist_ok=True)
+         path = f"{path}/{file}"
          with requests.get(url, headers=headers, stream=True) as r:
              r.raise_for_status()
              total_size = int(r.headers.get("content-length", 0))
@@ -44,10 +54,6 @@
              progress_bar = tqdm(
                  total=total_size, unit="iB", unit_scale=True, unit_divisor=1024
              )
-             filename = r.headers.get("content-disposition").split("filename=")[1][1:-1]
-             path = f"{path}/{filename}"
-             if os.path.exists(path):
-                 raise Exception("File already exists")
              with open(path, "wb") as f:
                  for chunk in r.iter_content(block_size):
                      progress_bar.update(len(chunk))
@@ -56,6 +62,27 @@
              progress_bar.close()
              return path

+     def ingest_file(self, file, dataset_id, id_token, checksum=None):
+         reponse = requests.post(
+             self.url + "datasets/" + dataset_id,
+             files={"file": open(file, "rb")},
+             data={"checksum": checksum} if checksum else None,
+             headers={"Authorization": "Bearer " + id_token},
+         )
+         if reponse.status_code != 200:
+             return None, reponse.json()["detail"]
+         return reponse.json(), None
+
+     def ingest_file_url(self, file, dataset, id_token):
+         reponse = requests.post(
+             self.url + "datasets/url",
+             json={"dataset": dataset, "url": file},
+             headers={"Authorization": "Bearer " + id_token},
+         )
+         if reponse.status_code != 200:
+             return None, reponse.json()["detail"]
+         return reponse.json(), None
+
      def read_in_chunks(self, file_object, CHUNK_SIZE):
          while True:
              data = file_object.read(CHUNK_SIZE)
@@ -63,18 +90,21 @@
                  break
              yield data

-     def prepare_large_upload(self, name, id_token, checksum):
-         url = self.url + "datasets/chunk?name=" + name + "&checksum=" + checksum
-         response = requests.get(url, headers={"Authorization": "Bearer " + id_token})
+     def prepare_large_upload(self, file, dataset_id, checksum, id_token):
+         filename = Path(file).name
+         response = requests.post(
+             self.url + f"datasets/{dataset_id}/uploadId",
+             json={"name": filename, "checksum": checksum},
+             headers={"Authorization": "Bearer " + id_token},
+         )
          if response.status_code != 200:
              raise Exception(response.json()["detail"])
          data = response.json()
-         dataset_id, upload_id, parts = (
-             data["dataset_id"],
+         upload_id, parts = (
              data["upload_id"],
              data["parts"] if "parts" in data else [],
          )
-         return dataset_id, upload_id, parts
+         return upload_id, parts

      def get_chunk_size(self, content_size):
          # adapt chunk size to content size to avoid S3 limits (10000 parts, 500MB per part, 5TB per object)
@@ -85,37 +115,31 @@
              chunk_size = 1024 * 1024 * 500  # 0.5 GB (up to 5 TB, 10000 parts)
          return chunk_size

-     def ingest_large_dataset(self, path, upload_id, dataset_id, id_token, parts):
-         content_path = os.path.abspath(path)
+     def ingest_large_dataset(self, file, upload_id, id_token, parts):
+         content_path = os.path.abspath(file)
          content_size = os.stat(content_path).st_size
          chunk_size = self.get_chunk_size(content_size)
          total_chunks = content_size // chunk_size
-         url = self.url + "datasets/chunk"
-         headers = {
-             "Authorization": "Bearer " + id_token,
-             "Upload-Id": upload_id,
-             "Dataset-Id": dataset_id,
-         }
          # upload chunks sequentially
          pbar = tqdm(
              self.read_in_chunks(open(content_path, "rb"), chunk_size),
              total=total_chunks,
          )
          index = 0
-         parts_checkusms = []
          for chunk in pbar:
              part = index // chunk_size + 1
              offset = index + len(chunk)
              index = offset
              if part not in parts:
-                 headers["Part-Number"] = str(part)
                  checksum = hashlib.md5(chunk).hexdigest()
-                 parts_checkusms.append(checksum)
-                 headers["Checksum"] = checksum
-                 file = {"file": chunk}
-                 r = requests.post(url, files=file, headers=headers)
-                 if r.status_code != 200:
-                     return None, r.json()["detail"]
+                 response = requests.post(
+                     self.url + "datasets/chunk/" + upload_id,
+                     files={"file": chunk},
+                     data={"part_number": part, "checksum": checksum},
+                     headers={"Authorization": "Bearer " + id_token},
+                 )
+                 if response.status_code != 200:
+                     raise Exception(response.json()["detail"])
              pbar.set_description(
                  "{:.2f}/{:.2f} MB".format(
                      offset / 1024 / 1024, content_size / 1024 / 1024
@@ -124,16 +148,10 @@
          pbar.close()
          return

-     def complete_upload(self, name, id_token, upload_id, dataset_id, checksum):
-         url = self.url + "datasets/complete"
+     def complete_upload(self, id_token, upload_id):
          r = requests.post(
-             url,
-             json={"name": name, "checksum": checksum},
-             headers={
-                 "Authorization": "Bearer " + id_token,
-                 "Upload-Id": upload_id,
-                 "Dataset-Id": dataset_id,
-             },
+             self.url + "datasets/complete/" + upload_id,
+             headers={"Authorization": "Bearer " + id_token},
          )
          if r.status_code != 200:
              return None, r.json()["detail"]
@@ -200,63 +218,29 @@
              return None, r.json()["detail"]
          return r.json(), None

-     def ingest_large_dataset_parallel(
-         self,
-         path,
-         upload_id,
-         dataset_id,
-         id_token,
-         parts,
-         threads,
-     ):
-         # Create thread pool executor
-         max_workers = threads if threads > 0 else multiprocessing.cpu_count()
-         executor = ThreadPoolExecutor(max_workers=max_workers)
-
-         # Divide file into chunks and create tasks for each chunk
-         offset = 0
-         tasks = []
-         content_path = os.path.abspath(path)
-         content_size = os.stat(content_path).st_size
-         chunk_size = self.get_chunk_size(content_size)
-         total_chunks = content_size // chunk_size
-         while offset < content_size:
-             chunk_end = min(offset + chunk_size, content_size)
-             part = str(offset // chunk_size + 1)
-             if part not in parts:
-                 tasks.append((offset, chunk_end, part))
-             offset = chunk_end
-
-         # Define the function that will upload each chunk
-         def upload_chunk(start, end, part):
-             # print(f"Uploading chunk {start} - {end}", part)
-             with open(content_path, "rb") as f:
-                 f.seek(start)
-                 chunk = f.read(end - start)
-             checksum = hashlib.md5(chunk).hexdigest()
-             response = requests.post(
-                 self.url + "datasets/chunk",
-                 files={"file": chunk},
-                 headers={
-                     "Authorization": "Bearer " + id_token,
-                     "Upload-Id": upload_id,
-                     "Dataset-Id": dataset_id,
-                     "Checksum": checksum,
-                     "Part-Number": str(part),
-                 },
-             )
-             if response.status_code != 200:
-                 print(f"Failed to upload chunk {start} - {end}")
-             return response
+     def delete_file(self, dataset_id, file_name, id_token):
+         response = requests.delete(
+             self.url + "datasets/" + dataset_id + "/file/" + file_name,
+             headers={"Authorization": "Bearer " + id_token},
+         )
+         if response.status_code != 200:
+             return None, response.json()["detail"]
+         return response.json(), None

-         # Submit each task to the executor
-         with tqdm(total=total_chunks) as pbar:
-             futures = []
-             for task in tasks:
-                 future = executor.submit(upload_chunk, *task)
-                 future.add_done_callback(lambda p: pbar.update())
-                 futures.append(future)
+     def ingest_stac(self, stac_json, dataset, id_token):
+         reponse = requests.post(
+             self.url + "datasets/stac",
+             json={"dataset": dataset, "stac": stac_json},
+             headers={"Authorization": "Bearer " + id_token},
+         )
+         if reponse.status_code != 200:
+             return None, reponse.json()["detail"]
+         return reponse.json(), None

-         # Wait for all tasks to complete
-         for future in futures:
-             future.result()
+     def download_stac(self, dataset_id, id_token):
+         url = self.url + "datasets/" + dataset_id + "/download"
+         headers = {"Authorization": "Bearer " + id_token}
+         response = requests.get(url, headers=headers)
+         if response.status_code != 200:
+             return None, response.json()["detail"]
+         return gpd.GeoDataFrame.from_features(response.json()["features"]), None
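
Note: taken together, the reworked APIRepo methods move the chunked-upload protocol from custom headers to per-upload URLs (datasets/{id}/uploadId, datasets/chunk/{upload_id}, datasets/complete/{upload_id}). A hedged sketch of the flow they imply, with placeholder token, dataset id and file path (`calculate_checksum` comes from the new eotdl/src/utils.py, which is not shown in full here):

from eotdl.src.repos import APIRepo
from eotdl.src.utils import calculate_checksum

repo = APIRepo()
id_token = "<id token from eotdl auth login>"  # placeholder
dataset_id = "<existing dataset id>"           # placeholder, e.g. returned by create_dataset
file = "data/archive.tar"                      # placeholder

checksum = calculate_checksum(file)
upload_id, parts = repo.prepare_large_upload(file, dataset_id, checksum, id_token)
repo.ingest_large_dataset(file, upload_id, id_token, parts)  # uploads chunks, skipping parts already on the server
repo.complete_upload(id_token, upload_id)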
@@ -0,0 +1,79 @@
+ from pydantic import BaseModel
+ from ....src.utils import calculate_checksum
+ from ....curation.stac import STACDataFrame
+ from pathlib import Path
+ import os
+
+
+ class DownloadDataset:
+     def __init__(self, repo, retrieve_dataset, logger):
+         self.repo = repo
+         self.retrieve_dataset = retrieve_dataset
+         self.logger = logger if logger else print
+
+     class Inputs(BaseModel):
+         dataset: str
+         file: str = None
+         path: str = None
+         user: dict
+
+     class Outputs(BaseModel):
+         dst_path: str
+
+     def download(self, dataset, dataset_id, file, checksum, path, user):
+         self.logger(f"Downloading {file}")
+         dst_path = self.repo.download_file(
+             dataset, dataset_id, file, user["id_token"], path
+         )
+         if calculate_checksum(dst_path) != checksum:
+             self.logger(f"Checksum for {file} does not match")
+         self.logger(f"Done")
+         return dst_path
+
+     def __call__(self, inputs: Inputs) -> Outputs:
+         dataset = self.retrieve_dataset(inputs.dataset)
+         if inputs.path is None:
+             download_path = str(Path.home()) + "/.eotdl/datasets/" + inputs.dataset
+         else:
+             download_path = inputs.path + "/" + inputs.dataset
+         os.makedirs(download_path, exist_ok=True)
+         if dataset["quality"] == 0:
+             if inputs.file:
+                 files = [f for f in dataset["files"] if f["name"] == inputs.file]
+                 if not files:
+                     raise Exception(f"File {inputs.file} not found")
+                 if len(files) > 1:
+                     raise Exception(f"Multiple files with name {inputs.file} found")
+                 dst_path = self.download(
+                     inputs.dataset,
+                     dataset["id"],
+                     inputs.file,
+                     files[0]["checksum"],
+                     download_path,
+                     inputs.user,
+                 )
+                 return self.Outputs(dst_path=dst_path)
+             for file in dataset["files"]:
+                 dst_path = self.download(
+                     inputs.dataset,
+                     dataset["id"],
+                     file["name"],
+                     file["checksum"],
+                     download_path,
+                     inputs.user,
+                 )
+             return self.Outputs(dst_path="/".join(dst_path.split("/")[:-1]))
+         else:
+             gdf, error = self.repo.download_stac(
+                 dataset["id"],
+                 inputs.user["id_token"],
+             )
+             if error:
+                 raise Exception(error)
+             df = STACDataFrame(gdf)
+             # df.geometry = df.geometry.apply(lambda x: Polygon() if x is None else x)
+             path = inputs.path
+             if path is None:
+                 path = str(Path.home()) + "/.eotdl/datasets/" + dataset["name"]
+             df.to_stac(path)
+             return self.Outputs(dst_path=path)