eotdl 2023.6.27.tar.gz → 2023.7.19.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. {eotdl-2023.6.27 → eotdl-2023.7.19}/PKG-INFO +1 -1
  2. {eotdl-2023.6.27 → eotdl-2023.7.19}/eotdl/cli.py +0 -1
  3. {eotdl-2023.6.27 → eotdl-2023.7.19}/eotdl/commands/datasets.py +19 -9
  4. {eotdl-2023.6.27 → eotdl-2023.7.19}/eotdl/curation/stac/__init__.py +1 -1
  5. eotdl-2023.7.19/eotdl/curation/stac/dataframe.py +144 -0
  6. {eotdl-2023.6.27 → eotdl-2023.7.19}/eotdl/datasets/__init__.py +1 -1
  7. eotdl-2023.7.19/eotdl/datasets/ingest.py +52 -0
  8. eotdl-2023.7.19/eotdl/src/models/__init__.py +1 -0
  9. eotdl-2023.7.19/eotdl/src/models/metadata.py +16 -0
  10. {eotdl-2023.6.27 → eotdl-2023.7.19}/eotdl/src/repos/APIRepo.py +52 -72
  11. eotdl-2023.7.19/eotdl/src/usecases/datasets/DownloadDataset.py +79 -0
  12. {eotdl-2023.6.27 → eotdl-2023.7.19}/eotdl/src/usecases/datasets/IngestFile.py +13 -9
  13. eotdl-2023.7.19/eotdl/src/usecases/datasets/IngestFolder.py +98 -0
  14. eotdl-2023.7.19/eotdl/src/usecases/datasets/IngestSTAC.py +42 -0
  15. {eotdl-2023.6.27 → eotdl-2023.7.19}/eotdl/src/usecases/datasets/__init__.py +1 -0
  16. {eotdl-2023.6.27 → eotdl-2023.7.19}/pyproject.toml +1 -1
  17. {eotdl-2023.6.27 → eotdl-2023.7.19}/setup.py +2 -1
  18. eotdl-2023.6.27/eotdl/datasets/ingest.py +0 -61
  19. eotdl-2023.6.27/eotdl/src/usecases/datasets/DownloadDataset.py +0 -56
  20. eotdl-2023.6.27/eotdl/src/usecases/datasets/IngestFolder.py +0 -37
  21. {eotdl-2023.6.27 → eotdl-2023.7.19}/README.md +0 -0
  22. {eotdl-2023.6.27 → eotdl-2023.7.19}/eotdl/__init__.py +0 -0
  23. {eotdl-2023.6.27 → eotdl-2023.7.19}/eotdl/access/__init__.py +0 -0
  24. {eotdl-2023.6.27 → eotdl-2023.7.19}/eotdl/access/parameters.py +0 -0
  25. {eotdl-2023.6.27 → eotdl-2023.7.19}/eotdl/access/sentinelhub/__init__.py +0 -0
  26. {eotdl-2023.6.27 → eotdl-2023.7.19}/eotdl/access/sentinelhub/client.py +0 -0
  27. {eotdl-2023.6.27 → eotdl-2023.7.19}/eotdl/access/sentinelhub/utils.py +0 -0
  28. {eotdl-2023.6.27 → eotdl-2023.7.19}/eotdl/auth/__init__.py +0 -0
  29. {eotdl-2023.6.27 → eotdl-2023.7.19}/eotdl/auth/main.py +0 -0
  30. {eotdl-2023.6.27 → eotdl-2023.7.19}/eotdl/commands/__init__.py +0 -0
  31. {eotdl-2023.6.27 → eotdl-2023.7.19}/eotdl/commands/auth.py +0 -0
  32. {eotdl-2023.6.27 → eotdl-2023.7.19}/eotdl/curation/__init__.py +0 -0
  33. {eotdl-2023.6.27 → eotdl-2023.7.19}/eotdl/curation/formatters.py +0 -0
  34. {eotdl-2023.6.27 → eotdl-2023.7.19}/eotdl/curation/metadata.py +0 -0
  35. /eotdl-2023.6.27/eotdl/curation/stac/dataframe.py → /eotdl-2023.7.19/eotdl/curation/stac/dataframe_bck.py +0 -0
  36. {eotdl-2023.6.27 → eotdl-2023.7.19}/eotdl/curation/stac/extensions.py +0 -0
  37. {eotdl-2023.6.27 → eotdl-2023.7.19}/eotdl/curation/stac/parsers.py +0 -0
  38. {eotdl-2023.6.27 → eotdl-2023.7.19}/eotdl/curation/stac/stac.py +0 -0
  39. {eotdl-2023.6.27 → eotdl-2023.7.19}/eotdl/curation/stac/utils.py +0 -0
  40. {eotdl-2023.6.27 → eotdl-2023.7.19}/eotdl/datasets/download.py +0 -0
  41. {eotdl-2023.6.27 → eotdl-2023.7.19}/eotdl/datasets/retrieve.py +0 -0
  42. {eotdl-2023.6.27 → eotdl-2023.7.19}/eotdl/hello.py +0 -0
  43. {eotdl-2023.6.27 → eotdl-2023.7.19}/eotdl/src/__init__.py +0 -0
  44. {eotdl-2023.6.27 → eotdl-2023.7.19}/eotdl/src/errors/__init__.py +0 -0
  45. {eotdl-2023.6.27 → eotdl-2023.7.19}/eotdl/src/errors/auth.py +0 -0
  46. {eotdl-2023.6.27 → eotdl-2023.7.19}/eotdl/src/repos/AuthRepo.py +0 -0
  47. {eotdl-2023.6.27 → eotdl-2023.7.19}/eotdl/src/repos/__init__.py +0 -0
  48. {eotdl-2023.6.27 → eotdl-2023.7.19}/eotdl/src/usecases/__init__.py +0 -0
  49. {eotdl-2023.6.27 → eotdl-2023.7.19}/eotdl/src/usecases/auth/Auth.py +0 -0
  50. {eotdl-2023.6.27 → eotdl-2023.7.19}/eotdl/src/usecases/auth/IsLogged.py +0 -0
  51. {eotdl-2023.6.27 → eotdl-2023.7.19}/eotdl/src/usecases/auth/Logout.py +0 -0
  52. {eotdl-2023.6.27 → eotdl-2023.7.19}/eotdl/src/usecases/auth/__init__.py +0 -0
  53. {eotdl-2023.6.27 → eotdl-2023.7.19}/eotdl/src/usecases/datasets/DownloadFile.py +0 -0
  54. {eotdl-2023.6.27 → eotdl-2023.7.19}/eotdl/src/usecases/datasets/IngestDataset.py +0 -0
  55. {eotdl-2023.6.27 → eotdl-2023.7.19}/eotdl/src/usecases/datasets/IngestLargeDataset.py +0 -0
  56. {eotdl-2023.6.27 → eotdl-2023.7.19}/eotdl/src/usecases/datasets/IngestLargeDatasetParallel.py +0 -0
  57. {eotdl-2023.6.27 → eotdl-2023.7.19}/eotdl/src/usecases/datasets/RetrieveDataset.py +0 -0
  58. {eotdl-2023.6.27 → eotdl-2023.7.19}/eotdl/src/usecases/datasets/RetrieveDatasets.py +0 -0
  59. {eotdl-2023.6.27 → eotdl-2023.7.19}/eotdl/src/utils.py +0 -0
  60. {eotdl-2023.6.27 → eotdl-2023.7.19}/eotdl/tools/__init__.py +0 -0
  61. {eotdl-2023.6.27 → eotdl-2023.7.19}/eotdl/tools/sen12floods/__init__.py +0 -0
  62. {eotdl-2023.6.27 → eotdl-2023.7.19}/eotdl/tools/sen12floods/tools.py +0 -0
  63. {eotdl-2023.6.27 → eotdl-2023.7.19}/eotdl/tools/stac.py +0 -0
{eotdl-2023.6.27 → eotdl-2023.7.19}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: eotdl
- Version: 2023.6.27
+ Version: 2023.7.19
  Summary: Earth Observation Training Data Lab
  License: MIT
  Author: EarthPulse
{eotdl-2023.6.27 → eotdl-2023.7.19}/eotdl/cli.py
@@ -6,6 +6,5 @@ app = typer.Typer()
  app.add_typer(auth.app, name="auth")
  app.add_typer(datasets.app, name="datasets")

-
  if __name__ == "__main__":
      app()
{eotdl-2023.6.27 → eotdl-2023.7.19}/eotdl/commands/datasets.py
@@ -4,26 +4,32 @@ from pathlib import Path
  from ..datasets import (
      retrieve_datasets,
      download_dataset,
-     ingest_file,
      ingest_folder,
+     ingest_stac,
  )

  app = typer.Typer()


  @app.command()
- def ingest(path: Path, dataset: str):
+ def ingest(
+     path: Path,
+     f: bool = typer.Option(False, "--f", help="Force ingest even if file exists"),
+     d: bool = typer.Option(False, "--d", help="Delete files not in the dataset"),
+ ):
      """
-     Ingest a file
+     Ingest a dataset

-     path: Path to folder with data (limited to 10 files, not recursive!)
-     dataset: Name of the dataset
+     path: Path to folder with the dataset
      """
      try:
-         if path.is_dir():
-             ingest_folder(path, dataset, typer.echo)
+         if not path.is_dir():
+             typer.echo("Path must be a folder")
+             return
+         if "catalog.json" in [f.name for f in path.iterdir()]:
+             ingest_stac(str(path) + "/catalog.json", typer.echo)
          else:
-             ingest_file(path, dataset, typer.echo)
+             ingest_folder(path, f, d, typer.echo)
      except Exception as e:
          typer.echo(e)

@@ -38,7 +44,11 @@ def list():


  @app.command()
- def get(dataset: str, file: str = None, path: str = None):
+ def get(
+     dataset: str,
+     path: str = None,
+     file: str = None,
+ ):
      """
      Download a dataset

{eotdl-2023.6.27 → eotdl-2023.7.19}/eotdl/curation/stac/__init__.py
@@ -1,4 +1,4 @@
  # from .stac import STACGenerator
  # from .utils import format_time_acquired
  # from .parsers import STACIdParser, StructuredParser, UnestructuredParser
- # from .dataframe import STACDataFrame, read_stac
+ from .dataframe import STACDataFrame, read_stac
eotdl-2023.7.19/eotdl/curation/stac/dataframe.py
@@ -0,0 +1,144 @@
+ """
+ Module for the STAC dataframe
+ """
+ 
+ import pandas as pd
+ import geopandas as gpd
+ import pystac
+ import json
+ from geomet import wkt
+ from os.path import join
+ from os import makedirs
+ from typing import Union
+ from math import isnan
+ from .utils import convert_df_geom_to_shape, get_all_children
+ 
+ 
+ class STACDataFrame(gpd.GeoDataFrame):
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+ 
+     @classmethod
+     def from_stac_file(self, stac_file):
+         """
+         Create a STACDataFrame from a STAC file
+         """
+         return read_stac(stac_file)
+ 
+     def to_stac(self, path):
+         """
+         Create a STAC catalog and children from a STACDataFrame
+         """
+         df = self.copy()
+ 
+         if "id" in df.columns and "stac_id" in df.columns:
+             id_column = "stac_id"
+             stac_id_exists = True
+         else:
+             id_column = "id"
+             stac_id_exists = False
+ 
+         # First, create the catalog and its folder, if exists
+         catalog_df = df[df["type"] == "Catalog"]
+ 
+         if catalog_df.empty:
+             makedirs(path, exist_ok=True)
+         else:
+             for index, row in catalog_df.iterrows():
+                 root_output_folder = path + "/" + row[id_column]
+                 makedirs(root_output_folder, exist_ok=True)
+                 row_json = row.to_dict()
+ 
+                 # Curate the json row
+                 row_json = self.curate_json_row(row_json, stac_id_exists)
+ 
+                 with open(join(root_output_folder, f"catalog.json"), "w") as f:
+                     json.dump(row_json, f)
+ 
+         # Second, create the collections and their folders, if exist
+         collections = dict()
+         collections_df = df[df["type"] == "Collection"]
+         for index, row in collections_df.iterrows():
+             stac_output_folder = join(root_output_folder, row[id_column])
+             collections[row[id_column]] = stac_output_folder
+             makedirs(stac_output_folder, exist_ok=True)
+             row_json = row.to_dict()
+ 
+             # Curate the json row
+             row_json = self.curate_json_row(row_json, stac_id_exists)
+ 
+             with open(join(stac_output_folder, f"collection.json"), "w") as f:
+                 json.dump(row_json, f)
+ 
+         # Then, create the items and their folders, if exist
+         features_df = df[df["type"] == "Feature"]
+         for index, row in features_df.iterrows():
+             collection = row["collection"]
+             stac_output_folder = join(collections[collection], row[id_column])
+ 
+             # Convert the geometry from WKT back to geojson
+             row["geometry"] = row["geometry"].wkt
+             row["geometry"] = wkt.loads(row["geometry"])
+             makedirs(stac_output_folder, exist_ok=True)
+             row_json = row.to_dict()
+ 
+             # Curate the json row
+             row_json = self.curate_json_row(row_json, stac_id_exists)
+ 
+             with open(join(stac_output_folder, f'{row_json["id"]}.json'), "w") as f:
+                 json.dump(row_json, f)
+ 
+     def curate_json_row(self, row: dict, stac_id_exists: bool) -> dict:
+         """
+         Curate the json row of a STACDataFrame, in order to generate a valid STAC file
+ 
+         :param row: row of a STACDataFrame
+         :param stac_id_exists: if the stac_id column exists
+         """
+         keys_to_remove = list()
+ 
+         # Remove the created_at and modified_at columns, if the STACDataFrame comes from GeoDB
+         for i in "created_at", "modified_at":
+             if i in row.keys():
+                 keys_to_remove.append(i)
+ 
+         # Rename the stac_id column to id, to avoid conflicts with the id column
+         if stac_id_exists:
+             row["id"] = row["stac_id"]
+             del row["stac_id"]
+ 
+         # Remove the NaN values and empty strings
+         for k, v in row.items():
+             if (isinstance(v, float) and isnan(v)) or v == "":
+                 keys_to_remove.append(k)
+         for key in keys_to_remove:
+             del row[key]
+         del row["geometry"]
+ 
+         return row
+ 
+ 
+ def read_stac(
+     stac_file: Union[pystac.Catalog, pystac.Collection, str],
+     geometry_column: str = "geometry",
+ ) -> STACDataFrame:
+     """
+     Read a STAC file and return a STACDataFrame
+ 
+     :param stac_file: STAC file to read
+     :param geometry_column: name of the geometry column
+     """
+     if isinstance(stac_file, str):
+         stac_file = pystac.read_file(stac_file)
+     children = get_all_children(stac_file)
+ 
+     # Convert Dataframe to STACDataFrame
+     dataframe = pd.DataFrame(children)
+     dataframe[geometry_column] = dataframe.apply(convert_df_geom_to_shape, axis=1)
+     stac_dataframe = STACDataFrame(
+         dataframe,
+         crs="EPSG:4326",
+         geometry=gpd.GeoSeries.from_wkt(dataframe[geometry_column]),
+     )
+ 
+     return stac_dataframe
{eotdl-2023.6.27 → eotdl-2023.7.19}/eotdl/datasets/__init__.py
@@ -1,3 +1,3 @@
- from .ingest import ingest_file, ingest_folder
+ from .ingest import ingest_file, ingest_folder, ingest_q1, ingest_stac
  from .download import download_dataset
  from .retrieve import retrieve_datasets, retrieve_dataset, list_datasets
eotdl-2023.7.19/eotdl/datasets/ingest.py
@@ -0,0 +1,52 @@
+ import os
+ 
+ from ..src.repos import APIRepo
+ from ..src.usecases.datasets import IngestFile, IngestFolder, IngestSTAC
+ from ..auth import with_auth
+ 
+ 
+ allowed_extensions = [
+     ".zip",
+     ".tar",
+     ".tar.gz",
+     ".csv",
+     ".txt",
+     ".json",
+     ".pdf",
+     ".md",
+     ".yml",
+ ]
+ 
+ 
+ def ingest_q1(dataset, stac_catalog):
+     print("hola")
+     return
+ 
+ 
+ @with_auth
+ def ingest_file(
+     file, dataset_id, logger=None, allowed_extensions=allowed_extensions, user=None
+ ):
+     api_repo = APIRepo()
+     ingest = IngestFile(api_repo, allowed_extensions, logger)
+     inputs = ingest.Inputs(file=file, dataset_id=dataset_id, user=user)
+     outputs = ingest(inputs)
+     return outputs.data
+ 
+ 
+ @with_auth
+ def ingest_folder(folder, force, delete, logger=None, user=None):
+     api_repo = APIRepo()
+     ingest = IngestFolder(api_repo, ingest_file, allowed_extensions, logger)
+     inputs = ingest.Inputs(folder=folder, user=user, force=force, delete=delete)
+     outputs = ingest(inputs)
+     return outputs.dataset
+ 
+ 
+ @with_auth
+ def ingest_stac(stac_catalog, dataset, logger=None, user=None):
+     api_repo = APIRepo()
+     ingest = IngestSTAC(api_repo, ingest_file, allowed_extensions)
+     inputs = ingest.Inputs(stac_catalog=stac_catalog, dataset=dataset, user=user)
+     outputs = ingest(inputs)
+     return outputs.dataset
eotdl-2023.7.19/eotdl/src/models/__init__.py
@@ -0,0 +1 @@
+ from .metadata import Metadata
eotdl-2023.7.19/eotdl/src/models/metadata.py
@@ -0,0 +1,16 @@
+ from pydantic import BaseModel, validator
+ from typing import List
+ 
+ 
+ class Metadata(BaseModel):
+     authors: List[str]
+     license: str
+     source: str
+     name: str
+ 
+     # validate source is a URL
+     @validator("source")
+     def source_is_url(cls, v):
+         if not v.startswith("http") and not v.startswith("https"):
+             raise ValueError("source must be a URL")
+         return v
{eotdl-2023.6.27 → eotdl-2023.7.19}/eotdl/src/repos/APIRepo.py
@@ -6,11 +6,13 @@ from concurrent.futures import ThreadPoolExecutor
  import time
  import multiprocessing
  import hashlib
+ import geopandas as gpd
  
  
  class APIRepo:
      def __init__(self, url=os.getenv("EOTDL_API_URL", "https://api.eotdl.com/")):
          self.url = url
+         # print(self.url)
  
      def login(self):
          return requests.get(self.url + "auth/login")
@@ -22,6 +24,16 @@ class APIRepo:
          response = requests.get(self.url + "auth/logout")
          return response.json()["logout_url"]
  
+     def create_dataset(self, metadata, id_token):
+         response = requests.post(
+             self.url + "datasets",
+             json=metadata,
+             headers={"Authorization": "Bearer " + id_token},
+         )
+         if response.status_code == 200:
+             return response.json(), None
+         return None, response.json()["detail"]
+ 
      def retrieve_datasets(self):
          return requests.get(self.url + "datasets").json()
  
@@ -34,12 +46,7 @@
      def download_file(self, dataset, dataset_id, file, id_token, path):
          url = self.url + "datasets/" + dataset_id + "/download/" + file
          headers = {"Authorization": "Bearer " + id_token}
-         if path is None:
-             path = str(Path.home()) + "/.eotdl/datasets/" + dataset
-             os.makedirs(path, exist_ok=True)
          path = f"{path}/{file}"
-         # if os.path.exists(path):
-         #     raise Exception("File already exists")
          with requests.get(url, headers=headers, stream=True) as r:
              r.raise_for_status()
              total_size = int(r.headers.get("content-length", 0))
@@ -55,14 +62,21 @@
              progress_bar.close()
          return path
  
-     def ingest_file(self, file, dataset, id_token, checksum):
+     def ingest_file(self, file, dataset_id, id_token, checksum=None):
          reponse = requests.post(
-             self.url + "datasets",
+             self.url + "datasets/" + dataset_id,
              files={"file": open(file, "rb")},
-             data={
-                 "dataset": dataset,
-                 "checksum": checksum,
-             },
+             data={"checksum": checksum} if checksum else None,
+             headers={"Authorization": "Bearer " + id_token},
+         )
+         if reponse.status_code != 200:
+             return None, reponse.json()["detail"]
+         return reponse.json(), None
+ 
+     def ingest_file_url(self, file, dataset, id_token):
+         reponse = requests.post(
+             self.url + "datasets/url",
+             json={"dataset": dataset, "url": file},
              headers={"Authorization": "Bearer " + id_token},
          )
          if reponse.status_code != 200:
@@ -76,11 +90,11 @@
                  break
              yield data
  
-     def prepare_large_upload(self, file, dataset, checksum, id_token):
+     def prepare_large_upload(self, file, dataset_id, checksum, id_token):
          filename = Path(file).name
          response = requests.post(
-             self.url + "datasets/uploadId",
-             json={"name": filename, "checksum": checksum, "dataset": dataset},
+             self.url + f"datasets/{dataset_id}/uploadId",
+             json={"name": filename, "checksum": checksum},
              headers={"Authorization": "Bearer " + id_token},
          )
          if response.status_code != 200:
@@ -204,63 +218,29 @@
              return None, r.json()["detail"]
          return r.json(), None
  
-     def ingest_large_dataset_parallel(
-         self,
-         path,
-         upload_id,
-         dataset_id,
-         id_token,
-         parts,
-         threads,
-     ):
-         # Create thread pool executor
-         max_workers = threads if threads > 0 else multiprocessing.cpu_count()
-         executor = ThreadPoolExecutor(max_workers=max_workers)
- 
-         # Divide file into chunks and create tasks for each chunk
-         offset = 0
-         tasks = []
-         content_path = os.path.abspath(path)
-         content_size = os.stat(content_path).st_size
-         chunk_size = self.get_chunk_size(content_size)
-         total_chunks = content_size // chunk_size
-         while offset < content_size:
-             chunk_end = min(offset + chunk_size, content_size)
-             part = str(offset // chunk_size + 1)
-             if part not in parts:
-                 tasks.append((offset, chunk_end, part))
-             offset = chunk_end
- 
-         # Define the function that will upload each chunk
-         def upload_chunk(start, end, part):
-             # print(f"Uploading chunk {start} - {end}", part)
-             with open(content_path, "rb") as f:
-                 f.seek(start)
-                 chunk = f.read(end - start)
-             checksum = hashlib.md5(chunk).hexdigest()
-             response = requests.post(
-                 self.url + "datasets/chunk",
-                 files={"file": chunk},
-                 headers={
-                     "Authorization": "Bearer " + id_token,
-                     "Upload-Id": upload_id,
-                     "Dataset-Id": dataset_id,
-                     "Checksum": checksum,
-                     "Part-Number": str(part),
-                 },
-             )
-             if response.status_code != 200:
-                 print(f"Failed to upload chunk {start} - {end}")
-             return response
+     def delete_file(self, dataset_id, file_name, id_token):
+         response = requests.delete(
+             self.url + "datasets/" + dataset_id + "/file/" + file_name,
+             headers={"Authorization": "Bearer " + id_token},
+         )
+         if response.status_code != 200:
+             return None, response.json()["detail"]
+         return response.json(), None
  
-         # Submit each task to the executor
-         with tqdm(total=total_chunks) as pbar:
-             futures = []
-             for task in tasks:
-                 future = executor.submit(upload_chunk, *task)
-                 future.add_done_callback(lambda p: pbar.update())
-                 futures.append(future)
+     def ingest_stac(self, stac_json, dataset, id_token):
+         reponse = requests.post(
+             self.url + "datasets/stac",
+             json={"dataset": dataset, "stac": stac_json},
+             headers={"Authorization": "Bearer " + id_token},
+         )
+         if reponse.status_code != 200:
+             return None, reponse.json()["detail"]
+         return reponse.json(), None
  
-         # Wait for all tasks to complete
-         for future in futures:
-             future.result()
+     def download_stac(self, dataset_id, id_token):
+         url = self.url + "datasets/" + dataset_id + "/download"
+         headers = {"Authorization": "Bearer " + id_token}
+         response = requests.get(url, headers=headers)
+         if response.status_code != 200:
+             return None, response.json()["detail"]
+         return gpd.GeoDataFrame.from_features(response.json()["features"]), None
eotdl-2023.7.19/eotdl/src/usecases/datasets/DownloadDataset.py
@@ -0,0 +1,79 @@
+ from pydantic import BaseModel
+ from ....src.utils import calculate_checksum
+ from ....curation.stac import STACDataFrame
+ from pathlib import Path
+ import os
+ 
+ 
+ class DownloadDataset:
+     def __init__(self, repo, retrieve_dataset, logger):
+         self.repo = repo
+         self.retrieve_dataset = retrieve_dataset
+         self.logger = logger if logger else print
+ 
+     class Inputs(BaseModel):
+         dataset: str
+         file: str = None
+         path: str = None
+         user: dict
+ 
+     class Outputs(BaseModel):
+         dst_path: str
+ 
+     def download(self, dataset, dataset_id, file, checksum, path, user):
+         self.logger(f"Downloading {file}")
+         dst_path = self.repo.download_file(
+             dataset, dataset_id, file, user["id_token"], path
+         )
+         if calculate_checksum(dst_path) != checksum:
+             self.logger(f"Checksum for {file} does not match")
+         self.logger(f"Done")
+         return dst_path
+ 
+     def __call__(self, inputs: Inputs) -> Outputs:
+         dataset = self.retrieve_dataset(inputs.dataset)
+         if inputs.path is None:
+             download_path = str(Path.home()) + "/.eotdl/datasets/" + inputs.dataset
+         else:
+             download_path = inputs.path + "/" + inputs.dataset
+         os.makedirs(download_path, exist_ok=True)
+         if dataset["quality"] == 0:
+             if inputs.file:
+                 files = [f for f in dataset["files"] if f["name"] == inputs.file]
+                 if not files:
+                     raise Exception(f"File {inputs.file} not found")
+                 if len(files) > 1:
+                     raise Exception(f"Multiple files with name {inputs.file} found")
+                 dst_path = self.download(
+                     inputs.dataset,
+                     dataset["id"],
+                     inputs.file,
+                     files[0]["checksum"],
+                     download_path,
+                     inputs.user,
+                 )
+                 return self.Outputs(dst_path=dst_path)
+             for file in dataset["files"]:
+                 dst_path = self.download(
+                     inputs.dataset,
+                     dataset["id"],
+                     file["name"],
+                     file["checksum"],
+                     download_path,
+                     inputs.user,
+                 )
+             return self.Outputs(dst_path="/".join(dst_path.split("/")[:-1]))
+         else:
+             gdf, error = self.repo.download_stac(
+                 dataset["id"],
+                 inputs.user["id_token"],
+             )
+             if error:
+                 raise Exception(error)
+             df = STACDataFrame(gdf)
+             # df.geometry = df.geometry.apply(lambda x: Polygon() if x is None else x)
+             path = inputs.path
+             if path is None:
+                 path = str(Path.home()) + "/.eotdl/datasets/" + dataset["name"]
+             df.to_stac(path)
+             return self.Outputs(dst_path=path)
{eotdl-2023.6.27 → eotdl-2023.7.19}/eotdl/src/usecases/datasets/IngestFile.py
@@ -13,11 +13,11 @@ class IngestFile:
  
      class Inputs(BaseModel):
          file: typing.Any
-         dataset: str = None
+         dataset_id: str
          user: dict
  
      class Outputs(BaseModel):
-         dataset: dict
+         data: dict
  
      def __call__(self, inputs: Inputs) -> Outputs:
          # validate file extension
@@ -26,31 +26,35 @@ class IngestFile:
              raise Exception(
                  f"Only {', '.join(self.allowed_extensions)} files are allowed"
              )
+         id_token = inputs.user["id_token"]
          self.logger(f"Uploading file {inputs.file}...")
+         # if inputs.file.startswith("http://") or inputs.file.startswith("https://"):
+         #     data, error = self.repo.ingest_file_url(
+         #         inputs.file, inputs.metadata.name, id_token
+         #     )
+         # else:
          self.logger("Computing checksum...")
          checksum = calculate_checksum(inputs.file)
          self.logger(checksum)
-         self.logger("Ingesting dataset...")
-         id_token = inputs.user["id_token"]
+         self.logger("Ingesting file...")
          filesize = os.path.getsize(inputs.file)
          # ingest small file
         if filesize < 1024 * 1024 * 16: # 16 MB
              data, error = self.repo.ingest_file(
-                 inputs.file, inputs.dataset, id_token, checksum
+                 inputs.file, inputs.dataset_id, id_token, checksum
              )
              if error:
                  raise Exception(error)
              self.logger("Done")
-             return self.Outputs(dataset=data)
+             return self.Outputs(data=data)
          # ingest large file
          upload_id, parts = self.repo.prepare_large_upload(
-             inputs.file, inputs.dataset, checksum, id_token
+             inputs.file, inputs.dataset_id, checksum, id_token
          )
-         print(upload_id, parts)
          self.repo.ingest_large_dataset(inputs.file, upload_id, id_token, parts)
          self.logger("\nCompleting upload...")
          data, error = self.repo.complete_upload(id_token, upload_id)
          if error:
              raise Exception(error)
          self.logger("Done")
-         return self.Outputs(dataset=data)
+         return self.Outputs(data=data)
eotdl-2023.7.19/eotdl/src/usecases/datasets/IngestFolder.py
@@ -0,0 +1,98 @@
+ from pydantic import BaseModel
+ import os
+ from pathlib import Path
+ import yaml
+ from ...models import Metadata
+ 
+ 
+ class IngestFolder:
+     def __init__(self, repo, ingest_file, allowed_extensions, logger):
+         self.repo = repo
+         self.ingest_file = ingest_file
+         self.allowed_extensions = allowed_extensions
+         self.logger = logger if logger else print
+ 
+     class Inputs(BaseModel):
+         folder: Path
+         user: dict
+         force: bool = False
+         delete: bool = False
+ 
+     class Outputs(BaseModel):
+         dataset: dict
+ 
+     def __call__(self, inputs: Inputs) -> Outputs:
+         # validate folder
+         self.logger("Uploading directory (only files, not recursive)")
+         items = list(inputs.folder.glob("*"))
+         filtered_items = [item for item in items if item.is_file()]
+         filtered_items = [
+             item for item in filtered_items if item.suffix in self.allowed_extensions
+         ]
+         if len(filtered_items) == 0:
+             raise Exception("No files found in directory")
+         if len(filtered_items) > 10:
+             raise Exception("Too many files in directory, limited to 10")
+         if "metadata.yml" not in [item.name for item in filtered_items]:
+             raise Exception("metadata.yml not found in directory")
+         # load metadata
+         metadata = yaml.safe_load(
+             open(inputs.folder.joinpath("metadata.yml"), "r").read()
+         )
+         metadata = Metadata(**metadata)
+         # remove metadata.yml from files
+         filtered_items = [
+             item for item in filtered_items if item.name != "metadata.yml"
+         ]
+         # create dataset
+         data, error = self.repo.create_dataset(metadata.dict(), inputs.user["id_token"])
+         # dataset may already exists, but if user is owner continue ingesting files
+         current_files = []
+         if error:
+             data, error2 = self.repo.retrieve_dataset(metadata.name)
+             if error2:
+                 raise Exception(error)
+             if data["uid"] != inputs.user["sub"]:
+                 raise Exception("Dataset already exists.")
+             data["dataset_id"] = data["id"]
+             current_files = [item["name"] for item in data["files"]]
+             if len(current_files) > 0 and not inputs.force:
+                 self.logger(
+                     "The following files already exist and will not be uploaded (use --f to force re-upload):"
+                 )
+                 for item in current_files:
+                     self.logger(f"{item}")
+                 # TODO: delete current_files that are not in filtered_items if --delete
+                 hanged_files = [
+                     file
+                     for file in current_files
+                     if file not in [item.name for item in filtered_items]
+                 ]
+                 if len(hanged_files) > 0:
+                     self.logger(
+                         "The following files are no longer in your dataset (use --d to delete):"
+                     )
+                     for item in hanged_files:
+                         self.logger(f"{item}")
+                         if inputs.delete:
+                             self.logger(f"Deleting file {item}...")
+                             _, error = self.repo.delete_file(
+                                 data["dataset_id"], item, inputs.user["id_token"]
+                             )
+                             if error:
+                                 self.logger(error)
+                             else:
+                                 self.logger("Done")
+                 filtered_items = [
+                     item for item in filtered_items if item.name not in current_files
+                 ]
+         dataset_id = data["dataset_id"]
+         # upload files
+         if len(filtered_items) == 0:
+             raise Exception("No files to upload")
+         self.logger("The following files will be uploaded:")
+         for item in filtered_items:
+             self.logger(f"{item.name}")
+         for item in filtered_items:
+             data = self.ingest_file(item, dataset_id, logger=self.logger)
+         return self.Outputs(dataset=data)
eotdl-2023.7.19/eotdl/src/usecases/datasets/IngestSTAC.py
@@ -0,0 +1,42 @@
+ from pydantic import BaseModel
+ from ....curation.stac import STACDataFrame
+ import json
+ 
+ 
+ class IngestSTAC:
+     def __init__(self, repo, ingest_file, allowed_extensions):
+         self.repo = repo
+         self.ingest_file = ingest_file
+         self.allowed_extensions = allowed_extensions
+ 
+     class Inputs(BaseModel):
+         stac_catalog: str
+         dataset: str
+         user: dict
+ 
+     class Outputs(BaseModel):
+         dataset: dict
+ 
+     def __call__(self, inputs: Inputs) -> Outputs:
+         # load the STAC catalog as a STACsetFrame
+         df = STACDataFrame.from_stac_file(inputs.stac_catalog)
+         # upload all assets to EOTDL
+         for row in df.dropna(subset=["assets"]).iterrows():
+             # for asset in df.assets.dropna().values[:10]:
+             try:
+                 for k, v in row[1]["assets"].items():
+                     data = self.ingest_file(
+                         v["href"],
+                         inputs.dataset,
+                         allowed_extensions=self.allowed_extensions + [".tif", ".tiff"],
+                     )
+                     file_url = f"{self.repo.url}datasets/{data['dataset_id']}/download/{data['file_name']}"
+                     df.loc[row[0], "assets"][k]["href"] = file_url
+             except Exception as e:
+                 break
+         data, error = self.repo.ingest_stac(
+             json.loads(df.to_json()), inputs.dataset, inputs.user["id_token"]
+         )
+         if error:
+             raise Exception(error)
+         return self.Outputs(dataset=data)
{eotdl-2023.6.27 → eotdl-2023.7.19}/eotdl/src/usecases/datasets/__init__.py
@@ -5,3 +5,4 @@ from .RetrieveDataset import RetrieveDataset
  from .RetrieveDatasets import RetrieveDatasets
  from .IngestFile import IngestFile
  from .IngestFolder import IngestFolder
+ from .IngestSTAC import IngestSTAC
{eotdl-2023.6.27 → eotdl-2023.7.19}/pyproject.toml
@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "eotdl"
- version = "2023.06.27"
+ version = "2023.07.19"
  description = "Earth Observation Training Data Lab"
  authors = ["EarthPulse <it@earthpulse.es>"]
  license = "MIT"
{eotdl-2023.6.27 → eotdl-2023.7.19}/setup.py
@@ -12,6 +12,7 @@ packages = \
  'eotdl.datasets',
  'eotdl.src',
  'eotdl.src.errors',
+ 'eotdl.src.models',
  'eotdl.src.repos',
  'eotdl.src.usecases',
  'eotdl.src.usecases.auth',
@@ -34,7 +35,7 @@ entry_points = \
  
  setup_kwargs = {
      'name': 'eotdl',
-     'version': '2023.6.27',
+     'version': '2023.7.19',
      'description': 'Earth Observation Training Data Lab',
      'long_description': '# eotdl \n\nThis is the main library and CLI for EOTDL.\n\n',
      'author': 'EarthPulse',
eotdl-2023.6.27/eotdl/datasets/ingest.py
@@ -1,61 +0,0 @@
- from ..src.repos import APIRepo
- from ..src.usecases.datasets import IngestFile, IngestFolder
- from ..auth import with_auth
- 
- allowed_extensions = [
-     ".zip",
-     ".tar",
-     ".tar.gz",
-     ".csv",
-     ".txt",
-     ".json",
-     ".pdf",
-     ".md",
- ]
- 
- 
- @with_auth
- def ingest_file(file, dataset, logger=None, user=None):
-     api_repo = APIRepo()
-     ingest = IngestFile(api_repo, allowed_extensions, logger)
-     inputs = ingest.Inputs(file=file, dataset=dataset, user=user)
-     outputs = ingest(inputs)
-     return outputs.dataset
- 
- 
- @with_auth
- def ingest_folder(folder, dataset, logger=None, user=None):
-     api_repo = APIRepo()
-     ingest = IngestFolder(api_repo, ingest_file, allowed_extensions, logger)
-     inputs = ingest.Inputs(folder=folder, dataset=dataset, user=user)
-     outputs = ingest(inputs)
-     return outputs.dataset
- 
- 
- # @with_auth
- # def ingest_dataset(name, description, path, logger=None, user=None):
- #     api_repo = APIRepo()
- #     ingest = IngestDataset(
- #         api_repo,
- #     )
- #     inputs = ingest.Inputs(name=name, description=description, path=path, user=user)
- #     outputs = ingest(inputs)
- #     return outputs.dataset
- 
- 
- # @with_auth
- # def ingest_large_dataset(name, path, logger=None, user=None):
- #     api_repo = APIRepo()
- #     ingest = IngestLargeDataset(api_repo, logger)
- #     inputs = ingest.Inputs(name=name, path=path, user=user)
- #     outputs = ingest(inputs)
- #     return outputs.dataset
- 
- 
- # def ingest_q0(dataset, path):
- #     return ingest_large_dataset(dataset, path)
- 
- 
- # def ingest_q1(dataset, stac_catalog):
- #     print("holas")
- #     return
eotdl-2023.6.27/eotdl/src/usecases/datasets/DownloadDataset.py
@@ -1,56 +0,0 @@
- from pydantic import BaseModel
- from ....src.utils import calculate_checksum
- 
- 
- class DownloadDataset:
-     def __init__(self, repo, retrieve_dataset, logger):
-         self.repo = repo
-         self.retrieve_dataset = retrieve_dataset
-         self.logger = logger if logger else print
- 
-     class Inputs(BaseModel):
-         dataset: str
-         file: str = None
-         path: str = None
-         user: dict
- 
-     class Outputs(BaseModel):
-         dst_path: str
- 
-     def download(self, dataset, dataset_id, file, checksum, path, user):
-         self.logger(f"Downloading {file}")
-         dst_path = self.repo.download_file(
-             dataset, dataset_id, file, user["id_token"], path
-         )
-         if calculate_checksum(dst_path) != checksum:
-             self.logger(f"Checksum for {file} does not match")
-         self.logger(f"Done")
-         return dst_path
- 
-     def __call__(self, inputs: Inputs) -> Outputs:
-         dataset = self.retrieve_dataset(inputs.dataset)
-         if inputs.file:
-             files = [f for f in dataset["files"] if f["name"] == inputs.file]
-             if not files:
-                 raise Exception(f"File {inputs.file} not found")
-             if len(files) > 1:
-                 raise Exception(f"Multiple files with name {inputs.file} found")
-             dst_path = self.download(
-                 inputs.dataset,
-                 dataset["id"],
-                 inputs.file,
-                 files[0]["checksum"],
-                 inputs.path,
-                 inputs.user,
-             )
-             return self.Outputs(dst_path=dst_path)
-         for file in dataset["files"]:
-             dst_path = self.download(
-                 inputs.dataset,
-                 dataset["id"],
-                 file["name"],
-                 file["checksum"],
-                 inputs.path,
-                 inputs.user,
-             )
-         return self.Outputs(dst_path="/".join(dst_path.split("/")[:-1]))
eotdl-2023.6.27/eotdl/src/usecases/datasets/IngestFolder.py
@@ -1,37 +0,0 @@
- from pydantic import BaseModel
- import os
- from pathlib import Path
- 
- 
- class IngestFolder:
-     def __init__(self, repo, ingest_file, allowed_extensions, logger):
-         self.repo = repo
-         self.ingest_file = ingest_file
-         self.allowed_extensions = allowed_extensions
-         self.logger = logger if logger else print
- 
-     class Inputs(BaseModel):
-         folder: Path
-         dataset: str = None
-         user: dict
- 
-     class Outputs(BaseModel):
-         dataset: dict
- 
-     def __call__(self, inputs: Inputs) -> Outputs:
-         self.logger("Uploading directory (only files, not recursive)")
-         items = list(inputs.folder.glob("*"))
-         filtered_items = [item for item in items if item.is_file()]
-         filtered_items = [
-             item for item in filtered_items if item.suffix in self.allowed_extensions
-         ]
-         if len(filtered_items) == 0:
-             raise Exception("No files found in directory")
-         if len(filtered_items) > 10:
-             raise Exception("Too many files in directory, limited to 10")
-         self.logger("The following files will be uploaded:")
-         for item in filtered_items:
-             self.logger(f"{item.name}")
-         for item in filtered_items:
-             data = self.ingest_file(item, inputs.dataset, logger=self.logger)
-         return self.Outputs(dataset=data)