eotdl 2023.6.14.post10__tar.gz → 2023.7.19__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/PKG-INFO +1 -1
- {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/cli.py +0 -1
- eotdl-2023.7.19/eotdl/commands/datasets.py +67 -0
- {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/curation/stac/__init__.py +1 -1
- eotdl-2023.7.19/eotdl/curation/stac/dataframe.py +144 -0
- eotdl-2023.7.19/eotdl/datasets/__init__.py +3 -0
- eotdl-2023.7.19/eotdl/datasets/download.py +13 -0
- eotdl-2023.7.19/eotdl/datasets/ingest.py +52 -0
- eotdl-2023.7.19/eotdl/src/models/__init__.py +1 -0
- eotdl-2023.7.19/eotdl/src/models/metadata.py +16 -0
- {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/src/repos/APIRepo.py +82 -98
- eotdl-2023.7.19/eotdl/src/usecases/datasets/DownloadDataset.py +79 -0
- eotdl-2023.6.14.post10/eotdl/src/usecases/datasets/DownloadDataset.py → eotdl-2023.7.19/eotdl/src/usecases/datasets/DownloadFile.py +8 -5
- eotdl-2023.7.19/eotdl/src/usecases/datasets/IngestFile.py +60 -0
- eotdl-2023.7.19/eotdl/src/usecases/datasets/IngestFolder.py +98 -0
- eotdl-2023.7.19/eotdl/src/usecases/datasets/IngestSTAC.py +42 -0
- {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/src/usecases/datasets/RetrieveDatasets.py +5 -4
- {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/src/usecases/datasets/__init__.py +3 -1
- eotdl-2023.7.19/eotdl/src/utils.py +17 -0
- {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/pyproject.toml +2 -1
- {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/setup.py +2 -1
- eotdl-2023.6.14.post10/eotdl/commands/datasets.py +0 -78
- eotdl-2023.6.14.post10/eotdl/datasets/__init__.py +0 -4
- eotdl-2023.6.14.post10/eotdl/datasets/download.py +0 -18
- eotdl-2023.6.14.post10/eotdl/datasets/ingest.py +0 -32
- eotdl-2023.6.14.post10/eotdl/datasets/update.py +0 -12
- eotdl-2023.6.14.post10/eotdl/src/usecases/datasets/UpdateDataset.py +0 -32
- eotdl-2023.6.14.post10/eotdl/src/utils.py +0 -17
- {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/README.md +0 -0
- {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/__init__.py +0 -0
- {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/access/__init__.py +0 -0
- {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/access/parameters.py +0 -0
- {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/access/sentinelhub/__init__.py +0 -0
- {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/access/sentinelhub/client.py +0 -0
- {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/access/sentinelhub/utils.py +0 -0
- {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/auth/__init__.py +0 -0
- {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/auth/main.py +0 -0
- {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/commands/__init__.py +0 -0
- {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/commands/auth.py +0 -0
- {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/curation/__init__.py +0 -0
- {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/curation/formatters.py +0 -0
- {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/curation/metadata.py +0 -0
- eotdl-2023.6.14.post10/eotdl/curation/stac/dataframe.py → eotdl-2023.7.19/eotdl/curation/stac/dataframe_bck.py +0 -0
- {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/curation/stac/extensions.py +0 -0
- {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/curation/stac/parsers.py +0 -0
- {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/curation/stac/stac.py +0 -0
- {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/curation/stac/utils.py +0 -0
- {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/datasets/retrieve.py +0 -0
- {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/hello.py +0 -0
- {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/src/__init__.py +0 -0
- {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/src/errors/__init__.py +0 -0
- {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/src/errors/auth.py +0 -0
- {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/src/repos/AuthRepo.py +0 -0
- {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/src/repos/__init__.py +0 -0
- {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/src/usecases/__init__.py +0 -0
- {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/src/usecases/auth/Auth.py +0 -0
- {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/src/usecases/auth/IsLogged.py +0 -0
- {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/src/usecases/auth/Logout.py +0 -0
- {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/src/usecases/auth/__init__.py +0 -0
- {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/src/usecases/datasets/IngestDataset.py +0 -0
- {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/src/usecases/datasets/IngestLargeDataset.py +0 -0
- {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/src/usecases/datasets/IngestLargeDatasetParallel.py +0 -0
- {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/src/usecases/datasets/RetrieveDataset.py +0 -0
- {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/tools/__init__.py +0 -0
- {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/tools/sen12floods/__init__.py +0 -0
- {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/tools/sen12floods/tools.py +0 -0
- {eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/tools/stac.py +0 -0

eotdl-2023.7.19/eotdl/commands/datasets.py (new file)

```diff
@@ -0,0 +1,67 @@
+import typer
+from pathlib import Path
+
+from ..datasets import (
+    retrieve_datasets,
+    download_dataset,
+    ingest_folder,
+    ingest_stac,
+)
+
+app = typer.Typer()
+
+
+@app.command()
+def ingest(
+    path: Path,
+    f: bool = typer.Option(False, "--f", help="Force ingest even if file exists"),
+    d: bool = typer.Option(False, "--d", help="Delete files not in the dataset"),
+):
+    """
+    Ingest a dataset
+
+    path: Path to folder with the dataset
+    """
+    try:
+        if not path.is_dir():
+            typer.echo("Path must be a folder")
+            return
+        if "catalog.json" in [f.name for f in path.iterdir()]:
+            ingest_stac(str(path) + "/catalog.json", typer.echo)
+        else:
+            ingest_folder(path, f, d, typer.echo)
+    except Exception as e:
+        typer.echo(e)
+
+
+@app.command()
+def list():
+    """
+    List all datasets and files
+    """
+    datasets = retrieve_datasets()
+    typer.echo(datasets)
+
+
+@app.command()
+def get(
+    dataset: str,
+    path: str = None,
+    file: str = None,
+):
+    """
+    Download a dataset
+
+    dataset: Name of the dataset
+    file: Name of the file to download (optional, if not provided, the whole dataset will be downloaded)
+    path: Path to download the dataset to (optional, if not provided, the dataset will be downloaded to ~/.eotdl/datasets)
+    """
+    try:
+        dst_path = download_dataset(dataset, file, path, typer.echo)
+        typer.echo(f"Data available at {dst_path}")
+    except Exception as e:
+        typer.echo(e)
+
+
+if __name__ == "__main__":
+    app()
```
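The new `datasets` sub-app exposes three Typer commands (`ingest`, `list`, `get`). A minimal sketch of exercising it in-process with Typer's test runner, assuming the package is installed and you have already authenticated via the `auth` commands; the dataset name is a placeholder:

```python
from typer.testing import CliRunner

from eotdl.commands.datasets import app  # the Typer sub-app added in this release

runner = CliRunner()

# List all datasets and files
print(runner.invoke(app, ["list"]).output)

# Download a whole dataset (name is a placeholder) to ~/.eotdl/datasets
print(runner.invoke(app, ["get", "MyDataset"]).output)
```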
eotdl-2023.7.19/eotdl/curation/stac/dataframe.py (new file)

```diff
@@ -0,0 +1,144 @@
+"""
+Module for the STAC dataframe
+"""
+
+import pandas as pd
+import geopandas as gpd
+import pystac
+import json
+from geomet import wkt
+from os.path import join
+from os import makedirs
+from typing import Union
+from math import isnan
+from .utils import convert_df_geom_to_shape, get_all_children
+
+
+class STACDataFrame(gpd.GeoDataFrame):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    @classmethod
+    def from_stac_file(self, stac_file):
+        """
+        Create a STACDataFrame from a STAC file
+        """
+        return read_stac(stac_file)
+
+    def to_stac(self, path):
+        """
+        Create a STAC catalog and children from a STACDataFrame
+        """
+        df = self.copy()
+
+        if "id" in df.columns and "stac_id" in df.columns:
+            id_column = "stac_id"
+            stac_id_exists = True
+        else:
+            id_column = "id"
+            stac_id_exists = False
+
+        # First, create the catalog and its folder, if exists
+        catalog_df = df[df["type"] == "Catalog"]
+
+        if catalog_df.empty:
+            makedirs(path, exist_ok=True)
+        else:
+            for index, row in catalog_df.iterrows():
+                root_output_folder = path + "/" + row[id_column]
+                makedirs(root_output_folder, exist_ok=True)
+                row_json = row.to_dict()
+
+                # Curate the json row
+                row_json = self.curate_json_row(row_json, stac_id_exists)
+
+                with open(join(root_output_folder, f"catalog.json"), "w") as f:
+                    json.dump(row_json, f)
+
+        # Second, create the collections and their folders, if exist
+        collections = dict()
+        collections_df = df[df["type"] == "Collection"]
+        for index, row in collections_df.iterrows():
+            stac_output_folder = join(root_output_folder, row[id_column])
+            collections[row[id_column]] = stac_output_folder
+            makedirs(stac_output_folder, exist_ok=True)
+            row_json = row.to_dict()
+
+            # Curate the json row
+            row_json = self.curate_json_row(row_json, stac_id_exists)
+
+            with open(join(stac_output_folder, f"collection.json"), "w") as f:
+                json.dump(row_json, f)
+
+        # Then, create the items and their folders, if exist
+        features_df = df[df["type"] == "Feature"]
+        for index, row in features_df.iterrows():
+            collection = row["collection"]
+            stac_output_folder = join(collections[collection], row[id_column])
+
+            # Convert the geometry from WKT back to geojson
+            row["geometry"] = row["geometry"].wkt
+            row["geometry"] = wkt.loads(row["geometry"])
+            makedirs(stac_output_folder, exist_ok=True)
+            row_json = row.to_dict()
+
+            # Curate the json row
+            row_json = self.curate_json_row(row_json, stac_id_exists)
+
+            with open(join(stac_output_folder, f'{row_json["id"]}.json'), "w") as f:
+                json.dump(row_json, f)
+
+    def curate_json_row(self, row: dict, stac_id_exists: bool) -> dict:
+        """
+        Curate the json row of a STACDataFrame, in order to generate a valid STAC file
+
+        :param row: row of a STACDataFrame
+        :param stac_id_exists: if the stac_id column exists
+        """
+        keys_to_remove = list()
+
+        # Remove the created_at and modified_at columns, if the STACDataFrame comes from GeoDB
+        for i in "created_at", "modified_at":
+            if i in row.keys():
+                keys_to_remove.append(i)
+
+        # Rename the stac_id column to id, to avoid conflicts with the id column
+        if stac_id_exists:
+            row["id"] = row["stac_id"]
+            del row["stac_id"]
+
+        # Remove the NaN values and empty strings
+        for k, v in row.items():
+            if (isinstance(v, float) and isnan(v)) or v == "":
+                keys_to_remove.append(k)
+        for key in keys_to_remove:
+            del row[key]
+        del row["geometry"]
+
+        return row
+
+
+def read_stac(
+    stac_file: Union[pystac.Catalog, pystac.Collection, str],
+    geometry_column: str = "geometry",
+) -> STACDataFrame:
+    """
+    Read a STAC file and return a STACDataFrame
+
+    :param stac_file: STAC file to read
+    :param geometry_column: name of the geometry column
+    """
+    if isinstance(stac_file, str):
+        stac_file = pystac.read_file(stac_file)
+    children = get_all_children(stac_file)
+
+    # Convert Dataframe to STACDataFrame
+    dataframe = pd.DataFrame(children)
+    dataframe[geometry_column] = dataframe.apply(convert_df_geom_to_shape, axis=1)
+    stac_dataframe = STACDataFrame(
+        dataframe,
+        crs="EPSG:4326",
+        geometry=gpd.GeoSeries.from_wkt(dataframe[geometry_column]),
+    )
+
+    return stac_dataframe
```
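`STACDataFrame` round-trips a STAC catalog through a `GeoDataFrame`: `read_stac`/`from_stac_file` flattens the catalog and its children into rows, and `to_stac` writes them back out as catalog, collection and item JSON files. A minimal sketch, assuming a local pystac catalog and that the class is re-exported from `eotdl.curation.stac` (paths are placeholders):

```python
from eotdl.curation.stac import STACDataFrame

# Read a STAC catalog (and all of its children) into a GeoDataFrame-like table
df = STACDataFrame.from_stac_file("data/catalog.json")
print(df[["type", "id"]].head())

# Write the catalog, collections and items back out as STAC JSON files
df.to_stac("output/stac")
```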
eotdl-2023.7.19/eotdl/datasets/download.py (new file)

```diff
@@ -0,0 +1,13 @@
+from ..src.repos import APIRepo
+from ..src.usecases.datasets import DownloadDataset, DownloadFile
+from .retrieve import retrieve_dataset
+from ..auth import with_auth
+
+
+@with_auth
+def download_dataset(dataset, file, path=None, logger=None, user=None):
+    api_repo = APIRepo()
+    download = DownloadDataset(api_repo, retrieve_dataset, logger)
+    inputs = download.Inputs(dataset=dataset, file=file, path=path, user=user)
+    outputs = download(inputs)
+    return outputs.dst_path
```
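`download_dataset` is the new public entry point wiring `APIRepo`, `retrieve_dataset` and the `DownloadDataset` use case together; the `@with_auth` decorator injects the `user` argument. A minimal sketch, assuming you are already logged in; dataset and file names are placeholders:

```python
from eotdl.datasets import download_dataset

# Download every file of a dataset to the default location (~/.eotdl/datasets/<name>)
dst = download_dataset("MyDataset", file=None, path=None, logger=print)
print("Data available at", dst)

# Or fetch a single file into a custom folder
dst = download_dataset("MyDataset", file="data.zip", path="/tmp")
```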
eotdl-2023.7.19/eotdl/datasets/ingest.py (new file)

```diff
@@ -0,0 +1,52 @@
+import os
+
+from ..src.repos import APIRepo
+from ..src.usecases.datasets import IngestFile, IngestFolder, IngestSTAC
+from ..auth import with_auth
+
+
+allowed_extensions = [
+    ".zip",
+    ".tar",
+    ".tar.gz",
+    ".csv",
+    ".txt",
+    ".json",
+    ".pdf",
+    ".md",
+    ".yml",
+]
+
+
+def ingest_q1(dataset, stac_catalog):
+    print("hola")
+    return
+
+
+@with_auth
+def ingest_file(
+    file, dataset_id, logger=None, allowed_extensions=allowed_extensions, user=None
+):
+    api_repo = APIRepo()
+    ingest = IngestFile(api_repo, allowed_extensions, logger)
+    inputs = ingest.Inputs(file=file, dataset_id=dataset_id, user=user)
+    outputs = ingest(inputs)
+    return outputs.data
+
+
+@with_auth
+def ingest_folder(folder, force, delete, logger=None, user=None):
+    api_repo = APIRepo()
+    ingest = IngestFolder(api_repo, ingest_file, allowed_extensions, logger)
+    inputs = ingest.Inputs(folder=folder, user=user, force=force, delete=delete)
+    outputs = ingest(inputs)
+    return outputs.dataset
+
+
+@with_auth
+def ingest_stac(stac_catalog, dataset, logger=None, user=None):
+    api_repo = APIRepo()
+    ingest = IngestSTAC(api_repo, ingest_file, allowed_extensions)
+    inputs = ingest.Inputs(stac_catalog=stac_catalog, dataset=dataset, user=user)
+    outputs = ingest(inputs)
+    return outputs.dataset
```
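`ingest_folder` and `ingest_stac` are the entry points the CLI calls: a plain folder goes through `IngestFolder` together with the `allowed_extensions` whitelist, while a folder containing a `catalog.json` is ingested as a STAC catalog. A minimal sketch, assuming you are authenticated; paths and names are placeholders:

```python
from pathlib import Path

from eotdl.datasets import ingest_folder, ingest_stac

# Ingest a regular folder of files (force=False, delete=False, as in the CLI defaults)
dataset = ingest_folder(Path("data/my_dataset"), False, False, logger=print)

# Ingest a STAC catalog into a dataset (dataset name is a placeholder)
ingest_stac("data/my_dataset/catalog.json", "MyDataset", logger=print)
```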
eotdl-2023.7.19/eotdl/src/models/__init__.py (new file)

```diff
@@ -0,0 +1 @@
+from .metadata import Metadata
```
eotdl-2023.7.19/eotdl/src/models/metadata.py (new file)

```diff
@@ -0,0 +1,16 @@
+from pydantic import BaseModel, validator
+from typing import List
+
+
+class Metadata(BaseModel):
+    authors: List[str]
+    license: str
+    source: str
+    name: str
+
+    # validate source is a URL
+    @validator("source")
+    def source_is_url(cls, v):
+        if not v.startswith("http") and not v.startswith("https"):
+            raise ValueError("source must be a URL")
+        return v
```
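The new `Metadata` model requires `authors`, `license`, `source` and `name`, and its validator rejects a `source` that is not an http(s) URL. A minimal sketch of how it behaves (all values are placeholders):

```python
from eotdl.src.models import Metadata

ok = Metadata(
    authors=["Jane Doe"],
    license="MIT",
    source="https://example.com/my-dataset",
    name="my-dataset",
)
print(ok.name)

try:
    Metadata(authors=["Jane Doe"], license="MIT", source="ftp://nope", name="bad")
except ValueError as e:  # pydantic wraps the "source must be a URL" error
    print(e)
```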
{eotdl-2023.6.14.post10 → eotdl-2023.7.19}/eotdl/src/repos/APIRepo.py

```diff
@@ -6,11 +6,13 @@ from concurrent.futures import ThreadPoolExecutor
 import time
 import multiprocessing
 import hashlib
+import geopandas as gpd
 
 
 class APIRepo:
     def __init__(self, url=os.getenv("EOTDL_API_URL", "https://api.eotdl.com/")):
         self.url = url
+        # print(self.url)
 
     def login(self):
         return requests.get(self.url + "auth/login")
@@ -22,6 +24,16 @@ class APIRepo:
         response = requests.get(self.url + "auth/logout")
         return response.json()["logout_url"]
 
+    def create_dataset(self, metadata, id_token):
+        response = requests.post(
+            self.url + "datasets",
+            json=metadata,
+            headers={"Authorization": "Bearer " + id_token},
+        )
+        if response.status_code == 200:
+            return response.json(), None
+        return None, response.json()["detail"]
+
     def retrieve_datasets(self):
         return requests.get(self.url + "datasets").json()
 
@@ -31,12 +43,10 @@ class APIRepo:
         return response.json(), None
         return None, response.json()["detail"]
 
-    def
-        url = self.url + "datasets/" + dataset_id + "/download"
+    def download_file(self, dataset, dataset_id, file, id_token, path):
+        url = self.url + "datasets/" + dataset_id + "/download/" + file
         headers = {"Authorization": "Bearer " + id_token}
-
-        path = str(Path.home()) + "/.eotdl/datasets"
-        os.makedirs(path, exist_ok=True)
+        path = f"{path}/{file}"
         with requests.get(url, headers=headers, stream=True) as r:
             r.raise_for_status()
             total_size = int(r.headers.get("content-length", 0))
@@ -44,10 +54,6 @@ class APIRepo:
             progress_bar = tqdm(
                 total=total_size, unit="iB", unit_scale=True, unit_divisor=1024
             )
-            filename = r.headers.get("content-disposition").split("filename=")[1][1:-1]
-            path = f"{path}/{filename}"
-            if os.path.exists(path):
-                raise Exception("File already exists")
             with open(path, "wb") as f:
                 for chunk in r.iter_content(block_size):
                     progress_bar.update(len(chunk))
@@ -56,6 +62,27 @@ class APIRepo:
             progress_bar.close()
             return path
 
+    def ingest_file(self, file, dataset_id, id_token, checksum=None):
+        reponse = requests.post(
+            self.url + "datasets/" + dataset_id,
+            files={"file": open(file, "rb")},
+            data={"checksum": checksum} if checksum else None,
+            headers={"Authorization": "Bearer " + id_token},
+        )
+        if reponse.status_code != 200:
+            return None, reponse.json()["detail"]
+        return reponse.json(), None
+
+    def ingest_file_url(self, file, dataset, id_token):
+        reponse = requests.post(
+            self.url + "datasets/url",
+            json={"dataset": dataset, "url": file},
+            headers={"Authorization": "Bearer " + id_token},
+        )
+        if reponse.status_code != 200:
+            return None, reponse.json()["detail"]
+        return reponse.json(), None
+
     def read_in_chunks(self, file_object, CHUNK_SIZE):
         while True:
             data = file_object.read(CHUNK_SIZE)
@@ -63,18 +90,21 @@ class APIRepo:
                 break
             yield data
 
-    def prepare_large_upload(self,
-
-        response = requests.
+    def prepare_large_upload(self, file, dataset_id, checksum, id_token):
+        filename = Path(file).name
+        response = requests.post(
+            self.url + f"datasets/{dataset_id}/uploadId",
+            json={"name": filename, "checksum": checksum},
+            headers={"Authorization": "Bearer " + id_token},
+        )
         if response.status_code != 200:
             raise Exception(response.json()["detail"])
         data = response.json()
-
-            data["dataset_id"],
+        upload_id, parts = (
             data["upload_id"],
             data["parts"] if "parts" in data else [],
         )
-        return
+        return upload_id, parts
 
     def get_chunk_size(self, content_size):
         # adapt chunk size to content size to avoid S3 limits (10000 parts, 500MB per part, 5TB per object)
@@ -85,37 +115,31 @@ class APIRepo:
             chunk_size = 1024 * 1024 * 500  # 0.5 GB (up to 5 TB, 10000 parts)
         return chunk_size
 
-    def ingest_large_dataset(self,
-        content_path = os.path.abspath(
+    def ingest_large_dataset(self, file, upload_id, id_token, parts):
+        content_path = os.path.abspath(file)
         content_size = os.stat(content_path).st_size
         chunk_size = self.get_chunk_size(content_size)
         total_chunks = content_size // chunk_size
-        url = self.url + "datasets/chunk"
-        headers = {
-            "Authorization": "Bearer " + id_token,
-            "Upload-Id": upload_id,
-            "Dataset-Id": dataset_id,
-        }
         # upload chunks sequentially
         pbar = tqdm(
             self.read_in_chunks(open(content_path, "rb"), chunk_size),
             total=total_chunks,
         )
         index = 0
-        parts_checkusms = []
         for chunk in pbar:
             part = index // chunk_size + 1
             offset = index + len(chunk)
             index = offset
             if part not in parts:
-                headers["Part-Number"] = str(part)
                 checksum = hashlib.md5(chunk).hexdigest()
-
-
-
-
-
-
+                response = requests.post(
+                    self.url + "datasets/chunk/" + upload_id,
+                    files={"file": chunk},
+                    data={"part_number": part, "checksum": checksum},
+                    headers={"Authorization": "Bearer " + id_token},
+                )
+                if response.status_code != 200:
+                    raise Exception(response.json()["detail"])
             pbar.set_description(
                 "{:.2f}/{:.2f} MB".format(
                     offset / 1024 / 1024, content_size / 1024 / 1024
@@ -124,16 +148,10 @@ class APIRepo:
         pbar.close()
         return
 
-    def complete_upload(self,
-        url = self.url + "datasets/complete"
+    def complete_upload(self, id_token, upload_id):
         r = requests.post(
-            url,
-
-            headers={
-                "Authorization": "Bearer " + id_token,
-                "Upload-Id": upload_id,
-                "Dataset-Id": dataset_id,
-            },
+            self.url + "datasets/complete/" + upload_id,
+            headers={"Authorization": "Bearer " + id_token},
         )
         if r.status_code != 200:
             return None, r.json()["detail"]
@@ -200,63 +218,29 @@ class APIRepo:
         return None, r.json()["detail"]
         return r.json(), None
 
-    def
-
-
-
-
-
-
-
-    ):
-        # Create thread pool executor
-        max_workers = threads if threads > 0 else multiprocessing.cpu_count()
-        executor = ThreadPoolExecutor(max_workers=max_workers)
-
-        # Divide file into chunks and create tasks for each chunk
-        offset = 0
-        tasks = []
-        content_path = os.path.abspath(path)
-        content_size = os.stat(content_path).st_size
-        chunk_size = self.get_chunk_size(content_size)
-        total_chunks = content_size // chunk_size
-        while offset < content_size:
-            chunk_end = min(offset + chunk_size, content_size)
-            part = str(offset // chunk_size + 1)
-            if part not in parts:
-                tasks.append((offset, chunk_end, part))
-            offset = chunk_end
-
-        # Define the function that will upload each chunk
-        def upload_chunk(start, end, part):
-            # print(f"Uploading chunk {start} - {end}", part)
-            with open(content_path, "rb") as f:
-                f.seek(start)
-                chunk = f.read(end - start)
-                checksum = hashlib.md5(chunk).hexdigest()
-                response = requests.post(
-                    self.url + "datasets/chunk",
-                    files={"file": chunk},
-                    headers={
-                        "Authorization": "Bearer " + id_token,
-                        "Upload-Id": upload_id,
-                        "Dataset-Id": dataset_id,
-                        "Checksum": checksum,
-                        "Part-Number": str(part),
-                    },
-                )
-                if response.status_code != 200:
-                    print(f"Failed to upload chunk {start} - {end}")
-                return response
+    def delete_file(self, dataset_id, file_name, id_token):
+        response = requests.delete(
+            self.url + "datasets/" + dataset_id + "/file/" + file_name,
+            headers={"Authorization": "Bearer " + id_token},
+        )
+        if response.status_code != 200:
+            return None, response.json()["detail"]
+        return response.json(), None
 
-
-
-
-
-
-
-
+    def ingest_stac(self, stac_json, dataset, id_token):
+        reponse = requests.post(
+            self.url + "datasets/stac",
+            json={"dataset": dataset, "stac": stac_json},
+            headers={"Authorization": "Bearer " + id_token},
+        )
+        if reponse.status_code != 200:
+            return None, reponse.json()["detail"]
+        return reponse.json(), None
 
-
-
-
+    def download_stac(self, dataset_id, id_token):
+        url = self.url + "datasets/" + dataset_id + "/download"
+        headers = {"Authorization": "Bearer " + id_token}
+        response = requests.get(url, headers=headers)
+        if response.status_code != 200:
+            return None, response.json()["detail"]
+        return gpd.GeoDataFrame.from_features(response.json()["features"]), None
```
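The chunked-upload API changed shape in this release: the upload id now travels in the URL (`datasets/{dataset_id}/uploadId`, `datasets/chunk/{upload_id}`, `datasets/complete/{upload_id}`), the part number and checksum moved from custom headers into form data, and the old thread-pooled `datasets/chunk` uploader was removed. A minimal sketch of the three-step flow these methods implement, assuming a valid `id_token` and `dataset_id` (placeholders) and reusing `calculate_checksum` from `eotdl.src.utils`:

```python
from eotdl.src.repos import APIRepo
from eotdl.src.utils import calculate_checksum

repo = APIRepo()
file_path = "data/my_dataset.zip"                      # placeholder
dataset_id, id_token = "<dataset-id>", "<id-token>"    # placeholders

# 1. Ask the API for an upload id (plus any parts already uploaded, for resuming)
checksum = calculate_checksum(file_path)
upload_id, parts = repo.prepare_large_upload(file_path, dataset_id, checksum, id_token)

# 2. Stream the file in chunks, skipping parts that were already uploaded
repo.ingest_large_dataset(file_path, upload_id, id_token, parts)

# 3. Tell the API the multipart upload is complete
data, error = repo.complete_upload(id_token, upload_id)
```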
eotdl-2023.7.19/eotdl/src/usecases/datasets/DownloadDataset.py (new file)

```diff
@@ -0,0 +1,79 @@
+from pydantic import BaseModel
+from ....src.utils import calculate_checksum
+from ....curation.stac import STACDataFrame
+from pathlib import Path
+import os
+
+
+class DownloadDataset:
+    def __init__(self, repo, retrieve_dataset, logger):
+        self.repo = repo
+        self.retrieve_dataset = retrieve_dataset
+        self.logger = logger if logger else print
+
+    class Inputs(BaseModel):
+        dataset: str
+        file: str = None
+        path: str = None
+        user: dict
+
+    class Outputs(BaseModel):
+        dst_path: str
+
+    def download(self, dataset, dataset_id, file, checksum, path, user):
+        self.logger(f"Downloading {file}")
+        dst_path = self.repo.download_file(
+            dataset, dataset_id, file, user["id_token"], path
+        )
+        if calculate_checksum(dst_path) != checksum:
+            self.logger(f"Checksum for {file} does not match")
+        self.logger(f"Done")
+        return dst_path
+
+    def __call__(self, inputs: Inputs) -> Outputs:
+        dataset = self.retrieve_dataset(inputs.dataset)
+        if inputs.path is None:
+            download_path = str(Path.home()) + "/.eotdl/datasets/" + inputs.dataset
+        else:
+            download_path = inputs.path + "/" + inputs.dataset
+        os.makedirs(download_path, exist_ok=True)
+        if dataset["quality"] == 0:
+            if inputs.file:
+                files = [f for f in dataset["files"] if f["name"] == inputs.file]
+                if not files:
+                    raise Exception(f"File {inputs.file} not found")
+                if len(files) > 1:
+                    raise Exception(f"Multiple files with name {inputs.file} found")
+                dst_path = self.download(
+                    inputs.dataset,
+                    dataset["id"],
+                    inputs.file,
+                    files[0]["checksum"],
+                    download_path,
+                    inputs.user,
+                )
+                return self.Outputs(dst_path=dst_path)
+            for file in dataset["files"]:
+                dst_path = self.download(
+                    inputs.dataset,
+                    dataset["id"],
+                    file["name"],
+                    file["checksum"],
+                    download_path,
+                    inputs.user,
+                )
+            return self.Outputs(dst_path="/".join(dst_path.split("/")[:-1]))
+        else:
+            gdf, error = self.repo.download_stac(
+                dataset["id"],
+                inputs.user["id_token"],
+            )
+            if error:
+                raise Exception(error)
+            df = STACDataFrame(gdf)
+            # df.geometry = df.geometry.apply(lambda x: Polygon() if x is None else x)
+            path = inputs.path
+            if path is None:
+                path = str(Path.home()) + "/.eotdl/datasets/" + dataset["name"]
+            df.to_stac(path)
+            return self.Outputs(dst_path=path)
```
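`DownloadDataset` follows the package's use-case pattern: a callable class with nested pydantic `Inputs`/`Outputs`, branching on the dataset's `quality` field (plain file downloads with checksum verification for Q0, otherwise a STAC export written back out via `STACDataFrame.to_stac`). A minimal sketch of driving it directly instead of through `download_dataset`, assuming an authenticated `user` dict containing an `id_token` (placeholders throughout):

```python
from eotdl.src.repos import APIRepo
from eotdl.src.usecases.datasets import DownloadDataset
from eotdl.datasets.retrieve import retrieve_dataset

user = {"id_token": "<id-token>"}  # normally injected by the @with_auth decorator

download = DownloadDataset(APIRepo(), retrieve_dataset, print)
inputs = download.Inputs(dataset="MyDataset", file=None, path=None, user=user)
outputs = download(inputs)
print(outputs.dst_path)
```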