eotdl 2023.6.14.post10__py3-none-any.whl → 2023.7.19__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to their respective public registries. It is provided for informational purposes only.
eotdl/cli.py CHANGED
@@ -6,6 +6,5 @@ app = typer.Typer()
  app.add_typer(auth.app, name="auth")
  app.add_typer(datasets.app, name="datasets")

-
  if __name__ == "__main__":
      app()
@@ -1,75 +1,64 @@
  import typer
+ from pathlib import Path
+
  from ..datasets import (
      retrieve_datasets,
      download_dataset,
-     update_dataset,
-     ingest_large_dataset,
-     # ingest_large_dataset_parallel,
+     ingest_folder,
+     ingest_stac,
  )
- from .auth import auth

  app = typer.Typer()


  @app.command()
- def list():
-     """
-     List all datasets
-     """
-     datasets = retrieve_datasets()
-     typer.echo(datasets)
-
-
- @app.command()
- def get(name: str, path: str = None):
+ def ingest(
+     path: Path,
+     f: bool = typer.Option(False, "--f", help="Force ingest even if file exists"),
+     d: bool = typer.Option(False, "--d", help="Delete files not in the dataset"),
+ ):
      """
-     Download a dataset
+     Ingest a dataset

-     name: Name of the dataset
-     path: Path to download the dataset to
+     path: Path to folder with the dataset
      """
      try:
-         dst_path = download_dataset(name, path, typer.echo)
-         typer.echo(f"Dataset {name} downloaded to {dst_path}")
+         if not path.is_dir():
+             typer.echo("Path must be a folder")
+             return
+         if "catalog.json" in [f.name for f in path.iterdir()]:
+             ingest_stac(str(path) + "/catalog.json", typer.echo)
+         else:
+             ingest_folder(path, f, d, typer.echo)
      except Exception as e:
          typer.echo(e)


  @app.command()
- def ingest(
-     path: str,
-     name: str,
-     # p: Optional[int] = 0,
- ):
+ def list():
      """
-     Ingest a dataset
-
-     path: Path to dataset to ingest
-     n: Name of the dataset
+     List all datasets and files
      """
-     try:
-         # if p:
-         #     ingest_large_dataset_parallel(name, path, user, p, typer.echo)
-         ingest_large_dataset(name, path, typer.echo)
-         typer.echo(f"Dataset {name} ingested")
-     except Exception as e:
-         typer.echo(e)
+     datasets = retrieve_datasets()
+     typer.echo(datasets)


  @app.command()
- def update(
-     name: str,
-     path: str,
+ def get(
+     dataset: str,
+     path: str = None,
+     file: str = None,
  ):
      """
-     Update a dataset
+     Download a dataset

-     name: Name of the dataset
-     path: Path to dataset to ingest
+     dataset: Name of the dataset
+     file: Name of the file to download (optional, if not provided, the whole dataset will be downloaded)
+     path: Path to download the dataset to (optional, if not provided, the dataset will be downloaded to ~/.eotdl/datasets)
      """
      try:
-         update_dataset(name, path, typer.echo)
-         typer.echo(f"Dataset {name} updated")
+         dst_path = download_dataset(dataset, file, path, typer.echo)
+         typer.echo(f"Data available at {dst_path}")
      except Exception as e:
          typer.echo(e)

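For orientation (not part of the diff): the reworked `datasets` commands can be exercised in-process with Typer's test runner. The subcommand name comes from the cli.py hunk above; the dataset name and paths below are placeholders.

from typer.testing import CliRunner
from eotdl.cli import app

runner = CliRunner()

# Ingest a local folder; --f forces re-ingestion, --d deletes files not in the dataset
result = runner.invoke(app, ["datasets", "ingest", "path/to/dataset", "--f"])
print(result.output)

# Download a single file from a dataset; both --file and --path are optional
result = runner.invoke(app, ["datasets", "get", "MyDataset", "--file", "data.csv"])
print(result.output)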
@@ -1,4 +1,4 @@
  # from .stac import STACGenerator
  # from .utils import format_time_acquired
  # from .parsers import STACIdParser, StructuredParser, UnestructuredParser
- # from .dataframe import STACDataFrame, read_stac
+ from .dataframe import STACDataFrame, read_stac
@@ -6,12 +6,10 @@ import pandas as pd
  import geopandas as gpd
  import pystac
  import json
- import os
- from xcube_geodb.core.geodb import GeoDBClient
  from geomet import wkt
  from os.path import join
  from os import makedirs
-
+ from typing import Union
  from math import isnan
  from .utils import convert_df_geom_to_shape, get_all_children

@@ -27,113 +25,7 @@ class STACDataFrame(gpd.GeoDataFrame):
          """
          return read_stac(stac_file)

-     @classmethod
-     def from_geodb(
-         self,
-         server_url: str,
-         server_port: int | str,
-         client_id: str,
-         client_secret: str,
-         auth_aud: str,
-         collection: str,
-         database: str = None,
-     ):
-         """
-         Create a STACDataFrame from a GeoDB collection
-
-         :param server_url: GeoDB server url
-         :param server_port: GeoDB server port
-         :param client_id: GeoDB client id
-         :param client_secret: GeoDB client secret
-         :param auth_aud: GeoDB auth aud
-         :param collection: GeoDB collection
-         :param database: GeoDB database
-         """
-         geodb_client = GeoDBClient(
-             server_url=server_url,
-             server_port=server_port,
-             client_id=client_id,
-             client_secret=client_secret,
-             auth_aud=auth_aud,
-         )
-
-         data = geodb_client.get_collection(collection, database=database)
-
-         return STACDataFrame(data, crs="EPSG:4326")
-
-     def ingest(
-         self,
-         collection: str,
-         server_url: str = os.environ["SERVER_URL"],
-         server_port: int = os.environ["SERVER_PORT"],
-         client_id: str = os.environ["CLIENT_ID"],
-         client_secret: str = os.environ["CLIENT_SECRET"],
-         auth_aud: str = os.environ["AUTH_DOMAIN"],
-         database: str = None,
-     ):
-         """
-         Create a GeoDB collection from a STACDataFrame
-
-         :param collection: dataset name (GeoDB collection)
-         :param server_url: GeoDB server url
-         :param server_port: GeoDB server port
-         :param client_id: GeoDB client id
-         :param client_secret: GeoDB client secret
-         :param auth_aud: GeoDB auth aud
-         :param database: GeoDB database
-         """
-
-         geodb_client = GeoDBClient(
-             server_url=server_url,
-             server_port=server_port,
-             client_id=client_id,
-             client_secret=client_secret,
-             auth_aud=auth_aud,
-         )
-
-         # TODO: check name is unique (use eotdl-cli)
-
-         # TODO: ingest assets (only if local)
-         # TODO: rename assets in the dataframe with URLs (only if local)
-
-         # ingest to geodb
-
-         # Check if the collection already exists
-         if geodb_client.collection_exists(collection, database=database):
-             # geodb_client.drop_collection(collection, database=database)
-             raise Exception(f"Collection {collection} already exists")
-
-         # Rename the column id to stac_id, to avoid conflicts with the id column
-         self.rename(columns={"id": "stac_id"}, inplace=True)
-         # Fill the NaN with '' to avoid errors, except in the geometry column
-         copy = self.copy()
-         columns_to_fill = copy.columns.drop("geometry")
-         self[columns_to_fill] = self[columns_to_fill].fillna("")
-
-         # Create the collection if it does not exist
-         # and insert the data
-         collections = {collection: self._create_collection_structure(self.columns)}
-         geodb_client.create_collections(collections, database=database)
-
-         geodb_client.insert_into_collection(collection, database=database, values=self)
-
-         # TODO: save data in eotdl
-
-     def _create_collection_structure(self, columns: list) -> dict:
-         """
-         Create the schema structure of a GeoDB collection from a STACDataFrame
-
-         :param columns: columns of the STACDataFrame
-         """
-         stac_collection = {"crs": 4326, "properties": {}}
-
-         for column in columns:
-             if column not in ("geometry", "id"):
-                 stac_collection["properties"][column] = "json"
-
-         return stac_collection
-
-     def to_stac(self):
+     def to_stac(self, path):
          """
          Create a STAC catalog and children from a STACDataFrame
          """
@@ -150,11 +42,10 @@ class STACDataFrame(gpd.GeoDataFrame):
          catalog_df = df[df["type"] == "Catalog"]

          if catalog_df.empty:
-             root_output_folder = "output"
-             makedirs(root_output_folder, exist_ok=True)
+             makedirs(path, exist_ok=True)
          else:
              for index, row in catalog_df.iterrows():
-                 root_output_folder = row[id_column]
+                 root_output_folder = path + "/" + row[id_column]
                  makedirs(root_output_folder, exist_ok=True)
                  row_json = row.to_dict()

@@ -228,7 +119,7 @@ class STACDataFrame(gpd.GeoDataFrame):


  def read_stac(
-     stac_file: pystac.Catalog | pystac.Collection | str,
+     stac_file: Union[pystac.Catalog, pystac.Collection, str],
      geometry_column: str = "geometry",
  ) -> STACDataFrame:
      """
@@ -0,0 +1,253 @@
+ """
+ Module for the STAC dataframe
+ """
+
+ import pandas as pd
+ import geopandas as gpd
+ import pystac
+ import json
+ import os
+ from xcube_geodb.core.geodb import GeoDBClient
+ from geomet import wkt
+ from os.path import join
+ from os import makedirs
+
+ from math import isnan
+ from .utils import convert_df_geom_to_shape, get_all_children
+
+
+ class STACDataFrame(gpd.GeoDataFrame):
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+
+     @classmethod
+     def from_stac_file(self, stac_file):
+         """
+         Create a STACDataFrame from a STAC file
+         """
+         return read_stac(stac_file)
+
+     @classmethod
+     def from_geodb(
+         self,
+         server_url: str,
+         server_port: int | str,
+         client_id: str,
+         client_secret: str,
+         auth_aud: str,
+         collection: str,
+         database: str = None,
+     ):
+         """
+         Create a STACDataFrame from a GeoDB collection
+
+         :param server_url: GeoDB server url
+         :param server_port: GeoDB server port
+         :param client_id: GeoDB client id
+         :param client_secret: GeoDB client secret
+         :param auth_aud: GeoDB auth aud
+         :param collection: GeoDB collection
+         :param database: GeoDB database
+         """
+         geodb_client = GeoDBClient(
+             server_url=server_url,
+             server_port=server_port,
+             client_id=client_id,
+             client_secret=client_secret,
+             auth_aud=auth_aud,
+         )
+
+         data = geodb_client.get_collection(collection, database=database)
+
+         return STACDataFrame(data, crs="EPSG:4326")
+
+     def ingest(
+         self,
+         collection: str,
+         server_url: str = os.environ["SERVER_URL"],
+         server_port: int = os.environ["SERVER_PORT"],
+         client_id: str = os.environ["CLIENT_ID"],
+         client_secret: str = os.environ["CLIENT_SECRET"],
+         auth_aud: str = os.environ["AUTH_DOMAIN"],
+         database: str = None,
+     ):
+         """
+         Create a GeoDB collection from a STACDataFrame
+
+         :param collection: dataset name (GeoDB collection)
+         :param server_url: GeoDB server url
+         :param server_port: GeoDB server port
+         :param client_id: GeoDB client id
+         :param client_secret: GeoDB client secret
+         :param auth_aud: GeoDB auth aud
+         :param database: GeoDB database
+         """
+
+         geodb_client = GeoDBClient(
+             server_url=server_url,
+             server_port=server_port,
+             client_id=client_id,
+             client_secret=client_secret,
+             auth_aud=auth_aud,
+         )
+
+         # TODO: check name is unique (use eotdl-cli)
+
+         # TODO: ingest assets (only if local)
+         # TODO: rename assets in the dataframe with URLs (only if local)
+
+         # ingest to geodb
+
+         # Check if the collection already exists
+         if geodb_client.collection_exists(collection, database=database):
+             # geodb_client.drop_collection(collection, database=database)
+             raise Exception(f"Collection {collection} already exists")
+
+         # Rename the column id to stac_id, to avoid conflicts with the id column
+         self.rename(columns={"id": "stac_id"}, inplace=True)
+         # Fill the NaN with '' to avoid errors, except in the geometry column
+         copy = self.copy()
+         columns_to_fill = copy.columns.drop("geometry")
+         self[columns_to_fill] = self[columns_to_fill].fillna("")
+
+         # Create the collection if it does not exist
+         # and insert the data
+         collections = {collection: self._create_collection_structure(self.columns)}
+         geodb_client.create_collections(collections, database=database)
+
+         geodb_client.insert_into_collection(collection, database=database, values=self)
+
+         # TODO: save data in eotdl
+
+     def _create_collection_structure(self, columns: list) -> dict:
+         """
+         Create the schema structure of a GeoDB collection from a STACDataFrame
+
+         :param columns: columns of the STACDataFrame
+         """
+         stac_collection = {"crs": 4326, "properties": {}}
+
+         for column in columns:
+             if column not in ("geometry", "id"):
+                 stac_collection["properties"][column] = "json"
+
+         return stac_collection
+
+     def to_stac(self):
+         """
+         Create a STAC catalog and children from a STACDataFrame
+         """
+         df = self.copy()
+
+         if "id" in df.columns and "stac_id" in df.columns:
+             id_column = "stac_id"
+             stac_id_exists = True
+         else:
+             id_column = "id"
+             stac_id_exists = False
+
+         # First, create the catalog and its folder, if exists
+         catalog_df = df[df["type"] == "Catalog"]
+
+         if catalog_df.empty:
+             root_output_folder = "output"
+             makedirs(root_output_folder, exist_ok=True)
+         else:
+             for index, row in catalog_df.iterrows():
+                 root_output_folder = row[id_column]
+                 makedirs(root_output_folder, exist_ok=True)
+                 row_json = row.to_dict()
+
+                 # Curate the json row
+                 row_json = self.curate_json_row(row_json, stac_id_exists)
+
+                 with open(join(root_output_folder, f"catalog.json"), "w") as f:
+                     json.dump(row_json, f)
+
+         # Second, create the collections and their folders, if exist
+         collections = dict()
+         collections_df = df[df["type"] == "Collection"]
+         for index, row in collections_df.iterrows():
+             stac_output_folder = join(root_output_folder, row[id_column])
+             collections[row[id_column]] = stac_output_folder
+             makedirs(stac_output_folder, exist_ok=True)
+             row_json = row.to_dict()
+
+             # Curate the json row
+             row_json = self.curate_json_row(row_json, stac_id_exists)
+
+             with open(join(stac_output_folder, f"collection.json"), "w") as f:
+                 json.dump(row_json, f)
+
+         # Then, create the items and their folders, if exist
+         features_df = df[df["type"] == "Feature"]
+         for index, row in features_df.iterrows():
+             collection = row["collection"]
+             stac_output_folder = join(collections[collection], row[id_column])
+
+             # Convert the geometry from WKT back to geojson
+             row["geometry"] = row["geometry"].wkt
+             row["geometry"] = wkt.loads(row["geometry"])
+             makedirs(stac_output_folder, exist_ok=True)
+             row_json = row.to_dict()
+
+             # Curate the json row
+             row_json = self.curate_json_row(row_json, stac_id_exists)
+
+             with open(join(stac_output_folder, f'{row_json["id"]}.json'), "w") as f:
+                 json.dump(row_json, f)
+
+     def curate_json_row(self, row: dict, stac_id_exists: bool) -> dict:
+         """
+         Curate the json row of a STACDataFrame, in order to generate a valid STAC file
+
+         :param row: row of a STACDataFrame
+         :param stac_id_exists: if the stac_id column exists
+         """
+         keys_to_remove = list()
+
+         # Remove the created_at and modified_at columns, if the STACDataFrame comes from GeoDB
+         for i in "created_at", "modified_at":
+             if i in row.keys():
+                 keys_to_remove.append(i)
+
+         # Rename the stac_id column to id, to avoid conflicts with the id column
+         if stac_id_exists:
+             row["id"] = row["stac_id"]
+             del row["stac_id"]
+
+         # Remove the NaN values and empty strings
+         for k, v in row.items():
+             if (isinstance(v, float) and isnan(v)) or v == "":
+                 keys_to_remove.append(k)
+         for key in keys_to_remove:
+             del row[key]
+         del row["geometry"]
+
+         return row
+
+
+ def read_stac(
+     stac_file: pystac.Catalog | pystac.Collection | str,
+     geometry_column: str = "geometry",
+ ) -> STACDataFrame:
+     """
+     Read a STAC file and return a STACDataFrame
+
+     :param stac_file: STAC file to read
+     :param geometry_column: name of the geometry column
+     """
+     if isinstance(stac_file, str):
+         stac_file = pystac.read_file(stac_file)
+     children = get_all_children(stac_file)
+
+     # Convert Dataframe to STACDataFrame
+     dataframe = pd.DataFrame(children)
+     dataframe[geometry_column] = dataframe.apply(convert_df_geom_to_shape, axis=1)
+     stac_dataframe = STACDataFrame(
+         dataframe,
+         crs="EPSG:4326",
+         geometry=gpd.GeoSeries.from_wkt(dataframe[geometry_column]),
+     )
+
+     return stac_dataframe
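One caveat worth flagging in this relocated module: the `ingest` defaults are evaluated at import time (`os.environ["SERVER_URL"]` and friends), so merely importing it raises `KeyError` unless SERVER_URL, SERVER_PORT, CLIENT_ID, CLIENT_SECRET and AUTH_DOMAIN are set. A hedged sketch of the GeoDB round-trip it retains, with placeholder endpoint and credentials:

# All endpoint and credential values below are placeholders.
df = STACDataFrame.from_geodb(
    server_url="https://geodb.example.com",
    server_port=443,
    client_id="my-client-id",
    client_secret="my-client-secret",
    auth_aud="https://auth.example.com",
    collection="my-dataset",
)
df.to_stac()  # this variant still writes under "output/" (or the catalog id folder)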
@@ -1,4 +1,3 @@
- from .ingest import ingest_dataset, ingest_large_dataset, ingest_q0, ingest_q1
+ from .ingest import ingest_file, ingest_folder, ingest_q1, ingest_stac
  from .download import download_dataset
  from .retrieve import retrieve_datasets, retrieve_dataset, list_datasets
- from .update import update_dataset
@@ -1,18 +1,13 @@
  from ..src.repos import APIRepo
- from ..src.usecases.datasets import DownloadDataset
+ from ..src.usecases.datasets import DownloadDataset, DownloadFile
  from .retrieve import retrieve_dataset
  from ..auth import with_auth


  @with_auth
- def download_dataset(name, path=None, logger=None, user=None):
-     dataset = retrieve_dataset(name)
-     dataset_id = dataset["id"]
-     checksum = dataset["checksum"]
+ def download_dataset(dataset, file, path=None, logger=None, user=None):
      api_repo = APIRepo()
-     download = DownloadDataset(api_repo, logger)
-     inputs = download.Inputs(
-         dataset=dataset_id, checksum=checksum, path=path, user=user
-     )
+     download = DownloadDataset(api_repo, retrieve_dataset, logger)
+     inputs = download.Inputs(dataset=dataset, file=file, path=path, user=user)
      outputs = download(inputs)
      return outputs.dst_path
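Hypothetical call sites for the new signature (dataset and file names are placeholders); `user` is injected by `@with_auth`, so callers supply at most the first four arguments:

from eotdl.datasets import download_dataset

# Whole dataset to the default location (~/.eotdl/datasets)
dst = download_dataset("MyDataset", None, None, print)

# A single file to an explicit folder
dst = download_dataset("MyDataset", "data.csv", "/tmp/eotdl", print)
print(dst)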
eotdl/datasets/ingest.py CHANGED
@@ -1,32 +1,52 @@
+ import os
+
  from ..src.repos import APIRepo
- from ..src.usecases.datasets import IngestDataset, IngestLargeDataset
+ from ..src.usecases.datasets import IngestFile, IngestFolder, IngestSTAC
  from ..auth import with_auth


+ allowed_extensions = [
+     ".zip",
+     ".tar",
+     ".tar.gz",
+     ".csv",
+     ".txt",
+     ".json",
+     ".pdf",
+     ".md",
+     ".yml",
+ ]
+
+
+ def ingest_q1(dataset, stac_catalog):
+     print("hola")
+     return
+
+
  @with_auth
- def ingest_dataset(name, description, path, logger=None, user=None):
+ def ingest_file(
+     file, dataset_id, logger=None, allowed_extensions=allowed_extensions, user=None
+ ):
      api_repo = APIRepo()
-     ingest = IngestDataset(
-         api_repo,
-     )
-     inputs = ingest.Inputs(name=name, description=description, path=path, user=user)
+     ingest = IngestFile(api_repo, allowed_extensions, logger)
+     inputs = ingest.Inputs(file=file, dataset_id=dataset_id, user=user)
      outputs = ingest(inputs)
-     return outputs.dataset
+     return outputs.data


  @with_auth
- def ingest_large_dataset(name, path, logger=None, user=None):
+ def ingest_folder(folder, force, delete, logger=None, user=None):
      api_repo = APIRepo()
-     ingest = IngestLargeDataset(api_repo, logger)
-     inputs = ingest.Inputs(name=name, path=path, user=user)
+     ingest = IngestFolder(api_repo, ingest_file, allowed_extensions, logger)
+     inputs = ingest.Inputs(folder=folder, user=user, force=force, delete=delete)
      outputs = ingest(inputs)
      return outputs.dataset


- def ingest_q0(dataset, path):
-     return ingest_large_dataset(dataset, path)
-
-
- def ingest_q1(dataset, stac_catalog):
-     print("holas")
-     return
+ @with_auth
+ def ingest_stac(stac_catalog, dataset, logger=None, user=None):
+     api_repo = APIRepo()
+     ingest = IngestSTAC(api_repo, ingest_file, allowed_extensions)
+     inputs = ingest.Inputs(stac_catalog=stac_catalog, dataset=dataset, user=user)
+     outputs = ingest(inputs)
+     return outputs.dataset
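Hypothetical usage of the new ingestion helpers (paths and dataset names are placeholders); the CLI hunk above passes its `--f`/`--d` flags straight through as `force`/`delete`:

from pathlib import Path
from eotdl.datasets import ingest_folder, ingest_stac

# Upload every file with an allowed extension from a local folder
dataset = ingest_folder(Path("path/to/dataset"), False, False, print)

# Ingest a STAC catalog into an existing dataset
dataset = ingest_stac("path/to/catalog.json", "MyDataset", print)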
@@ -0,0 +1 @@
+ from .metadata import Metadata
@@ -0,0 +1,16 @@
+ from pydantic import BaseModel, validator
+ from typing import List
+
+
+ class Metadata(BaseModel):
+     authors: List[str]
+     license: str
+     source: str
+     name: str
+
+     # validate source is a URL
+     @validator("source")
+     def source_is_url(cls, v):
+         if not v.startswith("http") and not v.startswith("https"):
+             raise ValueError("source must be a URL")
+         return v
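A usage sketch for the new model (author, license and URL values are placeholders). Note the validator's second condition is redundant, since any string starting with "https" also starts with "http":

from pydantic import ValidationError
# Metadata as defined in the module above

meta = Metadata(
    authors=["Jane Doe"],
    license="MIT",
    source="https://example.com/my-dataset",
    name="MyDataset",
)

try:
    Metadata(authors=["Jane Doe"], license="MIT", source="not-a-url", name="bad")
except ValidationError as e:
    print(e)  # pydantic surfaces the ValueError: source must be a URL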