eotdl 2023.6.14.post10__py3-none-any.whl → 2023.7.19__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to their respective public registries. It is provided for informational purposes only.
eotdl/cli.py CHANGED
@@ -6,6 +6,5 @@ app = typer.Typer()
  app.add_typer(auth.app, name="auth")
  app.add_typer(datasets.app, name="datasets")

-
  if __name__ == "__main__":
      app()
@@ -1,75 +1,64 @@
  import typer
+ from pathlib import Path
+
  from ..datasets import (
      retrieve_datasets,
      download_dataset,
-     update_dataset,
-     ingest_large_dataset,
-     # ingest_large_dataset_parallel,
+     ingest_folder,
+     ingest_stac,
  )
- from .auth import auth

  app = typer.Typer()


  @app.command()
- def list():
-     """
-     List all datasets
-     """
-     datasets = retrieve_datasets()
-     typer.echo(datasets)
-
-
- @app.command()
- def get(name: str, path: str = None):
+ def ingest(
+     path: Path,
+     f: bool = typer.Option(False, "--f", help="Force ingest even if file exists"),
+     d: bool = typer.Option(False, "--d", help="Delete files not in the dataset"),
+ ):
      """
-     Download a dataset
+     Ingest a dataset

-     name: Name of the dataset
-     path: Path to download the dataset to
+     path: Path to folder with the dataset
      """
      try:
-         dst_path = download_dataset(name, path, typer.echo)
-         typer.echo(f"Dataset {name} downloaded to {dst_path}")
+         if not path.is_dir():
+             typer.echo("Path must be a folder")
+             return
+         if "catalog.json" in [f.name for f in path.iterdir()]:
+             ingest_stac(str(path) + "/catalog.json", typer.echo)
+         else:
+             ingest_folder(path, f, d, typer.echo)
      except Exception as e:
          typer.echo(e)


  @app.command()
- def ingest(
-     path: str,
-     name: str,
-     # p: Optional[int] = 0,
- ):
+ def list():
      """
-     Ingest a dataset
-
-     path: Path to dataset to ingest
-     n: Name of the dataset
+     List all datasets and files
      """
-     try:
-         # if p:
-         #     ingest_large_dataset_parallel(name, path, user, p, typer.echo)
-         ingest_large_dataset(name, path, typer.echo)
-         typer.echo(f"Dataset {name} ingested")
-     except Exception as e:
-         typer.echo(e)
+     datasets = retrieve_datasets()
+     typer.echo(datasets)


  @app.command()
- def update(
-     name: str,
-     path: str,
+ def get(
+     dataset: str,
+     path: str = None,
+     file: str = None,
  ):
      """
-     Update a dataset
+     Download a dataset

-     name: Name of the dataset
-     path: Path to dataset to ingest
+     dataset: Name of the dataset
+     file: Name of the file to download (optional, if not provided, the whole dataset will be downloaded)
+     path: Path to download the dataset to (optional, if not provided, the dataset will be downloaded to ~/.eotdl/datasets)
      """
      try:
-         update_dataset(name, path, typer.echo)
-         typer.echo(f"Dataset {name} updated")
+         dst_path = download_dataset(dataset, file, path, typer.echo)
+         typer.echo(f"Data available at {dst_path}")
      except Exception as e:
          typer.echo(e)

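For orientation (not part of the diff): the reworked `datasets` commands can be exercised in-process with Typer's test runner. The subcommand name comes from the cli.py hunk above; the dataset name and paths below are placeholders.

from typer.testing import CliRunner
from eotdl.cli import app

runner = CliRunner()

# Ingest a local folder; --f forces re-ingestion, --d deletes files not in the dataset
result = runner.invoke(app, ["datasets", "ingest", "path/to/dataset", "--f"])
print(result.output)

# Download a single file from a dataset; both --file and --path are optional
result = runner.invoke(app, ["datasets", "get", "MyDataset", "--file", "data.csv"])
print(result.output)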
@@ -1,4 +1,4 @@
  # from .stac import STACGenerator
  # from .utils import format_time_acquired
  # from .parsers import STACIdParser, StructuredParser, UnestructuredParser
- # from .dataframe import STACDataFrame, read_stac
+ from .dataframe import STACDataFrame, read_stac
@@ -6,12 +6,10 @@ import pandas as pd
  import geopandas as gpd
  import pystac
  import json
- import os
- from xcube_geodb.core.geodb import GeoDBClient
  from geomet import wkt
  from os.path import join
  from os import makedirs
-
+ from typing import Union
  from math import isnan
  from .utils import convert_df_geom_to_shape, get_all_children

@@ -27,113 +25,7 @@ class STACDataFrame(gpd.GeoDataFrame):
          """
          return read_stac(stac_file)

-     @classmethod
-     def from_geodb(
-         self,
-         server_url: str,
-         server_port: int | str,
-         client_id: str,
-         client_secret: str,
-         auth_aud: str,
-         collection: str,
-         database: str = None,
-     ):
-         """
-         Create a STACDataFrame from a GeoDB collection
-
-         :param server_url: GeoDB server url
-         :param server_port: GeoDB server port
-         :param client_id: GeoDB client id
-         :param client_secret: GeoDB client secret
-         :param auth_aud: GeoDB auth aud
-         :param collection: GeoDB collection
-         :param database: GeoDB database
-         """
-         geodb_client = GeoDBClient(
-             server_url=server_url,
-             server_port=server_port,
-             client_id=client_id,
-             client_secret=client_secret,
-             auth_aud=auth_aud,
-         )
-
-         data = geodb_client.get_collection(collection, database=database)
-
-         return STACDataFrame(data, crs="EPSG:4326")
-
-     def ingest(
-         self,
-         collection: str,
-         server_url: str = os.environ["SERVER_URL"],
-         server_port: int = os.environ["SERVER_PORT"],
-         client_id: str = os.environ["CLIENT_ID"],
-         client_secret: str = os.environ["CLIENT_SECRET"],
-         auth_aud: str = os.environ["AUTH_DOMAIN"],
-         database: str = None,
-     ):
-         """
-         Create a GeoDB collection from a STACDataFrame
-
-         :param collection: dataset name (GeoDB collection)
-         :param server_url: GeoDB server url
-         :param server_port: GeoDB server port
-         :param client_id: GeoDB client id
-         :param client_secret: GeoDB client secret
-         :param auth_aud: GeoDB auth aud
-         :param database: GeoDB database
-         """
-
-         geodb_client = GeoDBClient(
-             server_url=server_url,
-             server_port=server_port,
-             client_id=client_id,
-             client_secret=client_secret,
-             auth_aud=auth_aud,
-         )
-
-         # TODO: check name is unique (use eotdl-cli)
-
-         # TODO: ingest assets (only if local)
-         # TODO: rename assets in the dataframe with URLs (only if local)
-
-         # ingest to geodb
-
-         # Check if the collection already exists
-         if geodb_client.collection_exists(collection, database=database):
-             # geodb_client.drop_collection(collection, database=database)
-             raise Exception(f"Collection {collection} already exists")
-
-         # Rename the column id to stac_id, to avoid conflicts with the id column
-         self.rename(columns={"id": "stac_id"}, inplace=True)
-         # Fill the NaN with '' to avoid errors, except in the geometry column
-         copy = self.copy()
-         columns_to_fill = copy.columns.drop("geometry")
-         self[columns_to_fill] = self[columns_to_fill].fillna("")
-
-         # Create the collection if it does not exist
-         # and insert the data
-         collections = {collection: self._create_collection_structure(self.columns)}
-         geodb_client.create_collections(collections, database=database)
-
-         geodb_client.insert_into_collection(collection, database=database, values=self)
-
-         # TODO: save data in eotdl
-
-     def _create_collection_structure(self, columns: list) -> dict:
-         """
-         Create the schema structure of a GeoDB collection from a STACDataFrame
-
-         :param columns: columns of the STACDataFrame
-         """
-         stac_collection = {"crs": 4326, "properties": {}}
-
-         for column in columns:
-             if column not in ("geometry", "id"):
-                 stac_collection["properties"][column] = "json"
-
-         return stac_collection
-
-     def to_stac(self):
+     def to_stac(self, path):
          """
          Create a STAC catalog and children from a STACDataFrame
          """
@@ -150,11 +42,10 @@ class STACDataFrame(gpd.GeoDataFrame):
          catalog_df = df[df["type"] == "Catalog"]

          if catalog_df.empty:
-             root_output_folder = "output"
-             makedirs(root_output_folder, exist_ok=True)
+             makedirs(path, exist_ok=True)
          else:
              for index, row in catalog_df.iterrows():
-                 root_output_folder = row[id_column]
+                 root_output_folder = path + "/" + row[id_column]
                  makedirs(root_output_folder, exist_ok=True)
                  row_json = row.to_dict()

@@ -228,7 +119,7 @@ class STACDataFrame(gpd.GeoDataFrame):


  def read_stac(
-     stac_file: pystac.Catalog | pystac.Collection | str,
+     stac_file: Union[pystac.Catalog, pystac.Collection, str],
      geometry_column: str = "geometry",
  ) -> STACDataFrame:
      """
@@ -0,0 +1,253 @@
+ """
+ Module for the STAC dataframe
+ """
+
+ import pandas as pd
+ import geopandas as gpd
+ import pystac
+ import json
+ import os
+ from xcube_geodb.core.geodb import GeoDBClient
+ from geomet import wkt
+ from os.path import join
+ from os import makedirs
+
+ from math import isnan
+ from .utils import convert_df_geom_to_shape, get_all_children
+
+
+ class STACDataFrame(gpd.GeoDataFrame):
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+
+     @classmethod
+     def from_stac_file(self, stac_file):
+         """
+         Create a STACDataFrame from a STAC file
+         """
+         return read_stac(stac_file)
+
+     @classmethod
+     def from_geodb(
+         self,
+         server_url: str,
+         server_port: int | str,
+         client_id: str,
+         client_secret: str,
+         auth_aud: str,
+         collection: str,
+         database: str = None,
+     ):
+         """
+         Create a STACDataFrame from a GeoDB collection
+
+         :param server_url: GeoDB server url
+         :param server_port: GeoDB server port
+         :param client_id: GeoDB client id
+         :param client_secret: GeoDB client secret
+         :param auth_aud: GeoDB auth aud
+         :param collection: GeoDB collection
+         :param database: GeoDB database
+         """
+         geodb_client = GeoDBClient(
+             server_url=server_url,
+             server_port=server_port,
+             client_id=client_id,
+             client_secret=client_secret,
+             auth_aud=auth_aud,
+         )
+
+         data = geodb_client.get_collection(collection, database=database)
+
+         return STACDataFrame(data, crs="EPSG:4326")
+
+     def ingest(
+         self,
+         collection: str,
+         server_url: str = os.environ["SERVER_URL"],
+         server_port: int = os.environ["SERVER_PORT"],
+         client_id: str = os.environ["CLIENT_ID"],
+         client_secret: str = os.environ["CLIENT_SECRET"],
+         auth_aud: str = os.environ["AUTH_DOMAIN"],
+         database: str = None,
+     ):
+         """
+         Create a GeoDB collection from a STACDataFrame
+
+         :param collection: dataset name (GeoDB collection)
+         :param server_url: GeoDB server url
+         :param server_port: GeoDB server port
+         :param client_id: GeoDB client id
+         :param client_secret: GeoDB client secret
+         :param auth_aud: GeoDB auth aud
+         :param database: GeoDB database
+         """
+
+         geodb_client = GeoDBClient(
+             server_url=server_url,
+             server_port=server_port,
+             client_id=client_id,
+             client_secret=client_secret,
+             auth_aud=auth_aud,
+         )
+
+         # TODO: check name is unique (use eotdl-cli)
+
+         # TODO: ingest assets (only if local)
+         # TODO: rename assets in the dataframe with URLs (only if local)
+
+         # ingest to geodb
+
+         # Check if the collection already exists
+         if geodb_client.collection_exists(collection, database=database):
+             # geodb_client.drop_collection(collection, database=database)
+             raise Exception(f"Collection {collection} already exists")
+
+         # Rename the column id to stac_id, to avoid conflicts with the id column
+         self.rename(columns={"id": "stac_id"}, inplace=True)
+         # Fill the NaN with '' to avoid errors, except in the geometry column
+         copy = self.copy()
+         columns_to_fill = copy.columns.drop("geometry")
+         self[columns_to_fill] = self[columns_to_fill].fillna("")
+
+         # Create the collection if it does not exist
+         # and insert the data
+         collections = {collection: self._create_collection_structure(self.columns)}
+         geodb_client.create_collections(collections, database=database)
+
+         geodb_client.insert_into_collection(collection, database=database, values=self)
+
+         # TODO: save data in eotdl
+
+     def _create_collection_structure(self, columns: list) -> dict:
+         """
+         Create the schema structure of a GeoDB collection from a STACDataFrame
+
+         :param columns: columns of the STACDataFrame
+         """
+         stac_collection = {"crs": 4326, "properties": {}}
+
+         for column in columns:
+             if column not in ("geometry", "id"):
+                 stac_collection["properties"][column] = "json"
+
+         return stac_collection
+
+     def to_stac(self):
+         """
+         Create a STAC catalog and children from a STACDataFrame
+         """
+         df = self.copy()
+
+         if "id" in df.columns and "stac_id" in df.columns:
+             id_column = "stac_id"
+             stac_id_exists = True
+         else:
+             id_column = "id"
+             stac_id_exists = False
+
+         # First, create the catalog and its folder, if exists
+         catalog_df = df[df["type"] == "Catalog"]
+
+         if catalog_df.empty:
+             root_output_folder = "output"
+             makedirs(root_output_folder, exist_ok=True)
+         else:
+             for index, row in catalog_df.iterrows():
+                 root_output_folder = row[id_column]
+                 makedirs(root_output_folder, exist_ok=True)
+                 row_json = row.to_dict()
+
+                 # Curate the json row
+                 row_json = self.curate_json_row(row_json, stac_id_exists)
+
+                 with open(join(root_output_folder, f"catalog.json"), "w") as f:
+                     json.dump(row_json, f)
+
+         # Second, create the collections and their folders, if exist
+         collections = dict()
+         collections_df = df[df["type"] == "Collection"]
+         for index, row in collections_df.iterrows():
+             stac_output_folder = join(root_output_folder, row[id_column])
+             collections[row[id_column]] = stac_output_folder
+             makedirs(stac_output_folder, exist_ok=True)
+             row_json = row.to_dict()
+
+             # Curate the json row
+             row_json = self.curate_json_row(row_json, stac_id_exists)
+
+             with open(join(stac_output_folder, f"collection.json"), "w") as f:
+                 json.dump(row_json, f)
+
+         # Then, create the items and their folders, if exist
+         features_df = df[df["type"] == "Feature"]
+         for index, row in features_df.iterrows():
+             collection = row["collection"]
+             stac_output_folder = join(collections[collection], row[id_column])
+
+             # Convert the geometry from WKT back to geojson
+             row["geometry"] = row["geometry"].wkt
+             row["geometry"] = wkt.loads(row["geometry"])
+             makedirs(stac_output_folder, exist_ok=True)
+             row_json = row.to_dict()
+
+             # Curate the json row
+             row_json = self.curate_json_row(row_json, stac_id_exists)
+
+             with open(join(stac_output_folder, f'{row_json["id"]}.json'), "w") as f:
+                 json.dump(row_json, f)
+
+     def curate_json_row(self, row: dict, stac_id_exists: bool) -> dict:
+         """
+         Curate the json row of a STACDataFrame, in order to generate a valid STAC file
+
+         :param row: row of a STACDataFrame
+         :param stac_id_exists: if the stac_id column exists
+         """
+         keys_to_remove = list()
+
+         # Remove the created_at and modified_at columns, if the STACDataFrame comes from GeoDB
+         for i in "created_at", "modified_at":
+             if i in row.keys():
+                 keys_to_remove.append(i)
+
+         # Rename the stac_id column to id, to avoid conflicts with the id column
+         if stac_id_exists:
+             row["id"] = row["stac_id"]
+             del row["stac_id"]
+
+         # Remove the NaN values and empty strings
+         for k, v in row.items():
+             if (isinstance(v, float) and isnan(v)) or v == "":
+                 keys_to_remove.append(k)
+         for key in keys_to_remove:
+             del row[key]
+         del row["geometry"]
+
+         return row
+
+
+ def read_stac(
+     stac_file: pystac.Catalog | pystac.Collection | str,
+     geometry_column: str = "geometry",
+ ) -> STACDataFrame:
+     """
+     Read a STAC file and return a STACDataFrame
+
+     :param stac_file: STAC file to read
+     :param geometry_column: name of the geometry column
+     """
+     if isinstance(stac_file, str):
+         stac_file = pystac.read_file(stac_file)
+     children = get_all_children(stac_file)
+
+     # Convert Dataframe to STACDataFrame
+     dataframe = pd.DataFrame(children)
+     dataframe[geometry_column] = dataframe.apply(convert_df_geom_to_shape, axis=1)
+     stac_dataframe = STACDataFrame(
+         dataframe,
+         crs="EPSG:4326",
+         geometry=gpd.GeoSeries.from_wkt(dataframe[geometry_column]),
+     )
+
+     return stac_dataframe
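One caveat worth flagging in this relocated module: the `ingest` defaults are evaluated at import time (`os.environ["SERVER_URL"]` and friends), so merely importing it raises `KeyError` unless SERVER_URL, SERVER_PORT, CLIENT_ID, CLIENT_SECRET and AUTH_DOMAIN are set. A hedged sketch of the GeoDB round-trip it retains, with placeholder endpoint and credentials:

# All endpoint and credential values below are placeholders.
df = STACDataFrame.from_geodb(
    server_url="https://geodb.example.com",
    server_port=443,
    client_id="my-client-id",
    client_secret="my-client-secret",
    auth_aud="https://auth.example.com",
    collection="my-dataset",
)
df.to_stac()  # this variant still writes under "output/" (or the catalog id folder)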
@@ -1,4 +1,3 @@
- from .ingest import ingest_dataset, ingest_large_dataset, ingest_q0, ingest_q1
+ from .ingest import ingest_file, ingest_folder, ingest_q1, ingest_stac
  from .download import download_dataset
  from .retrieve import retrieve_datasets, retrieve_dataset, list_datasets
- from .update import update_dataset
@@ -1,18 +1,13 @@
  from ..src.repos import APIRepo
- from ..src.usecases.datasets import DownloadDataset
+ from ..src.usecases.datasets import DownloadDataset, DownloadFile
  from .retrieve import retrieve_dataset
  from ..auth import with_auth


  @with_auth
- def download_dataset(name, path=None, logger=None, user=None):
-     dataset = retrieve_dataset(name)
-     dataset_id = dataset["id"]
-     checksum = dataset["checksum"]
+ def download_dataset(dataset, file, path=None, logger=None, user=None):
      api_repo = APIRepo()
-     download = DownloadDataset(api_repo, logger)
-     inputs = download.Inputs(
-         dataset=dataset_id, checksum=checksum, path=path, user=user
-     )
+     download = DownloadDataset(api_repo, retrieve_dataset, logger)
+     inputs = download.Inputs(dataset=dataset, file=file, path=path, user=user)
      outputs = download(inputs)
      return outputs.dst_path
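Hypothetical call sites for the new signature (dataset and file names are placeholders); `user` is injected by `@with_auth`, so callers supply at most the first four arguments:

from eotdl.datasets import download_dataset

# Whole dataset to the default location (~/.eotdl/datasets)
dst = download_dataset("MyDataset", None, None, print)

# A single file to an explicit folder
dst = download_dataset("MyDataset", "data.csv", "/tmp/eotdl", print)
print(dst)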
eotdl/datasets/ingest.py CHANGED
@@ -1,32 +1,52 @@
+ import os
+
  from ..src.repos import APIRepo
- from ..src.usecases.datasets import IngestDataset, IngestLargeDataset
+ from ..src.usecases.datasets import IngestFile, IngestFolder, IngestSTAC
  from ..auth import with_auth


+ allowed_extensions = [
+     ".zip",
+     ".tar",
+     ".tar.gz",
+     ".csv",
+     ".txt",
+     ".json",
+     ".pdf",
+     ".md",
+     ".yml",
+ ]
+
+
+ def ingest_q1(dataset, stac_catalog):
+     print("hola")
+     return
+
+
  @with_auth
- def ingest_dataset(name, description, path, logger=None, user=None):
+ def ingest_file(
+     file, dataset_id, logger=None, allowed_extensions=allowed_extensions, user=None
+ ):
      api_repo = APIRepo()
-     ingest = IngestDataset(
-         api_repo,
-     )
-     inputs = ingest.Inputs(name=name, description=description, path=path, user=user)
+     ingest = IngestFile(api_repo, allowed_extensions, logger)
+     inputs = ingest.Inputs(file=file, dataset_id=dataset_id, user=user)
      outputs = ingest(inputs)
-     return outputs.dataset
+     return outputs.data


  @with_auth
- def ingest_large_dataset(name, path, logger=None, user=None):
+ def ingest_folder(folder, force, delete, logger=None, user=None):
      api_repo = APIRepo()
-     ingest = IngestLargeDataset(api_repo, logger)
-     inputs = ingest.Inputs(name=name, path=path, user=user)
+     ingest = IngestFolder(api_repo, ingest_file, allowed_extensions, logger)
+     inputs = ingest.Inputs(folder=folder, user=user, force=force, delete=delete)
      outputs = ingest(inputs)
      return outputs.dataset


- def ingest_q0(dataset, path):
-     return ingest_large_dataset(dataset, path)
-
-
- def ingest_q1(dataset, stac_catalog):
-     print("holas")
-     return
+ @with_auth
+ def ingest_stac(stac_catalog, dataset, logger=None, user=None):
+     api_repo = APIRepo()
+     ingest = IngestSTAC(api_repo, ingest_file, allowed_extensions)
+     inputs = ingest.Inputs(stac_catalog=stac_catalog, dataset=dataset, user=user)
+     outputs = ingest(inputs)
+     return outputs.dataset
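Hypothetical usage of the new ingestion helpers (paths and dataset names are placeholders); the CLI hunk above passes its `--f`/`--d` flags straight through as `force`/`delete`:

from pathlib import Path
from eotdl.datasets import ingest_folder, ingest_stac

# Upload every file with an allowed extension from a local folder
dataset = ingest_folder(Path("path/to/dataset"), False, False, print)

# Ingest a STAC catalog into an existing dataset
dataset = ingest_stac("path/to/catalog.json", "MyDataset", print)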
@@ -0,0 +1 @@
+ from .metadata import Metadata
@@ -0,0 +1,16 @@
+ from pydantic import BaseModel, validator
+ from typing import List
+
+
+ class Metadata(BaseModel):
+     authors: List[str]
+     license: str
+     source: str
+     name: str
+
+     # validate source is a URL
+     @validator("source")
+     def source_is_url(cls, v):
+         if not v.startswith("http") and not v.startswith("https"):
+             raise ValueError("source must be a URL")
+         return v
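A usage sketch for the new model (author, license and URL values are placeholders). Note the validator's second condition is redundant, since any string starting with "https" also starts with "http":

from pydantic import ValidationError
# Metadata as defined in the module above

meta = Metadata(
    authors=["Jane Doe"],
    license="MIT",
    source="https://example.com/my-dataset",
    name="MyDataset",
)

try:
    Metadata(authors=["Jane Doe"], license="MIT", source="not-a-url", name="bad")
except ValidationError as e:
    print(e)  # pydantic surfaces the ValueError: source must be a URL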