eotdl 2023.11.2.post5-py3-none-any.whl → 2023.11.3.post2-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
Files changed (58)
  1. eotdl/__init__.py +1 -1
  2. eotdl/access/__init__.py +6 -3
  3. eotdl/access/airbus/__init__.py +5 -1
  4. eotdl/access/airbus/client.py +356 -338
  5. eotdl/access/airbus/parameters.py +19 -4
  6. eotdl/access/airbus/utils.py +26 -21
  7. eotdl/access/download.py +30 -14
  8. eotdl/access/search.py +17 -6
  9. eotdl/access/sentinelhub/__init__.py +5 -1
  10. eotdl/access/sentinelhub/client.py +57 -54
  11. eotdl/access/sentinelhub/evalscripts.py +38 -39
  12. eotdl/access/sentinelhub/parameters.py +43 -23
  13. eotdl/access/sentinelhub/utils.py +38 -28
  14. eotdl/auth/errors.py +2 -1
  15. eotdl/commands/auth.py +3 -3
  16. eotdl/curation/__init__.py +5 -1
  17. eotdl/curation/stac/__init__.py +5 -1
  18. eotdl/curation/stac/assets.py +55 -32
  19. eotdl/curation/stac/dataframe.py +20 -14
  20. eotdl/curation/stac/dataframe_bck.py +2 -2
  21. eotdl/curation/stac/dataframe_labeling.py +15 -12
  22. eotdl/curation/stac/extensions/__init__.py +6 -2
  23. eotdl/curation/stac/extensions/base.py +8 -4
  24. eotdl/curation/stac/extensions/dem.py +6 -3
  25. eotdl/curation/stac/extensions/eo.py +10 -6
  26. eotdl/curation/stac/extensions/label/__init__.py +5 -1
  27. eotdl/curation/stac/extensions/label/base.py +40 -26
  28. eotdl/curation/stac/extensions/label/image_name_labeler.py +64 -43
  29. eotdl/curation/stac/extensions/label/scaneo.py +59 -56
  30. eotdl/curation/stac/extensions/ml_dataset.py +154 -56
  31. eotdl/curation/stac/extensions/projection.py +11 -9
  32. eotdl/curation/stac/extensions/raster.py +22 -14
  33. eotdl/curation/stac/extensions/sar.py +12 -7
  34. eotdl/curation/stac/extent.py +67 -40
  35. eotdl/curation/stac/parsers.py +18 -10
  36. eotdl/curation/stac/stac.py +81 -62
  37. eotdl/datasets/__init__.py +1 -1
  38. eotdl/datasets/download.py +42 -55
  39. eotdl/datasets/ingest.py +68 -11
  40. eotdl/files/__init__.py +1 -1
  41. eotdl/files/ingest.py +3 -1
  42. eotdl/models/download.py +1 -1
  43. eotdl/repos/AuthAPIRepo.py +0 -1
  44. eotdl/repos/DatasetsAPIRepo.py +22 -146
  45. eotdl/repos/FilesAPIRepo.py +7 -92
  46. eotdl/repos/ModelsAPIRepo.py +0 -1
  47. eotdl/tools/__init__.py +5 -1
  48. eotdl/tools/geo_utils.py +78 -48
  49. eotdl/tools/metadata.py +13 -11
  50. eotdl/tools/paths.py +14 -14
  51. eotdl/tools/stac.py +36 -31
  52. eotdl/tools/time_utils.py +53 -26
  53. eotdl/tools/tools.py +84 -50
  54. {eotdl-2023.11.2.post5.dist-info → eotdl-2023.11.3.post2.dist-info}/METADATA +5 -3
  55. eotdl-2023.11.3.post2.dist-info/RECORD +84 -0
  56. eotdl-2023.11.2.post5.dist-info/RECORD +0 -84
  57. {eotdl-2023.11.2.post5.dist-info → eotdl-2023.11.3.post2.dist-info}/WHEEL +0 -0
  58. {eotdl-2023.11.2.post5.dist-info → eotdl-2023.11.3.post2.dist-info}/entry_points.txt +0 -0
eotdl/datasets/ingest.py CHANGED
@@ -1,18 +1,22 @@
  from pathlib import Path
  import yaml
+ from tqdm import tqdm
+ import json

  from ..auth import with_auth
  from .metadata import Metadata
- from ..repos import DatasetsAPIRepo
- from ..files import ingest_files
+ from ..repos import DatasetsAPIRepo, FilesAPIRepo
+ from ..files import ingest_files, create_new_version
+ from ..curation.stac import STACDataFrame
+ from ..shared import calculate_checksum


  def ingest_dataset(path, verbose=False, logger=print):
      path = Path(path)
      if not path.is_dir():
          raise Exception("Path must be a folder")
-     # if "catalog.json" in [f.name for f in path.iterdir()]:
-     #     return ingest_stac(path / "catalog.json", logger)
+     if "catalog.json" in [f.name for f in path.iterdir()]:
+         return ingest_stac(path / "catalog.json", logger)
      return ingest_folder(path, verbose, logger)


@@ -46,10 +50,63 @@ def ingest_folder(folder, verbose=False, logger=print, user=None):
      )


- # @with_auth
- # def ingest_stac(stac_catalog, logger=None, user=None):
- #     api_repo = APIRepo()
- #     ingest = IngestSTAC(api_repo, ingest_file, logger)
- #     inputs = ingest.Inputs(stac_catalog=stac_catalog, user=user)
- #     outputs = ingest(inputs)
- #     return outputs.dataset
+ def retrieve_stac_dataset(dataset_name, user):
+     repo = DatasetsAPIRepo()
+     data, error = repo.retrieve_dataset(dataset_name)
+     # print(data, error)
+     if data and data["uid"] != user["sub"]:
+         raise Exception("Dataset already exists.")
+     if error and error == "Dataset doesn't exist":
+         # create dataset
+         data, error = repo.create_stac_dataset(dataset_name, user["id_token"])
+         # print(data, error)
+         if error:
+             raise Exception(error)
+         data["id"] = data["dataset_id"]
+     return data["id"]
+
+
+ @with_auth
+ def ingest_stac(stac_catalog, logger=None, user=None):
+     repo, files_repo = DatasetsAPIRepo(), FilesAPIRepo()
+     # load catalog
+     logger("Loading STAC catalog...")
+     df = STACDataFrame.from_stac_file(stac_catalog)
+     catalog = df[df["type"] == "Catalog"]
+     assert len(catalog) == 1, "STAC catalog must have exactly one root catalog"
+     dataset_name = catalog.id.iloc[0]
+     # retrieve dataset (create if doesn't exist)
+     dataset_id = retrieve_stac_dataset(dataset_name, user)
+     # create new version
+     version = create_new_version(repo, dataset_id, user)
+     logger("New version created, version: " + str(version))
+     df2 = df.dropna(subset=["assets"])
+     for row in tqdm(df2.iterrows(), total=len(df2)):
+         # for asset in df.assets.dropna().values[:10]:
+         try:
+             for k, v in row[1]["assets"].items():
+                 data, error = files_repo.ingest_file(
+                     v["href"],
+                     dataset_id,
+                     user["id_token"],
+                     calculate_checksum(v["href"]), # is always absolute?
+                     "datasets",
+                     version,
+                 )
+                 if error:
+                     raise Exception(error)
+                 file_url = f"{repo.url}datasets/{data['dataset_id']}/download/{data['filename']}"
+                 df.loc[row[0], "assets"][k]["href"] = file_url
+         except Exception as e:
+             logger(f"Error uploading asset {row[0]}: {e}")
+             break
+     # ingest the STAC catalog into geodb
+     logger("Ingesting STAC catalog...")
+     data, error = repo.ingest_stac(
+         json.loads(df.to_json()), dataset_id, user["id_token"]
+     )
+     if error:
+         # TODO: delete all assets that were uploaded
+         raise Exception(error)
+     logger("Done")
+     return
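With the catalog.json branch re-enabled above, pointing ingest_dataset at a folder that contains a root STAC catalog is now routed to the new ingest_stac function instead of ingest_folder. A minimal usage sketch, assuming ingest_dataset is still re-exported from eotdl.datasets and using a hypothetical folder layout:

from eotdl.datasets import ingest_dataset

# Hypothetical layout:
#   data/my-dataset/catalog.json   <- root STAC catalog (exactly one "Catalog" row)
#   data/my-dataset/source/...     <- items and assets referenced by the catalog
ingest_dataset("data/my-dataset", verbose=True)
# Because catalog.json is present, ingest_stac() uploads every asset through
# FilesAPIRepo.ingest_file, rewrites each asset href to its download URL,
# and finally pushes the catalog with DatasetsAPIRepo.ingest_stac.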
eotdl/files/__init__.py CHANGED
@@ -1 +1 @@
- from .ingest import ingest_files
+ from .ingest import ingest_files, create_new_version
eotdl/files/ingest.py CHANGED
@@ -8,7 +8,6 @@ import os

  from ..repos import FilesAPIRepo
  from ..shared import calculate_checksum
- from ..shared import calculate_checksum


  def retrieve_files(folder):
@@ -64,6 +63,7 @@ def generate_files_lists(
  ):
      files_repo = FilesAPIRepo()
      current_files, error = files_repo.retrieve_files(dataset_or_model_id, endpoint)
+     print(current_files)
      # print(len(current_files), len(items) - len(current_files))
      # print(current_files, error)
      if error:
@@ -82,6 +82,7 @@ def generate_files_lists(
              large_files.append(data)
          else:
              upload_files.append(data)
+     # TODO: should ingest new version if files removed
      if len(upload_files) == 0 and len(large_files) == 0:
          raise Exception("No new files to upload")
      return upload_files, existing_files, large_files
@@ -98,6 +99,7 @@ def ingest_files(repo, dataset_or_model_id, folder, verbose, logger, user, endpo
      files_repo = FilesAPIRepo()
      logger(f"Uploading directory {folder}...")
      items = retrieve_files(folder)
+     print(items)
      # retrieve files
      upload_files, existing_files, large_files = generate_files_lists(
          items, folder, dataset_or_model_id, endpoint, logger
eotdl/models/download.py CHANGED
@@ -76,7 +76,7 @@ def download_model(
          # if calculate_checksum(dst_path) != checksum:
          #     logger(f"Checksum for {file} does not match")
          if verbose:
-             logger(f"Done")
+             logger("Done")
          return "/".join(dst_path.split("/")[:-1])
      else:
          raise NotImplementedError("Downloading a STAC model is not implemented")
eotdl/repos/AuthAPIRepo.py CHANGED
@@ -1,5 +1,4 @@
  import requests
- import os

  from .APIRepo import APIRepo

eotdl/repos/DatasetsAPIRepo.py CHANGED
@@ -1,5 +1,5 @@
  import requests
- import os
+ import geopandas as gpd

  from ..repos import APIRepo

@@ -39,126 +39,29 @@ class DatasetsAPIRepo(APIRepo):
          )
          return self.format_response(response)

-     # def create_stac_dataset(self, name, id_token):
-     #     response = requests.post(
-     #         self.url + "datasets/stac",
-     #         json={"name": name},
-     #         headers={"Authorization": "Bearer " + id_token},
-     #     )
-     #     if response.status_code == 200:
-     #         return response.json(), None
-     #     return None, response.json()["detail"]
-
-     # def download_file(self, dataset, dataset_id, file, id_token, path):
-     #     url = self.url + "datasets/" + dataset_id + "/download/" + file
-     #     return self.download_file_url(url, path, id_token, progress=True)
-
-     # def download_file_url(self, url, path, id_token, progress=False):
-     #     headers = {"Authorization": "Bearer " + id_token}
-     #     filename = url.split("/")[-1]
-     #     os.makedirs(path, exist_ok=True)
-     #     path = f"{path}/{filename}"
-     #     with requests.get(url, headers=headers, stream=True) as r:
-     #         r.raise_for_status()
-     #         total_size = int(r.headers.get("content-length", 0))
-     #         block_size = 1024 * 1024 * 10
-     #         if progress:
-     #             progress_bar = tqdm(
-     #                 total=total_size, unit="iB", unit_scale=True, unit_divisor=1024
-     #             )
-     #         with open(path, "wb") as f:
-     #             for chunk in r.iter_content(block_size):
-     #                 if progress:
-     #                     progress_bar.update(len(chunk))
-     #                 if chunk:
-     #                     f.write(chunk)
-     #         if progress:
-     #             progress_bar.close()
-     #     return path
-
-     # def ingest_file_url(self, file, dataset, id_token):
-     #     reponse = requests.post(
-     #         self.url + f"datasets/{dataset}/url",
-     #         json={"url": file},
-     #         headers={"Authorization": "Bearer " + id_token},
-     #     )
-     #     if reponse.status_code != 200:
-     #         return None, reponse.json()["detail"]
-     #     return reponse.json(), None
-
-     # def read_in_chunks(self, file_object, CHUNK_SIZE):
-     #     while True:
-     #         data = file_object.read(CHUNK_SIZE)
-     #         if not data:
-     #             break
-     #         yield data
-
-     # def prepare_large_upload(self, file, dataset_id, checksum, id_token):
-     #     filename = Path(file).name
-     #     response = requests.post(
-     #         self.url + f"datasets/{dataset_id}/uploadId",
-     #         json={"name": filename, "checksum": checksum},
-     #         headers={"Authorization": "Bearer " + id_token},
-     #     )
-     #     if response.status_code != 200:
-     #         raise Exception(response.json()["detail"])
-     #     data = response.json()
-     #     upload_id, parts = (
-     #         data["upload_id"],
-     #         data["parts"] if "parts" in data else [],
-     #     )
-     #     return upload_id, parts
-
-     # def get_chunk_size(self, content_size):
-     #     # adapt chunk size to content size to avoid S3 limits (10000 parts, 500MB per part, 5TB per object)
-     #     chunk_size = 1024 * 1024 * 10 # 10 MB (up to 100 GB, 10000 parts)
-     #     if content_size >= 1024 * 1024 * 1024 * 100: # 100 GB
-     #         chunk_size = 1024 * 1024 * 100 # 100 MB (up to 1 TB, 10000 parts)
-     #     elif content_size >= 1024 * 1024 * 1024 * 1000: # 1 TB
-     #         chunk_size = 1024 * 1024 * 500 # 0.5 GB (up to 5 TB, 10000 parts)
-     #     return chunk_size
+     def create_stac_dataset(self, name, id_token):
+         response = requests.post(
+             self.url + "datasets/stac",
+             json={"name": name},
+             headers={"Authorization": "Bearer " + id_token},
+         )
+         return self.format_response(response)

-     # def ingest_large_dataset(self, file, upload_id, id_token, parts):
-     #     content_path = os.path.abspath(file)
-     #     content_size = os.stat(content_path).st_size
-     #     chunk_size = self.get_chunk_size(content_size)
-     #     total_chunks = content_size // chunk_size
-     #     # upload chunks sequentially
-     #     pbar = tqdm(
-     #         self.read_in_chunks(open(content_path, "rb"), chunk_size),
-     #         total=total_chunks,
-     #     )
-     #     index = 0
-     #     for chunk in pbar:
-     #         part = index // chunk_size + 1
-     #         offset = index + len(chunk)
-     #         index = offset
-     #         if part not in parts:
-     #             checksum = hashlib.md5(chunk).hexdigest()
-     #             response = requests.post(
-     #                 self.url + "datasets/chunk/" + upload_id,
-     #                 files={"file": chunk},
-     #                 data={"part_number": part, "checksum": checksum},
-     #                 headers={"Authorization": "Bearer " + id_token},
-     #             )
-     #             if response.status_code != 200:
-     #                 raise Exception(response.json()["detail"])
-     #         pbar.set_description(
-     #             "{:.2f}/{:.2f} MB".format(
-     #                 offset / 1024 / 1024, content_size / 1024 / 1024
-     #             )
-     #         )
-     #     pbar.close()
-     #     return
+     def ingest_stac(self, stac_json, dataset_id, id_token):
+         response = requests.put(
+             self.url + f"datasets/stac/{dataset_id}",
+             json={"stac": stac_json},
+             headers={"Authorization": "Bearer " + id_token},
+         )
+         return self.format_response(response)

-     # def complete_upload(self, id_token, upload_id):
-     #     r = requests.post(
-     #         self.url + "datasets/complete/" + upload_id,
-     #         headers={"Authorization": "Bearer " + id_token},
-     #     )
-     #     if r.status_code != 200:
-     #         return None, r.json()["detail"]
-     #     return r.json(), None
+     def download_stac(self, dataset_id, id_token):
+         url = self.url + "datasets/" + dataset_id + "/download"
+         headers = {"Authorization": "Bearer " + id_token}
+         response = requests.get(url, headers=headers)
+         if response.status_code != 200:
+             return None, response.json()["detail"]
+         return gpd.GeoDataFrame.from_features(response.json()["features"]), None

      # def update_dataset(self, name, path, id_token, checksum):
      #     # check that dataset exists
@@ -220,30 +123,3 @@ class DatasetsAPIRepo(APIRepo):
      #     if r.status_code != 200:
      #         return None, r.json()["detail"]
      #     return r.json(), None
-
-     # def delete_file(self, dataset_id, file_name, id_token):
-     #     response = requests.delete(
-     #         self.url + "datasets/" + dataset_id + "/file/" + file_name,
-     #         headers={"Authorization": "Bearer " + id_token},
-     #     )
-     #     if response.status_code != 200:
-     #         return None, response.json()["detail"]
-     #     return response.json(), None
-
-     # def ingest_stac(self, stac_json, dataset_id, id_token):
-     #     reponse = requests.put(
-     #         self.url + f"datasets/stac/{dataset_id}",
-     #         json={"stac": stac_json},
-     #         headers={"Authorization": "Bearer " + id_token},
-     #     )
-     #     if reponse.status_code != 200:
-     #         return None, reponse.json()["detail"]
-     #     return reponse.json(), None
-
-     # def download_stac(self, dataset_id, id_token):
-     #     url = self.url + "datasets/" + dataset_id + "/download"
-     #     headers = {"Authorization": "Bearer " + id_token}
-     #     response = requests.get(url, headers=headers)
-     #     if response.status_code != 200:
-     #         return None, response.json()["detail"]
-     #     return gpd.GeoDataFrame.from_features(response.json()["features"]), None
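The create_stac_dataset, ingest_stac and download_stac methods above replace their commented-out predecessors: the first two now return through format_response, i.e. the repo's usual (data, error) pair as used in eotdl/datasets/ingest.py, while download_stac still builds a GeoDataFrame from the returned feature collection. A hedged sketch of a caller, with the dataset name, id_token and the session handling as placeholders:

from eotdl.repos import DatasetsAPIRepo

repo = DatasetsAPIRepo()

# register an (empty) STAC dataset; (data, error) follows the format_response convention
data, error = repo.create_stac_dataset("my-stac-dataset", id_token)
if error:
    raise Exception(error)

# later, pull the ingested catalog back as a GeoDataFrame
gdf, error = repo.download_stac(data["dataset_id"], id_token)
if error:
    raise Exception(error)
print(gdf.head())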
eotdl/repos/FilesAPIRepo.py CHANGED
@@ -19,7 +19,7 @@ class FilesAPIRepo(APIRepo):
          endpoint,
          version=None,
      ):
-         url = self.url + f"{endpoint}/{dataset_or_model_id}"
+         url = self.url + f"{endpoint}/{dataset_or_model_id}/batch"
          if version is not None:
              url += "?version=" + str(version)
          reponse = requests.post(
@@ -49,21 +49,15 @@ class FilesAPIRepo(APIRepo):
          return self.format_response(reponse)

      def ingest_file(
-         self,
-         file,
-         dataset_or_model_id,
-         version,
-         parent,
-         id_token,
-         checksum,
-         endpoint,
+         self, file, dataset_or_model_id, id_token, checksum, endpoint, version=None
      ):
+         url = self.url + f"{endpoint}/{dataset_or_model_id}"
+         if version is not None:
+             url += "?version=" + str(version)
          reponse = requests.post(
-             self.url + f"{endpoint}/{dataset_or_model_id}",
+             url,
              files={"file": open(file, "rb")},
-             data={"checksum": checksum, "version": version, "parent": parent}
-             if checksum
-             else None,
+             data={"checksum": checksum},
              headers={"Authorization": "Bearer " + id_token},
          )
          return self.format_response(reponse)
@@ -205,67 +199,6 @@ class FilesAPIRepo(APIRepo):
          )
          return self.format_response(r)

-     # def update_dataset(self, name, path, id_token, checksum):
-     #     # check that dataset exists
-     #     data, error = self.retrieve_dataset(name)
-     #     if error:
-     #         return None, error
-     #     # first call to get upload id
-     #     dataset_id = data["id"]
-     #     url = self.url + f"datasets/chunk/{dataset_id}?checksum={checksum}"
-     #     response = requests.get(url, headers={"Authorization": "Bearer " + id_token})
-     #     if response.status_code != 200:
-     #         return None, response.json()["detail"]
-     #     data = response.json()
-     #     _, upload_id, parts = data["dataset_id"], data["upload_id"], data["parts"]
-     #     # assert dataset_id is None
-     #     content_path = os.path.abspath(path)
-     #     content_size = os.stat(content_path).st_size
-     #     url = self.url + "datasets/chunk"
-     #     chunk_size = 1024 * 1024 * 100 # 100 MiB
-     #     total_chunks = content_size // chunk_size
-     #     headers = {
-     #         "Authorization": "Bearer " + id_token,
-     #         "Upload-Id": upload_id,
-     #         "Dataset-Id": dataset_id,
-     #     }
-     #     # upload chunks sequentially
-     #     pbar = tqdm(
-     #         self.read_in_chunks(open(content_path, "rb"), chunk_size),
-     #         total=total_chunks,
-     #     )
-     #     index = 0
-     #     for chunk in pbar:
-     #         offset = index + len(chunk)
-     #         part = index // chunk_size + 1
-     #         index = offset
-     #         if part not in parts:
-     #             headers["Part-Number"] = str(part)
-     #             file = {"file": chunk}
-     #             r = requests.post(url, files=file, headers=headers)
-     #             if r.status_code != 200:
-     #                 return None, r.json()["detail"]
-     #         pbar.set_description(
-     #             "{:.2f}/{:.2f} MB".format(
-     #                 offset / 1024 / 1024, content_size / 1024 / 1024
-     #             )
-     #         )
-     #     pbar.close()
-     #     # complete upload
-     #     url = self.url + "datasets/complete"
-     #     r = requests.post(
-     #         url,
-     #         json={"checksum": checksum},
-     #         headers={
-     #             "Authorization": "Bearer " + id_token,
-     #             "Upload-Id": upload_id,
-     #             "Dataset-Id": dataset_id,
-     #         },
-     #     )
-     #     if r.status_code != 200:
-     #         return None, r.json()["detail"]
-     #     return r.json(), None
-
      # def delete_file(self, dataset_id, file_name, id_token):
      #     response = requests.delete(
      #         self.url + "datasets/" + dataset_id + "/file/" + file_name,
@@ -274,21 +207,3 @@ class FilesAPIRepo(APIRepo):
      #     if response.status_code != 200:
      #         return None, response.json()["detail"]
      #     return response.json(), None
-
-     # def ingest_stac(self, stac_json, dataset_id, id_token):
-     #     reponse = requests.put(
-     #         self.url + f"datasets/stac/{dataset_id}",
-     #         json={"stac": stac_json},
-     #         headers={"Authorization": "Bearer " + id_token},
-     #     )
-     #     if reponse.status_code != 200:
-     #         return None, reponse.json()["detail"]
-     #     return reponse.json(), None
-
-     # def download_stac(self, dataset_id, id_token):
-     #     url = self.url + "datasets/" + dataset_id + "/download"
-     #     headers = {"Authorization": "Bearer " + id_token}
-     #     response = requests.get(url, headers=headers)
-     #     if response.status_code != 200:
-     #         return None, response.json()["detail"]
-     #     return gpd.GeoDataFrame.from_features(response.json()["features"]), None
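The new ingest_file signature matches the call made from ingest_stac above: id_token, checksum and endpoint are now passed positionally, and version becomes an optional query parameter instead of form data (the parent field is dropped). A minimal sketch of a direct call, where the file path, dataset_id, id_token and version are placeholders:

from eotdl.repos import FilesAPIRepo
from eotdl.shared import calculate_checksum

files_repo = FilesAPIRepo()
data, error = files_repo.ingest_file(
    "data/my-dataset/source/B01.tif",                      # local asset path (hypothetical)
    dataset_id,                                            # id of the target dataset
    id_token,                                              # token from the authenticated session
    calculate_checksum("data/my-dataset/source/B01.tif"),  # checksum sent as form data
    "datasets",                                            # endpoint, as used by ingest_stac
    version,                                               # optional; appended as ?version=N
)
if error:
    raise Exception(error)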
eotdl/repos/ModelsAPIRepo.py CHANGED
@@ -1,5 +1,4 @@
  import requests
- import os

  from ..repos import APIRepo

eotdl/tools/__init__.py CHANGED
@@ -1,6 +1,10 @@
+ """
+ Tools module for eotdl package.
+ """
+
  from .stac import *
  from .tools import *
  from .geo_utils import *
  from .time_utils import *
  from .metadata import *
- from .paths import *
+ from .paths import *