eotdl 2025.2.10-py3-none-any.whl → 2025.4.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. eotdl/__init__.py +1 -1
  2. eotdl/access/__init__.py +13 -3
  3. eotdl/access/download.py +47 -14
  4. eotdl/access/search.py +33 -5
  5. eotdl/access/sentinelhub/__init__.py +6 -2
  6. eotdl/access/sentinelhub/client.py +7 -6
  7. eotdl/access/sentinelhub/evalscripts.py +266 -0
  8. eotdl/access/sentinelhub/parameters.py +101 -23
  9. eotdl/access/sentinelhub/utils.py +54 -15
  10. eotdl/cli.py +2 -2
  11. eotdl/commands/datasets.py +28 -31
  12. eotdl/commands/models.py +27 -30
  13. eotdl/commands/stac.py +57 -0
  14. eotdl/curation/__init__.py +0 -8
  15. eotdl/curation/stac/__init__.py +1 -8
  16. eotdl/curation/stac/api.py +58 -0
  17. eotdl/curation/stac/stac.py +31 -341
  18. eotdl/datasets/__init__.py +2 -2
  19. eotdl/datasets/ingest.py +36 -161
  20. eotdl/datasets/retrieve.py +0 -9
  21. eotdl/datasets/stage.py +64 -0
  22. eotdl/files/__init__.py +0 -2
  23. eotdl/files/ingest.bck +178 -0
  24. eotdl/files/ingest.py +237 -166
  25. eotdl/{datasets → files}/metadata.py +16 -17
  26. eotdl/models/__init__.py +1 -1
  27. eotdl/models/ingest.py +35 -158
  28. eotdl/models/stage.py +63 -0
  29. eotdl/repos/APIRepo.py +1 -1
  30. eotdl/repos/DatasetsAPIRepo.py +56 -43
  31. eotdl/repos/FilesAPIRepo.py +260 -167
  32. eotdl/repos/ModelsAPIRepo.py +50 -42
  33. eotdl/repos/STACAPIRepo.py +40 -0
  34. eotdl/repos/__init__.py +1 -0
  35. eotdl/tools/time_utils.py +3 -3
  36. {eotdl-2025.2.10.dist-info → eotdl-2025.4.2.dist-info}/METADATA +1 -1
  37. eotdl-2025.4.2.dist-info/RECORD +66 -0
  38. eotdl/curation/stac/assets.py +0 -110
  39. eotdl/curation/stac/dataframe.py +0 -172
  40. eotdl/curation/stac/dataframe_bck.py +0 -253
  41. eotdl/curation/stac/dataframe_labeling.py +0 -63
  42. eotdl/curation/stac/extensions/__init__.py +0 -23
  43. eotdl/curation/stac/extensions/base.py +0 -30
  44. eotdl/curation/stac/extensions/dem.py +0 -18
  45. eotdl/curation/stac/extensions/eo.py +0 -117
  46. eotdl/curation/stac/extensions/label/__init__.py +0 -7
  47. eotdl/curation/stac/extensions/label/base.py +0 -136
  48. eotdl/curation/stac/extensions/label/image_name_labeler.py +0 -203
  49. eotdl/curation/stac/extensions/label/scaneo.py +0 -219
  50. eotdl/curation/stac/extensions/ml_dataset.py +0 -648
  51. eotdl/curation/stac/extensions/projection.py +0 -44
  52. eotdl/curation/stac/extensions/raster.py +0 -53
  53. eotdl/curation/stac/extensions/sar.py +0 -55
  54. eotdl/curation/stac/extent.py +0 -158
  55. eotdl/curation/stac/parsers.py +0 -61
  56. eotdl/datasets/download.py +0 -104
  57. eotdl/files/list_files.py +0 -13
  58. eotdl/models/metadata.py +0 -43
  59. eotdl-2025.2.10.dist-info/RECORD +0 -81
  60. {eotdl-2025.2.10.dist-info → eotdl-2025.4.2.dist-info}/WHEEL +0 -0
  61. {eotdl-2025.2.10.dist-info → eotdl-2025.4.2.dist-info}/entry_points.txt +0 -0
eotdl/models/ingest.py CHANGED
@@ -1,165 +1,42 @@
  from pathlib import Path
- import yaml
- import frontmatter
- from tqdm import tqdm
- import json
-
- from ..auth import with_auth
- from .metadata import Metadata, generate_metadata
- from ..repos import ModelsAPIRepo, FilesAPIRepo
- from ..shared import calculate_checksum
- from ..files import ingest_files, create_new_version
- from .update import update_model
- from ..curation.stac import STACDataFrame
-
-
- def ingest_model(
-     path, verbose=False, logger=print, force_metadata_update=False, sync_metadata=False
- ):
-     path = Path(path)
-     if not path.is_dir():
-         raise Exception("Path must be a folder")
-     if "catalog.json" in [f.name for f in path.iterdir()]:
-         return ingest_stac(path / "catalog.json", logger)
-     return ingest_folder(path, verbose, logger, force_metadata_update, sync_metadata)

+ from ..repos import ModelsAPIRepo
+ from ..files.ingest import prep_ingest_stac, prep_ingest_folder, ingest, ingest_virtual

  def retrieve_model(metadata, user):
-     repo = ModelsAPIRepo()
-     data, error = repo.retrieve_model(metadata.name)
-     # print(data, error)
-     if data and data["uid"] != user["uid"]:
-         raise Exception("Model already exists.")
-     if error and error == "Model doesn't exist":
-         # create dataset
-         data, error = repo.create_model(metadata.dict(), user)
-         # print(data, error)
-         if error:
-             raise Exception(error)
-     data["id"] = data["model_id"]
-     return data
-
+     repo = ModelsAPIRepo()
+     data, error = repo.retrieve_model(metadata.name)
+     # print(data, error)
+     if data and data["uid"] != user["uid"]:
+         raise Exception("Model already exists.")
+     if error and error == "Model doesn't exist":
+         # create model
+         data, error = repo.create_model(metadata.dict(), user)
+         if error:
+             raise Exception(error)
+     return data

- @with_auth
- def ingest_folder(
-     folder,
-     verbose=False,
-     logger=print,
-     force_metadata_update=False,
-     sync_metadata=False,
-     user=None,
+ def ingest_model(
+     path,
+     verbose=False,
+     logger=print,
+     force_metadata_update=False,
+     sync_metadata=False,
  ):
-     repo = ModelsAPIRepo()
-     # load metadata
-     try:
-         readme = frontmatter.load(folder.joinpath("README.md"))
-         metadata, content = readme.metadata, readme.content
-         metadata = Metadata(**metadata)
-     except FileNotFoundError:
-         # load metadata (legacy)
-         metadata = (
-             yaml.safe_load(open(folder.joinpath("metadata.yml"), "r").read()) or {}
-         )
-         metadata = Metadata(**metadata)
-         content = None
-     except Exception as e:
-         raise Exception(f"Error loading metadata: {e}")
-     # retrieve model (create if doesn't exist)
-     model = retrieve_model(metadata, user)
-
-     update_metadata = True
-     if "description" in model:
-         # do not do this if the model is new, only if it already exists
-         update_metadata = check_metadata(
-             model, metadata, content, force_metadata_update, sync_metadata, folder
-         )
-     if update_metadata:
-         update_model(model["id"], metadata, content, user)
-     # ingest files
-     return ingest_files(
-         repo, model["id"], folder, verbose, logger, user, endpoint="models"
-     )
-
-
- def check_metadata(
-     dataset, metadata, content, force_metadata_update, sync_metadata, folder
+     path = Path(path)
+     if not path.is_dir():
+         raise Exception("Path must be a folder")
+     if "catalog.json" in [f.name for f in path.iterdir()]:
+         prep_ingest_stac(path, logger)
+     else:
+         prep_ingest_folder(path, verbose, logger, force_metadata_update, sync_metadata)
+     return ingest(path, ModelsAPIRepo(), retrieve_model, 'models')
+
+ def ingest_virtual_model(  # could work for a list of paths with minimal changes...
+     path,
+     links,
+     metadata=None,
+     logger=print,
+     user=None,
  ):
-     if (
-         dataset["name"] != metadata.name
-         or dataset["description"] != content
-         or dataset["authors"] != metadata.authors
-         or dataset["source"] != metadata.source
-         or dataset["license"] != metadata.license
-         or dataset["thumbnail"] != metadata.thumbnail
-     ):
-         if not force_metadata_update and not sync_metadata:
-             raise Exception(
-                 "The provided metadata is not consistent with the current metadata. Use -f to force metadata update or -s to sync your local metadata."
-             )
-         if force_metadata_update:
-             return True
-         if sync_metadata:
-             generate_metadata(str(folder), dataset)
-             return False
-     return False
-
-
- def retrieve_stac_model(model_name, user):
-     repo = ModelsAPIRepo()
-     data, error = repo.retrieve_model(model_name)
-     # print(data, error)
-     if data and data["uid"] != user["uid"]:
-         raise Exception("Model already exists.")
-     if error and error == "Model doesn't exist":
-         # create model
-         data, error = repo.create_stac_model(model_name, user)
-         # print(data, error)
-         if error:
-             raise Exception(error)
-     data["id"] = data["model_id"]
-     return data["id"]
-
-
- @with_auth
- def ingest_stac(stac_catalog, logger=None, user=None):
-     repo, files_repo = ModelsAPIRepo(), FilesAPIRepo()
-     # load catalog
-     logger("Loading STAC catalog...")
-     df = STACDataFrame.from_stac_file(stac_catalog)
-     catalog = df[df["type"] == "Catalog"]
-     assert len(catalog) == 1, "STAC catalog must have exactly one root catalog"
-     dataset_name = catalog.id.iloc[0]
-     # retrieve dataset (create if doesn't exist)
-     model_id = retrieve_stac_model(dataset_name, user)
-     # create new version
-     version = create_new_version(repo, model_id, user)
-     logger("New version created, version: " + str(version))
-     df2 = df.dropna(subset=["assets"])
-     for row in tqdm(df2.iterrows(), total=len(df2)):
-         try:
-             for k, v in row[1]["assets"].items():
-                 data, error = files_repo.ingest_file(
-                     v["href"],
-                     model_id,
-                     user,
-                     calculate_checksum(v["href"]),  # is always absolute?
-                     "models",
-                     version,
-                 )
-                 if error:
-                     raise Exception(error)
-                 file_url = (
-                     f"{repo.url}models/{data['model_id']}/download/{data['filename']}"
-                 )
-                 df.loc[row[0], "assets"][k]["href"] = file_url
-         except Exception as e:
-             logger(f"Error uploading asset {row[0]}: {e}")
-             break
-     # ingest the STAC catalog into geodb
-     logger("Ingesting STAC catalog...")
-     data, error = repo.ingest_stac(json.loads(df.to_json()), model_id, user)
-     if error:
-         # TODO: delete all assets that were uploaded
-         raise Exception(error)
-     logger("Done")
-     return
+     return ingest_virtual(path, links, ModelsAPIRepo(), retrieve_model, 'models', metadata, logger)
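For orientation, a minimal usage sketch of the refactored entry points, not part of the diff: the folder path, link URL, and metadata fields below are hypothetical, and the metadata schema is assumed to mirror the name/authors/source/license fields checked elsewhere in the library.

# Hypothetical names and paths; imports follow the module shown above.
from eotdl.models.ingest import ingest_model, ingest_virtual_model

# A folder containing catalog.json is prepped as STAC, any other folder as a
# plain folder; both then go through the shared ingest() pipeline.
ingest_model("data/my-model", verbose=True)

# A "virtual" model keeps its assets at external URLs instead of uploading them.
ingest_virtual_model(
    "data/my-virtual-model",
    links=["https://example.com/weights.onnx"],  # hypothetical asset URL
    metadata={
        "name": "my-virtual-model",
        "authors": ["Jane Doe"],
        "source": "https://example.com",
        "license": "MIT",
    },
)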
eotdl/models/stage.py ADDED
@@ -0,0 +1,63 @@
+ import os
+ from pathlib import Path
+ from tqdm import tqdm
+ import geopandas as gpd
+
+ from ..auth import with_auth
+ from .retrieve import retrieve_model
+ from ..repos import FilesAPIRepo
+ from ..files.metadata import Metadata
+
+ @with_auth
+ def stage_model(
+     model_name,
+     version=None,
+     path=None,
+     logger=print,
+     assets=False,
+     force=False,
+     verbose=False,
+     user=None,
+     file=None,
+ ):
+     model = retrieve_model(model_name)
+     if version is None:
+         version = sorted([v['version_id'] for v in model["versions"]])[-1]
+     else:
+         assert version in [
+             v["version_id"] for v in model["versions"]
+         ], f"Version {version} not found"
+     download_base_path = os.getenv(
+         "EOTDL_DOWNLOAD_PATH", str(Path.home()) + "/.cache/eotdl/models"
+     )
+     if path is None:
+         download_path = download_base_path + "/" + model_name
+     else:
+         download_path = path + "/" + model_name
+     # check if model already exists
+     if os.path.exists(download_path) and not force:
+         os.makedirs(download_path, exist_ok=True)
+         # raise Exception(
+         #     f"model `{model['name']} v{str(version)}` already exists at {download_path}. To force download, use force=True or -f in the CLI."
+         # )
+
+     # stage metadata
+     repo = FilesAPIRepo()
+     catalog_path = repo.stage_file(model["id"], f"catalog.v{version}.parquet", user, download_path)
+
+     # stage README.md
+     metadata = Metadata(**model['metadata'], name=model['name'])
+     metadata.save_metadata(download_path)
+
+     if assets:
+         gdf = gpd.read_parquet(catalog_path)
+         for _, row in tqdm(gdf.iterrows(), total=len(gdf), desc="Staging assets"):
+             for k, v in row["assets"].items():
+                 stage_model_file(v["href"], download_path)
+
+     return download_path
+
+ @with_auth
+ def stage_model_file(file_url, path, user):
+     repo = FilesAPIRepo()
+     return repo.stage_file_url(file_url, path, user)
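Based on the signature above, staging might be driven as in the following sketch; "my-model" is a placeholder name, and the `user` argument is injected by the `@with_auth` decorator, so callers omit it.

from eotdl.models.stage import stage_model

# Stage only the parquet catalog and README for the latest version.
local_path = stage_model("my-model")

# Stage a specific version and download every asset listed in the catalog.
local_path = stage_model("my-model", version=1, assets=True, force=True)
print("staged to", local_path)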
eotdl/repos/APIRepo.py CHANGED
@@ -5,7 +5,7 @@ import requests
  class APIRepo:
      def __init__(self, url=None):
          default_url = "https://api.eotdl.com/"
-         # default_url = "http://localhost:8001/"
+         # default_url = "http://localhost:8000/"
          self.url = url if url else os.getenv("EOTDL_API_URL", default_url)

      def format_response(self, response):
eotdl/repos/DatasetsAPIRepo.py CHANGED
@@ -19,7 +19,15 @@ class DatasetsAPIRepo(APIRepo):
          url += "&limit=" + str(limit)
          response = requests.get(url)
          return self.format_response(response)
-
+
+     def retrieve_dataset(self, name):
+         response = requests.get(self.url + "datasets?name=" + name)
+         return self.format_response(response)
+
+     def get_dataset_by_id(self, dataset_id):
+         response = requests.get(self.url + "datasets/" + dataset_id)
+         return self.format_response(response)
+
      def create_dataset(self, metadata, user):
          response = requests.post(
              self.url + "datasets",
@@ -28,53 +36,58 @@ class DatasetsAPIRepo(APIRepo):
          )
          return self.format_response(response)

-     def retrieve_dataset(self, name):
-         response = requests.get(self.url + "datasets?name=" + name)
-         return self.format_response(response)
-
-     def create_version(self, dataset_id, user):
+     def complete_ingestion(self, dataset_id, version, size, user):
          response = requests.post(
-             self.url + "datasets/version/" + dataset_id,
+             self.url + "datasets/complete/" + dataset_id,
+             json={"version": version, "size": size},
              headers=self.generate_headers(user),
          )
          return self.format_response(response)

-     def create_stac_dataset(self, name, user):
-         response = requests.post(
-             self.url + "datasets/stac",
-             json={"name": name},
-             headers=self.generate_headers(user),
-         )
-         return self.format_response(response)

-     def ingest_stac(self, stac_json, dataset_id, user):
-         response = requests.put(
-             self.url + f"datasets/stac/{dataset_id}",
-             json={"stac": stac_json},
-             headers=self.generate_headers(user),
-         )
-         return self.format_response(response)
+     # def create_version(self, dataset_id, user):
+     #     response = requests.post(
+     #         self.url + "datasets/version/" + dataset_id,
+     #         headers=self.generate_headers(user),
+     #     )
+     #     return self.format_response(response)

-     def download_stac(self, dataset_id, user):
-         url = self.url + "datasets/" + dataset_id + "/download"
-         headers = self.generate_headers(user)
-         response = requests.get(url, headers=headers)
-         if response.status_code != 200:
-             return None, response.json()["detail"]
-         return gpd.GeoDataFrame.from_features(response.json()["features"]), None
+     # def create_stac_dataset(self, name, user):
+     #     response = requests.post(
+     #         self.url + "datasets/stac",
+     #         json={"name": name},
+     #         headers=self.generate_headers(user),
+     #     )
+     #     return self.format_response(response)

-     def update_dataset(
-         self, dataset_id, authors, source, license, thumbnail, content, user
-     ):
-         response = requests.put(
-             self.url + f"datasets/{dataset_id}",
-             json={
-                 "authors": authors,
-                 "source": source,
-                 "license": license,
-                 "thumbnail": thumbnail,
-                 "description": content,
-             },
-             headers=self.generate_headers(user),
-         )
-         return self.format_response(response)
+     # def ingest_stac(self, stac_json, dataset_id, user):
+     #     response = requests.put(
+     #         self.url + f"datasets/stac/{dataset_id}",
+     #         json={"stac": stac_json},
+     #         headers=self.generate_headers(user),
+     #     )
+     #     return self.format_response(response)
+
+     # def download_stac(self, dataset_id, user):
+     #     url = self.url + "datasets/" + dataset_id + "/download"
+     #     headers = self.generate_headers(user)
+     #     response = requests.get(url, headers=headers)
+     #     if response.status_code != 200:
+     #         return None, response.json()["detail"]
+     #     return gpd.GeoDataFrame.from_features(response.json()["features"]), None
+
+     # def update_dataset(
+     #     self, dataset_id, authors, source, license, thumbnail, content, user
+     # ):
+     #     response = requests.put(
+     #         self.url + f"datasets/{dataset_id}",
+     #         json={
+     #             "authors": authors,
+     #             "source": source,
+     #             "license": license,
+     #             "thumbnail": thumbnail,
+     #             "description": content,
+     #         },
+     #         headers=self.generate_headers(user),
+     #     )
+     #     return self.format_response(response)
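A sketch of how a client might exercise the reworked repo, with method names taken from the diff: the dataset name, id, version, and size arguments are illustrative, and `user` stands for the auth dict that `@with_auth` normally supplies.

from eotdl.repos import DatasetsAPIRepo

repo = DatasetsAPIRepo()

# Name and id lookups now have dedicated helpers returning (data, error) pairs.
data, error = repo.retrieve_dataset("my-dataset")
data, error = repo.get_dataset_by_id("507f1f77bcf86cd799439011")  # hypothetical id

# Versioning moved server-side: instead of create_version(), clients report a
# finished upload with complete_ingestion().
# data, error = repo.complete_ingestion(dataset_id, version=1, size=1024, user=user)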