eotdl 2024.10.7__py3-none-any.whl → 2025.3.25__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. eotdl/__init__.py +1 -1
  2. eotdl/access/search.py +0 -2
  3. eotdl/access/sentinelhub/parameters.py +1 -1
  4. eotdl/cli.py +2 -2
  5. eotdl/commands/datasets.py +28 -31
  6. eotdl/commands/models.py +27 -30
  7. eotdl/commands/stac.py +57 -0
  8. eotdl/curation/__init__.py +0 -8
  9. eotdl/curation/stac/__init__.py +1 -8
  10. eotdl/curation/stac/api.py +58 -0
  11. eotdl/curation/stac/stac.py +31 -341
  12. eotdl/datasets/__init__.py +1 -1
  13. eotdl/datasets/ingest.py +28 -159
  14. eotdl/datasets/retrieve.py +0 -9
  15. eotdl/datasets/stage.py +64 -0
  16. eotdl/files/__init__.py +0 -2
  17. eotdl/files/ingest.bck +178 -0
  18. eotdl/files/ingest.py +229 -164
  19. eotdl/{datasets → files}/metadata.py +16 -17
  20. eotdl/models/__init__.py +1 -1
  21. eotdl/models/ingest.py +28 -159
  22. eotdl/models/stage.py +60 -0
  23. eotdl/repos/APIRepo.py +1 -1
  24. eotdl/repos/DatasetsAPIRepo.py +56 -43
  25. eotdl/repos/FilesAPIRepo.py +260 -167
  26. eotdl/repos/STACAPIRepo.py +40 -0
  27. eotdl/repos/__init__.py +1 -0
  28. eotdl/tools/geo_utils.py +7 -2
  29. {eotdl-2024.10.7.dist-info → eotdl-2025.3.25.dist-info}/METADATA +5 -4
  30. eotdl-2025.3.25.dist-info/RECORD +65 -0
  31. {eotdl-2024.10.7.dist-info → eotdl-2025.3.25.dist-info}/WHEEL +1 -1
  32. eotdl/curation/stac/assets.py +0 -110
  33. eotdl/curation/stac/dataframe.py +0 -172
  34. eotdl/curation/stac/dataframe_bck.py +0 -253
  35. eotdl/curation/stac/dataframe_labeling.py +0 -63
  36. eotdl/curation/stac/extensions/__init__.py +0 -23
  37. eotdl/curation/stac/extensions/base.py +0 -30
  38. eotdl/curation/stac/extensions/dem.py +0 -18
  39. eotdl/curation/stac/extensions/eo.py +0 -117
  40. eotdl/curation/stac/extensions/label/__init__.py +0 -7
  41. eotdl/curation/stac/extensions/label/base.py +0 -136
  42. eotdl/curation/stac/extensions/label/image_name_labeler.py +0 -203
  43. eotdl/curation/stac/extensions/label/scaneo.py +0 -219
  44. eotdl/curation/stac/extensions/ml_dataset.py +0 -648
  45. eotdl/curation/stac/extensions/projection.py +0 -44
  46. eotdl/curation/stac/extensions/raster.py +0 -53
  47. eotdl/curation/stac/extensions/sar.py +0 -55
  48. eotdl/curation/stac/extent.py +0 -158
  49. eotdl/curation/stac/parsers.py +0 -61
  50. eotdl/datasets/download.py +0 -104
  51. eotdl/files/list_files.py +0 -13
  52. eotdl/models/download.py +0 -101
  53. eotdl/models/metadata.py +0 -43
  54. eotdl/wrappers/utils.py +0 -35
  55. eotdl-2024.10.7.dist-info/RECORD +0 -82
  56. {eotdl-2024.10.7.dist-info → eotdl-2025.3.25.dist-info}/entry_points.txt +0 -0
eotdl/models/ingest.py CHANGED
@@ -1,165 +1,34 @@
  from pathlib import Path
- import yaml
- import frontmatter
- from tqdm import tqdm
- import json
-
- from ..auth import with_auth
- from .metadata import Metadata, generate_metadata
- from ..repos import ModelsAPIRepo, FilesAPIRepo
- from ..shared import calculate_checksum
- from ..files import ingest_files, create_new_version
- from .update import update_model
- from ..curation.stac import STACDataFrame
-
-
- def ingest_model(
-     path, verbose=False, logger=print, force_metadata_update=False, sync_metadata=False
- ):
-     path = Path(path)
-     if not path.is_dir():
-         raise Exception("Path must be a folder")
-     if "catalog.json" in [f.name for f in path.iterdir()]:
-         return ingest_stac(path / "catalog.json", logger)
-     return ingest_folder(path, verbose, logger, force_metadata_update, sync_metadata)
 
+ from ..repos import ModelsAPIRepo
+ from ..files.ingest import prep_ingest_stac, prep_ingest_folder, ingest
 
  def retrieve_model(metadata, user):
-     repo = ModelsAPIRepo()
-     data, error = repo.retrieve_model(metadata.name)
-     # print(data, error)
-     if data and data["uid"] != user["uid"]:
-         raise Exception("Model already exists.")
-     if error and error == "Model doesn't exist":
-         # create dataset
-         data, error = repo.create_model(metadata.dict(), user)
-         # print(data, error)
-         if error:
-             raise Exception(error)
-         data["id"] = data["model_id"]
-     return data
-
-
- @with_auth
- def ingest_folder(
-     folder,
-     verbose=False,
-     logger=print,
-     force_metadata_update=False,
-     sync_metadata=False,
-     user=None,
- ):
-     repo = ModelsAPIRepo()
-     # load metadata
-     try:
-         readme = frontmatter.load(folder.joinpath("README.md"))
-         metadata, content = readme.metadata, readme.content
-         metadata = Metadata(**metadata)
-     except FileNotFoundError:
-         # load metadata (legacy)
-         metadata = (
-             yaml.safe_load(open(folder.joinpath("metadata.yml"), "r").read()) or {}
-         )
-         metadata = Metadata(**metadata)
-         content = None
-     except Exception as e:
-         raise Exception(f"Error loading metadata: {e}")
-     # retrieve model (create if doesn't exist)
-     model = retrieve_model(metadata, user)
+     repo = ModelsAPIRepo()
+     data, error = repo.retrieve_model(metadata.name)
+     # print(data, error)
+     if data and data["uid"] != user["uid"]:
+         raise Exception("Model already exists.")
+     if error and error == "Model doesn't exist":
+         # create model
+         data, error = repo.create_model(metadata.dict(), user)
+         # print(data, error)
+         if error:
+             raise Exception(error)
+     return data
 
-     update_metadata = True
-     if "description" in model:
-         # do not do this if the model is new, only if it already exists
-         update_metadata = check_metadata(
-             model, metadata, content, force_metadata_update, sync_metadata, folder
-         )
-     if update_metadata:
-         update_model(model["id"], metadata, content, user)
-     # ingest files
-     return ingest_files(
-         repo, model["id"], folder, verbose, logger, user, endpoint="models"
-     )
-
-
- def check_metadata(
-     dataset, metadata, content, force_metadata_update, sync_metadata, folder
+ def ingest_model(
+     path,
+     verbose=False,
+     logger=print,
+     force_metadata_update=False,
+     sync_metadata=False,
  ):
-     if (
-         dataset["name"] != metadata.name
-         or dataset["description"] != content
-         or dataset["authors"] != metadata.authors
-         or dataset["source"] != metadata.source
-         or dataset["license"] != metadata.license
-         or dataset["thumbnail"] != metadata.thumbnail
-     ):
-         if not force_metadata_update and not sync_metadata:
-             raise Exception(
-                 "The provided metadata is not consistent with the current metadata. Use -f to force metadata update or -s to sync your local metadata."
-             )
-         if force_metadata_update:
-             return True
-         if sync_metadata:
-             generate_metadata(str(folder), dataset)
-             return False
-     return False
-
-
- def retrieve_stac_model(model_name, user):
-     repo = ModelsAPIRepo()
-     data, error = repo.retrieve_model(model_name)
-     # print(data, error)
-     if data and data["uid"] != user["uid"]:
-         raise Exception("Model already exists.")
-     if error and error == "Model doesn't exist":
-         # create model
-         data, error = repo.create_stac_model(model_name, user)
-         # print(data, error)
-         if error:
-             raise Exception(error)
-         data["id"] = data["model_id"]
-     return data["id"]
-
-
- @with_auth
- def ingest_stac(stac_catalog, logger=None, user=None):
-     repo, files_repo = ModelsAPIRepo(), FilesAPIRepo()
-     # load catalog
-     logger("Loading STAC catalog...")
-     df = STACDataFrame.from_stac_file(stac_catalog)
-     catalog = df[df["type"] == "Catalog"]
-     assert len(catalog) == 1, "STAC catalog must have exactly one root catalog"
-     dataset_name = catalog.id.iloc[0]
-     # retrieve dataset (create if doesn't exist)
-     model_id = retrieve_stac_model(dataset_name, user)
-     # create new version
-     version = create_new_version(repo, model_id, user)
-     logger("New version created, version: " + str(version))
-     df2 = df.dropna(subset=["assets"])
-     for row in tqdm(df2.iterrows(), total=len(df2)):
-         try:
-             for k, v in row[1]["assets"].items():
-                 data, error = files_repo.ingest_file(
-                     v["href"],
-                     model_id,
-                     user,
-                     calculate_checksum(v["href"]),  # is always absolute?
-                     "models",
-                     version,
-                 )
-                 if error:
-                     raise Exception(error)
-                 file_url = (
-                     f"{repo.url}models/{data['model_id']}/download/{data['filename']}"
-                 )
-                 df.loc[row[0], "assets"][k]["href"] = file_url
-         except Exception as e:
-             logger(f"Error uploading asset {row[0]}: {e}")
-             break
-     # ingest the STAC catalog into geodb
-     logger("Ingesting STAC catalog...")
-     data, error = repo.ingest_stac(json.loads(df.to_json()), model_id, user)
-     if error:
-         # TODO: delete all assets that were uploaded
-         raise Exception(error)
-     logger("Done")
-     return
+     path = Path(path)
+     if not path.is_dir():
+         raise Exception("Path must be a folder")
+     if "catalog.json" in [f.name for f in path.iterdir()]:
+         prep_ingest_stac(path, logger)
+     else:
+         prep_ingest_folder(path, verbose, logger, force_metadata_update, sync_metadata)
+     return ingest(path, ModelsAPIRepo(), retrieve_model, 'models')
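For orientation, the refactored flow splits ingestion into a local preparation step (prep_ingest_folder for plain folders, prep_ingest_stac when a catalog.json is present) followed by a generic upload step shared with datasets. A minimal usage sketch, assuming an authenticated eotdl session; the folder path is a hypothetical example:

    from eotdl.models import ingest_model

    # README.md frontmatter (or catalog.json) in the folder supplies the metadata;
    # ingest() then uploads the prepared files through ModelsAPIRepo.
    ingest_model("data/my-model", verbose=True)  # hypothetical local folder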
eotdl/models/stage.py ADDED
@@ -0,0 +1,60 @@
+ import os
+ from pathlib import Path
+ from tqdm import tqdm
+ import geopandas as gpd
+
+ from ..auth import with_auth
+ from .retrieve import retrieve_model
+ from ..repos import FilesAPIRepo
+
+ @with_auth
+ def stage_model(
+     model_name,
+     version=None,
+     path=None,
+     logger=print,
+     assets=False,
+     force=False,
+     verbose=False,
+     user=None,
+     file=None,
+ ):
+     model = retrieve_model(model_name)
+     if version is None:
+         version = sorted([v['version_id'] for v in model["versions"]])[-1]
+     else:
+         assert version in [
+             v["version_id"] for v in model["versions"]
+         ], f"Version {version} not found"
+     download_base_path = os.getenv(
+         "EOTDL_DOWNLOAD_PATH", str(Path.home()) + "/.cache/eotdl/models"
+     )
+     if path is None:
+         download_path = download_base_path + "/" + model_name
+     else:
+         download_path = path + "/" + model_name
+     # check if model already exists
+     if os.path.exists(download_path) and not force:
+         os.makedirs(download_path, exist_ok=True)
+         # raise Exception(
+         #     f"model `{model['name']} v{str(version)}` already exists at {download_path}. To force download, use force=True or -f in the CLI."
+         # )
+
+     # stage metadata
+     repo = FilesAPIRepo()
+     catalog_path = repo.stage_file(model["id"], f"catalog.v{version}.parquet", user, download_path)
+
+     # TODO: stage README.md
+
+     if assets:
+         gdf = gpd.read_parquet(catalog_path)
+         for _, row in tqdm(gdf.iterrows(), total=len(gdf), desc="Staging assets"):
+             for k, v in row["assets"].items():
+                 stage_model_file(v["href"], download_path)
+
+     return download_path
+
+ @with_auth
+ def stage_model_file(file_url, path, user):
+     repo = FilesAPIRepo()
+     return repo.stage_file_url(file_url, path, user)
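The new stage.py replaces the removed models/download.py (file 52 in the list above): by default it stages only the versioned parquet catalog, and asset files are pulled on demand. A sketch of the intended call pattern, assuming stage_model is re-exported from eotdl.models; the model name is a placeholder:

    from eotdl.models import stage_model  # assumed re-export

    # stage only the catalog.v<version>.parquet metadata file
    path = stage_model("MyModel")  # placeholder model name

    # additionally download the assets referenced in the catalog
    path = stage_model("MyModel", assets=True, force=True)
    print(path)  # e.g. ~/.cache/eotdl/models/MyModel, unless EOTDL_DOWNLOAD_PATH is set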
eotdl/repos/APIRepo.py CHANGED
@@ -5,7 +5,7 @@ import requests
  class APIRepo:
      def __init__(self, url=None):
          default_url = "https://api.eotdl.com/"
-         # default_url = "http://localhost:8010/"
+         # default_url = "http://localhost:8000/"
          self.url = url if url else os.getenv("EOTDL_API_URL", default_url)
 
      def format_response(self, response):
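The only change here is the commented-out development default (port 8010 becomes 8000); the effective URL is still resolved per instance from the EOTDL_API_URL environment variable, as the unchanged line shows. An illustrative override:

    import os

    os.environ["EOTDL_API_URL"] = "http://localhost:8000/"  # e.g. a local dev API

    from eotdl.repos import APIRepo

    print(APIRepo().url)  # -> http://localhost:8000/ (read at construction time)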
eotdl/repos/DatasetsAPIRepo.py CHANGED
@@ -19,7 +19,15 @@ class DatasetsAPIRepo(APIRepo):
          url += "&limit=" + str(limit)
          response = requests.get(url)
          return self.format_response(response)
-
+
+     def retrieve_dataset(self, name):
+         response = requests.get(self.url + "datasets?name=" + name)
+         return self.format_response(response)
+
+     def get_dataset_by_id(self, dataset_id):
+         response = requests.get(self.url + "datasets/" + dataset_id)
+         return self.format_response(response)
+
      def create_dataset(self, metadata, user):
          response = requests.post(
              self.url + "datasets",
@@ -28,53 +36,58 @@
          )
          return self.format_response(response)
 
-     def retrieve_dataset(self, name):
-         response = requests.get(self.url + "datasets?name=" + name)
-         return self.format_response(response)
-
-     def create_version(self, dataset_id, user):
+     def complete_ingestion(self, dataset_id, version, size, user):
          response = requests.post(
-             self.url + "datasets/version/" + dataset_id,
+             self.url + "datasets/complete/" + dataset_id,
+             json={"version": version, "size": size},
              headers=self.generate_headers(user),
          )
          return self.format_response(response)
 
-     def create_stac_dataset(self, name, user):
-         response = requests.post(
-             self.url + "datasets/stac",
-             json={"name": name},
-             headers=self.generate_headers(user),
-         )
-         return self.format_response(response)
 
-     def ingest_stac(self, stac_json, dataset_id, user):
-         response = requests.put(
-             self.url + f"datasets/stac/{dataset_id}",
-             json={"stac": stac_json},
-             headers=self.generate_headers(user),
-         )
-         return self.format_response(response)
+     # def create_version(self, dataset_id, user):
+     #     response = requests.post(
+     #         self.url + "datasets/version/" + dataset_id,
+     #         headers=self.generate_headers(user),
+     #     )
+     #     return self.format_response(response)
 
-     def download_stac(self, dataset_id, user):
-         url = self.url + "datasets/" + dataset_id + "/download"
-         headers = self.generate_headers(user)
-         response = requests.get(url, headers=headers)
-         if response.status_code != 200:
-             return None, response.json()["detail"]
-         return gpd.GeoDataFrame.from_features(response.json()["features"]), None
+     # def create_stac_dataset(self, name, user):
+     #     response = requests.post(
+     #         self.url + "datasets/stac",
+     #         json={"name": name},
+     #         headers=self.generate_headers(user),
+     #     )
+     #     return self.format_response(response)
 
-     def update_dataset(
-         self, dataset_id, authors, source, license, thumbnail, content, user
-     ):
-         response = requests.put(
-             self.url + f"datasets/{dataset_id}",
-             json={
-                 "authors": authors,
-                 "source": source,
-                 "license": license,
-                 "thumbnail": thumbnail,
-                 "description": content,
-             },
-             headers=self.generate_headers(user),
-         )
-         return self.format_response(response)
+     # def ingest_stac(self, stac_json, dataset_id, user):
+     #     response = requests.put(
+     #         self.url + f"datasets/stac/{dataset_id}",
+     #         json={"stac": stac_json},
+     #         headers=self.generate_headers(user),
+     #     )
+     #     return self.format_response(response)
+
+     # def download_stac(self, dataset_id, user):
+     #     url = self.url + "datasets/" + dataset_id + "/download"
+     #     headers = self.generate_headers(user)
+     #     response = requests.get(url, headers=headers)
+     #     if response.status_code != 200:
+     #         return None, response.json()["detail"]
+     #     return gpd.GeoDataFrame.from_features(response.json()["features"]), None
+
+     # def update_dataset(
+     #     self, dataset_id, authors, source, license, thumbnail, content, user
+     # ):
+     #     response = requests.put(
+     #         self.url + f"datasets/{dataset_id}",
+     #         json={
+     #             "authors": authors,
+     #             "source": source,
+     #             "license": license,
+     #             "thumbnail": thumbnail,
+     #             "description": content,
+     #         },
+     #         headers=self.generate_headers(user),
+     #     )
+     #     return self.format_response(response)
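Taken together, these changes replace the old per-version and STAC endpoints (now commented out) with plain retrieval helpers plus a single complete_ingestion call that closes a version once its files are uploaded. A sketch of the resulting call sequence; the dataset name, id, version, size and user values are placeholders:

    from eotdl.repos import DatasetsAPIRepo

    repo = DatasetsAPIRepo()

    # unauthenticated GET helpers
    data, error = repo.retrieve_dataset("EuroSAT")   # placeholder name
    data, error = repo.get_dataset_by_id("abc123")   # placeholder id

    # after uploading the files for a new version (see FilesAPIRepo),
    # a single authenticated call marks the ingestion as complete:
    # data, error = repo.complete_ingestion(dataset_id, version, size, user)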