eotdl 2024.10.7-py3-none-any.whl → 2025.3.25-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (56)
  1. eotdl/__init__.py +1 -1
  2. eotdl/access/search.py +0 -2
  3. eotdl/access/sentinelhub/parameters.py +1 -1
  4. eotdl/cli.py +2 -2
  5. eotdl/commands/datasets.py +28 -31
  6. eotdl/commands/models.py +27 -30
  7. eotdl/commands/stac.py +57 -0
  8. eotdl/curation/__init__.py +0 -8
  9. eotdl/curation/stac/__init__.py +1 -8
  10. eotdl/curation/stac/api.py +58 -0
  11. eotdl/curation/stac/stac.py +31 -341
  12. eotdl/datasets/__init__.py +1 -1
  13. eotdl/datasets/ingest.py +28 -159
  14. eotdl/datasets/retrieve.py +0 -9
  15. eotdl/datasets/stage.py +64 -0
  16. eotdl/files/__init__.py +0 -2
  17. eotdl/files/ingest.bck +178 -0
  18. eotdl/files/ingest.py +229 -164
  19. eotdl/{datasets → files}/metadata.py +16 -17
  20. eotdl/models/__init__.py +1 -1
  21. eotdl/models/ingest.py +28 -159
  22. eotdl/models/stage.py +60 -0
  23. eotdl/repos/APIRepo.py +1 -1
  24. eotdl/repos/DatasetsAPIRepo.py +56 -43
  25. eotdl/repos/FilesAPIRepo.py +260 -167
  26. eotdl/repos/STACAPIRepo.py +40 -0
  27. eotdl/repos/__init__.py +1 -0
  28. eotdl/tools/geo_utils.py +7 -2
  29. {eotdl-2024.10.7.dist-info → eotdl-2025.3.25.dist-info}/METADATA +5 -4
  30. eotdl-2025.3.25.dist-info/RECORD +65 -0
  31. {eotdl-2024.10.7.dist-info → eotdl-2025.3.25.dist-info}/WHEEL +1 -1
  32. eotdl/curation/stac/assets.py +0 -110
  33. eotdl/curation/stac/dataframe.py +0 -172
  34. eotdl/curation/stac/dataframe_bck.py +0 -253
  35. eotdl/curation/stac/dataframe_labeling.py +0 -63
  36. eotdl/curation/stac/extensions/__init__.py +0 -23
  37. eotdl/curation/stac/extensions/base.py +0 -30
  38. eotdl/curation/stac/extensions/dem.py +0 -18
  39. eotdl/curation/stac/extensions/eo.py +0 -117
  40. eotdl/curation/stac/extensions/label/__init__.py +0 -7
  41. eotdl/curation/stac/extensions/label/base.py +0 -136
  42. eotdl/curation/stac/extensions/label/image_name_labeler.py +0 -203
  43. eotdl/curation/stac/extensions/label/scaneo.py +0 -219
  44. eotdl/curation/stac/extensions/ml_dataset.py +0 -648
  45. eotdl/curation/stac/extensions/projection.py +0 -44
  46. eotdl/curation/stac/extensions/raster.py +0 -53
  47. eotdl/curation/stac/extensions/sar.py +0 -55
  48. eotdl/curation/stac/extent.py +0 -158
  49. eotdl/curation/stac/parsers.py +0 -61
  50. eotdl/datasets/download.py +0 -104
  51. eotdl/files/list_files.py +0 -13
  52. eotdl/models/download.py +0 -101
  53. eotdl/models/metadata.py +0 -43
  54. eotdl/wrappers/utils.py +0 -35
  55. eotdl-2024.10.7.dist-info/RECORD +0 -82
  56. {eotdl-2024.10.7.dist-info → eotdl-2025.3.25.dist-info}/entry_points.txt +0 -0
eotdl/files/ingest.bck ADDED
@@ -0,0 +1,178 @@
+ from pathlib import Path
+ import os
+ from tqdm import tqdm
+ import zipfile
+ import io
+ from glob import glob
+ import os
+
+ from ..repos import FilesAPIRepo
+ from ..shared import calculate_checksum
+
+
+ def retrieve_files(folder):
+     # get all files in directory recursively
+     items = [Path(item) for item in glob(str(folder) + "/**/*", recursive=True)]
+     if not any(item.name == "metadata.yml" for item in items) and not any(
+         item.name == "README.md" for item in items
+     ):
+         raise Exception("README.md not found in directory")
+     # remove metadata files
+     items = [item for item in items if item.name != "metadata.yml"]
+     items = [item for item in items if item.name != "README.md"]
+     # remove directories
+     items = [item for item in items if not item.is_dir()]
+     if len(items) == 0:
+         raise Exception("No files found in directory")
+     return items
+
+
+ def prepare_item(item, folder):
+     return {
+         "filename": item.name,
+         "path": str(item.relative_to(folder)),
+         "absolute_path": item.absolute(),
+         "size": os.path.getsize(item.absolute()),
+         "checksum": calculate_checksum(item.absolute()),
+     }
+
+
+ def generate_batches(files, max_batch_size=1024 * 1024 * 10, max_batch_files=10):
+     batches = []
+     for item in tqdm(files):
+         if not batches:
+             batches.append([item])
+             continue
+         if max_batch_size:
+             size_check = sum([i["size"] for i in batches[-1]]) < max_batch_size
+         else:
+             size_check = True
+         if size_check and len(batches[-1]) < max_batch_files:
+             batches[-1].append(item)
+         else:
+             batches.append([item])
+     return batches
+
+
+ def compress_batch(batch):
+     memory_file = io.BytesIO()
+     with zipfile.ZipFile(memory_file, "w") as zf:
+         for f in batch:
+             zf.write(f["absolute_path"], arcname=f["path"])
+     memory_file.seek(0)
+     return memory_file
+
+
+ def generate_files_lists(
+     items, folder, dataset_or_model_id, endpoint, logger, max_size=1024 * 1024 * 16
+ ):
+     files_repo = FilesAPIRepo()
+     current_files, error = files_repo.retrieve_files(dataset_or_model_id, endpoint)
+     # print(len(current_files), len(items) - len(current_files))
+     # print(current_files, error)
+     if error:
+         current_files = []
+     # generate list of files to upload
+     logger("generating list of files to upload...")
+     upload_files, existing_files, large_files = [], [], []
+     current_names = [f["filename"] for f in current_files]
+     current_checksums = [f["checksum"] for f in current_files]
+     for item in tqdm(items):
+         data = prepare_item(item, folder)
+         if data["path"] in current_names and data["checksum"] in current_checksums:
+             existing_files.append(data)
+         else:
+             if data["size"] > max_size:
+                 large_files.append(data)
+             else:
+                 upload_files.append(data)
+     # TODO: should ingest new version if files removed
+     if len(upload_files) == 0 and len(large_files) == 0:
+         raise Exception("No new files to upload")
+     return upload_files, existing_files, large_files
+
+
+ def create_new_version(repo, dataset_or_model_id, user):
+     data, error = repo.create_version(dataset_or_model_id, user)
+     if error:
+         raise Exception(error)
+     return data["version"]
+
+
+ def ingest_files(repo, dataset_or_model_id, folder, verbose, logger, user, endpoint):
+     files_repo = FilesAPIRepo()
+     logger(f"Uploading directory {folder}...")
+     items = retrieve_files(folder)
+     # retrieve files
+     upload_files, existing_files, large_files = generate_files_lists(
+         items, folder, dataset_or_model_id, endpoint, logger
+     )
+     logger(f"{len(upload_files) + len(large_files)} new files will be ingested")
+     logger(f"{len(existing_files)} files already exist in dataset")
+     logger(f"{len(large_files)} large files will be ingested separately")
+     # create new version
+     version = create_new_version(repo, dataset_or_model_id, user)
+     logger("New version created, version: " + str(version))
+     # ingest new large files
+     if len(large_files) > 0:
+         logger("ingesting large files...")
+         for file in large_files:
+             logger("ingesting file: " + file["path"])
+             upload_id, parts = files_repo.prepare_large_upload(
+                 file["path"],
+                 dataset_or_model_id,
+                 file["checksum"],
+                 user,
+                 endpoint,
+             )
+             # print(upload_id, parts)
+             files_repo.ingest_large_file(
+                 file["absolute_path"],
+                 file["size"],
+                 upload_id,
+                 user,
+                 parts,
+                 endpoint,
+             )
+             data, error = files_repo.complete_upload(user, upload_id, version, endpoint)
+     # ingest new small files in batches
+     if len(upload_files) > 0:
+         logger("generating batches...")
+         batches = generate_batches(upload_files)
+         logger(
+             f"Uploading {len(upload_files)} small files in {len(batches)} batches..."
+         )
+         repo = FilesAPIRepo()
+         for batch in tqdm(
+             batches, desc="Uploading batches", unit="batches", disable=verbose
+         ):
+             # compress batch
+             memory_file = compress_batch(batch)
+             # ingest batch
+             data, error = repo.ingest_files_batch(
+                 memory_file,
+                 [f["checksum"] for f in batch],
+                 dataset_or_model_id,
+                 user,
+                 endpoint,
+                 version,
+             )
+     # ingest existing files
+     if len(existing_files) > 0:
+         batches = generate_batches(existing_files, max_batch_size=None)
+         for batch in tqdm(
+             batches,
+             desc="Ingesting existing files",
+             unit="batches",
+             disable=verbose,
+         ):
+             data, error = files_repo.add_files_batch_to_version(
+                 batch,
+                 dataset_or_model_id,
+                 version,
+                 user,
+                 endpoint,
+             )
+             if error:
+                 raise Exception(error)
+     return data
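
The `generate_batches` helper above is the core of this legacy small-file upload path: files are zipped in memory and sent in groups of at most 10 files or roughly 10 MiB per batch. Below is a minimal, self-contained sketch of that grouping rule, with toy sizes and the `tqdm` progress bar omitted; in the real code the dicts come from `prepare_item`.

```python
# Sketch of the grouping rule in generate_batches above (assumption: plain dicts
# with a "size" key stand in for the prepare_item() output used by eotdl).
def generate_batches(files, max_batch_size=1024 * 1024 * 10, max_batch_files=10):
    batches = []
    for item in files:
        if not batches:
            batches.append([item])
            continue
        if max_batch_size:
            # the size limit is checked before appending, so a batch can end slightly over it
            size_ok = sum(i["size"] for i in batches[-1]) < max_batch_size
        else:
            size_ok = True  # no size cap (used when re-registering existing files)
        if size_ok and len(batches[-1]) < max_batch_files:
            batches[-1].append(item)
        else:
            batches.append([item])
    return batches

files = [{"path": f"tile_{i}.tif", "size": 4 * 1024 * 1024} for i in range(5)]  # five 4 MiB files
print([len(b) for b in generate_batches(files)])  # [3, 2]
```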
eotdl/files/ingest.py CHANGED
@@ -1,178 +1,243 @@
+ from glob import glob
  from pathlib import Path
+ import geopandas as gpd
  import os
+ import pystac
  from tqdm import tqdm
- import zipfile
- import io
- from glob import glob
- import os
+ import stac_geoparquet
+ import frontmatter
+ import random
+ import pandas as pd
+ from datetime import datetime
+ from shapely.geometry import Polygon

+ from ..auth import with_auth
+ from ..files.metadata import Metadata
  from ..repos import FilesAPIRepo
  from ..shared import calculate_checksum

+ def prep_ingest_folder(
+     folder,
+     verbose=False,
+     logger=print,
+     force_metadata_update=False,
+     sync_metadata=False,
+ ):
+     logger("Ingesting directory: " + str(folder))
+     catalog_path = folder.joinpath("catalog.parquet")
+     files = glob(str(folder) + '/**/*', recursive=True)
+     # remove catalog.parquet from files
+     files = [f for f in files if f != str(catalog_path)]
+     # ingest geometry from files (if tifs) or additional list of geometries
+     # https://stac-utils.github.io/stac-geoparquet/latest/spec/stac-geoparquet-spec/#use-cases
+     data = []
+     for file in files:
+         file_path = Path(file)
+         if file_path.is_file():
+             relative_path = os.path.relpath(file_path, catalog_path.parent)
+             absolute_path = str(file_path)
+             # THIS IS THE MINIMUM REQUIRED FIELDS TO CREATE A VALID STAC ITEM
+             data.append(create_stac_item(relative_path, absolute_path))
+     gdf = gpd.GeoDataFrame(data, geometry='geometry')
+     # Save to parquet
+     gdf.to_parquet(catalog_path)
+     return catalog_path

- def retrieve_files(folder):
-     # get all files in directory recursively
-     items = [Path(item) for item in glob(str(folder) + "/**/*", recursive=True)]
-     if not any(item.name == "metadata.yml" for item in items) and not any(
-         item.name == "README.md" for item in items
-     ):
-         raise Exception("README.md not found in directory")
-     # remove metadata files
-     items = [item for item in items if item.name != "metadata.yml"]
-     items = [item for item in items if item.name != "README.md"]
-     # remove directories
-     items = [item for item in items if not item.is_dir()]
-     if len(items) == 0:
-         raise Exception("No files found in directory")
-     return items
-
-
- def prepare_item(item, folder):
-     return {
-         "filename": item.name,
-         "path": str(item.relative_to(folder)),
-         "absolute_path": item.absolute(),
-         "size": os.path.getsize(item.absolute()),
-         "checksum": calculate_checksum(item.absolute()),
-     }
-
-
- def generate_batches(files, max_batch_size=1024 * 1024 * 10, max_batch_files=10):
-     batches = []
-     for item in tqdm(files):
-         if not batches:
-             batches.append([item])
-             continue
-         if max_batch_size:
-             size_check = sum([i["size"] for i in batches[-1]]) < max_batch_size
-         else:
-             size_check = True
-         if size_check and len(batches[-1]) < max_batch_files:
-             batches[-1].append(item)
-         else:
-             batches.append([item])
-     return batches
-
-
- def compress_batch(batch):
-     memory_file = io.BytesIO()
-     with zipfile.ZipFile(memory_file, "w") as zf:
-         for f in batch:
-             zf.write(f["absolute_path"], arcname=f["path"])
-     memory_file.seek(0)
-     return memory_file
-
+ # IF THE KEYS IN THE ASSETS ARE NOT THE SAME ON ALL ITEMS, THE PARQUET WILL NOT BE VALID !!!
+ def prep_ingest_stac(path, logger=None): # in theory should work with a remote catalog (given URL)
+     # read stac catalog
+     stac_catalog = path / "catalog.json"
+     catalog = pystac.Catalog.from_file(stac_catalog)
+     # make all items paths hredf in assets absolute
+     catalog.make_all_asset_hrefs_absolute()
+     # generate list of items for all collections
+     items = []
+     for collection in catalog.get_collections():
+         # iterate over items
+         for item in tqdm(collection.get_items(), desc=f"Ingesting items from collection {collection.id}"):
+             assert isinstance(item, pystac.Item)
+             items.append(item)
+     # save parquet file
+     record_batch_reader = stac_geoparquet.arrow.parse_stac_items_to_arrow(items)
+     output_path = stac_catalog.parent / "catalog.parquet"
+     stac_geoparquet.arrow.to_parquet(record_batch_reader, output_path)
+     return output_path

- def generate_files_lists(
-     items, folder, dataset_or_model_id, endpoint, logger, max_size=1024 * 1024 * 16
+ @with_auth
+ def ingest_virutal_dataset( # could work for a list of paths with minimal changes...
+     path,
+     links,
+     metadata = None,
+     logger=print,
+     user=None,
  ):
-     files_repo = FilesAPIRepo()
-     current_files, error = files_repo.retrieve_files(dataset_or_model_id, endpoint)
-     # print(len(current_files), len(items) - len(current_files))
-     # print(current_files, error)
-     if error:
-         current_files = []
-     # generate list of files to upload
-     logger("generating list of files to upload...")
-     upload_files, existing_files, large_files = [], [], []
-     current_names = [f["filename"] for f in current_files]
-     current_checksums = [f["checksum"] for f in current_files]
-     for item in tqdm(items):
-         data = prepare_item(item, folder)
-         if data["path"] in current_names and data["checksum"] in current_checksums:
-             existing_files.append(data)
-         else:
-             if data["size"] > max_size:
-                 large_files.append(data)
-             else:
-                 upload_files.append(data)
-     # TODO: should ingest new version if files removed
-     if len(upload_files) == 0 and len(large_files) == 0:
-         raise Exception("No new files to upload")
-     return upload_files, existing_files, large_files
+     path = Path(path)
+     if metadata is None:
+         readme = frontmatter.load(path.joinpath("README.md"))
+         metadata_dict = readme.metadata
+         # Add description from content before creating Metadata object
+         metadata_dict["description"] = readme.content
+         metadata = Metadata(**metadata_dict)
+     else:
+         metadata = Metadata(**metadata)
+     metadata.save_metadata(path)
+     data = []
+     for link in links:
+         assert link.startswith("http"), "All links must start with http or https"
+         data.append(create_stac_item(link, link))
+     data.append(create_stac_item('README.md', str(path / "README.md")))
+     gdf = gpd.GeoDataFrame(data, geometry='geometry')
+     gdf.to_parquet(path / "catalog.parquet")
+     return ingest(path)
+
+ @with_auth
+ def ingest(path, repo, retrieve, mode, user):
+     try:
+         readme = frontmatter.load(path.joinpath("README.md"))
+         metadata_dict = readme.metadata
+         # Add description from content before creating Metadata object
+         metadata_dict["description"] = readme.content
+         metadata = Metadata(**metadata_dict)
+     except Exception as e:
+         print(str(e))
+         raise Exception("Error loading metadata")
+     # retrieve dataset (create if doesn't exist)
+     dataset_or_model = retrieve(metadata, user)
+     current_version = sorted([v['version_id'] for v in dataset_or_model["versions"]])[-1]
+     print("current version: ", current_version)

+     # TODO: update README if metadata changed in UI (db)
+     # update_metadata = True
+     # if "description" in dataset:
+     #     # do not do this if the dataset is new, only if it already exists
+     #     update_metadata = check_metadata(
+     #         dataset, metadata, content, force_metadata_update, sync_metadata, folder
+     #     )
+     # if update_metadata:
+     #     update_dataset(dataset["id"], metadata, content, user)
+     # return ingest_files(
+     #     repo, dataset["id"], folder, verbose, logger, user, endpoint="datasets"
+     # )

- def create_new_version(repo, dataset_or_model_id, user):
-     data, error = repo.create_version(dataset_or_model_id, user)
-     if error:
-         raise Exception(error)
-     return data["version"]
+     catalog_path = path.joinpath("catalog.parquet")
+     gdf = gpd.read_parquet(catalog_path)
+     files_repo = FilesAPIRepo()
+     catalog_url = files_repo.generate_presigned_url(f'catalog.v{current_version}.parquet', dataset_or_model['id'], user)

+     # first time ingesting
+     if catalog_url is None:
+         total_size = 0
+         for row in tqdm(gdf.iterrows(), total=len(gdf), desc="Ingesting files"):
+             try:
+                 for k, v in row[1]["assets"].items():
+                     if v["href"].startswith("http"): continue
+                     item_id = row[1]["id"]
+                     data, error = files_repo.ingest_file(
+                         v["href"],
+                         item_id,
+                         # Path(v["href"]).stat().st_size,
+                         dataset_or_model['id'],
+                         user,
+                         mode,
+                     )
+                     if error:
+                         raise Exception(error)
+                     file_url = f"{repo.url}{mode}/{dataset_or_model['id']}/stage/{item_id}"
+                     gdf.loc[row[0], "assets"][k]["href"] = file_url
+                     total_size += v["size"]
+             except Exception as e:
+                 print(f"Error uploading asset {row[0]}: {e}")
+                 break
+         gdf.to_parquet(catalog_path)
+         files_repo.ingest_file(str(catalog_path), f'catalog.v{current_version}.parquet', dataset_or_model['id'], user, "datasets")
+         data, error = repo.complete_ingestion(dataset_or_model['id'], current_version, total_size, user)
+         if error:
+             raise Exception(error)
+         return catalog_path
+
+     # files were already ingested
+     # TODO: check for deleted files (currently only updating existing files and ingesting new ones)
+     # TODO: adding new links in virtual datasets dont trigger new version (but changing README does)
+     new_version = False
+     num_changes = 0
+     total_size = 0
+     for row in tqdm(gdf.iterrows(), total=len(gdf), desc="Ingesting files"):
+         try:
+             for k, v in row[1]["assets"].items():
+                 if v["href"].startswith("http"): continue
+                 item_id = row[1]["id"]
+                 # check if file exists in previous versions
+                 df = pd.read_parquet(
+                     path=catalog_url,
+                     filters=[('id', '=', item_id)]
+                 )
+                 if len(df) > 0: # file exists in previous versions
+                     if df.iloc[0]['assets'][k]["checksum"] == v["checksum"]: # file is the same
+                         # still need to update the required fields
+                         file_url = f"{repo.url}datasets/{dataset_or_model['id']}/stage/{item_id}"
+                         gdf.loc[row[0], "assets"][k]["href"] = file_url
+                         total_size += v["size"]
+                         continue
+                     else: # file is different, so ingest new version but with a different id
+                         item_id = item_id + f"-{random.randint(1, 1000000)}"
+                         gdf.loc[row[0], "id"] = item_id
+                 new_version = True
+                 num_changes += 1
+                 # ingest new files
+                 data, error = files_repo.ingest_file(
+                     v["href"],
+                     item_id, # item id, will be path in local or given id in STAC. if not unique, will overwrite previous file in storage
+                     # Path(v["href"]).stat().st_size,
+                     dataset_or_model['id'],
+                     user,
+                     # calculate_checksum(asset["href"]), # is always absolute?
+                     mode,
+                     # version,
+                 )
+                 if error:
+                     raise Exception(error)
+                 file_url = f"{repo.url}{mode}/{dataset_or_model['id']}/stage/{item_id}"
+                 gdf.loc[row[0], "assets"][k]["href"] = file_url
+                 total_size += v["size"]
+         except Exception as e:
+             print(f"Error uploading asset {row[0]}: {e}")
+             break
+     if not new_version:
+         print("No new version was created, your dataset has not changed.")
+     else:
+         new_version = current_version + 1
+         print("A new version was created, your dataset has changed.")
+         print(f"Num changes: {num_changes}")
+         gdf.to_parquet(catalog_path)
+         files_repo.ingest_file(str(catalog_path), f'catalog.v{new_version}.parquet', dataset_or_model['id'], user, mode)
+         # TODO: ingest README.md
+         data, error = repo.complete_ingestion(dataset_or_model['id'], new_version, total_size, user)
+         if error:
+             raise Exception(error)
+     return catalog_path

- def ingest_files(repo, dataset_or_model_id, folder, verbose, logger, user, endpoint):
-     files_repo = FilesAPIRepo()
-     logger(f"Uploading directory {folder}...")
-     items = retrieve_files(folder)
-     # retrieve files
-     upload_files, existing_files, large_files = generate_files_lists(
-         items, folder, dataset_or_model_id, endpoint, logger
-     )
-     logger(f"{len(upload_files) + len(large_files)} new files will be ingested")
-     logger(f"{len(existing_files)} files already exist in dataset")
-     logger(f"{len(large_files)} large files will be ingested separately")
-     # create new version
-     version = create_new_version(repo, dataset_or_model_id, user)
-     logger("New version created, version: " + str(version))
-     # ingest new large files
-     if len(large_files) > 0:
-         logger("ingesting large files...")
-         for file in large_files:
-             logger("ingesting file: " + file["path"])
-             upload_id, parts = files_repo.prepare_large_upload(
-                 file["path"],
-                 dataset_or_model_id,
-                 file["checksum"],
-                 user,
-                 endpoint,
-             )
-             # print(upload_id, parts)
-             files_repo.ingest_large_file(
-                 file["absolute_path"],
-                 file["size"],
-                 upload_id,
-                 user,
-                 parts,
-                 endpoint,
-             )
-             data, error = files_repo.complete_upload(user, upload_id, version, endpoint)
-     # ingest new small files in batches
-     if len(upload_files) > 0:
-         logger("generating batches...")
-         batches = generate_batches(upload_files)
-         logger(
-             f"Uploading {len(upload_files)} small files in {len(batches)} batches..."
-         )
-         repo = FilesAPIRepo()
-         for batch in tqdm(
-             batches, desc="Uploading batches", unit="batches", disable=verbose
-         ):
-             # compress batch
-             memory_file = compress_batch(batch)
-             # ingest batch
-             data, error = repo.ingest_files_batch(
-                 memory_file,
-                 [f["checksum"] for f in batch],
-                 dataset_or_model_id,
-                 user,
-                 endpoint,
-                 version,
-             )
-     # ingest existing files
-     if len(existing_files) > 0:
-         batches = generate_batches(existing_files, max_batch_size=None)
-         for batch in tqdm(
-             batches,
-             desc="Ingesting existing files",
-             unit="batches",
-             disable=verbose,
-         ):
-             data, error = files_repo.add_files_batch_to_version(
-                 batch,
-                 dataset_or_model_id,
-                 version,
-                 user,
-                 endpoint,
-             )
-             if error:
-                 raise Exception(error)
-     return data
+ def create_stac_item(item_id, asset_href):
+     return {
+         'type': 'Feature',
+         'stac_version': '1.0.0',
+         'stac_extensions': [],
+         'datetime': datetime.now(), # must be native timestamp (https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#timestamp)
+         'id': item_id,
+         'bbox': {
+             'xmin': 0.0,
+             'ymin': 0.0,
+             'xmax': 0.0,
+             'ymax': 0.0
+         }, # infer from file or from list of geometries
+         'geometry': Polygon(), # empty polygon
+         'assets': { 'asset': { # STAC needs this to be a Dict[str, Asset], not list !!! use same key or parquet breaks !!!
+             'href': asset_href,
+             'checksum': calculate_checksum(asset_href) if not asset_href.startswith("http") else None,
+             'timestamp': datetime.now(),
+             'size': Path(asset_href).stat().st_size if not asset_href.startswith("http") else None,
+         }},
+         "links": [],
+         # 'collection': 'source',
+         # anything below are properties (need at least one!)
+         'repository': 'eotdl',
+     }
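
The new ingestion flow above centers on a catalog.parquet file: prep_ingest_folder builds one create_stac_item record per file, wraps the records in a GeoDataFrame, and writes the catalog next to the data before ingest uploads the assets and rewrites their hrefs to staging URLs. A minimal sketch of what that catalog looks like, with toy paths, placeholder geometry/bbox, and None checksum/size (assumes geopandas with a pyarrow parquet engine is installed):

```python
# Sketch of the catalog.parquet layout produced by prep_ingest_folder above.
# Toy values only: real records carry the file's checksum and size, and the href
# is rewritten to a staging URL after upload.
from datetime import datetime
import geopandas as gpd
from shapely.geometry import Polygon

def toy_stac_item(item_id, asset_href):
    return {
        "type": "Feature",
        "stac_version": "1.0.0",
        "stac_extensions": [],
        "datetime": datetime.now(),
        "id": item_id,
        "bbox": {"xmin": 0.0, "ymin": 0.0, "xmax": 0.0, "ymax": 0.0},
        "geometry": Polygon(),  # empty placeholder geometry
        # every item must use the same asset key, or the parquet schema breaks
        "assets": {"asset": {"href": asset_href, "checksum": None,
                             "timestamp": datetime.now(), "size": None}},
        "links": [],
        "repository": "eotdl",  # at least one property is required
    }

items = [toy_stac_item("images/a.tif", "/data/images/a.tif"),
         toy_stac_item("images/b.tif", "/data/images/b.tif")]
gdf = gpd.GeoDataFrame(items, geometry="geometry")
gdf.to_parquet("catalog.parquet")
```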
eotdl/{datasets → files}/metadata.py RENAMED
@@ -1,13 +1,14 @@
  from pydantic import BaseModel, validator
  from typing import List, Optional
  from pathlib import Path
-
+ import os

  class Metadata(BaseModel):
      authors: List[str]
      license: str
      source: str
      name: str
+     description: str
      thumbnail: Optional[str] = ""

      # validate source is a URL
@@ -27,19 +28,17 @@ class Metadata(BaseModel):
          return v


- def generate_metadata(download_path, dataset):
-     with open(download_path + "/README.md", "w") as f:
-         f.write("---\n")
-         f.write(f"name: {dataset['name']}\n")
-         f.write(f"license: {dataset['license']}\n")
-         f.write(f"source: {dataset['source']}\n")
-         f.write(f"thumbnail: {dataset['thumbnail']}\n")
-         f.write(f"authors:\n")
-         for author in dataset["authors"]:
-             f.write(f" - {author}\n")
-         f.write("---\n")
-         f.write(dataset["description"])
-     # remove metadata.yml if exists
-     if Path(download_path + "/metadata.yml").exists():
-         Path(download_path + "/metadata.yml").unlink()
-     return download_path + "/README.md"
+     def save_metadata(self, dst_path):
+         os.makedirs(dst_path, exist_ok=True)
+         with open(Path(dst_path) / "README.md", "w") as f:
+             f.write("---\n")
+             f.write(f"name: {self.name}\n")
+             f.write(f"license: {self.license}\n")
+             f.write(f"source: {self.source}\n")
+             f.write(f"thumbnail: {self.thumbnail}\n")
+             f.write(f"authors:\n")
+             for author in self.authors:
+                 f.write(f" - {author}\n")
+             f.write("---\n")
+             f.write(self.description)
+         return str(Path(dst_path) / "README.md")
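
The renamed Metadata model now carries the dataset description and writes it back as README.md frontmatter via save_metadata; the ingest functions above parse that same file with python-frontmatter. A minimal sketch of the round trip (the model below mirrors the fields in the diff; the path is illustrative):

```python
# Sketch of the README.md frontmatter round trip used by ingest() and save_metadata().
# The Metadata class here mirrors the fields shown in the diff; the real one lives in
# eotdl/files/metadata.py and also validates that `source` is a URL.
import frontmatter
from pydantic import BaseModel
from typing import List, Optional

class Metadata(BaseModel):
    authors: List[str]
    license: str
    source: str
    name: str
    description: str
    thumbnail: Optional[str] = ""

readme = frontmatter.load("my-dataset/README.md")  # illustrative path
fields = dict(readme.metadata, description=readme.content)
metadata = Metadata(**fields)
print(metadata.name, metadata.license, metadata.authors)
```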
eotdl/models/__init__.py CHANGED
@@ -1,3 +1,3 @@
  from .retrieve import retrieve_models, retrieve_model, retrieve_model_files
  from .ingest import ingest_model
- from .download import download_model
+ from .stage import stage_model, stage_model_file