eotdl 2025.2.10-py3-none-any.whl → 2025.4.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. eotdl/__init__.py +1 -1
  2. eotdl/access/__init__.py +13 -3
  3. eotdl/access/download.py +47 -14
  4. eotdl/access/search.py +33 -5
  5. eotdl/access/sentinelhub/__init__.py +6 -2
  6. eotdl/access/sentinelhub/client.py +7 -6
  7. eotdl/access/sentinelhub/evalscripts.py +266 -0
  8. eotdl/access/sentinelhub/parameters.py +101 -23
  9. eotdl/access/sentinelhub/utils.py +54 -15
  10. eotdl/cli.py +2 -2
  11. eotdl/commands/datasets.py +28 -31
  12. eotdl/commands/models.py +27 -30
  13. eotdl/commands/stac.py +57 -0
  14. eotdl/curation/__init__.py +0 -8
  15. eotdl/curation/stac/__init__.py +1 -8
  16. eotdl/curation/stac/api.py +58 -0
  17. eotdl/curation/stac/stac.py +31 -341
  18. eotdl/datasets/__init__.py +2 -2
  19. eotdl/datasets/ingest.py +36 -161
  20. eotdl/datasets/retrieve.py +0 -9
  21. eotdl/datasets/stage.py +64 -0
  22. eotdl/files/__init__.py +0 -2
  23. eotdl/files/ingest.bck +178 -0
  24. eotdl/files/ingest.py +237 -166
  25. eotdl/{datasets → files}/metadata.py +16 -17
  26. eotdl/models/__init__.py +1 -1
  27. eotdl/models/ingest.py +35 -158
  28. eotdl/models/stage.py +63 -0
  29. eotdl/repos/APIRepo.py +1 -1
  30. eotdl/repos/DatasetsAPIRepo.py +56 -43
  31. eotdl/repos/FilesAPIRepo.py +260 -167
  32. eotdl/repos/ModelsAPIRepo.py +50 -42
  33. eotdl/repos/STACAPIRepo.py +40 -0
  34. eotdl/repos/__init__.py +1 -0
  35. eotdl/tools/time_utils.py +3 -3
  36. {eotdl-2025.2.10.dist-info → eotdl-2025.4.2.dist-info}/METADATA +1 -1
  37. eotdl-2025.4.2.dist-info/RECORD +66 -0
  38. eotdl/curation/stac/assets.py +0 -110
  39. eotdl/curation/stac/dataframe.py +0 -172
  40. eotdl/curation/stac/dataframe_bck.py +0 -253
  41. eotdl/curation/stac/dataframe_labeling.py +0 -63
  42. eotdl/curation/stac/extensions/__init__.py +0 -23
  43. eotdl/curation/stac/extensions/base.py +0 -30
  44. eotdl/curation/stac/extensions/dem.py +0 -18
  45. eotdl/curation/stac/extensions/eo.py +0 -117
  46. eotdl/curation/stac/extensions/label/__init__.py +0 -7
  47. eotdl/curation/stac/extensions/label/base.py +0 -136
  48. eotdl/curation/stac/extensions/label/image_name_labeler.py +0 -203
  49. eotdl/curation/stac/extensions/label/scaneo.py +0 -219
  50. eotdl/curation/stac/extensions/ml_dataset.py +0 -648
  51. eotdl/curation/stac/extensions/projection.py +0 -44
  52. eotdl/curation/stac/extensions/raster.py +0 -53
  53. eotdl/curation/stac/extensions/sar.py +0 -55
  54. eotdl/curation/stac/extent.py +0 -158
  55. eotdl/curation/stac/parsers.py +0 -61
  56. eotdl/datasets/download.py +0 -104
  57. eotdl/files/list_files.py +0 -13
  58. eotdl/models/metadata.py +0 -43
  59. eotdl-2025.2.10.dist-info/RECORD +0 -81
  60. {eotdl-2025.2.10.dist-info → eotdl-2025.4.2.dist-info}/WHEEL +0 -0
  61. {eotdl-2025.2.10.dist-info → eotdl-2025.4.2.dist-info}/entry_points.txt +0 -0
eotdl/files/ingest.bck ADDED
@@ -0,0 +1,178 @@
+ from pathlib import Path
+ import os
+ from tqdm import tqdm
+ import zipfile
+ import io
+ from glob import glob
+ import os
+
+ from ..repos import FilesAPIRepo
+ from ..shared import calculate_checksum
+
+
+ def retrieve_files(folder):
+     # get all files in directory recursively
+     items = [Path(item) for item in glob(str(folder) + "/**/*", recursive=True)]
+     if not any(item.name == "metadata.yml" for item in items) and not any(
+         item.name == "README.md" for item in items
+     ):
+         raise Exception("README.md not found in directory")
+     # remove metadata files
+     items = [item for item in items if item.name != "metadata.yml"]
+     items = [item for item in items if item.name != "README.md"]
+     # remove directories
+     items = [item for item in items if not item.is_dir()]
+     if len(items) == 0:
+         raise Exception("No files found in directory")
+     return items
+
+
+ def prepare_item(item, folder):
+     return {
+         "filename": item.name,
+         "path": str(item.relative_to(folder)),
+         "absolute_path": item.absolute(),
+         "size": os.path.getsize(item.absolute()),
+         "checksum": calculate_checksum(item.absolute()),
+     }
+
+
+ def generate_batches(files, max_batch_size=1024 * 1024 * 10, max_batch_files=10):
+     batches = []
+     for item in tqdm(files):
+         if not batches:
+             batches.append([item])
+             continue
+         if max_batch_size:
+             size_check = sum([i["size"] for i in batches[-1]]) < max_batch_size
+         else:
+             size_check = True
+         if size_check and len(batches[-1]) < max_batch_files:
+             batches[-1].append(item)
+         else:
+             batches.append([item])
+     return batches
+
+
+ def compress_batch(batch):
+     memory_file = io.BytesIO()
+     with zipfile.ZipFile(memory_file, "w") as zf:
+         for f in batch:
+             zf.write(f["absolute_path"], arcname=f["path"])
+     memory_file.seek(0)
+     return memory_file
+
+
+ def generate_files_lists(
+     items, folder, dataset_or_model_id, endpoint, logger, max_size=1024 * 1024 * 16
+ ):
+     files_repo = FilesAPIRepo()
+     current_files, error = files_repo.retrieve_files(dataset_or_model_id, endpoint)
+     # print(len(current_files), len(items) - len(current_files))
+     # print(current_files, error)
+     if error:
+         current_files = []
+     # generate list of files to upload
+     logger("generating list of files to upload...")
+     upload_files, existing_files, large_files = [], [], []
+     current_names = [f["filename"] for f in current_files]
+     current_checksums = [f["checksum"] for f in current_files]
+     for item in tqdm(items):
+         data = prepare_item(item, folder)
+         if data["path"] in current_names and data["checksum"] in current_checksums:
+             existing_files.append(data)
+         else:
+             if data["size"] > max_size:
+                 large_files.append(data)
+             else:
+                 upload_files.append(data)
+     # TODO: should ingest new version if files removed
+     if len(upload_files) == 0 and len(large_files) == 0:
+         raise Exception("No new files to upload")
+     return upload_files, existing_files, large_files
+
+
+ def create_new_version(repo, dataset_or_model_id, user):
+     data, error = repo.create_version(dataset_or_model_id, user)
+     if error:
+         raise Exception(error)
+     return data["version"]
+
+
+ def ingest_files(repo, dataset_or_model_id, folder, verbose, logger, user, endpoint):
+     files_repo = FilesAPIRepo()
+     logger(f"Uploading directory {folder}...")
+     items = retrieve_files(folder)
+     # retrieve files
+     upload_files, existing_files, large_files = generate_files_lists(
+         items, folder, dataset_or_model_id, endpoint, logger
+     )
+     logger(f"{len(upload_files) + len(large_files)} new files will be ingested")
+     logger(f"{len(existing_files)} files already exist in dataset")
+     logger(f"{len(large_files)} large files will be ingested separately")
+     # create new version
+     version = create_new_version(repo, dataset_or_model_id, user)
+     logger("New version created, version: " + str(version))
+     # ingest new large files
+     if len(large_files) > 0:
+         logger("ingesting large files...")
+         for file in large_files:
+             logger("ingesting file: " + file["path"])
+             upload_id, parts = files_repo.prepare_large_upload(
+                 file["path"],
+                 dataset_or_model_id,
+                 file["checksum"],
+                 user,
+                 endpoint,
+             )
+             # print(upload_id, parts)
+             files_repo.ingest_large_file(
+                 file["absolute_path"],
+                 file["size"],
+                 upload_id,
+                 user,
+                 parts,
+                 endpoint,
+             )
+             data, error = files_repo.complete_upload(user, upload_id, version, endpoint)
+     # ingest new small files in batches
+     if len(upload_files) > 0:
+         logger("generating batches...")
+         batches = generate_batches(upload_files)
+         logger(
+             f"Uploading {len(upload_files)} small files in {len(batches)} batches..."
+         )
+         repo = FilesAPIRepo()
+         for batch in tqdm(
+             batches, desc="Uploading batches", unit="batches", disable=verbose
+         ):
+             # compress batch
+             memory_file = compress_batch(batch)
+             # ingest batch
+             data, error = repo.ingest_files_batch(
+                 memory_file,
+                 [f["checksum"] for f in batch],
+                 dataset_or_model_id,
+                 user,
+                 endpoint,
+                 version,
+             )
+     # ingest existing files
+     if len(existing_files) > 0:
+         batches = generate_batches(existing_files, max_batch_size=None)
+         for batch in tqdm(
+             batches,
+             desc="Ingesting existing files",
+             unit="batches",
+             disable=verbose,
+         ):
+             data, error = files_repo.add_files_batch_to_version(
+                 batch,
+                 dataset_or_model_id,
+                 version,
+                 user,
+                 endpoint,
+             )
+             if error:
+                 raise Exception(error)
+     return data
eotdl/files/ingest.py CHANGED
@@ -1,178 +1,249 @@
+ from glob import glob
  from pathlib import Path
+ import geopandas as gpd
  import os
+ import pystac
  from tqdm import tqdm
- import zipfile
- import io
- from glob import glob
- import os
+ import stac_geoparquet
+ import frontmatter
+ import random
+ import pandas as pd
+ from datetime import datetime
+ from shapely.geometry import Polygon

+ from ..auth import with_auth
+ from ..files.metadata import Metadata
  from ..repos import FilesAPIRepo
  from ..shared import calculate_checksum

-
- def retrieve_files(folder):
-     # get all files in directory recursively
-     items = [Path(item) for item in glob(str(folder) + "/**/*", recursive=True)]
-     if not any(item.name == "metadata.yml" for item in items) and not any(
-         item.name == "README.md" for item in items
-     ):
-         raise Exception("README.md not found in directory")
-     # remove metadata files
-     items = [item for item in items if item.name != "metadata.yml"]
-     items = [item for item in items if item.name != "README.md"]
-     # remove directories
-     items = [item for item in items if not item.is_dir()]
-     if len(items) == 0:
-         raise Exception("No files found in directory")
-     return items
-
-
- def prepare_item(item, folder):
-     return {
-         "filename": item.name,
-         "path": str(item.relative_to(folder)),
-         "absolute_path": item.absolute(),
-         "size": os.path.getsize(item.absolute()),
-         "checksum": calculate_checksum(item.absolute()),
-     }
-
-
- def generate_batches(files, max_batch_size=1024 * 1024 * 10, max_batch_files=10):
-     batches = []
-     for item in tqdm(files):
-         if not batches:
-             batches.append([item])
-             continue
-         if max_batch_size:
-             size_check = sum([i["size"] for i in batches[-1]]) < max_batch_size
-         else:
-             size_check = True
-         if size_check and len(batches[-1]) < max_batch_files:
-             batches[-1].append(item)
-         else:
-             batches.append([item])
-     return batches
-
-
- def compress_batch(batch):
-     memory_file = io.BytesIO()
-     with zipfile.ZipFile(memory_file, "w") as zf:
-         for f in batch:
-             zf.write(f["absolute_path"], arcname=f["path"])
-     memory_file.seek(0)
-     return memory_file
-
-
- def generate_files_lists(
-     items, folder, dataset_or_model_id, endpoint, logger, max_size=1024 * 1024 * 16
+ def prep_ingest_folder(
+     folder,
+     verbose=False,
+     logger=print,
+     force_metadata_update=False,
+     sync_metadata=False,
  ):
-     files_repo = FilesAPIRepo()
-     current_files, error = files_repo.retrieve_files(dataset_or_model_id, endpoint)
-     # print(len(current_files), len(items) - len(current_files))
-     # print(current_files, error)
-     if error:
-         current_files = []
-     # generate list of files to upload
-     logger("generating list of files to upload...")
-     upload_files, existing_files, large_files = [], [], []
-     current_names = [f["filename"] for f in current_files]
-     current_checksums = [f["checksum"] for f in current_files]
-     for item in tqdm(items):
-         data = prepare_item(item, folder)
-         if data["path"] in current_names and data["checksum"] in current_checksums:
-             existing_files.append(data)
-         else:
-             if data["size"] > max_size:
-                 large_files.append(data)
-             else:
-                 upload_files.append(data)
-     # TODO: should ingest new version if files removed
-     if len(upload_files) == 0 and len(large_files) == 0:
-         raise Exception("No new files to upload")
-     return upload_files, existing_files, large_files
+     logger("Ingesting directory: " + str(folder))
+     catalog_path = folder.joinpath("catalog.parquet")
+     files = glob(str(folder) + '/**/*', recursive=True)
+     # remove catalog.parquet from files
+     files = [f for f in files if f != str(catalog_path)]
+     # ingest geometry from files (if tifs) or additional list of geometries
+     # https://stac-utils.github.io/stac-geoparquet/latest/spec/stac-geoparquet-spec/#use-cases
+     data = []
+     for file in files:
+         file_path = Path(file)
+         if file_path.is_file():
+             relative_path = os.path.relpath(file_path, catalog_path.parent)
+             absolute_path = str(file_path)
+             # THIS IS THE MINIMUM REQUIRED FIELDS TO CREATE A VALID STAC ITEM
+             data.append(create_stac_item(relative_path, absolute_path))
+     gdf = gpd.GeoDataFrame(data, geometry='geometry')
+     # Save to parquet
+     gdf.to_parquet(catalog_path)
+     return catalog_path

+ # IF THE KEYS IN THE ASSETS ARE NOT THE SAME ON ALL ITEMS, THE PARQUET WILL NOT BE VALID !!!
+ def prep_ingest_stac(path, logger=None): # in theory should work with a remote catalog (given URL)
+     # read stac catalog
+     stac_catalog = path / "catalog.json"
+     catalog = pystac.Catalog.from_file(stac_catalog)
+     # make all items paths hredf in assets absolute
+     catalog.make_all_asset_hrefs_absolute()
+     # generate list of items for all collections
+     items = []
+     for collection in catalog.get_collections():
+         # iterate over items
+         for item in tqdm(collection.get_items(), desc=f"Ingesting items from collection {collection.id}"):
+             assert isinstance(item, pystac.Item)
+             # Process each asset in the item
+             for asset in item.assets.values():
+                 if not asset.href.startswith(('http://', 'https://')):
+                     # Asset is a local file
+                     file_path = Path(asset.href)
+                     # Calculate and add file size
+                     asset.extra_fields['size'] = file_path.stat().st_size
+                     # Calculate and add checksum
+                     asset.extra_fields['checksum'] = calculate_checksum(str(file_path))
+             items.append(item)
+     # save parquet file
+     record_batch_reader = stac_geoparquet.arrow.parse_stac_items_to_arrow(items)
+     output_path = stac_catalog.parent / "catalog.parquet"
+     stac_geoparquet.arrow.to_parquet(record_batch_reader, output_path)
+     return output_path

- def create_new_version(repo, dataset_or_model_id, user):
-     data, error = repo.create_version(dataset_or_model_id, user)
-     if error:
-         raise Exception(error)
-     return data["version"]
+ def ingest_virtual( # could work for a list of paths with minimal changes...
+     path,
+     links,
+     repo,
+     retrieve,
+     mode,
+     metadata = None,
+     logger=print,
+ ):
+     path = Path(path)
+     if metadata is None:
+         readme = frontmatter.load(path.joinpath("README.md"))
+         metadata_dict = readme.metadata
+         # Add description from content before creating Metadata object
+         metadata_dict["description"] = readme.content
+         metadata = Metadata(**metadata_dict)
+     else:
+         metadata = Metadata(**metadata)
+     metadata.save_metadata(path)
+     data = []
+     for link in links:
+         assert link.startswith("http"), "All links must start with http or https"
+         data.append(create_stac_item(link, link))
+     data.append(create_stac_item('README.md', str(path / "README.md")))
+     gdf = gpd.GeoDataFrame(data, geometry='geometry')
+     gdf.to_parquet(path / "catalog.parquet")
+     return ingest(path, repo, retrieve, mode)

+ @with_auth
+ def ingest(path, repo, retrieve, mode, user):
+     try:
+         readme = frontmatter.load(path.joinpath("README.md"))
+         metadata_dict = readme.metadata
+         # Add description from content before creating Metadata object
+         metadata_dict["description"] = readme.content
+         metadata = Metadata(**metadata_dict)
+     except Exception as e:
+         print(str(e))
+         raise Exception("Error loading metadata")
+     # retrieve dataset (create if doesn't exist)
+     dataset_or_model = retrieve(metadata, user)
+     current_version = sorted([v['version_id'] for v in dataset_or_model["versions"]])[-1]
+     # TODO: update README if metadata changed in UI (db)
+     # update_metadata = True
+     # if "description" in dataset:
+     #     # do not do this if the dataset is new, only if it already exists
+     #     update_metadata = check_metadata(
+     #         dataset, metadata, content, force_metadata_update, sync_metadata, folder
+     #     )
+     # if update_metadata:
+     #     update_dataset(dataset["id"], metadata, content, user)
+     # return ingest_files(
+     #     repo, dataset["id"], folder, verbose, logger, user, endpoint="datasets"
+     # )
+     catalog_path = path.joinpath("catalog.parquet")
+     gdf = gpd.read_parquet(catalog_path)
+     files_repo = FilesAPIRepo()
+     catalog_url = files_repo.generate_presigned_url(f'catalog.v{current_version}.parquet', dataset_or_model['id'], user, endpoint=mode)
+     # first time ingesting
+     if catalog_url is None:
+         total_size = 0
+         for row in tqdm(gdf.iterrows(), total=len(gdf), desc="Ingesting files"):
+             try:
+                 for k, v in row[1]["assets"].items():
+                     if v["href"].startswith("http"): continue
+                     item_id = row[1]["id"]
+                     data, error = files_repo.ingest_file(
+                         v["href"],
+                         item_id,
+                         # Path(v["href"]).stat().st_size,
+                         dataset_or_model['id'],
+                         user,
+                         mode,
+                     )
+                     if error:
+                         raise Exception(error)
+                     file_url = f"{repo.url}{mode}/{dataset_or_model['id']}/stage/{item_id}"
+                     gdf.loc[row[0], "assets"][k]["href"] = file_url
+                     total_size += v["size"]
+             except Exception as e:
+                 print(f"Error uploading asset {row[0]}: {e}")
+                 break
+         gdf.to_parquet(catalog_path)
+         files_repo.ingest_file(str(catalog_path), f'catalog.v{current_version}.parquet', dataset_or_model['id'], user, mode)
+         data, error = repo.complete_ingestion(dataset_or_model['id'], current_version, total_size, user)
+         if error:
+             raise Exception(error)
+         return catalog_path
+
+     # files were already ingested
+     # TODO: check for deleted files (currently only updating existing files and ingesting new ones)
+     # TODO: adding new links in virtual datasets dont trigger new version (but changing README does)
+     new_version = False
+     num_changes = 0
+     total_size = 0
+     for row in tqdm(gdf.iterrows(), total=len(gdf), desc="Ingesting files"):
+         try:
+             for k, v in row[1]["assets"].items():
+                 if v["href"].startswith("http"): continue
+                 item_id = row[1]["id"]
+                 # check if file exists in previous versions
+                 df = pd.read_parquet(
+                     path=catalog_url,
+                     filters=[('id', '=', item_id)]
+                 )
+                 if len(df) > 0: # file exists in previous versions
+                     if df.iloc[0]['assets'][k]["checksum"] == v["checksum"]: # file is the same
+                         # still need to update the required fields
+                         file_url = f"{repo.url}{mode}/{dataset_or_model['id']}/stage/{item_id}"
+                         gdf.loc[row[0], "assets"][k]["href"] = file_url
+                         total_size += v["size"]
+                         continue
+                     else: # file is different, so ingest new version but with a different id
+                         item_id = item_id + f"-{random.randint(1, 1000000)}"
+                         gdf.loc[row[0], "id"] = item_id
+                 new_version = True
+                 num_changes += 1
+                 # ingest new files
+                 data, error = files_repo.ingest_file(
+                     v["href"],
+                     item_id, # item id, will be path in local or given id in STAC. if not unique, will overwrite previous file in storage
+                     # Path(v["href"]).stat().st_size,
+                     dataset_or_model['id'],
+                     user,
+                     # calculate_checksum(asset["href"]), # is always absolute?
+                     mode,
+                     # version,
+                 )
+                 if error:
+                     raise Exception(error)
+                 file_url = f"{repo.url}{mode}/{dataset_or_model['id']}/stage/{item_id}"
+                 gdf.loc[row[0], "assets"][k]["href"] = file_url
+                 total_size += v["size"]
+         except Exception as e:
+             print(f"Error uploading asset {row[0]}: {e}")
+             break
+     if not new_version:
+         print("No new version was created, your dataset has not changed.")
+     else:
+         new_version = current_version + 1
+         print("A new version was created, your dataset has changed.")
+         print(f"Num changes: {num_changes}")
+         gdf.to_parquet(catalog_path)
+         files_repo.ingest_file(str(catalog_path), f'catalog.v{new_version}.parquet', dataset_or_model['id'], user, mode)
+         # TODO: ingest README.md
+         data, error = repo.complete_ingestion(dataset_or_model['id'], new_version, total_size, user)
+         if error:
+             raise Exception(error)
+     return catalog_path

- def ingest_files(repo, dataset_or_model_id, folder, verbose, logger, user, endpoint):
-     files_repo = FilesAPIRepo()
-     logger(f"Uploading directory {folder}...")
-     items = retrieve_files(folder)
-     # retrieve files
-     upload_files, existing_files, large_files = generate_files_lists(
-         items, folder, dataset_or_model_id, endpoint, logger
-     )
-     logger(f"{len(upload_files) + len(large_files)} new files will be ingested")
-     logger(f"{len(existing_files)} files already exist in dataset")
-     logger(f"{len(large_files)} large files will be ingested separately")
-     # create new version
-     version = create_new_version(repo, dataset_or_model_id, user)
-     logger("New version created, version: " + str(version))
-     # ingest new large files
-     if len(large_files) > 0:
-         logger("ingesting large files...")
-         for file in large_files:
-             logger("ingesting file: " + file["path"])
-             upload_id, parts = files_repo.prepare_large_upload(
-                 file["path"],
-                 dataset_or_model_id,
-                 file["checksum"],
-                 user,
-                 endpoint,
-             )
-             # print(upload_id, parts)
-             files_repo.ingest_large_file(
-                 file["absolute_path"],
-                 file["size"],
-                 upload_id,
-                 user,
-                 parts,
-                 endpoint,
-             )
-             data, error = files_repo.complete_upload(user, upload_id, version, endpoint)
-     # ingest new small files in batches
-     if len(upload_files) > 0:
-         logger("generating batches...")
-         batches = generate_batches(upload_files)
-         logger(
-             f"Uploading {len(upload_files)} small files in {len(batches)} batches..."
-         )
-         repo = FilesAPIRepo()
-         for batch in tqdm(
-             batches, desc="Uploading batches", unit="batches", disable=verbose
-         ):
-             # compress batch
-             memory_file = compress_batch(batch)
-             # ingest batch
-             data, error = repo.ingest_files_batch(
-                 memory_file,
-                 [f["checksum"] for f in batch],
-                 dataset_or_model_id,
-                 user,
-                 endpoint,
-                 version,
-             )
-     # ingest existing files
-     if len(existing_files) > 0:
-         batches = generate_batches(existing_files, max_batch_size=None)
-         for batch in tqdm(
-             batches,
-             desc="Ingesting existing files",
-             unit="batches",
-             disable=verbose,
-         ):
-             data, error = files_repo.add_files_batch_to_version(
-                 batch,
-                 dataset_or_model_id,
-                 version,
-                 user,
-                 endpoint,
-             )
-             if error:
-                 raise Exception(error)
-     return data
+ def create_stac_item(item_id, asset_href):
+     return {
+         'type': 'Feature',
+         'stac_version': '1.0.0',
+         'stac_extensions': [],
+         'datetime': datetime.now(), # must be native timestamp (https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#timestamp)
+         'id': item_id,
+         'bbox': {
+             'xmin': 0.0,
+             'ymin': 0.0,
+             'xmax': 0.0,
+             'ymax': 0.0
+         }, # infer from file or from list of geometries
+         'geometry': Polygon(), # empty polygon
+         'assets': { 'asset': { # STAC needs this to be a Dict[str, Asset], not list !!! use same key or parquet breaks !!!
+             'href': asset_href,
+             'checksum': calculate_checksum(asset_href) if not asset_href.startswith("http") else None,
+             'timestamp': datetime.now(),
+             'size': Path(asset_href).stat().st_size if not asset_href.startswith("http") else None,
+         }},
+         "links": [],
+         # 'collection': 'source',
+         # anything below are properties (need at least one!)
+         'repository': 'eotdl',
+     }
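
The replacement flow no longer zips batches: it builds a STAC-geoparquet catalog.parquet, uploads each asset through FilesAPIRepo.ingest_file, and rewrites asset hrefs to the API's /stage/ URLs. Below is a minimal, self-contained sketch of the catalog-building step, mirroring create_stac_item() and ingest_virtual() above; item fields are trimmed, the URLs are placeholders, and it does not call the EOTDL API (real ingestion also fills checksum and size for local files):

from datetime import datetime

import geopandas as gpd
from shapely.geometry import Polygon


def make_item(item_id, asset_href):
    # Trimmed stand-in for create_stac_item() above: one fixed asset key, empty geometry,
    # zero bbox, and at least one extra property ("repository").
    return {
        "type": "Feature",
        "stac_version": "1.0.0",
        "datetime": datetime.now(),
        "id": item_id,
        "bbox": {"xmin": 0.0, "ymin": 0.0, "xmax": 0.0, "ymax": 0.0},
        "geometry": Polygon(),  # empty polygon, as in the source
        "assets": {"asset": {"href": asset_href, "timestamp": datetime.now(),
                             "size": 0, "checksum": ""}},  # placeholder values
        "repository": "eotdl",
    }


links = ["https://example.com/scene1.tif", "https://example.com/scene2.tif"]  # placeholder links
gdf = gpd.GeoDataFrame([make_item(link, link) for link in links], geometry="geometry")
gdf.to_parquet("catalog.parquet")  # the file that ingest() later uploads as catalog.v{N}.parquet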
eotdl/{datasets → files}/metadata.py RENAMED
@@ -1,13 +1,14 @@
  from pydantic import BaseModel, validator
  from typing import List, Optional
  from pathlib import Path
-
+ import os

  class Metadata(BaseModel):
      authors: List[str]
      license: str
      source: str
      name: str
+     description: str
      thumbnail: Optional[str] = ""

      # validate source is a URL
@@ -27,19 +28,17 @@ class Metadata(BaseModel):
          return v


- def generate_metadata(download_path, dataset):
-     with open(download_path + "/README.md", "w") as f:
-         f.write("---\n")
-         f.write(f"name: {dataset['name']}\n")
-         f.write(f"license: {dataset['license']}\n")
-         f.write(f"source: {dataset['source']}\n")
-         f.write(f"thumbnail: {dataset['thumbnail']}\n")
-         f.write(f"authors:\n")
-         for author in dataset["authors"]:
-             f.write(f" - {author}\n")
-         f.write("---\n")
-         f.write(dataset["description"])
-     # remove metadata.yml if exists
-     if Path(download_path + "/metadata.yml").exists():
-         Path(download_path + "/metadata.yml").unlink()
-     return download_path + "/README.md"
+     def save_metadata(self, dst_path):
+         os.makedirs(dst_path, exist_ok=True)
+         with open(Path(dst_path) / "README.md", "w") as f:
+             f.write("---\n")
+             f.write(f"name: {self.name}\n")
+             f.write(f"license: {self.license}\n")
+             f.write(f"source: {self.source}\n")
+             f.write(f"thumbnail: {self.thumbnail}\n")
+             f.write(f"authors:\n")
+             for author in self.authors:
+                 f.write(f" - {author}\n")
+             f.write("---\n")
+             f.write(self.description)
+         return str(Path(dst_path) / "README.md")
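
The Metadata model now carries the description and round-trips it through README frontmatter, replacing the old module-level generate_metadata helper. A sketch of that round-trip, assuming the class is importable from eotdl.files.metadata as the new ingest.py does; the field values are illustrative, and only the source-URL validator is visible in this diff, so the model may impose further constraints:

import frontmatter

from eotdl.files.metadata import Metadata

metadata = Metadata(
    name="my-dataset",
    authors=["Jane Doe"],
    license="MIT",
    source="https://example.com/my-dataset",  # the validator requires a URL
    description="A toy dataset used to illustrate the metadata round-trip.",
)
readme_path = metadata.save_metadata("my-dataset")  # writes my-dataset/README.md

readme = frontmatter.load(readme_path)  # what ingest() does when loading the folder
assert readme.metadata["name"] == "my-dataset"
assert readme.content.startswith("A toy dataset")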
eotdl/models/__init__.py CHANGED
@@ -1,3 +1,3 @@
  from .retrieve import retrieve_models, retrieve_model, retrieve_model_files
  from .ingest import ingest_model
- from .download import download_model
+ from .stage import stage_model, stage_model_file
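
As elsewhere in this release (compare eotdl/datasets/stage.py and eotdl/models/stage.py in the file list, and the removed download modules), the public "download" entry points are renamed to "stage". For callers, only the import changes; the stage_* signatures are not shown in this diff:

# Old (2025.2.10):
# from eotdl.models import download_model
# New (2025.4.2):
from eotdl.models import stage_model, stage_model_file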