eotdl 2024.10.7-py3-none-any.whl → 2025.3.25-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
- eotdl/__init__.py +1 -1
- eotdl/access/search.py +0 -2
- eotdl/access/sentinelhub/parameters.py +1 -1
- eotdl/cli.py +2 -2
- eotdl/commands/datasets.py +28 -31
- eotdl/commands/models.py +27 -30
- eotdl/commands/stac.py +57 -0
- eotdl/curation/__init__.py +0 -8
- eotdl/curation/stac/__init__.py +1 -8
- eotdl/curation/stac/api.py +58 -0
- eotdl/curation/stac/stac.py +31 -341
- eotdl/datasets/__init__.py +1 -1
- eotdl/datasets/ingest.py +28 -159
- eotdl/datasets/retrieve.py +0 -9
- eotdl/datasets/stage.py +64 -0
- eotdl/files/__init__.py +0 -2
- eotdl/files/ingest.bck +178 -0
- eotdl/files/ingest.py +229 -164
- eotdl/{datasets → files}/metadata.py +16 -17
- eotdl/models/__init__.py +1 -1
- eotdl/models/ingest.py +28 -159
- eotdl/models/stage.py +60 -0
- eotdl/repos/APIRepo.py +1 -1
- eotdl/repos/DatasetsAPIRepo.py +56 -43
- eotdl/repos/FilesAPIRepo.py +260 -167
- eotdl/repos/STACAPIRepo.py +40 -0
- eotdl/repos/__init__.py +1 -0
- eotdl/tools/geo_utils.py +7 -2
- {eotdl-2024.10.7.dist-info → eotdl-2025.3.25.dist-info}/METADATA +5 -4
- eotdl-2025.3.25.dist-info/RECORD +65 -0
- {eotdl-2024.10.7.dist-info → eotdl-2025.3.25.dist-info}/WHEEL +1 -1
- eotdl/curation/stac/assets.py +0 -110
- eotdl/curation/stac/dataframe.py +0 -172
- eotdl/curation/stac/dataframe_bck.py +0 -253
- eotdl/curation/stac/dataframe_labeling.py +0 -63
- eotdl/curation/stac/extensions/__init__.py +0 -23
- eotdl/curation/stac/extensions/base.py +0 -30
- eotdl/curation/stac/extensions/dem.py +0 -18
- eotdl/curation/stac/extensions/eo.py +0 -117
- eotdl/curation/stac/extensions/label/__init__.py +0 -7
- eotdl/curation/stac/extensions/label/base.py +0 -136
- eotdl/curation/stac/extensions/label/image_name_labeler.py +0 -203
- eotdl/curation/stac/extensions/label/scaneo.py +0 -219
- eotdl/curation/stac/extensions/ml_dataset.py +0 -648
- eotdl/curation/stac/extensions/projection.py +0 -44
- eotdl/curation/stac/extensions/raster.py +0 -53
- eotdl/curation/stac/extensions/sar.py +0 -55
- eotdl/curation/stac/extent.py +0 -158
- eotdl/curation/stac/parsers.py +0 -61
- eotdl/datasets/download.py +0 -104
- eotdl/files/list_files.py +0 -13
- eotdl/models/download.py +0 -101
- eotdl/models/metadata.py +0 -43
- eotdl/wrappers/utils.py +0 -35
- eotdl-2024.10.7.dist-info/RECORD +0 -82
- {eotdl-2024.10.7.dist-info → eotdl-2025.3.25.dist-info}/entry_points.txt +0 -0
eotdl/files/ingest.bck
ADDED
@@ -0,0 +1,178 @@
+from pathlib import Path
+import os
+from tqdm import tqdm
+import zipfile
+import io
+from glob import glob
+import os
+
+from ..repos import FilesAPIRepo
+from ..shared import calculate_checksum
+
+
+def retrieve_files(folder):
+    # get all files in directory recursively
+    items = [Path(item) for item in glob(str(folder) + "/**/*", recursive=True)]
+    if not any(item.name == "metadata.yml" for item in items) and not any(
+        item.name == "README.md" for item in items
+    ):
+        raise Exception("README.md not found in directory")
+    # remove metadata files
+    items = [item for item in items if item.name != "metadata.yml"]
+    items = [item for item in items if item.name != "README.md"]
+    # remove directories
+    items = [item for item in items if not item.is_dir()]
+    if len(items) == 0:
+        raise Exception("No files found in directory")
+    return items
+
+
+def prepare_item(item, folder):
+    return {
+        "filename": item.name,
+        "path": str(item.relative_to(folder)),
+        "absolute_path": item.absolute(),
+        "size": os.path.getsize(item.absolute()),
+        "checksum": calculate_checksum(item.absolute()),
+    }
+
+
+def generate_batches(files, max_batch_size=1024 * 1024 * 10, max_batch_files=10):
+    batches = []
+    for item in tqdm(files):
+        if not batches:
+            batches.append([item])
+            continue
+        if max_batch_size:
+            size_check = sum([i["size"] for i in batches[-1]]) < max_batch_size
+        else:
+            size_check = True
+        if size_check and len(batches[-1]) < max_batch_files:
+            batches[-1].append(item)
+        else:
+            batches.append([item])
+    return batches
+
+
+def compress_batch(batch):
+    memory_file = io.BytesIO()
+    with zipfile.ZipFile(memory_file, "w") as zf:
+        for f in batch:
+            zf.write(f["absolute_path"], arcname=f["path"])
+    memory_file.seek(0)
+    return memory_file
+
+
+def generate_files_lists(
+    items, folder, dataset_or_model_id, endpoint, logger, max_size=1024 * 1024 * 16
+):
+    files_repo = FilesAPIRepo()
+    current_files, error = files_repo.retrieve_files(dataset_or_model_id, endpoint)
+    # print(len(current_files), len(items) - len(current_files))
+    # print(current_files, error)
+    if error:
+        current_files = []
+    # generate list of files to upload
+    logger("generating list of files to upload...")
+    upload_files, existing_files, large_files = [], [], []
+    current_names = [f["filename"] for f in current_files]
+    current_checksums = [f["checksum"] for f in current_files]
+    for item in tqdm(items):
+        data = prepare_item(item, folder)
+        if data["path"] in current_names and data["checksum"] in current_checksums:
+            existing_files.append(data)
+        else:
+            if data["size"] > max_size:
+                large_files.append(data)
+            else:
+                upload_files.append(data)
+    # TODO: should ingest new version if files removed
+    if len(upload_files) == 0 and len(large_files) == 0:
+        raise Exception("No new files to upload")
+    return upload_files, existing_files, large_files
+
+
+def create_new_version(repo, dataset_or_model_id, user):
+    data, error = repo.create_version(dataset_or_model_id, user)
+    if error:
+        raise Exception(error)
+    return data["version"]
+
+
+def ingest_files(repo, dataset_or_model_id, folder, verbose, logger, user, endpoint):
+    files_repo = FilesAPIRepo()
+    logger(f"Uploading directory {folder}...")
+    items = retrieve_files(folder)
+    # retrieve files
+    upload_files, existing_files, large_files = generate_files_lists(
+        items, folder, dataset_or_model_id, endpoint, logger
+    )
+    logger(f"{len(upload_files) + len(large_files)} new files will be ingested")
+    logger(f"{len(existing_files)} files already exist in dataset")
+    logger(f"{len(large_files)} large files will be ingested separately")
+    # create new version
+    version = create_new_version(repo, dataset_or_model_id, user)
+    logger("New version created, version: " + str(version))
+    # ingest new large files
+    if len(large_files) > 0:
+        logger("ingesting large files...")
+        for file in large_files:
+            logger("ingesting file: " + file["path"])
+            upload_id, parts = files_repo.prepare_large_upload(
+                file["path"],
+                dataset_or_model_id,
+                file["checksum"],
+                user,
+                endpoint,
+            )
+            # print(upload_id, parts)
+            files_repo.ingest_large_file(
+                file["absolute_path"],
+                file["size"],
+                upload_id,
+                user,
+                parts,
+                endpoint,
+            )
+            data, error = files_repo.complete_upload(user, upload_id, version, endpoint)
+    # ingest new small files in batches
+    if len(upload_files) > 0:
+        logger("generating batches...")
+        batches = generate_batches(upload_files)
+        logger(
+            f"Uploading {len(upload_files)} small files in {len(batches)} batches..."
+        )
+        repo = FilesAPIRepo()
+        for batch in tqdm(
+            batches, desc="Uploading batches", unit="batches", disable=verbose
+        ):
+            # compress batch
+            memory_file = compress_batch(batch)
+            # ingest batch
+            data, error = repo.ingest_files_batch(
+                memory_file,
+                [f["checksum"] for f in batch],
+                dataset_or_model_id,
+                user,
+                endpoint,
+                version,
+            )
+    # ingest existing files
+    if len(existing_files) > 0:
+        batches = generate_batches(existing_files, max_batch_size=None)
+        for batch in tqdm(
+            batches,
+            desc="Ingesting existing files",
+            unit="batches",
+            disable=verbose,
+        ):
+            data, error = files_repo.add_files_batch_to_version(
+                batch,
+                dataset_or_model_id,
+                version,
+                user,
+                endpoint,
+            )
+            if error:
+                raise Exception(error)
+    return data
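For reference, the batching rule in generate_batches above is greedy: a file joins the current batch only while that batch holds fewer than max_batch_files entries and its accumulated size is still below max_batch_size, so a batch may overshoot the size limit by the last file appended. A minimal standalone sketch of the same rule, using synthetic file sizes and no eotdl imports:

def batch_files(files, max_batch_size=1024 * 1024 * 10, max_batch_files=10):
    # greedy grouping: the size check runs before appending, so a batch can
    # exceed max_batch_size by one file (same behaviour as generate_batches)
    batches = []
    for item in files:
        if (
            batches
            and sum(f["size"] for f in batches[-1]) < max_batch_size
            and len(batches[-1]) < max_batch_files
        ):
            batches[-1].append(item)
        else:
            batches.append([item])
    return batches

files = [{"path": f"img_{i}.tif", "size": 3 * 1024 * 1024} for i in range(7)]
print([len(b) for b in batch_files(files)])  # [4, 3]: the 4th 3 MB file pushes the first batch past 10 MB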
eotdl/files/ingest.py
CHANGED
@@ -1,178 +1,243 @@
+from glob import glob
 from pathlib import Path
+import geopandas as gpd
 import os
+import pystac
 from tqdm import tqdm
-import zipfile
-import io
-from glob import glob
-import os
+import stac_geoparquet
+import frontmatter
+import random
+import pandas as pd
+from datetime import datetime
+from shapely.geometry import Polygon
 
+from ..auth import with_auth
+from ..files.metadata import Metadata
 from ..repos import FilesAPIRepo
 from ..shared import calculate_checksum
 
+def prep_ingest_folder(
+    folder,
+    verbose=False,
+    logger=print,
+    force_metadata_update=False,
+    sync_metadata=False,
+):
+    logger("Ingesting directory: " + str(folder))
+    catalog_path = folder.joinpath("catalog.parquet")
+    files = glob(str(folder) + '/**/*', recursive=True)
+    # remove catalog.parquet from files
+    files = [f for f in files if f != str(catalog_path)]
+    # ingest geometry from files (if tifs) or additional list of geometries
+    # https://stac-utils.github.io/stac-geoparquet/latest/spec/stac-geoparquet-spec/#use-cases
+    data = []
+    for file in files:
+        file_path = Path(file)
+        if file_path.is_file():
+            relative_path = os.path.relpath(file_path, catalog_path.parent)
+            absolute_path = str(file_path)
+            # THIS IS THE MINIMUM REQUIRED FIELDS TO CREATE A VALID STAC ITEM
+            data.append(create_stac_item(relative_path, absolute_path))
+    gdf = gpd.GeoDataFrame(data, geometry='geometry')
+    # Save to parquet
+    gdf.to_parquet(catalog_path)
+    return catalog_path
 
-def retrieve_files(folder):
-    # get all files in directory recursively
-    items = [Path(item) for item in glob(str(folder) + "/**/*", recursive=True)]
-    if not any(item.name == "metadata.yml" for item in items) and not any(
-        item.name == "README.md" for item in items
-    ):
-        raise Exception("README.md not found in directory")
-    # remove metadata files
-    items = [item for item in items if item.name != "metadata.yml"]
-    items = [item for item in items if item.name != "README.md"]
-    # remove directories
-    items = [item for item in items if not item.is_dir()]
-    if len(items) == 0:
-        raise Exception("No files found in directory")
-    return items
-
-
-def prepare_item(item, folder):
-    return {
-        "filename": item.name,
-        "path": str(item.relative_to(folder)),
-        "absolute_path": item.absolute(),
-        "size": os.path.getsize(item.absolute()),
-        "checksum": calculate_checksum(item.absolute()),
-    }
-
-
-def generate_batches(files, max_batch_size=1024 * 1024 * 10, max_batch_files=10):
-    batches = []
-    for item in tqdm(files):
-        if not batches:
-            batches.append([item])
-            continue
-        if max_batch_size:
-            size_check = sum([i["size"] for i in batches[-1]]) < max_batch_size
-        else:
-            size_check = True
-        if size_check and len(batches[-1]) < max_batch_files:
-            batches[-1].append(item)
-        else:
-            batches.append([item])
-    return batches
-
-
-def compress_batch(batch):
-    memory_file = io.BytesIO()
-    with zipfile.ZipFile(memory_file, "w") as zf:
-        for f in batch:
-            zf.write(f["absolute_path"], arcname=f["path"])
-    memory_file.seek(0)
-    return memory_file
-
+# IF THE KEYS IN THE ASSETS ARE NOT THE SAME ON ALL ITEMS, THE PARQUET WILL NOT BE VALID !!!
+def prep_ingest_stac(path, logger=None): # in theory should work with a remote catalog (given URL)
+    # read stac catalog
+    stac_catalog = path / "catalog.json"
+    catalog = pystac.Catalog.from_file(stac_catalog)
+    # make all items paths hredf in assets absolute
+    catalog.make_all_asset_hrefs_absolute()
+    # generate list of items for all collections
+    items = []
+    for collection in catalog.get_collections():
+        # iterate over items
+        for item in tqdm(collection.get_items(), desc=f"Ingesting items from collection {collection.id}"):
+            assert isinstance(item, pystac.Item)
+            items.append(item)
+    # save parquet file
+    record_batch_reader = stac_geoparquet.arrow.parse_stac_items_to_arrow(items)
+    output_path = stac_catalog.parent / "catalog.parquet"
+    stac_geoparquet.arrow.to_parquet(record_batch_reader, output_path)
+    return output_path
 
-def generate_files_lists(
-    items, folder, dataset_or_model_id, endpoint, logger, max_size=1024 * 1024 * 16
+@with_auth
+def ingest_virutal_dataset( # could work for a list of paths with minimal changes...
+    path,
+    links,
+    metadata = None,
+    logger=print,
+    user=None,
 ):
-    files_repo = FilesAPIRepo()
-    current_files, error = files_repo.retrieve_files(dataset_or_model_id, endpoint)
-    # print(len(current_files), len(items) - len(current_files))
-    # print(current_files, error)
-    if error:
-        current_files = []
-    # generate list of files to upload
-    logger("generating list of files to upload...")
-    upload_files, existing_files, large_files = [], [], []
-    current_names = [f["filename"] for f in current_files]
-    current_checksums = [f["checksum"] for f in current_files]
-    for item in tqdm(items):
-        data = prepare_item(item, folder)
-        if data["path"] in current_names and data["checksum"] in current_checksums:
-            existing_files.append(data)
-        else:
-            if data["size"] > max_size:
-                large_files.append(data)
-            else:
-                upload_files.append(data)
-    # TODO: should ingest new version if files removed
-    if len(upload_files) == 0 and len(large_files) == 0:
-        raise Exception("No new files to upload")
-    return upload_files, existing_files, large_files
+    path = Path(path)
+    if metadata is None:
+        readme = frontmatter.load(path.joinpath("README.md"))
+        metadata_dict = readme.metadata
+        # Add description from content before creating Metadata object
+        metadata_dict["description"] = readme.content
+        metadata = Metadata(**metadata_dict)
+    else:
+        metadata = Metadata(**metadata)
+    metadata.save_metadata(path)
+    data = []
+    for link in links:
+        assert link.startswith("http"), "All links must start with http or https"
+        data.append(create_stac_item(link, link))
+    data.append(create_stac_item('README.md', str(path / "README.md")))
+    gdf = gpd.GeoDataFrame(data, geometry='geometry')
+    gdf.to_parquet(path / "catalog.parquet")
+    return ingest(path)
+
+@with_auth
+def ingest(path, repo, retrieve, mode, user):
+    try:
+        readme = frontmatter.load(path.joinpath("README.md"))
+        metadata_dict = readme.metadata
+        # Add description from content before creating Metadata object
+        metadata_dict["description"] = readme.content
+        metadata = Metadata(**metadata_dict)
+    except Exception as e:
+        print(str(e))
+        raise Exception("Error loading metadata")
+    # retrieve dataset (create if doesn't exist)
+    dataset_or_model = retrieve(metadata, user)
+    current_version = sorted([v['version_id'] for v in dataset_or_model["versions"]])[-1]
+    print("current version: ", current_version)
 
+    # TODO: update README if metadata changed in UI (db)
+    # update_metadata = True
+    # if "description" in dataset:
+    #     # do not do this if the dataset is new, only if it already exists
+    #     update_metadata = check_metadata(
+    #         dataset, metadata, content, force_metadata_update, sync_metadata, folder
+    #     )
+    # if update_metadata:
+    #     update_dataset(dataset["id"], metadata, content, user)
+    # return ingest_files(
+    #     repo, dataset["id"], folder, verbose, logger, user, endpoint="datasets"
+    # )
 
-def create_new_version(repo, dataset_or_model_id, user):
-    data, error = repo.create_version(dataset_or_model_id, user)
-    if error:
-        raise Exception(error)
-    return data["version"]
+    catalog_path = path.joinpath("catalog.parquet")
+    gdf = gpd.read_parquet(catalog_path)
+    files_repo = FilesAPIRepo()
+    catalog_url = files_repo.generate_presigned_url(f'catalog.v{current_version}.parquet', dataset_or_model['id'], user)
 
+    # first time ingesting
+    if catalog_url is None:
+        total_size = 0
+        for row in tqdm(gdf.iterrows(), total=len(gdf), desc="Ingesting files"):
+            try:
+                for k, v in row[1]["assets"].items():
+                    if v["href"].startswith("http"): continue
+                    item_id = row[1]["id"]
+                    data, error = files_repo.ingest_file(
+                        v["href"],
+                        item_id,
+                        # Path(v["href"]).stat().st_size,
+                        dataset_or_model['id'],
+                        user,
+                        mode,
+                    )
+                    if error:
+                        raise Exception(error)
+                    file_url = f"{repo.url}{mode}/{dataset_or_model['id']}/stage/{item_id}"
+                    gdf.loc[row[0], "assets"][k]["href"] = file_url
+                    total_size += v["size"]
+            except Exception as e:
+                print(f"Error uploading asset {row[0]}: {e}")
+                break
+        gdf.to_parquet(catalog_path)
+        files_repo.ingest_file(str(catalog_path), f'catalog.v{current_version}.parquet', dataset_or_model['id'], user, "datasets")
+        data, error = repo.complete_ingestion(dataset_or_model['id'], current_version, total_size, user)
+        if error:
+            raise Exception(error)
+        return catalog_path
+
+    # files were already ingested
+    # TODO: check for deleted files (currently only updating existing files and ingesting new ones)
+    # TODO: adding new links in virtual datasets dont trigger new version (but changing README does)
+    new_version = False
+    num_changes = 0
+    total_size = 0
+    for row in tqdm(gdf.iterrows(), total=len(gdf), desc="Ingesting files"):
+        try:
+            for k, v in row[1]["assets"].items():
+                if v["href"].startswith("http"): continue
+                item_id = row[1]["id"]
+                # check if file exists in previous versions
+                df = pd.read_parquet(
+                    path=catalog_url,
+                    filters=[('id', '=', item_id)]
+                )
+                if len(df) > 0: # file exists in previous versions
+                    if df.iloc[0]['assets'][k]["checksum"] == v["checksum"]: # file is the same
+                        # still need to update the required fields
+                        file_url = f"{repo.url}datasets/{dataset_or_model['id']}/stage/{item_id}"
+                        gdf.loc[row[0], "assets"][k]["href"] = file_url
+                        total_size += v["size"]
+                        continue
+                    else: # file is different, so ingest new version but with a different id
+                        item_id = item_id + f"-{random.randint(1, 1000000)}"
+                        gdf.loc[row[0], "id"] = item_id
+                        new_version = True
+                        num_changes += 1
+                # ingest new files
+                data, error = files_repo.ingest_file(
+                    v["href"],
+                    item_id, # item id, will be path in local or given id in STAC. if not unique, will overwrite previous file in storage
+                    # Path(v["href"]).stat().st_size,
+                    dataset_or_model['id'],
+                    user,
+                    # calculate_checksum(asset["href"]), # is always absolute?
+                    mode,
+                    # version,
+                )
+                if error:
+                    raise Exception(error)
+                file_url = f"{repo.url}{mode}/{dataset_or_model['id']}/stage/{item_id}"
+                gdf.loc[row[0], "assets"][k]["href"] = file_url
+                total_size += v["size"]
+        except Exception as e:
+            print(f"Error uploading asset {row[0]}: {e}")
+            break
+    if not new_version:
+        print("No new version was created, your dataset has not changed.")
+    else:
+        new_version = current_version + 1
+        print("A new version was created, your dataset has changed.")
+        print(f"Num changes: {num_changes}")
+    gdf.to_parquet(catalog_path)
+    files_repo.ingest_file(str(catalog_path), f'catalog.v{new_version}.parquet', dataset_or_model['id'], user, mode)
+    # TODO: ingest README.md
+    data, error = repo.complete_ingestion(dataset_or_model['id'], new_version, total_size, user)
+    if error:
+        raise Exception(error)
+    return catalog_path
 
-def ingest_files(repo, dataset_or_model_id, folder, verbose, logger, user, endpoint):
-    files_repo = FilesAPIRepo()
-    logger(f"Uploading directory {folder}...")
-    items = retrieve_files(folder)
-    # retrieve files
-    upload_files, existing_files, large_files = generate_files_lists(
-        items, folder, dataset_or_model_id, endpoint, logger
-    )
-    logger(f"{len(upload_files) + len(large_files)} new files will be ingested")
-    logger(f"{len(existing_files)} files already exist in dataset")
-    logger(f"{len(large_files)} large files will be ingested separately")
-    # create new version
-    version = create_new_version(repo, dataset_or_model_id, user)
-    logger("New version created, version: " + str(version))
-    # ingest new large files
-    if len(large_files) > 0:
-        logger("ingesting large files...")
-        for file in large_files:
-            logger("ingesting file: " + file["path"])
-            upload_id, parts = files_repo.prepare_large_upload(
-                file["path"],
-                dataset_or_model_id,
-                file["checksum"],
-                user,
-                endpoint,
-            )
-            # print(upload_id, parts)
-            files_repo.ingest_large_file(
-                file["absolute_path"],
-                file["size"],
-                upload_id,
-                user,
-                parts,
-                endpoint,
-            )
-            data, error = files_repo.complete_upload(user, upload_id, version, endpoint)
-    # ingest new small files in batches
-    if len(upload_files) > 0:
-        logger("generating batches...")
-        batches = generate_batches(upload_files)
-        logger(
-            f"Uploading {len(upload_files)} small files in {len(batches)} batches..."
-        )
-        repo = FilesAPIRepo()
-        for batch in tqdm(
-            batches, desc="Uploading batches", unit="batches", disable=verbose
-        ):
-            # compress batch
-            memory_file = compress_batch(batch)
-            # ingest batch
-            data, error = repo.ingest_files_batch(
-                memory_file,
-                [f["checksum"] for f in batch],
-                dataset_or_model_id,
-                user,
-                endpoint,
-                version,
-            )
-    # ingest existing files
-    if len(existing_files) > 0:
-        batches = generate_batches(existing_files, max_batch_size=None)
-        for batch in tqdm(
-            batches,
-            desc="Ingesting existing files",
-            unit="batches",
-            disable=verbose,
-        ):
-            data, error = files_repo.add_files_batch_to_version(
-                batch,
-                dataset_or_model_id,
-                version,
-                user,
-                endpoint,
-            )
-            if error:
-                raise Exception(error)
-    return data
+def create_stac_item(item_id, asset_href):
+    return {
+        'type': 'Feature',
+        'stac_version': '1.0.0',
+        'stac_extensions': [],
+        'datetime': datetime.now(), # must be native timestamp (https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#timestamp)
+        'id': item_id,
+        'bbox': {
+            'xmin': 0.0,
+            'ymin': 0.0,
+            'xmax': 0.0,
+            'ymax': 0.0
+        }, # infer from file or from list of geometries
+        'geometry': Polygon(), # empty polygon
+        'assets': { 'asset': { # STAC needs this to be a Dict[str, Asset], not list !!! use same key or parquet breaks !!!
+            'href': asset_href,
+            'checksum': calculate_checksum(asset_href) if not asset_href.startswith("http") else None,
+            'timestamp': datetime.now(),
+            'size': Path(asset_href).stat().st_size if not asset_href.startswith("http") else None,
+        }},
+        "links": [],
+        # 'collection': 'source',
+        # anything below are properties (need at least one!)
+        'repository': 'eotdl',
+    }
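The rewritten ingest flow above drops the zip-batch upload in favour of a STAC GeoParquet catalog: every local file or remote link becomes a minimal STAC item, the resulting GeoDataFrame is saved as catalog.parquet, and ingest() then uploads it as catalog.v<N>.parquet while rewriting asset hrefs to the /stage endpoints. A minimal sketch of building such a catalog outside of eotdl, assuming geopandas, shapely and pyarrow are installed; the field layout is a trimmed-down version of create_stac_item above and the file names are illustrative:

from datetime import datetime
from pathlib import Path

import geopandas as gpd
from shapely.geometry import Polygon

def minimal_stac_item(item_id, href, size=None):
    # trimmed-down version of create_stac_item above: empty geometry and a
    # single fixed asset key so every row shares the same assets schema
    return {
        "type": "Feature",
        "stac_version": "1.0.0",
        "stac_extensions": [],
        "datetime": datetime.now(),  # native timestamp, as the parquet format requires
        "id": item_id,
        "geometry": Polygon(),
        "assets": {"asset": {"href": href, "size": size}},
        "links": [],
        "repository": "eotdl",
    }

rows = [
    minimal_stac_item("images/scene_1.tif", "/data/images/scene_1.tif", size=1024),
    minimal_stac_item("README.md", "/data/README.md", size=256),
]
gdf = gpd.GeoDataFrame(rows, geometry="geometry")
gdf.to_parquet("catalog.parquet")

As the comment in prep_ingest_stac warns, every item must use the same asset keys; otherwise the rows no longer share a schema and the parquet file is not valid.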
eotdl/{datasets → files}/metadata.py
CHANGED
@@ -1,13 +1,14 @@
 from pydantic import BaseModel, validator
 from typing import List, Optional
 from pathlib import Path
-
+import os

 class Metadata(BaseModel):
     authors: List[str]
     license: str
     source: str
     name: str
+    description: str
     thumbnail: Optional[str] = ""

     # validate source is a URL
@@ -27,19 +28,17 @@ class Metadata(BaseModel):
         return v


-    def
-
-
-
-
-
-
-
-
-
-
-
-
-
-        Path(download_path + "/metadata.yml").unlink()
-        return download_path + "/README.md"
+    def save_metadata(self, dst_path):
+        os.makedirs(dst_path, exist_ok=True)
+        with open(Path(dst_path) / "README.md", "w") as f:
+            f.write("---\n")
+            f.write(f"name: {self.name}\n")
+            f.write(f"license: {self.license}\n")
+            f.write(f"source: {self.source}\n")
+            f.write(f"thumbnail: {self.thumbnail}\n")
+            f.write(f"authors:\n")
+            for author in self.authors:
+                f.write(f"  - {author}\n")
+            f.write("---\n")
+            f.write(self.description)
+        return str(Path(dst_path) / "README.md")
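The relocated metadata module now round-trips dataset metadata through a README.md with YAML front matter (save_metadata) instead of a separate metadata.yml. A minimal usage sketch, assuming eotdl 2025.3.25 and python-frontmatter are installed; the dataset values and output directory are illustrative:

import frontmatter
from eotdl.files.metadata import Metadata

metadata = Metadata(
    name="my-dataset",
    authors=["Jane Doe"],
    license="MIT",
    source="https://example.com/my-dataset",
    description="A small test dataset.",
)
readme_path = metadata.save_metadata("tmp/my-dataset")  # writes tmp/my-dataset/README.md

# ingest() reads the same file back through python-frontmatter
readme = frontmatter.load(readme_path)
loaded = Metadata(**{**readme.metadata, "description": readme.content})
print(loaded.name, loaded.license)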
eotdl/models/__init__.py
CHANGED