eotdl 2023.10.25.post10-py3-none-any.whl → 2023.11.2.post2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- eotdl/__init__.py +1 -1
- eotdl/cli.py +6 -2
- eotdl/commands/auth.py +18 -1
- eotdl/commands/datasets.py +61 -11
- eotdl/commands/models.py +108 -0
- eotdl/curation/__init__.py +1 -4
- eotdl/curation/stac/assets.py +2 -1
- eotdl/curation/stac/dataframe.py +1 -1
- eotdl/curation/stac/extensions/label/image_name_labeler.py +6 -5
- eotdl/curation/stac/extensions/ml_dataset.py +15 -25
- eotdl/curation/stac/extent.py +1 -1
- eotdl/curation/stac/stac.py +1 -1
- eotdl/datasets/download.py +5 -4
- eotdl/datasets/ingest.py +25 -154
- eotdl/datasets/retrieve.py +1 -1
- eotdl/files/__init__.py +1 -0
- eotdl/files/ingest.py +175 -0
- eotdl/models/__init__.py +3 -0
- eotdl/models/download.py +119 -0
- eotdl/models/ingest.py +47 -0
- eotdl/models/metadata.py +16 -0
- eotdl/models/retrieve.py +26 -0
- eotdl/repos/FilesAPIRepo.py +136 -95
- eotdl/repos/ModelsAPIRepo.py +40 -0
- eotdl/repos/__init__.py +1 -0
- eotdl/shared/__init__.py +1 -0
- eotdl/tools/__init__.py +5 -6
- eotdl/tools/geo_utils.py +15 -1
- eotdl/tools/stac.py +144 -8
- eotdl/tools/time_utils.py +19 -6
- eotdl/tools/tools.py +2 -3
- {eotdl-2023.10.25.post10.dist-info → eotdl-2023.11.2.post2.dist-info}/METADATA +1 -1
- {eotdl-2023.10.25.post10.dist-info → eotdl-2023.11.2.post2.dist-info}/RECORD +38 -35
- eotdl/curation/folder_formatters/__init__.py +0 -1
- eotdl/curation/folder_formatters/base.py +0 -19
- eotdl/curation/folder_formatters/sentinel_hub.py +0 -135
- eotdl/curation/stac/utils/__init__.py +0 -5
- eotdl/curation/stac/utils/geometry.py +0 -22
- eotdl/curation/stac/utils/stac.py +0 -143
- eotdl/curation/stac/utils/time.py +0 -21
- /eotdl/{datasets/utils.py → shared/checksum.py} +0 -0
- /eotdl/{curation/stac/utils → tools}/metadata.py +0 -0
- /eotdl/{curation/stac/utils → tools}/paths.py +0 -0
- {eotdl-2023.10.25.post10.dist-info → eotdl-2023.11.2.post2.dist-info}/WHEEL +0 -0
- {eotdl-2023.10.25.post10.dist-info → eotdl-2023.11.2.post2.dist-info}/entry_points.txt +0 -0
eotdl/repos/FilesAPIRepo.py
CHANGED
@@ -1,6 +1,7 @@
 import requests
 import os
 from tqdm import tqdm
+import hashlib

 from ..repos import APIRepo

@@ -9,52 +10,85 @@ class FilesAPIRepo(APIRepo):
     def __init__(self, url=None):
         super().__init__(url)

-    def
+    def ingest_files_batch(
+        self,
+        batch,  # ziped batch of files
+        checksums,
+        dataset_or_model_id,
+        id_token,
+        endpoint,
+        version=None,
+    ):
+        url = self.url + f"{endpoint}/{dataset_or_model_id}"
+        if version is not None:
+            url += "?version=" + str(version)
         reponse = requests.post(
-
-            files={"
-            data={"
-            if checksum
-            else None,
+            url,
+            files={"batch": ("batch.zip", batch)},
+            data={"checksums": checksums},
             headers={"Authorization": "Bearer " + id_token},
         )
         return self.format_response(reponse)

-    def
+    def add_files_batch_to_version(
         self,
-
-
+        batch,
+        dataset_or_model_id,
         version,
-        file_version,
         id_token,
-
+        endpoint,
     ):
         reponse = requests.post(
-            self.url + "
+            self.url + f"{endpoint}/{dataset_or_model_id}/files?version={str(version)}",
             data={
-                "
-                "
-
-
-
+                "filenames": [f["path"] for f in batch],
+                "checksums": [f["checksum"] for f in batch],
+            },
+            headers={"Authorization": "Bearer " + id_token},
+        )
+        return self.format_response(reponse)
+
+    def ingest_file(
+        self,
+        file,
+        dataset_or_model_id,
+        version,
+        parent,
+        id_token,
+        checksum,
+        endpoint,
+    ):
+        reponse = requests.post(
+            self.url + f"{endpoint}/{dataset_or_model_id}",
+            files={"file": open(file, "rb")},
+            data={"checksum": checksum, "version": version, "parent": parent}
             if checksum
             else None,
             headers={"Authorization": "Bearer " + id_token},
         )
         return self.format_response(reponse)

-    def
-        url = self.url
+    def retrieve_files(self, dataset_or_model_id, endpoint, version=None):
+        url = f"{self.url}{endpoint}/{dataset_or_model_id}/files"
         if version is not None:
             url += "?version=" + str(version)
         response = requests.get(url)
         return self.format_response(response)

-    def download_file(
-
+    def download_file(
+        self,
+        dataset_or_model_id,
+        file_name,
+        id_token,
+        path,
+        file_version,
+        endpoint="datasets",
+        progress=False,
+    ):
+        url = self.url + f"{endpoint}/{dataset_or_model_id}/download/{file_name}"
         if file_version is not None:
             url += "?version=" + str(file_version)
-        return self.download_file_url(url, file_name, path, id_token)
+        return self.download_file_url(url, file_name, path, id_token, progress=progress)

     def download_file_url(self, url, filename, path, id_token, progress=False):
         headers = {"Authorization": "Bearer " + id_token}
@@ -66,9 +100,14 @@ class FilesAPIRepo(APIRepo):
             r.raise_for_status()
             total_size = int(r.headers.get("content-length", 0))
             block_size = 1024 * 1024 * 10
+            progress = progress and total_size > 1024 * 1024 * 16
             if progress:
                 progress_bar = tqdm(
-                    total=total_size,
+                    total=total_size,
+                    unit="iB",
+                    unit_scale=True,
+                    unit_divisor=1024,
+                    position=1,
                 )
             with open(path, "wb") as f:
                 for chunk in r.iter_content(block_size):
@@ -90,79 +129,81 @@ class FilesAPIRepo(APIRepo):
         # return None, reponse.json()["detail"]
         # return reponse.json(), None

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    # data = response.json()
-    # upload_id, parts = (
-    #     data["upload_id"],
-    #     data["parts"] if "parts" in data else [],
-    # )
-    # return upload_id, parts
-
-    # def get_chunk_size(self, content_size):
-    #     # adapt chunk size to content size to avoid S3 limits (10000 parts, 500MB per part, 5TB per object)
-    #     chunk_size = 1024 * 1024 * 10  # 10 MB (up to 100 GB, 10000 parts)
-    #     if content_size >= 1024 * 1024 * 1024 * 100:  # 100 GB
-    #         chunk_size = 1024 * 1024 * 100  # 100 MB (up to 1 TB, 10000 parts)
-    #     elif content_size >= 1024 * 1024 * 1024 * 1000:  # 1 TB
-    #         chunk_size = 1024 * 1024 * 500  # 0.5 GB (up to 5 TB, 10000 parts)
-    #     return chunk_size
-
-    # def ingest_large_dataset(self, file, upload_id, id_token, parts):
-    #     content_path = os.path.abspath(file)
-    #     content_size = os.stat(content_path).st_size
-    #     chunk_size = self.get_chunk_size(content_size)
-    #     total_chunks = content_size // chunk_size
-    #     # upload chunks sequentially
-    #     pbar = tqdm(
-    #         self.read_in_chunks(open(content_path, "rb"), chunk_size),
-    #         total=total_chunks,
-    #     )
-    #     index = 0
-    #     for chunk in pbar:
-    #         part = index // chunk_size + 1
-    #         offset = index + len(chunk)
-    #         index = offset
-    #         if part not in parts:
-    #             checksum = hashlib.md5(chunk).hexdigest()
-    #             response = requests.post(
-    #                 self.url + "datasets/chunk/" + upload_id,
-    #                 files={"file": chunk},
-    #                 data={"part_number": part, "checksum": checksum},
-    #                 headers={"Authorization": "Bearer " + id_token},
-    #             )
-    #             if response.status_code != 200:
-    #                 raise Exception(response.json()["detail"])
-    #             pbar.set_description(
-    #                 "{:.2f}/{:.2f} MB".format(
-    #                     offset / 1024 / 1024, content_size / 1024 / 1024
-    #                 )
-    #             )
-    #     pbar.close()
-    #     return
+    def prepare_large_upload(
+        self, filename, dataset_or_model_id, checksum, id_token, endpoint
+    ):
+        response = requests.post(
+            self.url + f"{endpoint}/{dataset_or_model_id}/uploadId",
+            json={"filname": filename, "checksum": checksum},
+            headers={"Authorization": "Bearer " + id_token},
+        )
+        if response.status_code != 200:
+            raise Exception(response.json()["detail"])
+        data = response.json()
+        upload_id, parts = (
+            data["upload_id"],
+            data["parts"] if "parts" in data else [],
+        )
+        return upload_id, parts

-
-
-
-
-
-
-
-
+    def get_chunk_size(self, content_size):
+        # adapt chunk size to content size to avoid S3 limits (10000 parts, 500MB per part, 5TB per object)
+        chunk_size = 1024 * 1024 * 10  # 10 MB (up to 100 GB, 10000 parts)
+        if content_size >= 1024 * 1024 * 1024 * 100:  # 100 GB
+            chunk_size = 1024 * 1024 * 100  # 100 MB (up to 1 TB, 10000 parts)
+        elif content_size >= 1024 * 1024 * 1024 * 1000:  # 1 TB
+            chunk_size = 1024 * 1024 * 500  # 0.5 GB (up to 5 TB, 10000 parts)
+        return chunk_size
+
+    def read_in_chunks(self, file_object, CHUNK_SIZE):
+        while True:
+            data = file_object.read(CHUNK_SIZE)
+            if not data:
+                break
+            yield data
+
+    def ingest_large_file(
+        self, file_path, files_size, upload_id, id_token, parts, endpoint
+    ):
+        print(endpoint)
+        # content_path = os.path.abspath(file)
+        # content_size = os.stat(content_path).st_size
+        chunk_size = self.get_chunk_size(files_size)
+        total_chunks = files_size // chunk_size
+        # upload chunks sequentially
+        pbar = tqdm(
+            self.read_in_chunks(open(file_path, "rb"), chunk_size),
+            total=total_chunks,
+        )
+        index = 0
+        for chunk in pbar:
+            part = index // chunk_size + 1
+            offset = index + len(chunk)
+            index = offset
+            if part not in parts:
+                checksum = hashlib.md5(chunk).hexdigest()
+                response = requests.post(
+                    f"{self.url}{endpoint}/chunk/{upload_id}",
+                    files={"file": chunk},
+                    data={"part_number": part, "checksum": checksum},
+                    headers={"Authorization": "Bearer " + id_token},
+                )
+                if response.status_code != 200:
+                    raise Exception(response.json()["detail"])
+                pbar.set_description(
+                    "{:.2f}/{:.2f} MB".format(
+                        offset / 1024 / 1024, files_size / 1024 / 1024
+                    )
+                )
+        pbar.close()
+        return
+
+    def complete_upload(self, id_token, upload_id, version, endpoint):
+        r = requests.post(
+            f"{self.url}{endpoint}/complete/{upload_id}?version={version}",
+            headers={"Authorization": "Bearer " + id_token},
+        )
+        return self.format_response(r)

     # def update_dataset(self, name, path, id_token, checksum):
     #     # check that dataset exists
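Taken together, the new `prepare_large_upload`, `ingest_large_file` and `complete_upload` methods form a resumable, chunked upload flow. Below is a minimal, hypothetical sketch of how a caller might chain them; the `upload_large_file` wrapper and its argument values are illustrative, not part of the package, and a valid `id_token`, a precomputed checksum and an existing dataset or model id are assumed.

```python
# Hypothetical driver for the new multipart-upload methods (not part of eotdl).
import os

from eotdl.repos.FilesAPIRepo import FilesAPIRepo


def upload_large_file(path, dataset_or_model_id, checksum, id_token,
                      endpoint="datasets", version=1):
    repo = FilesAPIRepo()
    # 1. Ask the API for an upload id, plus any parts already uploaded so an
    #    interrupted upload can be resumed.
    upload_id, parts = repo.prepare_large_upload(
        os.path.basename(path), dataset_or_model_id, checksum, id_token, endpoint
    )
    # 2. Stream the file in chunks sized by get_chunk_size(), skipping parts
    #    the server already has.
    repo.ingest_large_file(
        path, os.path.getsize(path), upload_id, id_token, parts, endpoint
    )
    # 3. Ask the API to assemble the uploaded parts into the requested version.
    return repo.complete_upload(id_token, upload_id, version, endpoint)
```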
eotdl/repos/ModelsAPIRepo.py
ADDED
@@ -0,0 +1,40 @@
+import requests
+import os
+
+from ..repos import APIRepo
+
+
+class ModelsAPIRepo(APIRepo):
+    def __init__(self, url=None):
+        super().__init__(url)
+
+    def retrieve_models(self, name, limit):
+        url = self.url + "models"
+        if name is not None:
+            url += "?match=" + name
+        if limit is not None:
+            if name is None:
+                url += "?limit=" + str(limit)
+            else:
+                url += "&limit=" + str(limit)
+        response = requests.get(url)
+        return self.format_response(response)
+
+    def create_model(self, metadata, id_token):
+        response = requests.post(
+            self.url + "models",
+            json=metadata,
+            headers={"Authorization": "Bearer " + id_token},
+        )
+        return self.format_response(response)
+
+    def retrieve_model(self, name):
+        response = requests.get(self.url + "models?name=" + name)
+        return self.format_response(response)
+
+    def create_version(self, model_id, id_token):
+        response = requests.post(
+            self.url + "models/version/" + model_id,
+            headers={"Authorization": "Bearer " + id_token},
+        )
+        return self.format_response(response)
eotdl/repos/__init__.py
CHANGED
eotdl/shared/__init__.py
ADDED
@@ -0,0 +1 @@
+from .checksum import calculate_checksum
eotdl/tools/__init__.py
CHANGED
@@ -1,7 +1,6 @@
-from .stac import
-from .tools import
-    format_product_location_payload,
-    get_images_by_location,
-    get_tarfile_image_info)
+from .stac import *
+from .tools import *
 from .geo_utils import *
-from .time_utils import *
+from .time_utils import *
+from .metadata import *
+from .paths import *
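With the star imports, the helpers from these submodules are re-exported at package level (assuming none of them defines a restrictive `__all__`), so they can be imported directly from `eotdl.tools`:

```python
# Helpers formerly imported from the individual submodules are now available
# at package level via the star re-exports.
from eotdl.tools import stac_items_to_gdf, merge_stac_catalogs, prepare_time_interval
```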
eotdl/tools/geo_utils.py
CHANGED
@@ -5,9 +5,10 @@ import tarfile

 from typing import Union
 from shapely import geometry
-from shapely.geometry import box, Polygon
+from shapely.geometry import box, Polygon, shape
 from pyproj import Transformer
 from sentinelhub import BBox, CRS, bbox_to_dimensions
+from pandas import isna


 def is_bounding_box(bbox: list) -> bool:
@@ -199,3 +200,16 @@ def generate_new_locations_bounding_boxes(gdf: gpd.GeoDataFrame,
         latest_id += 1

     return bbox_by_new_location
+
+
+def convert_df_geom_to_shape(row):
+    """
+    Convert the geometry of a dataframe row to a shapely shape
+    """
+    if not isna(row["geometry"]):
+        geo = shape(row["geometry"])
+        wkt = geo.wkt
+    else:
+        wkt = "POLYGON EMPTY"
+
+    return wkt
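A short sketch of how the new `convert_df_geom_to_shape` helper can be applied row-wise; the sample frame is made up for illustration, and rows without a geometry fall back to `"POLYGON EMPTY"`:

```python
# Sample data is illustrative only.
import pandas as pd

from eotdl.tools.geo_utils import convert_df_geom_to_shape

df = pd.DataFrame(
    {
        "image": ["a.tif", "b.tif"],
        "geometry": [{"type": "Point", "coordinates": [2.17, 41.38]}, None],
    }
)
df["geometry"] = df.apply(convert_df_geom_to_shape, axis=1)
print(df["geometry"].tolist())  # ['POINT (2.17 41.38)', 'POLYGON EMPTY']
```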
eotdl/tools/stac.py
CHANGED
@@ -1,11 +1,20 @@
 """
 Module for data engineering with STAC elements
 """
+
 import geopandas as gpd
-
+import pystac
+
+from os.path import dirname, join, abspath
+from os import makedirs
+from json import dumps
+from typing import Union, Optional
+from tqdm import tqdm
+from traceback import print_exc
+from shutil import rmtree


-def stac_items_to_gdf(items: ItemCollection) -> gpd.GeoDataFrame:
+def stac_items_to_gdf(items: pystac.ItemCollection) -> gpd.GeoDataFrame:
     """
     Get a GeoDataFrame from a given pystac.ItemCollection.

@@ -18,14 +27,141 @@ def stac_items_to_gdf(items: ItemCollection) -> gpd.GeoDataFrame:
     features = []
     for f in _features:
         if f not in features:
-            # Add the
-
-
-
+            # Add all the keys in the properties dict as columns in the GeoDataFrame
+            for k, v in f.items():
+                if k not in f['properties'] and k != 'geometry':
+                    f['properties'][k] = v
             if 'scene_id' in f['properties']:
                 f['properties']['scene_id'] = f['id'].split('_')[3]
-            f['properties']['type'] = f['type']
-            f['properties']['stac_extensions'] = f['stac_extensions']
             features.append(f)

     return gpd.GeoDataFrame.from_features(features)
+
+
+def get_all_children(obj: pystac.STACObject) -> list:
+    """
+    Get all the children of a STAC object
+    """
+    children = []
+    # Append the current object to the list
+    children.append(obj.to_dict())
+
+    # Collections
+    collections = list(obj.get_collections())
+    for collection in collections:
+        children.append(collection.to_dict())
+
+    # Items
+    items = obj.get_items()
+    for item in items:
+        children.append(item.to_dict())
+
+    # Items from collections
+    for collection in collections:
+        items = collection.get_items()
+        for item in items:
+            children.append(item.to_dict())
+
+    return children
+
+
+def make_links_relative_to_path(path: str,
+                                catalog: Union[pystac.Catalog, str],
+                                ) -> pystac.Catalog:
+    """
+    Makes all asset HREFs and links in the STAC catalog relative to a given path
+    """
+    if isinstance(catalog, str):
+        catalog = pystac.read_file(catalog)
+    path = abspath(path)
+
+    # Create a temporary catalog in the destination path to set as root
+    future_path = join(path, 'catalog.json')
+    makedirs(path, exist_ok=True)
+    with open(future_path, 'w') as f:
+        f.write(dumps(catalog.to_dict(), indent=4))
+    temp_catalog = pystac.Catalog.from_file(future_path)
+
+    catalog.set_root(temp_catalog)
+    catalog.make_all_asset_hrefs_absolute()
+
+    for collection in catalog.get_children():
+        # Create new collection
+        new_collection = collection.clone()
+        new_collection.set_self_href(join(path, collection.id, f"collection.json"))
+        new_collection.set_root(catalog)
+        new_collection.set_parent(catalog)
+        # Remove old collection and add new one to catalog
+        catalog.remove_child(collection.id)
+        catalog.add_child(new_collection)
+        for item in collection.get_all_items():
+            # Create new item from old collection and add it to the new collection
+            new_item = item.clone()
+            new_item.set_self_href(join(path, collection.id, item.id, f"{item.id}.json"))
+            new_item.set_parent(collection)
+            new_item.set_root(catalog)
+            new_item.make_asset_hrefs_relative()
+            new_collection.add_item(new_item)
+
+    catalog.make_all_asset_hrefs_relative()
+
+    return catalog
+
+
+def merge_stac_catalogs(catalog_1: Union[pystac.Catalog, str],
+                        catalog_2: Union[pystac.Catalog, str],
+                        destination: Optional[str] = None,
+                        keep_extensions: Optional[bool] = False,
+                        catalog_type: Optional[pystac.CatalogType] = pystac.CatalogType.SELF_CONTAINED
+                        ) -> None:
+    """
+    Merge two STAC catalogs, keeping the properties, collection and items of both catalogs
+    """
+    if isinstance(catalog_1, str):
+        catalog_1 = pystac.Catalog.from_file(catalog_1)
+    if isinstance(catalog_2, str):
+        catalog_2 = pystac.Catalog.from_file(catalog_2)
+
+    for col1 in tqdm(catalog_1.get_children(), desc='Merging catalogs...'):
+        # Check if the collection exists in catalog_2
+        col2 = catalog_2.get_child(col1.id)
+        if col2 is None:
+            # If it does not exist, add it
+            col1_ = col1.clone()
+            catalog_2.add_child(col1)
+            col2 = catalog_2.get_child(col1.id)
+            col2.clear_items()
+            for i in col1_.get_stac_objects(pystac.RelType.ITEM):
+                col2.add_item(i)
+        else:
+            # If it exists, merge the items
+            for item1 in col1.get_items():
+                if col2.get_item(item1.id) is None:
+                    col2.add_item(item1)
+
+    if keep_extensions:
+        for ext in catalog_1.stac_extensions:
+            if ext not in catalog_2.stac_extensions:
+                catalog_2.stac_extensions.append(ext)
+
+        for extra_field_name, extra_field_value in catalog_1.extra_fields.items():
+            if extra_field_name not in catalog_2.extra_fields:
+                catalog_2.extra_fields[extra_field_name] = extra_field_value
+
+    if destination:
+        make_links_relative_to_path(destination, catalog_2)
+    else:
+        destination = dirname(catalog_2.get_self_href())
+
+    # Save the merged catalog
+    try:
+        print("Validating and saving...")
+        catalog_2.validate()
+        rmtree(destination) if not destination else None  # Remove the old catalog and replace it with the new one
+        catalog_2.normalize_and_save(root_href=destination,
+                                     catalog_type=catalog_type
+                                     )
+        print("Success!")
+    except pystac.STACValidationError:
+        # Return full callback
+        print_exc()
eotdl/tools/time_utils.py
CHANGED
@@ -81,21 +81,21 @@ def expand_time_interval(time_interval: Union[list, tuple], format: str='%Y-%m-%
 def prepare_time_interval(date):
     if isinstance(date, str):
         date = datetime.strptime(date, "%Y-%m-%d")
-    elif isinstance(date, datetime):
-        date = date.strftime("%Y-%m-%d")
     elif isinstance(date, tuple):
         if not is_time_interval(date):
             raise ValueError('The time interval must be a range of two dates, with format YYYY-MM-DD or a datetime object')
         else:
             return date
-
+    elif not isinstance(date, datetime):
         raise ValueError('The date must be a string with format YYYY-MM-DD or a datetime object')
+
     date_day_before = date - timedelta(days=1)
     date_next_day = date + timedelta(days=1)
-    date_day_before = date_day_before.strftime("%Y-%m-%d")
-    date_next_day = date_next_day.strftime("%Y-%m-%d")

-
+    date_day_before_str = date_day_before.strftime("%Y-%m-%d")
+    date_next_day_str = date_next_day.strftime("%Y-%m-%d")
+
+    return (date_day_before_str, date_next_day_str)


 def get_day_between(from_date: Union[datetime, str],
@@ -112,3 +112,16 @@ def get_day_between(from_date: Union[datetime, str],
     date_between = date_between.strftime("%Y-%m-%d")

     return date_between
+
+
+def format_time_acquired(dt: Union[str, datetime]) -> str:
+    """
+    Format the date time to the required format for STAC
+
+    :param dt: date time to format
+    """
+    from dateutil import parser
+
+    dt_str = parser.parse(dt).strftime("%Y-%m-%dT%H:%M:%S.%f")
+
+    return dt_str
eotdl/tools/tools.py
CHANGED
@@ -5,13 +5,12 @@ Module for data engineeringt
 import geopandas as gpd
 import pandas as pd
 import tarfile
-import rasterio
 import re
 import datetime
 import json

-from
-from
+from .geo_utils import get_image_bbox
+from shapely.geometry import box
 from os.path import exists
 from typing import Union, Optional
