terrakio-core 0.3.8__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of terrakio-core might be problematic.
- terrakio_core/__init__.py +3 -3
- terrakio_core/async_client.py +22 -7
- terrakio_core/client.py +103 -5
- terrakio_core/convenience_functions/convenience_functions.py +302 -69
- terrakio_core/endpoints/mass_stats.py +71 -16
- terrakio_core/endpoints/model_management.py +388 -217
- terrakio_core/endpoints/user_management.py +5 -5
- terrakio_core/sync_client.py +107 -188
- {terrakio_core-0.3.8.dist-info → terrakio_core-0.4.0.dist-info}/METADATA +2 -1
- {terrakio_core-0.3.8.dist-info → terrakio_core-0.4.0.dist-info}/RECORD +12 -12
- {terrakio_core-0.3.8.dist-info → terrakio_core-0.4.0.dist-info}/WHEEL +0 -0
- {terrakio_core-0.3.8.dist-info → terrakio_core-0.4.0.dist-info}/top_level.txt +0 -0
terrakio_core/convenience_functions/convenience_functions.py
@@ -9,36 +9,41 @@ from ..exceptions import APIError, ConfigurationError
 from ..helper.bounded_taskgroup import BoundedTaskGroup
 from ..helper.tiles import tiles
 import uuid
+import xarray as xr

-async def
+async def request_data(
     client,
     gdf: GeoDataFrame,
     expr: str,
     conc: int = 20,
-    inplace: bool = False,
     in_crs: str = "epsg:4326",
     out_crs: str = "epsg:4326",
     resolution: int = -1,
     geom_fix: bool = False,
+    max_memory_mb: int = 500,
+    stream_to_disk: bool = None,
 ):
     """
-
+    Request xarray datasets for all geometries in a GeoDataFrame.

     Args:
         client: The AsyncClient instance
         gdf (GeoDataFrame): GeoDataFrame containing geometries
         expr (str): Terrakio expression to evaluate, can include spatial aggregations
         conc (int): Number of concurrent requests to make
-        inplace (bool): Whether to modify the input GeoDataFrame in place
         in_crs (str): Input coordinate reference system
         out_crs (str): Output coordinate reference system
         resolution (int): Resolution parameter
         geom_fix (bool): Whether to fix the geometry (default False)
+        max_memory_mb (int): Maximum memory threshold in MB (default 500MB)
+        stream_to_disk (bool): Whether to stream large datasets to disk. If None, will be determined automatically.
+
     Returns:
-        geopandas.GeoDataFrame:
+        geopandas.GeoDataFrame: Copy of input GeoDataFrame with additional 'dataset' column
+            containing the xarray Dataset for each geometry.

     Raises:
-        ValueError: If concurrency is too high
+        ValueError: If concurrency is too high or if data exceeds memory limit without streaming
         APIError: If the API request fails
     """
     if conc > 100:
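For orientation, below is a minimal sketch of calling the renamed request_data with the new 0.4.0 parameters. It assumes an already-authenticated AsyncClient and a local GeoJSON file; the expression string, file name, and import path are illustrative assumptions, not part of this diff.

import geopandas as gpd
from terrakio_core.convenience_functions.convenience_functions import request_data

async def fetch(client):
    # `client` is assumed to be an authenticated AsyncClient instance.
    gdf = gpd.read_file("paddocks.geojson")           # placeholder polygon layer
    gdf_with_datasets = await request_data(
        client=client,
        gdf=gdf,
        expr="mydataset.ndvi",                        # placeholder Terrakio expression
        conc=20,
        max_memory_mb=500,    # new in 0.4.0: memory budget used by the size estimate
        stream_to_disk=None,  # new in 0.4.0: None lets the estimate decide
    )
    return gdf_with_datasets  # has a 'dataset' column of xarray Datasets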
@@ -46,32 +51,74 @@ async def zonal_stats(

     total_geometries = len(gdf)

+    # First, make a request with the first geometry to estimate total memory usage
+    client.logger.info("Estimating total memory usage...")
+    first_geom = gdf.geometry.iloc[0]
+    feature = {
+        "type": "Feature",
+        "geometry": mapping(first_geom),
+        "properties": {}
+    }
+
+    try:
+        first_result = await client.geoquery(expr=expr, feature=feature,
+            in_crs=in_crs, out_crs=out_crs, resolution=resolution, geom_fix=geom_fix)
+        if isinstance(first_result, dict) and first_result.get("error"):
+            error_msg = f"Request failed: {first_result.get('error_message', 'Unknown error')}"
+            if first_result.get('status_code'):
+                error_msg = f"Request failed with status {first_result['status_code']}: {first_result.get('error_message', 'Unknown error')}"
+            raise APIError(error_msg)
+
+        if not isinstance(first_result, xr.Dataset):
+            raise ValueError(f"Expected xarray Dataset, got {type(first_result)}")
+
+        # Estimate total memory usage
+        single_dataset_size_bytes = estimate_dataset_size(first_result)
+        total_size_bytes = single_dataset_size_bytes * total_geometries
+        total_size_mb = total_size_bytes / (1024 * 1024)
+
+        client.logger.info(f"Estimated total memory usage: {total_size_mb:.2f} MB for {total_geometries} geometries")
+
+        # Check if we need to stream to disk
+        if stream_to_disk is None:
+            # Auto-determine based on memory usage
+            if total_size_mb > max_memory_mb:
+                client.logger.warning(f"The data you are requesting exceeds {max_memory_mb} MB, we recommend you to set the stream_to_disk parameter to True")
+                raise ValueError(f"The data you are requesting exceeds {max_memory_mb} MB, we recommend you to set the stream_to_disk parameter to True")
+
+    except Exception as e:
+        if "recommend you to set the stream_to_disk parameter to True" in str(e):
+            raise
+        client.logger.error(f"Failed to estimate memory usage: {e}")
+        raise
+
     client.logger.info(f"Processing {total_geometries} geometries with concurrency {conc}")

     completed_count = 0
     lock = asyncio.Lock()

-    async def process_geometry(geom
-        """Process a single geometry"""
+    async def process_geometry(geom):
+        """Process a single geometry and return the dataset"""
         nonlocal completed_count

         try:
             feature = {
                 "type": "Feature",
                 "geometry": mapping(geom),
-                "properties": {
+                "properties": {}
             }
-
+            # Request xarray dataset
+            result = await client.geoquery(expr=expr, feature=feature,
                 in_crs=in_crs, out_crs=out_crs, resolution=resolution, geom_fix=geom_fix)
-
             if isinstance(result, dict) and result.get("error"):
-                error_msg = f"Request
+                error_msg = f"Request failed: {result.get('error_message', 'Unknown error')}"
                 if result.get('status_code'):
-                    error_msg = f"Request
+                    error_msg = f"Request failed with status {result['status_code']}: {result.get('error_message', 'Unknown error')}"
                 raise APIError(error_msg)

-
-
+            # Ensure we got an xarray Dataset
+            if not isinstance(result, xr.Dataset):
+                raise ValueError(f"Expected xarray Dataset, got {type(result)}")

             async with lock:
                 completed_count += 1
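The memory check above multiplies the estimated size of the first geometry's dataset by the number of geometries and compares it with max_memory_mb. A small worked example of that arithmetic; the variable name and array shape are invented for illustration:

import numpy as np
import xarray as xr

# One per-geometry result: a single float32 variable over (time, y, x).
ds = xr.Dataset(
    {"ndvi": (("time", "y", "x"), np.zeros((12, 256, 256), dtype="float32"))},
    coords={"time": np.arange(12)},
)

# Same arithmetic as estimate_dataset_size: itemsize * element count per variable.
single_bytes = sum(v.dtype.itemsize * v.size for v in ds.data_vars.values())
total_mb = single_bytes * 50 / (1024 * 1024)   # 50 geometries
print(f"{total_mb:.2f} MB")                    # 150.00 MB; above max_memory_mb this path raises ValueError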
@@ -88,85 +135,271 @@ async def zonal_stats(
     try:
         async with BoundedTaskGroup(max_concurrency=conc) as tg:
             tasks = [
-                tg.create_task(process_geometry(gdf.geometry.iloc[idx]
+                tg.create_task(process_geometry(gdf.geometry.iloc[idx]))
                 for idx in range(len(gdf))
             ]
             all_results = [task.result() for task in tasks]
-
+
     except* Exception as eg:
         for e in eg.exceptions:
             if hasattr(e, 'response'):
                 raise APIError(f"API request failed: {e.response.text}")
         raise

-    client.logger.info("All requests completed!
-
+    client.logger.info("All requests completed!")
+
     if not all_results:
         raise ValueError("No valid results were returned for any geometry")
+
+    # Create a copy of the input GeoDataFrame
+    result_gdf = gdf.copy()
+
+    # Add the dataset column with the xarray datasets
+    result_gdf['dataset'] = all_results
+
+    return result_gdf
+
+
+import os
+from pathlib import Path
+
+def estimate_dataset_size(dataset):
+    """
+    Estimate the memory size of an xarray dataset in bytes.
+
+    Args:
+        dataset: xarray Dataset

-
+    Returns:
+        int: Estimated size in bytes
+    """
+    total_size = 0
+    for var_name, var in dataset.data_vars.items():
+        # Get the dtype size in bytes
+        dtype_size = var.dtype.itemsize
+        # Get the total number of elements
+        total_elements = var.size
+        # Calculate total size for this variable
+        total_size += dtype_size * total_elements

-
+    # Add coordinate sizes
+    for coord_name, coord in dataset.coords.items():
+        if coord_name not in dataset.dims:  # Don't double count dimension coordinates
+            dtype_size = coord.dtype.itemsize
+            total_elements = coord.size
+            total_size += dtype_size * total_elements

-
-
-
+    return total_size
+
+def save_dataset_to_file(dataset, filepath):
+    """
+    Save dataset to NetCDF file.
+
+    Args:
+        dataset: xarray Dataset
+        filepath: Path to save the file

-
+    Returns:
+        str: Path to saved file
+    """
+    filepath = Path(filepath)
+
+    if not str(filepath).endswith('.nc'):
+        filepath = filepath.with_suffix('.nc')
+
+    dataset.to_netcdf(filepath)
+    return str(filepath)
+
+def post_processing(
+    gdf_with_datasets: GeoDataFrame,
+    spatial_reduction: str = None,
+    temporal_reduction: str = None,
+    drop_nan: bool = False,
+    inplace: bool = False,
+    stream_to_disk: bool = False,
+):
+    """
+    Post-process the GeoDataFrame with datasets to extract variables with optional reductions.
+
+    Args:
+        gdf_with_datasets (GeoDataFrame): GeoDataFrame with 'dataset' column containing xarray Datasets
+        spatial_reduction (str): Reduction operation for spatial dimensions (x, y).
+            Options: 'mean', 'median', 'min', 'max', 'std', 'var', 'sum', 'count'
+        temporal_reduction (str): Reduction operation for temporal dimension (time).
+            Options: 'mean', 'median', 'min', 'max', 'std', 'var', 'sum', 'count'
+        drop_nan (bool): Whether to drop NaN values from the results (default False)
+        inplace (bool): Whether to modify the input GeoDataFrame in place
+        stream_to_disk (bool): Whether to stream datasets to disk as NetCDF files (default False)
+
+    Returns:
+        geopandas.GeoDataFrame: GeoDataFrame with variable dataarrays/values or file paths in separate columns.
+            If stream_to_disk=True, large datasets are saved as NetCDF files with file paths stored.
+    """
+    if 'dataset' not in gdf_with_datasets.columns:
+        raise ValueError("Input GeoDataFrame must contain a 'dataset' column")
+
+    # Validate reduction parameters
+    valid_reductions = ['mean', 'median', 'min', 'max', 'std', 'var', 'sum', 'count']
+    if spatial_reduction and spatial_reduction not in valid_reductions:
+        raise ValueError(f"spatial_reduction must be one of {valid_reductions}")
+    if temporal_reduction and temporal_reduction not in valid_reductions:
+        raise ValueError(f"temporal_reduction must be one of {valid_reductions}")
+
+    result_rows = []
+    geometries = []
+
+    # Process each row (geometry + dataset)
+    for i, row in gdf_with_datasets.iterrows():
+        dataset = row['dataset']

-
+        # Create new row for this geometry
+        new_row = {}

-
-
+        # Copy original GeoDataFrame attributes (excluding dataset column)
+        for col in gdf_with_datasets.columns:
+            if col not in ['geometry', 'dataset']:
+                new_row[col] = row[col]

-
-
+        # Process each variable in the dataset
+        data_vars = list(dataset.data_vars.keys())
+        for var_name in data_vars:
+            var_data = dataset[var_name]

-
-
-
+            # Apply drop_nan if requested
+            if drop_nan:
+                # Drop spatial dimensions where all values are NaN
+                var_data = var_data.dropna(dim='x', how='all').dropna(dim='y', how='all')
+
+                # Drop time dimensions where all values are NaN
+                if 'time' in var_data.dims:
+                    var_data = var_data.dropna(dim='time', how='all')

-
-
+            # Check current dimensions to determine if aggregation is needed
+            current_dims = set(var_data.dims)
+            has_spatial_dims = bool(current_dims.intersection(['x', 'y']))
+            has_temporal_dim = 'time' in current_dims

-
-
-
-
-
-
-
-
-
-
-
-
-
+            # Apply spatial reduction only if spatial dimensions exist and reduction is requested
+            if spatial_reduction and has_spatial_dims:
+                spatial_dims = [dim for dim in ['x', 'y'] if dim in var_data.dims]
+                if spatial_dims:
+                    if spatial_reduction == 'count':
+                        var_data = var_data.count(dim=spatial_dims)
+                    else:
+                        var_data = getattr(var_data, spatial_reduction)(dim=spatial_dims)
+
+            # Apply temporal reduction only if time dimension exists and reduction is requested
+            if temporal_reduction and has_temporal_dim:
+                if temporal_reduction == 'count':
+                    var_data = var_data.count(dim='time')
+                else:
+                    var_data = getattr(var_data, temporal_reduction)(dim='time')
+
+            # Handle streaming to disk if requested
+            if stream_to_disk:
+                # Create a single-variable dataset for saving
+                single_var_dataset = var_data.to_dataset(name=var_name)
+
+                # Generate filename based on row index and variable name
+                filename = f"geometry_{i}_{var_name}.nc"
+                filepath = os.path.join(os.getcwd(), filename)
+
+                # Save to disk and store file path
+                saved_path = save_dataset_to_file(single_var_dataset, filepath)
+                new_row[var_name] = f"file://{saved_path}"
+
+                print(f"Dataset for geometry {i}, variable '{var_name}' saved to: {saved_path}")
+            else:
+                # Keep in memory
+                new_row[var_name] = var_data

-
-
-
-
-
-
+        result_rows.append(new_row)
+        geometries.append(row['geometry'])
+
+    # Create the result GeoDataFrame with default integer index
+    result_gdf = GeoDataFrame(result_rows, geometry=geometries)
+
+    if inplace:
+        # Clear original gdf and replace with result_gdf content
+        gdf_with_datasets.drop(gdf_with_datasets.index, inplace=True)
+        gdf_with_datasets.drop(gdf_with_datasets.columns, axis=1, inplace=True)

-
+        # Copy all data from result_gdf to gdf
+        for col in result_gdf.columns:
+            gdf_with_datasets[col] = result_gdf[col].values

-
-
-            geom_idx = int(row['_geometry_index'])
-            geom_idx_to_row[geom_idx] = row
+        # Ensure it remains a GeoDataFrame with correct geometry
+        gdf_with_datasets.geometry = result_gdf.geometry

-
-
-
-
-
-
-
-
-
-
+        return None
+    else:
+        return result_gdf
+
+
+# Updated zonal_stats function that uses both parts
+async def zonal_stats(
+    client,
+    gdf: GeoDataFrame,
+    expr: str,
+    conc: int = 20,
+    inplace: bool = False,
+    in_crs: str = "epsg:4326",
+    out_crs: str = "epsg:4326",
+    resolution: int = -1,
+    geom_fix: bool = False,
+    drop_nan: bool = False,
+    spatial_reduction: str = None,
+    temporal_reduction: str = None,
+    max_memory_mb: int = 500,
+    stream_to_disk: bool = False,
+):
+    """
+    Compute zonal statistics for all geometries in a GeoDataFrame.
+    This is a convenience function that combines request_data and post_processing.
+
+    Args:
+        client: The AsyncClient instance
+        gdf (GeoDataFrame): GeoDataFrame containing geometries
+        expr (str): Terrakio expression to evaluate, can include spatial aggregations
+        conc (int): Number of concurrent requests to make
+        inplace (bool): Whether to modify the input GeoDataFrame in place
+        in_crs (str): Input coordinate reference system
+        out_crs (str): Output coordinate reference system
+        resolution (int): Resolution parameter
+        geom_fix (bool): Whether to fix the geometry (default False)
+        drop_nan (bool): Whether to drop NaN values from the results (default False)
+        spatial_reduction (str): Reduction operation for spatial dimensions (x, y).
+            Options: 'mean', 'median', 'min', 'max', 'std', 'var', 'sum', 'count'
+        temporal_reduction (str): Reduction operation for temporal dimension (time).
+            Options: 'mean', 'median', 'min', 'max', 'std', 'var', 'sum', 'count'
+        max_memory_mb (int): Maximum memory threshold in MB (default 500MB)
+        stream_to_disk (bool): Whether to stream datasets to disk as NetCDF files (default False)
+    """
+    # Step 1: Request data (with memory estimation)
+    gdf_with_datasets = await request_data(
+        client=client,
+        gdf=gdf,
+        expr=expr,
+        conc=conc,
+        in_crs=in_crs,
+        out_crs=out_crs,
+        resolution=resolution,
+        geom_fix=geom_fix,
+        max_memory_mb=max_memory_mb,
+        stream_to_disk=stream_to_disk
+    )
+
+    # Step 2: Post-process with reductions and optional streaming
+    result = post_processing(
+        gdf_with_datasets=gdf_with_datasets,
+        spatial_reduction=spatial_reduction,
+        temporal_reduction=temporal_reduction,
+        drop_nan=drop_nan,
+        inplace=inplace,
+        stream_to_disk=stream_to_disk
+    )
+
+    return result

 async def create_dataset_file(
     client,
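Taken together, the new functions split the old one-shot zonal statistics into a request step and a reduction step. A minimal sketch of both paths, assuming an authenticated AsyncClient, an existing GeoDataFrame, and placeholder expressions:

from terrakio_core.convenience_functions.convenience_functions import (
    request_data, post_processing, zonal_stats,
)

async def one_call(client, gdf):
    # Single call: request per-geometry datasets, then reduce them to scalars.
    return await zonal_stats(
        client, gdf, expr="mydataset.ndvi",
        spatial_reduction="mean", temporal_reduction="mean",
    )

async def two_step(client, gdf):
    # Two steps: keep the raw datasets around, reduce (or stream to disk) later.
    raw = await request_data(client=client, gdf=gdf, expr="mydataset.ndvi")
    return post_processing(raw, spatial_reduction="median", drop_nan=True)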
terrakio_core/endpoints/mass_stats.py
@@ -6,7 +6,7 @@ from pathlib import Path
 from urllib.parse import urlparse
 from ..helper.decorators import require_token, require_api_key, require_auth
 import aiohttp
-
+from typing import Dict, Any, Optional, List, Union
 class MassStats:
     def __init__(self, client):
         self._client = client
@@ -218,7 +218,7 @@ class MassStats:
             params["output"] = output

         return self._client._terrakio_request("GET", "mass_stats/download", params=params)
-
+
     @require_api_key
     async def _upload_file(self, file_path: str, url: str, use_gzip: bool = False):
         """
@@ -237,6 +237,18 @@ class MassStats:
         except json.JSONDecodeError as e:
             raise ValueError(f"Invalid JSON in file {file_path}: {e}")

+        return await self._upload_json_data(json_data, url, use_gzip)
+
+    @require_api_key
+    async def _upload_json_data(self, json_data: Union[Dict, List], url: str, use_gzip: bool = False):
+        """
+        Helper method to upload JSON data directly to a signed URL.
+
+        Args:
+            json_data: JSON data (dict or list) to upload
+            url: Signed URL to upload to
+            use_gzip: Whether to compress the data with gzip
+        """
         if hasattr(json, 'dumps') and 'ignore_nan' in json.dumps.__code__.co_varnames:
             dumps_kwargs = {'ignore_nan': True}
         else:
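The new _upload_json_data helper serialises in-memory JSON and PUTs it to the signed URL through the client's request helper. As a rough standalone illustration of the same idea, using plain aiohttp rather than the client's internal _regular_request, and with the gzip handling as an assumption rather than something shown in this diff:

import gzip
import json
import aiohttp

async def put_json(url: str, payload, use_gzip: bool = False) -> int:
    # Serialise the payload, optionally compress it, then PUT it to the signed URL.
    body = json.dumps(payload).encode("utf-8")
    headers = {"Content-Type": "application/json"}
    if use_gzip:
        body = gzip.compress(body)
        headers["Content-Encoding"] = "gzip"   # assumed header; not shown in the diff
    async with aiohttp.ClientSession() as session:
        async with session.put(url, data=body, headers=headers) as resp:
            return resp.status                 # 200/201/204 are treated as success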
@@ -253,6 +265,7 @@ class MassStats:
         headers = {
             'Content-Type': 'application/json'
         }
+
         response = await self._client._regular_request("PUT", url, data=body, headers=headers)
         return response

@@ -386,36 +399,55 @@ class MassStats:
         region: str,
         output: str,
         config: Dict[str, Any],
-        request_json:
-        manifest_json: Dict[str, Any],
+        request_json: str, # Path to request JSON file
         overwrite: bool = False,
         skip_existing: bool = False,
         location: str = None,
         force_loc: bool = None,
-        server: str =
+        server: str = None
     ) -> Dict[str, Any]:
         """
         Execute a mass stats job.
-
+
         Args:
             name: The name of the job
             region: The region of the job
             output: The output of the job
             config: The config of the job
-            request_json:
-            manifest_json: The manifest JSON
+            request_json: Path to the request JSON file
             overwrite: Whether to overwrite the job
             skip_existing: Whether to skip existing jobs
             location: The location of the job
             force_loc: Whether to force the location
             server: The server to use
-
+
         Returns:
             API response as a dictionary
-
+
         Raises:
             APIError: If the API request fails
         """
+
+        def extract_manifest_from_request(request_data: List[Dict[str, Any]]) -> List[str]:
+            """Extract unique group names from request data to create manifest list."""
+            groups = []
+            seen_groups = set()
+
+            for item in request_data:
+                if not isinstance(item, dict):
+                    raise ValueError("Each item in request JSON should be a dictionary")
+
+                if 'group' not in item:
+                    raise ValueError("Each item should have a 'group' field")
+
+                group = item['group']
+                if group not in seen_groups:
+                    groups.append(group)
+                    seen_groups.add(group)
+
+            return groups
+
+        # Load and validate request JSON
         try:
             with open(request_json, 'r') as file:
                 request_data = json.load(file)
@@ -427,14 +459,35 @@ class MassStats:
             return e
         except json.JSONDecodeError as e:
             return e
-
+
+        # Generate manifest from request data (kept in memory)
+        try:
+            manifest_groups = extract_manifest_from_request(request_data)
+        except Exception as e:
+            raise ValueError(f"Error extracting manifest from request JSON: {e}")
+
+        # Get upload URLs
+        upload_result = await self._upload_request(
+            name=name,
+            size=size,
+            region=region,
+            output=output,
+            config=config,
+            location=location,
+            force_loc=force_loc,
+            overwrite=overwrite,
+            server=server,
+            skip_existing=skip_existing
+        )
+
         requests_url = upload_result.get('requests_url')
         manifest_url = upload_result.get('manifest_url')
+
         if not requests_url:
             raise ValueError("No requests_url returned from server for request JSON upload")

+        # Upload request JSON file
         try:
-            # in this place we are uploading the request json file, we need to check whether the json is in the correct format or not
             self.validate_request(request_json)
             requests_response = await self._upload_file(request_json, requests_url, use_gzip=True)
             if requests_response.status not in [200, 201, 204]:
@@ -446,15 +499,17 @@ class MassStats:
         if not manifest_url:
             raise ValueError("No manifest_url returned from server for manifest JSON upload")

+        # Upload manifest JSON data directly (no temporary file needed)
         try:
-            manifest_response = await self.
+            manifest_response = await self._upload_json_data(manifest_groups, manifest_url, use_gzip=False)
             if manifest_response.status not in [200, 201, 204]:
                 self._client.logger.error(f"Manifest upload error: {manifest_response.text()}")
                 raise Exception(f"Failed to upload manifest JSON: {manifest_response.text()}")
         except Exception as e:
-            raise Exception(f"Error uploading manifest JSON
-
-
+            raise Exception(f"Error uploading manifest JSON: {e}")
+
+        # Start the job
+        start_job_task_id = await self.start_job(upload_result.get("id"))
         return start_job_task_id

     @require_api_key