terrakio-core 0.3.9__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of terrakio-core might be problematic.
- terrakio_core/__init__.py +1 -1
- terrakio_core/async_client.py +21 -2
- terrakio_core/client.py +101 -5
- terrakio_core/convenience_functions/convenience_functions.py +280 -29
- terrakio_core/endpoints/mass_stats.py +71 -16
- terrakio_core/endpoints/model_management.py +388 -217
- terrakio_core/endpoints/user_management.py +5 -5
- terrakio_core/sync_client.py +106 -185
- {terrakio_core-0.3.9.dist-info → terrakio_core-0.4.0.dist-info}/METADATA +1 -1
- {terrakio_core-0.3.9.dist-info → terrakio_core-0.4.0.dist-info}/RECORD +12 -12
- {terrakio_core-0.3.9.dist-info → terrakio_core-0.4.0.dist-info}/WHEEL +0 -0
- {terrakio_core-0.3.9.dist-info → terrakio_core-0.4.0.dist-info}/top_level.txt +0 -0
terrakio_core/__init__.py
CHANGED
terrakio_core/async_client.py
CHANGED
@@ -182,6 +182,11 @@ class AsyncClient(BaseClient):
         out_crs: str = "epsg:4326",
         resolution: int = -1,
         geom_fix: bool = False,
+        drop_nan: bool = False,
+        spatial_reduction: str = None,
+        temporal_reduction: str = None,
+        max_memory_mb: int = 500,
+        stream_to_disk: bool = False,
     ):
         """
         Compute zonal statistics for all geometries in a GeoDataFrame.
@@ -195,11 +200,20 @@ class AsyncClient(BaseClient):
             out_crs (str): Output coordinate reference system
             resolution (int): Resolution parameter
             geom_fix (bool): Whether to fix the geometry (default False)
+            drop_nan (bool): Whether to drop NaN values from the results (default False)
+            spatial_reduction (str): Reduction operation for spatial dimensions (x, y).
+                Options: 'mean', 'median', 'min', 'max', 'std', 'var', 'sum', 'count'
+            temporal_reduction (str): Reduction operation for temporal dimension (time).
+                Options: 'mean', 'median', 'min', 'max', 'std', 'var', 'sum', 'count'
+            max_memory_mb (int): Maximum memory threshold in MB (default 500MB)
+            stream_to_disk (bool): Whether to stream datasets to disk as NetCDF files (default False)
+
         Returns:
             geopandas.GeoDataFrame: GeoDataFrame with added columns for results, or None if inplace=True
+                If stream_to_disk=True, large datasets are saved as NetCDF files with file paths stored.

         Raises:
-            ValueError: If concurrency is too high
+            ValueError: If concurrency is too high or if data exceeds memory limit without streaming
             APIError: If the API request fails
         """
         return await _zonal_stats(
@@ -211,7 +225,12 @@ class AsyncClient(BaseClient):
             in_crs=in_crs,
             out_crs=out_crs,
             resolution=resolution,
-            geom_fix=geom_fix
+            geom_fix=geom_fix,
+            drop_nan=drop_nan,
+            spatial_reduction=spatial_reduction,
+            temporal_reduction=temporal_reduction,
+            max_memory_mb=max_memory_mb,
+            stream_to_disk=stream_to_disk
         )

     async def create_dataset_file(
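Taken together, the new parameters let one zonal_stats call reduce and memory-bound the result. A minimal sketch of the new surface, assuming credentials resolve via the standard config file; the input file and expression string are hypothetical:

import asyncio
import geopandas as gpd
from terrakio_core.async_client import AsyncClient

async def main():
    gdf = gpd.read_file("fields.geojson")     # hypothetical input file
    client = AsyncClient()                    # assumes config-file credentials
    result = await client.zonal_stats(
        gdf=gdf,
        expr="ndvi=(B08-B04)/(B08+B04)",      # hypothetical Terrakio expression
        spatial_reduction="mean",             # collapse x/y to one value per geometry
        temporal_reduction="max",             # collapse time to one value per geometry
        drop_nan=True,
        max_memory_mb=500,                    # raises early if the estimate exceeds this
        stream_to_disk=False,
    )
    print(result.head())                      # original columns plus one per output variable

asyncio.run(main())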
terrakio_core/client.py
CHANGED
@@ -1,11 +1,14 @@
 from typing import Optional
 import logging
+import warnings
 from terrakio_core.config import read_config_file, DEFAULT_CONFIG_FILE
 from abc import abstractmethod
+import xarray as xr
+

 class BaseClient():
     def __init__(self, url: Optional[str] = None, api_key: Optional[str] = None, verbose: bool = False):
-
+
         self.verbose = verbose
         self.logger = logging.getLogger("terrakio")
         if verbose:
@@ -21,17 +24,110 @@ class BaseClient():
         self.url = url
         self.key = api_key

-        config = read_config_file(
-
+        config = read_config_file(DEFAULT_CONFIG_FILE, logger=self.logger)
+
         if self.url is None:
             self.url = config.get('url')
-
+
         if self.key is None:
             self.key = config.get('key')

         self.token = config.get('token')
-

+        # Apply xarray printing fix to prevent crashes with GeoDataFrames
+        self._apply_xarray_fix()
+
+    def _apply_xarray_fix(self):
+        """
+        Apply xarray printing fix to prevent crashes when GeoDataFrames contain xarray objects.
+        This fix is applied automatically when the client is initialized.
+        """
+        try:
+
+            # Check if fix is already applied globally
+            if hasattr(xr.DataArray, '_terrakio_fix_applied'):
+                if self.verbose:
+                    self.logger.info("xarray printing fix already applied")
+                return
+
+            # Store original methods for potential restoration
+            if not hasattr(xr.DataArray, '_original_iter'):
+                xr.DataArray._original_iter = xr.DataArray.__iter__
+                xr.Dataset._original_iter = xr.Dataset.__iter__
+
+            # Define safe iteration methods that prevent pandas from iterating
+            # but leave __repr__ and __str__ untouched for normal xarray printing
+            def safe_dataarray_iter(self):
+                # Return infinite iterator that always yields the same safe value
+                name = getattr(self, 'name', None) or 'unnamed'
+                shape_str = 'x'.join(map(str, self.shape)) if hasattr(self, 'shape') else 'unknown'
+                placeholder = f"<DataArray '{name}' {shape_str}>"
+                while True:
+                    yield placeholder
+
+            def safe_dataset_iter(self):
+                # Return infinite iterator that always yields the same safe value
+                num_vars = len(self.data_vars) if hasattr(self, 'data_vars') else 0
+                num_dims = len(self.dims) if hasattr(self, 'dims') else 0
+                placeholder = f"<Dataset: {num_vars} vars, {num_dims} dims>"
+                while True:
+                    yield placeholder
+
+            # Apply only the iteration fix - leave __repr__ and __str__ untouched
+            xr.DataArray.__iter__ = safe_dataarray_iter
+            xr.Dataset.__iter__ = safe_dataset_iter
+
+            # Mark as applied to avoid duplicate applications
+            xr.DataArray._terrakio_fix_applied = True
+            xr.Dataset._terrakio_fix_applied = True
+
+            if self.verbose:
+                self.logger.info("xarray iteration fix applied - GeoDataFrames with xarray objects will print safely, direct xarray printing unchanged")
+
+        except ImportError:
+            # xarray not installed, skip the fix
+            if self.verbose:
+                self.logger.info("xarray not installed, skipping printing fix")
+        except Exception as e:
+            # Log warning but don't fail initialization
+            warning_msg = f"Failed to apply xarray printing fix: {e}"
+            warnings.warn(warning_msg)
+            if self.verbose:
+                self.logger.warning(warning_msg)
+
+    def restore_xarray_printing(self):
+        """
+        Restore original xarray printing behavior.
+        Call this method if you want to see full xarray representations again.
+        """
+        try:
+            import xarray as xr
+
+            if hasattr(xr.DataArray, '_original_iter'):
+                xr.DataArray.__iter__ = xr.DataArray._original_iter
+                xr.Dataset.__iter__ = xr.Dataset._original_iter
+
+                # Remove the fix markers
+                if hasattr(xr.DataArray, '_terrakio_fix_applied'):
+                    delattr(xr.DataArray, '_terrakio_fix_applied')
+                if hasattr(xr.Dataset, '_terrakio_fix_applied'):
+                    delattr(xr.Dataset, '_terrakio_fix_applied')
+
+                if self.verbose:
+                    self.logger.info("Original xarray iteration behavior restored")
+            else:
+                if self.verbose:
+                    self.logger.info("No xarray fix to restore")
+
+        except ImportError:
+            if self.verbose:
+                self.logger.info("xarray not available")
+        except Exception as e:
+            warning_msg = f"Failed to restore xarray printing: {e}"
+            warnings.warn(warning_msg)
+            if self.verbose:
+                self.logger.warning(warning_msg)
+
     @abstractmethod
     def _setup_session(self):
         """Initialize the HTTP session - implemented by sync/async clients"""
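The patch above swaps only __iter__; __repr__ and __str__ stay intact, so printing an xarray object directly is unchanged. A standalone replica of the same guard, useful for seeing the before/after behavior without constructing a client (this mirrors _apply_xarray_fix but is not the library call itself):

import numpy as np
import xarray as xr

def safe_dataarray_iter(self):
    # Same idea as the client's guard: yield a short placeholder instead of elements
    name = getattr(self, "name", None) or "unnamed"
    shape_str = "x".join(map(str, self.shape))
    while True:
        yield f"<DataArray '{name}' {shape_str}>"

xr.DataArray._original_iter = xr.DataArray.__iter__
xr.DataArray.__iter__ = safe_dataarray_iter

da = xr.DataArray(np.zeros((3, 3)), name="ndvi")
print(next(iter(da)))   # <DataArray 'ndvi' 3x3>
print(da.shape)         # repr/str and attribute access are untouched: (3, 3)

# Undo, as restore_xarray_printing() does for both DataArray and Dataset
xr.DataArray.__iter__ = xr.DataArray._original_iter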
terrakio_core/convenience_functions/convenience_functions.py
CHANGED
@@ -11,37 +11,39 @@ from ..helper.tiles import tiles
 import uuid
 import xarray as xr

-
-async def zonal_stats(
+async def request_data(
     client,
     gdf: GeoDataFrame,
     expr: str,
     conc: int = 20,
-    inplace: bool = False,
     in_crs: str = "epsg:4326",
     out_crs: str = "epsg:4326",
     resolution: int = -1,
     geom_fix: bool = False,
+    max_memory_mb: int = 500,
+    stream_to_disk: bool = None,
 ):
     """
-
+    Request xarray datasets for all geometries in a GeoDataFrame.

     Args:
         client: The AsyncClient instance
         gdf (GeoDataFrame): GeoDataFrame containing geometries
         expr (str): Terrakio expression to evaluate, can include spatial aggregations
         conc (int): Number of concurrent requests to make
-        inplace (bool): Whether to modify the input GeoDataFrame in place
         in_crs (str): Input coordinate reference system
         out_crs (str): Output coordinate reference system
         resolution (int): Resolution parameter
         geom_fix (bool): Whether to fix the geometry (default False)
+        max_memory_mb (int): Maximum memory threshold in MB (default 500MB)
+        stream_to_disk (bool): Whether to stream large datasets to disk. If None, will be determined automatically.
+
     Returns:
-        geopandas.GeoDataFrame: GeoDataFrame with
-
+        geopandas.GeoDataFrame: Copy of input GeoDataFrame with additional 'dataset' column
+            containing the xarray Dataset for each geometry.

     Raises:
-        ValueError: If concurrency is too high
+        ValueError: If concurrency is too high or if data exceeds memory limit without streaming
         APIError: If the API request fails
     """
     if conc > 100:
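With the reworked signature, request_data returns the raw per-geometry datasets rather than flattened statistics, so it can be called directly. A sketch under the same assumptions as above (hypothetical expression; credentials via the config file):

import geopandas as gpd
from terrakio_core.async_client import AsyncClient
from terrakio_core.convenience_functions.convenience_functions import request_data

async def fetch_datasets(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    client = AsyncClient()
    # Returns a copy of gdf with a 'dataset' column of per-geometry xarray Datasets
    return await request_data(
        client,
        gdf,
        expr="ndvi=(B08-B04)/(B08+B04)",  # hypothetical expression
        stream_to_disk=None,              # None: auto-check the size estimate below
    )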
@@ -49,13 +51,54 @@ async def zonal_stats(

     total_geometries = len(gdf)

+    # First, make a request with the first geometry to estimate total memory usage
+    client.logger.info("Estimating total memory usage...")
+    first_geom = gdf.geometry.iloc[0]
+    feature = {
+        "type": "Feature",
+        "geometry": mapping(first_geom),
+        "properties": {}
+    }
+
+    try:
+        first_result = await client.geoquery(expr=expr, feature=feature,
+            in_crs=in_crs, out_crs=out_crs, resolution=resolution, geom_fix=geom_fix)
+        if isinstance(first_result, dict) and first_result.get("error"):
+            error_msg = f"Request failed: {first_result.get('error_message', 'Unknown error')}"
+            if first_result.get('status_code'):
+                error_msg = f"Request failed with status {first_result['status_code']}: {first_result.get('error_message', 'Unknown error')}"
+            raise APIError(error_msg)
+
+        if not isinstance(first_result, xr.Dataset):
+            raise ValueError(f"Expected xarray Dataset, got {type(first_result)}")
+
+        # Estimate total memory usage
+        single_dataset_size_bytes = estimate_dataset_size(first_result)
+        total_size_bytes = single_dataset_size_bytes * total_geometries
+        total_size_mb = total_size_bytes / (1024 * 1024)
+
+        client.logger.info(f"Estimated total memory usage: {total_size_mb:.2f} MB for {total_geometries} geometries")
+
+        # Check if we need to stream to disk
+        if stream_to_disk is None:
+            # Auto-determine based on memory usage
+            if total_size_mb > max_memory_mb:
+                client.logger.warning(f"The data you are requesting exceeds {max_memory_mb} MB, we recommend you to set the stream_to_disk parameter to True")
+                raise ValueError(f"The data you are requesting exceeds {max_memory_mb} MB, we recommend you to set the stream_to_disk parameter to True")
+
+    except Exception as e:
+        if "recommend you to set the stream_to_disk parameter to True" in str(e):
+            raise
+        client.logger.error(f"Failed to estimate memory usage: {e}")
+        raise
+
     client.logger.info(f"Processing {total_geometries} geometries with concurrency {conc}")

     completed_count = 0
     lock = asyncio.Lock()

     async def process_geometry(geom):
-        """Process a single geometry"""
+        """Process a single geometry and return the dataset"""
         nonlocal completed_count

         try:
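The estimate is simply bytes-per-element times element count, scaled by the number of geometries, using only the first geometry's response. A quick back-of-envelope check of the threshold logic with a synthetic dataset (estimate_dataset_size itself is defined further down in this diff):

import numpy as np
import xarray as xr

# One geometry's response: a daily float32 variable on a 100 x 100 grid
ds = xr.Dataset(
    {"ndvi": (("time", "y", "x"), np.zeros((365, 100, 100), dtype=np.float32))}
)
per_geometry = sum(v.dtype.itemsize * v.size for v in ds.data_vars.values())
print(per_geometry / 1024**2)        # ~13.9 MB per geometry

print(per_geometry * 50 / 1024**2)   # ~696 MB for 50 geometries
# That exceeds the 500 MB default, so request_data raises and
# suggests setting stream_to_disk=True.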
@@ -103,53 +146,261 @@ async def zonal_stats(
            raise APIError(f"API request failed: {e.response.text}")
        raise

-    client.logger.info("All requests completed!
+    client.logger.info("All requests completed!")

-
    if not all_results:
        raise ValueError("No valid results were returned for any geometry")

+    # Create a copy of the input GeoDataFrame
+    result_gdf = gdf.copy()
+
+    # Add the dataset column with the xarray datasets
+    result_gdf['dataset'] = all_results
+
+    return result_gdf
+
+
+import os
+from pathlib import Path
+
+def estimate_dataset_size(dataset):
+    """
+    Estimate the memory size of an xarray dataset in bytes.
+
+    Args:
+        dataset: xarray Dataset
+
+    Returns:
+        int: Estimated size in bytes
+    """
+    total_size = 0
+    for var_name, var in dataset.data_vars.items():
+        # Get the dtype size in bytes
+        dtype_size = var.dtype.itemsize
+        # Get the total number of elements
+        total_elements = var.size
+        # Calculate total size for this variable
+        total_size += dtype_size * total_elements
+
+    # Add coordinate sizes
+    for coord_name, coord in dataset.coords.items():
+        if coord_name not in dataset.dims:  # Don't double count dimension coordinates
+            dtype_size = coord.dtype.itemsize
+            total_elements = coord.size
+            total_size += dtype_size * total_elements
+
+    return total_size

+def save_dataset_to_file(dataset, filepath):
+    """
+    Save dataset to NetCDF file.
+
+    Args:
+        dataset: xarray Dataset
+        filepath: Path to save the file
+
+    Returns:
+        str: Path to saved file
+    """
+    filepath = Path(filepath)
+
+    if not str(filepath).endswith('.nc'):
+        filepath = filepath.with_suffix('.nc')
+
+    dataset.to_netcdf(filepath)
+    return str(filepath)
+
+def post_processing(
+    gdf_with_datasets: GeoDataFrame,
+    spatial_reduction: str = None,
+    temporal_reduction: str = None,
+    drop_nan: bool = False,
+    inplace: bool = False,
+    stream_to_disk: bool = False,
+):
+    """
+    Post-process the GeoDataFrame with datasets to extract variables with optional reductions.
+
+    Args:
+        gdf_with_datasets (GeoDataFrame): GeoDataFrame with 'dataset' column containing xarray Datasets
+        spatial_reduction (str): Reduction operation for spatial dimensions (x, y).
+            Options: 'mean', 'median', 'min', 'max', 'std', 'var', 'sum', 'count'
+        temporal_reduction (str): Reduction operation for temporal dimension (time).
+            Options: 'mean', 'median', 'min', 'max', 'std', 'var', 'sum', 'count'
+        drop_nan (bool): Whether to drop NaN values from the results (default False)
+        inplace (bool): Whether to modify the input GeoDataFrame in place
+        stream_to_disk (bool): Whether to stream datasets to disk as NetCDF files (default False)
+
+    Returns:
+        geopandas.GeoDataFrame: GeoDataFrame with variable dataarrays/values or file paths in separate columns.
+            If stream_to_disk=True, large datasets are saved as NetCDF files with file paths stored.
+    """
+    if 'dataset' not in gdf_with_datasets.columns:
+        raise ValueError("Input GeoDataFrame must contain a 'dataset' column")
+
+    # Validate reduction parameters
+    valid_reductions = ['mean', 'median', 'min', 'max', 'std', 'var', 'sum', 'count']
+    if spatial_reduction and spatial_reduction not in valid_reductions:
+        raise ValueError(f"spatial_reduction must be one of {valid_reductions}")
+    if temporal_reduction and temporal_reduction not in valid_reductions:
+        raise ValueError(f"temporal_reduction must be one of {valid_reductions}")
+
    result_rows = []
    geometries = []

-    #
-    for i,
-
-
-    # Create
+    # Process each row (geometry + dataset)
+    for i, row in gdf_with_datasets.iterrows():
+        dataset = row['dataset']
+
+        # Create new row for this geometry
        new_row = {}
-
-        # Copy original GeoDataFrame attributes
-        for col in
-            if col
-                new_row[col] =
-
+
+        # Copy original GeoDataFrame attributes (excluding dataset column)
+        for col in gdf_with_datasets.columns:
+            if col not in ['geometry', 'dataset']:
+                new_row[col] = row[col]
+
+        # Process each variable in the dataset
        data_vars = list(dataset.data_vars.keys())
        for var_name in data_vars:
            var_data = dataset[var_name]
-
+
+            # Apply drop_nan if requested
+            if drop_nan:
+                # Drop spatial dimensions where all values are NaN
+                var_data = var_data.dropna(dim='x', how='all').dropna(dim='y', how='all')
+
+                # Drop time dimensions where all values are NaN
+                if 'time' in var_data.dims:
+                    var_data = var_data.dropna(dim='time', how='all')
+
+            # Check current dimensions to determine if aggregation is needed
+            current_dims = set(var_data.dims)
+            has_spatial_dims = bool(current_dims.intersection(['x', 'y']))
+            has_temporal_dim = 'time' in current_dims
+
+            # Apply spatial reduction only if spatial dimensions exist and reduction is requested
+            if spatial_reduction and has_spatial_dims:
+                spatial_dims = [dim for dim in ['x', 'y'] if dim in var_data.dims]
+                if spatial_dims:
+                    if spatial_reduction == 'count':
+                        var_data = var_data.count(dim=spatial_dims)
+                    else:
+                        var_data = getattr(var_data, spatial_reduction)(dim=spatial_dims)
+
+            # Apply temporal reduction only if time dimension exists and reduction is requested
+            if temporal_reduction and has_temporal_dim:
+                if temporal_reduction == 'count':
+                    var_data = var_data.count(dim='time')
+                else:
+                    var_data = getattr(var_data, temporal_reduction)(dim='time')
+
+            # Handle streaming to disk if requested
+            if stream_to_disk:
+                # Create a single-variable dataset for saving
+                single_var_dataset = var_data.to_dataset(name=var_name)
+
+                # Generate filename based on row index and variable name
+                filename = f"geometry_{i}_{var_name}.nc"
+                filepath = os.path.join(os.getcwd(), filename)
+
+                # Save to disk and store file path
+                saved_path = save_dataset_to_file(single_var_dataset, filepath)
+                new_row[var_name] = f"file://{saved_path}"
+
+                print(f"Dataset for geometry {i}, variable '{var_name}' saved to: {saved_path}")
+            else:
+                # Keep in memory
+                new_row[var_name] = var_data
+
        result_rows.append(new_row)
-        geometries.append(
+        geometries.append(row['geometry'])
+
    # Create the result GeoDataFrame with default integer index
    result_gdf = GeoDataFrame(result_rows, geometry=geometries)
+
    if inplace:
        # Clear original gdf and replace with result_gdf content
-
-
+        gdf_with_datasets.drop(gdf_with_datasets.index, inplace=True)
+        gdf_with_datasets.drop(gdf_with_datasets.columns, axis=1, inplace=True)

        # Copy all data from result_gdf to gdf
        for col in result_gdf.columns:
-
+            gdf_with_datasets[col] = result_gdf[col].values

        # Ensure it remains a GeoDataFrame with correct geometry
-
+        gdf_with_datasets.geometry = result_gdf.geometry

        return None
    else:
-
        return result_gdf

+
+# Updated zonal_stats function that uses both parts
+async def zonal_stats(
+    client,
+    gdf: GeoDataFrame,
+    expr: str,
+    conc: int = 20,
+    inplace: bool = False,
+    in_crs: str = "epsg:4326",
+    out_crs: str = "epsg:4326",
+    resolution: int = -1,
+    geom_fix: bool = False,
+    drop_nan: bool = False,
+    spatial_reduction: str = None,
+    temporal_reduction: str = None,
+    max_memory_mb: int = 500,
+    stream_to_disk: bool = False,
+):
+    """
+    Compute zonal statistics for all geometries in a GeoDataFrame.
+    This is a convenience function that combines request_data and post_processing.
+
+    Args:
+        client: The AsyncClient instance
+        gdf (GeoDataFrame): GeoDataFrame containing geometries
+        expr (str): Terrakio expression to evaluate, can include spatial aggregations
+        conc (int): Number of concurrent requests to make
+        inplace (bool): Whether to modify the input GeoDataFrame in place
+        in_crs (str): Input coordinate reference system
+        out_crs (str): Output coordinate reference system
+        resolution (int): Resolution parameter
+        geom_fix (bool): Whether to fix the geometry (default False)
+        drop_nan (bool): Whether to drop NaN values from the results (default False)
+        spatial_reduction (str): Reduction operation for spatial dimensions (x, y).
+            Options: 'mean', 'median', 'min', 'max', 'std', 'var', 'sum', 'count'
+        temporal_reduction (str): Reduction operation for temporal dimension (time).
+            Options: 'mean', 'median', 'min', 'max', 'std', 'var', 'sum', 'count'
+        max_memory_mb (int): Maximum memory threshold in MB (default 500MB)
+        stream_to_disk (bool): Whether to stream datasets to disk as NetCDF files (default False)
+    """
+    # Step 1: Request data (with memory estimation)
+    gdf_with_datasets = await request_data(
+        client=client,
+        gdf=gdf,
+        expr=expr,
+        conc=conc,
+        in_crs=in_crs,
+        out_crs=out_crs,
+        resolution=resolution,
+        geom_fix=geom_fix,
+        max_memory_mb=max_memory_mb,
+        stream_to_disk=stream_to_disk
+    )
+
+    # Step 2: Post-process with reductions and optional streaming
+    result = post_processing(
+        gdf_with_datasets=gdf_with_datasets,
+        spatial_reduction=spatial_reduction,
+        temporal_reduction=temporal_reduction,
+        drop_nan=drop_nan,
+        inplace=inplace,
+        stream_to_disk=stream_to_disk
+    )
+
+    return result
+
 async def create_dataset_file(
     client,
     aoi: str,