terrakio-core 0.4.8__py3-none-any.whl → 0.4.93__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of terrakio-core might be problematic.
- terrakio_core/__init__.py +1 -1
- terrakio_core/accessors.py +800 -328
- terrakio_core/async_client.py +6 -2
- terrakio_core/convenience_functions/create_dataset_file.py +132 -0
- terrakio_core/convenience_functions/geoquries.py +102 -0
- terrakio_core/convenience_functions/{convenience_functions.py → zonal_stats.py} +166 -263
- terrakio_core/endpoints/mass_stats.py +42 -147
- terrakio_core/sync_client.py +0 -340
- terrakio_core-0.4.93.dist-info/METADATA +31 -0
- {terrakio_core-0.4.8.dist-info → terrakio_core-0.4.93.dist-info}/RECORD +11 -10
- {terrakio_core-0.4.8.dist-info → terrakio_core-0.4.93.dist-info}/WHEEL +1 -2
- terrakio_core-0.4.8.dist-info/METADATA +0 -47
- terrakio_core-0.4.8.dist-info/top_level.txt +0 -1
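
Most of the churn in this release is in terrakio_core/accessors.py, which rewrites the `geo` DataFrame accessor: reductions now handle scalar columns alongside xarray columns, and on `cloud_object` frames chained calls are queued and shipped to mass-stats as a post-processing job instead of running locally. A minimal sketch of the local call style the new accessor supports — synthetic data, a made-up `ndvi` column name, and the assumption (suggested by the decorator in the diff below) that importing `terrakio_core.accessors` registers the accessor:

```python
import xarray as xr
import geopandas as gpd
from shapely.geometry import box

import terrakio_core.accessors  # noqa: F401 -- assumed to register the "geo" accessor on import

# One small xarray object per geometry; "ndvi" is a hypothetical column name.
geoms = [box(0, 0, 1, 1), box(1, 0, 2, 1)]
gdf = gpd.GeoDataFrame(
    {"ndvi": [xr.DataArray([[1.0, 2.0], [3.0, 4.0]], dims=("y", "x")) for _ in geoms]},
    geometry=geoms,
)

# Spatial reduction runs inside each cell's xarray object; a fully reduced
# result is collapsed to a plain float per row by _try_convert_to_scalar.
reduced = gdf.geo.mean(dim=["x", "y"])
print(reduced["ndvi"].tolist())  # [2.5, 2.5]
```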
terrakio_core/accessors.py
CHANGED
@@ -1,151 +1,370 @@
-
+# Standard library imports
+import inspect
+import time
+import weakref
+from typing import List, Optional, Union
+
+# Third-party imports
 import geopandas as gpd
-import xarray as xr
 import numpy as np
-
+import pandas as pd
+import xarray as xr
+
+# Local/relative imports
+from .convenience_functions.zonal_stats import cloud_object
+from .endpoints.mass_stats import MassStats
 
 @pd.api.extensions.register_dataframe_accessor("geo")
 class GeoXarrayAccessor:
-    """
-    Custom accessor for GeoDataFrames containing xarray datasets or dataarrays.
-    Handles both direct xarray objects and lists containing xarray objects.
-    Can aggregate across time when time dimension has been expanded into the index.
-    """
 
     def __init__(self, pandas_obj):
         self._obj = pandas_obj
+
+        # Only initialize client for cloud_object instances
+        if isinstance(pandas_obj, cloud_object):
+            self._client = pandas_obj.client
+        else:
+            self._client = None
+
+        chain_state = self._obj.attrs.get('_geo_chain_state', None)
+
+        if chain_state:
+            self._pending_operations = chain_state.get('pending_operations', [])
+            self._operation_sequence_id = chain_state.get('operation_sequence_id', None)
+            self._last_operation_time = chain_state.get('last_operation_time', None)
+            self._operation_count = chain_state.get('operation_count', 0)
+            self._processing_in_progress = chain_state.get('processing_in_progress', False)
+        else:
+            self._pending_operations = []
+            self._operation_sequence_id = None
+            self._operation_count = 0
+            self._last_operation_time = None
+            self._processing_in_progress = False
+
+        self._chain_refs = weakref.WeakSet()
         self._validate()
-
+
     def _validate(self):
-
-
-
+        if isinstance(self._obj, gpd.GeoDataFrame):
+            pass
+        elif isinstance(self._obj, cloud_object):
+            pass
+        elif isinstance(self._obj, pd.DataFrame) and hasattr(self._obj, '_has_index_geometry'):
+            pass
+        elif isinstance(self._obj, pd.DataFrame) and hasattr(self._obj.index, 'names'):
+            geometry_level = self._get_geometry_level_name()
+            if geometry_level is None:
+                raise AttributeError("Can only use .geo accessor with GeoDataFrames or DataFrames with geometry in index")
+        else:
+            raise AttributeError("Can only use .geo accessor with GeoDataFrames or DataFrames with geometry in index")
 
-        # Check for columns with xarray data (including lists containing xarray objects)
         self._xarray_columns = []
+        self._scalar_columns = []
+
         for col in self._obj.columns:
             if col != 'geometry':
                 sample_value = self._obj[col].iloc[0] if len(self._obj) > 0 else None
 
-                # Check if it's directly an xarray object
                 if isinstance(sample_value, (xr.Dataset, xr.DataArray)):
                     self._xarray_columns.append(col)
-                # Check if it's a list containing xarray objects
                 elif isinstance(sample_value, list) and len(sample_value) > 0:
                     if isinstance(sample_value[0], (xr.Dataset, xr.DataArray)):
                         self._xarray_columns.append(col)
+                elif isinstance(sample_value, (int, float, np.integer, np.floating)):
+                    self._scalar_columns.append(col)
+                elif pd.isna(sample_value):
+                    self._scalar_columns.append(col)
 
-        if not self._xarray_columns:
-            raise AttributeError("No xarray Dataset or
-
-    def _extract_xarray_object(self, value):
-        """Extract xarray object from various formats (direct object, list, etc.)."""
-        if isinstance(value, (xr.Dataset, xr.DataArray)):
-            return value
-        elif isinstance(value, list) and len(value) > 0:
-            if isinstance(value[0], (xr.Dataset, xr.DataArray)):
-                return value[0]  # Take the first item from the list
-        return None
-
-    def _get_target_columns(self, columns: Optional[List[str]] = None) -> List[str]:
-        """
-        Get the list of columns to operate on.
-
-        Args:
-            columns: List of column names to operate on. If None, uses all xarray columns.
-
-        Returns:
-            List of column names to operate on
-        """
-        if columns is None:
-            return self._xarray_columns
-
-        # Validate that specified columns exist and contain xarray data
-        invalid_columns = [col for col in columns if col not in self._xarray_columns]
-        if invalid_columns:
-            raise ValueError(f"Columns {invalid_columns} are not valid xarray columns. "
-                             f"Available xarray columns: {self._xarray_columns}")
-
-        return columns
+        if not self._xarray_columns and not self._scalar_columns:
+            raise AttributeError("No xarray Dataset, DataArray, or aggregated scalar columns found")
 
     def _should_aggregate_by_geometry(self, dim: Optional[Union[str, List[str]]] = None) -> bool:
-        """
-        Determine if we should aggregate by geometry (i.e., time dimension was expanded to index).
-
-        Args:
-            dim: Dimension(s) being reduced over
-
-        Returns:
-            True if we should group by geometry and aggregate across time rows
-        """
         if dim is None:
             return False
 
         dims_to_reduce = [dim] if isinstance(dim, str) else dim
 
-        # Check if 'time' is in the dimensions to reduce and if we have a MultiIndex with time
        if 'time' in dims_to_reduce:
            if hasattr(self._obj.index, 'names') and self._obj.index.names:
-                # Check if time is one of the index levels
                return 'time' in self._obj.index.names
 
        return False
 
    def _get_geometry_level_name(self) -> Optional[str]:
-        """Get the name of the geometry level in the MultiIndex."""
        if hasattr(self._obj.index, 'names') and self._obj.index.names:
-
-
+            non_time_levels = [name for name in self._obj.index.names if name != 'time']
+            if len(non_time_levels) == 1:
+                return non_time_levels[0]
+
+            for i, name in enumerate(self._obj.index.names):
                if name != 'time':
-
+                    try:
+                        sample_value = self._obj.index.get_level_values(i)[0]
+                        if hasattr(sample_value, 'geom_type') or hasattr(sample_value, 'bounds'):
+                            return name
+                    except (IndexError, AttributeError):
+                        continue
+
+            if non_time_levels:
+                return non_time_levels[0]
+
        return None
 
-    def
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    def _try_convert_to_scalar(self, data):
+        if isinstance(data, xr.DataArray) and data.size == 1:
+            try:
+                return float(data.values)
+            except (ValueError, TypeError):
+                pass
+        elif isinstance(data, xr.Dataset) and len(data.dims) == 0:
+            try:
+                vars_list = list(data.data_vars.keys())
+                if len(vars_list) == 1:
+                    var_name = vars_list[0]
+                    return float(data[var_name].values)
+            except (ValueError, TypeError, KeyError):
+                pass
+        return data
+
+    def _ensure_proper_geodataframe(self, result_data, result_geometries, result_index, geometry_level):
+        result_df = pd.DataFrame(result_data)
+        result_df['geometry'] = result_geometries
+
+        try:
+            crs = self._obj.crs
+        except AttributeError:
+            crs = None
+
+        result_gdf = gpd.GeoDataFrame(result_df, geometry='geometry', crs=crs)
+
+        if geometry_level:
+            result_gdf = result_gdf.set_index(['geometry'])
+            result_gdf.index.name = geometry_level
        else:
-
+            result_gdf = result_gdf.set_index(['geometry'])
+
+        result_gdf._original_crs = crs
+        result_gdf._index_geometry_level = geometry_level
+        result_gdf._has_index_geometry = True
+
+        return result_gdf
+
+    def to_index_geometry(self):
+        if not hasattr(self._obj, '_has_index_geometry') or not self._obj._has_index_geometry:
+            return self._obj
+
+        data_columns = [col for col in self._obj.columns if col != 'geometry']
+        result_df = self._obj[data_columns].copy()
+
+        result_df._original_crs = getattr(self._obj, 'crs', None)
+        result_df._index_geometry_level = getattr(self._obj, '_index_geometry_level', None)
+
+        return result_df
+
+    def to_column_geometry(self):
+        if 'geometry' in self._obj.columns:
+            return self._obj
+
+        if hasattr(self._obj, '_index_geometry_level'):
+            geometry_level = self._obj._index_geometry_level
+            geometry_series = self._obj.index.to_series()
+
+            result_gdf = gpd.GeoDataFrame(
+                self._obj.copy(),
+                geometry=geometry_series,
+                crs=getattr(self._obj, '_original_crs', None)
+            )
+            result_gdf._has_index_geometry = True
+            result_gdf._index_geometry_level = geometry_level
+
+            return result_gdf
+
+        return self._obj
+
+    def _get_geometry_level_name(self):
+        if hasattr(self._obj.index, 'names'):
+            for name in self._obj.index.names:
+                if name and 'geometry' in str(name).lower():
+                    return name
+        return None
+
+    def _inspect_call_stack_for_chain_end(self) -> bool:
+        try:
+            stack = inspect.stack()
+
+            for i, frame_info in enumerate(stack[1:8]):
+                if frame_info.code_context:
+                    line = ''.join(frame_info.code_context).strip()
+
+                    if any(internal in frame_info.filename for internal in
+                           ['pandas', 'numpy', 'site-packages', '<frozen']):
+                        continue
+
+                    if '.geo.' in line:
+                        geo_count = line.count('.geo.')
+                        pending_count = len(self._pending_operations)
+
+                        if pending_count >= geo_count:
+                            return True
+                        else:
+                            return False
+
+            return False
+
+        except Exception:
+            return False
+
+    def _schedule_chain_completion_check(self):
+        return self._inspect_call_stack_for_chain_end()
+
+    def _trigger_processing_immediately(self):
+        import concurrent.futures
+
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            future = executor.submit(self._sync_generate_and_start_processing)
+            try:
+                job_result = future.result(timeout=35)
+                return job_result
+            except concurrent.futures.TimeoutError:
+                return None
+            except Exception as e:
+                return None
+
+    def _extract_xarray_object(self, value):
+        if isinstance(value, (xr.Dataset, xr.DataArray)):
+            return value
+        elif isinstance(value, list) and len(value) > 0:
+            if isinstance(value[0], (xr.Dataset, xr.DataArray)):
+                return value[0]
+
+        try:
+            if pd.isna(value):
+                return None
+        except (TypeError, ValueError):
+            pass
+
+        return None
+
+    def _get_target_columns(self, columns: Optional[List[str]] = None) -> tuple:
+        if columns is None:
+            return self._xarray_columns, self._scalar_columns
+
+        all_valid_columns = self._xarray_columns + self._scalar_columns
+        invalid_columns = [col for col in columns if col not in all_valid_columns]
+        if invalid_columns:
+            raise ValueError(f"Columns {invalid_columns} are not valid xarray or scalar columns. "
+                             f"Available columns: {all_valid_columns}")
+
+        target_xarray = [col for col in columns if col in self._xarray_columns]
+        target_scalar = [col for col in columns if col in self._scalar_columns]
+
+        return target_xarray, target_scalar
+
+    def _apply_spatial_reduction(self, reduction_func: str, spatial_dims: Optional[List[str]],
+                                 target_xarray_columns: List[str], **kwargs):
+        result_gdf = self._obj.copy()
+
+        for col in target_xarray_columns:
+            new_data = []
+            for idx, row in self._obj.iterrows():
+                original_value = row[col]
+                xr_data = self._extract_xarray_object(original_value)
+
+                if xr_data is not None:
+                    try:
+                        if hasattr(xr_data, reduction_func):
+                            if 'skipna' not in kwargs and reduction_func in ['mean', 'sum', 'std', 'var', 'min', 'max', 'median', 'quantile']:
+                                kwargs['skipna'] = True
+
+                            if spatial_dims:
+                                available_spatial_dims = [d for d in spatial_dims if d in xr_data.dims]
+                                if available_spatial_dims:
+                                    reduced_data = getattr(xr_data, reduction_func)(dim=available_spatial_dims, **kwargs)
+                                else:
+                                    reduced_data = xr_data
+                            else:
+                                reduced_data = getattr(xr_data, reduction_func)(dim=None, **kwargs)
+
+                            reduced_data = self._try_convert_to_scalar(reduced_data)
+
+                            if isinstance(original_value, list):
+                                new_data.append([reduced_data])
+                            else:
+                                new_data.append(reduced_data)
+                        else:
+                            raise AttributeError(f"'{type(xr_data).__name__}' object has no attribute '{reduction_func}'")
+                    except Exception as e:
+                        new_data.append(original_value)
+                else:
+                    new_data.append(original_value)
+
+            result_gdf[col] = new_data
+
+        return result_gdf
 
-    def
-                                target_columns: List[str], **kwargs):
-        """
-        Apply aggregation across time by grouping by geometry.
-
-        Args:
-            reduction_func: Name of the reduction method
-            dim: Dimension(s) being reduced (should include 'time')
-            target_columns: Columns to operate on
-            **kwargs: Additional arguments
-
-        Returns:
-            GeoDataFrame with time-aggregated data
-        """
+    def _apply_scalar_temporal_aggregation(self, reduction_func: str, target_scalar_columns: List[str], **kwargs):
        geometry_level = self._get_geometry_level_name()
        if geometry_level is None:
            raise ValueError("Could not identify geometry level in MultiIndex")
 
-
-
-
-
+        grouped = self._obj.groupby(level=geometry_level)
+
+        result_data = []
+        result_geometries = []
+        result_index = []
+
+        for geometry_key, group in grouped:
+            new_row = {}
+
+            for col in target_scalar_columns:
+                try:
+                    if reduction_func == 'mean':
+                        agg_value = group[col].mean(skipna=True)
+                    elif reduction_func == 'sum':
+                        agg_value = group[col].sum(skipna=True)
+                    elif reduction_func == 'std':
+                        agg_value = group[col].std(skipna=True)
+                    elif reduction_func == 'var':
+                        agg_value = group[col].var(skipna=True)
+                    elif reduction_func == 'min':
+                        agg_value = group[col].min(skipna=True)
+                    elif reduction_func == 'max':
+                        agg_value = group[col].max(skipna=True)
+                    elif reduction_func == 'median':
+                        agg_value = group[col].median(skipna=True)
+                    elif reduction_func == 'count':
+                        agg_value = group[col].count()
+                    elif reduction_func == 'quantile':
+                        q = kwargs.get('q', 0.5)
+                        agg_value = group[col].quantile(q, skipna=True)
+                    else:
+                        agg_value = group[col].mean(skipna=True)
+
+                    new_row[col] = agg_value
+
+                except Exception as e:
+                    new_row[col] = np.nan
+
+            for col in self._obj.columns:
+                if col not in target_scalar_columns and col != 'geometry':
+                    new_row[col] = group[col].iloc[0]
+
+            result_data.append(new_row)
+            result_geometries.append(geometry_key)
+            result_index.append(geometry_key)
+
+        return self._ensure_proper_geodataframe(result_data, result_geometries, result_index, geometry_level)
+
+    def _apply_mixed_aggregation(self, reduction_func: str, temporal_dims: List[str],
+                                 spatial_dims: List[str], target_xarray_columns: List[str],
+                                 target_scalar_columns: List[str], **kwargs):
+        geometry_level = self._get_geometry_level_name()
+        if geometry_level is None:
+            raise ValueError("Could not identify geometry level in MultiIndex")
+
+        if target_xarray_columns != self._xarray_columns:
+            target_xarray_columns = self._xarray_columns
 
-        # Group by geometry level
        grouped = self._obj.groupby(level=geometry_level)
 
        result_data = []
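
A note on the chain-state plumbing in this hunk: pandas constructs a fresh accessor instance on each `.geo` attribute access, so the only state that survives a chain like `df.geo.mean(...).geo.max(...)` is what `_apply_cloud_reduction` stashes in `DataFrame.attrs` and `__init__` reads back. A stripped-down sketch of that pattern (accessor name and attrs key here are illustrative, not from the package):

```python
import pandas as pd

@pd.api.extensions.register_dataframe_accessor("demo")
class DemoAccessor:
    def __init__(self, pandas_obj):
        self._obj = pandas_obj
        # Rehydrate whatever a previous accessor instance left behind.
        state = pandas_obj.attrs.get("_demo_chain_state", {"ops": []})
        self._ops = state["ops"]

    def record(self, name):
        self._ops.append(name)
        out = self._obj.copy()
        out.attrs["_demo_chain_state"] = {"ops": self._ops}
        return out

df = pd.DataFrame({"a": [1, 2]})
chained = df.demo.record("mean").demo.record("max")
print(chained.attrs["_demo_chain_state"])  # {'ops': ['mean', 'max']}
```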
@@ -153,24 +372,22 @@ class GeoXarrayAccessor:
        result_index = []
 
        for geometry_key, group in grouped:
-            # For each geometry, collect all xarray objects across time
-            # The geometry is the group key itself (from the MultiIndex)
            new_row = {}
 
-            for col in
+            for col in target_xarray_columns:
                xarray_objects = []
+                valid_time_steps = 0
+                total_time_steps = len(group)
 
-                # Collect all xarray objects for this geometry across different times
                for _, row in group.iterrows():
                    xr_data = self._extract_xarray_object(row[col])
                    if xr_data is not None:
                        xarray_objects.append(xr_data)
+                        valid_time_steps += 1
 
                if xarray_objects:
                    try:
-                        # Concatenate along a new 'time' dimension
                        if isinstance(xarray_objects[0], xr.DataArray):
-                            # Create time coordinate
                            time_coords = list(range(len(xarray_objects)))
                            concatenated = xr.concat(xarray_objects, dim='time')
                            concatenated = concatenated.assign_coords(time=time_coords)
@@ -181,31 +398,27 @@ class GeoXarrayAccessor:
                        else:
                            raise TypeError(f"Unsupported xarray type: {type(xarray_objects[0])}")
 
-                        # Apply the reduction function over the time dimension
                        if hasattr(concatenated, reduction_func):
                            if 'skipna' not in kwargs and reduction_func in ['mean', 'sum', 'std', 'var', 'min', 'max', 'median', 'quantile']:
                                kwargs['skipna'] = True
 
-
+                            if temporal_dims:
+                                reduced_data = getattr(concatenated, reduction_func)(dim='time', **kwargs)
+                            else:
+                                reduced_data = concatenated
+
+                            if spatial_dims:
+                                available_spatial_dims = [d for d in spatial_dims if d in reduced_data.dims]
+                                if available_spatial_dims:
+                                    reduced_data = getattr(reduced_data, reduction_func)(dim=available_spatial_dims, **kwargs)
 
-
-
-
-
-
-
-                                pass
-                            elif isinstance(reduced_data, xr.Dataset) and len(reduced_data.dims) == 0:
-                                try:
-                                    vars_list = list(reduced_data.data_vars.keys())
-                                    if len(vars_list) == 1:
-                                        var_name = vars_list[0]
-                                        scalar_value = float(reduced_data[var_name].values)
-                                        reduced_data = scalar_value
-                                except (ValueError, TypeError, KeyError):
-                                    pass
+                            all_dims_reduced = (
+                                temporal_dims and spatial_dims and
+                                set(temporal_dims + spatial_dims) >= set(reduced_data.dims)
+                            )
+                            if all_dims_reduced:
+                                reduced_data = self._try_convert_to_scalar(reduced_data)
 
-                            # Maintain original format (list vs direct)
                            original_format = group[col].iloc[0]
                            if isinstance(original_format, list):
                                new_row[col] = [reduced_data]
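
The temporal path in the two hunks above is plain xarray once the per-time objects are collected: stack along a new `time` dimension, then reduce. In isolation, with synthetic values:

```python
import xarray as xr

# One 2x2 array per time step, as the accessor collects them for a geometry.
steps = [
    xr.DataArray([[i, i + 1.0], [i + 2.0, i + 3.0]], dims=("y", "x"))
    for i in range(3)
]

stacked = xr.concat(steps, dim="time").assign_coords(time=list(range(len(steps))))
temporal_mean = stacked.mean(dim="time", skipna=True)     # still 2x2 over (y, x)
scalar = temporal_mean.mean(dim=["y", "x"], skipna=True)  # 0-d once spatial dims go too
print(float(scalar.values))  # 2.5
```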
@@ -215,51 +428,215 @@ class GeoXarrayAccessor:
                            raise AttributeError(f"'{type(concatenated).__name__}' object has no attribute '{reduction_func}'")
 
                    except Exception as e:
-
-                        # Keep the first value as fallback
-                        new_row[col] = group[col].iloc[0]
+                        new_row[col] = np.nan
                else:
-
+                    new_row[col] = np.nan
+
+            for col in target_scalar_columns:
+                new_row[col] = group[col].iloc[0]
+
+            for col in self._obj.columns:
+                if (col not in target_xarray_columns and
+                    col not in target_scalar_columns and
+                    col != 'geometry'):
                    new_row[col] = group[col].iloc[0]
 
            result_data.append(new_row)
            result_geometries.append(geometry_key)
            result_index.append(geometry_key)
 
-
-
-
+        return self._ensure_proper_geodataframe(result_data, result_geometries, result_index, geometry_level)
+
+    def _apply_mixed_scalar_xarray_aggregation(self, reduction_func: str, temporal_dims: List[str],
+                                               spatial_dims: List[str], target_xarray_columns: List[str],
+                                               target_scalar_columns: List[str], **kwargs):
+        geometry_level = self._get_geometry_level_name()
+        if geometry_level is None:
+            raise ValueError("Could not identify geometry level in MultiIndex")
 
-
-        result_df['_temp_geom'] = result_geometries
+        grouped = self._obj.groupby(level=geometry_level)
 
-
-
+        result_data = []
+        result_geometries = []
+        result_index = []
 
-
-
+        for geometry_key, group in grouped:
+            new_row = {}
+
+            for col in target_xarray_columns:
+                xarray_objects = []
+
+                for _, row in group.iterrows():
+                    xr_data = self._extract_xarray_object(row[col])
+                    if xr_data is not None:
+                        xarray_objects.append(xr_data)
+
+                if xarray_objects:
+                    try:
+                        if isinstance(xarray_objects[0], xr.DataArray):
+                            time_coords = list(range(len(xarray_objects)))
+                            concatenated = xr.concat(xarray_objects, dim='time')
+                            concatenated = concatenated.assign_coords(time=time_coords)
+                        elif isinstance(xarray_objects[0], xr.Dataset):
+                            time_coords = list(range(len(xarray_objects)))
+                            concatenated = xr.concat(xarray_objects, dim='time')
+                            concatenated = concatenated.assign_coords(time=time_coords)
+                        else:
+                            raise TypeError(f"Unsupported xarray type: {type(xarray_objects[0])}")
+
+                        if hasattr(concatenated, reduction_func):
+                            if 'skipna' not in kwargs and reduction_func in ['mean', 'sum', 'std', 'var', 'min', 'max', 'median', 'quantile']:
+                                kwargs['skipna'] = True
+
+                            if temporal_dims:
+                                reduced_data = getattr(concatenated, reduction_func)(dim='time', **kwargs)
+                            else:
+                                reduced_data = concatenated
+
+                            if spatial_dims:
+                                available_spatial_dims = [d for d in spatial_dims if d in reduced_data.dims]
+                                if available_spatial_dims:
+                                    reduced_data = getattr(reduced_data, reduction_func)(dim=available_spatial_dims, **kwargs)
+
+                            all_dims_reduced = (
+                                temporal_dims and spatial_dims and
+                                set(temporal_dims + spatial_dims) >= set(reduced_data.dims)
+                            )
+                            if all_dims_reduced:
+                                reduced_data = self._try_convert_to_scalar(reduced_data)
+
+                            original_format = group[col].iloc[0]
+                            if isinstance(original_format, list):
+                                new_row[col] = [reduced_data]
+                            else:
+                                new_row[col] = reduced_data
+                        else:
+                            raise AttributeError(f"'{type(concatenated).__name__}' object has no attribute '{reduction_func}'")
+
+                    except Exception as e:
+                        new_row[col] = np.nan
+                else:
+                    new_row[col] = np.nan
+
+            for col in target_scalar_columns:
+                try:
+                    if reduction_func == 'mean':
+                        agg_value = group[col].mean(skipna=True)
+                    elif reduction_func == 'sum':
+                        agg_value = group[col].sum(skipna=True)
+                    elif reduction_func == 'std':
+                        agg_value = group[col].std(skipna=True)
+                    elif reduction_func == 'var':
+                        agg_value = group[col].var(skipna=True)
+                    elif reduction_func == 'min':
+                        agg_value = group[col].min(skipna=True)
+                    elif reduction_func == 'max':
+                        agg_value = group[col].max(skipna=True)
+                    elif reduction_func == 'median':
+                        agg_value = group[col].median(skipna=True)
+                    elif reduction_func == 'count':
+                        agg_value = group[col].count()
+                    elif reduction_func == 'quantile':
+                        q = kwargs.get('q', 0.5)
+                        agg_value = group[col].quantile(q, skipna=True)
+                    else:
+                        agg_value = group[col].mean(skipna=True)
+
+                    new_row[col] = agg_value
+
+                except Exception as e:
+                    new_row[col] = np.nan
+
+            for col in self._obj.columns:
+                if (col not in target_xarray_columns and
+                    col not in target_scalar_columns and
+                    col != 'geometry'):
+                    new_row[col] = group[col].iloc[0]
+
+            result_data.append(new_row)
+            result_geometries.append(geometry_key)
+            result_index.append(geometry_key)
+
+        return self._ensure_proper_geodataframe(result_data, result_geometries, result_index, geometry_level)
+
+    def _apply_temporal_aggregation(self, reduction_func: str, temporal_dims: List[str],
+                                    target_xarray_columns: List[str], target_scalar_columns: List[str], **kwargs):
+        geometry_level = self._get_geometry_level_name()
+        if geometry_level is None:
+            raise ValueError("Could not identify geometry level in MultiIndex")
 
-
+        if target_xarray_columns != self._xarray_columns:
+            target_xarray_columns = self._xarray_columns
 
-
+        grouped = self._obj.groupby(level=geometry_level)
+
+        result_data = []
+        result_geometries = []
+        result_index = []
+
+        for geometry_key, group in grouped:
+            new_row = {}
+
+            for col in target_xarray_columns:
+                xarray_objects = []
+
+                for _, row in group.iterrows():
+                    xr_data = self._extract_xarray_object(row[col])
+                    if xr_data is not None:
+                        xarray_objects.append(xr_data)
+
+                if xarray_objects:
+                    try:
+                        if isinstance(xarray_objects[0], xr.DataArray):
+                            time_coords = list(range(len(xarray_objects)))
+                            concatenated = xr.concat(xarray_objects, dim='time')
+                            concatenated = concatenated.assign_coords(time=time_coords)
+                        elif isinstance(xarray_objects[0], xr.Dataset):
+                            time_coords = list(range(len(xarray_objects)))
+                            concatenated = xr.concat(xarray_objects, dim='time')
+                            concatenated = concatenated.assign_coords(time=time_coords)
+                        else:
+                            raise TypeError(f"Unsupported xarray type: {type(xarray_objects[0])}")
+
+                        if hasattr(concatenated, reduction_func):
+                            if 'skipna' not in kwargs and reduction_func in ['mean', 'sum', 'std', 'var', 'min', 'max', 'median', 'quantile']:
+                                kwargs['skipna'] = True
+
+                            reduced_data = getattr(concatenated, reduction_func)(dim='time', **kwargs)
+
+                            original_format = group[col].iloc[0]
+                            if isinstance(original_format, list):
+                                new_row[col] = [reduced_data]
+                            else:
+                                new_row[col] = reduced_data
+                        else:
+                            raise AttributeError(f"'{type(concatenated).__name__}' object has no attribute '{reduction_func}'")
+
+                    except Exception as e:
+                        new_row[col] = np.nan
+                else:
+                    new_row[col] = np.nan
+
+            for col in target_scalar_columns:
+                new_row[col] = group[col].iloc[0]
+
+            for col in self._obj.columns:
+                if (col not in target_xarray_columns and
+                    col not in target_scalar_columns and
+                    col != 'geometry'):
+                    new_row[col] = group[col].iloc[0]
+
+            result_data.append(new_row)
+            result_geometries.append(geometry_key)
+            result_index.append(geometry_key)
+
+        return self._ensure_proper_geodataframe(result_data, result_geometries, result_index, geometry_level)
 
-    def _apply_spatial_reduction(self, reduction_func: str,
-
-        """
-        Apply reduction to spatial dimensions within each xarray object.
-
-        Args:
-            reduction_func: Name of the reduction method
-            dim: Spatial dimension(s) to reduce over
-            target_columns: Columns to operate on
-            **kwargs: Additional arguments
-
-        Returns:
-            GeoDataFrame with spatially reduced data
-        """
+    def _apply_spatial_reduction(self, reduction_func: str, spatial_dims: Optional[List[str]],
+                                 target_xarray_columns: List[str], **kwargs):
        result_gdf = self._obj.copy()
 
-        for col in
+        for col in target_xarray_columns:
            new_data = []
            for idx, row in self._obj.iterrows():
                original_value = row[col]
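
For scalar columns, `_apply_scalar_temporal_aggregation` above boils down to a pandas groupby over the geometry level of the `(geometry, time)` MultiIndex. A self-contained sketch — strings stand in for the shapely geometries the accessor actually groups on:

```python
import pandas as pd

index = pd.MultiIndex.from_product(
    [["geom_a", "geom_b"], [0, 1, 2]], names=["geometry", "time"]
)
df = pd.DataFrame({"ndvi": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]}, index=index)

# Collapse the time rows per geometry, as dim='time' does for scalar columns.
per_geometry = df.groupby(level="geometry").mean()
print(per_geometry)
#           ndvi
# geometry
# geom_a     0.2
# geom_b     0.5
```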
@@ -267,33 +644,21 @@ class GeoXarrayAccessor:
 
                if xr_data is not None:
                    try:
-                        # Apply the reduction function
                        if hasattr(xr_data, reduction_func):
-                            # Ensure skipna=True is set by default for most reduction functions
                            if 'skipna' not in kwargs and reduction_func in ['mean', 'sum', 'std', 'var', 'min', 'max', 'median', 'quantile']:
                                kwargs['skipna'] = True
-                            reduced_data = getattr(xr_data, reduction_func)(dim=dim, **kwargs)
 
-
-
-                            if
-
-
-
-
-
-
-
-                                if len(reduced_data.dims) == 0:
-                                    vars_list = list(reduced_data.data_vars.keys())
-                                    if len(vars_list) == 1:
-                                        var_name = vars_list[0]
-                                        scalar_value = float(reduced_data[var_name].values)
-                                        reduced_data = scalar_value
-                            except (ValueError, TypeError, KeyError):
-                                pass
+                            if spatial_dims:
+                                available_spatial_dims = [d for d in spatial_dims if d in xr_data.dims]
+                                if available_spatial_dims:
+                                    reduced_data = getattr(xr_data, reduction_func)(dim=available_spatial_dims, **kwargs)
+                                else:
+                                    reduced_data = xr_data
+                            else:
+                                reduced_data = getattr(xr_data, reduction_func)(dim=None, **kwargs)
+
+                            reduced_data = self._try_convert_to_scalar(reduced_data)
 
-                            # Keep the same format as original (list vs direct)
                            if isinstance(original_value, list):
                                new_data.append([reduced_data])
                            else:
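
The `_try_convert_to_scalar` call above leans on the fact that reducing over every dimension leaves a 0-d object whose single value `float()` can pull out:

```python
import xarray as xr

da = xr.DataArray([[1.0, 2.0], [3.0, 4.0]], dims=("y", "x"))
reduced = da.mean(dim=None, skipna=True)  # dim=None reduces over all dimensions
print(reduced.dims, reduced.size)         # () 1
print(float(reduced.values))              # 2.5
```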
@@ -301,177 +666,284 @@ class GeoXarrayAccessor:
                        else:
                            raise AttributeError(f"'{type(xr_data).__name__}' object has no attribute '{reduction_func}'")
                    except Exception as e:
-                        # If reduction fails, keep original data
-                        print(f"Warning: Could not apply {reduction_func} to row {idx}, column {col}: {e}")
                        new_data.append(original_value)
                else:
-                    # If it's not xarray data, keep as is
                    new_data.append(original_value)
 
            result_gdf[col] = new_data
 
        return result_gdf
+
+    def _apply_cloud_reduction(self, reduction_func: str, dim: Optional[Union[str, List[str]]] = None,
+                               columns: Optional[List[str]] = None, **kwargs):
+        current_time = time.time()
+        chain_reset_threshold = 0.01
+
+        if (self._last_operation_time is None or
+            current_time - self._last_operation_time > chain_reset_threshold):
+
+            if not self._pending_operations:
+                self._operation_sequence_id = int(current_time * 1000)
+                self._operation_count = 0
+
+        self._last_operation_time = current_time
+        self._operation_count += 1
+
+        params = {"dim": dim, "columns": columns, **kwargs}
+        description = f"Apply {reduction_func} over dimension(s): {dim}" if dim else f"Apply {reduction_func} over all dimensions"
+
+        operation = {
+            "type": reduction_func,
+            "description": description,
+            "params": params,
+            "timestamp": pd.Timestamp.now(),
+            "sequence_id": self._operation_sequence_id
+        }
+
+        self._pending_operations.append(operation)
+
+        chain_complete = self._schedule_chain_completion_check()
+
+        result = self._obj.copy()
+        result.attrs = self._obj.attrs.copy()
+
+        if hasattr(self._obj, 'client'):
+            object.__setattr__(result, 'client', self._obj.client)
+        if hasattr(self._obj, 'job_id'):
+            object.__setattr__(result, 'job_id', self._obj.job_id)
+        if hasattr(self._obj, 'job_name'):
+            object.__setattr__(result, 'job_name', self._obj.job_name)
+
+        if not result.attrs:
+            result.attrs = {}
+        if chain_complete:
+            job_result = self._trigger_processing_immediately()
+            # result.attrs['job_id'] = job_result
+            return job_result
+
+        result.attrs['_geo_chain_state'] = {
+            'pending_operations': self._pending_operations,
+            'operation_sequence_id': self._operation_sequence_id,
+            'last_operation_time': self._last_operation_time,
+            'operation_count': self._operation_count,
+            'processing_in_progress': getattr(self, '_processing_in_progress', False)
+        }
+
+        return result
+
+    def _apply_local_reduction(self, reduction_func: str, dim: Optional[Union[str, List[str]]] = None,
+                               columns: Optional[List[str]] = None, **kwargs):
+        target_xarray_columns, target_scalar_columns = self._get_target_columns(columns)
+
+        if dim is None:
+            if target_xarray_columns:
+                return self._apply_spatial_reduction(reduction_func, dim, target_xarray_columns, **kwargs)
+            else:
+                return self._obj.copy()
+
+        dims_to_reduce = [dim] if isinstance(dim, str) else dim
+
+        temporal_dims = [d for d in dims_to_reduce if d == 'time']
+        spatial_dims = [d for d in dims_to_reduce if d != 'time']
+
+        has_temporal_agg = (
+            temporal_dims and
+            hasattr(self._obj.index, 'names') and
+            self._obj.index.names and
+            'time' in self._obj.index.names
+        )
+
+        if has_temporal_agg and target_scalar_columns and not target_xarray_columns:
+            return self._apply_scalar_temporal_aggregation(reduction_func, target_scalar_columns, **kwargs)
+
+        if has_temporal_agg and target_scalar_columns and target_xarray_columns:
+            return self._apply_mixed_scalar_xarray_aggregation(reduction_func, temporal_dims, spatial_dims,
+                                                               target_xarray_columns, target_scalar_columns, **kwargs)
+
+        if not target_xarray_columns and target_scalar_columns:
+            if spatial_dims:
+                pass
+            return self._obj.copy()
+
+        if has_temporal_agg and spatial_dims:
+            return self._apply_mixed_aggregation(reduction_func, temporal_dims, spatial_dims,
+                                                 target_xarray_columns, target_scalar_columns, **kwargs)
+        elif has_temporal_agg:
+            return self._apply_temporal_aggregation(reduction_func, temporal_dims,
+                                                    target_xarray_columns, target_scalar_columns, **kwargs)
+        else:
+            return self._apply_spatial_reduction(reduction_func, spatial_dims,
+                                                 target_xarray_columns, **kwargs)
+
+    def _apply_reduction(self, reduction_func: str, dim: Optional[Union[str, List[str]]] = None,
+                         columns: Optional[List[str]] = None, **kwargs):
+        if isinstance(self._obj, cloud_object):
+            return self._apply_cloud_reduction(reduction_func = reduction_func, dim = dim, columns = columns, **kwargs)
+        else:
+            return self._apply_local_reduction(reduction_func = reduction_func, dim = dim, columns = columns, **kwargs)
+
+    def _sync_generate_and_start_processing(self):
+        if not self._pending_operations or getattr(self, '_processing_in_progress', False):
+            return None
+
+        self._processing_in_progress = True
+
+        try:
+            sequence_id = self._operation_sequence_id
+            script_content = self._generate_post_processing_script()
+            client = self._client
+            if client:
+                mass_stats = MassStats(client)
+
+                import asyncio
+                import concurrent.futures
+
+                def run_async():
+                    loop = asyncio.new_event_loop()
+                    asyncio.set_event_loop(loop)
+                    # we don't actually have the dataset name, currently it is just getting job named zonal stats job
+                    try:
+                        return loop.run_until_complete(
+                            mass_stats.zonal_stats_transform(
+                                data_name=self._obj.job_name,
+                                output="netcdf",
+                                consumer = script_content.encode('utf-8'),
+                                overwrite=True,
+                            )
+                        )
+                    finally:
+                        loop.close()
+
+                with concurrent.futures.ThreadPoolExecutor() as executor:
+                    future = executor.submit(run_async)
+                    result = future.result(timeout=30)
+                    return result
+
+            return None
+
+        except Exception as e:
+            return None
+        finally:
+            self._processing_in_progress = False
+            self._pending_operations.clear()
+
+    def _generate_post_processing_script(self) -> str:
+        script_lines = [
+            "import pandas as pd",
+            "import xarray as xr",
+            "import numpy as np",
+            "from io import BytesIO",
+            "import tempfile",
+            "import os",
+            "",
+            "def consume(filename, file_bytes, metadata):",
+        ]
+
+        script_lines.extend([
+            "    ",
+            "    try:",
+            "        with tempfile.NamedTemporaryFile(suffix='.nc', delete=False) as tmp_file:",
+            "            tmp_file.write(file_bytes)",
+            "            tmp_file.flush()",
+            "            ds = xr.open_dataset(tmp_file.name, engine='scipy')",
+            "        ",
+        ])
+
+        for i, op in enumerate(self._pending_operations):
+            op_type = op['type']
+            params = op['params']
+            dim = params.get('dim')
+
+            if dim:
+                dim_str = repr(dim)
+                script_lines.append(f"        ds = ds.{op_type}(dim={dim_str}, skipna=True)")
+            else:
+                script_lines.append(f"        ds = ds.{op_type}(skipna=True)")
+            script_lines.append("")
+
+        script_lines.extend([
+            "        # Determine output format based on data structure",
+            "        base_filename = os.path.splitext(filename)[0]",
+            "        ",
+            "        # Check if all data variables are scalar (0-dimensional)",
+            "        all_scalar = True",
+            "        for var_name in ds.data_vars:",
+            "            if ds[var_name].dims:",
+            "                all_scalar = False",
+            "                break",
+            "        ",
+            "        if all_scalar:",
+            "            # Output as CSV - all variables are scalar",
+            "            result_data = {}",
+            "            for var_name in ds.data_vars:",
+            "                result_data[var_name] = float(ds[var_name].values)",
+            "            ",
+            "            result_df = pd.DataFrame([result_data])",
+            '            output_filename = f"{base_filename}_processed.csv"',
+            "            csv_data = result_df.to_csv(index=False).encode()",
+            "            ",
+            "            ds.close()",
+            "            os.unlink(tmp_file.name)",
+            "            return output_filename, csv_data",
+            "        else:",
+            "            # Output as NetCDF - still has dimensions",
+            '            output_filename = f"{base_filename}_processed.nc"',
+            "            # Use temp file instead of BytesIO to avoid buffer closing issues",
+            "            with tempfile.NamedTemporaryFile(suffix='.nc', delete=False) as nc_tmp_file:",
+            "                ds.to_netcdf(nc_tmp_file.name, format='NETCDF3_64BIT')",
+            "            ",
+            "            # Read the temp file back as bytes",
+            "            with open(nc_tmp_file.name, 'rb') as f:",
+            "                netcdf_data = f.read()",
+            "            ",
+            "            # Clean up temp files",
+            "            os.unlink(nc_tmp_file.name)",
+            "            ",
+            "            ds.close()",
+            "            os.unlink(tmp_file.name)",
+            "            return output_filename, netcdf_data",
+        ])
+
+        script_lines.extend([
+            "    ",
+            "    except Exception as e:",
+            "        try:",
+            "            os.unlink(tmp_file.name)",
+            "        except:",
+            "            pass",
+            "        try:",
+            "            os.unlink(nc_tmp_file.name)",
+            "        except:",
+            "            pass",
+            "        return None, None",
+        ])
+
+        return "\n".join(script_lines)
+
+    @property
+    def job_id(self):
+        return self._obj.attrs.get('job_id')
 
    def mean(self, dim: Optional[Union[str, List[str]]] = None, columns: Optional[List[str]] = None, **kwargs):
-        """
-        Calculate mean of xarray datasets/dataarrays.
-
-        Args:
-            dim: Dimension(s) to reduce over. If 'time', aggregates across time rows for each geometry.
-                 If spatial dims like 'x', 'y', reduces within each xarray object.
-            columns: List of column names to operate on. If None, operates on all xarray columns
-            **kwargs: Additional arguments for the reduction function
-        """
        return self._apply_reduction('mean', dim=dim, columns=columns, **kwargs)
 
    def sum(self, dim: Optional[Union[str, List[str]]] = None, columns: Optional[List[str]] = None, **kwargs):
-        """
-        Calculate sum of xarray datasets/dataarrays.
-
-        Args:
-            dim: Dimension(s) to reduce over. If 'time', aggregates across time rows for each geometry.
-            columns: List of column names to operate on. If None, operates on all xarray columns
-            **kwargs: Additional arguments for the reduction function
-        """
        return self._apply_reduction('sum', dim=dim, columns=columns, **kwargs)
 
+    def max(self, dim: Optional[Union[str, List[str]]] = None, columns: Optional[List[str]] = None, **kwargs):
+        return self._apply_reduction('max', dim=dim, columns=columns, **kwargs)
+
+    def min(self, dim: Optional[Union[str, List[str]]] = None, columns: Optional[List[str]] = None, **kwargs):
+        return self._apply_reduction('min', dim=dim, columns=columns, **kwargs)
+
    def std(self, dim: Optional[Union[str, List[str]]] = None, columns: Optional[List[str]] = None, **kwargs):
-        """
-        Calculate standard deviation of xarray datasets/dataarrays.
-
-        Args:
-            dim: Dimension(s) to reduce over. If 'time', aggregates across time rows for each geometry.
-            columns: List of column names to operate on. If None, operates on all xarray columns
-            **kwargs: Additional arguments for the reduction function
-        """
        return self._apply_reduction('std', dim=dim, columns=columns, **kwargs)
 
    def var(self, dim: Optional[Union[str, List[str]]] = None, columns: Optional[List[str]] = None, **kwargs):
-        """
-        Calculate variance of xarray datasets/dataarrays.
-
-        Args:
-            dim: Dimension(s) to reduce over. If 'time', aggregates across time rows for each geometry.
-            columns: List of column names to operate on. If None, operates on all xarray columns
-            **kwargs: Additional arguments for the reduction function
-        """
        return self._apply_reduction('var', dim=dim, columns=columns, **kwargs)
 
-    def min(self, dim: Optional[Union[str, List[str]]] = None, columns: Optional[List[str]] = None, **kwargs):
-        """
-        Calculate minimum of xarray datasets/dataarrays.
-
-        Args:
-            dim: Dimension(s) to reduce over. If 'time', aggregates across time rows for each geometry.
-            columns: List of column names to operate on. If None, operates on all xarray columns
-            **kwargs: Additional arguments for the reduction function
-        """
-        return self._apply_reduction('min', dim=dim, columns=columns, **kwargs)
-
-    def max(self, dim: Optional[Union[str, List[str]]] = None, columns: Optional[List[str]] = None, **kwargs):
-        """
-        Calculate maximum of xarray datasets/dataarrays.
-
-        Args:
-            dim: Dimension(s) to reduce over. If 'time', aggregates across time rows for each geometry.
-            columns: List of column names to operate on. If None, operates on all xarray columns
-            **kwargs: Additional arguments for the reduction function
-        """
-        return self._apply_reduction('max', dim=dim, columns=columns, **kwargs)
-
    def median(self, dim: Optional[Union[str, List[str]]] = None, columns: Optional[List[str]] = None, **kwargs):
-        """
-        Calculate median of xarray datasets/dataarrays.
-
-        Args:
-            dim: Dimension(s) to reduce over. If 'time', aggregates across time rows for each geometry.
-            columns: List of column names to operate on. If None, operates on all xarray columns
-            **kwargs: Additional arguments for the reduction function
-        """
        return self._apply_reduction('median', dim=dim, columns=columns, **kwargs)
 
-    def quantile(self, q: float, dim: Optional[Union[str, List[str]]] = None,
-                 columns: Optional[List[str]] = None, **kwargs):
-        """
-        Calculate quantile of xarray datasets/dataarrays.
-
-        Args:
-            q: Quantile to compute (between 0 and 1)
-            dim: Dimension(s) to reduce over. If 'time', aggregates across time rows for each geometry.
-            columns: List of column names to operate on. If None, operates on all xarray columns
-            **kwargs: Additional arguments for the reduction function
-        """
-        return self._apply_reduction('quantile', dim=dim, columns=columns, q=q, **kwargs)
-
    def count(self, dim: Optional[Union[str, List[str]]] = None, columns: Optional[List[str]] = None, **kwargs):
-        """
-        Count non-NaN values in xarray datasets/dataarrays.
-
-        Args:
-            dim: Dimension(s) to reduce over. If 'time', aggregates across time rows for each geometry.
-            columns: List of column names to operate on. If None, operates on all xarray columns
-            **kwargs: Additional arguments for the reduction function
-        """
-        return self._apply_reduction('count', dim=dim, columns=columns, **kwargs)
-
-    def to_values(self, columns: Optional[List[str]] = None):
-        """
-        Extract scalar values from xarray dataarrays and add them as new columns.
-        Useful when dataarrays have been reduced to single values.
-
-        Args:
-            columns: List of column names to operate on. If None, operates on all xarray columns
-
-        Returns:
-            GeoDataFrame with extracted values as new columns
-        """
-        result_gdf = self._obj.copy()
-        target_columns = self._get_target_columns(columns)
-
-        for col in target_columns:
-            values_to_add = []
-            for idx, row in self._obj.iterrows():
-                xr_data = self._extract_xarray_object(row[col])
-                if isinstance(xr_data, xr.DataArray):
-                    try:
-                        if xr_data.size == 1:
-                            scalar_value = float(xr_data.values)
-                            values_to_add.append(scalar_value)
-                        else:
-                            values_to_add.append(np.nan)  # Can't convert non-scalar to value
-                    except (ValueError, TypeError):
-                        values_to_add.append(np.nan)
-                else:
-                    values_to_add.append(np.nan)
-
-            # Add new column with scalar values
-            new_col_name = f"{col}_value"
-            result_gdf[new_col_name] = values_to_add
-
-        return result_gdf
-
-    def info(self):
-        """Print information about xarray datasets/dataarrays in the GeoDataFrame."""
-        print(f"GeoDataFrame shape: {self._obj.shape}")
-        print(f"Columns: {list(self._obj.columns)}")
-        print(f"Xarray columns: {self._xarray_columns}")
-        print(f"Index structure: {self._obj.index.names if hasattr(self._obj.index, 'names') else 'Simple index'}")
-        print(f"Geometry column name: {self._obj.geometry.name if hasattr(self._obj.geometry, 'name') else 'No geometry name'}")
-
-        if hasattr(self._obj.index, 'names') and 'time' in self._obj.index.names:
-            print("Note: Time dimension appears to be expanded into the index.")
-            print("Use dim='time' to aggregate across time rows for each geometry.")
-
-        for col in self._xarray_columns:
-            print(f"\n--- Column: {col} ---")
-            sample_data = self._extract_xarray_object(self._obj[col].iloc[0]) if len(self._obj) > 0 else None
-            if isinstance(sample_data, xr.Dataset):
-                print(f"Type: xarray.Dataset")
-                print(f"Variables: {list(sample_data.data_vars.keys())}")
-                print(f"Dimensions: {list(sample_data.dims.keys())}")
-                print(f"Coordinates: {list(sample_data.coords.keys())}")
-            elif isinstance(sample_data, xr.DataArray):
-                print(f"Type: xarray.DataArray")
-                print(f"Dimensions: {list(sample_data.dims)}")
-                print(f"Shape: {sample_data.shape}")
-                print(f"Data type: {sample_data.dtype}")
+        return self._apply_reduction('count', dim=dim, columns=columns, **kwargs)