terrakio-core 0.4.7__py3-none-any.whl → 0.4.93__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of terrakio-core might be problematic.

@@ -1,151 +1,370 @@
-import pandas as pd
+# Standard library imports
+import inspect
+import time
+import weakref
+from typing import List, Optional, Union
+
+# Third-party imports
 import geopandas as gpd
-import xarray as xr
 import numpy as np
-from typing import Optional, Union, List
+import pandas as pd
+import xarray as xr
+
+# Local/relative imports
+from .convenience_functions.zonal_stats import cloud_object
+from .endpoints.mass_stats import MassStats
 
 
 @pd.api.extensions.register_dataframe_accessor("geo")
 class GeoXarrayAccessor:
-    """
-    Custom accessor for GeoDataFrames containing xarray datasets or dataarrays.
-    Handles both direct xarray objects and lists containing xarray objects.
-    Can aggregate across time when time dimension has been expanded into the index.
-    """
 
     def __init__(self, pandas_obj):
         self._obj = pandas_obj
+
+        # Only initialize client for cloud_object instances
+        if isinstance(pandas_obj, cloud_object):
+            self._client = pandas_obj.client
+        else:
+            self._client = None
+
+        chain_state = self._obj.attrs.get('_geo_chain_state', None)
+
+        if chain_state:
+            self._pending_operations = chain_state.get('pending_operations', [])
+            self._operation_sequence_id = chain_state.get('operation_sequence_id', None)
+            self._last_operation_time = chain_state.get('last_operation_time', None)
+            self._operation_count = chain_state.get('operation_count', 0)
+            self._processing_in_progress = chain_state.get('processing_in_progress', False)
+        else:
+            self._pending_operations = []
+            self._operation_sequence_id = None
+            self._operation_count = 0
+            self._last_operation_time = None
+            self._processing_in_progress = False
+
+        self._chain_refs = weakref.WeakSet()
         self._validate()
-
+
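The rewritten __init__ threads its chain bookkeeping (pending operations, sequence id, timing flags) through DataFrame.attrs, which pandas preserves across copy(); that is what lets the state survive from one .geo access to the next, since each access constructs a fresh accessor instance. A minimal, self-contained sketch of the pattern (ChainAccessor and '_chain_state' are illustrative names, not part of terrakio-core):

import pandas as pd

@pd.api.extensions.register_dataframe_accessor("chain")
class ChainAccessor:
    def __init__(self, obj):
        self._obj = obj
        # attrs survives DataFrame.copy(), so the accessor instance built
        # on the returned copy sees the operations recorded so far
        self._ops = list(obj.attrs.get('_chain_state', []))

    def record(self, name):
        out = self._obj.copy()
        out.attrs['_chain_state'] = self._ops + [name]
        return out

df = pd.DataFrame({'a': [1, 2]})
chained = df.chain.record('mean').chain.record('max')
print(chained.attrs['_chain_state'])  # ['mean', 'max']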
 
     def _validate(self):
-        """Validate that the DataFrame has the expected structure."""
-        if not isinstance(self._obj, gpd.GeoDataFrame):
-            raise AttributeError("Can only use .geo accessor with GeoDataFrames")
+        if isinstance(self._obj, gpd.GeoDataFrame):
+            pass
+        elif isinstance(self._obj, cloud_object):
+            pass
+        elif isinstance(self._obj, pd.DataFrame) and hasattr(self._obj, '_has_index_geometry'):
+            pass
+        elif isinstance(self._obj, pd.DataFrame) and hasattr(self._obj.index, 'names'):
+            geometry_level = self._get_geometry_level_name()
+            if geometry_level is None:
+                raise AttributeError("Can only use .geo accessor with GeoDataFrames or DataFrames with geometry in index")
+        else:
+            raise AttributeError("Can only use .geo accessor with GeoDataFrames or DataFrames with geometry in index")
 
-        # Check for columns with xarray data (including lists containing xarray objects)
         self._xarray_columns = []
+        self._scalar_columns = []
+
        for col in self._obj.columns:
            if col != 'geometry':
                sample_value = self._obj[col].iloc[0] if len(self._obj) > 0 else None
 
-                # Check if it's directly an xarray object
                if isinstance(sample_value, (xr.Dataset, xr.DataArray)):
                    self._xarray_columns.append(col)
-                # Check if it's a list containing xarray objects
                elif isinstance(sample_value, list) and len(sample_value) > 0:
                    if isinstance(sample_value[0], (xr.Dataset, xr.DataArray)):
                        self._xarray_columns.append(col)
+                elif isinstance(sample_value, (int, float, np.integer, np.floating)):
+                    self._scalar_columns.append(col)
+                elif pd.isna(sample_value):
+                    self._scalar_columns.append(col)
 
-        if not self._xarray_columns:
-            raise AttributeError("No xarray Dataset or DataArray columns found")
-
-    def _extract_xarray_object(self, value):
-        """Extract xarray object from various formats (direct object, list, etc.)."""
-        if isinstance(value, (xr.Dataset, xr.DataArray)):
-            return value
-        elif isinstance(value, list) and len(value) > 0:
-            if isinstance(value[0], (xr.Dataset, xr.DataArray)):
-                return value[0]  # Take the first item from the list
-        return None
-
-    def _get_target_columns(self, columns: Optional[List[str]] = None) -> List[str]:
-        """
-        Get the list of columns to operate on.
-
-        Args:
-            columns: List of column names to operate on. If None, uses all xarray columns.
-
-        Returns:
-            List of column names to operate on
-        """
-        if columns is None:
-            return self._xarray_columns
-
-        # Validate that specified columns exist and contain xarray data
-        invalid_columns = [col for col in columns if col not in self._xarray_columns]
-        if invalid_columns:
-            raise ValueError(f"Columns {invalid_columns} are not valid xarray columns. "
-                             f"Available xarray columns: {self._xarray_columns}")
-
-        return columns
+        if not self._xarray_columns and not self._scalar_columns:
+            raise AttributeError("No xarray Dataset, DataArray, or aggregated scalar columns found")
 
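_validate now admits four shapes of input (GeoDataFrames, cloud_object frames, frames already carrying geometry in the index, and plain DataFrames with a recognisable geometry index level), and it tracks plain numeric or NaN columns as scalar columns next to the xarray ones, so a frame can mix per-geometry rasters with already-aggregated values. A hedged sketch of a frame that exercises both column kinds (assumes terrakio-core is installed so the .geo accessor is registered):

import geopandas as gpd
import numpy as np
import pandas as pd
import xarray as xr
from shapely.geometry import Point

da = xr.DataArray(np.random.rand(3, 3), dims=('y', 'x'))
gdf = gpd.GeoDataFrame({
    'ndvi': [da, da * 2],               # classified into _xarray_columns
    'area_ha': [12.5, 40.0],            # classified into _scalar_columns
    'geometry': [Point(0, 0), Point(1, 1)],
})

reduced = gdf.geo.mean(dim=['x', 'y'])  # reduces each row's raster in place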
     def _should_aggregate_by_geometry(self, dim: Optional[Union[str, List[str]]] = None) -> bool:
-        """
-        Determine if we should aggregate by geometry (i.e., time dimension was expanded to index).
-
-        Args:
-            dim: Dimension(s) being reduced over
-
-        Returns:
-            True if we should group by geometry and aggregate across time rows
-        """
         if dim is None:
             return False
 
         dims_to_reduce = [dim] if isinstance(dim, str) else dim
 
-        # Check if 'time' is in the dimensions to reduce and if we have a MultiIndex with time
         if 'time' in dims_to_reduce:
             if hasattr(self._obj.index, 'names') and self._obj.index.names:
-                # Check if time is one of the index levels
                 return 'time' in self._obj.index.names
 
         return False
 
     def _get_geometry_level_name(self) -> Optional[str]:
-        """Get the name of the geometry level in the MultiIndex."""
         if hasattr(self._obj.index, 'names') and self._obj.index.names:
-            # Look for the level that's not 'time' - this should be the geometry level
-            for name in self._obj.index.names:
+            non_time_levels = [name for name in self._obj.index.names if name != 'time']
+            if len(non_time_levels) == 1:
+                return non_time_levels[0]
+
+            for i, name in enumerate(self._obj.index.names):
                 if name != 'time':
-                    return name
+                    try:
+                        sample_value = self._obj.index.get_level_values(i)[0]
+                        if hasattr(sample_value, 'geom_type') or hasattr(sample_value, 'bounds'):
+                            return name
+                    except (IndexError, AttributeError):
+                        continue
+
+            if non_time_levels:
+                return non_time_levels[0]
+
         return None
 
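The rewritten _get_geometry_level_name prefers the single non-time level when there is exactly one, then falls back to duck-typing index values via geom_type/bounds. A small sketch of the index structure involved (the level names here are assumptions, matching what the accessor expects):

import pandas as pd
from shapely.geometry import Point

# A MultiIndex with a geometry level and a time level, as produced when
# the time dimension is expanded into rows.
idx = pd.MultiIndex.from_product(
    [[Point(0, 0), Point(1, 1)], pd.date_range('2024-01-01', periods=2)],
    names=['geometry', 'time'],
)
df = pd.DataFrame({'value': [1.0, 2.0, 3.0, 4.0]}, index=idx)

# Shapely objects expose geom_type and bounds, which is what the
# duck-typed fallback looks for on a sample index value.
sample = df.index.get_level_values(0)[0]
print(sample.geom_type, sample.bounds)  # Point (0.0, 0.0, 0.0, 0.0)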
-    def _apply_reduction(self, reduction_func: str, dim: Optional[Union[str, List[str]]] = None,
-                         columns: Optional[List[str]] = None, **kwargs):
-        """
-        Apply a reduction function to specified xarray datasets/dataarrays in the GeoDataFrame.
-
-        Args:
-            reduction_func: Name of the xarray reduction method (e.g., 'mean', 'sum', 'std')
-            dim: Dimension(s) to reduce over. If None, reduces over all dimensions
-            columns: List of column names to operate on. If None, operates on all xarray columns
-            **kwargs: Additional arguments to pass to the reduction function
-
-        Returns:
-            GeoDataFrame with reduced xarray data
-        """
-        target_columns = self._get_target_columns(columns)
-
-        # Check if we need to aggregate by geometry (time dimension expanded to index)
-        if self._should_aggregate_by_geometry(dim):
-            return self._apply_temporal_aggregation(reduction_func, dim, target_columns, **kwargs)
+    def _try_convert_to_scalar(self, data):
+        if isinstance(data, xr.DataArray) and data.size == 1:
+            try:
+                return float(data.values)
+            except (ValueError, TypeError):
+                pass
+        elif isinstance(data, xr.Dataset) and len(data.dims) == 0:
+            try:
+                vars_list = list(data.data_vars.keys())
+                if len(vars_list) == 1:
+                    var_name = vars_list[0]
+                    return float(data[var_name].values)
+            except (ValueError, TypeError, KeyError):
+                pass
+        return data
+
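The new _try_convert_to_scalar collapses size-1 results to plain floats so fully reduced cells become ordinary numeric values. A standalone illustration of the two cases it handles:

import numpy as np
import xarray as xr

da = xr.DataArray(np.array([[1.0, 2.0], [3.0, 4.0]]), dims=('y', 'x'))

scalar_da = da.mean()                  # 0-d DataArray, size == 1
print(float(scalar_da.values))         # 2.5

ds = xr.Dataset({'ndvi': da}).mean()   # 0-d Dataset with one variable
(var_name,) = ds.data_vars             # exactly one data variable
print(float(ds[var_name].values))      # 2.5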
+    def _ensure_proper_geodataframe(self, result_data, result_geometries, result_index, geometry_level):
+        result_df = pd.DataFrame(result_data)
+        result_df['geometry'] = result_geometries
+
+        try:
+            crs = self._obj.crs
+        except AttributeError:
+            crs = None
+
+        result_gdf = gpd.GeoDataFrame(result_df, geometry='geometry', crs=crs)
+
+        if geometry_level:
+            result_gdf = result_gdf.set_index(['geometry'])
+            result_gdf.index.name = geometry_level
         else:
-            return self._apply_spatial_reduction(reduction_func, dim, target_columns, **kwargs)
+            result_gdf = result_gdf.set_index(['geometry'])
+
+        result_gdf._original_crs = crs
+        result_gdf._index_geometry_level = geometry_level
+        result_gdf._has_index_geometry = True
+
+        return result_gdf
+
+    def to_index_geometry(self):
+        if not hasattr(self._obj, '_has_index_geometry') or not self._obj._has_index_geometry:
+            return self._obj
+
+        data_columns = [col for col in self._obj.columns if col != 'geometry']
+        result_df = self._obj[data_columns].copy()
+
+        result_df._original_crs = getattr(self._obj, 'crs', None)
+        result_df._index_geometry_level = getattr(self._obj, '_index_geometry_level', None)
+
+        return result_df
+
+    def to_column_geometry(self):
+        if 'geometry' in self._obj.columns:
+            return self._obj
+
+        if hasattr(self._obj, '_index_geometry_level'):
+            geometry_level = self._obj._index_geometry_level
+            geometry_series = self._obj.index.to_series()
+
+            result_gdf = gpd.GeoDataFrame(
+                self._obj.copy(),
+                geometry=geometry_series,
+                crs=getattr(self._obj, '_original_crs', None)
+            )
+            result_gdf._has_index_geometry = True
+            result_gdf._index_geometry_level = geometry_level
+
+            return result_gdf
+
+        return self._obj
+
+    def _get_geometry_level_name(self):
+        if hasattr(self._obj.index, 'names'):
+            for name in self._obj.index.names:
+                if name and 'geometry' in str(name).lower():
+                    return name
+        return None
+
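Note that this hunk introduces a second definition of _get_geometry_level_name (new line 184) alongside the typed one added earlier in the class (new line 93). Python class bodies bind names in order, so the later, name-based version silently replaces the earlier duck-typed one; only the lookup for levels whose name contains 'geometry' remains in effect. A minimal illustration of that semantics:

class Demo:
    def probe(self):
        return 'first definition'

    def probe(self):  # rebinding the same name replaces the first method
        return 'second definition'

print(Demo().probe())  # prints: second definition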
+    def _inspect_call_stack_for_chain_end(self) -> bool:
+        try:
+            stack = inspect.stack()
+
+            for i, frame_info in enumerate(stack[1:8]):
+                if frame_info.code_context:
+                    line = ''.join(frame_info.code_context).strip()
+
+                    if any(internal in frame_info.filename for internal in
+                           ['pandas', 'numpy', 'site-packages', '<frozen']):
+                        continue
+
+                    if '.geo.' in line:
+                        geo_count = line.count('.geo.')
+                        pending_count = len(self._pending_operations)
+
+                        if pending_count >= geo_count:
+                            return True
+                        else:
+                            return False
+
+            return False
+
+        except Exception:
+            return False
+
+    def _schedule_chain_completion_check(self):
+        return self._inspect_call_stack_for_chain_end()
+
+    def _trigger_processing_immediately(self):
+        import concurrent.futures
+
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            future = executor.submit(self._sync_generate_and_start_processing)
+            try:
+                job_result = future.result(timeout=35)
+                return job_result
+            except concurrent.futures.TimeoutError:
+                return None
+            except Exception as e:
+                return None
+
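Chain-end detection leans on inspect.stack() exposing the caller's source line, then compares the number of '.geo.' occurrences on that line with the pending-operation count. This only works when source context is available (it is for scripts and most notebooks, but code_context can be None in a bare REPL). A standalone sketch of the mechanism:

import inspect

def count_geo_calls_in_caller():
    # frame 0 is this function; frame 1 is the caller
    frame_info = inspect.stack()[1]
    if not frame_info.code_context:
        return 0  # e.g. interactive sessions may lack source context
    line = ''.join(frame_info.code_context).strip()
    return line.count('.geo.')

# Called from a line such as:
#   result = gdf.geo.mean(dim='time').geo.max(dim=['x', 'y'])
# count_geo_calls_in_caller() would report 2.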
+    def _extract_xarray_object(self, value):
+        if isinstance(value, (xr.Dataset, xr.DataArray)):
+            return value
+        elif isinstance(value, list) and len(value) > 0:
+            if isinstance(value[0], (xr.Dataset, xr.DataArray)):
+                return value[0]
+
+        try:
+            if pd.isna(value):
+                return None
+        except (TypeError, ValueError):
+            pass
+
+        return None
+
+    def _get_target_columns(self, columns: Optional[List[str]] = None) -> tuple:
+        if columns is None:
+            return self._xarray_columns, self._scalar_columns
+
+        all_valid_columns = self._xarray_columns + self._scalar_columns
+        invalid_columns = [col for col in columns if col not in all_valid_columns]
+        if invalid_columns:
+            raise ValueError(f"Columns {invalid_columns} are not valid xarray or scalar columns. "
+                             f"Available columns: {all_valid_columns}")
+
+        target_xarray = [col for col in columns if col in self._xarray_columns]
+        target_scalar = [col for col in columns if col in self._scalar_columns]
+
+        return target_xarray, target_scalar
+
+    def _apply_spatial_reduction(self, reduction_func: str, spatial_dims: Optional[List[str]],
+                                 target_xarray_columns: List[str], **kwargs):
+        result_gdf = self._obj.copy()
+
+        for col in target_xarray_columns:
+            new_data = []
+            for idx, row in self._obj.iterrows():
+                original_value = row[col]
+                xr_data = self._extract_xarray_object(original_value)
+
+                if xr_data is not None:
+                    try:
+                        if hasattr(xr_data, reduction_func):
+                            if 'skipna' not in kwargs and reduction_func in ['mean', 'sum', 'std', 'var', 'min', 'max', 'median', 'quantile']:
+                                kwargs['skipna'] = True
+
+                            if spatial_dims:
+                                available_spatial_dims = [d for d in spatial_dims if d in xr_data.dims]
+                                if available_spatial_dims:
+                                    reduced_data = getattr(xr_data, reduction_func)(dim=available_spatial_dims, **kwargs)
+                                else:
+                                    reduced_data = xr_data
+                            else:
+                                reduced_data = getattr(xr_data, reduction_func)(dim=None, **kwargs)
+
+                            reduced_data = self._try_convert_to_scalar(reduced_data)
+
+                            if isinstance(original_value, list):
+                                new_data.append([reduced_data])
+                            else:
+                                new_data.append(reduced_data)
+                        else:
+                            raise AttributeError(f"'{type(xr_data).__name__}' object has no attribute '{reduction_func}'")
+                    except Exception as e:
+                        new_data.append(original_value)
+                else:
+                    new_data.append(original_value)
+
+            result_gdf[col] = new_data
+
+        return result_gdf
 
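Spatial reduction now filters the requested dims against each object's actual dims before calling the xarray method, so rows whose rasters lack a dimension pass through unchanged instead of raising. A hedged, standalone sketch of that core step:

import numpy as np
import xarray as xr

da = xr.DataArray(np.arange(12.0).reshape(3, 4), dims=('y', 'x'))

requested = ['x', 'y', 'band']             # 'band' is not present here
available = [d for d in requested if d in da.dims]

# Reduce only over dimensions the object actually has; skipna=True
# mirrors the default the accessor injects for mean/sum/std/...
reduced = da.mean(dim=available, skipna=True)
print(float(reduced))  # 5.5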
-    def _apply_temporal_aggregation(self, reduction_func: str, dim: Union[str, List[str]],
-                                    target_columns: List[str], **kwargs):
-        """
-        Apply aggregation across time by grouping by geometry.
-
-        Args:
-            reduction_func: Name of the reduction method
-            dim: Dimension(s) being reduced (should include 'time')
-            target_columns: Columns to operate on
-            **kwargs: Additional arguments
-
-        Returns:
-            GeoDataFrame with time-aggregated data
-        """
+    def _apply_scalar_temporal_aggregation(self, reduction_func: str, target_scalar_columns: List[str], **kwargs):
         geometry_level = self._get_geometry_level_name()
         if geometry_level is None:
             raise ValueError("Could not identify geometry level in MultiIndex")
 
-        # Check if specific columns were requested for time aggregation
-        if target_columns != self._xarray_columns:
-            print("Warning: Cannot aggregate time on a single column. Aggregating all xarray columns instead.")
-            target_columns = self._xarray_columns
+        grouped = self._obj.groupby(level=geometry_level)
+
+        result_data = []
+        result_geometries = []
+        result_index = []
+
+        for geometry_key, group in grouped:
+            new_row = {}
+
+            for col in target_scalar_columns:
+                try:
+                    if reduction_func == 'mean':
+                        agg_value = group[col].mean(skipna=True)
+                    elif reduction_func == 'sum':
+                        agg_value = group[col].sum(skipna=True)
+                    elif reduction_func == 'std':
+                        agg_value = group[col].std(skipna=True)
+                    elif reduction_func == 'var':
+                        agg_value = group[col].var(skipna=True)
+                    elif reduction_func == 'min':
+                        agg_value = group[col].min(skipna=True)
+                    elif reduction_func == 'max':
+                        agg_value = group[col].max(skipna=True)
+                    elif reduction_func == 'median':
+                        agg_value = group[col].median(skipna=True)
+                    elif reduction_func == 'count':
+                        agg_value = group[col].count()
+                    elif reduction_func == 'quantile':
+                        q = kwargs.get('q', 0.5)
+                        agg_value = group[col].quantile(q, skipna=True)
+                    else:
+                        agg_value = group[col].mean(skipna=True)
+
+                    new_row[col] = agg_value
+
+                except Exception as e:
+                    new_row[col] = np.nan
+
+            for col in self._obj.columns:
+                if col not in target_scalar_columns and col != 'geometry':
+                    new_row[col] = group[col].iloc[0]
+
+            result_data.append(new_row)
+            result_geometries.append(geometry_key)
+            result_index.append(geometry_key)
+
+        return self._ensure_proper_geodataframe(result_data, result_geometries, result_index, geometry_level)
+
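For scalar columns, temporal aggregation is a plain pandas groupby over the geometry index level: one output row per geometry, reduced across the time rows. A hedged sketch of the same effect without the accessor:

import pandas as pd
from shapely.geometry import Point

idx = pd.MultiIndex.from_product(
    [[Point(0, 0), Point(1, 1)], [0, 1, 2]],
    names=['geometry', 'time'],
)
df = pd.DataFrame({'ndvi_mean': [0.2, 0.4, 0.6, 0.1, 0.3, 0.5]}, index=idx)

# One row per geometry, averaged across the time level.
out = df.groupby(level='geometry').mean()
print(out)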
+    def _apply_mixed_aggregation(self, reduction_func: str, temporal_dims: List[str],
+                                 spatial_dims: List[str], target_xarray_columns: List[str],
+                                 target_scalar_columns: List[str], **kwargs):
+        geometry_level = self._get_geometry_level_name()
+        if geometry_level is None:
+            raise ValueError("Could not identify geometry level in MultiIndex")
+
+        if target_xarray_columns != self._xarray_columns:
+            target_xarray_columns = self._xarray_columns
 
-        # Group by geometry level
         grouped = self._obj.groupby(level=geometry_level)
 
         result_data = []
@@ -153,24 +372,22 @@ class GeoXarrayAccessor:
         result_index = []
 
         for geometry_key, group in grouped:
-            # For each geometry, collect all xarray objects across time
-            # The geometry is the group key itself (from the MultiIndex)
             new_row = {}
 
-            for col in target_columns:
+            for col in target_xarray_columns:
                 xarray_objects = []
+                valid_time_steps = 0
+                total_time_steps = len(group)
 
-                # Collect all xarray objects for this geometry across different times
                 for _, row in group.iterrows():
                     xr_data = self._extract_xarray_object(row[col])
                     if xr_data is not None:
                         xarray_objects.append(xr_data)
+                        valid_time_steps += 1
 
                 if xarray_objects:
                     try:
-                        # Concatenate along a new 'time' dimension
                         if isinstance(xarray_objects[0], xr.DataArray):
-                            # Create time coordinate
                             time_coords = list(range(len(xarray_objects)))
                             concatenated = xr.concat(xarray_objects, dim='time')
                             concatenated = concatenated.assign_coords(time=time_coords)
@@ -181,31 +398,27 @@
                         else:
                             raise TypeError(f"Unsupported xarray type: {type(xarray_objects[0])}")
 
-                        # Apply the reduction function over the time dimension
                         if hasattr(concatenated, reduction_func):
                             if 'skipna' not in kwargs and reduction_func in ['mean', 'sum', 'std', 'var', 'min', 'max', 'median', 'quantile']:
                                 kwargs['skipna'] = True
 
-                            reduced_data = getattr(concatenated, reduction_func)(dim='time', **kwargs)
+                            if temporal_dims:
+                                reduced_data = getattr(concatenated, reduction_func)(dim='time', **kwargs)
+                            else:
+                                reduced_data = concatenated
+
+                            if spatial_dims:
+                                available_spatial_dims = [d for d in spatial_dims if d in reduced_data.dims]
+                                if available_spatial_dims:
+                                    reduced_data = getattr(reduced_data, reduction_func)(dim=available_spatial_dims, **kwargs)
 
-                            # Check if result should be converted to scalar
-                            if isinstance(reduced_data, xr.DataArray) and reduced_data.size == 1:
-                                try:
-                                    scalar_value = float(reduced_data.values)
-                                    reduced_data = scalar_value
-                                except (ValueError, TypeError):
-                                    pass
-                            elif isinstance(reduced_data, xr.Dataset) and len(reduced_data.dims) == 0:
-                                try:
-                                    vars_list = list(reduced_data.data_vars.keys())
-                                    if len(vars_list) == 1:
-                                        var_name = vars_list[0]
-                                        scalar_value = float(reduced_data[var_name].values)
-                                        reduced_data = scalar_value
-                                except (ValueError, TypeError, KeyError):
-                                    pass
+                            all_dims_reduced = (
+                                temporal_dims and spatial_dims and
+                                set(temporal_dims + spatial_dims) >= set(reduced_data.dims)
+                            )
+                            if all_dims_reduced:
+                                reduced_data = self._try_convert_to_scalar(reduced_data)
 
-                            # Maintain original format (list vs direct)
                             original_format = group[col].iloc[0]
                             if isinstance(original_format, list):
                                 new_row[col] = [reduced_data]
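The core temporal step, unchanged across these hunks, stacks the per-row rasters along a new 'time' dimension before reducing. A standalone sketch:

import numpy as np
import xarray as xr

# Three time steps of the same 2x2 grid, held as separate objects
steps = [xr.DataArray(np.full((2, 2), float(t)), dims=('y', 'x')) for t in range(3)]

stacked = xr.concat(steps, dim='time')
stacked = stacked.assign_coords(time=list(range(len(steps))))

# Reducing over 'time' recovers a single 2x2 grid
print(stacked.mean(dim='time', skipna=True).values)  # [[1. 1.] [1. 1.]]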
@@ -215,51 +428,215 @@
                         else:
                             raise AttributeError(f"'{type(concatenated).__name__}' object has no attribute '{reduction_func}'")
 
                     except Exception as e:
-                        print(f"Warning: Could not apply {reduction_func} to geometry {geometry_key}, column {col}: {e}")
-                        # Keep the first value as fallback
-                        new_row[col] = group[col].iloc[0]
+                        new_row[col] = np.nan
                 else:
-                    # No xarray data found, keep first value
+                    new_row[col] = np.nan
+
+            for col in target_scalar_columns:
+                new_row[col] = group[col].iloc[0]
+
+            for col in self._obj.columns:
+                if (col not in target_xarray_columns and
+                    col not in target_scalar_columns and
+                    col != 'geometry'):
                     new_row[col] = group[col].iloc[0]
 
             result_data.append(new_row)
             result_geometries.append(geometry_key)
             result_index.append(geometry_key)
 
-        # Create result GeoDataFrame
-        # Create a normal DataFrame with just the data columns
-        result_df = pd.DataFrame(result_data, index=result_index)
+        return self._ensure_proper_geodataframe(result_data, result_geometries, result_index, geometry_level)
+
+    def _apply_mixed_scalar_xarray_aggregation(self, reduction_func: str, temporal_dims: List[str],
+                                               spatial_dims: List[str], target_xarray_columns: List[str],
+                                               target_scalar_columns: List[str], **kwargs):
+        geometry_level = self._get_geometry_level_name()
+        if geometry_level is None:
+            raise ValueError("Could not identify geometry level in MultiIndex")
 
-        # Add geometry as a temporary column
-        result_df['_temp_geom'] = result_geometries
+        grouped = self._obj.groupby(level=geometry_level)
 
-        # Convert to GeoDataFrame using the temporary geometry column
-        result_gdf = gpd.GeoDataFrame(result_df, geometry='_temp_geom')
+        result_data = []
+        result_geometries = []
+        result_index = []
 
-        # Drop the temporary geometry column (the geometry is now properly set as the active geometry)
-        result_gdf = result_gdf.drop(columns=['_temp_geom'])
+        for geometry_key, group in grouped:
+            new_row = {}
+
+            for col in target_xarray_columns:
+                xarray_objects = []
+
+                for _, row in group.iterrows():
+                    xr_data = self._extract_xarray_object(row[col])
+                    if xr_data is not None:
+                        xarray_objects.append(xr_data)
+
+                if xarray_objects:
+                    try:
+                        if isinstance(xarray_objects[0], xr.DataArray):
+                            time_coords = list(range(len(xarray_objects)))
+                            concatenated = xr.concat(xarray_objects, dim='time')
+                            concatenated = concatenated.assign_coords(time=time_coords)
+                        elif isinstance(xarray_objects[0], xr.Dataset):
+                            time_coords = list(range(len(xarray_objects)))
+                            concatenated = xr.concat(xarray_objects, dim='time')
+                            concatenated = concatenated.assign_coords(time=time_coords)
+                        else:
+                            raise TypeError(f"Unsupported xarray type: {type(xarray_objects[0])}")
+
+                        if hasattr(concatenated, reduction_func):
+                            if 'skipna' not in kwargs and reduction_func in ['mean', 'sum', 'std', 'var', 'min', 'max', 'median', 'quantile']:
+                                kwargs['skipna'] = True
+
+                            if temporal_dims:
+                                reduced_data = getattr(concatenated, reduction_func)(dim='time', **kwargs)
+                            else:
+                                reduced_data = concatenated
+
+                            if spatial_dims:
+                                available_spatial_dims = [d for d in spatial_dims if d in reduced_data.dims]
+                                if available_spatial_dims:
+                                    reduced_data = getattr(reduced_data, reduction_func)(dim=available_spatial_dims, **kwargs)
+
+                            all_dims_reduced = (
+                                temporal_dims and spatial_dims and
+                                set(temporal_dims + spatial_dims) >= set(reduced_data.dims)
+                            )
+                            if all_dims_reduced:
+                                reduced_data = self._try_convert_to_scalar(reduced_data)
+
+                            original_format = group[col].iloc[0]
+                            if isinstance(original_format, list):
+                                new_row[col] = [reduced_data]
+                            else:
+                                new_row[col] = reduced_data
+                        else:
+                            raise AttributeError(f"'{type(concatenated).__name__}' object has no attribute '{reduction_func}'")
+
+                    except Exception as e:
+                        new_row[col] = np.nan
+                else:
+                    new_row[col] = np.nan
+
+            for col in target_scalar_columns:
+                try:
+                    if reduction_func == 'mean':
+                        agg_value = group[col].mean(skipna=True)
+                    elif reduction_func == 'sum':
+                        agg_value = group[col].sum(skipna=True)
+                    elif reduction_func == 'std':
+                        agg_value = group[col].std(skipna=True)
+                    elif reduction_func == 'var':
+                        agg_value = group[col].var(skipna=True)
+                    elif reduction_func == 'min':
+                        agg_value = group[col].min(skipna=True)
+                    elif reduction_func == 'max':
+                        agg_value = group[col].max(skipna=True)
+                    elif reduction_func == 'median':
+                        agg_value = group[col].median(skipna=True)
+                    elif reduction_func == 'count':
+                        agg_value = group[col].count()
+                    elif reduction_func == 'quantile':
+                        q = kwargs.get('q', 0.5)
+                        agg_value = group[col].quantile(q, skipna=True)
+                    else:
+                        agg_value = group[col].mean(skipna=True)
+
+                    new_row[col] = agg_value
+
+                except Exception as e:
+                    new_row[col] = np.nan
+
+            for col in self._obj.columns:
+                if (col not in target_xarray_columns and
+                    col not in target_scalar_columns and
+                    col != 'geometry'):
+                    new_row[col] = group[col].iloc[0]
+
+            result_data.append(new_row)
+            result_geometries.append(geometry_key)
+            result_index.append(geometry_key)
+
+        return self._ensure_proper_geodataframe(result_data, result_geometries, result_index, geometry_level)
+
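The elif ladder over reduction names appears twice (here and in _apply_scalar_temporal_aggregation). An equivalent, more compact formulation would dispatch through getattr, which is how the xarray side of the accessor already works; this is a hedged sketch of that alternative, not what the package does for scalars:

import pandas as pd

def reduce_series(series: pd.Series, reduction_func: str, **kwargs):
    # pandas exposes mean/sum/std/var/min/max/median/count/quantile
    # as Series methods, so the name can be resolved dynamically
    method = getattr(series, reduction_func, series.mean)
    if reduction_func == 'quantile':
        return method(kwargs.get('q', 0.5))
    return method()

s = pd.Series([1.0, 2.0, 4.0])
print(reduce_series(s, 'median'))            # 2.0
print(reduce_series(s, 'quantile', q=0.25))  # 1.5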
+    def _apply_temporal_aggregation(self, reduction_func: str, temporal_dims: List[str],
+                                    target_xarray_columns: List[str], target_scalar_columns: List[str], **kwargs):
+        geometry_level = self._get_geometry_level_name()
+        if geometry_level is None:
+            raise ValueError("Could not identify geometry level in MultiIndex")
 
-        result_gdf.index.name = geometry_level
+        if target_xarray_columns != self._xarray_columns:
+            target_xarray_columns = self._xarray_columns
 
-        return result_gdf
+        grouped = self._obj.groupby(level=geometry_level)
+
+        result_data = []
+        result_geometries = []
+        result_index = []
+
+        for geometry_key, group in grouped:
+            new_row = {}
+
+            for col in target_xarray_columns:
+                xarray_objects = []
+
+                for _, row in group.iterrows():
+                    xr_data = self._extract_xarray_object(row[col])
+                    if xr_data is not None:
+                        xarray_objects.append(xr_data)
+
+                if xarray_objects:
+                    try:
+                        if isinstance(xarray_objects[0], xr.DataArray):
+                            time_coords = list(range(len(xarray_objects)))
+                            concatenated = xr.concat(xarray_objects, dim='time')
+                            concatenated = concatenated.assign_coords(time=time_coords)
+                        elif isinstance(xarray_objects[0], xr.Dataset):
+                            time_coords = list(range(len(xarray_objects)))
+                            concatenated = xr.concat(xarray_objects, dim='time')
+                            concatenated = concatenated.assign_coords(time=time_coords)
+                        else:
+                            raise TypeError(f"Unsupported xarray type: {type(xarray_objects[0])}")
+
+                        if hasattr(concatenated, reduction_func):
+                            if 'skipna' not in kwargs and reduction_func in ['mean', 'sum', 'std', 'var', 'min', 'max', 'median', 'quantile']:
+                                kwargs['skipna'] = True
+
+                            reduced_data = getattr(concatenated, reduction_func)(dim='time', **kwargs)
+
+                            original_format = group[col].iloc[0]
+                            if isinstance(original_format, list):
+                                new_row[col] = [reduced_data]
+                            else:
+                                new_row[col] = reduced_data
+                        else:
+                            raise AttributeError(f"'{type(concatenated).__name__}' object has no attribute '{reduction_func}'")
+
+                    except Exception as e:
+                        new_row[col] = np.nan
+                else:
+                    new_row[col] = np.nan
+
+            for col in target_scalar_columns:
+                new_row[col] = group[col].iloc[0]
+
+            for col in self._obj.columns:
+                if (col not in target_xarray_columns and
+                    col not in target_scalar_columns and
+                    col != 'geometry'):
+                    new_row[col] = group[col].iloc[0]
+
+            result_data.append(new_row)
+            result_geometries.append(geometry_key)
+            result_index.append(geometry_key)
+
+        return self._ensure_proper_geodataframe(result_data, result_geometries, result_index, geometry_level)
 
-    def _apply_spatial_reduction(self, reduction_func: str, dim: Optional[Union[str, List[str]]],
-                                 target_columns: List[str], **kwargs):
-        """
-        Apply reduction to spatial dimensions within each xarray object.
-
-        Args:
-            reduction_func: Name of the reduction method
-            dim: Spatial dimension(s) to reduce over
-            target_columns: Columns to operate on
-            **kwargs: Additional arguments
-
-        Returns:
-            GeoDataFrame with spatially reduced data
-        """
+    def _apply_spatial_reduction(self, reduction_func: str, spatial_dims: Optional[List[str]],
+                                 target_xarray_columns: List[str], **kwargs):
         result_gdf = self._obj.copy()
 
-        for col in target_columns:
+        for col in target_xarray_columns:
             new_data = []
             for idx, row in self._obj.iterrows():
                 original_value = row[col]
@@ -267,33 +644,21 @@
 
                 if xr_data is not None:
                     try:
-                        # Apply the reduction function
                         if hasattr(xr_data, reduction_func):
-                            # Ensure skipna=True is set by default for most reduction functions
                             if 'skipna' not in kwargs and reduction_func in ['mean', 'sum', 'std', 'var', 'min', 'max', 'median', 'quantile']:
                                 kwargs['skipna'] = True
-                            reduced_data = getattr(xr_data, reduction_func)(dim=dim, **kwargs)
 
-                            # Check if the result is a scalar and convert to float if so
-                            if isinstance(reduced_data, xr.DataArray):
-                                if reduced_data.size == 1:
-                                    try:
-                                        scalar_value = float(reduced_data.values)
-                                        reduced_data = scalar_value
-                                    except (ValueError, TypeError):
-                                        pass
-                            elif isinstance(reduced_data, xr.Dataset):
-                                try:
-                                    if len(reduced_data.dims) == 0:
-                                        vars_list = list(reduced_data.data_vars.keys())
-                                        if len(vars_list) == 1:
-                                            var_name = vars_list[0]
-                                            scalar_value = float(reduced_data[var_name].values)
-                                            reduced_data = scalar_value
-                                except (ValueError, TypeError, KeyError):
-                                    pass
+                            if spatial_dims:
+                                available_spatial_dims = [d for d in spatial_dims if d in xr_data.dims]
+                                if available_spatial_dims:
+                                    reduced_data = getattr(xr_data, reduction_func)(dim=available_spatial_dims, **kwargs)
+                                else:
+                                    reduced_data = xr_data
+                            else:
+                                reduced_data = getattr(xr_data, reduction_func)(dim=None, **kwargs)
+
+                            reduced_data = self._try_convert_to_scalar(reduced_data)
 
-                            # Keep the same format as original (list vs direct)
                             if isinstance(original_value, list):
                                 new_data.append([reduced_data])
                             else:
@@ -301,177 +666,284 @@
                         else:
                             raise AttributeError(f"'{type(xr_data).__name__}' object has no attribute '{reduction_func}'")
                     except Exception as e:
-                        # If reduction fails, keep original data
-                        print(f"Warning: Could not apply {reduction_func} to row {idx}, column {col}: {e}")
                         new_data.append(original_value)
                 else:
-                    # If it's not xarray data, keep as is
                     new_data.append(original_value)
 
             result_gdf[col] = new_data
 
         return result_gdf
+
+    def _apply_cloud_reduction(self, reduction_func: str, dim: Optional[Union[str, List[str]]] = None,
+                               columns: Optional[List[str]] = None, **kwargs):
+        current_time = time.time()
+        chain_reset_threshold = 0.01
+
+        if (self._last_operation_time is None or
+                current_time - self._last_operation_time > chain_reset_threshold):
+
+            if not self._pending_operations:
+                self._operation_sequence_id = int(current_time * 1000)
+                self._operation_count = 0
+
+        self._last_operation_time = current_time
+        self._operation_count += 1
+
+        params = {"dim": dim, "columns": columns, **kwargs}
+        description = f"Apply {reduction_func} over dimension(s): {dim}" if dim else f"Apply {reduction_func} over all dimensions"
+
+        operation = {
+            "type": reduction_func,
+            "description": description,
+            "params": params,
+            "timestamp": pd.Timestamp.now(),
+            "sequence_id": self._operation_sequence_id
+        }
+
+        self._pending_operations.append(operation)
+
+        chain_complete = self._schedule_chain_completion_check()
+
+        result = self._obj.copy()
+        result.attrs = self._obj.attrs.copy()
+
+        if hasattr(self._obj, 'client'):
+            object.__setattr__(result, 'client', self._obj.client)
+        if hasattr(self._obj, 'job_id'):
+            object.__setattr__(result, 'job_id', self._obj.job_id)
+        if hasattr(self._obj, 'job_name'):
+            object.__setattr__(result, 'job_name', self._obj.job_name)
+
+        if not result.attrs:
+            result.attrs = {}
+        if chain_complete:
+            job_result = self._trigger_processing_immediately()
+            # result.attrs['job_id'] = job_result
+            return job_result
+
+        result.attrs['_geo_chain_state'] = {
+            'pending_operations': self._pending_operations,
+            'operation_sequence_id': self._operation_sequence_id,
+            'last_operation_time': self._last_operation_time,
+            'operation_count': self._operation_count,
+            'processing_in_progress': getattr(self, '_processing_in_progress', False)
+        }
+
+        return result
+
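On cloud_object frames, each reduction is only recorded: the method returns a copy whose attrs carry the growing operation list, and nothing executes until the stack heuristic decides the chain's final call has been reached, at which point the pending operations are shipped off as a job. A hedged illustration of what a caller would see (cloud_gdf stands for a cloud_object produced elsewhere by terrakio's zonal-stats workflow):

# Intermediate calls return DataFrame copies that just accumulate state:
step = cloud_gdf.geo.mean(dim='time')
print(step.attrs['_geo_chain_state']['pending_operations'][0]['type'])  # 'mean'

# On the call judged to end the chain, the accessor instead submits the
# recorded operations for server-side processing and returns the job result.
job = cloud_gdf.geo.mean(dim='time').geo.max(dim=['x', 'y'])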
+    def _apply_local_reduction(self, reduction_func: str, dim: Optional[Union[str, List[str]]] = None,
+                               columns: Optional[List[str]] = None, **kwargs):
+        target_xarray_columns, target_scalar_columns = self._get_target_columns(columns)
+
+        if dim is None:
+            if target_xarray_columns:
+                return self._apply_spatial_reduction(reduction_func, dim, target_xarray_columns, **kwargs)
+            else:
+                return self._obj.copy()
+
+        dims_to_reduce = [dim] if isinstance(dim, str) else dim
+
+        temporal_dims = [d for d in dims_to_reduce if d == 'time']
+        spatial_dims = [d for d in dims_to_reduce if d != 'time']
+
+        has_temporal_agg = (
+            temporal_dims and
+            hasattr(self._obj.index, 'names') and
+            self._obj.index.names and
+            'time' in self._obj.index.names
+        )
+
+        if has_temporal_agg and target_scalar_columns and not target_xarray_columns:
+            return self._apply_scalar_temporal_aggregation(reduction_func, target_scalar_columns, **kwargs)
+
+        if has_temporal_agg and target_scalar_columns and target_xarray_columns:
+            return self._apply_mixed_scalar_xarray_aggregation(reduction_func, temporal_dims, spatial_dims,
+                                                               target_xarray_columns, target_scalar_columns, **kwargs)
+
+        if not target_xarray_columns and target_scalar_columns:
+            if spatial_dims:
+                pass
+            return self._obj.copy()
+
+        if has_temporal_agg and spatial_dims:
+            return self._apply_mixed_aggregation(reduction_func, temporal_dims, spatial_dims,
+                                                 target_xarray_columns, target_scalar_columns, **kwargs)
+        elif has_temporal_agg:
+            return self._apply_temporal_aggregation(reduction_func, temporal_dims,
+                                                    target_xarray_columns, target_scalar_columns, **kwargs)
+        else:
+            return self._apply_spatial_reduction(reduction_func, spatial_dims,
+                                                 target_xarray_columns, **kwargs)
+
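_apply_local_reduction routes each call by splitting the requested dims into 'time' versus everything else and checking whether time lives in the row index. Summarised as a hedged routing table (derived from the branches above):

# Routing in _apply_local_reduction, given dim and the frame's index:
#   dim=None                             -> per-cell reduction over all dims
#   dim='time'        (time in index)    -> pandas groupby over geometry rows
#   dim=['x', 'y']                       -> per-cell spatial reduction
#   dim=['time','x','y'] (time in index) -> group rows, then reduce x/y
#   scalar-only columns + spatial dims   -> frame returned unchanged (copy)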
+    def _apply_reduction(self, reduction_func: str, dim: Optional[Union[str, List[str]]] = None,
+                         columns: Optional[List[str]] = None, **kwargs):
+        if isinstance(self._obj, cloud_object):
+            return self._apply_cloud_reduction(reduction_func=reduction_func, dim=dim, columns=columns, **kwargs)
+        else:
+            return self._apply_local_reduction(reduction_func=reduction_func, dim=dim, columns=columns, **kwargs)
+
+    def _sync_generate_and_start_processing(self):
+        if not self._pending_operations or getattr(self, '_processing_in_progress', False):
+            return None
+
+        self._processing_in_progress = True
+
+        try:
+            sequence_id = self._operation_sequence_id
+            script_content = self._generate_post_processing_script()
+            client = self._client
+            if client:
+                mass_stats = MassStats(client)
+
+                import asyncio
+                import concurrent.futures
+
+                def run_async():
+                    loop = asyncio.new_event_loop()
+                    asyncio.set_event_loop(loop)
+                    # We don't actually have the dataset name; currently the
+                    # job is just submitted under the zonal-stats job name.
+                    try:
+                        return loop.run_until_complete(
+                            mass_stats.zonal_stats_transform(
+                                data_name=self._obj.job_name,
+                                output="netcdf",
+                                consumer=script_content.encode('utf-8'),
+                                overwrite=True,
+                            )
+                        )
+                    finally:
+                        loop.close()
+
+                with concurrent.futures.ThreadPoolExecutor() as executor:
+                    future = executor.submit(run_async)
+                    result = future.result(timeout=30)
+                    return result
+
+            return None
+
+        except Exception as e:
+            return None
+        finally:
+            self._processing_in_progress = False
+            self._pending_operations.clear()
+
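Because zonal_stats_transform is a coroutine and the accessor may be invoked from inside an already-running event loop (e.g. Jupyter), the code runs it on a fresh loop inside a worker thread. A minimal, self-contained sketch of that pattern (fake_job is a stand-in for the real coroutine):

import asyncio
import concurrent.futures

async def fake_job():
    await asyncio.sleep(0.1)
    return 'job-123'

def run_async():
    # A fresh loop in this worker thread avoids clashing with any
    # event loop already running in the calling thread
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        return loop.run_until_complete(fake_job())
    finally:
        loop.close()

with concurrent.futures.ThreadPoolExecutor() as executor:
    print(executor.submit(run_async).result(timeout=30))  # job-123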
+    def _generate_post_processing_script(self) -> str:
+        script_lines = [
+            "import pandas as pd",
+            "import xarray as xr",
+            "import numpy as np",
+            "from io import BytesIO",
+            "import tempfile",
+            "import os",
+            "",
+            "def consume(filename, file_bytes, metadata):",
+        ]
+
+        script_lines.extend([
+            "    ",
+            "    try:",
+            "        with tempfile.NamedTemporaryFile(suffix='.nc', delete=False) as tmp_file:",
+            "            tmp_file.write(file_bytes)",
+            "            tmp_file.flush()",
+            "            ds = xr.open_dataset(tmp_file.name, engine='scipy')",
+            "        ",
+        ])
+
+        for i, op in enumerate(self._pending_operations):
+            op_type = op['type']
+            params = op['params']
+            dim = params.get('dim')
+
+            if dim:
+                dim_str = repr(dim)
+                script_lines.append(f"        ds = ds.{op_type}(dim={dim_str}, skipna=True)")
+            else:
+                script_lines.append(f"        ds = ds.{op_type}(skipna=True)")
+            script_lines.append("")
+
+        script_lines.extend([
+            "        # Determine output format based on data structure",
+            "        base_filename = os.path.splitext(filename)[0]",
+            "        ",
+            "        # Check if all data variables are scalar (0-dimensional)",
+            "        all_scalar = True",
+            "        for var_name in ds.data_vars:",
+            "            if ds[var_name].dims:",
+            "                all_scalar = False",
+            "                break",
+            "        ",
+            "        if all_scalar:",
+            "            # Output as CSV - all variables are scalar",
+            "            result_data = {}",
+            "            for var_name in ds.data_vars:",
+            "                result_data[var_name] = float(ds[var_name].values)",
+            "            ",
+            "            result_df = pd.DataFrame([result_data])",
+            '            output_filename = f"{base_filename}_processed.csv"',
+            "            csv_data = result_df.to_csv(index=False).encode()",
+            "            ",
+            "            ds.close()",
+            "            os.unlink(tmp_file.name)",
+            "            return output_filename, csv_data",
+            "        else:",
+            "            # Output as NetCDF - still has dimensions",
+            '            output_filename = f"{base_filename}_processed.nc"',
+            "            # Use temp file instead of BytesIO to avoid buffer closing issues",
+            "            with tempfile.NamedTemporaryFile(suffix='.nc', delete=False) as nc_tmp_file:",
+            "                ds.to_netcdf(nc_tmp_file.name, format='NETCDF3_64BIT')",
+            "            ",
+            "            # Read the temp file back as bytes",
+            "            with open(nc_tmp_file.name, 'rb') as f:",
+            "                netcdf_data = f.read()",
+            "            ",
+            "            # Clean up temp files",
+            "            os.unlink(nc_tmp_file.name)",
+            "            ",
+            "            ds.close()",
+            "            os.unlink(tmp_file.name)",
+            "            return output_filename, netcdf_data",
+        ])
+
+        script_lines.extend([
+            "    ",
+            "    except Exception as e:",
+            "        try:",
+            "            os.unlink(tmp_file.name)",
+            "        except:",
+            "            pass",
+            "        try:",
+            "            os.unlink(nc_tmp_file.name)",
+            "        except:",
+            "            pass",
+            "        return None, None",
+        ])
+
+        return "\n".join(script_lines)
+
+    @property
+    def job_id(self):
+        return self._obj.attrs.get('job_id')
 
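The generator translates each recorded operation into one line of the consumer script. What the loop contributes for a single recorded operation can be checked in isolation:

# For a pending operation like {'type': 'mean', 'params': {'dim': 'time'}}:
op_type, dim = 'mean', 'time'
line = f"        ds = ds.{op_type}(dim={repr(dim)}, skipna=True)"
print(line)
# Output:
#         ds = ds.mean(dim='time', skipna=True)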
 
     def mean(self, dim: Optional[Union[str, List[str]]] = None, columns: Optional[List[str]] = None, **kwargs):
-        """
-        Calculate mean of xarray datasets/dataarrays.
-
-        Args:
-            dim: Dimension(s) to reduce over. If 'time', aggregates across time rows for each geometry.
-                 If spatial dims like 'x', 'y', reduces within each xarray object.
-            columns: List of column names to operate on. If None, operates on all xarray columns
-            **kwargs: Additional arguments for the reduction function
-        """
         return self._apply_reduction('mean', dim=dim, columns=columns, **kwargs)
 
     def sum(self, dim: Optional[Union[str, List[str]]] = None, columns: Optional[List[str]] = None, **kwargs):
-        """
-        Calculate sum of xarray datasets/dataarrays.
-
-        Args:
-            dim: Dimension(s) to reduce over. If 'time', aggregates across time rows for each geometry.
-            columns: List of column names to operate on. If None, operates on all xarray columns
-            **kwargs: Additional arguments for the reduction function
-        """
         return self._apply_reduction('sum', dim=dim, columns=columns, **kwargs)
 
+    def max(self, dim: Optional[Union[str, List[str]]] = None, columns: Optional[List[str]] = None, **kwargs):
+        return self._apply_reduction('max', dim=dim, columns=columns, **kwargs)
+
+    def min(self, dim: Optional[Union[str, List[str]]] = None, columns: Optional[List[str]] = None, **kwargs):
+        return self._apply_reduction('min', dim=dim, columns=columns, **kwargs)
+
     def std(self, dim: Optional[Union[str, List[str]]] = None, columns: Optional[List[str]] = None, **kwargs):
-        """
-        Calculate standard deviation of xarray datasets/dataarrays.
-
-        Args:
-            dim: Dimension(s) to reduce over. If 'time', aggregates across time rows for each geometry.
-            columns: List of column names to operate on. If None, operates on all xarray columns
-            **kwargs: Additional arguments for the reduction function
-        """
         return self._apply_reduction('std', dim=dim, columns=columns, **kwargs)
 
     def var(self, dim: Optional[Union[str, List[str]]] = None, columns: Optional[List[str]] = None, **kwargs):
-        """
-        Calculate variance of xarray datasets/dataarrays.
-
-        Args:
-            dim: Dimension(s) to reduce over. If 'time', aggregates across time rows for each geometry.
-            columns: List of column names to operate on. If None, operates on all xarray columns
-            **kwargs: Additional arguments for the reduction function
-        """
         return self._apply_reduction('var', dim=dim, columns=columns, **kwargs)
 
-    def min(self, dim: Optional[Union[str, List[str]]] = None, columns: Optional[List[str]] = None, **kwargs):
-        """
-        Calculate minimum of xarray datasets/dataarrays.
-
-        Args:
-            dim: Dimension(s) to reduce over. If 'time', aggregates across time rows for each geometry.
-            columns: List of column names to operate on. If None, operates on all xarray columns
-            **kwargs: Additional arguments for the reduction function
-        """
-        return self._apply_reduction('min', dim=dim, columns=columns, **kwargs)
-
-    def max(self, dim: Optional[Union[str, List[str]]] = None, columns: Optional[List[str]] = None, **kwargs):
-        """
-        Calculate maximum of xarray datasets/dataarrays.
-
-        Args:
-            dim: Dimension(s) to reduce over. If 'time', aggregates across time rows for each geometry.
-            columns: List of column names to operate on. If None, operates on all xarray columns
-            **kwargs: Additional arguments for the reduction function
-        """
-        return self._apply_reduction('max', dim=dim, columns=columns, **kwargs)
-
     def median(self, dim: Optional[Union[str, List[str]]] = None, columns: Optional[List[str]] = None, **kwargs):
-        """
-        Calculate median of xarray datasets/dataarrays.
-
-        Args:
-            dim: Dimension(s) to reduce over. If 'time', aggregates across time rows for each geometry.
-            columns: List of column names to operate on. If None, operates on all xarray columns
-            **kwargs: Additional arguments for the reduction function
-        """
         return self._apply_reduction('median', dim=dim, columns=columns, **kwargs)
 
-    def quantile(self, q: float, dim: Optional[Union[str, List[str]]] = None,
-                 columns: Optional[List[str]] = None, **kwargs):
-        """
-        Calculate quantile of xarray datasets/dataarrays.
-
-        Args:
-            q: Quantile to compute (between 0 and 1)
-            dim: Dimension(s) to reduce over. If 'time', aggregates across time rows for each geometry.
-            columns: List of column names to operate on. If None, operates on all xarray columns
-            **kwargs: Additional arguments for the reduction function
-        """
-        return self._apply_reduction('quantile', dim=dim, columns=columns, q=q, **kwargs)
-
     def count(self, dim: Optional[Union[str, List[str]]] = None, columns: Optional[List[str]] = None, **kwargs):
-        """
-        Count non-NaN values in xarray datasets/dataarrays.
-
-        Args:
-            dim: Dimension(s) to reduce over. If 'time', aggregates across time rows for each geometry.
-            columns: List of column names to operate on. If None, operates on all xarray columns
-            **kwargs: Additional arguments for the reduction function
-        """
-        return self._apply_reduction('count', dim=dim, columns=columns, **kwargs)
-
-    def to_values(self, columns: Optional[List[str]] = None):
-        """
-        Extract scalar values from xarray dataarrays and add them as new columns.
-        Useful when dataarrays have been reduced to single values.
-
-        Args:
-            columns: List of column names to operate on. If None, operates on all xarray columns
-
-        Returns:
-            GeoDataFrame with extracted values as new columns
-        """
-        result_gdf = self._obj.copy()
-        target_columns = self._get_target_columns(columns)
-
-        for col in target_columns:
-            values_to_add = []
-            for idx, row in self._obj.iterrows():
-                xr_data = self._extract_xarray_object(row[col])
-                if isinstance(xr_data, xr.DataArray):
-                    try:
-                        if xr_data.size == 1:
-                            scalar_value = float(xr_data.values)
-                            values_to_add.append(scalar_value)
-                        else:
-                            values_to_add.append(np.nan)  # Can't convert non-scalar to value
-                    except (ValueError, TypeError):
-                        values_to_add.append(np.nan)
-                else:
-                    values_to_add.append(np.nan)
-
-            # Add new column with scalar values
-            new_col_name = f"{col}_value"
-            result_gdf[new_col_name] = values_to_add
-
-        return result_gdf
-
-    def info(self):
-        """Print information about xarray datasets/dataarrays in the GeoDataFrame."""
-        print(f"GeoDataFrame shape: {self._obj.shape}")
-        print(f"Columns: {list(self._obj.columns)}")
-        print(f"Xarray columns: {self._xarray_columns}")
-        print(f"Index structure: {self._obj.index.names if hasattr(self._obj.index, 'names') else 'Simple index'}")
-        print(f"Geometry column name: {self._obj.geometry.name if hasattr(self._obj.geometry, 'name') else 'No geometry name'}")
-
-        if hasattr(self._obj.index, 'names') and 'time' in self._obj.index.names:
-            print("Note: Time dimension appears to be expanded into the index.")
-            print("Use dim='time' to aggregate across time rows for each geometry.")
-
-        for col in self._xarray_columns:
-            print(f"\n--- Column: {col} ---")
-            sample_data = self._extract_xarray_object(self._obj[col].iloc[0]) if len(self._obj) > 0 else None
-            if isinstance(sample_data, xr.Dataset):
-                print(f"Type: xarray.Dataset")
-                print(f"Variables: {list(sample_data.data_vars.keys())}")
-                print(f"Dimensions: {list(sample_data.dims.keys())}")
-                print(f"Coordinates: {list(sample_data.coords.keys())}")
-            elif isinstance(sample_data, xr.DataArray):
-                print(f"Type: xarray.DataArray")
-                print(f"Dimensions: {list(sample_data.dims)}")
-                print(f"Shape: {sample_data.shape}")
-                print(f"Data type: {sample_data.dtype}")
+        return self._apply_reduction('count', dim=dim, columns=columns, **kwargs)
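Net effect for users of the public API: the docstrings are gone, quantile, to_values, and info were removed, and every remaining reducer funnels into _apply_reduction. A hedged usage sketch of the surviving surface (assumes a local GeoDataFrame the accessor validates, as above):

# Per-cell spatial reduction, then temporal aggregation per geometry:
spatial = gdf.geo.mean(dim=['x', 'y'])   # rasters -> scalars per row
overall = spatial.geo.mean(dim='time')   # one row per geometry

# Column-restricted reductions are still supported:
subset = gdf.geo.max(dim=['x', 'y'], columns=['ndvi'])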