terrakio-core 0.4.3-py3-none-any.whl → 0.4.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of terrakio-core might be problematic.

@@ -1,132 +1,282 @@
+
 import os
 import asyncio
 import tempfile
 import time
 import pandas as pd
+import geopandas as gpd
 from geopandas import GeoDataFrame
 from shapely.geometry import mapping
+from pathlib import Path
 from ..exceptions import APIError, ConfigurationError
 from ..helper.bounded_taskgroup import BoundedTaskGroup
 from ..helper.tiles import tiles
 import uuid
 import xarray as xr
+import random
+import psutil
+import copy
+from shapely.geometry import shape
+from shapely.ops import transform
+from shapely.geometry import box
+import pyproj
 
-async def request_data(
-    client,
-    gdf: GeoDataFrame,
-    expr: str,
-    conc: int = 20,
-    in_crs: str = "epsg:4326",
-    out_crs: str = "epsg:4326",
-    resolution: int = -1,
-    geom_fix: bool = False,
-    max_memory_mb: int = 500,
-    stream_to_disk: bool = None,
-):
+import pandas as pd
+import geopandas as gpd
+
+def expand_on_time(gdf):
+    """
+    Expand datasets on time dimension - each time becomes a new row.
+
+    Input: GeoDataFrame with 'geometry' and 'dataset' columns (or variable columns)
+    Output: GeoDataFrame with time in multi-index and datasets without time coordinate
     """
-    Request xarray datasets for all geometries in a GeoDataFrame.
+    rows = []
+
+    for idx, row in gdf.iterrows():
+        if 'geometry' in gdf.columns:
+            geometry = row['geometry']
+        elif gdf.index.name == 'geometry':
+            geometry = idx
+        else:
+            raise ValueError(f"Cannot find geometry in columns: {list(gdf.columns)} or index: {gdf.index.name}")
+
+        if 'dataset' in gdf.columns:
+            dataset = row['dataset']
+
+            if 'time' in dataset.dims:
+                for time_val in dataset.time.values:
+                    time_slice = dataset.sel(time=time_val).drop_vars('time')
+                    rows.append({
+                        'geometry': geometry,
+                        'time': time_val,
+                        'dataset': time_slice
+                    })
+            else:
+                rows.append({
+                    'geometry': geometry,
+                    'dataset': dataset
+                })
+        else:
+            variable_columns = list(gdf.columns)
+
+            first_dataset = row[variable_columns[0]]
+            if 'time' in first_dataset.dims:
+                time_values = first_dataset.time.values
+
+                for time_val in time_values:
+                    row_data = {'geometry': geometry, 'time': time_val}
+
+                    for var_col in variable_columns:
+                        dataset = row[var_col]
+                        time_slice = dataset.sel(time=time_val).drop_vars('time')
+                        row_data[var_col] = time_slice
+
+                    rows.append(row_data)
+            else:
+                row_data = {'geometry': geometry}
+                for var_col in variable_columns:
+                    row_data[var_col] = row[var_col]
+                rows.append(row_data)
+
+    result_df = pd.DataFrame(rows)
+
+    if 'time' in result_df.columns:
+        result_gdf = gpd.GeoDataFrame(result_df, geometry='geometry')
+        result_gdf = result_gdf.set_index(['geometry', 'time'])
+    else:
+        result_gdf = gpd.GeoDataFrame(result_df, geometry='geometry')
+        result_gdf = result_gdf.set_index(['geometry'])
+
+    return result_gdf
 
-    Args:
-        client: The AsyncClient instance
-        gdf (GeoDataFrame): GeoDataFrame containing geometries
-        expr (str): Terrakio expression to evaluate, can include spatial aggregations
-        conc (int): Number of concurrent requests to make
-        in_crs (str): Input coordinate reference system
-        out_crs (str): Output coordinate reference system
-        resolution (int): Resolution parameter
-        geom_fix (bool): Whether to fix the geometry (default False)
-        max_memory_mb (int): Maximum memory threshold in MB (default 500MB)
-        stream_to_disk (bool): Whether to stream large datasets to disk. If None, will be determined automatically.
+def expand_on_variables(gdf):
+    """
+    Expand datasets on variables dimension - each variable becomes a new column.
+
+    Input: GeoDataFrame with 'geometry' and 'dataset' columns (or already time-expanded)
+    Output: GeoDataFrame with separate column for each variable
+    """
+    rows = []
+
+    for idx, row in gdf.iterrows():
+        if 'geometry' in gdf.columns:
+            geometry = row['geometry']
+        elif hasattr(gdf.index, 'names') and 'geometry' in gdf.index.names:
+            if isinstance(idx, tuple):
+                geometry_idx = gdf.index.names.index('geometry')
+                geometry = idx[geometry_idx]
+                time_idx = gdf.index.names.index('time')
+                time_val = idx[time_idx]
+            else:
+                geometry = idx
+                time_val = None
+        else:
+            raise ValueError(f"Cannot find geometry in columns: {list(gdf.columns)} or index: {gdf.index.names}")
+
+        if 'dataset' in gdf.columns:
+            dataset = row['dataset']
+
+            var_names = list(dataset.data_vars.keys())
+
+            if len(var_names) <= 1:
+                if len(var_names) == 0:
+                    continue
+
+            if hasattr(gdf.index, 'names') and 'time' in gdf.index.names:
+                row_data = {'geometry': geometry, 'time': time_val}
+            else:
+                row_data = {'geometry': geometry}
+
+            for var_name in var_names:
+                var_dataset = dataset[[var_name]]
+
+                if len(var_dataset.dims) == 0:
+                    row_data[var_name] = float(var_dataset[var_name].values)
+                else:
+                    row_data[var_name] = var_dataset
+
+            rows.append(row_data)
+        else:
+            raise ValueError("Expected 'dataset' column for variable expansion")
+
+    result_df = pd.DataFrame(rows)
+
+    if 'time' in result_df.columns:
+        result_gdf = gpd.GeoDataFrame(result_df, geometry='geometry')
+        result_gdf = result_gdf.set_index(['geometry', 'time'])
+    else:
+        result_gdf = gpd.GeoDataFrame(result_df, geometry='geometry')
+        result_gdf = result_gdf.set_index(['geometry'])
+
+    return result_gdf
 
-    Returns:
-        geopandas.GeoDataFrame: Copy of input GeoDataFrame with additional 'dataset' column
-        containing the xarray Dataset for each geometry.
 
-    Raises:
-        ValueError: If concurrency is too high or if data exceeds memory limit without streaming
-        APIError: If the API request fails
+def expand_on_variables_and_time(gdf):
     """
-    if conc > 100:
-        raise ValueError("Concurrency (conc) is too high. Please set conc to 100 or less.")
+    Convenience function to expand on both variables and time.
+    Automatically detects which expansions are possible.
+    """
+    try:
+        expanded_on_time = expand_on_time(gdf)
+    except Exception as e:
+        expanded_on_time = gdf
 
-    total_geometries = len(gdf)
+    try:
+        expanded_on_variables_and_time = expand_on_variables(expanded_on_time)
+        return expanded_on_variables_and_time
+    except Exception as e:
+        return expanded_on_time
+
+def estimate_geometry_size_ratio(queries: list):
+    """Calculate size ratios for all geometries relative to the first geometry using bounding box area."""
 
-    # First, make a request with the first geometry to estimate total memory usage
-    client.logger.info("Estimating total memory usage...")
-    first_geom = gdf.geometry.iloc[0]
-    feature = {
-        "type": "Feature",
-        "geometry": mapping(first_geom),
-        "properties": {}
-    }
+    areas = []
 
-    try:
-        first_result = await client.geoquery(expr=expr, feature=feature,
-            in_crs=in_crs, out_crs=out_crs, resolution=resolution, geom_fix=geom_fix)
-        if isinstance(first_result, dict) and first_result.get("error"):
-            error_msg = f"Request failed: {first_result.get('error_message', 'Unknown error')}"
-            if first_result.get('status_code'):
-                error_msg = f"Request failed with status {first_result['status_code']}: {first_result.get('error_message', 'Unknown error')}"
-            raise APIError(error_msg)
+    for query in queries:
+        geom = shape(query["feature"]["geometry"])
+        in_crs = query["in_crs"]
 
-        if not isinstance(first_result, xr.Dataset):
-            raise ValueError(f"Expected xarray Dataset, got {type(first_result)}")
+        if in_crs and in_crs != 'EPSG:3857':
+            transformer = pyproj.Transformer.from_crs(in_crs, 'EPSG:3857', always_xy=True)
+            transformed_geom = transform(transformer.transform, geom)
+            bbox = box(*transformed_geom.bounds)
+            area = bbox.area
+        else:
+            bbox = box(*geom.bounds)
+            area = bbox.area
 
-        # Estimate total memory usage
-        single_dataset_size_bytes = estimate_dataset_size(first_result)
-        total_size_bytes = single_dataset_size_bytes * total_geometries
-        total_size_mb = total_size_bytes / (1024 * 1024)
-
-        client.logger.info(f"Estimated total memory usage: {total_size_mb:.2f} MB for {total_geometries} geometries")
+        areas.append(area)
+    base_area = areas[0]
+
+    if base_area == 0:
+        non_zero_areas = [area for area in areas if area > 0]
+        base_area = non_zero_areas[0] if non_zero_areas else 1.0
+
+    ratios = []
+    for area in areas:
+        if area == 0:
+            ratios.append(0.1)
+        else:
+            ratios.append(area / base_area)
+
+    return ratios
+
+async def estimate_query_size(
+    client,
+    quries: list[dict],
+):
+    first_query = quries[0]
+
+    first_query_dataset = await client.geoquery(**first_query)
+    ratios = estimate_geometry_size_ratio(quries)
+    total_size_mb = 0
+    for i in range(len(ratios)):
+        total_size_mb += first_query_dataset.nbytes * ratios[i] / (1024**2)
+    return total_size_mb
+
+async def request_geoquery_list(
+    client,
+    quries: list[dict],
+    conc: int = 20,
+):
+    """
+    Execute multiple geo queries.
+
+    Args:
+        client: The Terrakio client instance
+        quries: List of dictionaries containing query parameters
+        conc: The concurrency level for the requests
 
-    # Check if we need to stream to disk
-    if stream_to_disk is None:
-        # Auto-determine based on memory usage
-        if total_size_mb > max_memory_mb:
-            client.logger.warning(f"The data you are requesting exceeds {max_memory_mb} MB, we recommend you to set the stream_to_disk parameter to True")
-            raise ValueError(f"The data you are requesting exceeds {max_memory_mb} MB, we recommend you to set the stream_to_disk parameter to True")
+    Returns:
+        List of query results
 
-    except Exception as e:
-        if "recommend you to set the stream_to_disk parameter to True" in str(e):
-            raise
-        client.logger.error(f"Failed to estimate memory usage: {e}")
-        raise
-
-    client.logger.info(f"Processing {total_geometries} geometries with concurrency {conc}")
+    Raises:
+        ValueError: If the queries list is empty
+    """
+    if not quries:
+        raise ValueError("Queries list cannot be empty")
+    if conc > 100:
+        raise ValueError("Concurrency (conc) is too high. Please set conc to 100 or less.")
+
+    for i, query in enumerate(quries):
+        if 'expr' not in query:
+            raise ValueError(f"Query at index {i} is missing the required 'expr' key")
+        if 'feature' not in query:
+            raise ValueError(f"Query at index {i} is missing the required 'feature' key")
+        if 'in_crs' not in query:
+            raise ValueError(f"Query at index {i} is missing the required 'in_crs' key")
 
     completed_count = 0
     lock = asyncio.Lock()
-
-    async def process_geometry(geom):
-        """Process a single geometry and return the dataset"""
-        nonlocal completed_count
+    async def single_geo_query(query):
+        """
+        Execute multiple geo queries concurrently.
 
+        Args:
+            quries: List of dictionaries containing query parameters
+        """
+        total_number_of_requests = len(quries)
+        nonlocal completed_count
         try:
-            feature = {
-                "type": "Feature",
-                "geometry": mapping(geom),
-                "properties": {}
-            }
-            # Request xarray dataset
-            result = await client.geoquery(expr=expr, feature=feature,
-                in_crs=in_crs, out_crs=out_crs, resolution=resolution, geom_fix=geom_fix)
+            result = await client.geoquery(**query)
             if isinstance(result, dict) and result.get("error"):
                 error_msg = f"Request failed: {result.get('error_message', 'Unknown error')}"
                 if result.get('status_code'):
                     error_msg = f"Request failed with status {result['status_code']}: {result.get('error_message', 'Unknown error')}"
                 raise APIError(error_msg)
-
-            # Ensure we got an xarray Dataset
+            if isinstance(result, list):
+                result = result[0]
+                timestamp_number = result['request_count']
+                return timestamp_number
            if not isinstance(result, xr.Dataset):
                raise ValueError(f"Expected xarray Dataset, got {type(result)}")
 
            async with lock:
                completed_count += 1
-                if completed_count % max(1, total_geometries // 10) == 0:
-                    client.logger.info(f"Progress: {completed_count}/{total_geometries} geometries processed")
-
-            return result
-
+                if completed_count % max(1, total_number_of_requests // 10) == 0:
+                    client.logger.info(f"Progress: {completed_count}/{total_number_of_requests} requests processed")
+            return result
        except Exception as e:
            async with lock:
                completed_count += 1
@@ -134,10 +284,7 @@ async def request_data(
 
     try:
         async with BoundedTaskGroup(max_concurrency=conc) as tg:
-            tasks = [
-                tg.create_task(process_geometry(gdf.geometry.iloc[idx]))
-                for idx in range(len(gdf))
-            ]
+            tasks = [tg.create_task(single_geo_query(quries[idx])) for idx in range(len(quries))]
             all_results = [task.result() for task in tasks]
 
     except* Exception as eg:
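
The change above fans work out per query dict instead of per geometry, still capped by `BoundedTaskGroup(max_concurrency=conc)`. That class's implementation is not part of this diff; a semaphore-capped equivalent of the pattern, shown for illustration only:

    import asyncio

    async def bounded_gather(coros, max_concurrency):
        # Cap how many coroutines run at once, mirroring
        # BoundedTaskGroup(max_concurrency=conc) in the diff.
        sem = asyncio.Semaphore(max_concurrency)

        async def run(coro):
            async with sem:
                return await coro

        async with asyncio.TaskGroup() as tg:  # Python 3.11+
            tasks = [tg.create_task(run(c)) for c in coros]
        return [t.result() for t in tasks]

    # Usage mirroring the new task list ('quries' is the package's own spelling):
    # results = await bounded_gather([client.geoquery(**q) for q in quries], conc)
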
@@ -145,261 +292,106 @@ async def request_data(
             if hasattr(e, 'response'):
                 raise APIError(f"API request failed: {e.response.text}")
             raise
-
     client.logger.info("All requests completed!")
-
+
     if not all_results:
         raise ValueError("No valid results were returned for any geometry")
-
-    # Create a copy of the input GeoDataFrame
-    result_gdf = gdf.copy()
-
-    # Add the dataset column with the xarray datasets
-    result_gdf['dataset'] = all_results
-
-    return result_gdf
-
+    if isinstance(all_results, list) and type(all_results[0]) == int:
+        return sum(all_results)/len(all_results)
+    else:
+        geometries = []
+        for query in quries:
+            feature = query['feature']
+            geometry = shape(feature['geometry'])
+            geometries.append(geometry)
+        result_gdf = gpd.GeoDataFrame({
+            'geometry': geometries,
+            'dataset': all_results
+        })
+        return result_gdf
 
-import os
-from pathlib import Path
+async def estimate_timestamp_number(
+    client,
+    quries: list[dict],
+):
+    if len(quries) <= 3:
+        return quries
+    sampled_queries = [query.copy() for query in random.sample(quries, 3)]
+    for query in sampled_queries:
+        query['debug'] = 'grpc'
+    result = await request_geoquery_list(client = client, quries = sampled_queries, conc = 5)
+    total_estimated_number_of_timestamps = result * len(quries)
+    return total_estimated_number_of_timestamps
 
-def estimate_dataset_size(dataset):
-    """
-    Estimate the memory size of an xarray dataset in bytes.
-
-    Args:
-        dataset: xarray Dataset
-
-    Returns:
-        int: Estimated size in bytes
-    """
-    total_size = 0
-    for var_name, var in dataset.data_vars.items():
-        # Get the dtype size in bytes
-        dtype_size = var.dtype.itemsize
-        # Get the total number of elements
-        total_elements = var.size
-        # Calculate total size for this variable
-        total_size += dtype_size * total_elements
-
-    # Add coordinate sizes
-    for coord_name, coord in dataset.coords.items():
-        if coord_name not in dataset.dims:  # Don't double count dimension coordinates
-            dtype_size = coord.dtype.itemsize
-            total_elements = coord.size
-            total_size += dtype_size * total_elements
-
-    return total_size
 
-def save_dataset_to_file(dataset, filepath):
+def get_available_memory_mb():
     """
-    Save dataset to NetCDF file.
+    Get available system memory in MB
 
-    Args:
-        dataset: xarray Dataset
-        filepath: Path to save the file
-
     Returns:
-        str: Path to saved file
+        float: Available memory in MB
     """
-    filepath = Path(filepath)
-
-    if not str(filepath).endswith('.nc'):
-        filepath = filepath.with_suffix('.nc')
-
-    dataset.to_netcdf(filepath)
-    return str(filepath)
+    memory = psutil.virtual_memory()
+    available_mb = memory.available / (1024 * 1024)
+    return round(available_mb, 2)
 
-def post_processing(
-    gdf_with_datasets: GeoDataFrame,
-    spatial_reduction: str = None,
-    temporal_reduction: str = None,
-    drop_nan: bool = False,
-    inplace: bool = False,
-    stream_to_disk: bool = False,
+async def local_or_remote(
+    client,
+    quries: list[dict],
 ):
-    """
-    Post-process the GeoDataFrame with datasets to extract variables with optional reductions.
-
-    Args:
-        gdf_with_datasets (GeoDataFrame): GeoDataFrame with 'dataset' column containing xarray Datasets
-        spatial_reduction (str): Reduction operation for spatial dimensions (x, y).
-            Options: 'mean', 'median', 'min', 'max', 'std', 'var', 'sum', 'count'
-        temporal_reduction (str): Reduction operation for temporal dimension (time).
-            Options: 'mean', 'median', 'min', 'max', 'std', 'var', 'sum', 'count'
-        drop_nan (bool): Whether to drop NaN values from the results (default False)
-        inplace (bool): Whether to modify the input GeoDataFrame in place
-        stream_to_disk (bool): Whether to stream datasets to disk as NetCDF files (default False)
-
-    Returns:
-        geopandas.GeoDataFrame: GeoDataFrame with variable dataarrays/values or file paths in separate columns.
-        If stream_to_disk=True, large datasets are saved as NetCDF files with file paths stored.
-    """
-    if 'dataset' not in gdf_with_datasets.columns:
-        raise ValueError("Input GeoDataFrame must contain a 'dataset' column")
-
-    # Validate reduction parameters
-    valid_reductions = ['mean', 'median', 'min', 'max', 'std', 'var', 'sum', 'count']
-    if spatial_reduction and spatial_reduction not in valid_reductions:
-        raise ValueError(f"spatial_reduction must be one of {valid_reductions}")
-    if temporal_reduction and temporal_reduction not in valid_reductions:
-        raise ValueError(f"temporal_reduction must be one of {valid_reductions}")
-
-    result_rows = []
-    geometries = []
-
-    # Process each row (geometry + dataset)
-    for i, row in gdf_with_datasets.iterrows():
-        dataset = row['dataset']
-
-        # Create new row for this geometry
-        new_row = {}
-
-        # Copy original GeoDataFrame attributes (excluding dataset column)
-        for col in gdf_with_datasets.columns:
-            if col not in ['geometry', 'dataset']:
-                new_row[col] = row[col]
-
-        # Process each variable in the dataset
-        data_vars = list(dataset.data_vars.keys())
-        for var_name in data_vars:
-            var_data = dataset[var_name]
-
-            # Apply drop_nan if requested
-            if drop_nan:
-                # Drop spatial dimensions where all values are NaN
-                var_data = var_data.dropna(dim='x', how='all').dropna(dim='y', how='all')
-
-                # Drop time dimensions where all values are NaN
-                if 'time' in var_data.dims:
-                    var_data = var_data.dropna(dim='time', how='all')
-
-            # Check current dimensions to determine if aggregation is needed
-            current_dims = set(var_data.dims)
-            has_spatial_dims = bool(current_dims.intersection(['x', 'y']))
-            has_temporal_dim = 'time' in current_dims
-
-            # Apply spatial reduction only if spatial dimensions exist and reduction is requested
-            if spatial_reduction and has_spatial_dims:
-                spatial_dims = [dim for dim in ['x', 'y'] if dim in var_data.dims]
-                if spatial_dims:
-                    if spatial_reduction == 'count':
-                        var_data = var_data.count(dim=spatial_dims)
-                    else:
-                        var_data = getattr(var_data, spatial_reduction)(dim=spatial_dims)
-
-            # Apply temporal reduction only if time dimension exists and reduction is requested
-            if temporal_reduction and has_temporal_dim:
-                if temporal_reduction == 'count':
-                    var_data = var_data.count(dim='time')
-                else:
-                    var_data = getattr(var_data, temporal_reduction)(dim='time')
-
-            # Handle streaming to disk if requested
-            if stream_to_disk:
-                # Create a single-variable dataset for saving
-                single_var_dataset = var_data.to_dataset(name=var_name)
-
-                # Generate filename based on row index and variable name
-                filename = f"geometry_{i}_{var_name}.nc"
-                filepath = os.path.join(os.getcwd(), filename)
-
-                # Save to disk and store file path
-                saved_path = save_dataset_to_file(single_var_dataset, filepath)
-                new_row[var_name] = f"file://{saved_path}"
-
-                print(f"Dataset for geometry {i}, variable '{var_name}' saved to: {saved_path}")
-            else:
-                # Keep in memory
-                new_row[var_name] = var_data
-
-        result_rows.append(new_row)
-        geometries.append(row['geometry'])
-
-    # Create the result GeoDataFrame with default integer index
-    result_gdf = GeoDataFrame(result_rows, geometry=geometries)
-
-    if inplace:
-        # Clear original gdf and replace with result_gdf content
-        gdf_with_datasets.drop(gdf_with_datasets.index, inplace=True)
-        gdf_with_datasets.drop(gdf_with_datasets.columns, axis=1, inplace=True)
-
-        # Copy all data from result_gdf to gdf
-        for col in result_gdf.columns:
-            gdf_with_datasets[col] = result_gdf[col].values
-
-        # Ensure it remains a GeoDataFrame with correct geometry
-        gdf_with_datasets.geometry = result_gdf.geometry
-
-        return None
+    if len(quries) > 1000:
+        return {
+            "local_or_remote": "remote",
+            "reason": "The number of the requests is too large(>1000), please set the mass_stats parameter to True",
+        }
+    elif await estimate_timestamp_number(client = client, quries = quries) > 25000:
+        return {
+            "local_or_remote": "remote",
+            "reason": "The time taking for making these requests is too long, please set the mass_stats parameter to True",
+        }
+    elif await estimate_query_size(client = client, quries = quries) > get_available_memory_mb():
+        return {
+            "local_or_remote": "remote",
+            "reason": "The size of the dataset is too large, please set the mass_stats parameter to True",
+        }
     else:
-        return result_gdf
-
+        return {
+            "local_or_remote": "local",
+            "reason": "The number of the requests is not too large, and the time taking for making these requests is not too long, and the size of the dataset is not too large",
+        }
 
-# Updated zonal_stats function that uses both parts
 async def zonal_stats(
     client,
     gdf: GeoDataFrame,
     expr: str,
     conc: int = 20,
-    inplace: bool = False,
     in_crs: str = "epsg:4326",
     out_crs: str = "epsg:4326",
     resolution: int = -1,
     geom_fix: bool = False,
-    drop_nan: bool = False,
-    spatial_reduction: str = None,
-    temporal_reduction: str = None,
-    max_memory_mb: int = 500,
-    stream_to_disk: bool = False,
 ):
-    """
-    Compute zonal statistics for all geometries in a GeoDataFrame.
-    This is a convenience function that combines request_data and post_processing.
-
-    Args:
-        client: The AsyncClient instance
-        gdf (GeoDataFrame): GeoDataFrame containing geometries
-        expr (str): Terrakio expression to evaluate, can include spatial aggregations
-        conc (int): Number of concurrent requests to make
-        inplace (bool): Whether to modify the input GeoDataFrame in place
-        in_crs (str): Input coordinate reference system
-        out_crs (str): Output coordinate reference system
-        resolution (int): Resolution parameter
-        geom_fix (bool): Whether to fix the geometry (default False)
-        drop_nan (bool): Whether to drop NaN values from the results (default False)
-        spatial_reduction (str): Reduction operation for spatial dimensions (x, y).
-            Options: 'mean', 'median', 'min', 'max', 'std', 'var', 'sum', 'count'
-        temporal_reduction (str): Reduction operation for temporal dimension (time).
-            Options: 'mean', 'median', 'min', 'max', 'std', 'var', 'sum', 'count'
-        max_memory_mb (int): Maximum memory threshold in MB (default 500MB)
-        stream_to_disk (bool): Whether to stream datasets to disk as NetCDF files (default False)
-    """
-    # Step 1: Request data (with memory estimation)
-    gdf_with_datasets = await request_data(
-        client=client,
-        gdf=gdf,
-        expr=expr,
-        conc=conc,
-        in_crs=in_crs,
-        out_crs=out_crs,
-        resolution=resolution,
-        geom_fix=geom_fix,
-        max_memory_mb=max_memory_mb,
-        stream_to_disk=stream_to_disk
-    )
-
-    # Step 2: Post-process with reductions and optional streaming
-    result = post_processing(
-        gdf_with_datasets=gdf_with_datasets,
-        spatial_reduction=spatial_reduction,
-        temporal_reduction=temporal_reduction,
-        drop_nan=drop_nan,
-        inplace=inplace,
-        stream_to_disk=stream_to_disk
-    )
-
-    return result
+    """Compute zonal statistics for all geometries in a GeoDataFrame."""
+    quries = []
+    for i in range(len(gdf)):
+        quries.append({
+            "expr": expr,
+            "feature": {
+                "type": "Feature",
+                "geometry": mapping(gdf.geometry.iloc[i]),
+                "properties": {}
+            },
+            "in_crs": in_crs,
+            "out_crs": out_crs,
+            "resolution": resolution,
+            "geom_fix": geom_fix,
+        })
+    local_or_remote_result = await local_or_remote(client= client, quries = quries)
+    if local_or_remote_result["local_or_remote"] == "remote":
+        raise ValueError(local_or_remote_result["reason"])
+    else:
+        gdf_with_datasets = await request_geoquery_list(client = client, quries = quries, conc = conc)
+        gdf_with_datasets = expand_on_variables_and_time(gdf_with_datasets)
+        return gdf_with_datasets
 
 async def create_dataset_file(
     client,