terrakio-core 0.4.3__py3-none-any.whl → 0.4.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of terrakio-core might be problematic.
- terrakio_core/__init__.py +3 -1
- terrakio_core/accessors.py +477 -0
- terrakio_core/async_client.py +23 -38
- terrakio_core/client.py +83 -84
- terrakio_core/convenience_functions/convenience_functions.py +316 -324
- terrakio_core/endpoints/auth.py +8 -1
- terrakio_core/endpoints/mass_stats.py +13 -9
- terrakio_core/endpoints/model_management.py +604 -948
- terrakio_core/sync_client.py +341 -33
- {terrakio_core-0.4.3.dist-info → terrakio_core-0.4.4.dist-info}/METADATA +2 -1
- terrakio_core-0.4.4.dist-info/RECORD +22 -0
- terrakio_core-0.4.3.dist-info/RECORD +0 -21
- {terrakio_core-0.4.3.dist-info → terrakio_core-0.4.4.dist-info}/WHEEL +0 -0
- {terrakio_core-0.4.3.dist-info → terrakio_core-0.4.4.dist-info}/top_level.txt +0 -0
@@ -1,132 +1,282 @@
+
 import os
 import asyncio
 import tempfile
 import time
 import pandas as pd
+import geopandas as gpd
 from geopandas import GeoDataFrame
 from shapely.geometry import mapping
+from pathlib import Path
 from ..exceptions import APIError, ConfigurationError
 from ..helper.bounded_taskgroup import BoundedTaskGroup
 from ..helper.tiles import tiles
 import uuid
 import xarray as xr
+import random
+import psutil
+import copy
+from shapely.geometry import shape
+from shapely.ops import transform
+from shapely.geometry import box
+import pyproj
 
-
-
-
-
-
-
-
-
-
-    max_memory_mb: int = 500,
-    stream_to_disk: bool = None,
-):
+import pandas as pd
+import geopandas as gpd
+
+def expand_on_time(gdf):
+    """
+    Expand datasets on time dimension - each time becomes a new row.
+
+    Input: GeoDataFrame with 'geometry' and 'dataset' columns (or variable columns)
+    Output: GeoDataFrame with time in multi-index and datasets without time coordinate
     """
-
+    rows = []
+
+    for idx, row in gdf.iterrows():
+        if 'geometry' in gdf.columns:
+            geometry = row['geometry']
+        elif gdf.index.name == 'geometry':
+            geometry = idx
+        else:
+            raise ValueError(f"Cannot find geometry in columns: {list(gdf.columns)} or index: {gdf.index.name}")
+
+        if 'dataset' in gdf.columns:
+            dataset = row['dataset']
+
+            if 'time' in dataset.dims:
+                for time_val in dataset.time.values:
+                    time_slice = dataset.sel(time=time_val).drop_vars('time')
+                    rows.append({
+                        'geometry': geometry,
+                        'time': time_val,
+                        'dataset': time_slice
+                    })
+            else:
+                rows.append({
+                    'geometry': geometry,
+                    'dataset': dataset
+                })
+        else:
+            variable_columns = list(gdf.columns)
+
+            first_dataset = row[variable_columns[0]]
+            if 'time' in first_dataset.dims:
+                time_values = first_dataset.time.values
+
+                for time_val in time_values:
+                    row_data = {'geometry': geometry, 'time': time_val}
+
+                    for var_col in variable_columns:
+                        dataset = row[var_col]
+                        time_slice = dataset.sel(time=time_val).drop_vars('time')
+                        row_data[var_col] = time_slice
+
+                    rows.append(row_data)
+            else:
+                row_data = {'geometry': geometry}
+                for var_col in variable_columns:
+                    row_data[var_col] = row[var_col]
+                rows.append(row_data)
+
+    result_df = pd.DataFrame(rows)
+
+    if 'time' in result_df.columns:
+        result_gdf = gpd.GeoDataFrame(result_df, geometry='geometry')
+        result_gdf = result_gdf.set_index(['geometry', 'time'])
+    else:
+        result_gdf = gpd.GeoDataFrame(result_df, geometry='geometry')
+        result_gdf = result_gdf.set_index(['geometry'])
+
+    return result_gdf
 
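Note: a minimal sketch of what the new expand_on_time does, with illustrative data that is not part of this diff (the import path is assumed from the file shown above):

    import numpy as np
    import xarray as xr
    import geopandas as gpd
    from shapely.geometry import Point
    # assumed import path, based on this diff's file location
    from terrakio_core.convenience_functions.convenience_functions import expand_on_time

    # One geometry whose dataset carries two timestamps.
    ds = xr.Dataset(
        {"ndvi": (("time", "y", "x"), np.random.rand(2, 3, 3))},
        coords={"time": ["2024-01-01", "2024-02-01"]},
    )
    gdf = gpd.GeoDataFrame({"geometry": [Point(0, 0)], "dataset": [ds]})

    expanded = expand_on_time(gdf)
    print(expanded.index.names)  # ['geometry', 'time'] - one row per timestamp
    # Each 'dataset' cell now holds a 3x3 slice with the 'time' coordinate dropped.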
-
-
-
-
-
-
-
-
-
-
-
+def expand_on_variables(gdf):
+    """
+    Expand datasets on variables dimension - each variable becomes a new column.
+
+    Input: GeoDataFrame with 'geometry' and 'dataset' columns (or already time-expanded)
+    Output: GeoDataFrame with separate column for each variable
+    """
+    rows = []
+
+    for idx, row in gdf.iterrows():
+        if 'geometry' in gdf.columns:
+            geometry = row['geometry']
+        elif hasattr(gdf.index, 'names') and 'geometry' in gdf.index.names:
+            if isinstance(idx, tuple):
+                geometry_idx = gdf.index.names.index('geometry')
+                geometry = idx[geometry_idx]
+                time_idx = gdf.index.names.index('time')
+                time_val = idx[time_idx]
+            else:
+                geometry = idx
+                time_val = None
+        else:
+            raise ValueError(f"Cannot find geometry in columns: {list(gdf.columns)} or index: {gdf.index.names}")
+
+        if 'dataset' in gdf.columns:
+            dataset = row['dataset']
+
+            var_names = list(dataset.data_vars.keys())
+
+            if len(var_names) <= 1:
+                if len(var_names) == 0:
+                    continue
+
+            if hasattr(gdf.index, 'names') and 'time' in gdf.index.names:
+                row_data = {'geometry': geometry, 'time': time_val}
+            else:
+                row_data = {'geometry': geometry}
+
+            for var_name in var_names:
+                var_dataset = dataset[[var_name]]
+
+                if len(var_dataset.dims) == 0:
+                    row_data[var_name] = float(var_dataset[var_name].values)
+                else:
+                    row_data[var_name] = var_dataset
+
+            rows.append(row_data)
+        else:
+            raise ValueError("Expected 'dataset' column for variable expansion")
+
+    result_df = pd.DataFrame(rows)
+
+    if 'time' in result_df.columns:
+        result_gdf = gpd.GeoDataFrame(result_df, geometry='geometry')
+        result_gdf = result_gdf.set_index(['geometry', 'time'])
+    else:
+        result_gdf = gpd.GeoDataFrame(result_df, geometry='geometry')
+        result_gdf = result_gdf.set_index(['geometry'])
+
+    return result_gdf
 
-    Returns:
-        geopandas.GeoDataFrame: Copy of input GeoDataFrame with additional 'dataset' column
-        containing the xarray Dataset for each geometry.
 
-
-        ValueError: If concurrency is too high or if data exceeds memory limit without streaming
-        APIError: If the API request fails
+def expand_on_variables_and_time(gdf):
     """
-
-
+    Convenience function to expand on both variables and time.
+    Automatically detects which expansions are possible.
+    """
+    try:
+        expanded_on_time = expand_on_time(gdf)
+    except Exception as e:
+        expanded_on_time = gdf
 
-
+    try:
+        expanded_on_variables_and_time = expand_on_variables(expanded_on_time)
+        return expanded_on_variables_and_time
+    except Exception as e:
+        return expanded_on_time
+
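Note: expand_on_variables_and_time simply chains the two helpers above, and each stage falls back on any exception, so a GeoDataFrame that cannot be expanded is returned unchanged rather than raising. The bound exception variable e is unused in both handlers, which means failures are silently swallowed.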
+def estimate_geometry_size_ratio(queries: list):
+    """Calculate size ratios for all geometries relative to the first geometry using bounding box area."""
 
-
-    client.logger.info("Estimating total memory usage...")
-    first_geom = gdf.geometry.iloc[0]
-    feature = {
-        "type": "Feature",
-        "geometry": mapping(first_geom),
-        "properties": {}
-    }
+    areas = []
 
-
-
-
-    if isinstance(first_result, dict) and first_result.get("error"):
-        error_msg = f"Request failed: {first_result.get('error_message', 'Unknown error')}"
-        if first_result.get('status_code'):
-            error_msg = f"Request failed with status {first_result['status_code']}: {first_result.get('error_message', 'Unknown error')}"
-        raise APIError(error_msg)
+    for query in queries:
+        geom = shape(query["feature"]["geometry"])
+        in_crs = query["in_crs"]
 
-    if
-
+        if in_crs and in_crs != 'EPSG:3857':
+            transformer = pyproj.Transformer.from_crs(in_crs, 'EPSG:3857', always_xy=True)
+            transformed_geom = transform(transformer.transform, geom)
+            bbox = box(*transformed_geom.bounds)
+            area = bbox.area
+        else:
+            bbox = box(*geom.bounds)
+            area = bbox.area
 
-
-
-
-
-
-
+        areas.append(area)
+    base_area = areas[0]
+
+    if base_area == 0:
+        non_zero_areas = [area for area in areas if area > 0]
+        base_area = non_zero_areas[0] if non_zero_areas else 1.0
+
+    ratios = []
+    for area in areas:
+        if area == 0:
+            ratios.append(0.1)
+        else:
+            ratios.append(area / base_area)
+
+    return ratios
+
+async def estimate_query_size(
+    client,
+    quries: list[dict],
+):
+    first_query = quries[0]
+
+    first_query_dataset = await client.geoquery(**first_query)
+    ratios = estimate_geometry_size_ratio(quries)
+    total_size_mb = 0
+    for i in range(len(ratios)):
+        total_size_mb += first_query_dataset.nbytes * ratios[i] / (1024**2)
+    return total_size_mb
+
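Note: estimate_query_size fetches only the first query's dataset and extrapolates with the bounding-box area ratios from estimate_geometry_size_ratio. A back-of-envelope check with illustrative numbers (not from the package):

    # Sampled dataset is 8 MiB; the other two geometries have bbox areas
    # 4x and 0.25x that of the first one.
    first_nbytes = 8 * 1024**2
    ratios = [1.0, 4.0, 0.25]
    total_mb = sum(first_nbytes * r / 1024**2 for r in ratios)
    print(total_mb)  # 42.0 -> estimated footprint of all three queries in MB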
+async def request_geoquery_list(
+    client,
+    quries: list[dict],
+    conc: int = 20,
+):
+    """
+    Execute multiple geo queries.
+
+    Args:
+        client: The Terrakio client instance
+        quries: List of dictionaries containing query parameters
+        conc: The concurrency level for the requests
 
-
-
-    # Auto-determine based on memory usage
-    if total_size_mb > max_memory_mb:
-        client.logger.warning(f"The data you are requesting exceeds {max_memory_mb} MB, we recommend you to set the stream_to_disk parameter to True")
-        raise ValueError(f"The data you are requesting exceeds {max_memory_mb} MB, we recommend you to set the stream_to_disk parameter to True")
+    Returns:
+        List of query results
 
-
-
-
-
-        raise
-
-
+    Raises:
+        ValueError: If the queries list is empty
+    """
+    if not quries:
+        raise ValueError("Queries list cannot be empty")
+    if conc > 100:
+        raise ValueError("Concurrency (conc) is too high. Please set conc to 100 or less.")
+
+    for i, query in enumerate(quries):
+        if 'expr' not in query:
+            raise ValueError(f"Query at index {i} is missing the required 'expr' key")
+        if 'feature' not in query:
+            raise ValueError(f"Query at index {i} is missing the required 'feature' key")
+        if 'in_crs' not in query:
+            raise ValueError(f"Query at index {i} is missing the required 'in_crs' key")
 
     completed_count = 0
     lock = asyncio.Lock()
-
-
-
-        nonlocal completed_count
+    async def single_geo_query(query):
+        """
+        Execute multiple geo queries concurrently.
 
+        Args:
+            quries: List of dictionaries containing query parameters
+        """
+        total_number_of_requests = len(quries)
+        nonlocal completed_count
         try:
-
-                "type": "Feature",
-                "geometry": mapping(geom),
-                "properties": {}
-            }
-            # Request xarray dataset
-            result = await client.geoquery(expr=expr, feature=feature,
-                in_crs=in_crs, out_crs=out_crs, resolution=resolution, geom_fix=geom_fix)
+            result = await client.geoquery(**query)
             if isinstance(result, dict) and result.get("error"):
                 error_msg = f"Request failed: {result.get('error_message', 'Unknown error')}"
                 if result.get('status_code'):
                     error_msg = f"Request failed with status {result['status_code']}: {result.get('error_message', 'Unknown error')}"
                 raise APIError(error_msg)
-
-
+            if isinstance(result, list):
+                result = result[0]
+                timestamp_number = result['request_count']
+                return timestamp_number
             if not isinstance(result, xr.Dataset):
                 raise ValueError(f"Expected xarray Dataset, got {type(result)}")
 
             async with lock:
                 completed_count += 1
-                if completed_count % max(1,
-                    client.logger.info(f"Progress: {completed_count}/{
-
-            return result
-
+                if completed_count % max(1, total_number_of_requests // 10) == 0:
+                    client.logger.info(f"Progress: {completed_count}/{total_number_of_requests} requests processed")
+            return result
         except Exception as e:
             async with lock:
                 completed_count += 1
@@ -134,10 +284,7 @@ async def request_data(
 
     try:
         async with BoundedTaskGroup(max_concurrency=conc) as tg:
-            tasks = [
-                tg.create_task(process_geometry(gdf.geometry.iloc[idx]))
-                for idx in range(len(gdf))
-            ]
+            tasks = [tg.create_task(single_geo_query(quries[idx])) for idx in range(len(quries))]
             all_results = [task.result() for task in tasks]
 
     except* Exception as eg:
@@ -145,261 +292,106 @@ async def request_data(
         if hasattr(e, 'response'):
             raise APIError(f"API request failed: {e.response.text}")
         raise
-
     client.logger.info("All requests completed!")
-
+
     if not all_results:
         raise ValueError("No valid results were returned for any geometry")
-
-
-
-
-
-
-
-
-
+    if isinstance(all_results, list) and type(all_results[0]) == int:
+        return sum(all_results)/len(all_results)
+    else:
+        geometries = []
+        for query in quries:
+            feature = query['feature']
+            geometry = shape(feature['geometry'])
+            geometries.append(geometry)
+        result_gdf = gpd.GeoDataFrame({
+            'geometry': geometries,
+            'dataset': all_results
+        })
+        return result_gdf
 
-
-
+async def estimate_timestamp_number(
+    client,
+    quries: list[dict],
+):
+    if len(quries) <= 3:
+        return quries
+    sampled_queries = [query.copy() for query in random.sample(quries, 3)]
+    for query in sampled_queries:
+        query['debug'] = 'grpc'
+    result = await request_geoquery_list(client = client, quries = sampled_queries, conc = 5)
+    total_estimated_number_of_timestamps = result * len(quries)
+    return total_estimated_number_of_timestamps
 
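Note: request_geoquery_list now has two return shapes. When the per-query results are ints (the 'request_count' values that come back for debug queries) it returns their mean; otherwise it returns a GeoDataFrame with a 'dataset' column of xarray Datasets. estimate_timestamp_number relies on the first shape: it samples three queries with debug='grpc' and multiplies the mean count by len(quries). Two apparent rough edges in the released code: the len(quries) <= 3 branch returns the query list itself rather than a timestamp count, and the misspelled parameter name quries is part of the public signature.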
-def estimate_dataset_size(dataset):
-    """
-    Estimate the memory size of an xarray dataset in bytes.
-
-    Args:
-        dataset: xarray Dataset
-
-    Returns:
-        int: Estimated size in bytes
-    """
-    total_size = 0
-    for var_name, var in dataset.data_vars.items():
-        # Get the dtype size in bytes
-        dtype_size = var.dtype.itemsize
-        # Get the total number of elements
-        total_elements = var.size
-        # Calculate total size for this variable
-        total_size += dtype_size * total_elements
-
-    # Add coordinate sizes
-    for coord_name, coord in dataset.coords.items():
-        if coord_name not in dataset.dims:  # Don't double count dimension coordinates
-            dtype_size = coord.dtype.itemsize
-            total_elements = coord.size
-            total_size += dtype_size * total_elements
-
-    return total_size
 
|
-
def
|
|
327
|
+
def get_available_memory_mb():
|
|
195
328
|
"""
|
|
196
|
-
|
|
329
|
+
Get available system memory in MB
|
|
197
330
|
|
|
198
|
-
Args:
|
|
199
|
-
dataset: xarray Dataset
|
|
200
|
-
filepath: Path to save the file
|
|
201
|
-
|
|
202
331
|
Returns:
|
|
203
|
-
|
|
332
|
+
float: Available memory in MB
|
|
204
333
|
"""
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
filepath = filepath.with_suffix('.nc')
|
|
209
|
-
|
|
210
|
-
dataset.to_netcdf(filepath)
|
|
211
|
-
return str(filepath)
|
|
334
|
+
memory = psutil.virtual_memory()
|
|
335
|
+
available_mb = memory.available / (1024 * 1024)
|
|
336
|
+
return round(available_mb, 2)
|
|
212
337
|
|
|
-def
-
-
-    temporal_reduction: str = None,
-    drop_nan: bool = False,
-    inplace: bool = False,
-    stream_to_disk: bool = False,
+async def local_or_remote(
+    client,
+    quries: list[dict],
 ):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    If stream_to_disk=True, large datasets are saved as NetCDF files with file paths stored.
-    """
-    if 'dataset' not in gdf_with_datasets.columns:
-        raise ValueError("Input GeoDataFrame must contain a 'dataset' column")
-
-    # Validate reduction parameters
-    valid_reductions = ['mean', 'median', 'min', 'max', 'std', 'var', 'sum', 'count']
-    if spatial_reduction and spatial_reduction not in valid_reductions:
-        raise ValueError(f"spatial_reduction must be one of {valid_reductions}")
-    if temporal_reduction and temporal_reduction not in valid_reductions:
-        raise ValueError(f"temporal_reduction must be one of {valid_reductions}")
-
-    result_rows = []
-    geometries = []
-
-    # Process each row (geometry + dataset)
-    for i, row in gdf_with_datasets.iterrows():
-        dataset = row['dataset']
-
-        # Create new row for this geometry
-        new_row = {}
-
-        # Copy original GeoDataFrame attributes (excluding dataset column)
-        for col in gdf_with_datasets.columns:
-            if col not in ['geometry', 'dataset']:
-                new_row[col] = row[col]
-
-        # Process each variable in the dataset
-        data_vars = list(dataset.data_vars.keys())
-        for var_name in data_vars:
-            var_data = dataset[var_name]
-
-            # Apply drop_nan if requested
-            if drop_nan:
-                # Drop spatial dimensions where all values are NaN
-                var_data = var_data.dropna(dim='x', how='all').dropna(dim='y', how='all')
-
-                # Drop time dimensions where all values are NaN
-                if 'time' in var_data.dims:
-                    var_data = var_data.dropna(dim='time', how='all')
-
-            # Check current dimensions to determine if aggregation is needed
-            current_dims = set(var_data.dims)
-            has_spatial_dims = bool(current_dims.intersection(['x', 'y']))
-            has_temporal_dim = 'time' in current_dims
-
-            # Apply spatial reduction only if spatial dimensions exist and reduction is requested
-            if spatial_reduction and has_spatial_dims:
-                spatial_dims = [dim for dim in ['x', 'y'] if dim in var_data.dims]
-                if spatial_dims:
-                    if spatial_reduction == 'count':
-                        var_data = var_data.count(dim=spatial_dims)
-                    else:
-                        var_data = getattr(var_data, spatial_reduction)(dim=spatial_dims)
-
-            # Apply temporal reduction only if time dimension exists and reduction is requested
-            if temporal_reduction and has_temporal_dim:
-                if temporal_reduction == 'count':
-                    var_data = var_data.count(dim='time')
-                else:
-                    var_data = getattr(var_data, temporal_reduction)(dim='time')
-
-            # Handle streaming to disk if requested
-            if stream_to_disk:
-                # Create a single-variable dataset for saving
-                single_var_dataset = var_data.to_dataset(name=var_name)
-
-                # Generate filename based on row index and variable name
-                filename = f"geometry_{i}_{var_name}.nc"
-                filepath = os.path.join(os.getcwd(), filename)
-
-                # Save to disk and store file path
-                saved_path = save_dataset_to_file(single_var_dataset, filepath)
-                new_row[var_name] = f"file://{saved_path}"
-
-                print(f"Dataset for geometry {i}, variable '{var_name}' saved to: {saved_path}")
-            else:
-                # Keep in memory
-                new_row[var_name] = var_data
-
-        result_rows.append(new_row)
-        geometries.append(row['geometry'])
-
-    # Create the result GeoDataFrame with default integer index
-    result_gdf = GeoDataFrame(result_rows, geometry=geometries)
-
-    if inplace:
-        # Clear original gdf and replace with result_gdf content
-        gdf_with_datasets.drop(gdf_with_datasets.index, inplace=True)
-        gdf_with_datasets.drop(gdf_with_datasets.columns, axis=1, inplace=True)
-
-        # Copy all data from result_gdf to gdf
-        for col in result_gdf.columns:
-            gdf_with_datasets[col] = result_gdf[col].values
-
-        # Ensure it remains a GeoDataFrame with correct geometry
-        gdf_with_datasets.geometry = result_gdf.geometry
-
-        return None
+    if len(quries) > 1000:
+        return {
+            "local_or_remote": "remote",
+            "reason": "The number of the requests is too large(>1000), please set the mass_stats parameter to True",
+        }
+    elif await estimate_timestamp_number(client = client, quries = quries) > 25000:
+        return {
+            "local_or_remote": "remote",
+            "reason": "The time taking for making these requests is too long, please set the mass_stats parameter to True",
+        }
+    elif await estimate_query_size(client = client, quries = quries) > get_available_memory_mb():
+        return {
+            "local_or_remote": "remote",
+            "reason": "The size of the dataset is too large, please set the mass_stats parameter to True",
+        }
     else:
-        return
-
+        return {
+            "local_or_remote": "local",
+            "reason": "The number of the requests is not too large, and the time taking for making these requests is not too long, and the size of the dataset is not too large",
+        }
 
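Note: local_or_remote recommends the remote (mass stats) path when any of three thresholds trips: more than 1000 queries, an estimated total above 25000 timestamps, or an estimated in-memory size larger than the machine's available RAM as reported by psutil. The elif chain runs in that order, so the cheap count check short-circuits the two estimation calls.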
-# Updated zonal_stats function that uses both parts
 async def zonal_stats(
     client,
     gdf: GeoDataFrame,
     expr: str,
     conc: int = 20,
-    inplace: bool = False,
     in_crs: str = "epsg:4326",
     out_crs: str = "epsg:4326",
     resolution: int = -1,
     geom_fix: bool = False,
-    drop_nan: bool = False,
-    spatial_reduction: str = None,
-    temporal_reduction: str = None,
-    max_memory_mb: int = 500,
-    stream_to_disk: bool = False,
 ):
-    """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    # Step 1: Request data (with memory estimation)
-    gdf_with_datasets = await request_data(
-        client=client,
-        gdf=gdf,
-        expr=expr,
-        conc=conc,
-        in_crs=in_crs,
-        out_crs=out_crs,
-        resolution=resolution,
-        geom_fix=geom_fix,
-        max_memory_mb=max_memory_mb,
-        stream_to_disk=stream_to_disk
-    )
-
-    # Step 2: Post-process with reductions and optional streaming
-    result = post_processing(
-        gdf_with_datasets=gdf_with_datasets,
-        spatial_reduction=spatial_reduction,
-        temporal_reduction=temporal_reduction,
-        drop_nan=drop_nan,
-        inplace=inplace,
-        stream_to_disk=stream_to_disk
-    )
-
-    return result
+    """Compute zonal statistics for all geometries in a GeoDataFrame."""
+    quries = []
+    for i in range(len(gdf)):
+        quries.append({
+            "expr": expr,
+            "feature": {
+                "type": "Feature",
+                "geometry": mapping(gdf.geometry.iloc[i]),
+                "properties": {}
+            },
+            "in_crs": in_crs,
+            "out_crs": out_crs,
+            "resolution": resolution,
+            "geom_fix": geom_fix,
+        })
+    local_or_remote_result = await local_or_remote(client= client, quries = quries)
+    if local_or_remote_result["local_or_remote"] == "remote":
+        raise ValueError(local_or_remote_result["reason"])
+    else:
+        gdf_with_datasets = await request_geoquery_list(client = client, quries = quries, conc = conc)
+        gdf_with_datasets = expand_on_variables_and_time(gdf_with_datasets)
+        return gdf_with_datasets
 
396
|
async def create_dataset_file(
|
|
405
397
|
client,
|