terrakio-core 0.3.8__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of terrakio-core might be problematic.

@@ -9,36 +9,41 @@ from ..exceptions import APIError, ConfigurationError
 from ..helper.bounded_taskgroup import BoundedTaskGroup
 from ..helper.tiles import tiles
 import uuid
+import xarray as xr
 
-async def zonal_stats(
+async def request_data(
     client,
     gdf: GeoDataFrame,
     expr: str,
     conc: int = 20,
-    inplace: bool = False,
     in_crs: str = "epsg:4326",
     out_crs: str = "epsg:4326",
     resolution: int = -1,
     geom_fix: bool = False,
+    max_memory_mb: int = 500,
+    stream_to_disk: bool = None,
 ):
     """
-    Compute zonal statistics for all geometries in a GeoDataFrame.
+    Request xarray datasets for all geometries in a GeoDataFrame.
 
     Args:
        client: The AsyncClient instance
        gdf (GeoDataFrame): GeoDataFrame containing geometries
        expr (str): Terrakio expression to evaluate, can include spatial aggregations
        conc (int): Number of concurrent requests to make
-       inplace (bool): Whether to modify the input GeoDataFrame in place
        in_crs (str): Input coordinate reference system
        out_crs (str): Output coordinate reference system
        resolution (int): Resolution parameter
        geom_fix (bool): Whether to fix the geometry (default False)
+       max_memory_mb (int): Maximum memory threshold in MB (default 500MB)
+       stream_to_disk (bool): Whether to stream large datasets to disk. If None, will be determined automatically.
+
    Returns:
-       geopandas.GeoDataFrame: GeoDataFrame with added columns for results, or None if inplace=True
+       geopandas.GeoDataFrame: Copy of input GeoDataFrame with additional 'dataset' column
+           containing the xarray Dataset for each geometry.
 
    Raises:
-       ValueError: If concurrency is too high
+       ValueError: If concurrency is too high or if data exceeds memory limit without streaming
        APIError: If the API request fails
    """
    if conc > 100:
@@ -46,32 +51,74 @@ async def zonal_stats(
 
    total_geometries = len(gdf)
 
+   # First, make a request with the first geometry to estimate total memory usage
+   client.logger.info("Estimating total memory usage...")
+   first_geom = gdf.geometry.iloc[0]
+   feature = {
+       "type": "Feature",
+       "geometry": mapping(first_geom),
+       "properties": {}
+   }
+
+   try:
+       first_result = await client.geoquery(expr=expr, feature=feature,
+           in_crs=in_crs, out_crs=out_crs, resolution=resolution, geom_fix=geom_fix)
+       if isinstance(first_result, dict) and first_result.get("error"):
+           error_msg = f"Request failed: {first_result.get('error_message', 'Unknown error')}"
+           if first_result.get('status_code'):
+               error_msg = f"Request failed with status {first_result['status_code']}: {first_result.get('error_message', 'Unknown error')}"
+           raise APIError(error_msg)
+
+       if not isinstance(first_result, xr.Dataset):
+           raise ValueError(f"Expected xarray Dataset, got {type(first_result)}")
+
+       # Estimate total memory usage
+       single_dataset_size_bytes = estimate_dataset_size(first_result)
+       total_size_bytes = single_dataset_size_bytes * total_geometries
+       total_size_mb = total_size_bytes / (1024 * 1024)
+
+       client.logger.info(f"Estimated total memory usage: {total_size_mb:.2f} MB for {total_geometries} geometries")
+
+       # Check if we need to stream to disk
+       if stream_to_disk is None:
+           # Auto-determine based on memory usage
+           if total_size_mb > max_memory_mb:
+               client.logger.warning(f"The data you are requesting exceeds {max_memory_mb} MB, we recommend you to set the stream_to_disk parameter to True")
+               raise ValueError(f"The data you are requesting exceeds {max_memory_mb} MB, we recommend you to set the stream_to_disk parameter to True")
+
+   except Exception as e:
+       if "recommend you to set the stream_to_disk parameter to True" in str(e):
+           raise
+       client.logger.error(f"Failed to estimate memory usage: {e}")
+       raise
+
    client.logger.info(f"Processing {total_geometries} geometries with concurrency {conc}")
 
    completed_count = 0
    lock = asyncio.Lock()
 
-   async def process_geometry(geom, index):
-       """Process a single geometry"""
+   async def process_geometry(geom):
+       """Process a single geometry and return the dataset"""
        nonlocal completed_count
 
        try:
            feature = {
                "type": "Feature",
                "geometry": mapping(geom),
-               "properties": {"index": index}
+               "properties": {}
            }
-           result = await client.geoquery(expr=expr, feature=feature, output="csv",
+           # Request xarray dataset
+           result = await client.geoquery(expr=expr, feature=feature,
                in_crs=in_crs, out_crs=out_crs, resolution=resolution, geom_fix=geom_fix)
-
            if isinstance(result, dict) and result.get("error"):
-               error_msg = f"Request {index} failed: {result.get('error_message', 'Unknown error')}"
+               error_msg = f"Request failed: {result.get('error_message', 'Unknown error')}"
                if result.get('status_code'):
-                   error_msg = f"Request {index} failed with status {result['status_code']}: {result.get('error_message', 'Unknown error')}"
+                   error_msg = f"Request failed with status {result['status_code']}: {result.get('error_message', 'Unknown error')}"
                raise APIError(error_msg)
 
-           if isinstance(result, pd.DataFrame):
-               result['_geometry_index'] = index
+           # Ensure we got an xarray Dataset
+           if not isinstance(result, xr.Dataset):
+               raise ValueError(f"Expected xarray Dataset, got {type(result)}")
 
            async with lock:
                completed_count += 1
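
A worked example of the estimate above (illustrative numbers, not taken from the package): one float32 variable on a 256 x 256 grid with 24 time steps occupies 256 * 256 * 24 * 4 bytes, about 6 MB per geometry, so 100 such geometries are estimated at roughly 600 MB and would trip the default 500 MB threshold when stream_to_disk is left unset.

# Back-of-envelope sketch mirroring estimate_dataset_size (assumed shapes, illustrative only)
single_dataset_size_bytes = 256 * 256 * 24 * 4                    # elements * float32 itemsize
total_size_mb = single_dataset_size_bytes * 100 / (1024 * 1024)   # 100 geometries
print(total_size_mb)                                              # 600.0 -> exceeds max_memory_mb=500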
@@ -88,85 +135,271 @@ async def zonal_stats(
    try:
        async with BoundedTaskGroup(max_concurrency=conc) as tg:
            tasks = [
-               tg.create_task(process_geometry(gdf.geometry.iloc[idx], idx))
+               tg.create_task(process_geometry(gdf.geometry.iloc[idx]))
                for idx in range(len(gdf))
            ]
            all_results = [task.result() for task in tasks]
-
+
    except* Exception as eg:
        for e in eg.exceptions:
            if hasattr(e, 'response'):
                raise APIError(f"API request failed: {e.response.text}")
            raise
 
-   client.logger.info("All requests completed! Processing results...")
-
+   client.logger.info("All requests completed!")
+
    if not all_results:
        raise ValueError("No valid results were returned for any geometry")
+
+   # Create a copy of the input GeoDataFrame
+   result_gdf = gdf.copy()
+
+   # Add the dataset column with the xarray datasets
+   result_gdf['dataset'] = all_results
+
+   return result_gdf
+
+
+import os
+from pathlib import Path
+
+def estimate_dataset_size(dataset):
+   """
+   Estimate the memory size of an xarray dataset in bytes.
+
+   Args:
+       dataset: xarray Dataset
 
-   combined_df = pd.concat(all_results, ignore_index=True)
+   Returns:
+       int: Estimated size in bytes
+   """
+   total_size = 0
+   for var_name, var in dataset.data_vars.items():
+       # Get the dtype size in bytes
+       dtype_size = var.dtype.itemsize
+       # Get the total number of elements
+       total_elements = var.size
+       # Calculate total size for this variable
+       total_size += dtype_size * total_elements
 
-   has_time = 'time' in combined_df.columns
+   # Add coordinate sizes
+   for coord_name, coord in dataset.coords.items():
+       if coord_name not in dataset.dims: # Don't double count dimension coordinates
+           dtype_size = coord.dtype.itemsize
+           total_elements = coord.size
+           total_size += dtype_size * total_elements
 
-   if has_time:
-       if '_geometry_index' not in combined_df.columns:
-           raise ValueError("Missing geometry index in results")
+   return total_size
+
+def save_dataset_to_file(dataset, filepath):
+   """
+   Save dataset to NetCDF file.
+
+   Args:
+       dataset: xarray Dataset
+       filepath: Path to save the file
 
-       combined_df.set_index(['_geometry_index', 'time'], inplace=True)
+   Returns:
+       str: Path to saved file
+   """
+   filepath = Path(filepath)
+
+   if not str(filepath).endswith('.nc'):
+       filepath = filepath.with_suffix('.nc')
+
+   dataset.to_netcdf(filepath)
+   return str(filepath)
+
+def post_processing(
+   gdf_with_datasets: GeoDataFrame,
+   spatial_reduction: str = None,
+   temporal_reduction: str = None,
+   drop_nan: bool = False,
+   inplace: bool = False,
+   stream_to_disk: bool = False,
+):
+   """
+   Post-process the GeoDataFrame with datasets to extract variables with optional reductions.
+
+   Args:
+       gdf_with_datasets (GeoDataFrame): GeoDataFrame with 'dataset' column containing xarray Datasets
+       spatial_reduction (str): Reduction operation for spatial dimensions (x, y).
+           Options: 'mean', 'median', 'min', 'max', 'std', 'var', 'sum', 'count'
+       temporal_reduction (str): Reduction operation for temporal dimension (time).
+           Options: 'mean', 'median', 'min', 'max', 'std', 'var', 'sum', 'count'
+       drop_nan (bool): Whether to drop NaN values from the results (default False)
+       inplace (bool): Whether to modify the input GeoDataFrame in place
+       stream_to_disk (bool): Whether to stream datasets to disk as NetCDF files (default False)
+
+   Returns:
+       geopandas.GeoDataFrame: GeoDataFrame with variable dataarrays/values or file paths in separate columns.
+           If stream_to_disk=True, large datasets are saved as NetCDF files with file paths stored.
+   """
+   if 'dataset' not in gdf_with_datasets.columns:
+       raise ValueError("Input GeoDataFrame must contain a 'dataset' column")
+
+   # Validate reduction parameters
+   valid_reductions = ['mean', 'median', 'min', 'max', 'std', 'var', 'sum', 'count']
+   if spatial_reduction and spatial_reduction not in valid_reductions:
+       raise ValueError(f"spatial_reduction must be one of {valid_reductions}")
+   if temporal_reduction and temporal_reduction not in valid_reductions:
+       raise ValueError(f"temporal_reduction must be one of {valid_reductions}")
+
+   result_rows = []
+   geometries = []
+
+   # Process each row (geometry + dataset)
+   for i, row in gdf_with_datasets.iterrows():
+       dataset = row['dataset']
 
-       result_cols = combined_df.columns
+       # Create new row for this geometry
+       new_row = {}
 
-       result_rows = []
-       geometries = []
+       # Copy original GeoDataFrame attributes (excluding dataset column)
+       for col in gdf_with_datasets.columns:
+           if col not in ['geometry', 'dataset']:
+               new_row[col] = row[col]
 
-       for (geom_idx, time_val), row in combined_df.iterrows():
-           new_row = {}
+       # Process each variable in the dataset
+       data_vars = list(dataset.data_vars.keys())
+       for var_name in data_vars:
+           var_data = dataset[var_name]
 
-           for col in gdf.columns:
-               if col != 'geometry':
-                   new_row[col] = gdf.loc[geom_idx, col]
+           # Apply drop_nan if requested
+           if drop_nan:
+               # Drop spatial dimensions where all values are NaN
+               var_data = var_data.dropna(dim='x', how='all').dropna(dim='y', how='all')
+
+               # Drop time dimensions where all values are NaN
+               if 'time' in var_data.dims:
+                   var_data = var_data.dropna(dim='time', how='all')
 
-           for col in result_cols:
-               new_row[col] = row[col]
+           # Check current dimensions to determine if aggregation is needed
+           current_dims = set(var_data.dims)
+           has_spatial_dims = bool(current_dims.intersection(['x', 'y']))
+           has_temporal_dim = 'time' in current_dims
 
-           result_rows.append(new_row)
-           geometries.append(gdf.geometry.iloc[geom_idx])
-
-       multi_index = pd.MultiIndex.from_tuples(
-           combined_df.index.tolist(),
-           names=['geometry_index', 'time']
-       )
-
-       result_gdf = GeoDataFrame(
-           result_rows,
-           geometry=geometries,
-           index=multi_index
-       )
+           # Apply spatial reduction only if spatial dimensions exist and reduction is requested
+           if spatial_reduction and has_spatial_dims:
+               spatial_dims = [dim for dim in ['x', 'y'] if dim in var_data.dims]
+               if spatial_dims:
+                   if spatial_reduction == 'count':
+                       var_data = var_data.count(dim=spatial_dims)
+                   else:
+                       var_data = getattr(var_data, spatial_reduction)(dim=spatial_dims)
+
+           # Apply temporal reduction only if time dimension exists and reduction is requested
+           if temporal_reduction and has_temporal_dim:
+               if temporal_reduction == 'count':
+                   var_data = var_data.count(dim='time')
+               else:
+                   var_data = getattr(var_data, temporal_reduction)(dim='time')
+
+           # Handle streaming to disk if requested
+           if stream_to_disk:
+               # Create a single-variable dataset for saving
+               single_var_dataset = var_data.to_dataset(name=var_name)
+
+               # Generate filename based on row index and variable name
+               filename = f"geometry_{i}_{var_name}.nc"
+               filepath = os.path.join(os.getcwd(), filename)
+
+               # Save to disk and store file path
+               saved_path = save_dataset_to_file(single_var_dataset, filepath)
+               new_row[var_name] = f"file://{saved_path}"
+
+               print(f"Dataset for geometry {i}, variable '{var_name}' saved to: {saved_path}")
+           else:
+               # Keep in memory
+               new_row[var_name] = var_data
 
-       if inplace:
-           return result_gdf
-       else:
-           return result_gdf
-   else:
-       result_gdf = gdf.copy() if not inplace else gdf
+       result_rows.append(new_row)
+       geometries.append(row['geometry'])
+
+   # Create the result GeoDataFrame with default integer index
+   result_gdf = GeoDataFrame(result_rows, geometry=geometries)
+
+   if inplace:
+       # Clear original gdf and replace with result_gdf content
+       gdf_with_datasets.drop(gdf_with_datasets.index, inplace=True)
+       gdf_with_datasets.drop(gdf_with_datasets.columns, axis=1, inplace=True)
 
-       result_cols = [col for col in combined_df.columns if col not in ['_geometry_index']]
+       # Copy all data from result_gdf to gdf
+       for col in result_gdf.columns:
+           gdf_with_datasets[col] = result_gdf[col].values
 
-       geom_idx_to_row = {}
-       for idx, row in combined_df.iterrows():
-           geom_idx = int(row['_geometry_index'])
-           geom_idx_to_row[geom_idx] = row
+       # Ensure it remains a GeoDataFrame with correct geometry
+       gdf_with_datasets.geometry = result_gdf.geometry
 
-       for col in result_cols:
-           if col not in result_gdf.columns:
-               result_gdf[col] = None
-
-       for geom_idx, row in geom_idx_to_row.items():
-           result_gdf.loc[geom_idx, col] = row[col]
-   if inplace:
-       return None
-   else:
-       return result_gdf
+       return None
+   else:
+       return result_gdf
+
+
+# Updated zonal_stats function that uses both parts
+async def zonal_stats(
+   client,
+   gdf: GeoDataFrame,
+   expr: str,
+   conc: int = 20,
+   inplace: bool = False,
+   in_crs: str = "epsg:4326",
+   out_crs: str = "epsg:4326",
+   resolution: int = -1,
+   geom_fix: bool = False,
+   drop_nan: bool = False,
+   spatial_reduction: str = None,
+   temporal_reduction: str = None,
+   max_memory_mb: int = 500,
+   stream_to_disk: bool = False,
+):
+   """
+   Compute zonal statistics for all geometries in a GeoDataFrame.
+   This is a convenience function that combines request_data and post_processing.
+
+   Args:
+       client: The AsyncClient instance
+       gdf (GeoDataFrame): GeoDataFrame containing geometries
+       expr (str): Terrakio expression to evaluate, can include spatial aggregations
+       conc (int): Number of concurrent requests to make
+       inplace (bool): Whether to modify the input GeoDataFrame in place
+       in_crs (str): Input coordinate reference system
+       out_crs (str): Output coordinate reference system
+       resolution (int): Resolution parameter
+       geom_fix (bool): Whether to fix the geometry (default False)
+       drop_nan (bool): Whether to drop NaN values from the results (default False)
+       spatial_reduction (str): Reduction operation for spatial dimensions (x, y).
+           Options: 'mean', 'median', 'min', 'max', 'std', 'var', 'sum', 'count'
+       temporal_reduction (str): Reduction operation for temporal dimension (time).
+           Options: 'mean', 'median', 'min', 'max', 'std', 'var', 'sum', 'count'
+       max_memory_mb (int): Maximum memory threshold in MB (default 500MB)
+       stream_to_disk (bool): Whether to stream datasets to disk as NetCDF files (default False)
+   """
+   # Step 1: Request data (with memory estimation)
+   gdf_with_datasets = await request_data(
+       client=client,
+       gdf=gdf,
+       expr=expr,
+       conc=conc,
+       in_crs=in_crs,
+       out_crs=out_crs,
+       resolution=resolution,
+       geom_fix=geom_fix,
+       max_memory_mb=max_memory_mb,
+       stream_to_disk=stream_to_disk
+   )
+
+   # Step 2: Post-process with reductions and optional streaming
+   result = post_processing(
+       gdf_with_datasets=gdf_with_datasets,
+       spatial_reduction=spatial_reduction,
+       temporal_reduction=temporal_reduction,
+       drop_nan=drop_nan,
+       inplace=inplace,
+       stream_to_disk=stream_to_disk
+   )
+
+   return result
 
 async def create_dataset_file(
    client,
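
A minimal usage sketch of the new split API, assuming an already-constructed AsyncClient named client, a GeoDataFrame gdf of polygon features, and a placeholder expression string (not a real Terrakio expression); illustrative only:

import asyncio

async def main():
    # Two-step form: fetch one xarray Dataset per geometry, then reduce it.
    gdf_with_datasets = await request_data(client, gdf, expr="<terrakio expression>", conc=20)
    stats = post_processing(gdf_with_datasets,
                            spatial_reduction="mean",
                            temporal_reduction="max",
                            drop_nan=True)

    # Equivalent one-call convenience wrapper.
    stats = await zonal_stats(client, gdf, expr="<terrakio expression>",
                              spatial_reduction="mean", temporal_reduction="max")
    return stats

# asyncio.run(main())

With stream_to_disk=True, post_processing writes each variable to a geometry_<index>_<variable>.nc file in the current working directory and stores a file:// path in the column instead of the in-memory DataArray.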
@@ -6,7 +6,7 @@ from pathlib import Path
 from urllib.parse import urlparse
 from ..helper.decorators import require_token, require_api_key, require_auth
 import aiohttp
-
+from typing import Dict, Any, Optional, List, Union
 class MassStats:
    def __init__(self, client):
        self._client = client
@@ -218,7 +218,7 @@ class MassStats:
            params["output"] = output
 
        return self._client._terrakio_request("GET", "mass_stats/download", params=params)
-
+
    @require_api_key
    async def _upload_file(self, file_path: str, url: str, use_gzip: bool = False):
        """
@@ -237,6 +237,18 @@ class MassStats:
        except json.JSONDecodeError as e:
            raise ValueError(f"Invalid JSON in file {file_path}: {e}")
 
+       return await self._upload_json_data(json_data, url, use_gzip)
+
+   @require_api_key
+   async def _upload_json_data(self, json_data: Union[Dict, List], url: str, use_gzip: bool = False):
+       """
+       Helper method to upload JSON data directly to a signed URL.
+
+       Args:
+           json_data: JSON data (dict or list) to upload
+           url: Signed URL to upload to
+           use_gzip: Whether to compress the data with gzip
+       """
        if hasattr(json, 'dumps') and 'ignore_nan' in json.dumps.__code__.co_varnames:
            dumps_kwargs = {'ignore_nan': True}
        else:
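
For orientation, the new helper boils down to serialising the payload and PUT-ing it to the signed URL. A standalone sketch of the same idea using aiohttp directly (illustrative only; the package routes the request through its client's internal _regular_request helper, and the Content-Encoding header here is an assumption, not something shown in this diff):

import gzip
import json

import aiohttp

async def upload_json_to_signed_url(data, url, use_gzip=False):
    # Serialise the dict/list payload and optionally gzip-compress it before the PUT.
    body = json.dumps(data).encode("utf-8")
    headers = {"Content-Type": "application/json"}
    if use_gzip:
        body = gzip.compress(body)
        headers["Content-Encoding"] = "gzip"  # assumption: advertise the compression
    async with aiohttp.ClientSession() as session:
        async with session.put(url, data=body, headers=headers) as resp:
            return resp.status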
@@ -253,6 +265,7 @@ class MassStats:
        headers = {
            'Content-Type': 'application/json'
        }
+
        response = await self._client._regular_request("PUT", url, data=body, headers=headers)
        return response
 
@@ -386,36 +399,55 @@ class MassStats:
        region: str,
        output: str,
        config: Dict[str, Any],
-       request_json: Dict[str, Any],
-       manifest_json: Dict[str, Any],
+       request_json: str, # Path to request JSON file
        overwrite: bool = False,
        skip_existing: bool = False,
        location: str = None,
        force_loc: bool = None,
-       server: str = "dev-au.terrak.io"
+       server: str = None
    ) -> Dict[str, Any]:
        """
        Execute a mass stats job.
-
+
        Args:
            name: The name of the job
            region: The region of the job
            output: The output of the job
            config: The config of the job
-           request_json: The request JSON
-           manifest_json: The manifest JSON
+           request_json: Path to the request JSON file
            overwrite: Whether to overwrite the job
            skip_existing: Whether to skip existing jobs
            location: The location of the job
            force_loc: Whether to force the location
            server: The server to use
-
+
        Returns:
            API response as a dictionary
-
+
        Raises:
            APIError: If the API request fails
        """
+
+       def extract_manifest_from_request(request_data: List[Dict[str, Any]]) -> List[str]:
+           """Extract unique group names from request data to create manifest list."""
+           groups = []
+           seen_groups = set()
+
+           for item in request_data:
+               if not isinstance(item, dict):
+                   raise ValueError("Each item in request JSON should be a dictionary")
+
+               if 'group' not in item:
+                   raise ValueError("Each item should have a 'group' field")
+
+               group = item['group']
+               if group not in seen_groups:
+                   groups.append(group)
+                   seen_groups.add(group)
+
+           return groups
+
+       # Load and validate request JSON
        try:
            with open(request_json, 'r') as file:
                request_data = json.load(file)
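
An illustrative example of what the helper defined just above produces (the items here are placeholders; the real request schema is not shown in this diff):

request_data = [
    {"group": "paddock_a"},  # real request items carry more fields; only 'group' matters here
    {"group": "paddock_a"},
    {"group": "paddock_b"},
]
# extract_manifest_from_request(request_data) -> ["paddock_a", "paddock_b"]
# i.e. the manifest is just the ordered list of unique 'group' values,
# which is later uploaded to manifest_url via _upload_json_data.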
@@ -427,14 +459,35 @@ class MassStats:
            return e
        except json.JSONDecodeError as e:
            return e
-       upload_result = await self._upload_request(name = name, size = size, region = region, output = output, config = config, location = location, force_loc = force_loc, overwrite = overwrite, server = server, skip_existing = skip_existing)
+
+       # Generate manifest from request data (kept in memory)
+       try:
+           manifest_groups = extract_manifest_from_request(request_data)
+       except Exception as e:
+           raise ValueError(f"Error extracting manifest from request JSON: {e}")
+
+       # Get upload URLs
+       upload_result = await self._upload_request(
+           name=name,
+           size=size,
+           region=region,
+           output=output,
+           config=config,
+           location=location,
+           force_loc=force_loc,
+           overwrite=overwrite,
+           server=server,
+           skip_existing=skip_existing
+       )
+
        requests_url = upload_result.get('requests_url')
        manifest_url = upload_result.get('manifest_url')
+
        if not requests_url:
            raise ValueError("No requests_url returned from server for request JSON upload")
 
+       # Upload request JSON file
        try:
-           # in this place we are uploading the request json file, we need to check whether the json is in the correct format or not
            self.validate_request(request_json)
            requests_response = await self._upload_file(request_json, requests_url, use_gzip=True)
            if requests_response.status not in [200, 201, 204]:
@@ -446,15 +499,17 @@
        if not manifest_url:
            raise ValueError("No manifest_url returned from server for manifest JSON upload")
 
+       # Upload manifest JSON data directly (no temporary file needed)
        try:
-           manifest_response = await self._upload_file(manifest_json, manifest_url, use_gzip=False)
+           manifest_response = await self._upload_json_data(manifest_groups, manifest_url, use_gzip=False)
            if manifest_response.status not in [200, 201, 204]:
                self._client.logger.error(f"Manifest upload error: {manifest_response.text()}")
                raise Exception(f"Failed to upload manifest JSON: {manifest_response.text()}")
        except Exception as e:
-           raise Exception(f"Error uploading manifest JSON file {manifest_json}: {e}")
-
-           start_job_task_id =await self.start_job(upload_result.get("id"))
+           raise Exception(f"Error uploading manifest JSON: {e}")
+
+       # Start the job
+       start_job_task_id = await self.start_job(upload_result.get("id"))
        return start_job_task_id
 
    @require_api_key