terrakio-core 0.4.6__tar.gz → 0.4.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of terrakio-core might be problematic.

Files changed (27)
  1. {terrakio_core-0.4.6 → terrakio_core-0.4.7}/PKG-INFO +2 -1
  2. {terrakio_core-0.4.6 → terrakio_core-0.4.7}/pyproject.toml +2 -1
  3. {terrakio_core-0.4.6 → terrakio_core-0.4.7}/terrakio_core/__init__.py +1 -1
  4. {terrakio_core-0.4.6 → terrakio_core-0.4.7}/terrakio_core/async_client.py +8 -0
  5. {terrakio_core-0.4.6 → terrakio_core-0.4.7}/terrakio_core/convenience_functions/convenience_functions.py +25 -61
  6. {terrakio_core-0.4.6 → terrakio_core-0.4.7}/terrakio_core/endpoints/mass_stats.py +171 -39
  7. {terrakio_core-0.4.6 → terrakio_core-0.4.7}/terrakio_core/helper/tiles.py +13 -12
  8. {terrakio_core-0.4.6 → terrakio_core-0.4.7}/terrakio_core/sync_client.py +9 -0
  9. {terrakio_core-0.4.6 → terrakio_core-0.4.7}/terrakio_core.egg-info/PKG-INFO +2 -1
  10. {terrakio_core-0.4.6 → terrakio_core-0.4.7}/terrakio_core.egg-info/requires.txt +1 -0
  11. {terrakio_core-0.4.6 → terrakio_core-0.4.7}/README.md +0 -0
  12. {terrakio_core-0.4.6 → terrakio_core-0.4.7}/setup.cfg +0 -0
  13. {terrakio_core-0.4.6 → terrakio_core-0.4.7}/terrakio_core/accessors.py +0 -0
  14. {terrakio_core-0.4.6 → terrakio_core-0.4.7}/terrakio_core/client.py +0 -0
  15. {terrakio_core-0.4.6 → terrakio_core-0.4.7}/terrakio_core/config.py +0 -0
  16. {terrakio_core-0.4.6 → terrakio_core-0.4.7}/terrakio_core/endpoints/auth.py +0 -0
  17. {terrakio_core-0.4.6 → terrakio_core-0.4.7}/terrakio_core/endpoints/dataset_management.py +0 -0
  18. {terrakio_core-0.4.6 → terrakio_core-0.4.7}/terrakio_core/endpoints/group_management.py +0 -0
  19. {terrakio_core-0.4.6 → terrakio_core-0.4.7}/terrakio_core/endpoints/model_management.py +0 -0
  20. {terrakio_core-0.4.6 → terrakio_core-0.4.7}/terrakio_core/endpoints/space_management.py +0 -0
  21. {terrakio_core-0.4.6 → terrakio_core-0.4.7}/terrakio_core/endpoints/user_management.py +0 -0
  22. {terrakio_core-0.4.6 → terrakio_core-0.4.7}/terrakio_core/exceptions.py +0 -0
  23. {terrakio_core-0.4.6 → terrakio_core-0.4.7}/terrakio_core/helper/bounded_taskgroup.py +0 -0
  24. {terrakio_core-0.4.6 → terrakio_core-0.4.7}/terrakio_core/helper/decorators.py +0 -0
  25. {terrakio_core-0.4.6 → terrakio_core-0.4.7}/terrakio_core.egg-info/SOURCES.txt +0 -0
  26. {terrakio_core-0.4.6 → terrakio_core-0.4.7}/terrakio_core.egg-info/dependency_links.txt +0 -0
  27. {terrakio_core-0.4.6 → terrakio_core-0.4.7}/terrakio_core.egg-info/top_level.txt +0 -0
PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: terrakio-core
-Version: 0.4.6
+Version: 0.4.7
 Summary: Core components for Terrakio API clients
 Author-email: Yupeng Chao <yupeng@haizea.com.au>
 Project-URL: Homepage, https://github.com/HaizeaAnalytics/terrakio-python-api
@@ -28,6 +28,7 @@ Requires-Dist: onnxruntime>=1.10.0
 Requires-Dist: psutil>=5.0.0
 Requires-Dist: h5netcdf>=1.0.0
 Requires-Dist: netcdf4>=1.5.0
+Requires-Dist: aiofiles>=24.1.0
 Provides-Extra: ml
 Requires-Dist: torch>=2.7.1; extra == "ml"
 Requires-Dist: scikit-learn>=1.7.0; extra == "ml"
pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "terrakio-core"
-version = "0.4.6"
+version = "0.4.7"
 authors = [
     {name = "Yupeng Chao", email = "yupeng@haizea.com.au"},
 ]
@@ -35,6 +35,7 @@ dependencies = [
     "psutil>=5.0.0",
     "h5netcdf>=1.0.0",
     "netcdf4>=1.5.0",
+    "aiofiles>=24.1.0"
 ]
 
 [project.optional-dependencies]
terrakio_core/__init__.py

@@ -5,7 +5,7 @@ Terrakio Core
 Core components for Terrakio API clients.
 """
 
-__version__ = "0.4.6"
+__version__ = "0.4.7"
 
 from .async_client import AsyncClient
 from .sync_client import SyncClient as Client
terrakio_core/async_client.py

@@ -235,6 +235,7 @@ class AsyncClient(BaseClient):
 
     async def create_dataset_file(
         self,
+        name: str,
         aoi: str,
         expression: str,
         output: str,
@@ -247,6 +248,9 @@ class AsyncClient(BaseClient):
         non_interactive: bool = True,
         poll_interval: int = 30,
         download_path: str = "/home/user/Downloads",
+        mask = True,
+        max_file_size_mb: int = 5120,  # Default to 5GB
+        tile_size: int = 1024,
     ) -> dict:
         """
         Create a dataset file using mass stats operations.
@@ -286,6 +290,10 @@ class AsyncClient(BaseClient):
             non_interactive=non_interactive,
             poll_interval=poll_interval,
             download_path=download_path,
+            name=name,
+            mask=mask,
+            max_file_size_mb=max_file_size_mb,
+            tile_size=tile_size
         )
 
     async def geo_queries(
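
Usage note (not part of the released diff): a minimal sketch of calling the updated create_dataset_file on the async client. Only the parameter names and defaults come from the hunks above; the client construction, AOI path, and expression values are illustrative assumptions.

import asyncio
from terrakio_core import AsyncClient

async def main():
    # Client construction is assumed here; credential/config handling is not shown in this diff.
    client = AsyncClient()
    result = await client.create_dataset_file(
        name="my-dataset",              # new required parameter in 0.4.7
        aoi="./aoi.geojson",            # illustrative AOI path
        expression="red=S2v2#(year,median).red@(year =2024) \n red",
        output="netcdf",
        download_path="./downloads",
        mask=True,                      # new: clip tiles to the AOI geometry
        max_file_size_mb=5120,          # new: cap on each combined output file
        tile_size=1024,                 # new: tile edge length in pixels
    )
    # Per the convenience function below, the result carries the generation and combine task ids.
    print(result)

asyncio.run(main())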
terrakio_core/convenience_functions/convenience_functions.py

@@ -438,15 +438,7 @@ async def handle_mass_stats(
     id_column: Optional[str] = None,
 
 ):
-    # we have the handle mass stats function, we need to have the list of quries, and we need to pass the quries to the mass stats function
-    # we have the three different variables
-
-    # Check if id_column is provided
-    # if id_column is None:
-        # Handle case where no ID column is specified
-        # this means that the id column is none, so we could just use the default value of 1 2 3 4
     request_json = gdf_to_json(gdf = gdf, expr = expr, in_crs = in_crs, out_crs = out_crs, resolution = resolution, geom_fix = geom_fix, id_column = id_column)
-    # we need to call the execute job function
     job_id = await client.mass_stats.execute_job(
         name = "zonal_stats_job",
         output = "netcdf",
@@ -455,32 +447,7 @@ async def handle_mass_stats(
         overwrite = True,
     )
     return job_id
-    # async def test_regular_async_mass_stats(regular_async_client):
-    #     """Test mass statistics with regular client async"""
-    #     start_result = await regular_async_client.mass_stats.execute_job(
-    #         name="test_regular_mass_stats_test",
-    #         region="aus",
-    #         output="csv",
-    #         config={},
-    #         request_json = "./test_config.json",
-    #         manifest_json = "./test_manifest.json",
-    #         overwrite=True,
-    #     )
-    #     assert isinstance(start_result, dict)
-    #     assert 'task_id' in start_result
-
-    #     return
-    # else:
-    #     # Handle case where ID column is specified
-    #     # Verify the column exists in the GeoDataFrame
-
-    #     if id_column not in gdf.columns:
-    #         raise ValueError(f"ID column '{id_column}' not found in GeoDataFrame columns: {list(gdf.columns)}")
-    #     pass
-    # the second case is that we have an id_column, we need to use the id_column to create the group name
 
-    # we have the mass stats as one of the parameters, so that when a user wants a mass
-    # for both cases we need to have the list of quries
 async def zonal_stats(
     client,
     gdf: GeoDataFrame,
@@ -506,7 +473,6 @@ async def zonal_stats(
             geom_fix = geom_fix,
             id_column = id_column
         )
-        # if we started the mass stats job, we need to return the job id
         return mass_stats_id
     quries = []
     for i in range(len(gdf)):
@@ -536,30 +502,35 @@ async def create_dataset_file(
     aoi: str,
     expression: str,
     output: str,
+    download_path: str,
     in_crs: str = "epsg:4326",
-    res: float = 0.0001,
-    region: str = "aus",
     to_crs: str = "epsg:4326",
-    overwrite: bool = True,
+    res: float = 0.0001,
+    region: str = None,
+    overwrite: bool = False,
     skip_existing: bool = False,
     non_interactive: bool = True,
+    name: str | None = None,
     poll_interval: int = 30,
-    download_path: str = "/home/user/Downloads",
+    max_file_size_mb: int = 5120,
+    tile_size: int = 1024,
+    mask: bool = True
 ) -> dict:
 
-    name = f"tiles-{uuid.uuid4().hex[:8]}"
+    if not name:
+        name = f"file-gen-{uuid.uuid4().hex[:8]}"
 
     body, reqs, groups = tiles(
         name = name,
         aoi = aoi,
         expression = expression,
         output = output,
-        tile_size = 128,
+        tile_size = tile_size,
         crs = in_crs,
         res = res,
         region = region,
         to_crs = to_crs,
-        fully_cover = True,
+        mask = mask,
         overwrite = overwrite,
         skip_existing = skip_existing,
         non_interactive = non_interactive
@@ -567,9 +538,6 @@ async def create_dataset_file(
     with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as tempreq:
         tempreq.write(reqs)
         tempreqname = tempreq.name
-    with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as tempmanifest:
-        tempmanifest.write(groups)
-        tempmanifestname = tempmanifest.name
 
     task_id = await client.mass_stats.execute_job(
         name=body["name"],
@@ -579,28 +547,24 @@ async def create_dataset_file(
         overwrite=body["overwrite"],
         skip_existing=body["skip_existing"],
         request_json=tempreqname,
-        manifest_json=tempmanifestname,
     )
 
     start_time = time.time()
     status = None
-
+    client.logger.info(f"Tracking data generation job {task_id['task_id']}...")
     while True:
         try:
             taskid = task_id['task_id']
             trackinfo = await client.mass_stats.track_job([taskid])
-            client.logger.info("the trackinfo is: ", trackinfo)
             status = trackinfo[taskid]['status']
-
             if status == 'Completed':
-                client.logger.info('Tiles generated successfully!')
+                client.logger.info('Data generated successfully!')
                 break
             elif status in ['Failed', 'Cancelled', 'Error']:
                 raise RuntimeError(f"Job {taskid} failed with status: {status}")
             else:
                 elapsed_time = time.time() - start_time
-                client.logger.info(f"Job status: {status} - Elapsed time: {elapsed_time:.1f}s", end='\r')
-
+                client.logger.info(f"Job status: {status} - Elapsed time: {elapsed_time:.1f}s")
                 await asyncio.sleep(poll_interval)
 
 
@@ -612,16 +576,15 @@ async def create_dataset_file(
             raise
 
     os.unlink(tempreqname)
-    os.unlink(tempmanifestname)
 
-    combine_result = await client.mass_stats.combine_tiles(body["name"], body["overwrite"], body["output"])
+    combine_result = await client.mass_stats.combine_tiles(body["name"], body["overwrite"], body["output"], max_file_size_mb=max_file_size_mb)
     combine_task_id = combine_result.get("task_id")
 
     combine_start_time = time.time()
+    client.logger.info(f"Tracking file generation job {combine_task_id}...")
     while True:
         try:
             trackinfo = await client.mass_stats.track_job([combine_task_id])
-            client.logger.info('client create dataset file track info:', trackinfo)
             if body["output"] == "netcdf":
                 download_file_name = trackinfo[combine_task_id]['folder'] + '.nc'
             elif body["output"] == "geotiff":
@@ -629,19 +592,19 @@ async def create_dataset_file(
             bucket = trackinfo[combine_task_id]['bucket']
             combine_status = trackinfo[combine_task_id]['status']
             if combine_status == 'Completed':
-                client.logger.info('Tiles combined successfully!')
+                client.logger.info('File/s generated successfully!')
                 break
             elif combine_status in ['Failed', 'Cancelled', 'Error']:
-                raise RuntimeError(f"Combine job {combine_task_id} failed with status: {combine_status}")
+                raise RuntimeError(f"File generation job {combine_task_id} failed with status: {combine_status}")
             else:
                 elapsed_time = time.time() - combine_start_time
-                client.logger.info(f"Combine job status: {combine_status} - Elapsed time: {elapsed_time:.1f}s", end='\r')
+                client.logger.info(f"File generation job status: {combine_status} - Elapsed time: {elapsed_time:.1f}s")
                 time.sleep(poll_interval)
         except KeyboardInterrupt:
-            client.logger.info(f"\nInterrupted! Combine job {combine_task_id} is still running in the background.")
+            client.logger.info(f"\nInterrupted! File generation job {combine_task_id} is still running in the background.")
             raise
         except Exception as e:
-            client.logger.info(f"\nError tracking combine job: {e}")
+            client.logger.info(f"\nError tracking file generation job: {e}")
             raise
 
     if download_path:
@@ -649,11 +612,12 @@ async def create_dataset_file(
             job_name=body["name"],
             bucket=bucket,
             file_type='processed',
-            page_size=10,
+            folder='file-gen',
+            page_size=100,
             output_path=download_path,
         )
     else:
         path = f"{body['name']}/outputs/merged/{download_file_name}"
-        client.logger.info(f"Combined file is available at {path}")
+        client.logger.info(f"Dataset file/s is available at {path}")
 
     return {"generation_task_id": task_id, "combine_task_id": combine_task_id}
terrakio_core/endpoints/mass_stats.py

@@ -7,6 +7,13 @@ from urllib.parse import urlparse
 from ..helper.decorators import require_token, require_api_key, require_auth
 import aiohttp
 from typing import Dict, Any, Optional, List, Union
+import asyncio
+import xarray as xr
+from io import BytesIO
+import geopandas as gpd
+from shapely.geometry import shape
+from ..convenience_functions.convenience_functions import expand_on_variables_and_time
+
 class MassStats:
     def __init__(self, client):
         self._client = client
@@ -19,6 +26,7 @@ class MassStats:
         sample: str,
         output: str,
         config: Dict[str, Any],
+        region: str = None,
         overwrite: bool = False,
         skip_existing: bool = False,
         location: Optional[str] = None,
@@ -55,7 +63,8 @@ class MassStats:
             "config": config,
             "overwrite": overwrite,
             "skip_existing": skip_existing,
-            "server": server
+            "server": server,
+            "region": region
         }
         payload_mapping = {
             "location": location,
@@ -66,7 +75,6 @@ class MassStats:
                 payload[key] = str(value).lower()
         return await self._client._terrakio_request("POST", "mass_stats/upload", json=payload)
 
-
     @require_api_key
     async def start_job(self, id: str) -> Dict[str, Any]:
         """
@@ -276,6 +284,7 @@ class MassStats:
         bucket: str,
         file_type: str,
         output_path: str,
+        folder: str = None,
         page_size: int = None,
     ) -> list:
         """
@@ -303,7 +312,8 @@ class MassStats:
         request_body = {
             "job_name": job_name,
             "bucket": bucket,
-            "file_type": file_type
+            "file_type": file_type,
+            "folder": folder
         }
 
         output_dir = Path(output_path)
@@ -311,8 +321,7 @@ class MassStats:
         output_files = []
 
         async def download_urls_batch(download_urls, session):
-            for url in download_urls:
-                self._client.logger.info(f"Processing download URL: {url}")
+            for i, url in enumerate(download_urls):
                 parsed = urlparse(url)
                 path_parts = Path(parsed.path).parts
                 try:
@@ -322,13 +331,13 @@ class MassStats:
                     subpath = Path(path_parts[-1])
                     file_save_path = output_dir / subpath
                     file_save_path.parent.mkdir(parents=True, exist_ok=True)
-                    self._client.logger.info(f"Downloading file to {file_save_path}")
+                    self._client.logger.info(f"Downloading file to {file_save_path} ({i+1}/{len(download_urls)})")
 
                     async with session.get(url) as resp:
                         resp.raise_for_status()
                         import aiofiles
                         async with aiofiles.open(file_save_path, 'wb') as file:
-                            async for chunk in resp.content.iter_chunked(1048576):
+                            async for chunk in resp.content.iter_chunked(1048576):  # 1 MB
                                 if chunk:
                                     await file.write(chunk)
 
@@ -352,7 +361,6 @@ class MassStats:
                 response = await self._client._terrakio_request("POST", "mass_stats/download_files", json=request_body, params=params)
                 data = response
 
-                self._client.logger.info(f'processed, endpoint response is {data}')
                 download_urls = data.get('download_urls')
                 if not download_urls:
                     break
@@ -363,7 +371,7 @@ class MassStats:
                 if total_files is not None and downloaded_files >= total_files:
                     break
                 if len(download_urls) < page_size:
-                    break
+                    break  # Last page
                 page += 1
             return output_files
         except Exception as e:
@@ -392,13 +400,13 @@ class MassStats:
             if i == 3:
                 break
 
-    @require_api_key
     async def execute_job(
         self,
         name: str,
         output: str,
         config: Dict[str, Any],
-        request_json: Union[str, list[Dict[str, Any]]],
+        request_json: str,  # Path to request JSON file
+        region: str = None,
         overwrite: bool = False,
         skip_existing: bool = False,
         location: str = None,
@@ -425,6 +433,7 @@ class MassStats:
         Raises:
             APIError: If the API request fails
         """
+
         def extract_manifest_from_request(request_data: List[Dict[str, Any]]) -> List[str]:
             """Extract unique group names from request data to create manifest list."""
             groups = []
@@ -444,35 +453,34 @@ class MassStats:
 
             return groups
 
-        if isinstance(request_json, str):
-            try:
-                with open(request_json, 'r') as file:
-                    request_data = json.load(file)
-                    if isinstance(request_data, list):
-                        size = len(request_data)
-                    else:
-                        raise ValueError(f"Request JSON file {request_json} should contain a list of dictionaries")
-            except FileNotFoundError as e:
-                return e
-            except json.JSONDecodeError as e:
-                return e
-            request_json_path = request_json
-        else:
-            request_data = request_json
-            size = len(request_data)
-            request_json_path = None
+        # Load and validate request JSON
+        try:
+            with open(request_json, 'r') as file:
+                request_data = json.load(file)
+                if isinstance(request_data, list):
+                    size = len(request_data)
+                else:
+                    raise ValueError(f"Request JSON file {request_json} should contain a list of dictionaries")
+        except FileNotFoundError as e:
+            return e
+        except json.JSONDecodeError as e:
+            return e
 
+        # Generate manifest from request data (kept in memory)
         try:
             manifest_groups = extract_manifest_from_request(request_data)
         except Exception as e:
             raise ValueError(f"Error extracting manifest from request JSON: {e}")
 
-        first_request = request_data[0]
+        # Extract the first expression
+        first_request = request_data[0]  # Changed from data[0] to request_data[0]
         first_expression = first_request["request"]["expr"]
 
+        # Get upload URLs
         upload_result = await self._upload_request(
             name=name,
             size=size,
+            region=region,
             sample = first_expression,
             output=output,
             config=config,
@@ -488,21 +496,21 @@ class MassStats:
 
         if not requests_url:
             raise ValueError("No requests_url returned from server for request JSON upload")
+
+        # Upload request JSON file
         try:
-            if request_json_path:
-                self.validate_request(request_json_path)
-                requests_response = await self._upload_file(request_json_path, requests_url, use_gzip=True)
-            else:
-                requests_response = await self._upload_json_data(request_data, requests_url, use_gzip=True)
+            self.validate_request(request_json)
+            requests_response = await self._upload_file(request_json, requests_url, use_gzip=True)
             if requests_response.status not in [200, 201, 204]:
                 self._client.logger.error(f"Requests upload error: {requests_response.text()}")
-                raise Exception(f"Failed to upload request data: {requests_response.text()}")
+                raise Exception(f"Failed to upload request JSON: {requests_response.text()}")
         except Exception as e:
             raise Exception(f"Error uploading request JSON file {request_json}: {e}")
-
+
         if not manifest_url:
             raise ValueError("No manifest_url returned from server for manifest JSON upload")
 
+        # Upload manifest JSON data directly (no temporary file needed)
         try:
             manifest_response = await self._upload_json_data(manifest_groups, manifest_url, use_gzip=False)
             if manifest_response.status not in [200, 201, 204]:
@@ -511,6 +519,7 @@ class MassStats:
         except Exception as e:
             raise Exception(f"Error uploading manifest JSON: {e}")
 
+        # Start the job
         start_job_task_id = await self.start_job(upload_result.get("id"))
         return start_job_task_id
 
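Usage note (not part of the released diff): in 0.4.7 execute_job only accepts a path to a request JSON file, and the manifest is derived from it in memory. A hedged sketch follows; the "group"/"file"/"request" layout mirrors the tile requests built in helper/tiles.py, while the exact set of request fields beyond "feature", "in_crs", and "expr" is an assumption.

import asyncio
import json

async def run_mass_stats_job(client):
    # One illustrative request entry; a real job would hold one entry per tile or feature.
    feature = {
        "type": "Feature",
        "geometry": {
            "type": "Polygon",
            "coordinates": [[[149.0, -35.4], [149.1, -35.4], [149.1, -35.3],
                             [149.0, -35.3], [149.0, -35.4]]],
        },
    }
    requests = [{
        "group": "tiles",
        "file": "demo_00_00",
        "request": {
            "feature": feature,
            "in_crs": "epsg:4326",
            "expr": "red=S2v2#(year,median).red@(year =2024) \n red",
        },
    }]
    with open("requests.json", "w") as f:
        json.dump(requests, f)

    task = await client.mass_stats.execute_job(
        name="demo-job",
        output="netcdf",
        config={},
        request_json="requests.json",   # must be a file path in 0.4.7
        region="aus",                   # new optional region parameter
        overwrite=True,
    )
    task_id = task["task_id"]
    while True:
        info = await client.mass_stats.track_job([task_id])
        status = info[task_id]["status"]
        if status in ("Completed", "Failed", "Cancelled", "Error"):
            return status
        await asyncio.sleep(30)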
@@ -625,7 +634,7 @@ class MassStats:
         return self._client._terrakio_request("POST", "pyramids/create", json=payload)
 
     @require_api_key
-    async def combine_tiles(self, data_name: str, overwrite: bool = True, output: str = "netcdf") -> Dict[str, Any]:
+    async def combine_tiles(self, data_name: str, overwrite: bool = True, output: str = "netcdf", max_file_size_mb = 5120) -> Dict[str, Any]:
         """
         Combine tiles for a dataset.
 
@@ -642,7 +651,130 @@ class MassStats:
         """
         payload = {
             'data_name': data_name,
+            'folder': "file-gen",
             'output': output,
-            'overwrite': str(overwrite).lower()
+            'overwrite': str(overwrite).lower(),
+            'max_file_size_mb': max_file_size_mb
         }
-        return await self._client._terrakio_request("POST", "mass_stats/combine_tiles", json=payload)
+        return await self._client._terrakio_request("POST", "mass_stats/combine_tiles", json=payload)
+
+    @require_api_key
+    async def load_zonal_stats(self, job_id: str, max_files: int = 5, poll_interval: int = 30):
+        """
+        Load zonal stats results from a completed mass stats job.
+
+        Args:
+            job_id: The job ID returned from the mass stats execution
+            max_files: Maximum number of files to download (default: 5)
+            poll_interval: Seconds to wait between status checks (default: 30)
+
+        Returns:
+            GeoDataFrame with geometry and dataset columns, or None if failed
+        """
+        try:
+            while True:
+                try:
+                    track_info = await self.track_job([job_id])
+                    job_info = track_info[job_id]
+                    status = job_info['status']
+
+                    self._client.logger.info(f"Job {job_id} status: {status}")
+
+                    if status == 'Completed':
+                        self._client.logger.info('Job completed successfully!')
+                        break
+                    elif status in ['Failed', 'Cancelled', 'Error']:
+                        raise RuntimeError(f"Job {job_id} failed with status: {status}")
+
+                    await asyncio.sleep(poll_interval)
+
+                except KeyboardInterrupt:
+                    self._client.logger.info(f"\nInterrupted! Job {job_id} is still running.")
+                    raise
+
+            async with aiohttp.ClientSession() as session:
+                payload = {
+                    "job_name": job_info['name'],
+                    "file_type": "raw",
+                    "bucket": job_info['bucket']
+                }
+
+                result = await self._client._terrakio_request("POST", "mass_stats/download_files", json=payload)
+                download_urls = result['download_urls'][:max_files]
+
+                self._client.logger.info(f"Downloading {len(download_urls)} dataset files...")
+
+                datasets = []
+                for i, url in enumerate(download_urls):
+                    try:
+                        self._client.logger.info(f"Downloading dataset {i+1}/{len(download_urls)}...")
+                        async with session.get(url) as response:
+                            if response.status == 200:
+                                content = await response.read()
+                                dataset = xr.open_dataset(BytesIO(content))
+                                datasets.append(dataset)
+                                self._client.logger.info(f"Successfully processed dataset {i+1}")
+                            else:
+                                self._client.logger.warning(f"Failed to download dataset {i+1}: HTTP {response.status}")
+                    except Exception as e:
+                        self._client.logger.error(f"Error downloading dataset {i+1}: {e}")
+                        continue
+
+                if not datasets:
+                    self._client.logger.warning("No datasets were successfully downloaded")
+                    return gpd.GeoDataFrame({'geometry': [], 'dataset': []})
+
+                try:
+                    json_response = await self._client._terrakio_request(
+                        "POST", "mass_stats/download_json",
+                        params={"job_name": job_info['name']}
+                    )
+                    json_url = json_response["download_url"]
+
+                    async with session.get(json_url) as response:
+                        if response.status == 200:
+                            json_data = await response.json()
+                            self._client.logger.info("Successfully downloaded geometry data")
+
+                            geometries = []
+                            max_geometries = min(max_files, len(json_data), len(datasets))
+
+                            for i in range(max_geometries):
+                                try:
+                                    geom_dict = json_data[i]["request"]["feature"]["geometry"]
+                                    shapely_geom = shape(geom_dict)
+                                    geometries.append(shapely_geom)
+                                except (KeyError, ValueError) as e:
+                                    self._client.logger.warning(f"Error parsing geometry {i}: {e}")
+                                    continue
+
+                            min_length = min(len(datasets), len(geometries))
+                            if min_length == 0:
+                                self._client.logger.warning("No matching datasets and geometries found")
+                                return gpd.GeoDataFrame({'geometry': [], 'dataset': []})
+
+                            gdf = gpd.GeoDataFrame({
+                                'geometry': geometries[:min_length],
+                                'dataset': datasets[:min_length]
+                            })
+
+                            self._client.logger.info(f"Created GeoDataFrame with {len(gdf)} rows")
+
+                            try:
+                                expanded_gdf = expand_on_variables_and_time(gdf)
+                                return expanded_gdf
+                            except NameError:
+                                self._client.logger.warning("expand_on_variables_and_time function not found, returning raw GeoDataFrame")
+                                return gdf
+
+                        else:
+                            self._client.logger.warning(f"Failed to download geometry data: HTTP {response.status}")
+                            return gpd.GeoDataFrame({'geometry': [], 'dataset': []})
+
+                except Exception as e:
+                    self._client.logger.error(f"Error downloading geometry data: {e}")
+                    return gpd.GeoDataFrame({'geometry': [], 'dataset': []})
+
+        except Exception as e:
+            self._client.logger.error(f"Failed to load zonal stats for job {job_id}: {e}")
+            return None
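
Usage note (not part of the released diff): a short sketch of the new load_zonal_stats helper. The job id is a placeholder and the client construction is assumed; the method itself polls the job, downloads up to max_files result files, and returns a GeoDataFrame with geometry and dataset columns as shown above.

import asyncio
from terrakio_core import AsyncClient

async def main():
    client = AsyncClient()  # construction details assumed, not shown in this diff
    # "your-job-id" is a placeholder for the task id returned by mass_stats.execute_job
    gdf = await client.mass_stats.load_zonal_stats("your-job-id", max_files=5, poll_interval=30)
    if gdf is not None:
        print(len(gdf), list(gdf.columns))

asyncio.run(main())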
terrakio_core/helper/tiles.py

@@ -16,20 +16,21 @@ def get_bounds(aoi, crs, to_crs = None):
     bounds = aoi.geometry[0].bounds
     return *bounds, aoi
 
-def tile_generator(x_min, y_min, x_max, y_max, aoi, crs, res, tile_size, expression, output, fully_cover=True):
-    i_max = int((x_max-x_min)/(tile_size*res))
-    j_max = int((y_max-y_min)/(tile_size*res))
-    if fully_cover:
-        i_max += 1
-        j_max += 1
+def tile_generator(x_min, y_min, x_max, y_max, aoi, crs, res, tile_size, expression, output, mask):
+    i_max = int((x_max-x_min)/(tile_size*res)) + 1
+    j_max = int((y_max-y_min)/(tile_size*res)) + 1
     for j in range(0, int(j_max)):
         for i in range(0, int(i_max)):
             x = x_min + i*(tile_size*res)
             y = y_max - j*(tile_size*res)
-            bbox = shapely.geometry.box(x, y-(tile_size*res), x + (tile_size*res), y)
-            if not aoi.geometry[0].intersects(bbox):
+            geom = shapely.geometry.box(x, y-(tile_size*res), x + (tile_size*res), y)
+            if not aoi.geometry[0].intersects(geom):
                 continue
-            feat = {"type": "Feature", "geometry": bbox.__geo_interface__}
+            if mask:
+                geom = geom.intersection(aoi.geometry[0])
+                if geom.is_empty:
+                    continue
+            feat = {"type": "Feature", "geometry": geom.__geo_interface__}
             data = {
                 "feature": feat,
                 "in_crs": crs,
@@ -46,15 +47,15 @@ def tiles(
     aoi : str,
     expression: str = "red=S2v2#(year,median).red@(year =2024) \n red",
     output: str = "netcdf",
-    tile_size : float = 512,
+    tile_size : float = 1024,
     crs : str = "epsg:3577",
     res: float = 10,
     region : str = "eu",
     to_crs: str = None,
-    fully_cover: bool = True,
     overwrite: bool = False,
     skip_existing: bool = False,
     non_interactive: bool = False,
+    mask: bool = True,
 ):
 
     reqs = []
@@ -62,7 +63,7 @@ def tiles(
 
     if to_crs is None:
         to_crs = crs
-    for tile_req, i, j in tile_generator(x_min, y_min, x_max, y_max, aoi, to_crs, res, tile_size, expression, output, fully_cover):
+    for tile_req, i, j in tile_generator(x_min, y_min, x_max, y_max, aoi, to_crs, res, tile_size, expression, output, mask):
         req_name = f"{name}_{i:02d}_{j:02d}"
         reqs.append({"group": "tiles", "file": req_name, "request": tile_req})
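
To make the new tiling behaviour concrete, here is a standalone sketch that mirrors the updated tile_generator: the grid now always extends one extra row and column so the bounds are fully covered, and mask=True clips each tile box to the AOI instead of keeping the full bounding box. The helper function and AOI below are illustrative, not part of the package.

import shapely.geometry

def tile_boxes(aoi_geom, x_min, y_min, x_max, y_max, res, tile_size, mask=True):
    step = tile_size * res
    i_max = int((x_max - x_min) / step) + 1   # always +1 in 0.4.7 (was optional via fully_cover)
    j_max = int((y_max - y_min) / step) + 1
    for j in range(j_max):
        for i in range(i_max):
            x = x_min + i * step
            y = y_max - j * step
            geom = shapely.geometry.box(x, y - step, x + step, y)
            if not aoi_geom.intersects(geom):
                continue
            if mask:
                geom = geom.intersection(aoi_geom)  # clip the tile to the AOI
                if geom.is_empty:
                    continue
            yield i, j, geom

# Example: a 0.25 x 0.25 degree AOI at 0.0001 degree resolution with 1024-pixel tiles gives a 3 x 3 grid
aoi = shapely.geometry.box(149.0, -35.4, 149.25, -35.15)
print(sum(1 for _ in tile_boxes(aoi, 149.0, -35.4, 149.25, -35.15, 0.0001, 1024)))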
terrakio_core/sync_client.py

@@ -643,6 +643,7 @@ class SyncClient:
 
     def create_dataset_file(
         self,
+        name: str,
         aoi: str,
         expression: str,
         output: str,
@@ -655,6 +656,9 @@ class SyncClient:
         non_interactive: bool = True,
         poll_interval: int = 30,
         download_path: str = "/home/user/Downloads",
+        mask = True,
+        max_file_size_mb: int = 5120,  # Default to 5GB
+        tile_size: int = 1024,
     ) -> dict:
         """Create a dataset file using mass stats operations (synchronous version)."""
         coro = self._async_client.create_dataset_file(
@@ -670,9 +674,14 @@ class SyncClient:
             non_interactive=non_interactive,
             poll_interval=poll_interval,
             download_path=download_path,
+            name=name,
+            mask=mask,
+            max_file_size_mb=max_file_size_mb,
+            tile_size=tile_size
         )
         return self._run_async(coro)
 
+
     def geo_queries(
         self,
         queries: list[dict],
terrakio_core.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: terrakio-core
-Version: 0.4.6
+Version: 0.4.7
 Summary: Core components for Terrakio API clients
 Author-email: Yupeng Chao <yupeng@haizea.com.au>
 Project-URL: Homepage, https://github.com/HaizeaAnalytics/terrakio-python-api
@@ -28,6 +28,7 @@ Requires-Dist: onnxruntime>=1.10.0
 Requires-Dist: psutil>=5.0.0
 Requires-Dist: h5netcdf>=1.0.0
 Requires-Dist: netcdf4>=1.5.0
+Requires-Dist: aiofiles>=24.1.0
 Provides-Extra: ml
 Requires-Dist: torch>=2.7.1; extra == "ml"
 Requires-Dist: scikit-learn>=1.7.0; extra == "ml"
terrakio_core.egg-info/requires.txt

@@ -11,6 +11,7 @@ onnxruntime>=1.10.0
 psutil>=5.0.0
 h5netcdf>=1.0.0
 netcdf4>=1.5.0
+aiofiles>=24.1.0
 
 [ml]
 torch>=2.7.1