terrakio-core 0.4.7-py3-none-any.whl → 0.4.93-py3-none-any.whl

This diff shows the content of publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.

Potentially problematic release: this version of terrakio-core might be problematic.

@@ -1,30 +1,143 @@
-
- import os
+ # Standard library imports
  import asyncio
- import tempfile
- import time
- import pandas as pd
- import geopandas as gpd
- from geopandas import GeoDataFrame
- from shapely.geometry import mapping
- from pathlib import Path
- from ..exceptions import APIError, ConfigurationError
- from ..helper.bounded_taskgroup import BoundedTaskGroup
- from ..helper.tiles import tiles
+ import psutil
+ import random
  import uuid
+ from io import BytesIO
+ from typing import Optional
+
+ # Third-party library imports
+ import aiohttp
+ import geopandas as gpd
+ import nest_asyncio
+ import pandas as pd
+ import pyproj
  import xarray as xr
- import random
- import psutil
- import copy
- from shapely.geometry import shape
+ from geopandas import GeoDataFrame
+ from shapely.geometry import box, mapping, shape
  from shapely.ops import transform
- from shapely.geometry import box
- import pyproj
 
- import pandas as pd
- import geopandas as gpd
+ # Local imports
+ from .geoquries import request_geoquery_list
+
+ nest_asyncio.apply()
+ class cloud_object(gpd.GeoDataFrame):
+ """
+ This class is a class used for cloud
+ """
+ def __init__(self, job_id: str, job_name: str, client=None):
+
+ super().__init__({
+ 'geometry': [],
+ 'dataset': []
+ })
+
+ self.job_id = job_id
+ self.client = client
+ self.job_name = job_name
+
+ def head(self, n = 5):
+ """
+ Returns the first n files stored in the cloud bucket.
+ """
+ return asyncio.run(self._head_async(n))
+
+ async def _head_async(self, n = 5):
+ """
+ Returns the first n files stored in the cloud bucket.
+
+ Args:
+ n (int): Number of files to return. Default is 5.
+
+ Returns:
+ GeoDataFrame: A GeoDataFrame containing the first n files.
+ """
+
+ track_info = await self.client.mass_stats.track_job([self.job_id])
+ job_info = track_info[self.job_id]
+ status = job_info['status']
+ if status == "Completed":
+ payload = {
+ "job_name": job_info["name"],
+ "file_type": "raw",
+ "bucket": job_info["bucket"],
+ }
+ result = await self.client._terrakio_request("POST", "mass_stats/download_files", json=payload)
+ download_urls = result["download_urls"][:n]
+ datasets = []
+
+ async with aiohttp.ClientSession() as session:
+ for i, url in enumerate(download_urls):
+ try:
+ self.client.logger.info(f"Downloading dataset {i+1}/{len(download_urls)}...")
+ async with session.get(url) as response:
+ if response.status == 200:
+ content = await response.read()
+ dataset = xr.open_dataset(BytesIO(content))
+ datasets.append(dataset)
+ self.client.logger.info(f"Successfully processed dataset {i+1}")
+ else:
+ self.client.logger.warning(f"Failed to download dataset {i+1}: HTTP {response.status}")
+ except Exception as e:
+ self.client.logger.error(f"Error downloading dataset {i+1}: {e}")
+ continue
+ if not datasets:
+ self.client.logger.warning("No datasets were successfully downloaded")
+ return gpd.GeoDataFrame({'geometry': [], 'dataset': []})
+ try:
+ json_response = await self.client._terrakio_request(
+ "POST", "mass_stats/download_json",
+ params={"job_name": job_info['name']}
+ )
+ json_url = json_response["download_url"]
+
+ async with session.get(json_url) as response:
+ if response.status == 200:
+ json_data = await response.json()
+ self.client.logger.info("Successfully downloaded geometry data")
+
+ geometries = []
+ max_geometries = min(n, len(json_data), len(datasets))
+
+ for i in range(max_geometries):
+ try:
+ geom_dict = json_data[i]["request"]["feature"]["geometry"]
+ shapely_geom = shape(geom_dict)
+ geometries.append(shapely_geom)
+ except (KeyError, ValueError) as e:
+ self.client.logger.warning(f"Error parsing geometry {i}: {e}")
+ continue
+
+ min_length = min(len(datasets), len(geometries))
+ if min_length == 0:
+ self.client.logger.warning("No matching datasets and geometries found")
+ return gpd.GeoDataFrame({'geometry': [], 'dataset': []})
+
+ gdf = gpd.GeoDataFrame({
+ 'geometry': geometries[:min_length],
+ 'dataset': datasets[:min_length]
+ })
+
+ self.client.logger.info(f"Created GeoDataFrame with {len(gdf)} rows")
+ try:
+ expanded_gdf = expand_on_variables_and_time(gdf)
+ return expanded_gdf
+ except NameError:
+ self.client.logger.warning("expand_on_variables_and_time function not found, returning raw GeoDataFrame")
+ return gdf
+
+ else:
+ self.client.logger.warning(f"Failed to download geometry data: HTTP {response.status}")
+ return gpd.GeoDataFrame({'geometry': [], 'dataset': []})
+
+ except Exception as e:
+ self.client.logger.error(f"Error downloading geometry data: {e}")
+ return gpd.GeoDataFrame({'geometry': [], 'dataset': []})
+ elif status in ["Failed", "Cancelled", "Error"]:
+ return "The zonal stats job(for preparing the data) has failed, please check the job status!"
+ else:
+ return "The zonal stats job(for preparing the data) is still runningm, please come back at a later time!"
 
- from typing import Optional
 
  def expand_on_time(gdf):
  """
@@ -90,6 +203,8 @@ def expand_on_time(gdf):
  result_gdf = gpd.GeoDataFrame(result_df, geometry='geometry')
  result_gdf = result_gdf.set_index(['geometry'])
 
+ result_gdf.attrs = gdf.attrs.copy()
+
  return result_gdf
 
  def expand_on_variables(gdf):
@@ -143,7 +258,7 @@ def expand_on_variables(gdf):
  raise ValueError("Expected 'dataset' column for variable expansion")
 
  result_df = pd.DataFrame(rows)
-
+
  if 'time' in result_df.columns:
  result_gdf = gpd.GeoDataFrame(result_df, geometry='geometry')
  result_gdf = result_gdf.set_index(['geometry', 'time'])
@@ -151,9 +266,10 @@ def expand_on_variables(gdf):
  result_gdf = gpd.GeoDataFrame(result_df, geometry='geometry')
  result_gdf = result_gdf.set_index(['geometry'])
 
+ result_gdf.attrs = gdf.attrs.copy()
+
  return result_gdf
 
-
  def expand_on_variables_and_time(gdf):
  """
  Convenience function to expand on both variables and time.
@@ -169,7 +285,7 @@ def expand_on_variables_and_time(gdf):
  return expanded_on_variables_and_time
  except Exception as e:
  return expanded_on_time
-
+
  def estimate_geometry_size_ratio(queries: list):
  """Calculate size ratios for all geometries relative to the first geometry using bounding box area."""
 
@@ -217,101 +333,6 @@ async def estimate_query_size(
  total_size_mb += first_query_dataset.nbytes * ratios[i] / (1024**2)
  return total_size_mb
 
- async def request_geoquery_list(
- client,
- quries: list[dict],
- conc: int = 20,
- ):
- """
- Execute multiple geo queries.
-
- Args:
- client: The Terrakio client instance
- quries: List of dictionaries containing query parameters
- conc: The concurrency level for the requests
-
- Returns:
- List of query results
-
- Raises:
- ValueError: If the queries list is empty
- """
- if not quries:
- raise ValueError("Queries list cannot be empty")
- if conc > 100:
- raise ValueError("Concurrency (conc) is too high. Please set conc to 100 or less.")
-
- for i, query in enumerate(quries):
- if 'expr' not in query:
- raise ValueError(f"Query at index {i} is missing the required 'expr' key")
- if 'feature' not in query:
- raise ValueError(f"Query at index {i} is missing the required 'feature' key")
- if 'in_crs' not in query:
- raise ValueError(f"Query at index {i} is missing the required 'in_crs' key")
-
- completed_count = 0
- lock = asyncio.Lock()
- async def single_geo_query(query):
- """
- Execute multiple geo queries concurrently.
-
- Args:
- quries: List of dictionaries containing query parameters
- """
- total_number_of_requests = len(quries)
- nonlocal completed_count
- try:
- result = await client.geoquery(**query)
- if isinstance(result, dict) and result.get("error"):
- error_msg = f"Request failed: {result.get('error_message', 'Unknown error')}"
- if result.get('status_code'):
- error_msg = f"Request failed with status {result['status_code']}: {result.get('error_message', 'Unknown error')}"
- raise APIError(error_msg)
- if isinstance(result, list):
- result = result[0]
- timestamp_number = result['request_count']
- return timestamp_number
- if not isinstance(result, xr.Dataset):
- raise ValueError(f"Expected xarray Dataset, got {type(result)}")
-
- async with lock:
- completed_count += 1
- if completed_count % max(1, total_number_of_requests // 10) == 0:
- client.logger.info(f"Progress: {completed_count}/{total_number_of_requests} requests processed")
- return result
- except Exception as e:
- async with lock:
- completed_count += 1
- raise
-
- try:
- async with BoundedTaskGroup(max_concurrency=conc) as tg:
- tasks = [tg.create_task(single_geo_query(quries[idx])) for idx in range(len(quries))]
- all_results = [task.result() for task in tasks]
-
- except* Exception as eg:
- for e in eg.exceptions:
- if hasattr(e, 'response'):
- raise APIError(f"API request failed: {e.response.text}")
- raise
- client.logger.info("All requests completed!")
-
- if not all_results:
- raise ValueError("No valid results were returned for any geometry")
- if isinstance(all_results, list) and type(all_results[0]) == int:
- return sum(all_results)/len(all_results)
- else:
- geometries = []
- for query in quries:
- feature = query['feature']
- geometry = shape(feature['geometry'])
- geometries.append(geometry)
- result_gdf = gpd.GeoDataFrame({
- 'geometry': geometries,
- 'dataset': all_results
- })
- return result_gdf
-
  async def estimate_timestamp_number(
  client,
  quries: list[dict],
@@ -388,9 +409,7 @@ def gdf_to_json(
  """
  mass_stats_requests = []
 
- # Loop through each row in the GeoDataFrame
  for idx, row in gdf.iterrows():
- # Create the request feature
  request_feature = {
  "expr": expr,
  "feature": {
@@ -404,29 +423,24 @@ def gdf_to_json(
  "geom_fix": geom_fix,
  }
 
- # Determine group name and file name based on id_column
  if id_column is not None and id_column in gdf.columns:
- # Use the value from the specified column as group and file name
  identifier = str(row[id_column])
  group_name = f"group_{identifier}"
  file_name = f"file_{identifier}"
  else:
- # Use the index as group and file name
  group_name = f"group_{idx}"
  file_name = f"file_{idx}"
 
- # Create the complete request entry
  request_entry = {
  "group": group_name,
  "file": file_name,
  "request": request_feature,
  }
 
- # Add the request to our list
  mass_stats_requests.append(request_entry)
 
  return mass_stats_requests
-
+
  async def handle_mass_stats(
  client,
  gdf: GeoDataFrame,
@@ -436,17 +450,24 @@ async def handle_mass_stats(
  resolution: int = -1,
  geom_fix: bool = False,
  id_column: Optional[str] = None,
-
  ):
- request_json = gdf_to_json(gdf = gdf, expr = expr, in_crs = in_crs, out_crs = out_crs, resolution = resolution, geom_fix = geom_fix, id_column = id_column)
- job_id =await client.mass_stats.execute_job(
- name = "zonal_stats_job",
- output = "netcdf",
- config = {},
- request_json = request_json,
- overwrite = True,
+ request_json = gdf_to_json(gdf=gdf, expr=expr, in_crs=in_crs, out_crs=out_crs,
+ resolution=resolution, geom_fix=geom_fix, id_column=id_column)
+
+ job_response = await client.mass_stats.execute_job(
+ name=f"zonal-stats-{str(uuid.uuid4())[:6]}",
+ output="netcdf",
+ config={},
+ request_json=request_json,
+ overwrite=True,
  )
- return job_id
+
+ # Extract the actual task ID from the response
+ if isinstance(job_response, dict) and 'task_id' in job_response:
+ return job_response['task_id'] # Return just the string ID
+ else:
+ return job_response # In case it's already just the ID
+
 
  async def zonal_stats(
  client,
@@ -461,7 +482,6 @@ async def zonal_stats(
  id_column: Optional[str] = None,
  ):
  """Compute zonal statistics for all geometries in a GeoDataFrame."""
-
  if mass_stats:
  mass_stats_id = await handle_mass_stats(
  client = client,
@@ -471,9 +491,13 @@ async def zonal_stats(
  out_crs = out_crs,
  resolution = resolution,
  geom_fix = geom_fix,
- id_column = id_column
+ id_column = id_column,
  )
- return mass_stats_id
+ job_name = await client.mass_stats.track_job([mass_stats_id])
+ job_name = job_name[mass_stats_id]["name"]
+ cloud_files_object = cloud_object(job_id = mass_stats_id, job_name = job_name, client = client)
+ return cloud_files_object
+
  quries = []
  for i in range(len(gdf)):
  quries.append({
@@ -494,130 +518,9 @@ async def zonal_stats(
  raise ValueError(local_or_remote_result["reason"])
  else:
  gdf_with_datasets = await request_geoquery_list(client = client, quries = quries, conc = conc)
+ gdf_with_datasets.attrs["cloud_metadata"] = {
+ "is_cloud_backed": False,
+ }
  gdf_with_datasets = expand_on_variables_and_time(gdf_with_datasets)
  return gdf_with_datasets
 
- async def create_dataset_file(
- client,
- aoi: str,
- expression: str,
- output: str,
- download_path: str,
- in_crs: str = "epsg:4326",
- to_crs: str = "epsg:4326",
- res: float = 0.0001,
- region: str = None,
- overwrite: bool = False,
- skip_existing: bool = False,
- non_interactive: bool = True,
- name: str | None = None,
- poll_interval: int = 30,
- max_file_size_mb: int = 5120,
- tile_size: int = 1024,
- mask: bool = True
- ) -> dict:
-
- if not name:
- name = f"file-gen-{uuid.uuid4().hex[:8]}"
-
- body, reqs, groups = tiles(
- name = name,
- aoi = aoi,
- expression = expression,
- output = output,
- tile_size = tile_size,
- crs = in_crs,
- res = res,
- region = region,
- to_crs = to_crs,
- mask = mask,
- overwrite = overwrite,
- skip_existing = skip_existing,
- non_interactive = non_interactive
- )
- with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as tempreq:
- tempreq.write(reqs)
- tempreqname = tempreq.name
-
- task_id = await client.mass_stats.execute_job(
- name=body["name"],
- region=body["region"],
- output=body["output"],
- config = {},
- overwrite=body["overwrite"],
- skip_existing=body["skip_existing"],
- request_json=tempreqname,
- )
-
- start_time = time.time()
- status = None
- client.logger.info(f"Tracking data generation job {task_id['task_id']}...")
- while True:
- try:
- taskid = task_id['task_id']
- trackinfo = await client.mass_stats.track_job([taskid])
- status = trackinfo[taskid]['status']
- if status == 'Completed':
- client.logger.info('Data generated successfully!')
- break
- elif status in ['Failed', 'Cancelled', 'Error']:
- raise RuntimeError(f"Job {taskid} failed with status: {status}")
- else:
- elapsed_time = time.time() - start_time
- client.logger.info(f"Job status: {status} - Elapsed time: {elapsed_time:.1f}s")
- await asyncio.sleep(poll_interval)
-
-
- except KeyboardInterrupt:
- client.logger.info(f"\nInterrupted! Job {taskid} is still running in the background.")
- raise
- except Exception as e:
- client.logger.info(f"\nError tracking job: {e}")
- raise
-
- os.unlink(tempreqname)
-
- combine_result = await client.mass_stats.combine_tiles(body["name"], body["overwrite"], body["output"], max_file_size_mb=max_file_size_mb)
- combine_task_id = combine_result.get("task_id")
-
- combine_start_time = time.time()
- client.logger.info(f"Tracking file generation job {combine_task_id}...")
- while True:
- try:
- trackinfo = await client.mass_stats.track_job([combine_task_id])
- if body["output"] == "netcdf":
- download_file_name = trackinfo[combine_task_id]['folder'] + '.nc'
- elif body["output"] == "geotiff":
- download_file_name = trackinfo[combine_task_id]['folder'] + '.tif'
- bucket = trackinfo[combine_task_id]['bucket']
- combine_status = trackinfo[combine_task_id]['status']
- if combine_status == 'Completed':
- client.logger.info('File/s generated successfully!')
- break
- elif combine_status in ['Failed', 'Cancelled', 'Error']:
- raise RuntimeError(f"File generation job {combine_task_id} failed with status: {combine_status}")
- else:
- elapsed_time = time.time() - combine_start_time
- client.logger.info(f"File generation job status: {combine_status} - Elapsed time: {elapsed_time:.1f}s")
- time.sleep(poll_interval)
- except KeyboardInterrupt:
- client.logger.info(f"\nInterrupted! File generation job {combine_task_id} is still running in the background.")
- raise
- except Exception as e:
- client.logger.info(f"\nError tracking file generation job: {e}")
- raise
-
- if download_path:
- await client.mass_stats.download_file(
- job_name=body["name"],
- bucket=bucket,
- file_type='processed',
- folder='file-gen',
- page_size=100,
- output_path=download_path,
- )
- else:
- path = f"{body['name']}/outputs/merged/{download_file_name}"
- client.logger.info(f"Dataset file/s is available at {path}")
-
- return {"generation_task_id": task_id, "combine_task_id": combine_task_id}