terrakio-core 0.4.8-py3-none-any.whl → 0.4.94-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of terrakio-core might be problematic.

@@ -1,30 +1,145 @@
-
- import os
+ # Standard library imports
  import asyncio
- import tempfile
- import time
- import pandas as pd
- import geopandas as gpd
- from geopandas import GeoDataFrame
- from shapely.geometry import mapping
- from pathlib import Path
- from ..exceptions import APIError, ConfigurationError
- from ..helper.bounded_taskgroup import BoundedTaskGroup
- from ..helper.tiles import tiles
+ import psutil
+ import random
  import uuid
+ from io import BytesIO
+ from typing import Optional
+
+ # Third-party library imports
+ import aiohttp
+ import geopandas as gpd
+ import nest_asyncio
+ import pandas as pd
+ import pyproj
  import xarray as xr
- import random
- import psutil
- import copy
- from shapely.geometry import shape
+ from geopandas import GeoDataFrame
+ from shapely.geometry import box, mapping, shape
  from shapely.ops import transform
- from shapely.geometry import box
- import pyproj

- import pandas as pd
- import geopandas as gpd
+ # Local imports
+ from .geoquries import request_geoquery_list

- from typing import Optional
+ nest_asyncio.apply()
+ class cloud_object(gpd.GeoDataFrame):
+     """
+     This class is a class used for cloud
+     """
+     def __init__(self, job_id: str, job_name: str, client=None):
+
+         super().__init__({
+             'geometry': [],
+             'dataset': []
+         })
+
+         self.job_id = job_id
+         self.client = client
+         self.job_name = job_name
+
+     def head(self, n = 5):
+         """
+         Returns the first n files stored in the cloud bucket.
+         """
+         return asyncio.run(self._head_async(n))
+
+     async def _head_async(self, n = 5):
+         """
+         Returns the first n files stored in the cloud bucket.
+
+         Args:
+             n (int): Number of files to return. Default is 5.
+
+         Returns:
+             GeoDataFrame: A GeoDataFrame containing the first n files.
+         """
+
+         track_info = await self.client.mass_stats.track_job([self.job_id])
+         job_info = track_info[self.job_id]
+         status = job_info['status']
+
+         if status == "Completed":
+             payload = {
+                 "job_name": job_info["name"],
+                 "file_type": "raw",
+                 "bucket": job_info["bucket"],
+             }
+             result = await self.client._terrakio_request("POST", "mass_stats/download_files", json=payload)
+             download_urls = result["download_urls"][:n]
+             datasets = []
+
+             async with aiohttp.ClientSession() as session:
+                 for i, url in enumerate(download_urls):
+                     try:
+                         self.client.logger.info(f"Downloading dataset {i+1}/{len(download_urls)}...")
+                         async with session.get(url) as response:
+                             if response.status == 200:
+                                 content = await response.read()
+                                 dataset = xr.open_dataset(BytesIO(content))
+                                 datasets.append(dataset)
+                                 self.client.logger.info(f"Successfully processed dataset {i+1}")
+                             else:
+                                 self.client.logger.warning(f"Failed to download dataset {i+1}: HTTP {response.status}")
+                     except Exception as e:
+                         self.client.logger.error(f"Error downloading dataset {i+1}: {e}")
+                         continue
+                 if not datasets:
+                     self.client.logger.warning("No datasets were successfully downloaded")
+                     return gpd.GeoDataFrame({'geometry': [], 'dataset': []})
+                 try:
+                     json_response = await self.client._terrakio_request(
+                         "POST", "mass_stats/download_json",
+                         params={"job_name": job_info['name']}
+                     )
+                     json_url = json_response["download_url"]
+
+                     async with session.get(json_url) as response:
+                         if response.status == 200:
+                             json_data = await response.json()
+                             self.client.logger.info("Successfully downloaded geometry data")
+
+                             geometries = []
+                             max_geometries = min(n, len(json_data), len(datasets))
+
+                             for i in range(max_geometries):
+                                 try:
+                                     geom_dict = json_data[i]["request"]["feature"]["geometry"]
+                                     shapely_geom = shape(geom_dict)
+                                     geometries.append(shapely_geom)
+                                 except (KeyError, ValueError) as e:
+                                     self.client.logger.warning(f"Error parsing geometry {i}: {e}")
+                                     continue
+
+                             min_length = min(len(datasets), len(geometries))
+                             if min_length == 0:
+                                 self.client.logger.warning("No matching datasets and geometries found")
+                                 return gpd.GeoDataFrame({'geometry': [], 'dataset': []})
+
+                             gdf = gpd.GeoDataFrame({
+                                 'geometry': geometries[:min_length],
+                                 'dataset': datasets[:min_length]
+                             })
+
+                             self.client.logger.info(f"Created GeoDataFrame with {len(gdf)} rows")
+                             try:
+                                 expanded_gdf = expand_on_variables_and_time(gdf)
+                                 return expanded_gdf
+                             except NameError:
+                                 self.client.logger.warning("expand_on_variables_and_time function not found, returning raw GeoDataFrame")
+                                 return gdf
+
+                         else:
+                             self.client.logger.warning(f"Failed to download geometry data: HTTP {response.status}")
+                             return gpd.GeoDataFrame({'geometry': [], 'dataset': []})
+
+                 except Exception as e:
+                     self.client.logger.error(f"Error downloading geometry data: {e}")
+                     return gpd.GeoDataFrame({'geometry': [], 'dataset': []})
+
+         elif status in ["Failed", "Cancelled", "Error"]:
+             raise RuntimeError(f"The zonal stats job (job_id: {self.job_id}) has failed, cancelled, or errored. Please check the job status!")
+
+         else:
+             raise RuntimeError(f"The zonal stats job (job_id: {self.job_id}) is still running. Please come back at a later time!")

  def expand_on_time(gdf):
      """
@@ -90,6 +205,8 @@ def expand_on_time(gdf):
      result_gdf = gpd.GeoDataFrame(result_df, geometry='geometry')
      result_gdf = result_gdf.set_index(['geometry'])

+     result_gdf.attrs = gdf.attrs.copy()
+
      return result_gdf

  def expand_on_variables(gdf):
@@ -143,7 +260,7 @@ def expand_on_variables(gdf):
            raise ValueError("Expected 'dataset' column for variable expansion")

      result_df = pd.DataFrame(rows)
-
+
      if 'time' in result_df.columns:
          result_gdf = gpd.GeoDataFrame(result_df, geometry='geometry')
          result_gdf = result_gdf.set_index(['geometry', 'time'])
@@ -151,9 +268,10 @@ def expand_on_variables(gdf):
          result_gdf = gpd.GeoDataFrame(result_df, geometry='geometry')
          result_gdf = result_gdf.set_index(['geometry'])

+     result_gdf.attrs = gdf.attrs.copy()
+
      return result_gdf

-
  def expand_on_variables_and_time(gdf):
      """
      Convenience function to expand on both variables and time.
@@ -169,7 +287,7 @@ def expand_on_variables_and_time(gdf):
          return expanded_on_variables_and_time
      except Exception as e:
          return expanded_on_time
-
+
  def estimate_geometry_size_ratio(queries: list):
      """Calculate size ratios for all geometries relative to the first geometry using bounding box area."""

@@ -217,101 +335,6 @@ async def estimate_query_size(
          total_size_mb += first_query_dataset.nbytes * ratios[i] / (1024**2)
      return total_size_mb

- async def request_geoquery_list(
-     client,
-     quries: list[dict],
-     conc: int = 20,
- ):
-     """
-     Execute multiple geo queries.
-
-     Args:
-         client: The Terrakio client instance
-         quries: List of dictionaries containing query parameters
-         conc: The concurrency level for the requests
-
-     Returns:
-         List of query results
-
-     Raises:
-         ValueError: If the queries list is empty
-     """
-     if not quries:
-         raise ValueError("Queries list cannot be empty")
-     if conc > 100:
-         raise ValueError("Concurrency (conc) is too high. Please set conc to 100 or less.")
-
-     for i, query in enumerate(quries):
-         if 'expr' not in query:
-             raise ValueError(f"Query at index {i} is missing the required 'expr' key")
-         if 'feature' not in query:
-             raise ValueError(f"Query at index {i} is missing the required 'feature' key")
-         if 'in_crs' not in query:
-             raise ValueError(f"Query at index {i} is missing the required 'in_crs' key")
-
-     completed_count = 0
-     lock = asyncio.Lock()
-     async def single_geo_query(query):
-         """
-         Execute multiple geo queries concurrently.
-
-         Args:
-             quries: List of dictionaries containing query parameters
-         """
-         total_number_of_requests = len(quries)
-         nonlocal completed_count
-         try:
-             result = await client.geoquery(**query)
-             if isinstance(result, dict) and result.get("error"):
-                 error_msg = f"Request failed: {result.get('error_message', 'Unknown error')}"
-                 if result.get('status_code'):
-                     error_msg = f"Request failed with status {result['status_code']}: {result.get('error_message', 'Unknown error')}"
-                 raise APIError(error_msg)
-             if isinstance(result, list):
-                 result = result[0]
-                 timestamp_number = result['request_count']
-                 return timestamp_number
-             if not isinstance(result, xr.Dataset):
-                 raise ValueError(f"Expected xarray Dataset, got {type(result)}")
-
-             async with lock:
-                 completed_count += 1
-                 if completed_count % max(1, total_number_of_requests // 10) == 0:
-                     client.logger.info(f"Progress: {completed_count}/{total_number_of_requests} requests processed")
-             return result
-         except Exception as e:
-             async with lock:
-                 completed_count += 1
-             raise
-
-     try:
-         async with BoundedTaskGroup(max_concurrency=conc) as tg:
-             tasks = [tg.create_task(single_geo_query(quries[idx])) for idx in range(len(quries))]
-         all_results = [task.result() for task in tasks]
-
-     except* Exception as eg:
-         for e in eg.exceptions:
-             if hasattr(e, 'response'):
-                 raise APIError(f"API request failed: {e.response.text}")
-         raise
-     client.logger.info("All requests completed!")
-
-     if not all_results:
-         raise ValueError("No valid results were returned for any geometry")
-     if isinstance(all_results, list) and type(all_results[0]) == int:
-         return sum(all_results)/len(all_results)
-     else:
-         geometries = []
-         for query in quries:
-             feature = query['feature']
-             geometry = shape(feature['geometry'])
-             geometries.append(geometry)
-         result_gdf = gpd.GeoDataFrame({
-             'geometry': geometries,
-             'dataset': all_results
-         })
-         return result_gdf
-
  async def estimate_timestamp_number(
      client,
      quries: list[dict],
@@ -388,9 +411,7 @@ def gdf_to_json(
      """
      mass_stats_requests = []

-     # Loop through each row in the GeoDataFrame
      for idx, row in gdf.iterrows():
-         # Create the request feature
          request_feature = {
              "expr": expr,
              "feature": {
@@ -404,29 +425,24 @@
              "geom_fix": geom_fix,
          }

-         # Determine group name and file name based on id_column
          if id_column is not None and id_column in gdf.columns:
-             # Use the value from the specified column as group and file name
              identifier = str(row[id_column])
              group_name = f"group_{identifier}"
              file_name = f"file_{identifier}"
          else:
-             # Use the index as group and file name
              group_name = f"group_{idx}"
              file_name = f"file_{idx}"

-         # Create the complete request entry
          request_entry = {
              "group": group_name,
              "file": file_name,
              "request": request_feature,
          }

-         # Add the request to our list
          mass_stats_requests.append(request_entry)

      return mass_stats_requests
-
+
  async def handle_mass_stats(
      client,
      gdf: GeoDataFrame,
@@ -436,17 +452,24 @@ async def handle_mass_stats(
      resolution: int = -1,
      geom_fix: bool = False,
      id_column: Optional[str] = None,
-
  ):
-     request_json = gdf_to_json(gdf = gdf, expr = expr, in_crs = in_crs, out_crs = out_crs, resolution = resolution, geom_fix = geom_fix, id_column = id_column)
-     job_id =await client.mass_stats.execute_job(
-         name = "zonal_stats_job",
-         output = "netcdf",
-         config = {},
-         request_json = request_json,
-         overwrite = True,
+     request_json = gdf_to_json(gdf=gdf, expr=expr, in_crs=in_crs, out_crs=out_crs,
+                                resolution=resolution, geom_fix=geom_fix, id_column=id_column)
+
+     job_response = await client.mass_stats.execute_job(
+         name=f"zonal-stats-{str(uuid.uuid4())[:6]}",
+         output="netcdf",
+         config={},
+         request_json=request_json,
+         overwrite=True,
      )
-     return job_id
+
+     # Extract the actual task ID from the response
+     if isinstance(job_response, dict) and 'task_id' in job_response:
+         return job_response['task_id'] # Return just the string ID
+     else:
+         return job_response # In case it's already just the ID
+

  async def zonal_stats(
      client,
@@ -461,7 +484,6 @@ async def zonal_stats(
      id_column: Optional[str] = None,
  ):
      """Compute zonal statistics for all geometries in a GeoDataFrame."""
-
      if mass_stats:
          mass_stats_id = await handle_mass_stats(
              client = client,
@@ -471,9 +493,13 @@ async def zonal_stats(
              out_crs = out_crs,
              resolution = resolution,
              geom_fix = geom_fix,
-             id_column = id_column
+             id_column = id_column,
          )
-         return mass_stats_id
+         job_name = await client.mass_stats.track_job([mass_stats_id])
+         job_name = job_name[mass_stats_id]["name"]
+         cloud_files_object = cloud_object(job_id = mass_stats_id, job_name = job_name, client = client)
+         return cloud_files_object
+
      quries = []
      for i in range(len(gdf)):
          quries.append({
@@ -494,130 +520,9 @@ async def zonal_stats(
          raise ValueError(local_or_remote_result["reason"])
      else:
          gdf_with_datasets = await request_geoquery_list(client = client, quries = quries, conc = conc)
+         gdf_with_datasets.attrs["cloud_metadata"] = {
+             "is_cloud_backed": False,
+         }
          gdf_with_datasets = expand_on_variables_and_time(gdf_with_datasets)
      return gdf_with_datasets

- async def create_dataset_file(
-     client,
-     aoi: str,
-     expression: str,
-     output: str,
-     download_path: str,
-     in_crs: str = "epsg:4326",
-     to_crs: str = "epsg:4326",
-     res: float = 0.0001,
-     region: str = None,
-     overwrite: bool = False,
-     skip_existing: bool = False,
-     non_interactive: bool = True,
-     name: str | None = None,
-     poll_interval: int = 30,
-     max_file_size_mb: int = 5120,
-     tile_size: int = 1024,
-     mask: bool = True
- ) -> dict:
-
-     if not name:
-         name = f"file-gen-{uuid.uuid4().hex[:8]}"
-
-     body, reqs, groups = tiles(
-         name = name,
-         aoi = aoi,
-         expression = expression,
-         output = output,
-         tile_size = tile_size,
-         crs = in_crs,
-         res = res,
-         region = region,
-         to_crs = to_crs,
-         mask = mask,
-         overwrite = overwrite,
-         skip_existing = skip_existing,
-         non_interactive = non_interactive
-     )
-     with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as tempreq:
-         tempreq.write(reqs)
-         tempreqname = tempreq.name
-
-     task_id = await client.mass_stats.execute_job(
-         name=body["name"],
-         region=body["region"],
-         output=body["output"],
-         config = {},
-         overwrite=body["overwrite"],
-         skip_existing=body["skip_existing"],
-         request_json=tempreqname,
-     )
-
-     start_time = time.time()
-     status = None
-     client.logger.info(f"Tracking data generation job {task_id['task_id']}...")
-     while True:
-         try:
-             taskid = task_id['task_id']
-             trackinfo = await client.mass_stats.track_job([taskid])
-             status = trackinfo[taskid]['status']
-             if status == 'Completed':
-                 client.logger.info('Data generated successfully!')
-                 break
-             elif status in ['Failed', 'Cancelled', 'Error']:
-                 raise RuntimeError(f"Job {taskid} failed with status: {status}")
-             else:
-                 elapsed_time = time.time() - start_time
-                 client.logger.info(f"Job status: {status} - Elapsed time: {elapsed_time:.1f}s")
-                 await asyncio.sleep(poll_interval)
-
-
-         except KeyboardInterrupt:
-             client.logger.info(f"\nInterrupted! Job {taskid} is still running in the background.")
-             raise
-         except Exception as e:
-             client.logger.info(f"\nError tracking job: {e}")
-             raise
-
-     os.unlink(tempreqname)
-
-     combine_result = await client.mass_stats.combine_tiles(body["name"], body["overwrite"], body["output"], max_file_size_mb=max_file_size_mb)
-     combine_task_id = combine_result.get("task_id")
-
-     combine_start_time = time.time()
-     client.logger.info(f"Tracking file generation job {combine_task_id}...")
-     while True:
-         try:
-             trackinfo = await client.mass_stats.track_job([combine_task_id])
-             if body["output"] == "netcdf":
-                 download_file_name = trackinfo[combine_task_id]['folder'] + '.nc'
-             elif body["output"] == "geotiff":
-                 download_file_name = trackinfo[combine_task_id]['folder'] + '.tif'
-             bucket = trackinfo[combine_task_id]['bucket']
-             combine_status = trackinfo[combine_task_id]['status']
-             if combine_status == 'Completed':
-                 client.logger.info('File/s generated successfully!')
-                 break
-             elif combine_status in ['Failed', 'Cancelled', 'Error']:
-                 raise RuntimeError(f"File generation job {combine_task_id} failed with status: {combine_status}")
-             else:
-                 elapsed_time = time.time() - combine_start_time
-                 client.logger.info(f"File generation job status: {combine_status} - Elapsed time: {elapsed_time:.1f}s")
-                 time.sleep(poll_interval)
-         except KeyboardInterrupt:
-             client.logger.info(f"\nInterrupted! File generation job {combine_task_id} is still running in the background.")
-             raise
-         except Exception as e:
-             client.logger.info(f"\nError tracking file generation job: {e}")
-             raise
-
-     if download_path:
-         await client.mass_stats.download_file(
-             job_name=body["name"],
-             bucket=bucket,
-             file_type='processed',
-             folder='file-gen',
-             page_size=100,
-             output_path=download_path,
-         )
-     else:
-         path = f"{body['name']}/outputs/merged/{download_file_name}"
-         client.logger.info(f"Dataset file/s is available at {path}")
-
-     return {"generation_task_id": task_id, "combine_task_id": combine_task_id}