terrakio-core 0.4.8-py3-none-any.whl → 0.4.94-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of terrakio-core might be problematic.

@@ -1,30 +1,145 @@
-
- import os
+ # Standard library imports
  import asyncio
- import tempfile
- import time
- import pandas as pd
- import geopandas as gpd
- from geopandas import GeoDataFrame
- from shapely.geometry import mapping
- from pathlib import Path
- from ..exceptions import APIError, ConfigurationError
- from ..helper.bounded_taskgroup import BoundedTaskGroup
- from ..helper.tiles import tiles
+ import psutil
+ import random
  import uuid
+ from io import BytesIO
+ from typing import Optional
+
+ # Third-party library imports
+ import aiohttp
+ import geopandas as gpd
+ import nest_asyncio
+ import pandas as pd
+ import pyproj
  import xarray as xr
- import random
- import psutil
- import copy
- from shapely.geometry import shape
+ from geopandas import GeoDataFrame
+ from shapely.geometry import box, mapping, shape
  from shapely.ops import transform
- from shapely.geometry import box
- import pyproj

- import pandas as pd
- import geopandas as gpd
+ # Local imports
+ from .geoquries import request_geoquery_list

- from typing import Optional
+ nest_asyncio.apply()
+ class cloud_object(gpd.GeoDataFrame):
+     """
+     This class is a class used for cloud
+     """
+     def __init__(self, job_id: str, job_name: str, client=None):
+
+         super().__init__({
+             'geometry': [],
+             'dataset': []
+         })
+
+         self.job_id = job_id
+         self.client = client
+         self.job_name = job_name
+
+     def head(self, n = 5):
+         """
+         Returns the first n files stored in the cloud bucket.
+         """
+         return asyncio.run(self._head_async(n))
+
+     async def _head_async(self, n = 5):
+         """
+         Returns the first n files stored in the cloud bucket.
+
+         Args:
+             n (int): Number of files to return. Default is 5.
+
+         Returns:
+             GeoDataFrame: A GeoDataFrame containing the first n files.
+         """
+
+         track_info = await self.client.mass_stats.track_job([self.job_id])
+         job_info = track_info[self.job_id]
+         status = job_info['status']
+
+         if status == "Completed":
+             payload = {
+                 "job_name": job_info["name"],
+                 "file_type": "raw",
+                 "bucket": job_info["bucket"],
+             }
+             result = await self.client._terrakio_request("POST", "mass_stats/download_files", json=payload)
+             download_urls = result["download_urls"][:n]
+             datasets = []
+
+             async with aiohttp.ClientSession() as session:
+                 for i, url in enumerate(download_urls):
+                     try:
+                         self.client.logger.info(f"Downloading dataset {i+1}/{len(download_urls)}...")
+                         async with session.get(url) as response:
+                             if response.status == 200:
+                                 content = await response.read()
+                                 dataset = xr.open_dataset(BytesIO(content))
+                                 datasets.append(dataset)
+                                 self.client.logger.info(f"Successfully processed dataset {i+1}")
+                             else:
+                                 self.client.logger.warning(f"Failed to download dataset {i+1}: HTTP {response.status}")
+                     except Exception as e:
+                         self.client.logger.error(f"Error downloading dataset {i+1}: {e}")
+                         continue
+                 if not datasets:
+                     self.client.logger.warning("No datasets were successfully downloaded")
+                     return gpd.GeoDataFrame({'geometry': [], 'dataset': []})
+                 try:
+                     json_response = await self.client._terrakio_request(
+                         "POST", "mass_stats/download_json",
+                         params={"job_name": job_info['name']}
+                     )
+                     json_url = json_response["download_url"]
+
+                     async with session.get(json_url) as response:
+                         if response.status == 200:
+                             json_data = await response.json()
+                             self.client.logger.info("Successfully downloaded geometry data")
+
+                             geometries = []
+                             max_geometries = min(n, len(json_data), len(datasets))
+
+                             for i in range(max_geometries):
+                                 try:
+                                     geom_dict = json_data[i]["request"]["feature"]["geometry"]
+                                     shapely_geom = shape(geom_dict)
+                                     geometries.append(shapely_geom)
+                                 except (KeyError, ValueError) as e:
+                                     self.client.logger.warning(f"Error parsing geometry {i}: {e}")
+                                     continue
+
+                             min_length = min(len(datasets), len(geometries))
+                             if min_length == 0:
+                                 self.client.logger.warning("No matching datasets and geometries found")
+                                 return gpd.GeoDataFrame({'geometry': [], 'dataset': []})
+
+                             gdf = gpd.GeoDataFrame({
+                                 'geometry': geometries[:min_length],
+                                 'dataset': datasets[:min_length]
+                             })
+
+                             self.client.logger.info(f"Created GeoDataFrame with {len(gdf)} rows")
+                             try:
+                                 expanded_gdf = expand_on_variables_and_time(gdf)
+                                 return expanded_gdf
+                             except NameError:
+                                 self.client.logger.warning("expand_on_variables_and_time function not found, returning raw GeoDataFrame")
+                                 return gdf
+
+                         else:
+                             self.client.logger.warning(f"Failed to download geometry data: HTTP {response.status}")
+                             return gpd.GeoDataFrame({'geometry': [], 'dataset': []})
+
+                 except Exception as e:
+                     self.client.logger.error(f"Error downloading geometry data: {e}")
+                     return gpd.GeoDataFrame({'geometry': [], 'dataset': []})
+
+         elif status in ["Failed", "Cancelled", "Error"]:
+             raise RuntimeError(f"The zonal stats job (job_id: {self.job_id}) has failed, cancelled, or errored. Please check the job status!")
+
+         else:
+             raise RuntimeError(f"The zonal stats job (job_id: {self.job_id}) is still running. Please come back at a later time!")

  def expand_on_time(gdf):
      """
@@ -90,6 +205,8 @@ def expand_on_time(gdf):
      result_gdf = gpd.GeoDataFrame(result_df, geometry='geometry')
      result_gdf = result_gdf.set_index(['geometry'])

+     result_gdf.attrs = gdf.attrs.copy()
+
      return result_gdf

  def expand_on_variables(gdf):
@@ -143,7 +260,7 @@ def expand_on_variables(gdf):
            raise ValueError("Expected 'dataset' column for variable expansion")

      result_df = pd.DataFrame(rows)
-
+
      if 'time' in result_df.columns:
          result_gdf = gpd.GeoDataFrame(result_df, geometry='geometry')
          result_gdf = result_gdf.set_index(['geometry', 'time'])
@@ -151,9 +268,10 @@ def expand_on_variables(gdf):
          result_gdf = gpd.GeoDataFrame(result_df, geometry='geometry')
          result_gdf = result_gdf.set_index(['geometry'])

+     result_gdf.attrs = gdf.attrs.copy()
+
      return result_gdf

-
  def expand_on_variables_and_time(gdf):
      """
      Convenience function to expand on both variables and time.
@@ -169,7 +287,7 @@ def expand_on_variables_and_time(gdf):
          return expanded_on_variables_and_time
      except Exception as e:
          return expanded_on_time
-
+
  def estimate_geometry_size_ratio(queries: list):
      """Calculate size ratios for all geometries relative to the first geometry using bounding box area."""

@@ -217,101 +335,6 @@ async def estimate_query_size(
          total_size_mb += first_query_dataset.nbytes * ratios[i] / (1024**2)
      return total_size_mb

- async def request_geoquery_list(
-     client,
-     quries: list[dict],
-     conc: int = 20,
- ):
-     """
-     Execute multiple geo queries.
-
-     Args:
-         client: The Terrakio client instance
-         quries: List of dictionaries containing query parameters
-         conc: The concurrency level for the requests
-
-     Returns:
-         List of query results
-
-     Raises:
-         ValueError: If the queries list is empty
-     """
-     if not quries:
-         raise ValueError("Queries list cannot be empty")
-     if conc > 100:
-         raise ValueError("Concurrency (conc) is too high. Please set conc to 100 or less.")
-
-     for i, query in enumerate(quries):
-         if 'expr' not in query:
-             raise ValueError(f"Query at index {i} is missing the required 'expr' key")
-         if 'feature' not in query:
-             raise ValueError(f"Query at index {i} is missing the required 'feature' key")
-         if 'in_crs' not in query:
-             raise ValueError(f"Query at index {i} is missing the required 'in_crs' key")
-
-     completed_count = 0
-     lock = asyncio.Lock()
-     async def single_geo_query(query):
-         """
-         Execute multiple geo queries concurrently.
-
-         Args:
-             quries: List of dictionaries containing query parameters
-         """
-         total_number_of_requests = len(quries)
-         nonlocal completed_count
-         try:
-             result = await client.geoquery(**query)
-             if isinstance(result, dict) and result.get("error"):
-                 error_msg = f"Request failed: {result.get('error_message', 'Unknown error')}"
-                 if result.get('status_code'):
-                     error_msg = f"Request failed with status {result['status_code']}: {result.get('error_message', 'Unknown error')}"
-                 raise APIError(error_msg)
-             if isinstance(result, list):
-                 result = result[0]
-                 timestamp_number = result['request_count']
-                 return timestamp_number
-             if not isinstance(result, xr.Dataset):
-                 raise ValueError(f"Expected xarray Dataset, got {type(result)}")
-
-             async with lock:
-                 completed_count += 1
-                 if completed_count % max(1, total_number_of_requests // 10) == 0:
-                     client.logger.info(f"Progress: {completed_count}/{total_number_of_requests} requests processed")
-             return result
-         except Exception as e:
-             async with lock:
-                 completed_count += 1
-             raise
-
-     try:
-         async with BoundedTaskGroup(max_concurrency=conc) as tg:
-             tasks = [tg.create_task(single_geo_query(quries[idx])) for idx in range(len(quries))]
-         all_results = [task.result() for task in tasks]
-
-     except* Exception as eg:
-         for e in eg.exceptions:
-             if hasattr(e, 'response'):
-                 raise APIError(f"API request failed: {e.response.text}")
-         raise
-     client.logger.info("All requests completed!")
-
-     if not all_results:
-         raise ValueError("No valid results were returned for any geometry")
-     if isinstance(all_results, list) and type(all_results[0]) == int:
-         return sum(all_results)/len(all_results)
-     else:
-         geometries = []
-         for query in quries:
-             feature = query['feature']
-             geometry = shape(feature['geometry'])
-             geometries.append(geometry)
-         result_gdf = gpd.GeoDataFrame({
-             'geometry': geometries,
-             'dataset': all_results
-         })
-         return result_gdf
-
  async def estimate_timestamp_number(
      client,
      quries: list[dict],
@@ -388,9 +411,7 @@ def gdf_to_json(
      """
      mass_stats_requests = []

-     # Loop through each row in the GeoDataFrame
      for idx, row in gdf.iterrows():
-         # Create the request feature
          request_feature = {
              "expr": expr,
              "feature": {
@@ -404,29 +425,24 @@
              "geom_fix": geom_fix,
          }

-         # Determine group name and file name based on id_column
          if id_column is not None and id_column in gdf.columns:
-             # Use the value from the specified column as group and file name
              identifier = str(row[id_column])
              group_name = f"group_{identifier}"
              file_name = f"file_{identifier}"
          else:
-             # Use the index as group and file name
              group_name = f"group_{idx}"
              file_name = f"file_{idx}"

-         # Create the complete request entry
          request_entry = {
              "group": group_name,
              "file": file_name,
              "request": request_feature,
          }

-         # Add the request to our list
          mass_stats_requests.append(request_entry)

      return mass_stats_requests
-
+
  async def handle_mass_stats(
      client,
      gdf: GeoDataFrame,
@@ -436,17 +452,24 @@ async def handle_mass_stats(
      resolution: int = -1,
      geom_fix: bool = False,
      id_column: Optional[str] = None,
-
  ):
-     request_json = gdf_to_json(gdf = gdf, expr = expr, in_crs = in_crs, out_crs = out_crs, resolution = resolution, geom_fix = geom_fix, id_column = id_column)
-     job_id =await client.mass_stats.execute_job(
-         name = "zonal_stats_job",
-         output = "netcdf",
-         config = {},
-         request_json = request_json,
-         overwrite = True,
+     request_json = gdf_to_json(gdf=gdf, expr=expr, in_crs=in_crs, out_crs=out_crs,
+                                resolution=resolution, geom_fix=geom_fix, id_column=id_column)
+
+     job_response = await client.mass_stats.execute_job(
+         name=f"zonal-stats-{str(uuid.uuid4())[:6]}",
+         output="netcdf",
+         config={},
+         request_json=request_json,
+         overwrite=True,
      )
-     return job_id
+
+     # Extract the actual task ID from the response
+     if isinstance(job_response, dict) and 'task_id' in job_response:
+         return job_response['task_id'] # Return just the string ID
+     else:
+         return job_response # In case it's already just the ID
+

  async def zonal_stats(
      client,
@@ -461,7 +484,6 @@ async def zonal_stats(
      id_column: Optional[str] = None,
  ):
      """Compute zonal statistics for all geometries in a GeoDataFrame."""
-
      if mass_stats:
          mass_stats_id = await handle_mass_stats(
              client = client,
@@ -471,9 +493,13 @@ async def zonal_stats(
              out_crs = out_crs,
              resolution = resolution,
              geom_fix = geom_fix,
-             id_column = id_column
+             id_column = id_column,
          )
-         return mass_stats_id
+         job_name = await client.mass_stats.track_job([mass_stats_id])
+         job_name = job_name[mass_stats_id]["name"]
+         cloud_files_object = cloud_object(job_id = mass_stats_id, job_name = job_name, client = client)
+         return cloud_files_object
+
      quries = []
      for i in range(len(gdf)):
          quries.append({
@@ -494,130 +520,9 @@ async def zonal_stats(
          raise ValueError(local_or_remote_result["reason"])
      else:
          gdf_with_datasets = await request_geoquery_list(client = client, quries = quries, conc = conc)
+         gdf_with_datasets.attrs["cloud_metadata"] = {
+             "is_cloud_backed": False,
+         }
          gdf_with_datasets = expand_on_variables_and_time(gdf_with_datasets)
      return gdf_with_datasets

- async def create_dataset_file(
-     client,
-     aoi: str,
-     expression: str,
-     output: str,
-     download_path: str,
-     in_crs: str = "epsg:4326",
-     to_crs: str = "epsg:4326",
-     res: float = 0.0001,
-     region: str = None,
-     overwrite: bool = False,
-     skip_existing: bool = False,
-     non_interactive: bool = True,
-     name: str | None = None,
-     poll_interval: int = 30,
-     max_file_size_mb: int = 5120,
-     tile_size: int = 1024,
-     mask: bool = True
- ) -> dict:
-
-     if not name:
-         name = f"file-gen-{uuid.uuid4().hex[:8]}"
-
-     body, reqs, groups = tiles(
-         name = name,
-         aoi = aoi,
-         expression = expression,
-         output = output,
-         tile_size = tile_size,
-         crs = in_crs,
-         res = res,
-         region = region,
-         to_crs = to_crs,
-         mask = mask,
-         overwrite = overwrite,
-         skip_existing = skip_existing,
-         non_interactive = non_interactive
-     )
-     with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as tempreq:
-         tempreq.write(reqs)
-         tempreqname = tempreq.name
-
-     task_id = await client.mass_stats.execute_job(
-         name=body["name"],
-         region=body["region"],
-         output=body["output"],
-         config = {},
-         overwrite=body["overwrite"],
-         skip_existing=body["skip_existing"],
-         request_json=tempreqname,
-     )
-
-     start_time = time.time()
-     status = None
-     client.logger.info(f"Tracking data generation job {task_id['task_id']}...")
-     while True:
-         try:
-             taskid = task_id['task_id']
-             trackinfo = await client.mass_stats.track_job([taskid])
-             status = trackinfo[taskid]['status']
-             if status == 'Completed':
-                 client.logger.info('Data generated successfully!')
-                 break
-             elif status in ['Failed', 'Cancelled', 'Error']:
-                 raise RuntimeError(f"Job {taskid} failed with status: {status}")
-             else:
-                 elapsed_time = time.time() - start_time
-                 client.logger.info(f"Job status: {status} - Elapsed time: {elapsed_time:.1f}s")
-                 await asyncio.sleep(poll_interval)
-
-
-         except KeyboardInterrupt:
-             client.logger.info(f"\nInterrupted! Job {taskid} is still running in the background.")
-             raise
-         except Exception as e:
-             client.logger.info(f"\nError tracking job: {e}")
-             raise
-
-     os.unlink(tempreqname)
-
-     combine_result = await client.mass_stats.combine_tiles(body["name"], body["overwrite"], body["output"], max_file_size_mb=max_file_size_mb)
-     combine_task_id = combine_result.get("task_id")
-
-     combine_start_time = time.time()
-     client.logger.info(f"Tracking file generation job {combine_task_id}...")
-     while True:
-         try:
-             trackinfo = await client.mass_stats.track_job([combine_task_id])
-             if body["output"] == "netcdf":
-                 download_file_name = trackinfo[combine_task_id]['folder'] + '.nc'
-             elif body["output"] == "geotiff":
-                 download_file_name = trackinfo[combine_task_id]['folder'] + '.tif'
-             bucket = trackinfo[combine_task_id]['bucket']
-             combine_status = trackinfo[combine_task_id]['status']
-             if combine_status == 'Completed':
-                 client.logger.info('File/s generated successfully!')
-                 break
-             elif combine_status in ['Failed', 'Cancelled', 'Error']:
-                 raise RuntimeError(f"File generation job {combine_task_id} failed with status: {combine_status}")
-             else:
-                 elapsed_time = time.time() - combine_start_time
-                 client.logger.info(f"File generation job status: {combine_status} - Elapsed time: {elapsed_time:.1f}s")
-                 time.sleep(poll_interval)
-         except KeyboardInterrupt:
-             client.logger.info(f"\nInterrupted! File generation job {combine_task_id} is still running in the background.")
-             raise
-         except Exception as e:
-             client.logger.info(f"\nError tracking file generation job: {e}")
-             raise
-
-     if download_path:
-         await client.mass_stats.download_file(
-             job_name=body["name"],
-             bucket=bucket,
-             file_type='processed',
-             folder='file-gen',
-             page_size=100,
-             output_path=download_path,
-         )
-     else:
-         path = f"{body['name']}/outputs/merged/{download_file_name}"
-         client.logger.info(f"Dataset file/s is available at {path}")
-
-     return {"generation_task_id": task_id, "combine_task_id": combine_task_id}