terrakio-core 0.2.6__tar.gz → 0.2.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of terrakio-core might be problematic.
- {terrakio_core-0.2.6 → terrakio_core-0.2.8}/PKG-INFO +1 -1
- {terrakio_core-0.2.6 → terrakio_core-0.2.8}/pyproject.toml +1 -1
- terrakio_core-0.2.8/terrakio_core/__init__.py +7 -0
- {terrakio_core-0.2.6 → terrakio_core-0.2.8}/terrakio_core/client.py +284 -86
- {terrakio_core-0.2.6 → terrakio_core-0.2.8}/terrakio_core/dataset_management.py +1 -1
- {terrakio_core-0.2.6 → terrakio_core-0.2.8}/terrakio_core.egg-info/PKG-INFO +1 -1
- terrakio_core-0.2.6/terrakio_core/__init__.py +0 -0
- {terrakio_core-0.2.6 → terrakio_core-0.2.8}/README.md +0 -0
- {terrakio_core-0.2.6 → terrakio_core-0.2.8}/setup.cfg +0 -0
- {terrakio_core-0.2.6 → terrakio_core-0.2.8}/terrakio_core/auth.py +0 -0
- {terrakio_core-0.2.6 → terrakio_core-0.2.8}/terrakio_core/config.py +0 -0
- {terrakio_core-0.2.6 → terrakio_core-0.2.8}/terrakio_core/exceptions.py +0 -0
- {terrakio_core-0.2.6 → terrakio_core-0.2.8}/terrakio_core/group_access_management.py +0 -0
- {terrakio_core-0.2.6 → terrakio_core-0.2.8}/terrakio_core/mass_stats.py +0 -0
- {terrakio_core-0.2.6 → terrakio_core-0.2.8}/terrakio_core/space_management.py +0 -0
- {terrakio_core-0.2.6 → terrakio_core-0.2.8}/terrakio_core/user_management.py +0 -0
- {terrakio_core-0.2.6 → terrakio_core-0.2.8}/terrakio_core.egg-info/SOURCES.txt +0 -0
- {terrakio_core-0.2.6 → terrakio_core-0.2.8}/terrakio_core.egg-info/dependency_links.txt +0 -0
- {terrakio_core-0.2.6 → terrakio_core-0.2.8}/terrakio_core.egg-info/requires.txt +0 -0
- {terrakio_core-0.2.6 → terrakio_core-0.2.8}/terrakio_core.egg-info/top_level.txt +0 -0
{terrakio_core-0.2.6 → terrakio_core-0.2.8}/terrakio_core/client.py

@@ -11,14 +11,17 @@ import xarray as xr
 import nest_asyncio
 from shapely.geometry import shape, mapping
 from shapely.geometry.base import BaseGeometry as ShapelyGeometry
-
+from google.cloud import storage
 from .exceptions import APIError, ConfigurationError
+import logging
+import textwrap
+

 class BaseClient:
     def __init__(self, url: Optional[str] = None, key: Optional[str] = None,
                  auth_url: Optional[str] = "https://dev-au.terrak.io",
                  quiet: bool = False, config_file: Optional[str] = None,
-                 verify: bool = True, timeout: int =
+                 verify: bool = True, timeout: int = 300):
         nest_asyncio.apply()
         self.quiet = quiet
         self.verify = verify
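For orientation, a minimal construction sketch against the new __init__ signature. The import path is an assumption (the class lives in terrakio_core/client.py, but 0.2.8's __init__.py may re-export it differently), and the URL and key are placeholders:

    from terrakio_core.client import BaseClient  # assumed import path

    # timeout now defaults to 300 seconds; verify and quiet keep their previous defaults
    client = BaseClient(
        url="https://api.example.terrak.io",  # hypothetical API endpoint
        key="YOUR_API_KEY",                   # placeholder credential
        quiet=True,
    )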
@@ -86,9 +89,10 @@ class BaseClient:
         )
         return self._aiohttp_session

-    async def wcs_async(self, expr: str, feature: Union[Dict[str, Any], ShapelyGeometry],
-
-
+    async def wcs_async(self, expr: str, feature: Union[Dict[str, Any], ShapelyGeometry],
+                        in_crs: str = "epsg:4326", out_crs: str = "epsg:4326",
+                        output: str = "csv", resolution: int = -1, buffer: bool = False,
+                        retry: int = 3, **kwargs):
         """
         Asynchronous version of the wcs() method using aiohttp.

@@ -99,6 +103,8 @@ class BaseClient:
             out_crs (str): Output coordinate reference system
             output (str): Output format ('csv' or 'netcdf')
             resolution (int): Resolution parameter
+            buffer (bool): Whether to buffer the request (default True)
+            retry (int): Number of retry attempts (default 3)
             **kwargs: Additional parameters to pass to the WCS request

         Returns:
@@ -111,8 +117,7 @@ class BaseClient:
                 "geometry": mapping(feature),
                 "properties": {}
             }
-
-
+
         payload = {
             "feature": feature,
             "in_crs": in_crs,
@@ -120,47 +125,65 @@ class BaseClient:
             "output": output,
             "resolution": resolution,
             "expr": expr,
+            "buffer": buffer,
+            "resolution": resolution,
             **kwargs
         }
-
         request_url = f"{self.url}/geoquery"
-   [removed lines 127–153 of the previous implementation; their content is not shown in this diff view]
+        for attempt in range(retry + 1):
+            try:
+                session = await self.aiohttp_session
+                async with session.post(request_url, json=payload, ssl=self.verify) as response:
+                    if not response.ok:
+                        should_retry = False
+                        if response.status in [408, 502, 503, 504]:
+                            should_retry = True
+                        elif response.status == 500:
+                            try:
+                                response_text = await response.text()
+                                if "Internal server error" not in response_text:
+                                    should_retry = True
+                            except:
+                                should_retry = True
+
+                        if should_retry and attempt < retry:
+                            continue
+                        else:
+                            error_msg = f"API request failed: {response.status} {response.reason}"
+                            try:
+                                error_data = await response.json()
+                                if "detail" in error_data:
+                                    error_msg += f" - {error_data['detail']}"
+                            except:
+                                pass
+                            raise APIError(error_msg)
+
+                    content = await response.read()
+
+                    if output.lower() == "csv":
                         import pandas as pd
+                        df = pd.read_csv(BytesIO(content))
+                        return df
+                    elif output.lower() == "netcdf":
+                        return xr.open_dataset(BytesIO(content))
+                    else:
                         try:
-                            return
-                        except:
-   [removed lines 158–163; their content is not shown in this diff view]
+                            return xr.open_dataset(BytesIO(content))
+                        except ValueError:
+                            import pandas as pd
+                            try:
+                                return pd.read_csv(BytesIO(content))
+                            except:
+                                return content
+
+            except aiohttp.ClientError as e:
+                if attempt == retry:
+                    raise APIError(f"Request failed: {str(e)}")
+                continue
+            except Exception as e:
+                if attempt == retry:
+                    raise
+                continue

     async def close_async(self):
         """Close the aiohttp session"""
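For reference, a minimal sketch of calling the retry-aware wcs_async added above. The client construction and import path are assumed, and the geometry is a placeholder; the expression string is the example given in the 0.2.8 docstring. Note that the docstring states buffer defaults to True while the signature default is False; the sketch passes it explicitly:

    import asyncio
    from terrakio_core.client import BaseClient  # assumed import path

    point = {
        "type": "Feature",
        "geometry": {"type": "Point", "coordinates": [149.1, -35.3]},  # placeholder location
        "properties": {},
    }

    async def main():
        client = BaseClient(url="https://api.example.terrak.io", key="YOUR_API_KEY")
        # retry=3 re-issues the POST on 408/502/503/504 (and some 500) responses;
        # output="csv" parses the response body into a pandas DataFrame
        df = await client.wcs_async(
            expr="MSWX.air_temperature@(year=2021, month=1)",
            feature=point, output="csv", buffer=False, retry=3,
        )
        await client.close_async()
        return df

    asyncio.run(main())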
@@ -174,41 +197,6 @@ class BaseClient:
     async def __aexit__(self, exc_type, exc_val, exc_tb):
         await self.close_async()

-    def validate_feature(self, feature: Dict[str, Any]) -> None:
-        if hasattr(feature, 'is_valid'):
-            from shapely.geometry import mapping
-            feature = {
-                "type": "Feature",
-                "geometry": mapping(feature),
-                "properties": {}
-            }
-        if not isinstance(feature, dict):
-            raise ValueError("Feature must be a dictionary or a Shapely geometry")
-        if feature.get("type") != "Feature":
-            raise ValueError("GeoJSON object must be of type 'Feature'")
-        if "geometry" not in feature:
-            raise ValueError("Feature must contain a 'geometry' field")
-        if "properties" not in feature:
-            raise ValueError("Feature must contain a 'properties' field")
-        try:
-            geometry = shape(feature["geometry"])
-        except Exception as e:
-            raise ValueError(f"Invalid geometry format: {str(e)}")
-        if not geometry.is_valid:
-            raise ValueError(f"Invalid geometry: {geometry.is_valid_reason}")
-        geom_type = feature["geometry"]["type"]
-        if geom_type == "Point":
-            if len(feature["geometry"]["coordinates"]) != 2:
-                raise ValueError("Point must have exactly 2 coordinates")
-        elif geom_type == "Polygon":
-            if not geometry.is_simple:
-                raise ValueError("Polygon must be simple (not self-intersecting)")
-            if geometry.area == 0:
-                raise ValueError("Polygon must have non-zero area")
-            coords = feature["geometry"]["coordinates"][0]
-            if coords[0] != coords[-1]:
-                raise ValueError("Polygon must be closed (first and last points must match)")
-
     def signup(self, email: str, password: str) -> Dict[str, Any]:
         if not self.auth_client:
             raise ConfigurationError("Authentication client not initialized. Please provide auth_url during client initialization.")
@@ -309,7 +297,6 @@ class BaseClient:
                 "geometry": mapping(feature),
                 "properties": {}
             }
-        self.validate_feature(feature)
         payload = {
             "feature": feature,
             "in_crs": in_crs,
@@ -321,7 +308,10 @@ class BaseClient:
         }
         request_url = f"{self.url}/geoquery"
         try:
+            print("the request url is ", request_url)
+            print("the payload is ", payload)
             response = self.session.post(request_url, json=payload, timeout=self.timeout, verify=self.verify)
+            print("the response is ", response.text)
             if not response.ok:
                 error_msg = f"API request failed: {response.status_code} {response.reason}"
                 try:
@@ -690,15 +680,39 @@ class BaseClient:
         )
         return self.mass_stats.random_sample(name, **kwargs)

-    async def zonal_stats_async(self, gdb, expr, conc=20, inplace=False, output="csv"
+    async def zonal_stats_async(self, gdb, expr, conc=20, inplace=False, output="csv",
+                                in_crs="epsg:4326", out_crs="epsg:4326", resolution=-1, buffer=False):
         """
         Compute zonal statistics for all geometries in a GeoDataFrame using asyncio for concurrency.
+
+        Args:
+            gdb (geopandas.GeoDataFrame): GeoDataFrame containing geometries
+            expr (str): Terrakio expression to evaluate, can include spatial aggregations
+            conc (int): Number of concurrent requests to make
+            inplace (bool): Whether to modify the input GeoDataFrame in place
+            output (str): Output format (csv or netcdf)
+            in_crs (str): Input coordinate reference system
+            out_crs (str): Output coordinate reference system
+            resolution (int): Resolution parameter
+            buffer (bool): Whether to buffer the request (default True)
+
+        Returns:
+            geopandas.GeoDataFrame: GeoDataFrame with added columns for results, or None if inplace=True
         """
+        if conc > 100:
+            raise ValueError("Concurrency (conc) is too high. Please set conc to 100 or less.")

         # Process geometries in batches
         all_results = []
         row_indices = []

+        # Calculate total batches for progress reporting
+        total_geometries = len(gdb)
+        total_batches = (total_geometries + conc - 1) // conc  # Ceiling division
+        completed_batches = 0
+
+        print(f"Processing {total_geometries} geometries with concurrency {conc}")
+
         async def process_geometry(geom, index):
             """Process a single geometry"""
             try:
@@ -707,7 +721,8 @@ class BaseClient:
                     "geometry": mapping(geom),
                     "properties": {"index": index}
                 }
-                result = await self.wcs_async(expr=expr, feature=feature, output=output
+                result = await self.wcs_async(expr=expr, feature=feature, output=output,
+                                              in_crs=in_crs, out_crs=out_crs, resolution=resolution, buffer=buffer)
                 # Add original index to track which geometry this result belongs to
                 if isinstance(result, pd.DataFrame):
                     result['_geometry_index'] = index
@@ -749,11 +764,19 @@ class BaseClient:
                 batch_results = await process_batch(batch_indices)
                 all_results.extend(batch_results)
                 row_indices.extend(batch_indices)
+
+                # Update progress
+                completed_batches += 1
+                processed_geometries = min(i + conc, total_geometries)
+                print(f"Progress: {completed_batches}/{total_batches} completed ({processed_geometries}/{total_geometries} geometries processed)")
+
             except Exception as e:
                 if hasattr(e, 'response'):
                     raise APIError(f"API request failed: {e.response.text}")
                 raise

+        print("All batches completed! Processing results...")
+
         if not all_results:
             raise ValueError("No valid results were returned for any geometry")

@@ -845,7 +868,8 @@ class BaseClient:
         else:
             return result_gdf

-    def zonal_stats(self, gdb, expr, conc=20, inplace=False, output="csv"
+    def zonal_stats(self, gdb, expr, conc=20, inplace=False, output="csv",
+                    in_crs="epsg:4326", out_crs="epsg:4326", resolution=-1, buffer=False):
         """
         Compute zonal statistics for all geometries in a GeoDataFrame.

@@ -855,32 +879,44 @@ class BaseClient:
             conc (int): Number of concurrent requests to make
             inplace (bool): Whether to modify the input GeoDataFrame in place
             output (str): Output format (csv or netcdf)
+            in_crs (str): Input coordinate reference system
+            out_crs (str): Output coordinate reference system
+            resolution (int): Resolution parameter
+            buffer (bool): Whether to buffer the request (default True)

         Returns:
             geopandas.GeoDataFrame: GeoDataFrame with added columns for results, or None if inplace=True
         """
+        if conc > 100:
+            raise ValueError("Concurrency (conc) is too high. Please set conc to 100 or less.")
         import asyncio

+        print(f"Starting zonal statistics computation for expression: {expr}")
+
         # Check if we're in a Jupyter environment or already have an event loop
         try:
             loop = asyncio.get_running_loop()
             # We're in an async context (like Jupyter), use create_task
             nest_asyncio.apply()
-            result = asyncio.run(self.zonal_stats_async(gdb, expr, conc, inplace, output
+            result = asyncio.run(self.zonal_stats_async(gdb, expr, conc, inplace, output,
+                                                        in_crs, out_crs, resolution, buffer))
         except RuntimeError:
             # No running event loop, safe to use asyncio.run()
-            result = asyncio.run(self.zonal_stats_async(gdb, expr, conc, inplace, output
+            result = asyncio.run(self.zonal_stats_async(gdb, expr, conc, inplace, output,
+                                                        in_crs, out_crs, resolution, buffer))
         except ImportError:
             # nest_asyncio not available, try alternative approach
             try:
                 loop = asyncio.get_running_loop()
                 # Create task in existing loop
-                task = loop.create_task(self.zonal_stats_async(gdb, expr, conc, inplace, output
+                task = loop.create_task(self.zonal_stats_async(gdb, expr, conc, inplace, output,
+                                                               in_crs, out_crs, resolution, buffer))
                 # This won't work directly - we need a different approach
                 raise RuntimeError("Cannot run async code in Jupyter without nest_asyncio. Please install: pip install nest-asyncio")
             except RuntimeError:
                 # No event loop, use asyncio.run
-                result = asyncio.run(self.zonal_stats_async(gdb, expr, conc, inplace, output
+                result = asyncio.run(self.zonal_stats_async(gdb, expr, conc, inplace, output,
+                                                            in_crs, out_crs, resolution, buffer))

         # Ensure aiohttp session is closed after running async code
         try:
@@ -890,6 +926,7 @@ class BaseClient:
             # Event loop may already be closed, ignore
             pass

+        print("Zonal statistics computation completed!")
         return result

     # Group access management protected methods
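A minimal sketch of calling the extended zonal_stats shown above. The client construction, input file, and expression syntax are all placeholders (the docstring only says the expression may include spatial aggregations):

    import geopandas as gpd
    from terrakio_core.client import BaseClient  # assumed import path

    client = BaseClient(url="https://api.example.terrak.io", key="YOUR_API_KEY")
    gdf = gpd.read_file("paddocks.geojson")  # hypothetical input file

    # conc must be 100 or less in 0.2.8; in_crs/out_crs/resolution/buffer are forwarded to wcs_async
    result = client.zonal_stats(
        gdf,
        expr="mean(MSWX.air_temperature@(year=2021, month=1))",  # illustrative expression
        conc=20, output="csv",
        in_crs="epsg:4326", out_crs="epsg:4326",
    )
    print(result.head())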
@@ -1011,6 +1048,80 @@ class BaseClient:
             timeout=self.timeout
         )
         return self.space_management.delete_data_in_path(path, region)
+
+    def generate_ai_dataset(
+        self,
+        name: str,
+        aoi_geojson: str,
+        expression_x: str,
+        expression_y: str,
+        samples: int,
+        tile_size: int,
+        crs: str = "epsg:4326",
+        res: float = 0.001,
+        region: str = "aus",
+        start_year: int = None,
+        end_year: int = None,
+    ) -> dict:
+        """
+        Generate an AI dataset using specified parameters.
+
+        Args:
+            name (str): Name of the dataset to generate
+            aoi_geojson (str): Path to GeoJSON file containing area of interest
+            expression_x (str): Expression for X variable (e.g. "MSWX.air_temperature@(year=2021, month=1)")
+            expression_y (str): Expression for Y variable with {year} placeholder
+            samples (int): Number of samples to generate
+            tile_size (int): Size of tiles in degrees
+            crs (str, optional): Coordinate reference system. Defaults to "epsg:4326"
+            res (float, optional): Resolution in degrees. Defaults to 0.001
+            region (str, optional): Region code. Defaults to "aus"
+            start_year (int, optional): Start year for data generation. Required if end_year provided
+            end_year (int, optional): End year for data generation. Required if start_year provided
+            overwrite (bool, optional): Whether to overwrite existing dataset. Defaults to False
+
+        Returns:
+            dict: Response from the AI dataset generation API
+
+        Raises:
+            ValidationError: If required parameters are missing or invalid
+            APIError: If the API request fails
+        """
+
+        # we have the parameters, let pass the parameters to the random sample function
+        # task_id = self.random_sample(name, aoi_geojson, expression_x, expression_y, samples, tile_size, crs, res, region, start_year, end_year, overwrite)
+        config = {
+            "expressions" : [{"expr": expression_x, "res": res, "prefix": "x"}],
+            "filters" : []
+        }
+        config["expressions"].append({"expr": expression_y, "res" : res, "prefix": "y"})
+
+        expression_x = expression_x.replace("{year}", str(start_year))
+        expression_y = expression_y.replace("{year}", str(start_year))
+        print("the aoi geojson is ", aoi_geojson)
+        with open(aoi_geojson, 'r') as f:
+            aoi_data = json.load(f)
+        print("the config is ", config)
+        task_id = self.random_sample(
+            name=name,
+            config=config,
+            aoi=aoi_data,
+            samples=samples,
+            year_range=[start_year, end_year],
+            crs=crs,
+            tile_size=tile_size,
+            res=res,
+            region=region,
+            output="netcdf",
+            server=self.url,
+            bucket="terrakio-mass-requests",
+            overwrite=True
+        )["task_id"]
+        print("the task id is ", task_id)
+        task_id = self.start_mass_stats_job(task_id)
+        print("the task id is ", task_id)
+        return task_id
+

     def train_model(self, model_name: str, training_data: dict) -> dict:
         """
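A usage sketch for the new generate_ai_dataset helper, which reads the AOI GeoJSON from disk, submits a random_sample mass-stats job, starts it, and returns the task id. Dataset name, file path, Y expression, and years below are placeholders; the X expression is the docstring's own example:

    client = BaseClient(url="https://api.example.terrak.io", key="YOUR_API_KEY")

    task_id = client.generate_ai_dataset(
        name="canopy-height-training",          # hypothetical dataset name
        aoi_geojson="aoi.geojson",               # local GeoJSON file with the area of interest
        expression_x="MSWX.air_temperature@(year=2021, month=1)",
        expression_y="CanopyHeight.height@(year={year})",  # illustrative expression with {year} placeholder
        samples=500,
        tile_size=1,
        start_year=2020,
        end_year=2021,
    )
    # task_id is the identifier of the mass-stats job started for the sampling run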
@@ -1044,3 +1155,90 @@ class BaseClient:
         except requests.RequestException as e:
             raise APIError(f"Model training request failed: {str(e)}")

+    def deploy_model(self, dataset: str, product:str, model_name:str, input_expression: str, model_training_job_name: str, uid: str, dates_iso8601: list):
+        # we have the dataset and we have the product, and we have the model name, we need to create a new json file and add that to the dataset as our virtual dataset
+        # upload the script to the bucket, the script should be able to download the model and do the inferencing
+        # we need to upload the the json to the to the dataset as our virtual dataset
+        # then we do nothing and wait for the user to make the request call to the explorer
+        # we should have a uniform script for the random forest deployment
+        # create a script for each model
+        # upload script to google bucket,
+        #
+
+        script_content = self._generate_script(model_name, product, model_training_job_name, uid)
+        # self.create_dataset(collection = "terrakio-datasets", input = input, )
+        # we have the script, we need to upload it to the bucket
+        script_name = f"{product}.py"
+        print("the script content is ", script_content)
+        print("the script name is ", script_name)
+        self._upload_script_to_bucket(script_content, script_name, model_training_job_name, uid)
+        # after uploading the script, we need to create a new virtual dataset
+        self._create_dataset(name = dataset, collection = "terrakio-datasets", products = [product], path = f"gs://terrakio-mass-requests/{uid}/{model_training_job_name}/inference_scripts", input = input_expression, dates_iso8601 = dates_iso8601, padding = 0)
+
+    def _generate_script(self, model_name: str, product: str, model_training_job_name: str, uid: str) -> str:
+        return textwrap.dedent(f'''
+            import logging
+            from io import BytesIO
+            from google.cloud import storage
+            from onnxruntime import InferenceSession
+            import numpy as np
+            import xarray as xr
+            import datetime
+
+            logging.basicConfig(
+                level=logging.INFO
+            )
+
+            def get_model():
+                logging.info("Loading model for {model_name}...")
+
+                client = storage.Client()
+                bucket = client.get_bucket('terrakio-mass-requests')
+                blob = bucket.blob('{uid}/{model_training_job_name}/models/{model_name}.onnx')
+
+                model = BytesIO()
+                blob.download_to_file(model)
+                model.seek(0)
+
+                session = InferenceSession(model.read(), providers=["CPUExecutionProvider"])
+                return session
+
+            def {product}(*bands, model):
+                logging.info("start preparing data")
+
+                original_shape = bands[0].shape
+                logging.info(f"Original shape: {{original_shape}}")
+
+                transformed_bands = []
+                for band in bands:
+                    transformed_band = band.values.reshape(-1,1)
+                    transformed_bands.append(transformed_band)
+
+                input_data = np.hstack(transformed_bands)
+
+                logging.info(f"Final input shape: {{input_data.shape}}")
+
+                output = model.run(None, {{"float_input": input_data.astype(np.float32)}})[0]
+
+                logging.info(f"Model output shape: {{output.shape}}")
+
+                output_reshaped = output.reshape(original_shape)
+                result = xr.DataArray(
+                    data=output_reshaped,
+                    dims=bands[0].dims,
+                    coords=bands[0].coords
+                )
+
+                return result
+        ''').strip()
+
+    def _upload_script_to_bucket(self, script_content: str, script_name: str, model_training_job_name: str, uid: str):
+        """Upload the generated script to Google Cloud Storage"""
+
+        client = storage.Client()
+        bucket = client.get_bucket('terrakio-mass-requests')
+        blob = bucket.blob(f'{uid}/{model_training_job_name}/inference_scripts/{script_name}')
+        # the first layer is the uid, the second layer is the model training job name
+        blob.upload_from_string(script_content, content_type='text/plain')
+        logging.info(f"Script uploaded successfully to {uid}/{model_training_job_name}/inference_scripts/{script_name}")
+
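A sketch of how deploy_model above would be invoked. It generates an ONNX inference script named after the product, uploads it with google.cloud.storage (so Google Cloud credentials must be available in the environment), and then registers a virtual dataset pointing at the uploaded script. Every argument value here is a placeholder:

    client = BaseClient(url="https://api.example.terrak.io", key="YOUR_API_KEY")

    client.deploy_model(
        dataset="my_virtual_dataset",             # virtual dataset to create
        product="canopy_height",                  # becomes the generated script/function name
        model_name="rf_canopy_v1",                # ONNX model stored under .../models/ in the bucket
        input_expression="S2.B04@(year=2023)",    # illustrative input expression
        model_training_job_name="train-2023-01",  # folder under the user's bucket prefix
        uid="user-1234",                          # bucket prefix for the user
        dates_iso8601=["2023-01-01"],
    )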
{terrakio_core-0.2.6 → terrakio_core-0.2.8}/terrakio_core/dataset_management.py

@@ -117,7 +117,7 @@ class DatasetManagement:

         # Add optional parameters if provided
         for param in ["products", "dates_iso8601", "bucket", "path", "data_type",
-                      "no_data", "l_max", "y_size", "x_size", "proj4", "abstract", "geotransform"]:
+                      "no_data", "l_max", "y_size", "x_size", "proj4", "abstract", "geotransform", "input"]:
             if param in kwargs:
                 payload[param] = kwargs[param]

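The only change in dataset_management.py is that "input" is now accepted as an optional dataset field, which is what deploy_model relies on when it passes input=input_expression to _create_dataset. A hedged sketch of the corresponding call, assuming the client exposes a create_dataset wrapper as the commented-out call in deploy_model suggests; all values are placeholders:

    client.create_dataset(
        name="my_virtual_dataset",
        collection="terrakio-datasets",
        products=["canopy_height"],
        path="gs://terrakio-mass-requests/user-1234/train-2023-01/inference_scripts",
        input="S2.B04@(year=2023)",    # newly forwarded into the dataset payload
        dates_iso8601=["2023-01-01"],
    )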