terrakio-core 0.3.1__tar.gz → 0.3.2__tar.gz
This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Note: this version of terrakio-core has been flagged as potentially problematic.
- {terrakio_core-0.3.1 → terrakio_core-0.3.2}/PKG-INFO +2 -1
- {terrakio_core-0.3.1 → terrakio_core-0.3.2}/pyproject.toml +2 -1
- {terrakio_core-0.3.1 → terrakio_core-0.3.2}/terrakio_core/__init__.py +1 -1
- {terrakio_core-0.3.1 → terrakio_core-0.3.2}/terrakio_core/client.py +129 -66
- {terrakio_core-0.3.1 → terrakio_core-0.3.2}/terrakio_core/dataset_management.py +62 -10
- {terrakio_core-0.3.1 → terrakio_core-0.3.2}/terrakio_core/mass_stats.py +81 -73
- {terrakio_core-0.3.1 → terrakio_core-0.3.2}/terrakio_core.egg-info/PKG-INFO +2 -1
- {terrakio_core-0.3.1 → terrakio_core-0.3.2}/terrakio_core.egg-info/requires.txt +1 -0
- {terrakio_core-0.3.1 → terrakio_core-0.3.2}/README.md +0 -0
- {terrakio_core-0.3.1 → terrakio_core-0.3.2}/setup.cfg +0 -0
- {terrakio_core-0.3.1 → terrakio_core-0.3.2}/terrakio_core/auth.py +0 -0
- {terrakio_core-0.3.1 → terrakio_core-0.3.2}/terrakio_core/config.py +0 -0
- {terrakio_core-0.3.1 → terrakio_core-0.3.2}/terrakio_core/decorators.py +0 -0
- {terrakio_core-0.3.1 → terrakio_core-0.3.2}/terrakio_core/exceptions.py +0 -0
- {terrakio_core-0.3.1 → terrakio_core-0.3.2}/terrakio_core/generation/tiles.py +0 -0
- {terrakio_core-0.3.1 → terrakio_core-0.3.2}/terrakio_core/group_access_management.py +0 -0
- {terrakio_core-0.3.1 → terrakio_core-0.3.2}/terrakio_core/space_management.py +0 -0
- {terrakio_core-0.3.1 → terrakio_core-0.3.2}/terrakio_core/user_management.py +0 -0
- {terrakio_core-0.3.1 → terrakio_core-0.3.2}/terrakio_core.egg-info/SOURCES.txt +0 -0
- {terrakio_core-0.3.1 → terrakio_core-0.3.2}/terrakio_core.egg-info/dependency_links.txt +0 -0
- {terrakio_core-0.3.1 → terrakio_core-0.3.2}/terrakio_core.egg-info/top_level.txt +0 -0
{terrakio_core-0.3.1 → terrakio_core-0.3.2}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: terrakio-core
-Version: 0.3.1
+Version: 0.3.2
 Summary: Core components for Terrakio API clients
 Author-email: Yupeng Chao <yupeng@haizea.com.au>
 Project-URL: Homepage, https://github.com/HaizeaAnalytics/terrakio-python-api
@@ -22,6 +22,7 @@ Requires-Dist: xarray>=2023.1.0
 Requires-Dist: shapely>=2.0.0
 Requires-Dist: geopandas>=0.13.0
 Requires-Dist: google-cloud-storage>=2.0.0
+Requires-Dist: nest_asyncio
 
 # Terrakio Core
 
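The only dependency change in 0.3.2 is the new nest_asyncio requirement. nest_asyncio patches an already-running asyncio event loop so it can be re-entered, the usual workaround when a synchronous client drives async code from environments such as Jupyter that already run a loop. The diff does not show where terrakio-core calls it, so the snippet below is a generic illustration of the library rather than code from this package:

    # Generic illustration of what nest_asyncio provides (not from terrakio-core).
    import asyncio
    import nest_asyncio

    nest_asyncio.apply()  # patch the running loop so it can be re-entered

    async def fetch():
        await asyncio.sleep(0.1)
        return "done"

    # Inside an already-running loop (e.g. Jupyter) this would normally raise
    # RuntimeError; with nest_asyncio applied it completes.
    print(asyncio.get_event_loop().run_until_complete(fetch()))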
{terrakio_core-0.3.1 → terrakio_core-0.3.2}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "terrakio-core"
-version = "0.3.1"
+version = "0.3.2"
 authors = [
     {name = "Yupeng Chao", email = "yupeng@haizea.com.au"},
 ]
@@ -29,6 +29,7 @@ dependencies = [
     "shapely>=2.0.0",
     "geopandas>=0.13.0",
     "google-cloud-storage>=2.0.0",
+    "nest_asyncio",
 ]
 
 [project.urls]
{terrakio_core-0.3.1 → terrakio_core-0.3.2}/terrakio_core/client.py

@@ -130,6 +130,7 @@ class BaseClient:
             "resolution": resolution,
             **kwargs
         }
+        print("the payload is ", payload)
         request_url = f"{self.url}/geoquery"
         for attempt in range(retry + 1):
             try:
@@ -565,7 +566,7 @@ class BaseClient:
         )
         return self.mass_stats.get_task_id(name, stage, uid)
 
-    def track_mass_stats_job(self, ids=None):
+    def track_mass_stats_job(self, ids: Optional[list] = None):
         if not self.mass_stats:
             from terrakio_core.mass_stats import MassStats
             if not self.url or not self.key:
@@ -1038,6 +1039,20 @@ class BaseClient:
         )
         return self.space_management.delete_data_in_path(path, region)
 
+    def start_mass_stats_job(self, task_id):
+        if not self.mass_stats:
+            from terrakio_core.mass_stats import MassStats
+            if not self.url or not self.key:
+                raise ConfigurationError("Mass Stats client not initialized. Make sure API URL and key are set.")
+            self.mass_stats = MassStats(
+                base_url=self.url,
+                api_key=self.key,
+                verify=self.verify,
+                timeout=self.timeout
+            )
+        return self.mass_stats.start_job(task_id)
+
+
     def generate_ai_dataset(
         self,
         name: str,
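The new start_mass_stats_job repeats the lazy-initialization idiom used throughout BaseClient: build the MassStats helper on first use and fail fast when the API URL or key is missing. A condensed sketch of that shared idiom (the _ensure_mass_stats name is hypothetical; ConfigurationError and MassStats are the names from the diff):

    # Hypothetical refactoring sketch of the idiom the diff repeats verbatim
    # in start_mass_stats_job, track_mass_stats_job and download_file_to_path.
    def _ensure_mass_stats(self):
        if not self.mass_stats:
            from terrakio_core.mass_stats import MassStats
            if not self.url or not self.key:
                raise ConfigurationError(
                    "Mass Stats client not initialized. Make sure API URL and key are set."
                )
            self.mass_stats = MassStats(
                base_url=self.url,
                api_key=self.key,
                verify=self.verify,
                timeout=self.timeout,
            )
        return self.mass_stats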
@@ -1107,43 +1122,30 @@ class BaseClient:
             overwrite=True
         )["task_id"]
         print("the task id is ", task_id)
-        task_id = self.start_mass_stats_job(task_id)
-        print("the task id is ", task_id)
-        return task_id
 
+        # Wait for job completion
+        import time
+
+        while True:
+            result = self.track_mass_stats_job(ids=[task_id])
+            status = result[task_id]['status']
+            print(f"Job status: {status}")
+
+            if status == "Completed":
+                break
+            elif status == "Error":
+                raise Exception(f"Job {task_id} encountered an error")
+
+            # Wait 30 seconds before checking again
+            time.sleep(30)
 
-
-
-
-
-        # Args:
-        #     model_name (str): The name of the model to train.
-        #     training_data (dict): Dictionary containing training data parameters.
-
-        # Returns:
-        #     dict: The response from the model training API.
-        # """
-        # endpoint = "https://modeltraining-573248941006.australia-southeast1.run.app/train_model"
-        # payload = {
-        #     "model_name": model_name,
-        #     "training_data": training_data
-        # }
-        # try:
-        #     response = self.session.post(endpoint, json=payload, timeout=self.timeout, verify=self.verify)
-        #     if not response.ok:
-        #         error_msg = f"Model training request failed: {response.status_code} {response.reason}"
-        #         try:
-        #             error_data = response.json()
-        #             if "detail" in error_data:
-        #                 error_msg += f" - {error_data['detail']}"
-        #         except Exception:
-        #             if response.text:
-        #                 error_msg += f" - {response.text}"
-        #         raise APIError(error_msg)
-        #     return response.json()
-        # except requests.RequestException as e:
-        #     raise APIError(f"Model training request failed: {str(e)}")
+        # print("the result is ", result)
+        # after all the random sample jos are done, we then start the mass stats job
+        task_id = self.start_mass_stats_job(task_id)
+        # now we hav ethe random sampel
 
+        # print("the task id is ", task_id)
+        return task_id
 
     def train_model(self, model_name: str, training_dataset: str, task_type: str, model_category: str, architecture: str, region: str, hyperparameters: dict = None) -> dict:
         """
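The net effect of this hunk is that generate_ai_dataset no longer returns as soon as the sampling job is submitted: it now polls track_mass_stats_job every 30 seconds and only starts the mass-stats stage once the sampling job reports "Completed". A standalone sketch of the same polling contract (the wait_for_job helper is hypothetical; the status strings follow the diff):

    # Hypothetical helper equivalent to the polling loop added above.
    import time

    def wait_for_job(client, task_id, poll_seconds=30):
        # Poll until the task reaches a terminal state reported by the API.
        while True:
            status = client.track_mass_stats_job(ids=[task_id])[task_id]["status"]
            if status == "Completed":
                return task_id
            if status == "Error":
                raise RuntimeError(f"Job {task_id} encountered an error")
            time.sleep(poll_seconds)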
@@ -1209,7 +1211,7 @@ class BaseClient:
 
 
 
-    def
+    def create_dataset_file(
         self,
         name: str,
         aoi: str,
@@ -1329,30 +1331,22 @@ class BaseClient:
 
         return self.mass_stats.combine_tiles(body["name"], usezarr, body["overwrite"], body["output"])
 
-
-
-
-
-
-
-
-
     def deploy_model(self, dataset: str, product:str, model_name:str, input_expression: str, model_training_job_name: str, uid: str, dates_iso8601: list):
         script_content = self._generate_script(model_name, product, model_training_job_name, uid)
         script_name = f"{product}.py"
         self._upload_script_to_bucket(script_content, script_name, model_training_job_name, uid)
-        # after uploading the script, we need to create a new virtual dataset
         self._create_dataset(name = dataset, collection = "terrakio-datasets", products = [product], path = f"gs://terrakio-mass-requests/{uid}/{model_training_job_name}/inference_scripts", input = input_expression, dates_iso8601 = dates_iso8601, padding = 0)
 
     def _generate_script(self, model_name: str, product: str, model_training_job_name: str, uid: str) -> str:
         return textwrap.dedent(f'''
             import logging
             from io import BytesIO
-
-            from onnxruntime import InferenceSession
+
             import numpy as np
+            import pandas as pd
             import xarray as xr
-            import
+            from google.cloud import storage
+            from onnxruntime import InferenceSession
 
             logging.basicConfig(
                 level=logging.INFO
@@ -1360,47 +1354,95 @@ class BaseClient:
 
             def get_model():
                 logging.info("Loading model for {model_name}...")
-
+
                 client = storage.Client()
                 bucket = client.get_bucket('terrakio-mass-requests')
                 blob = bucket.blob('{uid}/{model_training_job_name}/models/{model_name}.onnx')
-
+
                 model = BytesIO()
                 blob.download_to_file(model)
                 model.seek(0)
-
+
                 session = InferenceSession(model.read(), providers=["CPUExecutionProvider"])
                 return session
 
             def {product}(*bands, model):
                 logging.info("start preparing data")
+                print("the bands are ", bands)
 
-
-                logging.info(f"Original shape: {{original_shape}}")
+                data_arrays = list(bands)
 
-
-                for band in bands:
-                    transformed_band = band.values.reshape(-1,1)
-                    transformed_bands.append(transformed_band)
+                print("the data arrays are ", [da.name for da in data_arrays])
 
-
+                reference_array = data_arrays[0]
+                original_shape = reference_array.shape
+                logging.info(f"Original shape: {{original_shape}}")
 
+                if 'time' in reference_array.dims:
+                    time_coords = reference_array.coords['time']
+                    if len(time_coords) == 1:
+                        output_timestamp = time_coords[0]
+                    else:
+                        years = [pd.to_datetime(t).year for t in time_coords.values]
+                        unique_years = set(years)
+
+                        if len(unique_years) == 1:
+                            year = list(unique_years)[0]
+                            output_timestamp = pd.Timestamp(f"{{year}}-01-01")
+                        else:
+                            latest_year = max(unique_years)
+                            output_timestamp = pd.Timestamp(f"{{latest_year}}-01-01")
+                else:
+                    output_timestamp = pd.Timestamp("1970-01-01")
+
+                averaged_bands = []
+                for data_array in data_arrays:
+                    if 'time' in data_array.dims:
+                        averaged_band = np.mean(data_array.values, axis=0)
+                        logging.info(f"Averaged band from {{data_array.shape}} to {{averaged_band.shape}}")
+                    else:
+                        averaged_band = data_array.values
+                        logging.info(f"No time dimension, shape: {{averaged_band.shape}}")
+
+                    flattened_band = averaged_band.reshape(-1, 1)
+                    averaged_bands.append(flattened_band)
+
+                input_data = np.hstack(averaged_bands)
+
                 logging.info(f"Final input shape: {{input_data.shape}}")
-
+
                 output = model.run(None, {{"float_input": input_data.astype(np.float32)}})[0]
-
+
                 logging.info(f"Model output shape: {{output.shape}}")
 
-
+                if len(original_shape) >= 3:
+                    spatial_shape = original_shape[1:]
+                else:
+                    spatial_shape = original_shape
+
+                output_reshaped = output.reshape(spatial_shape)
+
+                output_with_time = np.expand_dims(output_reshaped, axis=0)
+
+                if 'time' in reference_array.dims:
+                    spatial_dims = [dim for dim in reference_array.dims if dim != 'time']
+                    spatial_coords = {{dim: reference_array.coords[dim] for dim in spatial_dims if dim in reference_array.coords}}
+                else:
+                    spatial_dims = list(reference_array.dims)
+                    spatial_coords = dict(reference_array.coords)
+
                 result = xr.DataArray(
-                    data=
-                    dims=
-                    coords=
+                    data=output_with_time.astype(np.float32),
+                    dims=['time'] + list(spatial_dims),
+                    coords={
+                        'time': [output_timestamp.values],
+                        'y': spatial_coords['y'].values,
+                        'x': spatial_coords['x'].values
+                    }
                 )
-
                 return result
         ''').strip()
-
+
     def _upload_script_to_bucket(self, script_content: str, script_name: str, model_training_job_name: str, uid: str):
         """Upload the generated script to Google Cloud Storage"""
 
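The generated {product} function time-averages each input band, flattens the results into an (n_pixels, n_bands) matrix, and feeds that to the ONNX session under the input name "float_input" before reshaping the prediction back onto the spatial grid. A minimal standalone illustration of that inference step (the model path, band count and shapes are assumptions; "float_input" follows the generated script):

    # Minimal sketch of the inference step; model.onnx and the shapes are
    # illustrative assumptions, and the model must expose a "float_input" input.
    import numpy as np
    from onnxruntime import InferenceSession

    session = InferenceSession("model.onnx", providers=["CPUExecutionProvider"])

    bands = [np.random.rand(3, 64, 64) for _ in range(4)]      # 4 bands, 3 timesteps
    averaged = [b.mean(axis=0).reshape(-1, 1) for b in bands]  # time-average, flatten
    input_data = np.hstack(averaged).astype(np.float32)        # (4096, 4) features

    output = session.run(None, {"float_input": input_data})[0]
    prediction_map = output.reshape(64, 64)                    # back onto the grid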
@@ -1410,3 +1452,24 @@ class BaseClient:
         blob.upload_from_string(script_content, content_type='text/plain')
         logging.info(f"Script uploaded successfully to {uid}/{model_training_job_name}/inference_scripts/{script_name}")
 
+
+
+
+    def download_file_to_path(self, job_name, stage, file_name, output_path):
+        if not self.mass_stats:
+            from terrakio_core.mass_stats import MassStats
+            if not self.url or not self.key:
+                raise ConfigurationError("Mass Stats client not initialized. Make sure API URL and key are set.")
+            self.mass_stats = MassStats(
+                base_url=self.url,
+                api_key=self.key,
+                verify=self.verify,
+                timeout=self.timeout
+            )
+
+        # fetch bucket info based on job name and stage
+
+        taskid = self.mass_stats.get_task_id(job_name, stage).get('task_id')
+        trackinfo = self.mass_stats.track_job([taskid])
+        bucket = trackinfo[taskid]['bucket']
+        return self.mass_stats.download_file(job_name, bucket, file_name, output_path)
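Together with MassStats.download_file below, this gives callers a one-call path from a job name to a local file: resolve the task id, look up the job's bucket via track_job, then download. A hypothetical usage (the client object and all argument values are illustrative, not from the diff):

    # Hypothetical usage; "client", the job name, stage and file names are
    # illustrative assumptions.
    local_path = client.download_file_to_path(
        job_name="my_sampling_job",
        stage="random_sample",
        file_name="results.csv",
        output_path="./downloads/results.csv",
    )
    print(local_path)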
{terrakio_core-0.3.1 → terrakio_core-0.3.2}/terrakio_core/dataset_management.py

@@ -83,10 +83,63 @@ class DatasetManagement:
         except requests.RequestException as e:
             raise APIError(f"Request failed: {str(e)}")
 
+    # def create_dataset(self, name: str, collection: str = "terrakio-datasets", **kwargs) -> Dict[str, Any]:
+    #     """
+    #     Create a new dataset.
+
+    #     Args:
+    #         name: Name of the dataset (required)
+    #         collection: Dataset collection (default: 'terrakio-datasets')
+    #         **kwargs: Additional dataset parameters including:
+    #             - products: List of products
+    #             - dates_iso8601: List of dates
+    #             - bucket: Storage bucket
+    #             - path: Storage path
+    #             - data_type: Data type
+    #             - no_data: No data value
+    #             - l_max: Maximum level
+    #             - y_size: Y size
+    #             - x_size: X size
+    #             - proj4: Projection string
+    #             - abstract: Dataset abstract
+    #             - geotransform: Geotransform parameters
+
+    #     Returns:
+    #         Created dataset information
+
+    #     Raises:
+    #         APIError: If the API request fails
+    #     """
+    #     endpoint = f"{self.api_url}/datasets"
+    #     params = {"collection": collection}
+    #     # Create payload with required name parameter
+    #     payload = {"name": name}
+
+    #     # Add optional parameters if provided
+    #     for param in ["products", "dates_iso8601", "bucket", "path", "data_type",
+    #                   "no_data", "l_max", "y_size", "x_size", "proj4", "abstract", "geotransform", "input"]:
+    #         if param in kwargs:
+    #             payload[param] = kwargs[param]
+
+    #     try:
+    #         response = self.session.post(
+    #             endpoint,
+    #             params=params,
+    #             json=payload,
+    #             timeout=self.timeout,
+    #             verify=self.verify
+    #         )
+
+    #         if not response.ok:
+    #             raise APIError(f"API request failed: {response.status_code} {response.reason}")
+    #         return response.json()
+    #     except requests.RequestException as e:
+    #         raise APIError(f"Request failed: {str(e)}")
+
     def create_dataset(self, name: str, collection: str = "terrakio-datasets", **kwargs) -> Dict[str, Any]:
         """
         Create a new dataset.
-
+
         Args:
             name: Name of the dataset (required)
             collection: Dataset collection (default: 'terrakio-datasets')
@@ -103,24 +156,23 @@ class DatasetManagement:
             - proj4: Projection string
             - abstract: Dataset abstract
             - geotransform: Geotransform parameters
-
+            - padding: Padding value
+
         Returns:
             Created dataset information
-
+
         Raises:
             APIError: If the API request fails
         """
         endpoint = f"{self.api_url}/datasets"
         params = {"collection": collection}
-        # Create payload with required name parameter
         payload = {"name": name}
-
-        # Add optional parameters if provided
-        for param in ["products", "dates_iso8601", "bucket", "path", "data_type",
-                      "no_data", "l_max", "y_size", "x_size", "proj4", "abstract", "geotransform", "input"]:
+
+        for param in ["products", "dates_iso8601", "bucket", "path", "data_type",
+                      "no_data", "l_max", "y_size", "x_size", "proj4", "abstract", "geotransform", "input", "padding"]:
             if param in kwargs:
                 payload[param] = kwargs[param]
-
+
         try:
             response = self.session.post(
                 endpoint,
@@ -129,7 +181,7 @@ class DatasetManagement:
                 timeout=self.timeout,
                 verify=self.verify
             )
-
+
             if not response.ok:
                 raise APIError(f"API request failed: {response.status_code} {response.reason}")
             return response.json()
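The functional change buried under the commented-out copy is small: create_dataset now also forwards a padding keyword into the request payload. A hypothetical call (dm stands for an already-constructed DatasetManagement instance; all values are illustrative):

    # Hypothetical call showing the newly accepted "padding" keyword.
    result = dm.create_dataset(
        name="my_dataset",
        collection="terrakio-datasets",
        products=["ndvi"],   # optional parameters are forwarded as before
        padding=0,           # new in 0.3.2: forwarded like the other kwargs
    )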
{terrakio_core-0.3.1 → terrakio_core-0.3.2}/terrakio_core/mass_stats.py

@@ -61,36 +61,89 @@ class MassStats:
         return response
 
 
-
-
-
+    def download_file(self, job_name: str, bucket:str, file_name: str, output_path: str) -> str:
+        """
+        Download a file from mass_stats using job name and file name.
+
+        Args:
+            job_name: Name of the job
+            file_name: Name of the file to download
+            output_path: Path where the file should be saved
+
+        Returns:
+            str: Path to the downloaded file
+        """
+        import os
+        from pathlib import Path
 
-
-
-
+        endpoint_url = f"{self.base_url}/mass_stats/download_files"
+        request_body = {
+            "job_name": job_name,
+            "bucket": bucket,
+            "file_name": file_name
+        }
+
+        try:
+            # Get signed URL
+            response = self.session.post(
+                endpoint_url,
+                json=request_body,
+                verify=self.verify,
+                timeout=self.timeout
+            )
+            signed_url = response.json().get('download_url')
+            if not signed_url:
+                raise Exception("No download URL received from server")
+            print(f"Generated signed URL for download")
+
+            # Create output directory if it doesn't exist
+            output_dir = Path(output_path).parent
+            output_dir.mkdir(parents=True, exist_ok=True)
+
+            # Download the file using the signed URL
+            download_response = self.session.get(
+                signed_url,
+                verify=self.verify,
+                timeout=self.timeout,
+                stream=True  # Stream for large files
+            )
+            download_response.raise_for_status()
+
+            # Check if file exists in the response (content-length header)
+            content_length = download_response.headers.get('content-length')
+            if content_length and int(content_length) == 0:
+                raise Exception("File appears to be empty")
 
-
-
-
-
-
-        # response = requests.get(
-        #     url,
-        #     verify=self.verify,
-        #     timeout=self.timeout
-        # )
-        # response.raise_for_status()
+            # Write the file
+            with open(output_path, 'wb') as file:
+                for chunk in download_response.iter_content(chunk_size=8192):
+                    if chunk:
+                        file.write(chunk)
 
-
-
-
-
-
+            # Verify file was written
+            if not os.path.exists(output_path):
+                raise Exception(f"File was not written to {output_path}")
+
+            file_size = os.path.getsize(output_path)
+            print(f"File downloaded successfully to {output_path} (size: {file_size / (1024 * 1024):.4f} mb)")
+
+            return output_path
 
-
-
-
-
+        except self.session.exceptions.RequestException as e:
+            if hasattr(e, 'response') and e.response is not None:
+                error_detail = e.response.text
+                raise Exception(f"Error getting signed URL: {e}. Details: {error_detail}")
+            raise Exception(f"Error in download process: {e}")
+        except IOError as e:
+            raise Exception(f"Error writing file to {output_path}: {e}")
+        except Exception as e:
+            # Clean up partial file if it exists
+            if os.path.exists(output_path):
+                try:
+                    os.remove(output_path)
+                except:
+                    pass
+            raise
 
 
 
@@ -152,53 +205,7 @@ class MassStats:
             timeout=self.timeout
         )
         return response.json()
-
-
-    # def construct_download_url(
-    #     self,
-    #     name: str,
-    #     output: str,
-    #     region: Optional[str] = None,
-    # ) -> Dict[str, Any]:
-    #     """
-    #     Request a signed download URL for a file.
-
-    #     Args:
-    #         name: job name
-    #         file_type: Type of file to download (e.g., "output", "manifest", "log")
-    #         region: Region where the file is stored
-
-    #     Returns:
-    #         Dict containing download_url and file metadata
-    #     """
-    #     url = f"{self.base_url}/mass_stats/download"
-
-    #     data = {
-    #         "name": name,
-    #         "output": output
-    #     }
-
-    #     if region is not None:
-    #         data["region"] = region
-
-    #     response = self.session.post(
-    #         url,
-    #         json=data,
-    #         verify=self.verify,
-    #         timeout=self.timeout
-    #     )
-
-    #     return response.json()
 
-    # def testdownload(
-    #     self,
-    #     name: str,
-    #     region: str,
-    #     output: str,
-    #     ):
-    #     upload_result = self.construct_download_url(name, region, output)
-    #     return upload_result
 
 
 
@@ -286,7 +293,7 @@ class MassStats:
         if uid is not None:
            url += f"&uid={uid}"
         response = self.session.get(url, verify=self.verify, timeout=self.timeout)
-        print("response text is ", response.text)
+        #print("response text is ", response.text)
        return response.json()
 
     def track_job(self, ids: Optional[list] = None) -> Dict[str, Any]:
@@ -491,6 +498,7 @@ class MassStats:
 
 
 
+
 
 
 
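download_file implements a common signed-URL pattern: POST to the API to obtain a short-lived URL, then stream the payload to disk in chunks. A self-contained sketch of the same pattern (the endpoint and JSON field names follow the diff; the helper itself and its arguments are illustrative):

    # Standalone sketch of the signed-URL download flow used by download_file.
    import requests
    from pathlib import Path

    def download_via_signed_url(session: requests.Session, api_base: str,
                                job_name: str, bucket: str, file_name: str,
                                output_path: str) -> str:
        # Step 1: ask the API for a short-lived signed URL.
        resp = session.post(f"{api_base}/mass_stats/download_files",
                            json={"job_name": job_name, "bucket": bucket,
                                  "file_name": file_name})
        resp.raise_for_status()
        signed_url = resp.json()["download_url"]

        # Step 2: stream the payload to disk in 8 KiB chunks.
        Path(output_path).parent.mkdir(parents=True, exist_ok=True)
        with session.get(signed_url, stream=True) as dl:
            dl.raise_for_status()
            with open(output_path, "wb") as fh:
                for chunk in dl.iter_content(chunk_size=8192):
                    if chunk:
                        fh.write(chunk)
        return output_path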
{terrakio_core-0.3.1 → terrakio_core-0.3.2}/terrakio_core.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: terrakio-core
-Version: 0.3.1
+Version: 0.3.2
 Summary: Core components for Terrakio API clients
 Author-email: Yupeng Chao <yupeng@haizea.com.au>
 Project-URL: Homepage, https://github.com/HaizeaAnalytics/terrakio-python-api
@@ -22,6 +22,7 @@ Requires-Dist: xarray>=2023.1.0
 Requires-Dist: shapely>=2.0.0
 Requires-Dist: geopandas>=0.13.0
 Requires-Dist: google-cloud-storage>=2.0.0
+Requires-Dist: nest_asyncio
 
 # Terrakio Core
 
All remaining files listed above (+0 -0) are unchanged between 0.3.1 and 0.3.2.