terrakio-core 0.4.2__tar.gz → 0.4.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of terrakio-core might be problematic.
- {terrakio_core-0.4.2 → terrakio_core-0.4.3}/PKG-INFO +1 -1
- {terrakio_core-0.4.2 → terrakio_core-0.4.3}/pyproject.toml +1 -1
- {terrakio_core-0.4.2 → terrakio_core-0.4.3}/terrakio_core/__init__.py +1 -1
- {terrakio_core-0.4.2 → terrakio_core-0.4.3}/terrakio_core/async_client.py +1 -1
- {terrakio_core-0.4.2 → terrakio_core-0.4.3}/terrakio_core/endpoints/mass_stats.py +7 -7
- {terrakio_core-0.4.2 → terrakio_core-0.4.3}/terrakio_core/endpoints/model_management.py +486 -11
- {terrakio_core-0.4.2 → terrakio_core-0.4.3}/terrakio_core.egg-info/PKG-INFO +1 -1
- {terrakio_core-0.4.2 → terrakio_core-0.4.3}/README.md +0 -0
- {terrakio_core-0.4.2 → terrakio_core-0.4.3}/setup.cfg +0 -0
- {terrakio_core-0.4.2 → terrakio_core-0.4.3}/terrakio_core/client.py +0 -0
- {terrakio_core-0.4.2 → terrakio_core-0.4.3}/terrakio_core/config.py +0 -0
- {terrakio_core-0.4.2 → terrakio_core-0.4.3}/terrakio_core/convenience_functions/convenience_functions.py +0 -0
- {terrakio_core-0.4.2 → terrakio_core-0.4.3}/terrakio_core/endpoints/auth.py +0 -0
- {terrakio_core-0.4.2 → terrakio_core-0.4.3}/terrakio_core/endpoints/dataset_management.py +0 -0
- {terrakio_core-0.4.2 → terrakio_core-0.4.3}/terrakio_core/endpoints/group_management.py +0 -0
- {terrakio_core-0.4.2 → terrakio_core-0.4.3}/terrakio_core/endpoints/space_management.py +0 -0
- {terrakio_core-0.4.2 → terrakio_core-0.4.3}/terrakio_core/endpoints/user_management.py +0 -0
- {terrakio_core-0.4.2 → terrakio_core-0.4.3}/terrakio_core/exceptions.py +0 -0
- {terrakio_core-0.4.2 → terrakio_core-0.4.3}/terrakio_core/helper/bounded_taskgroup.py +0 -0
- {terrakio_core-0.4.2 → terrakio_core-0.4.3}/terrakio_core/helper/decorators.py +0 -0
- {terrakio_core-0.4.2 → terrakio_core-0.4.3}/terrakio_core/helper/tiles.py +0 -0
- {terrakio_core-0.4.2 → terrakio_core-0.4.3}/terrakio_core/sync_client.py +0 -0
- {terrakio_core-0.4.2 → terrakio_core-0.4.3}/terrakio_core.egg-info/SOURCES.txt +0 -0
- {terrakio_core-0.4.2 → terrakio_core-0.4.3}/terrakio_core.egg-info/dependency_links.txt +0 -0
- {terrakio_core-0.4.2 → terrakio_core-0.4.3}/terrakio_core.egg-info/requires.txt +0 -0
- {terrakio_core-0.4.2 → terrakio_core-0.4.3}/terrakio_core.egg-info/top_level.txt +0 -0
@@ -47,7 +47,7 @@ class AsyncClient(BaseClient):
         return await self._make_request_with_retry(self._session, method, endpoint, **kwargs)

     async def _make_request_with_retry(self, session: aiohttp.ClientSession, method: str, endpoint: str, **kwargs) -> Dict[Any, Any]:
-        url = f"{self.url}/{endpoint.lstrip('/')}"
+        url = f"{self.url}/{endpoint.lstrip('/')}"
         last_exception = None

         for attempt in range(self.retry + 1):
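For orientation, a minimal sketch of the retry pattern that _make_request_with_retry appears to implement, based only on the context lines above; the backoff, exception handling and default retry count are assumptions, not taken from the package:

import asyncio
import aiohttp

async def make_request_with_retry(session: aiohttp.ClientSession, url: str, method: str = "GET", retry: int = 3, **kwargs) -> dict:
    # Try the request up to retry + 1 times, remembering the last failure.
    last_exception = None
    for attempt in range(retry + 1):
        try:
            async with session.request(method, url, **kwargs) as resp:
                resp.raise_for_status()
                return await resp.json()
        except aiohttp.ClientError as exc:
            last_exception = exc
            await asyncio.sleep(2 ** attempt)  # simple exponential backoff (assumed, not from the diff)
    raise last_exception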
@@ -67,7 +67,7 @@ class MassStats:


     @require_api_key
-    def start_job(self, id: str) -> Dict[str, Any]:
+    async def start_job(self, id: str) -> Dict[str, Any]:
         """
         Start a mass stats job by task ID.

@@ -78,7 +78,7 @@ class MassStats:
             API response as a dictionary

         """
-        return self._client._terrakio_request("POST", f"mass_stats/start/{id}")
+        return await self._client._terrakio_request("POST", f"mass_stats/start/{id}")

     @require_api_key
     def get_task_id(self, name: str, stage: str, uid: Optional[str] = None) -> Dict[str, Any]:
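Because start_job is now a coroutine in 0.4.3, callers must await it. A hypothetical usage sketch; the AsyncClient constructor arguments and the mass_stats attribute name are assumed from context, not verified against the package:

import asyncio
from terrakio_core.async_client import AsyncClient  # module path taken from the file list above

async def main():
    client = AsyncClient()  # auth/config arguments omitted; see the package documentation
    response = await client.mass_stats.start_job("my-task-id")
    print(response)

asyncio.run(main())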
@@ -542,7 +542,7 @@ class MassStats:
         return self._client._terrakio_request("POST", "mass_stats/cancel")

     @require_api_key
-    def random_sample(
+    async def random_sample(
         self,
         name: str,
         config: dict,
@@ -556,7 +556,7 @@ class MassStats:
         year_range: list[int] = None,
         overwrite: bool = False,
         server: str = None,
-        bucket: str = None
+        bucket: str = None
     ) -> Dict[str, Any]:
         """
         Submit a random sample job.
@@ -591,18 +591,18 @@ class MassStats:
             "tile_size": tile_size,
             "res": res,
             "output": output,
-            "region": region,
             "overwrite": str(overwrite).lower(),
         }
         payload_mapping = {
             "year_range": year_range,
             "server": server,
-            "
+            "region": region,
+            "bucket": bucket,
         }
         for key, value in payload_mapping.items():
             if value is not None:
                 payload[key] = value
-        return self._client._terrakio_request("POST", "random_sample", json=payload)
+        return await self._client._terrakio_request("POST", "random_sample", json=payload)


     @require_api_key
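The hunk above moves region and bucket into payload_mapping, so they are sent only when set rather than always. A standalone sketch of that optional-field pattern; the function and field names here are illustrative, not part of the package:

def build_payload(name: str, region: str = None, bucket: str = None, year_range: list = None) -> dict:
    # Mandatory fields are always present; optional ones are added only when not None,
    # mirroring the payload / payload_mapping split in random_sample above.
    payload = {"name": name}
    optional = {"region": region, "bucket": bucket, "year_range": year_range}
    for key, value in optional.items():
        if value is not None:
            payload[key] = value
    return payload

print(build_payload("demo", bucket="my-bucket"))  # {'name': 'demo', 'bucket': 'my-bucket'}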
@@ -37,7 +37,7 @@ class ModelManagement:
         self._client = client

     @require_api_key
-    def generate_ai_dataset(
+    async def generate_ai_dataset(
         self,
         name: str,
         aoi_geojson: str,
@@ -51,7 +51,8 @@ class ModelManagement:
         filter_y: str = "skip",
         crs: str = "epsg:4326",
         res: float = 0.001,
-        region: str =
+        region: str = None,
+        bucket: str = None,
         start_year: int = None,
         end_year: int = None,
     ) -> dict:
@@ -71,7 +72,8 @@ class ModelManagement:
             tile_size (int): Size of tiles in degrees
             crs (str, optional): Coordinate reference system. Defaults to "epsg:4326"
             res (float, optional): Resolution in degrees. Defaults to 0.001
-            region (str, optional): Region code. Defaults to
+            region (str, optional): Region code. Defaults to None
+            bucket (str, optional): Bucket name. Defaults to None
             start_year (int, optional): Start year for data generation. Required if end_year provided
             end_year (int, optional): End year for data generation. Required if start_year provided

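With region and bucket now defaulting to None, a call to generate_ai_dataset might look like the following hypothetical sketch (inside an async function); the client attribute name and values are illustrative, and other required parameters from the full signature are omitted:

result = await client.model_management.generate_ai_dataset(
    name="my_training_set",
    aoi_geojson="aoi.geojson",
    region=None,            # optional in 0.4.3, defaults to None
    bucket="my-bucket",     # new optional parameter in 0.4.3
    start_year=2020,
    end_year=2022,
)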
@@ -109,7 +111,7 @@ class ModelManagement:
         with open(aoi_geojson, 'r') as f:
             aoi_data = json.load(f)

-        task_response = self._client.mass_stats.random_sample(
+        task_response = await self._client.mass_stats.random_sample(
             name=name,
             config=config,
             aoi=aoi_data,
@@ -121,14 +123,14 @@ class ModelManagement:
             region=region,
             output="netcdf",
             server=self._client.url,
-            bucket=
+            bucket=bucket,
             overwrite=True
         )
         task_id = task_response["task_id"]

         # Wait for job completion with progress bar
         while True:
-            result = self._client.
+            result = await self._client.mass_stats.track_job(ids=[task_id])
             status = result[task_id]['status']
             completed = result[task_id].get('completed', 0)
             total = result[task_id].get('total', 1)
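The loop above polls mass_stats.track_job until the job finishes (the package itself sleeps with time.sleep(5) between polls). A self-contained sketch of the same polling pattern using asyncio.sleep; the terminal status strings are assumptions:

import asyncio

async def wait_for_job(client, task_id: str, poll_seconds: int = 5) -> None:
    # Poll the mass-stats tracker until the job reports completion.
    # The result structure ({task_id: {"status", "completed", "total"}}) follows the diff above.
    while True:
        result = await client.mass_stats.track_job(ids=[task_id])
        status = result[task_id]["status"]
        completed = result[task_id].get("completed", 0)
        total = result[task_id].get("total", 1)
        print(f"{status}: {completed}/{total}")
        if status.lower() in {"completed", "failed", "error"}:  # assumed terminal states
            break
        await asyncio.sleep(poll_seconds)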
@@ -153,9 +155,53 @@ class ModelManagement:
             time.sleep(5)

         # after all the random sample jobs are done, we then start the mass stats job
-        task_id = self._client.mass_stats.start_mass_stats_job(task_id)
+        # task_id = self._client.mass_stats.start_mass_stats_job(task_id)
+        task_id = await self._client.mass_stats.start_job(task_id)
         return task_id
+        # the folder that is being created is not under the jobs folder, its directly under the UID folder

+    # @require_api_key
+    # async def upload_model(self, model, model_name: str, input_shape: Tuple[int, ...] = None):
+    # """
+    # Upload a model to the bucket so that it can be used for inference.
+    # Converts PyTorch and scikit-learn models to ONNX format before uploading.
+
+    # Args:
+    # model: The model object (PyTorch model or scikit-learn model)
+    # model_name: Name for the model (without extension)
+    # input_shape: Shape of input data for ONNX conversion (e.g., (1, 10) for batch_size=1, features=10)
+    # Required for PyTorch models, optional for scikit-learn models
+
+    # Raises:
+    # APIError: If the API request fails
+    # ValueError: If model type is not supported or input_shape is missing for PyTorch models
+    # ImportError: If required libraries (torch or skl2onnx) are not installed
+    # """
+    # uid = (await self._client.auth.get_user_info())["uid"]
+    # # above line is getting the uid,
+
+    # client = storage.Client()
+    # bucket = client.get_bucket('terrakio-mass-requests')
+
+    # # Convert model to ONNX format
+    # onnx_bytes = self._convert_model_to_onnx(model, model_name, input_shape)
+
+    # # Upload ONNX model to bucket
+    # # blob = bucket.blob(f'{uid}/{model_name}/models/{model_name}.onnx')
+    # # we don't need to upload the model to the bucket
+    # # so the stuff is stored under the virtual datasets folder
+    # # the model name and the virtual dataset name should be the same
+    # virtual_dataset_name = model_name
+    # blob = bucket.blob(f'{uid}/virtual_datasets/{virtual_dataset_name}/{model_name}.onnx')
+    # # wer are uploading the model to the virtual dataset folder
+
+    # blob.upload_from_string(onnx_bytes, content_type='application/octet-stream')
+
+    # self._client.logger.info(f"Model uploaded successfully to {uid}/virtual_datasets/{virtual_dataset_name}/{model_name}.onnx")
+
+    # this is the upload model function, I think we need to upload to the user, under the virutal_datasets folder, and create the virtual dataset
+
+
     @require_api_key
     async def upload_model(self, model, model_name: str, input_shape: Tuple[int, ...] = None):
         """
@@ -351,6 +397,33 @@ class ModelManagement:
             raise ValueError(f"Failed to convert scikit-learn model {model_name} to ONNX: {str(e)}")


+    # we do not need to pass in both the model name and the dataset name, since the model name should the same as the virtual dataset name
+    # but we are gonna have multiple products for the same virtual dataset
+    # @require_api_key
+    # async def upload_and_deploy_cnn_model(self, model, dataset: str, product: str, input_expression: str, dates_iso8601: list, input_shape: Tuple[int, ...] = None, processing_script_path: Optional[str] = None):
+    # """
+    # Upload a CNN model to the bucket and deploy it.
+
+    # Args:
+    # model: The model object (PyTorch model or scikit-learn model)
+    # model_name: Name for the model (without extension)
+    # dataset: Name of the dataset to create
+    # product: Product name for the inference
+    # input_expression: Input expression for the dataset
+    # dates_iso8601: List of dates in ISO8601 format
+    # input_shape: Shape of input data for ONNX conversion (required for PyTorch models)
+    # processing_script_path: Path to the processing script, if not provided, no processing will be done
+
+    # Raises:
+    # APIError: If the API request fails
+    # ValueError: If model type is not supported or input_shape is missing for PyTorch models
+    # ImportError: If required libraries (torch or skl2onnx) are not installed
+    # """
+    # await self.upload_model(model=model, model_name=dataset, input_shape=input_shape)
+    # # so the uploading process is kinda similar, but the deployment step is kinda different
+    # # we should pass the processing script path to the deploy cnn model function
+    # await self.deploy_cnn_model(dataset=dataset, product=product, model_name=model_name, input_expression=input_expression, model_training_job_name=model_name, dates_iso8601=dates_iso8601, processing_script_path=processing_script_path)
+
     @require_api_key
     async def upload_and_deploy_cnn_model(self, model, model_name: str, dataset: str, product: str, input_expression: str, dates_iso8601: list, input_shape: Tuple[int, ...] = None, processing_script_path: Optional[str] = None):
         """
@@ -376,6 +449,7 @@ class ModelManagement:
         # we should pass the processing script path to the deploy cnn model function
         await self.deploy_cnn_model(dataset=dataset, product=product, model_name=model_name, input_expression=input_expression, model_training_job_name=model_name, dates_iso8601=dates_iso8601, processing_script_path=processing_script_path)

+
     @require_api_key
     async def upload_and_deploy_model(self, model, model_name: str, dataset: str, product: str, input_expression: str, dates_iso8601: list, input_shape: Tuple[int, ...] = None):
         """
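The active (uncommented) upload_and_deploy_cnn_model shown above takes both model_name and dataset. A hypothetical call, inside an async function, with placeholder values only:

await client.model_management.upload_and_deploy_cnn_model(
    model=trained_model,              # PyTorch or scikit-learn model object (placeholder)
    model_name="landcover_cnn",
    dataset="landcover",
    product="classification",
    input_expression="<expression>",  # placeholder, see the package docs for the expected syntax
    dates_iso8601=["2023-01-01"],
    input_shape=(1, 12, 256, 256),    # required for PyTorch models
    processing_script_path=None,
)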
@@ -617,7 +691,7 @@ class ModelManagement:
             clean_preprocessing = preprocessing_code
             # Then add consistent 8-space indentation to match the template
             preprocessing_section = f"""{textwrap.indent(clean_preprocessing, '')}""" # 8 spaces
-            print(preprocessing_section)
+            # print(preprocessing_section)
         script_content = self.generate_cnn_script(model_name, product, model_training_job_name, uid, preprocessing_code, postprocessing_code)
         script_name = f"{product}.py"
         self._upload_script_to_bucket(script_content, script_name, model_training_job_name, uid)
@@ -748,6 +822,245 @@ class ModelManagement:
             return result
         ''').strip()

+    # @require_api_key
+    # def generate_cnn_script(self, model_name: str, product: str, model_training_job_name: str, uid: str, preprocessing_code: Optional[str] = None, postprocessing_code: Optional[str] = None) -> str:
+    # """
+    # Generate Python inference script for CNN model with time-stacked bands.
+
+    # Args:
+    # model_name: Name of the model
+    # product: Product name
+    # model_training_job_name: Training job name
+    # uid: User ID
+    # preprocessing_code: Preprocessing code
+    # postprocessing_code: Postprocessing code
+    # Returns:
+    # str: Generated Python script content
+    # """
+    # import textwrap
+
+    # # Build preprocessing section with CONSISTENT 4-space indentation
+    # preprocessing_section = ""
+    # if preprocessing_code and preprocessing_code.strip():
+    # clean_preprocessing = textwrap.dedent(preprocessing_code)
+    # preprocessing_section = textwrap.indent(clean_preprocessing, ' ')
+
+    # # Build postprocessing section with CONSISTENT 4-space indentation
+    # postprocessing_section = ""
+    # if postprocessing_code and postprocessing_code.strip():
+    # clean_postprocessing = textwrap.dedent(postprocessing_code)
+    # postprocessing_section = textwrap.indent(clean_postprocessing, ' ')
+
+    # # Build the template WITHOUT dedenting the whole thing, so indentation is preserved
+    # script_lines = [
+    # "import logging",
+    # "from io import BytesIO",
+    # "import numpy as np",
+    # "import pandas as pd",
+    # "import xarray as xr",
+    # "from google.cloud import storage",
+    # "from onnxruntime import InferenceSession",
+    # "from typing import Tuple",
+    # "",
+    # "logging.basicConfig(",
+    # " level=logging.INFO",
+    # ")",
+    # "",
+    # ]
+
+    # # Add preprocessing function definition BEFORE the main function
+    # if preprocessing_section:
+    # script_lines.extend([
+    # "def preprocessing(array: Tuple[xr.DataArray, ...]) -> Tuple[xr.DataArray, ...]:",
+    # preprocessing_section,
+    # "",
+    # ])
+
+    # # Add postprocessing function definition BEFORE the main function
+    # if postprocessing_section:
+    # script_lines.extend([
+    # "def postprocessing(array: xr.DataArray) -> xr.DataArray:",
+    # postprocessing_section,
+    # "",
+    # ])
+
+    # # Add the get_model function
+    # script_lines.extend([
+    # "def get_model():",
+    # f" logging.info(\"Loading CNN model for {model_name}...\")",
+    # "",
+    # " client = storage.Client()",
+    # " bucket = client.get_bucket('terrakio-mass-requests')",
+    # f" blob = bucket.blob('{uid}/{model_training_job_name}/models/{model_name}.onnx')",
+    # "",
+    # " model = BytesIO()",
+    # " blob.download_to_file(model)",
+    # " model.seek(0)",
+    # "",
+    # " session = InferenceSession(model.read(), providers=[\"CPUExecutionProvider\"])",
+    # " return session",
+    # "",
+    # f"def {product}(*bands, model):",
+    # " logging.info(\"Start preparing CNN data with time-stacked bands\")",
+    # " data_arrays = list(bands)",
+    # " ",
+    # " if not data_arrays:",
+    # " raise ValueError(\"No bands provided\")",
+    # " ",
+    # ])
+
+    # # Add preprocessing call if preprocessing exists
+    # if preprocessing_section:
+    # script_lines.extend([
+    # " # Apply preprocessing",
+    # " data_arrays = preprocessing(tuple(data_arrays))",
+    # " data_arrays = list(data_arrays) # Convert back to list for processing",
+    # " ",
+    # ])
+
+    # # Continue with the rest of the processing logic
+    # script_lines.extend([
+    # " reference_array = data_arrays[0]",
+    # " original_shape = reference_array.shape",
+    # " logging.info(f\"Original shape: {original_shape}\")",
+    # " ",
+    # " # Get time coordinates - all bands should have the same time dimension",
+    # " if 'time' not in reference_array.dims:",
+    # " raise ValueError(\"Time dimension is required for CNN processing\")",
+    # " ",
+    # " time_coords = reference_array.coords['time']",
+    # " num_timestamps = len(time_coords)",
+    # " logging.info(f\"Number of timestamps: {num_timestamps}\")",
+    # " ",
+    # " # Get spatial dimensions",
+    # " spatial_dims = [dim for dim in reference_array.dims if dim != 'time']",
+    # " height = reference_array.sizes[spatial_dims[0]] # assuming first spatial dim is height",
+    # " width = reference_array.sizes[spatial_dims[1]] # assuming second spatial dim is width",
+    # " logging.info(f\"Spatial dimensions: {height} x {width}\")",
+    # " ",
+    # " # Stack bands across time dimension",
+    # " # Result will be: (num_bands * num_timestamps, height, width)",
+    # " stacked_channels = []",
+    # " ",
+    # " for band_idx, data_array in enumerate(data_arrays):",
+    # " logging.info(f\"Processing band {band_idx + 1}/{len(data_arrays)}\")",
+    # " ",
+    # " # Ensure consistent time coordinates across bands",
+    # " if not np.array_equal(data_array.coords['time'].values, time_coords.values):",
+    # " logging.warning(f\"Band {band_idx} has different time coordinates, aligning...\")",
+    # " data_array = data_array.sel(time=time_coords, method='nearest')",
+    # " ",
+    # " # Extract values and ensure proper ordering (time, height, width)",
+    # " band_values = data_array.values",
+    # " if band_values.ndim == 3:",
+    # " # Reorder dimensions if needed to ensure (time, height, width)",
+    # " time_dim_idx = data_array.dims.index('time')",
+    # " if time_dim_idx != 0:",
+    # " axes_order = [time_dim_idx] + [i for i in range(len(data_array.dims)) if i != time_dim_idx]",
+    # " band_values = np.transpose(band_values, axes_order)",
+    # " ",
+    # " # Add each timestamp of this band to the channel stack",
+    # " for t in range(num_timestamps):",
+    # " stacked_channels.append(band_values[t])",
+    # " ",
+    # " # Stack all channels: (num_bands * num_timestamps, height, width)",
+    # " input_channels = np.stack(stacked_channels, axis=0)",
+    # " total_channels = len(data_arrays) * num_timestamps",
+    # " logging.info(f\"Stacked channels shape: {input_channels.shape}\")",
+    # " logging.info(f\"Total channels: {total_channels} ({len(data_arrays)} bands × {num_timestamps} timestamps)\")",
+    # " ",
+    # " # Add batch dimension: (1, num_channels, height, width)",
+    # " input_data = np.expand_dims(input_channels, axis=0).astype(np.float32)",
+    # " logging.info(f\"Final input shape for CNN: {input_data.shape}\")",
+    # " ",
+    # " # Run inference",
+    # " output = model.run(None, {\"float_input\": input_data})[0]",
+    # " logging.info(f\"Model output shape: {output.shape}\")",
+    # " ",
+    # " # UPDATED: Handle multi-class CNN output properly",
+    # " if output.ndim == 4:",
+    # " if output.shape[1] == 1:",
+    # " # Single class output (regression or binary classification)",
+    # " output_2d = output[0, 0]",
+    # " logging.info(\"Single channel output detected\")",
+    # " else:",
+    # " # Multi-class output - convert logits/probabilities to class predictions",
+    # " output_classes = np.argmax(output, axis=1) # Shape: (1, height, width)",
+    # " output_2d = output_classes[0] # Shape: (height, width)",
+    # " ",
+    # " # Apply class merging: merge class 6 into class 3",
+    # " output_2d = np.where(output_2d == 6, 3, output_2d)",
+    # " ",
+    # " logging.info(f\"Multi-class output processed. Original classes: {output.shape[1]}\")",
+    # " logging.info(f\"Unique classes in output: {np.unique(output_2d)}\")",
+    # " logging.info(f\"Class distribution: {np.bincount(output_2d.flatten())}\")",
+    # " elif output.ndim == 3:",
+    # " # Remove batch dimension",
+    # " output_2d = output[0]",
+    # " logging.info(\"3D output detected, removed batch dimension\")",
+    # " else:",
+    # " # Handle other cases",
+    # " output_2d = np.squeeze(output)",
+    # " if output_2d.ndim != 2:",
+    # " logging.error(f\"Cannot process output shape: {output.shape}\")",
+    # " logging.error(f\"After squeeze: {output_2d.shape}\")",
+    # " raise ValueError(f\"Unexpected output shape after processing: {output_2d.shape}\")",
+    # " logging.info(\"Applied squeeze to output\")",
+    # " ",
+    # " # Ensure output is 2D",
+    # " if output_2d.ndim != 2:",
+    # " raise ValueError(f\"Final output must be 2D, got shape: {output_2d.shape}\")",
+    # " ",
+    # " # Determine output timestamp (use the latest timestamp)",
+    # " output_timestamp = time_coords[-1]",
+    # " ",
+    # " # Get spatial coordinates from reference array",
+    # " spatial_coords = {dim: reference_array.coords[dim] for dim in spatial_dims}",
+    # " ",
+    # " # Create output DataArray with appropriate data type",
+    # " # Use int32 for classification, float32 for regression",
+    # " is_multiclass = output.ndim == 4 and output.shape[1] > 1",
+    # " if is_multiclass:",
+    # " # Multi-class classification - use integer type",
+    # " output_dtype = np.int32",
+    # " output_type = 'classification'",
+    # " else:",
+    # " # Single output - use float type",
+    # " output_dtype = np.float32",
+    # " output_type = 'regression'",
+    # " ",
+    # " result = xr.DataArray(",
+    # " data=np.expand_dims(output_2d.astype(output_dtype), axis=0),",
+    # " dims=['time'] + spatial_dims,",
+    # " coords={",
+    # " 'time': [output_timestamp.values],",
+    # " spatial_dims[0]: spatial_coords[spatial_dims[0]].values,",
+    # " spatial_dims[1]: spatial_coords[spatial_dims[1]].values",
+    # " },",
+    # " attrs={",
+    # " 'description': 'CNN model prediction',",
+    # " }",
+    # " )",
+    # " ",
+    # " logging.info(f\"Final result shape: {result.shape}\")",
+    # " logging.info(f\"Final result data type: {result.dtype}\")",
+    # " logging.info(f\"Final result value range: {result.values.min()} to {result.values.max()}\")",
+    # ])
+
+    # # Add postprocessing call if postprocessing exists
+    # if postprocessing_section:
+    # script_lines.extend([
+    # " # Apply postprocessing",
+    # " result = postprocessing(result)",
+    # " ",
+    # ])
+
+    # # Single return statement at the end
+    # script_lines.append(" return result")
+
+    # return "\n".join(script_lines)
+
+
     @require_api_key
     def generate_cnn_script(self, model_name: str, product: str, model_training_job_name: str, uid: str, preprocessing_code: Optional[str] = None, postprocessing_code: Optional[str] = None) -> str:
         """
@@ -794,6 +1107,160 @@ class ModelManagement:
             "",
         ]

+        # Add preprocessing validation function if preprocessing exists
+        if preprocessing_section:
+            script_lines.extend([
+                "def validate_preprocessing_output(data_arrays):",
+                " \"\"\"",
+                " Validate preprocessing output coordinates and data type.",
+                " ",
+                " Args:",
+                " data_arrays: List of xarray DataArrays from preprocessing",
+                " ",
+                " Returns:",
+                " str: Validation signature symbol",
+                " ",
+                " Raises:",
+                " ValueError: If validation fails",
+                " \"\"\"",
+                " import numpy as np",
+                " ",
+                " logging.info(\"=\" * 60)",
+                " logging.info(\"VALIDATING PREPROCESSING OUTPUT\")",
+                " logging.info(\"=\" * 60)",
+                " ",
+                " if not data_arrays:",
+                " raise ValueError(\"No data arrays provided from preprocessing\")",
+                " ",
+                " reference_shape = None",
+                " ",
+                " for i, data_array in enumerate(data_arrays):",
+                " logging.info(f\"Validating channel {i+1}/{len(data_arrays)}: {data_array.name}\")",
+                " ",
+                " # Check if it's an xarray DataArray",
+                " if not hasattr(data_array, 'dims') or not hasattr(data_array, 'coords'):",
+                " raise ValueError(f\"Channel {i+1} is not a valid xarray DataArray\")",
+                " ",
+                " # Check coordinates",
+                " if 'time' not in data_array.coords:",
+                " raise ValueError(f\"Channel {i+1} missing time coordinate\")",
+                " ",
+                " spatial_dims = [dim for dim in data_array.dims if dim != 'time']",
+                " if len(spatial_dims) != 2:",
+                " raise ValueError(f\"Channel {i+1} must have exactly 2 spatial dimensions, got {spatial_dims}\")",
+                " ",
+                " for dim in spatial_dims:",
+                " if dim not in data_array.coords:",
+                " raise ValueError(f\"Channel {i+1} missing coordinate: {dim}\")",
+                " ",
+                " logging.info(f\" Coordinates: {list(data_array.coords.keys())}\")",
+                " ",
+                " # Check data type",
+                " data_values = data_array.values",
+                " logging.info(f\" Data type: {data_values.dtype}\")",
+                " ",
+                " # Check shape consistency",
+                " shape = data_array.shape",
+                " if reference_shape is None:",
+                " reference_shape = shape",
+                " else:",
+                " if shape != reference_shape:",
+                " raise ValueError(f\"Channel {i+1} shape {shape} doesn't match reference {reference_shape}\")",
+                " ",
+                " logging.info(f\" Shape: {shape}\")",
+                " ",
+                " # Generate validation signature",
+                " signature_components = [",
+                " f\"CH{len(data_arrays)}\", # Channel count",
+                " f\"T{reference_shape[0]}\", # Time dimension",
+                " f\"S{reference_shape[1]}x{reference_shape[2]}\", # Spatial dimensions",
+                " f\"DT{data_arrays[0].values.dtype}\", # Data type",
+                " ]",
+                " ",
+                " signature = \"★PRE_\" + \"_\".join(signature_components) + \"★\"",
+                " ",
+                " logging.info(\"-\" * 60)",
+                " logging.info(\"PREPROCESSING VALIDATION SUMMARY\")",
+                " logging.info(\"-\" * 60)",
+                " logging.info(f\"Channels validated: {len(data_arrays)}\")",
+                " logging.info(f\"Common shape: {reference_shape}\")",
+                " logging.info(f\"Validation signature: {signature}\")",
+                " logging.info(\"=\" * 60)",
+                " ",
+                " return signature",
+                "",
+            ])
+
+        # Add postprocessing validation function if postprocessing exists
+        if postprocessing_section:
+            script_lines.extend([
+                "def validate_postprocessing_output(result_array):",
+                " \"\"\"",
+                " Validate postprocessing output coordinates and data type.",
+                " ",
+                " Args:",
+                " result_array: xarray DataArray from postprocessing",
+                " ",
+                " Returns:",
+                " str: Validation signature symbol",
+                " ",
+                " Raises:",
+                " ValueError: If validation fails",
+                " \"\"\"",
+                " import numpy as np",
+                " ",
+                " logging.info(\"=\" * 60)",
+                " logging.info(\"VALIDATING POSTPROCESSING OUTPUT\")",
+                " logging.info(\"=\" * 60)",
+                " ",
+                " # Check if it's an xarray DataArray",
+                " if not hasattr(result_array, 'dims') or not hasattr(result_array, 'coords'):",
+                " raise ValueError(\"Postprocessing output is not a valid xarray DataArray\")",
+                " ",
+                " # Check required coordinates",
+                " if 'time' not in result_array.coords:",
+                " raise ValueError(\"Missing time coordinate\")",
+                " ",
+                " spatial_dims = [dim for dim in result_array.dims if dim != 'time']",
+                " if len(spatial_dims) != 2:",
+                " raise ValueError(f\"Expected 2 spatial dimensions, got {len(spatial_dims)}: {spatial_dims}\")",
+                " ",
+                " for dim in spatial_dims:",
+                " if dim not in result_array.coords:",
+                " raise ValueError(f\"Missing spatial coordinate: {dim}\")",
+                " ",
+                " logging.info(f\"Coordinates found: {list(result_array.coords.keys())}\")",
+                " ",
+                " # Check data type",
+                " data_values = result_array.values",
+                " logging.info(f\"Data type: {data_values.dtype}\")",
+                " ",
+                " # Check shape",
+                " shape = result_array.shape",
+                " logging.info(f\"Shape: {shape}\")",
+                " ",
+                " # Generate validation signature",
+                " signature_components = [",
+                " f\"T{shape[0]}\", # Time dimension",
+                " f\"S{shape[1]}x{shape[2]}\", # Spatial dimensions",
+                " f\"DT{data_values.dtype}\", # Data type",
+                " ]",
+                " ",
+                " signature = \"★POST_\" + \"_\".join(signature_components) + \"★\"",
+                " ",
+                " logging.info(\"-\" * 60)",
+                " logging.info(\"POSTPROCESSING VALIDATION SUMMARY\")",
+                " logging.info(\"-\" * 60)",
+                " logging.info(f\"Final shape: {shape}\")",
+                " logging.info(f\"Final coordinates: {list(result_array.coords.keys())}\")",
+                " logging.info(f\"Data type: {data_values.dtype}\")",
+                " logging.info(f\"Validation signature: {signature}\")",
+                " logging.info(\"=\" * 60)",
+                " ",
+                " return signature",
+                "",
+            ])
+
         # Add preprocessing function definition BEFORE the main function
         if preprocessing_section:
             script_lines.extend([
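The generated validate_preprocessing_output builds a signature of the form ★PRE_CH<channels>_T<time>_S<height>x<width>_DT<dtype>★. A small standalone sketch that reproduces just that signature format for illustration; it is not part of the generated script:

import numpy as np
import xarray as xr

def preprocessing_signature(arrays) -> str:
    # Rebuilds the signature format emitted by the generated validator:
    # ★PRE_CH<channels>_T<time>_S<height>x<width>_DT<dtype>★
    shape = arrays[0].shape
    parts = [
        f"CH{len(arrays)}",
        f"T{shape[0]}",
        f"S{shape[1]}x{shape[2]}",
        f"DT{arrays[0].values.dtype}",
    ]
    return "★PRE_" + "_".join(parts) + "★"

da = xr.DataArray(np.zeros((3, 64, 64), dtype=np.float32), dims=("time", "y", "x"))
print(preprocessing_signature([da, da]))  # ★PRE_CH2_T3_S64x64_DTfloat32★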
@@ -835,13 +1302,17 @@ class ModelManagement:
             " ",
         ])

-        # Add preprocessing call if preprocessing exists
+        # Add preprocessing call and validation if preprocessing exists
         if preprocessing_section:
             script_lines.extend([
                 " # Apply preprocessing",
                 " data_arrays = preprocessing(tuple(data_arrays))",
                 " data_arrays = list(data_arrays) # Convert back to list for processing",
                 " ",
+                " # Validate preprocessing output",
+                " preprocessing_signature = validate_preprocessing_output(data_arrays)",
+                " logging.info(f\"Preprocessing validation signature: {preprocessing_signature}\")",
+                " ",
             ])

         # Continue with the rest of the processing logic
@@ -973,19 +1444,23 @@ class ModelManagement:
             " logging.info(f\"Final result value range: {result.values.min()} to {result.values.max()}\")",
         ])

-        # Add postprocessing call if postprocessing exists
+        # Add postprocessing call and validation if postprocessing exists
         if postprocessing_section:
             script_lines.extend([
                 " # Apply postprocessing",
                 " result = postprocessing(result)",
                 " ",
+                " # Validate postprocessing output",
+                " postprocessing_signature = validate_postprocessing_output(result)",
+                " logging.info(f\"Postprocessing validation signature: {postprocessing_signature}\")",
+                " ",
             ])

         # Single return statement at the end
         script_lines.append(" return result")

         return "\n".join(script_lines)
-
+
     @require_api_key
     def _upload_script_to_bucket(self, script_content: str, script_name: str, model_training_job_name: str, uid: str):
         """Upload the generated script to Google Cloud Storage"""
All remaining files listed above (+0 -0) are unchanged between 0.4.2 and 0.4.3.