cosmotech-acceleration-library 1.1.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff compares the contents of two package versions that have been publicly released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
Files changed (98)
  1. cosmotech/coal/__init__.py +1 -1
  2. cosmotech/coal/aws/__init__.py +1 -9
  3. cosmotech/coal/aws/s3.py +181 -214
  4. cosmotech/coal/azure/adx/auth.py +2 -2
  5. cosmotech/coal/azure/adx/runner.py +13 -14
  6. cosmotech/coal/azure/adx/store.py +5 -86
  7. cosmotech/coal/azure/adx/tables.py +2 -2
  8. cosmotech/coal/azure/blob.py +6 -6
  9. cosmotech/coal/azure/storage.py +3 -3
  10. cosmotech/coal/cosmotech_api/__init__.py +0 -28
  11. cosmotech/coal/cosmotech_api/apis/__init__.py +14 -0
  12. cosmotech/coal/cosmotech_api/apis/dataset.py +103 -0
  13. cosmotech/coal/cosmotech_api/apis/meta.py +25 -0
  14. cosmotech/coal/cosmotech_api/apis/organization.py +24 -0
  15. cosmotech/coal/cosmotech_api/apis/run.py +38 -0
  16. cosmotech/coal/cosmotech_api/apis/runner.py +71 -0
  17. cosmotech/coal/cosmotech_api/apis/solution.py +23 -0
  18. cosmotech/coal/cosmotech_api/apis/workspace.py +108 -0
  19. cosmotech/coal/cosmotech_api/objects/__init__.py +9 -0
  20. cosmotech/coal/cosmotech_api/objects/connection.py +125 -0
  21. cosmotech/coal/cosmotech_api/objects/parameters.py +127 -0
  22. cosmotech/coal/postgresql/runner.py +56 -36
  23. cosmotech/coal/postgresql/store.py +60 -14
  24. cosmotech/coal/postgresql/utils.py +254 -0
  25. cosmotech/coal/store/output/__init__.py +0 -0
  26. cosmotech/coal/store/output/aws_channel.py +73 -0
  27. cosmotech/coal/store/output/az_storage_channel.py +42 -0
  28. cosmotech/coal/store/output/channel_interface.py +23 -0
  29. cosmotech/coal/store/output/channel_spliter.py +55 -0
  30. cosmotech/coal/store/output/postgres_channel.py +40 -0
  31. cosmotech/coal/utils/configuration.py +169 -0
  32. cosmotech/coal/utils/decorator.py +4 -7
  33. cosmotech/csm_data/commands/api/api.py +6 -19
  34. cosmotech/csm_data/commands/api/postgres_send_runner_metadata.py +20 -16
  35. cosmotech/csm_data/commands/api/run_load_data.py +7 -46
  36. cosmotech/csm_data/commands/api/wsf_load_file.py +13 -16
  37. cosmotech/csm_data/commands/api/wsf_send_file.py +11 -14
  38. cosmotech/csm_data/commands/s3_bucket_delete.py +16 -15
  39. cosmotech/csm_data/commands/s3_bucket_download.py +16 -16
  40. cosmotech/csm_data/commands/s3_bucket_upload.py +16 -14
  41. cosmotech/csm_data/commands/store/dump_to_s3.py +18 -16
  42. cosmotech/csm_data/commands/store/output.py +35 -0
  43. cosmotech/csm_data/commands/store/store.py +3 -3
  44. cosmotech/translation/coal/en-US/coal/cosmotech_api/initialization.yml +8 -0
  45. cosmotech/translation/coal/en-US/coal/services/dataset.yml +4 -14
  46. cosmotech/translation/coal/en-US/coal/store/output/data_interface.yml +1 -0
  47. cosmotech/translation/coal/en-US/coal/store/output/split.yml +6 -0
  48. cosmotech/translation/coal/en-US/coal/utils/configuration.yml +2 -0
  49. cosmotech/translation/csm_data/en-US/csm_data/commands/store/output.yml +7 -0
  50. {cosmotech_acceleration_library-1.1.0.dist-info → cosmotech_acceleration_library-2.0.0.dist-info}/METADATA +5 -8
  51. {cosmotech_acceleration_library-1.1.0.dist-info → cosmotech_acceleration_library-2.0.0.dist-info}/RECORD +55 -73
  52. cosmotech/coal/azure/functions.py +0 -72
  53. cosmotech/coal/cosmotech_api/connection.py +0 -96
  54. cosmotech/coal/cosmotech_api/dataset/__init__.py +0 -26
  55. cosmotech/coal/cosmotech_api/dataset/converters.py +0 -164
  56. cosmotech/coal/cosmotech_api/dataset/download/__init__.py +0 -19
  57. cosmotech/coal/cosmotech_api/dataset/download/adt.py +0 -119
  58. cosmotech/coal/cosmotech_api/dataset/download/common.py +0 -140
  59. cosmotech/coal/cosmotech_api/dataset/download/file.py +0 -229
  60. cosmotech/coal/cosmotech_api/dataset/download/twingraph.py +0 -185
  61. cosmotech/coal/cosmotech_api/dataset/upload.py +0 -41
  62. cosmotech/coal/cosmotech_api/dataset/utils.py +0 -132
  63. cosmotech/coal/cosmotech_api/parameters.py +0 -48
  64. cosmotech/coal/cosmotech_api/run.py +0 -25
  65. cosmotech/coal/cosmotech_api/run_data.py +0 -173
  66. cosmotech/coal/cosmotech_api/run_template.py +0 -108
  67. cosmotech/coal/cosmotech_api/runner/__init__.py +0 -28
  68. cosmotech/coal/cosmotech_api/runner/data.py +0 -38
  69. cosmotech/coal/cosmotech_api/runner/datasets.py +0 -416
  70. cosmotech/coal/cosmotech_api/runner/download.py +0 -135
  71. cosmotech/coal/cosmotech_api/runner/metadata.py +0 -42
  72. cosmotech/coal/cosmotech_api/runner/parameters.py +0 -157
  73. cosmotech/coal/cosmotech_api/twin_data_layer.py +0 -512
  74. cosmotech/coal/cosmotech_api/workspace.py +0 -127
  75. cosmotech/coal/utils/postgresql.py +0 -236
  76. cosmotech/coal/utils/semver.py +0 -6
  77. cosmotech/csm_data/commands/api/rds_load_csv.py +0 -90
  78. cosmotech/csm_data/commands/api/rds_send_csv.py +0 -74
  79. cosmotech/csm_data/commands/api/rds_send_store.py +0 -74
  80. cosmotech/csm_data/commands/api/runtemplate_load_handler.py +0 -66
  81. cosmotech/csm_data/commands/api/tdl_load_files.py +0 -76
  82. cosmotech/csm_data/commands/api/tdl_send_files.py +0 -82
  83. cosmotech/orchestrator_plugins/csm-data/templates/api/rds_load_csv.json +0 -27
  84. cosmotech/orchestrator_plugins/csm-data/templates/api/rds_send_csv.json +0 -27
  85. cosmotech/orchestrator_plugins/csm-data/templates/api/rds_send_store.json +0 -27
  86. cosmotech/orchestrator_plugins/csm-data/templates/api/runtemplate_load_handler.json +0 -27
  87. cosmotech/orchestrator_plugins/csm-data/templates/api/tdl_load_files.json +0 -32
  88. cosmotech/orchestrator_plugins/csm-data/templates/api/tdl_send_files.json +0 -27
  89. cosmotech/translation/coal/en-US/coal/cosmotech_api/run_data.yml +0 -2
  90. cosmotech/translation/csm_data/en-US/csm_data/commands/api/rds_load_csv.yml +0 -13
  91. cosmotech/translation/csm_data/en-US/csm_data/commands/api/rds_send_csv.yml +0 -12
  92. cosmotech/translation/csm_data/en-US/csm_data/commands/api/rds_send_store.yml +0 -12
  93. cosmotech/translation/csm_data/en-US/csm_data/commands/api/tdl_load_files.yml +0 -14
  94. cosmotech/translation/csm_data/en-US/csm_data/commands/api/tdl_send_files.yml +0 -18
  95. {cosmotech_acceleration_library-1.1.0.dist-info → cosmotech_acceleration_library-2.0.0.dist-info}/WHEEL +0 -0
  96. {cosmotech_acceleration_library-1.1.0.dist-info → cosmotech_acceleration_library-2.0.0.dist-info}/entry_points.txt +0 -0
  97. {cosmotech_acceleration_library-1.1.0.dist-info → cosmotech_acceleration_library-2.0.0.dist-info}/licenses/LICENSE +0 -0
  98. {cosmotech_acceleration_library-1.1.0.dist-info → cosmotech_acceleration_library-2.0.0.dist-info}/top_level.txt +0 -0
cosmotech/coal/cosmotech_api/runner/datasets.py
@@ -1,416 +0,0 @@
- # Copyright (C) - 2023 - 2025 - Cosmo Tech
- # This document and all information contained herein is the exclusive property -
- # including all intellectual property rights pertaining thereto - of Cosmo Tech.
- # Any use, reproduction, translation, broadcasting, transmission, distribution,
- # etc., to any person is prohibited unless it has been previously and
- # specifically authorized by written means by Cosmo Tech.
-
- """
- Dataset handling functions.
- """
-
- import multiprocessing
- import tempfile
- from pathlib import Path
- from typing import Dict, List, Any, Optional, Union
-
- from azure.identity import DefaultAzureCredential
- from cosmotech_api.api.dataset_api import DatasetApi
-
- from cosmotech.coal.cosmotech_api.connection import get_api_client
- from cosmotech.coal.cosmotech_api.dataset import (
-     convert_graph_dataset_to_files,
-     download_adt_dataset,
-     download_twingraph_dataset,
-     download_legacy_twingraph_dataset,
-     download_file_dataset,
- )
- from cosmotech.coal.cosmotech_api.dataset.download import file
- from cosmotech.coal.utils.logger import LOGGER
- from cosmotech.orchestrator.utils.translate import T
-
-
- def get_dataset_ids_from_runner(runner_data) -> List[str]:
-     """
-     Extract dataset IDs from runner data.
-
-     Args:
-         runner_data: Runner data object
-
-     Returns:
-         List of dataset IDs
-     """
-     dataset_ids = runner_data.dataset_list[:]
-
-     for parameter in runner_data.parameters_values:
-         if parameter.var_type == "%DATASETID%" and parameter.value:
-             dataset_id = parameter.value
-             dataset_ids.append(dataset_id)
-
-     return dataset_ids
-
-
- def download_dataset(
-     organization_id: str,
-     workspace_id: str,
-     dataset_id: str,
-     read_files: bool = True,
- ) -> Dict[str, Any]:
-     """
-     retro-compatibility to cosmo-api v4
-     """
-     from cosmotech.coal.utils.semver import semver_of
-
-     csm_version = semver_of("cosmotech_api")
-     if csm_version.major >= 5:
-         return download_dataset_v5(organization_id, workspace_id, dataset_id, read_files)
-     else:
-         return download_dataset_v4(organization_id, workspace_id, dataset_id, read_files)
-
-
- def download_dataset_v5(
-     organization_id: str,
-     workspace_id: str,
-     dataset_id: str,
-     read_files: bool = True,
- ) -> Dict[str, Any]:
-     """
-     Download a single dataset by ID.
-
-     Args:
-         organization_id: Organization ID
-         workspace_id: Workspace ID
-         dataset_id: Dataset ID
-         read_files: Whether to read file contents
-
-     Returns:
-         Dataset information dictionary
-     """
-
-     # Get dataset information
-     with get_api_client()[0] as api_client:
-         dataset_api_instance = DatasetApi(api_client)
-         dataset = dataset_api_instance.get_dataset(
-             organization_id=organization_id, workspace_id=workspace_id, dataset_id=dataset_id
-         )
-
-         content = dict()
-         tmp_dataset_dir = tempfile.mkdtemp()
-         tmp_dataset_dir_path = Path(tmp_dataset_dir)
-         for part in dataset.parts:
-             part_file_path = tmp_dataset_dir_path / part.source_name
-             part_file_path.parent.mkdir(parents=True, exist_ok=True)
-             data_part = dataset_api_instance.download_dataset_part(organization_id, workspace_id, dataset_id, part.id)
-             with open(part_file_path, "wb") as binary_file:
-                 binary_file.write(data_part)
-
-             if read_files:
-                 content.update(file.read_file(part.source_name, part_file_path))
-
-     return {
-         "type": "csm_dataset",
-         "content": content,
-         "name": dataset.name,
-         "folder_path": tmp_dataset_dir,
-         "dataset_id": dataset_id,
-     }
-
-
- def download_dataset_v4(
-     organization_id: str,
-     workspace_id: str,
-     dataset_id: str,
-     read_files: bool = True,
- ) -> Dict[str, Any]:
-     """
-     Download a single dataset by ID.
-
-     Args:
-         organization_id: Organization ID
-         workspace_id: Workspace ID
-         dataset_id: Dataset ID
-         read_files: Whether to read file contents
-
-     Returns:
-         Dataset information dictionary
-     """
-
-     # Get dataset information
-     with get_api_client()[0] as api_client:
-         api_instance = DatasetApi(api_client)
-         dataset = api_instance.find_dataset_by_id(organization_id=organization_id, dataset_id=dataset_id)
-
-     if dataset.connector is None:
-         parameters = []
-     else:
-         parameters = dataset.connector.parameters_values
-
-     is_adt = "AZURE_DIGITAL_TWINS_URL" in parameters
-     is_storage = "AZURE_STORAGE_CONTAINER_BLOB_PREFIX" in parameters
-     is_legacy_twin_cache = "TWIN_CACHE_NAME" in parameters and dataset.twingraph_id is None
-     is_in_workspace_file = (
-         False if dataset.tags is None else "workspaceFile" in dataset.tags or "dataset_part" in dataset.tags
-     )
-
-     # Download based on dataset type
-     if is_adt:
-         content, folder_path = download_adt_dataset(
-             adt_address=parameters["AZURE_DIGITAL_TWINS_URL"],
-             credentials=DefaultAzureCredential(),
-         )
-         return {
-             "type": "adt",
-             "content": content,
-             "name": dataset.name,
-             "folder_path": str(folder_path),
-             "dataset_id": dataset_id,
-         }
-
-     elif is_legacy_twin_cache:
-         twin_cache_name = parameters["TWIN_CACHE_NAME"]
-         content, folder_path = download_legacy_twingraph_dataset(
-             organization_id=organization_id, cache_name=twin_cache_name
-         )
-         return {
-             "type": "twincache",
-             "content": content,
-             "name": dataset.name,
-             "folder_path": str(folder_path),
-             "dataset_id": dataset_id,
-         }
-
-     elif is_storage:
-         _file_name = parameters["AZURE_STORAGE_CONTAINER_BLOB_PREFIX"].replace("%WORKSPACE_FILE%/", "")
-         content, folder_path = download_file_dataset(
-             organization_id=organization_id,
-             workspace_id=workspace_id,
-             file_name=_file_name,
-             read_files=read_files,
-         )
-         return {
-             "type": _file_name.split(".")[-1],
-             "content": content,
-             "name": dataset.name,
-             "folder_path": str(folder_path),
-             "dataset_id": dataset_id,
-             "file_name": _file_name,
-         }
-
-     elif is_in_workspace_file:
-         _file_name = dataset.source.location
-         content, folder_path = download_file_dataset(
-             organization_id=organization_id,
-             workspace_id=workspace_id,
-             file_name=_file_name,
-             read_files=read_files,
-         )
-         return {
-             "type": _file_name.split(".")[-1],
-             "content": content,
-             "name": dataset.name,
-             "folder_path": str(folder_path),
-             "dataset_id": dataset_id,
-             "file_name": _file_name,
-         }
-
-     else:
-         content, folder_path = download_twingraph_dataset(organization_id=organization_id, dataset_id=dataset_id)
-         return {
-             "type": "twincache",
-             "content": content,
-             "name": dataset.name,
-             "folder_path": str(folder_path),
-             "dataset_id": dataset_id,
-         }
-
-
- def download_dataset_process(_dataset_id, organization_id, workspace_id, read_files, _return_dict, _error_dict):
-     """
-     Process function for downloading a dataset in a separate process.
-
-     This function is designed to be used with multiprocessing to download datasets in parallel.
-     It downloads a single dataset and stores the result in a shared dictionary.
-     If an error occurs, it stores the error message in a shared error dictionary and re-raises the exception.
-
-     Args:
-         _dataset_id: Dataset ID to download
-         organization_id: Organization ID
-         workspace_id: Workspace ID
-         read_files: Whether to read file contents
-         _return_dict: Shared dictionary to store successful download results
-         _error_dict: Shared dictionary to store error messages
-
-     Raises:
-         Exception: Any exception that occurs during dataset download is re-raised
-     """
-     try:
-         _c = download_dataset(
-             organization_id=organization_id,
-             workspace_id=workspace_id,
-             dataset_id=_dataset_id,
-             read_files=read_files,
-         )
-         _return_dict[_dataset_id] = _c
-     except Exception as e:
-         _error_dict[_dataset_id] = f"{type(e).__name__}: {str(e)}"
-         raise e
-
-
- def download_datasets_parallel(
-     organization_id: str,
-     workspace_id: str,
-     dataset_ids: List[str],
-     read_files: bool = True,
- ) -> Dict[str, Dict[str, Any]]:
-     """
-     Download multiple datasets in parallel.
-
-     Args:
-         organization_id: Organization ID
-         workspace_id: Workspace ID
-         dataset_ids: List of dataset IDs
-         read_files: Whether to read file contents
-
-     Returns:
-         Dictionary mapping dataset IDs to dataset information
-     """
-
-     # Use multiprocessing to download datasets in parallel
-     manager = multiprocessing.Manager()
-     return_dict = manager.dict()
-     error_dict = manager.dict()
-     processes = [
-         (
-             dataset_id,
-             multiprocessing.Process(
-                 target=download_dataset_process,
-                 args=(dataset_id, organization_id, workspace_id, read_files, return_dict, error_dict),
-             ),
-         )
-         for dataset_id in dataset_ids
-     ]
-
-     LOGGER.info(T("coal.services.dataset.parallel_download").format(count=len(dataset_ids)))
-
-     [p.start() for _, p in processes]
-     [p.join() for _, p in processes]
-
-     for dataset_id, p in processes:
-         # We might hit the following bug: https://bugs.python.org/issue43944
-         # As a workaround, only treat non-null exit code as a real issue if we also have stored an error
-         # message
-         if p.exitcode != 0 and dataset_id in error_dict:
-             raise ChildProcessError(f"Failed to download dataset '{dataset_id}': {error_dict[dataset_id]}")
-
-     return dict(return_dict)
-
-
- def download_datasets_sequential(
-     organization_id: str,
-     workspace_id: str,
-     dataset_ids: List[str],
-     read_files: bool = True,
- ) -> Dict[str, Dict[str, Any]]:
-     """
-     Download multiple datasets sequentially.
-
-     Args:
-         organization_id: Organization ID
-         workspace_id: Workspace ID
-         dataset_ids: List of dataset IDs
-         read_files: Whether to read file contents
-
-     Returns:
-         Dictionary mapping dataset IDs to dataset information
-     """
-
-     return_dict = {}
-     error_dict = {}
-
-     LOGGER.info(T("coal.services.dataset.sequential_download").format(count=len(dataset_ids)))
-
-     for dataset_id in dataset_ids:
-         try:
-             return_dict[dataset_id] = download_dataset(
-                 organization_id=organization_id,
-                 workspace_id=workspace_id,
-                 dataset_id=dataset_id,
-                 read_files=read_files,
-             )
-         except Exception as e:
-             error_dict[dataset_id] = f"{type(e).__name__}: {str(e)}"
-             raise ChildProcessError(f"Failed to download dataset '{dataset_id}': {error_dict.get(dataset_id, '')}")
-
-     return return_dict
-
-
- def download_datasets(
-     organization_id: str,
-     workspace_id: str,
-     dataset_ids: List[str],
-     read_files: bool = True,
-     parallel: bool = True,
- ) -> Dict[str, Dict[str, Any]]:
-     """
-     Download multiple datasets, either in parallel or sequentially.
-
-     Args:
-         organization_id: Organization ID
-         workspace_id: Workspace ID
-         dataset_ids: List of dataset IDs
-         read_files: Whether to read file contents
-         parallel: Whether to download in parallel
-
-     Returns:
-         Dictionary mapping dataset IDs to dataset information
-     """
-     if not dataset_ids:
-         return {}
-
-     if parallel and len(dataset_ids) > 1:
-         return download_datasets_parallel(
-             organization_id=organization_id,
-             workspace_id=workspace_id,
-             dataset_ids=dataset_ids,
-             read_files=read_files,
-         )
-     else:
-         return download_datasets_sequential(
-             organization_id=organization_id,
-             workspace_id=workspace_id,
-             dataset_ids=dataset_ids,
-             read_files=read_files,
-         )
-
-
- def dataset_to_file(dataset_info: Dict[str, Any], target_folder: Optional[Union[str, Path]] = None) -> str:
-     """
-     Convert dataset to files.
-
-     Args:
-         dataset_info: Dataset information dictionary
-         target_folder: Optional folder to save files (if None, uses temp dir)
-
-     Returns:
-         Path to folder containing files
-     """
-     dataset_type = dataset_info["type"]
-     content = dataset_info["content"]
-
-     if dataset_type in ["adt", "twincache"]:
-         # Use conversion function
-         if target_folder:
-             target_folder = convert_graph_dataset_to_files(content, target_folder)
-         else:
-             target_folder = convert_graph_dataset_to_files(content)
-         return str(target_folder)
-
-     # For file datasets, return the folder path
-     if "folder_path" in dataset_info:
-         return dataset_info["folder_path"]
-
-     # Fallback to creating a temp directory
-     if target_folder:
-         return str(target_folder)
-     else:
-         return tempfile.mkdtemp()
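The removed download_datasets_parallel works around CPython bug bpo-43944 (forked children can report a spurious non-zero exit code) by treating a non-zero exit code as fatal only when the child also recorded an error message in a shared dictionary. Below is a minimal, standalone sketch of that pattern; the worker function and item names are hypothetical placeholders, not part of the library.

import multiprocessing


def _worker(item: str, results, errors) -> None:
    # Hypothetical stand-in for download_dataset_process: record either a result or an error.
    try:
        if item.startswith("bad"):
            raise ValueError(f"cannot process {item}")
        results[item] = item.upper()
    except Exception as e:
        errors[item] = f"{type(e).__name__}: {e}"
        raise


if __name__ == "__main__":
    manager = multiprocessing.Manager()
    results, errors = manager.dict(), manager.dict()
    processes = [
        (item, multiprocessing.Process(target=_worker, args=(item, results, errors)))
        for item in ["d-1", "bad-2"]
    ]
    [p.start() for _, p in processes]
    [p.join() for _, p in processes]
    for item, p in processes:
        # bpo-43944 can yield spurious non-zero exit codes, so only fail when the
        # child actually recorded an error message (same workaround as the code above).
        if p.exitcode != 0 and item in errors:
            print(f"failed: {item}: {errors[item]}")
    print(dict(results))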
cosmotech/coal/cosmotech_api/runner/download.py
@@ -1,135 +0,0 @@
- # Copyright (C) - 2023 - 2025 - Cosmo Tech
- # This document and all information contained herein is the exclusive property -
- # including all intellectual property rights pertaining thereto - of Cosmo Tech.
- # Any use, reproduction, translation, broadcasting, transmission, distribution,
- # etc., to any person is prohibited unless it has been previously and
- # specifically authorized by written means by Cosmo Tech.
-
- """
- Orchestration functions for downloading runner and run data.
- """
-
- import os
- import pathlib
- import shutil
- from typing import Dict, Any, Optional
-
- from cosmotech.coal.cosmotech_api.runner.data import get_runner_data
- from cosmotech.coal.cosmotech_api.runner.parameters import (
-     format_parameters_list,
-     write_parameters,
- )
- from cosmotech.coal.cosmotech_api.runner.datasets import (
-     get_dataset_ids_from_runner,
-     download_datasets,
-     dataset_to_file,
- )
- from cosmotech.coal.utils.logger import LOGGER
- from cosmotech.orchestrator.utils.translate import T
-
-
- def download_runner_data(
-     organization_id: str,
-     workspace_id: str,
-     runner_id: str,
-     parameter_folder: str,
-     dataset_folder: Optional[str] = None,
-     read_files: bool = False,
-     parallel: bool = True,
-     write_json: bool = True,
-     write_csv: bool = False,
-     fetch_dataset: bool = True,
- ) -> Dict[str, Any]:
-     """
-     Download all runner data including datasets and parameters.
-
-     Args:
-         organization_id: Organization ID
-         workspace_id: Workspace ID
-         runner_id: Runner ID
-         parameter_folder: Folder to save parameters
-         dataset_folder: Folder to save datasets (if None, only saves datasets referenced by parameters)
-         read_files: Whether to read file contents
-         parallel: Whether to download datasets in parallel
-         write_json: Whether to write parameters as JSON
-         write_csv: Whether to write parameters as CSV
-         fetch_dataset: Whether to fetch datasets
-
-     Returns:
-         Dictionary with runner data, datasets, and parameters
-     """
-     LOGGER.info(T("coal.cosmotech_api.runner.starting_download"))
-
-     # Get runner data
-     runner_data = get_runner_data(organization_id, workspace_id, runner_id)
-
-     # Create result dictionary
-     result = {"runner_data": runner_data, "datasets": {}, "parameters": {}}
-
-     # Skip if no parameters found
-     if not runner_data.parameters_values:
-         LOGGER.warning(T("coal.cosmotech_api.runner.no_parameters"))
-         return result
-
-     LOGGER.info(T("coal.cosmotech_api.runner.loaded_data"))
-
-     # Format parameters
-     parameters = format_parameters_list(runner_data)
-     result["parameters"] = {param["parameterId"]: param["value"] for param in parameters}
-
-     # Download datasets if requested
-     if fetch_dataset:
-         dataset_ids = get_dataset_ids_from_runner(runner_data)
-
-         if dataset_ids:
-             LOGGER.info(T("coal.cosmotech_api.runner.downloading_datasets").format(count=len(dataset_ids)))
-
-             datasets = download_datasets(
-                 organization_id=organization_id,
-                 workspace_id=workspace_id,
-                 dataset_ids=dataset_ids,
-                 read_files=read_files,
-                 parallel=parallel,
-             )
-
-             result["datasets"] = datasets
-
-             # List datasets set as parameter
-             datasets_parameters_ids = {
-                 param.value: param.parameter_id
-                 for param in runner_data.parameters_values
-                 if param.var_type == "%DATASETID%" and param.value
-             }
-
-             # Save parameter datasets to parameter folders
-             for dataset_id, dataset_info in datasets.items():
-                 # If dataset is referenced by a parameter, save to parameter folder
-                 if dataset_id in datasets_parameters_ids:
-                     param_id = datasets_parameters_ids[dataset_id]
-                     param_dir = os.path.join(parameter_folder, param_id)
-                     pathlib.Path(param_dir).mkdir(exist_ok=True, parents=True)
-
-                     dataset_folder_path = dataset_to_file(dataset_info)
-                     shutil.copytree(dataset_folder_path, param_dir, dirs_exist_ok=True)
-
-                     # Update parameter value to point to the folder
-                     for param in parameters:
-                         if param["parameterId"] == param_id:
-                             param["value"] = param_dir
-                             break
-
-                 # If dataset is in dataset_list and dataset_folder is provided, save there too
-                 if dataset_folder and dataset_id in runner_data.dataset_list:
-                     pathlib.Path(dataset_folder).mkdir(parents=True, exist_ok=True)
-                     dataset_folder_path = dataset_to_file(dataset_info)
-                     shutil.copytree(dataset_folder_path, dataset_folder, dirs_exist_ok=True)
-                     LOGGER.debug(
-                         T("coal.cosmotech_api.runner.dataset_debug").format(folder=dataset_folder, id=dataset_id)
-                     )
-
-     # Write parameters to files
-     if write_json or write_csv:
-         LOGGER.info(T("coal.cosmotech_api.runner.writing_parameters"))
-         write_parameters(parameter_folder, parameters, write_csv, write_json)
-
-     return result
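The removed download_runner_data copies each dataset referenced by a %DATASETID% parameter into parameter_folder/<parameterId>/ and then repoints that parameter's value at the folder. A minimal, self-contained sketch of that step follows; the parameter name, dataset file, and temporary paths are hypothetical, not taken from the library.

import pathlib
import shutil
import tempfile

# Hypothetical inputs: a downloaded dataset folder and a parameter that references it.
dataset_folder_path = tempfile.mkdtemp()
pathlib.Path(dataset_folder_path, "nodes.csv").write_text("id\n1\n")
parameters = [{"parameterId": "demand_plan", "value": "d-hypothetical", "varType": "%DATASETID%"}]

parameter_folder = tempfile.mkdtemp()
param_dir = pathlib.Path(parameter_folder) / parameters[0]["parameterId"]
param_dir.mkdir(parents=True, exist_ok=True)

# Copy the dataset files under <parameter_folder>/<parameterId>/ ...
shutil.copytree(dataset_folder_path, param_dir, dirs_exist_ok=True)
# ... and repoint the parameter value at that folder, as the removed helper did.
parameters[0]["value"] = str(param_dir)
print(parameters[0])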
cosmotech/coal/cosmotech_api/runner/metadata.py
@@ -1,42 +0,0 @@
- # Copyright (C) - 2023 - 2025 - Cosmo Tech
- # This document and all information contained herein is the exclusive property -
- # including all intellectual property rights pertaining thereto - of Cosmo Tech.
- # Any use, reproduction, translation, broadcasting, transmission, distribution,
- # etc., to any person is prohibited unless it has been previously and
- # specifically authorized by written means by Cosmo Tech.
-
- """
- Runner metadata retrieval functions.
- """
-
- from typing import Any, Optional
-
- import cosmotech_api
-
-
- def get_runner_metadata(
-     api_client: cosmotech_api.api_client.ApiClient,
-     organization_id: str,
-     workspace_id: str,
-     runner_id: str,
-     include: Optional[list[str]] = None,
-     exclude: Optional[list[str]] = None,
- ) -> dict[str, Any]:
-     """
-     Get runner metadata from the API.
-
-     Args:
-         api_client: The API client to use
-         organization_id: The ID of the organization
-         workspace_id: The ID of the workspace
-         runner_id: The ID of the runner
-         include: Optional list of fields to include
-         exclude: Optional list of fields to exclude
-
-     Returns:
-         Dictionary with runner metadata
-     """
-     runner_api = cosmotech_api.RunnerApi(api_client)
-     runner: cosmotech_api.Runner = runner_api.get_runner(organization_id, workspace_id, runner_id)
-
-     return runner.model_dump(by_alias=True, exclude_none=True, include=include, exclude=exclude, mode="json")
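The removed get_runner_metadata delegates its include/exclude filtering to Pydantic v2's model_dump. A minimal sketch of that behavior on a hypothetical stand-in model (not the actual cosmotech_api.Runner class):

from typing import Optional

from pydantic import BaseModel, Field


class RunnerStub(BaseModel):
    # Hypothetical stand-in for cosmotech_api.Runner, just to show the dump options.
    id: str
    name: str
    run_template_id: str = Field(alias="runTemplateId")
    description: Optional[str] = None


runner = RunnerStub(id="r-1", name="demo", runTemplateId="sim")

# by_alias keeps camelCase keys, exclude_none drops unset optionals,
# include/exclude filter fields the same way the removed helper exposed them.
print(runner.model_dump(by_alias=True, exclude_none=True, mode="json"))
print(runner.model_dump(by_alias=True, include={"id", "name"}, mode="json"))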