PyPI - databricks-air - Versions diffs - 0.1.0b1__py3-none-any.whl - Mend

databricks-air 0.1.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45) hide show

cli/__init__.py +5 -0
cli/ai_training_client.py +51 -0
cli/base_client.py +101 -0
cli/changelog.md +138 -0
cli/cli_display.py +711 -0
cli/cli_entrypoint.py +3100 -0
cli/cli_output.py +223 -0
cli/compute.py +146 -0
cli/docker_utils.py +82 -0
cli/error_detection.py +190 -0
cli/get_env_secrets.py +135 -0
cli/image_registration/__init__.py +41 -0
cli/image_registration/docker_config_creds.py +143 -0
cli/image_registration/image_client.py +257 -0
cli/image_registration/image_credentials.py +317 -0
cli/image_registration/image_policy.py +234 -0
cli/jobs_api_client.py +1440 -0
cli/json_output.py +40 -0
cli/log_streaming.py +1363 -0
cli/mlflow_metrics.py +173 -0
cli/mlflow_rest_client.py +161 -0
cli/mlflow_system_metrics.py +127 -0
cli/node_sanity_check.sh +338 -0
cli/run_config.py +748 -0
cli/sdk.py +255 -0
cli/serverless_policy_client.py +104 -0
cli/telemetry.py +226 -0
cli/utils/__init__.py +134 -0
cli/utils/auth.py +267 -0
cli/utils/git_state.py +639 -0
cli/utils/mapi/__init__.py +27 -0
cli/utils/mapi/api.py +117 -0
cli/utils/mapi/launch_script.py +577 -0
cli/utils/retry.py +115 -0
cli/utils/snapshot.py +255 -0
cli/utils/uploads.py +229 -0
cli/utils/workspace.py +140 -0
cli/version.py +92 -0
cli/yaml_config.py +700 -0
cli/yaml_help.py +972 -0
cli/yaml_overrides.py +210 -0
databricks_air-0.1.0b1.dist-info/METADATA +13 -0
databricks_air-0.1.0b1.dist-info/RECORD +45 -0
databricks_air-0.1.0b1.dist-info/WHEEL +4 -0
databricks_air-0.1.0b1.dist-info/entry_points.txt +2 -0

cli/__init__.py ADDED Viewed

@@ -0,0 +1,5 @@
+"""Databricks AIR CLI package."""
+# NOTE: version and git sha are determined at release time; the values listed here are placeholders
+__version__ = "0.1.0b1"
+__git_sha__ = "7cd7f024ac30887540b70722d0af231cd9176538"

cli/ai_training_client.py ADDED Viewed

@@ -0,0 +1,51 @@
+"""Client for AiTrainingService — the per-user training-workflow surface on AICM.
+Authenticated as the calling user via the Databricks SDK's [[WorkspaceClient]], matching how
+[[cli.image_registration.image_client.ImageClient]] talks to AICM today.
+"""
+import logging
+from typing import Any, Dict, Optional
+from cli.utils import get_workspace_client
+from databricks.sdk import WorkspaceClient
+log = logging.getLogger(__name__)
+class AiTrainingClient:
+    """REST client for /api/2.0/ai-compute-manager/training/workflows.
+    Requests go through a Databricks SDK WorkspaceClient that is created on first use
+    and then reused for the lifetime of this object.
+    """
+    # Base path that _api_request prepends to every endpoint it is given.
+    API_PATH = "/api/2.0/ai-compute-manager/training/workflows"
+    def __init__(self, workspace_host: Optional[str] = None):
+        """Store the optional workspace host; no WorkspaceClient is created here.
+        Args:
+            workspace_host: Forwarded to get_workspace_client when the client is first needed.
+        """
+        self.workspace_host = workspace_host
+        # Populated lazily by _get_workspace_client.
+        self._workspace_client: Optional[WorkspaceClient] = None
+    def _get_workspace_client(self) -> WorkspaceClient:
+        """Return the cached WorkspaceClient, creating it via get_workspace_client on first call."""
+        if self._workspace_client is None:
+            self._workspace_client = get_workspace_client(workspace_host=self.workspace_host)
+        return self._workspace_client
+    def _api_request(
+        self,
+        method: str,
+        endpoint: str,
+        data: Optional[Dict[str, Any]] = None,
+        params: Optional[Dict[str, Any]] = None,
+    ) -> Dict[str, Any]:
+        """Send one request to API_PATH + endpoint through the workspace client.
+        Args:
+            method: HTTP method, passed through to api_client.do.
+            endpoint: Path suffix appended to API_PATH without any normalization.
+            data: Passed to api_client.do as body.
+            params: Passed to api_client.do as query.
+        Returns:
+            The value returned by api_client.do, or an empty dict when that value is falsy.
+        Raises:
+            RuntimeError: On any exception raised by the request; the original exception
+                is chained as the cause.
+        """
+        w = self._get_workspace_client()
+        path = f"{self.API_PATH}{endpoint}"
+        try:
+            response = w.api_client.do(method, path, body=data, query=params)
+            # Normalize falsy responses (e.g. None) so callers always receive a dict.
+            return response if response else {}
+        except Exception as e:
+            # Every failure is surfaced as RuntimeError; the original type is only available via __cause__.
+            raise RuntimeError(f"AiTrainingService request failed: {e}") from e
+    def cancel_workflow(self, job_run_id: str) -> None:
+        """Cancel a training workflow by its Jobs job_run_id.
+        Maps to CancelTrainingWorkflow with a job_run_id ref. No-op server-side if the run is
+        already in a terminal state.
+        Args:
+            job_run_id: Run identifier; interpolated into the request path as-is (no escaping).
+        Raises:
+            RuntimeError: If the underlying request fails (raised by _api_request).
+        """
+        # The response body is discarded; an empty JSON object is sent as the POST body.
+        self._api_request(method="POST", endpoint=f"/by-run-id/{job_run_id}/cancel", data={})

cli/base_client.py ADDED Viewed

@@ -0,0 +1,101 @@
+"""Base client interface and types for workload management.
+This module defines the abstract interface that all workload clients must implement,
+ensuring consistent behavior across different backend APIs (Jobs API, CMv3, etc.).
+It also defines the client type enumeration.
+"""
+from abc import ABC, abstractmethod
+from enum import Enum
+from typing import Dict, Any, Optional
+from cli.compute import GPUType
+class WorkloadClientType(Enum):
+    """Enum for different workload client types.
+    Each member's value is a human-readable label; str(member) returns that label.
+    """
+    JOBS_API = "Jobs API"
+    AI_COMPUTE_MANAGER = "AI Compute Manager (CMv3)"
+    def __str__(self) -> str:
+        """Return the string representation of the client type."""
+        # Overrides the default "WorkloadClientType.NAME" rendering with the label.
+        return self.value
+class WorkloadClient(ABC):
+    """Abstract base class for workload management clients.
+    This interface ensures that all workload clients (Jobs API, CMv3, etc.)
+    provide the same core functionality with consistent method signatures.
+    Concrete subclasses must implement create_workload, get_workload_status,
+    terminate_workload, list_workloads and the client_type property.
+    """
+    # NOTE(review): the Args text below describes experiment_name as "Name of the function
+    # to execute"; that wording looks stale for a parameter with this name -- confirm.
+    @abstractmethod
+    def create_workload(
+        self,
+        gpus: int,
+        gpus_per_node: int,
+        python_file: str,
+        func_dir: str,
+        experiment_name: str,
+        timeout_seconds: int,
+        param_to_args: dict,
+        env_sync_timeout: int,
+        gpu_type: GPUType,
+        max_retries: int = 3,
+    ) -> str:
+        """Create a workload and return workload/run ID.
+        Args:
+            gpus: Number of GPUs required
+            gpus_per_node: GPUs per node (for distributed workloads)
+            python_file: Path to the Python script to execute
+            func_dir: Directory containing the function
+            experiment_name: Name of the function to execute
+            timeout_seconds: Timeout for the workload
+            param_to_args: Parameters to pass to the function
+            env_sync_timeout: Environment synchronization timeout
+            gpu_type: Type of GPU required
+            max_retries: Maximum number of retries for failed tasks (default: 3)
+        Returns:
+            Workload/run ID as string
+        Raises:
+            Exception: If workload creation fails
+        """
+        pass
+    @abstractmethod
+    def get_workload_status(self, workload_id: str) -> Dict[str, Any]:
+        """Get workload status and metadata.
+        Args:
+            workload_id: Unique workload identifier
+        Returns:
+            Dictionary containing workload status and metadata
+        """
+        pass
+    @abstractmethod
+    def terminate_workload(self, workload_id: str, reason: Optional[str] = None) -> bool:
+        """Terminate a running workload.
+        Args:
+            workload_id: Unique workload identifier
+            reason: Optional reason for termination
+        Returns:
+            True if termination was successful, False otherwise
+        """
+        pass
+    @abstractmethod
+    def list_workloads(self, **kwargs) -> Dict[str, Any]:
+        """List workloads.
+        Args:
+            **kwargs: Client-specific parameters for listing (pagination, filters, etc.)
+        Returns:
+            Dictionary containing workloads and pagination info
+        """
+        pass
+    # Abstract read-only property: subclasses expose their WorkloadClientType here.
+    @property
+    @abstractmethod
+    def client_type(self) -> WorkloadClientType:
+        """Get the client type for logging and debugging.
+        Returns:
+            WorkloadClientType enum identifying the client type
+        """
+        pass

cli/changelog.md ADDED Viewed

@@ -0,0 +1,138 @@
+# Start of CLI Changelog
+Version 1.0.0
+[CNXT-2129] air run --dry-run now implies -v unless verbosity is already set.
+[CNXT-2130] air cancel: print a friendly "Run X not found" message instead of leaking the Jobs API URL.
+Launch artifacts now upload under .air/cli_launch/<experiment>/<run_name>_<uuid> with the command script named command.sh, and --help/--version help text is capitalized.
+Show [Beta] badges in help output: `air --help` (AIR CLI is in Beta) and Beta GPU types (e.g. GPU_1xH100) in `air -h config.compute`
+Stamp the installed CLI wheel version on BYOT submissions (client_version field) so the training service can correlate runs to a CLI release.
+`air cancel --all` now lists target runs and workspace before prompting for y/yes confirmation; skip with -y/--yes.
+Route `air cancel` through AiTrainingService instead of the Jobs API (no user-visible change).
+Forward `credentials-for-read` response headers to the pre-signed URL fetch in `mlflow_rest_client.download_artifact`. Fixes log/artifact downloads on Azure-backed workspaces where SAS URIs require headers like `x-ms-version` to authenticate.
+Fix cryptic "'str' object has no attribute 'decode'" error on submit when no Databricks profile is configured; surface the real authentication error instead
+Fix `sgcli --json run --watch` returning immediately with `status=PENDING` instead of blocking until terminal. The JSON path now emits a `SUBMITTED` event with run_id, streams `STATUS`/`LOG`/`ALERT` JSONL events through the watch, then emits a final success envelope whose `status` reflects the actual terminal state (`SUCCESS`/`FAILED`/`TIMEDOUT`/`CANCELED`).
+Fix post-submit MLflow run-name update clobbering the user-supplied `mlflow_run_name`. The CLI now passes the user's run_name (when set) to the post-submit `/api/2.0/mlflow/runs/update` call, falling back to `job_run_name` (== `experiment_name`) only when the user did not set one. Previously the CLI always passed `job_run_name`, which overwrote the run name the ai-training server had just set from `gen_ai_compute_task.mlflow_run_name`.
+Fix the "Waiting for run to start" spinner smearing into the first lines of log output during `air logs` and `air run --watch`. The Rich Live render thread was still drawing the spinner while raw bytes were being written to stdout from inside `print_new_logs`; `live.stop()` only ran in the caller after the write returned. The fix threads an `on_before_print` hook through `print_new_logs` that the streaming caller uses to tear down the spinner strictly before any byte hits stdout, and only when there is actually new content to emit (a no-op poll keeps the spinner running).
+Fix spinner artifacts in redirected output and JSON mode by detecting TTY status before creating Rich console objects
+[air-cli] Align `air get run` display with `air list runs`: rename "Task Run ID" -> "Run ID", adopt list-runs color palette, add User and Accelerators rows.
+[air-cli] Help/error polish: drop duplicate `--download_to` alias in `air logs -h`; add description lines to subcommand help; `air` (no args) and `air config[.field]` now print help; remove UNCOMMITTED-CHANGES table from `air -h config.code_source`; long YAML help (`air -h config[.field]`) now opens in a pager unless `--json` / piped; pull_wheel.sh gains `--download-only` (downloads to /tmp and prints the `uv tool install` command); `Failed to get run output: 404` during list-runs no longer logs at ERROR; YAML validation deduplicates the "Available fields are: …" list when multiple unknown fields share a parent.
+Added support for inline dependencies in the workload YAML: environment.dependencies now accepts a list of packages alongside an environment.version field, as an alternative to pointing at a separate requirements.yaml file.
+Remove `no_interpolation` field and `--no-interpolation` flag; OmegaConf variable interpolation is now disabled unconditionally. Literal `${VAR}` strings in YAML are preserved as-is. Bash `$VAR` shell expansion at runtime on the worker node is unaffected.
+[air-cli] Plug --json envelope coverage gaps so machine consumers always get a structured envelope: `--version --json` returns `{version}` (instead of the ASCII banner); `--json changelog` returns `{version, changelog}`; `--json get pools` returns `{pools[], workspace_url, workspace_id}`; `--json run --dry-run` returns `{status: DRY_RUN_OK, dry_run: true}`; `--json register image` returns success envelopes (cached + new) and a classified error envelope (USER/INTERNAL_ERROR) on failure.
+PuPr filter consolidation: replace `get runs` flags `--user`, `--experiment`, and the deprecated `--all` with a single repeatable `--filter KEY=VALUE` flag (supported keys: user, experiment). `--all-users` and the other listing flags are unchanged.
+Pass user run_name through gen_ai_compute_task.mlflow_run_name on the TS path.
+sgcli logs: follow multi-chunk MLflow log streams; default to last 10000 lines for completed runs
+Group code_source.snapshot.{git_branch,git_commit,use_remote_head,remote_alias} under a nested `git:` object; consolidate remote behavior into bool-or-string `git.remote`.
+Allow --override to add new YAML fields and auto-create missing nested blocks; typos still surface from Pydantic schema validation.
+Without git_branch/git_commit, package working tree as plain tar; upload git state (base/tip/dirty + diff) as WSFS sidecars for the backend to apply MLflow tags.
+PuPr command renames: monitor->subscribe, get runs->list runs, get status->get run, get logs->logs; cancel now takes one or more run IDs or --all; removed --no-interpolation, get-run --watch, logs --local-rank; hid get pools, register image, logs --review; deprecated aliases (get runs, get status, get logs) still parse with a warning.
+PuPr cleanup: `environment` block is now optional in YAML, and `--host` is removed from `sgcli list runs` / `sgcli get pools` (workspace comes from `-p/--profile`).
+Expand `air register image -h` to walk through the three credential methods (docker login -- recommended, `--interactive-authenticate`, `--scope`/`--key`) with concrete examples.
+[air-cli] Reject workload configs whose YAML exceeds 1 MB before submission, with a clear error pointing at oversized parameters/command fields.
+Remove the `bash_script` YAML field. Use the top-level `command:` field instead. Configs that still specify `bash_script` now fail validation with a hint pointing at the replacement.
+Remove deprecated YAML fields: 'workspace' (use -p/--profile) and 'code_source.snapshot.allow_uncommitted' (commit changes or pin with git_commit / git_branch + use_remote_head). Raise 'command' cap from 500 to 1000 lines.
+Remove --local-rank from `sgcli logs` / `sgcli subscribe` and the `local_rank` parameter from `sdk.get_logs()` — per-rank logs are not produced by the platform; every rank funnels into the consolidated per-node stream
+Remove the deprecated nested `environment.env_variables` and `environment.env_variables_secrets` YAML fields. Use the top-level `env_variables:` and `secrets:` fields instead. Configs that still set the nested forms now fail validation with a hint pointing at the replacement.
+[BREAKING] Rename CLI from `sgcli` to `air`; wheel from `databricks-serverless-gpu-cli` to `databricks-air`. The `SGCLI_DISABLE_TELEMETRY` env var is now `AIR_DISABLE_TELEMETRY`. No deprecation alias — update scripts that invoke `sgcli`.
+Revert task_run_id as the user-facing identifier; the CLI again uses job_run_id as the canonical handle (#1929533 undone).
+[air-cli] Validation hardening: reject experiment_name > 100 chars client-side (Jobs API task_key limit); reject empty/whitespace docker_image_url in `air register image`; fix `air get logs --review` (without `--lines`) rendering "Last None lines per node" by coercing the None default to 200.
+Telemetry: emit job_run_id (Jobs API run_id) on sgcli run events to enable joins against AI scheduler activity logs.
+Assign a usage policy by name via usage_policy_name; removed usage_policy_id/budget_policy_id fields and the DATABRICKS_USAGE_POLICY_ID env var
+Switched requirements.yaml dependency installation from `pip install --upgrade` to `python3 -m uv pip install -U` (uv is bootstrapped via pip if missing). `--trusted-host` entries are now ignored with a warning since uv configures trust per index URL.
+`air --version` now prints an ASCII art banner alongside the version string.
+Rename user-facing YAML fields to align with cross-product Databricks naming. Old names are temporarily accepted with a deprecation warning and will be removed in an upcoming release. Renames: env_variables_secrets -> secrets, run_name -> mlflow_run_name, experiment_directory -> mlflow_experiment_directory, budget_policy_id -> usage_policy_id, code_source.snapshot.repo_path -> code_source.snapshot.root_path, compute.gpus -> compute.num_accelerators, compute.gpu_type -> compute.accelerator_type, compute.gpu_node_pool_id -> compute.node_pool_id, compute.gpu_pool_name -> compute.pool_name.
+Version 0.1.0
+[internal] Add hidden --via flag and git-aware code_source telemetry signals (via_flag, code_source_uses_git).
+sgcli register image: when --scope/--key and --interactive-authenticate are absent, auto-discover credentials from ~/.docker/config.json (including credHelpers/credsStore) and store them in the per-user Databricks scope, so a single docker login means subsequent registrations need no flags
+[CNXT-2084] Always emit MLflow system-metrics sidecar and run-name update in the launch script, including for custom docker images, so multi-node jobs report per-node metrics.
+CODE_SOURCE_PATH now points to the extracted code_source directory (e.g. /databricks/code_source/<dir_name>). Use `cd $CODE_SOURCE_PATH` instead of `cd $CODE_SOURCE/<dir>`.
+Declare `pydantic` and `packaging` as wheel dependencies (previously relied on transitive resolution in pre-existing venvs; required for `uv tool install` to produce a working sgcli).
+Replace mlflow Python SDK with a direct Databricks MLflow REST client; drop the mlflow wheel dependency.
+Drop unused wheel deps (cloudpickle, psutil, pynvml); delete vestigial cli/requirements.txt.
+Emit HardwareAcceleratorType enum names (e.g. GPU_1xA10, GPU_8xH100) for on-demand workloads so genai-mapi forwards to the ai-training service. Pool workloads continue to use the legacy gpu_type strings and stay on the MAPI path.
+Pass user env_variables through gen_ai_compute_task.env_vars.
+Auto-create experiment_directory at submit time if it does not exist, instead of letting the run fail server-side with INTERNAL_ERROR.
+sgcli register image --interactive-authenticate: validate Databricks workspace auth before prompting for the Docker username/PAT, so unauthenticated users get a clear error immediately instead of typing creds and then hitting 401
+[Bug-Fix] [ES-1857723] Fix broken `$HOME/<repo>` symlink for macOS users whose snapshot repos have extended attributes; pass tar directory name from Python instead of parsing tarball contents, and exclude AppleDouble (`._*`) files from snapshot tarballs
+[Bug-fix] sgcli monitor: surface every loss key and match the real per-GPU keys for auto-detected gpu_utilization / gpu_memory.
+[Bug-Fix] Resolve relative repo_path in YAML config relative to the YAML file's directory, not the current working directory
+[CNXT-2024] Fix: create default secret scope with open permissions; surface clear error when workspace scope quota is exceeded
+Fix `register image --tag-policy latest` ignoring the policy when no --scope/--key were passed
+Pass tar_workspace_path and requirements_yaml_path through gen_ai_compute_task so the server-side entry script can extract the workspace tarball and install pip dependencies.
+Security fix: default Docker credential scope is now per-Databricks-user (docker-credentials-<user>) and creator-only, preventing cross-user PAT reads
+Add DABs-compatible 'permissions' field; deprecate 'grant_permissions' with backward compatibility.
+Remove embedded agent_skills; use the "sgcli" Claude Code plugin instead
+Route workloads through the ai-training service when new enum type is specified.
+Pass user env_variables_secrets through gen_ai_compute_task.secret_env_vars on the TS path.
+Fix --watch and get logs hanging on quick timeout/error, show logs when job ends before reaching RUNNING, and wrap user scripts with set -euo pipefail by default.
+[CNXT-2030] Normalize short-form Docker Hub image names (e.g. ubuntu:latest -> docker.io/library/ubuntu:latest)
+Version 0.0.7
+Fix several small bugs: fix an override bug that silently dropped the second override; protect the command field from variable interpolation so bash ${VAR} references work.
+[Bug-fix] Fix hyperlinks in status table, show MLflow run name, suppress download bar spam, and improve error message when pool validation fails
+[BugFix] Fixes a bug where run submission could exit without setting the MLflow run name
+Add glob-style batch cancellation: sgcli cancel --match 'pattern' [-y]
+[CNXT-1853] Add priority field for pool workload scheduling
+[CNXT-1939] Create versioned composite snapshot key
+Introduce CODE_SOURCE_PATH environment variable so users can locate the uploaded code.
+get client telemetry working with sgcli
+[Bug-Fix] Fix symlink failure when extracting git archive snapshots
+Add --retry flag to `get logs` to view logs from a specific retry attempt
+Update 'get runs' to show all runs by default; add --active, --all-users, and --user flags
+Honor .gitignore when snapshotting non-git directories, preventing venv and other ignored files from being uploaded
+[CNXT-1638] Fix --json mode to suppress human-readable output and add some caching to remove redundant auth calls
+Improve CLI startup performance by ~3.5x via lazy imports of heavy dependencies (databricks-sdk, mlflow)
+[Bug-Fix] Ensure MLflow sidecar cleanup runs on any exit (including failures) via EXIT trap to prevent GPU hang
+[Bug-Fix] Fix stalled jobs caused by MLflow system metrics sidecar not terminating after user code completes
+Decrease the timeout for MLflow api call to update the MLflow run name
+Suppress noisy Apple extended-header warnings during tarball extraction
+Support git worktrees and allow include_paths for non-git directories
+Version 0.0.6
+[CNXT-1924] Make sgcli more agent friendly. Introduce --json and monitor command
+[UX] Make -p, and -v flags global flags that work before or after the subcommand
+[CNXT-1891] Support runtime variable interpolation in env_variables
+[Bug-Fix] Fix sandbox script for DCS to not assume any package installs
+[Bug-Fix] Fix use of 'remote_head'
+[Deprecation] Removed no-image-upload flag
+[CNXT-1638] Support v5 client and deprecate v3
+[CNXT-1887] Add email support through --email flag
+[CNXT-1877] Add client telemetry for SGCLI
+[Deprecation] Removed no-image-upload flag
+Version 0.0.5
+[Bug-fix] Remove print_error from env_secrets
+[CNXT-1859] Remove python script from sgcli
+[Bug-fix] Fix uncommitted snapshot code path directories
+[Bug-fix] Fix call to experiment creation.
+[CNXT-1778] Add color for sgcli
+[CNXT-1844] Remove git clone from cli src
+[CNXT-1727] Fix bug in setting permissions in config
+[CNXT-1832] Improve performance of `sgcli get runs` command
+[CNXT-1784] Provide yaml pointers from sgcli tool
+[CNXT-1638] Add dry run command
+[CNXT-1828] Add budget policy attribution field
+Version 0.0.4
+[CNXT-1817] Fix uncommitted changes not being captured in snapshot. Also allows include_paths when changes are outside those paths.
+[CNXT-1809] Update log syntax and allow for downloading to a specified directory
+[CNXT-1806] Add support for automatic permission granting after job submission with `grant_permissions` field.
+[CNXT-1638] Make logs from dependency installation unbuffered
+Version 0.0.3
+[CNXT-1727] Allow uncommitted changes and simplify git UX.
+[CNXT-1759] Add support for non-git folders.
+[CNXT-1713] Add support for variable interpretation in local and remote.
+[Bug-fix] Fix hardcoded email from databricks.com to actual user email.
+Version 0.0.2
+[CNXT-1727] Support for subfolders in snapshot via git archive. This is a major update to speed up snapshot especially in large repos.
+[CNXT-1716] Remove unique suffix from job run names. Experiment corresponds exactly to Job Run name.
+Version 0.0.1
+[CNXT-1727] Add changelog command.
+[CNXT-1706] Fix get runs hyperlinks and add get status hyperlinks.
+[CNXT-1713] Add experiment_name validation and reduce log level.
+[CNXT-1624] Validate fields and gpu num during workload submission.
+[CNXT-1538] Add Streaming Logs capability.