colabctl 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- colabctl/__init__.py +71 -0
- colabctl/auth/__init__.py +21 -0
- colabctl/auth/adc.py +80 -0
- colabctl/auth/base.py +63 -0
- colabctl/backends/__init__.py +42 -0
- colabctl/backends/base.py +147 -0
- colabctl/backends/colab.py +166 -0
- colabctl/backends/factory.py +65 -0
- colabctl/backends/hf_backend.py +197 -0
- colabctl/backends/kaggle_backend.py +216 -0
- colabctl/backends/modal_backend.py +201 -0
- colabctl/backends/router.py +76 -0
- colabctl/backends/vertex_backend.py +237 -0
- colabctl/cli.py +327 -0
- colabctl/drive.py +218 -0
- colabctl/errors.py +147 -0
- colabctl/lifecycle.py +194 -0
- colabctl/mcp_server.py +209 -0
- colabctl/models.py +233 -0
- colabctl/observability.py +107 -0
- colabctl/sdk/__init__.py +8 -0
- colabctl/sdk/client.py +202 -0
- colabctl/sdk/remote.py +178 -0
- colabctl/secrets/__init__.py +37 -0
- colabctl/secrets/base.py +56 -0
- colabctl/secrets/encrypted_file.py +122 -0
- colabctl/secrets/keyring_store.py +107 -0
- colabctl/secrets/memory.py +24 -0
- colabctl/transport/__init__.py +18 -0
- colabctl/transport/base.py +122 -0
- colabctl/transport/browser/__init__.py +19 -0
- colabctl/transport/browser/bridge.py +260 -0
- colabctl/transport/cli/__init__.py +15 -0
- colabctl/transport/cli/adapter.py +252 -0
- colabctl/transport/cli/parser.py +192 -0
- colabctl/transport/native/__init__.py +49 -0
- colabctl/transport/native/adapter.py +260 -0
- colabctl/transport/native/client.py +371 -0
- colabctl/transport/native/kernel.py +259 -0
- colabctl-0.1.0.dist-info/METADATA +200 -0
- colabctl-0.1.0.dist-info/RECORD +44 -0
- colabctl-0.1.0.dist-info/WHEEL +4 -0
- colabctl-0.1.0.dist-info/entry_points.txt +3 -0
- colabctl-0.1.0.dist-info/licenses/LICENSE +201 -0
colabctl/__init__.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
"""colabctl — programmatic control of Google Colab for developers and AI agents.
|
|
2
|
+
|
|
3
|
+
Public API is intentionally small and stable; everything else is an
|
|
4
|
+
implementation detail behind the transport/provider abstractions.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from colabctl.drive import DriveSync, drive_checkpoint_hooks
|
|
10
|
+
from colabctl.errors import (
|
|
11
|
+
AcceleratorUnavailableError,
|
|
12
|
+
AllocationError,
|
|
13
|
+
AuthError,
|
|
14
|
+
ColabctlError,
|
|
15
|
+
ConfigurationError,
|
|
16
|
+
ExecutionError,
|
|
17
|
+
FileTransferError,
|
|
18
|
+
KeepAliveError,
|
|
19
|
+
KernelError,
|
|
20
|
+
QuotaExceededError,
|
|
21
|
+
SecretStoreError,
|
|
22
|
+
TooManyAssignmentsError,
|
|
23
|
+
TransportError,
|
|
24
|
+
)
|
|
25
|
+
from colabctl.lifecycle import RuntimeLifecycleManager
|
|
26
|
+
from colabctl.models import (
|
|
27
|
+
Accelerator,
|
|
28
|
+
Assignment,
|
|
29
|
+
ExecutionResult,
|
|
30
|
+
MachineShape,
|
|
31
|
+
RuntimeProxyInfo,
|
|
32
|
+
RuntimeSpec,
|
|
33
|
+
SessionInfo,
|
|
34
|
+
SessionStatus,
|
|
35
|
+
Variant,
|
|
36
|
+
)
|
|
37
|
+
from colabctl.sdk import ColabClient, ColabSession, remote
|
|
38
|
+
|
|
39
|
+
__version__ = "0.1.0"
|
|
40
|
+
|
|
41
|
+
__all__ = [
|
|
42
|
+
"Accelerator",
|
|
43
|
+
"AcceleratorUnavailableError",
|
|
44
|
+
"AllocationError",
|
|
45
|
+
"Assignment",
|
|
46
|
+
"AuthError",
|
|
47
|
+
"ColabClient",
|
|
48
|
+
"ColabSession",
|
|
49
|
+
"ColabctlError",
|
|
50
|
+
"ConfigurationError",
|
|
51
|
+
"DriveSync",
|
|
52
|
+
"ExecutionError",
|
|
53
|
+
"ExecutionResult",
|
|
54
|
+
"FileTransferError",
|
|
55
|
+
"KeepAliveError",
|
|
56
|
+
"KernelError",
|
|
57
|
+
"MachineShape",
|
|
58
|
+
"QuotaExceededError",
|
|
59
|
+
"RuntimeLifecycleManager",
|
|
60
|
+
"RuntimeProxyInfo",
|
|
61
|
+
"RuntimeSpec",
|
|
62
|
+
"SecretStoreError",
|
|
63
|
+
"SessionInfo",
|
|
64
|
+
"SessionStatus",
|
|
65
|
+
"TooManyAssignmentsError",
|
|
66
|
+
"TransportError",
|
|
67
|
+
"Variant",
|
|
68
|
+
"__version__",
|
|
69
|
+
"drive_checkpoint_hooks",
|
|
70
|
+
"remote",
|
|
71
|
+
]
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""Authentication providers for the Colab backend."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from colabctl.auth.adc import ADCAuthProvider
|
|
6
|
+
from colabctl.auth.base import (
|
|
7
|
+
ADC_LOGIN_SCOPES,
|
|
8
|
+
COLAB_SCOPES,
|
|
9
|
+
AuthProvider,
|
|
10
|
+
StaticTokenProvider,
|
|
11
|
+
TokenCallable,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
__all__ = [
|
|
15
|
+
"ADC_LOGIN_SCOPES",
|
|
16
|
+
"COLAB_SCOPES",
|
|
17
|
+
"ADCAuthProvider",
|
|
18
|
+
"AuthProvider",
|
|
19
|
+
"StaticTokenProvider",
|
|
20
|
+
"TokenCallable",
|
|
21
|
+
]
|
colabctl/auth/adc.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
"""ADC auth provider — the Phase 0-verified working path.
|
|
2
|
+
|
|
3
|
+
Uses Application Default Credentials (``gcloud auth application-default login
|
|
4
|
+
--scopes=…colaboratory``). ``google.auth`` is sync, so refreshes run in a thread;
|
|
5
|
+
a lock serializes concurrent refreshes. ``google-auth`` is imported lazily.
|
|
6
|
+
|
|
7
|
+
Setup (once, by the user):
|
|
8
|
+
|
|
9
|
+
gcloud auth application-default login \\
|
|
10
|
+
--scopes=openid,https://www.googleapis.com/auth/cloud-platform,\\
|
|
11
|
+
https://www.googleapis.com/auth/userinfo.email,\\
|
|
12
|
+
https://www.googleapis.com/auth/colaboratory,\\
|
|
13
|
+
https://www.googleapis.com/auth/drive.file
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import asyncio
|
|
19
|
+
import warnings
|
|
20
|
+
from typing import Any
|
|
21
|
+
|
|
22
|
+
from colabctl.auth.base import COLAB_SCOPES, AuthProvider
|
|
23
|
+
from colabctl.errors import AuthError
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class ADCAuthProvider(AuthProvider):
    """Bearer tokens from Application Default Credentials with the Colab scopes.

    Credentials are discovered lazily on the first :meth:`token` call and cached
    on the instance. ``google.auth`` is synchronous, so discovery and refresh run
    in a worker thread; an :class:`asyncio.Lock` serializes concurrent callers so
    only one discovery/refresh is in flight at a time.
    """

    def __init__(self, *, scopes: tuple[str, ...] = COLAB_SCOPES) -> None:
        """Remember the requested scopes; no credential lookup happens here."""
        self._scopes = list(scopes)
        self._creds: Any | None = None
        self._lock = asyncio.Lock()

    async def token(self) -> str:
        """Return a currently-valid access token, refreshing if needed.

        Raises:
            AuthError: if google-auth is not installed, no usable Application
                Default Credentials can be loaded, the refresh fails, or the
                credentials carry no access token.
        """
        async with self._lock:
            return await asyncio.to_thread(self._sync_token)

    async def email(self) -> str | None:
        """Return the cached credentials' ``service_account_email``, if any.

        Returns ``None`` until :meth:`token` has loaded credentials.
        """
        async with self._lock:
            creds = self._creds
        # Best-effort: only credentials exposing ``service_account_email``
        # (service accounts) yield a value; user credentials usually don't.
        return getattr(creds, "service_account_email", None) if creds else None

    def _sync_token(self) -> str:
        """Blocking token fetch; runs in a worker thread via :meth:`token`."""
        try:
            import google.auth
            from google.auth.exceptions import DefaultCredentialsError
            from google.auth.transport.requests import Request
        except ImportError as exc:  # pragma: no cover - only without the extra
            raise AuthError(
                "google-auth is not installed. Install with `pip install 'colabctl[native]'`."
            ) from exc

        # Typed Any: google-auth credential subclasses differ structurally
        # (only scopable creds have with_scopes; refresh is untyped upstream).
        creds: Any = self._creds
        if creds is None:
            try:
                # ADC user creds emit a noisy "no quota project" UserWarning on
                # every call; it's irrelevant here (Colab calls pin their own
                # project), so suppress just that one message.
                with warnings.catch_warnings():
                    warnings.filterwarnings(
                        "ignore",
                        message=r"Your application has authenticated using end user credentials.*",
                        category=UserWarning,
                    )
                    creds, _ = google.auth.default(scopes=self._scopes)
            except DefaultCredentialsError as exc:
                # Surface "ADC not configured" through the package's own error
                # type, exactly like refresh failures below, so callers only
                # need to handle AuthError.
                raise AuthError(
                    f"Could not load Application Default Credentials: {exc}"
                ) from exc
            # User creds ignore scopes= in default(); re-apply when supported.
            if getattr(creds, "requires_scopes", False):
                creds = creds.with_scopes(self._scopes)
        if not creds.valid:
            try:
                creds.refresh(Request())
            except Exception as exc:
                raise AuthError(f"Failed to refresh ADC credentials: {exc}") from exc
        # Cache only after a successful load/refresh so a failed attempt is
        # retried from scratch on the next call.
        self._creds = creds
        token = getattr(creds, "token", None)
        if not token:
            raise AuthError("ADC credentials produced no access token.")
        return str(token)
|
colabctl/auth/base.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""Authentication contract.
|
|
2
|
+
|
|
3
|
+
An :class:`AuthProvider` yields a fresh OAuth bearer token (with the Colab scopes)
|
|
4
|
+
on demand. Transports depend only on this; concrete providers (ADC, OAuth2-loopback,
|
|
5
|
+
static) are swappable. The Colab scope set is the one verified in Phase 0 — note
|
|
6
|
+
that ``colaboratory`` is not third-party-grantable, so ADC (gcloud's client) is the
|
|
7
|
+
working path; ``cloud-platform`` is additionally required by gcloud itself.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import abc
|
|
13
|
+
from collections.abc import Awaitable, Callable
|
|
14
|
+
|
|
15
|
+
#: OAuth scopes the Colab backend requires (verified from CLI ``PUBLIC_SCOPES``).
|
|
16
|
+
COLAB_SCOPES: tuple[str, ...] = (
|
|
17
|
+
"openid",
|
|
18
|
+
"https://www.googleapis.com/auth/userinfo.profile",
|
|
19
|
+
"https://www.googleapis.com/auth/userinfo.email",
|
|
20
|
+
"https://www.googleapis.com/auth/colaboratory",
|
|
21
|
+
"https://www.googleapis.com/auth/drive.file",
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
#: ADC via ``gcloud`` additionally requires these (gcloud refuses otherwise).
|
|
25
|
+
ADC_LOGIN_SCOPES: tuple[str, ...] = (
|
|
26
|
+
"openid",
|
|
27
|
+
"https://www.googleapis.com/auth/cloud-platform",
|
|
28
|
+
"https://www.googleapis.com/auth/userinfo.email",
|
|
29
|
+
"https://www.googleapis.com/auth/colaboratory",
|
|
30
|
+
"https://www.googleapis.com/auth/drive.file",
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
TokenCallable = Callable[[], Awaitable[str]]
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class AuthProvider(abc.ABC):
    """Abstract source of bearer tokens for the Colab backend.

    Subclasses must implement :meth:`token`; :meth:`email` and
    :meth:`as_token_callable` have working defaults.
    """

    @abc.abstractmethod
    async def token(self) -> str:
        """Produce a bearer token that is valid right now, refreshing when necessary."""

    async def email(self) -> str | None:
        """Report the authenticated account's email when known.

        The base implementation knows nothing and always answers ``None``.
        """
        return None

    def as_token_callable(self) -> TokenCallable:
        """Expose this provider as the ``TokenProvider`` callable the native client expects.

        The returned object is simply the bound :meth:`token` coroutine function.
        """
        bound_token = self.token
        return bound_token
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class StaticTokenProvider(AuthProvider):
    """Fixed-token :class:`AuthProvider`, intended for tests and injection.

    The token (and optional email) given at construction are returned verbatim
    on every call; nothing is ever refreshed.
    """

    def __init__(self, token: str, *, email: str | None = None) -> None:
        """Store the token and, optionally, the account email to report."""
        self._email = email
        self._token = token

    async def token(self) -> str:
        """Return the token supplied at construction."""
        fixed = self._token
        return fixed

    async def email(self) -> str | None:
        """Return the email supplied at construction (``None`` if omitted)."""
        fixed = self._email
        return fixed
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
"""Provider abstraction: pluggable batch backends + capability-based routing.
|
|
2
|
+
|
|
3
|
+
- :class:`Backend` — the submit/status/logs/result/cancel contract.
|
|
4
|
+
- :class:`ColabBackend` — Colab via an interactive transport (sanctioned default).
|
|
5
|
+
- :class:`ModalBackend` — gVisor GPU sandboxes (best for agent code).
|
|
6
|
+
- :class:`VertexBackend` — sanctioned, headless, deadline-bound GPU jobs.
|
|
7
|
+
- :class:`BackendRouter` — selects a backend by capability and fails over on infra errors.
|
|
8
|
+
|
|
9
|
+
HF Jobs / Kaggle / IaaS are registered-but-deferred (Phase 4).
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
from colabctl.backends.base import (
|
|
15
|
+
Backend,
|
|
16
|
+
BackendCapabilities,
|
|
17
|
+
JobInfo,
|
|
18
|
+
JobResult,
|
|
19
|
+
JobSpec,
|
|
20
|
+
JobState,
|
|
21
|
+
)
|
|
22
|
+
from colabctl.backends.colab import ColabBackend
|
|
23
|
+
from colabctl.backends.hf_backend import HFJobsBackend
|
|
24
|
+
from colabctl.backends.kaggle_backend import KaggleBackend
|
|
25
|
+
from colabctl.backends.modal_backend import ModalBackend
|
|
26
|
+
from colabctl.backends.router import BackendRouter
|
|
27
|
+
from colabctl.backends.vertex_backend import VertexBackend
|
|
28
|
+
|
|
29
|
+
__all__ = [
|
|
30
|
+
"Backend",
|
|
31
|
+
"BackendCapabilities",
|
|
32
|
+
"BackendRouter",
|
|
33
|
+
"ColabBackend",
|
|
34
|
+
"HFJobsBackend",
|
|
35
|
+
"JobInfo",
|
|
36
|
+
"JobResult",
|
|
37
|
+
"JobSpec",
|
|
38
|
+
"JobState",
|
|
39
|
+
"KaggleBackend",
|
|
40
|
+
"ModalBackend",
|
|
41
|
+
"VertexBackend",
|
|
42
|
+
]
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
"""Provider abstraction: the batch-`Backend` contract.
|
|
2
|
+
|
|
3
|
+
Two complementary abstractions exist in colabctl:
|
|
4
|
+
|
|
5
|
+
- :class:`~colabctl.transport.base.TransportAdapter` — *interactive* runtimes
|
|
6
|
+
(allocate a warm GPU, run cells on a live kernel). Colab's native shape.
|
|
7
|
+
- :class:`Backend` (this module) — *batch jobs* (submit code → poll → fetch result →
|
|
8
|
+
cancel). The natural shape for Modal, Vertex, HF Jobs, etc.
|
|
9
|
+
|
|
10
|
+
The :class:`~colabctl.backends.router.BackendRouter` selects a backend by capability
|
|
11
|
+
and fails over between them, so a Colab outage/quota/ban degrades to Modal or Vertex
|
|
12
|
+
instead of failing. Colab is also exposed as a batch backend
|
|
13
|
+
(:class:`~colabctl.backends.colab.ColabBackend`) so callers can use one job API
|
|
14
|
+
across every provider.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import abc
|
|
20
|
+
import enum
|
|
21
|
+
|
|
22
|
+
from pydantic import BaseModel, model_validator
|
|
23
|
+
|
|
24
|
+
from colabctl.models import Accelerator
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class JobState(enum.StrEnum):
|
|
28
|
+
PENDING = "PENDING"
|
|
29
|
+
RUNNING = "RUNNING"
|
|
30
|
+
SUCCEEDED = "SUCCEEDED"
|
|
31
|
+
FAILED = "FAILED"
|
|
32
|
+
CANCELLED = "CANCELLED"
|
|
33
|
+
UNKNOWN = "UNKNOWN"
|
|
34
|
+
|
|
35
|
+
@property
|
|
36
|
+
def is_terminal(self) -> bool:
|
|
37
|
+
return self in {JobState.SUCCEEDED, JobState.FAILED, JobState.CANCELLED}
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class JobSpec(BaseModel):
    """What to run on a backend. Provide exactly one of ``code`` or ``script_path``.

    "Provided" means non-empty: an empty string counts as absent for both
    fields, in validation and in :meth:`resolved_code` alike.
    """

    # Inline source to execute.
    code: str | None = None
    # Local path of a script whose contents are executed instead of ``code``.
    script_path: str | None = None
    # Requirement strings; ColabBackend pip-installs them before running.
    requirements: list[str] = []
    accelerator: Accelerator = Accelerator.T4
    env: dict[str, str] = {}
    # Forwarded to the backend's execute call; presumably seconds — TODO confirm.
    timeout: int | None = None
    name: str | None = None

    @model_validator(mode="after")
    def _exactly_one_source(self) -> JobSpec:
        """Reject specs with neither or both of ``code`` / ``script_path`` set.

        Raises:
            ValueError: if both are empty/``None`` or both are non-empty.
        """
        if bool(self.code) == bool(self.script_path):
            raise ValueError("Provide exactly one of `code` or `script_path`.")
        return self

    def resolved_code(self) -> str:
        """Return the code to run (reads ``script_path`` if that's what was given).

        Raises:
            OSError: if ``script_path`` cannot be read.
        """
        # Truthiness, not ``is not None``: the validator accepts ``code=""``
        # together with a script_path, and in that case the script must win
        # rather than silently running an empty program.
        if self.code:
            return self.code
        from pathlib import Path

        # Python source is UTF-8 by default; don't depend on the locale encoding.
        return Path(self.script_path or "").read_text(encoding="utf-8")
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class JobInfo(BaseModel):
    """Snapshot of a submitted job: its id, owning backend and current state."""

    # Backend-assigned job identifier.
    id: str
    # Name of the backend that owns the job.
    backend: str
    state: JobState
    accelerator: Accelerator = Accelerator.NONE
    # Optional free-form extra information.
    detail: str | None = None
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
class JobResult(BaseModel):
    """Final outcome of a job: state, exit code, captured output and error text."""

    id: str
    backend: str
    state: JobState
    exit_code: int | None = None
    stdout: str = ""
    stderr: str = ""
    error: str | None = None

    @property
    def ok(self) -> bool:
        """Whether the job finished in the ``SUCCEEDED`` state."""
        succeeded = self.state is JobState.SUCCEEDED
        return succeeded
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
class BackendCapabilities(BaseModel):
    """Declared abilities of a backend, used for routing and honest disclosure."""

    name: str
    accelerators: list[str] = []  # supported accelerator values; empty = any/unknown
    interactive: bool = False
    streaming_logs: bool = False
    persistent: bool = False  # runtime survives between calls
    max_runtime_seconds: int | None = None
    requires_account: bool = True
    tos_posture: str = "sanctioned"  # "sanctioned" | "gray-area" | "prohibited"
    notes: list[str] = []

    def supports(self, accelerator: Accelerator) -> bool:
        """Tell whether this backend can serve ``accelerator``.

        An empty ``accelerators`` list means "any", and ``Accelerator.NONE`` is
        always accepted; otherwise the accelerator's value must be listed.
        """
        declared = self.accelerators
        if declared and accelerator is not Accelerator.NONE:
            return accelerator.value in declared
        return True
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
class Backend(abc.ABC):
    """Contract for a pluggable batch-execution backend (Colab, Modal, Vertex, ...).

    Concrete backends implement ``capabilities``, ``submit``, ``status``,
    ``logs``, ``result`` and ``cancel``; ``run`` and ``aclose`` have defaults.
    """

    name: str = "backend"

    @property
    @abc.abstractmethod
    def capabilities(self) -> BackendCapabilities: ...

    @abc.abstractmethod
    async def submit(self, spec: JobSpec) -> JobInfo:
        """Launch a job and hand back its handle without waiting for completion."""

    @abc.abstractmethod
    async def status(self, job_id: str) -> JobInfo:
        """Report the current state of the job."""

    @abc.abstractmethod
    async def logs(self, job_id: str) -> str:
        """Fetch whatever log output the job has produced so far (best-effort)."""

    @abc.abstractmethod
    async def result(self, job_id: str) -> JobResult:
        """Block until the job finishes, then return its result."""

    @abc.abstractmethod
    async def cancel(self, job_id: str) -> None:
        """Stop a job that is still running."""

    async def run(self, spec: JobSpec) -> JobResult:
        """Submit ``spec`` and wait for its result in one call."""
        handle = await self.submit(spec)
        job_id = handle.id
        return await self.result(job_id)

    async def aclose(self) -> None:
        """Free backend-level resources; a no-op unless a subclass overrides it."""
        return
|
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
"""Colab as a batch :class:`Backend` (wraps a TransportAdapter).
|
|
2
|
+
|
|
3
|
+
Presents Colab's interactive transport through the unified job API: ``submit``
|
|
4
|
+
launches an in-process asyncio task that allocates a runtime, optionally pip-installs
|
|
5
|
+
requirements, runs the code, captures output, and releases the runtime. State/logs/
|
|
6
|
+
result read the in-memory job record.
|
|
7
|
+
|
|
8
|
+
Limitation: job tracking is in-process (if the host process dies, the record is lost).
|
|
9
|
+
Cross-process durability is the runtime-lifecycle manager's concern (checkpoint to
|
|
10
|
+
Drive/GCS); for the interactive, warm-GPU workflow use the SDK's ``ColabSession`` directly.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import asyncio
|
|
16
|
+
import contextlib
|
|
17
|
+
import uuid
|
|
18
|
+
from dataclasses import dataclass, field
|
|
19
|
+
|
|
20
|
+
from colabctl.backends.base import (
|
|
21
|
+
Backend,
|
|
22
|
+
BackendCapabilities,
|
|
23
|
+
JobInfo,
|
|
24
|
+
JobResult,
|
|
25
|
+
JobSpec,
|
|
26
|
+
JobState,
|
|
27
|
+
)
|
|
28
|
+
from colabctl.errors import ColabctlError
|
|
29
|
+
from colabctl.models import RuntimeSpec
|
|
30
|
+
from colabctl.transport.base import TransportAdapter
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _install_code(requirements: list[str]) -> str:
    """Build a Python snippet that pip-installs ``requirements`` quietly.

    Each requirement is embedded via ``repr`` as its own argv element of a
    ``subprocess.run([...], check=True)`` call on ``sys.executable -m pip``.
    """
    quoted = [repr(req) for req in requirements]
    argv_tail = ", ".join(quoted)
    header = "import subprocess, sys\n"
    call = (
        "subprocess.run([sys.executable, '-m', 'pip', 'install', '-q', "
        + argv_tail
        + "], check=True)\n"
    )
    return header + call
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@dataclass
class _Job:
    """Mutable in-memory record of one job tracked by ``ColabBackend``."""

    # Public handle for the job; its ``state`` is updated as the job progresses.
    info: JobInfo
    # The spec the job was submitted with.
    spec: JobSpec
    # asyncio task driving the job, attached right after creation.
    task: asyncio.Task[None] | None = None
    # Name of the allocated runtime session, once allocation has succeeded.
    session: str | None = None
    # Captured output of the main code run.
    stdout: str = ""
    stderr: str = ""
    exit_code: int | None = None
    # Human-readable failure description, if any.
    error: str | None = None
    # Log chunks; joined to produce the job's logs.
    logbuf: list[str] = field(default_factory=list)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class ColabBackend(Backend):
    """Run batch jobs on Colab via an interactive transport.

    Each submitted job is driven by an in-process asyncio task that allocates a
    runtime, optionally pip-installs the requirements, runs the code and stops
    the runtime again. Job records live only in this object's memory.
    """

    name = "colab"

    def __init__(self, transport: TransportAdapter) -> None:
        """Wrap ``transport``; the backend closes it in :meth:`aclose`."""
        self._transport = transport
        self._jobs: dict[str, _Job] = {}

    @property
    def capabilities(self) -> BackendCapabilities:
        """Describe this backend, folding in the wrapped transport's traits."""
        caps = self._transport.capabilities
        return BackendCapabilities(
            name=self.name,
            accelerators=["T4", "L4", "G4", "A100", "H100"],
            interactive=caps.interactive,
            streaming_logs=False,
            persistent=True,
            requires_account=True,
            tos_posture="sanctioned" if self._transport.name == "cli" else "gray-area",
            notes=[f"via the {self._transport.name!r} transport", *caps.caveats],
        )

    async def submit(self, spec: JobSpec) -> JobInfo:
        """Register a job, start its background task and return its handle."""
        job_id = f"colab-{uuid.uuid4().hex[:10]}"
        info = JobInfo(
            id=job_id, backend=self.name, state=JobState.PENDING, accelerator=spec.accelerator
        )
        job = _Job(info=info, spec=spec)
        self._jobs[job_id] = job
        job.task = asyncio.create_task(self._execute(job))
        return info

    async def status(self, job_id: str) -> JobInfo:
        """Return the job's handle, whose ``state`` reflects current progress.

        Raises:
            ColabctlError: if ``job_id`` is unknown.
        """
        return self._require(job_id).info

    async def logs(self, job_id: str) -> str:
        """Return the concatenated log chunks recorded for the job so far.

        Raises:
            ColabctlError: if ``job_id`` is unknown.
        """
        return "".join(self._require(job_id).logbuf)

    async def result(self, job_id: str) -> JobResult:
        """Wait for the job's task to finish and return its recorded outcome.

        A cancelled job is reported through the returned state rather than by
        raising. Cancellation of the *calling* task is propagated.

        Raises:
            ColabctlError: if ``job_id`` is unknown.
        """
        job = self._require(job_id)
        if job.task is not None:
            try:
                await job.task
            except asyncio.CancelledError:
                # Swallow only the job's own cancellation. If this caller is
                # itself being cancelled (e.g. by a surrounding timeout), the
                # request must not be eaten or the caller would keep running.
                current = asyncio.current_task()
                if current is not None and current.cancelling():
                    raise
        return JobResult(
            id=job_id,
            backend=self.name,
            state=job.info.state,
            exit_code=job.exit_code,
            stdout=job.stdout,
            stderr=job.stderr,
            error=job.error,
        )

    async def cancel(self, job_id: str) -> None:
        """Cancel the job's task if it is still running; otherwise do nothing.

        Raises:
            ColabctlError: if ``job_id`` is unknown.
        """
        job = self._require(job_id)
        if job.task is not None and not job.task.done():
            job.task.cancel()
            job.info.state = JobState.CANCELLED

    async def aclose(self) -> None:
        """Cancel unfinished jobs, wait for their cleanup, then close the transport."""
        pending = [
            job.task
            for job in self._jobs.values()
            if job.task is not None and not job.task.done()
        ]
        for task in pending:
            task.cancel()
        try:
            if pending:
                # Let each cancelled task run its ``finally`` (which stops the
                # allocated runtime) while the transport is still open.
                await asyncio.gather(*pending, return_exceptions=True)
        finally:
            await self._transport.aclose()

    # -- internals ----------------------------------------------------------

    def _require(self, job_id: str) -> _Job:
        """Look up a job record, raising ``ColabctlError`` when it is unknown."""
        job = self._jobs.get(job_id)
        if job is None:
            raise ColabctlError(f"No such job: {job_id!r}")
        return job

    async def _execute(self, job: _Job) -> None:
        """Drive one job: allocate, install requirements, run, record, stop.

        Outcomes are written onto ``job``. Only cancellation is re-raised;
        every other failure is recorded as ``FAILED``.
        """
        job.info.state = JobState.RUNNING
        try:
            session = await self._transport.allocate(
                RuntimeSpec(accelerator=job.spec.accelerator, name=job.spec.name)
            )
            job.session = session.name
            if job.spec.requirements:
                install = await self._transport.execute(
                    session.name, _install_code(job.spec.requirements), timeout=job.spec.timeout
                )
                if not install.ok:
                    job.error = "pip install failed: " + (install.stderr or install.text)[:400]
                    job.info.state = JobState.FAILED
                    return
            result = await self._transport.execute(
                session.name, job.spec.resolved_code(), timeout=job.spec.timeout
            )
            job.stdout = result.text
            job.stderr = result.stderr
            job.logbuf.append(result.text)
            if result.ok:
                job.info.state = JobState.SUCCEEDED
                job.exit_code = 0
            else:
                job.info.state = JobState.FAILED
                job.exit_code = 1
                if result.error is not None:
                    job.error = f"{result.error.ename}: {result.error.evalue}"
        except asyncio.CancelledError:
            job.info.state = JobState.CANCELLED
            raise
        except ColabctlError as exc:
            job.info.state = JobState.FAILED
            job.error = str(exc)
        except Exception as exc:
            # Anything else (e.g. an unreadable ``script_path``) would otherwise
            # leave the job stuck in RUNNING and make result() raise.
            job.info.state = JobState.FAILED
            job.error = f"{type(exc).__name__}: {exc}"
        finally:
            # Always release the runtime once one was allocated (best-effort).
            if job.session is not None:
                with contextlib.suppress(ColabctlError):
                    await self._transport.stop(job.session)
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
"""Backend construction by name — the seam the CLI and MCP server build on.
|
|
2
|
+
|
|
3
|
+
Keeps backend wiring in one place so ``colabctl job run --backend modal`` and the
|
|
4
|
+
MCP ``run_job`` tool construct backends identically. The Colab backend is built over
|
|
5
|
+
the chosen transport (cli/native); Modal/Vertex read their own env/config.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from colabctl.backends.base import Backend
|
|
11
|
+
from colabctl.backends.modal_backend import ModalBackend
|
|
12
|
+
from colabctl.backends.router import BackendRouter
|
|
13
|
+
from colabctl.backends.vertex_backend import VertexBackend
|
|
14
|
+
from colabctl.errors import ConfigurationError
|
|
15
|
+
|
|
16
|
+
#: Backends available for selection.
|
|
17
|
+
BACKEND_NAMES: tuple[str, ...] = ("colab", "modal", "vertex", "hf", "kaggle")
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def build_backend(
|
|
21
|
+
name: str,
|
|
22
|
+
*,
|
|
23
|
+
transport_name: str = "cli",
|
|
24
|
+
auth_mode: str = "adc",
|
|
25
|
+
colab_bin: str = "colab",
|
|
26
|
+
) -> Backend:
|
|
27
|
+
"""Construct a backend by name. Colab uses the chosen transport."""
|
|
28
|
+
key = name.lower()
|
|
29
|
+
if key == "colab":
|
|
30
|
+
from colabctl.backends.colab import ColabBackend
|
|
31
|
+
from colabctl.sdk.client import ColabClient
|
|
32
|
+
|
|
33
|
+
client = ColabClient(
|
|
34
|
+
transport_name=transport_name, auth_mode=auth_mode, colab_bin=colab_bin
|
|
35
|
+
)
|
|
36
|
+
return ColabBackend(client.transport)
|
|
37
|
+
if key == "modal":
|
|
38
|
+
return ModalBackend()
|
|
39
|
+
if key == "vertex":
|
|
40
|
+
return VertexBackend()
|
|
41
|
+
if key == "hf":
|
|
42
|
+
from colabctl.backends.hf_backend import HFJobsBackend
|
|
43
|
+
|
|
44
|
+
return HFJobsBackend()
|
|
45
|
+
if key == "kaggle":
|
|
46
|
+
from colabctl.backends.kaggle_backend import KaggleBackend
|
|
47
|
+
|
|
48
|
+
return KaggleBackend()
|
|
49
|
+
raise ConfigurationError(f"Unknown backend {name!r}. Choose from: {', '.join(BACKEND_NAMES)}.")
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def build_router(
    names: list[str] | None = None,
    *,
    transport_name: str = "cli",
    auth_mode: str = "adc",
    colab_bin: str = "colab",
) -> BackendRouter:
    """Create a ``BackendRouter`` over the requested backends.

    ``names`` gives the backends and their order; ``None`` or an empty list
    selects every name in ``BACKEND_NAMES``. The keyword options are forwarded
    to ``build_backend`` for each entry.
    """
    if names:
        selected = names
    else:
        selected = list(BACKEND_NAMES)
    backends = []
    for backend_name in selected:
        backends.append(
            build_backend(
                backend_name,
                transport_name=transport_name,
                auth_mode=auth_mode,
                colab_bin=colab_bin,
            )
        )
    return BackendRouter(backends, order=selected)
|