paralleliq-skypilot-plugin 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- paralleliq_skypilot_plugin-0.1.0/.gitignore +6 -0
- paralleliq_skypilot_plugin-0.1.0/PKG-INFO +63 -0
- paralleliq_skypilot_plugin-0.1.0/README.md +46 -0
- paralleliq_skypilot_plugin-0.1.0/pyproject.toml +25 -0
- paralleliq_skypilot_plugin-0.1.0/skypilot-plugin-devlog.md +277 -0
- paralleliq_skypilot_plugin-0.1.0/src/paralleliq_skypilot_plugin/__init__.py +4 -0
- paralleliq_skypilot_plugin-0.1.0/src/paralleliq_skypilot_plugin/policy.py +41 -0
- paralleliq_skypilot_plugin-0.1.0/test-task-cpu.yaml +5 -0
- paralleliq_skypilot_plugin-0.1.0/test-task.yaml +5 -0
- paralleliq_skypilot_plugin-0.1.0/test_policy.py +41 -0
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: paralleliq-skypilot-plugin
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: SkyPilot AdminPolicy that surfaces piqc GPU waste scanning when a GPU workload is launched on Kubernetes
|
|
5
|
+
Author-email: Paralleliq <engineering@paralleliq.ai>
|
|
6
|
+
License: Apache-2.0
|
|
7
|
+
Keywords: gpu,inference,kubernetes,paralleliq,piqc,skypilot
|
|
8
|
+
Classifier: Development Status :: 3 - Alpha
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Requires-Python: >=3.11
|
|
15
|
+
Requires-Dist: skypilot>=0.7.0
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
|
|
18
|
+
# paralleliq-skypilot-plugin
|
|
19
|
+
|
|
20
|
+
A SkyPilot [AdminPolicy](https://docs.skypilot.co/en/latest/cloud-setup/policy.html) that surfaces [piqc](https://github.com/paralleliq/piqc) — Paralleliq's GPU waste scanner — when a GPU workload is launched on a Kubernetes cluster.
|
|
21
|
+
|
|
22
|
+
## What it does
|
|
23
|
+
|
|
24
|
+
When you run `sky launch` with GPU accelerators targeting a Kubernetes cluster, the policy prints instructions to run a free piqc scan on your cluster before the job starts.
|
|
25
|
+
|
|
26
|
+
## Install
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
pip install paralleliq-skypilot-plugin
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## Configure
|
|
33
|
+
|
|
34
|
+
Add one line to `~/.sky/config.yaml`:
|
|
35
|
+
|
|
36
|
+
```yaml
|
|
37
|
+
admin_policy: paralleliq_skypilot_plugin.ParalleliqPolicy
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Usage
|
|
41
|
+
|
|
42
|
+
Run any GPU workload as normal:
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
sky launch -c my-cluster task.yaml
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
If the task requests GPU accelerators and explicitly sets Kubernetes as its cloud (a GPU task with no cloud specified does not trigger the message), you will see:
|
|
49
|
+
|
|
50
|
+
```
|
|
51
|
+
Paralleliq — GPU workload detected on Kubernetes.
|
|
52
|
+
Run a free GPU waste scan on this cluster before launching:
|
|
53
|
+
|
|
54
|
+
kubectl apply -f https://raw.githubusercontent.com/paralleliq/piqc/main/deploy/rbac.yaml
|
|
55
|
+
kubectl apply -f https://raw.githubusercontent.com/paralleliq/piqc/main/deploy/scan-job.yaml
|
|
56
|
+
kubectl logs -n kube-system job/piqc-scan
|
|
57
|
+
|
|
58
|
+
github.com/paralleliq/piqc · paralleliq.ai
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
## License
|
|
62
|
+
|
|
63
|
+
Apache-2.0
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
# paralleliq-skypilot-plugin
|
|
2
|
+
|
|
3
|
+
A SkyPilot [AdminPolicy](https://docs.skypilot.co/en/latest/cloud-setup/policy.html) that surfaces [piqc](https://github.com/paralleliq/piqc) — Paralleliq's GPU waste scanner — when a GPU workload is launched on a Kubernetes cluster.
|
|
4
|
+
|
|
5
|
+
## What it does
|
|
6
|
+
|
|
7
|
+
When you run `sky launch` with GPU accelerators targeting a Kubernetes cluster, the policy prints instructions to run a free piqc scan on your cluster before the job starts.
|
|
8
|
+
|
|
9
|
+
## Install
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
pip install paralleliq-skypilot-plugin
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
## Configure
|
|
16
|
+
|
|
17
|
+
Add one line to `~/.sky/config.yaml`:
|
|
18
|
+
|
|
19
|
+
```yaml
|
|
20
|
+
admin_policy: paralleliq_skypilot_plugin.ParalleliqPolicy
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## Usage
|
|
24
|
+
|
|
25
|
+
Run any GPU workload as normal:
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
sky launch -c my-cluster task.yaml
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
If the task requests GPU accelerators and explicitly sets Kubernetes as its cloud (a GPU task with no cloud specified does not trigger the message), you will see:
|
|
32
|
+
|
|
33
|
+
```
|
|
34
|
+
Paralleliq — GPU workload detected on Kubernetes.
|
|
35
|
+
Run a free GPU waste scan on this cluster before launching:
|
|
36
|
+
|
|
37
|
+
kubectl apply -f https://raw.githubusercontent.com/paralleliq/piqc/main/deploy/rbac.yaml
|
|
38
|
+
kubectl apply -f https://raw.githubusercontent.com/paralleliq/piqc/main/deploy/scan-job.yaml
|
|
39
|
+
kubectl logs -n kube-system job/piqc-scan
|
|
40
|
+
|
|
41
|
+
github.com/paralleliq/piqc · paralleliq.ai
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## License
|
|
45
|
+
|
|
46
|
+
Apache-2.0
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "paralleliq-skypilot-plugin"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "SkyPilot AdminPolicy that surfaces piqc GPU waste scanning when a GPU workload is launched on Kubernetes"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = { text = "Apache-2.0" }
|
|
11
|
+
authors = [{ name = "Paralleliq", email = "engineering@paralleliq.ai" }]
|
|
12
|
+
keywords = ["skypilot", "gpu", "kubernetes", "inference", "piqc", "paralleliq"]
|
|
13
|
+
classifiers = [
|
|
14
|
+
"Development Status :: 3 - Alpha",
|
|
15
|
+
"Intended Audience :: Developers",
|
|
16
|
+
"License :: OSI Approved :: Apache Software License",
|
|
17
|
+
"Programming Language :: Python :: 3",
|
|
18
|
+
"Programming Language :: Python :: 3.11",
|
|
19
|
+
"Programming Language :: Python :: 3.12",
|
|
20
|
+
]
|
|
21
|
+
requires-python = ">=3.11"
|
|
22
|
+
dependencies = ["skypilot>=0.7.0"]
|
|
23
|
+
|
|
24
|
+
[tool.hatch.build.targets.wheel]
|
|
25
|
+
packages = ["src/paralleliq_skypilot_plugin"]
|
|
@@ -0,0 +1,277 @@
|
|
|
1
|
+
# SkyPilot AdminPolicy Integration — Developer Experience Log
|
|
2
|
+
|
|
3
|
+
This document captures the experience of a third-party developer building an AdminPolicy plugin for SkyPilot.
|
|
4
|
+
Written from the perspective of Paralleliq integrating with SkyPilot's policy system.
|
|
5
|
+
Shared with Zongheng Yang and the SkyPilot team as honest feedback.
|
|
6
|
+
|
|
7
|
+
*Note: We previously built a plugin for dstack — that experience is the baseline for several comparisons below.*
|
|
8
|
+
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
## What We Are Building
|
|
12
|
+
|
|
13
|
+
A `paralleliq-skypilot-plugin` Python package that hooks into SkyPilot's `AdminPolicy` system to surface our open source GPU scanner ([piqc](https://github.com/paralleliq/piqc)) when a user launches a GPU workload on a Kubernetes cluster. Goal: when a user runs `sky launch`, they immediately see instructions to run a piqc GPU waste scan on their cluster before the job starts.
|
|
14
|
+
|
|
15
|
+
---
|
|
16
|
+
|
|
17
|
+
## Step 1 — Installing SkyPilot
|
|
18
|
+
|
|
19
|
+
**What we did:**
|
|
20
|
+
Installed with `pip install "skypilot[kubernetes]"`. Used `[kubernetes]` rather than `[all]` since our test clusters are on GKE and we didn't need every cloud SDK.
|
|
21
|
+
|
|
22
|
+
**One dependency conflict to note:**
|
|
23
|
+
SkyPilot and dstack require different versions of pydantic. SkyPilot upgraded pydantic to 2.x, which broke dstack (which requires pydantic <2.0). In a production environment where both tools are used, separate virtual environments would be the clean fix.
|
|
24
|
+
|
|
25
|
+
**No issues with the install itself** — straightforward.
|
|
26
|
+
|
|
27
|
+
---
|
|
28
|
+
|
|
29
|
+
## Step 2 — Finding the AdminPolicy System
|
|
30
|
+
|
|
31
|
+
**What we did:**
|
|
32
|
+
Searched the SkyPilot docs for "admin policy" and found a dedicated page immediately. This was a significantly better experience than dstack, where we had to navigate the source code directly to find the plugin API.
|
|
33
|
+
|
|
34
|
+
**What the docs tell you:**
|
|
35
|
+
- Subclass `sky.AdminPolicy`
|
|
36
|
+
- Implement `validate_and_mutate(cls, user_request: UserRequest) -> MutatedUserRequest`
|
|
37
|
+
- Register via one line in `~/.sky/config.yaml`
|
|
38
|
+
|
|
39
|
+
**How we verified the interface:**
|
|
40
|
+
Read the installed source at `sky/admin_policy.py`. Found the full class hierarchy:
|
|
41
|
+
- `AdminPolicy` — the class to subclass, `validate_and_mutate` is a `@classmethod`
|
|
42
|
+
- `UserRequest` — wraps `sky.Task`, `sky.Config`, `request_name`, `request_options`
|
|
43
|
+
- `MutatedUserRequest` — wraps `sky.Task`, `sky.Config` (return this unchanged to pass through)
|
|
44
|
+
|
|
45
|
+
**Bonus finding — `RestfulAdminPolicy`:**
|
|
46
|
+
SkyPilot ships a built-in `RestfulAdminPolicy` class that calls a REST endpoint for validation. This is a distribution model we hadn't considered: instead of installing a pip package, a cluster admin could point to a Paralleliq API URL. Worth exploring in a future version.
|
|
47
|
+
|
|
48
|
+
**Feedback for SkyPilot:**
|
|
49
|
+
> The AdminPolicy docs are well-structured and the interface is immediately clear. Compared to building the equivalent for dstack, ramp time was significantly shorter. One addition that would help: a minimal working example in the docs (10–15 lines) that a developer can copy, install, and run as a starting point.
|
|
50
|
+
|
|
51
|
+
---
|
|
52
|
+
|
|
53
|
+
## Step 3 — Understanding the Task and Resources Model
|
|
54
|
+
|
|
55
|
+
**What we needed to know:**
|
|
56
|
+
How to detect a GPU workload — specifically, how to check if the user is requesting GPU accelerators on a Kubernetes backend.
|
|
57
|
+
|
|
58
|
+
**What we found in `sky.Task`:**
|
|
59
|
+
`task.resources` is a **list** of `sky.Resources` objects, not a single object. SkyPilot supports ordered resource preferences (try A100 first, fall back to V100, etc.), so the list can have multiple entries.
|
|
60
|
+
|
|
61
|
+
**What we found in `sky.Resources`:**
|
|
62
|
+
- `.accelerators` — returns `Optional[Dict[str, Union[int, float]]]`. Always a dict when set, e.g., `{'A100': 4}`. Returns `None` if no GPU is requested.
|
|
63
|
+
- `.cloud` — the cloud target. When Kubernetes is specified, `str(r.cloud).lower()` returns `'kubernetes'`.
|
|
64
|
+
|
|
65
|
+
**The check we implemented:**
|
|
66
|
+
```python
|
|
67
|
+
has_gpu = any(r.accelerators for r in task.resources)
|
|
68
|
+
is_k8s = any(str(r.cloud).lower() == "kubernetes" for r in task.resources if r.cloud)
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
**One gotcha:**
|
|
72
|
+
`str(r.cloud)` returns `'kubernetes'` (lowercase), not `'Kubernetes'` as suggested by the class's `_REPR = 'Kubernetes'`. A case-insensitive comparison is required.
|
|
73
|
+
|
|
74
|
+
---
|
|
75
|
+
|
|
76
|
+
## Step 4 — Building the Plugin Package
|
|
77
|
+
|
|
78
|
+
**Package structure** (mirrored from `paralleliq-dstack-plugin`):
|
|
79
|
+
```
|
|
80
|
+
paralleliq-skypilot-plugin/
|
|
81
|
+
pyproject.toml
|
|
82
|
+
README.md
|
|
83
|
+
test_policy.py
|
|
84
|
+
src/paralleliq_skypilot_plugin/
|
|
85
|
+
__init__.py
|
|
86
|
+
policy.py
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
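**The policy implementation at this stage** (the `_has_gpu` / `_is_kubernetes` helpers are omitted here, and the `at_client_side` guard was added later, in Step 6c):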
|
|
90
|
+
```python
|
|
91
|
+
from sky.admin_policy import AdminPolicy, MutatedUserRequest, UserRequest
|
|
92
|
+
|
|
93
|
+
_PIQC_RBAC_URL = "https://raw.githubusercontent.com/paralleliq/piqc/main/deploy/rbac.yaml"
|
|
94
|
+
_PIQC_JOB_URL = "https://raw.githubusercontent.com/paralleliq/piqc/main/deploy/scan-job.yaml"
|
|
95
|
+
|
|
96
|
+
_PIQC_MESSAGE = (
|
|
97
|
+
"\n"
|
|
98
|
+
" Paralleliq — GPU workload detected on Kubernetes.\n"
|
|
99
|
+
" Run a free GPU waste scan on this cluster before launching:\n"
|
|
100
|
+
"\n"
|
|
101
|
+
f" kubectl apply -f {_PIQC_RBAC_URL}\n"
|
|
102
|
+
f" kubectl apply -f {_PIQC_JOB_URL}\n"
|
|
103
|
+
" kubectl logs -n kube-system job/piqc-scan\n"
|
|
104
|
+
"\n"
|
|
105
|
+
" github.com/paralleliq/piqc · paralleliq.ai\n"
|
|
106
|
+
)
|
|
107
|
+
|
|
108
|
+
class ParalleliqPolicy(AdminPolicy):
|
|
109
|
+
@classmethod
|
|
110
|
+
def validate_and_mutate(cls, user_request: UserRequest) -> MutatedUserRequest:
|
|
111
|
+
resources = user_request.task.resources or []
|
|
112
|
+
if _has_gpu(resources) and _is_kubernetes(resources):
|
|
113
|
+
print(_PIQC_MESSAGE)
|
|
114
|
+
return MutatedUserRequest(
|
|
115
|
+
task=user_request.task,
|
|
116
|
+
skypilot_config=user_request.skypilot_config,
|
|
117
|
+
)
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
**Registration in `~/.sky/config.yaml`:**
|
|
121
|
+
```yaml
|
|
122
|
+
admin_policy: paralleliq_skypilot_plugin.ParalleliqPolicy
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
**Key advantage over dstack:**
|
|
126
|
+
The dstack plugin message appears only in the dstack server log — not in the user's terminal. The SkyPilot AdminPolicy also fires **client-side**, so `print()` goes directly to the user's terminal at `sky launch` time (it fires server-side as well; Step 6c describes how we avoid a duplicate message). This is a materially better developer and user experience.
|
|
127
|
+
|
|
128
|
+
---
|
|
129
|
+
|
|
130
|
+
## Step 5 — Local Testing
|
|
131
|
+
|
|
132
|
+
**What we built:**
|
|
133
|
+
A `test_policy.py` with 3 test cases that exercise the policy directly without a cluster.
|
|
134
|
+
|
|
135
|
+
**Test results:**
|
|
136
|
+
```
|
|
137
|
+
=== Test 1: GPU + Kubernetes -> should print message ===
|
|
138
|
+
|
|
139
|
+
Paralleliq — GPU workload detected on Kubernetes.
|
|
140
|
+
Run a free GPU waste scan on this cluster before launching:
|
|
141
|
+
|
|
142
|
+
kubectl apply -f https://raw.githubusercontent.com/paralleliq/piqc/main/deploy/rbac.yaml
|
|
143
|
+
kubectl apply -f https://raw.githubusercontent.com/paralleliq/piqc/main/deploy/scan-job.yaml
|
|
144
|
+
kubectl logs -n kube-system job/piqc-scan
|
|
145
|
+
|
|
146
|
+
github.com/paralleliq/piqc · paralleliq.ai
|
|
147
|
+
|
|
148
|
+
PASS
|
|
149
|
+
|
|
150
|
+
=== Test 2: GPU + no cloud -> should NOT print message ===
|
|
151
|
+
PASS (no message above = correct)
|
|
152
|
+
|
|
153
|
+
=== Test 3: No GPU + Kubernetes -> should NOT print message ===
|
|
154
|
+
PASS (no message above = correct)
|
|
155
|
+
|
|
156
|
+
All tests passed.
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
**One bug caught during testing:**
|
|
160
|
+
Initial check used `str(r.cloud) == "Kubernetes"` (capital K). The actual value is `'kubernetes'` (lowercase). Fixed to `.lower() == "kubernetes"`. This would have been a silent failure in production — the plugin would install and load but never fire.
|
|
161
|
+
|
|
162
|
+
---
|
|
163
|
+
|
|
164
|
+
## Step 6 — GKE Cluster Testing
|
|
165
|
+
|
|
166
|
+
**Test environment:**
|
|
167
|
+
- `piqc-cluster-1` — GKE, us-central1-a, e2-standard-2, 2 nodes
|
|
168
|
+
- `piqc-cluster-2` — GKE, us-west1-a, e2-standard-2, 2 nodes
|
|
169
|
+
- CPU-only nodes (no GPUs) — testing plugin fire behavior, not GPU optimization
|
|
170
|
+
|
|
171
|
+
### 6a — Prerequisite: verify piqc still works on the cluster
|
|
172
|
+
|
|
173
|
+
Before layering in SkyPilot, confirmed piqc scans cleanly on piqc-cluster-1:
|
|
174
|
+
|
|
175
|
+
```bash
|
|
176
|
+
kubectl apply -f https://raw.githubusercontent.com/paralleliq/piqc/main/deploy/rbac.yaml
|
|
177
|
+
kubectl apply -f https://raw.githubusercontent.com/paralleliq/piqc/main/deploy/scan-job.yaml
|
|
178
|
+
kubectl logs -n kube-system job/piqc-scan
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
Result: scan completed successfully. Cluster healthy, piqc image pulls correctly.
|
|
182
|
+
|
|
183
|
+
### 6b — Connect SkyPilot to the cluster
|
|
184
|
+
|
|
185
|
+
```bash
|
|
186
|
+
sky check kubernetes
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
**First attempt failed** — SkyPilot's default Kubernetes networking mode (`portforward`) requires `socat` and GNU `netcat`, neither of which ship with macOS.
|
|
190
|
+
|
|
191
|
+
Fix:
|
|
192
|
+
```bash
|
|
193
|
+
brew install socat netcat
|
|
194
|
+
sky check kubernetes
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
Second attempt succeeded:
|
|
198
|
+
```
|
|
199
|
+
Kubernetes: enabled [compute]
|
|
200
|
+
Allowed contexts:
|
|
201
|
+
└── gke_piqc-483417_us-central1-a_piqc-cluster-1: enabled.
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
Expected warning about no GPU/TPU resources — clusters are CPU-only for this test.
|
|
205
|
+
|
|
206
|
+
### 6c — GPU task: policy fires, scheduling fails (expected)
|
|
207
|
+
|
|
208
|
+
```bash
|
|
209
|
+
sky launch test-task.yaml # requests A100:1 on kubernetes
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
**Result:**
|
|
213
|
+
```
|
|
214
|
+
Applying client admin policy: ParalleliqPolicy
|
|
215
|
+
|
|
216
|
+
Paralleliq — GPU workload detected on Kubernetes.
|
|
217
|
+
Run a free GPU waste scan on this cluster before launching:
|
|
218
|
+
|
|
219
|
+
kubectl apply -f .../rbac.yaml
|
|
220
|
+
kubectl apply -f .../scan-job.yaml
|
|
221
|
+
kubectl logs -n kube-system job/piqc-scan
|
|
222
|
+
|
|
223
|
+
github.com/paralleliq/piqc · paralleliq.ai
|
|
224
|
+
|
|
225
|
+
Applying server admin policy: ParalleliqPolicy
|
|
226
|
+
ResourcesUnavailableError: Kubernetes cluster does not contain any instances satisfying the request: 1x Kubernetes({'A100': 1}).
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
The message printed exactly once (client-side) and was visible to the user. SkyPilot then correctly rejected the request — no A100s on a CPU-only cluster. This is the real-world scenario: user is about to launch a GPU job, our message tells them to scan first before consuming GPU capacity.
|
|
230
|
+
|
|
231
|
+
**One bug found and fixed during this step:**
|
|
232
|
+
The policy was initially printing twice — once client-side, once server-side. SkyPilot runs a local API server even for local use, and fires the AdminPolicy on both sides. Fixed by checking `user_request.at_client_side`:
|
|
233
|
+
|
|
234
|
+
```python
|
|
235
|
+
if user_request.at_client_side and _has_gpu(resources) and _is_kubernetes(resources):
|
|
236
|
+
print(_PIQC_MESSAGE)
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
Confirmed via debug logging: client call has `at_client_side=True`, server call has `at_client_side=False`.
|
|
240
|
+
|
|
241
|
+
### 6d — CPU task: policy silent, task runs successfully
|
|
242
|
+
|
|
243
|
+
```bash
|
|
244
|
+
sky launch test-task-cpu.yaml # 1 CPU, kubernetes, no GPU
|
|
245
|
+
```
|
|
246
|
+
|
|
247
|
+
**Result:**
|
|
248
|
+
```
|
|
249
|
+
Applying client admin policy: ParalleliqPolicy
|
|
250
|
+
Applying server admin policy: ParalleliqPolicy
|
|
251
|
+
Considered resources (1 node):
|
|
252
|
+
Kubernetes (gke_piqc-48...c-cluster-1) 1 vCPU 1GB - $0.00 ✔
|
|
253
|
+
|
|
254
|
+
(task, pid=965) piqc cluster connectivity test - OK
|
|
255
|
+
✓ Job finished (status: SUCCEEDED).
|
|
256
|
+
```
|
|
257
|
+
|
|
258
|
+
Policy stayed completely silent — no GPU, no message. Task ran to completion on piqc-cluster-1. End-to-end cluster connectivity confirmed.
|
|
259
|
+
|
|
260
|
+
**Feedback for SkyPilot:**
|
|
261
|
+
> The `at_client_side` field on `UserRequest` is well-designed and exactly what we needed to deduplicate messages across client and server. One thing worth documenting: the AdminPolicy fires twice by default (client + server), and `at_client_side` is the mechanism to control that. This isn't obvious until you hit the double-print behavior in testing.
|
|
262
|
+
|
|
263
|
+
---
|
|
264
|
+
|
|
265
|
+
## Step 7 — PyPI Publish
|
|
266
|
+
|
|
267
|
+
*[Pending — after GKE test]*
|
|
268
|
+
|
|
269
|
+
---
|
|
270
|
+
|
|
271
|
+
## Summary for Zongheng
|
|
272
|
+
|
|
273
|
+
*[To be written after Steps 6 and 7 are complete]*
|
|
274
|
+
|
|
275
|
+
---
|
|
276
|
+
|
|
277
|
+
*Last updated: 2026-06-14*
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
from sky.admin_policy import AdminPolicy, MutatedUserRequest, UserRequest
|
|
2
|
+
|
|
3
|
+
_PIQC_RBAC_URL = "https://raw.githubusercontent.com/paralleliq/piqc/main/deploy/rbac.yaml"
|
|
4
|
+
_PIQC_JOB_URL = "https://raw.githubusercontent.com/paralleliq/piqc/main/deploy/scan-job.yaml"
|
|
5
|
+
|
|
6
|
+
_PIQC_MESSAGE = (
|
|
7
|
+
"\n"
|
|
8
|
+
" Paralleliq — GPU workload detected on Kubernetes.\n"
|
|
9
|
+
" Run a free GPU waste scan on this cluster before launching:\n"
|
|
10
|
+
"\n"
|
|
11
|
+
f" kubectl apply -f {_PIQC_RBAC_URL}\n"
|
|
12
|
+
f" kubectl apply -f {_PIQC_JOB_URL}\n"
|
|
13
|
+
" kubectl logs -n kube-system job/piqc-scan\n"
|
|
14
|
+
"\n"
|
|
15
|
+
" github.com/paralleliq/piqc · paralleliq.ai\n"
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _has_gpu(resources_list) -> bool:
|
|
20
|
+
"""Return True if any resource in the list requests GPU accelerators."""
|
|
21
|
+
return any(r.accelerators for r in resources_list)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _is_kubernetes(resources_list) -> bool:
|
|
25
|
+
"""Return True if any resource in the list explicitly targets Kubernetes."""
|
|
26
|
+
return any(
|
|
27
|
+
r.cloud is not None and str(r.cloud).lower() == "kubernetes"
|
|
28
|
+
for r in resources_list
|
|
29
|
+
)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class ParalleliqPolicy(AdminPolicy):
    """Admin policy that prints the piqc scan hint for GPU-on-Kubernetes requests.

    The policy is purely informational: the task and config of the incoming
    request are handed back unchanged.
    """

    @classmethod
    def validate_and_mutate(cls, user_request: UserRequest) -> MutatedUserRequest:
        """Print ``_PIQC_MESSAGE`` when appropriate and pass the request through.

        The message is printed when the request is handled on the client side
        and at least one resource entry both requests accelerators and targets
        Kubernetes.

        Args:
            user_request: The request SkyPilot passes to the policy.

        Returns:
            A ``MutatedUserRequest`` carrying the same task and config.
        """
        # Tolerate a request without a task instead of raising AttributeError.
        # NOTE(review): assumes ``task`` may be None for some request kinds —
        # confirm against sky/admin_policy.py.
        resources = getattr(user_request.task, "resources", None) or []

        # NOTE(review): the package allows skypilot>=0.7.0; request classes
        # from older releases may not define ``at_client_side`` — confirm.
        # Defaulting to True keeps the hint visible there rather than failing
        # the whole launch with AttributeError.
        at_client_side = getattr(user_request, "at_client_side", True)

        # Both conditions must hold for the SAME resource entry. Checking them
        # independently over the list would report "GPU on Kubernetes" for a
        # CPU-only Kubernetes candidate paired with a GPU candidate elsewhere.
        if at_client_side and any(
            _has_gpu([entry]) and _is_kubernetes([entry]) for entry in resources
        ):
            print(_PIQC_MESSAGE)

        return MutatedUserRequest(
            task=user_request.task,
            skypilot_config=user_request.skypilot_config,
        )
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""Local test for ParalleliqPolicy — no cluster required."""
|
|
2
|
+
import sys
|
|
3
|
+
sys.path.insert(0, "src")
|
|
4
|
+
|
|
5
|
+
import sky
|
|
6
|
+
from sky.admin_policy import UserRequest, MutatedUserRequest
|
|
7
|
+
from sky.server.requests.request_names import AdminPolicyRequestName
|
|
8
|
+
from paralleliq_skypilot_plugin import ParalleliqPolicy
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def make_request(accelerators=None, cloud=None, at_client_side=True):
    """Build a CLUSTER_LAUNCH ``UserRequest`` around a single-resource task.

    Args:
        accelerators: Forwarded to ``sky.Resources`` (e.g. ``"A100:1"``).
        cloud: Forwarded to ``sky.Resources`` (e.g. ``"kubernetes"``).
        at_client_side: Value for ``UserRequest.at_client_side``. The policy
            prints only when this is true, so it is set explicitly here.

    Returns:
        The ``UserRequest`` to feed to ``ParalleliqPolicy.validate_and_mutate``.
    """
    resources = sky.Resources(accelerators=accelerators, cloud=cloud)
    task = sky.Task()
    task.set_resources(resources)
    config = sky.Config()
    # NOTE(review): UserRequest presumably defaults at_client_side to False,
    # which would make the "should print" case silently print nothing —
    # confirm against sky/admin_policy.py.
    return UserRequest(
        task=task,
        skypilot_config=config,
        request_name=AdminPolicyRequestName.CLUSTER_LAUNCH,
        at_client_side=at_client_side,
    )


def _run_policy(req):
    """Apply the policy to *req* and return ``(result, captured_stdout)``."""
    import contextlib
    import io

    captured = io.StringIO()
    with contextlib.redirect_stdout(captured):
        result = ParalleliqPolicy.validate_and_mutate(req)
    return result, captured.getvalue()


print("=== Test 1: GPU + Kubernetes -> should print message ===")
req = make_request(accelerators="A100:1", cloud="kubernetes")
result, output = _run_policy(req)
print(output, end="")  # still show the message to whoever runs the script
assert isinstance(result, MutatedUserRequest)
assert "GPU workload detected on Kubernetes" in output
print("PASS\n")

print("=== Test 2: GPU + no cloud -> should NOT print message ===")
req = make_request(accelerators="A100:1", cloud=None)
result, output = _run_policy(req)
assert isinstance(result, MutatedUserRequest)
assert output == ""
print("PASS\n")

print("=== Test 3: No GPU + Kubernetes -> should NOT print message ===")
req = make_request(accelerators=None, cloud="kubernetes")
result, output = _run_policy(req)
assert isinstance(result, MutatedUserRequest)
assert output == ""
print("PASS\n")

print("=== Test 4: GPU + Kubernetes, server side -> should NOT print message ===")
req = make_request(accelerators="A100:1", cloud="kubernetes", at_client_side=False)
result, output = _run_policy(req)
assert isinstance(result, MutatedUserRequest)
assert output == ""
print("PASS\n")

print("All tests passed.")
|