siclaw 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +75 -114
- package/dist/agentbox/gateway-client.d.ts +2 -1
- package/dist/agentbox/gateway-client.js +6 -2
- package/dist/agentbox/gateway-client.js.map +1 -1
- package/dist/agentbox/http-server.js +184 -19
- package/dist/agentbox/http-server.js.map +1 -1
- package/dist/agentbox/resource-handlers.d.ts +1 -0
- package/dist/agentbox/resource-handlers.js +23 -23
- package/dist/agentbox/resource-handlers.js.map +1 -1
- package/dist/agentbox/session.js +85 -5
- package/dist/agentbox/session.js.map +1 -1
- package/dist/agentbox-main.d.ts +2 -1
- package/dist/agentbox-main.js +65 -18
- package/dist/agentbox-main.js.map +1 -1
- package/dist/cli-credentials.d.ts +1 -0
- package/dist/cli-credentials.js +109 -0
- package/dist/cli-credentials.js.map +1 -0
- package/dist/cli-first-run.d.ts +11 -0
- package/dist/cli-first-run.js +99 -0
- package/dist/cli-first-run.js.map +1 -0
- package/dist/cli-main.js +33 -11
- package/dist/cli-main.js.map +1 -1
- package/dist/cli-setup.d.ts +5 -11
- package/dist/cli-setup.js +12 -225
- package/dist/cli-setup.js.map +1 -1
- package/dist/core/agent-factory.d.ts +4 -0
- package/dist/core/agent-factory.js +102 -151
- package/dist/core/agent-factory.js.map +1 -1
- package/dist/core/config.d.ts +10 -3
- package/dist/core/config.js +11 -95
- package/dist/core/config.js.map +1 -1
- package/dist/core/extensions/deep-investigation.d.ts +2 -1
- package/dist/core/extensions/deep-investigation.js +144 -24
- package/dist/core/extensions/deep-investigation.js.map +1 -1
- package/dist/core/extensions/setup.d.ts +8 -0
- package/dist/core/extensions/setup.js +669 -0
- package/dist/core/extensions/setup.js.map +1 -0
- package/dist/core/llm-proxy.js +7 -3
- package/dist/core/llm-proxy.js.map +1 -1
- package/dist/core/mcp-client.d.ts +0 -10
- package/dist/core/mcp-client.js +0 -65
- package/dist/core/mcp-client.js.map +1 -1
- package/dist/core/prompt.d.ts +1 -1
- package/dist/core/prompt.js +42 -5
- package/dist/core/prompt.js.map +1 -1
- package/dist/core/provider-presets.d.ts +14 -0
- package/dist/core/provider-presets.js +81 -0
- package/dist/core/provider-presets.js.map +1 -0
- package/dist/cron/cron-coordinator.d.ts +2 -0
- package/dist/cron/cron-coordinator.js +46 -14
- package/dist/cron/cron-coordinator.js.map +1 -1
- package/dist/cron/cron-executor.js +33 -8
- package/dist/cron/cron-executor.js.map +1 -1
- package/dist/cron/cron-scheduler.d.ts +1 -1
- package/dist/cron/gateway-client.d.ts +5 -0
- package/dist/cron/gateway-client.js +43 -8
- package/dist/cron/gateway-client.js.map +1 -1
- package/dist/cron-main.js +39 -9
- package/dist/cron-main.js.map +1 -1
- package/dist/gateway/agentbox/client.d.ts +11 -0
- package/dist/gateway/agentbox/client.js +18 -0
- package/dist/gateway/agentbox/client.js.map +1 -1
- package/dist/gateway/agentbox/k8s-spawner.d.ts +11 -2
- package/dist/gateway/agentbox/k8s-spawner.js +95 -52
- package/dist/gateway/agentbox/k8s-spawner.js.map +1 -1
- package/dist/gateway/agentbox/local-spawner.d.ts +1 -1
- package/dist/gateway/agentbox/local-spawner.js +4 -2
- package/dist/gateway/agentbox/local-spawner.js.map +1 -1
- package/dist/gateway/agentbox/manager.d.ts +0 -10
- package/dist/gateway/agentbox/manager.js +11 -30
- package/dist/gateway/agentbox/manager.js.map +1 -1
- package/dist/gateway/agentbox/types.d.ts +6 -4
- package/dist/gateway/cron/cron-service.d.ts +49 -0
- package/dist/gateway/cron/cron-service.js +259 -0
- package/dist/gateway/cron/cron-service.js.map +1 -0
- package/dist/gateway/db/init-schema.js +44 -0
- package/dist/gateway/db/init-schema.js.map +1 -1
- package/dist/gateway/db/migrate-sqlite.js +73 -4
- package/dist/gateway/db/migrate-sqlite.js.map +1 -1
- package/dist/gateway/db/repositories/chat-repo.d.ts +56 -2
- package/dist/gateway/db/repositories/chat-repo.js +132 -2
- package/dist/gateway/db/repositories/chat-repo.js.map +1 -1
- package/dist/gateway/db/repositories/config-repo.d.ts +31 -2
- package/dist/gateway/db/repositories/config-repo.js +57 -7
- package/dist/gateway/db/repositories/config-repo.js.map +1 -1
- package/dist/gateway/db/repositories/env-repo.d.ts +14 -0
- package/dist/gateway/db/repositories/env-repo.js +15 -2
- package/dist/gateway/db/repositories/env-repo.js.map +1 -1
- package/dist/gateway/db/repositories/model-config-repo.d.ts +1 -1
- package/dist/gateway/db/repositories/model-config-repo.js +26 -12
- package/dist/gateway/db/repositories/model-config-repo.js.map +1 -1
- package/dist/gateway/db/repositories/skill-repo.d.ts +0 -5
- package/dist/gateway/db/repositories/skill-review-repo.d.ts +1 -0
- package/dist/gateway/db/repositories/skill-review-repo.js +4 -1
- package/dist/gateway/db/repositories/skill-review-repo.js.map +1 -1
- package/dist/gateway/db/repositories/skill-version-repo.js +0 -1
- package/dist/gateway/db/repositories/skill-version-repo.js.map +1 -1
- package/dist/gateway/db/repositories/system-config-repo.d.ts +1 -1
- package/dist/gateway/db/repositories/system-config-repo.js +2 -1
- package/dist/gateway/db/repositories/system-config-repo.js.map +1 -1
- package/dist/gateway/db/repositories/user-env-config-repo.d.ts +13 -0
- package/dist/gateway/db/repositories/user-env-config-repo.js +11 -0
- package/dist/gateway/db/repositories/user-env-config-repo.js.map +1 -1
- package/dist/gateway/db/repositories/workspace-repo.d.ts +3 -2
- package/dist/gateway/db/repositories/workspace-repo.js +6 -2
- package/dist/gateway/db/repositories/workspace-repo.js.map +1 -1
- package/dist/gateway/db/schema-mysql.d.ts +473 -51
- package/dist/gateway/db/schema-mysql.js +35 -4
- package/dist/gateway/db/schema-mysql.js.map +1 -1
- package/dist/gateway/db/schema-sqlite.d.ts +522 -57
- package/dist/gateway/db/schema-sqlite.js +38 -6
- package/dist/gateway/db/schema-sqlite.js.map +1 -1
- package/dist/gateway/db/schema.d.ts +471 -51
- package/dist/gateway/db/schema.js +1 -1
- package/dist/gateway/db/schema.js.map +1 -1
- package/dist/gateway/metrics-aggregator.d.ts +65 -0
- package/dist/gateway/metrics-aggregator.js +244 -0
- package/dist/gateway/metrics-aggregator.js.map +1 -0
- package/dist/gateway/plugins/channel-bridge.d.ts +4 -1
- package/dist/gateway/plugins/channel-bridge.js +78 -86
- package/dist/gateway/plugins/channel-bridge.js.map +1 -1
- package/dist/gateway/rpc-methods.d.ts +4 -2
- package/dist/gateway/rpc-methods.js +962 -163
- package/dist/gateway/rpc-methods.js.map +1 -1
- package/dist/gateway/security/cert-manager.d.ts +2 -2
- package/dist/gateway/security/cert-manager.js +4 -2
- package/dist/gateway/security/cert-manager.js.map +1 -1
- package/dist/gateway/server.d.ts +4 -8
- package/dist/gateway/server.js +297 -261
- package/dist/gateway/server.js.map +1 -1
- package/dist/gateway/skills/file-writer.js +17 -11
- package/dist/gateway/skills/file-writer.js.map +1 -1
- package/dist/gateway/skills/script-evaluator.js +12 -9
- package/dist/gateway/skills/script-evaluator.js.map +1 -1
- package/dist/gateway/web/dist/assets/index-0p17ZeTP.js +740 -0
- package/dist/gateway/web/dist/assets/index-9eP6nPUq.js +741 -0
- package/dist/gateway/web/dist/assets/index-9eP6nPUq.js.map +1 -0
- package/dist/gateway/web/dist/assets/index-CAmSY91d.js +675 -0
- package/dist/gateway/web/dist/assets/index-DMFEh8Pp.css +1 -0
- package/dist/gateway/web/dist/assets/index-DyowBCEj.css +1 -0
- package/dist/gateway/web/dist/assets/index-PDK5JJDO.css +1 -0
- package/dist/gateway/web/dist/index.html +2 -2
- package/dist/gateway-main.js +27 -10
- package/dist/gateway-main.js.map +1 -1
- package/dist/memory/embeddings.js +5 -4
- package/dist/memory/embeddings.js.map +1 -1
- package/dist/memory/indexer.d.ts +23 -3
- package/dist/memory/indexer.js +235 -23
- package/dist/memory/indexer.js.map +1 -1
- package/dist/memory/schema.js +15 -1
- package/dist/memory/schema.js.map +1 -1
- package/dist/memory/types.d.ts +18 -0
- package/dist/memory/types.js +6 -1
- package/dist/memory/types.js.map +1 -1
- package/dist/shared/detect-language.d.ts +12 -0
- package/dist/shared/detect-language.js +78 -0
- package/dist/shared/detect-language.js.map +1 -0
- package/dist/shared/diagnostic-events.d.ts +70 -0
- package/dist/shared/diagnostic-events.js +38 -0
- package/dist/shared/diagnostic-events.js.map +1 -0
- package/dist/shared/local-collector.d.ts +56 -0
- package/dist/shared/local-collector.js +284 -0
- package/dist/shared/local-collector.js.map +1 -0
- package/dist/shared/metrics-types.d.ts +64 -0
- package/dist/shared/metrics-types.js +25 -0
- package/dist/shared/metrics-types.js.map +1 -0
- package/dist/shared/metrics.d.ts +19 -0
- package/dist/shared/metrics.js +185 -0
- package/dist/shared/metrics.js.map +1 -0
- package/dist/shared/path-utils.d.ts +15 -0
- package/dist/shared/path-utils.js +23 -0
- package/dist/shared/path-utils.js.map +1 -0
- package/dist/shared/retry.d.ts +35 -0
- package/dist/shared/retry.js +61 -0
- package/dist/shared/retry.js.map +1 -0
- package/dist/tools/command-sets.d.ts +18 -2
- package/dist/tools/command-sets.js +207 -32
- package/dist/tools/command-sets.js.map +1 -1
- package/dist/tools/command-validator.d.ts +56 -0
- package/dist/tools/command-validator.js +357 -0
- package/dist/tools/command-validator.js.map +1 -0
- package/dist/tools/create-skill.js +26 -1
- package/dist/tools/create-skill.js.map +1 -1
- package/dist/tools/credential-list.js +1 -23
- package/dist/tools/credential-list.js.map +1 -1
- package/dist/tools/credential-manager.d.ts +98 -0
- package/dist/tools/credential-manager.js +313 -0
- package/dist/tools/credential-manager.js.map +1 -0
- package/dist/tools/deep-search/engine.js +184 -127
- package/dist/tools/deep-search/engine.js.map +1 -1
- package/dist/tools/deep-search/prompts.d.ts +10 -2
- package/dist/tools/deep-search/prompts.js +37 -36
- package/dist/tools/deep-search/prompts.js.map +1 -1
- package/dist/tools/deep-search/schemas.d.ts +87 -0
- package/dist/tools/deep-search/schemas.js +85 -0
- package/dist/tools/deep-search/schemas.js.map +1 -0
- package/dist/tools/deep-search/sub-agent.d.ts +21 -0
- package/dist/tools/deep-search/sub-agent.js +153 -4
- package/dist/tools/deep-search/sub-agent.js.map +1 -1
- package/dist/tools/deep-search/tool.js +1 -0
- package/dist/tools/deep-search/tool.js.map +1 -1
- package/dist/tools/deep-search/types.d.ts +2 -0
- package/dist/tools/deep-search/types.js.map +1 -1
- package/dist/tools/dp-tools.js +29 -5
- package/dist/tools/dp-tools.js.map +1 -1
- package/dist/tools/exec-utils.d.ts +85 -0
- package/dist/tools/exec-utils.js +294 -0
- package/dist/tools/exec-utils.js.map +1 -0
- package/dist/tools/fork-skill.js +14 -2
- package/dist/tools/fork-skill.js.map +1 -1
- package/dist/tools/investigation-feedback.d.ts +3 -0
- package/dist/tools/investigation-feedback.js +71 -0
- package/dist/tools/investigation-feedback.js.map +1 -0
- package/dist/tools/manage-schedule.js +16 -6
- package/dist/tools/manage-schedule.js.map +1 -1
- package/dist/tools/netns-script.js +27 -281
- package/dist/tools/netns-script.js.map +1 -1
- package/dist/tools/node-exec.d.ts +2 -14
- package/dist/tools/node-exec.js +18 -225
- package/dist/tools/node-exec.js.map +1 -1
- package/dist/tools/node-script.js +14 -168
- package/dist/tools/node-script.js.map +1 -1
- package/dist/tools/pod-exec.d.ts +1 -1
- package/dist/tools/pod-exec.js +10 -26
- package/dist/tools/pod-exec.js.map +1 -1
- package/dist/tools/pod-nsenter-exec.js +21 -225
- package/dist/tools/pod-nsenter-exec.js.map +1 -1
- package/dist/tools/pod-script.js +10 -19
- package/dist/tools/pod-script.js.map +1 -1
- package/dist/tools/restricted-bash.d.ts +1 -17
- package/dist/tools/restricted-bash.js +38 -252
- package/dist/tools/restricted-bash.js.map +1 -1
- package/dist/tools/run-skill.d.ts +3 -1
- package/dist/tools/run-skill.js +21 -1
- package/dist/tools/run-skill.js.map +1 -1
- package/dist/tools/script-resolver.d.ts +3 -1
- package/dist/tools/script-resolver.js +74 -30
- package/dist/tools/script-resolver.js.map +1 -1
- package/dist/tools/update-skill.js +17 -6
- package/dist/tools/update-skill.js.map +1 -1
- package/package.json +8 -6
- package/siclaw.mjs +10 -1
- package/skills/core/cluster-events/SKILL.md +1 -1
- package/skills/core/deep-investigation/SKILL.md +11 -0
- package/skills/core/deployment-rollout-debug/SKILL.md +1 -1
- package/skills/core/dns-debug/SKILL.md +1 -0
- package/skills/core/meta.json +12 -1
- package/skills/core/networkpolicy-debug/SKILL.md +332 -0
- package/skills/core/node-logs/scripts/get-node-logs.sh +19 -9
- package/skills/core/pod-pending-debug/SKILL.md +1 -0
- package/skills/core/quota-debug/SKILL.md +203 -0
- package/skills/core/service-debug/SKILL.md +1 -0
- package/skills/core/statefulset-debug/SKILL.md +280 -0
- package/skills/core/volcano-diagnose-pod/SKILL.md +196 -0
- package/skills/core/volcano-diagnose-pod/scripts/diagnose-pod.sh +175 -0
- package/skills/core/volcano-gang-scheduling/SKILL.md +299 -0
- package/skills/core/volcano-job-diagnose/SKILL.md +319 -0
- package/skills/core/volcano-job-diagnose/scripts/diagnose-job.sh +253 -0
- package/skills/core/volcano-node-resources/SKILL.md +334 -0
- package/skills/core/volcano-node-resources/scripts/get-node-resources.sh +281 -0
- package/skills/core/volcano-queue-diagnose/SKILL.md +294 -0
- package/skills/core/volcano-queue-diagnose/scripts/diagnose-queue.sh +283 -0
- package/skills/core/volcano-resource-insufficient/SKILL.md +315 -0
- package/skills/core/volcano-scheduler-config/SKILL.md +371 -0
- package/skills/core/volcano-scheduler-config/scripts/get-scheduler-config.sh +297 -0
- package/skills/core/volcano-scheduler-logs/SKILL.md +241 -0
- package/skills/core/volcano-scheduler-logs/scripts/get-scheduler-logs.sh +159 -0
- package/skills/platform/create-skill/SKILL.md +35 -3
- package/skills/platform/manage-skill/SKILL.md +9 -2
- package/skills/platform/update-skill/SKILL.md +17 -6
|
@@ -0,0 +1,280 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: statefulset-debug
|
|
3
|
+
description: >-
|
|
4
|
+
Diagnose StatefulSet rollout and scaling failures (ordered update stuck, OnDelete not updating, partition misconfiguration, PVC binding deadlocks).
|
|
5
|
+
Checks update strategy, pod ordinal progression, PVC bindings, and ordered startup to identify why a StatefulSet is not progressing.
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
# StatefulSet Rollout & Scaling Failure Diagnosis
|
|
9
|
+
|
|
10
|
+
When a StatefulSet rollout is stuck, pods are not updating, or scaling is not progressing, follow this flow to identify the root cause.
|
|
11
|
+
|
|
12
|
+
**Scope:** This skill is for **diagnosis only**. Once you identify the root cause, report it to the user and stop. Do NOT attempt to modify the StatefulSet, delete pods, or change PVCs — that should be left to the user.
|
|
13
|
+
|
|
14
|
+
**When to use:** A StatefulSet is not progressing — pods are not updating to the new version, scaling up/down is stuck, or specific ordinal pods are not becoming ready.
|
|
15
|
+
|
|
16
|
+
**Not for Deployments:** Deployment rollouts have different semantics (parallel, unordered). Use `deployment-rollout-debug` for Deployments.
|
|
17
|
+
|
|
18
|
+
## Key Concepts
|
|
19
|
+
|
|
20
|
+
StatefulSets differ fundamentally from Deployments:
|
|
21
|
+
- **Fixed pod identity** — pods have stable names with ordinal suffixes (pod-0, pod-1, ...)
|
|
22
|
+
- **Ordered operations** — updates go in reverse order (N-1 → 0), scaling up goes in forward order (0 → N-1)
|
|
23
|
+
- **Per-pod PVCs** — each pod gets its own PersistentVolumeClaim via `volumeClaimTemplates`
|
|
24
|
+
- **Blocking progression** — in OrderedReady mode (default), if pod at ordinal K is not Ready, all pods with ordinal < K will NOT be updated
|
|
25
|
+
|
|
26
|
+
## Diagnostic Flow
|
|
27
|
+
|
|
28
|
+
### 1. Get StatefulSet overview
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
kubectl get statefulset <name> -n <ns> -o wide
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
Compare the columns:
|
|
35
|
+
- **READY** — pods that are running and ready
|
|
36
|
+
- **REPLICAS** — desired replica count (from `spec.replicas`)
|
|
37
|
+
- **UP-TO-DATE** — pods already running the target version (their revision matches `updateRevision`); the rollout is complete once `currentRevision` == `updateRevision`
|
|
38
|
+
|
|
39
|
+
If `READY < REPLICAS`, or the `UP-TO-DATE` count (`.status.updatedReplicas`) is below the desired replica count, the rollout or scaling is incomplete.
|
|
40
|
+
|
|
41
|
+
### 2. Describe the StatefulSet
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
kubectl describe statefulset <name> -n <ns>
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
Focus on:
|
|
48
|
+
- **Update Strategy** — `RollingUpdate` or `OnDelete`
|
|
49
|
+
- **Partition** — if set, only pods with ordinal ≥ partition are updated
|
|
50
|
+
- **maxUnavailable** — if set (introduced as alpha in Kubernetes 1.24; only honored when the `MaxUnavailableStatefulSet` feature gate is enabled), allows multiple pods to be updated simultaneously instead of one at a time
|
|
51
|
+
- **Current Revision / Update Revision** — if different, an update is in progress
|
|
52
|
+
- **Events** — look for errors or warnings
|
|
53
|
+
|
|
54
|
+
### 3. Check pod status by ordinal
|
|
55
|
+
|
|
56
|
+
First get the StatefulSet's pod selector to reliably find its pods:
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
kubectl get statefulset <name> -n <ns> -o jsonpath='{.spec.selector.matchLabels}'
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
Then use the returned labels to list pods:
|
|
63
|
+
|
|
64
|
+
```bash
|
|
65
|
+
kubectl get pods -n <ns> -l <key>=<value> --sort-by='.metadata.name'
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
Identify which ordinal pod is stuck. In a StatefulSet with OrderedReady policy, **the stuck pod blocks all subsequent operations**.
|
|
69
|
+
|
|
70
|
+
### 4. Match the failure pattern
|
|
71
|
+
|
|
72
|
+
---
|
|
73
|
+
|
|
74
|
+
#### OnDelete strategy — Pods not updating after StatefulSet change
|
|
75
|
+
|
|
76
|
+
The StatefulSet uses `updateStrategy.type: OnDelete`. In this mode, Kubernetes does **not** automatically update pods — the user must manually delete each pod for it to be recreated with the new spec.
|
|
77
|
+
|
|
78
|
+
```bash
|
|
79
|
+
kubectl get statefulset <name> -n <ns> -o jsonpath='{.spec.updateStrategy}'
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
If the output shows `{"type":"OnDelete"}` or no `rollingUpdate` field:
|
|
83
|
+
|
|
84
|
+
Check if the current and update revisions differ:
|
|
85
|
+
|
|
86
|
+
```bash
|
|
87
|
+
kubectl get statefulset <name> -n <ns> -o jsonpath='current={.status.currentRevision} update={.status.updateRevision}'
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
If they differ, the StatefulSet spec has been updated but pods are still running the old version. This is **expected behavior** for OnDelete — pods must be manually deleted to pick up the new version.
|
|
91
|
+
|
|
92
|
+
Check which pods are still on the old revision (use the selector from step 3):
|
|
93
|
+
|
|
94
|
+
```bash
|
|
95
|
+
kubectl get pods -n <ns> -l <key>=<value> -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.metadata.labels.controller-revision-hash}{"\n"}{end}'
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
Pods whose `controller-revision-hash` matches `currentRevision` (not `updateRevision`) are still on the old version.
|
|
99
|
+
|
|
100
|
+
---
|
|
101
|
+
|
|
102
|
+
#### RollingUpdate stuck at a specific ordinal — Ordered update blocked
|
|
103
|
+
|
|
104
|
+
In RollingUpdate mode, StatefulSet updates pods in **reverse ordinal order** (N-1 → N-2 → ... → 0). By default (one-at-a-time), if pod at ordinal K is not Ready, the update stops — pods K-1, K-2, ..., 0 will not be updated.
|
|
105
|
+
|
|
106
|
+
**Check maxUnavailable** (introduced as alpha in Kubernetes 1.24; only honored when the `MaxUnavailableStatefulSet` feature gate is enabled):
|
|
107
|
+
|
|
108
|
+
```bash
|
|
109
|
+
kubectl get statefulset <name> -n <ns> -o jsonpath='{.spec.updateStrategy.rollingUpdate.maxUnavailable}'
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
If `maxUnavailable` is set (e.g., `3`), multiple pods can be updated simultaneously instead of strict one-at-a-time. In this case, seeing 2-3 pods updating at once is normal — not a sign of being stuck. Only investigate if the number of updating pods is below `maxUnavailable` for an extended period, or if specific pods are stuck in a non-Ready state.
|
|
113
|
+
|
|
114
|
+
Find pods that are not Ready:
|
|
115
|
+
|
|
116
|
+
```bash
|
|
117
|
+
kubectl get pods -n <ns> -l <key>=<value> --sort-by='.metadata.name'
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
Check the stuck pod's status:
|
|
121
|
+
- **Pending** → Use `pod-pending-debug`
|
|
122
|
+
- **CrashLoopBackOff / Error** → Use `pod-crash-debug`
|
|
123
|
+
- **ImagePullBackOff** → Use `image-pull-debug`
|
|
124
|
+
- **Running but not Ready** → Check readiness probe (see below)
|
|
125
|
+
|
|
126
|
+
If the pod is Running but not Ready:
|
|
127
|
+
|
|
128
|
+
```bash
|
|
129
|
+
kubectl describe pod <stuck-pod> -n <ns>
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
Look for `Readiness probe failed` events. Common causes:
|
|
133
|
+
- Application not listening on the expected port after config change
|
|
134
|
+
- New version has a bug that prevents health check from passing
|
|
135
|
+
- Readiness probe configuration too aggressive for the new version's startup time
|
|
136
|
+
|
|
137
|
+
---
|
|
138
|
+
|
|
139
|
+
#### Partition update — Only some pods updated
|
|
140
|
+
|
|
141
|
+
The StatefulSet has `spec.updateStrategy.rollingUpdate.partition` set. Only pods with ordinal **≥ partition** are updated; pods with ordinal < partition remain on the old version.
|
|
142
|
+
|
|
143
|
+
```bash
|
|
144
|
+
kubectl get statefulset <name> -n <ns> -o jsonpath='{.spec.updateStrategy.rollingUpdate.partition}'
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
If this returns a number (e.g., `3`), then pods 0, 1, 2 will NOT be updated. This is often used intentionally for **canary rollouts** — update a subset first, verify, then lower the partition to 0 to roll out fully.
|
|
148
|
+
|
|
149
|
+
If the user expects all pods to be updated, the partition value needs to be set to `0` or removed.
|
|
150
|
+
|
|
151
|
+
---
|
|
152
|
+
|
|
153
|
+
#### Scaling up stuck — Ordered creation blocked
|
|
154
|
+
|
|
155
|
+
When scaling up, StatefulSet creates pods in **forward ordinal order** (0 → 1 → 2 → ...). Pod at ordinal K+1 is not created until pod K is Running and Ready.
|
|
156
|
+
|
|
157
|
+
```bash
|
|
158
|
+
kubectl get pods -n <ns> | grep <statefulset-name>
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
Find the highest ordinal pod that exists — the next ordinal is waiting for this pod to become Ready.
|
|
162
|
+
|
|
163
|
+
Check why the current highest pod is not Ready (same diagnosis as the "stuck at specific ordinal" pattern above).
|
|
164
|
+
|
|
165
|
+
For the `podManagementPolicy` field:
|
|
166
|
+
|
|
167
|
+
```bash
|
|
168
|
+
kubectl get statefulset <name> -n <ns> -o jsonpath='{.spec.podManagementPolicy}'
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
- **OrderedReady** (default) — strict ordered creation, one at a time
|
|
172
|
+
- **Parallel** — all pods are created simultaneously (no ordering guarantee)
|
|
173
|
+
|
|
174
|
+
If the policy is `Parallel` and pods are still stuck, the issue is not ordering — check individual pod status.
|
|
175
|
+
|
|
176
|
+
---
|
|
177
|
+
|
|
178
|
+
#### PVC binding deadlock — Pod stuck in Pending due to volume topology
|
|
179
|
+
|
|
180
|
+
StatefulSet pods use `volumeClaimTemplates` to create per-pod PVCs. If the PVC is bound to a PV in a specific availability zone (AZ) or node, but that node/AZ has no resources, the pod cannot be scheduled.
|
|
181
|
+
|
|
182
|
+
Check PVC status for the stuck pod:
|
|
183
|
+
|
|
184
|
+
```bash
|
|
185
|
+
kubectl get pvc -n <ns> | grep <statefulset-name>
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
```bash
|
|
189
|
+
kubectl describe pvc <pvc-name> -n <ns>
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
Check the StorageClass's `volumeBindingMode`:
|
|
193
|
+
|
|
194
|
+
```bash
|
|
195
|
+
kubectl get storageclass $(kubectl get pvc <pvc-name> -n <ns> -o jsonpath='{.spec.storageClassName}') -o jsonpath='{.volumeBindingMode}'
|
|
196
|
+
```
|
|
197
|
+
|
|
198
|
+
- **Immediate** — the PVC is bound to a PV as soon as it is created, regardless of pod scheduling. If the PV is in a different zone than the only available nodes, the pod cannot be scheduled.
|
|
199
|
+
- **WaitForFirstConsumer** — PVC binding is delayed until the pod is scheduled. If no node can satisfy both the pod's scheduling constraints and the storage topology, the PVC stays `Pending` and the pod stays `Pending` — a deadlock.
|
|
200
|
+
|
|
201
|
+
Check if the PV has a node affinity constraint:
|
|
202
|
+
|
|
203
|
+
```bash
|
|
204
|
+
kubectl get pv <pv-name> -o jsonpath='{.spec.nodeAffinity}'
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
If the PV is locked to a specific node/zone:
|
|
208
|
+
- Check if that node has available resources: `kubectl describe node <node>`
|
|
209
|
+
- Check if that node is healthy: `kubectl get node <node>`
|
|
210
|
+
|
|
211
|
+
**Common scenario:** A node was replaced or drained, but the PV is still bound to the old node's zone. The new pod can only be scheduled to nodes that can access this PV, but those nodes may be full or tainted.
|
|
212
|
+
|
|
213
|
+
For further PVC diagnosis, use the `pvc-debug` skill.
|
|
214
|
+
|
|
215
|
+
---
|
|
216
|
+
|
|
217
|
+
#### Scaling down — PVCs left behind
|
|
218
|
+
|
|
219
|
+
When a StatefulSet is scaled down, pods are deleted in **reverse ordinal order** (N-1 → N-2 → ...). However, by default Kubernetes does **not** automatically delete the associated PVCs.
|
|
220
|
+
|
|
221
|
+
```bash
|
|
222
|
+
kubectl get pvc -n <ns> | grep <statefulset-name>
|
|
223
|
+
```
|
|
224
|
+
|
|
225
|
+
If there are PVCs for ordinals that no longer exist (e.g., `data-myapp-3` when replicas is 2), these are orphaned PVCs from a previous scale-down.
|
|
226
|
+
|
|
227
|
+
This is by design to prevent data loss. But when scaling back up, the new pod will reattach to the old PVC with stale data, which may cause application issues.
|
|
228
|
+
|
|
229
|
+
Check the StatefulSet's `persistentVolumeClaimRetentionPolicy` (enabled by default since Kubernetes 1.27, when the feature became beta):
|
|
230
|
+
|
|
231
|
+
```bash
|
|
232
|
+
kubectl get statefulset <name> -n <ns> -o jsonpath='{.spec.persistentVolumeClaimRetentionPolicy}'
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
- **whenDeleted: Retain** (default) — PVCs are kept when the StatefulSet is deleted
|
|
236
|
+
- **whenScaled: Retain** (default) — PVCs are kept when scaling down
|
|
237
|
+
- **whenScaled: Delete** — PVCs are automatically deleted on scale-down
|
|
238
|
+
|
|
239
|
+
---
|
|
240
|
+
|
|
241
|
+
#### Pod stuck in Terminating during update or scale-down
|
|
242
|
+
|
|
243
|
+
During an update or scale-down, if a pod is stuck in `Terminating`, the next operation cannot proceed.
|
|
244
|
+
|
|
245
|
+
```bash
|
|
246
|
+
kubectl describe pod <terminating-pod> -n <ns>
|
|
247
|
+
```
|
|
248
|
+
|
|
249
|
+
First check if a PodDisruptionBudget (PDB) is preventing the deletion:
|
|
250
|
+
|
|
251
|
+
```bash
|
|
252
|
+
kubectl get pdb -n <ns>
|
|
253
|
+
```
|
|
254
|
+
|
|
255
|
+
```bash
|
|
256
|
+
kubectl describe pdb <pdb-name> -n <ns>
|
|
257
|
+
```
|
|
258
|
+
|
|
259
|
+
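Note that a PDB only limits voluntary evictions made through the Eviction API (for example `kubectl drain`); it does not prevent the StatefulSet controller itself from deleting pods. Check `status.disruptionsAllowed` — if it is `0` because the PDB's `minAvailable` or `maxUnavailable` limit has been reached, evictions are refused until other pods become Ready, so a concurrent node drain may be what is stalled rather than the StatefulSet.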
|
|
260
|
+
|
|
261
|
+
If PDB is not the issue, check other common causes:
|
|
262
|
+
- **Finalizer blocking deletion** — check `metadata.finalizers`
|
|
263
|
+
- **PreStop hook hanging** — a long-running preStop hook delays termination
|
|
264
|
+
- **Process not responding to SIGTERM** — the container process ignores shutdown signals and must wait for `terminationGracePeriodSeconds` to expire
|
|
265
|
+
- **Volume unmount stuck** — the volume cannot be detached from the node
|
|
266
|
+
|
|
267
|
+
Check the grace period:
|
|
268
|
+
|
|
269
|
+
```bash
|
|
270
|
+
kubectl get pod <pod> -n <ns> -o jsonpath='{.spec.terminationGracePeriodSeconds}'
|
|
271
|
+
```
|
|
272
|
+
|
|
273
|
+
## Notes
|
|
274
|
+
|
|
275
|
+
- StatefulSet updates go in **reverse** ordinal order (N-1 → 0), but scaling up goes in **forward** order (0 → N-1). This is a common source of confusion.
|
|
276
|
+
- `OnDelete` is frequently used in database StatefulSets (MySQL, PostgreSQL, etc.) where the operator wants manual control over when each replica is restarted. If a user complains that pods are not updating, check the strategy before assuming there is a bug.
|
|
277
|
+
- The `partition` field is for canary rollouts. A common workflow: set partition=N-1 to update only the last pod, verify, then set partition=0 to roll out to all pods. If a user sees partial updates, check partition before investigating further.
|
|
278
|
+
- PVCs created by `volumeClaimTemplates` follow the naming convention `<volumeClaimTemplate-name>-<statefulset-name>-<ordinal>`. Use this pattern to find PVCs for specific ordinals.
|
|
279
|
+
- Unlike Deployments, StatefulSets do NOT create new ReplicaSets for updates. They track revisions with ControllerRevision objects and replace pods one at a time (delete the old pod, then create a new pod with the same name and PVC).
|
|
280
|
+
- For cross-reference: if the stuck pod's issue is at the scheduling level, use `pod-pending-debug`. If it is crashing, use `pod-crash-debug`. If PVCs are not binding, use `pvc-debug`.
|
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: volcano-diagnose-pod
|
|
3
|
+
description: >-
|
|
4
|
+
Diagnose Volcano-managed Pod scheduling issues.
|
|
5
|
+
Checks Pod status, PodGroup, events, and Queue to identify scheduling failures.
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
# Volcano Pod Diagnosis
|
|
9
|
+
|
|
10
|
+
Diagnose Volcano-managed Pod scheduling issues. This skill checks Pod status, associated PodGroup, scheduling events, and Queue configuration to identify why a Pod cannot be scheduled.
|
|
11
|
+
|
|
12
|
+
**Scope:** This skill is for **diagnosis only**. Once you identify the root cause, report it to the user and stop. Do NOT attempt to modify pod specs, PodGroups, or Queues — that should be left to the user.
|
|
13
|
+
|
|
14
|
+
## Usage
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
bash skills/core/volcano-diagnose-pod/scripts/diagnose-pod.sh --pod <pod-name> --namespace <namespace>
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
## Parameters
|
|
21
|
+
|
|
22
|
+
| Parameter | Required | Description |
|
|
23
|
+
|-----------|----------|-------------|
|
|
24
|
+
| `--pod POD` | yes | Pod name to diagnose |
|
|
25
|
+
| `--namespace NS` | no | Namespace (default: `default`) |
|
|
26
|
+
| `--verbose` | no | Show detailed output including node resources |
|
|
27
|
+
|
|
28
|
+
## Examples
|
|
29
|
+
|
|
30
|
+
Diagnose a pending pod in default namespace:
|
|
31
|
+
```bash
|
|
32
|
+
bash skills/core/volcano-diagnose-pod/scripts/diagnose-pod.sh --pod my-job-0
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
Diagnose a pod in specific namespace:
|
|
36
|
+
```bash
|
|
37
|
+
bash skills/core/volcano-diagnose-pod/scripts/diagnose-pod.sh --pod my-job-0 --namespace training
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
Verbose mode with node resource information:
|
|
41
|
+
```bash
|
|
42
|
+
bash skills/core/volcano-diagnose-pod/scripts/diagnose-pod.sh --pod my-job-0 --namespace training --verbose
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## Diagnostic Flow
|
|
46
|
+
|
|
47
|
+
The script performs the following checks in order:
|
|
48
|
+
|
|
49
|
+
### 1. Pod Status
|
|
50
|
+
Check the Pod's current phase and conditions.
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
kubectl get pod <pod> -n <ns> -o wide
|
|
54
|
+
kubectl describe pod <pod> -n <ns>
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
### 2. PodGroup Status
|
|
58
|
+
Check if the Pod is associated with a PodGroup and its scheduling status.
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
kubectl get pod <pod> -n <ns> -o jsonpath='{.metadata.annotations.scheduling\.volcano\.sh/pod-group}'
|
|
62
|
+
kubectl get podgroup <podgroup> -n <ns>
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
Key fields to check:
|
|
66
|
+
- `spec.minMember`: Minimum members required for Gang scheduling
|
|
67
|
+
- `status.phase`: Pending, Inqueue, Running, Unknown
|
|
68
|
+
- `status.running`: Number of running pods
|
|
69
|
+
- `status.pending`: Number of pending pods
|
|
70
|
+
|
|
71
|
+
### 3. Events Analysis
|
|
72
|
+
Check scheduling events for failure reasons.
|
|
73
|
+
|
|
74
|
+
```bash
|
|
75
|
+
kubectl get events -n <ns> --field-selector involvedObject.name=<pod> --sort-by='.lastTimestamp'
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
Look for these event patterns:
|
|
79
|
+
|
|
80
|
+
#### `FailedScheduling` - General scheduling failure
|
|
81
|
+
The scheduler attempted but failed to schedule the pod. Check the message for specific reasons.
|
|
82
|
+
|
|
83
|
+
**Volcano-specific sub-patterns:**
|
|
84
|
+
|
|
85
|
+
| Event Message | Meaning | Next Step |
|
|
86
|
+
|---------------|---------|-----------|
|
|
87
|
+
| `0/N nodes are available` + `minMember` | Gang constraint not satisfied | Use `volcano-gang-scheduling` |
|
|
88
|
+
| `exceeded quota` / `queue resource exceeded` | Queue deserved resources exhausted | Use `volcano-queue-diagnose` |
|
|
89
|
+
| `Insufficient cpu/memory` + Gang mention | Resource shortage blocking Gang | Use `volcano-resource-insufficient` |
|
|
90
|
+
| `pod group is not ready` | PodGroup not in Inqueue phase | Check PodGroup status |
|
|
91
|
+
| `task <name> is not ready` | Task dependencies not met | Check dependent tasks |
|
|
92
|
+
|
|
93
|
+
> **Quick Reference vs Detailed Analysis:** The table above provides a quick lookup for common patterns. The sections below provide detailed analysis, additional context, and more diagnostic commands for each pattern.
|
|
94
|
+
|
|
95
|
+
#### `Insufficient cpu` / `Insufficient memory` - Resource shortage
|
|
96
|
+
No node has enough allocatable resources. Check:
|
|
97
|
+
- Node resources: `kubectl top nodes`
|
|
98
|
+
- Pod resource requests: `kubectl get pod <pod> -n <ns> -o jsonpath='{.spec.containers[*].resources.requests}'`
|
|
99
|
+
|
|
100
|
+
**Volcano context:** If this is a Gang-scheduled pod, even if total cluster resources are sufficient, you need enough resources **simultaneously** on enough nodes. Use `volcano-resource-insufficient` to check fragmentation.
|
|
101
|
+
|
|
102
|
+
#### `minMember` not satisfied - Gang constraint
|
|
103
|
+
The PodGroup requires `minMember` pods to be scheduled simultaneously, but the cluster cannot satisfy this. Use `volcano-gang-scheduling` skill for detailed diagnosis.
|
|
104
|
+
|
|
105
|
+
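**Key insight:** Even if `kubectl top nodes` shows enough total resources, Gang scheduling requires room for all `minMember` pods **simultaneously** — every pod must fit on some node at the same time, so free capacity fragmented across nodes may not be usable.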
|
|
106
|
+
|
|
107
|
+
#### `queue resource exceeded` - Queue quota limit
|
|
108
|
+
The Queue associated with this Pod has exceeded its deserved resources. Check Queue status with `volcano-queue-diagnose` skill.
|
|
109
|
+
|
|
110
|
+
**Volcano-specific terms you might see:**
|
|
111
|
+
- `overused` - Queue has exceeded its fair share
|
|
112
|
+
- `deserved resources` - Calculated from queue weight proportion
|
|
113
|
+
- `allocated resources` - Currently used by jobs in this queue
|
|
114
|
+
|
|
115
|
+
#### `reclaim` events - Resource reclamation triggered
|
|
116
|
+
If you see events mentioning `reclaim`:
|
|
117
|
+
- Another queue is trying to reclaim resources from your pod's queue
|
|
118
|
+
- Your queue may be `over-allocated` (allocated > deserved)
|
|
119
|
+
- Check queue status: `volcano-queue-diagnose --queue <queue>`
|
|
120
|
+
|
|
121
|
+
#### `preempt` events - Priority preemption
|
|
122
|
+
Higher priority workload is evicting this pod. Check:
|
|
123
|
+
- Pod priority class: `kubectl get pod <pod> -n <ns> -o jsonpath='{.spec.priorityClassName}'`
|
|
124
|
+
- Preemptor details in scheduler logs: `volcano-scheduler-logs --keyword preempt`
|
|
125
|
+
|
|
126
|
+
#### `enqueue` related events
|
|
127
|
+
- `PodGroup is enqueued` - PodGroup admitted to queue, ready for scheduling
|
|
128
|
+
- `PodGroup is pending` - Waiting for queue admission (capacity or resource check)
|
|
129
|
+
- `enqueue failed` - Failed admission check (overcommit, queue closed, etc.)
|
|
130
|
+
|
|
131
|
+
### 4. Queue Status
|
|
132
|
+
Check the Queue configuration and resource allocation.
|
|
133
|
+
|
|
134
|
+
```bash
|
|
135
|
+
kubectl get podgroup <podgroup> -n <ns> -o jsonpath='{.spec.queue}'
|
|
136
|
+
kubectl get queue <queue>
|
|
137
|
+
kubectl describe queue <queue>
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
Key fields:
|
|
141
|
+
- `spec.weight`: Queue weight for resource sharing
|
|
142
|
+
- `spec.capability`: Maximum resources the queue can use
|
|
143
|
+
- `status.state`: Open, Closed, or Closing
|
|
144
|
+
- `status.deserved`: Resources deserved by this queue
|
|
145
|
+
- `status.allocated`: Resources currently allocated
|
|
146
|
+
|
|
147
|
+
### 5. Node Resources (verbose mode)
|
|
148
|
+
When `--verbose` is specified, also check node allocatable resources.
|
|
149
|
+
|
|
150
|
+
```bash
|
|
151
|
+
kubectl get nodes -o custom-columns='NAME:.metadata.name,CPU:.status.allocatable.cpu,MEM:.status.allocatable.memory'
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
## Common Issues and Solutions
|
|
155
|
+
|
|
156
|
+
### Pod stuck in Pending, no events
|
|
157
|
+
- Check if Volcano scheduler is running: `kubectl get pods -n volcano-system -l app=volcano-scheduler`
|
|
158
|
+
- Check if Volcano controller-manager is running: `kubectl get pods -n volcano-system -l app=volcano-controller-manager`
|
|
159
|
+
- The controller-manager is responsible for Job lifecycle, PodGroup creation, and queue management — if it's down, jobs won't transition states even if the scheduler is healthy
|
|
160
|
+
- Check scheduler logs: `volcano-scheduler-logs` skill
|
|
161
|
+
|
|
162
|
+
### PodGroup phase is Pending
|
|
163
|
+
- The PodGroup is waiting for enqueue action to admit it
|
|
164
|
+
- **Verify the queue actually exists** — a typo in queue name causes the PodGroup to stay Pending silently:
|
|
165
|
+
```bash
|
|
166
|
+
kubectl get podgroup <pg> -n <ns> -o jsonpath='{.spec.queue}'
|
|
167
|
+
kubectl get queue <queue-name>
|
|
168
|
+
```
|
|
169
|
+
If the queue name is empty, the job uses the `default` queue — verify it exists and is Open
|
|
170
|
+
- Check Queue capacity and deserved resources
|
|
171
|
+
- Check if cluster has sufficient resources
|
|
172
|
+
|
|
173
|
+
### PodGroup phase is Inqueue but Pod is Pending
|
|
174
|
+
- Check if `minMember` constraint is not satisfied
|
|
175
|
+
- Check if there are affinity/anti-affinity conflicts
|
|
176
|
+
- Check if taints prevent scheduling
|
|
177
|
+
|
|
178
|
+
### Queue status shows insufficient deserved resources
|
|
179
|
+
- The queue may have insufficient weight or capability configured
|
|
180
|
+
- Other queues may be reclaiming resources
|
|
181
|
+
- Use `volcano-queue-diagnose` for detailed analysis
|
|
182
|
+
|
|
183
|
+
## Environment Variables
|
|
184
|
+
|
|
185
|
+
| Variable | Default | Description |
|
|
186
|
+
|----------|---------|-------------|
|
|
187
|
+
| `VOLCANO_NAMESPACE` | `default` | Default namespace for Pod lookup |
|
|
188
|
+
| `VOLCANO_SCHEDULER_NS` | `volcano-system` | Namespace where volcano scheduler runs |
|
|
189
|
+
|
|
190
|
+
## See Also
|
|
191
|
+
|
|
192
|
+
- `volcano-gang-scheduling` - Detailed Gang scheduling diagnosis
|
|
193
|
+
- `volcano-queue-diagnose` - Queue status and quota analysis
|
|
194
|
+
- `volcano-scheduler-logs` - Scheduler log analysis
|
|
195
|
+
- `volcano-resource-insufficient` - Resource shortage diagnosis
|
|
196
|
+
- `quota-debug` - Native Kubernetes ResourceQuota/LimitRange diagnosis (non-Volcano)
|
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
#!/bin/bash
|
|
2
|
+
# Diagnose Volcano-managed Pod scheduling issues.
|
|
3
|
+
# This script performs read-only operations using kubectl.
|
|
4
|
+
set -euo pipefail
|
|
5
|
+
|
|
6
|
+
show_help() {
|
|
7
|
+
cat <<EOF
|
|
8
|
+
Usage: $0 --pod <pod> [options]
|
|
9
|
+
|
|
10
|
+
Diagnose Volcano-managed Pod scheduling issues.
|
|
11
|
+
Checks Pod status, PodGroup, events, and Queue configuration.
|
|
12
|
+
|
|
13
|
+
Options:
|
|
14
|
+
--pod POD Pod name to diagnose (required)
|
|
15
|
+
--namespace NS Namespace (default: default)
|
|
16
|
+
--verbose Show detailed output including node resources
|
|
17
|
+
-h, --help Show this help message
|
|
18
|
+
|
|
19
|
+
Environment:
|
|
20
|
+
VOLCANO_NAMESPACE Override default namespace
|
|
21
|
+
VOLCANO_SCHEDULER_NS Scheduler namespace (default: volcano-system)
|
|
22
|
+
|
|
23
|
+
Examples:
|
|
24
|
+
$0 --pod my-job-0
|
|
25
|
+
$0 --pod my-job-0 --namespace training
|
|
26
|
+
$0 --pod my-job-0 --namespace training --verbose
|
|
27
|
+
EOF
|
|
28
|
+
exit 0
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
# Parse arguments
|
|
32
|
+
POD=""
|
|
33
|
+
NS="${VOLCANO_NAMESPACE:-default}"
|
|
34
|
+
SCHEDULER_NS="${VOLCANO_SCHEDULER_NS:-volcano-system}"
|
|
35
|
+
VERBOSE=false
|
|
36
|
+
|
|
37
|
+
while [[ $# -gt 0 ]]; do
|
|
38
|
+
case $1 in
|
|
39
|
+
-h|--help) show_help ;;
|
|
40
|
+
--pod) POD="$2"; shift 2 ;;
|
|
41
|
+
--namespace) NS="$2"; shift 2 ;;
|
|
42
|
+
--verbose) VERBOSE=true; shift ;;
|
|
43
|
+
*) echo "Unknown option: $1. Use --help for usage." >&2; exit 1 ;;
|
|
44
|
+
esac
|
|
45
|
+
done
|
|
46
|
+
|
|
47
|
+
[[ -z "$POD" ]] && { echo "Error: --pod is required. Use --help for usage." >&2; exit 1; }
|
|
48
|
+
|
|
49
|
+
echo "=== Volcano Pod Diagnosis: $NS/$POD ==="
|
|
50
|
+
echo
|
|
51
|
+
|
|
52
|
+
# 1. Pod Status
|
|
53
|
+
echo "[1/5] Pod Status"
|
|
54
|
+
echo "----------------"
|
|
55
|
+
if ! kubectl get pod "$POD" -n "$NS" -o wide 2>/dev/null; then
|
|
56
|
+
echo "Error: Pod '$POD' not found in namespace '$NS'" >&2
|
|
57
|
+
exit 1
|
|
58
|
+
fi
|
|
59
|
+
echo
|
|
60
|
+
|
|
61
|
+
# Get Pod phase
|
|
62
|
+
POD_PHASE=$(kubectl get pod "$POD" -n "$NS" -o jsonpath='{.status.phase}' 2>/dev/null || echo "Unknown")
|
|
63
|
+
echo "Pod Phase: $POD_PHASE"
|
|
64
|
+
echo
|
|
65
|
+
|
|
66
|
+
# 2. PodGroup Information
|
|
67
|
+
echo "[2/5] PodGroup Information"
|
|
68
|
+
echo "--------------------------"
|
|
69
|
+
PG=$(kubectl get pod "$POD" -n "$NS" -o jsonpath='{.metadata.annotations.scheduling\.volcano\.sh/pod-group}' 2>/dev/null || true)
|
|
70
|
+
|
|
71
|
+
if [[ -n "$PG" ]]; then
|
|
72
|
+
echo "PodGroup: $PG"
|
|
73
|
+
echo
|
|
74
|
+
if kubectl get podgroup "$PG" -n "$NS" 2>/dev/null; then
|
|
75
|
+
echo
|
|
76
|
+
PG_PHASE=$(kubectl get podgroup "$PG" -n "$NS" -o jsonpath='{.status.phase}' 2>/dev/null || echo "Unknown")
|
|
77
|
+
PG_MINMEMBER=$(kubectl get podgroup "$PG" -n "$NS" -o jsonpath='{.spec.minMember}' 2>/dev/null || echo "0")
|
|
78
|
+
PG_RUNNING=$(kubectl get podgroup "$PG" -n "$NS" -o jsonpath='{.status.running}' 2>/dev/null || echo "0")
|
|
79
|
+
PG_PENDING=$(kubectl get podgroup "$PG" -n "$NS" -o jsonpath='{.status.pending}' 2>/dev/null || echo "0")
|
|
80
|
+
PG_QUEUE=$(kubectl get podgroup "$PG" -n "$NS" -o jsonpath='{.spec.queue}' 2>/dev/null || echo "default")
|
|
81
|
+
|
|
82
|
+
echo "PodGroup Phase: $PG_PHASE"
|
|
83
|
+
echo "MinMember: $PG_MINMEMBER"
|
|
84
|
+
echo "Running: $PG_RUNNING"
|
|
85
|
+
echo "Pending: $PG_PENDING"
|
|
86
|
+
echo "Queue: $PG_QUEUE"
|
|
87
|
+
else
|
|
88
|
+
echo "Warning: PodGroup '$PG' not found"
|
|
89
|
+
fi
|
|
90
|
+
else
|
|
91
|
+
echo "⚠️ No PodGroup annotation found — this Pod is NOT managed by Volcano scheduler."
|
|
92
|
+
echo " Recommended: Use 'pod-pending-debug' skill for standard kube-scheduler issues."
|
|
93
|
+
echo ""
|
|
94
|
+
echo " Continuing with basic event analysis..."
|
|
95
|
+
fi
|
|
96
|
+
echo
|
|
97
|
+
|
|
98
|
+
# 3. Events Analysis
|
|
99
|
+
echo "[3/5] Recent Events"
|
|
100
|
+
echo "-------------------"
|
|
101
|
+
kubectl get events -n "$NS" --field-selector "involvedObject.name=$POD" --sort-by='.lastTimestamp' 2>/dev/null | tail -15 || echo "No events found"
|
|
102
|
+
echo
|
|
103
|
+
|
|
104
|
+
# 4. Queue Status (if PodGroup exists and has a queue)
|
|
105
|
+
if [[ -n "$PG" ]]; then
|
|
106
|
+
PG_QUEUE=$(kubectl get podgroup "$PG" -n "$NS" -o jsonpath='{.spec.queue}' 2>/dev/null || echo "")
|
|
107
|
+
if [[ -n "$PG_QUEUE" ]]; then
|
|
108
|
+
echo "[4/5] Queue Status: $PG_QUEUE"
|
|
109
|
+
echo "------------------------------"
|
|
110
|
+
if kubectl get queue "$PG_QUEUE" 2>/dev/null; then
|
|
111
|
+
echo
|
|
112
|
+
QUEUE_STATE=$(kubectl get queue "$PG_QUEUE" -o jsonpath='{.status.state}' 2>/dev/null || echo "Unknown")
|
|
113
|
+
QUEUE_WEIGHT=$(kubectl get queue "$PG_QUEUE" -o jsonpath='{.spec.weight}' 2>/dev/null || echo "N/A")
|
|
114
|
+
echo "Queue State: $QUEUE_STATE"
|
|
115
|
+
echo "Queue Weight: $QUEUE_WEIGHT"
|
|
116
|
+
echo
|
|
117
|
+
echo "Deserved Resources:"
|
|
118
|
+
kubectl get queue "$PG_QUEUE" -o jsonpath='{.status.deserved}' 2>/dev/null || echo " N/A"
|
|
119
|
+
echo
|
|
120
|
+
echo "Allocated Resources:"
|
|
121
|
+
kubectl get queue "$PG_QUEUE" -o jsonpath='{.status.allocated}' 2>/dev/null || echo " N/A"
|
|
122
|
+
else
|
|
123
|
+
echo "Warning: Queue '$PG_QUEUE' not found"
|
|
124
|
+
fi
|
|
125
|
+
echo
|
|
126
|
+
else
|
|
127
|
+
echo "[4/5] Queue Status"
|
|
128
|
+
echo "------------------"
|
|
129
|
+
echo "No queue specified in PodGroup"
|
|
130
|
+
echo
|
|
131
|
+
fi
|
|
132
|
+
else
|
|
133
|
+
echo "[4/5] Queue Status"
|
|
134
|
+
echo "------------------"
|
|
135
|
+
echo "Skipping (no PodGroup found)"
|
|
136
|
+
echo
|
|
137
|
+
fi
|
|
138
|
+
|
|
139
|
+
# 5. Node Resources (verbose mode)
|
|
140
|
+
if [[ "$VERBOSE" == "true" ]]; then
|
|
141
|
+
echo "[5/5] Node Resources"
|
|
142
|
+
echo "--------------------"
|
|
143
|
+
echo "Node Allocatable Resources:"
|
|
144
|
+
kubectl get nodes -o custom-columns='NAME:.metadata.name,CPU:.status.allocatable.cpu,MEM:.status.allocatable.memory,GPU:.status.allocatable.nvidia\.com/gpu' 2>/dev/null | head -10 || echo "Node list not available (check permission to list nodes)"  # fallback: without it a kubectl failure aborts the script under 'set -eo pipefail' before the summary prints
|
|
145
|
+
echo
|
|
146
|
+
|
|
147
|
+
echo "Node Resource Usage (if metrics available):"
|
|
148
|
+
kubectl top nodes 2>/dev/null | head -10 || echo "Metrics not available (requires metrics-server)"
|
|
149
|
+
echo
|
|
150
|
+
fi
|
|
151
|
+
|
|
152
|
+
# Summary
|
|
153
|
+
echo "=== Diagnosis Summary ==="
|
|
154
|
+
echo "Pod: $NS/$POD"
|
|
155
|
+
echo "Phase: $POD_PHASE"
|
|
156
|
+
if [[ -n "$PG" ]]; then
|
|
157
|
+
echo "PodGroup: $PG (Phase: ${PG_PHASE:-Unknown})"
|
|
158
|
+
if [[ -n "${PG_QUEUE:-}" ]]; then
|
|
159
|
+
echo "Queue: $PG_QUEUE (State: ${QUEUE_STATE:-Unknown})"
|
|
160
|
+
fi
|
|
161
|
+
else
|
|
162
|
+
echo "PodGroup: Not found"
|
|
163
|
+
fi
|
|
164
|
+
|
|
165
|
+
if [[ "$POD_PHASE" == "Pending" ]]; then
|
|
166
|
+
echo
|
|
167
|
+
echo "Recommendations:"
|
|
168
|
+
echo "1. Check events above for 'FailedScheduling' reasons"
|
|
169
|
+
echo "2. If PodGroup phase is 'Pending', check Queue capacity"
|
|
170
|
+
echo "3. If minMember is not satisfied, use volcano-gang-scheduling skill"
|
|
171
|
+
echo "4. Check scheduler logs with volcano-scheduler-logs skill"
|
|
172
|
+
fi
|
|
173
|
+
|
|
174
|
+
echo
|
|
175
|
+
echo "=== Diagnosis Complete ==="
|