siclaw 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (270) hide show
  1. package/README.md +75 -114
  2. package/dist/agentbox/gateway-client.d.ts +2 -1
  3. package/dist/agentbox/gateway-client.js +6 -2
  4. package/dist/agentbox/gateway-client.js.map +1 -1
  5. package/dist/agentbox/http-server.js +184 -19
  6. package/dist/agentbox/http-server.js.map +1 -1
  7. package/dist/agentbox/resource-handlers.d.ts +1 -0
  8. package/dist/agentbox/resource-handlers.js +23 -23
  9. package/dist/agentbox/resource-handlers.js.map +1 -1
  10. package/dist/agentbox/session.js +85 -5
  11. package/dist/agentbox/session.js.map +1 -1
  12. package/dist/agentbox-main.d.ts +2 -1
  13. package/dist/agentbox-main.js +65 -18
  14. package/dist/agentbox-main.js.map +1 -1
  15. package/dist/cli-credentials.d.ts +1 -0
  16. package/dist/cli-credentials.js +109 -0
  17. package/dist/cli-credentials.js.map +1 -0
  18. package/dist/cli-first-run.d.ts +11 -0
  19. package/dist/cli-first-run.js +99 -0
  20. package/dist/cli-first-run.js.map +1 -0
  21. package/dist/cli-main.js +33 -11
  22. package/dist/cli-main.js.map +1 -1
  23. package/dist/cli-setup.d.ts +5 -11
  24. package/dist/cli-setup.js +12 -225
  25. package/dist/cli-setup.js.map +1 -1
  26. package/dist/core/agent-factory.d.ts +4 -0
  27. package/dist/core/agent-factory.js +102 -151
  28. package/dist/core/agent-factory.js.map +1 -1
  29. package/dist/core/config.d.ts +10 -3
  30. package/dist/core/config.js +11 -95
  31. package/dist/core/config.js.map +1 -1
  32. package/dist/core/extensions/deep-investigation.d.ts +2 -1
  33. package/dist/core/extensions/deep-investigation.js +144 -24
  34. package/dist/core/extensions/deep-investigation.js.map +1 -1
  35. package/dist/core/extensions/setup.d.ts +8 -0
  36. package/dist/core/extensions/setup.js +669 -0
  37. package/dist/core/extensions/setup.js.map +1 -0
  38. package/dist/core/llm-proxy.js +7 -3
  39. package/dist/core/llm-proxy.js.map +1 -1
  40. package/dist/core/mcp-client.d.ts +0 -10
  41. package/dist/core/mcp-client.js +0 -65
  42. package/dist/core/mcp-client.js.map +1 -1
  43. package/dist/core/prompt.d.ts +1 -1
  44. package/dist/core/prompt.js +42 -5
  45. package/dist/core/prompt.js.map +1 -1
  46. package/dist/core/provider-presets.d.ts +14 -0
  47. package/dist/core/provider-presets.js +81 -0
  48. package/dist/core/provider-presets.js.map +1 -0
  49. package/dist/cron/cron-coordinator.d.ts +2 -0
  50. package/dist/cron/cron-coordinator.js +46 -14
  51. package/dist/cron/cron-coordinator.js.map +1 -1
  52. package/dist/cron/cron-executor.js +33 -8
  53. package/dist/cron/cron-executor.js.map +1 -1
  54. package/dist/cron/cron-scheduler.d.ts +1 -1
  55. package/dist/cron/gateway-client.d.ts +5 -0
  56. package/dist/cron/gateway-client.js +43 -8
  57. package/dist/cron/gateway-client.js.map +1 -1
  58. package/dist/cron-main.js +39 -9
  59. package/dist/cron-main.js.map +1 -1
  60. package/dist/gateway/agentbox/client.d.ts +11 -0
  61. package/dist/gateway/agentbox/client.js +18 -0
  62. package/dist/gateway/agentbox/client.js.map +1 -1
  63. package/dist/gateway/agentbox/k8s-spawner.d.ts +11 -2
  64. package/dist/gateway/agentbox/k8s-spawner.js +95 -52
  65. package/dist/gateway/agentbox/k8s-spawner.js.map +1 -1
  66. package/dist/gateway/agentbox/local-spawner.d.ts +1 -1
  67. package/dist/gateway/agentbox/local-spawner.js +4 -2
  68. package/dist/gateway/agentbox/local-spawner.js.map +1 -1
  69. package/dist/gateway/agentbox/manager.d.ts +0 -10
  70. package/dist/gateway/agentbox/manager.js +11 -30
  71. package/dist/gateway/agentbox/manager.js.map +1 -1
  72. package/dist/gateway/agentbox/types.d.ts +6 -4
  73. package/dist/gateway/cron/cron-service.d.ts +49 -0
  74. package/dist/gateway/cron/cron-service.js +259 -0
  75. package/dist/gateway/cron/cron-service.js.map +1 -0
  76. package/dist/gateway/db/init-schema.js +44 -0
  77. package/dist/gateway/db/init-schema.js.map +1 -1
  78. package/dist/gateway/db/migrate-sqlite.js +73 -4
  79. package/dist/gateway/db/migrate-sqlite.js.map +1 -1
  80. package/dist/gateway/db/repositories/chat-repo.d.ts +56 -2
  81. package/dist/gateway/db/repositories/chat-repo.js +132 -2
  82. package/dist/gateway/db/repositories/chat-repo.js.map +1 -1
  83. package/dist/gateway/db/repositories/config-repo.d.ts +31 -2
  84. package/dist/gateway/db/repositories/config-repo.js +57 -7
  85. package/dist/gateway/db/repositories/config-repo.js.map +1 -1
  86. package/dist/gateway/db/repositories/env-repo.d.ts +14 -0
  87. package/dist/gateway/db/repositories/env-repo.js +15 -2
  88. package/dist/gateway/db/repositories/env-repo.js.map +1 -1
  89. package/dist/gateway/db/repositories/model-config-repo.d.ts +1 -1
  90. package/dist/gateway/db/repositories/model-config-repo.js +26 -12
  91. package/dist/gateway/db/repositories/model-config-repo.js.map +1 -1
  92. package/dist/gateway/db/repositories/skill-repo.d.ts +0 -5
  93. package/dist/gateway/db/repositories/skill-review-repo.d.ts +1 -0
  94. package/dist/gateway/db/repositories/skill-review-repo.js +4 -1
  95. package/dist/gateway/db/repositories/skill-review-repo.js.map +1 -1
  96. package/dist/gateway/db/repositories/skill-version-repo.js +0 -1
  97. package/dist/gateway/db/repositories/skill-version-repo.js.map +1 -1
  98. package/dist/gateway/db/repositories/system-config-repo.d.ts +1 -1
  99. package/dist/gateway/db/repositories/system-config-repo.js +2 -1
  100. package/dist/gateway/db/repositories/system-config-repo.js.map +1 -1
  101. package/dist/gateway/db/repositories/user-env-config-repo.d.ts +13 -0
  102. package/dist/gateway/db/repositories/user-env-config-repo.js +11 -0
  103. package/dist/gateway/db/repositories/user-env-config-repo.js.map +1 -1
  104. package/dist/gateway/db/repositories/workspace-repo.d.ts +3 -2
  105. package/dist/gateway/db/repositories/workspace-repo.js +6 -2
  106. package/dist/gateway/db/repositories/workspace-repo.js.map +1 -1
  107. package/dist/gateway/db/schema-mysql.d.ts +473 -51
  108. package/dist/gateway/db/schema-mysql.js +35 -4
  109. package/dist/gateway/db/schema-mysql.js.map +1 -1
  110. package/dist/gateway/db/schema-sqlite.d.ts +522 -57
  111. package/dist/gateway/db/schema-sqlite.js +38 -6
  112. package/dist/gateway/db/schema-sqlite.js.map +1 -1
  113. package/dist/gateway/db/schema.d.ts +471 -51
  114. package/dist/gateway/db/schema.js +1 -1
  115. package/dist/gateway/db/schema.js.map +1 -1
  116. package/dist/gateway/metrics-aggregator.d.ts +65 -0
  117. package/dist/gateway/metrics-aggregator.js +244 -0
  118. package/dist/gateway/metrics-aggregator.js.map +1 -0
  119. package/dist/gateway/plugins/channel-bridge.d.ts +4 -1
  120. package/dist/gateway/plugins/channel-bridge.js +78 -86
  121. package/dist/gateway/plugins/channel-bridge.js.map +1 -1
  122. package/dist/gateway/rpc-methods.d.ts +4 -2
  123. package/dist/gateway/rpc-methods.js +962 -163
  124. package/dist/gateway/rpc-methods.js.map +1 -1
  125. package/dist/gateway/security/cert-manager.d.ts +2 -2
  126. package/dist/gateway/security/cert-manager.js +4 -2
  127. package/dist/gateway/security/cert-manager.js.map +1 -1
  128. package/dist/gateway/server.d.ts +4 -8
  129. package/dist/gateway/server.js +297 -261
  130. package/dist/gateway/server.js.map +1 -1
  131. package/dist/gateway/skills/file-writer.js +17 -11
  132. package/dist/gateway/skills/file-writer.js.map +1 -1
  133. package/dist/gateway/skills/script-evaluator.js +12 -9
  134. package/dist/gateway/skills/script-evaluator.js.map +1 -1
  135. package/dist/gateway/web/dist/assets/index-0p17ZeTP.js +740 -0
  136. package/dist/gateway/web/dist/assets/index-9eP6nPUq.js +741 -0
  137. package/dist/gateway/web/dist/assets/index-9eP6nPUq.js.map +1 -0
  138. package/dist/gateway/web/dist/assets/index-CAmSY91d.js +675 -0
  139. package/dist/gateway/web/dist/assets/index-DMFEh8Pp.css +1 -0
  140. package/dist/gateway/web/dist/assets/index-DyowBCEj.css +1 -0
  141. package/dist/gateway/web/dist/assets/index-PDK5JJDO.css +1 -0
  142. package/dist/gateway/web/dist/index.html +2 -2
  143. package/dist/gateway-main.js +27 -10
  144. package/dist/gateway-main.js.map +1 -1
  145. package/dist/memory/embeddings.js +5 -4
  146. package/dist/memory/embeddings.js.map +1 -1
  147. package/dist/memory/indexer.d.ts +23 -3
  148. package/dist/memory/indexer.js +235 -23
  149. package/dist/memory/indexer.js.map +1 -1
  150. package/dist/memory/schema.js +15 -1
  151. package/dist/memory/schema.js.map +1 -1
  152. package/dist/memory/types.d.ts +18 -0
  153. package/dist/memory/types.js +6 -1
  154. package/dist/memory/types.js.map +1 -1
  155. package/dist/shared/detect-language.d.ts +12 -0
  156. package/dist/shared/detect-language.js +78 -0
  157. package/dist/shared/detect-language.js.map +1 -0
  158. package/dist/shared/diagnostic-events.d.ts +70 -0
  159. package/dist/shared/diagnostic-events.js +38 -0
  160. package/dist/shared/diagnostic-events.js.map +1 -0
  161. package/dist/shared/local-collector.d.ts +56 -0
  162. package/dist/shared/local-collector.js +284 -0
  163. package/dist/shared/local-collector.js.map +1 -0
  164. package/dist/shared/metrics-types.d.ts +64 -0
  165. package/dist/shared/metrics-types.js +25 -0
  166. package/dist/shared/metrics-types.js.map +1 -0
  167. package/dist/shared/metrics.d.ts +19 -0
  168. package/dist/shared/metrics.js +185 -0
  169. package/dist/shared/metrics.js.map +1 -0
  170. package/dist/shared/path-utils.d.ts +15 -0
  171. package/dist/shared/path-utils.js +23 -0
  172. package/dist/shared/path-utils.js.map +1 -0
  173. package/dist/shared/retry.d.ts +35 -0
  174. package/dist/shared/retry.js +61 -0
  175. package/dist/shared/retry.js.map +1 -0
  176. package/dist/tools/command-sets.d.ts +18 -2
  177. package/dist/tools/command-sets.js +207 -32
  178. package/dist/tools/command-sets.js.map +1 -1
  179. package/dist/tools/command-validator.d.ts +56 -0
  180. package/dist/tools/command-validator.js +357 -0
  181. package/dist/tools/command-validator.js.map +1 -0
  182. package/dist/tools/create-skill.js +26 -1
  183. package/dist/tools/create-skill.js.map +1 -1
  184. package/dist/tools/credential-list.js +1 -23
  185. package/dist/tools/credential-list.js.map +1 -1
  186. package/dist/tools/credential-manager.d.ts +98 -0
  187. package/dist/tools/credential-manager.js +313 -0
  188. package/dist/tools/credential-manager.js.map +1 -0
  189. package/dist/tools/deep-search/engine.js +184 -127
  190. package/dist/tools/deep-search/engine.js.map +1 -1
  191. package/dist/tools/deep-search/prompts.d.ts +10 -2
  192. package/dist/tools/deep-search/prompts.js +37 -36
  193. package/dist/tools/deep-search/prompts.js.map +1 -1
  194. package/dist/tools/deep-search/schemas.d.ts +87 -0
  195. package/dist/tools/deep-search/schemas.js +85 -0
  196. package/dist/tools/deep-search/schemas.js.map +1 -0
  197. package/dist/tools/deep-search/sub-agent.d.ts +21 -0
  198. package/dist/tools/deep-search/sub-agent.js +153 -4
  199. package/dist/tools/deep-search/sub-agent.js.map +1 -1
  200. package/dist/tools/deep-search/tool.js +1 -0
  201. package/dist/tools/deep-search/tool.js.map +1 -1
  202. package/dist/tools/deep-search/types.d.ts +2 -0
  203. package/dist/tools/deep-search/types.js.map +1 -1
  204. package/dist/tools/dp-tools.js +29 -5
  205. package/dist/tools/dp-tools.js.map +1 -1
  206. package/dist/tools/exec-utils.d.ts +85 -0
  207. package/dist/tools/exec-utils.js +294 -0
  208. package/dist/tools/exec-utils.js.map +1 -0
  209. package/dist/tools/fork-skill.js +14 -2
  210. package/dist/tools/fork-skill.js.map +1 -1
  211. package/dist/tools/investigation-feedback.d.ts +3 -0
  212. package/dist/tools/investigation-feedback.js +71 -0
  213. package/dist/tools/investigation-feedback.js.map +1 -0
  214. package/dist/tools/manage-schedule.js +16 -6
  215. package/dist/tools/manage-schedule.js.map +1 -1
  216. package/dist/tools/netns-script.js +27 -281
  217. package/dist/tools/netns-script.js.map +1 -1
  218. package/dist/tools/node-exec.d.ts +2 -14
  219. package/dist/tools/node-exec.js +18 -225
  220. package/dist/tools/node-exec.js.map +1 -1
  221. package/dist/tools/node-script.js +14 -168
  222. package/dist/tools/node-script.js.map +1 -1
  223. package/dist/tools/pod-exec.d.ts +1 -1
  224. package/dist/tools/pod-exec.js +10 -26
  225. package/dist/tools/pod-exec.js.map +1 -1
  226. package/dist/tools/pod-nsenter-exec.js +21 -225
  227. package/dist/tools/pod-nsenter-exec.js.map +1 -1
  228. package/dist/tools/pod-script.js +10 -19
  229. package/dist/tools/pod-script.js.map +1 -1
  230. package/dist/tools/restricted-bash.d.ts +1 -17
  231. package/dist/tools/restricted-bash.js +38 -252
  232. package/dist/tools/restricted-bash.js.map +1 -1
  233. package/dist/tools/run-skill.d.ts +3 -1
  234. package/dist/tools/run-skill.js +21 -1
  235. package/dist/tools/run-skill.js.map +1 -1
  236. package/dist/tools/script-resolver.d.ts +3 -1
  237. package/dist/tools/script-resolver.js +74 -30
  238. package/dist/tools/script-resolver.js.map +1 -1
  239. package/dist/tools/update-skill.js +17 -6
  240. package/dist/tools/update-skill.js.map +1 -1
  241. package/package.json +8 -6
  242. package/siclaw.mjs +10 -1
  243. package/skills/core/cluster-events/SKILL.md +1 -1
  244. package/skills/core/deep-investigation/SKILL.md +11 -0
  245. package/skills/core/deployment-rollout-debug/SKILL.md +1 -1
  246. package/skills/core/dns-debug/SKILL.md +1 -0
  247. package/skills/core/meta.json +12 -1
  248. package/skills/core/networkpolicy-debug/SKILL.md +332 -0
  249. package/skills/core/node-logs/scripts/get-node-logs.sh +19 -9
  250. package/skills/core/pod-pending-debug/SKILL.md +1 -0
  251. package/skills/core/quota-debug/SKILL.md +203 -0
  252. package/skills/core/service-debug/SKILL.md +1 -0
  253. package/skills/core/statefulset-debug/SKILL.md +280 -0
  254. package/skills/core/volcano-diagnose-pod/SKILL.md +196 -0
  255. package/skills/core/volcano-diagnose-pod/scripts/diagnose-pod.sh +175 -0
  256. package/skills/core/volcano-gang-scheduling/SKILL.md +299 -0
  257. package/skills/core/volcano-job-diagnose/SKILL.md +319 -0
  258. package/skills/core/volcano-job-diagnose/scripts/diagnose-job.sh +253 -0
  259. package/skills/core/volcano-node-resources/SKILL.md +334 -0
  260. package/skills/core/volcano-node-resources/scripts/get-node-resources.sh +281 -0
  261. package/skills/core/volcano-queue-diagnose/SKILL.md +294 -0
  262. package/skills/core/volcano-queue-diagnose/scripts/diagnose-queue.sh +283 -0
  263. package/skills/core/volcano-resource-insufficient/SKILL.md +315 -0
  264. package/skills/core/volcano-scheduler-config/SKILL.md +371 -0
  265. package/skills/core/volcano-scheduler-config/scripts/get-scheduler-config.sh +297 -0
  266. package/skills/core/volcano-scheduler-logs/SKILL.md +241 -0
  267. package/skills/core/volcano-scheduler-logs/scripts/get-scheduler-logs.sh +159 -0
  268. package/skills/platform/create-skill/SKILL.md +35 -3
  269. package/skills/platform/manage-skill/SKILL.md +9 -2
  270. package/skills/platform/update-skill/SKILL.md +17 -6
@@ -0,0 +1,280 @@
1
+ ---
2
+ name: statefulset-debug
3
+ description: >-
4
+ Diagnose StatefulSet rollout and scaling failures (ordered update stuck, OnDelete not updating, partition misconfiguration, PVC binding deadlocks).
5
+ Checks update strategy, pod ordinal progression, PVC bindings, and ordered startup to identify why a StatefulSet is not progressing.
6
+ ---
7
+
8
+ # StatefulSet Rollout & Scaling Failure Diagnosis
9
+
10
+ When a StatefulSet rollout is stuck, pods are not updating, or scaling is not progressing, follow this flow to identify the root cause.
11
+
12
+ **Scope:** This skill is for **diagnosis only**. Once you identify the root cause, report it to the user and stop. Do NOT attempt to modify the StatefulSet, delete pods, or change PVCs — that should be left to the user.
13
+
14
+ **When to use:** A StatefulSet is not progressing — pods are not updating to the new version, scaling up/down is stuck, or specific ordinal pods are not becoming ready.
15
+
16
+ **Not for Deployments:** Deployment rollouts have different semantics (parallel, unordered). Use `deployment-rollout-debug` for Deployments.
17
+
18
+ ## Key Concepts
19
+
20
+ StatefulSets differ fundamentally from Deployments:
21
+ - **Fixed pod identity** — pods have stable names with ordinal suffixes (pod-0, pod-1, ...)
22
+ - **Ordered operations** — updates go in reverse order (N-1 → 0), scaling up goes in forward order (0 → N-1)
23
+ - **Per-pod PVCs** — each pod gets its own PersistentVolumeClaim via `volumeClaimTemplates`
24
+ - **Blocking progression** — in OrderedReady mode (default), if pod at ordinal K is not Ready, all pods with ordinal < K will NOT be updated
25
+
26
+ ## Diagnostic Flow
27
+
28
+ ### 1. Get StatefulSet overview
29
+
30
+ ```bash
31
+ kubectl get statefulset <name> -n <ns> -o wide
32
+ ```
33
+
34
+ Compare the columns:
35
+ - **READY** — pods that are running and ready
36
+ - **REPLICAS** — desired replica count (from `spec.replicas`)
37
+ - **UP-TO-DATE** — pods already running the new version (their revision matches `updateRevision`); the rollout is complete once `currentRevision` == `updateRevision`
38
+
39
+ If the number of ready pods is lower than the desired replica count, or not all pods are up to date, the rollout or scaling is incomplete.
40
+
41
+ ### 2. Describe the StatefulSet
42
+
43
+ ```bash
44
+ kubectl describe statefulset <name> -n <ns>
45
+ ```
46
+
47
+ Focus on:
48
+ - **Update Strategy** — `RollingUpdate` or `OnDelete`
49
+ - **Partition** — if set, only pods with ordinal ≥ partition are updated
50
+ - **maxUnavailable** — if set (Kubernetes 1.24+), allows multiple pods to be updated simultaneously instead of one-at-a-time
51
+ - **Current Revision / Update Revision** — if different, an update is in progress
52
+ - **Events** — look for errors or warnings
53
+
54
+ ### 3. Check pod status by ordinal
55
+
56
+ First get the StatefulSet's pod selector to reliably find its pods:
57
+
58
+ ```bash
59
+ kubectl get statefulset <name> -n <ns> -o jsonpath='{.spec.selector.matchLabels}'
60
+ ```
61
+
62
+ Then use the returned labels to list pods:
63
+
64
+ ```bash
65
+ kubectl get pods -n <ns> -l <key>=<value> --sort-by='.metadata.name'
66
+ ```
67
+
68
+ Identify which ordinal pod is stuck. In a StatefulSet with OrderedReady policy, **the stuck pod blocks all subsequent operations**.
69
+
70
+ ### 4. Match the failure pattern
71
+
72
+ ---
73
+
74
+ #### OnDelete strategy — Pods not updating after StatefulSet change
75
+
76
+ The StatefulSet uses `updateStrategy.type: OnDelete`. In this mode, Kubernetes does **not** automatically update pods — the user must manually delete each pod for it to be recreated with the new spec.
77
+
78
+ ```bash
79
+ kubectl get statefulset <name> -n <ns> -o jsonpath='{.spec.updateStrategy}'
80
+ ```
81
+
82
+ If the output shows `{"type":"OnDelete"}` or has no `rollingUpdate` field, the StatefulSet uses the OnDelete strategy.
83
+
84
+ Check if the current and update revisions differ:
85
+
86
+ ```bash
87
+ kubectl get statefulset <name> -n <ns> -o jsonpath='current={.status.currentRevision} update={.status.updateRevision}'
88
+ ```
89
+
90
+ If they differ, the StatefulSet spec has been updated but pods are still running the old version. This is **expected behavior** for OnDelete — pods must be manually deleted to pick up the new version.
91
+
92
+ Check which pods are still on the old revision (use the selector from step 3):
93
+
94
+ ```bash
95
+ kubectl get pods -n <ns> -l <key>=<value> -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.metadata.labels.controller-revision-hash}{"\n"}{end}'
96
+ ```
97
+
98
+ Pods whose `controller-revision-hash` matches `currentRevision` (not `updateRevision`) are still on the old version.
99
+
100
+ ---
101
+
102
+ #### RollingUpdate stuck at a specific ordinal — Ordered update blocked
103
+
104
+ In RollingUpdate mode, StatefulSet updates pods in **reverse ordinal order** (N-1 → N-2 → ... → 0). By default (one-at-a-time), if pod at ordinal K is not Ready, the update stops — pods K-1, K-2, ..., 0 will not be updated.
105
+
106
+ **Check maxUnavailable** (introduced as alpha in Kubernetes 1.24; only honored when the `MaxUnavailableStatefulSet` feature gate is enabled):
107
+
108
+ ```bash
109
+ kubectl get statefulset <name> -n <ns> -o jsonpath='{.spec.updateStrategy.rollingUpdate.maxUnavailable}'
110
+ ```
111
+
112
+ If `maxUnavailable` is set (e.g., `3`), multiple pods can be updated simultaneously instead of strict one-at-a-time. In this case, seeing 2-3 pods updating at once is normal — not a sign of being stuck. Only investigate if the number of updating pods is below `maxUnavailable` for an extended period, or if specific pods are stuck in a non-Ready state.
113
+
114
+ Find pods that are not Ready:
115
+
116
+ ```bash
117
+ kubectl get pods -n <ns> -l <key>=<value> --sort-by='.metadata.name'
118
+ ```
119
+
120
+ Check the stuck pod's status:
121
+ - **Pending** → Use `pod-pending-debug`
122
+ - **CrashLoopBackOff / Error** → Use `pod-crash-debug`
123
+ - **ImagePullBackOff** → Use `image-pull-debug`
124
+ - **Running but not Ready** → Check readiness probe (see below)
125
+
126
+ If the pod is Running but not Ready:
127
+
128
+ ```bash
129
+ kubectl describe pod <stuck-pod> -n <ns>
130
+ ```
131
+
132
+ Look for `Readiness probe failed` events. Common causes:
133
+ - Application not listening on the expected port after config change
134
+ - New version has a bug that prevents health check from passing
135
+ - Readiness probe configuration too aggressive for the new version's startup time
136
+
137
+ ---
138
+
139
+ #### Partition update — Only some pods updated
140
+
141
+ The StatefulSet has `spec.updateStrategy.rollingUpdate.partition` set. Only pods with ordinal **≥ partition** are updated; pods with ordinal < partition remain on the old version.
142
+
143
+ ```bash
144
+ kubectl get statefulset <name> -n <ns> -o jsonpath='{.spec.updateStrategy.rollingUpdate.partition}'
145
+ ```
146
+
147
+ If this returns a number (e.g., `3`), then pods 0, 1, 2 will NOT be updated. This is often used intentionally for **canary rollouts** — update a subset first, verify, then lower the partition to 0 to roll out fully.
148
+
149
+ If the user expects all pods to be updated, the partition value needs to be set to `0` or removed.
150
+
151
+ ---
152
+
153
+ #### Scaling up stuck — Ordered creation blocked
154
+
155
+ When scaling up, StatefulSet creates pods in **forward ordinal order** (0 → 1 → 2 → ...). Pod at ordinal K+1 is not created until pod K is Running and Ready.
156
+
157
+ ```bash
158
+ kubectl get pods -n <ns> | grep <statefulset-name>
159
+ ```
160
+
161
+ Find the highest ordinal pod that exists — the next ordinal is waiting for this pod to become Ready.
162
+
163
+ Check why the current highest pod is not Ready (same diagnosis as the "stuck at specific ordinal" pattern above).
164
+
165
+ Also check the `podManagementPolicy` field:
166
+
167
+ ```bash
168
+ kubectl get statefulset <name> -n <ns> -o jsonpath='{.spec.podManagementPolicy}'
169
+ ```
170
+
171
+ - **OrderedReady** (default) — strict ordered creation, one at a time
172
+ - **Parallel** — all pods are created simultaneously (no ordering guarantee)
173
+
174
+ If the policy is `Parallel` and pods are still stuck, the issue is not ordering — check individual pod status.
175
+
176
+ ---
177
+
178
+ #### PVC binding deadlock — Pod stuck in Pending due to volume topology
179
+
180
+ StatefulSet pods use `volumeClaimTemplates` to create per-pod PVCs. If the PVC is bound to a PV in a specific availability zone (AZ) or node, but that node/AZ has no resources, the pod cannot be scheduled.
181
+
182
+ Check PVC status for the stuck pod:
183
+
184
+ ```bash
185
+ kubectl get pvc -n <ns> | grep <statefulset-name>
186
+ ```
187
+
188
+ ```bash
189
+ kubectl describe pvc <pvc-name> -n <ns>
190
+ ```
191
+
192
+ Check the StorageClass's `volumeBindingMode`:
193
+
194
+ ```bash
195
+ kubectl get storageclass $(kubectl get pvc <pvc-name> -n <ns> -o jsonpath='{.spec.storageClassName}') -o jsonpath='{.volumeBindingMode}'
196
+ ```
197
+
198
+ - **Immediate** — PVC is bound to a PV as soon as created, regardless of pod scheduling. If the PV is in a different zone than the only available nodes, the pod cannot be scheduled.
199
+ - **WaitForFirstConsumer** — PVC binding is delayed until the pod is scheduled. If no node can satisfy both the pod's scheduling constraints and the storage topology, the PVC stays `Pending` and the pod stays `Pending` — a deadlock.
200
+
201
+ Check if the PV has a node affinity constraint:
202
+
203
+ ```bash
204
+ kubectl get pv <pv-name> -o jsonpath='{.spec.nodeAffinity}'
205
+ ```
206
+
207
+ If the PV is locked to a specific node/zone:
208
+ - Check if that node has available resources: `kubectl describe node <node>`
209
+ - Check if that node is healthy: `kubectl get node <node>`
210
+
211
+ **Common scenario:** A node was replaced or drained, but the PV is still bound to the old node's zone. The new pod can only be scheduled to nodes that can access this PV, but those nodes may be full or tainted.
212
+
213
+ For further PVC diagnosis, use the `pvc-debug` skill.
214
+
215
+ ---
216
+
217
+ #### Scaling down — PVCs left behind
218
+
219
+ When a StatefulSet is scaled down, pods are deleted in **reverse ordinal order** (N-1 → N-2 → ...). However, Kubernetes does **not** automatically delete the associated PVCs.
220
+
221
+ ```bash
222
+ kubectl get pvc -n <ns> | grep <statefulset-name>
223
+ ```
224
+
225
+ If there are PVCs for ordinals that no longer exist (e.g., `data-myapp-3` when replicas is 2), these are orphaned PVCs from a previous scale-down.
226
+
227
+ This is by design to prevent data loss. But when scaling back up, the new pod will reattach to the old PVC with stale data, which may cause application issues.
228
+
229
+ Check the StatefulSet's `persistentVolumeClaimRetentionPolicy` (Kubernetes 1.27+):
230
+
231
+ ```bash
232
+ kubectl get statefulset <name> -n <ns> -o jsonpath='{.spec.persistentVolumeClaimRetentionPolicy}'
233
+ ```
234
+
235
+ - **whenDeleted: Retain** (default) — PVCs are kept when StatefulSet is deleted
236
+ - **whenScaled: Retain** (default) — PVCs are kept when scaling down
237
+ - **whenScaled: Delete** — PVCs are automatically deleted on scale-down
238
+
239
+ ---
240
+
241
+ #### Pod stuck in Terminating during update or scale-down
242
+
243
+ During an update or scale-down, if a pod is stuck in `Terminating`, the next operation cannot proceed.
244
+
245
+ ```bash
246
+ kubectl describe pod <terminating-pod> -n <ns>
247
+ ```
248
+
249
+ First check if a PodDisruptionBudget (PDB) is preventing the deletion:
250
+
251
+ ```bash
252
+ kubectl get pdb -n <ns>
253
+ ```
254
+
255
+ ```bash
256
+ kubectl describe pdb <pdb-name> -n <ns>
257
+ ```
258
+
259
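+ A PDB limits voluntary evictions (for example from `kubectl drain`), not the StatefulSet controller's own pod deletions, so it matters mainly when a node drain overlaps with the update or scale-down. Check the PDB's `minAvailable` / `maxUnavailable` and `status.disruptionsAllowed` — if `disruptionsAllowed` is `0`, no more pods can be evicted until other pods become Ready.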
260
+
261
+ If PDB is not the issue, check other common causes:
262
+ - **Finalizer blocking deletion** — check `metadata.finalizers`
263
+ - **PreStop hook hanging** — a long-running preStop hook delays termination
264
+ - **Process not responding to SIGTERM** — the container process ignores shutdown signals and must wait for `terminationGracePeriodSeconds` to expire
265
+ - **Volume unmount stuck** — the volume cannot be detached from the node
266
+
267
+ Check the grace period:
268
+
269
+ ```bash
270
+ kubectl get pod <pod> -n <ns> -o jsonpath='{.spec.terminationGracePeriodSeconds}'
271
+ ```
272
+
273
+ ## Notes
274
+
275
+ - StatefulSet updates go in **reverse** ordinal order (N-1 → 0), but scaling up goes in **forward** order (0 → N-1). This is a common source of confusion.
276
+ - `OnDelete` is frequently used in database StatefulSets (MySQL, PostgreSQL, etc.) where the operator wants manual control over when each replica is restarted. If a user complains that pods are not updating, check the strategy before assuming there is a bug.
277
+ - The `partition` field is for canary rollouts. A common workflow: set partition=N-1 to update only the last pod, verify, then set partition=0 to roll out to all pods. If a user sees partial updates, check partition before investigating further.
278
+ - PVCs created by `volumeClaimTemplates` follow the naming convention `<volumeClaimTemplate-name>-<statefulset-name>-<ordinal>`. Use this pattern to find PVCs for specific ordinals.
279
+ - Unlike Deployments, StatefulSets do NOT create new ReplicaSets for updates. They replace each pod while preserving its identity (delete the old pod, then create a new pod with the same name and PVC).
280
+ - For cross-reference: if the stuck pod's issue is at the scheduling level, use `pod-pending-debug`. If it is crashing, use `pod-crash-debug`. If PVCs are not binding, use `pvc-debug`.
@@ -0,0 +1,196 @@
1
+ ---
2
+ name: volcano-diagnose-pod
3
+ description: >-
4
+ Diagnose Volcano-managed Pod scheduling issues.
5
+ Checks Pod status, PodGroup, events, and Queue to identify scheduling failures.
6
+ ---
7
+
8
+ # Volcano Pod Diagnosis
9
+
10
+ Diagnose Volcano-managed Pod scheduling issues. This skill checks Pod status, associated PodGroup, scheduling events, and Queue configuration to identify why a Pod cannot be scheduled.
11
+
12
+ **Scope:** This skill is for **diagnosis only**. Once you identify the root cause, report it to the user and stop. Do NOT attempt to modify pod specs, PodGroups, or Queues — that should be left to the user.
13
+
14
+ ## Usage
15
+
16
+ ```bash
17
+ bash skills/core/volcano-diagnose-pod/scripts/diagnose-pod.sh --pod <pod-name> --namespace <namespace>
18
+ ```
19
+
20
+ ## Parameters
21
+
22
+ | Parameter | Required | Description |
23
+ |-----------|----------|-------------|
24
+ | `--pod POD` | yes | Pod name to diagnose |
25
+ | `--namespace NS` | no | Namespace (default: `default`) |
26
+ | `--verbose` | no | Show detailed output including node resources |
27
+
28
+ ## Examples
29
+
30
+ Diagnose a pending pod in default namespace:
31
+ ```bash
32
+ bash skills/core/volcano-diagnose-pod/scripts/diagnose-pod.sh --pod my-job-0
33
+ ```
34
+
35
+ Diagnose a pod in specific namespace:
36
+ ```bash
37
+ bash skills/core/volcano-diagnose-pod/scripts/diagnose-pod.sh --pod my-job-0 --namespace training
38
+ ```
39
+
40
+ Verbose mode with node resource information:
41
+ ```bash
42
+ bash skills/core/volcano-diagnose-pod/scripts/diagnose-pod.sh --pod my-job-0 --namespace training --verbose
43
+ ```
44
+
45
+ ## Diagnostic Flow
46
+
47
+ The script performs the following checks in order:
48
+
49
+ ### 1. Pod Status
50
+ Check the Pod's current phase and conditions.
51
+
52
+ ```bash
53
+ kubectl get pod <pod> -n <ns> -o wide
54
+ kubectl describe pod <pod> -n <ns>
55
+ ```
56
+
57
+ ### 2. PodGroup Status
58
+ Check if the Pod is associated with a PodGroup and its scheduling status.
59
+
60
+ ```bash
61
+ kubectl get pod <pod> -n <ns> -o jsonpath='{.metadata.annotations.scheduling\.volcano\.sh/pod-group}'
62
+ kubectl get podgroup <podgroup> -n <ns>
63
+ ```
64
+
65
+ Key fields to check:
66
+ - `spec.minMember`: Minimum members required for Gang scheduling
67
+ - `status.phase`: Pending, Inqueue, Running, Unknown
68
+ - `status.running`: Number of running pods
69
+ - `status.pending`: Number of pending pods
70
+
71
+ ### 3. Events Analysis
72
+ Check scheduling events for failure reasons.
73
+
74
+ ```bash
75
+ kubectl get events -n <ns> --field-selector involvedObject.name=<pod> --sort-by='.lastTimestamp'
76
+ ```
77
+
78
+ Look for these event patterns:
79
+
80
+ #### `FailedScheduling` - General scheduling failure
81
+ The scheduler attempted but failed to schedule the pod. Check the message for specific reasons.
82
+
83
+ **Volcano-specific sub-patterns:**
84
+
85
+ | Event Message | Meaning | Next Step |
86
+ |---------------|---------|-----------|
87
+ | `0/N nodes are available` + `minMember` | Gang constraint not satisfied | Use `volcano-gang-scheduling` |
88
+ | `exceeded quota` / `queue resource exceeded` | Queue deserved resources exhausted | Use `volcano-queue-diagnose` |
89
+ | `Insufficient cpu/memory` + Gang mention | Resource shortage blocking Gang | Use `volcano-resource-insufficient` |
90
+ | `pod group is not ready` | PodGroup not in Inqueue phase | Check PodGroup status |
91
+ | `task <name> is not ready` | Task dependencies not met | Check dependent tasks |
92
+
93
+ > **Quick Reference vs Detailed Analysis:** The table above provides a quick lookup for common patterns. The sections below provide detailed analysis, additional context, and more diagnostic commands for each pattern.
94
+
95
+ #### `Insufficient cpu` / `Insufficient memory` - Resource shortage
96
+ No node has enough allocatable resources. Check:
97
+ - Node resources: `kubectl top nodes`
98
+ - Pod resource requests: `kubectl get pod <pod> -n <ns> -o jsonpath='{.spec.containers[*].resources.requests}'`
99
+
100
+ **Volcano context:** If this is a Gang-scheduled pod, even if total cluster resources are sufficient, you need enough resources **simultaneously** on enough nodes. Use `volcano-resource-insufficient` to check fragmentation.
101
+
102
+ #### `minMember` not satisfied - Gang constraint
103
+ The PodGroup requires `minMember` pods to be scheduled simultaneously, but the cluster cannot satisfy this. Use `volcano-gang-scheduling` skill for detailed diagnosis.
104
+
105
+ **Key insight:** Even if `kubectl top nodes` shows enough total resources, Gang requires **simultaneous** availability on **different nodes**.
106
+
107
+ #### `queue resource exceeded` - Queue quota limit
108
+ The Queue associated with this Pod has exceeded its deserved resources. Check Queue status with `volcano-queue-diagnose` skill.
109
+
110
+ **Volcano-specific terms you might see:**
111
+ - `overused` - Queue has exceeded its fair share
112
+ - `deserved resources` - Calculated from queue weight proportion
113
+ - `allocated resources` - Currently used by jobs in this queue
114
+
115
+ #### `reclaim` events - Resource reclamation triggered
116
+ If you see events mentioning `reclaim`:
117
+ - Another queue is trying to reclaim resources from your pod's queue
118
+ - Your queue may be `over-allocated` (allocated > deserved)
119
+ - Check queue status: `volcano-queue-diagnose --queue <queue>`
120
+
121
+ #### `preempt` events - Priority preemption
122
+ Higher priority workload is evicting this pod. Check:
123
+ - Pod priority class: `kubectl get pod <pod> -n <ns> -o jsonpath='{.spec.priorityClassName}'`
124
+ - Preemptor details in scheduler logs: `volcano-scheduler-logs --keyword preempt`
125
+
126
+ #### `enqueue` related events
127
+ - `PodGroup is enqueued` - PodGroup admitted to queue, ready for scheduling
128
+ - `PodGroup is pending` - Waiting for queue admission (capacity or resource check)
129
+ - `enqueue failed` - Failed admission check (overcommit, queue closed, etc.)
130
+
131
+ ### 4. Queue Status
132
+ Check the Queue configuration and resource allocation.
133
+
134
+ ```bash
135
+ kubectl get podgroup <podgroup> -n <ns> -o jsonpath='{.spec.queue}'
136
+ kubectl get queue <queue>
137
+ kubectl describe queue <queue>
138
+ ```
139
+
140
+ Key fields:
141
+ - `spec.weight`: Queue weight for resource sharing
142
+ - `spec.capability`: Maximum resources the queue can use
143
+ - `status.state`: Open, Closed, or Closing
144
+ - `status.deserved`: Resources deserved by this queue
145
+ - `status.allocated`: Resources currently allocated
146
+
147
+ ### 5. Node Resources (verbose mode)
148
+ When `--verbose` is specified, also check node allocatable resources.
149
+
150
+ ```bash
151
+ kubectl get nodes -o custom-columns='NAME:.metadata.name,CPU:.status.allocatable.cpu,MEM:.status.allocatable.memory'
152
+ ```
153
+
154
+ ## Common Issues and Solutions
155
+
156
+ ### Pod stuck in Pending, no events
157
+ - Check if Volcano scheduler is running: `kubectl get pods -n volcano-system -l app=volcano-scheduler`
158
+ - Check if Volcano controller-manager is running: `kubectl get pods -n volcano-system -l app=volcano-controller-manager`
159
+ - The controller-manager is responsible for Job lifecycle, PodGroup creation, and queue management — if it's down, jobs won't transition states even if the scheduler is healthy
160
+ - Check scheduler logs: `volcano-scheduler-logs` skill
161
+
162
+ ### PodGroup phase is Pending
163
+ - The PodGroup is waiting for enqueue action to admit it
164
+ - **Verify the queue actually exists** — a typo in queue name causes the PodGroup to stay Pending silently:
165
+ ```bash
166
+ kubectl get podgroup <pg> -n <ns> -o jsonpath='{.spec.queue}'
167
+ kubectl get queue <queue-name>
168
+ ```
169
+ If the queue name is empty, the job uses the `default` queue — verify it exists and is Open
170
+ - Check Queue capacity and deserved resources
171
+ - Check if cluster has sufficient resources
172
+
173
+ ### PodGroup phase is Inqueue but Pod is Pending
174
+ - Check if `minMember` constraint is not satisfied
175
+ - Check if there are affinity/anti-affinity conflicts
176
+ - Check if taints prevent scheduling
177
+
178
+ ### Queue status shows insufficient deserved resources
179
+ - The queue may have insufficient weight or capability configured
180
+ - Other queues may be reclaiming resources
181
+ - Use `volcano-queue-diagnose` for detailed analysis
182
+
183
+ ## Environment Variables
184
+
185
+ | Variable | Default | Description |
186
+ |----------|---------|-------------|
187
+ | `VOLCANO_NAMESPACE` | `default` | Default namespace for Pod lookup |
188
+ | `VOLCANO_SCHEDULER_NS` | `volcano-system` | Namespace where volcano scheduler runs |
189
+
190
+ ## See Also
191
+
192
+ - `volcano-gang-scheduling` - Detailed Gang scheduling diagnosis
193
+ - `volcano-queue-diagnose` - Queue status and quota analysis
194
+ - `volcano-scheduler-logs` - Scheduler log analysis
195
+ - `volcano-resource-insufficient` - Resource shortage diagnosis
196
+ - `quota-debug` - Native Kubernetes ResourceQuota/LimitRange diagnosis (non-Volcano)
@@ -0,0 +1,175 @@
1
+ #!/bin/bash
2
+ # Diagnose Volcano-managed Pod scheduling issues: prints Pod, PodGroup, event, Queue and (with --verbose) node information.
3
+ # This script performs read-only operations using kubectl (only `kubectl get` and `kubectl top` are invoked).
4
+ set -euo pipefail  # stop on unhandled command failure, unset variable, or a failing pipeline stage
5
+
6
+ show_help() {  # print the usage text to stdout, then terminate the script successfully
7
+ cat <<EOF
8
+ Usage: $0 --pod <pod> [options]
9
+
10
+ Diagnose Volcano-managed Pod scheduling issues.
11
+ Checks Pod status, PodGroup, events, and Queue configuration.
12
+
13
+ Options:
14
+ --pod POD Pod name to diagnose (required)
15
+ --namespace NS Namespace (default: default)
16
+ --verbose Show detailed output including node resources
17
+ -h, --help Show this help message
18
+
19
+ Environment:
20
+ VOLCANO_NAMESPACE Override default namespace
21
+ VOLCANO_SCHEDULER_NS Scheduler namespace (default: volcano-system)
22
+
23
+ Examples:
24
+ $0 --pod my-job-0
25
+ $0 --pod my-job-0 --namespace training
26
+ $0 --pod my-job-0 --namespace training --verbose
27
+ EOF
28
+ exit 0  # asking for help is not an error
29
+ }
30
+
31
+ # Parse arguments
32
+ POD=""
33
+ NS="${VOLCANO_NAMESPACE:-default}"
34
+ SCHEDULER_NS="${VOLCANO_SCHEDULER_NS:-volcano-system}"
35
+ VERBOSE=false
36
+
37
+ while [[ $# -gt 0 ]]; do
38
+ case $1 in
39
+ -h|--help) show_help ;;
40
+ --pod) POD="$2"; shift 2 ;;
41
+ --namespace) NS="$2"; shift 2 ;;
42
+ --verbose) VERBOSE=true; shift ;;
43
+ *) echo "Unknown option: $1. Use --help for usage." >&2; exit 1 ;;
44
+ esac
45
+ done
46
+
47
+ [[ -z "$POD" ]] && { echo "Error: --pod is required. Use --help for usage." >&2; exit 1; }
48
+
49
+ echo "=== Volcano Pod Diagnosis: $NS/$POD ==="
50
+ echo
51
+
52
+ # 1. Pod Status — kubectl's stderr is deliberately left visible so the real failure (not found, forbidden, API unreachable) is reported
53
+ echo "[1/5] Pod Status"
54
+ echo "----------------"
55
+ if ! kubectl get pod "$POD" -n "$NS" -o wide; then
56
+   echo "Error: cannot get Pod '$POD' in namespace '$NS' (see kubectl error above)" >&2
57
+   exit 1
58
+ fi
59
+ echo
60
+
61
+ # Get Pod phase; falls back to "Unknown" only when the kubectl call itself fails
62
+ POD_PHASE=$(kubectl get pod "$POD" -n "$NS" -o jsonpath='{.status.phase}' 2>/dev/null || echo "Unknown")
63
+ echo "Pod Phase: $POD_PHASE"
64
+ echo
65
+
66
+ # 2. PodGroup Information — resolve the Pod's PodGroup from its annotations, then print key fields (fields absent from the object print as Unknown/0)
67
+ echo "[2/5] PodGroup Information"
68
+ echo "--------------------------"
69
+ PG=$(kubectl get pod "$POD" -n "$NS" -o jsonpath='{.metadata.annotations.scheduling\.volcano\.sh/pod-group}' 2>/dev/null || true)
70
+ [[ -n "$PG" ]] || PG=$(kubectl get pod "$POD" -n "$NS" -o jsonpath='{.metadata.annotations.scheduling\.k8s\.io/group-name}' 2>/dev/null || true)  # NOTE(review): fallback key believed to be set by Volcano's job controller — confirm for the deployed version
71
+ if [[ -n "$PG" ]]; then
72
+   echo "PodGroup: $PG"
73
+   echo
74
+   if kubectl get podgroup "$PG" -n "$NS" 2>/dev/null; then
75
+     echo
76
+     PG_PHASE=$(kubectl get podgroup "$PG" -n "$NS" -o jsonpath='{.status.phase}' 2>/dev/null || echo "Unknown")
77
+     PG_MINMEMBER=$(kubectl get podgroup "$PG" -n "$NS" -o jsonpath='{.spec.minMember}' 2>/dev/null || echo "0")
78
+     PG_RUNNING=$(kubectl get podgroup "$PG" -n "$NS" -o jsonpath='{.status.running}' 2>/dev/null || echo "0")
79
+     PG_PENDING=$(kubectl get podgroup "$PG" -n "$NS" -o jsonpath='{.status.pending}' 2>/dev/null || echo "0")
80
+     PG_QUEUE=$(kubectl get podgroup "$PG" -n "$NS" -o jsonpath='{.spec.queue}' 2>/dev/null || echo "default")
81
+     # jsonpath prints nothing for a field that is absent, so apply the same defaults at print time
82
+     echo "PodGroup Phase: ${PG_PHASE:-Unknown}"
83
+     echo "MinMember: ${PG_MINMEMBER:-0}"
84
+     echo "Running: ${PG_RUNNING:-0}"
85
+     echo "Pending: ${PG_PENDING:-0}"
86
+     echo "Queue: $PG_QUEUE"
87
+   else
88
+     echo "Warning: PodGroup '$PG' not found"
89
+   fi
90
+ else
91
+   echo "⚠️ No PodGroup annotation found — this Pod is NOT managed by Volcano scheduler."
92
+   echo " Recommended: Use 'pod-pending-debug' skill for standard kube-scheduler issues."
93
+   echo ""
94
+   echo " Continuing with basic event analysis..."
95
+ fi
96
+ echo
97
+
98
+ # 3. Events Analysis — last 15 events for this Pod; kubectl exits 0 with empty stdout when there are none, so emptiness is checked explicitly
99
+ printf '%s\n' "[3/5] Recent Events" "-------------------"
100
+ EVENTS=$(kubectl get events -n "$NS" --field-selector "involvedObject.kind=Pod,involvedObject.name=$POD" --sort-by='.lastTimestamp' 2>/dev/null | tail -15 || true)
101
+ echo "${EVENTS:-No events found}"
102
+ echo
103
+
104
+ # 4. Queue Status — requires a PodGroup that names a queue; otherwise print why the check is skipped
105
+ if [[ -z "$PG" ]]; then
106
+   echo "[4/5] Queue Status"
107
+   echo "------------------"
108
+   echo "Skipping (no PodGroup found)"
109
+   echo
110
+ else
111
+   PG_QUEUE=$(kubectl get podgroup "$PG" -n "$NS" -o jsonpath='{.spec.queue}' 2>/dev/null || echo "")
112
+   if [[ -z "$PG_QUEUE" ]]; then
113
+     echo "[4/5] Queue Status"
114
+     echo "------------------"
115
+     echo "No queue specified in PodGroup"
116
+     echo
117
+   else
118
+     echo "[4/5] Queue Status: $PG_QUEUE"
119
+     echo "------------------------------"
120
+     if ! kubectl get queue "$PG_QUEUE" 2>/dev/null; then
121
+       echo "Warning: Queue '$PG_QUEUE' not found"
122
+     else
123
+       echo
124
+       QUEUE_STATE=$(kubectl get queue "$PG_QUEUE" -o jsonpath='{.status.state}' 2>/dev/null || echo "Unknown")
125
+       QUEUE_WEIGHT=$(kubectl get queue "$PG_QUEUE" -o jsonpath='{.spec.weight}' 2>/dev/null || echo "N/A")
126
+       echo "Queue State: $QUEUE_STATE"
127
+       echo "Queue Weight: $QUEUE_WEIGHT"
128
+       echo
129
+       echo "Deserved Resources:"
130
+       kubectl get queue "$PG_QUEUE" -o jsonpath='{.status.deserved}' 2>/dev/null || echo " N/A"
131
+       echo
132
+       echo "Allocated Resources:"
133
+       kubectl get queue "$PG_QUEUE" -o jsonpath='{.status.allocated}' 2>/dev/null || echo " N/A"
134
+     fi
135
+     echo
136
+   fi
137
+ fi
138
+
139
+ # 5. Node Resources (verbose mode) — both listings are best-effort: a kubectl failure prints a note instead of aborting the script under `set -e -o pipefail`
140
+ if [[ "$VERBOSE" == "true" ]]; then
141
+   echo "[5/5] Node Resources"
142
+   echo "--------------------"
143
+   echo "Node Allocatable Resources:"
144
+   kubectl get nodes -o custom-columns='NAME:.metadata.name,CPU:.status.allocatable.cpu,MEM:.status.allocatable.memory,GPU:.status.allocatable.nvidia\.com/gpu' 2>/dev/null | head -10 || echo "Unable to list nodes (check cluster access and RBAC permissions)"
145
+   echo
146
+
147
+   echo "Node Resource Usage (if metrics available):"
148
+   kubectl top nodes 2>/dev/null | head -10 || echo "Metrics not available (requires metrics-server)"
149
+   echo
150
+ fi
151
+
152
+ # Summary — one-line recap of each object inspected above
153
+ printf '%s\n' "=== Diagnosis Summary ==="
154
+ printf '%s\n' "Pod: $NS/$POD"
155
+ printf '%s\n' "Phase: $POD_PHASE"
156
+ # PodGroup / Queue lines fall back to "Unknown" when the earlier sections could not read them
157
+ if [[ -z "$PG" ]]; then
158
+   printf '%s\n' "PodGroup: Not found"
159
+ else
160
+   printf '%s\n' "PodGroup: $PG (Phase: ${PG_PHASE:-Unknown})"
161
+   # the Queue line is printed only when a queue name was resolved
162
+   [[ -z "${PG_QUEUE:-}" ]] || printf '%s\n' "Queue: $PG_QUEUE (State: ${QUEUE_STATE:-Unknown})"
163
+ fi
164
+
165
+ [[ "$POD_PHASE" != "Pending" ]] || {
166
+   printf '\n'
167
+   printf '%s\n' "Recommendations:"
168
+   printf '%s\n' "1. Check events above for 'FailedScheduling' reasons"
169
+   printf '%s\n' "2. If PodGroup phase is 'Pending', check Queue capacity"
170
+   printf '%s\n' "3. If minMember is not satisfied, use volcano-gang-scheduling skill"
171
+   printf '%s\n' "4. Check scheduler logs with volcano-scheduler-logs skill"
172
+ }
173
+
174
+ printf '\n'
175
+ printf '%s\n' "=== Diagnosis Complete ==="