siclaw 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (270) hide show
  1. package/README.md +75 -114
  2. package/dist/agentbox/gateway-client.d.ts +2 -1
  3. package/dist/agentbox/gateway-client.js +6 -2
  4. package/dist/agentbox/gateway-client.js.map +1 -1
  5. package/dist/agentbox/http-server.js +184 -19
  6. package/dist/agentbox/http-server.js.map +1 -1
  7. package/dist/agentbox/resource-handlers.d.ts +1 -0
  8. package/dist/agentbox/resource-handlers.js +23 -23
  9. package/dist/agentbox/resource-handlers.js.map +1 -1
  10. package/dist/agentbox/session.js +85 -5
  11. package/dist/agentbox/session.js.map +1 -1
  12. package/dist/agentbox-main.d.ts +2 -1
  13. package/dist/agentbox-main.js +65 -18
  14. package/dist/agentbox-main.js.map +1 -1
  15. package/dist/cli-credentials.d.ts +1 -0
  16. package/dist/cli-credentials.js +109 -0
  17. package/dist/cli-credentials.js.map +1 -0
  18. package/dist/cli-first-run.d.ts +11 -0
  19. package/dist/cli-first-run.js +99 -0
  20. package/dist/cli-first-run.js.map +1 -0
  21. package/dist/cli-main.js +33 -11
  22. package/dist/cli-main.js.map +1 -1
  23. package/dist/cli-setup.d.ts +5 -11
  24. package/dist/cli-setup.js +12 -225
  25. package/dist/cli-setup.js.map +1 -1
  26. package/dist/core/agent-factory.d.ts +4 -0
  27. package/dist/core/agent-factory.js +102 -151
  28. package/dist/core/agent-factory.js.map +1 -1
  29. package/dist/core/config.d.ts +10 -3
  30. package/dist/core/config.js +11 -95
  31. package/dist/core/config.js.map +1 -1
  32. package/dist/core/extensions/deep-investigation.d.ts +2 -1
  33. package/dist/core/extensions/deep-investigation.js +144 -24
  34. package/dist/core/extensions/deep-investigation.js.map +1 -1
  35. package/dist/core/extensions/setup.d.ts +8 -0
  36. package/dist/core/extensions/setup.js +669 -0
  37. package/dist/core/extensions/setup.js.map +1 -0
  38. package/dist/core/llm-proxy.js +7 -3
  39. package/dist/core/llm-proxy.js.map +1 -1
  40. package/dist/core/mcp-client.d.ts +0 -10
  41. package/dist/core/mcp-client.js +0 -65
  42. package/dist/core/mcp-client.js.map +1 -1
  43. package/dist/core/prompt.d.ts +1 -1
  44. package/dist/core/prompt.js +42 -5
  45. package/dist/core/prompt.js.map +1 -1
  46. package/dist/core/provider-presets.d.ts +14 -0
  47. package/dist/core/provider-presets.js +81 -0
  48. package/dist/core/provider-presets.js.map +1 -0
  49. package/dist/cron/cron-coordinator.d.ts +2 -0
  50. package/dist/cron/cron-coordinator.js +46 -14
  51. package/dist/cron/cron-coordinator.js.map +1 -1
  52. package/dist/cron/cron-executor.js +33 -8
  53. package/dist/cron/cron-executor.js.map +1 -1
  54. package/dist/cron/cron-scheduler.d.ts +1 -1
  55. package/dist/cron/gateway-client.d.ts +5 -0
  56. package/dist/cron/gateway-client.js +43 -8
  57. package/dist/cron/gateway-client.js.map +1 -1
  58. package/dist/cron-main.js +39 -9
  59. package/dist/cron-main.js.map +1 -1
  60. package/dist/gateway/agentbox/client.d.ts +11 -0
  61. package/dist/gateway/agentbox/client.js +18 -0
  62. package/dist/gateway/agentbox/client.js.map +1 -1
  63. package/dist/gateway/agentbox/k8s-spawner.d.ts +11 -2
  64. package/dist/gateway/agentbox/k8s-spawner.js +95 -52
  65. package/dist/gateway/agentbox/k8s-spawner.js.map +1 -1
  66. package/dist/gateway/agentbox/local-spawner.d.ts +1 -1
  67. package/dist/gateway/agentbox/local-spawner.js +4 -2
  68. package/dist/gateway/agentbox/local-spawner.js.map +1 -1
  69. package/dist/gateway/agentbox/manager.d.ts +0 -10
  70. package/dist/gateway/agentbox/manager.js +11 -30
  71. package/dist/gateway/agentbox/manager.js.map +1 -1
  72. package/dist/gateway/agentbox/types.d.ts +6 -4
  73. package/dist/gateway/cron/cron-service.d.ts +49 -0
  74. package/dist/gateway/cron/cron-service.js +259 -0
  75. package/dist/gateway/cron/cron-service.js.map +1 -0
  76. package/dist/gateway/db/init-schema.js +44 -0
  77. package/dist/gateway/db/init-schema.js.map +1 -1
  78. package/dist/gateway/db/migrate-sqlite.js +73 -4
  79. package/dist/gateway/db/migrate-sqlite.js.map +1 -1
  80. package/dist/gateway/db/repositories/chat-repo.d.ts +56 -2
  81. package/dist/gateway/db/repositories/chat-repo.js +132 -2
  82. package/dist/gateway/db/repositories/chat-repo.js.map +1 -1
  83. package/dist/gateway/db/repositories/config-repo.d.ts +31 -2
  84. package/dist/gateway/db/repositories/config-repo.js +57 -7
  85. package/dist/gateway/db/repositories/config-repo.js.map +1 -1
  86. package/dist/gateway/db/repositories/env-repo.d.ts +14 -0
  87. package/dist/gateway/db/repositories/env-repo.js +15 -2
  88. package/dist/gateway/db/repositories/env-repo.js.map +1 -1
  89. package/dist/gateway/db/repositories/model-config-repo.d.ts +1 -1
  90. package/dist/gateway/db/repositories/model-config-repo.js +26 -12
  91. package/dist/gateway/db/repositories/model-config-repo.js.map +1 -1
  92. package/dist/gateway/db/repositories/skill-repo.d.ts +0 -5
  93. package/dist/gateway/db/repositories/skill-review-repo.d.ts +1 -0
  94. package/dist/gateway/db/repositories/skill-review-repo.js +4 -1
  95. package/dist/gateway/db/repositories/skill-review-repo.js.map +1 -1
  96. package/dist/gateway/db/repositories/skill-version-repo.js +0 -1
  97. package/dist/gateway/db/repositories/skill-version-repo.js.map +1 -1
  98. package/dist/gateway/db/repositories/system-config-repo.d.ts +1 -1
  99. package/dist/gateway/db/repositories/system-config-repo.js +2 -1
  100. package/dist/gateway/db/repositories/system-config-repo.js.map +1 -1
  101. package/dist/gateway/db/repositories/user-env-config-repo.d.ts +13 -0
  102. package/dist/gateway/db/repositories/user-env-config-repo.js +11 -0
  103. package/dist/gateway/db/repositories/user-env-config-repo.js.map +1 -1
  104. package/dist/gateway/db/repositories/workspace-repo.d.ts +3 -2
  105. package/dist/gateway/db/repositories/workspace-repo.js +6 -2
  106. package/dist/gateway/db/repositories/workspace-repo.js.map +1 -1
  107. package/dist/gateway/db/schema-mysql.d.ts +473 -51
  108. package/dist/gateway/db/schema-mysql.js +35 -4
  109. package/dist/gateway/db/schema-mysql.js.map +1 -1
  110. package/dist/gateway/db/schema-sqlite.d.ts +522 -57
  111. package/dist/gateway/db/schema-sqlite.js +38 -6
  112. package/dist/gateway/db/schema-sqlite.js.map +1 -1
  113. package/dist/gateway/db/schema.d.ts +471 -51
  114. package/dist/gateway/db/schema.js +1 -1
  115. package/dist/gateway/db/schema.js.map +1 -1
  116. package/dist/gateway/metrics-aggregator.d.ts +65 -0
  117. package/dist/gateway/metrics-aggregator.js +244 -0
  118. package/dist/gateway/metrics-aggregator.js.map +1 -0
  119. package/dist/gateway/plugins/channel-bridge.d.ts +4 -1
  120. package/dist/gateway/plugins/channel-bridge.js +78 -86
  121. package/dist/gateway/plugins/channel-bridge.js.map +1 -1
  122. package/dist/gateway/rpc-methods.d.ts +4 -2
  123. package/dist/gateway/rpc-methods.js +962 -163
  124. package/dist/gateway/rpc-methods.js.map +1 -1
  125. package/dist/gateway/security/cert-manager.d.ts +2 -2
  126. package/dist/gateway/security/cert-manager.js +4 -2
  127. package/dist/gateway/security/cert-manager.js.map +1 -1
  128. package/dist/gateway/server.d.ts +4 -8
  129. package/dist/gateway/server.js +297 -261
  130. package/dist/gateway/server.js.map +1 -1
  131. package/dist/gateway/skills/file-writer.js +17 -11
  132. package/dist/gateway/skills/file-writer.js.map +1 -1
  133. package/dist/gateway/skills/script-evaluator.js +12 -9
  134. package/dist/gateway/skills/script-evaluator.js.map +1 -1
  135. package/dist/gateway/web/dist/assets/index-0p17ZeTP.js +740 -0
  136. package/dist/gateway/web/dist/assets/index-9eP6nPUq.js +741 -0
  137. package/dist/gateway/web/dist/assets/index-9eP6nPUq.js.map +1 -0
  138. package/dist/gateway/web/dist/assets/index-CAmSY91d.js +675 -0
  139. package/dist/gateway/web/dist/assets/index-DMFEh8Pp.css +1 -0
  140. package/dist/gateway/web/dist/assets/index-DyowBCEj.css +1 -0
  141. package/dist/gateway/web/dist/assets/index-PDK5JJDO.css +1 -0
  142. package/dist/gateway/web/dist/index.html +2 -2
  143. package/dist/gateway-main.js +27 -10
  144. package/dist/gateway-main.js.map +1 -1
  145. package/dist/memory/embeddings.js +5 -4
  146. package/dist/memory/embeddings.js.map +1 -1
  147. package/dist/memory/indexer.d.ts +23 -3
  148. package/dist/memory/indexer.js +235 -23
  149. package/dist/memory/indexer.js.map +1 -1
  150. package/dist/memory/schema.js +15 -1
  151. package/dist/memory/schema.js.map +1 -1
  152. package/dist/memory/types.d.ts +18 -0
  153. package/dist/memory/types.js +6 -1
  154. package/dist/memory/types.js.map +1 -1
  155. package/dist/shared/detect-language.d.ts +12 -0
  156. package/dist/shared/detect-language.js +78 -0
  157. package/dist/shared/detect-language.js.map +1 -0
  158. package/dist/shared/diagnostic-events.d.ts +70 -0
  159. package/dist/shared/diagnostic-events.js +38 -0
  160. package/dist/shared/diagnostic-events.js.map +1 -0
  161. package/dist/shared/local-collector.d.ts +56 -0
  162. package/dist/shared/local-collector.js +284 -0
  163. package/dist/shared/local-collector.js.map +1 -0
  164. package/dist/shared/metrics-types.d.ts +64 -0
  165. package/dist/shared/metrics-types.js +25 -0
  166. package/dist/shared/metrics-types.js.map +1 -0
  167. package/dist/shared/metrics.d.ts +19 -0
  168. package/dist/shared/metrics.js +185 -0
  169. package/dist/shared/metrics.js.map +1 -0
  170. package/dist/shared/path-utils.d.ts +15 -0
  171. package/dist/shared/path-utils.js +23 -0
  172. package/dist/shared/path-utils.js.map +1 -0
  173. package/dist/shared/retry.d.ts +35 -0
  174. package/dist/shared/retry.js +61 -0
  175. package/dist/shared/retry.js.map +1 -0
  176. package/dist/tools/command-sets.d.ts +18 -2
  177. package/dist/tools/command-sets.js +207 -32
  178. package/dist/tools/command-sets.js.map +1 -1
  179. package/dist/tools/command-validator.d.ts +56 -0
  180. package/dist/tools/command-validator.js +357 -0
  181. package/dist/tools/command-validator.js.map +1 -0
  182. package/dist/tools/create-skill.js +26 -1
  183. package/dist/tools/create-skill.js.map +1 -1
  184. package/dist/tools/credential-list.js +1 -23
  185. package/dist/tools/credential-list.js.map +1 -1
  186. package/dist/tools/credential-manager.d.ts +98 -0
  187. package/dist/tools/credential-manager.js +313 -0
  188. package/dist/tools/credential-manager.js.map +1 -0
  189. package/dist/tools/deep-search/engine.js +184 -127
  190. package/dist/tools/deep-search/engine.js.map +1 -1
  191. package/dist/tools/deep-search/prompts.d.ts +10 -2
  192. package/dist/tools/deep-search/prompts.js +37 -36
  193. package/dist/tools/deep-search/prompts.js.map +1 -1
  194. package/dist/tools/deep-search/schemas.d.ts +87 -0
  195. package/dist/tools/deep-search/schemas.js +85 -0
  196. package/dist/tools/deep-search/schemas.js.map +1 -0
  197. package/dist/tools/deep-search/sub-agent.d.ts +21 -0
  198. package/dist/tools/deep-search/sub-agent.js +153 -4
  199. package/dist/tools/deep-search/sub-agent.js.map +1 -1
  200. package/dist/tools/deep-search/tool.js +1 -0
  201. package/dist/tools/deep-search/tool.js.map +1 -1
  202. package/dist/tools/deep-search/types.d.ts +2 -0
  203. package/dist/tools/deep-search/types.js.map +1 -1
  204. package/dist/tools/dp-tools.js +29 -5
  205. package/dist/tools/dp-tools.js.map +1 -1
  206. package/dist/tools/exec-utils.d.ts +85 -0
  207. package/dist/tools/exec-utils.js +294 -0
  208. package/dist/tools/exec-utils.js.map +1 -0
  209. package/dist/tools/fork-skill.js +14 -2
  210. package/dist/tools/fork-skill.js.map +1 -1
  211. package/dist/tools/investigation-feedback.d.ts +3 -0
  212. package/dist/tools/investigation-feedback.js +71 -0
  213. package/dist/tools/investigation-feedback.js.map +1 -0
  214. package/dist/tools/manage-schedule.js +16 -6
  215. package/dist/tools/manage-schedule.js.map +1 -1
  216. package/dist/tools/netns-script.js +27 -281
  217. package/dist/tools/netns-script.js.map +1 -1
  218. package/dist/tools/node-exec.d.ts +2 -14
  219. package/dist/tools/node-exec.js +18 -225
  220. package/dist/tools/node-exec.js.map +1 -1
  221. package/dist/tools/node-script.js +14 -168
  222. package/dist/tools/node-script.js.map +1 -1
  223. package/dist/tools/pod-exec.d.ts +1 -1
  224. package/dist/tools/pod-exec.js +10 -26
  225. package/dist/tools/pod-exec.js.map +1 -1
  226. package/dist/tools/pod-nsenter-exec.js +21 -225
  227. package/dist/tools/pod-nsenter-exec.js.map +1 -1
  228. package/dist/tools/pod-script.js +10 -19
  229. package/dist/tools/pod-script.js.map +1 -1
  230. package/dist/tools/restricted-bash.d.ts +1 -17
  231. package/dist/tools/restricted-bash.js +38 -252
  232. package/dist/tools/restricted-bash.js.map +1 -1
  233. package/dist/tools/run-skill.d.ts +3 -1
  234. package/dist/tools/run-skill.js +21 -1
  235. package/dist/tools/run-skill.js.map +1 -1
  236. package/dist/tools/script-resolver.d.ts +3 -1
  237. package/dist/tools/script-resolver.js +74 -30
  238. package/dist/tools/script-resolver.js.map +1 -1
  239. package/dist/tools/update-skill.js +17 -6
  240. package/dist/tools/update-skill.js.map +1 -1
  241. package/package.json +8 -6
  242. package/siclaw.mjs +10 -1
  243. package/skills/core/cluster-events/SKILL.md +1 -1
  244. package/skills/core/deep-investigation/SKILL.md +11 -0
  245. package/skills/core/deployment-rollout-debug/SKILL.md +1 -1
  246. package/skills/core/dns-debug/SKILL.md +1 -0
  247. package/skills/core/meta.json +12 -1
  248. package/skills/core/networkpolicy-debug/SKILL.md +332 -0
  249. package/skills/core/node-logs/scripts/get-node-logs.sh +19 -9
  250. package/skills/core/pod-pending-debug/SKILL.md +1 -0
  251. package/skills/core/quota-debug/SKILL.md +203 -0
  252. package/skills/core/service-debug/SKILL.md +1 -0
  253. package/skills/core/statefulset-debug/SKILL.md +280 -0
  254. package/skills/core/volcano-diagnose-pod/SKILL.md +196 -0
  255. package/skills/core/volcano-diagnose-pod/scripts/diagnose-pod.sh +175 -0
  256. package/skills/core/volcano-gang-scheduling/SKILL.md +299 -0
  257. package/skills/core/volcano-job-diagnose/SKILL.md +319 -0
  258. package/skills/core/volcano-job-diagnose/scripts/diagnose-job.sh +253 -0
  259. package/skills/core/volcano-node-resources/SKILL.md +334 -0
  260. package/skills/core/volcano-node-resources/scripts/get-node-resources.sh +281 -0
  261. package/skills/core/volcano-queue-diagnose/SKILL.md +294 -0
  262. package/skills/core/volcano-queue-diagnose/scripts/diagnose-queue.sh +283 -0
  263. package/skills/core/volcano-resource-insufficient/SKILL.md +315 -0
  264. package/skills/core/volcano-scheduler-config/SKILL.md +371 -0
  265. package/skills/core/volcano-scheduler-config/scripts/get-scheduler-config.sh +297 -0
  266. package/skills/core/volcano-scheduler-logs/SKILL.md +241 -0
  267. package/skills/core/volcano-scheduler-logs/scripts/get-scheduler-logs.sh +159 -0
  268. package/skills/platform/create-skill/SKILL.md +35 -3
  269. package/skills/platform/manage-skill/SKILL.md +9 -2
  270. package/skills/platform/update-skill/SKILL.md +17 -6
@@ -0,0 +1,253 @@
1
+ #!/bin/bash
2
+ # Diagnose Volcano Job status and issues.
3
+ # This script performs read-only operations using kubectl.
4
+ set -euo pipefail
5
+
6
+ if ! command -v jq &>/dev/null; then
7
+ echo "ERROR: jq is required but not installed. Install it with: apt-get install jq / brew install jq" >&2
8
+ exit 1
9
+ fi
10
+
11
+ show_help() {
12
+ cat <<EOF
13
+ Usage: $0 --job <job-name> [options]
14
+
15
+ Diagnose Volcano Job (batch.volcano.sh/v1beta1) status and issues.
16
+ Checks Job phases, task statuses, PodGroup associations, and overall job health.
17
+
18
+ Options:
19
+ --job JOB Job name to diagnose (required)
20
+ --namespace NS Namespace (default: default)
21
+ --verbose Show detailed task and pod information
22
+ -h, --help Show this help message
23
+
24
+ Environment:
25
+ VOLCANO_NAMESPACE Override default namespace
26
+
27
+ Examples:
28
+ $0 --job my-training-job --namespace training
29
+ $0 --job my-training-job --namespace training --verbose
30
+ EOF
31
+ exit 0
32
+ }
33
+
34
+ # Parse arguments
35
+ JOB=""
36
+ NS="${VOLCANO_NAMESPACE:-default}"
37
+ VERBOSE=false
38
+
39
+ while [[ $# -gt 0 ]]; do  # consume options left to right until none remain
40
+ case $1 in
41
+ -h|--help) show_help ;;
42
+ --job) JOB="${2:?--job requires a value}"; shift 2 ;;  # ${2:?} aborts with a clear message instead of "unbound variable" under set -u
43
+ --namespace) NS="${2:?--namespace requires a value}"; shift 2 ;;  # same guard for a missing namespace value
44
+ --verbose) VERBOSE=true; shift ;;
45
+ *) echo "Unknown option: $1. Use --help for usage." >&2; exit 1 ;;
46
+ esac
47
+ done
48
+
49
+ [[ -z "$JOB" ]] && { echo "Error: --job is required. Use --help for usage." >&2; exit 1; }
50
+
51
+ echo "=== Volcano Job Diagnosis: $NS/$JOB ==="
52
+ echo
53
+
54
+ # 1. Job Overview
55
+ echo "[1/5] Job Overview"
56
+ echo "------------------"
57
+ if ! kubectl get job.batch.volcano.sh "$JOB" -n "$NS" -o wide 2>/dev/null; then
58
+ echo "Error: Job '$JOB' not found in namespace '$NS'" >&2
59
+ exit 1
60
+ fi
61
+ echo
62
+
63
+ # Get job details. kubectl jsonpath prints nothing (exit 0) when a status field is absent, so the "|| echo" fallback only covers kubectl failures; default empty results explicitly.
64
+ JOB_PHASE=$(kubectl get job.batch.volcano.sh "$JOB" -n "$NS" -o jsonpath='{.status.state.phase}' 2>/dev/null || echo "Unknown"); JOB_PHASE="${JOB_PHASE:-Unknown}"
65
+ JOB_FAILED=$(kubectl get job.batch.volcano.sh "$JOB" -n "$NS" -o jsonpath='{.status.failed}' 2>/dev/null || echo "0"); JOB_FAILED="${JOB_FAILED:-0}"
66
+ JOB_SUCCEEDED=$(kubectl get job.batch.volcano.sh "$JOB" -n "$NS" -o jsonpath='{.status.succeeded}' 2>/dev/null || echo "0"); JOB_SUCCEEDED="${JOB_SUCCEEDED:-0}"
67
+ JOB_RUNNING=$(kubectl get job.batch.volcano.sh "$JOB" -n "$NS" -o jsonpath='{.status.running}' 2>/dev/null || echo "0"); JOB_RUNNING="${JOB_RUNNING:-0}"
68
+ JOB_PENDING=$(kubectl get job.batch.volcano.sh "$JOB" -n "$NS" -o jsonpath='{.status.pending}' 2>/dev/null || echo "0"); JOB_PENDING="${JOB_PENDING:-0}"
69
+
70
+ echo "Job Phase: $JOB_PHASE"
71
+ echo "Tasks - Failed: $JOB_FAILED, Succeeded: $JOB_SUCCEEDED, Running: $JOB_RUNNING, Pending: $JOB_PENDING"
72
+ echo
73
+
74
+ # Warning for problematic states
75
+ case "$JOB_PHASE" in
76
+ Failed)
77
+ echo "⚠️ WARNING: Job has FAILED"
78
+ ;;
79
+ Pending)
80
+ echo "ℹ️ Job is PENDING - waiting for resources or admission"
81
+ ;;
82
+ Restarting)
83
+ echo "⚠️ WARNING: Job is RESTARTING - check previous failure reasons"
84
+ ;;
85
+ Aborted)
86
+ echo "⚠️ WARNING: Job was ABORTED"
87
+ ;;
88
+ esac
89
+
90
+ # Check minAvailable if set
91
+ MIN_AVAILABLE=$(kubectl get job.batch.volcano.sh "$JOB" -n "$NS" -o jsonpath='{.spec.minAvailable}' 2>/dev/null || echo "")
92
+ if [[ -n "$MIN_AVAILABLE" ]]; then
93
+ echo "MinAvailable: $MIN_AVAILABLE (Gang constraint)"
94
+ fi
95
+ echo
96
+
97
+ # 2. Check Policies
98
+ echo "[2/5] Job Policies"
99
+ echo "------------------"
100
+ POLICIES=$(kubectl get job.batch.volcano.sh "$JOB" -n "$NS" -o jsonpath='{.spec.policies}' 2>/dev/null || echo "")
101
+ if [[ -n "$POLICIES" && "$POLICIES" != "[]" && "$POLICIES" != "null" ]]; then
102
+ echo "Configured Policies:"
103
+ kubectl get job.batch.volcano.sh "$JOB" -n "$NS" -o jsonpath='{.spec.policies}' 2>/dev/null | jq -r '.[] | " - Event: \(.event), Action: \(.action)"' 2>/dev/null || echo " (Failed to parse policies)"
104
+ else
105
+ echo "No policies configured"
106
+ fi
107
+ echo
108
+
109
+ # 3. Task Status
110
+ echo "[3/5] Task Status"
111
+ echo "-----------------"
112
+
113
+ # Initialize counters with defaults (in case PODS is empty)
114
+ PENDING_PODS=0
115
+ RUNNING_PODS=0
116
+ COMPLETED_PODS=0
117
+ FAILED_PODS=0
118
+ TOTAL_PODS=0
119
+
120
+ # Get all pods for this job
121
+ PODS=$(kubectl get pods -n "$NS" -l "volcano.sh/job-name=$JOB" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null || echo "")
122
+
123
+ if [[ -z "$PODS" ]]; then
124
+ echo "⚠️ No pods found for this job"
125
+ echo " Job may be in Pending state or pods may have been cleaned up"
126
+ else
127
+ # Count pods by phase (--no-headers avoids header line in count). grep -c prints 0 itself but exits 1 when nothing matches, so use "|| true": echoing a second "0" would yield "0\n0" and break the arithmetic sum below.
128
+ PENDING_PODS=$(kubectl get pods -n "$NS" -l "volcano.sh/job-name=$JOB" --field-selector status.phase=Pending --no-headers 2>/dev/null | grep -c . || true)
129
+ RUNNING_PODS=$(kubectl get pods -n "$NS" -l "volcano.sh/job-name=$JOB" --field-selector status.phase=Running --no-headers 2>/dev/null | grep -c . || true)
130
+ COMPLETED_PODS=$(kubectl get pods -n "$NS" -l "volcano.sh/job-name=$JOB" --field-selector status.phase=Succeeded --no-headers 2>/dev/null | grep -c . || true)
131
+ FAILED_PODS=$(kubectl get pods -n "$NS" -l "volcano.sh/job-name=$JOB" --field-selector status.phase=Failed --no-headers 2>/dev/null | grep -c . || true)
132
+
133
+ TOTAL_PODS=$((PENDING_PODS + RUNNING_PODS + COMPLETED_PODS + FAILED_PODS))
134
+
135
+ echo "Total Pods: $TOTAL_PODS"
136
+ echo " Pending: $PENDING_PODS"
137
+ echo " Running: $RUNNING_PODS"
138
+ echo " Completed: $COMPLETED_PODS"
139
+ echo " Failed: $FAILED_PODS"
140
+ echo
141
+
142
+ if [[ "$FAILED_PODS" -gt 0 ]]; then
143
+ echo "⚠️ Failed pods detected - check pod logs and events"
144
+ fi
145
+
146
+ if [[ "$PENDING_PODS" -gt 0 && "$JOB_RUNNING" -gt 0 ]]; then
147
+ echo "⚠️ Partial scheduling - some pods pending while others running"
148
+ echo " Possible Gang scheduling issue - use volcano-gang-scheduling skill"
149
+ fi
150
+
151
+ # Show pod details in verbose mode
152
+ if [[ "$VERBOSE" == "true" ]]; then
153
+ echo
154
+ echo "Pod Details:"
155
+ kubectl get pods -n "$NS" -l "volcano.sh/job-name=$JOB" -o custom-columns='NAME:.metadata.name,STATUS:.status.phase,RESTARTS:.status.containerStatuses[0].restartCount,NODE:.spec.nodeName,AGE:.metadata.creationTimestamp' 2>/dev/null || echo " (Failed to get pod details)"
156
+ fi
157
+ fi
158
+ echo
159
+
160
+ # 4. PodGroup Association
161
+ echo "[4/5] PodGroup Association"
162
+ echo "----------------------------"
163
+
164
+ # Try to find PodGroup
165
+ PG=$(kubectl get pods -n "$NS" -l "volcano.sh/job-name=$JOB" -o jsonpath='{.items[0].metadata.annotations.scheduling\.volcano\.sh/pod-group}' 2>/dev/null || echo "")
166
+
167
+ if [[ -n "$PG" ]]; then
168
+ echo "PodGroup: $PG"
169
+
170
+ if kubectl get podgroup "$PG" -n "$NS" &>/dev/null; then
171
+ PG_PHASE=$(kubectl get podgroup "$PG" -n "$NS" -o jsonpath='{.status.phase}' 2>/dev/null || echo "Unknown")
172
+ PG_MINMEMBER=$(kubectl get podgroup "$PG" -n "$NS" -o jsonpath='{.spec.minMember}' 2>/dev/null || echo "N/A")
173
+ PG_RUNNING=$(kubectl get podgroup "$PG" -n "$NS" -o jsonpath='{.status.running}' 2>/dev/null || echo "0")
174
+ PG_PENDING=$(kubectl get podgroup "$PG" -n "$NS" -o jsonpath='{.status.pending}' 2>/dev/null || echo "0")
175
+
176
+ echo "PodGroup Phase: $PG_PHASE"
177
+ echo "MinMember: $PG_MINMEMBER | Running: $PG_RUNNING | Pending: $PG_PENDING"
178
+
179
+ if [[ "$PG_PHASE" == "Pending" ]]; then
180
+ echo "⚠️ PodGroup is Pending - check Queue capacity and resource availability"
181
+ fi
182
+
183
+ if [[ "$PG_PHASE" == "Inqueue" && "$PENDING_PODS" -gt 0 ]]; then
184
+ echo "⚠️ PodGroup Inqueue but pods Pending - Gang constraint may not be satisfied"
185
+ fi
186
+ else
187
+ echo "Warning: PodGroup '$PG' not found"
188
+ fi
189
+ else
190
+ echo "No PodGroup annotation found on job pods"
191
+ echo "This may indicate the job is not using Gang scheduling"
192
+ fi
193
+ echo
194
+
195
+ # 5. Events Analysis
196
+ echo "[5/5] Recent Events"
197
+ echo "-------------------"
198
+ kubectl get events -n "$NS" --field-selector "involvedObject.name=$JOB" --sort-by='.lastTimestamp' 2>/dev/null | tail -10 || echo "No events found for job"
199
+ echo
200
+
201
+ # Also check pod events if verbose
202
+ if [[ "$VERBOSE" == "true" && -n "$PODS" ]]; then
203
+ echo "Pod Events (first failed/running pod):"
204
+ # Find a failed or running pod to check events
205
+ SAMPLE_POD=$(kubectl get pods -n "$NS" -l "volcano.sh/job-name=$JOB" --field-selector status.phase=Failed -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || \
206
+ kubectl get pods -n "$NS" -l "volcano.sh/job-name=$JOB" --field-selector status.phase=Running -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "")
207
+ if [[ -n "$SAMPLE_POD" ]]; then
208
+ kubectl get events -n "$NS" --field-selector "involvedObject.name=$SAMPLE_POD" --sort-by='.lastTimestamp' 2>/dev/null | tail -5 || echo "No events found"
209
+ fi
210
+ echo
211
+ fi
212
+
213
+ # Summary
214
+ echo "=== Diagnosis Summary ==="
215
+ echo "Job: $NS/$JOB"
216
+ echo "Phase: $JOB_PHASE"
217
+ echo "Tasks: $JOB_PENDING pending, $JOB_RUNNING running, $JOB_SUCCEEDED succeeded, $JOB_FAILED failed"
218
+
219
+ if [[ -n "$PG" ]]; then
220
+ echo "PodGroup: $PG (Phase: ${PG_PHASE:-Unknown})"
221
+ fi
222
+
223
+ # Recommendations
224
+ echo
225
+ echo "Recommendations:"
226
+ case "$JOB_PHASE" in
227
+ Pending)
228
+ echo "1. Check PodGroup status (if associated)"
229
+ echo "2. Check Queue capacity with volcano-queue-diagnose"
230
+ echo "3. Check scheduler logs with volcano-scheduler-logs"
231
+ ;;
232
+ Running)
233
+ if [[ "$PENDING_PODS" -gt 0 ]]; then
234
+ echo "1. Partial scheduling detected - use volcano-gang-scheduling for Gang analysis"
235
+ echo "2. Check node resources with volcano-node-resources"
236
+ else
237
+ echo "1. Job is running normally - monitor progress"
238
+ fi
239
+ ;;
240
+ Failed)
241
+ echo "1. Check failed pod logs: kubectl logs <pod> -n $NS"
242
+ echo "2. Check pod events for failure reasons"
243
+ echo "3. Review job policies and restart configuration"
244
+ ;;
245
+ Restarting)
246
+ echo "1. Check previous failure reason in events"
247
+ echo "2. Review container logs for crash reasons"
248
+ echo "3. Verify restart policy is appropriate"
249
+ ;;
250
+ esac
251
+
252
+ echo
253
+ echo "=== Diagnosis Complete ==="
@@ -0,0 +1,334 @@
1
+ ---
2
+ name: volcano-node-resources
3
+ description: >-
4
+ Query cluster node resources for Volcano scheduling.
5
+ Check allocatable CPU, memory, GPU, and current usage.
6
+ ---
7
+
8
+ # Volcano Node Resources
9
+
10
+ Query cluster node resources to understand capacity and availability for Volcano scheduling. This skill helps identify resource bottlenecks at the node level.
11
+
12
+ **Scope:** This skill is for **diagnosis only**. It retrieves resource information but does not modify any cluster state.
13
+
14
+ ## Usage
15
+
16
+ ```bash
17
+ bash skills/core/volcano-node-resources/scripts/get-node-resources.sh [options]
18
+ ```
19
+
20
+ ## Parameters
21
+
22
+ | Parameter | Required | Description |
23
+ |-----------|----------|-------------|
24
+ | `--node NODE` | no | Query specific node only |
25
+ | `--label LABEL` | no | Filter nodes by label (e.g., gpu=true) |
26
+ | `--show-usage` | no | Show current resource usage (requires metrics-server) |
27
+ | `--show-pods` | no | Show pods running on each node |
28
+ | `--format FORMAT` | no | Output format: table (default), json, wide |
29
+
30
+ ## Examples
31
+
32
+ Get overview of all nodes:
33
+ ```bash
34
+ bash skills/core/volcano-node-resources/scripts/get-node-resources.sh
35
+ ```
36
+
37
+ Check specific node:
38
+ ```bash
39
+ bash skills/core/volcano-node-resources/scripts/get-node-resources.sh --node worker-1
40
+ ```
41
+
42
+ Check GPU nodes:
43
+ ```bash
44
+ bash skills/core/volcano-node-resources/scripts/get-node-resources.sh --label nvidia.com/gpu.present=true
45
+ ```
46
+
47
+ Show resource usage:
48
+ ```bash
49
+ bash skills/core/volcano-node-resources/scripts/get-node-resources.sh --show-usage
50
+ ```
51
+
52
+ Show with pod information:
53
+ ```bash
54
+ bash skills/core/volcano-node-resources/scripts/get-node-resources.sh --show-pods
55
+ ```
56
+
57
+ JSON output for parsing:
58
+ ```bash
59
+ bash skills/core/volcano-node-resources/scripts/get-node-resources.sh --format json
60
+ ```
61
+
62
+ ## Understanding Node Resources
63
+
64
+ ### Resource Types
65
+
66
+ | Resource | Description | Scheduling Impact |
67
+ |----------|-------------|-------------------|
68
+ | `cpu` | CPU cores (millicores) | Primary scheduling constraint |
69
+ | `memory` | RAM (bytes) | Primary scheduling constraint |
70
+ | `nvidia.com/gpu` | GPU devices | Hardware-specific scheduling |
71
+ | `pods` | Max pods per node | Density limit |
72
+ | `ephemeral-storage` | Disk space | Secondary constraint |
73
+
74
+ ### Capacity vs Allocatable
75
+
76
+ - **Capacity**: Total physical resources on the node
77
+ - **Allocatable**: Resources available for pods (capacity minus system reservations)
78
+
79
+ **Reservations include:**
80
+ - kubelet overhead
81
+ - System daemons (kube-proxy, node-exporter)
82
+ - Kernel reserved memory
83
+ - Eviction threshold
84
+
85
+ ### Resource Usage
86
+
87
+ When `--show-usage` is enabled and metrics-server is available:
88
+
89
+ - **Requests**: Sum of all pod resource requests on the node
90
+ - **Limits**: Sum of all pod resource limits
91
+ - **Usage**: Actual resource consumption
92
+
93
+ **Key insights:**
94
+ - Allocatable - Requests = Available for new pods
95
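+ - Usage < Requests = Over-provisioning (requested capacity is reserved but sits idle)
96
+ - Usage > Requests = Over-committing (risky: pods are consuming more than the scheduler accounted for)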
97
+
98
+ ## Resource Calculation
99
+
100
+ ### Available Resources
101
+
102
+ ```
103
+ Available = Allocatable - Allocated (sum of all requests)
104
+ ```
105
+
106
+ Nodes with zero or negative available resources cannot accept new pods.
107
+
108
+ ### Gang Scheduling Calculation
109
+
110
+ For Gang scheduling, you need:
111
+ ```
112
+ Sum over all nodes of floor(Available / Pod Request) >= minMember
113
+ ```
114
+
115
+ Example:
116
+ - minMember = 4
117
+ - Pod requests 4 CPUs each
118
+ - Need room for at least 4 such pods at the same time, e.g. 4 nodes with 4+ CPUs available each, or 1 node with 16+ CPUs available
119
+
120
+ ## Diagnostic Use Cases
121
+
122
+ ### Case 1: Identify Nodes with Available Resources
123
+
124
+ ```bash
125
+ bash skills/core/volcano-node-resources/scripts/get-node-resources.sh
126
+ ```
127
+
128
+ Look for nodes with positive Available CPU/Memory. Nodes with zero or near-zero availability cannot schedule new pods.
129
+
130
+ ### Case 2: Find GPU-Equipped Nodes
131
+
132
+ ```bash
133
+ bash skills/core/volcano-node-resources/scripts/get-node-resources.sh --label nvidia.com/gpu.present=true --show-usage
134
+ ```
135
+
136
+ Check:
137
+ - Which nodes have GPUs
138
+ - How many GPUs are allocatable
139
+ - How many are currently allocated/used
140
+ - GPU utilization patterns
141
+
142
+ ### Case 3: Detect Resource Fragmentation
143
+
144
+ ```bash
145
+ bash skills/core/volcano-node-resources/scripts/get-node-resources.sh --show-usage
146
+ ```
147
+
148
+ Fragmentation indicators:
149
+ - Many nodes with small amounts of available resources
150
+ - High node count but low available resources per node
151
+ - Allocated resources spread thinly across many nodes
152
+
153
+ ### Case 4: Node Affinity Troubleshooting
154
+
155
+ If pods require specific labels:
156
+
157
+ ```bash
158
+ # Check nodes with required labels
159
+ bash skills/core/volcano-node-resources/scripts/get-node-resources.sh --label <required-label>
160
+
161
+ # Verify sufficient resources
162
+ kubectl describe node <node-name> | grep -A 10 "Allocated resources"
163
+ ```
164
+
165
+ ### Case 5: Capacity Planning
166
+
167
+ Monitor trends over time:
168
+
169
+ ```bash
170
+ # Current capacity
171
+ bash skills/core/volcano-node-resources/scripts/get-node-resources.sh --format json
172
+
173
+ # Check usage trends (if metrics available)
174
+ for node in $(kubectl get nodes -o jsonpath='{.items[*].metadata.name}'); do
175
+ echo "=== $node ==="
176
+ kubectl top node $node 2>/dev/null || echo "Metrics not available"
177
+ done
178
+ ```
179
+
180
+ ## Node Status Indicators
181
+
182
+ ### Ready Status
183
+
184
+ | Status | Meaning | Action |
185
+ |--------|---------|--------|
186
+ | `Ready` | Node healthy and schedulable | Normal |
187
+ | `NotReady` | Node unhealthy | Check node conditions |
188
+ | `SchedulingDisabled` | Node cordoned | May need uncordon |
189
+
190
+ Check node conditions:
191
+ ```bash
192
+ kubectl get nodes -o json | jq '.items[].status.conditions'
193
+ ```
194
+
195
+ ### Node Taints
196
+
197
+ Taints prevent pods that lack a matching toleration from being scheduled on the node:
198
+
199
+ ```bash
200
+ kubectl get nodes -o custom-columns='NAME:.metadata.name,TAINTS:.spec.taints[*].key'
201
+ ```
202
+
203
+ Common taints:
204
+ - `node.kubernetes.io/not-ready`
205
+ - `node.kubernetes.io/unreachable`
206
+ - `node.kubernetes.io/disk-pressure`
207
+ - `node.kubernetes.io/memory-pressure`
208
+ - `node.kubernetes.io/pid-pressure`
209
+
210
+ ## Common Issues
211
+
212
+ ### Issue 1: Node at Capacity
213
+
214
+ **Symptom:** Available resources near zero, new pods stuck pending
215
+
216
+ **Check:**
217
+ ```bash
218
+ kubectl describe node <node-name> | grep -A 5 "Allocated resources"
219
+ ```
220
+
221
+ **Solution:**
222
+ - Scale cluster (add nodes)
223
+ - Drain and consolidate workloads
224
+ - Review resource requests (may be over-provisioned)
225
+
226
+ ### Issue 2: GPU Not Allocatable
227
+
228
+ **Symptom:** Node has GPUs but not showing as allocatable
229
+
230
+ **Check:**
231
+ ```bash
232
+ kubectl get node <node> -o jsonpath='{.status.allocatable.nvidia\.com/gpu}'
233
+ ```
234
+
235
+ **Solution:**
236
+ - Verify GPU device plugin is running
237
+ - Check nvidia-driver installation
238
+ - Review node labels
239
+
240
+ ### Issue 3: Memory Pressure
241
+
242
+ **Symptom:** Node has `node.kubernetes.io/memory-pressure` taint
243
+
244
+ **Check:**
245
+ ```bash
246
+ kubectl describe node <node-name> | grep -A 3 "MemoryPressure"
247
+ ```
248
+
249
+ **Solution:**
250
+ - Evict or reschedule memory-intensive pods
251
+ - Increase node memory
252
+ - Adjust pod memory limits
253
+
254
+ ### Issue 4: Disk Pressure
255
+
256
+ **Symptom:** Node has `node.kubernetes.io/disk-pressure` taint
257
+
258
+ **Check:**
259
+ ```bash
260
+ kubectl describe node <node-name> | grep -A 3 "DiskPressure"
261
+ ```
262
+
263
+ **Solution:**
264
+ - Clean up unused images/containers
265
+ - Increase node disk capacity
266
+ - Review log rotation policies
267
+
268
+ ## Output Formats
269
+
270
+ ### Table Format (default)
271
+
272
+ Human-readable table:
273
+ ```
274
+ NAME CPU_ALLOC MEM_ALLOC GPU_ALLOC CPU_AVAIL MEM_AVAIL
275
+ node-1 32 64Gi 4 8 16Gi
276
+ node-2 16 32Gi 0 2 4Gi
277
+ ```
278
+
279
+ ### Wide Format
280
+
281
+ Additional columns:
282
+ ```
283
+ NAME CPU MEM GPU CPU_AVAIL MEM_AVAIL STATUS AGE
284
+ ```
285
+
286
+ ### JSON Format
287
+
288
+ Machine-parseable output:
289
+ ```json
290
+ {
291
+ "nodes": [
292
+ {
293
+ "name": "node-1",
294
+ "allocatable": {
295
+ "cpu": "32",
296
+ "memory": "64Gi",
297
+ "nvidia.com/gpu": "4"
298
+ },
299
+ "available": {
300
+ "cpu": "8",
301
+ "memory": "16Gi"
302
+ }
303
+ }
304
+ ]
305
+ }
306
+ ```
307
+
308
+ ## Environment Variables
309
+
310
+ | Variable | Default | Description |
311
+ |----------|---------|-------------|
312
+ | `NODE_LABEL` | "" | Default label selector for nodes |
313
+
314
+ ## Integration with Other Skills
315
+
316
+ Combine with other skills for comprehensive analysis:
317
+
318
+ ```bash
319
+ # 1. Check node resources
320
+ bash skills/core/volcano-node-resources/scripts/get-node-resources.sh
321
+
322
+ # 2. Check queue resources
323
+ bash skills/core/volcano-queue-diagnose/scripts/diagnose-queue.sh
324
+
325
+ # 3. If insufficient resources, refer to volcano-resource-insufficient skill guide
326
+ # (This is a documentation skill - follow the diagnostic steps in the SKILL.md)
327
+ ```
328
+
329
+ ## See Also
330
+
331
+ - `volcano-resource-insufficient` - Resource shortage diagnosis
332
+ - `volcano-diagnose-pod` - Pod-specific scheduling issues
333
+ - `volcano-gang-scheduling` - Gang constraint analysis
334
+ - `volcano-queue-diagnose` - Queue resource distribution