siclaw 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (267) hide show
  1. package/README.md +74 -114
  2. package/dist/agentbox/gateway-client.d.ts +2 -1
  3. package/dist/agentbox/gateway-client.js +6 -2
  4. package/dist/agentbox/gateway-client.js.map +1 -1
  5. package/dist/agentbox/http-server.js +184 -19
  6. package/dist/agentbox/http-server.js.map +1 -1
  7. package/dist/agentbox/resource-handlers.d.ts +1 -0
  8. package/dist/agentbox/resource-handlers.js +23 -23
  9. package/dist/agentbox/resource-handlers.js.map +1 -1
  10. package/dist/agentbox/session.js +85 -5
  11. package/dist/agentbox/session.js.map +1 -1
  12. package/dist/agentbox-main.d.ts +2 -1
  13. package/dist/agentbox-main.js +65 -18
  14. package/dist/agentbox-main.js.map +1 -1
  15. package/dist/cli-credentials.d.ts +1 -0
  16. package/dist/cli-credentials.js +109 -0
  17. package/dist/cli-credentials.js.map +1 -0
  18. package/dist/cli-first-run.d.ts +11 -0
  19. package/dist/cli-first-run.js +99 -0
  20. package/dist/cli-first-run.js.map +1 -0
  21. package/dist/cli-main.js +33 -11
  22. package/dist/cli-main.js.map +1 -1
  23. package/dist/cli-setup.d.ts +5 -11
  24. package/dist/cli-setup.js +12 -225
  25. package/dist/cli-setup.js.map +1 -1
  26. package/dist/core/agent-factory.d.ts +4 -0
  27. package/dist/core/agent-factory.js +102 -151
  28. package/dist/core/agent-factory.js.map +1 -1
  29. package/dist/core/config.d.ts +10 -3
  30. package/dist/core/config.js +11 -95
  31. package/dist/core/config.js.map +1 -1
  32. package/dist/core/extensions/deep-investigation.d.ts +2 -1
  33. package/dist/core/extensions/deep-investigation.js +144 -24
  34. package/dist/core/extensions/deep-investigation.js.map +1 -1
  35. package/dist/core/extensions/setup.d.ts +8 -0
  36. package/dist/core/extensions/setup.js +669 -0
  37. package/dist/core/extensions/setup.js.map +1 -0
  38. package/dist/core/llm-proxy.js +7 -3
  39. package/dist/core/llm-proxy.js.map +1 -1
  40. package/dist/core/mcp-client.d.ts +0 -10
  41. package/dist/core/mcp-client.js +0 -65
  42. package/dist/core/mcp-client.js.map +1 -1
  43. package/dist/core/prompt.d.ts +1 -1
  44. package/dist/core/prompt.js +42 -5
  45. package/dist/core/prompt.js.map +1 -1
  46. package/dist/core/provider-presets.d.ts +14 -0
  47. package/dist/core/provider-presets.js +81 -0
  48. package/dist/core/provider-presets.js.map +1 -0
  49. package/dist/cron/cron-coordinator.d.ts +2 -0
  50. package/dist/cron/cron-coordinator.js +46 -14
  51. package/dist/cron/cron-coordinator.js.map +1 -1
  52. package/dist/cron/cron-executor.js +33 -8
  53. package/dist/cron/cron-executor.js.map +1 -1
  54. package/dist/cron/cron-scheduler.d.ts +1 -1
  55. package/dist/cron/gateway-client.d.ts +5 -0
  56. package/dist/cron/gateway-client.js +43 -8
  57. package/dist/cron/gateway-client.js.map +1 -1
  58. package/dist/cron-main.js +39 -9
  59. package/dist/cron-main.js.map +1 -1
  60. package/dist/gateway/agentbox/client.d.ts +11 -0
  61. package/dist/gateway/agentbox/client.js +18 -0
  62. package/dist/gateway/agentbox/client.js.map +1 -1
  63. package/dist/gateway/agentbox/k8s-spawner.d.ts +11 -2
  64. package/dist/gateway/agentbox/k8s-spawner.js +95 -52
  65. package/dist/gateway/agentbox/k8s-spawner.js.map +1 -1
  66. package/dist/gateway/agentbox/local-spawner.d.ts +1 -1
  67. package/dist/gateway/agentbox/local-spawner.js +4 -2
  68. package/dist/gateway/agentbox/local-spawner.js.map +1 -1
  69. package/dist/gateway/agentbox/manager.d.ts +0 -10
  70. package/dist/gateway/agentbox/manager.js +11 -30
  71. package/dist/gateway/agentbox/manager.js.map +1 -1
  72. package/dist/gateway/agentbox/types.d.ts +6 -4
  73. package/dist/gateway/cron/cron-service.d.ts +49 -0
  74. package/dist/gateway/cron/cron-service.js +259 -0
  75. package/dist/gateway/cron/cron-service.js.map +1 -0
  76. package/dist/gateway/db/init-schema.js +44 -0
  77. package/dist/gateway/db/init-schema.js.map +1 -1
  78. package/dist/gateway/db/migrate-sqlite.js +73 -4
  79. package/dist/gateway/db/migrate-sqlite.js.map +1 -1
  80. package/dist/gateway/db/repositories/chat-repo.d.ts +56 -2
  81. package/dist/gateway/db/repositories/chat-repo.js +132 -2
  82. package/dist/gateway/db/repositories/chat-repo.js.map +1 -1
  83. package/dist/gateway/db/repositories/config-repo.d.ts +31 -2
  84. package/dist/gateway/db/repositories/config-repo.js +57 -7
  85. package/dist/gateway/db/repositories/config-repo.js.map +1 -1
  86. package/dist/gateway/db/repositories/env-repo.d.ts +14 -0
  87. package/dist/gateway/db/repositories/env-repo.js +15 -2
  88. package/dist/gateway/db/repositories/env-repo.js.map +1 -1
  89. package/dist/gateway/db/repositories/model-config-repo.js +6 -5
  90. package/dist/gateway/db/repositories/model-config-repo.js.map +1 -1
  91. package/dist/gateway/db/repositories/skill-repo.d.ts +0 -5
  92. package/dist/gateway/db/repositories/skill-review-repo.d.ts +1 -0
  93. package/dist/gateway/db/repositories/skill-review-repo.js +4 -1
  94. package/dist/gateway/db/repositories/skill-review-repo.js.map +1 -1
  95. package/dist/gateway/db/repositories/skill-version-repo.js +0 -1
  96. package/dist/gateway/db/repositories/skill-version-repo.js.map +1 -1
  97. package/dist/gateway/db/repositories/system-config-repo.d.ts +1 -1
  98. package/dist/gateway/db/repositories/system-config-repo.js +2 -1
  99. package/dist/gateway/db/repositories/system-config-repo.js.map +1 -1
  100. package/dist/gateway/db/repositories/user-env-config-repo.d.ts +13 -0
  101. package/dist/gateway/db/repositories/user-env-config-repo.js +11 -0
  102. package/dist/gateway/db/repositories/user-env-config-repo.js.map +1 -1
  103. package/dist/gateway/db/repositories/workspace-repo.d.ts +3 -2
  104. package/dist/gateway/db/repositories/workspace-repo.js +6 -2
  105. package/dist/gateway/db/repositories/workspace-repo.js.map +1 -1
  106. package/dist/gateway/db/schema-mysql.d.ts +473 -51
  107. package/dist/gateway/db/schema-mysql.js +35 -4
  108. package/dist/gateway/db/schema-mysql.js.map +1 -1
  109. package/dist/gateway/db/schema-sqlite.d.ts +522 -57
  110. package/dist/gateway/db/schema-sqlite.js +38 -6
  111. package/dist/gateway/db/schema-sqlite.js.map +1 -1
  112. package/dist/gateway/db/schema.d.ts +471 -51
  113. package/dist/gateway/db/schema.js +1 -1
  114. package/dist/gateway/db/schema.js.map +1 -1
  115. package/dist/gateway/metrics-aggregator.d.ts +65 -0
  116. package/dist/gateway/metrics-aggregator.js +244 -0
  117. package/dist/gateway/metrics-aggregator.js.map +1 -0
  118. package/dist/gateway/plugins/channel-bridge.d.ts +4 -1
  119. package/dist/gateway/plugins/channel-bridge.js +78 -86
  120. package/dist/gateway/plugins/channel-bridge.js.map +1 -1
  121. package/dist/gateway/rpc-methods.d.ts +4 -2
  122. package/dist/gateway/rpc-methods.js +852 -166
  123. package/dist/gateway/rpc-methods.js.map +1 -1
  124. package/dist/gateway/security/cert-manager.d.ts +2 -2
  125. package/dist/gateway/security/cert-manager.js +4 -2
  126. package/dist/gateway/security/cert-manager.js.map +1 -1
  127. package/dist/gateway/server.d.ts +4 -8
  128. package/dist/gateway/server.js +297 -261
  129. package/dist/gateway/server.js.map +1 -1
  130. package/dist/gateway/skills/file-writer.js +17 -11
  131. package/dist/gateway/skills/file-writer.js.map +1 -1
  132. package/dist/gateway/skills/script-evaluator.js +12 -9
  133. package/dist/gateway/skills/script-evaluator.js.map +1 -1
  134. package/dist/gateway/web/dist/assets/index-0p17ZeTP.js +740 -0
  135. package/dist/gateway/web/dist/assets/index-9eP6nPUq.js +741 -0
  136. package/dist/gateway/web/dist/assets/index-9eP6nPUq.js.map +1 -0
  137. package/dist/gateway/web/dist/assets/index-DyowBCEj.css +1 -0
  138. package/dist/gateway/web/dist/assets/index-PDK5JJDO.css +1 -0
  139. package/dist/gateway/web/dist/index.html +2 -2
  140. package/dist/gateway-main.js +27 -10
  141. package/dist/gateway-main.js.map +1 -1
  142. package/dist/memory/embeddings.js +5 -4
  143. package/dist/memory/embeddings.js.map +1 -1
  144. package/dist/memory/indexer.d.ts +23 -3
  145. package/dist/memory/indexer.js +235 -23
  146. package/dist/memory/indexer.js.map +1 -1
  147. package/dist/memory/schema.js +15 -1
  148. package/dist/memory/schema.js.map +1 -1
  149. package/dist/memory/types.d.ts +18 -0
  150. package/dist/memory/types.js +6 -1
  151. package/dist/memory/types.js.map +1 -1
  152. package/dist/shared/detect-language.d.ts +12 -0
  153. package/dist/shared/detect-language.js +78 -0
  154. package/dist/shared/detect-language.js.map +1 -0
  155. package/dist/shared/diagnostic-events.d.ts +70 -0
  156. package/dist/shared/diagnostic-events.js +38 -0
  157. package/dist/shared/diagnostic-events.js.map +1 -0
  158. package/dist/shared/local-collector.d.ts +56 -0
  159. package/dist/shared/local-collector.js +284 -0
  160. package/dist/shared/local-collector.js.map +1 -0
  161. package/dist/shared/metrics-types.d.ts +64 -0
  162. package/dist/shared/metrics-types.js +25 -0
  163. package/dist/shared/metrics-types.js.map +1 -0
  164. package/dist/shared/metrics.d.ts +19 -0
  165. package/dist/shared/metrics.js +185 -0
  166. package/dist/shared/metrics.js.map +1 -0
  167. package/dist/shared/path-utils.d.ts +15 -0
  168. package/dist/shared/path-utils.js +23 -0
  169. package/dist/shared/path-utils.js.map +1 -0
  170. package/dist/shared/retry.d.ts +35 -0
  171. package/dist/shared/retry.js +61 -0
  172. package/dist/shared/retry.js.map +1 -0
  173. package/dist/tools/command-sets.d.ts +18 -2
  174. package/dist/tools/command-sets.js +207 -32
  175. package/dist/tools/command-sets.js.map +1 -1
  176. package/dist/tools/command-validator.d.ts +56 -0
  177. package/dist/tools/command-validator.js +357 -0
  178. package/dist/tools/command-validator.js.map +1 -0
  179. package/dist/tools/create-skill.js +26 -1
  180. package/dist/tools/create-skill.js.map +1 -1
  181. package/dist/tools/credential-list.js +1 -23
  182. package/dist/tools/credential-list.js.map +1 -1
  183. package/dist/tools/credential-manager.d.ts +98 -0
  184. package/dist/tools/credential-manager.js +313 -0
  185. package/dist/tools/credential-manager.js.map +1 -0
  186. package/dist/tools/deep-search/engine.js +184 -127
  187. package/dist/tools/deep-search/engine.js.map +1 -1
  188. package/dist/tools/deep-search/prompts.d.ts +10 -2
  189. package/dist/tools/deep-search/prompts.js +37 -36
  190. package/dist/tools/deep-search/prompts.js.map +1 -1
  191. package/dist/tools/deep-search/schemas.d.ts +87 -0
  192. package/dist/tools/deep-search/schemas.js +85 -0
  193. package/dist/tools/deep-search/schemas.js.map +1 -0
  194. package/dist/tools/deep-search/sub-agent.d.ts +21 -0
  195. package/dist/tools/deep-search/sub-agent.js +153 -4
  196. package/dist/tools/deep-search/sub-agent.js.map +1 -1
  197. package/dist/tools/deep-search/tool.js +1 -0
  198. package/dist/tools/deep-search/tool.js.map +1 -1
  199. package/dist/tools/deep-search/types.d.ts +2 -0
  200. package/dist/tools/deep-search/types.js.map +1 -1
  201. package/dist/tools/dp-tools.js +29 -5
  202. package/dist/tools/dp-tools.js.map +1 -1
  203. package/dist/tools/exec-utils.d.ts +85 -0
  204. package/dist/tools/exec-utils.js +294 -0
  205. package/dist/tools/exec-utils.js.map +1 -0
  206. package/dist/tools/fork-skill.js +14 -2
  207. package/dist/tools/fork-skill.js.map +1 -1
  208. package/dist/tools/investigation-feedback.d.ts +3 -0
  209. package/dist/tools/investigation-feedback.js +71 -0
  210. package/dist/tools/investigation-feedback.js.map +1 -0
  211. package/dist/tools/manage-schedule.js +16 -6
  212. package/dist/tools/manage-schedule.js.map +1 -1
  213. package/dist/tools/netns-script.js +27 -281
  214. package/dist/tools/netns-script.js.map +1 -1
  215. package/dist/tools/node-exec.d.ts +2 -14
  216. package/dist/tools/node-exec.js +18 -225
  217. package/dist/tools/node-exec.js.map +1 -1
  218. package/dist/tools/node-script.js +14 -168
  219. package/dist/tools/node-script.js.map +1 -1
  220. package/dist/tools/pod-exec.d.ts +1 -1
  221. package/dist/tools/pod-exec.js +10 -26
  222. package/dist/tools/pod-exec.js.map +1 -1
  223. package/dist/tools/pod-nsenter-exec.js +21 -225
  224. package/dist/tools/pod-nsenter-exec.js.map +1 -1
  225. package/dist/tools/pod-script.js +10 -19
  226. package/dist/tools/pod-script.js.map +1 -1
  227. package/dist/tools/restricted-bash.d.ts +1 -17
  228. package/dist/tools/restricted-bash.js +38 -252
  229. package/dist/tools/restricted-bash.js.map +1 -1
  230. package/dist/tools/run-skill.d.ts +3 -1
  231. package/dist/tools/run-skill.js +21 -1
  232. package/dist/tools/run-skill.js.map +1 -1
  233. package/dist/tools/script-resolver.d.ts +3 -1
  234. package/dist/tools/script-resolver.js +74 -30
  235. package/dist/tools/script-resolver.js.map +1 -1
  236. package/dist/tools/update-skill.js +17 -6
  237. package/dist/tools/update-skill.js.map +1 -1
  238. package/package.json +4 -2
  239. package/siclaw.mjs +10 -1
  240. package/skills/core/cluster-events/SKILL.md +1 -1
  241. package/skills/core/deep-investigation/SKILL.md +11 -0
  242. package/skills/core/deployment-rollout-debug/SKILL.md +1 -1
  243. package/skills/core/dns-debug/SKILL.md +1 -0
  244. package/skills/core/meta.json +12 -1
  245. package/skills/core/networkpolicy-debug/SKILL.md +332 -0
  246. package/skills/core/node-logs/scripts/get-node-logs.sh +19 -9
  247. package/skills/core/pod-pending-debug/SKILL.md +1 -0
  248. package/skills/core/quota-debug/SKILL.md +203 -0
  249. package/skills/core/service-debug/SKILL.md +1 -0
  250. package/skills/core/statefulset-debug/SKILL.md +280 -0
  251. package/skills/core/volcano-diagnose-pod/SKILL.md +196 -0
  252. package/skills/core/volcano-diagnose-pod/scripts/diagnose-pod.sh +175 -0
  253. package/skills/core/volcano-gang-scheduling/SKILL.md +299 -0
  254. package/skills/core/volcano-job-diagnose/SKILL.md +319 -0
  255. package/skills/core/volcano-job-diagnose/scripts/diagnose-job.sh +253 -0
  256. package/skills/core/volcano-node-resources/SKILL.md +334 -0
  257. package/skills/core/volcano-node-resources/scripts/get-node-resources.sh +281 -0
  258. package/skills/core/volcano-queue-diagnose/SKILL.md +294 -0
  259. package/skills/core/volcano-queue-diagnose/scripts/diagnose-queue.sh +283 -0
  260. package/skills/core/volcano-resource-insufficient/SKILL.md +315 -0
  261. package/skills/core/volcano-scheduler-config/SKILL.md +371 -0
  262. package/skills/core/volcano-scheduler-config/scripts/get-scheduler-config.sh +297 -0
  263. package/skills/core/volcano-scheduler-logs/SKILL.md +241 -0
  264. package/skills/core/volcano-scheduler-logs/scripts/get-scheduler-logs.sh +159 -0
  265. package/skills/platform/create-skill/SKILL.md +35 -3
  266. package/skills/platform/manage-skill/SKILL.md +9 -2
  267. package/skills/platform/update-skill/SKILL.md +17 -6
@@ -0,0 +1,283 @@
1
+ #!/bin/bash
2
+ # Diagnose Volcano Queue status and resource allocation.
3
+ # This script performs read-only operations using kubectl.
4
+ set -euo pipefail
5
+
6
+ if ! command -v jq &>/dev/null; then
7
+ echo "ERROR: jq is required but not installed. Install it with: apt-get install jq / brew install jq" >&2
8
+ exit 1
9
+ fi
10
+
11
+ show_help() {
12
+ cat <<EOF
13
+ Usage: $0 [options]
14
+
15
+ Diagnose Volcano Queue status, resource allocation, and scheduling bottlenecks.
16
+ Checks queue weights, deserved resources, allocated resources, and state.
17
+
18
+ Options:
19
+ --queue QUEUE Queue name to diagnose (default: all queues)
20
+ --show-pods Show PodGroups associated with each queue
21
+ --verbose Show detailed resource breakdown
22
+ -h, --help Show this help message
23
+
24
+ Examples:
25
+ $0 # Diagnose all queues
26
+ $0 --queue training-queue # Diagnose specific queue
27
+ $0 --queue training-queue --verbose --show-pods
28
+ EOF
29
+ exit 0
30
+ }
31
+
32
+ # Parse arguments
33
+ QUEUE=""
34
+ SHOW_PODS=false
35
+ VERBOSE=false
36
+
37
+ while [[ $# -gt 0 ]]; do
38
+ case $1 in
39
+ -h|--help) show_help ;;
40
+ --queue) [[ $# -ge 2 ]] || { echo "Error: --queue requires a queue name. Use --help for usage." >&2; exit 1; }; QUEUE="$2"; shift 2 ;; # guard: with set -u a bare --queue would abort on the unset $2 with an "unbound variable" error
41
+ --show-pods) SHOW_PODS=true; shift ;;
42
+ --verbose) VERBOSE=true; shift ;;
43
+ *) echo "Unknown option: $1. Use --help for usage." >&2; exit 1 ;;
44
+ esac
45
+ done
46
+
47
+ echo "=== Volcano Queue Diagnosis ==="
48
+ [[ -n "$QUEUE" ]] && echo "Queue: $QUEUE"
49
+ echo
50
+
51
+ # Function to diagnose a single queue
52
+ diagnose_queue() {
53
+ local q="$1"
54
+
55
+ echo "Queue: $q"
56
+ echo "=================="
57
+
58
+ # Get queue details
59
+ local weight state pending running deserved allocated
60
+ weight=$(kubectl get queue "$q" -o jsonpath='{.spec.weight}' 2>/dev/null || echo "N/A")
61
+ state=$(kubectl get queue "$q" -o jsonpath='{.status.state}' 2>/dev/null || echo "Unknown")
62
+ pending=$(kubectl get queue "$q" -o jsonpath='{.status.pending}' 2>/dev/null || echo "0")
63
+ running=$(kubectl get queue "$q" -o jsonpath='{.status.running}' 2>/dev/null || echo "0")
64
+
65
+ echo " Weight: $weight"
66
+ echo " State: $state"
67
+ echo " Pending PodGroups: $pending"
68
+ echo " Running PodGroups: $running"
69
+
70
+ # State warnings
71
+ if [[ "$state" == "Closed" ]]; then
72
+ echo " ⚠️ WARNING: Queue is CLOSED - new jobs will be rejected"
73
+ elif [[ "$state" == "Closing" ]]; then
74
+ echo " ⚠️ WARNING: Queue is CLOSING - will not accept new jobs soon"
75
+ fi
76
+
77
+ # Get resource info
78
+ local deserved_cpu deserved_mem allocated_cpu allocated_mem
79
+ deserved_cpu=$(kubectl get queue "$q" -o jsonpath='{.status.deserved.cpu}' 2>/dev/null || echo "")
80
+ deserved_mem=$(kubectl get queue "$q" -o jsonpath='{.status.deserved.memory}' 2>/dev/null || echo "")
81
+ allocated_cpu=$(kubectl get queue "$q" -o jsonpath='{.status.allocated.cpu}' 2>/dev/null || echo "")
82
+ allocated_mem=$(kubectl get queue "$q" -o jsonpath='{.status.allocated.memory}' 2>/dev/null || echo "")
83
+
84
+ # Handle empty values
85
+ [[ -z "$deserved_cpu" ]] && deserved_cpu="0"
86
+ [[ -z "$deserved_mem" ]] && deserved_mem="0"
87
+ [[ -z "$allocated_cpu" ]] && allocated_cpu="0"
88
+ [[ -z "$allocated_mem" ]] && allocated_mem="0"
89
+
90
+ echo
91
+ echo " Resources:"
92
+ echo " CPU:"
93
+ echo " Deserved: $deserved_cpu"
94
+ echo " Allocated: $allocated_cpu"
95
+
96
+ # Calculate CPU ratio if possible
97
+ if [[ "$deserved_cpu" =~ ^[0-9]+\.?[0-9]*$ && "$allocated_cpu" =~ ^[0-9]+\.?[0-9]*$ ]]; then
98
+ if awk "BEGIN {exit !($deserved_cpu > 0)}" 2>/dev/null; then
99
+ local cpu_ratio
100
+ cpu_ratio=$(awk "BEGIN {printf \"%.1f\", $allocated_cpu * 100 / $deserved_cpu}" 2>/dev/null || echo "N/A")
101
+ echo " Ratio: ${cpu_ratio}%"
102
+
103
+ if awk "BEGIN {exit !($allocated_cpu > $deserved_cpu)}" 2>/dev/null; then
104
+ echo " ⚠️ OVER-ALLOCATED: Queue using more than deserved"
105
+ elif [[ "$cpu_ratio" != "N/A" ]] && awk "BEGIN {exit !($cpu_ratio >= 90)}" 2>/dev/null; then
106
+ echo " ⚠️ NEAR CAPACITY: ${cpu_ratio}% of deserved resources used"
107
+ fi
108
+ fi
109
+ fi
110
+
111
+ echo " Memory:"
112
+ echo " Deserved: $deserved_mem"
113
+ echo " Allocated: $allocated_mem"
114
+
115
+ # Check capability if set
116
+ local cap_cpu cap_mem
117
+ cap_cpu=$(kubectl get queue "$q" -o jsonpath='{.spec.capability.cpu}' 2>/dev/null || echo "")
118
+ cap_mem=$(kubectl get queue "$q" -o jsonpath='{.spec.capability.memory}' 2>/dev/null || echo "")
119
+
120
+ if [[ -n "$cap_cpu" || -n "$cap_mem" ]]; then
121
+ echo " Capability (max allowed):"
122
+ [[ -n "$cap_cpu" ]] && echo " CPU: $cap_cpu"
123
+ [[ -n "$cap_mem" ]] && echo " Memory: $cap_mem"
124
+ fi
125
+
126
+ # Check reclaimable
127
+ local reclaimable
128
+ reclaimable=$(kubectl get queue "$q" -o jsonpath='{.spec.reclaimable}' 2>/dev/null || echo "true")
129
+ [[ "$reclaimable" != "false" ]] && reclaimable="true"
130
+ echo " Reclaimable: $reclaimable"
131
+ [[ "$reclaimable" == "false" ]] && echo " ℹ️ Resources cannot be reclaimed from this queue"
132
+
133
+ # Show PodGroups if requested
134
+ if [[ "$SHOW_PODS" == "true" ]]; then
135
+ echo
136
+ echo " PodGroups in this Queue:"
137
+ local pgs
138
+ pgs=$(kubectl get podgroups --all-namespaces -o json 2>/dev/null | \
139
+ jq -r --arg q "$q" '.items[] | select(.spec.queue==$q) | " \(.metadata.namespace)/\(.metadata.name): \(.status.phase // \"Unknown\")"' 2>/dev/null || echo "")
140
+
141
+ if [[ -n "$pgs" ]]; then
142
+ echo "$pgs"
143
+ else
144
+ echo " No PodGroups found"
145
+ fi
146
+
147
+ # Show pending count specifically
148
+ local pending_pgs
149
+ pending_pgs=$(kubectl get podgroups --all-namespaces -o json 2>/dev/null | \
150
+ jq -r --arg q "$q" '.items[] | select(.spec.queue==$q and .status.phase=="Pending") | "\(.metadata.namespace)/\(.metadata.name)"' 2>/dev/null || echo "")
151
+
152
+ if [[ -n "$pending_pgs" ]]; then
153
+ echo
154
+ echo " ⚠️ Pending PodGroups:"
155
+ echo "$pending_pgs" | while read -r pg; do
156
+ echo " - $pg"
157
+ done
158
+ fi
159
+ fi
160
+
161
+ # Verbose output
162
+ if [[ "$VERBOSE" == "true" ]]; then
163
+ echo
164
+ echo " Raw Queue YAML:"
165
+ kubectl get queue "$q" -o yaml 2>/dev/null | sed 's/^/ /'
166
+ fi
167
+
168
+ echo
169
+ }
170
+
171
+ # Main logic
172
+ if [[ -n "$QUEUE" ]]; then
173
+ # Diagnose specific queue
174
+ if ! kubectl get queue "$QUEUE" &>/dev/null; then
175
+ echo "Error: Queue '$QUEUE' not found" >&2
176
+ exit 1
177
+ fi
178
+ diagnose_queue "$QUEUE"
179
+ else
180
+ # Diagnose all queues
181
+ echo "[1] Listing all queues"
182
+ echo "---------------------"
183
+ kubectl get queue -o custom-columns='NAME:.metadata.name,STATE:.status.state,WEIGHT:.spec.weight,PENDING:.status.pending,RUNNING:.status.running' 2>/dev/null || {
184
+ echo "Error: Failed to list queues" >&2
185
+ exit 1
186
+ }
187
+ echo
188
+
189
+ echo "[2] Resource Allocation Summary"
190
+ echo "--------------------------------"
191
+ # Print table header
192
+ printf "%-20s %-8s %-10s %-12s %-15s %-12s\n" "QUEUE" "STATE" "WEIGHT" "CPU_RATIO" "MEM_ALLOC" "PODS(P/R)"
193
+ printf "%-20s %-8s %-10s %-12s %-15s %-12s\n" "--------------------" "--------" "----------" "------------" "---------------" "-----------"
194
+
195
+ # Helper: convert K8s CPU value (e.g. "500m", "2", "1.5") to millicores
196
+ cpu_to_milli() {
197
+ local v="$1"
198
+ if [[ "$v" =~ ^([0-9]+)m$ ]]; then
199
+ echo "${BASH_REMATCH[1]}"
200
+ elif [[ "$v" =~ ^[0-9]+\.?[0-9]*$ ]]; then
201
+ awk "BEGIN {printf \"%.0f\", $v * 1000}" 2>/dev/null || echo "0"
202
+ else
203
+ echo "0"
204
+ fi
205
+ }
206
+
207
+ # Get all queue names and print resource summary
208
+ kubectl get queue -o json 2>/dev/null | jq -r '.items[] |
209
+ [.metadata.name,
210
+ (.status.state // "Unknown"),
211
+ (.spec.weight // 1),
212
+ (.status.deserved.cpu // "0"),
213
+ (.status.allocated.cpu // "0"),
214
+ (.status.allocated.memory // "N/A"),
215
+ (.status.pending // 0),
216
+ (.status.running // 0)] | @tsv' 2>/dev/null | \
217
+ while IFS=$'\t' read -r name state weight deserved_cpu_raw alloc_cpu_raw mem_alloc pending running; do
218
+ ratio=0
219
+ deserved_m=$(cpu_to_milli "$deserved_cpu_raw")
220
+ alloc_m=$(cpu_to_milli "$alloc_cpu_raw")
221
+ if [[ "$deserved_m" -gt 0 ]]; then
222
+ ratio=$((alloc_m * 100 / deserved_m))
223
+ fi
224
+
225
+ status_indicator=""
226
+ if [[ "$state" == "Closed" ]]; then
227
+ status_indicator="🚫"
228
+ elif [[ "$state" == "Closing" ]]; then
229
+ status_indicator="⚠️"
230
+ elif [[ "$ratio" -ge 90 ]]; then
231
+ status_indicator="🔴"
232
+ elif [[ "$ratio" -ge 75 ]]; then
233
+ status_indicator="🟡"
234
+ fi
235
+
236
+ printf "%-20s %-8s %-10s %-12s %-15s %-12s %s\n" \
237
+ "$name" "$state" "${weight:-1}" "${ratio}%" "${mem_alloc:-N/A}" "${pending:-0}/${running:-0}" "$status_indicator"
238
+ done
239
+ echo
240
+ echo "Legend: 🚫=Closed ⚠️=Closing 🔴=>=90% 🟡=>=75%"
241
+ echo
242
+
243
+ echo "[3] Detailed Queue Analysis"
244
+ echo "--------------------------"
245
+ # Get all queue names
246
+ kubectl get queue -o jsonpath='{.items[*].metadata.name}' 2>/dev/null | tr ' ' '\n' | while read -r q; do
247
+ [[ -n "$q" ]] && diagnose_queue "$q"
248
+ done
249
+ fi
250
+
251
+ # Summary
252
+ echo "=== Diagnosis Summary ==="
253
+
254
+ # Count queues by state
255
+ total=0
256
+ open_count=0
257
+ closed_count=0
258
+ closing_count=0
259
+
260
+ while read -r q || [[ -n "$q" ]]; do # the name list has no trailing newline; without the || test the last queue is never counted
261
+ [[ -z "$q" ]] && continue
262
+ total=$((total + 1))
263
+ state=$(kubectl get queue "$q" -o jsonpath='{.status.state}' 2>/dev/null || echo "Unknown")
264
+ [[ "$state" == "Open" ]] && open_count=$((open_count + 1))
265
+ [[ "$state" == "Closed" ]] && closed_count=$((closed_count + 1))
266
+ [[ "$state" == "Closing" ]] && closing_count=$((closing_count + 1))
267
+ done < <(kubectl get queue -o jsonpath='{.items[*].metadata.name}' 2>/dev/null | tr ' ' '\n')
268
+
269
+ echo "Total Queues: $total"
270
+ echo " Open: $open_count"
271
+ echo " Closed: $closed_count"
272
+ echo " Closing: $closing_count"
273
+
274
+ # Find queues with high pending
275
+ high_pending=$(kubectl get queue -o json 2>/dev/null | jq -r '.items[] | select(.status.pending > 5) | "\(.metadata.name) (\(.status.pending) pending)"' 2>/dev/null || echo "")
276
+ if [[ -n "$high_pending" ]]; then
277
+ echo
278
+ echo "⚠️ Queues with high pending (>5):"
279
+ echo "$high_pending" | sed 's/^/ - /'
280
+ fi
281
+
282
+ echo
283
+ echo "=== Diagnosis Complete ==="
@@ -0,0 +1,315 @@
1
+ ---
2
+ name: volcano-resource-insufficient
3
+ description: >-
4
+ Resource shortage diagnostic guide for Volcano.
5
+ Use when seeing Insufficient cpu/memory events, OOMKilled pods,
6
+ or nodes with zero allocatable resources.
7
+ ---
8
+
9
+ # Resource Insufficiency Diagnosis
10
+
11
+ This guide helps diagnose resource shortage issues in Volcano-scheduled workloads. Resource insufficiency is one of the most common causes of scheduling failures.
12
+
13
+ **Scope:** This skill is for **diagnosis only**. Once you identify the root cause, report it to the user and stop. Do NOT attempt to modify resource quotas or delete workloads.
14
+
15
+ ## When to Use This Guide
16
+
17
+ Use this skill when:
18
+ - Events show `Insufficient cpu` or `Insufficient memory`
19
+ - Pods are stuck in `Pending` with resource-related events
20
+ - Nodes show zero allocatable resources
21
+ - Pods are being `OOMKilled` (Out of Memory)
22
+ - `FailedScheduling` events mention resource constraints
23
+
24
+ ## Types of Resource Issues
25
+
26
+ ### 1. Cluster-Wide Resource Exhaustion
27
+ The entire cluster lacks sufficient resources to meet the workload demands.
28
+
29
+ ### 2. Resource Fragmentation
30
+ Total resources exist but are distributed across too many nodes to satisfy specific scheduling constraints (like Gang scheduling).
31
+
32
+ ### 3. Per-Node Resource Shortage
33
+ Individual nodes lack enough resources, even though the cluster as a whole has capacity.
34
+
35
+ ### 4. Queue Resource Limits
36
+ The Queue has reached its deserved resource limit, preventing new pods from being scheduled.
37
+
38
+ ## Diagnostic Steps
39
+
40
+ ### Step 1: Identify Resource Shortage Type
41
+
42
+ Check the specific error message in events:
43
+
44
+ ```bash
45
+ kubectl get events -n <namespace> --field-selector involvedObject.name=<pod-name> --sort-by='.lastTimestamp'
46
+ ```
47
+
48
+ **Common error patterns:**
49
+
50
+ | Error Message | Resource Type | Scope |
51
+ |--------------|---------------|-------|
52
+ | `Insufficient cpu` | CPU | Node-level |
53
+ | `Insufficient memory` | Memory | Node-level |
54
+ | `Insufficient nvidia.com/gpu` | GPU | Node-level |
55
+ | `0/N nodes are available` | General | Cluster-level |
56
+ | `exceeded quota` | Queue quota | Queue-level |
57
+
58
+ ### Step 2: Check Pod Resource Requests
59
+
60
+ Determine how many resources the pod is requesting:
61
+
62
+ ```bash
63
+ kubectl get pod <pod-name> -n <namespace> -o jsonpath='{.spec.containers[*].resources.requests}'
64
+ ```
65
+
66
+ For detailed breakdown:
67
+ ```bash
68
+ kubectl get pod <pod-name> -n <namespace> -o yaml | grep -A 10 "resources:"
69
+ ```
70
+
71
+ **Key fields:**
72
+ - `resources.requests.cpu` - CPU cores requested (e.g., "100m" = 0.1 core, "2" = 2 cores)
73
+ - `resources.requests.memory` - Memory requested (e.g., "1Gi", "512Mi")
74
+ - `resources.requests.nvidia.com/gpu` - GPUs requested
75
+ - `resources.limits` - Maximum allowed (may be different from requests)
76
+
77
+ ### Step 3: Check Node Allocatable Resources
78
+
79
+ View total allocatable resources per node:
80
+
81
+ ```bash
82
+ kubectl get nodes -o custom-columns='NAME:.metadata.name,CPU:.status.allocatable.cpu,MEM:.status.allocatable.memory,GPU:.status.allocatable.nvidia\.com/gpu,PODS:.status.allocatable.pods'
83
+ ```
84
+
85
+ For detailed node information:
86
+ ```bash
87
+ kubectl describe node <node-name>
88
+ ```
89
+
90
+ **Key concepts:**
91
+ - `allocatable` = Total capacity - System reserved - Kubelet reserved
92
+ - `capacity` = Total hardware capacity
93
+ - The difference is reserved for system/Kubernetes daemons
94
+
95
+ ### Step 4: Check Current Resource Usage
96
+
97
+ If metrics-server is available:
98
+
99
+ ```bash
100
+ kubectl top nodes
101
+ ```
102
+
103
+ For the top resource-consuming pods across all namespaces:
104
+ ```bash
105
+ kubectl top pods --all-namespaces --sort-by=cpu | head -20
106
+ kubectl top pods --all-namespaces --sort-by=memory | head -20
107
+ ```
108
+
109
+ **Note:** If metrics-server is not available, you can still see resource allocation (requests) but not actual usage.
110
+
111
+ ### Step 5: Calculate Resource Availability
112
+
113
+ For each node, calculate available resources:
114
+
115
+ ```
116
+ Available CPU = allocatable.cpu - sum(all pod requests on node)
117
+ Available Memory = allocatable.memory - sum(all pod requests on node)
118
+ ```
119
+
120
+ Quick check with:
121
+ ```bash
122
+ kubectl describe node <node-name> | grep -A 20 "Allocated resources"
123
+ ```
124
+
125
+ **Look for:**
126
+ - `cpu` Requests relative to the node's allocatable CPU
127
+ - `memory` Requests relative to the node's allocatable memory
128
+ - Percentage of allocation (high % = resource pressure)
129
+
130
+ ### Step 6: Check for Resource Fragmentation
131
+
132
+ For Gang scheduling or affinity constraints, fragmentation is critical:
133
+
134
+ ```bash
135
+ # Count nodes whose allocatable CPU and memory can fit a single pod (handles "m" CPU and Ki/Mi/Gi/plain-byte memory values)
136
+ NODE_CPU_REQ="4" # cores
137
+ NODE_MEM_REQ="8" # GiB
138
+
139
+ kubectl get nodes -o json | jq -r --argjson cpu "$NODE_CPU_REQ" --argjson mem "$NODE_MEM_REQ" '
140
+ .items[] |
141
+ select((.status.allocatable.cpu | if endswith("m") then (rtrimstr("m") | tonumber) / 1000 else tonumber end) >= $cpu) |
142
+ select((.status.allocatable.memory | if endswith("Ki") then (rtrimstr("Ki") | tonumber) / 1048576 elif endswith("Mi") then (rtrimstr("Mi") | tonumber) / 1024 elif endswith("Gi") then (rtrimstr("Gi") | tonumber) else (tonumber / 1073741824) end) >= $mem) |
143
+ .metadata.name'
144
+ ```
145
+
146
+ **Fragmentation indicators:**
147
+ - Many nodes with small amounts of free resources
148
+ - No single node can satisfy the pod's resource needs
149
+ - Total cluster resources sufficient but poorly distributed
150
+
151
+ ## Common Scenarios
152
+
153
+ ### Scenario 1: Pod Requests Exceed Any Single Node
154
+
155
+ **Symptom:** `Insufficient cpu` or `Insufficient memory` on all nodes
156
+
157
+ **Diagnosis:**
158
+ ```bash
159
+ # Check pod request
160
+ kubectl get pod <pod> -o jsonpath='{.spec.containers[0].resources.requests.cpu}'
161
+ # Output: 32
162
+
163
+ # Check largest node's allocatable
164
+ kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.allocatable.cpu}{"\n"}{end}' | sort -k2 -n | tail -1
165
+ # Output: node-1 16
166
+ ```
167
+
168
+ **Analysis:** Pod requests 32 CPUs, but largest node only has 16 allocatable.
169
+
170
+ **Solution:**
171
+ - Reduce pod resource requests (if actual usage is lower)
172
+ - Add larger nodes to cluster
173
+ - Use node pool with bigger instances
174
+
175
+ ### Scenario 2: Cluster at Capacity
176
+
177
+ **Symptom:** Most nodes show high allocation percentage
178
+
179
+ **Diagnosis:**
180
+ ```bash
181
+ kubectl describe node <node-name> | grep -A 8 "Allocated resources"
182
+ # cpu-requests: 14900m (93%)
183
+ # memory-requests: 55000Mi (85%)
184
+ ```
185
+
186
+ **Analysis:** Nodes are 85-93% allocated, leaving little room for new pods.
187
+
188
+ **Solution:**
189
+ - Scale cluster (add more nodes)
190
+ - Review and optimize resource requests (may be over-provisioned)
191
+ - Consider cluster autoscaler for dynamic scaling
192
+
193
+ ### Scenario 3: Resource Fragmentation
194
+
195
+ **Symptom:** Gang scheduling fails despite total resources being sufficient
196
+
197
+ **Diagnosis:**
198
+ ```bash
199
+ # Total cluster CPU
200
+ kubectl get nodes -o jsonpath='{range .items[*]}{.status.allocatable.cpu}{"\n"}{end}' | awk '{sum+=$1} END {print sum}'
201
+ # Output: 64
202
+
203
+ # Available nodes for 4-CPU pods
204
+ kubectl get nodes --no-headers -o custom-columns='NAME:.metadata.name,CPU:.status.allocatable.cpu' | awk '$2 >= 4 {count++} END {printf "%d nodes can fit the pod\n", count}'
205
+ # Output: 2 nodes can fit the pod
206
+
207
+ # But we need 8 pods for Gang
208
+ # 2 < 8, so Gang fails
209
+ ```
210
+
211
+ **Analysis:** Total cluster has 64 CPUs, but only 2 nodes have 4+ CPUs. Gang needs 8 pods.
212
+
213
+ **Solution:**
214
+ - Enable `binpack` plugin to concentrate pods
215
+ - Defragment by draining and rebalancing nodes
216
+ - Use larger nodes to reduce fragmentation
217
+
218
+ ### Scenario 4: Queue Resource Exhaustion
219
+
220
+ **Symptom:** Events mention queue limits, PodGroup stays in Pending
221
+
222
+ **Diagnosis:**
223
+ ```bash
224
+ # Check queue status
225
+ kubectl get queue <queue-name> -o yaml
226
+ ```
227
+
228
+ **Look for:**
229
+ - `status.allocated` >= `status.deserved`
230
+ - `state` is `Open` but no capacity available
231
+
232
+ **Analysis:** Queue has used all its deserved resources.
233
+
234
+ **Solution:**
235
+ - Increase queue weight or capability
236
+ - Wait for other jobs to complete
237
+ - Use `volcano-queue-diagnose` for detailed analysis
238
+
239
+ ### Scenario 5: GPU Resource Shortage
240
+
241
+ **Symptom:** `Insufficient nvidia.com/gpu` in events
242
+
243
+ **Diagnosis:**
244
+ ```bash
245
+ # Check GPU allocatable
246
+ kubectl get nodes -o custom-columns='NAME:.metadata.name,GPU:.status.allocatable.nvidia\.com/gpu'
247
+
248
+ # Check GPU usage (if metrics available)
249
+ kubectl top nodes --show-capacity 2>/dev/null || echo "GPU metrics not available"
250
+ ```
251
+
252
+ **Analysis:** GPU resources are fully allocated or fragmented across nodes.
253
+
254
+ **Solution:**
255
+ - Verify GPU device plugin is running
256
+ - Check if GPUs are properly allocatable on nodes
257
+ - Consider GPU sharing if workload supports it
258
+
259
+ ## Resource Calculation Examples
260
+
261
+ ### Example 1: Calculate Total Cluster Capacity
262
+
263
+ ```bash
264
+ kubectl get nodes -o json | jq -r '
265
+ .items |
266
+ map(.status.allocatable) |
267
+ reduce .[] as $item ({};
268
+ . + {cpu: ((.cpu // 0 | tonumber) + ($item.cpu | tonumber)),
269
+ memory: ((.memory // 0) + ($item.memory | tonumber))})'
270
+ ```
271
+
272
+ ### Example 2: Find Pods with High Resource Requests
273
+
274
+ ```bash
275
+ kubectl get pods --all-namespaces -o json | jq -r '
276
+ .items[] |
277
+ select(any(.spec.containers[]; (.resources.requests.cpu // "0") | if endswith("m") then (rtrimstr("m") | tonumber) / 1000 else tonumber end > 4)) |
278
+ "\(.metadata.namespace)/\(.metadata.name): \(.spec.containers[].resources.requests)"'
279
+ ```
280
+
281
+ ### Example 3: Check Resource Utilization vs Request
282
+
283
+ ```bash
284
+ # High-level view
285
+ kubectl get nodes -o custom-columns='NAME:.metadata.name,CPU_ALLOC:.status.allocatable.cpu,MEM_ALLOC:.status.allocatable.memory'
286
+ ```
287
+
288
+ ## Prevention and Best Practices
289
+
290
+ 1. **Right-size resource requests**
291
+ - Set requests based on actual usage, not maximum possible
292
+ - Use Vertical Pod Autoscaler (VPA) for recommendations
293
+
294
+ 2. **Use cluster autoscaler**
295
+ - Automatically scale nodes based on pending pod demands
296
+ - Configure appropriate node pools for different workloads
297
+
298
+ 3. **Enable binpack plugin**
299
+ - Reduces fragmentation by concentrating pods
300
+ - Better for batch workloads
301
+
302
+ 4. **Monitor resource quotas**
303
+ - Set up alerts for queue resource exhaustion
304
+ - Use `volcano-queue-diagnose` proactively
305
+
306
+ 5. **Regular capacity planning**
307
+ - Track resource growth trends
308
+ - Plan cluster expansion before hitting capacity
309
+
310
+ ## See Also
311
+
312
+ - `volcano-diagnose-pod` - General Pod scheduling diagnosis
313
+ - `volcano-gang-scheduling` - Gang scheduling constraint issues
314
+ - `volcano-queue-diagnose` - Queue resource analysis
315
+ - `volcano-node-resources` - Node resource querying