siclaw 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +75 -114
- package/dist/agentbox/gateway-client.d.ts +2 -1
- package/dist/agentbox/gateway-client.js +6 -2
- package/dist/agentbox/gateway-client.js.map +1 -1
- package/dist/agentbox/http-server.js +184 -19
- package/dist/agentbox/http-server.js.map +1 -1
- package/dist/agentbox/resource-handlers.d.ts +1 -0
- package/dist/agentbox/resource-handlers.js +23 -23
- package/dist/agentbox/resource-handlers.js.map +1 -1
- package/dist/agentbox/session.js +85 -5
- package/dist/agentbox/session.js.map +1 -1
- package/dist/agentbox-main.d.ts +2 -1
- package/dist/agentbox-main.js +65 -18
- package/dist/agentbox-main.js.map +1 -1
- package/dist/cli-credentials.d.ts +1 -0
- package/dist/cli-credentials.js +109 -0
- package/dist/cli-credentials.js.map +1 -0
- package/dist/cli-first-run.d.ts +11 -0
- package/dist/cli-first-run.js +99 -0
- package/dist/cli-first-run.js.map +1 -0
- package/dist/cli-main.js +33 -11
- package/dist/cli-main.js.map +1 -1
- package/dist/cli-setup.d.ts +5 -11
- package/dist/cli-setup.js +12 -225
- package/dist/cli-setup.js.map +1 -1
- package/dist/core/agent-factory.d.ts +4 -0
- package/dist/core/agent-factory.js +102 -151
- package/dist/core/agent-factory.js.map +1 -1
- package/dist/core/config.d.ts +10 -3
- package/dist/core/config.js +11 -95
- package/dist/core/config.js.map +1 -1
- package/dist/core/extensions/deep-investigation.d.ts +2 -1
- package/dist/core/extensions/deep-investigation.js +144 -24
- package/dist/core/extensions/deep-investigation.js.map +1 -1
- package/dist/core/extensions/setup.d.ts +8 -0
- package/dist/core/extensions/setup.js +669 -0
- package/dist/core/extensions/setup.js.map +1 -0
- package/dist/core/llm-proxy.js +7 -3
- package/dist/core/llm-proxy.js.map +1 -1
- package/dist/core/mcp-client.d.ts +0 -10
- package/dist/core/mcp-client.js +0 -65
- package/dist/core/mcp-client.js.map +1 -1
- package/dist/core/prompt.d.ts +1 -1
- package/dist/core/prompt.js +42 -5
- package/dist/core/prompt.js.map +1 -1
- package/dist/core/provider-presets.d.ts +14 -0
- package/dist/core/provider-presets.js +81 -0
- package/dist/core/provider-presets.js.map +1 -0
- package/dist/cron/cron-coordinator.d.ts +2 -0
- package/dist/cron/cron-coordinator.js +46 -14
- package/dist/cron/cron-coordinator.js.map +1 -1
- package/dist/cron/cron-executor.js +33 -8
- package/dist/cron/cron-executor.js.map +1 -1
- package/dist/cron/cron-scheduler.d.ts +1 -1
- package/dist/cron/gateway-client.d.ts +5 -0
- package/dist/cron/gateway-client.js +43 -8
- package/dist/cron/gateway-client.js.map +1 -1
- package/dist/cron-main.js +39 -9
- package/dist/cron-main.js.map +1 -1
- package/dist/gateway/agentbox/client.d.ts +11 -0
- package/dist/gateway/agentbox/client.js +18 -0
- package/dist/gateway/agentbox/client.js.map +1 -1
- package/dist/gateway/agentbox/k8s-spawner.d.ts +11 -2
- package/dist/gateway/agentbox/k8s-spawner.js +95 -52
- package/dist/gateway/agentbox/k8s-spawner.js.map +1 -1
- package/dist/gateway/agentbox/local-spawner.d.ts +1 -1
- package/dist/gateway/agentbox/local-spawner.js +4 -2
- package/dist/gateway/agentbox/local-spawner.js.map +1 -1
- package/dist/gateway/agentbox/manager.d.ts +0 -10
- package/dist/gateway/agentbox/manager.js +11 -30
- package/dist/gateway/agentbox/manager.js.map +1 -1
- package/dist/gateway/agentbox/types.d.ts +6 -4
- package/dist/gateway/cron/cron-service.d.ts +49 -0
- package/dist/gateway/cron/cron-service.js +259 -0
- package/dist/gateway/cron/cron-service.js.map +1 -0
- package/dist/gateway/db/init-schema.js +44 -0
- package/dist/gateway/db/init-schema.js.map +1 -1
- package/dist/gateway/db/migrate-sqlite.js +73 -4
- package/dist/gateway/db/migrate-sqlite.js.map +1 -1
- package/dist/gateway/db/repositories/chat-repo.d.ts +56 -2
- package/dist/gateway/db/repositories/chat-repo.js +132 -2
- package/dist/gateway/db/repositories/chat-repo.js.map +1 -1
- package/dist/gateway/db/repositories/config-repo.d.ts +31 -2
- package/dist/gateway/db/repositories/config-repo.js +57 -7
- package/dist/gateway/db/repositories/config-repo.js.map +1 -1
- package/dist/gateway/db/repositories/env-repo.d.ts +14 -0
- package/dist/gateway/db/repositories/env-repo.js +15 -2
- package/dist/gateway/db/repositories/env-repo.js.map +1 -1
- package/dist/gateway/db/repositories/model-config-repo.d.ts +1 -1
- package/dist/gateway/db/repositories/model-config-repo.js +26 -12
- package/dist/gateway/db/repositories/model-config-repo.js.map +1 -1
- package/dist/gateway/db/repositories/skill-repo.d.ts +0 -5
- package/dist/gateway/db/repositories/skill-review-repo.d.ts +1 -0
- package/dist/gateway/db/repositories/skill-review-repo.js +4 -1
- package/dist/gateway/db/repositories/skill-review-repo.js.map +1 -1
- package/dist/gateway/db/repositories/skill-version-repo.js +0 -1
- package/dist/gateway/db/repositories/skill-version-repo.js.map +1 -1
- package/dist/gateway/db/repositories/system-config-repo.d.ts +1 -1
- package/dist/gateway/db/repositories/system-config-repo.js +2 -1
- package/dist/gateway/db/repositories/system-config-repo.js.map +1 -1
- package/dist/gateway/db/repositories/user-env-config-repo.d.ts +13 -0
- package/dist/gateway/db/repositories/user-env-config-repo.js +11 -0
- package/dist/gateway/db/repositories/user-env-config-repo.js.map +1 -1
- package/dist/gateway/db/repositories/workspace-repo.d.ts +3 -2
- package/dist/gateway/db/repositories/workspace-repo.js +6 -2
- package/dist/gateway/db/repositories/workspace-repo.js.map +1 -1
- package/dist/gateway/db/schema-mysql.d.ts +473 -51
- package/dist/gateway/db/schema-mysql.js +35 -4
- package/dist/gateway/db/schema-mysql.js.map +1 -1
- package/dist/gateway/db/schema-sqlite.d.ts +522 -57
- package/dist/gateway/db/schema-sqlite.js +38 -6
- package/dist/gateway/db/schema-sqlite.js.map +1 -1
- package/dist/gateway/db/schema.d.ts +471 -51
- package/dist/gateway/db/schema.js +1 -1
- package/dist/gateway/db/schema.js.map +1 -1
- package/dist/gateway/metrics-aggregator.d.ts +65 -0
- package/dist/gateway/metrics-aggregator.js +244 -0
- package/dist/gateway/metrics-aggregator.js.map +1 -0
- package/dist/gateway/plugins/channel-bridge.d.ts +4 -1
- package/dist/gateway/plugins/channel-bridge.js +78 -86
- package/dist/gateway/plugins/channel-bridge.js.map +1 -1
- package/dist/gateway/rpc-methods.d.ts +4 -2
- package/dist/gateway/rpc-methods.js +962 -163
- package/dist/gateway/rpc-methods.js.map +1 -1
- package/dist/gateway/security/cert-manager.d.ts +2 -2
- package/dist/gateway/security/cert-manager.js +4 -2
- package/dist/gateway/security/cert-manager.js.map +1 -1
- package/dist/gateway/server.d.ts +4 -8
- package/dist/gateway/server.js +297 -261
- package/dist/gateway/server.js.map +1 -1
- package/dist/gateway/skills/file-writer.js +17 -11
- package/dist/gateway/skills/file-writer.js.map +1 -1
- package/dist/gateway/skills/script-evaluator.js +12 -9
- package/dist/gateway/skills/script-evaluator.js.map +1 -1
- package/dist/gateway/web/dist/assets/index-0p17ZeTP.js +740 -0
- package/dist/gateway/web/dist/assets/index-9eP6nPUq.js +741 -0
- package/dist/gateway/web/dist/assets/index-9eP6nPUq.js.map +1 -0
- package/dist/gateway/web/dist/assets/index-CAmSY91d.js +675 -0
- package/dist/gateway/web/dist/assets/index-DMFEh8Pp.css +1 -0
- package/dist/gateway/web/dist/assets/index-DyowBCEj.css +1 -0
- package/dist/gateway/web/dist/assets/index-PDK5JJDO.css +1 -0
- package/dist/gateway/web/dist/index.html +2 -2
- package/dist/gateway-main.js +27 -10
- package/dist/gateway-main.js.map +1 -1
- package/dist/memory/embeddings.js +5 -4
- package/dist/memory/embeddings.js.map +1 -1
- package/dist/memory/indexer.d.ts +23 -3
- package/dist/memory/indexer.js +235 -23
- package/dist/memory/indexer.js.map +1 -1
- package/dist/memory/schema.js +15 -1
- package/dist/memory/schema.js.map +1 -1
- package/dist/memory/types.d.ts +18 -0
- package/dist/memory/types.js +6 -1
- package/dist/memory/types.js.map +1 -1
- package/dist/shared/detect-language.d.ts +12 -0
- package/dist/shared/detect-language.js +78 -0
- package/dist/shared/detect-language.js.map +1 -0
- package/dist/shared/diagnostic-events.d.ts +70 -0
- package/dist/shared/diagnostic-events.js +38 -0
- package/dist/shared/diagnostic-events.js.map +1 -0
- package/dist/shared/local-collector.d.ts +56 -0
- package/dist/shared/local-collector.js +284 -0
- package/dist/shared/local-collector.js.map +1 -0
- package/dist/shared/metrics-types.d.ts +64 -0
- package/dist/shared/metrics-types.js +25 -0
- package/dist/shared/metrics-types.js.map +1 -0
- package/dist/shared/metrics.d.ts +19 -0
- package/dist/shared/metrics.js +185 -0
- package/dist/shared/metrics.js.map +1 -0
- package/dist/shared/path-utils.d.ts +15 -0
- package/dist/shared/path-utils.js +23 -0
- package/dist/shared/path-utils.js.map +1 -0
- package/dist/shared/retry.d.ts +35 -0
- package/dist/shared/retry.js +61 -0
- package/dist/shared/retry.js.map +1 -0
- package/dist/tools/command-sets.d.ts +18 -2
- package/dist/tools/command-sets.js +207 -32
- package/dist/tools/command-sets.js.map +1 -1
- package/dist/tools/command-validator.d.ts +56 -0
- package/dist/tools/command-validator.js +357 -0
- package/dist/tools/command-validator.js.map +1 -0
- package/dist/tools/create-skill.js +26 -1
- package/dist/tools/create-skill.js.map +1 -1
- package/dist/tools/credential-list.js +1 -23
- package/dist/tools/credential-list.js.map +1 -1
- package/dist/tools/credential-manager.d.ts +98 -0
- package/dist/tools/credential-manager.js +313 -0
- package/dist/tools/credential-manager.js.map +1 -0
- package/dist/tools/deep-search/engine.js +184 -127
- package/dist/tools/deep-search/engine.js.map +1 -1
- package/dist/tools/deep-search/prompts.d.ts +10 -2
- package/dist/tools/deep-search/prompts.js +37 -36
- package/dist/tools/deep-search/prompts.js.map +1 -1
- package/dist/tools/deep-search/schemas.d.ts +87 -0
- package/dist/tools/deep-search/schemas.js +85 -0
- package/dist/tools/deep-search/schemas.js.map +1 -0
- package/dist/tools/deep-search/sub-agent.d.ts +21 -0
- package/dist/tools/deep-search/sub-agent.js +153 -4
- package/dist/tools/deep-search/sub-agent.js.map +1 -1
- package/dist/tools/deep-search/tool.js +1 -0
- package/dist/tools/deep-search/tool.js.map +1 -1
- package/dist/tools/deep-search/types.d.ts +2 -0
- package/dist/tools/deep-search/types.js.map +1 -1
- package/dist/tools/dp-tools.js +29 -5
- package/dist/tools/dp-tools.js.map +1 -1
- package/dist/tools/exec-utils.d.ts +85 -0
- package/dist/tools/exec-utils.js +294 -0
- package/dist/tools/exec-utils.js.map +1 -0
- package/dist/tools/fork-skill.js +14 -2
- package/dist/tools/fork-skill.js.map +1 -1
- package/dist/tools/investigation-feedback.d.ts +3 -0
- package/dist/tools/investigation-feedback.js +71 -0
- package/dist/tools/investigation-feedback.js.map +1 -0
- package/dist/tools/manage-schedule.js +16 -6
- package/dist/tools/manage-schedule.js.map +1 -1
- package/dist/tools/netns-script.js +27 -281
- package/dist/tools/netns-script.js.map +1 -1
- package/dist/tools/node-exec.d.ts +2 -14
- package/dist/tools/node-exec.js +18 -225
- package/dist/tools/node-exec.js.map +1 -1
- package/dist/tools/node-script.js +14 -168
- package/dist/tools/node-script.js.map +1 -1
- package/dist/tools/pod-exec.d.ts +1 -1
- package/dist/tools/pod-exec.js +10 -26
- package/dist/tools/pod-exec.js.map +1 -1
- package/dist/tools/pod-nsenter-exec.js +21 -225
- package/dist/tools/pod-nsenter-exec.js.map +1 -1
- package/dist/tools/pod-script.js +10 -19
- package/dist/tools/pod-script.js.map +1 -1
- package/dist/tools/restricted-bash.d.ts +1 -17
- package/dist/tools/restricted-bash.js +38 -252
- package/dist/tools/restricted-bash.js.map +1 -1
- package/dist/tools/run-skill.d.ts +3 -1
- package/dist/tools/run-skill.js +21 -1
- package/dist/tools/run-skill.js.map +1 -1
- package/dist/tools/script-resolver.d.ts +3 -1
- package/dist/tools/script-resolver.js +74 -30
- package/dist/tools/script-resolver.js.map +1 -1
- package/dist/tools/update-skill.js +17 -6
- package/dist/tools/update-skill.js.map +1 -1
- package/package.json +8 -6
- package/siclaw.mjs +10 -1
- package/skills/core/cluster-events/SKILL.md +1 -1
- package/skills/core/deep-investigation/SKILL.md +11 -0
- package/skills/core/deployment-rollout-debug/SKILL.md +1 -1
- package/skills/core/dns-debug/SKILL.md +1 -0
- package/skills/core/meta.json +12 -1
- package/skills/core/networkpolicy-debug/SKILL.md +332 -0
- package/skills/core/node-logs/scripts/get-node-logs.sh +19 -9
- package/skills/core/pod-pending-debug/SKILL.md +1 -0
- package/skills/core/quota-debug/SKILL.md +203 -0
- package/skills/core/service-debug/SKILL.md +1 -0
- package/skills/core/statefulset-debug/SKILL.md +280 -0
- package/skills/core/volcano-diagnose-pod/SKILL.md +196 -0
- package/skills/core/volcano-diagnose-pod/scripts/diagnose-pod.sh +175 -0
- package/skills/core/volcano-gang-scheduling/SKILL.md +299 -0
- package/skills/core/volcano-job-diagnose/SKILL.md +319 -0
- package/skills/core/volcano-job-diagnose/scripts/diagnose-job.sh +253 -0
- package/skills/core/volcano-node-resources/SKILL.md +334 -0
- package/skills/core/volcano-node-resources/scripts/get-node-resources.sh +281 -0
- package/skills/core/volcano-queue-diagnose/SKILL.md +294 -0
- package/skills/core/volcano-queue-diagnose/scripts/diagnose-queue.sh +283 -0
- package/skills/core/volcano-resource-insufficient/SKILL.md +315 -0
- package/skills/core/volcano-scheduler-config/SKILL.md +371 -0
- package/skills/core/volcano-scheduler-config/scripts/get-scheduler-config.sh +297 -0
- package/skills/core/volcano-scheduler-logs/SKILL.md +241 -0
- package/skills/core/volcano-scheduler-logs/scripts/get-scheduler-logs.sh +159 -0
- package/skills/platform/create-skill/SKILL.md +35 -3
- package/skills/platform/manage-skill/SKILL.md +9 -2
- package/skills/platform/update-skill/SKILL.md +17 -6
|
@@ -0,0 +1,253 @@
|
|
|
1
|
+
#!/bin/bash
|
|
2
|
+
# Diagnose Volcano Job status and issues.
|
|
3
|
+
# This script performs read-only operations using kubectl.
|
|
4
|
+
set -euo pipefail
|
|
5
|
+
|
|
6
|
+
if ! command -v jq &>/dev/null; then
|
|
7
|
+
echo "ERROR: jq is required but not installed. Install it with: apt-get install jq / brew install jq" >&2
|
|
8
|
+
exit 1
|
|
9
|
+
fi
|
|
10
|
+
|
|
11
|
+
show_help() {
|
|
12
|
+
cat <<EOF
|
|
13
|
+
Usage: $0 --job <job-name> [options]
|
|
14
|
+
|
|
15
|
+
Diagnose Volcano Job (batch.volcano.sh/v1beta1) status and issues.
|
|
16
|
+
Checks Job phases, task statuses, PodGroup associations, and overall job health.
|
|
17
|
+
|
|
18
|
+
Options:
|
|
19
|
+
--job JOB Job name to diagnose (required)
|
|
20
|
+
--namespace NS Namespace (default: default)
|
|
21
|
+
--verbose Show detailed task and pod information
|
|
22
|
+
-h, --help Show this help message
|
|
23
|
+
|
|
24
|
+
Environment:
|
|
25
|
+
VOLCANO_NAMESPACE Override default namespace
|
|
26
|
+
|
|
27
|
+
Examples:
|
|
28
|
+
$0 --job my-training-job --namespace training
|
|
29
|
+
$0 --job my-training-job --namespace training --verbose
|
|
30
|
+
EOF
|
|
31
|
+
exit 0
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
# Parse arguments
|
|
35
|
+
JOB=""
|
|
36
|
+
NS="${VOLCANO_NAMESPACE:-default}"
|
|
37
|
+
VERBOSE=false
|
|
38
|
+
|
|
39
|
+
while [[ $# -gt 0 ]]; do
|
|
40
|
+
case $1 in
|
|
41
|
+
-h|--help) show_help ;;
|
|
42
|
+
--job) JOB="$2"; shift 2 ;;
|
|
43
|
+
--namespace) NS="$2"; shift 2 ;;
|
|
44
|
+
--verbose) VERBOSE=true; shift ;;
|
|
45
|
+
*) echo "Unknown option: $1. Use --help for usage." >&2; exit 1 ;;
|
|
46
|
+
esac
|
|
47
|
+
done
|
|
48
|
+
|
|
49
|
+
[[ -z "$JOB" ]] && { echo "Error: --job is required. Use --help for usage." >&2; exit 1; }
|
|
50
|
+
|
|
51
|
+
echo "=== Volcano Job Diagnosis: $NS/$JOB ==="
|
|
52
|
+
echo
|
|
53
|
+
|
|
54
|
+
# 1. Job Overview
|
|
55
|
+
echo "[1/5] Job Overview"
|
|
56
|
+
echo "------------------"
|
|
57
|
+
if ! kubectl get job.batch.volcano.sh "$JOB" -n "$NS" -o wide 2>/dev/null; then
|
|
58
|
+
echo "Error: Job '$JOB' not found in namespace '$NS'" >&2
|
|
59
|
+
exit 1
|
|
60
|
+
fi
|
|
61
|
+
echo
|
|
62
|
+
|
|
63
|
+
# Get job details. jsonpath prints an empty string (exit 0) when a status field is absent, so "|| echo" alone never fires in that case; default empty results explicitly.
|
|
64
|
+
JOB_PHASE=$(kubectl get job.batch.volcano.sh "$JOB" -n "$NS" -o jsonpath='{.status.state.phase}' 2>/dev/null || echo "Unknown"); JOB_PHASE="${JOB_PHASE:-Unknown}"
|
|
65
|
+
JOB_FAILED=$(kubectl get job.batch.volcano.sh "$JOB" -n "$NS" -o jsonpath='{.status.failed}' 2>/dev/null || echo "0"); JOB_FAILED="${JOB_FAILED:-0}"
|
|
66
|
+
JOB_SUCCEEDED=$(kubectl get job.batch.volcano.sh "$JOB" -n "$NS" -o jsonpath='{.status.succeeded}' 2>/dev/null || echo "0"); JOB_SUCCEEDED="${JOB_SUCCEEDED:-0}"
|
|
67
|
+
JOB_RUNNING=$(kubectl get job.batch.volcano.sh "$JOB" -n "$NS" -o jsonpath='{.status.running}' 2>/dev/null || echo "0"); JOB_RUNNING="${JOB_RUNNING:-0}"
|
|
68
|
+
JOB_PENDING=$(kubectl get job.batch.volcano.sh "$JOB" -n "$NS" -o jsonpath='{.status.pending}' 2>/dev/null || echo "0"); JOB_PENDING="${JOB_PENDING:-0}"
|
|
69
|
+
|
|
70
|
+
echo "Job Phase: $JOB_PHASE"
|
|
71
|
+
echo "Tasks - Failed: $JOB_FAILED, Succeeded: $JOB_SUCCEEDED, Running: $JOB_RUNNING, Pending: $JOB_PENDING"
|
|
72
|
+
echo
|
|
73
|
+
|
|
74
|
+
# Warning for problematic states
|
|
75
|
+
case "$JOB_PHASE" in
|
|
76
|
+
Failed)
|
|
77
|
+
echo "⚠️ WARNING: Job has FAILED"
|
|
78
|
+
;;
|
|
79
|
+
Pending)
|
|
80
|
+
echo "ℹ️ Job is PENDING - waiting for resources or admission"
|
|
81
|
+
;;
|
|
82
|
+
Restarting)
|
|
83
|
+
echo "⚠️ WARNING: Job is RESTARTING - check previous failure reasons"
|
|
84
|
+
;;
|
|
85
|
+
Aborted)
|
|
86
|
+
echo "⚠️ WARNING: Job was ABORTED"
|
|
87
|
+
;;
|
|
88
|
+
esac
|
|
89
|
+
|
|
90
|
+
# Check minAvailable if set
|
|
91
|
+
MIN_AVAILABLE=$(kubectl get job.batch.volcano.sh "$JOB" -n "$NS" -o jsonpath='{.spec.minAvailable}' 2>/dev/null || echo "")
|
|
92
|
+
if [[ -n "$MIN_AVAILABLE" ]]; then
|
|
93
|
+
echo "MinAvailable: $MIN_AVAILABLE (Gang constraint)"
|
|
94
|
+
fi
|
|
95
|
+
echo
|
|
96
|
+
|
|
97
|
+
# 2. Check Policies
|
|
98
|
+
echo "[2/5] Job Policies"
|
|
99
|
+
echo "------------------"
|
|
100
|
+
POLICIES=$(kubectl get job.batch.volcano.sh "$JOB" -n "$NS" -o jsonpath='{.spec.policies}' 2>/dev/null || echo "")
|
|
101
|
+
if [[ -n "$POLICIES" && "$POLICIES" != "[]" && "$POLICIES" != "null" ]]; then
|
|
102
|
+
echo "Configured Policies:"
|
|
103
|
+
kubectl get job.batch.volcano.sh "$JOB" -n "$NS" -o jsonpath='{.spec.policies}' 2>/dev/null | jq -r '.[] | " - Event: \(.event), Action: \(.action)"' 2>/dev/null || echo " (Failed to parse policies)"
|
|
104
|
+
else
|
|
105
|
+
echo "No policies configured"
|
|
106
|
+
fi
|
|
107
|
+
echo
|
|
108
|
+
|
|
109
|
+
# 3. Task Status
|
|
110
|
+
echo "[3/5] Task Status"
|
|
111
|
+
echo "-----------------"
|
|
112
|
+
|
|
113
|
+
# Initialize counters with defaults (in case PODS is empty)
|
|
114
|
+
PENDING_PODS=0
|
|
115
|
+
RUNNING_PODS=0
|
|
116
|
+
COMPLETED_PODS=0
|
|
117
|
+
FAILED_PODS=0
|
|
118
|
+
TOTAL_PODS=0
|
|
119
|
+
|
|
120
|
+
# Get all pods for this job
|
|
121
|
+
PODS=$(kubectl get pods -n "$NS" -l "volcano.sh/job-name=$JOB" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null || echo "")
|
|
122
|
+
|
|
123
|
+
if [[ -z "$PODS" ]]; then
|
|
124
|
+
echo "⚠️ No pods found for this job"
|
|
125
|
+
echo " Job may be in Pending state or pods may have been cleaned up"
|
|
126
|
+
else
|
|
127
|
+
# Count pods by phase (--no-headers avoids header line in count). grep -c prints "0" itself and exits 1 on no match, so use "|| true": echoing another 0 would yield "0\n0" and break the arithmetic below.
|
|
128
|
+
PENDING_PODS=$(kubectl get pods -n "$NS" -l "volcano.sh/job-name=$JOB" --field-selector status.phase=Pending --no-headers 2>/dev/null | grep -c . || true)
|
|
129
|
+
RUNNING_PODS=$(kubectl get pods -n "$NS" -l "volcano.sh/job-name=$JOB" --field-selector status.phase=Running --no-headers 2>/dev/null | grep -c . || true)
|
|
130
|
+
COMPLETED_PODS=$(kubectl get pods -n "$NS" -l "volcano.sh/job-name=$JOB" --field-selector status.phase=Succeeded --no-headers 2>/dev/null | grep -c . || true)
|
|
131
|
+
FAILED_PODS=$(kubectl get pods -n "$NS" -l "volcano.sh/job-name=$JOB" --field-selector status.phase=Failed --no-headers 2>/dev/null | grep -c . || true)
|
|
132
|
+
|
|
133
|
+
TOTAL_PODS=$((PENDING_PODS + RUNNING_PODS + COMPLETED_PODS + FAILED_PODS))
|
|
134
|
+
|
|
135
|
+
echo "Total Pods: $TOTAL_PODS"
|
|
136
|
+
echo " Pending: $PENDING_PODS"
|
|
137
|
+
echo " Running: $RUNNING_PODS"
|
|
138
|
+
echo " Completed: $COMPLETED_PODS"
|
|
139
|
+
echo " Failed: $FAILED_PODS"
|
|
140
|
+
echo
|
|
141
|
+
|
|
142
|
+
if [[ "$FAILED_PODS" -gt 0 ]]; then
|
|
143
|
+
echo "⚠️ Failed pods detected - check pod logs and events"
|
|
144
|
+
fi
|
|
145
|
+
|
|
146
|
+
if [[ "$PENDING_PODS" -gt 0 && "$JOB_RUNNING" -gt 0 ]]; then
|
|
147
|
+
echo "⚠️ Partial scheduling - some pods pending while others running"
|
|
148
|
+
echo " Possible Gang scheduling issue - use volcano-gang-scheduling skill"
|
|
149
|
+
fi
|
|
150
|
+
|
|
151
|
+
# Show pod details in verbose mode
|
|
152
|
+
if [[ "$VERBOSE" == "true" ]]; then
|
|
153
|
+
echo
|
|
154
|
+
echo "Pod Details:"
|
|
155
|
+
kubectl get pods -n "$NS" -l "volcano.sh/job-name=$JOB" -o custom-columns='NAME:.metadata.name,STATUS:.status.phase,RESTARTS:.status.containerStatuses[0].restartCount,NODE:.spec.nodeName,AGE:.metadata.creationTimestamp' 2>/dev/null || echo " (Failed to get pod details)"
|
|
156
|
+
fi
|
|
157
|
+
fi
|
|
158
|
+
echo
|
|
159
|
+
|
|
160
|
+
# 4. PodGroup Association
|
|
161
|
+
echo "[4/5] PodGroup Association"
|
|
162
|
+
echo "----------------------------"
|
|
163
|
+
|
|
164
|
+
# Try to find PodGroup
|
|
165
|
+
PG=$(kubectl get pods -n "$NS" -l "volcano.sh/job-name=$JOB" -o jsonpath='{.items[0].metadata.annotations.scheduling\.volcano\.sh/pod-group}' 2>/dev/null || echo "")
|
|
166
|
+
|
|
167
|
+
if [[ -n "$PG" ]]; then
|
|
168
|
+
echo "PodGroup: $PG"
|
|
169
|
+
|
|
170
|
+
if kubectl get podgroup "$PG" -n "$NS" &>/dev/null; then
|
|
171
|
+
PG_PHASE=$(kubectl get podgroup "$PG" -n "$NS" -o jsonpath='{.status.phase}' 2>/dev/null || echo "Unknown")
|
|
172
|
+
PG_MINMEMBER=$(kubectl get podgroup "$PG" -n "$NS" -o jsonpath='{.spec.minMember}' 2>/dev/null || echo "N/A")
|
|
173
|
+
PG_RUNNING=$(kubectl get podgroup "$PG" -n "$NS" -o jsonpath='{.status.running}' 2>/dev/null || echo "0")
|
|
174
|
+
PG_PENDING=$(kubectl get podgroup "$PG" -n "$NS" -o jsonpath='{.status.pending}' 2>/dev/null || echo "0")
|
|
175
|
+
|
|
176
|
+
echo "PodGroup Phase: $PG_PHASE"
|
|
177
|
+
echo "MinMember: $PG_MINMEMBER | Running: $PG_RUNNING | Pending: $PG_PENDING"
|
|
178
|
+
|
|
179
|
+
if [[ "$PG_PHASE" == "Pending" ]]; then
|
|
180
|
+
echo "⚠️ PodGroup is Pending - check Queue capacity and resource availability"
|
|
181
|
+
fi
|
|
182
|
+
|
|
183
|
+
if [[ "$PG_PHASE" == "Inqueue" && "$PENDING_PODS" -gt 0 ]]; then
|
|
184
|
+
echo "⚠️ PodGroup Inqueue but pods Pending - Gang constraint may not be satisfied"
|
|
185
|
+
fi
|
|
186
|
+
else
|
|
187
|
+
echo "Warning: PodGroup '$PG' not found"
|
|
188
|
+
fi
|
|
189
|
+
else
|
|
190
|
+
echo "No PodGroup annotation found on job pods"
|
|
191
|
+
echo "This may indicate the job is not using Gang scheduling"
|
|
192
|
+
fi
|
|
193
|
+
echo
|
|
194
|
+
|
|
195
|
+
# 5. Events Analysis
|
|
196
|
+
echo "[5/5] Recent Events"
|
|
197
|
+
echo "-------------------"
|
|
198
|
+
kubectl get events -n "$NS" --field-selector "involvedObject.name=$JOB" --sort-by='.lastTimestamp' 2>/dev/null | tail -10 || echo "No events found for job"
|
|
199
|
+
echo
|
|
200
|
+
|
|
201
|
+
# Also check pod events if verbose
|
|
202
|
+
if [[ "$VERBOSE" == "true" && -n "$PODS" ]]; then
|
|
203
|
+
echo "Pod Events (first failed/running pod):"
|
|
204
|
+
# Find a failed or running pod to check events
|
|
205
|
+
SAMPLE_POD=$(kubectl get pods -n "$NS" -l "volcano.sh/job-name=$JOB" --field-selector status.phase=Failed -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || \
|
|
206
|
+
kubectl get pods -n "$NS" -l "volcano.sh/job-name=$JOB" --field-selector status.phase=Running -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "")
|
|
207
|
+
if [[ -n "$SAMPLE_POD" ]]; then
|
|
208
|
+
kubectl get events -n "$NS" --field-selector "involvedObject.name=$SAMPLE_POD" --sort-by='.lastTimestamp' 2>/dev/null | tail -5 || echo "No events found"
|
|
209
|
+
fi
|
|
210
|
+
echo
|
|
211
|
+
fi
|
|
212
|
+
|
|
213
|
+
# Summary
|
|
214
|
+
echo "=== Diagnosis Summary ==="
|
|
215
|
+
echo "Job: $NS/$JOB"
|
|
216
|
+
echo "Phase: $JOB_PHASE"
|
|
217
|
+
echo "Tasks: $JOB_PENDING pending, $JOB_RUNNING running, $JOB_SUCCEEDED succeeded, $JOB_FAILED failed"
|
|
218
|
+
|
|
219
|
+
if [[ -n "$PG" ]]; then
|
|
220
|
+
echo "PodGroup: $PG (Phase: ${PG_PHASE:-Unknown})"
|
|
221
|
+
fi
|
|
222
|
+
|
|
223
|
+
# Recommendations
|
|
224
|
+
echo
|
|
225
|
+
echo "Recommendations:"
|
|
226
|
+
case "$JOB_PHASE" in
|
|
227
|
+
Pending)
|
|
228
|
+
echo "1. Check PodGroup status (if associated)"
|
|
229
|
+
echo "2. Check Queue capacity with volcano-queue-diagnose"
|
|
230
|
+
echo "3. Check scheduler logs with volcano-scheduler-logs"
|
|
231
|
+
;;
|
|
232
|
+
Running)
|
|
233
|
+
if [[ "$PENDING_PODS" -gt 0 ]]; then
|
|
234
|
+
echo "1. Partial scheduling detected - use volcano-gang-scheduling for Gang analysis"
|
|
235
|
+
echo "2. Check node resources with volcano-node-resources"
|
|
236
|
+
else
|
|
237
|
+
echo "1. Job is running normally - monitor progress"
|
|
238
|
+
fi
|
|
239
|
+
;;
|
|
240
|
+
Failed)
|
|
241
|
+
echo "1. Check failed pod logs: kubectl logs <pod> -n $NS"
|
|
242
|
+
echo "2. Check pod events for failure reasons"
|
|
243
|
+
echo "3. Review job policies and restart configuration"
|
|
244
|
+
;;
|
|
245
|
+
Restarting)
|
|
246
|
+
echo "1. Check previous failure reason in events"
|
|
247
|
+
echo "2. Review container logs for crash reasons"
|
|
248
|
+
echo "3. Verify restart policy is appropriate"
|
|
249
|
+
;;
|
|
250
|
+
esac
|
|
251
|
+
|
|
252
|
+
echo
|
|
253
|
+
echo "=== Diagnosis Complete ==="
|
|
@@ -0,0 +1,334 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: volcano-node-resources
|
|
3
|
+
description: >-
|
|
4
|
+
Query cluster node resources for Volcano scheduling.
|
|
5
|
+
Check allocatable CPU, memory, GPU, and current usage.
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
# Volcano Node Resources
|
|
9
|
+
|
|
10
|
+
Query cluster node resources to understand capacity and availability for Volcano scheduling. This skill helps identify resource bottlenecks at the node level.
|
|
11
|
+
|
|
12
|
+
**Scope:** This skill is for **diagnosis only**. It retrieves resource information but does not modify any cluster state.
|
|
13
|
+
|
|
14
|
+
## Usage
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
bash skills/core/volcano-node-resources/scripts/get-node-resources.sh [options]
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
## Parameters
|
|
21
|
+
|
|
22
|
+
| Parameter | Required | Description |
|
|
23
|
+
|-----------|----------|-------------|
|
|
24
|
+
| `--node NODE` | no | Query specific node only |
|
|
25
|
+
| `--label LABEL` | no | Filter nodes by label (e.g., gpu=true) |
|
|
26
|
+
| `--show-usage` | no | Show current resource usage (requires metrics-server) |
|
|
27
|
+
| `--show-pods` | no | Show pods running on each node |
|
|
28
|
+
| `--format FORMAT` | no | Output format: table (default), json, wide |
|
|
29
|
+
|
|
30
|
+
## Examples
|
|
31
|
+
|
|
32
|
+
Get overview of all nodes:
|
|
33
|
+
```bash
|
|
34
|
+
bash skills/core/volcano-node-resources/scripts/get-node-resources.sh
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
Check specific node:
|
|
38
|
+
```bash
|
|
39
|
+
bash skills/core/volcano-node-resources/scripts/get-node-resources.sh --node worker-1
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
Check GPU nodes:
|
|
43
|
+
```bash
|
|
44
|
+
bash skills/core/volcano-node-resources/scripts/get-node-resources.sh --label nvidia.com/gpu.present=true
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
Show resource usage:
|
|
48
|
+
```bash
|
|
49
|
+
bash skills/core/volcano-node-resources/scripts/get-node-resources.sh --show-usage
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
Show with pod information:
|
|
53
|
+
```bash
|
|
54
|
+
bash skills/core/volcano-node-resources/scripts/get-node-resources.sh --show-pods
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
JSON output for parsing:
|
|
58
|
+
```bash
|
|
59
|
+
bash skills/core/volcano-node-resources/scripts/get-node-resources.sh --format json
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
## Understanding Node Resources
|
|
63
|
+
|
|
64
|
+
### Resource Types
|
|
65
|
+
|
|
66
|
+
| Resource | Description | Scheduling Impact |
|
|
67
|
+
|----------|-------------|-------------------|
|
|
68
|
+
| `cpu` | CPU cores (millicores) | Primary scheduling constraint |
|
|
69
|
+
| `memory` | RAM (bytes) | Primary scheduling constraint |
|
|
70
|
+
| `nvidia.com/gpu` | GPU devices | Hardware-specific scheduling |
|
|
71
|
+
| `pods` | Max pods per node | Density limit |
|
|
72
|
+
| `ephemeral-storage` | Disk space | Secondary constraint |
|
|
73
|
+
|
|
74
|
+
### Capacity vs Allocatable
|
|
75
|
+
|
|
76
|
+
- **Capacity**: Total physical resources on the node
|
|
77
|
+
- **Allocatable**: Resources available for pods (capacity minus system reservations)
|
|
78
|
+
|
|
79
|
+
**Reservations include:**
|
|
80
|
+
- kubelet overhead
|
|
81
|
+
- System daemons (kube-proxy, node-exporter)
|
|
82
|
+
- Kernel reserved memory
|
|
83
|
+
- Eviction threshold
|
|
84
|
+
|
|
85
|
+
### Resource Usage
|
|
86
|
+
|
|
87
|
+
When `--show-usage` is enabled and metrics-server is available:
|
|
88
|
+
|
|
89
|
+
- **Requests**: Sum of all pod resource requests on the node
|
|
90
|
+
- **Limits**: Sum of all pod resource limits
|
|
91
|
+
- **Usage**: Actual resource consumption
|
|
92
|
+
|
|
93
|
+
**Key insights:**
|
|
94
|
+
- Allocatable - Requests = Available for new pods
|
|
95
|
+
- Usage < Requests = Over-provisioning
|
|
96
|
+
- Usage > Requests = Over-committing (risky)
|
|
97
|
+
|
|
98
|
+
## Resource Calculation
|
|
99
|
+
|
|
100
|
+
### Available Resources
|
|
101
|
+
|
|
102
|
+
```
|
|
103
|
+
Available = Allocatable - Allocated (sum of all requests)
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
Nodes with zero or negative available resources cannot accept new pods.
|
|
107
|
+
|
|
108
|
+
### Gang Scheduling Calculation
|
|
109
|
+
|
|
110
|
+
For Gang scheduling, you need:
|
|
111
|
+
```
|
|
112
|
+
(Number of nodes with Available >= Pod Request) >= minMember
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
Example:
|
|
116
|
+
- minMember = 4
|
|
117
|
+
- Pod requests 4 CPUs each
|
|
118
|
+
- Need at least 4 nodes with 4+ CPUs available
|
|
119
|
+
|
|
120
|
+
## Diagnostic Use Cases
|
|
121
|
+
|
|
122
|
+
### Case 1: Identify Nodes with Available Resources
|
|
123
|
+
|
|
124
|
+
```bash
|
|
125
|
+
bash skills/core/volcano-node-resources/scripts/get-node-resources.sh
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
Look for nodes with positive Available CPU/Memory. Nodes with zero or near-zero availability cannot schedule new pods.
|
|
129
|
+
|
|
130
|
+
### Case 2: Find GPU-Equipped Nodes
|
|
131
|
+
|
|
132
|
+
```bash
|
|
133
|
+
bash skills/core/volcano-node-resources/scripts/get-node-resources.sh --label nvidia.com/gpu.present=true --show-usage
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
Check:
|
|
137
|
+
- Which nodes have GPUs
|
|
138
|
+
- How many GPUs are allocatable
|
|
139
|
+
- How many are currently allocated/used
|
|
140
|
+
- GPU utilization patterns
|
|
141
|
+
|
|
142
|
+
### Case 3: Detect Resource Fragmentation
|
|
143
|
+
|
|
144
|
+
```bash
|
|
145
|
+
bash skills/core/volcano-node-resources/scripts/get-node-resources.sh --show-usage
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
Fragmentation indicators:
|
|
149
|
+
- Many nodes with small amounts of available resources
|
|
150
|
+
- High node count but low available resources per node
|
|
151
|
+
- Allocated resources spread thinly across many nodes
|
|
152
|
+
|
|
153
|
+
### Case 4: Node Affinity Troubleshooting
|
|
154
|
+
|
|
155
|
+
If pods require specific labels:
|
|
156
|
+
|
|
157
|
+
```bash
|
|
158
|
+
# Check nodes with required labels
|
|
159
|
+
bash skills/core/volcano-node-resources/scripts/get-node-resources.sh --label <required-label>
|
|
160
|
+
|
|
161
|
+
# Verify sufficient resources
|
|
162
|
+
kubectl describe node <node-name> | grep -A 10 "Allocated resources"
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
### Case 5: Capacity Planning
|
|
166
|
+
|
|
167
|
+
Capture the following snapshots periodically and compare them to monitor trends over time:
|
|
168
|
+
|
|
169
|
+
```bash
|
|
170
|
+
# Current capacity
|
|
171
|
+
bash skills/core/volcano-node-resources/scripts/get-node-resources.sh --format json
|
|
172
|
+
|
|
173
|
+
# Check usage trends (if metrics available)
|
|
174
|
+
for node in $(kubectl get nodes -o jsonpath='{.items[*].metadata.name}'); do
|
|
175
|
+
echo "=== $node ==="
|
|
176
|
+
kubectl top node $node 2>/dev/null || echo "Metrics not available"
|
|
177
|
+
done
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
## Node Status Indicators
|
|
181
|
+
|
|
182
|
+
### Ready Status
|
|
183
|
+
|
|
184
|
+
| Status | Meaning | Action |
|
|
185
|
+
|--------|---------|--------|
|
|
186
|
+
| `Ready` | Node healthy and schedulable | Normal |
|
|
187
|
+
| `NotReady` | Node unhealthy | Check node conditions |
|
|
188
|
+
| `SchedulingDisabled` | Node cordoned | May need uncordon |
|
|
189
|
+
|
|
190
|
+
Check node conditions:
|
|
191
|
+
```bash
|
|
192
|
+
kubectl get nodes -o json | jq '.items[].status.conditions'
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
### Node Taints
|
|
196
|
+
|
|
197
|
+
Taints prevent pods without a matching toleration from being scheduled on a node:
|
|
198
|
+
|
|
199
|
+
```bash
|
|
200
|
+
kubectl get nodes -o custom-columns='NAME:.metadata.name,TAINTS:.spec.taints[*].key'
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
Common taints:
|
|
204
|
+
- `node.kubernetes.io/not-ready`
|
|
205
|
+
- `node.kubernetes.io/unreachable`
|
|
206
|
+
- `node.kubernetes.io/disk-pressure`
|
|
207
|
+
- `node.kubernetes.io/memory-pressure`
|
|
208
|
+
- `node.kubernetes.io/pid-pressure`
|
|
209
|
+
|
|
210
|
+
## Common Issues
|
|
211
|
+
|
|
212
|
+
### Issue 1: Node at Capacity
|
|
213
|
+
|
|
214
|
+
**Symptom:** Available resources near zero, new pods stuck pending
|
|
215
|
+
|
|
216
|
+
**Check:**
|
|
217
|
+
```bash
|
|
218
|
+
kubectl describe node <node-name> | grep -A 5 "Allocated resources"
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
**Solution:**
|
|
222
|
+
- Scale cluster (add nodes)
|
|
223
|
+
- Drain and consolidate workloads
|
|
224
|
+
- Review resource requests (may be over-provisioned)
|
|
225
|
+
|
|
226
|
+
### Issue 2: GPU Not Allocatable
|
|
227
|
+
|
|
228
|
+
**Symptom:** Node has GPUs, but they are not showing as allocatable
|
|
229
|
+
|
|
230
|
+
**Check:**
|
|
231
|
+
```bash
|
|
232
|
+
kubectl get node <node-name> -o jsonpath='{.status.allocatable.nvidia\.com/gpu}'
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
**Solution:**
|
|
236
|
+
- Verify GPU device plugin is running
|
|
237
|
+
- Check nvidia-driver installation
|
|
238
|
+
- Review node labels
|
|
239
|
+
|
|
240
|
+
### Issue 3: Memory Pressure
|
|
241
|
+
|
|
242
|
+
**Symptom:** Node has `node.kubernetes.io/memory-pressure` taint
|
|
243
|
+
|
|
244
|
+
**Check:**
|
|
245
|
+
```bash
|
|
246
|
+
kubectl describe node <node-name> | grep -A 3 "MemoryPressure"
|
|
247
|
+
```
|
|
248
|
+
|
|
249
|
+
**Solution:**
|
|
250
|
+
- Evict or reschedule memory-intensive pods
|
|
251
|
+
- Increase node memory
|
|
252
|
+
- Adjust pod memory limits
|
|
253
|
+
|
|
254
|
+
### Issue 4: Disk Pressure
|
|
255
|
+
|
|
256
|
+
**Symptom:** Node has `node.kubernetes.io/disk-pressure` taint
|
|
257
|
+
|
|
258
|
+
**Check:**
|
|
259
|
+
```bash
|
|
260
|
+
kubectl describe node <node-name> | grep -A 3 "DiskPressure"
|
|
261
|
+
```
|
|
262
|
+
|
|
263
|
+
**Solution:**
|
|
264
|
+
- Clean up unused images/containers
|
|
265
|
+
- Increase node disk capacity
|
|
266
|
+
- Review log rotation policies
|
|
267
|
+
|
|
268
|
+
## Output Formats
|
|
269
|
+
|
|
270
|
+
### Table Format (default)
|
|
271
|
+
|
|
272
|
+
Human-readable table:
|
|
273
|
+
```
|
|
274
|
+
NAME CPU_ALLOC MEM_ALLOC GPU_ALLOC CPU_AVAIL MEM_AVAIL
|
|
275
|
+
node-1 32 64Gi 4 8 16Gi
|
|
276
|
+
node-2 16 32Gi 0 2 4Gi
|
|
277
|
+
```
|
|
278
|
+
|
|
279
|
+
### Wide Format
|
|
280
|
+
|
|
281
|
+
Additional columns:
|
|
282
|
+
```
|
|
283
|
+
NAME CPU MEM GPU CPU_AVAIL MEM_AVAIL STATUS AGE
|
|
284
|
+
```
|
|
285
|
+
|
|
286
|
+
### JSON Format
|
|
287
|
+
|
|
288
|
+
Machine-parseable output:
|
|
289
|
+
```json
|
|
290
|
+
{
|
|
291
|
+
"nodes": [
|
|
292
|
+
{
|
|
293
|
+
"name": "node-1",
|
|
294
|
+
"allocatable": {
|
|
295
|
+
"cpu": "32",
|
|
296
|
+
"memory": "64Gi",
|
|
297
|
+
"nvidia.com/gpu": "4"
|
|
298
|
+
},
|
|
299
|
+
"available": {
|
|
300
|
+
"cpu": "8",
|
|
301
|
+
"memory": "16Gi"
|
|
302
|
+
}
|
|
303
|
+
}
|
|
304
|
+
]
|
|
305
|
+
}
|
|
306
|
+
```
|
|
307
|
+
|
|
308
|
+
## Environment Variables
|
|
309
|
+
|
|
310
|
+
| Variable | Default | Description |
|
|
311
|
+
|----------|---------|-------------|
|
|
312
|
+
| `NODE_LABEL` | "" | Default label selector for nodes |
|
|
313
|
+
|
|
314
|
+
## Integration with Other Skills
|
|
315
|
+
|
|
316
|
+
Combine with other skills for comprehensive analysis:
|
|
317
|
+
|
|
318
|
+
```bash
|
|
319
|
+
# 1. Check node resources
|
|
320
|
+
bash skills/core/volcano-node-resources/scripts/get-node-resources.sh
|
|
321
|
+
|
|
322
|
+
# 2. Check queue resources
|
|
323
|
+
bash skills/core/volcano-queue-diagnose/scripts/diagnose-queue.sh
|
|
324
|
+
|
|
325
|
+
# 3. If resources are insufficient, refer to the volcano-resource-insufficient skill guide
|
|
326
|
+
# (volcano-resource-insufficient is a documentation-only skill - follow the diagnostic steps in its SKILL.md)
|
|
327
|
+
```
|
|
328
|
+
|
|
329
|
+
## See Also
|
|
330
|
+
|
|
331
|
+
- `volcano-resource-insufficient` - Resource shortage diagnosis
|
|
332
|
+
- `volcano-diagnose-pod` - Pod-specific scheduling issues
|
|
333
|
+
- `volcano-gang-scheduling` - Gang constraint analysis
|
|
334
|
+
- `volcano-queue-diagnose` - Queue resource distribution
|