siclaw 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +75 -114
- package/dist/agentbox/gateway-client.d.ts +2 -1
- package/dist/agentbox/gateway-client.js +6 -2
- package/dist/agentbox/gateway-client.js.map +1 -1
- package/dist/agentbox/http-server.js +184 -19
- package/dist/agentbox/http-server.js.map +1 -1
- package/dist/agentbox/resource-handlers.d.ts +1 -0
- package/dist/agentbox/resource-handlers.js +23 -23
- package/dist/agentbox/resource-handlers.js.map +1 -1
- package/dist/agentbox/session.js +85 -5
- package/dist/agentbox/session.js.map +1 -1
- package/dist/agentbox-main.d.ts +2 -1
- package/dist/agentbox-main.js +65 -18
- package/dist/agentbox-main.js.map +1 -1
- package/dist/cli-credentials.d.ts +1 -0
- package/dist/cli-credentials.js +109 -0
- package/dist/cli-credentials.js.map +1 -0
- package/dist/cli-first-run.d.ts +11 -0
- package/dist/cli-first-run.js +99 -0
- package/dist/cli-first-run.js.map +1 -0
- package/dist/cli-main.js +33 -11
- package/dist/cli-main.js.map +1 -1
- package/dist/cli-setup.d.ts +5 -11
- package/dist/cli-setup.js +12 -225
- package/dist/cli-setup.js.map +1 -1
- package/dist/core/agent-factory.d.ts +4 -0
- package/dist/core/agent-factory.js +102 -151
- package/dist/core/agent-factory.js.map +1 -1
- package/dist/core/config.d.ts +10 -3
- package/dist/core/config.js +11 -95
- package/dist/core/config.js.map +1 -1
- package/dist/core/extensions/deep-investigation.d.ts +2 -1
- package/dist/core/extensions/deep-investigation.js +144 -24
- package/dist/core/extensions/deep-investigation.js.map +1 -1
- package/dist/core/extensions/setup.d.ts +8 -0
- package/dist/core/extensions/setup.js +669 -0
- package/dist/core/extensions/setup.js.map +1 -0
- package/dist/core/llm-proxy.js +7 -3
- package/dist/core/llm-proxy.js.map +1 -1
- package/dist/core/mcp-client.d.ts +0 -10
- package/dist/core/mcp-client.js +0 -65
- package/dist/core/mcp-client.js.map +1 -1
- package/dist/core/prompt.d.ts +1 -1
- package/dist/core/prompt.js +42 -5
- package/dist/core/prompt.js.map +1 -1
- package/dist/core/provider-presets.d.ts +14 -0
- package/dist/core/provider-presets.js +81 -0
- package/dist/core/provider-presets.js.map +1 -0
- package/dist/cron/cron-coordinator.d.ts +2 -0
- package/dist/cron/cron-coordinator.js +46 -14
- package/dist/cron/cron-coordinator.js.map +1 -1
- package/dist/cron/cron-executor.js +33 -8
- package/dist/cron/cron-executor.js.map +1 -1
- package/dist/cron/cron-scheduler.d.ts +1 -1
- package/dist/cron/gateway-client.d.ts +5 -0
- package/dist/cron/gateway-client.js +43 -8
- package/dist/cron/gateway-client.js.map +1 -1
- package/dist/cron-main.js +39 -9
- package/dist/cron-main.js.map +1 -1
- package/dist/gateway/agentbox/client.d.ts +11 -0
- package/dist/gateway/agentbox/client.js +18 -0
- package/dist/gateway/agentbox/client.js.map +1 -1
- package/dist/gateway/agentbox/k8s-spawner.d.ts +11 -2
- package/dist/gateway/agentbox/k8s-spawner.js +95 -52
- package/dist/gateway/agentbox/k8s-spawner.js.map +1 -1
- package/dist/gateway/agentbox/local-spawner.d.ts +1 -1
- package/dist/gateway/agentbox/local-spawner.js +4 -2
- package/dist/gateway/agentbox/local-spawner.js.map +1 -1
- package/dist/gateway/agentbox/manager.d.ts +0 -10
- package/dist/gateway/agentbox/manager.js +11 -30
- package/dist/gateway/agentbox/manager.js.map +1 -1
- package/dist/gateway/agentbox/types.d.ts +6 -4
- package/dist/gateway/cron/cron-service.d.ts +49 -0
- package/dist/gateway/cron/cron-service.js +259 -0
- package/dist/gateway/cron/cron-service.js.map +1 -0
- package/dist/gateway/db/init-schema.js +44 -0
- package/dist/gateway/db/init-schema.js.map +1 -1
- package/dist/gateway/db/migrate-sqlite.js +73 -4
- package/dist/gateway/db/migrate-sqlite.js.map +1 -1
- package/dist/gateway/db/repositories/chat-repo.d.ts +56 -2
- package/dist/gateway/db/repositories/chat-repo.js +132 -2
- package/dist/gateway/db/repositories/chat-repo.js.map +1 -1
- package/dist/gateway/db/repositories/config-repo.d.ts +31 -2
- package/dist/gateway/db/repositories/config-repo.js +57 -7
- package/dist/gateway/db/repositories/config-repo.js.map +1 -1
- package/dist/gateway/db/repositories/env-repo.d.ts +14 -0
- package/dist/gateway/db/repositories/env-repo.js +15 -2
- package/dist/gateway/db/repositories/env-repo.js.map +1 -1
- package/dist/gateway/db/repositories/model-config-repo.d.ts +1 -1
- package/dist/gateway/db/repositories/model-config-repo.js +26 -12
- package/dist/gateway/db/repositories/model-config-repo.js.map +1 -1
- package/dist/gateway/db/repositories/skill-repo.d.ts +0 -5
- package/dist/gateway/db/repositories/skill-review-repo.d.ts +1 -0
- package/dist/gateway/db/repositories/skill-review-repo.js +4 -1
- package/dist/gateway/db/repositories/skill-review-repo.js.map +1 -1
- package/dist/gateway/db/repositories/skill-version-repo.js +0 -1
- package/dist/gateway/db/repositories/skill-version-repo.js.map +1 -1
- package/dist/gateway/db/repositories/system-config-repo.d.ts +1 -1
- package/dist/gateway/db/repositories/system-config-repo.js +2 -1
- package/dist/gateway/db/repositories/system-config-repo.js.map +1 -1
- package/dist/gateway/db/repositories/user-env-config-repo.d.ts +13 -0
- package/dist/gateway/db/repositories/user-env-config-repo.js +11 -0
- package/dist/gateway/db/repositories/user-env-config-repo.js.map +1 -1
- package/dist/gateway/db/repositories/workspace-repo.d.ts +3 -2
- package/dist/gateway/db/repositories/workspace-repo.js +6 -2
- package/dist/gateway/db/repositories/workspace-repo.js.map +1 -1
- package/dist/gateway/db/schema-mysql.d.ts +473 -51
- package/dist/gateway/db/schema-mysql.js +35 -4
- package/dist/gateway/db/schema-mysql.js.map +1 -1
- package/dist/gateway/db/schema-sqlite.d.ts +522 -57
- package/dist/gateway/db/schema-sqlite.js +38 -6
- package/dist/gateway/db/schema-sqlite.js.map +1 -1
- package/dist/gateway/db/schema.d.ts +471 -51
- package/dist/gateway/db/schema.js +1 -1
- package/dist/gateway/db/schema.js.map +1 -1
- package/dist/gateway/metrics-aggregator.d.ts +65 -0
- package/dist/gateway/metrics-aggregator.js +244 -0
- package/dist/gateway/metrics-aggregator.js.map +1 -0
- package/dist/gateway/plugins/channel-bridge.d.ts +4 -1
- package/dist/gateway/plugins/channel-bridge.js +78 -86
- package/dist/gateway/plugins/channel-bridge.js.map +1 -1
- package/dist/gateway/rpc-methods.d.ts +4 -2
- package/dist/gateway/rpc-methods.js +962 -163
- package/dist/gateway/rpc-methods.js.map +1 -1
- package/dist/gateway/security/cert-manager.d.ts +2 -2
- package/dist/gateway/security/cert-manager.js +4 -2
- package/dist/gateway/security/cert-manager.js.map +1 -1
- package/dist/gateway/server.d.ts +4 -8
- package/dist/gateway/server.js +297 -261
- package/dist/gateway/server.js.map +1 -1
- package/dist/gateway/skills/file-writer.js +17 -11
- package/dist/gateway/skills/file-writer.js.map +1 -1
- package/dist/gateway/skills/script-evaluator.js +12 -9
- package/dist/gateway/skills/script-evaluator.js.map +1 -1
- package/dist/gateway/web/dist/assets/index-0p17ZeTP.js +740 -0
- package/dist/gateway/web/dist/assets/index-9eP6nPUq.js +741 -0
- package/dist/gateway/web/dist/assets/index-9eP6nPUq.js.map +1 -0
- package/dist/gateway/web/dist/assets/index-CAmSY91d.js +675 -0
- package/dist/gateway/web/dist/assets/index-DMFEh8Pp.css +1 -0
- package/dist/gateway/web/dist/assets/index-DyowBCEj.css +1 -0
- package/dist/gateway/web/dist/assets/index-PDK5JJDO.css +1 -0
- package/dist/gateway/web/dist/index.html +2 -2
- package/dist/gateway-main.js +27 -10
- package/dist/gateway-main.js.map +1 -1
- package/dist/memory/embeddings.js +5 -4
- package/dist/memory/embeddings.js.map +1 -1
- package/dist/memory/indexer.d.ts +23 -3
- package/dist/memory/indexer.js +235 -23
- package/dist/memory/indexer.js.map +1 -1
- package/dist/memory/schema.js +15 -1
- package/dist/memory/schema.js.map +1 -1
- package/dist/memory/types.d.ts +18 -0
- package/dist/memory/types.js +6 -1
- package/dist/memory/types.js.map +1 -1
- package/dist/shared/detect-language.d.ts +12 -0
- package/dist/shared/detect-language.js +78 -0
- package/dist/shared/detect-language.js.map +1 -0
- package/dist/shared/diagnostic-events.d.ts +70 -0
- package/dist/shared/diagnostic-events.js +38 -0
- package/dist/shared/diagnostic-events.js.map +1 -0
- package/dist/shared/local-collector.d.ts +56 -0
- package/dist/shared/local-collector.js +284 -0
- package/dist/shared/local-collector.js.map +1 -0
- package/dist/shared/metrics-types.d.ts +64 -0
- package/dist/shared/metrics-types.js +25 -0
- package/dist/shared/metrics-types.js.map +1 -0
- package/dist/shared/metrics.d.ts +19 -0
- package/dist/shared/metrics.js +185 -0
- package/dist/shared/metrics.js.map +1 -0
- package/dist/shared/path-utils.d.ts +15 -0
- package/dist/shared/path-utils.js +23 -0
- package/dist/shared/path-utils.js.map +1 -0
- package/dist/shared/retry.d.ts +35 -0
- package/dist/shared/retry.js +61 -0
- package/dist/shared/retry.js.map +1 -0
- package/dist/tools/command-sets.d.ts +18 -2
- package/dist/tools/command-sets.js +207 -32
- package/dist/tools/command-sets.js.map +1 -1
- package/dist/tools/command-validator.d.ts +56 -0
- package/dist/tools/command-validator.js +357 -0
- package/dist/tools/command-validator.js.map +1 -0
- package/dist/tools/create-skill.js +26 -1
- package/dist/tools/create-skill.js.map +1 -1
- package/dist/tools/credential-list.js +1 -23
- package/dist/tools/credential-list.js.map +1 -1
- package/dist/tools/credential-manager.d.ts +98 -0
- package/dist/tools/credential-manager.js +313 -0
- package/dist/tools/credential-manager.js.map +1 -0
- package/dist/tools/deep-search/engine.js +184 -127
- package/dist/tools/deep-search/engine.js.map +1 -1
- package/dist/tools/deep-search/prompts.d.ts +10 -2
- package/dist/tools/deep-search/prompts.js +37 -36
- package/dist/tools/deep-search/prompts.js.map +1 -1
- package/dist/tools/deep-search/schemas.d.ts +87 -0
- package/dist/tools/deep-search/schemas.js +85 -0
- package/dist/tools/deep-search/schemas.js.map +1 -0
- package/dist/tools/deep-search/sub-agent.d.ts +21 -0
- package/dist/tools/deep-search/sub-agent.js +153 -4
- package/dist/tools/deep-search/sub-agent.js.map +1 -1
- package/dist/tools/deep-search/tool.js +1 -0
- package/dist/tools/deep-search/tool.js.map +1 -1
- package/dist/tools/deep-search/types.d.ts +2 -0
- package/dist/tools/deep-search/types.js.map +1 -1
- package/dist/tools/dp-tools.js +29 -5
- package/dist/tools/dp-tools.js.map +1 -1
- package/dist/tools/exec-utils.d.ts +85 -0
- package/dist/tools/exec-utils.js +294 -0
- package/dist/tools/exec-utils.js.map +1 -0
- package/dist/tools/fork-skill.js +14 -2
- package/dist/tools/fork-skill.js.map +1 -1
- package/dist/tools/investigation-feedback.d.ts +3 -0
- package/dist/tools/investigation-feedback.js +71 -0
- package/dist/tools/investigation-feedback.js.map +1 -0
- package/dist/tools/manage-schedule.js +16 -6
- package/dist/tools/manage-schedule.js.map +1 -1
- package/dist/tools/netns-script.js +27 -281
- package/dist/tools/netns-script.js.map +1 -1
- package/dist/tools/node-exec.d.ts +2 -14
- package/dist/tools/node-exec.js +18 -225
- package/dist/tools/node-exec.js.map +1 -1
- package/dist/tools/node-script.js +14 -168
- package/dist/tools/node-script.js.map +1 -1
- package/dist/tools/pod-exec.d.ts +1 -1
- package/dist/tools/pod-exec.js +10 -26
- package/dist/tools/pod-exec.js.map +1 -1
- package/dist/tools/pod-nsenter-exec.js +21 -225
- package/dist/tools/pod-nsenter-exec.js.map +1 -1
- package/dist/tools/pod-script.js +10 -19
- package/dist/tools/pod-script.js.map +1 -1
- package/dist/tools/restricted-bash.d.ts +1 -17
- package/dist/tools/restricted-bash.js +38 -252
- package/dist/tools/restricted-bash.js.map +1 -1
- package/dist/tools/run-skill.d.ts +3 -1
- package/dist/tools/run-skill.js +21 -1
- package/dist/tools/run-skill.js.map +1 -1
- package/dist/tools/script-resolver.d.ts +3 -1
- package/dist/tools/script-resolver.js +74 -30
- package/dist/tools/script-resolver.js.map +1 -1
- package/dist/tools/update-skill.js +17 -6
- package/dist/tools/update-skill.js.map +1 -1
- package/package.json +8 -6
- package/siclaw.mjs +10 -1
- package/skills/core/cluster-events/SKILL.md +1 -1
- package/skills/core/deep-investigation/SKILL.md +11 -0
- package/skills/core/deployment-rollout-debug/SKILL.md +1 -1
- package/skills/core/dns-debug/SKILL.md +1 -0
- package/skills/core/meta.json +12 -1
- package/skills/core/networkpolicy-debug/SKILL.md +332 -0
- package/skills/core/node-logs/scripts/get-node-logs.sh +19 -9
- package/skills/core/pod-pending-debug/SKILL.md +1 -0
- package/skills/core/quota-debug/SKILL.md +203 -0
- package/skills/core/service-debug/SKILL.md +1 -0
- package/skills/core/statefulset-debug/SKILL.md +280 -0
- package/skills/core/volcano-diagnose-pod/SKILL.md +196 -0
- package/skills/core/volcano-diagnose-pod/scripts/diagnose-pod.sh +175 -0
- package/skills/core/volcano-gang-scheduling/SKILL.md +299 -0
- package/skills/core/volcano-job-diagnose/SKILL.md +319 -0
- package/skills/core/volcano-job-diagnose/scripts/diagnose-job.sh +253 -0
- package/skills/core/volcano-node-resources/SKILL.md +334 -0
- package/skills/core/volcano-node-resources/scripts/get-node-resources.sh +281 -0
- package/skills/core/volcano-queue-diagnose/SKILL.md +294 -0
- package/skills/core/volcano-queue-diagnose/scripts/diagnose-queue.sh +283 -0
- package/skills/core/volcano-resource-insufficient/SKILL.md +315 -0
- package/skills/core/volcano-scheduler-config/SKILL.md +371 -0
- package/skills/core/volcano-scheduler-config/scripts/get-scheduler-config.sh +297 -0
- package/skills/core/volcano-scheduler-logs/SKILL.md +241 -0
- package/skills/core/volcano-scheduler-logs/scripts/get-scheduler-logs.sh +159 -0
- package/skills/platform/create-skill/SKILL.md +35 -3
- package/skills/platform/manage-skill/SKILL.md +9 -2
- package/skills/platform/update-skill/SKILL.md +17 -6
|
@@ -0,0 +1,241 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: volcano-scheduler-logs
|
|
3
|
+
description: >-
|
|
4
|
+
Retrieve and analyze Volcano scheduler logs.
|
|
5
|
+
Filter by keyword, time range, or pod name to debug scheduling decisions.
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
# Volcano Scheduler Logs
|
|
9
|
+
|
|
10
|
+
Retrieve and analyze Volcano scheduler logs to understand scheduling decisions, failures, and performance issues.
|
|
11
|
+
|
|
12
|
+
**Scope:** This skill is for **diagnosis only**. It retrieves logs for analysis but does not modify any cluster state.
|
|
13
|
+
|
|
14
|
+
## Usage
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
bash skills/core/volcano-scheduler-logs/scripts/get-scheduler-logs.sh [options]
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
## Parameters
|
|
21
|
+
|
|
22
|
+
| Parameter | Required | Description |
|
|
23
|
+
|-----------|----------|-------------|
|
|
24
|
+
| `--keyword KEYWORD` | no | Filter logs by keyword (case-insensitive; interpreted as an extended regular expression by `grep -E`) |
|
|
25
|
+
| `--pod POD` | no | Filter logs related to a specific pod name (when combined with `--keyword`, lines matching either one are shown) |
|
|
26
|
+
| `--since TIME` | no | Show logs newer than relative time (e.g., 10m, 1h) |
|
|
27
|
+
| `--lines N` | no | Number of lines to show (default: 100; ignored with `--follow`) |
|
|
28
|
+
| `--follow` | no | Stream logs in real-time (Ctrl+C to stop); cannot be combined with `--since` |
|
|
29
|
+
| `--previous` | no | Show logs from previous container instance (after restart) |
|
|
30
|
+
|
|
31
|
+
## Examples
|
|
32
|
+
|
|
33
|
+
Get recent scheduler logs:
|
|
34
|
+
```bash
|
|
35
|
+
bash skills/core/volcano-scheduler-logs/scripts/get-scheduler-logs.sh
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
Search for error messages:
|
|
39
|
+
```bash
|
|
40
|
+
bash skills/core/volcano-scheduler-logs/scripts/get-scheduler-logs.sh --keyword error
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
Get logs for a specific pod:
|
|
44
|
+
```bash
|
|
45
|
+
bash skills/core/volcano-scheduler-logs/scripts/get-scheduler-logs.sh --pod my-job-0
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
Get last 500 lines from the past hour:
|
|
49
|
+
```bash
|
|
50
|
+
bash skills/core/volcano-scheduler-logs/scripts/get-scheduler-logs.sh --since 1h --lines 500
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
Stream logs for gang scheduling issues:
|
|
54
|
+
```bash
|
|
55
|
+
bash skills/core/volcano-scheduler-logs/scripts/get-scheduler-logs.sh --keyword gang --follow
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
Check logs from previous scheduler instance (after crash/restart):
|
|
59
|
+
```bash
|
|
60
|
+
bash skills/core/volcano-scheduler-logs/scripts/get-scheduler-logs.sh --previous --lines 200
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## Common Keywords for Filtering
|
|
64
|
+
|
|
65
|
+
| Keyword | Use Case |
|
|
66
|
+
|---------|----------|
|
|
67
|
+
| `error` | Find error messages and failures |
|
|
68
|
+
| `FailedScheduling` | Scheduling failures |
|
|
69
|
+
| `allocate` | Resource allocation attempts |
|
|
70
|
+
| `gang` | Gang scheduling decisions |
|
|
71
|
+
| `minMember` | MinMember constraint issues |
|
|
72
|
+
| `preempt` | Preemption events |
|
|
73
|
+
| `reclaim` | Resource reclamation |
|
|
74
|
+
| `enqueue` | Queue admission decisions |
|
|
75
|
+
| `bind` | Pod binding attempts |
|
|
76
|
+
| `queue` | Queue-related decisions |
|
|
77
|
+
| `proportion` | Proportion plugin decisions |
|
|
78
|
+
| `priority` | Priority-related decisions |
|
|
79
|
+
|
|
80
|
+
## Understanding Scheduler Logs
|
|
81
|
+
|
|
82
|
+
### Log Format
|
|
83
|
+
|
|
84
|
+
Volcano scheduler logs typically follow this format:
|
|
85
|
+
```
|
|
86
|
+
I0102 15:04:05.123456 1 scheduler.go:123] Starting scheduling session
|
|
87
|
+
I0102 15:04:05.234567 1 allocate.go:456] Try to allocate resources for Job <namespace>/<job-name>
|
|
88
|
+
E0102 15:04:05.345678 1 gang.go:789] Failed to schedule pod <pod-name>: minMember not satisfied
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
**Log levels:**
|
|
92
|
+
- `I` - Info: Normal operation information
|
|
93
|
+
- `W` - Warning: Unusual but non-fatal conditions
|
|
94
|
+
- `E` - Error: Failures and errors
|
|
95
|
+
- `F` - Fatal: Critical errors causing shutdown
|
|
96
|
+
|
|
97
|
+
### Common Log Patterns
|
|
98
|
+
|
|
99
|
+
#### Session Start
|
|
100
|
+
```
|
|
101
|
+
Starting scheduling session
|
|
102
|
+
Starting scheduling loop
|
|
103
|
+
```
|
|
104
|
+
- Indicates scheduler is processing a new batch of pending pods
|
|
105
|
+
|
|
106
|
+
#### Enqueue Decisions
|
|
107
|
+
```
|
|
108
|
+
Try to enqueue pod group
|
|
109
|
+
PodGroup <name> is enqueued
|
|
110
|
+
PodGroup <name> is pending
|
|
111
|
+
```
|
|
112
|
+
- Shows whether pod groups are admitted to the queue
|
|
113
|
+
|
|
114
|
+
#### Allocation Attempts
|
|
115
|
+
```
|
|
116
|
+
Try to allocate resources for Job
|
|
117
|
+
Try to allocate for task
|
|
118
|
+
```
|
|
119
|
+
- Shows scheduling attempts for specific jobs/pods
|
|
120
|
+
|
|
121
|
+
#### Gang Scheduling
|
|
122
|
+
```
|
|
123
|
+
minMember not satisfied
|
|
124
|
+
gang member not ready
|
|
125
|
+
Waiting for gang members
|
|
126
|
+
```
|
|
127
|
+
- Indicates Gang constraint preventing scheduling
|
|
128
|
+
|
|
129
|
+
#### Resource Shortage
|
|
130
|
+
```
|
|
131
|
+
Insufficient cpu
|
|
132
|
+
Insufficient memory
|
|
133
|
+
0 nodes are available
|
|
134
|
+
```
|
|
135
|
+
- Indicates resource constraint preventing scheduling
|
|
136
|
+
|
|
137
|
+
#### Preemption
|
|
138
|
+
```
|
|
139
|
+
Preempting pods
|
|
140
|
+
Found victim pods
|
|
141
|
+
```
|
|
142
|
+
- Shows preemption decisions for high-priority workloads
|
|
143
|
+
|
|
144
|
+
#### Reclaim
|
|
145
|
+
```
|
|
146
|
+
Try to reclaim resources
|
|
147
|
+
Reclaiming resources from queue
|
|
148
|
+
```
|
|
149
|
+
- Shows resource reclamation between queues
|
|
150
|
+
|
|
151
|
+
## Diagnostic Use Cases
|
|
152
|
+
|
|
153
|
+
### Case 1: Pod Stuck in Pending
|
|
154
|
+
|
|
155
|
+
Find relevant scheduler decisions:
|
|
156
|
+
```bash
|
|
157
|
+
bash skills/core/volcano-scheduler-logs/scripts/get-scheduler-logs.sh --pod <pod-name> --since 30m
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
Look for:
|
|
161
|
+
- `FailedScheduling` events
|
|
162
|
+
- `minMember not satisfied`
|
|
163
|
+
- `Insufficient` resource messages
|
|
164
|
+
- `enqueue` decisions (is the PodGroup being admitted?)
|
|
165
|
+
|
|
166
|
+
### Case 2: Gang Scheduling Issues
|
|
167
|
+
|
|
168
|
+
Check Gang plugin behavior:
|
|
169
|
+
```bash
|
|
170
|
+
bash skills/core/volcano-scheduler-logs/scripts/get-scheduler-logs.sh --keyword gang --since 1h
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
Look for:
|
|
174
|
+
- `minMember` related messages
|
|
175
|
+
- Gang constraint validation
|
|
176
|
+
- Comparison of running vs required members
|
|
177
|
+
|
|
178
|
+
### Case 3: Queue Resource Issues
|
|
179
|
+
|
|
180
|
+
Check proportion and reclaim decisions:
|
|
181
|
+
```bash
|
|
182
|
+
bash skills/core/volcano-scheduler-logs/scripts/get-scheduler-logs.sh --keyword "reclaim|proportion" --since 30m
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
Look for:
|
|
186
|
+
- Queue resource calculations
|
|
187
|
+
- Reclaim triggers
|
|
188
|
+
- Over-commit handling
|
|
189
|
+
|
|
190
|
+
### Case 4: Scheduler Performance
|
|
191
|
+
|
|
192
|
+
Check for scheduling delays:
|
|
193
|
+
```bash
|
|
194
|
+
bash skills/core/volcano-scheduler-logs/scripts/get-scheduler-logs.sh --lines 500 | grep -E "(Starting|Finished) scheduling"
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
Look for:
|
|
198
|
+
- Long gaps between "Starting" and "Finished"
|
|
199
|
+
- High frequency of scheduling loops
|
|
200
|
+
- Errors causing retries
|
|
201
|
+
|
|
202
|
+
### Case 5: Preemption Analysis
|
|
203
|
+
|
|
204
|
+
Check preemption decisions:
|
|
205
|
+
```bash
|
|
206
|
+
bash skills/core/volcano-scheduler-logs/scripts/get-scheduler-logs.sh --keyword preempt --since 1h
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
Look for:
|
|
210
|
+
- Which pods are being preempted
|
|
211
|
+
- Priority comparisons
|
|
212
|
+
- Preemption success/failure
|
|
213
|
+
|
|
214
|
+
## Environment Variables
|
|
215
|
+
|
|
216
|
+
| Variable | Default | Description |
|
|
217
|
+
|----------|---------|-------------|
|
|
218
|
+
| `VOLCANO_SCHEDULER_NS` | `volcano-system` | Scheduler namespace |
|
|
219
|
+
| `VOLCANO_SCHEDULER_LABEL` | `app=volcano-scheduler` | Label selector for scheduler pods |
|
|
220
|
+
|
|
221
|
+
## Limitations
|
|
222
|
+
|
|
223
|
+
1. **Log retention:** Logs may be rotated based on cluster configuration
|
|
224
|
+
2. **Multi-scheduler:** If multiple scheduler pods match the label selector, only the first matching pod's logs are retrieved
|
|
225
|
+
3. **Log level:** Default log level may not show all debug information
|
|
226
|
+
4. **Previous logs:** `--previous` only works if the container has restarted
|
|
227
|
+
|
|
228
|
+
## Tips for Effective Log Analysis
|
|
229
|
+
|
|
230
|
+
1. **Use time ranges:** Narrow down with `--since` to focus on recent issues
|
|
231
|
+
2. **Combine keywords:** Search for `error|failed` to catch all failures (the filter is a case-insensitive extended regex, so use `|` for alternation, not `\|`)
|
|
232
|
+
3. **Check pod context:** Always include `--pod` when investigating specific pods
|
|
233
|
+
4. **Look for patterns:** Repeating errors may indicate systemic issues
|
|
234
|
+
5. **Correlate with events:** Compare with `kubectl get events` timestamps
|
|
235
|
+
|
|
236
|
+
## See Also
|
|
237
|
+
|
|
238
|
+
- `volcano-diagnose-pod` - Diagnose individual pod issues
|
|
239
|
+
- `volcano-gang-scheduling` - Gang scheduling specific diagnosis
|
|
240
|
+
- `volcano-queue-diagnose` - Queue resource analysis
|
|
241
|
+
- `volcano-resource-insufficient` - Resource shortage diagnosis
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
#!/bin/bash
|
|
2
|
+
# Retrieve and analyze Volcano scheduler logs.
|
|
3
|
+
# This script performs read-only operations using kubectl.
|
|
4
|
+
set -euo pipefail
|
|
5
|
+
|
|
6
|
+
# Print the usage text to stdout and terminate the script with status 0.
# Takes no arguments; $0 is expanded inside the here-document so the usage
# and example lines show the name the script was invoked with.
show_help() {
cat <<EOF
Usage: $0 [options]

Retrieve and analyze Volcano scheduler logs.
Filter by keyword, time range, or pod name to debug scheduling decisions.

Options:
--keyword KEYWORD Filter logs by keyword (case-insensitive)
--pod POD Filter logs related to specific pod name
--since TIME Show logs newer than relative time (e.g., 10m, 1h, 24h)
--lines N Number of lines to show (default: 100)
--follow Stream logs in real-time (Ctrl+C to stop)
--previous Show logs from previous container instance
-h, --help Show this help message

Environment:
VOLCANO_SCHEDULER_NS Scheduler namespace (default: volcano-system)
VOLCANO_SCHEDULER_LABEL Pod label selector (default: app=volcano-scheduler)

Examples:
$0 --keyword error # Search for errors
$0 --pod my-job-0 --since 30m # Logs for pod in last 30 min
$0 --lines 500 --since 1h # Last 500 lines from past hour
$0 --keyword gang --follow # Stream gang scheduling logs
$0 --previous --lines 200 # Logs from previous scheduler instance
EOF
    # The --since example uses 24h rather than 1d: the value is handed to
    # kubectl as a duration, and day units are not accepted there.
    exit 0
}
|
|
35
|
+
|
|
36
|
+
# Parse arguments
# Defaults; each one may be overridden by the matching command-line option.
KEYWORD=""
POD=""
SINCE=""
LINES=100
FOLLOW=false
PREVIOUS=false

while [[ $# -gt 0 ]]; do
    case $1 in
        -h|--help) show_help ;;
        --keyword|--pod|--since|--lines)
            # Value-taking options: report a missing value explicitly instead
            # of letting the "$2" expansion abort with "unbound variable".
            if [[ $# -lt 2 ]]; then
                echo "Error: $1 requires a value. Use --help for usage." >&2
                exit 1
            fi
            case $1 in
                --keyword) KEYWORD="$2" ;;
                --pod) POD="$2" ;;
                --since) SINCE="$2" ;;
                --lines) LINES="$2" ;;
            esac
            shift 2
            ;;
        --follow) FOLLOW=true; shift ;;
        --previous) PREVIOUS=true; shift ;;
        *) echo "Unknown option: $1. Use --help for usage." >&2; exit 1 ;;
    esac
done

# --lines is later passed to kubectl as --tail=N, so reject anything that is
# not an integer line count (-1 is kept as kubectl's "all lines" value).
if ! [[ "$LINES" =~ ^(-1|[0-9]+)$ ]]; then
    echo "Error: --lines must be a non-negative integer (got '$LINES')" >&2
    exit 1
fi
|
|
56
|
+
|
|
57
|
+
# Validate arguments
# This script treats --follow and --since as mutually exclusive.
if [[ "$FOLLOW" == "true" && -n "$SINCE" ]]; then
    echo "Error: --follow and --since cannot be used together" >&2
    exit 1
fi

# Warn when a non-default --lines is combined with --follow.
# The condition must not also require --since: that combination has already
# been rejected above, which made this warning unreachable.
if [[ "$FOLLOW" == "true" && "$LINES" != "100" ]]; then
    echo "Warning: --follow ignores --lines, streaming from now" >&2
fi
|
|
66
|
+
|
|
67
|
+
# Environment settings
# Namespace and label selector of the scheduler; each falls back to its
# default when the corresponding environment variable is unset or empty.
SCHEDULER_NS="${VOLCANO_SCHEDULER_NS:-volcano-system}"
SCHEDULER_LABEL="${VOLCANO_SCHEDULER_LABEL:-app=volcano-scheduler}"

# Banner summarising the effective settings; optional filters are listed
# only when they were supplied.
printf '%s\n' \
    "=== Volcano Scheduler Logs ===" \
    "Namespace: $SCHEDULER_NS" \
    "Label: $SCHEDULER_LABEL"
if [[ -n "$KEYWORD" ]]; then
    printf '%s\n' "Keyword filter: $KEYWORD"
fi
if [[ -n "$POD" ]]; then
    printf '%s\n' "Pod filter: $POD"
fi
if [[ -n "$SINCE" ]]; then
    printf '%s\n' "Time range: $SINCE"
fi
printf '%s\n' "Lines: $LINES"
if [[ "$PREVIOUS" == "true" ]]; then
    printf '%s\n' "Previous instance: yes"
fi
printf '\n'
|
|
80
|
+
|
|
81
|
+
# Resolve the scheduler pod whose logs will be read.
# `kubectl get pods -l ...` exits 0 even when no pod matches the selector, so
# its exit status cannot be used to detect a missing scheduler. Look the pod
# name up directly instead: the result is empty when the lookup fails or when
# nothing matches, and that single condition drives the error path below.
SCHEDULER_POD=$(kubectl get pods -n "$SCHEDULER_NS" -l "$SCHEDULER_LABEL" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "")

if [[ -z "$SCHEDULER_POD" ]]; then
    echo "Error: No scheduler pods found in namespace '$SCHEDULER_NS' with label '$SCHEDULER_LABEL'" >&2
    echo "Available pods in $SCHEDULER_NS:" >&2
    # Best-effort listing to help the caller pick the right namespace/label.
    kubectl get pods -n "$SCHEDULER_NS" 2>/dev/null | head -10 >&2 || echo " (failed to list pods)" >&2
    exit 1
fi

echo "Scheduler Pod: $SCHEDULER_POD"
echo
|
|
99
|
+
|
|
100
|
+
# Build kubectl logs command
# The command is kept as an array so that each argument stays a single word
# when executed; a plain string expanded unquoted would be word-split and
# glob-expanded, which breaks on unusual namespace or --since values.
LOG_CMD=(kubectl logs -n "$SCHEDULER_NS" "$SCHEDULER_POD")

# Add options
if [[ "$FOLLOW" == "true" ]]; then
    LOG_CMD+=(--follow)
fi
if [[ "$PREVIOUS" == "true" ]]; then
    LOG_CMD+=(--previous)
fi
if [[ -n "$SINCE" ]]; then
    LOG_CMD+=("--since=$SINCE")
fi
# --tail is only applied to one-shot retrieval, not to follow mode.
if [[ "$FOLLOW" == "false" ]]; then
    LOG_CMD+=("--tail=$LINES")
fi

# Execute command with optional filtering
echo "Executing: ${LOG_CMD[*]}"
echo "----------------------------------------"
echo

# Build filter pattern
FILTER_PATTERN=""

# If both keyword and pod are specified, combine them as alternatives
# (a line is shown when it matches either one).
if [[ -n "$KEYWORD" && -n "$POD" ]]; then
    FILTER_PATTERN="$KEYWORD|$POD"
elif [[ -n "$KEYWORD" ]]; then
    FILTER_PATTERN="$KEYWORD"
elif [[ -n "$POD" ]]; then
    FILTER_PATTERN="$POD"
fi

# Execute and filter
if [[ -n "$FILTER_PATTERN" ]]; then
    # Case-insensitive extended-regex filter. The pattern is passed with -e so
    # a value starting with "-" is not mistaken for a grep option.
    if [[ "$FOLLOW" == "true" ]]; then
        # Follow mode: line-buffer grep so matches appear as they arrive
        # instead of being held back until an output buffer fills.
        "${LOG_CMD[@]}" 2>&1 | grep -iE --line-buffered -e "$FILTER_PATTERN" || true
    else
        # One-shot mode: filter the retrieved logs; report when nothing matched.
        "${LOG_CMD[@]}" 2>&1 | grep -iE -e "$FILTER_PATTERN" || {
            echo "(No log lines matched the filter pattern: $FILTER_PATTERN)"
        }
    fi
else
    # No filtering, show all logs
    "${LOG_CMD[@]}" 2>&1 || {
        echo "Error: Failed to retrieve logs" >&2
        exit 1
    }
fi
|
|
145
|
+
|
|
146
|
+
printf '\n'

# After a one-shot (non-follow) run, print a short list of usage hints.
if [[ "$FOLLOW" == "false" ]]; then
    cat <<'EOF'
----------------------------------------
Hints:
 - Use --follow to stream logs in real-time
 - Use --since 30m for recent logs only
 - Use --previous if scheduler recently restarted
 - Common keywords: error, FailedScheduling, gang, preempt, reclaim
EOF
fi

# Closing marker so callers can tell the script ran to completion.
printf '\n%s\n' "=== Log Retrieval Complete ==="
|
|
@@ -15,6 +15,32 @@ Use this skill when the user asks you to:
|
|
|
15
15
|
- Create a new skill for a specific operational procedure
|
|
16
16
|
- Save a diagnosis workflow as a skill
|
|
17
17
|
|
|
18
|
+
## Duplicate / Overlap Check — Do This FIRST
|
|
19
|
+
|
|
20
|
+
**Before creating any skill, check whether an existing skill already covers the same functionality.** Consult the `<available_skills>` index in your context.
|
|
21
|
+
|
|
22
|
+
- **Functional overlap found**: If an existing builtin, team, or personal skill solves the same problem (even with a different name), DO NOT silently create a new one. Instead:
|
|
23
|
+
1. Tell the user which existing skill overlaps and what it does.
|
|
24
|
+
2. Ask if they want to: (a) use the existing skill as-is, (b) fork it with `fork_skill` to make a customized personal copy, or (c) still create a brand-new separate skill.
|
|
25
|
+
3. Only proceed with `create_skill` if the user explicitly chooses option (c).
|
|
26
|
+
- **Why this matters**: Duplicate skills with similar functionality confuse the model — it cannot reliably choose between two skills that do the same thing. One well-maintained skill is always better than two overlapping ones.
|
|
27
|
+
- To fork a builtin or team skill into a personal copy, use `fork_skill`.
|
|
28
|
+
|
|
29
|
+
## Environments and Approval Workflow
|
|
30
|
+
|
|
31
|
+
Skills go through a review workflow that behaves differently per environment:
|
|
32
|
+
|
|
33
|
+
| Environment | Behavior |
|
|
34
|
+
|-------------|----------|
|
|
35
|
+
| **Dev / Test** | Newly created skills (draft status) are immediately visible and usable. You can test them right away. |
|
|
36
|
+
| **Production** | Only **approved** skill versions are visible and usable. Draft and pending skills do NOT appear. |
|
|
37
|
+
|
|
38
|
+
- After creating a skill, it starts in **draft** status.
|
|
39
|
+
- Skills with scripts must be **submitted for review** and **approved by an admin** before they become active in production.
|
|
40
|
+
- Skills without scripts (pure guidance) also start as draft but can be submitted and approved more quickly.
|
|
41
|
+
- **After creating a skill in a production context**: inform the user that it is pending review and will not be available in production until approved. Suggest testing in the dev/test environment first.
|
|
42
|
+
- **Do NOT attempt to test or run a newly created skill in production** — it will not be found.
|
|
43
|
+
|
|
18
44
|
## Skill Structure
|
|
19
45
|
|
|
20
46
|
A skill is a directory under `skills/` containing:
|
|
@@ -171,9 +197,12 @@ node_script: node="node-1", skill="node-logs", script="get-node-logs.sh", args="
|
|
|
171
197
|
|
|
172
198
|
## How to Create a Skill
|
|
173
199
|
|
|
174
|
-
### Step 0: Check
|
|
200
|
+
### Step 0: Check for Duplicates and Completeness
|
|
201
|
+
|
|
202
|
+
Before calling `create_skill`:
|
|
175
203
|
|
|
176
|
-
|
|
204
|
+
1. **Check for existing skills** — consult the `<available_skills>` index. If an existing skill covers the same functionality, discuss with the user: reuse as-is, fork with `fork_skill`, or create new.
|
|
205
|
+
2. **Verify completeness** — a good skill needs **all** of the following. If any are missing, ask the user:
|
|
177
206
|
|
|
178
207
|
| Required Info | What to check | Example question to ask |
|
|
179
208
|
|---|---|---|
|
|
@@ -205,7 +234,8 @@ create_skill({
|
|
|
205
234
|
description: "Find OOMKilled pods and analyze memory usage",
|
|
206
235
|
type: "Monitoring",
|
|
207
236
|
specs: "---\nname: check-pod-oom\n...",
|
|
208
|
-
scripts: [{ name: "check-oom.sh", content: "#!/bin/bash\n..." }]
|
|
237
|
+
scripts: [{ name: "check-oom.sh", content: "#!/bin/bash\n..." }],
|
|
238
|
+
labels: ["monitoring", "memory"]
|
|
209
239
|
})
|
|
210
240
|
```
|
|
211
241
|
|
|
@@ -283,6 +313,8 @@ pod_netns_script: pod="<pod>", namespace="<ns>", skill="pod-ping-gateway", scrip
|
|
|
283
313
|
- **`## Parameters` table**: list required and optional parameters with descriptions
|
|
284
314
|
- **Actionable examples**: show multiple real tool invocations with realistic parameters
|
|
285
315
|
- **Category selection**: choose from Monitoring, Network, Security, Database, Core, Utility, Automation, Custom
|
|
316
|
+
- **Labels**: add relevant labels (e.g. `["gpu", "network", "monitoring"]`) for discoverability
|
|
286
317
|
- **Scripts are optional**: simple skills that just guide the bot's kubectl usage don't need scripts
|
|
287
318
|
- **One concern per skill**: keep skills focused on a single task
|
|
319
|
+
- **No duplicates**: always check for existing skills first; fork rather than recreate
|
|
288
320
|
- **User scripts by name**: when referencing uploaded scripts, just pass `{name: "file.sh"}` without content
|
|
@@ -9,15 +9,22 @@ description: >-
|
|
|
9
9
|
|
|
10
10
|
## When to Use
|
|
11
11
|
|
|
12
|
-
When the user requests to create, update, edit, enable, or disable a Skill.
|
|
12
|
+
When the user asks to create, update, edit, enable, or disable a Skill in a **Channel** conversation (where skill management tools are not available).
|
|
13
13
|
|
|
14
14
|
## Instructions
|
|
15
15
|
|
|
16
16
|
Skill creation, updates, and management should be done through the Siclaw Web page.
|
|
17
17
|
|
|
18
18
|
On the Web page, you can:
|
|
19
|
-
- Create and edit Skills
|
|
19
|
+
- Create and edit Skills (with live preview)
|
|
20
|
+
- Fork builtin or team skills into personal copies
|
|
21
|
+
- Submit skills for review and approval
|
|
20
22
|
- Enable or disable Skills
|
|
21
23
|
- View Skill execution history
|
|
22
24
|
|
|
23
25
|
Inform the user of this directly — no further action is required.
|
|
26
|
+
|
|
27
|
+
## Environments
|
|
28
|
+
|
|
29
|
+
- **Dev / Test environment**: newly created or updated skills are immediately usable for testing.
|
|
30
|
+
- **Production environment**: only approved skill versions are available. Skills must go through admin review before they appear in production.
|
|
@@ -2,27 +2,37 @@
|
|
|
2
2
|
name: update-skill
|
|
3
3
|
description: >-
|
|
4
4
|
Procedure for modifying, updating, or fixing an existing Siclaw skill.
|
|
5
|
-
|
|
6
|
-
never edit files directly.
|
|
5
|
+
Use the update_skill tool — never edit skill files directly.
|
|
7
6
|
---
|
|
8
7
|
|
|
9
8
|
# Update Skill
|
|
10
9
|
|
|
11
10
|
## When to Use
|
|
12
11
|
|
|
13
|
-
When the user's message contains `[
|
|
12
|
+
When the user's message contains `[Skill: <name>]` (UI skill editing context), or when the user asks to modify/update/fix an existing skill.
|
|
13
|
+
|
|
14
|
+
## Environments and Approval Workflow
|
|
15
|
+
|
|
16
|
+
| Environment | Behavior |
|
|
17
|
+
|-------------|----------|
|
|
18
|
+
| **Dev / Test** | Updated content (working copy) is immediately visible and testable. |
|
|
19
|
+
| **Production** | Only the **approved** version is active. Updates enter a staged review state; the old version remains in use until the new version is approved by an admin. |
|
|
20
|
+
|
|
21
|
+
- When scripts are changed, the update enters a **staged review** state.
|
|
22
|
+
- The **old version** of the skill remains usable in production during review.
|
|
23
|
+
- In dev/test, the working copy is available immediately for testing.
|
|
14
24
|
|
|
15
25
|
## How to Update
|
|
16
26
|
|
|
17
27
|
Call the `update_skill` tool (NOT `create_skill`) with the skill ID and the complete updated definition.
|
|
18
28
|
|
|
19
|
-
**
|
|
29
|
+
**Skill directories are read-only. All skill modifications must go through the skill management tools (`create_skill`, `update_skill`, `fork_skill`).**
|
|
20
30
|
|
|
21
31
|
### Tool Call Format
|
|
22
32
|
|
|
23
33
|
```
|
|
24
34
|
update_skill({
|
|
25
|
-
id: "<skill-id>", //
|
|
35
|
+
id: "<skill-id>", // From [Skill: ...] context, or the skill's kebab-case name
|
|
26
36
|
name: "skill-name", // Keep original name unless user wants rename
|
|
27
37
|
description: "What the skill does",
|
|
28
38
|
type: "Monitoring",
|
|
@@ -31,7 +41,8 @@ update_skill({
|
|
|
31
41
|
{ name: "run.sh", content: "#!/bin/bash\n..." }, // Changed: provide full content
|
|
32
42
|
{ name: "check.sh" } // Unchanged: name only
|
|
33
43
|
// Omitted scripts are deleted
|
|
34
|
-
]
|
|
44
|
+
],
|
|
45
|
+
labels: ["monitoring", "memory"] // Optional labels/tags
|
|
35
46
|
})
|
|
36
47
|
```
|
|
37
48
|
|