siclaw 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (270) hide show
  1. package/README.md +75 -114
  2. package/dist/agentbox/gateway-client.d.ts +2 -1
  3. package/dist/agentbox/gateway-client.js +6 -2
  4. package/dist/agentbox/gateway-client.js.map +1 -1
  5. package/dist/agentbox/http-server.js +184 -19
  6. package/dist/agentbox/http-server.js.map +1 -1
  7. package/dist/agentbox/resource-handlers.d.ts +1 -0
  8. package/dist/agentbox/resource-handlers.js +23 -23
  9. package/dist/agentbox/resource-handlers.js.map +1 -1
  10. package/dist/agentbox/session.js +85 -5
  11. package/dist/agentbox/session.js.map +1 -1
  12. package/dist/agentbox-main.d.ts +2 -1
  13. package/dist/agentbox-main.js +65 -18
  14. package/dist/agentbox-main.js.map +1 -1
  15. package/dist/cli-credentials.d.ts +1 -0
  16. package/dist/cli-credentials.js +109 -0
  17. package/dist/cli-credentials.js.map +1 -0
  18. package/dist/cli-first-run.d.ts +11 -0
  19. package/dist/cli-first-run.js +99 -0
  20. package/dist/cli-first-run.js.map +1 -0
  21. package/dist/cli-main.js +33 -11
  22. package/dist/cli-main.js.map +1 -1
  23. package/dist/cli-setup.d.ts +5 -11
  24. package/dist/cli-setup.js +12 -225
  25. package/dist/cli-setup.js.map +1 -1
  26. package/dist/core/agent-factory.d.ts +4 -0
  27. package/dist/core/agent-factory.js +102 -151
  28. package/dist/core/agent-factory.js.map +1 -1
  29. package/dist/core/config.d.ts +10 -3
  30. package/dist/core/config.js +11 -95
  31. package/dist/core/config.js.map +1 -1
  32. package/dist/core/extensions/deep-investigation.d.ts +2 -1
  33. package/dist/core/extensions/deep-investigation.js +144 -24
  34. package/dist/core/extensions/deep-investigation.js.map +1 -1
  35. package/dist/core/extensions/setup.d.ts +8 -0
  36. package/dist/core/extensions/setup.js +669 -0
  37. package/dist/core/extensions/setup.js.map +1 -0
  38. package/dist/core/llm-proxy.js +7 -3
  39. package/dist/core/llm-proxy.js.map +1 -1
  40. package/dist/core/mcp-client.d.ts +0 -10
  41. package/dist/core/mcp-client.js +0 -65
  42. package/dist/core/mcp-client.js.map +1 -1
  43. package/dist/core/prompt.d.ts +1 -1
  44. package/dist/core/prompt.js +42 -5
  45. package/dist/core/prompt.js.map +1 -1
  46. package/dist/core/provider-presets.d.ts +14 -0
  47. package/dist/core/provider-presets.js +81 -0
  48. package/dist/core/provider-presets.js.map +1 -0
  49. package/dist/cron/cron-coordinator.d.ts +2 -0
  50. package/dist/cron/cron-coordinator.js +46 -14
  51. package/dist/cron/cron-coordinator.js.map +1 -1
  52. package/dist/cron/cron-executor.js +33 -8
  53. package/dist/cron/cron-executor.js.map +1 -1
  54. package/dist/cron/cron-scheduler.d.ts +1 -1
  55. package/dist/cron/gateway-client.d.ts +5 -0
  56. package/dist/cron/gateway-client.js +43 -8
  57. package/dist/cron/gateway-client.js.map +1 -1
  58. package/dist/cron-main.js +39 -9
  59. package/dist/cron-main.js.map +1 -1
  60. package/dist/gateway/agentbox/client.d.ts +11 -0
  61. package/dist/gateway/agentbox/client.js +18 -0
  62. package/dist/gateway/agentbox/client.js.map +1 -1
  63. package/dist/gateway/agentbox/k8s-spawner.d.ts +11 -2
  64. package/dist/gateway/agentbox/k8s-spawner.js +95 -52
  65. package/dist/gateway/agentbox/k8s-spawner.js.map +1 -1
  66. package/dist/gateway/agentbox/local-spawner.d.ts +1 -1
  67. package/dist/gateway/agentbox/local-spawner.js +4 -2
  68. package/dist/gateway/agentbox/local-spawner.js.map +1 -1
  69. package/dist/gateway/agentbox/manager.d.ts +0 -10
  70. package/dist/gateway/agentbox/manager.js +11 -30
  71. package/dist/gateway/agentbox/manager.js.map +1 -1
  72. package/dist/gateway/agentbox/types.d.ts +6 -4
  73. package/dist/gateway/cron/cron-service.d.ts +49 -0
  74. package/dist/gateway/cron/cron-service.js +259 -0
  75. package/dist/gateway/cron/cron-service.js.map +1 -0
  76. package/dist/gateway/db/init-schema.js +44 -0
  77. package/dist/gateway/db/init-schema.js.map +1 -1
  78. package/dist/gateway/db/migrate-sqlite.js +73 -4
  79. package/dist/gateway/db/migrate-sqlite.js.map +1 -1
  80. package/dist/gateway/db/repositories/chat-repo.d.ts +56 -2
  81. package/dist/gateway/db/repositories/chat-repo.js +132 -2
  82. package/dist/gateway/db/repositories/chat-repo.js.map +1 -1
  83. package/dist/gateway/db/repositories/config-repo.d.ts +31 -2
  84. package/dist/gateway/db/repositories/config-repo.js +57 -7
  85. package/dist/gateway/db/repositories/config-repo.js.map +1 -1
  86. package/dist/gateway/db/repositories/env-repo.d.ts +14 -0
  87. package/dist/gateway/db/repositories/env-repo.js +15 -2
  88. package/dist/gateway/db/repositories/env-repo.js.map +1 -1
  89. package/dist/gateway/db/repositories/model-config-repo.d.ts +1 -1
  90. package/dist/gateway/db/repositories/model-config-repo.js +26 -12
  91. package/dist/gateway/db/repositories/model-config-repo.js.map +1 -1
  92. package/dist/gateway/db/repositories/skill-repo.d.ts +0 -5
  93. package/dist/gateway/db/repositories/skill-review-repo.d.ts +1 -0
  94. package/dist/gateway/db/repositories/skill-review-repo.js +4 -1
  95. package/dist/gateway/db/repositories/skill-review-repo.js.map +1 -1
  96. package/dist/gateway/db/repositories/skill-version-repo.js +0 -1
  97. package/dist/gateway/db/repositories/skill-version-repo.js.map +1 -1
  98. package/dist/gateway/db/repositories/system-config-repo.d.ts +1 -1
  99. package/dist/gateway/db/repositories/system-config-repo.js +2 -1
  100. package/dist/gateway/db/repositories/system-config-repo.js.map +1 -1
  101. package/dist/gateway/db/repositories/user-env-config-repo.d.ts +13 -0
  102. package/dist/gateway/db/repositories/user-env-config-repo.js +11 -0
  103. package/dist/gateway/db/repositories/user-env-config-repo.js.map +1 -1
  104. package/dist/gateway/db/repositories/workspace-repo.d.ts +3 -2
  105. package/dist/gateway/db/repositories/workspace-repo.js +6 -2
  106. package/dist/gateway/db/repositories/workspace-repo.js.map +1 -1
  107. package/dist/gateway/db/schema-mysql.d.ts +473 -51
  108. package/dist/gateway/db/schema-mysql.js +35 -4
  109. package/dist/gateway/db/schema-mysql.js.map +1 -1
  110. package/dist/gateway/db/schema-sqlite.d.ts +522 -57
  111. package/dist/gateway/db/schema-sqlite.js +38 -6
  112. package/dist/gateway/db/schema-sqlite.js.map +1 -1
  113. package/dist/gateway/db/schema.d.ts +471 -51
  114. package/dist/gateway/db/schema.js +1 -1
  115. package/dist/gateway/db/schema.js.map +1 -1
  116. package/dist/gateway/metrics-aggregator.d.ts +65 -0
  117. package/dist/gateway/metrics-aggregator.js +244 -0
  118. package/dist/gateway/metrics-aggregator.js.map +1 -0
  119. package/dist/gateway/plugins/channel-bridge.d.ts +4 -1
  120. package/dist/gateway/plugins/channel-bridge.js +78 -86
  121. package/dist/gateway/plugins/channel-bridge.js.map +1 -1
  122. package/dist/gateway/rpc-methods.d.ts +4 -2
  123. package/dist/gateway/rpc-methods.js +962 -163
  124. package/dist/gateway/rpc-methods.js.map +1 -1
  125. package/dist/gateway/security/cert-manager.d.ts +2 -2
  126. package/dist/gateway/security/cert-manager.js +4 -2
  127. package/dist/gateway/security/cert-manager.js.map +1 -1
  128. package/dist/gateway/server.d.ts +4 -8
  129. package/dist/gateway/server.js +297 -261
  130. package/dist/gateway/server.js.map +1 -1
  131. package/dist/gateway/skills/file-writer.js +17 -11
  132. package/dist/gateway/skills/file-writer.js.map +1 -1
  133. package/dist/gateway/skills/script-evaluator.js +12 -9
  134. package/dist/gateway/skills/script-evaluator.js.map +1 -1
  135. package/dist/gateway/web/dist/assets/index-0p17ZeTP.js +740 -0
  136. package/dist/gateway/web/dist/assets/index-9eP6nPUq.js +741 -0
  137. package/dist/gateway/web/dist/assets/index-9eP6nPUq.js.map +1 -0
  138. package/dist/gateway/web/dist/assets/index-CAmSY91d.js +675 -0
  139. package/dist/gateway/web/dist/assets/index-DMFEh8Pp.css +1 -0
  140. package/dist/gateway/web/dist/assets/index-DyowBCEj.css +1 -0
  141. package/dist/gateway/web/dist/assets/index-PDK5JJDO.css +1 -0
  142. package/dist/gateway/web/dist/index.html +2 -2
  143. package/dist/gateway-main.js +27 -10
  144. package/dist/gateway-main.js.map +1 -1
  145. package/dist/memory/embeddings.js +5 -4
  146. package/dist/memory/embeddings.js.map +1 -1
  147. package/dist/memory/indexer.d.ts +23 -3
  148. package/dist/memory/indexer.js +235 -23
  149. package/dist/memory/indexer.js.map +1 -1
  150. package/dist/memory/schema.js +15 -1
  151. package/dist/memory/schema.js.map +1 -1
  152. package/dist/memory/types.d.ts +18 -0
  153. package/dist/memory/types.js +6 -1
  154. package/dist/memory/types.js.map +1 -1
  155. package/dist/shared/detect-language.d.ts +12 -0
  156. package/dist/shared/detect-language.js +78 -0
  157. package/dist/shared/detect-language.js.map +1 -0
  158. package/dist/shared/diagnostic-events.d.ts +70 -0
  159. package/dist/shared/diagnostic-events.js +38 -0
  160. package/dist/shared/diagnostic-events.js.map +1 -0
  161. package/dist/shared/local-collector.d.ts +56 -0
  162. package/dist/shared/local-collector.js +284 -0
  163. package/dist/shared/local-collector.js.map +1 -0
  164. package/dist/shared/metrics-types.d.ts +64 -0
  165. package/dist/shared/metrics-types.js +25 -0
  166. package/dist/shared/metrics-types.js.map +1 -0
  167. package/dist/shared/metrics.d.ts +19 -0
  168. package/dist/shared/metrics.js +185 -0
  169. package/dist/shared/metrics.js.map +1 -0
  170. package/dist/shared/path-utils.d.ts +15 -0
  171. package/dist/shared/path-utils.js +23 -0
  172. package/dist/shared/path-utils.js.map +1 -0
  173. package/dist/shared/retry.d.ts +35 -0
  174. package/dist/shared/retry.js +61 -0
  175. package/dist/shared/retry.js.map +1 -0
  176. package/dist/tools/command-sets.d.ts +18 -2
  177. package/dist/tools/command-sets.js +207 -32
  178. package/dist/tools/command-sets.js.map +1 -1
  179. package/dist/tools/command-validator.d.ts +56 -0
  180. package/dist/tools/command-validator.js +357 -0
  181. package/dist/tools/command-validator.js.map +1 -0
  182. package/dist/tools/create-skill.js +26 -1
  183. package/dist/tools/create-skill.js.map +1 -1
  184. package/dist/tools/credential-list.js +1 -23
  185. package/dist/tools/credential-list.js.map +1 -1
  186. package/dist/tools/credential-manager.d.ts +98 -0
  187. package/dist/tools/credential-manager.js +313 -0
  188. package/dist/tools/credential-manager.js.map +1 -0
  189. package/dist/tools/deep-search/engine.js +184 -127
  190. package/dist/tools/deep-search/engine.js.map +1 -1
  191. package/dist/tools/deep-search/prompts.d.ts +10 -2
  192. package/dist/tools/deep-search/prompts.js +37 -36
  193. package/dist/tools/deep-search/prompts.js.map +1 -1
  194. package/dist/tools/deep-search/schemas.d.ts +87 -0
  195. package/dist/tools/deep-search/schemas.js +85 -0
  196. package/dist/tools/deep-search/schemas.js.map +1 -0
  197. package/dist/tools/deep-search/sub-agent.d.ts +21 -0
  198. package/dist/tools/deep-search/sub-agent.js +153 -4
  199. package/dist/tools/deep-search/sub-agent.js.map +1 -1
  200. package/dist/tools/deep-search/tool.js +1 -0
  201. package/dist/tools/deep-search/tool.js.map +1 -1
  202. package/dist/tools/deep-search/types.d.ts +2 -0
  203. package/dist/tools/deep-search/types.js.map +1 -1
  204. package/dist/tools/dp-tools.js +29 -5
  205. package/dist/tools/dp-tools.js.map +1 -1
  206. package/dist/tools/exec-utils.d.ts +85 -0
  207. package/dist/tools/exec-utils.js +294 -0
  208. package/dist/tools/exec-utils.js.map +1 -0
  209. package/dist/tools/fork-skill.js +14 -2
  210. package/dist/tools/fork-skill.js.map +1 -1
  211. package/dist/tools/investigation-feedback.d.ts +3 -0
  212. package/dist/tools/investigation-feedback.js +71 -0
  213. package/dist/tools/investigation-feedback.js.map +1 -0
  214. package/dist/tools/manage-schedule.js +16 -6
  215. package/dist/tools/manage-schedule.js.map +1 -1
  216. package/dist/tools/netns-script.js +27 -281
  217. package/dist/tools/netns-script.js.map +1 -1
  218. package/dist/tools/node-exec.d.ts +2 -14
  219. package/dist/tools/node-exec.js +18 -225
  220. package/dist/tools/node-exec.js.map +1 -1
  221. package/dist/tools/node-script.js +14 -168
  222. package/dist/tools/node-script.js.map +1 -1
  223. package/dist/tools/pod-exec.d.ts +1 -1
  224. package/dist/tools/pod-exec.js +10 -26
  225. package/dist/tools/pod-exec.js.map +1 -1
  226. package/dist/tools/pod-nsenter-exec.js +21 -225
  227. package/dist/tools/pod-nsenter-exec.js.map +1 -1
  228. package/dist/tools/pod-script.js +10 -19
  229. package/dist/tools/pod-script.js.map +1 -1
  230. package/dist/tools/restricted-bash.d.ts +1 -17
  231. package/dist/tools/restricted-bash.js +38 -252
  232. package/dist/tools/restricted-bash.js.map +1 -1
  233. package/dist/tools/run-skill.d.ts +3 -1
  234. package/dist/tools/run-skill.js +21 -1
  235. package/dist/tools/run-skill.js.map +1 -1
  236. package/dist/tools/script-resolver.d.ts +3 -1
  237. package/dist/tools/script-resolver.js +74 -30
  238. package/dist/tools/script-resolver.js.map +1 -1
  239. package/dist/tools/update-skill.js +17 -6
  240. package/dist/tools/update-skill.js.map +1 -1
  241. package/package.json +8 -6
  242. package/siclaw.mjs +10 -1
  243. package/skills/core/cluster-events/SKILL.md +1 -1
  244. package/skills/core/deep-investigation/SKILL.md +11 -0
  245. package/skills/core/deployment-rollout-debug/SKILL.md +1 -1
  246. package/skills/core/dns-debug/SKILL.md +1 -0
  247. package/skills/core/meta.json +12 -1
  248. package/skills/core/networkpolicy-debug/SKILL.md +332 -0
  249. package/skills/core/node-logs/scripts/get-node-logs.sh +19 -9
  250. package/skills/core/pod-pending-debug/SKILL.md +1 -0
  251. package/skills/core/quota-debug/SKILL.md +203 -0
  252. package/skills/core/service-debug/SKILL.md +1 -0
  253. package/skills/core/statefulset-debug/SKILL.md +280 -0
  254. package/skills/core/volcano-diagnose-pod/SKILL.md +196 -0
  255. package/skills/core/volcano-diagnose-pod/scripts/diagnose-pod.sh +175 -0
  256. package/skills/core/volcano-gang-scheduling/SKILL.md +299 -0
  257. package/skills/core/volcano-job-diagnose/SKILL.md +319 -0
  258. package/skills/core/volcano-job-diagnose/scripts/diagnose-job.sh +253 -0
  259. package/skills/core/volcano-node-resources/SKILL.md +334 -0
  260. package/skills/core/volcano-node-resources/scripts/get-node-resources.sh +281 -0
  261. package/skills/core/volcano-queue-diagnose/SKILL.md +294 -0
  262. package/skills/core/volcano-queue-diagnose/scripts/diagnose-queue.sh +283 -0
  263. package/skills/core/volcano-resource-insufficient/SKILL.md +315 -0
  264. package/skills/core/volcano-scheduler-config/SKILL.md +371 -0
  265. package/skills/core/volcano-scheduler-config/scripts/get-scheduler-config.sh +297 -0
  266. package/skills/core/volcano-scheduler-logs/SKILL.md +241 -0
  267. package/skills/core/volcano-scheduler-logs/scripts/get-scheduler-logs.sh +159 -0
  268. package/skills/platform/create-skill/SKILL.md +35 -3
  269. package/skills/platform/manage-skill/SKILL.md +9 -2
  270. package/skills/platform/update-skill/SKILL.md +17 -6
@@ -0,0 +1,241 @@
1
+ ---
2
+ name: volcano-scheduler-logs
3
+ description: >-
4
+ Retrieve and analyze Volcano scheduler logs.
5
+ Filter by keyword, time range, or pod name to debug scheduling decisions.
6
+ ---
7
+
8
+ # Volcano Scheduler Logs
9
+
10
+ Retrieve and analyze Volcano scheduler logs to understand scheduling decisions, failures, and performance issues.
11
+
12
+ **Scope:** This skill is for **diagnosis only**. It retrieves logs for analysis but does not modify any cluster state.
13
+
14
+ ## Usage
15
+
16
+ ```bash
17
+ bash skills/core/volcano-scheduler-logs/scripts/get-scheduler-logs.sh [options]
18
+ ```
19
+
20
+ ## Parameters
21
+
22
+ | Parameter | Required | Description |
23
+ |-----------|----------|-------------|
24
+ | `--keyword KEYWORD` | no | Filter logs by keyword (case-insensitive) |
25
+ | `--pod POD` | no | Filter logs related to specific pod name (when combined with `--keyword`, lines matching either the keyword or the pod name are shown) |
26
+ | `--since TIME` | no | Show logs newer than relative time (e.g., 10m, 1h) |
27
+ | `--lines N` | no | Number of lines to show (default: 100) |
28
+ | `--follow` | no | Stream logs in real-time (Ctrl+C to stop); cannot be combined with `--since`, and `--lines` is ignored while streaming |
29
+ | `--previous` | no | Show logs from previous container instance (after restart) |
30
+
31
+ ## Examples
32
+
33
+ Get recent scheduler logs:
34
+ ```bash
35
+ bash skills/core/volcano-scheduler-logs/scripts/get-scheduler-logs.sh
36
+ ```
37
+
38
+ Search for error messages:
39
+ ```bash
40
+ bash skills/core/volcano-scheduler-logs/scripts/get-scheduler-logs.sh --keyword error
41
+ ```
42
+
43
+ Get logs for a specific pod:
44
+ ```bash
45
+ bash skills/core/volcano-scheduler-logs/scripts/get-scheduler-logs.sh --pod my-job-0
46
+ ```
47
+
48
+ Get last 500 lines from the past hour:
49
+ ```bash
50
+ bash skills/core/volcano-scheduler-logs/scripts/get-scheduler-logs.sh --since 1h --lines 500
51
+ ```
52
+
53
+ Stream logs for gang scheduling issues:
54
+ ```bash
55
+ bash skills/core/volcano-scheduler-logs/scripts/get-scheduler-logs.sh --keyword gang --follow
56
+ ```
57
+
58
+ Check logs from previous scheduler instance (after crash/restart):
59
+ ```bash
60
+ bash skills/core/volcano-scheduler-logs/scripts/get-scheduler-logs.sh --previous --lines 200
61
+ ```
62
+
63
+ ## Common Keywords for Filtering
64
+
65
+ | Keyword | Use Case |
66
+ |---------|----------|
67
+ | `error` | Find error messages and failures |
68
+ | `FailedScheduling` | Scheduling failures |
69
+ | `allocate` | Resource allocation attempts |
70
+ | `gang` | Gang scheduling decisions |
71
+ | `minMember` | MinMember constraint issues |
72
+ | `preempt` | Preemption events |
73
+ | `reclaim` | Resource reclamation |
74
+ | `enqueue` | Queue admission decisions |
75
+ | `bind` | Pod binding attempts |
76
+ | `queue` | Queue-related decisions |
77
+ | `proportion` | Proportion plugin decisions |
78
+ | `priority` | Priority-related decisions |
79
+
80
+ ## Understanding Scheduler Logs
81
+
82
+ ### Log Format
83
+
84
+ Volcano scheduler logs typically follow this format:
85
+ ```
86
+ I0102 15:04:05.123456 1 scheduler.go:123] Starting scheduling session
87
+ I0102 15:04:05.234567 1 allocate.go:456] Try to allocate resources for Job <namespace>/<job-name>
88
+ E0102 15:04:05.345678 1 gang.go:789] Failed to schedule pod <pod-name>: minMember not satisfied
89
+ ```
90
+
91
+ **Log levels:**
92
+ - `I` - Info: Normal operation information
93
+ - `W` - Warning: Unusual but non-fatal conditions
94
+ - `E` - Error: Failures and errors
95
+ - `F` - Fatal: Critical errors causing shutdown
96
+
97
+ ### Common Log Patterns
98
+
99
+ #### Session Start
100
+ ```
101
+ Starting scheduling session
102
+ Starting scheduling loop
103
+ ```
104
+ - Indicates scheduler is processing a new batch of pending pods
105
+
106
+ #### Enqueue Decisions
107
+ ```
108
+ Try to enqueue pod group
109
+ PodGroup <name> is enqueued
110
+ PodGroup <name> is pending
111
+ ```
112
+ - Shows whether pod groups are admitted to the queue
113
+
114
+ #### Allocation Attempts
115
+ ```
116
+ Try to allocate resources for Job
117
+ Try to allocate for task
118
+ ```
119
+ - Shows scheduling attempts for specific jobs/pods
120
+
121
+ #### Gang Scheduling
122
+ ```
123
+ minMember not satisfied
124
+ gang member not ready
125
+ Waiting for gang members
126
+ ```
127
+ - Indicates Gang constraint preventing scheduling
128
+
129
+ #### Resource Shortage
130
+ ```
131
+ Insufficient cpu
132
+ Insufficient memory
133
+ 0 nodes are available
134
+ ```
135
+ - Indicates resource constraint preventing scheduling
136
+
137
+ #### Preemption
138
+ ```
139
+ Preempting pods
140
+ Found victim pods
141
+ ```
142
+ - Shows preemption decisions for high-priority workloads
143
+
144
+ #### Reclaim
145
+ ```
146
+ Try to reclaim resources
147
+ Reclaiming resources from queue
148
+ ```
149
+ - Shows resource reclamation between queues
150
+
151
+ ## Diagnostic Use Cases
152
+
153
+ ### Case 1: Pod Stuck in Pending
154
+
155
+ Find relevant scheduler decisions:
156
+ ```bash
157
+ bash skills/core/volcano-scheduler-logs/scripts/get-scheduler-logs.sh --pod <pod-name> --since 30m
158
+ ```
159
+
160
+ Look for:
161
+ - `FailedScheduling` events
162
+ - `minMember not satisfied`
163
+ - `Insufficient` resource messages
164
+ - `enqueue` decisions (is the PodGroup being admitted?)
165
+
166
+ ### Case 2: Gang Scheduling Issues
167
+
168
+ Check Gang plugin behavior:
169
+ ```bash
170
+ bash skills/core/volcano-scheduler-logs/scripts/get-scheduler-logs.sh --keyword gang --since 1h
171
+ ```
172
+
173
+ Look for:
174
+ - `minMember` related messages
175
+ - Gang constraint validation
176
+ - Comparison of running vs required members
177
+
178
+ ### Case 3: Queue Resource Issues
179
+
180
+ Check proportion and reclaim decisions:
181
+ ```bash
182
+ bash skills/core/volcano-scheduler-logs/scripts/get-scheduler-logs.sh --keyword "reclaim|proportion" --since 30m
183
+ ```
184
+
185
+ Look for:
186
+ - Queue resource calculations
187
+ - Reclaim triggers
188
+ - Over-commit handling
189
+
190
+ ### Case 4: Scheduler Performance
191
+
192
+ Check for scheduling delays:
193
+ ```bash
194
+ bash skills/core/volcano-scheduler-logs/scripts/get-scheduler-logs.sh --lines 500 | grep -E "(Starting|Finished) scheduling"
195
+ ```
196
+
197
+ Look for:
198
+ - Long gaps between "Starting" and "Finished"
199
+ - High frequency of scheduling loops
200
+ - Errors causing retries
201
+
202
+ ### Case 5: Preemption Analysis
203
+
204
+ Check preemption decisions:
205
+ ```bash
206
+ bash skills/core/volcano-scheduler-logs/scripts/get-scheduler-logs.sh --keyword preempt --since 1h
207
+ ```
208
+
209
+ Look for:
210
+ - Which pods are being preempted
211
+ - Priority comparisons
212
+ - Preemption success/failure
213
+
214
+ ## Environment Variables
215
+
216
+ | Variable | Default | Description |
217
+ |----------|---------|-------------|
218
+ | `VOLCANO_SCHEDULER_NS` | `volcano-system` | Scheduler namespace |
219
+ | `VOLCANO_SCHEDULER_LABEL` | `app=volcano-scheduler` | Label selector for scheduler pods |
220
+
221
+ ## Limitations
222
+
223
+ 1. **Log retention:** Logs may be rotated based on cluster configuration
224
+ 2. **Multi-scheduler:** If running multiple schedulers, logs will be interleaved
225
+ 3. **Log level:** Default log level may not show all debug information
226
+ 4. **Previous logs:** `--previous` only works if the container has restarted
227
+
228
+ ## Tips for Effective Log Analysis
229
+
230
+ 1. **Use time ranges:** Narrow down with `--since` to focus on recent issues
231
+ 2. **Combine keywords:** Search for `error|failed` to catch all failures (the filter is a case-insensitive extended regex, so separate alternatives with `|`, not `\|`)
232
+ 3. **Check pod context:** Always include `--pod` when investigating specific pods
233
+ 4. **Look for patterns:** Repeating errors may indicate systemic issues
234
+ 5. **Correlate with events:** Compare with `kubectl get events` timestamps
235
+
236
+ ## See Also
237
+
238
+ - `volcano-diagnose-pod` - Diagnose individual pod issues
239
+ - `volcano-gang-scheduling` - Gang scheduling specific diagnosis
240
+ - `volcano-queue-diagnose` - Queue resource analysis
241
+ - `volcano-resource-insufficient` - Resource shortage diagnosis
@@ -0,0 +1,159 @@
1
+ #!/bin/bash
2
+ # Retrieve and analyze Volcano scheduler logs.
3
+ # This script performs read-only operations using kubectl.
4
+ set -euo pipefail
5
+
6
+ show_help() {  # Print the usage text to stdout, then exit 0 (never returns to the caller)
7
+ cat <<EOF
8
+ Usage: $0 [options]
9
+
10
+ Retrieve and analyze Volcano scheduler logs.
11
+ Filter by keyword, time range, or pod name to debug scheduling decisions.
12
+
13
+ Options:
14
+ --keyword KEYWORD Filter logs by keyword (case-insensitive)
15
+ --pod POD Filter logs related to specific pod name
16
+ --since TIME Show logs newer than relative time (e.g., 10m, 1h, 24h)
17
+ --lines N Number of lines to show (default: 100)
18
+ --follow Stream logs in real-time (Ctrl+C to stop)
19
+ --previous Show logs from previous container instance
20
+ -h, --help Show this help message
21
+
22
+ Environment:
23
+ VOLCANO_SCHEDULER_NS Scheduler namespace (default: volcano-system)
24
+ VOLCANO_SCHEDULER_LABEL Pod label selector (default: app=volcano-scheduler)
25
+
26
+ Examples:
27
+ $0 --keyword error # Search for errors
28
+ $0 --pod my-job-0 --since 30m # Logs for pod in last 30 min
29
+ $0 --lines 500 --since 1h # Last 500 lines from past hour
30
+ $0 --keyword gang --follow # Stream gang scheduling logs
31
+ $0 --previous --lines 200 # Logs from previous scheduler instance
32
+ EOF
33
+ exit 0
34
+ }
35
+
36
+ # Parse arguments (defaults first; value-taking flags consume the following word)
37
+ KEYWORD=""
38
+ POD=""
39
+ SINCE=""
40
+ TAIL_LINES=100  # deliberately not named LINES: bash may overwrite LINES with the terminal height (checkwinsize)
41
+ FOLLOW=false
42
+ PREVIOUS=false
43
+
44
+ while [[ $# -gt 0 ]]; do
45
+ case $1 in
46
+ -h|--help) show_help ;;
47
+ --keyword) KEYWORD="${2?--keyword requires a value}"; shift 2 ;;
48
+ --pod) POD="${2?--pod requires a value}"; shift 2 ;;
49
+ --since) SINCE="${2?--since requires a value}"; shift 2 ;;
50
+ --lines) TAIL_LINES="${2?--lines requires a value}"; shift 2 ;;
51
+ --follow) FOLLOW=true; shift ;;
52
+ --previous) PREVIOUS=true; shift ;;
53
+ *) echo "Unknown option: $1. Use --help for usage." >&2; exit 1 ;;
54
+ esac
55
+ done
56
+
57
+ # Validate arguments: --follow + --since is rejected; --follow + a non-default --lines only warns
58
+ if [[ "$FOLLOW" == "true" && -n "$SINCE" ]]; then
59
+ echo "Error: --follow and --since cannot be used together" >&2
60
+ exit 1
61
+ fi
62
+
63
+ if [[ "$FOLLOW" == "true" && "$TAIL_LINES" != "100" ]]; then
64
+ echo "Warning: --follow ignores --lines, streaming from now" >&2
65
+ fi
66
+
67
+ # Environment settings (overridable via VOLCANO_SCHEDULER_NS / VOLCANO_SCHEDULER_LABEL)
68
+ SCHEDULER_NS="${VOLCANO_SCHEDULER_NS:-volcano-system}"
69
+ SCHEDULER_LABEL="${VOLCANO_SCHEDULER_LABEL:-app=volcano-scheduler}"
70
+
71
+ echo "=== Volcano Scheduler Logs ==="
72
+ echo "Namespace: $SCHEDULER_NS"
73
+ echo "Label: $SCHEDULER_LABEL"
74
+ [[ -n "$KEYWORD" ]] && echo "Keyword filter: $KEYWORD"
75
+ [[ -n "$POD" ]] && echo "Pod filter: $POD"
76
+ [[ -n "$SINCE" ]] && echo "Time range: $SINCE"
77
+ echo "Lines: $TAIL_LINES"
78
+ [[ "$PREVIOUS" == "true" ]] && echo "Previous instance: yes"
79
+ echo
80
+
81
+ # Check that scheduler pods can be listed (tests kubectl's exit status only; an empty result is caught by the pod-name check below)
82
+ if ! kubectl get pods -n "$SCHEDULER_NS" -l "$SCHEDULER_LABEL" &>/dev/null; then
83
+ echo "Error: No scheduler pods found in namespace '$SCHEDULER_NS' with label '$SCHEDULER_LABEL'" >&2
84
+ echo "Available pods in $SCHEDULER_NS:" >&2
85
+ kubectl get pods -n "$SCHEDULER_NS" 2>/dev/null | head -10 >&2 || echo " (failed to list pods)" >&2
86
+ exit 1
87
+ fi
88
+
89
+ # Get scheduler pod name (first matching pod only; empty string if the lookup fails)
90
+ SCHEDULER_POD=$(kubectl get pods -n "$SCHEDULER_NS" -l "$SCHEDULER_LABEL" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "")
91
+
92
+ if [[ -z "$SCHEDULER_POD" ]]; then
93
+ echo "Error: Could not determine scheduler pod name" >&2
94
+ exit 1
95
+ fi
96
+
97
+ echo "Scheduler Pod: $SCHEDULER_POD"
98
+ echo
99
+
100
+ # Build kubectl logs command (kept as a plain string; it is later expanded unquoted to run it)
101
+ LOG_CMD="kubectl logs -n $SCHEDULER_NS $SCHEDULER_POD"
102
+
103
+ # Add options (--tail is only applied when not following)
104
+ [[ "$FOLLOW" == "true" ]] && LOG_CMD="$LOG_CMD --follow"
105
+ [[ "$PREVIOUS" == "true" ]] && LOG_CMD="$LOG_CMD --previous"
106
+ [[ -n "$SINCE" ]] && LOG_CMD="$LOG_CMD --since=$SINCE"
107
+ [[ "$FOLLOW" == "false" ]] && LOG_CMD="$LOG_CMD --tail=$TAIL_LINES"
108
+
109
+ # Execute command with optional filtering
110
+ echo "Executing: $LOG_CMD"
111
+ echo "----------------------------------------"
112
+ echo
113
+
114
+ # Build filter pattern (an extended regex handed to grep -iE below; empty means "no filtering")
115
+ FILTER_PATTERN=""
116
+
117
+ # If both keyword and pod are specified, join them with "|" so a line matching EITHER one is kept (OR, not AND)
118
+ if [[ -n "$KEYWORD" && -n "$POD" ]]; then
119
+ FILTER_PATTERN="$KEYWORD|$POD"
120
+ elif [[ -n "$KEYWORD" ]]; then
121
+ FILTER_PATTERN="$KEYWORD"
122
+ elif [[ -n "$POD" ]]; then
123
+ FILTER_PATTERN="$POD"
124
+ fi
125
+
126
+ # Execute and filter (LOG_CMD is expanded unquoted so the shell splits it into command + arguments)
127
+ if [[ -n "$FILTER_PATTERN" ]]; then
128
+ # Use case-insensitive extended-regex grep for filtering; kubectl's stderr is merged in and filtered too
129
+ if [[ "$FOLLOW" == "true" ]]; then
130
+ # For follow mode, filter in real-time; "|| true" keeps a non-zero pipeline status from aborting the script under set -e
131
+ $LOG_CMD 2>&1 | grep -iE "$FILTER_PATTERN" || true
132
+ else
133
+ # For non-follow mode, filter after getting logs; with pipefail the fallback below also runs when kubectl itself fails
134
+ $LOG_CMD 2>&1 | grep -iE "$FILTER_PATTERN" || {
135
+ echo "(No log lines matched the filter pattern: $FILTER_PATTERN)"
136
+ }
137
+ fi
138
+ else
139
+ # No filtering, show all logs; a kubectl failure is reported and exits with status 1
140
+ $LOG_CMD 2>&1 || {
141
+ echo "Error: Failed to retrieve logs" >&2
142
+ exit 1
143
+ }
144
+ fi
145
+
146
+ echo
147
+
148
+ # Usage hints are printed only for one-shot (non --follow) runs; the closing banner is printed in both modes
149
+ if [[ "$FOLLOW" == "false" ]]; then
150
+ echo "----------------------------------------"
151
+ echo "Hints:"
152
+ echo " - Use --follow to stream logs in real-time"
153
+ echo " - Use --since 30m for recent logs only"
154
+ echo " - Use --previous if scheduler recently restarted"
155
+ echo " - Common keywords: error, FailedScheduling, gang, preempt, reclaim"
156
+ fi
157
+
158
+ echo
159
+ echo "=== Log Retrieval Complete ==="
@@ -15,6 +15,32 @@ Use this skill when the user asks you to:
15
15
  - Create a new skill for a specific operational procedure
16
16
  - Save a diagnosis workflow as a skill
17
17
 
18
+ ## Duplicate / Overlap Check — Do This FIRST
19
+
20
+ **Before creating any skill, check whether an existing skill already covers the same functionality.** Consult the `<available_skills>` index in your context.
21
+
22
+ - **Functional overlap found**: If an existing builtin, team, or personal skill solves the same problem (even with a different name), DO NOT silently create a new one. Instead:
23
+ 1. Tell the user which existing skill overlaps and what it does.
24
+ 2. Ask if they want to: (a) use the existing skill as-is, (b) fork it with `fork_skill` to make a customized personal copy, or (c) still create a brand-new separate skill.
25
+ 3. Only proceed with `create_skill` if the user explicitly chooses option (c).
26
+ - **Why this matters**: Duplicate skills with similar functionality confuse the model — it cannot reliably choose between two skills that do the same thing. One well-maintained skill is always better than two overlapping ones.
27
+ - To fork a builtin or team skill into a personal copy, use `fork_skill`.
28
+
29
+ ## Environments and Approval Workflow
30
+
31
+ Skills go through a review workflow that behaves differently per environment:
32
+
33
+ | Environment | Behavior |
34
+ |-------------|----------|
35
+ | **Dev / Test** | Newly created skills (draft status) are immediately visible and usable. You can test them right away. |
36
+ | **Production** | Only **approved** skill versions are visible and usable. Draft and pending skills do NOT appear. |
37
+
38
+ - After creating a skill, it starts in **draft** status.
39
+ - Skills with scripts must be **submitted for review** and **approved by an admin** before they become active in production.
40
+ - Skills without scripts (pure guidance) also start as draft but can be submitted and approved more quickly.
41
+ - **After creating a skill in production context**: inform the user that it is pending review and will not be available in production until approved. Suggest testing in the dev/test environment first.
42
+ - **Do NOT attempt to test or run a newly created skill in production** — it will not be found.
43
+
18
44
  ## Skill Structure
19
45
 
20
46
  A skill is a directory under `skills/` containing:
@@ -171,9 +197,12 @@ node_script: node="node-1", skill="node-logs", script="get-node-logs.sh", args="
171
197
 
172
198
  ## How to Create a Skill
173
199
 
174
- ### Step 0: Check Completeness Ask Before You Build
200
+ ### Step 0: Check for Duplicates and Completeness
201
+
202
+ Before calling `create_skill`:
175
203
 
176
- Before calling `create_skill`, review what you know and identify gaps. A good skill needs **all** of the following. If any are missing or vague, ask the user to clarify before proceeding:
204
+ 1. **Check for existing skills** — consult the `<available_skills>` index. If an existing skill covers the same functionality, discuss with the user: reuse as-is, fork with `fork_skill`, or create new.
205
+ 2. **Verify completeness** — a good skill needs **all** of the following. If any are missing, ask the user:
177
206
 
178
207
  | Required Info | What to check | Example question to ask |
179
208
  |---|---|---|
@@ -205,7 +234,8 @@ create_skill({
205
234
  description: "Find OOMKilled pods and analyze memory usage",
206
235
  type: "Monitoring",
207
236
  specs: "---\nname: check-pod-oom\n...",
208
- scripts: [{ name: "check-oom.sh", content: "#!/bin/bash\n..." }]
237
+ scripts: [{ name: "check-oom.sh", content: "#!/bin/bash\n..." }],
238
+ labels: ["monitoring", "memory"]
209
239
  })
210
240
  ```
211
241
 
@@ -283,6 +313,8 @@ pod_netns_script: pod="<pod>", namespace="<ns>", skill="pod-ping-gateway", scrip
283
313
  - **`## Parameters` table**: list required and optional parameters with descriptions
284
314
  - **Actionable examples**: show multiple real tool invocations with realistic parameters
285
315
  - **Category selection**: choose from Monitoring, Network, Security, Database, Core, Utility, Automation, Custom
316
+ - **Labels**: add relevant labels (e.g. `['gpu', 'network', 'monitoring']`) for discoverability
286
317
  - **Scripts are optional**: simple skills that just guide the bot's kubectl usage don't need scripts
287
318
  - **One concern per skill**: keep skills focused on a single task
319
+ - **No duplicates**: always check for existing skills first; fork rather than recreate
288
320
  - **User scripts by name**: when referencing uploaded scripts, just pass `{name: "file.sh"}` without content
@@ -9,15 +9,22 @@ description: >-
9
9
 
10
10
  ## When to Use
11
11
 
12
- When the user requests to create, update, edit, enable, or disable a Skill.
12
+ When the user requests to create, update, edit, enable, or disable a Skill in a **Channel** conversation (where skill management tools are not available).
13
13
 
14
14
  ## Instructions
15
15
 
16
16
  Skill creation, updates, and management should be done through the Siclaw Web page.
17
17
 
18
18
  On the Web page, you can:
19
- - Create and edit Skills
19
+ - Create and edit Skills (with live preview)
20
+ - Fork builtin or team skills into personal copies
21
+ - Submit skills for review and approval
20
22
  - Enable or disable Skills
21
23
  - View Skill execution history
22
24
 
23
25
  Inform the user of this directly — no further action is required.
26
+
27
+ ## Environments
28
+
29
+ - **Dev / Test environment**: newly created or updated skills are immediately usable for testing.
30
+ - **Production environment**: only approved skill versions are available. Skills must go through admin review before they appear in production.
@@ -2,27 +2,37 @@
2
2
  name: update-skill
3
3
  description: >-
4
4
  Procedure for modifying, updating, or fixing an existing Siclaw skill.
5
- Skills are on a read-only filesystem — use the update_skill tool,
6
- never edit files directly.
5
+ Use the update_skill tool — never edit skill files directly.
7
6
  ---
8
7
 
9
8
  # Update Skill
10
9
 
11
10
  ## When to Use
12
11
 
13
- When the user's message contains `[Editing Skill: <name> (id:<id>)]` followed by the current skill content, or when the user asks to modify/update/fix an existing skill.
12
+ When the user's message contains `[Skill: <name>]` (UI skill editing context), or when the user asks to modify/update/fix an existing skill.
13
+
14
+ ## Environments and Approval Workflow
15
+
16
+ | Environment | Behavior |
17
+ |-------------|----------|
18
+ | **Dev / Test** | Updated content (working copy) is immediately visible and testable. |
19
+ | **Production** | Only the **approved** version is active. Updates enter a staged review state; the old version remains in use until the new version is approved by an admin. |
20
+
21
+ - When scripts are changed, the update enters a **staged review** state.
22
+ - The **old version** of the skill remains usable in production during review.
23
+ - In dev/test, the working copy is available immediately for testing.
14
24
 
15
25
  ## How to Update
16
26
 
17
27
  Call the `update_skill` tool (NOT `create_skill`) with the skill ID and the complete updated definition.
18
28
 
19
- **Do NOT use `read`, `edit`, `write`, or `bash` on files under `/mnt/skills/` — the filesystem is read-only.**
29
+ **Skill directories are read-only. All skill modifications must go through skill management tools (create_skill, update_skill, fork_skill).**
20
30
 
21
31
  ### Tool Call Format
22
32
 
23
33
  ```
24
34
  update_skill({
25
- id: "<skill-id>", // Required — from [Editing Skill: ... (id:<id>)]
35
+ id: "<skill-id>", // From [Skill: ...] context, or the skill's kebab-case name
26
36
  name: "skill-name", // Keep original name unless user wants rename
27
37
  description: "What the skill does",
28
38
  type: "Monitoring",
@@ -31,7 +41,8 @@ update_skill({
31
41
  { name: "run.sh", content: "#!/bin/bash\n..." }, // Changed: provide full content
32
42
  { name: "check.sh" } // Unchanged: name only
33
43
  // Omitted scripts are deleted
34
- ]
44
+ ],
45
+ labels: ["monitoring", "memory"] // Optional labels/tags
35
46
  })
36
47
  ```
37
48