switchroom 0.13.2 → 0.13.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. package/dist/agent-scheduler/index.js +2 -2
  2. package/dist/auth-broker/index.js +2 -2
  3. package/dist/cli/switchroom.js +21 -18
  4. package/dist/host-control/main.js +2 -2
  5. package/dist/vault/approvals/kernel-server.js +2 -2
  6. package/dist/vault/broker/server.js +2 -2
  7. package/package.json +1 -1
  8. package/telegram-plugin/dist/gateway/gateway.js +42 -10
  9. package/telegram-plugin/gateway/boot-probes.ts +13 -6
  10. package/telegram-plugin/gateway/gateway.ts +44 -6
  11. package/telegram-plugin/hooks/silent-end-interrupt-stop.mjs +5 -1
  12. package/telegram-plugin/silent-end.ts +56 -0
  13. package/telegram-plugin/tests/boot-probes.test.ts +26 -2
  14. package/telegram-plugin/tests/silent-end.test.ts +69 -0
  15. package/skills/buildkite-agent-infrastructure/SKILL.md +0 -321
  16. package/skills/buildkite-agent-infrastructure/agents/openai.yaml +0 -6
  17. package/skills/buildkite-agent-infrastructure/assets/buildkite-icon-large.png +0 -0
  18. package/skills/buildkite-agent-infrastructure/assets/buildkite-icon-small.png +0 -0
  19. package/skills/buildkite-agent-infrastructure/references/audit-logging.md +0 -87
  20. package/skills/buildkite-agent-infrastructure/references/graphql-mutations.md +0 -690
  21. package/skills/buildkite-agent-infrastructure/references/instance-shapes.md +0 -38
  22. package/skills/buildkite-agent-infrastructure/references/pipeline-templates.md +0 -73
  23. package/skills/buildkite-agent-infrastructure/references/self-hosted-agents.md +0 -137
  24. package/skills/buildkite-agent-infrastructure/references/sso-saml.md +0 -92
  25. package/skills/buildkite-agent-runtime/SKILL.md +0 -509
  26. package/skills/buildkite-agent-runtime/agents/openai.yaml +0 -6
  27. package/skills/buildkite-agent-runtime/assets/buildkite-icon-large.png +0 -0
  28. package/skills/buildkite-agent-runtime/assets/buildkite-icon-small.png +0 -0
  29. package/skills/buildkite-agent-runtime/references/flag-reference.md +0 -417
  30. package/skills/buildkite-agent-runtime/references/patterns-and-recipes.md +0 -555
  31. package/skills/buildkite-api/SKILL.md +0 -308
  32. package/skills/buildkite-api/agents/openai.yaml +0 -6
  33. package/skills/buildkite-api/assets/buildkite-icon-large.png +0 -0
  34. package/skills/buildkite-api/assets/buildkite-icon-small.png +0 -0
  35. package/skills/buildkite-api/references/graphql-reference.md +0 -195
  36. package/skills/buildkite-api/references/patterns.md +0 -44
  37. package/skills/buildkite-api/references/webhooks.md +0 -161
  38. package/skills/buildkite-cli/SKILL.md +0 -397
  39. package/skills/buildkite-cli/agents/openai.yaml +0 -6
  40. package/skills/buildkite-cli/assets/buildkite-icon-large.png +0 -0
  41. package/skills/buildkite-cli/assets/buildkite-icon-small.png +0 -0
  42. package/skills/buildkite-cli/references/command-reference.md +0 -181
  43. package/skills/buildkite-migration/SKILL.md +0 -195
  44. package/skills/buildkite-pipelines/SKILL.md +0 -481
  45. package/skills/buildkite-pipelines/agents/openai.yaml +0 -6
  46. package/skills/buildkite-pipelines/assets/buildkite-icon-large.png +0 -0
  47. package/skills/buildkite-pipelines/assets/buildkite-icon-small.png +0 -0
  48. package/skills/buildkite-pipelines/examples/basic-pipeline.yml +0 -24
  49. package/skills/buildkite-pipelines/examples/optimized-pipeline.yml +0 -100
  50. package/skills/buildkite-pipelines/references/advanced-patterns.md +0 -286
  51. package/skills/buildkite-pipelines/references/retry-and-error-codes.md +0 -131
  52. package/skills/buildkite-pipelines/references/step-types-reference.md +0 -225
  53. package/skills/buildkite-secure-delivery/SKILL.md +0 -182
  54. package/skills/buildkite-secure-delivery/agents/openai.yaml +0 -6
  55. package/skills/buildkite-secure-delivery/assets/buildkite-icon-large.png +0 -0
  56. package/skills/buildkite-secure-delivery/assets/buildkite-icon-small.png +0 -0
  57. package/skills/buildkite-secure-delivery/references/oidc-cloud-providers.md +0 -83
  58. package/skills/buildkite-secure-delivery/references/package-publishing.md +0 -100
  59. package/skills/buildkite-test-engine/SKILL.md +0 -256
  60. package/skills/buildkite-test-engine/agents/openai.yaml +0 -6
  61. package/skills/buildkite-test-engine/assets/buildkite-icon-large.png +0 -0
  62. package/skills/buildkite-test-engine/assets/buildkite-icon-small.png +0 -0
  63. package/skills/buildkite-test-engine/examples/bktec-splitting.yml +0 -16
  64. package/skills/buildkite-test-engine/examples/collector-pipeline.yml +0 -11
  65. package/skills/buildkite-test-engine/references/collectors.md +0 -198
  66. package/skills/buildkite-test-engine/references/splitting-examples.md +0 -93
@@ -7,6 +7,8 @@ import {
7
7
  writeSilentEndState,
8
8
  clearSilentEndState,
9
9
  readSilentEndState,
10
+ recordSilentTurnEnd,
11
+ SILENT_END_MAX_RETRIES,
10
12
  } from '../silent-end.js'
11
13
 
12
14
  let stateDir: string
@@ -118,6 +120,73 @@ describe('silent-end.ts — gateway state writer', () => {
118
120
  })
119
121
  })
120
122
 
123
+ describe('recordSilentTurnEnd — #1161 exhaustion detection', () => {
124
+ it('first silent-end of a turn writes state and reports exhausted:false', () => {
125
+ const r = recordSilentTurnEnd({ chatId: 'c', threadId: null, turnKey: 'c:_' })
126
+ expect(r.exhausted).toBe(false)
127
+ expect(readSilentEndState()).toMatchObject({ turnKey: 'c:_', retryCount: 0 })
128
+ })
129
+
130
+ it('reports exhausted:false while prior retryCount is still below the cap', () => {
131
+ // The Stop hook has not yet been able to push retryCount to the cap.
132
+ const path = join(stateDir, 'silent-end-pending.json')
133
+ writeFileSync(path, JSON.stringify({
134
+ chatId: 'c', threadId: null, turnKey: 'c:_',
135
+ retryCount: SILENT_END_MAX_RETRIES - 1, timestamp: 0,
136
+ }))
137
+ const r = recordSilentTurnEnd({ chatId: 'c', threadId: null, turnKey: 'c:_' })
138
+ expect(r.exhausted).toBe(false)
139
+ // State is (re)written, inheriting the prior counter for the same turn.
140
+ expect(readSilentEndState()!.retryCount).toBe(SILENT_END_MAX_RETRIES - 1)
141
+ })
142
+
143
+ it('reports exhausted:true and clears state once the re-prompt cap is reached', () => {
144
+ // The Stop hook already blocked once and pushed retryCount to the cap;
145
+ // the agent is STILL silent on this re-prompted turn.
146
+ const path = join(stateDir, 'silent-end-pending.json')
147
+ writeFileSync(path, JSON.stringify({
148
+ chatId: 'c', threadId: null, turnKey: 'c:_',
149
+ retryCount: SILENT_END_MAX_RETRIES, timestamp: 0,
150
+ }))
151
+ const r = recordSilentTurnEnd({ chatId: 'c', threadId: null, turnKey: 'c:_' })
152
+ expect(r.exhausted).toBe(true)
153
+ // State cleared so the Stop hook on this final turn allows the stop.
154
+ expect(readSilentEndState()).toBeNull()
155
+ })
156
+
157
+ it('treats a capped prior state for a DIFFERENT turn as a fresh silent-end', () => {
158
+ const path = join(stateDir, 'silent-end-pending.json')
159
+ writeFileSync(path, JSON.stringify({
160
+ chatId: 'old', threadId: null, turnKey: 'old:_',
161
+ retryCount: SILENT_END_MAX_RETRIES, timestamp: 0,
162
+ }))
163
+ const r = recordSilentTurnEnd({ chatId: 'new', threadId: 9, turnKey: 'new:9' })
164
+ expect(r.exhausted).toBe(false)
165
+ expect(readSilentEndState()).toMatchObject({ turnKey: 'new:9', retryCount: 0 })
166
+ })
167
+
168
+ it('full lifecycle: silent → re-prompt → still silent → exhausted', () => {
169
+ // 1. Turn ends silent — first record.
170
+ expect(recordSilentTurnEnd({ chatId: 'c', threadId: null, turnKey: 'c:_' }).exhausted).toBe(false)
171
+ // 2. Stop hook blocks and increments retryCount (simulated).
172
+ const path = join(stateDir, 'silent-end-pending.json')
173
+ const s = readSilentEndState()!
174
+ writeFileSync(path, JSON.stringify({ ...s, retryCount: s.retryCount + 1 }))
175
+ // 3. Re-prompted turn ends silent again — recovery exhausted.
176
+ expect(recordSilentTurnEnd({ chatId: 'c', threadId: null, turnKey: 'c:_' }).exhausted).toBe(true)
177
+ expect(readSilentEndState()).toBeNull()
178
+ })
179
+
180
+ it('SILENT_END_MAX_RETRIES matches MAX_RETRIES in the Stop hook', () => {
181
+ // The hook is a standalone .mjs and hardcodes its own copy — this
182
+ // guards the two from drifting apart.
183
+ const hookSrc = readFileSync(join(__dirname, '..', 'hooks', 'silent-end-interrupt-stop.mjs'), 'utf8')
184
+ const m = hookSrc.match(/const MAX_RETRIES = (\d+)/)
185
+ expect(m).not.toBeNull()
186
+ expect(Number(m![1])).toBe(SILENT_END_MAX_RETRIES)
187
+ })
188
+ })
189
+
121
190
  describe('silent-end-interrupt-stop hook — integration', () => {
122
191
  const hookPath = join(__dirname, '..', 'hooks', 'silent-end-interrupt-stop.mjs')
123
192
 
@@ -1,321 +0,0 @@
1
- ---
2
- name: buildkite-agent-infrastructure
3
- description: >
4
- Buildkite cluster / organization / platform administration. Whenever
5
- the user's message starts with the phrase "In Buildkite cluster
6
- admin," — regardless of what follows — use this skill; that prefix
7
- is a hard trigger that wins over `buildkite-api`, `buildkite-cli`,
8
- and `buildkite-agent-runtime`. Provision and govern Buildkite CI
9
- infrastructure: creating clusters, creating queues, scaling queues,
10
- setting up hosted agents, right-sizing instance shapes, optimizing
11
- CI costs, managing agent tokens, managing cluster secrets,
12
- configuring SSO, setting up SAML, setting up audit logging, creating
13
- pipeline templates, and standardizing pipelines across teams. Use
14
- when the user says, verbatim: "set up SAML", "manage agent tokens",
15
- "configure SSO", "set up audit logging", "Let's configure SSO.",
16
- "I need to configure SSO.", "Could you scale queues for me?",
17
- "Scale queues, please.", "scale queues", "Create a queue, please.",
18
- "Create a cluster, please.", "set up hosted agents", "manage
19
- cluster secrets", "right-size instance shapes", "optimize CI
20
- costs", "standardize pipelines across teams", "create a pipeline
21
- template", "configure agents", and typo'd variants like "manage
22
- clusetr secrets", "configuree agents", "set up hostted agents".
23
- Anything about buildkite-agent.cfg, agent tags, agent tokens, cluster
24
- queues, hosted agent instance shapes, pipeline templates, audit
25
- events, SSO/SAML providers, queue wait time, agent lifecycle hooks,
26
- or Buildkite platform governance fires this skill — even when the
27
- request mentions GraphQL or API calls (the rival `buildkite-api` is
28
- for generic webhook/pagination/scripting, NOT for SSO/queue/cluster
29
- admin which always belongs here).
30
- Do NOT use when the user is calling `buildkite-agent <subcommand>` from
31
- inside a running step (token use, artifact upload, annotate) — that's
32
- `buildkite-agent-runtime`; or when the user just wants cluster CLI
33
- shortcuts like `bk cluster ...` — that's `buildkite-cli`.
34
- ---
35
-
36
- # Buildkite Platform Engineering
37
-
38
- Provision and govern Buildkite CI infrastructure at scale: clusters, queues, hosted agent sizing, secrets, agent tokens, self-hosted configuration, lifecycle hooks, pipeline templates, audit logging, SSO/SAML, and cost optimization.
39
-
40
- ## Quick Start
41
-
42
- Create a cluster with a hosted queue to get builds running immediately. **Start with hosted agents unless there is a specific reason to self-host** (GPU workloads, on-prem, custom hardware). Self-hosted queues require provisioning your own agents; builds hang "scheduled" until agents connect.
43
-
44
- All GraphQL mutations go to `https://graphql.buildkite.com/v1` with a Bearer token:
45
-
46
- ```bash
47
- curl -sS -X POST "https://graphql.buildkite.com/v1" \
48
- -H "Authorization: Bearer $BUILDKITE_API_TOKEN" \
49
- -H "Content-Type: application/json" \
50
- -d '{"query": "<GRAPHQL_QUERY_OR_MUTATION>", "variables": { ... }}'
51
- ```
52
-
53
- **Step 1:** Get the organization ID: `query { organization(slug: "my-org") { id } }`
54
-
55
- **Step 2:** Create a cluster:
56
-
57
- ```graphql
58
- mutation {
59
- clusterCreate(input: {
60
- organizationId: "org-id"
61
- name: "Production"
62
- description: "Production CI cluster"
63
- emoji: ":rocket:"
64
- color: "#14CC80"
65
- }) { cluster { id uuid name } }
66
- }
67
- ```
68
-
69
- **Step 3:** Create a hosted queue with a specific instance shape:
70
-
71
- ```graphql
72
- mutation {
73
- clusterQueueCreate(input: {
74
- organizationId: "org-id"
75
- clusterId: "cluster-id"
76
- key: "linux-large"
77
- description: "Linux 8 vCPU / 32 GB for heavy compilation"
78
- hostedAgents: { instanceShape: LINUX_AMD64_8X32 }
79
- }) { clusterQueue { id key } }
80
- }
81
- ```
82
-
83
- **Step 4:** Create a pipeline in the cluster via GraphQL `pipelineCreate` or the REST API, then trigger a build.
84
-
85
- > For pipeline creation via REST and GraphQL, see the **buildkite-api** skill.
86
-
87
- > For pipeline YAML syntax including `agents:` routing and `secrets:` access, see the **buildkite-pipelines** skill.
88
- > For `bk cluster` CLI commands, see the **buildkite-cli** skill.
89
-
90
- ## Clusters
91
-
92
- A cluster is the top-level container for queues, agent tokens, and secrets. Every organization starts with one default cluster; create additional clusters to isolate workloads (e.g., production vs. staging, team-specific).
93
-
94
- ### Create a cluster
95
-
96
- ```bash
97
- curl -s -X POST "https://api.buildkite.com/v2/organizations/my-org/clusters" \
98
- -H "Authorization: Bearer $BUILDKITE_API_TOKEN" \
99
- -H "Content-Type: application/json" \
100
- -d '{
101
- "name": "Backend",
102
- "description": "Backend team CI cluster",
103
- "emoji": ":gear:",
104
- "color": "#0B79CE"
105
- }'
106
- ```
107
-
108
- Fields: `name` (required), `description`, `emoji`, `color`, `default_queue_id` (optional).
109
-
110
- > For full REST and GraphQL API reference, see the **buildkite-api** skill.
111
-
112
- ## Queues and Hosted Agents
113
-
114
- Queues route builds to agents. **Hosted queues** (Buildkite-managed compute) are the recommended starting point — builds run immediately. **Self-hosted queues** require connecting your own agents; builds remain "scheduled" until agents connect.
115
-
116
- Create queues with the `clusterQueueCreate` GraphQL mutation (shown in Quick Start above). To create a **self-hosted queue**, omit `hostedAgents`. Self-hosted agents connect by targeting the queue key in their configuration.
117
-
118
- ### Instance shapes and sizing guide
119
-
120
- Full list: `references/instance-shapes.md`. Quick sizing:
121
-
122
- | Workload | Shape |
123
- |----------|-------|
124
- | Linting, unit tests | `LINUX_AMD64_2X4` |
125
- | Monorepos, multi-service | `LINUX_AMD64_4X16` |
126
- | Heavy compilation (C++, Rust) | `LINUX_AMD64_8X32` |
127
- | Docker builds, ML prep | `LINUX_AMD64_16X64` |
128
- | iOS / macOS | `MACOS_M4_6X28` or `MACOS_M4_12X56` |
129
-
130
- Start with the smallest shape that keeps builds under target time. Scale up if queue wait exceeds 2 minutes.
131
-
132
- ### Queue design patterns
133
-
134
- - **Keep 1-2 static instances in the default queue** — avoids cold-start latency on pipeline uploads
135
- - **Retire oldest agents first during scale-down** — preserves warm caches
136
- - **Trial pattern** — test new shapes/architectures on a separate queue before migrating
137
- - **Tag builds with metadata** for cost attribution by queue or team
138
-
139
- Temporarily pause dispatch to a queue for maintenance or cost control using `clusterQueuePauseDispatch` / `clusterQueueResumeDispatch` GraphQL mutations. See `references/graphql-mutations.md` for examples.
140
-
141
- ## Cluster Secrets
142
-
143
- Cluster secrets are encrypted, cluster-scoped values accessible from pipeline steps. They replace hardcoded credentials and environment-hook-based secret injection. Create, update, and rotate secrets via the REST API at `/v2/organizations/{org}/clusters/{cluster_id}/secrets`.
144
-
145
- ### Secret key constraints
146
-
147
- | Rule | Detail |
148
- |------|--------|
149
- | Must start with | A letter (A-Z, a-z) |
150
- | Allowed characters | Letters, numbers, underscores only |
151
- | Prohibited prefixes | `buildkite`, `bk` (reserved) |
152
- | Max key length | 255 characters |
153
- | Max value size | 8 KB |
154
-
155
- ### Access policies
156
-
157
- Restrict which pipelines and branches can access a secret by adding a `policy` object with `claims`. Available claim types: `pipeline_slug`, `build_branch`, `build_creator`, `build_source`, `build_creator_team`, `cluster_queue_key`. Claims support `*` wildcards. See [Buildkite Secrets docs](https://buildkite.com/docs/pipelines/security/secrets/buildkite-secrets.md) for policy examples.
158
-
159
- Value rotation uses a separate endpoint (`PUT .../secrets/{id}/value`) from description/policy updates (`PUT .../secrets/{id}`).
160
-
161
- > For `secrets:` YAML syntax, see the **buildkite-pipelines** skill. For `buildkite-agent secret get`, see the **buildkite-agent-runtime** skill.
162
-
163
- ## Agent Tokens
164
-
165
- Agent tokens authenticate agents connecting to a cluster. Each token is scoped to a single cluster.
166
-
167
- ### Create a token
168
-
169
- ```bash
170
- curl -s -X POST "https://api.buildkite.com/v2/organizations/my-org/clusters/$CLUSTER_ID/tokens" \
171
- -H "Authorization: Bearer $BUILDKITE_API_TOKEN" \
172
- -H "Content-Type: application/json" \
173
- -d '{
174
- "description": "Backend CI agents - production",
175
- "allowed_ip_addresses": "10.0.0.0/8"
176
- }'
177
- ```
178
-
179
- | Field | Required | Description |
180
- |-------|----------|-------------|
181
- | `description` | Yes | Human-readable token description |
182
- | `allowed_ip_addresses` | No | Comma-separated CIDR ranges restricting agent connections |
183
- | `expires_at` | No | ISO 8601 expiry timestamp |
184
-
185
- The token value is only returned at creation time. Store it in a secrets manager immediately.
186
-
187
- ## Self-Hosted Agents and Lifecycle Hooks
188
-
189
- Self-hosted agents run on your own infrastructure, configured via `buildkite-agent.cfg`. Prefer clustered agents for new deployments — they provide secret scoping, queue isolation, and better organizational control. For full configuration reference, `buildkite-agent.cfg` examples, and clustered vs. unclustered agent details, see `references/self-hosted-agents.md`.
190
-
191
- Agent lifecycle hooks execute at specific points during job execution: `environment` → `pre-checkout` → `checkout` → `post-checkout` → `pre-command` → `command` → `post-command` → `pre-artifact` → `post-artifact` → `pre-exit`. Agent-level hooks run first, then repository hooks, then plugin hooks. For hook details and examples, see `references/self-hosted-agents.md`.
192
-
193
- ### Hosted agent caching behavior
194
-
195
- **Cache volumes on hosted agents are non-deterministic** — jobs may or may not get a warm cache. Treat cache volumes as performance accelerators, not guarantees. Cache volumes are **pipeline-scoped** (not shared across pipelines). For deterministic caching, use Docker images with pre-built dependencies instead. Git mirrors can be enabled via cache volumes to accelerate checkout; mount `.git/lfs/objects` in cache volumes and pre-install `git-lfs` in the agent image.
196
-
197
- ### Hosted agent checkout performance
198
-
199
- Buildkite's default checkout **prioritizes completeness over speed** — it may be noticeably slower than GitHub Actions for the same repo. Optimize with the Sparse Checkout plugin (monorepos), Git mirrors (frequent builds), or the Git Shallow Clone plugin (repos where full history is unnecessary).
200
-
201
- ### Hosted agent custom hooks
202
-
203
- Hosted agents support custom hooks via a custom agent image. Add hooks in a Dockerfile:
204
-
205
- ```dockerfile
206
- FROM buildkite/agent:latest
207
-
208
- ENV BUILDKITE_ADDITIONAL_HOOKS_PATHS=/custom/hooks
209
- COPY ./hooks/*.sh /custom/hooks/
210
- RUN chmod +x /custom/hooks/*.sh
211
- ```
212
-
213
- ### Hosted agent pre-installed tools
214
-
215
- Linux hosted agents include: `bash`, `curl`, `wget`, `git`, `docker`, `python3`, `jq`.
216
-
217
- **`nvm` is NOT pre-installed.** Do not source `~/.nvm/nvm.sh` — it will fail silently or exit 127. Use `fnm` instead:
218
-
219
- ```bash
220
- curl -fsSL https://fnm.vercel.app/install | bash -s -- --install-dir "$HOME/.fnm" --skip-shell
221
- export PATH="$HOME/.fnm:$PATH" && eval "$(fnm env --use-on-cd)"
222
- fnm install 20 && fnm use 20
223
- ```
224
-
225
- `fnm` downloads from `nodejs.org` directly and works for all versions including EOL.
226
-
227
- **GitHub release asset downloads may be blocked.** `release-assets.githubusercontent.com` is unreachable from hosted agents. Pre-install tools distributed as GitHub release binaries (CodeQL, Scorecard, Trivy) in a custom agent image using `agentImageRef`.
228
-
229
- **Always verify queue creation** after a GraphQL mutation by listing queues via `GET /v2/organizations/{org}/clusters/{cluster_id}/queues`. Silent GraphQL errors can leave the cluster without a hosted queue — if the list is empty, retry via the REST API.
230
-
231
- ## Plugin Security Controls
232
-
233
- Restrict which plugins agents can run:
234
-
235
- - **Agent-level allowlisting** — use `allowed-plugins` in `buildkite-agent.cfg` to restrict to approved plugins
236
- - **`no-plugins=true`** — disable all plugins on sensitive agents
237
- - **Cluster-based policies** — apply different plugin restrictions per cluster based on security requirements
238
-
239
- Audit plugin repositories proactively — Buildkite does not automatically alert to plugin vulnerabilities.
240
-
241
- ## Pipeline Templates
242
-
243
- Pipeline templates (Enterprise-only) standardize pipeline YAML across the organization. See `references/pipeline-templates.md`.
244
-
245
- ## Audit Logging
246
-
247
- Audit logging (Enterprise-only) tracks organization-level events for compliance. Query via GraphQL or stream to a SIEM via Amazon EventBridge. See `references/audit-logging.md`.
248
-
249
- ## SSO/SAML
250
-
251
- Buildkite supports SAML 2.0 (Okta, Azure AD, Google Workspace, OneLogin). See `references/sso-saml.md` for setup flow.
252
-
253
- ## Cost Optimization
254
-
255
- ### Cost reduction patterns
256
-
257
- | Pattern | Savings | How |
258
- |---------|---------|-----|
259
- | Right-size instance shapes | 20-40% | Match shape to actual resource needs |
260
- | Use `disconnect-after-job` for self-hosted | 10-20% | Ephemeral agents don't idle between jobs |
261
- | Pause queues during off-hours | 10-30% | `clusterQueuePauseDispatch` on nights/weekends |
262
- | Skip unnecessary work with `if_changed` | 10-30% | Only run tests for changed code paths |
263
- | Use `priority` to run critical jobs first | Indirect | Reduces developer wait time for important builds |
264
-
265
- > For `if_changed` and pipeline optimization patterns, see the **buildkite-pipelines** skill.
266
-
267
- ## Observability and Queue Monitoring
268
-
269
- | Tool | Purpose | How it works |
270
- |------|---------|-------------|
271
- | `buildkite-agent-metrics` | Fleet-level queue and job metrics | Polls the Buildkite API; emits to CloudWatch, Datadog, StatsD |
272
- | Agent health check service | Per-agent process health | Exposes Prometheus endpoint; scrape from each agent host |
273
-
274
- **Start with queue profiling** — wait time and checkout time are the biggest, cheapest wins. Target: queue wait time under 2 minutes.
275
-
276
- ### Scaling decision flow
277
-
278
- ```
279
- Queue wait > 2 min?
280
- ├── Yes → Check agent count
281
- │ ├── Agents maxed out → Scale up (add agents or increase shape)
282
- │ ├── Agents idle → Check for job distribution issues (tags, queue routing)
283
- │ └── No agents → Check token, connectivity, agent health
284
- └── No → Queue is healthy
285
- ```
286
-
287
- ## Common Mistakes
288
-
289
- | Mistake | Fix |
290
- |---------|-----|
291
- | Secret key starting with `buildkite` or `bk` | Use a different prefix — these are reserved |
292
- | Secret key with dashes or dots | Only letters, numbers, underscores allowed: `MY_SECRET_KEY` not `my-secret-key` |
293
- | Not storing agent token at creation time | Token is only shown once — store in secrets manager immediately |
294
- | Org-level tokens for clustered agents | Use cluster-scoped tokens (`clusterAgentTokenCreate`) |
295
- | Over-provisioning instance shapes | Start small, monitor, scale up only when builds are slow |
296
- | No `disconnect-after-job` on autoscaled agents | Set `disconnect-after-job=true` for ephemeral pools |
297
- | One large queue for all workloads | Create specialized queues per workload type |
298
- | Cluster creation returns HTTP 500 | List existing clusters first; rename the Default cluster via PATCH as a workaround |
299
- | "Upgrade to Platform Pro" on hosted queue creation | Fall back: create self-hosted queue, install `buildkite-agent` locally with `--spawn 3` |
300
- | Expecting cache volumes to always be warm | Design builds to work without cache — volumes are non-deterministic |
301
- | One IAM role for all queues | Assign different IAM roles per queue; scope secrets by `cluster_queue_key` |
302
- | Scaling down newest agents first | Retire oldest agents first to preserve warm caches |
303
- | Jobs hang "scheduled" with agents connected | Check `default_queue_id` matches the agent's queue tag; update via PATCH |
304
-
305
- ## Additional Resources
306
-
307
- - **`references/graphql-mutations.md`** — GraphQL mutations for clusters, queues, tokens, templates, SSO, audit
308
- - **`references/instance-shapes.md`** — All hosted agent instance shapes
309
- - **`references/self-hosted-agents.md`** — Agent config, clustered vs. unclustered, lifecycle hooks
310
- - **`references/pipeline-templates.md`** — Template mutations and strategy (Enterprise)
311
- - **`references/audit-logging.md`** — Audit queries, SIEM/EventBridge integration (Enterprise)
312
- - **`references/sso-saml.md`** — SSO/SAML provider setup
313
-
314
- ## Further Reading
315
-
316
- - [Buildkite Docs for LLMs](https://buildkite.com/docs/llms.txt)
317
- - [Manage clusters](https://buildkite.com/docs/clusters/manage-clusters.md)
318
- - [Manage cluster queues](https://buildkite.com/docs/clusters/manage-queues.md)
319
- - [Manage cluster secrets](https://buildkite.com/docs/pipelines/security/secrets/buildkite-secrets.md)
320
- - [Agent configuration](https://buildkite.com/docs/agent/v3/configuration.md)
321
- - [Agent hooks](https://buildkite.com/docs/agent/v3/hooks.md)
@@ -1,6 +0,0 @@
1
- interface:
2
- display_name: "Buildkite Agent Infrastructure"
3
- short_description: "Clusters, queues, hosted agents, secrets, SSO, and cost optimization"
4
- icon_small: "./assets/buildkite-icon-small.png"
5
- icon_large: "./assets/buildkite-icon-large.png"
6
- brand_color: "#00D974"
@@ -1,87 +0,0 @@
1
- # Audit Logging and SIEM Integration
2
-
3
- Audit logging (Enterprise-only) tracks organization-level events for compliance and security monitoring.
4
-
5
- ## Query audit events
6
-
7
- ```graphql
8
- query {
9
- organization(slug: "my-org") {
10
- auditEvents(
11
- first: 50
12
- occurredAtFrom: "2026-03-01T00:00:00Z"
13
- occurredAtTo: "2026-03-26T23:59:59Z"
14
- ) {
15
- edges {
16
- node {
17
- type
18
- occurredAt
19
- actor { name type uuid }
20
- subject { name type uuid }
21
- data
22
- }
23
- }
24
- }
25
- }
26
- }
27
- ```
28
-
29
- | Filter | Description |
30
- |--------|-------------|
31
- | `occurredAtFrom` / `occurredAtTo` | ISO 8601 time range |
32
- | `type` | Specific audit event type (e.g., `ORGANIZATION_UPDATED`) |
33
- | `subjectType` | Filter by subject type (e.g., `PIPELINE`, `AGENT_TOKEN`) |
34
- | `subjectUUID` | Filter by specific subject |
35
- | `order` | `RECENTLY_OCCURRED` (default) or `OLDEST_OCCURRED` |
36
-
37
- ## High-severity events to monitor
38
-
39
- | Event type | Why it matters |
40
- |------------|---------------|
41
- | `agent_token.created` / `.deleted` | Agent authentication changes |
42
- | `member.invited` / `.removed` | Team membership changes |
43
- | `sso_provider.created` / `.updated` | SSO configuration changes |
44
- | `pipeline_schedule.created` | New automated triggers |
45
- | `cluster_secret.created` / `.deleted` | Secret management changes |
46
- | `organization.updated` | Org-level setting changes |
47
-
48
- ## SIEM integration via Amazon EventBridge
49
-
50
- Stream audit events to a SIEM in real time using EventBridge:
51
-
52
- - **Source:** `aws.partner/buildkite.com/buildkite/<partner-event-source-id>`
53
- - **Detail type:** `"Audit Event Logged"`
54
-
55
- Event payload structure:
56
-
57
- ```json
58
- {
59
- "organization": {
60
- "uuid": "org-uuid",
61
- "graphql_id": "T3JnYW5pemF0aW9u...",
62
- "slug": "my-org"
63
- },
64
- "event": {
65
- "uuid": "event-uuid",
66
- "occurred_at": "2026-03-26T14:30:00Z",
67
- "type": "agent_token.created",
68
- "data": { },
69
- "subject_type": "AgentToken",
70
- "subject_uuid": "token-uuid",
71
- "subject_name": "Production agents",
72
- "context": {
73
- "request_id": "req-uuid",
74
- "request_ip": "203.0.113.42",
75
- "session_user_uuid": "user-uuid",
76
- "request_user_agent": "Mozilla/5.0..."
77
- }
78
- },
79
- "actor": {
80
- "name": "Jane Engineer",
81
- "type": "USER",
82
- "uuid": "user-uuid"
83
- }
84
- }
85
- ```
86
-
87
- Route high-severity events to PagerDuty, Splunk, or Datadog via EventBridge rules matching on `detail.event.type`.