@bradygaster/squad-sdk 0.8.25 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (202) hide show
  1. package/dist/adapter/client.d.ts +17 -0
  2. package/dist/adapter/client.d.ts.map +1 -1
  3. package/dist/adapter/client.js +101 -1
  4. package/dist/adapter/client.js.map +1 -1
  5. package/dist/agents/history-shadow.d.ts.map +1 -1
  6. package/dist/agents/history-shadow.js +99 -32
  7. package/dist/agents/history-shadow.js.map +1 -1
  8. package/dist/agents/index.d.ts +1 -0
  9. package/dist/agents/index.d.ts.map +1 -1
  10. package/dist/agents/index.js +2 -0
  11. package/dist/agents/index.js.map +1 -1
  12. package/dist/agents/model-selector.d.ts +2 -0
  13. package/dist/agents/model-selector.d.ts.map +1 -1
  14. package/dist/agents/model-selector.js +41 -35
  15. package/dist/agents/model-selector.js.map +1 -1
  16. package/dist/agents/personal.d.ts +35 -0
  17. package/dist/agents/personal.d.ts.map +1 -0
  18. package/dist/agents/personal.js +67 -0
  19. package/dist/agents/personal.js.map +1 -0
  20. package/dist/builders/index.d.ts +3 -2
  21. package/dist/builders/index.d.ts.map +1 -1
  22. package/dist/builders/index.js +28 -0
  23. package/dist/builders/index.js.map +1 -1
  24. package/dist/builders/types.d.ts +13 -0
  25. package/dist/builders/types.d.ts.map +1 -1
  26. package/dist/config/init.d.ts +8 -0
  27. package/dist/config/init.d.ts.map +1 -1
  28. package/dist/config/init.js +131 -20
  29. package/dist/config/init.js.map +1 -1
  30. package/dist/config/models.d.ts +112 -0
  31. package/dist/config/models.d.ts.map +1 -1
  32. package/dist/config/models.js +329 -18
  33. package/dist/config/models.js.map +1 -1
  34. package/dist/coordinator/index.js +2 -2
  35. package/dist/coordinator/index.js.map +1 -1
  36. package/dist/index.d.ts +8 -3
  37. package/dist/index.d.ts.map +1 -1
  38. package/dist/index.js +7 -2
  39. package/dist/index.js.map +1 -1
  40. package/dist/platform/azure-devops.d.ts +42 -0
  41. package/dist/platform/azure-devops.d.ts.map +1 -1
  42. package/dist/platform/azure-devops.js +75 -0
  43. package/dist/platform/azure-devops.js.map +1 -1
  44. package/dist/platform/comms-file-log.d.ts.map +1 -1
  45. package/dist/platform/comms-file-log.js +2 -1
  46. package/dist/platform/comms-file-log.js.map +1 -1
  47. package/dist/platform/index.d.ts +2 -1
  48. package/dist/platform/index.d.ts.map +1 -1
  49. package/dist/platform/index.js +1 -0
  50. package/dist/platform/index.js.map +1 -1
  51. package/dist/ralph/capabilities.d.ts +67 -0
  52. package/dist/ralph/capabilities.d.ts.map +1 -0
  53. package/dist/ralph/capabilities.js +111 -0
  54. package/dist/ralph/capabilities.js.map +1 -0
  55. package/dist/ralph/index.d.ts +2 -0
  56. package/dist/ralph/index.d.ts.map +1 -1
  57. package/dist/ralph/index.js +6 -5
  58. package/dist/ralph/index.js.map +1 -1
  59. package/dist/ralph/rate-limiting.d.ts +99 -0
  60. package/dist/ralph/rate-limiting.d.ts.map +1 -0
  61. package/dist/ralph/rate-limiting.js +170 -0
  62. package/dist/ralph/rate-limiting.js.map +1 -0
  63. package/dist/resolution.d.ts +24 -2
  64. package/dist/resolution.d.ts.map +1 -1
  65. package/dist/resolution.js +106 -6
  66. package/dist/resolution.js.map +1 -1
  67. package/dist/roles/catalog-categories.d.ts +146 -0
  68. package/dist/roles/catalog-categories.d.ts.map +1 -0
  69. package/dist/roles/catalog-categories.js +374 -0
  70. package/dist/roles/catalog-categories.js.map +1 -0
  71. package/dist/roles/catalog-engineering.d.ts +212 -0
  72. package/dist/roles/catalog-engineering.d.ts.map +1 -0
  73. package/dist/roles/catalog-engineering.js +549 -0
  74. package/dist/roles/catalog-engineering.js.map +1 -0
  75. package/dist/roles/catalog.d.ts +24 -0
  76. package/dist/roles/catalog.d.ts.map +1 -0
  77. package/dist/roles/catalog.js +28 -0
  78. package/dist/roles/catalog.js.map +1 -0
  79. package/dist/roles/index.d.ts +69 -0
  80. package/dist/roles/index.d.ts.map +1 -0
  81. package/dist/roles/index.js +197 -0
  82. package/dist/roles/index.js.map +1 -0
  83. package/dist/roles/types.d.ts +87 -0
  84. package/dist/roles/types.d.ts.map +1 -0
  85. package/dist/roles/types.js +14 -0
  86. package/dist/roles/types.js.map +1 -0
  87. package/dist/runtime/benchmarks.js +5 -5
  88. package/dist/runtime/benchmarks.js.map +1 -1
  89. package/dist/runtime/constants.d.ts +2 -2
  90. package/dist/runtime/constants.d.ts.map +1 -1
  91. package/dist/runtime/constants.js +5 -3
  92. package/dist/runtime/constants.js.map +1 -1
  93. package/dist/runtime/cross-squad.d.ts +118 -0
  94. package/dist/runtime/cross-squad.d.ts.map +1 -0
  95. package/dist/runtime/cross-squad.js +234 -0
  96. package/dist/runtime/cross-squad.js.map +1 -0
  97. package/dist/runtime/otel-init.d.ts +24 -17
  98. package/dist/runtime/otel-init.d.ts.map +1 -1
  99. package/dist/runtime/otel-init.js +29 -20
  100. package/dist/runtime/otel-init.js.map +1 -1
  101. package/dist/runtime/otel-metrics.d.ts +5 -0
  102. package/dist/runtime/otel-metrics.d.ts.map +1 -1
  103. package/dist/runtime/otel-metrics.js +54 -0
  104. package/dist/runtime/otel-metrics.js.map +1 -1
  105. package/dist/runtime/rework.d.ts +71 -0
  106. package/dist/runtime/rework.d.ts.map +1 -0
  107. package/dist/runtime/rework.js +107 -0
  108. package/dist/runtime/rework.js.map +1 -0
  109. package/dist/runtime/scheduler.d.ts +128 -0
  110. package/dist/runtime/scheduler.d.ts.map +1 -0
  111. package/dist/runtime/scheduler.js +427 -0
  112. package/dist/runtime/scheduler.js.map +1 -0
  113. package/dist/runtime/squad-observer.d.ts.map +1 -1
  114. package/dist/runtime/squad-observer.js +4 -0
  115. package/dist/runtime/squad-observer.js.map +1 -1
  116. package/dist/runtime/streaming.d.ts +2 -0
  117. package/dist/runtime/streaming.d.ts.map +1 -1
  118. package/dist/runtime/streaming.js +6 -0
  119. package/dist/runtime/streaming.js.map +1 -1
  120. package/dist/runtime/telemetry.d.ts +2 -0
  121. package/dist/runtime/telemetry.d.ts.map +1 -1
  122. package/dist/runtime/telemetry.js +6 -0
  123. package/dist/runtime/telemetry.js.map +1 -1
  124. package/dist/sharing/consult.d.ts +2 -2
  125. package/dist/sharing/consult.js +6 -6
  126. package/dist/sharing/consult.js.map +1 -1
  127. package/dist/sharing/export.d.ts.map +1 -1
  128. package/dist/sharing/export.js +17 -4
  129. package/dist/sharing/export.js.map +1 -1
  130. package/dist/skills/handler-types.d.ts +271 -0
  131. package/dist/skills/handler-types.d.ts.map +1 -0
  132. package/dist/skills/handler-types.js +31 -0
  133. package/dist/skills/handler-types.js.map +1 -0
  134. package/dist/skills/index.d.ts +3 -0
  135. package/dist/skills/index.d.ts.map +1 -1
  136. package/dist/skills/index.js +3 -0
  137. package/dist/skills/index.js.map +1 -1
  138. package/dist/skills/skill-script-loader.d.ts +65 -0
  139. package/dist/skills/skill-script-loader.d.ts.map +1 -0
  140. package/dist/skills/skill-script-loader.js +227 -0
  141. package/dist/skills/skill-script-loader.js.map +1 -0
  142. package/dist/skills/skill-source.d.ts.map +1 -1
  143. package/dist/skills/skill-source.js +5 -1
  144. package/dist/skills/skill-source.js.map +1 -1
  145. package/dist/tools/index.d.ts +10 -1
  146. package/dist/tools/index.d.ts.map +1 -1
  147. package/dist/tools/index.js +49 -8
  148. package/dist/tools/index.js.map +1 -1
  149. package/dist/upstream/resolver.d.ts.map +1 -1
  150. package/dist/upstream/resolver.js +14 -5
  151. package/dist/upstream/resolver.js.map +1 -1
  152. package/package.json +34 -3
  153. package/templates/casting/Futurama.json +10 -0
  154. package/templates/casting-policy.json +4 -2
  155. package/templates/casting-reference.md +104 -0
  156. package/templates/cooperative-rate-limiting.md +229 -0
  157. package/templates/issue-lifecycle.md +412 -0
  158. package/templates/keda-scaler.md +164 -0
  159. package/templates/machine-capabilities.md +75 -0
  160. package/templates/mcp-config.md +0 -8
  161. package/templates/orchestration-log.md +27 -27
  162. package/templates/package.json +3 -0
  163. package/templates/ralph-circuit-breaker.md +313 -0
  164. package/templates/ralph-triage.js +543 -0
  165. package/templates/routing.md +5 -20
  166. package/templates/schedule.json +19 -0
  167. package/templates/scribe-charter.md +1 -1
  168. package/templates/skills/agent-collaboration/SKILL.md +42 -0
  169. package/templates/skills/agent-conduct/SKILL.md +24 -0
  170. package/templates/skills/architectural-proposals/SKILL.md +151 -0
  171. package/templates/skills/ci-validation-gates/SKILL.md +84 -0
  172. package/templates/skills/cli-wiring/SKILL.md +47 -0
  173. package/templates/skills/client-compatibility/SKILL.md +89 -0
  174. package/templates/skills/cross-squad/SKILL.md +114 -0
  175. package/templates/skills/distributed-mesh/SKILL.md +287 -0
  176. package/templates/skills/distributed-mesh/mesh.json.example +30 -0
  177. package/templates/skills/distributed-mesh/sync-mesh.ps1 +111 -0
  178. package/templates/skills/distributed-mesh/sync-mesh.sh +104 -0
  179. package/templates/skills/docs-standards/SKILL.md +71 -0
  180. package/templates/skills/economy-mode/SKILL.md +114 -0
  181. package/templates/skills/external-comms/SKILL.md +329 -0
  182. package/templates/skills/gh-auth-isolation/SKILL.md +183 -0
  183. package/templates/skills/git-workflow/SKILL.md +204 -0
  184. package/templates/skills/github-multi-account/SKILL.md +95 -0
  185. package/templates/skills/history-hygiene/SKILL.md +36 -0
  186. package/templates/skills/humanizer/SKILL.md +105 -0
  187. package/templates/skills/init-mode/SKILL.md +102 -0
  188. package/templates/skills/model-selection/SKILL.md +117 -0
  189. package/templates/skills/nap/SKILL.md +24 -0
  190. package/templates/skills/personal-squad/SKILL.md +57 -0
  191. package/templates/skills/release-process/SKILL.md +423 -0
  192. package/templates/skills/reskill/SKILL.md +92 -0
  193. package/templates/skills/reviewer-protocol/SKILL.md +79 -0
  194. package/templates/skills/secret-handling/SKILL.md +200 -0
  195. package/templates/skills/session-recovery/SKILL.md +155 -0
  196. package/templates/skills/squad-conventions/SKILL.md +69 -0
  197. package/templates/skills/test-discipline/SKILL.md +37 -0
  198. package/templates/skills/windows-compatibility/SKILL.md +74 -0
  199. package/templates/squad.agent.md +1287 -1146
  200. package/templates/workflows/squad-docs.yml +8 -4
  201. package/templates/workflows/squad-heartbeat.yml +55 -200
  202. package/templates/workflows/squad-insider-release.yml +1 -1
@@ -0,0 +1,164 @@
1
+ # KEDA External Scaler for GitHub Issue-Driven Agent Autoscaling
2
+
3
+ > Scale agent pods to zero when idle, up when work arrives — driven by GitHub Issues.
4
+
5
+ ## Overview
6
+
7
+ When running Squad on Kubernetes, agent pods sit idle when no work exists. [KEDA](https://keda.sh) (Kubernetes Event-Driven Autoscaler) solves this for queue-based workloads, but GitHub Issues isn't a native KEDA trigger.
8
+
9
+ The `keda-copilot-scaler` is a KEDA External Scaler (gRPC) that bridges this gap:
10
+ 1. Polls GitHub API for issues matching specific labels (e.g., `squad:copilot`)
11
+ 2. Reports queue depth as a KEDA metric
12
+ 3. Handles rate limits gracefully (Retry-After, exponential backoff)
13
+ 4. Supports composite scaling decisions
14
+
15
+ ## Quick Start
16
+
17
+ ### Prerequisites
18
+ - Kubernetes cluster with KEDA v2.x installed
19
+ - GitHub personal access token (PAT) with `repo` scope
20
+ - Helm 3.x
21
+
22
+ ### 1. Install the Scaler
23
+
24
+ ```bash
25
+ helm install keda-copilot-scaler oci://ghcr.io/tamirdresher/keda-copilot-scaler \
26
+ --namespace squad-scaler --create-namespace \
27
+ --set github.owner=YOUR_ORG \
28
+ --set github.repo=YOUR_REPO \
29
+ --set github.token=YOUR_TOKEN
30
+ ```
31
+
32
+ Or with Kustomize:
33
+ ```bash
34
+ kubectl apply -k https://github.com/tamirdresher/keda-copilot-scaler/deploy/kustomize
35
+ ```
36
+
37
+ ### 2. Create a ScaledObject
38
+
39
+ ```yaml
40
+ apiVersion: keda.sh/v1alpha1
41
+ kind: ScaledObject
42
+ metadata:
43
+ name: picard-scaler
44
+ namespace: squad
45
+ spec:
46
+ scaleTargetRef:
47
+ name: picard-deployment
48
+ minReplicaCount: 0 # Scale to zero when idle
49
+ maxReplicaCount: 3
50
+ pollingInterval: 30 # Check every 30 seconds
51
+ cooldownPeriod: 300 # Wait 5 minutes before scaling down
52
+ triggers:
53
+ - type: external
54
+ metadata:
55
+ scalerAddress: keda-copilot-scaler.squad-scaler.svc.cluster.local:6000
56
+ owner: your-org
57
+ repo: your-repo
58
+ labels: squad:copilot # Only count issues with this label
59
+ threshold: "1" # Scale up when >= 1 issue exists
60
+ ```
61
+
62
+ ### 3. Verify
63
+
64
+ ```bash
65
+ # Check the scaler is running
66
+ kubectl get pods -n squad-scaler
67
+
68
+ # Check ScaledObject status
69
+ kubectl get scaledobject picard-scaler -n squad
70
+
71
+ # Watch scaling events
72
+ kubectl get events -n squad --watch
73
+ ```
74
+
75
+ ## Scaling Behavior
76
+
77
+ | Open Issues | Target Replicas | Behavior |
78
+ |------------|----------------|----------|
79
+ | 0 | 0 | Scale to zero — save resources |
80
+ | 1–3 | 1 | Single agent handles work |
81
+ | 4–10 | 2 | Scale up for parallel processing |
82
+ | 11+ | 3 (max) | Maximum parallelism |
83
+
84
+ The threshold and max replicas are configurable per ScaledObject.
85
+
86
+ ## Rate Limit Awareness
87
+
88
+ The scaler tracks GitHub API rate limits:
89
+ - Reads `X-RateLimit-Remaining` from API responses
90
+ - Backs off when quota is low (< 100 remaining)
91
+ - Reports rate limit metrics as secondary KEDA triggers
92
+ - Never exhausts API quota from polling
93
+
94
+ ## Integration with Squad
95
+
96
+ ### Machine Capabilities (#514)
97
+
98
+ Combine with machine capability labels for intelligent scheduling:
99
+
100
+ ```yaml
101
+ # Only scale pods on GPU-capable nodes
102
+ spec:
103
+ template:
104
+ spec:
105
+ nodeSelector:
106
+ node.squad.dev/gpu: "true"
107
+ triggers:
108
+ - type: external
109
+ metadata:
110
+ labels: squad:copilot,needs:gpu
111
+ ```
112
+
113
+ ### Cooperative Rate Limiting (#515)
114
+
115
+ The scaler exposes rate limit metrics that feed into the cooperative rate limiting system:
116
+ - Current `X-RateLimit-Remaining` value
117
+ - Predicted time to exhaustion (from predictive circuit breaker)
118
+ - Can return 0 target replicas when rate limited → pods scale to zero
119
+
120
+ ## Architecture
121
+
122
+ ```
123
+ GitHub API                  KEDA                  Kubernetes
124
+ ┌──────────┐            ┌──────────┐           ┌──────────────┐
125
+ │  Issues  │◄── poll ──►│  Scaler  │──metrics─►│  HPA / KEDA  │
126
+ │  (REST)  │            │  (gRPC)  │           │  Controller  │
127
+ └──────────┘            └──────────┘           └──────┬───────┘
128
+
129
+                                                 scale up/down
130
+
131
+                                                ┌──────▼───────┐
132
+                                                │  Agent Pods  │
133
+                                                │ (0–N replicas)│
134
+                                                └──────────────┘
135
+ ```
136
+
137
+ ## Configuration Reference
138
+
139
+ | Parameter | Default | Description |
140
+ |-----------|---------|-------------|
141
+ | `github.owner` | — | Repository owner |
142
+ | `github.repo` | — | Repository name |
143
+ | `github.token` | — | GitHub PAT with `repo` scope |
144
+ | `github.labels` | `squad:copilot` | Comma-separated label filter |
145
+ | `scaler.port` | `6000` | gRPC server port |
146
+ | `scaler.pollInterval` | `30s` | GitHub API polling interval |
147
+ | `scaler.rateLimitThreshold` | `100` | Stop polling when the remaining GitHub API quota drops below this value |
148
+
149
+ ## Source & Contributing
150
+
151
+ - **Repository:** [tamirdresher/keda-copilot-scaler](https://github.com/tamirdresher/keda-copilot-scaler)
152
+ - **License:** MIT
153
+ - **Language:** Go
154
+ - **Tests:** 51 passing (unit + integration)
155
+ - **CI:** GitHub Actions
156
+
157
+ The scaler is maintained as a standalone project. PRs and issues welcome.
158
+
159
+ ## References
160
+
161
+ - [KEDA External Scalers](https://keda.sh/docs/latest/concepts/external-scalers/) — KEDA documentation
162
+ - [Squad on AKS](https://github.com/tamirdresher/squad-on-aks) — Full Kubernetes deployment example
163
+ - [Machine Capabilities](machine-capabilities.md) — Capability-based routing (#514)
164
+ - [Cooperative Rate Limiting](cooperative-rate-limiting.md) — Multi-agent rate management (#515)
@@ -0,0 +1,75 @@
1
+ # Machine Capability Discovery & Label-Based Routing
2
+
3
+ > Enable Ralph to skip issues requiring capabilities the current machine lacks.
4
+
5
+ ## Overview
6
+
7
+ When running Squad across multiple machines (laptops, DevBoxes, GPU servers, Kubernetes nodes), each machine has different tooling. The capability system lets you declare what each machine can do, and Ralph automatically routes work accordingly.
8
+
9
+ ## Setup
10
+
11
+ ### 1. Create a Capabilities Manifest
12
+
13
+ Create `~/.squad/machine-capabilities.json` (user-wide) or `.squad/machine-capabilities.json` (project-local):
14
+
15
+ ```json
16
+ {
17
+ "machine": "MY-LAPTOP",
18
+ "capabilities": ["browser", "personal-gh", "onedrive"],
19
+ "missing": ["gpu", "docker", "azure-speech"],
20
+ "lastUpdated": "2026-03-22T00:00:00Z"
21
+ }
22
+ ```
23
+
24
+ ### 2. Label Issues with Requirements
25
+
26
+ Add `needs:*` labels to issues that require specific capabilities:
27
+
28
+ | Label | Meaning |
29
+ |-------|---------|
30
+ | `needs:browser` | Requires Playwright / browser automation |
31
+ | `needs:gpu` | Requires NVIDIA GPU |
32
+ | `needs:personal-gh` | Requires personal GitHub account |
33
+ | `needs:emu-gh` | Requires Enterprise Managed User account |
34
+ | `needs:azure-cli` | Requires authenticated Azure CLI |
35
+ | `needs:docker` | Requires Docker daemon |
36
+ | `needs:onedrive` | Requires OneDrive sync |
37
+ | `needs:teams-mcp` | Requires Teams MCP tools |
38
+
39
+ Custom capabilities are supported — any `needs:X` label works if `X` is in the machine's `capabilities` array.
40
+
41
+ ### 3. Run Ralph
42
+
43
+ ```bash
44
+ squad watch --interval 5
45
+ ```
46
+
47
+ Ralph will log skipped issues:
48
+ ```
49
+ ⏭️ Skipping #42 "Train ML model" — missing: gpu
50
+ ✓ Triaged #43 "Fix CSS layout" → Picard (routing-rule)
51
+ ```
52
+
53
+ ## How It Works
54
+
55
+ 1. Ralph loads `machine-capabilities.json` at startup
56
+ 2. For each open issue, Ralph extracts `needs:*` labels
57
+ 3. If any required capability is missing, the issue is skipped
58
+ 4. Issues without `needs:*` labels are always processed (opt-in system)
59
+
60
+ ## Kubernetes Integration
61
+
62
+ On Kubernetes, machine capabilities map to node labels:
63
+
64
+ ```yaml
65
+ # Node labels (set by capability DaemonSet or manually)
66
+ node.squad.dev/gpu: "true"
67
+ node.squad.dev/browser: "true"
68
+
69
+ # Pod spec uses nodeSelector
70
+ spec:
71
+ nodeSelector:
72
+ node.squad.dev/gpu: "true"
73
+ ```
74
+
75
+ A DaemonSet can run capability discovery on each node and maintain labels automatically. See the [squad-on-aks](https://github.com/tamirdresher/squad-on-aks) project for a complete Kubernetes deployment example.
@@ -4,14 +4,6 @@ MCP (Model Context Protocol) servers extend Squad with tools for external servic
4
4
 
5
5
  > **Full patterns:** Read `.squad/skills/mcp-tool-discovery/SKILL.md` for discovery patterns, domain-specific usage, and graceful degradation.
6
6
 
7
- ## Security Considerations
8
-
9
- > ⚠️ **Important:** The sample configs below use `npx -y` to run MCP server packages without version pinning. For production use:
10
- > - **Pin versions:** Use `npx -y @trello/mcp-server@1.2.3` instead of bare package names
11
- > - **Audit packages:** Review MCP server source code before granting access to credentials
12
- > - **Use least-privilege tokens:** Create tokens with minimal required scopes
13
- > - **Consider local installs:** Install packages locally (`npm install`) rather than fetching on each run
14
-
15
7
  ## Config File Locations
16
8
 
17
9
  Users configure MCP servers at these locations (checked in priority order):
@@ -1,27 +1,27 @@
1
- # Orchestration Log Entry
2
-
3
- > One file per agent spawn. Saved to `.squad/orchestration-log/{timestamp}-{agent-name}.md`
4
-
5
- ---
6
-
7
- ### {timestamp} — {task summary}
8
-
9
- | Field | Value |
10
- |-------|-------|
11
- | **Agent routed** | {Name} ({Role}) |
12
- | **Why chosen** | {Routing rationale — what in the request matched this agent} |
13
- | **Mode** | {`background` / `sync`} |
14
- | **Why this mode** | {Brief reason — e.g., "No hard data dependencies" or "User needs to approve architecture"} |
15
- | **Files authorized to read** | {Exact file paths the agent was told to read} |
16
- | **File(s) agent must produce** | {Exact file paths the agent is expected to create or modify} |
17
- | **Outcome** | {Completed / Rejected by {Reviewer} / Escalated} |
18
-
19
- ---
20
-
21
- ## Rules
22
-
23
- 1. **One file per agent spawn.** Named `{timestamp}-{agent-name}.md`.
24
- 2. **Log BEFORE spawning.** The entry must exist before the agent runs.
25
- 3. **Update outcome AFTER the agent completes.** Fill in the Outcome field.
26
- 4. **Never delete or edit past entries.** Append-only.
27
- 5. **If a reviewer rejects work,** log the rejection as a new entry with the revision agent.
1
+ # Orchestration Log Entry
2
+
3
+ > One file per agent spawn. Saved to `.squad/orchestration-log/{timestamp}-{agent-name}.md`
4
+
5
+ ---
6
+
7
+ ### {timestamp} — {task summary}
8
+
9
+ | Field | Value |
10
+ |-------|-------|
11
+ | **Agent routed** | {Name} ({Role}) |
12
+ | **Why chosen** | {Routing rationale — what in the request matched this agent} |
13
+ | **Mode** | {`background` / `sync`} |
14
+ | **Why this mode** | {Brief reason — e.g., "No hard data dependencies" or "User needs to approve architecture"} |
15
+ | **Files authorized to read** | {Exact file paths the agent was told to read} |
16
+ | **File(s) agent must produce** | {Exact file paths the agent is expected to create or modify} |
17
+ | **Outcome** | {Completed / Rejected by {Reviewer} / Escalated} |
18
+
19
+ ---
20
+
21
+ ## Rules
22
+
23
+ 1. **One file per agent spawn.** Named `{timestamp}-{agent-name}.md`.
24
+ 2. **Log BEFORE spawning.** The entry must exist before the agent runs.
25
+ 3. **Update outcome AFTER the agent completes.** Fill in the Outcome field.
26
+ 4. **Never delete or edit past entries.** Append-only.
27
+ 5. **If a reviewer rejects work,** log the rejection as a new entry with the revision agent.
@@ -0,0 +1,3 @@
1
+ {
2
+ "type": "commonjs"
3
+ }
@@ -0,0 +1,313 @@
1
+ # Ralph Circuit Breaker — Model Rate Limit Fallback
2
+
3
+ > Classic circuit breaker pattern (Hystrix / Polly / Resilience4j) applied to Copilot model selection.
4
+ > When the preferred model hits rate limits, Ralph automatically degrades to free-tier models, then self-heals.
5
+
6
+ ## Problem
7
+
8
+ When running multiple Ralph instances across repos, Copilot model rate limits cause cascading failures.
9
+ All Ralphs fail simultaneously when the preferred model (e.g., `claude-sonnet-4.6`) hits quota.
10
+
11
+ Premium models burn quota fast:
12
+ | Model | Multiplier | Risk |
13
+ |-------|-----------|------|
14
+ | `claude-sonnet-4.6` | 1x | Moderate with many Ralphs |
15
+ | `claude-opus-4.6` | 10x | High |
16
+ | `gpt-5.4` | 50x | Very high |
17
+ | `gpt-5.4-mini` | **0x** | **Free — unlimited** |
18
+ | `gpt-5-mini` | **0x** | **Free — unlimited** |
19
+ | `gpt-4.1` | **0x** | **Free — unlimited** |
20
+
21
+ ## Circuit Breaker States
22
+
23
+ ```
24
+ ┌─────────┐ rate limit error ┌────────┐
25
+ │ CLOSED │ ───────────────────► │ OPEN │
26
+ │ (normal)│ │(fallback)│
27
+ └────┬────┘ ◄──────────────── └────┬────┘
28
+ │ 2 consecutive │
29
+ │ successes │ cooldown expires
30
+ │ ▼
31
+ │ ┌──────────┐
32
+ └───── success ◄──────── │HALF-OPEN │
33
+ (close) │ (testing) │
34
+ └──────────┘
35
+ ```
36
+
37
+ ### CLOSED (normal operation)
38
+ - Use preferred model from config
39
+ - Every successful response confirms circuit stays closed
40
+ - On rate limit error → transition to OPEN
41
+
42
+ ### OPEN (rate limited — fallback active)
43
+ - Fall back through the free-tier model chain:
44
+ 1. `gpt-5.4-mini`
45
+ 2. `gpt-5-mini`
46
+ 3. `gpt-4.1`
47
+ - Start cooldown timer (default: 10 minutes)
48
+ - When cooldown expires → transition to HALF-OPEN
49
+
50
+ ### HALF-OPEN (testing recovery)
51
+ - Try preferred model again
52
+ - If 2 consecutive successes → transition to CLOSED
53
+ - If rate limit error → back to OPEN, reset cooldown
54
+
55
+ ## State File: `.squad/ralph-circuit-breaker.json`
56
+
57
+ ```json
58
+ {
59
+ "state": "closed",
60
+ "preferredModel": "claude-sonnet-4.6",
61
+ "fallbackChain": ["gpt-5.4-mini", "gpt-5-mini", "gpt-4.1"],
62
+ "currentFallbackIndex": 0,
63
+ "cooldownMinutes": 10,
64
+ "openedAt": null,
65
+ "halfOpenSuccesses": 0,
66
+ "consecutiveFailures": 0,
67
+ "metrics": {
68
+ "totalFallbacks": 0,
69
+ "totalRecoveries": 0,
70
+ "lastFallbackAt": null,
71
+ "lastRecoveryAt": null
72
+ }
73
+ }
74
+ ```
75
+
76
+ ## PowerShell Functions
77
+
78
+ Paste these into your `ralph-watch.ps1` or source them from a shared module.
79
+
80
+ ### `Get-CircuitBreakerState`
81
+
82
+ ```powershell
83
+ function Get-CircuitBreakerState {
84
+ param([string]$StateFile = ".squad/ralph-circuit-breaker.json")
85
+
86
+ if (-not (Test-Path $StateFile)) {
87
+ $default = @{
88
+ state = "closed"
89
+ preferredModel = "claude-sonnet-4.6"
90
+ fallbackChain = @("gpt-5.4-mini", "gpt-5-mini", "gpt-4.1")
91
+ currentFallbackIndex = 0
92
+ cooldownMinutes = 10
93
+ openedAt = $null
94
+ halfOpenSuccesses = 0
95
+ consecutiveFailures = 0
96
+ metrics = @{
97
+ totalFallbacks = 0
98
+ totalRecoveries = 0
99
+ lastFallbackAt = $null
100
+ lastRecoveryAt = $null
101
+ }
102
+ }
103
+ $default | ConvertTo-Json -Depth 3 | Set-Content $StateFile
104
+ return $default
105
+ }
106
+
107
+ return (Get-Content $StateFile -Raw | ConvertFrom-Json)
108
+ }
109
+ ```
110
+
111
+ ### `Save-CircuitBreakerState`
112
+
113
+ ```powershell
114
+ function Save-CircuitBreakerState {
115
+ param(
116
+ [object]$State,
117
+ [string]$StateFile = ".squad/ralph-circuit-breaker.json"
118
+ )
119
+
120
+ $State | ConvertTo-Json -Depth 3 | Set-Content $StateFile
121
+ }
122
+ ```
123
+
124
+ ### `Get-CurrentModel`
125
+
126
+ Returns the model Ralph should use right now, based on circuit state.
127
+
128
+ ```powershell
129
+ function Get-CurrentModel {
130
+ param([string]$StateFile = ".squad/ralph-circuit-breaker.json")
131
+
132
+ $cb = Get-CircuitBreakerState -StateFile $StateFile
133
+
134
+ switch ($cb.state) {
135
+ "closed" {
136
+ return $cb.preferredModel
137
+ }
138
+ "open" {
139
+ # Check if cooldown has expired
140
+ if ($cb.openedAt) {
141
+ $opened = [DateTime]::Parse($cb.openedAt)
142
+ $elapsed = (Get-Date) - $opened
143
+ if ($elapsed.TotalMinutes -ge $cb.cooldownMinutes) {
144
+ # Transition to half-open
145
+ $cb.state = "half-open"
146
+ $cb.halfOpenSuccesses = 0
147
+ Save-CircuitBreakerState -State $cb -StateFile $StateFile
148
+ Write-Host " [circuit-breaker] Cooldown expired. Testing preferred model..." -ForegroundColor Yellow
149
+ return $cb.preferredModel
150
+ }
151
+ }
152
+ # Still in cooldown — use fallback
153
+ $idx = [Math]::Min($cb.currentFallbackIndex, $cb.fallbackChain.Count - 1)
154
+ return $cb.fallbackChain[$idx]
155
+ }
156
+ "half-open" {
157
+ return $cb.preferredModel
158
+ }
159
+ default {
160
+ return $cb.preferredModel
161
+ }
162
+ }
163
+ }
164
+ ```
165
+
166
+ ### `Update-CircuitBreakerOnSuccess`
167
+
168
+ Call after every successful model response.
169
+
170
+ ```powershell
171
+ function Update-CircuitBreakerOnSuccess {
172
+ param([string]$StateFile = ".squad/ralph-circuit-breaker.json")
173
+
174
+ $cb = Get-CircuitBreakerState -StateFile $StateFile
175
+ $cb.consecutiveFailures = 0
176
+
177
+ if ($cb.state -eq "half-open") {
178
+ $cb.halfOpenSuccesses++
179
+ if ($cb.halfOpenSuccesses -ge 2) {
180
+ # Recovery! Close the circuit
181
+ $cb.state = "closed"
182
+ $cb.openedAt = $null
183
+ $cb.halfOpenSuccesses = 0
184
+ $cb.currentFallbackIndex = 0
185
+ $cb.metrics.totalRecoveries++
186
+ $cb.metrics.lastRecoveryAt = (Get-Date).ToString("o")
187
+ Save-CircuitBreakerState -State $cb -StateFile $StateFile
188
+ Write-Host " [circuit-breaker] RECOVERED — back to preferred model ($($cb.preferredModel))" -ForegroundColor Green
189
+ return
190
+ }
191
+ Save-CircuitBreakerState -State $cb -StateFile $StateFile
192
+ Write-Host " [circuit-breaker] Half-open success $($cb.halfOpenSuccesses)/2" -ForegroundColor Yellow
193
+ return
194
+ }
195
+
196
+ # closed or open state — no transition; note the consecutiveFailures reset above is not saved on this path
197
+ }
198
+ ```
199
+
200
+ ### `Update-CircuitBreakerOnRateLimit`
201
+
202
+ Call when a model response indicates rate limiting (HTTP 429 or error message containing "rate limit").
203
+
204
+ ```powershell
205
+ function Update-CircuitBreakerOnRateLimit {
206
+ param([string]$StateFile = ".squad/ralph-circuit-breaker.json")
207
+
208
+ $cb = Get-CircuitBreakerState -StateFile $StateFile
209
+ $cb.consecutiveFailures++
210
+
211
+ if ($cb.state -eq "closed" -or $cb.state -eq "half-open") {
212
+ # Open the circuit
213
+ $cb.state = "open"
214
+ $cb.openedAt = (Get-Date).ToString("o")
215
+ $cb.halfOpenSuccesses = 0
216
+ $cb.currentFallbackIndex = 0
217
+ $cb.metrics.totalFallbacks++
218
+ $cb.metrics.lastFallbackAt = (Get-Date).ToString("o")
219
+ Save-CircuitBreakerState -State $cb -StateFile $StateFile
220
+
221
+ $fallbackModel = $cb.fallbackChain[0]
222
+ Write-Host " [circuit-breaker] RATE LIMITED — falling back to $fallbackModel (cooldown: $($cb.cooldownMinutes)m)" -ForegroundColor Red
223
+ return
224
+ }
225
+
226
+ if ($cb.state -eq "open") {
227
+ # Already open — try next fallback in chain if current one also fails
228
+ if ($cb.currentFallbackIndex -lt ($cb.fallbackChain.Count - 1)) {
229
+ $cb.currentFallbackIndex++
230
+ $nextModel = $cb.fallbackChain[$cb.currentFallbackIndex]
231
+ Write-Host " [circuit-breaker] Fallback also limited — trying $nextModel" -ForegroundColor Red
232
+ }
233
+ # Reset cooldown timer
234
+ $cb.openedAt = (Get-Date).ToString("o")
235
+ Save-CircuitBreakerState -State $cb -StateFile $StateFile
236
+ }
237
+ }
238
+ ```
239
+
240
+ ## Integration with ralph-watch.ps1
241
+
242
+ In your Ralph polling loop, wrap the model selection:
243
+
244
+ ```powershell
245
+ # At the top of your polling loop
246
+ $model = Get-CurrentModel
247
+
248
+ # When invoking copilot CLI
249
+ $result = copilot-cli --model $model ...
250
+
251
+ # After the call
252
+ if ($result -match "rate.?limit" -or $LASTEXITCODE -eq 429) {
253
+ Update-CircuitBreakerOnRateLimit
254
+ } else {
255
+ Update-CircuitBreakerOnSuccess
256
+ }
257
+ ```
258
+
259
+ ### Full integration example
260
+
261
+ ```powershell
262
+ # Source the circuit breaker functions
263
+ . .squad-templates/ralph-circuit-breaker-functions.ps1
264
+
265
+ while ($true) {
266
+ $model = Get-CurrentModel
267
+ Write-Host "Polling with model: $model"
268
+
269
+ try {
270
+ # Your existing Ralph logic here, but pass $model
271
+ $response = Invoke-RalphCycle -Model $model
272
+
273
+ # Success path
274
+ Update-CircuitBreakerOnSuccess
275
+ }
276
+ catch {
277
+ if ($_.Exception.Message -match "rate.?limit|429|quota|Too Many Requests") {
278
+ Update-CircuitBreakerOnRateLimit
279
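+ # Retry immediately with fallback model (this skips the Start-Sleep below — add a short delay here if every fallback may also be rate limited)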
280
+ continue
281
+ }
282
+ # Other errors — handle normally
283
+ throw
284
+ }
285
+
286
+ Start-Sleep -Seconds $pollInterval
287
+ }
288
+ ```
289
+
290
+ ## Configuration
291
+
292
+ Override defaults by editing `.squad/ralph-circuit-breaker.json`:
293
+
294
+ | Field | Default | Description |
295
+ |-------|---------|-------------|
296
+ | `preferredModel` | `claude-sonnet-4.6` | Model to use when circuit is closed |
297
+ | `fallbackChain` | `["gpt-5.4-mini", "gpt-5-mini", "gpt-4.1"]` | Ordered fallback models (all free-tier) |
298
+ | `cooldownMinutes` | `10` | How long to wait before testing recovery |
299
+
300
+ ## Metrics
301
+
302
+ The state file tracks operational metrics:
303
+
304
+ - **totalFallbacks** — How many times the circuit opened
305
+ - **totalRecoveries** — How many times it recovered to preferred model
306
+ - **lastFallbackAt** — ISO timestamp of the last time the circuit opened (rate limit errors while already open do not update it)
307
+ - **lastRecoveryAt** — ISO timestamp of last successful recovery
308
+
309
+ Query metrics with:
310
+ ```powershell
311
+ $cb = Get-Content .squad/ralph-circuit-breaker.json | ConvertFrom-Json
312
+ Write-Host "Fallbacks: $($cb.metrics.totalFallbacks) | Recoveries: $($cb.metrics.totalRecoveries)"
313
+ ```