@zhixuan92/multi-model-agent 4.5.3 → 4.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -3
- package/dist/cli/index.js +1 -1
- package/dist/cli/index.js.map +1 -1
- package/dist/http/async-dispatch.d.ts.map +1 -1
- package/dist/http/async-dispatch.js +21 -16
- package/dist/http/async-dispatch.js.map +1 -1
- package/dist/http/execution-context.d.ts.map +1 -1
- package/dist/http/execution-context.js +12 -9
- package/dist/http/execution-context.js.map +1 -1
- package/dist/http/handler-deps.d.ts +0 -6
- package/dist/http/handler-deps.d.ts.map +1 -1
- package/dist/http/handlers/control/batch.d.ts.map +1 -1
- package/dist/http/handlers/control/batch.js +50 -0
- package/dist/http/handlers/control/batch.js.map +1 -1
- package/dist/http/handlers/control/context-blocks.d.ts +0 -2
- package/dist/http/handlers/control/context-blocks.d.ts.map +1 -1
- package/dist/http/handlers/control/context-blocks.js +3 -1
- package/dist/http/handlers/control/context-blocks.js.map +1 -1
- package/dist/http/handlers/control/retry.js.map +1 -1
- package/dist/http/handlers/tools/audit.d.ts.map +1 -1
- package/dist/http/handlers/tools/audit.js +1 -11
- package/dist/http/handlers/tools/audit.js.map +1 -1
- package/dist/http/handlers/tools/debug.d.ts.map +1 -1
- package/dist/http/handlers/tools/debug.js +1 -11
- package/dist/http/handlers/tools/debug.js.map +1 -1
- package/dist/http/handlers/tools/delegate.d.ts.map +1 -1
- package/dist/http/handlers/tools/delegate.js +1 -11
- package/dist/http/handlers/tools/delegate.js.map +1 -1
- package/dist/http/handlers/tools/execute-plan.d.ts.map +1 -1
- package/dist/http/handlers/tools/execute-plan.js +1 -11
- package/dist/http/handlers/tools/execute-plan.js.map +1 -1
- package/dist/http/handlers/tools/investigate.d.ts.map +1 -1
- package/dist/http/handlers/tools/investigate.js +1 -11
- package/dist/http/handlers/tools/investigate.js.map +1 -1
- package/dist/http/handlers/tools/research.d.ts.map +1 -1
- package/dist/http/handlers/tools/research.js +1 -11
- package/dist/http/handlers/tools/research.js.map +1 -1
- package/dist/http/handlers/tools/retry.d.ts.map +1 -1
- package/dist/http/handlers/tools/retry.js +6 -16
- package/dist/http/handlers/tools/retry.js.map +1 -1
- package/dist/http/handlers/tools/review.d.ts.map +1 -1
- package/dist/http/handlers/tools/review.js +1 -11
- package/dist/http/handlers/tools/review.js.map +1 -1
- package/dist/http/request-observability.d.ts.map +1 -1
- package/dist/http/request-observability.js +6 -8
- package/dist/http/request-observability.js.map +1 -1
- package/dist/http/server.d.ts.map +1 -1
- package/dist/http/server.js +20 -42
- package/dist/http/server.js.map +1 -1
- package/dist/skills/mma-audit/SKILL.md +38 -25
- package/dist/skills/mma-context-blocks/SKILL.md +22 -1
- package/dist/skills/mma-debug/SKILL.md +38 -25
- package/dist/skills/mma-delegate/SKILL.md +103 -11
- package/dist/skills/mma-execute-plan/SKILL.md +101 -2
- package/dist/skills/mma-explore/SKILL.md +21 -5
- package/dist/skills/mma-investigate/SKILL.md +62 -38
- package/dist/skills/mma-research/SKILL.md +52 -3
- package/dist/skills/mma-retry/SKILL.md +102 -3
- package/dist/skills/mma-review/SKILL.md +38 -25
- package/dist/skills/multi-model-agent/SKILL.md +1 -1
- package/dist/telemetry/flusher.d.ts.map +1 -1
- package/dist/telemetry/flusher.js +9 -4
- package/dist/telemetry/flusher.js.map +1 -1
- package/package.json +2 -2
|
@@ -10,7 +10,7 @@ when_to_use: >-
|
|
|
10
10
|
read files, reproduce, trace — OR a methodology skill
|
|
11
11
|
(superpowers:systematic-debugging) points at the investigation step. Delegate
|
|
12
12
|
the read/reproduce/trace; the main agent stays on the hypothesis and the fix.
|
|
13
|
-
version: 4.
|
|
13
|
+
version: 4.6.0
|
|
14
14
|
---
|
|
15
15
|
|
|
16
16
|
# mma-debug
|
|
@@ -84,41 +84,54 @@ BATCH_ID=$(echo "$BATCH" | jq -r '.batchId')
|
|
|
84
84
|
|
|
85
85
|
@include _shared/response-shape.md
|
|
86
86
|
|
|
87
|
-
## Reading the findings
|
|
87
|
+
## Reading the findings
|
|
88
88
|
|
|
89
|
-
The
|
|
90
|
-
|
|
91
|
-
|
|
89
|
+
The main agent reads `completed` + `message` + `findings` — the findings are the answer. For
|
|
90
|
+
read-only routes, `filesChanged` is always `[]` and `commitSha` is always `null`.
|
|
91
|
+
|
|
92
|
+
```json
|
|
93
|
+
{
|
|
94
|
+
"completed": true,
|
|
95
|
+
"message": "Investigation complete; 1 finding.",
|
|
96
|
+
"findings": [
|
|
97
|
+
{ "id": "F1", "severity": "high", "category": "root-cause",
|
|
98
|
+
"claim": "bcrypt binding fails on non-ASCII input in the Docker image.",
|
|
99
|
+
"evidence": "Worker reproduced the failure with `pass='café'`; strace shows EINVAL on encode call.",
|
|
100
|
+
"suggestion": "Normalize input to NFC form before calling bcrypt.",
|
|
101
|
+
"source": "implementer" }
|
|
102
|
+
],
|
|
103
|
+
"filesChanged": [],
|
|
104
|
+
"commitSha": null,
|
|
105
|
+
"summary": "...",
|
|
106
|
+
"telemetry": { ... }
|
|
107
|
+
}
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
### Finding shape
|
|
111
|
+
|
|
112
|
+
Every finding has this shape:
|
|
92
113
|
|
|
93
114
|
| Field | Type | Notes |
|
|
94
115
|
|---|---|---|
|
|
95
|
-
| `id` | string |
|
|
116
|
+
| `id` | string | Worker-assigned, e.g. `F1`, `F2`. Stable across chain. |
|
|
96
117
|
| `severity` | `'critical' \| 'high' \| 'medium' \| 'low'` | 4-tier. |
|
|
118
|
+
| `category` | string | Topical bucket, e.g. `root-cause`, `reproduction`. |
|
|
97
119
|
| `claim` | string | One-sentence summary. |
|
|
98
|
-
| `evidence` | string ≥20 chars |
|
|
120
|
+
| `evidence` | string ≥20 chars | Verbatim from source when grounded. |
|
|
99
121
|
| `suggestion?` | string | Optional fix recommendation. |
|
|
100
|
-
| `
|
|
101
|
-
| `evidenceGrounded` | boolean | True when `evidence` is a verbatim substring of worker output. |
|
|
102
|
-
|
|
103
|
-
### Verdict states (`qualityReviewVerdict`)
|
|
122
|
+
| `source` | `'implementer' \| 'reviewer'` | Who produced the finding. |
|
|
104
123
|
|
|
105
|
-
|
|
106
|
-
numeric `annotatorConfidence`) or deterministic-fallback (with
|
|
107
|
-
`annotatorConfidence: null`). The route ALWAYS reaches `'annotated'` unless
|
|
108
|
-
the reviewer call itself fails transport.
|
|
109
|
-
- `'error'` — only when the reviewer call fails transport (network / 5xx).
|
|
124
|
+
`annotatorConfidence` and `evidenceGrounded` are retired — they were v4 fields with no producers.
|
|
110
125
|
|
|
111
126
|
### Recommended rendering by the main agent
|
|
112
127
|
|
|
113
|
-
1. Show ALL findings — never silently drop.
|
|
114
|
-
|
|
115
|
-
2. Default sort: severity (critical → low) then `
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
color, or `(low confidence)` annotation). User decides what to do.
|
|
121
|
-
5. Severity-tier counts feed the dashboard via V3 `findingsBySeverity`.
|
|
128
|
+
1. Show ALL findings — never silently drop. Severity and grounding are soft
|
|
129
|
+
signals, not gates.
|
|
130
|
+
2. Default sort: severity (critical → low), then `id` ascending.
|
|
131
|
+
3. `severity` is the authoritative value — use it directly.
|
|
132
|
+
4. Mark findings with `evidence` shorter than 30 chars as "low-evidence"
|
|
133
|
+
(lighter color or `(low evidence)` annotation). User decides what to do.
|
|
134
|
+
5. Severity-tier counts feed the dashboard.
|
|
122
135
|
|
|
123
136
|
@include _shared/budget-defaults.md
|
|
124
137
|
|
|
@@ -11,7 +11,7 @@ when_to_use: >-
|
|
|
11
11
|
and keep main context free. If a plan file exists → use mma-execute-plan. If
|
|
12
12
|
the task is audit / review / verify / debug / investigate → use the matching
|
|
13
13
|
specialized skill.
|
|
14
|
-
version: 4.
|
|
14
|
+
version: 4.6.0
|
|
15
15
|
---
|
|
16
16
|
|
|
17
17
|
# mma-delegate
|
|
@@ -86,7 +86,107 @@ BATCH_ID=$(echo "$BATCH" | jq -r '.batchId')
|
|
|
86
86
|
|
|
87
87
|
@include _shared/polling.md
|
|
88
88
|
|
|
89
|
-
|
|
89
|
+
## Response shapes
|
|
90
|
+
|
|
91
|
+
### POST /delegate?cwd=<abs> — dispatch response (202)
|
|
92
|
+
|
|
93
|
+
```json
|
|
94
|
+
{ "batchId": "<uuid>", "statusUrl": "/batch/<uuid>" }
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
Use `batchId` to poll. `statusUrl` is a convenience pointer.
|
|
98
|
+
|
|
99
|
+
### GET /batch/:id — polling response
|
|
100
|
+
|
|
101
|
+
The HTTP status is the state discriminator:
|
|
102
|
+
|
|
103
|
+
| Status | Meaning |
|
|
104
|
+
|---|---|
|
|
105
|
+
| `202 text/plain` | Still pending — body is the running headline string |
|
|
106
|
+
| `200 application/json` | Terminal — body is the batch envelope below |
|
|
107
|
+
| `404` / `401` / `5xx` | Error — see Error response below; stop polling |
|
|
108
|
+
|
|
109
|
+
### GET /batch/:id?taskIndex=N — single task slice
|
|
110
|
+
|
|
111
|
+
Same envelope. `results` contains exactly the task at index `N`. Returns `404 unknown_task_index` if `N` is out of range.
|
|
112
|
+
|
|
113
|
+
### Reading the task result
|
|
114
|
+
|
|
115
|
+
Each task result is the per-task wire object (`ComposePayload`):
|
|
116
|
+
|
|
117
|
+
```json
|
|
118
|
+
{
|
|
119
|
+
"completed": true,
|
|
120
|
+
"message": "Task completed; tests passed; one file changed.",
|
|
121
|
+
"findings": [
|
|
122
|
+
{
|
|
123
|
+
"id": "F1",
|
|
124
|
+
"severity": "high",
|
|
125
|
+
"category": "correctness",
|
|
126
|
+
"claim": "The function does not handle empty input",
|
|
127
|
+
"evidence": "function foo() { ... } // no null check",
|
|
128
|
+
"suggestion": "Add an explicit null guard at the top",
|
|
129
|
+
"source": "reviewer"
|
|
130
|
+
}
|
|
131
|
+
],
|
|
132
|
+
"summary": "Refactored utils.ts — removed 3 dead branches, added JSDoc",
|
|
133
|
+
"filesChanged": ["/project/src/utils.ts"],
|
|
134
|
+
"commitSha": "abc123def",
|
|
135
|
+
"blockId": null,
|
|
136
|
+
"telemetry": {
|
|
137
|
+
"totalDurationMs": 12400,
|
|
138
|
+
"totalCostUSD": 0.08,
|
|
139
|
+
"workerSelfAssessment": "done",
|
|
140
|
+
"reviewVerdict": "approved",
|
|
141
|
+
"commitOutcome": "committed",
|
|
142
|
+
"stopReason": "normal",
|
|
143
|
+
"haltedStage": null,
|
|
144
|
+
"stages": [
|
|
145
|
+
{ "name": "prepare", "outcome": "advance", "durationMs": 2, "costUSD": 0 },
|
|
146
|
+
{ "name": "register-block", "outcome": "skip", "comment": "register-block does not apply to route=delegate", "durationMs": 0, "costUSD": 0 },
|
|
147
|
+
{ "name": "implement", "outcome": "advance", "durationMs": 8900, "costUSD": 0.05 },
|
|
148
|
+
{ "name": "review", "outcome": "advance", "durationMs": 2100, "costUSD": 0.02 },
|
|
149
|
+
{ "name": "rework", "outcome": "skip", "comment": "rework skipped because review approved", "durationMs": 0, "costUSD": 0 },
|
|
150
|
+
{ "name": "commit", "outcome": "advance", "durationMs": 340, "costUSD": 0 },
|
|
151
|
+
{ "name": "annotate", "outcome": "advance", "durationMs": 890, "costUSD": 0.01 },
|
|
152
|
+
{ "name": "compose", "outcome": "advance", "durationMs": 68, "costUSD": 0 },
|
|
153
|
+
{ "name": "terminal", "outcome": "advance", "durationMs": 100, "costUSD": 0 }
|
|
154
|
+
]
|
|
155
|
+
}
|
|
156
|
+
}
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
**Top-level fields to read for the main-agent verdict:**
|
|
160
|
+
|
|
161
|
+
| Field | When `true` / populated |
|
|
162
|
+
|---|---|
|
|
163
|
+
| `completed: true` | Task succeeded. `message` is the summary; `findings` are post-review issues (if any). |
|
|
164
|
+
| `completed: false` | Task did not complete. `message` names the blocking gate or finding; `findings` carry any discovered issues. |
|
|
165
|
+
| `findings` | Issues surfaced by the worker or reviewer. `severity` = `critical` \| `high` \| `medium` \| `low`. `source` = `implementer` \| `reviewer`. |
|
|
166
|
+
| `filesChanged` | File paths modified (empty for read-only routes). |
|
|
167
|
+
| `commitSha` | Git SHA of the committed diff; `null` for read-only routes or when commit was skipped. |
|
|
168
|
+
|
|
169
|
+
`blockId` is not used for the delegate route — it is always `null`. To chain results, use the `terminalBlockId` from the batch's `contextBlockIds` field instead.
|
|
170
|
+
|
|
171
|
+
**The stages array** (always 9 rows) is the canonical telemetry log. `outcome` is one of:
|
|
172
|
+
- `advance` — stage ran and produced its payload
|
|
173
|
+
- `skip` — stage did not run; `comment` explains why
|
|
174
|
+
- `halt` — stage stopped the chain; `comment` is the failure message
|
|
175
|
+
- `not_run` — stage was not reached because a prior stage halted
|
|
176
|
+
|
|
177
|
+
Use `telemetry.haltedStage` to find the first halt; `telemetry.stopReason` to find why.
|
|
178
|
+
|
|
179
|
+
### Error response (4xx / 5xx)
|
|
180
|
+
|
|
181
|
+
```json
|
|
182
|
+
{
|
|
183
|
+
"error": "<code>",
|
|
184
|
+
"message": "<human-readable>",
|
|
185
|
+
"details": { /* optional structured context, e.g. fieldErrors for 400 */ }
|
|
186
|
+
}
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
`details` is optional and present only when the server has structured additional context.
|
|
90
190
|
|
|
91
191
|
@include _shared/budget-defaults.md
|
|
92
192
|
|
|
@@ -106,15 +206,7 @@ Anti-pattern alert: **`inline-labor-leakage`** (AP2). If you're reading 3+ files
|
|
|
106
206
|
|
|
107
207
|
Workers run concurrently and race on the file. **Fix:** dispatch sequentially, or merge into one prompt.
|
|
108
208
|
|
|
109
|
-
❌ **
|
|
110
|
-
> "improve the auth module"
|
|
111
|
-
|
|
112
|
-
Worker has no completion signal — likely returns `done_with_concerns`. **Fix:** specific verb + acceptance: `"Add input validation to login.ts so all string fields reject empty/whitespace; tests pass"`.
|
|
113
|
-
|
|
114
|
-
❌ **Defaulting to `agentType: "complex"` for everything**
|
|
115
|
-
Standard tier is 5–10× cheaper and finishes most edits. Escalate only when standard returns `filesWritten: 0` or `incompleteReason: "turn_cap"`.
|
|
116
|
-
|
|
117
|
-
❌ **Inlining a 50KB doc into every prompt**
|
|
209
|
+
❌ **Inlining a 50KB doc into every prompt**
|
|
118
210
|
N tasks × 50KB = N transmissions. **Fix:** register the doc once via `mma-context-blocks`, pass the `contextBlockIds` to each task.
|
|
119
211
|
|
|
120
212
|
❌ **Reading the worker's diff inline before review**
|
|
@@ -10,7 +10,7 @@ when_to_use: >-
|
|
|
10
10
|
superpowers:subagent-driven-development / superpowers:executing-plans —
|
|
11
11
|
workers are cheaper and don't pollute main context. Task descriptors must
|
|
12
12
|
match plan headings verbatim.
|
|
13
|
-
version: 4.
|
|
13
|
+
version: 4.6.0
|
|
14
14
|
---
|
|
15
15
|
|
|
16
16
|
# mma-execute-plan
|
|
@@ -83,7 +83,106 @@ BATCH_ID=$(echo "$BATCH" | jq -r '.batchId')
|
|
|
83
83
|
|
|
84
84
|
@include _shared/polling.md
|
|
85
85
|
|
|
86
|
-
|
|
86
|
+
## Response shapes
|
|
87
|
+
|
|
88
|
+
### POST /execute-plan?cwd=<abs> — dispatch response (202)
|
|
89
|
+
|
|
90
|
+
```json
|
|
91
|
+
{ "batchId": "<uuid>", "statusUrl": "/batch/<uuid>" }
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
Use `batchId` to poll. `statusUrl` is a convenience pointer.
|
|
95
|
+
|
|
96
|
+
### GET /batch/:id — polling response
|
|
97
|
+
|
|
98
|
+
The HTTP status is the state discriminator:
|
|
99
|
+
|
|
100
|
+
| Status | Meaning |
|
|
101
|
+
|---|---|
|
|
102
|
+
| `202 text/plain` | Still pending — body is the running headline string |
|
|
103
|
+
| `200 application/json` | Terminal — body is the batch envelope below |
|
|
104
|
+
| `404` / `401` / `5xx` | Error — see Error response below; stop polling |
|
|
105
|
+
|
|
106
|
+
### GET /batch/:id?taskIndex=N — single task slice
|
|
107
|
+
|
|
108
|
+
Same envelope. `results` contains exactly the task at index `N`. Returns `404 unknown_task_index` if `N` is out of range.
|
|
109
|
+
|
|
110
|
+
### Reading the task result
|
|
111
|
+
|
|
112
|
+
Each task result is the per-task wire object (`ComposePayload`):
|
|
113
|
+
|
|
114
|
+
```json
|
|
115
|
+
{
|
|
116
|
+
"completed": true,
|
|
117
|
+
"message": "Task completed; tests passed; one file changed.",
|
|
118
|
+
"findings": [
|
|
119
|
+
{
|
|
120
|
+
"id": "F1",
|
|
121
|
+
"severity": "high",
|
|
122
|
+
"category": "correctness",
|
|
123
|
+
"claim": "The function does not handle empty input",
|
|
124
|
+
"evidence": "function foo() { ... } // no null check",
|
|
125
|
+
"suggestion": "Add an explicit null guard at the top",
|
|
126
|
+
"source": "reviewer"
|
|
127
|
+
}
|
|
128
|
+
],
|
|
129
|
+
"summary": "Refactored utils.ts — removed 3 dead branches, added JSDoc",
|
|
130
|
+
"filesChanged": ["/project/src/utils.ts"],
|
|
131
|
+
"commitSha": "abc123def",
|
|
132
|
+
"blockId": null,
|
|
133
|
+
"telemetry": {
|
|
134
|
+
"totalDurationMs": 12400,
|
|
135
|
+
"totalCostUSD": 0.08,
|
|
136
|
+
"workerSelfAssessment": "done",
|
|
137
|
+
"reviewVerdict": "approved",
|
|
138
|
+
"commitOutcome": "committed",
|
|
139
|
+
"stopReason": "normal",
|
|
140
|
+
"haltedStage": null,
|
|
141
|
+
"stages": [
|
|
142
|
+
{ "name": "prepare", "outcome": "advance", "durationMs": 2, "costUSD": 0 },
|
|
143
|
+
{ "name": "register-block", "outcome": "skip", "comment": "register-block does not apply to route=execute-plan", "durationMs": 0, "costUSD": 0 },
|
|
144
|
+
{ "name": "implement", "outcome": "advance", "durationMs": 8900, "costUSD": 0.05 },
|
|
145
|
+
{ "name": "review", "outcome": "advance", "durationMs": 2100, "costUSD": 0.02 },
|
|
146
|
+
{ "name": "rework", "outcome": "skip", "comment": "rework skipped because review approved", "durationMs": 0, "costUSD": 0 },
|
|
147
|
+
{ "name": "commit", "outcome": "advance", "durationMs": 340, "costUSD": 0 },
|
|
148
|
+
{ "name": "annotate", "outcome": "advance", "durationMs": 890, "costUSD": 0.01 },
|
|
149
|
+
{ "name": "compose", "outcome": "advance", "durationMs": 68, "costUSD": 0 },
|
|
150
|
+
{ "name": "terminal", "outcome": "advance", "durationMs": 100, "costUSD": 0 }
|
|
151
|
+
]
|
|
152
|
+
}
|
|
153
|
+
}
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
**Top-level fields to read for the main-agent verdict:**
|
|
157
|
+
|
|
158
|
+
| Field | When `true` / populated |
|
|
159
|
+
|---|---|
|
|
160
|
+
| `completed: true` | Task succeeded. `message` is the summary; `findings` are post-review issues (if any). |
|
|
161
|
+
| `completed: false` | Task did not complete. `message` names the blocking gate or finding; `findings` carry any discovered issues. |
|
|
162
|
+
| `findings` | Issues surfaced by the worker or reviewer. `severity` = `critical` \| `high` \| `medium` \| `low`. `source` = `implementer` \| `reviewer`. |
|
|
163
|
+
| `filesChanged` | File paths modified (empty for read-only routes). |
|
|
164
|
+
| `commitSha` | Git SHA of the committed diff; `null` for read-only routes or when commit was skipped. |
|
|
165
|
+
| `blockId` | `terminalBlockId` — pass to `contextBlockIds` in a follow-up task to chain results without re-inlining. |
|
|
166
|
+
|
|
167
|
+
**The stages array** (always 9 rows) is the canonical telemetry log. `outcome` is one of:
|
|
168
|
+
- `advance` — stage ran and produced its payload
|
|
169
|
+
- `skip` — stage did not run; `comment` explains why
|
|
170
|
+
- `halt` — stage stopped the chain; `comment` is the failure message
|
|
171
|
+
- `not_run` — stage was not reached because a prior stage halted
|
|
172
|
+
|
|
173
|
+
Use `telemetry.haltedStage` to find the first halt; `telemetry.stopReason` to find why.
|
|
174
|
+
|
|
175
|
+
### Error response (4xx / 5xx)
|
|
176
|
+
|
|
177
|
+
```json
|
|
178
|
+
{
|
|
179
|
+
"error": "<code>",
|
|
180
|
+
"message": "<human-readable>",
|
|
181
|
+
"details": { /* optional structured context, e.g. fieldErrors for 400 */ }
|
|
182
|
+
}
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
`details` is optional and present only when the server has structured additional context.
|
|
87
186
|
|
|
88
187
|
@include _shared/budget-defaults.md
|
|
89
188
|
|
|
@@ -12,7 +12,7 @@ when_to_use: >-
|
|
|
12
12
|
out mma-investigate (internal) + mma-research (external) in parallel and
|
|
13
13
|
synthesise the results yourself. DO NOT use for convergent single-answer
|
|
14
14
|
questions — those are mma-investigate.
|
|
15
|
-
version: 4.
|
|
15
|
+
version: 4.6.0
|
|
16
16
|
---
|
|
17
17
|
|
|
18
18
|
# mma-explore
|
|
@@ -88,6 +88,21 @@ The main agent (you) issues a single message with two parallel tool calls:
|
|
|
88
88
|
mma-research { researchQuestion: "State-of-the-art streaming JSON parsers with backpressure?", background: "We use a single-pass push parser." }
|
|
89
89
|
```
|
|
90
90
|
|
|
91
|
+
## Reading the leg results
|
|
92
|
+
|
|
93
|
+
Both `mma-investigate` and `mma-research` return the v5 wire envelope (see `mma-investigate/SKILL.md` → "v5 wire shape"). Each sub-task result is a `ComposePayload` with the standard seven fields. The authoritative citation source is **`results[0].findings`** — an array of `{ id, severity, category, claim, evidence, suggestion, source }`.
|
|
94
|
+
|
|
95
|
+
Explore top-level orchestration aggregates sub-task results into a valid `ImplementPayload` (read-route shape) before the final `annotate` stage runs. Each sub-task follows the same v5 wire shape; the top-level result is a composition of those sub-tasks.
|
|
96
|
+
|
|
97
|
+
| Check | How |
|
|
98
|
+
|---|---|
|
|
99
|
+
| Did the leg succeed? | `results[0].completed === true` — findings may be zero on a read route; finding nothing wrong is a valid completion |
|
|
100
|
+
| Internal citation source | `results[0].findings[i].claim` plus a `file:LINE` token from `results[0].findings[i].evidence` (workers style them as `` `path:LINE` `` markdown-linked refs) |
|
|
101
|
+
| External citation source | `results[0].findings[i].claim` plus a source name / URL from `results[0].findings[i].evidence` |
|
|
102
|
+
| Divergence axis | `results[0].findings[i].category` groups findings by criterion — pick across categories so threads don't collapse onto one axis |
|
|
103
|
+
|
|
104
|
+
Apply a sentinel only when `findings` is empty AND `results[0].message` contains no finding-level content — i.e., the worker genuinely returned nothing. Do NOT apply a sentinel just because `results[0].message` reads tersely or `results[0].telemetry.workerSelfAssessment === 'failed'` — a worker can say `'failed'` with usable partial findings.
|
|
105
|
+
|
|
91
106
|
## Per-task report shape
|
|
92
107
|
|
|
93
108
|
Synthesis output (REQUIRED — your reply MUST contain these):
|
|
@@ -96,11 +111,11 @@ Produce **3–5 threads**. Each thread MUST have:
|
|
|
96
111
|
|
|
97
112
|
- A **title** and **one-paragraph summary**.
|
|
98
113
|
- One **internal citation** (from investigate) — `file/path.ts:LINE — claim`.
|
|
99
|
-
-
|
|
100
|
-
|
|
114
|
+
- Pick from `results[0].findings`: take `claim` as the citation claim and pull a `file:LINE` token out of `evidence`.
|
|
115
|
+
- Use the sentinel `(no internal anchor — fully greenfield)` ONLY when investigate was skipped, or `results[0].findings` is empty AND `results[0].message` contains no finding-level content. The top-level `message` alone is not evidence — see "Reading the leg results" above.
|
|
101
116
|
- One **external citation** (from research) — `<source> — claim`.
|
|
102
|
-
-
|
|
103
|
-
|
|
117
|
+
- Pick from `results[0].findings`: take `claim` as the citation claim and pull a source name / URL out of `evidence`.
|
|
118
|
+
- Use the sentinel `(no external source found)` only when `results[0].findings` is empty for the research leg.
|
|
104
119
|
- A **one-line divergence reason** — what makes this thread different from
|
|
105
120
|
the others. No two threads may share the same divergence axis.
|
|
106
121
|
|
|
@@ -142,6 +157,7 @@ directions in the data.
|
|
|
142
157
|
| Both failed | Report both errors to the user. Do NOT fabricate threads. |
|
|
143
158
|
| Investigate returned `needsCallerClarification: true` | Pause — surface the clarification need to the user. Do NOT synthesise over an unfinished investigation. |
|
|
144
159
|
| Research returned 0 usable sources | Sentinel on external lines. Add a one-line note in synthesis preamble: *"External research returned no usable sources — threads anchor on internal findings only."* |
|
|
160
|
+
| Investigate headline reads "0 citations" / "confidence unparseable" but `results[0].findings.length > 0` | Known stage-sync noise — IGNORE the headline. The leg succeeded; read `results[0].findings` directly. |
|
|
145
161
|
|
|
146
162
|
See `superpowers:brainstorming` as the natural follow-up — convergent narrowing
|
|
147
163
|
on a chosen thread.
|
|
@@ -12,7 +12,7 @@ when_to_use: >-
|
|
|
12
12
|
git-history queries. OR you are about to read 3+ files / run any grep in main
|
|
13
13
|
context — that's the inline-labor-leakage anti-pattern (AP2); delegate to this
|
|
14
14
|
skill instead.
|
|
15
|
-
version: 4.
|
|
15
|
+
version: 4.6.0
|
|
16
16
|
---
|
|
17
17
|
|
|
18
18
|
# mma-investigate
|
|
@@ -127,43 +127,70 @@ Each task carries an `investigation` field on its per-task report:
|
|
|
127
127
|
}
|
|
128
128
|
```
|
|
129
129
|
|
|
130
|
-
|
|
130
|
+
The authoritative success signals are `completed`, `message`, and `findings`. See "v5 wire shape" below for the full envelope.
|
|
131
131
|
|
|
132
|
-
##
|
|
132
|
+
## v5 wire shape (read route)
|
|
133
133
|
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
Every finding has the same shape:
|
|
134
|
+
Every task result is a `ComposePayload` — seven main-agent fields plus a telemetry block.
|
|
135
|
+
The main-agent fields are authoritative; the telemetry block is diagnostics.
|
|
137
136
|
|
|
138
|
-
|
|
137
|
+
```json
|
|
138
|
+
{
|
|
139
|
+
"completed": true,
|
|
140
|
+
"message": "Investigation complete; 3 files analysed.",
|
|
141
|
+
"findings": [
|
|
142
|
+
{
|
|
143
|
+
"id": "F1",
|
|
144
|
+
"severity": "high",
|
|
145
|
+
"category": "correctness",
|
|
146
|
+
"claim": "The refresh handler reads bearer from Authorization header unconditionally.",
|
|
147
|
+
"evidence": "src/auth/refresh.ts:45-72 — verbatim substring from worker output.",
|
|
148
|
+
"suggestion": "Add a guard to handle missing Authorization header gracefully.",
|
|
149
|
+
"source": "implementer"
|
|
150
|
+
}
|
|
151
|
+
],
|
|
152
|
+
"summary": "...",
|
|
153
|
+
"filesChanged": [],
|
|
154
|
+
"commitSha": null,
|
|
155
|
+
"blockId": null,
|
|
156
|
+
"telemetry": {
|
|
157
|
+
"totalDurationMs": 1234,
|
|
158
|
+
"totalCostUSD": 0.08,
|
|
159
|
+
"workerSelfAssessment": "done",
|
|
160
|
+
"reviewVerdict": null,
|
|
161
|
+
"commitOutcome": "not_applicable",
|
|
162
|
+
"stopReason": "normal",
|
|
163
|
+
"haltedStage": null,
|
|
164
|
+
"stages": [...]
|
|
165
|
+
}
|
|
166
|
+
}
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
### Key fields
|
|
170
|
+
|
|
171
|
+
| Field | When populated | Notes |
|
|
139
172
|
|---|---|---|
|
|
140
|
-
| `
|
|
141
|
-
| `
|
|
142
|
-
| `
|
|
143
|
-
| `
|
|
144
|
-
| `
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
(nulls last).
|
|
162
|
-
3. `severity` is the reviewer's authoritative final value — use it directly.
|
|
163
|
-
4. Mark findings with `evidenceGrounded: false` or
|
|
164
|
-
`annotatorConfidence < 70` as "lower-trust" (collapsed section, lighter
|
|
165
|
-
color, or `(low confidence)` annotation). User decides what to do.
|
|
166
|
-
5. Severity-tier counts feed the dashboard via V3 `findingsBySeverity`.
|
|
173
|
+
| `completed` | always | `true` when at least one criterion succeeded; `false` on annotator transport failure OR unmet annotate preconditions (e.g. non-`done` worker self-assessment on a read route) |
|
|
174
|
+
| `message` | always | human-readable summary; names blocking gates or finding IDs on failure |
|
|
175
|
+
| `findings` | always | `source: 'implementer'` for investigate; findings are the deliverable on read routes |
|
|
176
|
+
| `workerSelfAssessment` | always | `'done'` or `'failed'` — never `done_with_concerns` |
|
|
177
|
+
| `blockId` | always `null` | investigate is a task route, not register-context-block |
|
|
178
|
+
|
|
179
|
+
### No second review
|
|
180
|
+
|
|
181
|
+
The LLM-judge stage (`annotate`) runs once, after the worker's output. Its preconditions for read-route `completed: true`:
|
|
182
|
+
|
|
183
|
+
```
|
|
184
|
+
gates.implement.outcome === 'advance'
|
|
185
|
+
&& gates.implement.payload.workerSelfAssessment === 'done'
|
|
186
|
+
&& (criteriaSucceeded.length > 0 || criteriaErrors.length === 0)
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
Findings are the deliverable — a task that surfaces 5 issues is `completed: true`. Finding nothing wrong is also a valid completion.
|
|
190
|
+
|
|
191
|
+
### `completed: false` — what it means
|
|
192
|
+
|
|
193
|
+
On annotator transport failure, or when the annotate preconditions above are not met (e.g. the worker self-assessed as `failed`). The `message` names the blocking gate. Re-dispatch with tighter `filePaths` if the worker's citations were unusable.
|
|
167
194
|
|
|
168
195
|
## Best practices
|
|
169
196
|
|
|
@@ -180,15 +207,12 @@ Anti-pattern alert: **`inline-labor-leakage`** (AP2). If you find yourself readi
|
|
|
180
207
|
|
|
181
208
|
The investigator can't write — `tools: 'readonly'`. **Fix:** use `mma-delegate` for research-then-edit, or split: investigate first, then dispatch the edit.
|
|
182
209
|
|
|
183
|
-
❌ **Treating `done_with_concerns` as failure**
|
|
184
|
-
The worker still produced citations and a confidence level. Read them — partial coverage with `incompleteReason: 'turn_cap'` often answers the question well enough. Re-dispatch with a tighter scope only if the citations are unusable.
|
|
185
|
-
|
|
186
210
|
❌ **Inline-reading instead of delegating**
|
|
187
211
|
About to `Read` 3+ files just to answer one question? That's the wrong tradeoff — the worker reads on its cheap budget; you read its synthesis on yours.
|
|
188
212
|
|
|
189
213
|
## Terminal context block
|
|
190
214
|
|
|
191
|
-
Every completed task automatically registers a terminal markdown context block containing the full task report (headline, investigation synthesis, citations, and annotated findings). The `blockId` is returned in each task result
|
|
215
|
+
Every completed task automatically registers a terminal markdown context block containing the full task report (headline, investigation synthesis, citations, and annotated findings). The `blockId` is returned in each task result under the shared `blockId` field (not a separate `terminalBlockId` field). This block is immutable, lives for the session duration, and counts against the project's `maxEntries` quota (default 500).
|
|
192
216
|
|
|
193
217
|
**Use cases:**
|
|
194
218
|
- Pass investigation results to a downstream planning step
|
|
@@ -10,7 +10,7 @@ when_to_use: >-
|
|
|
10
10
|
others do, what published methods exist) AND mmagent is running. Delegate the
|
|
11
11
|
multi-source web/adapter research to a worker so the main context stays on
|
|
12
12
|
judgment. NOT for codebase questions — those are mma-investigate.
|
|
13
|
-
version: 4.
|
|
13
|
+
version: 4.6.0
|
|
14
14
|
---
|
|
15
15
|
|
|
16
16
|
# mma-research
|
|
@@ -89,10 +89,59 @@ BATCH_ID=$(echo "$BATCH" | jq -r '.batchId')
|
|
|
89
89
|
|
|
90
90
|
@include _shared/response-shape.md
|
|
91
91
|
|
|
92
|
-
## Per-task report shape
|
|
92
|
+
## Per-task report shape (v5 envelope)
|
|
93
|
+
|
|
94
|
+
Each `results[N]` is the v5 `ComposePayload`:
|
|
95
|
+
|
|
96
|
+
```json
|
|
97
|
+
{
|
|
98
|
+
"completed": true,
|
|
99
|
+
"message": "Research complete; 4 sources synthesised.",
|
|
100
|
+
"findings": [
|
|
101
|
+
{
|
|
102
|
+
"id": "F1",
|
|
103
|
+
"severity": "medium",
|
|
104
|
+
"category": "evidence",
|
|
105
|
+
"claim": "Pattern X is the canonical approach as of 2026 per upstream RFC.",
|
|
106
|
+
"evidence": "https://example.org/rfc/...",
|
|
107
|
+
"source": "implementer"
|
|
108
|
+
}
|
|
109
|
+
],
|
|
110
|
+
"summary": "Pattern X dominates; pattern Y is a 2024 fork.",
|
|
111
|
+
"filesChanged": [],
|
|
112
|
+
"commitSha": null,
|
|
113
|
+
"blockId": null,
|
|
114
|
+
"telemetry": {
|
|
115
|
+
"totalDurationMs": 12400,
|
|
116
|
+
"totalCostUSD": 0.06,
|
|
117
|
+
"workerSelfAssessment": "done",
|
|
118
|
+
"reviewVerdict": null,
|
|
119
|
+
"commitOutcome": "not_applicable",
|
|
120
|
+
"stopReason": "normal",
|
|
121
|
+
"haltedStage": null,
|
|
122
|
+
"stages": [
|
|
123
|
+
{ "name": "prepare", "outcome": "advance" },
|
|
124
|
+
{ "name": "implement", "outcome": "advance" },
|
|
125
|
+
{ "name": "annotate", "outcome": "advance" },
|
|
126
|
+
{ "name": "compose", "outcome": "advance" },
|
|
127
|
+
{ "name": "terminal", "outcome": "advance" }
|
|
128
|
+
]
|
|
129
|
+
}
|
|
130
|
+
}
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
| Field | Notes |
|
|
134
|
+
|---|---|
|
|
135
|
+
| `completed: true` | At least one criterion succeeded; sources synthesised. |
|
|
136
|
+
| `completed: false` | Annotator transport failure OR worker self-assessed as `failed`. `message` names the blocking gate. |
|
|
137
|
+
| `findings` | The deliverable. `source: 'implementer'`. Empty `findings` on a research route means "no signal found" — still a valid completion. |
|
|
138
|
+
| `workerSelfAssessment` | `'done'` or `'failed'` — never `done_with_concerns`. |
|
|
139
|
+
| `blockId` | Always `null` — research is a task route, not register-context-block. |
|
|
140
|
+
|
|
141
|
+
Legacy aliases (still emitted for back-compat):
|
|
93
142
|
|
|
94
143
|
```
|
|
95
|
-
results[0].structuredReport.findings[] //
|
|
144
|
+
results[0].structuredReport.findings[] // mirror of findings above
|
|
96
145
|
results[0].structuredReport.sourcesUsed[] // table of sources tried
|
|
97
146
|
results[0].output // raw narrative report
|
|
98
147
|
```
|