@zhixuan92/multi-model-agent 4.5.4 → 4.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71) hide show
  1. package/README.md +8 -6
  2. package/dist/cli/serve.d.ts.map +1 -1
  3. package/dist/cli/serve.js +39 -1
  4. package/dist/cli/serve.js.map +1 -1
  5. package/dist/http/async-dispatch.d.ts.map +1 -1
  6. package/dist/http/async-dispatch.js +22 -17
  7. package/dist/http/async-dispatch.js.map +1 -1
  8. package/dist/http/execution-context.d.ts.map +1 -1
  9. package/dist/http/execution-context.js +40 -10
  10. package/dist/http/execution-context.js.map +1 -1
  11. package/dist/http/handler-deps.d.ts +0 -6
  12. package/dist/http/handler-deps.d.ts.map +1 -1
  13. package/dist/http/handlers/control/batch.d.ts.map +1 -1
  14. package/dist/http/handlers/control/batch.js +50 -0
  15. package/dist/http/handlers/control/batch.js.map +1 -1
  16. package/dist/http/handlers/control/context-blocks.d.ts +0 -2
  17. package/dist/http/handlers/control/context-blocks.d.ts.map +1 -1
  18. package/dist/http/handlers/control/context-blocks.js +3 -1
  19. package/dist/http/handlers/control/context-blocks.js.map +1 -1
  20. package/dist/http/handlers/control/retry.js.map +1 -1
  21. package/dist/http/handlers/tools/audit.d.ts.map +1 -1
  22. package/dist/http/handlers/tools/audit.js +1 -11
  23. package/dist/http/handlers/tools/audit.js.map +1 -1
  24. package/dist/http/handlers/tools/debug.d.ts.map +1 -1
  25. package/dist/http/handlers/tools/debug.js +1 -11
  26. package/dist/http/handlers/tools/debug.js.map +1 -1
  27. package/dist/http/handlers/tools/delegate.d.ts.map +1 -1
  28. package/dist/http/handlers/tools/delegate.js +1 -11
  29. package/dist/http/handlers/tools/delegate.js.map +1 -1
  30. package/dist/http/handlers/tools/execute-plan.d.ts.map +1 -1
  31. package/dist/http/handlers/tools/execute-plan.js +1 -11
  32. package/dist/http/handlers/tools/execute-plan.js.map +1 -1
  33. package/dist/http/handlers/tools/investigate.d.ts.map +1 -1
  34. package/dist/http/handlers/tools/investigate.js +1 -11
  35. package/dist/http/handlers/tools/investigate.js.map +1 -1
  36. package/dist/http/handlers/tools/research.d.ts.map +1 -1
  37. package/dist/http/handlers/tools/research.js +1 -11
  38. package/dist/http/handlers/tools/research.js.map +1 -1
  39. package/dist/http/handlers/tools/retry.d.ts.map +1 -1
  40. package/dist/http/handlers/tools/retry.js +6 -16
  41. package/dist/http/handlers/tools/retry.js.map +1 -1
  42. package/dist/http/handlers/tools/review.d.ts.map +1 -1
  43. package/dist/http/handlers/tools/review.js +1 -11
  44. package/dist/http/handlers/tools/review.js.map +1 -1
  45. package/dist/http/request-observability.d.ts.map +1 -1
  46. package/dist/http/request-observability.js +6 -8
  47. package/dist/http/request-observability.js.map +1 -1
  48. package/dist/http/request-pipeline.d.ts +2 -0
  49. package/dist/http/request-pipeline.d.ts.map +1 -1
  50. package/dist/http/request-pipeline.js +15 -0
  51. package/dist/http/request-pipeline.js.map +1 -1
  52. package/dist/http/server.d.ts.map +1 -1
  53. package/dist/http/server.js +19 -43
  54. package/dist/http/server.js.map +1 -1
  55. package/dist/skills/mma-audit/SKILL.md +38 -27
  56. package/dist/skills/mma-context-blocks/SKILL.md +22 -1
  57. package/dist/skills/mma-debug/SKILL.md +38 -27
  58. package/dist/skills/mma-delegate/SKILL.md +102 -13
  59. package/dist/skills/mma-execute-plan/SKILL.md +100 -4
  60. package/dist/skills/mma-explore/SKILL.md +21 -5
  61. package/dist/skills/mma-investigate/SKILL.md +63 -39
  62. package/dist/skills/mma-research/SKILL.md +52 -3
  63. package/dist/skills/mma-retry/SKILL.md +102 -3
  64. package/dist/skills/mma-review/SKILL.md +38 -27
  65. package/dist/skills/multi-model-agent/SKILL.md +2 -2
  66. package/package.json +2 -2
  67. package/dist/http/middleware/body-size.d.ts +0 -7
  68. package/dist/http/middleware/body-size.d.ts.map +0 -1
  69. package/dist/http/middleware/body-size.js +0 -7
  70. package/dist/http/middleware/body-size.js.map +0 -1
  71. package/dist/skills/_shared/budget-defaults.md +0 -13
@@ -12,7 +12,7 @@ when_to_use: >-
12
12
  git-history queries. OR you are about to read 3+ files / run any grep in main
13
13
  context — that's the inline-labor-leakage anti-pattern (AP2); delegate to this
14
14
  skill instead.
15
- version: 4.5.4
15
+ version: 4.7.0
16
16
  ---
17
17
 
18
18
  # mma-investigate
@@ -88,7 +88,7 @@ digraph when_to_use {
88
88
  ❌ `{ "question": "Where is parseConfig called?" }` — searches the whole repo
89
89
  ✅ `{ "question": "Where is parseConfig called?", "filePaths": ["src/"] }` — bounded
90
90
 
91
- **Why:** the worker greps and reads under its cost ceiling. Without anchors, broad questions exhaust the budget before they finish.
91
+ **Why:** the worker greps and reads under a turn and wall-clock budget. Without anchors, broad questions exhaust those budgets before they finish.
92
92
 
93
93
  ## Full example
94
94
 
@@ -127,43 +127,70 @@ Each task carries an `investigation` field on its per-task report:
127
127
  }
128
128
  ```
129
129
 
130
- `workerStatus` is one of `done`, `done_with_concerns`, `needs_context`, `blocked`. When `done_with_concerns`, the per-task report carries `incompleteReason` (`turn_cap`, `cost_cap`, `timeout`, or `missing_sections`). When `needs_context`, the worker flagged a `[needs_context]` bullet under `## Unresolved` — re-dispatch with extra context (anchor paths or a context block).
130
+ The authoritative success signals are `completed`, `message`, and `findings`. See "v5 wire shape" below for the full envelope.
131
131
 
132
- ## Reading the findings (3.10.5+)
132
+ ## v5 wire shape (read route)
133
133
 
134
- The terminal envelope's `results[N].annotatedFindings` is a list of structured
135
- findings the reviewer extracted and scored from the implementer's narrative.
136
- Every finding has the same shape:
134
+ Every task result is a `ComposePayload` — seven main-agent fields plus a telemetry block.
135
+ The main-agent fields are authoritative; the telemetry block is diagnostics.
137
136
 
138
- | Field | Type | Notes |
137
+ ```json
138
+ {
139
+ "completed": true,
140
+ "message": "Investigation complete; 3 files analysed.",
141
+ "findings": [
142
+ {
143
+ "id": "F1",
144
+ "severity": "high",
145
+ "category": "correctness",
146
+ "claim": "The refresh handler reads bearer from Authorization header unconditionally.",
147
+ "evidence": "src/auth/refresh.ts:45-72 — verbatim substring from worker output.",
148
+ "suggestion": "Add a guard to handle missing Authorization header gracefully.",
149
+ "source": "implementer"
150
+ }
151
+ ],
152
+ "summary": "...",
153
+ "filesChanged": [],
154
+ "commitSha": null,
155
+ "blockId": null,
156
+ "telemetry": {
157
+ "totalDurationMs": 1234,
158
+ "totalCostUSD": 0.08,
159
+ "workerSelfAssessment": "done",
160
+ "reviewVerdict": null,
161
+ "commitOutcome": "not_applicable",
162
+ "stopReason": "normal",
163
+ "haltedStage": null,
164
+ "stages": [...]
165
+ }
166
+ }
167
+ ```
168
+
169
+ ### Key fields
170
+
171
+ | Field | When populated | Notes |
139
172
  |---|---|---|
140
- | `id` | string | Reviewer-assigned, e.g. `F1`, `F2`. |
141
- | `severity` | `'critical' \| 'high' \| 'medium' \| 'low'` | 4-tier. |
142
- | `claim` | string | One-sentence summary. |
143
- | `evidence` | string ≥20 chars | Quoted from worker output when grounded. |
144
- | `suggestion?` | string | Optional fix recommendation. |
145
- | `annotatorConfidence` | `number \| null` | 0–100 from the reviewer; `null` when emitted via deterministic fallback. |
146
- | `evidenceGrounded` | boolean | True when `evidence` is a verbatim substring of worker output. |
147
-
148
- ### Verdict states (`qualityReviewVerdict`)
149
-
150
- - `'annotated'` — every finding is structured. May be reviewer-emitted (with
151
- numeric `annotatorConfidence`) or deterministic-fallback (with
152
- `annotatorConfidence: null`). The route ALWAYS reaches `'annotated'` unless
153
- the reviewer call itself fails transport.
154
- - `'error'` — only when the reviewer call fails transport (network / 5xx).
155
-
156
- ### Recommended rendering by the main agent
157
-
158
- 1. Show ALL findings — never silently drop. Confidence and grounding are
159
- soft signals, not gates.
160
- 2. Default sort: severity (critical → low) then `annotatorConfidence` desc
161
- (nulls last).
162
- 3. `severity` is the reviewer's authoritative final value — use it directly.
163
- 4. Mark findings with `evidenceGrounded: false` or
164
- `annotatorConfidence < 70` as "lower-trust" (collapsed section, lighter
165
- color, or `(low confidence)` annotation). User decides what to do.
166
- 5. Severity-tier counts feed the dashboard via V3 `findingsBySeverity`.
173
+ | `completed` | always | `true` when at least one criterion succeeded; `false` on annotator transport failure OR unmet annotate preconditions (e.g. non-`done` worker self-assessment on a read route) |
174
+ | `message` | always | human-readable summary; names blocking gates or finding IDs on failure |
175
+ | `findings` | always | `source: 'implementer'` for investigate; findings are the deliverable on read routes |
176
+ | `workerSelfAssessment` | always | `'done'` or `'failed'` — never `done_with_concerns` |
177
+ | `blockId` | always `null` | investigate is a task route, not register-context-block |
178
+
179
+ ### No second review
180
+
181
+ The LLM-judge stage (`annotate`) runs once, after the worker's output. Its preconditions for read-route `completed: true`:
182
+
183
+ ```
184
+ gates.implement.outcome === 'advance'
185
+ && gates.implement.payload.workerSelfAssessment === 'done'
186
+ && (criteriaSucceeded.length > 0 || criteriaErrors.length === 0)
187
+ ```
188
+
189
+ Findings are the deliverable — a task that surfaces 5 issues is `completed: true`. Finding nothing wrong is also a valid completion.
190
+
191
+ ### `completed: false` — what it means
192
+
193
+ On annotator transport failure, or when the annotate preconditions are unmet (for example, the worker's self-assessment is not `done`). The `message` names the blocking gate. Re-dispatch with tighter `filePaths` if the worker's citations were unusable.
167
194
 
168
195
  ## Best practices
169
196
 
@@ -180,15 +207,12 @@ Anti-pattern alert: **`inline-labor-leakage`** (AP2). If you find yourself readi
180
207
 
181
208
  The investigator can't write — `tools: 'readonly'`. **Fix:** use `mma-delegate` for research-then-edit, or split: investigate first, then dispatch the edit.
182
209
 
183
- ❌ **Treating `done_with_concerns` as failure**
184
- The worker still produced citations and a confidence level. Read them — partial coverage with `incompleteReason: 'turn_cap'` often answers the question well enough. Re-dispatch with a tighter scope only if the citations are unusable.
185
-
186
210
  ❌ **Inline-reading instead of delegating**
187
211
  About to `Read` 3+ files just to answer one question? That's the wrong tradeoff — the worker reads on its cheap budget; you read its synthesis on yours.
188
212
 
189
213
  ## Terminal context block
190
214
 
191
- Every completed task automatically registers a terminal markdown context block containing the full task report (headline, investigation synthesis, citations, and annotated findings). The `blockId` is returned in each task result as `terminalBlockId`. This block is immutable, lives for the session duration, and counts against the project's `maxEntries` quota (default 500).
215
+ Every completed task automatically registers a terminal markdown context block containing the full task report (headline, investigation synthesis, citations, and annotated findings). The block's ID is returned in each task result under the shared `blockId` field (not a separate `terminalBlockId` field). This block is immutable, lives for the session duration, and counts against the project's `maxEntries` quota (default 500).
192
216
 
193
217
  **Use cases:**
194
218
  - Pass investigation results to a downstream planning step
@@ -10,7 +10,7 @@ when_to_use: >-
10
10
  others do, what published methods exist) AND mmagent is running. Delegate the
11
11
  multi-source web/adapter research to a worker so the main context stays on
12
12
  judgment. NOT for codebase questions — those are mma-investigate.
13
- version: 4.5.4
13
+ version: 4.7.0
14
14
  ---
15
15
 
16
16
  # mma-research
@@ -89,10 +89,59 @@ BATCH_ID=$(echo "$BATCH" | jq -r '.batchId')
89
89
 
90
90
  @include _shared/response-shape.md
91
91
 
92
- ## Per-task report shape
92
+ ## Per-task report shape (v5 envelope)
93
+
94
+ Each `results[N]` is the v5 `ComposePayload`:
95
+
96
+ ```json
97
+ {
98
+ "completed": true,
99
+ "message": "Research complete; 4 sources synthesised.",
100
+ "findings": [
101
+ {
102
+ "id": "F1",
103
+ "severity": "medium",
104
+ "category": "evidence",
105
+ "claim": "Pattern X is the canonical approach as of 2026 per upstream RFC.",
106
+ "evidence": "https://example.org/rfc/...",
107
+ "source": "implementer"
108
+ }
109
+ ],
110
+ "summary": "Pattern X dominates; pattern Y is a 2024 fork.",
111
+ "filesChanged": [],
112
+ "commitSha": null,
113
+ "blockId": null,
114
+ "telemetry": {
115
+ "totalDurationMs": 12400,
116
+ "totalCostUSD": 0.06,
117
+ "workerSelfAssessment": "done",
118
+ "reviewVerdict": null,
119
+ "commitOutcome": "not_applicable",
120
+ "stopReason": "normal",
121
+ "haltedStage": null,
122
+ "stages": [
123
+ { "name": "prepare", "outcome": "advance" },
124
+ { "name": "implement", "outcome": "advance" },
125
+ { "name": "annotate", "outcome": "advance" },
126
+ { "name": "compose", "outcome": "advance" },
127
+ { "name": "terminal", "outcome": "advance" }
128
+ ]
129
+ }
130
+ }
131
+ ```
132
+
133
+ | Field | Notes |
134
+ |---|---|
135
+ | `completed: true` | At least one criterion succeeded; sources synthesised. |
136
+ | `completed: false` | Annotator transport failure OR worker self-assessed as `failed`. `message` names the blocking gate. |
137
+ | `findings` | The deliverable. `source: 'implementer'`. Empty `findings` on a research route means "no signal found" — still a valid completion. |
138
+ | `workerSelfAssessment` | `'done'` or `'failed'` — never `done_with_concerns`. |
139
+ | `blockId` | Always `null` — research is a task route, not register-context-block. |
140
+
141
+ Legacy aliases (still emitted for back-compat):
93
142
 
94
143
  ```
95
- results[0].structuredReport.findings[] // numbered findings with citations
144
+ results[0].structuredReport.findings[] // mirror of findings above
96
145
  results[0].structuredReport.sourcesUsed[] // table of sources tried
97
146
  results[0].output // raw narrative report
98
147
  ```
@@ -10,7 +10,7 @@ when_to_use: >-
10
10
  re-try the failed indices only. Prefer this over re-dispatching the whole
11
11
  batch or inline-retrying — it's idempotent and preserves the original batch's
12
12
  diagnostics.
13
- version: 4.5.4
13
+ version: 4.7.0
14
14
  ---
15
15
 
16
16
  # mma-retry
@@ -41,7 +41,7 @@ digraph when_to_use {
41
41
  ```
42
42
 
43
43
  **Use when:**
44
- - A previous batch's terminal envelope shows mixed `done` / `done_with_concerns` / `failed`
44
+ - A previous batch's terminal envelope shows mixed `completed: true` / `completed: false`
45
45
  - 1–N tasks (but not all) need a re-run with the same config
46
46
  - You want to keep the original batch's diagnostics intact for comparison
47
47
 
@@ -88,7 +88,106 @@ BATCH_ID=$(echo "$BATCH" | jq -r '.batchId') # NEW batchId — not the origina
88
88
 
89
89
  @include _shared/polling.md
90
90
 
91
- @include _shared/response-shape.md
91
+ ## Response shapes
92
+
93
+ ### POST /retry?cwd=<abs> — dispatch response (202)
94
+
95
+ ```json
96
+ { "batchId": "<uuid>", "statusUrl": "/batch/<uuid>" }
97
+ ```
98
+
99
+ Use `batchId` to poll. `statusUrl` is a convenience pointer. **This is a new batchId** — polling the original batch returns its terminal state.
100
+
101
+ ### GET /batch/:id — polling response
102
+
103
+ The HTTP status is the state discriminator:
104
+
105
+ | Status | Meaning |
106
+ |---|---|
107
+ | `202 text/plain` | Still pending — body is the running headline string |
108
+ | `200 application/json` | Terminal — body is the batch envelope below |
109
+ | `404` / `401` / `5xx` | Error — see Error response below; stop polling |
110
+
111
+ ### GET /batch/:id?taskIndex=N — single task slice
112
+
113
+ Same envelope. `results` contains exactly the task at index `N`. Returns `404 unknown_task_index` if `N` is out of range.
114
+
115
+ ### Reading the task result
116
+
117
+ Each task result is the per-task wire object (`ComposePayload`):
118
+
119
+ ```json
120
+ {
121
+ "completed": true,
122
+ "message": "Task completed; tests passed; one file changed.",
123
+ "findings": [
124
+ {
125
+ "id": "F1",
126
+ "severity": "high",
127
+ "category": "correctness",
128
+ "claim": "The function does not handle empty input",
129
+ "evidence": "function foo() { ... } // no null check",
130
+ "suggestion": "Add an explicit null guard at the top",
131
+ "source": "reviewer"
132
+ }
133
+ ],
134
+ "summary": "Refactored utils.ts — removed 3 dead branches, added JSDoc",
135
+ "filesChanged": ["/project/src/utils.ts"],
136
+ "commitSha": "abc123def",
137
+ "blockId": null,
138
+ "telemetry": {
139
+ "totalDurationMs": 12400,
140
+ "totalCostUSD": 0.08,
141
+ "workerSelfAssessment": "done",
142
+ "reviewVerdict": "approved",
143
+ "commitOutcome": "committed",
144
+ "stopReason": "normal",
145
+ "haltedStage": null,
146
+ "stages": [
147
+ { "name": "prepare", "outcome": "advance", "durationMs": 2, "costUSD": 0 },
148
+ { "name": "register-block", "outcome": "skip", "comment": "register-block does not apply to route=delegate", "durationMs": 0, "costUSD": 0 },
149
+ { "name": "implement", "outcome": "advance", "durationMs": 8900, "costUSD": 0.05 },
150
+ { "name": "review", "outcome": "advance", "durationMs": 2100, "costUSD": 0.02 },
151
+ { "name": "rework", "outcome": "skip", "comment": "rework skipped because review approved", "durationMs": 0, "costUSD": 0 },
152
+ { "name": "commit", "outcome": "advance", "durationMs": 340, "costUSD": 0 },
153
+ { "name": "annotate", "outcome": "advance", "durationMs": 890, "costUSD": 0.01 },
154
+ { "name": "compose", "outcome": "advance", "durationMs": 68, "costUSD": 0 },
155
+ { "name": "terminal", "outcome": "advance", "durationMs": 100, "costUSD": 0 }
156
+ ]
157
+ }
158
+ }
159
+ ```
160
+
161
+ **Top-level fields to read for the main-agent verdict:**
162
+
163
+ | Field | When `true` / populated |
164
+ |---|---|
165
+ | `completed: true` | Task succeeded. `message` is the summary; `findings` are post-review issues (if any). |
166
+ | `completed: false` | Task did not complete. `message` names the blocking gate or finding; `findings` carry any discovered issues. |
167
+ | `findings` | Issues surfaced by the worker or reviewer. `severity` = `critical` \| `high` \| `medium` \| `low`. `source` = `implementer` \| `reviewer`. |
168
+ | `filesChanged` | File paths modified (empty for read-only routes). |
169
+ | `commitSha` | Git SHA of the committed diff; `null` for read-only routes or when commit was skipped. |
170
+ | `blockId` | `terminalBlockId` — pass to `contextBlockIds` in a follow-up task to chain results without re-inlining. |
171
+
172
+ **The stages array** (always 9 rows) is the canonical telemetry log. `outcome` is one of:
173
+ - `advance` — stage ran and produced its payload
174
+ - `skip` — stage did not run; `comment` explains why
175
+ - `halt` — stage stopped the chain; `comment` is the failure message
176
+ - `not_run` — stage was not reached because a prior stage halted
177
+
178
+ Use `telemetry.haltedStage` to find the first halt; `telemetry.stopReason` to find why.
179
+
180
+ ### Error response (4xx / 5xx)
181
+
182
+ ```json
183
+ {
184
+ "error": "<code>",
185
+ "message": "<human-readable>",
186
+ "details": { /* optional structured context, e.g. fieldErrors for 400 */ }
187
+ }
188
+ ```
189
+
190
+ `details` is optional and present only when the server has structured additional context.
92
191
 
93
192
  ## Best practices
94
193
 
@@ -10,7 +10,7 @@ when_to_use: >-
10
10
  AND mmagent is running. Delegate so each file reviews on its own worker; the
11
11
  main agent only decides what to merge. Review on SOURCE CODE — use mma-audit
12
12
  for prose specs / configs.
13
- version: 4.5.4
13
+ version: 4.7.0
14
14
  ---
15
15
 
16
16
  # mma-review
@@ -90,43 +90,54 @@ BATCH_ID=$(echo "$BATCH" | jq -r '.batchId')
90
90
 
91
91
  @include _shared/response-shape.md
92
92
 
93
- ## Reading the findings (3.10.5+)
93
+ ## Reading the findings
94
94
 
95
- The terminal envelope's `results[N].annotatedFindings` is a list of structured
96
- findings the reviewer extracted and scored from the implementer's narrative.
97
- Every finding has the same shape:
95
+ The main agent reads `completed` + `message` + `findings` — the findings are the answer. For
96
+ read-only routes, `filesChanged` is always `[]` and `commitSha` is always `null`.
97
+
98
+ ```json
99
+ {
100
+ "completed": true,
101
+ "message": "Review complete; 3 findings.",
102
+ "findings": [
103
+ { "id": "F1", "severity": "critical", "category": "test-gap",
104
+ "claim": "login.ts has no test for null username edge case.",
105
+ "evidence": "Worker read login.ts and grepped for test files — no null-case test found.",
106
+ "suggestion": "Add test case: `login(null) throws ValidationError`.",
107
+ "source": "reviewer" }
108
+ ],
109
+ "filesChanged": [],
110
+ "commitSha": null,
111
+ "summary": "...",
112
+ "telemetry": { ... }
113
+ }
114
+ ```
115
+
116
+ ### Finding shape
117
+
118
+ Every finding has this shape:
98
119
 
99
120
  | Field | Type | Notes |
100
121
  |---|---|---|
101
- | `id` | string | Reviewer-assigned, e.g. `F1`, `F2`. |
122
+ | `id` | string | Worker-assigned, e.g. `F1`, `F2`. Stable across chain. |
102
123
  | `severity` | `'critical' \| 'high' \| 'medium' \| 'low'` | 4-tier. |
124
+ | `category` | string | Topical bucket, e.g. `test-gap`, `cross-file-ripple`. |
103
125
  | `claim` | string | One-sentence summary. |
104
- | `evidence` | string ≥20 chars | Quoted from worker output when grounded. |
126
+ | `evidence` | string ≥20 chars | Verbatim from source when grounded. |
105
127
  | `suggestion?` | string | Optional fix recommendation. |
106
- | `annotatorConfidence` | `number \| null` | 0–100 from the reviewer; `null` when emitted via deterministic fallback. |
107
- | `evidenceGrounded` | boolean | True when `evidence` is a verbatim substring of worker output. |
108
-
109
- ### Verdict states (`qualityReviewVerdict`)
128
+ | `source` | `'implementer' \| 'reviewer'` | Who produced the finding. |
110
129
 
111
- - `'annotated'` — every finding is structured. May be reviewer-emitted (with
112
- numeric `annotatorConfidence`) or deterministic-fallback (with
113
- `annotatorConfidence: null`). The route ALWAYS reaches `'annotated'` unless
114
- the reviewer call itself fails transport.
115
- - `'error'` — only when the reviewer call fails transport (network / 5xx).
130
+ `annotatorConfidence` and `evidenceGrounded` are retired — they were v4 fields with no producers.
116
131
 
117
132
  ### Recommended rendering by the main agent
118
133
 
119
- 1. Show ALL findings — never silently drop. Confidence and grounding are
120
- soft signals, not gates.
121
- 2. Default sort: severity (critical → low) then `annotatorConfidence` desc
122
- (nulls last).
123
- 3. `severity` is the reviewer's authoritative final value — use it directly.
124
- 4. Mark findings with `evidenceGrounded: false` or
125
- `annotatorConfidence < 70` as "lower-trust" (collapsed section, lighter
126
- color, or `(low confidence)` annotation). User decides what to do.
127
- 5. Severity-tier counts feed the dashboard via V3 `findingsBySeverity`.
128
-
129
- @include _shared/budget-defaults.md
134
+ 1. Show ALL findings — never silently drop. Severity and grounding are soft
135
+ signals, not gates.
136
+ 2. Default sort: severity (critical → low), then `id` ascending.
137
+ 3. `severity` is the authoritative value — use it directly.
138
+ 4. Mark findings with `evidence` shorter than 30 chars as "low-evidence"
139
+ (lighter color or `(low evidence)` annotation). User decides what to do.
140
+ 5. Severity-tier counts feed the dashboard.
130
141
 
131
142
  ## Best practices
132
143
 
@@ -11,7 +11,7 @@ when_to_use: >-
11
11
  tasks — AND mmagent is running. Read this once, pick the matching mma-* skill,
12
12
  and delegate there. Applies equally whether the user invoked a superpowers
13
13
  methodology skill or asked directly.
14
- version: 4.5.4
14
+ version: 4.7.0
15
15
  ---
16
16
 
17
17
  # multi-model-agent (router)
@@ -153,7 +153,7 @@ Every request requires `Authorization: Bearer $MMAGENT_AUTH_TOKEN`. The token ro
153
153
  Only `mma-delegate` accepts `agentType: "standard" | "complex"` per task — default `"standard"` (cheaper, faster). Pick `"complex"` when:
154
154
 
155
155
  - The task touches many files or requires multi-step reasoning a standard-tier model cannot hold in context.
156
- - A prior standard run came back with `filesWritten: 0` or `incompleteReason: "turn_cap"` / `"cost_cap"` / `"timeout"`.
156
+ - A prior standard run came back with `filesWritten: 0` or `incompleteReason: "turn_cap"` / `"timeout"`.
157
157
  - The task is security-sensitive or ambiguous enough that being wrong is costly.
158
158
 
159
159
  Every other route hardcodes its tier and rejects `agentType` with HTTP 400:
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@zhixuan92/multi-model-agent",
3
- "version": "4.5.4",
3
+ "version": "4.7.0",
4
4
  "type": "module",
5
5
  "license": "MIT",
6
6
  "description": "Standalone HTTP server for multi-model-agent. Routes tool-invocation work to Claude, Codex, or OpenAI-compatible sub-agents with async-polling REST dispatch and installable skills for Claude Code, Gemini CLI, Codex CLI, and Cursor.",
@@ -53,7 +53,7 @@
53
53
  },
54
54
  "dependencies": {
55
55
  "@asteasolutions/zod-to-openapi": "^8.5.0",
56
- "@zhixuan92/multi-model-agent-core": "^4.5.4",
56
+ "@zhixuan92/multi-model-agent-core": "^4.7.0",
57
57
  "gray-matter": "^4.0.3",
58
58
  "minimist": "^1.2.8",
59
59
  "proper-lockfile": "^4.1.2",
@@ -1,7 +0,0 @@
1
- export declare const COMPRESSED_BODY_LIMIT_BYTES: number;
2
- /** Decompressed body cap — 2 MiB. Enforced by the decompress middleware. */
3
- export declare const DECOMPRESSED_BODY_LIMIT_BYTES: number;
4
- export declare function buildServerOpts(): {
5
- bodyLimit: number;
6
- };
7
- //# sourceMappingURL=body-size.d.ts.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"body-size.d.ts","sourceRoot":"","sources":["../../../src/http/middleware/body-size.ts"],"names":[],"mappings":"AAAA,eAAO,MAAM,2BAA2B,QAAa,CAAC;AAEtD,4EAA4E;AAC5E,eAAO,MAAM,6BAA6B,QAAkB,CAAC;AAE7D,wBAAgB,eAAe;;EAE9B"}
@@ -1,7 +0,0 @@
1
- export const COMPRESSED_BODY_LIMIT_BYTES = 256 * 1024; // 256 KiB
2
- /** Decompressed body cap — 2 MiB. Enforced by the decompress middleware. */
3
- export const DECOMPRESSED_BODY_LIMIT_BYTES = 2 * 1024 * 1024; // 2 MiB
4
- export function buildServerOpts() {
5
- return { bodyLimit: COMPRESSED_BODY_LIMIT_BYTES };
6
- }
7
- //# sourceMappingURL=body-size.js.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"body-size.js","sourceRoot":"","sources":["../../../src/http/middleware/body-size.ts"],"names":[],"mappings":"AAAA,MAAM,CAAC,MAAM,2BAA2B,GAAG,GAAG,GAAG,IAAI,CAAC,CAAC,UAAU;AAEjE,4EAA4E;AAC5E,MAAM,CAAC,MAAM,6BAA6B,GAAG,CAAC,GAAG,IAAI,GAAG,IAAI,CAAC,CAAC,QAAQ;AAEtE,MAAM,UAAU,eAAe;IAC7B,OAAO,EAAE,SAAS,EAAE,2BAA2B,EAAE,CAAC;AACpD,CAAC"}
@@ -1,13 +0,0 @@
1
- ## Budget defaults
2
-
3
- | Constant | Value | Notes |
4
- |---|---|---|
5
- | Task timeout | 1 h (3,600,000 ms) | Wall-clock cap per task |
6
- | Stall timeout | 20 min (1,200,000 ms) | Idle gap before force-abort |
7
- | Max cost | $10 USD | Per-task cost cap |
8
- | Cost pre-stop ratio | 0.80 | Pre-stop threshold; see cushion semantics below |
9
- | Time pre-stop ratio | 0.80 | Same pre-stop for timeouts |
10
-
11
- **Cushion semantics:** `MAX_COST_PRESTOP_RATIO` and `MAX_TIME_PRESTOP_RATIO` are *pre-stop thresholds*, not overshoot allowances. The runtime warns and may refuse new turns when cost reaches `DEFAULT_MAX_COST_USD × MAX_COST_PRESTOP_RATIO` ($8), but allows an already-in-flight turn to complete. The worst-case total is therefore `DEFAULT_MAX_COST_USD / MAX_COST_PRESTOP_RATIO` ($12.50). Same logic applies to time: worst-case = `DEFAULT_TASK_TIMEOUT_MS / MAX_TIME_PRESTOP_RATIO` (1.25 h).
12
-
13
- Callers can override `maxCostUSD` per task. Timeouts are config-wide defaults set in the server config file.