@zhixuan92/multi-model-agent-core 4.0.5 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/intake/brief-compiler-slots/delegate.d.ts +18 -0
- package/dist/intake/brief-compiler-slots/delegate.d.ts.map +1 -1
- package/dist/intake/brief-compiler-slots/delegate.js +36 -3
- package/dist/intake/brief-compiler-slots/delegate.js.map +1 -1
- package/dist/lifecycle/task-executor.d.ts.map +1 -1
- package/dist/lifecycle/task-executor.js +66 -77
- package/dist/lifecycle/task-executor.js.map +1 -1
- package/dist/reporting/report-parser-slots/investigate-report.d.ts.map +1 -1
- package/dist/reporting/report-parser-slots/investigate-report.js +41 -2
- package/dist/reporting/report-parser-slots/investigate-report.js.map +1 -1
- package/dist/review/templates/annotator-audit.d.ts.map +1 -1
- package/dist/review/templates/annotator-audit.js +5 -3
- package/dist/review/templates/annotator-audit.js.map +1 -1
- package/dist/review/templates/annotator-debug.d.ts.map +1 -1
- package/dist/review/templates/annotator-debug.js +11 -7
- package/dist/review/templates/annotator-debug.js.map +1 -1
- package/dist/review/templates/annotator-investigate.d.ts.map +1 -1
- package/dist/review/templates/annotator-investigate.js +6 -3
- package/dist/review/templates/annotator-investigate.js.map +1 -1
- package/dist/review/templates/annotator-review.d.ts.map +1 -1
- package/dist/review/templates/annotator-review.js +13 -6
- package/dist/review/templates/annotator-review.js.map +1 -1
- package/dist/review/templates/annotator-verify.d.ts.map +1 -1
- package/dist/review/templates/annotator-verify.js +11 -5
- package/dist/review/templates/annotator-verify.js.map +1 -1
- package/dist/tools/audit/implementer-criteria.d.ts +55 -4
- package/dist/tools/audit/implementer-criteria.d.ts.map +1 -1
- package/dist/tools/audit/implementer-criteria.js +113 -11
- package/dist/tools/audit/implementer-criteria.js.map +1 -1
- package/dist/tools/audit/schema.d.ts +3 -10
- package/dist/tools/audit/schema.d.ts.map +1 -1
- package/dist/tools/audit/schema.js +3 -4
- package/dist/tools/audit/schema.js.map +1 -1
- package/dist/tools/audit/tool-config.d.ts.map +1 -1
- package/dist/tools/audit/tool-config.js +42 -21
- package/dist/tools/audit/tool-config.js.map +1 -1
- package/dist/tools/debug/implementer-criteria.d.ts +44 -4
- package/dist/tools/debug/implementer-criteria.d.ts.map +1 -1
- package/dist/tools/debug/implementer-criteria.js +105 -13
- package/dist/tools/debug/implementer-criteria.js.map +1 -1
- package/dist/tools/debug/tool-config.d.ts.map +1 -1
- package/dist/tools/debug/tool-config.js +22 -4
- package/dist/tools/debug/tool-config.js.map +1 -1
- package/dist/tools/delegate/implementer-criteria.d.ts +62 -0
- package/dist/tools/delegate/implementer-criteria.d.ts.map +1 -0
- package/dist/tools/delegate/implementer-criteria.js +114 -0
- package/dist/tools/delegate/implementer-criteria.js.map +1 -0
- package/dist/tools/execute-plan/implementer-criteria.d.ts +52 -0
- package/dist/tools/execute-plan/implementer-criteria.d.ts.map +1 -0
- package/dist/tools/execute-plan/implementer-criteria.js +104 -0
- package/dist/tools/execute-plan/implementer-criteria.js.map +1 -0
- package/dist/tools/execute-plan/tool-config.d.ts.map +1 -1
- package/dist/tools/execute-plan/tool-config.js +17 -3
- package/dist/tools/execute-plan/tool-config.js.map +1 -1
- package/dist/tools/investigate/implementer-criteria.d.ts +48 -5
- package/dist/tools/investigate/implementer-criteria.d.ts.map +1 -1
- package/dist/tools/investigate/implementer-criteria.js +103 -13
- package/dist/tools/investigate/implementer-criteria.js.map +1 -1
- package/dist/tools/investigate/tool-config.d.ts.map +1 -1
- package/dist/tools/investigate/tool-config.js +15 -8
- package/dist/tools/investigate/tool-config.js.map +1 -1
- package/dist/tools/review/implementer-criteria.d.ts +47 -1
- package/dist/tools/review/implementer-criteria.d.ts.map +1 -1
- package/dist/tools/review/implementer-criteria.js +110 -9
- package/dist/tools/review/implementer-criteria.js.map +1 -1
- package/dist/tools/review/tool-config.d.ts.map +1 -1
- package/dist/tools/review/tool-config.js +39 -7
- package/dist/tools/review/tool-config.js.map +1 -1
- package/dist/tools/verify/implementer-criteria.d.ts +46 -0
- package/dist/tools/verify/implementer-criteria.d.ts.map +1 -1
- package/dist/tools/verify/implementer-criteria.js +103 -8
- package/dist/tools/verify/implementer-criteria.js.map +1 -1
- package/dist/tools/verify/tool-config.d.ts.map +1 -1
- package/dist/tools/verify/tool-config.js +18 -2
- package/dist/tools/verify/tool-config.js.map +1 -1
- package/package.json +1 -1
|
@@ -5,7 +5,7 @@ import { reviewReportSchema } from '../../reporting/report-parser-slots/review-r
|
|
|
5
5
|
import { reviewHeadlineTemplate } from '../../reporting/headline-templates/review.js';
|
|
6
6
|
import { DEFAULT_TASK_TIMEOUT_MS } from '../../config/schema.js';
|
|
7
7
|
import { SEVERITY_LADDER } from '../../review/templates/finding-criteria.js';
|
|
8
|
-
import { EVIDENCE_RULE_REVIEW, SCOPE_RULE_REVIEW, ANNOTATOR_AWARENESS_REVIEW, } from './implementer-criteria.js';
|
|
8
|
+
import { REVIEW_PURPOSE_ORIENTATION, EVIDENCE_RULE_REVIEW, SCOPE_RULE_REVIEW, ANNOTATOR_AWARENESS_REVIEW, CODE_REVIEW_FAILURE_MODES, THOROUGHNESS_REMINDER_REVIEW, } from './implementer-criteria.js';
|
|
9
9
|
export function registerReview(registry) {
|
|
10
10
|
registry.register({
|
|
11
11
|
routeName: 'review',
|
|
@@ -19,18 +19,34 @@ export function registerReview(registry) {
|
|
|
19
19
|
responseShapeName: 'BatchResponse',
|
|
20
20
|
});
|
|
21
21
|
}
|
|
22
|
+
/**
|
|
23
|
+
* Per-focus "done" conditions.
|
|
24
|
+
*
|
|
25
|
+
* The full failure-mode taxonomy in CODE_REVIEW_FAILURE_MODES applies to
|
|
26
|
+
* all reviews regardless of focus. These per-focus conditions tell the
|
|
27
|
+
* worker which lens to weight, not which categories to skip. Security,
|
|
28
|
+
* performance, and correctness lenses are universally applicable to
|
|
29
|
+
* every code change — the focus array picks emphasis, not gating.
|
|
30
|
+
*
|
|
31
|
+
* When focus is empty/missing, the worker performs a comprehensive sweep
|
|
32
|
+
* applying all four lenses with the executability/merge-safety
|
|
33
|
+
* orientation block at the top of the prompt.
|
|
34
|
+
*/
|
|
22
35
|
const REVIEW_DONE_CONDITIONS = {
|
|
23
|
-
security: '
|
|
24
|
-
performance: '
|
|
25
|
-
correctness: '
|
|
26
|
-
style: '
|
|
36
|
+
security: 'Lens emphasis: security. Apply the full failure-mode taxonomy through the security lens: auth bypass, injection (SQL/command/prompt), untrusted input flowing to a sink (eval/exec/HTML), data exposure, weakened sandboxing, and hardcoded secrets. Each finding has severity, location, and remediation.',
|
|
37
|
+
performance: 'Lens emphasis: performance. Apply the full failure-mode taxonomy through the performance lens: N+1 queries, unbounded loops, blocking I/O on hot paths, unnecessary deep clones, work shifted from build/init time to request time, and missing caching where the same value is recomputed. Each finding has impact level, location, and fix recommendation.',
|
|
38
|
+
correctness: 'Lens emphasis: correctness. Apply the full failure-mode taxonomy through the correctness lens: logic errors, off-by-one, unhandled edge cases (null/undefined/empty/timeout/error/zero/negative), type mismatches, contract violations, race conditions, and resource leaks. Each finding has severity, location, and correct behavior.',
|
|
39
|
+
style: 'Lens emphasis: style. Apply the full failure-mode taxonomy through the style lens: naming, formatting, dead code, inconsistent patterns, deprecated APIs, and missing types. Note: style is rarely the highest-value review lens for a non-trivial diff — sweep the correctness, security, and performance categories too.',
|
|
27
40
|
};
|
|
28
41
|
const DELTA_REVIEW_SUFFIX = ' Perform a full review (do not reduce thoroughness). Verify each prior finding as addressed or unaddressed. Omit addressed prior findings. Include unaddressed prior findings and new findings. End with a summary of which prior findings were resolved.';
|
|
29
42
|
function resolveReviewDoneCondition(focus, hasContextBlocks) {
|
|
43
|
+
let base;
|
|
30
44
|
if (!focus || focus.length === 0) {
|
|
31
|
-
|
|
45
|
+
base = 'Comprehensive code review. Apply the full failure-mode taxonomy (the orientation block above) through all four lenses (correctness, security, performance, style). Emphasize TEST GAP, CROSS-FILE RIPPLE, MISSING EDGE CASE, and IMPLICIT-CONTRACT ASSUMPTION — these are the categories most often missed and most likely to ship regressions. Each finding has category, severity, location, and recommendation.';
|
|
46
|
+
}
|
|
47
|
+
else {
|
|
48
|
+
base = focus.map(f => REVIEW_DONE_CONDITIONS[f] ?? '').filter(Boolean).join(' ');
|
|
32
49
|
}
|
|
33
|
-
const base = focus.map(f => REVIEW_DONE_CONDITIONS[f] ?? '').filter(Boolean).join(' ');
|
|
34
50
|
return hasContextBlocks ? base + DELTA_REVIEW_SUFFIX : base;
|
|
35
51
|
}
|
|
36
52
|
function buildReviewPrompt(brief) {
|
|
@@ -57,6 +73,13 @@ function buildReviewPrompt(brief) {
|
|
|
57
73
|
return parts.join('\n\n');
|
|
58
74
|
}
|
|
59
75
|
const FINDING_FORMAT_INSTRUCTIONS = [
|
|
76
|
+
// Orientation goes FIRST — the worker needs to know why this review
|
|
77
|
+
// exists (pre-merge gate, your verdict is authoritative, missing a
|
|
78
|
+
// regression here ships) before reading the format spec / taxonomy /
|
|
79
|
+
// evidence rules. Without it, workers do line-by-line proofreading and
|
|
80
|
+
// miss cross-file ripples and test gaps.
|
|
81
|
+
REVIEW_PURPOSE_ORIENTATION,
|
|
82
|
+
'',
|
|
60
83
|
'Produce a narrative code review. Use this EXACT per-finding format — both the structured reviewer and the deterministic fallback extract from this same format:',
|
|
61
84
|
'',
|
|
62
85
|
'## Finding 1: <one-line title>',
|
|
@@ -76,6 +99,15 @@ const FINDING_FORMAT_INSTRUCTIONS = [
|
|
|
76
99
|
'',
|
|
77
100
|
SEVERITY_LADDER,
|
|
78
101
|
'',
|
|
102
|
+
// Code-review failure-mode taxonomy. Without this block, workers
|
|
103
|
+
// calibrated on line-by-line proofreading miss the cross-file ripple,
|
|
104
|
+
// test gap, and implicit-contract findings that actually block merges.
|
|
105
|
+
CODE_REVIEW_FAILURE_MODES,
|
|
106
|
+
'',
|
|
107
|
+
// Counter-balances the SEVERITY_LADDER's anti-inflation hint and
|
|
108
|
+
// includes the cross-file pass with worked example.
|
|
109
|
+
THOROUGHNESS_REMINDER_REVIEW,
|
|
110
|
+
'',
|
|
79
111
|
EVIDENCE_RULE_REVIEW,
|
|
80
112
|
'',
|
|
81
113
|
SCOPE_RULE_REVIEW,
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"tool-config.js","sourceRoot":"","sources":["../../../src/tools/review/tool-config.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,WAAW,EAAE,MAAM,aAAa,CAAC;AAE1C,OAAO,EAAE,qBAAqB,EAAE,MAAM,iDAAiD,CAAC;AAGxF,OAAO,EAAE,eAAe,EAAoB,MAAM,6CAA6C,CAAC;AAChG,OAAO,EAAE,kBAAkB,EAAE,MAAM,sDAAsD,CAAC;AAC1F,OAAO,EAAE,sBAAsB,EAAE,MAAM,8CAA8C,CAAC;AACtF,OAAO,EAAE,uBAAuB,EAAE,MAAM,wBAAwB,CAAC;AACjE,OAAO,EAAE,eAAe,EAAE,MAAM,4CAA4C,CAAC;AAC7E,OAAO,EACL,oBAAoB,EACpB,iBAAiB,EACjB,0BAA0B,
|
|
1
|
+
{"version":3,"file":"tool-config.js","sourceRoot":"","sources":["../../../src/tools/review/tool-config.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,WAAW,EAAE,MAAM,aAAa,CAAC;AAE1C,OAAO,EAAE,qBAAqB,EAAE,MAAM,iDAAiD,CAAC;AAGxF,OAAO,EAAE,eAAe,EAAoB,MAAM,6CAA6C,CAAC;AAChG,OAAO,EAAE,kBAAkB,EAAE,MAAM,sDAAsD,CAAC;AAC1F,OAAO,EAAE,sBAAsB,EAAE,MAAM,8CAA8C,CAAC;AACtF,OAAO,EAAE,uBAAuB,EAAE,MAAM,wBAAwB,CAAC;AACjE,OAAO,EAAE,eAAe,EAAE,MAAM,4CAA4C,CAAC;AAC7E,OAAO,EACL,0BAA0B,EAC1B,oBAAoB,EACpB,iBAAiB,EACjB,0BAA0B,EAC1B,yBAAyB,EACzB,4BAA4B,GAC7B,MAAM,2BAA2B,CAAC;AAEnC,MAAM,UAAU,cAAc,CAAC,QAA6B;IAC1D,QAAQ,CAAC,QAAQ,CAAC;QAChB,SAAS,EAAE,QAAQ;QACnB,UAAU,EAAE,MAAM;QAClB,QAAQ,EAAE,SAAS;QACnB,OAAO,EAAE,MAAM;QACf,MAAM,EAAE,WAAW;QACnB,YAAY,EAAE,WAAW;QACzB,gBAAgB,EAAE,SAAS;QAC3B,oBAAoB,EAAE,KAAK;QAC3B,iBAAiB,EAAE,eAAe;KACnC,CAAC,CAAC;AACL,CAAC;AAED;;;;;;;;;;;;GAYG;AACH,MAAM,sBAAsB,GAA2B;IACrD,QAAQ,EACN,4SAA4S;IAC9S,WAAW,EACT,8VAA8V;IAChW,WAAW,EACT,yUAAyU;IAC3U,KAAK,EACH,4TAA4T;CAC/T,CAAC;AAEF,MAAM,mBAAmB,GAAG,2PAA2P,CAAC;AAExR,SAAS,0BAA0B,CAAC,KAA2B,EAAE,gBAAyB;IACxF,IAAI,IAAY,CAAC;IACjB,IAAI,CAAC,KAAK,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACjC,IAAI,GAAG,oZAAoZ,CAAC;IAC9Z,CAAC;SAAM,CAAC;QACN,IAAI,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,sBAAsB,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IACnF,CAAC;IACD,OAAO,gBAAgB,CAAC,CAAC,CAAC,IAAI,GAAG,mBAAmB,CAAC,CAAC,CAAC,IAAI,CAAC;AAC9D,CAAC;AAED,SAAS,iBAAiB,CAAC,KAAkB;IAC3C,MAAM,EAAE,IAAI,EAAE,SAAS,EAAE,KAAK,EAAE,gBAAgB,EAAE,QAAQ,EAAE,GAAG,KAAK,CAAC;IACrE,MAAM,KAAK,GAAa,CAAC,mBAAmB,CAAC,CAAC;IAE9C,IAAI,QAAQ,EAAE,CAAC;QACb,KAAK,CAAC,IAAI,CAAC,kCAAkC,QAAQ,EAAE,CAAC,CAAC;IAC3D,CAAC;SAAM,CAAC;QACN,IAAI,IAAI;YAAE,KAAK,CAAC,IAAI,CAAC,WAAW,IAAI,UAAU,CAAC,CAAC;QAChD,IAAI,SAAS,IAAI,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACtC,KAAK,CAAC,IAAI,CAAC,kCAAkC,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QAC1F,CAAC;QACD,IAAI,KAAK,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC;YAAE,KAAK,CAAC,IAAI,CAAC,gBAAgB,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IACjF,CAAC;IAED,gEAAgE;IAChE,oEAAoE;IACpE,IAAI,gBAAgB,EAAE,CAAC;QACrB,KAAK,CAAC,IAAI,CACR,yMAAyM,CAC1M,CAAC;IACJ,CAAC;IACD,KAAK,CAAC,IAAI,CAAC,2BAA2B,CAAC,CAAC;IAExC,OAAO,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;AAC5B,CAAC;AAED,MAAM,2BAA2B,GAAG;IAClC,oEAAoE;IACpE,mEAAmE;IACnE,qEAAqE;IACrE,uEAAuE;IACvE,yCAAyC;IACzC,0BAA0B;IAC1B,EAAE;IACF,iKAAiK;IACjK,EAAE;IACF,gCAAgC;IAChC,4CAA4C;IAC5C,uBAAuB;IACvB,oCAAoC;IACpC,2CAA2C;IAC3C,EAAE;IACF,gCAAgC;IAChC,iBAAiB;IACjB,OAAO;IACP,EAAE;IACF,QAAQ;IACR,8HAA8H;IAC9H,6GAA6G;IAC7G,mGAAmG;IACnG,EAAE;IACF,eAAe;IACf,EAAE;IACF,iEAAiE;IACjE,sEAAsE;IACtE,uEAAuE;IACvE,yBAAyB;IACzB,EAAE;IACF,iEAAiE;IACjE,oDAAoD;IACpD,4BAA4B;IAC5B,EAAE;IACF,oBAAoB;IACpB,EAAE;IACF,iBAAiB;IACjB,EAAE;IACF,0BAA0B;CAC3B,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAEb,MAAM,CAAC,MAAM,UAAU,GAA4C;IACjE,IAAI,EAAE,QAAQ;IACd,QAAQ,EAAE,WAAW;IACrB,SAAS,EAAE,SAAS;IACpB,SAAS,EAAE,eAAe;IAC1B,aAAa,EAAE,CAAC,KAAK,EAAE,GAAG,EAAE,EAAE;QAC5B,MAAM,MAAM,GAAG,iBAAiB,CAAC,KAAK,CAAC,CAAC;QACxC,oEAAoE;QACpE,kEAAkE;QAClE,iEAAiE;QACjE,+DAA+D;QAC/D,4EAA4E;QAC5E,wEAAwE;QACxE,MAAM,SAAS,GAAG,KAAK,CAAC,QAAQ;YAC9B,CAAC,CAAC,CAAC,KAAK,CAAC,QAAQ,CAAC;YAClB,CAAC,CAAC,CAAC,KAAK,CAAC,SAAS,IAAI,KAAK,CAAC,SAAS,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC;QAClF,OAAO;YACL,MAAM;YACN,SAAS,EAAE,SAAS;YACpB,YAAY,EAAE,cAAc;YAC5B,kBAAkB,EAAE,KAAK;YACzB,IAAI,EAAE,0BAA0B,CAAC,KAAK,CAAC,KAAK,EAAE,KAAK,CAAC,gBAAgB,CAAC;YACrE,KAAK,EAAE,GAAG,CAAC,MAAM,CAAC,QAAQ,EAAE,KAAK,IAAI,MAAM;YAC3C,SAAS,EAAE,GAAG,CAAC,MAAM,CAAC,QAAQ,EAAE,SAAS,IAAI,uBAAuB;YACpE,UAAU,EAAE,GAAG,CAAC,MAAM,CAAC,QAAQ,EAAE,UAAU,IAAI,EAAE;YACjD,aAAa,EAAE,GAAG,CAAC,MAAM,CAAC,QAAQ,EAAE,aAAa,IAAI,UAAU;YAC/D,GAAG,EAAE,GAAG,CAAC,cAAc,EAAE,GAAG,IAAI,GAAG,CAAC,GAAG;YACvC,eAAe,EAAE,KAAK,CAAC,eAAe;YACtC,SAAS;YACT,SAAS,EAAE,GAAG,CAAC,SAAS,IAAI,SAAS;YACrC,UAAU,EAAE,KAAK;SAClB,CAAC;IACJ,CAAC;IACD,YAAY,EAAE,kBAAkB;IAChC,gBAAgB,EAAE,sBAAsB;IACxC,eAAe,EAAE;QACf,SAAS,EAAE,qBAAqB;KACjC;CACF,CAAC"}
|
|
@@ -1,12 +1,58 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Verify-specific implementer criteria.
|
|
3
3
|
*
|
|
4
|
+
* VERIFY'S PURPOSE — read this before adding categories.
|
|
5
|
+
* mma-verify is the "are we lying when we say it's done?" gate. The
|
|
6
|
+
* caller is about to claim work is complete to a stakeholder; the
|
|
7
|
+
* verify output is the evidence trail for that claim. The success
|
|
8
|
+
* criterion is:
|
|
9
|
+
*
|
|
10
|
+
* "If you mark every item PASS, the caller can claim the work is
|
|
11
|
+
* done to a stakeholder without lying — and the stakeholder, given
|
|
12
|
+
* your evidence, can re-verify each item without re-doing the work."
|
|
13
|
+
*
|
|
14
|
+
* That criterion is what makes a finding load-bearing. A PASS marked
|
|
15
|
+
* on the basis of a prose claim ("the bug is fixed") rather than
|
|
16
|
+
* execution output or a file:line citation is a rubber stamp — the
|
|
17
|
+
* verify-equivalent of an unimplementable fix. A criterion that the
|
|
18
|
+
* worker could not actually verify from the supplied artifact must be
|
|
19
|
+
* marked FAIL with "cannot verify from this artifact" — not assumed-
|
|
20
|
+
* PASS or skipped.
|
|
21
|
+
*
|
|
4
22
|
* Verify walks an explicit checklist; each Finding maps 1:1 to one
|
|
5
23
|
* checklist item. Severity is bound to the result (PASS = low,
|
|
6
24
|
* FAIL = medium/high based on impact). Anything outside the checklist
|
|
7
25
|
* is out of scope, no exceptions.
|
|
8
26
|
*/
|
|
27
|
+
/**
|
|
28
|
+
* The orientation block. Goes at the TOP of every verify prompt.
|
|
29
|
+
*
|
|
30
|
+
* Without an explicit purpose statement, workers default to "rubber-
|
|
31
|
+
* stamp the checklist" — marking PASS based on prose claims in the
|
|
32
|
+
* work product instead of demanding execution-level evidence. With
|
|
33
|
+
* this orientation, every PASS comes with evidence a stakeholder
|
|
34
|
+
* could re-verify.
|
|
35
|
+
*/
|
|
36
|
+
export declare const VERIFY_PURPOSE_ORIENTATION: string;
|
|
9
37
|
export declare const EVIDENCE_RULE_VERIFY: string;
|
|
10
38
|
export declare const SCOPE_RULE_VERIFY: string;
|
|
39
|
+
/**
|
|
40
|
+
* The failure-mode taxonomy for verify.
|
|
41
|
+
*
|
|
42
|
+
* Without this block, workers tend to rubber-stamp PASS based on prose
|
|
43
|
+
* claims in the work product. The 7 categories below are the patterns
|
|
44
|
+
* a careful verifier would consciously check for.
|
|
45
|
+
*/
|
|
46
|
+
export declare const VERIFY_FAILURE_MODES: string;
|
|
47
|
+
/**
|
|
48
|
+
* Counter-balance to the SEVERITY_LADDER's anti-inflation hint.
|
|
49
|
+
*
|
|
50
|
+
* The shared severity ladder warns against inflation. For verify, the
|
|
51
|
+
* common failure is the OPPOSITE — workers UNDER-flag because they
|
|
52
|
+
* accept prose claims at face value (rubber stamp). This block tells
|
|
53
|
+
* the worker the typical verify failure is rubber-stamping, not
|
|
54
|
+
* over-skeptical FAIL marking.
|
|
55
|
+
*/
|
|
56
|
+
export declare const THOROUGHNESS_REMINDER_VERIFY: string;
|
|
11
57
|
export declare const ANNOTATOR_AWARENESS_VERIFY: string;
|
|
12
58
|
//# sourceMappingURL=implementer-criteria.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"implementer-criteria.d.ts","sourceRoot":"","sources":["../../../src/tools/verify/implementer-criteria.ts"],"names":[],"mappings":"AAAA
|
|
1
|
+
{"version":3,"file":"implementer-criteria.d.ts","sourceRoot":"","sources":["../../../src/tools/verify/implementer-criteria.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;GAyBG;AAEH;;;;;;;;GAQG;AACH,eAAO,MAAM,0BAA0B,QAe3B,CAAC;AAEb,eAAO,MAAM,oBAAoB,QAUrB,CAAC;AAEb,eAAO,MAAM,iBAAiB,QAKlB,CAAC;AAEb;;;;;;GAMG;AACH,eAAO,MAAM,oBAAoB,QAgBrB,CAAC;AAEb;;;;;;;;GAQG;AACH,eAAO,MAAM,4BAA4B,QAW7B,CAAC;AAEb,eAAO,MAAM,0BAA0B,QAS3B,CAAC"}
|
|
@@ -1,29 +1,124 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Verify-specific implementer criteria.
|
|
3
3
|
*
|
|
4
|
+
* VERIFY'S PURPOSE — read this before adding categories.
|
|
5
|
+
* mma-verify is the "are we lying when we say it's done?" gate. The
|
|
6
|
+
* caller is about to claim work is complete to a stakeholder; the
|
|
7
|
+
* verify output is the evidence trail for that claim. The success
|
|
8
|
+
* criterion is:
|
|
9
|
+
*
|
|
10
|
+
* "If you mark every item PASS, the caller can claim the work is
|
|
11
|
+
* done to a stakeholder without lying — and the stakeholder, given
|
|
12
|
+
* your evidence, can re-verify each item without re-doing the work."
|
|
13
|
+
*
|
|
14
|
+
* That criterion is what makes a finding load-bearing. A PASS marked
|
|
15
|
+
* on the basis of a prose claim ("the bug is fixed") rather than
|
|
16
|
+
* execution output or a file:line citation is a rubber stamp — the
|
|
17
|
+
* verify-equivalent of an unimplementable fix. A criterion that the
|
|
18
|
+
* worker could not actually verify from the supplied artifact must be
|
|
19
|
+
* marked FAIL with "cannot verify from this artifact" — not assumed-
|
|
20
|
+
* PASS or skipped.
|
|
21
|
+
*
|
|
4
22
|
* Verify walks an explicit checklist; each Finding maps 1:1 to one
|
|
5
23
|
* checklist item. Severity is bound to the result (PASS = low,
|
|
6
24
|
* FAIL = medium/high based on impact). Anything outside the checklist
|
|
7
25
|
* is out of scope, no exceptions.
|
|
8
26
|
*/
|
|
27
|
+
/**
|
|
28
|
+
* The orientation block. Goes at the TOP of every verify prompt.
|
|
29
|
+
*
|
|
30
|
+
* Without an explicit purpose statement, workers default to "rubber-
|
|
31
|
+
* stamp the checklist" — marking PASS based on prose claims in the
|
|
32
|
+
* work product instead of demanding execution-level evidence. With
|
|
33
|
+
* this orientation, every PASS comes with evidence a stakeholder
|
|
34
|
+
* could re-verify.
|
|
35
|
+
*/
|
|
36
|
+
export const VERIFY_PURPOSE_ORIENTATION = [
|
|
37
|
+
'Why this verify exists:',
|
|
38
|
+
'mma-verify is the "are we lying when we say it is done?" gate. Your output becomes the evidence trail behind a claim of completeness to a stakeholder. A wrong PASS here ships a false claim.',
|
|
39
|
+
'',
|
|
40
|
+
'For your output to clear that bar, every Finding must answer:',
|
|
41
|
+
'- Item: the exact criterion text (preserve the caller\'s wording).',
|
|
42
|
+
'- Result: PASS or FAIL — never "partial", "mostly", "in progress" — only PASS or FAIL.',
|
|
43
|
+
'- Evidence: how the stakeholder could re-verify this PASS or FAIL themselves. Acceptable evidence shapes:',
|
|
44
|
+
' 1. EXECUTION: a command + its observed output (test name + pass/fail line, build output, lint result).',
|
|
45
|
+
' 2. FILE-LEVEL: `file:line` citation showing the implementation that satisfies (or fails to satisfy) the criterion.',
|
|
46
|
+
' 3. NEGATIVE: an explicit "cannot verify from this artifact" with what would be needed to verify.',
|
|
47
|
+
'',
|
|
48
|
+
'A PASS without evidence is a rubber stamp — the worst possible verify failure mode. If you cannot demonstrate PASS, the result is FAIL, NOT assumed-PASS or skipped.',
|
|
49
|
+
'',
|
|
50
|
+
'The completion test: would a stakeholder who reads only your verification report and the named artifacts be able to re-verify each PASS themselves — and end up agreeing with each verdict?',
|
|
51
|
+
].join('\n');
|
|
9
52
|
export const EVIDENCE_RULE_VERIFY = [
|
|
10
53
|
'Evidence grounding (REQUIRED for every finding):',
|
|
11
54
|
'- Each Finding maps 1:1 to a checklist item (same count, same order).',
|
|
12
|
-
'- Evidence is
|
|
13
|
-
'
|
|
14
|
-
'-
|
|
55
|
+
'- Evidence is one of three shapes:',
|
|
56
|
+
' 1. EXECUTION: a command + its observed output. Quote the relevant line of the output (e.g. "12 passed, 0 failed", "✓ tests/foo.test.ts").',
|
|
57
|
+
' 2. FILE-LEVEL: `file:line` citation showing the implementation that satisfies (or fails) the criterion. Include the quoted excerpt.',
|
|
58
|
+
' 3. NEGATIVE: explicitly state "cannot verify from this artifact" plus what would be needed (a test run, a different file, a runtime check).',
|
|
59
|
+
'- A "the work product says it is done" claim is NOT evidence — that is a rubber stamp. Only execution output or file:line citations count.',
|
|
60
|
+
'- If you cannot demonstrate PASS, the result is FAIL — explain why in the Evidence field. Do NOT mark PASS without evidence and do NOT skip the item.',
|
|
61
|
+
'- Severity binding: PASS items are `low`. FAIL items are `medium` or `high` based on impact. Reserve `critical` for FAIL items that block the next step entirely (a release-blocking criterion failed, an acceptance test failed, etc.).',
|
|
15
62
|
].join('\n');
|
|
16
63
|
export const SCOPE_RULE_VERIFY = [
|
|
17
64
|
'Scope:',
|
|
18
65
|
'- Strictly the checklist items. One Finding per item, in checklist order, no skips.',
|
|
66
|
+
'- IMPLICIT criteria embedded in a checklist item ARE in scope. Example: a checklist item "fix the off-by-one bug in pagination" has an implicit sub-criterion "regression test added". If the implicit sub-criterion is not met, mark FAIL — do NOT split the item into two findings.',
|
|
19
67
|
'- Out of scope: any issue not tied to a checklist item, however interesting. Such observations may be noted in your summary section, but do NOT emit them as Findings.',
|
|
20
68
|
].join('\n');
|
|
69
|
+
/**
|
|
70
|
+
* The failure-mode taxonomy for verify.
|
|
71
|
+
*
|
|
72
|
+
* Without this block, workers tend to rubber-stamp PASS based on prose
|
|
73
|
+
* claims in the work product. The 7 categories below are the patterns
|
|
74
|
+
* a careful verifier would consciously check for.
|
|
75
|
+
*/
|
|
76
|
+
export const VERIFY_FAILURE_MODES = [
|
|
77
|
+
'Patterns to consciously check for. Apply on EVERY checklist item:',
|
|
78
|
+
'',
|
|
79
|
+
'1. CLAIM-WITHOUT-EVIDENCE — the work product says "the bug is fixed" or "tests added" but you have no execution output and no file:line citation backing the claim. Marking PASS on prose alone is a rubber stamp. Demand evidence; mark FAIL if none.',
|
|
80
|
+
'2. STALE EVIDENCE — the cited test run, build output, or commit was captured BEFORE the change being verified. Always check the timestamp / SHA / sequence — if the evidence predates the change, it does not demonstrate the criterion.',
|
|
81
|
+
'3. IMPLICIT-CRITERION GAP — a checklist item like "fix bug X" has an implicit sub-criterion (regression test added, no behavior change in unrelated code). If a reasonable stakeholder reading the criterion would expect the implicit sub-criterion, mark FAIL when it is not met. Do NOT silently exclude the implicit sub-criterion from the verdict.',
|
|
82
|
+
'4. PARTIAL COVERAGE — the criterion has multiple parts (e.g. "fix the bug AND add a regression test"). You only checked one part. Mark FAIL with explicit note of which part was satisfied and which was not. Do NOT mark PASS based on partial satisfaction.',
|
|
83
|
+
'5. CONFLATED CRITERIA — evidence for criterion B is used to claim criterion A. Each Finding\'s Evidence must directly demonstrate the criterion in that Finding\'s Item field, not a neighboring criterion.',
|
|
84
|
+
'6. WRONG-ARTIFACT EVIDENCE — the cited file or test name looks valid but does not actually exercise the change. Verify that the cited test name exists, that the cited line is in the cited file, and that the executed command produced the quoted output.',
|
|
85
|
+
'7. ASSUMED-PASS-ON-UNTESTED — the criterion cannot be verified from the artifact provided. The correct verdict is FAIL with "cannot verify from this artifact, would need X" — NOT assumed-PASS, NOT skipped, NOT marked PASS-with-caveats.',
|
|
86
|
+
'',
|
|
87
|
+
'Severity calibration for verify:',
|
|
88
|
+
'- critical: FAIL on a release-blocking criterion, acceptance test failure, security gate failure. The caller must NOT claim done.',
|
|
89
|
+
'- high: FAIL on a substantial criterion that affects the work product\'s correctness or safety. The claim of done is materially wrong.',
|
|
90
|
+
'- medium: FAIL on an implicit sub-criterion or partial coverage. The work is mostly done but the claim is partially overstated.',
|
|
91
|
+
'- low: PASS — every PASS is `low` severity regardless of importance, because the verdict is "done correctly" and severity reflects departure from done.',
|
|
92
|
+
].join('\n');
|
|
93
|
+
/**
|
|
94
|
+
* Counter-balance to the SEVERITY_LADDER's anti-inflation hint.
|
|
95
|
+
*
|
|
96
|
+
* The shared severity ladder warns against inflation. For verify, the
|
|
97
|
+
* common failure is the OPPOSITE — workers UNDER-flag because they
|
|
98
|
+
* accept prose claims at face value (rubber stamp). This block tells
|
|
99
|
+
* the worker the typical verify failure is rubber-stamping, not
|
|
100
|
+
* over-skeptical FAIL marking.
|
|
101
|
+
*/
|
|
102
|
+
export const THOROUGHNESS_REMINDER_VERIFY = [
|
|
103
|
+
'Thoroughness expectation for verify:',
|
|
104
|
+
'- The SEVERITY_LADDER warns against inflation. That warning is calibrated for code reviews — for verify, the common failure is the OPPOSITE: rubber-stamping PASS based on a prose claim instead of demanding execution-level evidence. Apply the failure-mode taxonomy first; THEN calibrate severity.',
|
|
105
|
+
'- For each checklist item, ASK: "could a stakeholder reading my evidence re-verify this PASS themselves and reach the same conclusion?" If no — even if you believe the criterion is met — the verdict is FAIL with "cannot verify from this artifact".',
|
|
106
|
+
'- Do not invent FAILs to hit a quota. But if every item is PASS and the artifact is non-trivial, double-check categories 1, 3, 4, and 7 (claim-without-evidence, implicit-criterion gap, partial coverage, assumed-PASS-on-untested) — these are the ones verifiers most often miss on first pass and the ones most likely to ship a false claim.',
|
|
107
|
+
'',
|
|
108
|
+
'Evidence-shape walk (REQUIRED on every checklist item):',
|
|
109
|
+
'- For each item, before writing the Finding, ask: which of the three evidence shapes do I have? EXECUTION, FILE-LEVEL, or NEGATIVE?',
|
|
110
|
+
'- If you cannot place your verdict in one of the three shapes, your evidence is insufficient — the verdict is FAIL with NEGATIVE evidence ("cannot verify from this artifact, would need X").',
|
|
111
|
+
'- Worked example. Checklist item: "Bug #123 is fixed (the off-by-one in pagination)." Work product says "fixed in commit abc123 — added a guard at the loop boundary." Naive verdict: PASS based on the work product\'s claim. Correct verdict: read commit abc123, find the changed lines, cite `src/pagination.ts:48` with the actual changed expression, AND check for a regression test (implicit sub-criterion of "fix the bug"). If no test exists, mark FAIL with explicit note "fix is in place but no regression test was added — implicit sub-criterion not met". Do NOT mark PASS on the work-product claim alone, and do NOT silently exclude the implicit sub-criterion.',
|
|
112
|
+
'- Most verifiers miss findings of this shape on first pass because the work product\'s prose is persuasive. The evidence-shape walk forces the demand for execution or file:line.',
|
|
113
|
+
].join('\n');
|
|
21
114
|
export const ANNOTATOR_AWARENESS_VERIFY = [
|
|
22
115
|
'After your output, an annotator validates each finding against this verify rubric:',
|
|
23
|
-
'- Does each Finding map to exactly one checklist item?',
|
|
24
|
-
'- Does the evidence actually demonstrate the claimed PASS or FAIL?',
|
|
25
|
-
'- Is the severity bound (PASS = low; FAIL = medium/high)?',
|
|
26
|
-
'- Are all checklist items covered?',
|
|
27
|
-
'
|
|
116
|
+
'- Does each Finding map to exactly one checklist item, in checklist order, with the criterion text preserved?',
|
|
117
|
+
'- Does the evidence actually demonstrate the claimed PASS or FAIL — and is it one of the three valid shapes (EXECUTION, FILE-LEVEL, or NEGATIVE)?',
|
|
118
|
+
'- Is the severity bound (PASS = low; FAIL = medium/high based on impact)?',
|
|
119
|
+
'- Are all checklist items covered, including ones the worker thought were trivial?',
|
|
120
|
+
'- For PASS items: could a stakeholder re-verify the PASS from the cited evidence alone?',
|
|
121
|
+
'- For FAIL items: is the FAIL backed by a specific shortfall (which sub-criterion missed, which test failed, which file does not implement what the criterion requires)?',
|
|
122
|
+
'Self-check before emitting. Findings that fail any check are downgraded or dropped — but FAIL with NEGATIVE evidence ("cannot verify from this artifact") is FULLY VALID and the correct verdict when the artifact is insufficient. Do NOT downgrade NEGATIVE-evidence FAILs to "cannot determine" or assumed-PASS.',
|
|
28
123
|
].join('\n');
|
|
29
124
|
//# sourceMappingURL=implementer-criteria.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"implementer-criteria.js","sourceRoot":"","sources":["../../../src/tools/verify/implementer-criteria.ts"],"names":[],"mappings":"AAAA
|
|
1
|
+
{"version":3,"file":"implementer-criteria.js","sourceRoot":"","sources":["../../../src/tools/verify/implementer-criteria.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;GAyBG;AAEH;;;;;;;;GAQG;AACH,MAAM,CAAC,MAAM,0BAA0B,GAAG;IACxC,yBAAyB;IACzB,+LAA+L;IAC/L,EAAE;IACF,+DAA+D;IAC/D,oEAAoE;IACpE,wFAAwF;IACxF,2GAA2G;IAC3G,4GAA4G;IAC5G,wHAAwH;IACxH,sGAAsG;IACtG,EAAE;IACF,sKAAsK;IACtK,EAAE;IACF,6LAA6L;CAC9L,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAEb,MAAM,CAAC,MAAM,oBAAoB,GAAG;IAClC,kDAAkD;IAClD,uEAAuE;IACvE,oCAAoC;IACpC,+IAA+I;IAC/I,yIAAyI;IACzI,iJAAiJ;IACjJ,4IAA4I;IAC5I,uJAAuJ;IACvJ,0OAA0O;CAC3O,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAEb,MAAM,CAAC,MAAM,iBAAiB,GAAG;IAC/B,QAAQ;IACR,qFAAqF;IACrF,uRAAuR;IACvR,wKAAwK;CACzK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAEb;;;;;;GAMG;AACH,MAAM,CAAC,MAAM,oBAAoB,GAAG;IAClC,mEAAmE;IACnE,EAAE;IACF,wPAAwP;IACxP,0OAA0O;IAC1O,0VAA0V;IAC1V,+PAA+P;IAC/P,6MAA6M;IAC7M,6PAA6P;IAC7P,6OAA6O;IAC7O,EAAE;IACF,kCAAkC;IAClC,mIAAmI;IACnI,wIAAwI;IACxI,iIAAiI;IACjI,yJAAyJ;CAC1J,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAEb;;;;;;;;GAQG;AACH,MAAM,CAAC,MAAM,4BAA4B,GAAG;IAC1C,sCAAsC;IACtC,ySAAyS;IACzS,yPAAyP;IACzP,mVAAmV;IACnV,EAAE;IACF,yDAAyD;IACzD,qIAAqI;IACrI,+LAA+L;IAC/L,upBAAupB;IACvpB,mLAAmL;CACpL,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAEb,MAAM,CAAC,MAAM,0BAA0B,GAAG;IACxC,oFAAoF;IACpF,+GAA+G;IAC/G,mJAAmJ;IACnJ,2EAA2E;IAC3E,oFAAoF;IACpF,yFAAyF;IACzF,0KAA0K;IAC1K,qTAAqT;CACtT,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"tool-config.d.ts","sourceRoot":"","sources":["../../../src/tools/verify/tool-config.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,mBAAmB,EAAE,MAAM,6CAA6C,CAAC;AAElF,OAAO,KAAK,EAAE,KAAK,EAAE,MAAM,aAAa,CAAC;AAEzC,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,sCAAsC,CAAC;
|
|
1
|
+
{"version":3,"file":"tool-config.d.ts","sourceRoot":"","sources":["../../../src/tools/verify/tool-config.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,mBAAmB,EAAE,MAAM,6CAA6C,CAAC;AAElF,OAAO,KAAK,EAAE,KAAK,EAAE,MAAM,aAAa,CAAC;AAEzC,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,sCAAsC,CAAC;AAevE,wBAAgB,cAAc,CAAC,QAAQ,EAAE,mBAAmB,GAAG,IAAI,CAYlE;AAID,MAAM,WAAW,WAAW;IAC1B,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,SAAS,EAAE,MAAM,EAAE,CAAC;IACpB,SAAS,EAAE,MAAM,EAAE,CAAC;IACpB,eAAe,EAAE,MAAM,EAAE,CAAC;IAC1B,0EAA0E;IAC1E,cAAc,CAAC,EAAE,MAAM,CAAC;CACzB;AAgFD,eAAO,MAAM,UAAU,EAAE,UAAU,CAAC,KAAK,EAAE,WAAW,EAAE,OAAO,CAqD9D,CAAC"}
|
|
@@ -4,7 +4,7 @@ import { verifyReportSchema } from '../../reporting/report-parser-slots/verify-r
|
|
|
4
4
|
import { verifyHeadlineTemplate } from '../../reporting/headline-templates/verify.js';
|
|
5
5
|
import { DEFAULT_TASK_TIMEOUT_MS } from '../../config/schema.js';
|
|
6
6
|
import { SEVERITY_LADDER } from '../../review/templates/finding-criteria.js';
|
|
7
|
-
import { EVIDENCE_RULE_VERIFY, SCOPE_RULE_VERIFY, ANNOTATOR_AWARENESS_VERIFY, } from './implementer-criteria.js';
|
|
7
|
+
import { VERIFY_PURPOSE_ORIENTATION, EVIDENCE_RULE_VERIFY, SCOPE_RULE_VERIFY, ANNOTATOR_AWARENESS_VERIFY, VERIFY_FAILURE_MODES, THOROUGHNESS_REMINDER_VERIFY, } from './implementer-criteria.js';
|
|
8
8
|
export function registerVerify(registry) {
|
|
9
9
|
registry.register({
|
|
10
10
|
routeName: 'verify',
|
|
@@ -20,13 +20,19 @@ export function registerVerify(registry) {
|
|
|
20
20
|
}
|
|
21
21
|
// ── Prompt builders (lifted from legacy executor) ──
|
|
22
22
|
const FINDING_FORMAT_INSTRUCTIONS = [
|
|
23
|
+
// Orientation goes FIRST — the worker needs to know why this verify
|
|
24
|
+
// exists (false-claim gate, every PASS must be re-verifiable) before
|
|
25
|
+
// reading the format spec / taxonomy / evidence rules. Without it,
|
|
26
|
+
// workers default to rubber-stamping based on prose claims.
|
|
27
|
+
VERIFY_PURPOSE_ORIENTATION,
|
|
28
|
+
'',
|
|
23
29
|
'For each checklist item, use this EXACT per-finding format — both the structured reviewer and the deterministic fallback extract from this same format:',
|
|
24
30
|
'',
|
|
25
31
|
'## Finding 1: <one-line title (the criterion summary)>',
|
|
26
32
|
'- Severity: critical | high | medium | low (use `low` for PASS items; `medium` or `high` for FAIL items per impact)',
|
|
27
33
|
'- Item: the criterion text',
|
|
28
34
|
'- Result: PASS or FAIL',
|
|
29
|
-
'- Evidence: file:line +
|
|
35
|
+
'- Evidence: EXECUTION (command + observed output), FILE-LEVEL (file:line + quoted excerpt), or NEGATIVE ("cannot verify from this artifact, would need X")',
|
|
30
36
|
'',
|
|
31
37
|
'## Finding 2: <one-line title>',
|
|
32
38
|
'- Severity: ...',
|
|
@@ -42,6 +48,16 @@ const FINDING_FORMAT_INSTRUCTIONS = [
|
|
|
42
48
|
// PASS -> low, FAIL -> medium/high based on impact.
|
|
43
49
|
SEVERITY_LADDER,
|
|
44
50
|
'',
|
|
51
|
+
// Verify failure-mode taxonomy. Without this block, workers tend to
|
|
52
|
+
// rubber-stamp PASS based on prose claims instead of demanding
|
|
53
|
+
// execution-level evidence. The 7 categories enumerate the ways a
|
|
54
|
+
// verifier can ship a false claim.
|
|
55
|
+
VERIFY_FAILURE_MODES,
|
|
56
|
+
'',
|
|
57
|
+
// Counter-balances the SEVERITY_LADDER's anti-inflation hint and
|
|
58
|
+
// includes the evidence-shape walk with worked example.
|
|
59
|
+
THOROUGHNESS_REMINDER_VERIFY,
|
|
60
|
+
'',
|
|
45
61
|
EVIDENCE_RULE_VERIFY,
|
|
46
62
|
'',
|
|
47
63
|
SCOPE_RULE_VERIFY,
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"tool-config.js","sourceRoot":"","sources":["../../../src/tools/verify/tool-config.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,WAAW,EAAE,MAAM,aAAa,CAAC;AAE1C,OAAO,EAAE,qBAAqB,EAAE,MAAM,iCAAiC,CAAC;AAGxE,OAAO,EAAE,kBAAkB,EAAE,MAAM,sDAAsD,CAAC;AAC1F,OAAO,EAAE,sBAAsB,EAAE,MAAM,8CAA8C,CAAC;AACtF,OAAO,EAAE,uBAAuB,EAAE,MAAM,wBAAwB,CAAC;AACjE,OAAO,EAAE,eAAe,EAAE,MAAM,4CAA4C,CAAC;AAC7E,OAAO,EACL,oBAAoB,EACpB,iBAAiB,EACjB,0BAA0B,
|
|
1
|
+
{"version":3,"file":"tool-config.js","sourceRoot":"","sources":["../../../src/tools/verify/tool-config.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,WAAW,EAAE,MAAM,aAAa,CAAC;AAE1C,OAAO,EAAE,qBAAqB,EAAE,MAAM,iCAAiC,CAAC;AAGxE,OAAO,EAAE,kBAAkB,EAAE,MAAM,sDAAsD,CAAC;AAC1F,OAAO,EAAE,sBAAsB,EAAE,MAAM,8CAA8C,CAAC;AACtF,OAAO,EAAE,uBAAuB,EAAE,MAAM,wBAAwB,CAAC;AACjE,OAAO,EAAE,eAAe,EAAE,MAAM,4CAA4C,CAAC;AAC7E,OAAO,EACL,0BAA0B,EAC1B,oBAAoB,EACpB,iBAAiB,EACjB,0BAA0B,EAC1B,oBAAoB,EACpB,4BAA4B,GAC7B,MAAM,2BAA2B,CAAC;AAEnC,MAAM,UAAU,cAAc,CAAC,QAA6B;IAC1D,QAAQ,CAAC,QAAQ,CAAC;QAChB,SAAS,EAAE,QAAQ;QACnB,UAAU,EAAE,MAAM;QAClB,QAAQ,EAAE,SAAS;QACnB,OAAO,EAAE,MAAM;QACf,MAAM,EAAE,WAAW;QACnB,YAAY,EAAE,WAAW;QACzB,gBAAgB,EAAE,SAAS;QAC3B,oBAAoB,EAAE,KAAK;QAC3B,iBAAiB,EAAE,eAAe;KACnC,CAAC,CAAC;AACL,CAAC;AAaD,sDAAsD;AAEtD,MAAM,2BAA2B,GAAG;IAClC,oEAAoE;IACpE,qEAAqE;IACrE,mEAAmE;IACnE,4DAA4D;IAC5D,0BAA0B;IAC1B,EAAE;IACF,yJAAyJ;IACzJ,EAAE;IACF,wDAAwD;IACxD,qHAAqH;IACrH,4BAA4B;IAC5B,wBAAwB;IACxB,4JAA4J;IAC5J,EAAE;IACF,gCAAgC;IAChC,iBAAiB;IACjB,OAAO;IACP,EAAE;IACF,QAAQ;IACR,6IAA6I;IAC7I,wGAAwG;IACxG,EAAE;IACF,sEAAsE;IACtE,qEAAqE;IACrE,oEAAoE;IACpE,oDAAoD;IACpD,eAAe;IACf,EAAE;IACF,oEAAoE;IACpE,+DAA+D;IAC/D,kEAAkE;IAClE,mCAAmC;IACnC,oBAAoB;IACpB,EAAE;IACF,iEAAiE;IACjE,wDAAwD;IACxD,4BAA4B;IAC5B,EAAE;IACF,oBAAoB;IACpB,EAAE;IACF,iBAAiB;IACjB,EAAE;IACF,0BAA0B;CAC3B,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAEb,SAAS,oBAAoB,CAAC,SAAoB;IAChD,IAAI,CAAC,SAAS,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IACpD,OAAO,kCAAkC,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;AACrF,CAAC;AAED,SAAS,iBAAiB,CACxB,IAAwB,EACxB,SAA+B,EAC/B,SAAmB;IAEnB,MAAM,KAAK,GAAa,CAAC,mBAAmB,CAAC,CAAC;IAC9C,IAAI,IAAI;QAAE,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAC3B,MAAM,WAAW,GAAG,oBAAoB,CAAC,SAAS,CAAC,CAAC;IACpD,IAAI,WAAW;QAAE,KAAK,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;IACzC,MAAM,aAAa,GAAG,SAAS,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,GAAG,CAAC,KAAK,IAAI,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IACjF,KAAK,CAAC,IAAI,CAAC,eAAe,aAAa,EAAE,CAAC,CAAC;IAC3C,KAAK,CAAC,IAAI,CAAC,2BAA2B,CAAC,CAAC;IACxC,OAAO,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;AAC5B,CAAC;AAED,SAAS,kBAAkB,CAAC,QAAgB,EAAE,cAAsB;IAClE,OAAO,GAAG,cAAc,sCAAsC,QAAQ,EAAE,CAAC;AAC3E,CAAC;AAED,SAAS,UAAU,CAAC,KAAyB;IAC3C,OAAO,KAAK,KAAK,SAAS,IAAI,KAAK,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,CAAC;AACxD,CAAC;AAED,mBAAmB;AAEnB,MAAM,CAAC,MAAM,UAAU,GAA4C;IACjE,IAAI,EAAE,QAAQ;IACd,QAAQ,EAAE,WAAW;IACrB,SAAS,EAAE,SAAS;IACpB,SAAS,EAAE,CAAC,KAAY,EAAiB,EAAE;QACzC,MAAM,SAAS,GAAG,CAAC,KAAK,CAAC,SAAS,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;QAE3E,IAAI,CAAC,UAAU,CAAC,KAAK,CAAC,IAAI,CAAC,IAAI,SAAS,CAAC,MAAM,IAAI,CAAC,EAAE,CAAC;YACrD,qDAAqD;YACrD,MAAM,cAAc,GAAG,iBAAiB,CAAC,SAAS,EAAE,SAAS,EAAE,KAAK,CAAC,SAAS,CAAC,CAAC;YAChF,OAAO,SAAS,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,CAAC,CAAC;gBAC1B,IAAI,EAAE,KAAK,CAAC,IAAI;gBAChB,SAAS,EAAE,CAAC,EAAE,CAAC;gBACf,SAAS,EAAE,KAAK,CAAC,SAAS;gBAC1B,eAAe,EAAE,KAAK,CAAC,eAAe,IAAI,EAAE;gBAC5C,cAAc;aACf,CAAC,CAAC,CAAC;QACN,CAAC;QAED,cAAc;QACd,OAAO,CAAC;gBACN,IAAI,EAAE,KAAK,CAAC,IAAI;gBAChB,SAAS;gBACT,SAAS,EAAE,KAAK,CAAC,SAAS;gBAC1B,eAAe,EAAE,KAAK,CAAC,eAAe,IAAI,EAAE;aAC7C,CAAC,CAAC;IACL,CAAC;IACD,aAAa,EAAE,CAAC,KAAkB,EAAE,GAAqB,EAAE,EAAE;QAC3D,MAAM,MAAM,GAAG,KAAK,CAAC,cAAc;YACjC,CAAC,CAAC,kBAAkB,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC,CAAE,EAAE,KAAK,CAAC,cAAc,CAAC;YAC/D,CAAC,CAAC,iBAAiB,CAAC,KAAK,CAAC,IAAI,EAAE,KAAK,CAAC,SAAS,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC,CAAC,SAAS,EAAE,KAAK,CAAC,SAAS,CAAC,CAAC;QAE7G,OAAO;YACL,MAAM;YACN,SAAS,EAAE,SAAS;YACpB,YAAY,EAAE,cAAc;YAC5B,kBAAkB,EAAE,KAAK;YACzB,IAAI,EAAE,yBAAyB,KAAK,CAAC,SAAS,CAAC,MAAM,yEAAyE;YAC9H,KAAK,EAAE,GAAG,CAAC,MAAM,CAAC,QAAQ,EAAE,KAAK,IAAI,MAAM;YAC3C,SAAS,EAAE,GAAG,CAAC,MAAM,CAAC,QAAQ,EAAE,SAAS,IAAI,uBAAuB;YACpE,UAAU,EAAE,GAAG,CAAC,MAAM,CAAC,QAAQ,EAAE,UAAU,IAAI,EAAE;YACjD,aAAa,EAAE,GAAG,CAAC,MAAM,CAAC,QAAQ,EAAE,aAAa,IAAI,UAAU;YAC/D,GAAG,EAAE,GAAG,CAAC,cAAc,EAAE,GAAG,IAAI,GAAG,CAAC,GAAG;YACvC,eAAe,EAAE,KAAK,CAAC,eAAe;YACtC,SAAS,EAAE,KAAK,CAAC,SAAS,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC,CAAC,SAAS;YACnE,SAAS,EAAE,GAAG,CAAC,SAAS,IAAI,SAAS;SACtC,CAAC;IACJ,CAAC;IACD,YAAY,EAAE,kBAAkB;IAChC,gBAAgB,EAAE,sBAAsB;IACxC,eAAe,EAAE;QACf,SAAS,EAAE,qBAAqB;KACjC;CACF,CAAC"}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@zhixuan92/multi-model-agent-core",
|
|
3
|
-
"version": "4.0
|
|
3
|
+
"version": "4.1.0",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"license": "MIT",
|
|
6
6
|
"description": "Core library for multi-model-agent: provider runners (Claude, Codex, OpenAI-compatible), routing logic, config schema, and tool/sandbox primitives.",
|