edsger 0.55.4 → 0.56.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/commands/quality-benchmark/index.d.ts +32 -0
- package/dist/commands/quality-benchmark/index.js +124 -0
- package/dist/index.js +24 -0
- package/dist/phases/quality-benchmark/index.d.ts +65 -0
- package/dist/phases/quality-benchmark/index.js +194 -0
- package/dist/phases/quality-benchmark/mcp-server.d.ts +46 -0
- package/dist/phases/quality-benchmark/mcp-server.js +252 -0
- package/dist/phases/quality-benchmark/parsers.d.ts +22 -0
- package/dist/phases/quality-benchmark/parsers.js +1022 -0
- package/dist/phases/quality-benchmark/prompts.d.ts +31 -0
- package/dist/phases/quality-benchmark/prompts.js +154 -0
- package/dist/phases/quality-benchmark/rubric.md +1066 -0
- package/dist/phases/quality-benchmark/tool-catalog.d.ts +33 -0
- package/dist/phases/quality-benchmark/tool-catalog.js +597 -0
- package/dist/phases/quality-benchmark/tool-runner.d.ts +69 -0
- package/dist/phases/quality-benchmark/tool-runner.js +399 -0
- package/dist/phases/quality-benchmark/types.d.ts +312 -0
- package/dist/phases/quality-benchmark/types.js +23 -0
- package/package.json +4 -4
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* CLI command: `edsger quality-benchmark <productId>`
|
|
3
|
+
*
|
|
4
|
+
* Runs the quality-benchmark phase against a local repo checkout and
|
|
5
|
+
* writes the resulting JSON report to disk. Unless `--no-save` is given,
|
|
6
|
+
* the report is then also persisted through the `quality_reports/save`
|
|
7
|
+
* MCP endpoint (`--force` overwrites an existing row); if that call
|
|
8
|
+
* fails, a warning is logged and the local file is kept.
|
|
9
|
+
*
|
|
10
|
+
* Usage:
|
|
11
|
+
* edsger quality-benchmark <productId>
|
|
12
|
+
* --repo <path> (required) local repo checkout
|
|
13
|
+
* --branch <name> (optional) branch name (informational)
|
|
14
|
+
* --pkg-manager <name> (optional) npm|pnpm|yarn (auto-detected if absent)
|
|
15
|
+
* --no-install refuse to install missing tools
|
|
16
|
+
* --output <path> where to write the JSON report
|
|
17
|
+
* (default: ./quality-report-<commit>.json)
|
|
18
|
+
* --verbose print every progress event
|
|
19
|
+
*/
|
|
20
|
+
/**
 * Flags accepted by `runQualityBenchmarkCli`, in the shape the
 * `edsger quality-benchmark` command hands them over.
 */
export interface QualityBenchmarkCliOptions {
    /** `--repo`: path to the local checkout; the command resolves it to an absolute path. */
    repo: string;
    /** `--branch`: branch name, recorded in the report as-is (`null` when absent). */
    branch?: string;
    /** `--pkg-manager`: forwarded to the benchmark phase as `packageManager`. */
    pkgManager?: string;
    /** Install consent. Only an explicit `false` (from `--no-install`) disables installs. */
    install?: boolean;
    /** `--output`: report destination; defaults to `./quality-report-<commit>.json`. */
    output?: string;
    /** `--verbose`: print every progress event. */
    verbose?: boolean;
    /** Only an explicit `false` (from `--no-save`) skips the `quality_reports/save` MCP call. */
    save?: boolean;
    /** `--force`: ask the save call to replace an existing row for this (product, commit, rubric). */
    force?: boolean;
}
|
|
32
|
+
/**
 * Entry point for `edsger quality-benchmark <productId>`.
 *
 * @param productId Product the benchmark report is attributed to.
 * @param options Parsed command-line flags.
 * @returns A promise that settles when the command has finished.
 */
export declare function runQualityBenchmarkCli(
    productId: string,
    options: QualityBenchmarkCliOptions,
): Promise<void>;
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* CLI command: `edsger quality-benchmark <productId>`
|
|
3
|
+
*
|
|
4
|
+
* Runs the quality-benchmark phase against a local repo checkout and
|
|
5
|
+
* writes the resulting JSON report to disk. Unless `--no-save` is given,
|
|
6
|
+
* the report is then also persisted through the `quality_reports/save`
|
|
7
|
+
* MCP endpoint (`--force` overwrites an existing row); if that call
|
|
8
|
+
* fails, a warning is logged and the local file is kept.
|
|
9
|
+
*
|
|
10
|
+
* Usage:
|
|
11
|
+
* edsger quality-benchmark <productId>
|
|
12
|
+
* --repo <path> (required) local repo checkout
|
|
13
|
+
* --branch <name> (optional) branch name (informational)
|
|
14
|
+
* --pkg-manager <name> (optional) npm|pnpm|yarn (auto-detected if absent)
|
|
15
|
+
* --no-install refuse to install missing tools
|
|
16
|
+
* --output <path> where to write the JSON report
|
|
17
|
+
* (default: ./quality-report-<commit>.json)
|
|
18
|
+
* --verbose print every progress event
|
|
19
|
+
*/
|
|
20
|
+
import { mkdirSync, writeFileSync } from 'node:fs';
|
|
21
|
+
import { dirname, resolve } from 'node:path';
|
|
22
|
+
import { callMcpEndpoint } from '../../api/mcp-client.js';
|
|
23
|
+
import { fetchProductBasics } from '../../phases/find-shared/mcp.js';
|
|
24
|
+
import { runQualityBenchmark, } from '../../phases/quality-benchmark/index.js';
|
|
25
|
+
import { logError, logInfo, logSuccess, logWarning, } from '../../utils/logger.js';
|
|
26
|
+
/**
 * Implements `edsger quality-benchmark <productId>`.
 *
 * Runs the benchmark phase against `options.repo`, writes the report
 * envelope as JSON to `options.output` (default:
 * `./quality-report-<first 8 chars of the commit sha>.json` in the current
 * working directory) and, unless `options.save` is `false`, persists the
 * report through the `quality_reports/save` MCP endpoint.
 *
 * Failure handling:
 *  - a failed product lookup falls back to using `productId` as the name;
 *  - a failed benchmark logs the error and exits the process with code 1;
 *  - a failed save only logs a warning, the local file is kept.
 *
 * @param productId Product the report belongs to.
 * @param options Parsed CLI flags (see `QualityBenchmarkCliOptions`).
 * @returns Resolves once the report is written and the save was attempted.
 */
export async function runQualityBenchmarkCli(productId, options) {
    const repoRoot = resolve(options.repo);
    // Install consent is opt-out: only an explicit `false` disables it.
    const installEnabled = options.install !== false;
    logInfo(`Starting quality benchmark for product ${productId}`);
    logInfo(`Repo: ${repoRoot}`);
    if (!installEnabled) {
        logWarning('Install consent NOT granted (--no-install). Missing tools will be marked unmeasured.');
    }
    const basics = await fetchProductBasics(productId).catch(() => ({
        name: productId,
    }));
    const productName = basics.name;
    const onProgress = (event) => {
        // `verbose` is forwarded to runQualityBenchmark below, and in verbose
        // mode it already logs every event it emits. Logging here as well
        // printed each progress line twice, so stay silent in that case.
        if (options.verbose) {
            return;
        }
        // Non-verbose: surface phase changes and tool completions only.
        if (event.phase === 'execution' ||
            event.phase === 'synthesis' ||
            event.phase === 'installation') {
            logInfo(`[${event.phase}] ${event.message}`);
        }
    };
    const outcome = await runQualityBenchmark({
        productId,
        productName,
        repoRoot,
        branch: options.branch,
        packageManager: options.pkgManager,
        installEnabled,
        onProgress,
        verbose: options.verbose,
    });
    if (outcome.status === 'error') {
        logError(`Quality benchmark failed: ${outcome.message}`);
        if (outcome.lastPhase) {
            logError(`Last phase: ${outcome.lastPhase}`);
        }
        process.exit(1);
    }
    // Write the report to disk first so it survives a failed save.
    const outputPath = options.output ??
        resolve(process.cwd(), `quality-report-${outcome.commitSha.slice(0, 8)}.json`);
    const reportEnvelope = {
        run_id: outcome.runId,
        product_id: productId,
        commit_sha: outcome.commitSha,
        branch: options.branch ?? null,
        started_at: outcome.startedAt,
        completed_at: outcome.completedAt,
        duration_seconds: outcome.durationSeconds,
        payload: outcome.report,
    };
    mkdirSync(dirname(outputPath), { recursive: true });
    writeFileSync(outputPath, JSON.stringify(reportEnvelope, null, 2), 'utf8');
    logSuccess(`Report written to ${outputPath}`);
    // Persist via MCP unless the caller opted out.
    if (options.save !== false) {
        try {
            const saved = (await callMcpEndpoint('quality_reports/save', {
                product_id: productId,
                commit_sha: outcome.commitSha,
                rubric_version: outcome.report.rubric_version,
                branch: options.branch ?? null,
                repo_root: repoRoot,
                detected_context: outcome.report.detected_context,
                tool_versions: outcome.report.tool_versions,
                unavailable_tools: outcome.report.unavailable_tools,
                applied_checks: outcome.report.applied_checks,
                tool_outputs: outcome.report.tool_outputs,
                external_signals: outcome.report.external_signals,
                dropped_findings: outcome.report.dropped_findings,
                dimension_scores: outcome.report.dimension_scores,
                overall_score: outcome.report.overall_score,
                overall_grade: outcome.report.overall_grade,
                executive_summary: outcome.report.executive_summary,
                low_confidence: outcome.report.low_confidence,
                status: 'completed',
                started_at: outcome.startedAt,
                completed_at: outcome.completedAt,
                duration_seconds: outcome.durationSeconds,
                replace_existing: options.force === true,
            }));
            // The saved row is read from the first content item, parsed as JSON.
            const row = JSON.parse(saved.content?.[0]?.text ?? '{}');
            if (row.id) {
                logSuccess(`Saved to quality_reports.id = ${row.id}`);
            }
        }
        catch (err) {
            logWarning(`Failed to persist report via MCP (will keep local file): ${err instanceof Error ? err.message : String(err)}`);
        }
    }
    logInfo(`Overall: grade ${outcome.report.overall_grade ?? '?'} (${outcome.report.overall_score ?? '?'})`);
    if (outcome.report.executive_summary) {
        logInfo(outcome.report.executive_summary);
    }
}
|
package/dist/index.js
CHANGED
|
@@ -24,6 +24,7 @@ import { runIntelligence } from './commands/intelligence/index.js';
|
|
|
24
24
|
import { runIssueAnalysisCommand } from './commands/issue-analysis/index.js';
|
|
25
25
|
import { runPRResolve } from './commands/pr-resolve/index.js';
|
|
26
26
|
import { runPRReview } from './commands/pr-review/index.js';
|
|
27
|
+
import { runQualityBenchmarkCli } from './commands/quality-benchmark/index.js';
|
|
27
28
|
import { runRefactor } from './commands/refactor/refactor.js';
|
|
28
29
|
import { runReleaseSyncCommand } from './commands/release-sync/index.js';
|
|
29
30
|
import { runRunSheetCommand } from './commands/run-sheet/index.js';
|
|
@@ -486,6 +487,29 @@ program
|
|
|
486
487
|
}
|
|
487
488
|
});
|
|
488
489
|
// ============================================================
|
|
490
|
+
// Subcommand: edsger quality-benchmark <productId>
|
|
491
|
+
// ============================================================
|
|
492
|
+
// Registers the subcommand and forwards the parsed flags to
// runQualityBenchmarkCli. Anything it throws is logged and turned into
// exit code 1.
program
    .command('quality-benchmark <productId>')
    .description('Run an industrial-grade code quality benchmark against a local repo')
    .requiredOption('--repo <path>', 'Path to the local repo checkout')
    .option('--branch <name>', 'Branch name (informational; does not checkout)')
    .option('--pkg-manager <name>', 'npm | pnpm | yarn (auto-detected if absent)')
    .option('--no-install', 'Refuse to install missing tools; mark them unmeasured')
    .option('--output <path>', 'Where to write the JSON report (default: ./quality-report-<commit>.json)')
    .option('--no-save', 'Skip the quality_reports/save MCP call (local-only run)')
    .option('--force', 'Overwrite any existing report row for this (product, commit, rubric)')
    .option('-v, --verbose', 'Print every progress event')
    .action(async (productId, cliOptions) => {
        try {
            await runQualityBenchmarkCli(productId, cliOptions);
        }
        catch (err) {
            const reason = err instanceof Error ? err.message : String(err);
            logError(reason);
            process.exit(1);
        }
    });
|
|
512
|
+
// ============================================================
|
|
489
513
|
// Subcommand: edsger sync-github-issues <productId>
|
|
490
514
|
// ============================================================
|
|
491
515
|
program
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Quality benchmark phase entry point.
|
|
3
|
+
*
|
|
4
|
+
* Drives a single Claude Agent SDK session whose system prompt is the
|
|
5
|
+
* full rubric (loaded by `prompts.ts`) and whose tool surface is the
|
|
6
|
+
* in-process MCP server in `mcp-server.ts`. The session walks the
|
|
7
|
+
* rubric's 6-phase pipeline by calling MCP tools; we observe the
|
|
8
|
+
* messages for progress events and capture the final JSON report.
|
|
9
|
+
*
|
|
10
|
+
* Pure compute boundary: this function does NOT touch the database.
|
|
11
|
+
* Callers (the CLI command, the desktop-app IPC bridge) decide how to
|
|
12
|
+
* persist the returned payload — see the CLI command in
|
|
13
|
+
* `src/commands/quality-benchmark/index.ts` for the MCP-mediated
|
|
14
|
+
* persistence path.
|
|
15
|
+
*
|
|
16
|
+
* Side effects that DO happen here:
|
|
17
|
+
* - Creates a scratch dir under ~/.edsger/quality-runs/<run-id>/
|
|
18
|
+
* - Writes raw tool outputs to that dir
|
|
19
|
+
* - Spawns child_process invocations of the catalog tools
|
|
20
|
+
*/
|
|
21
|
+
import { type ProgressEvent } from './mcp-server.js';
|
|
22
|
+
import { type QualityReportPayload, type RunPhase } from './types.js';
|
|
23
|
+
/** Inputs for a single `runQualityBenchmark` call. */
export interface QualityBenchmarkOptions {
    /** Product id; passed into the user prompt for the session. */
    productId: string;
    /** Product display name; passed into the user prompt for the session. */
    productName: string;
    /** Absolute path of the local checkout; also used as the session's working directory. */
    repoRoot: string;
    /** Branch label only; no checkout is performed. */
    branch?: string;
    /**
     * JS package manager, if the caller already knows it. Forwarded to the
     * tool runner; the SDK session detects one on its own as well.
     */
    packageManager?: string;
    /** `false` makes install_tool refuse every request (the `--no-install` CLI flag). */
    installEnabled: boolean;
    /**
     * Progress sink for the calling surface (CLI logger, Electron IPC, ...).
     * Invoked for every event the MCP server emits.
     */
    onProgress?: (event: ProgressEvent) => void;
    /** Also log progress events and assistant text through the logger. */
    verbose?: boolean;
    /** Alternative base directory for the scratch dir (mainly for tests). */
    scanDirBase?: string;
    /** Use this sha instead of reading git HEAD (mainly for tests). */
    commitShaOverride?: string;
}
|
|
48
|
+
/**
 * Result of `runQualityBenchmark`, discriminated on `status`.
 *
 * - `'success'`: the merged report plus run identity and timing
 *   (`startedAt` / `completedAt` are ISO-8601 strings).
 * - `'error'`: a human-readable `message`; the remaining fields are present
 *   only when the run got far enough to know them.
 */
export type QualityBenchmarkOutcome =
    | {
        status: 'success';
        report: QualityReportPayload;
        runId: string;
        commitSha: string;
        startedAt: string;
        completedAt: string;
        durationSeconds: number;
    }
    | {
        status: 'error';
        message: string;
        runId?: string;
        commitSha?: string;
        lastPhase?: RunPhase;
    };
|
|
63
|
+
/**
 * Runs one quality-benchmark session for the given checkout.
 *
 * @param opts Run inputs, see `QualityBenchmarkOptions`.
 * @returns The outcome, either `status: 'success'` with the report or
 *   `status: 'error'` with a message.
 */
export declare function runQualityBenchmark(
    opts: QualityBenchmarkOptions,
): Promise<QualityBenchmarkOutcome>;
|
|
64
|
+
export type { ProgressEvent } from './mcp-server.js';
|
|
65
|
+
export type { QualityReportPayload, RunPhase } from './types.js';
|
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Quality benchmark phase entry point.
|
|
3
|
+
*
|
|
4
|
+
* Drives a single Claude Agent SDK session whose system prompt is the
|
|
5
|
+
* full rubric (loaded by `prompts.ts`) and whose tool surface is the
|
|
6
|
+
* in-process MCP server in `mcp-server.ts`. The session walks the
|
|
7
|
+
* rubric's 6-phase pipeline by calling MCP tools; we observe the
|
|
8
|
+
* messages for progress events and capture the final JSON report.
|
|
9
|
+
*
|
|
10
|
+
* Pure compute boundary: this function does NOT touch the database.
|
|
11
|
+
* Callers (the CLI command, the desktop-app IPC bridge) decide how to
|
|
12
|
+
* persist the returned payload — see the CLI command in
|
|
13
|
+
* `src/commands/quality-benchmark/index.ts` for the MCP-mediated
|
|
14
|
+
* persistence path.
|
|
15
|
+
*
|
|
16
|
+
* Side effects that DO happen here:
|
|
17
|
+
* - Creates a scratch dir under ~/.edsger/quality-runs/<run-id>/
|
|
18
|
+
* - Writes raw tool outputs to that dir
|
|
19
|
+
* - Spawns child_process invocations of the catalog tools
|
|
20
|
+
*/
|
|
21
|
+
import { spawnSync } from 'node:child_process';
|
|
22
|
+
import { randomUUID } from 'node:crypto';
|
|
23
|
+
import { query } from '@anthropic-ai/claude-agent-sdk';
|
|
24
|
+
import { DEFAULT_MODEL } from '../../constants.js';
|
|
25
|
+
import { logError, logInfo, logSuccess } from '../../utils/logger.js';
|
|
26
|
+
import { createPromptGenerator } from '../pr-shared/agent-utils.js';
|
|
27
|
+
import { createEmptyRunState, createQualityBenchmarkMcpServer, } from './mcp-server.js';
|
|
28
|
+
import { createQualityBenchmarkSystemPrompt, createQualityBenchmarkUserPrompt, extractReportJson, } from './prompts.js';
|
|
29
|
+
import { createRunnerContext } from './tool-runner.js';
|
|
30
|
+
import { RUBRIC_VERSION, } from './types.js';
|
|
31
|
+
// SDK upper bound. Each phase usually takes ~1-3 turns; with up to 30
|
|
32
|
+
// tools probed + run plus per-finding verification, 500 gives headroom while
|
|
33
|
+
// still cutting off runaway loops. Passed to query() as `maxTurns`.
|
|
34
|
+
const MAX_TURNS = 500;
|
|
35
|
+
// ---------------------------------------------------------------------------
|
|
36
|
+
// Orchestration
|
|
37
|
+
// ---------------------------------------------------------------------------
|
|
38
|
+
/**
 * Runs one quality-benchmark session against a local checkout.
 *
 * Steps: resolve the commit sha, create the tool runner and the in-process
 * MCP server, drive a single SDK `query()` session, extract the JSON report
 * from the session text, then merge it with the state the MCP server
 * accumulated during the run.
 *
 * Errors thrown while iterating the SDK session are caught and returned as
 * `{ status: 'error' }` rather than rethrown.
 *
 * @param opts Run inputs (see `QualityBenchmarkOptions`).
 * @returns `{ status: 'success', report, ... }` or `{ status: 'error', message, ... }`.
 */
// eslint-disable-next-line complexity
export async function runQualityBenchmark(opts) {
    const { productId, productName, repoRoot, branch, packageManager, installEnabled, onProgress, verbose, scanDirBase, commitShaOverride, } = opts;
    // 1. Determine commit sha from the local checkout.
    const commitSha = commitShaOverride ?? readGitHead(repoRoot);
    if (!commitSha) {
        return {
            status: 'error',
            message: `Could not read git HEAD at ${repoRoot} — is this a git repo?`,
        };
    }
    // 2. Set up runner + MCP server.
    const runId = randomUUID();
    const startedAt = new Date().toISOString();
    const runner = createRunnerContext({
        repo_root: repoRoot,
        package_manager: packageManager,
        install_enabled: installEnabled,
        run_id: runId,
        base_dir: scanDirBase,
    });
    const state = createEmptyRunState();
    // Phase of the most recent progress event; reported back on failure.
    let lastPhase = 'detection';
    const emit = (event) => {
        lastPhase = event.phase;
        onProgress?.(event);
        if (verbose) {
            logInfo(`[${event.phase}] ${event.message}`);
        }
    };
    const mcpServer = createQualityBenchmarkMcpServer({ runner, onProgress: emit, installEnabled }, state);
    // 3. Run the SDK session.
    const systemPrompt = createQualityBenchmarkSystemPrompt();
    const userPrompt = createQualityBenchmarkUserPrompt({
        productName,
        productId,
        repoRoot,
        branch,
        commitSha,
        runId,
        installEnabled,
    });
    // Accumulates every assistant text chunk; replaced by the final result
    // string when the session ends successfully with a non-empty result.
    let lastAssistantText = '';
    // Set when the session's result message reports anything but 'success'.
    let abnormalSubtype = null;
    try {
        for await (const message of query({
            prompt: createPromptGenerator(userPrompt),
            options: {
                systemPrompt: {
                    type: 'preset',
                    preset: 'claude_code',
                    append: systemPrompt,
                },
                model: DEFAULT_MODEL,
                maxTurns: MAX_TURNS,
                permissionMode: 'bypassPermissions',
                cwd: repoRoot,
                mcpServers: {
                    'quality-benchmark': mcpServer,
                },
            },
        })) {
            if (message.type === 'assistant' && message.message?.content) {
                for (const c of message.message.content) {
                    if (c.type === 'text') {
                        lastAssistantText += `${c.text}\n`;
                        if (verbose) {
                            logInfo(`🤖 ${c.text}`);
                        }
                    }
                }
            }
            if (message.type === 'result') {
                if (message.subtype !== 'success') {
                    abnormalSubtype = message.subtype;
                    logError(`SDK session ended with subtype=${message.subtype}`);
                }
                else if ('result' in message &&
                    typeof message.result === 'string') {
                    lastAssistantText = message.result || lastAssistantText;
                }
            }
        }
    }
    catch (err) {
        const msg = err instanceof Error ? err.message : String(err);
        return {
            status: 'error',
            message: `SDK error: ${msg}`,
            runId,
            commitSha,
            lastPhase,
        };
    }
    // 4. Extract + validate JSON report. A JSON array is not a report object.
    const raw = extractReportJson(lastAssistantText);
    if (!raw || typeof raw !== 'object' || Array.isArray(raw)) {
        return {
            status: 'error',
            // When the session was cut short, say so and report the phase it
            // actually reached instead of blaming synthesis.
            message: abnormalSubtype
                ? `SDK session did not emit a parseable JSON report (session ended with subtype=${abnormalSubtype})`
                : 'SDK session did not emit a parseable JSON report',
            runId,
            commitSha,
            lastPhase: abnormalSubtype ? lastPhase : 'synthesis',
        };
    }
    const report = raw;
    // Merge SDK-emitted fields with the state we accumulated server-side. The
    // server-side state is authoritative for execution metadata, so it is
    // spread LAST and wins on key collisions; the LLM owns the scoring fields.
    // Everything read from `report` is untrusted LLM output and is
    // shape-checked before use.
    const reportedUnavailable = Array.isArray(report.unavailable_tools)
        ? report.unavailable_tools.filter((entry) => entry !== null && typeof entry === 'object')
        : [];
    const merged = {
        ...report,
        rubric_version: RUBRIC_VERSION,
        tool_versions: { ...asRecord(report.tool_versions), ...state.tool_versions },
        // State entries come first, so they win the name-based dedupe.
        unavailable_tools: dedupUnavailable([
            ...state.unavailable_tools,
            ...reportedUnavailable,
        ]),
        tool_outputs: { ...asRecord(report.tool_outputs), ...state.tool_outputs },
        dropped_findings: Math.max(state.dropped_findings, asCount(report.dropped_findings)),
    };
    const completedAt = new Date().toISOString();
    const durationSec = Math.round((new Date(completedAt).getTime() - new Date(startedAt).getTime()) / 1000);
    logSuccess(`Benchmark complete — grade ${merged.overall_grade ?? '?'} (${merged.overall_score ?? '?'})`);
    return {
        status: 'success',
        report: merged,
        runId,
        commitSha,
        startedAt,
        completedAt,
        durationSeconds: durationSec,
    };
}
/** Returns `value` when it is a non-null, non-array object, otherwise `{}`. */
function asRecord(value) {
    return value !== null && typeof value === 'object' && !Array.isArray(value)
        ? value
        : {};
}
/** Returns `value` when it is a finite number, otherwise 0. */
function asCount(value) {
    return typeof value === 'number' && Number.isFinite(value) ? value : 0;
}
|
|
170
|
+
// ---------------------------------------------------------------------------
|
|
171
|
+
// Helpers
|
|
172
|
+
// ---------------------------------------------------------------------------
|
|
173
|
+
/**
 * Asks git for the commit sha that HEAD points to in `repoRoot`.
 *
 * @param repoRoot Directory in which `git rev-parse HEAD` is run.
 * @returns The trimmed sha, or `null` when git does not exit with status 0
 *   or prints nothing.
 */
function readGitHead(repoRoot) {
    const { status, stdout } = spawnSync('git', ['rev-parse', 'HEAD'], {
        cwd: repoRoot,
        encoding: 'utf8',
    });
    if (status === 0) {
        const sha = stdout.trim();
        return sha ? sha : null;
    }
    return null;
}
|
|
183
|
+
/**
 * Removes duplicate unavailable-tool entries, keeping the first entry seen
 * for each `name` and preserving input order.
 *
 * Part of the input comes from the LLM-emitted report, so entries that are
 * not objects (`null`, strings, numbers, ...) are skipped instead of
 * throwing a TypeError on `entry.name`.
 *
 * @param list Entries to de-duplicate.
 * @returns A new array with at most one entry per `name`.
 */
function dedupUnavailable(list) {
    // Map keeps insertion order, so the values come out in first-seen order.
    const firstByName = new Map();
    for (const entry of list) {
        if (entry === null || typeof entry !== 'object') {
            continue;
        }
        if (!firstByName.has(entry.name)) {
            firstByName.set(entry.name, entry);
        }
    }
    return [...firstByName.values()];
}
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* In-process MCP server exposing the quality-benchmark execution
|
|
3
|
+
* primitives to the Claude Agent SDK session.
|
|
4
|
+
*
|
|
5
|
+
* The single SDK session reasons about the rubric in its system prompt
|
|
6
|
+
* and calls these tools to drive the 6-phase pipeline:
|
|
7
|
+
*
|
|
8
|
+
* list_applicable_tools(detected_context) — Phase 1 -> 2 handoff
|
|
9
|
+
* probe_tool(tool_id) — Phase 2
|
|
10
|
+
* install_tool(tool_id) — Phase 2.5
|
|
11
|
+
* run_tool(tool_id) — Phase 3 (returns parsed summary)
|
|
12
|
+
* verify_finding(file, line, snippet?) — Phase 5
|
|
13
|
+
* record_progress(phase, message) — UI streaming side channel
|
|
14
|
+
*
|
|
15
|
+
* Every command and install step is whitelisted by `tool-catalog.ts`
|
|
16
|
+
* and re-checked at runtime by `tool-runner.ts`. The MCP server is a
|
|
17
|
+
* thin adapter between the SDK's tool-call protocol and that runner —
|
|
18
|
+
* it adds no new privilege, no new commands, and no new side effects.
|
|
19
|
+
*/
|
|
20
|
+
import { type RunnerContext } from './tool-runner.js';
|
|
21
|
+
import type { ParsedToolOutput, RunPhase, ToolRunOutput, UnavailableTool } from './types';
|
|
22
|
+
/** Dependencies handed to `createQualityBenchmarkMcpServer`. */
export interface QualityMcpDeps {
    /** Single runner context reused by every tool call of one benchmark run. */
    runner: RunnerContext;
    /** Receives progress updates for streaming to the UI / DB; optional. */
    onProgress?: (event: ProgressEvent) => void;
    /** `false` makes install_tool always answer `install_disabled`. */
    installEnabled: boolean;
}
|
|
30
|
+
/** One progress update emitted while a benchmark run is in flight. */
export interface ProgressEvent {
    /** Pipeline phase the update belongs to. */
    phase: RunPhase;
    /** Human-readable text; the CLI logs it as `[phase] message`. */
    message: string;
    /** Optional extra payload attached to the event. */
    data?: Record<string, unknown>;
}
|
|
35
|
+
/**
 * Mutable bookkeeping for one benchmark run. `runQualityBenchmark` merges
 * these fields into the final report.
 */
export interface RunState {
    /** Tool versions recorded during the run (string to string). */
    tool_versions: Record<string, string>;
    /** Tools recorded as unavailable during the run. */
    unavailable_tools: UnavailableTool[];
    /** Tool run outputs recorded during the run. */
    tool_outputs: Record<string, ToolRunOutput>;
    /** Parsed summaries cached so synthesis can read them back. */
    parsed_summaries: Record<string, ParsedToolOutput>;
    /** Number of findings dropped by Phase 5 verification. */
    dropped_findings: number;
}
|
|
45
|
+
/**
 * Creates the `RunState` a benchmark run accumulates into.
 * `runQualityBenchmark` calls this once per run.
 */
export declare function createEmptyRunState(): RunState;
|
|
46
|
+
/**
 * Builds the in-process MCP server configuration for one benchmark run.
 *
 * @param deps Runner context, progress callback and install consent.
 * @param state Run state object supplied by the caller.
 * @returns An SDK server config; `runQualityBenchmark` registers it under
 *   the `quality-benchmark` key of `mcpServers`.
 */
export declare function createQualityBenchmarkMcpServer(
    deps: QualityMcpDeps,
    state: RunState,
): import("@anthropic-ai/claude-agent-sdk").McpSdkServerConfigWithInstance;
|