edsger 0.55.4 → 0.56.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,32 @@
1
+ /**
2
+ * CLI command: `edsger quality-benchmark <productId>`
3
+ *
4
+ * Runs the quality-benchmark phase against a local repo checkout and
5
+ * writes the resulting JSON report to disk. Unless `--no-save` is
6
+ * passed, the report is then also persisted through the
7
+ * `quality_reports/save` MCP call (`--force` overwrites an existing row);
8
+ * if that call fails, a warning is logged and the local file is kept.
9
+ *
10
+ * Usage:
11
+ * edsger quality-benchmark <productId>
12
+ * --repo <path> (required) local repo checkout
13
+ * --branch <name> (optional) branch name (informational)
14
+ * --pkg-manager <name> (optional) npm|pnpm|yarn (auto-detected if absent)
15
+ * --no-install refuse to install missing tools
16
+ * --output <path> where to write the JSON report
17
+ * (default: ./quality-report-<commit>.json)
18
+ * --verbose print every progress event
19
+ */
20
+ export interface QualityBenchmarkCliOptions { // Parsed flags of `edsger quality-benchmark`.
21
+ repo: string; // --repo: local checkout; resolved to an absolute path before use.
22
+ branch?: string; // --branch: copied into the report envelope and save payload (null when omitted).
23
+ pkgManager?: string; // --pkg-manager: forwarded to the benchmark as `packageManager`.
24
+ install?: boolean; // Only an explicit `false` (--no-install) disables tool installation.
25
+ output?: string; // --output: report path; defaults to quality-report-<first 8 sha chars>.json in the cwd.
26
+ verbose?: boolean; // --verbose: forwarded to the benchmark phase.
27
+ /** When false, skip the quality_reports/save MCP call. */
28
+ save?: boolean;
29
+ /** Overwrite any existing report row for this (product, commit, rubric). */
30
+ force?: boolean;
31
+ }
32
+ export declare function runQualityBenchmarkCli(productId: string, options: QualityBenchmarkCliOptions): Promise<void>; // Calls process.exit(1) when the benchmark outcome is an error.
@@ -0,0 +1,124 @@
1
+ /**
2
+ * CLI command: `edsger quality-benchmark <productId>`
3
+ *
4
+ * Runs the quality-benchmark phase against a local repo checkout and
5
+ * writes the resulting JSON report to disk. Unless `--no-save` is
6
+ * passed, the report is then also persisted through the
7
+ * `quality_reports/save` MCP call (`--force` overwrites an existing row);
8
+ * if that call fails, a warning is logged and the local file is kept.
9
+ *
10
+ * Usage:
11
+ * edsger quality-benchmark <productId>
12
+ * --repo <path> (required) local repo checkout
13
+ * --branch <name> (optional) branch name (informational)
14
+ * --pkg-manager <name> (optional) npm|pnpm|yarn (auto-detected if absent)
15
+ * --no-install refuse to install missing tools
16
+ * --output <path> where to write the JSON report
17
+ * (default: ./quality-report-<commit>.json)
18
+ * --verbose print every progress event
19
+ */
20
+ import { mkdirSync, writeFileSync } from 'node:fs';
21
+ import { dirname, resolve } from 'node:path';
22
+ import { callMcpEndpoint } from '../../api/mcp-client.js';
23
+ import { fetchProductBasics } from '../../phases/find-shared/mcp.js';
24
+ import { runQualityBenchmark, } from '../../phases/quality-benchmark/index.js';
25
+ import { logError, logInfo, logSuccess, logWarning, } from '../../utils/logger.js';
26
/**
 * Runs the `edsger quality-benchmark <productId>` command end to end.
 *
 * Steps: resolve the repo path, run the benchmark phase, write the JSON
 * report envelope to disk, then (unless `options.save === false`) persist
 * the report through the `quality_reports/save` MCP endpoint. A failed save
 * is downgraded to a warning because the local file has already been
 * written at that point.
 *
 * Terminates the process with exit code 1 when the benchmark outcome has
 * `status === 'error'`.
 *
 * @param productId Product the benchmark is run for.
 * @param options Parsed CLI flags (see `QualityBenchmarkCliOptions`).
 * @returns Resolves once the report has been written and summarized.
 */
export async function runQualityBenchmarkCli(productId, options) {
    const repoRoot = resolve(options.repo);
    // `--no-install` arrives as `install: false`; anything else counts as consent.
    const installEnabled = options.install !== false;
    logInfo(`Starting quality benchmark for product ${productId}`);
    logInfo(`Repo: ${repoRoot}`);
    if (!installEnabled) {
        logWarning('Install consent NOT granted (--no-install). Missing tools will be marked unmeasured.');
    }
    // Best-effort lookup: the product id stands in for the name when the
    // lookup fails or comes back without a usable name.
    let productName = productId;
    try {
        const basics = await fetchProductBasics(productId);
        if (typeof basics?.name === 'string' && basics.name !== '') {
            productName = basics.name;
        }
    }
    catch (err) {
        logWarning(`Could not fetch product details (using the product id as its name): ${err instanceof Error ? err.message : String(err)}`);
    }
    // In verbose mode runQualityBenchmark logs every event itself (it is
    // handed `verbose` below), so logging here too would print each line
    // twice. This callback therefore only covers the quiet mode, where just
    // the phases listed here are surfaced.
    const quietPhases = new Set(['execution', 'synthesis', 'installation']);
    const onProgress = (event) => {
        if (!options.verbose && quietPhases.has(event.phase)) {
            logInfo(`[${event.phase}] ${event.message}`);
        }
    };
    const outcome = await runQualityBenchmark({
        productId,
        productName,
        repoRoot,
        branch: options.branch,
        packageManager: options.pkgManager,
        installEnabled,
        onProgress,
        verbose: options.verbose,
    });
    if (outcome.status === 'error') {
        logError(`Quality benchmark failed: ${outcome.message}`);
        if (outcome.lastPhase) {
            logError(`Last phase: ${outcome.lastPhase}`);
        }
        process.exit(1);
    }
    // Write the report to disk first so it survives a failed save below.
    const outputPath = options.output ??
        resolve(process.cwd(), `quality-report-${outcome.commitSha.slice(0, 8)}.json`);
    const reportEnvelope = {
        run_id: outcome.runId,
        product_id: productId,
        commit_sha: outcome.commitSha,
        branch: options.branch ?? null,
        started_at: outcome.startedAt,
        completed_at: outcome.completedAt,
        duration_seconds: outcome.durationSeconds,
        payload: outcome.report,
    };
    mkdirSync(dirname(outputPath), { recursive: true });
    writeFileSync(outputPath, JSON.stringify(reportEnvelope, null, 2), 'utf8');
    logSuccess(`Report written to ${outputPath}`);
    // Persist via MCP unless the caller opted out (`--no-save`).
    if (options.save !== false) {
        try {
            const saved = await callMcpEndpoint('quality_reports/save', {
                product_id: productId,
                commit_sha: outcome.commitSha,
                rubric_version: outcome.report.rubric_version,
                branch: options.branch ?? null,
                repo_root: repoRoot,
                detected_context: outcome.report.detected_context,
                tool_versions: outcome.report.tool_versions,
                unavailable_tools: outcome.report.unavailable_tools,
                applied_checks: outcome.report.applied_checks,
                tool_outputs: outcome.report.tool_outputs,
                external_signals: outcome.report.external_signals,
                dropped_findings: outcome.report.dropped_findings,
                dimension_scores: outcome.report.dimension_scores,
                overall_score: outcome.report.overall_score,
                overall_grade: outcome.report.overall_grade,
                executive_summary: outcome.report.executive_summary,
                low_confidence: outcome.report.low_confidence,
                status: 'completed',
                started_at: outcome.startedAt,
                completed_at: outcome.completedAt,
                duration_seconds: outcome.durationSeconds,
                replace_existing: options.force === true,
            });
            // The saved row is read from the first content item's text, parsed as JSON.
            const row = JSON.parse(saved?.content?.[0]?.text ?? '{}');
            if (row?.id) {
                logSuccess(`Saved to quality_reports.id = ${row.id}`);
            }
            else {
                // Previously this case printed nothing, leaving the user
                // unable to tell whether the report had been stored.
                logWarning('quality_reports/save response did not include a row id; the report may not have been persisted.');
            }
        }
        catch (err) {
            logWarning(`Failed to persist report via MCP (will keep local file): ${err instanceof Error ? err.message : String(err)}`);
        }
    }
    logInfo(`Overall: grade ${outcome.report.overall_grade ?? '?'} (${outcome.report.overall_score ?? '?'})`);
    if (outcome.report.executive_summary) {
        logInfo(outcome.report.executive_summary);
    }
}
package/dist/index.js CHANGED
@@ -24,6 +24,7 @@ import { runIntelligence } from './commands/intelligence/index.js';
24
24
  import { runIssueAnalysisCommand } from './commands/issue-analysis/index.js';
25
25
  import { runPRResolve } from './commands/pr-resolve/index.js';
26
26
  import { runPRReview } from './commands/pr-review/index.js';
27
+ import { runQualityBenchmarkCli } from './commands/quality-benchmark/index.js';
27
28
  import { runRefactor } from './commands/refactor/refactor.js';
28
29
  import { runReleaseSyncCommand } from './commands/release-sync/index.js';
29
30
  import { runRunSheetCommand } from './commands/run-sheet/index.js';
@@ -486,6 +487,29 @@ program
486
487
  }
487
488
  });
488
489
  // ============================================================
490
+ // Subcommand: edsger quality-benchmark <productId>
491
+ // ============================================================
492
+ program
493
+ .command('quality-benchmark <productId>')
494
+ .description('Run an industrial-grade code quality benchmark against a local repo')
495
+ .requiredOption('--repo <path>', 'Path to the local repo checkout')
496
+ .option('--branch <name>', 'Branch name (informational; does not checkout)')
497
+ .option('--pkg-manager <name>', 'npm | pnpm | yarn (auto-detected if absent)')
498
+ .option('--no-install', 'Refuse to install missing tools; mark them unmeasured') // Read by the command as `options.install !== false`.
499
+ .option('--output <path>', 'Where to write the JSON report (default: ./quality-report-<commit>.json)')
500
+ .option('--no-save', 'Skip the quality_reports/save MCP call (local-only run)') // Read by the command as `options.save !== false`.
501
+ .option('--force', 'Overwrite any existing report row for this (product, commit, rubric)') // Sent as `replace_existing` on the save call.
502
+ .option('-v, --verbose', 'Print every progress event')
503
+ .action(async (productId, opts) => { // `opts` is handed to the command unchanged as its options object.
504
+ try {
505
+ await runQualityBenchmarkCli(productId, opts);
506
+ }
507
+ catch (error) { // Any rejection from the command is logged and turned into exit code 1.
508
+ logError(error instanceof Error ? error.message : String(error));
509
+ process.exit(1);
510
+ }
511
+ });
512
+ // ============================================================
489
513
  // Subcommand: edsger sync-github-issues <productId>
490
514
  // ============================================================
491
515
  program
@@ -0,0 +1,65 @@
1
+ /**
2
+ * Quality benchmark phase entry point.
3
+ *
4
+ * Drives a single Claude Agent SDK session whose system prompt is the
5
+ * full rubric (loaded by `prompts.ts`) and whose tool surface is the
6
+ * in-process MCP server in `mcp-server.ts`. The session walks the
7
+ * rubric's 6-phase pipeline by calling MCP tools; we observe the
8
+ * messages for progress events and capture the final JSON report.
9
+ *
10
+ * Pure compute boundary: this function does NOT touch the database.
11
+ * Callers (the CLI command, the desktop-app IPC bridge) decide how to
12
+ * persist the returned payload — see the CLI command in
13
+ * `src/commands/quality-benchmark/index.ts` for the MCP-mediated
14
+ * persistence path.
15
+ *
16
+ * Side effects that DO happen here:
17
+ * - Creates a scratch dir under ~/.edsger/quality-runs/<run-id>/
18
+ * - Writes raw tool outputs to that dir
19
+ * - Spawns child_process invocations of the catalog tools
20
+ */
21
+ import { type ProgressEvent } from './mcp-server.js';
22
+ import { type QualityReportPayload, type RunPhase } from './types.js';
23
+ export interface QualityBenchmarkOptions { // Inputs of runQualityBenchmark.
24
+ productId: string; // Passed into the user prompt of the SDK session.
25
+ productName: string; // Passed into the user prompt of the SDK session.
26
+ /** Absolute path to the local repo checkout. */
27
+ repoRoot: string;
28
+ /** Branch name (informational; doesn't checkout). */
29
+ branch?: string;
30
+ /**
31
+ * Detected JS package manager (informational; the SDK will detect on its
32
+ * own but we pass through for catalog filtering).
33
+ */
34
+ packageManager?: string;
35
+ /** When false, install_tool always refuses (mirrors --no-install CLI flag). */
36
+ installEnabled: boolean;
37
+ /**
38
+ * Optional progress callback for the calling surface (CLI logger,
39
+ * Electron IPC, etc.). Receives every MCP-server-emitted event.
40
+ */
41
+ onProgress?: (event: ProgressEvent) => void;
42
+ verbose?: boolean; // When true, progress events and assistant text are also written with logInfo.
43
+ /** Override scratch-dir base (mainly for tests). */
44
+ scanDirBase?: string;
45
+ /** Override commit sha resolution (mainly for tests). */
46
+ commitShaOverride?: string;
47
+ }
48
+ export type QualityBenchmarkOutcome = { // Discriminated on `status`.
49
+ status: 'success';
50
+ report: QualityReportPayload; // Report emitted by the session, merged with the server-side run state.
51
+ runId: string; // UUID generated for this run.
52
+ commitSha: string; // From `commitShaOverride` or `git rev-parse HEAD` in the repo.
53
+ startedAt: string; // ISO-8601 timestamp (Date#toISOString).
54
+ completedAt: string; // ISO-8601 timestamp (Date#toISOString).
55
+ durationSeconds: number; // completedAt - startedAt, rounded to whole seconds.
56
+ } | {
57
+ status: 'error';
58
+ message: string; // Human-readable failure description.
59
+ runId?: string; // Absent when the commit sha could not be resolved (no run was started).
60
+ commitSha?: string; // Absent when the commit sha could not be resolved.
61
+ lastPhase?: RunPhase; // Last phase known at failure time, when available.
62
+ };
63
+ export declare function runQualityBenchmark(opts: QualityBenchmarkOptions): Promise<QualityBenchmarkOutcome>; // Unreadable git HEAD, SDK errors and unparseable reports come back as `status: 'error'` outcomes.
64
+ export type { ProgressEvent } from './mcp-server.js';
65
+ export type { QualityReportPayload, RunPhase } from './types.js';
@@ -0,0 +1,194 @@
1
+ /**
2
+ * Quality benchmark phase entry point.
3
+ *
4
+ * Drives a single Claude Agent SDK session whose system prompt is the
5
+ * full rubric (loaded by `prompts.ts`) and whose tool surface is the
6
+ * in-process MCP server in `mcp-server.ts`. The session walks the
7
+ * rubric's 6-phase pipeline by calling MCP tools; we observe the
8
+ * messages for progress events and capture the final JSON report.
9
+ *
10
+ * Pure compute boundary: this function does NOT touch the database.
11
+ * Callers (the CLI command, the desktop-app IPC bridge) decide how to
12
+ * persist the returned payload — see the CLI command in
13
+ * `src/commands/quality-benchmark/index.ts` for the MCP-mediated
14
+ * persistence path.
15
+ *
16
+ * Side effects that DO happen here:
17
+ * - Creates a scratch dir under ~/.edsger/quality-runs/<run-id>/
18
+ * - Writes raw tool outputs to that dir
19
+ * - Spawns child_process invocations of the catalog tools
20
+ */
21
+ import { spawnSync } from 'node:child_process';
22
+ import { randomUUID } from 'node:crypto';
23
+ import { query } from '@anthropic-ai/claude-agent-sdk';
24
+ import { DEFAULT_MODEL } from '../../constants.js';
25
+ import { logError, logInfo, logSuccess } from '../../utils/logger.js';
26
+ import { createPromptGenerator } from '../pr-shared/agent-utils.js';
27
+ import { createEmptyRunState, createQualityBenchmarkMcpServer, } from './mcp-server.js';
28
+ import { createQualityBenchmarkSystemPrompt, createQualityBenchmarkUserPrompt, extractReportJson, } from './prompts.js';
29
+ import { createRunnerContext } from './tool-runner.js';
30
+ import { RUBRIC_VERSION, } from './types.js';
31
+ // SDK upper bound. Each phase usually takes ~1-3 turns; with up to 30
32
+ // tools probed + run plus per-finding verification, 500 gives headroom while
33
+ // still cutting off runaway loops.
34
+ const MAX_TURNS = 500; // Handed to the SDK session as `options.maxTurns`.
35
+ // ---------------------------------------------------------------------------
36
+ // Orchestration
37
+ // ---------------------------------------------------------------------------
38
/**
 * Runs one quality-benchmark session against a local checkout.
 *
 * Flow: resolve the commit sha, create the runner context and the
 * in-process MCP server, drive a single SDK `query()` session, extract the
 * JSON report from the assistant output, then merge that report with the
 * run state accumulated by the MCP server.
 *
 * The following failures are returned as `status: 'error'` outcomes rather
 * than thrown: an unreadable git HEAD, an exception raised while iterating
 * the SDK session, and assistant output without a parseable JSON report.
 *
 * @param opts See `QualityBenchmarkOptions`.
 * @returns A success outcome carrying the merged report and timing data, or
 *   an error outcome describing what went wrong.
 */
// eslint-disable-next-line complexity
export async function runQualityBenchmark(opts) {
    const { productId, productName, repoRoot, branch, packageManager, installEnabled, onProgress, verbose, scanDirBase, commitShaOverride, } = opts;
    // 1. Determine commit sha from the local checkout.
    const commitSha = commitShaOverride ?? readGitHead(repoRoot);
    if (!commitSha) {
        return {
            status: 'error',
            message: `Could not read git HEAD at ${repoRoot} — is this a git repo?`,
        };
    }
    // 2. Set up runner + MCP server.
    const runId = randomUUID();
    const startedAt = new Date().toISOString();
    const runner = createRunnerContext({
        repo_root: repoRoot,
        package_manager: packageManager,
        install_enabled: installEnabled,
        run_id: runId,
        base_dir: scanDirBase,
    });
    const state = createEmptyRunState();
    let lastPhase = 'detection';
    // Wraps the caller's callback so the most recent phase is remembered
    // and can be attached to an error outcome.
    const emit = (event) => {
        lastPhase = event.phase;
        onProgress?.(event);
        if (verbose) {
            logInfo(`[${event.phase}] ${event.message}`);
        }
    };
    const mcpServer = createQualityBenchmarkMcpServer({ runner, onProgress: emit, installEnabled }, state);
    // 3. Run the SDK session.
    const systemPrompt = createQualityBenchmarkSystemPrompt();
    const userPrompt = createQualityBenchmarkUserPrompt({
        productName,
        productId,
        repoRoot,
        branch,
        commitSha,
        runId,
        installEnabled,
    });
    let lastAssistantText = '';
    // Subtype of a non-success result message, kept so the "no report"
    // error below can say why the session stopped.
    let failureSubtype = null;
    try {
        for await (const message of query({
            prompt: createPromptGenerator(userPrompt),
            options: {
                systemPrompt: {
                    type: 'preset',
                    preset: 'claude_code',
                    append: systemPrompt,
                },
                model: DEFAULT_MODEL,
                maxTurns: MAX_TURNS,
                permissionMode: 'bypassPermissions',
                cwd: repoRoot,
                mcpServers: {
                    'quality-benchmark': mcpServer,
                },
            },
        })) {
            if (message.type === 'assistant' && message.message?.content) {
                for (const c of message.message.content) {
                    if (c.type === 'text') {
                        // Text from every assistant turn is accumulated.
                        lastAssistantText += `${c.text}\n`;
                        if (verbose) {
                            logInfo(`🤖 ${c.text}`);
                        }
                    }
                }
            }
            if (message.type === 'result') {
                if (message.subtype !== 'success') {
                    failureSubtype = message.subtype;
                    logError(`SDK session ended with subtype=${message.subtype}`);
                }
                if (message.subtype === 'success' &&
                    'result' in message &&
                    typeof message.result === 'string') {
                    // A non-empty final result replaces the accumulated text.
                    lastAssistantText = message.result || lastAssistantText;
                }
            }
        }
    }
    catch (err) {
        const msg = err instanceof Error ? err.message : String(err);
        return {
            status: 'error',
            message: `SDK error: ${msg}`,
            runId,
            commitSha,
            lastPhase,
        };
    }
    // 4. Extract + validate JSON report.
    const raw = extractReportJson(lastAssistantText);
    if (!raw || typeof raw !== 'object') {
        return {
            status: 'error',
            message: failureSubtype
                ? `SDK session did not emit a parseable JSON report (session ended with subtype=${failureSubtype})`
                : 'SDK session did not emit a parseable JSON report',
            runId,
            commitSha,
            lastPhase: 'synthesis',
        };
    }
    const merged = mergeReportWithRunState(raw, state);
    const completedAt = new Date().toISOString();
    const durationSec = Math.round((new Date(completedAt).getTime() - new Date(startedAt).getTime()) / 1000);
    logSuccess(`Benchmark complete — grade ${merged.overall_grade ?? '?'} (${merged.overall_score ?? '?'})`);
    return {
        status: 'success',
        report: merged,
        runId,
        commitSha,
        startedAt,
        completedAt,
        durationSeconds: durationSec,
    };
}
/**
 * Merges the LLM-emitted report with the run state recorded server-side.
 *
 * The server-side state is authoritative for execution metadata, so its
 * entries are spread LAST and win on key collisions; the report only
 * contributes keys the server did not record. Scoring fields come from the
 * report unchanged. Report fields with an unexpected shape (non-array
 * `unavailable_tools`, non-numeric `dropped_findings`) are ignored instead
 * of breaking the merge.
 */
function mergeReportWithRunState(report, state) {
    const reportedUnavailable = Array.isArray(report.unavailable_tools)
        ? report.unavailable_tools
        : [];
    const reportedDropped = Number.isFinite(report.dropped_findings)
        ? report.dropped_findings
        : 0;
    return {
        ...report,
        rubric_version: RUBRIC_VERSION,
        tool_versions: { ...(report.tool_versions ?? {}), ...state.tool_versions },
        // State entries come first so they are the ones kept by the de-duplication.
        unavailable_tools: dedupUnavailable([
            ...state.unavailable_tools,
            ...reportedUnavailable,
        ]),
        tool_outputs: { ...(report.tool_outputs ?? {}), ...state.tool_outputs },
        dropped_findings: Math.max(state.dropped_findings, reportedDropped),
    };
}
170
+ // ---------------------------------------------------------------------------
171
+ // Helpers
172
+ // ---------------------------------------------------------------------------
173
/**
 * Asks git which commit HEAD points to inside `repoRoot`.
 *
 * @param repoRoot Directory in which `git rev-parse HEAD` is run.
 * @returns The trimmed command output, or `null` when the command does not
 *   exit with status 0 or prints nothing.
 */
function readGitHead(repoRoot) {
    const { status, stdout } = spawnSync('git', ['rev-parse', 'HEAD'], {
        cwd: repoRoot,
        encoding: 'utf8',
    });
    if (status === 0) {
        const sha = stdout.trim();
        return sha === '' ? null : sha;
    }
    return null;
}
183
/**
 * Removes duplicate unavailable-tool entries, keyed by `name`.
 *
 * The first entry seen for a name is kept and input order is preserved.
 * Part of the list comes from LLM-emitted JSON, so a non-array input is
 * treated as empty and entries that are not objects (`null`, strings,
 * numbers, ...) are skipped rather than dereferenced.
 *
 * @param list Entries to de-duplicate.
 * @returns A new array with at most one entry per `name`.
 */
function dedupUnavailable(list) {
    const firstByName = new Map();
    for (const entry of Array.isArray(list) ? list : []) {
        if (entry === null || typeof entry !== 'object') {
            continue;
        }
        if (!firstByName.has(entry.name)) {
            firstByName.set(entry.name, entry);
        }
    }
    // Map iteration follows insertion order, i.e. first-occurrence order.
    return [...firstByName.values()];
}
@@ -0,0 +1,46 @@
1
+ /**
2
+ * In-process MCP server exposing the quality-benchmark execution
3
+ * primitives to the Claude Agent SDK session.
4
+ *
5
+ * The single SDK session reasons about the rubric in its system prompt
6
+ * and calls these tools to drive the 6-phase pipeline:
7
+ *
8
+ * list_applicable_tools(detected_context) — Phase 1 -> 2 handoff
9
+ * probe_tool(tool_id) — Phase 2
10
+ * install_tool(tool_id) — Phase 2.5
11
+ * run_tool(tool_id) — Phase 3 (returns parsed summary)
12
+ * verify_finding(file, line, snippet?) — Phase 5
13
+ * record_progress(phase, message) — UI streaming side channel
14
+ *
15
+ * Every command and install step is whitelisted by `tool-catalog.ts`
16
+ * and re-checked at runtime by `tool-runner.ts`. The MCP server is a
17
+ * thin adapter between the SDK's tool-call protocol and that runner —
18
+ * it adds no new privilege, no new commands, and no new side effects.
19
+ */
20
+ import { type RunnerContext } from './tool-runner.js';
21
+ import type { ParsedToolOutput, RunPhase, ToolRunOutput, UnavailableTool } from './types.js';
22
+ export interface QualityMcpDeps { // First argument of createQualityBenchmarkMcpServer.
23
+ /** Runner context shared across all tool calls in this benchmark run. */
24
+ runner: RunnerContext;
25
+ /** Optional callback for streaming progress updates to the UI / DB. */
26
+ onProgress?: (event: ProgressEvent) => void;
27
+ /** When false, install_tool always returns install_disabled. */
28
+ installEnabled: boolean;
29
+ }
30
+ export interface ProgressEvent { // One progress update passed to `onProgress` callbacks.
31
+ phase: RunPhase; // Pipeline phase the update belongs to.
32
+ message: string; // Human-readable text; loggers print it as `[phase] message`.
33
+ data?: Record<string, unknown>; // Optional extra payload; its contents are not specified here.
34
+ }
35
+ /** State accumulated across a single benchmark run. */
36
+ export interface RunState {
37
+ tool_versions: Record<string, string>; // Merged into the final report's `tool_versions`.
38
+ unavailable_tools: UnavailableTool[]; // Merged into the final report, de-duplicated by tool name.
39
+ tool_outputs: Record<string, ToolRunOutput>; // Merged into the final report's `tool_outputs`.
40
+ /** Cached parsed summaries for synthesis-time read-back. */
41
+ parsed_summaries: Record<string, ParsedToolOutput>;
42
+ /** Dropped finding count from Phase 5 verification. */
43
+ dropped_findings: number;
44
+ }
45
+ export declare function createEmptyRunState(): RunState; // Returns the RunState object that runQualityBenchmark creates once per run.
46
+ export declare function createQualityBenchmarkMcpServer(deps: QualityMcpDeps, state: RunState): import("@anthropic-ai/claude-agent-sdk").McpSdkServerConfigWithInstance; // The result is registered with the SDK session under the 'quality-benchmark' server key.