edsger 0.55.4 → 0.56.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,252 @@
1
+ /**
2
+ * In-process MCP server exposing the quality-benchmark execution
3
+ * primitives to the Claude Agent SDK session.
4
+ *
5
+ * The single SDK session reasons about the rubric in its system prompt
6
+ * and calls these tools to drive the 6-phase pipeline:
7
+ *
8
+ * list_applicable_tools(detected_context) — Phase 1 -> 2 handoff
9
+ * probe_tool(tool_id) — Phase 2
10
+ * install_tool(tool_id) — Phase 2.5
11
+ * run_tool(tool_id) — Phase 3 (returns parsed summary)
12
+ * verify_finding(file, line, snippet?) — Phase 5
13
+ * record_progress(phase, message) — UI streaming side channel
14
+ *
15
+ * Every command and install step is whitelisted by `tool-catalog.ts`
16
+ * and re-checked at runtime by `tool-runner.ts`. The MCP server is a
17
+ * thin adapter between the SDK's tool-call protocol and that runner —
18
+ * it adds no new privilege, no new commands, and no new side effects.
19
+ */
20
+ import { existsSync, readFileSync, statSync } from 'node:fs';
21
+ import { join } from 'node:path';
22
+ import { createSdkMcpServer, tool } from '@anthropic-ai/claude-agent-sdk';
23
+ import { z } from 'zod';
24
+ import { selectToolsForContext, TOOL_CATALOG_BY_ID } from './tool-catalog.js';
25
+ import { executeTool, installFailureToUnavailable, installTool, probeTool, probeToUnavailable, } from './tool-runner.js';
26
/**
 * Create the blank bookkeeping record for one benchmark run.
 *
 * The tool handlers built by `createQualityBenchmarkMcpServer` mutate this
 * record in place: they store versions in `tool_versions`, append to
 * `unavailable_tools`, store per-tool results in `tool_outputs` and
 * `parsed_summaries`, and increment `dropped_findings`.
 *
 * Each call builds new containers, so two run states never share a map or
 * an array.
 *
 * @returns {{
 *   tool_versions: Object,
 *   unavailable_tools: Array,
 *   tool_outputs: Object,
 *   parsed_summaries: Object,
 *   dropped_findings: number
 * }} A fresh, empty run state.
 */
export function createEmptyRunState() {
    const blankState = {
        tool_versions: {},
        unavailable_tools: [],
        tool_outputs: {},
        parsed_summaries: {},
        dropped_findings: 0,
    };
    return blankState;
}
35
+ // ---------------------------------------------------------------------------
36
+ // Helpers
37
+ // ---------------------------------------------------------------------------
38
/**
 * Wrap a payload in the `{ content: [{ type: 'text', text }] }` shape that
 * the tool handlers in this module return.
 *
 * Strings are passed through untouched; everything else is serialised with
 * `JSON.stringify`. `JSON.stringify` yields `undefined` (not a string) for
 * `undefined`, functions and symbols, so those values fall back to
 * `String(data)` to guarantee that `text` is always a string.
 *
 * @param {*} data Payload to send back to the caller.
 * @param {boolean} [isError=false] When true, the result additionally
 *   carries `isError: true`; otherwise the key is omitted entirely.
 * @returns {{ content: Array<{ type: string, text: string }>, isError?: boolean }}
 */
function textResult(data, isError = false) {
    const text = typeof data === 'string'
        ? data
        : JSON.stringify(data) ?? String(data);
    return {
        content: [{ type: 'text', text }],
        // Only attach the flag on failures so success payloads stay minimal.
        ...(isError ? { isError: true } : {}),
    };
}
45
+ function emit(deps, event) {
46
+ deps.onProgress?.(event);
47
+ }
48
+ // ---------------------------------------------------------------------------
49
+ // Server factory
50
+ // ---------------------------------------------------------------------------
51
/**
 * Build the in-process MCP server that exposes the benchmark primitives
 * (`list_applicable_tools`, `probe_tool`, `install_tool`, `run_tool`,
 * `verify_finding`, `record_progress`) as SDK tools.
 *
 * @param deps Dependency bag. `deps.runner` is handed to `probeTool`,
 *   `installTool` and `executeTool`, and `deps.runner.repo_root` is the
 *   base directory for `verify_finding`. The optional `deps.onProgress`
 *   callback receives `{ phase, message, data? }` events.
 * @param state Mutable run state (same shape as `createEmptyRunState()`).
 *   Handlers update it in place: `tool_versions`, `unavailable_tools`,
 *   `tool_outputs`, `parsed_summaries` and `dropped_findings`.
 * @returns The value returned by `createSdkMcpServer` for the six tools.
 */
export function createQualityBenchmarkMcpServer(deps, state) {
    // Single definition of the "not in the catalog" error payload used by
    // every handler that takes a tool_id.
    const unknownTool = (toolId) => textResult({ error: `Unknown tool_id: ${toolId}` }, true);
    const listApplicable = tool('list_applicable_tools', 'List the catalog tools that apply to the detected repo. Pass the languages / package-managers / frameworks you identified in Phase 1; the server returns the subset of tools whose `applies_to` and `requires` gates are satisfied. Use this to decide what to probe in Phase 2.', {
        languages: z
            .array(z.string())
            .describe('Detected language tags (e.g. ["ts","py"]).'),
        package_managers: z
            .array(z.string())
            .optional()
            .describe('Detected package managers (npm/pnpm/yarn/...).'),
        frameworks: z
            .array(z.string())
            .optional()
            .describe('Detected frameworks (rails/django/...).'),
        files_present: z
            .array(z.string())
            .optional()
            .describe('Manifest / config filenames present at the repo root (e.g. "tsconfig.json").'),
    }, async (args) => {
        const tools = selectToolsForContext({
            languages: args.languages,
            package_managers: args.package_managers,
            frameworks: args.frameworks,
            files_present: args.files_present,
        });
        // Return only the fields listed here rather than whole catalog entries.
        return textResult({
            tools: tools.map((t) => ({
                id: t.id,
                label: t.label,
                category: t.category,
                install_prereq: t.install_prereq,
                subscores: t.subscores,
                requires: t.requires,
            })),
        });
    });
    const probe = tool('probe_tool', 'Probe whether a tool from the catalog is installed and reachable. Returns availability + version + the install command that would install it. Use this in Phase 2 before deciding to install / skip.', {
        tool_id: z.string().describe('Catalog tool id (e.g. "semgrep").'),
    }, async (args) => {
        if (!TOOL_CATALOG_BY_ID.has(args.tool_id)) {
            return unknownTool(args.tool_id);
        }
        const result = await probeTool(args.tool_id, deps.runner);
        if (result.available && result.version) {
            state.tool_versions[args.tool_id] = result.version;
        }
        emit(deps, {
            phase: 'probing',
            message: `${args.tool_id}: ${result.available ? `available (${result.version ?? '?'})` : 'not found'}`,
            data: { tool_id: args.tool_id, available: result.available },
        });
        return textResult(result);
    });
    const install = tool('install_tool', 'Install a missing catalog tool to user-space. Only the install command registered in the catalog is allowed — `sudo`, `apt`, `brew`, `yum`, etc. are rejected. Returns {installed, version, error?}. If the user has disabled installation (--no-install), this returns install_disabled.', {
        tool_id: z.string().describe('Catalog tool id to install.'),
    }, async (args) => {
        if (!TOOL_CATALOG_BY_ID.has(args.tool_id)) {
            return unknownTool(args.tool_id);
        }
        emit(deps, {
            phase: 'installation',
            message: `Installing ${args.tool_id}...`,
            data: { tool_id: args.tool_id },
        });
        const result = await installTool(args.tool_id, deps.runner);
        if (result.installed) {
            // A successful install that reports no version string is still a
            // success: skip the version bookkeeping, but do not record the
            // tool as unavailable.
            if (result.version) {
                state.tool_versions[args.tool_id] = result.version;
            }
        }
        else {
            state.unavailable_tools.push(installFailureToUnavailable(args.tool_id, result));
        }
        emit(deps, {
            phase: 'installation',
            message: `${args.tool_id}: ${result.installed ? `installed (${result.version ?? '?'})` : `failed (${result.error ?? 'unknown'})`}`,
        });
        return textResult(result);
    });
    const run = tool('run_tool', 'Execute a catalog tool against the repo. Returns the **parsed summary only** (counts / top-N findings / metrics) — never raw output, which is saved to disk and referenced by tool_outputs.raw_output_path. The summary is one of three shapes: counts (style linters), findings (security/correctness), or metrics (LOC/complexity).', {
        tool_id: z.string().describe('Catalog tool id to run.'),
    }, async (args) => {
        if (!TOOL_CATALOG_BY_ID.has(args.tool_id)) {
            return unknownTool(args.tool_id);
        }
        // No version recorded yet for this tool: probe on demand instead of
        // requiring a prior probe_tool call, and skip the run when the probe
        // reports the tool as missing.
        if (!state.tool_versions[args.tool_id]) {
            const probed = await probeTool(args.tool_id, deps.runner);
            if (!probed.available) {
                state.unavailable_tools.push(probeToUnavailable(args.tool_id, probed, 'not_found'));
                emit(deps, {
                    phase: 'execution',
                    message: `${args.tool_id}: skipped (not installed)`,
                });
                return textResult({
                    ran: false,
                    reason: 'not_available',
                    install_command: probed.install_command,
                });
            }
            if (probed.version) {
                state.tool_versions[args.tool_id] = probed.version;
            }
        }
        emit(deps, {
            phase: 'execution',
            message: `Running ${args.tool_id}...`,
            data: { tool_id: args.tool_id },
        });
        const { parsed, run: runMeta, ok } = await executeTool(args.tool_id, deps.runner);
        state.parsed_summaries[args.tool_id] = parsed;
        state.tool_outputs[args.tool_id] = runMeta;
        emit(deps, {
            phase: 'execution',
            message: `${args.tool_id}: ${parsed.oneliner}`,
        });
        // Reply with the parsed summary plus a few run-metadata fields only.
        return textResult({
            ran: true,
            ok,
            oneliner: parsed.oneliner,
            summary: parsed.summary,
            meta: {
                duration_ms: runMeta.duration_ms,
                exit_code: runMeta.exit_code,
                raw_output_path: runMeta.raw_output_path,
            },
        });
    });
    const verify = tool('verify_finding', 'Validate that a `file:line` claim is real before including it as evidence in the final report. Returns { exists, line_in_range, snippet_matches } so the LLM can drop hallucinated entries. Used during Phase 5.', {
        file: z.string().describe('Repo-relative path of the file.'),
        line: z.number().int().min(1).describe('1-based line number.'),
        snippet: z
            .string()
            .optional()
            .describe('Optional snippet that should appear within +/-3 lines of `line`.'),
    }, async (args) => {
        const result = verifyFinding(deps.runner.repo_root, args);
        // Count every claim that failed at least one check. An absent
        // `snippet_matches` (no snippet supplied) does not count as a failure.
        if (!result.exists ||
            !result.line_in_range ||
            result.snippet_matches === false) {
            state.dropped_findings += 1;
        }
        return textResult(result);
    });
    const progress = tool('record_progress', 'Send a progress message to the UI (and the DB). Does not affect scoring. Use it to keep users informed during long phases.', {
        phase: z
            .enum([
            'detection',
            'probing',
            'installation',
            'execution',
            'external_signals',
            'verification',
            'synthesis',
        ])
            .describe('Which phase the message belongs to.'),
        message: z.string().describe('Human-readable status update.'),
    }, async (args) => {
        emit(deps, { phase: args.phase, message: args.message });
        return textResult({ acknowledged: true });
    });
    return createSdkMcpServer({
        name: 'quality-benchmark',
        version: '1.0.0',
        tools: [listApplicable, probe, install, run, verify, progress],
    });
}
217
/**
 * Check whether a `file:line` claim (and optionally a code snippet) refers
 * to real content under the repository root.
 *
 * @param {string} repoRoot Directory that `args.file` is resolved against.
 * @param {{ file: string, line: number, snippet?: string }} args
 *   `file` is a path relative to `repoRoot`, `line` is 1-based, and
 *   `snippet` (optional) is text expected within 3 lines of `line`.
 * @returns {{ exists: boolean, line_in_range: boolean, snippet_matches?: boolean }}
 *   `exists` is true only for a regular file located under `repoRoot`.
 *   `snippet_matches` is present only when a non-blank snippet was given.
 */
function verifyFinding(repoRoot, args) {
    const notFound = () => ({ exists: false, line_in_range: false });
    // The path comes from the model, so refuse anything that climbs out of
    // the repo ("../x", "a/../../x"). `join` with a single argument only
    // normalises the path; a leading ".." segment afterwards means it
    // escapes. Symlinks inside the repo are not resolved here.
    if (/^\.\.(?:[\\/]|$)/.test(join(args.file))) {
        return notFound();
    }
    const abs = join(repoRoot, args.file);
    if (!existsSync(abs)) {
        return notFound();
    }
    let stat;
    try {
        stat = statSync(abs);
    }
    catch {
        return notFound();
    }
    // Directories and other non-regular entries cannot carry a line number.
    if (!stat.isFile()) {
        return notFound();
    }
    let content;
    try {
        content = readFileSync(abs, 'utf8');
    }
    catch {
        // The file is there but unreadable: report it as existing with no
        // verifiable line.
        return { exists: true, line_in_range: false };
    }
    const lines = content.split(/\r?\n/);
    const lineInRange = args.line >= 1 && args.line <= lines.length;
    // Normalise CRLF so a multi-line snippet can match the "\n"-joined
    // window, then compare on the first 80 characters of the trimmed text.
    const needle = (args.snippet ?? '')
        .replace(/\r\n/g, '\n')
        .trim()
        .slice(0, 80);
    // A blank snippet would match any text, so treat it as "no snippet".
    if (!needle) {
        return { exists: true, line_in_range: lineInRange };
    }
    if (!lineInRange) {
        return { exists: true, line_in_range: false, snippet_matches: false };
    }
    // Window of line-3 .. line+3 (1-based, inclusive), clamped to the file.
    const start = Math.max(0, args.line - 4);
    const end = Math.min(lines.length, args.line + 3);
    const window = lines.slice(start, end).join('\n');
    const snippet_matches = window.includes(needle);
    return { exists: true, line_in_range: true, snippet_matches };
}
@@ -0,0 +1,22 @@
1
+ /**
2
+ * Tool output parsers. Each parser extracts the minimum information the
3
+ * LLM needs to score the dimension — either counts (Tier 1), counts plus
4
+ * top findings (Tier 2), or domain-specific metrics (Tier 3).
5
+ *
6
+ * Full raw outputs are saved to disk by the tool-runner and never enter
7
+ * the LLM context — these parsers operate on the stdout/stderr strings
8
+ * captured during execution.
9
+ *
10
+ * Severity mapping:
11
+ * tool-specific → our 4-tier enum (critical | high | medium | low)
12
+ *
13
+ * All parsers must be:
14
+ * - Defensive: never throw. Return `parsed: false` (via a counts-zero
15
+ * summary) if the output is malformed.
16
+ * - Cheap: simple JSON parse + small map. No regex over megabytes.
17
+ * - Stable: same input → same output (no randomness, no clocks).
18
+ */
19
+ import type { ParsedToolOutput, ParserContext, ParserFn } from './types.js';
20
/**
 * Registry of parser functions, indexed by a string key.
 * NOTE(review): keys are presumably catalog tool ids — confirm against the
 * implementation module.
 */
export declare const PARSERS: Record<string, ParserFn>;
/**
 * Run the parser for a tool, defensively swallowing errors.
 *
 * @param toolId Identifier of the tool whose output is being parsed.
 * @param stdout Captured standard output of the tool run.
 * @param stderr Captured standard error of the tool run.
 * @param ctx Parser context forwarded alongside the captured output.
 * @returns The parsed representation of the tool output.
 */
export declare function parseToolOutput(
    toolId: string,
    stdout: string,
    stderr: string,
    ctx: ParserContext,
): ParsedToolOutput;