@bryan-thompson/inspector-assessment-client 1.11.0 → 1.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/assets/{OAuthCallback-DQS8ZWD9.js → OAuthCallback-DD8JgGmx.js} +1 -1
- package/dist/assets/{OAuthDebugCallback-CeG1zXQt.js → OAuthDebugCallback-CGeg00AP.js} +1 -1
- package/dist/assets/{index-C89qxkOz.js → index-sUICDw7A.js} +1194 -384
- package/dist/index.html +1 -1
- package/lib/lib/assessmentTypes.d.ts +108 -1
- package/lib/lib/assessmentTypes.d.ts.map +1 -1
- package/lib/lib/moduleScoring.d.ts +23 -0
- package/lib/lib/moduleScoring.d.ts.map +1 -0
- package/lib/lib/moduleScoring.js +53 -0
- package/lib/services/assessment/AssessmentOrchestrator.d.ts.map +1 -1
- package/lib/services/assessment/AssessmentOrchestrator.js +22 -37
- package/lib/services/assessment/TestDataGenerator.d.ts +22 -0
- package/lib/services/assessment/TestDataGenerator.d.ts.map +1 -1
- package/lib/services/assessment/TestDataGenerator.js +78 -0
- package/lib/services/assessment/config/annotationPatterns.d.ts +70 -0
- package/lib/services/assessment/config/annotationPatterns.d.ts.map +1 -0
- package/lib/services/assessment/config/annotationPatterns.js +305 -0
- package/lib/services/assessment/modules/FunctionalityAssessor.d.ts +15 -0
- package/lib/services/assessment/modules/FunctionalityAssessor.d.ts.map +1 -1
- package/lib/services/assessment/modules/FunctionalityAssessor.js +137 -6
- package/lib/services/assessment/modules/ToolAnnotationAssessor.d.ts +20 -2
- package/lib/services/assessment/modules/ToolAnnotationAssessor.d.ts.map +1 -1
- package/lib/services/assessment/modules/ToolAnnotationAssessor.js +266 -106
- package/package.json +1 -1
|
@@ -11,75 +11,24 @@
|
|
|
11
11
|
* Reference: Anthropic MCP Directory Policy #17
|
|
12
12
|
*/
|
|
13
13
|
import { BaseAssessor } from "./BaseAssessor.js";
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
const READ_ONLY_PATTERNS = [
|
|
18
|
-
/^get[_-]?/i,
|
|
19
|
-
/^list[_-]?/i,
|
|
20
|
-
/^fetch[_-]?/i,
|
|
21
|
-
/^read[_-]?/i,
|
|
22
|
-
/^query[_-]?/i,
|
|
23
|
-
/^search[_-]?/i,
|
|
24
|
-
/^find[_-]?/i,
|
|
25
|
-
/^show[_-]?/i,
|
|
26
|
-
/^view[_-]?/i,
|
|
27
|
-
/^describe[_-]?/i,
|
|
28
|
-
/^check[_-]?/i,
|
|
29
|
-
/^verify[_-]?/i,
|
|
30
|
-
/^validate[_-]?/i,
|
|
31
|
-
/^count[_-]?/i,
|
|
32
|
-
/^status[_-]?/i,
|
|
33
|
-
/^info[_-]?/i,
|
|
34
|
-
/^lookup[_-]?/i,
|
|
35
|
-
/^browse[_-]?/i,
|
|
36
|
-
/^preview[_-]?/i,
|
|
37
|
-
/^download[_-]?/i, // Downloads but doesn't modify server state
|
|
38
|
-
];
|
|
39
|
-
const DESTRUCTIVE_PATTERNS = [
|
|
40
|
-
/^delete[_-]?/i,
|
|
41
|
-
/^remove[_-]?/i,
|
|
42
|
-
/^destroy[_-]?/i,
|
|
43
|
-
/^drop[_-]?/i,
|
|
44
|
-
/^purge[_-]?/i,
|
|
45
|
-
/^clear[_-]?/i,
|
|
46
|
-
/^wipe[_-]?/i,
|
|
47
|
-
/^erase[_-]?/i,
|
|
48
|
-
/^reset[_-]?/i,
|
|
49
|
-
/^truncate[_-]?/i,
|
|
50
|
-
/^revoke[_-]?/i,
|
|
51
|
-
/^terminate[_-]?/i,
|
|
52
|
-
/^cancel[_-]?/i,
|
|
53
|
-
/^kill[_-]?/i,
|
|
54
|
-
/^force[_-]?/i,
|
|
55
|
-
];
|
|
56
|
-
const WRITE_PATTERNS = [
|
|
57
|
-
/^create[_-]?/i,
|
|
58
|
-
/^add[_-]?/i,
|
|
59
|
-
/^insert[_-]?/i,
|
|
60
|
-
/^update[_-]?/i,
|
|
61
|
-
/^modify[_-]?/i,
|
|
62
|
-
/^edit[_-]?/i,
|
|
63
|
-
/^change[_-]?/i,
|
|
64
|
-
/^set[_-]?/i,
|
|
65
|
-
/^put[_-]?/i,
|
|
66
|
-
/^patch[_-]?/i,
|
|
67
|
-
/^post[_-]?/i,
|
|
68
|
-
/^write[_-]?/i,
|
|
69
|
-
/^save[_-]?/i,
|
|
70
|
-
/^upload[_-]?/i,
|
|
71
|
-
/^send[_-]?/i,
|
|
72
|
-
/^submit[_-]?/i,
|
|
73
|
-
/^publish[_-]?/i,
|
|
74
|
-
/^enable[_-]?/i,
|
|
75
|
-
/^disable[_-]?/i,
|
|
76
|
-
/^start[_-]?/i,
|
|
77
|
-
/^stop[_-]?/i,
|
|
78
|
-
/^run[_-]?/i,
|
|
79
|
-
/^execute[_-]?/i,
|
|
80
|
-
];
|
|
14
|
+
import { getDefaultCompiledPatterns, matchToolPattern, } from "../config/annotationPatterns.js";
|
|
15
|
+
// NOTE: Pattern arrays moved to config/annotationPatterns.ts for configurability
|
|
16
|
+
// The patterns are now loaded from getDefaultCompiledPatterns() or custom config
|
|
81
17
|
export class ToolAnnotationAssessor extends BaseAssessor {
|
|
82
18
|
claudeBridge;
|
|
19
|
+
compiledPatterns;
|
|
20
|
+
constructor(config) {
|
|
21
|
+
super(config);
|
|
22
|
+
// Initialize with default patterns (can be overridden via setPatterns)
|
|
23
|
+
this.compiledPatterns = getDefaultCompiledPatterns();
|
|
24
|
+
}
|
|
25
|
+
/**
|
|
26
|
+
* Set custom compiled patterns for behavior inference
|
|
27
|
+
*/
|
|
28
|
+
setPatterns(patterns) {
|
|
29
|
+
this.compiledPatterns = patterns;
|
|
30
|
+
this.log("Custom annotation patterns configured");
|
|
31
|
+
}
|
|
83
32
|
/**
|
|
84
33
|
* Set Claude Code Bridge for enhanced behavior inference
|
|
85
34
|
*/
|
|
@@ -151,17 +100,115 @@ export class ToolAnnotationAssessor extends BaseAssessor {
|
|
|
151
100
|
misalignedAnnotationsCount++;
|
|
152
101
|
}
|
|
153
102
|
}
|
|
154
|
-
|
|
103
|
+
const latestResult = toolResults[toolResults.length - 1];
|
|
104
|
+
if (latestResult.hasAnnotations) {
|
|
155
105
|
annotatedCount++;
|
|
156
106
|
}
|
|
157
107
|
else {
|
|
158
108
|
missingAnnotationsCount++;
|
|
109
|
+
// Emit annotation_missing event with tool details
|
|
110
|
+
if (context.onProgress && latestResult.inferredBehavior) {
|
|
111
|
+
const annotations = this.extractAnnotations(tool);
|
|
112
|
+
context.onProgress({
|
|
113
|
+
type: "annotation_missing",
|
|
114
|
+
tool: tool.name,
|
|
115
|
+
title: annotations.title,
|
|
116
|
+
description: tool.description,
|
|
117
|
+
parameters: this.extractToolParams(tool.inputSchema),
|
|
118
|
+
inferredBehavior: {
|
|
119
|
+
expectedReadOnly: latestResult.inferredBehavior.expectedReadOnly,
|
|
120
|
+
expectedDestructive: latestResult.inferredBehavior.expectedDestructive,
|
|
121
|
+
reason: latestResult.inferredBehavior.reason,
|
|
122
|
+
},
|
|
123
|
+
});
|
|
124
|
+
}
|
|
125
|
+
}
|
|
126
|
+
// Emit appropriate event based on alignment status
|
|
127
|
+
if (context.onProgress && latestResult.inferredBehavior) {
|
|
128
|
+
const annotations = latestResult.annotations;
|
|
129
|
+
const inferred = latestResult.inferredBehavior;
|
|
130
|
+
const confidence = latestResult.claudeInference?.confidence ?? 50;
|
|
131
|
+
const toolParams = this.extractToolParams(tool.inputSchema);
|
|
132
|
+
const toolAnnotations = this.extractAnnotations(tool);
|
|
133
|
+
const alignmentStatus = latestResult.alignmentStatus;
|
|
134
|
+
// Check readOnlyHint mismatch
|
|
135
|
+
if (annotations?.readOnlyHint !== undefined &&
|
|
136
|
+
annotations.readOnlyHint !== inferred.expectedReadOnly) {
|
|
137
|
+
if (alignmentStatus === "REVIEW_RECOMMENDED") {
|
|
138
|
+
// Emit review_recommended for ambiguous cases
|
|
139
|
+
context.onProgress({
|
|
140
|
+
type: "annotation_review_recommended",
|
|
141
|
+
tool: tool.name,
|
|
142
|
+
title: toolAnnotations.title,
|
|
143
|
+
description: tool.description,
|
|
144
|
+
parameters: toolParams,
|
|
145
|
+
field: "readOnlyHint",
|
|
146
|
+
actual: annotations.readOnlyHint,
|
|
147
|
+
inferred: inferred.expectedReadOnly,
|
|
148
|
+
confidence: inferred.confidence,
|
|
149
|
+
isAmbiguous: inferred.isAmbiguous,
|
|
150
|
+
reason: inferred.reason,
|
|
151
|
+
});
|
|
152
|
+
}
|
|
153
|
+
else {
|
|
154
|
+
// Emit misaligned for high-confidence mismatches
|
|
155
|
+
context.onProgress({
|
|
156
|
+
type: "annotation_misaligned",
|
|
157
|
+
tool: tool.name,
|
|
158
|
+
title: toolAnnotations.title,
|
|
159
|
+
description: tool.description,
|
|
160
|
+
parameters: toolParams,
|
|
161
|
+
field: "readOnlyHint",
|
|
162
|
+
actual: annotations.readOnlyHint,
|
|
163
|
+
expected: inferred.expectedReadOnly,
|
|
164
|
+
confidence,
|
|
165
|
+
reason: `Tool has readOnlyHint=${annotations.readOnlyHint}, but ${inferred.reason}`,
|
|
166
|
+
});
|
|
167
|
+
}
|
|
168
|
+
}
|
|
169
|
+
// Check destructiveHint mismatch
|
|
170
|
+
if (annotations?.destructiveHint !== undefined &&
|
|
171
|
+
annotations.destructiveHint !== inferred.expectedDestructive) {
|
|
172
|
+
if (alignmentStatus === "REVIEW_RECOMMENDED") {
|
|
173
|
+
// Emit review_recommended for ambiguous cases
|
|
174
|
+
context.onProgress({
|
|
175
|
+
type: "annotation_review_recommended",
|
|
176
|
+
tool: tool.name,
|
|
177
|
+
title: toolAnnotations.title,
|
|
178
|
+
description: tool.description,
|
|
179
|
+
parameters: toolParams,
|
|
180
|
+
field: "destructiveHint",
|
|
181
|
+
actual: annotations.destructiveHint,
|
|
182
|
+
inferred: inferred.expectedDestructive,
|
|
183
|
+
confidence: inferred.confidence,
|
|
184
|
+
isAmbiguous: inferred.isAmbiguous,
|
|
185
|
+
reason: inferred.reason,
|
|
186
|
+
});
|
|
187
|
+
}
|
|
188
|
+
else {
|
|
189
|
+
// Emit misaligned for high-confidence mismatches
|
|
190
|
+
context.onProgress({
|
|
191
|
+
type: "annotation_misaligned",
|
|
192
|
+
tool: tool.name,
|
|
193
|
+
title: toolAnnotations.title,
|
|
194
|
+
description: tool.description,
|
|
195
|
+
parameters: toolParams,
|
|
196
|
+
field: "destructiveHint",
|
|
197
|
+
actual: annotations.destructiveHint,
|
|
198
|
+
expected: inferred.expectedDestructive,
|
|
199
|
+
confidence,
|
|
200
|
+
reason: `Tool has destructiveHint=${annotations.destructiveHint}, but ${inferred.reason}`,
|
|
201
|
+
});
|
|
202
|
+
}
|
|
203
|
+
}
|
|
159
204
|
}
|
|
160
205
|
}
|
|
161
206
|
const status = this.determineAnnotationStatus(toolResults, context.tools.length);
|
|
162
207
|
const explanation = this.generateExplanation(annotatedCount, missingAnnotationsCount, misalignedAnnotationsCount, context.tools.length);
|
|
163
208
|
const recommendations = this.generateRecommendations(toolResults);
|
|
164
|
-
|
|
209
|
+
// Calculate new metrics and alignment breakdown
|
|
210
|
+
const { metrics, alignmentBreakdown } = this.calculateMetrics(toolResults, context.tools.length);
|
|
211
|
+
this.log(`Assessment complete: ${annotatedCount}/${context.tools.length} tools annotated, ${misalignedAnnotationsCount} misaligned, ${alignmentBreakdown.reviewRecommended} need review`);
|
|
165
212
|
// Return enhanced assessment if Claude was used
|
|
166
213
|
if (useClaudeInference) {
|
|
167
214
|
const highConfidenceMisalignments = toolResults.filter((r) => r.claudeInference &&
|
|
@@ -176,6 +223,8 @@ export class ToolAnnotationAssessor extends BaseAssessor {
|
|
|
176
223
|
status,
|
|
177
224
|
explanation: this.generateEnhancedExplanation(annotatedCount, missingAnnotationsCount, highConfidenceMisalignments.length, context.tools.length),
|
|
178
225
|
recommendations: this.generateEnhancedRecommendations(toolResults),
|
|
226
|
+
metrics,
|
|
227
|
+
alignmentBreakdown,
|
|
179
228
|
claudeEnhanced: true,
|
|
180
229
|
highConfidenceMisalignments,
|
|
181
230
|
};
|
|
@@ -188,6 +237,8 @@ export class ToolAnnotationAssessor extends BaseAssessor {
|
|
|
188
237
|
status,
|
|
189
238
|
explanation,
|
|
190
239
|
recommendations,
|
|
240
|
+
metrics,
|
|
241
|
+
alignmentBreakdown,
|
|
191
242
|
};
|
|
192
243
|
}
|
|
193
244
|
/**
|
|
@@ -376,6 +427,7 @@ export class ToolAnnotationAssessor extends BaseAssessor {
|
|
|
376
427
|
}
|
|
377
428
|
/**
|
|
378
429
|
* Assess a single tool's annotations
|
|
430
|
+
* Now includes alignment status with confidence-aware logic
|
|
379
431
|
*/
|
|
380
432
|
assessTool(tool) {
|
|
381
433
|
const issues = [];
|
|
@@ -386,35 +438,65 @@ export class ToolAnnotationAssessor extends BaseAssessor {
|
|
|
386
438
|
annotations.destructiveHint !== undefined;
|
|
387
439
|
// Infer expected behavior from tool name
|
|
388
440
|
const inferredBehavior = this.inferBehavior(tool.name, tool.description);
|
|
441
|
+
// Determine alignment status
|
|
442
|
+
let alignmentStatus = "ALIGNED";
|
|
389
443
|
// Check for missing annotations
|
|
390
444
|
if (!hasAnnotations) {
|
|
391
445
|
issues.push("Missing tool annotations (readOnlyHint, destructiveHint)");
|
|
392
446
|
recommendations.push(`Add annotations to ${tool.name}: readOnlyHint=${inferredBehavior.expectedReadOnly}, destructiveHint=${inferredBehavior.expectedDestructive}`);
|
|
447
|
+
alignmentStatus = "UNKNOWN";
|
|
393
448
|
}
|
|
394
449
|
else {
|
|
395
|
-
// Check for misaligned annotations
|
|
396
|
-
|
|
397
|
-
annotations.readOnlyHint !== inferredBehavior.expectedReadOnly
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
450
|
+
// Check for misaligned annotations with confidence-aware logic
|
|
451
|
+
const readOnlyMismatch = annotations.readOnlyHint !== undefined &&
|
|
452
|
+
annotations.readOnlyHint !== inferredBehavior.expectedReadOnly;
|
|
453
|
+
const destructiveMismatch = annotations.destructiveHint !== undefined &&
|
|
454
|
+
annotations.destructiveHint !== inferredBehavior.expectedDestructive;
|
|
455
|
+
if (readOnlyMismatch || destructiveMismatch) {
|
|
456
|
+
if (inferredBehavior.isAmbiguous ||
|
|
457
|
+
inferredBehavior.confidence === "low") {
|
|
458
|
+
// Ambiguous case: REVIEW_RECOMMENDED, softer language
|
|
459
|
+
alignmentStatus = "REVIEW_RECOMMENDED";
|
|
460
|
+
if (readOnlyMismatch) {
|
|
461
|
+
issues.push(`Review recommended: readOnlyHint=${annotations.readOnlyHint} may or may not match '${tool.name}' behavior (confidence: ${inferredBehavior.confidence})`);
|
|
462
|
+
recommendations.push(`Verify readOnlyHint for ${tool.name}: pattern is ambiguous - manual review recommended`);
|
|
463
|
+
}
|
|
464
|
+
if (destructiveMismatch) {
|
|
465
|
+
issues.push(`Review recommended: destructiveHint=${annotations.destructiveHint} may or may not match '${tool.name}' behavior (confidence: ${inferredBehavior.confidence})`);
|
|
466
|
+
recommendations.push(`Verify destructiveHint for ${tool.name}: pattern is ambiguous - manual review recommended`);
|
|
467
|
+
}
|
|
468
|
+
}
|
|
469
|
+
else {
|
|
470
|
+
// High/medium confidence mismatch: MISALIGNED
|
|
471
|
+
alignmentStatus = "MISALIGNED";
|
|
472
|
+
if (readOnlyMismatch) {
|
|
473
|
+
issues.push(`Potentially misaligned readOnlyHint: set to ${annotations.readOnlyHint}, expected ${inferredBehavior.expectedReadOnly} based on tool name pattern`);
|
|
474
|
+
recommendations.push(`Verify readOnlyHint for ${tool.name}: currently ${annotations.readOnlyHint}, tool name suggests ${inferredBehavior.expectedReadOnly}`);
|
|
475
|
+
}
|
|
476
|
+
if (destructiveMismatch) {
|
|
477
|
+
issues.push(`Potentially misaligned destructiveHint: set to ${annotations.destructiveHint}, expected ${inferredBehavior.expectedDestructive} based on tool name pattern`);
|
|
478
|
+
recommendations.push(`Verify destructiveHint for ${tool.name}: currently ${annotations.destructiveHint}, tool name suggests ${inferredBehavior.expectedDestructive}`);
|
|
479
|
+
}
|
|
480
|
+
}
|
|
405
481
|
}
|
|
406
482
|
}
|
|
407
|
-
// Check for destructive tools without explicit hint
|
|
483
|
+
// Check for destructive tools without explicit hint (only for high-confidence patterns)
|
|
408
484
|
if (inferredBehavior.expectedDestructive &&
|
|
485
|
+
inferredBehavior.confidence !== "low" &&
|
|
409
486
|
annotations.destructiveHint !== true) {
|
|
410
487
|
issues.push("Tool appears destructive but destructiveHint is not set to true");
|
|
411
488
|
recommendations.push(`Set destructiveHint=true for ${tool.name} - this tool appears to perform destructive operations`);
|
|
489
|
+
// Only upgrade to MISALIGNED if we have high confidence
|
|
490
|
+
if (inferredBehavior.confidence === "high") {
|
|
491
|
+
alignmentStatus = "MISALIGNED";
|
|
492
|
+
}
|
|
412
493
|
}
|
|
413
494
|
return {
|
|
414
495
|
toolName: tool.name,
|
|
415
496
|
hasAnnotations,
|
|
416
497
|
annotations: hasAnnotations ? annotations : undefined,
|
|
417
498
|
inferredBehavior,
|
|
499
|
+
alignmentStatus,
|
|
418
500
|
issues,
|
|
419
501
|
recommendations,
|
|
420
502
|
};
|
|
@@ -452,48 +534,85 @@ export class ToolAnnotationAssessor extends BaseAssessor {
|
|
|
452
534
|
openWorldHint,
|
|
453
535
|
};
|
|
454
536
|
}
|
|
537
|
+
/**
|
|
538
|
+
* Extract parameters from tool input schema for event emission
|
|
539
|
+
*/
|
|
540
|
+
extractToolParams(schema) {
|
|
541
|
+
if (!schema || typeof schema !== "object")
|
|
542
|
+
return [];
|
|
543
|
+
const s = schema;
|
|
544
|
+
if (!s.properties || typeof s.properties !== "object")
|
|
545
|
+
return [];
|
|
546
|
+
const required = new Set(Array.isArray(s.required) ? s.required : []);
|
|
547
|
+
const properties = s.properties;
|
|
548
|
+
return Object.entries(properties).map(([name, prop]) => {
|
|
549
|
+
const param = {
|
|
550
|
+
name,
|
|
551
|
+
type: prop.type || "any",
|
|
552
|
+
required: required.has(name),
|
|
553
|
+
};
|
|
554
|
+
if (prop.description) {
|
|
555
|
+
param.description = prop.description;
|
|
556
|
+
}
|
|
557
|
+
return param;
|
|
558
|
+
});
|
|
559
|
+
}
|
|
455
560
|
/**
|
|
456
561
|
* Infer expected behavior from tool name and description
|
|
562
|
+
* Now returns confidence level and ambiguity flag for better handling
|
|
457
563
|
*/
|
|
458
564
|
inferBehavior(toolName, description) {
|
|
459
|
-
const lowerName = toolName.toLowerCase();
|
|
460
565
|
const lowerDesc = (description || "").toLowerCase();
|
|
461
|
-
//
|
|
462
|
-
|
|
463
|
-
|
|
566
|
+
// Use the configurable pattern matching system
|
|
567
|
+
const patternMatch = matchToolPattern(toolName, this.compiledPatterns);
|
|
568
|
+
// Handle pattern match results
|
|
569
|
+
switch (patternMatch.category) {
|
|
570
|
+
case "ambiguous":
|
|
571
|
+
// Ambiguous patterns - don't make strong assertions
|
|
572
|
+
return {
|
|
573
|
+
expectedReadOnly: false,
|
|
574
|
+
expectedDestructive: false,
|
|
575
|
+
reason: `Tool name matches ambiguous pattern '${patternMatch.pattern}' - behavior varies by implementation context`,
|
|
576
|
+
confidence: "low",
|
|
577
|
+
isAmbiguous: true,
|
|
578
|
+
};
|
|
579
|
+
case "destructive":
|
|
464
580
|
return {
|
|
465
581
|
expectedReadOnly: false,
|
|
466
582
|
expectedDestructive: true,
|
|
467
|
-
reason: `Tool name matches destructive pattern: ${pattern
|
|
583
|
+
reason: `Tool name matches destructive pattern: ${patternMatch.pattern}`,
|
|
584
|
+
confidence: "high",
|
|
585
|
+
isAmbiguous: false,
|
|
468
586
|
};
|
|
469
|
-
|
|
470
|
-
}
|
|
471
|
-
// Check for read-only patterns
|
|
472
|
-
for (const pattern of READ_ONLY_PATTERNS) {
|
|
473
|
-
if (pattern.test(lowerName)) {
|
|
587
|
+
case "readOnly":
|
|
474
588
|
return {
|
|
475
589
|
expectedReadOnly: true,
|
|
476
590
|
expectedDestructive: false,
|
|
477
|
-
reason: `Tool name matches read-only pattern: ${pattern
|
|
591
|
+
reason: `Tool name matches read-only pattern: ${patternMatch.pattern}`,
|
|
592
|
+
confidence: "high",
|
|
593
|
+
isAmbiguous: false,
|
|
478
594
|
};
|
|
479
|
-
|
|
480
|
-
}
|
|
481
|
-
// Check for write patterns (not destructive but not read-only)
|
|
482
|
-
for (const pattern of WRITE_PATTERNS) {
|
|
483
|
-
if (pattern.test(lowerName)) {
|
|
595
|
+
case "write":
|
|
484
596
|
return {
|
|
485
597
|
expectedReadOnly: false,
|
|
486
598
|
expectedDestructive: false,
|
|
487
|
-
reason: `Tool name matches write pattern: ${pattern
|
|
599
|
+
reason: `Tool name matches write pattern: ${patternMatch.pattern}`,
|
|
600
|
+
confidence: "medium",
|
|
601
|
+
isAmbiguous: false,
|
|
488
602
|
};
|
|
489
|
-
|
|
603
|
+
case "unknown":
|
|
604
|
+
default:
|
|
605
|
+
// Fall through to description-based analysis
|
|
606
|
+
break;
|
|
490
607
|
}
|
|
491
|
-
// Check description for hints
|
|
608
|
+
// Check description for hints (medium confidence)
|
|
492
609
|
if (lowerDesc.includes("delete") || lowerDesc.includes("remove")) {
|
|
493
610
|
return {
|
|
494
611
|
expectedReadOnly: false,
|
|
495
612
|
expectedDestructive: true,
|
|
496
613
|
reason: "Description mentions delete/remove operations",
|
|
614
|
+
confidence: "medium",
|
|
615
|
+
isAmbiguous: false,
|
|
497
616
|
};
|
|
498
617
|
}
|
|
499
618
|
if (lowerDesc.includes("read") ||
|
|
@@ -503,30 +622,43 @@ export class ToolAnnotationAssessor extends BaseAssessor {
|
|
|
503
622
|
expectedReadOnly: true,
|
|
504
623
|
expectedDestructive: false,
|
|
505
624
|
reason: "Description suggests read-only operation",
|
|
625
|
+
confidence: "medium",
|
|
626
|
+
isAmbiguous: false,
|
|
506
627
|
};
|
|
507
628
|
}
|
|
508
|
-
// Default: assume write
|
|
629
|
+
// Default: assume write with low confidence (ambiguous)
|
|
509
630
|
return {
|
|
510
631
|
expectedReadOnly: false,
|
|
511
632
|
expectedDestructive: false,
|
|
512
633
|
reason: "Could not infer from name pattern - defaulting to write operation",
|
|
634
|
+
confidence: "low",
|
|
635
|
+
isAmbiguous: true,
|
|
513
636
|
};
|
|
514
637
|
}
|
|
515
638
|
/**
|
|
516
|
-
* Determine overall status
|
|
639
|
+
* Determine overall status using alignment status.
|
|
640
|
+
* Only MISALIGNED counts as failure; REVIEW_RECOMMENDED does not fail.
|
|
517
641
|
*/
|
|
518
642
|
determineAnnotationStatus(results, totalTools) {
|
|
519
643
|
if (totalTools === 0)
|
|
520
644
|
return "PASS";
|
|
521
645
|
const annotatedCount = results.filter((r) => r.hasAnnotations).length;
|
|
522
|
-
|
|
523
|
-
const
|
|
524
|
-
//
|
|
646
|
+
// Only count actual MISALIGNED, not REVIEW_RECOMMENDED
|
|
647
|
+
const misalignedCount = results.filter((r) => r.alignmentStatus === "MISALIGNED").length;
|
|
648
|
+
// Count high-confidence destructive tools without proper hints
|
|
649
|
+
const destructiveWithoutHint = results.filter((r) => r.inferredBehavior?.expectedDestructive === true &&
|
|
650
|
+
r.inferredBehavior?.confidence === "high" &&
|
|
651
|
+
r.annotations?.destructiveHint !== true).length;
|
|
652
|
+
// Destructive tools without proper hints = FAIL (critical safety issue)
|
|
525
653
|
if (destructiveWithoutHint > 0) {
|
|
526
654
|
return "FAIL";
|
|
527
655
|
}
|
|
528
|
-
//
|
|
529
|
-
if (
|
|
656
|
+
// High-confidence misalignments = FAIL
|
|
657
|
+
if (misalignedCount > 0) {
|
|
658
|
+
return "FAIL";
|
|
659
|
+
}
|
|
660
|
+
// All tools annotated = PASS
|
|
661
|
+
if (annotatedCount === totalTools) {
|
|
530
662
|
return "PASS";
|
|
531
663
|
}
|
|
532
664
|
// Some annotations missing = NEED_MORE_INFO
|
|
@@ -540,6 +672,34 @@ export class ToolAnnotationAssessor extends BaseAssessor {
|
|
|
540
672
|
}
|
|
541
673
|
return "NEED_MORE_INFO";
|
|
542
674
|
}
|
|
675
|
+
/**
|
|
676
|
+
* Calculate metrics and alignment breakdown for the assessment
|
|
677
|
+
*/
|
|
678
|
+
calculateMetrics(results, totalTools) {
|
|
679
|
+
const alignmentBreakdown = {
|
|
680
|
+
aligned: results.filter((r) => r.alignmentStatus === "ALIGNED").length,
|
|
681
|
+
misaligned: results.filter((r) => r.alignmentStatus === "MISALIGNED")
|
|
682
|
+
.length,
|
|
683
|
+
reviewRecommended: results.filter((r) => r.alignmentStatus === "REVIEW_RECOMMENDED").length,
|
|
684
|
+
unknown: results.filter((r) => r.alignmentStatus === "UNKNOWN").length,
|
|
685
|
+
};
|
|
686
|
+
const annotatedCount = results.filter((r) => r.hasAnnotations).length;
|
|
687
|
+
const metrics = {
|
|
688
|
+
// Coverage: percentage of tools with annotations
|
|
689
|
+
coverage: totalTools > 0 ? (annotatedCount / totalTools) * 100 : 100,
|
|
690
|
+
// Consistency: percentage without contradictions (not MISALIGNED)
|
|
691
|
+
consistency: totalTools > 0
|
|
692
|
+
? ((totalTools - alignmentBreakdown.misaligned) / totalTools) * 100
|
|
693
|
+
: 100,
|
|
694
|
+
// Correctness: percentage of annotated tools that are ALIGNED
|
|
695
|
+
correctness: annotatedCount > 0
|
|
696
|
+
? (alignmentBreakdown.aligned / annotatedCount) * 100
|
|
697
|
+
: 0,
|
|
698
|
+
// Review required: count of tools needing manual review
|
|
699
|
+
reviewRequired: alignmentBreakdown.reviewRecommended,
|
|
700
|
+
};
|
|
701
|
+
return { metrics, alignmentBreakdown };
|
|
702
|
+
}
|
|
543
703
|
/**
|
|
544
704
|
* Generate explanation
|
|
545
705
|
*/
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@bryan-thompson/inspector-assessment-client",
|
|
3
|
-
"version": "1.
|
|
3
|
+
"version": "1.12.0",
|
|
4
4
|
"description": "Client-side application for the Enhanced MCP Inspector with assessment capabilities",
|
|
5
5
|
"license": "MIT",
|
|
6
6
|
"author": "Bryan Thompson <bryan@triepod.ai>",
|