@dotsetlabs/bellwether 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +291 -0
- package/LICENSE +21 -0
- package/README.md +739 -0
- package/dist/auth/credentials.d.ts +64 -0
- package/dist/auth/credentials.js +218 -0
- package/dist/auth/index.d.ts +6 -0
- package/dist/auth/index.js +6 -0
- package/dist/auth/keychain.d.ts +64 -0
- package/dist/auth/keychain.js +268 -0
- package/dist/baseline/ab-testing.d.ts +80 -0
- package/dist/baseline/ab-testing.js +236 -0
- package/dist/baseline/ai-compatibility-scorer.d.ts +95 -0
- package/dist/baseline/ai-compatibility-scorer.js +606 -0
- package/dist/baseline/calibration.d.ts +77 -0
- package/dist/baseline/calibration.js +136 -0
- package/dist/baseline/category-matching.d.ts +85 -0
- package/dist/baseline/category-matching.js +289 -0
- package/dist/baseline/change-impact-analyzer.d.ts +98 -0
- package/dist/baseline/change-impact-analyzer.js +592 -0
- package/dist/baseline/comparator.d.ts +64 -0
- package/dist/baseline/comparator.js +916 -0
- package/dist/baseline/confidence.d.ts +55 -0
- package/dist/baseline/confidence.js +122 -0
- package/dist/baseline/converter.d.ts +61 -0
- package/dist/baseline/converter.js +585 -0
- package/dist/baseline/dependency-analyzer.d.ts +89 -0
- package/dist/baseline/dependency-analyzer.js +567 -0
- package/dist/baseline/deprecation-tracker.d.ts +133 -0
- package/dist/baseline/deprecation-tracker.js +322 -0
- package/dist/baseline/diff.d.ts +55 -0
- package/dist/baseline/diff.js +1584 -0
- package/dist/baseline/documentation-scorer.d.ts +205 -0
- package/dist/baseline/documentation-scorer.js +466 -0
- package/dist/baseline/embeddings.d.ts +118 -0
- package/dist/baseline/embeddings.js +251 -0
- package/dist/baseline/error-analyzer.d.ts +198 -0
- package/dist/baseline/error-analyzer.js +721 -0
- package/dist/baseline/evaluation/evaluator.d.ts +42 -0
- package/dist/baseline/evaluation/evaluator.js +323 -0
- package/dist/baseline/evaluation/expanded-dataset.d.ts +45 -0
- package/dist/baseline/evaluation/expanded-dataset.js +1164 -0
- package/dist/baseline/evaluation/golden-dataset.d.ts +58 -0
- package/dist/baseline/evaluation/golden-dataset.js +717 -0
- package/dist/baseline/evaluation/index.d.ts +15 -0
- package/dist/baseline/evaluation/index.js +15 -0
- package/dist/baseline/evaluation/types.d.ts +186 -0
- package/dist/baseline/evaluation/types.js +8 -0
- package/dist/baseline/external-dependency-detector.d.ts +181 -0
- package/dist/baseline/external-dependency-detector.js +524 -0
- package/dist/baseline/golden-output.d.ts +162 -0
- package/dist/baseline/golden-output.js +636 -0
- package/dist/baseline/health-scorer.d.ts +174 -0
- package/dist/baseline/health-scorer.js +451 -0
- package/dist/baseline/incremental-checker.d.ts +97 -0
- package/dist/baseline/incremental-checker.js +174 -0
- package/dist/baseline/index.d.ts +31 -0
- package/dist/baseline/index.js +42 -0
- package/dist/baseline/migration-generator.d.ts +137 -0
- package/dist/baseline/migration-generator.js +554 -0
- package/dist/baseline/migrations.d.ts +60 -0
- package/dist/baseline/migrations.js +197 -0
- package/dist/baseline/performance-tracker.d.ts +214 -0
- package/dist/baseline/performance-tracker.js +577 -0
- package/dist/baseline/pr-comment-generator.d.ts +117 -0
- package/dist/baseline/pr-comment-generator.js +546 -0
- package/dist/baseline/response-fingerprint.d.ts +127 -0
- package/dist/baseline/response-fingerprint.js +728 -0
- package/dist/baseline/response-schema-tracker.d.ts +129 -0
- package/dist/baseline/response-schema-tracker.js +420 -0
- package/dist/baseline/risk-scorer.d.ts +54 -0
- package/dist/baseline/risk-scorer.js +434 -0
- package/dist/baseline/saver.d.ts +89 -0
- package/dist/baseline/saver.js +554 -0
- package/dist/baseline/scenario-generator.d.ts +151 -0
- package/dist/baseline/scenario-generator.js +905 -0
- package/dist/baseline/schema-compare.d.ts +86 -0
- package/dist/baseline/schema-compare.js +557 -0
- package/dist/baseline/schema-evolution.d.ts +189 -0
- package/dist/baseline/schema-evolution.js +467 -0
- package/dist/baseline/semantic.d.ts +203 -0
- package/dist/baseline/semantic.js +908 -0
- package/dist/baseline/synonyms.d.ts +60 -0
- package/dist/baseline/synonyms.js +386 -0
- package/dist/baseline/telemetry.d.ts +165 -0
- package/dist/baseline/telemetry.js +294 -0
- package/dist/baseline/test-pruner.d.ts +120 -0
- package/dist/baseline/test-pruner.js +387 -0
- package/dist/baseline/types.d.ts +449 -0
- package/dist/baseline/types.js +5 -0
- package/dist/baseline/version.d.ts +138 -0
- package/dist/baseline/version.js +206 -0
- package/dist/cache/index.d.ts +5 -0
- package/dist/cache/index.js +5 -0
- package/dist/cache/response-cache.d.ts +151 -0
- package/dist/cache/response-cache.js +287 -0
- package/dist/ci/index.d.ts +60 -0
- package/dist/ci/index.js +342 -0
- package/dist/cli/commands/auth.d.ts +12 -0
- package/dist/cli/commands/auth.js +352 -0
- package/dist/cli/commands/badge.d.ts +3 -0
- package/dist/cli/commands/badge.js +74 -0
- package/dist/cli/commands/baseline-accept.d.ts +15 -0
- package/dist/cli/commands/baseline-accept.js +178 -0
- package/dist/cli/commands/baseline-migrate.d.ts +12 -0
- package/dist/cli/commands/baseline-migrate.js +164 -0
- package/dist/cli/commands/baseline.d.ts +14 -0
- package/dist/cli/commands/baseline.js +449 -0
- package/dist/cli/commands/beta.d.ts +10 -0
- package/dist/cli/commands/beta.js +231 -0
- package/dist/cli/commands/check.d.ts +11 -0
- package/dist/cli/commands/check.js +820 -0
- package/dist/cli/commands/cloud/badge.d.ts +3 -0
- package/dist/cli/commands/cloud/badge.js +74 -0
- package/dist/cli/commands/cloud/diff.d.ts +6 -0
- package/dist/cli/commands/cloud/diff.js +79 -0
- package/dist/cli/commands/cloud/history.d.ts +6 -0
- package/dist/cli/commands/cloud/history.js +102 -0
- package/dist/cli/commands/cloud/link.d.ts +9 -0
- package/dist/cli/commands/cloud/link.js +119 -0
- package/dist/cli/commands/cloud/login.d.ts +7 -0
- package/dist/cli/commands/cloud/login.js +499 -0
- package/dist/cli/commands/cloud/projects.d.ts +6 -0
- package/dist/cli/commands/cloud/projects.js +44 -0
- package/dist/cli/commands/cloud/shared.d.ts +7 -0
- package/dist/cli/commands/cloud/shared.js +42 -0
- package/dist/cli/commands/cloud/teams.d.ts +8 -0
- package/dist/cli/commands/cloud/teams.js +169 -0
- package/dist/cli/commands/cloud/upload.d.ts +8 -0
- package/dist/cli/commands/cloud/upload.js +181 -0
- package/dist/cli/commands/contract.d.ts +11 -0
- package/dist/cli/commands/contract.js +280 -0
- package/dist/cli/commands/discover.d.ts +3 -0
- package/dist/cli/commands/discover.js +82 -0
- package/dist/cli/commands/eval.d.ts +9 -0
- package/dist/cli/commands/eval.js +187 -0
- package/dist/cli/commands/explore.d.ts +11 -0
- package/dist/cli/commands/explore.js +437 -0
- package/dist/cli/commands/feedback.d.ts +9 -0
- package/dist/cli/commands/feedback.js +174 -0
- package/dist/cli/commands/golden.d.ts +12 -0
- package/dist/cli/commands/golden.js +407 -0
- package/dist/cli/commands/history.d.ts +10 -0
- package/dist/cli/commands/history.js +202 -0
- package/dist/cli/commands/init.d.ts +9 -0
- package/dist/cli/commands/init.js +219 -0
- package/dist/cli/commands/interview.d.ts +3 -0
- package/dist/cli/commands/interview.js +903 -0
- package/dist/cli/commands/link.d.ts +10 -0
- package/dist/cli/commands/link.js +169 -0
- package/dist/cli/commands/login.d.ts +7 -0
- package/dist/cli/commands/login.js +499 -0
- package/dist/cli/commands/preset.d.ts +33 -0
- package/dist/cli/commands/preset.js +297 -0
- package/dist/cli/commands/profile.d.ts +33 -0
- package/dist/cli/commands/profile.js +286 -0
- package/dist/cli/commands/registry.d.ts +11 -0
- package/dist/cli/commands/registry.js +146 -0
- package/dist/cli/commands/shared.d.ts +79 -0
- package/dist/cli/commands/shared.js +196 -0
- package/dist/cli/commands/teams.d.ts +8 -0
- package/dist/cli/commands/teams.js +169 -0
- package/dist/cli/commands/test.d.ts +9 -0
- package/dist/cli/commands/test.js +500 -0
- package/dist/cli/commands/upload.d.ts +8 -0
- package/dist/cli/commands/upload.js +223 -0
- package/dist/cli/commands/validate-config.d.ts +6 -0
- package/dist/cli/commands/validate-config.js +35 -0
- package/dist/cli/commands/verify.d.ts +11 -0
- package/dist/cli/commands/verify.js +283 -0
- package/dist/cli/commands/watch.d.ts +12 -0
- package/dist/cli/commands/watch.js +253 -0
- package/dist/cli/index.d.ts +3 -0
- package/dist/cli/index.js +178 -0
- package/dist/cli/interactive.d.ts +47 -0
- package/dist/cli/interactive.js +216 -0
- package/dist/cli/output/terminal-reporter.d.ts +19 -0
- package/dist/cli/output/terminal-reporter.js +104 -0
- package/dist/cli/output.d.ts +226 -0
- package/dist/cli/output.js +438 -0
- package/dist/cli/utils/env.d.ts +5 -0
- package/dist/cli/utils/env.js +14 -0
- package/dist/cli/utils/progress.d.ts +59 -0
- package/dist/cli/utils/progress.js +206 -0
- package/dist/cli/utils/server-context.d.ts +10 -0
- package/dist/cli/utils/server-context.js +36 -0
- package/dist/cloud/auth.d.ts +144 -0
- package/dist/cloud/auth.js +374 -0
- package/dist/cloud/client.d.ts +24 -0
- package/dist/cloud/client.js +65 -0
- package/dist/cloud/http-client.d.ts +38 -0
- package/dist/cloud/http-client.js +215 -0
- package/dist/cloud/index.d.ts +23 -0
- package/dist/cloud/index.js +25 -0
- package/dist/cloud/mock-client.d.ts +107 -0
- package/dist/cloud/mock-client.js +545 -0
- package/dist/cloud/types.d.ts +515 -0
- package/dist/cloud/types.js +15 -0
- package/dist/config/defaults.d.ts +160 -0
- package/dist/config/defaults.js +169 -0
- package/dist/config/loader.d.ts +24 -0
- package/dist/config/loader.js +122 -0
- package/dist/config/template.d.ts +42 -0
- package/dist/config/template.js +647 -0
- package/dist/config/validator.d.ts +2112 -0
- package/dist/config/validator.js +658 -0
- package/dist/constants/cloud.d.ts +107 -0
- package/dist/constants/cloud.js +110 -0
- package/dist/constants/core.d.ts +521 -0
- package/dist/constants/core.js +556 -0
- package/dist/constants/testing.d.ts +1283 -0
- package/dist/constants/testing.js +1568 -0
- package/dist/constants.d.ts +10 -0
- package/dist/constants.js +10 -0
- package/dist/contract/index.d.ts +6 -0
- package/dist/contract/index.js +5 -0
- package/dist/contract/validator.d.ts +177 -0
- package/dist/contract/validator.js +574 -0
- package/dist/cost/index.d.ts +6 -0
- package/dist/cost/index.js +5 -0
- package/dist/cost/tracker.d.ts +134 -0
- package/dist/cost/tracker.js +313 -0
- package/dist/discovery/discovery.d.ts +16 -0
- package/dist/discovery/discovery.js +173 -0
- package/dist/discovery/types.d.ts +51 -0
- package/dist/discovery/types.js +2 -0
- package/dist/docs/agents.d.ts +3 -0
- package/dist/docs/agents.js +995 -0
- package/dist/docs/contract.d.ts +51 -0
- package/dist/docs/contract.js +1681 -0
- package/dist/docs/generator.d.ts +4 -0
- package/dist/docs/generator.js +4 -0
- package/dist/docs/html-reporter.d.ts +9 -0
- package/dist/docs/html-reporter.js +757 -0
- package/dist/docs/index.d.ts +10 -0
- package/dist/docs/index.js +11 -0
- package/dist/docs/junit-reporter.d.ts +18 -0
- package/dist/docs/junit-reporter.js +210 -0
- package/dist/docs/report.d.ts +14 -0
- package/dist/docs/report.js +44 -0
- package/dist/docs/sarif-reporter.d.ts +19 -0
- package/dist/docs/sarif-reporter.js +335 -0
- package/dist/docs/shared.d.ts +35 -0
- package/dist/docs/shared.js +162 -0
- package/dist/docs/templates.d.ts +12 -0
- package/dist/docs/templates.js +76 -0
- package/dist/errors/index.d.ts +6 -0
- package/dist/errors/index.js +6 -0
- package/dist/errors/retry.d.ts +92 -0
- package/dist/errors/retry.js +323 -0
- package/dist/errors/types.d.ts +321 -0
- package/dist/errors/types.js +584 -0
- package/dist/index.d.ts +32 -0
- package/dist/index.js +32 -0
- package/dist/interview/dependency-resolver.d.ts +11 -0
- package/dist/interview/dependency-resolver.js +32 -0
- package/dist/interview/interviewer.d.ts +232 -0
- package/dist/interview/interviewer.js +1939 -0
- package/dist/interview/mock-response-generator.d.ts +7 -0
- package/dist/interview/mock-response-generator.js +102 -0
- package/dist/interview/orchestrator.d.ts +237 -0
- package/dist/interview/orchestrator.js +1296 -0
- package/dist/interview/rate-limiter.d.ts +15 -0
- package/dist/interview/rate-limiter.js +55 -0
- package/dist/interview/response-validator.d.ts +10 -0
- package/dist/interview/response-validator.js +132 -0
- package/dist/interview/schema-inferrer.d.ts +8 -0
- package/dist/interview/schema-inferrer.js +71 -0
- package/dist/interview/schema-test-generator.d.ts +71 -0
- package/dist/interview/schema-test-generator.js +834 -0
- package/dist/interview/smart-value-generator.d.ts +155 -0
- package/dist/interview/smart-value-generator.js +554 -0
- package/dist/interview/stateful-test-runner.d.ts +19 -0
- package/dist/interview/stateful-test-runner.js +106 -0
- package/dist/interview/types.d.ts +561 -0
- package/dist/interview/types.js +2 -0
- package/dist/llm/anthropic.d.ts +41 -0
- package/dist/llm/anthropic.js +355 -0
- package/dist/llm/client.d.ts +123 -0
- package/dist/llm/client.js +42 -0
- package/dist/llm/factory.d.ts +38 -0
- package/dist/llm/factory.js +145 -0
- package/dist/llm/fallback.d.ts +140 -0
- package/dist/llm/fallback.js +379 -0
- package/dist/llm/index.d.ts +18 -0
- package/dist/llm/index.js +15 -0
- package/dist/llm/ollama.d.ts +37 -0
- package/dist/llm/ollama.js +330 -0
- package/dist/llm/openai.d.ts +25 -0
- package/dist/llm/openai.js +320 -0
- package/dist/llm/token-budget.d.ts +161 -0
- package/dist/llm/token-budget.js +395 -0
- package/dist/logging/logger.d.ts +70 -0
- package/dist/logging/logger.js +130 -0
- package/dist/metrics/collector.d.ts +106 -0
- package/dist/metrics/collector.js +547 -0
- package/dist/metrics/index.d.ts +7 -0
- package/dist/metrics/index.js +7 -0
- package/dist/metrics/prometheus.d.ts +20 -0
- package/dist/metrics/prometheus.js +241 -0
- package/dist/metrics/types.d.ts +209 -0
- package/dist/metrics/types.js +5 -0
- package/dist/persona/builtins.d.ts +54 -0
- package/dist/persona/builtins.js +219 -0
- package/dist/persona/index.d.ts +8 -0
- package/dist/persona/index.js +8 -0
- package/dist/persona/loader.d.ts +30 -0
- package/dist/persona/loader.js +190 -0
- package/dist/persona/types.d.ts +144 -0
- package/dist/persona/types.js +5 -0
- package/dist/persona/validation.d.ts +94 -0
- package/dist/persona/validation.js +332 -0
- package/dist/prompts/index.d.ts +5 -0
- package/dist/prompts/index.js +5 -0
- package/dist/prompts/templates.d.ts +180 -0
- package/dist/prompts/templates.js +431 -0
- package/dist/registry/client.d.ts +49 -0
- package/dist/registry/client.js +191 -0
- package/dist/registry/index.d.ts +7 -0
- package/dist/registry/index.js +6 -0
- package/dist/registry/types.d.ts +140 -0
- package/dist/registry/types.js +6 -0
- package/dist/scenarios/evaluator.d.ts +43 -0
- package/dist/scenarios/evaluator.js +206 -0
- package/dist/scenarios/index.d.ts +10 -0
- package/dist/scenarios/index.js +9 -0
- package/dist/scenarios/loader.d.ts +20 -0
- package/dist/scenarios/loader.js +285 -0
- package/dist/scenarios/types.d.ts +153 -0
- package/dist/scenarios/types.js +8 -0
- package/dist/security/index.d.ts +17 -0
- package/dist/security/index.js +18 -0
- package/dist/security/payloads.d.ts +61 -0
- package/dist/security/payloads.js +268 -0
- package/dist/security/security-tester.d.ts +42 -0
- package/dist/security/security-tester.js +582 -0
- package/dist/security/types.d.ts +166 -0
- package/dist/security/types.js +8 -0
- package/dist/transport/base-transport.d.ts +59 -0
- package/dist/transport/base-transport.js +38 -0
- package/dist/transport/http-transport.d.ts +67 -0
- package/dist/transport/http-transport.js +238 -0
- package/dist/transport/mcp-client.d.ts +141 -0
- package/dist/transport/mcp-client.js +496 -0
- package/dist/transport/sse-transport.d.ts +88 -0
- package/dist/transport/sse-transport.js +316 -0
- package/dist/transport/stdio-transport.d.ts +43 -0
- package/dist/transport/stdio-transport.js +238 -0
- package/dist/transport/types.d.ts +125 -0
- package/dist/transport/types.js +16 -0
- package/dist/utils/concurrency.d.ts +123 -0
- package/dist/utils/concurrency.js +213 -0
- package/dist/utils/formatters.d.ts +16 -0
- package/dist/utils/formatters.js +37 -0
- package/dist/utils/index.d.ts +8 -0
- package/dist/utils/index.js +8 -0
- package/dist/utils/jsonpath.d.ts +87 -0
- package/dist/utils/jsonpath.js +326 -0
- package/dist/utils/markdown.d.ts +113 -0
- package/dist/utils/markdown.js +265 -0
- package/dist/utils/network.d.ts +14 -0
- package/dist/utils/network.js +17 -0
- package/dist/utils/sanitize.d.ts +92 -0
- package/dist/utils/sanitize.js +191 -0
- package/dist/utils/semantic.d.ts +194 -0
- package/dist/utils/semantic.js +1051 -0
- package/dist/utils/smart-truncate.d.ts +94 -0
- package/dist/utils/smart-truncate.js +361 -0
- package/dist/utils/timeout.d.ts +153 -0
- package/dist/utils/timeout.js +205 -0
- package/dist/utils/yaml-parser.d.ts +58 -0
- package/dist/utils/yaml-parser.js +86 -0
- package/dist/validation/index.d.ts +32 -0
- package/dist/validation/index.js +32 -0
- package/dist/validation/semantic-test-generator.d.ts +50 -0
- package/dist/validation/semantic-test-generator.js +176 -0
- package/dist/validation/semantic-types.d.ts +66 -0
- package/dist/validation/semantic-types.js +94 -0
- package/dist/validation/semantic-validator.d.ts +38 -0
- package/dist/validation/semantic-validator.js +340 -0
- package/dist/verification/index.d.ts +6 -0
- package/dist/verification/index.js +5 -0
- package/dist/verification/types.d.ts +133 -0
- package/dist/verification/types.js +5 -0
- package/dist/verification/verifier.d.ts +30 -0
- package/dist/verification/verifier.js +309 -0
- package/dist/version.d.ts +19 -0
- package/dist/version.js +48 -0
- package/dist/workflow/auto-generator.d.ts +27 -0
- package/dist/workflow/auto-generator.js +513 -0
- package/dist/workflow/discovery.d.ts +40 -0
- package/dist/workflow/discovery.js +195 -0
- package/dist/workflow/executor.d.ts +82 -0
- package/dist/workflow/executor.js +611 -0
- package/dist/workflow/index.d.ts +10 -0
- package/dist/workflow/index.js +10 -0
- package/dist/workflow/loader.d.ts +24 -0
- package/dist/workflow/loader.js +194 -0
- package/dist/workflow/state-tracker.d.ts +98 -0
- package/dist/workflow/state-tracker.js +424 -0
- package/dist/workflow/types.d.ts +337 -0
- package/dist/workflow/types.js +5 -0
- package/package.json +94 -0
- package/schemas/bellwether-check.schema.json +651 -0
|
@@ -0,0 +1,606 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* AI Agent Compatibility Scoring.
|
|
3
|
+
*
|
|
4
|
+
* Evaluates how well an MCP server is designed for AI agent consumption.
|
|
5
|
+
* Scores tools based on description clarity, parameter naming, error quality,
|
|
6
|
+
* example completeness, workflow documentation, and response predictability.
|
|
7
|
+
*/
|
|
8
|
+
import { AI_COMPATIBILITY } from '../constants.js';
|
|
9
|
+
/**
 * Build the AI compatibility report for a set of tools.
 *
 * Runs each of the six dimension scorers over `inputs`, adds up their
 * weighted scores and rounds the sum to get the overall score, then derives
 * the grade, the per-tool scores and the recommendations.
 * An empty `inputs` array returns the result of createEmptyScore().
 */
export function calculateAICompatibilityScore(inputs) {
    if (inputs.length === 0) {
        return createEmptyScore();
    }
    // One entry per scoring dimension, evaluated in a fixed order.
    const breakdown = {
        descriptionClarity: scoreDescriptionClarity(inputs),
        parameterNaming: scoreParameterNaming(inputs),
        errorMessageQuality: scoreErrorMessageQuality(inputs),
        exampleCompleteness: scoreExampleCompleteness(inputs),
        workflowDocumentation: scoreWorkflowDocumentation(inputs),
        responsePredictability: scoreResponsePredictability(inputs),
    };
    // Overall score: rounded left-to-right sum of the weighted dimension scores.
    const overall = Math.round(Object.values(breakdown)
        .map(component => component.weightedScore)
        .reduce((sum, weighted) => sum + weighted));
    const grade = calculateGrade(overall);
    const toolScores = inputs.map(input => calculateToolScore(input));
    // The recommendation generator gets its own copy of the breakdown object.
    const recommendations = generateRecommendations({ ...breakdown }, inputs, toolScores);
    return { overall, grade, breakdown, recommendations, toolScores };
}
|
|
58
|
+
/**
|
|
59
|
+
* Score description clarity across all tools.
|
|
60
|
+
* Checks for: minimum length, action verbs, purpose explanation, input/output mentions.
|
|
61
|
+
*/
|
|
62
|
+
function scoreDescriptionClarity(inputs) {
|
|
63
|
+
const weight = AI_COMPATIBILITY.WEIGHTS.descriptionClarity;
|
|
64
|
+
const notes = [];
|
|
65
|
+
let totalScore = 0;
|
|
66
|
+
for (const { tool } of inputs) {
|
|
67
|
+
let toolScore = 0;
|
|
68
|
+
const description = tool.description || '';
|
|
69
|
+
// Check minimum length
|
|
70
|
+
if (description.length >= AI_COMPATIBILITY.DESCRIPTION.GOOD_LENGTH) {
|
|
71
|
+
toolScore += AI_COMPATIBILITY.DESCRIPTION.POINTS.GOOD_LENGTH;
|
|
72
|
+
}
|
|
73
|
+
else if (description.length >= AI_COMPATIBILITY.DESCRIPTION.MIN_LENGTH) {
|
|
74
|
+
toolScore += AI_COMPATIBILITY.DESCRIPTION.POINTS.MIN_LENGTH;
|
|
75
|
+
}
|
|
76
|
+
// Check for action verb at start
|
|
77
|
+
if (AI_COMPATIBILITY.DESCRIPTION.ACTION_VERB_PATTERN.test(description)) {
|
|
78
|
+
toolScore += AI_COMPATIBILITY.DESCRIPTION.POINTS.ACTION_VERB;
|
|
79
|
+
}
|
|
80
|
+
// Check for purpose explanation
|
|
81
|
+
if (AI_COMPATIBILITY.DESCRIPTION.PURPOSE_PATTERN.test(description)) {
|
|
82
|
+
toolScore += AI_COMPATIBILITY.DESCRIPTION.POINTS.PURPOSE;
|
|
83
|
+
}
|
|
84
|
+
// Check for input/output mentions
|
|
85
|
+
if (AI_COMPATIBILITY.DESCRIPTION.IO_PATTERN.test(description)) {
|
|
86
|
+
toolScore += AI_COMPATIBILITY.DESCRIPTION.POINTS.IO_MENTION;
|
|
87
|
+
}
|
|
88
|
+
totalScore += Math.min(toolScore, 100);
|
|
89
|
+
}
|
|
90
|
+
const score = Math.round(totalScore / inputs.length);
|
|
91
|
+
// Add notes based on common issues
|
|
92
|
+
const shortDescriptions = inputs.filter(i => (i.tool.description || '').length < AI_COMPATIBILITY.DESCRIPTION.MIN_LENGTH);
|
|
93
|
+
if (shortDescriptions.length > 0) {
|
|
94
|
+
notes.push(`${shortDescriptions.length} tool(s) have short descriptions (<${AI_COMPATIBILITY.DESCRIPTION.MIN_LENGTH} chars)`);
|
|
95
|
+
}
|
|
96
|
+
const missingActionVerbs = inputs.filter(i => !AI_COMPATIBILITY.DESCRIPTION.ACTION_VERB_PATTERN.test(i.tool.description || ''));
|
|
97
|
+
if (missingActionVerbs.length > inputs.length / 2) {
|
|
98
|
+
notes.push('Many tools lack action verbs in descriptions');
|
|
99
|
+
}
|
|
100
|
+
if (notes.length === 0 && score >= 80) {
|
|
101
|
+
notes.push('Good description clarity across tools');
|
|
102
|
+
}
|
|
103
|
+
return {
|
|
104
|
+
score,
|
|
105
|
+
weight,
|
|
106
|
+
weightedScore: score * weight,
|
|
107
|
+
notes,
|
|
108
|
+
};
|
|
109
|
+
}
|
|
110
|
+
/**
 * Score how well input parameters are named.
 *
 * Every property key of every tool's `inputSchema.properties` is counted.
 * A parameter counts as good when its lower-cased name is not listed in
 * BAD_NAMES, it is at least MIN_NAME_LENGTH characters long, and it is
 * written in snake_case or camelCase. The score is the rounded percentage of
 * good parameters, or 100 when no parameters were found at all.
 */
function scoreParameterNaming(inputs) {
    const weight = AI_COMPATIBILITY.WEIGHTS.parameterNaming;
    const snakeCase = /^[a-z][a-z0-9]*(_[a-z0-9]+)*$/;
    const camelCase = /^[a-z][a-zA-Z0-9]*$/;
    const genericNames = [];
    let seen = 0;
    let wellNamed = 0;
    for (const { tool } of inputs) {
        const schema = tool.inputSchema;
        // Tools without a property map contribute no parameters.
        if (!schema?.properties) {
            continue;
        }
        for (const name of Object.keys(schema.properties)) {
            seen += 1;
            if (AI_COMPATIBILITY.PARAMETER.BAD_NAMES.includes(name.toLowerCase())) {
                // Generic names are reported back to the user by qualified name.
                genericNames.push(`${tool.name}.${name}`);
            }
            else if (name.length >= AI_COMPATIBILITY.PARAMETER.MIN_NAME_LENGTH &&
                (snakeCase.test(name) || camelCase.test(name))) {
                wellNamed += 1;
            }
        }
    }
    let score = 100;
    if (seen > 0) {
        score = Math.round((wellNamed / seen) * 100);
    }
    const notes = [];
    if (genericNames.length > 0) {
        // List at most three offenders and summarise the rest.
        const listed = genericNames.slice(0, 3).join(', ');
        const remainder = genericNames.length > 3 ? ` and ${genericNames.length - 3} more` : '';
        notes.push(`Generic names found: ${listed}${remainder}`);
    }
    if (score >= 90) {
        notes.push('Excellent parameter naming conventions');
    }
    else if (score < 60) {
        notes.push('Many parameters have non-descriptive names');
    }
    return { score, weight, weightedScore: score * weight, notes };
}
|
|
163
|
+
/**
 * Score the quality of observed error messages.
 *
 * For each entry in each input's `errorPatterns`, the `example` text is
 * checked against three indicators: minimum length, ACTIONABLE_PATTERN and
 * REMEDIATION_PATTERN. A message with two or more indicators is good; a
 * non-empty message with none is recorded as poor. The score is the rounded
 * percentage of good messages, or ERROR.DEFAULT_SCORE when nothing was
 * observed.
 */
function scoreErrorMessageQuality(inputs) {
    const weight = AI_COMPATIBILITY.WEIGHTS.errorMessageQuality;
    const poorErrors = [];
    let totalErrors = 0;
    let goodErrors = 0;
    for (const { errorPatterns } of inputs) {
        if (!errorPatterns) {
            continue;
        }
        for (const pattern of errorPatterns) {
            totalErrors += 1;
            const message = pattern.example || '';
            // Count how many of the three quality indicators the message meets.
            const indicators = [
                message.length >= AI_COMPATIBILITY.ERROR.MIN_MESSAGE_LENGTH,
                AI_COMPATIBILITY.ERROR.ACTIONABLE_PATTERN.test(message),
                AI_COMPATIBILITY.ERROR.REMEDIATION_PATTERN.test(message),
            ].filter(Boolean).length;
            if (indicators >= 2) {
                goodErrors += 1;
            }
            else if (indicators === 0 && message.length > 0) {
                poorErrors.push(message.slice(0, 50));
            }
        }
    }
    const observed = totalErrors > 0;
    let score;
    if (observed) {
        score = Math.round((goodErrors / totalErrors) * 100);
    }
    else {
        // Nothing observed: fall back to the configured neutral score.
        score = AI_COMPATIBILITY.ERROR.DEFAULT_SCORE;
    }
    const notes = [];
    if (!observed) {
        notes.push('No error patterns observed (run with more test cases)');
    }
    else {
        if (poorErrors.length > 0) {
            notes.push(`${poorErrors.length} error message(s) lack actionable guidance`);
        }
        if (score >= 80) {
            notes.push('Error messages provide good guidance');
        }
    }
    return { score, weight, weightedScore: score * weight, notes };
}
|
|
221
|
+
/**
 * Score how completely tools are covered by captured examples.
 *
 * A tool counts as having examples when its `fingerprint.responseFingerprint`
 * is present. An example counts as truncated when its JSON serialisation
 * contains '...' or 'truncated'. The score blends coverage (share of tools
 * with examples) and quality (share of examples that are not truncated) using
 * EXAMPLE.COVERAGE_WEIGHT and EXAMPLE.QUALITY_WEIGHT.
 */
function scoreExampleCompleteness(inputs) {
    const weight = AI_COMPATIBILITY.WEIGHTS.exampleCompleteness;
    // Collect the response fingerprints that exist; each one is an example.
    const captured = [];
    for (const { fingerprint } of inputs) {
        const responseFingerprint = fingerprint?.responseFingerprint;
        if (responseFingerprint) {
            captured.push(responseFingerprint);
        }
    }
    const toolsWithExamples = captured.length;
    const truncatedExamples = captured.filter((example) => {
        const serialized = JSON.stringify(example);
        return serialized.includes('...') || serialized.includes('truncated');
    }).length;
    let coverage = 0;
    if (inputs.length > 0) {
        coverage = toolsWithExamples / inputs.length;
    }
    let truncationPenalty = 0;
    if (toolsWithExamples > 0) {
        truncationPenalty = truncatedExamples / toolsWithExamples;
    }
    // Weighted blend of coverage and non-truncation, scaled to 0-100.
    const score = Math.round((coverage * AI_COMPATIBILITY.EXAMPLE.COVERAGE_WEIGHT +
        (1 - truncationPenalty) * AI_COMPATIBILITY.EXAMPLE.QUALITY_WEIGHT) * 100);
    const notes = [];
    if (truncatedExamples > 0) {
        notes.push(`${truncatedExamples} tool(s) have truncated examples`);
    }
    if (coverage < 0.5) {
        notes.push('Less than half of tools have captured examples');
    }
    else if (coverage === 1 && truncatedExamples === 0) {
        notes.push('Full example coverage with no truncation');
    }
    return { score, weight, weightedScore: score * weight, notes };
}
|
|
264
|
+
/**
 * Score how well tool descriptions document workflows.
 *
 * Each lower-cased description is matched against WORKFLOW.SEQUENCE_PATTERN
 * and WORKFLOW.DEPENDENCY_PATTERN. The score starts at 50 and gains up to 25
 * points for the share of tools with sequence hints and up to 25 for the
 * share with dependency hints, capped at 100.
 */
function scoreWorkflowDocumentation(inputs) {
    const weight = AI_COMPATIBILITY.WEIGHTS.workflowDocumentation;
    let toolsWithSequenceHints = 0;
    let toolsWithDependencyHints = 0;
    for (const { tool } of inputs) {
        const text = (tool.description || '').toLowerCase();
        toolsWithSequenceHints += AI_COMPATIBILITY.WORKFLOW.SEQUENCE_PATTERN.test(text) ? 1 : 0;
        toolsWithDependencyHints += AI_COMPATIBILITY.WORKFLOW.DEPENDENCY_PATTERN.test(text) ? 1 : 0;
    }
    // Ratios stay at zero for an empty tool list.
    let sequenceRatio = 0;
    let dependencyRatio = 0;
    if (inputs.length > 0) {
        sequenceRatio = toolsWithSequenceHints / inputs.length;
        dependencyRatio = toolsWithDependencyHints / inputs.length;
    }
    // Baseline of 50 so that independent tools are not penalised heavily.
    const score = Math.round(Math.min(100, 50 + sequenceRatio * 25 + dependencyRatio * 25));
    const notes = [];
    if (toolsWithSequenceHints > 0) {
        notes.push(`${toolsWithSequenceHints} tool(s) describe execution sequences`);
    }
    if (toolsWithDependencyHints > 0) {
        notes.push(`${toolsWithDependencyHints} tool(s) mention dependencies`);
    }
    if (score < 60 && inputs.length > 3) {
        notes.push('Consider adding workflow guidance to descriptions');
    }
    return { score, weight, weightedScore: score * weight, notes };
}
|
|
306
|
+
/**
 * Score how predictable tool responses are.
 *
 * A tool counts as unstable only when it has schema-evolution data and that
 * data reports `isStable` as falsy; tools without evolution data are assumed
 * stable. The score is the rounded percentage of stable tools, or
 * AI_COMPATIBILITY.RESPONSE.DEFAULT_SCORE when there are no inputs.
 *
 * @param inputs - Entries providing `tool` and optional `schemaEvolution`.
 * @returns Component result with `score`, `weight`, `weightedScore`, `notes`.
 */
function scoreResponsePredictability(inputs) {
    const weight = AI_COMPATIBILITY.WEIGHTS.responsePredictability;
    const unstableTools = inputs
        .filter(({ schemaEvolution }) => Boolean(schemaEvolution) && !schemaEvolution.isStable)
        .map(({ tool }) => tool.name);
    const toolCount = inputs.length;
    const stableCount = toolCount - unstableTools.length;
    let score;
    if (toolCount > 0) {
        score = Math.round((stableCount / toolCount) * 100);
    }
    else {
        score = AI_COMPATIBILITY.RESPONSE.DEFAULT_SCORE;
    }
    const notes = [];
    if (unstableTools.length > 0) {
        // Name at most three offenders and summarize the remainder.
        const hiddenCount = unstableTools.length - 3;
        const suffix = hiddenCount > 0 ? ` and ${hiddenCount} more` : '';
        notes.push(`Unstable responses: ${unstableTools.slice(0, 3).join(', ')}${suffix}`);
    }
    if (score >= 90) {
        notes.push('Highly predictable response structures');
    }
    return { score, weight, weightedScore: score * weight, notes };
}
|
|
347
|
+
/**
 * Calculate the compatibility score for a single tool.
 *
 * Starts from 100 and subtracts a fixed penalty for each problem found:
 * a short/missing description (15) or one lacking an action verb (5), a
 * generic parameter name (10, at most once per tool), error messages that
 * mostly lack guidance (10), and an unstable response structure (15).
 *
 * @param input - Object with `tool` and optional `errorPatterns` / `schemaEvolution`.
 * @returns `{ toolName, score, issues }`, with `score` floored at 0.
 */
function calculateToolScore(input) {
    const { tool, errorPatterns, schemaEvolution } = input;
    // Each finding is recorded as [points deducted, issue text].
    const penalties = [];
    const description = tool.description || '';
    if (description.length < AI_COMPATIBILITY.DESCRIPTION.MIN_LENGTH) {
        penalties.push([15, 'Short or missing description']);
    }
    else if (!AI_COMPATIBILITY.DESCRIPTION.ACTION_VERB_PATTERN.test(description)) {
        penalties.push([5, 'Description lacks action verb']);
    }
    const properties = tool.inputSchema?.properties;
    if (properties) {
        // Only the first generic name is reported, so the penalty applies once per tool.
        const genericName = Object.keys(properties).find((name) => AI_COMPATIBILITY.PARAMETER.BAD_NAMES.includes(name.toLowerCase()));
        if (genericName !== undefined) {
            penalties.push([10, `Generic parameter name: ${genericName}`]);
        }
    }
    if (errorPatterns && errorPatterns.length > 0) {
        // Helpful = not too short AND matches the actionable or the remediation pattern.
        const isHelpful = (message) => !(message.length < AI_COMPATIBILITY.ERROR.MIN_MESSAGE_LENGTH) &&
            (AI_COMPATIBILITY.ERROR.ACTIONABLE_PATTERN.test(message) ||
                AI_COMPATIBILITY.ERROR.REMEDIATION_PATTERN.test(message));
        const unhelpfulCount = errorPatterns.filter((pattern) => !isHelpful(pattern.example || '')).length;
        // Penalize only when strictly more than half of the error patterns are unhelpful.
        if (unhelpfulCount * 2 > errorPatterns.length) {
            penalties.push([10, 'Error messages lack guidance']);
        }
    }
    if (schemaEvolution && !schemaEvolution.isStable) {
        penalties.push([15, 'Response structure is unstable']);
    }
    let deducted = 0;
    const issues = [];
    for (const [points, issue] of penalties) {
        deducted += points;
        issues.push(issue);
    }
    return {
        toolName: tool.name,
        score: Math.max(0, 100 - deducted),
        issues,
    };
}
|
|
399
|
+
/**
 * Generate actionable recommendations based on scores.
 *
 * Components are visited from lowest to highest score. Any component scoring
 * below AI_COMPATIBILITY.RECOMMENDATION_THRESHOLD yields one recommendation
 * (when `createRecommendation` knows the category), and collection stops once
 * AI_COMPATIBILITY.MAX_RECOMMENDATIONS entries have been gathered.
 *
 * @param breakdown - Map of category name to component result (`score`, `weight`, ...).
 * @param inputs - Original scoring inputs, forwarded to `createRecommendation`.
 * @param toolScores - Per-tool results (`toolName`, `issues`) used to list affected tools.
 * @returns Recommendations ordered by priority, numbered 1..n without gaps.
 */
function generateRecommendations(breakdown, inputs, toolScores) {
    const recommendations = [];
    // Object.entries returns a fresh array, so sorting it in place is safe.
    const weakestFirst = Object.entries(breakdown).sort((a, b) => a[1].score - b[1].score);
    for (const [category, component] of weakestFirst) {
        if (component.score >= AI_COMPATIBILITY.RECOMMENDATION_THRESHOLD) {
            continue;
        }
        const affectedTools = toolScores
            .filter((toolScore) => toolScore.issues.some((issue) => isIssueRelatedToCategory(issue, category)))
            .map((toolScore) => toolScore.toolName);
        // Derive the priority from the recommendations actually emitted so that a
        // category createRecommendation does not recognize (null result) no longer
        // consumes a priority number and leaves a gap in the sequence.
        const recommendation = createRecommendation(category, component, affectedTools, recommendations.length + 1, inputs);
        if (recommendation) {
            recommendations.push(recommendation);
        }
        // Limit recommendations
        if (recommendations.length >= AI_COMPATIBILITY.MAX_RECOMMENDATIONS) {
            break;
        }
    }
    return recommendations;
}
|
|
425
|
+
/**
 * Check if an issue is related to a scoring category.
 *
 * Matching is a case-insensitive substring test of the issue text against a
 * fixed keyword list per category.
 *
 * @param issue - Issue text produced by the per-tool scoring.
 * @param category - Scoring category key (e.g. `descriptionClarity`).
 * @returns True when any keyword of the category occurs in the issue;
 *          false for categories that have no keyword list.
 */
function isIssueRelatedToCategory(issue, category) {
    const categoryKeywords = {
        descriptionClarity: ['description', 'action verb'],
        parameterNaming: ['parameter', 'name', 'generic'],
        errorMessageQuality: ['error', 'message', 'guidance'],
        exampleCompleteness: ['example', 'truncated'],
        workflowDocumentation: ['workflow', 'sequence', 'dependency'],
        responsePredictability: ['response', 'unstable', 'structure'],
    };
    // Look up own keys only: a category such as "toString" or "constructor"
    // would otherwise resolve to an inherited Object.prototype member and
    // crash on `.some`.
    if (!Object.prototype.hasOwnProperty.call(categoryKeywords, category)) {
        return false;
    }
    const normalizedIssue = issue.toLowerCase();
    return categoryKeywords[category].some((keyword) => normalizedIssue.includes(keyword));
}
|
|
440
|
+
/**
 * Create a specific recommendation for a category.
 *
 * `potentialImprovement` is the rounded number of points the component could
 * still gain, scaled by its weight. For `descriptionClarity` the affected
 * tools are those whose description is shorter than
 * AI_COMPATIBILITY.DESCRIPTION.MIN_LENGTH, falling back to the supplied list
 * when none are that short.
 *
 * @param category - Scoring category key.
 * @param component - Component result providing `score` and `weight`.
 * @param affectedTools - Tool names whose issues relate to the category.
 * @param priority - Priority number to store on the recommendation.
 * @param inputs - Scoring inputs (only read for `descriptionClarity`).
 * @returns The recommendation object, or null for an unrecognized category.
 */
function createRecommendation(category, component, affectedTools, priority, inputs) {
    const potentialImprovement = Math.round((100 - component.score) * component.weight);
    let title;
    let description;
    let tools = affectedTools;
    switch (category) {
        case 'descriptionClarity': {
            const minLength = AI_COMPATIBILITY.DESCRIPTION.MIN_LENGTH;
            const underDescribed = inputs
                .filter((input) => (input.tool.description || '').length < minLength)
                .map((input) => input.tool.name);
            if (underDescribed.length > 0) {
                tools = underDescribed;
            }
            title = 'Improve tool descriptions';
            description = `Add clear, action-oriented descriptions (${minLength}+ chars) that explain what each tool does and when to use it.`;
            break;
        }
        case 'parameterNaming':
            title = 'Use descriptive parameter names';
            description = 'Replace generic names (data, value, input) with specific, semantic names (transactionData, accountId, searchQuery).';
            break;
        case 'errorMessageQuality':
            title = 'Add remediation hints to errors';
            description = 'Include suggestions for fixing errors (e.g., "Invalid date format. Expected: YYYY-MM-DD").';
            break;
        case 'exampleCompleteness':
            title = 'Expand examples';
            description = 'Run with --full-examples to capture complete output samples for AI agent reference.';
            break;
        case 'workflowDocumentation':
            title = 'Document tool workflows';
            description = 'Add sequence/dependency hints to descriptions (e.g., "Call after create_user" or "Requires valid access_token").';
            break;
        case 'responsePredictability':
            title = 'Stabilize response structures';
            description = 'Ensure tools return consistent field names and types across calls.';
            break;
        default:
            // Unknown categories produce no recommendation.
            return null;
    }
    return {
        priority,
        category,
        title,
        description,
        affectedTools: tools,
        potentialImprovement,
    };
}
|
|
508
|
+
/**
 * Calculate letter grade from score.
 *
 * Thresholds are checked from A down to D; the first one the score meets or
 * exceeds determines the grade, and anything lower is an F.
 *
 * @param score - Numeric score to grade.
 * @returns One of 'A', 'B', 'C', 'D' or 'F'.
 */
function calculateGrade(score) {
    for (const letter of ['A', 'B', 'C', 'D']) {
        if (score >= AI_COMPATIBILITY.GRADE_THRESHOLDS[letter]) {
            return letter;
        }
    }
    return 'F';
}
|
|
522
|
+
/**
|
|
523
|
+
* Create an empty score for servers with no tools.
|
|
524
|
+
*/
|
|
525
|
+
function createEmptyScore() {
|
|
526
|
+
const emptyComponent = () => ({
|
|
527
|
+
score: 0,
|
|
528
|
+
weight: 0,
|
|
529
|
+
weightedScore: 0,
|
|
530
|
+
notes: ['No tools available'],
|
|
531
|
+
});
|
|
532
|
+
return {
|
|
533
|
+
overall: 0,
|
|
534
|
+
grade: 'F',
|
|
535
|
+
breakdown: {
|
|
536
|
+
descriptionClarity: emptyComponent(),
|
|
537
|
+
parameterNaming: emptyComponent(),
|
|
538
|
+
errorMessageQuality: emptyComponent(),
|
|
539
|
+
exampleCompleteness: emptyComponent(),
|
|
540
|
+
workflowDocumentation: emptyComponent(),
|
|
541
|
+
responsePredictability: emptyComponent(),
|
|
542
|
+
},
|
|
543
|
+
recommendations: [],
|
|
544
|
+
toolScores: [],
|
|
545
|
+
};
|
|
546
|
+
}
|
|
547
|
+
/**
|
|
548
|
+
* Generate markdown documentation for AI compatibility score.
|
|
549
|
+
*/
|
|
550
|
+
export function generateAICompatibilityMarkdown(score) {
|
|
551
|
+
const lines = [];
|
|
552
|
+
lines.push('## AI Agent Compatibility');
|
|
553
|
+
lines.push('');
|
|
554
|
+
lines.push(`**Overall Score: ${score.overall}/100 (Grade ${score.grade})**`);
|
|
555
|
+
lines.push('');
|
|
556
|
+
// Breakdown table
|
|
557
|
+
lines.push('| Factor | Score | Weight | Notes |');
|
|
558
|
+
lines.push('|--------|-------|--------|-------|');
|
|
559
|
+
const components = [
|
|
560
|
+
{ name: 'Description Clarity', key: 'descriptionClarity' },
|
|
561
|
+
{ name: 'Parameter Naming', key: 'parameterNaming' },
|
|
562
|
+
{ name: 'Error Messages', key: 'errorMessageQuality' },
|
|
563
|
+
{ name: 'Example Completeness', key: 'exampleCompleteness' },
|
|
564
|
+
{ name: 'Workflow Docs', key: 'workflowDocumentation' },
|
|
565
|
+
{ name: 'Response Predictability', key: 'responsePredictability' },
|
|
566
|
+
];
|
|
567
|
+
for (const { name, key } of components) {
|
|
568
|
+
const component = score.breakdown[key];
|
|
569
|
+
const weightPercent = Math.round(component.weight * 100);
|
|
570
|
+
const notes = component.notes.join('; ') || '-';
|
|
571
|
+
lines.push(`| ${name} | ${component.score}/100 | ${weightPercent}% | ${notes} |`);
|
|
572
|
+
}
|
|
573
|
+
lines.push('');
|
|
574
|
+
// Recommendations
|
|
575
|
+
if (score.recommendations.length > 0) {
|
|
576
|
+
lines.push('### Improvement Recommendations');
|
|
577
|
+
lines.push('');
|
|
578
|
+
for (const rec of score.recommendations) {
|
|
579
|
+
lines.push(`${rec.priority}. **${rec.title}** - ${rec.description}`);
|
|
580
|
+
if (rec.affectedTools && rec.affectedTools.length > 0) {
|
|
581
|
+
const tools = rec.affectedTools.slice(0, 5).map(t => `\`${t}\``).join(', ');
|
|
582
|
+
const more = rec.affectedTools.length > 5 ? ` (+${rec.affectedTools.length - 5} more)` : '';
|
|
583
|
+
lines.push(` - Affects: ${tools}${more}`);
|
|
584
|
+
}
|
|
585
|
+
}
|
|
586
|
+
lines.push('');
|
|
587
|
+
}
|
|
588
|
+
// Low-scoring tools
|
|
589
|
+
const lowScoreTools = score.toolScores.filter(t => t.score < 70);
|
|
590
|
+
if (lowScoreTools.length > 0) {
|
|
591
|
+
lines.push('### Tools Needing Attention');
|
|
592
|
+
lines.push('');
|
|
593
|
+
lines.push('| Tool | Score | Issues |');
|
|
594
|
+
lines.push('|------|-------|--------|');
|
|
595
|
+
for (const tool of lowScoreTools.slice(0, 10)) {
|
|
596
|
+
const issues = tool.issues.slice(0, 2).join('; ') || '-';
|
|
597
|
+
lines.push(`| \`${tool.toolName}\` | ${tool.score}/100 | ${issues} |`);
|
|
598
|
+
}
|
|
599
|
+
if (lowScoreTools.length > 10) {
|
|
600
|
+
lines.push(`| ... | ... | ${lowScoreTools.length - 10} more tools below 70 |`);
|
|
601
|
+
}
|
|
602
|
+
lines.push('');
|
|
603
|
+
}
|
|
604
|
+
return lines.join('\n');
|
|
605
|
+
}
|
|
606
|
+
//# sourceMappingURL=ai-compatibility-scorer.js.map
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Confidence Calibration for Drift Detection
|
|
3
|
+
*
|
|
4
|
+
* Calibrates raw confidence scores to match actual accuracy.
|
|
5
|
+
* A calibrated confidence of 80% means the algorithm is correct ~80% of the time
|
|
6
|
+
* when it reports that confidence level.
|
|
7
|
+
*
|
|
8
|
+
* Calibration is based on evaluation against the golden dataset.
|
|
9
|
+
*/
|
|
10
|
+
/**
|
|
11
|
+
* Calibration bucket defining expected accuracy for a confidence range.
|
|
12
|
+
*/
|
|
13
|
+
// A bucket covers the half-open confidence range [min, max): `min` is inclusive, `max` is exclusive.
// NOTE(review): the scale of `calibratedAccuracy` (fraction vs. percentage) is not stated in this
// declaration file — confirm against the implementation before relying on it.
export interface CalibrationBucket {
|
|
14
|
+
/** Minimum confidence in this bucket (inclusive) */
|
|
15
|
+
min: number;
|
|
16
|
+
/** Maximum confidence in this bucket (exclusive) */
|
|
17
|
+
max: number;
|
|
18
|
+
/** Calibrated accuracy for this bucket */
|
|
19
|
+
calibratedAccuracy: number;
|
|
20
|
+
/** Number of samples used to calculate this bucket */
|
|
21
|
+
sampleCount: number;
|
|
22
|
+
}
|
|
23
|
+
/**
|
|
24
|
+
* Default calibration model based on golden dataset evaluation.
|
|
25
|
+
*
|
|
26
|
+
* These values should be updated as the algorithm improves.
|
|
27
|
+
* Current baseline: v1.0.1 (50 test cases)
|
|
28
|
+
*/
|
|
29
|
+
// Typed as a plain (mutable) CalibrationBucket[], so the type system does not stop callers from
// modifying this array. NOTE(review): presumably it should be treated as read-only — confirm.
export declare const DEFAULT_CALIBRATION_MODEL: CalibrationBucket[];
|
|
30
|
+
/**
|
|
31
|
+
* Calibrate a raw confidence score to reflect actual accuracy.
|
|
32
|
+
*
|
|
33
|
+
* @param rawScore - Raw confidence score (0-100)
|
|
34
|
+
* @param model - Calibration model to use (defaults to DEFAULT_CALIBRATION_MODEL)
|
|
35
|
+
* @returns Calibrated confidence score
|
|
36
|
+
*/
|
|
37
|
+
// `model` is optional. NOTE(review): what is returned when rawScore falls inside no bucket of the
// model is not visible from this declaration — confirm in the implementation.
export declare function calibrateConfidence(rawScore: number, model?: CalibrationBucket[]): number;
|
|
38
|
+
/**
|
|
39
|
+
* Format confidence score with calibration information.
|
|
40
|
+
*
|
|
41
|
+
* @param rawScore - Raw confidence score
|
|
42
|
+
* @param showRaw - Whether to show raw score alongside calibrated
|
|
43
|
+
* @returns Formatted string
|
|
44
|
+
*/
|
|
45
|
+
// `showRaw` is optional. NOTE(review): its default value and the exact output format are defined
// by the implementation and cannot be determined from this declaration.
export declare function formatCalibratedConfidence(rawScore: number, showRaw?: boolean): string;
|
|
46
|
+
/**
|
|
47
|
+
* Get confidence label based on calibrated score.
|
|
48
|
+
*/
|
|
49
|
+
// Takes the raw score and returns one of four labels. NOTE(review): the score cut-offs that
// separate the labels are not visible in this declaration — confirm in the implementation.
export declare function getCalibratedConfidenceLabel(rawScore: number): 'high' | 'medium' | 'low' | 'very-low';
|
|
50
|
+
/**
|
|
51
|
+
* Check if a calibrated confidence meets a threshold.
|
|
52
|
+
*
|
|
53
|
+
* @param rawScore - Raw confidence score
|
|
54
|
+
* @param threshold - Minimum required calibrated confidence
|
|
55
|
+
* @returns True if calibrated confidence meets threshold
|
|
56
|
+
*/
|
|
57
|
+
// NOTE(review): presumably `threshold` is on the same scale as the calibrated score and the
// comparison is inclusive ("meets") — neither is visible from this declaration; confirm.
export declare function meetsCalibratedThreshold(rawScore: number, threshold: number): boolean;
|
|
58
|
+
/**
|
|
59
|
+
* Update calibration model based on evaluation results.
|
|
60
|
+
* This recalculates accuracy for each bucket from test results.
|
|
61
|
+
*
|
|
62
|
+
* @param results - Array of {predictedConfidence, wasCorrect} pairs
|
|
63
|
+
* @returns Updated calibration model
|
|
64
|
+
*/
|
|
65
|
+
// Accepts plain { predictedConfidence, wasCorrect } records and returns a CalibrationBucket[].
// NOTE(review): how buckets that receive no samples are handled is not visible here — confirm.
export declare function updateCalibrationModel(results: Array<{
|
|
66
|
+
predictedConfidence: number;
|
|
67
|
+
wasCorrect: boolean;
|
|
68
|
+
}>): CalibrationBucket[];
|
|
69
|
+
/**
|
|
70
|
+
* Calculate calibration error (ECE - Expected Calibration Error).
|
|
71
|
+
* Lower is better. 0 = perfectly calibrated.
|
|
72
|
+
*
|
|
73
|
+
* @param model - Calibration model
|
|
74
|
+
* @returns ECE as a percentage (0-100)
|
|
75
|
+
*/
|
|
76
|
+
// NOTE(review): ECE is conventionally weighted by samples per bucket; presumably this uses each
// bucket's `sampleCount` — not visible from the declaration, confirm in the implementation.
export declare function calculateCalibrationError(model: CalibrationBucket[]): number;
|
|
77
|
+
//# sourceMappingURL=calibration.d.ts.map
|