@dotsetlabs/bellwether 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +291 -0
- package/LICENSE +21 -0
- package/README.md +739 -0
- package/dist/auth/credentials.d.ts +64 -0
- package/dist/auth/credentials.js +218 -0
- package/dist/auth/index.d.ts +6 -0
- package/dist/auth/index.js +6 -0
- package/dist/auth/keychain.d.ts +64 -0
- package/dist/auth/keychain.js +268 -0
- package/dist/baseline/ab-testing.d.ts +80 -0
- package/dist/baseline/ab-testing.js +236 -0
- package/dist/baseline/ai-compatibility-scorer.d.ts +95 -0
- package/dist/baseline/ai-compatibility-scorer.js +606 -0
- package/dist/baseline/calibration.d.ts +77 -0
- package/dist/baseline/calibration.js +136 -0
- package/dist/baseline/category-matching.d.ts +85 -0
- package/dist/baseline/category-matching.js +289 -0
- package/dist/baseline/change-impact-analyzer.d.ts +98 -0
- package/dist/baseline/change-impact-analyzer.js +592 -0
- package/dist/baseline/comparator.d.ts +64 -0
- package/dist/baseline/comparator.js +916 -0
- package/dist/baseline/confidence.d.ts +55 -0
- package/dist/baseline/confidence.js +122 -0
- package/dist/baseline/converter.d.ts +61 -0
- package/dist/baseline/converter.js +585 -0
- package/dist/baseline/dependency-analyzer.d.ts +89 -0
- package/dist/baseline/dependency-analyzer.js +567 -0
- package/dist/baseline/deprecation-tracker.d.ts +133 -0
- package/dist/baseline/deprecation-tracker.js +322 -0
- package/dist/baseline/diff.d.ts +55 -0
- package/dist/baseline/diff.js +1584 -0
- package/dist/baseline/documentation-scorer.d.ts +205 -0
- package/dist/baseline/documentation-scorer.js +466 -0
- package/dist/baseline/embeddings.d.ts +118 -0
- package/dist/baseline/embeddings.js +251 -0
- package/dist/baseline/error-analyzer.d.ts +198 -0
- package/dist/baseline/error-analyzer.js +721 -0
- package/dist/baseline/evaluation/evaluator.d.ts +42 -0
- package/dist/baseline/evaluation/evaluator.js +323 -0
- package/dist/baseline/evaluation/expanded-dataset.d.ts +45 -0
- package/dist/baseline/evaluation/expanded-dataset.js +1164 -0
- package/dist/baseline/evaluation/golden-dataset.d.ts +58 -0
- package/dist/baseline/evaluation/golden-dataset.js +717 -0
- package/dist/baseline/evaluation/index.d.ts +15 -0
- package/dist/baseline/evaluation/index.js +15 -0
- package/dist/baseline/evaluation/types.d.ts +186 -0
- package/dist/baseline/evaluation/types.js +8 -0
- package/dist/baseline/external-dependency-detector.d.ts +181 -0
- package/dist/baseline/external-dependency-detector.js +524 -0
- package/dist/baseline/golden-output.d.ts +162 -0
- package/dist/baseline/golden-output.js +636 -0
- package/dist/baseline/health-scorer.d.ts +174 -0
- package/dist/baseline/health-scorer.js +451 -0
- package/dist/baseline/incremental-checker.d.ts +97 -0
- package/dist/baseline/incremental-checker.js +174 -0
- package/dist/baseline/index.d.ts +31 -0
- package/dist/baseline/index.js +42 -0
- package/dist/baseline/migration-generator.d.ts +137 -0
- package/dist/baseline/migration-generator.js +554 -0
- package/dist/baseline/migrations.d.ts +60 -0
- package/dist/baseline/migrations.js +197 -0
- package/dist/baseline/performance-tracker.d.ts +214 -0
- package/dist/baseline/performance-tracker.js +577 -0
- package/dist/baseline/pr-comment-generator.d.ts +117 -0
- package/dist/baseline/pr-comment-generator.js +546 -0
- package/dist/baseline/response-fingerprint.d.ts +127 -0
- package/dist/baseline/response-fingerprint.js +728 -0
- package/dist/baseline/response-schema-tracker.d.ts +129 -0
- package/dist/baseline/response-schema-tracker.js +420 -0
- package/dist/baseline/risk-scorer.d.ts +54 -0
- package/dist/baseline/risk-scorer.js +434 -0
- package/dist/baseline/saver.d.ts +89 -0
- package/dist/baseline/saver.js +554 -0
- package/dist/baseline/scenario-generator.d.ts +151 -0
- package/dist/baseline/scenario-generator.js +905 -0
- package/dist/baseline/schema-compare.d.ts +86 -0
- package/dist/baseline/schema-compare.js +557 -0
- package/dist/baseline/schema-evolution.d.ts +189 -0
- package/dist/baseline/schema-evolution.js +467 -0
- package/dist/baseline/semantic.d.ts +203 -0
- package/dist/baseline/semantic.js +908 -0
- package/dist/baseline/synonyms.d.ts +60 -0
- package/dist/baseline/synonyms.js +386 -0
- package/dist/baseline/telemetry.d.ts +165 -0
- package/dist/baseline/telemetry.js +294 -0
- package/dist/baseline/test-pruner.d.ts +120 -0
- package/dist/baseline/test-pruner.js +387 -0
- package/dist/baseline/types.d.ts +449 -0
- package/dist/baseline/types.js +5 -0
- package/dist/baseline/version.d.ts +138 -0
- package/dist/baseline/version.js +206 -0
- package/dist/cache/index.d.ts +5 -0
- package/dist/cache/index.js +5 -0
- package/dist/cache/response-cache.d.ts +151 -0
- package/dist/cache/response-cache.js +287 -0
- package/dist/ci/index.d.ts +60 -0
- package/dist/ci/index.js +342 -0
- package/dist/cli/commands/auth.d.ts +12 -0
- package/dist/cli/commands/auth.js +352 -0
- package/dist/cli/commands/badge.d.ts +3 -0
- package/dist/cli/commands/badge.js +74 -0
- package/dist/cli/commands/baseline-accept.d.ts +15 -0
- package/dist/cli/commands/baseline-accept.js +178 -0
- package/dist/cli/commands/baseline-migrate.d.ts +12 -0
- package/dist/cli/commands/baseline-migrate.js +164 -0
- package/dist/cli/commands/baseline.d.ts +14 -0
- package/dist/cli/commands/baseline.js +449 -0
- package/dist/cli/commands/beta.d.ts +10 -0
- package/dist/cli/commands/beta.js +231 -0
- package/dist/cli/commands/check.d.ts +11 -0
- package/dist/cli/commands/check.js +820 -0
- package/dist/cli/commands/cloud/badge.d.ts +3 -0
- package/dist/cli/commands/cloud/badge.js +74 -0
- package/dist/cli/commands/cloud/diff.d.ts +6 -0
- package/dist/cli/commands/cloud/diff.js +79 -0
- package/dist/cli/commands/cloud/history.d.ts +6 -0
- package/dist/cli/commands/cloud/history.js +102 -0
- package/dist/cli/commands/cloud/link.d.ts +9 -0
- package/dist/cli/commands/cloud/link.js +119 -0
- package/dist/cli/commands/cloud/login.d.ts +7 -0
- package/dist/cli/commands/cloud/login.js +499 -0
- package/dist/cli/commands/cloud/projects.d.ts +6 -0
- package/dist/cli/commands/cloud/projects.js +44 -0
- package/dist/cli/commands/cloud/shared.d.ts +7 -0
- package/dist/cli/commands/cloud/shared.js +42 -0
- package/dist/cli/commands/cloud/teams.d.ts +8 -0
- package/dist/cli/commands/cloud/teams.js +169 -0
- package/dist/cli/commands/cloud/upload.d.ts +8 -0
- package/dist/cli/commands/cloud/upload.js +181 -0
- package/dist/cli/commands/contract.d.ts +11 -0
- package/dist/cli/commands/contract.js +280 -0
- package/dist/cli/commands/discover.d.ts +3 -0
- package/dist/cli/commands/discover.js +82 -0
- package/dist/cli/commands/eval.d.ts +9 -0
- package/dist/cli/commands/eval.js +187 -0
- package/dist/cli/commands/explore.d.ts +11 -0
- package/dist/cli/commands/explore.js +437 -0
- package/dist/cli/commands/feedback.d.ts +9 -0
- package/dist/cli/commands/feedback.js +174 -0
- package/dist/cli/commands/golden.d.ts +12 -0
- package/dist/cli/commands/golden.js +407 -0
- package/dist/cli/commands/history.d.ts +10 -0
- package/dist/cli/commands/history.js +202 -0
- package/dist/cli/commands/init.d.ts +9 -0
- package/dist/cli/commands/init.js +219 -0
- package/dist/cli/commands/interview.d.ts +3 -0
- package/dist/cli/commands/interview.js +903 -0
- package/dist/cli/commands/link.d.ts +10 -0
- package/dist/cli/commands/link.js +169 -0
- package/dist/cli/commands/login.d.ts +7 -0
- package/dist/cli/commands/login.js +499 -0
- package/dist/cli/commands/preset.d.ts +33 -0
- package/dist/cli/commands/preset.js +297 -0
- package/dist/cli/commands/profile.d.ts +33 -0
- package/dist/cli/commands/profile.js +286 -0
- package/dist/cli/commands/registry.d.ts +11 -0
- package/dist/cli/commands/registry.js +146 -0
- package/dist/cli/commands/shared.d.ts +79 -0
- package/dist/cli/commands/shared.js +196 -0
- package/dist/cli/commands/teams.d.ts +8 -0
- package/dist/cli/commands/teams.js +169 -0
- package/dist/cli/commands/test.d.ts +9 -0
- package/dist/cli/commands/test.js +500 -0
- package/dist/cli/commands/upload.d.ts +8 -0
- package/dist/cli/commands/upload.js +223 -0
- package/dist/cli/commands/validate-config.d.ts +6 -0
- package/dist/cli/commands/validate-config.js +35 -0
- package/dist/cli/commands/verify.d.ts +11 -0
- package/dist/cli/commands/verify.js +283 -0
- package/dist/cli/commands/watch.d.ts +12 -0
- package/dist/cli/commands/watch.js +253 -0
- package/dist/cli/index.d.ts +3 -0
- package/dist/cli/index.js +178 -0
- package/dist/cli/interactive.d.ts +47 -0
- package/dist/cli/interactive.js +216 -0
- package/dist/cli/output/terminal-reporter.d.ts +19 -0
- package/dist/cli/output/terminal-reporter.js +104 -0
- package/dist/cli/output.d.ts +226 -0
- package/dist/cli/output.js +438 -0
- package/dist/cli/utils/env.d.ts +5 -0
- package/dist/cli/utils/env.js +14 -0
- package/dist/cli/utils/progress.d.ts +59 -0
- package/dist/cli/utils/progress.js +206 -0
- package/dist/cli/utils/server-context.d.ts +10 -0
- package/dist/cli/utils/server-context.js +36 -0
- package/dist/cloud/auth.d.ts +144 -0
- package/dist/cloud/auth.js +374 -0
- package/dist/cloud/client.d.ts +24 -0
- package/dist/cloud/client.js +65 -0
- package/dist/cloud/http-client.d.ts +38 -0
- package/dist/cloud/http-client.js +215 -0
- package/dist/cloud/index.d.ts +23 -0
- package/dist/cloud/index.js +25 -0
- package/dist/cloud/mock-client.d.ts +107 -0
- package/dist/cloud/mock-client.js +545 -0
- package/dist/cloud/types.d.ts +515 -0
- package/dist/cloud/types.js +15 -0
- package/dist/config/defaults.d.ts +160 -0
- package/dist/config/defaults.js +169 -0
- package/dist/config/loader.d.ts +24 -0
- package/dist/config/loader.js +122 -0
- package/dist/config/template.d.ts +42 -0
- package/dist/config/template.js +647 -0
- package/dist/config/validator.d.ts +2112 -0
- package/dist/config/validator.js +658 -0
- package/dist/constants/cloud.d.ts +107 -0
- package/dist/constants/cloud.js +110 -0
- package/dist/constants/core.d.ts +521 -0
- package/dist/constants/core.js +556 -0
- package/dist/constants/testing.d.ts +1283 -0
- package/dist/constants/testing.js +1568 -0
- package/dist/constants.d.ts +10 -0
- package/dist/constants.js +10 -0
- package/dist/contract/index.d.ts +6 -0
- package/dist/contract/index.js +5 -0
- package/dist/contract/validator.d.ts +177 -0
- package/dist/contract/validator.js +574 -0
- package/dist/cost/index.d.ts +6 -0
- package/dist/cost/index.js +5 -0
- package/dist/cost/tracker.d.ts +134 -0
- package/dist/cost/tracker.js +313 -0
- package/dist/discovery/discovery.d.ts +16 -0
- package/dist/discovery/discovery.js +173 -0
- package/dist/discovery/types.d.ts +51 -0
- package/dist/discovery/types.js +2 -0
- package/dist/docs/agents.d.ts +3 -0
- package/dist/docs/agents.js +995 -0
- package/dist/docs/contract.d.ts +51 -0
- package/dist/docs/contract.js +1681 -0
- package/dist/docs/generator.d.ts +4 -0
- package/dist/docs/generator.js +4 -0
- package/dist/docs/html-reporter.d.ts +9 -0
- package/dist/docs/html-reporter.js +757 -0
- package/dist/docs/index.d.ts +10 -0
- package/dist/docs/index.js +11 -0
- package/dist/docs/junit-reporter.d.ts +18 -0
- package/dist/docs/junit-reporter.js +210 -0
- package/dist/docs/report.d.ts +14 -0
- package/dist/docs/report.js +44 -0
- package/dist/docs/sarif-reporter.d.ts +19 -0
- package/dist/docs/sarif-reporter.js +335 -0
- package/dist/docs/shared.d.ts +35 -0
- package/dist/docs/shared.js +162 -0
- package/dist/docs/templates.d.ts +12 -0
- package/dist/docs/templates.js +76 -0
- package/dist/errors/index.d.ts +6 -0
- package/dist/errors/index.js +6 -0
- package/dist/errors/retry.d.ts +92 -0
- package/dist/errors/retry.js +323 -0
- package/dist/errors/types.d.ts +321 -0
- package/dist/errors/types.js +584 -0
- package/dist/index.d.ts +32 -0
- package/dist/index.js +32 -0
- package/dist/interview/dependency-resolver.d.ts +11 -0
- package/dist/interview/dependency-resolver.js +32 -0
- package/dist/interview/interviewer.d.ts +232 -0
- package/dist/interview/interviewer.js +1939 -0
- package/dist/interview/mock-response-generator.d.ts +7 -0
- package/dist/interview/mock-response-generator.js +102 -0
- package/dist/interview/orchestrator.d.ts +237 -0
- package/dist/interview/orchestrator.js +1296 -0
- package/dist/interview/rate-limiter.d.ts +15 -0
- package/dist/interview/rate-limiter.js +55 -0
- package/dist/interview/response-validator.d.ts +10 -0
- package/dist/interview/response-validator.js +132 -0
- package/dist/interview/schema-inferrer.d.ts +8 -0
- package/dist/interview/schema-inferrer.js +71 -0
- package/dist/interview/schema-test-generator.d.ts +71 -0
- package/dist/interview/schema-test-generator.js +834 -0
- package/dist/interview/smart-value-generator.d.ts +155 -0
- package/dist/interview/smart-value-generator.js +554 -0
- package/dist/interview/stateful-test-runner.d.ts +19 -0
- package/dist/interview/stateful-test-runner.js +106 -0
- package/dist/interview/types.d.ts +561 -0
- package/dist/interview/types.js +2 -0
- package/dist/llm/anthropic.d.ts +41 -0
- package/dist/llm/anthropic.js +355 -0
- package/dist/llm/client.d.ts +123 -0
- package/dist/llm/client.js +42 -0
- package/dist/llm/factory.d.ts +38 -0
- package/dist/llm/factory.js +145 -0
- package/dist/llm/fallback.d.ts +140 -0
- package/dist/llm/fallback.js +379 -0
- package/dist/llm/index.d.ts +18 -0
- package/dist/llm/index.js +15 -0
- package/dist/llm/ollama.d.ts +37 -0
- package/dist/llm/ollama.js +330 -0
- package/dist/llm/openai.d.ts +25 -0
- package/dist/llm/openai.js +320 -0
- package/dist/llm/token-budget.d.ts +161 -0
- package/dist/llm/token-budget.js +395 -0
- package/dist/logging/logger.d.ts +70 -0
- package/dist/logging/logger.js +130 -0
- package/dist/metrics/collector.d.ts +106 -0
- package/dist/metrics/collector.js +547 -0
- package/dist/metrics/index.d.ts +7 -0
- package/dist/metrics/index.js +7 -0
- package/dist/metrics/prometheus.d.ts +20 -0
- package/dist/metrics/prometheus.js +241 -0
- package/dist/metrics/types.d.ts +209 -0
- package/dist/metrics/types.js +5 -0
- package/dist/persona/builtins.d.ts +54 -0
- package/dist/persona/builtins.js +219 -0
- package/dist/persona/index.d.ts +8 -0
- package/dist/persona/index.js +8 -0
- package/dist/persona/loader.d.ts +30 -0
- package/dist/persona/loader.js +190 -0
- package/dist/persona/types.d.ts +144 -0
- package/dist/persona/types.js +5 -0
- package/dist/persona/validation.d.ts +94 -0
- package/dist/persona/validation.js +332 -0
- package/dist/prompts/index.d.ts +5 -0
- package/dist/prompts/index.js +5 -0
- package/dist/prompts/templates.d.ts +180 -0
- package/dist/prompts/templates.js +431 -0
- package/dist/registry/client.d.ts +49 -0
- package/dist/registry/client.js +191 -0
- package/dist/registry/index.d.ts +7 -0
- package/dist/registry/index.js +6 -0
- package/dist/registry/types.d.ts +140 -0
- package/dist/registry/types.js +6 -0
- package/dist/scenarios/evaluator.d.ts +43 -0
- package/dist/scenarios/evaluator.js +206 -0
- package/dist/scenarios/index.d.ts +10 -0
- package/dist/scenarios/index.js +9 -0
- package/dist/scenarios/loader.d.ts +20 -0
- package/dist/scenarios/loader.js +285 -0
- package/dist/scenarios/types.d.ts +153 -0
- package/dist/scenarios/types.js +8 -0
- package/dist/security/index.d.ts +17 -0
- package/dist/security/index.js +18 -0
- package/dist/security/payloads.d.ts +61 -0
- package/dist/security/payloads.js +268 -0
- package/dist/security/security-tester.d.ts +42 -0
- package/dist/security/security-tester.js +582 -0
- package/dist/security/types.d.ts +166 -0
- package/dist/security/types.js +8 -0
- package/dist/transport/base-transport.d.ts +59 -0
- package/dist/transport/base-transport.js +38 -0
- package/dist/transport/http-transport.d.ts +67 -0
- package/dist/transport/http-transport.js +238 -0
- package/dist/transport/mcp-client.d.ts +141 -0
- package/dist/transport/mcp-client.js +496 -0
- package/dist/transport/sse-transport.d.ts +88 -0
- package/dist/transport/sse-transport.js +316 -0
- package/dist/transport/stdio-transport.d.ts +43 -0
- package/dist/transport/stdio-transport.js +238 -0
- package/dist/transport/types.d.ts +125 -0
- package/dist/transport/types.js +16 -0
- package/dist/utils/concurrency.d.ts +123 -0
- package/dist/utils/concurrency.js +213 -0
- package/dist/utils/formatters.d.ts +16 -0
- package/dist/utils/formatters.js +37 -0
- package/dist/utils/index.d.ts +8 -0
- package/dist/utils/index.js +8 -0
- package/dist/utils/jsonpath.d.ts +87 -0
- package/dist/utils/jsonpath.js +326 -0
- package/dist/utils/markdown.d.ts +113 -0
- package/dist/utils/markdown.js +265 -0
- package/dist/utils/network.d.ts +14 -0
- package/dist/utils/network.js +17 -0
- package/dist/utils/sanitize.d.ts +92 -0
- package/dist/utils/sanitize.js +191 -0
- package/dist/utils/semantic.d.ts +194 -0
- package/dist/utils/semantic.js +1051 -0
- package/dist/utils/smart-truncate.d.ts +94 -0
- package/dist/utils/smart-truncate.js +361 -0
- package/dist/utils/timeout.d.ts +153 -0
- package/dist/utils/timeout.js +205 -0
- package/dist/utils/yaml-parser.d.ts +58 -0
- package/dist/utils/yaml-parser.js +86 -0
- package/dist/validation/index.d.ts +32 -0
- package/dist/validation/index.js +32 -0
- package/dist/validation/semantic-test-generator.d.ts +50 -0
- package/dist/validation/semantic-test-generator.js +176 -0
- package/dist/validation/semantic-types.d.ts +66 -0
- package/dist/validation/semantic-types.js +94 -0
- package/dist/validation/semantic-validator.d.ts +38 -0
- package/dist/validation/semantic-validator.js +340 -0
- package/dist/verification/index.d.ts +6 -0
- package/dist/verification/index.js +5 -0
- package/dist/verification/types.d.ts +133 -0
- package/dist/verification/types.js +5 -0
- package/dist/verification/verifier.d.ts +30 -0
- package/dist/verification/verifier.js +309 -0
- package/dist/version.d.ts +19 -0
- package/dist/version.js +48 -0
- package/dist/workflow/auto-generator.d.ts +27 -0
- package/dist/workflow/auto-generator.js +513 -0
- package/dist/workflow/discovery.d.ts +40 -0
- package/dist/workflow/discovery.js +195 -0
- package/dist/workflow/executor.d.ts +82 -0
- package/dist/workflow/executor.js +611 -0
- package/dist/workflow/index.d.ts +10 -0
- package/dist/workflow/index.js +10 -0
- package/dist/workflow/loader.d.ts +24 -0
- package/dist/workflow/loader.js +194 -0
- package/dist/workflow/state-tracker.d.ts +98 -0
- package/dist/workflow/state-tracker.js +424 -0
- package/dist/workflow/types.d.ts +337 -0
- package/dist/workflow/types.js +5 -0
- package/package.json +94 -0
- package/schemas/bellwether-check.schema.json +651 -0
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* A/B Testing Framework for Drift Detection Algorithms
|
|
3
|
+
*
|
|
4
|
+
* Enables safe comparison of algorithm variants to measure
|
|
5
|
+
* which performs better on the golden dataset.
|
|
6
|
+
*/
|
|
7
|
+
import type { EvaluationResult, EvaluationOptions } from './evaluation/types.js';
|
|
8
|
+
import type { SemanticComparator } from './evaluation/types.js';
|
|
9
|
+
/**
 * One candidate algorithm taking part in an A/B test.
 */
export interface AlgorithmVariant {
    /** Display name of the variant; test results are looked up by this name. */
    name: string;
    /** Version label of the variant. */
    version: string;
    /** Optional summary of what distinguishes this variant. */
    description?: string;
    /** Semantic comparator implementation that is evaluated for this variant. */
    comparator: SemanticComparator;
}
|
|
22
|
+
/**
 * Outcome of an A/B test run over several algorithm variants.
 */
export interface ABTestResult {
    /** Evaluation result of every variant, keyed by variant name. */
    variantResults: Map<string, EvaluationResult>;
    /** Variant with the highest F1 score, or `null` if none was selected. */
    winner: AlgorithmVariant | null;
    /** Pairwise metric comparisons between the variants. */
    comparison: VariantComparison[];
    /** Statistical significance analysis of the run. */
    significance: SignificanceAnalysis;
    /** Human-readable summary recommendations. */
    recommendations: string[];
}
|
|
37
|
+
/**
 * Metric-by-metric comparison of one pair of variants.
 *
 * Every `*Diff` field is the value of `variant1` minus the value of
 * `variant2`, so a positive number means `variant1` scored higher.
 */
export interface VariantComparison {
    /** Name of the first variant of the pair. */
    variant1: string;
    /** Name of the second variant of the pair. */
    variant2: string;
    /** Accuracy of `variant1` minus accuracy of `variant2`. */
    accuracyDiff: number;
    /** Precision of `variant1` minus precision of `variant2`. */
    precisionDiff: number;
    /** Recall of `variant1` minus recall of `variant2`. */
    recallDiff: number;
    /** F1 score of `variant1` minus F1 score of `variant2`. */
    f1Diff: number;
    /** Name of the variant of this pair that is ranked higher by F1 score. */
    winner: string;
}
|
|
49
|
+
/**
 * Summary of whether the differences observed in an A/B test can be trusted.
 */
export interface SignificanceAnalysis {
    /** `true` when the differences are considered statistically significant. */
    isSignificant: boolean;
    /** Smallest sample size at which results are treated as significant. */
    recommendedSampleSize: number;
    /** Sample size that was actually used. */
    actualSampleSize: number;
    /** Confidence level as a fraction (e.g. 0.95 for 95%). */
    confidenceLevel: number;
    /** Free-text explanation of the analysis. */
    notes: string;
}
|
|
64
|
+
/**
 * Evaluate several algorithm variants and compare them with each other.
 *
 * @param variants - Variants to compare; at least two are required.
 * @param options - Optional evaluation options applied to every variant.
 * @returns Per-variant results, pairwise comparisons, the overall winner,
 *   a significance analysis and recommendations.
 * @throws Error when fewer than two variants are supplied.
 */
export declare function runABTest(variants: AlgorithmVariant[], options?: EvaluationOptions): ABTestResult;
|
|
68
|
+
/**
 * Render an A/B test result as a multi-line plain-text report.
 *
 * @param result - Result produced by an A/B test run.
 * @returns The report as a single newline-separated string.
 */
export declare function formatABTestReport(result: ABTestResult): string;
|
|
72
|
+
/**
 * Build the variant that represents the current (default) implementation.
 *
 * @returns A variant named `default` backed by the default semantic comparator.
 */
export declare function createDefaultVariant(): AlgorithmVariant;
|
|
76
|
+
/**
 * Convenience wrapper that A/B tests exactly two comparators.
 *
 * @param baseline - Comparator reported under the variant name `baseline`.
 * @param candidate - Comparator reported under the variant name `candidate`.
 * @param options - Optional evaluation options applied to both comparators.
 * @returns The A/B test result for the two comparators.
 */
export declare function compareAlgorithms(baseline: SemanticComparator, candidate: SemanticComparator, options?: EvaluationOptions): ABTestResult;
|
|
80
|
+
//# sourceMappingURL=ab-testing.d.ts.map
|
|
@@ -0,0 +1,236 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* A/B Testing Framework for Drift Detection Algorithms
|
|
3
|
+
*
|
|
4
|
+
* Enables safe comparison of algorithm variants to measure
|
|
5
|
+
* which performs better on the golden dataset.
|
|
6
|
+
*/
|
|
7
|
+
import { GOLDEN_DATASET } from './evaluation/golden-dataset.js';
|
|
8
|
+
import { evaluate, DefaultSemanticComparator } from './evaluation/evaluator.js';
|
|
9
|
+
/**
 * Run an A/B test comparing multiple algorithm variants.
 *
 * Every variant is evaluated once with the same options, each pair of
 * variants is compared metric by metric, and the variant with the highest
 * F1 score is reported as the overall winner. Ties are resolved in favour of
 * the variant that appears first in `variants`, both for the overall winner
 * and for each pairwise comparison.
 *
 * @param variants - Variants to compare; at least two, with unique names.
 * @param options - Evaluation options forwarded unchanged to `evaluate`.
 * @returns Per-variant results, pairwise comparisons, the overall winner,
 *   a significance analysis and textual recommendations.
 * @throws Error if fewer than two variants are given, or if two variants
 *   share the same name.
 */
export function runABTest(variants, options = {}) {
    if (variants.length < 2) {
        throw new Error('A/B test requires at least 2 variants');
    }
    // Results are keyed by variant name, so a repeated name would silently
    // overwrite an earlier result and make a variant be compared to itself.
    const distinctNames = new Set(variants.map((variant) => variant.name));
    if (distinctNames.size !== variants.length) {
        throw new Error('A/B test variants must have unique names');
    }
    const variantResults = new Map();
    // Evaluate each variant
    for (const variant of variants) {
        variantResults.set(variant.name, evaluate(options, variant.comparator));
    }
    // Generate pairwise comparisons (each unordered pair exactly once)
    const comparison = [];
    for (let i = 0; i < variants.length; i++) {
        for (let j = i + 1; j < variants.length; j++) {
            const v1 = variants[i];
            const v2 = variants[j];
            const r1 = variantResults.get(v1.name);
            const r2 = variantResults.get(v2.name);
            comparison.push({
                variant1: v1.name,
                variant2: v2.name,
                accuracyDiff: r1.accuracy - r2.accuracy,
                precisionDiff: r1.precision - r2.precision,
                recallDiff: r1.recall - r2.recall,
                f1Diff: r1.f1Score - r2.f1Score,
                // `>=` so that a tie goes to the earlier variant, matching
                // the tie-breaking of the overall winner below.
                winner: r1.f1Score >= r2.f1Score ? v1.name : v2.name,
            });
        }
    }
    // Determine overall winner (highest F1 score; first variant wins ties)
    let winner = null;
    let highestF1 = -1;
    for (const variant of variants) {
        const result = variantResults.get(variant.name);
        if (result.f1Score > highestF1) {
            highestF1 = result.f1Score;
            winner = variant;
        }
    }
    // Calculate statistical significance.
    // NOTE(review): the sample size is always the full golden dataset length;
    // if `options` can restrict which cases are evaluated, this may overstate
    // the real sample size — confirm against `evaluate`.
    const significance = analyzeSignificance(variantResults, GOLDEN_DATASET.length);
    // Generate recommendations
    const recommendations = generateRecommendations(variantResults, winner, significance);
    return {
        variantResults,
        winner,
        comparison,
        significance,
        recommendations,
    };
}
|
|
63
|
+
/**
 * Heuristic significance check for a set of evaluation results.
 *
 * Results count as significant only when the sample size reaches the
 * recommended minimum AND the population variance of the variants' F1 scores
 * is greater than 25 (i.e. a standard deviation above 5 points).
 *
 * @param results - Map of variant name to evaluation result; only the
 *   `f1Score` of each result is read.
 * @param sampleSize - Number of samples the results are based on.
 * @returns The significance verdict together with an explanatory note.
 */
function analyzeSignificance(results, sampleSize) {
    // Roughly 385 samples are needed for a 5% margin of error at 95%
    // confidence when classifying a binary outcome with ~50% accuracy.
    const recommendedSampleSize = 385;
    const f1Spread = calculateVariance(Array.from(results.values(), (entry) => entry.f1Score));
    const sampleIsLargeEnough = sampleSize >= recommendedSampleSize;
    const isSignificant = sampleIsLargeEnough && f1Spread > 25;
    let notes;
    if (!sampleIsLargeEnough) {
        notes = `Sample size (${sampleSize}) is below recommended (${recommendedSampleSize}). Results may not be statistically significant.`;
    }
    else if (isSignificant) {
        notes = 'Results appear statistically significant with the current sample size.';
    }
    else {
        notes = 'Differences between variants are small. Consider running more targeted tests.';
    }
    return {
        isSignificant,
        recommendedSampleSize,
        actualSampleSize: sampleSize,
        confidenceLevel: 0.95,
        notes,
    };
}
|
|
96
|
+
/**
 * Population variance of a list of numbers.
 *
 * @param values - Numbers to analyse.
 * @returns The mean of the squared deviations from the mean, or 0 for an
 *   empty list.
 */
function calculateVariance(values) {
    const count = values.length;
    if (count === 0) {
        return 0;
    }
    let total = 0;
    for (const value of values) {
        total += value;
    }
    const mean = total / count;
    let squaredDeviationSum = 0;
    for (const value of values) {
        squaredDeviationSum += (value - mean) ** 2;
    }
    // Divide by the count (not count - 1): population variance.
    return squaredDeviationSum / count;
}
|
|
106
|
+
/**
|
|
107
|
+
* Generate recommendations based on A/B test results.
|
|
108
|
+
*/
|
|
109
|
+
function generateRecommendations(results, winner, significance) {
|
|
110
|
+
const recommendations = [];
|
|
111
|
+
if (!winner) {
|
|
112
|
+
recommendations.push('No clear winner - all variants performed similarly.');
|
|
113
|
+
return recommendations;
|
|
114
|
+
}
|
|
115
|
+
const winnerResult = results.get(winner.name);
|
|
116
|
+
// Check precision/recall tradeoff
|
|
117
|
+
if (winnerResult.precision > 90 && winnerResult.recall < 50) {
|
|
118
|
+
recommendations.push(`${winner.name} has high precision (${winnerResult.precision}%) but low recall (${winnerResult.recall}%). ` +
|
|
119
|
+
'Consider if missing matches is acceptable for your use case.');
|
|
120
|
+
}
|
|
121
|
+
else if (winnerResult.recall > 90 && winnerResult.precision < 70) {
|
|
122
|
+
recommendations.push(`${winner.name} has high recall (${winnerResult.recall}%) but lower precision (${winnerResult.precision}%). ` +
|
|
123
|
+
'This may result in more false positives.');
|
|
124
|
+
}
|
|
125
|
+
// Check significance
|
|
126
|
+
if (!significance.isSignificant) {
|
|
127
|
+
recommendations.push('Results are not statistically significant. ' +
|
|
128
|
+
`Consider expanding the test dataset to ${significance.recommendedSampleSize}+ cases.`);
|
|
129
|
+
}
|
|
130
|
+
// Check false positive rate
|
|
131
|
+
if (winnerResult.falsePositives > 0) {
|
|
132
|
+
const fpRate = (winnerResult.falsePositives / winnerResult.totalCases) * 100;
|
|
133
|
+
if (fpRate > 5) {
|
|
134
|
+
recommendations.push(`False positive rate (${fpRate.toFixed(1)}%) is above 5%. ` +
|
|
135
|
+
'Consider tightening matching criteria.');
|
|
136
|
+
}
|
|
137
|
+
}
|
|
138
|
+
// Overall recommendation
|
|
139
|
+
if (significance.isSignificant && winnerResult.f1Score > 70) {
|
|
140
|
+
recommendations.push(`Recommend adopting ${winner.name} (F1: ${winnerResult.f1Score}%, Precision: ${winnerResult.precision}%, Recall: ${winnerResult.recall}%).`);
|
|
141
|
+
}
|
|
142
|
+
else {
|
|
143
|
+
recommendations.push('Continue iterating on algorithm improvements before deployment.');
|
|
144
|
+
}
|
|
145
|
+
return recommendations;
|
|
146
|
+
}
|
|
147
|
+
/**
 * Format A/B test results for display.
 *
 * The report lists every variant's metrics (marking the winner), the
 * pairwise comparisons, the significance analysis and the recommendations.
 *
 * @param result - A/B test result to render.
 * @returns The report as a single string with lines joined by `\n`.
 */
export function formatABTestReport(result) {
    const heavyRule = '═══════════════════════════════════════════════════════════════';
    const lightRule = ' ────────────────────────────────────────────────────────────';
    // Signed, one-decimal rendering of a metric difference. Defined once
    // instead of per loop iteration. Values that round to zero are shown as
    // plain "0.0" rather than the misleading "+0.0" / "-0.0".
    const formatDelta = (delta) => {
        const rounded = delta.toFixed(1);
        if (Number(rounded) === 0) {
            return '0.0';
        }
        return delta > 0 ? `+${rounded}` : rounded;
    };
    const lines = [];
    lines.push('');
    lines.push(heavyRule);
    lines.push(' A/B TEST RESULTS ');
    lines.push(heavyRule);
    lines.push('');
    // Variant results
    lines.push(' VARIANT PERFORMANCE');
    lines.push(lightRule);
    for (const [name, evalResult] of result.variantResults) {
        const isWinner = result.winner?.name === name;
        const marker = isWinner ? ' ★ WINNER' : '';
        lines.push(` ${name}${marker}`);
        lines.push(` Accuracy: ${evalResult.accuracy}%`);
        lines.push(` Precision: ${evalResult.precision}%`);
        lines.push(` Recall: ${evalResult.recall}%`);
        lines.push(` F1 Score: ${evalResult.f1Score}%`);
        lines.push('');
    }
    // Comparisons
    if (result.comparison.length > 0) {
        lines.push(' PAIRWISE COMPARISONS');
        lines.push(lightRule);
        for (const comp of result.comparison) {
            lines.push(` ${comp.variant1} vs ${comp.variant2}:`);
            lines.push(` Accuracy: ${formatDelta(comp.accuracyDiff)}%`);
            lines.push(` Precision: ${formatDelta(comp.precisionDiff)}%`);
            lines.push(` Recall: ${formatDelta(comp.recallDiff)}%`);
            lines.push(` F1: ${formatDelta(comp.f1Diff)}%`);
            lines.push(` → Winner: ${comp.winner}`);
            lines.push('');
        }
    }
    // Significance
    lines.push(' STATISTICAL SIGNIFICANCE');
    lines.push(lightRule);
    lines.push(` Sample size: ${result.significance.actualSampleSize} / ${result.significance.recommendedSampleSize} recommended`);
    lines.push(` Confidence level: ${(result.significance.confidenceLevel * 100).toFixed(0)}%`);
    lines.push(` Significant: ${result.significance.isSignificant ? 'Yes' : 'No'}`);
    lines.push(` ${result.significance.notes}`);
    lines.push('');
    // Recommendations
    if (result.recommendations.length > 0) {
        lines.push(' RECOMMENDATIONS');
        lines.push(lightRule);
        for (const rec of result.recommendations) {
            lines.push(` • ${rec}`);
        }
        lines.push('');
    }
    lines.push(heavyRule);
    lines.push('');
    return lines.join('\n');
}
|
|
206
|
+
/**
 * Build the variant describing the current default implementation.
 *
 * @returns A variant named `default` that wraps a freshly constructed
 *   `DefaultSemanticComparator`.
 */
export function createDefaultVariant() {
    const comparator = new DefaultSemanticComparator();
    return {
        name: 'default',
        version: '1.2.0',
        description: 'Current production algorithm with negation detection and qualifier extraction',
        comparator,
    };
}
|
|
217
|
+
/**
|
|
218
|
+
* Quick comparison of two algorithms.
|
|
219
|
+
*/
|
|
220
|
+
export function compareAlgorithms(baseline, candidate, options = {}) {
|
|
221
|
+
return runABTest([
|
|
222
|
+
{
|
|
223
|
+
name: 'baseline',
|
|
224
|
+
version: '1.0.0',
|
|
225
|
+
description: 'Current production algorithm',
|
|
226
|
+
comparator: baseline,
|
|
227
|
+
},
|
|
228
|
+
{
|
|
229
|
+
name: 'candidate',
|
|
230
|
+
version: '2.0.0',
|
|
231
|
+
description: 'New candidate algorithm',
|
|
232
|
+
comparator: candidate,
|
|
233
|
+
},
|
|
234
|
+
], options);
|
|
235
|
+
}
|
|
236
|
+
//# sourceMappingURL=ab-testing.js.map
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* AI Agent Compatibility Scoring.
|
|
3
|
+
*
|
|
4
|
+
* Evaluates how well an MCP server is designed for AI agent consumption.
|
|
5
|
+
* Scores tools based on description clarity, parameter naming, error quality,
|
|
6
|
+
* example completeness, workflow documentation, and response predictability.
|
|
7
|
+
*/
|
|
8
|
+
import type { MCPTool } from '../transport/types.js';
|
|
9
|
+
import type { ToolFingerprint, ResponseSchemaEvolution } from './types.js';
|
|
10
|
+
import type { ErrorPattern } from './response-fingerprint.js';
|
|
11
|
+
/**
 * One scored dimension of the AI compatibility score.
 */
export interface ScoreComponent {
    /** Score for this component, on a 0-100 scale. */
    score: number;
    /** Weight of this component within the overall score, from 0 to 1. */
    weight: number;
    /** This component's weighted contribution to the overall score. */
    weightedScore: number;
    /** Human-readable remarks about this component. */
    notes: Array<string>;
}
|
|
24
|
+
/**
 * Full AI compatibility result: the overall score, its letter grade, the
 * per-component breakdown, recommendations, and per-tool scores.
 */
export interface AICompatibilityScore {
    /** Overall score, on a 0-100 scale. */
    overall: number;
    /** Letter grade for the overall score. */
    grade: 'A' | 'B' | 'C' | 'D' | 'F';
    /** Per-component scores that make up the overall score. */
    breakdown: {
        descriptionClarity: ScoreComponent;
        parameterNaming: ScoreComponent;
        errorMessageQuality: ScoreComponent;
        exampleCompleteness: ScoreComponent;
        workflowDocumentation: ScoreComponent;
        responsePredictability: ScoreComponent;
    };
    /** Actionable suggestions for raising the score. */
    recommendations: Array<AICompatibilityRecommendation>;
    /** Scores for each individual tool, for detailed analysis. */
    toolScores: Array<ToolAIScore>;
}
|
|
46
|
+
/**
 * One suggested improvement to AI compatibility.
 */
export interface AICompatibilityRecommendation {
    /** Priority ranking; 1 is the highest priority. */
    priority: number;
    /** The breakdown component this recommendation targets. */
    category: keyof AICompatibilityScore['breakdown'];
    /** Brief headline for the recommendation. */
    title: string;
    /** Longer explanation of the recommendation. */
    description: string;
    /** Names of the tools affected, when applicable. */
    affectedTools?: Array<string>;
    /** Score improvement that could be gained by addressing this. */
    potentialImprovement: number;
}
|
|
63
|
+
/**
 * AI compatibility result for one tool.
 */
export interface ToolAIScore {
    /** Name of the tool. */
    toolName: string;
    /** The tool's overall score, on a 0-100 scale. */
    score: number;
    /** Problems identified for this tool. */
    issues: Array<string>;
}
|
|
74
|
+
/**
 * Scoring input for one tool: the MCP tool definition plus optional
 * baseline-derived data.
 */
export interface AICompatibilityInput {
    /** The tool definition as provided by MCP. */
    tool: MCPTool;
    /** Baseline fingerprint for the tool, which may carry additional data. */
    fingerprint?: ToolFingerprint;
    /** Error patterns that were observed for this tool. */
    errorPatterns?: Array<ErrorPattern>;
    /** Data describing how the tool's response schema has evolved. */
    schemaEvolution?: ResponseSchemaEvolution;
}
|
|
87
|
+
/**
 * Compute the AI compatibility score for a collection of tools.
 *
 * @param inputs - Scoring inputs for the tools being evaluated.
 * @returns The AI compatibility score for those inputs.
 */
export declare function calculateAICompatibilityScore(inputs: Array<AICompatibilityInput>): AICompatibilityScore;
|
|
91
|
+
/**
 * Produce markdown documentation for an AI compatibility score.
 *
 * @param score - The score to document.
 * @returns The generated markdown as a string.
 */
export declare function generateAICompatibilityMarkdown(score: AICompatibilityScore): string;
|
|
95
|
+
//# sourceMappingURL=ai-compatibility-scorer.d.ts.map
|