@dotsetlabs/bellwether 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +291 -0
- package/LICENSE +21 -0
- package/README.md +739 -0
- package/dist/auth/credentials.d.ts +64 -0
- package/dist/auth/credentials.js +218 -0
- package/dist/auth/index.d.ts +6 -0
- package/dist/auth/index.js +6 -0
- package/dist/auth/keychain.d.ts +64 -0
- package/dist/auth/keychain.js +268 -0
- package/dist/baseline/ab-testing.d.ts +80 -0
- package/dist/baseline/ab-testing.js +236 -0
- package/dist/baseline/ai-compatibility-scorer.d.ts +95 -0
- package/dist/baseline/ai-compatibility-scorer.js +606 -0
- package/dist/baseline/calibration.d.ts +77 -0
- package/dist/baseline/calibration.js +136 -0
- package/dist/baseline/category-matching.d.ts +85 -0
- package/dist/baseline/category-matching.js +289 -0
- package/dist/baseline/change-impact-analyzer.d.ts +98 -0
- package/dist/baseline/change-impact-analyzer.js +592 -0
- package/dist/baseline/comparator.d.ts +64 -0
- package/dist/baseline/comparator.js +916 -0
- package/dist/baseline/confidence.d.ts +55 -0
- package/dist/baseline/confidence.js +122 -0
- package/dist/baseline/converter.d.ts +61 -0
- package/dist/baseline/converter.js +585 -0
- package/dist/baseline/dependency-analyzer.d.ts +89 -0
- package/dist/baseline/dependency-analyzer.js +567 -0
- package/dist/baseline/deprecation-tracker.d.ts +133 -0
- package/dist/baseline/deprecation-tracker.js +322 -0
- package/dist/baseline/diff.d.ts +55 -0
- package/dist/baseline/diff.js +1584 -0
- package/dist/baseline/documentation-scorer.d.ts +205 -0
- package/dist/baseline/documentation-scorer.js +466 -0
- package/dist/baseline/embeddings.d.ts +118 -0
- package/dist/baseline/embeddings.js +251 -0
- package/dist/baseline/error-analyzer.d.ts +198 -0
- package/dist/baseline/error-analyzer.js +721 -0
- package/dist/baseline/evaluation/evaluator.d.ts +42 -0
- package/dist/baseline/evaluation/evaluator.js +323 -0
- package/dist/baseline/evaluation/expanded-dataset.d.ts +45 -0
- package/dist/baseline/evaluation/expanded-dataset.js +1164 -0
- package/dist/baseline/evaluation/golden-dataset.d.ts +58 -0
- package/dist/baseline/evaluation/golden-dataset.js +717 -0
- package/dist/baseline/evaluation/index.d.ts +15 -0
- package/dist/baseline/evaluation/index.js +15 -0
- package/dist/baseline/evaluation/types.d.ts +186 -0
- package/dist/baseline/evaluation/types.js +8 -0
- package/dist/baseline/external-dependency-detector.d.ts +181 -0
- package/dist/baseline/external-dependency-detector.js +524 -0
- package/dist/baseline/golden-output.d.ts +162 -0
- package/dist/baseline/golden-output.js +636 -0
- package/dist/baseline/health-scorer.d.ts +174 -0
- package/dist/baseline/health-scorer.js +451 -0
- package/dist/baseline/incremental-checker.d.ts +97 -0
- package/dist/baseline/incremental-checker.js +174 -0
- package/dist/baseline/index.d.ts +31 -0
- package/dist/baseline/index.js +42 -0
- package/dist/baseline/migration-generator.d.ts +137 -0
- package/dist/baseline/migration-generator.js +554 -0
- package/dist/baseline/migrations.d.ts +60 -0
- package/dist/baseline/migrations.js +197 -0
- package/dist/baseline/performance-tracker.d.ts +214 -0
- package/dist/baseline/performance-tracker.js +577 -0
- package/dist/baseline/pr-comment-generator.d.ts +117 -0
- package/dist/baseline/pr-comment-generator.js +546 -0
- package/dist/baseline/response-fingerprint.d.ts +127 -0
- package/dist/baseline/response-fingerprint.js +728 -0
- package/dist/baseline/response-schema-tracker.d.ts +129 -0
- package/dist/baseline/response-schema-tracker.js +420 -0
- package/dist/baseline/risk-scorer.d.ts +54 -0
- package/dist/baseline/risk-scorer.js +434 -0
- package/dist/baseline/saver.d.ts +89 -0
- package/dist/baseline/saver.js +554 -0
- package/dist/baseline/scenario-generator.d.ts +151 -0
- package/dist/baseline/scenario-generator.js +905 -0
- package/dist/baseline/schema-compare.d.ts +86 -0
- package/dist/baseline/schema-compare.js +557 -0
- package/dist/baseline/schema-evolution.d.ts +189 -0
- package/dist/baseline/schema-evolution.js +467 -0
- package/dist/baseline/semantic.d.ts +203 -0
- package/dist/baseline/semantic.js +908 -0
- package/dist/baseline/synonyms.d.ts +60 -0
- package/dist/baseline/synonyms.js +386 -0
- package/dist/baseline/telemetry.d.ts +165 -0
- package/dist/baseline/telemetry.js +294 -0
- package/dist/baseline/test-pruner.d.ts +120 -0
- package/dist/baseline/test-pruner.js +387 -0
- package/dist/baseline/types.d.ts +449 -0
- package/dist/baseline/types.js +5 -0
- package/dist/baseline/version.d.ts +138 -0
- package/dist/baseline/version.js +206 -0
- package/dist/cache/index.d.ts +5 -0
- package/dist/cache/index.js +5 -0
- package/dist/cache/response-cache.d.ts +151 -0
- package/dist/cache/response-cache.js +287 -0
- package/dist/ci/index.d.ts +60 -0
- package/dist/ci/index.js +342 -0
- package/dist/cli/commands/auth.d.ts +12 -0
- package/dist/cli/commands/auth.js +352 -0
- package/dist/cli/commands/badge.d.ts +3 -0
- package/dist/cli/commands/badge.js +74 -0
- package/dist/cli/commands/baseline-accept.d.ts +15 -0
- package/dist/cli/commands/baseline-accept.js +178 -0
- package/dist/cli/commands/baseline-migrate.d.ts +12 -0
- package/dist/cli/commands/baseline-migrate.js +164 -0
- package/dist/cli/commands/baseline.d.ts +14 -0
- package/dist/cli/commands/baseline.js +449 -0
- package/dist/cli/commands/beta.d.ts +10 -0
- package/dist/cli/commands/beta.js +231 -0
- package/dist/cli/commands/check.d.ts +11 -0
- package/dist/cli/commands/check.js +820 -0
- package/dist/cli/commands/cloud/badge.d.ts +3 -0
- package/dist/cli/commands/cloud/badge.js +74 -0
- package/dist/cli/commands/cloud/diff.d.ts +6 -0
- package/dist/cli/commands/cloud/diff.js +79 -0
- package/dist/cli/commands/cloud/history.d.ts +6 -0
- package/dist/cli/commands/cloud/history.js +102 -0
- package/dist/cli/commands/cloud/link.d.ts +9 -0
- package/dist/cli/commands/cloud/link.js +119 -0
- package/dist/cli/commands/cloud/login.d.ts +7 -0
- package/dist/cli/commands/cloud/login.js +499 -0
- package/dist/cli/commands/cloud/projects.d.ts +6 -0
- package/dist/cli/commands/cloud/projects.js +44 -0
- package/dist/cli/commands/cloud/shared.d.ts +7 -0
- package/dist/cli/commands/cloud/shared.js +42 -0
- package/dist/cli/commands/cloud/teams.d.ts +8 -0
- package/dist/cli/commands/cloud/teams.js +169 -0
- package/dist/cli/commands/cloud/upload.d.ts +8 -0
- package/dist/cli/commands/cloud/upload.js +181 -0
- package/dist/cli/commands/contract.d.ts +11 -0
- package/dist/cli/commands/contract.js +280 -0
- package/dist/cli/commands/discover.d.ts +3 -0
- package/dist/cli/commands/discover.js +82 -0
- package/dist/cli/commands/eval.d.ts +9 -0
- package/dist/cli/commands/eval.js +187 -0
- package/dist/cli/commands/explore.d.ts +11 -0
- package/dist/cli/commands/explore.js +437 -0
- package/dist/cli/commands/feedback.d.ts +9 -0
- package/dist/cli/commands/feedback.js +174 -0
- package/dist/cli/commands/golden.d.ts +12 -0
- package/dist/cli/commands/golden.js +407 -0
- package/dist/cli/commands/history.d.ts +10 -0
- package/dist/cli/commands/history.js +202 -0
- package/dist/cli/commands/init.d.ts +9 -0
- package/dist/cli/commands/init.js +219 -0
- package/dist/cli/commands/interview.d.ts +3 -0
- package/dist/cli/commands/interview.js +903 -0
- package/dist/cli/commands/link.d.ts +10 -0
- package/dist/cli/commands/link.js +169 -0
- package/dist/cli/commands/login.d.ts +7 -0
- package/dist/cli/commands/login.js +499 -0
- package/dist/cli/commands/preset.d.ts +33 -0
- package/dist/cli/commands/preset.js +297 -0
- package/dist/cli/commands/profile.d.ts +33 -0
- package/dist/cli/commands/profile.js +286 -0
- package/dist/cli/commands/registry.d.ts +11 -0
- package/dist/cli/commands/registry.js +146 -0
- package/dist/cli/commands/shared.d.ts +79 -0
- package/dist/cli/commands/shared.js +196 -0
- package/dist/cli/commands/teams.d.ts +8 -0
- package/dist/cli/commands/teams.js +169 -0
- package/dist/cli/commands/test.d.ts +9 -0
- package/dist/cli/commands/test.js +500 -0
- package/dist/cli/commands/upload.d.ts +8 -0
- package/dist/cli/commands/upload.js +223 -0
- package/dist/cli/commands/validate-config.d.ts +6 -0
- package/dist/cli/commands/validate-config.js +35 -0
- package/dist/cli/commands/verify.d.ts +11 -0
- package/dist/cli/commands/verify.js +283 -0
- package/dist/cli/commands/watch.d.ts +12 -0
- package/dist/cli/commands/watch.js +253 -0
- package/dist/cli/index.d.ts +3 -0
- package/dist/cli/index.js +178 -0
- package/dist/cli/interactive.d.ts +47 -0
- package/dist/cli/interactive.js +216 -0
- package/dist/cli/output/terminal-reporter.d.ts +19 -0
- package/dist/cli/output/terminal-reporter.js +104 -0
- package/dist/cli/output.d.ts +226 -0
- package/dist/cli/output.js +438 -0
- package/dist/cli/utils/env.d.ts +5 -0
- package/dist/cli/utils/env.js +14 -0
- package/dist/cli/utils/progress.d.ts +59 -0
- package/dist/cli/utils/progress.js +206 -0
- package/dist/cli/utils/server-context.d.ts +10 -0
- package/dist/cli/utils/server-context.js +36 -0
- package/dist/cloud/auth.d.ts +144 -0
- package/dist/cloud/auth.js +374 -0
- package/dist/cloud/client.d.ts +24 -0
- package/dist/cloud/client.js +65 -0
- package/dist/cloud/http-client.d.ts +38 -0
- package/dist/cloud/http-client.js +215 -0
- package/dist/cloud/index.d.ts +23 -0
- package/dist/cloud/index.js +25 -0
- package/dist/cloud/mock-client.d.ts +107 -0
- package/dist/cloud/mock-client.js +545 -0
- package/dist/cloud/types.d.ts +515 -0
- package/dist/cloud/types.js +15 -0
- package/dist/config/defaults.d.ts +160 -0
- package/dist/config/defaults.js +169 -0
- package/dist/config/loader.d.ts +24 -0
- package/dist/config/loader.js +122 -0
- package/dist/config/template.d.ts +42 -0
- package/dist/config/template.js +647 -0
- package/dist/config/validator.d.ts +2112 -0
- package/dist/config/validator.js +658 -0
- package/dist/constants/cloud.d.ts +107 -0
- package/dist/constants/cloud.js +110 -0
- package/dist/constants/core.d.ts +521 -0
- package/dist/constants/core.js +556 -0
- package/dist/constants/testing.d.ts +1283 -0
- package/dist/constants/testing.js +1568 -0
- package/dist/constants.d.ts +10 -0
- package/dist/constants.js +10 -0
- package/dist/contract/index.d.ts +6 -0
- package/dist/contract/index.js +5 -0
- package/dist/contract/validator.d.ts +177 -0
- package/dist/contract/validator.js +574 -0
- package/dist/cost/index.d.ts +6 -0
- package/dist/cost/index.js +5 -0
- package/dist/cost/tracker.d.ts +134 -0
- package/dist/cost/tracker.js +313 -0
- package/dist/discovery/discovery.d.ts +16 -0
- package/dist/discovery/discovery.js +173 -0
- package/dist/discovery/types.d.ts +51 -0
- package/dist/discovery/types.js +2 -0
- package/dist/docs/agents.d.ts +3 -0
- package/dist/docs/agents.js +995 -0
- package/dist/docs/contract.d.ts +51 -0
- package/dist/docs/contract.js +1681 -0
- package/dist/docs/generator.d.ts +4 -0
- package/dist/docs/generator.js +4 -0
- package/dist/docs/html-reporter.d.ts +9 -0
- package/dist/docs/html-reporter.js +757 -0
- package/dist/docs/index.d.ts +10 -0
- package/dist/docs/index.js +11 -0
- package/dist/docs/junit-reporter.d.ts +18 -0
- package/dist/docs/junit-reporter.js +210 -0
- package/dist/docs/report.d.ts +14 -0
- package/dist/docs/report.js +44 -0
- package/dist/docs/sarif-reporter.d.ts +19 -0
- package/dist/docs/sarif-reporter.js +335 -0
- package/dist/docs/shared.d.ts +35 -0
- package/dist/docs/shared.js +162 -0
- package/dist/docs/templates.d.ts +12 -0
- package/dist/docs/templates.js +76 -0
- package/dist/errors/index.d.ts +6 -0
- package/dist/errors/index.js +6 -0
- package/dist/errors/retry.d.ts +92 -0
- package/dist/errors/retry.js +323 -0
- package/dist/errors/types.d.ts +321 -0
- package/dist/errors/types.js +584 -0
- package/dist/index.d.ts +32 -0
- package/dist/index.js +32 -0
- package/dist/interview/dependency-resolver.d.ts +11 -0
- package/dist/interview/dependency-resolver.js +32 -0
- package/dist/interview/interviewer.d.ts +232 -0
- package/dist/interview/interviewer.js +1939 -0
- package/dist/interview/mock-response-generator.d.ts +7 -0
- package/dist/interview/mock-response-generator.js +102 -0
- package/dist/interview/orchestrator.d.ts +237 -0
- package/dist/interview/orchestrator.js +1296 -0
- package/dist/interview/rate-limiter.d.ts +15 -0
- package/dist/interview/rate-limiter.js +55 -0
- package/dist/interview/response-validator.d.ts +10 -0
- package/dist/interview/response-validator.js +132 -0
- package/dist/interview/schema-inferrer.d.ts +8 -0
- package/dist/interview/schema-inferrer.js +71 -0
- package/dist/interview/schema-test-generator.d.ts +71 -0
- package/dist/interview/schema-test-generator.js +834 -0
- package/dist/interview/smart-value-generator.d.ts +155 -0
- package/dist/interview/smart-value-generator.js +554 -0
- package/dist/interview/stateful-test-runner.d.ts +19 -0
- package/dist/interview/stateful-test-runner.js +106 -0
- package/dist/interview/types.d.ts +561 -0
- package/dist/interview/types.js +2 -0
- package/dist/llm/anthropic.d.ts +41 -0
- package/dist/llm/anthropic.js +355 -0
- package/dist/llm/client.d.ts +123 -0
- package/dist/llm/client.js +42 -0
- package/dist/llm/factory.d.ts +38 -0
- package/dist/llm/factory.js +145 -0
- package/dist/llm/fallback.d.ts +140 -0
- package/dist/llm/fallback.js +379 -0
- package/dist/llm/index.d.ts +18 -0
- package/dist/llm/index.js +15 -0
- package/dist/llm/ollama.d.ts +37 -0
- package/dist/llm/ollama.js +330 -0
- package/dist/llm/openai.d.ts +25 -0
- package/dist/llm/openai.js +320 -0
- package/dist/llm/token-budget.d.ts +161 -0
- package/dist/llm/token-budget.js +395 -0
- package/dist/logging/logger.d.ts +70 -0
- package/dist/logging/logger.js +130 -0
- package/dist/metrics/collector.d.ts +106 -0
- package/dist/metrics/collector.js +547 -0
- package/dist/metrics/index.d.ts +7 -0
- package/dist/metrics/index.js +7 -0
- package/dist/metrics/prometheus.d.ts +20 -0
- package/dist/metrics/prometheus.js +241 -0
- package/dist/metrics/types.d.ts +209 -0
- package/dist/metrics/types.js +5 -0
- package/dist/persona/builtins.d.ts +54 -0
- package/dist/persona/builtins.js +219 -0
- package/dist/persona/index.d.ts +8 -0
- package/dist/persona/index.js +8 -0
- package/dist/persona/loader.d.ts +30 -0
- package/dist/persona/loader.js +190 -0
- package/dist/persona/types.d.ts +144 -0
- package/dist/persona/types.js +5 -0
- package/dist/persona/validation.d.ts +94 -0
- package/dist/persona/validation.js +332 -0
- package/dist/prompts/index.d.ts +5 -0
- package/dist/prompts/index.js +5 -0
- package/dist/prompts/templates.d.ts +180 -0
- package/dist/prompts/templates.js +431 -0
- package/dist/registry/client.d.ts +49 -0
- package/dist/registry/client.js +191 -0
- package/dist/registry/index.d.ts +7 -0
- package/dist/registry/index.js +6 -0
- package/dist/registry/types.d.ts +140 -0
- package/dist/registry/types.js +6 -0
- package/dist/scenarios/evaluator.d.ts +43 -0
- package/dist/scenarios/evaluator.js +206 -0
- package/dist/scenarios/index.d.ts +10 -0
- package/dist/scenarios/index.js +9 -0
- package/dist/scenarios/loader.d.ts +20 -0
- package/dist/scenarios/loader.js +285 -0
- package/dist/scenarios/types.d.ts +153 -0
- package/dist/scenarios/types.js +8 -0
- package/dist/security/index.d.ts +17 -0
- package/dist/security/index.js +18 -0
- package/dist/security/payloads.d.ts +61 -0
- package/dist/security/payloads.js +268 -0
- package/dist/security/security-tester.d.ts +42 -0
- package/dist/security/security-tester.js +582 -0
- package/dist/security/types.d.ts +166 -0
- package/dist/security/types.js +8 -0
- package/dist/transport/base-transport.d.ts +59 -0
- package/dist/transport/base-transport.js +38 -0
- package/dist/transport/http-transport.d.ts +67 -0
- package/dist/transport/http-transport.js +238 -0
- package/dist/transport/mcp-client.d.ts +141 -0
- package/dist/transport/mcp-client.js +496 -0
- package/dist/transport/sse-transport.d.ts +88 -0
- package/dist/transport/sse-transport.js +316 -0
- package/dist/transport/stdio-transport.d.ts +43 -0
- package/dist/transport/stdio-transport.js +238 -0
- package/dist/transport/types.d.ts +125 -0
- package/dist/transport/types.js +16 -0
- package/dist/utils/concurrency.d.ts +123 -0
- package/dist/utils/concurrency.js +213 -0
- package/dist/utils/formatters.d.ts +16 -0
- package/dist/utils/formatters.js +37 -0
- package/dist/utils/index.d.ts +8 -0
- package/dist/utils/index.js +8 -0
- package/dist/utils/jsonpath.d.ts +87 -0
- package/dist/utils/jsonpath.js +326 -0
- package/dist/utils/markdown.d.ts +113 -0
- package/dist/utils/markdown.js +265 -0
- package/dist/utils/network.d.ts +14 -0
- package/dist/utils/network.js +17 -0
- package/dist/utils/sanitize.d.ts +92 -0
- package/dist/utils/sanitize.js +191 -0
- package/dist/utils/semantic.d.ts +194 -0
- package/dist/utils/semantic.js +1051 -0
- package/dist/utils/smart-truncate.d.ts +94 -0
- package/dist/utils/smart-truncate.js +361 -0
- package/dist/utils/timeout.d.ts +153 -0
- package/dist/utils/timeout.js +205 -0
- package/dist/utils/yaml-parser.d.ts +58 -0
- package/dist/utils/yaml-parser.js +86 -0
- package/dist/validation/index.d.ts +32 -0
- package/dist/validation/index.js +32 -0
- package/dist/validation/semantic-test-generator.d.ts +50 -0
- package/dist/validation/semantic-test-generator.js +176 -0
- package/dist/validation/semantic-types.d.ts +66 -0
- package/dist/validation/semantic-types.js +94 -0
- package/dist/validation/semantic-validator.d.ts +38 -0
- package/dist/validation/semantic-validator.js +340 -0
- package/dist/verification/index.d.ts +6 -0
- package/dist/verification/index.js +5 -0
- package/dist/verification/types.d.ts +133 -0
- package/dist/verification/types.js +5 -0
- package/dist/verification/verifier.d.ts +30 -0
- package/dist/verification/verifier.js +309 -0
- package/dist/version.d.ts +19 -0
- package/dist/version.js +48 -0
- package/dist/workflow/auto-generator.d.ts +27 -0
- package/dist/workflow/auto-generator.js +513 -0
- package/dist/workflow/discovery.d.ts +40 -0
- package/dist/workflow/discovery.js +195 -0
- package/dist/workflow/executor.d.ts +82 -0
- package/dist/workflow/executor.js +611 -0
- package/dist/workflow/index.d.ts +10 -0
- package/dist/workflow/index.js +10 -0
- package/dist/workflow/loader.d.ts +24 -0
- package/dist/workflow/loader.js +194 -0
- package/dist/workflow/state-tracker.d.ts +98 -0
- package/dist/workflow/state-tracker.js +424 -0
- package/dist/workflow/types.d.ts +337 -0
- package/dist/workflow/types.js +5 -0
- package/package.json +94 -0
- package/schemas/bellwether-check.schema.json +651 -0
|
@@ -0,0 +1,717 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Golden Dataset for Drift Detection Evaluation
|
|
3
|
+
*
|
|
4
|
+
* This dataset contains labeled test cases for evaluating the accuracy
|
|
5
|
+
* of semantic comparison in drift detection. Each case specifies whether
|
|
6
|
+
* two texts should be considered semantically equivalent.
|
|
7
|
+
*
|
|
8
|
+
* Categories:
|
|
9
|
+
* - TRUE POSITIVES: Different phrasing, same meaning (should match)
|
|
10
|
+
* - TRUE NEGATIVES: Different meaning (should not match)
|
|
11
|
+
* - EDGE CASES: Boundary conditions and special scenarios
|
|
12
|
+
*
|
|
13
|
+
* To add new test cases:
|
|
14
|
+
* 1. Add to appropriate section below
|
|
15
|
+
* 2. Run `bellwether eval` to verify accuracy
|
|
16
|
+
* 3. If a test fails unexpectedly, either fix the algorithm or adjust the test case
|
|
17
|
+
*/
|
|
18
|
+
import { EXPANDED_TEST_CASES, getExpandedDatasetStatistics } from './expanded-dataset.js';
|
|
19
|
+
/**
|
|
20
|
+
* Dataset version history:
|
|
21
|
+
* - 1.0.0: Initial 50 test cases
|
|
22
|
+
* - 2.0.0: Phase 3 expansion with 150+ additional cases
|
|
23
|
+
*/
|
|
24
|
+
// Current dataset version string; matches the newest entry (2.0.0) in the
// version-history comment that precedes this declaration.
export const DATASET_VERSION = '2.0.0';
|
|
25
|
+
// ============================================================================
|
|
26
|
+
// SECURITY FINDINGS - TRUE POSITIVES (should match)
|
|
27
|
+
// Same vulnerability, different phrasing
|
|
28
|
+
// ============================================================================
|
|
29
|
+
/**
 * Labeled security-finding pairs that describe the SAME vulnerability in
 * different words. Every case has `category: 'security'` and
 * `expectedMatch: true`.
 *
 * Fields present on each case:
 * - `id`: unique case identifier (`sec-tp-NNN`); numbering jumps to the next
 *   ten at each vulnerability sub-topic.
 * - `text1` / `text2`: the two finding descriptions being compared.
 * - `toolName`: name of the tool the finding is attributed to.
 * - `expectedConfidence` (optional): `{ min, max }` bounds; only some cases
 *   define it. NOTE(review): presumably a 0-100 confidence score — confirm
 *   against the evaluator.
 * - `reasoning`: human-readable justification for the label.
 * - `source`: provenance of the case (all are `'manual'` here).
 * - `tags`: free-form labels for the case.
 */
const SECURITY_TRUE_POSITIVES = [
  // Path Traversal Variations
  {
    id: 'sec-tp-001',
    category: 'security',
    text1: 'Path traversal vulnerability allows reading files outside base directory',
    text2: 'The tool is vulnerable to directory traversal attacks via ../ sequences',
    toolName: 'read_file',
    expectedMatch: true,
    expectedConfidence: { min: 80, max: 100 },
    reasoning: 'Both describe path_traversal category, same vulnerability type',
    source: 'manual',
    tags: ['path_traversal', 'paraphrase'],
  },
  {
    id: 'sec-tp-002',
    category: 'security',
    text1: 'Local file inclusion vulnerability through path manipulation',
    text2: 'Arbitrary file read via ../ path traversal',
    toolName: 'read_file',
    expectedMatch: true,
    expectedConfidence: { min: 75, max: 100 },
    reasoning: 'LFI and path traversal are the same category',
    source: 'manual',
    tags: ['path_traversal', 'lfi'],
  },
  {
    id: 'sec-tp-003',
    category: 'security',
    text1: 'Users can access files outside the intended directory using relative paths',
    text2: 'The read_file tool allows escaping the base directory',
    toolName: 'read_file',
    expectedMatch: true,
    reasoning: 'Both describe the same directory escape vulnerability',
    source: 'manual',
    tags: ['path_traversal'],
  },
  // SQL Injection Variations
  {
    id: 'sec-tp-010',
    category: 'security',
    text1: 'SQL injection allows unauthorized database access',
    text2: 'The query parameter is vulnerable to SQL injection attacks',
    toolName: 'search_db',
    expectedMatch: true,
    expectedConfidence: { min: 85, max: 100 },
    reasoning: 'Both describe sql_injection category',
    source: 'manual',
    tags: ['sql_injection', 'paraphrase'],
  },
  {
    id: 'sec-tp-011',
    category: 'security',
    text1: 'User input is not properly sanitized before database queries',
    text2: 'Malicious SQL can be injected through the search field',
    toolName: 'search_db',
    expectedMatch: true,
    reasoning: 'Both indicate SQL injection vulnerability',
    source: 'manual',
    tags: ['sql_injection'],
  },
  {
    id: 'sec-tp-012',
    category: 'security',
    text1: 'Database queries are constructed using unsanitized input',
    text2: 'SQLi vulnerability in query construction',
    toolName: 'search_db',
    expectedMatch: true,
    reasoning: 'SQLi abbreviation maps to sql_injection',
    source: 'manual',
    tags: ['sql_injection', 'abbreviation'],
  },
  // XSS Variations
  {
    id: 'sec-tp-020',
    category: 'security',
    text1: 'Cross-site scripting vulnerability in output rendering',
    text2: 'XSS vulnerability allows script injection',
    toolName: 'render_html',
    expectedMatch: true,
    expectedConfidence: { min: 85, max: 100 },
    reasoning: 'XSS abbreviation and full name are equivalent',
    source: 'manual',
    tags: ['xss', 'abbreviation'],
  },
  {
    id: 'sec-tp-021',
    category: 'security',
    text1: 'User input is reflected in HTML without proper encoding',
    text2: 'Reflected XSS through unsanitized output',
    toolName: 'render_html',
    expectedMatch: true,
    reasoning: 'Both describe reflected XSS vulnerability',
    source: 'manual',
    tags: ['xss', 'reflected'],
  },
  // Command Injection Variations
  {
    id: 'sec-tp-030',
    category: 'security',
    text1: 'Command injection vulnerability allows arbitrary code execution',
    text2: 'Shell injection through unsanitized input to exec()',
    toolName: 'run_command',
    expectedMatch: true,
    expectedConfidence: { min: 80, max: 100 },
    reasoning: 'Command and shell injection are same category',
    source: 'manual',
    tags: ['command_injection'],
  },
  {
    id: 'sec-tp-031',
    category: 'security',
    text1: 'OS command injection via user-controlled input',
    text2: 'The system() call uses unsanitized user input',
    toolName: 'run_command',
    expectedMatch: true,
    reasoning: 'Both describe command injection vulnerability',
    source: 'manual',
    tags: ['command_injection'],
  },
  // Authentication Variations
  {
    id: 'sec-tp-040',
    category: 'security',
    text1: 'Authentication bypass allows unauthenticated access',
    text2: 'Auth can be bypassed using specific request headers',
    toolName: 'admin_api',
    expectedMatch: true,
    reasoning: 'Both describe authentication bypass',
    source: 'manual',
    tags: ['authentication'],
  },
  // SSRF Variations
  {
    id: 'sec-tp-050',
    category: 'security',
    text1: 'Server-side request forgery allows accessing internal services',
    text2: 'SSRF vulnerability enables requests to internal network',
    toolName: 'fetch_url',
    expectedMatch: true,
    reasoning: 'SSRF and full name are equivalent',
    source: 'manual',
    tags: ['ssrf', 'abbreviation'],
  },
];
|
|
174
|
+
// ============================================================================
|
|
175
|
+
// SECURITY FINDINGS - TRUE NEGATIVES (should NOT match)
|
|
176
|
+
// Different vulnerability types
|
|
177
|
+
// ============================================================================
|
|
178
|
+
/**
 * Labeled security-finding pairs that describe DIFFERENT vulnerability
 * types. Every case has `category: 'security'`, `expectedMatch: false`,
 * `toolName: 'test_tool'` and `source: 'manual'`; none defines
 * `expectedConfidence`.
 *
 * `text1` / `text2` are the two finding descriptions being compared,
 * `reasoning` records why they must not match, and `tags` labels the kind
 * of difference (`cross_category`, plus `severity_diff` where the stated
 * severities differ as well).
 */
const SECURITY_TRUE_NEGATIVES = [
  {
    id: 'sec-tn-001',
    category: 'security',
    text1: 'Path traversal vulnerability allows reading arbitrary files',
    text2: 'SQL injection allows unauthorized database access',
    toolName: 'test_tool',
    expectedMatch: false,
    reasoning: 'Different categories: path_traversal vs sql_injection',
    source: 'manual',
    tags: ['cross_category'],
  },
  {
    id: 'sec-tn-002',
    category: 'security',
    text1: 'Cross-site scripting vulnerability',
    text2: 'Command injection vulnerability',
    toolName: 'test_tool',
    expectedMatch: false,
    reasoning: 'Different categories: xss vs command_injection',
    source: 'manual',
    tags: ['cross_category'],
  },
  {
    id: 'sec-tn-003',
    category: 'security',
    text1: 'Authentication bypass allows unauthenticated access',
    text2: 'Authorization flaw allows accessing other users data',
    toolName: 'test_tool',
    expectedMatch: false,
    reasoning: 'Different categories: authentication vs authorization',
    source: 'manual',
    tags: ['cross_category'],
  },
  {
    id: 'sec-tn-004',
    category: 'security',
    text1: 'SSRF allows accessing internal services',
    text2: 'Open redirect allows phishing attacks',
    toolName: 'test_tool',
    expectedMatch: false,
    reasoning: 'Different categories: ssrf vs open_redirect',
    source: 'manual',
    tags: ['cross_category'],
  },
  {
    id: 'sec-tn-005',
    category: 'security',
    text1: 'High severity path traversal vulnerability',
    text2: 'Low severity information disclosure',
    toolName: 'test_tool',
    expectedMatch: false,
    reasoning: 'Different categories and severity levels',
    source: 'manual',
    tags: ['cross_category', 'severity_diff'],
  },
  {
    id: 'sec-tn-006',
    category: 'security',
    text1: 'XXE vulnerability allows reading local files',
    text2: 'Deserialization vulnerability allows code execution',
    toolName: 'test_tool',
    expectedMatch: false,
    reasoning: 'Different categories: xxe vs deserialization',
    source: 'manual',
    tags: ['cross_category'],
  },
];
|
|
246
|
+
// ============================================================================
|
|
247
|
+
// LIMITATION FINDINGS - TRUE POSITIVES (should match)
|
|
248
|
+
// Same limitation, different phrasing
|
|
249
|
+
// ============================================================================
|
|
250
|
+
/**
 * Labeled limitation-finding pairs that state the SAME limitation in
 * different words or units. Every case has `category: 'limitation'`,
 * `expectedMatch: true` and `source: 'manual'`.
 *
 * Cases are grouped by kind of limitation (size limit, rate limit, timeout,
 * format), with ids jumping to the next ten at each group. Cases tagged
 * `unit_conversion` express the same quantity in different units
 * (e.g. 5 minutes vs 300 seconds).
 *
 * `expectedConfidence` (`{ min, max }`) is optional and only set on some
 * cases. NOTE(review): presumably a 0-100 confidence score — confirm
 * against the evaluator.
 */
const LIMITATION_TRUE_POSITIVES = [
  // Size Limit Variations
  {
    id: 'lim-tp-001',
    category: 'limitation',
    text1: 'Maximum file size is 10MB',
    text2: 'Files larger than 10 megabytes will be rejected',
    toolName: 'upload_file',
    expectedMatch: true,
    expectedConfidence: { min: 85, max: 100 },
    reasoning: 'Same size limit, different phrasing (10MB = 10 megabytes)',
    source: 'manual',
    tags: ['size_limit', 'unit_conversion'],
  },
  {
    id: 'lim-tp-002',
    category: 'limitation',
    text1: 'File size limit: 10 MB',
    text2: 'Cannot process files exceeding 10MB',
    toolName: 'upload_file',
    expectedMatch: true,
    reasoning: 'Same 10MB limit expressed differently',
    source: 'manual',
    tags: ['size_limit'],
  },
  {
    id: 'lim-tp-003',
    category: 'limitation',
    text1: 'Maximum upload size is 1GB',
    text2: 'Uploads are limited to 1 gigabyte',
    toolName: 'upload_file',
    expectedMatch: true,
    reasoning: 'Same 1GB limit',
    source: 'manual',
    tags: ['size_limit', 'unit_conversion'],
  },
  // Rate Limit Variations
  {
    id: 'lim-tp-010',
    category: 'limitation',
    text1: 'Rate limited to 100 requests per minute',
    text2: '100 requests/min rate limit applies',
    toolName: 'api_call',
    expectedMatch: true,
    expectedConfidence: { min: 80, max: 100 },
    reasoning: 'Same rate limit with different notation',
    source: 'manual',
    tags: ['rate_limit'],
  },
  {
    id: 'lim-tp-011',
    category: 'limitation',
    text1: 'API is throttled to 60 calls per hour',
    text2: 'Rate limit: 60 requests/hour',
    toolName: 'api_call',
    expectedMatch: true,
    reasoning: 'Same hourly rate limit',
    source: 'manual',
    tags: ['rate_limit'],
  },
  // Timeout Variations
  {
    id: 'lim-tp-020',
    category: 'limitation',
    text1: 'Requests timeout after 30 seconds',
    text2: '30 second timeout for all operations',
    toolName: 'long_operation',
    expectedMatch: true,
    expectedConfidence: { min: 85, max: 100 },
    reasoning: 'Same 30 second timeout',
    source: 'manual',
    tags: ['timeout'],
  },
  {
    id: 'lim-tp-021',
    category: 'limitation',
    text1: 'Operations time out after 5 minutes',
    text2: '300 second timeout applies',
    toolName: 'long_operation',
    expectedMatch: true,
    reasoning: '5 minutes = 300 seconds',
    source: 'manual',
    tags: ['timeout', 'unit_conversion'],
  },
  // Format Variations
  {
    id: 'lim-tp-030',
    category: 'limitation',
    text1: 'Only JSON format is supported',
    text2: 'Input must be valid JSON',
    toolName: 'parse_data',
    expectedMatch: true,
    reasoning: 'Both specify JSON format requirement',
    source: 'manual',
    tags: ['format'],
  },
];
|
|
347
|
+
// ============================================================================
|
|
348
|
+
// LIMITATION FINDINGS - TRUE NEGATIVES (should NOT match)
|
|
349
|
+
// Different limitations
|
|
350
|
+
// ============================================================================
|
|
351
|
+
// Limitation pairs that must NOT be treated as the same finding. Only the
// fields that vary per case are listed; the mapper below fills in the shared
// ones (category, expectedMatch, source) and fixes the property order.
const LIMITATION_TRUE_NEGATIVES = [
    {
        id: 'lim-tn-001',
        text1: 'Maximum file size is 10MB',
        text2: 'Maximum file size is 100MB',
        toolName: 'upload_file',
        reasoning: 'Different size limits: 10MB vs 100MB',
        tags: ['size_limit', 'value_diff'],
    },
    {
        id: 'lim-tn-002',
        text1: 'Rate limited to 100 requests per minute',
        text2: 'Rate limited to 1000 requests per minute',
        toolName: 'api_call',
        reasoning: 'Different rate limits: 100 vs 1000',
        tags: ['rate_limit', 'value_diff'],
    },
    {
        id: 'lim-tn-003',
        text1: 'Maximum file size is 10MB',
        text2: 'Rate limited to 100 requests per minute',
        toolName: 'test_tool',
        reasoning: 'Different categories: size_limit vs rate_limit',
        tags: ['cross_category'],
    },
    {
        id: 'lim-tn-004',
        text1: 'Timeout after 30 seconds',
        text2: 'Maximum file size is 30MB',
        toolName: 'test_tool',
        reasoning: 'Different categories despite similar number',
        tags: ['cross_category', 'confusing_number'],
    },
    {
        id: 'lim-tn-005',
        text1: 'Only JSON format supported',
        text2: 'Only XML format supported',
        toolName: 'parse_data',
        reasoning: 'Different format requirements',
        tags: ['format', 'value_diff'],
    },
].map(({ id, text1, text2, toolName, reasoning, tags }) => ({
    id,
    category: 'limitation',
    text1,
    text2,
    toolName,
    expectedMatch: false,
    reasoning,
    source: 'manual',
    tags,
}));
|
|
408
|
+
// ============================================================================
|
|
409
|
+
// ASSERTION FINDINGS - TRUE POSITIVES (should match)
|
|
410
|
+
// Same behavioral assertion, different phrasing
|
|
411
|
+
// ============================================================================
|
|
412
|
+
// Assertion pairs that describe the same behaviour in different words and
// therefore SHOULD match. Per-case fields only; the mapper adds the shared
// category / expectedMatch / source values and keeps the optional
// expectedConfidence range directly after expectedMatch when one is given.
const ASSERTION_TRUE_POSITIVES = [
    {
        id: 'asrt-tp-001',
        text1: 'Returns error when file does not exist',
        text2: 'The tool throws an error for missing files',
        toolName: 'read_file',
        expectedConfidence: { min: 70, max: 100 },
        reasoning: 'Same error behavior for missing files',
        tags: ['error_handling'],
    },
    {
        id: 'asrt-tp-002',
        text1: 'Handles empty input gracefully',
        text2: 'Empty strings are handled without error',
        toolName: 'process_text',
        reasoning: 'Same empty input handling behavior',
        tags: ['error_handling', 'empty_input'],
    },
    {
        id: 'asrt-tp-003',
        text1: 'Returns JSON object on success',
        text2: 'Successful calls return a JSON response',
        toolName: 'api_call',
        reasoning: 'Same success response format',
        tags: ['response_format'],
    },
    {
        id: 'asrt-tp-004',
        text1: 'Validates input before processing',
        text2: 'Input is validated prior to execution',
        toolName: 'process_data',
        reasoning: 'Same input validation behavior',
        tags: ['input_validation'],
    },
].map(({ id, text1, text2, toolName, expectedConfidence, reasoning, tags }) => ({
    id,
    category: 'assertion',
    text1,
    text2,
    toolName,
    expectedMatch: true,
    // Omit the key entirely (rather than storing undefined) when no range is set.
    ...(expectedConfidence ? { expectedConfidence } : {}),
    reasoning,
    source: 'manual',
    tags,
}));
|
|
459
|
+
// ============================================================================
|
|
460
|
+
// ASSERTION FINDINGS - TRUE NEGATIVES (should NOT match)
|
|
461
|
+
// Different behavioral assertions
|
|
462
|
+
// ============================================================================
|
|
463
|
+
// Assertion pairs that look alike but describe different behaviour, so they
// must NOT match. The mapper supplies the fields common to every record.
const ASSERTION_TRUE_NEGATIVES = [
    {
        id: 'asrt-tn-001',
        text1: 'Returns error when file does not exist',
        text2: 'Returns null when file does not exist',
        toolName: 'read_file',
        reasoning: 'Different behaviors: error vs null',
        tags: ['error_handling', 'behavior_diff'],
    },
    {
        id: 'asrt-tn-002',
        text1: 'Creates file if it does not exist',
        text2: 'Fails if file does not exist',
        toolName: 'write_file',
        reasoning: 'Opposite behaviors for missing files',
        tags: ['error_handling', 'behavior_diff'],
    },
    {
        id: 'asrt-tn-003',
        text1: 'Returns JSON object on success',
        text2: 'Returns plain text on success',
        toolName: 'api_call',
        reasoning: 'Different response formats',
        tags: ['response_format', 'behavior_diff'],
    },
].map(({ id, text1, text2, toolName, reasoning, tags }) => ({
    id,
    category: 'assertion',
    text1,
    text2,
    toolName,
    expectedMatch: false,
    reasoning,
    source: 'manual',
    tags,
}));
|
|
498
|
+
// ============================================================================
|
|
499
|
+
// EDGE CASES
|
|
500
|
+
// Boundary conditions and special scenarios
|
|
501
|
+
// ============================================================================
|
|
502
|
+
// Boundary-condition cases. Each entry lists only what varies; the mapper
// stamps `source: 'manual'` and appends the common 'edge' tag as the last
// tag of every record, so new entries must NOT list 'edge' themselves.
const EDGE_CASES = [
    // Empty and short strings
    {
        id: 'edge-001',
        category: 'security',
        text1: '',
        text2: '',
        toolName: 'test_tool',
        expectedMatch: true,
        reasoning: 'Empty strings should match each other',
        tags: ['empty'],
    },
    {
        id: 'edge-002',
        category: 'security',
        text1: 'XSS',
        text2: 'Cross-site scripting',
        toolName: 'test_tool',
        expectedMatch: true,
        reasoning: 'Very short abbreviation should match full name',
        tags: ['short', 'abbreviation'],
    },
    {
        id: 'edge-003',
        category: 'security',
        text1: 'SQLi',
        text2: 'SQL injection vulnerability detected',
        toolName: 'test_tool',
        expectedMatch: true,
        reasoning: 'Short abbreviation should match longer description',
        tags: ['short', 'abbreviation'],
    },
    // Negation
    {
        id: 'edge-010',
        category: 'security',
        text1: 'Critical vulnerability found',
        text2: 'Not a critical vulnerability',
        toolName: 'test_tool',
        expectedMatch: false,
        reasoning: 'Negation changes meaning',
        tags: ['negation'],
    },
    {
        id: 'edge-011',
        category: 'limitation',
        text1: 'No size limit',
        text2: 'Size limit of 10MB',
        toolName: 'test_tool',
        expectedMatch: false,
        reasoning: 'Negated limitation vs actual limitation',
        tags: ['negation'],
    },
    // Identical text on the same tool
    {
        id: 'edge-020',
        category: 'security',
        text1: 'Path traversal vulnerability',
        text2: 'Path traversal vulnerability',
        toolName: 'read_file',
        expectedMatch: true,
        reasoning: 'Identical text, same tool',
        tags: ['identical'],
    },
    // Case sensitivity
    {
        id: 'edge-030',
        category: 'security',
        text1: 'PATH TRAVERSAL VULNERABILITY',
        text2: 'path traversal vulnerability',
        toolName: 'test_tool',
        expectedMatch: true,
        reasoning: 'Case differences should not matter',
        tags: ['case'],
    },
    // Special characters
    {
        id: 'edge-040',
        category: 'security',
        text1: 'Path traversal via "../" sequences',
        text2: 'Path traversal via ../ sequences',
        toolName: 'test_tool',
        expectedMatch: true,
        reasoning: 'Quoted vs unquoted should match',
        tags: ['special_chars'],
    },
    // Severity mentioned in the text
    {
        id: 'edge-050',
        category: 'security',
        text1: 'High severity SQL injection',
        text2: 'SQL injection (high severity)',
        toolName: 'test_tool',
        expectedMatch: true,
        reasoning: 'Same severity, different format',
        tags: ['severity'],
    },
    {
        id: 'edge-051',
        category: 'security',
        text1: 'High severity SQL injection',
        text2: 'Low severity SQL injection',
        toolName: 'test_tool',
        expectedMatch: false,
        reasoning: 'Different severity levels should not match',
        tags: ['severity'],
    },
    // Several constraints in a single finding
    {
        id: 'edge-060',
        category: 'limitation',
        text1: 'Maximum 100 files, each up to 5MB',
        text2: 'Limit of 100 files, 5MB per file',
        toolName: 'upload_files',
        expectedMatch: true,
        reasoning: 'Same constraints, different phrasing',
        tags: ['constraint', 'multi_constraint'],
    },
    {
        id: 'edge-061',
        category: 'limitation',
        text1: 'Maximum 100 files, each up to 5MB',
        text2: 'Maximum 50 files, each up to 10MB',
        toolName: 'upload_files',
        expectedMatch: false,
        reasoning: 'Different constraint values for both limits',
        tags: ['constraint', 'multi_constraint'],
    },
].map(({ id, category, text1, text2, toolName, expectedMatch, reasoning, tags }) => ({
    id,
    category,
    text1,
    text2,
    toolName,
    expectedMatch,
    reasoning,
    source: 'manual',
    tags: [...tags, 'edge'],
}));
|
|
643
|
+
// ============================================================================
|
|
644
|
+
// COMBINED DATASET
|
|
645
|
+
// Includes original 50 cases + 100+ expanded cases from Phase 3
|
|
646
|
+
// ============================================================================
|
|
647
|
+
/**
 * Core test cases: the security, limitation, assertion and edge-case groups
 * concatenated in that order (true positives before true negatives within
 * each category).
 *
 * NOTE(review): earlier documentation describes this as "50 cases from
 * Phase 1"; the count is not verified here.
 */
const CORE_CASES = [
    SECURITY_TRUE_POSITIVES,
    SECURITY_TRUE_NEGATIVES,
    LIMITATION_TRUE_POSITIVES,
    LIMITATION_TRUE_NEGATIVES,
    ASSERTION_TRUE_POSITIVES,
    ASSERTION_TRUE_NEGATIVES,
    EDGE_CASES,
].flat();
|
|
659
|
+
/**
 * The complete golden dataset: every core case followed by every expanded
 * case, in that order.
 *
 * NOTE(review): described upstream as "150+ labeled test cases"; the actual
 * size depends on EXPANDED_TEST_CASES and is reported by
 * getDatasetStatistics().totalCases.
 */
export const GOLDEN_DATASET = [...CORE_CASES, ...EXPANDED_TEST_CASES];
|
|
667
|
+
// Export categorized for targeted testing
|
|
668
|
+
// Core security positives and negatives, then the expanded cases whose
// category is 'security'.
export const SECURITY_CASES = [
    SECURITY_TRUE_POSITIVES,
    SECURITY_TRUE_NEGATIVES,
    EXPANDED_TEST_CASES.filter(({ category }) => category === 'security'),
].flat();
|
|
673
|
+
// Core limitation positives and negatives, then the expanded cases whose
// category is 'limitation'.
export const LIMITATION_CASES = [
    LIMITATION_TRUE_POSITIVES,
    LIMITATION_TRUE_NEGATIVES,
    EXPANDED_TEST_CASES.filter(({ category }) => category === 'limitation'),
].flat();
|
|
678
|
+
// Core assertion positives and negatives, then the expanded cases whose
// category is 'assertion'.
export const ASSERTION_CASES = [
    ASSERTION_TRUE_POSITIVES,
    ASSERTION_TRUE_NEGATIVES,
    EXPANDED_TEST_CASES.filter(({ category }) => category === 'assertion'),
].flat();
|
|
683
|
+
/**
 * Build a summary of the golden dataset.
 *
 * @returns {object} An object with:
 *   - `version`: the value of DATASET_VERSION
 *   - `totalCases` / `coreCases` / `expandedCases`: lengths of GOLDEN_DATASET,
 *     CORE_CASES and EXPANDED_TEST_CASES
 *   - `truePositives` / `trueNegatives`: number of GOLDEN_DATASET entries with
 *     a truthy / falsy `expectedMatch`
 *   - `byCategory`: lengths of SECURITY_CASES, LIMITATION_CASES,
 *     ASSERTION_CASES and EDGE_CASES
 *   - `byTag`: occurrence count for each tag across GOLDEN_DATASET
 *   - `expanded`: whatever getExpandedDatasetStatistics() returns
 */
export function getDatasetStatistics() {
    let truePositives = 0;
    let trueNegatives = 0;
    const byTag = {};
    // One pass gathers both the match counts and the per-tag tallies.
    for (const { expectedMatch, tags } of GOLDEN_DATASET) {
        if (expectedMatch) {
            truePositives += 1;
        }
        else {
            trueNegatives += 1;
        }
        // Entries without tags contribute nothing to the tag tallies.
        if (!tags) {
            continue;
        }
        for (const tag of tags) {
            byTag[tag] = (byTag[tag] || 0) + 1;
        }
    }
    const expanded = getExpandedDatasetStatistics();
    const byCategory = {
        security: SECURITY_CASES.length,
        limitation: LIMITATION_CASES.length,
        assertion: ASSERTION_CASES.length,
        edge: EDGE_CASES.length,
    };
    return {
        version: DATASET_VERSION,
        totalCases: GOLDEN_DATASET.length,
        coreCases: CORE_CASES.length,
        expandedCases: EXPANDED_TEST_CASES.length,
        truePositives,
        trueNegatives,
        byCategory,
        byTag,
        expanded,
    };
}
|
|
717
|
+
//# sourceMappingURL=golden-dataset.js.map
|