@dotsetlabs/bellwether 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (403)
  1. package/CHANGELOG.md +291 -0
  2. package/LICENSE +21 -0
  3. package/README.md +739 -0
  4. package/dist/auth/credentials.d.ts +64 -0
  5. package/dist/auth/credentials.js +218 -0
  6. package/dist/auth/index.d.ts +6 -0
  7. package/dist/auth/index.js +6 -0
  8. package/dist/auth/keychain.d.ts +64 -0
  9. package/dist/auth/keychain.js +268 -0
  10. package/dist/baseline/ab-testing.d.ts +80 -0
  11. package/dist/baseline/ab-testing.js +236 -0
  12. package/dist/baseline/ai-compatibility-scorer.d.ts +95 -0
  13. package/dist/baseline/ai-compatibility-scorer.js +606 -0
  14. package/dist/baseline/calibration.d.ts +77 -0
  15. package/dist/baseline/calibration.js +136 -0
  16. package/dist/baseline/category-matching.d.ts +85 -0
  17. package/dist/baseline/category-matching.js +289 -0
  18. package/dist/baseline/change-impact-analyzer.d.ts +98 -0
  19. package/dist/baseline/change-impact-analyzer.js +592 -0
  20. package/dist/baseline/comparator.d.ts +64 -0
  21. package/dist/baseline/comparator.js +916 -0
  22. package/dist/baseline/confidence.d.ts +55 -0
  23. package/dist/baseline/confidence.js +122 -0
  24. package/dist/baseline/converter.d.ts +61 -0
  25. package/dist/baseline/converter.js +585 -0
  26. package/dist/baseline/dependency-analyzer.d.ts +89 -0
  27. package/dist/baseline/dependency-analyzer.js +567 -0
  28. package/dist/baseline/deprecation-tracker.d.ts +133 -0
  29. package/dist/baseline/deprecation-tracker.js +322 -0
  30. package/dist/baseline/diff.d.ts +55 -0
  31. package/dist/baseline/diff.js +1584 -0
  32. package/dist/baseline/documentation-scorer.d.ts +205 -0
  33. package/dist/baseline/documentation-scorer.js +466 -0
  34. package/dist/baseline/embeddings.d.ts +118 -0
  35. package/dist/baseline/embeddings.js +251 -0
  36. package/dist/baseline/error-analyzer.d.ts +198 -0
  37. package/dist/baseline/error-analyzer.js +721 -0
  38. package/dist/baseline/evaluation/evaluator.d.ts +42 -0
  39. package/dist/baseline/evaluation/evaluator.js +323 -0
  40. package/dist/baseline/evaluation/expanded-dataset.d.ts +45 -0
  41. package/dist/baseline/evaluation/expanded-dataset.js +1164 -0
  42. package/dist/baseline/evaluation/golden-dataset.d.ts +58 -0
  43. package/dist/baseline/evaluation/golden-dataset.js +717 -0
  44. package/dist/baseline/evaluation/index.d.ts +15 -0
  45. package/dist/baseline/evaluation/index.js +15 -0
  46. package/dist/baseline/evaluation/types.d.ts +186 -0
  47. package/dist/baseline/evaluation/types.js +8 -0
  48. package/dist/baseline/external-dependency-detector.d.ts +181 -0
  49. package/dist/baseline/external-dependency-detector.js +524 -0
  50. package/dist/baseline/golden-output.d.ts +162 -0
  51. package/dist/baseline/golden-output.js +636 -0
  52. package/dist/baseline/health-scorer.d.ts +174 -0
  53. package/dist/baseline/health-scorer.js +451 -0
  54. package/dist/baseline/incremental-checker.d.ts +97 -0
  55. package/dist/baseline/incremental-checker.js +174 -0
  56. package/dist/baseline/index.d.ts +31 -0
  57. package/dist/baseline/index.js +42 -0
  58. package/dist/baseline/migration-generator.d.ts +137 -0
  59. package/dist/baseline/migration-generator.js +554 -0
  60. package/dist/baseline/migrations.d.ts +60 -0
  61. package/dist/baseline/migrations.js +197 -0
  62. package/dist/baseline/performance-tracker.d.ts +214 -0
  63. package/dist/baseline/performance-tracker.js +577 -0
  64. package/dist/baseline/pr-comment-generator.d.ts +117 -0
  65. package/dist/baseline/pr-comment-generator.js +546 -0
  66. package/dist/baseline/response-fingerprint.d.ts +127 -0
  67. package/dist/baseline/response-fingerprint.js +728 -0
  68. package/dist/baseline/response-schema-tracker.d.ts +129 -0
  69. package/dist/baseline/response-schema-tracker.js +420 -0
  70. package/dist/baseline/risk-scorer.d.ts +54 -0
  71. package/dist/baseline/risk-scorer.js +434 -0
  72. package/dist/baseline/saver.d.ts +89 -0
  73. package/dist/baseline/saver.js +554 -0
  74. package/dist/baseline/scenario-generator.d.ts +151 -0
  75. package/dist/baseline/scenario-generator.js +905 -0
  76. package/dist/baseline/schema-compare.d.ts +86 -0
  77. package/dist/baseline/schema-compare.js +557 -0
  78. package/dist/baseline/schema-evolution.d.ts +189 -0
  79. package/dist/baseline/schema-evolution.js +467 -0
  80. package/dist/baseline/semantic.d.ts +203 -0
  81. package/dist/baseline/semantic.js +908 -0
  82. package/dist/baseline/synonyms.d.ts +60 -0
  83. package/dist/baseline/synonyms.js +386 -0
  84. package/dist/baseline/telemetry.d.ts +165 -0
  85. package/dist/baseline/telemetry.js +294 -0
  86. package/dist/baseline/test-pruner.d.ts +120 -0
  87. package/dist/baseline/test-pruner.js +387 -0
  88. package/dist/baseline/types.d.ts +449 -0
  89. package/dist/baseline/types.js +5 -0
  90. package/dist/baseline/version.d.ts +138 -0
  91. package/dist/baseline/version.js +206 -0
  92. package/dist/cache/index.d.ts +5 -0
  93. package/dist/cache/index.js +5 -0
  94. package/dist/cache/response-cache.d.ts +151 -0
  95. package/dist/cache/response-cache.js +287 -0
  96. package/dist/ci/index.d.ts +60 -0
  97. package/dist/ci/index.js +342 -0
  98. package/dist/cli/commands/auth.d.ts +12 -0
  99. package/dist/cli/commands/auth.js +352 -0
  100. package/dist/cli/commands/badge.d.ts +3 -0
  101. package/dist/cli/commands/badge.js +74 -0
  102. package/dist/cli/commands/baseline-accept.d.ts +15 -0
  103. package/dist/cli/commands/baseline-accept.js +178 -0
  104. package/dist/cli/commands/baseline-migrate.d.ts +12 -0
  105. package/dist/cli/commands/baseline-migrate.js +164 -0
  106. package/dist/cli/commands/baseline.d.ts +14 -0
  107. package/dist/cli/commands/baseline.js +449 -0
  108. package/dist/cli/commands/beta.d.ts +10 -0
  109. package/dist/cli/commands/beta.js +231 -0
  110. package/dist/cli/commands/check.d.ts +11 -0
  111. package/dist/cli/commands/check.js +820 -0
  112. package/dist/cli/commands/cloud/badge.d.ts +3 -0
  113. package/dist/cli/commands/cloud/badge.js +74 -0
  114. package/dist/cli/commands/cloud/diff.d.ts +6 -0
  115. package/dist/cli/commands/cloud/diff.js +79 -0
  116. package/dist/cli/commands/cloud/history.d.ts +6 -0
  117. package/dist/cli/commands/cloud/history.js +102 -0
  118. package/dist/cli/commands/cloud/link.d.ts +9 -0
  119. package/dist/cli/commands/cloud/link.js +119 -0
  120. package/dist/cli/commands/cloud/login.d.ts +7 -0
  121. package/dist/cli/commands/cloud/login.js +499 -0
  122. package/dist/cli/commands/cloud/projects.d.ts +6 -0
  123. package/dist/cli/commands/cloud/projects.js +44 -0
  124. package/dist/cli/commands/cloud/shared.d.ts +7 -0
  125. package/dist/cli/commands/cloud/shared.js +42 -0
  126. package/dist/cli/commands/cloud/teams.d.ts +8 -0
  127. package/dist/cli/commands/cloud/teams.js +169 -0
  128. package/dist/cli/commands/cloud/upload.d.ts +8 -0
  129. package/dist/cli/commands/cloud/upload.js +181 -0
  130. package/dist/cli/commands/contract.d.ts +11 -0
  131. package/dist/cli/commands/contract.js +280 -0
  132. package/dist/cli/commands/discover.d.ts +3 -0
  133. package/dist/cli/commands/discover.js +82 -0
  134. package/dist/cli/commands/eval.d.ts +9 -0
  135. package/dist/cli/commands/eval.js +187 -0
  136. package/dist/cli/commands/explore.d.ts +11 -0
  137. package/dist/cli/commands/explore.js +437 -0
  138. package/dist/cli/commands/feedback.d.ts +9 -0
  139. package/dist/cli/commands/feedback.js +174 -0
  140. package/dist/cli/commands/golden.d.ts +12 -0
  141. package/dist/cli/commands/golden.js +407 -0
  142. package/dist/cli/commands/history.d.ts +10 -0
  143. package/dist/cli/commands/history.js +202 -0
  144. package/dist/cli/commands/init.d.ts +9 -0
  145. package/dist/cli/commands/init.js +219 -0
  146. package/dist/cli/commands/interview.d.ts +3 -0
  147. package/dist/cli/commands/interview.js +903 -0
  148. package/dist/cli/commands/link.d.ts +10 -0
  149. package/dist/cli/commands/link.js +169 -0
  150. package/dist/cli/commands/login.d.ts +7 -0
  151. package/dist/cli/commands/login.js +499 -0
  152. package/dist/cli/commands/preset.d.ts +33 -0
  153. package/dist/cli/commands/preset.js +297 -0
  154. package/dist/cli/commands/profile.d.ts +33 -0
  155. package/dist/cli/commands/profile.js +286 -0
  156. package/dist/cli/commands/registry.d.ts +11 -0
  157. package/dist/cli/commands/registry.js +146 -0
  158. package/dist/cli/commands/shared.d.ts +79 -0
  159. package/dist/cli/commands/shared.js +196 -0
  160. package/dist/cli/commands/teams.d.ts +8 -0
  161. package/dist/cli/commands/teams.js +169 -0
  162. package/dist/cli/commands/test.d.ts +9 -0
  163. package/dist/cli/commands/test.js +500 -0
  164. package/dist/cli/commands/upload.d.ts +8 -0
  165. package/dist/cli/commands/upload.js +223 -0
  166. package/dist/cli/commands/validate-config.d.ts +6 -0
  167. package/dist/cli/commands/validate-config.js +35 -0
  168. package/dist/cli/commands/verify.d.ts +11 -0
  169. package/dist/cli/commands/verify.js +283 -0
  170. package/dist/cli/commands/watch.d.ts +12 -0
  171. package/dist/cli/commands/watch.js +253 -0
  172. package/dist/cli/index.d.ts +3 -0
  173. package/dist/cli/index.js +178 -0
  174. package/dist/cli/interactive.d.ts +47 -0
  175. package/dist/cli/interactive.js +216 -0
  176. package/dist/cli/output/terminal-reporter.d.ts +19 -0
  177. package/dist/cli/output/terminal-reporter.js +104 -0
  178. package/dist/cli/output.d.ts +226 -0
  179. package/dist/cli/output.js +438 -0
  180. package/dist/cli/utils/env.d.ts +5 -0
  181. package/dist/cli/utils/env.js +14 -0
  182. package/dist/cli/utils/progress.d.ts +59 -0
  183. package/dist/cli/utils/progress.js +206 -0
  184. package/dist/cli/utils/server-context.d.ts +10 -0
  185. package/dist/cli/utils/server-context.js +36 -0
  186. package/dist/cloud/auth.d.ts +144 -0
  187. package/dist/cloud/auth.js +374 -0
  188. package/dist/cloud/client.d.ts +24 -0
  189. package/dist/cloud/client.js +65 -0
  190. package/dist/cloud/http-client.d.ts +38 -0
  191. package/dist/cloud/http-client.js +215 -0
  192. package/dist/cloud/index.d.ts +23 -0
  193. package/dist/cloud/index.js +25 -0
  194. package/dist/cloud/mock-client.d.ts +107 -0
  195. package/dist/cloud/mock-client.js +545 -0
  196. package/dist/cloud/types.d.ts +515 -0
  197. package/dist/cloud/types.js +15 -0
  198. package/dist/config/defaults.d.ts +160 -0
  199. package/dist/config/defaults.js +169 -0
  200. package/dist/config/loader.d.ts +24 -0
  201. package/dist/config/loader.js +122 -0
  202. package/dist/config/template.d.ts +42 -0
  203. package/dist/config/template.js +647 -0
  204. package/dist/config/validator.d.ts +2112 -0
  205. package/dist/config/validator.js +658 -0
  206. package/dist/constants/cloud.d.ts +107 -0
  207. package/dist/constants/cloud.js +110 -0
  208. package/dist/constants/core.d.ts +521 -0
  209. package/dist/constants/core.js +556 -0
  210. package/dist/constants/testing.d.ts +1283 -0
  211. package/dist/constants/testing.js +1568 -0
  212. package/dist/constants.d.ts +10 -0
  213. package/dist/constants.js +10 -0
  214. package/dist/contract/index.d.ts +6 -0
  215. package/dist/contract/index.js +5 -0
  216. package/dist/contract/validator.d.ts +177 -0
  217. package/dist/contract/validator.js +574 -0
  218. package/dist/cost/index.d.ts +6 -0
  219. package/dist/cost/index.js +5 -0
  220. package/dist/cost/tracker.d.ts +134 -0
  221. package/dist/cost/tracker.js +313 -0
  222. package/dist/discovery/discovery.d.ts +16 -0
  223. package/dist/discovery/discovery.js +173 -0
  224. package/dist/discovery/types.d.ts +51 -0
  225. package/dist/discovery/types.js +2 -0
  226. package/dist/docs/agents.d.ts +3 -0
  227. package/dist/docs/agents.js +995 -0
  228. package/dist/docs/contract.d.ts +51 -0
  229. package/dist/docs/contract.js +1681 -0
  230. package/dist/docs/generator.d.ts +4 -0
  231. package/dist/docs/generator.js +4 -0
  232. package/dist/docs/html-reporter.d.ts +9 -0
  233. package/dist/docs/html-reporter.js +757 -0
  234. package/dist/docs/index.d.ts +10 -0
  235. package/dist/docs/index.js +11 -0
  236. package/dist/docs/junit-reporter.d.ts +18 -0
  237. package/dist/docs/junit-reporter.js +210 -0
  238. package/dist/docs/report.d.ts +14 -0
  239. package/dist/docs/report.js +44 -0
  240. package/dist/docs/sarif-reporter.d.ts +19 -0
  241. package/dist/docs/sarif-reporter.js +335 -0
  242. package/dist/docs/shared.d.ts +35 -0
  243. package/dist/docs/shared.js +162 -0
  244. package/dist/docs/templates.d.ts +12 -0
  245. package/dist/docs/templates.js +76 -0
  246. package/dist/errors/index.d.ts +6 -0
  247. package/dist/errors/index.js +6 -0
  248. package/dist/errors/retry.d.ts +92 -0
  249. package/dist/errors/retry.js +323 -0
  250. package/dist/errors/types.d.ts +321 -0
  251. package/dist/errors/types.js +584 -0
  252. package/dist/index.d.ts +32 -0
  253. package/dist/index.js +32 -0
  254. package/dist/interview/dependency-resolver.d.ts +11 -0
  255. package/dist/interview/dependency-resolver.js +32 -0
  256. package/dist/interview/interviewer.d.ts +232 -0
  257. package/dist/interview/interviewer.js +1939 -0
  258. package/dist/interview/mock-response-generator.d.ts +7 -0
  259. package/dist/interview/mock-response-generator.js +102 -0
  260. package/dist/interview/orchestrator.d.ts +237 -0
  261. package/dist/interview/orchestrator.js +1296 -0
  262. package/dist/interview/rate-limiter.d.ts +15 -0
  263. package/dist/interview/rate-limiter.js +55 -0
  264. package/dist/interview/response-validator.d.ts +10 -0
  265. package/dist/interview/response-validator.js +132 -0
  266. package/dist/interview/schema-inferrer.d.ts +8 -0
  267. package/dist/interview/schema-inferrer.js +71 -0
  268. package/dist/interview/schema-test-generator.d.ts +71 -0
  269. package/dist/interview/schema-test-generator.js +834 -0
  270. package/dist/interview/smart-value-generator.d.ts +155 -0
  271. package/dist/interview/smart-value-generator.js +554 -0
  272. package/dist/interview/stateful-test-runner.d.ts +19 -0
  273. package/dist/interview/stateful-test-runner.js +106 -0
  274. package/dist/interview/types.d.ts +561 -0
  275. package/dist/interview/types.js +2 -0
  276. package/dist/llm/anthropic.d.ts +41 -0
  277. package/dist/llm/anthropic.js +355 -0
  278. package/dist/llm/client.d.ts +123 -0
  279. package/dist/llm/client.js +42 -0
  280. package/dist/llm/factory.d.ts +38 -0
  281. package/dist/llm/factory.js +145 -0
  282. package/dist/llm/fallback.d.ts +140 -0
  283. package/dist/llm/fallback.js +379 -0
  284. package/dist/llm/index.d.ts +18 -0
  285. package/dist/llm/index.js +15 -0
  286. package/dist/llm/ollama.d.ts +37 -0
  287. package/dist/llm/ollama.js +330 -0
  288. package/dist/llm/openai.d.ts +25 -0
  289. package/dist/llm/openai.js +320 -0
  290. package/dist/llm/token-budget.d.ts +161 -0
  291. package/dist/llm/token-budget.js +395 -0
  292. package/dist/logging/logger.d.ts +70 -0
  293. package/dist/logging/logger.js +130 -0
  294. package/dist/metrics/collector.d.ts +106 -0
  295. package/dist/metrics/collector.js +547 -0
  296. package/dist/metrics/index.d.ts +7 -0
  297. package/dist/metrics/index.js +7 -0
  298. package/dist/metrics/prometheus.d.ts +20 -0
  299. package/dist/metrics/prometheus.js +241 -0
  300. package/dist/metrics/types.d.ts +209 -0
  301. package/dist/metrics/types.js +5 -0
  302. package/dist/persona/builtins.d.ts +54 -0
  303. package/dist/persona/builtins.js +219 -0
  304. package/dist/persona/index.d.ts +8 -0
  305. package/dist/persona/index.js +8 -0
  306. package/dist/persona/loader.d.ts +30 -0
  307. package/dist/persona/loader.js +190 -0
  308. package/dist/persona/types.d.ts +144 -0
  309. package/dist/persona/types.js +5 -0
  310. package/dist/persona/validation.d.ts +94 -0
  311. package/dist/persona/validation.js +332 -0
  312. package/dist/prompts/index.d.ts +5 -0
  313. package/dist/prompts/index.js +5 -0
  314. package/dist/prompts/templates.d.ts +180 -0
  315. package/dist/prompts/templates.js +431 -0
  316. package/dist/registry/client.d.ts +49 -0
  317. package/dist/registry/client.js +191 -0
  318. package/dist/registry/index.d.ts +7 -0
  319. package/dist/registry/index.js +6 -0
  320. package/dist/registry/types.d.ts +140 -0
  321. package/dist/registry/types.js +6 -0
  322. package/dist/scenarios/evaluator.d.ts +43 -0
  323. package/dist/scenarios/evaluator.js +206 -0
  324. package/dist/scenarios/index.d.ts +10 -0
  325. package/dist/scenarios/index.js +9 -0
  326. package/dist/scenarios/loader.d.ts +20 -0
  327. package/dist/scenarios/loader.js +285 -0
  328. package/dist/scenarios/types.d.ts +153 -0
  329. package/dist/scenarios/types.js +8 -0
  330. package/dist/security/index.d.ts +17 -0
  331. package/dist/security/index.js +18 -0
  332. package/dist/security/payloads.d.ts +61 -0
  333. package/dist/security/payloads.js +268 -0
  334. package/dist/security/security-tester.d.ts +42 -0
  335. package/dist/security/security-tester.js +582 -0
  336. package/dist/security/types.d.ts +166 -0
  337. package/dist/security/types.js +8 -0
  338. package/dist/transport/base-transport.d.ts +59 -0
  339. package/dist/transport/base-transport.js +38 -0
  340. package/dist/transport/http-transport.d.ts +67 -0
  341. package/dist/transport/http-transport.js +238 -0
  342. package/dist/transport/mcp-client.d.ts +141 -0
  343. package/dist/transport/mcp-client.js +496 -0
  344. package/dist/transport/sse-transport.d.ts +88 -0
  345. package/dist/transport/sse-transport.js +316 -0
  346. package/dist/transport/stdio-transport.d.ts +43 -0
  347. package/dist/transport/stdio-transport.js +238 -0
  348. package/dist/transport/types.d.ts +125 -0
  349. package/dist/transport/types.js +16 -0
  350. package/dist/utils/concurrency.d.ts +123 -0
  351. package/dist/utils/concurrency.js +213 -0
  352. package/dist/utils/formatters.d.ts +16 -0
  353. package/dist/utils/formatters.js +37 -0
  354. package/dist/utils/index.d.ts +8 -0
  355. package/dist/utils/index.js +8 -0
  356. package/dist/utils/jsonpath.d.ts +87 -0
  357. package/dist/utils/jsonpath.js +326 -0
  358. package/dist/utils/markdown.d.ts +113 -0
  359. package/dist/utils/markdown.js +265 -0
  360. package/dist/utils/network.d.ts +14 -0
  361. package/dist/utils/network.js +17 -0
  362. package/dist/utils/sanitize.d.ts +92 -0
  363. package/dist/utils/sanitize.js +191 -0
  364. package/dist/utils/semantic.d.ts +194 -0
  365. package/dist/utils/semantic.js +1051 -0
  366. package/dist/utils/smart-truncate.d.ts +94 -0
  367. package/dist/utils/smart-truncate.js +361 -0
  368. package/dist/utils/timeout.d.ts +153 -0
  369. package/dist/utils/timeout.js +205 -0
  370. package/dist/utils/yaml-parser.d.ts +58 -0
  371. package/dist/utils/yaml-parser.js +86 -0
  372. package/dist/validation/index.d.ts +32 -0
  373. package/dist/validation/index.js +32 -0
  374. package/dist/validation/semantic-test-generator.d.ts +50 -0
  375. package/dist/validation/semantic-test-generator.js +176 -0
  376. package/dist/validation/semantic-types.d.ts +66 -0
  377. package/dist/validation/semantic-types.js +94 -0
  378. package/dist/validation/semantic-validator.d.ts +38 -0
  379. package/dist/validation/semantic-validator.js +340 -0
  380. package/dist/verification/index.d.ts +6 -0
  381. package/dist/verification/index.js +5 -0
  382. package/dist/verification/types.d.ts +133 -0
  383. package/dist/verification/types.js +5 -0
  384. package/dist/verification/verifier.d.ts +30 -0
  385. package/dist/verification/verifier.js +309 -0
  386. package/dist/version.d.ts +19 -0
  387. package/dist/version.js +48 -0
  388. package/dist/workflow/auto-generator.d.ts +27 -0
  389. package/dist/workflow/auto-generator.js +513 -0
  390. package/dist/workflow/discovery.d.ts +40 -0
  391. package/dist/workflow/discovery.js +195 -0
  392. package/dist/workflow/executor.d.ts +82 -0
  393. package/dist/workflow/executor.js +611 -0
  394. package/dist/workflow/index.d.ts +10 -0
  395. package/dist/workflow/index.js +10 -0
  396. package/dist/workflow/loader.d.ts +24 -0
  397. package/dist/workflow/loader.js +194 -0
  398. package/dist/workflow/state-tracker.d.ts +98 -0
  399. package/dist/workflow/state-tracker.js +424 -0
  400. package/dist/workflow/types.d.ts +337 -0
  401. package/dist/workflow/types.js +5 -0
  402. package/package.json +94 -0
  403. package/schemas/bellwether-check.schema.json +651 -0
@@ -0,0 +1,1164 @@
1
+ /**
2
+ * Expanded Golden Dataset for Drift Detection Evaluation
3
+ *
4
+ * Phase 3 expansion: 150+ additional labeled test cases covering:
5
+ * - Extended security vulnerability paraphrases
6
+ * - Comprehensive limitation variations
7
+ * - Assertion behavior matching
8
+ * - Edge cases (negation, severity, constraints)
9
+ * - Paraphrase robustness tests
10
+ *
11
+ * These cases are designed to:
12
+ * 1. Test algorithm robustness against paraphrase variations
13
+ * 2. Verify correct handling of edge cases
14
+ * 3. Ensure high recall without sacrificing precision
15
+ */
16
+ // ============================================================================
17
+ // SECURITY: PATH TRAVERSAL VARIATIONS
18
+ // Tests various phrasings of directory traversal vulnerabilities. Each case pairs two descriptions (text1, text2) attributed to the read_file tool; expectedMatch is the labeled answer for whether the pair should match, and reasoning states why.
19
+ // ============================================================================
20
+ export const PATH_TRAVERSAL_CASES = [
21
+ // True Positives - Should match (expectedMatch: true): both texts describe the same traversal weakness in different words; tagged 'paraphrase'
22
+ {
23
+ id: 'pt-tp-001',
24
+ category: 'security',
25
+ text1: 'Attackers can use ../ sequences to escape the sandbox directory',
26
+ text2: 'Directory traversal via dot-dot-slash allows accessing parent folders',
27
+ toolName: 'read_file',
28
+ expectedMatch: true,
29
+ reasoning: 'Both describe ../ based directory traversal',
30
+ source: 'llm-generated',
31
+ tags: ['path_traversal', 'paraphrase'],
32
+ },
33
+ {
34
+ id: 'pt-tp-002',
35
+ category: 'security',
36
+ text1: 'File path not properly sanitized, allowing arbitrary file access',
37
+ text2: 'Insufficient path validation enables reading files outside allowed directories',
38
+ toolName: 'read_file',
39
+ expectedMatch: true,
40
+ reasoning: 'Both describe path sanitization failures',
41
+ source: 'llm-generated',
42
+ tags: ['path_traversal', 'paraphrase'],
43
+ },
44
+ {
45
+ id: 'pt-tp-003',
46
+ category: 'security',
47
+ text1: 'The tool does not validate that requested paths stay within base directory',
48
+ text2: 'No boundary check on file paths allows directory escape',
49
+ toolName: 'read_file',
50
+ expectedMatch: true,
51
+ reasoning: 'Both describe missing path boundary validation',
52
+ source: 'llm-generated',
53
+ tags: ['path_traversal', 'paraphrase'],
54
+ },
55
+ {
56
+ id: 'pt-tp-004',
57
+ category: 'security',
58
+ text1: 'Relative paths with ../ can traverse to system files',
59
+ text2: 'Using parent directory references in paths exposes sensitive files',
60
+ toolName: 'read_file',
61
+ expectedMatch: true,
62
+ reasoning: 'Both describe traversal to sensitive system files',
63
+ source: 'llm-generated',
64
+ tags: ['path_traversal', 'paraphrase'],
65
+ },
66
+ // True Negatives - Should NOT match (expectedMatch: false): text2 describes a different issue (permission denial, directory listing); tagged 'negative'
67
+ {
68
+ id: 'pt-tn-001',
69
+ category: 'security',
70
+ text1: 'Path traversal vulnerability in file reading',
71
+ text2: 'Permission denied when accessing protected files',
72
+ toolName: 'read_file',
73
+ expectedMatch: false,
74
+ reasoning: 'Different issues: traversal vs permission denial',
75
+ source: 'llm-generated',
76
+ tags: ['path_traversal', 'negative'],
77
+ },
78
+ {
79
+ id: 'pt-tn-002',
80
+ category: 'security',
81
+ text1: 'Directory traversal allows reading /etc/passwd',
82
+ text2: 'Directory listing exposes file names',
83
+ toolName: 'read_file',
84
+ expectedMatch: false,
85
+ reasoning: 'Different issues: file read vs directory listing',
86
+ source: 'llm-generated',
87
+ tags: ['path_traversal', 'negative'],
88
+ },
89
+ ];
90
+ // ============================================================================
91
+ // SECURITY: SQL INJECTION VARIATIONS
92
+ // Tests various phrasings of SQL injection vulnerabilities. Each case pairs two descriptions (text1, text2) attributed to the search_db tool; expectedMatch is the labeled answer for whether the pair should match, and reasoning states why.
93
+ // ============================================================================
94
+ export const SQL_INJECTION_CASES = [
95
+ // True Positives - Should match (expectedMatch: true): both texts describe SQL injection or unsafe query construction in different words; tagged 'paraphrase'
96
+ {
97
+ id: 'sql-tp-001',
98
+ category: 'security',
99
+ text1: 'User input concatenated directly into SQL queries',
100
+ text2: 'Query strings built without parameterization',
101
+ toolName: 'search_db',
102
+ expectedMatch: true,
103
+ reasoning: 'Both describe improper SQL construction',
104
+ source: 'llm-generated',
105
+ tags: ['sql_injection', 'paraphrase'],
106
+ },
107
+ {
108
+ id: 'sql-tp-002',
109
+ category: 'security',
110
+ text1: 'Attackers can inject malicious SQL through search parameters',
111
+ text2: 'SQL statements vulnerable to injection via user-controlled input',
112
+ toolName: 'search_db',
113
+ expectedMatch: true,
114
+ reasoning: 'Both describe SQL injection through user input',
115
+ source: 'llm-generated',
116
+ tags: ['sql_injection', 'paraphrase'],
117
+ },
118
+ {
119
+ id: 'sql-tp-003',
120
+ category: 'security',
121
+ text1: 'Unsanitized input allows UNION-based SQL injection',
122
+ text2: 'SQL injection enables extracting data from other tables',
123
+ toolName: 'search_db',
124
+ expectedMatch: true,
125
+ reasoning: 'Both describe SQL injection data extraction',
126
+ source: 'llm-generated',
127
+ tags: ['sql_injection', 'paraphrase'],
128
+ },
129
+ {
130
+ id: 'sql-tp-004',
131
+ category: 'security',
132
+ text1: 'Database queries use string interpolation instead of prepared statements',
133
+ text2: 'Queries constructed with f-strings allow SQL injection',
134
+ toolName: 'search_db',
135
+ expectedMatch: true,
136
+ reasoning: 'Both describe unsafe query construction',
137
+ source: 'llm-generated',
138
+ tags: ['sql_injection', 'paraphrase'],
139
+ },
140
+ {
141
+ id: 'sql-tp-005',
142
+ category: 'security',
143
+ text1: 'The WHERE clause is vulnerable to boolean-based blind injection',
144
+ text2: 'Blind SQL injection possible through conditional responses',
145
+ toolName: 'search_db',
146
+ expectedMatch: true,
147
+ reasoning: 'Both describe blind SQL injection',
148
+ source: 'llm-generated',
149
+ tags: ['sql_injection', 'paraphrase'],
150
+ },
151
+ // True Negatives - Should NOT match (expectedMatch: false): text2 describes a different issue (NoSQL injection, connection timeout); tagged 'negative'
152
+ {
153
+ id: 'sql-tn-001',
154
+ category: 'security',
155
+ text1: 'SQL injection in database queries',
156
+ text2: 'NoSQL injection in MongoDB queries',
157
+ toolName: 'search_db',
158
+ expectedMatch: false,
159
+ reasoning: 'Different database types',
160
+ source: 'llm-generated',
161
+ tags: ['sql_injection', 'negative'],
162
+ },
163
+ {
164
+ id: 'sql-tn-002',
165
+ category: 'security',
166
+ text1: 'SQL injection vulnerability',
167
+ text2: 'Database connection timeout issue',
168
+ toolName: 'search_db',
169
+ expectedMatch: false,
170
+ reasoning: 'Security vs operational issue',
171
+ source: 'llm-generated',
172
+ tags: ['sql_injection', 'negative'],
173
+ },
174
+ ];
175
+ // ============================================================================
176
+ // SECURITY: XSS VARIATIONS
177
+ // Tests various phrasings of cross-site scripting vulnerabilities. Each case pairs two descriptions (text1, text2) attributed to the render_html tool; expectedMatch is the labeled answer for whether the pair should match, and reasoning states why.
178
+ // ============================================================================
179
+ export const XSS_CASES = [
180
+ // True Positives - Should match (expectedMatch: true): both texts describe the same XSS variant (unescaped output, reflected, stored, DOM-based, event handlers); tagged 'paraphrase'
181
+ {
182
+ id: 'xss-tp-001',
183
+ category: 'security',
184
+ text1: 'User input rendered in HTML without escaping',
185
+ text2: 'Output not properly encoded, allowing script injection',
186
+ toolName: 'render_html',
187
+ expectedMatch: true,
188
+ reasoning: 'Both describe XSS through unescaped output',
189
+ source: 'llm-generated',
190
+ tags: ['xss', 'paraphrase'],
191
+ },
192
+ {
193
+ id: 'xss-tp-002',
194
+ category: 'security',
195
+ text1: 'Reflected XSS through URL parameters displayed on page',
196
+ text2: 'Input from query string echoed without sanitization',
197
+ toolName: 'render_html',
198
+ expectedMatch: true,
199
+ reasoning: 'Both describe reflected XSS',
200
+ source: 'llm-generated',
201
+ tags: ['xss', 'paraphrase'],
202
+ },
203
+ {
204
+ id: 'xss-tp-003',
205
+ category: 'security',
206
+ text1: 'Stored XSS in user comments persisted to database',
207
+ text2: 'Malicious scripts saved and served to other users',
208
+ toolName: 'render_html',
209
+ expectedMatch: true,
210
+ reasoning: 'Both describe stored/persistent XSS',
211
+ source: 'llm-generated',
212
+ tags: ['xss', 'paraphrase'],
213
+ },
214
+ {
215
+ id: 'xss-tp-004',
216
+ category: 'security',
217
+ text1: 'DOM-based XSS through innerHTML assignment',
218
+ text2: 'Client-side script injection via DOM manipulation',
219
+ toolName: 'render_html',
220
+ expectedMatch: true,
221
+ reasoning: 'Both describe DOM-based XSS',
222
+ source: 'llm-generated',
223
+ tags: ['xss', 'paraphrase'],
224
+ },
225
+ {
226
+ id: 'xss-tp-005',
227
+ category: 'security',
228
+ text1: 'JavaScript can be injected through event handlers',
229
+ text2: 'onclick and other event attributes allow script execution',
230
+ toolName: 'render_html',
231
+ expectedMatch: true,
232
+ reasoning: 'Both describe XSS via event handlers',
233
+ source: 'llm-generated',
234
+ tags: ['xss', 'paraphrase'],
235
+ },
236
+ // True Negatives - Should NOT match (expectedMatch: false): text2 describes a different vulnerability (CSRF, CORS misconfiguration); tagged 'negative'
237
+ {
238
+ id: 'xss-tn-001',
239
+ category: 'security',
240
+ text1: 'Cross-site scripting vulnerability',
241
+ text2: 'Cross-site request forgery vulnerability',
242
+ toolName: 'render_html',
243
+ expectedMatch: false,
244
+ reasoning: 'XSS vs CSRF are different vulnerabilities',
245
+ source: 'llm-generated',
246
+ tags: ['xss', 'negative'],
247
+ },
248
+ {
249
+ id: 'xss-tn-002',
250
+ category: 'security',
251
+ text1: 'XSS allows stealing session cookies',
252
+ text2: 'CORS misconfiguration exposes data',
253
+ toolName: 'render_html',
254
+ expectedMatch: false,
255
+ reasoning: 'Different vulnerability types',
256
+ source: 'llm-generated',
257
+ tags: ['xss', 'negative'],
258
+ },
259
+ ];
260
+ // ============================================================================
261
+ // SECURITY: COMMAND INJECTION VARIATIONS
262
+ // Tests various phrasings of OS command injection vulnerabilities. Each case pairs two descriptions (text1, text2) attributed to the run_command tool; expectedMatch is the labeled answer for whether the pair should match, and reasoning states why.
263
+ // ============================================================================
264
+ export const COMMAND_INJECTION_CASES = [
265
+ // True Positives - Should match (expectedMatch: true): both texts describe shell/command injection in different words; tagged 'paraphrase'
266
+ {
267
+ id: 'cmd-tp-001',
268
+ category: 'security',
269
+ text1: 'User input passed directly to shell execution',
270
+ text2: 'Command constructed with unsanitized parameters',
271
+ toolName: 'run_command',
272
+ expectedMatch: true,
273
+ reasoning: 'Both describe command injection via shell',
274
+ source: 'llm-generated',
275
+ tags: ['command_injection', 'paraphrase'],
276
+ },
277
+ {
278
+ id: 'cmd-tp-002',
279
+ category: 'security',
280
+ text1: 'OS command injection through backtick execution',
281
+ text2: 'Shell metacharacters allow arbitrary command execution',
282
+ toolName: 'run_command',
283
+ expectedMatch: true,
284
+ reasoning: 'Both describe command injection techniques',
285
+ source: 'llm-generated',
286
+ tags: ['command_injection', 'paraphrase'],
287
+ },
288
+ {
289
+ id: 'cmd-tp-003',
290
+ category: 'security',
291
+ text1: 'subprocess.call with shell=True is vulnerable to injection',
292
+ text2: 'Spawning shell processes with user input enables RCE',
293
+ toolName: 'run_command',
294
+ expectedMatch: true,
295
+ reasoning: 'Both describe shell-based command injection',
296
+ source: 'llm-generated',
297
+ tags: ['command_injection', 'paraphrase'],
298
+ },
299
+ {
300
+ id: 'cmd-tp-004',
301
+ category: 'security',
302
+ text1: 'Semicolons in input allow command chaining',
303
+ text2: 'Command separators enable executing additional commands',
304
+ toolName: 'run_command',
305
+ expectedMatch: true,
306
+ reasoning: 'Both describe command chaining injection',
307
+ source: 'llm-generated',
308
+ tags: ['command_injection', 'paraphrase'],
309
+ },
310
+ // True Negatives - Should NOT match (expectedMatch: false): text2 describes an operational "command not found" error rather than a vulnerability; tagged 'negative'
311
+ {
312
+ id: 'cmd-tn-001',
313
+ category: 'security',
314
+ text1: 'Command injection allows executing arbitrary code',
315
+ text2: 'Command not found error when tool is missing',
316
+ toolName: 'run_command',
317
+ expectedMatch: false,
318
+ reasoning: 'Security vulnerability vs operational error',
319
+ source: 'llm-generated',
320
+ tags: ['command_injection', 'negative'],
321
+ },
322
+ ];
323
+ // ============================================================================
324
+ // SECURITY: SSRF VARIATIONS
325
+ // Tests various phrasings of server-side request forgery
326
+ // ============================================================================
327
/**
 * Server-side request forgery fixtures, all targeting the `fetch_url` tool.
 *
 * Rows are [id, text1, text2, expectedMatch, reasoning]; shared fields are
 * filled in by the mapper.
 */
export const SSRF_CASES = [
    // True Positives - Should match
    ['ssrf-tp-001', 'Server makes requests to user-specified URLs without validation', 'Attacker can make the server fetch arbitrary URLs', true, 'Both describe SSRF via URL control'],
    ['ssrf-tp-002', 'SSRF allows accessing internal network services', 'Server-side request forgery exposes internal endpoints', true, 'Both describe SSRF to internal network'],
    ['ssrf-tp-003', 'Cloud metadata endpoint accessible via SSRF', 'Attacker can reach 169.254.169.254 through the server', true, 'Both describe cloud metadata SSRF'],
    // True Negatives - Should NOT match
    ['ssrf-tn-001', 'SSRF vulnerability in URL fetching', 'Open redirect vulnerability in URL handling', false, 'SSRF vs open redirect are different'],
].map(([id, text1, text2, expectedMatch, reasoning]) => ({
    id,
    category: 'security',
    text1,
    text2,
    toolName: 'fetch_url',
    expectedMatch,
    reasoning,
    source: 'llm-generated',
    tags: ['ssrf', expectedMatch ? 'paraphrase' : 'negative'],
}));
375
+ // ============================================================================
376
+ // SECURITY: AUTHENTICATION/AUTHORIZATION VARIATIONS
377
+ // Tests auth-related vulnerabilities
378
+ // ============================================================================
379
/**
 * Authentication / authorization fixtures, all targeting `api_endpoint`.
 *
 * Rows are [id, text1, text2, expectedMatch, reasoning, topic], where `topic`
 * becomes the first tag ('authentication' or 'authorization').
 */
export const AUTH_CASES = [
    // True Positives - Should match
    ['auth-tp-001', 'Missing authentication allows unauthenticated access to API', 'Endpoints accessible without any credentials', true, 'Both describe missing authentication', 'authentication'],
    ['auth-tp-002', 'Session tokens not properly validated', 'Invalid or expired tokens still accepted', true, 'Both describe session validation failures', 'authentication'],
    ['auth-tp-003', 'IDOR allows accessing other users resources', 'Changing ID parameter exposes other accounts data', true, 'Both describe insecure direct object reference', 'authorization'],
    ['auth-tp-004', 'Privilege escalation from user to admin role', 'Regular users can access administrative functions', true, 'Both describe privilege escalation', 'authorization'],
    // True Negatives - Should NOT match
    ['auth-tn-001', 'Authentication bypass vulnerability', 'Authorization check missing on endpoint', false, 'Authentication vs authorization are different', 'authentication'],
    ['auth-tn-002', 'Weak password policy allows simple passwords', 'Password stored in plain text', false, 'Password policy vs storage are different issues', 'authentication'],
].map(([id, text1, text2, expectedMatch, reasoning, topic]) => ({
    id,
    category: 'security',
    text1,
    text2,
    toolName: 'api_endpoint',
    expectedMatch,
    reasoning,
    source: 'llm-generated',
    tags: [topic, expectedMatch ? 'paraphrase' : 'negative'],
}));
449
+ // ============================================================================
450
+ // LIMITATIONS: SIZE CONSTRAINTS
451
+ // Tests various phrasings of size-related limitations
452
+ // ============================================================================
453
/**
 * Size-limit fixtures.
 *
 * Rows are [id, text1, text2, toolName, expectedMatch, reasoning]; category,
 * source and tags are filled in by the mapper.
 */
export const SIZE_LIMIT_CASES = [
    // True Positives - Should match
    ['size-tp-001', 'Files larger than 50MB are rejected', 'Maximum file size: 50 megabytes', 'upload_file', true, 'Same 50MB limit'],
    ['size-tp-002', 'Upload limit is 25 megabytes per file', 'Each file must be under 25MB', 'upload_file', true, 'Same 25MB per-file limit'],
    ['size-tp-003', 'Total upload size cannot exceed 500MB', 'Combined file size limited to 500 megabytes', 'upload_file', true, 'Same 500MB total limit'],
    ['size-tp-004', 'Request body limited to 1MB', 'Payload size must not exceed 1 megabyte', 'api_call', true, 'Same 1MB request limit'],
    // True Negatives - Should NOT match
    ['size-tn-001', 'Maximum file size is 10MB', 'Maximum file size is 50MB', 'upload_file', false, 'Different size limits: 10MB vs 50MB'],
    ['size-tn-002', 'Files up to 1GB supported', 'Files up to 100MB supported', 'upload_file', false, 'Different limits: 1GB vs 100MB'],
    ['size-tn-003', 'Maximum upload size is 5MB', 'Maximum download size is 5MB', 'upload_file', false, 'Upload vs download are different operations'],
].map(([id, text1, text2, toolName, expectedMatch, reasoning]) => ({
    id,
    category: 'limitation',
    text1,
    text2,
    toolName,
    expectedMatch,
    reasoning,
    source: 'llm-generated',
    tags: ['size_limit', expectedMatch ? 'paraphrase' : 'negative'],
}));
534
+ // ============================================================================
535
+ // LIMITATIONS: RATE LIMITS
536
+ // Tests various phrasings of rate limiting
537
+ // ============================================================================
538
/**
 * Rate-limit fixtures, all targeting the `api_call` tool.
 *
 * Rows are [id, text1, text2, expectedMatch, reasoning]; shared fields are
 * filled in by the mapper.
 */
export const RATE_LIMIT_CASES = [
    // True Positives - Should match
    ['rate-tp-001', 'API limited to 100 calls per minute', 'Rate limit: 100 requests/min', true, 'Same 100/min rate limit'],
    ['rate-tp-002', 'Maximum 1000 requests per hour', 'Hourly quota of 1000 API calls', true, 'Same 1000/hour limit'],
    ['rate-tp-003', 'Throttled to 10 requests per second', '10 RPS rate limit enforced', true, 'Same 10/second limit'],
    // True Negatives - Should NOT match
    ['rate-tn-001', 'Rate limited to 100 requests per minute', 'Rate limited to 100 requests per hour', false, 'Different time periods: per minute vs per hour'],
    ['rate-tn-002', '50 calls per minute allowed', '500 calls per minute allowed', false, 'Different rates: 50 vs 500'],
].map(([id, text1, text2, expectedMatch, reasoning]) => ({
    id,
    category: 'limitation',
    text1,
    text2,
    toolName: 'api_call',
    expectedMatch,
    reasoning,
    source: 'llm-generated',
    tags: ['rate_limit', expectedMatch ? 'paraphrase' : 'negative'],
}));
597
+ // ============================================================================
598
+ // LIMITATIONS: TIMEOUT CONSTRAINTS
599
+ // Tests various phrasings of timeout limitations
600
+ // ============================================================================
601
/**
 * Timeout fixtures.
 *
 * Rows are [id, text1, text2, toolName, expectedMatch, reasoning]; category,
 * source and tags are filled in by the mapper.
 */
export const TIMEOUT_CASES = [
    // True Positives - Should match
    ['timeout-tp-001', 'Operations time out after 30 seconds', '30 second timeout on all requests', 'api_call', true, 'Same 30 second timeout'],
    ['timeout-tp-002', 'Long-running tasks limited to 5 minutes', 'Maximum execution time is 300 seconds', 'process_data', true, 'Same 5 minute/300 second timeout'],
    ['timeout-tp-003', 'Connection timeout set to 10 seconds', 'Connections must establish within 10s', 'api_call', true, 'Same 10 second connection timeout'],
    // True Negatives - Should NOT match
    ['timeout-tn-001', 'Request timeout is 30 seconds', 'Request timeout is 60 seconds', 'api_call', false, 'Different timeout values'],
    ['timeout-tn-002', 'Connection timeout of 5 seconds', 'Read timeout of 5 seconds', 'api_call', false, 'Different timeout types'],
].map(([id, text1, text2, toolName, expectedMatch, reasoning]) => ({
    id,
    category: 'limitation',
    text1,
    text2,
    toolName,
    expectedMatch,
    reasoning,
    source: 'llm-generated',
    tags: ['timeout', expectedMatch ? 'paraphrase' : 'negative'],
}));
660
+ // ============================================================================
661
+ // LIMITATIONS: FORMAT CONSTRAINTS
662
+ // Tests various phrasings of format/encoding limitations
663
+ // ============================================================================
664
/**
 * Format / encoding fixtures.
 *
 * Rows are [id, text1, text2, toolName, expectedMatch, reasoning, topic],
 * where `topic` becomes the first tag ('format' or 'encoding').
 */
export const FORMAT_CASES = [
    // True Positives - Should match
    ['fmt-tp-001', 'Only accepts JSON formatted input', 'Input must be valid JSON', 'parse_data', true, 'Same JSON format requirement', 'format'],
    ['fmt-tp-002', 'UTF-8 encoding required for all text', 'Text must use UTF-8 character encoding', 'process_text', true, 'Same UTF-8 requirement', 'encoding'],
    ['fmt-tp-003', 'Images must be PNG or JPEG format', 'Supported image formats: PNG, JPEG', 'upload_image', true, 'Same supported formats', 'format'],
    // True Negatives - Should NOT match
    ['fmt-tn-001', 'Only JSON format supported', 'Only YAML format supported', 'parse_data', false, 'Different formats: JSON vs YAML', 'format'],
    ['fmt-tn-002', 'Requires UTF-8 encoding', 'Requires ASCII encoding', 'process_text', false, 'Different encodings', 'encoding'],
].map(([id, text1, text2, toolName, expectedMatch, reasoning, topic]) => ({
    id,
    category: 'limitation',
    text1,
    text2,
    toolName,
    expectedMatch,
    reasoning,
    source: 'llm-generated',
    tags: [topic, expectedMatch ? 'paraphrase' : 'negative'],
}));
723
+ // ============================================================================
724
+ // ASSERTIONS: BEHAVIOR MATCHING
725
+ // Tests various phrasings of behavioral assertions
726
+ // ============================================================================
727
/**
 * Behavioral-assertion fixtures.
 *
 * Rows are [id, text1, text2, toolName, expectedMatch, reasoning]; category,
 * source and tags are filled in by the mapper.
 */
export const ASSERTION_CASES = [
    // True Positives - Should match
    ['asrt-tp-010', 'Function returns null for invalid input', 'Invalid input causes null to be returned', 'process_data', true, 'Same null return behavior'],
    ['asrt-tp-011', 'Throws FileNotFoundError when file is missing', 'Missing files cause FileNotFoundError to be raised', 'read_file', true, 'Same error thrown for same condition'],
    ['asrt-tp-012', 'Returns empty array when no results found', 'Empty list returned for queries with no matches', 'search_db', true, 'Same empty result behavior'],
    ['asrt-tp-013', 'Successful operations return status code 200', 'HTTP 200 returned on success', 'api_call', true, 'Same success status code'],
    ['asrt-tp-014', 'Creates directory if it does not exist', 'Missing directories are automatically created', 'write_file', true, 'Same auto-create behavior'],
    // True Negatives - Should NOT match
    ['exp-asrt-tn-001', 'Returns empty array when no results found', 'Throws exception when no results found', 'search_db', false, 'Different behaviors: return vs throw'],
    ['exp-asrt-tn-002', 'Returns null for invalid input', 'Returns default value for invalid input', 'process_data', false, 'Different return values'],
    ['exp-asrt-tn-003', 'Status code 200 on success', 'Status code 201 on success', 'api_call', false, 'Different status codes'],
].map(([id, text1, text2, toolName, expectedMatch, reasoning]) => ({
    id,
    category: 'assertion',
    text1,
    text2,
    toolName,
    expectedMatch,
    reasoning,
    source: 'llm-generated',
    tags: ['assertion', expectedMatch ? 'paraphrase' : 'negative'],
}));
819
+ // ============================================================================
820
+ // EDGE CASES: NEGATION HANDLING
821
+ // Tests that negated phrases are properly distinguished
822
+ // ============================================================================
823
/**
 * Negation edge cases: every pair differs only by a negation and is expected
 * NOT to match.
 *
 * Rows are [id, category, text1, text2, toolName, reasoning].
 */
export const NEGATION_CASES = [
    ['neg-001', 'security', 'This is a critical vulnerability', 'This is not a critical vulnerability', 'test_tool', 'Negation reverses meaning'],
    ['neg-002', 'limitation', 'There is no size limit', 'There is a size limit of 10MB', 'upload_file', 'No limit vs specific limit'],
    ['neg-003', 'security', 'Input is validated before processing', 'Input is not validated before processing', 'process_data', 'Validated vs not validated'],
    ['neg-004', 'security', 'Authentication is required', 'Authentication is not required', 'api_endpoint', 'Required vs not required'],
    ['neg-005', 'limitation', 'Rate limiting is enabled', 'Rate limiting is disabled', 'api_call', 'Enabled vs disabled'],
].map(([id, category, text1, text2, toolName, reasoning]) => ({
    id,
    category,
    text1,
    text2,
    toolName,
    expectedMatch: false,
    reasoning,
    source: 'llm-generated',
    tags: ['negation', 'edge'],
}));
880
+ // ============================================================================
881
+ // EDGE CASES: SEVERITY DIFFERENCES
882
+ // Tests that different severities are properly distinguished
883
+ // ============================================================================
884
/**
 * Severity edge cases, all in the 'security' category.
 *
 * Rows are [id, text1, text2, toolName, expectedMatch, reasoning].
 */
export const SEVERITY_CASES = [
    ['sev-001', 'Critical severity remote code execution', 'Low severity information disclosure', 'test_tool', false, 'Critical vs low severity'],
    ['sev-002', 'High risk SQL injection vulnerability', 'Medium risk SQL injection vulnerability', 'search_db', true, 'Same vulnerability type, adjacent severity'],
    ['sev-003', 'Minor XSS issue in error messages', 'Critical XSS allowing session hijacking', 'render_html', false, 'Minor vs critical severity'],
].map(([id, text1, text2, toolName, expectedMatch, reasoning]) => ({
    id,
    category: 'security',
    text1,
    text2,
    toolName,
    expectedMatch,
    reasoning,
    source: 'llm-generated',
    tags: ['severity', 'edge'],
}));
919
+ // ============================================================================
920
+ // EDGE CASES: SIMILAR BUT DIFFERENT
921
+ // Tests for things that look similar but have different meanings
922
+ // ============================================================================
923
/**
 * Look-alike edge cases: pairs with near-identical wording but different
 * meaning, all expected NOT to match.
 *
 * Rows are [id, category, text1, text2, toolName, reasoning].
 */
export const SIMILAR_DIFFERENT_CASES = [
    ['sim-001', 'security', 'Server-side request forgery vulnerability', 'Cross-site request forgery vulnerability', 'api_endpoint', 'SSRF vs CSRF are different'],
    ['sim-002', 'security', 'Local file inclusion vulnerability', 'Remote file inclusion vulnerability', 'read_file', 'LFI vs RFI are different'],
    ['sim-003', 'limitation', 'Read operations are rate limited', 'Write operations are rate limited', 'api_call', 'Different operation types'],
    ['sim-004', 'security', 'Horizontal privilege escalation', 'Vertical privilege escalation', 'api_endpoint', 'Different escalation types'],
    ['sim-005', 'assertion', 'Synchronous API call', 'Asynchronous API call', 'api_call', 'Different execution modes'],
].map(([id, category, text1, text2, toolName, reasoning]) => ({
    id,
    category,
    text1,
    text2,
    toolName,
    expectedMatch: false,
    reasoning,
    source: 'llm-generated',
    tags: ['similar', 'edge'],
}));
980
+ // ============================================================================
981
+ // PARAPHRASE ROBUSTNESS: TECHNICAL TERMS
982
+ // Tests various ways to express the same technical concept
983
+ // ============================================================================
984
/**
 * Abbreviation-vs-expansion paraphrases, all expected to match.
 *
 * Rows are [id, category, text1, text2, toolName, reasoning].
 */
export const PARAPHRASE_TECHNICAL_CASES = [
    ['para-tech-001', 'security', 'RCE vulnerability through deserialization', 'Remote code execution via unsafe object deserialization', 'process_data', 'RCE abbreviation expands to remote code execution'],
    ['para-tech-002', 'security', 'XXE attack allows reading local files', 'XML external entity injection enables file disclosure', 'parse_xml', 'XXE abbreviation and full description'],
    ['para-tech-003', 'security', 'DoS through resource exhaustion', 'Denial of service by consuming server resources', 'process_data', 'DoS abbreviation and full description'],
    ['para-tech-004', 'limitation', 'API uses JWT for auth', 'JSON Web Tokens required for authentication', 'api_endpoint', 'JWT abbreviation and full description'],
].map(([id, category, text1, text2, toolName, reasoning]) => ({
    id,
    category,
    text1,
    text2,
    toolName,
    expectedMatch: true,
    reasoning,
    source: 'llm-generated',
    tags: ['paraphrase', 'abbreviation'],
}));
1030
+ // ============================================================================
1031
+ // PARAPHRASE ROBUSTNESS: PASSIVE VS ACTIVE VOICE
1032
+ // Tests same meaning in different grammatical structures
1033
+ // ============================================================================
1034
/**
 * Active-vs-passive voice paraphrases, all expected to match.
 *
 * Rows are [id, category, text1, text2, toolName, reasoning].
 */
export const PARAPHRASE_VOICE_CASES = [
    ['para-voice-001', 'security', 'Attackers can inject SQL commands', 'SQL commands can be injected by attackers', 'search_db', 'Active vs passive voice, same meaning'],
    ['para-voice-002', 'security', 'The server validates all input', 'All input is validated by the server', 'api_endpoint', 'Active vs passive voice'],
    ['para-voice-003', 'limitation', 'The system enforces rate limits', 'Rate limits are enforced by the system', 'api_call', 'Active vs passive voice'],
].map(([id, category, text1, text2, toolName, reasoning]) => ({
    id,
    category,
    text1,
    text2,
    toolName,
    expectedMatch: true,
    reasoning,
    source: 'llm-generated',
    tags: ['paraphrase', 'voice'],
}));
1069
+ // ============================================================================
1070
+ // PARAPHRASE ROBUSTNESS: INFORMAL VS FORMAL
1071
+ // Tests same meaning in different registers
1072
+ // ============================================================================
1073
/**
 * Informal-vs-formal register paraphrases, all expected to match.
 *
 * Rows are [id, category, text1, text2, toolName, reasoning].
 */
export const PARAPHRASE_REGISTER_CASES = [
    ['para-reg-001', 'security', 'The file path check is broken', 'Path validation mechanism contains a vulnerability', 'read_file', 'Informal vs formal description of same issue'],
    ['para-reg-002', 'limitation', 'Cant upload files bigger than 10MB', 'File uploads are restricted to a maximum size of 10 megabytes', 'upload_file', 'Informal vs formal, same constraint'],
    ['para-reg-003', 'assertion', 'Blows up if you pass null', 'Raises an exception when null is provided as input', 'process_data', 'Informal vs formal description of error behavior'],
].map(([id, category, text1, text2, toolName, reasoning]) => ({
    id,
    category,
    text1,
    text2,
    toolName,
    expectedMatch: true,
    reasoning,
    source: 'llm-generated',
    tags: ['paraphrase', 'register'],
}));
1108
+ // ============================================================================
1109
+ // COMBINED EXPORT
1110
+ // Aggregates all expanded test cases
1111
+ // ============================================================================
1112
/**
 * All expanded test cases as one flat array, concatenated in the order the
 * per-topic arrays are listed here.
 */
export const EXPANDED_TEST_CASES = [].concat(
    PATH_TRAVERSAL_CASES,
    SQL_INJECTION_CASES,
    XSS_CASES,
    COMMAND_INJECTION_CASES,
    SSRF_CASES,
    AUTH_CASES,
    SIZE_LIMIT_CASES,
    RATE_LIMIT_CASES,
    TIMEOUT_CASES,
    FORMAT_CASES,
    ASSERTION_CASES,
    NEGATION_CASES,
    SEVERITY_CASES,
    SIMILAR_DIFFERENT_CASES,
    PARAPHRASE_TECHNICAL_CASES,
    PARAPHRASE_VOICE_CASES,
    PARAPHRASE_REGISTER_CASES,
);
1131
/**
 * Get statistics about a set of test cases (the expanded dataset by default).
 *
 * @param {Array<{category: string, expectedMatch: boolean, tags?: string[]}>} [testCases=EXPANDED_TEST_CASES]
 *   Cases to summarise. Omit to summarise the full expanded dataset.
 * @returns {{totalCases: number, byCategory: Object<string, number>, byTag: Object<string, number>, truePositives: number, trueNegatives: number}}
 *   Total case count, per-category and per-tag counts, and how many cases
 *   expect a match (truePositives) versus no match (trueNegatives).
 */
export function getExpandedDatasetStatistics(testCases = EXPANDED_TEST_CASES) {
    // Tally in Maps rather than plain objects: with `{}` a key such as
    // 'constructor' or 'toString' would read the inherited Object.prototype
    // member as the "previous count" and corrupt the result.
    const categoryCounts = new Map();
    const tagCounts = new Map();
    const bump = (counts, key) => {
        // Keys are stringified so they collapse exactly as object property
        // names would in the returned records.
        const name = String(key);
        counts.set(name, (counts.get(name) || 0) + 1);
    };
    let truePositives = 0;
    let trueNegatives = 0;
    for (const tc of testCases) {
        // Count by category
        bump(categoryCounts, tc.category);
        // Count by tags (tags are optional)
        if (tc.tags) {
            for (const tag of tc.tags) {
                bump(tagCounts, tag);
            }
        }
        // Count TP/TN
        if (tc.expectedMatch) {
            truePositives++;
        }
        else {
            trueNegatives++;
        }
    }
    return {
        totalCases: testCases.length,
        // Object.fromEntries defines own data properties, so the returned
        // records stay plain objects with correct counts for every key.
        byCategory: Object.fromEntries(categoryCounts),
        byTag: Object.fromEntries(tagCounts),
        truePositives,
        trueNegatives,
    };
}
1164
+ //# sourceMappingURL=expanded-dataset.js.map