@dotsetlabs/bellwether 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (403) hide show
  1. package/CHANGELOG.md +291 -0
  2. package/LICENSE +21 -0
  3. package/README.md +739 -0
  4. package/dist/auth/credentials.d.ts +64 -0
  5. package/dist/auth/credentials.js +218 -0
  6. package/dist/auth/index.d.ts +6 -0
  7. package/dist/auth/index.js +6 -0
  8. package/dist/auth/keychain.d.ts +64 -0
  9. package/dist/auth/keychain.js +268 -0
  10. package/dist/baseline/ab-testing.d.ts +80 -0
  11. package/dist/baseline/ab-testing.js +236 -0
  12. package/dist/baseline/ai-compatibility-scorer.d.ts +95 -0
  13. package/dist/baseline/ai-compatibility-scorer.js +606 -0
  14. package/dist/baseline/calibration.d.ts +77 -0
  15. package/dist/baseline/calibration.js +136 -0
  16. package/dist/baseline/category-matching.d.ts +85 -0
  17. package/dist/baseline/category-matching.js +289 -0
  18. package/dist/baseline/change-impact-analyzer.d.ts +98 -0
  19. package/dist/baseline/change-impact-analyzer.js +592 -0
  20. package/dist/baseline/comparator.d.ts +64 -0
  21. package/dist/baseline/comparator.js +916 -0
  22. package/dist/baseline/confidence.d.ts +55 -0
  23. package/dist/baseline/confidence.js +122 -0
  24. package/dist/baseline/converter.d.ts +61 -0
  25. package/dist/baseline/converter.js +585 -0
  26. package/dist/baseline/dependency-analyzer.d.ts +89 -0
  27. package/dist/baseline/dependency-analyzer.js +567 -0
  28. package/dist/baseline/deprecation-tracker.d.ts +133 -0
  29. package/dist/baseline/deprecation-tracker.js +322 -0
  30. package/dist/baseline/diff.d.ts +55 -0
  31. package/dist/baseline/diff.js +1584 -0
  32. package/dist/baseline/documentation-scorer.d.ts +205 -0
  33. package/dist/baseline/documentation-scorer.js +466 -0
  34. package/dist/baseline/embeddings.d.ts +118 -0
  35. package/dist/baseline/embeddings.js +251 -0
  36. package/dist/baseline/error-analyzer.d.ts +198 -0
  37. package/dist/baseline/error-analyzer.js +721 -0
  38. package/dist/baseline/evaluation/evaluator.d.ts +42 -0
  39. package/dist/baseline/evaluation/evaluator.js +323 -0
  40. package/dist/baseline/evaluation/expanded-dataset.d.ts +45 -0
  41. package/dist/baseline/evaluation/expanded-dataset.js +1164 -0
  42. package/dist/baseline/evaluation/golden-dataset.d.ts +58 -0
  43. package/dist/baseline/evaluation/golden-dataset.js +717 -0
  44. package/dist/baseline/evaluation/index.d.ts +15 -0
  45. package/dist/baseline/evaluation/index.js +15 -0
  46. package/dist/baseline/evaluation/types.d.ts +186 -0
  47. package/dist/baseline/evaluation/types.js +8 -0
  48. package/dist/baseline/external-dependency-detector.d.ts +181 -0
  49. package/dist/baseline/external-dependency-detector.js +524 -0
  50. package/dist/baseline/golden-output.d.ts +162 -0
  51. package/dist/baseline/golden-output.js +636 -0
  52. package/dist/baseline/health-scorer.d.ts +174 -0
  53. package/dist/baseline/health-scorer.js +451 -0
  54. package/dist/baseline/incremental-checker.d.ts +97 -0
  55. package/dist/baseline/incremental-checker.js +174 -0
  56. package/dist/baseline/index.d.ts +31 -0
  57. package/dist/baseline/index.js +42 -0
  58. package/dist/baseline/migration-generator.d.ts +137 -0
  59. package/dist/baseline/migration-generator.js +554 -0
  60. package/dist/baseline/migrations.d.ts +60 -0
  61. package/dist/baseline/migrations.js +197 -0
  62. package/dist/baseline/performance-tracker.d.ts +214 -0
  63. package/dist/baseline/performance-tracker.js +577 -0
  64. package/dist/baseline/pr-comment-generator.d.ts +117 -0
  65. package/dist/baseline/pr-comment-generator.js +546 -0
  66. package/dist/baseline/response-fingerprint.d.ts +127 -0
  67. package/dist/baseline/response-fingerprint.js +728 -0
  68. package/dist/baseline/response-schema-tracker.d.ts +129 -0
  69. package/dist/baseline/response-schema-tracker.js +420 -0
  70. package/dist/baseline/risk-scorer.d.ts +54 -0
  71. package/dist/baseline/risk-scorer.js +434 -0
  72. package/dist/baseline/saver.d.ts +89 -0
  73. package/dist/baseline/saver.js +554 -0
  74. package/dist/baseline/scenario-generator.d.ts +151 -0
  75. package/dist/baseline/scenario-generator.js +905 -0
  76. package/dist/baseline/schema-compare.d.ts +86 -0
  77. package/dist/baseline/schema-compare.js +557 -0
  78. package/dist/baseline/schema-evolution.d.ts +189 -0
  79. package/dist/baseline/schema-evolution.js +467 -0
  80. package/dist/baseline/semantic.d.ts +203 -0
  81. package/dist/baseline/semantic.js +908 -0
  82. package/dist/baseline/synonyms.d.ts +60 -0
  83. package/dist/baseline/synonyms.js +386 -0
  84. package/dist/baseline/telemetry.d.ts +165 -0
  85. package/dist/baseline/telemetry.js +294 -0
  86. package/dist/baseline/test-pruner.d.ts +120 -0
  87. package/dist/baseline/test-pruner.js +387 -0
  88. package/dist/baseline/types.d.ts +449 -0
  89. package/dist/baseline/types.js +5 -0
  90. package/dist/baseline/version.d.ts +138 -0
  91. package/dist/baseline/version.js +206 -0
  92. package/dist/cache/index.d.ts +5 -0
  93. package/dist/cache/index.js +5 -0
  94. package/dist/cache/response-cache.d.ts +151 -0
  95. package/dist/cache/response-cache.js +287 -0
  96. package/dist/ci/index.d.ts +60 -0
  97. package/dist/ci/index.js +342 -0
  98. package/dist/cli/commands/auth.d.ts +12 -0
  99. package/dist/cli/commands/auth.js +352 -0
  100. package/dist/cli/commands/badge.d.ts +3 -0
  101. package/dist/cli/commands/badge.js +74 -0
  102. package/dist/cli/commands/baseline-accept.d.ts +15 -0
  103. package/dist/cli/commands/baseline-accept.js +178 -0
  104. package/dist/cli/commands/baseline-migrate.d.ts +12 -0
  105. package/dist/cli/commands/baseline-migrate.js +164 -0
  106. package/dist/cli/commands/baseline.d.ts +14 -0
  107. package/dist/cli/commands/baseline.js +449 -0
  108. package/dist/cli/commands/beta.d.ts +10 -0
  109. package/dist/cli/commands/beta.js +231 -0
  110. package/dist/cli/commands/check.d.ts +11 -0
  111. package/dist/cli/commands/check.js +820 -0
  112. package/dist/cli/commands/cloud/badge.d.ts +3 -0
  113. package/dist/cli/commands/cloud/badge.js +74 -0
  114. package/dist/cli/commands/cloud/diff.d.ts +6 -0
  115. package/dist/cli/commands/cloud/diff.js +79 -0
  116. package/dist/cli/commands/cloud/history.d.ts +6 -0
  117. package/dist/cli/commands/cloud/history.js +102 -0
  118. package/dist/cli/commands/cloud/link.d.ts +9 -0
  119. package/dist/cli/commands/cloud/link.js +119 -0
  120. package/dist/cli/commands/cloud/login.d.ts +7 -0
  121. package/dist/cli/commands/cloud/login.js +499 -0
  122. package/dist/cli/commands/cloud/projects.d.ts +6 -0
  123. package/dist/cli/commands/cloud/projects.js +44 -0
  124. package/dist/cli/commands/cloud/shared.d.ts +7 -0
  125. package/dist/cli/commands/cloud/shared.js +42 -0
  126. package/dist/cli/commands/cloud/teams.d.ts +8 -0
  127. package/dist/cli/commands/cloud/teams.js +169 -0
  128. package/dist/cli/commands/cloud/upload.d.ts +8 -0
  129. package/dist/cli/commands/cloud/upload.js +181 -0
  130. package/dist/cli/commands/contract.d.ts +11 -0
  131. package/dist/cli/commands/contract.js +280 -0
  132. package/dist/cli/commands/discover.d.ts +3 -0
  133. package/dist/cli/commands/discover.js +82 -0
  134. package/dist/cli/commands/eval.d.ts +9 -0
  135. package/dist/cli/commands/eval.js +187 -0
  136. package/dist/cli/commands/explore.d.ts +11 -0
  137. package/dist/cli/commands/explore.js +437 -0
  138. package/dist/cli/commands/feedback.d.ts +9 -0
  139. package/dist/cli/commands/feedback.js +174 -0
  140. package/dist/cli/commands/golden.d.ts +12 -0
  141. package/dist/cli/commands/golden.js +407 -0
  142. package/dist/cli/commands/history.d.ts +10 -0
  143. package/dist/cli/commands/history.js +202 -0
  144. package/dist/cli/commands/init.d.ts +9 -0
  145. package/dist/cli/commands/init.js +219 -0
  146. package/dist/cli/commands/interview.d.ts +3 -0
  147. package/dist/cli/commands/interview.js +903 -0
  148. package/dist/cli/commands/link.d.ts +10 -0
  149. package/dist/cli/commands/link.js +169 -0
  150. package/dist/cli/commands/login.d.ts +7 -0
  151. package/dist/cli/commands/login.js +499 -0
  152. package/dist/cli/commands/preset.d.ts +33 -0
  153. package/dist/cli/commands/preset.js +297 -0
  154. package/dist/cli/commands/profile.d.ts +33 -0
  155. package/dist/cli/commands/profile.js +286 -0
  156. package/dist/cli/commands/registry.d.ts +11 -0
  157. package/dist/cli/commands/registry.js +146 -0
  158. package/dist/cli/commands/shared.d.ts +79 -0
  159. package/dist/cli/commands/shared.js +196 -0
  160. package/dist/cli/commands/teams.d.ts +8 -0
  161. package/dist/cli/commands/teams.js +169 -0
  162. package/dist/cli/commands/test.d.ts +9 -0
  163. package/dist/cli/commands/test.js +500 -0
  164. package/dist/cli/commands/upload.d.ts +8 -0
  165. package/dist/cli/commands/upload.js +223 -0
  166. package/dist/cli/commands/validate-config.d.ts +6 -0
  167. package/dist/cli/commands/validate-config.js +35 -0
  168. package/dist/cli/commands/verify.d.ts +11 -0
  169. package/dist/cli/commands/verify.js +283 -0
  170. package/dist/cli/commands/watch.d.ts +12 -0
  171. package/dist/cli/commands/watch.js +253 -0
  172. package/dist/cli/index.d.ts +3 -0
  173. package/dist/cli/index.js +178 -0
  174. package/dist/cli/interactive.d.ts +47 -0
  175. package/dist/cli/interactive.js +216 -0
  176. package/dist/cli/output/terminal-reporter.d.ts +19 -0
  177. package/dist/cli/output/terminal-reporter.js +104 -0
  178. package/dist/cli/output.d.ts +226 -0
  179. package/dist/cli/output.js +438 -0
  180. package/dist/cli/utils/env.d.ts +5 -0
  181. package/dist/cli/utils/env.js +14 -0
  182. package/dist/cli/utils/progress.d.ts +59 -0
  183. package/dist/cli/utils/progress.js +206 -0
  184. package/dist/cli/utils/server-context.d.ts +10 -0
  185. package/dist/cli/utils/server-context.js +36 -0
  186. package/dist/cloud/auth.d.ts +144 -0
  187. package/dist/cloud/auth.js +374 -0
  188. package/dist/cloud/client.d.ts +24 -0
  189. package/dist/cloud/client.js +65 -0
  190. package/dist/cloud/http-client.d.ts +38 -0
  191. package/dist/cloud/http-client.js +215 -0
  192. package/dist/cloud/index.d.ts +23 -0
  193. package/dist/cloud/index.js +25 -0
  194. package/dist/cloud/mock-client.d.ts +107 -0
  195. package/dist/cloud/mock-client.js +545 -0
  196. package/dist/cloud/types.d.ts +515 -0
  197. package/dist/cloud/types.js +15 -0
  198. package/dist/config/defaults.d.ts +160 -0
  199. package/dist/config/defaults.js +169 -0
  200. package/dist/config/loader.d.ts +24 -0
  201. package/dist/config/loader.js +122 -0
  202. package/dist/config/template.d.ts +42 -0
  203. package/dist/config/template.js +647 -0
  204. package/dist/config/validator.d.ts +2112 -0
  205. package/dist/config/validator.js +658 -0
  206. package/dist/constants/cloud.d.ts +107 -0
  207. package/dist/constants/cloud.js +110 -0
  208. package/dist/constants/core.d.ts +521 -0
  209. package/dist/constants/core.js +556 -0
  210. package/dist/constants/testing.d.ts +1283 -0
  211. package/dist/constants/testing.js +1568 -0
  212. package/dist/constants.d.ts +10 -0
  213. package/dist/constants.js +10 -0
  214. package/dist/contract/index.d.ts +6 -0
  215. package/dist/contract/index.js +5 -0
  216. package/dist/contract/validator.d.ts +177 -0
  217. package/dist/contract/validator.js +574 -0
  218. package/dist/cost/index.d.ts +6 -0
  219. package/dist/cost/index.js +5 -0
  220. package/dist/cost/tracker.d.ts +134 -0
  221. package/dist/cost/tracker.js +313 -0
  222. package/dist/discovery/discovery.d.ts +16 -0
  223. package/dist/discovery/discovery.js +173 -0
  224. package/dist/discovery/types.d.ts +51 -0
  225. package/dist/discovery/types.js +2 -0
  226. package/dist/docs/agents.d.ts +3 -0
  227. package/dist/docs/agents.js +995 -0
  228. package/dist/docs/contract.d.ts +51 -0
  229. package/dist/docs/contract.js +1681 -0
  230. package/dist/docs/generator.d.ts +4 -0
  231. package/dist/docs/generator.js +4 -0
  232. package/dist/docs/html-reporter.d.ts +9 -0
  233. package/dist/docs/html-reporter.js +757 -0
  234. package/dist/docs/index.d.ts +10 -0
  235. package/dist/docs/index.js +11 -0
  236. package/dist/docs/junit-reporter.d.ts +18 -0
  237. package/dist/docs/junit-reporter.js +210 -0
  238. package/dist/docs/report.d.ts +14 -0
  239. package/dist/docs/report.js +44 -0
  240. package/dist/docs/sarif-reporter.d.ts +19 -0
  241. package/dist/docs/sarif-reporter.js +335 -0
  242. package/dist/docs/shared.d.ts +35 -0
  243. package/dist/docs/shared.js +162 -0
  244. package/dist/docs/templates.d.ts +12 -0
  245. package/dist/docs/templates.js +76 -0
  246. package/dist/errors/index.d.ts +6 -0
  247. package/dist/errors/index.js +6 -0
  248. package/dist/errors/retry.d.ts +92 -0
  249. package/dist/errors/retry.js +323 -0
  250. package/dist/errors/types.d.ts +321 -0
  251. package/dist/errors/types.js +584 -0
  252. package/dist/index.d.ts +32 -0
  253. package/dist/index.js +32 -0
  254. package/dist/interview/dependency-resolver.d.ts +11 -0
  255. package/dist/interview/dependency-resolver.js +32 -0
  256. package/dist/interview/interviewer.d.ts +232 -0
  257. package/dist/interview/interviewer.js +1939 -0
  258. package/dist/interview/mock-response-generator.d.ts +7 -0
  259. package/dist/interview/mock-response-generator.js +102 -0
  260. package/dist/interview/orchestrator.d.ts +237 -0
  261. package/dist/interview/orchestrator.js +1296 -0
  262. package/dist/interview/rate-limiter.d.ts +15 -0
  263. package/dist/interview/rate-limiter.js +55 -0
  264. package/dist/interview/response-validator.d.ts +10 -0
  265. package/dist/interview/response-validator.js +132 -0
  266. package/dist/interview/schema-inferrer.d.ts +8 -0
  267. package/dist/interview/schema-inferrer.js +71 -0
  268. package/dist/interview/schema-test-generator.d.ts +71 -0
  269. package/dist/interview/schema-test-generator.js +834 -0
  270. package/dist/interview/smart-value-generator.d.ts +155 -0
  271. package/dist/interview/smart-value-generator.js +554 -0
  272. package/dist/interview/stateful-test-runner.d.ts +19 -0
  273. package/dist/interview/stateful-test-runner.js +106 -0
  274. package/dist/interview/types.d.ts +561 -0
  275. package/dist/interview/types.js +2 -0
  276. package/dist/llm/anthropic.d.ts +41 -0
  277. package/dist/llm/anthropic.js +355 -0
  278. package/dist/llm/client.d.ts +123 -0
  279. package/dist/llm/client.js +42 -0
  280. package/dist/llm/factory.d.ts +38 -0
  281. package/dist/llm/factory.js +145 -0
  282. package/dist/llm/fallback.d.ts +140 -0
  283. package/dist/llm/fallback.js +379 -0
  284. package/dist/llm/index.d.ts +18 -0
  285. package/dist/llm/index.js +15 -0
  286. package/dist/llm/ollama.d.ts +37 -0
  287. package/dist/llm/ollama.js +330 -0
  288. package/dist/llm/openai.d.ts +25 -0
  289. package/dist/llm/openai.js +320 -0
  290. package/dist/llm/token-budget.d.ts +161 -0
  291. package/dist/llm/token-budget.js +395 -0
  292. package/dist/logging/logger.d.ts +70 -0
  293. package/dist/logging/logger.js +130 -0
  294. package/dist/metrics/collector.d.ts +106 -0
  295. package/dist/metrics/collector.js +547 -0
  296. package/dist/metrics/index.d.ts +7 -0
  297. package/dist/metrics/index.js +7 -0
  298. package/dist/metrics/prometheus.d.ts +20 -0
  299. package/dist/metrics/prometheus.js +241 -0
  300. package/dist/metrics/types.d.ts +209 -0
  301. package/dist/metrics/types.js +5 -0
  302. package/dist/persona/builtins.d.ts +54 -0
  303. package/dist/persona/builtins.js +219 -0
  304. package/dist/persona/index.d.ts +8 -0
  305. package/dist/persona/index.js +8 -0
  306. package/dist/persona/loader.d.ts +30 -0
  307. package/dist/persona/loader.js +190 -0
  308. package/dist/persona/types.d.ts +144 -0
  309. package/dist/persona/types.js +5 -0
  310. package/dist/persona/validation.d.ts +94 -0
  311. package/dist/persona/validation.js +332 -0
  312. package/dist/prompts/index.d.ts +5 -0
  313. package/dist/prompts/index.js +5 -0
  314. package/dist/prompts/templates.d.ts +180 -0
  315. package/dist/prompts/templates.js +431 -0
  316. package/dist/registry/client.d.ts +49 -0
  317. package/dist/registry/client.js +191 -0
  318. package/dist/registry/index.d.ts +7 -0
  319. package/dist/registry/index.js +6 -0
  320. package/dist/registry/types.d.ts +140 -0
  321. package/dist/registry/types.js +6 -0
  322. package/dist/scenarios/evaluator.d.ts +43 -0
  323. package/dist/scenarios/evaluator.js +206 -0
  324. package/dist/scenarios/index.d.ts +10 -0
  325. package/dist/scenarios/index.js +9 -0
  326. package/dist/scenarios/loader.d.ts +20 -0
  327. package/dist/scenarios/loader.js +285 -0
  328. package/dist/scenarios/types.d.ts +153 -0
  329. package/dist/scenarios/types.js +8 -0
  330. package/dist/security/index.d.ts +17 -0
  331. package/dist/security/index.js +18 -0
  332. package/dist/security/payloads.d.ts +61 -0
  333. package/dist/security/payloads.js +268 -0
  334. package/dist/security/security-tester.d.ts +42 -0
  335. package/dist/security/security-tester.js +582 -0
  336. package/dist/security/types.d.ts +166 -0
  337. package/dist/security/types.js +8 -0
  338. package/dist/transport/base-transport.d.ts +59 -0
  339. package/dist/transport/base-transport.js +38 -0
  340. package/dist/transport/http-transport.d.ts +67 -0
  341. package/dist/transport/http-transport.js +238 -0
  342. package/dist/transport/mcp-client.d.ts +141 -0
  343. package/dist/transport/mcp-client.js +496 -0
  344. package/dist/transport/sse-transport.d.ts +88 -0
  345. package/dist/transport/sse-transport.js +316 -0
  346. package/dist/transport/stdio-transport.d.ts +43 -0
  347. package/dist/transport/stdio-transport.js +238 -0
  348. package/dist/transport/types.d.ts +125 -0
  349. package/dist/transport/types.js +16 -0
  350. package/dist/utils/concurrency.d.ts +123 -0
  351. package/dist/utils/concurrency.js +213 -0
  352. package/dist/utils/formatters.d.ts +16 -0
  353. package/dist/utils/formatters.js +37 -0
  354. package/dist/utils/index.d.ts +8 -0
  355. package/dist/utils/index.js +8 -0
  356. package/dist/utils/jsonpath.d.ts +87 -0
  357. package/dist/utils/jsonpath.js +326 -0
  358. package/dist/utils/markdown.d.ts +113 -0
  359. package/dist/utils/markdown.js +265 -0
  360. package/dist/utils/network.d.ts +14 -0
  361. package/dist/utils/network.js +17 -0
  362. package/dist/utils/sanitize.d.ts +92 -0
  363. package/dist/utils/sanitize.js +191 -0
  364. package/dist/utils/semantic.d.ts +194 -0
  365. package/dist/utils/semantic.js +1051 -0
  366. package/dist/utils/smart-truncate.d.ts +94 -0
  367. package/dist/utils/smart-truncate.js +361 -0
  368. package/dist/utils/timeout.d.ts +153 -0
  369. package/dist/utils/timeout.js +205 -0
  370. package/dist/utils/yaml-parser.d.ts +58 -0
  371. package/dist/utils/yaml-parser.js +86 -0
  372. package/dist/validation/index.d.ts +32 -0
  373. package/dist/validation/index.js +32 -0
  374. package/dist/validation/semantic-test-generator.d.ts +50 -0
  375. package/dist/validation/semantic-test-generator.js +176 -0
  376. package/dist/validation/semantic-types.d.ts +66 -0
  377. package/dist/validation/semantic-types.js +94 -0
  378. package/dist/validation/semantic-validator.d.ts +38 -0
  379. package/dist/validation/semantic-validator.js +340 -0
  380. package/dist/verification/index.d.ts +6 -0
  381. package/dist/verification/index.js +5 -0
  382. package/dist/verification/types.d.ts +133 -0
  383. package/dist/verification/types.js +5 -0
  384. package/dist/verification/verifier.d.ts +30 -0
  385. package/dist/verification/verifier.js +309 -0
  386. package/dist/version.d.ts +19 -0
  387. package/dist/version.js +48 -0
  388. package/dist/workflow/auto-generator.d.ts +27 -0
  389. package/dist/workflow/auto-generator.js +513 -0
  390. package/dist/workflow/discovery.d.ts +40 -0
  391. package/dist/workflow/discovery.js +195 -0
  392. package/dist/workflow/executor.d.ts +82 -0
  393. package/dist/workflow/executor.js +611 -0
  394. package/dist/workflow/index.d.ts +10 -0
  395. package/dist/workflow/index.js +10 -0
  396. package/dist/workflow/loader.d.ts +24 -0
  397. package/dist/workflow/loader.js +194 -0
  398. package/dist/workflow/state-tracker.d.ts +98 -0
  399. package/dist/workflow/state-tracker.js +424 -0
  400. package/dist/workflow/types.d.ts +337 -0
  401. package/dist/workflow/types.js +5 -0
  402. package/package.json +94 -0
  403. package/schemas/bellwether-check.schema.json +651 -0
@@ -0,0 +1,42 @@
1
+ /**
2
+ * Evaluation Framework for Drift Detection
3
+ *
4
+ * Runs semantic comparison algorithms against the golden dataset
5
+ * and produces accuracy metrics including precision, recall, F1,
6
+ * and confidence calibration analysis.
7
+ */
8
+ import type { EvaluationResult, EvaluationSummary, EvaluationOptions, SemanticComparator } from './types.js';
9
+ import { GOLDEN_DATASET, getDatasetStatistics } from './golden-dataset.js';
10
+ /**
11
+ * Default semantic comparator using existing implementation. `compare` reports whether the two texts match, a numeric confidence, and optionally the weighted factors behind that confidence; `category` selects which kind of comparison is applied.
12
+ */
13
+ export declare class DefaultSemanticComparator implements SemanticComparator {
14
+ compare(text1: string, text2: string, toolName: string, category: 'security' | 'limitation' | 'assertion'): {
15
+ matches: boolean;
16
+ confidence: number;
17
+ factors?: Array<{
18
+ name: string;
19
+ weight: number;
20
+ value: number;
21
+ description: string;
22
+ }>;
23
+ };
24
+ }
25
+ /**
26
+ * Run full evaluation against golden dataset. Both parameters are optional: the implementation defaults `options` to `{}` and `comparator` to a new DefaultSemanticComparator.
27
+ */
28
+ export declare function evaluate(options?: EvaluationOptions, comparator?: SemanticComparator): EvaluationResult;
29
+ /**
30
+ * Create a summary for display. Percentage metrics are rendered as strings with a trailing `%`, alongside the total, passed and failed case counts.
31
+ */
32
+ export declare function createSummary(result: EvaluationResult): EvaluationSummary;
33
+ /**
34
+ * Format evaluation result for console output; the formatted report is returned as a string.
35
+ */
36
+ export declare function formatEvaluationReport(result: EvaluationResult): string;
37
+ /**
38
+ * Export results as JSON for external analysis; the JSON text is returned as a string.
39
+ */
40
+ export declare function exportResultsAsJson(result: EvaluationResult): string;
41
+ export { GOLDEN_DATASET, getDatasetStatistics };
42
+ //# sourceMappingURL=evaluator.d.ts.map
@@ -0,0 +1,323 @@
1
+ /**
2
+ * Evaluation Framework for Drift Detection
3
+ *
4
+ * Runs semantic comparison algorithms against the golden dataset
5
+ * and produces accuracy metrics including precision, recall, F1,
6
+ * and confidence calibration analysis.
7
+ */
8
+ import { GOLDEN_DATASET, DATASET_VERSION, getDatasetStatistics } from './golden-dataset.js';
9
+ import { structureSecurityNotes, structureLimitations, securityFindingsMatchWithConfidence, limitationsMatchWithConfidence, assertionsMatchWithConfidence, createFingerprint, } from '../semantic.js';
/**
 * Comparator backed by the matching helpers imported from `../semantic.js`.
 */
export class DefaultSemanticComparator {
    /**
     * Compare two free-text statements about a tool.
     *
     * @param text1 First statement.
     * @param text2 Second statement.
     * @param toolName Tool both statements refer to.
     * @param category `'security'` or `'limitation'` select the structured
     *   matchers; any other value is handled as an assertion.
     * @returns `{ matches, confidence }`, plus `factors` whenever one of the
     *   confidence-producing matchers was used.
     */
    compare(text1, text2, toolName, category) {
        // Used when a text cannot be structured: fall back to exact equality,
        // reported with full (100) or no (0) confidence.
        const exactTextVerdict = () => {
            const identical = text1 === text2;
            return { matches: identical, confidence: identical ? 100 : 0 };
        };
        // Flatten a matcher result into the comparator's return shape.
        const flatten = (outcome) => ({
            matches: outcome.matches,
            confidence: outcome.confidence.score,
            factors: outcome.confidence.factors,
        });
        // Wrap raw text as a 'behavior' assertion for the assertion matcher.
        const toAssertion = (description) => {
            const fingerprint = createFingerprint(toolName, 'behavior', description);
            const lowered = description.toLowerCase();
            return {
                tool: toolName,
                aspect: 'behavior',
                fingerprint,
                description,
                // Plain substring heuristic: any "not " or "no " marks the text as negative.
                isPositive: !(lowered.includes('not ') || lowered.includes('no ')),
            };
        };
        switch (category) {
            case 'security': {
                const left = structureSecurityNotes(toolName, [text1]);
                const right = structureSecurityNotes(toolName, [text2]);
                if (left.length === 0 || right.length === 0) {
                    return exactTextVerdict();
                }
                // Only the first structured finding of each side is compared.
                return flatten(securityFindingsMatchWithConfidence(left[0], right[0]));
            }
            case 'limitation': {
                const left = structureLimitations(toolName, [text1]);
                const right = structureLimitations(toolName, [text2]);
                if (left.length === 0 || right.length === 0) {
                    return exactTextVerdict();
                }
                // Only the first structured limitation of each side is compared.
                return flatten(limitationsMatchWithConfidence(left[0], right[0]));
            }
            default:
                // Assertion comparison using normalized assertions with qualifier checking
                return flatten(assertionsMatchWithConfidence(toAssertion(text1), toAssertion(text2)));
        }
    }
}
64
+ /**
65
+ * Run a single test case.
66
+ */
67
+ function runTestCase(testCase, comparator, options) {
68
+ const startTime = performance.now();
69
+ const result = comparator.compare(testCase.text1, testCase.text2, testCase.toolName, testCase.category);
70
+ const durationMs = performance.now() - startTime;
71
+ // Determine if test passed
72
+ const matchCorrect = result.matches === testCase.expectedMatch;
73
+ let confidenceCorrect = true;
74
+ if (testCase.expectedConfidence && matchCorrect) {
75
+ confidenceCorrect =
76
+ result.confidence >= testCase.expectedConfidence.min &&
77
+ result.confidence <= testCase.expectedConfidence.max;
78
+ }
79
+ const passed = matchCorrect && confidenceCorrect;
80
+ // Determine failure type
81
+ let failureType;
82
+ if (!passed) {
83
+ if (!matchCorrect) {
84
+ failureType = testCase.expectedMatch ? 'false_negative' : 'false_positive';
85
+ }
86
+ else {
87
+ failureType = 'confidence_out_of_range';
88
+ }
89
+ }
90
+ return {
91
+ testCase,
92
+ actualMatch: result.matches,
93
+ actualConfidence: result.confidence,
94
+ passed,
95
+ failureType,
96
+ durationMs,
97
+ confidenceFactors: options.includeFactors ? result.factors : undefined,
98
+ };
99
+ }
100
+ /**
101
+ * Calculate metrics for a category.
102
+ */
103
+ function calculateCategoryMetrics(categoryName, results) {
104
+ const tp = results.filter((r) => r.actualMatch && r.testCase.expectedMatch).length;
105
+ const tn = results.filter((r) => !r.actualMatch && !r.testCase.expectedMatch).length;
106
+ const fp = results.filter((r) => r.actualMatch && !r.testCase.expectedMatch).length;
107
+ const fn = results.filter((r) => !r.actualMatch && r.testCase.expectedMatch).length;
108
+ const total = results.length;
109
+ const accuracy = total > 0 ? (tp + tn) / total : 0;
110
+ const precision = tp + fp > 0 ? tp / (tp + fp) : 0;
111
+ const recall = tp + fn > 0 ? tp / (tp + fn) : 0;
112
+ const f1Score = precision + recall > 0 ? (2 * precision * recall) / (precision + recall) : 0;
113
+ return {
114
+ category: categoryName,
115
+ totalCases: total,
116
+ accuracy: Math.round(accuracy * 1000) / 10,
117
+ precision: Math.round(precision * 1000) / 10,
118
+ recall: Math.round(recall * 1000) / 10,
119
+ f1Score: Math.round(f1Score * 1000) / 10,
120
+ truePositives: tp,
121
+ trueNegatives: tn,
122
+ falsePositives: fp,
123
+ falseNegatives: fn,
124
+ };
125
+ }
126
+ /**
127
+ * Calculate calibration buckets.
128
+ */
129
+ function calculateCalibrationBuckets(results) {
130
+ const bucketRanges = [
131
+ { min: 90, max: 100 },
132
+ { min: 80, max: 90 },
133
+ { min: 70, max: 80 },
134
+ { min: 60, max: 70 },
135
+ { min: 50, max: 60 },
136
+ { min: 0, max: 50 },
137
+ ];
138
+ return bucketRanges.map((range) => {
139
+ const bucketResults = results.filter((r) => r.actualConfidence >= range.min && r.actualConfidence < range.max);
140
+ if (bucketResults.length === 0) {
141
+ return {
142
+ predictedRange: range,
143
+ actualAccuracy: 0,
144
+ sampleCount: 0,
145
+ calibrationError: 0,
146
+ };
147
+ }
148
+ const correct = bucketResults.filter((r) => r.actualMatch === r.testCase.expectedMatch).length;
149
+ const actualAccuracy = (correct / bucketResults.length) * 100;
150
+ const midpoint = (range.min + range.max) / 2;
151
+ const calibrationError = Math.abs(midpoint - actualAccuracy);
152
+ return {
153
+ predictedRange: range,
154
+ actualAccuracy: Math.round(actualAccuracy * 10) / 10,
155
+ sampleCount: bucketResults.length,
156
+ calibrationError: Math.round(calibrationError * 10) / 10,
157
+ };
158
+ });
159
+ }
160
+ /**
161
+ * Calculate Brier score for confidence calibration.
162
+ */
163
+ function calculateBrierScore(results) {
164
+ if (results.length === 0)
165
+ return 0;
166
+ const sumSquaredError = results.reduce((sum, r) => {
167
+ const predicted = r.actualConfidence / 100;
168
+ const actual = r.actualMatch === r.testCase.expectedMatch ? 1 : 0;
169
+ return sum + Math.pow(predicted - actual, 2);
170
+ }, 0);
171
+ return Math.round((sumSquaredError / results.length) * 1000) / 1000;
172
+ }
/**
 * Run full evaluation against golden dataset.
 *
 * The dataset is optionally narrowed by `options.categories` and
 * `options.tags` (a case is kept when it carries at least one requested tag),
 * every remaining case is graded with `comparator`, and the graded results are
 * aggregated into overall metrics, calibration figures and per-category
 * metrics.
 *
 * @param options Evaluation options; an empty or missing `categories` / `tags`
 *   list applies no filter. The same object is handed to each test-case run.
 * @param comparator Comparator used for every case; a
 *   `DefaultSemanticComparator` is created when omitted.
 * @returns Result object with percentages rounded to one decimal place,
 *   confusion-matrix counts, calibration data, all graded cases, the failing
 *   subset and timing information.
 */
export function evaluate(options = {}, comparator = new DefaultSemanticComparator()) {
    const startedAt = performance.now();
    // Decide which golden cases take part in this run.
    const wantedCategories = options.categories;
    const wantedTags = options.tags;
    const filterByCategory = wantedCategories && wantedCategories.length > 0;
    const filterByTag = wantedTags && wantedTags.length > 0;
    const isSelected = (tc) => {
        if (filterByCategory && !wantedCategories.includes(tc.category)) {
            return false;
        }
        if (filterByTag && !(tc.tags && tc.tags.some((tag) => wantedTags.includes(tag)))) {
            return false;
        }
        return true;
    };
    // Grade every selected case.
    const testResults = GOLDEN_DATASET.filter(isSelected).map((tc) => runTestCase(tc, comparator, options));
    // Overall confusion matrix, tallied in one pass.
    let truePositives = 0;
    let trueNegatives = 0;
    let falsePositives = 0;
    let falseNegatives = 0;
    for (const graded of testResults) {
        if (graded.actualMatch) {
            if (graded.testCase.expectedMatch) {
                truePositives += 1;
            }
            else {
                falsePositives += 1;
            }
        }
        else if (graded.testCase.expectedMatch) {
            falseNegatives += 1;
        }
        else {
            trueNegatives += 1;
        }
    }
    const totalCases = testResults.length;
    const predictedPositive = truePositives + falsePositives;
    const expectedPositive = truePositives + falseNegatives;
    // These are already percentages, so F1 comes out as a percentage as well.
    const accuracy = totalCases > 0 ? ((truePositives + trueNegatives) / totalCases) * 100 : 0;
    const precision = predictedPositive > 0 ? (truePositives / predictedPositive) * 100 : 0;
    const recall = expectedPositive > 0 ? (truePositives / expectedPositive) * 100 : 0;
    const f1Score = precision + recall > 0 ? (2 * precision * recall) / (precision + recall) : 0;
    // Calibration: bucket errors weighted by bucket size, divided by the number of graded cases.
    const calibrationBuckets = calculateCalibrationBuckets(testResults);
    let weightedBucketError = 0;
    for (const bucket of calibrationBuckets) {
        weightedBucketError += bucket.calibrationError * bucket.sampleCount;
    }
    const calibrationError = weightedBucketError / Math.max(1, testResults.length);
    const brierScore = calculateBrierScore(testResults);
    // Per-category breakdown; categories without any case are left out.
    const categoryMetrics = ['security', 'limitation', 'assertion']
        .map((name) => calculateCategoryMetrics(name, testResults.filter((r) => r.testCase.category === name)))
        .filter((metrics) => metrics.totalCases > 0);
    const totalDurationMs = performance.now() - startedAt;
    const averageComparisonMs = totalCases > 0 ? totalDurationMs / totalCases : 0;
    const roundToTenth = (value) => Math.round(value * 10) / 10;
    return {
        timestamp: new Date(),
        algorithmVersion: '1.0.0',
        datasetVersion: DATASET_VERSION,
        totalCases,
        accuracy: roundToTenth(accuracy),
        precision: roundToTenth(precision),
        recall: roundToTenth(recall),
        f1Score: roundToTenth(f1Score),
        truePositives,
        trueNegatives,
        falsePositives,
        falseNegatives,
        calibrationError: roundToTenth(calibrationError),
        brierScore,
        calibrationBuckets,
        categoryMetrics,
        testResults,
        failures: testResults.filter((r) => !r.passed),
        totalDurationMs: Math.round(totalDurationMs),
        averageComparisonMs: Math.round(averageComparisonMs * 100) / 100,
    };
}
234
/**
 * Build a display-ready summary of an evaluation result.
 *
 * The headline metrics (accuracy, precision, recall, F1, calibration error)
 * are copied from `result` and suffixed with `%`. The false-positive and
 * false-negative rates are derived here as a share of `result.totalCases`,
 * rounded to one decimal place; both are `0%` when there are no cases.
 *
 * @param result Evaluation result. Reads `accuracy`, `precision`, `recall`,
 *   `f1Score`, `calibrationError`, `totalCases`, `falsePositives`,
 *   `falseNegatives` and `failures`.
 * @returns Formatted metric strings plus total / passed / failed case counts.
 */
export function createSummary(result) {
    const { totalCases, failures } = result;
    // Both error rates share the same denominator (all cases) and rounding.
    const [falsePositiveRate, falseNegativeRate] = [result.falsePositives, result.falseNegatives].map((count) => {
        const percent = totalCases > 0 ? (count / totalCases) * 100 : 0;
        return `${Math.round(percent * 10) / 10}%`;
    });
    const failedCases = failures.length;
    return {
        accuracy: `${result.accuracy}%`,
        precision: `${result.precision}%`,
        recall: `${result.recall}%`,
        f1Score: `${result.f1Score}%`,
        falsePositiveRate,
        falseNegativeRate,
        calibrationError: `${result.calibrationError}%`,
        totalCases,
        passedCases: totalCases - failedCases,
        failedCases,
    };
}
253
/**
 * Format an evaluation result as a multi-line text report for the console.
 *
 * Sections, in order: header (dataset version, case count), accuracy metrics,
 * confusion matrix, confidence calibration, then - only when non-empty - a
 * per-category breakdown and a failure breakdown by `failureType`.
 *
 * Tree connectors are chosen per row so that the last row of every list is
 * drawn with `└──`. Previously the category list never closed, and the
 * failure list closed only when confidence failures were present.
 *
 * @param result Evaluation result. Reads the fields used by `createSummary`
 *   plus `datasetVersion`, `truePositives`, `trueNegatives`, `brierScore`,
 *   `categoryMetrics` and each failure's `failureType`.
 * @returns The report as a single string joined with `\n`, with a leading and
 *   trailing blank line.
 */
export function formatEvaluationReport(result) {
    const summary = createSummary(result);
    const banner = '═══════════════════════════════════════════════════════════════';
    const rule = ' ────────────────────────────────────────────────────────────';
    const lines = [];
    lines.push('');
    lines.push(banner);
    lines.push(' DRIFT DETECTION EVALUATION REPORT ');
    lines.push(banner);
    lines.push('');
    lines.push(` Dataset Version: ${result.datasetVersion}`);
    lines.push(` Total Test Cases: ${result.totalCases}`);
    lines.push('');
    lines.push(' ACCURACY METRICS');
    lines.push(rule);
    lines.push(` ├── Accuracy: ${summary.accuracy.padStart(6)} (${result.truePositives + result.trueNegatives}/${result.totalCases} correct)`);
    lines.push(` ├── Precision: ${summary.precision.padStart(6)} (low false positive rate)`);
    lines.push(` ├── Recall: ${summary.recall.padStart(6)} (catches most real drift)`);
    lines.push(` └── F1 Score: ${summary.f1Score.padStart(6)}`);
    lines.push('');
    lines.push(' CONFUSION MATRIX');
    lines.push(rule);
    lines.push(' ┌─────────────┬──────────┬──────────┐');
    lines.push(' │ │ Predicted│ Predicted│');
    lines.push(' │ │ Match │ Different│');
    lines.push(' ├─────────────┼──────────┼──────────┤');
    lines.push(` │ Actual Match│ ${String(result.truePositives).padStart(5)} TP │ ${String(result.falseNegatives).padStart(5)} FN │`);
    lines.push(` │ Actual Diff │ ${String(result.falsePositives).padStart(5)} FP │ ${String(result.trueNegatives).padStart(5)} TN │`);
    lines.push(' └─────────────┴──────────┴──────────┘');
    lines.push('');
    lines.push(' CONFIDENCE CALIBRATION');
    lines.push(rule);
    lines.push(` ├── Calibration Error: ${summary.calibrationError}`);
    lines.push(` └── Brier Score: ${result.brierScore}`);
    lines.push('');
    if (result.categoryMetrics.length > 0) {
        lines.push(' CATEGORY BREAKDOWN');
        lines.push(rule);
        result.categoryMetrics.forEach((cat, index, all) => {
            // The final category closes the tree instead of leaving it open.
            const connector = index === all.length - 1 ? '└──' : '├──';
            lines.push(` ${connector} ${cat.category.padEnd(12)}: ${cat.accuracy}% accuracy (${cat.truePositives + cat.trueNegatives}/${cat.totalCases})`);
        });
        lines.push('');
    }
    if (result.failures.length > 0) {
        lines.push(` FAILURES (${result.failures.length} cases)`);
        lines.push(rule);
        // Count failures per type, keeping only the types that occurred so the
        // closing connector lands on the last row actually printed.
        const breakdown = [
            { type: 'false_positive', label: 'False Positives (flagged drift when none)' },
            { type: 'false_negative', label: 'False Negatives (missed real drift)' },
            { type: 'confidence_out_of_range', label: 'Confidence Miscalibrations' },
        ]
            .map(({ type, label }) => ({
                label,
                count: result.failures.filter((f) => f.failureType === type).length,
            }))
            .filter((entry) => entry.count > 0);
        breakdown.forEach((entry, index) => {
            const connector = index === breakdown.length - 1 ? '└──' : '├──';
            lines.push(` ${connector} ${entry.count} ${entry.label}`);
        });
        lines.push('');
    }
    lines.push(banner);
    lines.push('');
    return lines.join('\n');
}
315
/**
 * Serialize an evaluation result to JSON text for use outside this module.
 *
 * @param result Value to serialize (normally an evaluation result).
 * @returns Pretty-printed JSON using a two-space indent.
 */
export function exportResultsAsJson(result) {
    const indentWidth = 2;
    const serialized = JSON.stringify(result, undefined, indentWidth);
    return serialized;
}
321
+ // Re-export for convenience
322
+ export { GOLDEN_DATASET, getDatasetStatistics };
323
+ //# sourceMappingURL=evaluator.js.map
@@ -0,0 +1,45 @@
1
+ /**
2
+ * Expanded Golden Dataset for Drift Detection Evaluation
3
+ *
4
+ * Phase 3 expansion: 150+ additional labeled test cases covering:
5
+ * - Extended security vulnerability paraphrases
6
+ * - Comprehensive limitation variations
7
+ * - Assertion behavior matching
8
+ * - Edge cases (negation, severity, constraints)
9
+ * - Paraphrase robustness tests
10
+ *
11
+ * These cases are designed to:
12
+ * 1. Test algorithm robustness against paraphrase variations
13
+ * 2. Verify correct handling of edge cases
14
+ * 3. Ensure high recall without sacrificing precision
15
+ */
16
+ import type { GoldenTestCase } from './types.js';
17
+ export declare const PATH_TRAVERSAL_CASES: GoldenTestCase[];
18
+ export declare const SQL_INJECTION_CASES: GoldenTestCase[];
19
+ export declare const XSS_CASES: GoldenTestCase[];
20
+ export declare const COMMAND_INJECTION_CASES: GoldenTestCase[];
21
+ export declare const SSRF_CASES: GoldenTestCase[];
22
+ export declare const AUTH_CASES: GoldenTestCase[];
23
+ export declare const SIZE_LIMIT_CASES: GoldenTestCase[];
24
+ export declare const RATE_LIMIT_CASES: GoldenTestCase[];
25
+ export declare const TIMEOUT_CASES: GoldenTestCase[];
26
+ export declare const FORMAT_CASES: GoldenTestCase[];
27
+ export declare const ASSERTION_CASES: GoldenTestCase[];
28
+ export declare const NEGATION_CASES: GoldenTestCase[];
29
+ export declare const SEVERITY_CASES: GoldenTestCase[];
30
+ export declare const SIMILAR_DIFFERENT_CASES: GoldenTestCase[];
31
+ export declare const PARAPHRASE_TECHNICAL_CASES: GoldenTestCase[];
32
+ export declare const PARAPHRASE_VOICE_CASES: GoldenTestCase[];
33
+ export declare const PARAPHRASE_REGISTER_CASES: GoldenTestCase[];
34
+ export declare const EXPANDED_TEST_CASES: GoldenTestCase[];
35
+ /**
36
+ * Get statistics about the expanded dataset: the total case count, numeric tallies keyed by category name and by tag name, and the truePositives / trueNegatives figures (presumably the number of cases labeled as expected matches vs. expected non-matches; confirm against the implementation).
37
+ */
38
+ export declare function getExpandedDatasetStatistics(): {
39
+ totalCases: number;
40
+ byCategory: Record<string, number>;
41
+ byTag: Record<string, number>;
42
+ truePositives: number;
43
+ trueNegatives: number;
44
+ };
45
+ //# sourceMappingURL=expanded-dataset.d.ts.map