nodebench-mcp 2.58.0 → 2.60.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,114 @@
1
+ #!/usr/bin/env npx tsx
2
+ /**
3
+ * llmJudgeEval.ts — LLM-judged boolean-metric eval harness for NodeBench MCP
4
+ *
5
+ * Architecture:
6
+ * 1. Query Corpus — 500+ typed test queries across 11 personas × 8 scenarios
7
+ * 2. Tool Executor — loads preset, runs discover_tools + tool chain, captures outputs
8
+ * 3. LLM Judge — Gemini Flash Lite boolean evaluation per criterion
9
+ * 4. Boolean Metrics — precision, recall, forbidden violations, criteria pass rate
10
+ * 5. Regression Detection — SQLite-backed diff between runs
11
+ *
12
+ * Usage:
13
+ * cd packages/mcp-local
14
+ * npx tsx src/benchmarks/llmJudgeEval.ts [--queries N] [--persona X] [--baseline RUN_ID] [--flywheel]
15
+ */
16
/** The 11 user personas the query corpus is generated across (see file header). */
export type Persona = "founder" | "banker" | "ceo" | "researcher" | "student" | "operator" | "legal" | "pm" | "contractor" | "investor" | "content";
17
/** The 8 usage scenarios each persona's queries are drawn from (see file header). */
export type Scenario = "weekly_reset" | "company_search" | "competitor_brief" | "delegation" | "important_change" | "memo_export" | "packet_diff" | "role_switch";
18
/** A single pass/fail judging criterion with a relative weight. */
export interface BooleanCriterion {
    /** Natural-language statement the LLM judge evaluates as pass/fail. */
    criterion: string;
    /** Relative importance of this criterion within a query's criteria set. */
    weight: number;
}
22
/** One typed test query in the eval corpus. */
export interface EvalQuery {
    /** Stable unique identifier; used to match results across runs for diffing. */
    id: string;
    /** The user query text fed to the tool executor. */
    query: string;
    /** Persona this query is written for. */
    persona: Persona;
    /** Scenario this query exercises. */
    scenario: Scenario;
    /** Tools expected to fire for this query (reference set for precision/recall). */
    expectedTools: string[];
    /** Tools that must NOT fire for this query (firings are counted as violations). */
    forbiddenTools: string[];
    /** Boolean criteria the LLM judge evaluates against the captured tool output. */
    booleanCriteria: BooleanCriterion[];
}
31
/** The judge's verdict for a single criterion. */
export interface CriterionResult {
    /** The criterion text that was evaluated. */
    criterion: string;
    /** Whether the judge deemed the criterion satisfied. */
    pass: boolean;
    /** Supporting evidence cited by the judge for the verdict. */
    evidence: string;
}
36
/** Parsed structure of the LLM judge's reply for one query. */
export interface JudgeResponse {
    /** Per-criterion verdicts. */
    criteria: CriterionResult[];
    /** The judge's overall pass/fail verdict for the query. */
    overallPass: boolean;
}
40
/** Outcome of executing and judging a single query. */
export interface QueryResult {
    /** Matches EvalQuery.id. */
    queryId: string;
    /** Overall pass/fail for the query. */
    pass: boolean;
    /** Per-criterion verdicts from the judge. */
    criteriaResults: CriterionResult[];
    /** Names of tools that actually fired during execution. */
    toolsFired: string[];
    /** Precision of fired tools measured against expectedTools. */
    toolPrecision: number;
    /** Recall of expectedTools among the tools that fired. */
    toolRecall: number;
    /** Count of forbidden tools that fired. */
    forbiddenViolations: number;
    /** Fraction of boolean criteria that passed. */
    criteriaPassRate: number;
    /** Raw judge response text, retained for debugging/auditing. */
    judgeResponse: string;
    /** Wall-clock duration in milliseconds. */
    ms: number;
}
52
/** Aggregate metrics for one complete eval run. */
export interface RunSummary {
    /** Unique identifier for this run; may later serve as a baseline reference. */
    runId: string;
    /** When the run was executed. */
    timestamp: string;
    /** Number of queries executed in this run. */
    queryCount: number;
    /** Fraction of queries that passed overall. */
    passRate: number;
    /** Mean tool precision across queries. */
    avgToolPrecision: number;
    /** Mean tool recall across queries. */
    avgToolRecall: number;
    /** Total forbidden-tool firings across all queries. */
    totalForbiddenViolations: number;
    /** Mean per-query criteria pass rate. */
    avgCriteriaPassRate: number;
    /** Pass counts and rates broken down by persona. */
    byPersona: Record<string, {
        pass: number;
        total: number;
        rate: number;
    }>;
    /** Pass counts and rates broken down by scenario. */
    byScenario: Record<string, {
        pass: number;
        total: number;
        rate: number;
    }>;
    /** Pass counts and rates broken down by criterion. */
    byCriterion: Record<string, {
        pass: number;
        total: number;
        rate: number;
    }>;
}
77
/** A criterion whose verdict differs between a baseline run and the current run. */
export interface RegressionItem {
    /** Matches EvalQuery.id. */
    queryId: string;
    /** The criterion text whose verdict changed. */
    criterion: string;
    /** Verdict in the baseline run. */
    baselinePass: boolean;
    /** Verdict in the current run. */
    currentPass: boolean;
}
83
/** Build the full 500-query corpus (personas × scenarios; see file header). */
export declare function generateQueryCorpus(): EvalQuery[];
85
/** Diff the current run against a baseline to find criteria whose verdicts regressed (SQLite-backed per file header). */
export declare function detectRegressions(currentRunId: string, baselineRunId: string): RegressionItem[];
/** Inverse of detectRegressions: criteria whose verdicts improved relative to the baseline. */
export declare function detectImprovements(currentRunId: string, baselineRunId: string): RegressionItem[];
87
/** Options controlling a single eval run (mirrors the CLI flags in the header). */
export interface RunOptions {
    /** Maximum number of corpus queries to execute. */
    queryLimit: number;
    /** Restrict the run to a single persona. */
    persona?: Persona;
    /** Restrict the run to a single scenario. */
    scenario?: Scenario;
    /** Prior run to diff against for regression detection. */
    baselineRunId?: string;
    /** If true, only generate corpus and print stats without executing */
    dryRun?: boolean;
    /** If true, run self-improving flywheel loop: eval → diagnose → grow → re-eval */
    flywheel?: boolean;
}
97
/** Execute the eval harness end-to-end and return aggregate run metrics. */
export declare function runLlmJudgeEval(options: RunOptions): Promise<RunSummary>;
98
/** Root-cause buckets used when diagnosing failed queries. */
export type FailureRootCause = "tool_not_found" | "tool_error" | "empty_output" | "criteria_mismatch" | "heuristic_too_strict";
99
/** Diagnosis of a single failed query. */
export interface DiagnosisEntry {
    /** Matches EvalQuery.id. */
    queryId: string;
    /** Which failure bucket this query fell into. */
    rootCause: FailureRootCause;
    /** Human-readable explanation of the failure. */
    detail: string;
    /** Proposed remediation for this failure. */
    suggestedFix: string;
}
105
/** Full diagnosis of all failures in one run, grouped by root cause. */
export interface DiagnosisReport {
    /** The run whose failures were diagnosed. */
    runId: string;
    /** Total number of failed queries in the run. */
    totalFails: number;
    /** Diagnosis entries bucketed by root cause. */
    byCause: Record<FailureRootCause, DiagnosisEntry[]>;
    /** Highest-priority remediation suggestions distilled from the entries. */
    topSuggestions: string[];
}
111
/** Diagnose all FAIL results from a given run, grouping by root cause. */
export declare function diagnoseFailures(runId: string): Promise<DiagnosisReport>;
113
/** Generate new corpus queries from a diagnosis report to cover gaps (the "grow" step of the flywheel loop). */
export declare function growCorpus(diagnosis: DiagnosisReport): EvalQuery[];