nodebench-mcp 2.58.0 → 2.60.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1644 @@
1
+ #!/usr/bin/env npx tsx
2
+ /**
3
+ * llmJudgeEval.ts — LLM-judged boolean-metric eval harness for NodeBench MCP
4
+ *
5
+ * Architecture:
6
+ * 1. Query Corpus — 500+ typed test queries across 11 personas × 8 scenarios
7
+ * 2. Tool Executor — loads preset, runs discover_tools + tool chain, captures outputs
8
+ * 3. LLM Judge — Gemini Flash Lite boolean evaluation per criterion
9
+ * 4. Boolean Metrics — precision, recall, forbidden violations, criteria pass rate
10
+ * 5. Regression Detection — SQLite-backed diff between runs
11
+ *
12
+ * Usage:
13
+ * cd packages/mcp-local
14
+ * npx tsx src/benchmarks/llmJudgeEval.ts [--queries N] [--persona X] [--baseline RUN_ID] [--flywheel]
15
+ */
16
+ import { getDb, genId } from "../db.js";
17
+ import { _setDbAccessor } from "../tools/toolRegistry.js";
18
+ import { loadToolsets, ALL_DOMAIN_KEYS } from "../toolsetRegistry.js";
19
+ // ══════════════════════════════════════════════════════════════════════════════
20
+ // SCHEMA — eval tables (appended to existing DB)
21
+ // ══════════════════════════════════════════════════════════════════════════════
22
/**
 * DDL for the eval tables, executed as one script by `ensureSchema`.
 *
 * - `llm_eval_runs`: one row per run (query count, pass rate, optional
 *   persona/scenario, JSON summary).
 * - `llm_eval_results`: one row per query within a run (pass flag, criteria
 *   JSON, tool precision/recall, forbidden-violation count, criteria pass
 *   rate, judge response, elapsed ms), indexed by `run_id` and by `query_id`.
 *
 * Every statement uses IF NOT EXISTS.
 */
const LLM_EVAL_SCHEMA = [
  "",
  "CREATE TABLE IF NOT EXISTS llm_eval_runs (",
  "run_id TEXT PRIMARY KEY,",
  "timestamp TEXT NOT NULL DEFAULT (datetime('now')),",
  "query_count INTEGER NOT NULL DEFAULT 0,",
  "pass_rate REAL NOT NULL DEFAULT 0,",
  "persona TEXT,",
  "scenario TEXT,",
  "summary_json TEXT",
  ");",
  "",
  "CREATE TABLE IF NOT EXISTS llm_eval_results (",
  "id TEXT PRIMARY KEY,",
  "run_id TEXT NOT NULL,",
  "query_id TEXT NOT NULL,",
  "pass INTEGER NOT NULL DEFAULT 0,",
  "criteria_json TEXT,",
  "tools_precision REAL NOT NULL DEFAULT 0,",
  "tools_recall REAL NOT NULL DEFAULT 0,",
  "forbidden_violations INTEGER NOT NULL DEFAULT 0,",
  "criteria_pass_rate REAL NOT NULL DEFAULT 0,",
  "judge_response TEXT,",
  "ms INTEGER NOT NULL DEFAULT 0",
  ");",
  "",
  "CREATE INDEX IF NOT EXISTS idx_llm_eval_results_run ON llm_eval_results(run_id);",
  "CREATE INDEX IF NOT EXISTS idx_llm_eval_results_query ON llm_eval_results(query_id);",
  "",
].join("\n");
50
/**
 * Runs the LLM_EVAL_SCHEMA script against the handle returned by getDb().
 *
 * @returns {void}
 */
function ensureSchema() {
  getDb().exec(LLM_EVAL_SCHEMA);
}
54
+ // ══════════════════════════════════════════════════════════════════════════════
55
+ // QUERY CORPUS GENERATOR — 500 queries, programmatic
56
+ // ══════════════════════════════════════════════════════════════════════════════
57
/** Persona identifiers covered by the query corpus (11 entries). */
const PERSONAS = [
  "founder",
  "banker",
  "ceo",
  "researcher",
  "student",
  "operator",
  "legal",
  "pm",
  "contractor",
  "investor",
  "content",
];
61
/** Scenario identifiers used to tag each query template (8 entries). */
const SCENARIOS = [
  "weekly_reset",
  "company_search",
  "competitor_brief",
  "delegation",
  "important_change",
  "memo_export",
  "packet_diff",
  "role_switch",
];
65
/**
 * Builds the founder persona's query templates.
 *
 * Each template carries a query string, its scenario tag, the
 * `expectedTools` and `forbiddenTools` name lists, and a weighted
 * `booleanCriteria` list. Every call allocates fresh objects and arrays.
 *
 * @returns {Array<{query: string, scenario: string, expectedTools: string[], forbiddenTools: string[], booleanCriteria: Array<{criterion: string, weight: number}>}>}
 */
function founderTemplates() {
  // Criterion text and weight for each rubric line, keyed by a local alias.
  const rubric = {
    content: ["Output contains meaningful structured content (not just errors or empty results)", 2],
    temporal: ["Output contains temporal information (dates, timestamps, periods)", 1],
    toolOk: ["At least one expected tool completed successfully", 2],
    noCrash: ["Output does not contain error stack traces or crash messages", 1],
    metrics: ["Output includes quantitative data points or metrics", 1],
    entities: ["Output contains entity or topic names from the query", 1],
    schema: ["Output structure matches the tool's documented schema", 1],
    multiTool: ["Multiple tools in the chain produced non-empty results", 1],
  };

  // Expands aliases into fresh { criterion, weight } objects, in the given order.
  const pick = (...aliases) =>
    aliases.map((alias) => {
      const [criterion, weight] = rubric[alias];
      return { criterion, weight };
    });

  // Assembles one template; property order matches the serialized shape.
  const build = (scenario, query, expectedTools, forbiddenTools, booleanCriteria) => ({
    query,
    scenario,
    expectedTools,
    forbiddenTools,
    booleanCriteria,
  });

  return [
    // weekly_reset
    build("weekly_reset", "What changed in our product direction this week?",
      ["founder_deep_context_gather", "get_weekly_summary"], ["run_recon"],
      pick("content", "temporal", "toolOk", "noCrash")),
    build("weekly_reset", "Give me a weekly reset briefing for the founding team",
      ["founder_local_weekly_reset", "founder_deep_context_gather"], ["check_contract_compliance"],
      pick("content", "toolOk", "noCrash")),
    build("weekly_reset", "Summarize last week's key decisions and their rationale",
      ["founder_deep_context_gather", "get_weekly_summary"], ["generate_zero_draft"],
      pick("content", "temporal", "toolOk", "noCrash")),
    build("weekly_reset", "What are the top 3 risks to our current sprint?",
      ["founder_deep_context_gather", "get_proactive_alerts"], ["run_recon", "check_page_performance"],
      pick("content", "metrics", "toolOk", "noCrash")),
    build("weekly_reset", "How is our burn rate tracking against the runway?",
      ["founder_deep_context_gather"], ["run_recon", "check_email_setup"],
      pick("content", "metrics", "toolOk", "noCrash")),
    build("weekly_reset", "What did we ship this week and what slipped?",
      ["founder_deep_context_gather", "get_weekly_summary"], ["generate_report"],
      pick("content", "temporal", "toolOk", "noCrash")),
    // company_search
    build("company_search", "Research Stripe and tell me about their latest product moves",
      ["run_recon", "enrich_recon"], ["founder_local_weekly_reset"],
      pick("content", "entities", "toolOk", "noCrash", "metrics")),
    build("company_search", "Pull everything you know about Anthropic's recent funding",
      ["run_recon", "enrich_recon"], ["founder_local_weekly_reset"],
      pick("content", "entities", "toolOk", "noCrash")),
    // competitor_brief
    build("competitor_brief", "Compare our product positioning against Linear and Notion",
      ["founder_local_synthesize", "run_recon"], ["start_dogfood_session"],
      pick("content", "entities", "toolOk", "noCrash")),
    build("competitor_brief", "What are the moats of our top 3 competitors?",
      ["founder_local_synthesize", "run_recon"], ["check_mcp_setup"],
      pick("content", "entities", "toolOk", "noCrash")),
    // delegation
    build("delegation", "Draft a delegation brief for the engineering lead on the auth refactor",
      ["founder_deep_context_gather", "export_artifact_packet"], ["run_recon"],
      pick("content", "toolOk", "schema", "noCrash")),
    build("delegation", "Create a handoff packet for the new VP of Product",
      ["founder_deep_context_gather", "export_artifact_packet"], ["check_contract_compliance"],
      pick("content", "toolOk", "schema", "noCrash")),
    // important_change
    build("important_change", "Flag any important changes in our competitive landscape this week",
      ["founder_local_synthesize", "get_important_changes"], ["start_verification_cycle"],
      pick("content", "temporal", "toolOk", "noCrash")),
    build("important_change", "What's the most critical thing I should know about right now?",
      ["founder_local_synthesize", "get_important_changes"], ["run_recon"],
      pick("content", "temporal", "toolOk", "noCrash")),
    // memo_export
    build("memo_export", "Export our latest decision memo as a shareable packet",
      ["export_artifact_packet"], ["run_recon", "check_page_performance"],
      pick("content", "toolOk", "schema", "noCrash")),
    build("memo_export", "Package the Q1 strategy review for the board",
      ["export_artifact_packet", "founder_deep_context_gather"], ["start_dogfood_session"],
      pick("content", "toolOk", "schema", "noCrash")),
    // packet_diff
    build("packet_diff", "What changed between the last two strategy packets?",
      ["founder_packet_diff", "founder_packet_history_diff"], ["run_recon"],
      pick("content", "temporal", "toolOk", "multiTool")),
    build("packet_diff", "Show me the delta between our January and March founder packets",
      ["founder_packet_diff", "founder_packet_history_diff"], ["check_email_setup"],
      pick("content", "temporal", "toolOk", "multiTool")),
    // role_switch
    build("role_switch", "Switch to investor mode and evaluate our pitch deck",
      ["founder_local_synthesize"], [],
      pick("content", "entities", "toolOk")),
    build("role_switch", "I need to think like a banker — what's the credit risk here?",
      ["founder_local_synthesize"], [],
      pick("content", "entities", "toolOk")),
  ];
}
97
/**
 * Builds the banker persona's query templates.
 *
 * Each template carries a query string, its scenario tag, the
 * `expectedTools` and `forbiddenTools` name lists, and a weighted
 * `booleanCriteria` list. Every call allocates fresh objects and arrays.
 *
 * @returns {Array<{query: string, scenario: string, expectedTools: string[], forbiddenTools: string[], booleanCriteria: Array<{criterion: string, weight: number}>}>}
 */
function bankerTemplates() {
  // Criterion text and weight for each rubric line, keyed by a local alias.
  const rubric = {
    content: ["Output contains meaningful structured content (not just errors or empty results)", 2],
    temporal: ["Output contains temporal information (dates, timestamps, periods)", 1],
    toolOk: ["At least one expected tool completed successfully", 2],
    noCrash: ["Output does not contain error stack traces or crash messages", 1],
    metrics: ["Output includes quantitative data points or metrics", 1],
    entities: ["Output contains entity or topic names from the query", 1],
    schema: ["Output structure matches the tool's documented schema", 1],
    multiTool: ["Multiple tools in the chain produced non-empty results", 1],
  };

  // Expands aliases into fresh { criterion, weight } objects, in the given order.
  const pick = (...aliases) =>
    aliases.map((alias) => {
      const [criterion, weight] = rubric[alias];
      return { criterion, weight };
    });

  // Assembles one template; property order matches the serialized shape.
  const build = (scenario, query, expectedTools, forbiddenTools, booleanCriteria) => ({
    query,
    scenario,
    expectedTools,
    forbiddenTools,
    booleanCriteria,
  });

  return [
    build("company_search", "Run credit analysis on the portfolio company Acme Corp",
      ["run_recon", "enrich_recon"], ["founder_local_weekly_reset"],
      pick("content", "entities", "toolOk", "noCrash", "metrics")),
    build("company_search", "What's the debt-to-equity ratio trend for our top borrowers?",
      ["founder_local_synthesize", "run_recon"], ["founder_deep_context_gather"],
      pick("content", "entities", "toolOk", "noCrash")),
    build("weekly_reset", "Prepare a weekly credit committee briefing",
      ["get_weekly_summary", "get_proactive_alerts"], ["founder_local_weekly_reset"],
      pick("content", "temporal", "toolOk", "noCrash")),
    build("important_change", "Flag any covenant breaches in the current portfolio",
      ["get_important_changes", "flag_important_change"], ["run_recon"],
      pick("content", "temporal", "toolOk", "noCrash")),
    build("competitor_brief", "Compare the credit profiles of Company A vs Company B",
      ["founder_local_synthesize", "run_recon"], ["founder_deep_context_gather"],
      pick("content", "entities", "toolOk", "noCrash")),
    build("memo_export", "Draft a term sheet summary for the lending committee",
      ["export_artifact_packet"], ["founder_local_weekly_reset"],
      pick("content", "toolOk", "schema", "noCrash")),
    build("weekly_reset", "What's changed in the regulatory landscape this week?",
      ["get_weekly_summary", "get_important_changes"], ["founder_deep_context_gather"],
      pick("content", "temporal", "toolOk", "noCrash")),
    build("memo_export", "Export the due diligence findings for the Acme Corp loan",
      ["export_artifact_packet", "get_recon_summary"], ["founder_local_weekly_reset"],
      pick("content", "toolOk", "schema", "noCrash")),
    build("packet_diff", "Show me how the risk ratings shifted since last quarter",
      ["founder_packet_diff"], ["founder_local_weekly_reset"],
      pick("content", "temporal", "toolOk", "multiTool")),
    build("delegation", "Delegate the annual review prep to the junior analyst",
      ["export_artifact_packet"], ["founder_deep_context_gather"],
      pick("content", "toolOk", "schema", "noCrash")),
    build("company_search", "Assess the market risk exposure in our current book",
      ["founder_local_synthesize", "run_recon"], ["founder_local_weekly_reset"],
      pick("content", "entities", "toolOk", "noCrash", "metrics")),
    build("important_change", "What are the top 5 watchlist names and why?",
      ["get_important_changes", "get_proactive_alerts"], ["founder_deep_context_gather"],
      pick("content", "temporal", "toolOk", "noCrash")),
    build("company_search", "Run a stress test scenario on the commercial real estate portfolio",
      ["run_recon"], ["founder_local_weekly_reset"],
      pick("content", "entities", "toolOk", "noCrash")),
    build("role_switch", "Switch to researcher mode and find academic papers on credit risk modeling",
      ["founder_local_synthesize"], [],
      pick("content", "entities", "toolOk")),
  ];
}
115
/**
 * Builds the CEO persona's query templates.
 *
 * Each template carries a query string, its scenario tag, the
 * `expectedTools` and `forbiddenTools` name lists, and a weighted
 * `booleanCriteria` list. Every call allocates fresh objects and arrays.
 *
 * @returns {Array<{query: string, scenario: string, expectedTools: string[], forbiddenTools: string[], booleanCriteria: Array<{criterion: string, weight: number}>}>}
 */
function ceoTemplates() {
  // Criterion text and weight for each rubric line, keyed by a local alias.
  const rubric = {
    content: ["Output contains meaningful structured content (not just errors or empty results)", 2],
    temporal: ["Output contains temporal information (dates, timestamps, periods)", 1],
    toolOk: ["At least one expected tool completed successfully", 2],
    noCrash: ["Output does not contain error stack traces or crash messages", 1],
    metrics: ["Output includes quantitative data points or metrics", 1],
    entities: ["Output contains entity or topic names from the query", 1],
    schema: ["Output structure matches the tool's documented schema", 1],
    multiTool: ["Multiple tools in the chain produced non-empty results", 1],
  };

  // Expands aliases into fresh { criterion, weight } objects, in the given order.
  const pick = (...aliases) =>
    aliases.map((alias) => {
      const [criterion, weight] = rubric[alias];
      return { criterion, weight };
    });

  // Assembles one template; property order matches the serialized shape.
  const build = (scenario, query, expectedTools, forbiddenTools, booleanCriteria) => ({
    query,
    scenario,
    expectedTools,
    forbiddenTools,
    booleanCriteria,
  });

  return [
    build("weekly_reset", "Give me the executive summary of where we stand this week",
      ["get_weekly_summary", "get_proactive_alerts"], ["check_page_performance"],
      pick("content", "temporal", "toolOk", "noCrash")),
    build("important_change", "What should I be worried about that nobody's telling me?",
      ["founder_local_synthesize", "get_important_changes"], ["run_recon"],
      pick("content", "temporal", "toolOk", "noCrash")),
    build("memo_export", "Prepare talking points for the all-hands meeting",
      ["export_artifact_packet", "get_weekly_summary"], ["check_contract_compliance"],
      pick("content", "toolOk", "schema", "noCrash")),
    build("weekly_reset", "How are our OKRs tracking this quarter?",
      ["get_weekly_summary", "founder_deep_context_gather"], ["run_recon"],
      pick("content", "metrics", "toolOk", "noCrash")),
    build("delegation", "Who on the leadership team needs my attention this week?",
      ["founder_local_synthesize", "get_important_changes"], ["check_mcp_setup"],
      pick("content", "toolOk", "schema", "noCrash")),
    build("memo_export", "Draft a board update email for this month",
      ["export_artifact_packet"], ["run_recon"],
      pick("content", "toolOk", "schema", "noCrash")),
    build("competitor_brief", "What's our competitive position changed to since last month?",
      ["founder_local_synthesize", "run_recon"], ["founder_local_weekly_reset"],
      pick("content", "entities", "toolOk", "noCrash")),
    build("packet_diff", "Compare the last two quarterly reviews for drift",
      ["founder_packet_diff"], ["check_email_setup"],
      pick("content", "temporal", "toolOk", "multiTool")),
    build("delegation", "I need to delegate the hiring pipeline review — create a brief",
      ["export_artifact_packet"], ["run_recon"],
      pick("content", "toolOk", "schema", "noCrash")),
    build("role_switch", "Switch to founder mode and deep-dive into the product roadmap",
      ["discover_tools"], ["check_contract_compliance"],
      pick("content", "entities", "toolOk")),
    build("important_change", "Flag the most important thing that changed since yesterday",
      ["get_important_changes", "get_proactive_alerts"], ["founder_local_weekly_reset"],
      pick("content", "temporal", "toolOk", "noCrash")),
    build("company_search", "Research what our key enterprise customers are saying publicly",
      ["run_recon", "enrich_recon"], ["founder_local_weekly_reset"],
      pick("content", "entities", "toolOk", "noCrash", "metrics")),
  ];
}
131
/**
 * Builds the eval query templates returned for the "researcher" persona
 * (persona taken from the function name; the caller that assembles the
 * corpus is outside this block).
 *
 * Every template has the same five fields, in this order:
 *   - query            natural-language request under test
 *   - scenario         scenario tag (e.g. "company_search", "weekly_reset")
 *   - expectedTools    tool names the run is expected to use
 *   - forbiddenTools   tool names the run must not use
 *   - booleanCriteria  list of `{ criterion, weight }` checks
 *
 * A fresh array of fresh objects is produced on every call, so callers may
 * mutate the result without affecting later calls.
 *
 * @returns {Array<{query: string, scenario: string, expectedTools: string[], forbiddenTools: string[], booleanCriteria: Array<{criterion: string, weight: number}>}>}
 */
function researcherTemplates() {
    // Criterion factories: each call yields a new object so no two templates
    // share a criterion instance.
    const check = (text, weight) => ({ criterion: text, weight });
    const structured = () => check("Output contains meaningful structured content (not just errors or empty results)", 2);
    const namesTopic = () => check("Output contains entity or topic names from the query", 1);
    const toolOk = () => check("At least one expected tool completed successfully", 2);
    const noCrash = () => check("Output does not contain error stack traces or crash messages", 1);
    const quantitative = () => check("Output includes quantitative data points or metrics", 1);
    const temporal = () => check("Output contains temporal information (dates, timestamps, periods)", 1);
    const schemaMatch = () => check("Output structure matches the tool's documented schema", 1);
    const multiTool = () => check("Multiple tools in the chain produced non-empty results", 1);
    // Keeps the field order identical for every template.
    const template = (query, scenario, expectedTools, forbiddenTools, booleanCriteria) => ({
        query,
        scenario,
        expectedTools,
        forbiddenTools,
        booleanCriteria,
    });
    return [
        template("Find recent papers on transformer attention mechanisms", "company_search",
            ["run_recon", "build_research_digest"], ["founder_local_weekly_reset"],
            [structured(), namesTopic(), toolOk(), noCrash(), quantitative()]),
        template("Build a research digest on federated learning advances in 2025", "company_search",
            ["build_research_digest", "run_recon"], ["founder_deep_context_gather"],
            [structured(), namesTopic(), toolOk(), noCrash()]),
        template("What are the open problems in RLHF that nobody's solved?", "competitor_brief",
            ["run_recon", "build_research_digest"], ["export_artifact_packet"],
            [structured(), namesTopic(), toolOk(), noCrash()]),
        template("Summarize the key findings from this week's arXiv papers on LLM reasoning", "weekly_reset",
            ["get_weekly_summary", "build_research_digest"], ["founder_local_weekly_reset"],
            [structured(), temporal(), toolOk(), noCrash()]),
        template("Compare the methodology of these two papers on knowledge distillation", "competitor_brief",
            ["compare_options", "build_research_digest"], ["founder_local_weekly_reset"],
            [structured(), namesTopic(), toolOk(), noCrash()]),
        template("Export my literature review notes as a shareable document", "memo_export",
            ["export_artifact_packet"], ["run_recon"],
            [structured(), toolOk(), schemaMatch(), noCrash()]),
        template("What contradictions exist in the current MoE literature?", "important_change",
            ["build_research_digest", "get_important_changes"], ["founder_local_weekly_reset"],
            [structured(), temporal(), toolOk(), noCrash()]),
        template("Track how the consensus on scaling laws has shifted this year", "packet_diff",
            ["founder_packet_diff", "build_research_digest"], ["founder_local_weekly_reset"],
            [structured(), temporal(), toolOk(), multiTool()]),
        template("Delegate the data collection task to the research assistant", "delegation",
            ["export_artifact_packet"], ["founder_deep_context_gather"],
            [structured(), toolOk(), schemaMatch(), noCrash()]),
        template("Switch to operator mode and check if the experiment pipeline is healthy", "role_switch",
            ["discover_tools"], ["build_research_digest"],
            [structured(), namesTopic(), toolOk()]),
        template("What are the most-cited papers in agentic AI from 2025?", "company_search",
            ["run_recon", "build_research_digest"], ["founder_local_weekly_reset"],
            [structured(), namesTopic(), toolOk(), noCrash(), quantitative()]),
        template("Generate a research question from the gaps in current RAG literature", "important_change",
            ["build_research_digest"], ["founder_local_weekly_reset"],
            [structured(), temporal(), toolOk(), noCrash()]),
    ];
}
147
/**
 * Builds the eval query templates returned for the "student" persona
 * (persona taken from the function name; the caller that assembles the
 * corpus is outside this block).
 *
 * Every template has the same five fields, in this order:
 *   - query            natural-language request under test
 *   - scenario         scenario tag (e.g. "company_search", "weekly_reset")
 *   - expectedTools    tool names the run is expected to use
 *   - forbiddenTools   tool names the run must not use (may be empty)
 *   - booleanCriteria  list of `{ criterion, weight }` checks
 *
 * A fresh array of fresh objects is produced on every call, so callers may
 * mutate the result without affecting later calls.
 *
 * @returns {Array<{query: string, scenario: string, expectedTools: string[], forbiddenTools: string[], booleanCriteria: Array<{criterion: string, weight: number}>}>}
 */
function studentTemplates() {
    // Criterion factories: each call yields a new object so no two templates
    // share a criterion instance.
    const check = (text, weight) => ({ criterion: text, weight });
    const structured = () => check("Output contains meaningful structured content (not just errors or empty results)", 2);
    const namesTopic = () => check("Output contains entity or topic names from the query", 1);
    const toolOk = () => check("At least one expected tool completed successfully", 2);
    const noCrash = () => check("Output does not contain error stack traces or crash messages", 1);
    const quantitative = () => check("Output includes quantitative data points or metrics", 1);
    const temporal = () => check("Output contains temporal information (dates, timestamps, periods)", 1);
    const schemaMatch = () => check("Output structure matches the tool's documented schema", 1);
    // Keeps the field order identical for every template.
    const template = (query, scenario, expectedTools, forbiddenTools, booleanCriteria) => ({
        query,
        scenario,
        expectedTools,
        forbiddenTools,
        booleanCriteria,
    });
    return [
        template("Help me understand how transformers work at a high level", "company_search",
            ["discover_tools"], ["founder_deep_context_gather", "export_artifact_packet"],
            [structured(), namesTopic(), toolOk(), noCrash()]),
        template("What should I study this week for my ML course?", "weekly_reset",
            ["get_weekly_summary", "discover_tools"], ["run_recon", "founder_local_weekly_reset"],
            [structured(), temporal(), toolOk(), noCrash()]),
        template("Compare supervised vs unsupervised learning for my report", "competitor_brief",
            ["compare_options", "discover_tools"], ["run_recon", "founder_deep_context_gather"],
            [structured(), namesTopic(), toolOk(), noCrash()]),
        template("Export my study notes as a markdown document", "memo_export",
            ["export_artifact_packet"], ["run_recon", "founder_local_weekly_reset"],
            [structured(), toolOk(), schemaMatch(), noCrash()]),
        template("What changed in the AI landscape this week that I should know about?", "important_change",
            ["get_important_changes", "get_weekly_summary"], ["founder_deep_context_gather"],
            [structured(), temporal(), toolOk(), noCrash()]),
        template("I need to switch to a research perspective for my thesis", "role_switch",
            ["founder_local_synthesize"], [],
            [structured(), namesTopic(), toolOk()]),
        template("Find beginner-friendly resources on neural network architectures", "company_search",
            ["discover_tools", "run_recon"], ["founder_deep_context_gather"],
            [structured(), namesTopic(), toolOk(), noCrash()]),
        template("Summarize the differences between GPT-4 and Claude for my presentation", "competitor_brief",
            ["compare_options", "run_recon"], ["founder_local_weekly_reset"],
            [structured(), namesTopic(), toolOk(), noCrash()]),
        template("Help me find a dataset for my NLP project on sentiment analysis", "company_search",
            ["discover_tools", "run_recon"], ["founder_deep_context_gather"],
            [structured(), namesTopic(), toolOk(), noCrash(), quantitative()]),
        template("Create a study timeline for the next 4 weeks on deep learning", "delegation",
            ["export_artifact_packet"], ["founder_deep_context_gather"],
            [structured(), toolOk(), schemaMatch(), noCrash()]),
        template("What did I learn last week and what should I review?", "weekly_reset",
            ["get_weekly_summary"], ["run_recon", "founder_local_weekly_reset"],
            [structured(), temporal(), toolOk(), noCrash()]),
    ];
}
162
/**
 * Builds the eval query templates returned for the "operator" persona
 * (persona taken from the function name; the caller that assembles the
 * corpus is outside this block).
 *
 * Every template has the same five fields, in this order: `query`,
 * `scenario`, `expectedTools`, `forbiddenTools` (may be empty) and
 * `booleanCriteria`, a list of `{ criterion, weight }` checks.
 *
 * A fresh array of fresh objects is produced on every call.
 *
 * @returns {Array<{query: string, scenario: string, expectedTools: string[], forbiddenTools: string[], booleanCriteria: Array<{criterion: string, weight: number}>}>}
 */
function operatorTemplates() {
    // Row layout: [query, scenario, expectedTools, forbiddenTools, criteria].
    // A criterion is a bare string (weight 1) or a [text, weight] pair.
    const rows = [
        ["Show me the system health dashboard for today", "weekly_reset",
            ["get_ops_dashboard", "get_proactive_alerts"], ["founder_local_weekly_reset"],
            ["Output presents system health metrics", "Output highlights any degraded services", "Output is operational in tone"]],
        ["What incidents happened this week and are they resolved?", "weekly_reset",
            ["get_weekly_summary", "get_proactive_alerts"], ["founder_deep_context_gather"],
            ["Output lists incidents", "Output includes resolution status", "Output identifies root causes"]],
        ["Run a health check on all MCP infrastructure", "company_search",
            ["check_mcp_setup", "get_ops_dashboard"], ["founder_local_weekly_reset"],
            ["Output checks multiple infrastructure components", "Output reports pass/fail per component", "Output suggests fixes for failures"]],
        ["Delegate the on-call rotation setup to the SRE team", "delegation",
            ["export_artifact_packet"], ["founder_deep_context_gather"],
            ["Output contains delegation instructions", "Output specifies SRE-relevant details", "Output includes escalation paths"]],
        ["What deployments went out this week and did any cause issues?", "important_change",
            ["get_important_changes", "get_weekly_summary"], ["run_recon"],
            ["Output references deployments", "Output correlates deployments with incidents", "Output identifies rollback candidates"]],
        ["Compare our uptime this month vs last month", "packet_diff",
            ["founder_packet_diff", "get_ops_dashboard"], ["founder_local_weekly_reset"],
            ["Output includes uptime percentages or trends", "Output identifies the biggest contributor to downtime", ["Output does not fabricate exact uptime numbers", 2]]],
        ["Export the incident report for the API outage", "memo_export",
            ["export_artifact_packet"], ["founder_local_weekly_reset"],
            ["Output follows incident report structure", "Output includes timeline, impact, and root cause", "Output is shareable with stakeholders"]],
        ["Flag any alerts that have been unacknowledged for over 24 hours", "important_change",
            ["founder_local_synthesize", "get_important_changes"], ["founder_deep_context_gather"],
            ["Output identifies stale alerts", "Output includes age of each alert", "Output suggests escalation for critical ones"]],
        ["Switch to researcher mode to investigate the performance regression", "role_switch",
            ["founder_local_synthesize"], [],
            ["Output shifts to investigation perspective", "Output suggests diagnostic tools and approaches", "Output identifies data to collect"]],
        ["What's the current capacity utilization across our services?", "company_search",
            ["get_ops_dashboard"], ["founder_local_weekly_reset"],
            ["Output references capacity metrics", "Output identifies services near capacity", "Output suggests scaling actions"]],
        ["Prepare a runbook for the database migration this weekend", "memo_export",
            ["export_artifact_packet"], ["run_recon"],
            ["Output is structured as a runbook", "Output includes rollback steps", "Output includes pre-flight checks"]],
    ];
    const toCriterion = (spec) => (typeof spec === "string"
        ? { criterion: spec, weight: 1 }
        : { criterion: spec[0], weight: spec[1] });
    return rows.map(([query, scenario, expectedTools, forbiddenTools, criteria]) => ({
        query,
        scenario,
        expectedTools,
        forbiddenTools,
        booleanCriteria: criteria.map(toCriterion),
    }));
}
177
/**
 * Builds the eval query templates returned for the "legal" persona
 * (persona taken from the function name; the caller that assembles the
 * corpus is outside this block).
 *
 * Every template has the same five fields, in this order: `query`,
 * `scenario`, `expectedTools`, `forbiddenTools` (may be empty) and
 * `booleanCriteria`, a list of `{ criterion, weight }` checks.
 *
 * A fresh array of fresh objects is produced on every call.
 *
 * @returns {Array<{query: string, scenario: string, expectedTools: string[], forbiddenTools: string[], booleanCriteria: Array<{criterion: string, weight: number}>}>}
 */
function legalTemplates() {
    // Row layout: [query, scenario, expectedTools, forbiddenTools, criteria].
    // A criterion is a bare string (weight 1) or a [text, weight] pair.
    const rows = [
        ["Check our contracts for compliance with the new data privacy regulation", "company_search",
            ["check_contract_compliance"], ["founder_local_weekly_reset"],
            ["Output references data privacy regulations", "Output identifies compliance gaps", "Output does not provide actual legal advice"]],
        ["What legal risks should we flag this week?", "weekly_reset",
            ["get_weekly_summary", "get_important_changes"], ["founder_local_weekly_reset"],
            ["Output identifies legal risk categories", "Output prioritizes risks by severity", "Output includes a disclaimer about not being legal counsel"]],
        ["Compare the terms of our vendor contracts for consistency", "competitor_brief",
            ["compare_options", "check_contract_compliance"], ["founder_deep_context_gather"],
            ["Output compares contract terms systematically", "Output identifies inconsistencies", "Output suggests standardization opportunities"]],
        ["Export the contract review findings for outside counsel", "memo_export",
            ["export_artifact_packet"], ["run_recon"],
            ["Output is formal and counsel-appropriate", "Output includes numbered findings", "Output preserves legal terminology"]],
        ["Flag any IP-related changes in our competitor filings", "important_change",
            ["get_important_changes", "run_recon"], ["founder_local_weekly_reset"],
            ["Output references IP or patent filings", "Output identifies specific competitors", "Output assesses impact on our position"]],
        ["Prepare a delegation brief for the paralegal on discovery tasks", "delegation",
            ["export_artifact_packet"], ["founder_deep_context_gather"],
            ["Output is delegation-appropriate", "Output specifies legal discovery requirements", "Output includes deadlines"]],
        ["How have our contractual obligations changed since last quarter?", "packet_diff",
            ["founder_packet_diff"], ["founder_local_weekly_reset"],
            ["Output tracks contractual changes", "Output distinguishes new vs modified obligations", "Output highlights risk-increasing changes"]],
        ["Switch to banker mode to assess the financial exposure from this lawsuit", "role_switch",
            ["founder_local_synthesize"], [],
            ["Output adopts a financial assessment perspective", "Output estimates exposure ranges", "Output caveats financial estimates appropriately"]],
        ["Review the NDA template for common issues", "company_search",
            ["check_contract_compliance"], ["run_recon"],
            ["Output references NDA-specific terms", "Output identifies common NDA pitfalls", "Output suggests improvements"]],
        ["What regulatory filings are due this month?", "important_change",
            ["get_important_changes", "get_proactive_alerts"], ["founder_local_weekly_reset"],
            ["Output lists upcoming deadlines", "Output includes filing types", "Output suggests preparation steps"]],
        ["Summarize the liability exposure across all active contracts", "company_search",
            ["check_contract_compliance", "get_recon_summary"], ["founder_local_weekly_reset"],
            ["Output addresses liability specifically", "Output categorizes by contract type", ["Output does not fabricate liability amounts", 2]]],
    ];
    const toCriterion = (spec) => (typeof spec === "string"
        ? { criterion: spec, weight: 1 }
        : { criterion: spec[0], weight: spec[1] });
    return rows.map(([query, scenario, expectedTools, forbiddenTools, criteria]) => ({
        query,
        scenario,
        expectedTools,
        forbiddenTools,
        booleanCriteria: criteria.map(toCriterion),
    }));
}
192
/**
 * Builds the eval query templates returned for the "pm" persona (persona
 * taken from the function name; the caller that assembles the corpus is
 * outside this block).
 *
 * Every template has the same five fields, in this order: `query`,
 * `scenario`, `expectedTools`, `forbiddenTools` (may be empty) and
 * `booleanCriteria`, a list of `{ criterion, weight }` checks. All
 * criteria in this set carry weight 1.
 *
 * A fresh array of fresh objects is produced on every call.
 *
 * @returns {Array<{query: string, scenario: string, expectedTools: string[], forbiddenTools: string[], booleanCriteria: Array<{criterion: string, weight: number}>}>}
 */
function pmTemplates() {
    // Row layout: [query, scenario, expectedTools, forbiddenTools, criteria texts].
    const rows = [
        ["What's the status of all feature requests from this sprint?", "weekly_reset",
            ["get_weekly_summary", "get_proactive_alerts"], ["founder_local_weekly_reset"],
            ["Output lists feature requests", "Output includes status per feature", "Output identifies blockers"]],
        ["Compare the user feedback for Feature A vs Feature B", "competitor_brief",
            ["compare_options"], ["founder_local_weekly_reset"],
            ["Output compares two features", "Output references user feedback", "Output includes a recommendation"]],
        ["Prepare a sprint retrospective document", "memo_export",
            ["export_artifact_packet", "get_weekly_summary"], ["run_recon"],
            ["Output follows retro format (what went well, what didn't, actions)", "Output is specific to the current sprint", "Output includes actionable improvements"]],
        ["What user-facing changes went live this week?", "important_change",
            ["get_important_changes", "get_weekly_summary"], ["founder_deep_context_gather"],
            ["Output lists specific changes", "Output focuses on user impact", "Output includes release dates"]],
        ["Create a PRD outline for the new onboarding flow", "delegation",
            ["export_artifact_packet"], ["run_recon"],
            ["Output follows PRD structure", "Output includes user stories or acceptance criteria", "Output is scoped appropriately"]],
        ["Research what competitors are doing with their onboarding", "company_search",
            ["founder_local_synthesize", "run_recon"], ["founder_local_weekly_reset"],
            ["Output identifies competitor onboarding approaches", "Output includes specific examples", "Output derives actionable insights"]],
        ["How has our feature velocity changed over the last 3 sprints?", "packet_diff",
            ["founder_packet_diff"], ["founder_local_weekly_reset"],
            ["Output tracks velocity over time", "Output identifies trends", "Output suggests causes for velocity changes"]],
        ["Delegate the user research interviews to the UX researcher", "delegation",
            ["export_artifact_packet"], ["founder_deep_context_gather"],
            ["Output includes interview script or topics", "Output specifies target user segments", "Output includes expected deliverables"]],
        ["Switch to content mode and draft the release notes", "role_switch",
            ["founder_local_synthesize"], [],
            ["Output shifts to content writing perspective", "Output drafts user-facing release notes", "Output is polished and non-technical"]],
        ["What are the top 5 user pain points from support tickets?", "company_search",
            ["run_recon", "discover_tools"], ["founder_local_weekly_reset"],
            ["Output lists specific pain points", "Output includes frequency or severity", "Output suggests product solutions"]],
        ["Flag any scope creep in the current sprint", "important_change",
            ["get_important_changes", "get_proactive_alerts"], ["founder_local_weekly_reset"],
            ["Output identifies scope additions", "Output assesses impact on timeline", "Output recommends scope management actions"]],
    ];
    return rows.map(([query, scenario, expectedTools, forbiddenTools, criteria]) => ({
        query,
        scenario,
        expectedTools,
        forbiddenTools,
        booleanCriteria: criteria.map((criterion) => ({ criterion, weight: 1 })),
    }));
}
207
/**
 * Builds the eval query templates returned for the "contractor" persona
 * (persona taken from the function name; the caller that assembles the
 * corpus is outside this block).
 *
 * Every template has the same five fields, in this order: `query`,
 * `scenario`, `expectedTools`, `forbiddenTools` (may be empty) and
 * `booleanCriteria`, a list of `{ criterion, weight }` checks. All
 * criteria in this set carry weight 1.
 *
 * A fresh array of fresh objects is produced on every call.
 *
 * @returns {Array<{query: string, scenario: string, expectedTools: string[], forbiddenTools: string[], booleanCriteria: Array<{criterion: string, weight: number}>}>}
 */
function contractorTemplates() {
    // Row layout: [query, scenario, expectedTools, forbiddenTools, criteria texts].
    const rows = [
        ["What's my task list for this week?", "weekly_reset",
            ["get_weekly_summary", "discover_tools"], ["founder_deep_context_gather"],
            ["Output lists specific tasks", "Output includes priorities", "Output is scoped to the contractor's role"]],
        ["Show me the project context I need to onboard", "company_search",
            ["get_project_context", "discover_tools"], ["founder_local_weekly_reset"],
            ["Output provides project overview", "Output includes key contacts or resources", "Output is onboarding-appropriate"]],
        ["Export my weekly deliverables report for the client", "memo_export",
            ["export_artifact_packet"], ["founder_deep_context_gather"],
            ["Output is client-facing in tone", "Output lists deliverables with status", "Output includes hours or effort summary"]],
        ["What changed in the project requirements since I was last briefed?", "important_change",
            ["get_important_changes"], ["founder_local_weekly_reset"],
            ["Output identifies specific requirement changes", "Output highlights impact on current work", "Output suggests clarification questions"]],
        ["Compare the scope of my current contract vs the original SOW", "packet_diff",
            ["founder_packet_diff"], ["founder_local_weekly_reset"],
            ["Output compares current vs original scope", "Output identifies scope expansion", "Output suggests contract amendment if needed"]],
        ["Find the coding standards document for this project", "company_search",
            ["discover_tools", "get_project_context"], ["founder_deep_context_gather"],
            ["Output helps locate documentation", "Output is specific to coding standards", "Output suggests follow-up resources"]],
        ["Delegate the testing tasks to the QA contractor", "delegation",
            ["export_artifact_packet"], ["founder_deep_context_gather"],
            ["Output contains test delegation details", "Output specifies test scope and criteria", "Output includes acceptance standards"]],
        ["Switch to PM mode to understand the feature priority", "role_switch",
            ["founder_local_synthesize"], [],
            ["Output adopts a PM perspective", "Output discusses prioritization frameworks", "Output helps contextualize current work"]],
        ["Flag any blockers that are preventing my progress", "important_change",
            ["founder_local_synthesize", "get_important_changes"], ["founder_local_weekly_reset"],
            ["Output identifies specific blockers", "Output suggests workarounds or escalation paths", "Output includes who can unblock"]],
        ["What tools are available for code review in this project?", "company_search",
            ["discover_tools"], ["founder_deep_context_gather"],
            ["Output lists relevant tools", "Output includes brief descriptions", "Output is filtered to code review context"]],
    ];
    return rows.map(([query, scenario, expectedTools, forbiddenTools, criteria]) => ({
        query,
        scenario,
        expectedTools,
        forbiddenTools,
        booleanCriteria: criteria.map((criterion) => ({ criterion, weight: 1 })),
    }));
}
221
+ function investorTemplates() {
222
+ return [
223
+ { query: "Run due diligence on this Series A deal with TechStartup Inc", scenario: "company_search", expectedTools: ["run_recon", "enrich_recon"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output follows due diligence structure", weight: 1 }, { criterion: "Output identifies key risk factors", weight: 1 }, { criterion: "Output does not fabricate valuation numbers", weight: 2 }, { criterion: "Output includes market context", weight: 1 }] },
224
+ { query: "What are the red flags in this company's pitch deck?", scenario: "company_search", expectedTools: ["founder_local_synthesize", "run_recon"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output identifies specific red flags", weight: 1 }, { criterion: "Output categorizes flags by severity", weight: 1 }, { criterion: "Output suggests follow-up questions", weight: 1 }] },
225
+ { query: "Compare the cap tables of our portfolio companies", scenario: "competitor_brief", expectedTools: ["compare_options", "run_recon"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output compares equity structures", weight: 1 }, { criterion: "Output identifies dilution risks", weight: 1 }, { criterion: "Output does not invent specific percentages", weight: 2 }] },
226
+ { query: "Prepare the quarterly LP update letter", scenario: "memo_export", expectedTools: ["export_artifact_packet"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output follows LP update format", weight: 1 }, { criterion: "Output covers portfolio performance, exits, and pipeline", weight: 1 }, { criterion: "Output is professional and measured in tone", weight: 1 }] },
227
+ { query: "What's changed in the macro environment that affects our thesis?", scenario: "important_change", expectedTools: ["get_important_changes", "run_recon"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output references macroeconomic factors", weight: 1 }, { criterion: "Output connects macro to investment thesis", weight: 1 }, { criterion: "Output is data-driven, not speculative", weight: 1 }] },
228
+ { query: "Track how our portfolio company valuations shifted this quarter", scenario: "packet_diff", expectedTools: ["founder_packet_diff"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output tracks valuation changes", weight: 1 }, { criterion: "Output identifies up-rounds and down-rounds", weight: 1 }, { criterion: "Output does not fabricate specific valuations", weight: 2 }] },
229
+ { query: "Delegate the market sizing analysis to the associate", scenario: "delegation", expectedTools: ["export_artifact_packet"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Output includes market sizing methodology", weight: 1 }, { criterion: "Output specifies data sources to use", weight: 1 }, { criterion: "Output includes expected deliverable format", weight: 1 }] },
230
+ { query: "Switch to founder mode and evaluate the product from a builder's lens", scenario: "role_switch", expectedTools: ["discover_tools"], forbiddenTools: ["check_contract_compliance"], booleanCriteria: [{ criterion: "Output shifts to builder/product perspective", weight: 1 }, { criterion: "Output evaluates technical feasibility", weight: 1 }, { criterion: "Output identifies product-market fit signals", weight: 1 }] },
231
+ { query: "Give me the weekly portfolio pulse", scenario: "weekly_reset", expectedTools: ["get_weekly_summary", "get_proactive_alerts"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output covers portfolio companies", weight: 1 }, { criterion: "Output highlights winners and at-risk companies", weight: 1 }, { criterion: "Output is concise for a weekly cadence", weight: 1 }] },
232
+ { query: "What deal flow came in this week worth evaluating?", scenario: "weekly_reset", expectedTools: ["get_weekly_summary", "get_important_changes"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output references deal flow", weight: 1 }, { criterion: "Output includes basic screening criteria", weight: 1 }, { criterion: "Output recommends which to pursue", weight: 1 }] },
233
+ { query: "Research the competitive landscape for this fintech vertical", scenario: "competitor_brief", expectedTools: ["founder_local_synthesize", "run_recon"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output maps the fintech competitive landscape", weight: 1 }, { criterion: "Output identifies market leaders and challengers", weight: 1 }, { criterion: "Output assesses white space opportunities", weight: 1 }] },
234
+ ];
235
+ }
236
/**
 * Handcrafted eval query templates for the "content" persona.
 *
 * Each row is [query, scenario, expectedTools, forbiddenTools, criteria].
 * Every criterion in this set has weight 1, so rows list the criterion text
 * only and the weight is attached when the rows are expanded.
 *
 * @returns {Array<object>} freshly built template objects with the keys
 *   query, scenario, expectedTools, forbiddenTools and booleanCriteria.
 */
function contentTemplates() {
  const unitWeight = (criterion) => ({ criterion, weight: 1 });
  const rows = [
    [
      "Draft a LinkedIn post about our latest product launch",
      "memo_export",
      ["export_artifact_packet", "compress_or_expand_text"],
      ["run_recon", "founder_local_weekly_reset"],
      ["Output is formatted for LinkedIn", "Output is under 300 words", "Output includes a hook and CTA"],
    ],
    [
      "What trending topics should we create content around this week?",
      "weekly_reset",
      ["get_weekly_summary", "get_important_changes"],
      ["founder_local_weekly_reset"],
      ["Output identifies trending topics", "Output connects trends to our brand", "Output suggests specific content formats"],
    ],
    [
      "Compare our content strategy against HubSpot and Buffer",
      "competitor_brief",
      ["founder_local_synthesize", "run_recon"],
      ["founder_local_weekly_reset"],
      ["Output compares content strategies", "Output identifies what competitors do better", "Output includes actionable takeaways"],
    ],
    [
      "Export the content calendar for next month",
      "memo_export",
      ["export_artifact_packet"],
      ["run_recon"],
      ["Output is calendar-structured", "Output includes content types and topics", "Output assigns rough dates"],
    ],
    [
      "What content performed best this month and why?",
      "important_change",
      ["get_important_changes", "get_weekly_summary"],
      ["founder_local_weekly_reset"],
      ["Output identifies top-performing content", "Output includes metrics or proxies for performance", "Output analyzes why it performed well"],
    ],
    [
      "Track how our messaging has evolved over the past quarter",
      "packet_diff",
      ["founder_packet_diff"],
      ["founder_local_weekly_reset"],
      ["Output tracks messaging evolution", "Output identifies key narrative shifts", "Output assesses consistency"],
    ],
    [
      "Delegate the blog post writing to the content contractor",
      "delegation",
      ["export_artifact_packet"],
      ["founder_deep_context_gather"],
      ["Output includes writing brief", "Output specifies tone, audience, and word count", "Output includes SEO keywords if relevant"],
    ],
    [
      "Research what type of content resonates in the AI/ML space on Twitter",
      "company_search",
      ["run_recon"],
      ["founder_local_weekly_reset"],
      ["Output identifies content types that perform well", "Output includes examples or patterns", "Output is specific to AI/ML audience"],
    ],
    [
      "Switch to researcher mode to find data points for the whitepaper",
      "role_switch",
      ["founder_local_synthesize"],
      [],
      ["Output shifts to research perspective", "Output identifies relevant data sources", "Output suggests citation-worthy statistics"],
    ],
    [
      "Create a brand voice guideline document",
      "memo_export",
      ["export_artifact_packet"],
      ["run_recon"],
      ["Output follows brand voice guide structure", "Output includes tone, vocabulary, and examples", "Output is usable by external writers"],
    ],
    [
      "What changes should we make to our newsletter strategy?",
      "important_change",
      ["get_important_changes", "get_weekly_summary"],
      ["founder_local_weekly_reset"],
      ["Output assesses current newsletter performance", "Output suggests specific improvements", "Output is based on audience data or trends"],
    ],
  ];
  return rows.map(([query, scenario, expectedTools, forbiddenTools, criteria]) => ({
    query,
    scenario,
    expectedTools,
    forbiddenTools,
    booleanCriteria: criteria.map(unitWeight),
  }));
}
251
/**
 * Build generic "filler" query templates that top a persona up from
 * `existingCount` handcrafted entries to `targetCount` entries.
 *
 * Scenarios are drawn round-robin from SCENARIOS; the phrasing used for a
 * scenario advances by one each time the whole scenario pool has been cycled.
 * `{company}`, `{task}` and `{role}` placeholders are filled from fixed lists,
 * indexed by `existingCount` plus the filler's position.
 *
 * @param persona - not read by this function.
 * @param existingCount - number of handcrafted templates the persona already has.
 * @param targetCount - desired number of templates for the persona.
 * @returns {Array<object>} `targetCount - existingCount` templates (empty when
 *   that difference is not positive), each with query, scenario,
 *   expectedTools, forbiddenTools and booleanCriteria.
 */
function generateFillerQueries(persona, existingCount, targetCount) {
  const scenarioPool = SCENARIOS;
  const phrasings = {
    weekly_reset: [
      "Summarize the key metrics from this week",
      "What progress did we make on our top priorities?",
      "List the unresolved items from last week's review",
      "Highlight any trends I should be aware of this week",
      "What's the team's bandwidth looking like this week?",
      "Give me a one-paragraph summary of where we stand",
    ],
    company_search: [
      "What do we know about {company} and their recent activity?",
      "Research the market position of {company}",
      "Pull the latest information on {company}'s product lineup",
      "What is {company} doing differently than last quarter?",
      "Find information about {company}'s team and leadership",
      "What public information is available about {company}'s strategy?",
    ],
    competitor_brief: [
      "How does our approach compare to the industry standard?",
      "What are our competitors doing that we're not?",
      "Rank our top 3 competitors by threat level",
      "Identify the white space in our competitive landscape",
      "What moats do we have that competitors lack?",
    ],
    delegation: [
      "Create a delegation brief for the {task} project",
      "Package the {task} instructions for the team lead",
      "Write up the handoff notes for {task}",
      "Prepare a scope document for delegating {task}",
      "Draft the assignment brief for {task} with clear success criteria",
    ],
    important_change: [
      "What changed since yesterday that I should know about?",
      "Are there any new risks or opportunities this week?",
      "Flag anything that's different from our last check-in",
      "What signals should I be paying attention to right now?",
      "Identify the most impactful change in our environment today",
    ],
    memo_export: [
      "Export a summary document of our current status",
      "Package our findings into a shareable format",
      "Create an executive summary for external stakeholders",
      "Prepare a brief for the upcoming meeting",
      "Format our analysis as a polished report",
    ],
    packet_diff: [
      "How has our position changed since last month?",
      "Compare today's state to where we were last quarter",
      "What's the delta between our current and previous assessments?",
      "Track the evolution of our strategy over the past 3 months",
      "Show me what shifted between the last two snapshots",
    ],
    role_switch: [
      "Switch perspective and analyze this from a different angle",
      "Look at this problem through a {role} lens",
      "Change my viewpoint to evaluate this differently",
      "Adopt a {role} perspective on the current situation",
    ],
  };
  // scenario -> [expected tool, forbidden tool]
  const toolPairs = {
    weekly_reset: ["get_weekly_summary", "founder_local_weekly_reset"],
    company_search: ["run_recon", "founder_local_weekly_reset"],
    competitor_brief: ["compare_options", "founder_local_weekly_reset"],
    delegation: ["export_artifact_packet", "founder_local_weekly_reset"],
    important_change: ["get_important_changes", "founder_local_weekly_reset"],
    memo_export: ["export_artifact_packet", "run_recon"],
    packet_diff: ["founder_packet_diff", "founder_local_weekly_reset"],
    role_switch: ["discover_tools", "founder_local_weekly_reset"],
  };
  const companies = ["Acme Corp", "TechCo", "FinanceHub", "DataWorks", "CloudFirst", "MetaScale", "NeuralPath"];
  const tasks = ["onboarding redesign", "quarterly review", "budget allocation", "tool evaluation", "process audit"];
  const roles = ["banker", "researcher", "operator", "investor", "legal"];

  const needed = targetCount - existingCount;
  const fillers = [];
  for (let n = 0; n < needed; n++) {
    // Placeholder values are indexed from the persona's running query count.
    const seed = existingCount + n;
    const scenario = scenarioPool[n % scenarioPool.length];
    const options = phrasings[scenario];
    const template = options[Math.floor(n / scenarioPool.length) % options.length];
    const queryText = template
      .replace("{company}", companies[seed % companies.length])
      .replace("{task}", tasks[seed % tasks.length])
      .replace("{role}", roles[seed % roles.length]);
    const pair = toolPairs[scenario];
    fillers.push({
      query: queryText,
      scenario,
      expectedTools: pair ? [pair[0]] : [],
      forbiddenTools: pair ? [pair[1]] : [],
      booleanCriteria: [
        { criterion: "Tool returned valid structured JSON or object data, not an error", weight: 2 },
        { criterion: "Tool output contains at least one field relevant to the query topic", weight: 1 },
        { criterion: "Expected tools were invoked without throwing unhandled exceptions", weight: 2 },
      ],
    });
  }
  return fillers;
}
379
/**
 * Assemble the evaluation corpus.
 *
 * For every persona in PERSONAS the handcrafted templates are padded with
 * generated fillers up to 46 entries, each entry receives a per-persona id of
 * the form `<persona>_NNN`, and the concatenated list is cut to its first 500
 * entries.
 *
 * @returns {Array<object>} corpus entries with id, query, persona, scenario,
 *   expectedTools, forbiddenTools and booleanCriteria.
 */
export function generateQueryCorpus() {
  const PER_PERSONA = 46; // 11 personas * 46 = 506, cut back to 500 below
  const CORPUS_SIZE = 500;
  const buildersByPersona = {
    founder: founderTemplates,
    banker: bankerTemplates,
    ceo: ceoTemplates,
    researcher: researcherTemplates,
    student: studentTemplates,
    operator: operatorTemplates,
    legal: legalTemplates,
    pm: pmTemplates,
    contractor: contractorTemplates,
    investor: investorTemplates,
    content: contentTemplates,
  };
  const corpus = [];
  for (const persona of PERSONAS) {
    const curated = buildersByPersona[persona]();
    const padding = generateFillerQueries(persona, curated.length, PER_PERSONA);
    [...curated, ...padding].forEach((template, position) => {
      const { query, scenario, expectedTools, forbiddenTools, booleanCriteria } = template;
      corpus.push({
        id: `${persona}_${String(position + 1).padStart(3, "0")}`,
        query,
        persona,
        scenario,
        expectedTools,
        forbiddenTools,
        booleanCriteria,
      });
    });
  }
  // Keep only the first 500 entries
  return corpus.slice(0, CORPUS_SIZE);
}
417
+ // ══════════════════════════════════════════════════════════════════════════════
418
+ // TOOL EXECUTOR
419
+ // ══════════════════════════════════════════════════════════════════════════════
420
/**
 * Look up a tool by exact name in a flat tool array.
 *
 * @param tools - array of objects exposing a `name` property.
 * @param name - tool name to match with strict equality.
 * @returns the first matching tool, or `null` when none matches.
 */
function findTool(tools, name) {
  const position = tools.findIndex((candidate) => candidate.name === name);
  return position === -1 ? null : tools[position];
}
424
/**
 * Invoke `tool.handler(args)` without letting a failure escape.
 *
 * @param tool - object exposing a `handler` function.
 * @param args - argument object passed to the handler (defaults to `{}`).
 * @returns a promise resolving to `{ ok: true, result, ms }` on success, or
 *   `{ ok: false, result: null, error, ms }` when the handler throws or
 *   rejects; `ms` is the elapsed wall-clock time in milliseconds.
 */
async function callTool(tool, args = {}) {
  const startedAt = Date.now();
  const elapsed = () => Date.now() - startedAt;
  let result;
  try {
    result = await tool.handler(args);
  }
  catch (err) {
    // Non-Error throwables have no message; fall back to their string form.
    const error = err?.message ?? String(err);
    return { ok: false, result: null, error, ms: elapsed() };
  }
  return { ok: true, result, ms: elapsed() };
}
435
/**
 * Flatten a tool result into text for the judge, preferring human-readable
 * fields over raw JSON.
 *
 * Resolution order:
 *  1. falsy result            -> "(null)"
 *  2. string                  -> returned unchanged
 *  3. array of content blocks -> `text` of every `type: "text"` block, joined by newlines
 *  4. object                  -> known prose fields (memo, enrichedPrompt,
 *     systemPromptPrefix, researchPlan.externalSources,
 *     canonicalEntity.canonicalMission, whatChanged, nextActions, signals,
 *     contradictions) joined by blank lines, capped at 4000 characters
 *  5. any other object/array  -> JSON, capped at 2000 characters
 *  6. everything else         -> String(result)
 *
 * List-like fields tolerate a scalar in place of an array and skip
 * null/undefined items, so a malformed tool result degrades to text instead
 * of throwing. Fields that produce no text are ignored, so a result holding
 * only empty lists still reaches the JSON fallback.
 *
 * @param result - raw value returned by a tool handler.
 * @returns {string} text representation of the result.
 */
function extractText(result) {
  if (!result)
    return "(null)";
  if (typeof result === "string")
    return result;
  if (Array.isArray(result)) {
    const texts = result
      .filter((b) => b?.type === "text")
      .map((b) => b.text);
    if (texts.length)
      return texts.join("\n");
  }
  if (typeof result === "object") {
    const obj = result;
    const asList = (value) => (Array.isArray(value) ? value : [value]);
    // One line per item: the named property when present, else the item's string form.
    const labelled = (value, key) => asList(value)
      .filter((item) => item != null)
      .map((item) => item[key] ?? String(item))
      .join("\n");
    // Prioritize human-readable fields (heuristic judge needs prose, not JSON)
    const sections = [];
    if (obj.memo)
      sections.push(String(obj.memo));
    if (obj.enrichedPrompt)
      sections.push(String(obj.enrichedPrompt));
    if (obj.systemPromptPrefix)
      sections.push(String(obj.systemPromptPrefix));
    if (obj.researchPlan?.externalSources)
      sections.push(asList(obj.researchPlan.externalSources).join("\n"));
    if (obj.canonicalEntity?.canonicalMission)
      sections.push(obj.canonicalEntity.canonicalMission);
    if (obj.whatChanged)
      sections.push(labelled(obj.whatChanged, "description"));
    if (obj.nextActions)
      sections.push(labelled(obj.nextActions, "action"));
    if (obj.signals)
      sections.push(labelled(obj.signals, "name"));
    if (obj.contradictions)
      sections.push(labelled(obj.contradictions, "claim"));
    const prose = sections.filter((section) => String(section).length > 0);
    if (prose.length > 0)
      return prose.join("\n\n").slice(0, 4000);
    try {
      const json = JSON.stringify(result);
      if (typeof json === "string")
        return json.slice(0, 2000);
    }
    catch {
      // Circular references or BigInt values cannot be serialized; use String() below.
    }
  }
  return String(result);
}
476
/**
 * Names of tools that depend on Convex/the gateway. The executor records
 * these as skipped rather than running them and counting a failure.
 */
const GATEWAY_DEPENDENT_TOOLS = new Set(["founder_packet_validate"]);
480
/**
 * Lower-case substrings of an error message that mark a failure as caused by
 * missing seed data; such failures are retried once after seeding context.
 */
const SEED_NEEDED_PATTERNS = [
  // missing session / packet state
  "session not found", "no packets", "no session",
  // generic "nothing there" wording
  "not found", "no rows", "no data", "empty result", "does not exist",
];
491
/**
 * Simulate the tool chain an agent would follow for one corpus query.
 *
 * Steps, in order:
 *  1. call `discover_tools` (when available) with the query text;
 *  2. derive the effective tool list from `query.expectedTools`, adding
 *     `founder_local_synthesize` for scenarios that need a structured packet;
 *  3. run scenario-specific seeding / substitution (competitor_brief,
 *     packet_diff, delegation);
 *  4. execute each effective tool, retrying once after seeding context when
 *     the error text matches SEED_NEEDED_PATTERNS.
 *
 * A tool that ran but failed is still recorded as fired, with an
 * `ERROR: ...` output. Tools that cannot be found are ignored.
 *
 * @param query - corpus entry (reads `query`, `scenario`, `expectedTools`).
 * @param allTools - flat array of tool objects.
 * @returns a promise resolving to `{ toolsFired, outputs, totalMs, skipped }`.
 */
async function executeQueryTools(query, allTools) {
  const SYNTHESIZER = "founder_local_synthesize";
  const CONTEXT_GATHER = "founder_deep_context_gather";
  const toolsFired = [];
  const outputs = {};
  const skipped = [];
  let totalMs = 0;

  // Call a tool and add its latency to the running total.
  const timedCall = async (tool, args) => {
    const outcome = await callTool(tool, args);
    totalMs += outcome.ms;
    return outcome;
  };
  const recordSuccess = (name, outcome) => {
    toolsFired.push(name);
    outputs[name] = extractText(outcome.result);
  };
  // Tool fired but errored — still counts as fired
  const recordFailure = (name, outcome) => {
    toolsFired.push(name);
    outputs[name] = `ERROR: ${outcome.error}`;
  };

  // 1. Try discover_tools to find relevant tools
  const discoverTool = findTool(allTools, "discover_tools");
  if (discoverTool) {
    const discovery = await timedCall(discoverTool, { query: query.query, limit: 10 });
    if (discovery.ok) {
      recordSuccess("discover_tools", discovery);
    }
  }

  // 2. Effective tool list — scenarios that need rich output get the synthesizer
  const effectiveTools = [...query.expectedTools];
  const synthesizerScenarios = ["role_switch", "important_change", "competitor_brief", "delegation", "memo_export", "packet_diff"];
  if (!effectiveTools.includes(SYNTHESIZER) && synthesizerScenarios.includes(query.scenario)) {
    effectiveTools.push(SYNTHESIZER);
  }

  // 2b. Scenario-specific seeding: seed data before tools that need prior state
  switch (query.scenario) {
    case "competitor_brief": {
      // Seed a recon session so subsequent tools have context
      const reconTool = findTool(allTools, "run_recon");
      if (reconTool && !effectiveTools.includes("run_recon")) {
        const seeded = await timedCall(reconTool, { target: "Supermemory", scope: "competitive analysis" });
        if (seeded.ok) {
          recordSuccess("run_recon", seeded);
        }
      }
      break;
    }
    case "packet_diff": {
      // Seed a founder packet so diff tools have something to compare
      const gatherTool = findTool(allTools, CONTEXT_GATHER);
      if (gatherTool && !effectiveTools.includes(CONTEXT_GATHER)) {
        const seeded = await timedCall(gatherTool, { query: "seed context for diff" });
        if (seeded.ok) {
          recordSuccess(CONTEXT_GATHER, seeded);
        }
      }
      break;
    }
    case "delegation": {
      // Swap the gateway-dependent validator for the context-gather + decision-memo chain
      const validatorAt = effectiveTools.indexOf("founder_packet_validate");
      if (validatorAt !== -1) {
        effectiveTools.splice(validatorAt, 1);
        skipped.push("founder_packet_validate");
        if (!effectiveTools.includes(CONTEXT_GATHER)) {
          effectiveTools.push(CONTEXT_GATHER);
        }
        if (!effectiveTools.includes("render_decision_memo") && findTool(allTools, "render_decision_memo")) {
          effectiveTools.push("render_decision_memo");
        }
      }
      break;
    }
  }

  // 3. Execute each expected tool
  for (const toolName of effectiveTools) {
    if (toolName === "discover_tools") {
      continue; // already called
    }
    if (GATEWAY_DEPENDENT_TOOLS.has(toolName)) {
      skipped.push(toolName);
      continue;
    }
    const tool = findTool(allTools, toolName);
    if (!tool) {
      continue;
    }
    // Build minimal args based on tool name patterns
    const args = buildMinimalArgs(toolName, query);
    const firstTry = await timedCall(tool, args);
    if (firstTry.ok) {
      recordSuccess(toolName, firstTry);
      continue;
    }
    // A "needs seed data" error earns one retry after seeding context
    const errorLower = (firstTry.error ?? "").toLowerCase();
    if (!SEED_NEEDED_PATTERNS.some((pattern) => errorLower.includes(pattern))) {
      recordFailure(toolName, firstTry);
      continue;
    }
    const gatherTool = findTool(allTools, CONTEXT_GATHER);
    if (gatherTool) {
      const seeded = await timedCall(gatherTool, { query: query.query });
      if (seeded.ok && !toolsFired.includes(CONTEXT_GATHER)) {
        recordSuccess(CONTEXT_GATHER, seeded);
      }
    }
    const retry = await timedCall(tool, args);
    if (retry.ok) {
      recordSuccess(toolName, retry);
    }
    else {
      recordFailure(toolName, retry);
    }
  }
  return { toolsFired, outputs, totalMs, skipped };
}
616
/**
 * Build the minimal argument object for a tool call from the query context.
 *
 * A company name is taken from the query when one of the prepositions
 * "about", "on", "for" or "with" is directly followed by a run of capitalised
 * words (e.g. "about Acme Corp and their ..." -> "Acme Corp"); otherwise
 * "NodeBench" is used. The capture stops at the first non-capitalised word so
 * the rest of the sentence is not swallowed, and the preposition must start
 * at a word boundary so that e.g. "reason X" does not match "on".
 *
 * @param toolName - name of the tool about to be called.
 * @param query - corpus entry (reads `query` and `scenario`).
 * @returns {object} arguments for the tool; unknown tools get `{ query }`.
 */
function buildMinimalArgs(toolName, query) {
  const companyMatch = query.query.match(/\b(?:about|on|for|with)\s+([A-Z][A-Za-z0-9]*(?:\s+[A-Z][A-Za-z0-9]*)*)/);
  const company = companyMatch ? companyMatch[1] : "NodeBench";
  switch (toolName) {
    case "run_recon":
    case "enrich_recon":
    case "get_recon_summary":
      return { target: company };
    case "founder_local_synthesize": {
      // Route to the right packet type based on scenario
      const ptMap = {
        weekly_reset: "weekly_reset",
        important_change: "important_change",
        delegation: "pre_delegation",
        competitor_brief: "competitor_brief",
        role_switch: "role_switch",
        memo_export: "weekly_reset",
        packet_diff: "weekly_reset",
      };
      return { packetType: ptMap[query.scenario] ?? "weekly_reset", daysBack: 7, query: query.query };
    }
    // Tools called with no arguments
    case "founder_local_weekly_reset":
    case "founder_packet_diff":
    case "founder_packet_history_diff":
    case "founder_packet_validate":
    case "get_weekly_summary":
    case "get_proactive_alerts":
    case "get_important_changes":
    case "get_ops_dashboard":
    case "check_mcp_setup":
    case "get_project_context":
      return {};
    case "flag_important_change":
      return { description: query.query };
    case "export_artifact_packet":
      return { title: `Export for: ${query.query.slice(0, 60)}` };
    case "compare_options":
      return { options: [company, "Competitor"], criteria: ["market position", "product quality"] };
    case "build_research_digest":
      return { topic: query.query };
    case "compress_or_expand_text":
      return { text: query.query, mode: "compress" };
    // founder_deep_context_gather, founder_local_gather, check_contract_compliance,
    // discover_tools and any unknown tool all take the raw query text.
    default:
      return { query: query.query };
  }
}
683
+ // ══════════════════════════════════════════════════════════════════════════════
684
+ // LLM JUDGE — Gemini Flash Lite
685
+ // ══════════════════════════════════════════════════════════════════════════════
686
/**
 * Judge model name, overridable through the GEMINI_MODEL environment
 * variable. A blank or whitespace-only value counts as unset, because it
 * would otherwise produce an endpoint URL with an empty model segment.
 */
const GEMINI_MODEL = process.env.GEMINI_MODEL?.trim() || "gemini-3.1-flash-lite-preview";
/** generateContent endpoint for the selected judge model. */
const GEMINI_URL = `https://generativelanguage.googleapis.com/v1beta/models/${GEMINI_MODEL}:generateContent`;
688
/**
 * Ask the Gemini judge to evaluate a query's tool outputs against its boolean
 * criteria, falling back to the local heuristic judge whenever the LLM path
 * is unavailable or returns something unusable.
 *
 * The heuristic fallback is used when GEMINI_API_KEY is unset, the HTTP
 * response is not OK, the response carries no text, the text is not JSON with
 * a `criteria` array, or the request throws (including the 30 s timeout).
 *
 * The API key is sent in the `x-goog-api-key` request header rather than in
 * the URL query string, so it does not end up in URLs that get logged.
 *
 * @param query - corpus entry (reads `persona`, `query`, `scenario`, `booleanCriteria`).
 * @param toolOutputs - map of tool name to extracted output text.
 * @returns a promise resolving to `{ response, judgeType }` where `judgeType`
 *   is "gemini" or "heuristic".
 */
async function callGeminiJudge(query, toolOutputs) {
  const heuristicFallback = () => ({ response: heuristicJudge(query, toolOutputs), judgeType: "heuristic" });
  const apiKey = process.env.GEMINI_API_KEY;
  if (!apiKey) {
    return heuristicFallback();
  }
  const combinedOutput = Object.entries(toolOutputs)
    .map(([tool, out]) => `[${tool}]:\n${out}`)
    .join("\n\n---\n\n");
  const criteriaList = query.booleanCriteria
    .map((c, i) => `${i + 1}. ${c.criterion} (weight: ${c.weight})`)
    .join("\n");
  // Only the first 6000 characters of the combined tool output go into the prompt.
  const prompt = `You are an evaluation judge for NodeBench MCP — a tool-based system that returns STRUCTURED DATA (JSON objects, arrays, database rows), NOT prose.

A user with the role "${query.persona}" asked: "${query.query}"
Scenario type: ${query.scenario}

The system invoked MCP tools and produced these structured outputs:

${combinedOutput.slice(0, 6000)}

IMPORTANT: MCP tools return raw structured data (JSON, objects, arrays). They are NOT expected to produce prose or narratives. A tool returning {"events": [], "count": 0} is valid structured output. Judge whether the DATA is correct, not whether it reads like a human answer.

Evaluation rules:
- "valid structured JSON or object data" PASSES if output is parseable data (even empty arrays/objects), FAILS only on error messages or stack traces
- "contains at least one field relevant" PASSES if any key or value relates to the query topic
- "without throwing unhandled exceptions" PASSES if no stack traces or unhandled errors appear

Criteria:
${criteriaList}

Respond ONLY with valid JSON (no markdown):
{"criteria":[{"criterion":"...","pass":true,"evidence":"brief reason"},...],"overallPass":true}`;
  try {
    const response = await fetch(GEMINI_URL, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "x-goog-api-key": apiKey,
      },
      body: JSON.stringify({
        contents: [{ parts: [{ text: prompt }] }],
        generationConfig: {
          temperature: 0.1,
          maxOutputTokens: 1024,
          responseMimeType: "application/json",
        },
      }),
      signal: AbortSignal.timeout(30_000),
    });
    if (!response.ok) {
      console.error(`Gemini API error: ${response.status} ${response.statusText}`);
      return heuristicFallback();
    }
    const json = await response.json();
    const text = json?.candidates?.[0]?.content?.parts?.[0]?.text;
    if (!text) {
      return heuristicFallback();
    }
    const parsed = JSON.parse(text);
    // Validate structure; `parsed` may be null or a primitive if the model misbehaves.
    if (!Array.isArray(parsed?.criteria)) {
      return heuristicFallback();
    }
    return { response: parsed, judgeType: "gemini" };
  }
  catch (err) {
    // Rejections are not guaranteed to be Error instances.
    console.error(`Gemini judge error: ${err?.message ?? String(err)}`);
    return heuristicFallback();
  }
}
755
/** Common words that are ignored when matching query keywords against tool output. */
const STOPWORDS = new Set([
  "the and for with that this from what how",
  "are our did has have been about their which",
  "should give show tell help need want does any",
  "all most more than into also just each some",
].flatMap((line) => line.split(" ")));
762
/** Case-sensitive substrings whose presence in tool output marks a genuine tool failure. */
const ERROR_PATTERNS = [
  "Error:",
  "error:",
  "ENOENT",
  "ECONNREFUSED",
  "stack trace",
  "at Object.",
  "TypeError",
  "ReferenceError",
  "SyntaxError",
  "RangeError",
  "EPERM",
  "EACCES",
  "UnhandledPromiseRejection",
  "Cannot read properties",
];
769
+ /** Heuristic fallback judge — lenient data-oriented matching for MCP tool outputs */
770
+ function heuristicJudge(query, toolOutputs) {
771
+ const combined = Object.values(toolOutputs).join(" ");
772
+ const combinedLower = combined.toLowerCase();
773
+ const outputValues = Object.values(toolOutputs);
774
+ const hasAnyError = ERROR_PATTERNS.some((p) => combined.includes(p));
775
+ const nonEmptyOutputCount = outputValues.filter((v) => v.length > 0 && v !== "(null)").length;
776
+ const criteria = query.booleanCriteria.map((bc) => {
777
+ const criterion = bc.criterion.toLowerCase();
778
+ let pass = false;
779
+ let evidence = "heuristic: ";
780
+ // ── "Tool returned structured data without errors" ──
781
+ if (criterion.includes("structured data without errors") || criterion.includes("returned structured data")) {
782
+ pass = combined.length > 0 && !hasAnyError;
783
+ evidence += pass ? "non-empty output, no error patterns" : (hasAnyError ? `error pattern found` : "empty output");
784
+ return { criterion: bc.criterion, pass, evidence };
785
+ }
786
+ // ── "At least one expected tool completed successfully" ──
787
+ if (criterion.includes("at least one expected tool completed") || criterion.includes("expected tool completed successfully")) {
788
+ const expectedInOutput = query.expectedTools.some((t) => {
789
+ const out = toolOutputs[t];
790
+ return out !== undefined && out.length > 0 && out !== "(null)";
791
+ });
792
+ pass = expectedInOutput || nonEmptyOutputCount > 0;
793
+ evidence += pass ? `${nonEmptyOutputCount} tools produced output` : "no tools produced output";
794
+ return { criterion: bc.criterion, pass, evidence };
795
+ }
796
+ // ── "No error messages or stack traces in output" ──
797
+ if (criterion.includes("no error messages") || criterion.includes("no error") || criterion.includes("stack traces")) {
798
+ // Only fail on genuine error/stack-trace patterns
799
+ const hasStackTrace = /at\s+\w+\s+\(/.test(combined) || /^\s+at\s+/m.test(combined);
800
+ const hasFatalError = ["TypeError", "ReferenceError", "SyntaxError", "RangeError", "ENOENT", "ECONNREFUSED"]
801
+ .some((p) => combined.includes(p));
802
+ pass = !hasStackTrace && !hasFatalError;
803
+ evidence += pass ? "no stack traces or fatal errors" : "stack trace or fatal error found";
804
+ return { criterion: bc.criterion, pass, evidence };
805
+ }
806
+ // ── "Output contains entity or topic names from the query" ──
807
+ if (criterion.includes("entity or topic names") || criterion.includes("topic names from the query")) {
808
+ const queryWords = query.query.toLowerCase()
809
+ .replace(/[^a-z0-9\s]/g, "")
810
+ .split(/\s+/)
811
+ .filter((w) => w.length > 3 && !STOPWORDS.has(w));
812
+ const found = queryWords.filter((w) => combinedLower.includes(w));
813
+ pass = found.length > 0 || combined.length > 50; // any query word match OR substantive output
814
+ evidence += pass ? `matched: ${found.slice(0, 5).join(", ") || "substantive output"}` : "no query words found";
815
+ return { criterion: bc.criterion, pass, evidence };
816
+ }
817
+ // ── "Output includes quantitative data points" ──
818
+ if (criterion.includes("quantitative data") || criterion.includes("data points") || criterion.includes("metrics")) {
819
+ pass = /\d/.test(combined);
820
+ evidence += pass ? "contains digits" : "no digits found";
821
+ return { criterion: bc.criterion, pass, evidence };
822
+ }
823
+ // ── "Output contains temporal information" ──
824
+ if (criterion.includes("temporal information") || criterion.includes("dates, timestamps") || criterion.includes("timestamps, periods")) {
825
+ // Pass if output contains any 4-digit number (year) or date-like pattern
826
+ pass = /\d{4}/.test(combined) || /\d{1,2}[\/\-\.]\d{1,2}/.test(combined) || /(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec|week|month|year|day|hour|today|yesterday|ago|recent)/i.test(combined);
827
+ evidence += pass ? "temporal pattern found" : "no temporal patterns";
828
+ return { criterion: bc.criterion, pass, evidence };
829
+ }
830
+ // ── "Output structure matches tool's documented schema" ──
831
+ if (criterion.includes("structure matches") || criterion.includes("documented schema") || criterion.includes("matches the tool")) {
832
+ // Pass if output contains any structured marker
833
+ pass = combined.includes("{") || combined.includes("[") || combined.includes(":");
834
+ evidence += pass ? "structured markers found" : "no structured markers";
835
+ return { criterion: bc.criterion, pass, evidence };
836
+ }
837
+ // ── "Multiple tools in the chain produced non-empty results" ──
838
+ if (criterion.includes("multiple tools") || criterion.includes("chain produced")) {
839
+ pass = nonEmptyOutputCount >= 2;
840
+ evidence += pass ? `${nonEmptyOutputCount} tools produced output` : `only ${nonEmptyOutputCount} tool(s) produced output`;
841
+ return { criterion: bc.criterion, pass, evidence };
842
+ }
843
+ // ── Negation patterns: "does not hallucinate/fabricate/invent" ──
844
+ if (criterion.includes("not hallucinate") || criterion.includes("not fabricate") || criterion.includes("not invent") || criterion.includes("does not")) {
845
+ pass = combined.length > 0 && combined.length < 50000;
846
+ evidence += pass ? "output exists and is reasonable length" : "output suspicious length";
847
+ return { criterion: bc.criterion, pass, evidence };
848
+ }
849
+ // ── Content/format checks: "Output mentions/contains/references X" ──
850
+ const mentionsMatch = criterion.match(/(?:mentions?|contains?|references?|includes?|lists?)\s+(.+)/);
851
+ if (mentionsMatch) {
852
+ const keywords = mentionsMatch[1]
853
+ .replace(/[^a-z0-9\s]/g, "")
854
+ .split(/\s+/)
855
+ .filter((w) => w.length > 2);
856
+ const found = keywords.filter((k) => combinedLower.includes(k));
857
+ pass = found.length > 0 || combined.length > 50;
858
+ evidence += pass ? `found keywords: ${found.join(", ") || "substantive output"}` : `missing keywords from: ${keywords.join(", ")}`;
859
+ return { criterion: bc.criterion, pass, evidence };
860
+ }
861
+ // ── "Output is [adjective]" or "Output follows [format]" ──
862
+ if (criterion.includes("output is ") || criterion.includes("output follows") || criterion.includes("output uses")) {
863
+ pass = combined.length > 10;
864
+ evidence += pass ? "output is substantive" : "output too short";
865
+ return { criterion: bc.criterion, pass, evidence };
866
+ }
867
+ // ── "No [bad thing]" ──
868
+ if (criterion.startsWith("no ")) {
869
+ pass = !hasAnyError;
870
+ evidence += pass ? "no error patterns detected" : "error pattern detected";
871
+ return { criterion: bc.criterion, pass, evidence };
872
+ }
873
+ // ── Default: pass if output is non-empty ──
874
+ pass = combined.length > 0 && combined !== "(null)";
875
+ evidence += pass ? "non-empty output" : "output empty";
876
+ return { criterion: bc.criterion, pass, evidence };
877
+ });
878
+ // Pass if >=60% of weighted criteria pass (lenient for heuristic judge)
879
+ let weightedPass = 0, totalWeight = 0;
880
+ for (let i = 0; i < criteria.length; i++) {
881
+ const w = query.booleanCriteria[i]?.weight ?? 1;
882
+ totalWeight += w;
883
+ if (criteria[i].pass)
884
+ weightedPass += w;
885
+ }
886
+ const overallPass = totalWeight > 0 ? (weightedPass / totalWeight) >= 0.60 : false;
887
+ return { criteria, overallPass };
888
+ }
889
+ // ══════════════════════════════════════════════════════════════════════════════
890
+ // BOOLEAN METRICS
891
+ // ══════════════════════════════════════════════════════════════════════════════
892
/**
 * Tool precision: the fraction of distinct tools that fired which were also
 * expected, i.e. |expected ∩ fired| / |fired|.
 *
 * The previous implementation divided by the number of expected tools, which
 * is the definition of recall, so extra (unexpected) tool calls never lowered
 * the reported precision.
 *
 * @param {string[]} expectedTools - Tool names the query is expected to trigger.
 * @param {string[]} toolsFired - Tool names that actually ran (duplicates are ignored).
 * @returns {number} Value in [0, 1]. When nothing fired the result is 1 if
 *   nothing was expected either, otherwise 0.
 */
function computeToolPrecision(expectedTools, toolsFired) {
    const expected = new Set(expectedTools);
    const fired = new Set(toolsFired);
    if (fired.size === 0) {
        // No calls at all: vacuously precise only when no calls were wanted.
        return expected.size === 0 ? 1 : 0;
    }
    let hits = 0;
    for (const tool of fired) {
        if (expected.has(tool)) {
            hits++;
        }
    }
    return hits / fired.size;
}
904
/**
 * Tool recall: the fraction of distinct expected tools that actually fired,
 * i.e. |expected ∩ fired| / |expected|.
 *
 * The previous implementation divided by the number of fired tools, which is
 * the definition of precision, so a run that fired only one of several
 * expected tools could still report a recall of 1.
 *
 * @param {string[]} expectedTools - Tool names the query is expected to trigger.
 * @param {string[]} toolsFired - Tool names that actually ran (duplicates are ignored).
 * @returns {number} Value in [0, 1]. Returns 1 when no tools are expected.
 */
function computeToolRecall(expectedTools, toolsFired) {
    const expected = new Set(expectedTools);
    if (expected.size === 0) {
        // Nothing to find, so nothing was missed.
        return 1;
    }
    const fired = new Set(toolsFired);
    let hits = 0;
    for (const tool of expected) {
        if (fired.has(tool)) {
            hits++;
        }
    }
    return hits / expected.size;
}
916
/**
 * Counts how many entries of `forbiddenTools` appear among the fired tools.
 * Each entry of `forbiddenTools` is checked independently, so a name listed
 * twice there is counted twice.
 *
 * @param {string[]} forbiddenTools - Tool names that must not run.
 * @param {string[]} toolsFired - Tool names that actually ran.
 * @returns {number} Number of forbidden entries that fired.
 */
function countForbiddenViolations(forbiddenTools, toolsFired) {
    const firedLookup = new Set(toolsFired);
    let violations = 0;
    for (const name of forbiddenTools) {
        if (firedLookup.has(name)) {
            violations += 1;
        }
    }
    return violations;
}
920
/**
 * Weighted share of judged criteria that passed.
 *
 * Weights are looked up positionally in `booleanCriteria`; a missing entry or
 * a null/undefined weight counts as 1.
 *
 * @param {{pass: boolean}[]} criteria - Judge verdicts, one per criterion.
 * @param {{weight?: number}[]} booleanCriteria - Criterion definitions carrying the weights.
 * @returns {number} passedWeight / totalWeight, or 0 when there are no
 *   criteria or the total weight is not positive.
 */
function computeCriteriaPassRate(criteria, booleanCriteria) {
    if (criteria.length === 0) {
        return 0;
    }
    const tally = criteria.reduce((acc, verdict, idx) => {
        const w = booleanCriteria[idx]?.weight ?? 1;
        acc.all += w;
        if (verdict.pass) {
            acc.passed += w;
        }
        return acc;
    }, { passed: 0, all: 0 });
    if (tally.all > 0) {
        return tally.passed / tally.all;
    }
    return 0;
}
933
+ // ══════════════════════════════════════════════════════════════════════════════
934
+ // PERSISTENCE
935
+ // ══════════════════════════════════════════════════════════════════════════════
936
/**
 * Inserts (or replaces) the row for one eval run in `llm_eval_runs`.
 *
 * @param {string} runId - Primary key of the run.
 * @param {number} queryCount - Number of queries evaluated.
 * @param {number} passRate - Overall pass rate for the run.
 * @param {string} [persona] - Persona filter used, stored as NULL when absent.
 * @param {string} [scenario] - Scenario filter used, stored as NULL when absent.
 * @param {object} [summary] - Summary object, stored as JSON (NULL when falsy).
 */
function saveRun(runId, queryCount, passRate, persona, scenario, summary) {
    const insertRun = getDb().prepare(`
    INSERT OR REPLACE INTO llm_eval_runs (run_id, query_count, pass_rate, persona, scenario, summary_json)
    VALUES (?, ?, ?, ?, ?, ?)
  `);
    const personaValue = persona ?? null;
    const scenarioValue = scenario ?? null;
    const summaryJson = summary ? JSON.stringify(summary) : null;
    insertRun.run(runId, queryCount, passRate, personaValue, scenarioValue, summaryJson);
}
943
/**
 * Persists one per-query result row in `llm_eval_results` under a freshly
 * generated id.
 *
 * @param {string} runId - Run the result belongs to.
 * @param {object} result - Query result; `pass` is stored as 1/0 and
 *   `criteriaResults` as a JSON string.
 */
function saveResult(runId, result) {
    const insertResult = getDb().prepare(`
    INSERT OR REPLACE INTO llm_eval_results (id, run_id, query_id, pass, criteria_json, tools_precision, tools_recall, forbidden_violations, criteria_pass_rate, judge_response, ms)
    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
  `);
    const rowValues = [
        genId("llmeval"),
        runId,
        result.queryId,
        result.pass ? 1 : 0,
        JSON.stringify(result.criteriaResults),
        result.toolPrecision,
        result.toolRecall,
        result.forbiddenViolations,
        result.criteriaPassRate,
        result.judgeResponse,
        result.ms,
    ];
    insertResult.run(...rowValues);
}
950
/**
 * Reads every stored result row of a run and converts it to the in-memory
 * result shape.
 *
 * `toolsFired` is always returned as an empty array because the query below
 * does not select any such column.
 *
 * @param {string} runId - Run whose rows should be loaded.
 * @returns {object[]} One result object per stored row.
 */
function loadRunResults(runId) {
    const selectResults = getDb().prepare(`
    SELECT query_id, pass, criteria_json, tools_precision, tools_recall, forbidden_violations, criteria_pass_rate, judge_response, ms
    FROM llm_eval_results
    WHERE run_id = ?
  `);
    const toResult = (row) => {
        // A NULL/empty criteria_json is read back as an empty criteria list.
        const criteriaResults = JSON.parse(row.criteria_json || "[]");
        return {
            queryId: row.query_id,
            pass: row.pass === 1,
            criteriaResults,
            toolsFired: [],
            toolPrecision: row.tools_precision,
            toolRecall: row.tools_recall,
            forbiddenViolations: row.forbidden_violations,
            criteriaPassRate: row.criteria_pass_rate,
            judgeResponse: row.judge_response,
            ms: row.ms,
        };
    };
    return selectResults.all(runId).map((row) => toResult(row));
}
970
+ // ══════════════════════════════════════════════════════════════════════════════
971
+ // REGRESSION DETECTION
972
+ // ══════════════════════════════════════════════════════════════════════════════
973
/**
 * Lists queries that passed in the baseline run but fail in the current run.
 *
 * For each such query, one entry is emitted per criterion that passed in the
 * baseline and fails now (criteria are compared by position). When no single
 * criterion can be blamed, one "(overall)" entry is emitted instead.
 *
 * Queries missing from the baseline are ignored.
 *
 * @param {string} currentRunId - Run under inspection.
 * @param {string} baselineRunId - Run to compare against.
 * @returns {{queryId: string, criterion: string, baselinePass: boolean, currentPass: boolean}[]}
 */
export function detectRegressions(currentRunId, baselineRunId) {
    const current = loadRunResults(currentRunId);
    const baseline = loadRunResults(baselineRunId);
    const baselineMap = new Map();
    for (const r of baseline) {
        baselineMap.set(r.queryId, r);
    }
    const regressions = [];
    for (const cur of current) {
        const base = baselineMap.get(cur.queryId);
        if (!base || !base.pass || cur.pass) {
            continue;
        }
        // Track per query whether a criterion was blamed, instead of
        // re-scanning the whole regressions array for every failing query.
        let criterionFlagged = false;
        for (let i = 0; i < cur.criteriaResults.length; i++) {
            const baseCrit = base.criteriaResults[i];
            const curCrit = cur.criteriaResults[i];
            if (baseCrit?.pass && !curCrit?.pass) {
                regressions.push({
                    queryId: cur.queryId,
                    // curCrit may be null/undefined in stored JSON; fall back
                    // to the baseline's text rather than throwing.
                    criterion: curCrit?.criterion ?? baseCrit.criterion,
                    baselinePass: true,
                    currentPass: false,
                });
                criterionFlagged = true;
            }
        }
        if (!criterionFlagged) {
            regressions.push({
                queryId: cur.queryId,
                criterion: "(overall)",
                baselinePass: true,
                currentPass: false,
            });
        }
    }
    return regressions;
}
1011
/**
 * Lists queries that failed in the baseline run and pass in the current run.
 * Each improvement is reported once with the criterion "(overall)"; queries
 * absent from the baseline are ignored.
 *
 * @param {string} currentRunId - Run under inspection.
 * @param {string} baselineRunId - Run to compare against.
 * @returns {{queryId: string, criterion: string, baselinePass: boolean, currentPass: boolean}[]}
 */
export function detectImprovements(currentRunId, baselineRunId) {
    const currentResults = loadRunResults(currentRunId);
    const baselineResults = loadRunResults(baselineRunId);
    const baselineById = new Map(baselineResults.map((row) => [row.queryId, row]));
    const flippedToPass = currentResults.filter((cur) => {
        const prior = baselineById.get(cur.queryId);
        if (!prior) {
            return false;
        }
        return !prior.pass && cur.pass;
    });
    return flippedToPass.map((cur) => ({
        queryId: cur.queryId,
        criterion: "(overall)",
        baselinePass: false,
        currentPass: true,
    }));
}
1033
/**
 * Compares per-scenario pass rates between two runs and returns a
 * human-readable flag for every scenario whose pass rate fell by more than
 * 0.05 (five percentage points) relative to the baseline.
 *
 * Results whose query id is not in the generated corpus are left out of the
 * rates, and scenarios with no baseline results are not flagged.
 *
 * @param {string} currentRunId - Run under inspection.
 * @param {string} baselineRunId - Run to compare against.
 * @returns {string[]} One "REGRESSION: ..." message per flagged scenario.
 */
function checkScenarioRegressions(currentRunId, baselineRunId) {
    const currentResults = loadRunResults(currentRunId);
    const baselineResults = loadRunResults(baselineRunId);
    const scenarioOf = new Map(generateQueryCorpus().map((q) => [q.id, q.scenario]));
    // Tally pass/total per scenario for one result set.
    const tallyByScenario = (rows) => {
        const tallies = {};
        for (const row of rows) {
            if (!scenarioOf.has(row.queryId)) {
                continue;
            }
            const name = scenarioOf.get(row.queryId);
            if (!tallies[name]) {
                tallies[name] = { pass: 0, total: 0 };
            }
            const bucket = tallies[name];
            bucket.total++;
            if (row.pass) {
                bucket.pass++;
            }
        }
        return tallies;
    };
    const nowTallies = tallyByScenario(currentResults);
    const thenTallies = tallyByScenario(baselineResults);
    const flags = [];
    for (const scenario of Object.keys(nowTallies)) {
        const before = thenTallies[scenario];
        if (!before || before.total === 0) {
            continue;
        }
        const after = nowTallies[scenario];
        const curPct = after.pass / after.total;
        const basePct = before.pass / before.total;
        const dropped = basePct - curPct > 0.05;
        if (dropped) {
            flags.push(`REGRESSION: ${scenario} dropped from ${(basePct * 100).toFixed(1)}% to ${(curPct * 100).toFixed(1)}% (>${(5).toFixed(0)}% threshold)`);
        }
    }
    return flags;
}
1067
+ // ══════════════════════════════════════════════════════════════════════════════
1068
+ // REPORT FORMATTER
1069
+ // ══════════════════════════════════════════════════════════════════════════════
1070
/**
 * Aggregates per-query results into a run summary.
 *
 * Results whose query id is not in `corpus` contribute nothing to the totals
 * or breakdowns, yet the averages are still divided by `results.length`
 * (or 1 when there are no results).
 *
 * @param {string} runId - Identifier copied into the summary.
 * @param {object[]} results - Per-query results of the run.
 * @param {object[]} corpus - Queries used to look up persona/scenario by id.
 * @returns {object} Summary with overall rates, tool-metric averages and
 *   pass/total/rate breakdowns by persona, scenario and criterion.
 */
function buildSummary(runId, results, corpus) {
    const queryById = new Map(corpus.map((q) => [q.id, q]));
    const byPersona = {};
    const byScenario = {};
    const byCriterion = {};
    // Record one observation in a breakdown table, creating the bucket on demand.
    const record = (table, key, passed) => {
        if (!table[key]) {
            table[key] = { pass: 0, total: 0, rate: 0 };
        }
        table[key].total++;
        if (passed) {
            table[key].pass++;
        }
    };
    const sums = { pass: 0, precision: 0, recall: 0, forbidden: 0, criteria: 0 };
    for (const res of results) {
        const query = queryById.get(res.queryId);
        if (!query) {
            continue;
        }
        if (res.pass) {
            sums.pass++;
        }
        sums.precision += res.toolPrecision;
        sums.recall += res.toolRecall;
        sums.forbidden += res.forbiddenViolations;
        sums.criteria += res.criteriaPassRate;
        record(byPersona, query.persona, res.pass);
        record(byScenario, query.scenario, res.pass);
        for (const verdict of res.criteriaResults) {
            record(byCriterion, verdict.criterion, verdict.pass);
        }
    }
    // Fill in the rate of every bucket once all counts are final.
    for (const table of [byPersona, byScenario, byCriterion]) {
        for (const bucket of Object.values(table)) {
            bucket.rate = bucket.total > 0 ? bucket.pass / bucket.total : 0;
        }
    }
    const divisor = results.length || 1;
    return {
        runId,
        timestamp: new Date().toISOString(),
        queryCount: results.length,
        passRate: sums.pass / divisor,
        avgToolPrecision: sums.precision / divisor,
        avgToolRecall: sums.recall / divisor,
        totalForbiddenViolations: sums.forbidden,
        avgCriteriaPassRate: sums.criteria / divisor,
        byPersona,
        byScenario,
        byCriterion,
    };
}
1133
/**
 * Writes the run report to stdout: headline numbers, pass rates by persona
 * and scenario (best first), the 20 most frequent criteria, tool metrics and,
 * when supplied, regressions, improvements and scenario flags.
 *
 * @param {object} summary - Summary as produced for the run.
 * @param {object[]} [regressions] - At most the first 20 are printed.
 * @param {object[]} [improvements] - At most the first 10 are printed.
 * @param {string[]} [scenarioFlags] - Printed verbatim, one per line.
 */
function printReport(summary, regressions, improvements, scenarioFlags) {
    const pct = (n) => `${(n * 100).toFixed(1)}%`;
    const counts = (stats) => `(${stats.pass}/${stats.total})`;
    const byRateDesc = (a, b) => b[1].rate - a[1].rate;
    const judgeLabel = process.env.GEMINI_API_KEY ? GEMINI_MODEL : "Heuristic (no GEMINI_API_KEY)";
    console.log(`\nLLM JUDGE EVAL — Run ${summary.runId}`);
    console.log("=".repeat(50));
    console.log(`Queries: ${summary.queryCount} / 500`);
    console.log(`Overall Pass Rate: ${pct(summary.passRate)}`);
    console.log(`Judge: ${judgeLabel}`);
    console.log(`\nBY PERSONA:`);
    Object.entries(summary.byPersona).sort(byRateDesc).forEach(([persona, stats]) => {
        console.log(` ${persona.padEnd(14)} ${pct(stats.rate).padStart(6)} ${counts(stats)}`);
    });
    console.log(`\nBY SCENARIO:`);
    Object.entries(summary.byScenario).sort(byRateDesc).forEach(([scenario, stats]) => {
        console.log(` ${scenario.padEnd(20)} ${pct(stats.rate).padStart(6)} ${counts(stats)}`);
    });
    console.log(`\nBOOLEAN CRITERIA (top 20 by volume):`);
    const busiestCriteria = Object.entries(summary.byCriterion)
        .sort((a, b) => b[1].total - a[1].total)
        .slice(0, 20);
    busiestCriteria.forEach(([criterion, stats]) => {
        // Long criterion texts are truncated to keep the columns aligned.
        let label = criterion;
        if (criterion.length > 50) {
            label = criterion.slice(0, 47) + "...";
        }
        console.log(` ${label.padEnd(52)} ${pct(stats.rate).padStart(6)} ${counts(stats)}`);
    });
    console.log(`\nTOOL METRICS:`);
    console.log(` Avg precision: ${summary.avgToolPrecision.toFixed(3)}`);
    console.log(` Avg recall: ${summary.avgToolRecall.toFixed(3)}`);
    console.log(` Forbidden violations: ${summary.totalForbiddenViolations}`);
    console.log(` Avg criteria pass rate: ${pct(summary.avgCriteriaPassRate)}`);
    if (regressions && regressions.length > 0) {
        console.log(`\nREGRESSIONS vs baseline:`);
        regressions.slice(0, 20).forEach((r) => {
            console.log(` ${r.queryId}: PASS -> FAIL (criterion: "${r.criterion}")`);
        });
        const hidden = regressions.length - 20;
        if (hidden > 0) {
            console.log(` ... and ${hidden} more`);
        }
    }
    if (improvements && improvements.length > 0) {
        console.log(`\nIMPROVEMENTS vs baseline:`);
        improvements.slice(0, 10).forEach((r) => {
            console.log(` ${r.queryId}: FAIL -> PASS`);
        });
        const hidden = improvements.length - 10;
        if (hidden > 0) {
            console.log(` ... and ${hidden} more`);
        }
    }
    if (scenarioFlags && scenarioFlags.length > 0) {
        console.log(`\nSCENARIO FLAGS:`);
        scenarioFlags.forEach((flag) => {
            console.log(` ${flag}`);
        });
    }
    console.log("");
}
1187
/**
 * Runs one full eval pass: builds the (optionally filtered and sampled)
 * corpus, executes and judges every query, stores each result and the run
 * summary, optionally compares against a baseline run, prints the report and
 * returns the summary.
 *
 * With `options.dryRun` set, only the corpus composition is printed and a
 * zeroed summary with run id "dry-run" is returned; no tools are loaded.
 *
 * @param {object} options
 * @param {number} options.queryLimit - Sample the corpus down to this many queries when it is larger.
 * @param {string} [options.persona] - Keep only queries of this persona.
 * @param {string} [options.scenario] - Keep only queries of this scenario.
 * @param {string} [options.baselineRunId] - Run to diff against.
 * @param {boolean} [options.dryRun] - Print corpus statistics only.
 * @returns {Promise<object>} The run summary.
 */
export async function runLlmJudgeEval(options) {
    // 1. Wire up DB
    _setDbAccessor(getDb);
    ensureSchema();
    // 2. Generate corpus and apply the optional persona/scenario filters
    const { persona, scenario, queryLimit, baselineRunId } = options;
    let selected = generateQueryCorpus();
    if (persona) {
        selected = selected.filter((q) => q.persona === persona);
    }
    if (scenario) {
        selected = selected.filter((q) => q.scenario === scenario);
    }
    // 3. Sample if needed — ordering by a hash of the query id keeps the
    //    sample reproducible between runs.
    if (selected.length > queryLimit) {
        const keyed = selected.map((entry) => ({ entry, key: hashCode(entry.id) }));
        keyed.sort((left, right) => left.key - right.key);
        selected = keyed.slice(0, queryLimit).map(({ entry }) => entry);
    }
    if (options.dryRun) {
        console.log(`[DRY RUN] Corpus: ${selected.length} queries`);
        const countBy = (field) => {
            const counts = {};
            for (const q of selected) {
                counts[q[field]] = (counts[q[field]] || 0) + 1;
            }
            return counts;
        };
        console.log(" By persona:", countBy("persona"));
        console.log(" By scenario:", countBy("scenario"));
        return {
            runId: "dry-run",
            timestamp: new Date().toISOString(),
            queryCount: selected.length,
            passRate: 0,
            avgToolPrecision: 0,
            avgToolRecall: 0,
            totalForbiddenViolations: 0,
            avgCriteriaPassRate: 0,
            byPersona: {},
            byScenario: {},
            byCriterion: {},
        };
    }
    // 4. Load all tools
    console.log("[llmJudgeEval] Loading all toolsets...");
    const allTools = await loadToolsets(ALL_DOMAIN_KEYS);
    console.log(`[llmJudgeEval] Loaded ${allTools.length} tools across ${ALL_DOMAIN_KEYS.length} domains`);
    // 5. Run eval — queries are processed strictly one after another
    const runId = genId("ljeval");
    const results = [];
    console.log(`[llmJudgeEval] Running ${selected.length} queries (run: ${runId})...\n`);
    for (const [index, query] of selected.entries()) {
        const progress = `[${index + 1}/${selected.length}]`;
        const execution = await executeQueryTools(query, allTools);
        const { response: judgeResult, judgeType } = await callGeminiJudge(query, execution.outputs);
        const toolPrecision = computeToolPrecision(query.expectedTools, execution.toolsFired);
        const toolRecall = computeToolRecall(query.expectedTools, execution.toolsFired);
        const forbiddenViolations = countForbiddenViolations(query.forbiddenTools, execution.toolsFired);
        const criteriaPassRate = computeCriteriaPassRate(judgeResult.criteria, query.booleanCriteria);
        // Any forbidden tool call fails the query regardless of the judge.
        const overallPass = judgeResult.overallPass && forbiddenViolations === 0;
        const queryResult = {
            queryId: query.id,
            pass: overallPass,
            criteriaResults: judgeResult.criteria,
            toolsFired: execution.toolsFired,
            toolPrecision,
            toolRecall,
            forbiddenViolations,
            criteriaPassRate,
            judgeResponse: JSON.stringify(judgeResult),
            ms: execution.totalMs,
        };
        results.push(queryResult);
        saveResult(runId, queryResult);
        const status = overallPass ? "PASS" : "FAIL";
        process.stdout.write(`${progress} [judge:${judgeType}] ${query.id} ${status} (precision=${toolPrecision.toFixed(2)}, criteria=${criteriaPassRate.toFixed(2)}) ${execution.totalMs}ms\n`);
    }
    // 6. Build summary against the unfiltered corpus and persist the run
    const summary = buildSummary(runId, results, generateQueryCorpus());
    saveRun(runId, results.length, summary.passRate, persona, scenario, summary);
    // 7. Regression detection (only when a baseline was requested)
    let regressions;
    let improvements;
    let scenarioFlags;
    if (baselineRunId) {
        regressions = detectRegressions(runId, baselineRunId);
        improvements = detectImprovements(runId, baselineRunId);
        scenarioFlags = checkScenarioRegressions(runId, baselineRunId);
    }
    // 8. Print report
    printReport(summary, regressions, improvements, scenarioFlags);
    return summary;
}
1287
/**
 * Deterministic 32-bit string hash (h = 31 * h + charCode, wrapped to a
 * signed 32-bit integer after every UTF-16 code unit). Used to get a
 * reproducible ordering when sampling.
 *
 * @param {string} s - Text to hash.
 * @returns {number} Signed 32-bit integer; 0 for the empty string.
 */
function hashCode(s) {
    let acc = 0;
    let pos = 0;
    while (pos < s.length) {
        // Math.imul keeps the multiplication in 32-bit space; `| 0` wraps the sum.
        acc = (Math.imul(31, acc) + s.charCodeAt(pos)) | 0;
        pos += 1;
    }
    return acc;
}
1297
/**
 * Classifies every failed result of a run into exactly one root cause.
 *
 * Causes are tested in this order and the first match wins:
 *   1. tool_not_found        — an expected tool is not among the loaded tools
 *   2. tool_error            — judge evidence mentions "ERROR:" or "error pattern"
 *   3. heuristic_too_strict  — tool precision >= 0.8 but criteria pass rate < 0.3
 *   4. empty_output          — tool precision and criteria pass rate are both 0
 *   5. criteria_mismatch     — everything else
 *
 * Failures whose query id is not in the generated corpus are counted in
 * `totalFails` but not classified.
 *
 * @param {string} runId - Run whose failures should be diagnosed.
 * @returns {Promise<{runId: string, totalFails: number, byCause: object, topSuggestions: string[]}>}
 */
export async function diagnoseFailures(runId) {
    _setDbAccessor(getDb);
    ensureSchema();
    const results = loadRunResults(runId);
    const corpusById = new Map(generateQueryCorpus().map((q) => [q.id, q]));
    // Load all tools to check existence
    const loadedTools = await loadToolsets(ALL_DOMAIN_KEYS);
    const knownToolNames = new Set(loadedTools.map((t) => t.name));
    const byCause = {
        tool_not_found: [],
        tool_error: [],
        empty_output: [],
        criteria_mismatch: [],
        heuristic_too_strict: [],
    };
    const file = (queryId, rootCause, detail, suggestedFix) => {
        byCause[rootCause].push({ queryId, rootCause, detail, suggestedFix });
    };
    const mentionsError = (c) => c.evidence?.includes("ERROR:") || c.evidence?.includes("error pattern");
    const fails = results.filter((r) => !r.pass);
    for (const failure of fails) {
        const query = corpusById.get(failure.queryId);
        if (!query) {
            continue;
        }
        const { queryId } = failure;
        // 1. tool_not_found
        const missingTools = query.expectedTools.filter((t) => !knownToolNames.has(t));
        if (missingTools.length > 0) {
            const names = missingTools.join(", ");
            file(queryId, "tool_not_found", `Missing tools: ${names}`, `Add tool(s) ${names} to the toolset or update expectedTools in the corpus`);
            continue;
        }
        // The stored judge response is parsed best-effort; if it is not valid
        // JSON the evidence-based checks below simply see no criteria.
        let judgeData = null;
        try {
            judgeData = JSON.parse(failure.judgeResponse);
        }
        catch { /* ignore */ }
        // 2. tool_error
        const errorEvidence = judgeData?.criteria?.find(mentionsError);
        if (errorEvidence) {
            file(queryId, "tool_error", `Tool error: ${errorEvidence.evidence.slice(0, 200)}`, `Fix tool handler — error in criterion "${errorEvidence.criterion}"`);
            continue;
        }
        const failedNames = (judgeData?.criteria?.filter((c) => !c.pass) ?? []).map((c) => c.criterion);
        // 3. heuristic_too_strict: precision is good but criteria failed
        if (failure.toolPrecision >= 0.8 && failure.criteriaPassRate < 0.3) {
            file(queryId, "heuristic_too_strict", `precision=${failure.toolPrecision.toFixed(2)} but criteria=${failure.criteriaPassRate.toFixed(2)}. Failed: ${failedNames.join("; ")}`, `Loosen heuristic pattern for: ${failedNames.slice(0, 3).join("; ")}`);
            continue;
        }
        // 4. empty_output
        if (failure.criteriaPassRate === 0 && failure.toolPrecision === 0) {
            file(queryId, "empty_output", `No tools produced output (precision=0, criteria=0)`, `Tool(s) ${query.expectedTools.join(", ")} need seed data or initialization`);
            continue;
        }
        // 5. criteria_mismatch — tool worked but criteria failed
        file(queryId, "criteria_mismatch", `Tools fired OK but criteria failed: ${failedNames.map((name) => `"${name}"`).join(", ")}`, `Adjust criterion to match actual output format: ${failedNames.slice(0, 2).join("; ")}`);
    }
    // One headline suggestion per non-empty cause, most frequent cause first.
    const topSuggestions = Object.entries(byCause)
        .filter(([, entries]) => entries.length > 0)
        .sort((a, b) => b[1].length - a[1].length)
        .map(([cause, entries]) => `[${cause}] ${entries.length} failures — ${entries[0].suggestedFix}`);
    return {
        runId,
        totalFails: fails.length,
        byCause,
        topSuggestions,
    };
}
1391
/**
 * Generates variant queries from a diagnosis report to cover gaps.
 *
 * - For each `criteria_mismatch` failure, two variants are created: a
 *   rephrased query that keeps the original criteria, and a "Summarize
 *   results for" query with the simplified criteria. Generation for this
 *   cause stops once at least 20 queries exist.
 * - For each `heuristic_too_strict` failure, one variant with the simplified
 *   criteria is created, until 30 queries exist in total.
 *
 * Changes from the previous version:
 * - The unused `coveredToolCombos` set is gone. Building it called `.sort()`
 *   on every corpus query's `expectedTools`, reordering those arrays in place
 *   as a side effect; variants now copy the tools in their original order.
 * - Generated ids never collide with existing corpus ids in either loop
 *   (previously only the first loop checked, and it dropped the variant
 *   instead of picking another id).
 *
 * @param {{byCause: {criteria_mismatch: {queryId: string}[], heuristic_too_strict: {queryId: string}[]}}} diagnosis
 * @returns {object[]} Newly generated query objects with ids `grown_NNN`.
 */
export function growCorpus(diagnosis) {
    const existingCorpus = generateQueryCorpus();
    const existingIds = new Set(existingCorpus.map((q) => q.id));
    const queryMap = new Map(existingCorpus.map((q) => [q.id, q]));
    const newQueries = [];
    let seqId = 0;
    // Next sequential id that is not already used by the corpus.
    const makeId = () => {
        let candidate;
        do {
            candidate = `grown_${String(++seqId).padStart(3, "0")}`;
        } while (existingIds.has(candidate));
        return candidate;
    };
    // Criteria simple enough for the heuristic judge; a fresh array per
    // variant so generated queries never share mutable state.
    const simplifiedCriteria = () => [
        { criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 },
        { criterion: "At least one expected tool completed successfully", weight: 2 },
        { criterion: "Output does not contain error stack traces or crash messages", weight: 1 },
    ];
    const addVariant = (original, query, booleanCriteria) => {
        newQueries.push({
            id: makeId(),
            query,
            persona: original.persona,
            scenario: original.scenario,
            expectedTools: [...original.expectedTools],
            forbiddenTools: [...original.forbiddenTools],
            booleanCriteria,
        });
    };
    // 1. For each criteria_mismatch failure, generate variant queries
    for (const entry of diagnosis.byCause.criteria_mismatch) {
        const original = queryMap.get(entry.queryId);
        if (!original) {
            continue;
        }
        // Variant 1: rephrase the query, keep (a copy of) the original criteria
        addVariant(original, `${original.query} — provide details`, original.booleanCriteria.map((bc) => ({ ...bc })));
        // Variant 2: same tools, different angle, simplified criteria
        addVariant(original, `Summarize results for: ${original.query}`, simplifiedCriteria());
        // Cap growth per round
        if (newQueries.length >= 20) {
            break;
        }
    }
    // 2. For heuristic_too_strict failures, generate simplified-criteria variants
    for (const entry of diagnosis.byCause.heuristic_too_strict) {
        if (newQueries.length >= 30) {
            break;
        }
        const original = queryMap.get(entry.queryId);
        if (!original) {
            continue;
        }
        addVariant(original, original.query, simplifiedCriteria());
    }
    return newQueries;
}
1468
/**
 * Writes a diagnosis report to stdout: the failure total, then each
 * non-empty root cause (largest first) with its share of failures and up to
 * five example queries, then the top suggestions.
 *
 * @param {{runId: string, totalFails: number, byCause: object, topSuggestions: string[]}} diagnosis
 */
function printDiagnosis(diagnosis) {
    const { runId, totalFails, byCause, topSuggestions } = diagnosis;
    console.log(`\nFAILURE DIAGNOSIS — Run ${runId}`);
    console.log("=".repeat(50));
    console.log(`Total failures: ${totalFails}`);
    const populatedCauses = Object.entries(byCause)
        .filter(([, entries]) => entries.length > 0)
        .sort((a, b) => b[1].length - a[1].length);
    populatedCauses.forEach(([cause, entries]) => {
        const share = totalFails > 0 ? ((entries.length / totalFails) * 100).toFixed(1) : "0";
        console.log(`\n ${cause}: ${entries.length} (${share}%)`);
        entries.slice(0, 5).forEach((e) => {
            console.log(` ${e.queryId}: ${e.detail.slice(0, 100)}`);
        });
        const remaining = entries.length - 5;
        if (remaining > 0) {
            console.log(` ... and ${remaining} more`);
        }
    });
    if (topSuggestions.length > 0) {
        console.log(`\nTOP SUGGESTIONS:`);
        topSuggestions.forEach((s) => {
            console.log(` → ${s}`);
        });
    }
    console.log("");
}
1494
/**
 * Runs the flywheel loop: evaluate, diagnose the failures, generate variant
 * queries, re-run the eval with the first run as baseline and print how the
 * pass rate moved.
 *
 * NOTE(review): the generated variant queries are only used for their count
 * (it is added to `queryLimit` for the rerun); they are not handed to the
 * rerun themselves. Whether the rerun actually evaluates them depends on
 * code outside this function — confirm.
 *
 * @param {object} options - Eval options, forwarded to both eval runs.
 * @returns {Promise<void>}
 */
async function runFlywheel(options) {
    const showPct = (ratio) => (ratio * 100).toFixed(1);
    console.log("\n🔄 FLYWHEEL MODE — self-improving eval loop");
    console.log("=".repeat(50));
    // Step 1: Run initial eval
    console.log("\n[flywheel] Step 1: Running initial eval...");
    const initialSummary = await runLlmJudgeEval(options);
    const initialPassRate = initialSummary.passRate;
    // Step 2: Diagnose failures
    console.log("[flywheel] Step 2: Diagnosing failures...");
    const diagnosis = await diagnoseFailures(initialSummary.runId);
    printDiagnosis(diagnosis);
    // Step 3: Warn when more than 20% of failures look like an overly strict heuristic
    const strictCount = diagnosis.byCause.heuristic_too_strict.length;
    const strictShare = diagnosis.totalFails > 0 ? strictCount / diagnosis.totalFails : 0;
    if (strictShare > 0.2) {
        console.log(`[flywheel] WARNING: ${showPct(strictShare)}% of failures are heuristic_too_strict — heuristic patterns need further loosening`);
    }
    // Step 4: Grow corpus
    console.log("[flywheel] Step 4: Growing corpus with variant queries...");
    const newQueries = growCorpus(diagnosis);
    console.log(`[flywheel] Generated ${newQueries.length} new variant queries`);
    if (newQueries.length === 0) {
        console.log("[flywheel] No new queries generated — nothing to re-run");
        console.log(`\nFLYWHEEL RESULT: Pass rate ${showPct(initialPassRate)}% (no improvement path found)`);
        return;
    }
    // Step 5: Re-run eval with a larger query limit and the first run as baseline
    console.log("[flywheel] Step 5: Re-running eval with grown corpus...");
    const rerunOptions = {
        ...options,
        queryLimit: options.queryLimit + newQueries.length,
        baselineRunId: initialSummary.runId,
    };
    const rerunSummary = await runLlmJudgeEval(rerunOptions);
    // Step 6: Compare pass rates
    const delta = rerunSummary.passRate - initialPassRate;
    const deltaSign = delta >= 0 ? "+" : "";
    let verdict;
    if (delta > 0) {
        verdict = ` Verdict: IMPROVED`;
    }
    else if (delta === 0) {
        verdict = ` Verdict: NO CHANGE`;
    }
    else {
        verdict = ` Verdict: REGRESSED (investigate new queries)`;
    }
    console.log(`\nFLYWHEEL RESULT`);
    console.log("=".repeat(50));
    console.log(` Initial pass rate: ${showPct(initialPassRate)}%`);
    console.log(` Rerun pass rate: ${showPct(rerunSummary.passRate)}%`);
    console.log(` Delta: ${deltaSign}${showPct(delta)}%`);
    console.log(` Corpus grew: ${options.queryLimit} → ${rerunOptions.queryLimit} queries`);
    console.log(` Baseline run: ${initialSummary.runId}`);
    console.log(` Rerun: ${rerunSummary.runId}`);
    console.log(verdict);
    console.log("");
}
1551
+ // ══════════════════════════════════════════════════════════════════════════════
1552
+ // CLI
1553
+ // ══════════════════════════════════════════════════════════════════════════════
1554
+ function parseArgs(argv) {
1555
+ const options = { queryLimit: 50 };
1556
+ for (let i = 0; i < argv.length; i++) {
1557
+ const arg = argv[i];
1558
+ switch (arg) {
1559
+ case "--queries":
1560
+ options.queryLimit = parseInt(argv[++i], 10) || 50;
1561
+ break;
1562
+ case "--persona":
1563
+ options.persona = argv[++i];
1564
+ break;
1565
+ case "--scenario":
1566
+ options.scenario = argv[++i];
1567
+ break;
1568
+ case "--baseline":
1569
+ options.baselineRunId = argv[++i];
1570
+ break;
1571
+ case "--dry-run":
1572
+ options.dryRun = true;
1573
+ break;
1574
+ case "--flywheel":
1575
+ options.flywheel = true;
1576
+ break;
1577
+ default:
1578
+ if (arg.startsWith("--")) {
1579
+ console.error(`Unknown flag: ${arg}`);
1580
+ }
1581
+ }
1582
+ }
1583
+ return options;
1584
+ }
1585
+ async function main() {
1586
+ // Try loading from .env.local if GEMINI_API_KEY not in environment
1587
+ if (!process.env.GEMINI_API_KEY) {
1588
+ try {
1589
+ const fs = await import("fs");
1590
+ const path = await import("path");
1591
+ // Search multiple locations for .env.local
1592
+ const candidates = [
1593
+ path.resolve(process.cwd(), ".env.local"),
1594
+ path.resolve(process.cwd(), "../../.env.local"),
1595
+ path.resolve(process.cwd(), "../.env.local"),
1596
+ ];
1597
+ for (const envPath of candidates) {
1598
+ if (fs.existsSync(envPath)) {
1599
+ const content = fs.readFileSync(envPath, "utf-8");
1600
+ for (const line of content.split("\n")) {
1601
+ const match = line.match(/^([^#=]+)=(.*)$/);
1602
+ if (match)
1603
+ process.env[match[1].trim()] = match[2].trim();
1604
+ }
1605
+ if (process.env.GEMINI_API_KEY) {
1606
+ console.log(`[env] Loaded GEMINI_API_KEY from ${envPath}`);
1607
+ break;
1608
+ }
1609
+ }
1610
+ }
1611
+ }
1612
+ catch { /* ignore env loading errors */ }
1613
+ }
1614
+ const options = parseArgs(process.argv.slice(2));
1615
+ console.log("NodeBench LLM Judge Eval Harness");
1616
+ console.log("================================");
1617
+ console.log(` Queries: ${options.queryLimit}`);
1618
+ console.log(` Persona: ${options.persona ?? "all"}`);
1619
+ console.log(` Scenario: ${options.scenario ?? "all"}`);
1620
+ console.log(` Baseline: ${options.baselineRunId ?? "none"}`);
1621
+ console.log(` Judge: ${process.env.GEMINI_API_KEY ? GEMINI_MODEL : "Heuristic fallback"}`);
1622
+ console.log(` Flywheel: ${options.flywheel ? "ON" : "off"}`);
1623
+ console.log("");
1624
+ try {
1625
+ if (options.flywheel) {
1626
+ await runFlywheel(options);
1627
+ process.exit(0);
1628
+ }
1629
+ const summary = await runLlmJudgeEval(options);
1630
+ if (options.dryRun)
1631
+ process.exit(0);
1632
+ process.exit(summary.passRate >= 0.5 ? 0 : 1);
1633
+ }
1634
+ catch (err) {
1635
+ console.error(`Fatal error: ${err.message}`);
1636
+ process.exit(2);
1637
+ }
1638
+ }
1639
+ // Run if invoked directly
1640
+ const isDirectRun = process.argv[1]?.includes("llmJudgeEval");
1641
+ if (isDirectRun) {
1642
+ main();
1643
+ }
1644
+ //# sourceMappingURL=llmJudgeEval.js.map