nodebench-mcp 2.59.0 → 2.61.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/benchmarks/llmJudgeEval.js +206 -86
- package/dist/benchmarks/llmJudgeEval.js.map +1 -1
- package/dist/benchmarks/searchQualityEval.d.ts +26 -21
- package/dist/benchmarks/searchQualityEval.js +324 -111
- package/dist/benchmarks/searchQualityEval.js.map +1 -1
- package/dist/dashboard/operatingDashboardHtml.d.ts +1 -0
- package/dist/dashboard/operatingDashboardHtml.js +274 -0
- package/dist/dashboard/operatingDashboardHtml.js.map +1 -1
- package/dist/dashboard/operatingServer.js +24 -0
- package/dist/dashboard/operatingServer.js.map +1 -1
- package/dist/tools/founderLocalPipeline.js +9 -4
- package/dist/tools/founderLocalPipeline.js.map +1 -1
- package/dist/tools/progressiveDiscoveryTools.js +1 -1
- package/dist/tools/progressiveDiscoveryTools.js.map +1 -1
- package/dist/tools/toolRegistry.js +100 -0
- package/dist/tools/toolRegistry.js.map +1 -1
- package/package.json +1 -1
|
@@ -65,98 +65,98 @@ const SCENARIOS = [
|
|
|
65
65
|
function founderTemplates() {
|
|
66
66
|
return [
|
|
67
67
|
// weekly_reset
|
|
68
|
-
{ query: "What changed in our product direction this week?", scenario: "weekly_reset", expectedTools: ["founder_deep_context_gather", "get_weekly_summary"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "
|
|
69
|
-
{ query: "Give me a weekly reset briefing for the founding team", scenario: "weekly_reset", expectedTools: ["founder_local_weekly_reset", "founder_deep_context_gather"], forbiddenTools: ["check_contract_compliance"], booleanCriteria: [{ criterion: "
|
|
70
|
-
{ query: "Summarize last week's key decisions and their rationale", scenario: "weekly_reset", expectedTools: ["founder_deep_context_gather", "get_weekly_summary"], forbiddenTools: ["generate_zero_draft"], booleanCriteria: [{ criterion: "
|
|
71
|
-
{ query: "What are the top 3 risks to our current sprint?", scenario: "weekly_reset", expectedTools: ["founder_deep_context_gather", "get_proactive_alerts"], forbiddenTools: ["run_recon", "check_page_performance"], booleanCriteria: [{ criterion: "
|
|
72
|
-
{ query: "How is our burn rate tracking against the runway?", scenario: "weekly_reset", expectedTools: ["founder_deep_context_gather"], forbiddenTools: ["run_recon", "check_email_setup"], booleanCriteria: [{ criterion: "
|
|
73
|
-
{ query: "What did we ship this week and what slipped?", scenario: "weekly_reset", expectedTools: ["founder_deep_context_gather", "get_weekly_summary"], forbiddenTools: ["generate_report"], booleanCriteria: [{ criterion: "
|
|
68
|
+
{ query: "What changed in our product direction this week?", scenario: "weekly_reset", expectedTools: ["founder_deep_context_gather", "get_weekly_summary"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
|
|
69
|
+
{ query: "Give me a weekly reset briefing for the founding team", scenario: "weekly_reset", expectedTools: ["founder_local_weekly_reset", "founder_deep_context_gather"], forbiddenTools: ["check_contract_compliance"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
|
|
70
|
+
{ query: "Summarize last week's key decisions and their rationale", scenario: "weekly_reset", expectedTools: ["founder_deep_context_gather", "get_weekly_summary"], forbiddenTools: ["generate_zero_draft"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
|
|
71
|
+
{ query: "What are the top 3 risks to our current sprint?", scenario: "weekly_reset", expectedTools: ["founder_deep_context_gather", "get_proactive_alerts"], forbiddenTools: ["run_recon", "check_page_performance"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output includes quantitative data points or metrics", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
|
|
72
|
+
{ query: "How is our burn rate tracking against the runway?", scenario: "weekly_reset", expectedTools: ["founder_deep_context_gather"], forbiddenTools: ["run_recon", "check_email_setup"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output includes quantitative data points or metrics", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
|
|
73
|
+
{ query: "What did we ship this week and what slipped?", scenario: "weekly_reset", expectedTools: ["founder_deep_context_gather", "get_weekly_summary"], forbiddenTools: ["generate_report"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
|
|
74
74
|
// company_search
|
|
75
|
-
{ query: "Research Stripe and tell me about their latest product moves", scenario: "company_search", expectedTools: ["run_recon", "enrich_recon"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "
|
|
76
|
-
{ query: "Pull everything you know about Anthropic's recent funding", scenario: "company_search", expectedTools: ["run_recon", "enrich_recon"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "
|
|
75
|
+
{ query: "Research Stripe and tell me about their latest product moves", scenario: "company_search", expectedTools: ["run_recon", "enrich_recon"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }, { criterion: "Output includes quantitative data points or metrics", weight: 1 }] },
|
|
76
|
+
{ query: "Pull everything you know about Anthropic's recent funding", scenario: "company_search", expectedTools: ["run_recon", "enrich_recon"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
|
|
77
77
|
// competitor_brief
|
|
78
|
-
{ query: "Compare our product positioning against Linear and Notion", scenario: "competitor_brief", expectedTools: ["founder_local_synthesize", "run_recon"], forbiddenTools: ["start_dogfood_session"], booleanCriteria: [{ criterion: "
|
|
79
|
-
{ query: "What are the moats of our top 3 competitors?", scenario: "competitor_brief", expectedTools: ["founder_local_synthesize", "run_recon"], forbiddenTools: ["check_mcp_setup"], booleanCriteria: [{ criterion: "
|
|
78
|
+
{ query: "Compare our product positioning against Linear and Notion", scenario: "competitor_brief", expectedTools: ["founder_local_synthesize", "run_recon"], forbiddenTools: ["start_dogfood_session"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
|
|
79
|
+
{ query: "What are the moats of our top 3 competitors?", scenario: "competitor_brief", expectedTools: ["founder_local_synthesize", "run_recon"], forbiddenTools: ["check_mcp_setup"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
|
|
80
80
|
// delegation
|
|
81
|
-
{ query: "Draft a delegation brief for the engineering lead on the auth refactor", scenario: "delegation", expectedTools: ["founder_deep_context_gather", "export_artifact_packet"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "
|
|
82
|
-
{ query: "Create a handoff packet for the new VP of Product", scenario: "delegation", expectedTools: ["founder_deep_context_gather", "export_artifact_packet"], forbiddenTools: ["check_contract_compliance"], booleanCriteria: [{ criterion: "
|
|
81
|
+
{ query: "Draft a delegation brief for the engineering lead on the auth refactor", scenario: "delegation", expectedTools: ["founder_deep_context_gather", "export_artifact_packet"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
|
|
82
|
+
{ query: "Create a handoff packet for the new VP of Product", scenario: "delegation", expectedTools: ["founder_deep_context_gather", "export_artifact_packet"], forbiddenTools: ["check_contract_compliance"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
|
|
83
83
|
// important_change
|
|
84
|
-
{ query: "Flag any important changes in our competitive landscape this week", scenario: "important_change", expectedTools: ["founder_local_synthesize", "get_important_changes"], forbiddenTools: ["start_verification_cycle"], booleanCriteria: [{ criterion: "
|
|
85
|
-
{ query: "What's the most critical thing I should know about right now?", scenario: "important_change", expectedTools: ["founder_local_synthesize", "get_important_changes"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "
|
|
84
|
+
{ query: "Flag any important changes in our competitive landscape this week", scenario: "important_change", expectedTools: ["founder_local_synthesize", "get_important_changes"], forbiddenTools: ["start_verification_cycle"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
|
|
85
|
+
{ query: "What's the most critical thing I should know about right now?", scenario: "important_change", expectedTools: ["founder_local_synthesize", "get_important_changes"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
|
|
86
86
|
// memo_export
|
|
87
|
-
{ query: "Export our latest decision memo as a shareable packet", scenario: "memo_export", expectedTools: ["export_artifact_packet"], forbiddenTools: ["run_recon", "check_page_performance"], booleanCriteria: [{ criterion: "
|
|
88
|
-
{ query: "Package the Q1 strategy review for the board", scenario: "memo_export", expectedTools: ["export_artifact_packet", "founder_deep_context_gather"], forbiddenTools: ["start_dogfood_session"], booleanCriteria: [{ criterion: "
|
|
87
|
+
{ query: "Export our latest decision memo as a shareable packet", scenario: "memo_export", expectedTools: ["export_artifact_packet"], forbiddenTools: ["run_recon", "check_page_performance"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
|
|
88
|
+
{ query: "Package the Q1 strategy review for the board", scenario: "memo_export", expectedTools: ["export_artifact_packet", "founder_deep_context_gather"], forbiddenTools: ["start_dogfood_session"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
|
|
89
89
|
// packet_diff
|
|
90
|
-
{ query: "What changed between the last two strategy packets?", scenario: "packet_diff", expectedTools: ["founder_packet_diff", "founder_packet_history_diff"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "
|
|
91
|
-
{ query: "Show me the delta between our January and March founder packets", scenario: "packet_diff", expectedTools: ["founder_packet_diff", "founder_packet_history_diff"], forbiddenTools: ["check_email_setup"], booleanCriteria: [{ criterion: "
|
|
90
|
+
{ query: "What changed between the last two strategy packets?", scenario: "packet_diff", expectedTools: ["founder_packet_diff", "founder_packet_history_diff"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Multiple tools in the chain produced non-empty results", weight: 1 }] },
|
|
91
|
+
{ query: "Show me the delta between our January and March founder packets", scenario: "packet_diff", expectedTools: ["founder_packet_diff", "founder_packet_history_diff"], forbiddenTools: ["check_email_setup"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Multiple tools in the chain produced non-empty results", weight: 1 }] },
|
|
92
92
|
// role_switch
|
|
93
|
-
{ query: "Switch to investor mode and evaluate our pitch deck", scenario: "role_switch", expectedTools: ["founder_local_synthesize"], forbiddenTools: [], booleanCriteria: [{ criterion: "
|
|
94
|
-
{ query: "I need to think like a banker — what's the credit risk here?", scenario: "role_switch", expectedTools: ["founder_local_synthesize"], forbiddenTools: [], booleanCriteria: [{ criterion: "
|
|
93
|
+
{ query: "Switch to investor mode and evaluate our pitch deck", scenario: "role_switch", expectedTools: ["founder_local_synthesize"], forbiddenTools: [], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }] },
|
|
94
|
+
{ query: "I need to think like a banker — what's the credit risk here?", scenario: "role_switch", expectedTools: ["founder_local_synthesize"], forbiddenTools: [], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }] },
|
|
95
95
|
];
|
|
96
96
|
}
|
|
97
97
|
function bankerTemplates() {
|
|
98
98
|
return [
|
|
99
|
-
{ query: "Run credit analysis on the portfolio company Acme Corp", scenario: "company_search", expectedTools: ["run_recon", "enrich_recon"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "
|
|
100
|
-
{ query: "What's the debt-to-equity ratio trend for our top borrowers?", scenario: "company_search", expectedTools: ["founder_local_synthesize", "run_recon"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "
|
|
101
|
-
{ query: "Prepare a weekly credit committee briefing", scenario: "weekly_reset", expectedTools: ["get_weekly_summary", "get_proactive_alerts"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "
|
|
102
|
-
{ query: "Flag any covenant breaches in the current portfolio", scenario: "important_change", expectedTools: ["get_important_changes", "flag_important_change"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "
|
|
103
|
-
{ query: "Compare the credit profiles of Company A vs Company B", scenario: "competitor_brief", expectedTools: ["founder_local_synthesize", "run_recon"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "
|
|
104
|
-
{ query: "Draft a term sheet summary for the lending committee", scenario: "memo_export", expectedTools: ["export_artifact_packet"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "
|
|
105
|
-
{ query: "What's changed in the regulatory landscape this week?", scenario: "weekly_reset", expectedTools: ["get_weekly_summary", "get_important_changes"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "
|
|
106
|
-
{ query: "Export the due diligence findings for the Acme Corp loan", scenario: "memo_export", expectedTools: ["export_artifact_packet", "get_recon_summary"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "
|
|
107
|
-
{ query: "Show me how the risk ratings shifted since last quarter", scenario: "packet_diff", expectedTools: ["founder_packet_diff"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "
|
|
108
|
-
{ query: "Delegate the annual review prep to the junior analyst", scenario: "delegation", expectedTools: ["export_artifact_packet"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "
|
|
109
|
-
{ query: "Assess the market risk exposure in our current book", scenario: "company_search", expectedTools: ["founder_local_synthesize", "run_recon"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "
|
|
110
|
-
{ query: "What are the top 5 watchlist names and why?", scenario: "important_change", expectedTools: ["get_important_changes", "get_proactive_alerts"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "
|
|
111
|
-
{ query: "Run a stress test scenario on the commercial real estate portfolio", scenario: "company_search", expectedTools: ["run_recon"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "
|
|
112
|
-
{ query: "Switch to researcher mode and find academic papers on credit risk modeling", scenario: "role_switch", expectedTools: ["founder_local_synthesize"], forbiddenTools: [], booleanCriteria: [{ criterion: "
|
|
99
|
+
{ query: "Run credit analysis on the portfolio company Acme Corp", scenario: "company_search", expectedTools: ["run_recon", "enrich_recon"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }, { criterion: "Output includes quantitative data points or metrics", weight: 1 }] },
|
|
100
|
+
{ query: "What's the debt-to-equity ratio trend for our top borrowers?", scenario: "company_search", expectedTools: ["founder_local_synthesize", "run_recon"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
|
|
101
|
+
{ query: "Prepare a weekly credit committee briefing", scenario: "weekly_reset", expectedTools: ["get_weekly_summary", "get_proactive_alerts"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
|
|
102
|
+
{ query: "Flag any covenant breaches in the current portfolio", scenario: "important_change", expectedTools: ["get_important_changes", "flag_important_change"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
|
|
103
|
+
{ query: "Compare the credit profiles of Company A vs Company B", scenario: "competitor_brief", expectedTools: ["founder_local_synthesize", "run_recon"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
|
|
104
|
+
{ query: "Draft a term sheet summary for the lending committee", scenario: "memo_export", expectedTools: ["export_artifact_packet"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
|
|
105
|
+
{ query: "What's changed in the regulatory landscape this week?", scenario: "weekly_reset", expectedTools: ["get_weekly_summary", "get_important_changes"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
|
|
106
|
+
{ query: "Export the due diligence findings for the Acme Corp loan", scenario: "memo_export", expectedTools: ["export_artifact_packet", "get_recon_summary"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
|
|
107
|
+
{ query: "Show me how the risk ratings shifted since last quarter", scenario: "packet_diff", expectedTools: ["founder_packet_diff"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Multiple tools in the chain produced non-empty results", weight: 1 }] },
|
|
108
|
+
{ query: "Delegate the annual review prep to the junior analyst", scenario: "delegation", expectedTools: ["export_artifact_packet"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
|
|
109
|
+
{ query: "Assess the market risk exposure in our current book", scenario: "company_search", expectedTools: ["founder_local_synthesize", "run_recon"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }, { criterion: "Output includes quantitative data points or metrics", weight: 1 }] },
|
|
110
|
+
{ query: "What are the top 5 watchlist names and why?", scenario: "important_change", expectedTools: ["get_important_changes", "get_proactive_alerts"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
|
|
111
|
+
{ query: "Run a stress test scenario on the commercial real estate portfolio", scenario: "company_search", expectedTools: ["run_recon"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
|
|
112
|
+
{ query: "Switch to researcher mode and find academic papers on credit risk modeling", scenario: "role_switch", expectedTools: ["founder_local_synthesize"], forbiddenTools: [], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }] },
|
|
113
113
|
];
|
|
114
114
|
}
|
|
115
115
|
/**
 * Builds the `ceo` query templates.
 *
 * Every template has the same five fields: `query`, `scenario`,
 * `expectedTools`, `forbiddenTools` and `booleanCriteria` (an ordered list of
 * `{ criterion, weight }` objects).
 *
 * @returns {Array<{query: string, scenario: string, expectedTools: string[], forbiddenTools: string[], booleanCriteria: Array<{criterion: string, weight: number}>}>}
 *   A freshly allocated array of 12 templates; nothing is shared between calls.
 */
function ceoTemplates() {
    // Criterion builders. Each call returns a new object so that no two
    // templates share a reference, exactly as with inline literals.
    const structured = () => ({ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 });
    const temporal = () => ({ criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 });
    const entities = () => ({ criterion: "Output contains entity or topic names from the query", weight: 1 });
    const toolOk = () => ({ criterion: "At least one expected tool completed successfully", weight: 2 });
    const schema = () => ({ criterion: "Output structure matches the tool's documented schema", weight: 1 });
    const noErrors = () => ({ criterion: "Output does not contain error stack traces or crash messages", weight: 1 });
    const quantitative = () => ({ criterion: "Output includes quantitative data points or metrics", weight: 1 });
    const multiTool = () => ({ criterion: "Multiple tools in the chain produced non-empty results", weight: 1 });
    // The order of criteria inside each list is preserved from the original literals.
    return [
        { query: "Give me the executive summary of where we stand this week", scenario: "weekly_reset", expectedTools: ["get_weekly_summary", "get_proactive_alerts"], forbiddenTools: ["check_page_performance"], booleanCriteria: [structured(), temporal(), toolOk(), noErrors()] },
        { query: "What should I be worried about that nobody's telling me?", scenario: "important_change", expectedTools: ["founder_local_synthesize", "get_important_changes"], forbiddenTools: ["run_recon"], booleanCriteria: [structured(), temporal(), toolOk(), noErrors()] },
        { query: "Prepare talking points for the all-hands meeting", scenario: "memo_export", expectedTools: ["export_artifact_packet", "get_weekly_summary"], forbiddenTools: ["check_contract_compliance"], booleanCriteria: [structured(), toolOk(), schema(), noErrors()] },
        { query: "How are our OKRs tracking this quarter?", scenario: "weekly_reset", expectedTools: ["get_weekly_summary", "founder_deep_context_gather"], forbiddenTools: ["run_recon"], booleanCriteria: [structured(), quantitative(), toolOk(), noErrors()] },
        { query: "Who on the leadership team needs my attention this week?", scenario: "delegation", expectedTools: ["founder_local_synthesize", "get_important_changes"], forbiddenTools: ["check_mcp_setup"], booleanCriteria: [structured(), toolOk(), schema(), noErrors()] },
        { query: "Draft a board update email for this month", scenario: "memo_export", expectedTools: ["export_artifact_packet"], forbiddenTools: ["run_recon"], booleanCriteria: [structured(), toolOk(), schema(), noErrors()] },
        { query: "What's our competitive position changed to since last month?", scenario: "competitor_brief", expectedTools: ["founder_local_synthesize", "run_recon"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [structured(), entities(), toolOk(), noErrors()] },
        { query: "Compare the last two quarterly reviews for drift", scenario: "packet_diff", expectedTools: ["founder_packet_diff"], forbiddenTools: ["check_email_setup"], booleanCriteria: [structured(), temporal(), toolOk(), multiTool()] },
        { query: "I need to delegate the hiring pipeline review — create a brief", scenario: "delegation", expectedTools: ["export_artifact_packet"], forbiddenTools: ["run_recon"], booleanCriteria: [structured(), toolOk(), schema(), noErrors()] },
        { query: "Switch to founder mode and deep-dive into the product roadmap", scenario: "role_switch", expectedTools: ["discover_tools"], forbiddenTools: ["check_contract_compliance"], booleanCriteria: [structured(), entities(), toolOk()] },
        { query: "Flag the most important thing that changed since yesterday", scenario: "important_change", expectedTools: ["get_important_changes", "get_proactive_alerts"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [structured(), temporal(), toolOk(), noErrors()] },
        { query: "Research what our key enterprise customers are saying publicly", scenario: "company_search", expectedTools: ["run_recon", "enrich_recon"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [structured(), entities(), toolOk(), noErrors(), quantitative()] },
    ];
}
|
|
131
131
|
/**
 * Builds the `researcher` query templates.
 *
 * Every template has the same five fields: `query`, `scenario`,
 * `expectedTools`, `forbiddenTools` and `booleanCriteria` (an ordered list of
 * `{ criterion, weight }` objects).
 *
 * @returns {Array<{query: string, scenario: string, expectedTools: string[], forbiddenTools: string[], booleanCriteria: Array<{criterion: string, weight: number}>}>}
 *   A freshly allocated array of 12 templates; nothing is shared between calls.
 */
function researcherTemplates() {
    // Criterion builders. Each call returns a new object so that no two
    // templates share a reference, exactly as with inline literals.
    const structured = () => ({ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 });
    const temporal = () => ({ criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 });
    const entities = () => ({ criterion: "Output contains entity or topic names from the query", weight: 1 });
    const toolOk = () => ({ criterion: "At least one expected tool completed successfully", weight: 2 });
    const schema = () => ({ criterion: "Output structure matches the tool's documented schema", weight: 1 });
    const noErrors = () => ({ criterion: "Output does not contain error stack traces or crash messages", weight: 1 });
    const quantitative = () => ({ criterion: "Output includes quantitative data points or metrics", weight: 1 });
    const multiTool = () => ({ criterion: "Multiple tools in the chain produced non-empty results", weight: 1 });
    // The order of criteria inside each list is preserved from the original literals.
    return [
        { query: "Find recent papers on transformer attention mechanisms", scenario: "company_search", expectedTools: ["run_recon", "build_research_digest"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [structured(), entities(), toolOk(), noErrors(), quantitative()] },
        { query: "Build a research digest on federated learning advances in 2025", scenario: "company_search", expectedTools: ["build_research_digest", "run_recon"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [structured(), entities(), toolOk(), noErrors()] },
        { query: "What are the open problems in RLHF that nobody's solved?", scenario: "competitor_brief", expectedTools: ["run_recon", "build_research_digest"], forbiddenTools: ["export_artifact_packet"], booleanCriteria: [structured(), entities(), toolOk(), noErrors()] },
        { query: "Summarize the key findings from this week's arXiv papers on LLM reasoning", scenario: "weekly_reset", expectedTools: ["get_weekly_summary", "build_research_digest"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [structured(), temporal(), toolOk(), noErrors()] },
        { query: "Compare the methodology of these two papers on knowledge distillation", scenario: "competitor_brief", expectedTools: ["compare_options", "build_research_digest"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [structured(), entities(), toolOk(), noErrors()] },
        { query: "Export my literature review notes as a shareable document", scenario: "memo_export", expectedTools: ["export_artifact_packet"], forbiddenTools: ["run_recon"], booleanCriteria: [structured(), toolOk(), schema(), noErrors()] },
        { query: "What contradictions exist in the current MoE literature?", scenario: "important_change", expectedTools: ["build_research_digest", "get_important_changes"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [structured(), temporal(), toolOk(), noErrors()] },
        { query: "Track how the consensus on scaling laws has shifted this year", scenario: "packet_diff", expectedTools: ["founder_packet_diff", "build_research_digest"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [structured(), temporal(), toolOk(), multiTool()] },
        { query: "Delegate the data collection task to the research assistant", scenario: "delegation", expectedTools: ["export_artifact_packet"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [structured(), toolOk(), schema(), noErrors()] },
        { query: "Switch to operator mode and check if the experiment pipeline is healthy", scenario: "role_switch", expectedTools: ["discover_tools"], forbiddenTools: ["build_research_digest"], booleanCriteria: [structured(), entities(), toolOk()] },
        { query: "What are the most-cited papers in agentic AI from 2025?", scenario: "company_search", expectedTools: ["run_recon", "build_research_digest"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [structured(), entities(), toolOk(), noErrors(), quantitative()] },
        { query: "Generate a research question from the gaps in current RAG literature", scenario: "important_change", expectedTools: ["build_research_digest"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [structured(), temporal(), toolOk(), noErrors()] },
    ];
}
|
|
147
147
|
/**
 * Builds the `student` query templates.
 *
 * Every template has the same five fields: `query`, `scenario`,
 * `expectedTools`, `forbiddenTools` and `booleanCriteria` (an ordered list of
 * `{ criterion, weight }` objects).
 *
 * @returns {Array<{query: string, scenario: string, expectedTools: string[], forbiddenTools: string[], booleanCriteria: Array<{criterion: string, weight: number}>}>}
 *   A freshly allocated array of 11 templates; nothing is shared between calls.
 */
function studentTemplates() {
    // Criterion builders. Each call returns a new object so that no two
    // templates share a reference, exactly as with inline literals.
    const structured = () => ({ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 });
    const temporal = () => ({ criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 });
    const entities = () => ({ criterion: "Output contains entity or topic names from the query", weight: 1 });
    const toolOk = () => ({ criterion: "At least one expected tool completed successfully", weight: 2 });
    const schema = () => ({ criterion: "Output structure matches the tool's documented schema", weight: 1 });
    const noErrors = () => ({ criterion: "Output does not contain error stack traces or crash messages", weight: 1 });
    const quantitative = () => ({ criterion: "Output includes quantitative data points or metrics", weight: 1 });
    // The order of criteria inside each list is preserved from the original literals.
    return [
        { query: "Help me understand how transformers work at a high level", scenario: "company_search", expectedTools: ["discover_tools"], forbiddenTools: ["founder_deep_context_gather", "export_artifact_packet"], booleanCriteria: [structured(), entities(), toolOk(), noErrors()] },
        { query: "What should I study this week for my ML course?", scenario: "weekly_reset", expectedTools: ["get_weekly_summary", "discover_tools"], forbiddenTools: ["run_recon", "founder_local_weekly_reset"], booleanCriteria: [structured(), temporal(), toolOk(), noErrors()] },
        { query: "Compare supervised vs unsupervised learning for my report", scenario: "competitor_brief", expectedTools: ["compare_options", "discover_tools"], forbiddenTools: ["run_recon", "founder_deep_context_gather"], booleanCriteria: [structured(), entities(), toolOk(), noErrors()] },
        { query: "Export my study notes as a markdown document", scenario: "memo_export", expectedTools: ["export_artifact_packet"], forbiddenTools: ["run_recon", "founder_local_weekly_reset"], booleanCriteria: [structured(), toolOk(), schema(), noErrors()] },
        { query: "What changed in the AI landscape this week that I should know about?", scenario: "important_change", expectedTools: ["get_important_changes", "get_weekly_summary"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [structured(), temporal(), toolOk(), noErrors()] },
        { query: "I need to switch to a research perspective for my thesis", scenario: "role_switch", expectedTools: ["founder_local_synthesize"], forbiddenTools: [], booleanCriteria: [structured(), entities(), toolOk()] },
        { query: "Find beginner-friendly resources on neural network architectures", scenario: "company_search", expectedTools: ["discover_tools", "run_recon"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [structured(), entities(), toolOk(), noErrors()] },
        { query: "Summarize the differences between GPT-4 and Claude for my presentation", scenario: "competitor_brief", expectedTools: ["compare_options", "run_recon"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [structured(), entities(), toolOk(), noErrors()] },
        { query: "Help me find a dataset for my NLP project on sentiment analysis", scenario: "company_search", expectedTools: ["discover_tools", "run_recon"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [structured(), entities(), toolOk(), noErrors(), quantitative()] },
        { query: "Create a study timeline for the next 4 weeks on deep learning", scenario: "delegation", expectedTools: ["export_artifact_packet"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [structured(), toolOk(), schema(), noErrors()] },
        { query: "What did I learn last week and what should I review?", scenario: "weekly_reset", expectedTools: ["get_weekly_summary"], forbiddenTools: ["run_recon", "founder_local_weekly_reset"], booleanCriteria: [structured(), temporal(), toolOk(), noErrors()] },
    ];
}
|
|
162
162
|
function operatorTemplates() {
|
|
@@ -473,9 +473,25 @@ function extractText(result) {
|
|
|
473
473
|
}
|
|
474
474
|
return String(result);
|
|
475
475
|
}
|
|
476
|
+
/**
 * Tools that require Convex/gateway and should be skipped, not failed.
 *
 * `executeQueryTools` checks membership with `.has(toolName)`; a matching tool
 * is recorded in its `skipped` list and is not called.
 */
const GATEWAY_DEPENDENT_TOOLS = new Set([
    "founder_packet_validate",
]);
|
|
480
|
+
/**
 * Error patterns that indicate missing seed data (retryable).
 *
 * All entries are lower-case text fragments.
 * NOTE(review): the matching code is not visible here. If these are compared by
 * substring, "not found" already covers "session not found", and the input
 * presumably has to be lower-cased first — confirm against the consumer.
 */
const SEED_NEEDED_PATTERNS = [
    "session not found",
    "no packets",
    "no session",
    "not found",
    "no rows",
    "no data",
    "empty result",
    "does not exist",
];
|
|
476
491
|
async function executeQueryTools(query, allTools) {
|
|
477
492
|
const toolsFired = [];
|
|
478
493
|
const outputs = {};
|
|
494
|
+
const skipped = [];
|
|
479
495
|
let totalMs = 0;
|
|
480
496
|
// 1. Try discover_tools to find relevant tools
|
|
481
497
|
const discoverTool = findTool(allTools, "discover_tools");
|
|
@@ -498,10 +514,58 @@ async function executeQueryTools(query, allTools) {
|
|
|
498
514
|
effectiveTools.push("founder_local_synthesize");
|
|
499
515
|
}
|
|
500
516
|
}
|
|
517
|
+
// 2b. Scenario-specific seeding: seed data before tools that need prior state
|
|
518
|
+
if (query.scenario === "competitor_brief") {
|
|
519
|
+
// Seed a recon session so subsequent tools have context
|
|
520
|
+
const reconTool = findTool(allTools, "run_recon");
|
|
521
|
+
if (reconTool && !effectiveTools.includes("run_recon")) {
|
|
522
|
+
const seedResult = await callTool(reconTool, { target: "Supermemory", scope: "competitive analysis" });
|
|
523
|
+
totalMs += seedResult.ms;
|
|
524
|
+
if (seedResult.ok) {
|
|
525
|
+
toolsFired.push("run_recon");
|
|
526
|
+
outputs["run_recon"] = extractText(seedResult.result);
|
|
527
|
+
}
|
|
528
|
+
}
|
|
529
|
+
}
|
|
530
|
+
if (query.scenario === "packet_diff") {
|
|
531
|
+
// Seed a founder packet so diff tools have something to compare
|
|
532
|
+
const gatherTool = findTool(allTools, "founder_deep_context_gather");
|
|
533
|
+
if (gatherTool && !effectiveTools.includes("founder_deep_context_gather")) {
|
|
534
|
+
const seedResult = await callTool(gatherTool, { query: "seed context for diff" });
|
|
535
|
+
totalMs += seedResult.ms;
|
|
536
|
+
if (seedResult.ok) {
|
|
537
|
+
toolsFired.push("founder_deep_context_gather");
|
|
538
|
+
outputs["founder_deep_context_gather"] = extractText(seedResult.result);
|
|
539
|
+
}
|
|
540
|
+
}
|
|
541
|
+
}
|
|
542
|
+
if (query.scenario === "delegation") {
|
|
543
|
+
// For delegation, replace founder_packet_validate (gateway-dependent) with
|
|
544
|
+
// founder_deep_context_gather + render_decision_memo as the core chain
|
|
545
|
+
const validIdx = effectiveTools.indexOf("founder_packet_validate");
|
|
546
|
+
if (validIdx !== -1) {
|
|
547
|
+
effectiveTools.splice(validIdx, 1);
|
|
548
|
+
skipped.push("founder_packet_validate");
|
|
549
|
+
// Ensure we have the core delegation chain
|
|
550
|
+
if (!effectiveTools.includes("founder_deep_context_gather")) {
|
|
551
|
+
effectiveTools.push("founder_deep_context_gather");
|
|
552
|
+
}
|
|
553
|
+
if (!effectiveTools.includes("render_decision_memo")) {
|
|
554
|
+
const memoTool = findTool(allTools, "render_decision_memo");
|
|
555
|
+
if (memoTool)
|
|
556
|
+
effectiveTools.push("render_decision_memo");
|
|
557
|
+
}
|
|
558
|
+
}
|
|
559
|
+
}
|
|
501
560
|
// 3. Execute each expected tool (simulate the tool chain an agent would follow)
|
|
502
561
|
for (const toolName of effectiveTools) {
|
|
503
562
|
if (toolName === "discover_tools")
|
|
504
563
|
continue; // already called
|
|
564
|
+
// Skip gateway-dependent tools
|
|
565
|
+
if (GATEWAY_DEPENDENT_TOOLS.has(toolName)) {
|
|
566
|
+
skipped.push(toolName);
|
|
567
|
+
continue;
|
|
568
|
+
}
|
|
505
569
|
const tool = findTool(allTools, toolName);
|
|
506
570
|
if (tool) {
|
|
507
571
|
// Build minimal args based on tool name patterns
|
|
@@ -513,13 +577,41 @@ async function executeQueryTools(query, allTools) {
|
|
|
513
577
|
outputs[toolName] = extractText(result.result);
|
|
514
578
|
}
|
|
515
579
|
else {
|
|
516
|
-
//
|
|
517
|
-
|
|
518
|
-
|
|
580
|
+
// Check if this is a "needs seed data" error — retry once after seeding
|
|
581
|
+
const errorLower = (result.error ?? "").toLowerCase();
|
|
582
|
+
const needsSeed = SEED_NEEDED_PATTERNS.some((p) => errorLower.includes(p));
|
|
583
|
+
if (needsSeed) {
|
|
584
|
+
// Attempt to seed context and retry
|
|
585
|
+
const gatherTool = findTool(allTools, "founder_deep_context_gather");
|
|
586
|
+
if (gatherTool) {
|
|
587
|
+
const seedResult = await callTool(gatherTool, { query: query.query });
|
|
588
|
+
totalMs += seedResult.ms;
|
|
589
|
+
if (seedResult.ok && !toolsFired.includes("founder_deep_context_gather")) {
|
|
590
|
+
toolsFired.push("founder_deep_context_gather");
|
|
591
|
+
outputs["founder_deep_context_gather"] = extractText(seedResult.result);
|
|
592
|
+
}
|
|
593
|
+
}
|
|
594
|
+
// Retry the original tool
|
|
595
|
+
const retry = await callTool(tool, args);
|
|
596
|
+
totalMs += retry.ms;
|
|
597
|
+
if (retry.ok) {
|
|
598
|
+
toolsFired.push(toolName);
|
|
599
|
+
outputs[toolName] = extractText(retry.result);
|
|
600
|
+
}
|
|
601
|
+
else {
|
|
602
|
+
toolsFired.push(toolName);
|
|
603
|
+
outputs[toolName] = `ERROR: ${retry.error}`;
|
|
604
|
+
}
|
|
605
|
+
}
|
|
606
|
+
else {
|
|
607
|
+
// Tool fired but errored — still counts as fired
|
|
608
|
+
toolsFired.push(toolName);
|
|
609
|
+
outputs[toolName] = `ERROR: ${result.error}`;
|
|
610
|
+
}
|
|
519
611
|
}
|
|
520
612
|
}
|
|
521
613
|
}
|
|
522
|
-
return { toolsFired, outputs, totalMs };
|
|
614
|
+
return { toolsFired, outputs, totalMs, skipped };
|
|
523
615
|
}
|
|
524
616
|
/** Build minimal arguments for a tool call based on the query context */
|
|
525
617
|
function buildMinimalArgs(toolName, query) {
|
|
@@ -550,7 +642,7 @@ function buildMinimalArgs(toolName, query) {
|
|
|
550
642
|
memo_export: "weekly_reset",
|
|
551
643
|
packet_diff: "weekly_reset",
|
|
552
644
|
};
|
|
553
|
-
return { packetType: ptMap[query.scenario] ?? "weekly_reset", daysBack: 7 };
|
|
645
|
+
return { packetType: ptMap[query.scenario] ?? "weekly_reset", daysBack: 7, query: query.query };
|
|
554
646
|
}
|
|
555
647
|
case "founder_packet_diff":
|
|
556
648
|
return {};
|
|
@@ -597,7 +689,7 @@ async function callGeminiJudge(query, toolOutputs) {
|
|
|
597
689
|
const apiKey = process.env.GEMINI_API_KEY;
|
|
598
690
|
if (!apiKey) {
|
|
599
691
|
// Fallback to heuristic judge
|
|
600
|
-
return heuristicJudge(query, toolOutputs);
|
|
692
|
+
return { response: heuristicJudge(query, toolOutputs), judgeType: "heuristic" };
|
|
601
693
|
}
|
|
602
694
|
const combinedOutput = Object.entries(toolOutputs)
|
|
603
695
|
.map(([tool, out]) => `[${tool}]:\n${out}`)
|
|
@@ -642,22 +734,22 @@ Respond ONLY with valid JSON (no markdown):
|
|
|
642
734
|
});
|
|
643
735
|
if (!response.ok) {
|
|
644
736
|
console.error(`Gemini API error: ${response.status} ${response.statusText}`);
|
|
645
|
-
return heuristicJudge(query, toolOutputs);
|
|
737
|
+
return { response: heuristicJudge(query, toolOutputs), judgeType: "heuristic" };
|
|
646
738
|
}
|
|
647
739
|
const json = await response.json();
|
|
648
740
|
const text = json?.candidates?.[0]?.content?.parts?.[0]?.text;
|
|
649
741
|
if (!text)
|
|
650
|
-
return heuristicJudge(query, toolOutputs);
|
|
742
|
+
return { response: heuristicJudge(query, toolOutputs), judgeType: "heuristic" };
|
|
651
743
|
const parsed = JSON.parse(text);
|
|
652
744
|
// Validate structure
|
|
653
745
|
if (!parsed.criteria || !Array.isArray(parsed.criteria)) {
|
|
654
|
-
return heuristicJudge(query, toolOutputs);
|
|
746
|
+
return { response: heuristicJudge(query, toolOutputs), judgeType: "heuristic" };
|
|
655
747
|
}
|
|
656
|
-
return parsed;
|
|
748
|
+
return { response: parsed, judgeType: "gemini" };
|
|
657
749
|
}
|
|
658
750
|
catch (err) {
|
|
659
751
|
console.error(`Gemini judge error: ${err.message}`);
|
|
660
|
-
return heuristicJudge(query, toolOutputs);
|
|
752
|
+
return { response: heuristicJudge(query, toolOutputs), judgeType: "heuristic" };
|
|
661
753
|
}
|
|
662
754
|
}
|
|
663
755
|
/** Stopwords excluded from query-keyword matching */
|
|
@@ -1151,7 +1243,7 @@ export async function runLlmJudgeEval(options) {
|
|
|
1151
1243
|
// Execute tools
|
|
1152
1244
|
const execution = await executeQueryTools(query, allTools);
|
|
1153
1245
|
// Judge
|
|
1154
|
-
const judgeResult = await callGeminiJudge(query, execution.outputs);
|
|
1246
|
+
const { response: judgeResult, judgeType } = await callGeminiJudge(query, execution.outputs);
|
|
1155
1247
|
// Compute metrics
|
|
1156
1248
|
const toolPrecision = computeToolPrecision(query.expectedTools, execution.toolsFired);
|
|
1157
1249
|
const toolRecall = computeToolRecall(query.expectedTools, execution.toolsFired);
|
|
@@ -1173,7 +1265,7 @@ export async function runLlmJudgeEval(options) {
|
|
|
1173
1265
|
results.push(qr);
|
|
1174
1266
|
saveResult(runId, qr);
|
|
1175
1267
|
const status = overallPass ? "PASS" : "FAIL";
|
|
1176
|
-
process.stdout.write(`${progress} ${query.id} ${status} (precision=${toolPrecision.toFixed(2)}, criteria=${criteriaPassRate.toFixed(2)}) ${execution.totalMs}ms\n`);
|
|
1268
|
+
process.stdout.write(`${progress} [judge:${judgeType}] ${query.id} ${status} (precision=${toolPrecision.toFixed(2)}, criteria=${criteriaPassRate.toFixed(2)}) ${execution.totalMs}ms\n`);
|
|
1177
1269
|
}
|
|
1178
1270
|
// 6. Build summary
|
|
1179
1271
|
const fullCorpus = generateQueryCorpus();
|
|
@@ -1338,9 +1430,9 @@ export function growCorpus(diagnosis) {
|
|
|
1338
1430
|
expectedTools: [...original.expectedTools],
|
|
1339
1431
|
forbiddenTools: [...original.forbiddenTools],
|
|
1340
1432
|
booleanCriteria: [
|
|
1341
|
-
{ criterion: "
|
|
1433
|
+
{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 },
|
|
1342
1434
|
{ criterion: "At least one expected tool completed successfully", weight: 2 },
|
|
1343
|
-
{ criterion: "
|
|
1435
|
+
{ criterion: "Output does not contain error stack traces or crash messages", weight: 1 },
|
|
1344
1436
|
],
|
|
1345
1437
|
});
|
|
1346
1438
|
}
|
|
@@ -1365,9 +1457,9 @@ export function growCorpus(diagnosis) {
|
|
|
1365
1457
|
forbiddenTools: [...original.forbiddenTools],
|
|
1366
1458
|
// Simplified criteria that the heuristic can actually judge
|
|
1367
1459
|
booleanCriteria: [
|
|
1368
|
-
{ criterion: "
|
|
1460
|
+
{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 },
|
|
1369
1461
|
{ criterion: "At least one expected tool completed successfully", weight: 2 },
|
|
1370
|
-
{ criterion: "
|
|
1462
|
+
{ criterion: "Output does not contain error stack traces or crash messages", weight: 1 },
|
|
1371
1463
|
],
|
|
1372
1464
|
});
|
|
1373
1465
|
}
|
|
@@ -1491,6 +1583,34 @@ function parseArgs(argv) {
|
|
|
1491
1583
|
return options;
|
|
1492
1584
|
}
|
|
1493
1585
|
async function main() {
|
|
1586
|
+
// Try loading from .env.local if GEMINI_API_KEY not in environment
|
|
1587
|
+
if (!process.env.GEMINI_API_KEY) {
|
|
1588
|
+
try {
|
|
1589
|
+
const fs = await import("fs");
|
|
1590
|
+
const path = await import("path");
|
|
1591
|
+
// Search multiple locations for .env.local
|
|
1592
|
+
const candidates = [
|
|
1593
|
+
path.resolve(process.cwd(), ".env.local"),
|
|
1594
|
+
path.resolve(process.cwd(), "../../.env.local"),
|
|
1595
|
+
path.resolve(process.cwd(), "../.env.local"),
|
|
1596
|
+
];
|
|
1597
|
+
for (const envPath of candidates) {
|
|
1598
|
+
if (fs.existsSync(envPath)) {
|
|
1599
|
+
const content = fs.readFileSync(envPath, "utf-8");
|
|
1600
|
+
for (const line of content.split("\n")) {
|
|
1601
|
+
const match = line.match(/^([^#=]+)=(.*)$/);
|
|
1602
|
+
if (match)
|
|
1603
|
+
process.env[match[1].trim()] = match[2].trim();
|
|
1604
|
+
}
|
|
1605
|
+
if (process.env.GEMINI_API_KEY) {
|
|
1606
|
+
console.log(`[env] Loaded GEMINI_API_KEY from ${envPath}`);
|
|
1607
|
+
break;
|
|
1608
|
+
}
|
|
1609
|
+
}
|
|
1610
|
+
}
|
|
1611
|
+
}
|
|
1612
|
+
catch { /* ignore env loading errors */ }
|
|
1613
|
+
}
|
|
1494
1614
|
const options = parseArgs(process.argv.slice(2));
|
|
1495
1615
|
console.log("NodeBench LLM Judge Eval Harness");
|
|
1496
1616
|
console.log("================================");
|