nodebench-mcp 2.59.0 → 2.61.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -65,98 +65,98 @@ const SCENARIOS = [
65
65
  function founderTemplates() {
66
66
  return [
67
67
  // weekly_reset
68
- { query: "What changed in our product direction this week?", scenario: "weekly_reset", expectedTools: ["founder_deep_context_gather", "get_weekly_summary"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
69
- { query: "Give me a weekly reset briefing for the founding team", scenario: "weekly_reset", expectedTools: ["founder_local_weekly_reset", "founder_deep_context_gather"], forbiddenTools: ["check_contract_compliance"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
70
- { query: "Summarize last week's key decisions and their rationale", scenario: "weekly_reset", expectedTools: ["founder_deep_context_gather", "get_weekly_summary"], forbiddenTools: ["generate_zero_draft"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
71
- { query: "What are the top 3 risks to our current sprint?", scenario: "weekly_reset", expectedTools: ["founder_deep_context_gather", "get_proactive_alerts"], forbiddenTools: ["run_recon", "check_page_performance"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output includes quantitative data points or metrics", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
72
- { query: "How is our burn rate tracking against the runway?", scenario: "weekly_reset", expectedTools: ["founder_deep_context_gather"], forbiddenTools: ["run_recon", "check_email_setup"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output includes quantitative data points or metrics", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
73
- { query: "What did we ship this week and what slipped?", scenario: "weekly_reset", expectedTools: ["founder_deep_context_gather", "get_weekly_summary"], forbiddenTools: ["generate_report"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
68
+ { query: "What changed in our product direction this week?", scenario: "weekly_reset", expectedTools: ["founder_deep_context_gather", "get_weekly_summary"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
69
+ { query: "Give me a weekly reset briefing for the founding team", scenario: "weekly_reset", expectedTools: ["founder_local_weekly_reset", "founder_deep_context_gather"], forbiddenTools: ["check_contract_compliance"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
70
+ { query: "Summarize last week's key decisions and their rationale", scenario: "weekly_reset", expectedTools: ["founder_deep_context_gather", "get_weekly_summary"], forbiddenTools: ["generate_zero_draft"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
71
+ { query: "What are the top 3 risks to our current sprint?", scenario: "weekly_reset", expectedTools: ["founder_deep_context_gather", "get_proactive_alerts"], forbiddenTools: ["run_recon", "check_page_performance"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output includes quantitative data points or metrics", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
72
+ { query: "How is our burn rate tracking against the runway?", scenario: "weekly_reset", expectedTools: ["founder_deep_context_gather"], forbiddenTools: ["run_recon", "check_email_setup"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output includes quantitative data points or metrics", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
73
+ { query: "What did we ship this week and what slipped?", scenario: "weekly_reset", expectedTools: ["founder_deep_context_gather", "get_weekly_summary"], forbiddenTools: ["generate_report"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
74
74
  // company_search
75
- { query: "Research Stripe and tell me about their latest product moves", scenario: "company_search", expectedTools: ["run_recon", "enrich_recon"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }, { criterion: "Output includes quantitative data points or metrics", weight: 1 }] },
76
- { query: "Pull everything you know about Anthropic's recent funding", scenario: "company_search", expectedTools: ["run_recon", "enrich_recon"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
75
+ { query: "Research Stripe and tell me about their latest product moves", scenario: "company_search", expectedTools: ["run_recon", "enrich_recon"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }, { criterion: "Output includes quantitative data points or metrics", weight: 1 }] },
76
+ { query: "Pull everything you know about Anthropic's recent funding", scenario: "company_search", expectedTools: ["run_recon", "enrich_recon"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
77
77
  // competitor_brief
78
- { query: "Compare our product positioning against Linear and Notion", scenario: "competitor_brief", expectedTools: ["founder_local_synthesize", "run_recon"], forbiddenTools: ["start_dogfood_session"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
79
- { query: "What are the moats of our top 3 competitors?", scenario: "competitor_brief", expectedTools: ["founder_local_synthesize", "run_recon"], forbiddenTools: ["check_mcp_setup"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
78
+ { query: "Compare our product positioning against Linear and Notion", scenario: "competitor_brief", expectedTools: ["founder_local_synthesize", "run_recon"], forbiddenTools: ["start_dogfood_session"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
79
+ { query: "What are the moats of our top 3 competitors?", scenario: "competitor_brief", expectedTools: ["founder_local_synthesize", "run_recon"], forbiddenTools: ["check_mcp_setup"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
80
80
  // delegation
81
- { query: "Draft a delegation brief for the engineering lead on the auth refactor", scenario: "delegation", expectedTools: ["founder_deep_context_gather", "export_artifact_packet"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
82
- { query: "Create a handoff packet for the new VP of Product", scenario: "delegation", expectedTools: ["founder_deep_context_gather", "export_artifact_packet"], forbiddenTools: ["check_contract_compliance"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
81
+ { query: "Draft a delegation brief for the engineering lead on the auth refactor", scenario: "delegation", expectedTools: ["founder_deep_context_gather", "export_artifact_packet"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
82
+ { query: "Create a handoff packet for the new VP of Product", scenario: "delegation", expectedTools: ["founder_deep_context_gather", "export_artifact_packet"], forbiddenTools: ["check_contract_compliance"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
83
83
  // important_change
84
- { query: "Flag any important changes in our competitive landscape this week", scenario: "important_change", expectedTools: ["founder_local_synthesize", "get_important_changes"], forbiddenTools: ["start_verification_cycle"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
85
- { query: "What's the most critical thing I should know about right now?", scenario: "important_change", expectedTools: ["founder_local_synthesize", "get_important_changes"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
84
+ { query: "Flag any important changes in our competitive landscape this week", scenario: "important_change", expectedTools: ["founder_local_synthesize", "get_important_changes"], forbiddenTools: ["start_verification_cycle"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
85
+ { query: "What's the most critical thing I should know about right now?", scenario: "important_change", expectedTools: ["founder_local_synthesize", "get_important_changes"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
86
86
  // memo_export
87
- { query: "Export our latest decision memo as a shareable packet", scenario: "memo_export", expectedTools: ["export_artifact_packet"], forbiddenTools: ["run_recon", "check_page_performance"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
88
- { query: "Package the Q1 strategy review for the board", scenario: "memo_export", expectedTools: ["export_artifact_packet", "founder_deep_context_gather"], forbiddenTools: ["start_dogfood_session"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
87
+ { query: "Export our latest decision memo as a shareable packet", scenario: "memo_export", expectedTools: ["export_artifact_packet"], forbiddenTools: ["run_recon", "check_page_performance"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
88
+ { query: "Package the Q1 strategy review for the board", scenario: "memo_export", expectedTools: ["export_artifact_packet", "founder_deep_context_gather"], forbiddenTools: ["start_dogfood_session"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
89
89
  // packet_diff
90
- { query: "What changed between the last two strategy packets?", scenario: "packet_diff", expectedTools: ["founder_packet_diff", "founder_packet_history_diff"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Multiple tools in the chain produced non-empty results", weight: 1 }] },
91
- { query: "Show me the delta between our January and March founder packets", scenario: "packet_diff", expectedTools: ["founder_packet_diff", "founder_packet_history_diff"], forbiddenTools: ["check_email_setup"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Multiple tools in the chain produced non-empty results", weight: 1 }] },
90
+ { query: "What changed between the last two strategy packets?", scenario: "packet_diff", expectedTools: ["founder_packet_diff", "founder_packet_history_diff"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Multiple tools in the chain produced non-empty results", weight: 1 }] },
91
+ { query: "Show me the delta between our January and March founder packets", scenario: "packet_diff", expectedTools: ["founder_packet_diff", "founder_packet_history_diff"], forbiddenTools: ["check_email_setup"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Multiple tools in the chain produced non-empty results", weight: 1 }] },
92
92
  // role_switch
93
- { query: "Switch to investor mode and evaluate our pitch deck", scenario: "role_switch", expectedTools: ["founder_local_synthesize"], forbiddenTools: [], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }] },
94
- { query: "I need to think like a banker — what's the credit risk here?", scenario: "role_switch", expectedTools: ["founder_local_synthesize"], forbiddenTools: [], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }] },
93
+ { query: "Switch to investor mode and evaluate our pitch deck", scenario: "role_switch", expectedTools: ["founder_local_synthesize"], forbiddenTools: [], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }] },
94
+ { query: "I need to think like a banker — what's the credit risk here?", scenario: "role_switch", expectedTools: ["founder_local_synthesize"], forbiddenTools: [], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }] },
95
95
  ];
96
96
  }
97
97
  function bankerTemplates() {
98
98
  return [
99
- { query: "Run credit analysis on the portfolio company Acme Corp", scenario: "company_search", expectedTools: ["run_recon", "enrich_recon"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }, { criterion: "Output includes quantitative data points or metrics", weight: 1 }] },
100
- { query: "What's the debt-to-equity ratio trend for our top borrowers?", scenario: "company_search", expectedTools: ["founder_local_synthesize", "run_recon"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
101
- { query: "Prepare a weekly credit committee briefing", scenario: "weekly_reset", expectedTools: ["get_weekly_summary", "get_proactive_alerts"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
102
- { query: "Flag any covenant breaches in the current portfolio", scenario: "important_change", expectedTools: ["get_important_changes", "flag_important_change"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
103
- { query: "Compare the credit profiles of Company A vs Company B", scenario: "competitor_brief", expectedTools: ["founder_local_synthesize", "run_recon"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
104
- { query: "Draft a term sheet summary for the lending committee", scenario: "memo_export", expectedTools: ["export_artifact_packet"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
105
- { query: "What's changed in the regulatory landscape this week?", scenario: "weekly_reset", expectedTools: ["get_weekly_summary", "get_important_changes"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
106
- { query: "Export the due diligence findings for the Acme Corp loan", scenario: "memo_export", expectedTools: ["export_artifact_packet", "get_recon_summary"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
107
- { query: "Show me how the risk ratings shifted since last quarter", scenario: "packet_diff", expectedTools: ["founder_packet_diff"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Multiple tools in the chain produced non-empty results", weight: 1 }] },
108
- { query: "Delegate the annual review prep to the junior analyst", scenario: "delegation", expectedTools: ["export_artifact_packet"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
109
- { query: "Assess the market risk exposure in our current book", scenario: "company_search", expectedTools: ["founder_local_synthesize", "run_recon"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }, { criterion: "Output includes quantitative data points or metrics", weight: 1 }] },
110
- { query: "What are the top 5 watchlist names and why?", scenario: "important_change", expectedTools: ["get_important_changes", "get_proactive_alerts"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
111
- { query: "Run a stress test scenario on the commercial real estate portfolio", scenario: "company_search", expectedTools: ["run_recon"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
112
- { query: "Switch to researcher mode and find academic papers on credit risk modeling", scenario: "role_switch", expectedTools: ["founder_local_synthesize"], forbiddenTools: [], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }] },
99
+ { query: "Run credit analysis on the portfolio company Acme Corp", scenario: "company_search", expectedTools: ["run_recon", "enrich_recon"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }, { criterion: "Output includes quantitative data points or metrics", weight: 1 }] },
100
+ { query: "What's the debt-to-equity ratio trend for our top borrowers?", scenario: "company_search", expectedTools: ["founder_local_synthesize", "run_recon"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
101
+ { query: "Prepare a weekly credit committee briefing", scenario: "weekly_reset", expectedTools: ["get_weekly_summary", "get_proactive_alerts"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
102
+ { query: "Flag any covenant breaches in the current portfolio", scenario: "important_change", expectedTools: ["get_important_changes", "flag_important_change"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
103
+ { query: "Compare the credit profiles of Company A vs Company B", scenario: "competitor_brief", expectedTools: ["founder_local_synthesize", "run_recon"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
104
+ { query: "Draft a term sheet summary for the lending committee", scenario: "memo_export", expectedTools: ["export_artifact_packet"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
105
+ { query: "What's changed in the regulatory landscape this week?", scenario: "weekly_reset", expectedTools: ["get_weekly_summary", "get_important_changes"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
106
+ { query: "Export the due diligence findings for the Acme Corp loan", scenario: "memo_export", expectedTools: ["export_artifact_packet", "get_recon_summary"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
107
+ { query: "Show me how the risk ratings shifted since last quarter", scenario: "packet_diff", expectedTools: ["founder_packet_diff"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Multiple tools in the chain produced non-empty results", weight: 1 }] },
108
+ { query: "Delegate the annual review prep to the junior analyst", scenario: "delegation", expectedTools: ["export_artifact_packet"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
109
+ { query: "Assess the market risk exposure in our current book", scenario: "company_search", expectedTools: ["founder_local_synthesize", "run_recon"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }, { criterion: "Output includes quantitative data points or metrics", weight: 1 }] },
110
+ { query: "What are the top 5 watchlist names and why?", scenario: "important_change", expectedTools: ["get_important_changes", "get_proactive_alerts"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
111
+ { query: "Run a stress test scenario on the commercial real estate portfolio", scenario: "company_search", expectedTools: ["run_recon"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
112
+ { query: "Switch to researcher mode and find academic papers on credit risk modeling", scenario: "role_switch", expectedTools: ["founder_local_synthesize"], forbiddenTools: [], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }] },
113
113
  ];
114
114
  }
115
115
  function ceoTemplates() {
116
116
  return [
117
- { query: "Give me the executive summary of where we stand this week", scenario: "weekly_reset", expectedTools: ["get_weekly_summary", "get_proactive_alerts"], forbiddenTools: ["check_page_performance"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
118
- { query: "What should I be worried about that nobody's telling me?", scenario: "important_change", expectedTools: ["founder_local_synthesize", "get_important_changes"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
119
- { query: "Prepare talking points for the all-hands meeting", scenario: "memo_export", expectedTools: ["export_artifact_packet", "get_weekly_summary"], forbiddenTools: ["check_contract_compliance"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
120
- { query: "How are our OKRs tracking this quarter?", scenario: "weekly_reset", expectedTools: ["get_weekly_summary", "founder_deep_context_gather"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output includes quantitative data points or metrics", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
121
- { query: "Who on the leadership team needs my attention this week?", scenario: "delegation", expectedTools: ["founder_local_synthesize", "get_important_changes"], forbiddenTools: ["check_mcp_setup"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
122
- { query: "Draft a board update email for this month", scenario: "memo_export", expectedTools: ["export_artifact_packet"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
123
- { query: "What's our competitive position changed to since last month?", scenario: "competitor_brief", expectedTools: ["founder_local_synthesize", "run_recon"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
124
- { query: "Compare the last two quarterly reviews for drift", scenario: "packet_diff", expectedTools: ["founder_packet_diff"], forbiddenTools: ["check_email_setup"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Multiple tools in the chain produced non-empty results", weight: 1 }] },
125
- { query: "I need to delegate the hiring pipeline review — create a brief", scenario: "delegation", expectedTools: ["export_artifact_packet"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
126
- { query: "Switch to founder mode and deep-dive into the product roadmap", scenario: "role_switch", expectedTools: ["discover_tools"], forbiddenTools: ["check_contract_compliance"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }] },
127
- { query: "Flag the most important thing that changed since yesterday", scenario: "important_change", expectedTools: ["get_important_changes", "get_proactive_alerts"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
128
- { query: "Research what our key enterprise customers are saying publicly", scenario: "company_search", expectedTools: ["run_recon", "enrich_recon"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }, { criterion: "Output includes quantitative data points or metrics", weight: 1 }] },
117
+ { query: "Give me the executive summary of where we stand this week", scenario: "weekly_reset", expectedTools: ["get_weekly_summary", "get_proactive_alerts"], forbiddenTools: ["check_page_performance"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
118
+ { query: "What should I be worried about that nobody's telling me?", scenario: "important_change", expectedTools: ["founder_local_synthesize", "get_important_changes"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
119
+ { query: "Prepare talking points for the all-hands meeting", scenario: "memo_export", expectedTools: ["export_artifact_packet", "get_weekly_summary"], forbiddenTools: ["check_contract_compliance"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
120
+ { query: "How are our OKRs tracking this quarter?", scenario: "weekly_reset", expectedTools: ["get_weekly_summary", "founder_deep_context_gather"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output includes quantitative data points or metrics", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
121
+ { query: "Who on the leadership team needs my attention this week?", scenario: "delegation", expectedTools: ["founder_local_synthesize", "get_important_changes"], forbiddenTools: ["check_mcp_setup"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
122
+ { query: "Draft a board update email for this month", scenario: "memo_export", expectedTools: ["export_artifact_packet"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
123
+ { query: "What's our competitive position changed to since last month?", scenario: "competitor_brief", expectedTools: ["founder_local_synthesize", "run_recon"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
124
+ { query: "Compare the last two quarterly reviews for drift", scenario: "packet_diff", expectedTools: ["founder_packet_diff"], forbiddenTools: ["check_email_setup"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Multiple tools in the chain produced non-empty results", weight: 1 }] },
125
+ { query: "I need to delegate the hiring pipeline review — create a brief", scenario: "delegation", expectedTools: ["export_artifact_packet"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
126
+ { query: "Switch to founder mode and deep-dive into the product roadmap", scenario: "role_switch", expectedTools: ["discover_tools"], forbiddenTools: ["check_contract_compliance"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }] },
127
+ { query: "Flag the most important thing that changed since yesterday", scenario: "important_change", expectedTools: ["get_important_changes", "get_proactive_alerts"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
128
+ { query: "Research what our key enterprise customers are saying publicly", scenario: "company_search", expectedTools: ["run_recon", "enrich_recon"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }, { criterion: "Output includes quantitative data points or metrics", weight: 1 }] },
129
129
  ];
130
130
  }
131
131
  function researcherTemplates() {
132
132
  return [
133
- { query: "Find recent papers on transformer attention mechanisms", scenario: "company_search", expectedTools: ["run_recon", "build_research_digest"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }, { criterion: "Output includes quantitative data points or metrics", weight: 1 }] },
134
- { query: "Build a research digest on federated learning advances in 2025", scenario: "company_search", expectedTools: ["build_research_digest", "run_recon"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
135
- { query: "What are the open problems in RLHF that nobody's solved?", scenario: "competitor_brief", expectedTools: ["run_recon", "build_research_digest"], forbiddenTools: ["export_artifact_packet"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
136
- { query: "Summarize the key findings from this week's arXiv papers on LLM reasoning", scenario: "weekly_reset", expectedTools: ["get_weekly_summary", "build_research_digest"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
137
- { query: "Compare the methodology of these two papers on knowledge distillation", scenario: "competitor_brief", expectedTools: ["compare_options", "build_research_digest"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
138
- { query: "Export my literature review notes as a shareable document", scenario: "memo_export", expectedTools: ["export_artifact_packet"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
139
- { query: "What contradictions exist in the current MoE literature?", scenario: "important_change", expectedTools: ["build_research_digest", "get_important_changes"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
140
- { query: "Track how the consensus on scaling laws has shifted this year", scenario: "packet_diff", expectedTools: ["founder_packet_diff", "build_research_digest"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Multiple tools in the chain produced non-empty results", weight: 1 }] },
141
- { query: "Delegate the data collection task to the research assistant", scenario: "delegation", expectedTools: ["export_artifact_packet"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
142
- { query: "Switch to operator mode and check if the experiment pipeline is healthy", scenario: "role_switch", expectedTools: ["discover_tools"], forbiddenTools: ["build_research_digest"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }] },
143
- { query: "What are the most-cited papers in agentic AI from 2025?", scenario: "company_search", expectedTools: ["run_recon", "build_research_digest"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }, { criterion: "Output includes quantitative data points or metrics", weight: 1 }] },
144
- { query: "Generate a research question from the gaps in current RAG literature", scenario: "important_change", expectedTools: ["build_research_digest"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
133
+ { query: "Find recent papers on transformer attention mechanisms", scenario: "company_search", expectedTools: ["run_recon", "build_research_digest"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }, { criterion: "Output includes quantitative data points or metrics", weight: 1 }] },
134
+ { query: "Build a research digest on federated learning advances in 2025", scenario: "company_search", expectedTools: ["build_research_digest", "run_recon"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
135
+ { query: "What are the open problems in RLHF that nobody's solved?", scenario: "competitor_brief", expectedTools: ["run_recon", "build_research_digest"], forbiddenTools: ["export_artifact_packet"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
136
+ { query: "Summarize the key findings from this week's arXiv papers on LLM reasoning", scenario: "weekly_reset", expectedTools: ["get_weekly_summary", "build_research_digest"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
137
+ { query: "Compare the methodology of these two papers on knowledge distillation", scenario: "competitor_brief", expectedTools: ["compare_options", "build_research_digest"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
138
+ { query: "Export my literature review notes as a shareable document", scenario: "memo_export", expectedTools: ["export_artifact_packet"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
139
+ { query: "What contradictions exist in the current MoE literature?", scenario: "important_change", expectedTools: ["build_research_digest", "get_important_changes"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
140
+ { query: "Track how the consensus on scaling laws has shifted this year", scenario: "packet_diff", expectedTools: ["founder_packet_diff", "build_research_digest"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Multiple tools in the chain produced non-empty results", weight: 1 }] },
141
+ { query: "Delegate the data collection task to the research assistant", scenario: "delegation", expectedTools: ["export_artifact_packet"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
142
+ { query: "Switch to operator mode and check if the experiment pipeline is healthy", scenario: "role_switch", expectedTools: ["discover_tools"], forbiddenTools: ["build_research_digest"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }] },
143
+ { query: "What are the most-cited papers in agentic AI from 2025?", scenario: "company_search", expectedTools: ["run_recon", "build_research_digest"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }, { criterion: "Output includes quantitative data points or metrics", weight: 1 }] },
144
+ { query: "Generate a research question from the gaps in current RAG literature", scenario: "important_change", expectedTools: ["build_research_digest"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
145
145
  ];
146
146
  }
147
147
  function studentTemplates() {
148
148
  return [
149
- { query: "Help me understand how transformers work at a high level", scenario: "company_search", expectedTools: ["discover_tools"], forbiddenTools: ["founder_deep_context_gather", "export_artifact_packet"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
150
- { query: "What should I study this week for my ML course?", scenario: "weekly_reset", expectedTools: ["get_weekly_summary", "discover_tools"], forbiddenTools: ["run_recon", "founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
151
- { query: "Compare supervised vs unsupervised learning for my report", scenario: "competitor_brief", expectedTools: ["compare_options", "discover_tools"], forbiddenTools: ["run_recon", "founder_deep_context_gather"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
152
- { query: "Export my study notes as a markdown document", scenario: "memo_export", expectedTools: ["export_artifact_packet"], forbiddenTools: ["run_recon", "founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
153
- { query: "What changed in the AI landscape this week that I should know about?", scenario: "important_change", expectedTools: ["get_important_changes", "get_weekly_summary"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
154
- { query: "I need to switch to a research perspective for my thesis", scenario: "role_switch", expectedTools: ["founder_local_synthesize"], forbiddenTools: [], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }] },
155
- { query: "Find beginner-friendly resources on neural network architectures", scenario: "company_search", expectedTools: ["discover_tools", "run_recon"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
156
- { query: "Summarize the differences between GPT-4 and Claude for my presentation", scenario: "competitor_brief", expectedTools: ["compare_options", "run_recon"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
157
- { query: "Help me find a dataset for my NLP project on sentiment analysis", scenario: "company_search", expectedTools: ["discover_tools", "run_recon"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }, { criterion: "Output includes quantitative data points or metrics", weight: 1 }] },
158
- { query: "Create a study timeline for the next 4 weeks on deep learning", scenario: "delegation", expectedTools: ["export_artifact_packet"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
159
- { query: "What did I learn last week and what should I review?", scenario: "weekly_reset", expectedTools: ["get_weekly_summary"], forbiddenTools: ["run_recon", "founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
149
+ { query: "Help me understand how transformers work at a high level", scenario: "company_search", expectedTools: ["discover_tools"], forbiddenTools: ["founder_deep_context_gather", "export_artifact_packet"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
150
+ { query: "What should I study this week for my ML course?", scenario: "weekly_reset", expectedTools: ["get_weekly_summary", "discover_tools"], forbiddenTools: ["run_recon", "founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
151
+ { query: "Compare supervised vs unsupervised learning for my report", scenario: "competitor_brief", expectedTools: ["compare_options", "discover_tools"], forbiddenTools: ["run_recon", "founder_deep_context_gather"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
152
+ { query: "Export my study notes as a markdown document", scenario: "memo_export", expectedTools: ["export_artifact_packet"], forbiddenTools: ["run_recon", "founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
153
+ { query: "What changed in the AI landscape this week that I should know about?", scenario: "important_change", expectedTools: ["get_important_changes", "get_weekly_summary"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
154
+ { query: "I need to switch to a research perspective for my thesis", scenario: "role_switch", expectedTools: ["founder_local_synthesize"], forbiddenTools: [], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }] },
155
+ { query: "Find beginner-friendly resources on neural network architectures", scenario: "company_search", expectedTools: ["discover_tools", "run_recon"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
156
+ { query: "Summarize the differences between GPT-4 and Claude for my presentation", scenario: "competitor_brief", expectedTools: ["compare_options", "run_recon"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
157
+ { query: "Help me find a dataset for my NLP project on sentiment analysis", scenario: "company_search", expectedTools: ["discover_tools", "run_recon"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }, { criterion: "Output includes quantitative data points or metrics", weight: 1 }] },
158
+ { query: "Create a study timeline for the next 4 weeks on deep learning", scenario: "delegation", expectedTools: ["export_artifact_packet"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
159
+ { query: "What did I learn last week and what should I review?", scenario: "weekly_reset", expectedTools: ["get_weekly_summary"], forbiddenTools: ["run_recon", "founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
160
160
  ];
161
161
  }
162
162
  function operatorTemplates() {
@@ -473,9 +473,25 @@ function extractText(result) {
473
473
  }
474
474
  return String(result);
475
475
  }
476
+ /** Tools that require Convex/gateway and should be skipped, not failed */
477
+ const GATEWAY_DEPENDENT_TOOLS = new Set([
478
+ "founder_packet_validate",
479
+ ]);
480
+ /** Error patterns that indicate missing seed data (retryable) */
481
+ const SEED_NEEDED_PATTERNS = [
482
+ "session not found",
483
+ "no packets",
484
+ "no session",
485
+ "not found",
486
+ "no rows",
487
+ "no data",
488
+ "empty result",
489
+ "does not exist",
490
+ ];
476
491
  async function executeQueryTools(query, allTools) {
477
492
  const toolsFired = [];
478
493
  const outputs = {};
494
+ const skipped = [];
479
495
  let totalMs = 0;
480
496
  // 1. Try discover_tools to find relevant tools
481
497
  const discoverTool = findTool(allTools, "discover_tools");
@@ -498,10 +514,58 @@ async function executeQueryTools(query, allTools) {
498
514
  effectiveTools.push("founder_local_synthesize");
499
515
  }
500
516
  }
517
+ // 2b. Scenario-specific seeding: seed data before tools that need prior state
518
+ if (query.scenario === "competitor_brief") {
519
+ // Seed a recon session so subsequent tools have context
520
+ const reconTool = findTool(allTools, "run_recon");
521
+ if (reconTool && !effectiveTools.includes("run_recon")) {
522
+ const seedResult = await callTool(reconTool, { target: "Supermemory", scope: "competitive analysis" });
523
+ totalMs += seedResult.ms;
524
+ if (seedResult.ok) {
525
+ toolsFired.push("run_recon");
526
+ outputs["run_recon"] = extractText(seedResult.result);
527
+ }
528
+ }
529
+ }
530
+ if (query.scenario === "packet_diff") {
531
+ // Seed a founder packet so diff tools have something to compare
532
+ const gatherTool = findTool(allTools, "founder_deep_context_gather");
533
+ if (gatherTool && !effectiveTools.includes("founder_deep_context_gather")) {
534
+ const seedResult = await callTool(gatherTool, { query: "seed context for diff" });
535
+ totalMs += seedResult.ms;
536
+ if (seedResult.ok) {
537
+ toolsFired.push("founder_deep_context_gather");
538
+ outputs["founder_deep_context_gather"] = extractText(seedResult.result);
539
+ }
540
+ }
541
+ }
542
+ if (query.scenario === "delegation") {
543
+ // For delegation, replace founder_packet_validate (gateway-dependent) with
544
+ // founder_deep_context_gather + render_decision_memo as the core chain
545
+ const validIdx = effectiveTools.indexOf("founder_packet_validate");
546
+ if (validIdx !== -1) {
547
+ effectiveTools.splice(validIdx, 1);
548
+ skipped.push("founder_packet_validate");
549
+ // Ensure we have the core delegation chain
550
+ if (!effectiveTools.includes("founder_deep_context_gather")) {
551
+ effectiveTools.push("founder_deep_context_gather");
552
+ }
553
+ if (!effectiveTools.includes("render_decision_memo")) {
554
+ const memoTool = findTool(allTools, "render_decision_memo");
555
+ if (memoTool)
556
+ effectiveTools.push("render_decision_memo");
557
+ }
558
+ }
559
+ }
501
560
  // 3. Execute each expected tool (simulate the tool chain an agent would follow)
502
561
  for (const toolName of effectiveTools) {
503
562
  if (toolName === "discover_tools")
504
563
  continue; // already called
564
+ // Skip gateway-dependent tools
565
+ if (GATEWAY_DEPENDENT_TOOLS.has(toolName)) {
566
+ skipped.push(toolName);
567
+ continue;
568
+ }
505
569
  const tool = findTool(allTools, toolName);
506
570
  if (tool) {
507
571
  // Build minimal args based on tool name patterns
@@ -513,13 +577,41 @@ async function executeQueryTools(query, allTools) {
513
577
  outputs[toolName] = extractText(result.result);
514
578
  }
515
579
  else {
516
- // Tool fired but erroredstill counts as fired
517
- toolsFired.push(toolName);
518
- outputs[toolName] = `ERROR: ${result.error}`;
580
+ // Check if this is a "needs seed data" error retry once after seeding
581
+ const errorLower = (result.error ?? "").toLowerCase();
582
+ const needsSeed = SEED_NEEDED_PATTERNS.some((p) => errorLower.includes(p));
583
+ if (needsSeed) {
584
+ // Attempt to seed context and retry
585
+ const gatherTool = findTool(allTools, "founder_deep_context_gather");
586
+ if (gatherTool) {
587
+ const seedResult = await callTool(gatherTool, { query: query.query });
588
+ totalMs += seedResult.ms;
589
+ if (seedResult.ok && !toolsFired.includes("founder_deep_context_gather")) {
590
+ toolsFired.push("founder_deep_context_gather");
591
+ outputs["founder_deep_context_gather"] = extractText(seedResult.result);
592
+ }
593
+ }
594
+ // Retry the original tool
595
+ const retry = await callTool(tool, args);
596
+ totalMs += retry.ms;
597
+ if (retry.ok) {
598
+ toolsFired.push(toolName);
599
+ outputs[toolName] = extractText(retry.result);
600
+ }
601
+ else {
602
+ toolsFired.push(toolName);
603
+ outputs[toolName] = `ERROR: ${retry.error}`;
604
+ }
605
+ }
606
+ else {
607
+ // Tool fired but errored — still counts as fired
608
+ toolsFired.push(toolName);
609
+ outputs[toolName] = `ERROR: ${result.error}`;
610
+ }
519
611
  }
520
612
  }
521
613
  }
522
- return { toolsFired, outputs, totalMs };
614
+ return { toolsFired, outputs, totalMs, skipped };
523
615
  }
524
616
  /** Build minimal arguments for a tool call based on the query context */
525
617
  function buildMinimalArgs(toolName, query) {
@@ -550,7 +642,7 @@ function buildMinimalArgs(toolName, query) {
550
642
  memo_export: "weekly_reset",
551
643
  packet_diff: "weekly_reset",
552
644
  };
553
- return { packetType: ptMap[query.scenario] ?? "weekly_reset", daysBack: 7 };
645
+ return { packetType: ptMap[query.scenario] ?? "weekly_reset", daysBack: 7, query: query.query };
554
646
  }
555
647
  case "founder_packet_diff":
556
648
  return {};
@@ -597,7 +689,7 @@ async function callGeminiJudge(query, toolOutputs) {
597
689
  const apiKey = process.env.GEMINI_API_KEY;
598
690
  if (!apiKey) {
599
691
  // Fallback to heuristic judge
600
- return heuristicJudge(query, toolOutputs);
692
+ return { response: heuristicJudge(query, toolOutputs), judgeType: "heuristic" };
601
693
  }
602
694
  const combinedOutput = Object.entries(toolOutputs)
603
695
  .map(([tool, out]) => `[${tool}]:\n${out}`)
@@ -642,22 +734,22 @@ Respond ONLY with valid JSON (no markdown):
642
734
  });
643
735
  if (!response.ok) {
644
736
  console.error(`Gemini API error: ${response.status} ${response.statusText}`);
645
- return heuristicJudge(query, toolOutputs);
737
+ return { response: heuristicJudge(query, toolOutputs), judgeType: "heuristic" };
646
738
  }
647
739
  const json = await response.json();
648
740
  const text = json?.candidates?.[0]?.content?.parts?.[0]?.text;
649
741
  if (!text)
650
- return heuristicJudge(query, toolOutputs);
742
+ return { response: heuristicJudge(query, toolOutputs), judgeType: "heuristic" };
651
743
  const parsed = JSON.parse(text);
652
744
  // Validate structure
653
745
  if (!parsed.criteria || !Array.isArray(parsed.criteria)) {
654
- return heuristicJudge(query, toolOutputs);
746
+ return { response: heuristicJudge(query, toolOutputs), judgeType: "heuristic" };
655
747
  }
656
- return parsed;
748
+ return { response: parsed, judgeType: "gemini" };
657
749
  }
658
750
  catch (err) {
659
751
  console.error(`Gemini judge error: ${err.message}`);
660
- return heuristicJudge(query, toolOutputs);
752
+ return { response: heuristicJudge(query, toolOutputs), judgeType: "heuristic" };
661
753
  }
662
754
  }
663
755
  /** Stopwords excluded from query-keyword matching */
@@ -1151,7 +1243,7 @@ export async function runLlmJudgeEval(options) {
1151
1243
  // Execute tools
1152
1244
  const execution = await executeQueryTools(query, allTools);
1153
1245
  // Judge
1154
- const judgeResult = await callGeminiJudge(query, execution.outputs);
1246
+ const { response: judgeResult, judgeType } = await callGeminiJudge(query, execution.outputs);
1155
1247
  // Compute metrics
1156
1248
  const toolPrecision = computeToolPrecision(query.expectedTools, execution.toolsFired);
1157
1249
  const toolRecall = computeToolRecall(query.expectedTools, execution.toolsFired);
@@ -1173,7 +1265,7 @@ export async function runLlmJudgeEval(options) {
1173
1265
  results.push(qr);
1174
1266
  saveResult(runId, qr);
1175
1267
  const status = overallPass ? "PASS" : "FAIL";
1176
- process.stdout.write(`${progress} ${query.id} ${status} (precision=${toolPrecision.toFixed(2)}, criteria=${criteriaPassRate.toFixed(2)}) ${execution.totalMs}ms\n`);
1268
+ process.stdout.write(`${progress} [judge:${judgeType}] ${query.id} ${status} (precision=${toolPrecision.toFixed(2)}, criteria=${criteriaPassRate.toFixed(2)}) ${execution.totalMs}ms\n`);
1177
1269
  }
1178
1270
  // 6. Build summary
1179
1271
  const fullCorpus = generateQueryCorpus();
@@ -1338,9 +1430,9 @@ export function growCorpus(diagnosis) {
1338
1430
  expectedTools: [...original.expectedTools],
1339
1431
  forbiddenTools: [...original.forbiddenTools],
1340
1432
  booleanCriteria: [
1341
- { criterion: "Tool returned structured data without errors", weight: 2 },
1433
+ { criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 },
1342
1434
  { criterion: "At least one expected tool completed successfully", weight: 2 },
1343
- { criterion: "No error messages or stack traces in output", weight: 2 },
1435
+ { criterion: "Output does not contain error stack traces or crash messages", weight: 1 },
1344
1436
  ],
1345
1437
  });
1346
1438
  }
@@ -1365,9 +1457,9 @@ export function growCorpus(diagnosis) {
1365
1457
  forbiddenTools: [...original.forbiddenTools],
1366
1458
  // Simplified criteria that the heuristic can actually judge
1367
1459
  booleanCriteria: [
1368
- { criterion: "Tool returned structured data without errors", weight: 2 },
1460
+ { criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 },
1369
1461
  { criterion: "At least one expected tool completed successfully", weight: 2 },
1370
- { criterion: "No error messages or stack traces in output", weight: 2 },
1462
+ { criterion: "Output does not contain error stack traces or crash messages", weight: 1 },
1371
1463
  ],
1372
1464
  });
1373
1465
  }
@@ -1491,6 +1583,34 @@ function parseArgs(argv) {
1491
1583
  return options;
1492
1584
  }
1493
1585
  async function main() {
1586
+ // Try loading from .env.local if GEMINI_API_KEY not in environment
1587
+ if (!process.env.GEMINI_API_KEY) {
1588
+ try {
1589
+ const fs = await import("fs");
1590
+ const path = await import("path");
1591
+ // Search multiple locations for .env.local
1592
+ const candidates = [
1593
+ path.resolve(process.cwd(), ".env.local"),
1594
+ path.resolve(process.cwd(), "../../.env.local"),
1595
+ path.resolve(process.cwd(), "../.env.local"),
1596
+ ];
1597
+ for (const envPath of candidates) {
1598
+ if (fs.existsSync(envPath)) {
1599
+ const content = fs.readFileSync(envPath, "utf-8");
1600
+ for (const line of content.split("\n")) {
1601
+ const match = line.match(/^([^#=]+)=(.*)$/);
1602
+ if (match)
1603
+ process.env[match[1].trim()] = match[2].trim();
1604
+ }
1605
+ if (process.env.GEMINI_API_KEY) {
1606
+ console.log(`[env] Loaded GEMINI_API_KEY from ${envPath}`);
1607
+ break;
1608
+ }
1609
+ }
1610
+ }
1611
+ }
1612
+ catch { /* ignore env loading errors */ }
1613
+ }
1494
1614
  const options = parseArgs(process.argv.slice(2));
1495
1615
  console.log("NodeBench LLM Judge Eval Harness");
1496
1616
  console.log("================================");