npm - nodebench-mcp - Versions diffs - 2.59.0 → 2.61.0 - Mend

nodebench-mcp 2.59.0 → 2.61.0

This diff shows the changes between two publicly available versions of a package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (17)

package/dist/benchmarks/llmJudgeEval.js +206 -86
package/dist/benchmarks/llmJudgeEval.js.map +1 -1
package/dist/benchmarks/searchQualityEval.d.ts +26 -21
package/dist/benchmarks/searchQualityEval.js +324 -111
package/dist/benchmarks/searchQualityEval.js.map +1 -1
package/dist/dashboard/operatingDashboardHtml.d.ts +1 -0
package/dist/dashboard/operatingDashboardHtml.js +274 -0
package/dist/dashboard/operatingDashboardHtml.js.map +1 -1
package/dist/dashboard/operatingServer.js +24 -0
package/dist/dashboard/operatingServer.js.map +1 -1
package/dist/tools/founderLocalPipeline.js +9 -4
package/dist/tools/founderLocalPipeline.js.map +1 -1
package/dist/tools/progressiveDiscoveryTools.js +1 -1
package/dist/tools/progressiveDiscoveryTools.js.map +1 -1
package/dist/tools/toolRegistry.js +100 -0
package/dist/tools/toolRegistry.js.map +1 -1
package/package.json +1 -1

package/dist/benchmarks/llmJudgeEval.js CHANGED Viewed

@@ -65,98 +65,98 @@ const SCENARIOS = [
 function founderTemplates() {
     return [
         // weekly_reset
-        { query: "What changed in our product direction this week?", scenario: "weekly_reset", expectedTools: ["founder_deep_context_gather", "get_weekly_summary"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
-        { query: "Give me a weekly reset briefing for the founding team", scenario: "weekly_reset", expectedTools: ["founder_local_weekly_reset", "founder_deep_context_gather"], forbiddenTools: ["check_contract_compliance"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
-        { query: "Summarize last week's key decisions and their rationale", scenario: "weekly_reset", expectedTools: ["founder_deep_context_gather", "get_weekly_summary"], forbiddenTools: ["generate_zero_draft"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
-        { query: "What are the top 3 risks to our current sprint?", scenario: "weekly_reset", expectedTools: ["founder_deep_context_gather", "get_proactive_alerts"], forbiddenTools: ["run_recon", "check_page_performance"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output includes quantitative data points or metrics", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
-        { query: "How is our burn rate tracking against the runway?", scenario: "weekly_reset", expectedTools: ["founder_deep_context_gather"], forbiddenTools: ["run_recon", "check_email_setup"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output includes quantitative data points or metrics", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
-        { query: "What did we ship this week and what slipped?", scenario: "weekly_reset", expectedTools: ["founder_deep_context_gather", "get_weekly_summary"], forbiddenTools: ["generate_report"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
+        { query: "What changed in our product direction this week?", scenario: "weekly_reset", expectedTools: ["founder_deep_context_gather", "get_weekly_summary"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
+        { query: "Give me a weekly reset briefing for the founding team", scenario: "weekly_reset", expectedTools: ["founder_local_weekly_reset", "founder_deep_context_gather"], forbiddenTools: ["check_contract_compliance"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
+        { query: "Summarize last week's key decisions and their rationale", scenario: "weekly_reset", expectedTools: ["founder_deep_context_gather", "get_weekly_summary"], forbiddenTools: ["generate_zero_draft"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
+        { query: "What are the top 3 risks to our current sprint?", scenario: "weekly_reset", expectedTools: ["founder_deep_context_gather", "get_proactive_alerts"], forbiddenTools: ["run_recon", "check_page_performance"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output includes quantitative data points or metrics", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
+        { query: "How is our burn rate tracking against the runway?", scenario: "weekly_reset", expectedTools: ["founder_deep_context_gather"], forbiddenTools: ["run_recon", "check_email_setup"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output includes quantitative data points or metrics", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
+        { query: "What did we ship this week and what slipped?", scenario: "weekly_reset", expectedTools: ["founder_deep_context_gather", "get_weekly_summary"], forbiddenTools: ["generate_report"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
         // company_search
-        { query: "Research Stripe and tell me about their latest product moves", scenario: "company_search", expectedTools: ["run_recon", "enrich_recon"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }, { criterion: "Output includes quantitative data points or metrics", weight: 1 }] },
-        { query: "Pull everything you know about Anthropic's recent funding", scenario: "company_search", expectedTools: ["run_recon", "enrich_recon"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
+        { query: "Research Stripe and tell me about their latest product moves", scenario: "company_search", expectedTools: ["run_recon", "enrich_recon"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }, { criterion: "Output includes quantitative data points or metrics", weight: 1 }] },
+        { query: "Pull everything you know about Anthropic's recent funding", scenario: "company_search", expectedTools: ["run_recon", "enrich_recon"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
         // competitor_brief
-        { query: "Compare our product positioning against Linear and Notion", scenario: "competitor_brief", expectedTools: ["founder_local_synthesize", "run_recon"], forbiddenTools: ["start_dogfood_session"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
-        { query: "What are the moats of our top 3 competitors?", scenario: "competitor_brief", expectedTools: ["founder_local_synthesize", "run_recon"], forbiddenTools: ["check_mcp_setup"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
+        { query: "Compare our product positioning against Linear and Notion", scenario: "competitor_brief", expectedTools: ["founder_local_synthesize", "run_recon"], forbiddenTools: ["start_dogfood_session"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
+        { query: "What are the moats of our top 3 competitors?", scenario: "competitor_brief", expectedTools: ["founder_local_synthesize", "run_recon"], forbiddenTools: ["check_mcp_setup"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
         // delegation
-        { query: "Draft a delegation brief for the engineering lead on the auth refactor", scenario: "delegation", expectedTools: ["founder_deep_context_gather", "export_artifact_packet"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
-        { query: "Create a handoff packet for the new VP of Product", scenario: "delegation", expectedTools: ["founder_deep_context_gather", "export_artifact_packet"], forbiddenTools: ["check_contract_compliance"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
+        { query: "Draft a delegation brief for the engineering lead on the auth refactor", scenario: "delegation", expectedTools: ["founder_deep_context_gather", "export_artifact_packet"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
+        { query: "Create a handoff packet for the new VP of Product", scenario: "delegation", expectedTools: ["founder_deep_context_gather", "export_artifact_packet"], forbiddenTools: ["check_contract_compliance"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
         // important_change
-        { query: "Flag any important changes in our competitive landscape this week", scenario: "important_change", expectedTools: ["founder_local_synthesize", "get_important_changes"], forbiddenTools: ["start_verification_cycle"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
-        { query: "What's the most critical thing I should know about right now?", scenario: "important_change", expectedTools: ["founder_local_synthesize", "get_important_changes"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
+        { query: "Flag any important changes in our competitive landscape this week", scenario: "important_change", expectedTools: ["founder_local_synthesize", "get_important_changes"], forbiddenTools: ["start_verification_cycle"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
+        { query: "What's the most critical thing I should know about right now?", scenario: "important_change", expectedTools: ["founder_local_synthesize", "get_important_changes"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
         // memo_export
-        { query: "Export our latest decision memo as a shareable packet", scenario: "memo_export", expectedTools: ["export_artifact_packet"], forbiddenTools: ["run_recon", "check_page_performance"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
-        { query: "Package the Q1 strategy review for the board", scenario: "memo_export", expectedTools: ["export_artifact_packet", "founder_deep_context_gather"], forbiddenTools: ["start_dogfood_session"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
+        { query: "Export our latest decision memo as a shareable packet", scenario: "memo_export", expectedTools: ["export_artifact_packet"], forbiddenTools: ["run_recon", "check_page_performance"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
+        { query: "Package the Q1 strategy review for the board", scenario: "memo_export", expectedTools: ["export_artifact_packet", "founder_deep_context_gather"], forbiddenTools: ["start_dogfood_session"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
         // packet_diff
-        { query: "What changed between the last two strategy packets?", scenario: "packet_diff", expectedTools: ["founder_packet_diff", "founder_packet_history_diff"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Multiple tools in the chain produced non-empty results", weight: 1 }] },
-        { query: "Show me the delta between our January and March founder packets", scenario: "packet_diff", expectedTools: ["founder_packet_diff", "founder_packet_history_diff"], forbiddenTools: ["check_email_setup"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Multiple tools in the chain produced non-empty results", weight: 1 }] },
+        { query: "What changed between the last two strategy packets?", scenario: "packet_diff", expectedTools: ["founder_packet_diff", "founder_packet_history_diff"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Multiple tools in the chain produced non-empty results", weight: 1 }] },
+        { query: "Show me the delta between our January and March founder packets", scenario: "packet_diff", expectedTools: ["founder_packet_diff", "founder_packet_history_diff"], forbiddenTools: ["check_email_setup"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Multiple tools in the chain produced non-empty results", weight: 1 }] },
         // role_switch
-        { query: "Switch to investor mode and evaluate our pitch deck", scenario: "role_switch", expectedTools: ["founder_local_synthesize"], forbiddenTools: [], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }] },
-        { query: "I need to think like a banker — what's the credit risk here?", scenario: "role_switch", expectedTools: ["founder_local_synthesize"], forbiddenTools: [], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }] },
+        { query: "Switch to investor mode and evaluate our pitch deck", scenario: "role_switch", expectedTools: ["founder_local_synthesize"], forbiddenTools: [], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }] },
+        { query: "I need to think like a banker — what's the credit risk here?", scenario: "role_switch", expectedTools: ["founder_local_synthesize"], forbiddenTools: [], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }] },
     ];
 }
 function bankerTemplates() {
     return [
-        { query: "Run credit analysis on the portfolio company Acme Corp", scenario: "company_search", expectedTools: ["run_recon", "enrich_recon"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }, { criterion: "Output includes quantitative data points or metrics", weight: 1 }] },
-        { query: "What's the debt-to-equity ratio trend for our top borrowers?", scenario: "company_search", expectedTools: ["founder_local_synthesize", "run_recon"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
-        { query: "Prepare a weekly credit committee briefing", scenario: "weekly_reset", expectedTools: ["get_weekly_summary", "get_proactive_alerts"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
-        { query: "Flag any covenant breaches in the current portfolio", scenario: "important_change", expectedTools: ["get_important_changes", "flag_important_change"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
-        { query: "Compare the credit profiles of Company A vs Company B", scenario: "competitor_brief", expectedTools: ["founder_local_synthesize", "run_recon"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
-        { query: "Draft a term sheet summary for the lending committee", scenario: "memo_export", expectedTools: ["export_artifact_packet"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
-        { query: "What's changed in the regulatory landscape this week?", scenario: "weekly_reset", expectedTools: ["get_weekly_summary", "get_important_changes"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
-        { query: "Export the due diligence findings for the Acme Corp loan", scenario: "memo_export", expectedTools: ["export_artifact_packet", "get_recon_summary"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
-        { query: "Show me how the risk ratings shifted since last quarter", scenario: "packet_diff", expectedTools: ["founder_packet_diff"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Multiple tools in the chain produced non-empty results", weight: 1 }] },
-        { query: "Delegate the annual review prep to the junior analyst", scenario: "delegation", expectedTools: ["export_artifact_packet"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
-        { query: "Assess the market risk exposure in our current book", scenario: "company_search", expectedTools: ["founder_local_synthesize", "run_recon"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }, { criterion: "Output includes quantitative data points or metrics", weight: 1 }] },
-        { query: "What are the top 5 watchlist names and why?", scenario: "important_change", expectedTools: ["get_important_changes", "get_proactive_alerts"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
-        { query: "Run a stress test scenario on the commercial real estate portfolio", scenario: "company_search", expectedTools: ["run_recon"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
-        { query: "Switch to researcher mode and find academic papers on credit risk modeling", scenario: "role_switch", expectedTools: ["founder_local_synthesize"], forbiddenTools: [], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }] },
+        { query: "Run credit analysis on the portfolio company Acme Corp", scenario: "company_search", expectedTools: ["run_recon", "enrich_recon"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }, { criterion: "Output includes quantitative data points or metrics", weight: 1 }] },
+        { query: "What's the debt-to-equity ratio trend for our top borrowers?", scenario: "company_search", expectedTools: ["founder_local_synthesize", "run_recon"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
+        { query: "Prepare a weekly credit committee briefing", scenario: "weekly_reset", expectedTools: ["get_weekly_summary", "get_proactive_alerts"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
+        { query: "Flag any covenant breaches in the current portfolio", scenario: "important_change", expectedTools: ["get_important_changes", "flag_important_change"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
+        { query: "Compare the credit profiles of Company A vs Company B", scenario: "competitor_brief", expectedTools: ["founder_local_synthesize", "run_recon"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
+        { query: "Draft a term sheet summary for the lending committee", scenario: "memo_export", expectedTools: ["export_artifact_packet"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
+        { query: "What's changed in the regulatory landscape this week?", scenario: "weekly_reset", expectedTools: ["get_weekly_summary", "get_important_changes"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
+        { query: "Export the due diligence findings for the Acme Corp loan", scenario: "memo_export", expectedTools: ["export_artifact_packet", "get_recon_summary"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
+        { query: "Show me how the risk ratings shifted since last quarter", scenario: "packet_diff", expectedTools: ["founder_packet_diff"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Multiple tools in the chain produced non-empty results", weight: 1 }] },
+        { query: "Delegate the annual review prep to the junior analyst", scenario: "delegation", expectedTools: ["export_artifact_packet"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
+        { query: "Assess the market risk exposure in our current book", scenario: "company_search", expectedTools: ["founder_local_synthesize", "run_recon"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }, { criterion: "Output includes quantitative data points or metrics", weight: 1 }] },
+        { query: "What are the top 5 watchlist names and why?", scenario: "important_change", expectedTools: ["get_important_changes", "get_proactive_alerts"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
+        { query: "Run a stress test scenario on the commercial real estate portfolio", scenario: "company_search", expectedTools: ["run_recon"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
+        { query: "Switch to researcher mode and find academic papers on credit risk modeling", scenario: "role_switch", expectedTools: ["founder_local_synthesize"], forbiddenTools: [], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }] },
     ];
 }
 /**
  * Builds and returns a new array of evaluation query templates (presumably
  * for a CEO persona, judging by the function name — how callers consume the
  * result is not visible in this chunk).
  *
  * Each entry is an object literal with:
  *   - `query`: the natural-language prompt text,
  *   - `scenario`: a scenario label such as "weekly_reset" or "memo_export",
  *   - `expectedTools` / `forbiddenTools`: lists of tool-name strings,
  *   - `booleanCriteria`: a list of `{ criterion, weight }` pairs.
  *
  * NOTE(review): this listing is a version diff (2.59.0 -> 2.61.0). Lines that
  * start with "-" are the entries removed and lines that start with "+" are
  * the entries that replace them. Across the pairs shown here the `query`,
  * `scenario`, `expectedTools` and `forbiddenTools` values are the same on both
  * sides; only `booleanCriteria` differs:
  *   - "Tool returned structured data without errors" (weight 2) is reworded to
  *     "Output contains meaningful structured content (not just errors or
  *     empty results)" (weight 2);
  *   - "No error messages or stack traces in output" (weight 2) is reworded to
  *     "Output does not contain error stack traces or crash messages" and its
  *     weight drops to 1.
  *
  * @returns {Array<object>} a freshly created array of template objects.
  */
 function ceoTemplates() {
     return [
         // Entries removed by this diff (the 2.59.0 side).
-        { query: "Give me the executive summary of where we stand this week", scenario: "weekly_reset", expectedTools: ["get_weekly_summary", "get_proactive_alerts"], forbiddenTools: ["check_page_performance"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
-        { query: "What should I be worried about that nobody's telling me?", scenario: "important_change", expectedTools: ["founder_local_synthesize", "get_important_changes"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
-        { query: "Prepare talking points for the all-hands meeting", scenario: "memo_export", expectedTools: ["export_artifact_packet", "get_weekly_summary"], forbiddenTools: ["check_contract_compliance"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
-        { query: "How are our OKRs tracking this quarter?", scenario: "weekly_reset", expectedTools: ["get_weekly_summary", "founder_deep_context_gather"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output includes quantitative data points or metrics", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
-        { query: "Who on the leadership team needs my attention this week?", scenario: "delegation", expectedTools: ["founder_local_synthesize", "get_important_changes"], forbiddenTools: ["check_mcp_setup"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
-        { query: "Draft a board update email for this month", scenario: "memo_export", expectedTools: ["export_artifact_packet"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
-        { query: "What's our competitive position changed to since last month?", scenario: "competitor_brief", expectedTools: ["founder_local_synthesize", "run_recon"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
-        { query: "Compare the last two quarterly reviews for drift", scenario: "packet_diff", expectedTools: ["founder_packet_diff"], forbiddenTools: ["check_email_setup"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Multiple tools in the chain produced non-empty results", weight: 1 }] },
-        { query: "I need to delegate the hiring pipeline review — create a brief", scenario: "delegation", expectedTools: ["export_artifact_packet"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
-        { query: "Switch to founder mode and deep-dive into the product roadmap", scenario: "role_switch", expectedTools: ["discover_tools"], forbiddenTools: ["check_contract_compliance"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }] },
-        { query: "Flag the most important thing that changed since yesterday", scenario: "important_change", expectedTools: ["get_important_changes", "get_proactive_alerts"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
-        { query: "Research what our key enterprise customers are saying publicly", scenario: "company_search", expectedTools: ["run_recon", "enrich_recon"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }, { criterion: "Output includes quantitative data points or metrics", weight: 1 }] },
         // Replacement entries added by this diff (the 2.61.0 side), in the same order.
+        { query: "Give me the executive summary of where we stand this week", scenario: "weekly_reset", expectedTools: ["get_weekly_summary", "get_proactive_alerts"], forbiddenTools: ["check_page_performance"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
+        { query: "What should I be worried about that nobody's telling me?", scenario: "important_change", expectedTools: ["founder_local_synthesize", "get_important_changes"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
+        { query: "Prepare talking points for the all-hands meeting", scenario: "memo_export", expectedTools: ["export_artifact_packet", "get_weekly_summary"], forbiddenTools: ["check_contract_compliance"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
+        { query: "How are our OKRs tracking this quarter?", scenario: "weekly_reset", expectedTools: ["get_weekly_summary", "founder_deep_context_gather"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output includes quantitative data points or metrics", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
+        { query: "Who on the leadership team needs my attention this week?", scenario: "delegation", expectedTools: ["founder_local_synthesize", "get_important_changes"], forbiddenTools: ["check_mcp_setup"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
+        { query: "Draft a board update email for this month", scenario: "memo_export", expectedTools: ["export_artifact_packet"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
+        { query: "What's our competitive position changed to since last month?", scenario: "competitor_brief", expectedTools: ["founder_local_synthesize", "run_recon"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
+        { query: "Compare the last two quarterly reviews for drift", scenario: "packet_diff", expectedTools: ["founder_packet_diff"], forbiddenTools: ["check_email_setup"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Multiple tools in the chain produced non-empty results", weight: 1 }] },
+        { query: "I need to delegate the hiring pipeline review — create a brief", scenario: "delegation", expectedTools: ["export_artifact_packet"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
+        { query: "Switch to founder mode and deep-dive into the product roadmap", scenario: "role_switch", expectedTools: ["discover_tools"], forbiddenTools: ["check_contract_compliance"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }] },
+        { query: "Flag the most important thing that changed since yesterday", scenario: "important_change", expectedTools: ["get_important_changes", "get_proactive_alerts"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
+        { query: "Research what our key enterprise customers are saying publicly", scenario: "company_search", expectedTools: ["run_recon", "enrich_recon"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }, { criterion: "Output includes quantitative data points or metrics", weight: 1 }] },
     ];
 }
 function researcherTemplates() {
     return [
-        { query: "Find recent papers on transformer attention mechanisms", scenario: "company_search", expectedTools: ["run_recon", "build_research_digest"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }, { criterion: "Output includes quantitative data points or metrics", weight: 1 }] },
-        { query: "Build a research digest on federated learning advances in 2025", scenario: "company_search", expectedTools: ["build_research_digest", "run_recon"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
-        { query: "What are the open problems in RLHF that nobody's solved?", scenario: "competitor_brief", expectedTools: ["run_recon", "build_research_digest"], forbiddenTools: ["export_artifact_packet"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
-        { query: "Summarize the key findings from this week's arXiv papers on LLM reasoning", scenario: "weekly_reset", expectedTools: ["get_weekly_summary", "build_research_digest"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
-        { query: "Compare the methodology of these two papers on knowledge distillation", scenario: "competitor_brief", expectedTools: ["compare_options", "build_research_digest"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
-        { query: "Export my literature review notes as a shareable document", scenario: "memo_export", expectedTools: ["export_artifact_packet"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
-        { query: "What contradictions exist in the current MoE literature?", scenario: "important_change", expectedTools: ["build_research_digest", "get_important_changes"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
-        { query: "Track how the consensus on scaling laws has shifted this year", scenario: "packet_diff", expectedTools: ["founder_packet_diff", "build_research_digest"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Multiple tools in the chain produced non-empty results", weight: 1 }] },
-        { query: "Delegate the data collection task to the research assistant", scenario: "delegation", expectedTools: ["export_artifact_packet"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
-        { query: "Switch to operator mode and check if the experiment pipeline is healthy", scenario: "role_switch", expectedTools: ["discover_tools"], forbiddenTools: ["build_research_digest"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }] },
-        { query: "What are the most-cited papers in agentic AI from 2025?", scenario: "company_search", expectedTools: ["run_recon", "build_research_digest"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }, { criterion: "Output includes quantitative data points or metrics", weight: 1 }] },
-        { query: "Generate a research question from the gaps in current RAG literature", scenario: "important_change", expectedTools: ["build_research_digest"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
+        { query: "Find recent papers on transformer attention mechanisms", scenario: "company_search", expectedTools: ["run_recon", "build_research_digest"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }, { criterion: "Output includes quantitative data points or metrics", weight: 1 }] },
+        { query: "Build a research digest on federated learning advances in 2025", scenario: "company_search", expectedTools: ["build_research_digest", "run_recon"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
+        { query: "What are the open problems in RLHF that nobody's solved?", scenario: "competitor_brief", expectedTools: ["run_recon", "build_research_digest"], forbiddenTools: ["export_artifact_packet"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
+        { query: "Summarize the key findings from this week's arXiv papers on LLM reasoning", scenario: "weekly_reset", expectedTools: ["get_weekly_summary", "build_research_digest"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
+        { query: "Compare the methodology of these two papers on knowledge distillation", scenario: "competitor_brief", expectedTools: ["compare_options", "build_research_digest"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
+        { query: "Export my literature review notes as a shareable document", scenario: "memo_export", expectedTools: ["export_artifact_packet"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
+        { query: "What contradictions exist in the current MoE literature?", scenario: "important_change", expectedTools: ["build_research_digest", "get_important_changes"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
+        { query: "Track how the consensus on scaling laws has shifted this year", scenario: "packet_diff", expectedTools: ["founder_packet_diff", "build_research_digest"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Multiple tools in the chain produced non-empty results", weight: 1 }] },
+        { query: "Delegate the data collection task to the research assistant", scenario: "delegation", expectedTools: ["export_artifact_packet"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
+        { query: "Switch to operator mode and check if the experiment pipeline is healthy", scenario: "role_switch", expectedTools: ["discover_tools"], forbiddenTools: ["build_research_digest"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }] },
+        { query: "What are the most-cited papers in agentic AI from 2025?", scenario: "company_search", expectedTools: ["run_recon", "build_research_digest"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }, { criterion: "Output includes quantitative data points or metrics", weight: 1 }] },
+        { query: "Generate a research question from the gaps in current RAG literature", scenario: "important_change", expectedTools: ["build_research_digest"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
     ];
 }
 function studentTemplates() {
     return [
-        { query: "Help me understand how transformers work at a high level", scenario: "company_search", expectedTools: ["discover_tools"], forbiddenTools: ["founder_deep_context_gather", "export_artifact_packet"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
-        { query: "What should I study this week for my ML course?", scenario: "weekly_reset", expectedTools: ["get_weekly_summary", "discover_tools"], forbiddenTools: ["run_recon", "founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
-        { query: "Compare supervised vs unsupervised learning for my report", scenario: "competitor_brief", expectedTools: ["compare_options", "discover_tools"], forbiddenTools: ["run_recon", "founder_deep_context_gather"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
-        { query: "Export my study notes as a markdown document", scenario: "memo_export", expectedTools: ["export_artifact_packet"], forbiddenTools: ["run_recon", "founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
-        { query: "What changed in the AI landscape this week that I should know about?", scenario: "important_change", expectedTools: ["get_important_changes", "get_weekly_summary"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
-        { query: "I need to switch to a research perspective for my thesis", scenario: "role_switch", expectedTools: ["founder_local_synthesize"], forbiddenTools: [], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }] },
-        { query: "Find beginner-friendly resources on neural network architectures", scenario: "company_search", expectedTools: ["discover_tools", "run_recon"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
-        { query: "Summarize the differences between GPT-4 and Claude for my presentation", scenario: "competitor_brief", expectedTools: ["compare_options", "run_recon"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
-        { query: "Help me find a dataset for my NLP project on sentiment analysis", scenario: "company_search", expectedTools: ["discover_tools", "run_recon"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }, { criterion: "Output includes quantitative data points or metrics", weight: 1 }] },
-        { query: "Create a study timeline for the next 4 weeks on deep learning", scenario: "delegation", expectedTools: ["export_artifact_packet"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
-        { query: "What did I learn last week and what should I review?", scenario: "weekly_reset", expectedTools: ["get_weekly_summary"], forbiddenTools: ["run_recon", "founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Tool returned structured data without errors", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "No error messages or stack traces in output", weight: 2 }] },
+        { query: "Help me understand how transformers work at a high level", scenario: "company_search", expectedTools: ["discover_tools"], forbiddenTools: ["founder_deep_context_gather", "export_artifact_packet"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
+        { query: "What should I study this week for my ML course?", scenario: "weekly_reset", expectedTools: ["get_weekly_summary", "discover_tools"], forbiddenTools: ["run_recon", "founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
+        { query: "Compare supervised vs unsupervised learning for my report", scenario: "competitor_brief", expectedTools: ["compare_options", "discover_tools"], forbiddenTools: ["run_recon", "founder_deep_context_gather"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
+        { query: "Export my study notes as a markdown document", scenario: "memo_export", expectedTools: ["export_artifact_packet"], forbiddenTools: ["run_recon", "founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
+        { query: "What changed in the AI landscape this week that I should know about?", scenario: "important_change", expectedTools: ["get_important_changes", "get_weekly_summary"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
+        { query: "I need to switch to a research perspective for my thesis", scenario: "role_switch", expectedTools: ["founder_local_synthesize"], forbiddenTools: [], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }] },
+        { query: "Find beginner-friendly resources on neural network architectures", scenario: "company_search", expectedTools: ["discover_tools", "run_recon"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
+        { query: "Summarize the differences between GPT-4 and Claude for my presentation", scenario: "competitor_brief", expectedTools: ["compare_options", "run_recon"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
+        { query: "Help me find a dataset for my NLP project on sentiment analysis", scenario: "company_search", expectedTools: ["discover_tools", "run_recon"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }, { criterion: "Output includes quantitative data points or metrics", weight: 1 }] },
+        { query: "Create a study timeline for the next 4 weeks on deep learning", scenario: "delegation", expectedTools: ["export_artifact_packet"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
+        { query: "What did I learn last week and what should I review?", scenario: "weekly_reset", expectedTools: ["get_weekly_summary"], forbiddenTools: ["run_recon", "founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
     ];
 }
 function operatorTemplates() {
@@ -473,9 +473,25 @@ function extractText(result) {
     }
     return String(result);
 }
+/** Tools that require Convex/gateway and should be skipped, not failed */
+const GATEWAY_DEPENDENT_TOOLS = new Set([
+    "founder_packet_validate",
+]);
+/** Error patterns that indicate missing seed data (retryable) */
+const SEED_NEEDED_PATTERNS = [
+    "session not found",
+    "no packets",
+    "no session",
+    "not found",
+    "no rows",
+    "no data",
+    "empty result",
+    "does not exist",
+];
 async function executeQueryTools(query, allTools) {
     const toolsFired = [];
     const outputs = {};
+    const skipped = [];
     let totalMs = 0;
     // 1. Try discover_tools to find relevant tools
     const discoverTool = findTool(allTools, "discover_tools");
@@ -498,10 +514,58 @@ async function executeQueryTools(query, allTools) {
             effectiveTools.push("founder_local_synthesize");
         }
     }
+    // 2b. Scenario-specific seeding: seed data before tools that need prior state
+    if (query.scenario === "competitor_brief") {
+        // Seed a recon session so subsequent tools have context
+        const reconTool = findTool(allTools, "run_recon");
+        if (reconTool && !effectiveTools.includes("run_recon")) {
+            const seedResult = await callTool(reconTool, { target: "Supermemory", scope: "competitive analysis" });
+            totalMs += seedResult.ms;
+            if (seedResult.ok) {
+                toolsFired.push("run_recon");
+                outputs["run_recon"] = extractText(seedResult.result);
+            }
+        }
+    }
+    if (query.scenario === "packet_diff") {
+        // Seed a founder packet so diff tools have something to compare
+        const gatherTool = findTool(allTools, "founder_deep_context_gather");
+        if (gatherTool && !effectiveTools.includes("founder_deep_context_gather")) {
+            const seedResult = await callTool(gatherTool, { query: "seed context for diff" });
+            totalMs += seedResult.ms;
+            if (seedResult.ok) {
+                toolsFired.push("founder_deep_context_gather");
+                outputs["founder_deep_context_gather"] = extractText(seedResult.result);
+            }
+        }
+    }
+    if (query.scenario === "delegation") {
+        // For delegation, replace founder_packet_validate (gateway-dependent) with
+        // founder_deep_context_gather + render_decision_memo as the core chain
+        const validIdx = effectiveTools.indexOf("founder_packet_validate");
+        if (validIdx !== -1) {
+            effectiveTools.splice(validIdx, 1);
+            skipped.push("founder_packet_validate");
+            // Ensure we have the core delegation chain
+            if (!effectiveTools.includes("founder_deep_context_gather")) {
+                effectiveTools.push("founder_deep_context_gather");
+            }
+            if (!effectiveTools.includes("render_decision_memo")) {
+                const memoTool = findTool(allTools, "render_decision_memo");
+                if (memoTool)
+                    effectiveTools.push("render_decision_memo");
+            }
+        }
+    }
     // 3. Execute each expected tool (simulate the tool chain an agent would follow)
     for (const toolName of effectiveTools) {
         if (toolName === "discover_tools")
             continue; // already called
+        // Skip gateway-dependent tools
+        if (GATEWAY_DEPENDENT_TOOLS.has(toolName)) {
+            skipped.push(toolName);
+            continue;
+        }
         const tool = findTool(allTools, toolName);
         if (tool) {
             // Build minimal args based on tool name patterns
@@ -513,13 +577,41 @@ async function executeQueryTools(query, allTools) {
                 outputs[toolName] = extractText(result.result);
             }
             else {
-                // Tool fired but errored — still counts as fired
-                toolsFired.push(toolName);
-                outputs[toolName] = `ERROR: ${result.error}`;
+                // Check if this is a "needs seed data" error — retry once after seeding
+                const errorLower = (result.error ?? "").toLowerCase();
+                const needsSeed = SEED_NEEDED_PATTERNS.some((p) => errorLower.includes(p));
+                if (needsSeed) {
+                    // Attempt to seed context and retry
+                    const gatherTool = findTool(allTools, "founder_deep_context_gather");
+                    if (gatherTool) {
+                        const seedResult = await callTool(gatherTool, { query: query.query });
+                        totalMs += seedResult.ms;
+                        if (seedResult.ok && !toolsFired.includes("founder_deep_context_gather")) {
+                            toolsFired.push("founder_deep_context_gather");
+                            outputs["founder_deep_context_gather"] = extractText(seedResult.result);
+                        }
+                    }
+                    // Retry the original tool
+                    const retry = await callTool(tool, args);
+                    totalMs += retry.ms;
+                    if (retry.ok) {
+                        toolsFired.push(toolName);
+                        outputs[toolName] = extractText(retry.result);
+                    }
+                    else {
+                        toolsFired.push(toolName);
+                        outputs[toolName] = `ERROR: ${retry.error}`;
+                    }
+                }
+                else {
+                    // Tool fired but errored — still counts as fired
+                    toolsFired.push(toolName);
+                    outputs[toolName] = `ERROR: ${result.error}`;
+                }
             }
         }
     }
-    return { toolsFired, outputs, totalMs };
+    return { toolsFired, outputs, totalMs, skipped };
 }
 /** Build minimal arguments for a tool call based on the query context */
 function buildMinimalArgs(toolName, query) {
@@ -550,7 +642,7 @@ function buildMinimalArgs(toolName, query) {
                 memo_export: "weekly_reset",
                 packet_diff: "weekly_reset",
             };
-            return { packetType: ptMap[query.scenario] ?? "weekly_reset", daysBack: 7 };
+            return { packetType: ptMap[query.scenario] ?? "weekly_reset", daysBack: 7, query: query.query };
         }
         case "founder_packet_diff":
             return {};
@@ -597,7 +689,7 @@ async function callGeminiJudge(query, toolOutputs) {
     const apiKey = process.env.GEMINI_API_KEY;
     if (!apiKey) {
         // Fallback to heuristic judge
-        return heuristicJudge(query, toolOutputs);
+        return { response: heuristicJudge(query, toolOutputs), judgeType: "heuristic" };
     }
     const combinedOutput = Object.entries(toolOutputs)
         .map(([tool, out]) => `[${tool}]:\n${out}`)
@@ -642,22 +734,22 @@ Respond ONLY with valid JSON (no markdown):
         });
         if (!response.ok) {
             console.error(`Gemini API error: ${response.status} ${response.statusText}`);
-            return heuristicJudge(query, toolOutputs);
+            return { response: heuristicJudge(query, toolOutputs), judgeType: "heuristic" };
         }
         const json = await response.json();
         const text = json?.candidates?.[0]?.content?.parts?.[0]?.text;
         if (!text)
-            return heuristicJudge(query, toolOutputs);
+            return { response: heuristicJudge(query, toolOutputs), judgeType: "heuristic" };
         const parsed = JSON.parse(text);
         // Validate structure
         if (!parsed.criteria || !Array.isArray(parsed.criteria)) {
-            return heuristicJudge(query, toolOutputs);
+            return { response: heuristicJudge(query, toolOutputs), judgeType: "heuristic" };
         }
-        return parsed;
+        return { response: parsed, judgeType: "gemini" };
     }
     catch (err) {
         console.error(`Gemini judge error: ${err.message}`);
-        return heuristicJudge(query, toolOutputs);
+        return { response: heuristicJudge(query, toolOutputs), judgeType: "heuristic" };
     }
 }
 /** Stopwords excluded from query-keyword matching */
@@ -1151,7 +1243,7 @@ export async function runLlmJudgeEval(options) {
         // Execute tools
         const execution = await executeQueryTools(query, allTools);
         // Judge
-        const judgeResult = await callGeminiJudge(query, execution.outputs);
+        const { response: judgeResult, judgeType } = await callGeminiJudge(query, execution.outputs);
         // Compute metrics
         const toolPrecision = computeToolPrecision(query.expectedTools, execution.toolsFired);
         const toolRecall = computeToolRecall(query.expectedTools, execution.toolsFired);
@@ -1173,7 +1265,7 @@ export async function runLlmJudgeEval(options) {
         results.push(qr);
         saveResult(runId, qr);
         const status = overallPass ? "PASS" : "FAIL";
-        process.stdout.write(`${progress} ${query.id} ${status} (precision=${toolPrecision.toFixed(2)}, criteria=${criteriaPassRate.toFixed(2)}) ${execution.totalMs}ms\n`);
+        process.stdout.write(`${progress} [judge:${judgeType}] ${query.id} ${status} (precision=${toolPrecision.toFixed(2)}, criteria=${criteriaPassRate.toFixed(2)}) ${execution.totalMs}ms\n`);
     }
     // 6. Build summary
     const fullCorpus = generateQueryCorpus();
@@ -1338,9 +1430,9 @@ export function growCorpus(diagnosis) {
                 expectedTools: [...original.expectedTools],
                 forbiddenTools: [...original.forbiddenTools],
                 booleanCriteria: [
-                    { criterion: "Tool returned structured data without errors", weight: 2 },
+                    { criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 },
                     { criterion: "At least one expected tool completed successfully", weight: 2 },
-                    { criterion: "No error messages or stack traces in output", weight: 2 },
+                    { criterion: "Output does not contain error stack traces or crash messages", weight: 1 },
                 ],
             });
         }
@@ -1365,9 +1457,9 @@ export function growCorpus(diagnosis) {
             forbiddenTools: [...original.forbiddenTools],
             // Simplified criteria that the heuristic can actually judge
             booleanCriteria: [
-                { criterion: "Tool returned structured data without errors", weight: 2 },
+                { criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 },
                 { criterion: "At least one expected tool completed successfully", weight: 2 },
-                { criterion: "No error messages or stack traces in output", weight: 2 },
+                { criterion: "Output does not contain error stack traces or crash messages", weight: 1 },
             ],
         });
     }
@@ -1491,6 +1583,34 @@ function parseArgs(argv) {
     return options;
 }
 async function main() {
+    // Try loading from .env.local if GEMINI_API_KEY not in environment
+    if (!process.env.GEMINI_API_KEY) {
+        try {
+            const fs = await import("fs");
+            const path = await import("path");
+            // Search multiple locations for .env.local
+            const candidates = [
+                path.resolve(process.cwd(), ".env.local"),
+                path.resolve(process.cwd(), "../../.env.local"),
+                path.resolve(process.cwd(), "../.env.local"),
+            ];
+            for (const envPath of candidates) {
+                if (fs.existsSync(envPath)) {
+                    const content = fs.readFileSync(envPath, "utf-8");
+                    for (const line of content.split("\n")) {
+                        const match = line.match(/^([^#=]+)=(.*)$/);
+                        if (match)
+                            process.env[match[1].trim()] = match[2].trim();
+                    }
+                    if (process.env.GEMINI_API_KEY) {
+                        console.log(`[env] Loaded GEMINI_API_KEY from ${envPath}`);
+                        break;
+                    }
+                }
+            }
+        }
+        catch { /* ignore env loading errors */ }
+    }
     const options = parseArgs(process.argv.slice(2));
     console.log("NodeBench LLM Judge Eval Harness");
     console.log("================================");