npm - nodebench-mcp - Versions diffs - 2.62.0 → 2.63.0 - Mend

nodebench-mcp 2.62.0 → 2.63.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

package/dist/benchmarks/llmJudgeEval.js +89 -37
package/dist/benchmarks/llmJudgeEval.js.map +1 -1
package/dist/benchmarks/searchQualityEval.js +1 -1
package/dist/tools/contentSynthesis.d.ts +42 -0
package/dist/tools/contentSynthesis.js +185 -0
package/dist/tools/contentSynthesis.js.map +1 -0
package/dist/tools/founderLocalPipeline.js +40 -11
package/dist/tools/founderLocalPipeline.js.map +1 -1
package/dist/tools/founderTools.js +568 -0
package/dist/tools/founderTools.js.map +1 -1
package/dist/tools/llmJudgeLoop.js +5 -5
package/dist/tools/llmJudgeLoop.js.map +1 -1
package/package.json +1 -1

package/dist/benchmarks/llmJudgeEval.js CHANGED Viewed

@@ -275,11 +275,11 @@ function founderTemplates() {
     return [
         // weekly_reset
         { query: "What changed in our product direction this week?", scenario: "weekly_reset", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
-        { query: "Give me a weekly reset briefing for the founding team", scenario: "weekly_reset", expectedTools: ["founder_local_weekly_reset", "founder_deep_context_gather"], forbiddenTools: ["check_contract_compliance"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
-        { query: "Summarize last week's key decisions and their rationale", scenario: "weekly_reset", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["generate_zero_draft"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
-        { query: "What are the top 3 risks to our current sprint?", scenario: "weekly_reset", expectedTools: ["founder_deep_context_gather", "get_proactive_alerts"], forbiddenTools: ["run_recon", "check_page_performance"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output includes quantitative data points or metrics", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
-        { query: "How is our burn rate tracking against the runway?", scenario: "weekly_reset", expectedTools: ["founder_deep_context_gather"], forbiddenTools: ["run_recon", "check_email_setup"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output includes quantitative data points or metrics", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
-        { query: "What did we ship this week and what slipped?", scenario: "weekly_reset", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["generate_report"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
+        { query: "Give me a weekly reset briefing for the founding team", scenario: "weekly_reset", expectedTools: ["founder_local_weekly_reset", "founder_deep_context_gather"], forbiddenTools: ["founder_packet_validate"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
+        { query: "Summarize last week's key decisions and their rationale", scenario: "weekly_reset", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["render_decision_memo"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
+        { query: "What are the top 3 risks to our current sprint?", scenario: "weekly_reset", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output includes quantitative data points or metrics", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
+        { query: "How is our burn rate tracking against the runway?", scenario: "weekly_reset", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["run_recon", "check_mcp_setup"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output includes quantitative data points or metrics", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
+        { query: "What did we ship this week and what slipped?", scenario: "weekly_reset", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["export_artifact_packet"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
         // company_search
         { query: "Research Stripe and tell me about their latest product moves", scenario: "company_search", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }, { criterion: "Output includes quantitative data points or metrics", weight: 1 }] },
         { query: "Pull everything you know about Anthropic's recent funding", scenario: "company_search", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
@@ -288,16 +288,16 @@ function founderTemplates() {
         { query: "What are the moats of our top 3 competitors?", scenario: "competitor_brief", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["check_mcp_setup"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
         // delegation — route to founder_local_synthesize with pre_delegation packetType
         { query: "Draft a delegation brief for the engineering lead on the auth refactor", scenario: "delegation", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Tool returned valid structured JSON or object data", weight: 1 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
-        { query: "Create a handoff packet for the new VP of Product", scenario: "delegation", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["check_contract_compliance"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Tool returned valid structured JSON or object data", weight: 1 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
+        { query: "Create a handoff packet for the new VP of Product", scenario: "delegation", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["founder_packet_validate"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Tool returned valid structured JSON or object data", weight: 1 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
         // important_change
-        { query: "Flag any important changes in our competitive landscape this week", scenario: "important_change", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["start_verification_cycle"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
+        { query: "Flag any important changes in our competitive landscape this week", scenario: "important_change", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["start_dogfood_session"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
         { query: "What's the most critical thing I should know about right now?", scenario: "important_change", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
         // memo_export — route to founder_local_weekly_reset which produces a full memo
-        { query: "Export our latest decision memo as a shareable packet", scenario: "memo_export", expectedTools: ["founder_local_weekly_reset"], forbiddenTools: ["run_recon", "check_page_performance"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Tool returned valid structured JSON or object data", weight: 1 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
+        { query: "Export our latest decision memo as a shareable packet", scenario: "memo_export", expectedTools: ["founder_local_weekly_reset"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Tool returned valid structured JSON or object data", weight: 1 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
         { query: "Package the Q1 strategy review for the board", scenario: "memo_export", expectedTools: ["founder_local_weekly_reset"], forbiddenTools: ["start_dogfood_session"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Tool returned valid structured JSON or object data", weight: 1 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
         // packet_diff — route to founder_local_synthesize with important_change (shows what changed)
         { query: "What changed between the last two strategy packets?", scenario: "packet_diff", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Tool returned valid structured JSON or object data", weight: 1 }] },
-        { query: "Show me the delta between our January and March founder packets", scenario: "packet_diff", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["check_email_setup"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Tool returned valid structured JSON or object data", weight: 1 }] },
+        { query: "Show me the delta between our January and March founder packets", scenario: "packet_diff", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["check_mcp_setup"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Tool returned valid structured JSON or object data", weight: 1 }] },
         // role_switch
         { query: "Switch to investor mode and evaluate our pitch deck", scenario: "role_switch", expectedTools: ["founder_local_synthesize"], forbiddenTools: [], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }] },
         { query: "I need to think like a banker — what's the credit risk here?", scenario: "role_switch", expectedTools: ["founder_local_synthesize"], forbiddenTools: [], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }] },
@@ -312,7 +312,7 @@ function bankerTemplates() {
         { query: "Compare the credit profiles of Company A vs Company B", scenario: "competitor_brief", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
         { query: "Draft a term sheet summary for the lending committee", scenario: "memo_export", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
         { query: "What's changed in the regulatory landscape this week?", scenario: "weekly_reset", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
-        { query: "Export the due diligence findings for the Acme Corp loan", scenario: "memo_export", expectedTools: ["founder_local_synthesize", "get_recon_summary"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
+        { query: "Export the due diligence findings for the Acme Corp loan", scenario: "memo_export", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
         { query: "Show me how the risk ratings shifted since last quarter", scenario: "packet_diff", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Multiple tools in the chain produced non-empty results", weight: 1 }] },
         { query: "Delegate the annual review prep to the junior analyst", scenario: "delegation", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
         { query: "Assess the market risk exposure in our current book", scenario: "company_search", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }, { criterion: "Output includes quantitative data points or metrics", weight: 1 }] },
@@ -323,16 +323,16 @@ function bankerTemplates() {
 }
 function ceoTemplates() {
     return [
-        { query: "Give me the executive summary of where we stand this week", scenario: "weekly_reset", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["check_page_performance"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
+        { query: "Give me the executive summary of where we stand this week", scenario: "weekly_reset", expectedTools: ["founder_local_synthesize"], forbiddenTools: [], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
         { query: "What should I be worried about that nobody's telling me?", scenario: "important_change", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
-        { query: "Prepare talking points for the all-hands meeting", scenario: "memo_export", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["check_contract_compliance"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
+        { query: "Prepare talking points for the all-hands meeting", scenario: "memo_export", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["founder_packet_validate"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
         { query: "How are our OKRs tracking this quarter?", scenario: "weekly_reset", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output includes quantitative data points or metrics", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
         { query: "Who on the leadership team needs my attention this week?", scenario: "delegation", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["check_mcp_setup"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
         { query: "Draft a board update email for this month", scenario: "memo_export", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
         { query: "What's our competitive position changed to since last month?", scenario: "competitor_brief", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
-        { query: "Compare the last two quarterly reviews for drift", scenario: "packet_diff", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["check_email_setup"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Multiple tools in the chain produced non-empty results", weight: 1 }] },
+        { query: "Compare the last two quarterly reviews for drift", scenario: "packet_diff", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["check_mcp_setup"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Multiple tools in the chain produced non-empty results", weight: 1 }] },
         { query: "I need to delegate the hiring pipeline review — create a brief", scenario: "delegation", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
-        { query: "Switch to founder mode and deep-dive into the product roadmap", scenario: "role_switch", expectedTools: ["discover_tools"], forbiddenTools: ["check_contract_compliance"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }] },
+        { query: "Switch to founder mode and deep-dive into the product roadmap", scenario: "role_switch", expectedTools: ["discover_tools"], forbiddenTools: ["founder_packet_validate"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }] },
         { query: "Flag the most important thing that changed since yesterday", scenario: "important_change", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
         { query: "Research what our key enterprise customers are saying publicly", scenario: "company_search", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }, { criterion: "Output includes quantitative data points or metrics", weight: 1 }] },
     ];
@@ -340,24 +340,24 @@ function ceoTemplates() {
 function researcherTemplates() {
     return [
         { query: "Find recent papers on transformer attention mechanisms", scenario: "company_search", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }, { criterion: "Output includes quantitative data points or metrics", weight: 1 }] },
-        { query: "Build a research digest on federated learning advances in 2025", scenario: "company_search", expectedTools: ["build_research_digest", "run_recon"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
+        { query: "Build a research digest on federated learning advances in 2025", scenario: "company_search", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
         { query: "What are the open problems in RLHF that nobody's solved?", scenario: "competitor_brief", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["founder_local_synthesize"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
         { query: "Summarize the key findings from this week's arXiv papers on LLM reasoning", scenario: "weekly_reset", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
-        { query: "Compare the methodology of these two papers on knowledge distillation", scenario: "competitor_brief", expectedTools: ["compare_options", "build_research_digest"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
+        { query: "Compare the methodology of these two papers on knowledge distillation", scenario: "competitor_brief", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
         { query: "Export my literature review notes as a shareable document", scenario: "memo_export", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
-        { query: "What contradictions exist in the current MoE literature?", scenario: "important_change", expectedTools: ["build_research_digest", "get_important_changes"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
-        { query: "Track how the consensus on scaling laws has shifted this year", scenario: "packet_diff", expectedTools: ["founder_local_synthesize", "build_research_digest"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Multiple tools in the chain produced non-empty results", weight: 1 }] },
+        { query: "What contradictions exist in the current MoE literature?", scenario: "important_change", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
+        { query: "Track how the consensus on scaling laws has shifted this year", scenario: "packet_diff", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Multiple tools in the chain produced non-empty results", weight: 1 }] },
         { query: "Delegate the data collection task to the research assistant", scenario: "delegation", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
-        { query: "Switch to operator mode and check if the experiment pipeline is healthy", scenario: "role_switch", expectedTools: ["discover_tools"], forbiddenTools: ["build_research_digest"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }] },
+        { query: "Switch to operator mode and check if the experiment pipeline is healthy", scenario: "role_switch", expectedTools: ["discover_tools"], forbiddenTools: ["search_all_knowledge"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }] },
         { query: "What are the most-cited papers in agentic AI from 2025?", scenario: "company_search", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }, { criterion: "Output includes quantitative data points or metrics", weight: 1 }] },
-        { query: "Generate a research question from the gaps in current RAG literature", scenario: "important_change", expectedTools: ["build_research_digest"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
+        { query: "Generate a research question from the gaps in current RAG literature", scenario: "important_change", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
     ];
 }
 function studentTemplates() {
     return [
         { query: "Help me understand how transformers work at a high level", scenario: "company_search", expectedTools: ["discover_tools"], forbiddenTools: ["founder_deep_context_gather", "founder_local_synthesize"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
         { query: "What should I study this week for my ML course?", scenario: "weekly_reset", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["run_recon", "founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
-        { query: "Compare supervised vs unsupervised learning for my report", scenario: "competitor_brief", expectedTools: ["compare_options", "discover_tools"], forbiddenTools: ["run_recon", "founder_deep_context_gather"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
+        { query: "Compare supervised vs unsupervised learning for my report", scenario: "competitor_brief", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["run_recon", "founder_deep_context_gather"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
         { query: "Export my study notes as a markdown document", scenario: "memo_export", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["run_recon", "founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output structure matches the tool's documented schema", weight: 1 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
         { query: "What changed in the AI landscape this week that I should know about?", scenario: "important_change", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains temporal information (dates, timestamps, periods)", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }, { criterion: "Output does not contain error stack traces or crash messages", weight: 1 }] },
         { query: "I need to switch to a research perspective for my thesis", scenario: "role_switch", expectedTools: ["founder_local_synthesize"], forbiddenTools: [], booleanCriteria: [{ criterion: "Output contains meaningful structured content (not just errors or empty results)", weight: 2 }, { criterion: "Output contains entity or topic names from the query", weight: 1 }, { criterion: "At least one expected tool completed successfully", weight: 2 }] },
@@ -372,10 +372,10 @@ function operatorTemplates() {
     return [
         { query: "Show me the system health dashboard for today", scenario: "weekly_reset", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output presents system health metrics", weight: 1 }, { criterion: "Output highlights any degraded services", weight: 1 }, { criterion: "Output is operational in tone", weight: 1 }] },
         { query: "What incidents happened this week and are they resolved?", scenario: "weekly_reset", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Output lists incidents", weight: 1 }, { criterion: "Output includes resolution status", weight: 1 }, { criterion: "Output identifies root causes", weight: 1 }] },
-        { query: "Run a health check on all MCP infrastructure", scenario: "company_search", expectedTools: ["check_mcp_setup", "get_ops_dashboard"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output checks multiple infrastructure components", weight: 1 }, { criterion: "Output reports pass/fail per component", weight: 1 }, { criterion: "Output suggests fixes for failures", weight: 1 }] },
+        { query: "Run a health check on all MCP infrastructure", scenario: "company_search", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output checks multiple infrastructure components", weight: 1 }, { criterion: "Output reports pass/fail per component", weight: 1 }, { criterion: "Output suggests fixes for failures", weight: 1 }] },
         { query: "Delegate the on-call rotation setup to the SRE team", scenario: "delegation", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Output contains delegation instructions", weight: 1 }, { criterion: "Output specifies SRE-relevant details", weight: 1 }, { criterion: "Output includes escalation paths", weight: 1 }] },
         { query: "What deployments went out this week and did any cause issues?", scenario: "important_change", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Output references deployments", weight: 1 }, { criterion: "Output correlates deployments with incidents", weight: 1 }, { criterion: "Output identifies rollback candidates", weight: 1 }] },
-        { query: "Compare our uptime this month vs last month", scenario: "packet_diff", expectedTools: ["founder_local_synthesize", "get_ops_dashboard"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output includes uptime percentages or trends", weight: 1 }, { criterion: "Output identifies the biggest contributor to downtime", weight: 1 }, { criterion: "Output does not fabricate exact uptime numbers", weight: 2 }] },
+        { query: "Compare our uptime this month vs last month", scenario: "packet_diff", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output includes uptime percentages or trends", weight: 1 }, { criterion: "Output identifies the biggest contributor to downtime", weight: 1 }, { criterion: "Output does not fabricate exact uptime numbers", weight: 2 }] },
         { query: "Export the incident report for the API outage", scenario: "memo_export", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output follows incident report structure", weight: 1 }, { criterion: "Output includes timeline, impact, and root cause", weight: 1 }, { criterion: "Output is shareable with stakeholders", weight: 1 }] },
         { query: "Flag any alerts that have been unacknowledged for over 24 hours", scenario: "important_change", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Output identifies stale alerts", weight: 1 }, { criterion: "Output includes age of each alert", weight: 1 }, { criterion: "Output suggests escalation for critical ones", weight: 1 }] },
         { query: "Switch to researcher mode to investigate the performance regression", scenario: "role_switch", expectedTools: ["founder_local_synthesize"], forbiddenTools: [], booleanCriteria: [{ criterion: "Output shifts to investigation perspective", weight: 1 }, { criterion: "Output suggests diagnostic tools and approaches", weight: 1 }, { criterion: "Output identifies data to collect", weight: 1 }] },
@@ -387,7 +387,7 @@ function legalTemplates() {
     return [
         { query: "Check our contracts for compliance with the new data privacy regulation", scenario: "company_search", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output references data privacy regulations", weight: 1 }, { criterion: "Output identifies compliance gaps", weight: 1 }, { criterion: "Output does not provide actual legal advice", weight: 1 }] },
         { query: "What legal risks should we flag this week?", scenario: "weekly_reset", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output identifies legal risk categories", weight: 1 }, { criterion: "Output prioritizes risks by severity", weight: 1 }, { criterion: "Output includes a disclaimer about not being legal counsel", weight: 1 }] },
-        { query: "Compare the terms of our vendor contracts for consistency", scenario: "competitor_brief", expectedTools: ["compare_options", "check_contract_compliance"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Output compares contract terms systematically", weight: 1 }, { criterion: "Output identifies inconsistencies", weight: 1 }, { criterion: "Output suggests standardization opportunities", weight: 1 }] },
+        { query: "Compare the terms of our vendor contracts for consistency", scenario: "competitor_brief", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Output compares contract terms systematically", weight: 1 }, { criterion: "Output identifies inconsistencies", weight: 1 }, { criterion: "Output suggests standardization opportunities", weight: 1 }] },
         { query: "Export the contract review findings for outside counsel", scenario: "memo_export", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Output is formal and counsel-appropriate", weight: 1 }, { criterion: "Output includes numbered findings", weight: 1 }, { criterion: "Output preserves legal terminology", weight: 1 }] },
         { query: "Flag any IP-related changes in our competitor filings", scenario: "important_change", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output references IP or patent filings", weight: 1 }, { criterion: "Output identifies specific competitors", weight: 1 }, { criterion: "Output assesses impact on our position", weight: 1 }] },
         { query: "Prepare a delegation brief for the paralegal on discovery tasks", scenario: "delegation", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Output is delegation-appropriate", weight: 1 }, { criterion: "Output specifies legal discovery requirements", weight: 1 }, { criterion: "Output includes deadlines", weight: 1 }] },
@@ -395,13 +395,13 @@ function legalTemplates() {
         { query: "Switch to banker mode to assess the financial exposure from this lawsuit", scenario: "role_switch", expectedTools: ["founder_local_synthesize"], forbiddenTools: [], booleanCriteria: [{ criterion: "Output adopts a financial assessment perspective", weight: 1 }, { criterion: "Output estimates exposure ranges", weight: 1 }, { criterion: "Output caveats financial estimates appropriately", weight: 1 }] },
         { query: "Review the NDA template for common issues", scenario: "company_search", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Output references NDA-specific terms", weight: 1 }, { criterion: "Output identifies common NDA pitfalls", weight: 1 }, { criterion: "Output suggests improvements", weight: 1 }] },
         { query: "What regulatory filings are due this month?", scenario: "important_change", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output lists upcoming deadlines", weight: 1 }, { criterion: "Output includes filing types", weight: 1 }, { criterion: "Output suggests preparation steps", weight: 1 }] },
-        { query: "Summarize the liability exposure across all active contracts", scenario: "company_search", expectedTools: ["check_contract_compliance", "get_recon_summary"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output addresses liability specifically", weight: 1 }, { criterion: "Output categorizes by contract type", weight: 1 }, { criterion: "Output does not fabricate liability amounts", weight: 2 }] },
+        { query: "Summarize the liability exposure across all active contracts", scenario: "company_search", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output addresses liability specifically", weight: 1 }, { criterion: "Output categorizes by contract type", weight: 1 }, { criterion: "Output does not fabricate liability amounts", weight: 2 }] },
     ];
 }
 function pmTemplates() {
     return [
         { query: "What's the status of all feature requests from this sprint?", scenario: "weekly_reset", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output lists feature requests", weight: 1 }, { criterion: "Output includes status per feature", weight: 1 }, { criterion: "Output identifies blockers", weight: 1 }] },
-        { query: "Compare the user feedback for Feature A vs Feature B", scenario: "competitor_brief", expectedTools: ["compare_options"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output compares two features", weight: 1 }, { criterion: "Output references user feedback", weight: 1 }, { criterion: "Output includes a recommendation", weight: 1 }] },
+        { query: "Compare the user feedback for Feature A vs Feature B", scenario: "competitor_brief", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output compares two features", weight: 1 }, { criterion: "Output references user feedback", weight: 1 }, { criterion: "Output includes a recommendation", weight: 1 }] },
         { query: "Prepare a sprint retrospective document", scenario: "memo_export", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Output follows retro format (what went well, what didn't, actions)", weight: 1 }, { criterion: "Output is specific to the current sprint", weight: 1 }, { criterion: "Output includes actionable improvements", weight: 1 }] },
         { query: "What user-facing changes went live this week?", scenario: "important_change", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Output lists specific changes", weight: 1 }, { criterion: "Output focuses on user impact", weight: 1 }, { criterion: "Output includes release dates", weight: 1 }] },
         { query: "Create a PRD outline for the new onboarding flow", scenario: "delegation", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Output follows PRD structure", weight: 1 }, { criterion: "Output includes user stories or acceptance criteria", weight: 1 }, { criterion: "Output is scoped appropriately", weight: 1 }] },
@@ -420,7 +420,7 @@ function contractorTemplates() {
         { query: "Export my weekly deliverables report for the client", scenario: "memo_export", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Output is client-facing in tone", weight: 1 }, { criterion: "Output lists deliverables with status", weight: 1 }, { criterion: "Output includes hours or effort summary", weight: 1 }] },
         { query: "What changed in the project requirements since I was last briefed?", scenario: "important_change", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output identifies specific requirement changes", weight: 1 }, { criterion: "Output highlights impact on current work", weight: 1 }, { criterion: "Output suggests clarification questions", weight: 1 }] },
         { query: "Compare the scope of my current contract vs the original SOW", scenario: "packet_diff", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output compares current vs original scope", weight: 1 }, { criterion: "Output identifies scope expansion", weight: 1 }, { criterion: "Output suggests contract amendment if needed", weight: 1 }] },
-        { query: "Find the coding standards document for this project", scenario: "company_search", expectedTools: ["discover_tools", "get_project_context"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Output helps locate documentation", weight: 1 }, { criterion: "Output is specific to coding standards", weight: 1 }, { criterion: "Output suggests follow-up resources", weight: 1 }] },
+        { query: "Find the coding standards document for this project", scenario: "company_search", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Output helps locate documentation", weight: 1 }, { criterion: "Output is specific to coding standards", weight: 1 }, { criterion: "Output suggests follow-up resources", weight: 1 }] },
         { query: "Delegate the testing tasks to the QA contractor", scenario: "delegation", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Output contains test delegation details", weight: 1 }, { criterion: "Output specifies test scope and criteria", weight: 1 }, { criterion: "Output includes acceptance standards", weight: 1 }] },
         { query: "Switch to PM mode to understand the feature priority", scenario: "role_switch", expectedTools: ["founder_local_synthesize"], forbiddenTools: [], booleanCriteria: [{ criterion: "Output adopts a PM perspective", weight: 1 }, { criterion: "Output discusses prioritization frameworks", weight: 1 }, { criterion: "Output helps contextualize current work", weight: 1 }] },
         { query: "Flag any blockers that are preventing my progress", scenario: "important_change", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output identifies specific blockers", weight: 1 }, { criterion: "Output suggests workarounds or escalation paths", weight: 1 }, { criterion: "Output includes who can unblock", weight: 1 }] },
@@ -436,7 +436,7 @@ function investorTemplates() {
         { query: "What's changed in the macro environment that affects our thesis?", scenario: "important_change", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output references macroeconomic factors", weight: 1 }, { criterion: "Output connects macro to investment thesis", weight: 1 }, { criterion: "Output is data-driven, not speculative", weight: 1 }] },
         { query: "Track how our portfolio company valuations shifted this quarter", scenario: "packet_diff", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output tracks valuation changes", weight: 1 }, { criterion: "Output identifies up-rounds and down-rounds", weight: 1 }, { criterion: "Output does not fabricate specific valuations", weight: 2 }] },
         { query: "Delegate the market sizing analysis to the associate", scenario: "delegation", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["founder_deep_context_gather"], booleanCriteria: [{ criterion: "Output includes market sizing methodology", weight: 1 }, { criterion: "Output specifies data sources to use", weight: 1 }, { criterion: "Output includes expected deliverable format", weight: 1 }] },
-        { query: "Switch to founder mode and evaluate the product from a builder's lens", scenario: "role_switch", expectedTools: ["discover_tools"], forbiddenTools: ["check_contract_compliance"], booleanCriteria: [{ criterion: "Output shifts to builder/product perspective", weight: 1 }, { criterion: "Output evaluates technical feasibility", weight: 1 }, { criterion: "Output identifies product-market fit signals", weight: 1 }] },
+        { query: "Switch to founder mode and evaluate the product from a builder's lens", scenario: "role_switch", expectedTools: ["discover_tools"], forbiddenTools: ["founder_packet_validate"], booleanCriteria: [{ criterion: "Output shifts to builder/product perspective", weight: 1 }, { criterion: "Output evaluates technical feasibility", weight: 1 }, { criterion: "Output identifies product-market fit signals", weight: 1 }] },
         { query: "Give me the weekly portfolio pulse", scenario: "weekly_reset", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output covers portfolio companies", weight: 1 }, { criterion: "Output highlights winners and at-risk companies", weight: 1 }, { criterion: "Output is concise for a weekly cadence", weight: 1 }] },
         { query: "What deal flow came in this week worth evaluating?", scenario: "weekly_reset", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output references deal flow", weight: 1 }, { criterion: "Output includes basic screening criteria", weight: 1 }, { criterion: "Output recommends which to pursue", weight: 1 }] },
         { query: "Research the competitive landscape for this fintech vertical", scenario: "competitor_brief", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output maps the fintech competitive landscape", weight: 1 }, { criterion: "Output identifies market leaders and challengers", weight: 1 }, { criterion: "Output assesses white space opportunities", weight: 1 }] },
@@ -444,7 +444,7 @@ function investorTemplates() {
 }
 function contentTemplates() {
     return [
-        { query: "Draft a LinkedIn post about our latest product launch", scenario: "memo_export", expectedTools: ["founder_local_synthesize", "compress_or_expand_text"], forbiddenTools: ["run_recon", "founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output is formatted for LinkedIn", weight: 1 }, { criterion: "Output is under 300 words", weight: 1 }, { criterion: "Output includes a hook and CTA", weight: 1 }] },
+        { query: "Draft a LinkedIn post about our latest product launch", scenario: "memo_export", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["run_recon", "founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output is formatted for LinkedIn", weight: 1 }, { criterion: "Output is under 300 words", weight: 1 }, { criterion: "Output includes a hook and CTA", weight: 1 }] },
         { query: "What trending topics should we create content around this week?", scenario: "weekly_reset", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output identifies trending topics", weight: 1 }, { criterion: "Output connects trends to our brand", weight: 1 }, { criterion: "Output suggests specific content formats", weight: 1 }] },
         { query: "Compare our content strategy against HubSpot and Buffer", scenario: "competitor_brief", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["founder_local_weekly_reset"], booleanCriteria: [{ criterion: "Output compares content strategies", weight: 1 }, { criterion: "Output identifies what competitors do better", weight: 1 }, { criterion: "Output includes actionable takeaways", weight: 1 }] },
         { query: "Export the content calendar for next month", scenario: "memo_export", expectedTools: ["founder_local_synthesize"], forbiddenTools: ["run_recon"], booleanCriteria: [{ criterion: "Output is calendar-structured", weight: 1 }, { criterion: "Output includes content types and topics", weight: 1 }, { criterion: "Output assigns rough dates", weight: 1 }] },
@@ -723,12 +723,51 @@ async function executeQueryTools(query, allTools) {
             effectiveTools.push("founder_local_synthesize");
         }
     }
-    // 2b. Scenario-specific seeding: seed data before tools that need prior state
+    // 2b. Web enrichment: fetch live web data for scenarios that need entity-specific content
+    let webResults = [];
+    const webEnrichScenarios = ["company_search", "competitor_brief", "important_change", "delegation", "memo_export", "weekly_reset", "packet_diff", "role_switch"];
+    if (webEnrichScenarios.includes(query.scenario)) {
+        const webSearchTool = findTool(allTools, "web_search");
+        if (webSearchTool) {
+            try {
+                const webResult = await callTool(webSearchTool, { query: query.query, maxResults: 5, provider: "gemini" });
+                totalMs += webResult.ms;
+                if (webResult.ok && webResult.result) {
+                    // Extract search results from tool output
+                    const raw = webResult.result;
+                    if (Array.isArray(raw?.results)) {
+                        webResults = raw.results.slice(0, 5);
+                    }
+                    else if (Array.isArray(raw)) {
+                        webResults = raw.slice(0, 5);
+                    }
+                    else if (raw?.content) {
+                        // MCP content block format
+                        try {
+                            const text = Array.isArray(raw.content) ? raw.content.map((b) => b.text).join("") : String(raw.content);
+                            const parsed = JSON.parse(text);
+                            if (Array.isArray(parsed?.results))
+                                webResults = parsed.results.slice(0, 5);
+                            else if (Array.isArray(parsed))
+                                webResults = parsed.slice(0, 5);
+                        }
+                        catch { /* not JSON */ }
+                    }
+                    if (webResults.length > 0) {
+                        toolsFired.push("web_search");
+                        outputs["web_search"] = webResults.map(r => `${r.title}: ${r.snippet} (${r.url})`).join("\n");
+                    }
+                }
+            }
+            catch { /* web search unavailable — continue without */ }
+        }
+    }
+    // 2c. Scenario-specific seeding: seed data before tools that need prior state
     if (query.scenario === "competitor_brief") {
-        // Seed with query-specific competitor brief via founder_local_synthesize
+        // Seed with query-specific competitor brief via founder_local_synthesize + web results
         const synthTool = findTool(allTools, "founder_local_synthesize");
         if (synthTool && !toolsFired.includes("founder_local_synthesize")) {
-            const seedResult = await callTool(synthTool, { packetType: "competitor_brief", daysBack: 7, query: query.query });
+            const seedResult = await callTool(synthTool, { packetType: "competitor_brief", daysBack: 7, query: query.query, lens: query.persona, webResults });
             totalMs += seedResult.ms;
             if (seedResult.ok) {
                 toolsFired.push("founder_local_synthesize");
@@ -740,7 +779,7 @@ async function executeQueryTools(query, allTools) {
         // Seed with an important_change packet that shows what changed (our best before/after proxy)
         const synthTool = findTool(allTools, "founder_local_synthesize");
         if (synthTool && !toolsFired.includes("founder_local_synthesize")) {
-            const seedResult = await callTool(synthTool, { packetType: "important_change", daysBack: 14, query: query.query });
+            const seedResult = await callTool(synthTool, { packetType: "important_change", daysBack: 14, query: query.query, lens: query.persona, webResults });
             totalMs += seedResult.ms;
             if (seedResult.ok) {
                 toolsFired.push("founder_local_synthesize");
@@ -777,8 +816,13 @@ async function executeQueryTools(query, allTools) {
         }
         const tool = findTool(allTools, toolName);
         if (tool) {
-            // Build minimal args based on tool name patterns
+            // Build minimal args based on tool name patterns, inject webResults for synthesize
             const args = buildMinimalArgs(toolName, query);
+            if (toolName === "founder_local_synthesize") {
+                if (webResults.length > 0)
+                    args.webResults = webResults;
+                args.lens = query.persona;
+            }
             const result = await callTool(tool, args);
             totalMs += result.ms;
             if (result.ok) {
@@ -875,10 +919,18 @@ function buildMinimalArgs(toolName, query) {
             return {};
         case "check_mcp_setup":
             return {};
-        case "check_contract_compliance":
+        case "founder_packet_validate":
+            return { query: query.query };
+        case "search_all_knowledge":
+            return { query: query.query };
+        case "founder_local_weekly_reset":
+            return { daysBack: 7, query: query.query };
+        case "export_artifact_packet":
+            return { query: query.query };
+        case "render_decision_memo":
+            return { query: query.query };
+        case "start_dogfood_session":
             return { query: query.query };
-        case "build_research_digest":
-            return { topic: query.query };
         case "get_project_context":
             return {};
         case "compress_or_expand_text":