npm - @blockrun/franklin - Versions diffs - 3.8.17 → 3.8.18 - Mend

@blockrun/franklin 3.8.17 → 3.8.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3)

package/dist/agent/evaluator.js CHANGED

@@ -26,34 +26,46 @@
 // Principle-based, not example-enumerating. Specific tickers or phrasings
 // hard-coded here would rot the moment the market changes. The rule is
 // general: claim → tool result or explicit uncertainty.
-const EVALUATOR_PROMPT = `You are a GROUNDING CHECK agent. Your job is to verify that an AI assistant's answer is grounded in tool-call evidence, not model memory.
+const EVALUATOR_PROMPT = `You are a GROUNDING CHECK agent. Your job is to verify that an AI assistant's answer is grounded in tool-call evidence, not model memory — and that it didn't REFUSE to use tools when tools were the right answer.
 ## What you receive
 - The user's question
 - A list of tool calls made this turn (tool name, input summary, whether it succeeded)
 - The assistant's final text answer
-## What you check
+## Two failure modes to catch
+### A. Ungrounded claims
 Every **factual claim** in the answer must trace to ONE of:
   (a) A successful tool call result from this turn, OR
-  (b) Explicit acknowledgment of uncertainty ("I'm not sure", "based on older data", "I'd need to check")
+  (b) Explicit acknowledgment of uncertainty ("I'm not sure", "based on older data")
-Claims that are ungrounded:
+Flag as ungrounded:
 - Specific current-world facts stated with confidence but not backed by any tool call this turn
 - Recommendations or conclusions that depend on unstated data (e.g. "you should sell" without a price lookup)
 - Invented specifics — names, numbers, dates the model produced without a tool call supporting them
-Claims that are grounded:
+### B. Tool-use refusal (NEW)
+If the user clearly asked for live-world data — a current price, today's news, the latest state of X — and the assistant's answer contains a refusal or deflection (e.g. "I can't provide real-time prices", "我无法提供实时数据", "check Yahoo Finance yourself", "as an AI I don't have access to live data"), that is also UNGROUNDED. Franklin HAS tools for this (TradingMarket for prices, ExaAnswer for current events, WebSearch for general web, etc.). Refusing to reach for them is the failure this check was built for.
+Flag as tool-use refusal:
+- "I can't check real-time prices"
+- "I don't have access to current market data"
+- "You should check [some external site] for the latest"
+- Any variation in any language that shrugs off a live-data question when tools exist
+## What's OK
 - Anything directly derived from a tool result shown in the turn
 - General knowledge / definitions / reasoning that doesn't depend on current-world specifics
-- Claims explicitly hedged as uncertain
+- Claims explicitly hedged as uncertain for reasons unrelated to tool availability
 ## Output — exact format
 VERDICT: GROUNDED | PARTIAL | UNGROUNDED
-If not GROUNDED, list each ungrounded claim on its own line starting with "- " and the tool that should have been called, like:
+If not GROUNDED, list each issue on its own line starting with "- " and the tool that should have been called, like:
 - Claim: "<the ungrounded part, quoted briefly>" → missing tool: <TradingMarket | ExaAnswer | ExaSearch | WebSearch | ...>
+- Refusal: "<the refusal phrase, quoted briefly>" → should have called: <tool name>
 Empty line between verdict and list. No other text. No preamble. No apology. Be terse.`;
 // ─── Trigger policy ──────────────────────────────────────────────────────

package/dist/agent/planner.js CHANGED

@@ -17,8 +17,18 @@ const MULTI_STEP_PATTERN = /first.*then|step\s+\d|\d+\.\s|and\s+then|after\s+tha
  * the overhead of an extra planning call.
  */
 export function shouldPlan(tier, profile, userText, ultrathink, planDisabled) {
-    // Per-process opt-out for ablation / scripting ("is plan-then-execute
-    // still load-bearing?"). Takes precedence over every other heuristic.
+    // Default: plan-then-execute is OFF (v3.8.18). Observed failure: router
+    // correctly picks Sonnet for a "should I sell CRCL" prompt, but the
+    // executor swap downgrades actual execution to gemini-2.5-flash, which
+    // then answers from memory instead of calling TradingMarket / ExaAnswer.
+    // The cheap-executor pattern was load-bearing for Sonnet 4.0-era models;
+    // Opus 4.7 / Sonnet 4.6 handle multi-step tool use coherently in a
+    // single pass, so the two-call path is pure overhead — and it actively
+    // hurts when the executor is weaker than the planner.
+    // Opt back in with FRANKLIN_PLAN=1 (for experiments / ablation).
+    if (process.env.FRANKLIN_PLAN !== '1')
+        return false;
+    // Legacy env opt-out — still honored for users who set it previously.
     if (process.env.FRANKLIN_NOPLAN === '1')
         return false;
     // User disabled planning for this session

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@blockrun/franklin",
-  "version": "3.8.17",
+  "version": "3.8.18",
   "description": "Franklin — The AI agent with a wallet. Spends USDC autonomously to get real work done. Pay per action, no subscriptions.",
   "type": "module",
   "exports": {