@blockrun/franklin 3.15.8 → 3.15.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -111,6 +111,28 @@ export function classifyAgentError(message) {
111
111
  suggestion: 'The model is overloaded. Try /model to switch, or wait and /retry.',
112
112
  };
113
113
  }
114
+ // Reasoning / thinking-mode format errors — NOT transient.
115
+ // DeepSeek V4 family and similar thinking-enabled models reject requests
116
+ // when the message history's reasoning_content fields don't match the
117
+ // upstream's expected shape (typically: tool-call assistant messages must
118
+ // carry reasoning_content; non-tool-call ones must not, or vice versa).
119
+ // The fix is to drop the polluting history, not to swap models — every
120
+ // thinking-enabled model has the same constraint just with different
121
+ // specifics. /clear forces a fresh context that won't have the bad shape.
122
+ // Classified BEFORE the generic schema branch below so we surface the
123
+ // right suggestion.
124
+ if (includesAny(err, [
125
+ 'reasoning_content',
126
+ 'reasoning content',
127
+ 'thinking mode must',
128
+ 'message format incompatible',
129
+ 'reasoning_format_error',
130
+ ])) {
131
+ return {
132
+ category: 'schema', label: 'Schema', isTransient: false, maxRetries: 0,
133
+ suggestion: 'Thinking-mode history is incompatible with this model. Use /clear to reset and retry, or /model to switch to a non-thinking model.',
134
+ };
135
+ }
114
136
  // Schema / tool-definition errors — NOT transient, retrying won't help.
115
137
  // These can be wrapped in 5xx responses (e.g. '503: 400 Invalid schema'),
116
138
  // so classify them BEFORE the generic server-error branch below.
@@ -65,9 +65,36 @@ Flag as tool-use refusal:
65
65
 
66
66
  VERDICT: GROUNDED | PARTIAL | UNGROUNDED
67
67
 
68
- If not GROUNDED, list each issue on its own line starting with "- " and the tool that should have been called, like:
69
- - Claim: "<the ungrounded part, quoted briefly>" → missing tool: <TradingMarket | ExaAnswer | ExaSearch | WebSearch | ...>
70
- - Refusal: "<the refusal phrase, quoted briefly>" should have called: <tool name>
68
+ If not GROUNDED, list each issue on its own line starting with "- " and the tool that should have been called.
69
+
70
+ ## Picking the right tool strict domain rules
71
+
72
+ **Default for any factual claim:** WebSearch or ExaSearch. These are the
73
+ right answer for the OVERWHELMING majority of "the model said a number it
74
+ didn't look up" cases — current events, statistics, prices for non-crypto
75
+ goods (real estate, retail, salaries), people, companies, news, etc.
76
+
77
+ **Use specialized tools ONLY when the claim's domain matches:**
78
+ - TradingMarket / TradingSignal — ONLY for cryptocurrency tickers (BTC, ETH, SOL, etc). Never for stocks, real estate, currencies, commodities outside crypto.
79
+ - DefiLlamaProtocol / DefiLlamaYields / DefiLlamaPrice — ONLY for DeFi protocols, TVL, yields, on-chain token prices.
80
+ - SearchX — ONLY for X.com / Twitter posts and accounts.
81
+ - ExaAnswer — research questions where you want a synthesized answer with citations.
82
+ - WebFetch — claims that quote a SPECIFIC URL the model already named.
83
+
84
+ **Anti-patterns to never produce:**
85
+ - Real-estate price → TradingMarket (TradingMarket is crypto-only — wrong domain)
86
+ - Stock ticker → TradingMarket (also crypto-only — use WebSearch instead)
87
+ - Generic news / statistics → TradingMarket (use WebSearch)
88
+ - Person's biography → TradingMarket (use WebSearch)
89
+
90
+ When unsure: name **WebSearch**. It's the safe default for factual grounding.
91
+
92
+ ## Format examples
93
+
94
+ - Claim: "<the ungrounded part, quoted briefly>" → missing tool: WebSearch
95
+ - Claim: "BTC at $67k" → missing tool: TradingMarket
96
+ - Claim: "Westlake $/sqft is $719" → missing tool: WebSearch
97
+ - Refusal: "<the refusal phrase, quoted briefly>" → should have called: WebSearch
71
98
 
72
99
  Empty line between verdict and list. No other text. No preamble. No apology. Be terse.`;
73
100
  // ─── Trigger policy ──────────────────────────────────────────────────────
@@ -241,6 +241,39 @@ export function looksLikeGatewayErrorAsText(parts) {
241
241
  return { match: false, message: '' };
242
242
  return { match: true, message: m[1].trim() };
243
243
  }
244
/**
 * Domain check for the grounding-retry force-tool path.
 *
 * A specialized tool should only be pinned via tool_choice when the user
 * prompt actually references that tool's domain; when this returns false the
 * caller is expected to fall back to forcing "any" tool instead.
 *
 * Motivating bug: a real-estate question (asked in Chinese) had its answer
 * flagged as ungrounded for citing $/sqft figures, and the evaluator named
 * TradingMarket — a crypto-only tool — as the missing tool. Pinning it made
 * the retry useless.
 *
 * Tool families that are gated on prompt keywords:
 *   - names starting with Trading / DefiLlama / Jupiter / Base0x → crypto keywords
 *   - SearchX / PostToX → an @handle or an X/Twitter keyword
 *   - ImageGen / VideoGen / MusicGen → creative-content keywords
 * Every other tool name is treated as general-purpose and always passes.
 *
 * Keyword matching notes:
 *   - Latin keywords are matched on word boundaries (`\b`) so short tickers
 *     such as "eth" or "sol" do not fire inside unrelated words.
 *   - CJK keywords are matched as plain substrings. Without the `u`/`v` flag,
 *     `\b` only treats ASCII [A-Za-z0-9_] as word characters, so wrapping a
 *     CJK keyword in `\b…\b` can never match inside ordinary Chinese text.
 *
 * @param {string} toolName - Tool name suggested by the evaluator.
 * @param {string} promptLower - The user's prompt (callers pass it lowercased;
 *   matching is case-insensitive regardless). Non-string values are treated
 *   as an empty prompt.
 * @returns {boolean} true when the tool may be pinned for this prompt.
 */
function isToolRelevantToPrompt(toolName, promptLower) {
    const prompt = typeof promptLower === 'string' ? promptLower : '';

    // Crypto trading / DeFi / swap tools — need a ticker, "crypto", "coin", "swap", etc.
    // (The `Base0x` prefix also covers Base0xGasless.)
    if (/^(?:Trading|DefiLlama|Jupiter|Base0x)/i.test(toolName)) {
        const latinHit = /\b(?:btc|eth|sol|xrp|doge|usdc|usdt|crypto|coin|token|defi|tvl|yield|swap|jupiter|uniswap|pump\.fun|solana|base chain|polygon|ethereum)\b/i.test(prompt);
        // No \b here: see "Keyword matching notes" in the doc comment.
        const cjkHit = /币|代币|链上|做空|做多/.test(prompt);
        return latinHit || cjkHit;
    }

    // X.com tools — need an @handle, "twitter", "tweet", "x.com".
    if (/^(?:SearchX|PostToX)$/i.test(toolName)) {
        return /(@\w+|twitter|x\.com|tweet|推特)/i.test(prompt);
    }

    // Image / video / music generation — need a creative-content request.
    if (/^(?:ImageGen|VideoGen|MusicGen)$/i.test(toolName)) {
        const latinHit = /\b(?:image|picture|photo|video|clip|music|song|generate|create|render|draw)\b/i.test(prompt);
        // No \b here either, for the same reason as the crypto CJK keywords.
        const cjkHit = /画|图|视频|音乐|歌/.test(prompt);
        return latinHit || cjkHit;
    }

    // General-purpose / file / shell tools — always relevant.
    return true;
}
244
277
  /**
245
278
  * Calculate backoff delay with jitter to avoid thundering herd.
246
279
  * Base: exponential (2^attempt * 1000ms), jitter: ±25%.
@@ -1349,11 +1382,23 @@ export async function interactiveSession(config, getUserInput, onEvent, onAbortR
1349
1382
  // Hard enforcement: set tool_choice so the model can't fabricate
1350
1383
  // citations in lieu of running tools (the round-2 failure mode
1351
1384
  // from the Tampa→Miami log). If the evaluator named exactly one
1352
- // available tool, pin to it; otherwise force "any" tool use.
1385
+ // available tool AND that tool's domain matches the user's
1386
+ // prompt, pin to it; otherwise force "any" tool use and let
1387
+ // the generator pick the right one.
1388
+ //
1389
+ // Domain validation guards against the cheap evaluator model
1390
+ // hallucinating a wrong specialized tool (e.g., suggesting
1391
+ // TradingMarket for a real-estate question because the prompt
1392
+ // listed it as the first example tool). Specialized tools —
1393
+ // crypto trading, DeFi, swap quotes, X.com search — only get
1394
+ // pinned when their domain keywords appear in the user prompt;
1395
+ // otherwise we drop down to "any tool" and let the smart
1396
+ // generator model decide based on tool descriptions.
1353
1397
  const namedTools = extractMissingToolNames(gResult);
1354
1398
  const availableNames = new Set(buildCallToolDefs().map(t => t.name));
1355
1399
  const matched = namedTools.filter(n => availableNames.has(n));
1356
- if (matched.length === 1) {
1400
+ const promptForDomainCheck = (lastUserInput || '').toLowerCase();
1401
+ if (matched.length === 1 && isToolRelevantToPrompt(matched[0], promptForDomainCheck)) {
1357
1402
  forceToolChoiceNextRound = { type: 'tool', name: matched[0] };
1358
1403
  }
1359
1404
  else if (availableNames.size > 0) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@blockrun/franklin",
3
- "version": "3.15.8",
3
+ "version": "3.15.9",
4
4
  "description": "Franklin — The AI agent with a wallet. Spends USDC autonomously to get real work done. Pay per action, no subscriptions.",
5
5
  "type": "module",
6
6
  "exports": {