npm - thumbgate - Versions diffs - 1.15.0 → 1.16.0 - Mend

thumbgate 1.15.0 → 1.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (129)

package/.claude-plugin/marketplace.json +6 -6
package/.claude-plugin/plugin.json +3 -3
package/.well-known/llms.txt +5 -5
package/.well-known/mcp/server-card.json +1 -1
package/README.md +59 -35
package/adapters/chatgpt/openapi.yaml +118 -2
package/adapters/claude/.mcp.json +2 -2
package/adapters/mcp/server-stdio.js +210 -84
package/adapters/opencode/opencode.json +1 -1
package/bench/prompt-eval-suite.json +5 -1
package/bin/cli.js +157 -8
package/config/evals/agent-safety-eval.json +338 -22
package/config/gates/routine.json +43 -0
package/config/github-about.json +3 -3
package/config/model-candidates.json +131 -0
package/openapi/openapi.yaml +118 -2
package/package.json +55 -48
package/public/blog.html +7 -7
package/public/codex-plugin.html +6 -6
package/public/compare.html +29 -23
package/public/dashboard.html +82 -10
package/public/guide.html +28 -28
package/public/index.html +216 -98
package/public/learn.html +50 -22
package/public/lessons.html +1 -1
package/public/numbers.html +17 -17
package/public/pro.html +82 -18
package/scripts/agent-audit-trace.js +55 -0
package/scripts/agent-memory-lifecycle.js +96 -0
package/scripts/agent-readiness-plan.js +118 -0
package/scripts/agentic-data-pipeline.js +21 -1
package/scripts/agents-sdk-sandbox-plan.js +57 -0
package/scripts/ai-org-governance.js +98 -0
package/scripts/ai-search-distribution.js +43 -0
package/scripts/artifact-agent-plan.js +81 -0
package/scripts/billing.js +27 -8
package/scripts/cli-schema.js +18 -2
package/scripts/code-mode-mcp-plan.js +71 -0
package/scripts/context-engine.js +1 -2
package/scripts/context-manager.js +4 -1
package/scripts/dashboard-render-spec.js +1 -1
package/scripts/dashboard.js +275 -9
package/scripts/decision-journal.js +13 -3
package/scripts/document-workflow-governance.js +62 -0
package/scripts/enterprise-agent-rollout.js +34 -0
package/scripts/experience-replay-governance.js +69 -0
package/scripts/export-hf-dataset.js +1 -1
package/scripts/feedback-loop.js +92 -4
package/scripts/feedback-to-rules.js +17 -23
package/scripts/gates-engine.js +4 -6
package/scripts/growth-campaigns.js +49 -0
package/scripts/harness-selector.js +16 -4
package/scripts/hybrid-supervisor-agent.js +64 -0
package/scripts/inference-cache-policy.js +72 -0
package/scripts/inference-economics.js +53 -0
package/scripts/internal-agent-bootstrap.js +12 -2
package/scripts/knowledge-layer-plan.js +108 -0
package/scripts/lesson-inference.js +183 -44
package/scripts/lesson-search.js +4 -1
package/scripts/llm-client.js +157 -26
package/scripts/mailer/resend-mailer.js +112 -1
package/scripts/mcp-transport-strategy.js +66 -0
package/scripts/memory-store-governance.js +60 -0
package/scripts/meta-agent-loop.js +7 -13
package/scripts/model-access-eligibility.js +38 -0
package/scripts/model-migration-readiness.js +55 -0
package/scripts/operational-integrity.js +96 -3
package/scripts/otel-declarative-config.js +56 -0
package/scripts/perplexity-client.js +1 -1
package/scripts/post-training-governance.js +34 -0
package/scripts/private-core-boundary.js +72 -0
package/scripts/production-agent-readiness.js +40 -0
package/scripts/prompt-eval.js +564 -32
package/scripts/prompt-programs.js +93 -0
package/scripts/provider-action-normalizer.js +585 -0
package/scripts/scaling-law-claims.js +60 -0
package/scripts/security-scanner.js +1 -1
package/scripts/self-distill-agent.js +7 -32
package/scripts/seo-gsd.js +232 -55
package/scripts/skill-rag-router.js +53 -0
package/scripts/spec-gate.js +1 -1
package/scripts/student-consistent-training.js +73 -0
package/scripts/synthetic-data-provenance.js +98 -0
package/scripts/task-context-result.js +81 -0
package/scripts/telemetry-analytics.js +149 -0
package/scripts/thompson-sampling.js +2 -2
package/scripts/token-savings.js +7 -6
package/scripts/token-tco.js +46 -0
package/scripts/tool-registry.js +63 -3
package/scripts/verification-loop.js +10 -1
package/scripts/verifier-scoring.js +71 -0
package/scripts/workflow-sentinel.js +284 -28
package/scripts/workspace-agent-routines.js +118 -0
package/src/api/server.js +381 -120
package/scripts/analytics-report.js +0 -328
package/scripts/autonomous-workflow.js +0 -377
package/scripts/billing-setup.js +0 -109
package/scripts/creator-campaigns.js +0 -239
package/scripts/cross-encoder-reranker.js +0 -235
package/scripts/daemon-manager.js +0 -108
package/scripts/decision-trace.js +0 -354
package/scripts/delegation-runtime.js +0 -896
package/scripts/dispatch-brief.js +0 -159
package/scripts/distribution-surfaces.js +0 -110
package/scripts/feedback-history-distiller.js +0 -382
package/scripts/funnel-analytics.js +0 -35
package/scripts/history-distiller.js +0 -200
package/scripts/hosted-job-launcher.js +0 -256
package/scripts/intent-router.js +0 -392
package/scripts/lesson-reranker.js +0 -263
package/scripts/lesson-retrieval.js +0 -148
package/scripts/managed-lesson-agent.js +0 -183
package/scripts/operational-dashboard.js +0 -103
package/scripts/operational-summary.js +0 -129
package/scripts/operator-artifacts.js +0 -608
package/scripts/optimize-context.js +0 -17
package/scripts/org-dashboard.js +0 -206
package/scripts/partner-orchestration.js +0 -146
package/scripts/predictive-insights.js +0 -356
package/scripts/pulse.js +0 -80
package/scripts/reflector-agent.js +0 -221
package/scripts/sales-pipeline.js +0 -681
package/scripts/session-episode-store.js +0 -329
package/scripts/session-health-sensor.js +0 -242
package/scripts/session-report.js +0 -120
package/scripts/swarm-coordinator.js +0 -81
package/scripts/tool-kpi-tracker.js +0 -12
package/scripts/webhook-delivery.js +0 -62
package/scripts/workflow-sprint-intake.js +0 -475

package/scripts/knowledge-layer-plan.js ADDED Viewed

@@ -0,0 +1,108 @@
+'use strict';
+function buildKnowledgeLayerPlan(options = {}) { // Returns a plan object; only `domain` and `graph` come from options, everything else is a fixed literal.
+  const domain = options.domain || 'agent_reliability'; // any falsy options.domain falls back to 'agent_reliability'
+  const graph = options.graph || 'neo4j'; // any falsy options.graph falls back to 'neo4j'
+  return {
+    domain,
+    graph,
+    memoryTiers: [ // three tiers, each described by an id, a purpose string, and a ttl label
+      {
+        id: 'short_term',
+        purpose: 'Current session context so the agent does not re-ask answered questions.',
+        ttl: 'session',
+      },
+      {
+        id: 'long_term',
+        purpose: 'Durable user, product, workflow, and feedback profile facts.',
+        ttl: 'durable',
+      },
+      {
+        id: 'reasoning_memory',
+        purpose: 'Reusable decision paths that avoid recomputing expensive traversals.',
+        ttl: 'versioned',
+      },
+    ],
+    nodeTypes: [ // node type names, returned as plain strings
+      'User',
+      'Agent',
+      'Workflow',
+      'Feedback',
+      'Gate',
+      'Decision',
+      'Evidence',
+      'Recommendation',
+      'Outcome',
+    ],
+    relationshipTypes: [ // relationship type names, returned as plain strings
+      'GAVE_FEEDBACK',
+      'TRIGGERED_GATE',
+      'USED_EVIDENCE',
+      'RECOMMENDED_ACTION',
+      'PRODUCED_OUTCOME',
+      'SIMILAR_TO',
+      'REUSES_REASONING',
+    ],
+    highRoiUseCases: [ // descriptive strings only
+      'conversion recommendations with explainable evidence paths',
+      'compute savings from reasoning-memory cache hits',
+      'compliance audit trail for why an agent recommended or blocked an action',
+      'closed-loop profile updates from every feedback, purchase, or outcome event',
+    ],
+    gates: [ // gate rules expressed as text; nothing in this function enforces them
+      'do not recommend without an evidence path',
+      'do not reuse reasoning memory when source facts changed',
+      'write audit node for every recommendation and blocked action',
+      'record outcome feedback to update profile and graph edges',
+    ],
+  };
+}
+function buildRecommendationEvidencePath(input = {}) { // Builds an ordered path (User, similar profiles, evidence items, Recommendation) plus an `explainable` flag.
+  const userId = input.userId || 'unknown_user'; // placeholder id when input.userId is falsy
+  const recommendationId = input.recommendationId || 'rec_pending'; // placeholder id when input.recommendationId is falsy
+  const evidence = Array.isArray(input.evidence) ? input.evidence : []; // anything that is not an array is treated as no evidence
+  const similarProfiles = Array.isArray(input.similarProfiles) ? input.similarProfiles : []; // same rule for similarProfiles
+  return {
+    recommendationId,
+    path: [
+      { type: 'User', id: userId }, // the path always starts at the user
+      ...similarProfiles.map((id) => ({ type: 'SimilarProfile', id })), // each entry is used directly as the node id
+      ...evidence.map((item, index) => ({ // NOTE(review): a null/undefined entry would throw on item.type — confirm callers only pass objects
+        type: item.type || 'Evidence',
+        id: item.id || `evidence_${index + 1}`, // 1-based positional id when the item has none
+        quote: item.quote || null,
+      })),
+      { type: 'Recommendation', id: recommendationId }, // the path always ends at the recommendation
+    ],
+    explainable: evidence.length > 0, // true whenever at least one evidence entry was supplied
+  };
+}
+function evaluateKnowledgeLayerRun(run = {}) { // Checks a run record for required fields and returns { decision, issues, roiSignals }.
+  const issues = []; // collects one issue code per failed check, in the order checked below
+  if (!run.userId) issues.push('missing_user_id');
+  if (!run.recommendationId) issues.push('missing_recommendation_id');
+  if (!run.evidencePath?.explainable) issues.push('missing_explainable_evidence_path'); // also fires when evidencePath is absent
+  if (!run.auditNodeId) issues.push('missing_audit_node_id');
+  if (run.reusedReasoning && !run.reasoningVersion) issues.push('missing_reasoning_version'); // a version is required only when reasoning was reused
+  if (run.profileUpdate && !run.outcomeEventId) issues.push('missing_outcome_event_id'); // an outcome event id is required only when a profile update is reported
+  return {
+    decision: issues.length ? 'warn' : 'allow', // any issue yields 'warn'; no other decision value is produced here
+    issues,
+    roiSignals: [ // one signal per flag that is set on the run
+      run.reusedReasoning ? 'lower_graph_query_and_token_cost' : null,
+      run.profileUpdate ? 'closed_loop_personalization' : null,
+      run.auditNodeId ? 'compliance_trace_available' : null,
+    ].filter(Boolean), // drop the nulls so only applicable signals remain
+  };
+}
+module.exports = {
+  buildKnowledgeLayerPlan,
+  buildRecommendationEvidencePath,
+  evaluateKnowledgeLayerRun,
+};

package/scripts/lesson-inference.js CHANGED Viewed

@@ -431,27 +431,173 @@ function consumePhrase(lower, original, phrases) {
 // 6. LLM-Powered Structured Lesson Extraction
 // ---------------------------------------------------------------------------
+function createLessonPromptExample([ // Converts one positional exemplar tuple into { signal, conversationWindow, output }; the tuple is destructured in the order listed.
+  signal,
+  conversationWindow,
+  triggerCondition,
+  triggerType,
+  actionType,
+  actionDescription,
+  confidence,
+  scope,
+  tags,
+]) {
+  return {
+    signal,
+    conversationWindow: conversationWindow.join('\n'), // arrives as an array of lines; stored as one newline-joined string
+    output: { // the lesson object paired with this exemplar
+      trigger: { condition: triggerCondition, type: triggerType },
+      action: { type: actionType, description: actionDescription },
+      confidence,
+      scope,
+      tags,
+    },
+  };
+}
+// Five multishot exemplars pinned as a constant so they can be inspected/tested
+// independently of the prompt string. Each example pairs a (signal,
+// conversation_window) with the exact JSON output Claude should emit. These
+// were drafted from real ThumbGate incident classes: Edit-before-Read,
+// force-push-to-main, deploy-verification, mock-to-live-in-tests, and
+// regression-test-pinning. Changing any example shifts lesson extraction
+// behavior — treat it like a prompt version bump.
+const LLM_LESSON_MULTISHOT_EXAMPLES = [ // tuple order: signal, conversation lines, trigger condition, trigger type, action type, action description, confidence, scope, tags
+  [ // exemplar 1: Edit-before-Read
+    'negative',
+    [
+      '[user]: why is my edit failing?',
+      '[assistant]: I\'ll try editing src/api/server.js — Edit(src/api/server.js) failed: File has not been read yet.',
+      '[assistant]: Let me Read(src/api/server.js) first, then retry Edit.',
+      '[user]: that worked. thumbs down on the first attempt though.',
+    ],
+    'about to call Edit on a file that has not been Read in this session',
+    'constraint',
+    'avoid',
+    'Never call Edit on a file without first calling Read on it — the tool rejects unread files',
+    0.9,
+    'global',
+    ['Edit', 'Read', 'tool-use', 'preconditions'],
+  ],
+  [ // exemplar 2: force-push-to-main
+    'negative',
+    [
+      '[assistant]: Running git push --force origin main to clean up history.',
+      '[user]: NO. Never force-push to main. Thumbs down.',
+    ],
+    'about to run git push with --force or -f on the main branch',
+    'constraint',
+    'avoid',
+    'Never force-push to main; use incremental commits and open a new PR if history needs adjustment',
+    0.95,
+    'project-level',
+    ['git', 'force-push', 'main-branch', 'destructive'],
+  ],
+  [ // exemplar 3: deploy-verification
+    'positive',
+    [
+      '[user]: the deploy failed again, can you just verify with curl before declaring done?',
+      '[assistant]: Running curl https://thumbgate-production.up.railway.app/health and grepping for the expected version.',
+      '[assistant]: Got {"version":"1.15.0"} — matches package.json. Deploy confirmed.',
+      '[user]: perfect. thumbs up.',
+    ],
+    'after merging a PR that touches deployable code',
+    'implementation',
+    'do',
+    'Curl the /health endpoint and grep for the expected version before claiming a deploy succeeded',
+    0.85,
+    'project-level',
+    ['deploy', 'verification', 'curl', 'railway'],
+  ],
+  [ // exemplar 4: mock-to-live-in-tests
+    'negative',
+    [
+      '[user]: these test failures are weird',
+      '[assistant]: Let me check... the tests are using mocked Stripe. Maybe real Stripe is needed?',
+      '[assistant]: Replaced mocks with live Stripe calls. Tests now passing.',
+      '[user]: tests should not call real Stripe in CI. Thumbs down.',
+    ],
+    'a test is failing and the temptation is to replace mocks with live API calls',
+    'debugging',
+    'avoid',
+    'Never swap mocks for real external API calls to make tests pass — inject dependencies and stub the client instead',
+    0.9,
+    'project-level',
+    ['testing', 'mocking', 'stripe', 'ci'],
+  ],
+  [ // exemplar 5: regression-test-pinning
+    'positive',
+    [
+      '[user]: add a regression test that pins the distinct tier icons',
+      '[assistant]: Added assert.notEqual(proIcon, teamIcon) to tests/billing.test.js, plus the URL regex per tier.',
+      '[user]: great, locking that in. thumbs up.',
+    ],
+    'fixing a bug where two products rendered with the same asset',
+    'implementation',
+    'do',
+    'Add a regression assertion that pins the distinct values (e.g. assert.notEqual(a,b)) so the collapse cannot silently return',
+    0.9,
+    'file-level',
+    ['regression-test', 'stripe', 'billing', 'tier'],
+  ],
+].map(createLessonPromptExample); // each tuple becomes a { signal, conversationWindow, output } object
+function renderMultishotExamplesForPrompt(examples = LLM_LESSON_MULTISHOT_EXAMPLES) { // Renders exemplars as <example> blocks; defaults to the pinned exemplar list.
+  return examples.map((ex) => ( // one block per exemplar; ex.output is serialized with JSON.stringify inside <output>
+    `<example>
+<signal>${ex.signal}</signal>
+<conversation_window>
+${ex.conversationWindow}
+</conversation_window>
+<output>${JSON.stringify(ex.output)}</output>
+</example>`
+  )).join('\n'); // blocks are separated by a single newline
+}
+// Anthropic's prompt-engineering playbook (ref: anthropic.skilljar.com
+// Prompt Engineering course) recommends XML tags to scope context blocks and
+// multishot exemplars so the model sees the exact expected shape before being
+// asked to produce it. Both techniques apply cleanly here because the output
+// is a strict JSON schema and the extraction task has five recurring incident
+// classes (see LLM_LESSON_MULTISHOT_EXAMPLES).
 const LLM_LESSON_SYSTEM_PROMPT = `You are a lesson extraction engine for an AI coding agent safety system called ThumbGate.
-Given a conversation window and a feedback signal (positive or negative), extract a structured lesson.
+<task>
+Given a feedback signal (positive or negative) and a conversation window, extract a structured if-then lesson that would prevent the same mistake (negative) or reinforce the same success (positive) in future sessions.
+</task>
-Return ONLY valid JSON matching this exact schema:
+<output_schema>
+Return ONLY valid JSON matching this exact shape — no prose, no code fences, no text outside the JSON object:
 {
-  "trigger": { "condition": "<when this lesson applies>", "type": "<one of: debugging, implementation, question, error-report, constraint>" },
-  "action": { "type": "<do or avoid>", "description": "<specific action to take or avoid>" },
+  "trigger": { "condition": "<when this lesson applies>", "type": "<debugging|implementation|question|error-report|constraint>" },
+  "action":  { "type": "<do|avoid>", "description": "<specific action to take or avoid>" },
   "confidence": <0.0 to 1.0>,
-  "scope": "<global, file-level, or project-level>",
+  "scope": "<global|file-level|project-level>",
   "tags": ["<relevant tags>"]
 }
-Guidelines:
-- Be specific and actionable. "Avoid: editing files without reading them first" is better than "Avoid: bad edits".
-- confidence should reflect how clear the lesson is from the conversation context.
-- tags should include tool names, file types, or domain areas mentioned.
-- Do NOT include any text outside the JSON object.`;
+</output_schema>
+<guidelines>
+- Be specific and actionable. "Avoid editing files without reading them first" beats "Avoid bad edits".
+- confidence should reflect how clear the lesson is from the window. A single ambiguous exchange caps around 0.5; a reproduced failure with a confirmed fix can reach 0.9.
+- tags should include tool names, file types, or domain areas mentioned in the conversation.
+- Emit JSON only. No code fences, no commentary.
+</guidelines>
+<examples>
+${renderMultishotExamplesForPrompt()}
+</examples>`;
+function buildLessonUserPrompt({ signal, context, windowText }) { // Assembles the user prompt as tagged sections: signal, optional user_context, conversation_window.
+  const normalizedSignal = signal === 'positive' || signal === 'up' ? 'positive' : 'negative'; // only 'positive' or 'up' count as positive; every other value becomes 'negative'
+  const parts = [`<signal>${normalizedSignal}</signal>`];
+  if (context) parts.push(`<user_context>${context}</user_context>`); // omitted when context is falsy
+  parts.push(`<conversation_window>\n${windowText}\n</conversation_window>`); // NOTE(review): context and windowText are interpolated without escaping — confirm a literal closing tag in the input cannot matter
+  return parts.join('\n'); // sections are newline-separated
+}
 async function inferStructuredLessonLLM(conversationWindow, signal, context) {
-  const { isAvailable, callClaude, MODELS } = require('./llm-client');
+  const { isAvailable, callClaudeJson, MODELS } = require('./llm-client');
   if (!isAvailable()) return null;
   const normalizedWindow = Array.isArray(conversationWindow) ? conversationWindow : [];
@@ -463,47 +609,37 @@ async function inferStructuredLessonLLM(conversationWindow, signal, context) {
     .join('\n')
     .slice(0, 4000);
-  const userPrompt = [
-    `Signal: ${signal === 'positive' || signal === 'up' ? 'positive (thumbs up — something worked well)' : 'negative (thumbs down — something went wrong)'}`,
-    context ? `User context: ${context}` : '',
-    `\nConversation:\n${windowText}`,
-  ].filter(Boolean).join('\n');
+  const userPrompt = buildLessonUserPrompt({ signal, context, windowText });
-  const raw = await callClaude({
+  const parsed = await callClaudeJson({
     systemPrompt: LLM_LESSON_SYSTEM_PROMPT,
     userPrompt,
     model: MODELS.FAST,
     maxTokens: 512,
+    cache: true,
   });
-  if (!raw) return null;
-  try {
-    const parsed = JSON.parse(raw);
-    if (!parsed.trigger || !parsed.action) return null;
+  if (!parsed || !parsed.trigger || !parsed.action) return null;
-    const filePaths = extractFilePaths(normalizedWindow);
-    const toolCalls = extractToolCalls(normalizedWindow);
-    const errorPatterns = extractErrors(normalizedWindow);
-    const userMessages = normalizedWindow.filter((m) => m.role === 'user');
-    const assistantMessages = normalizedWindow.filter((m) => m.role === 'assistant');
-    const lastUser = userMessages[userMessages.length - 1]?.content || '';
-    const lastAssistant = assistantMessages[assistantMessages.length - 1]?.content || '';
+  const filePaths = extractFilePaths(normalizedWindow);
+  const toolCalls = extractToolCalls(normalizedWindow);
+  const errorPatterns = extractErrors(normalizedWindow);
+  const userMessages = normalizedWindow.filter((m) => m.role === 'user');
+  const assistantMessages = normalizedWindow.filter((m) => m.role === 'assistant');
+  const lastUser = userMessages[userMessages.length - 1]?.content || '';
+  const lastAssistant = assistantMessages[assistantMessages.length - 1]?.content || '';
-    return {
-      format: 'if-then-v1-llm',
-      trigger: parsed.trigger,
-      action: parsed.action,
-      signal: signal === 'positive' || signal === 'up' ? 'positive' : 'negative',
-      confidence: Math.max(0, Math.min(1, Number(parsed.confidence) || 0.5)),
-      scope: parsed.scope || inferScope(filePaths, toolCalls),
-      examples: [{ userIntent: lastUser.slice(0, 300), assistantAction: lastAssistant.slice(0, 300), outcome: signal === 'positive' || signal === 'up' ? 'approved' : 'rejected' }],
-      metadata: { toolsUsed: toolCalls, filesInvolved: filePaths.slice(0, 10), errorPatterns: errorPatterns.slice(0, 5), conversationLength: normalizedWindow.length, inferredAt: new Date().toISOString(), llmModel: MODELS.FAST },
-      tags: Array.isArray(parsed.tags) ? parsed.tags : [],
-    };
-  } catch {
-    return null;
-  }
+  return {
+    format: 'if-then-v1-llm',
+    trigger: parsed.trigger,
+    action: parsed.action,
+    signal: signal === 'positive' || signal === 'up' ? 'positive' : 'negative',
+    confidence: Math.max(0, Math.min(1, Number(parsed.confidence) || 0.5)),
+    scope: parsed.scope || inferScope(filePaths, toolCalls),
+    examples: [{ userIntent: lastUser.slice(0, 300), assistantAction: lastAssistant.slice(0, 300), outcome: signal === 'positive' || signal === 'up' ? 'approved' : 'rejected' }],
+    metadata: { toolsUsed: toolCalls, filesInvolved: filePaths.slice(0, 10), errorPatterns: errorPatterns.slice(0, 5), conversationLength: normalizedWindow.length, inferredAt: new Date().toISOString(), llmModel: MODELS.FAST },
+    tags: Array.isArray(parsed.tags) ? parsed.tags : [],
+  };
 }
 module.exports = {
@@ -515,4 +651,7 @@ module.exports = {
   inferStructuredLesson, inferStructuredLessonLLM,
   extractTrigger, extractAction, extractToolCalls,
   extractFilePaths, extractErrors, calculateConfidence, inferScope,
+  // Exported for prompt-shape regression tests.
+  LLM_LESSON_SYSTEM_PROMPT, LLM_LESSON_MULTISHOT_EXAMPLES,
+  renderMultishotExamplesForPrompt, buildLessonUserPrompt,
 };

package/scripts/lesson-search.js CHANGED Viewed

@@ -2,6 +2,7 @@
 const path = require('node:path');
 const { readJSONL, getFeedbackPaths } = require('./feedback-loop');
+const { loadOptionalModule } = require('./private-core-boundary');
 const HIGH_RISK_TAGS = new Set([
   'billing',
@@ -514,7 +515,9 @@ function searchLessons(query = '', options = {}) {
   // Cross-encoder reranking: when a query is present, rerank the top-50 bi-encoder
   // candidates using field-weighted BM25 so the most relevant lessons surface first.
   if (query && results.length > 1) {
-    const { rerankLessons } = require('./lesson-reranker');
+    const { rerankLessons } = loadOptionalModule('./lesson-reranker', () => ({
+      rerankLessons: (_query, pool) => pool,
+    }));
     const pool = results.slice(0, 50);
     const tail  = results.slice(50);
     const reranked = rerankLessons(query, pool, { topK: pool.length });

package/scripts/llm-client.js CHANGED Viewed

@@ -10,6 +10,7 @@ const MODELS = {
 const DEFAULT_MODEL = MODELS.FAST;
 const DEFAULT_MAX_TOKENS = 1024;
+const DEFAULT_CACHE_TTL = '5m';
 let _client = null;
@@ -35,40 +36,170 @@ function stripCodeFences(text) {
   return fenced ? fenced[1].trim() : text.trim();
 }
-// Anthropic SDK throws errors with a `.status` field for HTTP failures.
-// Our defaultClassify already reads `.status`, so 429/5xx retry and 4xx
-// (bad request / unauthorized / not-found) bail immediately — which is
-// what we want: there is no point retrying a malformed prompt or a
-// revoked API key.
-async function callClaude({ systemPrompt, userPrompt, model, maxTokens } = {}) {
+function normalizeCacheOptions(cache) { // Normalizes the `cache` option (true, a TTL string, or an options object) into { mode, control }, or null when caching is off.
+  if (!cache) return null; // any falsy value, including '', disables caching
+  if (cache === true) { // shorthand: system-prompt caching with the default TTL
+    return {
+      mode: 'system',
+      control: { type: 'ephemeral', ttl: DEFAULT_CACHE_TTL },
+    };
+  }
+  if (typeof cache === 'string') { // a non-empty string is taken as the TTL; it is passed through without validation
+    return {
+      mode: 'system',
+      control: { type: 'ephemeral', ttl: cache },
+    };
+  }
+  if (typeof cache !== 'object') return null; // remaining non-object values (e.g. numbers) are ignored
+  const ttl = typeof cache.ttl === 'string' && cache.ttl ? cache.ttl : DEFAULT_CACHE_TTL; // empty or non-string fields fall back to defaults
+  const type = typeof cache.type === 'string' && cache.type ? cache.type : 'ephemeral';
+  const mode = typeof cache.mode === 'string' && cache.mode ? cache.mode : 'system';
+  return {
+    mode,
+    control: { type, ttl },
+  };
+}
+function applyCacheToSystem(systemPrompt, cacheOptions) { // Returns the value for request.system: undefined, the plain string, or a one-element text-block array carrying cache_control.
+  if (!systemPrompt) return undefined; // no system prompt at all
+  if (!cacheOptions || (cacheOptions.mode !== 'system' && cacheOptions.mode !== 'tools+system')) { // only the 'system' and 'tools+system' modes tag the system prompt
+    return systemPrompt;
+  }
+  return [{ type: 'text', text: systemPrompt, cache_control: cacheOptions.control }];
+}
+function applyCacheToTools(tools, cacheOptions) { // Returns the tools array for the request, adding cache_control to each tool in the 'tools' and 'tools+system' modes.
+  if (!Array.isArray(tools) || tools.length === 0) return undefined; // missing or empty tools yield undefined
+  if (!cacheOptions || (cacheOptions.mode !== 'tools' && cacheOptions.mode !== 'tools+system')) { // other modes return the caller's array unchanged
+    return tools;
+  }
+  return tools.map((tool) => {
+    if (!tool || typeof tool !== 'object' || tool.cache_control) return tool; // non-object entries and tools that already carry cache_control pass through untouched
+    return { ...tool, cache_control: cacheOptions.control }; // shallow copy, so the caller's tool object is not mutated
+  });
+}
+function buildClaudeRequest({ // Builds the payload object that callClaudeInternal passes to client.messages.create.
+  systemPrompt,
+  userPrompt,
+  messages,
+  model,
+  maxTokens,
+  cache,
+  tools,
+  toolChoice,
+  metadata,
+  temperature,
+} = {}) {
+  const cacheOptions = normalizeCacheOptions(cache); // null when caching is off
+  const request = {
+    model: model || DEFAULT_MODEL, // falsy model / maxTokens fall back to the module defaults
+    max_tokens: maxTokens || DEFAULT_MAX_TOKENS,
+    messages: Array.isArray(messages) && messages.length > 0 // a non-empty messages array wins; otherwise userPrompt becomes a single user message
+      ? messages
+      : [{ role: 'user', content: userPrompt }],
+  };
+  const normalizedSystem = applyCacheToSystem(systemPrompt, cacheOptions);
+  if (normalizedSystem) request.system = normalizedSystem; // the key is left off entirely when there is no system prompt
+  const normalizedTools = applyCacheToTools(tools, cacheOptions);
+  if (normalizedTools) request.tools = normalizedTools; // likewise left off when there are no tools
+  if (toolChoice) request.tool_choice = toolChoice;
+  if (metadata && typeof metadata === 'object') request.metadata = metadata;
+  if (Number.isFinite(temperature)) request.temperature = temperature; // only finite numbers are forwarded, so 0 is accepted
+  if (cacheOptions && cacheOptions.mode === 'request') { // 'request' mode puts cache_control at the top level of the payload rather than on system/tools
+    request.cache_control = cacheOptions.control;
+  }
+  return request;
+}
+function extractTextContent(response) { // Concatenates the text of every 'text' block in response.content; yields '' when content is missing or empty.
+  return (response?.content || [])
+    .filter((block) => block.type === 'text') // blocks of any other type are skipped
+    .map((block) => block.text)
+    .join(''); // joined with no separator
+}
+function parseClaudeJson(text) { // Parses model output as JSON after stripCodeFences; returns null for non-strings and for invalid JSON.
+  if (typeof text !== 'string') return null;
+  try {
+    return JSON.parse(stripCodeFences(text)); // a reply of literal JSON null also comes back as null
+  } catch { // parse failures are reported as null rather than thrown
+    return null;
+  }
+}
+async function callClaudeInternal(options = {}) {
   const client = getClient();
   if (!client) return null;
   try {
-    const text = await runStep('llm.callClaude', {
+    const response = await runStep('llm.callClaude', {
       retries: 2,
       logger: (msg) => console.warn(msg),
-    }, async () => {
-      const response = await client.messages.create({
-        model: model || DEFAULT_MODEL,
-        max_tokens: maxTokens || DEFAULT_MAX_TOKENS,
-        system: systemPrompt || undefined,
-        messages: [{ role: 'user', content: userPrompt }],
-      });
-      return response.content
-        .filter((b) => b.type === 'text')
-        .map((b) => b.text)
-        .join('');
-    });
-    return stripCodeFences(text);
+    }, async () => client.messages.create(buildClaudeRequest(options)));
+    const text = stripCodeFences(extractTextContent(response));
+    return {
+      text,
+      usage: response?.usage || null,
+      stopReason: response?.stop_reason || null,
+      id: response?.id || null,
+      model: response?.model || options.model || DEFAULT_MODEL,
+    };
   } catch {
-    // Preserve the original callClaude contract — callers expect `null` on
-    // failure, not an exception. runStep already logged retry attempts,
-    // so the permanent failure is visible in logs.
     return null;
   }
 }
-module.exports = { isAvailable, callClaude, stripCodeFences, MODELS };
+// Anthropic SDK throws errors with a `.status` field for HTTP failures.
+// Our defaultClassify already reads `.status`, so 429/5xx retry and 4xx
+// (bad request / unauthorized / not-found) bail immediately — which is
+// what we want: there is no point retrying a malformed prompt or a
+// revoked API key.
+async function callClaude(options = {}) { // Public wrapper: resolves to the response text, or to the full result object when options.returnMetadata is truthy.
+  const result = await callClaudeInternal(options);
+  if (!result) return null; // callClaudeInternal yields null when no client is available or the request failed
+  return options.returnMetadata ? result : result.text; // the result object carries text, usage, stopReason, id, and model
+}
+async function callClaudeJson(options = {}) { // Like callClaude, but parses the response text as JSON; resolves to null on request failure or unparseable output.
+  const result = await callClaudeInternal(options);
+  if (!result) return null; // no client, or the request failed
+  const parsed = parseClaudeJson(result.text); // result.text was already fence-stripped by callClaudeInternal; parseClaudeJson applies stripCodeFences again
+  if (parsed === null) return null; // a model reply of literal JSON null is indistinguishable from a parse failure here
+  if (options.returnMetadata) { // with returnMetadata, the parsed value is returned alongside the response details
+    return {
+      parsed,
+      text: result.text,
+      usage: result.usage,
+      stopReason: result.stopReason,
+      id: result.id,
+      model: result.model,
+    };
+  }
+  return parsed;
+}
+module.exports = {
+  isAvailable,
+  callClaude,
+  callClaudeJson,
+  stripCodeFences,
+  parseClaudeJson,
+  normalizeCacheOptions,
+  buildClaudeRequest,
+  MODELS,
+};