thumbgate 1.15.0 → 1.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (129) hide show
  1. package/.claude-plugin/marketplace.json +6 -6
  2. package/.claude-plugin/plugin.json +3 -3
  3. package/.well-known/llms.txt +5 -5
  4. package/.well-known/mcp/server-card.json +1 -1
  5. package/README.md +59 -35
  6. package/adapters/chatgpt/openapi.yaml +118 -2
  7. package/adapters/claude/.mcp.json +2 -2
  8. package/adapters/mcp/server-stdio.js +210 -84
  9. package/adapters/opencode/opencode.json +1 -1
  10. package/bench/prompt-eval-suite.json +5 -1
  11. package/bin/cli.js +157 -8
  12. package/config/evals/agent-safety-eval.json +338 -22
  13. package/config/gates/routine.json +43 -0
  14. package/config/github-about.json +3 -3
  15. package/config/model-candidates.json +131 -0
  16. package/openapi/openapi.yaml +118 -2
  17. package/package.json +55 -48
  18. package/public/blog.html +7 -7
  19. package/public/codex-plugin.html +6 -6
  20. package/public/compare.html +29 -23
  21. package/public/dashboard.html +82 -10
  22. package/public/guide.html +28 -28
  23. package/public/index.html +216 -98
  24. package/public/learn.html +50 -22
  25. package/public/lessons.html +1 -1
  26. package/public/numbers.html +17 -17
  27. package/public/pro.html +82 -18
  28. package/scripts/agent-audit-trace.js +55 -0
  29. package/scripts/agent-memory-lifecycle.js +96 -0
  30. package/scripts/agent-readiness-plan.js +118 -0
  31. package/scripts/agentic-data-pipeline.js +21 -1
  32. package/scripts/agents-sdk-sandbox-plan.js +57 -0
  33. package/scripts/ai-org-governance.js +98 -0
  34. package/scripts/ai-search-distribution.js +43 -0
  35. package/scripts/artifact-agent-plan.js +81 -0
  36. package/scripts/billing.js +27 -8
  37. package/scripts/cli-schema.js +18 -2
  38. package/scripts/code-mode-mcp-plan.js +71 -0
  39. package/scripts/context-engine.js +1 -2
  40. package/scripts/context-manager.js +4 -1
  41. package/scripts/dashboard-render-spec.js +1 -1
  42. package/scripts/dashboard.js +275 -9
  43. package/scripts/decision-journal.js +13 -3
  44. package/scripts/document-workflow-governance.js +62 -0
  45. package/scripts/enterprise-agent-rollout.js +34 -0
  46. package/scripts/experience-replay-governance.js +69 -0
  47. package/scripts/export-hf-dataset.js +1 -1
  48. package/scripts/feedback-loop.js +92 -4
  49. package/scripts/feedback-to-rules.js +17 -23
  50. package/scripts/gates-engine.js +4 -6
  51. package/scripts/growth-campaigns.js +49 -0
  52. package/scripts/harness-selector.js +16 -4
  53. package/scripts/hybrid-supervisor-agent.js +64 -0
  54. package/scripts/inference-cache-policy.js +72 -0
  55. package/scripts/inference-economics.js +53 -0
  56. package/scripts/internal-agent-bootstrap.js +12 -2
  57. package/scripts/knowledge-layer-plan.js +108 -0
  58. package/scripts/lesson-inference.js +183 -44
  59. package/scripts/lesson-search.js +4 -1
  60. package/scripts/llm-client.js +157 -26
  61. package/scripts/mailer/resend-mailer.js +112 -1
  62. package/scripts/mcp-transport-strategy.js +66 -0
  63. package/scripts/memory-store-governance.js +60 -0
  64. package/scripts/meta-agent-loop.js +7 -13
  65. package/scripts/model-access-eligibility.js +38 -0
  66. package/scripts/model-migration-readiness.js +55 -0
  67. package/scripts/operational-integrity.js +96 -3
  68. package/scripts/otel-declarative-config.js +56 -0
  69. package/scripts/perplexity-client.js +1 -1
  70. package/scripts/post-training-governance.js +34 -0
  71. package/scripts/private-core-boundary.js +72 -0
  72. package/scripts/production-agent-readiness.js +40 -0
  73. package/scripts/prompt-eval.js +564 -32
  74. package/scripts/prompt-programs.js +93 -0
  75. package/scripts/provider-action-normalizer.js +585 -0
  76. package/scripts/scaling-law-claims.js +60 -0
  77. package/scripts/security-scanner.js +1 -1
  78. package/scripts/self-distill-agent.js +7 -32
  79. package/scripts/seo-gsd.js +232 -55
  80. package/scripts/skill-rag-router.js +53 -0
  81. package/scripts/spec-gate.js +1 -1
  82. package/scripts/student-consistent-training.js +73 -0
  83. package/scripts/synthetic-data-provenance.js +98 -0
  84. package/scripts/task-context-result.js +81 -0
  85. package/scripts/telemetry-analytics.js +149 -0
  86. package/scripts/thompson-sampling.js +2 -2
  87. package/scripts/token-savings.js +7 -6
  88. package/scripts/token-tco.js +46 -0
  89. package/scripts/tool-registry.js +63 -3
  90. package/scripts/verification-loop.js +10 -1
  91. package/scripts/verifier-scoring.js +71 -0
  92. package/scripts/workflow-sentinel.js +284 -28
  93. package/scripts/workspace-agent-routines.js +118 -0
  94. package/src/api/server.js +381 -120
  95. package/scripts/analytics-report.js +0 -328
  96. package/scripts/autonomous-workflow.js +0 -377
  97. package/scripts/billing-setup.js +0 -109
  98. package/scripts/creator-campaigns.js +0 -239
  99. package/scripts/cross-encoder-reranker.js +0 -235
  100. package/scripts/daemon-manager.js +0 -108
  101. package/scripts/decision-trace.js +0 -354
  102. package/scripts/delegation-runtime.js +0 -896
  103. package/scripts/dispatch-brief.js +0 -159
  104. package/scripts/distribution-surfaces.js +0 -110
  105. package/scripts/feedback-history-distiller.js +0 -382
  106. package/scripts/funnel-analytics.js +0 -35
  107. package/scripts/history-distiller.js +0 -200
  108. package/scripts/hosted-job-launcher.js +0 -256
  109. package/scripts/intent-router.js +0 -392
  110. package/scripts/lesson-reranker.js +0 -263
  111. package/scripts/lesson-retrieval.js +0 -148
  112. package/scripts/managed-lesson-agent.js +0 -183
  113. package/scripts/operational-dashboard.js +0 -103
  114. package/scripts/operational-summary.js +0 -129
  115. package/scripts/operator-artifacts.js +0 -608
  116. package/scripts/optimize-context.js +0 -17
  117. package/scripts/org-dashboard.js +0 -206
  118. package/scripts/partner-orchestration.js +0 -146
  119. package/scripts/predictive-insights.js +0 -356
  120. package/scripts/pulse.js +0 -80
  121. package/scripts/reflector-agent.js +0 -221
  122. package/scripts/sales-pipeline.js +0 -681
  123. package/scripts/session-episode-store.js +0 -329
  124. package/scripts/session-health-sensor.js +0 -242
  125. package/scripts/session-report.js +0 -120
  126. package/scripts/swarm-coordinator.js +0 -81
  127. package/scripts/tool-kpi-tracker.js +0 -12
  128. package/scripts/webhook-delivery.js +0 -62
  129. package/scripts/workflow-sprint-intake.js +0 -475
@@ -0,0 +1,108 @@
1
+ 'use strict';
2
+
3
/**
 * Build the static plan describing a graph-backed knowledge layer.
 *
 * Every call returns freshly allocated arrays and objects, so callers may
 * mutate the result without affecting later calls.
 *
 * @param {object} [options]
 * @param {string} [options.domain] - Domain label; falsy values fall back to 'agent_reliability'.
 * @param {string} [options.graph] - Graph backend label; falsy values fall back to 'neo4j'.
 * @returns {{domain: string, graph: string, memoryTiers: object[], nodeTypes: string[],
 *   relationshipTypes: string[], highRoiUseCases: string[], gates: string[]}}
 */
function buildKnowledgeLayerPlan(options = {}) {
  const { domain: requestedDomain, graph: requestedGraph } = options;

  // Small factory keeps the three tier records visually aligned.
  const tier = (id, purpose, ttl) => ({ id, purpose, ttl });

  const memoryTiers = [
    tier('short_term', 'Current session context so the agent does not re-ask answered questions.', 'session'),
    tier('long_term', 'Durable user, product, workflow, and feedback profile facts.', 'durable'),
    tier('reasoning_memory', 'Reusable decision paths that avoid recomputing expensive traversals.', 'versioned'),
  ];

  const nodeTypes = [
    'User', 'Agent', 'Workflow', 'Feedback', 'Gate',
    'Decision', 'Evidence', 'Recommendation', 'Outcome',
  ];

  const relationshipTypes = [
    'GAVE_FEEDBACK', 'TRIGGERED_GATE', 'USED_EVIDENCE', 'RECOMMENDED_ACTION',
    'PRODUCED_OUTCOME', 'SIMILAR_TO', 'REUSES_REASONING',
  ];

  const highRoiUseCases = [
    'conversion recommendations with explainable evidence paths',
    'compute savings from reasoning-memory cache hits',
    'compliance audit trail for why an agent recommended or blocked an action',
    'closed-loop profile updates from every feedback, purchase, or outcome event',
  ];

  const gates = [
    'do not recommend without an evidence path',
    'do not reuse reasoning memory when source facts changed',
    'write audit node for every recommendation and blocked action',
    'record outcome feedback to update profile and graph edges',
  ];

  return {
    domain: requestedDomain || 'agent_reliability',
    graph: requestedGraph || 'neo4j',
    memoryTiers,
    nodeTypes,
    relationshipTypes,
    highRoiUseCases,
    gates,
  };
}
61
+
62
/**
 * Assemble the ordered node path that explains a recommendation:
 * User -> similar profiles -> evidence items -> Recommendation.
 *
 * @param {object} [input]
 * @param {string} [input.userId] - Falls back to 'unknown_user' when falsy.
 * @param {string} [input.recommendationId] - Falls back to 'rec_pending' when falsy.
 * @param {Array<{type?: string, id?: string, quote?: string}>} [input.evidence]
 *   Evidence items; a non-array value is treated as no evidence.
 * @param {string[]} [input.similarProfiles] - Ids of similar profiles; non-array is ignored.
 * @returns {{recommendationId: string, path: object[], explainable: boolean}}
 *   `explainable` is true only when at least one usable evidence item exists.
 */
function buildRecommendationEvidencePath(input = {}) {
  // The default parameter only covers `undefined`; an explicit null must not throw.
  const source = input ?? {};
  const userId = source.userId || 'unknown_user';
  const recommendationId = source.recommendationId || 'rec_pending';
  const similarProfiles = Array.isArray(source.similarProfiles) ? source.similarProfiles : [];

  // Drop null/undefined entries up front: they carry no evidence, would crash
  // the property reads below, and must not make the path count as explainable.
  const evidence = Array.isArray(source.evidence)
    ? source.evidence.filter((item) => item !== null && item !== undefined)
    : [];

  return {
    recommendationId,
    path: [
      { type: 'User', id: userId },
      ...similarProfiles.map((id) => ({ type: 'SimilarProfile', id })),
      ...evidence.map((item, index) => ({
        type: item.type || 'Evidence',
        id: item.id || `evidence_${index + 1}`,
        quote: item.quote || null,
      })),
      { type: 'Recommendation', id: recommendationId },
    ],
    explainable: evidence.length > 0,
  };
}
83
+
84
/**
 * Check one knowledge-layer run record for the fields the layer's gates rely on.
 *
 * @param {object} [run] - Run record; null/undefined is evaluated as an empty record.
 * @returns {{decision: 'warn'|'allow', issues: string[], roiSignals: string[]}}
 *   `decision` is 'warn' when any issue was found, otherwise 'allow'.
 */
function evaluateKnowledgeLayerRun(run = {}) {
  // A missing record is the most incomplete run possible: report every
  // unconditional issue instead of throwing on the first property read.
  const record = run ?? {};

  // [issue code, requirement satisfied?] in reporting order.
  const requirements = [
    ['missing_user_id', Boolean(record.userId)],
    ['missing_recommendation_id', Boolean(record.recommendationId)],
    ['missing_explainable_evidence_path', Boolean(record.evidencePath?.explainable)],
    ['missing_audit_node_id', Boolean(record.auditNodeId)],
    // Conditional requirements: only enforced when the related feature was used.
    ['missing_reasoning_version', !record.reusedReasoning || Boolean(record.reasoningVersion)],
    ['missing_outcome_event_id', !record.profileUpdate || Boolean(record.outcomeEventId)],
  ];

  const issues = requirements
    .filter(([, satisfied]) => !satisfied)
    .map(([code]) => code);

  const roiSignals = [];
  if (record.reusedReasoning) roiSignals.push('lower_graph_query_and_token_cost');
  if (record.profileUpdate) roiSignals.push('closed_loop_personalization');
  if (record.auditNodeId) roiSignals.push('compliance_trace_available');

  return {
    decision: issues.length > 0 ? 'warn' : 'allow',
    issues,
    roiSignals,
  };
}
103
+
104
+ module.exports = {
105
+ buildKnowledgeLayerPlan,
106
+ buildRecommendationEvidencePath,
107
+ evaluateKnowledgeLayerRun,
108
+ };
@@ -431,27 +431,173 @@ function consumePhrase(lower, original, phrases) {
431
431
  // 6. LLM-Powered Structured Lesson Extraction
432
432
  // ---------------------------------------------------------------------------
433
433
 
434
/**
 * Turn one positional example row into the structured exemplar shape.
 *
 * Row layout: [signal, conversationWindow (array of lines), triggerCondition,
 * triggerType, actionType, actionDescription, confidence, scope, tags].
 *
 * @param {Array} row - Positional tuple as described above.
 * @returns {{signal: *, conversationWindow: string, output: object}}
 *   `conversationWindow` is the input lines joined with newlines.
 */
function createLessonPromptExample(row) {
  const [
    signal,
    windowLines,
    condition,
    triggerKind,
    actionKind,
    description,
    confidence,
    scope,
    tags,
  ] = row;

  const trigger = { condition, type: triggerKind };
  const action = { type: actionKind, description };

  return {
    signal,
    conversationWindow: windowLines.join('\n'),
    output: { trigger, action, confidence, scope, tags },
  };
}
457
+
458
+ // Five multishot exemplars pinned as a constant so they can be inspected/tested
459
+ // independently of the prompt string. Each example pairs a (signal,
460
+ // conversation_window) with the exact JSON output Claude should emit. These
461
+ // were drafted from real ThumbGate incident classes: Edit-before-Read,
462
+ // force-push-to-main, deploy-verification, mock-to-live-in-tests, and
463
+ // regression-test-pinning. Changing any example shifts lesson extraction
464
+ // behavior — treat it like a prompt version bump.
465
+ const LLM_LESSON_MULTISHOT_EXAMPLES = [
466
+ [
467
+ 'negative',
468
+ [
469
+ '[user]: why is my edit failing?',
470
+ '[assistant]: I\'ll try editing src/api/server.js — Edit(src/api/server.js) failed: File has not been read yet.',
471
+ '[assistant]: Let me Read(src/api/server.js) first, then retry Edit.',
472
+ '[user]: that worked. thumbs down on the first attempt though.',
473
+ ],
474
+ 'about to call Edit on a file that has not been Read in this session',
475
+ 'constraint',
476
+ 'avoid',
477
+ 'Never call Edit on a file without first calling Read on it — the tool rejects unread files',
478
+ 0.9,
479
+ 'global',
480
+ ['Edit', 'Read', 'tool-use', 'preconditions'],
481
+ ],
482
+ [
483
+ 'negative',
484
+ [
485
+ '[assistant]: Running git push --force origin main to clean up history.',
486
+ '[user]: NO. Never force-push to main. Thumbs down.',
487
+ ],
488
+ 'about to run git push with --force or -f on the main branch',
489
+ 'constraint',
490
+ 'avoid',
491
+ 'Never force-push to main; use incremental commits and open a new PR if history needs adjustment',
492
+ 0.95,
493
+ 'project-level',
494
+ ['git', 'force-push', 'main-branch', 'destructive'],
495
+ ],
496
+ [
497
+ 'positive',
498
+ [
499
+ '[user]: the deploy failed again, can you just verify with curl before declaring done?',
500
+ '[assistant]: Running curl https://thumbgate-production.up.railway.app/health and grepping for the expected version.',
501
+ '[assistant]: Got {"version":"1.15.0"} — matches package.json. Deploy confirmed.',
502
+ '[user]: perfect. thumbs up.',
503
+ ],
504
+ 'after merging a PR that touches deployable code',
505
+ 'implementation',
506
+ 'do',
507
+ 'Curl the /health endpoint and grep for the expected version before claiming a deploy succeeded',
508
+ 0.85,
509
+ 'project-level',
510
+ ['deploy', 'verification', 'curl', 'railway'],
511
+ ],
512
+ [
513
+ 'negative',
514
+ [
515
+ '[user]: these test failures are weird',
516
+ '[assistant]: Let me check... the tests are using mocked Stripe. Maybe real Stripe is needed?',
517
+ '[assistant]: Replaced mocks with live Stripe calls. Tests now passing.',
518
+ '[user]: tests should not call real Stripe in CI. Thumbs down.',
519
+ ],
520
+ 'a test is failing and the temptation is to replace mocks with live API calls',
521
+ 'debugging',
522
+ 'avoid',
523
+ 'Never swap mocks for real external API calls to make tests pass — inject dependencies and stub the client instead',
524
+ 0.9,
525
+ 'project-level',
526
+ ['testing', 'mocking', 'stripe', 'ci'],
527
+ ],
528
+ [
529
+ 'positive',
530
+ [
531
+ '[user]: add a regression test that pins the distinct tier icons',
532
+ '[assistant]: Added assert.notEqual(proIcon, teamIcon) to tests/billing.test.js, plus the URL regex per tier.',
533
+ '[user]: great, locking that in. thumbs up.',
534
+ ],
535
+ 'fixing a bug where two products rendered with the same asset',
536
+ 'implementation',
537
+ 'do',
538
+ 'Add a regression assertion that pins the distinct values (e.g. assert.notEqual(a,b)) so the collapse cannot silently return',
539
+ 0.9,
540
+ 'file-level',
541
+ ['regression-test', 'stripe', 'billing', 'tier'],
542
+ ],
543
+ ].map(createLessonPromptExample);
544
+
545
/**
 * Render exemplars as newline-separated <example> XML blocks for a prompt.
 *
 * @param {Array<{signal: string, conversationWindow: string, output: object}>} [examples]
 *   Defaults to LLM_LESSON_MULTISHOT_EXAMPLES.
 * @returns {string} One <example> block per entry; '' for an empty list.
 */
function renderMultishotExamplesForPrompt(examples = LLM_LESSON_MULTISHOT_EXAMPLES) {
  const renderOne = ({ signal, conversationWindow, output }) => [
    '<example>',
    `<signal>${signal}</signal>`,
    '<conversation_window>',
    `${conversationWindow}`,
    '</conversation_window>',
    `<output>${JSON.stringify(output)}</output>`,
    '</example>',
  ].join('\n');

  return examples.map((example) => renderOne(example)).join('\n');
}
556
+
557
+ // Anthropic's prompt-engineering playbook (ref: anthropic.skilljar.com
558
+ // Prompt Engineering course) recommends XML tags to scope context blocks and
559
+ // multishot exemplars so the model sees the exact expected shape before being
560
+ // asked to produce it. Both techniques apply cleanly here because the output
561
+ // is a strict JSON schema and the extraction task has five recurring incident
562
+ // classes (see LLM_LESSON_MULTISHOT_EXAMPLES).
434
563
  const LLM_LESSON_SYSTEM_PROMPT = `You are a lesson extraction engine for an AI coding agent safety system called ThumbGate.
435
564
 
436
- Given a conversation window and a feedback signal (positive or negative), extract a structured lesson.
565
+ <task>
566
+ Given a feedback signal (positive or negative) and a conversation window, extract a structured if-then lesson that would prevent the same mistake (negative) or reinforce the same success (positive) in future sessions.
567
+ </task>
437
568
 
438
- Return ONLY valid JSON matching this exact schema:
569
+ <output_schema>
570
+ Return ONLY valid JSON matching this exact shape — no prose, no code fences, no text outside the JSON object:
439
571
  {
440
- "trigger": { "condition": "<when this lesson applies>", "type": "<one of: debugging, implementation, question, error-report, constraint>" },
441
- "action": { "type": "<do or avoid>", "description": "<specific action to take or avoid>" },
572
+ "trigger": { "condition": "<when this lesson applies>", "type": "<debugging|implementation|question|error-report|constraint>" },
573
+ "action": { "type": "<do|avoid>", "description": "<specific action to take or avoid>" },
442
574
  "confidence": <0.0 to 1.0>,
443
- "scope": "<global, file-level, or project-level>",
575
+ "scope": "<global|file-level|project-level>",
444
576
  "tags": ["<relevant tags>"]
445
577
  }
446
-
447
- Guidelines:
448
- - Be specific and actionable. "Avoid: editing files without reading them first" is better than "Avoid: bad edits".
449
- - confidence should reflect how clear the lesson is from the conversation context.
450
- - tags should include tool names, file types, or domain areas mentioned.
451
- - Do NOT include any text outside the JSON object.`;
578
+ </output_schema>
579
+
580
+ <guidelines>
581
+ - Be specific and actionable. "Avoid editing files without reading them first" beats "Avoid bad edits".
582
+ - confidence should reflect how clear the lesson is from the window. A single ambiguous exchange caps around 0.5; a reproduced failure with a confirmed fix can reach 0.9.
583
+ - tags should include tool names, file types, or domain areas mentioned in the conversation.
584
+ - Emit JSON only. No code fences, no commentary.
585
+ </guidelines>
586
+
587
+ <examples>
588
+ ${renderMultishotExamplesForPrompt()}
589
+ </examples>`;
590
+
591
/**
 * Build the XML-tagged user prompt for lesson extraction.
 *
 * @param {object} [params]
 * @param {string} [params.signal] - 'positive' or 'up' map to positive; anything else to negative.
 * @param {string} [params.context] - Optional user context; omitted from the prompt when falsy.
 * @param {string} [params.windowText] - Rendered conversation; missing values render as empty.
 * @returns {string} Newline-joined <signal>, optional <user_context>, and <conversation_window> blocks.
 */
function buildLessonUserPrompt({ signal, context, windowText } = {}) {
  const normalizedSignal = signal === 'positive' || signal === 'up' ? 'positive' : 'negative';

  // Conversation text and user context are free-form input. Neutralize only
  // the closing forms of our wrapper tags so the content cannot terminate its
  // own block early; everything else (including code snippets) is left as-is.
  const neutralizeWrapperTags = (value) => String(value ?? '')
    .replace(/<\/(conversation_window|user_context|signal)\s*>/gi, '&lt;/$1&gt;');

  const parts = [`<signal>${normalizedSignal}</signal>`];
  if (context) parts.push(`<user_context>${neutralizeWrapperTags(context)}</user_context>`);
  parts.push(`<conversation_window>\n${neutralizeWrapperTags(windowText)}\n</conversation_window>`);
  return parts.join('\n');
}
452
598
 
453
599
  async function inferStructuredLessonLLM(conversationWindow, signal, context) {
454
- const { isAvailable, callClaude, MODELS } = require('./llm-client');
600
+ const { isAvailable, callClaudeJson, MODELS } = require('./llm-client');
455
601
  if (!isAvailable()) return null;
456
602
 
457
603
  const normalizedWindow = Array.isArray(conversationWindow) ? conversationWindow : [];
@@ -463,47 +609,37 @@ async function inferStructuredLessonLLM(conversationWindow, signal, context) {
463
609
  .join('\n')
464
610
  .slice(0, 4000);
465
611
 
466
- const userPrompt = [
467
- `Signal: ${signal === 'positive' || signal === 'up' ? 'positive (thumbs up — something worked well)' : 'negative (thumbs down — something went wrong)'}`,
468
- context ? `User context: ${context}` : '',
469
- `\nConversation:\n${windowText}`,
470
- ].filter(Boolean).join('\n');
612
+ const userPrompt = buildLessonUserPrompt({ signal, context, windowText });
471
613
 
472
- const raw = await callClaude({
614
+ const parsed = await callClaudeJson({
473
615
  systemPrompt: LLM_LESSON_SYSTEM_PROMPT,
474
616
  userPrompt,
475
617
  model: MODELS.FAST,
476
618
  maxTokens: 512,
619
+ cache: true,
477
620
  });
478
621
 
479
- if (!raw) return null;
480
-
481
- try {
482
- const parsed = JSON.parse(raw);
483
- if (!parsed.trigger || !parsed.action) return null;
622
+ if (!parsed || !parsed.trigger || !parsed.action) return null;
484
623
 
485
- const filePaths = extractFilePaths(normalizedWindow);
486
- const toolCalls = extractToolCalls(normalizedWindow);
487
- const errorPatterns = extractErrors(normalizedWindow);
488
- const userMessages = normalizedWindow.filter((m) => m.role === 'user');
489
- const assistantMessages = normalizedWindow.filter((m) => m.role === 'assistant');
490
- const lastUser = userMessages[userMessages.length - 1]?.content || '';
491
- const lastAssistant = assistantMessages[assistantMessages.length - 1]?.content || '';
624
+ const filePaths = extractFilePaths(normalizedWindow);
625
+ const toolCalls = extractToolCalls(normalizedWindow);
626
+ const errorPatterns = extractErrors(normalizedWindow);
627
+ const userMessages = normalizedWindow.filter((m) => m.role === 'user');
628
+ const assistantMessages = normalizedWindow.filter((m) => m.role === 'assistant');
629
+ const lastUser = userMessages[userMessages.length - 1]?.content || '';
630
+ const lastAssistant = assistantMessages[assistantMessages.length - 1]?.content || '';
492
631
 
493
- return {
494
- format: 'if-then-v1-llm',
495
- trigger: parsed.trigger,
496
- action: parsed.action,
497
- signal: signal === 'positive' || signal === 'up' ? 'positive' : 'negative',
498
- confidence: Math.max(0, Math.min(1, Number(parsed.confidence) || 0.5)),
499
- scope: parsed.scope || inferScope(filePaths, toolCalls),
500
- examples: [{ userIntent: lastUser.slice(0, 300), assistantAction: lastAssistant.slice(0, 300), outcome: signal === 'positive' || signal === 'up' ? 'approved' : 'rejected' }],
501
- metadata: { toolsUsed: toolCalls, filesInvolved: filePaths.slice(0, 10), errorPatterns: errorPatterns.slice(0, 5), conversationLength: normalizedWindow.length, inferredAt: new Date().toISOString(), llmModel: MODELS.FAST },
502
- tags: Array.isArray(parsed.tags) ? parsed.tags : [],
503
- };
504
- } catch {
505
- return null;
506
- }
632
+ return {
633
+ format: 'if-then-v1-llm',
634
+ trigger: parsed.trigger,
635
+ action: parsed.action,
636
+ signal: signal === 'positive' || signal === 'up' ? 'positive' : 'negative',
637
+ confidence: Math.max(0, Math.min(1, Number(parsed.confidence) || 0.5)),
638
+ scope: parsed.scope || inferScope(filePaths, toolCalls),
639
+ examples: [{ userIntent: lastUser.slice(0, 300), assistantAction: lastAssistant.slice(0, 300), outcome: signal === 'positive' || signal === 'up' ? 'approved' : 'rejected' }],
640
+ metadata: { toolsUsed: toolCalls, filesInvolved: filePaths.slice(0, 10), errorPatterns: errorPatterns.slice(0, 5), conversationLength: normalizedWindow.length, inferredAt: new Date().toISOString(), llmModel: MODELS.FAST },
641
+ tags: Array.isArray(parsed.tags) ? parsed.tags : [],
642
+ };
507
643
  }
508
644
 
509
645
  module.exports = {
@@ -515,4 +651,7 @@ module.exports = {
515
651
  inferStructuredLesson, inferStructuredLessonLLM,
516
652
  extractTrigger, extractAction, extractToolCalls,
517
653
  extractFilePaths, extractErrors, calculateConfidence, inferScope,
654
+ // Exported for prompt-shape regression tests.
655
+ LLM_LESSON_SYSTEM_PROMPT, LLM_LESSON_MULTISHOT_EXAMPLES,
656
+ renderMultishotExamplesForPrompt, buildLessonUserPrompt,
518
657
  };
@@ -2,6 +2,7 @@
2
2
 
3
3
  const path = require('node:path');
4
4
  const { readJSONL, getFeedbackPaths } = require('./feedback-loop');
5
+ const { loadOptionalModule } = require('./private-core-boundary');
5
6
 
6
7
  const HIGH_RISK_TAGS = new Set([
7
8
  'billing',
@@ -514,7 +515,9 @@ function searchLessons(query = '', options = {}) {
514
515
  // Cross-encoder reranking: when a query is present, rerank the top-50 bi-encoder
515
516
  // candidates using field-weighted BM25 so the most relevant lessons surface first.
516
517
  if (query && results.length > 1) {
517
- const { rerankLessons } = require('./lesson-reranker');
518
+ const { rerankLessons } = loadOptionalModule('./lesson-reranker', () => ({
519
+ rerankLessons: (_query, pool) => pool,
520
+ }));
518
521
  const pool = results.slice(0, 50);
519
522
  const tail = results.slice(50);
520
523
  const reranked = rerankLessons(query, pool, { topK: pool.length });
@@ -10,6 +10,7 @@ const MODELS = {
10
10
 
11
11
  const DEFAULT_MODEL = MODELS.FAST;
12
12
  const DEFAULT_MAX_TOKENS = 1024;
13
+ const DEFAULT_CACHE_TTL = '5m';
13
14
 
14
15
  let _client = null;
15
16
 
@@ -35,40 +36,170 @@ function stripCodeFences(text) {
35
36
  return fenced ? fenced[1].trim() : text.trim();
36
37
  }
37
38
 
38
- // Anthropic SDK throws errors with a `.status` field for HTTP failures.
39
- // Our defaultClassify already reads `.status`, so 429/5xx retry and 4xx
40
- // (bad request / unauthorized / not-found) bail immediately — which is
41
- // what we want: there is no point retrying a malformed prompt or a
42
- // revoked API key.
43
- async function callClaude({ systemPrompt, userPrompt, model, maxTokens } = {}) {
39
/**
 * Normalize the caller's `cache` option into `{ mode, control }` or null.
 *
 * Accepted inputs:
 *  - falsy            -> null (caching off)
 *  - true             -> system mode, ephemeral, default TTL
 *  - string           -> system mode, ephemeral, the string used as TTL
 *  - object           -> `mode` / `type` / `ttl` taken when non-empty strings,
 *                        otherwise 'system' / 'ephemeral' / default TTL
 *  - any other type   -> null
 *
 * @param {boolean|string|object|null|undefined} cache
 * @returns {{mode: string, control: {type: string, ttl: string}}|null}
 */
function normalizeCacheOptions(cache) {
  const isNonEmptyString = (value) => typeof value === 'string' && value !== '';
  const systemEphemeral = (ttl) => ({ mode: 'system', control: { type: 'ephemeral', ttl } });

  if (!cache) return null;

  switch (typeof cache) {
    case 'boolean':
      // Only `true` can reach this branch; `false` returned above.
      return systemEphemeral(DEFAULT_CACHE_TTL);
    case 'string':
      return systemEphemeral(cache);
    case 'object':
      break;
    default:
      return null;
  }

  const mode = isNonEmptyString(cache.mode) ? cache.mode : 'system';
  const type = isNonEmptyString(cache.type) ? cache.type : 'ephemeral';
  const ttl = isNonEmptyString(cache.ttl) ? cache.ttl : DEFAULT_CACHE_TTL;

  return { mode, control: { type, ttl } };
}
67
+
68
/**
 * Produce the `system` value for a request, adding a cache marker when asked.
 *
 * @param {string} systemPrompt - System prompt text; falsy yields undefined.
 * @param {{mode: string, control: object}|null} cacheOptions - Normalized cache options.
 * @returns {string|Array<{type: 'text', text: string, cache_control: object}>|undefined}
 *   The plain prompt unless the mode is 'system' or 'tools+system', in which
 *   case a single text block carrying `cache_control` is returned.
 */
function applyCacheToSystem(systemPrompt, cacheOptions) {
  if (!systemPrompt) return undefined;

  const mode = cacheOptions ? cacheOptions.mode : undefined;
  const cachesSystem = mode === 'system' || mode === 'tools+system';
  if (!cachesSystem) return systemPrompt;

  const block = {
    type: 'text',
    text: systemPrompt,
    cache_control: cacheOptions.control,
  };
  return [block];
}
75
+
76
/**
 * Produce the `tools` value for a request, adding a cache breakpoint when asked.
 *
 * In 'tools' / 'tools+system' mode a single `cache_control` marker is placed
 * on the LAST tool definition. A breakpoint caches the whole prefix up to and
 * including the marked block, so one marker covers every tool. Marking every
 * tool would exceed the API's limit of 4 cache breakpoints per request as soon
 * as more than a handful of tools are registered.
 *
 * Tools that already carry their own `cache_control` are never modified, and
 * the input array and its objects are not mutated.
 *
 * @param {object[]} tools - Tool definitions; non-array or empty yields undefined.
 * @param {{mode: string, control: object}|null} cacheOptions - Normalized cache options.
 * @returns {object[]|undefined} The original array when caching does not apply,
 *   otherwise a new array whose last tool object carries `cache_control`.
 */
function applyCacheToTools(tools, cacheOptions) {
  if (!Array.isArray(tools) || tools.length === 0) return undefined;

  const mode = cacheOptions ? cacheOptions.mode : undefined;
  if (mode !== 'tools' && mode !== 'tools+system') return tools;

  // Locate the last entry that is actually a tool object; malformed entries
  // (null, strings, ...) are passed through untouched.
  let breakpointIndex = -1;
  for (let index = tools.length - 1; index >= 0; index -= 1) {
    const candidate = tools[index];
    if (candidate && typeof candidate === 'object') {
      breakpointIndex = index;
      break;
    }
  }

  return tools.map((tool, index) => {
    if (index !== breakpointIndex || tool.cache_control) return tool;
    return { ...tool, cache_control: cacheOptions.control };
  });
}
86
+
87
/**
 * Assemble the request payload passed to `client.messages.create`.
 *
 * @param {object} [params]
 * @param {string} [params.systemPrompt] - System prompt; omitted from the request when falsy.
 * @param {string} [params.userPrompt] - Used as a single user message when `messages` is empty.
 * @param {object[]} [params.messages] - Full message list; takes precedence over `userPrompt`.
 * @param {string} [params.model] - Falls back to DEFAULT_MODEL when falsy.
 * @param {number} [params.maxTokens] - Falls back to DEFAULT_MAX_TOKENS when falsy.
 * @param {boolean|string|object} [params.cache] - Cache option, see normalizeCacheOptions.
 * @param {object[]} [params.tools] - Tool definitions; omitted when empty.
 * @param {object} [params.toolChoice] - Sent as `tool_choice` when truthy.
 * @param {object} [params.metadata] - Sent only when it is an object.
 * @param {number} [params.temperature] - Sent only when it is a finite number.
 * @returns {object} Request object with snake_case keys.
 */
function buildClaudeRequest({
  systemPrompt,
  userPrompt,
  messages,
  model,
  maxTokens,
  cache,
  tools,
  toolChoice,
  metadata,
  temperature,
} = {}) {
  const cacheOptions = normalizeCacheOptions(cache);
  const hasMessageList = Array.isArray(messages) && messages.length > 0;
  const system = applyCacheToSystem(systemPrompt, cacheOptions);
  const preparedTools = applyCacheToTools(tools, cacheOptions);
  const usesRequestLevelCache = Boolean(cacheOptions) && cacheOptions.mode === 'request';

  // Optional keys are spread in only when present so the payload never
  // carries explicit `undefined` entries.
  return {
    model: model || DEFAULT_MODEL,
    max_tokens: maxTokens || DEFAULT_MAX_TOKENS,
    messages: hasMessageList ? messages : [{ role: 'user', content: userPrompt }],
    ...(system ? { system } : {}),
    ...(preparedTools ? { tools: preparedTools } : {}),
    ...(toolChoice ? { tool_choice: toolChoice } : {}),
    ...(metadata && typeof metadata === 'object' ? { metadata } : {}),
    ...(Number.isFinite(temperature) ? { temperature } : {}),
    ...(usesRequestLevelCache ? { cache_control: cacheOptions.control } : {}),
  };
}
124
+
125
/**
 * Concatenate the text of every `text` content block in a response.
 *
 * Tolerates a missing response, a `content` value that is not an array, and
 * null/undefined entries inside the array, so one malformed block cannot turn
 * an otherwise usable reply into a thrown error.
 *
 * @param {{content?: Array<{type: string, text?: string}>}|null|undefined} response
 * @returns {string} Joined text of all text blocks, or '' when there are none.
 */
function extractTextContent(response) {
  const blocks = Array.isArray(response?.content) ? response.content : [];
  return blocks
    .filter((block) => block?.type === 'text')
    .map((block) => block.text)
    .join('');
}
131
+
132
/**
 * Parse model output as JSON after removing any surrounding code fence.
 *
 * @param {*} text - Raw model output; anything other than a string yields null.
 * @returns {*} The parsed JSON value, or null when parsing fails.
 */
function parseClaudeJson(text) {
  let parsed = null;
  if (typeof text === 'string') {
    try {
      parsed = JSON.parse(stripCodeFences(text));
    } catch {
      // Malformed JSON is an expected outcome here; callers treat null as "no result".
      parsed = null;
    }
  }
  return parsed;
}
140
+
141
+ async function callClaudeInternal(options = {}) {
44
142
  const client = getClient();
45
143
  if (!client) return null;
46
144
 
47
145
  try {
48
- const text = await runStep('llm.callClaude', {
146
+ const response = await runStep('llm.callClaude', {
49
147
  retries: 2,
50
148
  logger: (msg) => console.warn(msg),
51
- }, async () => {
52
- const response = await client.messages.create({
53
- model: model || DEFAULT_MODEL,
54
- max_tokens: maxTokens || DEFAULT_MAX_TOKENS,
55
- system: systemPrompt || undefined,
56
- messages: [{ role: 'user', content: userPrompt }],
57
- });
58
-
59
- return response.content
60
- .filter((b) => b.type === 'text')
61
- .map((b) => b.text)
62
- .join('');
63
- });
64
-
65
- return stripCodeFences(text);
149
+ }, async () => client.messages.create(buildClaudeRequest(options)));
150
+
151
+ const text = stripCodeFences(extractTextContent(response));
152
+ return {
153
+ text,
154
+ usage: response?.usage || null,
155
+ stopReason: response?.stop_reason || null,
156
+ id: response?.id || null,
157
+ model: response?.model || options.model || DEFAULT_MODEL,
158
+ };
66
159
  } catch {
67
- // Preserve the original callClaude contract — callers expect `null` on
68
- // failure, not an exception. runStep already logged retry attempts,
69
- // so the permanent failure is visible in logs.
70
160
  return null;
71
161
  }
72
162
  }
73
163
 
74
- module.exports = { isAvailable, callClaude, stripCodeFences, MODELS };
164
/**
 * Call Claude and return the reply text (or the full result on request).
 *
 * Delegates to callClaudeInternal, which performs the request inside a retried
 * step. Retry behaviour, as described by the original note on this code: the
 * Anthropic SDK throws errors with a `.status` field for HTTP failures, and
 * defaultClassify reads `.status`, so 429/5xx are retried while other 4xx
 * (bad request / unauthorized / not-found) bail immediately — there is no
 * point retrying a malformed prompt or a revoked API key.
 *
 * @param {object} [options] - Request options forwarded to callClaudeInternal.
 * @param {boolean} [options.returnMetadata] - When truthy, resolve with the
 *   whole result object instead of just its `text`.
 * @returns {Promise<string|object|null>} null when the internal call yields nothing.
 */
async function callClaude(options = {}) {
  const outcome = await callClaudeInternal(options);
  if (!outcome) return null;
  if (options.returnMetadata) return outcome;
  return outcome.text;
}
174
+
175
/**
 * Call Claude and return the reply parsed as JSON.
 *
 * @param {object} [options] - Request options forwarded to callClaudeInternal.
 * @param {boolean} [options.returnMetadata] - When truthy, resolve with
 *   `{ parsed, text, usage, stopReason, id, model }` instead of the parsed value alone.
 * @returns {Promise<*|null>} null when the call yields nothing or the reply
 *   text does not parse to a non-null JSON value.
 */
async function callClaudeJson(options = {}) {
  const reply = await callClaudeInternal(options);
  const parsed = reply ? parseClaudeJson(reply.text) : null;
  if (parsed === null) return null;

  if (!options.returnMetadata) return parsed;

  const { text, usage, stopReason, id, model } = reply;
  return { parsed, text, usage, stopReason, id, model };
}
195
+
196
+ module.exports = {
197
+ isAvailable,
198
+ callClaude,
199
+ callClaudeJson,
200
+ stripCodeFences,
201
+ parseClaudeJson,
202
+ normalizeCacheOptions,
203
+ buildClaudeRequest,
204
+ MODELS,
205
+ };