thumbgate 1.15.0 → 1.16.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/marketplace.json +6 -6
- package/.claude-plugin/plugin.json +3 -3
- package/.well-known/llms.txt +5 -5
- package/.well-known/mcp/server-card.json +1 -1
- package/README.md +59 -35
- package/adapters/chatgpt/openapi.yaml +118 -2
- package/adapters/claude/.mcp.json +2 -2
- package/adapters/mcp/server-stdio.js +210 -84
- package/adapters/opencode/opencode.json +1 -1
- package/bench/prompt-eval-suite.json +5 -1
- package/bin/cli.js +157 -8
- package/config/evals/agent-safety-eval.json +338 -22
- package/config/gates/routine.json +43 -0
- package/config/github-about.json +3 -3
- package/config/model-candidates.json +131 -0
- package/openapi/openapi.yaml +118 -2
- package/package.json +57 -49
- package/public/blog.html +7 -7
- package/public/codex-plugin.html +6 -6
- package/public/compare.html +29 -23
- package/public/dashboard.html +82 -10
- package/public/guide.html +28 -28
- package/public/index.html +216 -98
- package/public/learn.html +50 -22
- package/public/lessons.html +1 -1
- package/public/numbers.html +17 -17
- package/public/pro.html +82 -18
- package/scripts/agent-audit-trace.js +55 -0
- package/scripts/agent-memory-lifecycle.js +96 -0
- package/scripts/agent-readiness-plan.js +118 -0
- package/scripts/agentic-data-pipeline.js +21 -1
- package/scripts/agents-sdk-sandbox-plan.js +57 -0
- package/scripts/ai-org-governance.js +98 -0
- package/scripts/ai-search-distribution.js +43 -0
- package/scripts/artifact-agent-plan.js +81 -0
- package/scripts/billing.js +27 -8
- package/scripts/cli-schema.js +18 -2
- package/scripts/code-mode-mcp-plan.js +71 -0
- package/scripts/context-engine.js +1 -2
- package/scripts/context-manager.js +4 -1
- package/scripts/dashboard-render-spec.js +1 -1
- package/scripts/dashboard.js +275 -9
- package/scripts/decision-journal.js +13 -3
- package/scripts/document-workflow-governance.js +62 -0
- package/scripts/enterprise-agent-rollout.js +34 -0
- package/scripts/experience-replay-governance.js +69 -0
- package/scripts/export-hf-dataset.js +1 -1
- package/scripts/feedback-loop.js +92 -4
- package/scripts/feedback-to-rules.js +17 -23
- package/scripts/gates-engine.js +4 -6
- package/scripts/growth-campaigns.js +49 -0
- package/scripts/harness-selector.js +16 -4
- package/scripts/hybrid-supervisor-agent.js +64 -0
- package/scripts/inference-cache-policy.js +72 -0
- package/scripts/inference-economics.js +53 -0
- package/scripts/internal-agent-bootstrap.js +12 -2
- package/scripts/knowledge-layer-plan.js +108 -0
- package/scripts/lesson-inference.js +183 -44
- package/scripts/lesson-search.js +4 -1
- package/scripts/llm-client.js +157 -26
- package/scripts/mailer/resend-mailer.js +112 -1
- package/scripts/mcp-transport-strategy.js +66 -0
- package/scripts/memory-store-governance.js +60 -0
- package/scripts/meta-agent-loop.js +7 -13
- package/scripts/model-access-eligibility.js +38 -0
- package/scripts/model-migration-readiness.js +55 -0
- package/scripts/operational-integrity.js +96 -3
- package/scripts/otel-declarative-config.js +56 -0
- package/scripts/perplexity-client.js +1 -1
- package/scripts/post-training-governance.js +34 -0
- package/scripts/private-core-boundary.js +72 -0
- package/scripts/production-agent-readiness.js +40 -0
- package/scripts/prompt-eval.js +564 -32
- package/scripts/prompt-programs.js +93 -0
- package/scripts/provider-action-normalizer.js +585 -0
- package/scripts/scaling-law-claims.js +60 -0
- package/scripts/security-scanner.js +1 -1
- package/scripts/self-distill-agent.js +7 -32
- package/scripts/seo-gsd.js +232 -55
- package/scripts/skill-rag-router.js +53 -0
- package/scripts/spec-gate.js +1 -1
- package/scripts/student-consistent-training.js +73 -0
- package/scripts/synthetic-data-provenance.js +98 -0
- package/scripts/task-context-result.js +81 -0
- package/scripts/telemetry-analytics.js +149 -0
- package/scripts/thompson-sampling.js +2 -2
- package/scripts/token-savings.js +7 -6
- package/scripts/token-tco.js +46 -0
- package/scripts/tool-registry.js +63 -3
- package/scripts/verification-loop.js +10 -1
- package/scripts/verifier-scoring.js +71 -0
- package/scripts/workflow-sentinel.js +284 -28
- package/scripts/workspace-agent-routines.js +118 -0
- package/src/api/server.js +381 -120
- package/scripts/analytics-report.js +0 -328
- package/scripts/autonomous-workflow.js +0 -377
- package/scripts/billing-setup.js +0 -109
- package/scripts/creator-campaigns.js +0 -239
- package/scripts/cross-encoder-reranker.js +0 -235
- package/scripts/daemon-manager.js +0 -108
- package/scripts/decision-trace.js +0 -354
- package/scripts/delegation-runtime.js +0 -896
- package/scripts/dispatch-brief.js +0 -159
- package/scripts/distribution-surfaces.js +0 -110
- package/scripts/feedback-history-distiller.js +0 -382
- package/scripts/funnel-analytics.js +0 -35
- package/scripts/history-distiller.js +0 -200
- package/scripts/hosted-job-launcher.js +0 -256
- package/scripts/intent-router.js +0 -392
- package/scripts/lesson-reranker.js +0 -263
- package/scripts/lesson-retrieval.js +0 -148
- package/scripts/managed-lesson-agent.js +0 -183
- package/scripts/operational-dashboard.js +0 -103
- package/scripts/operational-summary.js +0 -129
- package/scripts/operator-artifacts.js +0 -608
- package/scripts/optimize-context.js +0 -17
- package/scripts/org-dashboard.js +0 -206
- package/scripts/partner-orchestration.js +0 -146
- package/scripts/predictive-insights.js +0 -356
- package/scripts/pulse.js +0 -80
- package/scripts/reflector-agent.js +0 -221
- package/scripts/sales-pipeline.js +0 -681
- package/scripts/session-episode-store.js +0 -329
- package/scripts/session-health-sensor.js +0 -242
- package/scripts/session-report.js +0 -120
- package/scripts/swarm-coordinator.js +0 -81
- package/scripts/tool-kpi-tracker.js +0 -12
- package/scripts/webhook-delivery.js +0 -62
- package/scripts/workflow-sprint-intake.js +0 -475
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
/**
 * Produce the static knowledge-layer blueprint: memory tiers, graph node and
 * relationship vocabularies, high-ROI use cases, and the gates that guard
 * recommendations.
 *
 * @param {object} [options]
 * @param {string} [options.domain] - Domain label; falls back to 'agent_reliability'.
 * @param {string} [options.graph] - Graph backend label; falls back to 'neo4j'.
 * @returns {object} Plan with keys domain, graph, memoryTiers, nodeTypes,
 *   relationshipTypes, highRoiUseCases, gates. Arrays are rebuilt on each call.
 */
function buildKnowledgeLayerPlan(options = {}) {
  // [id, purpose, ttl] tuples keep the tier table compact.
  const memoryTiers = [
    ['short_term', 'Current session context so the agent does not re-ask answered questions.', 'session'],
    ['long_term', 'Durable user, product, workflow, and feedback profile facts.', 'durable'],
    ['reasoning_memory', 'Reusable decision paths that avoid recomputing expensive traversals.', 'versioned'],
  ].map(([id, purpose, ttl]) => ({ id, purpose, ttl }));

  const nodeTypes = [
    'User', 'Agent', 'Workflow', 'Feedback', 'Gate',
    'Decision', 'Evidence', 'Recommendation', 'Outcome',
  ];

  const relationshipTypes = [
    'GAVE_FEEDBACK', 'TRIGGERED_GATE', 'USED_EVIDENCE', 'RECOMMENDED_ACTION',
    'PRODUCED_OUTCOME', 'SIMILAR_TO', 'REUSES_REASONING',
  ];

  const highRoiUseCases = [
    'conversion recommendations with explainable evidence paths',
    'compute savings from reasoning-memory cache hits',
    'compliance audit trail for why an agent recommended or blocked an action',
    'closed-loop profile updates from every feedback, purchase, or outcome event',
  ];

  const gates = [
    'do not recommend without an evidence path',
    'do not reuse reasoning memory when source facts changed',
    'write audit node for every recommendation and blocked action',
    'record outcome feedback to update profile and graph edges',
  ];

  return {
    domain: options.domain || 'agent_reliability',
    graph: options.graph || 'neo4j',
    memoryTiers,
    nodeTypes,
    relationshipTypes,
    highRoiUseCases,
    gates,
  };
}
|
|
61
|
+
|
|
62
|
+
/**
 * Build the ordered evidence path that explains a recommendation:
 * User -> similar profiles -> evidence items -> Recommendation.
 *
 * @param {object} [input]
 * @param {string} [input.userId] - Falls back to 'unknown_user'.
 * @param {string} [input.recommendationId] - Falls back to 'rec_pending'.
 * @param {Array<object>} [input.evidence] - Items with optional type/id/quote;
 *   anything that is not an array is treated as no evidence.
 * @param {Array<*>} [input.similarProfiles] - Ids of similar profiles; anything
 *   that is not an array is treated as empty.
 * @returns {{recommendationId: string, path: object[], explainable: boolean}}
 *   `explainable` is true when at least one evidence entry was supplied.
 */
function buildRecommendationEvidencePath(input = {}) {
  // The default parameter only covers `undefined`; an explicit null must not throw.
  const source = input ?? {};
  const userId = source.userId || 'unknown_user';
  const recommendationId = source.recommendationId || 'rec_pending';
  const evidence = Array.isArray(source.evidence) ? source.evidence : [];
  const similarProfiles = Array.isArray(source.similarProfiles) ? source.similarProfiles : [];

  return {
    recommendationId,
    path: [
      { type: 'User', id: userId },
      ...similarProfiles.map((id) => ({ type: 'SimilarProfile', id })),
      // Optional chaining: a null/undefined evidence entry gets the same
      // defaults as an empty object instead of raising a TypeError.
      ...evidence.map((item, index) => ({
        type: item?.type || 'Evidence',
        id: item?.id || `evidence_${index + 1}`,
        quote: item?.quote || null,
      })),
      { type: 'Recommendation', id: recommendationId },
    ],
    explainable: evidence.length > 0,
  };
}
|
|
83
|
+
|
|
84
|
+
/**
 * Check a knowledge-layer run record against the plan's gates.
 *
 * @param {object} [run] - Run record; reads userId, recommendationId,
 *   evidencePath.explainable, auditNodeId, reusedReasoning, reasoningVersion,
 *   profileUpdate and outcomeEventId.
 * @returns {{decision: 'warn'|'allow', issues: string[], roiSignals: string[]}}
 *   `decision` is 'warn' when any issue was found, otherwise 'allow'.
 */
function evaluateKnowledgeLayerRun(run = {}) {
  // [failed, issueCode] pairs, listed in the order issues are reported.
  const findings = [
    [!run.userId, 'missing_user_id'],
    [!run.recommendationId, 'missing_recommendation_id'],
    [!run.evidencePath?.explainable, 'missing_explainable_evidence_path'],
    [!run.auditNodeId, 'missing_audit_node_id'],
    // Version / outcome ids are only required when the matching feature was used.
    [run.reusedReasoning && !run.reasoningVersion, 'missing_reasoning_version'],
    [run.profileUpdate && !run.outcomeEventId, 'missing_outcome_event_id'],
  ];
  const issues = findings
    .filter(([failed]) => failed)
    .map(([, code]) => code);

  const roiSignals = [];
  if (run.reusedReasoning) roiSignals.push('lower_graph_query_and_token_cost');
  if (run.profileUpdate) roiSignals.push('closed_loop_personalization');
  if (run.auditNodeId) roiSignals.push('compliance_trace_available');

  return {
    decision: issues.length > 0 ? 'warn' : 'allow',
    issues,
    roiSignals,
  };
}
|
|
103
|
+
|
|
104
|
+
module.exports = {
|
|
105
|
+
buildKnowledgeLayerPlan,
|
|
106
|
+
buildRecommendationEvidencePath,
|
|
107
|
+
evaluateKnowledgeLayerRun,
|
|
108
|
+
};
|
|
@@ -431,27 +431,173 @@ function consumePhrase(lower, original, phrases) {
|
|
|
431
431
|
// 6. LLM-Powered Structured Lesson Extraction
|
|
432
432
|
// ---------------------------------------------------------------------------
|
|
433
433
|
|
|
434
|
+
/**
 * Expand one positional exemplar tuple into the { signal, conversationWindow,
 * output } shape used when rendering multishot examples.
 *
 * Tuple order: signal, conversationWindow (array of lines), triggerCondition,
 * triggerType, actionType, actionDescription, confidence, scope, tags.
 *
 * @param {Array} exampleTuple - Positional exemplar fields in the order above.
 * @returns {{signal: *, conversationWindow: string, output: object}}
 *   `conversationWindow` is the input lines joined with newlines.
 */
function createLessonPromptExample(exampleTuple) {
  const [
    signal,
    windowLines,
    triggerCondition,
    triggerType,
    actionType,
    actionDescription,
    confidence,
    scope,
    tags,
  ] = exampleTuple;

  const trigger = { condition: triggerCondition, type: triggerType };
  const action = { type: actionType, description: actionDescription };

  return {
    signal,
    conversationWindow: windowLines.join('\n'),
    output: { trigger, action, confidence, scope, tags },
  };
}
|
|
457
|
+
|
|
458
|
+
// Multishot exemplars for the lesson-extraction prompt, kept as a standalone
// constant so they can be inspected and tested without parsing the prompt
// string. Each entry is a positional tuple in the order that
// createLessonPromptExample expects:
//   [signal, conversationWindow lines, triggerCondition, triggerType,
//    actionType, actionDescription, confidence, scope, tags]
// and pairs a (signal, conversation_window) with the exact JSON output Claude
// should emit. The five entries were drafted from real ThumbGate incident
// classes: Edit-before-Read, force-push-to-main, deploy-verification,
// mock-to-live-in-tests, and regression-test-pinning. Changing any example
// shifts lesson extraction behavior — treat it like a prompt version bump.
const LLM_LESSON_MULTISHOT_EXAMPLES = [
  // Edit-before-Read
  [
    'negative',
    [
      '[user]: why is my edit failing?',
      "[assistant]: I'll try editing src/api/server.js — Edit(src/api/server.js) failed: File has not been read yet.",
      '[assistant]: Let me Read(src/api/server.js) first, then retry Edit.',
      '[user]: that worked. thumbs down on the first attempt though.',
    ],
    'about to call Edit on a file that has not been Read in this session',
    'constraint',
    'avoid',
    'Never call Edit on a file without first calling Read on it — the tool rejects unread files',
    0.9,
    'global',
    ['Edit', 'Read', 'tool-use', 'preconditions'],
  ],
  // force-push-to-main
  [
    'negative',
    [
      '[assistant]: Running git push --force origin main to clean up history.',
      '[user]: NO. Never force-push to main. Thumbs down.',
    ],
    'about to run git push with --force or -f on the main branch',
    'constraint',
    'avoid',
    'Never force-push to main; use incremental commits and open a new PR if history needs adjustment',
    0.95,
    'project-level',
    ['git', 'force-push', 'main-branch', 'destructive'],
  ],
  // deploy-verification
  [
    'positive',
    [
      '[user]: the deploy failed again, can you just verify with curl before declaring done?',
      '[assistant]: Running curl https://thumbgate-production.up.railway.app/health and grepping for the expected version.',
      '[assistant]: Got {"version":"1.15.0"} — matches package.json. Deploy confirmed.',
      '[user]: perfect. thumbs up.',
    ],
    'after merging a PR that touches deployable code',
    'implementation',
    'do',
    'Curl the /health endpoint and grep for the expected version before claiming a deploy succeeded',
    0.85,
    'project-level',
    ['deploy', 'verification', 'curl', 'railway'],
  ],
  // mock-to-live-in-tests
  [
    'negative',
    [
      '[user]: these test failures are weird',
      '[assistant]: Let me check... the tests are using mocked Stripe. Maybe real Stripe is needed?',
      '[assistant]: Replaced mocks with live Stripe calls. Tests now passing.',
      '[user]: tests should not call real Stripe in CI. Thumbs down.',
    ],
    'a test is failing and the temptation is to replace mocks with live API calls',
    'debugging',
    'avoid',
    'Never swap mocks for real external API calls to make tests pass — inject dependencies and stub the client instead',
    0.9,
    'project-level',
    ['testing', 'mocking', 'stripe', 'ci'],
  ],
  // regression-test-pinning
  [
    'positive',
    [
      '[user]: add a regression test that pins the distinct tier icons',
      '[assistant]: Added assert.notEqual(proIcon, teamIcon) to tests/billing.test.js, plus the URL regex per tier.',
      '[user]: great, locking that in. thumbs up.',
    ],
    'fixing a bug where two products rendered with the same asset',
    'implementation',
    'do',
    'Add a regression assertion that pins the distinct values (e.g. assert.notEqual(a,b)) so the collapse cannot silently return',
    0.9,
    'file-level',
    ['regression-test', 'stripe', 'billing', 'tier'],
  ],
].map(createLessonPromptExample);
|
|
544
|
+
|
|
545
|
+
/**
 * Render exemplars as newline-separated <example> XML blocks for embedding in
 * the system prompt.
 *
 * @param {Array<{signal: *, conversationWindow: *, output: *}>} [examples]
 *   Exemplars to render; defaults to LLM_LESSON_MULTISHOT_EXAMPLES.
 * @returns {string} One <example>…</example> block per exemplar, joined with
 *   '\n'; an empty string when `examples` is empty.
 */
function renderMultishotExamplesForPrompt(examples = LLM_LESSON_MULTISHOT_EXAMPLES) {
  const renderOne = ({ signal, conversationWindow, output }) => [
    '<example>',
    `<signal>${signal}</signal>`,
    '<conversation_window>',
    `${conversationWindow}`,
    '</conversation_window>',
    // The expected output is serialized compactly, exactly as the model should emit it.
    `<output>${JSON.stringify(output)}</output>`,
    '</example>',
  ].join('\n');

  const blocks = [];
  for (const example of examples) {
    blocks.push(renderOne(example));
  }
  return blocks.join('\n');
}
|
|
556
|
+
|
|
557
|
+
// Anthropic's prompt-engineering playbook (ref: anthropic.skilljar.com
|
|
558
|
+
// Prompt Engineering course) recommends XML tags to scope context blocks and
|
|
559
|
+
// multishot exemplars so the model sees the exact expected shape before being
|
|
560
|
+
// asked to produce it. Both techniques apply cleanly here because the output
|
|
561
|
+
// is a strict JSON schema and the extraction task has five recurring incident
|
|
562
|
+
// classes (see LLM_LESSON_MULTISHOT_EXAMPLES).
|
|
434
563
|
const LLM_LESSON_SYSTEM_PROMPT = `You are a lesson extraction engine for an AI coding agent safety system called ThumbGate.
|
|
435
564
|
|
|
436
|
-
|
|
565
|
+
<task>
|
|
566
|
+
Given a feedback signal (positive or negative) and a conversation window, extract a structured if-then lesson that would prevent the same mistake (negative) or reinforce the same success (positive) in future sessions.
|
|
567
|
+
</task>
|
|
437
568
|
|
|
438
|
-
|
|
569
|
+
<output_schema>
|
|
570
|
+
Return ONLY valid JSON matching this exact shape — no prose, no code fences, no text outside the JSON object:
|
|
439
571
|
{
|
|
440
|
-
"trigger": { "condition": "<when this lesson applies>", "type": "<
|
|
441
|
-
"action":
|
|
572
|
+
"trigger": { "condition": "<when this lesson applies>", "type": "<debugging|implementation|question|error-report|constraint>" },
|
|
573
|
+
"action": { "type": "<do|avoid>", "description": "<specific action to take or avoid>" },
|
|
442
574
|
"confidence": <0.0 to 1.0>,
|
|
443
|
-
"scope": "<global
|
|
575
|
+
"scope": "<global|file-level|project-level>",
|
|
444
576
|
"tags": ["<relevant tags>"]
|
|
445
577
|
}
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
-
|
|
450
|
-
-
|
|
451
|
-
-
|
|
578
|
+
</output_schema>
|
|
579
|
+
|
|
580
|
+
<guidelines>
|
|
581
|
+
- Be specific and actionable. "Avoid editing files without reading them first" beats "Avoid bad edits".
|
|
582
|
+
- confidence should reflect how clear the lesson is from the window. A single ambiguous exchange caps around 0.5; a reproduced failure with a confirmed fix can reach 0.9.
|
|
583
|
+
- tags should include tool names, file types, or domain areas mentioned in the conversation.
|
|
584
|
+
- Emit JSON only. No code fences, no commentary.
|
|
585
|
+
</guidelines>
|
|
586
|
+
|
|
587
|
+
<examples>
|
|
588
|
+
${renderMultishotExamplesForPrompt()}
|
|
589
|
+
</examples>`;
|
|
590
|
+
|
|
591
|
+
/**
 * Build the XML-tagged user prompt for lesson extraction.
 *
 * @param {object} [params]
 * @param {string} [params.signal] - 'positive' or 'up' is rendered as
 *   'positive'; every other value is rendered as 'negative'.
 * @param {string} [params.context] - Optional user context; the
 *   <user_context> block is omitted when this is falsy.
 * @param {string} [params.windowText] - Conversation text; a missing value
 *   renders an empty window.
 * @returns {string} Prompt sections joined by newlines.
 */
function buildLessonUserPrompt({ signal, context, windowText } = {}) {
  const normalizedSignal = signal === 'positive' || signal === 'up' ? 'positive' : 'negative';
  const parts = [`<signal>${normalizedSignal}</signal>`];
  if (context) parts.push(`<user_context>${context}</user_context>`);
  // `?? ''` keeps the literal string "undefined"/"null" out of the prompt
  // when the caller has no window text.
  parts.push(`<conversation_window>\n${windowText ?? ''}\n</conversation_window>`);
  return parts.join('\n');
}
|
|
452
598
|
|
|
453
599
|
async function inferStructuredLessonLLM(conversationWindow, signal, context) {
|
|
454
|
-
const { isAvailable,
|
|
600
|
+
const { isAvailable, callClaudeJson, MODELS } = require('./llm-client');
|
|
455
601
|
if (!isAvailable()) return null;
|
|
456
602
|
|
|
457
603
|
const normalizedWindow = Array.isArray(conversationWindow) ? conversationWindow : [];
|
|
@@ -463,47 +609,37 @@ async function inferStructuredLessonLLM(conversationWindow, signal, context) {
|
|
|
463
609
|
.join('\n')
|
|
464
610
|
.slice(0, 4000);
|
|
465
611
|
|
|
466
|
-
const userPrompt =
|
|
467
|
-
`Signal: ${signal === 'positive' || signal === 'up' ? 'positive (thumbs up — something worked well)' : 'negative (thumbs down — something went wrong)'}`,
|
|
468
|
-
context ? `User context: ${context}` : '',
|
|
469
|
-
`\nConversation:\n${windowText}`,
|
|
470
|
-
].filter(Boolean).join('\n');
|
|
612
|
+
const userPrompt = buildLessonUserPrompt({ signal, context, windowText });
|
|
471
613
|
|
|
472
|
-
const
|
|
614
|
+
const parsed = await callClaudeJson({
|
|
473
615
|
systemPrompt: LLM_LESSON_SYSTEM_PROMPT,
|
|
474
616
|
userPrompt,
|
|
475
617
|
model: MODELS.FAST,
|
|
476
618
|
maxTokens: 512,
|
|
619
|
+
cache: true,
|
|
477
620
|
});
|
|
478
621
|
|
|
479
|
-
if (!
|
|
480
|
-
|
|
481
|
-
try {
|
|
482
|
-
const parsed = JSON.parse(raw);
|
|
483
|
-
if (!parsed.trigger || !parsed.action) return null;
|
|
622
|
+
if (!parsed || !parsed.trigger || !parsed.action) return null;
|
|
484
623
|
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
624
|
+
const filePaths = extractFilePaths(normalizedWindow);
|
|
625
|
+
const toolCalls = extractToolCalls(normalizedWindow);
|
|
626
|
+
const errorPatterns = extractErrors(normalizedWindow);
|
|
627
|
+
const userMessages = normalizedWindow.filter((m) => m.role === 'user');
|
|
628
|
+
const assistantMessages = normalizedWindow.filter((m) => m.role === 'assistant');
|
|
629
|
+
const lastUser = userMessages[userMessages.length - 1]?.content || '';
|
|
630
|
+
const lastAssistant = assistantMessages[assistantMessages.length - 1]?.content || '';
|
|
492
631
|
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
} catch {
|
|
505
|
-
return null;
|
|
506
|
-
}
|
|
632
|
+
return {
|
|
633
|
+
format: 'if-then-v1-llm',
|
|
634
|
+
trigger: parsed.trigger,
|
|
635
|
+
action: parsed.action,
|
|
636
|
+
signal: signal === 'positive' || signal === 'up' ? 'positive' : 'negative',
|
|
637
|
+
confidence: Math.max(0, Math.min(1, Number(parsed.confidence) || 0.5)),
|
|
638
|
+
scope: parsed.scope || inferScope(filePaths, toolCalls),
|
|
639
|
+
examples: [{ userIntent: lastUser.slice(0, 300), assistantAction: lastAssistant.slice(0, 300), outcome: signal === 'positive' || signal === 'up' ? 'approved' : 'rejected' }],
|
|
640
|
+
metadata: { toolsUsed: toolCalls, filesInvolved: filePaths.slice(0, 10), errorPatterns: errorPatterns.slice(0, 5), conversationLength: normalizedWindow.length, inferredAt: new Date().toISOString(), llmModel: MODELS.FAST },
|
|
641
|
+
tags: Array.isArray(parsed.tags) ? parsed.tags : [],
|
|
642
|
+
};
|
|
507
643
|
}
|
|
508
644
|
|
|
509
645
|
module.exports = {
|
|
@@ -515,4 +651,7 @@ module.exports = {
|
|
|
515
651
|
inferStructuredLesson, inferStructuredLessonLLM,
|
|
516
652
|
extractTrigger, extractAction, extractToolCalls,
|
|
517
653
|
extractFilePaths, extractErrors, calculateConfidence, inferScope,
|
|
654
|
+
// Exported for prompt-shape regression tests.
|
|
655
|
+
LLM_LESSON_SYSTEM_PROMPT, LLM_LESSON_MULTISHOT_EXAMPLES,
|
|
656
|
+
renderMultishotExamplesForPrompt, buildLessonUserPrompt,
|
|
518
657
|
};
|
package/scripts/lesson-search.js
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
const path = require('node:path');
|
|
4
4
|
const { readJSONL, getFeedbackPaths } = require('./feedback-loop');
|
|
5
|
+
const { loadOptionalModule } = require('./private-core-boundary');
|
|
5
6
|
|
|
6
7
|
const HIGH_RISK_TAGS = new Set([
|
|
7
8
|
'billing',
|
|
@@ -514,7 +515,9 @@ function searchLessons(query = '', options = {}) {
|
|
|
514
515
|
// Cross-encoder reranking: when a query is present, rerank the top-50 bi-encoder
|
|
515
516
|
// candidates using field-weighted BM25 so the most relevant lessons surface first.
|
|
516
517
|
if (query && results.length > 1) {
|
|
517
|
-
const { rerankLessons } =
|
|
518
|
+
const { rerankLessons } = loadOptionalModule('./lesson-reranker', () => ({
|
|
519
|
+
rerankLessons: (_query, pool) => pool,
|
|
520
|
+
}));
|
|
518
521
|
const pool = results.slice(0, 50);
|
|
519
522
|
const tail = results.slice(50);
|
|
520
523
|
const reranked = rerankLessons(query, pool, { topK: pool.length });
|
package/scripts/llm-client.js
CHANGED
|
@@ -10,6 +10,7 @@ const MODELS = {
|
|
|
10
10
|
|
|
11
11
|
const DEFAULT_MODEL = MODELS.FAST;
|
|
12
12
|
const DEFAULT_MAX_TOKENS = 1024;
|
|
13
|
+
const DEFAULT_CACHE_TTL = '5m';
|
|
13
14
|
|
|
14
15
|
let _client = null;
|
|
15
16
|
|
|
@@ -35,40 +36,170 @@ function stripCodeFences(text) {
|
|
|
35
36
|
return fenced ? fenced[1].trim() : text.trim();
|
|
36
37
|
}
|
|
37
38
|
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
39
|
+
/**
 * Normalize the caller-facing `cache` option into { mode, control }.
 *
 * Accepted inputs:
 *  - falsy                -> null (no caching)
 *  - true                 -> system-mode ephemeral cache with DEFAULT_CACHE_TTL
 *  - non-empty string     -> system-mode ephemeral cache using the string as ttl
 *  - object               -> { mode, type, ttl }; each field falls back to its
 *                            default unless it is a non-empty string
 *  - any other type       -> null
 *
 * @param {boolean|string|object|null|undefined} cache
 * @returns {{mode: string, control: {type: string, ttl: string}}|null}
 */
function normalizeCacheOptions(cache) {
  if (!cache) return null;

  const systemEphemeral = (ttl) => ({
    mode: 'system',
    control: { type: 'ephemeral', ttl },
  });

  if (cache === true) return systemEphemeral(DEFAULT_CACHE_TTL);

  switch (typeof cache) {
    case 'string':
      return systemEphemeral(cache);
    case 'object':
      break;
    default:
      return null;
  }

  const isNonEmptyString = (value) => typeof value === 'string' && value.length > 0;

  const ttl = isNonEmptyString(cache.ttl) ? cache.ttl : DEFAULT_CACHE_TTL;
  const type = isNonEmptyString(cache.type) ? cache.type : 'ephemeral';
  const mode = isNonEmptyString(cache.mode) ? cache.mode : 'system';

  return { mode, control: { type, ttl } };
}
|
|
67
|
+
|
|
68
|
+
/**
 * Shape the `system` request field, attaching a cache marker when the cache
 * mode covers the system prompt.
 *
 * @param {string} systemPrompt - System prompt text.
 * @param {{mode: string, control: object}|null} cacheOptions - Normalized cache options.
 * @returns {string|Array<object>|undefined} undefined when there is no prompt;
 *   the prompt unchanged when caching does not apply; otherwise a single text
 *   block carrying `cache_control`.
 */
function applyCacheToSystem(systemPrompt, cacheOptions) {
  if (!systemPrompt) return undefined;

  const cachesSystem = Boolean(cacheOptions)
    && ['system', 'tools+system'].includes(cacheOptions.mode);
  if (!cachesSystem) return systemPrompt;

  const block = {
    type: 'text',
    text: systemPrompt,
    cache_control: cacheOptions.control,
  };
  return [block];
}
|
|
75
|
+
|
|
76
|
+
/**
 * Shape the `tools` request field, attaching a cache marker when the cache
 * mode covers tool definitions.
 *
 * Only the LAST tool is marked. A cache breakpoint on the final tool caches
 * the whole tool-definition prefix, and the API caps the number of
 * `cache_control` breakpoints per request, so marking every tool would make
 * requests with many tools fail. Tools are never mutated, and a
 * caller-supplied `cache_control` is left untouched.
 *
 * @param {Array<object>} tools - Tool definitions.
 * @param {{mode: string, control: object}|null} cacheOptions - Normalized cache options.
 * @returns {Array<object>|undefined} undefined when there are no tools; the
 *   same array when caching does not apply; otherwise a new array whose last
 *   element carries `cache_control`.
 */
function applyCacheToTools(tools, cacheOptions) {
  if (!Array.isArray(tools) || tools.length === 0) return undefined;
  if (!cacheOptions || (cacheOptions.mode !== 'tools' && cacheOptions.mode !== 'tools+system')) {
    return tools;
  }

  const lastIndex = tools.length - 1;
  return tools.map((tool, index) => {
    if (index !== lastIndex) return tool;
    // Leave malformed entries and caller-chosen markers as they are.
    if (!tool || typeof tool !== 'object' || tool.cache_control) return tool;
    return { ...tool, cache_control: cacheOptions.control };
  });
}
|
|
86
|
+
|
|
87
|
+
/**
 * Assemble the request body passed to `client.messages.create`.
 *
 * @param {object} [params]
 * @param {string} [params.systemPrompt] - System prompt; omitted from the request when falsy.
 * @param {string} [params.userPrompt] - Used as a single user message when `messages` is empty.
 * @param {Array<object>} [params.messages] - Explicit messages; take precedence when non-empty.
 * @param {string} [params.model] - Falls back to DEFAULT_MODEL.
 * @param {number} [params.maxTokens] - Falls back to DEFAULT_MAX_TOKENS.
 * @param {boolean|string|object} [params.cache] - See normalizeCacheOptions.
 * @param {Array<object>} [params.tools] - Tool definitions; omitted when empty.
 * @param {*} [params.toolChoice] - Copied to `tool_choice` when truthy.
 * @param {object} [params.metadata] - Copied when it is a non-null object.
 * @param {number} [params.temperature] - Copied when it is a finite number.
 * @returns {object} Request body; optional keys appear only when set.
 */
function buildClaudeRequest({
  systemPrompt,
  userPrompt,
  messages,
  model,
  maxTokens,
  cache,
  tools,
  toolChoice,
  metadata,
  temperature,
} = {}) {
  const cacheOptions = normalizeCacheOptions(cache);
  const hasMessages = Array.isArray(messages) && messages.length > 0;
  const system = applyCacheToSystem(systemPrompt, cacheOptions);
  const preparedTools = applyCacheToTools(tools, cacheOptions);
  const requestLevelCache = Boolean(cacheOptions) && cacheOptions.mode === 'request';

  // Conditional spreads keep the same key order as assigning each optional
  // field in turn.
  return {
    model: model || DEFAULT_MODEL,
    max_tokens: maxTokens || DEFAULT_MAX_TOKENS,
    messages: hasMessages ? messages : [{ role: 'user', content: userPrompt }],
    ...(system ? { system } : {}),
    ...(preparedTools ? { tools: preparedTools } : {}),
    ...(toolChoice ? { tool_choice: toolChoice } : {}),
    ...(metadata && typeof metadata === 'object' ? { metadata } : {}),
    ...(Number.isFinite(temperature) ? { temperature } : {}),
    ...(requestLevelCache ? { cache_control: cacheOptions.control } : {}),
  };
}
|
|
124
|
+
|
|
125
|
+
/**
 * Concatenate the text of every `type: 'text'` content block in a response.
 *
 * @param {{content?: Array<{type: string, text?: string}>}|null|undefined} response
 * @returns {string} Joined text with no separator; '' when there is no content.
 */
function extractTextContent(response) {
  const blocks = response?.content || [];
  const pieces = [];
  blocks.forEach((block) => {
    if (block.type === 'text') pieces.push(block.text);
  });
  return pieces.join('');
}
|
|
131
|
+
|
|
132
|
+
/**
 * Parse model output as JSON after passing it through stripCodeFences.
 *
 * @param {*} text - Raw model output.
 * @returns {*} The parsed value, or null when `text` is not a string or
 *   parsing fails for any reason.
 */
function parseClaudeJson(text) {
  if (typeof text !== 'string') return null;

  let parsed;
  try {
    parsed = JSON.parse(stripCodeFences(text));
  } catch {
    // Invalid JSON is an expected outcome here; report it as null.
    parsed = null;
  }
  return parsed;
}
|
|
140
|
+
|
|
141
|
+
/**
 * Send one Messages API request and return the fence-stripped text together
 * with response metadata.
 *
 * @param {object} [options] - Passed to buildClaudeRequest; `options.model`
 *   is also used as a fallback for the returned `model`.
 * @returns {Promise<{text: string, usage: *, stopReason: *, id: *, model: *}|null>}
 *   null when no client is available or when the request or response
 *   handling throws.
 */
async function callClaudeInternal(options = {}) {
  const anthropic = getClient();
  if (!anthropic) return null;

  const stepConfig = {
    retries: 2,
    logger: (msg) => console.warn(msg),
  };

  try {
    const sendRequest = async () => anthropic.messages.create(buildClaudeRequest(options));
    const response = await runStep('llm.callClaude', stepConfig, sendRequest);

    return {
      text: stripCodeFences(extractTextContent(response)),
      usage: response?.usage || null,
      stopReason: response?.stop_reason || null,
      id: response?.id || null,
      model: response?.model || options.model || DEFAULT_MODEL,
    };
  } catch {
    // Contract: failure is reported as null, never as an exception.
    return null;
  }
}
|
|
73
163
|
|
|
74
|
-
|
|
164
|
+
// Anthropic SDK throws errors with a `.status` field for HTTP failures.
|
|
165
|
+
// Our defaultClassify already reads `.status`, so 429/5xx retry and 4xx
|
|
166
|
+
// (bad request / unauthorized / not-found) bail immediately — which is
|
|
167
|
+
// what we want: there is no point retrying a malformed prompt or a
|
|
168
|
+
// revoked API key.
|
|
169
|
+
/**
 * Call Claude and return the response text, or the full metadata object when
 * `options.returnMetadata` is truthy.
 *
 * @param {object} [options] - Request options forwarded to callClaudeInternal.
 * @param {boolean} [options.returnMetadata] - Return the whole result object
 *   instead of just its text.
 * @returns {Promise<string|object|null>} null when the underlying call yields
 *   no result.
 */
async function callClaude(options = {}) {
  const outcome = await callClaudeInternal(options);
  if (!outcome) return null;

  if (options.returnMetadata) {
    return outcome;
  }
  return outcome.text;
}
|
|
174
|
+
|
|
175
|
+
/**
 * Call Claude and parse the response text as JSON.
 *
 * @param {object} [options] - Request options forwarded to callClaudeInternal.
 * @param {boolean} [options.returnMetadata] - Return
 *   { parsed, text, usage, stopReason, id, model } instead of the parsed value alone.
 * @returns {Promise<*|null>} The parsed JSON value (or the metadata wrapper);
 *   null when the call yields no result or the text does not parse to a
 *   non-null value.
 */
async function callClaudeJson(options = {}) {
  const outcome = await callClaudeInternal(options);
  if (!outcome) return null;

  const parsed = parseClaudeJson(outcome.text);
  if (parsed === null) return null;

  if (!options.returnMetadata) return parsed;

  const { text, usage, stopReason, id, model } = outcome;
  return { parsed, text, usage, stopReason, id, model };
}
|
|
195
|
+
|
|
196
|
+
module.exports = {
|
|
197
|
+
isAvailable,
|
|
198
|
+
callClaude,
|
|
199
|
+
callClaudeJson,
|
|
200
|
+
stripCodeFences,
|
|
201
|
+
parseClaudeJson,
|
|
202
|
+
normalizeCacheOptions,
|
|
203
|
+
buildClaudeRequest,
|
|
204
|
+
MODELS,
|
|
205
|
+
};
|