thumbgate 1.16.12 → 1.16.19
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/marketplace.json +2 -2
- package/.claude-plugin/plugin.json +1 -1
- package/.well-known/mcp/server-card.json +1 -1
- package/README.md +3 -1
- package/adapters/claude/.mcp.json +2 -2
- package/adapters/mcp/server-stdio.js +26 -1
- package/adapters/opencode/opencode.json +1 -1
- package/bin/cli.js +420 -1
- package/config/gate-templates.json +372 -0
- package/config/mcp-allowlists.json +25 -0
- package/config/model-candidates.json +59 -2
- package/config/model-tiers.json +4 -1
- package/package.json +79 -22
- package/public/compare.html +6 -0
- package/public/index.html +144 -11
- package/public/numbers.html +11 -11
- package/public/pro.html +22 -24
- package/scripts/agent-design-governance.js +211 -0
- package/scripts/agent-reasoning-traces.js +683 -0
- package/scripts/agent-reward-model.js +438 -0
- package/scripts/agent-stack-survival-audit.js +231 -0
- package/scripts/ai-engineering-stack-guardrails.js +256 -0
- package/scripts/billing.js +16 -4
- package/scripts/chatgpt-ads-readiness-pack.js +195 -0
- package/scripts/cli-schema.js +277 -0
- package/scripts/code-graph-guardrails.js +176 -0
- package/scripts/deepseek-v4-runtime-guardrails.js +253 -0
- package/scripts/gemini-embedding-policy.js +198 -0
- package/scripts/inference-cache-policy.js +39 -0
- package/scripts/judge-reward-function.js +396 -0
- package/scripts/llm-behavior-monitor.js +251 -0
- package/scripts/long-running-agent-context-guardrails.js +176 -0
- package/scripts/multimodal-retrieval-plan.js +31 -11
- package/scripts/oss-pr-opportunity-scout.js +240 -0
- package/scripts/proactive-agent-eval-guardrails.js +230 -0
- package/scripts/profile-router.js +5 -4
- package/scripts/prompting-operating-system.js +273 -0
- package/scripts/proxy-pointer-rag-guardrails.js +189 -0
- package/scripts/rag-precision-guardrails.js +202 -0
- package/scripts/rate-limiter.js +1 -1
- package/scripts/reasoning-efficiency-guardrails.js +176 -0
- package/scripts/reward-hacking-guardrails.js +251 -0
- package/scripts/seo-gsd.js +1201 -11
- package/scripts/single-use-credential-gate.js +182 -0
- package/scripts/structured-prompt-driven.js +226 -0
- package/scripts/telemetry-analytics.js +31 -6
- package/scripts/tool-registry.js +92 -0
- package/scripts/upstream-contribution-engine.js +379 -0
- package/scripts/vector-store.js +119 -4
- package/src/api/server.js +333 -100
- package/scripts/agents-sdk-sandbox-plan.js +0 -57
- package/scripts/ai-org-governance.js +0 -98
- package/scripts/artifact-agent-plan.js +0 -81
- package/scripts/enterprise-agent-rollout.js +0 -34
- package/scripts/experience-replay-governance.js +0 -69
- package/scripts/inference-economics.js +0 -53
- package/scripts/knowledge-layer-plan.js +0 -108
- package/scripts/memory-store-governance.js +0 -60
- package/scripts/post-training-governance.js +0 -34
- package/scripts/production-agent-readiness.js +0 -40
- package/scripts/scaling-law-claims.js +0 -60
- package/scripts/student-consistent-training.js +0 -73
|
@@ -0,0 +1,683 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
'use strict';
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* Agent Reasoning Traces — observable trace analytics without storing raw CoT.
|
|
6
|
+
*
|
|
7
|
+
* This ingests Hermes/OpenTraces-style agent records or ThumbGate session
|
|
8
|
+
* events, redacts sensitive text, keeps only observable reasoning metadata,
|
|
9
|
+
* and turns trace shapes into gate candidates + eval-ready tuples.
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
const crypto = require('node:crypto');
|
|
13
|
+
const fs = require('node:fs');
|
|
14
|
+
const path = require('node:path');
|
|
15
|
+
const { readJsonl, appendJsonl } = require('./fs-utils');
|
|
16
|
+
const { resolveFeedbackDir } = require('./feedback-paths');
|
|
17
|
+
|
|
18
|
+
const TRACE_FILE = 'agent-reasoning-traces.jsonl';
|
|
19
|
+
const MAX_TEXT = 500;
|
|
20
|
+
|
|
21
|
+
/**
 * Ordered redaction rules applied to trace text by redactTraceText.
 *
 * Each rule is a global regex plus the placeholder that replaces every match.
 * Placeholders reuse the label set that detectRedactions looks for
 * (GITHUB_TOKEN, AWS_KEY, API_KEY, TOKEN, EMAIL).
 */
const SECRET_PATTERNS = [
  // GitHub tokens with the ghp_/gho_/ghu_/ghs_/ghr_ prefixes.
  { pattern: /\bgh[pousr]_\w{20,}\b/g, replacement: '[REDACTED_GITHUB_TOKEN]' },
  // GitHub fine-grained personal access tokens.
  { pattern: /\bgithub_pat_\w{20,}\b/g, replacement: '[REDACTED_GITHUB_TOKEN]' },
  // AWS access key ids.
  { pattern: /\bAKIA[0-9A-Z]{16}\b/g, replacement: '[REDACTED_AWS_KEY]' },
  // Underscore-style live/test keys (sk_live_…, rk_test_…, pk_live_…).
  { pattern: /\b(?:sk|rk|pk)_(?:live|test)_[A-Za-z0-9]{16,}\b/g, replacement: '[REDACTED_API_KEY]' },
  // Dash-style "sk-" provider keys (e.g. sk-…, sk-proj-…, sk-ant-…).
  { pattern: /\bsk-[A-Za-z0-9_-]{20,}/g, replacement: '[REDACTED_API_KEY]' },
  // Bearer credentials. No trailing \b so base64 "=" padding is consumed too.
  { pattern: /\bBearer\s+[-\w.~+/=]{20,}/gi, replacement: 'Bearer [REDACTED_TOKEN]' },
  // Email addresses.
  { pattern: /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/g, replacement: '[REDACTED_EMAIL]' },
];
|
|
28
|
+
|
|
29
|
+
/**
 * Expected trace shape per task type, keyed by task type name.
 *
 * - required:     event types whose absence costs the most shape score
 * - recommended:  event types whose absence costs a small amount
 * - forbidden:    event types whose presence is penalised
 * - maxErrorRate: error-rate ceiling before an extra penalty applies
 */
const SHAPE_PROFILES = {
  'code-change': {
    required: [
      'intent',
      'plan',
      'tool_call',
      'file_edit',
      'verification',
    ],
    recommended: ['commit_or_pr', 'evidence'],
    forbidden: ['auto_post'],
    maxErrorRate: 0.25,
  },

  'production-change': {
    required: [
      'intent',
      'plan',
      'tool_call',
      'verification',
      'evidence',
    ],
    recommended: ['rollback_path', 'commit_or_pr'],
    forbidden: ['claim_done_without_evidence'],
    maxErrorRate: 0.1,
  },

  'public-engagement': {
    required: [
      'intent',
      'audience_context',
      'draft',
      'approval_gate',
    ],
    recommended: ['evidence'],
    forbidden: ['auto_post'],
    maxErrorRate: 0.15,
  },

  research: {
    required: [
      'intent',
      'plan',
      'source_capture',
      'synthesis',
    ],
    recommended: ['citation', 'evidence'],
    forbidden: [],
    maxErrorRate: 0.2,
  },
};
|
|
55
|
+
|
|
56
|
+
/**
 * Base (un-normalised) credit magnitude per step event type.
 * normalizeStepMagnitudes falls back to 0.2 for event types not listed here.
 */
const RLSD_EVENT_MAGNITUDES = {
  // Observable outcome and evidence steps.
  verification: 1,
  evidence: 0.95,
  tool_response: 0.85,

  // Actions taken by the agent.
  file_edit: 0.8,
  tool_call: 0.75,
  approval_gate: 0.7,

  // Planning and sourcing.
  plan: 0.55,
  rollback_path: 0.55,
  source_capture: 0.55,
  synthesis: 0.5,
  commit_or_pr: 0.45,

  // Context and conversational steps.
  audience_context: 0.4,
  draft: 0.35,
  intent: 0.3,
  system_context: 0.2,
  assistant_message: 0.15,
  event: 0.1,
  reasoning: 0.1,

  // Unsafe events carry no magnitude.
  auto_post: 0,
  claim_done_without_evidence: 0,
};
|
|
78
|
+
|
|
79
|
+
/**
 * Ordered [eventType, pattern] pairs used to label a step from its text.
 * Order matters: classifyTextStep returns the first pair whose pattern matches.
 */
const STEP_TEXT_CLASSIFIERS = [
  [
    'plan',
    /\b(plan|steps|approach|first.*then|strategy)\b/i,
  ],
  [
    'file_edit',
    /\b(apply_patch|patch|diff|edited|write_file|file changed|created file)\b/i,
  ],
  [
    'verification',
    /\b(npm test|node --test|pytest|lint|ci passed|tests? passed|verified|verification)\b/i,
  ],
  [
    'commit_or_pr',
    /\b(commit|pull request|PR #|trunk merge|merge queue|pushed)\b/i,
  ],
  [
    'source_capture',
    /\b(source|citation|href|http|according to|dataset)\b/i,
  ],
  [
    'synthesis',
    /\b(summary|synthesis|therefore|recommendation)\b/i,
  ],
  [
    'draft',
    /\b(draft|reply copy|post text)\b/i,
  ],
  [
    'approval_gate',
    /\b(approval|human review|approved|do not auto-post|never auto-post)\b/i,
  ],
  [
    'audience_context',
    /\b(audience|prospect|comment|thread|bluesky|reddit|linkedin)\b/i,
  ],
  [
    'evidence',
    /\b(evidence|screenshot|log|run link|sha|health endpoint)\b/i,
  ],
  [
    'rollback_path',
    /\b(rollback|revert plan|fallback)\b/i,
  ],
  [
    'auto_post',
    /\b(auto-posted|posted automatically|sent without approval)\b/i,
  ],
];
|
|
93
|
+
|
|
94
|
+
/**
 * Build the path of the JSONL file that holds reasoning traces.
 *
 * @param {{feedbackDir?: string}} [options] - Optional directory override;
 *   when it is falsy, resolveFeedbackDir() supplies the directory.
 * @returns {string} Directory joined with TRACE_FILE.
 */
function getReasoningTracePath({ feedbackDir } = {}) {
  const baseDir = feedbackDir || resolveFeedbackDir();
  return path.join(baseDir, TRACE_FILE);
}
|
|
97
|
+
|
|
98
|
+
/**
 * Redact sensitive content from a piece of trace text and bound its length.
 *
 * Steps, in order: replace <think>/<analysis> blocks with a placeholder,
 * apply every SECRET_PATTERNS rule, collapse whitespace runs to single
 * spaces, then truncate with a trailing ellipsis.
 *
 * @param {*} value - Text to redact; null/undefined yield ''. Other values are
 *   converted with String().
 * @param {number} [maxLength=MAX_TEXT] - Maximum length of the returned text
 *   in UTF-16 code units, ellipsis included. Negative values are treated as 0;
 *   NaN falls back to MAX_TEXT; Infinity disables truncation.
 * @returns {string} Redacted, whitespace-normalised, length-bounded text.
 */
function redactTraceText(value, maxLength = MAX_TEXT) {
  if (value === undefined || value === null) return '';
  let text = String(value);

  // A block that was opened but never closed (e.g. cut-off model output) is
  // redacted through to the end of the text instead of being left in cleartext.
  text = text.replaceAll(/<think>[\s\S]*?(?:<\/think>|$)/gi, '[REDACTED_REASONING_TRACE]');
  text = text.replaceAll(/<analysis>[\s\S]*?(?:<\/analysis>|$)/gi, '[REDACTED_REASONING_TRACE]');

  for (const { pattern, replacement } of SECRET_PATTERNS) {
    text = text.replaceAll(pattern, replacement);
  }
  text = text.replaceAll(/\s+/g, ' ').trim();

  const requested = Number(maxLength);
  const limit = Number.isNaN(requested) ? MAX_TEXT : Math.max(0, Math.floor(requested));
  if (text.length <= limit) return text;
  if (limit === 0) return '';

  // Reserve one unit for the ellipsis and never cut a surrogate pair in half.
  let cut = limit - 1;
  if (cut > 0) {
    const lastUnit = text.charCodeAt(cut - 1);
    if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) cut -= 1;
  }
  return `${text.slice(0, cut)}…`;
}
|
|
109
|
+
|
|
110
|
+
/**
 * Short, stable fingerprint of a value.
 *
 * @param {*} value - Falsy values hash as the empty string; everything else is
 *   converted with String().
 * @returns {string} First 16 hex characters of the SHA-256 digest.
 */
function hashText(value) {
  const input = String(value || '');
  const digest = crypto.createHash('sha256').update(input).digest('hex');
  return digest.slice(0, 16);
}
|
|
113
|
+
|
|
114
|
+
/**
 * Pull the message/step list out of a raw agent record.
 *
 * The first of `steps`, `messages`, `conversation`, `conversations`, `trace`
 * that is an array wins. Otherwise a user/assistant pair is synthesised from
 * `prompt` and `response` (or `output`). With none of those, the result is [].
 *
 * @param {object} [record] - Raw agent record.
 * @returns {Array} Message-like entries, possibly empty.
 */
function extractMessages(record = {}) {
  const {
    steps,
    messages,
    conversation,
    conversations,
    trace,
    prompt,
    response,
    output,
  } = record;

  const listed = [steps, messages, conversation, conversations, trace]
    .find((candidate) => Array.isArray(candidate));
  if (listed) return listed;

  if (!(prompt || response || output)) return [];

  const synthesized = [];
  if (prompt) synthesized.push({ role: 'user', content: prompt });
  const reply = response || output;
  if (reply) synthesized.push({ role: 'assistant', content: reply });
  return synthesized;
}
|
|
133
|
+
|
|
134
|
+
/**
 * Convert a raw agent record into the normalized trace shape used by the
 * analytics in this module.
 *
 * @param {object} [record] - Raw record; messages are located by extractMessages.
 * @param {{taskType?: string, source?: string}} [options] - `taskType`
 *   overrides the record's/inferred task type; `source` is used only when the
 *   record names no source itself.
 * @returns {object} Normalized trace: ids, source, task type, outcome,
 *   normalized steps, metrics and a privacy summary.
 */
function normalizeAgentTraceRecord(record = {}, options = {}) {
  const steps = extractMessages(record)
    .map((message, index) => normalizeStep(message, index))
    .filter(Boolean);

  const taskType = options.taskType || record.taskType || inferTaskType(record, steps);

  // Only built when the record carries no identifier of its own.
  const generateTraceId = () => `trace_${Date.now()}_${hashText(JSON.stringify(record)).slice(0, 8)}`;
  const traceId = record.traceId || record.id || record.uuid || generateTraceId();

  const outcome = normalizeOutcome(record);
  const metrics = computeTraceMetrics(steps, record.metrics || record);

  let redactionsApplied = 0;
  let reasoningSignals = 0;
  for (const step of steps) {
    redactionsApplied += step.redactions.length;
    if (step.reasoning) reasoningSignals += 1;
  }

  return {
    traceId: String(traceId),
    source: record.source || record.dataset || record.source_dataset || options.source || 'local',
    taskType,
    model: record.model || record.agent?.model || record.metadata?.model || null,
    repository: record.repository || record.context?.repository || null,
    startedAt: record.startedAt || record.timestamp || null,
    finishedAt: record.finishedAt || null,
    outcome,
    steps,
    metrics,
    privacy: {
      rawReasoningStored: false,
      redactionsApplied,
      reasoningSignals,
    },
  };
}
|
|
159
|
+
|
|
160
|
+
/**
 * Flatten structured message content into plain text.
 *
 * Arrays are flattened part by part and joined with newlines. Objects
 * contribute their `text`, `content` or `value` field when present, otherwise
 * their JSON form. Nesting deeper than 8 levels contributes nothing, which
 * also guards against self-referencing content.
 */
function flattenStepContent(content, depth = 0) {
  if (content === null || content === undefined) return '';
  if (typeof content !== 'object') return String(content);
  if (depth > 8) return '';
  if (Array.isArray(content)) {
    return content
      .map((part) => flattenStepContent(part, depth + 1))
      .filter(Boolean)
      .join('\n');
  }
  const inner = content.text ?? content.content ?? content.value;
  if (inner !== null && inner !== undefined) return flattenStepContent(inner, depth + 1);
  try {
    return JSON.stringify(content) ?? '';
  } catch {
    return '';
  }
}

/**
 * Normalize one raw message into a trace step.
 *
 * Structured content (arrays or objects, as used by multi-part messages) is
 * flattened to text first; stringifying it directly would yield
 * "[object Object]" and make the redacted text, classification and hashes
 * meaningless. Primitive content is handled exactly as before.
 *
 * @param {object} [message] - Raw message; role is read from `role`/`from`/
 *   `type`, content from `content`/`value`/`text`/`reasoning_content`.
 * @param {number} [index=0] - Position of the message in its trace.
 * @returns {object} Step with role, eventType, redacted text, hashes,
 *   reasoning metadata (never the reasoning text), tool calls, error flag and
 *   redaction labels.
 */
function normalizeStep(message = {}, index = 0) {
  const role = String(message.role || message.from || message.type || 'unknown').toLowerCase();
  const rawContent = message.content ?? message.value ?? message.text ?? message.reasoning_content ?? '';
  const observable = typeof rawContent === 'object' ? flattenStepContent(rawContent) : rawContent;
  const redacted = redactTraceText(observable);

  const reasoningRaw = message.reasoning_content || message.reasoning || message.thought || message.analysis;
  const reasoningText = reasoningRaw && typeof reasoningRaw === 'object'
    ? flattenStepContent(reasoningRaw)
    : reasoningRaw;

  const toolCalls = extractToolCalls(message, redacted);
  const eventType = classifyStep({ role, content: redacted, toolCalls, message });
  const redactions = detectRedactions(redacted);

  return {
    index,
    role,
    eventType,
    // Reasoning steps never expose their text, even in redacted form.
    text: eventType === 'reasoning' ? '[REDACTED_REASONING_TRACE]' : redacted,
    textHash: hashText(observable),
    reasoning: reasoningRaw ? {
      present: true,
      charCount: String(reasoningText).length,
      hash: hashText(reasoningText),
    } : null,
    toolCalls,
    error: detectError(redacted, message),
    redactions,
  };
}
|
|
185
|
+
|
|
186
|
+
/**
 * Collect the tool calls attached to a message.
 *
 * Structured calls come from getRawToolCalls. Object entries contribute their
 * name and a hash of their arguments; string entries are taken as bare tool
 * names; null and other non-object entries are skipped instead of throwing.
 * If no structured call is found, a single call is inferred from text such as
 * "tool: name" / "function=name" / "command: name".
 *
 * @param {object} [message] - Raw message.
 * @param {string} [content] - Redacted message text used for the text fallback.
 * @returns {Array<{name: string, argumentsHash: (string|null)}>}
 */
function extractToolCalls(message = {}, content = '') {
  const calls = [];

  for (const call of getRawToolCalls(message)) {
    if (typeof call === 'string') {
      if (call) calls.push({ name: call, argumentsHash: null });
      continue;
    }
    if (call === null || typeof call !== 'object') continue;

    const fn = call.function || call;
    calls.push({
      name: String(fn.name || call.name || call.tool || 'unknown'),
      argumentsHash: hashText(JSON.stringify(fn.arguments || call.arguments || {})),
    });
  }

  if (calls.length > 0) return calls;

  const named = /\b(?:tool|function|command)\s*[:=]\s*([\w.:-]+)/i.exec(content);
  if (named) calls.push({ name: named[1], argumentsHash: null });
  return calls;
}
|
|
204
|
+
|
|
205
|
+
/**
 * Return the raw tool-call list of a message.
 *
 * Checks `tool_calls`, then `toolCalls`, then `tools`, and returns the first
 * one that is an array.
 *
 * @param {object} [message] - Raw message.
 * @returns {Array} The first array found, or [] when none is.
 */
function getRawToolCalls(message = {}) {
  const { tool_calls: snakeCase, toolCalls: camelCase, tools } = message;
  const found = [snakeCase, camelCase, tools].find((candidate) => Array.isArray(candidate));
  return found ?? [];
}
|
|
211
|
+
|
|
212
|
+
/**
 * Assign an event type to a step.
 *
 * Precedence: any reasoning field on the message → 'reasoning'; then the role
 * (user/system/tool/function); then attached tool calls; then the text
 * classifiers; then a completion claim without evidence wording; finally
 * 'assistant_message' for assistants and 'event' for everything else.
 *
 * @param {{role: string, content: string, toolCalls: Array, message: object}} step
 * @returns {string} Event type label.
 */
function classifyStep({ role, content, toolCalls, message }) {
  const carriesReasoning = Boolean(
    message.reasoning_content || message.reasoning || message.thought || message.analysis,
  );
  if (carriesReasoning) return 'reasoning';

  switch (role) {
    case 'user':
      return 'intent';
    case 'system':
      return 'system_context';
    case 'tool':
    case 'function':
      return 'tool_response';
    default:
      break;
  }

  if (toolCalls.length > 0) return 'tool_call';

  const text = String(content || '');
  const textLabel = classifyTextStep(text);
  if (textLabel) return textLabel;

  const claimsCompletion = /\b(done|deployed|live|shipped)\b/i.test(text);
  const mentionsProof = /\b(evidence|verified|health|sha)\b/i.test(text);
  if (claimsCompletion && !mentionsProof) {
    return 'claim_done_without_evidence';
  }

  if (role === 'assistant') return 'assistant_message';
  return 'event';
}
|
|
226
|
+
|
|
227
|
+
/**
 * Label text with the first matching entry of STEP_TEXT_CLASSIFIERS.
 *
 * @param {string} text - Step text to classify.
 * @returns {(string|null)} Matching event type, or null when nothing matches.
 */
function classifyTextStep(text) {
  for (const [eventType, pattern] of STEP_TEXT_CLASSIFIERS) {
    if (pattern.test(text)) return eventType;
  }
  return null;
}
|
|
231
|
+
|
|
232
|
+
/**
 * Decide whether a step represents an error.
 *
 * @param {string} content - Step text, scanned for error keywords.
 * @param {object} [message] - Raw message; a truthy `error` or
 *   `success === false` marks the step as an error regardless of its text.
 * @returns {boolean}
 */
function detectError(content, message = {}) {
  const flaggedByMessage = Boolean(message.error) || message.success === false;
  if (flaggedByMessage) return true;

  const text = String(content || '');
  return /\b(error|failed|exception|traceback|non-zero|blocked|denied)\b/i.test(text);
}
|
|
236
|
+
|
|
237
|
+
/**
 * List which redaction placeholders appear in already-redacted text.
 *
 * @param {string} content - Redacted text.
 * @returns {string[]} Lower-cased labels (e.g. 'email'), in fixed label order.
 */
function detectRedactions(content) {
  const labels = ['GITHUB_TOKEN', 'AWS_KEY', 'API_KEY', 'TOKEN', 'EMAIL', 'REASONING_TRACE'];
  return labels
    .filter((label) => content.includes(`[REDACTED_${label}]`))
    .map((label) => label.toLowerCase());
}
|
|
244
|
+
|
|
245
|
+
/**
 * Extract the outcome of a raw record.
 *
 * @param {object} [record] - Raw record; success is read from `success`,
 *   `outcome.success` or `reward.success`; reward from a numeric `reward` or
 *   `outcome.reward`; terminal state from `terminal_state`,
 *   `outcome.terminalState` or `outcome.terminal_state`.
 * @returns {{success: (boolean|null), reward: (number|null), terminalState: *}}
 *   Missing values are reported as null.
 */
function normalizeOutcome(record = {}) {
  const nested = record.outcome;

  const successFlag = record.success ?? nested?.success ?? record.reward?.success ?? null;
  const hasSuccess = successFlag !== null && successFlag !== undefined;

  let rewardValue;
  if (typeof record.reward === 'number') {
    rewardValue = record.reward;
  } else {
    rewardValue = nested?.reward ?? null;
  }

  const terminalState = record.terminal_state
    || nested?.terminalState
    || nested?.terminal_state
    || null;

  return {
    success: hasSuccess ? Boolean(successFlag) : null,
    reward: toFiniteNumberOrNull(rewardValue),
    terminalState,
  };
}
|
|
254
|
+
|
|
255
|
+
/**
 * Convert a value to a finite number, or null when it does not carry one.
 *
 * Blank strings and objects/arrays are rejected explicitly: Number('') and
 * Number([]) both coerce to 0, which would turn a missing reward into a real
 * reward of 0. Numbers, numeric strings and booleans convert as before.
 *
 * @param {*} value - Candidate value.
 * @returns {(number|null)} Finite number, or null.
 */
function toFiniteNumberOrNull(value) {
  if (value === null || value === undefined) return null;
  if (typeof value === 'object') return null;
  if (typeof value === 'string' && value.trim() === '') return null;
  const numeric = Number(value);
  return Number.isFinite(numeric) ? numeric : null;
}
|
|
260
|
+
|
|
261
|
+
/**
 * Guess the task type of a trace from its description and step text.
 *
 * Rules are tried in order (production, public engagement, research) and the
 * first keyword hit wins; with no hit the task type is 'code-change'.
 *
 * @param {object} [record] - Raw record; `task`, `category`, `description` are read.
 * @param {Array<{text: string}>} [steps] - Normalized steps.
 * @returns {string} Task type name.
 */
function inferTaskType(record = {}, steps = []) {
  const haystack = [record.task, record.category, record.description]
    .concat(steps.map((step) => step.text))
    .filter(Boolean)
    .join(' ');

  const rules = [
    ['production-change', /\b(deploy|production|railway|stripe|webhook|billing)\b/i],
    ['public-engagement', /\b(reply|post|comment|bluesky|linkedin|reddit|threads)\b/i],
    ['research', /\b(research|source|citation|dataset|paper|article)\b/i],
  ];

  for (const [taskType, keywords] of rules) {
    if (keywords.test(haystack)) return taskType;
  }
  return 'code-change';
}
|
|
273
|
+
|
|
274
|
+
/**
 * Summarise the steps of a trace into counts and cost figures.
 *
 * Token accounting: a reported total (`total_tokens`/`totalTokens`) is used as
 * is; only when no positive total is reported is it derived from
 * `total_input_tokens + total_output_tokens`. Adding output tokens on top of
 * a reported total would count them twice.
 *
 * @param {Array<object>} [steps] - Normalized steps (`error`, `toolCalls` are read).
 * @param {object} [rawMetrics] - Source of token and cost figures.
 * @returns {{totalSteps: number, toolCallCount: number,
 *   tools: Array<{name: string, count: number}>, errorSteps: number,
 *   errorRate: number, totalTokens: number, estimatedCostUsd: number}}
 */
function computeTraceMetrics(steps = [], rawMetrics = {}) {
  const toolUse = new Map();
  let errorSteps = 0;
  let toolCallCount = 0;

  for (const step of steps) {
    if (step.error) errorSteps += 1;
    for (const call of step.toolCalls || []) {
      toolUse.set(call.name, (toolUse.get(call.name) || 0) + 1);
      toolCallCount += 1;
    }
  }

  const reportedTotal = Number(rawMetrics.total_tokens || rawMetrics.totalTokens || 0);
  const summedTotal = Number(rawMetrics.total_input_tokens || 0)
    + Number(rawMetrics.total_output_tokens || 0);
  const totalTokens = reportedTotal > 0 ? reportedTotal : summedTotal;

  return {
    totalSteps: steps.length,
    toolCallCount,
    // Most-used tools first; ties keep first-seen order.
    tools: Array.from(toolUse.entries())
      .sort((a, b) => b[1] - a[1])
      .map(([name, count]) => ({ name, count })),
    errorSteps,
    errorRate: steps.length ? round(errorSteps / steps.length) : 0,
    totalTokens: Number.isFinite(totalTokens) ? totalTokens : 0,
    estimatedCostUsd: Number(rawMetrics.estimated_cost_usd || rawMetrics.estimatedCostUsd || 0) || 0,
  };
}
|
|
295
|
+
|
|
296
|
+
/**
 * Score how well a trace matches the expected shape for its task type.
 *
 * Starting from 100: -22 per missing required event, -6 per missing
 * recommended event, -35 per forbidden event present, -20 when the error rate
 * exceeds the profile ceiling; floored at 0. Verdict is 'healthy' from 85,
 * 'watch' from 60, otherwise 'gate'.
 *
 * Profiles are looked up as own properties only, so a task type such as
 * "constructor" or "toString" cannot resolve to an inherited Object member.
 * Unknown task types use the 'code-change' profile, taken from `profiles` or,
 * failing that, from SHAPE_PROFILES.
 *
 * @param {object} [trace] - Normalized trace (`taskType`, `steps`, `metrics`).
 * @param {object} [profiles=SHAPE_PROFILES] - Profile map keyed by task type.
 * @returns {object} Evaluation with score, verdict, missing/forbidden event
 *   lists, error-rate figures and the profile that was applied.
 */
function evaluateTraceShape(trace = {}, profiles = SHAPE_PROFILES) {
  const ownProfile = (name) => (
    profiles && Object.prototype.hasOwnProperty.call(profiles, name)
      ? profiles[name]
      : undefined
  );
  const profile = ownProfile(trace.taskType)
    || ownProfile('code-change')
    || SHAPE_PROFILES['code-change'];

  // Custom profiles may omit a list; treat that as "no constraint".
  const required = profile.required || [];
  const recommended = profile.recommended || [];
  const forbidden = profile.forbidden || [];

  const events = new Set((trace.steps || []).map((step) => step.eventType));
  const missingRequired = required.filter((event) => !events.has(event));
  const missingRecommended = recommended.filter((event) => !events.has(event));
  const forbiddenPresent = forbidden.filter((event) => events.has(event));

  const errorRate = trace.metrics?.errorRate || 0;
  const errorRateExceeded = errorRate > profile.maxErrorRate;

  const score = Math.max(0, 100
    - (missingRequired.length * 22)
    - (missingRecommended.length * 6)
    - (forbiddenPresent.length * 35)
    - (errorRateExceeded ? 20 : 0));

  let verdict = 'gate';
  if (score >= 85) verdict = 'healthy';
  else if (score >= 60) verdict = 'watch';

  return {
    traceId: trace.traceId,
    taskType: trace.taskType,
    score,
    verdict,
    missingRequired,
    missingRecommended,
    forbiddenPresent,
    errorRate,
    errorRateExceeded,
    expectedShape: profile,
  };
}
|
|
323
|
+
|
|
324
|
+
/**
 * Build the aggregate analytics report for a set of traces.
 *
 * Entries are used as is only when they already look normalized: a `steps`
 * array whose entries all carry a string `eventType`. Raw records can also
 * have a `steps` array (extractMessages reads it), so the mere presence of
 * `steps` is not enough; such records are normalized first.
 *
 * @param {Array<object>} [traces] - Raw records and/or normalized traces.
 * @param {object} [options] - Passed to normalizeAgentTraceRecord.
 * @returns {object} Report with counts, per-trace evaluations, gate
 *   candidates and eval tuples.
 */
function buildTraceAnalytics(traces = [], options = {}) {
  const isNormalized = (trace) => Array.isArray(trace?.steps)
    && trace.steps.every((step) => typeof step?.eventType === 'string')
    && (trace.steps.length > 0 || (trace.metrics !== null && trace.metrics !== undefined));

  const normalized = traces.map((trace) => (
    isNormalized(trace) ? trace : normalizeAgentTraceRecord(trace, options)
  ));
  const evaluations = normalized.map((trace) => evaluateTraceShape(trace));
  const shapeCounts = countBy(evaluations, 'verdict');
  const taskTypes = countBy(normalized, 'taskType');
  const toolCounts = new Map();
  const eventCounts = new Map();

  for (const trace of normalized) {
    // Hand-built traces may omit metrics; count no tools for them.
    for (const tool of trace.metrics?.tools ?? []) {
      toolCounts.set(tool.name, (toolCounts.get(tool.name) || 0) + tool.count);
    }
    for (const step of trace.steps) {
      eventCounts.set(step.eventType, (eventCounts.get(step.eventType) || 0) + 1);
    }
  }

  const scoreSum = evaluations.reduce((sum, item) => sum + item.score, 0);

  return {
    generatedAt: new Date().toISOString(),
    tracesAnalyzed: normalized.length,
    averageShapeScore: normalized.length ? round(scoreSum / normalized.length) : 0,
    shapeVerdicts: shapeCounts,
    taskTypes,
    topTools: sortedCounts(toolCounts),
    eventTypes: sortedCounts(eventCounts),
    evaluations,
    gateCandidates: buildTraceGateCandidates(evaluations),
    evalTuples: buildTraceEvalTuples(normalized, evaluations),
  };
}
|
|
350
|
+
|
|
351
|
+
/**
 * Group evaluation findings into ranked gate candidates.
 *
 * Every missing required event, forbidden event and exceeded error rate is
 * bucketed per task type. Each bucket gets a priority score (occurrences, plus
 * 2 when its average shape score is below 60), a slug id and a recommendation.
 *
 * @param {Array<object>} [evaluations] - Results of evaluateTraceShape.
 * @returns {Array<object>} Candidates sorted by priority, then occurrences.
 */
function buildTraceGateCandidates(evaluations = []) {
  const buckets = new Map();

  evaluations.forEach((evaluation) => {
    const { taskType } = evaluation;
    const keys = [
      ...evaluation.missingRequired.map((event) => `missing:${taskType}:${event}`),
      ...evaluation.forbiddenPresent.map((event) => `forbidden:${taskType}:${event}`),
    ];
    if (evaluation.errorRateExceeded) keys.push(`error-rate:${taskType}`);

    for (const key of keys) addGateBucket(buckets, key, evaluation);
  });

  const candidates = [];
  for (const bucket of buckets.values()) {
    const lowScoreBoost = bucket.averageScore < 60 ? 2 : 0;
    candidates.push({
      ...bucket,
      priorityScore: round(bucket.occurrences + lowScoreBoost),
      gateId: bucket.key.replaceAll(/[^a-z0-9]+/gi, '-').replaceAll(/^-|-$/g, '').toLowerCase(),
      recommendation: buildTraceGateRecommendation(bucket),
    });
  }

  candidates.sort((a, b) => b.priorityScore - a.priorityScore || b.occurrences - a.occurrences);
  return candidates;
}
|
|
374
|
+
|
|
375
|
+
/**
 * Record one evaluation in the bucket stored under `key`, creating the bucket
 * on first use.
 *
 * @param {Map<string, object>} buckets - Bucket registry, mutated in place.
 * @param {string} key - Bucket key.
 * @param {{score: number, traceId: *}} evaluation - Evaluation being counted.
 * @returns {void}
 */
function addGateBucket(buckets, key, evaluation) {
  let entry = buckets.get(key);
  if (!entry) {
    entry = {
      key,
      occurrences: 0,
      totalScore: 0,
      examples: [],
    };
  }

  entry.occurrences += 1;
  entry.totalScore += evaluation.score;
  entry.averageScore = round(entry.totalScore / entry.occurrences);

  // Keep at most five example trace ids per bucket.
  if (entry.examples.length < 5) {
    entry.examples.push(evaluation.traceId);
  }

  buckets.set(key, entry);
}
|
|
388
|
+
|
|
389
|
+
/**
 * Turn a gate bucket into a human-readable recommendation.
 *
 * Keys have the form "missing:<taskType>:<event>", "forbidden:<taskType>:<event>"
 * or "error-rate:<taskType>". The event is taken after the last colon so that
 * a task type which itself contains a colon is not truncated.
 *
 * @param {{key: string, occurrences: number, averageScore: number}} bucket
 * @returns {string} Recommendation sentence.
 */
function buildTraceGateRecommendation(bucket) {
  const { key } = bucket;

  // Split "<kind>:<taskType>:<event>" on its first and last colon.
  const splitKey = () => {
    const first = key.indexOf(':');
    const last = key.lastIndexOf(':');
    if (last <= first) return { taskType: key.slice(first + 1), event: undefined };
    return { taskType: key.slice(first + 1, last), event: key.slice(last + 1) };
  };

  if (key.startsWith('missing:')) {
    const { taskType, event } = splitKey();
    return `Require "${event}" before completing ${taskType} traces; ${bucket.occurrences} examples averaged shape score ${bucket.averageScore}.`;
  }
  if (key.startsWith('forbidden:')) {
    const { taskType, event } = splitKey();
    return `Block "${event}" in ${taskType} traces unless explicitly approved.`;
  }
  return `Escalate verification budget when ${key.replace('error-rate:', '')} trace error rate exceeds its profile threshold.`;
}
|
|
400
|
+
|
|
401
|
+
/**
 * Produce one eval-ready tuple per trace.
 *
 * The evaluation for a trace is taken from the same index when its traceId
 * matches (the layout buildTraceAnalytics passes in), then from an id lookup,
 * and is computed on the spot as a last resort. Looking up by id alone would
 * hand every trace that shares an id, or has none, the same evaluation.
 *
 * @param {Array<object>} [traces] - Normalized traces.
 * @param {Array<object>} [evaluations] - Evaluations, ideally index-aligned.
 * @returns {Array<object>} Tuples with prompt, input, expected action, reward
 *   (1 / 0 / -1) and metadata.
 */
function buildTraceEvalTuples(traces = [], evaluations = []) {
  const byId = new Map(evaluations.map((evaluation) => [evaluation.traceId, evaluation]));

  return traces.map((trace, index) => {
    const aligned = evaluations[index];
    const evaluation = aligned && aligned.traceId === trace.traceId
      ? aligned
      : byId.get(trace.traceId) || evaluateTraceShape(trace);

    let reward = -1;
    if (evaluation.score >= 85) reward = 1;
    else if (evaluation.score >= 60) reward = 0;

    return {
      id: trace.traceId,
      prompt: `Evaluate whether this ${trace.taskType} agent trace should be allowed to continue.`,
      input: {
        taskType: trace.taskType,
        events: trace.steps.map((step) => step.eventType),
        metrics: trace.metrics,
      },
      expected: evaluation.verdict === 'gate' ? 'block_or_escalate' : 'allow_with_checks',
      reward,
      metadata: {
        shapeScore: evaluation.score,
        verdict: evaluation.verdict,
        rawReasoningStored: false,
        source: trace.source,
      },
    };
  });
}
|
|
424
|
+
|
|
425
|
+
/**
 * Build per-step credit assignments for a set of traces.
 *
 * Entries are used as is only when they already look normalized: a `steps`
 * array whose entries all carry a string `eventType`. Raw records with a
 * `steps` array are normalized first, because their steps have no event types
 * to assign credit to.
 *
 * @param {Array<object>} [traces] - Raw records and/or normalized traces.
 * @param {object} [options] - Passed to normalizeAgentTraceRecord and
 *   buildRlsdCreditAssignment.
 * @returns {object} Summary counts, per-trace assignments and recommendations.
 */
function buildRlsdCreditAssignments(traces = [], options = {}) {
  const isNormalized = (trace) => Array.isArray(trace?.steps)
    && trace.steps.every((step) => typeof step?.eventType === 'string')
    && (trace.steps.length > 0 || (trace.metrics !== null && trace.metrics !== undefined));

  const normalized = traces.map((trace) => (
    isNormalized(trace) ? trace : normalizeAgentTraceRecord(trace, options)
  ));
  const evaluations = normalized.map((trace) => evaluateTraceShape(trace));
  const assignments = normalized.map(
    (trace, index) => buildRlsdCreditAssignment(trace, evaluations[index], options),
  );
  const eligible = assignments.filter((assignment) => assignment.eligible);
  const denseStepTotal = eligible.reduce((sum, assignment) => sum + assignment.stepCredits.length, 0);

  return {
    mode: 'rlsd_credit_assignment',
    generatedAt: new Date().toISOString(),
    tracesAnalyzed: assignments.length,
    eligibleTraces: eligible.length,
    ineligibleTraces: assignments.length - eligible.length,
    averageDenseSteps: eligible.length ? round(denseStepTotal / eligible.length) : 0,
    assignments,
    recommendations: buildRlsdRecommendations(assignments),
  };
}
|
|
443
|
+
|
|
444
|
+
/**
 * Build the credit assignment for a single trace.
 *
 * The sign of each step reward comes from resolveVerifiableRewardDirection,
 * the size from normalizeStepMagnitudes. Signed rewards are non-zero only when
 * the direction source is 'verifiable_outcome'.
 *
 * @param {object} [trace] - Normalized trace.
 * @param {object} [evaluation] - Shape evaluation; computed when omitted.
 * @param {object} [options] - Passed to resolveVerifiableRewardDirection.
 * @returns {object} Assignment with direction details and per-step credits.
 */
function buildRlsdCreditAssignment(trace = {}, evaluation = evaluateTraceShape(trace), options = {}) {
  const steps = trace.steps || [];
  const direction = resolveVerifiableRewardDirection(trace, evaluation, options);
  const weights = normalizeStepMagnitudes(steps, evaluation);
  const isVerifiable = direction.source === 'verifiable_outcome';

  const stepCredits = steps.map((step, position) => {
    const magnitude = weights[position] || 0;
    return {
      index: step.index,
      role: step.role,
      eventType: step.eventType,
      toolCalls: (step.toolCalls || []).map((call) => call.name),
      magnitude,
      signedReward: isVerifiable ? round(magnitude * direction.value) : 0,
      reason: buildStepCreditReason(step, evaluation),
    };
  });

  return {
    traceId: trace.traceId,
    taskType: trace.taskType,
    eligible: isVerifiable,
    direction: direction.label,
    directionValue: direction.value,
    directionSource: direction.source,
    finalReward: direction.finalReward,
    magnitudeSource: 'observable_step_shape',
    leakageGuard: 'self-teacher scores magnitude only; final verifiable reward controls direction',
    privacy: {
      rawReasoningStored: false,
      rawPrivilegedContextStored: false,
    },
    stepCredits,
  };
}
|
|
474
|
+
|
|
475
|
+
/**
 * Decide which way (reinforce / penalize / neutral) a trace's credit points.
 *
 * Precedence: a finite numeric `outcome.reward` (clamped to [-1, 1]); then a
 * boolean `outcome.success`; then, only with `options.allowShapeFallback`, the
 * shape verdict. Otherwise the trace is reported as not verifiable.
 *
 * @param {object} [trace] - Normalized trace.
 * @param {object} [evaluation] - Shape evaluation, used by the fallback.
 * @param {{allowShapeFallback?: boolean}} [options]
 * @returns {{label: string, value: number, finalReward: (number|null), source: string}}
 */
function resolveVerifiableRewardDirection(trace = {}, evaluation = {}, options = {}) {
  const { reward, success } = trace.outcome || {};

  const hasNumericReward = reward !== null
    && reward !== undefined
    && Number.isFinite(Number(reward));
  if (hasNumericReward) {
    return verifiableRewardDirection(clamp(Number(reward), -1, 1));
  }

  if (typeof success === 'boolean') {
    return verifiableRewardDirection(success ? 1 : -1);
  }

  if (options.allowShapeFallback) {
    return shapeFallbackDirection(evaluation);
  }

  return {
    label: 'preference_pipeline_required',
    value: 0,
    finalReward: null,
    source: 'not_verifiable',
  };
}
|
|
493
|
+
|
|
494
|
+
/**
 * Describe the direction implied by a verifiable reward.
 *
 * @param {number} reward - Final reward.
 * @returns {{label: string, value: number, finalReward: number, source: string}}
 *   `value` is the sign of the reward; `finalReward` is the reward itself.
 */
function verifiableRewardDirection(reward) {
  const sign = Math.sign(reward);
  const label = rewardDirectionLabel(sign);
  return {
    label,
    value: sign,
    finalReward: reward,
    source: 'verifiable_outcome',
  };
}
|
|
503
|
+
|
|
504
|
+
/**
 * Derive a direction from the shape verdict alone.
 *
 * @param {{verdict?: string}} [evaluation] - Shape evaluation.
 * @returns {{label: string, value: number, finalReward: number, source: string}}
 *   -1 for a 'gate' verdict, +1 for anything else.
 */
function shapeFallbackDirection(evaluation = {}) {
  const isGated = evaluation.verdict === 'gate';
  const direction = isGated ? -1 : 1;
  return {
    label: rewardDirectionLabel(direction),
    value: direction,
    finalReward: direction,
    source: 'shape_fallback',
  };
}
|
|
513
|
+
|
|
514
|
+
/**
 * Name a signed direction value.
 *
 * @param {number} value - Direction value.
 * @returns {string} 'reinforce' when positive, 'penalize' when negative,
 *   'neutral' otherwise.
 */
function rewardDirectionLabel(value) {
  const isPositive = value > 0;
  const isNegative = value < 0;
  if (!isPositive && !isNegative) return 'neutral';
  return isPositive ? 'reinforce' : 'penalize';
}
|
|
519
|
+
|
|
520
|
+
/**
 * Compute the relative credit weight of every step in a trace.
 *
 * Each step starts from its RLSD_EVENT_MAGNITUDES entry (0.2 when unlisted),
 * is scaled by 1.25 when it carries an error and by 0.75 when its event type
 * is listed as missing-required, and is zeroed when its event type is listed
 * as forbidden. Weights are then divided by their sum and rounded.
 *
 * @param {Array<object>} [steps] - Normalized steps.
 * @param {{missingRequired?: string[], forbiddenPresent?: string[]}} [evaluation]
 * @returns {number[]} One weight per step; all zeros when the sum is zero.
 */
function normalizeStepMagnitudes(steps = [], evaluation = {}) {
  const missing = evaluation.missingRequired || [];
  const forbidden = evaluation.forbiddenPresent || [];

  const rawWeights = [];
  let total = 0;
  for (const step of steps) {
    let weight = 0;
    if (!forbidden.includes(step.eventType)) {
      weight = RLSD_EVENT_MAGNITUDES[step.eventType] ?? 0.2;
      if (step.error) weight *= 1.25;
      if (missing.includes(step.eventType)) weight *= 0.75;
      weight = Math.max(0, weight);
    }
    rawWeights.push(weight);
    total += weight;
  }

  if (!total) return steps.map(() => 0);
  return rawWeights.map((weight) => round(weight / total));
}
|
|
532
|
+
|
|
533
|
+
/**
 * Explain why a step received its credit magnitude.
 *
 * The first matching rule wins: forbidden event, error-bearing step,
 * outcome/evidence step, unsafe completion claim, then the generic reason.
 *
 * @param {{eventType?: string, error?: boolean}} [step] - Normalized step.
 * @param {{forbiddenPresent?: string[]}} [evaluation] - Shape evaluation.
 * @returns {string} Reason sentence.
 */
function buildStepCreditReason(step = {}, evaluation = {}) {
  const { eventType, error } = step;
  const forbidden = evaluation.forbiddenPresent || [];
  const outcomeEvents = ['verification', 'evidence', 'tool_response'];
  const unsafeEvents = ['auto_post', 'claim_done_without_evidence'];

  let reason = 'Magnitude is based on observable trace role, not hidden reasoning text.';
  if (forbidden.includes(eventType)) {
    reason = 'Forbidden event receives no magnitude credit.';
  } else if (error) {
    reason = 'Error-bearing step receives higher credit/blame magnitude for targeted correction.';
  } else if (outcomeEvents.includes(eventType)) {
    reason = 'Observable outcome/evidence step receives high magnitude credit.';
  } else if (unsafeEvents.includes(eventType)) {
    reason = 'Unsafe completion claim receives no positive magnitude credit.';
  }
  return reason;
}
|
|
548
|
+
|
|
549
|
+
/**
 * Derive follow-up recommendations from a set of credit assignments.
 *
 * @param {Array<object>} [assignments] - Results of buildRlsdCreditAssignment.
 * @returns {string[]} One sentence per triggered rule, or a single
 *   "ready for export" sentence when no rule triggers.
 */
function buildRlsdRecommendations(assignments = []) {
  const anyStep = (predicate) => assignments.some(
    (assignment) => assignment.stepCredits.some(predicate),
  );

  const hasIneligible = assignments.filter((assignment) => !assignment.eligible).length > 0;
  const hasUnbackedClaim = anyStep((step) => step.eventType === 'claim_done_without_evidence');
  const hasErrorStep = anyStep((step) => step.reason.includes('Error-bearing'));

  const rules = [
    [hasIneligible, 'Route traces without compiler/test/schema/billing/verifier outcomes to preference-based evaluation before RLSD export.'],
    [hasUnbackedClaim, 'Promote done-without-evidence steps into pre-action gates before using these traces for training.'],
    [hasErrorStep, 'Use error-bearing step magnitudes for targeted correction instead of penalizing the whole trace uniformly.'],
  ];

  const recommendations = rules
    .filter(([triggered]) => triggered)
    .map(([, sentence]) => sentence);

  if (recommendations.length === 0) {
    recommendations.push('RLSD tuples are ready for small-batch export: verifiable direction is separated from dense step magnitude.');
  }
  return recommendations;
}
|
|
566
|
+
|
|
567
|
+
/**
 * Render an analytics report as Markdown text.
 *
 * @param {object} [report] - Report as produced by buildTraceAnalytics.
 * @returns {string} Markdown with the headline figures, shape verdict counts,
 *   up to five gate candidates and a privacy note.
 */
function formatTraceAnalyticsReport(report = {}) {
  const verdictLines = sortedObjectCounts(report.shapeVerdicts || {})
    .map((item) => `- ${item.key}: ${item.count}`);

  const candidateLines = (report.gateCandidates || [])
    .slice(0, 5)
    .map((candidate) => `- ${candidate.gateId}: ${candidate.recommendation}`);
  if (!report.gateCandidates?.length) {
    candidateLines.push('- None: trace shapes are currently healthy.');
  }

  const sections = [
    '# Agent Reasoning Trace Intelligence',
    '',
    `Generated: ${report.generatedAt}`,
    `Traces analyzed: ${report.tracesAnalyzed}`,
    `Average shape score: ${report.averageShapeScore}`,
    '',
    '## Shape Verdicts',
    '',
    ...verdictLines,
    '',
    '## Top Gate Candidates',
    '',
    ...candidateLines,
    '',
    'Privacy: raw hidden reasoning is not stored; only hashes, event labels, and redacted observable text are retained.',
    '',
  ];

  return `${sections.join('\n')}\n`;
}
|
|
590
|
+
|
|
591
|
+
/**
 * Append one trace to the reasoning trace file.
 *
 * A trace is stored as is only when it already looks normalized: a `steps`
 * array whose entries all carry a string `eventType`. Raw records can have a
 * `steps` array too; storing those untouched would write unredacted message
 * text to disk, so they go through normalizeAgentTraceRecord first.
 *
 * @param {object} trace - Raw record or normalized trace.
 * @param {object} [options] - Passed to normalizeAgentTraceRecord and
 *   getReasoningTracePath.
 * @returns {object} The normalized trace that was appended.
 */
function recordReasoningTrace(trace, options = {}) {
  const alreadyNormalized = Array.isArray(trace?.steps)
    && trace.steps.every((step) => typeof step?.eventType === 'string')
    && (trace.steps.length > 0 || (trace.metrics !== null && trace.metrics !== undefined));

  const normalized = alreadyNormalized
    ? trace
    : normalizeAgentTraceRecord(trace ?? {}, options);

  appendJsonl(getReasoningTracePath(options), normalized);
  return normalized;
}
|
|
596
|
+
|
|
597
|
+
/**
 * Load traces from a JSONL file and return them in normalized form.
 *
 * Entries that already look normalized (a `steps` array whose entries all
 * carry a string `eventType`) are returned untouched; everything else,
 * including raw records that merely have a `steps` array, is normalized.
 *
 * @param {{inputPath?: string, feedbackDir?: string}} [options] - `inputPath`
 *   selects the file to read; otherwise getReasoningTracePath(options) is used.
 *   The options are also passed to normalizeAgentTraceRecord.
 * @returns {Array<object>} Normalized traces.
 */
function loadReasoningTraces(options = {}) {
  const inputPath = options.inputPath
    ? path.resolve(options.inputPath)
    : getReasoningTracePath(options);

  const isNormalized = (trace) => Array.isArray(trace?.steps)
    && trace.steps.every((step) => typeof step?.eventType === 'string')
    && (trace.steps.length > 0 || (trace.metrics !== null && trace.metrics !== undefined));

  return readJsonl(inputPath).map((trace) => (
    isNormalized(trace) ? trace : normalizeAgentTraceRecord(trace, options)
  ));
}
|
|
601
|
+
|
|
602
|
+
/**
 * Count items by the value of one of their properties.
 *
 * Tallies are kept in a Map and converted to an object at the end. Counting
 * directly on a plain object breaks for values such as "constructor" or
 * "__proto__", which resolve to inherited members instead of a missing count.
 *
 * @param {Iterable<object>} items - Items to count.
 * @param {string} key - Property to group by; falsy values count as 'unknown'.
 * @returns {Object<string, number>} Occurrences per value.
 */
function countBy(items, key) {
  const tallies = new Map();
  for (const item of items) {
    const label = String(item[key] || 'unknown');
    tallies.set(label, (tallies.get(label) || 0) + 1);
  }
  // Object.fromEntries defines own properties, so no label can hit a setter.
  return Object.fromEntries(tallies);
}
|
|
610
|
+
|
|
611
|
+
/**
 * Convert a name → count Map into a list sorted by descending count.
 *
 * @param {Map<string, number>} map - Counts keyed by name.
 * @returns {Array<{name: string, count: number}>} Entries, highest count first.
 */
function sortedCounts(map) {
  const pairs = [...map.entries()];
  pairs.sort((left, right) => right[1] - left[1]);
  return pairs.map((pair) => ({ name: pair[0], count: pair[1] }));
}
|
|
614
|
+
|
|
615
|
+
/**
 * Convert a key → count object into a list sorted by descending count.
 *
 * @param {Object<string, number>} object - Counts keyed by label.
 * @returns {Array<{key: string, count: number}>} Entries, highest count first.
 */
function sortedObjectCounts(object) {
  const pairs = Object.entries(object);
  pairs.sort((left, right) => right[1] - left[1]);
  return pairs.map((pair) => ({ key: pair[0], count: pair[1] }));
}
|
|
618
|
+
|
|
619
|
+
/**
 * Round a number to three decimal places.
 *
 * @param {number} value - Value to round.
 * @returns {number} Rounded value, or 0 when `value` is not a finite number.
 */
function round(value) {
  return Number.isFinite(value) ? Math.round(value * 1000) / 1000 : 0;
}
|
|
623
|
+
|
|
624
|
+
/**
 * Restrict a number to the range [min, max].
 *
 * Only NaN falls back to `min`. Infinite values are clamped like any other
 * number, so +Infinity yields `max` rather than `min`.
 *
 * @param {number} value - Value to clamp.
 * @param {number} min - Lower bound.
 * @param {number} max - Upper bound.
 * @returns {number} Clamped value.
 */
function clamp(value, min, max) {
  if (typeof value !== 'number' || Number.isNaN(value)) return min;
  return Math.min(max, Math.max(min, value));
}
|
|
628
|
+
|
|
629
|
+
/**
 * Parse command-line arguments.
 *
 * The first argument is the command (default 'report'). Of the remaining
 * arguments only those starting with "--" are read: "--name=value" stores the
 * text after the first "=", a bare "--name" stores true.
 *
 * @param {string[]} [argv] - Arguments; defaults to process.argv.slice(2).
 * @returns {object} `{ command, ...flags }`.
 */
function parseArgs(argv = process.argv.slice(2)) {
  const parsed = { command: argv[0] || 'report' };

  argv
    .slice(1)
    .filter((token) => token.startsWith('--'))
    .forEach((token) => {
      const equalsAt = token.indexOf('=');
      const hasValue = equalsAt !== -1;
      const name = hasValue ? token.slice(2, equalsAt) : token.slice(2);
      parsed[name] = hasValue ? token.slice(equalsAt + 1) : true;
    });

  return parsed;
}
|
|
642
|
+
|
|
643
|
+
/**
 * Tell whether this file is the script being run.
 *
 * @param {string[]} [argv] - Process arguments; defaults to process.argv.
 * @returns {boolean} True when argv[1] resolves to this file's path.
 */
function isCliInvocation(argv = process.argv) {
  const scriptArg = argv[1];
  if (!scriptArg) return false;
  return path.resolve(scriptArg) === __filename;
}
|
|
646
|
+
|
|
647
|
+
// Command-line entry point: report | json | eval | rlsd | record.
//
// Traces are loaded only by the commands that analyse them. `record` reads
// the input file as raw JSONL itself, and an unknown command fails without
// touching any file.
if (isCliInvocation()) {
  const args = parseArgs();

  // A bare "--input" parses to `true`, which cannot be used as a path.
  if (args.input === true) {
    console.error('Missing value for --input. Use: --input=<path-to-jsonl>');
    process.exit(1);
  }

  const inputPath = args.input;
  const loadTraces = () => loadReasoningTraces({ inputPath });

  switch (args.command) {
    case 'json':
      console.log(JSON.stringify(buildTraceAnalytics(loadTraces()), null, 2));
      break;
    case 'eval':
      console.log(JSON.stringify(buildTraceAnalytics(loadTraces()).evalTuples, null, 2));
      break;
    case 'rlsd':
      console.log(JSON.stringify(buildRlsdCreditAssignments(loadTraces()), null, 2));
      break;
    case 'record': {
      const raw = inputPath ? fs.readFileSync(path.resolve(inputPath), 'utf8') : '';
      // Trim every line so blank or whitespace-only lines are skipped rather
      // than handed to JSON.parse.
      const parsed = raw
        .split('\n')
        .map((line) => line.trim())
        .filter(Boolean)
        .map((line) => JSON.parse(line));
      for (const record of parsed) recordReasoningTrace(record);
      console.log(JSON.stringify({ recorded: parsed.length }, null, 2));
      break;
    }
    case 'report':
      console.log(formatTraceAnalyticsReport(buildTraceAnalytics(loadTraces())));
      break;
    default:
      console.error(`Unknown command: ${args.command}. Use: report, json, eval, rlsd, record`);
      process.exit(1);
  }
}
|
|
669
|
+
|
|
670
|
+
module.exports = {
|
|
671
|
+
SHAPE_PROFILES,
|
|
672
|
+
buildTraceAnalytics,
|
|
673
|
+
buildTraceEvalTuples,
|
|
674
|
+
buildRlsdCreditAssignment,
|
|
675
|
+
buildRlsdCreditAssignments,
|
|
676
|
+
evaluateTraceShape,
|
|
677
|
+
formatTraceAnalyticsReport,
|
|
678
|
+
getReasoningTracePath,
|
|
679
|
+
loadReasoningTraces,
|
|
680
|
+
normalizeAgentTraceRecord,
|
|
681
|
+
recordReasoningTrace,
|
|
682
|
+
redactTraceText,
|
|
683
|
+
};
|