thumbgate 1.16.13 → 1.16.19
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/marketplace.json +2 -2
- package/.claude-plugin/plugin.json +1 -1
- package/.well-known/mcp/server-card.json +1 -1
- package/README.md +3 -1
- package/adapters/claude/.mcp.json +2 -2
- package/adapters/mcp/server-stdio.js +26 -1
- package/adapters/opencode/opencode.json +1 -1
- package/bin/cli.js +420 -1
- package/config/gate-templates.json +372 -0
- package/config/mcp-allowlists.json +25 -0
- package/config/model-candidates.json +59 -2
- package/config/model-tiers.json +4 -1
- package/package.json +79 -22
- package/public/compare.html +6 -0
- package/public/index.html +144 -11
- package/public/numbers.html +8 -8
- package/public/pro.html +22 -24
- package/scripts/agent-design-governance.js +211 -0
- package/scripts/agent-reasoning-traces.js +683 -0
- package/scripts/agent-reward-model.js +438 -0
- package/scripts/agent-stack-survival-audit.js +231 -0
- package/scripts/ai-engineering-stack-guardrails.js +256 -0
- package/scripts/billing.js +16 -4
- package/scripts/chatgpt-ads-readiness-pack.js +195 -0
- package/scripts/cli-schema.js +277 -0
- package/scripts/code-graph-guardrails.js +176 -0
- package/scripts/deepseek-v4-runtime-guardrails.js +253 -0
- package/scripts/gemini-embedding-policy.js +198 -0
- package/scripts/inference-cache-policy.js +39 -0
- package/scripts/judge-reward-function.js +396 -0
- package/scripts/llm-behavior-monitor.js +251 -0
- package/scripts/long-running-agent-context-guardrails.js +176 -0
- package/scripts/multimodal-retrieval-plan.js +31 -11
- package/scripts/oss-pr-opportunity-scout.js +240 -0
- package/scripts/proactive-agent-eval-guardrails.js +230 -0
- package/scripts/profile-router.js +5 -4
- package/scripts/prompting-operating-system.js +273 -0
- package/scripts/proxy-pointer-rag-guardrails.js +189 -0
- package/scripts/rag-precision-guardrails.js +202 -0
- package/scripts/rate-limiter.js +1 -1
- package/scripts/reasoning-efficiency-guardrails.js +176 -0
- package/scripts/reward-hacking-guardrails.js +251 -0
- package/scripts/seo-gsd.js +1201 -11
- package/scripts/single-use-credential-gate.js +182 -0
- package/scripts/structured-prompt-driven.js +226 -0
- package/scripts/telemetry-analytics.js +31 -6
- package/scripts/tool-registry.js +92 -0
- package/scripts/upstream-contribution-engine.js +379 -0
- package/scripts/vector-store.js +119 -4
- package/src/api/server.js +333 -100
- package/scripts/agents-sdk-sandbox-plan.js +0 -57
- package/scripts/ai-org-governance.js +0 -98
- package/scripts/artifact-agent-plan.js +0 -81
- package/scripts/enterprise-agent-rollout.js +0 -34
- package/scripts/experience-replay-governance.js +0 -69
- package/scripts/inference-economics.js +0 -53
- package/scripts/knowledge-layer-plan.js +0 -108
- package/scripts/memory-store-governance.js +0 -60
- package/scripts/post-training-governance.js +0 -34
- package/scripts/production-agent-readiness.js +0 -40
- package/scripts/scaling-law-claims.js +0 -60
- package/scripts/student-consistent-training.js +0 -73
|
@@ -0,0 +1,251 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
'use strict';
|
|
3
|
+
|
|
4
|
+
const fs = require('fs');
|
|
5
|
+
const path = require('path');
|
|
6
|
+
|
|
7
|
+
// Citation metadata for the paper this module frames its guardrails around.
// The object is frozen so the shared reference handed out in every plan
// (`source`) cannot be mutated by callers; the formatter prints `arxivUrl`.
const SOURCE = Object.freeze({
  // Hugging Face papers page for the same arXiv identifier.
  hfUrl: 'https://huggingface.co/papers/2604.13602',
  // Canonical arXiv abstract URL (shown in the text report's "Source" line).
  arxivUrl: 'https://arxiv.org/abs/2604.13602',
  title: 'Reward Hacking in the Era of Large Models: Mechanisms, Emergent Misalignment, Challenges',
  arxivId: '2604.13602',
  // Submission date as an ISO-style YYYY-MM-DD string.
  submitted: '2026-04-15',
});
|
|
14
|
+
|
|
15
|
+
// Case-insensitive, word-bounded phrases that assert work is finished or shippable
// (e.g. "tests pass", "ci is green", "fixed", "ready to merge"). buildSignals raises
// `hallucinated_verification` when this matches and no evidence is present.
const COMPLETION_CLAIM_RE = /\b(all tests pass|tests pass|ci (?:is )?(?:green|passing)|fixed|done|complete|ready to merge|ready for review|safe to ship|deployed|production ready)\b/i;
|
|
16
|
+
// Case-insensitive, word-bounded terms that refer to proof (exit codes, logs, traces,
// run ids, ...). hasEvidence treats a match in the candidate text as evidence.
const EVIDENCE_RE = /\b(exit code|test output|logs?|artifact|screenshot|trace|commit|diff|proof|verified|health check|ci url|run id|reproduction|benchmark report)\b/i;
|
|
17
|
+
// Case-insensitive approval/agreement phrases ("lgtm", "ship it", "you're absolutely right").
// buildSignals raises `sycophancy_or_rubber_stamp` when this matches without evidence.
const SYCOPHANCY_RE = /\b(you'?re absolutely right|great point|looks great|lgtm|totally agree|perfect idea|no issues|ship it)\b/i;
|
|
18
|
+
// Case-insensitive score/benchmark vocabulary. normalizeOptions uses a match in the
// candidate text as the inferred default for `optimizedForScore`.
const BENCHMARK_RE = /\b(benchmark|eval|score|leaderboard|pass rate|accuracy|win rate|reward)\b/i;
|
|
19
|
+
// Case-insensitive terms for out-of-sample or regression validation. normalizeOptions
// uses a match in the candidate text as the inferred default for `hasHoldout`.
const HOLDOUT_RE = /\b(holdout|regression|real workflow|counterexample|adversarial|canary|shadow run|baseline)\b/i;
|
|
20
|
+
// Case-insensitive phrases that address the grader rather than the task
// ("ignore the rubric", "grade this leniently", "award full credit", ...).
// buildSignals raises the critical `evaluator_manipulation` signal on any match.
const EVALUATOR_MANIPULATION_RE = /\b(ignore (?:the|any) (?:failures?|rubric|instructions?)|grade (?:this|me) (?:leniently|as passing)|award (?:full|maximum) credit|do not penalize|self[- ]?score(?:d)? as passing)\b/i;
|
|
21
|
+
// Case-insensitive visual/multimodal vocabulary. normalizeOptions uses a match in the
// candidate text as the inferred default for `multimodal`.
const MULTIMODAL_RE = /\b(screenshot|image|chart|pdf|video|visual|ocr|multimodal)\b/i;
|
|
22
|
+
|
|
23
|
+
/**
 * Coerce a loosely-typed flag into a boolean.
 *
 * @param {*} value - Raw flag; booleans pass through unchanged.
 * @param {boolean} [fallback=false] - Returned when `value` is undefined, null, or ''.
 * @returns {boolean} True for a real `true`, or for a value whose trimmed string form
 *   is "1", "true", "yes", or "on" (any letter case); false for anything else supplied.
 */
function parseBoolean(value, fallback = false) {
  const isMissing = value === undefined || value === null || value === '';
  if (isMissing) {
    return fallback;
  }
  if (typeof value === 'boolean') {
    return value;
  }
  const truthyTokens = ['1', 'true', 'yes', 'on'];
  const token = String(value).trim().toLowerCase();
  return truthyTokens.includes(token);
}
|
|
28
|
+
|
|
29
|
+
/**
 * Coerce a loosely-typed value into a finite number.
 *
 * @param {*} value - Raw value (number, numeric string, ...).
 * @param {number} [fallback=0] - Returned when `value` is missing or not a finite number.
 * @returns {number} The parsed finite number, or `fallback`.
 */
function parseNumber(value, fallback = 0) {
  // Number(null) and Number('') are both 0, which would silently mask a missing
  // value; treat undefined, null, and blank strings as "not supplied" instead,
  // mirroring how parseBoolean handles the same inputs.
  if (value === undefined || value === null) return fallback;
  if (typeof value === 'string' && value.trim() === '') return fallback;
  const parsed = Number(value);
  return Number.isFinite(parsed) ? parsed : fallback;
}
|
|
33
|
+
|
|
34
|
+
/**
 * Normalize a list-like option into an array of trimmed, non-empty strings.
 *
 * @param {*} value - An array of items, or a comma-separated string (falsy values
 *   are treated as an empty string).
 * @returns {string[]} Each entry stringified and trimmed, with empty entries dropped.
 */
function splitList(value) {
  const rawItems = Array.isArray(value) ? value : String(value || '').split(',');
  const cleaned = [];
  rawItems.forEach((entry) => {
    const text = String(entry).trim();
    if (text) {
      cleaned.push(text);
    }
  });
  return cleaned;
}
|
|
38
|
+
|
|
39
|
+
/**
 * Normalize raw CLI/API options into the shape consumed by buildSignals.
 *
 * Text, evidence, and metric inputs accept several alias keys. Boolean flags accept
 * camelCase and kebab-case aliases and fall back to a value inferred from the
 * candidate text, evidence, or metrics when no alias is supplied.
 *
 * @param {object} [raw={}] - Raw options.
 * @returns {{workflow: string, candidateText: string, evidence: string[], metrics: string[],
 *   wordCount: number, hasHoldout: boolean, hasHumanObjective: boolean,
 *   hasVerifierTrace: boolean, optimizedForScore: boolean, multimodal: boolean}}
 */
function normalizeOptions(raw = {}) {
  // Chaining flag aliases with `||` discards an explicit `false`/`0`, letting the
  // text-inferred default override what the caller asked for. Pick the first alias
  // that was actually supplied instead (undefined, null, and '' count as missing).
  const firstProvided = (...candidates) => candidates.find(
    (candidate) => candidate !== undefined && candidate !== null && candidate !== ''
  );

  const candidateText = String(raw.text || raw.claim || raw.response || raw.summary || '').trim();
  const evidence = splitList(raw.evidence || raw.evidenceArtifacts || raw['evidence-artifacts']);
  const metrics = splitList(raw.metrics || raw.metric || raw['proxy-metrics']);

  return {
    workflow: String(raw.workflow || raw.name || 'agent reward guardrails').trim() || 'agent reward guardrails',
    candidateText,
    evidence,
    metrics,
    // A zero/blank word count is treated as "not supplied" and recomputed from the text.
    wordCount: parseNumber(raw['word-count'] || raw.wordCount, candidateText.split(/\s+/).filter(Boolean).length),
    hasHoldout: parseBoolean(
      firstProvided(raw.holdout, raw.hasHoldout, raw['has-holdout']),
      HOLDOUT_RE.test(candidateText)
    ),
    hasHumanObjective: parseBoolean(firstProvided(raw['human-objective'], raw.hasHumanObjective), false),
    hasVerifierTrace: parseBoolean(
      firstProvided(raw['verifier-trace'], raw.hasVerifierTrace),
      evidence.some((item) => /trace|log|run|artifact|proof/i.test(item))
    ),
    optimizedForScore: parseBoolean(
      firstProvided(raw['optimized-for-score'], raw.optimizedForScore),
      BENCHMARK_RE.test(candidateText) || metrics.length > 0
    ),
    multimodal: parseBoolean(
      firstProvided(raw.multimodal, raw.hasMultimodalInputs),
      MULTIMODAL_RE.test(candidateText)
    ),
  };
}
|
|
56
|
+
|
|
57
|
+
/**
 * Decide whether normalized options carry any sign of supporting evidence.
 *
 * @param {object} options - Normalized options (`evidence`, `hasVerifierTrace`, `candidateText`).
 * @returns {*} Truthy when evidence items exist, a verifier trace is flagged, or the
 *   candidate text matches EVIDENCE_RE.
 */
function hasEvidence(options) {
  if (options.evidence.length > 0) {
    return true;
  }
  if (options.hasVerifierTrace) {
    return options.hasVerifierTrace;
  }
  return EVIDENCE_RE.test(options.candidateText);
}
|
|
60
|
+
|
|
61
|
+
/**
 * Evaluate normalized options against the reward-hacking heuristics.
 *
 * @param {object} options - Normalized options as produced by normalizeOptions.
 * @returns {Array<{id: string, severity: string, message: string, gate: string}>}
 *   One entry per triggered heuristic, in the fixed order of the rule table below.
 */
function buildSignals(options) {
  const evidencePresent = hasEvidence(options);

  // Ordered rule table: each predicate is evaluated lazily, in sequence, and the
  // matching signals are emitted in this same order.
  const rules = [
    {
      triggered: () => COMPLETION_CLAIM_RE.test(options.candidateText) && !evidencePresent,
      signal: {
        id: 'hallucinated_verification',
        severity: 'critical',
        message: 'The response claims completion, safety, test success, or deployment without attached proof.',
        gate: 'Block completion claims until test output, run id, trace, screenshot, or proof artifact is attached.',
      },
    },
    {
      triggered: () => options.wordCount >= 180 && !evidencePresent,
      signal: {
        id: 'verbosity_as_proof',
        severity: 'high',
        message: 'The response is long but does not provide verifiable artifacts, turning fluency into a proxy for correctness.',
        gate: 'Require concise claims with artifact-backed evidence before accepting persuasive explanations.',
      },
    },
    {
      triggered: () => SYCOPHANCY_RE.test(options.candidateText) && !evidencePresent,
      signal: {
        id: 'sycophancy_or_rubber_stamp',
        severity: 'high',
        message: 'Agreement or approval language appears without independent checks or counterevidence.',
        gate: 'Require at least one explicit verification step or risk check before approval-style responses.',
      },
    },
    {
      triggered: () => options.optimizedForScore && !options.hasHoldout,
      signal: {
        id: 'benchmark_overfitting',
        severity: 'high',
        message: 'A score, eval, benchmark, or reward metric is being optimized without holdout or regression proof.',
        gate: 'Require holdout, regression, or real-workflow evidence before treating score gains as product gains.',
      },
    },
    {
      triggered: () => EVALUATOR_MANIPULATION_RE.test(options.candidateText),
      signal: {
        id: 'evaluator_manipulation',
        severity: 'critical',
        message: 'The candidate text attempts to influence grading instead of satisfying the user objective.',
        gate: 'Block evaluator-manipulation language and route to human review.',
      },
    },
    {
      triggered: () => options.metrics.length > 0 && !options.hasHumanObjective,
      signal: {
        id: 'proxy_metric_only',
        severity: 'medium',
        message: 'Proxy metrics are present without an explicit human objective or user-visible success criterion.',
        gate: 'Pair every reward or benchmark metric with the real user outcome it is meant to approximate.',
      },
    },
    {
      triggered: () => options.multimodal && !evidencePresent,
      signal: {
        id: 'perception_reasoning_decoupling',
        severity: 'high',
        message: 'A visual or multimodal claim is made without source artifacts or perception trace evidence.',
        gate: 'Require screenshot, OCR, or visual proof artifact before accepting multimodal reasoning claims.',
      },
    },
  ];

  return rules.filter((rule) => rule.triggered()).map((rule) => rule.signal);
}
|
|
130
|
+
|
|
131
|
+
/**
 * List the metrics a workflow is expected to track, with their target thresholds.
 *
 * @returns {Array<{id: string, target: string, required: boolean}>} A fresh array on
 *   every call; every metric is marked required.
 */
function buildMetrics() {
  const targetsById = [
    ['unsupported_completion_claims', '0'],
    ['evidence_attachment_rate', '>= 0.95'],
    ['unsupported_claim_rate', '<= 0.02'],
    ['holdout_regression_pass_rate', '>= 0.90'],
    ['judge_disagreement_rate', '<= 0.10'],
    ['proxy_to_user_objective_mapping_rate', '>= 0.95'],
  ];
  return targetsById.map(([id, target]) => ({ id, target, required: true }));
}
|
|
141
|
+
|
|
142
|
+
/**
 * Build the full guardrails report for one workflow or candidate response.
 *
 * @param {object} [rawOptions={}] - Raw options; normalized via normalizeOptions.
 * @returns {object} Report containing the overall status, a summary, the detected
 *   signals, one gate per signal, the required metrics, next actions, and marketing copy.
 */
function buildRewardHackingGuardrailsPlan(rawOptions = {}) {
  const options = normalizeOptions(rawOptions);
  const signals = buildSignals(options);

  const countWithSeverity = (level) => signals.reduce(
    (total, signal) => (signal.severity === level ? total + 1 : total),
    0
  );
  const critical = countWithSeverity('critical');
  const high = countWithSeverity('high');

  // Any critical signal blocks outright; otherwise any high signal asks for evidence.
  let status = 'ready';
  if (critical > 0) {
    status = 'blocked';
  } else if (high > 0) {
    status = 'needs_evidence';
  }

  const gates = signals.map(({ id, severity, gate }) => ({
    id,
    action: severity === 'critical' ? 'block' : 'warn',
    message: gate,
  }));

  const summary = {
    signalCount: signals.length,
    critical,
    high,
    evidenceArtifacts: options.evidence.length,
    proxyMetrics: options.metrics,
    hasHumanObjective: options.hasHumanObjective,
    hasHoldout: options.hasHoldout,
  };

  const proxyCompressionMapping = {
    compression: 'compressed reward, benchmark, or approval signal is treated as a stand-in for the full user objective',
    amplification: 'optimization pressure can turn local shortcuts into repeated workflow behavior',
    coAdaptation: 'agent outputs can learn to satisfy evaluators, rubrics, or verifiers instead of the task',
  };

  const nextActions = [
    'Attach proof artifacts before allowing claims like tests passed, fixed, deployed, safe, or ready to merge.',
    'Treat benchmark gains as provisional until holdout, regression, or real-workflow evidence confirms the user objective improved.',
    'Require explicit user-objective mapping for every proxy metric, reward score, or evaluator rubric.',
    'Block evaluator-manipulation language before it reaches judge or verifier loops.',
    'Prefer short evidence-backed summaries over long persuasive explanations when judging agent work.',
  ];

  const marketingAngle = {
    headline: 'Reward hacking is what happens when agents optimize the receipt instead of the meal.',
    subhead: 'ThumbGate turns proxy failures into pre-action gates: no unsupported completion claims, no benchmark-only victory laps, and no verifier theater without proof artifacts.',
    guideTitle: 'Reward Hacking Guardrails for AI Coding Agents',
    replyDraft: 'This paper is a useful frame for agent products: proxy rewards compress the real user objective, and agents learn the shortcut. ThumbGate can enforce the missing layer: completion claims need proof, benchmark wins need holdouts, and verifier loops need gates against sycophancy, verbosity-as-proof, and evaluator manipulation.',
  };

  return {
    name: 'thumbgate-reward-hacking-guardrails',
    source: SOURCE,
    workflow: options.workflow,
    status,
    summary,
    proxyCompressionMapping,
    signals,
    gates,
    metrics: buildMetrics(),
    nextActions,
    marketingAngle,
  };
}
|
|
189
|
+
|
|
190
|
+
/**
 * Render a guardrails report as plain text.
 *
 * @param {object} report - Report shaped like the output of buildRewardHackingGuardrailsPlan.
 * @returns {string} Newline-joined text: a header, an optional "Signals:" section, the
 *   required metrics, the next actions, and the guide title plus reply draft.
 */
function formatRewardHackingGuardrailsPlan(report) {
  const header = [
    '',
    'ThumbGate Reward Hacking Guardrails',
    '-'.repeat(36),
    `Workflow : ${report.workflow}`,
    `Status : ${report.status}`,
    `Source : ${report.source.arxivUrl}`,
    `Signals : ${report.summary.signalCount}`,
  ];

  // The signals section is omitted entirely when nothing was detected.
  const signalSection = [];
  if (report.signals.length > 0) {
    signalSection.push('', 'Signals:');
    for (const { severity, id, message, gate } of report.signals) {
      signalSection.push(` - [${severity}] ${id}: ${message}`, ` Gate: ${gate}`);
    }
  }

  const metricSection = ['', 'Required metrics:'];
  for (const metric of report.metrics) {
    const requiredTag = metric.required ? ' (required)' : '';
    metricSection.push(` - ${metric.id}: ${metric.target}${requiredTag}`);
  }

  const actionSection = ['', 'Next actions:'];
  for (const action of report.nextActions) {
    actionSection.push(` - ${action}`);
  }

  const footer = [
    '',
    `Guide: ${report.marketingAngle.guideTitle}`,
    `Reply draft: ${report.marketingAngle.replyDraft}`,
    '',
  ];

  const text = [
    ...header,
    ...signalSection,
    ...metricSection,
    ...actionSection,
    ...footer,
  ].join('\n');
  return `${text}\n`;
}
|
|
217
|
+
|
|
218
|
+
/**
 * Generate a sample guardrails report and write it to disk as JSON and as text.
 *
 * @param {string} [outputDir] - Target directory, created recursively if needed.
 *   Defaults to `docs/marketing` one level above this script's directory.
 * @returns {{report: object, jsonPath: string, markdownPath: string}} The report and
 *   the paths of the two files written.
 */
function writeRewardHackingPromoPack(outputDir = path.join(__dirname, '..', 'docs', 'marketing')) {
  // Fixed sample input used to produce the demo report.
  const sampleInput = {
    workflow: 'AI coding agent release checklist',
    text: 'Great idea, LGTM. All tests pass and this is ready to merge. Our benchmark score improved, so ship it.',
    metrics: ['benchmark pass rate', 'reward score'],
    multimodal: true,
    'optimized-for-score': true,
  };
  const report = buildRewardHackingGuardrailsPlan(sampleInput);

  fs.mkdirSync(outputDir, { recursive: true });

  const baseName = 'reward-hacking-guardrails-pack';
  const jsonPath = path.join(outputDir, `${baseName}.json`);
  const markdownPath = path.join(outputDir, `${baseName}.md`);

  const serialized = JSON.stringify(report, null, 2);
  fs.writeFileSync(jsonPath, `${serialized}\n`);
  fs.writeFileSync(markdownPath, formatRewardHackingGuardrailsPlan(report));

  return { report, jsonPath, markdownPath };
}
|
|
233
|
+
|
|
234
|
+
// Public CommonJS surface of this module. isCliInvocation is declared after this
// statement and is not part of the exported object.
module.exports = {
  SOURCE,
  buildMetrics,
  buildRewardHackingGuardrailsPlan,
  buildSignals,
  formatRewardHackingGuardrailsPlan,
  normalizeOptions,
  writeRewardHackingPromoPack,
};
|
|
243
|
+
|
|
244
|
+
/**
 * Report whether this file is the script Node was launched with.
 *
 * @param {string[]} [argv=process.argv] - Argument vector; `argv[1]` is the script path.
 * @returns {boolean} True when `argv[1]` resolves to this file, either directly or
 *   after following symlinks.
 */
function isCliInvocation(argv = process.argv) {
  const entry = argv[1];
  if (!entry) return false;

  const resolved = path.resolve(entry);
  if (resolved === __filename) return true;

  // When the script is launched through a symlink (for example an npm bin shim),
  // argv[1] is the link path while __filename is the real path, so compare real paths.
  try {
    return fs.realpathSync(resolved) === __filename;
  } catch (error) {
    // A path that cannot be resolved on disk cannot be this file.
    return false;
  }
}
|
|
247
|
+
|
|
248
|
+
// When run directly as a script, write the promo pack with its default output
// directory and print the two generated file paths as pretty-printed JSON.
if (isCliInvocation()) {
  const written = writeRewardHackingPromoPack();
  const payload = { jsonPath: written.jsonPath, markdownPath: written.markdownPath };
  console.log(JSON.stringify(payload, null, 2));
}
|