rlhf-feedback-loop 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +26 -0
- package/LICENSE +21 -0
- package/README.md +308 -0
- package/adapters/README.md +8 -0
- package/adapters/amp/skills/rlhf-feedback/SKILL.md +20 -0
- package/adapters/chatgpt/INSTALL.md +80 -0
- package/adapters/chatgpt/openapi.yaml +292 -0
- package/adapters/claude/.mcp.json +8 -0
- package/adapters/codex/config.toml +4 -0
- package/adapters/gemini/function-declarations.json +95 -0
- package/adapters/mcp/server-stdio.js +444 -0
- package/bin/cli.js +167 -0
- package/config/mcp-allowlists.json +29 -0
- package/config/policy-bundles/constrained-v1.json +53 -0
- package/config/policy-bundles/default-v1.json +80 -0
- package/config/rubrics/default-v1.json +52 -0
- package/config/subagent-profiles.json +32 -0
- package/openapi/openapi.yaml +292 -0
- package/package.json +91 -0
- package/plugins/amp-skill/INSTALL.md +52 -0
- package/plugins/amp-skill/SKILL.md +31 -0
- package/plugins/claude-skill/INSTALL.md +55 -0
- package/plugins/claude-skill/SKILL.md +46 -0
- package/plugins/codex-profile/AGENTS.md +20 -0
- package/plugins/codex-profile/INSTALL.md +57 -0
- package/plugins/gemini-extension/INSTALL.md +74 -0
- package/plugins/gemini-extension/gemini_prompt.txt +10 -0
- package/plugins/gemini-extension/tool_contract.json +28 -0
- package/scripts/billing.js +471 -0
- package/scripts/budget-guard.js +173 -0
- package/scripts/code-reasoning.js +307 -0
- package/scripts/context-engine.js +547 -0
- package/scripts/contextfs.js +513 -0
- package/scripts/contract-audit.js +198 -0
- package/scripts/dpo-optimizer.js +208 -0
- package/scripts/export-dpo-pairs.js +316 -0
- package/scripts/export-training.js +448 -0
- package/scripts/feedback-attribution.js +313 -0
- package/scripts/feedback-inbox-read.js +162 -0
- package/scripts/feedback-loop.js +838 -0
- package/scripts/feedback-schema.js +300 -0
- package/scripts/feedback-to-memory.js +165 -0
- package/scripts/feedback-to-rules.js +109 -0
- package/scripts/generate-paperbanana-diagrams.sh +99 -0
- package/scripts/hybrid-feedback-context.js +676 -0
- package/scripts/intent-router.js +164 -0
- package/scripts/mcp-policy.js +92 -0
- package/scripts/meta-policy.js +194 -0
- package/scripts/plan-gate.js +154 -0
- package/scripts/prove-adapters.js +364 -0
- package/scripts/prove-attribution.js +364 -0
- package/scripts/prove-automation.js +393 -0
- package/scripts/prove-data-quality.js +219 -0
- package/scripts/prove-intelligence.js +256 -0
- package/scripts/prove-lancedb.js +370 -0
- package/scripts/prove-loop-closure.js +255 -0
- package/scripts/prove-rlaif.js +404 -0
- package/scripts/prove-subway-upgrades.js +250 -0
- package/scripts/prove-training-export.js +324 -0
- package/scripts/prove-v2-milestone.js +273 -0
- package/scripts/prove-v3-milestone.js +381 -0
- package/scripts/rlaif-self-audit.js +123 -0
- package/scripts/rubric-engine.js +230 -0
- package/scripts/self-heal.js +127 -0
- package/scripts/self-healing-check.js +111 -0
- package/scripts/skill-quality-tracker.js +284 -0
- package/scripts/subagent-profiles.js +79 -0
- package/scripts/sync-gh-secrets-from-env.sh +29 -0
- package/scripts/thompson-sampling.js +331 -0
- package/scripts/train_from_feedback.py +914 -0
- package/scripts/validate-feedback.js +580 -0
- package/scripts/vector-store.js +100 -0
- package/src/api/server.js +497 -0
|
@@ -0,0 +1,676 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
/**
|
|
3
|
+
* Hybrid Feedback Context — Pre-Tool Guard Engine (ATTR-02)
|
|
4
|
+
*
|
|
5
|
+
* Builds attributed feedback state from multiple JSONL sources and compiles
|
|
6
|
+
* it into a fast guard artifact for pre-tool execution decisions:
|
|
7
|
+
* block — attributed negative patterns exceed threshold
|
|
8
|
+
* warn — soft negative signal; proceed with caution
|
|
9
|
+
* allow — no matching negative patterns (default)
|
|
10
|
+
*
|
|
11
|
+
* Exports:
|
|
12
|
+
* buildHybridState, evaluatePretool, compileGuardArtifact,
|
|
13
|
+
* writeGuardArtifact, readGuardArtifact, evaluateCompiledGuards,
|
|
14
|
+
* evaluatePretoolFromState, deriveConstraints, buildAdditionalContext
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
const fs = require('fs');
|
|
18
|
+
const path = require('path');
|
|
19
|
+
const os = require('os');
|
|
20
|
+
|
|
21
|
+
// ---------------------------------------------------------------------------
|
|
22
|
+
// Paths
|
|
23
|
+
// ---------------------------------------------------------------------------
|
|
24
|
+
|
|
25
|
+
// Package root: the directory one level above this script.
const ROOT = path.join(__dirname, '..');

// Default locations of the JSONL feedback stores and of the compiled guard
// artifact. Every file lives in <ROOT>/.claude/memory/feedback.
const PATHS = (() => {
  const inFeedbackDir = (fileName) => path.join(ROOT, '.claude', 'memory', 'feedback', fileName);
  return {
    feedbackLog: inFeedbackDir('feedback-log.jsonl'),
    inbox: inFeedbackDir('inbox.jsonl'),
    pendingSync: inFeedbackDir('pending_cortex_sync.jsonl'),
    attributedFeedback: inFeedbackDir('attributed-feedback.jsonl'),
    guardArtifact: inFeedbackDir('pretool-guards.json'),
  };
})();
|
|
34
|
+
|
|
35
|
+
// ---------------------------------------------------------------------------
|
|
36
|
+
// Constants
|
|
37
|
+
// ---------------------------------------------------------------------------
|
|
38
|
+
|
|
39
|
+
// Tokens that keywords() refuses to treat as meaningful: common English filler
// plus a few tool/hook words ("bash", "edit", "write", ...).
const STOPWORDS = new Set(
  [
    'the and for was with from that this are have',
    'has had not but they you can will all any',
    'one its our also more very just into been',
    'bash edit write tool hook clear',
  ]
    .join(' ')
    .split(' ')
);
|
|
45
|
+
|
|
46
|
+
// Signal labels that classify() maps to 'negative'.
const NEG = new Set(
  [
    'negative thumbsdown thumbs_down thumbs-down down bad',
    'wrong error fail failed failure mistake bug broken',
  ]
    .join(' ')
    .split(' ')
);
|
|
50
|
+
|
|
51
|
+
// Signal labels that classify() maps to 'positive'.
const POS = new Set(
  [
    'positive thumbsup thumbs_up thumbs-up up good correct',
    'success pass passed great excellent perfect works',
  ]
    .join(' ')
    .split(' ')
);
|
|
55
|
+
|
|
56
|
+
// ---------------------------------------------------------------------------
|
|
57
|
+
// Low-level helpers
|
|
58
|
+
// ---------------------------------------------------------------------------
|
|
59
|
+
|
|
60
|
+
/**
 * Read the trailing records of a JSONL file in chronological (file) order.
 *
 * Only the last `maxLines` physical lines are considered. Blank lines and
 * lines that are not valid JSON are skipped silently; a missing or unreadable
 * file yields an empty array instead of throwing.
 *
 * @param {string} filePath - JSONL file to read.
 * @param {number} [maxLines=400] - how many trailing lines to consider;
 *   zero, negative or NaN yields [].
 * @returns {Array<*>} parsed values, oldest first.
 */
function readJsonl(filePath, maxLines) {
  const limit = maxLines == null ? 400 : maxLines;
  // Array#slice(-0) is slice(0), so a limit of 0 used to return the WHOLE file.
  if (!(limit > 0)) return [];

  let raw;
  try {
    // readFileSync already throws for a missing file; no separate exists check.
    raw = fs.readFileSync(filePath, 'utf8').trimEnd();
  } catch (_) {
    return [];
  }
  if (!raw) return [];

  const records = [];
  for (const candidate of raw.split('\n').slice(-limit)) {
    const line = candidate.trim(); // also drops a trailing '\r' from CRLF files
    if (!line) continue;
    try {
      records.push(JSON.parse(line));
    } catch (_) {
      // Best effort: one malformed line must not hide the rest of the log.
    }
  }
  return records;
}
|
|
88
|
+
|
|
89
|
+
/**
 * Canonicalise free text so that equivalent feedback compares equal.
 *
 * The path segment following "/Users/" is replaced with "redacted", a colon
 * followed by 4-5 digits becomes ":PORT", and the result is lowercased and
 * trimmed. Lowercasing runs last, so both placeholders come out in lowercase.
 *
 * @param {*} text
 * @returns {string} normalized text, or '' for empty / non-string input.
 */
function normalize(text) {
  if (typeof text !== 'string' || text.length === 0) return '';
  const withoutHomeDirs = text.replace(/\/Users\/[^\s/]+/g, '/Users/redacted');
  const withoutPorts = withoutHomeDirs.replace(/:\d{4,5}\b/g, ':PORT');
  return withoutPorts.toLowerCase().trim();
}
|
|
100
|
+
|
|
101
|
+
/**
 * Remove a leading feedback label ("thumbs down:", "negative feedback:",
 * "error:", ...) so only the description remains.
 *
 * Each label must end on a word boundary, so ordinary words that merely start
 * with a label ("badge", "failing", "positively") are left untouched. Leading
 * whitespace is ignored when looking for the label.
 *
 * @param {*} text
 * @returns {string} text without the label, trimmed; '' for empty or
 *   non-string input.
 */
function stripFeedbackPrefix(text) {
  if (typeof text !== 'string' || !text) return '';
  return text
    // The patterns are anchored with ^, so leading blanks must go first
    // (callers join several optional fields with ' ').
    .trimStart()
    .replace(/^thumbs?\s*(?:up|down)\b\s*:?\s*/i, '')
    .replace(/^(?:positive|negative)\b\s*(?:feedback\b)?\s*:?\s*/i, '')
    .replace(/^(?:good|bad|wrong|error|fail(?:ed|ure)?)\b\s*:?\s*/i, '')
    .trim();
}
|
|
112
|
+
|
|
113
|
+
/**
 * Prepare raw feedback text for pattern matching: drop the leading feedback
 * label first, then normalize what is left.
 *
 * @param {string} text
 * @returns {string}
 */
function normalizePatternText(text) {
  const withoutLabel = stripFeedbackPrefix(text);
  return normalize(withoutLabel);
}
|
|
119
|
+
|
|
120
|
+
/**
 * Resolve the tool a feedback entry refers to.
 *
 * An explicit name other than 'unknown' is returned as-is. Otherwise the
 * lowercased context is scanned for hint substrings; tools are tried in the
 * order listed below and the first tool with any matching hint wins.
 *
 * @param {string} rawToolName
 * @param {string} context
 * @returns {string} tool name, or `rawToolName || 'unknown'` when no hint matches.
 */
function inferToolName(rawToolName, context) {
  if (rawToolName && rawToolName !== 'unknown') return rawToolName;

  const haystack = (context || '').toLowerCase();
  // Order matters: earlier rows take precedence over later ones.
  const hintsByTool = [
    ['Bash', ['bash', 'command', 'shell']],
    ['Edit', ['edit', 'patch', 'replace']],
    ['Write', ['write', 'create file', 'overwrite']],
    ['Read', ['read', 'cat ', 'view file']],
    ['Grep', ['search', 'grep', 'find']],
    ['Glob', ['glob', 'list files']],
  ];
  for (const [tool, hints] of hintsByTool) {
    if (hints.some((hint) => haystack.includes(hint))) return tool;
  }
  return rawToolName || 'unknown';
}
|
|
134
|
+
|
|
135
|
+
/**
 * Map a feedback entry to 'negative', 'positive' or 'neutral'.
 *
 * The label is taken from `entry.signal`, falling back to `entry.feedback`,
 * then lowercased and trimmed before it is looked up in NEG and POS.
 * Anything found in neither set is 'neutral'.
 *
 * @param {Object} entry
 * @returns {'positive'|'negative'|'neutral'}
 */
function classify(entry) {
  const label = String(entry.signal || entry.feedback || '').toLowerCase().trim();
  if (NEG.has(label)) return 'negative';
  return POS.has(label) ? 'positive' : 'neutral';
}
|
|
144
|
+
|
|
145
|
+
/**
 * Convert a timestamp value to epoch milliseconds.
 *
 * Accepts date strings (anything Date.parse understands), Date objects and
 * finite numbers. Numbers are returned unchanged.
 * NOTE(review): numeric timestamps are assumed to already be in milliseconds
 * — confirm against the writers of the feedback logs.
 *
 * @param {string|number|Date} value
 * @returns {number} epoch milliseconds, or 0 when the value is missing or
 *   cannot be interpreted.
 */
function getTimestampMs(value) {
  if (!value) return 0;
  if (value instanceof Date) {
    // Date.parse(date) would go through toString() and drop the milliseconds.
    const time = value.getTime();
    return Number.isNaN(time) ? 0 : time;
  }
  if (typeof value === 'number') {
    // Date.parse(1700000000000) parses the string "1700000000000" -> NaN.
    return Number.isFinite(value) ? value : 0;
  }
  const ms = Date.parse(value);
  return Number.isNaN(ms) ? 0 : ms;
}
|
|
153
|
+
|
|
154
|
+
/**
 * Pull the distinctive tokens out of a piece of text.
 *
 * The text is normalized, every character outside [a-z0-9_-] and whitespace
 * is turned into a space, and the result is split on whitespace. Tokens
 * shorter than 4 characters or listed in STOPWORDS are dropped. The first 8
 * distinct survivors are returned in order of first appearance.
 *
 * @param {string} text
 * @returns {string[]}
 */
function keywords(text) {
  if (!text) return [];
  const distinct = new Set();
  const candidates = normalize(text).replace(/[^a-z0-9\s_-]/g, ' ').split(/\s+/);
  for (const token of candidates) {
    if (token.length >= 4 && !STOPWORDS.has(token)) distinct.add(token);
  }
  return Array.from(distinct).slice(0, 8);
}
|
|
166
|
+
|
|
167
|
+
/**
 * FNV-1a 32-bit hash of a string, as 8 lowercase hex digits.
 *
 * The hash folds in one UTF-16 code unit per step (charCodeAt), which equals
 * byte-wise FNV-1a for ASCII input. Falsy input is hashed as the empty string.
 *
 * @param {*} text
 * @returns {string}
 */
function hashText(text) {
  const str = String(text || '');
  let hash = 0x811c9dc5; // FNV-1a 32-bit offset basis (2166136261)
  for (let i = 0; i < str.length; i++) {
    // Math.imul keeps the multiply in exact 32-bit arithmetic. A plain `*`
    // can exceed 2^53 here and silently lose the low bits of the product.
    hash = Math.imul(hash ^ str.charCodeAt(i), 0x01000193); // FNV prime 16777619
  }
  return (hash >>> 0).toString(16).padStart(8, '0');
}
|
|
179
|
+
|
|
180
|
+
// ---------------------------------------------------------------------------
|
|
181
|
+
// buildHybridState
|
|
182
|
+
// ---------------------------------------------------------------------------
|
|
183
|
+
|
|
184
|
+
/**
 * Build the hybrid feedback state from the JSONL stores.
 *
 * Sources: the feedback log, the inbox and the pending-sync file are merged
 * and de-duplicated (by `id`, else by a hash of the serialized entry); the
 * attributed-feedback file is processed separately. Each path comes from
 * `opts`, then the matching RLHF_* environment variable, then PATHS. Every
 * file is read with readJsonl's default line limit.
 *
 * Negative entries contribute to per-tool counters and to a pattern map keyed
 * by the first four keywords of their normalized text. A pattern is reported
 * as recurring once it has been seen at least twice.
 *
 * @param {Object} [opts]
 * @param {string} [opts.feedbackLogPath]
 * @param {string} [opts.inboxPath]
 * @param {string} [opts.pendingSyncPath]
 * @param {string} [opts.attributedFeedbackPath]
 * @returns {{
 *   counts: {total: number, positive: number, negative: number},
 *   recurringNegativePatterns: Object[],
 *   preventionRules: string[],
 *   negativeToolCounts: Object,
 *   negativeToolCountsAttributed: Object
 * }}
 */
function buildHybridState(opts) {
  const o = opts || {};
  const feedbackLogPath = o.feedbackLogPath || process.env.RLHF_FEEDBACK_LOG || PATHS.feedbackLog;
  const inboxPath = o.inboxPath || process.env.RLHF_FEEDBACK_INBOX || PATHS.inbox;
  const pendingSyncPath = o.pendingSyncPath || process.env.RLHF_PENDING_SYNC || PATHS.pendingSync;
  const attributedFeedbackPath =
    o.attributedFeedbackPath || process.env.RLHF_ATTRIBUTED_FEEDBACK || PATHS.attributedFeedback;

  // A JSONL line may legally hold `null` or a primitive; property access on
  // those would throw, so only object records are kept.
  const isRecord = (entry) => entry !== null && typeof entry === 'object';

  const rawEntries = [
    ...readJsonl(feedbackLogPath),
    ...readJsonl(inboxPath),
    ...readJsonl(pendingSyncPath),
  ].filter(isRecord);
  const attributedEntries = readJsonl(attributedFeedbackPath).filter(isRecord);

  // Deduplicate by id across the three raw sources.
  const seen = new Set();
  const allEntries = [];
  for (const entry of rawEntries) {
    const key = entry.id || hashText(JSON.stringify(entry));
    if (seen.has(key)) continue;
    seen.add(key);
    allEntries.push(entry);
  }

  // Prototype-free so a generated key can never collide with an inherited one.
  const patternMap = Object.create(null); // patKey -> { count, lastSeen, sources, text, words }
  const toolNegatives = {}; // toolName -> count
  const toolNegativesAttributed = {}; // toolName -> count (attributed file only)

  // Increment a counter, looking only at own keys: a tool literally named
  // "constructor" must start from 0, not from the inherited function.
  const bump = (counter, key) => {
    const current = Object.prototype.hasOwnProperty.call(counter, key) ? counter[key] : 0;
    counter[key] = current + 1;
  };

  // Fold one entry into patternMap. Entries whose text has fewer than two
  // meaningful keywords are ignored.
  const recordPattern = (entry, textParts, source) => {
    const norm = normalizePatternText(textParts.join(' '));
    if (!norm) return;
    const words = keywords(norm);
    if (words.length < 2) return;
    const patKey = words.slice(0, 4).join('_');
    if (!patternMap[patKey]) {
      patternMap[patKey] = { count: 0, lastSeen: 0, sources: [], text: norm, words };
    }
    const pattern = patternMap[patKey];
    pattern.count++;
    // One label per source, not one per entry.
    if (!pattern.sources.includes(source)) pattern.sources.push(source);
    const ts = getTimestampMs(entry.timestamp);
    if (ts > pattern.lastSeen) pattern.lastSeen = ts;
  };

  let total = 0;
  let positive = 0;
  let negative = 0;
  const preventionRules = [];

  for (const entry of allEntries) {
    total++;
    const cls = classify(entry);
    if (cls === 'positive') positive++;
    if (cls !== 'negative') continue;

    negative++;
    bump(toolNegatives, inferToolName(entry.toolName || entry.tool_name || 'unknown', entry.context || ''));

    const whatToChange = entry.whatToChange || entry.what_to_change;
    const rule = normalize(whatToChange);
    if (rule) preventionRules.push(rule);

    recordPattern(
      entry,
      [entry.context || '', entry.whatWentWrong || entry.what_went_wrong || '', whatToChange || ''],
      'feedbackLog'
    );
  }

  // Attributed feedback is tracked separately so guards can prefer it.
  for (const entry of attributedEntries) {
    // An entry explicitly labelled positive must never feed a negative guard.
    // NOTE(review): entries without a signal are still counted, as before —
    // confirm the attributed file is meant to contain negatives only.
    if (classify(entry) === 'positive') continue;

    bump(
      toolNegativesAttributed,
      inferToolName(entry.toolName || entry.tool_name || entry.attributed_tool || 'unknown', entry.context || '')
    );
    recordPattern(
      entry,
      [entry.context || '', entry.whatWentWrong || entry.what_went_wrong || ''],
      'attributedFeedback'
    );
  }

  // Recurring = seen at least twice; most frequent first.
  const recurringNegativePatterns = Object.values(patternMap)
    .filter((p) => p.count >= 2)
    .sort((a, b) => b.count - a.count);

  return {
    counts: { total, positive, negative },
    recurringNegativePatterns,
    preventionRules,
    negativeToolCounts: toolNegatives,
    negativeToolCountsAttributed: toolNegativesAttributed,
  };
}
|
|
301
|
+
|
|
302
|
+
// ---------------------------------------------------------------------------
|
|
303
|
+
// deriveConstraints
|
|
304
|
+
// ---------------------------------------------------------------------------
|
|
305
|
+
|
|
306
|
+
/**
 * Turn a state into at most `max` human-readable constraint strings.
 *
 * Recurring negative patterns come first, as `Avoid: "..." (seen Nx)`;
 * whatever room is left is filled with prevention rules, as `Rule: ...`.
 * Texts longer than 100 characters are cut and suffixed with "...".
 *
 * @param {Object} state - from buildHybridState()
 * @param {number} [max=5]
 * @returns {string[]}
 */
function deriveConstraints(state, max) {
  const limit = max !== undefined ? max : 5;
  const clip = (s) => (s.length > 100 ? s.slice(0, 100) + '...' : s);

  const fromPatterns = (state.recurringNegativePatterns || [])
    .slice(0, limit)
    .map((pattern) => `Avoid: "${clip(pattern.text)}" (seen ${pattern.count}x)`);

  const fromRules = (state.preventionRules || [])
    .slice(0, limit - fromPatterns.length)
    .map((rule) => `Rule: ${clip(rule)}`);

  return [...fromPatterns, ...fromRules].slice(0, limit);
}
|
|
332
|
+
|
|
333
|
+
// ---------------------------------------------------------------------------
|
|
334
|
+
// buildAdditionalContext
|
|
335
|
+
// ---------------------------------------------------------------------------
|
|
336
|
+
|
|
337
|
+
/**
 * Render a short multi-line summary of the feedback state for injection into
 * pre-tool context.
 *
 * The summary lists the totals, the number of recurring patterns and, when
 * present, each active constraint on its own line. Output never exceeds
 * `maxChars`; longer text is cut and ends in "..." (for limits of 3 or less
 * there is no room for the ellipsis, so the text is simply cut).
 *
 * @param {Object} state - from buildHybridState(); missing counts read as 0.
 * @param {string[]} constraints
 * @param {number} [maxChars=800]
 * @returns {string}
 */
function buildAdditionalContext(state, constraints, maxChars) {
  const limit = maxChars !== undefined ? maxChars : 800;
  const source = state || {};
  const counts = Object.assign({ total: 0, positive: 0, negative: 0 }, source.counts);

  const lines = [
    `Feedback history: ${counts.total} total (${counts.positive} positive, ${counts.negative} negative)`,
    `Recurring patterns: ${(source.recurringNegativePatterns || []).length}`,
  ];
  if (constraints && constraints.length > 0) {
    lines.push('Active constraints:');
    for (const constraint of constraints) lines.push(` - ${constraint}`);
  }

  const text = lines.join('\n');
  if (!(text.length > limit)) return text;
  // slice(0, limit - 3) with limit < 3 would count from the END of the string
  // and return far more than `limit` characters.
  if (limit <= 3) return text.slice(0, Math.max(0, limit));
  return `${text.slice(0, limit - 3)}...`;
}
|
|
362
|
+
|
|
363
|
+
// ---------------------------------------------------------------------------
|
|
364
|
+
// hasTwoKeywordHits
|
|
365
|
+
// ---------------------------------------------------------------------------
|
|
366
|
+
|
|
367
|
+
/**
 * Report whether at least two DISTINCT keywords occur in the input.
 *
 * Requiring two hits reduces false positives (ATTR-03 no-false-positive
 * invariant). Matching is by substring. Non-string and empty words are
 * ignored, and a repeated word counts once — an empty string is a substring
 * of everything, so it must never count as a hit.
 *
 * @param {string} normalizedInput
 * @param {string[]} words - keyword list from a pattern
 * @returns {boolean}
 */
function hasTwoKeywordHits(normalizedInput, words) {
  if (typeof normalizedInput !== 'string' || !normalizedInput) return false;
  if (!Array.isArray(words) || words.length === 0) return false;

  const matched = new Set();
  for (const word of words) {
    if (typeof word !== 'string' || word.length === 0 || matched.has(word)) continue;
    if (normalizedInput.includes(word)) {
      matched.add(word);
      if (matched.size >= 2) return true;
    }
  }
  return false;
}
|
|
385
|
+
|
|
386
|
+
// ---------------------------------------------------------------------------
|
|
387
|
+
// compileGuardArtifact
|
|
388
|
+
// ---------------------------------------------------------------------------
|
|
389
|
+
|
|
390
|
+
/**
 * Compile the recurring negative patterns of a state into a guard artifact.
 *
 * Patterns are de-duplicated by the hash of their text (first occurrence
 * wins). A guard is in 'block' mode when its count reaches the threshold,
 * otherwise 'warn'. Guards backed by attributed feedback are listed first;
 * within each group higher counts come first.
 *
 * @param {Object} state - from buildHybridState()
 * @param {Object} [opts]
 * @param {number} [opts.blockThreshold=3] - count >= this → block
 * @returns {{compiledAt: string, guardCount: number, blockThreshold: number, guards: Object[]}}
 */
function compileGuardArtifact(state, opts) {
  const { blockThreshold = 3 } = opts || {};

  // Map keeps insertion order, so the first pattern seen for a hash is kept.
  const guardsByHash = new Map();
  for (const pattern of state.recurringNegativePatterns || []) {
    const hash = hashText(pattern.text);
    if (guardsByHash.has(hash)) continue;
    guardsByHash.set(hash, {
      hash,
      text: pattern.text,
      words: pattern.words,
      count: pattern.count,
      lastSeen: pattern.lastSeen,
      attributed: pattern.sources && pattern.sources.includes('attributedFeedback'),
      mode: pattern.count >= blockThreshold ? 'block' : 'warn',
    });
  }

  const guards = Array.from(guardsByHash.values());
  guards.sort((left, right) => {
    if (Boolean(left.attributed) !== Boolean(right.attributed)) {
      return left.attributed ? -1 : 1;
    }
    return right.count - left.count;
  });

  return {
    compiledAt: new Date().toISOString(),
    guardCount: guards.length,
    blockThreshold,
    guards,
  };
}
|
|
439
|
+
|
|
440
|
+
// ---------------------------------------------------------------------------
|
|
441
|
+
// writeGuardArtifact / readGuardArtifact
|
|
442
|
+
// ---------------------------------------------------------------------------
|
|
443
|
+
|
|
444
|
+
/**
 * Persist a guard artifact as pretty-printed JSON.
 *
 * The data is written to a uniquely named temp file next to the target and
 * then renamed over it, so readers never see a half-written artifact. The
 * parent directory is created when missing. If writing or renaming fails the
 * temp file is removed and the original error is re-thrown.
 *
 * @param {string} [filePath] - target; defaults to $RLHF_GUARDS_PATH, then
 *   PATHS.guardArtifact (the same resolution the reader uses).
 * @param {Object} artifact
 * @throws {Error} any filesystem error from mkdir, write or rename.
 */
function writeGuardArtifact(filePath, artifact) {
  const outPath = filePath || process.env.RLHF_GUARDS_PATH || PATHS.guardArtifact;
  fs.mkdirSync(path.dirname(outPath), { recursive: true });

  // pid + timestamp keeps concurrent writers from sharing a temp file.
  const tmp = `${outPath}.tmp.${process.pid}.${Date.now()}`;
  try {
    fs.writeFileSync(tmp, `${JSON.stringify(artifact, null, 2)}\n`);
    fs.renameSync(tmp, outPath);
  } catch (err) {
    try {
      fs.unlinkSync(tmp);
    } catch (_) {
      // The temp file may never have been created; nothing to clean up.
    }
    throw err;
  }
}
|
|
457
|
+
|
|
458
|
+
/**
 * Load a guard artifact from disk.
 *
 * The path is `filePath`, else $RLHF_GUARDS_PATH, else PATHS.guardArtifact.
 * The file must contain JSON with a `guards` array; a missing file, an
 * unreadable file, invalid JSON or any other shape yields null.
 *
 * @param {string} [filePath]
 * @returns {Object|null} artifact or null if invalid/missing
 */
function readGuardArtifact(filePath) {
  const inPath = filePath || process.env.RLHF_GUARDS_PATH || PATHS.guardArtifact;
  try {
    // A missing file makes readFileSync throw, which lands in the catch below.
    const artifact = JSON.parse(fs.readFileSync(inPath, 'utf8'));
    return Array.isArray(artifact.guards) ? artifact : null;
  } catch (_) {
    return null;
  }
}
|
|
476
|
+
|
|
477
|
+
// ---------------------------------------------------------------------------
|
|
478
|
+
// evaluateCompiledGuards (fast path)
|
|
479
|
+
// ---------------------------------------------------------------------------
|
|
480
|
+
|
|
481
|
+
/**
 * Fast path: evaluate a tool call against a compiled guard artifact.
 *
 * A guard matches when
 *   1. at least two of its keywords occur in the normalized tool input, or
 *   2. the input is shorter than 10 characters, the guard's count reaches the
 *      artifact's block threshold (3 when unset), and the tool is either
 *      'unknown' or named in the guard text.
 *
 * The first matching guard in 'block' mode wins. If only softer guards match,
 * the first of those is returned. With no match the result is 'allow'.
 * A missing or malformed artifact also yields 'allow'.
 *
 * @param {Object} artifact
 * @param {string} toolName
 * @param {string} toolInput
 * @returns {{ mode: string, reason: string, source: string, guardHash?: string, attributed?: boolean }}
 */
function evaluateCompiledGuards(artifact, toolName, toolInput) {
  const allow = { mode: 'allow', reason: '', source: 'compiled' };
  if (!artifact || !Array.isArray(artifact.guards)) return allow;

  const normInput = normalize(toolInput || '');
  const normTool = (toolName || '').toLowerCase();
  const toolThreshold = artifact.blockThreshold || 3;

  let firstSoftMatch = null;
  for (const guard of artifact.guards) {
    // The artifact is a user-editable file; skip entries that are not objects.
    if (!guard || typeof guard !== 'object') continue;

    let reason = null;
    if (hasTwoKeywordHits(normInput, guard.words || [])) {
      reason = `Matched guard pattern (count: ${guard.count}): "${(guard.text || '').slice(0, 80)}"`;
    } else if (normInput.length < 10 && guard.count >= toolThreshold) {
      // Tool-level fallback for empty or very short input. An empty tool name
      // is excluded explicitly: ''.includes-style matching would otherwise
      // treat every guard as mentioning the tool.
      const guardText = normalize(guard.text || '');
      const toolMentioned = normTool === 'unknown' || (normTool !== '' && guardText.includes(normTool));
      if (toolMentioned) {
        reason = `Tool "${toolName}" has recurring negative patterns (count: ${guard.count})`;
      }
    }
    if (reason === null) continue;

    const verdict = {
      mode: guard.mode || 'warn',
      reason,
      source: 'compiled',
      guardHash: guard.hash,
      attributed: guard.attributed,
    };
    // Guards are not ordered by severity, so keep scanning after a soft match:
    // a later 'block' guard must not be masked by an earlier 'warn'.
    if (verdict.mode === 'block') return verdict;
    if (firstSoftMatch === null) firstSoftMatch = verdict;
  }

  return firstSoftMatch || allow;
}
|
|
526
|
+
|
|
527
|
+
// ---------------------------------------------------------------------------
|
|
528
|
+
// evaluatePretoolFromState (live path)
|
|
529
|
+
// ---------------------------------------------------------------------------
|
|
530
|
+
|
|
531
|
+
/**
 * Live path: evaluate a tool call directly against a freshly built state.
 *
 * First, the recurring negative patterns are scanned in order; the first one
 * with two keyword hits in the normalized input decides ('block' when its
 * count reaches the threshold, else 'warn'). Otherwise the per-tool counters
 * are consulted: enough attributed negatives block, five or more raw
 * negatives warn. Anything else is allowed.
 *
 * @param {Object} state - from buildHybridState()
 * @param {string} toolName - looked up in the counters exactly as given.
 * @param {string} toolInput
 * @param {Object} [opts]
 * @param {number} [opts.blockThreshold=3] - count needed for 'block', for both
 *   patterns and attributed tool negatives.
 * @returns {{ mode: string, reason: string, source: string }}
 */
function evaluatePretoolFromState(state, toolName, toolInput, opts) {
  const current = state || {};
  const blockThreshold = opts && opts.blockThreshold !== undefined ? opts.blockThreshold : 3;
  const normInput = normalize(toolInput || '');

  for (const pattern of current.recurringNegativePatterns || []) {
    if (!hasTwoKeywordHits(normInput, pattern.words || [])) continue;
    return {
      mode: pattern.count >= blockThreshold ? 'block' : 'warn',
      reason: `Recurring negative pattern (count: ${pattern.count}): "${(pattern.text || '').slice(0, 80)}"`,
      source: 'state',
    };
  }

  // Own-property lookup: a tool called "constructor" or "toString" must not
  // pick up a value inherited from Object.prototype.
  const countFor = (counter) =>
    (counter && Object.prototype.hasOwnProperty.call(counter, toolName) ? counter[toolName] : 0) || 0;
  const attrCount = countFor(current.negativeToolCountsAttributed);
  const rawCount = countFor(current.negativeToolCounts);

  if (attrCount >= blockThreshold || rawCount >= 5) {
    return {
      mode: attrCount >= blockThreshold ? 'block' : 'warn',
      reason: `Tool "${toolName}" has ${attrCount} attributed negative(s), ${rawCount} total negative(s)`,
      source: 'state',
    };
  }

  return { mode: 'allow', reason: '', source: 'state' };
}
|
|
567
|
+
|
|
568
|
+
// ---------------------------------------------------------------------------
|
|
569
|
+
// evaluatePretool (orchestrator)
|
|
570
|
+
// ---------------------------------------------------------------------------
|
|
571
|
+
|
|
572
|
+
/**
 * Main pre-tool evaluation.
 *
 * When a valid compiled guard artifact exists its verdict is final, including
 * 'allow'. Only when no artifact can be read is the live state built from the
 * JSONL stores and evaluated instead.
 *
 * Important invariant: a tool+input with NEVER a negative returns {mode:'allow'}.
 * hasTwoKeywordHits and count >= 2 filters enforce this (ATTR-03 no-false-positives).
 *
 * @param {string} toolName
 * @param {string} toolInput
 * @param {Object} [opts]
 * @param {string} [opts.guardArtifactPath]
 * @param {string} [opts.feedbackLogPath]
 * @param {string} [opts.inboxPath]
 * @param {string} [opts.pendingSyncPath]
 * @param {string} [opts.attributedFeedbackPath]
 * @returns {{ mode: 'block'|'warn'|'allow', reason: string, source: string }}
 */
function evaluatePretool(toolName, toolInput, opts) {
  const o = opts || {};

  // Fast path: a readable artifact is trusted as-is.
  const artifactPath = o.guardArtifactPath || process.env.RLHF_GUARDS_PATH || PATHS.guardArtifact;
  const artifact = readGuardArtifact(artifactPath);
  if (artifact) {
    return evaluateCompiledGuards(artifact, toolName, toolInput);
  }

  // Slow path: build live state. All store paths are forwarded so a caller
  // can isolate the evaluation from the default stores; omitted paths fall
  // back to env vars / defaults inside buildHybridState.
  const state = buildHybridState({
    feedbackLogPath: o.feedbackLogPath,
    inboxPath: o.inboxPath,
    pendingSyncPath: o.pendingSyncPath,
    attributedFeedbackPath: o.attributedFeedbackPath,
  });
  return evaluatePretoolFromState(state, toolName, toolInput);
}
|
|
606
|
+
|
|
607
|
+
// ---------------------------------------------------------------------------
|
|
608
|
+
// CLI main()
|
|
609
|
+
// ---------------------------------------------------------------------------
|
|
610
|
+
|
|
611
|
+
/**
 * CLI entry point.
 *
 *   --pretool <tool> [input]   Evaluate one tool call and print the verdict
 *                              as JSON. Exit code 2 on 'block', otherwise 0.
 *   --compile-guards [path]    Build state, compile guards and write the
 *                              artifact; prints a short JSON summary.
 *   (no flag)                  Print the full state, derived constraints and
 *                              the additional-context string.
 *
 * The exit status is set through process.exitCode rather than process.exit(),
 * so pending stdout writes are flushed before the process ends.
 */
function main() {
  const [command, ...rest] = process.argv.slice(2);

  if (command === '--pretool') {
    const toolName = rest[0] || 'unknown';
    const rawInput = rest[1] || '';
    let toolInput = rawInput;
    try {
      // JSON input is re-serialized to a canonical string; anything that is
      // not JSON is used verbatim.
      const parsed = JSON.parse(rawInput);
      toolInput = typeof parsed === 'object' ? JSON.stringify(parsed) : String(parsed);
    } catch (_) {
      toolInput = rawInput;
    }
    const result = evaluatePretool(toolName, toolInput);
    console.log(JSON.stringify(result, null, 2));
    process.exitCode = result.mode === 'block' ? 2 : 0;
    return;
  }

  if (command === '--compile-guards') {
    // Same resolution order the reader uses, so --pretool finds what
    // --compile-guards wrote when RLHF_GUARDS_PATH is set.
    const outPath = rest[0] || process.env.RLHF_GUARDS_PATH || PATHS.guardArtifact;
    const state = buildHybridState({});
    const artifact = compileGuardArtifact(state);
    writeGuardArtifact(outPath, artifact);
    console.log(JSON.stringify({ guardCount: artifact.guardCount, outPath, compiledAt: artifact.compiledAt }, null, 2));
    process.exitCode = 0;
    return;
  }

  // Default: print full state + constraints + additional context
  const state = buildHybridState({});
  const constraints = deriveConstraints(state);
  const additionalContext = buildAdditionalContext(state, constraints);
  console.log('=== Hybrid Feedback State ===');
  console.log(JSON.stringify({ state, constraints, additionalContext }, null, 2));
}
|
|
647
|
+
|
|
648
|
+
// ---------------------------------------------------------------------------
|
|
649
|
+
// Exports
|
|
650
|
+
// ---------------------------------------------------------------------------
|
|
651
|
+
|
|
652
|
+
module.exports = {
|
|
653
|
+
buildHybridState,
|
|
654
|
+
evaluatePretool,
|
|
655
|
+
compileGuardArtifact,
|
|
656
|
+
writeGuardArtifact,
|
|
657
|
+
readGuardArtifact,
|
|
658
|
+
evaluateCompiledGuards,
|
|
659
|
+
evaluatePretoolFromState,
|
|
660
|
+
deriveConstraints,
|
|
661
|
+
buildAdditionalContext,
|
|
662
|
+
// Internal helpers (exposed for testing)
|
|
663
|
+
normalize,
|
|
664
|
+
normalizePatternText,
|
|
665
|
+
inferToolName,
|
|
666
|
+
classify,
|
|
667
|
+
keywords,
|
|
668
|
+
hashText,
|
|
669
|
+
hasTwoKeywordHits,
|
|
670
|
+
readJsonl,
|
|
671
|
+
PATHS,
|
|
672
|
+
};
|
|
673
|
+
|
|
674
|
+
if (require.main === module) {
|
|
675
|
+
main();
|
|
676
|
+
}
|