rlhf-feedback-loop 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73) hide show
  1. package/CHANGELOG.md +26 -0
  2. package/LICENSE +21 -0
  3. package/README.md +308 -0
  4. package/adapters/README.md +8 -0
  5. package/adapters/amp/skills/rlhf-feedback/SKILL.md +20 -0
  6. package/adapters/chatgpt/INSTALL.md +80 -0
  7. package/adapters/chatgpt/openapi.yaml +292 -0
  8. package/adapters/claude/.mcp.json +8 -0
  9. package/adapters/codex/config.toml +4 -0
  10. package/adapters/gemini/function-declarations.json +95 -0
  11. package/adapters/mcp/server-stdio.js +444 -0
  12. package/bin/cli.js +167 -0
  13. package/config/mcp-allowlists.json +29 -0
  14. package/config/policy-bundles/constrained-v1.json +53 -0
  15. package/config/policy-bundles/default-v1.json +80 -0
  16. package/config/rubrics/default-v1.json +52 -0
  17. package/config/subagent-profiles.json +32 -0
  18. package/openapi/openapi.yaml +292 -0
  19. package/package.json +91 -0
  20. package/plugins/amp-skill/INSTALL.md +52 -0
  21. package/plugins/amp-skill/SKILL.md +31 -0
  22. package/plugins/claude-skill/INSTALL.md +55 -0
  23. package/plugins/claude-skill/SKILL.md +46 -0
  24. package/plugins/codex-profile/AGENTS.md +20 -0
  25. package/plugins/codex-profile/INSTALL.md +57 -0
  26. package/plugins/gemini-extension/INSTALL.md +74 -0
  27. package/plugins/gemini-extension/gemini_prompt.txt +10 -0
  28. package/plugins/gemini-extension/tool_contract.json +28 -0
  29. package/scripts/billing.js +471 -0
  30. package/scripts/budget-guard.js +173 -0
  31. package/scripts/code-reasoning.js +307 -0
  32. package/scripts/context-engine.js +547 -0
  33. package/scripts/contextfs.js +513 -0
  34. package/scripts/contract-audit.js +198 -0
  35. package/scripts/dpo-optimizer.js +208 -0
  36. package/scripts/export-dpo-pairs.js +316 -0
  37. package/scripts/export-training.js +448 -0
  38. package/scripts/feedback-attribution.js +313 -0
  39. package/scripts/feedback-inbox-read.js +162 -0
  40. package/scripts/feedback-loop.js +838 -0
  41. package/scripts/feedback-schema.js +300 -0
  42. package/scripts/feedback-to-memory.js +165 -0
  43. package/scripts/feedback-to-rules.js +109 -0
  44. package/scripts/generate-paperbanana-diagrams.sh +99 -0
  45. package/scripts/hybrid-feedback-context.js +676 -0
  46. package/scripts/intent-router.js +164 -0
  47. package/scripts/mcp-policy.js +92 -0
  48. package/scripts/meta-policy.js +194 -0
  49. package/scripts/plan-gate.js +154 -0
  50. package/scripts/prove-adapters.js +364 -0
  51. package/scripts/prove-attribution.js +364 -0
  52. package/scripts/prove-automation.js +393 -0
  53. package/scripts/prove-data-quality.js +219 -0
  54. package/scripts/prove-intelligence.js +256 -0
  55. package/scripts/prove-lancedb.js +370 -0
  56. package/scripts/prove-loop-closure.js +255 -0
  57. package/scripts/prove-rlaif.js +404 -0
  58. package/scripts/prove-subway-upgrades.js +250 -0
  59. package/scripts/prove-training-export.js +324 -0
  60. package/scripts/prove-v2-milestone.js +273 -0
  61. package/scripts/prove-v3-milestone.js +381 -0
  62. package/scripts/rlaif-self-audit.js +123 -0
  63. package/scripts/rubric-engine.js +230 -0
  64. package/scripts/self-heal.js +127 -0
  65. package/scripts/self-healing-check.js +111 -0
  66. package/scripts/skill-quality-tracker.js +284 -0
  67. package/scripts/subagent-profiles.js +79 -0
  68. package/scripts/sync-gh-secrets-from-env.sh +29 -0
  69. package/scripts/thompson-sampling.js +331 -0
  70. package/scripts/train_from_feedback.py +914 -0
  71. package/scripts/validate-feedback.js +580 -0
  72. package/scripts/vector-store.js +100 -0
  73. package/src/api/server.js +497 -0
@@ -0,0 +1,676 @@
1
+ 'use strict';
2
+ /**
3
+ * Hybrid Feedback Context — Pre-Tool Guard Engine (ATTR-02)
4
+ *
5
+ * Builds attributed feedback state from multiple JSONL sources and compiles
6
+ * it into a fast guard artifact for pre-tool execution decisions:
7
+ * block — attributed negative patterns exceed threshold
8
+ * warn — soft negative signal; proceed with caution
9
+ * allow — no matching negative patterns (default)
10
+ *
11
+ * Exports:
12
+ * buildHybridState, evaluatePretool, compileGuardArtifact,
13
+ * writeGuardArtifact, readGuardArtifact, evaluateCompiledGuards,
14
+ * evaluatePretoolFromState, deriveConstraints, buildAdditionalContext
15
+ */
16
+
17
+ const fs = require('fs');
18
+ const path = require('path');
19
+ const os = require('os');
20
+
21
+ // ---------------------------------------------------------------------------
22
+ // Paths
23
+ // ---------------------------------------------------------------------------
24
+
25
// Parent directory of the folder that contains this script.
const ROOT = path.join(__dirname, '..');

// Default on-disk locations; every file sits in <ROOT>/.claude/memory/feedback.
const PATHS = (() => {
  const inFeedbackDir = (fileName) => path.join(ROOT, '.claude', 'memory', 'feedback', fileName);
  return {
    feedbackLog: inFeedbackDir('feedback-log.jsonl'),
    inbox: inFeedbackDir('inbox.jsonl'),
    pendingSync: inFeedbackDir('pending_cortex_sync.jsonl'),
    attributedFeedback: inFeedbackDir('attributed-feedback.jsonl'),
    guardArtifact: inFeedbackDir('pretool-guards.json'),
  };
})();
34
+
35
+ // ---------------------------------------------------------------------------
36
+ // Constants
37
+ // ---------------------------------------------------------------------------
38
+
39
// Tokens that keywords() never reports as meaningful.
const STOPWORDS = new Set([
  // common English function words
  'the', 'and', 'for', 'was', 'with',
  'from', 'that', 'this', 'are', 'have',
  'has', 'had', 'not', 'but', 'they',
  'you', 'can', 'will', 'all', 'any',
  'one', 'its', 'our', 'also', 'more',
  'very', 'just', 'into', 'been',
  // tool / hook vocabulary
  'bash', 'edit', 'write', 'tool', 'hook', 'clear',
]);

// Signal labels that classify() maps to 'negative'.
const NEG = new Set([
  'negative',
  'thumbsdown', 'thumbs_down', 'thumbs-down', 'down',
  'bad', 'wrong', 'error',
  'fail', 'failed', 'failure',
  'mistake', 'bug', 'broken',
]);

// Signal labels that classify() maps to 'positive'.
const POS = new Set([
  'positive',
  'thumbsup', 'thumbs_up', 'thumbs-up', 'up',
  'good', 'correct', 'success',
  'pass', 'passed',
  'great', 'excellent', 'perfect', 'works',
]);
55
+
56
+ // ---------------------------------------------------------------------------
57
+ // Low-level helpers
58
+ // ---------------------------------------------------------------------------
59
+
60
/**
 * Parse the trailing `maxLines` lines of a JSONL file, oldest first.
 *
 * Best-effort by design: a missing or unreadable file yields an empty array,
 * and blank or malformed lines are skipped.
 *
 * @param {string} filePath
 * @param {number} [maxLines=400] - how many trailing lines to consider
 * @returns {Array<*>} parsed values in file order
 */
function readJsonl(filePath, maxLines) {
  const limit = maxLines === undefined || maxLines === null ? 400 : maxLines;
  // slice(-0) equals slice(0) and would return the WHOLE file, and a negative
  // limit would drop leading lines instead of limiting; both mean "nothing".
  if (!(limit > 0)) return [];

  let raw;
  try {
    raw = fs.readFileSync(filePath, 'utf8').trimEnd();
  } catch (_) {
    // Missing or unreadable file: treated as "no feedback yet".
    return [];
  }
  if (!raw) return [];

  const parsed = [];
  for (const rawLine of raw.split('\n').slice(-limit)) {
    const line = rawLine.trim();
    if (!line) continue;
    try {
      parsed.push(JSON.parse(line));
    } catch (_) {
      // skip malformed line
    }
  }
  return parsed;
}
88
+
89
/**
 * Canonicalize free text for matching: home directories under /Users/ become
 * "/Users/redacted", a colon followed by a 4-5 digit number becomes ":PORT",
 * then the result is lowercased and trimmed.
 *
 * @param {*} text
 * @returns {string} normalized text, or '' for empty / non-string input
 */
function normalize(text) {
  if (typeof text !== 'string' || text === '') return '';
  const withoutHomeDirs = text.replace(/\/Users\/[^\s/]+/g, '/Users/redacted');
  const withoutPorts = withoutHomeDirs.replace(/:\d{4,5}\b/g, ':PORT');
  return withoutPorts.toLowerCase().trim();
}
100
+
101
/**
 * Remove a leading feedback label ("thumbs down:", "negative feedback:",
 * "bad:", ...) from a string, then trim it.
 *
 * Each label must end on a word boundary, so ordinary words that merely start
 * with a label ("badge", "failing", "goodbye") are left untouched. Leading
 * whitespace is dropped first because the patterns are anchored at the start.
 *
 * @param {*} text
 * @returns {string} text without the label, or '' for empty / non-string input
 */
function stripFeedbackPrefix(text) {
  if (!text || typeof text !== 'string') return '';
  return text
    .trimStart()
    .replace(/^thumbs?\s*(?:up|down)\b\s*:?\s*/i, '')
    .replace(/^(?:positive|negative)\b\s*(?:feedback\b)?\s*:?\s*/i, '')
    .replace(/^(?:good|bad|wrong|error|fail(?:ed|ure)?)\b\s*:?\s*/i, '')
    .trim();
}
112
+
113
/**
 * Prepare raw feedback text for pattern extraction: the feedback label is
 * stripped first, and the remainder is then passed through normalize().
 *
 * @param {string} text
 * @returns {string}
 */
function normalizePatternText(text) {
  const withoutLabel = stripFeedbackPrefix(text);
  return normalize(withoutLabel);
}
119
+
120
/**
 * Resolve a tool name: an explicit name other than 'unknown' wins; otherwise
 * the name is guessed from substrings of the context.
 *
 * @param {string} rawToolName - recorded tool name, possibly empty or 'unknown'
 * @param {*} context - free text; anything that is not a string is ignored
 *   instead of throwing (JSONL entries are not guaranteed to store a string)
 * @returns {string} the resolved name, or rawToolName / 'unknown' when no hint matches
 */
function inferToolName(rawToolName, context) {
  if (rawToolName && rawToolName !== 'unknown') return rawToolName;

  const ctx = typeof context === 'string' ? context.toLowerCase() : '';

  // Checked in order; the first tool with any matching substring wins.
  const hints = [
    ['Bash', ['bash', 'command', 'shell']],
    ['Edit', ['edit', 'patch', 'replace']],
    ['Write', ['write', 'create file', 'overwrite']],
    ['Read', ['read', 'cat ', 'view file']],
    ['Grep', ['search', 'grep', 'find']],
    ['Glob', ['glob', 'list files']],
  ];
  for (const [tool, needles] of hints) {
    if (needles.some((needle) => ctx.includes(needle))) return tool;
  }
  return rawToolName || 'unknown';
}
134
+
135
/**
 * Map an entry's `signal` (or, when that is falsy, its `feedback`) label to
 * 'positive', 'negative' or 'neutral' via the NEG / POS label sets.
 *
 * @param {Object} entry
 * @returns {'positive'|'negative'|'neutral'}
 */
function classify(entry) {
  const label = String(entry.signal || entry.feedback || '').toLowerCase().trim();
  if (NEG.has(label)) return 'negative';
  return POS.has(label) ? 'positive' : 'neutral';
}
144
+
145
/**
 * Convert a timestamp value to epoch milliseconds.
 *
 * Accepts date strings (via Date.parse), finite numbers (returned as-is and
 * taken to already be epoch milliseconds) and Date instances.
 *
 * @param {string|number|Date} value
 * @returns {number} epoch milliseconds, or 0 when the value is empty or unparseable
 */
function getTimestampMs(value) {
  if (!value) return 0;
  // Date.parse() stringifies its argument, so a numeric epoch would come back
  // as NaN and be reported as 0; handle numbers explicitly.
  if (typeof value === 'number') return Number.isFinite(value) ? value : 0;
  const ms = value instanceof Date ? value.getTime() : Date.parse(value);
  return Number.isNaN(ms) ? 0 : ms;
}
153
+
154
/**
 * Extract up to 8 distinct keywords from text.
 *
 * The text is normalized, every character outside [a-z0-9_-] and whitespace
 * becomes a separator, and tokens shorter than 4 characters or listed in
 * STOPWORDS are dropped.
 *
 * @param {string} text
 * @returns {string[]} keywords in order of first appearance
 */
function keywords(text) {
  if (!text) return [];
  // normalize() rewrites home directories to "/Users/redacted" and ports to
  // ":PORT". Those placeholders were never written by the user; if they were
  // kept as keywords, any input containing a /Users/ path or a port would get
  // free hits toward the two-keyword match.
  const placeholders = new Set(['redacted', 'port']);
  const tokens = normalize(text)
    .replace(/[^a-z0-9\s_-]/g, ' ')
    .split(/\s+/)
    .filter((t) => t.length >= 4 && !STOPWORDS.has(t) && !placeholders.has(t));
  return [...new Set(tokens)].slice(0, 8);
}
166
+
167
/**
 * FNV-1a 32-bit hash over the UTF-16 code units of a string.
 *
 * For ASCII input the result equals the standard byte-wise FNV-1a value.
 *
 * @param {*} text - hashed as String(text || '')
 * @returns {string} 8-character lowercase hex digest
 */
function hashText(text) {
  const str = String(text || '');
  let hash = 2166136261; // FNV offset basis
  for (let i = 0; i < str.length; i++) {
    // A plain `hash * 16777619` can exceed 2^53 and silently lose low bits;
    // Math.imul performs the exact 32-bit multiplication FNV requires.
    hash = Math.imul(hash ^ str.charCodeAt(i), 16777619) >>> 0;
  }
  return hash.toString(16).padStart(8, '0');
}
179
+
180
+ // ---------------------------------------------------------------------------
181
+ // buildHybridState
182
+ // ---------------------------------------------------------------------------
183
+
184
/**
 * Build hybrid state by reading from all JSONL sources.
 *
 * Feedback log, inbox and pending-sync entries are merged and de-duplicated
 * (by `id`, or by a hash of the serialized entry when no id is present).
 * Negative entries feed per-tool counters and keyword patterns. Every entry of
 * the attributed-feedback file is counted toward its tool and toward the
 * patterns without being classified first.
 *
 * Each path falls back from the option to an environment variable
 * (RLHF_FEEDBACK_LOG, RLHF_FEEDBACK_INBOX, RLHF_PENDING_SYNC,
 * RLHF_ATTRIBUTED_FEEDBACK) and then to the default in PATHS.
 *
 * @param {Object} opts
 * @param {string} [opts.feedbackLogPath]
 * @param {string} [opts.inboxPath]
 * @param {string} [opts.pendingSyncPath]
 * @param {string} [opts.attributedFeedbackPath]
 * @returns {Object} state with `counts` ({ total, positive, negative }),
 *   `recurringNegativePatterns` (patterns seen at least twice, most frequent
 *   first), `preventionRules`, `negativeToolCounts` and
 *   `negativeToolCountsAttributed`
 */
function buildHybridState(opts) {
  const o = opts || {};
  const feedbackLogPath = o.feedbackLogPath || process.env.RLHF_FEEDBACK_LOG || PATHS.feedbackLog;
  const inboxPath = o.inboxPath || process.env.RLHF_FEEDBACK_INBOX || PATHS.inbox;
  const pendingSyncPath = o.pendingSyncPath || process.env.RLHF_PENDING_SYNC || PATHS.pendingSync;
  const attributedFeedbackPath = o.attributedFeedbackPath || process.env.RLHF_ATTRIBUTED_FEEDBACK || PATHS.attributedFeedback;

  // A valid JSONL line may hold `null`, a number or a string; reading a
  // property of `null` would abort the whole build, so only objects count.
  const isRecord = (entry) => entry !== null && typeof entry === 'object';

  const feedbackEntries = readJsonl(feedbackLogPath);
  const inboxEntries = readJsonl(inboxPath);
  const pendingSyncEntries = readJsonl(pendingSyncPath);
  const attributedEntries = readJsonl(attributedFeedbackPath).filter(isRecord);

  // Deduplicate by id across all sources
  const seen = new Set();
  const allEntries = [];
  for (const entry of [...feedbackEntries, ...inboxEntries, ...pendingSyncEntries]) {
    if (!isRecord(entry)) continue;
    const key = entry.id || hashText(JSON.stringify(entry));
    if (seen.has(key)) continue;
    seen.add(key);
    allEntries.push(entry);
  }

  const patternMap = {}; // pattern key -> { count, lastSeen, sources, text, words }

  // Fold one entry's text into patternMap. Shared by both loops below.
  const recordPattern = (entry, textParts, sourceLabel) => {
    // Empty fields are dropped before joining: a leading blank would stop the
    // start-anchored feedback-prefix stripping from matching.
    const norm = normalizePatternText(textParts.filter(Boolean).join(' '));
    if (!norm) return;
    const words = keywords(norm);
    if (words.length < 2) return; // need at least 2 meaningful words
    const patKey = words.slice(0, 4).join('_');
    if (!patternMap[patKey]) {
      patternMap[patKey] = { count: 0, lastSeen: 0, sources: [], text: norm, words };
    }
    const pattern = patternMap[patKey];
    pattern.count++;
    const ts = getTimestampMs(entry.timestamp);
    if (ts > pattern.lastSeen) pattern.lastSeen = ts;
    // One label per source is enough; pushing on every hit grew the array
    // with duplicates.
    if (!pattern.sources.includes(sourceLabel)) pattern.sources.push(sourceLabel);
  };

  // Build counts
  let positive = 0;
  let negative = 0;
  const toolNegatives = {}; // toolName -> count
  const toolNegativesAttributed = {}; // toolName -> count (from attributed only)

  for (const entry of allEntries) {
    const cls = classify(entry);
    if (cls === 'positive') {
      positive++;
      continue;
    }
    if (cls !== 'negative') continue;
    negative++;

    // Track tool-level negative counts
    const toolName = inferToolName(entry.toolName || entry.tool_name || 'unknown', entry.context || '');
    toolNegatives[toolName] = (toolNegatives[toolName] || 0) + 1;

    // Merged log / inbox / pending-sync entries all carry the 'feedbackLog' label.
    recordPattern(
      entry,
      [
        entry.context || '',
        entry.whatWentWrong || entry.what_went_wrong || '',
        entry.whatToChange || entry.what_to_change || '',
      ],
      'feedbackLog'
    );
  }

  // Process attributed feedback separately to track attributed tool counts
  for (const entry of attributedEntries) {
    const toolName = inferToolName(
      entry.toolName || entry.tool_name || entry.attributed_tool || 'unknown',
      entry.context || ''
    );
    toolNegativesAttributed[toolName] = (toolNegativesAttributed[toolName] || 0) + 1;

    recordPattern(
      entry,
      [entry.context || '', entry.whatWentWrong || entry.what_went_wrong || ''],
      'attributedFeedback'
    );
  }

  // Recurring = count >= 2
  const recurringNegativePatterns = Object.values(patternMap)
    .filter((p) => p.count >= 2)
    .sort((a, b) => b.count - a.count);

  // Prevention rules: the normalized whatToChange text of negative entries
  const preventionRules = allEntries
    .filter((e) => classify(e) === 'negative' && (e.whatToChange || e.what_to_change))
    .map((e) => normalize(e.whatToChange || e.what_to_change))
    .filter(Boolean);

  return {
    counts: { total: allEntries.length, positive, negative },
    recurringNegativePatterns,
    preventionRules,
    negativeToolCounts: toolNegatives,
    negativeToolCountsAttributed: toolNegativesAttributed,
  };
}
301
+
302
+ // ---------------------------------------------------------------------------
303
+ // deriveConstraints
304
+ // ---------------------------------------------------------------------------
305
+
306
/**
 * Produce up to `max` actionable constraint strings.
 *
 * Recurring patterns come first as `Avoid: "<text>" (seen Nx)`; prevention
 * rules fill whatever slots remain as `Rule: <text>`. Texts longer than 100
 * characters are cut and suffixed with "...".
 *
 * @param {Object} state - from buildHybridState()
 * @param {number} [max=5] - zero or negative yields no constraints
 * @returns {string[]}
 */
function deriveConstraints(state, max) {
  const requested = max !== undefined ? max : 5;
  // slice(0, n) with a negative n means "all but the last n", so a negative
  // max must be clamped rather than passed through.
  const limit = requested > 0 ? Math.floor(requested) : 0;
  if (limit === 0) return [];

  const source = state || {};
  const clip = (value) => {
    const text = String(value || '');
    return text.length > 100 ? text.slice(0, 100) + '...' : text;
  };

  // Top recurring patterns become constraints
  const constraints = (source.recurringNegativePatterns || [])
    .slice(0, limit)
    .map((pattern) => `Avoid: "${clip(pattern.text)}" (seen ${pattern.count}x)`);

  // Prevention rules fill remaining slots
  const remaining = limit - constraints.length;
  for (const rule of (source.preventionRules || []).slice(0, remaining)) {
    constraints.push(`Rule: ${clip(rule)}`);
  }

  return constraints;
}
332
+
333
+ // ---------------------------------------------------------------------------
334
+ // buildAdditionalContext
335
+ // ---------------------------------------------------------------------------
336
+
337
/**
 * Format a single summary string for pre-tool context injection.
 *
 * The summary lists the feedback counts, the number of recurring patterns and,
 * when any are given, the active constraints. The result is never longer than
 * `maxChars`: with room for it the cut text ends in "...", and for limits
 * below 3 it is cut hard because the ellipsis alone would not fit.
 *
 * @param {Object} state - missing counts are reported as 0
 * @param {string[]} constraints
 * @param {number} [maxChars=800]
 * @returns {string}
 */
function buildAdditionalContext(state, constraints, maxChars) {
  const limit = maxChars !== undefined ? maxChars : 800;
  const source = state || {};
  const counts = source.counts || {};
  const lines = [
    `Feedback history: ${counts.total || 0} total (${counts.positive || 0} positive, ${counts.negative || 0} negative)`,
    `Recurring patterns: ${(source.recurringNegativePatterns || []).length}`,
  ];
  if (constraints && constraints.length > 0) {
    lines.push('Active constraints:');
    for (const constraint of constraints) {
      lines.push(` - ${constraint}`);
    }
  }

  const result = lines.join('\n');
  if (result.length <= limit) return result;
  // slice(0, limit - 3) with limit < 3 would count from the END of the string
  // and return far more than `limit` characters.
  if (limit < 3) return result.slice(0, Math.max(0, limit));
  return result.slice(0, limit - 3) + '...';
}
362
+
363
+ // ---------------------------------------------------------------------------
364
+ // hasTwoKeywordHits
365
+ // ---------------------------------------------------------------------------
366
+
367
/**
 * Require 2+ keyword matches to reduce false positives (ATTR-03 no-false-positive invariant).
 *
 * A keyword matches when it occurs as a substring of the input. Only DISTINCT,
 * non-empty string keywords count: a repeated keyword is one hit, and an empty
 * string (which every input "contains") is ignored.
 *
 * @param {string} normalizedInput
 * @param {string[]} words - keyword list from a pattern
 * @returns {boolean}
 */
function hasTwoKeywordHits(normalizedInput, words) {
  if (typeof normalizedInput !== 'string' || normalizedInput === '') return false;
  if (!Array.isArray(words)) return false;

  const matched = new Set();
  for (const word of words) {
    if (typeof word !== 'string' || word === '' || matched.has(word)) continue;
    if (normalizedInput.includes(word)) {
      matched.add(word);
      if (matched.size >= 2) return true;
    }
  }
  return false;
}
385
+
386
+ // ---------------------------------------------------------------------------
387
+ // compileGuardArtifact
388
+ // ---------------------------------------------------------------------------
389
+
390
/**
 * Compile the recurring negative patterns of a state into a guard artifact.
 *
 * Patterns are de-duplicated by the hash of their text. A guard is 'block'
 * when its count reaches the threshold and 'warn' otherwise. Guards sourced
 * from attributedFeedback sort first; within each group higher counts come first.
 *
 * @param {Object} state - from buildHybridState()
 * @param {Object} [opts]
 * @param {number} [opts.blockThreshold=3] - count >= this → block
 * @returns {Object} artifact: { compiledAt, guardCount, blockThreshold, guards }
 */
function compileGuardArtifact(state, opts) {
  const { blockThreshold = 3 } = opts || {};

  const emitted = new Set();
  const guards = [];
  (state.recurringNegativePatterns || []).forEach((pattern) => {
    const hash = hashText(pattern.text);
    if (emitted.has(hash)) return;
    emitted.add(hash);
    guards.push({
      hash,
      text: pattern.text,
      words: pattern.words,
      count: pattern.count,
      lastSeen: pattern.lastSeen,
      attributed: pattern.sources && pattern.sources.includes('attributedFeedback'),
      mode: pattern.count >= blockThreshold ? 'block' : 'warn',
    });
  });

  // Attributed guards first, then by descending count.
  guards.sort((left, right) => {
    if (Boolean(left.attributed) !== Boolean(right.attributed)) {
      return left.attributed ? -1 : 1;
    }
    return right.count - left.count;
  });

  return {
    compiledAt: new Date().toISOString(),
    guardCount: guards.length,
    blockThreshold,
    guards,
  };
}
439
+
440
+ // ---------------------------------------------------------------------------
441
+ // writeGuardArtifact / readGuardArtifact
442
+ // ---------------------------------------------------------------------------
443
+
444
/**
 * Write a guard artifact as pretty-printed JSON, atomically (tmp → rename).
 *
 * The parent directory is created when missing. If writing or renaming fails,
 * the temporary file is removed before the error is rethrown, so failed
 * attempts do not leave `*.tmp.*` files next to the artifact.
 *
 * @param {string} filePath - destination; falls back to PATHS.guardArtifact when falsy
 * @param {Object} artifact
 * @throws {Error} whatever fs error made the write or rename fail
 */
function writeGuardArtifact(filePath, artifact) {
  const outPath = filePath || PATHS.guardArtifact;
  fs.mkdirSync(path.dirname(outPath), { recursive: true });
  // pid + timestamp in the name keeps the temp files of different writers apart.
  const tmp = `${outPath}.tmp.${process.pid}.${Date.now()}`;
  try {
    fs.writeFileSync(tmp, JSON.stringify(artifact, null, 2) + '\n');
    fs.renameSync(tmp, outPath);
  } catch (err) {
    try {
      fs.unlinkSync(tmp);
    } catch (_) {
      // Temp file was never created or is already gone.
    }
    throw err;
  }
}
457
+
458
/**
 * Load a guard artifact from disk.
 *
 * The path falls back from the argument to RLHF_GUARDS_PATH and then to
 * PATHS.guardArtifact. The file is accepted only if it parses as JSON and has
 * an array-valued `guards` property.
 *
 * @param {string} [filePath]
 * @returns {Object|null} artifact or null if invalid/missing
 */
function readGuardArtifact(filePath) {
  const inPath = filePath || process.env.RLHF_GUARDS_PATH || PATHS.guardArtifact;
  if (!fs.existsSync(inPath)) return null;
  try {
    const artifact = JSON.parse(fs.readFileSync(inPath, 'utf8'));
    return Array.isArray(artifact.guards) ? artifact : null;
  } catch (_) {
    // Unreadable file, invalid JSON, or a JSON value without properties (null).
    return null;
  }
}
476
+
477
+ // ---------------------------------------------------------------------------
478
+ // evaluateCompiledGuards (fast path)
479
+ // ---------------------------------------------------------------------------
480
+
481
/**
 * Check a compiled artifact against toolName + toolInput (fast path).
 *
 * A guard matches when the normalized input hits two of its keywords, or -
 * for inputs shorter than 10 characters - when the guard text mentions the
 * tool (or the tool is 'unknown') and the guard count reaches the artifact's
 * block threshold.
 *
 * All guards are examined: the first matching 'block' guard wins; if none
 * blocks, the first other match is returned. Guards are not ordered by
 * severity, so stopping at the first match could let a 'warn' guard hide a
 * 'block' guard further down the list.
 *
 * @param {Object} artifact
 * @param {string} toolName
 * @param {string} toolInput
 * @returns {{ mode: string, reason: string, source: string }} matches also
 *   carry `guardHash` and `attributed`
 */
function evaluateCompiledGuards(artifact, toolName, toolInput) {
  const allow = { mode: 'allow', reason: '', source: 'compiled' };
  if (!artifact || !Array.isArray(artifact.guards)) return allow;

  const normInput = normalize(toolInput || '');
  const normTool = String(toolName || '').toLowerCase();
  const threshold = artifact.blockThreshold || 3;

  let firstSoftMatch = null;
  for (const guard of artifact.guards) {
    // The artifact comes from disk; skip entries that are not guard objects.
    if (!guard || typeof guard !== 'object') continue;

    let reason = null;
    if (hasTwoKeywordHits(normInput, guard.words || [])) {
      reason = `Matched guard pattern (count: ${guard.count}): "${(guard.text || '').slice(0, 80)}"`;
    } else if (normInput.length < 10 && guard.count >= threshold) {
      // Tool-level match when input is empty or short
      const toolMentioned = normTool === 'unknown' || normalize(guard.text || '').includes(normTool);
      if (toolMentioned) {
        reason = `Tool "${toolName}" has recurring negative patterns (count: ${guard.count})`;
      }
    }
    if (reason === null) continue;

    const verdict = {
      mode: guard.mode || 'warn',
      reason,
      source: 'compiled',
      guardHash: guard.hash,
      attributed: guard.attributed,
    };
    if (verdict.mode === 'block') return verdict;
    if (firstSoftMatch === null) firstSoftMatch = verdict;
  }

  return firstSoftMatch || allow;
}
526
+
527
+ // ---------------------------------------------------------------------------
528
+ // evaluatePretoolFromState (live path)
529
+ // ---------------------------------------------------------------------------
530
+
531
/**
 * Live path: check recurringNegativePatterns, then the per-tool negative counts.
 *
 * A pattern whose keywords hit the input twice yields 'block' at count >= 3
 * and 'warn' below. Otherwise the tool itself is checked: 3+ attributed
 * negatives block, 5+ raw negatives warn. Tool counts are looked up under the
 * exact `toolName` given.
 *
 * @param {Object} state - from buildHybridState(); a missing state allows
 * @param {string} toolName
 * @param {string} toolInput
 * @returns {{ mode: string, reason: string, source: string }}
 */
function evaluatePretoolFromState(state, toolName, toolInput) {
  const source = state || {};
  const normInput = normalize(toolInput || '');

  for (const pattern of source.recurringNegativePatterns || []) {
    if (!hasTwoKeywordHits(normInput, pattern.words || [])) continue;
    return {
      mode: pattern.count >= 3 ? 'block' : 'warn',
      reason: `Recurring negative pattern (count: ${pattern.count}): "${(pattern.text || '').slice(0, 80)}"`,
      source: 'state',
    };
  }

  // Own-property lookup: a tool called e.g. "constructor" must not pick up
  // members inherited from Object.prototype.
  const countFor = (table) =>
    (table && Object.prototype.hasOwnProperty.call(table, toolName) && table[toolName]) || 0;

  // Tool-level check: if this tool has many attributed negatives
  const attrCount = countFor(source.negativeToolCountsAttributed);
  const rawCount = countFor(source.negativeToolCounts);
  if (attrCount >= 3 || rawCount >= 5) {
    return {
      mode: attrCount >= 3 ? 'block' : 'warn',
      reason: `Tool "${toolName}" has ${attrCount} attributed negative(s), ${rawCount} total negative(s)`,
      source: 'state',
    };
  }

  return { mode: 'allow', reason: '', source: 'state' };
}
567
+
568
+ // ---------------------------------------------------------------------------
569
+ // evaluatePretool (orchestrator)
570
+ // ---------------------------------------------------------------------------
571
+
572
/**
 * Main pre-tool evaluation. Tries compiled artifact first, falls back to live state.
 *
 * When a valid artifact exists its verdict is final, including 'allow'; the
 * JSONL sources are only read when no artifact can be loaded.
 *
 * Important invariant: a tool+input with NEVER a negative returns {mode:'allow'}.
 * hasTwoKeywordHits and count >= 2 filters enforce this (ATTR-03 no-false-positives).
 *
 * @param {string} toolName
 * @param {string} toolInput
 * @param {Object} [opts]
 * @param {string} [opts.guardArtifactPath]
 * @param {string} [opts.feedbackLogPath]
 * @param {string} [opts.inboxPath]
 * @param {string} [opts.pendingSyncPath]
 * @param {string} [opts.attributedFeedbackPath]
 * @returns {{ mode: 'block'|'warn'|'allow', reason: string, source: string }}
 */
function evaluatePretool(toolName, toolInput, opts) {
  const o = opts || {};

  // Fast path: compiled artifact (trusted even when it says allow)
  const artifactPath = o.guardArtifactPath || process.env.RLHF_GUARDS_PATH || PATHS.guardArtifact;
  const artifact = readGuardArtifact(artifactPath);
  if (artifact) {
    return evaluateCompiledGuards(artifact, toolName, toolInput);
  }

  // Slow path: build live state. Every source override is forwarded so a
  // caller that redirects the feedback files does not get the default inbox /
  // pending-sync files mixed in; undefined options keep the defaults.
  const state = buildHybridState({
    feedbackLogPath: o.feedbackLogPath,
    inboxPath: o.inboxPath,
    pendingSyncPath: o.pendingSyncPath,
    attributedFeedbackPath: o.attributedFeedbackPath,
  });
  return evaluatePretoolFromState(state, toolName, toolInput);
}
606
+
607
+ // ---------------------------------------------------------------------------
608
+ // CLI main()
609
+ // ---------------------------------------------------------------------------
610
+
611
/**
 * CLI entry point.
 *
 *   --pretool <toolName> [toolInput]  print the verdict as JSON; exit code 2 on 'block', else 0
 *   --compile-guards [outPath]        compile guards from current state and write the artifact
 *   (no flag)                         print state, constraints and additional context
 *
 * The exit status is set through process.exitCode instead of process.exit(),
 * so the process ends on its own once pending stdout writes have been flushed.
 */
function main() {
  const args = process.argv.slice(2);

  if (args[0] === '--pretool') {
    const toolName = args[1] || 'unknown';
    const rawInput = args[2] || '';
    let toolInput = rawInput;
    try {
      // JSON input is re-serialized; anything else is used verbatim.
      const parsed = JSON.parse(rawInput);
      toolInput = typeof parsed === 'object' ? JSON.stringify(parsed) : String(parsed);
    } catch (_) {
      toolInput = rawInput;
    }
    const result = evaluatePretool(toolName, toolInput);
    console.log(JSON.stringify(result, null, 2));
    process.exitCode = result.mode === 'block' ? 2 : 0;
    return;
  }

  if (args[0] === '--compile-guards') {
    const outPath = args[1] || PATHS.guardArtifact;
    const state = buildHybridState({});
    const artifact = compileGuardArtifact(state);
    writeGuardArtifact(outPath, artifact);
    console.log(JSON.stringify({ guardCount: artifact.guardCount, outPath, compiledAt: artifact.compiledAt }, null, 2));
    process.exitCode = 0;
    return;
  }

  // Default: print full state + constraints + additional context
  const state = buildHybridState({});
  const constraints = deriveConstraints(state);
  const additionalContext = buildAdditionalContext(state, constraints);
  console.log('=== Hybrid Feedback State ===');
  console.log(JSON.stringify({ state, constraints, additionalContext }, null, 2));
}
647
+
648
+ // ---------------------------------------------------------------------------
649
+ // Exports
650
+ // ---------------------------------------------------------------------------
651
+
652
// Functions that make up the module's documented API.
const publicApi = {
  buildHybridState,
  evaluatePretool,
  compileGuardArtifact,
  writeGuardArtifact,
  readGuardArtifact,
  evaluateCompiledGuards,
  evaluatePretoolFromState,
  deriveConstraints,
  buildAdditionalContext,
};

// Internal helpers (exposed for testing)
const testingHelpers = {
  normalize,
  normalizePatternText,
  inferToolName,
  classify,
  keywords,
  hashText,
  hasTwoKeywordHits,
  readJsonl,
  PATHS,
};

module.exports = { ...publicApi, ...testingHelpers };

// Run the CLI only when this file is executed directly, not when it is required.
if (require.main === module) {
  main();
}