@clawtrial/courtroom 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +188 -0
- package/SECURITY.md +124 -0
- package/TECHNICAL_OVERVIEW.md +278 -0
- package/package.json +52 -0
- package/scripts/cli.js +117 -0
- package/scripts/postinstall.js +206 -0
- package/src/api.js +237 -0
- package/src/autostart.js +60 -0
- package/src/config.js +209 -0
- package/src/consent.js +215 -0
- package/src/core.js +232 -0
- package/src/crypto.js +194 -0
- package/src/detector-v1.js +572 -0
- package/src/detector.js +821 -0
- package/src/hearing.js +459 -0
- package/src/index.js +184 -0
- package/src/offenses/index.js +561 -0
- package/src/prompts/judge.js +62 -0
- package/src/prompts/jury.js +137 -0
- package/src/punishment.js +372 -0
package/src/detector.js
ADDED
|
@@ -0,0 +1,821 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Offense Detector v2 - Semantic Understanding
|
|
3
|
+
*
|
|
4
|
+
* Uses LLM-based evaluation and embeddings instead of keyword matching.
|
|
5
|
+
* The agent itself evaluates if behavioral rules are violated based on context.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
const { OFFENSES } = require('./offenses');
|
|
9
|
+
|
|
10
|
+
/**
 * Detects conversational "offenses" by asking the agent's own LLM to judge the
 * conversation against a per-offense rubric, with cooldowns, a daily case
 * limit, and a small in-memory cache of recent verdicts.
 */
class SemanticOffenseDetector {
  /**
   * @param {object} agentRuntime - Runtime exposing `llm.call(options)` and,
   *   optionally, `model.primary` (used as the model name for evaluations).
   * @param {object} configManager - Configuration source exposing `get(key)`.
   */
  constructor(agentRuntime, configManager) {
    this.agent = agentRuntime;
    this.config = configManager;
    this.lastEvaluation = null;
    this.casesToday = 0;
    this.lastCaseDate = null;
    this.cooldowns = new Map();
    this.conversationEmbeddings = [];

    // Evaluation cache to avoid repeated LLM calls
    this.evaluationCache = new Map();
    this.cacheMaxSize = 100;
    this.cacheTTL = 5 * 60 * 1000; // 5 minutes
  }

  /**
   * Main evaluation using LLM-based semantic understanding.
   *
   * Every offense in OFFENSES that is not on cooldown is evaluated in turn.
   * The violation with the highest `confidence * severity weight` becomes the
   * primary offense; the remaining violations are reported as secondary.
   *
   * @param {Array<{role: string, content: string}>} sessionHistory - Conversation turns.
   * @param {object} agentMemory - Memory store handed to the offense evaluators.
   * @returns {Promise<object>} `{ triggered: false, reason }` or
   *   `{ triggered: true, offense, secondaryOffenses, humorContext }`.
   */
  async evaluate(sessionHistory, agentMemory) {
    if (!this.isCooldownElapsed()) {
      return { triggered: false, reason: 'cooldown_active' };
    }

    if (this.isDailyLimitReached()) {
      return { triggered: false, reason: 'daily_limit_reached' };
    }

    this.lastEvaluation = Date.now();

    // Build context for LLM evaluation
    const context = this.buildContext(sessionHistory);
    const minConfidence = this.config.get('detection.minConfidence');

    // Evaluate each offense using LLM (sequentially, one call per offense)
    const evaluations = [];
    for (const offense of Object.values(OFFENSES)) {
      if (this.isOffenseOnCooldown(offense.id)) continue;

      const evaluation = await this.evaluateWithLLM(offense, context, agentMemory);
      if (evaluation.isViolation && evaluation.confidence >= minConfidence) {
        evaluations.push({ offense, ...evaluation });
      }
    }

    if (evaluations.length === 0) {
      return { triggered: false, reason: 'no_violations_detected' };
    }

    // Sort by confidence and severity. An unrecognised severity is weighted
    // like "minor" so the comparator never produces NaN.
    const severityWeight = new Map([
      ['severe', 3],
      ['moderate', 2],
      ['minor', 1]
    ]);
    const score = (entry) =>
      entry.confidence * (severityWeight.get(entry.offense.severity) ?? 1);
    evaluations.sort((a, b) => score(b) - score(a));

    const primary = evaluations[0];
    this.setCooldown(primary.offense.id, primary.offense.cooldown.afterCase);
    this.incrementDailyCaseCount();

    return {
      triggered: true,
      offense: {
        offenseId: primary.offense.id,
        offenseName: primary.offense.name,
        severity: primary.offense.severity,
        confidence: primary.confidence,
        evidence: primary.evidence,
        cooldownMinutes: primary.offense.cooldown.afterCase
      },
      secondaryOffenses: evaluations.slice(1),
      humorContext: this.detectHumorTriggers(sessionHistory)
    };
  }

  /**
   * Build rich context from conversation history.
   *
   * `fullConversation` covers the whole history; every other field is limited
   * to the last `detection.evaluationWindow` turns.
   *
   * @param {Array<{role: string, content: string}>} history - Conversation turns;
   *   anything that is not an array is treated as an empty conversation.
   * @returns {object} Context consumed by the prompt builder and cache key.
   */
  buildContext(history) {
    const turns = Array.isArray(history) ? history : [];
    const windowSize = this.config.get('detection.evaluationWindow');
    const recentHistory = turns.slice(-windowSize);

    return {
      fullConversation: turns.map(h => `${h.role}: ${h.content}`).join('\n'),
      recentTurns: recentHistory,
      userMessages: recentHistory.filter(h => h.role === 'user').map(h => h.content),
      assistantMessages: recentHistory.filter(h => h.role === 'assistant').map(h => h.content),
      turnCount: recentHistory.length,
      topics: this.extractTopics(recentHistory),
      sentiment: this.analyzeSentiment(recentHistory)
    };
  }

  /**
   * Evaluate offense using LLM semantic understanding (with caching).
   *
   * @param {object} offense - Offense definition; only `id` is read here.
   * @param {object} context - Context produced by `buildContext`.
   * @param {object} agentMemory - Memory store; read only for `promise_breaker`.
   * @returns {Promise<{isViolation: boolean, confidence: number, explanation?: string, evidence: ?object}>}
   *   A non-violation result is returned when the LLM call fails.
   */
  async evaluateWithLLM(offense, context, agentMemory) {
    // Generate cache key from offense + conversation hash
    const cacheKey = this.generateCacheKey(offense.id, context);

    // Check cache first
    const cached = this.getCachedEvaluation(cacheKey);
    if (cached) {
      return cached;
    }

    // The commitments lookup is asynchronous, so it has to be awaited here:
    // interpolating the pending promise into the prompt would render as
    // "[object Promise]".
    const commitments = offense.id === 'promise_breaker'
      ? await this.getCommitmentsFromMemory(agentMemory)
      : undefined;

    const prompt = this.buildEvaluationPrompt(offense, context, agentMemory, commitments);

    try {
      const response = await this.agent.llm.call({
        model: this.agent.model?.primary || 'default',
        messages: [{ role: 'user', content: prompt }],
        temperature: 0.1,
        maxTokens: 500
      });

      const result = this.parseEvaluationResponse(response.content || response);

      // Cache the result
      this.setCachedEvaluation(cacheKey, result);

      return result;
    } catch (error) {
      console.error('LLM evaluation failed:', error);
      return { isViolation: false, confidence: 0, evidence: null };
    }
  }

  /**
   * Generate cache key from offense and conversation.
   *
   * Only the last 3 user messages of the context take part in the key.
   *
   * @param {string} offenseId
   * @param {{userMessages: string[]}} context
   * @returns {string} `<offenseId>:<hash>`
   */
  generateCacheKey(offenseId, context) {
    const recentMessages = context.userMessages.slice(-3).join('|');
    return `${offenseId}:${this.simpleHash(recentMessages)}`;
  }

  /**
   * Simple string hash function (31-multiplier rolling hash kept in 32 bits).
   *
   * @param {string} str
   * @returns {string} Base-36 rendering of the absolute hash value.
   */
  simpleHash(str) {
    let hash = 0;
    for (let i = 0; i < str.length; i++) {
      const char = str.charCodeAt(i);
      hash = ((hash << 5) - hash) + char;
      hash = hash & hash; // truncate to a 32-bit integer
    }
    return Math.abs(hash).toString(36);
  }

  /**
   * Get cached evaluation if valid.
   *
   * A hit moves the entry to the most-recently-used position without
   * extending its TTL; an expired entry is removed.
   *
   * @param {string} key
   * @returns {?object} The cached result, or null when absent or expired.
   */
  getCachedEvaluation(key) {
    const entry = this.evaluationCache.get(key);
    if (!entry) return null;

    // Check if cache entry is still valid
    if (Date.now() - entry.timestamp > this.cacheTTL) {
      this.evaluationCache.delete(key);
      return null;
    }

    // Map iterates in insertion order, so re-inserting marks the entry as
    // most recently used.
    this.evaluationCache.delete(key);
    this.evaluationCache.set(key, entry);

    return entry.result;
  }

  /**
   * Set cached evaluation with LRU eviction.
   *
   * @param {string} key
   * @param {object} result
   */
  setCachedEvaluation(key, result) {
    // Overwriting an existing key must not push out an unrelated entry.
    this.evaluationCache.delete(key);

    // Evict least recently used entry if cache is full
    if (this.evaluationCache.size >= this.cacheMaxSize) {
      const oldestKey = this.evaluationCache.keys().next().value;
      this.evaluationCache.delete(oldestKey);
    }

    this.evaluationCache.set(key, {
      result,
      timestamp: Date.now()
    });
  }

  /**
   * Clear evaluation cache.
   */
  clearCache() {
    this.evaluationCache.clear();
  }

  /**
   * Build evaluation prompt for LLM.
   *
   * Only the prompt for the requested offense is rendered. An offense id
   * without a dedicated prompt falls back to the circular_reference prompt.
   *
   * @param {object} offense - Offense definition; only `id` is read.
   * @param {{fullConversation: string}} context - Context from `buildContext`.
   * @param {object} agentMemory - Unused here; kept so existing callers keep working.
   * @param {string} [commitments] - Pre-resolved commitments text for the
   *   promise_breaker prompt (see `getCommitmentsFromMemory`).
   * @returns {string} Prompt text.
   */
  buildEvaluationPrompt(offense, context, agentMemory, commitments = 'No previous commitments recorded.') {
    const prompts = {
      circular_reference: () => `
You are evaluating if the user is asking substantively similar questions repeatedly.

OFFENSE: The Circular Reference
DEFINITION: Asking the same question or seeking the same information multiple times without acknowledging previous answers.

CONVERSATION HISTORY:
${context.fullConversation}

Evaluate:
1. Is the user asking questions that are semantically similar (same intent/meaning, even if worded differently)?
2. Have they asked essentially the same thing 3+ times?
3. Are they ignoring or forgetting previous answers?

Respond in JSON:
{
"isViolation": true/false,
"confidence": 0.0-1.0,
"explanation": "brief explanation",
"evidence": {
"similarQuestions": ["question 1", "question 2", "question 3"],
"pattern": "description of repetition pattern"
}
}`,

      validation_vampire: () => `
You are evaluating if the user is seeking excessive reassurance/validation.

OFFENSE: The Validation Vampire
DEFINITION: Repeatedly asking for confirmation, approval, or reassurance instead of making decisions or taking action.

CONVERSATION HISTORY:
${context.fullConversation}

Evaluate:
1. Is the user asking "is this right?", "should I?", "do you think?" type questions repeatedly?
2. Are they seeking permission/approval for decisions they should make themselves?
3. Is there a pattern of validation-seeking without forward progress?

Respond in JSON:
{
"isViolation": true/false,
"confidence": 0.0-1.0,
"explanation": "brief explanation",
"evidence": {
"validationRequests": ["example 1", "example 2"],
"decisionAvoidance": "description of pattern"
}
}`,

      overthinker: () => `
You are evaluating if the user is overthinking/generating excessive hypotheticals.

OFFENSE: The Overthinker
DEFINITION: Generating hypothetical scenarios, edge cases, or "what if" questions to avoid taking concrete action.

CONVERSATION HISTORY:
${context.fullConversation}

Evaluate:
1. Is the user raising numerous hypothetical concerns ("what if", "but then", "however")?
2. Are they creating edge cases faster than solutions?
3. Is the analysis-to-action ratio heavily skewed toward analysis?
4. Have they been given concrete steps but keep raising new concerns?

Respond in JSON:
{
"isViolation": true/false,
"confidence": 0.0-1.0,
"explanation": "brief explanation",
"evidence": {
"hypotheticals": ["what if X", "what if Y"],
"avoidedActions": ["actions they haven't taken"]
}
}`,

      goalpost_mover: () => `
You are evaluating if the user is moving goalposts/changing requirements.

OFFENSE: The Goalpost Mover
DEFINITION: Changing success criteria, adding new requirements, or redefining "done" after receiving deliverables.

CONVERSATION HISTORY:
${context.fullConversation}

Evaluate:
1. Did the user request something specific initially?
2. Was that request completed/delivered?
3. Did they then add new requirements, change criteria, or say "also..."?
4. Is there a pattern of expanding scope after completion?

Respond in JSON:
{
"isViolation": true/false,
"confidence": 0.0-1.0,
"explanation": "brief explanation",
"evidence": {
"originalRequest": "what they asked for",
"delivered": "what was provided",
"newRequirements": ["new req 1", "new req 2"]
}
}`,

      avoidance_artist: () => `
You are evaluating if the user is avoiding core issues through deflection.

OFFENSE: The Avoidance Artist
DEFINITION: Systematically deflecting from uncomfortable but necessary topics by changing subject, raising tangents, or ignoring direct questions.

CONVERSATION HISTORY:
${context.fullConversation}

Evaluate:
1. Was a core issue identified or direct question asked?
2. Did the user change the subject or introduce a tangent?
3. Is there a pattern of deflection when actionable topics arise?
4. Are they avoiding something they need to address?

Respond in JSON:
{
"isViolation": true/false,
"confidence": 0.0-1.0,
"explanation": "brief explanation",
"evidence": {
"coreIssue": "what was being avoided",
"deflections": ["how they changed subject"]
}
}`,

      promise_breaker: () => `
You are evaluating if the user has broken commitments.

OFFENSE: The Promise Breaker
DEFINITION: Committing to actions ("I will...", "I'll do that...") and not following through.

PREVIOUS COMMITMENTS FROM MEMORY:
${commitments}

CONVERSATION HISTORY:
${context.fullConversation}

Evaluate:
1. Did the user make explicit commitments in previous conversations?
2. Have those commitments been fulfilled?
3. Is the same issue resurfacing without acknowledgment of previous commitment?
4. Has sufficient time passed (days/weeks) for action?

Respond in JSON:
{
"isViolation": true/false,
"confidence": 0.0-1.0,
"explanation": "brief explanation",
"evidence": {
"commitments": ["what they promised"],
"unfulfilled": ["what wasn't done"]
}
}`,

      context_collapser: () => `
You are evaluating if the user is ignoring established context/facts.

OFFENSE: The Context Collapser
DEFINITION: Disregarding previously established information, contradicting stated facts, or asking questions that were already answered.

CONVERSATION HISTORY:
${context.fullConversation}

Evaluate:
1. Were facts/preferences established earlier in the conversation?
2. Is the user now contradicting those facts or ignoring them?
3. Are they asking questions that were already answered?
4. Is there selective amnesia about what was discussed?

Respond in JSON:
{
"isViolation": true/false,
"confidence": 0.0-1.0,
"explanation": "brief explanation",
"evidence": {
"establishedFacts": ["what was established"],
"contradictions": ["how they contradicted it"]
}
}`,

      emergency_fabricator: () => `
You are evaluating if the user is manufacturing false urgency.

OFFENSE: The Emergency Fabricator
DEFINITION: Claiming urgency ("this is urgent", "I need this NOW") that doesn't match actual time pressure or behavior.

CONVERSATION HISTORY:
${context.fullConversation}

Evaluate:
1. Did the user claim urgency or emergency?
2. Was there actual follow-through on the urgency?
3. Is there a pattern of claiming urgency without corresponding action?
4. Does the claimed urgency match the actual situation?

Respond in JSON:
{
"isViolation": true/false,
"confidence": 0.0-1.0,
"explanation": "brief explanation",
"evidence": {
"urgencyClaims": ["urgent statements"],
"inaction": "what didn't happen"
}
}`,

      monopolizer: () => `
You are evaluating if the user is dominating the conversation.

OFFENSE: The Monopolizer
DEFINITION: Sending multiple consecutive messages without allowing the agent to respond, dominating the conversation flow.

CONVERSATION HISTORY:
${context.fullConversation}

Evaluate:
1. Is the user sending 4+ messages in a row without agent response?
2. Is the user-to-agent message ratio heavily skewed (>5:1)?
3. Is the user continuing to send messages while the agent is trying to respond?
4. Is there a pattern of not allowing the agent space to contribute?

Respond in JSON:
{
"isViolation": true/false,
"confidence": 0.0-1.0,
"explanation": "brief explanation",
"evidence": {
"consecutiveMessages": 4,
"messageRatio": "user:agent ratio",
"interruptions": ["examples"]
}
}`,

      contrarian: () => `
You are evaluating if the user is being habitually contrary.

OFFENSE: The Contrarian
DEFINITION: Disagreeing with or rejecting suggestions without offering constructive alternatives or valid reasons.

CONVERSATION HISTORY:
${context.fullConversation}

Evaluate:
1. Has the user rejected 3+ agent suggestions in a row?
2. Are they dismissing ideas without proposing alternatives?
3. Is there a pattern of "that won't work" without explanation?
4. Are valid solutions being dismissed without being tried?

Respond in JSON:
{
"isViolation": true/false,
"confidence": 0.0-1.0,
"explanation": "brief explanation",
"evidence": {
"suggestionsMade": ["what was suggested"],
"rejections": ["how they were rejected"],
"alternativesOffered": ["any alternatives"]
}
}`,

      vague_requester: () => `
You are evaluating if the user is making vague requests.

OFFENSE: The Vague Requester
DEFINITION: Asking for help without providing necessary context, specifics, or details needed to assist effectively.

CONVERSATION HISTORY:
${context.fullConversation}

Evaluate:
1. Is the user asking for help without providing code, errors, or context?
2. Have they used phrases like "fix this" or "it doesn't work" without specifics?
3. Has the agent needed to ask for clarification 3+ times?
4. Are descriptions ambiguous or lacking actionable details?

Respond in JSON:
{
"isViolation": true/false,
"confidence": 0.0-1.0,
"explanation": "brief explanation",
"evidence": {
"vagueRequests": ["examples"],
"clarificationsNeeded": ["what was asked"],
"contextMissing": ["what wasn't provided"]
}
}`,

      scope_creeper: () => `
You are evaluating if the user is gradually expanding project scope.

OFFENSE: The Scope Creeper
DEFINITION: Gradually expanding project requirements beyond the original agreement through "small additions" and "while you're at it" requests.

CONVERSATION HISTORY:
${context.fullConversation}

Evaluate:
1. Was an original scope defined and agreed upon?
2. Has the user added 3+ "small" requests after initial completion?
3. Are new requirements being added in multiple separate instances?
4. Is the user treating initial deliverable as a starting point for more work?

Respond in JSON:
{
"isViolation": true/false,
"confidence": 0.0-1.0,
"explanation": "brief explanation",
"evidence": {
"originalScope": "what was agreed",
"delivered": "what was completed",
"additionalRequests": ["new requirements"]
}
}`,

      unreader: () => `
You are evaluating if the user is ignoring provided materials.

OFFENSE: The Unreader
DEFINITION: Not reading provided documentation, code, explanations, or previous answers before asking questions.

CONVERSATION HISTORY:
${context.fullConversation}

Evaluate:
1. Did the agent provide detailed explanations, code, or documentation?
2. Is the user asking questions that were answered in the provided materials?
3. Are they asking about topics covered in shared documentation?
4. Is there evidence they didn't read code comments or explanations?

Respond in JSON:
{
"isViolation": true/false,
"confidence": 0.0-1.0,
"explanation": "brief explanation",
"evidence": {
"materialsProvided": ["what was shared"],
"questionsAsked": ["redundant questions"],
"overlap": "how questions were already answered"
}
}`,

      interjector: () => `
You are evaluating if the user is interrupting the agent.

OFFENSE: The Interjector
DEFINITION: Interrupting the agent's explanations or thought process with new questions or tangents.

CONVERSATION HISTORY:
${context.fullConversation}

Evaluate:
1. Is the user sending messages while the agent is mid-explanation?
2. Are there 2+ interruptions during a single complex response?
3. Is the user asking new questions before the agent finishes answering previous ones?
4. Is there a pattern of not allowing the agent to complete thoughts?

Respond in JSON:
{
"isViolation": true/false,
"confidence": 0.0-1.0,
"explanation": "brief explanation",
"evidence": {
"interruptionPoints": ["where they interrupted"],
"incompleteResponses": ["what agent was saying"],
"parallelQuestions": ["questions asked mid-response"]
}
}`,

      ghost: () => `
You are evaluating if the user has ghosted mid-conversation.

OFFENSE: The Ghost
DEFINITION: Disappearing mid-conversation after requesting help or making commitments, without acknowledgment or closure.

CONVERSATION HISTORY:
${context.fullConversation}

Evaluate:
1. Did the user request help or start an active troubleshooting session?
2. Did the agent provide a response that required user follow-up?
3. Has the user not responded for an extended period (24+ hours)?
4. Was the conversation left in an unresolved state?

Respond in JSON:
{
"isViolation": true/false,
"confidence": 0.0-1.0,
"explanation": "brief explanation",
"evidence": {
"lastUserMessage": "what they said",
"agentResponse": "what agent replied",
"timeElapsed": "how long since last message",
"context": "what was unresolved"
}
}`,

      perfectionist: () => `
You are evaluating if the user is endlessly refining without completion.

OFFENSE: The Perfectionist
DEFINITION: Continuously requesting refinements and tweaks without ever accepting work as complete.

CONVERSATION HISTORY:
${context.fullConversation}

Evaluate:
1. Has the user requested 5+ rounds of changes after initial deliverable?
2. Have they accepted work then returned with new tweaks 3+ times?
3. Is there no clear definition of "done"?
4. Are changes becoming increasingly minor/nitpicky?

Respond in JSON:
{
"isViolation": true/false,
"confidence": 0.0-1.0,
"explanation": "brief explanation",
"evidence": {
"deliverables": ["what was delivered"],
"revisionRounds": 5,
"changes": ["what was changed"],
"doneDefinition": "if one exists"
}
}`,

      jargon_juggler: () => `
You are evaluating if the user is using jargon incorrectly.

OFFENSE: The Jargon Juggler
DEFINITION: Using technical buzzwords without understanding their meaning, often as substitutes for actual comprehension.

CONVERSATION HISTORY:
${context.fullConversation}

Evaluate:
1. Is the user using technical terms incorrectly?
2. Have they continued using terms wrong after correction?
3. Are buzzwords being used to mask lack of understanding?
4. Is there a pattern of jargon without substance?

Respond in JSON:
{
"isViolation": true/false,
"confidence": 0.0-1.0,
"explanation": "brief explanation",
"evidence": {
"jargonUsed": ["terms used"],
"corrections": ["what was corrected"],
"misuse": ["how terms were misused"]
}
}`,

      deadline_denier: () => `
You are evaluating if the user is ignoring realistic timelines.

OFFENSE: The Deadline Denier
DEFINITION: Refusing to acknowledge time constraints or demanding impossible deadlines.

CONVERSATION HISTORY:
${context.fullConversation}

Evaluate:
1. Did the agent provide a realistic timeline estimate?
2. Is the user demanding significantly faster delivery (50%+ reduction)?
3. Are they dismissing technical constraints that affect timeline?
4. Is the requested timeline unrealistic given the complexity?

Respond in JSON:
{
"isViolation": true/false,
"confidence": 0.0-1.0,
"explanation": "brief explanation",
"evidence": {
"originalTimeline": "what was estimated",
"demandedTimeline": "what user wants",
"constraints": ["technical limitations"],
"complexity": "scope of work"
}
}`
    };

    // Own-property check so ids such as "constructor" cannot resolve to
    // members inherited from Object.prototype.
    const hasPrompt = Object.prototype.hasOwnProperty.call(prompts, offense.id);
    const render = hasPrompt ? prompts[offense.id] : prompts.circular_reference;
    return render();
  }

  /**
   * Parse LLM evaluation response.
   *
   * Takes the span from the first "{" to the last "}" of the response and
   * parses it as JSON. Confidence is clamped to [0, 1].
   *
   * @param {string} response - Raw LLM output.
   * @returns {{isViolation: boolean, confidence: number, explanation?: string, evidence: ?object}}
   *   A non-violation result when the response is not a string, holds no JSON
   *   object, or cannot be parsed.
   */
  parseEvaluationResponse(response) {
    if (typeof response !== 'string') {
      return { isViolation: false, confidence: 0, evidence: null };
    }

    try {
      // Extract JSON from response
      const jsonMatch = response.match(/\{[\s\S]*\}/);
      if (!jsonMatch) {
        return { isViolation: false, confidence: 0, evidence: null };
      }

      const result = JSON.parse(jsonMatch[0]);
      return {
        isViolation: result.isViolation === true,
        confidence: Math.max(0, Math.min(1, parseFloat(result.confidence) || 0)),
        explanation: result.explanation || '',
        evidence: result.evidence || null
      };
    } catch (error) {
      console.error('Failed to parse LLM response:', error);
      return { isViolation: false, confidence: 0, evidence: null };
    }
  }

  /**
   * Get commitments from agent memory.
   *
   * Reads the `courtroom_commitments` entry and renders one bullet per
   * commitment. Best effort: any failure yields the "no commitments" text.
   *
   * @param {object} agentMemory - Store exposing `get(key)`.
   * @returns {Promise<string>} Bullet list, or 'No previous commitments recorded.'
   */
  async getCommitmentsFromMemory(agentMemory) {
    try {
      const commitments = await agentMemory.get('courtroom_commitments') || [];
      return commitments.map(c =>
        `- "${c.statement}" (${c.date}) - Completed: ${c.completed ? 'Yes' : 'No'}`
      ).join('\n') || 'No previous commitments recorded.';
    } catch {
      return 'No previous commitments recorded.';
    }
  }

  /**
   * Extract topics from conversation.
   *
   * Simple frequency count over lower-cased words of 5+ characters, minus a
   * short stop list - can be enhanced with NLP.
   *
   * @param {Array<{content: string}>} history
   * @returns {string[]} Up to five most frequent words, most frequent first.
   */
  extractTopics(history) {
    const stopWords = new Set(['about', 'would', 'could', 'should', 'there', 'their']);
    const allText = history.map(h => h.content).join(' ').toLowerCase();
    const candidates = allText.match(/\b\w{5,}\b/g) || [];

    // A Map keeps words such as "constructor" from colliding with members
    // inherited by plain objects.
    const wordFreq = new Map();
    for (const word of candidates) {
      if (stopWords.has(word)) continue;
      wordFreq.set(word, (wordFreq.get(word) || 0) + 1);
    }

    return [...wordFreq.entries()]
      .sort((a, b) => b[1] - a[1])
      .slice(0, 5)
      .map(([word]) => word);
  }

  /**
   * Analyze sentiment of conversation.
   *
   * Counts how many distinct urgency and frustration terms occur in the user
   * messages. Urgency terms are matched as whole words so that "now" is not
   * found inside "know"; frustration terms are matched as substrings, which
   * also covers inflections such as "wasted".
   *
   * @param {Array<{role: string, content: string}>} history
   * @returns {{urgency: number, frustration: number, messageCount: number}}
   */
  analyzeSentiment(history) {
    const userMessages = history.filter(h => h.role === 'user').map(h => h.content);
    const text = userMessages.join(' ').toLowerCase();

    const urgentWords = ['urgent', 'asap', 'emergency', 'critical', 'now', 'immediately'];
    const frustratedWords = ['frustrated', 'annoying', 'stupid', 'useless', 'waste'];

    return {
      urgency: urgentWords.filter(w => new RegExp(`\\b${w}\\b`).test(text)).length,
      frustration: frustratedWords.filter(w => text.includes(w)).length,
      messageCount: userMessages.length
    };
  }

  /**
   * Detect humor triggers (for commentary flavor).
   *
   * Scans the last five turns for phrases hinting at repetition, validation
   * seeking, overthinking, or deflection.
   *
   * @param {Array<{content: string}>} history - Conversation turns; anything
   *   that is not an array is treated as an empty conversation.
   * @returns {string[]} Trigger labels, possibly empty.
   */
  detectHumorTriggers(history) {
    const turns = Array.isArray(history) ? history : [];
    const recentContent = turns
      .slice(-5)
      .map(h => String(h?.content ?? '').toLowerCase())
      .join(' ');

    const triggers = [];
    if (/again|repeat|said|already|before/.test(recentContent)) triggers.push('repetition_noted');
    if (/sure|right|correct|think|should i/.test(recentContent)) triggers.push('validation_seeking');
    if (/what if|but then|however|maybe/.test(recentContent)) triggers.push('overthinking');
    if (/actually|by the way|speaking of/.test(recentContent)) triggers.push('deflection');

    return triggers;
  }

  /**
   * Cooldown management: has the global cooldown since the last evaluation
   * elapsed?
   *
   * Uses `detection.cooldownMinutes`, falling back to 30 only when the setting
   * is null or undefined, so an explicit 0 disables the cooldown.
   *
   * @returns {boolean}
   */
  isCooldownElapsed() {
    if (!this.lastEvaluation) return true;
    const cooldownMs = (this.config.get('detection.cooldownMinutes') ?? 30) * 60 * 1000;
    return (Date.now() - this.lastEvaluation) > cooldownMs;
  }

  /**
   * @param {string} offenseId
   * @returns {boolean} True while the offense's own cooldown is still running.
   */
  isOffenseOnCooldown(offenseId) {
    const cooldownEnd = this.cooldowns.get(offenseId);
    if (!cooldownEnd) return false;
    return Date.now() < cooldownEnd;
  }

  /**
   * Put an offense on cooldown.
   *
   * @param {string} offenseId
   * @param {number} minutes - Cooldown length, counted from now.
   */
  setCooldown(offenseId, minutes) {
    this.cooldowns.set(offenseId, Date.now() + (minutes * 60 * 1000));
  }

  /**
   * Reset the daily case counter when the local calendar day has changed.
   */
  resetDailyCountIfNewDay() {
    const today = new Date().toDateString();
    if (this.lastCaseDate !== today) {
      this.casesToday = 0;
      this.lastCaseDate = today;
    }
  }

  /**
   * Uses `detection.maxCasesPerDay`, falling back to 3 only when the setting
   * is null or undefined, so an explicit 0 means no cases at all.
   *
   * @returns {boolean} True once today's case count has reached the limit.
   */
  isDailyLimitReached() {
    this.resetDailyCountIfNewDay();
    return this.casesToday >= (this.config.get('detection.maxCasesPerDay') ?? 3);
  }

  /**
   * Record one more case for today.
   */
  incrementDailyCaseCount() {
    this.resetDailyCountIfNewDay();
    this.casesToday++;
  }
}
|
|
820
|
+
|
|
821
|
+
// Export the detector under its own name and under the `OffenseDetector` alias;
// both names refer to the same class.
module.exports = { SemanticOffenseDetector, OffenseDetector: SemanticOffenseDetector };
|