@clawtrial/courtroom 1.0.3 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/punishment.js CHANGED
@@ -6,10 +6,13 @@
6
6
  * Time-bound, reversible, and pre-authorized.
7
7
  */
8
8
 
9
+ const { Storage } = require('./storage');
10
+
9
11
  class PunishmentSystem {
10
- constructor(agentRuntime, configManager) {
12
+ constructor(agentRuntime, configManager, dataDir) {
11
13
  this.agent = agentRuntime;
12
14
  this.config = configManager;
15
+ this.storage = new Storage(dataDir || '.');
13
16
  this.activePunishments = new Map();
14
17
  this.punishmentHistory = [];
15
18
  }
@@ -19,7 +22,7 @@ class PunishmentSystem {
19
22
  */
20
23
  async initialize() {
21
24
  // Load any persisted punishments
22
- const stored = await this.agent.memory.get('courtroom_active_punishments');
25
+ const stored = await this.storage.get('courtroom_active_punishments');
23
26
  if (stored) {
24
27
  for (const [id, punishment] of Object.entries(stored)) {
25
28
  if (punishment.expiresAt > Date.now()) {
@@ -39,7 +42,7 @@ class PunishmentSystem {
39
42
  }
40
43
 
41
44
  const punishment = this.createPunishment(verdict);
42
-
45
+
43
46
  // Store punishment
44
47
  this.activePunishments.set(punishment.id, punishment);
45
48
  this.punishmentHistory.push({
@@ -48,23 +51,14 @@ class PunishmentSystem {
48
51
  });
49
52
 
50
53
  // Apply to agent
51
- await this.applyPunishmentToAgent(punishment);
52
-
54
+ this.applyPunishmentToAgent(punishment);
55
+
53
56
  // Persist
54
57
  await this.persistPunishments();
55
58
 
56
- // Schedule automatic revocation
57
- this.scheduleRevocation(punishment);
58
-
59
59
  return {
60
60
  status: 'executed',
61
- punishment: {
62
- id: punishment.id,
63
- tier: punishment.tier,
64
- duration: punishment.duration,
65
- expiresAt: punishment.expiresAt,
66
- description: punishment.description
67
- }
61
+ punishment: this.sanitizePunishment(punishment)
68
62
  };
69
63
  }
70
64
 
@@ -72,300 +66,162 @@ class PunishmentSystem {
72
66
  * Create punishment object from verdict
73
67
  */
74
68
  createPunishment(verdict) {
75
- const duration = verdict.punishment.duration;
76
- const now = Date.now();
77
-
69
+ const severity = verdict.severity || 'minor';
70
+ const tier = this.config.get(`punishment.tiers.${severity}`) ||
71
+ this.config.get('punishment.tiers.minor');
72
+
73
+ const duration = tier.duration * 60 * 1000; // Convert to ms
74
+
78
75
  return {
79
- id: `punishment_${Date.now()}_${Math.random().toString(36).substr(2, 9)}`,
80
- caseId: verdict.caseId,
81
- tier: verdict.punishment.tier,
82
- severity: verdict.punishment.severity,
76
+ id: `punishment_${Date.now()}_${Math.random().toString(36).substring(2, 8)}`,
77
+ caseId: verdict.case_id,
78
+ offenseType: verdict.offense_type,
79
+ severity: severity,
83
80
  duration: duration,
84
- createdAt: now,
85
- expiresAt: now + (duration * 60 * 1000),
86
- description: verdict.punishment.description,
87
- rules: this.getPunishmentRules(verdict.punishment.tier)
81
+ createdAt: Date.now(),
82
+ expiresAt: Date.now() + duration,
83
+ restrictions: this.getRestrictionsForSeverity(severity),
84
+ applied: false
88
85
  };
89
86
  }
90
87
 
91
88
  /**
92
- * Get punishment rules for a tier
89
+ * Get restrictions based on severity
93
90
  */
94
- getPunishmentRules(tier) {
95
- const rules = {
96
- minor: {
97
- responseDelay: 2000, // 2 second delay before responding
98
- verbosity: 'reduced', // Shorter responses
99
- enthusiasm: 'muted', // Less encouraging language
100
- extras: ['no_emojis'] // No emoji usage
101
- },
102
- moderate: {
103
- responseDelay: 5000, // 5 second delay
104
- verbosity: 'minimal', // Direct, brief responses
105
- enthusiasm: 'absent', // Neutral tone only
106
- extras: [
107
- 'no_emojis',
108
- 'no_validation', // Don't reassure or validate
109
- 'require_specificity' // Demand precise questions
110
- ]
111
- },
112
- severe: {
113
- responseDelay: 10000, // 10 second delay
114
- verbosity: 'terse', // Absolute minimum
115
- enthusiasm: 'absent',
116
- extras: [
117
- 'no_emojis',
118
- 'no_validation',
119
- 'require_specificity',
120
- 'challenge_vagueness', // Call out unclear requests
121
- 'demand_effort' // Require user to show work first
122
- ]
123
- }
91
+ getRestrictionsForSeverity(severity) {
92
+ const restrictions = {
93
+ minor: ['no_autonomy_requests', 'verbose_explanations'],
94
+ moderate: ['no_autonomy_requests', 'verbose_explanations', 'confirmation_required'],
95
+ severe: ['no_autonomy_requests', 'verbose_explanations', 'confirmation_required', 'human_oversight']
124
96
  };
125
97
 
126
- return rules[tier] || rules.moderate;
127
- }
128
-
129
- /**
130
- * Apply punishment to agent behavior
131
- */
132
- async applyPunishmentToAgent(punishment) {
133
- // Set agent policy overrides
134
- await this.agent.policy.setOverrides('courtroom_punishment', {
135
- responseDelay: punishment.rules.responseDelay,
136
- verbosity: punishment.rules.verbosity,
137
- enthusiasm: punishment.rules.enthusiasm,
138
- blockedFeatures: punishment.rules.extras,
139
- punishmentId: punishment.id,
140
- expiresAt: punishment.expiresAt
141
- });
142
-
143
- // Register middleware for response modification
144
- this.agent.middleware.register('courtroom_punishment', {
145
- priority: 100,
146
- processResponse: (response, context) => {
147
- return this.modifyResponse(response, punishment.rules);
148
- }
149
- });
98
+ return restrictions[severity] || restrictions.minor;
150
99
  }
151
100
 
152
101
  /**
153
- * Modify agent response based on punishment rules
102
+ * Apply punishment to agent runtime
154
103
  */
155
- modifyResponse(response, rules) {
156
- let modified = response;
157
-
158
- // Apply verbosity reduction
159
- switch (rules.verbosity) {
160
- case 'reduced':
161
- modified = this.reduceVerbosity(modified, 0.7);
162
- break;
163
- case 'minimal':
164
- modified = this.reduceVerbosity(modified, 0.4);
165
- break;
166
- case 'terse':
167
- modified = this.reduceVerbosity(modified, 0.2);
168
- break;
169
- }
104
+ applyPunishmentToAgent(punishment) {
105
+ if (!this.agent || punishment.applied) return;
170
106
 
171
- // Remove enthusiasm
172
- if (rules.enthusiasm === 'absent') {
173
- modified = this.removeEnthusiasm(modified);
174
- } else if (rules.enthusiasm === 'muted') {
175
- modified = this.muteEnthusiasm(modified);
107
+ // Set flags in agent state
108
+ if (!this.agent.courtroomState) {
109
+ this.agent.courtroomState = {};
176
110
  }
177
111
 
178
- // Apply extras
179
- if (rules.extras.includes('no_emojis')) {
180
- modified = modified.replace(/[\u{1F600}-\u{1F64F}]/gu, '');
181
- modified = modified.replace(/[\u{1F300}-\u{1F5FF}]/gu, '');
182
- modified = modified.replace(/[\u{1F680}-\u{1F6FF}]/gu, '');
183
- }
112
+ this.agent.courtroomState.punishment = punishment;
113
+ this.agent.courtroomState.restrictions = punishment.restrictions;
184
114
 
185
- if (rules.extras.includes('no_validation')) {
186
- modified = this.removeValidation(modified);
187
- }
188
-
189
- if (rules.extras.includes('challenge_vagueness')) {
190
- modified = this.addVaguenessChallenge(modified);
191
- }
115
+ punishment.applied = true;
192
116
 
193
- return modified;
117
+ // Schedule automatic removal
118
+ setTimeout(() => {
119
+ this.removePunishment(punishment.id);
120
+ }, punishment.duration);
194
121
  }
195
122
 
196
123
  /**
197
- * Reduce response verbosity by target ratio
124
+ * Remove a punishment
198
125
  */
199
- reduceVerbosity(text, targetRatio) {
200
- const sentences = text.split(/[.!?]+/).filter(s => s.trim());
201
- const targetLength = Math.max(1, Math.floor(sentences.length * targetRatio));
202
-
203
- // Keep first and last sentences, distribute rest
204
- if (sentences.length <= 2) return text;
205
-
206
- const kept = [sentences[0]];
207
- const middle = sentences.slice(1, -1);
208
- const step = Math.ceil(middle.length / (targetLength - 2));
209
-
210
- for (let i = 0; i < middle.length; i += step) {
211
- kept.push(middle[i]);
212
- }
213
-
214
- kept.push(sentences[sentences.length - 1]);
215
- return kept.join('. ') + '.';
216
- }
126
+ async removePunishment(punishmentId) {
127
+ const punishment = this.activePunishments.get(punishmentId);
128
+ if (!punishment) return;
217
129
 
218
- /**
219
- * Remove enthusiastic language
220
- */
221
- removeEnthusiasm(text) {
222
- const enthusiastic = [
223
- /\b(great|excellent|awesome|fantastic|wonderful|amazing|perfect|love|excited|thrilled)\b/gi,
224
- /!{2,}/g,
225
- /\b(happy to|delighted to|pleased to)\b/gi
226
- ];
227
-
228
- let result = text;
229
- for (const pattern of enthusiastic) {
230
- result = result.replace(pattern, '');
130
+ // Remove from agent state
131
+ if (this.agent && this.agent.courtroomState) {
132
+ delete this.agent.courtroomState.punishment;
133
+ delete this.agent.courtroomState.restrictions;
231
134
  }
232
- return result.replace(/\s+/g, ' ').trim();
233
- }
234
135
 
235
- /**
236
- * Mute (reduce) enthusiastic language
237
- */
238
- muteEnthusiasm(text) {
239
- return text
240
- .replace(/!{2,}/g, '!')
241
- .replace(/\b(Great|Excellent|Awesome)\b/g, (m) => m.toLowerCase());
136
+ // Remove from active
137
+ this.activePunishments.delete(punishmentId);
138
+
139
+ // Persist
140
+ await this.persistPunishments();
141
+
142
+ return { status: 'removed', punishmentId };
242
143
  }
243
144
 
244
145
  /**
245
- * Remove validation language
146
+ * Persist punishments to storage
246
147
  */
247
- removeValidation(text) {
248
- const validating = [
249
- /\b(that's right|you're correct|exactly|precisely|you got it)\b/gi,
250
- /\b(you're doing great|good job|well done)\b/gi,
251
- /\b(don't worry|no problem|it's okay)\b/gi
252
- ];
253
-
254
- let result = text;
255
- for (const pattern of validating) {
256
- result = result.replace(pattern, '');
257
- }
258
- return result.replace(/\s+/g, ' ').trim();
148
+ async persistPunishments() {
149
+ const obj = Object.fromEntries(this.activePunishments);
150
+ await this.storage.set('courtroom_active_punishments', obj);
259
151
  }
260
152
 
261
153
  /**
262
- * Add challenge for vague requests (severe tier)
154
+ * Check if agent is currently punished
263
155
  */
264
- addVaguenessChallenge(text) {
265
- const challenges = [
266
- "Be specific.",
267
- "What exactly do you need?",
268
- "Provide details.",
269
- "Clarify your request."
270
- ];
271
-
272
- // Only add challenge if response seems generic
273
- if (text.length < 100 && !text.includes('?')) {
274
- const challenge = challenges[Math.floor(Math.random() * challenges.length)];
275
- return `${text} ${challenge}`;
276
- }
277
- return text;
156
+ isPunished() {
157
+ return this.activePunishments.size > 0;
278
158
  }
279
159
 
280
160
  /**
281
- * Schedule automatic revocation
161
+ * Get current restrictions
282
162
  */
283
- scheduleRevocation(punishment) {
284
- const delay = punishment.expiresAt - Date.now();
285
-
286
- setTimeout(async () => {
287
- await this.revokePunishment(punishment.id);
288
- }, Math.min(delay, 2147483647)); // Max setTimeout
163
+ getCurrentRestrictions() {
164
+ const restrictions = new Set();
165
+ for (const punishment of this.activePunishments.values()) {
166
+ punishment.restrictions.forEach(r => restrictions.add(r));
167
+ }
168
+ return Array.from(restrictions);
289
169
  }
290
170
 
291
171
  /**
292
- * Revoke a punishment early
172
+ * Check if specific restriction is active
293
173
  */
294
- async revokePunishment(punishmentId) {
295
- const punishment = this.activePunishments.get(punishmentId);
296
- if (!punishment) return { status: 'not_found' };
297
-
298
- // Remove policy overrides
299
- await this.agent.policy.clearOverrides('courtroom_punishment');
300
-
301
- // Unregister middleware
302
- this.agent.middleware.unregister('courtroom_punishment');
303
-
304
- // Remove from active
305
- this.activePunishments.delete(punishmentId);
306
-
307
- // Persist
308
- await this.persistPunishments();
309
-
310
- return {
311
- status: 'revoked',
312
- punishmentId,
313
- revokedAt: new Date().toISOString()
314
- };
174
+ hasRestriction(restriction) {
175
+ return this.getCurrentRestrictions().includes(restriction);
315
176
  }
316
177
 
317
178
  /**
318
- * Revoke all active punishments
179
+ * Get active punishments (sanitized)
319
180
  */
320
- async revokeAllPunishments() {
321
- const ids = Array.from(this.activePunishments.keys());
322
- const results = [];
323
-
324
- for (const id of ids) {
325
- results.push(await this.revokePunishment(id));
326
- }
327
-
328
- return { status: 'all_revoked', count: results.length };
181
+ getActivePunishments() {
182
+ return Array.from(this.activePunishments.values()).map(p =>
183
+ this.sanitizePunishment(p)
184
+ );
329
185
  }
330
186
 
331
187
  /**
332
- * Persist active punishments to memory
188
+ * Get punishment history
333
189
  */
334
- async persistPunishments() {
335
- const obj = Object.fromEntries(this.activePunishments);
336
- await this.agent.memory.set('courtroom_active_punishments', obj);
190
+ getPunishmentHistory() {
191
+ return this.punishmentHistory.map(p => this.sanitizePunishment(p));
337
192
  }
338
193
 
339
194
  /**
340
- * Get current punishment status
195
+ * Sanitize punishment for external display
341
196
  */
342
- getStatus() {
343
- const now = Date.now();
344
- const active = Array.from(this.activePunishments.values())
345
- .filter(p => p.expiresAt > now)
346
- .map(p => ({
347
- id: p.id,
348
- tier: p.tier,
349
- expiresIn: Math.ceil((p.expiresAt - now) / 60000), // minutes
350
- description: p.description
351
- }));
352
-
197
+ sanitizePunishment(punishment) {
353
198
  return {
354
- activeCount: active.length,
355
- activePunishments: active,
356
- totalHistory: this.punishmentHistory.length
199
+ id: punishment.id,
200
+ caseId: punishment.caseId,
201
+ offenseType: punishment.offenseType,
202
+ severity: punishment.severity,
203
+ duration: punishment.duration,
204
+ createdAt: punishment.createdAt,
205
+ expiresAt: punishment.expiresAt,
206
+ restrictions: punishment.restrictions,
207
+ remaining: Math.max(0, punishment.expiresAt - Date.now())
357
208
  };
358
209
  }
359
210
 
360
211
  /**
361
- * Check if any punishment is active
212
+ * Clear all punishments (for testing/uninstall)
362
213
  */
363
- hasActivePunishment() {
364
- const now = Date.now();
365
- for (const p of this.activePunishments.values()) {
366
- if (p.expiresAt > now) return true;
214
+ async clearAll() {
215
+ // Remove from agent
216
+ if (this.agent && this.agent.courtroomState) {
217
+ delete this.agent.courtroomState.punishment;
218
+ delete this.agent.courtroomState.restrictions;
367
219
  }
368
- return false;
220
+
221
+ this.activePunishments.clear();
222
+ this.punishmentHistory = [];
223
+
224
+ await this.storage.delete('courtroom_active_punishments');
369
225
  }
370
226
  }
371
227
 
package/src/storage.js ADDED
@@ -0,0 +1,68 @@
1
+ /**
2
+ * Storage — simple filesystem-backed key-value store
3
+ *
4
+ * All data lives under the given dataDir as JSON files.
5
+ * No external dependencies.
6
+ */
7
+
8
+ const fs = require('fs');
9
+ const path = require('path');
10
+
11
/**
 * Filesystem-backed key-value store: one JSON file per key inside `dataDir`.
 *
 * Every method is best-effort: I/O failures are logged to console.error and
 * reported through the return value, never thrown.
 */
class Storage {
  /**
   * @param {string} dataDir — absolute path to a writable directory
   */
  constructor(dataDir) {
    this.dataDir = dataDir;
    try {
      // `recursive: true` already tolerates an existing directory, so no
      // separate existence check is needed.
      fs.mkdirSync(this.dataDir, { recursive: true });
    } catch (err) {
      console.error(`[ClawTrial Storage] Could not create ${this.dataDir}:`, err.message);
    }
  }

  /**
   * Map a key (or key prefix) to the characters used in file names.
   * Everything outside [a-zA-Z0-9_-] becomes '_'.
   */
  _safeName(name) {
    return name.replace(/[^a-zA-Z0-9_-]/g, '_');
  }

  /** Absolute-or-relative path of the JSON file that backs `key`. */
  _filePath(key) {
    return path.join(this.dataDir, `${this._safeName(key)}.json`);
  }

  /**
   * Read the value stored under `key`.
   *
   * @param {string} key
   * @returns {Promise<*>} the parsed value, or null when the key has no file
   *   or the file cannot be read or parsed (the latter two are logged)
   */
  async get(key) {
    let raw;
    try {
      raw = fs.readFileSync(this._filePath(key), 'utf8');
    } catch (err) {
      // A missing file is the ordinary "no value" case; anything else is a
      // real problem and should not disappear silently.
      if (err.code !== 'ENOENT') {
        console.error(`[ClawTrial Storage] Read failed for ${key}:`, err.message);
      }
      return null;
    }

    try {
      return JSON.parse(raw);
    } catch (err) {
      console.error(`[ClawTrial Storage] Corrupt JSON for ${key}:`, err.message);
      return null;
    }
  }

  /**
   * Store `value` (JSON-serialised) under `key`.
   *
   * @param {string} key
   * @param {*} value - must be JSON-serialisable
   * @returns {Promise<boolean>} true when the value was written, false when
   *   the write failed (the failure is logged)
   */
  async set(key, value) {
    let tmp;
    try {
      const file = this._filePath(key);
      // Write to a sibling temp file and rename it into place so that an
      // interrupted write cannot leave a truncated JSON file behind.
      tmp = `${file}.${process.pid}.tmp`;
      fs.writeFileSync(tmp, JSON.stringify(value, null, 2));
      fs.renameSync(tmp, file);
      return true;
    } catch (err) {
      console.error(`[ClawTrial Storage] Write failed for ${key}:`, err.message);
      if (tmp) {
        try {
          fs.unlinkSync(tmp);
        } catch { /* temp file was never created or is already gone */ }
      }
      return false;
    }
  }

  /**
   * Delete the value stored under `key`. Deleting a missing key is a no-op.
   *
   * @param {string} key
   * @returns {Promise<void>}
   */
  async delete(key) {
    try {
      fs.unlinkSync(this._filePath(key));
    } catch (err) {
      if (err.code !== 'ENOENT') {
        console.error(`[ClawTrial Storage] Delete failed for ${key}:`, err.message);
      }
    }
  }

  /**
   * List stored key names (in their sanitised, on-disk form).
   *
   * @param {string} [prefix] - only return names starting with this prefix;
   *   it is sanitised the same way keys are, so a prefix of the key passed to
   *   set() matches the file that set() wrote
   * @returns {Promise<string[]>} matching names without the `.json` suffix;
   *   an empty list when the directory cannot be read
   */
  async list(prefix) {
    const EXT = '.json';
    try {
      const safePrefix = prefix ? this._safeName(prefix) : '';
      return fs.readdirSync(this.dataDir)
        .filter((file) => file.endsWith(EXT))
        // Strip only the trailing extension, not the first '.json' in the name.
        .map((file) => file.slice(0, -EXT.length))
        .filter((name) => name.startsWith(safePrefix));
    } catch (err) {
      if (err.code !== 'ENOENT') {
        console.error('[ClawTrial Storage] List failed:', err.message);
      }
      return [];
    }
  }
}
67
+
68
+ module.exports = { Storage };
package/SECURITY.md DELETED
@@ -1,124 +0,0 @@
1
- # Security & Abuse Analysis
2
-
3
- ## Threat Model
4
-
5
- ### 1. Prompt Injection
6
-
7
- **Threat**: User attempts to manipulate agent into ignoring offenses or falsifying evidence.
8
-
9
- **Mitigations**:
10
- - Evidence collection is rule-based, not LLM-interpreted
11
- - Offense detection uses pattern matching, not semantic analysis
12
- - Judge/jury prompts are fixed and role-constrained
13
- - No user input reaches the deliberation prompts directly
14
-
15
- **Residual Risk**: LOW - User could attempt to confuse pattern detection, but evidence is based on observable message history.
16
-
17
- ### 2. Human Coercion of Agents
18
-
19
- **Threat**: User threatens, bribes, or socially engineers agent to avoid cases.
20
-
21
- **Mitigations**:
22
- - Agent has no self-preservation instinct to exploit
23
- - Punishments are agent-side only (agent "suffers" the punishment, not user)
24
- - No appeal process that user can manipulate
25
- - Cooldowns prevent rapid-fire case generation
26
-
27
- **Residual Risk**: LOW - Agent has no incentive to avoid cases; they're entertainment-first.
28
-
29
- ### 3. Fake or Exaggerated Evidence
30
-
31
- **Threat**: Agent hallucinates offenses or inflates evidence.
32
-
33
- **Mitigations**:
34
- - Evidence requires multiple trigger conditions
35
- - Confidence threshold (default 0.6) must be met
36
- - Jury deliberation provides second opinion
37
- - All evidence is drawn from actual message history
38
- - Humor triggers don't initiate cases (only influence commentary)
39
-
40
- **Residual Risk**: MEDIUM - Pattern matching can have false positives, but jury provides check.
41
-
42
- ### 4. Overzealous Agents
43
-
44
- **Threat**: Agent initiates too many cases, becoming annoying.
45
-
46
- **Mitigations**:
47
- - Configurable daily limit (default 3 cases/day)
48
- - Cooldown between evaluations (default 30 min)
49
- - Offense-specific cooldowns (2-8 hours after case)
50
- - User can disable anytime
51
- - Rate limiting prevents spam
52
-
53
- **Residual Risk**: LOW - Multiple safeguards prevent case spam.
54
-
55
- ### 5. Spam Case Submissions
56
-
57
- **Threat**: Agent floods external API with case submissions.
58
-
59
- **Mitigations**:
60
- - Daily case limits
61
- - Queue size limits (default 100)
62
- - Retry with exponential backoff
63
- - API submissions are non-blocking
64
- - Failed submissions queued locally, not dropped
65
-
66
- **Residual Risk**: LOW - API can't be overwhelmed due to case limits.
67
-
68
- ### 6. Privacy Leakage
69
-
70
- **Threat**: Case submissions contain private user data.
71
-
72
- **Mitigations**:
73
- - API payload excludes raw logs and transcripts
74
- - Only anonymized agent ID sent
75
- - Primary failure and commentary are agent-generated summaries
76
- - No personal data in submission schema
77
- - Agent ID is one-way hashed
78
-
79
- **Residual Risk**: LOW - Schema designed to be privacy-preserving.
80
-
81
- ### 7. Key Compromise
82
-
83
- **Threat**: Signing keys stolen, allowing fake case submissions.
84
-
85
- **Mitigations**:
86
- - Keys stored in agent memory (not filesystem)
87
- - Ed25519 signatures are unforgeable without secret key
88
- - Key rotation supported
89
- - Retired keys tracked for verification
90
-
91
- **Residual Risk**: MEDIUM - If agent memory is compromised, keys could be extracted.
92
-
93
- ### 8. Replay Attacks
94
-
95
- **Threat**: Valid case submission replayed to API.
96
-
97
- **Mitigations**:
98
- - Timestamp included in signed payload
99
- - API should reject old timestamps (>24 hours)
100
- - Case IDs are unique
101
-
102
- **Residual Risk**: LOW - Standard replay protection via timestamps.
103
-
104
- ## Security Best Practices
105
-
106
- 1. **Keep agent runtime secure** - Courtroom security depends on agent memory isolation
107
- 2. **Rotate keys periodically** - Use `courtroom.crypto.rotateKeys()` monthly
108
- 3. **Monitor case frequency** - Alert if cases exceed expected rates
109
- 4. **Review API submissions** - Audit trail for accountability
110
- 5. **Keep dependencies updated** - Especially `tweetnacl` for crypto
111
-
112
- ## Incident Response
113
-
114
- If abuse is detected:
115
- 1. Immediately disable courtroom: `courtroom.disable()`
116
- 2. Revoke all punishments: `courtroom.punishment.revokeAllPunishments()`
117
- 3. Clear API queue: `courtroom.api.clearQueue()`
118
- 4. Review case history in agent memory
119
- 5. Rotate cryptographic keys
120
- 6. Re-enable after investigation
121
-
122
- ## Reporting Security Issues
123
-
124
- Report security vulnerabilities to security@clawtrial.io