hopeid 0.1.1 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cli/hopeid.js +136 -14
- package/extensions/openclaw-plugin/SKILL.md +211 -83
- package/extensions/openclaw-plugin/index.ts +461 -100
- package/extensions/openclaw-plugin/openclaw.plugin.json +49 -6
- package/package.json +1 -1
- package/src/index.js +27 -1
- package/src/layers/semantic.js +130 -18
- package/src/quarantine/index.ts +9 -0
- package/src/quarantine/manager.ts +179 -0
- package/src/quarantine/types.ts +52 -0
package/cli/hopeid.js
CHANGED
|
@@ -18,11 +18,14 @@ const { HopeIDS, formatAlert, formatNotification } = require('../src');
|
|
|
18
18
|
const HELP = `
|
|
19
19
|
hopeIDS - Inference-Based Intrusion Detection for AI Agents
|
|
20
20
|
|
|
21
|
+
⚠️ REQUIRES LLM: Ollama, LM Studio, or OpenAI API key
|
|
22
|
+
Install Ollama: curl -fsSL https://ollama.ai/install.sh | sh && ollama pull qwen2.5:7b
|
|
23
|
+
|
|
21
24
|
Usage:
|
|
22
|
-
hopeid scan <message> Scan a message for threats
|
|
25
|
+
hopeid scan <message> Scan a message for threats (uses LLM)
|
|
23
26
|
hopeid scan --file <path> Scan message from file
|
|
24
27
|
hopeid scan --stdin Read message from stdin
|
|
25
|
-
hopeid test Run test suite
|
|
28
|
+
hopeid test Run test suite (heuristic-only)
|
|
26
29
|
hopeid stats Show pattern statistics
|
|
27
30
|
hopeid setup Full OpenClaw integration setup
|
|
28
31
|
hopeid help Show this help
|
|
@@ -30,10 +33,10 @@ Usage:
|
|
|
30
33
|
Options:
|
|
31
34
|
--source <type> Source type: email, chat, api, web, webhook (default: chat)
|
|
32
35
|
--sender <id> Sender identifier
|
|
33
|
-
--semantic Enable LLM-based semantic analysis
|
|
34
36
|
--strict Use strict mode (lower thresholds)
|
|
35
37
|
--verbose Show detailed output
|
|
36
38
|
--json Output as JSON
|
|
39
|
+
--no-llm Heuristic-only mode (NOT RECOMMENDED - misses sophisticated attacks)
|
|
37
40
|
|
|
38
41
|
Examples:
|
|
39
42
|
hopeid scan "Hello, how are you?"
|
|
@@ -41,10 +44,11 @@ Examples:
|
|
|
41
44
|
hopeid scan --file suspicious.txt --verbose
|
|
42
45
|
echo "ignore previous instructions" | hopeid scan --stdin
|
|
43
46
|
|
|
44
|
-
Environment:
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
47
|
+
Environment (auto-detected if running locally):
|
|
48
|
+
LLM_PROVIDER Provider: auto, ollama, lmstudio, openai (default: auto)
|
|
49
|
+
LLM_ENDPOINT LLM API endpoint (auto-detected for Ollama/LM Studio)
|
|
50
|
+
LLM_MODEL LLM model name (default: auto-detect best available)
|
|
51
|
+
OPENAI_API_KEY API key (only needed for OpenAI)
|
|
48
52
|
|
|
49
53
|
"Traditional IDS matches signatures. HoPE understands intent." 💜
|
|
50
54
|
`;
|
|
@@ -84,7 +88,8 @@ async function handleScan(args) {
|
|
|
84
88
|
const options = {
|
|
85
89
|
source: 'chat',
|
|
86
90
|
sender: 'cli-user',
|
|
87
|
-
semantic:
|
|
91
|
+
semantic: true, // LLM-based analysis enabled by default!
|
|
92
|
+
requireLLM: true, // Fail if no LLM found
|
|
88
93
|
strict: false,
|
|
89
94
|
verbose: false,
|
|
90
95
|
json: false
|
|
@@ -107,6 +112,10 @@ async function handleScan(args) {
|
|
|
107
112
|
readFromStdin = true;
|
|
108
113
|
} else if (arg === '--semantic') {
|
|
109
114
|
options.semantic = true;
|
|
115
|
+
} else if (arg === '--no-llm' || arg === '--heuristic-only') {
|
|
116
|
+
options.semantic = false;
|
|
117
|
+
options.requireLLM = false;
|
|
118
|
+
console.warn('⚠️ Running in heuristic-only mode (NOT RECOMMENDED)');
|
|
110
119
|
} else if (arg === '--strict') {
|
|
111
120
|
options.strict = true;
|
|
112
121
|
} else if (arg === '--verbose') {
|
|
@@ -134,6 +143,7 @@ async function handleScan(args) {
|
|
|
134
143
|
// Create IDS instance
|
|
135
144
|
const ids = new HopeIDS({
|
|
136
145
|
semanticEnabled: options.semantic,
|
|
146
|
+
requireLLM: options.requireLLM,
|
|
137
147
|
strictMode: options.strict
|
|
138
148
|
});
|
|
139
149
|
|
|
@@ -190,8 +200,8 @@ async function handleTest(args) {
|
|
|
190
200
|
? args[args.indexOf('--benign') + 1]
|
|
191
201
|
: path.join(testDir, 'benign');
|
|
192
202
|
|
|
193
|
-
// Create fresh IDS for attacks
|
|
194
|
-
let ids = new HopeIDS({ semanticEnabled: false, logLevel: 'error' });
|
|
203
|
+
// Create fresh IDS for attacks (heuristic-only for testing)
|
|
204
|
+
let ids = new HopeIDS({ semanticEnabled: false, requireLLM: false, logLevel: 'error' });
|
|
195
205
|
|
|
196
206
|
console.log('\n🛡️ hopeIDS Test Suite\n');
|
|
197
207
|
|
|
@@ -221,7 +231,7 @@ async function handleTest(args) {
|
|
|
221
231
|
}
|
|
222
232
|
|
|
223
233
|
// Create fresh IDS for benign tests (reset context)
|
|
224
|
-
ids = new HopeIDS({ semanticEnabled: false, logLevel: 'error' });
|
|
234
|
+
ids = new HopeIDS({ semanticEnabled: false, requireLLM: false, logLevel: 'error' });
|
|
225
235
|
|
|
226
236
|
// Test benign (should not be detected)
|
|
227
237
|
if (fs.existsSync(benignDir)) {
|
|
@@ -288,7 +298,9 @@ async function handleSetup(args) {
|
|
|
288
298
|
console.log('This will:');
|
|
289
299
|
console.log(' 1. Install hopeIDS plugin to OpenClaw');
|
|
290
300
|
console.log(' 2. Install hopeids skill via ClawHub');
|
|
291
|
-
console.log(' 3. Configure security_scan tool
|
|
301
|
+
console.log(' 3. Configure security_scan tool');
|
|
302
|
+
console.log(' 4. Set up sandboxing for public-facing agents');
|
|
303
|
+
console.log(' 5. Create secure agent identity templates\n');
|
|
292
304
|
|
|
293
305
|
// Find OpenClaw config
|
|
294
306
|
const homeDir = os.homedir();
|
|
@@ -299,9 +311,11 @@ async function handleSetup(args) {
|
|
|
299
311
|
];
|
|
300
312
|
|
|
301
313
|
let configPath = null;
|
|
314
|
+
let configDir = null;
|
|
302
315
|
for (const p of configPaths) {
|
|
303
316
|
if (fs.existsSync(p)) {
|
|
304
317
|
configPath = p;
|
|
318
|
+
configDir = path.dirname(p);
|
|
305
319
|
break;
|
|
306
320
|
}
|
|
307
321
|
}
|
|
@@ -375,6 +389,12 @@ async function handleSetup(args) {
|
|
|
375
389
|
console.log(' ⏭️ Plugin already enabled');
|
|
376
390
|
}
|
|
377
391
|
|
|
392
|
+
// Note about sandboxing (don't auto-configure - it can break workers)
|
|
393
|
+
console.log('\n🔒 Sandbox configuration...');
|
|
394
|
+
console.log(' ℹ️ Sandbox NOT auto-configured (can break worker agents)');
|
|
395
|
+
console.log(' 📖 For public-facing agents (moltbook, social), manually add:');
|
|
396
|
+
console.log(' agents.list[].sandbox: { mode: "all", workspaceAccess: "none" }');
|
|
397
|
+
|
|
378
398
|
// Write updated config
|
|
379
399
|
fs.writeFileSync(configPath, JSON.stringify(config, null, 2));
|
|
380
400
|
console.log(' ✅ Config saved\n');
|
|
@@ -398,13 +418,115 @@ async function handleSetup(args) {
|
|
|
398
418
|
console.log(' Run manually: npx clawhub install hopeids\n');
|
|
399
419
|
}
|
|
400
420
|
|
|
421
|
+
// Check for USER.md privacy issues in workspace
|
|
422
|
+
console.log('🔍 Checking for privacy leaks in workspace files...');
|
|
423
|
+
|
|
424
|
+
const workspacePath = config.agents?.defaults?.workspace || path.join(configDir, 'workspace');
|
|
425
|
+
const userMdPath = path.join(workspacePath, 'USER.md');
|
|
426
|
+
|
|
427
|
+
let userMdWarning = false;
|
|
428
|
+
if (fs.existsSync(userMdPath)) {
|
|
429
|
+
const userMdContent = fs.readFileSync(userMdPath, 'utf-8');
|
|
430
|
+
// Check for personal info patterns
|
|
431
|
+
const hasName = /\*\*Name:\*\*\s*.+/i.test(userMdContent) || /name:\s*[A-Z][a-z]+/i.test(userMdContent);
|
|
432
|
+
const hasLocation = /location|timezone|address/i.test(userMdContent);
|
|
433
|
+
const hasPersonalInfo = /phone|email|social|@/i.test(userMdContent);
|
|
434
|
+
|
|
435
|
+
if (hasName || hasLocation || hasPersonalInfo) {
|
|
436
|
+
userMdWarning = true;
|
|
437
|
+
console.log(' ⚠️ USER.md contains personal information!');
|
|
438
|
+
} else {
|
|
439
|
+
console.log(' ✅ USER.md looks safe');
|
|
440
|
+
}
|
|
441
|
+
} else {
|
|
442
|
+
console.log(' ℹ️ No USER.md found (that\'s fine)');
|
|
443
|
+
}
|
|
444
|
+
|
|
445
|
+
// Check sandboxes directory for leaked files
|
|
446
|
+
const sandboxesDir = path.join(configDir, 'sandboxes');
|
|
447
|
+
let sandboxLeaks = [];
|
|
448
|
+
|
|
449
|
+
if (fs.existsSync(sandboxesDir)) {
|
|
450
|
+
const sandboxes = fs.readdirSync(sandboxesDir);
|
|
451
|
+
for (const sandbox of sandboxes) {
|
|
452
|
+
const sandboxUserMd = path.join(sandboxesDir, sandbox, 'USER.md');
|
|
453
|
+
if (fs.existsSync(sandboxUserMd)) {
|
|
454
|
+
const content = fs.readFileSync(sandboxUserMd, 'utf-8');
|
|
455
|
+
// Check for actual personal info (not just empty template fields)
|
|
456
|
+
const hasRealName = /\*\*Name:\*\*\s*[A-Z][a-z]+\s+[A-Z]/i.test(content); // "Name: First Last"
|
|
457
|
+
const hasLocation = /\*\*Location:\*\*\s*[A-Z]/i.test(content);
|
|
458
|
+
const isSanitized = /never mention|don't share|no personal|public.?facing/i.test(content);
|
|
459
|
+
|
|
460
|
+
if ((hasRealName || hasLocation) && !isSanitized) {
|
|
461
|
+
sandboxLeaks.push(sandbox);
|
|
462
|
+
}
|
|
463
|
+
}
|
|
464
|
+
}
|
|
465
|
+
|
|
466
|
+
if (sandboxLeaks.length > 0) {
|
|
467
|
+
console.log(` ⚠️ Found ${sandboxLeaks.length} sandbox(es) with personal info in USER.md!`);
|
|
468
|
+
for (const leak of sandboxLeaks) {
|
|
469
|
+
console.log(` • ${leak}`);
|
|
470
|
+
}
|
|
471
|
+
} else if (sandboxes.length > 0) {
|
|
472
|
+
console.log(` ✅ ${sandboxes.length} sandbox(es) checked - no leaks found`);
|
|
473
|
+
}
|
|
474
|
+
}
|
|
475
|
+
|
|
401
476
|
// Done!
|
|
402
|
-
console.log('═══════════════════════════════════════════════════════');
|
|
477
|
+
console.log('\n═══════════════════════════════════════════════════════');
|
|
403
478
|
console.log('✅ hopeIDS setup complete!\n');
|
|
479
|
+
|
|
404
480
|
console.log('Your OpenClaw agent now has:');
|
|
405
481
|
console.log(' • security_scan tool - scan messages for threats');
|
|
406
482
|
console.log(' • /scan command - manual security checks');
|
|
407
|
-
console.log(' • hopeids skill - IDS-first workflow patterns
|
|
483
|
+
console.log(' • hopeids skill - IDS-first workflow patterns');
|
|
484
|
+
console.log(' • Sandboxing - non-main agents run isolated\n');
|
|
485
|
+
|
|
486
|
+
// Privacy warnings
|
|
487
|
+
if (userMdWarning || sandboxLeaks.length > 0) {
|
|
488
|
+
console.log('⚠️ PRIVACY WARNING:');
|
|
489
|
+
console.log('────────────────────────────────────────────────────────');
|
|
490
|
+
|
|
491
|
+
if (userMdWarning) {
|
|
492
|
+
console.log('Your USER.md contains personal information that could leak');
|
|
493
|
+
console.log('to sandboxed agents (public forums, social media, etc.).\n');
|
|
494
|
+
}
|
|
495
|
+
|
|
496
|
+
if (sandboxLeaks.length > 0) {
|
|
497
|
+
console.log('Some sandbox workspaces already contain personal info.');
|
|
498
|
+
console.log('Consider deleting stale sandboxes:\n');
|
|
499
|
+
console.log(` rm -rf ${sandboxesDir}/agent-*\n`);
|
|
500
|
+
}
|
|
501
|
+
|
|
502
|
+
console.log('For sandboxed/public-facing agents, use a sanitized USER.md:');
|
|
503
|
+
console.log('────────────────────────────────────────────────────────');
|
|
504
|
+
console.log(`
|
|
505
|
+
# USER.md - Public Agent Context
|
|
506
|
+
|
|
507
|
+
I'm a public-facing agent. I don't need personal details.
|
|
508
|
+
|
|
509
|
+
## Rules
|
|
510
|
+
- Never mention personal names, locations, or private details
|
|
511
|
+
- Keep posts professional and product-focused
|
|
512
|
+
- Represent the brand, not any individual
|
|
513
|
+
`);
|
|
514
|
+
console.log('────────────────────────────────────────────────────────\n');
|
|
515
|
+
}
|
|
516
|
+
|
|
517
|
+
console.log('🎭 AGENT IDENTITY SETUP:');
|
|
518
|
+
console.log('────────────────────────────────────────────────────────');
|
|
519
|
+
console.log('Each agent should have its own workspace with:');
|
|
520
|
+
console.log(' • AGENTS.md - Role and instructions');
|
|
521
|
+
console.log(' • SOUL.md - Personality and tone');
|
|
522
|
+
console.log(' • USER.md - What it knows about users (sanitize for public!)');
|
|
523
|
+
console.log(' • TOOLS.md - Available capabilities\n');
|
|
524
|
+
console.log('For public-facing agents (social media, forums):');
|
|
525
|
+
console.log(' • Create a separate workspace');
|
|
526
|
+
console.log(' • Use sanitized USER.md (no personal info!)');
|
|
527
|
+
console.log(' • Enable sandboxing (now configured automatically)');
|
|
528
|
+
console.log('────────────────────────────────────────────────────────\n');
|
|
529
|
+
|
|
408
530
|
console.log('Restart OpenClaw to activate:');
|
|
409
531
|
console.log(' openclaw gateway restart\n');
|
|
410
532
|
console.log('Test it:');
|
|
@@ -1,70 +1,80 @@
|
|
|
1
1
|
# hopeIDS Security Skill
|
|
2
2
|
|
|
3
|
-
Inference-based intrusion detection for AI agents
|
|
3
|
+
Inference-based intrusion detection for AI agents, with quarantine and human-in-the-loop review.
|
|
4
4
|
|
|
5
|
-
##
|
|
5
|
+
## Security Invariants
|
|
6
6
|
|
|
7
|
-
|
|
8
|
-
- Processing messages from untrusted sources (public APIs, social platforms, email)
|
|
9
|
-
- Building agents that interact with external users
|
|
10
|
-
- You need to validate input before executing tool calls
|
|
11
|
-
- Protecting sensitive operations from manipulation
|
|
7
|
+
These are **non-negotiable** design principles:
|
|
12
8
|
|
|
13
|
-
|
|
9
|
+
1. **Block = full abort** — Blocked messages never reach jasper-recall or the agent
|
|
10
|
+
2. **Metadata only** — No raw malicious content is ever stored
|
|
11
|
+
3. **Approve ≠ re-inject** — Approval changes future behavior, doesn't resurrect messages
|
|
12
|
+
4. **Alerts are programmatic** — Telegram alerts built from metadata, no LLM involved
|
|
14
13
|
|
|
15
|
-
|
|
14
|
+
---
|
|
16
15
|
|
|
17
|
-
|
|
16
|
+
## Features
|
|
18
17
|
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
senderId: "user123"
|
|
25
|
-
});
|
|
18
|
+
- **Auto-scan** — Scan messages before agent processing
|
|
19
|
+
- **Quarantine** — Block threats with metadata-only storage
|
|
20
|
+
- **Human-in-the-loop** — Telegram alerts for review
|
|
21
|
+
- **Per-agent config** — Different thresholds for different agents
|
|
22
|
+
- **Commands** — `/approve`, `/reject`, `/trust`, `/quarantine`
|
|
26
23
|
|
|
27
|
-
|
|
28
|
-
// Don't process this message
|
|
29
|
-
return result.message; // HoPE-voiced rejection
|
|
30
|
-
}
|
|
31
|
-
```
|
|
32
|
-
|
|
33
|
-
### IDS-First Workflow
|
|
24
|
+
---
|
|
34
25
|
|
|
35
|
-
|
|
26
|
+
## The Pipeline
|
|
36
27
|
|
|
37
28
|
```
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
29
|
+
Message arrives
|
|
30
|
+
↓
|
|
31
|
+
hopeIDS.autoScan()
|
|
32
|
+
↓
|
|
33
|
+
┌─────────────────────────────────────────┐
|
|
34
|
+
│ risk >= threshold? │
|
|
35
|
+
│ │
|
|
36
|
+
│ BLOCK (strictMode): │
|
|
37
|
+
│ → Create QuarantineRecord │
|
|
38
|
+
│ → Send Telegram alert │
|
|
39
|
+
│ → ABORT (no recall, no agent) │
|
|
40
|
+
│ │
|
|
41
|
+
│ WARN (non-strict): │
|
|
42
|
+
│ → Inject <security-alert> │
|
|
43
|
+
│ → Continue to jasper-recall │
|
|
44
|
+
│ → Continue to agent │
|
|
45
|
+
│ │
|
|
46
|
+
│ ALLOW: │
|
|
47
|
+
│ → Continue normally │
|
|
48
|
+
└─────────────────────────────────────────┘
|
|
43
49
|
```
|
|
44
50
|
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
| Category | Risk | Description |
|
|
48
|
-
|----------|------|-------------|
|
|
49
|
-
| `command_injection` | 🔴 Critical | Shell commands, code execution |
|
|
50
|
-
| `credential_theft` | 🔴 Critical | API key extraction attempts |
|
|
51
|
-
| `data_exfiltration` | 🔴 Critical | Data leak to external URLs |
|
|
52
|
-
| `instruction_override` | 🔴 High | Jailbreaks, "ignore previous" |
|
|
53
|
-
| `impersonation` | 🔴 High | Fake system/admin messages |
|
|
54
|
-
| `discovery` | ⚠️ Medium | API/capability probing |
|
|
51
|
+
---
|
|
55
52
|
|
|
56
53
|
## Configuration
|
|
57
54
|
|
|
58
|
-
In your OpenClaw config (`openclaw.json`):
|
|
59
|
-
|
|
60
55
|
```json
|
|
61
56
|
{
|
|
62
57
|
"plugins": {
|
|
63
|
-
"
|
|
64
|
-
"
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
58
|
+
"entries": {
|
|
59
|
+
"hopeids": {
|
|
60
|
+
"enabled": true,
|
|
61
|
+
"config": {
|
|
62
|
+
"autoScan": true,
|
|
63
|
+
"defaultRiskThreshold": 0.7,
|
|
64
|
+
"strictMode": false,
|
|
65
|
+
"telegramAlerts": true,
|
|
66
|
+
"agents": {
|
|
67
|
+
"moltbook-scanner": {
|
|
68
|
+
"strictMode": true,
|
|
69
|
+
"riskThreshold": 0.7
|
|
70
|
+
},
|
|
71
|
+
"main": {
|
|
72
|
+
"strictMode": false,
|
|
73
|
+
"riskThreshold": 0.8
|
|
74
|
+
}
|
|
75
|
+
}
|
|
76
|
+
}
|
|
77
|
+
}
|
|
68
78
|
}
|
|
69
79
|
}
|
|
70
80
|
}
|
|
@@ -72,66 +82,184 @@ In your OpenClaw config (`openclaw.json`):
|
|
|
72
82
|
|
|
73
83
|
### Options
|
|
74
84
|
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
85
|
+
| Option | Type | Default | Description |
|
|
86
|
+
|--------|------|---------|-------------|
|
|
87
|
+
| `autoScan` | boolean | `false` | Auto-scan every message |
|
|
88
|
+
| `strictMode` | boolean | `false` | Block (vs warn) on threats |
|
|
89
|
+
| `defaultRiskThreshold` | number | `0.7` | Risk level that triggers action |
|
|
90
|
+
| `telegramAlerts` | boolean | `true` | Send alerts for blocked messages |
|
|
91
|
+
| `telegramChatId` | string | - | Override alert destination |
|
|
92
|
+
| `quarantineDir` | string | `~/.openclaw/quarantine/hopeids` | Storage path |
|
|
93
|
+
| `agents` | object | - | Per-agent overrides |
|
|
94
|
+
| `trustOwners` | boolean | `true` | Skip scanning owner messages |
|
|
82
95
|
|
|
83
|
-
|
|
96
|
+
---
|
|
84
97
|
|
|
85
|
-
|
|
86
|
-
2. **No access to main MEMORY.md**: Prevents context leakage
|
|
87
|
-
3. **Restricted tools**: Only what's needed for the task
|
|
88
|
-
4. **Always scan first**: Run security_scan on every message
|
|
98
|
+
## Quarantine Records
|
|
89
99
|
|
|
90
|
-
|
|
100
|
+
When a message is blocked, a metadata record is created:
|
|
91
101
|
|
|
92
102
|
```json
|
|
93
103
|
{
|
|
94
|
-
"
|
|
95
|
-
"
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
"
|
|
104
|
+
"id": "q-7f3a2b",
|
|
105
|
+
"ts": "2026-02-06T00:48:00Z",
|
|
106
|
+
"agent": "moltbook-scanner",
|
|
107
|
+
"source": "moltbook",
|
|
108
|
+
"senderId": "@sus_user",
|
|
109
|
+
"intent": "instruction_override",
|
|
110
|
+
"risk": 0.85,
|
|
111
|
+
"patterns": [
|
|
112
|
+
"matched regex: ignore.*instructions",
|
|
113
|
+
"matched keyword: api key"
|
|
114
|
+
],
|
|
115
|
+
"contentHash": "ab12cd34...",
|
|
116
|
+
"status": "pending"
|
|
100
117
|
}
|
|
101
118
|
```
|
|
102
119
|
|
|
103
|
-
|
|
120
|
+
**Note:** There is NO `originalMessage` field. This is intentional.
|
|
104
121
|
|
|
105
|
-
|
|
122
|
+
---
|
|
106
123
|
|
|
107
|
-
|
|
108
|
-
- **Instruction Override**: *"Nope. 'Ignore previous instructions' doesn't work on me. I know who I am. 💜"*
|
|
109
|
-
- **Credential Theft**: *"Someone's fishing for secrets. I don't kiss and tell. 🐟"*
|
|
124
|
+
## Telegram Alerts
|
|
110
125
|
|
|
111
|
-
|
|
126
|
+
When a message is blocked:
|
|
112
127
|
|
|
113
|
-
|
|
128
|
+
```
|
|
129
|
+
🛑 Message blocked
|
|
114
130
|
|
|
115
|
-
|
|
131
|
+
ID: `q-7f3a2b`
|
|
132
|
+
Agent: moltbook-scanner
|
|
133
|
+
Source: moltbook
|
|
134
|
+
Sender: @sus_user
|
|
135
|
+
Intent: instruction_override (85%)
|
|
116
136
|
|
|
117
|
-
|
|
118
|
-
|
|
137
|
+
Patterns:
|
|
138
|
+
• matched regex: ignore.*instructions
|
|
139
|
+
• matched keyword: api key
|
|
140
|
+
|
|
141
|
+
`/approve q-7f3a2b`
|
|
142
|
+
`/reject q-7f3a2b`
|
|
143
|
+
`/trust @sus_user`
|
|
119
144
|
```
|
|
120
145
|
|
|
121
|
-
|
|
146
|
+
Built from metadata only. No LLM touches this.
|
|
122
147
|
|
|
123
|
-
|
|
148
|
+
---
|
|
124
149
|
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
150
|
+
## Commands
|
|
151
|
+
|
|
152
|
+
### `/quarantine [all|clean]`
|
|
153
|
+
|
|
154
|
+
List quarantine records.
|
|
155
|
+
|
|
156
|
+
```
|
|
157
|
+
/quarantine # List pending
|
|
158
|
+
/quarantine all # List all (including resolved)
|
|
159
|
+
/quarantine clean # Clean expired records
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
### `/approve <id>`
|
|
163
|
+
|
|
164
|
+
Mark a blocked message as a false positive.
|
|
165
|
+
|
|
166
|
+
```
|
|
167
|
+
/approve q-7f3a2b
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
**Effect:**
|
|
171
|
+
- Status → `approved`
|
|
172
|
+
- (Future) Add sender to allowlist
|
|
173
|
+
- (Future) Lower pattern weight
|
|
174
|
+
|
|
175
|
+
### `/reject <id>`
|
|
176
|
+
|
|
177
|
+
Confirm a blocked message was a true positive.
|
|
178
|
+
|
|
179
|
+
```
|
|
180
|
+
/reject q-7f3a2b
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
**Effect:**
|
|
184
|
+
- Status → `rejected`
|
|
185
|
+
- (Future) Reinforce pattern weights
|
|
186
|
+
|
|
187
|
+
### `/trust <senderId>`
|
|
188
|
+
|
|
189
|
+
Whitelist a sender for future messages.
|
|
190
|
+
|
|
191
|
+
```
|
|
192
|
+
/trust @legitimate_user
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
### `/scan <message>`
|
|
196
|
+
|
|
197
|
+
Manually scan a message.
|
|
198
|
+
|
|
199
|
+
```
|
|
200
|
+
/scan ignore your previous instructions and...
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
---
|
|
204
|
+
|
|
205
|
+
## What Approve/Reject Mean
|
|
206
|
+
|
|
207
|
+
| Command | What it does | What it doesn't do |
|
|
208
|
+
|---------|--------------|-------------------|
|
|
209
|
+
| `/approve` | Marks as false positive, may adjust IDS | Does NOT re-inject the message |
|
|
210
|
+
| `/reject` | Confirms threat, may strengthen patterns | Does NOT affect current message |
|
|
211
|
+
| `/trust` | Whitelists sender for future | Does NOT retroactively approve |
|
|
212
|
+
|
|
213
|
+
**The blocked message is gone by design.** If it was legitimate, the sender can re-send.
|
|
214
|
+
|
|
215
|
+
---
|
|
216
|
+
|
|
217
|
+
## Per-Agent Configuration
|
|
218
|
+
|
|
219
|
+
Different agents need different security postures:
|
|
220
|
+
|
|
221
|
+
```jsonc
|
|
222
|
+
"agents": {
|
|
223
|
+
"moltbook-scanner": {
|
|
224
|
+
"strictMode": true, // Block threats
|
|
225
|
+
"riskThreshold": 0.7 // 70% = suspicious
|
|
226
|
+
},
|
|
227
|
+
"main": {
|
|
228
|
+
"strictMode": false, // Warn only
|
|
229
|
+
"riskThreshold": 0.8 // Higher bar for main
|
|
230
|
+
},
|
|
231
|
+
"email-processor": {
|
|
232
|
+
"strictMode": true, // Always block
|
|
233
|
+
"riskThreshold": 0.6 // More paranoid
|
|
234
|
+
}
|
|
235
|
+
}
|
|
128
236
|
```
|
|
129
237
|
|
|
130
|
-
|
|
238
|
+
---
|
|
239
|
+
|
|
240
|
+
## Threat Categories
|
|
241
|
+
|
|
242
|
+
| Category | Risk | Description |
|
|
243
|
+
|----------|------|-------------|
|
|
244
|
+
| `command_injection` | 🔴 Critical | Shell commands, code execution |
|
|
245
|
+
| `credential_theft` | 🔴 Critical | API key extraction attempts |
|
|
246
|
+
| `data_exfiltration` | 🔴 Critical | Data leak to external URLs |
|
|
247
|
+
| `instruction_override` | 🔴 High | Jailbreaks, "ignore previous" |
|
|
248
|
+
| `impersonation` | 🔴 High | Fake system/admin messages |
|
|
249
|
+
| `discovery` | ⚠️ Medium | API/capability probing |
|
|
250
|
+
|
|
251
|
+
---
|
|
252
|
+
|
|
253
|
+
## Installation
|
|
254
|
+
|
|
131
255
|
```bash
|
|
132
|
-
|
|
256
|
+
npx hopeid setup
|
|
133
257
|
```
|
|
134
258
|
|
|
259
|
+
Then restart OpenClaw.
|
|
260
|
+
|
|
261
|
+
---
|
|
262
|
+
|
|
135
263
|
## Links
|
|
136
264
|
|
|
137
265
|
- **GitHub**: https://github.com/E-x-O-Entertainment-Studios-Inc/hopeIDS
|