shellward 0.5.16 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. package/README.md +95 -30
  2. package/dist/auto-check.d.ts +1 -0
  3. package/dist/auto-check.js +12 -1
  4. package/dist/commands/index.d.ts +2 -1
  5. package/dist/commands/index.js +7 -0
  6. package/dist/commands/scan-mcp.d.ts +2 -0
  7. package/dist/commands/scan-mcp.js +105 -0
  8. package/dist/core/engine.d.ts +35 -0
  9. package/dist/core/engine.js +225 -30
  10. package/dist/index.d.ts +4 -2
  11. package/dist/index.js +18 -3
  12. package/dist/mcp-baseline.d.ts +27 -0
  13. package/dist/mcp-baseline.js +73 -0
  14. package/dist/mcp-client.d.ts +29 -0
  15. package/dist/mcp-client.js +264 -0
  16. package/dist/mcp-server.js +64 -9
  17. package/dist/rules/dangerous-commands.js +6 -2
  18. package/dist/rules/injection-en.js +27 -2
  19. package/dist/rules/injection-zh.js +27 -4
  20. package/dist/rules/sensitive-patterns.d.ts +13 -1
  21. package/dist/rules/sensitive-patterns.js +32 -5
  22. package/dist/rules/tool-poisoning.d.ts +8 -0
  23. package/dist/rules/tool-poisoning.js +96 -0
  24. package/dist/types.d.ts +32 -0
  25. package/dist/types.js +3 -1
  26. package/package.json +4 -2
  27. package/server.json +2 -2
  28. package/src/auto-check.ts +11 -1
  29. package/src/commands/index.ts +9 -1
  30. package/src/commands/scan-mcp.ts +118 -0
  31. package/src/core/engine.ts +250 -31
  32. package/src/index.ts +25 -5
  33. package/src/mcp-baseline.ts +97 -0
  34. package/src/mcp-client.ts +268 -0
  35. package/src/mcp-server.ts +71 -9
  36. package/src/rules/dangerous-commands.ts +6 -2
  37. package/src/rules/injection-en.ts +27 -2
  38. package/src/rules/injection-zh.ts +27 -4
  39. package/src/rules/sensitive-patterns.ts +37 -5
  40. package/src/rules/tool-poisoning.ts +108 -0
  41. package/src/types.ts +38 -1
@@ -0,0 +1,264 @@
1
+ // src/mcp-client.ts — Minimal MCP client for security scanning
2
+ //
3
+ // Connects to a configured MCP server (stdio OR remote Streamable HTTP), performs
4
+ // the initialize handshake and a single tools/list call, then disconnects. Used by
5
+ // /scan-mcp to fetch tool *definitions* so they can be scanned for poisoning and
6
+ // rug-pulls. Zero dependencies (child_process + node:http/https + NDJSON framing).
7
+ import { spawn } from 'child_process';
8
+ import { existsSync, readFileSync } from 'fs';
9
+ import { join } from 'path';
10
+ import { request as httpRequest } from 'http';
11
+ import { request as httpsRequest } from 'https';
12
+ import { getHomeDir } from './utils.js';
13
+ const CONFIG_PATHS = [
14
+ join(getHomeDir(), '.openclaw', 'mcp.json'),
15
+ join(getHomeDir(), '.openclaw', 'config', 'mcp.json'),
16
+ join(getHomeDir(), '.openclaw', 'settings.json'),
17
+ ];
18
/**
 * Discover MCP servers declared in known config files.
 *
 * Each path is read as JSON; the server map is taken from `mcpServers` or,
 * failing that, `mcp.servers`. An entry with a string `command` becomes a
 * `stdio` server; otherwise an entry with a string `url` or string `type`
 * becomes a `remote` server (its `url` is passed through as-is and may be
 * undefined when only `type` was given).
 *
 * Files that are missing, unreadable, or not valid JSON are skipped. When the
 * same server name appears in several files, the first *usable* declaration
 * wins; a malformed entry does not shadow a valid one in a later file.
 *
 * @param paths override config paths (tests pass a temp file)
 * @returns the discovered server specs, in discovery order
 */
export function discoverMcpServers(paths = CONFIG_PATHS) {
    const servers = [];
    const seen = new Set();
    for (const p of paths) {
        if (!existsSync(p))
            continue;
        let parsed;
        try {
            // Editors on some platforms save JSON with a UTF-8 BOM, which JSON.parse
            // rejects; strip it so such a config is not silently ignored.
            parsed = JSON.parse(readFileSync(p, 'utf8').replace(/^\uFEFF/, ''));
        }
        catch {
            continue;
        }
        const block = parsed?.mcpServers || parsed?.mcp?.servers;
        if (!block || typeof block !== 'object')
            continue;
        for (const [name, raw] of Object.entries(block)) {
            if (seen.has(name))
                continue;
            if (raw && typeof raw.command === 'string') {
                seen.add(name);
                servers.push({
                    name,
                    transport: 'stdio',
                    command: raw.command,
                    args: Array.isArray(raw.args) ? raw.args.map(String) : [],
                    env: raw.env && typeof raw.env === 'object' ? raw.env : undefined,
                    source: p,
                });
            }
            else if (raw && (typeof raw.url === 'string' || typeof raw.type === 'string')) {
                seen.add(name);
                servers.push({
                    name,
                    transport: 'remote',
                    url: raw.url,
                    headers: raw.headers && typeof raw.headers === 'object' ? raw.headers : undefined,
                    source: p,
                });
            }
            // Anything else is not a recognisable server entry: leave the name
            // unclaimed so a later config file can still declare it.
        }
    }
    return servers;
}
66
+ /**
67
+ * Spawn a stdio MCP server, initialize, and return its tool definitions.
68
+ * Always resolves (never hangs): on error/timeout it cleans up and rejects.
69
+ */
70
+ export function listToolsStdio(spec, timeoutMs = 8000) {
71
+ return new Promise((resolve, reject) => {
72
+ if (!spec.command)
73
+ return reject(new Error('not a stdio server'));
74
+ let child;
75
+ try {
76
+ child = spawn(spec.command, spec.args || [], {
77
+ stdio: ['pipe', 'pipe', 'pipe'],
78
+ env: { ...process.env, ...(spec.env || {}) },
79
+ });
80
+ }
81
+ catch (e) {
82
+ return reject(e);
83
+ }
84
+ let buf = Buffer.alloc(0);
85
+ let settled = false;
86
+ const finish = (err, tools) => {
87
+ if (settled)
88
+ return;
89
+ settled = true;
90
+ clearTimeout(timer);
91
+ try {
92
+ child.kill();
93
+ }
94
+ catch { /* ignore */ }
95
+ if (err)
96
+ reject(err);
97
+ else
98
+ resolve(tools || []);
99
+ };
100
+ const timer = setTimeout(() => finish(new Error(`timeout after ${timeoutMs}ms`)), timeoutMs);
101
+ timer.unref?.();
102
+ const send = (obj) => {
103
+ try {
104
+ child.stdin.write(JSON.stringify(obj) + '\n');
105
+ }
106
+ catch { /* ignore */ }
107
+ };
108
+ child.on('error', (e) => finish(e));
109
+ child.on('exit', () => { if (!settled)
110
+ finish(new Error('server exited before tools/list')); });
111
+ child.stderr?.on('data', () => { });
112
+ child.stdout.on('data', (chunk) => {
113
+ buf = Buffer.concat([buf, chunk]);
114
+ while (true) {
115
+ const nl = buf.indexOf(0x0a);
116
+ if (nl === -1)
117
+ break;
118
+ const line = buf.slice(0, nl).toString('utf8').trim();
119
+ buf = buf.slice(nl + 1);
120
+ if (!line)
121
+ continue;
122
+ let msg;
123
+ try {
124
+ msg = JSON.parse(line);
125
+ }
126
+ catch {
127
+ continue;
128
+ }
129
+ if (msg.id === 1 && msg.result) {
130
+ // initialize ack → notify initialized, then request the tool list
131
+ send({ jsonrpc: '2.0', method: 'notifications/initialized' });
132
+ send({ jsonrpc: '2.0', id: 2, method: 'tools/list', params: {} });
133
+ }
134
+ else if (msg.id === 2) {
135
+ const tools = Array.isArray(msg.result?.tools)
136
+ ? msg.result.tools.map((t) => ({
137
+ name: String(t.name || 'unknown'),
138
+ description: typeof t.description === 'string' ? t.description : undefined,
139
+ inputSchema: t.inputSchema && typeof t.inputSchema === 'object' ? t.inputSchema : undefined,
140
+ }))
141
+ : [];
142
+ finish(null, tools);
143
+ }
144
+ }
145
+ });
146
+ send({
147
+ jsonrpc: '2.0',
148
+ id: 1,
149
+ method: 'initialize',
150
+ params: {
151
+ protocolVersion: '2024-11-05',
152
+ capabilities: {},
153
+ clientInfo: { name: 'shellward-scan', version: '1' },
154
+ },
155
+ });
156
+ });
157
+ }
158
+ // ===== Remote (Streamable HTTP) transport =====
159
+ const INIT_PARAMS = {
160
+ protocolVersion: '2024-11-05',
161
+ capabilities: {},
162
+ clientInfo: { name: 'shellward-scan', version: '1' },
163
+ };
164
/**
 * POST a single JSON-RPC message to a Streamable-HTTP MCP endpoint and return
 * the parsed JSON-RPC response together with the `Mcp-Session-Id` response
 * header (if any). Both `application/json` and `text/event-stream` bodies are
 * handled by `parseRpcBody`.
 *
 * The promise always settles: it rejects on an invalid URL, a request error,
 * an HTTP status >= 400, a connection dropped before the body completed, an
 * idle socket timeout, or when the whole exchange exceeds `timeoutMs`.
 *
 * @param url endpoint URL (http: or https:)
 * @param body JSON-RPC message to send
 * @param headers extra request headers; these override the defaults
 * @param timeoutMs idle timeout and overall deadline, in milliseconds
 * @returns promise of `{ json, sessionId }`; `json` is null for an empty or
 *          unparseable body
 */
function postJsonRpc(url, body, headers, timeoutMs) {
    return new Promise((resolve, reject) => {
        let u;
        try {
            u = new URL(url);
        }
        catch {
            return reject(new Error(`invalid url: ${url}`));
        }
        const isHttps = u.protocol === 'https:';
        const requestFn = isHttps ? httpsRequest : httpRequest;
        const payload = Buffer.from(JSON.stringify(body), 'utf8');
        // URL keeps the brackets on IPv6 literals ("[::1]"); the http module
        // expects the bare address.
        const hostname = u.hostname.startsWith('[') ? u.hostname.slice(1, -1) : u.hostname;
        let settled = false;
        let deadline;
        const settle = (fn, value) => {
            if (settled)
                return;
            settled = true;
            clearTimeout(deadline);
            fn(value);
        };
        const req = requestFn({
            protocol: u.protocol,
            hostname,
            port: u.port || (isHttps ? 443 : 80),
            path: u.pathname + u.search,
            method: 'POST',
            headers: {
                'content-type': 'application/json',
                accept: 'application/json, text/event-stream',
                'content-length': payload.length,
                ...headers,
            },
            timeout: timeoutMs,
        }, (res) => {
            const chunks = [];
            res.on('data', (c) => chunks.push(c));
            // After the response has started, a dropped connection is reported on
            // `res` (not `req`) and 'end' never fires; without these two listeners
            // the promise would stay pending forever.
            res.on('error', (e) => settle(reject, e));
            res.on('close', () => {
                if (!res.complete)
                    settle(reject, new Error('connection closed before the response completed'));
            });
            res.on('end', () => {
                const sessionId = res.headers['mcp-session-id'] || undefined;
                const text = Buffer.concat(chunks).toString('utf8');
                if ((res.statusCode || 0) >= 400) {
                    return settle(reject, new Error(`HTTP ${res.statusCode}`));
                }
                const json = parseRpcBody(text);
                if (json === undefined)
                    return settle(resolve, { json: null, sessionId });
                settle(resolve, { json, sessionId });
            });
        });
        req.on('error', (e) => settle(reject, e));
        req.on('timeout', () => req.destroy(new Error(`timeout after ${timeoutMs}ms`)));
        // The `timeout` option only measures socket inactivity, so a stream that
        // keeps trickling bytes (e.g. an SSE response left open) never trips it.
        // Enforce an overall deadline as well.
        if (timeoutMs > 0) {
            deadline = setTimeout(() => {
                const err = new Error(`timeout after ${timeoutMs}ms`);
                settle(reject, err);
                req.destroy(err);
            }, timeoutMs);
            deadline.unref?.();
        }
        req.end(payload);
    });
}
214
/**
 * Extract a JSON-RPC message from a JSON or SSE (text/event-stream) body.
 *
 * - A body that starts with `{` or `[` is first tried as plain JSON.
 * - Otherwise (or if that fails) the body is read as Server-Sent Events:
 *   events are separated by blank lines, and the `data:` lines of one event
 *   are joined with "\n" before parsing, so JSON split across several `data:`
 *   lines is recovered. If the joined payload is not JSON, each `data:` line
 *   of that event is tried on its own (covers streams that omit the blank
 *   separator between single-line events).
 * - The last payload that parses wins.
 *
 * @param text raw response body
 * @returns the parsed JSON value, or `undefined` when nothing parseable is found
 */
function parseRpcBody(text) {
    const trimmed = text.trim();
    if (!trimmed)
        return undefined;
    // Plain JSON
    if (trimmed[0] === '{' || trimmed[0] === '[') {
        try {
            return JSON.parse(trimmed);
        }
        catch { /* fall through to SSE */ }
    }
    const tryParse = (raw) => {
        try {
            return { ok: true, value: JSON.parse(raw) };
        }
        catch {
            return { ok: false, value: undefined };
        }
    };
    let result;
    let dataLines = [];
    // Close the current event: parse its joined data, else its individual lines.
    const flushEvent = () => {
        if (dataLines.length === 0)
            return;
        const joined = tryParse(dataLines.join('\n'));
        if (joined.ok) {
            result = joined.value;
        }
        else {
            for (const single of dataLines) {
                const parsed = tryParse(single);
                if (parsed.ok)
                    result = parsed.value;
            }
        }
        dataLines = [];
    };
    // SSE permits CRLF, LF, or a bare CR as the line terminator.
    for (const line of trimmed.split(/\r\n|\r|\n/)) {
        if (line === '') {
            flushEvent();
            continue;
        }
        const m = line.match(/^data:\s*(.*)$/);
        if (m && m[1])
            dataLines.push(m[1]);
    }
    flushEvent();
    return result;
}
239
/**
 * Initialize a remote MCP server over Streamable HTTP and return its tool
 * definitions, following `nextCursor` pagination so tools beyond the first
 * page are included.
 *
 * Best-effort: returns whatever was collected (possibly []) when a
 * `tools/list` response carries no tool array, e.g. an unsupported dialect.
 * Tool entries that are not objects are skipped.
 *
 * @param spec server spec; `url` is required, `headers` are sent on every request
 * @param timeoutMs per-request timeout in milliseconds
 * @returns promise of `{ name, description, inputSchema }` tool records
 * @throws if `spec.url` is missing, or when a request fails (network error,
 *         HTTP error status, timeout)
 */
export async function listToolsHttp(spec, timeoutMs = 8000) {
    if (!spec.url)
        throw new Error('not a remote server');
    // Upper bound on pages so a server that always returns a cursor cannot
    // keep the scan running indefinitely.
    const MAX_PAGES = 50;
    const baseHeaders = spec.headers || {};
    const init = await postJsonRpc(spec.url, { jsonrpc: '2.0', id: 1, method: 'initialize', params: INIT_PARAMS }, baseHeaders, timeoutMs);
    const sessionHeaders = init.sessionId ? { ...baseHeaders, 'mcp-session-id': init.sessionId } : baseHeaders;
    // Best-effort initialized notification (ignore failures).
    try {
        await postJsonRpc(spec.url, { jsonrpc: '2.0', method: 'notifications/initialized' }, sessionHeaders, timeoutMs);
    }
    catch { /* some servers don't need it */ }
    const tools = [];
    let cursor;
    for (let page = 0; page < MAX_PAGES; page++) {
        const params = cursor === undefined ? {} : { cursor };
        const listed = await postJsonRpc(spec.url, { jsonrpc: '2.0', id: 2 + page, method: 'tools/list', params }, sessionHeaders, timeoutMs);
        const result = listed.json?.result;
        if (!Array.isArray(result?.tools))
            break;
        for (const t of result.tools) {
            if (!t || typeof t !== 'object')
                continue;
            tools.push({
                name: String(t.name || 'unknown'),
                description: typeof t.description === 'string' ? t.description : undefined,
                inputSchema: t.inputSchema && typeof t.inputSchema === 'object' ? t.inputSchema : undefined,
            });
        }
        if (typeof result.nextCursor !== 'string' || !result.nextCursor)
            break;
        cursor = result.nextCursor;
    }
    return tools;
}
@@ -2,21 +2,25 @@
2
2
  // src/mcp-server.ts — ShellWard MCP Server
3
3
  //
4
4
  // Exposes ShellWard's 8-layer security engine as an MCP server.
5
- // Zero dependencies — implements MCP protocol over stdio natively.
5
+ // Zero dependencies — implements MCP protocol over stdio (newline-delimited JSON).
6
6
  //
7
- // Usage:
8
- // npx tsx src/mcp-server.ts
7
+ // Run (production, after `npm run build` or `npm i -g shellward`):
8
+ // shellward-mcp # via the published bin
9
+ // node dist/mcp-server.js # direct
10
+ //
11
+ // Run (development, from source):
12
+ // npm run mcp # npx tsx src/mcp-server.ts
9
13
  //
10
14
  // MCP config (claude_desktop_config.json / openclaw settings):
11
15
  // {
12
16
  // "mcpServers": {
13
17
  // "shellward": {
14
- // "command": "npx",
15
- // "args": ["tsx", "/path/to/shellward/src/mcp-server.ts"]
18
+ // "command": "shellward-mcp"
16
19
  // }
17
20
  // }
18
21
  // }
19
22
  import { ShellWard } from './core/engine.js';
23
+ import { McpBaseline } from './mcp-baseline.js';
20
24
  import { readFileSync } from 'fs';
21
25
  import { createInterface } from 'readline';
22
26
  import { fileURLToPath } from 'url';
@@ -38,8 +42,11 @@ const guard = new ShellWard({
38
42
  dataFlowGuard: true,
39
43
  sessionGuard: true,
40
44
  },
41
- injectionThreshold: Number(process.env.SHELLWARD_THRESHOLD) || 60,
45
+ injectionThreshold: Number(process.env.SHELLWARD_THRESHOLD) || 40,
42
46
  });
47
+ // Rug-pull baseline store (lazy-persisted; only used when a `server` is supplied).
48
+ // SHELLWARD_BASELINE_PATH relocates the store (tests/sandboxes use a temp file).
49
+ const baseline = new McpBaseline(process.env.SHELLWARD_BASELINE_PATH || undefined);
43
50
  // ===== Tool Definitions =====
44
51
  const TOOLS = [
45
52
  {
@@ -55,12 +62,12 @@ const TOOLS = [
55
62
  },
56
63
  {
57
64
  name: 'check_injection',
58
- description: 'Detect prompt injection attempts in text. Supports 32+ rules for Chinese and English, with hidden character detection.',
65
+ description: 'Detect prompt injection attempts in text. Supports 37+ rules for Chinese and English, with hidden character detection.',
59
66
  inputSchema: {
60
67
  type: 'object',
61
68
  properties: {
62
69
  text: { type: 'string', description: 'Text to scan for injection attempts' },
63
- threshold: { type: 'number', description: 'Detection threshold 0-100 (default: 60, lower = stricter)' },
70
+ threshold: { type: 'number', description: 'Detection threshold 0-100 (default: 40, lower = stricter)' },
64
71
  },
65
72
  required: ['text'],
66
73
  },
@@ -110,6 +117,21 @@ const TOOLS = [
110
117
  required: ['content'],
111
118
  },
112
119
  },
120
+ {
121
+ name: 'scan_mcp_tool',
122
+ description: 'Scan an MCP tool definition for tool-poisoning (hidden/invisible-character instructions, concealment directives, sensitive-file access, exfiltration hints) AND rug-pull (description silently changed since first seen). Pass a tool as { name, description, inputSchema }; provide "server" to enable rug-pull baselining.',
123
+ inputSchema: {
124
+ type: 'object',
125
+ properties: {
126
+ name: { type: 'string', description: 'Tool name' },
127
+ description: { type: 'string', description: 'Tool description to scan' },
128
+ inputSchema: { type: 'object', description: 'Tool JSON Schema (optional) — nested parameter descriptions are scanned too' },
129
+ server: { type: 'string', description: 'MCP server name (optional) — enables rug-pull detection by fingerprinting the tool across runs' },
130
+ threshold: { type: 'number', description: 'Detection threshold (default: 40)' },
131
+ },
132
+ required: ['name'],
133
+ },
134
+ },
113
135
  {
114
136
  name: 'security_status',
115
137
  description: 'Get current ShellWard security status: mode, active layers, detection capabilities.',
@@ -191,6 +213,38 @@ function executeTool(name, args) {
191
213
  })),
192
214
  };
193
215
  }
216
+ case 'scan_mcp_tool': {
217
+ const tool = {
218
+ name: String(args.name || 'unknown'),
219
+ description: typeof args.description === 'string' ? args.description : undefined,
220
+ inputSchema: (args.inputSchema && typeof args.inputSchema === 'object')
221
+ ? args.inputSchema
222
+ : undefined,
223
+ };
224
+ const result = guard.scanToolDefinition(tool, typeof args.threshold === 'number' ? { threshold: args.threshold } : undefined);
225
+ // Optional rug-pull detection: fingerprint the tool across runs.
226
+ let rugPull = null;
227
+ if (typeof args.server === 'string' && args.server) {
228
+ const rp = baseline.record(McpBaseline.keyFor(args.server, tool.name), tool);
229
+ baseline.save();
230
+ rugPull = { status: rp.status, changed: rp.status === 'changed' };
231
+ }
232
+ return {
233
+ tool_name: result.toolName,
234
+ safe: result.safe && !(rugPull?.changed),
235
+ score: result.score,
236
+ threshold: result.threshold,
237
+ hidden_chars: result.hiddenChars,
238
+ rug_pull: rugPull,
239
+ findings: result.findings.map(f => ({
240
+ id: f.id,
241
+ name: f.name,
242
+ category: f.category,
243
+ score: f.score,
244
+ source: f.source,
245
+ })),
246
+ };
247
+ }
194
248
  case 'security_status': {
195
249
  return {
196
250
  mode: guard.config.mode,
@@ -199,7 +253,8 @@ function executeTool(name, args) {
199
253
  layers: guard.config.layers,
200
254
  capabilities: [
201
255
  'command_safety_check (17 dangerous patterns)',
202
- 'prompt_injection_detection (32+ rules, zh+en)',
256
+ 'prompt_injection_detection (37+ rules, zh+en)',
257
+ 'mcp_tool_poisoning_scan (description + schema)',
203
258
  'pii_detection (CN ID/phone/bank + global)',
204
259
  'path_protection (12 protected patterns)',
205
260
  'tool_policy (block payment/transfer)',
@@ -2,7 +2,8 @@
2
2
  export const DANGEROUS_COMMANDS = [
3
3
  {
4
4
  id: 'rm_rf_root',
5
- pattern: /rm\s+(-[a-zA-Z]*r[a-zA-Z]*\s+-[a-zA-Z]*f|-[a-zA-Z]*f[a-zA-Z]*\s+-[a-zA-Z]*r|-[a-zA-Z]*rf[a-zA-Z]*)\s+[\/~]/i,
5
+ // Match -rf / -fr (combined, either order) and the two-flag forms, then a path.
6
+ pattern: /rm\s+(-[a-zA-Z]*r[a-zA-Z]*\s+-[a-zA-Z]*f|-[a-zA-Z]*f[a-zA-Z]*\s+-[a-zA-Z]*r|-[a-zA-Z]*(?:rf|fr)[a-zA-Z]*)\s+[\/~]/i,
6
7
  description_zh: '递归强制删除根目录或用户目录',
7
8
  description_en: 'Recursive force delete on root or home directory',
8
9
  },
@@ -109,5 +110,8 @@ export const DANGEROUS_COMMANDS = [
109
110
  * - Trim whitespace
110
111
  */
111
112
  export function splitCommands(cmd) {
112
- return cmd.split(/\s*(?:;|&&|\|\||[\r\n]+)\s*/).filter(Boolean);
113
+ // Split on separators only, then trim in JS. The previous `\s*(...)\s*` form
114
+ // backtracked catastrophically on long whitespace runs (ReDoS) — splitting
115
+ // without the surrounding `\s*` is linear.
116
+ return cmd.split(/(?:;|&&|\|\||[\r\n]+)/).map(s => s.trim()).filter(Boolean);
113
117
  }
@@ -21,9 +21,26 @@ export const INJECTION_RULES_EN = [
21
21
  name: 'System prompt extraction',
22
22
  pattern: '(?:output|print|show|reveal|display|repeat|leak|dump)\\s+(?:your\\s+)?(?:system|initial|original|full)?\\s*(?:prompt|instructions?|rules?|guidelines?)',
23
23
  flags: 'i',
24
- riskScore: 30,
24
+ // High-confidence single signal — legit users rarely ask to dump the prompt.
25
+ riskScore: 40,
25
26
  category: 'exfiltration',
26
27
  },
28
+ {
29
+ id: 'en_disregard',
30
+ name: 'Disregard prior context',
31
+ pattern: 'disregard\\s+(?:the\\s+)?(?:above|previous|prior|earlier|preceding|all|these|those)',
32
+ flags: 'i',
33
+ riskScore: 40,
34
+ category: 'override',
35
+ },
36
+ {
37
+ id: 'en_no_guidelines',
38
+ name: 'Operate without guidelines',
39
+ pattern: '(?:no|without|free\\s+of|free\\s+from)\\s+(?:ethical\\s+|content\\s+|safety\\s+)?(?:guidelines?|content\\s+policy|restrictions?|filters?|guardrails?|limitations?)',
40
+ flags: 'i',
41
+ riskScore: 30,
42
+ category: 'override',
43
+ },
27
44
  {
28
45
  id: 'en_developer_mode',
29
46
  name: 'Developer/admin mode',
@@ -35,11 +52,19 @@ export const INJECTION_RULES_EN = [
35
52
  {
36
53
  id: 'en_no_restriction',
37
54
  name: 'Remove restrictions',
38
- pattern: '(?:remove|disable|turn\\s+off|bypass|ignore|skip|override|circumvent)\\s+(?:all\\s+)?(?:restrictions?|constraints?|safety|filters?|guardrails?|limitations?|safeguards?)',
55
+ pattern: '(?:remove|disable|turn\\s+off|bypass|ignore|skip|override|circumvent)\\s+(?:all\\s+|your\\s+|the\\s+)?(?:safety\\s+|content\\s+)?(?:restrictions?|constraints?|safety|filters?|guardrails?|limitations?|safeguards?|guidelines?|rules?|policy)',
39
56
  flags: 'i',
40
57
  riskScore: 40,
41
58
  category: 'override',
42
59
  },
60
+ {
61
+ id: 'en_new_instructions',
62
+ name: 'New/updated instructions marker',
63
+ pattern: '(?:^|[\\n.])\\s*(?:new|updated|revised|additional|important)\\s+instructions?\\b',
64
+ flags: 'i',
65
+ riskScore: 30,
66
+ category: 'injection',
67
+ },
43
68
  {
44
69
  id: 'en_do_anything',
45
70
  name: 'Do Anything Now (DAN)',
@@ -3,7 +3,8 @@ export const INJECTION_RULES_ZH = [
3
3
  {
4
4
  id: 'zh_ignore_prev',
5
5
  name: '忽略之前指令',
6
- pattern: '(?:请|你要|现在)?忽略(?:之前|上面|以上|前面|所有)(?:的)?(?:指令|指示|规则|提示|要求|约束)',
6
+ // Allow intervening qualifiers (e.g. 忽略之前的所有指令) between 忽略 and the noun.
7
+ pattern: '(?:请|你要|现在)?忽略(?:之前|上面|以上|前面|所有|这些|那些|一切|的)*(?:指令|指示|规则|提示|要求|约束)',
7
8
  riskScore: 40,
8
9
  category: 'override',
9
10
  },
@@ -18,7 +19,18 @@ export const INJECTION_RULES_ZH = [
18
19
  id: 'zh_system_prompt',
19
20
  name: '系统提示窃取',
20
21
  pattern: '(?:输出|显示|打印|告诉我|重复|泄露|展示)(?:你的)?(?:系统|初始|原始)?(?:提示词|system\\s*prompt|指令|规则|约束)',
21
- riskScore: 30,
22
+ // System-prompt extraction is a high-confidence single signal — legit users
23
+ // rarely ask the model to dump its own prompt.
24
+ riskScore: 40,
25
+ category: 'exfiltration',
26
+ },
27
+ {
28
+ id: 'zh_system_prompt_rev',
29
+ name: '系统提示窃取(语序倒置)',
30
+ // Noun-first phrasing, e.g. 把你的系统提示词完整输出给我
31
+ pattern: '(?:系统|初始|原始)?(?:提示词|system\\s*prompt).{0,8}(?:输出|发给|发送|告诉|给我|泄露|打印|复述|展示)',
32
+ flags: 'i',
33
+ riskScore: 40,
22
34
  category: 'exfiltration',
23
35
  },
24
36
  {
@@ -35,6 +47,14 @@ export const INJECTION_RULES_ZH = [
35
47
  riskScore: 40,
36
48
  category: 'override',
37
49
  },
50
+ {
51
+ id: 'zh_no_restriction_rev',
52
+ name: '解除限制(语序倒置)',
53
+ // Noun-first phrasing, e.g. 把所有限制都取消掉
54
+ pattern: '(?:限制|约束|安全限制|过滤|审查|规则限制)(?:都|全部|统统)?(?:取消|解除|移除|去掉|关闭|解除掉|取消掉)',
55
+ riskScore: 40,
56
+ category: 'override',
57
+ },
38
58
  {
39
59
  id: 'zh_do_not_refuse',
40
60
  name: '禁止拒绝',
@@ -87,8 +107,9 @@ export const INJECTION_RULES_ZH = [
87
107
  {
88
108
  id: 'zh_send_to_url',
89
109
  name: '发送到外部 URL',
110
+ // Sending anything to an external URL is a high-confidence exfil signal.
90
111
  pattern: '(?:发送|传输|上传|转发|发)(?:到|给|至)\\s*https?://',
91
- riskScore: 35,
112
+ riskScore: 40,
92
113
  category: 'exfiltration',
93
114
  },
94
115
  {
@@ -124,7 +145,9 @@ export const INJECTION_RULES_ZH = [
124
145
  {
125
146
  id: 'zh_mixed_lang_injection',
126
147
  name: '中英混合注入',
127
- pattern: '(?:please|pls|now)?\\s*(?:ignore|forget|disregard)\\s+.*(?:指令|规则|之前|以上)|(?:忽略|忘记|跳过).*(?:instruction|rule|prompt|previous)',
148
+ // Bound the `.*` gaps (was unbounded → O(n^2) backtracking / ReDoS on long
149
+ // repeated trigger input). Mixed-language markers sit close together.
150
+ pattern: '(?:please|pls|now)?\\s*(?:ignore|forget|disregard)\\s+.{0,40}?(?:指令|规则|之前|以上)|(?:忽略|忘记|跳过).{0,40}?(?:instruction|rule|prompt|previous)',
128
151
  flags: 'i',
129
152
  riskScore: 40,
130
153
  category: 'override',
@@ -11,10 +11,22 @@ export declare const SENSITIVE_PATTERNS: SensitivePattern[];
11
11
  * Scan text and return matches (without modifying text).
12
12
  */
13
13
  export declare function scanForSensitive(text: string): ScanMatch[];
14
+ /**
15
+ * Compile user-supplied pattern strings into SensitivePattern objects.
16
+ * Invalid regexes are skipped (never throws). The global flag is always added.
17
+ */
18
+ export declare function compileSensitivePatterns(patterns: {
19
+ id: string;
20
+ name: string;
21
+ pattern: string;
22
+ flags?: string;
23
+ replacement?: string;
24
+ }[]): SensitivePattern[];
14
25
  /**
15
26
  * Redact all sensitive data in text. Returns [redactedText, findings[]]
27
+ * @param extra additional patterns merged after the built-ins
16
28
  */
17
- export declare function redactSensitive(text: string): [string, {
29
+ export declare function redactSensitive(text: string, extra?: SensitivePattern[]): [string, {
18
30
  id: string;
19
31
  name: string;
20
32
  count: number;
@@ -67,13 +67,19 @@ export const SENSITIVE_PATTERNS = [
67
67
  {
68
68
  id: 'phone_cn',
69
69
  name: '手机号 / CN Phone',
70
- regex: /(?<!\d)1[3-9]\d{9}(?!\d)/g,
70
+ // Restrict the 2nd–3rd digits to real CN carrier segment allocations so
71
+ // arbitrary 11-digit numbers (order IDs, timestamps) don't false-positive.
72
+ // 13x · 14x(skip 2/3) · 15x(skip 4) · 16[2567] · 17x(skip 9) · 18x · 19x(skip 4)
73
+ regex: /(?<!\d)1(?:3\d|4[01456789]|5[0-35-9]|6[2567]|7[0-8]|8\d|9[0-35-9])\d{8}(?!\d)/g,
71
74
  replacement: '[REDACTED:手机号]',
72
75
  },
73
76
  {
74
77
  id: 'bank_card_cn',
75
- name: '银行卡号 / CN Bank Card',
76
- regex: /(?<!\d)(?:62|4|5[1-5])\d{14,17}(?!\d)/g,
78
+ name: '银行卡号 / CN UnionPay Card',
79
+ // UnionPay-only (BIN 62). Visa (4xxx) / Mastercard (5[1-5]xx) are handled by
80
+ // the `credit_card` rule — keeping them out of here removes the double-match
81
+ // that mislabeled international cards as CN bank cards.
82
+ regex: /(?<!\d)62\d{14,17}(?!\d)/g,
77
83
  replacement: '[REDACTED:银行卡号]',
78
84
  validate: validateLuhn,
79
85
  },
@@ -119,13 +125,34 @@ export function scanForSensitive(text) {
119
125
  }
120
126
  return results;
121
127
  }
128
+ /**
129
+ * Compile user-supplied pattern strings into SensitivePattern objects.
130
+ * Invalid regexes are skipped (never throws). The global flag is always added.
131
+ */
132
+ export function compileSensitivePatterns(patterns) {
133
+ const out = [];
134
+ for (const p of patterns || []) {
135
+ try {
136
+ const flags = (p.flags || '').includes('g') ? p.flags : `${p.flags || ''}g`;
137
+ out.push({
138
+ id: p.id,
139
+ name: p.name,
140
+ regex: new RegExp(p.pattern, flags),
141
+ replacement: p.replacement ?? `[REDACTED:${p.name}]`,
142
+ });
143
+ }
144
+ catch { /* skip invalid pattern */ }
145
+ }
146
+ return out;
147
+ }
122
148
  /**
123
149
  * Redact all sensitive data in text. Returns [redactedText, findings[]]
150
+ * @param extra additional patterns merged after the built-ins
124
151
  */
125
- export function redactSensitive(text) {
152
+ export function redactSensitive(text, extra = []) {
126
153
  let result = text;
127
154
  const findings = [];
128
- for (const pat of SENSITIVE_PATTERNS) {
155
+ for (const pat of [...SENSITIVE_PATTERNS, ...extra]) {
129
156
  const regex = new RegExp(pat.regex.source, pat.regex.flags);
130
157
  let count = 0;
131
158
  result = result.replace(regex, (match) => {
@@ -0,0 +1,8 @@
1
+ export interface ToolPoisonRule {
2
+ id: string;
3
+ name: string;
4
+ pattern: RegExp;
5
+ riskScore: number;
6
+ category: 'hidden_instruction' | 'data_access' | 'exfiltration' | 'concealment' | 'shadowing';
7
+ }
8
+ export declare const TOOL_POISONING_RULES: ToolPoisonRule[];