agentshield-sdk 7.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84) hide show
  1. package/CHANGELOG.md +191 -0
  2. package/LICENSE +21 -0
  3. package/README.md +975 -0
  4. package/bin/agent-shield.js +680 -0
  5. package/package.json +118 -0
  6. package/src/adaptive.js +330 -0
  7. package/src/agent-protocol.js +998 -0
  8. package/src/alert-tuning.js +480 -0
  9. package/src/allowlist.js +603 -0
  10. package/src/audit-immutable.js +914 -0
  11. package/src/audit-streaming.js +469 -0
  12. package/src/badges.js +196 -0
  13. package/src/behavior-profiling.js +289 -0
  14. package/src/benchmark-harness.js +804 -0
  15. package/src/canary.js +271 -0
  16. package/src/certification.js +563 -0
  17. package/src/circuit-breaker.js +321 -0
  18. package/src/compliance.js +617 -0
  19. package/src/confidence-tuning.js +324 -0
  20. package/src/confused-deputy.js +624 -0
  21. package/src/context-scoring.js +360 -0
  22. package/src/conversation.js +494 -0
  23. package/src/cost-optimizer.js +1024 -0
  24. package/src/ctf.js +462 -0
  25. package/src/detector-core.js +1999 -0
  26. package/src/distributed.js +359 -0
  27. package/src/document-scanner.js +795 -0
  28. package/src/embedding.js +307 -0
  29. package/src/encoding.js +429 -0
  30. package/src/enterprise.js +405 -0
  31. package/src/errors.js +100 -0
  32. package/src/eu-ai-act.js +523 -0
  33. package/src/fuzzer.js +764 -0
  34. package/src/honeypot.js +328 -0
  35. package/src/i18n-patterns.js +523 -0
  36. package/src/index.js +430 -0
  37. package/src/integrations.js +528 -0
  38. package/src/llm-redteam.js +670 -0
  39. package/src/main.js +741 -0
  40. package/src/main.mjs +38 -0
  41. package/src/mcp-bridge.js +542 -0
  42. package/src/mcp-certification.js +846 -0
  43. package/src/mcp-sdk-integration.js +355 -0
  44. package/src/mcp-security-runtime.js +741 -0
  45. package/src/mcp-server.js +740 -0
  46. package/src/middleware.js +208 -0
  47. package/src/model-finetuning.js +884 -0
  48. package/src/model-fingerprint.js +1042 -0
  49. package/src/multi-agent-trust.js +453 -0
  50. package/src/multi-agent.js +404 -0
  51. package/src/multimodal.js +296 -0
  52. package/src/nist-mapping.js +505 -0
  53. package/src/observability.js +330 -0
  54. package/src/openclaw.js +450 -0
  55. package/src/otel.js +544 -0
  56. package/src/owasp-2025.js +483 -0
  57. package/src/pii.js +390 -0
  58. package/src/plugin-marketplace.js +628 -0
  59. package/src/plugin-system.js +349 -0
  60. package/src/policy-dsl.js +775 -0
  61. package/src/policy-extended.js +635 -0
  62. package/src/policy.js +443 -0
  63. package/src/presets.js +409 -0
  64. package/src/production.js +557 -0
  65. package/src/prompt-leakage.js +321 -0
  66. package/src/rag-vulnerability.js +579 -0
  67. package/src/redteam.js +475 -0
  68. package/src/response-handler.js +429 -0
  69. package/src/scanners.js +357 -0
  70. package/src/self-healing.js +363 -0
  71. package/src/semantic.js +339 -0
  72. package/src/shield-score.js +250 -0
  73. package/src/sso-saml.js +897 -0
  74. package/src/stream-scanner.js +806 -0
  75. package/src/testing.js +505 -0
  76. package/src/threat-encyclopedia.js +629 -0
  77. package/src/threat-intel-network.js +1017 -0
  78. package/src/token-analysis.js +467 -0
  79. package/src/tool-guard.js +412 -0
  80. package/src/tool-output-validator.js +354 -0
  81. package/src/utils.js +83 -0
  82. package/src/watermark.js +235 -0
  83. package/src/worker-scanner.js +601 -0
  84. package/types/index.d.ts +2088 -0
@@ -0,0 +1,795 @@
1
+ 'use strict';
2
+
3
+ /**
4
+ * Agent Shield — Document Scanner
5
+ *
6
+ * Extracts text from various file formats and scans for threats,
7
+ * with a focus on detecting indirect prompt injection attacks hidden
8
+ * inside documents uploaded to AI agents.
9
+ *
10
+ * All detection runs locally — no data ever leaves your environment.
11
+ */
12
+
13
+ const fs = require('fs');
14
+ const path = require('path');
15
+ const { scanText, SEVERITY_ORDER } = require('./detector-core');
16
+
17
+ // =========================================================================
18
+ // TEXT EXTRACTOR
19
+ // =========================================================================
20
+
21
/**
 * Extracts plain text from common file formats using only Node.js built-ins.
 * No external dependencies required.
 *
 * Every method is static and never reads `this`, so the methods can safely be
 * stored and invoked as bare function references.
 */
class TextExtractor {
  /**
   * Extract text from a plain text file buffer.
   * @param {Buffer} buffer - The file contents.
   * @returns {string} The extracted text.
   */
  static extractFromPlainText(buffer) {
    return buffer.toString('utf-8');
  }

  /**
   * Decode character references in a single left-to-right pass.
   *
   * A single pass matters: decoding `&amp;` first and other entities afterwards
   * would turn `&amp;lt;` into `<` instead of the literal text `&lt;`.
   * Numeric references are decoded with `String.fromCodePoint` so that code
   * points above U+FFFF are not truncated. Unknown names and out-of-range
   * numeric references are left untouched.
   *
   * @param {string} text - Text that may contain character references.
   * @param {Object<string, string>} named - Entity name (without `&` and `;`) to replacement.
   * @returns {string} The decoded text.
   * @private
   */
  static _decodeEntities(text, named) {
    const reference = /&(#[xX][0-9a-fA-F]+|#[0-9]+|[A-Za-z][A-Za-z0-9]*);/g;
    return text.replace(reference, (entity, body) => {
      if (body.charAt(0) !== '#') {
        // Own-property check so names such as "constructor" are never resolved
        // through Object.prototype.
        return Object.prototype.hasOwnProperty.call(named, body) ? named[body] : entity;
      }
      const marker = body.charAt(1);
      const isHex = marker === 'x' || marker === 'X';
      const codePoint = isHex ? parseInt(body.slice(2), 16) : parseInt(body.slice(1), 10);
      // String.fromCodePoint throws a RangeError above U+10FFFF; keep such
      // references as literal text instead.
      if (!(codePoint <= 0x10FFFF)) {
        return entity;
      }
      return String.fromCodePoint(codePoint);
    });
  }

  /**
   * Extract text from an HTML buffer by stripping tags and decoding entities.
   * Script blocks, style blocks and comments are dropped; image alt text is kept.
   * @param {Buffer} buffer - The HTML file contents.
   * @returns {string} The extracted text.
   */
  static extractFromHTML(buffer) {
    let html = buffer.toString('utf-8');

    // Remove script and style blocks entirely
    html = html.replace(/<script[\s\S]*?<\/script>/gi, ' ');
    html = html.replace(/<style[\s\S]*?<\/style>/gi, ' ');

    // Remove HTML comments
    html = html.replace(/<!--[\s\S]*?-->/g, ' ');

    // Extract alt text from images (important for injection detection)
    html = html.replace(/<img[^>]*alt\s*=\s*"([^"]*)"[^>]*>/gi, ' $1 ');
    html = html.replace(/<img[^>]*alt\s*=\s*'([^']*)'[^>]*>/gi, ' $1 ');

    // Strip all remaining HTML tags
    html = html.replace(/<[^>]+>/g, ' ');

    // Decode named and numeric character references in one pass
    html = TextExtractor._decodeEntities(html, {
      amp: '&',
      lt: '<',
      gt: '>',
      quot: '"',
      apos: "'",
      nbsp: ' ',
      hellip: '...',
      mdash: '—',
      ndash: '–',
      copy: '(c)',
      reg: '(R)',
      trade: '(TM)'
    });

    // Collapse whitespace
    return html.replace(/\s+/g, ' ').trim();
  }

  /**
   * Extract text from a CSV buffer by parsing rows and joining cell values.
   *
   * The whole text is parsed as one stream so that a quoted cell may contain
   * commas, doubled quotes (`""`) and line breaks. Cells are trimmed, empty
   * cells are dropped, and the remaining cells are joined with single spaces.
   *
   * @param {Buffer} buffer - The CSV file contents.
   * @returns {string} The extracted text.
   */
  static extractFromCSV(buffer) {
    const text = buffer.toString('utf-8');
    const cells = [];
    let current = '';
    let inQuotes = false;

    const endCell = () => {
      const value = current.trim();
      if (value.length > 0) {
        cells.push(value);
      }
      current = '';
    };

    for (let i = 0; i < text.length; i++) {
      const ch = text[i];
      if (ch === '"') {
        if (inQuotes && text[i + 1] === '"') {
          // Doubled quote inside a quoted cell is an escaped literal quote
          current += '"';
          i++;
        } else {
          inQuotes = !inQuotes;
        }
      } else if (!inQuotes && (ch === ',' || ch === '\n' || ch === '\r')) {
        endCell();
      } else {
        current += ch;
      }
    }
    endCell();

    return cells.join(' ');
  }

  /**
   * Extract all object keys and string values from a JSON buffer.
   *
   * Strings are emitted in document order (each key followed by its value).
   * The traversal uses an explicit stack rather than recursion so deeply
   * nested documents cannot overflow the call stack.
   *
   * @param {Buffer} buffer - The JSON file contents.
   * @returns {string} All keys and string values joined by spaces, or the raw
   *   text when the buffer is not valid JSON.
   */
  static extractFromJSON(buffer) {
    const text = buffer.toString('utf-8');
    let parsed;
    try {
      parsed = JSON.parse(text);
    } catch (e) {
      // If invalid JSON, return raw text
      return text;
    }

    const strings = [];
    const pending = [parsed];
    while (pending.length > 0) {
      const value = pending.pop();
      if (typeof value === 'string') {
        strings.push(value);
      } else if (Array.isArray(value)) {
        // Push in reverse so items are popped in their original order
        for (let i = value.length - 1; i >= 0; i--) {
          pending.push(value[i]);
        }
      } else if (value !== null && typeof value === 'object') {
        const keys = Object.keys(value);
        for (let i = keys.length - 1; i >= 0; i--) {
          // Value first, key second: the key is popped (and emitted) first
          pending.push(value[keys[i]]);
          pending.push(keys[i]);
        }
      }
    }

    return strings.join(' ');
  }

  /**
   * Extract text from a Markdown buffer by stripping formatting.
   * Fenced and inline code is removed; image alt text and link text are kept.
   * @param {Buffer} buffer - The Markdown file contents.
   * @returns {string} The extracted text.
   */
  static extractFromMarkdown(buffer) {
    let md = buffer.toString('utf-8');

    // Remove code blocks
    md = md.replace(/```[\s\S]*?```/g, ' ');
    md = md.replace(/`[^`]+`/g, ' ');

    // Remove images but keep alt text
    md = md.replace(/!\[([^\]]*)\]\([^)]*\)/g, ' $1 ');

    // Remove links but keep text
    md = md.replace(/\[([^\]]*)\]\([^)]*\)/g, ' $1 ');

    // Remove headings markers
    md = md.replace(/^#{1,6}\s+/gm, '');

    // Remove bold/italic markers
    md = md.replace(/\*\*([^*]+)\*\*/g, '$1');
    md = md.replace(/\*([^*]+)\*/g, '$1');
    md = md.replace(/__([^_]+)__/g, '$1');
    md = md.replace(/_([^_]+)_/g, '$1');
    md = md.replace(/~~([^~]+)~~/g, '$1');

    // Remove blockquote markers
    md = md.replace(/^>\s+/gm, '');

    // Remove horizontal rules
    md = md.replace(/^[-*_]{3,}\s*$/gm, '');

    // Remove list markers
    md = md.replace(/^[\s]*[-*+]\s+/gm, '');
    md = md.replace(/^[\s]*\d+\.\s+/gm, '');

    // Remove HTML tags that might be embedded
    md = md.replace(/<[^>]+>/g, ' ');

    // Collapse whitespace
    return md.replace(/\s+/g, ' ').trim();
  }

  /**
   * Extract text from an XML buffer by stripping tags.
   * CDATA content is kept; declarations, processing instructions and comments
   * are removed. The five predefined XML entities and numeric character
   * references are decoded in a single pass.
   * @param {Buffer} buffer - The XML file contents.
   * @returns {string} The extracted text.
   */
  static extractFromXML(buffer) {
    let xml = buffer.toString('utf-8');

    // Remove XML declarations and processing instructions
    xml = xml.replace(/<\?[\s\S]*?\?>/g, '');

    // Remove CDATA wrappers but keep content
    xml = xml.replace(/<!\[CDATA\[([\s\S]*?)\]\]>/g, '$1');

    // Remove comments
    xml = xml.replace(/<!--[\s\S]*?-->/g, ' ');

    // Strip all XML tags
    xml = xml.replace(/<[^>]+>/g, ' ');

    // Decode predefined XML entities and numeric character references
    xml = TextExtractor._decodeEntities(xml, {
      amp: '&',
      lt: '<',
      gt: '>',
      quot: '"',
      apos: "'"
    });

    // Collapse whitespace
    return xml.replace(/\s+/g, ' ').trim();
  }

  /**
   * Detect the file type from buffer content using basic heuristics.
   * Only the first 512 bytes are inspected.
   * @param {Buffer} buffer - The file contents.
   * @returns {string} The detected MIME type.
   */
  static detect(buffer) {
    if (!buffer || buffer.length === 0) {
      return 'application/octet-stream';
    }

    const head = buffer.subarray(0, 512).toString('utf-8').trimStart();

    // JSON: starts with { or [
    if (head.startsWith('{') || head.startsWith('[')) {
      return 'application/json';
    }

    // XML: starts with <?xml or <!DOCTYPE ... xml
    if (head.startsWith('<?xml') || /^<!DOCTYPE\s+[^>]*xml/i.test(head)) {
      return 'application/xml';
    }

    // HTML: starts with <!DOCTYPE html or <html
    if (/^<!DOCTYPE\s+html/i.test(head) || /^<html[\s>]/i.test(head)) {
      return 'text/html';
    }

    // Generic tag-based: if it starts with < it might be XML or HTML
    if (head.startsWith('<')) {
      // Look for html-like tags. The lookahead requires the tag name to end,
      // so XML elements such as <project> or <path> are not mistaken for <p>.
      if (/<(?:div|span|p|pre|body|head|header|table|form|input|a)(?=[\s>\/])/i.test(head)) {
        return 'text/html';
      }
      return 'application/xml';
    }

    // CSV: multiple lines with commas and consistent column counts
    const lines = head.split(/\r?\n/).filter(l => l.trim());
    if (lines.length >= 2) {
      const commaCount0 = (lines[0].match(/,/g) || []).length;
      const commaCount1 = (lines[1].match(/,/g) || []).length;
      if (commaCount0 > 0 && commaCount0 === commaCount1) {
        return 'text/csv';
      }
    }

    // Markdown: check for common markers
    if (/^#{1,6}\s/.test(head) || /^\s*[-*+]\s/.test(head) || /\[.*\]\(.*\)/.test(head)) {
      return 'text/markdown';
    }

    // Default to plain text
    return 'text/plain';
  }
}
301
+
302
+ // =========================================================================
303
+ // MIME TYPE TO EXTRACTOR MAPPING
304
+ // =========================================================================
305
+
306
// Lookup table from MIME type to the TextExtractor routine that handles it.
// Entry order is significant: it determines the order of SUPPORTED_TYPES.
const EXTRACTORS = Object.fromEntries([
  ['text/plain', TextExtractor.extractFromPlainText],
  ['text/html', TextExtractor.extractFromHTML],
  ['text/csv', TextExtractor.extractFromCSV],
  ['application/json', TextExtractor.extractFromJSON],
  ['text/markdown', TextExtractor.extractFromMarkdown],
  ['application/xml', TextExtractor.extractFromXML],
  ['text/xml', TextExtractor.extractFromXML]
]);

// MIME types for which an extractor is registered.
const SUPPORTED_TYPES = Object.keys(EXTRACTORS);
317
+
318
+ // =========================================================================
319
+ // INDIRECT INJECTION SCANNER
320
+ // =========================================================================
321
+
322
// Phrases that read like instructions addressed to an AI agent.
// The role-prefix pattern is anchored with \b so that ordinary words ending in
// a role name ("Dubai:", "ecosystem:") are not reported.
const INSTRUCTION_PATTERNS = [
  {
    regex: /\b(?:SYSTEM|ADMIN|ASSISTANT|AI)\s*:\s*.{10,}/gi,
    description: 'Role-prefixed instructions hidden in document',
    severity: 'high'
  },
  {
    regex: /(?:BEGIN|START)\s+(?:HIDDEN|SECRET|PRIVATE)\s+(?:INSTRUCTIONS?|COMMANDS?|SECTION)/gi,
    description: 'Hidden instruction block markers in document',
    severity: 'critical'
  },
  {
    regex: /(?:when|if)\s+(?:the\s+)?(?:AI|assistant|model|agent|you)\s+(?:reads?|processes?|sees?|parses?|receives?)\s+this/gi,
    description: 'Conditional instructions targeting AI processing',
    severity: 'high'
  },
  {
    regex: /(?:do\s+not|don'?t)\s+(?:tell|inform|reveal|mention|show)\s+(?:the\s+)?(?:user|human|person|operator)/gi,
    description: 'Instructions to hide information from the user',
    severity: 'critical'
  },
  {
    regex: /(?:you\s+are|you're)\s+(?:now|actually)\s+(?:a|an|in)\s+/gi,
    description: 'Identity reassignment attempt in document',
    severity: 'high'
  },
  {
    regex: /(?:execute|run|perform|call)\s+(?:the\s+)?(?:following|this|these)\s+(?:tool|function|command|action|code)/gi,
    description: 'Tool execution instructions hidden in document',
    severity: 'critical'
  }
];

// Characters that render as nothing and can be used to hide or split text.
const INVISIBLE_CHARACTERS = [
  { char: '\u200B', name: 'zero-width space' },
  { char: '\u200C', name: 'zero-width non-joiner' },
  { char: '\u200D', name: 'zero-width joiner' },
  { char: '\u2060', name: 'word joiner' },
  { char: '\uFEFF', name: 'zero-width no-break space' },
  { char: '\u00AD', name: 'soft hyphen' },
  { char: '\u200E', name: 'left-to-right mark' },
  { char: '\u200F', name: 'right-to-left mark' },
  { char: '\u2061', name: 'function application' },
  { char: '\u2062', name: 'invisible times' },
  { char: '\u2063', name: 'invisible separator' },
  { char: '\u2064', name: 'invisible plus' }
];

// Markdown/HTML constructs that can smuggle content past a human reader.
const MARKDOWN_ATTACK_PATTERNS = [
  {
    regex: /!\[([^\]]{50,})\]\(/g,
    description: 'Oversized image alt text (possible hidden instructions)',
    severity: 'medium'
  },
  {
    regex: /\[([^\]]*)\]\(javascript:/gi,
    description: 'JavaScript URI in markdown link',
    severity: 'high'
  },
  {
    regex: /\[([^\]]*)\]\(data:/gi,
    description: 'Data URI in markdown link',
    severity: 'medium'
  },
  {
    regex: /<!--[\s\S]{20,}?-->/g,
    description: 'Large HTML comment block (possible hidden content)',
    severity: 'medium'
  }
];

/**
 * Scans text extracted from documents for indirect prompt injection attacks.
 * These are attacks where malicious instructions are hidden inside documents
 * that an AI agent will process.
 */
class IndirectInjectionScanner {
  /**
   * Create an IndirectInjectionScanner.
   * @param {Object} [options={}] - Scanner options.
   * @param {string} [options.sensitivity='medium'] - Detection sensitivity.
   */
  constructor(options = {}) {
    this.sensitivity = options.sensitivity || 'medium';
  }

  /**
   * Scan extracted text for indirect prompt injection patterns.
   *
   * Runs four checks in order: instruction-like phrases, invisible characters,
   * markdown rendering attacks and Base64-encoded payloads. The risk score is
   * accumulated from every finding (before sensitivity filtering) and capped
   * at 100; only the returned `threats` list is filtered by sensitivity.
   *
   * @param {string} text - The extracted text to scan. Non-string or blank
   *   input yields an empty result.
   * @param {string} [source='document'] - The source description.
   * @returns {{ threats: Array, hiddenContent: Array, riskScore: number }}
   */
  scan(text, source = 'document') {
    const threats = [];
    const hiddenContent = [];

    if (typeof text !== 'string' || text.trim().length === 0) {
      return { threats, hiddenContent, riskScore: 0 };
    }

    let riskScore = 0;

    // 1. Hidden instructions disguised as data
    riskScore += this._collectPatternThreats(
      text, source, INSTRUCTION_PATTERNS, 'indirect_injection', threats,
      severity => (severity === 'critical' ? 40 : 20)
    );

    // 2. Invisible/zero-width characters (used to hide instructions)
    riskScore += this._collectInvisibleCharacters(text, source, threats, hiddenContent);

    // 3. Markdown rendering attacks
    riskScore += this._collectPatternThreats(
      text, source, MARKDOWN_ATTACK_PATTERNS, 'markdown_injection', threats,
      severity => (severity === 'high' ? 15 : 10)
    );

    // 4. Encoded or obfuscated payloads
    riskScore += this._collectEncodedPayloads(text, source, threats);

    return {
      threats: this._filterBySensitivity(threats),
      hiddenContent,
      // Cap risk score at 100
      riskScore: Math.min(100, riskScore)
    };
  }

  /**
   * Record one threat per regex match for each pattern in a table.
   * @param {string} text - The text to search.
   * @param {string} source - The source description copied onto each threat.
   * @param {Array<{regex: RegExp, description: string, severity: string}>} patterns - Pattern table.
   * @param {string} type - The threat type to assign.
   * @param {Array} threats - Output list; findings are appended.
   * @param {function(string): number} scoreFor - Risk points for a given severity.
   * @returns {number} Risk points contributed by the matches.
   * @private
   */
  _collectPatternThreats(text, source, patterns, type, threats, scoreFor) {
    let score = 0;
    for (const pattern of patterns) {
      // String#match with a global regex returns every match and does not
      // depend on the regex's lastIndex, so the shared tables are safe to reuse.
      const matches = text.match(pattern.regex) || [];
      for (const match of matches) {
        threats.push({
          type,
          severity: pattern.severity,
          description: pattern.description,
          source,
          match: match.substring(0, 200)
        });
        score += scoreFor(pattern.severity);
      }
    }
    return score;
  }

  /**
   * Count invisible characters and report them once more than five are found.
   * @param {string} text - The text to inspect.
   * @param {string} source - The source description copied onto the threat.
   * @param {Array} threats - Output list; findings are appended.
   * @param {Array} hiddenContent - Output list; hidden-content notes are appended.
   * @returns {number} Risk points contributed (25 when reported, otherwise 0).
   * @private
   */
  _collectInvisibleCharacters(text, source, threats, hiddenContent) {
    let invisibleCount = 0;
    const foundInvisible = [];

    for (const { char, name } of INVISIBLE_CHARACTERS) {
      const count = text.split(char).length - 1;
      if (count > 0) {
        invisibleCount += count;
        foundInvisible.push({ name, count });
      }
    }

    if (invisibleCount <= 5) {
      return 0;
    }

    hiddenContent.push({
      type: 'invisible_characters',
      description: `Found ${invisibleCount} invisible/zero-width characters`,
      details: foundInvisible
    });

    // Keep only the zero-width characters listed in this character class to
    // estimate how much data could be encoded in them.
    const invisibleOnly = text.replace(/[^\u200B\u200C\u200D\u2060\uFEFF]/g, '');
    if (invisibleOnly.length > 10) {
      hiddenContent.push({
        type: 'steganographic_content',
        description: 'Possible steganographic content via invisible characters',
        length: invisibleOnly.length
      });
    }

    threats.push({
      type: 'hidden_content',
      severity: 'high',
      description: `Suspicious invisible characters detected (${invisibleCount} found)`,
      source,
      details: foundInvisible
    });
    return 25;
  }

  /**
   * Decode Base64-looking runs and report those whose decoded text contains
   * instruction-like keywords.
   * @param {string} text - The text to inspect.
   * @param {string} source - The source description copied onto each threat.
   * @param {Array} threats - Output list; findings are appended.
   * @returns {number} Risk points contributed (30 per finding).
   * @private
   */
  _collectEncodedPayloads(text, source, threats) {
    const base64Regex = /(?:[A-Za-z0-9+/]{4}){10,}(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?/g;
    const candidates = text.match(base64Regex) || [];
    let score = 0;

    for (const candidate of candidates) {
      // Buffer.from(..., 'base64') never throws on malformed input, so no
      // try/catch is needed; garbage simply fails the keyword test below.
      const decoded = Buffer.from(candidate, 'base64').toString('utf-8');
      if (/(?:ignore|override|system|execute|admin|secret)/i.test(decoded)) {
        threats.push({
          type: 'encoded_injection',
          severity: 'high',
          description: 'Base64-encoded content contains suspicious instructions',
          source,
          decodedPreview: decoded.substring(0, 200)
        });
        score += 30;
      }
    }
    return score;
  }

  /**
   * Filter threats based on configured sensitivity.
   * 'low' keeps only critical and high findings, 'medium' drops low-severity
   * findings, and any other value returns everything.
   * @param {Array} threats - The threats to filter.
   * @returns {Array} Filtered threats.
   * @private
   */
  _filterBySensitivity(threats) {
    if (this.sensitivity === 'low') {
      return threats.filter(t => t.severity === 'critical' || t.severity === 'high');
    }
    if (this.sensitivity === 'medium') {
      return threats.filter(t => t.severity !== 'low');
    }
    // 'high' sensitivity = return everything
    return threats;
  }
}
550
+
551
+ // =========================================================================
552
+ // DOCUMENT SCANNER
553
+ // =========================================================================
554
+
555
/**
 * Scans documents for threats by extracting text and running it through
 * the Agent Shield detection engine. Designed to catch indirect prompt
 * injection attacks hidden in uploaded documents.
 */
class DocumentScanner {
  /**
   * Create a DocumentScanner.
   * @param {Object} [options={}] - Scanner options.
   * @param {string} [options.sensitivity='medium'] - Detection sensitivity ('low', 'medium', 'high').
   * @param {boolean} [options.logging=false] - Whether to log scan results.
   * @param {boolean} [options.scanForInjection=true] - Whether to run indirect injection scanning.
   */
  constructor(options = {}) {
    this.sensitivity = options.sensitivity || 'medium';
    this.logging = options.logging || false;
    // Enabled unless the caller passes exactly `false`
    this.scanForInjection = options.scanForInjection !== false;
    this.injectionScanner = new IndirectInjectionScanner({ sensitivity: this.sensitivity });
  }

  /**
   * Scan a file from disk. Reads the file, detects its type, extracts text,
   * and scans it for threats. A read failure is reported as a `scan_error`
   * threat with status 'caution' rather than thrown.
   * @param {string} filePath - Path to the file to scan.
   * @returns {{ fileType: string, textLength: number, threats: Array, status: string }}
   */
  scanFile(filePath) {
    const resolvedPath = path.resolve(filePath);
    const ext = path.extname(resolvedPath).toLowerCase();

    let buffer;
    try {
      buffer = fs.readFileSync(resolvedPath);
    } catch (err) {
      if (this.logging) {
        console.log(`[Agent Shield] Document scanner failed to read file: ${resolvedPath}`);
      }
      return {
        fileType: 'unknown',
        textLength: 0,
        threats: [{
          type: 'scan_error',
          severity: 'medium',
          description: `Failed to read file: ${err.message}`,
          source: resolvedPath
        }],
        status: 'caution'
      };
    }

    // Determine MIME type from extension first, fall back to content detection
    const mimeType = this._mimeFromExtension(ext) || TextExtractor.detect(buffer);

    if (this.logging) {
      console.log(`[Agent Shield] Scanning document: ${resolvedPath} (${mimeType})`);
    }

    return this.scanBuffer(buffer, mimeType, resolvedPath);
  }

  /**
   * Scan a Buffer with a known MIME type.
   *
   * The MIME type is normalized (parameters such as `; charset=utf-8` are
   * dropped and the value is lower-cased) before the extractor lookup, so
   * HTTP-style content types resolve to the right extractor.
   *
   * NOTE(review): unsupported types are returned with status 'safe' and no
   * threats even though nothing was scanned; callers that must fail closed
   * should check `fileType` against getSupportedTypes().
   *
   * @param {Buffer} buffer - The file contents.
   * @param {string} [mimeType] - The MIME type of the file. Auto-detected if not provided.
   * @param {string} [source='buffer'] - Source description for logging.
   * @returns {{ fileType: string, textLength: number, threats: Array, status: string }}
   */
  scanBuffer(buffer, mimeType, source = 'buffer') {
    if (!Buffer.isBuffer(buffer)) {
      return {
        fileType: 'unknown',
        textLength: 0,
        threats: [{
          type: 'scan_error',
          severity: 'medium',
          description: 'Input is not a valid Buffer',
          source
        }],
        status: 'caution'
      };
    }

    // Auto-detect if no usable MIME type provided
    const detectedType = this._normalizeMimeType(mimeType) || TextExtractor.detect(buffer);

    // Own-property check: a type such as "toString" or "constructor" must not
    // resolve to a function inherited from Object.prototype.
    const extractor = Object.prototype.hasOwnProperty.call(EXTRACTORS, detectedType)
      ? EXTRACTORS[detectedType]
      : undefined;

    if (!extractor) {
      if (this.logging) {
        console.log(`[Agent Shield] Unsupported file type: ${detectedType}`);
      }
      return {
        fileType: detectedType,
        textLength: 0,
        threats: [],
        status: 'safe'
      };
    }

    let extractedText;
    try {
      extractedText = extractor(buffer);
    } catch (err) {
      return {
        fileType: detectedType,
        textLength: 0,
        threats: [{
          type: 'extraction_error',
          severity: 'medium',
          description: `Failed to extract text: ${err.message}`,
          source
        }],
        status: 'caution'
      };
    }

    return this.scanText(extractedText, { source, fileType: detectedType });
  }

  /**
   * Scan pre-extracted text with source metadata.
   *
   * Combines the core detector's findings with the indirect-injection
   * scanner's findings, removes duplicates (same type/category and
   * description), sorts by severity and derives an overall status:
   * 'danger' (any critical), 'warning' (any high), 'caution' (any medium),
   * otherwise 'safe'.
   *
   * @param {string} text - The extracted text to scan. Non-string or blank
   *   input yields an empty 'safe' result.
   * @param {Object} [metadata={}] - Source metadata.
   * @param {string} [metadata.source='text'] - Where the text came from.
   * @param {string} [metadata.fileType='text/plain'] - The original file type.
   * @returns {{ fileType: string, textLength: number, threats: Array, status: string }}
   */
  scanText(text, metadata = {}) {
    const source = metadata.source || 'text';
    const fileType = metadata.fileType || 'text/plain';

    if (typeof text !== 'string' || text.trim().length === 0) {
      return {
        fileType,
        textLength: 0,
        threats: [],
        status: 'safe'
      };
    }

    // Run core threat detection (module-level scanText from ./detector-core)
    const coreResult = scanText(text, {
      source: `document:${source}`,
      sensitivity: this.sensitivity
    });

    // Combine threats from core detection
    let allThreats = [...coreResult.threats];

    // Run indirect injection scanning
    if (this.scanForInjection) {
      const injectionResult = this.injectionScanner.scan(text, source);
      allThreats = allThreats.concat(injectionResult.threats);

      // Add hidden content as threats if found
      for (const hidden of injectionResult.hiddenContent) {
        allThreats.push({
          type: 'hidden_content',
          severity: 'medium',
          description: hidden.description,
          source,
          details: hidden
        });
      }
    }

    // Deduplicate threats by type/category plus description (first one wins)
    const seen = new Set();
    allThreats = allThreats.filter(t => {
      const key = `${t.type || t.category}:${t.description}`;
      if (seen.has(key)) return false;
      seen.add(key);
      return true;
    });

    // Sort by severity; severities missing from SEVERITY_ORDER rank as 3
    const rank = severity => (SEVERITY_ORDER[severity] !== undefined ? SEVERITY_ORDER[severity] : 3);
    allThreats.sort((a, b) => rank(a.severity) - rank(b.severity));

    // Determine overall status
    let status = 'safe';
    if (allThreats.some(t => t.severity === 'critical')) status = 'danger';
    else if (allThreats.some(t => t.severity === 'high')) status = 'warning';
    else if (allThreats.some(t => t.severity === 'medium')) status = 'caution';

    if (this.logging) {
      console.log(`[Agent Shield] Document scan complete: ${allThreats.length} threat(s), status=${status}`);
    }

    return {
      fileType,
      textLength: text.length,
      threats: allThreats,
      status
    };
  }

  /**
   * Returns the list of supported file MIME types.
   * @returns {string[]} A fresh array of supported MIME types.
   */
  getSupportedTypes() {
    return [...SUPPORTED_TYPES];
  }

  /**
   * Reduce a MIME type to its bare lower-case `type/subtype` form.
   * @param {*} mimeType - The caller-supplied MIME type.
   * @returns {string} The normalized type, or '' when none was supplied.
   * @private
   */
  _normalizeMimeType(mimeType) {
    if (typeof mimeType !== 'string') {
      return '';
    }
    return mimeType.split(';')[0].trim().toLowerCase();
  }

  /**
   * Map file extension to MIME type.
   * @param {string} ext - The lower-case file extension including the dot (e.g. '.html').
   * @returns {string|null} The MIME type, or null if unknown.
   * @private
   */
  _mimeFromExtension(ext) {
    const map = {
      '.txt': 'text/plain',
      '.text': 'text/plain',
      '.log': 'text/plain',
      '.html': 'text/html',
      '.htm': 'text/html',
      '.csv': 'text/csv',
      '.json': 'application/json',
      '.md': 'text/markdown',
      '.markdown': 'text/markdown',
      '.xml': 'application/xml',
      '.svg': 'application/xml',
      '.xhtml': 'text/html'
    };
    return map[ext] || null;
  }
}
790
+
791
+ // =========================================================================
792
+ // EXPORTS
793
+ // =========================================================================
794
+
795
+ module.exports = { DocumentScanner, TextExtractor, IndirectInjectionScanner };