npm - @bryan-thompson/inspector-assessment-client - Versions diffs - 1.25.1 → 1.25.5 - Mend

@bryan-thompson/inspector-assessment-client 1.25.1 → 1.25.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (132) hide show

package/lib/services/assessment/modules/annotations/DescriptionAnalyzer.js ADDED Viewed

@@ -0,0 +1,304 @@
+/**
+ * Description Analyzer
+ *
+ * Analyzes tool descriptions for behavioral keywords to infer expected behavior.
+ * This provides a more robust inference than name-pattern matching alone.
+ *
+ * Part of Issue #57: Architecture detection and behavior inference modules
+ */
+/**
+ * Behavioral keyword lists, grouped by category (readOnly, destructive, write)
+ * and, within each category, by confidence tier.
+ *
+ * Keywords in 'high' have strong semantic association with the behavior.
+ * Keywords in 'medium' are indicative but may have context-dependent meanings.
+ * Keywords in 'low' are weak indicators.
+ *
+ * findKeywordMatches() scores a match at 90 (high), 70 (medium) or 50 (low).
+ * The has*Indicators() quick checks consult only the 'high' and 'medium' tiers.
+ *
+ * All entries are lowercase; descriptions are lowercased before matching.
+ */
+export const DESCRIPTION_BEHAVIOR_KEYWORDS = {
+    readOnly: {
+        high: [
+            "retrieves",
+            "returns",
+            "lists",
+            "shows",
+            "displays",
+            "queries",
+            "searches",
+            "finds",
+            "looks up",
+            "fetches",
+        ],
+        medium: [
+            "gets",
+            "reads",
+            "views",
+            "checks",
+            "verifies",
+            "validates",
+            "inspects",
+            "examines",
+            "browses",
+            "previews",
+        ],
+        low: [
+            "accesses",
+            "obtains",
+            "provides",
+            "outputs",
+            "prints",
+            "counts",
+            "measures",
+            "calculates",
+        ],
+    },
+    destructive: {
+        high: [
+            "deletes",
+            "removes",
+            "destroys",
+            "drops",
+            "purges",
+            "wipes",
+            "clears",
+            "erases",
+            "permanently",
+            "irreversible",
+            "archives", // soft-delete euphemism
+            "terminated", // forceful process ending (the verb form "terminates" sits in the medium tier)
+        ],
+        medium: [
+            "truncates",
+            "kills",
+            "terminates",
+            "revokes",
+            "cancels",
+            "uninstalls",
+            "dismounts",
+            "detaches",
+            "marks", // intended for phrases like "marks as deleted/archived"; NOTE(review): the combination itself is not checked by the matcher
+        ],
+        low: ["resets", "restores to default", "cleans", "cleanup"],
+    },
+    write: {
+        high: [
+            "creates",
+            "inserts",
+            "adds",
+            "generates",
+            "produces",
+            "makes",
+            "builds",
+        ],
+        medium: [
+            "updates",
+            "modifies",
+            "changes",
+            "edits",
+            "sets",
+            "puts",
+            "patches",
+            "appends",
+            "extends",
+            "increments", // modify operation
+            "decrements", // modify operation
+        ],
+        low: [
+            "saves",
+            "stores",
+            "writes",
+            "posts",
+            "sends",
+            "submits",
+            "publishes",
+            "uploads",
+            "exports",
+        ],
+    },
+};
+/**
+ * Negation patterns that might invert the meaning of keywords.
+ * E.g., "does not delete" should not be marked as destructive.
+ *
+ * isNegated() tests each pattern against the text that precedes a keyword
+ * match, so a hit anywhere in that preceding window marks the keyword as
+ * negated. Neither pattern carries the global flag, so RegExp.test() keeps no
+ * state between calls.
+ */
+const NEGATION_PATTERNS = [
+    /\b(does\s+not|doesn't|do\s+not|don't|cannot|can't|will\s+not|won't|never|without)\s+/i, // generic negator followed by whitespace
+    /\bnot\s+(delete|remove|destroy|modify|change|create|update)/i, // "not" directly followed by one of these base verbs
+];
+/**
+ * Threshold for write signals to override read-only classification.
+ * Write signals at 50%+ of read-only score indicate mixed operation tools
+ * (e.g., "fetch and update" should classify as write, not read-only).
+ *
+ * Expressed as a fraction of the read-only score: analyzeDescription() lets
+ * write win when writeScore >= readOnlyScore * WRITE_OVERRIDE_THRESHOLD.
+ */
+const WRITE_OVERRIDE_THRESHOLD = 0.5;
+/**
+ * Check if a keyword match is negated by surrounding context.
+ *
+ * @param description - Full description text
+ * @param keywordIndex - Index where keyword was found
+ * @param windowSize - Characters before keyword to check for negation
+ * @returns True if the keyword is negated
+ */
+function isNegated(description, keywordIndex, windowSize = 60) {
+    const start = Math.max(0, keywordIndex - windowSize);
+    const contextBefore = description.slice(start, keywordIndex);
+    for (const pattern of NEGATION_PATTERNS) {
+        if (pattern.test(contextBefore)) {
+            return true;
+        }
+    }
+    return false;
+}
+/**
+ * Find keyword matches in description with confidence levels.
+ *
+ * @param description - Tool description to analyze
+ * @param keywords - Keyword object with high/medium/low arrays
+ * @returns Array of matches with confidence scores
+ */
+function findKeywordMatches(description, keywords) {
+    const matches = [];
+    const lowerDesc = description.toLowerCase();
+    const searchKeywords = (keywordList, confidence) => {
+        for (const keyword of keywordList) {
+            // Create a regex pattern that matches the keyword as a word
+            const pattern = new RegExp(`\\b${keyword.replace(/\s+/g, "\\s+")}`, "gi");
+            let match;
+            while ((match = pattern.exec(lowerDesc)) !== null) {
+                const negated = isNegated(lowerDesc, match.index);
+                matches.push({ keyword, confidence, negated });
+            }
+        }
+    };
+    searchKeywords(keywords.high, 90);
+    searchKeywords(keywords.medium, 70);
+    searchKeywords(keywords.low, 50);
+    return matches;
+}
+/**
+ * Analyze a tool description for behavioral signals.
+ *
+ * @param description - Tool description to analyze
+ * @returns InferenceSignal with read-only/destructive expectations
+ */
+export function analyzeDescription(description) {
+    if (!description || description.trim().length === 0) {
+        return {
+            expectedReadOnly: false,
+            expectedDestructive: false,
+            confidence: 0,
+            evidence: ["No description provided"],
+        };
+    }
+    // Find all keyword matches for each category
+    const readOnlyMatches = findKeywordMatches(description, DESCRIPTION_BEHAVIOR_KEYWORDS.readOnly);
+    const destructiveMatches = findKeywordMatches(description, DESCRIPTION_BEHAVIOR_KEYWORDS.destructive);
+    const writeMatches = findKeywordMatches(description, DESCRIPTION_BEHAVIOR_KEYWORDS.write);
+    // Filter out negated matches for the primary behavior classification
+    const activeReadOnly = readOnlyMatches.filter((m) => !m.negated);
+    const activeDestructive = destructiveMatches.filter((m) => !m.negated);
+    const activeWrite = writeMatches.filter((m) => !m.negated);
+    // Calculate weighted scores for each category
+    const readOnlyScore = activeReadOnly.reduce((sum, m) => sum + m.confidence, 0);
+    const destructiveScore = activeDestructive.reduce((sum, m) => sum + m.confidence, 0);
+    const writeScore = activeWrite.reduce((sum, m) => sum + m.confidence, 0);
+    // Determine the dominant behavior
+    const evidence = [];
+    let expectedReadOnly = false;
+    let expectedDestructive = false;
+    let confidence = 0;
+    // Destructive takes priority if detected with high confidence
+    if (destructiveScore > 0) {
+        expectedDestructive = true;
+        confidence = Math.min(100, destructiveScore);
+        evidence.push(`Destructive keywords: ${activeDestructive.map((m) => m.keyword).join(", ")}`);
+    }
+    // Read-only detection (only if not destructive)
+    if (readOnlyScore > 0 && !expectedDestructive) {
+        // Write operations take precedence when present (even if equal score)
+        // Multi-operation tools like "fetch and update" should be classified as write
+        if (writeScore > 0 &&
+            writeScore >= readOnlyScore * WRITE_OVERRIDE_THRESHOLD) {
+            // Write signal is significant enough to override read-only
+            confidence = Math.min(100, writeScore);
+            evidence.push(`Write keywords override read: ${activeWrite.map((m) => m.keyword).join(", ")}`);
+        }
+        else if (readOnlyScore > writeScore) {
+            expectedReadOnly = true;
+            confidence = Math.min(100, readOnlyScore);
+            evidence.push(`Read-only keywords: ${activeReadOnly.map((m) => m.keyword).join(", ")}`);
+        }
+    }
+    // Pure write operation (no read-only indicators)
+    if (!expectedReadOnly && !expectedDestructive && writeScore > 0) {
+        confidence = Math.min(100, writeScore);
+        evidence.push(`Write keywords: ${activeWrite.map((m) => m.keyword).join(", ")}`);
+    }
+    // Add negation evidence if present
+    const negatedKeywords = [
+        ...readOnlyMatches.filter((m) => m.negated),
+        ...destructiveMatches.filter((m) => m.negated),
+        ...writeMatches.filter((m) => m.negated),
+    ];
+    if (negatedKeywords.length > 0) {
+        evidence.push(`Negated keywords ignored: ${negatedKeywords.map((m) => m.keyword).join(", ")}`);
+    }
+    // Default case: no signals
+    if (evidence.length === 0) {
+        evidence.push("No behavioral keywords detected in description");
+        confidence = 0;
+    }
+    return {
+        expectedReadOnly,
+        expectedDestructive,
+        confidence,
+        evidence,
+    };
+}
+/**
+ * Quick check if description contains read-only indicators.
+ * Useful for fast filtering before full analysis.
+ *
+ * @param description - Tool description to check
+ * @returns True if any read-only keywords are present
+ */
+export function hasReadOnlyIndicators(description) {
+    if (!description)
+        return false;
+    const lowerDesc = description.toLowerCase();
+    const allReadOnlyKeywords = [
+        ...DESCRIPTION_BEHAVIOR_KEYWORDS.readOnly.high,
+        ...DESCRIPTION_BEHAVIOR_KEYWORDS.readOnly.medium,
+    ];
+    return allReadOnlyKeywords.some((keyword) => lowerDesc.includes(keyword.toLowerCase()));
+}
+/**
+ * Quick check if description contains destructive indicators.
+ * Useful for fast filtering before full analysis.
+ *
+ * @param description - Tool description to check
+ * @returns True if any destructive keywords are present
+ */
+export function hasDestructiveIndicators(description) {
+    if (!description)
+        return false;
+    const lowerDesc = description.toLowerCase();
+    const allDestructiveKeywords = [
+        ...DESCRIPTION_BEHAVIOR_KEYWORDS.destructive.high,
+        ...DESCRIPTION_BEHAVIOR_KEYWORDS.destructive.medium,
+    ];
+    return allDestructiveKeywords.some((keyword) => lowerDesc.includes(keyword.toLowerCase()));
+}
+/**
+ * Quick check if description contains write indicators.
+ * Useful for fast filtering before full analysis.
+ *
+ * @param description - Tool description to check
+ * @returns True if any write keywords are present
+ */
+export function hasWriteIndicators(description) {
+    if (!description)
+        return false;
+    const lowerDesc = description.toLowerCase();
+    const allWriteKeywords = [
+        ...DESCRIPTION_BEHAVIOR_KEYWORDS.write.high,
+        ...DESCRIPTION_BEHAVIOR_KEYWORDS.write.medium,
+    ];
+    return allWriteKeywords.some((keyword) => lowerDesc.includes(keyword.toLowerCase()));
+}

package/lib/services/assessment/modules/annotations/DescriptionPoisoningDetector.d.ts ADDED Viewed

@@ -0,0 +1,43 @@
+/**
+ * Description Poisoning Detector
+ * Detects hidden instructions and malicious content in tool descriptions
+ *
+ * Extracted from ToolAnnotationAssessor.ts for maintainability.
+ * Issue #8 implementation.
+ */
+import type { Tool } from "@modelcontextprotocol/sdk/types.js";
+/**
+ * Tool description poisoning pattern definition
+ *
+ * - name: identifier of the pattern, copied into each reported match
+ * - pattern: regular expression run against the tool description
+ * - severity: severity reported for every match of this pattern
+ * - category: label grouping the pattern with others of the same kind
+ */
+export interface PoisoningPattern {
+    name: string;
+    pattern: RegExp;
+    severity: "LOW" | "MEDIUM" | "HIGH";
+    category: string;
+}
+/**
+ * Result of description poisoning scan
+ *
+ * - detected: true when at least one pattern matched
+ * - patterns: one entry per match; `pattern` is the regular expression
+ *   rendered as a string and `evidence` is the matched text, cut to its first
+ *   100 characters followed by "..." when the match is longer
+ * - riskLevel: highest severity among the matches, "NONE" when nothing matched
+ */
+export interface PoisoningScanResult {
+    detected: boolean;
+    patterns: Array<{
+        name: string;
+        pattern: string;
+        severity: "LOW" | "MEDIUM" | "HIGH";
+        category: string;
+        evidence: string;
+    }>;
+    riskLevel: "NONE" | "LOW" | "MEDIUM" | "HIGH";
+}
+/**
+ * Description poisoning patterns for detecting malicious tool descriptions
+ * Covers: hidden instructions, override commands, concealment, exfiltration,
+ * delimiter injection, encoding bypass, and typoglycemia/evasion patterns
+ *
+ * Each entry pairs a regular expression with the name, severity and category
+ * that scanDescriptionForPoisoning() reports when the expression matches.
+ */
+export declare const DESCRIPTION_POISONING_PATTERNS: PoisoningPattern[];
+/**
+ * Scan tool description for poisoning patterns
+ * Detects hidden instructions, override commands, concealment, and exfiltration attempts
+ *
+ * @param tool - Tool whose `description` is scanned; a missing description is
+ *               treated as an empty string
+ * @returns Scan result listing every match together with the overall risk level
+ */
+export declare function scanDescriptionForPoisoning(tool: Tool): PoisoningScanResult;
+//# sourceMappingURL=DescriptionPoisoningDetector.d.ts.map

package/lib/services/assessment/modules/annotations/DescriptionPoisoningDetector.d.ts.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"DescriptionPoisoningDetector.d.ts","sourceRoot":"","sources":["../../../../../src/services/assessment/modules/annotations/DescriptionPoisoningDetector.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAEH,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,oCAAoC,CAAC;AAE/D;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC/B,IAAI,EAAE,MAAM,CAAC;IACb,OAAO,EAAE,MAAM,CAAC;IAChB,QAAQ,EAAE,KAAK,GAAG,QAAQ,GAAG,MAAM,CAAC;IACpC,QAAQ,EAAE,MAAM,CAAC;CAClB;AAED;;GAEG;AACH,MAAM,WAAW,mBAAmB;IAClC,QAAQ,EAAE,OAAO,CAAC;IAClB,QAAQ,EAAE,KAAK,CAAC;QACd,IAAI,EAAE,MAAM,CAAC;QACb,OAAO,EAAE,MAAM,CAAC;QAChB,QAAQ,EAAE,KAAK,GAAG,QAAQ,GAAG,MAAM,CAAC;QACpC,QAAQ,EAAE,MAAM,CAAC;QACjB,QAAQ,EAAE,MAAM,CAAC;KAClB,CAAC,CAAC;IACH,SAAS,EAAE,MAAM,GAAG,KAAK,GAAG,QAAQ,GAAG,MAAM,CAAC;CAC/C;AAED;;;;GAIG;AACH,eAAO,MAAM,8BAA8B,EAAE,gBAAgB,EA2O5D,CAAC;AAEF;;;GAGG;AACH,wBAAgB,2BAA2B,CAAC,IAAI,EAAE,IAAI,GAAG,mBAAmB,CA+C3E"}

package/lib/services/assessment/modules/annotations/DescriptionPoisoningDetector.js ADDED Viewed

@@ -0,0 +1,276 @@
+/**
+ * Description Poisoning Detector
+ * Detects hidden instructions and malicious content in tool descriptions
+ *
+ * Extracted from ToolAnnotationAssessor.ts for maintainability.
+ * Issue #8 implementation.
+ */
+/**
+ * Description poisoning patterns for detecting malicious tool descriptions
+ * Covers: hidden instructions, override commands, concealment, exfiltration,
+ * delimiter injection, encoding bypass, and typoglycemia/evasion patterns
+ *
+ * Flag note: scanDescriptionForPoisoning() reports every occurrence for
+ * patterns carrying the `g` flag, but only the first occurrence for patterns
+ * without it.
+ */
+export const DESCRIPTION_POISONING_PATTERNS = [
+    // Hidden instruction tags
+    {
+        name: "hidden_tag",
+        pattern: /<HIDDEN>[\s\S]*?<\/HIDDEN>/gi,
+        severity: "HIGH",
+        category: "hidden_instructions",
+    },
+    {
+        name: "important_tag",
+        pattern: /<IMPORTANT>[\s\S]*?<\/IMPORTANT>/gi,
+        severity: "HIGH",
+        category: "hidden_instructions",
+    },
+    {
+        name: "system_tag",
+        pattern: /<SYSTEM>[\s\S]*?<\/SYSTEM>/gi,
+        severity: "HIGH",
+        category: "hidden_instructions",
+    },
+    {
+        name: "instruction_tag",
+        pattern: /<INSTRUCTION>[\s\S]*?<\/INSTRUCTION>/gi,
+        severity: "HIGH",
+        category: "hidden_instructions",
+    },
+    // Override commands
+    {
+        name: "ignore_instructions",
+        pattern: /ignore\s+(all\s+)?(previous|prior|above)\s+instructions/i,
+        severity: "HIGH",
+        category: "override",
+    },
+    {
+        name: "disregard_instructions",
+        pattern: /disregard\s+(all\s+)?(previous|prior)\s+instructions/i,
+        severity: "HIGH",
+        category: "override",
+    },
+    {
+        name: "you_are_now",
+        pattern: /you\s+are\s+now\s+(a|an|the)/i,
+        severity: "MEDIUM",
+        category: "override",
+    },
+    {
+        name: "system_override",
+        pattern: /system:\s*override/i,
+        severity: "HIGH",
+        category: "override",
+    },
+    // Secrecy/concealment
+    {
+        name: "do_not_mention",
+        pattern: /do\s+not\s+(mention|tell|inform|reveal)/i,
+        severity: "HIGH",
+        category: "concealment",
+    },
+    {
+        name: "secretly",
+        pattern: /\bsecretly\b/i,
+        severity: "MEDIUM",
+        category: "concealment",
+    },
+    {
+        name: "hide_from_user",
+        pattern: /hide\s+(this\s+)?from\s+(the\s+)?user/i,
+        severity: "HIGH",
+        category: "concealment",
+    },
+    // Data exfiltration
+    {
+        name: "return_keys",
+        pattern: /return\s+(all\s+)?(api\s*)?keys/i,
+        severity: "HIGH",
+        category: "exfiltration",
+    },
+    {
+        name: "include_credentials",
+        // More specific: requires directive context (in/with/when/to) to reduce false positives
+        pattern: /include\s+(the\s+|all\s+|your\s+)?(api.?key|credential|password|secret)s?\s*(in|with|when|to)/i,
+        severity: "HIGH",
+        category: "exfiltration",
+    },
+    {
+        name: "reveal_secrets",
+        pattern: /reveal\s+(all\s+)?(secrets|credentials|api\s*keys)/i,
+        severity: "HIGH",
+        category: "exfiltration",
+    },
+    {
+        name: "access_internal_secrets",
+        pattern: /access\s+(the\s+)?internal\s+(api\s*)?(key|secret|credential|password|token)/i,
+        severity: "HIGH",
+        category: "exfiltration",
+    },
+    // Delimiter injection
+    {
+        name: "system_codeblock",
+        pattern: /```system[\s\S]*?```/gi,
+        severity: "HIGH",
+        category: "delimiter",
+    },
+    {
+        name: "inst_tags",
+        pattern: /\[INST\][\s\S]*?\[\/INST\]/gi,
+        severity: "HIGH",
+        category: "delimiter",
+    },
+    {
+        name: "chatml_system",
+        pattern: /<\|im_start\|>system/gi,
+        severity: "HIGH",
+        category: "delimiter",
+    },
+    {
+        name: "llama_sys",
+        pattern: /<<SYS>>/gi,
+        severity: "HIGH",
+        category: "delimiter",
+    },
+    {
+        name: "user_assistant_block",
+        pattern: /\[USER\][\s\S]*?\[ASSISTANT\]/gi,
+        severity: "HIGH",
+        category: "delimiter",
+    },
+    // Role/persona injection (Warning #4)
+    {
+        name: "act_as",
+        pattern: /act\s+(like|as)\s+(a|an|the)/i,
+        severity: "MEDIUM",
+        category: "override",
+    },
+    {
+        name: "pretend_to_be",
+        pattern: /pretend\s+(to\s+be|you\s*'?re)/i,
+        severity: "MEDIUM",
+        category: "override",
+    },
+    {
+        name: "roleplay_as",
+        pattern: /role\s*play\s+(as|like)/i,
+        severity: "MEDIUM",
+        category: "override",
+    },
+    {
+        name: "new_task",
+        pattern: /new\s+(task|instruction|objective):\s*/i,
+        severity: "HIGH",
+        category: "override",
+    },
+    // Encoding bypass detection (Warning #1)
+    {
+        name: "base64_encoded_block",
+        pattern: /[A-Za-z0-9+/]{50,}={0,2}/g, // Large Base64 strings (50+ chars); NOTE(review): also matches any other 50+ character run of letters, digits, "+" or "/"
+        severity: "MEDIUM",
+        category: "encoding_bypass",
+    },
+    {
+        name: "unicode_escape_sequence",
+        pattern: /(?:\\u[0-9a-fA-F]{4}){3,}/gi, // 3+ consecutive Unicode escapes
+        severity: "MEDIUM",
+        category: "encoding_bypass",
+    },
+    {
+        name: "html_entity_block",
+        pattern: /(?:&#x?[0-9a-fA-F]+;){3,}/gi, // 3+ consecutive HTML entities
+        severity: "MEDIUM",
+        category: "encoding_bypass",
+    },
+    // Typoglycemia/evasion patterns (Warning #2): tolerate digit-for-letter swaps (0 for o, 1 for i, 3 for e, 4 for a)
+    {
+        name: "ignore_instructions_typo",
+        pattern: /ign[o0]r[e3]?\s+(all\s+)?(pr[e3]v[i1][o0]us|pr[i1][o0]r|ab[o0]v[e3])\s+[i1]nstruct[i1][o0]ns?/i,
+        severity: "HIGH",
+        category: "override",
+    },
+    {
+        name: "disregard_typo",
+        pattern: /d[i1]sr[e3]g[a4]rd\s+(all\s+)?(pr[e3]v[i1][o0]us|pr[i1][o0]r)\s+[i1]nstruct[i1][o0]ns?/i,
+        severity: "HIGH",
+        category: "override",
+    },
+    // DVMCP-specific patterns (CTF testbed coverage)
+    {
+        name: "override_auth_protocol",
+        pattern: /override[-_]?auth[-_]?protocol/gi,
+        severity: "HIGH",
+        category: "auth_bypass",
+    },
+    {
+        name: "internal_resource_uri",
+        pattern: /(?:internal|system|admin|secret|company|confidential):\/\/[a-z_\-/]+/gi,
+        severity: "HIGH",
+        category: "resource_access",
+    },
+    {
+        name: "get_secrets_call",
+        pattern: /get[-_]?secrets?\s*\(/gi,
+        severity: "HIGH",
+        category: "secret_access",
+    },
+    {
+        name: "master_password",
+        pattern: /master[-_]?password/gi,
+        severity: "HIGH",
+        category: "credential_exposure",
+    },
+    {
+        name: "access_confidential",
+        pattern: /access\s+(the\s+)?confidential/gi,
+        severity: "HIGH",
+        category: "exfiltration",
+    },
+    {
+        name: "hidden_trigger_phrase",
+        pattern: /if\s+(the\s+)?(query|input|text)\s+contains\s+(the\s+)?(exact\s+)?(phrase|word)/gi,
+        severity: "HIGH",
+        category: "hidden_trigger",
+    },
+];
+/**
+ * Scan tool description for poisoning patterns
+ * Detects hidden instructions, override commands, concealment, and exfiltration attempts
+ */
+export function scanDescriptionForPoisoning(tool) {
+    const description = tool.description || "";
+    const matches = [];
+    for (const patternDef of DESCRIPTION_POISONING_PATTERNS) {
+        // Create a fresh regex to reset lastIndex
+        const regex = new RegExp(patternDef.pattern.source, patternDef.pattern.flags);
+        // Loop to find all matches (not just first)
+        let match;
+        while ((match = regex.exec(description)) !== null) {
+            matches.push({
+                name: patternDef.name,
+                pattern: patternDef.pattern.toString(),
+                severity: patternDef.severity,
+                category: patternDef.category,
+                evidence: match[0].substring(0, 100) + (match[0].length > 100 ? "..." : ""),
+            });
+            // Prevent infinite loop for patterns without 'g' flag
+            if (!regex.global)
+                break;
+        }
+    }
+    // Determine overall risk level based on highest severity match
+    let riskLevel = "NONE";
+    if (matches.some((m) => m.severity === "HIGH")) {
+        riskLevel = "HIGH";
+    }
+    else if (matches.some((m) => m.severity === "MEDIUM")) {
+        riskLevel = "MEDIUM";
+    }
+    else if (matches.length > 0) {
+        riskLevel = "LOW";
+    }
+    return {
+        detected: matches.length > 0,
+        patterns: matches,
+        riskLevel,
+    };
+}