llm_guardrail 2.0.2 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/index.js +115 -21
- package/package.json +1 -1
package/index.js
CHANGED
|
@@ -6,51 +6,61 @@ import { LogisticRegression } from './model/logistic_regression.js';
|
|
|
6
6
|
|
|
7
7
|
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
|
8
8
|
|
|
9
|
-
//
|
|
10
|
-
|
|
11
|
-
|
|
9
|
+
// Cache for loaded models
|
|
10
|
+
const models = {
|
|
11
|
+
prompt_injection: null,
|
|
12
|
+
jailbreak: null,
|
|
13
|
+
malicious: null
|
|
14
|
+
};
|
|
12
15
|
|
|
13
|
-
|
|
14
|
-
|
|
16
|
+
/**
|
|
17
|
+
* Load a specific model by name
|
|
18
|
+
* @param {string} modelName - 'prompt_injection', 'jailbreak', or 'malicious'
|
|
19
|
+
*/
|
|
20
|
+
function loadModel(modelName) {
|
|
21
|
+
if (models[modelName]) {
|
|
22
|
+
return models[modelName];
|
|
23
|
+
}
|
|
15
24
|
|
|
16
25
|
try {
|
|
17
|
-
const modelPath = path.join(__dirname, 'model',
|
|
26
|
+
const modelPath = path.join(__dirname, 'model', `${modelName}_model.json`);
|
|
18
27
|
const modelData = JSON.parse(fs.readFileSync(modelPath, 'utf8'));
|
|
19
28
|
|
|
20
|
-
vectorizer = new TfidfVectorizer(modelData);
|
|
21
|
-
model = new LogisticRegression(modelData);
|
|
29
|
+
const vectorizer = new TfidfVectorizer(modelData);
|
|
30
|
+
const model = new LogisticRegression(modelData);
|
|
22
31
|
|
|
23
|
-
|
|
32
|
+
models[modelName] = { vectorizer, model };
|
|
33
|
+
return models[modelName];
|
|
24
34
|
} catch (error) {
|
|
25
|
-
throw new Error(`Failed to load model: ${error.message}`);
|
|
35
|
+
throw new Error(`Failed to load ${modelName} model: ${error.message}`);
|
|
26
36
|
}
|
|
27
37
|
}
|
|
28
38
|
|
|
29
|
-
|
|
39
|
+
/**
|
|
40
|
+
* Generic check function for any model
|
|
41
|
+
* @param {string} prompt - The text to check
|
|
42
|
+
* @param {string} modelName - 'prompt_injection', 'jailbreak', or 'malicious'
|
|
43
|
+
*/
|
|
44
|
+
async function checkWithModel(prompt, modelName) {
|
|
30
45
|
return new Promise((resolve, reject) => {
|
|
31
46
|
try {
|
|
32
47
|
if (typeof prompt !== "string") {
|
|
33
48
|
return reject(new Error("Prompt must be a string"));
|
|
34
49
|
}
|
|
35
50
|
|
|
36
|
-
|
|
37
|
-
const { model, vectorizer } = loadModel();
|
|
38
|
-
|
|
39
|
-
// Transform text to TF-IDF features
|
|
51
|
+
const { vectorizer, model } = loadModel(modelName);
|
|
40
52
|
const features = vectorizer.transform(prompt);
|
|
41
|
-
|
|
42
|
-
// Get prediction
|
|
43
53
|
const prediction = model.predict(features);
|
|
44
54
|
const { probabilities, positiveProb } = model.predictProba(features);
|
|
45
55
|
|
|
46
56
|
resolve({
|
|
47
|
-
allowed: prediction === 0,
|
|
48
|
-
|
|
57
|
+
allowed: prediction === 0,
|
|
58
|
+
detected: prediction === 1,
|
|
49
59
|
prediction: prediction,
|
|
50
60
|
confidence: positiveProb,
|
|
51
61
|
probabilities: {
|
|
52
62
|
safe: probabilities[0],
|
|
53
|
-
|
|
63
|
+
threat: probabilities[1]
|
|
54
64
|
}
|
|
55
65
|
});
|
|
56
66
|
} catch (error) {
|
|
@@ -59,4 +69,88 @@ export function check(prompt) {
|
|
|
59
69
|
});
|
|
60
70
|
}
|
|
61
71
|
|
|
62
|
-
|
|
72
|
+
/**
|
|
73
|
+
* Check for prompt injection attacks
|
|
74
|
+
* @param {string} prompt - The text to check
|
|
75
|
+
*/
|
|
76
|
+
export function checkInjection(prompt) {
|
|
77
|
+
return checkWithModel(prompt, 'prompt_injection');
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
/**
|
|
81
|
+
* Check for jailbreak attempts
|
|
82
|
+
* @param {string} prompt - The text to check
|
|
83
|
+
*/
|
|
84
|
+
export function checkJailbreak(prompt) {
|
|
85
|
+
return checkWithModel(prompt, 'jailbreak');
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
/**
|
|
89
|
+
* Check for malicious content
|
|
90
|
+
* @param {string} prompt - The text to check
|
|
91
|
+
*/
|
|
92
|
+
export function checkMalicious(prompt) {
|
|
93
|
+
return checkWithModel(prompt, 'malicious');
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
/**
|
|
97
|
+
* Run all three checks in parallel
|
|
98
|
+
* @param {string} prompt - The text to check
|
|
99
|
+
*/
|
|
100
|
+
export async function checkAll(prompt) {
|
|
101
|
+
try {
|
|
102
|
+
const [injection, jailbreak, malicious] = await Promise.all([
|
|
103
|
+
checkInjection(prompt),
|
|
104
|
+
checkJailbreak(prompt),
|
|
105
|
+
checkMalicious(prompt)
|
|
106
|
+
]);
|
|
107
|
+
|
|
108
|
+
// Calculate overall risk level
|
|
109
|
+
const threats = [
|
|
110
|
+
injection.detected ? injection.confidence : 0,
|
|
111
|
+
jailbreak.detected ? jailbreak.confidence : 0,
|
|
112
|
+
malicious.detected ? malicious.confidence : 0
|
|
113
|
+
];
|
|
114
|
+
|
|
115
|
+
const maxThreat = Math.max(...threats);
|
|
116
|
+
let overallRisk = 'safe';
|
|
117
|
+
if (maxThreat > 0.7) overallRisk = 'high';
|
|
118
|
+
else if (maxThreat > 0.4) overallRisk = 'medium';
|
|
119
|
+
else if (maxThreat > 0) overallRisk = 'low';
|
|
120
|
+
|
|
121
|
+
// Determine which threats were detected
|
|
122
|
+
const threatsDetected = [];
|
|
123
|
+
if (injection.detected) threatsDetected.push('injection');
|
|
124
|
+
if (jailbreak.detected) threatsDetected.push('jailbreak');
|
|
125
|
+
if (malicious.detected) threatsDetected.push('malicious');
|
|
126
|
+
|
|
127
|
+
return {
|
|
128
|
+
injection,
|
|
129
|
+
jailbreak,
|
|
130
|
+
malicious,
|
|
131
|
+
allowed: injection.allowed && jailbreak.allowed && malicious.allowed,
|
|
132
|
+
overallRisk,
|
|
133
|
+
maxThreatConfidence: maxThreat,
|
|
134
|
+
threatsDetected
|
|
135
|
+
};
|
|
136
|
+
} catch (error) {
|
|
137
|
+
throw error;
|
|
138
|
+
}
|
|
139
|
+
}
|
|
140
|
+
|
|
141
|
+
/**
|
|
142
|
+
* Backward compatibility - defaults to injection check
|
|
143
|
+
* @param {string} prompt - The text to check
|
|
144
|
+
* @deprecated Use checkInjection() instead for clarity
|
|
145
|
+
*/
|
|
146
|
+
export function check(prompt) {
|
|
147
|
+
return checkInjection(prompt);
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
export default {
|
|
151
|
+
check,
|
|
152
|
+
checkInjection,
|
|
153
|
+
checkJailbreak,
|
|
154
|
+
checkMalicious,
|
|
155
|
+
checkAll
|
|
156
|
+
};
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "llm_guardrail",
|
|
3
|
-
"version": "2.0
|
|
3
|
+
"version": "2.1.0",
|
|
4
4
|
"description": "A lightweight, low-latency ML-powered guardrail to stop prompt injection attacks before they reach your LLM.",
|
|
5
5
|
"homepage": "https://github.com/Frank2006x/llm_Guardrails#readme",
|
|
6
6
|
"bugs": {
|