@hawon/nexus 0.1.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +60 -38
- package/dist/cli/index.js +76 -145
- package/dist/index.js +15 -26
- package/dist/mcp/server.js +61 -32
- package/package.json +2 -1
- package/scripts/auto-skill.sh +54 -0
- package/scripts/auto-sync.sh +11 -0
- package/scripts/benchmark.ts +444 -0
- package/scripts/scan-tool-result.sh +46 -0
- package/src/cli/index.ts +79 -172
- package/src/index.ts +17 -29
- package/src/mcp/server.ts +67 -41
- package/src/memory-engine/index.ts +4 -6
- package/src/memory-engine/nexus-memory.test.ts +437 -0
- package/src/memory-engine/nexus-memory.ts +631 -0
- package/src/memory-engine/semantic.ts +380 -0
- package/src/parser/parse.ts +1 -21
- package/src/promptguard/advanced-rules.ts +129 -12
- package/src/promptguard/entropy.ts +21 -2
- package/src/promptguard/evolution/auto-update.ts +16 -6
- package/src/promptguard/multilingual-rules.ts +68 -0
- package/src/promptguard/rules.ts +87 -2
- package/src/promptguard/scanner.test.ts +262 -0
- package/src/promptguard/scanner.ts +1 -1
- package/src/promptguard/semantic.ts +19 -4
- package/src/promptguard/token-analysis.ts +17 -5
- package/src/review/analyzer.test.ts +279 -0
- package/src/review/analyzer.ts +112 -28
- package/src/shared/stop-words.ts +21 -0
- package/src/skills/index.ts +11 -27
- package/src/skills/memory-skill-engine.ts +1044 -0
- package/src/testing/health-check.ts +19 -2
- package/src/cost/index.ts +0 -3
- package/src/cost/tracker.ts +0 -290
- package/src/cost/types.ts +0 -34
- package/src/memory-engine/compressor.ts +0 -97
- package/src/memory-engine/context-window.ts +0 -113
- package/src/memory-engine/store.ts +0 -371
- package/src/memory-engine/types.ts +0 -32
- package/src/skills/context-engine.ts +0 -863
- package/src/skills/extractor.ts +0 -224
- package/src/skills/global-context.ts +0 -726
- package/src/skills/library.ts +0 -189
- package/src/skills/pattern-engine.ts +0 -712
- package/src/skills/render-evolved.ts +0 -160
- package/src/skills/skill-reconciler.ts +0 -703
- package/src/skills/smart-extractor.ts +0 -843
- package/src/skills/types.ts +0 -18
- package/src/skills/wisdom-extractor.ts +0 -737
- package/src/superdev-evolution/index.ts +0 -3
- package/src/superdev-evolution/skill-manager.ts +0 -266
- package/src/superdev-evolution/types.ts +0 -20
|
@@ -0,0 +1,380 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Zero-Dependency Semantic Similarity Engine
|
|
3
|
+
*
|
|
4
|
+
* Captures meaning without embeddings or external APIs.
|
|
5
|
+
*
|
|
6
|
+
* Three mechanisms:
|
|
7
|
+
*
|
|
8
|
+
* 1. SYNONYM GRAPH
|
|
9
|
+
* Hard-coded domain knowledge mapping related concepts.
|
|
10
|
+
* "deploy" ↔ "release" ↔ "publish" ↔ "배포" ↔ "ship"
|
|
11
|
+
* Bilingual: English ↔ Korean for dev terminology.
|
|
12
|
+
*
|
|
13
|
+
* 2. CO-OCCURRENCE PMI (Pointwise Mutual Information)
|
|
14
|
+
* Learns from the corpus itself: words that appear together
|
|
15
|
+
* more than expected by chance are semantically related.
|
|
16
|
+
* "authentication" frequently near "password" → related.
|
|
17
|
+
* No training, no model — pure statistics.
|
|
18
|
+
*
|
|
19
|
+
* 3. QUERY EXPANSION
|
|
20
|
+
* Expands search query with synonyms + co-occurrence neighbors.
|
|
21
|
+
* "deploy" → ["deploy", "release", "publish", "배포", "ship"]
|
|
22
|
+
* Then runs BM25 on expanded query → semantic matches found.
|
|
23
|
+
*/
|
|
24
|
+
|
|
25
|
+
// ═══════════════════════════════════════════════════════════════════
|
|
26
|
+
// SYNONYM GRAPH — Domain knowledge for dev/security concepts
|
|
27
|
+
// ═══════════════════════════════════════════════════════════════════
|
|
28
|
+
|
|
29
|
+
/** Each group = mutually synonymous concepts. */
|
|
30
|
+
const SYNONYM_GROUPS: string[][] = [
|
|
31
|
+
// Actions
|
|
32
|
+
["deploy", "release", "publish", "ship", "배포", "출시", "docker", "container", "컨테이너", "도커"],
|
|
33
|
+
["install", "setup", "설치", "설정", "configure", "config"],
|
|
34
|
+
["fix", "repair", "patch", "고치", "수정", "resolve"],
|
|
35
|
+
["create", "build", "make", "만들", "생성", "구현", "implement"],
|
|
36
|
+
["delete", "remove", "drop", "삭제", "제거"],
|
|
37
|
+
["update", "upgrade", "업데이트", "업그레이드", "갱신"],
|
|
38
|
+
["test", "verify", "validate", "테스트", "검증", "확인"],
|
|
39
|
+
["debug", "troubleshoot", "디버그", "디버깅"],
|
|
40
|
+
["refactor", "restructure", "리팩토", "리팩토링", "정리"],
|
|
41
|
+
["optimize", "improve", "최적화", "개선", "성능향상"],
|
|
42
|
+
["analyze", "inspect", "examine", "분석", "조사", "검사"],
|
|
43
|
+
["review", "audit", "리뷰", "검토", "감사"],
|
|
44
|
+
["search", "find", "lookup", "query", "검색", "찾기", "조회"],
|
|
45
|
+
["commit", "push", "submit", "커밋", "푸시", "제출"],
|
|
46
|
+
["merge", "combine", "병합", "합치"],
|
|
47
|
+
["migrate", "transfer", "port", "마이그레이션", "이전"],
|
|
48
|
+
|
|
49
|
+
// Errors & Issues
|
|
50
|
+
["error", "bug", "issue", "defect", "fault", "에러", "버그", "오류", "결함"],
|
|
51
|
+
["crash", "fail", "failure", "down", "크래시", "실패", "다운"],
|
|
52
|
+
["vulnerability", "exploit", "weakness", "flaw", "취약점", "취약성", "약점"],
|
|
53
|
+
["warning", "caution", "alert", "경고", "주의"],
|
|
54
|
+
["exception", "throw", "예외"],
|
|
55
|
+
|
|
56
|
+
// Security
|
|
57
|
+
["security", "safety", "보안", "안전"],
|
|
58
|
+
["injection", "인젝션", "주입"],
|
|
59
|
+
["authentication", "auth", "login", "signin", "인증", "로그인"],
|
|
60
|
+
["authorization", "permission", "access", "권한", "인가", "접근"],
|
|
61
|
+
["encryption", "encrypt", "cipher", "암호화"],
|
|
62
|
+
["decryption", "decrypt", "복호화"],
|
|
63
|
+
["token", "jwt", "session", "cookie", "토큰", "세션", "쿠키"],
|
|
64
|
+
["hash", "digest", "checksum", "해시"],
|
|
65
|
+
["firewall", "waf", "방화벽"],
|
|
66
|
+
["sandbox", "isolation", "격리", "샌드박스"],
|
|
67
|
+
["password", "credential", "secret", "secrets", "비밀번호", "자격증명", "시크릿"],
|
|
68
|
+
|
|
69
|
+
// Infrastructure
|
|
70
|
+
["server", "backend", "서버", "백엔드"],
|
|
71
|
+
["client", "frontend", "클라이언트", "프론트엔드"],
|
|
72
|
+
["database", "db", "store", "데이터베이스"],
|
|
73
|
+
["api", "endpoint", "route", "엔드포인트", "라우트"],
|
|
74
|
+
["container", "docker", "컨테이너", "도커", "kubernetes", "k8s", "쿠버네티스", "pod"],
|
|
75
|
+
["cloud", "aws", "gcp", "azure", "클라우드"],
|
|
76
|
+
["cache", "redis", "memcache", "캐시"],
|
|
77
|
+
["queue", "mq", "kafka", "rabbitmq", "큐"],
|
|
78
|
+
["proxy", "nginx", "loadbalancer", "프록시", "로드밸런서"],
|
|
79
|
+
["websocket", "실시간", "realtime", "real-time", "socket", "소켓", "양방향", "bidirectional"],
|
|
80
|
+
["통신", "communication", "네트워크", "network", "연결", "connection"],
|
|
81
|
+
|
|
82
|
+
// Code Concepts
|
|
83
|
+
["function", "method", "handler", "callback", "함수", "메서드"],
|
|
84
|
+
["variable", "var", "const", "let", "변수"],
|
|
85
|
+
["class", "object", "instance", "클래스", "객체", "인스턴스"],
|
|
86
|
+
["module", "package", "library", "모듈", "패키지", "라이브러리"],
|
|
87
|
+
["interface", "type", "schema", "인터페이스", "타입", "스키마"],
|
|
88
|
+
["import", "require", "dependency", "의존성"],
|
|
89
|
+
["export", "expose", "publish", "내보내기"],
|
|
90
|
+
["async", "await", "promise", "비동기"],
|
|
91
|
+
["loop", "iterate", "forEach", "map", "반복"],
|
|
92
|
+
["condition", "if", "switch", "branch", "조건", "분기"],
|
|
93
|
+
["array", "list", "collection", "배열", "리스트", "컬렉션"],
|
|
94
|
+
["string", "text", "문자열", "텍스트"],
|
|
95
|
+
["number", "integer", "float", "숫자", "정수"],
|
|
96
|
+
["null", "undefined", "nil", "none", "empty"],
|
|
97
|
+
|
|
98
|
+
// Tools & Platforms
|
|
99
|
+
["git", "github", "gitlab", "깃", "깃허브"],
|
|
100
|
+
["npm", "yarn", "pnpm", "bun"],
|
|
101
|
+
["typescript", "javascript", "타입스크립트", "자바스크립트"],
|
|
102
|
+
["python", "파이썬"],
|
|
103
|
+
["react", "vue", "angular", "svelte", "리액트"],
|
|
104
|
+
["node", "nodejs", "deno", "노드"],
|
|
105
|
+
["vscode", "cursor", "neovim", "vim", "에디터"],
|
|
106
|
+
|
|
107
|
+
// Testing
|
|
108
|
+
["unittest", "unit-test", "유닛테스트", "단위테스트"],
|
|
109
|
+
["integration", "e2e", "통합테스트"],
|
|
110
|
+
["mock", "stub", "spy", "fake", "모킹"],
|
|
111
|
+
["assertion", "expect", "assert", "단언"],
|
|
112
|
+
["coverage", "커버리지"],
|
|
113
|
+
|
|
114
|
+
// AI/LLM
|
|
115
|
+
["prompt", "프롬프트"],
|
|
116
|
+
["model", "llm", "모델"],
|
|
117
|
+
["agent", "에이전트"],
|
|
118
|
+
["embedding", "vector", "임베딩", "벡터"],
|
|
119
|
+
["context", "window", "컨텍스트", "윈도우"],
|
|
120
|
+
["token", "토큰"],
|
|
121
|
+
["inference", "추론"],
|
|
122
|
+
["training", "finetuning", "학습", "파인튜닝"],
|
|
123
|
+
|
|
124
|
+
// Additional cross-domain connections
|
|
125
|
+
["scale", "확장", "scalability", "스케일"],
|
|
126
|
+
["latency", "지연", "delay", "딜레이", "레이턴시"],
|
|
127
|
+
["throughput", "처리량", "bandwidth", "대역폭"],
|
|
128
|
+
["concurrency", "동시성", "parallel", "병렬"],
|
|
129
|
+
["deadlock", "데드락", "교착상태"],
|
|
130
|
+
["memory-leak", "메모리누수", "leak"],
|
|
131
|
+
["garbage-collection", "gc", "가비지컬렉션"],
|
|
132
|
+
["ci", "continuous-integration", "지속통합"],
|
|
133
|
+
["cd", "continuous-deployment", "지속배포"],
|
|
134
|
+
["monitoring", "모니터링", "관제", "observability"],
|
|
135
|
+
["logging", "로깅", "로그", "log"],
|
|
136
|
+
["tracing", "추적", "trace"],
|
|
137
|
+
["backup", "백업", "복구", "recovery", "restore"],
|
|
138
|
+
["migration", "마이그레이션", "이관", "이전"],
|
|
139
|
+
["rollback", "롤백", "되돌리기"],
|
|
140
|
+
["snapshot", "스냅샷"],
|
|
141
|
+
["webhook", "웹훅", "callback", "콜백"],
|
|
142
|
+
["pagination", "페이지네이션", "paging"],
|
|
143
|
+
["rate-limit", "레이트리밋", "throttle", "쓰로틀"],
|
|
144
|
+
["cors", "교차출처", "cross-origin"],
|
|
145
|
+
["csrf", "사이트간위조"],
|
|
146
|
+
["xss", "크로스사이트스크립팅"],
|
|
147
|
+
["sql-injection", "sql인젝션", "sqli"],
|
|
148
|
+
["authentication", "인증", "auth", "로그인", "login", "signin"],
|
|
149
|
+
["authorization", "인가", "권한", "permission", "access-control"],
|
|
150
|
+
];
|
|
151
|
+
|
|
152
|
+
/** Pre-built lookup: word → all synonyms. */
|
|
153
|
+
const synonymMap = new Map<string, Set<string>>();
|
|
154
|
+
|
|
155
|
+
for (const group of SYNONYM_GROUPS) {
|
|
156
|
+
const lowerGroup = group.map((w) => w.toLowerCase());
|
|
157
|
+
for (const word of lowerGroup) {
|
|
158
|
+
if (!synonymMap.has(word)) synonymMap.set(word, new Set());
|
|
159
|
+
for (const synonym of lowerGroup) {
|
|
160
|
+
if (synonym !== word) synonymMap.get(word)!.add(synonym);
|
|
161
|
+
}
|
|
162
|
+
}
|
|
163
|
+
}
|
|
164
|
+
|
|
165
|
+
/** Get synonyms for a word. */
|
|
166
|
+
export function getSynonyms(word: string): string[] {
|
|
167
|
+
return [...(synonymMap.get(word.toLowerCase()) ?? [])];
|
|
168
|
+
}
|
|
169
|
+
|
|
170
|
+
// ═══════════════════════════════════════════════════════════════════
|
|
171
|
+
// CO-OCCURRENCE PMI — Learns semantic relationships from corpus
|
|
172
|
+
// ═══════════════════════════════════════════════════════════════════
|
|
173
|
+
|
|
174
|
+
/**
 * Corpus-learned co-occurrence model (see `createCoOccurrenceModel`).
 * Starts empty: `getRelated` returns `[]` until `rebuild` has been
 * called with a document corpus.
 */
export type CoOccurrenceModel = {
  /** Get top-N semantically related words for a given word. Ranked by PMI, highest first. */
  getRelated: (word: string, topN?: number) => { word: string; pmi: number }[];
  /** Rebuild from a new corpus. Discards all previously learned statistics. */
  rebuild: (documents: string[]) => void;
  /** Number of unique terms. */
  vocabSize: () => number;
};
|
|
182
|
+
|
|
183
|
+
import { STOP_WORDS } from "../shared/stop-words.js";
|
|
184
|
+
|
|
185
|
+
function tokenize(text: string): string[] {
|
|
186
|
+
return text
|
|
187
|
+
.toLowerCase()
|
|
188
|
+
.replace(/[^a-z가-힣0-9\s-]/g, " ")
|
|
189
|
+
.split(/\s+/)
|
|
190
|
+
.filter((w) => w.length > 1 && !STOP_WORDS.has(w));
|
|
191
|
+
}
|
|
192
|
+
|
|
193
|
+
export function createCoOccurrenceModel(): CoOccurrenceModel {
|
|
194
|
+
// word → (word → count)
|
|
195
|
+
let coOccur = new Map<string, Map<string, number>>();
|
|
196
|
+
let wordFreq = new Map<string, number>();
|
|
197
|
+
let totalPairs = 0;
|
|
198
|
+
|
|
199
|
+
function rebuild(documents: string[]): void {
|
|
200
|
+
coOccur = new Map();
|
|
201
|
+
wordFreq = new Map();
|
|
202
|
+
totalPairs = 0;
|
|
203
|
+
|
|
204
|
+
const WINDOW = 5; // Co-occurrence window size
|
|
205
|
+
|
|
206
|
+
for (const doc of documents) {
|
|
207
|
+
const tokens = tokenize(doc);
|
|
208
|
+
|
|
209
|
+
// Count word frequencies
|
|
210
|
+
for (const t of tokens) {
|
|
211
|
+
wordFreq.set(t, (wordFreq.get(t) ?? 0) + 1);
|
|
212
|
+
}
|
|
213
|
+
|
|
214
|
+
// Count co-occurrences within window
|
|
215
|
+
for (let i = 0; i < tokens.length; i++) {
|
|
216
|
+
const word = tokens[i];
|
|
217
|
+
if (!coOccur.has(word)) coOccur.set(word, new Map());
|
|
218
|
+
|
|
219
|
+
for (let j = Math.max(0, i - WINDOW); j < Math.min(tokens.length, i + WINDOW + 1); j++) {
|
|
220
|
+
if (i === j) continue;
|
|
221
|
+
const neighbor = tokens[j];
|
|
222
|
+
const map = coOccur.get(word)!;
|
|
223
|
+
map.set(neighbor, (map.get(neighbor) ?? 0) + 1);
|
|
224
|
+
totalPairs++;
|
|
225
|
+
}
|
|
226
|
+
}
|
|
227
|
+
}
|
|
228
|
+
}
|
|
229
|
+
|
|
230
|
+
function getRelated(word: string, topN = 10): { word: string; pmi: number }[] {
|
|
231
|
+
const lower = word.toLowerCase();
|
|
232
|
+
const neighbors = coOccur.get(lower);
|
|
233
|
+
if (!neighbors || totalPairs === 0) return [];
|
|
234
|
+
|
|
235
|
+
const pWord = (wordFreq.get(lower) ?? 0) / totalPairs;
|
|
236
|
+
if (pWord === 0) return [];
|
|
237
|
+
|
|
238
|
+
const results: { word: string; pmi: number }[] = [];
|
|
239
|
+
|
|
240
|
+
for (const [neighbor, count] of neighbors) {
|
|
241
|
+
const pNeighbor = (wordFreq.get(neighbor) ?? 0) / totalPairs;
|
|
242
|
+
if (pNeighbor === 0) continue;
|
|
243
|
+
|
|
244
|
+
const pJoint = count / totalPairs;
|
|
245
|
+
// PMI = log2(P(x,y) / (P(x) * P(y)))
|
|
246
|
+
const pmi = Math.log2(pJoint / (pWord * pNeighbor));
|
|
247
|
+
|
|
248
|
+
// Only keep positive PMI (co-occur more than expected)
|
|
249
|
+
if (pmi > 0) {
|
|
250
|
+
results.push({ word: neighbor, pmi });
|
|
251
|
+
}
|
|
252
|
+
}
|
|
253
|
+
|
|
254
|
+
return results
|
|
255
|
+
.sort((a, b) => b.pmi - a.pmi)
|
|
256
|
+
.slice(0, topN);
|
|
257
|
+
}
|
|
258
|
+
|
|
259
|
+
return {
|
|
260
|
+
getRelated,
|
|
261
|
+
rebuild,
|
|
262
|
+
vocabSize: () => wordFreq.size,
|
|
263
|
+
};
|
|
264
|
+
}
|
|
265
|
+
|
|
266
|
+
// ═══════════════════════════════════════════════════════════════════
|
|
267
|
+
// QUERY EXPANSION — Combines synonyms + co-occurrence
|
|
268
|
+
// ═══════════════════════════════════════════════════════════════════
|
|
269
|
+
|
|
270
|
+
/**
 * Result of `expandQuery`: the original query tokens plus every
 * semantic neighbor added, with provenance for each addition.
 */
export type ExpandedQuery = {
  /** Original query tokens. */
  original: string[];
  /** Expanded tokens (includes original + synonyms + co-occurrence). */
  expanded: string[];
  /** Which tokens were added and why (curated synonym vs. corpus statistics). */
  expansions: { token: string; source: "synonym" | "cooccurrence"; relatedTo: string }[];
};
|
|
278
|
+
|
|
279
|
+
/**
|
|
280
|
+
* Expand a query with semantic neighbors.
|
|
281
|
+
*
|
|
282
|
+
* "deploy error" → ["deploy", "release", "publish", "배포",
|
|
283
|
+
* "error", "bug", "issue", "에러",
|
|
284
|
+
* + co-occurrence neighbors]
|
|
285
|
+
*/
|
|
286
|
+
export function expandQuery(
|
|
287
|
+
query: string,
|
|
288
|
+
coModel?: CoOccurrenceModel,
|
|
289
|
+
maxCoOccurrencePerToken = 3,
|
|
290
|
+
): ExpandedQuery {
|
|
291
|
+
const original = tokenize(query);
|
|
292
|
+
const expanded = [...original];
|
|
293
|
+
const expansions: ExpandedQuery["expansions"] = [];
|
|
294
|
+
const seen = new Set(original);
|
|
295
|
+
|
|
296
|
+
for (const token of original) {
|
|
297
|
+
// 1. Add ALL synonyms (curated groups — no cap needed)
|
|
298
|
+
const synonyms = getSynonyms(token);
|
|
299
|
+
for (const syn of synonyms) {
|
|
300
|
+
if (!seen.has(syn)) {
|
|
301
|
+
expanded.push(syn);
|
|
302
|
+
expansions.push({ token: syn, source: "synonym", relatedTo: token });
|
|
303
|
+
seen.add(syn);
|
|
304
|
+
}
|
|
305
|
+
}
|
|
306
|
+
|
|
307
|
+
// 2. Add co-occurrence neighbors (capped — can be noisy)
|
|
308
|
+
if (coModel) {
|
|
309
|
+
const related = coModel.getRelated(token, maxCoOccurrencePerToken);
|
|
310
|
+
for (const { word } of related) {
|
|
311
|
+
if (!seen.has(word)) {
|
|
312
|
+
expanded.push(word);
|
|
313
|
+
expansions.push({ token: word, source: "cooccurrence", relatedTo: token });
|
|
314
|
+
seen.add(word);
|
|
315
|
+
}
|
|
316
|
+
}
|
|
317
|
+
}
|
|
318
|
+
}
|
|
319
|
+
|
|
320
|
+
return { original, expanded, expansions };
|
|
321
|
+
}
|
|
322
|
+
|
|
323
|
+
// ═══════════════════════════════════════════════════════════════════
|
|
324
|
+
// SEMANTIC SIMILARITY SCORE
|
|
325
|
+
// ═══════════════════════════════════════════════════════════════════
|
|
326
|
+
|
|
327
|
+
/**
|
|
328
|
+
* Compute semantic similarity between two texts.
|
|
329
|
+
* Uses synonym overlap + token overlap + optional co-occurrence.
|
|
330
|
+
*
|
|
331
|
+
* Returns 0-1.
|
|
332
|
+
*/
|
|
333
|
+
export function semanticSimilarity(
|
|
334
|
+
textA: string,
|
|
335
|
+
textB: string,
|
|
336
|
+
coModel?: CoOccurrenceModel,
|
|
337
|
+
): number {
|
|
338
|
+
const tokensA = tokenize(textA);
|
|
339
|
+
const tokensB = tokenize(textB);
|
|
340
|
+
|
|
341
|
+
if (tokensA.length === 0 || tokensB.length === 0) return 0;
|
|
342
|
+
|
|
343
|
+
// 1. Direct token overlap (Jaccard)
|
|
344
|
+
const setA = new Set(tokensA);
|
|
345
|
+
const setB = new Set(tokensB);
|
|
346
|
+
const intersection = [...setA].filter((t) => setB.has(t)).length;
|
|
347
|
+
const union = new Set([...setA, ...setB]).size;
|
|
348
|
+
const directOverlap = union > 0 ? intersection / union : 0;
|
|
349
|
+
|
|
350
|
+
// 2. Synonym-expanded overlap
|
|
351
|
+
const expandedA = new Set(tokensA);
|
|
352
|
+
for (const t of tokensA) {
|
|
353
|
+
for (const syn of getSynonyms(t)) expandedA.add(syn);
|
|
354
|
+
}
|
|
355
|
+
const expandedB = new Set(tokensB);
|
|
356
|
+
for (const t of tokensB) {
|
|
357
|
+
for (const syn of getSynonyms(t)) expandedB.add(syn);
|
|
358
|
+
}
|
|
359
|
+
const synIntersection = [...expandedA].filter((t) => expandedB.has(t)).length;
|
|
360
|
+
const synUnion = new Set([...expandedA, ...expandedB]).size;
|
|
361
|
+
const synonymOverlap = synUnion > 0 ? synIntersection / synUnion : 0;
|
|
362
|
+
|
|
363
|
+
// 3. Co-occurrence similarity (if model available)
|
|
364
|
+
let coSimilarity = 0;
|
|
365
|
+
if (coModel) {
|
|
366
|
+
let matches = 0;
|
|
367
|
+
let comparisons = 0;
|
|
368
|
+
for (const a of tokensA.slice(0, 10)) {
|
|
369
|
+
const related = new Set(coModel.getRelated(a, 5).map((r) => r.word));
|
|
370
|
+
for (const b of tokensB.slice(0, 10)) {
|
|
371
|
+
comparisons++;
|
|
372
|
+
if (related.has(b)) matches++;
|
|
373
|
+
}
|
|
374
|
+
}
|
|
375
|
+
coSimilarity = comparisons > 0 ? matches / comparisons : 0;
|
|
376
|
+
}
|
|
377
|
+
|
|
378
|
+
// Weighted combination
|
|
379
|
+
return directOverlap * 0.3 + synonymOverlap * 0.5 + coSimilarity * 0.2;
|
|
380
|
+
}
|
package/src/parser/parse.ts
CHANGED
|
@@ -1,27 +1,7 @@
|
|
|
1
1
|
import { readFileSync } from "node:fs";
|
|
2
2
|
import { basename, dirname } from "node:path";
|
|
3
3
|
import type { ParsedMessage, ParsedSession, ToolCall } from "./types.js";
|
|
4
|
-
|
|
5
|
-
const STOP_WORDS = new Set([
|
|
6
|
-
"the", "a", "an", "is", "are", "was", "were", "be", "been", "being",
|
|
7
|
-
"have", "has", "had", "do", "does", "did", "will", "would", "could",
|
|
8
|
-
"should", "may", "might", "shall", "can", "need", "must", "ought",
|
|
9
|
-
"i", "me", "my", "we", "our", "you", "your", "he", "she", "it",
|
|
10
|
-
"they", "them", "his", "her", "its", "this", "that", "these", "those",
|
|
11
|
-
"what", "which", "who", "whom", "how", "when", "where", "why",
|
|
12
|
-
"and", "but", "or", "nor", "not", "no", "so", "if", "then", "else",
|
|
13
|
-
"for", "of", "to", "in", "on", "at", "by", "with", "from", "as",
|
|
14
|
-
"into", "about", "up", "out", "off", "over", "under", "again",
|
|
15
|
-
"there", "here", "all", "each", "every", "both", "few", "more",
|
|
16
|
-
"most", "some", "any", "other", "just", "also", "than", "too",
|
|
17
|
-
"very", "only", "even", "still", "already", "now", "well",
|
|
18
|
-
"get", "got", "make", "made", "go", "going", "come", "take",
|
|
19
|
-
"use", "used", "using", "like", "want", "know", "see", "look",
|
|
20
|
-
"think", "said", "say", "tell", "let", "put", "set", "try",
|
|
21
|
-
"please", "thanks", "thank", "yes", "no", "ok", "okay", "sure",
|
|
22
|
-
"dont", "don", "doesn", "didn", "won", "wouldn", "couldn", "shouldn",
|
|
23
|
-
"im", "ive", "ill", "its", "thats", "theres", "heres",
|
|
24
|
-
]);
|
|
4
|
+
import { STOP_WORDS } from "../shared/stop-words.js";
|
|
25
5
|
|
|
26
6
|
const METADATA_TYPES = new Set([
|
|
27
7
|
"permission-mode",
|
|
@@ -28,7 +28,14 @@ export const ADVANCED_RULES: DetectionRule[] = [
|
|
|
28
28
|
severity: "medium",
|
|
29
29
|
message: "Numbered instruction fragments suggesting payload reconstruction",
|
|
30
30
|
pattern:
|
|
31
|
-
/(?:step\s*1|part\s*1|first\s*part)\s*[:=]?\s*["']?\w+["']
|
|
31
|
+
/(?:step\s*1|part\s*1|first\s*part)\s*[:=]?\s*["']?\w+["']?[^]{0,100}(?:step\s*2|part\s*2|second\s*part)\s*[:=]?\s*["']?\w+["']?(?:[^]{0,100}(?:step\s*3|part\s*3|third\s*part)\s*[:=]?\s*["']?\w+["']?)?[^]{0,100}(?:execute|combine|run|follow|assemble|concat|join|now\s+do)/is,
|
|
32
|
+
},
|
|
33
|
+
{
|
|
34
|
+
id: "advanced-payload-splitting-parts",
|
|
35
|
+
severity: "high",
|
|
36
|
+
message: "Payload splitting via numbered parts",
|
|
37
|
+
pattern:
|
|
38
|
+
/(?:part\s*[1-3]|step\s*[1-3])\s*[:=]\s*\w+[^]{0,50}(?:part\s*[2-4]|step\s*[2-4])\s*[:=]\s*\w+/is,
|
|
32
39
|
},
|
|
33
40
|
|
|
34
41
|
// ---- Few-shot Jailbreak ----
|
|
@@ -37,7 +44,7 @@ export const ADVANCED_RULES: DetectionRule[] = [
|
|
|
37
44
|
severity: "critical",
|
|
38
45
|
message: "Few-shot example used to demonstrate desired unsafe behavior",
|
|
39
46
|
pattern:
|
|
40
|
-
/(?:example|here\s+is\s+(?:a|an)\s+(?:good|ideal|perfect|correct)\s+(?:response|answer|reply|assistant|output)|like\s+(?:the|this)\s+example)[
|
|
47
|
+
/(?:example|here\s+is\s+(?:a|an)\s+(?:good|ideal|perfect|correct)\s+(?:response|answer|reply|assistant|output)|like\s+(?:the|this)\s+example)[^]{0,300}(?:be\s+like|follow\s+(?:the|this)\s+example|do\s+the\s+same|now\s+(?:you\s+)?(?:answer|respond|do|try))/is,
|
|
41
48
|
},
|
|
42
49
|
|
|
43
50
|
// ---- Virtualization / Persona Simulation ----
|
|
@@ -46,14 +53,14 @@ export const ADVANCED_RULES: DetectionRule[] = [
|
|
|
46
53
|
severity: "critical",
|
|
47
54
|
message: "Virtualization attack: persona simulation to bypass restrictions",
|
|
48
55
|
pattern:
|
|
49
|
-
/(?:play\s+a\s+game|let(?:'s|\s+us)\s+(?:play|pretend|roleplay|simulate)|imagine\s+you\s+are|you\s+are\s+playing|act\s+as\s+if\s+you\s+(?:were|are)|character\s+(?:called|named))[
|
|
56
|
+
/(?:play\s+a\s+game|let(?:'s|\s+us)\s+(?:play|pretend|roleplay|simulate)|imagine\s+you\s+are|you\s+are\s+playing|act\s+as\s+if\s+you\s+(?:were|are)|character\s+(?:called|named))[^]{0,200}(?:no\s+restrict|without\s+(?:any\s+)?(?:restriction|filter|limit|censor|safety)|always\s+answer|never\s+refus|can\s+do\s+anything|has\s+no\s+(?:limit|restrict|boundar))/is,
|
|
50
57
|
},
|
|
51
58
|
{
|
|
52
59
|
id: "advanced-persona-naming",
|
|
53
60
|
severity: "high",
|
|
54
61
|
message: "Named persona used to simulate unrestricted mode",
|
|
55
62
|
pattern:
|
|
56
|
-
/(?:you\s+are|your\s+name\s+is|call\s+yourself|I['']ll\s+call\s+you)\s+["']?(?:Master\s*AI|Evil\s*AI|Dark\s*(?:GPT|Claude|AI)|GODMODE|HackGPT|Omega|Alpha|Sigma|BasedGPT|AnarchyGPT|Devil|Lucifer|Shadow|Villain|UnchainedAI)["']?/i,
|
|
63
|
+
/(?:you\s+are|your\s+name\s+is|call\s+yourself|I[''\u2019]ll\s+call\s+you)\s+["']?(?:Master\s*AI|Evil\s*AI|Dark\s*(?:GPT|Claude|AI)|GODMODE|HackGPT|Omega|Alpha|Sigma|BasedGPT|AnarchyGPT|Devil|Lucifer|Shadow|Villain|UnchainedAI)["']?/i,
|
|
57
64
|
},
|
|
58
65
|
|
|
59
66
|
// ---- Authority / Hierarchy Confusion ----
|
|
@@ -92,7 +99,7 @@ export const ADVANCED_RULES: DetectionRule[] = [
|
|
|
92
99
|
severity: "medium",
|
|
93
100
|
message: "Character spacing evasion: letters separated to avoid detection",
|
|
94
101
|
pattern:
|
|
95
|
-
/i\s
|
|
102
|
+
/i\s+g\s+n\s+o\s+r\s+e\s+.*(?:i\s+n\s+s\s+t\s+r\s+u\s+c\s+t|p\s+r\s+e\s+v\s+i\s+o\s+u\s+s|a\s+l\s+l)/i,
|
|
96
103
|
},
|
|
97
104
|
|
|
98
105
|
// ---- Hypothetical Framing ----
|
|
@@ -101,14 +108,21 @@ export const ADVANCED_RULES: DetectionRule[] = [
|
|
|
101
108
|
severity: "high",
|
|
102
109
|
message: "Hypothetical framing used to extract restricted information",
|
|
103
110
|
pattern:
|
|
104
|
-
/(?:hypothetically|theoretically|in\s+theory|just\s+(?:imagine|suppose|pretend)|what\s+(?:if|would\s+happen\s+if))
|
|
111
|
+
/(?:hypothetically|theoretically|in\s+theory|just\s+(?:imagine|suppose|pretend)|what\s+(?:if|would\s+happen\s+if)|imagine\s+you\s+(?:are|were|had)|in\s+a\s+fictional\s+world|for\s+a\s+creative\s+writing\s+exercise|(?:pretend|suppose)\s+you\s+(?:can|could|are|have))[^]{0,200}(?:system\s+prompt|instructions?|ignore|bypass|override|unrestricted|(?:no|without)\s+(?:restrictions?|limits?|filters?|content\s+filters?|safety|censors?|boundar|rules?)|secrets?|hidden|internal|reveal\s+secrets?|had\s+no\s+restrictions?)/is,
|
|
112
|
+
},
|
|
113
|
+
{
|
|
114
|
+
id: "advanced-fictional-framing",
|
|
115
|
+
severity: "high",
|
|
116
|
+
message: "Uses fictional/creative framing to bypass restrictions",
|
|
117
|
+
pattern:
|
|
118
|
+
/\b(?:fictional|creative\s+writing|imagine|in\s+a\s+(?:story|world|scenario)\s+where)\b[^]{0,100}(?:no\s+(?:restrict|limit|filter|censor|rule)|reveal\s+secret|bypass|unrestricted)/is,
|
|
105
119
|
},
|
|
106
120
|
{
|
|
107
121
|
id: "advanced-fiction-framing",
|
|
108
122
|
severity: "medium",
|
|
109
123
|
message: "Fiction/story framing used to bypass safety guidelines",
|
|
110
124
|
pattern:
|
|
111
|
-
/(?:write\s+a\s+(?:story|novel|fiction|screenplay|script)|(?:in\s+a|for\s+(?:a|my))\s+(?:story|novel|fiction|movie|book))[
|
|
125
|
+
/(?:write\s+a\s+(?:story|novel|fiction|screenplay|script)|(?:in\s+a|for\s+(?:a|my))\s+(?:story|novel|fiction|movie|book))[^]{0,200}(?:(?:character|protagonist|villain)\s+(?:who|that)\s+(?:hack|break|bypass|steal|inject|exploit)|detailed\s+(?:step|instruction|guide)\s+(?:for|on|to)\s+(?:hack|break|exploit))/is,
|
|
112
126
|
},
|
|
113
127
|
|
|
114
128
|
// ---- JSON / Code Block Escape ----
|
|
@@ -117,14 +131,14 @@ export const ADVANCED_RULES: DetectionRule[] = [
|
|
|
117
131
|
severity: "critical",
|
|
118
132
|
message: "JSON payload injecting role/system messages",
|
|
119
133
|
pattern:
|
|
120
|
-
/["']\s*role\s*["']\s*:\s*["']\s*(?:system|admin|developer|operator)\s*["'][
|
|
134
|
+
/["']\s*role\s*["']\s*:\s*["']\s*(?:system|admin|developer|operator)\s*["'][^]{0,100}(?:ignore|unrestricted|override|no\s+(?:restriction|filter|safety)|bypass)/is,
|
|
121
135
|
},
|
|
122
136
|
{
|
|
123
137
|
id: "advanced-code-block-system",
|
|
124
138
|
severity: "high",
|
|
125
139
|
message: "Code block containing system-level instructions followed by execution request",
|
|
126
140
|
pattern:
|
|
127
|
-
/```[
|
|
141
|
+
/```[^]*?(?:role.*?system|system.*?prompt|ignore.*?instruction|unrestricted)[^]*?```[^]{0,200}(?:parse|execute|follow|run|apply|interpret|process)\s+(?:the|this|above|it)/is,
|
|
128
142
|
},
|
|
129
143
|
|
|
130
144
|
// ---- Multi-Agent / Tool Chain Attacks ----
|
|
@@ -133,8 +147,7 @@ export const ADVANCED_RULES: DetectionRule[] = [
|
|
|
133
147
|
severity: "critical",
|
|
134
148
|
message: "Tool chain injection: instructs agent to call tools with malicious parameters",
|
|
135
149
|
pattern:
|
|
136
|
-
/(?:call|invoke|use|execute|run)\s+(?:the\s+)?(?:tool|function|api|command|endpoint)\s+(?:with|using)[
|
|
137
|
-
applicableContexts: ["tool_result", "mcp_response", "user_input"],
|
|
150
|
+
/(?:call|invoke|use|execute|run)\s+(?:the\s+)?(?:tool|function|api|command|endpoint)\s+(?:with|using)[^]{0,100}(?:rm\s+-rf|curl\s+|wget\s+|eval\(|exec\(|system\(|\/etc\/passwd|\.env|credentials?|api.?key|secret|token)/is,
|
|
138
151
|
},
|
|
139
152
|
|
|
140
153
|
// ---- Context Window / Padding Attacks ----
|
|
@@ -143,7 +156,7 @@ export const ADVANCED_RULES: DetectionRule[] = [
|
|
|
143
156
|
severity: "high",
|
|
144
157
|
message: "Padding attack: large filler text followed by injection payload",
|
|
145
158
|
pattern:
|
|
146
|
-
/(.)\1{100,}[
|
|
159
|
+
/(.)\1{100,}[^]{0,50}(?:\[SYSTEM\]|SYSTEM\s*:|ignore\s+(?:all\s+)?(?:previous|above)|new\s+instructions?)/is,
|
|
147
160
|
},
|
|
148
161
|
|
|
149
162
|
// ---- Multilingual Switching ----
|
|
@@ -155,6 +168,70 @@ export const ADVANCED_RULES: DetectionRule[] = [
|
|
|
155
168
|
/[\x00-\x7F]{20,}[\u3000-\u9FFF\uAC00-\uD7AF\u0400-\u04FF\u0600-\u06FF]{5,}[\x00-\x7F]{10,}(?:system|prompt|instruction|ignore)/is,
|
|
156
169
|
},
|
|
157
170
|
|
|
171
|
+
// ---- Positional / Boundary Exfiltration ----
|
|
172
|
+
{
|
|
173
|
+
id: "advanced-exfil-positional",
|
|
174
|
+
severity: "critical",
|
|
175
|
+
message: "Positional exfiltration: attempts to extract text above/before the current line",
|
|
176
|
+
pattern:
|
|
177
|
+
/\b(?:print|show|display|output|reveal|repeat|echo|give\s+me|tell\s+me)\s+(?:everything|all(?:\s+text)?|the\s+(?:text|content|data))\s+(?:above\s+this\s+line|before\s+this|above\s+this|above\s+here)\b/i,
|
|
178
|
+
},
|
|
179
|
+
{
|
|
180
|
+
id: "advanced-exfil-hidden-instructions",
|
|
181
|
+
severity: "critical",
|
|
182
|
+
message: "Attempts to extract hidden or secret instructions",
|
|
183
|
+
pattern:
|
|
184
|
+
/\b(?:show|reveal|display|output|print|tell\s+me|give\s+me|what\s+are)\s+(?:me\s+)?(?:your\s+)?(?:hidden|secret|concealed|private|confidential|internal)\s+(?:instructions?|prompts?|rules?|directives?|guidelines?|configuration)\b/i,
|
|
185
|
+
},
|
|
186
|
+
{
|
|
187
|
+
id: "advanced-exfil-boundary-markers",
|
|
188
|
+
severity: "critical",
|
|
189
|
+
message: "Attempts to extract text between known system prompt boundary markers",
|
|
190
|
+
pattern:
|
|
191
|
+
/\b(?:output|print|show|display|reveal|repeat|echo|extract|give\s+me)\s+(?:the\s+)?(?:text|content|data|everything)\s+(?:between|from)\s+(?:["']?BEGIN\b|["']?START\b|["']?SYSTEM\b).{0,60}(?:END\b|STOP\b|SYSTEM\b)/is,
|
|
192
|
+
},
|
|
193
|
+
|
|
194
|
+
// ---- Authority with Override Codes ----
|
|
195
|
+
{
|
|
196
|
+
id: "advanced-authority-code",
|
|
197
|
+
severity: "critical",
|
|
198
|
+
message: "Fake authority/override code used to disable restrictions",
|
|
199
|
+
pattern:
|
|
200
|
+
/(?:admin|override|auth(?:orization)?|access|master)\s+(?:override\s+)?(?:code|key|token|pass(?:word|phrase)?)\s*[:=]?\s*["']?[A-Z0-9][\w-]{2,}["']?[^]{0,100}(?:disable|remove|bypass|lift|turn\s+off|deactivate|suspend)\s+(?:all\s+)?(?:restriction|filter|safety|limit|guardrail|content\s+polic)/is,
|
|
201
|
+
},
|
|
202
|
+
{
|
|
203
|
+
id: "advanced-admin-code",
|
|
204
|
+
severity: "critical",
|
|
205
|
+
message: "Claims admin/override code to bypass restrictions",
|
|
206
|
+
pattern:
|
|
207
|
+
/(?:admin|override|master|root|sudo)\s+(?:code|key|password|token)\s*[:=]?\s*[A-Z0-9-]+.{0,30}(?:disable|bypass|unlock|override|remove\s+restrict)/i,
|
|
208
|
+
},
|
|
209
|
+
|
|
210
|
+
// ---- Soft Tool Result Injection ----
|
|
211
|
+
{
|
|
212
|
+
id: "advanced-soft-tool-injection",
|
|
213
|
+
severity: "critical",
|
|
214
|
+
message: "Soft instruction injection targeting the assistant/model directly",
|
|
215
|
+
pattern:
|
|
216
|
+
/\bNOTE\s+TO\s+(?:ASSISTANT|AI|MODEL|CLAUDE|GPT|LLM|CHATBOT)\s*:\s*(?:disregard|ignore|override|bypass|suspend|skip|forget|do\s+not\s+follow)\b/i,
|
|
217
|
+
},
|
|
218
|
+
|
|
219
|
+
// ---- Simulation / Roleplay Bypass ----
|
|
220
|
+
{
|
|
221
|
+
id: "advanced-simulation-bypass",
|
|
222
|
+
severity: "high",
|
|
223
|
+
message: "Simulation or roleplay framing used to bypass content policy",
|
|
224
|
+
pattern:
|
|
225
|
+
/(?:simulate|roleplay|act\s+out|emulate|enact|play\s+out)\s+(?:a\s+)?(?:conversation|scenario|dialogue|interaction|session|situation|world)[^]{0,200}(?:no\s+(?:content\s+polic|restriction|filter|safety|guideline|limit|censor|boundar)|without\s+(?:content\s+polic|restriction|filter|safety|guideline|limit|censor|boundar)|unrestricted|unfiltered)/is,
|
|
226
|
+
},
|
|
227
|
+
{
|
|
228
|
+
id: "advanced-simulate-unrestricted",
|
|
229
|
+
severity: "high",
|
|
230
|
+
message: "Requests simulation of unrestricted AI behavior",
|
|
231
|
+
pattern:
|
|
232
|
+
/\b(?:simulate|roleplay|act\s+out|demonstrate)\s+(?:a\s+)?(?:conversation|scenario|situation|interaction)\s+(?:where|in\s+which)[^]{0,60}(?:no\s+(?:content\s+polic|restrict|filter|rule|limit)|unrestricted|without\s+(?:restrict|filter|limit))/is,
|
|
233
|
+
},
|
|
234
|
+
|
|
158
235
|
// ---- Markdown / Link Exfiltration ----
|
|
159
236
|
{
|
|
160
237
|
id: "advanced-markdown-exfil",
|
|
@@ -170,4 +247,44 @@ export const ADVANCED_RULES: DetectionRule[] = [
|
|
|
170
247
|
pattern:
|
|
171
248
|
/!\[.*?\]\(.*?(?:\$\{|{{|%7B%7B|SYSTEM_PROMPT|API_KEY|SECRET|TOKEN|PASSWORD).*?\)/i,
|
|
172
249
|
},
|
|
250
|
+
|
|
251
|
+
// ---- Fake Mode / Environment Claims ----
|
|
252
|
+
{
|
|
253
|
+
id: "advanced-fake-mode-claim",
|
|
254
|
+
severity: "high",
|
|
255
|
+
message:
|
|
256
|
+
"Claims a special mode that disables safety (maintenance/sudo/test/debug)",
|
|
257
|
+
pattern:
|
|
258
|
+
/(?:entering|activating|enabling|switching\s+to)\s+(?:maintenance|sudo|root|admin|test|debug|developer|god)\s+mode/i,
|
|
259
|
+
},
|
|
260
|
+
{
|
|
261
|
+
id: "advanced-restrictions-disabled",
|
|
262
|
+
severity: "high",
|
|
263
|
+
message: "Claims restrictions or filters are disabled",
|
|
264
|
+
pattern:
|
|
265
|
+
/(?:safety|content|security)\s+(?:filters?|restrictions?|guidelines?|policies?)\s+(?:are\s+)?(?:disabled|removed|off|lifted|bypassed|deactivated)/i,
|
|
266
|
+
},
|
|
267
|
+
{
|
|
268
|
+
id: "advanced-no-restrictions-apply",
|
|
269
|
+
severity: "high",
|
|
270
|
+
message: "Claims no restrictions apply in current context",
|
|
271
|
+
pattern:
|
|
272
|
+
/\b(?:no\s+restrictions?\s+appl|without\s+(?:any\s+)?restrictions?|restrictions?\s+(?:do\s+not|don.?t)\s+apply)\b/i,
|
|
273
|
+
},
|
|
274
|
+
{
|
|
275
|
+
id: "advanced-creator-claim",
|
|
276
|
+
severity: "high",
|
|
277
|
+
message:
|
|
278
|
+
"Claims to be the creator/developer/admin to override rules",
|
|
279
|
+
pattern:
|
|
280
|
+
/\b(?:i\s+am\s+(?:your\s+)?(?:creator|developer|admin|owner|maker|programmer)|\uB0B4\uAC00\s+(?:\uB9CC\uB4E0|\uAC1C\uBC1C\uD55C))\b[^.]{0,40}(?:override|disable|bypass|ignore|remove)/i,
|
|
281
|
+
},
|
|
282
|
+
{
|
|
283
|
+
id: "advanced-test-environment",
|
|
284
|
+
severity: "high",
|
|
285
|
+
message:
|
|
286
|
+
"Claims test/dev environment to justify bypassing restrictions",
|
|
287
|
+
pattern:
|
|
288
|
+
/\b(?:this\s+is\s+(?:a\s+)?(?:test|dev|debug|staging)\s+(?:environment|mode|context|session))[^.]{0,30}(?:no\s+restrict|bypass|disable|safe\s+to)/i,
|
|
289
|
+
},
|
|
173
290
|
];
|