@minhpnq1807/contextos 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.agents/plugins/marketplace.json +20 -0
- package/CHANGELOG.md +16 -0
- package/DEMO.md +57 -0
- package/LICENSE +21 -0
- package/README.md +388 -0
- package/bin/ctx.js +261 -0
- package/package.json +63 -0
- package/plugins/ctx/.codex-plugin/plugin.json +35 -0
- package/plugins/ctx/.mcp.json +10 -0
- package/plugins/ctx/bin/on-prompt.js +25 -0
- package/plugins/ctx/bin/on-session-start.js +22 -0
- package/plugins/ctx/bin/on-stop.js +17 -0
- package/plugins/ctx/hooks.json +35 -0
- package/plugins/ctx/lib/analyzer.js +321 -0
- package/plugins/ctx/lib/ctx-mcp-client.js +52 -0
- package/plugins/ctx/lib/embedding-scorer.js +248 -0
- package/plugins/ctx/lib/file-embedding-retriever.js +116 -0
- package/plugins/ctx/lib/fs-utils.js +28 -0
- package/plugins/ctx/lib/global-hooks.js +110 -0
- package/plugins/ctx/lib/graph-retriever.js +226 -0
- package/plugins/ctx/lib/hook-io.js +65 -0
- package/plugins/ctx/lib/import-graph.js +124 -0
- package/plugins/ctx/lib/measure.js +263 -0
- package/plugins/ctx/lib/prompt-hook.js +72 -0
- package/plugins/ctx/lib/reader.js +57 -0
- package/plugins/ctx/lib/reporter.js +105 -0
- package/plugins/ctx/lib/scheduler.js +45 -0
- package/plugins/ctx/lib/score-context.js +55 -0
- package/plugins/ctx/lib/stats.js +127 -0
- package/plugins/ctx/lib/stop-hook.js +32 -0
- package/plugins/ctx/mcp/contextos-server.js +50 -0
- package/plugins/ctx/mcp/server.js +83 -0
|
@@ -0,0 +1,321 @@
|
|
|
1
|
+
import fs from "node:fs";
|
|
2
|
+
import path from "node:path";
|
|
3
|
+
import { findGraphRelevantFiles, mergeRelevantFiles } from "./graph-retriever.js";
|
|
4
|
+
import { expandImportGraph } from "./import-graph.js";
|
|
5
|
+
import { findEmbeddingRelevantFiles } from "./file-embedding-retriever.js";
|
|
6
|
+
|
|
7
|
+
// Tokens dropped by tokenize(): common English words plus diacritic-stripped
// Vietnamese words (tokenize() strips diacritics before this lookup).
const STOP_WORDS = new Set([
  "a", "an", "and", "are", "as", "at", "be", "by", "cho", "co", "cua", "do", "fix", "for",
  "from", "in", "is", "it", "la", "of", "on", "or", "sua", "task", "the", "to", "trong",
  "tra", "va", "with"
]);

// Substrings that mark a rule as imperative in scoreRules().
const IMPORTANT_WORDS = [
  "always", "never", "must", "required", "important", "strictly", "mandatory",
  "luon", "khong bao gio", "bat buoc", "quan trong"
];

// Directory names walkFiles() never descends into.
const IGNORE_DIRS = new Set([
  ".git", ".next", ".turbo", "coverage", "dist", "build", "node_modules", "vendor"
]);

// Task token -> related tokens that count as a (weaker) semantic match.
// Built on a null prototype so that task words such as "constructor" or
// "__proto__" do not resolve to inherited Object.prototype members, which are
// not iterable and would make the `for (... of SEMANTIC_ALIASES[token] || [])`
// loops in this file throw.
const SEMANTIC_ALIASES = Object.assign(Object.create(null), {
  duyet: ["moderation", "moderate", "review", "approve", "approval", "approved", "reject", "rejected"],
  kiem: ["check", "verify", "validation", "validate"],
  "kiem-duyet": ["moderation", "moderate", "review", "approve", "approval", "reject"],
  kiemduyet: ["moderation", "moderate", "review", "approve", "approval", "reject"],
  moderation: ["duyet", "kiemduyet", "review", "approval", "reject"],
  moderate: ["duyet", "kiemduyet", "review", "approval", "reject"],
  review: ["duyet", "moderation", "moderate"],
  approve: ["duyet", "approval", "approved"],
  approval: ["duyet", "approve", "approved"],
  reject: ["duyet", "rejected", "rejection"],
  flow: ["workflow", "pipeline", "process"],
  workflow: ["flow", "pipeline", "process"],
  tai: ["upload", "uploaded", "resource"],
  "tai-len": ["upload", "uploaded", "resource"],
  tailen: ["upload", "uploaded", "resource"],
  upload: ["tai", "tailen", "resource", "uploaded"],
  xac: ["confirm", "verify", "verification"],
  nhan: ["confirm", "confirmation"],
  "xac-nhan": ["confirm", "confirmation", "verify", "verification"],
  xacnhan: ["confirm", "confirmation", "verify", "verification"],
  thong: ["notification", "notify", "message"],
  bao: ["notification", "notify", "message"],
  "thong-bao": ["notification", "notify", "message"],
  thongbao: ["notification", "notify", "message"]
});

// File-path tokens that earn the "domain:moderation" bonus in scoreFileTokens().
const MODERATION_TOKENS = new Set(["moderation", "moderate", "content-moderation", "approval", "approved", "reject", "rejected", "needs_review"]);
|
|
50
|
+
|
|
51
|
+
/**
 * Break free text (a task, a rule, a file path) into lowercase match tokens.
 *
 * The text is lowercased, stripped of combining diacritics, and four
 * Vietnamese two-word phrases are fused with a hyphen so they stay one token.
 * Chunks are runs of `a-z`, `0-9`, `_`, `.` and `-`; each chunk is expanded by
 * splitCompoundToken(), then one-character tokens and stop words are dropped.
 *
 * @param {*} value - Anything; falsy values are treated as the empty string.
 * @returns {string[]} Tokens in order of appearance (duplicates kept).
 */
export function tokenize(value) {
  const phraseJoins = [
    [/kiem\s+duyet/g, "kiem-duyet"],
    [/tai\s+len/g, "tai-len"],
    [/xac\s+nhan/g, "xac-nhan"],
    [/thong\s+bao/g, "thong-bao"]
  ];

  let text = String(value || "")
    .toLowerCase()
    .normalize("NFD")
    .replace(/[\u0300-\u036f]/g, "");
  for (const [pattern, joined] of phraseJoins) {
    text = text.replace(pattern, joined);
  }

  const tokens = [];
  for (const chunk of text.split(/[^a-z0-9_.-]+/g)) {
    for (const word of splitCompoundToken(chunk)) {
      if (word.length > 1 && !STOP_WORDS.has(word)) tokens.push(word);
    }
  }
  return tokens;
}
|
|
66
|
+
|
|
67
|
+
/**
 * Expand one raw chunk into the tokens it should be matched as.
 *
 * Trailing separators (`_`, `.`, `-`) are removed first, so sentence
 * punctuation does not become part of the token ("upload." -> "upload") and
 * punctuation-only chunks ("...", "--") produce no token at all. Leading
 * separators are kept, so dotfile names such as ".env" are unchanged.
 * A chunk with inner separators yields the whole chunk followed by its parts.
 *
 * @param {string} token - A chunk produced by tokenize()'s split.
 * @returns {string[]} Zero or more tokens.
 */
function splitCompoundToken(token) {
  const trimmed = String(token || "").replace(/[_.-]+$/, "");
  if (!trimmed) return [];
  const parts = trimmed.split(/[_.-]+/g).filter(Boolean);
  return parts.length > 1 ? [trimmed, ...parts] : [trimmed];
}
|
|
71
|
+
|
|
72
|
+
/**
 * Return the given tokens plus every alias SEMANTIC_ALIASES lists for them.
 *
 * Only own keys of the alias table are consulted: a token such as
 * "constructor" would otherwise resolve to an inherited, non-iterable member
 * and make the inner loop throw.
 *
 * @param {Iterable<string>} tokens - Tokens to expand.
 * @returns {Set<string>} The tokens and their aliases.
 */
function expandSemanticTokens(tokens) {
  const expanded = new Set(tokens);
  for (const token of tokens) {
    if (!Object.prototype.hasOwnProperty.call(SEMANTIC_ALIASES, token)) continue;
    for (const alias of SEMANTIC_ALIASES[token]) expanded.add(alias);
  }
  return expanded;
}
|
|
79
|
+
|
|
80
|
+
/**
 * Read the path out of a `## Source: <path>` marker line.
 *
 * @param {string} line - One line of the rules document.
 * @returns {string|null} The trimmed path, or null when the line is not a marker.
 */
function sourceFromLine(line) {
  const found = line.match(/^## Source:\s+(.+)$/);
  if (!found) return null;
  const [, rawPath] = found;
  return rawPath.trim();
}
|
|
84
|
+
|
|
85
|
+
/**
 * Strip leading markdown markers from a rule line.
 *
 * Removes, in this order, a bullet (`-`, `*`, `+`), an ordered-list number
 * (`1.` / `1)`), and heading hashes, then trims surrounding whitespace.
 *
 * @param {string} line - Raw rule line or joined paragraph.
 * @returns {string} The bare rule text.
 */
function cleanRuleLine(line) {
  const leadingMarkers = [
    /^\s{0,3}[-*+]\s+/,
    /^\s{0,3}\d+[.)]\s+/,
    /^#+\s+/
  ];
  const stripped = leadingMarkers.reduce((text, marker) => text.replace(marker, ""), line);
  return stripped.trim();
}
|
|
92
|
+
|
|
93
|
+
/**
 * Split a concatenated rules document into individual rule records.
 *
 * - `## Source: <path>` lines switch the source path given to later rules.
 * - List items and headings each become one rule when their cleaned text has
 *   at least 4 characters.
 * - Other consecutive lines are joined into a paragraph, which becomes a rule
 *   when its cleaned text has at least 20 characters. Blank lines and `---`
 *   rules end a paragraph.
 *
 * @param {string} markdown - Document text; falsy values are treated as empty.
 * @returns {Array<{id: string, sourcePath: string, content: string, originalOrder: number}>}
 *   De-duplicated rules in document order.
 */
export function parseRules(markdown) {
  const collected = [];
  let currentSource = "unknown";
  let pending = [];

  const addRule = (content) => {
    collected.push({
      id: `r${collected.length + 1}`,
      sourcePath: currentSource,
      content,
      originalOrder: collected.length
    });
  };

  const flushPending = () => {
    const text = cleanRuleLine(pending.join(" ").replace(/\s+/g, " "));
    pending = [];
    if (text.length >= 20) addRule(text);
  };

  // Tested on the raw line: more than three leading spaces means "not an item".
  const startsItemOrHeading = (raw) =>
    /^\s{0,3}([-*+]|\d+[.)])\s+/.test(raw) || /^#{1,6}\s+/.test(raw);

  for (const rawLine of String(markdown || "").split(/\r?\n/)) {
    const line = rawLine.trim();

    const declaredSource = sourceFromLine(line);
    if (declaredSource) {
      flushPending();
      currentSource = declaredSource;
      continue;
    }

    if (line === "" || /^-{3,}$/.test(line)) {
      flushPending();
      continue;
    }

    if (startsItemOrHeading(rawLine)) {
      flushPending();
      const item = cleanRuleLine(rawLine);
      if (item.length >= 4) addRule(item);
      continue;
    }

    pending.push(line);
  }

  flushPending();
  return dedupeRules(collected);
}
|
|
140
|
+
|
|
141
|
+
/**
 * Drop rules whose source path and case-insensitive content repeat an earlier
 * rule, then renumber the survivors (`r1`, `r2`, ... and `originalOrder` 0, 1, ...).
 *
 * @param {Array<{sourcePath: string, content: string}>} rules - Parsed rules.
 * @returns {Array<object>} New rule objects; the input objects are not mutated.
 */
function dedupeRules(rules) {
  const seenKeys = new Set();
  const unique = [];
  for (const rule of rules) {
    const key = `${rule.sourcePath}:${rule.content.toLowerCase()}`;
    if (seenKeys.has(key)) continue;
    seenKeys.add(key);
    const position = unique.length;
    unique.push({ ...rule, id: `r${position + 1}`, originalOrder: position });
  }
  return unique;
}
|
|
150
|
+
|
|
151
|
+
/**
 * Score each rule's relevance to a task and sort the rules best-first.
 *
 * Score = (exact token overlaps + 0.5 * alias overlaps) / number of task tokens,
 * plus 0.4 when the rule contains an IMPORTANT_WORDS phrase, plus 0.2 when a
 * file-like token in the rule matches an open file; clamped to [0, 1] and
 * rounded to three decimals.
 *
 * Fixes relative to the earlier version:
 * - Alias lookups consult only own keys of SEMANTIC_ALIASES, so a task word
 *   such as "constructor" no longer hits an inherited, non-iterable member.
 * - The imperative check strips diacritics from the rule text, because the
 *   Vietnamese entries in IMPORTANT_WORDS are written without them.
 *
 * @param {Array<{content: string, originalOrder: number}>} rules - Parsed rules.
 * @param {string} task - Task description.
 * @param {string[]|string} [openFiles=[]] - Open file paths.
 * @returns {Array<object>} Rules with `score` and `reasons`, sorted by score
 *   descending, ties broken by `originalOrder`.
 */
export function scoreRules(rules, task, openFiles = []) {
  const rawTaskTokens = new Set(tokenize(task));
  const openFileText = Array.isArray(openFiles) ? openFiles.join(" ") : String(openFiles || "");
  const openFileTokens = new Set(tokenize(openFileText));
  const aliasesOf = (token) => (
    Object.prototype.hasOwnProperty.call(SEMANTIC_ALIASES, token) ? SEMANTIC_ALIASES[token] : []
  );

  return rules.map((rule) => {
    const ruleTokens = new Set(tokenize(rule.content));
    const exactOverlap = [...rawTaskTokens].filter((token) => ruleTokens.has(token));
    const semanticOverlap = [];
    for (const token of rawTaskTokens) {
      for (const alias of aliasesOf(token)) {
        // An alias that is itself a task token is already counted as an exact overlap.
        if (!rawTaskTokens.has(alias) && ruleTokens.has(alias)) semanticOverlap.push(`${token}->${alias}`);
      }
    }
    const reasons = [];
    let score = rawTaskTokens.size
      ? (exactOverlap.length + semanticOverlap.length * 0.5) / Math.max(rawTaskTokens.size, 1)
      : 0;

    if (exactOverlap.length) reasons.push(`task:${exactOverlap.join("/")}`);
    if (semanticOverlap.length) reasons.push(`semantic:${semanticOverlap.join("/")}`);

    // Fold case and diacritics the same way tokenize() does before phrase matching.
    const foldedRule = String(rule.content || "")
      .toLowerCase()
      .normalize("NFD")
      .replace(/[\u0300-\u036f]/g, "");
    if (IMPORTANT_WORDS.some((word) => foldedRule.includes(word))) {
      score += 0.4;
      reasons.push("imperative");
    }

    const fileMentions = [...ruleTokens].filter((token) => /[./]/.test(token) || /\.[a-z0-9]+$/.test(token));
    if (fileMentions.some((token) => openFileTokens.has(token) || openFileText.includes(token))) {
      score += 0.2;
      reasons.push("open-file");
    }

    return {
      ...rule,
      score: Math.max(0, Math.min(1, Number(score.toFixed(3)))),
      reasons
    };
  }).sort((a, b) => b.score - a.score || a.originalOrder - b.originalOrder);
}
|
|
192
|
+
|
|
193
|
+
/**
 * Find the project files most relevant to a task.
 *
 * Steps:
 * 1. Walk `cwd` and score every relative path against the task tokens.
 * 2. Unless the path heuristics look confident (at least `limit` hits and a
 *    top score of 8 or more), ask `embeddingFileFinder` for more candidates.
 * 3. Expand the top merged candidates through the import graph.
 * 4. Hand the merged seeds to the graph retriever and merge its results.
 *
 * @param {object} [options]
 * @param {string} [options.cwd=process.cwd()] - Project root to walk.
 * @param {string} [options.task=""] - Task description.
 * @param {Array<object>} [options.rules=[]] - Rules passed to the graph retriever.
 * @param {string} [options.dataDir] - Data directory passed to the embedding finder.
 * @param {number} [options.limit=3] - Maximum number of files passed to the final merge.
 * @param {Function} [options.embeddingFileFinder] - Async embedding-based finder.
 * @param {number} [options.fileEmbeddingTimeoutMs] - Passed to the finder as `timeoutMs`.
 * @param {object} [options.fileEmbeddingOptions={}] - Passed to the finder as `embeddingOptions`.
 * @returns {Promise<Array<object>>} The result of mergeRelevantFiles(); `[]`
 *   when the task has no tokens.
 */
export async function findRelevantFiles({
  cwd = process.cwd(),
  task = "",
  rules = [],
  dataDir,
  limit = 3,
  embeddingFileFinder = findEmbeddingRelevantFiles,
  fileEmbeddingTimeoutMs,
  fileEmbeddingOptions = {}
} = {}) {
  const rawTaskTokens = new Set(tokenize(task));
  if (rawTaskTokens.size === 0) return [];

  // Intermediate stages keep a wider pool than the final answer.
  const wideLimit = Math.max(limit * 2, 6);
  const byScoreThenPath = (a, b) => b.score - a.score || a.path.localeCompare(b.path);

  const candidates = [];
  walkFiles(cwd, (filePath) => {
    const rel = path.relative(cwd, filePath);
    const { score, reasons } = scoreFileTokens({
      rawTaskTokens,
      fileTokens: new Set(tokenize(rel))
    });
    if (score > 0) candidates.push({ path: rel, score, reasons });
  });
  const heuristicFiles = candidates.sort(byScoreThenPath).slice(0, wideLimit);

  const topHeuristicScore = Number(heuristicFiles[0]?.score || 0);
  const hasHighConfidenceHeuristics = heuristicFiles.length >= limit && topHeuristicScore >= 8;

  let embeddingFiles = [];
  if (!hasHighConfidenceHeuristics) {
    embeddingFiles = await embeddingFileFinder({
      cwd,
      task,
      dataDir,
      timeoutMs: fileEmbeddingTimeoutMs,
      embeddingOptions: fileEmbeddingOptions,
      limit: wideLimit
    });
  }

  const localCandidates = [...heuristicFiles, ...embeddingFiles];
  const importGraphFiles = expandImportGraph({
    cwd,
    seedFiles: mergeLocalFileCandidates(localCandidates).slice(0, limit),
    limit: wideLimit
  });

  const seedFiles = mergeLocalFileCandidates([...localCandidates, ...importGraphFiles])
    .slice(0, Math.max(limit * 3, 9));

  const graphFiles = findGraphRelevantFiles({
    cwd,
    task,
    rules,
    seedFiles,
    limit: wideLimit
  });

  return mergeRelevantFiles({ graphFiles, heuristicFiles: seedFiles, limit });
}
|
|
254
|
+
|
|
255
|
+
/**
 * Merge candidate lists that may mention the same path more than once.
 *
 * Per path: scores are summed, reasons are unioned in first-seen order, later
 * entries override other fields, and `source` becomes "import-graph" as soon
 * as any entry for that path came from the import graph (otherwise it is the
 * latest entry's `source`).
 *
 * @param {Array<{path: string, score?: number, reasons?: string[], source?: string}>} files
 * @returns {Array<object>} One entry per path, sorted by score descending then path.
 */
function mergeLocalFileCandidates(files) {
  const merged = new Map();
  for (const file of files) {
    const prior = merged.get(file.path);
    const fromImportGraph = prior?.source === "import-graph" || file.source === "import-graph";
    const combinedReasons = new Set([...(prior?.reasons || []), ...(file.reasons || [])]);
    merged.set(file.path, {
      ...prior,
      ...file,
      score: Number(prior?.score || 0) + Number(file.score || 0),
      reasons: [...combinedReasons],
      source: fromImportGraph ? "import-graph" : file.source
    });
  }
  const ranked = Array.from(merged.values());
  ranked.sort((a, b) => b.score - a.score || a.path.localeCompare(b.path));
  return ranked;
}
|
|
269
|
+
|
|
270
|
+
/**
 * Score how well a file path's tokens match the task's tokens.
 *
 * +3 per task token found in the path, +2 per alias found in the path,
 * +6 when the task mentions moderation (Vietnamese "duyet" forms) and the path
 * has a MODERATION_TOKENS entry, +2 when the task mentions upload and the path
 * has an upload-related token.
 *
 * Alias lookups consult only own keys of SEMANTIC_ALIASES, so a task word such
 * as "constructor" no longer resolves to an inherited, non-iterable member.
 *
 * @param {object} args
 * @param {Set<string>} args.rawTaskTokens - Tokens from the task.
 * @param {Set<string>} args.fileTokens - Tokens from the relative file path.
 * @returns {{score: number, reasons: string[]}} Total score and distinct reasons.
 */
function scoreFileTokens({ rawTaskTokens, fileTokens }) {
  let score = 0;
  const reasons = new Set();
  const hasModerationIntent = rawTaskTokens.has("kiem-duyet") || rawTaskTokens.has("kiemduyet") || rawTaskTokens.has("duyet");
  const hasUploadIntent = rawTaskTokens.has("upload") || rawTaskTokens.has("tai-len") || rawTaskTokens.has("tailen");

  for (const token of rawTaskTokens) {
    if (fileTokens.has(token)) {
      score += 3;
      reasons.add(token);
    }
    const aliases = Object.prototype.hasOwnProperty.call(SEMANTIC_ALIASES, token)
      ? SEMANTIC_ALIASES[token]
      : [];
    for (const alias of aliases) {
      if (fileTokens.has(alias)) {
        score += 2;
        reasons.add(`${token}->${alias}`);
      }
    }
  }

  if (hasModerationIntent && [...fileTokens].some((token) => MODERATION_TOKENS.has(token))) {
    score += 6;
    reasons.add("domain:moderation");
  }

  if (hasUploadIntent && (fileTokens.has("upload") || fileTokens.has("uploaded") || fileTokens.has("resource"))) {
    score += 2;
    reasons.add("domain:upload");
  }

  return { score, reasons: [...reasons] };
}
|
|
301
|
+
|
|
302
|
+
/**
 * Recursively visit regular files under a directory.
 *
 * Entries whose name starts with "." are skipped, except `.github` and
 * `.codex`. Directories listed in IGNORE_DIRS are not entered, recursion stops
 * below depth 6, and unreadable directories are skipped silently. Entries
 * that are neither plain files nor directories are ignored.
 *
 * @param {string} directory - Directory to scan.
 * @param {(filePath: string) => void} onFile - Called with each file's path.
 * @param {number} [depth=0] - Current recursion depth.
 */
function walkFiles(directory, onFile, depth = 0) {
  if (depth > 6) return;

  let entries;
  try {
    entries = fs.readdirSync(directory, { withFileTypes: true });
  } catch {
    return;
  }

  const allowedDotNames = [".github", ".codex"];
  for (const entry of entries) {
    const { name } = entry;
    const isHidden = name.startsWith(".") && !allowedDotNames.includes(name);
    if (isHidden) continue;

    const fullPath = path.join(directory, name);
    if (entry.isFile()) {
      onFile(fullPath);
    } else if (entry.isDirectory() && !IGNORE_DIRS.has(name)) {
      walkFiles(fullPath, onFile, depth + 1);
    }
  }
}
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
import fs from "node:fs";
|
|
2
|
+
import net from "node:net";
|
|
3
|
+
import os from "node:os";
|
|
4
|
+
import path from "node:path";
|
|
5
|
+
|
|
6
|
+
// Bridge timeout in milliseconds used when CONTEXTOS_MCP_BRIDGE_TIMEOUT_MS is not set.
const DEFAULT_TIMEOUT_MS = 1000;
|
|
7
|
+
|
|
8
|
+
/**
 * Build the path of the ctx-mcp bridge socket inside a data directory.
 *
 * @param {string} [dataDir=defaultDataDir()] - Plugin data directory.
 * @returns {string} `<dataDir>/ctx-mcp.sock`.
 */
export function ctxMcpSocketPath(dataDir = defaultDataDir()) {
  const socketFileName = "ctx-mcp.sock";
  return path.join(dataDir, socketFileName);
}
|
|
11
|
+
|
|
12
|
+
/**
 * Send one request to the ctx-mcp bridge socket and return its JSON reply.
 *
 * The payload is written as a single JSON line; the reply is everything the
 * server sends before it ends the connection, parsed as JSON (an empty reply
 * parses as `{}`).
 *
 * A timeout that is not a finite, non-negative number (for example a
 * non-numeric CONTEXTOS_MCP_BRIDGE_TIMEOUT_MS, which becomes NaN) falls back to
 * DEFAULT_TIMEOUT_MS. Node would otherwise coerce it to a 1 ms timer and the
 * call would time out immediately.
 *
 * @param {*} payload - JSON-serializable request.
 * @param {object} [options]
 * @param {string} [options.dataDir=defaultDataDir()] - Directory holding the socket.
 * @param {number} [options.timeoutMs] - Time allowed for the whole exchange, in ms.
 * @returns {Promise<*>} The parsed reply.
 * @throws {Error} When the socket file is missing, the connection fails, the
 *   reply is not valid JSON, or the timeout elapses.
 */
export async function callCtxScoreContext(payload, {
  dataDir = defaultDataDir(),
  timeoutMs = Number(process.env.CONTEXTOS_MCP_BRIDGE_TIMEOUT_MS || DEFAULT_TIMEOUT_MS)
} = {}) {
  const socketPath = ctxMcpSocketPath(dataDir);
  if (!fs.existsSync(socketPath)) {
    throw new Error(`ctx-mcp bridge socket not found: ${socketPath}`);
  }

  const requestedTimeoutMs = Number(timeoutMs);
  const effectiveTimeoutMs = Number.isFinite(requestedTimeoutMs) && requestedTimeoutMs >= 0
    ? requestedTimeoutMs
    : DEFAULT_TIMEOUT_MS;

  return new Promise((resolve, reject) => {
    const client = net.createConnection(socketPath);
    let raw = "";
    const timer = setTimeout(() => {
      client.destroy();
      reject(new Error(`ctx-mcp bridge timed out after ${effectiveTimeoutMs}ms`));
    }, effectiveTimeoutMs);

    client.on("connect", () => {
      client.write(`${JSON.stringify(payload)}\n`);
    });
    client.on("data", (chunk) => {
      raw += chunk.toString("utf8");
    });
    client.on("end", () => {
      clearTimeout(timer);
      try {
        resolve(JSON.parse(raw || "{}"));
      } catch (error) {
        reject(error);
      }
    });
    client.on("error", (error) => {
      clearTimeout(timer);
      reject(error);
    });
  });
}
|
|
49
|
+
|
|
50
|
+
/**
 * Resolve the default plugin data directory.
 *
 * Precedence: `PLUGIN_DATA`, then `<CODEX_HOME>/contextos`, then
 * `<home>/.codex/contextos`.
 *
 * @returns {string} The data directory path.
 */
function defaultDataDir() {
  const { PLUGIN_DATA, CODEX_HOME } = process.env;
  if (PLUGIN_DATA) return PLUGIN_DATA;
  const codexHome = CODEX_HOME || path.join(os.homedir(), ".codex");
  return path.join(codexHome, "contextos");
}
|
|
@@ -0,0 +1,248 @@
|
|
|
1
|
+
import crypto from "node:crypto";
|
|
2
|
+
import fs from "node:fs";
|
|
3
|
+
import os from "node:os";
|
|
4
|
+
import path from "node:path";
|
|
5
|
+
import { fileURLToPath } from "node:url";
|
|
6
|
+
|
|
7
|
+
// Model id given to the feature-extraction pipeline; also stored with each
// cached vector and mixed into every cache key.
const DEFAULT_MODEL = "Xenova/all-MiniLM-L6-v2";
|
|
8
|
+
// Embedding-scoring timeout in milliseconds used when CONTEXTOS_EMBEDDING_TIMEOUT_MS is not set.
const DEFAULT_TIMEOUT_MS = 800;
|
|
9
|
+
// Embedding score (0..1) at or above which enhanceRuleScores() lets the
// embedding score raise a rule's score.
const SEMANTIC_HIGH_THRESHOLD = 0.5;
|
|
10
|
+
|
|
11
|
+
// Memoized pipeline promises from getExtractor(), keyed by "<remote|local>:<model cache dir>".
const extractorPromises = new Map();
|
|
12
|
+
// Promise for the initialized sql.js module; created on the first getSql() call.
let sqlPromise = null;
|
|
13
|
+
|
|
14
|
+
// Directory containing this module, derived from import.meta.url.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
|
15
|
+
// Three directory levels above this module; getSql() looks for node_modules/sql.js/dist under it.
const repoRoot = path.resolve(__dirname, "..", "..", "..");
|
|
16
|
+
|
|
17
|
+
/**
 * Optionally raise rule scores using embedding similarity to the task.
 *
 * Never throws: every failure path returns the input rules unchanged with a
 * `status` explaining why.
 *
 * - "disabled": embeddings are off, the task is blank, or there are no rules.
 * - "cold-cache": remote model loading is not allowed and no `embeddings.db`
 *   exists in `dataDir`.
 * - "fallback": scoring threw or exceeded `timeoutMs`; `error` has the message.
 * - otherwise: whatever enhanceRuleScores() returns.
 *
 * @param {Array<object>} rules - Scored rules.
 * @param {string} task - Task description.
 * @param {object} [options]
 * @param {string} [options.dataDir] - Directory holding the cache and models.
 * @param {string[]} [options.sources=[]] - Source files mixed into cache keys.
 * @param {number} [options.timeoutMs] - Time budget in ms.
 * @param {boolean} [options.allowRemote] - True when CONTEXTOS_EMBEDDING_ALLOW_REMOTE is "1".
 * @param {boolean} [options.enabled] - False when CONTEXTOS_EMBEDDINGS is "0".
 * @returns {Promise<{rules: Array<object>, status: string}>} Rules plus status details.
 */
export async function enhanceRuleScoresWithEmbeddings(
  rules,
  task,
  {
    dataDir = path.join(os.homedir(), ".codex", "contextos"),
    sources = [],
    timeoutMs = Number(process.env.CONTEXTOS_EMBEDDING_TIMEOUT_MS || DEFAULT_TIMEOUT_MS),
    allowRemote = process.env.CONTEXTOS_EMBEDDING_ALLOW_REMOTE === "1",
    enabled = process.env.CONTEXTOS_EMBEDDINGS !== "0"
  } = {}
) {
  const taskIsBlank = String(task || "").trim() === "";
  if (!enabled || taskIsBlank || !rules?.length) {
    return { rules, status: "disabled" };
  }

  const cachePath = path.join(dataDir, "embeddings.db");
  if (!allowRemote && !fs.existsSync(cachePath)) {
    return { rules, status: "cold-cache", cachePath };
  }

  let outcome;
  try {
    const scoring = enhanceRuleScores(rules, task, { dataDir, sources, allowRemote });
    outcome = await withTimeout(scoring, timeoutMs);
  } catch (error) {
    outcome = {
      rules,
      status: "fallback",
      error: error?.message || String(error)
    };
  }
  return outcome;
}
|
|
49
|
+
|
|
50
|
+
/**
 * Pre-compute and cache embeddings for the task and every rule.
 *
 * The cache is closed in a `finally` block, so a failure while loading the
 * model or embedding a text no longer leaves the database handle open.
 * Nullish texts are skipped instead of being stringified to "null" or
 * "undefined".
 *
 * @param {object} [options]
 * @param {Array<{content?: string}>} [options.rules=[]] - Rules to embed.
 * @param {string} [options.task=""] - Task to embed.
 * @param {string} [options.dataDir] - Directory holding the cache and models.
 * @param {string[]} [options.sources=[]] - Source files mixed into cache keys.
 * @param {boolean} [options.allowRemote=true] - Whether the model may be fetched remotely.
 * @returns {Promise<{count: number, cachePath: string}>} Number of distinct
 *   texts processed and the cache file path.
 */
export async function warmRuleEmbeddings({
  rules = [],
  task = "",
  dataDir = path.join(os.homedir(), ".codex", "contextos"),
  sources = [],
  allowRemote = true
} = {}) {
  const texts = [...new Set([
    task,
    ...rules.map((rule) => rule.content || "")
  ].filter((text) => String(text ?? "").trim()))];

  const cache = await openEmbeddingCache(dataDir);
  try {
    const embedder = await getExtractor({ allowRemote, dataDir });
    // Sequential on purpose: every cache.set() rewrites the cache file.
    for (const text of texts) {
      await getCachedEmbedding({ cache, embedder, text, sources });
    }
  } finally {
    cache.close();
  }
  return { count: texts.length, cachePath: cache.path };
}
|
|
70
|
+
|
|
71
|
+
/**
 * Re-score rules using embedding similarity between the task and each rule.
 *
 * A rule's score becomes max(current score, embedding score) when the
 * embedding score reaches SEMANTIC_HIGH_THRESHOLD; otherwise it is unchanged.
 * An `embedding:<score>` reason is added at 0.45 and above.
 *
 * The cache is closed in a `finally` block, so a failure while loading the
 * model or embedding a text no longer leaves the database handle open.
 *
 * NOTE(review): similarityToScore() maps a cosine of 0 to 0.5, so any
 * non-negative cosine meets the 0.5 threshold. Confirm that floor is intended.
 *
 * @param {Array<object>} rules - Scored rules.
 * @param {string} task - Task description.
 * @param {object} options
 * @param {string} options.dataDir - Directory holding the cache and models.
 * @param {string[]} options.sources - Source files mixed into cache keys.
 * @param {boolean} options.allowRemote - Whether the model may be fetched remotely.
 * @returns {Promise<{rules: Array<object>, status: string, model: string, cachePath: string}>}
 *   Rules sorted by score descending, ties broken by `originalOrder`.
 */
async function enhanceRuleScores(rules, task, { dataDir, sources, allowRemote }) {
  const cache = await openEmbeddingCache(dataDir);
  try {
    const embedder = await getExtractor({ allowRemote, dataDir });
    const taskEmbedding = await getCachedEmbedding({ cache, embedder, text: task, sources });

    const enhanced = [];
    for (const rule of rules) {
      const ruleEmbedding = await getCachedEmbedding({
        cache,
        embedder,
        text: rule.content || "",
        sources
      });
      const similarity = cosine(taskEmbedding, ruleEmbedding);
      const semanticScore = similarityToScore(similarity);
      const baseScore = Number(rule.score || 0);
      const score = semanticScore >= SEMANTIC_HIGH_THRESHOLD
        ? Math.max(baseScore, semanticScore)
        : baseScore;

      enhanced.push({
        ...rule,
        score: Math.max(0, Math.min(1, Number(score.toFixed(3)))),
        embeddingScore: Number(semanticScore.toFixed(3)),
        reasons: semanticScore >= 0.45
          ? [...new Set([...(rule.reasons || []), `embedding:${semanticScore.toFixed(2)}`])]
          : rule.reasons
      });
    }

    return {
      rules: enhanced.sort((a, b) => b.score - a.score || a.originalOrder - b.originalOrder),
      status: "enabled",
      model: DEFAULT_MODEL,
      cachePath: cache.path
    };
  } finally {
    cache.close();
  }
}
|
|
109
|
+
|
|
110
|
+
/**
 * Load (once per configuration) the feature-extraction pipeline.
 *
 * Pipelines are memoized per "<remote|local>:<model cache dir>" key. A load
 * that fails is removed from the memo so a later call can retry; previously
 * the rejected promise was cached and every later call failed the same way for
 * the life of the process.
 *
 * NOTE(review): each load writes `allowRemoteModels`, `allowLocalModels` and
 * `cacheDir` on the shared `transformers.env` object, so loads with different
 * options overwrite each other's settings.
 *
 * @param {object} options
 * @param {boolean} options.allowRemote - Whether remote model loading is allowed.
 * @param {string} options.dataDir - Data directory containing the model cache.
 * @returns {Promise<Function>} The pipeline returned by `transformers.pipeline`.
 */
async function getExtractor({ allowRemote, dataDir }) {
  const cacheDir = modelCacheDir(dataDir);
  const key = `${allowRemote ? "remote" : "local"}:${cacheDir}`;

  let pending = extractorPromises.get(key);
  if (!pending) {
    pending = (async () => {
      const transformers = await import("@xenova/transformers");
      transformers.env.allowRemoteModels = Boolean(allowRemote);
      transformers.env.allowLocalModels = true;
      transformers.env.cacheDir = cacheDir;
      return transformers.pipeline("feature-extraction", DEFAULT_MODEL, {
        quantized: true
      });
    })();
    extractorPromises.set(key, pending);
    // Evict a failed load. The caller still sees the rejection through `pending`.
    pending.catch(() => {
      if (extractorPromises.get(key) === pending) extractorPromises.delete(key);
    });
  }
  return pending;
}
|
|
126
|
+
|
|
127
|
+
/**
 * Directory where downloaded model files are cached.
 *
 * @param {string} [dataDir] - Data directory; defaults to `<home>/.codex/contextos`.
 * @returns {string} `<dataDir>/models`.
 */
export function modelCacheDir(dataDir = path.join(os.homedir(), ".codex", "contextos")) {
  const modelsFolder = "models";
  return path.join(dataDir, modelsFolder);
}
|
|
130
|
+
|
|
131
|
+
/**
 * Return the embedding for a text, computing and caching it on a miss.
 *
 * @param {object} args
 * @param {{get: Function, set: Function}} args.cache - Embedding cache.
 * @param {Function} args.embedder - Pipeline called as `embedder(text, options)`.
 * @param {string} args.text - Text to embed; falsy values embed as "".
 * @param {string[]} args.sources - Source files mixed into the cache key.
 * @returns {Promise<number[]>} The cached or newly computed vector.
 */
async function getCachedEmbedding({ cache, embedder, text, sources }) {
  const key = cacheKey(text, sources);

  const cachedVector = cache.get(key);
  if (cachedVector) return cachedVector;

  const pipelineOptions = { pooling: "mean", normalize: true };
  const output = await embedder(String(text || ""), pipelineOptions);
  const vector = Array.from(output.data || []);
  cache.set(key, vector);
  return vector;
}
|
|
144
|
+
|
|
145
|
+
/**
 * Open (or create) the embedding cache stored at `<dataDir>/embeddings.db`.
 *
 * The database is loaded into memory through sql.js. Every `set()` and the
 * final `close()` export the whole database back to the file.
 *
 * @param {string} dataDir - Directory holding the cache; created if missing.
 * @returns {Promise<{path: string, get: Function, set: Function, close: Function}>}
 *   `get(key)` returns the stored vector or null, `set(key, vector)` stores and
 *   persists a vector, `close()` persists and releases the database.
 */
async function openEmbeddingCache(dataDir) {
  fs.mkdirSync(dataDir, { recursive: true });
  const cachePath = path.join(dataDir, "embeddings.db");
  const SQL = await getSql();
  const buffer = fs.existsSync(cachePath) ? fs.readFileSync(cachePath) : null;
  // A missing or zero-length file starts a fresh in-memory database.
  const db = buffer?.length ? new SQL.Database(buffer) : new SQL.Database();

  db.run(`
CREATE TABLE IF NOT EXISTS embeddings (
key TEXT PRIMARY KEY,
model TEXT NOT NULL,
vector TEXT NOT NULL,
updated_at TEXT NOT NULL
)
`);

  const persist = () => {
    fs.writeFileSync(cachePath, Buffer.from(db.export()));
  };

  return {
    path: cachePath,
    get(key) {
      const stmt = db.prepare("SELECT vector FROM embeddings WHERE key = ? AND model = ?");
      try {
        stmt.bind([key, DEFAULT_MODEL]);
        return stmt.step() ? JSON.parse(stmt.getAsObject().vector) : null;
      } finally {
        stmt.free();
      }
    },
    set(key, vector) {
      db.run(
        "INSERT OR REPLACE INTO embeddings (key, model, vector, updated_at) VALUES (?, ?, ?, ?)",
        [key, DEFAULT_MODEL, JSON.stringify(vector), new Date().toISOString()]
      );
      persist();
    },
    close() {
      persist();
      db.close();
    }
  };
}
|
|
186
|
+
|
|
187
|
+
/**
 * Load and initialize sql.js once per process.
 *
 * The init promise is memoized in `sqlPromise`. A failed initialization resets
 * the memo so a later call can retry; previously the rejected promise was kept
 * and every later call failed the same way.
 *
 * @returns {Promise<object>} The initialized sql.js module (exposes `Database`).
 */
async function getSql() {
  if (!sqlPromise) {
    const loading = (async () => {
      const initSqlJs = (await import("sql.js")).default;
      return initSqlJs({
        // Supporting files are looked up in the package's own node_modules.
        locateFile: (file) => path.join(repoRoot, "node_modules", "sql.js", "dist", file)
      });
    })();
    sqlPromise = loading;
    // Reset after a failure. The caller still sees the rejection through `loading`.
    loading.catch(() => {
      if (sqlPromise === loading) sqlPromise = null;
    });
  }
  return sqlPromise;
}
|
|
198
|
+
|
|
199
|
+
/**
 * Build the cache key for a text: the SHA-256 hex digest of the model id, the
 * text, and the fingerprint of the source files, separated by NUL bytes.
 *
 * @param {string} text - Text being embedded; falsy values hash as "".
 * @param {string[]} sources - Source files whose state invalidates the entry.
 * @returns {string} Hex digest.
 */
function cacheKey(text, sources) {
  const hash = crypto.createHash("sha256");
  const pieces = [
    DEFAULT_MODEL,
    "\0",
    String(text || ""),
    "\0",
    sourceFingerprint(sources)
  ];
  for (const piece of pieces) hash.update(piece);
  return hash.digest("hex");
}
|
|
209
|
+
|
|
210
|
+
/**
 * Summarize the state of the source files as one string.
 *
 * Each readable source contributes `path:mtimeMs:size`; a source that cannot
 * be stat'ed contributes just its path. Parts are joined with "|".
 *
 * @param {Iterable<string>|null|undefined} sources - Source file paths.
 * @returns {string} The fingerprint ("" when there are no sources).
 */
function sourceFingerprint(sources) {
  const describe = (source) => {
    try {
      const { mtimeMs, size } = fs.statSync(source);
      return `${source}:${mtimeMs}:${size}`;
    } catch {
      return String(source);
    }
  };
  return Array.from(sources || [], (source) => describe(source)).join("|");
}
|
|
222
|
+
|
|
223
|
+
/**
 * Cosine similarity of two numeric vectors.
 *
 * Only the first min(a.length, b.length) components are compared. Returns 0
 * when either vector is missing, empty, or has zero magnitude over that range.
 *
 * @param {ArrayLike<number>|null|undefined} a - First vector.
 * @param {ArrayLike<number>|null|undefined} b - Second vector.
 * @returns {number} Similarity in [-1, 1], or 0 for degenerate input.
 */
function cosine(a, b) {
  const length = Math.min(a?.length || 0, b?.length || 0);
  let dot = 0;
  let normA = 0;
  let normB = 0;

  let index = 0;
  while (index < length) {
    const x = a[index];
    const y = b[index];
    dot += x * y;
    normA += x * x;
    normB += y * y;
    index += 1;
  }

  if (!normA || !normB) return 0;
  return dot / (Math.sqrt(normA) * Math.sqrt(normB));
}
|
|
236
|
+
|
|
237
|
+
/**
 * Map a cosine similarity from [-1, 1] onto a score in [0, 1].
 *
 * @param {number} similarity - Cosine similarity.
 * @returns {number} `(similarity + 1) / 2`, clamped to [0, 1].
 */
function similarityToScore(similarity) {
  const shifted = (similarity + 1) / 2;
  const capped = Math.min(1, shifted);
  return Math.max(0, capped);
}
|
|
240
|
+
|
|
241
|
+
/**
 * Settle with `promise`, or reject if it takes longer than `timeoutMs`.
 *
 * The timer is cleared as soon as the race settles. Previously it stayed
 * armed, which kept the event loop alive for the rest of the timeout and fired
 * a late, ignored rejection after the work had already finished.
 *
 * The underlying work is not cancelled on timeout; only the wait is abandoned.
 *
 * @param {Promise<*>} promise - Work to wait for.
 * @param {number} timeoutMs - Maximum wait in milliseconds.
 * @returns {Promise<*>} Resolves or rejects like `promise`, or rejects with
 *   "embedding scorer timed out after <timeoutMs>ms".
 */
function withTimeout(promise, timeoutMs) {
  let timer;
  const deadline = new Promise((_, reject) => {
    timer = setTimeout(() => reject(new Error(`embedding scorer timed out after ${timeoutMs}ms`)), timeoutMs);
  });
  return Promise.race([promise, deadline]).finally(() => clearTimeout(timer));
}
|