chainlesschain 0.37.8 → 0.37.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +403 -8
- package/bin/chainlesschain.js +4 -0
- package/package.json +7 -2
- package/src/commands/agent.js +30 -0
- package/src/commands/ask.js +114 -0
- package/src/commands/audit.js +286 -0
- package/src/commands/auth.js +387 -0
- package/src/commands/browse.js +184 -0
- package/src/commands/chat.js +35 -0
- package/src/commands/db.js +152 -0
- package/src/commands/did.js +376 -0
- package/src/commands/encrypt.js +233 -0
- package/src/commands/export.js +125 -0
- package/src/commands/git.js +215 -0
- package/src/commands/import.js +259 -0
- package/src/commands/instinct.js +202 -0
- package/src/commands/llm.js +288 -0
- package/src/commands/mcp.js +302 -0
- package/src/commands/memory.js +282 -0
- package/src/commands/note.js +489 -0
- package/src/commands/org.js +505 -0
- package/src/commands/p2p.js +274 -0
- package/src/commands/plugin.js +398 -0
- package/src/commands/search.js +237 -0
- package/src/commands/session.js +238 -0
- package/src/commands/skill.js +479 -0
- package/src/commands/sync.js +249 -0
- package/src/commands/tokens.js +214 -0
- package/src/commands/wallet.js +416 -0
- package/src/index.js +65 -0
- package/src/lib/audit-logger.js +364 -0
- package/src/lib/bm25-search.js +322 -0
- package/src/lib/browser-automation.js +216 -0
- package/src/lib/crypto-manager.js +246 -0
- package/src/lib/did-manager.js +270 -0
- package/src/lib/ensure-utf8.js +59 -0
- package/src/lib/git-integration.js +220 -0
- package/src/lib/instinct-manager.js +190 -0
- package/src/lib/knowledge-exporter.js +302 -0
- package/src/lib/knowledge-importer.js +293 -0
- package/src/lib/llm-providers.js +325 -0
- package/src/lib/mcp-client.js +413 -0
- package/src/lib/memory-manager.js +211 -0
- package/src/lib/note-versioning.js +244 -0
- package/src/lib/org-manager.js +424 -0
- package/src/lib/p2p-manager.js +317 -0
- package/src/lib/pdf-parser.js +96 -0
- package/src/lib/permission-engine.js +374 -0
- package/src/lib/plan-mode.js +333 -0
- package/src/lib/platform.js +15 -0
- package/src/lib/plugin-manager.js +312 -0
- package/src/lib/response-cache.js +156 -0
- package/src/lib/session-manager.js +189 -0
- package/src/lib/sync-manager.js +347 -0
- package/src/lib/token-tracker.js +200 -0
- package/src/lib/wallet-manager.js +348 -0
- package/src/repl/agent-repl.js +912 -0
- package/src/repl/chat-repl.js +262 -0
- package/src/runtime/bootstrap.js +159 -0
|
@@ -0,0 +1,364 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Audit Logger — records security events and operations for compliance.
|
|
3
|
+
* Provides event logging, querying, statistics, and export.
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
import crypto from "crypto";
|
|
7
|
+
|
|
8
|
+
/**
 * Event categories stored in audit_log.event_type.
 * Frozen so that importing modules cannot alter the shared values.
 */
export const EVENT_TYPES = Object.freeze({
  AUTH: "auth",
  PERMISSION: "permission",
  DATA: "data",
  SYSTEM: "system",
  FILE: "file",
  DID: "did",
  CRYPTO: "crypto",
  API: "api",
});

/**
 * Risk levels stored in audit_log.risk_level.
 * Frozen so that importing modules cannot alter the shared values.
 */
export const RISK_LEVELS = Object.freeze({
  LOW: "low",
  MEDIUM: "medium",
  HIGH: "high",
  CRITICAL: "critical",
});

/**
 * Operation names that are rated high risk regardless of event type.
 */
const HIGH_RISK_OPERATIONS = new Set([
  "delete_identity",
  "grant_admin",
  "revoke_all",
  "delete_role",
  "db_encrypt",
  "db_decrypt",
  "config_change",
  "export_secrets",
  "bulk_delete",
  "password_reset",
  "schema_change",
]);
|
|
48
|
+
|
|
49
|
+
/**
 * Create the audit_log table and its lookup indexes when they are missing.
 * Every statement uses IF NOT EXISTS, so calling this repeatedly is harmless.
 *
 * @param {object} db - Database handle exposing `exec(sql)`.
 */
export function ensureAuditTables(db) {
  const createTable = `
    CREATE TABLE IF NOT EXISTS audit_log (
      id TEXT PRIMARY KEY,
      event_type TEXT NOT NULL,
      operation TEXT NOT NULL,
      actor TEXT,
      target TEXT,
      details TEXT,
      risk_level TEXT DEFAULT 'low',
      ip_address TEXT,
      user_agent TEXT,
      success INTEGER DEFAULT 1,
      error_message TEXT,
      created_at TEXT DEFAULT (datetime('now'))
    )
  `;

  // One index per column that queryLogs/getStatistics filter or sort on.
  const createIndexes = [
    `
    CREATE INDEX IF NOT EXISTS idx_audit_event_type ON audit_log(event_type)
  `,
    `
    CREATE INDEX IF NOT EXISTS idx_audit_created_at ON audit_log(created_at)
  `,
    `
    CREATE INDEX IF NOT EXISTS idx_audit_risk_level ON audit_log(risk_level)
  `,
    `
    CREATE INDEX IF NOT EXISTS idx_audit_actor ON audit_log(actor)
  `,
  ];

  // The table must exist before its indexes, so it is executed first.
  for (const statement of [createTable, ...createIndexes]) {
    db.exec(statement);
  }
}
|
|
83
|
+
|
|
84
|
+
/**
 * Assess the risk level of an operation.
 *
 * Rules, highest level first:
 *   - HIGH   when the operation is listed in HIGH_RISK_OPERATIONS, or when
 *            `details.bulkCount` is greater than 100;
 *   - MEDIUM for AUTH events whose operation name contains "fail", and for
 *            every PERMISSION event;
 *   - LOW    otherwise.
 *
 * @param {string} eventType - One of the EVENT_TYPES values.
 * @param {string} [operation] - Operation name; a missing or non-string value
 *   simply matches no operation-based rule instead of throwing.
 * @param {object} [details] - Optional event details (only `bulkCount` is read).
 * @returns {string} One of the RISK_LEVELS values.
 */
export function assessRisk(eventType, operation, details) {
  // Guard against undefined/non-string operations before calling String methods.
  const operationName = typeof operation === "string" ? operation : "";

  const isBulk =
    details !== null &&
    typeof details === "object" &&
    Boolean(details.bulkCount) &&
    details.bulkCount > 100;

  // HIGH rules are evaluated before MEDIUM ones so that a large bulk operation
  // is never downgraded just because of its event type.
  if (HIGH_RISK_OPERATIONS.has(operation) || isBulk) {
    return RISK_LEVELS.HIGH;
  }

  if (eventType === EVENT_TYPES.AUTH && operationName.includes("fail")) {
    return RISK_LEVELS.MEDIUM;
  }

  if (eventType === EVENT_TYPES.PERMISSION) {
    return RISK_LEVELS.MEDIUM;
  }

  return RISK_LEVELS.LOW;
}
|
|
108
|
+
|
|
109
|
+
/** Placeholder written in place of a sensitive value. */
const REDACTED_PLACEHOLDER = "[REDACTED]";

/**
 * Sensitive key names in normalized form (lower-case, "_" and "-" removed),
 * so "secretKey", "secret_key" and "SECRET-KEY" all match "secretkey".
 */
const SENSITIVE_DETAIL_KEYS = new Set([
  "password",
  "secret",
  "secretkey",
  "privatekey",
  "token",
  "apikey",
  "mnemonic",
]);

/** True when `key` names a value that must not be written to the audit log. */
function isSensitiveDetailKey(key) {
  return SENSITIVE_DETAIL_KEYS.has(key.toLowerCase().replace(/[_-]/g, ""));
}

/** True for object literals / null-prototype objects (not Date, Map, Buffer...). */
function isPlainRecord(value) {
  if (value === null || typeof value !== "object") return false;
  const proto = Object.getPrototypeOf(value);
  return proto === Object.prototype || proto === null;
}

/** Copy the entries of `source`, redacting sensitive keys and recursing into values. */
function redactRecord(source, ancestors) {
  // Object.fromEntries defines plain data properties, so an own "__proto__"
  // key in the input cannot change the prototype of the copy.
  return Object.fromEntries(
    Object.entries(source).map(([key, value]) => [
      key,
      value && isSensitiveDetailKey(key)
        ? REDACTED_PLACEHOLDER
        : redactNested(value, ancestors),
    ]),
  );
}

/** Recursively sanitize arrays and plain objects; every other value is returned as is. */
function redactNested(value, ancestors) {
  const isList = Array.isArray(value);
  if (!isList && !isPlainRecord(value)) return value;

  // A value that is its own ancestor would recurse forever.
  if (ancestors.has(value)) return "[Circular]";

  ancestors.add(value);
  const copy = isList
    ? value.map((item) => redactNested(item, ancestors))
    : redactRecord(value, ancestors);
  ancestors.delete(value);
  return copy;
}

/**
 * Sanitize sensitive data from log details.
 *
 * Returns a copy of `details` in which every truthy value stored under a
 * sensitive key (password, secret, secretKey, privateKey, token, apiKey,
 * mnemonic — matched case-insensitively, ignoring "_" and "-") is replaced by
 * "[REDACTED]". Nested plain objects and arrays are sanitized as well; the
 * input is never mutated.
 *
 * @param {*} details - Event details; non-object and falsy values are returned unchanged.
 * @returns {*} Sanitized copy of `details`.
 */
export function sanitizeDetails(details) {
  if (!details || typeof details !== "object") return details;

  const ancestors = new WeakSet();
  if (Array.isArray(details)) return redactNested(details, ancestors);

  ancestors.add(details);
  return redactRecord(details, ancestors);
}
|
|
137
|
+
|
|
138
|
+
/**
 * Log an audit event.
 *
 * Inserts one row into audit_log. Missing fields fall back to defaults
 * (event type "system", operation "unknown", success = true) and the details
 * are passed through sanitizeDetails() before being stored as JSON.
 *
 * @param {object} db - Database handle exposing `exec` and `prepare(sql).run(...)`.
 * @param {object} event - Event description.
 * @param {string} [event.eventType] - One of the EVENT_TYPES values.
 * @param {string} [event.operation] - Operation name.
 * @param {string} [event.actor]
 * @param {string} [event.target]
 * @param {*} [event.details] - Extra data, stored as JSON.
 * @param {string} [event.riskLevel] - Explicit risk level; assessed automatically when omitted.
 * @param {string} [event.ipAddress]
 * @param {string} [event.userAgent]
 * @param {boolean} [event.success] - Only an explicit `false` is stored as a failure.
 * @param {string} [event.errorMessage]
 * @returns {{id: string, riskLevel: string, createdAt: string}} The new row id,
 *   the stored risk level, and a timestamp taken from the JS clock (it is not
 *   read back from the database row).
 */
export function logEvent(db, event) {
  ensureAuditTables(db);

  // Apply the defaults first so that the risk assessment sees the same values
  // that are written to the row (and never an undefined operation).
  const eventType = event.eventType || EVENT_TYPES.SYSTEM;
  const operation = event.operation || "unknown";

  const id = crypto.randomUUID();
  const storedDetails = sanitizeDetails(event.details);
  // Risk is assessed on the unsanitized details, as before.
  const riskLevel =
    event.riskLevel || assessRisk(eventType, operation, event.details);

  db.prepare(
    `INSERT INTO audit_log (id, event_type, operation, actor, target, details, risk_level, ip_address, user_agent, success, error_message)
     VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
  ).run(
    id,
    eventType,
    operation,
    event.actor || null,
    event.target || null,
    storedDetails ? JSON.stringify(storedDetails) : null,
    riskLevel,
    event.ipAddress || null,
    event.userAgent || null,
    event.success !== false ? 1 : 0,
    event.errorMessage || null,
  );

  return { id, riskLevel, createdAt: new Date().toISOString() };
}
|
|
169
|
+
|
|
170
|
+
/**
 * Decode the JSON stored in audit_log.details.
 * A row whose details are not valid JSON yields the raw string instead of
 * making the whole query throw.
 */
function parseStoredDetails(raw) {
  if (!raw) return null;
  try {
    return JSON.parse(raw);
  } catch {
    // Deliberate fallback: one corrupt row must not hide every other log entry.
    return raw;
  }
}

/**
 * Query audit logs with filters.
 *
 * @param {object} db - Database handle exposing `exec` and `prepare(sql).all(...)`.
 * @param {object} [filters]
 * @param {string} [filters.eventType] - Exact event type.
 * @param {string} [filters.operation] - Substring of the operation name.
 * @param {string} [filters.actor] - Substring of the actor.
 * @param {string} [filters.riskLevel] - Exact risk level.
 * @param {boolean} [filters.success] - Filter by outcome when defined.
 * @param {string} [filters.startDate] - Inclusive lower bound on created_at.
 * @param {string} [filters.endDate] - Inclusive upper bound on created_at.
 * @param {string} [filters.search] - Substring searched in the operation field.
 * @param {number} [filters.limit] - Maximum number of rows.
 * @param {number} [filters.offset] - Rows to skip; also works without `limit`.
 * @returns {Array<object>} Rows, newest first, with `details` decoded from
 *   JSON and `success` converted to a boolean.
 */
export function queryLogs(db, filters = {}) {
  ensureAuditTables(db);

  const conditions = [];
  const params = [];
  const where = (condition, value) => {
    conditions.push(condition);
    params.push(value);
  };

  if (filters.eventType) where("event_type = ?", filters.eventType);
  if (filters.operation) where("operation LIKE ?", `%${filters.operation}%`);
  if (filters.actor) where("actor LIKE ?", `%${filters.actor}%`);
  if (filters.riskLevel) where("risk_level = ?", filters.riskLevel);
  if (filters.success !== undefined) where("success = ?", filters.success ? 1 : 0);
  if (filters.startDate) where("created_at >= ?", filters.startDate);
  if (filters.endDate) where("created_at <= ?", filters.endDate);
  // Search in operation field (primary search field for CLI)
  if (filters.search) where("operation LIKE ?", `%${filters.search}%`);

  let sql = "SELECT * FROM audit_log WHERE 1=1";
  for (const condition of conditions) {
    sql += ` AND ${condition}`;
  }
  sql += " ORDER BY created_at DESC";

  if (filters.limit) {
    sql += " LIMIT ?";
    params.push(filters.limit);
  } else if (filters.offset) {
    // SQLite only accepts OFFSET after a LIMIT clause; a negative limit means
    // "no upper bound".
    sql += " LIMIT -1";
  }

  if (filters.offset) {
    sql += " OFFSET ?";
    params.push(filters.offset);
  }

  return db
    .prepare(sql)
    .all(...params)
    .map((row) => ({
      ...row,
      details: parseStoredDetails(row.details),
      success: row.success === 1,
    }));
}
|
|
239
|
+
|
|
240
|
+
/**
 * Get audit statistics, optionally restricted to a created_at range.
 *
 * @param {object} db - Database handle exposing `exec` and `prepare(sql).get/all(...)`.
 * @param {string} [startDate] - Inclusive lower bound on created_at.
 * @param {string} [endDate] - Inclusive upper bound on created_at.
 * @returns {{total: number, failures: number, highRisk: number,
 *   byEventType: Object<string, number>, byRiskLevel: Object<string, number>}}
 *   `highRisk` is the number of rows rated "high" or "critical".
 */
export function getStatistics(db, startDate, endDate) {
  ensureAuditTables(db);

  let dateFilter = "";
  const params = [];

  if (startDate) {
    dateFilter += " AND created_at >= ?";
    params.push(startDate);
  }
  if (endDate) {
    dateFilter += " AND created_at <= ?";
    params.push(endDate);
  }

  // Every query shares the same date filter and bound parameters.
  const countWhere = (condition) =>
    db
      .prepare(`SELECT COUNT(*) as c FROM audit_log WHERE ${condition} ${dateFilter}`)
      .get(...params).c;

  const countGroupedBy = (column) =>
    Object.fromEntries(
      db
        .prepare(
          `SELECT ${column}, COUNT(*) as count FROM audit_log WHERE 1=1 ${dateFilter} GROUP BY ${column} ORDER BY count DESC`,
        )
        .all(...params)
        .map((row) => [row[column], row.count]),
    );

  const total = countWhere("1=1");
  const failures = countWhere("success = 0");
  const byEventType = countGroupedBy("event_type");
  const byRiskLevel = countGroupedBy("risk_level");

  // The grouped counts already contain the high/critical totals, so no extra
  // table scans are needed for them.
  const highRisk = (byRiskLevel.high || 0) + (byRiskLevel.critical || 0);

  return { total, failures, highRisk, byEventType, byRiskLevel };
}
|
|
304
|
+
|
|
305
|
+
/** Columns written to the CSV export, in order. */
const CSV_EXPORT_COLUMNS = [
  "id",
  "event_type",
  "operation",
  "actor",
  "target",
  "risk_level",
  "success",
  "error_message",
  "created_at",
];

/**
 * Render one value as a CSV field. Values containing a comma, a double quote
 * or a line break are wrapped in double quotes with inner quotes doubled;
 * null/undefined become an empty field.
 */
function toCsvField(value) {
  if (value === null || value === undefined) return "";
  const text = String(value);
  return /[",\r\n]/.test(text) ? `"${text.replace(/"/g, '""')}"` : text;
}

/**
 * Export audit logs as JSON or CSV.
 *
 * @param {object} db - Database handle passed through to queryLogs().
 * @param {string} [format="json"] - "csv" for CSV; any other value yields JSON.
 * @param {object} [filters] - Same filters as queryLogs(); `limit` defaults to 10000.
 * @returns {string} Pretty-printed JSON array, or CSV text with a header row.
 *   The CSV contains only the columns listed in CSV_EXPORT_COLUMNS (details,
 *   ip_address and user_agent are not part of it).
 */
export function exportLogs(db, format = "json", filters = {}) {
  const logs = queryLogs(db, { ...filters, limit: filters.limit || 10000 });

  if (format !== "csv") {
    return JSON.stringify(logs, null, 2);
  }

  const lines = [
    CSV_EXPORT_COLUMNS.join(","),
    ...logs.map((log) =>
      CSV_EXPORT_COLUMNS.map((column) => toCsvField(log[column])).join(","),
    ),
  ];
  return lines.join("\n");
}
|
|
342
|
+
|
|
343
|
+
/**
 * Delete old audit logs.
 *
 * Removes every row whose created_at is earlier than `daysToKeep` days before
 * now. The cutoff is computed in UTC and formatted as "YYYY-MM-DD HH:MM:SS",
 * the same shape SQLite's datetime('now') default writes into created_at.
 *
 * @param {object} db - Database handle exposing `exec` and `prepare(sql).run(...)`.
 * @param {number} [daysToKeep=90] - Retention period in days; must be a
 *   non-negative finite number (numeric strings are accepted).
 * @returns {number} Number of deleted rows.
 * @throws {RangeError} When `daysToKeep` is negative or not a finite number.
 *   A negative value would place the cutoff in the future and delete every log.
 */
export function purgeLogs(db, daysToKeep = 90) {
  const days = Number(daysToKeep);
  if (!Number.isFinite(days) || days < 0) {
    throw new RangeError(
      `daysToKeep must be a non-negative number, got ${String(daysToKeep)}`,
    );
  }

  ensureAuditTables(db);

  // Millisecond arithmetic on the UTC timestamp avoids the local-time/DST
  // drift of Date#setDate.
  const MS_PER_DAY = 24 * 60 * 60 * 1000;
  const cutoff = new Date(Date.now() - days * MS_PER_DAY);
  const cutoffStr = cutoff.toISOString().replace("T", " ").slice(0, 19);

  const result = db
    .prepare("DELETE FROM audit_log WHERE created_at < ?")
    .run(cutoffStr);
  return result.changes;
}
|
|
358
|
+
|
|
359
|
+
/**
 * Fetch the latest audit events.
 *
 * Delegates to queryLogs() with only a row limit.
 *
 * @param {object} db - Database handle passed through to queryLogs().
 * @param {number} [limit=20] - Maximum number of events to return.
 * @returns {Array<object>} Whatever queryLogs() returns for that limit.
 */
export function getRecentEvents(db, limit = 20) {
  const onlyLimit = { limit };
  return queryLogs(db, onlyLimit);
}
|
|
@@ -0,0 +1,322 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* BM25 search engine for CLI
|
|
3
|
+
*
|
|
4
|
+
* Implements Okapi BM25 ranking algorithm for keyword-based search.
|
|
5
|
+
* Lightweight port of desktop-app-vue/src/main/rag/bm25-search.js
|
|
6
|
+
* No external dependencies — uses simple tokenization.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
/**
 * English stop words dropped by tokenize().
 * Built once at module load instead of on every tokenize() call.
 */
const ENGLISH_STOP_WORDS = new Set([
  "a", "an", "the", "is", "are", "was", "were", "be", "been", "being",
  "have", "has", "had", "do", "does", "did", "will", "would", "could",
  "should", "may", "might", "shall", "can", "to", "of", "in", "for", "on",
  "with", "at", "by", "from", "as", "into", "through", "during", "before",
  "after", "above", "below", "between", "but", "and", "or", "not", "no",
  "nor", "so", "if", "then", "than", "too", "very", "just", "about", "up",
  "out", "it", "its", "this", "that", "these", "those", "i", "me", "my",
  "we", "our", "you", "your", "he", "him", "his", "she", "her", "they",
  "them", "their", "what", "which",
]);

/**
 * Simple tokenizer for text.
 * Handles both Chinese and English text.
 *
 * - "zh": every CJK character (U+4E00–U+9FFF), followed by the bigrams of
 *   consecutive CJK characters, followed by the lower-cased [a-z0-9]+ runs.
 * - otherwise: lower-cased words split on anything that is not a-z, 0-9 or a
 *   CJK character, dropping one-character tokens and English stop words.
 *
 * @param {string} text - Text to tokenize; non-strings and empty input yield [].
 * @param {string} [language="auto"] - "auto" selects "zh" when the text
 *   contains a CJK character and "en" otherwise; any other value is used as is.
 * @returns {string[]} Tokens in the order described above.
 */
function tokenize(text, language = "auto") {
  if (!text || typeof text !== "string") return [];

  const normalized = text.toLowerCase().trim();

  // Detect language
  const hasChinese = /[\u4e00-\u9fff]/.test(normalized);
  const lang = language === "auto" ? (hasChinese ? "zh" : "en") : language;

  if (lang === "zh") {
    // Chinese: character-level + word-level bigrams
    const chars = normalized.match(/[\u4e00-\u9fff]/g) || [];
    const words = normalized.match(/[a-z0-9]+/g) || [];
    const bigrams = [];
    for (let i = 0; i < chars.length - 1; i++) {
      bigrams.push(chars[i] + chars[i + 1]);
    }
    return [...chars, ...bigrams, ...words];
  }

  // English: word tokenization with stop word removal
  return normalized
    .split(/[^a-z0-9\u4e00-\u9fff]+/)
    .filter((word) => word.length > 1 && !ENGLISH_STOP_WORDS.has(word));
}
|
|
122
|
+
|
|
123
|
+
/**
 * BM25 Search Engine
 *
 * Keeps an in-memory index of tokenized documents and ranks them against a
 * query with the Okapi BM25 formula.
 */
export class BM25Search {
  /**
   * @param {object} options
   * @param {number} [options.k1=1.5] - Term frequency saturation parameter
   * @param {number} [options.b=0.75] - Length normalization parameter
   *   (0 disables length normalization)
   * @param {string} [options.language="auto"] - Language for tokenization
   */
  constructor(options = {}) {
    // 0 is a meaningful value for both parameters, so only fall back to the
    // default when the option is missing or not a non-negative finite number.
    const numberOr = (value, fallback) =>
      typeof value === "number" && Number.isFinite(value) && value >= 0
        ? value
        : fallback;

    this.k1 = numberOr(options.k1, 1.5);
    this.b = numberOr(options.b, 0.75);
    this.language = options.language || "auto";

    // Index state
    this.documents = []; // Array of { id, tokens, termFreqs, originalDoc }
    this.df = new Map(); // document frequency per term
    this.avgDl = 0; // average document length
    this.totalDocs = 0;
  }

  /**
   * Index a batch of documents, replacing the current index.
   * @param {Array<{id: string, title?: string, content?: string}>} documents
   */
  indexDocuments(documents) {
    this.documents = [];
    this.df = new Map();

    for (const doc of documents) {
      this._indexOne(doc);
    }
    this._refreshTotals();
  }

  /**
   * Add a single document to the index
   * @param {{id: string, title?: string, content?: string}} doc
   */
  addDocument(doc) {
    this._indexOne(doc);
    this._refreshTotals();
  }

  /**
   * Remove a document from the index
   * @param {string} docId
   * @returns {boolean} false when no document with that id is indexed
   */
  removeDocument(docId) {
    const idx = this.documents.findIndex((d) => d.id === docId);
    if (idx === -1) return false;

    // Each distinct term of the document contributed exactly 1 to df.
    for (const term of new Set(this.documents[idx].tokens)) {
      const count = this.df.get(term) || 0;
      if (count <= 1) {
        this.df.delete(term);
      } else {
        this.df.set(term, count - 1);
      }
    }

    this.documents.splice(idx, 1);
    this._refreshTotals();
    return true;
  }

  /**
   * Search for documents matching a query
   * @param {string} query
   * @param {object} [options]
   * @param {number} [options.topK=10]
   * @param {number} [options.threshold=0] - Only scores strictly above this are returned
   * @returns {Array<{id: string, score: number, doc: object}>} Best matches first
   */
  search(query, options = {}) {
    const topK = options.topK || 10;
    const threshold = options.threshold || 0;

    if (this.totalDocs === 0) return [];

    const queryTokens = tokenize(query, this.language);
    if (queryTokens.length === 0) return [];

    const scores = [];
    for (let i = 0; i < this.documents.length; i++) {
      const score = this._calculateBM25(queryTokens, i);
      if (score > threshold) {
        scores.push({
          id: this.documents[i].id,
          score,
          doc: this.documents[i].originalDoc,
        });
      }
    }

    scores.sort((a, b) => b.score - a.score);
    return scores.slice(0, topK);
  }

  /**
   * Calculate BM25 score for a document
   * @param {string[]} queryTokens
   * @param {number} docIdx - Index into this.documents
   * @returns {number}
   */
  _calculateBM25(queryTokens, docIdx) {
    const doc = this.documents[docIdx];
    const dl = doc.tokens.length;
    // Term frequencies are counted once at index time; entries that lack the
    // cached map (e.g. pushed onto `documents` directly) are counted on the fly.
    const tf = doc.termFreqs || this._countTerms(doc.tokens);
    const avgDl = this.avgDl || 1;
    let score = 0;

    for (const term of queryTokens) {
      const termFreq = tf.get(term) || 0;
      if (termFreq === 0) continue;

      const docFreq = this.df.get(term) || 0;
      if (docFreq === 0) continue;

      // IDF component
      const idf = Math.log(
        (this.totalDocs - docFreq + 0.5) / (docFreq + 0.5) + 1,
      );

      // TF component with length normalization
      const tfNorm =
        (termFreq * (this.k1 + 1)) /
        (termFreq + this.k1 * (1 - this.b + this.b * (dl / avgDl)));

      score += idf * tfNorm;
    }

    return score;
  }

  /**
   * Get index statistics
   * @returns {{totalDocuments: number, uniqueTerms: number, avgDocumentLength: number}}
   */
  getStats() {
    return {
      totalDocuments: this.totalDocs,
      uniqueTerms: this.df.size,
      avgDocumentLength: Math.round(this.avgDl),
    };
  }

  /** Tokenize one document, store it and update the document frequencies. */
  _indexOne(doc) {
    const text = [doc.title || "", doc.content || ""].join(" ");
    const tokens = tokenize(text, this.language);
    const termFreqs = this._countTerms(tokens);

    this.documents.push({
      id: doc.id,
      tokens,
      termFreqs,
      originalDoc: doc,
    });

    // Count document frequency: once per distinct term in this document.
    for (const term of termFreqs.keys()) {
      this.df.set(term, (this.df.get(term) || 0) + 1);
    }
  }

  /** Build a term -> occurrence count map for a token list. */
  _countTerms(tokens) {
    const counts = new Map();
    for (const token of tokens) {
      counts.set(token, (counts.get(token) || 0) + 1);
    }
    return counts;
  }

  /** Recompute totalDocs and avgDl from the current document list. */
  _refreshTotals() {
    this.totalDocs = this.documents.length;
    this.avgDl =
      this.totalDocs > 0
        ? this.documents.reduce((sum, d) => sum + d.tokens.length, 0) /
          this.totalDocs
        : 0;
  }
}
|