@scotthuang/engram 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. package/README.md +73 -0
  2. package/dist/__tests__/bm25.test.d.ts +1 -0
  3. package/dist/__tests__/bm25.test.js +86 -0
  4. package/dist/__tests__/bm25.test.js.map +1 -0
  5. package/dist/__tests__/config.test.d.ts +1 -0
  6. package/dist/__tests__/config.test.js +31 -0
  7. package/dist/__tests__/config.test.js.map +1 -0
  8. package/dist/__tests__/profile.test.d.ts +1 -0
  9. package/dist/__tests__/profile.test.js +130 -0
  10. package/dist/__tests__/profile.test.js.map +1 -0
  11. package/dist/__tests__/recall.test.d.ts +1 -0
  12. package/dist/__tests__/recall.test.js +162 -0
  13. package/dist/__tests__/recall.test.js.map +1 -0
  14. package/dist/bm25.d.ts +43 -0
  15. package/dist/bm25.js +172 -0
  16. package/dist/bm25.js.map +1 -0
  17. package/dist/config.d.ts +15 -0
  18. package/dist/config.js +28 -0
  19. package/dist/config.js.map +1 -0
  20. package/dist/index.d.ts +7 -0
  21. package/dist/index.js +200 -0
  22. package/dist/index.js.map +1 -0
  23. package/dist/profile.d.ts +37 -0
  24. package/dist/profile.js +95 -0
  25. package/dist/profile.js.map +1 -0
  26. package/dist/recall.d.ts +37 -0
  27. package/dist/recall.js +173 -0
  28. package/dist/recall.js.map +1 -0
  29. package/dist/settle.d.ts +43 -0
  30. package/dist/settle.js +227 -0
  31. package/dist/settle.js.map +1 -0
  32. package/eslint.config.js +17 -0
  33. package/openclaw.plugin.json +63 -0
  34. package/package.json +34 -0
  35. package/src/__tests__/bm25.test.ts +102 -0
  36. package/src/__tests__/config.test.ts +34 -0
  37. package/src/__tests__/profile.test.ts +147 -0
  38. package/src/__tests__/recall.test.ts +186 -0
  39. package/src/bm25.ts +202 -0
  40. package/src/config.ts +39 -0
  41. package/src/index.ts +246 -0
  42. package/src/profile.ts +114 -0
  43. package/src/recall.ts +213 -0
  44. package/src/settle.ts +277 -0
  45. package/tsconfig.json +16 -0
@@ -0,0 +1,102 @@
1
+ import { describe, it, expect } from "vitest";
2
+ import { BM25Index } from "../bm25.js";
3
+
4
// Unit tests for BM25Index. Every test builds its own index so that no test
// depends on entries added by an earlier one.
describe("BM25Index", () => {
  describe("tokenize (via search)", () => {
    it("finds Chinese bigram matches", () => {
      const index = new BM25Index();
      index.addEntry({ text: "在体育西吃了潮汕牛肉火锅", date: "2026-03-15", category: "饮食", filePath: "/test" });
      index.addEntry({ text: "讨论了记忆系统方案", date: "2026-03-15", category: "技术", filePath: "/test" });

      const results = index.search("牛肉火锅", 2);
      expect(results).toHaveLength(1);
      expect(results[0].entry.category).toBe("饮食");
    });

    it("finds English word matches", () => {
      const index = new BM25Index();
      index.addEntry({ text: "installed node modules successfully", date: "2026-03-15", category: "技术", filePath: "/test" });
      index.addEntry({ text: "天气很好适合出门", date: "2026-03-15", category: "随聊", filePath: "/test" });

      const results = index.search("node modules", 2);
      expect(results).toHaveLength(1);
      expect(results[0].entry.category).toBe("技术");
    });

    it("returns empty for no matches", () => {
      const index = new BM25Index();
      index.addEntry({ text: "今天天气很好", date: "2026-03-15", category: "随聊", filePath: "/test" });

      const results = index.search("量子力学", 3);
      expect(results).toHaveLength(0);
    });
  });

  describe("search", () => {
    it("returns results sorted by score", () => {
      const index = new BM25Index();
      index.addEntry({ text: "牛肉火锅牛肉火锅牛肉火锅", date: "2026-03-15", category: "饮食", filePath: "/test" });
      index.addEntry({ text: "吃过一次牛肉火锅", date: "2026-03-15", category: "饮食", filePath: "/test" });
      index.addEntry({ text: "今天去跑步了", date: "2026-03-15", category: "健康", filePath: "/test" });

      const results = index.search("牛肉火锅", 3);
      expect(results.length).toBeGreaterThanOrEqual(2);
      // Results must come back in non-increasing score order.
      expect(results[0].score).toBeGreaterThanOrEqual(results[1].score);
    });

    it("respects topK limit", () => {
      const index = new BM25Index();
      for (let i = 0; i < 10; i++) {
        index.addEntry({ text: `测试条目${i} 包含关键词`, date: "2026-03-15", category: "测试", filePath: "/test" });
      }

      const results = index.search("关键词", 3);
      expect(results.length).toBeLessThanOrEqual(3);
    });
  });

  describe("addEntry", () => {
    it("updates index size", () => {
      const index = new BM25Index();
      expect(index.size).toBe(0);

      index.addEntry({ text: "test entry", date: "2026-03-15", category: "测试", filePath: "/test" });
      expect(index.size).toBe(1);

      index.addEntry({ text: "another entry", date: "2026-03-15", category: "测试", filePath: "/test" });
      expect(index.size).toBe(2);
    });
  });

  describe("parseEntries", () => {
    it("parses structured format correctly", () => {
      // NOTE: this test does not run the markdown parser. It feeds the index
      // the entries a correctly parsed file is expected to yield and checks
      // that each one can be found by category.
      // TODO: cover the parser itself through buildFromDirectory on a temp dir.
      const index = new BM25Index();
      index.addEntry({ text: "在体育西吃了潮汕牛肉火锅,胸口捞很好吃", date: "2026-03-15", category: "饮食", filePath: "/test" });
      index.addEntry({ text: "讨论了记忆系统方案,确定三层架构", date: "2026-03-15", category: "技术", filePath: "/test" });

      // Food query
      const foodResults = index.search("牛肉火锅", 1);
      expect(foodResults).toHaveLength(1);
      expect(foodResults[0].entry.category).toBe("饮食");

      // Tech query
      const techResults = index.search("记忆系统", 1);
      expect(techResults).toHaveLength(1);
      expect(techResults[0].entry.category).toBe("技术");
    });
  });
});
@@ -0,0 +1,34 @@
1
+ import { describe, it, expect } from "vitest";
2
+ import { parseConfig, DEFAULTS } from "../config.js";
3
+
4
// Unit tests for parseConfig: default handling, overrides, and rejection of
// wrongly typed values.
describe("parseConfig", () => {
  it("returns defaults when no config provided", () => {
    const parsed = parseConfig();
    const keys = [
      "shortTermDays",
      "halfLifeDays",
      "recallTopK",
      "minScore",
      "vectorWeight",
      "textWeight",
    ] as const;
    for (const key of keys) {
      expect(parsed[key]).toBe(DEFAULTS[key]);
    }
  });

  it("returns defaults when empty object provided", () => {
    const { shortTermDays, halfLifeDays } = parseConfig({});
    expect(shortTermDays).toBe(7);
    expect(halfLifeDays).toBe(30);
  });

  it("overrides defaults with provided values", () => {
    const { shortTermDays, minScore, halfLifeDays } = parseConfig({ shortTermDays: 14, minScore: 0.5 });
    expect(shortTermDays).toBe(14);
    expect(minScore).toBe(0.5);
    // Keys that were not supplied keep their default.
    expect(halfLifeDays).toBe(DEFAULTS.halfLifeDays);
  });

  it("ignores invalid types and uses defaults", () => {
    const { shortTermDays, halfLifeDays } = parseConfig({ shortTermDays: "abc" as any, halfLifeDays: null as any });
    expect(shortTermDays).toBe(DEFAULTS.shortTermDays);
    expect(halfLifeDays).toBe(DEFAULTS.halfLifeDays);
  });
});
@@ -0,0 +1,147 @@
1
+ import { describe, it, expect, beforeEach } from "vitest";
2
+ import { ProfileManager, EMPTY_PROFILE } from "../profile.js";
3
+ import { promises as fs } from "node:fs";
4
+ import { join } from "node:path";
5
+ import { tmpdir } from "node:os";
6
+
7
// Unit tests for ProfileManager. Each test gets its own temp directory and
// its own deep copy of the empty profile, so nothing leaks between tests.
describe("ProfileManager", () => {
  let manager: ProfileManager;
  let tempDir: string;

  // `{ ...EMPTY_PROFILE }` is only a shallow copy: the nested `tags` object and
  // `coreTags` array would still be shared with the exported constant, and
  // addTag mutates the profile it is given. A JSON round-trip gives every test
  // an independent copy (profiles are persisted as JSON, so this is lossless).
  const freshProfile = (): typeof EMPTY_PROFILE =>
    JSON.parse(JSON.stringify(EMPTY_PROFILE)) as typeof EMPTY_PROFILE;

  beforeEach(async () => {
    // mkdtemp appends random characters, so two tests that start within the
    // same millisecond can never end up sharing a directory.
    tempDir = await fs.mkdtemp(join(tmpdir(), "engram-test-"));
    manager = new ProfileManager(tempDir);
  });

  describe("load", () => {
    it("returns empty profile when no file exists", async () => {
      const profile = await manager.load();
      expect(profile.summary).toBe("");
      expect(profile.coreTags).toEqual([]);
      expect(profile.tags).toEqual({});
    });

    it("loads existing profile from file", async () => {
      const profileDir = join(tempDir, "memory", "profile");
      await fs.mkdir(profileDir, { recursive: true });
      const saved = { ...freshProfile(), summary: "test summary", coreTags: ["tag1"] };
      await fs.writeFile(join(profileDir, "semantic_profile.json"), JSON.stringify(saved));

      // A second manager instance proves the data comes from disk.
      const manager2 = new ProfileManager(tempDir);
      const loaded = await manager2.load();
      expect(loaded.summary).toBe("test summary");
      expect(loaded.coreTags).toEqual(["tag1"]);
    });
  });

  describe("addTag", () => {
    it("adds a new tag to a new dimension", () => {
      const profile = freshProfile();
      const result = manager.addTag(profile, "口味偏好", "喜欢辣");
      expect(result.tags["口味偏好"]).toHaveLength(1);
      expect(result.tags["口味偏好"][0].value).toBe("喜欢辣");
      expect(result.tags["口味偏好"][0].confidence).toBe(0.7);
    });

    it("increases confidence for existing tag", () => {
      const profile: any = {
        ...freshProfile(),
        tags: { 口味偏好: [{ value: "喜欢辣", confidence: 0.5, lastSeen: "2026-01-01" }] },
      };
      const result = manager.addTag(profile, "口味偏好", "喜欢辣");
      expect(result.tags["口味偏好"]).toHaveLength(1);
      expect(result.tags["口味偏好"][0].confidence).toBeCloseTo(0.6);
    });

    it("adds multiple tags to same dimension", () => {
      const profile = freshProfile();
      manager.addTag(profile, "口味偏好", "喜欢辣");
      manager.addTag(profile, "口味偏好", "不吃香菜");
      expect(profile.tags["口味偏好"]).toHaveLength(2);
    });
  });

  describe("decayTags", () => {
    it("reduces confidence of all tags", () => {
      const profile: any = {
        ...freshProfile(),
        tags: {
          口味偏好: [
            { value: "喜欢辣", confidence: 0.9, lastSeen: "2026-03-17" },
          ],
        },
      };
      const result = manager.decayTags(profile, 0.5);
      expect(result.tags["口味偏好"][0].confidence).toBeCloseTo(0.45);
    });

    it("removes tags below threshold", () => {
      const profile: any = {
        ...freshProfile(),
        tags: {
          过时: [{ value: "旧标签", confidence: 0.15, lastSeen: "2026-01-01" }],
        },
      };
      const result = manager.decayTags(profile, 1.0);
      expect(result.tags["过时"]).toBeUndefined();
    });

    it("removes empty dimensions after filtering", () => {
      const profile: any = {
        ...freshProfile(),
        tags: {
          空维度: [{ value: "很低", confidence: 0.1, lastSeen: "2026-01-01" }],
        },
      };
      const result = manager.decayTags(profile, 1.0);
      expect("空维度" in result.tags).toBe(false);
    });
  });

  describe("getRecallContext", () => {
    it("returns empty string for empty profile", () => {
      const ctx = manager.getRecallContext(EMPTY_PROFILE);
      expect(ctx).toBe("");
    });

    it("returns summary and core tags", () => {
      const profile = { ...freshProfile(), summary: "辣味中餐爱好者", coreTags: ["辣味中餐", "天河"] };
      const ctx = manager.getRecallContext(profile);
      expect(ctx).toContain("辣味中餐爱好者");
      expect(ctx).toContain("辣味中餐");
      expect(ctx).toContain("天河");
    });

    it("works with only core tags", () => {
      const profile = { ...freshProfile(), summary: "", coreTags: ["标签1"] };
      const ctx = manager.getRecallContext(profile);
      expect(ctx).toContain("标签1");
    });
  });

  describe("save", () => {
    it("saves profile to file", async () => {
      const profile = { ...freshProfile(), summary: "test" };
      await manager.save(profile);

      const raw = await fs.readFile(join(tempDir, "memory", "profile", "semantic_profile.json"), "utf-8");
      const loaded = JSON.parse(raw);
      expect(loaded.summary).toBe("test");
    });

    it("updates updatedAt on save", async () => {
      const profile = freshProfile();
      const before = new Date();
      await manager.save(profile);
      const after = new Date();

      const raw = await fs.readFile(join(tempDir, "memory", "profile", "semantic_profile.json"), "utf-8");
      const loaded = JSON.parse(raw);
      const updated = new Date(loaded.updatedAt);
      expect(updated.getTime()).toBeGreaterThanOrEqual(before.getTime());
      expect(updated.getTime()).toBeLessThanOrEqual(after.getTime());
    });
  });
});
@@ -0,0 +1,186 @@
1
+ import { describe, it, expect } from "vitest";
2
+
3
// Pure-function tests: the helpers are defined locally, so this file does not
// depend on any external module.
describe("Recall helpers", () => {
  /**
   * Character-level Jaccard similarity: |A ∩ B| / |A ∪ B| over the sets of
   * UTF-16 code units of each string. Returns 0 when both strings are empty.
   * Shared by the jaccard, MMR and dedup suites below.
   */
  function jaccard(a: string, b: string): number {
    const setA = new Set(a.split(""));
    const setB = new Set(b.split(""));
    const intersection = new Set([...setA].filter(x => setB.has(x)));
    const union = new Set([...setA, ...setB]);
    return union.size === 0 ? 0 : intersection.size / union.size;
  }

  describe("temporalDecay", () => {
    /** Exponential decay: the score halves every `halfLifeDays` days. */
    function temporalDecay(score: number, ageInDays: number, halfLifeDays: number): number {
      if (ageInDays <= 0) return score;
      const lambda = Math.log(2) / halfLifeDays;
      return score * Math.exp(-lambda * ageInDays);
    }

    it("returns original score for 0 days", () => {
      expect(temporalDecay(1.0, 0, 30)).toBeCloseTo(1.0);
    });

    it("returns ~50% at half-life", () => {
      expect(temporalDecay(1.0, 30, 30)).toBeCloseTo(0.5, 1);
    });

    it("returns ~84% at 7 days with 30-day half-life", () => {
      expect(temporalDecay(1.0, 7, 30)).toBeCloseTo(0.846, 2);
    });

    it("returns ~12.5% at 90 days with 30-day half-life", () => {
      expect(temporalDecay(1.0, 90, 30)).toBeCloseTo(0.125, 2);
    });

    it("shorter half-life decays faster", () => {
      const short = temporalDecay(1.0, 7, 7);
      const long = temporalDecay(1.0, 7, 30);
      expect(short).toBeLessThan(long);
    });
  });

  describe("jaccard", () => {
    it("returns 1.0 for identical strings", () => {
      expect(jaccard("abc", "abc")).toBeCloseTo(1.0);
    });

    it("returns 0.0 for completely different strings", () => {
      expect(jaccard("abc", "xyz")).toBeCloseTo(0.0);
    });

    it("returns partial similarity for overlapping strings", () => {
      const sim = jaccard("abc", "abd");
      expect(sim).toBeGreaterThan(0);
      expect(sim).toBeLessThan(1);
    });

    it("handles empty strings", () => {
      expect(jaccard("", "")).toBe(0);
      expect(jaccard("abc", "")).toBe(0);
    });
  });

  describe("MMR rerank", () => {
    /**
     * Maximal Marginal Relevance reranking. The highest-scored candidate is
     * picked first; each following pick maximises
     * `lambda * relevance - (1 - lambda) * maxSimilarityToAlreadySelected`.
     * Inputs with at most one item are returned unchanged.
     */
    function mmrRerank(
      candidates: Array<{ text: string; finalScore: number }>,
      lambda: number = 0.7
    ): Array<{ text: string; finalScore: number }> {
      if (candidates.length <= 1) return candidates;

      const selected: typeof candidates = [];
      const remaining = [...candidates];

      remaining.sort((a, b) => b.finalScore - a.finalScore);
      selected.push(remaining.shift()!);

      while (remaining.length > 0) {
        let bestIdx = -1;
        let bestMmr = -Infinity;
        for (let i = 0; i < remaining.length; i++) {
          const relevance = remaining[i].finalScore;
          const maxSim = Math.max(...selected.map(s => jaccard(remaining[i].text, s.text)));
          const mmrScore = lambda * relevance - (1 - lambda) * maxSim;
          if (mmrScore > bestMmr) {
            bestMmr = mmrScore;
            bestIdx = i;
          }
        }
        if (bestIdx >= 0) {
          selected.push(remaining.splice(bestIdx, 1)[0]);
        } else break;
      }
      return selected;
    }

    it("returns single item as-is", () => {
      const candidates = [{ text: "hello", finalScore: 0.9 }];
      expect(mmrRerank(candidates)).toHaveLength(1);
    });

    it("returns empty for empty input", () => {
      expect(mmrRerank([])).toHaveLength(0);
    });

    it("promotes diverse results over duplicates", () => {
      const candidates = [
        { text: "在体育西吃了潮汕牛肉火锅", finalScore: 0.92 },
        { text: "在天河吃了潮汕牛肉火锅", finalScore: 0.89 },
        { text: "在越秀吃了日本料理", finalScore: 0.70 },
      ];

      const reranked = mmrRerank(candidates, 0.7);
      expect(reranked).toHaveLength(3);
      // The top-scored item always stays first.
      expect(reranked[0].finalScore).toBe(0.92);
      // The near-duplicate (similarity 9/14 to the top hit) scores
      // 0.7 * 0.89 - 0.3 * 0.643 ≈ 0.430, while the distinct entry
      // (similarity 3/18) scores 0.7 * 0.70 - 0.3 * 0.167 = 0.440,
      // so the distinct entry is promoted to second place.
      expect(reranked.map(r => r.finalScore)).toEqual([0.92, 0.70, 0.89]);
    });

    it("lambda=1.0 is pure relevance (no diversity)", () => {
      const candidates = [
        { text: "aaa", finalScore: 0.9 },
        { text: "aaa", finalScore: 0.8 },
        { text: "bbb", finalScore: 0.5 },
      ];

      const reranked = mmrRerank(candidates, 1.0);
      expect(reranked.map(r => r.finalScore)).toEqual([0.9, 0.8, 0.5]);
    });
  });

  describe("dedup", () => {
    /**
     * Keeps the best-scored item of every group of near-duplicates
     * (character Jaccard similarity above 0.7); output is sorted by score.
     */
    function dedup(results: Array<{ text: string; score: number }>): Array<{ text: string; score: number }> {
      const sorted = [...results].sort((a, b) => b.score - a.score);
      const unique: typeof results = [];
      for (const item of sorted) {
        const isDup = unique.some(u => jaccard(item.text, u.text) > 0.7);
        if (!isDup) unique.push(item);
      }
      return unique;
    }

    it("removes near-duplicates", () => {
      const results = [
        { text: "在体育西吃了潮汕牛肉火锅", score: 0.92 },
        { text: "在体育西吃潮汕牛肉火锅", score: 0.85 },
        { text: "在天河城看到特斯拉展厅", score: 0.70 },
      ];
      const deduped = dedup(results);
      expect(deduped.length).toBeLessThan(3);
    });

    it("keeps distinct results", () => {
      const results = [
        { text: "今天天气很好", score: 0.9 },
        { text: "股票涨了", score: 0.8 },
        { text: "新开了家餐厅", score: 0.7 },
      ];
      expect(dedup(results)).toHaveLength(3);
    });

    it("keeps higher-scored duplicate", () => {
      const results = [
        { text: "abcde fghij", score: 0.9 },
        { text: "abcde fghij", score: 0.7 },
      ];
      const deduped = dedup(results);
      expect(deduped).toHaveLength(1);
      expect(deduped[0].score).toBe(0.9);
    });
  });
});
package/src/bm25.ts ADDED
@@ -0,0 +1,202 @@
1
+ /**
2
+ * Memory System Plugin - BM25 Short-term Memory Index
3
+ *
4
+ * 短期记忆的 BM25 索引构建与搜索
5
+ * 纯 TypeScript 实现,不依赖外部 BM25 库
6
+ */
7
+
8
+ import { promises as fs } from "node:fs";
9
+ import { join } from "node:path";
10
+
11
+ export type ShortTermEntry = {
12
+ text: string;
13
+ date: string; // ISO date
14
+ category: string; // category label, e.g. diet, work, family, tech, decisions, health
15
+ filePath: string; // path of the source file
16
+ };
17
+
18
+ type BM25Doc = {
19
+ tokens: string[];
20
+ entry: ShortTermEntry;
21
+ };
22
+
23
/**
 * Minimal tokenizer for mixed Chinese/Latin text.
 *
 * The input is lower-cased and every character that is neither a `\w`
 * character nor in the CJK range U+4E00–U+9FFF is replaced by a space.
 * Each run of CJK characters then yields its overlapping bigrams followed by
 * its single characters; afterwards the remaining whitespace-separated words
 * are appended. A dedicated segmenter (e.g. jieba) could replace this.
 *
 * @param text Raw text to tokenize.
 * @returns Tokens in the order: per CJK run (bigrams, then characters), then words.
 */
function tokenize(text: string): string[] {
  const normalized = text.toLowerCase().replace(/[^\w\u4e00-\u9fff]/g, " ");

  const cjkRuns = normalized.match(/[\u4e00-\u9fff]+/g) ?? [];
  const cjkTokens = cjkRuns.flatMap((run) => {
    const chars = Array.from(run);
    // One bigram per adjacent pair; a single-character run has none.
    const bigrams = chars.slice(0, -1).map((_, i) => run.substring(i, i + 2));
    return [...bigrams, ...chars];
  });

  const words = normalized
    .replace(/[\u4e00-\u9fff]/g, " ")
    .split(/\s+/)
    .filter((word) => word.length > 0);

  return [...cjkTokens, ...words];
}
47
+
48
/**
 * In-memory BM25 index over short-term memory entries.
 *
 * Entries come either from markdown files (`buildFromDirectory`) or from
 * direct insertion (`addEntry`). `search` ranks them with BM25 using
 * k1 = 1.5 and b = 0.75.
 */
export class BM25Index {
  private docs: BM25Doc[] = [];
  private idf: Map<string, number> = new Map();
  private avgDl = 0;

  /**
   * Rebuilds the index from the `.md` files found directly in `dir`.
   *
   * Files whose modification time is older than `maxAgeDays` days are skipped.
   * Any I/O error (for example a missing directory) is swallowed on purpose;
   * entries read before the error stay in the index.
   *
   * @param dir Directory containing the short-term memory files.
   * @param maxAgeDays Maximum file age, in days, based on mtime.
   */
  async buildFromDirectory(dir: string, maxAgeDays: number = 7): Promise<void> {
    this.docs = [];
    const cutoff = new Date();
    cutoff.setDate(cutoff.getDate() - maxAgeDays);

    try {
      const files = await fs.readdir(dir);
      // Reverse lexicographic order: newest first when names start with a date.
      for (const file of files.sort().reverse()) {
        if (!file.endsWith(".md")) continue;
        const filePath = join(dir, file);
        const stat = await fs.stat(filePath);
        if (stat.mtime < cutoff) continue;

        const content = await fs.readFile(filePath, "utf-8");
        this.docs.push(...this.parseEntries(content, filePath, file));
      }
    } catch {
      // Best effort: a missing or unreadable directory leaves the index with
      // whatever was read so far.
    }

    this.buildIDF();
  }

  /**
   * Splits one memory file into entries.
   *
   * An entry starts at a header of the form `### HH:MM [category]` and its
   * text is every following line up to the next header. Other lines starting
   * with `#`, `---` or three backticks are ignored. Text that appears before
   * the first header is stored under the default category "随聊".
   *
   * @param content Full file content.
   * @param filePath Path recorded on every entry.
   * @param fileName File name; a `YYYY-MM-DD` substring becomes the entry
   *   date, otherwise today's date is used.
   * @returns Parsed entries with their tokens.
   */
  private parseEntries(content: string, filePath: string, fileName: string): BM25Doc[] {
    const entries: BM25Doc[] = [];
    const dateMatch = fileName.match(/(\d{4}-\d{2}-\d{2})/);
    const date = dateMatch ? dateMatch[1] : new Date().toISOString().split("T")[0];

    let currentCategory = "随聊";
    let currentText = "";

    // Stores the text collected so far (if any) as one entry and resets it.
    const flush = (): void => {
      const text = currentText.trim();
      if (text) {
        const entry: ShortTermEntry = { text, date, category: currentCategory, filePath };
        entries.push({ tokens: tokenize(entry.text), entry });
      }
      currentText = "";
    };

    for (const line of content.split(/\r?\n/)) {
      // The category is matched as "anything but ]": `\w` is ASCII-only in
      // JavaScript and would never match Chinese labels such as [饮食].
      const headerMatch = line.match(/^###\s+\d{2}:\d{2}\s+\[([^\]]+)\]/);
      if (headerMatch) {
        flush();
        currentCategory = headerMatch[1];
      } else if (line.startsWith("#") || line.startsWith("---") || line.startsWith("```")) {
        continue;
      } else {
        currentText += line + " ";
      }
    }

    // Last entry of the file.
    flush();

    return entries;
  }

  /**
   * Recomputes the average document length and the IDF of every term:
   * `IDF = ln((N - df + 0.5) / (df + 0.5) + 1)`.
   */
  private buildIDF(): void {
    // Start from an empty table so that terms belonging to documents dropped
    // by a rebuild do not linger.
    this.idf.clear();

    const df: Map<string, number> = new Map(); // document frequency per term
    let totalLen = 0;

    for (const doc of this.docs) {
      for (const term of new Set(doc.tokens)) {
        df.set(term, (df.get(term) ?? 0) + 1);
      }
      totalLen += doc.tokens.length;
    }

    const N = this.docs.length;
    this.avgDl = N > 0 ? totalLen / N : 1;

    for (const [term, freq] of df) {
      this.idf.set(term, Math.log((N - freq + 0.5) / (freq + 0.5) + 1));
    }
  }

  /**
   * BM25 search.
   *
   * @param query Free text; tokenized the same way as the indexed entries.
   * @param topK Maximum number of results; values `<= 0` yield an empty list.
   * @returns Entries with a positive score, best first.
   */
  search(query: string, topK: number = 3): Array<{ entry: ShortTermEntry; score: number }> {
    if (topK <= 0) return [];

    const k1 = 1.5; // term-frequency saturation
    const b = 0.75; // document-length normalisation

    // Only query tokens known to the index can contribute to a score.
    const queryTerms = tokenize(query)
      .map((token) => ({ token, idf: this.idf.get(token) ?? 0 }))
      .filter((term) => term.idf !== 0);
    if (queryTerms.length === 0) return [];

    const scores: Array<{ entry: ShortTermEntry; score: number }> = [];

    for (const doc of this.docs) {
      // Count term frequencies once per document instead of once per query token.
      const counts = new Map<string, number>();
      for (const t of doc.tokens) {
        counts.set(t, (counts.get(t) ?? 0) + 1);
      }

      const dl = doc.tokens.length;
      let score = 0;
      for (const { token, idf } of queryTerms) {
        const tf = counts.get(token) ?? 0;
        const tfComponent = (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * (dl / this.avgDl)));
        score += idf * tfComponent;
      }

      if (score > 0) {
        scores.push({ entry: doc.entry, score });
      }
    }

    scores.sort((x, y) => y.score - x.score);
    return scores.slice(0, topK);
  }

  /**
   * Adds a single entry so it is searchable right after being written.
   * The whole IDF table is rebuilt on every call.
   */
  addEntry(entry: ShortTermEntry): void {
    this.docs.push({
      tokens: tokenize(entry.text),
      entry,
    });
    this.buildIDF();
  }

  /** Number of indexed entries. */
  get size(): number {
    return this.docs.length;
  }
}