@morningljn/mnemo 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +156 -0
- package/README_zh.md +156 -0
- package/banner.png +0 -0
- package/dist/init.d.ts +10 -0
- package/dist/init.js +138 -0
- package/dist/init.js.map +1 -0
- package/dist/retriever.d.ts +70 -0
- package/dist/retriever.js +689 -0
- package/dist/retriever.js.map +1 -0
- package/dist/schema.d.ts +1 -0
- package/dist/schema.js +62 -0
- package/dist/schema.js.map +1 -0
- package/dist/security.d.ts +15 -0
- package/dist/security.js +116 -0
- package/dist/security.js.map +1 -0
- package/dist/server.d.ts +2 -0
- package/dist/server.js +150 -0
- package/dist/server.js.map +1 -0
- package/dist/store.d.ts +122 -0
- package/dist/store.js +696 -0
- package/dist/store.js.map +1 -0
- package/dist/types.d.ts +72 -0
- package/dist/types.js +2 -0
- package/dist/types.js.map +1 -0
- package/package.json +46 -0
- package/src/init.ts +157 -0
- package/src/retriever.ts +806 -0
- package/src/schema.ts +61 -0
- package/src/security.ts +132 -0
- package/src/server.ts +172 -0
- package/src/store.ts +805 -0
- package/src/types.ts +81 -0
- package/tests/retriever.test.ts +55 -0
- package/tests/security.test.ts +30 -0
- package/tests/store.test.ts +104 -0
- package/tsconfig.json +16 -0
- package/vitest.config.ts +10 -0
package/src/retriever.ts
ADDED
|
@@ -0,0 +1,806 @@
|
|
|
1
|
+
/**
 * Hybrid retrieval pipeline.
 * Ported from the Ocean CLI FactRetriever; uses better-sqlite3.
 *
 * Pipeline: FTS5 candidate set → Jaccard re-ranking → trust-score weighting → temporal decay
 * Advanced retrieval: probe/related/reason, built on the fact_entities association table
 * Contradiction detection: entity overlap + content difference
 */
|
|
9
|
+
|
|
10
|
+
import type Database from 'better-sqlite3'
|
|
11
|
+
import type { Fact, FactCategory, ScoredFact, Contradiction, SearchOptions, ContradictOptions, RetrieverOptions } from './types.js'
|
|
12
|
+
import { MemoryStore } from './store.js'
|
|
13
|
+
|
|
14
|
+
/**
 * Chinese function characters that are too common to carry meaning.
 * They are excluded from single-character overlap matching and from the
 * bigrams learned as implicit category tags.
 */
const CN_OVERLAP_STOP: ReadonlySet<string> = new Set([
  '的', '了', '是', '在', '有', '和', '就', '不', '人', '都',
  '一', '个', '上', '也', '很', '到', '说', '要', '去', '你',
  '会', '着', '没', '看', '好', '自', '这', '他', '她', '它',
  '那', '些', '用', '对', '下', '为', '从', '被', '把', '能',
  '可', '以', '所', '而', '又', '与', '但', '或', '等', '中',
  '大', '小', '多', '少', '其', '之', '做', '让', '给', '已',
  '还', '来', '地', '得', '过', '时', '里', '后', '前', '当',
])
|
|
24
|
+
|
|
25
|
+
/**
 * A fact selected by one of the candidate stages, carrying the lexical
 * relevance that stage assigned to it.
 */
interface FtsCandidate extends Fact {
  /** Stage relevance; the candidate stages in this file produce values in [0, 1], higher is better. */
  ftsRank: number
}
|
|
28
|
+
|
|
29
|
+
/**
 * Hybrid fact retriever.
 *
 * `search` builds a candidate set (FTS5, then LIKE, then single-character
 * overlap, then category inference / trust fallback) and re-ranks it with a
 * token-similarity score weighted by trust and, optionally, temporal decay.
 * `probe`, `related` and `reason` navigate the entity association tables, and
 * `contradict` flags fact pairs that share entities but differ in content.
 *
 * The learned category-tag map and the Chinese/English term pairs are built on
 * first use and cached for the lifetime of the instance; nothing in this class
 * invalidates them.
 */
export class FactRetriever {
  /** Score fraction lost per rank position when results have no lexical score. */
  private static readonly RANK_STEP = 0.05
  /** Cap on facts compared pairwise by `contradict`, bounding its O(n²) cost. */
  private static readonly MAX_CONTRADICTION_FACTS = 500
  /** Minimum entity Jaccard overlap for a pair to be considered at all. */
  private static readonly MIN_ENTITY_OVERLAP = 0.3
  /** Chinese (full-width) and ASCII punctuation treated as token separators. */
  private static readonly PUNCTUATION =
    /[\uff0c\u3002\uff01\uff1f\uff1b\uff1a\u3001\u201c\u201d\u2018\u2019\u3010\u3011\u300a\u300b\uff08\uff09\u2026\u2014\u00b7,.;:!?'"()\[\]{}<>@#$%^&*+=~`]/g
  /** Patterns that mark a short query as being about the user's or assistant's identity. */
  private static readonly PERSONAL_PATTERNS: readonly RegExp[] = [
    /你(是谁|叫什么|的名字|的身份)/,
    /我(是谁|叫什么|的名字|的身份|喜欢|偏好|习惯)/,
    /(认识|记得|记住|知道).{0,4}(我|你)/,
    /(名字|身份|称呼|角色|profile)/,
    /(who are you|who am i|my name|about me|my role|call me|remember me)/i,
  ]

  private readonly db: Database.Database
  private readonly ftsWeight: number
  private readonly jaccardWeight: number
  private readonly halfLifeDays: number
  /** category → frequent tags, learned from stored facts on first use. */
  private _categoryTagMap: Map<FactCategory, Set<string>> | null = null
  /** Chinese/English term pairs, learned from stored facts on first use. */
  private _cnEnPairs: Array<[string, string]> | null = null

  /**
   * @param store   Fact store; its `connection` is used for direct SQL.
   * @param options Ranking weights (default 0.5 each) and the decay half-life
   *                in days (default 0, which disables temporal decay).
   */
  constructor(
    private store: MemoryStore,
    options?: RetrieverOptions,
  ) {
    this.db = store.connection
    this.ftsWeight = options?.ftsWeight ?? 0.5
    this.jaccardWeight = options?.jaccardWeight ?? 0.5
    this.halfLifeDays = options?.temporalDecayHalfLife ?? 0
  }

  /**
   * Main search: FTS5 → LIKE → character overlap → category inference →
   * similarity re-ranking → trust weighting → temporal decay.
   *
   * @param query   Free-text query (Chinese and/or English).
   * @param options `minTrust` (default 0.3), `limit` (default 10), `category`.
   * @returns Up to `limit` facts. One fact per category is taken first in score
   *          order, then remaining slots are back-filled by score.
   */
  search(query: string, options?: SearchOptions): ScoredFact[] {
    const minTrust = options?.minTrust ?? 0.3
    const limit = options?.limit ?? 10
    const category = options?.category

    // Candidate stages see the bilingually expanded query; scoring below
    // deliberately uses the raw query, as the original implementation did.
    const expandedQuery = this.expandQueryBilingually(query)
    const poolSize = limit * 3

    let candidates = this.ftsCandidates(expandedQuery, category, minTrust, poolSize)
    if (candidates.length === 0) {
      candidates = this.likeFallback(expandedQuery, category, minTrust, poolSize)
    }
    if (candidates.length === 0) {
      candidates = this.charOverlapFallback(expandedQuery, category, minTrust, poolSize)
    }
    if (candidates.length === 0) {
      // Category inference only applies when the caller did not pin a category.
      if (!category) {
        const inferred = this.categoryInferFallback(query, minTrust, limit)
        if (inferred.length > 0) return inferred
      }
      // Short identity-style queries fall back to the most trusted facts.
      return this.isPersonalQuery(query) ? this.trustFallback(category, minTrust, limit) : []
    }

    const queryTokens = this.tokenize(query)
    const scored: ScoredFact[] = candidates.map(fact => {
      const factTokens = new Set([...this.tokenize(fact.content), ...this.tokenize(fact.tags)])
      const jaccard = this.jaccardSimilarity(queryTokens, factTokens)
      // Containment: share of query tokens covered by the fact.
      const coverage = this.containmentScore(queryTokens, factTokens)
      const similarity = 0.3 * jaccard + 0.7 * coverage
      const relevance = this.ftsWeight * fact.ftsRank + this.jaccardWeight * similarity

      let score = relevance * fact.trustScore
      if (this.halfLifeDays > 0) {
        score *= this.temporalDecay(fact.updatedAt || fact.createdAt)
      }
      return { ...fact, score }
    })
    scored.sort((a, b) => b.score - a.score)

    // Category diversity: take the best fact of each category first so one
    // large category cannot crowd out the others.
    const results: ScoredFact[] = []
    const seenCategories = new Set<FactCategory>()
    for (const candidate of scored) {
      // Checked before pushing so a non-positive limit yields no results.
      if (results.length >= limit) break
      if (seenCategories.has(candidate.category)) continue
      seenCategories.add(candidate.category)
      results.push(candidate)
    }

    // Back-fill: if slots remain, add the skipped facts in score order.
    if (results.length < limit) {
      const chosenIds = new Set(results.map(f => f.factId))
      for (const candidate of scored) {
        if (chosenIds.has(candidate.factId)) continue
        results.push(candidate)
        if (results.length >= limit) break
      }
    }

    if (results.length > 0) {
      this.trackRetrieval(results)
    }
    return results
  }

  /**
   * Entity probe: facts associated with one entity, as returned by the store,
   * scored by trust with a small per-rank gradient.
   */
  probe(entity: string, options?: SearchOptions): ScoredFact[] {
    const limit = options?.limit ?? 10
    const facts = this.store.getFactsByEntity(entity, options?.category, limit)
    return facts.map((fact, index) => ({ ...fact, score: this.rankScore(fact.trustScore, index) }))
  }

  /**
   * Entity relation: facts that do not mention `entity` themselves but mention
   * another entity that co-occurs with it in some fact.
   *
   * The entity name is matched with LIKE (case-insensitive for ASCII in
   * SQLite's default configuration); `%` and `_` in the name are escaped so
   * they match literally.
   */
  related(entity: string, options?: SearchOptions): ScoredFact[] {
    const limit = options?.limit ?? 10
    const category = options?.category
    const entityPattern = this.escapeLike(entity)

    // Step 1: facts that mention the entity.
    const seedRows = this.db.prepare(`
      SELECT fe.fact_id FROM fact_entities fe
      JOIN entities e ON fe.entity_id = e.entity_id
      WHERE e.name LIKE ? ESCAPE '\\'
    `).all(entityPattern) as Array<{ fact_id: number }>
    if (seedRows.length === 0) return []
    const factIds = [...new Set(seedRows.map(row => row.fact_id))]
    const factPlaceholders = factIds.map(() => '?').join(',')

    // Step 2: the other entities those facts mention.
    const neighbourRows = this.db.prepare(`
      SELECT DISTINCT e.name FROM entities e
      JOIN fact_entities fe ON fe.entity_id = e.entity_id
      WHERE fe.fact_id IN (${factPlaceholders})
        AND e.name NOT LIKE ? ESCAPE '\\'
    `).all(...factIds, entityPattern) as Array<{ name: string }>
    if (neighbourRows.length === 0) return []
    const neighbours = neighbourRows.map(row => row.name)
    const neighbourPlaceholders = neighbours.map(() => '?').join(',')

    // Step 3: facts mentioning those entities, excluding the seed facts.
    const params: unknown[] = [...neighbours, ...factIds]
    let categoryClause = ''
    if (category) {
      categoryClause = 'AND f.category = ?'
      params.push(category)
    }
    params.push(limit)

    const rows = this.db.prepare(`
      SELECT DISTINCT f.fact_id, f.content, f.category, f.tags, f.keywords,
             f.trust_score, f.retrieval_count, f.helpful_count,
             f.created_at, f.updated_at
      FROM facts f
      JOIN fact_entities fe ON f.fact_id = fe.fact_id
      JOIN entities e ON fe.entity_id = e.entity_id
      WHERE e.name IN (${neighbourPlaceholders})
        AND f.fact_id NOT IN (${factPlaceholders})
        ${categoryClause}
      ORDER BY f.trust_score DESC
      LIMIT ?
    `).all(...params) as Array<Record<string, unknown>>

    return rows.map((row, index) => {
      const fact = this.toFact(row)
      return { ...fact, score: this.rankScore(fact.trustScore, index) }
    })
  }

  /**
   * Multi-entity reasoning: facts the store returns for the given entity list,
   * scored by trust with a small per-rank gradient. Empty input yields [].
   */
  reason(entities: string[], options?: SearchOptions): ScoredFact[] {
    if (entities.length === 0) return []
    const facts = this.store.getFactsByEntities(entities, options?.category, options?.limit ?? 10)
    return facts.map((fact, index) => ({ ...fact, score: this.rankScore(fact.trustScore, index) }))
  }

  /**
   * Contradiction detection: pairs of facts whose entity sets overlap strongly
   * while their content tokens differ.
   *
   * score = entityOverlap × (1 − contentSimilarity), both Jaccard measures.
   * Only the most recently updated facts (up to an internal cap) are compared.
   *
   * @param options `threshold` (default 0.3), `limit` (default 10), `category`.
   * @returns Pairs with score ≥ threshold, highest first, values rounded to 3 decimals.
   */
  contradict(options?: ContradictOptions): Contradiction[] {
    const threshold = options?.threshold ?? 0.3
    const limit = options?.limit ?? 10
    const category = options?.category

    const params: unknown[] = []
    let whereClause = ''
    if (category) {
      whereClause = 'WHERE f.category = ?'
      params.push(category)
    }
    params.push(FactRetriever.MAX_CONTRADICTION_FACTS)

    const rows = this.db.prepare(`
      SELECT f.fact_id, f.content, f.category, f.tags, f.keywords, f.trust_score,
             f.retrieval_count, f.helpful_count, f.created_at, f.updated_at
      FROM facts f
      ${whereClause}
      ORDER BY f.updated_at DESC
      LIMIT ?
    `).all(...params) as Array<Record<string, unknown>>
    if (rows.length < 2) return []

    const facts = rows.map(row => this.toFact(row))
    // Lower-cased entity names per fact, index-aligned with `facts`.
    const entitySets = facts.map(
      fact => new Set(this.store.getEntitiesForFact(fact.factId).map(name => name.toLowerCase())),
    )
    // Content tokens are computed at most once per fact instead of once per pair.
    const tokenCache = new Map<number, Set<string>>()
    const tokensOf = (index: number, fact: Fact): Set<string> => {
      let tokens = tokenCache.get(index)
      if (!tokens) {
        tokens = this.tokenize(fact.content)
        tokenCache.set(index, tokens)
      }
      return tokens
    }
    const round3 = (value: number): number => Math.round(value * 1000) / 1000

    const contradictions: Contradiction[] = []
    for (let i = 0; i < facts.length; i++) {
      const factA = facts[i]
      const entitiesA = entitySets[i]
      if (!factA || !entitiesA || entitiesA.size === 0) continue

      for (let j = i + 1; j < facts.length; j++) {
        const factB = facts[j]
        const entitiesB = entitySets[j]
        if (!factB || !entitiesB || entitiesB.size === 0) continue

        // Entity overlap (Jaccard); the union is non-empty because both sets are.
        const shared = [...entitiesA].filter(name => entitiesB.has(name))
        const unionSize = entitiesA.size + entitiesB.size - shared.length
        const entityOverlap = shared.length / unionSize
        if (entityOverlap < FactRetriever.MIN_ENTITY_OVERLAP) continue

        const contentSim = this.jaccardSimilarity(tokensOf(i, factA), tokensOf(j, factB))
        // High entity overlap with low content similarity = potential contradiction.
        const contradictionScore = entityOverlap * (1 - contentSim)
        if (contradictionScore < threshold) continue

        contradictions.push({
          factA,
          factB,
          entityOverlap: round3(entityOverlap),
          contentSimilarity: round3(contentSim),
          contradictionScore: round3(contradictionScore),
          sharedEntities: shared,
        })
      }
    }

    contradictions.sort((a, b) => b.contradictionScore - a.contradictionScore)
    return contradictions.slice(0, limit)
  }

  // ------------------------------------------------------------------
  // Internal helpers
  // ------------------------------------------------------------------

  /**
   * Trust score with a per-rank gradient, clamped so that ranks beyond 20
   * never produce a negative score.
   */
  private rankScore(trustScore: number, index: number, discount = 1): number {
    return trustScore * Math.max(0, 1 - index * FactRetriever.RANK_STEP) * discount
  }

  /** Escapes `\`, `%` and `_` for use in a `LIKE ? ESCAPE '\'` pattern. */
  private escapeLike(text: string): string {
    return text.replace(/[\\%_]/g, '\\$&')
  }

  /** Maps a raw `facts` row to a Fact; NULL text columns become empty strings. */
  private toFact(row: Record<string, unknown>): Fact {
    return {
      factId: Number(row.fact_id),
      content: String(row.content ?? ''),
      category: String(row.category) as FactCategory,
      tags: String(row.tags ?? ''),
      keywords: String(row.keywords ?? '[]'),
      trustScore: Number(row.trust_score),
      retrievalCount: Number(row.retrieval_count ?? 0),
      helpfulCount: Number(row.helpful_count ?? 0),
      createdAt: String(row.created_at ?? ''),
      updatedAt: String(row.updated_at ?? ''),
    }
  }

  /**
   * Stage 1: FTS5 candidate set.
   *
   * Each whitespace-separated word becomes a quoted FTS5 phrase, and every
   * bigram of its Chinese runs is added as an extra term; all terms are ORed.
   * If the MATCH query is rejected, the LIKE fallback is used instead.
   */
  private ftsCandidates(
    query: string,
    category: FactCategory | undefined,
    minTrust: number,
    limit: number,
  ): FtsCandidate[] {
    const ftsParts: string[] = []
    for (const word of query.split(/\s+/)) {
      if (word.length === 0) continue
      // Inside an FTS5 string a double quote is escaped by doubling it.
      ftsParts.push(`"${word.replace(/"/g, '""')}"`)
      for (const segment of word.match(/[\u4e00-\u9fff]+/g) ?? []) {
        for (let i = 0; i < segment.length - 1; i++) {
          ftsParts.push(segment.slice(i, i + 2))
        }
      }
    }
    if (ftsParts.length === 0) return []

    const params: unknown[] = [ftsParts.join(' OR '), minTrust]
    const whereClauses = ['facts_fts MATCH ?', 'f.trust_score >= ?']
    if (category) {
      whereClauses.push('f.category = ?')
      params.push(category)
    }
    params.push(limit)

    const sql = `
      SELECT f.*, facts_fts.rank as fts_rank_raw
      FROM facts_fts
      JOIN facts f ON f.fact_id = facts_fts.rowid
      WHERE ${whereClauses.join(' AND ')}
      ORDER BY facts_fts.rank
      LIMIT ?
    `

    let rows: Array<Record<string, unknown>>
    try {
      rows = this.db.prepare(sql).all(...params) as Array<Record<string, unknown>>
    } catch {
      // FTS5 MATCH can reject malformed queries; degrade to LIKE matching.
      return this.likeFallback(query, category, minTrust, limit)
    }
    if (rows.length === 0) return []

    // FTS5 rank is negative with lower meaning better, so the magnitude is
    // normalised against the largest magnitude in the result set.
    const magnitudes = rows.map(row => {
      const magnitude = Math.abs(Number(row.fts_rank_raw))
      return Number.isFinite(magnitude) ? magnitude : 0
    })
    const maxMagnitude = Math.max(...magnitudes, 1e-6)

    return rows.map((row, index) => ({
      ...this.toFact(row),
      ftsRank: (magnitudes[index] ?? 0) / maxMagnitude,
    }))
  }

  /**
   * Simple tokenizer: lower-cases, splits on whitespace, separators and
   * Chinese/ASCII punctuation, keeps words longer than one character, and adds
   * every bigram of each run of Chinese characters.
   */
  private tokenize(text: string): Set<string> {
    const tokens = new Set<string>()
    if (!text) return tokens

    const normalized = text.toLowerCase()
      .replace(/[_\-/\\|]/g, ' ')
      .replace(FactRetriever.PUNCTUATION, ' ')
    for (const word of normalized.split(/\s+/)) {
      if (word.length > 1) tokens.add(word)
    }

    // Chinese bigrams let Jaccard match unsegmented Chinese text.
    for (const segment of text.match(/[\u4e00-\u9fff]+/g) ?? []) {
      for (let i = 0; i < segment.length - 1; i++) {
        tokens.add(segment.slice(i, i + 2))
      }
    }
    return tokens
  }

  /** Jaccard similarity |a ∩ b| / |a ∪ b|; 0 when either set is empty. */
  private jaccardSimilarity(a: Set<string>, b: Set<string>): number {
    if (a.size === 0 || b.size === 0) return 0
    let shared = 0
    for (const item of a) {
      if (b.has(item)) shared++
    }
    const unionSize = a.size + b.size - shared
    return unionSize > 0 ? shared / unionSize : 0
  }

  /**
   * Containment: fraction of `a`'s tokens that also occur in `b`. Asymmetric;
   * used to measure how much of the query a fact covers. 0 when either is empty.
   */
  private containmentScore(a: Set<string>, b: Set<string>): number {
    if (a.size === 0 || b.size === 0) return 0
    let hits = 0
    for (const item of a) {
      if (b.has(item)) hits++
    }
    return hits / a.size
  }

  /**
   * Temporal decay factor 0.5^(ageDays / halfLifeDays).
   *
   * Returns 1.0 (no decay) when decay is disabled, the timestamp is missing or
   * unparseable, or the timestamp lies in the future.
   */
  private temporalDecay(timestampStr: string | null): number {
    if (!(this.halfLifeDays > 0) || !timestampStr) return 1.0

    // NOTE(review): the original comment says SQLite datetimes here are local
    // time; confirm `new Date` parses the stored format on the target runtime.
    const timestampMs = new Date(timestampStr).getTime()
    // `new Date` does not throw on bad input; it yields an invalid date (NaN).
    if (Number.isNaN(timestampMs)) return 1.0

    const ageDays = (Date.now() - timestampMs) / 86400000
    if (ageDays < 0) return 1.0
    return Math.pow(0.5, ageDays / this.halfLifeDays)
  }

  /** True for short (≤ 20 characters) queries that match an identity pattern. */
  private isPersonalQuery(query: string): boolean {
    const normalized = query.trim().toLowerCase()
    if (normalized.length > 20) return false
    return FactRetriever.PERSONAL_PATTERNS.some(pattern => pattern.test(normalized))
  }

  /** Trust fallback: the store's top facts, scored by trust with a rank gradient. */
  private trustFallback(
    category: FactCategory | undefined,
    minTrust: number,
    limit: number,
  ): ScoredFact[] {
    const facts = this.store.listFacts(category, minTrust, limit)
    return facts.map((fact, index) => ({ ...fact, score: this.rankScore(fact.trustScore, index) }))
  }

  /**
   * LIKE fallback, used when FTS5 fails or finds nothing.
   *
   * Matches any whitespace-separated word, plus every 2- and 3-character
   * window of each Chinese run, as a literal substring of content or tags.
   * Example: "颜色忌讳" also tries "颜色", "色忌", "忌讳", "颜色忌", "色忌讳".
   * LIKE has no ranking, so every hit gets the neutral rank 0.5.
   */
  private likeFallback(
    query: string,
    category: FactCategory | undefined,
    minTrust: number,
    limit: number,
  ): FtsCandidate[] {
    // A Set removes repeated terms, which would only add redundant conditions.
    const terms = new Set<string>()
    for (const word of query.split(/\s+/)) {
      if (word.length > 0) terms.add(word)
    }
    if (terms.size === 0) return []

    for (const segment of query.match(/[\u4e00-\u9fff]+/g) ?? []) {
      if (segment.length < 2) continue
      for (let i = 0; i < segment.length - 1; i++) terms.add(segment.slice(i, i + 2))
      for (let i = 0; i < segment.length - 2; i++) terms.add(segment.slice(i, i + 3))
    }

    const conditions: string[] = []
    const params: unknown[] = []
    for (const term of terms) {
      // Escape LIKE wildcards so user text is matched literally.
      const pattern = `%${this.escapeLike(term)}%`
      conditions.push("(f.content LIKE ? ESCAPE '\\' OR f.tags LIKE ? ESCAPE '\\')")
      params.push(pattern, pattern)
    }

    params.push(minTrust)
    let categoryClause = ''
    if (category) {
      categoryClause = 'AND f.category = ?'
      params.push(category)
    }
    params.push(limit)

    const rows = this.db.prepare(`
      SELECT f.fact_id, f.content, f.category, f.tags, f.keywords,
             f.trust_score, f.retrieval_count, f.helpful_count,
             f.created_at, f.updated_at
      FROM facts f
      WHERE (${conditions.join(' OR ')})
        AND f.trust_score >= ?
        ${categoryClause}
      ORDER BY f.trust_score DESC
      LIMIT ?
    `).all(...params) as Array<Record<string, unknown>>

    return rows.map(row => ({ ...this.toFact(row), ftsRank: 0.5 }))
  }

  /**
   * Character-level fallback for Chinese queries when FTS5 and LIKE both find
   * nothing: scans up to 200 facts from the store and keeps those containing
   * at least 40% of the query's distinct non-stop Chinese characters.
   */
  private charOverlapFallback(
    query: string,
    category: FactCategory | undefined,
    minTrust: number,
    limit: number,
  ): FtsCandidate[] {
    const queryChars = [...new Set(query.match(/[\u4e00-\u9fff]/g) ?? [])]
      .filter(char => !CN_OVERLAP_STOP.has(char))
    // Too few meaningful characters to match on.
    if (queryChars.length < 2) return []

    const matches: Array<{ fact: Fact; overlap: number }> = []
    for (const fact of this.store.listFacts(category, minTrust, 200)) {
      const haystack = fact.content + fact.tags
      const hits = queryChars.filter(char => haystack.includes(char)).length
      const overlap = hits / queryChars.length
      if (overlap >= 0.4) matches.push({ fact, overlap })
    }
    if (matches.length === 0) return []

    // Order by overlap × trust.
    matches.sort((a, b) => (b.overlap * b.fact.trustScore) - (a.overlap * a.fact.trustScore))

    // The overlap ratio, damped to 0.8, stands in for a lexical rank.
    return matches.slice(0, limit).map(({ fact, overlap }) => ({ ...fact, ftsRank: overlap * 0.8 }))
  }

  /**
   * Category-inference fallback: infers a category from the query and returns
   * that category's facts from the store, discounted by 0.7 because the
   * inference is less certain than a lexical match.
   */
  private categoryInferFallback(
    query: string,
    minTrust: number,
    limit: number,
  ): ScoredFact[] {
    const inferred = this.inferCategory(query)
    if (!inferred) return []

    const facts = this.store.listFacts(inferred, minTrust, limit)
    return facts.map((fact, index) => ({
      ...fact,
      score: this.rankScore(fact.trustScore, index, 0.7),
    }))
  }

  /**
   * Learns the category → tag map from up to 200 stored facts (trust ≥ 0.2).
   * Tags come from the `tags` field, from Chinese bigrams free of stop
   * characters, and from English words of three or more letters. Cached.
   */
  private getCategoryTagMap(): Map<FactCategory, Set<string>> {
    if (this._categoryTagMap) return this._categoryTagMap

    const map = new Map<FactCategory, Set<string>>()
    for (const fact of this.store.listFacts(undefined, 0.2, 200)) {
      let tagSet = map.get(fact.category)
      if (!tagSet) {
        tagSet = new Set<string>()
        map.set(fact.category, tagSet)
      }

      // Explicit tags.
      for (const rawTag of fact.tags.split(',')) {
        const tag = rawTag.trim().toLowerCase()
        if (tag.length >= 2) tagSet.add(tag)
      }

      // Chinese bigrams from the content act as implicit tags.
      for (const segment of fact.content.match(/[\u4e00-\u9fff]+/g) ?? []) {
        for (let i = 0; i < segment.length - 1; i++) {
          const bigram = segment.slice(i, i + 2)
          if (!CN_OVERLAP_STOP.has(bigram.charAt(0)) && !CN_OVERLAP_STOP.has(bigram.charAt(1))) {
            tagSet.add(bigram)
          }
        }
      }

      // English words (≥ 3 letters) from the content act as implicit tags.
      for (const word of fact.content.match(/[a-zA-Z]{3,}/g) ?? []) {
        tagSet.add(word.toLowerCase())
      }
    }

    this._categoryTagMap = map
    return map
  }

  /**
   * Learns Chinese/English term pairs. Cached.
   *
   * Sources: a small seed table of core IT terms, plus pairs extracted from up
   * to 200 stored facts where a term is annotated in parentheses ("逆向(reverse)")
   * or joined by a colon or equals sign ("名称:BlockShip"); ASCII and full-width
   * punctuation are both accepted. A Chinese term that maps to more than one
   * English term is ambiguous and is dropped.
   */
  private getCnEnPairs(): Array<[string, string]> {
    if (this._cnEnPairs) return this._cnEnPairs

    // Chinese term → set of English candidates.
    const candidateMap = new Map<string, Set<string>>()

    const SEED_PAIRS: Array<[string, string]> = [
      ['抓取', 'scraping'], ['爬虫', 'crawler'], ['逆向', 'reverse'],
      ['部署', 'deploy'], ['架构', 'architecture'], ['接口', 'api'],
      ['数据库', 'database'], ['缓存', 'cache'], ['配置', 'config'],
      ['构建', 'build'], ['编译', 'compile'], ['调试', 'debug'],
      ['测试', 'test'], ['提交', 'commit'], ['合并', 'merge'],
      ['终端', 'terminal'], ['命令行', 'cli'], ['邮箱', 'email'],
      ['模型', 'model'], ['插件', 'plugin'], ['渐变', 'gradient'],
    ]
    for (const [cn, en] of SEED_PAIRS) {
      this.addPair(candidateMap, cn, en)
    }

    // Capture group 1 comes first in the text; `cnFirst` says which side it is.
    const extractors: Array<{ pattern: RegExp; cnFirst: boolean }> = [
      { pattern: /([\u4e00-\u9fff]{2,4})\s*[(\uff08]\s*([a-zA-Z]{2,})\s*[)\uff09]/g, cnFirst: true },
      { pattern: /([a-zA-Z]{2,})\s*[(\uff08]\s*([\u4e00-\u9fff]{2,4})\s*[)\uff09]/g, cnFirst: false },
      { pattern: /([\u4e00-\u9fff]{2,4})\s*[:\uff1a=]\s*([a-zA-Z]{2,})/g, cnFirst: true },
      { pattern: /([a-zA-Z]{2,})\s*[:\uff1a=]\s*([\u4e00-\u9fff]{2,4})/g, cnFirst: false },
    ]

    for (const fact of this.store.listFacts(undefined, 0.2, 200)) {
      const text = fact.content + ' ' + fact.tags
      for (const { pattern, cnFirst } of extractors) {
        for (const match of text.matchAll(pattern)) {
          const first = match[1]
          const second = match[2]
          if (!first || !second) continue
          if (cnFirst) this.addPair(candidateMap, first, second.toLowerCase())
          else this.addPair(candidateMap, second, first.toLowerCase())
        }
      }
    }

    // Keep only unambiguous mappings whose Chinese side does not start with a stop character.
    const pairs: Array<[string, string]> = []
    for (const [cn, enSet] of candidateMap) {
      if (enSet.size !== 1) continue
      const [en] = [...enSet]
      if (en !== undefined && en.length >= 2 && !CN_OVERLAP_STOP.has(cn.charAt(0))) {
        pairs.push([cn, en])
      }
    }

    this._cnEnPairs = pairs
    return pairs
  }

  /** Records `en` as a candidate translation of `cn`. */
  private addPair(map: Map<string, Set<string>>, cn: string, en: string): void {
    let candidates = map.get(cn)
    if (!candidates) {
      candidates = new Set<string>()
      map.set(cn, candidates)
    }
    candidates.add(en)
  }

  /**
   * Bilingual query expansion: for every learned pair whose one side occurs in
   * the query (substring match, case-insensitive) and whose other side does
   * not, the missing side is appended to the query.
   */
  private expandQueryBilingually(query: string): string {
    const pairs = this.getCnEnPairs()
    if (pairs.length === 0) return query

    const lowered = query.toLowerCase()
    const extras: string[] = []
    for (const [cn, en] of pairs) {
      const hasCn = lowered.includes(cn)
      const hasEn = lowered.includes(en)
      if (hasCn && !hasEn) extras.push(en) // Chinese → English
      if (hasEn && !hasCn) extras.push(cn) // English → Chinese
    }

    return extras.length > 0 ? `${query} ${extras.join(' ')}` : query
  }

  /**
   * Infers the category whose learned tags occur most often in the query.
   * Returns null when no tag of any category occurs; ties keep the category
   * encountered first.
   */
  private inferCategory(query: string): FactCategory | null {
    const lowered = query.toLowerCase()

    let bestCategory: FactCategory | null = null
    let bestScore = 0
    for (const [category, tags] of this.getCategoryTagMap()) {
      let score = 0
      for (const tag of tags) {
        if (lowered.includes(tag)) score++
      }
      if (score > bestScore) {
        bestScore = score
        bestCategory = category
      }
    }

    return bestScore >= 1 ? bestCategory : null
  }

  /**
   * Retrieval tracking: increments `retrieval_count` for every returned fact,
   * and for the top three adds 0.01 trust (capped at 1.0) and resets
   * `updated_at`, which restarts their decay clock.
   */
  private trackRetrieval(facts: ScoredFact[]): void {
    if (facts.length === 0) return

    const ids = facts.map(fact => fact.factId)
    const placeholders = ids.map(() => '?').join(',')
    this.db.prepare(
      `UPDATE facts SET retrieval_count = retrieval_count + 1 WHERE fact_id IN (${placeholders})`
    ).run(...ids)

    // Prepared once and reused for each of the top three facts.
    const refreshTrust = this.db.prepare(
      `UPDATE facts SET trust_score = MIN(1.0, trust_score + 0.01), updated_at = datetime('now', 'localtime') WHERE fact_id = ?`
    )
    for (const id of ids.slice(0, 3)) {
      refreshTrust.run(id)
    }
  }
}
|