@scotthuang/engram 0.9.9 → 0.10.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/profile.js CHANGED
@@ -1,66 +1,200 @@
1
1
  /**
2
- * Memory System Plugin - Profile (三层语义画像)
2
+ * Memory System Plugin - Profile (四层语义画像)
3
3
  *
4
- * 三层架构:
4
+ * 四层架构:
5
5
  * identity — 核心身份(姓名/城市/职业/家人),几乎不变,不衰减
6
6
  * pattern — 行为模式(作息/饮食习惯/工作风格),统计驱动,慢衰减
7
7
  * interest — 动态兴趣(当前项目/近期关注),高频变化,快衰减
8
+ * event — 一次性事件(排查/bug/配置),带 TTL,短期内自动消除
8
9
  *
9
- * 画像 JSON 结构 + 读写 + 分层衰减 + 压缩摘要
10
+ * 画像 JSON 结构 + 读写 + 分层衰减 + 压缩摘要 + 受控维度 + LLM 自审
10
11
  */
11
12
  import { promises as fs } from "node:fs";
12
13
  import { logger } from "./logger.js";
13
14
  import { join } from "node:path";
14
- /** 各层的衰减因子和淘汰阈值 */
15
/**
 * Per-layer tuning table.
 * - decayFactor: multiplier applied to a tag's confidence on every decay pass
 * - pruneThreshold: a tag whose confidence falls to this value or below is dropped
 * - defaultConfidence: confidence assigned to a newly created tag of that layer
 * - ttlDays: maximum age (days since lastSeen) before the tag is dropped;
 *   Infinity disables the TTL check
 */
const LAYER_CONFIG = {
    identity: { decayFactor: 1.0, pruneThreshold: 0.1, defaultConfidence: 0.95, ttlDays: Infinity },
    pattern: { decayFactor: 0.995, pruneThreshold: 0.3, defaultConfidence: 0.7, ttlDays: 180 },
    interest: { decayFactor: 0.95, pruneThreshold: 0.25, defaultConfidence: 0.7, ttlDays: 30 },
    event: { decayFactor: 0.85, pruneThreshold: 0.3, defaultConfidence: 0.55, ttlDays: 7 },
};
/**
 * Relative rank of each layer (higher wins). Used when deciding whether a tag
 * may be promoted to another layer and when ordering tags across layers.
 */
const LAYER_PRIORITY = {
    identity: 4,
    pattern: 3,
    interest: 2,
    event: 1,
};
28
+ // ============================================================================
29
+ // 受控维度表(Canonical Vocabulary)
30
+ // ============================================================================
31
/**
 * Controlled dimension vocabulary: at settle time the LLM may only pick from
 * these dimensions. The goal is to stop near-synonym dimension names
 * (e.g. "技术/技术调试/技术运维/技术/调试") from fragmenting the profile.
 */
export const CANONICAL_DIMENSIONS = [
    "身份", // identity: name, occupation, family
    "作息", // routine: sleep / diet / exercise habits
    "技术", // technology: programming, tech stack, tool preferences
    "项目", // projects: ongoing or long-running projects
    "兴趣", // interests: hobbies, areas the user follows
    "偏好", // preferences: interaction style, decision patterns
    "人际", // relationships: family, important contacts
];
44
/**
 * Dimension alias map: legacy dimension names (fragments produced when the LLM
 * improvised) -> canonical dimension.
 * Used for cleaning up historical data and for tolerating the occasional
 * out-of-vocabulary dimension the LLM still emits.
 *
 * The table lives on a null-prototype object so that looking up an arbitrary
 * key such as "constructor" or "toString" can never resolve to an inherited
 * Object.prototype member and be mistaken for an alias.
 */
const DIMENSION_ALIAS = Object.assign(Object.create(null), {
    // Technology family
    技术行为: "技术",
    技术工具: "技术",
    技术方向: "技术",
    技术开发: "技术",
    技术排查: "技术",
    技术领域: "技术",
    技术配置: "技术",
    技术问题排查: "技术",
    技术调试: "技术",
    技术运维: "技术",
    技术操作: "技术",
    技术测试: "技术",
    技术探索: "技术",
    技术实践: "技术",
    技术活动: "技术",
    技术文档: "技术",
    "技术/开发": "技术",
    "技术/运维": "技术",
    "技术/调试": "技术",
    "技术/项目": "项目",
    // The un-slashed spelling was mapped to "项目" by the previous release's
    // merge rules; without an explicit entry the "技术" prefix match would
    // claim it instead.
    技术项目: "项目",
    配置优化: "技术",
    系统维护: "技术",
    系统配置: "技术",
    自动化运维: "技术",
    功能规则: "技术",
    新闻工具: "技术",
    数据分析: "技术",
    网络工具: "技术",
    通信技术: "技术",
    配置文件管理: "技术",
    调试: "技术",
    "调试/排查": "技术",
    问题修复: "技术",
    系统清理: "技术",
    工具: "技术",
    "工具/平台": "技术",
    "工具/框架": "技术",
    AI工具: "技术",
    AI平台: "技术",
    AI模型: "技术",
    AI技术: "技术",
    "AI/产品": "技术",
    AI: "技术",
    // Project family
    应用场景: "项目",
    任务类型: "项目",
    协作规划: "项目",
    规划: "项目",
    活动: "项目",
    // Routine family
    生活作息: "作息",
    个人习惯: "作息",
    // Interest family
    兴趣爱好: "兴趣",
    新闻资讯: "兴趣",
    金融资讯: "兴趣",
    科技新闻: "兴趣",
    行业新闻: "兴趣",
    资讯: "兴趣",
    热点资讯: "兴趣",
    信息获取: "兴趣",
    投资: "兴趣",
    // Preference family
    决策风格: "偏好",
    生活: "偏好",
    // Relationship family
    家庭: "人际",
    // Travel / place / work-status names. They are not really profile
    // dimensions, but this table folds them into "项目" rather than dropping
    // them; any demotion to the event layer has to happen elsewhere.
    出行: "项目",
    交通: "项目",
    "生活/计划": "项目",
    出行计划: "项目",
    旅行出行: "项目",
    地域活动: "项目",
    地点关注: "项目",
    工作动态: "项目",
    // Location. NOTE(review): the original note says locations are routed to
    // the `locations` field by special handling; that handling is not visible
    // here, so "项目" is only the fallback bucket.
    位置: "项目",
});
129
/**
 * Keywords that mark a tag value as a one-off event. A hit forces the tag down
 * to the event layer, so that a single troubleshooting / fix session is not
 * recorded as a long-lived interest.
 */
const EVENT_KEYWORDS = [
    "排查",
    "修复",
    "bug",
    "Bug",
    "BUG",
    "调试",
    "debug",
    "验证",
    "测试",
    "超时",
    "SIGTERM",
    "报错",
    "故障",
    "异常",
    "清理",
    "恢复",
];
/**
 * Lower-cased, de-duplicated view of EVENT_KEYWORDS. Matching against this
 * list makes the check case-insensitive instead of relying on every casing
 * ("bug" / "Bug" / "BUG") being spelled out by hand.
 */
const EVENT_KEYWORDS_FOLDED = [...new Set(EVENT_KEYWORDS.map((kw) => kw.toLowerCase()))];
/**
 * Tell whether a tag value looks like a one-off event.
 *
 * @param {string} value tag text to inspect
 * @returns {boolean} true when the value contains any event keyword,
 *   ignoring letter case; false for empty or non-string input
 */
export function isLikelyEvent(value) {
    if (typeof value !== "string" || value.length === 0) {
        return false;
    }
    const folded = value.toLowerCase();
    return EVENT_KEYWORDS_FOLDED.some((kw) => folded.includes(kw));
}
154
/**
 * Map an arbitrary dimension name onto CANONICAL_DIMENSIONS.
 *
 * Resolution order:
 *   1. exact match with a canonical dimension -> returned as is
 *   2. own entry in DIMENSION_ALIAS           -> the alias target
 *   3. name starts with a canonical dimension -> that dimension
 *      (e.g. "技术/xxx", "项目/xxx")
 *   4. otherwise                              -> null; the caller decides
 *      whether to drop the tag or bucket it elsewhere
 *
 * @param {string} dim raw dimension name; surrounding whitespace is ignored
 * @returns {string | null} canonical dimension, or null when nothing matches
 *   (also for empty or non-string input)
 */
export function canonicalizeDimension(dim) {
    if (typeof dim !== "string") {
        return null;
    }
    const trimmed = dim.trim();
    if (trimmed === "") {
        return null;
    }
    if (CANONICAL_DIMENSIONS.includes(trimmed)) {
        return trimmed;
    }
    // Own-property check: a bare `DIMENSION_ALIAS[trimmed]` would also find
    // inherited members such as "constructor" or "toString" and hand back a
    // function instead of a dimension name.
    if (Object.hasOwn(DIMENSION_ALIAS, trimmed)) {
        return DIMENSION_ALIAS[trimmed];
    }
    const byPrefix = CANONICAL_DIMENSIONS.find((canon) => trimmed.startsWith(canon));
    return byPrefix ?? null;
}
20
175
/**
 * Template for a profile that holds no data yet.
 *
 * NOTE(review): `updatedAt` is evaluated once, when this module is loaded, and
 * `coreTags` / `tags` / `locations` are single shared objects. A shallow copy
 * such as `{ ...EMPTY_PROFILE }` still aliases `coreTags` and `tags`, so
 * mutating the copy also mutates this template — confirm that callers replace
 * or deep-copy those fields before writing to them.
 */
export const EMPTY_PROFILE = {
    summary: "",
    coreTags: [],
    tags: {},
    locations: { recent: [] },
    updatedAt: new Date().toISOString(),
};
182
+ // ============================================================================
183
+ // 工具函数
184
+ // ============================================================================
26
185
/**
 * Resolve the layer of a tag.
 *
 * Falls back to "interest" when the tag carries no layer (older data) or a
 * layer name that LAYER_CONFIG does not know. Without the second fallback an
 * unexpected value would make `LAYER_CONFIG[layer]` undefined in callers,
 * which then crash or produce NaN counters.
 *
 * @param {{ layer?: string }} tag
 * @returns {string} a key of LAYER_CONFIG
 */
function getLayer(tag) {
    const declared = tag.layer;
    const isKnown = typeof declared === "string" && Object.hasOwn(LAYER_CONFIG, declared);
    return isKnown ? declared : "interest";
}
30
- // ---- 维度归一化规则 ----
31
- /** 维度合并:源维度 目标维度(碎片化的近义维度归一化到标准维度) */
32
- const DIMENSION_MERGE_RULES = {
33
- "技术行为": "技术",
34
- "技术工具": "技术",
35
- "技术方向": "技术",
36
- "技术开发": "技术",
37
- "技术排查": "技术",
38
- "技术项目": "项目",
39
- "技术领域": "技术",
40
- "技术配置": "技术",
41
- "技术问题排查": "技术",
42
- "配置优化": "技术",
43
- "系统维护": "技术",
44
- "自动化运维": "技术",
45
- "功能规则": "技术",
46
- "新闻工具": "技术",
47
- "数据分析": "技术",
48
- "出行计划": "出行",
49
- "旅行出行": "出行",
50
- "生活作息": "作息",
51
- "个人习惯": "生活",
52
- "决策风格": "生活",
53
- "应用场景": "项目",
54
- };
55
- /** 语义去重组:[保留项, ...要合并删除的等价项] */
56
- const DEDUP_GROUPS = [
57
- ["talk-to-shadow", "talk-to-shadow语音交互项目", "talk-to-shadow方案"],
58
- ["声纹识别", "声纹识别逻辑修正", "置信度阈值调试"],
59
- ["语音交互/声纹识别", "TTS语音合成", "语音系统优化", "语音引擎切换"],
60
- ["家庭成员身份识别", "家庭成员声纹识别", "陌生人识别规则优化"],
61
- ["news-knowledge-base", "新闻知识库开发维护", "新闻知识库项目管理者", "AI-Agent新闻整理"],
62
- ["AI记忆系统研究者", "LLM应用技术关注者"],
63
- ];
189
/**
 * Number of days elapsed from `iso1` to `iso2`, as a fraction, never negative.
 *
 * @param {string} iso1 start timestamp (ISO string)
 * @param {string} [iso2] end timestamp; defaults to the current time
 * @returns {number} elapsed days, 0 when `iso2` is earlier than `iso1`,
 *   NaN when either timestamp cannot be parsed
 */
function daysBetween(iso1, iso2 = new Date().toISOString()) {
    const MS_PER_DAY = 86400000;
    const startMs = new Date(iso1).getTime();
    const endMs = new Date(iso2).getTime();
    const elapsedDays = (endMs - startMs) / MS_PER_DAY;
    return Math.max(0, elapsedDays);
}
195
+ // ============================================================================
196
+ // ProfileManager
197
+ // ============================================================================
64
198
  export class ProfileManager {
65
199
  profile = null;
66
200
  profilePath;
@@ -75,14 +209,18 @@ export class ProfileManager {
75
209
  return this.profile;
76
210
  try {
77
211
  const raw = await fs.readFile(this.profilePath, "utf-8");
78
- this.profile = JSON.parse(raw);
212
+ const parsed = JSON.parse(raw);
213
+ // 向后兼容:旧文件没有 locations 字段
214
+ if (!parsed.locations)
215
+ parsed.locations = { recent: [] };
216
+ this.profile = parsed;
79
217
  const tagCount = Object.values(this.profile.tags).reduce((sum, tags) => sum + tags.length, 0);
80
218
  const layerCounts = this.countByLayer(this.profile);
81
- logger.info(`[engram:profile] Loaded profile: ${tagCount} tags (identity=${layerCounts.identity} pattern=${layerCounts.pattern} interest=${layerCounts.interest}), ${Object.keys(this.profile.tags).length} dimensions, coreTags=[${this.profile.coreTags.join(", ")}]`);
219
+ logger.info(`[engram:profile] Loaded profile: ${tagCount} tags (identity=${layerCounts.identity} pattern=${layerCounts.pattern} interest=${layerCounts.interest} event=${layerCounts.event}), ${Object.keys(this.profile.tags).length} dimensions, coreTags=[${this.profile.coreTags.join(", ")}]`);
82
220
  }
83
221
  catch {
84
222
  logger.info(`[engram:profile] Profile not found at ${this.profilePath}, using empty profile`);
85
- this.profile = { ...EMPTY_PROFILE };
223
+ this.profile = { ...EMPTY_PROFILE, locations: { recent: [] } };
86
224
  }
87
225
  return this.profile;
88
226
  }
@@ -101,7 +239,12 @@ export class ProfileManager {
101
239
  }
102
240
  /** 统计各层标签数量 */
103
241
  countByLayer(profile) {
104
- const counts = { identity: 0, pattern: 0, interest: 0 };
242
+ const counts = {
243
+ identity: 0,
244
+ pattern: 0,
245
+ interest: 0,
246
+ event: 0,
247
+ };
105
248
  for (const tags of Object.values(profile.tags)) {
106
249
  for (const t of tags) {
107
250
  counts[getLayer(t)]++;
@@ -111,16 +254,19 @@ export class ProfileManager {
111
254
  }
112
255
  /**
113
256
  * 获取召回用的摘要信息(控制 token 消耗)
114
- * 优先展示 identity + pattern,interest confidence 排
257
+ * 优先展示 summary + locations + coreTags
115
258
  */
116
259
  getRecallContext(profile) {
117
- if (!profile.summary && profile.coreTags.length === 0) {
260
+ if (!profile.summary && profile.coreTags.length === 0 && !profile.locations?.primary) {
118
261
  return "";
119
262
  }
120
263
  const parts = [];
121
264
  if (profile.summary) {
122
265
  parts.push(`【用户画像】${profile.summary}`);
123
266
  }
267
+ if (profile.locations?.primary) {
268
+ parts.push(`【常驻地】${profile.locations.primary}`);
269
+ }
124
270
  if (profile.coreTags.length > 0) {
125
271
  parts.push(`【核心标签】${profile.coreTags.join(", ")}`);
126
272
  }
@@ -128,95 +274,122 @@ export class ProfileManager {
128
274
  }
129
275
  /**
130
276
  * 添加标签(增量更新,默认 layer="interest")
277
+ * 新增:
278
+ * - 若 dimension 不在受控词表里,尝试 canonicalize,失败则丢弃
279
+ * - 若 value 命中事件关键词,强制降级为 event 层
131
280
  */
132
281
  addTag(profile, dimension, value, layer = "interest") {
133
- if (!profile.tags[dimension]) {
134
- profile.tags[dimension] = [];
282
+ // 受控维度规范化
283
+ const canon = canonicalizeDimension(dimension);
284
+ if (!canon) {
285
+ logger.info(`[engram:profile] addTag: skip uncontrolled dimension "${dimension}" for value "${value}"`);
286
+ return profile;
135
287
  }
136
- const existing = profile.tags[dimension].find(t => t.value === value);
288
+ // 事件类关键词强制降级为 event 层(除非 LLM 明确声明 identity/pattern)
289
+ let finalLayer = layer;
290
+ if (layer === "interest" && isLikelyEvent(value)) {
291
+ finalLayer = "event";
292
+ logger.info(`[engram:profile] addTag: "${value}" demoted to event layer (keyword match)`);
293
+ }
294
+ if (!profile.tags[canon]) {
295
+ profile.tags[canon] = [];
296
+ }
297
+ const existing = profile.tags[canon].find((t) => t.value === value);
137
298
  if (existing) {
138
299
  existing.confidence = Math.min(1.0, existing.confidence + 0.1);
139
300
  existing.lastSeen = new Date().toISOString();
140
- // 如果已有标签被提升层级(如 interest → identity),更新 layer
141
301
  const existingLayer = getLayer(existing);
142
- const layerPriority = { identity: 3, pattern: 2, interest: 1 };
143
- if (layerPriority[layer] > layerPriority[existingLayer]) {
144
- existing.layer = layer;
145
- logger.info(`[engram:profile] Tag "${value}" promoted: ${existingLayer} → ${layer}`);
302
+ if (LAYER_PRIORITY[finalLayer] > LAYER_PRIORITY[existingLayer]) {
303
+ existing.layer = finalLayer;
304
+ logger.info(`[engram:profile] Tag "${value}" promoted: ${existingLayer} → ${finalLayer}`);
146
305
  }
147
306
  }
148
307
  else {
149
- const cfg = LAYER_CONFIG[layer];
150
- profile.tags[dimension].push({
308
+ const cfg = LAYER_CONFIG[finalLayer];
309
+ profile.tags[canon].push({
151
310
  value,
152
311
  confidence: cfg.defaultConfidence,
153
312
  lastSeen: new Date().toISOString(),
154
- layer,
313
+ layer: finalLayer,
155
314
  });
156
315
  }
157
316
  return profile;
158
317
  }
159
318
  /**
160
319
  * 分层衰减标签置信度
161
- * identity 不衰减,pattern 慢衰减(0.995),interest 快衰减(0.95)
320
+ * identity 不衰减,pattern 慢衰减(0.995),interest 快衰减(0.95),event 最快(0.85)
321
+ * 同时基于 TTL 清理过期条目(event: 7d, interest: 30d, pattern: 180d)
162
322
  */
163
323
  decayTags(profile, factor) {
164
324
  let decayed = 0;
165
- let pruned = 0;
325
+ let prunedByConfidence = 0;
326
+ let prunedByTtl = 0;
166
327
  const layerStats = {
167
- identity: { decayed: 0, pruned: 0 },
168
- pattern: { decayed: 0, pruned: 0 },
169
- interest: { decayed: 0, pruned: 0 },
328
+ identity: 0,
329
+ pattern: 0,
330
+ interest: 0,
331
+ event: 0,
170
332
  };
171
333
  for (const dimension of Object.keys(profile.tags)) {
172
334
  const before = profile.tags[dimension].length;
173
335
  profile.tags[dimension] = profile.tags[dimension]
174
- .map(t => {
336
+ .map((t) => {
175
337
  const layer = getLayer(t);
176
338
  const cfg = LAYER_CONFIG[layer];
177
- // 如果调用方传了 factor,对 interest 用 factor,其他层用各自配置
178
- // 如果没传 factor,全部用各层配置
339
+ // 调用方传了 factor 时仅对 interest 生效
179
340
  const actualFactor = factor !== undefined && layer === "interest" ? factor : cfg.decayFactor;
180
341
  return { ...t, confidence: t.confidence * actualFactor };
181
342
  })
182
- .filter(t => {
343
+ .filter((t) => {
183
344
  const layer = getLayer(t);
184
345
  const cfg = LAYER_CONFIG[layer];
185
- return t.confidence > cfg.pruneThreshold;
346
+ // TTL 剪枝
347
+ if (Number.isFinite(cfg.ttlDays) && daysBetween(t.lastSeen) > cfg.ttlDays) {
348
+ prunedByTtl++;
349
+ return false;
350
+ }
351
+ // 置信度剪枝
352
+ if (t.confidence <= cfg.pruneThreshold) {
353
+ prunedByConfidence++;
354
+ return false;
355
+ }
356
+ return true;
186
357
  });
187
- // 统计
188
358
  for (const t of profile.tags[dimension]) {
189
- layerStats[getLayer(t)].decayed++;
359
+ layerStats[getLayer(t)]++;
190
360
  }
191
- const prunedCount = before - profile.tags[dimension].length;
192
- pruned += prunedCount;
193
361
  decayed += profile.tags[dimension].length;
194
362
  if (profile.tags[dimension].length === 0) {
195
363
  delete profile.tags[dimension];
196
364
  }
365
+ void before;
197
366
  }
198
- logger.info(`[engram:profile] decayTags: identity=${layerStats.identity.decayed}(kept) pattern=${layerStats.pattern.decayed}(kept) interest=${layerStats.interest.decayed}(kept), pruned=${pruned} low-confidence tags`);
367
+ logger.info(`[engram:profile] decayTags: kept identity=${layerStats.identity} pattern=${layerStats.pattern} interest=${layerStats.interest} event=${layerStats.event}, pruned=${prunedByConfidence}(low-conf) + ${prunedByTtl}(TTL)`);
368
+ void decayed;
199
369
  return profile;
200
370
  }
201
371
  /**
202
- * 生成 coreTags:优先 identity → pattern → interest,每层取 confidence 最高的
372
+ * 生成 coreTags:优先 identity → pattern → interest,event 不计入
373
+ * 同时跳过纯经纬度和地址(交由 locations 字段独立展示)
203
374
  */
204
375
  generateCoreTags(profile) {
205
376
  const allTags = [];
206
377
  for (const tags of Object.values(profile.tags)) {
207
378
  for (const t of tags) {
208
- allTags.push({ value: t.value, confidence: t.confidence, layer: getLayer(t) });
379
+ const layer = getLayer(t);
380
+ if (layer === "event")
381
+ continue; // event 不参与 coreTags
382
+ if (isLikelyCoordinate(t.value))
383
+ continue; // 过滤纯经纬度
384
+ allTags.push({ value: t.value, confidence: t.confidence, layer });
209
385
  }
210
386
  }
211
- // 排序:identity 优先 > pattern > interest,同层内按 confidence 降序
212
- const layerPriority = { identity: 3, pattern: 2, interest: 1 };
213
387
  allTags.sort((a, b) => {
214
- const layerDiff = layerPriority[b.layer] - layerPriority[a.layer];
388
+ const layerDiff = LAYER_PRIORITY[b.layer] - LAYER_PRIORITY[a.layer];
215
389
  if (layerDiff !== 0)
216
390
  return layerDiff;
217
391
  return b.confidence - a.confidence;
218
392
  });
219
- // 去重,取前 10 个
220
393
  const seen = new Set();
221
394
  const result = [];
222
395
  for (const t of allTags) {
@@ -224,37 +397,40 @@ export class ProfileManager {
224
397
  continue;
225
398
  seen.add(t.value);
226
399
  result.push(t.value);
227
- if (result.length >= 10)
228
- break;
400
+ if (result.length >= 8)
401
+ break; // 从 10 收紧到 8
229
402
  }
230
403
  return result;
231
404
  }
232
405
  /**
233
- * 维度归一化:合并碎片维度 + 语义去重 + 清理空维度
234
- * 在月度 settle 中调用,防止 interest 更新时 LLM 自由发挥维度名导致碎片化
406
+ * 维度归一化:合并碎片维度 + 按受控表收敛 + 清理空维度
235
407
  *
236
- * @returns 统计信息 { merged, deduped, emptied }
408
+ * @returns 统计信息 { merged, deduped, emptied, demotedToEvent }
237
409
  */
238
410
  normalizeDimensions(profile) {
239
411
  let merged = 0;
240
412
  let deduped = 0;
241
413
  let emptied = 0;
242
- // ---- 维度合并规则 ----
243
- for (const [srcDim, targetDim] of Object.entries(DIMENSION_MERGE_RULES)) {
244
- if (!profile.tags[srcDim] || srcDim === targetDim)
414
+ let demotedToEvent = 0;
415
+ // ---- 1. 维度合并到受控表 ----
416
+ const dimsToProcess = Object.keys(profile.tags);
417
+ for (const srcDim of dimsToProcess) {
418
+ const targetDim = canonicalizeDimension(srcDim);
419
+ if (!targetDim || targetDim === srcDim)
245
420
  continue;
246
421
  if (!profile.tags[targetDim])
247
422
  profile.tags[targetDim] = [];
248
423
  for (const tag of profile.tags[srcDim]) {
249
- const existing = profile.tags[targetDim].find(t => t.value === tag.value);
424
+ const existing = profile.tags[targetDim].find((t) => t.value === tag.value);
250
425
  if (existing) {
251
426
  if (tag.confidence > existing.confidence) {
252
427
  existing.confidence = tag.confidence;
253
428
  existing.lastSeen = tag.lastSeen;
254
429
  }
255
- const lp = { identity: 3, pattern: 2, interest: 1 };
256
- if ((lp[tag.layer || "interest"] || 1) > (lp[existing.layer || "interest"] || 1)) {
257
- existing.layer = tag.layer;
430
+ const tagLayer = tag.layer || "interest";
431
+ const existingLayer = existing.layer || "interest";
432
+ if (LAYER_PRIORITY[tagLayer] > LAYER_PRIORITY[existingLayer]) {
433
+ existing.layer = tagLayer;
258
434
  }
259
435
  }
260
436
  else {
@@ -264,46 +440,266 @@ export class ProfileManager {
264
440
  }
265
441
  delete profile.tags[srcDim];
266
442
  }
267
- // ---- 语义去重 ----
268
- for (const group of DEDUP_GROUPS) {
269
- const [keepValue, ...removeValues] = group;
270
- const removeSet = new Set(removeValues);
271
- for (const [dim, tags] of Object.entries(profile.tags)) {
272
- const keepTag = tags.find(t => t.value === keepValue);
273
- const removeTags = tags.filter(t => removeSet.has(t.value));
274
- if (removeTags.length > 0) {
275
- if (keepTag) {
276
- for (const rt of removeTags) {
277
- if (rt.confidence > keepTag.confidence)
278
- keepTag.confidence = rt.confidence;
279
- if (rt.lastSeen > keepTag.lastSeen)
280
- keepTag.lastSeen = rt.lastSeen;
281
- const lp = { identity: 3, pattern: 2, interest: 1 };
282
- if ((lp[rt.layer || "interest"] || 1) > (lp[keepTag.layer || "interest"] || 1)) {
283
- keepTag.layer = rt.layer;
284
- }
285
- }
286
- }
287
- profile.tags[dim] = tags.filter(t => !removeSet.has(t.value));
288
- deduped += removeTags.length;
443
+ // ---- 2. 事件类关键词降级:interest → event ----
444
+ for (const dim of Object.keys(profile.tags)) {
445
+ for (const tag of profile.tags[dim]) {
446
+ if ((tag.layer || "interest") === "interest" && isLikelyEvent(tag.value)) {
447
+ tag.layer = "event";
448
+ tag.confidence = Math.min(tag.confidence, LAYER_CONFIG.event.defaultConfidence);
449
+ demotedToEvent++;
289
450
  }
290
451
  }
291
452
  }
292
- // ---- 清理空维度 ----
453
+ // ---- 3. 清理非受控维度 + 空维度 ----
293
454
  for (const dim of Object.keys(profile.tags)) {
455
+ if (!CANONICAL_DIMENSIONS.includes(dim)) {
456
+ // 不在受控表也没别名映射 → 丢弃
457
+ logger.info(`[engram:profile] normalize: dropping uncontrolled dimension [${dim}] with ${profile.tags[dim].length} tags`);
458
+ deduped += profile.tags[dim].length;
459
+ delete profile.tags[dim];
460
+ continue;
461
+ }
294
462
  if (profile.tags[dim].length === 0) {
295
463
  delete profile.tags[dim];
296
464
  emptied++;
297
465
  }
298
466
  }
299
- // ---- 各维度内按 confidence 降序排列 ----
467
+ // ---- 4. 各维度内按 confidence 降序排列 ----
300
468
  for (const dim of Object.keys(profile.tags)) {
301
469
  profile.tags[dim].sort((a, b) => b.confidence - a.confidence);
302
470
  }
303
- if (merged > 0 || deduped > 0 || emptied > 0) {
304
- logger.info(`[engram:profile] normalizeDimensions: merged=${merged} deduped=${deduped} emptied=${emptied}`);
471
+ if (merged > 0 || deduped > 0 || emptied > 0 || demotedToEvent > 0) {
472
+ logger.info(`[engram:profile] normalizeDimensions: merged=${merged} deduped=${deduped} emptied=${emptied} demotedToEvent=${demotedToEvent}`);
305
473
  }
306
- return { merged, deduped, emptied };
474
+ return { merged, deduped, emptied, demotedToEvent };
475
+ }
476
+ /**
477
+ * 更新位置信息
478
+ * - primary 由长期记忆或 LLM 自审确定(本函数不主动设)
479
+ * - recent 只保留最近的 3 个,按 updatedAt 排序
480
+ */
481
+ updateRecentLocation(profile, place) {
482
+ if (!profile.locations)
483
+ profile.locations = { recent: [] };
484
+ const now = new Date().toISOString();
485
+ const existing = profile.locations.recent.find((r) => r.place === place);
486
+ if (existing) {
487
+ existing.updatedAt = now;
488
+ }
489
+ else {
490
+ profile.locations.recent.push({ place, updatedAt: now });
491
+ }
492
+ profile.locations.recent.sort((a, b) => (a.updatedAt < b.updatedAt ? 1 : -1));
493
+ profile.locations.recent = profile.locations.recent.slice(0, 3);
494
+ return profile;
495
+ }
496
+ /**
497
+ * LLM 自审:月度兜底清理,交由 LLM 发现 hard-coded 规则遗漏的问题
498
+ *
499
+ * @param profile 当前画像
500
+ * @param llmCall LLM 调用函数
501
+ * @returns 应用的变更统计
502
+ */
503
+ async llmSelfAudit(profile, llmCall) {
504
+ const compactView = Object.entries(profile.tags)
505
+ .map(([dim, tags]) => `${dim}: ${tags.map((t) => `${t.value}(${t.layer ?? "interest"})`).join(" | ")}`)
506
+ .join("\n");
507
+ if (!compactView) {
508
+ return { mergedGroups: 0, demoted: 0, coreTagsRewritten: false };
509
+ }
510
+ const systemPrompt = `你是一个用户画像审核专家。分析当前画像,找出以下问题:
511
+
512
+ 1. **语义重复的 tag**:同一意思的不同写法(如 "ACP subagent开发测试" 和 "ACP子代理spawn测试")。
513
+ 对每组重复,挑选最规范的一条作为 keep,其他作为 remove。
514
+
515
+ 2. **错误分层的 tag**:一次性事件(排查/bug/超时/配置修改)被错误地标为 interest 层,应该降级为 event。
516
+ 罗列这些 tag 的 value。
517
+
518
+ 3. **重写 coreTags**:从当前画像中挑选最代表用户身份和长期特征的 5 个 tag。
519
+ 跳过经纬度、纯地址字符串、一次性事件。
520
+
521
+ 只输出 JSON,不要任何解释:
522
+ {
523
+ "merges": [{"keep": "标准写法", "remove": ["变体1", "变体2"]}],
524
+ "demotions": ["事件类tag值", ...],
525
+ "newCoreTags": ["tag1", "tag2", ...]
526
+ }
527
+
528
+ 如果某项无需变动,对应数组留空即可。`;
529
+ logger.info(`[engram:profile] llmSelfAudit: calling LLM (${compactView.length} chars of profile view)`);
530
+ let raw;
531
+ try {
532
+ raw = await llmCall(compactView, systemPrompt);
533
+ }
534
+ catch (err) {
535
+ logger.error(`[engram:profile] llmSelfAudit: LLM call failed: ${err}`);
536
+ return { mergedGroups: 0, demoted: 0, coreTagsRewritten: false };
537
+ }
538
+ // 容错 JSON 解析(移除 markdown code fence)
539
+ const cleaned = raw.replace(/^```(?:json)?\s*/i, "").replace(/\s*```\s*$/i, "").trim();
540
+ let audit;
541
+ try {
542
+ audit = JSON.parse(cleaned);
543
+ }
544
+ catch (err) {
545
+ logger.error(`[engram:profile] llmSelfAudit: JSON parse failed: ${err}, raw="${raw.slice(0, 200)}"`);
546
+ return { mergedGroups: 0, demoted: 0, coreTagsRewritten: false };
547
+ }
548
+ let mergedGroups = 0;
549
+ let demoted = 0;
550
+ // 应用合并
551
+ for (const group of audit.merges ?? []) {
552
+ if (!group.keep || !Array.isArray(group.remove) || group.remove.length === 0)
553
+ continue;
554
+ const removeSet = new Set(group.remove);
555
+ let applied = false;
556
+ for (const dim of Object.keys(profile.tags)) {
557
+ const keepTag = profile.tags[dim].find((t) => t.value === group.keep);
558
+ const removeTags = profile.tags[dim].filter((t) => removeSet.has(t.value));
559
+ if (removeTags.length === 0)
560
+ continue;
561
+ if (keepTag) {
562
+ for (const rt of removeTags) {
563
+ if (rt.confidence > keepTag.confidence)
564
+ keepTag.confidence = rt.confidence;
565
+ if (rt.lastSeen > keepTag.lastSeen)
566
+ keepTag.lastSeen = rt.lastSeen;
567
+ }
568
+ }
569
+ profile.tags[dim] = profile.tags[dim].filter((t) => !removeSet.has(t.value));
570
+ applied = true;
571
+ }
572
+ if (applied)
573
+ mergedGroups++;
574
+ }
575
+ // 应用降级
576
+ const demoteSet = new Set(audit.demotions ?? []);
577
+ if (demoteSet.size > 0) {
578
+ for (const dim of Object.keys(profile.tags)) {
579
+ for (const tag of profile.tags[dim]) {
580
+ if (demoteSet.has(tag.value) && (tag.layer ?? "interest") !== "event") {
581
+ tag.layer = "event";
582
+ tag.confidence = Math.min(tag.confidence, LAYER_CONFIG.event.defaultConfidence);
583
+ demoted++;
584
+ }
585
+ }
586
+ }
587
+ }
588
+ // 应用 coreTags 重写
589
+ let coreTagsRewritten = false;
590
+ if (Array.isArray(audit.newCoreTags) && audit.newCoreTags.length > 0) {
591
+ profile.coreTags = audit.newCoreTags.slice(0, 8);
592
+ coreTagsRewritten = true;
593
+ }
594
+ logger.info(`[engram:profile] llmSelfAudit: mergedGroups=${mergedGroups} demoted=${demoted} coreTagsRewritten=${coreTagsRewritten}`);
595
+ return { mergedGroups, demoted, coreTagsRewritten };
596
+ }
597
+ /**
598
+ * 基于 embedding 的同维度内语义去重
599
+ *
600
+ * 对每个维度内的 tag 两两计算 cosine 相似度,>= threshold 的合并为一组,
601
+ * 组内保留 confidence 最高的作为 canonical,其他并入(取 max confidence + 最新 lastSeen)。
602
+ *
603
+ * @param profile 当前画像
604
+ * @param embedder 将文本 -> 向量的函数
605
+ * @param threshold cosine 相似度阈值(默认 0.88)
606
+ */
607
+ async dedupByEmbedding(profile, embedder, threshold = 0.88) {
608
+ let merged = 0;
609
+ for (const dim of Object.keys(profile.tags)) {
610
+ const tags = profile.tags[dim];
611
+ if (tags.length < 2)
612
+ continue;
613
+ // 批量 embed(串行,避免 API QPS 限制)
614
+ const vectors = [];
615
+ for (const t of tags) {
616
+ try {
617
+ vectors.push(await embedder(t.value));
618
+ }
619
+ catch (err) {
620
+ logger.error(`[engram:profile] dedupByEmbedding: embed failed for "${t.value}": ${err}`);
621
+ vectors.push([]); // 占位,跳过相似度计算
622
+ }
623
+ }
624
+ // 并查集式分组
625
+ const parent = tags.map((_, i) => i);
626
+ const find = (x) => (parent[x] === x ? x : (parent[x] = find(parent[x])));
627
+ const union = (a, b) => {
628
+ const ra = find(a);
629
+ const rb = find(b);
630
+ if (ra !== rb)
631
+ parent[ra] = rb;
632
+ };
633
+ for (let i = 0; i < tags.length; i++) {
634
+ if (vectors[i].length === 0)
635
+ continue;
636
+ for (let j = i + 1; j < tags.length; j++) {
637
+ if (vectors[j].length === 0)
638
+ continue;
639
+ const sim = cosineSimilarity(vectors[i], vectors[j]);
640
+ if (sim >= threshold)
641
+ union(i, j);
642
+ }
643
+ }
644
+ // 按组合并
645
+ const groups = new Map();
646
+ for (let i = 0; i < tags.length; i++) {
647
+ const root = find(i);
648
+ if (!groups.has(root))
649
+ groups.set(root, []);
650
+ groups.get(root).push(i);
651
+ }
652
+ const keptTags = [];
653
+ for (const idxs of groups.values()) {
654
+ if (idxs.length === 1) {
655
+ keptTags.push(tags[idxs[0]]);
656
+ continue;
657
+ }
658
+ // 组内合并:选 confidence 最高的为 canonical
659
+ idxs.sort((a, b) => tags[b].confidence - tags[a].confidence);
660
+ const canonical = { ...tags[idxs[0]] };
661
+ for (let k = 1; k < idxs.length; k++) {
662
+ const t = tags[idxs[k]];
663
+ if (t.confidence > canonical.confidence)
664
+ canonical.confidence = t.confidence;
665
+ if (t.lastSeen > canonical.lastSeen)
666
+ canonical.lastSeen = t.lastSeen;
667
+ const tLayer = t.layer || "interest";
668
+ const cLayer = canonical.layer || "interest";
669
+ if (LAYER_PRIORITY[tLayer] > LAYER_PRIORITY[cLayer])
670
+ canonical.layer = tLayer;
671
+ }
672
+ logger.info(`[engram:profile] dedupByEmbedding [${dim}] merge: keep="${canonical.value}", drop=[${idxs.slice(1).map((i) => `"${tags[i].value}"`).join(", ")}]`);
673
+ keptTags.push(canonical);
674
+ merged += idxs.length - 1;
675
+ }
676
+ profile.tags[dim] = keptTags;
677
+ }
678
+ logger.info(`[engram:profile] dedupByEmbedding: merged ${merged} tags`);
679
+ return { merged };
680
+ }
681
+ }
682
+ // ============================================================================
683
+ // 辅助函数
684
+ // ============================================================================
685
/**
 * Tell whether a string is nothing but a coordinate-like pair: two optionally
 * negative decimal numbers (each with a fractional part) separated by a comma.
 * Surrounding whitespace is ignored.
 *
 * @param {string} s
 * @returns {boolean}
 */
function isLikelyCoordinate(s) {
    const candidate = s.trim();
    const decimalPair = /^-?\d+\.\d+\s*,\s*-?\d+\.\d+$/;
    return decimalPair.test(candidate);
}
689
/**
 * Cosine similarity of two numeric vectors.
 *
 * @param a first vector
 * @param b second vector
 * @returns {number} dot(a, b) / (|a| * |b|); 0 when the vectors are empty,
 *   differ in length, or either has zero magnitude
 */
function cosineSimilarity(a, b) {
    const size = a.length;
    if (size === 0 || size !== b.length) {
        return 0;
    }
    let dot = 0;
    let sumSqA = 0;
    let sumSqB = 0;
    let idx = 0;
    while (idx < size) {
        const x = a[idx];
        const y = b[idx];
        dot += x * y;
        sumSqA += x * x;
        sumSqB += y * y;
        idx += 1;
    }
    if (sumSqA === 0 || sumSqB === 0) {
        return 0;
    }
    return dot / (Math.sqrt(sumSqA) * Math.sqrt(sumSqB));
}
309
705
  //# sourceMappingURL=profile.js.map