@chainlesschain/personal-data-hub 0.4.27 → 0.4.28

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,63 @@
1
+ "use strict";
2
+
3
+ import { describe, it, expect } from "vitest";
4
+
5
+ const { WeChatPcAdapter } = require("../../lib/adapters/wechat-pc");
6
+
7
// Test fixture: wraps the given message fields in the raw envelope that
// WeChatPcAdapter.normalize() is handed in these tests (adapter / kind /
// originalId / capturedAt / payload). The payload defaults its own `kind` to
// "message"; any `kind` present in the supplied fields takes precedence
// because the fields are spread after the default.
function groupMessageRaw(payload) {
  const messagePayload = { kind: "message", ...payload };
  const envelope = {
    adapter: "wechat-pc",
    kind: "message",
    originalId: "orig-1",
    capturedAt: 1780000000000,
    payload: messagePayload,
  };
  return envelope;
}
18
+
19
describe("wechat-pc — group topic naming", () => {
  it("uses the resolved group display name for the topic when groupName is present", () => {
    const wechat = new WeChatPcAdapter();
    const rawMessage = groupMessageRaw({
      talker: "45498354778@chatroom",
      isGroup: true,
      senderWxid: "wxid_friend",
      groupName: "家庭群",
      text: "晚饭吃什么",
      createdTimeMs: 1780000000000,
    });
    const normalized = wechat.normalize(rawMessage);

    expect(normalized.topics).toHaveLength(1);
    const groupTopic = normalized.topics[0];
    // The topic id is still built from the chatroom wxid, so identity is stable.
    expect(groupTopic.id).toBe("topic-wechat-group-45498354778@chatroom");
    // The name must be the supplied display name, not the numeric chatroom id.
    expect(groupTopic.name).toBe("家庭群");
  });

  it("falls back to the raw numeric id when no group name was resolved", () => {
    const wechat = new WeChatPcAdapter();
    const rawMessage = groupMessageRaw({
      talker: "45498354778@chatroom",
      isGroup: true,
      senderWxid: "wxid_friend",
      groupName: null,
      text: "hi",
      createdTimeMs: 1780000000000,
    });
    const normalized = wechat.normalize(rawMessage);

    expect(normalized.topics).toHaveLength(1);
    expect(normalized.topics[0].name).toBe("45498354778");
  });

  it("blank/whitespace group name falls back to the raw id (no empty topic name)", () => {
    const wechat = new WeChatPcAdapter();
    const rawMessage = groupMessageRaw({
      talker: "12345@chatroom",
      isGroup: true,
      senderWxid: "wxid_x",
      groupName: " ",
      text: "hi",
      createdTimeMs: 1780000000000,
    });
    const normalized = wechat.normalize(rawMessage);

    expect(normalized.topics[0].name).toBe("12345");
  });
});
@@ -498,6 +498,32 @@ describe("InterestsSkill", () => {
498
498
  expect(r.topTopics[0].name).toBe("Travel");
499
499
  expect(r.llmInterests).toBeNull();
500
500
  });
501
+
502
+ it("drops unresolved numeric group-id topics (e.g. WeChat chatroom ids) from the profile", async () => {
503
+ // Real interest topic
504
+ rig.vault.putTopic({
505
+ id: "topic-doubao", type: "topic", name: "豆包",
506
+ derivedFromEvents: ["e1"],
507
+ ingestedAt: Date.now(), source: defaultSource("test"),
508
+ });
509
+ // Unresolved group-chat topics named by raw numeric chatroom id — noise.
510
+ rig.vault.putTopic({
511
+ id: "topic-g1", type: "topic", name: "45498354778",
512
+ derivedFromEvents: [],
513
+ ingestedAt: Date.now() + 1, source: defaultSource("test"),
514
+ });
515
+ rig.vault.putTopic({
516
+ id: "topic-g2", type: "topic", name: "54346634535",
517
+ derivedFromEvents: [],
518
+ ingestedAt: Date.now() + 2, source: defaultSource("test"),
519
+ });
520
+ const skill = new InterestsSkill({ vault: rig.vault });
521
+ const r = await skill.run({});
522
+ const names = r.topTopics.map((t) => t.name);
523
+ expect(names).toContain("豆包");
524
+ expect(names).not.toContain("45498354778");
525
+ expect(names).not.toContain("54346634535");
526
+ });
501
527
  });
502
528
 
503
529
  // ─── TimelineSkill ──────────────────────────────────────────────────────
@@ -534,6 +560,34 @@ describe("TimelineSkill", () => {
534
560
  const r = await skill.run({ since: ts(2026, 4, 1) });
535
561
  expect(r.llm_narrative).toBe("你这周点了一次外卖。");
536
562
  });
563
+
564
+ it("excludes inventory-snapshot events (installed-app / contact roster) from the narrative", async () => {
565
+ // Real activity event (extra has no `kind` → must be kept)
566
+ makePayment(rig.vault, { id: "act-1", occurredAt: ts(2026, 5, 1), counterpartyName: "美团", amount: 10, adapter: "alipay-bill", title: "外卖" });
567
+ // Inventory-snapshot events stamped at a LATER (collection) time — these
568
+ // would dominate a DESC time query but must be filtered out.
569
+ rig.vault.putEvent({
570
+ id: "event-android-app-com.x", type: "event", subtype: "other",
571
+ occurredAt: ts(2026, 6, 1), actor: "person-self",
572
+ content: { title: "应用:X" },
573
+ ingestedAt: Date.now(), source: defaultSource("system-data-android"),
574
+ extra: { kind: "app-snapshot", packageName: "com.x" },
575
+ });
576
+ rig.vault.putEvent({
577
+ id: "event-android-contact-y", type: "event", subtype: "other",
578
+ occurredAt: ts(2026, 6, 1), actor: "person-self",
579
+ content: { title: "联系人:Y" },
580
+ ingestedAt: Date.now(), source: defaultSource("system-data-android"),
581
+ extra: { kind: "contact-snapshot" },
582
+ });
583
+ const skill = new TimelineSkill({ vault: rig.vault });
584
+ const r = await skill.run({ since: ts(2026, 4, 1) });
585
+ const ids = r.entries.map((e) => e.id);
586
+ expect(ids).toContain("act-1");
587
+ expect(ids).not.toContain("event-android-app-com.x");
588
+ expect(ids).not.toContain("event-android-contact-y");
589
+ expect(r.summary.totalEvents).toBe(1);
590
+ });
537
591
  });
538
592
 
539
593
  // ─── runAnalysisSkill dispatcher ─────────────────────────────────────────
@@ -290,6 +290,22 @@ class WeChatPcAdapter {
290
290
  null;
291
291
  const fallbackCapturedAt = Date.now();
292
292
  const messages = (result && Array.isArray(result.messages)) ? result.messages : [];
293
+ // Harvest group display names from the contact roster: WeChat stores group
294
+ // chatrooms (wxid ending @chatroom) in contact.db with a nickname/remark.
295
+ // They are skipped as Person entities below, but their names let us label
296
+ // group Topics with a human-readable name instead of the raw numeric id.
297
+ const groupNames = new Map();
298
+ {
299
+ const contactsForNames = (result && Array.isArray(result.contacts)) ? result.contacts : [];
300
+ for (const c of contactsForNames) {
301
+ if (!c || typeof c.wxid !== "string" || !c.wxid.endsWith("@chatroom")) continue;
302
+ const nm =
303
+ (typeof c.remark === "string" && c.remark.trim()) ||
304
+ (typeof c.nickname === "string" && c.nickname.trim()) ||
305
+ "";
306
+ if (nm) groupNames.set(c.wxid, nm);
307
+ }
308
+ }
293
309
  let emitted = 0;
294
310
  // The sidecar already applied `limit` across all sources (chat/biz/sns/
295
311
  // favorite). Yield everything it returned — do NOT re-cap here, or the
@@ -311,6 +327,7 @@ class WeChatPcAdapter {
311
327
  text: typeof m.text === "string" ? m.text : "",
312
328
  senderWxid: isGroup ? (m.sender || null) : null,
313
329
  isGroup,
330
+ groupName: isGroup && conv ? (groupNames.get(conv) || null) : null,
314
331
  contentBlob: typeof m.text === "string" ? m.text : null,
315
332
  // provenance: chat | biz(公众号) | sns(朋友圈) | favorite(收藏)
316
333
  wechatSource: typeof m.source === "string" ? m.source : "chat",
@@ -423,10 +440,16 @@ function normalizeMessage(p, raw, ingestedAt) {
423
440
 
424
441
  const topics = [];
425
442
  if (isGroup && p.talker) {
443
+ // Prefer the resolved group display name (harvested from contact.db in
444
+ // sync()); fall back to the raw numeric chatroom id only when unknown.
445
+ const groupName =
446
+ (typeof p.groupName === "string" && p.groupName.trim())
447
+ ? p.groupName.trim()
448
+ : p.talker.replace("@chatroom", "");
426
449
  topics.push({
427
450
  id: `topic-wechat-group-${p.talker}`,
428
451
  type: ENTITY_TYPES.TOPIC,
429
- name: p.talker.replace("@chatroom", ""),
452
+ name: groupName,
430
453
  ingestedAt,
431
454
  source,
432
455
  extra: { platform: "wechat", source: "pc", wxid: p.talker },
@@ -28,6 +28,21 @@
28
28
 
29
29
  const { AnalysisSkill } = require("./base");
30
30
 
31
/**
 * Decide whether a topic name is a human-readable label that carries real
 * interest signal.
 *
 * Rejected: non-string values, names that are empty once trimmed, and names
 * consisting solely of digits (for example "45498354778" — a group-chat topic
 * still labelled with its raw numeric chatroom id rather than a display
 * name). Such entries are not interests and would crowd genuine topics out of
 * the profile.
 *
 * @param {*} name - Candidate topic name.
 * @returns {boolean} true when the name should be kept, false when it should
 *   be dropped.
 */
function isMeaningfulTopicName(name) {
  if (typeof name !== "string") {
    return false;
  }
  const label = name.trim();
  const isBlank = label.length === 0;
  // A digits-only label is treated as an unresolved group id.
  const isDigitsOnly = /^\d+$/.test(label);
  return !(isBlank || isDigitsOnly);
}
45
+
31
46
  class InterestsSkill extends AnalysisSkill {
32
47
  constructor(opts) {
33
48
  super({ ...opts, name: "analysis.interests" });
@@ -61,25 +76,30 @@ class InterestsSkill extends AnalysisSkill {
61
76
  let topics = [];
62
77
  try {
63
78
  const db = this.vault._requireOpen();
79
+ // Over-fetch (×20, capped) before filtering: vaults can hold thousands
80
+ // of unresolved numeric group-chat topics that would otherwise starve
81
+ // the few human-readable interest topics out of the top-N budget.
64
82
  topics = db.prepare(
65
83
  "SELECT id, name, derived_from_events, ingested_at FROM topics ORDER BY ingested_at DESC LIMIT ?"
66
- ).all(topN * 3);
84
+ ).all(Math.min(topN * 20, 2000));
67
85
  } catch (_e) {
68
86
  // Older vaults may not have topics; non-fatal.
69
87
  }
70
- const mapped = topics.map((t) => {
71
- let eventCount = 0;
72
- try {
73
- const arr = t.derived_from_events ? JSON.parse(t.derived_from_events) : [];
74
- if (Array.isArray(arr)) eventCount = arr.length;
75
- } catch (_e) {}
76
- return {
77
- id: t.id,
78
- name: t.name,
79
- eventCount,
80
- lastSeen: t.ingested_at || null,
81
- };
82
- });
88
+ const mapped = topics
89
+ .filter((t) => isMeaningfulTopicName(t.name))
90
+ .map((t) => {
91
+ let eventCount = 0;
92
+ try {
93
+ const arr = t.derived_from_events ? JSON.parse(t.derived_from_events) : [];
94
+ if (Array.isArray(arr)) eventCount = arr.length;
95
+ } catch (_e) {}
96
+ return {
97
+ id: t.id,
98
+ name: t.name,
99
+ eventCount,
100
+ lastSeen: t.ingested_at || null,
101
+ };
102
+ });
83
103
  return mapped
84
104
  .sort((a, b) => (b.eventCount - a.eventCount) || ((b.lastSeen || 0) - (a.lastSeen || 0)))
85
105
  .slice(0, topN);
@@ -63,7 +63,12 @@ class TimelineSkill extends AnalysisSkill {
63
63
  }
64
64
 
65
65
  _fetchEvents({ since, until }, limit) {
66
- const q = { limit };
66
+ // Exclude inventory-snapshot events (installed-app roster + contact
67
+ // roster from system-data-android). They carry a synthetic
68
+ // collection-time occurredAt — tens of thousands of them cluster at one
69
+ // recent timestamp and would otherwise crowd out real activity from this
70
+ // chronological narrative. They remain in the vault for facet counts.
71
+ const q = { limit, excludeExtraKinds: ["app-snapshot", "contact-snapshot"] };
67
72
  if (since != null) q.since = since;
68
73
  if (until != null) q.until = until;
69
74
  const events = this.vault.queryEvents(q) || [];
package/lib/vault.js CHANGED
@@ -811,6 +811,18 @@ class LocalVault {
811
811
  where.push("source_adapter = @adapter");
812
812
  params.adapter = q.adapter;
813
813
  }
814
+ if (Array.isArray(q.excludeExtraKinds) && q.excludeExtraKinds.length > 0) {
815
+ // Exclude inventory-snapshot events (e.g. installed-app / contact-roster
816
+ // facet events) whose extra.kind is in the given list. Those carry a
817
+ // synthetic collection-time occurredAt and would otherwise dominate any
818
+ // time-ordered (occurred_at DESC) query. Rows with no extra.kind are kept.
819
+ const placeholders = q.excludeExtraKinds.map((_v, i) => `@xk${i}`);
820
+ q.excludeExtraKinds.forEach((v, i) => { params[`xk${i}`] = v; });
821
+ where.push(
822
+ "(json_extract(extra, '$.kind') IS NULL OR json_extract(extra, '$.kind') NOT IN (" +
823
+ placeholders.join(", ") + "))",
824
+ );
825
+ }
814
826
 
815
827
  const limit = Number.isInteger(q.limit) && q.limit > 0 ? Math.min(q.limit, 10000) : 100;
816
828
  const offset = Number.isInteger(q.offset) && q.offset >= 0 ? q.offset : 0;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@chainlesschain/personal-data-hub",
3
- "version": "0.4.27",
3
+ "version": "0.4.28",
4
4
  "description": "Personal Data Hub — UnifiedSchema + validators + KG ingest helpers for the data-back-to-the-individual middleware",
5
5
  "type": "commonjs",
6
6
  "main": "lib/index.js",