@chainlesschain/personal-data-hub 0.4.26 → 0.4.28

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,63 @@
1
+ "use strict";
2
+
3
+ import { describe, it, expect } from "vitest";
4
+
5
+ const { WeChatPcAdapter } = require("../../lib/adapters/wechat-pc");
6
+
7
// Test fixture: wraps `payload` in the raw envelope handed to
// WeChatPcAdapter.normalize() for a group message. The envelope mirrors the
// { kind: "message", payload, ... } items described for the sync() generator.
// Fields in `payload` are spread after the default `kind`, so a caller-supplied
// `kind` wins.
function groupMessageRaw(payload) {
  const messagePayload = { kind: "message", ...payload };
  const envelope = {
    adapter: "wechat-pc",
    kind: "message",
    originalId: "orig-1",
    capturedAt: 1780000000000,
    payload: messagePayload,
  };
  return envelope;
}
18
+
19
+ describe("wechat-pc — group topic naming", () => {
20
+ it("uses the resolved group display name for the topic when groupName is present", () => {
21
+ const adapter = new WeChatPcAdapter();
22
+ const out = adapter.normalize(groupMessageRaw({
23
+ talker: "45498354778@chatroom",
24
+ isGroup: true,
25
+ senderWxid: "wxid_friend",
26
+ groupName: "家庭群",
27
+ text: "晚饭吃什么",
28
+ createdTimeMs: 1780000000000,
29
+ }));
30
+ expect(out.topics).toHaveLength(1);
31
+ // Stable id keyed on the chatroom wxid (identity unchanged)...
32
+ expect(out.topics[0].id).toBe("topic-wechat-group-45498354778@chatroom");
33
+ // ...but the human-readable display name is used, NOT the numeric id.
34
+ expect(out.topics[0].name).toBe("家庭群");
35
+ });
36
+
37
+ it("falls back to the raw numeric id when no group name was resolved", () => {
38
+ const adapter = new WeChatPcAdapter();
39
+ const out = adapter.normalize(groupMessageRaw({
40
+ talker: "45498354778@chatroom",
41
+ isGroup: true,
42
+ senderWxid: "wxid_friend",
43
+ groupName: null,
44
+ text: "hi",
45
+ createdTimeMs: 1780000000000,
46
+ }));
47
+ expect(out.topics).toHaveLength(1);
48
+ expect(out.topics[0].name).toBe("45498354778");
49
+ });
50
+
51
+ it("blank/whitespace group name falls back to the raw id (no empty topic name)", () => {
52
+ const adapter = new WeChatPcAdapter();
53
+ const out = adapter.normalize(groupMessageRaw({
54
+ talker: "12345@chatroom",
55
+ isGroup: true,
56
+ senderWxid: "wxid_x",
57
+ groupName: " ",
58
+ text: "hi",
59
+ createdTimeMs: 1780000000000,
60
+ }));
61
+ expect(out.topics[0].name).toBe("12345");
62
+ });
63
+ });
@@ -498,6 +498,32 @@ describe("InterestsSkill", () => {
498
498
  expect(r.topTopics[0].name).toBe("Travel");
499
499
  expect(r.llmInterests).toBeNull();
500
500
  });
501
+
502
+ it("drops unresolved numeric group-id topics (e.g. WeChat chatroom ids) from the profile", async () => {
503
+ // Real interest topic
504
+ rig.vault.putTopic({
505
+ id: "topic-doubao", type: "topic", name: "豆包",
506
+ derivedFromEvents: ["e1"],
507
+ ingestedAt: Date.now(), source: defaultSource("test"),
508
+ });
509
+ // Unresolved group-chat topics named by raw numeric chatroom id — noise.
510
+ rig.vault.putTopic({
511
+ id: "topic-g1", type: "topic", name: "45498354778",
512
+ derivedFromEvents: [],
513
+ ingestedAt: Date.now() + 1, source: defaultSource("test"),
514
+ });
515
+ rig.vault.putTopic({
516
+ id: "topic-g2", type: "topic", name: "54346634535",
517
+ derivedFromEvents: [],
518
+ ingestedAt: Date.now() + 2, source: defaultSource("test"),
519
+ });
520
+ const skill = new InterestsSkill({ vault: rig.vault });
521
+ const r = await skill.run({});
522
+ const names = r.topTopics.map((t) => t.name);
523
+ expect(names).toContain("豆包");
524
+ expect(names).not.toContain("45498354778");
525
+ expect(names).not.toContain("54346634535");
526
+ });
501
527
  });
502
528
 
503
529
  // ─── TimelineSkill ──────────────────────────────────────────────────────
@@ -534,6 +560,34 @@ describe("TimelineSkill", () => {
534
560
  const r = await skill.run({ since: ts(2026, 4, 1) });
535
561
  expect(r.llm_narrative).toBe("你这周点了一次外卖。");
536
562
  });
563
+
564
+ it("excludes inventory-snapshot events (installed-app / contact roster) from the narrative", async () => {
565
+ // Real activity event (extra has no `kind` → must be kept)
566
+ makePayment(rig.vault, { id: "act-1", occurredAt: ts(2026, 5, 1), counterpartyName: "美团", amount: 10, adapter: "alipay-bill", title: "外卖" });
567
+ // Inventory-snapshot events stamped at a LATER (collection) time — these
568
+ // would dominate a DESC time query but must be filtered out.
569
+ rig.vault.putEvent({
570
+ id: "event-android-app-com.x", type: "event", subtype: "other",
571
+ occurredAt: ts(2026, 6, 1), actor: "person-self",
572
+ content: { title: "应用:X" },
573
+ ingestedAt: Date.now(), source: defaultSource("system-data-android"),
574
+ extra: { kind: "app-snapshot", packageName: "com.x" },
575
+ });
576
+ rig.vault.putEvent({
577
+ id: "event-android-contact-y", type: "event", subtype: "other",
578
+ occurredAt: ts(2026, 6, 1), actor: "person-self",
579
+ content: { title: "联系人:Y" },
580
+ ingestedAt: Date.now(), source: defaultSource("system-data-android"),
581
+ extra: { kind: "contact-snapshot" },
582
+ });
583
+ const skill = new TimelineSkill({ vault: rig.vault });
584
+ const r = await skill.run({ since: ts(2026, 4, 1) });
585
+ const ids = r.entries.map((e) => e.id);
586
+ expect(ids).toContain("act-1");
587
+ expect(ids).not.toContain("event-android-app-com.x");
588
+ expect(ids).not.toContain("event-android-contact-y");
589
+ expect(r.summary.totalEvents).toBe(1);
590
+ });
537
591
  });
538
592
 
539
593
  // ─── runAnalysisSkill dispatcher ─────────────────────────────────────────
@@ -0,0 +1,97 @@
1
+ "use strict";
2
+
3
+ import { describe, it, expect, beforeAll, afterAll } from "vitest";
4
+
5
+ const fs = require("node:fs");
6
+ const path = require("node:path");
7
+ const os = require("node:os");
8
+
9
+ const { LocalVault } = require("../lib/vault");
10
+ const { generateKeyHex } = require("../lib/key-providers");
11
+ const {
12
+ buildSalvageEvents,
13
+ salvageDumpToVault,
14
+ resolveApp,
15
+ } = require("../lib/forensics/salvage-ingest");
16
+
17
+ // Build a real SQLite DB and treat its bytes as a memory dump; verify the
18
+ // generic salvage→vault path recovers messages AND tags them with the correct
19
+ // per-app source.adapter (multi-app de-silo). Real LocalVault → proves the
20
+ // hand-built events pass schema validation + are searchable.
21
+ describe("salvage-ingest — generic multi-app salvage → vault", () => {
22
+ let dir, dumpPath, vault, vdir;
23
+ const COLUMNS = ["msg_uuid", "conversation_id", "sender", "content", "created_time"];
24
+
25
+ beforeAll(() => {
26
+ const Database = require("better-sqlite3-multiple-ciphers");
27
+ dir = fs.mkdtempSync(path.join(os.tmpdir(), "salvage-ing-"));
28
+ dumpPath = path.join(dir, "u.db");
29
+ const db = new Database(dumpPath);
30
+ db.exec("CREATE TABLE msg(msg_uuid TEXT, conversation_id TEXT, sender INTEGER, content TEXT, created_time INTEGER)");
31
+ const ins = db.prepare("INSERT INTO msg VALUES(?,?,?,?,?)");
32
+ ins.run("u1", "conv-1", 111, "今天的会议改到下午三点 hi", 1700000000000);
33
+ ins.run("u2", "conv-1", 222, "收到 👌", 1700000001000);
34
+ db.close();
35
+
36
+ vdir = fs.mkdtempSync(path.join(os.tmpdir(), "salvage-vault-"));
37
+ vault = new LocalVault({ path: path.join(vdir, "v.db"), key: generateKeyHex() });
38
+ vault.open();
39
+ });
40
+
41
+ afterAll(() => {
42
+ try { vault.close(); } catch (_e) {}
43
+ try { fs.rmSync(dir, { recursive: true, force: true }); } catch (_e) {}
44
+ try { fs.rmSync(vdir, { recursive: true, force: true }); } catch (_e) {}
45
+ });
46
+
47
+ it("resolveApp maps known apps to canonical source adapters; unknown → salvage:<app>", () => {
48
+ expect(resolveApp("douyin").sourceAdapter).toBe("social-douyin");
49
+ expect(resolveApp("toutiao").sourceAdapter).toBe("social-toutiao");
50
+ expect(resolveApp("wechat").sourceAdapter).toBe("wechat");
51
+ expect(resolveApp("kuaishou").sourceAdapter).toBe("social-kuaishou");
52
+ expect(resolveApp("bogusapp").sourceAdapter).toBe("salvage:bogusapp");
53
+ });
54
+
55
+ it("buildSalvageEvents tags per-app source + builds valid message events", () => {
56
+ const recs = [{ rowid: "1", cols: ["u1", "conv-1", 111, "hello 世界", 1700000000000] }];
57
+ const built = buildSalvageEvents(recs, { app: "toutiao", columns: COLUMNS, now: 1700000099000 });
58
+ expect(built.events.length).toBe(1);
59
+ const e = built.events[0];
60
+ expect(e.source.adapter).toBe("social-toutiao");
61
+ expect(e.source.capturedBy).toBe("sqlite"); // schema enum; provenance in extra.salvaged
62
+ expect(e.subtype).toBe("message");
63
+ expect(e.content.text).toBe("hello 世界");
64
+ expect(e.extra.platform).toBe("toutiao");
65
+ expect(e.extra.salvaged).toBe(true);
66
+ });
67
+
68
+ it("salvageDumpToVault ingests with douyin source + events are searchable", () => {
69
+ const r = salvageDumpToVault(vault, dumpPath, { app: "douyin", columns: COLUMNS, now: 1700000099000 });
70
+ expect(r.app).toBe("douyin");
71
+ expect(r.sourceAdapter).toBe("social-douyin");
72
+ expect(r.ingested).toBe(2);
73
+ // events landed under the correct source + are searchable
74
+ const events = vault.queryEvents({ limit: 100 }) || [];
75
+ const douyin = events.filter((e) => e.source && e.source.adapter === "social-douyin");
76
+ expect(douyin.length).toBe(2);
77
+ const texts = douyin.map((e) => e.content && e.content.text).sort();
78
+ expect(texts).toContain("收到 👌"); // UTF-8 emoji survives
79
+ });
80
+
81
+ it("same dump under a different app tags a different source (no cross-attribution)", () => {
82
+ const r = salvageDumpToVault(vault, dumpPath, { app: "toutiao", columns: COLUMNS, now: 1700000099000 });
83
+ expect(r.sourceAdapter).toBe("social-toutiao");
84
+ expect(r.ingested).toBe(2);
85
+ const events = vault.queryEvents({ limit: 100 }) || [];
86
+ expect(events.filter((e) => e.source && e.source.adapter === "social-toutiao").length).toBe(2);
87
+ // douyin events from prior test remain distinct
88
+ expect(events.filter((e) => e.source && e.source.adapter === "social-douyin").length).toBe(2);
89
+ });
90
+
91
+ it("re-ingesting the same dump dedups (stable originalId)", () => {
92
+ const before = (vault.queryEvents({ limit: 200 }) || []).length;
93
+ salvageDumpToVault(vault, dumpPath, { app: "douyin", columns: COLUMNS, now: 1700000099000 });
94
+ const after = (vault.queryEvents({ limit: 200 }) || []).length;
95
+ expect(after).toBe(before); // ON CONFLICT(source_adapter, source_original_id) updates, no dupes
96
+ });
97
+ });
@@ -290,6 +290,22 @@ class WeChatPcAdapter {
290
290
  null;
291
291
  const fallbackCapturedAt = Date.now();
292
292
  const messages = (result && Array.isArray(result.messages)) ? result.messages : [];
293
+ // Harvest group display names from the contact roster: WeChat stores group
294
+ // chatrooms (wxid ending @chatroom) in contact.db with a nickname/remark.
295
+ // They are skipped as Person entities below, but their names let us label
296
+ // group Topics with a human-readable name instead of the raw numeric id.
297
+ const groupNames = new Map();
298
+ {
299
+ const contactsForNames = (result && Array.isArray(result.contacts)) ? result.contacts : [];
300
+ for (const c of contactsForNames) {
301
+ if (!c || typeof c.wxid !== "string" || !c.wxid.endsWith("@chatroom")) continue;
302
+ const nm =
303
+ (typeof c.remark === "string" && c.remark.trim()) ||
304
+ (typeof c.nickname === "string" && c.nickname.trim()) ||
305
+ "";
306
+ if (nm) groupNames.set(c.wxid, nm);
307
+ }
308
+ }
293
309
  let emitted = 0;
294
310
  // The sidecar already applied `limit` across all sources (chat/biz/sns/
295
311
  // favorite). Yield everything it returned — do NOT re-cap here, or the
@@ -311,6 +327,7 @@ class WeChatPcAdapter {
311
327
  text: typeof m.text === "string" ? m.text : "",
312
328
  senderWxid: isGroup ? (m.sender || null) : null,
313
329
  isGroup,
330
+ groupName: isGroup && conv ? (groupNames.get(conv) || null) : null,
314
331
  contentBlob: typeof m.text === "string" ? m.text : null,
315
332
  // provenance: chat | biz(公众号) | sns(朋友圈) | favorite(收藏)
316
333
  wechatSource: typeof m.source === "string" ? m.source : "chat",
@@ -423,10 +440,16 @@ function normalizeMessage(p, raw, ingestedAt) {
423
440
 
424
441
  const topics = [];
425
442
  if (isGroup && p.talker) {
443
+ // Prefer the resolved group display name (harvested from contact.db in
444
+ // sync()); fall back to the raw numeric chatroom id only when unknown.
445
+ const groupName =
446
+ (typeof p.groupName === "string" && p.groupName.trim())
447
+ ? p.groupName.trim()
448
+ : p.talker.replace("@chatroom", "");
426
449
  topics.push({
427
450
  id: `topic-wechat-group-${p.talker}`,
428
451
  type: ENTITY_TYPES.TOPIC,
429
- name: p.talker.replace("@chatroom", ""),
452
+ name: groupName,
430
453
  ingestedAt,
431
454
  source,
432
455
  extra: { platform: "wechat", source: "pc", wxid: p.talker },
@@ -28,6 +28,21 @@
28
28
 
29
29
  const { AnalysisSkill } = require("./base");
30
30
 
31
/**
 * Decide whether a topic name is a human-readable label worth surfacing as an
 * interest.
 *
 * Rejected: non-string values, names that are empty after trimming, and names
 * made only of digits (e.g. a WeChat group topic still named by its raw
 * chatroom id such as "45498354778"). Such entries carry no interest signal
 * and would crowd genuine topics out of the profile.
 *
 * @param {*} name candidate topic name
 * @returns {boolean} true when the name should be kept
 */
function isMeaningfulTopicName(name) {
  if (typeof name !== "string") return false;
  const label = name.trim();
  // Digits-only after trimming means an unresolved group id, not a topic.
  const isUnresolvedId = /^\d+$/.test(label);
  return label.length !== 0 && !isUnresolvedId;
}
45
+
31
46
  class InterestsSkill extends AnalysisSkill {
32
47
  constructor(opts) {
33
48
  super({ ...opts, name: "analysis.interests" });
@@ -61,25 +76,30 @@ class InterestsSkill extends AnalysisSkill {
61
76
  let topics = [];
62
77
  try {
63
78
  const db = this.vault._requireOpen();
79
+ // Over-fetch (×20, capped) before filtering: vaults can hold thousands
80
+ // of unresolved numeric group-chat topics that would otherwise starve
81
+ // the few human-readable interest topics out of the top-N budget.
64
82
  topics = db.prepare(
65
83
  "SELECT id, name, derived_from_events, ingested_at FROM topics ORDER BY ingested_at DESC LIMIT ?"
66
- ).all(topN * 3);
84
+ ).all(Math.min(topN * 20, 2000));
67
85
  } catch (_e) {
68
86
  // Older vaults may not have topics; non-fatal.
69
87
  }
70
- const mapped = topics.map((t) => {
71
- let eventCount = 0;
72
- try {
73
- const arr = t.derived_from_events ? JSON.parse(t.derived_from_events) : [];
74
- if (Array.isArray(arr)) eventCount = arr.length;
75
- } catch (_e) {}
76
- return {
77
- id: t.id,
78
- name: t.name,
79
- eventCount,
80
- lastSeen: t.ingested_at || null,
81
- };
82
- });
88
+ const mapped = topics
89
+ .filter((t) => isMeaningfulTopicName(t.name))
90
+ .map((t) => {
91
+ let eventCount = 0;
92
+ try {
93
+ const arr = t.derived_from_events ? JSON.parse(t.derived_from_events) : [];
94
+ if (Array.isArray(arr)) eventCount = arr.length;
95
+ } catch (_e) {}
96
+ return {
97
+ id: t.id,
98
+ name: t.name,
99
+ eventCount,
100
+ lastSeen: t.ingested_at || null,
101
+ };
102
+ });
83
103
  return mapped
84
104
  .sort((a, b) => (b.eventCount - a.eventCount) || ((b.lastSeen || 0) - (a.lastSeen || 0)))
85
105
  .slice(0, topN);
@@ -63,7 +63,12 @@ class TimelineSkill extends AnalysisSkill {
63
63
  }
64
64
 
65
65
  _fetchEvents({ since, until }, limit) {
66
- const q = { limit };
66
+ // Exclude inventory-snapshot events (installed-app roster + contact
67
+ // roster from system-data-android). They carry a synthetic
68
+ // collection-time occurredAt — tens of thousands of them cluster at one
69
+ // recent timestamp and would otherwise crowd out real activity from this
70
+ // chronological narrative. They remain in the vault for facet counts.
71
+ const q = { limit, excludeExtraKinds: ["app-snapshot", "contact-snapshot"] };
67
72
  if (since != null) q.since = since;
68
73
  if (until != null) q.until = until;
69
74
  const events = this.vault.queryEvents(q) || [];
@@ -0,0 +1,160 @@
1
+ "use strict";
2
+ /*
3
+ * Generic salvage → vault ingest (multi-app, correct source attribution).
4
+ *
5
+ * The leaf-page salvager recovers raw {rowid, cols} tuples from a /proc/mem
6
+ * dump of ANY app. This maps them into MESSAGE events and writes them straight
7
+ * to the vault with the CORRECT per-app `source.adapter` — so 抖音 data shows as
8
+ * social-douyin, 头条 as social-toutiao, 微信 as wechat, etc., instead of all
9
+ * being mis-attributed to one adapter.
10
+ *
11
+ * Why direct vault.putBatch (not registry.syncAdapter): the registry path runs
12
+ * an adapter's normalize(), which hard-stamps that adapter's name as the source.
13
+ * For salvaged cross-app data we need to stamp the source per the app the dump
14
+ * came from. Building events here + putBatch keeps attribution correct and is
15
+ * engine-agnostic. Provenance is recorded as `extra.salvaged = true`;
16
+ * `source.capturedBy` stays "sqlite" (the schema enum has no mem-salvage value).
17
+ *
18
+ * Authorization: only on data you are entitled to (your own device/account).
19
+ * Docs: docs/internal/pdh-db-decryption-runbook.md (Method B).
20
+ */
21
+ const { newId } = require("../ids");
22
+ const { salvageFile } = require("./leaf-salvage");
23
+ const {
24
+ mapMsgRecords,
25
+ inferMsgColumns,
26
+ } = require("../adapters/social-douyin-adb/salvage-mapper");
27
+
28
+ const SALVAGE_VERSION = "salvage-0.1";
29
+
30
// appKey → { sourceAdapter, platform }. `sourceAdapter` is the value written to
// the vault's source.adapter (the app's canonical adapter name, so byApp
// aggregation attributes correctly). Unknown apps fall back to a
// "salvage:<app>" source so they are still distinct + traceable.
const APP_SALVAGE = Object.freeze({
  douyin: { sourceAdapter: "social-douyin", platform: "douyin" },
  toutiao: { sourceAdapter: "social-toutiao", platform: "toutiao" },
  kuaishou: { sourceAdapter: "social-kuaishou", platform: "kuaishou" },
  xiaohongshu: { sourceAdapter: "social-xiaohongshu", platform: "xiaohongshu" },
  weibo: { sourceAdapter: "social-weibo", platform: "weibo" },
  wechat: { sourceAdapter: "wechat", platform: "wechat" },
  qq: { sourceAdapter: "qq", platform: "qq" },
});

/**
 * Resolve an app key to its salvage source configuration.
 *
 * The key is stringified and lower-cased; a falsy `app` defaults to "douyin".
 *
 * @param {string} [app] app key, case-insensitive
 * @returns {{ sourceAdapter: string, platform: string }} the known app's
 *   config, or `{ sourceAdapter: "salvage:<key>", platform: "<key>" }` for an
 *   app that is not in APP_SALVAGE
 */
function resolveApp(app) {
  const key = String(app || "douyin").toLowerCase();
  // Own-property check: a bare APP_SALVAGE[key] lookup also resolves inherited
  // Object.prototype members (e.g. "constructor", "__proto__") and would hand
  // back a non-config value whose sourceAdapter/platform are undefined.
  if (Object.prototype.hasOwnProperty.call(APP_SALVAGE, key)) {
    return APP_SALVAGE[key];
  }
  return { sourceAdapter: `salvage:${key}`, platform: key };
}
47
+
48
// Content-derived dedup key: 32-bit FNV-1a over the "|"-joined salient fields,
// rendered as 8 lowercase hex digits. Salvaging the same message again
// (re-scan) yields the same key, so the row collides on
// ON CONFLICT(source_adapter, source_original_id) and is updated rather than
// duplicated.
function _stableKey(parts) {
  const FNV_OFFSET_BASIS = 0x811c9dc5;
  const FNV_PRIME = 0x01000193;
  const joined = parts.join("|");
  let hash = FNV_OFFSET_BASIS;
  // Walk UTF-16 code units (not code points): XOR the unit in, then multiply
  // by the FNV prime modulo 2^32.
  for (let pos = 0; pos !== joined.length; pos += 1) {
    hash = Math.imul(hash ^ joined.charCodeAt(pos), FNV_PRIME) >>> 0;
  }
  return hash.toString(16).padStart(8, "0");
}
60
+
61
// Cut `text` to at most `maxUnits` UTF-16 code units without splitting a
// surrogate pair (a plain slice can end on a lone high surrogate).
function _clipTitle(text, maxUnits) {
  if (text.length <= maxUnits) return text;
  const lastUnit = text.charCodeAt(maxUnits - 1);
  const endsOnHighSurrogate = lastUnit >= 0xd800 && lastUnit <= 0xdbff;
  return text.slice(0, endsOnHighSurrogate ? maxUnits - 1 : maxUnits);
}

/**
 * Map salvaged records → MESSAGE events tagged with the per-app source.
 *
 * Each mapped message becomes one event with `subtype: "message"`,
 * `actor: "person-self"`, `source.adapter` taken from resolveApp(opts.app) and
 * `source.originalId` of the form `salvage:<platform>:<key>`, where `<key>` is
 * a content-derived hash used for dedup on re-ingest.
 *
 * @param {Array<{rowid,cols}>} records salvaged tuples; null/undefined is
 *   treated as an empty list
 * @param {{ app?: string, columns?: string[], now?: number }} [opts]
 *   `columns` — column names for `cols`; inferred via inferMsgColumns when
 *   absent or empty. `now` — ingest timestamp (ms); defaults to Date.now().
 * @returns {{ events: object[], mapped: number, columns: string[], sourceAdapter: string }}
 */
function buildSalvageEvents(records, opts = {}) {
  const cfg = resolveApp(opts.app);
  const now = Number.isFinite(opts.now) ? opts.now : Date.now();
  const columns = Array.isArray(opts.columns) && opts.columns.length
    ? opts.columns
    : inferMsgColumns(records || []);
  const messages = mapMsgRecords(records || [], columns);
  const events = [];
  for (const m of messages) {
    const text = typeof m.text === "string" ? m.text : "";
    const hasTimestamp = Number.isFinite(m.createdTimeMs) && m.createdTimeMs > 0;
    // Messages without a recovered timestamp are placed at ingest time.
    const occurredAt = hasTimestamp ? m.createdTimeMs : now;
    const key = _stableKey([
      cfg.platform,
      String(m.conversationId || m.senderUid || ""),
      // Key on the recovered timestamp only. Hashing the `now` fallback would
      // give timestamp-less messages a different originalId on every re-scan,
      // defeating dedup. "0" cannot clash with a real timestamp (those are > 0).
      hasTimestamp ? String(occurredAt) : "0",
      // Raw code-unit slice kept as-is so keys for timestamped messages stay
      // identical to ones already stored.
      text.slice(0, 64),
    ]);
    events.push({
      id: newId(),
      type: "event",
      subtype: "message",
      occurredAt,
      actor: "person-self",
      content: {
        title: text ? _clipTitle(text, 80) : "(非文本消息)",
        text,
      },
      ingestedAt: now,
      source: {
        adapter: cfg.sourceAdapter,
        adapterVersion: SALVAGE_VERSION,
        originalId: `salvage:${cfg.platform}:${key}`,
        capturedAt: occurredAt,
        // schema enum: export|api|sqlite|accessibility|ocr|manual. Salvaged
        // pages ARE SQLite rows; mem-salvage provenance lives in extra.salvaged.
        capturedBy: "sqlite",
      },
      extra: {
        platform: cfg.platform,
        channel: "im",
        salvaged: true,
        senderUid: m.senderUid || null,
        conversationId: m.conversationId || null,
        contentBlob: typeof m.contentBlob === "string" ? m.contentBlob : null,
      },
    });
  }
  return { events, mapped: messages.length, columns, sourceAdapter: cfg.sourceAdapter };
}
118
+
119
/**
 * Salvage a memory dump and write the recovered messages straight into the
 * vault with the correct per-app source. Returns counts.
 *
 * @param {object} vault LocalVault (must expose putBatch)
 * @param {string} dumpPath path of the dump file handed to salvageFile
 * @param {{ app?: string, columns?: string[], unaligned?: boolean, pageSize?: number, minCols?: number, stride?: *, now?: number }} [opts]
 *   `pageSize`, `minCols`, `unaligned` and `stride` are forwarded unchanged to
 *   salvageFile; `app`, `columns` and `now` are consumed by buildSalvageEvents.
 * @returns {{ ingested: number, salvaged: number, mapped: number, app: string, sourceAdapter: string, leafPages: number }}
 *   `ingested` — event count reported by vault.putBatch (0 when nothing was
 *   built); `salvaged` — raw records recovered; `mapped` — records that mapped
 *   to messages; `leafPages` — the `pages` value reported by salvageFile.
 * @throws {TypeError} when `vault` lacks putBatch or `dumpPath` is not a
 *   non-empty string
 */
function salvageDumpToVault(vault, dumpPath, opts = {}) {
  if (!vault || typeof vault.putBatch !== "function") {
    throw new TypeError("salvageDumpToVault: vault with putBatch required");
  }
  if (typeof dumpPath !== "string" || !dumpPath) {
    throw new TypeError("salvageDumpToVault: dumpPath required");
  }
  const { records, pages } = salvageFile(dumpPath, {
    pageSize: opts.pageSize,
    minCols: opts.minCols,
    unaligned: opts.unaligned,
    stride: opts.stride,
  });
  const built = buildSalvageEvents(records, opts);
  // Skip the vault write entirely when nothing was recovered.
  const res = built.events.length
    ? vault.putBatch({ events: built.events })
    : { events: 0 };
  return {
    // Guard against a putBatch that returns nothing: report 0, don't throw.
    ingested: (res && res.events) || 0,
    salvaged: records.length,
    mapped: built.mapped,
    app: String(opts.app || "douyin").toLowerCase(),
    sourceAdapter: built.sourceAdapter,
    leafPages: pages,
  };
}
154
+
155
+ module.exports = {
156
+ APP_SALVAGE,
157
+ resolveApp,
158
+ buildSalvageEvents,
159
+ salvageDumpToVault,
160
+ };
package/lib/vault.js CHANGED
@@ -811,6 +811,18 @@ class LocalVault {
811
811
  where.push("source_adapter = @adapter");
812
812
  params.adapter = q.adapter;
813
813
  }
814
+ if (Array.isArray(q.excludeExtraKinds) && q.excludeExtraKinds.length > 0) {
815
+ // Exclude inventory-snapshot events (e.g. installed-app / contact-roster
816
+ // facet events) whose extra.kind is in the given list. Those carry a
817
+ // synthetic collection-time occurredAt and would otherwise dominate any
818
+ // time-ordered (occurred_at DESC) query. Rows with no extra.kind are kept.
819
+ const placeholders = q.excludeExtraKinds.map((_v, i) => `@xk${i}`);
820
+ q.excludeExtraKinds.forEach((v, i) => { params[`xk${i}`] = v; });
821
+ where.push(
822
+ "(json_extract(extra, '$.kind') IS NULL OR json_extract(extra, '$.kind') NOT IN (" +
823
+ placeholders.join(", ") + "))",
824
+ );
825
+ }
814
826
 
815
827
  const limit = Number.isInteger(q.limit) && q.limit > 0 ? Math.min(q.limit, 10000) : 100;
816
828
  const offset = Number.isInteger(q.offset) && q.offset >= 0 ? q.offset : 0;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@chainlesschain/personal-data-hub",
3
- "version": "0.4.26",
3
+ "version": "0.4.28",
4
4
  "description": "Personal Data Hub — UnifiedSchema + validators + KG ingest helpers for the data-back-to-the-individual middleware",
5
5
  "type": "commonjs",
6
6
  "main": "lib/index.js",
@@ -68,7 +68,9 @@
68
68
  "./adapters/messaging-qq": "./lib/adapters/messaging-qq/index.js",
69
69
  "./adapters/messaging-telegram": "./lib/adapters/messaging-telegram/index.js",
70
70
  "./adapters/messaging-whatsapp": "./lib/adapters/messaging-whatsapp/index.js",
71
- "./sidecar": "./lib/sidecar/index.js"
71
+ "./sidecar": "./lib/sidecar/index.js",
72
+ "./forensics/leaf-salvage": "./lib/forensics/leaf-salvage.js",
73
+ "./forensics/salvage-ingest": "./lib/forensics/salvage-ingest.js"
72
74
  },
73
75
  "scripts": {
74
76
  "test": "vitest run",