@chainlesschain/personal-data-hub 0.4.25 → 0.4.27

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,185 @@
1
+ "use strict";
2
+ /*
3
+ * PDH SQLite leaf-page record salvager — library form.
4
+ *
5
+ * Method B (`/proc/<pid>/mem` 内存扫描) dumps decrypted pages, but for DBs whose
6
+ * page cache is SCATTERED (not contiguous mmap) the rebuilt file is "malformed"
7
+ * (valid header, broken b-tree). This salvages the DATA anyway: it scans a dump
8
+ * (or concatenated dumps) for SQLite **table b-tree leaf pages** (type 0x0D) and
9
+ * parses each page's records directly — order-independent, exactly what sqlite3
10
+ * `.recover` does, but standalone (platform-tools sqlite3 lacks .recover).
11
+ *
12
+ * This file is the bundle-able home for the parser; the standalone CLI tool
13
+ * `scripts/android/pdh-sqlite-leaf-salvage.js` re-exports from here. Lives in
14
+ * pdh lib so the Android cc bundle (and `cc hub salvage`) can call it on-device.
15
+ *
16
+ * Output: array of {rowid, cols:[...]} — raw positional column tuples (leaf
17
+ * pages carry no column names). Map to a schema downstream (salvage-mapper.js).
18
+ *
19
+ * Authorization: only on data you are entitled to (your own device/account).
20
+ * Docs: docs/internal/pdh-db-decryption-runbook.md (Method B + reconstruction).
21
+ */
22
+ const fs = require("node:fs");
23
+
24
/**
 * Decode one SQLite variable-length integer starting at `off`.
 *
 * The first eight bytes each contribute their low 7 bits (high bit set means
 * "more bytes follow"); a ninth byte, when reached, contributes all 8 bits.
 *
 * @param {Buffer|Uint8Array} buf bytes to read from
 * @param {number} off offset of the first varint byte
 * @returns {[bigint|null, number]} the decoded (unsigned) value and the offset
 *   just past it, or `[null, offset]` when the read runs off the end of `buf`
 */
function readVarint(buf, off) {
  const ninthAt = off + 8;
  let acc = 0n;
  let pos = off;
  while (pos < ninthAt) {
    const byte = buf[pos];
    if (byte === undefined) return [null, pos];
    acc = (acc << 7n) | BigInt(byte & 0x7f);
    pos += 1;
    // A clear high bit terminates the varint.
    if ((byte & 0x80) === 0) return [acc, pos];
  }
  // Eight continuation bytes seen: the ninth byte is taken whole.
  const last = buf[ninthAt];
  if (last === undefined) return [null, off + 9];
  return [(acc << 8n) | BigInt(last), off + 9];
}
40
+
41
/**
 * Number of content bytes occupied by a value with SQLite serial type `t`.
 *
 * @param {bigint|number} t serial type code from a record header
 * @returns {number} byte length of the value body (0 for NULL, the constants
 *   0/1, and the codes this parser does not size: 10 and 11)
 */
function serialTypeSize(t) {
  const code = Number(t);
  if (code >= 12) {
    // Even codes are blobs, odd codes are text; both encode their length.
    const origin = code % 2 === 0 ? 12 : 13;
    return (code - origin) / 2;
  }
  switch (code) {
    case 1:
      return 1;
    case 2:
      return 2;
    case 3:
      return 3;
    case 4:
      return 4;
    case 5:
      return 6;
    case 6:
    case 7:
      return 8;
    default:
      return 0;
  }
}
57
+
58
/**
 * Decode a single column value of serial type `t` located at `off`.
 *
 * @param {Buffer} buf bytes to read from
 * @param {number} off offset of the value body
 * @param {bigint|number} t serial type code
 * @returns {[any, number]} the decoded value and the offset just past it.
 *   Integers come back as numbers, or as decimal strings when they are not
 *   safe integers; blobs are reported as a `<blob:N>` length marker only.
 */
function readValue(buf, off, t) {
  const code = Number(t);
  const width = serialTypeSize(t);

  switch (code) {
    case 0:
      return [null, off];
    case 8:
      return [0, off];
    case 9:
      return [1, off];
    case 7:
      return [buf.readDoubleBE(off), off + 8];
    default:
      break;
  }

  if (code >= 1 && code <= 6) {
    // Big-endian two's-complement integer; bytes past the buffer read as 0.
    let raw = 0n;
    for (let k = 0; k < width; k++) {
      raw = (raw << 8n) | BigInt(buf[off + k] || 0);
    }
    const bitWidth = BigInt(width * 8);
    const signBit = 1n << (bitWidth - 1n);
    if (raw >= signBit) raw -= 1n << bitWidth;
    const asNumber = Number(raw);
    const value = Number.isSafeInteger(asNumber) ? asNumber : raw.toString();
    return [value, off + width];
  }

  const end = off + width;
  if (code >= 13 && code % 2 === 1) {
    return [buf.toString("utf8", off, end), end];
  }
  // Everything else is treated as a blob: report its size, not its bytes.
  return [`<blob:${width}>`, end];
}
81
+
82
/**
 * Parse one SQLite table b-tree leaf page located at `base`.
 *
 * The page header is validated first (type byte 0x0D, plausible cell count,
 * cell-content start after the pointer array and inside the page) so that a
 * random 0x0D byte hit by a fine scan stride is rejected cheaply. Each cell is
 * then decoded independently; a malformed cell is skipped, never fatal.
 *
 * @param {Buffer} buf dump to read from
 * @param {number} base offset of the candidate page's first byte
 * @param {number} pageSize SQLite page size in bytes
 * @param {number} minCols records with fewer columns are dropped
 * @returns {Array<{rowid: string, cols: any[]}>|null} the records found, or
 *   null when the page is not a leaf page or yields no records. `rowid` is the
 *   signed 64-bit row id rendered as a decimal string.
 */
function parseLeafPage(buf, base, pageSize, minCols) {
  if (buf[base] !== 0x0d) return null; // 0x0D = table b-tree leaf
  if (base + pageSize > buf.length) return null;
  const numCells = (buf[base + 3] << 8) | buf[base + 4];
  if (numCells <= 0 || numCells > Math.floor(pageSize / 4)) return null;
  // cell-content-start (bytes 5-6; 0 means 65536) must sit after the cell
  // pointer array and within the page.
  let cellStart = (buf[base + 5] << 8) | buf[base + 6];
  if (cellStart === 0) cellStart = 65536;
  const hdrEndMin = 8 + numCells * 2;
  if (cellStart < hdrEndMin || cellStart > pageSize) return null;

  const out = [];
  const ptrBase = base + 8;
  for (let c = 0; c < numCells; c++) {
    const ptr = (buf[ptrBase + c * 2] << 8) | buf[ptrBase + c * 2 + 1];
    if (ptr < 8 || ptr >= pageSize) continue;
    try {
      const [payloadLen, afterLen] = readVarint(buf, base + ptr);
      if (payloadLen === null || payloadLen <= 0n || payloadLen > BigInt(pageSize)) continue;
      const [rowid, recStart] = readVarint(buf, afterLen);
      if (rowid === null) continue;
      // Record header: its length varint counts itself, and the header is part
      // of the payload. Rejecting hdrLen > payloadLen stops a garbage length
      // from driving the serial-type loop across the rest of the dump.
      const [hdrLen, afterHdrLen] = readVarint(buf, recStart);
      if (hdrLen === null || hdrLen <= 0n || hdrLen > payloadLen) continue;
      const hdrEnd = recStart + Number(hdrLen);

      const serials = [];
      let off = afterHdrLen;
      while (off < hdrEnd) {
        const [st, next] = readVarint(buf, off);
        off = next;
        if (st === null) break;
        serials.push(st);
      }

      const cols = [];
      let vOff = hdrEnd;
      for (const st of serials) {
        const [val, next] = readValue(buf, vOff, st);
        cols.push(val);
        vOff = next;
      }
      if (cols.length >= minCols) {
        // Rowids are 64-bit two's-complement; the varint decodes unsigned.
        out.push({ rowid: BigInt.asIntN(64, rowid).toString(), cols });
      }
    } catch (_e) {
      /* skip malformed cell */
    }
  }
  return out.length ? out : null;
}
127
+
128
/**
 * Scan an in-memory dump buffer for table-leaf pages and return all salvaged
 * records (deduped). Engine-agnostic: works on any decrypted-page dump.
 *
 * @param {Buffer} buf
 * @param {{pageSize?: number, minCols?: number, unaligned?: boolean, stride?: number}} [opts]
 *   - pageSize: SQLite page size, a positive integer (default 4096)
 *   - minCols: drop records with fewer columns (default 3)
 *   - unaligned: scan at a finer stride (512) to catch pages that are not
 *     page-aligned in the dump; slower, but recovers pages the aligned scan
 *     misses. Header validation in the page parser rejects the extra false
 *     positives.
 *   - stride: explicit scan stride, a positive integer (overrides the
 *     unaligned default)
 *   Values that are not positive integers (for pageSize/stride) fall back to
 *   the defaults, because a fractional offset can never address a byte.
 * @returns {{records: Array<{rowid: string|null, cols: any[]}>, pages: number}}
 * @throws {TypeError} when `buf` is not a Buffer
 */
function salvageBuffer(buf, opts = {}) {
  if (!Buffer.isBuffer(buf)) {
    throw new TypeError("salvageBuffer: buf must be a Buffer");
  }
  const options = opts || {}; // tolerate an explicit null
  const isPositiveInt = (v) => Number.isInteger(v) && v > 0;

  const pageSize = isPositiveInt(options.pageSize) ? options.pageSize : 4096;
  const minCols =
    Number.isFinite(options.minCols) && options.minCols >= 0 ? options.minCols : 3;
  const defaultStride = options.unaligned ? 512 : pageSize;
  const stride = isPositiveInt(options.stride) ? options.stride : defaultStride;

  let pages = 0;
  const records = [];
  const seen = new Set(); // dedup overlapping finds
  for (let base = 0; base + 8 <= buf.length; base += stride) {
    if (buf[base] !== 0x0d) continue; // cheap pre-filter before full parse
    const recs = parseLeafPage(buf, base, pageSize, minCols);
    if (!recs) continue;
    pages++;
    for (const rec of recs) {
      const key = `${rec.rowid} ${JSON.stringify(rec.cols)}`;
      if (seen.has(key)) continue;
      seen.add(key);
      records.push(rec);
    }
  }
  return { records, pages };
}
171
+
172
/**
 * Load a dump file fully into memory (synchronously) and salvage records from
 * it. Options and return value are those of {@link salvageBuffer}.
 *
 * @param {string} filePath path of the dump file
 * @param {object} [opts] forwarded unchanged to salvageBuffer
 */
function salvageFile(filePath, opts = {}) {
  return salvageBuffer(fs.readFileSync(filePath), opts);
}
177
+
178
+ module.exports = {
179
+ readVarint,
180
+ serialTypeSize,
181
+ readValue,
182
+ parseLeafPage,
183
+ salvageBuffer,
184
+ salvageFile,
185
+ };
@@ -0,0 +1,160 @@
1
+ "use strict";
2
+ /*
3
+ * Generic salvage → vault ingest (multi-app, correct source attribution).
4
+ *
5
+ * The leaf-page salvager recovers raw {rowid, cols} tuples from a /proc/mem
6
+ * dump of ANY app. This maps them into MESSAGE events and writes them straight
7
+ * to the vault with the CORRECT per-app `source.adapter` — so 抖音 data shows as
8
+ * social-douyin, 头条 as social-toutiao, 微信 as wechat, etc., instead of all
9
+ * being mis-attributed to one adapter.
10
+ *
11
+ * Why direct vault.putBatch (not registry.syncAdapter): the registry path runs
12
+ * an adapter's normalize(), which hard-stamps that adapter's name as the source.
13
+ * For salvaged cross-app data we need to stamp the source per the app the dump
14
+ * came from. Building events here + putBatch keeps attribution correct and is
15
+ * engine-agnostic. Events carry `source.capturedBy = "sqlite"` (schema enum) and
16
+ * `extra.salvaged = true` as the mem-salvage provenance marker.
17
+ *
18
+ * Authorization: only on data you are entitled to (your own device/account).
19
+ * Docs: docs/internal/pdh-db-decryption-runbook.md (Method B).
20
+ */
21
+ const { newId } = require("../ids");
22
+ const { salvageFile } = require("./leaf-salvage");
23
+ const {
24
+ mapMsgRecords,
25
+ inferMsgColumns,
26
+ } = require("../adapters/social-douyin-adb/salvage-mapper");
27
+
28
/** Version string stamped into `source.adapterVersion` of salvaged events. */
const SALVAGE_VERSION = "salvage-0.1";

// appKey → { sourceAdapter (vault source.adapter, = the app's canonical adapter
// name so byApp aggregation attributes correctly), platform }. Unknown apps
// fall back to a "salvage:<app>" source so they're still distinct + traceable.
// Entries are frozen individually as well: the lookup hands them out by
// reference, so a shallow freeze would still let a caller edit the shared table.
const APP_SALVAGE = Object.freeze({
  douyin: Object.freeze({ sourceAdapter: "social-douyin", platform: "douyin" }),
  toutiao: Object.freeze({ sourceAdapter: "social-toutiao", platform: "toutiao" }),
  kuaishou: Object.freeze({ sourceAdapter: "social-kuaishou", platform: "kuaishou" }),
  xiaohongshu: Object.freeze({ sourceAdapter: "social-xiaohongshu", platform: "xiaohongshu" }),
  weibo: Object.freeze({ sourceAdapter: "social-weibo", platform: "weibo" }),
  wechat: Object.freeze({ sourceAdapter: "wechat", platform: "wechat" }),
  qq: Object.freeze({ sourceAdapter: "qq", platform: "qq" }),
});
42
+
43
/**
 * Resolve an app key to its salvage attribution config.
 *
 * @param {string} [app] app key, case-insensitive; a falsy value means "douyin"
 * @returns {{sourceAdapter: string, platform: string}} the table entry for a
 *   known app, otherwise a `salvage:<key>` fallback so unknown apps stay
 *   distinct and traceable
 */
function resolveApp(app) {
  const key = String(app || "douyin").toLowerCase();
  // Own-property check: a plain `APP_SALVAGE[key]` would resolve inherited
  // names such as "constructor" or "__proto__" to Object.prototype members.
  if (Object.prototype.hasOwnProperty.call(APP_SALVAGE, key)) {
    return APP_SALVAGE[key];
  }
  return { sourceAdapter: `salvage:${key}`, platform: key };
}
47
+
48
/**
 * Content-derived dedup key: 32-bit FNV-1a over the parts joined with "|",
 * hashed per UTF-16 code unit and rendered as 8 lowercase hex digits. The same
 * parts always produce the same key, so a re-scan of the same message maps to
 * the same original id.
 *
 * @param {string[]} parts salient fields of the message
 * @returns {string} 8-character hex digest
 */
function _stableKey(parts) {
  const FNV_OFFSET_BASIS = 0x811c9dc5;
  const FNV_PRIME = 0x01000193;
  const joined = parts.join("|");
  let hash = FNV_OFFSET_BASIS;
  let idx = 0;
  while (idx < joined.length) {
    // xor-then-multiply, kept in unsigned 32-bit range
    hash = Math.imul(hash ^ joined.charCodeAt(idx), FNV_PRIME) >>> 0;
    idx += 1;
  }
  return hash.toString(16).padStart(8, "0");
}
60
+
61
/**
 * Map salvaged records → MESSAGE events tagged with the per-app source.
 *
 * Each event's `source.originalId` is `salvage:<platform>:<key>`, where the
 * key is derived only from message content (platform, conversation or sender,
 * the message's own timestamp, first 64 chars of text). A message without a
 * usable timestamp still gets `occurredAt = now`, but that fallback is kept
 * out of the key so re-scanning the same dump yields the same id instead of a
 * fresh one per run.
 *
 * @param {Array<{rowid,cols}>} records
 * @param {{ app?: string, columns?: string[], now?: number }} [opts]
 *   - app: app key used for source attribution
 *   - columns: explicit column names; inferred from the records when omitted
 *   - now: ingest timestamp in ms (defaults to Date.now())
 * @returns {{ events: object[], mapped: number, columns: string[], sourceAdapter: string }}
 */
function buildSalvageEvents(records, opts = {}) {
  const cfg = resolveApp(opts.app);
  const now = Number.isFinite(opts.now) ? opts.now : Date.now();
  const columns = Array.isArray(opts.columns) && opts.columns.length
    ? opts.columns
    : inferMsgColumns(records || []);
  const messages = mapMsgRecords(records || [], columns);
  const events = [];
  for (const m of messages) {
    const text = typeof m.text === "string" ? m.text : "";
    const hasTimestamp = Number.isFinite(m.createdTimeMs) && m.createdTimeMs > 0;
    const occurredAt = hasTimestamp ? m.createdTimeMs : now;
    const key = _stableKey([
      cfg.platform,
      String(m.conversationId || m.senderUid || ""),
      // "0" stands in for "no timestamp"; real timestamps are always > 0.
      hasTimestamp ? String(m.createdTimeMs) : "0",
      text.slice(0, 64),
    ]);
    events.push({
      id: newId(),
      type: "event",
      subtype: "message",
      occurredAt,
      actor: "person-self",
      content: {
        title: text ? text.slice(0, 80) : "(非文本消息)",
        text,
      },
      ingestedAt: now,
      source: {
        adapter: cfg.sourceAdapter,
        adapterVersion: SALVAGE_VERSION,
        originalId: `salvage:${cfg.platform}:${key}`,
        capturedAt: occurredAt,
        // schema enum: export|api|sqlite|accessibility|ocr|manual. Salvaged
        // pages ARE SQLite rows; mem-salvage provenance lives in extra.salvaged.
        capturedBy: "sqlite",
      },
      extra: {
        platform: cfg.platform,
        channel: "im",
        salvaged: true,
        senderUid: m.senderUid || null,
        conversationId: m.conversationId || null,
        contentBlob: typeof m.contentBlob === "string" ? m.contentBlob : null,
      },
    });
  }
  return { events, mapped: messages.length, columns, sourceAdapter: cfg.sourceAdapter };
}
118
+
119
/**
 * Salvage a memory dump and write the recovered messages straight into the
 * vault with the correct per-app source. Returns counts.
 *
 * @param {object} vault LocalVault (must expose putBatch)
 * @param {string} dumpPath
 * @param {{ app?: string, columns?: string[], unaligned?: boolean, stride?: number, pageSize?: number, minCols?: number, now?: number }} [opts]
 *   Scan options (pageSize, minCols, unaligned, stride) go to the salvager;
 *   the whole object goes to the event builder.
 * @returns {{ ingested: number, salvaged: number, mapped: number, app: string, sourceAdapter: string, leafPages: number }}
 *   - ingested: `events` count reported by putBatch (0 when nothing was
 *     written or putBatch reports no count)
 *   - salvaged: raw records recovered from the dump
 *   - mapped: records that mapped to a message
 *   - leafPages: leaf pages that yielded records
 * @throws {TypeError} when the vault lacks putBatch or dumpPath is not a
 *   non-empty string
 */
function salvageDumpToVault(vault, dumpPath, opts = {}) {
  if (!vault || typeof vault.putBatch !== "function") {
    throw new TypeError("salvageDumpToVault: vault with putBatch required");
  }
  if (typeof dumpPath !== "string" || !dumpPath) {
    throw new TypeError("salvageDumpToVault: dumpPath required");
  }
  const { records, pages } = salvageFile(dumpPath, {
    pageSize: opts.pageSize,
    minCols: opts.minCols,
    unaligned: opts.unaligned,
    stride: opts.stride,
  });
  const built = buildSalvageEvents(records, opts);
  // Skip the vault call entirely when there is nothing to write.
  const res = built.events.length
    ? vault.putBatch({ events: built.events })
    : { events: 0 };
  return {
    // putBatch may return nothing; the write already happened, so report 0
    // rather than throwing on a missing result object.
    ingested: (res && res.events) || 0,
    salvaged: records.length,
    mapped: built.mapped,
    app: String(opts.app || "douyin").toLowerCase(),
    sourceAdapter: built.sourceAdapter,
    leafPages: pages,
  };
}
154
+
155
+ module.exports = {
156
+ APP_SALVAGE,
157
+ resolveApp,
158
+ buildSalvageEvents,
159
+ salvageDumpToVault,
160
+ };
@@ -40,6 +40,7 @@ const FACT_BLOCK_FOOTER = "END FACTS.";
40
40
  const NO_FACTS_HINT = "(FACTS is empty — the vault has nothing matching this question. Say so honestly.)";
41
41
  const TOTALS_HEADER = "TOTALS (authoritative entity counts from vault — use these for count questions, NOT FACTS length):";
42
42
  const AMOUNT_SUM_HEADER = "AMOUNT_SUM (authoritative SQL total of amount-bearing events — use for spending questions, NOT FACTS sums):";
43
+ const CROSS_APP_HEADER = "CROSS_APP_OVERVIEW (跨 app 汇聚画像 — 各 app 活跃度/类型/消费/高频联系人,回答跨 app 与决策类问题时优先参考;为汇总信号,非逐条事实):";
43
44
 
44
45
  // ─── Fact summarization ─────────────────────────────────────────────────
45
46
 
@@ -134,6 +135,10 @@ function buildPrompt(opts) {
134
135
  opts.vaultTotals && typeof opts.vaultTotals === "object" ? opts.vaultTotals : null;
135
136
  const amountSummary =
136
137
  opts.amountSummary && typeof opts.amountSummary === "object" ? opts.amountSummary : null;
138
+ const crossAppOverview =
139
+ typeof opts.crossAppOverview === "string" && opts.crossAppOverview.length > 0
140
+ ? opts.crossAppOverview
141
+ : null;
137
142
 
138
143
  const trimmed = facts.slice(0, maxFacts);
139
144
  const summaries = trimmed
@@ -170,6 +175,10 @@ function buildPrompt(opts) {
170
175
  if (amountSummary && Number.isFinite(amountSummary.total) && amountSummary.count > 0) {
171
176
  userContent += `\n${AMOUNT_SUM_HEADER}\n${JSON.stringify(amountSummary, null, 2)}\n`;
172
177
  }
178
+ // CROSS_APP_OVERVIEW — 跨 app 汇聚画像,置于 FACTS 前(同 TOTALS)。
179
+ if (crossAppOverview) {
180
+ userContent += `\n${CROSS_APP_HEADER}\n${crossAppOverview}\n`;
181
+ }
173
182
  userContent += `\n${FACT_BLOCK_HEADER}\n${factBody}\n${FACT_BLOCK_FOOTER}${truncatedNote}\n\nUSER QUESTION: ${question}`;
174
183
 
175
184
  return {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@chainlesschain/personal-data-hub",
3
- "version": "0.4.25",
3
+ "version": "0.4.27",
4
4
  "description": "Personal Data Hub — UnifiedSchema + validators + KG ingest helpers for the data-back-to-the-individual middleware",
5
5
  "type": "commonjs",
6
6
  "main": "lib/index.js",
@@ -68,7 +68,9 @@
68
68
  "./adapters/messaging-qq": "./lib/adapters/messaging-qq/index.js",
69
69
  "./adapters/messaging-telegram": "./lib/adapters/messaging-telegram/index.js",
70
70
  "./adapters/messaging-whatsapp": "./lib/adapters/messaging-whatsapp/index.js",
71
- "./sidecar": "./lib/sidecar/index.js"
71
+ "./sidecar": "./lib/sidecar/index.js",
72
+ "./forensics/leaf-salvage": "./lib/forensics/leaf-salvage.js",
73
+ "./forensics/salvage-ingest": "./lib/forensics/salvage-ingest.js"
72
74
  },
73
75
  "scripts": {
74
76
  "test": "vitest run",