@chainlesschain/personal-data-hub 0.4.26 → 0.4.27
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
|
|
3
|
+
import { describe, it, expect, beforeAll, afterAll } from "vitest";
|
|
4
|
+
|
|
5
|
+
const fs = require("node:fs");
|
|
6
|
+
const path = require("node:path");
|
|
7
|
+
const os = require("node:os");
|
|
8
|
+
|
|
9
|
+
const { LocalVault } = require("../lib/vault");
|
|
10
|
+
const { generateKeyHex } = require("../lib/key-providers");
|
|
11
|
+
const {
|
|
12
|
+
buildSalvageEvents,
|
|
13
|
+
salvageDumpToVault,
|
|
14
|
+
resolveApp,
|
|
15
|
+
} = require("../lib/forensics/salvage-ingest");
|
|
16
|
+
|
|
17
|
+
// End-to-end coverage for the generic salvage → vault ingest. The bytes of a
// freshly written SQLite file stand in for a memory dump; the suite checks that
// the messages are recovered, that each run is tagged with the per-app
// source.adapter, and that the events can be read back from a real LocalVault.
// The `it` blocks share one vault on purpose: later cases assert on what the
// earlier ones left behind.
describe("salvage-ingest — generic multi-app salvage → vault", () => {
  const COLUMNS = ["msg_uuid", "conversation_id", "sender", "content", "created_time"];
  // Fixed clock passed as `now` so every run stamps the same ingest time.
  const NOW = 1700000099000;
  // Rows written into the fixture database that plays the role of the dump.
  const SEED_ROWS = [
    ["u1", "conv-1", 111, "今天的会议改到下午三点 hi", 1700000000000],
    ["u2", "conv-1", 222, "收到 👌", 1700000001000],
  ];

  let dir;
  let dumpPath;
  let vault;
  let vdir;

  // Everything currently in the vault, up to `limit` events.
  const allEvents = (limit) => vault.queryEvents({ limit }) || [];
  // Subset of `events` whose source.adapter equals `adapter`.
  const fromAdapter = (events, adapter) =>
    events.filter((evt) => evt.source && evt.source.adapter === adapter);

  beforeAll(() => {
    const Database = require("better-sqlite3-multiple-ciphers");

    dir = fs.mkdtempSync(path.join(os.tmpdir(), "salvage-ing-"));
    dumpPath = path.join(dir, "u.db");
    const fixture = new Database(dumpPath);
    fixture.exec("CREATE TABLE msg(msg_uuid TEXT, conversation_id TEXT, sender INTEGER, content TEXT, created_time INTEGER)");
    const insertRow = fixture.prepare("INSERT INTO msg VALUES(?,?,?,?,?)");
    for (const row of SEED_ROWS) {
      insertRow.run(...row);
    }
    fixture.close();

    vdir = fs.mkdtempSync(path.join(os.tmpdir(), "salvage-vault-"));
    vault = new LocalVault({ path: path.join(vdir, "v.db"), key: generateKeyHex() });
    vault.open();
  });

  afterAll(() => {
    // Best-effort teardown: each step runs even if an earlier one throws.
    const cleanups = [
      () => vault.close(),
      () => fs.rmSync(dir, { recursive: true, force: true }),
      () => fs.rmSync(vdir, { recursive: true, force: true }),
    ];
    for (const cleanup of cleanups) {
      try {
        cleanup();
      } catch (_e) {
        // ignored on purpose
      }
    }
  });

  it("resolveApp maps known apps to canonical source adapters; unknown → salvage:<app>", () => {
    const expectations = [
      ["douyin", "social-douyin"],
      ["toutiao", "social-toutiao"],
      ["wechat", "wechat"],
      ["kuaishou", "social-kuaishou"],
      ["bogusapp", "salvage:bogusapp"],
    ];
    for (const [app, adapter] of expectations) {
      expect(resolveApp(app).sourceAdapter).toBe(adapter);
    }
  });

  it("buildSalvageEvents tags per-app source + builds valid message events", () => {
    const recs = [{ rowid: "1", cols: ["u1", "conv-1", 111, "hello 世界", 1700000000000] }];
    const { events } = buildSalvageEvents(recs, { app: "toutiao", columns: COLUMNS, now: NOW });
    expect(events.length).toBe(1);

    const evt = events[0];
    expect(evt.source.adapter).toBe("social-toutiao");
    // capturedBy stays within the schema enum; salvage provenance is extra.salvaged.
    expect(evt.source.capturedBy).toBe("sqlite");
    expect(evt.subtype).toBe("message");
    expect(evt.content.text).toBe("hello 世界");
    expect(evt.extra.platform).toBe("toutiao");
    expect(evt.extra.salvaged).toBe(true);
  });

  it("salvageDumpToVault ingests with douyin source + events are searchable", () => {
    const result = salvageDumpToVault(vault, dumpPath, { app: "douyin", columns: COLUMNS, now: NOW });
    expect(result.app).toBe("douyin");
    expect(result.sourceAdapter).toBe("social-douyin");
    expect(result.ingested).toBe(2);

    // Both rows must be readable back under the douyin source.
    const douyin = fromAdapter(allEvents(100), "social-douyin");
    expect(douyin.length).toBe(2);
    const texts = douyin.map((evt) => evt.content && evt.content.text);
    // The emoji row checks that multi-byte UTF-8 text comes through intact.
    expect(texts).toContain("收到 👌");
  });

  it("same dump under a different app tags a different source (no cross-attribution)", () => {
    const result = salvageDumpToVault(vault, dumpPath, { app: "toutiao", columns: COLUMNS, now: NOW });
    expect(result.sourceAdapter).toBe("social-toutiao");
    expect(result.ingested).toBe(2);

    const events = allEvents(100);
    expect(fromAdapter(events, "social-toutiao").length).toBe(2);
    // The douyin events written by the previous case are still there, untouched.
    expect(fromAdapter(events, "social-douyin").length).toBe(2);
  });

  it("re-ingesting the same dump dedups (stable originalId)", () => {
    const countBefore = allEvents(200).length;
    salvageDumpToVault(vault, dumpPath, { app: "douyin", columns: COLUMNS, now: NOW });
    const countAfter = allEvents(200).length;
    // Same (source adapter, originalId) pairs → existing rows updated, none added.
    expect(countAfter).toBe(countBefore);
  });
});
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/*
|
|
3
|
+
* Generic salvage → vault ingest (multi-app, correct source attribution).
|
|
4
|
+
*
|
|
5
|
+
* The leaf-page salvager recovers raw {rowid, cols} tuples from a /proc/mem
|
|
6
|
+
* dump of ANY app. This maps them into MESSAGE events and writes them straight
|
|
7
|
+
* to the vault with the CORRECT per-app `source.adapter` — so 抖音 data shows as
|
|
8
|
+
* social-douyin, 头条 as social-toutiao, 微信 as wechat, etc., instead of all
|
|
9
|
+
* being mis-attributed to one adapter.
|
|
10
|
+
*
|
|
11
|
+
* Why direct vault.putBatch (not registry.syncAdapter): the registry path runs
|
|
12
|
+
* an adapter's normalize(), which hard-stamps that adapter's name as the source.
|
|
13
|
+
* For salvaged cross-app data we need to stamp the source per the app the dump
|
|
14
|
+
* came from. Building events here + putBatch keeps attribution correct and is
|
|
15
|
+
* engine-agnostic. Events carry `source.capturedBy = "sqlite"` (schema enum);
|
|
16
|
+
* mem-salvage provenance is recorded as `extra.salvaged = true`.
|
|
17
|
+
*
|
|
18
|
+
* Authorization: only on data you are entitled to (your own device/account).
|
|
19
|
+
* Docs: docs/internal/pdh-db-decryption-runbook.md (Method B).
|
|
20
|
+
*/
|
|
21
|
+
const { newId } = require("../ids");
|
|
22
|
+
const { salvageFile } = require("./leaf-salvage");
|
|
23
|
+
const {
|
|
24
|
+
mapMsgRecords,
|
|
25
|
+
inferMsgColumns,
|
|
26
|
+
} = require("../adapters/social-douyin-adb/salvage-mapper");
|
|
27
|
+
|
|
28
|
+
// Written into `source.adapterVersion` of every event this module builds.
const SALVAGE_VERSION = "salvage-0.1";
|
|
29
|
+
|
|
30
|
+
// appKey → { sourceAdapter, platform }.
//   sourceAdapter: value written to the vault's `source.adapter` — the app's
//                  canonical adapter name, so per-app aggregation attributes
//                  salvaged data to the right app.
//   platform:      value written to `extra.platform` and used in the dedup key.
// Apps not listed here resolve to a "salvage:<app>" source (see resolveApp) so
// they stay distinct and traceable.
const APP_SALVAGE = Object.freeze({
  douyin: { sourceAdapter: "social-douyin", platform: "douyin" },
  toutiao: { sourceAdapter: "social-toutiao", platform: "toutiao" },
  kuaishou: { sourceAdapter: "social-kuaishou", platform: "kuaishou" },
  xiaohongshu: { sourceAdapter: "social-xiaohongshu", platform: "xiaohongshu" },
  weibo: { sourceAdapter: "social-weibo", platform: "weibo" },
  wechat: { sourceAdapter: "wechat", platform: "wechat" },
  qq: { sourceAdapter: "qq", platform: "qq" },
});

/**
 * Resolve an app key to its salvage source configuration.
 *
 * The key is stringified and lower-cased; a falsy `app` defaults to "douyin".
 *
 * @param {string} [app] app key, e.g. "douyin", "WeChat"
 * @returns {{ sourceAdapter: string, platform: string }} the APP_SALVAGE entry
 *   for a known app, otherwise `{ sourceAdapter: "salvage:<key>", platform: "<key>" }`
 */
function resolveApp(app) {
  const key = String(app || "douyin").toLowerCase();
  // Own-property check: a bare APP_SALVAGE[key] lookup also resolves inherited
  // names ("constructor", "__proto__"), which would hand callers a function or
  // Object.prototype instead of a config and leave sourceAdapter undefined.
  if (Object.prototype.hasOwnProperty.call(APP_SALVAGE, key)) {
    return APP_SALVAGE[key];
  }
  return { sourceAdapter: `salvage:${key}`, platform: key };
}
|
|
47
|
+
|
|
48
|
+
// Content-derived dedup key: 32-bit FNV-1a over the parts joined with "|",
// folded one UTF-16 code unit at a time, returned as 8 lower-case hex digits.
// Identical parts always give the same key, so a re-scanned message maps to
// the same source_original_id and is updated instead of duplicated.
function _stableKey(parts) {
  const FNV_OFFSET_BASIS = 0x811c9dc5;
  const FNV_PRIME = 0x01000193;
  const joined = parts.join("|");
  let hash = FNV_OFFSET_BASIS;
  for (let idx = 0; idx < joined.length; idx += 1) {
    // XOR the code unit in, multiply by the prime modulo 2^32, keep it unsigned.
    hash = Math.imul(hash ^ joined.charCodeAt(idx), FNV_PRIME) >>> 0;
  }
  return hash.toString(16).padStart(8, "0");
}
|
|
60
|
+
|
|
61
|
+
/**
 * Map salvaged records → MESSAGE events tagged with the per-app source.
 *
 * Records are turned into messages by `mapMsgRecords`; from each message this
 * function reads `text`, `createdTimeMs`, `conversationId`, `senderUid` and
 * `contentBlob`. A message without a finite, positive `createdTimeMs` gets
 * `occurredAt = now`.
 *
 * Dedup: `source.originalId` is `salvage:<platform>:<key>`, where the key hashes
 * platform, conversation (or sender), the record's own timestamp and the first
 * 64 UTF-16 units of the text. The `now` fallback is deliberately NOT hashed,
 * so a record with no timestamp keeps the same id across re-scans.
 *
 * @param {Array<{rowid,cols}>} records salvaged tuples; null/undefined is treated as empty
 * @param {{ app?: string, columns?: string[], now?: number }} [opts]
 *   app: app key passed to resolveApp; columns: column names for the tuples
 *   (inferred via inferMsgColumns when absent or empty); now: ingest clock in
 *   ms (defaults to Date.now()).
 * @returns {{ events: object[], mapped: number, columns: string[], sourceAdapter: string }}
 */
function buildSalvageEvents(records, opts = {}) {
  const cfg = resolveApp(opts.app);
  const now = Number.isFinite(opts.now) ? opts.now : Date.now();
  const rows = records || [];
  const columns = Array.isArray(opts.columns) && opts.columns.length
    ? opts.columns
    : inferMsgColumns(rows);
  const messages = mapMsgRecords(rows, columns);
  const events = [];
  for (const m of messages) {
    const text = typeof m.text === "string" ? m.text : "";
    const hasTimestamp = Number.isFinite(m.createdTimeMs) && m.createdTimeMs > 0;
    const occurredAt = hasTimestamp ? m.createdTimeMs : now;
    const key = _stableKey([
      cfg.platform,
      String(m.conversationId || m.senderUid || ""),
      // Only the record's own timestamp feeds the key. Hashing the `now`
      // fallback would give a timestamp-less record a new id on every scan and
      // defeat dedup. Keys for records that do have a timestamp are unchanged.
      hasTimestamp ? String(m.createdTimeMs) : "",
      // Kept as a raw UTF-16 slice so existing keys stay stable.
      text.slice(0, 64),
    ]);

    let title = "(非文本消息)";
    if (text) {
      title = text.slice(0, 80);
      // If the cut landed between the two halves of a surrogate pair, drop the
      // dangling high surrogate rather than store an unpaired code unit.
      const lastUnit = title.charCodeAt(title.length - 1);
      if (text.length > 80 && lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
        title = title.slice(0, -1);
      }
    }

    events.push({
      id: newId(),
      type: "event",
      subtype: "message",
      occurredAt,
      // NOTE(review): every event is attributed to "person-self" regardless of
      // senderUid — confirm this is intended for messages sent by others.
      actor: "person-self",
      content: {
        title,
        text,
      },
      ingestedAt: now,
      source: {
        adapter: cfg.sourceAdapter,
        adapterVersion: SALVAGE_VERSION,
        originalId: `salvage:${cfg.platform}:${key}`,
        capturedAt: occurredAt,
        // schema enum: export|api|sqlite|accessibility|ocr|manual. Salvaged
        // pages ARE SQLite rows; mem-salvage provenance lives in extra.salvaged.
        capturedBy: "sqlite",
      },
      extra: {
        platform: cfg.platform,
        channel: "im",
        salvaged: true,
        senderUid: m.senderUid || null,
        conversationId: m.conversationId || null,
        contentBlob: typeof m.contentBlob === "string" ? m.contentBlob : null,
      },
    });
  }
  return { events, mapped: messages.length, columns, sourceAdapter: cfg.sourceAdapter };
}
|
|
118
|
+
|
|
119
|
+
/**
 * Run the leaf-page salvager over a dump file, convert what it recovers into
 * message events for the given app, and store them with one `vault.putBatch`
 * call. `putBatch` is skipped when no events were built.
 *
 * @param {object} vault LocalVault (must expose putBatch)
 * @param {string} dumpPath non-empty path of the dump to scan
 * @param {{ app?: string, columns?: string[], unaligned?: boolean, pageSize?: number, minCols?: number, stride?: number, now?: number }} [opts]
 *   pageSize / minCols / unaligned / stride are forwarded to salvageFile; the
 *   whole object is forwarded to buildSalvageEvents.
 * @returns {{ ingested: number, salvaged: number, mapped: number, app: string, sourceAdapter: string, leafPages: number }}
 *   ingested: `events` count reported by putBatch (0 when nothing was written);
 *   salvaged: number of raw records recovered; mapped: messages mapped from them;
 *   app: lower-cased app key (defaults to "douyin"); leafPages: `pages` as
 *   reported by salvageFile.
 * @throws {TypeError} when `vault` lacks putBatch or `dumpPath` is not a non-empty string
 */
function salvageDumpToVault(vault, dumpPath, opts = {}) {
  const vaultUsable = Boolean(vault) && typeof vault.putBatch === "function";
  if (!vaultUsable) {
    throw new TypeError("salvageDumpToVault: vault with putBatch required");
  }
  const pathUsable = typeof dumpPath === "string" && dumpPath !== "";
  if (!pathUsable) {
    throw new TypeError("salvageDumpToVault: dumpPath required");
  }

  // Only the scanner-related options go to the salvager.
  const { pageSize, minCols, unaligned, stride } = opts;
  const scan = salvageFile(dumpPath, { pageSize, minCols, unaligned, stride });

  const built = buildSalvageEvents(scan.records, opts);
  let written = { events: 0 };
  if (built.events.length) {
    written = vault.putBatch({ events: built.events });
  }

  return {
    ingested: written.events || 0,
    salvaged: scan.records.length,
    mapped: built.mapped,
    app: String(opts.app || "douyin").toLowerCase(),
    sourceAdapter: built.sourceAdapter,
    leafPages: scan.pages,
  };
}
|
|
154
|
+
|
|
155
|
+
module.exports = {
|
|
156
|
+
APP_SALVAGE,
|
|
157
|
+
resolveApp,
|
|
158
|
+
buildSalvageEvents,
|
|
159
|
+
salvageDumpToVault,
|
|
160
|
+
};
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@chainlesschain/personal-data-hub",
|
|
3
|
-
"version": "0.4.26",
|
|
3
|
+
"version": "0.4.27",
|
|
4
4
|
"description": "Personal Data Hub — UnifiedSchema + validators + KG ingest helpers for the data-back-to-the-individual middleware",
|
|
5
5
|
"type": "commonjs",
|
|
6
6
|
"main": "lib/index.js",
|
|
@@ -68,7 +68,9 @@
|
|
|
68
68
|
"./adapters/messaging-qq": "./lib/adapters/messaging-qq/index.js",
|
|
69
69
|
"./adapters/messaging-telegram": "./lib/adapters/messaging-telegram/index.js",
|
|
70
70
|
"./adapters/messaging-whatsapp": "./lib/adapters/messaging-whatsapp/index.js",
|
|
71
|
-
"./sidecar": "./lib/sidecar/index.js"
|
|
71
|
+
"./sidecar": "./lib/sidecar/index.js",
|
|
72
|
+
"./forensics/leaf-salvage": "./lib/forensics/leaf-salvage.js",
|
|
73
|
+
"./forensics/salvage-ingest": "./lib/forensics/salvage-ingest.js"
|
|
72
74
|
},
|
|
73
75
|
"scripts": {
|
|
74
76
|
"test": "vitest run",
|