@chainlesschain/personal-data-hub 0.4.0 → 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -81,21 +81,52 @@ class WeChatPcAdapter {
81
81
  fs,
82
82
  // DI seam: tests inject a fake SQLite driver class via dbDriverFactory.
83
83
  dbDriverFactory: opts.dbDriverFactory || null,
84
+ // DI seam: tests inject a fake WeChat 4.x collector; default lazy-loads
85
+ // the forensics-bridge sidecar invoker.
86
+ v4Collector: opts.v4Collector || null,
87
+ // DI seam for discovery (see _autoDiscover).
88
+ discoveryDeps: opts.discoveryDeps || undefined,
84
89
  };
85
90
  }
86
91
 
92
+ // Auto-discover PC WeChat's local DB on the host (3.x + 4.x layouts) so the
93
+ // UI never needs a manually typed path. Lazy-required + cached per instance.
94
+ _autoDiscover() {
95
+ if (this._discovered !== undefined) return this._discovered;
96
+ try {
97
+ // eslint-disable-next-line global-require
98
+ const { discover } = require("../_pc-local-discovery");
99
+ this._discovered = discover("wechat-pc", this._deps.discoveryDeps || {});
100
+ } catch (_e) {
101
+ this._discovered = null;
102
+ }
103
+ return this._discovered;
104
+ }
105
+
87
106
  async authenticate(ctx = {}) {
88
107
  // Cheap readiness probe — never opens / decrypts a DB.
89
108
  if (ctx && ctx.readinessOnly) {
90
109
  if (this._dbPath) return { ok: true, mode: "configured" };
110
+ const disc = this._autoDiscover();
111
+ if (disc && disc.installed) {
112
+ return {
113
+ ok: false,
114
+ reason: "DB_FOUND_NEEDS_KEY",
115
+ message: `已找到本机微信库(${disc.layout || ""} ${disc.accounts.length} 个账号,主库 ${disc.primaryDb})`,
116
+ discovered: disc,
117
+ };
118
+ }
91
119
  return {
92
120
  ok: false,
93
- reason: "DB_NOT_PULLED",
94
- message:
95
- "wechat-pc: 需提供 PC 微信本地数据库路径(MSG*.db / MicroMsg.db),加密库需先解密或提供 key",
121
+ reason: "APP_NOT_INSTALLED",
122
+ message: (disc && disc.note) || "未检测到本机微信数据(可能未安装或未登录)",
96
123
  };
97
124
  }
98
- const dbPath = (ctx && ctx.inputPath) || (ctx && ctx.dbPath) || this._dbPath;
125
+ const dbPath =
126
+ (ctx && ctx.inputPath) ||
127
+ (ctx && ctx.dbPath) ||
128
+ this._dbPath ||
129
+ this._resolveDiscoveredDbPath();
99
130
  if (dbPath) {
100
131
  try {
101
132
  this._deps.fs.accessSync(dbPath, this._deps.fs.constants.R_OK);
@@ -108,22 +139,54 @@ class WeChatPcAdapter {
108
139
  }
109
140
  return { ok: true, mode: "sqlite" };
110
141
  }
142
+ const disc = this._autoDiscover();
143
+ if (disc && disc.installed) {
144
+ return {
145
+ ok: false,
146
+ reason: "DB_FOUND_NEEDS_KEY",
147
+ message: `已找到本机微信库(主库 ${disc.primaryDb}),需解密密钥`,
148
+ discovered: disc,
149
+ };
150
+ }
111
151
  return {
112
152
  ok: false,
113
- reason: "DB_NOT_PULLED",
114
- message: "wechat-pc.authenticate: needs opts.dbPath / inputPath (MSG*.db or MicroMsg.db)",
153
+ reason: "APP_NOT_INSTALLED",
154
+ message: "wechat-pc.authenticate: 未检测到本机微信库,也未提供 dbPath / inputPath",
115
155
  };
116
156
  }
117
157
 
158
+ // Resolve the auto-discovered primary message DB path (null if none).
159
+ _resolveDiscoveredDbPath() {
160
+ const disc = this._autoDiscover();
161
+ return disc && disc.installed && disc.primaryDb ? disc.primaryDb : null;
162
+ }
163
+
118
164
  async healthCheck() {
119
165
  return { ok: true, lastChecked: Date.now() };
120
166
  }
121
167
 
122
168
  async *sync(opts = {}) {
123
- const dbPath = opts.dbPath || opts.inputPath || this._dbPath;
169
+ // WeChat 4.x path: encrypted SQLCipher-4 DBs whose key lives in Weixin.exe
170
+ // memory. Route through the Python sidecar (memory key + decrypt + parse)
171
+ // and yield the decrypted messages. Triggered when the user gives no
172
+ // explicit plaintext path AND discovery sees the 4.x layout, or opts.mode.
173
+ const disc = this._autoDiscover();
174
+ const noExplicitPath = !opts.dbPath && !opts.inputPath && !this._dbPath;
175
+ const useV4 =
176
+ opts.mode === "v4" ||
177
+ (noExplicitPath && disc && disc.installed && disc.layout === "4.x");
178
+ if (useV4) {
179
+ yield* this._syncV4(opts, disc);
180
+ return;
181
+ }
182
+
183
+ // One-click: when no explicit path is given, fall back to the
184
+ // auto-discovered primary message DB on this host (3.x plaintext/keyed).
185
+ const dbPath =
186
+ opts.dbPath || opts.inputPath || this._dbPath || this._resolveDiscoveredDbPath();
124
187
  if (!dbPath) {
125
188
  throw new Error(
126
- "wechat-pc.sync: needs opts.dbPath / opts.inputPath pointing to a PC WeChat DB (MSG*.db or MicroMsg.db)",
189
+ "wechat-pc.sync: 未找到本机微信库且未提供 opts.dbPath / opts.inputPath",
127
190
  );
128
191
  }
129
192
  if (!this._deps.fs.existsSync(dbPath)) return;
@@ -186,6 +249,114 @@ class WeChatPcAdapter {
186
249
  }
187
250
  }
188
251
 
252
+ // WeChat 4.x: invoke the sidecar collector, then re-shape each decrypted
253
+ // message into the SAME payload the 3.x normalizeMessage() understands, so
254
+ // both layouts share one normalization path.
255
+ async *_syncV4(opts = {}, disc) {
256
+ let collect = this._deps.v4Collector;
257
+ if (!collect) {
258
+ // eslint-disable-next-line global-require
259
+ collect = require("./v4-sidecar").collectWeChatV4;
260
+ }
261
+ const limit = Number.isInteger(opts.limit) && opts.limit > 0 ? opts.limit : undefined;
262
+ const result = await collect({
263
+ limit,
264
+ key: opts.key || this._key || undefined,
265
+ pythonExe: opts.pythonExe,
266
+ bridgeDir: opts.bridgeDir,
267
+ timeoutMs: opts.timeoutMs,
268
+ onProgress:
269
+ typeof opts.onProgress === "function"
270
+ ? (m) => {
271
+ try { opts.onProgress({ phase: "wechat-v4", adapter: NAME, ...m }); } catch (_e) { /* best-effort */ }
272
+ }
273
+ : undefined,
274
+ _supervisorFactory: opts._supervisorFactory,
275
+ });
276
+ if (typeof opts.onProgress === "function") {
277
+ try {
278
+ opts.onProgress({
279
+ phase: "wechat-v4-done",
280
+ adapter: NAME,
281
+ account: result && result.account,
282
+ messageCount: result && result.messageCount,
283
+ dbs: result && result.dbs,
284
+ });
285
+ } catch (_e) { /* best-effort */ }
286
+ }
287
+ const selfWxid =
288
+ (result && result.account) ||
289
+ (disc && disc.accounts && disc.accounts[0] && disc.accounts[0].id) ||
290
+ null;
291
+ const fallbackCapturedAt = Date.now();
292
+ const messages = (result && Array.isArray(result.messages)) ? result.messages : [];
293
+ let emitted = 0;
294
+ // The sidecar already applied `limit` across all sources (chat/biz/sns/
295
+ // favorite). Yield everything it returned — do NOT re-cap here, or the
296
+ // trailing 朋友圈/收藏 entries and the contacts block would be skipped.
297
+ for (const m of messages) {
298
+ if (!m || typeof m !== "object") continue;
299
+ const conv = typeof m.conversation === "string" ? m.conversation : null;
300
+ const isGroup = !!conv && conv.endsWith("@chatroom");
301
+ const createdTimeMs =
302
+ typeof m.createTime === "number" && m.createTime > 0 ? m.createTime * 1000 : null;
303
+ // Map → 3.x payload shape consumed by normalizeMessage().
304
+ const payload = {
305
+ kind: KIND_MESSAGE,
306
+ msgSvrId: m.originalId || null,
307
+ talker: conv,
308
+ isSend: selfWxid && m.sender && m.sender === selfWxid ? 1 : 0,
309
+ type: typeof m.type === "number" ? m.type : null,
310
+ createdTimeMs,
311
+ text: typeof m.text === "string" ? m.text : "",
312
+ senderWxid: isGroup ? (m.sender || null) : null,
313
+ isGroup,
314
+ contentBlob: typeof m.text === "string" ? m.text : null,
315
+ // provenance: chat | biz(公众号) | sns(朋友圈) | favorite(收藏)
316
+ wechatSource: typeof m.source === "string" ? m.source : "chat",
317
+ appType: typeof m.appType === "number" ? m.appType : null, // appmsg subtype (type 49)
318
+ appUrl: typeof m.appUrl === "string" ? m.appUrl : null, // link/article url
319
+ };
320
+ const idPart =
321
+ m.originalId ||
322
+ (conv && createdTimeMs ? `${conv}-${createdTimeMs}` : `v4-${emitted}`);
323
+ yield {
324
+ adapter: NAME,
325
+ kind: KIND_MESSAGE,
326
+ originalId: m.originalId || stableOriginalId(KIND_MESSAGE, idPart),
327
+ capturedAt: createdTimeMs || fallbackCapturedAt,
328
+ payload,
329
+ };
330
+ emitted += 1;
331
+ }
332
+
333
+ // Contacts (from contact.db) → Person entities. Not bound by the message
334
+ // `limit` (that caps messages, not the address book). Opt out via
335
+ // opts.include.contact === false.
336
+ const include = opts.include || {};
337
+ if (include[KIND_CONTACT] !== false) {
338
+ const contacts = (result && Array.isArray(result.contacts)) ? result.contacts : [];
339
+ for (const c of contacts) {
340
+ if (!c || typeof c !== "object" || !c.wxid) continue;
341
+ if (typeof c.wxid === "string" && c.wxid.endsWith("@chatroom")) continue;
342
+ yield {
343
+ adapter: NAME,
344
+ kind: KIND_CONTACT,
345
+ originalId: stableOriginalId(KIND_CONTACT, c.wxid),
346
+ capturedAt: fallbackCapturedAt,
347
+ payload: {
348
+ kind: KIND_CONTACT,
349
+ wxid: c.wxid,
350
+ alias: c.alias || null,
351
+ nickname: c.nickname || null,
352
+ remark: c.remark || null,
353
+ type: typeof c.type === "number" ? c.type : null,
354
+ },
355
+ };
356
+ }
357
+ }
358
+ }
359
+
189
360
  normalize(raw) {
190
361
  if (!raw || !raw.payload) {
191
362
  throw new Error("WeChatPcAdapter.normalize: payload missing");
@@ -282,6 +453,9 @@ function normalizeMessage(p, raw, ingestedAt) {
282
453
  isSend,
283
454
  isGroup,
284
455
  wechatType: typeof p.type === "number" ? p.type : null,
456
+ wechatSource: typeof p.wechatSource === "string" ? p.wechatSource : "chat",
457
+ ...(p.appType != null ? { wechatAppType: p.appType } : {}),
458
+ ...(p.appUrl ? { url: p.appUrl } : {}),
285
459
  senderWxid: p.senderWxid || null,
286
460
  contentBlob: typeof p.contentBlob === "string" ? p.contentBlob : null,
287
461
  ...(topics.length ? { topicId: topics[0].id } : {}),
@@ -0,0 +1,112 @@
1
+ "use strict";
2
+
3
+ /**
4
+ * WeChat 4.x collection bridge — invokes the forensics-bridge Python sidecar's
5
+ * `wechat_v4.collect` method (memory key extraction + SQLCipher-4 decryption +
6
+ * Msg_<md5> parsing) and returns the decrypted messages to the node adapter.
7
+ *
8
+ * Why a sidecar: WeChat 4.0 DBs are SQLCipher-4 encrypted with a key cached in
9
+ * Weixin.exe process memory. Recovering it needs ReadProcessMemory (Windows)
10
+ * and AES/PBKDF2 — done in Python (`cryptography`), which also sidesteps the
11
+ * host-node bs3mc ABI problem (the node side never opens the encrypted DB).
12
+ *
13
+ * Resolution (all overridable for tests / packaging):
14
+ * - python exe: opts.pythonExe → env CC_PDH_PYTHON → "python" / "python3"
15
+ * - bridge dir: opts.bridgeDir → env CC_PDH_BRIDGE_DIR → sibling package
16
+ *
17
+ * Returns the sidecar result `{ account, messageCount, dbs, messages }`.
18
+ * Throws a typed Error (code on .code) the adapter maps to a sync failure.
19
+ */
20
+
21
+ const path = require("node:path");
22
+ const { existsSync } = require("node:fs");
23
+
24
/**
 * Pick the directory the sidecar is launched from.
 *
 * Precedence: the explicit argument, then the CC_PDH_BRIDGE_DIR environment
 * variable, then the sibling `personal-data-hub-bridge` package resolved
 * relative to this file (lib/adapters/wechat-pc → up to packages/).
 *
 * @param {string} [explicit] caller-supplied directory
 * @returns {string}
 */
function resolveBridgeDir(explicit) {
  const fromEnv = process.env.CC_PDH_BRIDGE_DIR;
  return (
    explicit ||
    fromEnv ||
    path.resolve(__dirname, "../../../../personal-data-hub-bridge")
  );
}
30
+
31
/**
 * Build the ordered, de-duplicated list of Python executables to try.
 *
 * Order: the explicit argument, the CC_PDH_PYTHON environment variable, then
 * the two generic names — `python` first on Windows, `python3` first
 * elsewhere. Falsy entries are dropped and repeats keep their first position.
 *
 * @param {string} [explicit] caller-supplied interpreter
 * @returns {string[]}
 */
function pythonCandidates(explicit) {
  const onWindows = process.platform === "win32";
  const ordered = [
    explicit,
    process.env.CC_PDH_PYTHON,
    // Windows commonly ships `python`; *nix `python3`. Try both.
    onWindows ? "python" : "python3",
    onWindows ? "python3" : "python",
  ].filter(Boolean);
  return Array.from(new Set(ordered));
}
40
+
41
+ /**
42
+ * @param {object} [opts]
43
+ * @param {number} [opts.limit] max messages
44
+ * @param {string} [opts.key] pre-extracted 64-hex key (skips memory scan)
45
+ * @param {string} [opts.pythonExe]
46
+ * @param {string} [opts.bridgeDir]
47
+ * @param {number} [opts.timeoutMs] collect timeout (default 120s)
48
+ * @param {(msg:object)=>void} [opts.onProgress]
49
+ * @param {object} [opts._supervisorFactory] test seam → returns a SidecarSupervisor-like
50
+ * @returns {Promise<{account:string,messageCount:number,dbs:object[],messages:object[]}>}
51
+ */
52
+ async function collectWeChatV4(opts = {}) {
53
+ const bridgeDir = resolveBridgeDir(opts.bridgeDir);
54
+ const makeSupervisor =
55
+ opts._supervisorFactory ||
56
+ ((command, cwd) => {
57
+ // eslint-disable-next-line global-require
58
+ const { SidecarSupervisor } = require("../../sidecar");
59
+ return new SidecarSupervisor({
60
+ command,
61
+ cwd,
62
+ defaultTimeoutMs: opts.timeoutMs || 120_000,
63
+ healthCheckIntervalMs: 0,
64
+ });
65
+ });
66
+
67
+ if (!opts._supervisorFactory && !existsSync(bridgeDir)) {
68
+ const e = new Error(
69
+ `wechat-pc v4: forensics-bridge not found at ${bridgeDir} (set CC_PDH_BRIDGE_DIR)`,
70
+ );
71
+ e.code = "BRIDGE_NOT_FOUND";
72
+ throw e;
73
+ }
74
+
75
+ const params = {};
76
+ if (Number.isInteger(opts.limit) && opts.limit > 0) params.limit = opts.limit;
77
+ if (opts.key) params.key = opts.key;
78
+
79
+ let lastErr = null;
80
+ for (const py of pythonCandidates(opts.pythonExe)) {
81
+ const command = [py, "-m", "forensics_bridge.ipc_server"];
82
+ const sup = makeSupervisor(command, bridgeDir);
83
+ try {
84
+ await sup.start({ readyTimeoutMs: opts.readyTimeoutMs || 15_000 });
85
+ const result = await sup.invoke("wechat_v4.collect", params, {
86
+ timeoutMs: opts.timeoutMs || 120_000,
87
+ onProgress: opts.onProgress,
88
+ });
89
+ try { await sup.stop(); } catch (_e) { /* best-effort */ }
90
+ return result;
91
+ } catch (err) {
92
+ lastErr = err;
93
+ try { await sup.stop(); } catch (_e) { /* best-effort */ }
94
+ const msg = (err && err.message) || "";
95
+ // Real WeChat-side failures (key/app/db) must surface immediately — the
96
+ // sidecar ran fine, the data just isn't there. Everything else (python
97
+ // missing, wrong python without `cryptography`, import errors, spawn
98
+ // death, handshake timeout) → try the next python candidate.
99
+ const isDataError = /KEY_NOT_FOUND|KEY_VERIFY|APP_NOT|DB_NOT|APP_NOT_RUNNING|EXTRACT_PERMISSION/i.test(msg);
100
+ if (isDataError) throw err;
101
+ // otherwise fall through to the next candidate
102
+ }
103
+ }
104
+ const e = new Error(
105
+ `wechat-pc v4: could not run forensics-bridge sidecar (tried ${pythonCandidates(opts.pythonExe).join(", ")}). ` +
106
+ `Install Python 3.11+ with the 'cryptography' package, or set CC_PDH_PYTHON. Last error: ${lastErr && lastErr.message}`,
107
+ );
108
+ e.code = "SIDECAR_UNAVAILABLE";
109
+ throw e;
110
+ }
111
+
112
+ module.exports = { collectWeChatV4, _internals: { resolveBridgeDir, pythonCandidates } };
package/lib/registry.js CHANGED
@@ -65,6 +65,26 @@ class AdapterRegistry {
65
65
  // depend on it).
66
66
  this.entityResolver = opts.entityResolver || null;
67
67
 
68
+ // ADB one-click readiness (Phase: social platforms). When supplied by the
69
+ // wiring, readiness() treats the named adapters as "collectable via a
70
+ // rooted-phone USB one-click" — flipping their NO_INPUT / DB_NOT_PULLED
71
+ // status to "ready (device connected)" or "ADB_DEVICE_NEEDED" depending on
72
+ // whether a device is currently attached. Keeps the registry generic: the
73
+ // platform list + the actual `adb devices` probe come from the host wiring.
74
+ // opts.adbReadiness = {
75
+ // probe: async () => ({ deviceConnected: boolean, serial?: string }),
76
+ // oneClickNames: Set<string>, // adapter names with an *AdbSync path
77
+ // }
78
+ this._adbReadiness =
79
+ opts.adbReadiness && typeof opts.adbReadiness.probe === "function"
80
+ ? {
81
+ probe: opts.adbReadiness.probe,
82
+ oneClickNames: opts.adbReadiness.oneClickNames instanceof Set
83
+ ? opts.adbReadiness.oneClickNames
84
+ : new Set(opts.adbReadiness.oneClickNames || []),
85
+ }
86
+ : null;
87
+
68
88
  this._adapters = new Map();
69
89
  this._activeSync = null; // name of currently-running adapter, or null
70
90
  }
@@ -158,9 +178,24 @@ class AdapterRegistry {
158
178
  Number.isInteger(opts.timeoutMs) && opts.timeoutMs > 0
159
179
  ? opts.timeoutMs
160
180
  : DEFAULT_READINESS_TIMEOUT_MS;
181
+ // Probe the host's ADB device state ONCE (best-effort) so all ADB
182
+ // one-click adapters share a single `adb devices` call this round.
183
+ let adbState = null;
184
+ if (this._adbReadiness) {
185
+ try {
186
+ adbState = await this._withTimeout(
187
+ Promise.resolve().then(() => this._adbReadiness.probe()),
188
+ timeoutMs,
189
+ "adb-probe"
190
+ );
191
+ } catch (_e) {
192
+ adbState = { deviceConnected: false };
193
+ }
194
+ }
195
+
161
196
  const reports = [];
162
197
  for (const adapter of this._adapters.values()) {
163
- const report = await this._probeReadiness(adapter, timeoutMs);
198
+ const report = await this._probeReadiness(adapter, timeoutMs, adbState);
164
199
  // Attach the step-by-step import guide (how to get this source's data
165
200
  // into the vault) keyed off the resolved category. Single source of
166
201
  // truth in adapter-guide.js — reused by every shell.
@@ -170,7 +205,7 @@ class AdapterRegistry {
170
205
  return reports;
171
206
  }
172
207
 
173
- async _probeReadiness(adapter, timeoutMs) {
208
+ async _probeReadiness(adapter, timeoutMs, adbState) {
174
209
  const dd = adapter.dataDisclosure || {};
175
210
  const extractMode = adapter.extractMode || "web-api";
176
211
  const base = {
@@ -240,6 +275,47 @@ class AdapterRegistry {
240
275
  }
241
276
 
242
277
  const reason = (auth && auth.reason) || "UNKNOWN";
278
+
279
+ // ADB one-click platforms (social): the adapter itself has no snapshot yet
280
+ // (NO_INPUT / INPUT_PATH_REQUIRED / DB_NOT_PULLED), but the platform CAN be
281
+ // collected in one click from a rooted phone over USB. Reflect the real
282
+ // device state instead of the misleading "采集需先在手机 App 内…".
283
+ if (
284
+ this._adbReadiness &&
285
+ this._adbReadiness.oneClickNames.has(adapter.name) &&
286
+ (reason === "NO_INPUT" || reason === "INPUT_PATH_REQUIRED" || reason === "DB_NOT_PULLED")
287
+ ) {
288
+ if (adbState && adbState.deviceConnected) {
289
+ return {
290
+ ...base,
291
+ ready: true,
292
+ status: "ready",
293
+ category: "device",
294
+ reason: null,
295
+ message: "已连接 root 手机,点「一键采集」即可拉取",
296
+ actionHint: null,
297
+ mode: "adb-oneclick",
298
+ lastSyncedAt,
299
+ lastStatus,
300
+ lastError,
301
+ };
302
+ }
303
+ const adbDesc = describeReadiness("ADB_DEVICE_NEEDED");
304
+ return {
305
+ ...base,
306
+ ready: false,
307
+ status: adbDesc.status,
308
+ category: adbDesc.category,
309
+ reason: "ADB_DEVICE_NEEDED",
310
+ message: adbDesc.message,
311
+ actionHint: adbDesc.actionHint,
312
+ mode: null,
313
+ lastSyncedAt,
314
+ lastStatus,
315
+ lastError,
316
+ };
317
+ }
318
+
243
319
  const desc = describeReadiness(reason);
244
320
  const detail = auth && (auth.message || auth.error);
245
321
  const message =
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@chainlesschain/personal-data-hub",
3
- "version": "0.4.0",
3
+ "version": "0.4.2",
4
4
  "description": "Personal Data Hub — UnifiedSchema + validators + KG ingest helpers for the data-back-to-the-individual middleware",
5
5
  "type": "commonjs",
6
6
  "main": "lib/index.js",