@chainlesschain/personal-data-hub 0.4.35 → 0.4.36
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
/**
|
|
3
|
+
* qzone-collect — QQ空间 (Qzone) collector core: 说说 / 留言板 / 相册 → vault events.
|
|
4
|
+
*
|
|
5
|
+
* Qzone has NO local browsable DB (the QQNT databases only cache per-contact
|
|
6
|
+
* "latest feed" preview snippets), so this is the API path: Qzone CGI endpoints
|
|
7
|
+
* authed with the account's qzone-domain `p_skey` + `uin` + a `g_tk` token
|
|
8
|
+
* derived from p_skey (the bkn hash). Pure Node — the only side effect is the
|
|
9
|
+
* caller-supplied `fetchImpl` (defaults to global fetch), so the parsers are
|
|
10
|
+
* unit-testable and the same core runs on PC (`cc hub collect-qzone --cookie`)
|
|
11
|
+
* and in-APK (the Android app captures the cookie via a WebView and feeds it in).
|
|
12
|
+
*
|
|
13
|
+
* Cookie note: the base `.qq.com` skey is rejected by Qzone ("请先登录空间") —
|
|
14
|
+
* the qzone-domain `p_skey` is required (a browser login to user.qzone.qq.com,
|
|
15
|
+
* or the in-app WebView, yields it). Extracted from
|
|
16
|
+
* scripts/android/pdh-qzone-collect.mjs (behaviour identical).
|
|
17
|
+
*/
|
|
18
|
+
/** Vault person id used as the actor for the account owner's own Qzone items. */
const SELF_ID = 'person-qq-self';

/** User-Agent header value sent with every Qzone CGI request. */
const UA = 'Mozilla/5.0 (Linux; Android 13) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Mobile Safari/537.36';

/**
 * Build the `source` provenance record attached to every emitted entity.
 *
 * @param {string} originalId Adapter-scoped identifier of the captured item.
 * @param {number} [at] Capture timestamp in ms; any falsy value (including 0)
 *   falls back to the current time.
 * @returns {{adapter: string, adapterVersion: string, originalId: string, capturedAt: number, capturedBy: string}}
 */
const SRC = (originalId, at) => {
  const capturedAt = at || Date.now();
  return { adapter: 'qzone', adapterVersion: '0.1.0', originalId, capturedAt, capturedBy: 'api' };
};
|
|
21
|
+
|
|
22
|
+
/**
 * Qzone bkn/g_tk hash over p_skey (or skey).
 *
 * Starts from 5381 and, for each UTF-16 code unit, adds `(h << 5) + codeUnit`
 * to the running value; the result is masked to 31 bits.
 *
 * @param {*} s Key material; coerced with `String()`.
 * @returns {number} Non-negative 31-bit hash.
 */
function gtk(s) {
  const key = String(s);
  let hash = 5381;
  let idx = 0;
  while (idx < key.length) {
    // `<<` truncates its operand to int32 before shifting, exactly as in the
    // one-line form; the sum itself is an ordinary double.
    hash += (hash << 5) + key.charCodeAt(idx);
    idx += 1;
  }
  return hash & 0x7fffffff;
}
|
|
24
|
+
|
|
25
|
+
/**
 * Parse a `Cookie`-style string (`a=1; b=2`) into a plain object.
 *
 * Parts are split on `;` plus optional whitespace. A part is kept only when it
 * contains `=` after at least one character; name and value are trimmed, and
 * everything after the first `=` belongs to the value.
 *
 * @param {*} s Cookie string; coerced with `String()`.
 * @returns {Object<string, string>} Cookie name to value.
 */
function parseCookieStr(s) {
  const jar = {};
  String(s).split(/;\s*/).forEach((pair) => {
    const eq = pair.indexOf('=');
    if (eq <= 0) return; // no '=' at all, or an empty name
    jar[pair.slice(0, eq).trim()] = pair.slice(eq + 1).trim();
  });
  return jar;
}
|
|
26
|
+
/**
 * Serialize a cookie object back into a `Cookie` request-header value.
 *
 * @param {Object<string, *>} ck Cookie name to value.
 * @returns {string} `name=value` pairs joined with `'; '`.
 */
function cookieHeader(ck) {
  const pairs = [];
  for (const [name, value] of Object.entries(ck)) {
    pairs.push(`${name}=${value}`);
  }
  return pairs.join('; ');
}
|
|
27
|
+
function stripHtml(s) {
|
|
28
|
+
return String(s || '')
|
|
29
|
+
.replace(/<img[^>]*>/gi, '')
|
|
30
|
+
.replace(/<[^>]+>/g, '')
|
|
31
|
+
.replace(/ /g, ' ').replace(/&/g, '&').replace(/</g, '<').replace(/>/g, '>').replace(/"/g, '"')
|
|
32
|
+
.replace(/\s+/g, ' ').trim();
|
|
33
|
+
}
|
|
34
|
+
/**
 * Convert a `YYYY-MM-DD HH:mm:ss` (or `T`-separated) timestamp, interpreted in
 * UTC+08:00, to epoch milliseconds.
 *
 * @param {*} s Timestamp text; anything after the seconds field is ignored.
 * @returns {number} Epoch ms, or 0 when the text does not match or the date is invalid.
 */
function beijingMs(s) {
  const parts = String(s || '').match(/^(\d{4})-(\d{2})-(\d{2})[ T](\d{2}):(\d{2}):(\d{2})/);
  if (parts === null) return 0;
  const [, year, month, day, hour, minute, second] = parts;
  const ms = Date.parse(`${year}-${month}-${day}T${hour}:${minute}:${second}+08:00`);
  return ms || 0; // NaN (impossible date) becomes 0
}
|
|
35
|
+
/**
 * Strip a JSONP wrapper (`callback( ... );`) from a response body.
 *
 * The leading `identifier(` and the trailing `)` / `);` are removed
 * independently of each other; plain JSON passes through trimmed.
 *
 * @param {*} text Response body; coerced with `String()`.
 * @returns {string} The payload between the wrapper parts.
 */
function unwrap(text) {
  const trimmed = String(text).trim();
  const withoutCallback = trimmed.replace(/^[\w$]+\(/, '');
  return withoutCallback.replace(/\);?\s*$/, '');
}
|
|
36
|
+
|
|
37
|
+
// ── 说说 (emotion_cgi_msglist_v6) → EVENT(post) ─────────────────────────────
|
|
38
|
+
/**
 * Parse an `emotion_cgi_msglist_v6` response (JSON or JSONP) into post events.
 *
 * @param {string} text Raw response body; a JSONP wrapper is removed via `unwrap`.
 * @returns {{code: number, message?: string, events: object[], total?: number}}
 *   `code` is -1 when the body is not a JSON object, the server's own code
 *   when it is present and non-zero, and 0 otherwise. `total` is taken from
 *   `total` or `result.total` when the response carries one.
 */
function parseQzoneFeed(text) {
  let json;
  try { json = JSON.parse(unwrap(text)); } catch { return { code: -1, events: [] }; }
  // Valid JSON that is not an object (`null`, a number, a string) cannot carry
  // a feed; report it as unparseable instead of throwing on property access.
  if (json === null || typeof json !== 'object') return { code: -1, events: [] };
  if (json.code !== undefined && json.code !== 0) return { code: json.code, message: json.message, events: [] };
  const list = json.msglist || (json.result && json.result.msglist) || [];
  const events = [];
  for (const it of list) {
    const tid = it.tid || it.t1_tid || it.cellid;
    const occurredAt = (Number(it.created_time) || 0) * 1000; // created_time is scaled from seconds to ms
    if (!tid || !occurredAt) continue;
    // Coerce before `.replace`: a non-string `content` would otherwise throw.
    const txt = String(it.content || it.summary || '').replace(/\s+/g, ' ').trim();
    const pics = Array.isArray(it.pic) ? it.pic.length : 0;
    if (!txt && !pics) continue; // nothing to show for this item
    events.push({
      type: 'event', subtype: 'post', id: `qzone:${tid}`,
      occurredAt, actor: SELF_ID, participants: [SELF_ID],
      content: { title: (txt || '[图片] 我的说说').slice(0, 80), text: txt || undefined },
      source: SRC(`qzone-${tid}`, occurredAt),
      extra: { kind: 'qzone-shuoshuo', tid, mediaCount: pics, cmtnum: it.cmtnum || 0, secret: !!it.secret },
      ingestedAt: Date.now(),
    });
  }
  return { code: 0, events, total: json.total != null ? json.total : (json.result && json.result.total) };
}
|
|
61
|
+
|
|
62
|
+
// ── 留言板 (get_msgb) → EVENT(message) by the commenter ────────────────────
|
|
63
|
+
/**
 * Parse a `get_msgb` (guestbook) response into message events plus the
 * contact records of the commenters.
 *
 * @param {string} text Raw response body; a JSONP wrapper is removed via `unwrap`.
 * @returns {{code: number, message?: string, events: object[], persons: object[], total?: number}}
 *   `code` is -1 when the body is not a JSON object, the response's `code`
 *   when it is anything other than 0, and 0 on success.
 */
function parseGuestbook(text) {
  let json;
  try { json = JSON.parse(unwrap(text)); } catch { return { code: -1, events: [], persons: [] }; }
  // Valid JSON that is not an object (`null`, a number, a string) would throw
  // on `json.code`; treat it like any other unparseable body.
  if (json === null || typeof json !== 'object') return { code: -1, events: [], persons: [] };
  if (json.code !== 0) return { code: json.code, message: json.message, events: [], persons: [] };
  const list = (json.data && json.data.commentList) || [];
  const events = [];
  const persons = new Map();
  for (const c of list) {
    const id = c.id;
    const occurredAt = beijingMs(c.pubtime);
    const txt = stripHtml(c.htmlContent || c.content || '');
    if (!id || !occurredAt || !txt) continue;
    const fromUin = String(c.uin || '');
    const fromNick = c.nickname || fromUin;
    // A comment without a uin is attributed to the account owner.
    const actor = fromUin ? `person-qq-${fromUin}` : SELF_ID;
    if (fromUin && !persons.has(actor)) persons.set(actor, { type: 'person', subtype: 'contact', id: actor, names: fromNick !== fromUin ? [fromNick, fromUin] : [fromUin], identifiers: { qqUin: fromUin }, source: SRC(actor), ingestedAt: Date.now() });
    events.push({
      type: 'event', subtype: 'message', id: `qzone-msgb:${id}`,
      occurredAt, actor, participants: [actor, SELF_ID],
      content: { title: txt.slice(0, 80), text: txt },
      source: SRC(`qzone-msgb-${id}`, occurredAt),
      extra: { kind: 'qzone-guestbook', fromUin, fromNick },
      ingestedAt: Date.now(),
    });
  }
  return { code: 0, events, persons: [...persons.values()], total: json.data && json.data.total };
}
|
|
87
|
+
|
|
88
|
+
// ── 相册 (fcg_list_album_v3) → EVENT(media) per album ──────────────────────
|
|
89
|
+
/**
 * Parse an `fcg_list_album_v3` response into one media event per album.
 *
 * @param {string} text Raw response body; a JSONP wrapper is removed via `unwrap`.
 * @returns {{code: number, message?: string, events: object[], total?: number}}
 *   `code` is -1 when the body is not a JSON object, the response's `code`
 *   when it is anything other than 0, and 0 on success. `total` is
 *   `data.albumsInUser` when present, otherwise the length of the album list.
 */
function parseAlbums(text) {
  let json;
  try { json = JSON.parse(unwrap(text)); } catch { return { code: -1, events: [] }; }
  // Valid JSON that is not an object (`null`, a number, a string) would throw
  // on `json.code`; treat it like any other unparseable body.
  if (json === null || typeof json !== 'object') return { code: -1, events: [] };
  if (json.code !== 0) return { code: json.code, message: json.message, events: [] };
  const list = (json.data && json.data.albumList) || [];
  const events = [];
  for (const a of list) {
    if (!a.id) continue;
    const occurredAt = (Number(a.createtime) || 0) * 1000; // createtime is scaled from seconds to ms
    const name = a.name || '(相册)';
    events.push({
      type: 'event', subtype: 'media', id: `qzone-album:${a.id}`,
      // Albums without a usable creation time are stamped with the current time.
      occurredAt: occurredAt || Date.now(), actor: SELF_ID, participants: [SELF_ID],
      content: { title: `相册:${name}(${a.total || 0} 张)`, text: a.desc || undefined },
      source: SRC(`qzone-album-${a.id}`, occurredAt),
      extra: { kind: 'qzone-album', albumId: a.id, photoCount: a.total || 0, desc: a.desc || '', commentCount: a.comment || 0 },
      ingestedAt: Date.now(),
    });
  }
  return { code: 0, events, total: (json.data && json.data.albumsInUser) != null ? json.data.albumsInUser : list.length };
}
|
|
109
|
+
|
|
110
|
+
/**
 * Build a `user.qzone.qq.com/proxy/domain/...` URL for a Qzone CGI endpoint.
 *
 * A fixed set of default query parameters is merged with `params` (caller
 * values win); values are percent-encoded, keys are emitted verbatim.
 *
 * @param {string} domainPath Target host and path, e.g. `host/cgi-bin/name`.
 * @param {Object<string, *>} [params] Endpoint-specific query parameters.
 * @returns {string} Absolute request URL.
 */
function qproxy(domainPath, params) {
  const merged = { format: 'json', inCharset: 'utf-8', outCharset: 'utf-8', source: 'qzone', plat: 'qzone', ...params };
  const pairs = [];
  for (const key of Object.keys(merged)) {
    pairs.push(`${key}=${encodeURIComponent(merged[key])}`);
  }
  return `https://user.qzone.qq.com/proxy/domain/${domainPath}?${pairs.join('&')}`;
}
|
|
114
|
+
|
|
115
|
+
/**
 * Collect Qzone data into a vault batch. `fetchImpl(url, opts)` is injectable
 * (defaults to global fetch) so this is testable offline and runs in-APK.
 *
 * @param {object} [options]
 * @param {string|number} [options.uin] Account uin; when empty it is taken from
 *   the cookie's `uin` / `p_uin` (non-digits and leading zeros are stripped).
 * @param {string|Object<string, string>} [options.cookie] Cookie header string
 *   or an already-parsed cookie object; must provide `p_skey` (or `skey`).
 * @param {string[]|string} [options.what] Sections to collect: any of
 *   `shuoshuo`, `msgb`, `album`, as an array or a comma-separated string.
 * @param {number} [options.max] Upper bound on the number of events collected
 *   per paginated section (`shuoshuo`, `msgb`). The album list is fetched in a
 *   single request and is not limited by it.
 * @param {Function} [options.fetchImpl] `fetch`-compatible function; it may
 *   resolve to a Response-like object with `text()` or directly to the body.
 * @returns {Promise<{ok, uin, events, persons, counts, reason?}>}
 * @throws {Error} When no fetch implementation is available.
 */
async function collectQzone({ uin, cookie, what = ['shuoshuo'], max = 500, fetchImpl } = {}) {
  const PAGE_SIZE = 20;
  const ck = typeof cookie === 'string' ? parseCookieStr(cookie) : (cookie || {});
  // QQ uin cookies are `o0<uin>` — strip the o/0 prefix (uins never have leading zeros).
  const cleanUin = (s) => String(s || '').replace(/\D/g, '').replace(/^0+/, '');
  uin = cleanUin(uin) || cleanUin(ck.uin) || cleanUin(ck.p_uin);
  const pskey = ck.p_skey || ck.skey;
  if (!uin || !pskey) return { ok: false, reason: 'missing uin or p_skey', events: [], persons: [], counts: {} };
  const _fetch = fetchImpl || (typeof fetch !== 'undefined' ? fetch : null);
  if (!_fetch) throw new Error('qzone collect: no fetch implementation available');
  const wantSet = new Set(Array.isArray(what) ? what : String(what).split(',').map((s) => s.trim()));
  const g = gtk(pskey);
  const headers = { Cookie: cookieHeader(ck), Referer: `https://user.qzone.qq.com/${uin}`, 'User-Agent': UA };
  const get = async (url) => { const r = await _fetch(url, { headers }); return typeof r.text === 'function' ? r.text() : r; };

  const events = [];
  const persons = new Map();
  const counts = {};

  if (wantSet.has('shuoshuo')) {
    let n = 0;
    for (let pos = 0; pos < max; pos += PAGE_SIZE) {
      const r = parseQzoneFeed(await get(qproxy('taotao.qq.com/cgi-bin/emotion_cgi_msglist_v6', { uin, hostUin: uin, num: PAGE_SIZE, pos, g_tk: g, need_private_comment: 1 })));
      if (r.code !== 0 || !r.events.length) break;
      // Pages are always requested whole; trim the last one so that `max` is a
      // real cap even when it is not a multiple of the page size.
      const page = r.events.slice(0, max - n);
      events.push(...page);
      n += page.length;
      if (r.total != null && n >= r.total) break;
    }
    counts.shuoshuo = n;
  }
  if (wantSet.has('msgb')) {
    let n = 0;
    let total = null;
    for (let start = 0; start < max; start += PAGE_SIZE) {
      const r = parseGuestbook(await get(qproxy('m.qzone.qq.com/cgi-bin/new/get_msgb', { uin, hostUin: uin, num: PAGE_SIZE, start, g_tk: g })));
      if (r.code !== 0) break;
      total = r.total;
      if (!r.events.length) break;
      const page = r.events.slice(0, max - n);
      events.push(...page);
      // Only keep contacts that authored a retained message.
      const keptActors = new Set(page.map((e) => e.actor));
      for (const p of r.persons) if (keptActors.has(p.id)) persons.set(p.id, p);
      n += page.length;
      if (total != null && n >= total) break;
    }
    counts.msgb = n;
  }
  if (wantSet.has('album')) {
    const r = parseAlbums(await get(qproxy('photo.qzone.qq.com/fcgi-bin/fcg_list_album_v3', { g_tk: g, hostUin: uin, uin, mode: 2, pageStart: 0, pageNum: 200 })));
    if (r.code === 0) { events.push(...r.events); counts.album = r.events.length; }
    else counts.album = 0;
  }

  return { ok: true, uin, events, persons: [...persons.values()], counts };
}
|
|
167
|
+
|
|
168
|
+
module.exports = { gtk, parseCookieStr, stripHtml, parseQzoneFeed, parseGuestbook, parseAlbums, collectQzone, SELF_ID };
|
|
@@ -171,4 +171,102 @@ function parseEvents(Database, dbPath, _self) {
|
|
|
171
171
|
return { events, persons: [...persons.values()], topics: [...topics.values()] };
|
|
172
172
|
}
|
|
173
173
|
|
|
174
|
-
|
|
174
|
+
// ── 朋友圈 (SnsMicroMsg.db, PLAINTEXT — no SQLCipher) ───────────────────────
|
|
175
|
+
// Unlike EnMicroMsg.db, SnsMicroMsg.db is NOT encrypted (header = "SQLite
|
|
176
|
+
// format 3\0"), so it opens directly. SnsInfo.content is a protobuf
|
|
177
|
+
// TimelineObject: the post text is top-level field 5 (contentDesc), media are
|
|
178
|
+
// qpic.cn URLs embedded in the blob, and the poster nickname lives in attrBuf.
|
|
179
|
+
// SnsInfo.createTime is epoch SECONDS. Verified on chopin (WeChat 8.0.74):
|
|
180
|
+
// account 60e2c317… had 2931 posts (2623 with text) readable without any key.
|
|
181
|
+
/**
 * Read one base-128 varint from `buf` starting at `pos`.
 *
 * Each byte contributes its low 7 bits, least-significant group first; a byte
 * with the high bit clear ends the value. Reading also stops at the end of the
 * buffer, in which case the bits gathered so far are returned.
 *
 * @param {Uint8Array} buf Bytes to read from.
 * @param {number} pos Offset of the first varint byte.
 * @returns {[bigint, number]} The decoded value and the offset just past it.
 */
function _pbReadVarint(buf, pos) {
  let value = 0n;
  let cursor = pos;
  for (let bits = 0n; cursor < buf.length; bits += 7n) {
    const byte = buf[cursor];
    cursor += 1;
    value |= BigInt(byte & 0x7f) << bits;
    if ((byte & 0x80) === 0) break; // continuation bit clear: last byte
  }
  return [value, cursor];
}
|
|
191
|
+
/**
 * Walk top-level protobuf fields → { fieldNum: [Buffer|BigInt, …] }.
 *
 * Best-effort: parsing stops at the first malformed or truncated field and
 * whatever was decoded up to that point is returned. Varint fields (wire type
 * 0) are returned as BigInt; length-delimited (2), fixed64 (1) and fixed32 (5)
 * values are returned as slices that share memory with `buf`. Any other wire
 * type ends the walk.
 *
 * @param {Buffer} buf Serialized message.
 * @returns {Object<number, Array<Buffer|bigint>>} Values grouped by field number, in encounter order.
 */
function _pbFields(buf) {
  const out = {};
  let pos = 0;
  while (pos < buf.length) {
    let tag;
    [tag, pos] = _pbReadVarint(buf, pos);
    const field = Number(tag >> 3n);
    const wire = Number(tag & 7n);
    if (field === 0) break; // field number 0 is not valid; treat as garbage
    let val;
    if (wire === 0) {
      [val, pos] = _pbReadVarint(buf, pos);
    } else if (wire === 2) {
      let len;
      [len, pos] = _pbReadVarint(buf, pos);
      len = Number(len);
      if (len < 0 || pos + len > buf.length) break;
      val = buf.subarray(pos, pos + len);
      pos += len;
    } else if (wire === 1 || wire === 5) {
      const width = wire === 1 ? 8 : 4;
      // Same rule as the length-delimited branch: a value that runs past the
      // end of the buffer is dropped instead of being emitted as a short slice.
      if (pos + width > buf.length) break;
      val = buf.subarray(pos, pos + width);
      pos += width;
    } else {
      break;
    }
    (out[field] ||= []).push(val);
  }
  return out;
}
|
|
210
|
+
/**
 * Extract the post text from a serialized SnsInfo `content` blob.
 *
 * Takes the first value of top-level field 5, decodes it as UTF-8 and trims
 * it. Any failure while walking or decoding is swallowed.
 *
 * @param {Buffer} contentBuf Serialized message bytes.
 * @returns {string} The trimmed text, or '' when absent, blank or undecodable.
 */
function snsPostText(contentBuf) {
  let text = '';
  try {
    const desc = _pbFields(contentBuf)[5];
    if (desc && desc.length) text = desc[0].toString('utf8').trim();
  } catch { /* not a TimelineObject */ }
  return text;
}
|
|
214
|
+
/**
 * Collect the distinct `qpic.cn` URLs embedded anywhere in a content blob.
 *
 * The buffer is decoded as latin1, so every byte maps to exactly one
 * character, and scanned with a URL pattern; duplicates are dropped and
 * first-seen order is kept.
 *
 * @param {Buffer} contentBuf Raw blob bytes.
 * @returns {string[]} Unique matching URLs.
 */
function snsMediaUrls(contentBuf) {
  const haystack = contentBuf.toString('latin1');
  const pattern = /https?:\/\/[A-Za-z0-9._-]*qpic\.cn[A-Za-z0-9._\-/?=&%]+/g;
  const hits = haystack.match(pattern) || [];
  return Array.from(new Set(hits));
}
|
|
220
|
+
/**
 * Heuristically pick a poster nickname out of an SnsInfo `attrBuf` blob.
 *
 * Every Buffer value among the top-level fields is decoded as UTF-8; the first
 * one that is non-empty, differs from `wxid`, does not start with `wxid_`,
 * contains a CJK or ASCII letter, is at most 40 characters long and has no
 * control characters in \x00-\x08 is returned. Failures are swallowed.
 *
 * @param {Buffer} attrBuf Serialized attribute blob.
 * @param {string} wxid The poster's id, excluded from the candidates.
 * @returns {string} The first plausible nickname, or ''.
 */
function snsNickname(attrBuf, wxid) {
  const looksLikeName = (s) => Boolean(s)
    && s !== wxid
    && !/^wxid_/.test(s)
    && /[一-鿿A-Za-z]/.test(s)
    && s.length <= 40
    && !/[\x00-\x08]/.test(s);
  try {
    const candidates = Object.values(_pbFields(attrBuf)).flat();
    for (const v of candidates) {
      if (!Buffer.isBuffer(v)) continue; // varint fields are BigInt, never names
      const s = v.toString('utf8');
      if (looksLikeName(s)) return s;
    }
  } catch { /* ignore */ }
  return '';
}
|
|
224
|
+
|
|
225
|
+
/**
 * Parse a PLAINTEXT SnsMicroMsg.db → 朋友圈 (Moments) vault batch { events, persons, topics }.
 * Each SnsInfo row → EVENT(post) attributed to the poster. `selfWxid` (optional)
 * maps the user's own posts to SELF_ID; `nameMap` (wxid → displayName, e.g. from
 * the matching account's decrypted rcontact) overrides attrBuf nicknames.
 *
 * At most the 5000 most recent rows (by createTime) are read. Rows without a
 * userName, without text and media, or without a usable createTime are skipped.
 *
 * @param {Function} Database Database constructor, invoked as
 *   `new Database(dbPath, { readonly: true })`; the instance must provide
 *   `prepare(sql).all()` and `close()`.
 * @param {string} dbPath Path of the SnsMicroMsg.db file.
 * @param {object} [options]
 * @param {string} [options.selfWxid] The account owner's wxid.
 * @param {Map<string, string>|Object<string, string>} [options.nameMap] wxid → display name.
 * @returns {{events: object[], persons: object[], topics: object[]}} `topics` is always empty.
 */
function parseSnsEvents(Database, dbPath, { selfWxid, nameMap } = {}) {
  const src = new Database(dbPath, { readonly: true });
  const events = [];
  const persons = new Map();
  const names = nameMap instanceof Map ? nameMap : new Map(Object.entries(nameMap || {}));
  try {
    let rows = [];
    // snsId is selected as TEXT: presumably it is a 64-bit id (confirm against
    // the schema), and a driver that maps INTEGER to a JS number would round
    // values above 2^53, letting distinct posts collapse into one event id.
    try { rows = src.prepare('SELECT CAST(snsId AS TEXT) AS snsId,userName,createTime,type,content,attrBuf FROM SnsInfo ORDER BY createTime DESC LIMIT 5000').all(); }
    catch { return { events: [], persons: [], topics: [] }; } // no SnsInfo table
    for (const r of rows) {
      const wxid = String(r.userName || '');
      if (!wxid) continue;
      const text = r.content ? snsPostText(r.content) : '';
      const media = r.content ? snsMediaUrls(r.content) : [];
      if (!text && !media.length) continue; // skip empty / pure-ad shells
      const occurredAt = (Number(r.createTime) || 0) * 1000; // SnsInfo.createTime is seconds
      if (!occurredAt) continue;
      const isSelf = !!(selfWxid && wxid === selfWxid);
      const nick = names.get(wxid) || (r.attrBuf ? snsNickname(r.attrBuf, wxid) : '') || wxid;
      const actor = isSelf ? SELF_ID : `person-wechat-${wxid}`;
      if (!isSelf && !persons.has(actor)) {
        const nm = nick && nick !== wxid ? [nick, wxid] : [wxid];
        persons.set(actor, { type: 'person', subtype: 'contact', id: actor, names: nm, identifiers: { wechatId: wxid }, source: SRC(actor), ingestedAt: Date.now() });
      }
      const title = (text || `[图片] ${nick}的朋友圈`).replace(/\s+/g, ' ').trim().slice(0, 80);
      events.push({
        type: 'event', subtype: 'post', id: `wechat-sns:${r.snsId}`,
        occurredAt, actor, participants: [actor],
        content: { title: title || '(朋友圈)', text: text || undefined },
        source: SRC(`sns-${r.snsId}`, occurredAt),
        extra: { kind: 'moment', isSelf, poster: nick, mediaCount: media.length, media: media.slice(0, 9) },
        ingestedAt: Date.now(),
      });
    }
    if (selfWxid) persons.set(SELF_ID, { type: 'person', subtype: 'contact', id: SELF_ID, names: ['我(微信)'], source: SRC(SELF_ID), ingestedAt: Date.now() });
  } finally {
    // Runs on every exit path, including the early return above.
    src.close();
  }
  return { events, persons: [...persons.values()], topics: [] };
}
|
|
271
|
+
|
|
272
|
+
module.exports = { computeKeyCandidates, deriveAndDecrypt, parseEvents, parseSnsEvents, snsPostText, snsMediaUrls, snsNickname };
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@chainlesschain/personal-data-hub",
|
|
3
|
-
"version": "0.4.35",
|
|
3
|
+
"version": "0.4.36",
|
|
4
4
|
"description": "Personal Data Hub — UnifiedSchema + validators + KG ingest helpers for the data-back-to-the-individual middleware",
|
|
5
5
|
"type": "commonjs",
|
|
6
6
|
"main": "lib/index.js",
|
|
@@ -76,6 +76,7 @@
|
|
|
76
76
|
"./forensics/salvage-ingest": "./lib/forensics/salvage-ingest.js",
|
|
77
77
|
"./forensics/qq-nt-collect": "./lib/forensics/qq-nt-collect.js",
|
|
78
78
|
"./forensics/wechat-collect": "./lib/forensics/wechat-collect.js",
|
|
79
|
+
"./forensics/qzone-collect": "./lib/forensics/qzone-collect.js",
|
|
79
80
|
"./forensics/plaintext-db-collect": "./lib/forensics/plaintext-db-collect.js"
|
|
80
81
|
},
|
|
81
82
|
"scripts": {
|