@chainlesschain/personal-data-hub 0.4.7 → 0.4.23
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/__tests__/adapters/biz-tianyancha.test.js +159 -0
- package/__tests__/adapters/doc-baidu-netdisk.test.js +102 -0
- package/__tests__/adapters/doc-camscanner.test.js +147 -0
- package/__tests__/adapters/doc-platforms.test.js +177 -0
- package/__tests__/adapters/gov-ixiamen.test.js +150 -0
- package/__tests__/adapters/gov-tax.test.js +135 -0
- package/__tests__/adapters/health-meiyou.test.js +125 -0
- package/__tests__/adapters/music-kugou.test.js +187 -0
- package/__tests__/adapters/recruit-boss.test.js +180 -0
- package/__tests__/adapters/shopping-dianping.test.js +239 -0
- package/__tests__/adapters/social-csdn.test.js +175 -0
- package/__tests__/adapters/social-dongchedi.test.js +165 -0
- package/__tests__/adapters/social-zhihu.test.js +246 -0
- package/__tests__/adapters/travel-ctrip.test.js +175 -1
- package/__tests__/adapters/travel-didi.test.js +204 -0
- package/__tests__/adapters/travel-tongcheng.test.js +289 -0
- package/__tests__/adapters/video-platforms.test.js +152 -0
- package/__tests__/adapters/video-xigua.test.js +106 -0
- package/__tests__/adapters/wework-pc.test.js +124 -0
- package/lib/adapter-guide.js +25 -3
- package/lib/adapters/_document-base.js +370 -0
- package/lib/adapters/_video-base.js +331 -0
- package/lib/adapters/biz-tianyancha/index.js +348 -0
- package/lib/adapters/doc-baidu-netdisk/index.js +91 -0
- package/lib/adapters/doc-camscanner/index.js +102 -0
- package/lib/adapters/doc-tencent-docs/index.js +94 -0
- package/lib/adapters/doc-wps/index.js +77 -0
- package/lib/adapters/gov-ixiamen/index.js +380 -0
- package/lib/adapters/gov-tax/index.js +451 -0
- package/lib/adapters/health-meiyou/index.js +393 -0
- package/lib/adapters/music-kugou/index.js +418 -0
- package/lib/adapters/recruit-boss/index.js +442 -0
- package/lib/adapters/shopping-dianping/index.js +473 -0
- package/lib/adapters/social-csdn/index.js +444 -0
- package/lib/adapters/social-dongchedi/index.js +360 -0
- package/lib/adapters/social-zhihu/index.js +488 -0
- package/lib/adapters/travel-ctrip/index.js +255 -40
- package/lib/adapters/travel-didi/index.js +327 -0
- package/lib/adapters/travel-tongcheng/index.js +393 -0
- package/lib/adapters/video-iqiyi/index.js +75 -0
- package/lib/adapters/video-tencent/index.js +78 -0
- package/lib/adapters/video-xigua/index.js +68 -0
- package/lib/adapters/wework-pc/index.js +31 -0
- package/lib/index.js +40 -0
- package/package.json +1 -1
|
@@ -0,0 +1,331 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* _video-base — shared infrastructure for "video watch-history" adapters
|
|
3
|
+
* (爱奇艺 / 腾讯视频 / etc.), Phase 13+ §12.1 (ROI ⭐⭐ each).
|
|
4
|
+
*
|
|
5
|
+
* These platforms expose the same shape of personal data: a paginated list of
|
|
6
|
+
* videos the user watched (观看记录) + optionally favourited/追剧 (收藏). Rather
|
|
7
|
+
* than copy ~300 lines per platform (mirroring _document-base / shopping-base /
|
|
8
|
+
* travel-base), `createVideoAdapter(config)` returns a fully-formed adapter
|
|
9
|
+
* class with snapshot + cookie-api modes; each platform supplies only its
|
|
10
|
+
* endpoints + field mapping.
|
|
11
|
+
*
|
|
12
|
+
* 1. snapshot mode (opts.inputPath): JSON schemaVersion 1, stateless.
|
|
13
|
+
* 2. cookie-api mode (opts.account.cookies): fetch watch / favourite lists via
|
|
14
|
+
* the injected `fetchFn` (Android in-APK cc → OkHttp; desktop hub →
|
|
15
|
+
* Electron WebView net request), paginate. A sign seam (opts.signProvider)
|
|
16
|
+
* covers anti-bot tokens; best-effort unsigned. Endpoints overridable via
|
|
17
|
+
* opts.watchUrl / opts.favouriteUrl (best-effort, not field-verified —
|
|
18
|
+
* FAMILY-23 playbook).
|
|
19
|
+
*
|
|
20
|
+
* normalize() emits, per item: a MEDIA event (watch) or LIKE event (favourite)
|
|
21
|
+
* + a MEDIA item, mirroring netease-music / music-kugou so the vault can both
|
|
22
|
+
* timeline "我看了 X" and list the video entity.
|
|
23
|
+
*
|
|
24
|
+
* Snapshot schema (schemaVersion 1):
|
|
25
|
+
* {
|
|
26
|
+
* "schemaVersion": 1, "snapshottedAt": <ms>,
|
|
27
|
+
* "account": { "userId": "...", "name": "..." },
|
|
28
|
+
* "events": [
|
|
29
|
+
* { "kind": "watch", "id": "...", "videoId": "...", "title": "...",
|
|
30
|
+
* "category": "movie|tv|variety|anime|...", "episode": "...",
|
|
31
|
+
* "channel": "...", "durationSec": N, "capturedAt": <s|ms> },
|
|
32
|
+
* { "kind": "favourite", "id": "...", "videoId": "...", "title": "...",
|
|
33
|
+
* "category": "...", "capturedAt": <ms> }
|
|
34
|
+
* ]
|
|
35
|
+
* }
|
|
36
|
+
*/
|
|
37
|
+
|
|
38
|
+
"use strict";
|
|
39
|
+
|
|
40
|
+
const fs = require("node:fs");
|
|
41
|
+
const { newId } = require("../ids");
|
|
42
|
+
const {
|
|
43
|
+
ENTITY_TYPES,
|
|
44
|
+
EVENT_SUBTYPES,
|
|
45
|
+
ITEM_SUBTYPES,
|
|
46
|
+
CAPTURED_BY,
|
|
47
|
+
} = require("../constants");
|
|
48
|
+
|
|
49
|
+
/** The only snapshot `schemaVersion` that snapshot mode accepts. */
const SNAPSHOT_SCHEMA_VERSION = 1;

/** Raw-record kind for a watch-history entry. */
const KIND_WATCH = "watch";

/** Raw-record kind for a favourite entry. */
const KIND_FAVOURITE = "favourite";

/** Kinds a snapshot event may carry; events with any other kind are skipped. */
const VALID_SNAPSHOT_KINDS = Object.freeze([KIND_WATCH, KIND_FAVOURITE]);

/** Page size sent to cookie-api endpoints; a shorter page ends pagination. */
const PAGE_SIZE = 30;
|
|
54
|
+
|
|
55
|
+
/**
 * Coerce a timestamp-like value to epoch milliseconds.
 *
 * Numbers and digit-only strings go through a magnitude heuristic:
 *   - >= 1e12        : already milliseconds, returned unchanged
 *   - >= 1e9, < 1e12 : treated as seconds and multiplied by 1000
 *   - < 1e9          : returned unchanged
 * Any other string is handed to `Date.parse`.
 *
 * @param {*} v number, numeric string, date string, or anything else
 * @returns {number|null} epoch milliseconds, or null when `v` is neither a
 *   finite number nor a parseable string
 */
function parseTime(v) {
  // Shared by the number and digit-string paths. The lower bound of the
  // millisecond range is inclusive: exactly 1e12 is a millisecond value and
  // must not be scaled as if it were seconds.
  const toMillis = (n) => {
    if (n >= 1e12) return n;
    if (n >= 1e9) return n * 1000;
    return n;
  };

  if (Number.isFinite(v)) return toMillis(v);
  if (typeof v !== "string") return null;
  if (/^\d+$/.test(v)) return toMillis(parseInt(v, 10));

  const parsed = Date.parse(v);
  return Number.isFinite(parsed) ? parsed : null;
}
|
|
67
|
+
|
|
68
|
+
/**
 * Build an adapter class for one video platform from its endpoints and field
 * mapping.
 *
 * @param {object} config
 * @param {string} config.NAME          adapter name, e.g. "video-iqiyi"
 * @param {string} config.VERSION       adapter version, copied into `source.adapterVersion`
 * @param {string} config.platform      platform key, e.g. "iqiyi"; prefixes originalId and item ids
 * @param {string} config.watchUrl      best-effort watch-history endpoint
 * @param {string} config.favouriteUrl  best-effort favourite / followed-series endpoint
 * @param {(resp:any)=>any[]} config.extractItems  returns the item array of one fetched page
 * @param {(raw:any)=>object|null} config.mapItem  maps one raw item to a VideoRecord
 *   VideoRecord = { videoId, title, category, episode, channel, durationSec, url, occurredAt? }
 * @returns {Function} the generated `VideoAdapter` class
 */
function createVideoAdapter(config) {
  const { NAME, VERSION, platform, watchUrl, favouriteUrl, extractItems, mapItem } = config;
  // NOTE(review): required when the factory runs rather than at module load —
  // presumably to avoid a require cycle with shopping-base; confirm before hoisting.
  const { CookieAuth } = require("./shopping-base");

  /**
   * Build the `<platform>:<kind>:<id>` identifier of a raw record.
   * Non-empty strings and finite numbers are used as the id; anything else
   * falls back to a time+random `unknown-…` token, which is NOT reproducible
   * across runs.
   */
  function stableOriginalId(kind, id) {
    const safe =
      (typeof id === "string" && id.length > 0 && id) ||
      (typeof id === "number" && Number.isFinite(id) && String(id)) ||
      `unknown-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`;
    return `${platform}:${kind}:${safe}`;
  }

  class VideoAdapter {
    /**
     * @param {object} [opts]
     * @param {object} [opts.account]        when it carries `cookies`, cookie-api mode is enabled
     * @param {Function} [opts.fetchFn]      called with `{ url, cookies, query, sign }`; defaults to a function that throws
     * @param {Function} [opts.signProvider] called with `{ url, query, cookies }` before each fetch
     * @param {string} [opts.watchUrl]       overrides `config.watchUrl`
     * @param {string} [opts.favouriteUrl]   overrides `config.favouriteUrl`
     */
    constructor(opts = {}) {
      this.account = opts.account || null;
      this._cookieAuth =
        opts.account && opts.account.cookies
          ? new CookieAuth({ platform, cookies: opts.account.cookies })
          : null;
      this._fetchFn = typeof opts.fetchFn === "function" ? opts.fetchFn : defaultFetch;
      this._signProvider =
        typeof opts.signProvider === "function" ? opts.signProvider : null;
      this._urls = {
        watch: opts.watchUrl || watchUrl,
        favourite: opts.favouriteUrl || favouriteUrl,
      };

      this.name = NAME;
      this.version = VERSION;
      this.capabilities = ["sync:snapshot", "sync:cookie-api", `parse:${platform}-watch`, `parse:${platform}-favourite`];
      this.extractMode = "web-api";
      this.rateLimits = {};
      this.dataDisclosure = {
        fields: [`${platform}:watch (title / category / episode / channel)`, `${platform}:favourite (title / category)`],
        sensitivity: "low",
        legalGate: false,
        defaultInclude: { watch: true, favourite: true },
      };
      // fs is held on the instance so it can be swapped out (e.g. by tests).
      this._deps = { fs };
    }

    /**
     * Check that one of the two modes is usable.
     * A non-empty `ctx.inputPath` selects snapshot mode and is only checked
     * for readability; otherwise the cookie jar, if any, is validated.
     *
     * @param {object} [ctx]
     * @returns {Promise<object>} `{ ok: true, mode, ... }` or `{ ok: false, reason, ... }`
     */
    async authenticate(ctx = {}) {
      if (ctx && typeof ctx.inputPath === "string" && ctx.inputPath.length > 0) {
        try {
          this._deps.fs.accessSync(ctx.inputPath, this._deps.fs.constants.R_OK);
        } catch (err) {
          return { ok: false, reason: "INPUT_PATH_UNREADABLE", message: `snapshot not readable at ${ctx.inputPath}: ${err.message}` };
        }
        return { ok: true, mode: "snapshot-file" };
      }
      if (this._cookieAuth) {
        const ok = await this._cookieAuth.validate();
        if (!ok) return { ok: false, reason: "INVALID_COOKIE", error: "cookies missing" };
        return { ok: true, account: (this.account && this.account.userId) || null, mode: "cookie" };
      }
      return {
        ok: false,
        reason: "NO_INPUT",
        message: `${NAME}.authenticate: needs opts.inputPath (snapshot mode) OR opts.account.cookies (cookie-api mode)`,
      };
    }

    /**
     * Report adapter health. With cookies configured this re-runs
     * `authenticate()`; without them it always reports ok.
     *
     * @returns {Promise<object>} `{ ok: true, lastChecked }` or `{ ok: false, reason, error }`
     */
    async healthCheck() {
      if (this._cookieAuth) {
        const r = await this.authenticate();
        return r.ok ? { ok: true, lastChecked: Date.now() } : { ok: false, reason: r.reason, error: r.error };
      }
      return { ok: true, lastChecked: Date.now() };
    }

    /**
     * Yield raw records. Snapshot mode wins when `opts.inputPath` is a
     * non-empty string; otherwise cookie-api mode is used if cookies exist.
     *
     * @param {object} [opts] `inputPath`, `include`, `limit`, `maxPages`
     * @throws {Error} when neither mode is available
     */
    async *sync(opts = {}) {
      if (typeof opts.inputPath === "string" && opts.inputPath.length > 0) {
        yield* this._syncViaSnapshot(opts);
        return;
      }
      if (this._cookieAuth) {
        yield* this._syncViaCookie(opts);
        return;
      }
      throw new Error(`${NAME}.sync: needs opts.inputPath (snapshot mode) OR opts.account.cookies (cookie-api mode)`);
    }

    /**
     * Read a schemaVersion-1 snapshot file and yield one raw record per
     * valid, included event, up to `opts.limit`.
     *
     * @throws {Error} on unreadable file, invalid JSON, or schemaVersion mismatch
     */
    async *_syncViaSnapshot(opts) {
      const raw = this._deps.fs.readFileSync(opts.inputPath, "utf-8");
      const snapshot = JSON.parse(raw);
      if (!snapshot || typeof snapshot !== "object" || snapshot.schemaVersion !== SNAPSHOT_SCHEMA_VERSION) {
        throw new Error(
          `${NAME}.sync: snapshot schemaVersion mismatch (got ${snapshot && snapshot.schemaVersion}, expected ${SNAPSHOT_SCHEMA_VERSION})`,
        );
      }
      // Events without a usable capturedAt inherit the snapshot time (or now).
      const fallback =
        Number.isFinite(snapshot.snapshottedAt) && snapshot.snapshottedAt > 0
          ? Math.floor(snapshot.snapshottedAt)
          : Date.now();
      const account = snapshot.account && typeof snapshot.account === "object" ? snapshot.account : null;
      const include = opts.include || {};
      const limit = Number.isInteger(opts.limit) && opts.limit > 0 ? opts.limit : Infinity;
      const events = Array.isArray(snapshot.events) ? snapshot.events : [];
      let emitted = 0;
      for (const ev of events) {
        if (emitted >= limit) return;
        if (!ev || typeof ev !== "object" || !VALID_SNAPSHOT_KINDS.includes(ev.kind)) continue;
        if (include[ev.kind] === false) continue;
        // String id first, then videoId (unchanged precedence). A numeric id is
        // the last resort so such events get a reproducible originalId instead
        // of the random `unknown-…` token.
        const id =
          (typeof ev.id === "string" && ev.id) ||
          ev.videoId ||
          (typeof ev.id === "number" ? ev.id : null);
        yield {
          adapter: NAME,
          kind: ev.kind,
          originalId: stableOriginalId(ev.kind, id),
          capturedAt: parseTime(ev.capturedAt) || fallback,
          payload: { record: snapshotEventToRecord(ev), kind: ev.kind, account },
        };
        emitted += 1;
      }
    }

    /**
     * Page through the watch and favourite endpoints and yield one raw record
     * per mapped item, up to `opts.limit` records and `opts.maxPages` pages
     * per endpoint (default 10). Yields nothing when the cookies fail
     * validation.
     */
    async *_syncViaCookie(opts = {}) {
      if (!(await this._cookieAuth.validate())) return;
      const cookies = this._cookieAuth.toHeader();
      const include = opts.include || {};
      const limit = Number.isInteger(opts.limit) && opts.limit > 0 ? opts.limit : Infinity;
      const maxPages = Number.isInteger(opts.maxPages) && opts.maxPages > 0 ? opts.maxPages : 10;

      const plan = [
        { kind: KIND_WATCH, url: this._urls.watch },
        { kind: KIND_FAVOURITE, url: this._urls.favourite },
      ];

      let emitted = 0;
      for (const step of plan) {
        if (include[step.kind] === false) continue;
        if (!step.url) continue;
        let page = 1;
        while (page <= maxPages) {
          // Once the cap is reached nothing more can be emitted, so stop
          // before signing or fetching another page (or starting the next
          // endpoint) instead of spending a request whose result is discarded.
          if (emitted >= limit) return;
          const query = { page, pageSize: PAGE_SIZE };
          let sign = null;
          if (this._signProvider) {
            sign = await this._signProvider({ url: step.url, query, cookies });
          }
          const resp = await this._fetchFn({ url: step.url, cookies, query, sign });
          const items = extractItems(resp) || [];
          if (!items.length) break;
          for (const it of items) {
            const rec = mapItem(it);
            if (!rec || !rec.videoId) continue;
            if (emitted >= limit) return;
            yield {
              adapter: NAME,
              kind: step.kind,
              originalId: stableOriginalId(step.kind, rec.videoId),
              capturedAt: rec.occurredAt || Date.now(),
              payload: { record: rec, kind: step.kind },
            };
            emitted += 1;
          }
          // A short page means the endpoint has no further pages.
          if (items.length < PAGE_SIZE) break;
          page += 1;
        }
      }
    }

    /**
     * Convert one raw record into vault entities: favourites become LIKE
     * events, every other kind a MEDIA event; both come with a MEDIA item.
     *
     * @param {object} raw a record yielded by `sync()`
     * @throws {Error} when `raw.payload.record` is missing
     */
    normalize(raw) {
      if (!raw || !raw.payload || !raw.payload.record) {
        throw new Error(`${NAME}.normalize: payload.record missing`);
      }
      const kind = raw.kind || raw.payload.kind;
      const subtype = kind === KIND_FAVOURITE ? EVENT_SUBTYPES.LIKE : EVENT_SUBTYPES.MEDIA;
      const verb = kind === KIND_FAVOURITE ? "收藏" : "观看";
      return normalizeVideoRecord(raw.payload.record, raw, platform, NAME, VERSION, subtype, verb);
    }
  }

  return VideoAdapter;
}
|
|
252
|
+
|
|
253
|
+
/**
 * Convert one snapshot event into the VideoRecord shape used by normalize().
 * Missing text fields become null, except `title` (placeholder string) and
 * `videoId` (falls back to the event id, then the literal "unknown").
 *
 * @param {object} ev snapshot event
 * @returns {object} `{ videoId, title, category, episode, channel, durationSec, url, occurredAt }`
 */
function snapshotEventToRecord(ev) {
  const rawId = ev.videoId || ev.id || "unknown";
  const hasDuration = Number.isFinite(ev.durationSec);

  return {
    videoId: String(rawId),
    title: ev.title || "(未知视频)",
    category: ev.category || ev.type || null,
    episode: ev.episode || null,
    channel: ev.channel || ev.uploader || null,
    durationSec: hasDuration ? ev.durationSec : null,
    url: ev.url || null,
    occurredAt: parseTime(ev.capturedAt),
  };
}
|
|
265
|
+
|
|
266
|
+
/**
 * Turn one VideoRecord into vault entities: a single event plus a single
 * MEDIA item that the event points at through `extra.itemRef`.
 *
 * @param {object} rec      VideoRecord (`videoId`, `title`, `category`, `episode`, `channel`, `durationSec`, `url`, `occurredAt`)
 * @param {object} raw      raw sync record; `originalId` and `capturedAt` are read from it
 * @param {string} platform platform key used in the item id and `extra.platform`
 * @param {string} NAME     adapter name recorded in `source.adapter`
 * @param {string} VERSION  adapter version recorded in `source.adapterVersion`
 * @param {string} subtype  event subtype to stamp on the event
 * @param {string} verb     prefix of the event title
 * @returns {{events: object[], items: object[], persons: [], places: [], topics: []}}
 */
function normalizeVideoRecord(rec, raw, platform, NAME, VERSION, subtype, verb) {
  const ingestedAt = Date.now();
  // Time precedence: the record's own time, then the raw capture time, then now.
  const occurredAt = rec.occurredAt || raw.capturedAt || ingestedAt;
  const title = rec.title || "(未知视频)";
  const itemId = `item-${platform}-video-${rec.videoId}`;
  const category = rec.category || null;
  const channel = rec.channel || null;

  // One provenance object, shared by reference between the event and the item.
  const source = {
    adapter: NAME,
    adapterVersion: VERSION,
    originalId: raw.originalId,
    capturedAt: raw.capturedAt || occurredAt,
    capturedBy: CAPTURED_BY.API,
  };

  let heading = `${verb}: ${title}`;
  if (rec.episode) {
    heading += ` ${rec.episode}`;
  }

  const event = {
    id: newId(),
    type: ENTITY_TYPES.EVENT,
    subtype,
    occurredAt,
    actor: "person-self",
    content: { title: heading, text: title },
    ingestedAt,
    source,
    extra: {
      platform,
      videoId: rec.videoId,
      category,
      episode: rec.episode || null,
      channel,
      durationSec: rec.durationSec != null ? rec.durationSec : null,
      url: rec.url || null,
      itemRef: itemId,
    },
  };

  const item = {
    id: itemId,
    type: ENTITY_TYPES.ITEM,
    subtype: ITEM_SUBTYPES.MEDIA,
    name: title,
    ingestedAt,
    source,
    extra: { platform, kind: "video", videoId: rec.videoId, category, channel },
  };

  return {
    events: [event],
    items: [item],
    persons: [],
    places: [],
    topics: [],
  };
}
|
|
318
|
+
|
|
319
|
+
/**
 * Placeholder `fetchFn` used when the caller supplies none: cookie-api mode
 * cannot work without an injected fetch, so the returned promise always rejects.
 *
 * @param {object} _opts ignored
 * @returns {Promise<never>}
 */
async function defaultFetch(_opts) {
  const reason = "video-base: no fetchFn configured for cookie-api mode";
  throw new Error(reason);
}
|
|
322
|
+
|
|
323
|
+
module.exports = {
|
|
324
|
+
createVideoAdapter,
|
|
325
|
+
normalizeVideoRecord,
|
|
326
|
+
parseTime,
|
|
327
|
+
SNAPSHOT_SCHEMA_VERSION,
|
|
328
|
+
KIND_WATCH,
|
|
329
|
+
KIND_FAVOURITE,
|
|
330
|
+
VALID_SNAPSHOT_KINDS,
|
|
331
|
+
};
|
|
@@ -0,0 +1,348 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* §A14 — 天眼查 (Tianyancha, com.tianyancha.skyeye) adapter, dual-mode
|
|
3
|
+
* (snapshot + cookie-api). Phase 13+ §12.1 line-780 ROI ⭐⭐ "自查公司关联".
|
|
4
|
+
*
|
|
5
|
+
* 天眼查 personal data = the user's business-intelligence footprint: companies
|
|
6
|
+
* they monitor/关注 and their company-search history. Both are interest signals →
|
|
7
|
+
* monitor maps to a LIKE event (关注某公司), search to an INTERACTION event
|
|
8
|
+
* (搜索某公司). Mirrors the social-dongchedi two-mode shape; events-only (a
|
|
9
|
+
* company is not a Person, and a lightweight event keeps the vault model simple).
|
|
10
|
+
*
|
|
11
|
+
* 1. snapshot mode (opts.inputPath): JSON schemaVersion 1, stateless.
|
|
12
|
+
* 2. cookie-api mode (opts.account.cookies): fetch monitor list + search
|
|
13
|
+
* history from tianyancha.com via the injected `fetchFn`, paginate; a sign
|
|
14
|
+
* seam (opts.signProvider) covers tianyancha's signed-request header (auth /
|
|
15
|
+
* version token); best-effort unsigned when absent. Endpoints overridable
|
|
16
|
+
* via opts.monitorUrl / opts.searchUrl (best-effort, not field-verified —
|
|
17
|
+
* FAMILY-23 playbook). account OPTIONAL — the cookie carries identity.
|
|
18
|
+
*
|
|
19
|
+
* Snapshot schema (schemaVersion 1):
|
|
20
|
+
* {
|
|
21
|
+
* "schemaVersion": 1, "snapshottedAt": <ms>,
|
|
22
|
+
* "account": { "userId": "...", "name": "..." },
|
|
23
|
+
* "events": [
|
|
24
|
+
* { "kind": "monitor", "id": "mon-<gid>", "companyId": "...", "companyName": "...",
|
|
25
|
+
* "legalPerson": "...", "regStatus": "...", "capturedAt": <ms> },
|
|
26
|
+
* { "kind": "search", "id": "search-<id>", "query": "...", "companyName": "...",
|
|
27
|
+
* "capturedAt": <ms> }
|
|
28
|
+
* ]
|
|
29
|
+
* }
|
|
30
|
+
*/
|
|
31
|
+
|
|
32
|
+
"use strict";
|
|
33
|
+
|
|
34
|
+
const fs = require("node:fs");
|
|
35
|
+
const { newId } = require("../../ids");
|
|
36
|
+
const { ENTITY_TYPES, EVENT_SUBTYPES, CAPTURED_BY } = require("../../constants");
|
|
37
|
+
const { CookieAuth } = require("../shopping-base");
|
|
38
|
+
|
|
39
|
+
/** Adapter name, stamped on raw records and on `source.adapter`. */
const NAME = "biz-tianyancha";

/** Adapter version, stamped on `source.adapterVersion`. */
const VERSION = "0.1.0";

/** The only snapshot `schemaVersion` that snapshot mode accepts. */
const SNAPSHOT_SCHEMA_VERSION = 1;

/** Raw-record kind for a monitored company. */
const KIND_MONITOR = "monitor";

/** Raw-record kind for a company-search history entry. */
const KIND_SEARCH = "search";

/** Kinds a snapshot event may carry; events with any other kind are skipped. */
const VALID_SNAPSHOT_KINDS = Object.freeze([KIND_MONITOR, KIND_SEARCH]);

// Best-effort tianyancha.com endpoints. Overridable via opts.*Url.
const MONITOR_URL = "https://capi.tianyancha.com/cloud-monitor-app/monitor/list";
const SEARCH_URL = "https://capi.tianyancha.com/cloud-search-app/search/history";

/** Page size sent to cookie-api endpoints; a shorter page ends pagination. */
const PAGE_SIZE = 20;
|
|
51
|
+
|
|
52
|
+
/**
 * Coerce a timestamp-like value to epoch milliseconds.
 *
 * Numbers and digit-only strings go through a magnitude heuristic:
 *   - >= 1e12        : already milliseconds, returned unchanged
 *   - >= 1e9, < 1e12 : treated as seconds and multiplied by 1000
 *   - < 1e9          : returned unchanged
 * Any other string is handed to `Date.parse`.
 *
 * @param {*} v number, numeric string, date string, or anything else
 * @returns {number|null} epoch milliseconds, or null when `v` is neither a
 *   finite number nor a parseable string
 */
function parseTime(v) {
  // Shared by the number and digit-string paths. The lower bound of the
  // millisecond range is inclusive: exactly 1e12 is a millisecond value and
  // must not be scaled as if it were seconds.
  const toMillis = (n) => {
    if (n >= 1e12) return n;
    if (n >= 1e9) return n * 1000;
    return n;
  };

  if (Number.isFinite(v)) return toMillis(v);
  if (typeof v !== "string") return null;
  if (/^\d+$/.test(v)) return toMillis(parseInt(v, 10));

  const parsed = Date.parse(v);
  return Number.isFinite(parsed) ? parsed : null;
}
|
|
64
|
+
|
|
65
|
+
/**
 * Build the `tianyancha:<kind>:<id>` identifier of a raw record.
 * Non-empty strings and finite numbers are used as the id; anything else
 * falls back to a time+random `unknown-…` token, which is not reproducible
 * across runs.
 *
 * @param {string} kind record kind
 * @param {*} id candidate identifier
 * @returns {string}
 */
function stableOriginalId(kind, id) {
  let suffix;
  if (typeof id === "string" && id.length > 0) {
    suffix = id;
  } else if (typeof id === "number" && Number.isFinite(id)) {
    suffix = String(id);
  } else {
    suffix = `unknown-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`;
  }
  return `tianyancha:${kind}:${suffix}`;
}
|
|
72
|
+
|
|
73
|
+
/**
 * Tianyancha adapter with two modes: a stateless snapshot file
 * (`opts.inputPath` on sync) or the cookie-api (`opts.account.cookies` on
 * construction). Monitored companies normalize to LIKE events and search
 * history to INTERACTION events.
 */
class TianyanchaAdapter {
  /**
   * @param {object} [opts]
   * @param {object} [opts.account]        when it carries `cookies`, cookie-api mode is enabled
   * @param {Function} [opts.fetchFn]      called with `{ url, cookies, query, sign }`; defaults to a function that throws
   * @param {Function} [opts.signProvider] called with `{ url, query, cookies }` before each fetch
   * @param {string} [opts.monitorUrl]     overrides the default monitor-list endpoint
   * @param {string} [opts.searchUrl]      overrides the default search-history endpoint
   */
  constructor(opts = {}) {
    this.account = opts.account || null;
    this._cookieAuth =
      opts.account && opts.account.cookies
        ? new CookieAuth({ platform: "tianyancha", cookies: opts.account.cookies })
        : null;
    this._fetchFn = typeof opts.fetchFn === "function" ? opts.fetchFn : defaultFetch;
    this._signProvider =
      typeof opts.signProvider === "function" ? opts.signProvider : null;
    this._urls = {
      monitor: opts.monitorUrl || MONITOR_URL,
      search: opts.searchUrl || SEARCH_URL,
    };

    this.name = NAME;
    this.version = VERSION;
    this.capabilities = [
      "sync:snapshot",
      "sync:cookie-api",
      "parse:tianyancha-monitor",
      "parse:tianyancha-search",
    ];
    this.extractMode = "web-api";
    this.rateLimits = { perMinute: 8, perDay: 200 };
    this.dataDisclosure = {
      fields: [
        "tianyancha:monitor (companyName / legalPerson / regStatus)",
        "tianyancha:search (query / companyName)",
      ],
      sensitivity: "medium",
      legalGate: false,
      defaultInclude: { monitor: true, search: true },
    };

    // fs is held on the instance so it can be swapped out (e.g. by tests).
    this._deps = { fs };
  }

  /**
   * Check that one of the two modes is usable.
   * A non-empty `ctx.inputPath` selects snapshot mode and is only checked for
   * readability; otherwise the cookie jar, if any, is validated.
   *
   * @param {object} [ctx]
   * @returns {Promise<object>} `{ ok: true, mode, ... }` or `{ ok: false, reason, ... }`
   */
  async authenticate(ctx = {}) {
    if (ctx && typeof ctx.inputPath === "string" && ctx.inputPath.length > 0) {
      try {
        this._deps.fs.accessSync(ctx.inputPath, this._deps.fs.constants.R_OK);
      } catch (err) {
        return {
          ok: false,
          reason: "INPUT_PATH_UNREADABLE",
          message: `snapshot not readable at ${ctx.inputPath}: ${err.message}`,
        };
      }
      return { ok: true, mode: "snapshot-file" };
    }
    if (this._cookieAuth) {
      const ok = await this._cookieAuth.validate();
      if (!ok) return { ok: false, reason: "INVALID_COOKIE", error: "cookies missing" };
      return { ok: true, account: (this.account && this.account.userId) || null, mode: "cookie" };
    }
    return {
      ok: false,
      reason: "NO_INPUT",
      message:
        "biz-tianyancha.authenticate: needs opts.inputPath (snapshot mode) OR opts.account.cookies (cookie-api mode)",
    };
  }

  /**
   * Report adapter health. With cookies configured this re-runs
   * `authenticate()`; without them it always reports ok.
   *
   * @returns {Promise<object>} `{ ok: true, lastChecked }` or `{ ok: false, reason, error }`
   */
  async healthCheck() {
    if (this._cookieAuth) {
      const r = await this.authenticate();
      return r.ok ? { ok: true, lastChecked: Date.now() } : { ok: false, reason: r.reason, error: r.error };
    }
    return { ok: true, lastChecked: Date.now() };
  }

  /**
   * Yield raw records. Snapshot mode wins when `opts.inputPath` is a
   * non-empty string; otherwise cookie-api mode is used if cookies exist.
   *
   * @param {object} [opts] `inputPath`, `include`, `limit`, `maxPages`
   * @throws {Error} when neither mode is available
   */
  async *sync(opts = {}) {
    if (typeof opts.inputPath === "string" && opts.inputPath.length > 0) {
      yield* this._syncViaSnapshot(opts);
      return;
    }
    if (this._cookieAuth) {
      yield* this._syncViaCookie(opts);
      return;
    }
    throw new Error(
      "biz-tianyancha.sync: needs opts.inputPath (snapshot mode) OR opts.account.cookies (cookie-api mode)",
    );
  }

  /**
   * Read a schemaVersion-1 snapshot file and yield one raw record per valid,
   * included event, up to `opts.limit`. The event's own fields are spread
   * into the payload together with the snapshot account.
   *
   * @throws {Error} on unreadable file, invalid JSON, or schemaVersion mismatch
   */
  async *_syncViaSnapshot(opts) {
    const raw = this._deps.fs.readFileSync(opts.inputPath, "utf-8");
    const snapshot = JSON.parse(raw);
    if (!snapshot || typeof snapshot !== "object" || snapshot.schemaVersion !== SNAPSHOT_SCHEMA_VERSION) {
      throw new Error(
        `biz-tianyancha.sync: snapshot schemaVersion mismatch (got ${snapshot && snapshot.schemaVersion}, expected ${SNAPSHOT_SCHEMA_VERSION})`,
      );
    }
    // Events without a usable capturedAt inherit the snapshot time (or now).
    const fallback =
      Number.isFinite(snapshot.snapshottedAt) && snapshot.snapshottedAt > 0
        ? Math.floor(snapshot.snapshottedAt)
        : Date.now();
    const account = snapshot.account && typeof snapshot.account === "object" ? snapshot.account : null;
    const include = opts.include || {};
    const limit = Number.isInteger(opts.limit) && opts.limit > 0 ? opts.limit : Infinity;
    const events = Array.isArray(snapshot.events) ? snapshot.events : [];
    let emitted = 0;
    for (const ev of events) {
      if (emitted >= limit) return;
      if (!ev || typeof ev !== "object" || !VALID_SNAPSHOT_KINDS.includes(ev.kind)) continue;
      if (include[ev.kind] === false) continue;
      // String id first, then companyId, then query (unchanged precedence).
      // A numeric id is the last resort so such events get a reproducible
      // originalId instead of the random `unknown-…` token.
      const id =
        (typeof ev.id === "string" && ev.id) ||
        ev.companyId ||
        ev.query ||
        (typeof ev.id === "number" ? ev.id : null);
      yield {
        adapter: NAME,
        kind: ev.kind,
        originalId: stableOriginalId(ev.kind, id),
        capturedAt: parseTime(ev.capturedAt) || fallback,
        payload: { ...ev, account },
      };
      emitted += 1;
    }
  }

  /**
   * Page through the monitor and search endpoints and yield one raw record
   * per object item, up to `opts.limit` records and `opts.maxPages` pages per
   * endpoint (default 10). Yields nothing when the cookies fail validation.
   */
  async *_syncViaCookie(opts = {}) {
    if (!(await this._cookieAuth.validate())) return;
    const cookies = this._cookieAuth.toHeader();
    const include = opts.include || {};
    const limit = Number.isInteger(opts.limit) && opts.limit > 0 ? opts.limit : Infinity;
    const maxPages = Number.isInteger(opts.maxPages) && opts.maxPages > 0 ? opts.maxPages : 10;

    const plan = [
      { kind: KIND_MONITOR, url: this._urls.monitor, idOf: (it) => it.graphId || it.companyId || it.id },
      { kind: KIND_SEARCH, url: this._urls.search, idOf: (it) => it.id || it.keyword || it.word },
    ];

    let emitted = 0;
    for (const step of plan) {
      if (include[step.kind] === false) continue;
      let pageNum = 1;
      while (pageNum <= maxPages) {
        // Once the cap is reached nothing more can be emitted, so stop before
        // signing or fetching another page (or starting the next endpoint)
        // instead of spending a rate-limited request whose result is discarded.
        if (emitted >= limit) return;
        const query = { pageNum, pageSize: PAGE_SIZE };
        let sign = null;
        if (this._signProvider) {
          sign = await this._signProvider({ url: step.url, query, cookies });
        }
        const resp = await this._fetchFn({ url: step.url, cookies, query, sign });
        const items = extractData(resp);
        if (!items.length) break;
        for (const it of items) {
          if (!it || typeof it !== "object") continue;
          if (emitted >= limit) return;
          yield {
            adapter: NAME,
            kind: step.kind,
            originalId: stableOriginalId(step.kind, step.idOf(it)),
            capturedAt: parseTime(it.createTime || it.monitorTime || it.searchTime || it.gmtCreate) || Date.now(),
            // `cookie: true` tells the normalizers to read fields from `item`.
            payload: { item: it, kind: step.kind, cookie: true },
          };
          emitted += 1;
        }
        // A short page means the endpoint has no further pages.
        if (items.length < PAGE_SIZE) break;
        pageNum += 1;
      }
    }
  }

  /**
   * Convert one raw record into vault entities, dispatching on its kind.
   *
   * @param {object} raw a record yielded by `sync()`
   * @throws {Error} when the payload is missing or the kind is unknown
   */
  normalize(raw) {
    if (!raw || !raw.payload) throw new Error("TianyanchaAdapter.normalize: payload missing");
    const ingestedAt = Date.now();
    const kind = raw.kind || raw.payload.kind;
    if (kind === KIND_MONITOR) return normalizeMonitor(raw, ingestedAt);
    if (kind === KIND_SEARCH) return normalizeSearch(raw, ingestedAt);
    throw new Error(`TianyanchaAdapter.normalize: unknown kind ${kind}`);
  }
}
|
|
244
|
+
|
|
245
|
+
// ─── cookie response helpers ─────────────────────────────────────────────────
|
|
246
|
+
|
|
247
|
+
/**
 * Locate the item array inside a cookie-api response.
 * Tried in order: `resp.data` (when itself an array), `resp.list`, then
 * `list` / `items` / `resultList` / `records` under an object-valued `resp.data`.
 *
 * @param {*} resp decoded response
 * @returns {Array} the first array found, or an empty array
 */
function extractData(resp) {
  if (!resp || typeof resp !== "object") return [];

  const { data, list } = resp;
  if (Array.isArray(data)) return data;
  if (Array.isArray(list)) return list;
  if (!data || typeof data !== "object") return [];

  for (const key of ["list", "items", "resultList", "records"]) {
    const candidate = data[key];
    if (Array.isArray(candidate)) return candidate;
  }
  return [];
}
|
|
260
|
+
|
|
261
|
+
// ─── per-kind normalizers (snapshot fields OR cookie payload.item) ────────────
|
|
262
|
+
|
|
263
|
+
/**
 * Build the provenance (`source`) object attached to a normalized event.
 *
 * @param {object} raw raw sync record; `originalId` and `capturedAt` are read from it
 * @param {number} occurredAt used as `capturedAt` when the raw record has none
 * @returns {object} `{ adapter, adapterVersion, originalId, capturedAt, capturedBy }`
 */
function buildSource(raw, occurredAt) {
  const { originalId, capturedAt } = raw;
  return {
    adapter: NAME,
    adapterVersion: VERSION,
    originalId,
    capturedAt: capturedAt || occurredAt,
    capturedBy: CAPTURED_BY.API,
  };
}
|
|
272
|
+
|
|
273
|
+
/**
 * Normalize a "monitor" raw record into a single LIKE event.
 * Fields come from `payload.item` for cookie-api records (`payload.cookie`
 * set) and from the payload itself for snapshot records.
 *
 * @param {object} raw raw sync record
 * @param {number} ingestedAt ingestion time, also the last-resort event time
 * @returns {{events: object[], persons: [], places: [], items: [], topics: []}}
 */
function normalizeMonitor(raw, ingestedAt) {
  const payload = raw.payload;
  const entry = payload.cookie ? payload.item : payload;

  const company = entry.companyName || entry.name || entry.company || "";
  const rawTime = entry.capturedAt || entry.createTime || entry.monitorTime || raw.capturedAt;
  const occurredAt = parseTime(rawTime) || ingestedAt;
  const source = buildSource(raw, occurredAt);
  const rawCompanyId = entry.companyId || entry.graphId || entry.id;

  const event = {
    id: newId(),
    type: ENTITY_TYPES.EVENT,
    subtype: EVENT_SUBTYPES.LIKE,
    occurredAt,
    actor: "person-self",
    content: { title: `关注公司: ${company}`.trim(), text: company },
    ingestedAt,
    source,
    extra: {
      platform: "tianyancha",
      companyId: rawCompanyId != null ? String(rawCompanyId) : null,
      companyName: company || null,
      legalPerson: entry.legalPerson || entry.legalPersonName || null,
      regStatus: entry.regStatus || entry.status || null,
    },
  };

  return {
    events: [event],
    persons: [],
    places: [],
    items: [],
    topics: [],
  };
}
|
|
305
|
+
|
|
306
|
+
/**
 * Normalize a "search" raw record into a single INTERACTION event.
 * Fields come from `payload.item` for cookie-api records (`payload.cookie`
 * set) and from the payload itself for snapshot records.
 *
 * @param {object} raw raw sync record
 * @param {number} ingestedAt ingestion time, also the last-resort event time
 * @returns {{events: object[], persons: [], places: [], items: [], topics: []}}
 */
function normalizeSearch(raw, ingestedAt) {
  const payload = raw.payload;
  const entry = payload.cookie ? payload.item : payload;

  const searched = entry.query || entry.keyword || entry.word || entry.companyName || "";
  const rawTime = entry.capturedAt || entry.searchTime || entry.createTime || raw.capturedAt;
  const occurredAt = parseTime(rawTime) || ingestedAt;
  const source = buildSource(raw, occurredAt);

  const event = {
    id: newId(),
    type: ENTITY_TYPES.EVENT,
    subtype: EVENT_SUBTYPES.INTERACTION,
    occurredAt,
    actor: "person-self",
    content: { title: `搜索企业: ${searched}`.trim(), text: searched },
    ingestedAt,
    source,
    extra: {
      platform: "tianyancha",
      query: searched || null,
      companyName: entry.companyName || null,
    },
  };

  return {
    events: [event],
    persons: [],
    places: [],
    items: [],
    topics: [],
  };
}
|
|
336
|
+
|
|
337
|
+
/**
 * Placeholder `fetchFn` used when the caller supplies none: cookie-api mode
 * cannot work without an injected fetch, so the returned promise always rejects.
 *
 * @param {object} _opts ignored
 * @returns {Promise<never>}
 */
async function defaultFetch(_opts) {
  const reason = "biz-tianyancha: no fetchFn configured for cookie-api mode";
  throw new Error(reason);
}
|
|
340
|
+
|
|
341
|
+
module.exports = {
|
|
342
|
+
TianyanchaAdapter,
|
|
343
|
+
extractData,
|
|
344
|
+
NAME,
|
|
345
|
+
VERSION,
|
|
346
|
+
SNAPSHOT_SCHEMA_VERSION,
|
|
347
|
+
VALID_SNAPSHOT_KINDS,
|
|
348
|
+
};
|