benchmark-collector 1.5.2 → 1.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +44 -0
- package/dist/scripts/feige-history-collect.js +375 -0
- package/dist/scripts/pdd-history-collect.js +499 -0
- package/package.json +3 -1
package/README.md
CHANGED
|
@@ -11,6 +11,7 @@
|
|
|
11
11
|
- **Cookie 自动注入** — 从本地 Chrome 提取并解密 Cookie,保留登录态(支持 macOS / Windows / Linux)
|
|
12
12
|
- **对话采集** — 支持 DOM 检测和 API 拦截两种模式采集客服对话消息,内置平台预设选择器自动匹配,消息自动去重
|
|
13
13
|
- **平台预设** — 内置常见客服平台(抖店、拼多多等)的对话容器选择器,无需手动指定
|
|
14
|
+
- **历史会话批量采集** — 内置抖音飞鸽 / 拼多多客服后台的历史会话自动采集脚本,免人工操作即可批量拉取后台数据
|
|
14
15
|
- **长时间录制优化** — 写入节流、原子保存、CDP 自动清理,支持数小时连续采集
|
|
15
16
|
- **增量保存** — 每步操作实时保存,进程异常退出也不丢数据
|
|
16
17
|
- **跨平台** — 支持 macOS / Windows / Linux
|
|
@@ -147,6 +148,49 @@ output/处理退款_2026-03-22T10-00-00/
|
|
|
147
148
|
| `*_dom.html` | 页面 DOM 快照(含 iframe 内容) | DOM-based agent 训练 |
|
|
148
149
|
| `*_conversation.json` | 对话消息(角色 / 内容 / 时间戳 / 附件) | 客服对话训练数据(DOM 模式) |
|
|
149
150
|
|
|
151
|
+
## 历史会话批量采集
|
|
152
|
+
|
|
153
|
+
针对抖音飞鸽(抖店)和拼多多商家后台的历史会话页面,提供两个独立的批量采集脚本,自动操作页面、拦截接口、翻页累积数据。
|
|
154
|
+
|
|
155
|
+
### 抖音飞鸽
|
|
156
|
+
|
|
157
|
+
页面:`https://jsls.jinritemai.com/im/shop/main/data/historyConversation`,接口:`fuzzySearchConversation`。
|
|
158
|
+
|
|
159
|
+
```bash
|
|
160
|
+
# 默认运行(自动注入本地 Chrome Cookie,有头模式)
|
|
161
|
+
npm run feige-history
|
|
162
|
+
|
|
163
|
+
# 自定义参数
|
|
164
|
+
# 参数说明(注意:续行符 "\" 之后不能再写注释,否则命令会在该行被截断):
#   --max-pages  最大翻页数(默认 100)
#   --delay      每次翻页间隔,避免限流(默认 1000ms)
#   --output     输出目录(默认 output)
#   --headless   无头模式
npx ts-node scripts/feige-history-collect.ts \
  --max-pages 100 \
  --delay 1000 \
  --output ./data \
  --headless
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
流程:打开页面 → 选"近30天" → 点击查询 → 拦截 `fuzzySearchConversation` → 自动翻页直到采集全部 → 保存到 `output/feige-history_<timestamp>.json`。
|
|
172
|
+
|
|
173
|
+
### 拼多多商家后台
|
|
174
|
+
|
|
175
|
+
页面:`https://mms.pinduoduo.com/mms-chat/search?msfrom=mms_sidenav`,接口:`getMessages`。
|
|
176
|
+
|
|
177
|
+
```bash
|
|
178
|
+
# 默认运行
|
|
179
|
+
npm run pdd-history
|
|
180
|
+
|
|
181
|
+
# 自定义参数
|
|
182
|
+
# 参数说明(注意:续行符 "\" 之后不能再写注释,否则命令会在该行被截断):
#   --max-accounts   最多采集消费者账号数(默认 500)
#   --max-msg-pages  每个账号最多翻多少页消息(默认 20)
#   --delay          操作间隔(默认 600ms)
#   --output         输出目录(默认 output)
#   --headless       无头模式
npx ts-node scripts/pdd-history-collect.ts \
  --max-accounts 200 \
  --max-msg-pages 20 \
  --delay 600 \
  --output ./data \
  --headless
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
流程:打开页面 → 选"近30天" → 点击查询 → 遍历左侧 `.user-item` 消费者账号列表 → 每个账号点击后遍历右侧消息分页 → 拦截 `getMessages` 累积全部消息 → 保存到 `output/pdd-history_<timestamp>.json`。
|
|
191
|
+
|
|
192
|
+
> **前置条件:** 需要先在本地 Chrome 中登录对应的商家后台,脚本会自动从 Chrome 解密提取 Cookie 完成认证。
|
|
193
|
+
|
|
150
194
|
## 回放与调试
|
|
151
195
|
|
|
152
196
|
```bash
|
|
@@ -0,0 +1,375 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
"use strict";
|
|
3
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
4
|
+
if (k2 === undefined) k2 = k;
|
|
5
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
6
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
7
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
8
|
+
}
|
|
9
|
+
Object.defineProperty(o, k2, desc);
|
|
10
|
+
}) : (function(o, m, k, k2) {
|
|
11
|
+
if (k2 === undefined) k2 = k;
|
|
12
|
+
o[k2] = m[k];
|
|
13
|
+
}));
|
|
14
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
15
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
16
|
+
}) : function(o, v) {
|
|
17
|
+
o["default"] = v;
|
|
18
|
+
});
|
|
19
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
20
|
+
var ownKeys = function(o) {
|
|
21
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
22
|
+
var ar = [];
|
|
23
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
24
|
+
return ar;
|
|
25
|
+
};
|
|
26
|
+
return ownKeys(o);
|
|
27
|
+
};
|
|
28
|
+
return function (mod) {
|
|
29
|
+
if (mod && mod.__esModule) return mod;
|
|
30
|
+
var result = {};
|
|
31
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
32
|
+
__setModuleDefault(result, mod);
|
|
33
|
+
return result;
|
|
34
|
+
};
|
|
35
|
+
})();
|
|
36
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
37
|
+
/**
|
|
38
|
+
* 抖音飞鸽历史会话数据采集脚本
|
|
39
|
+
*
|
|
40
|
+
* 功能:
|
|
41
|
+
* 1. 打开飞鸽数据页面 (historyConversation)
|
|
42
|
+
* 2. 注入 Cookie 实现免登录
|
|
43
|
+
* 3. 点击"查询"触发数据加载
|
|
44
|
+
* 4. 拦截 fuzzySearchConversation 接口响应
|
|
45
|
+
* 5. 自动翻页直到获取所有数据
|
|
46
|
+
* 6. 保存完整数据到本地 JSON 文件
|
|
47
|
+
*
|
|
48
|
+
* 用法:
|
|
49
|
+
* npx ts-node scripts/feige-history-collect.ts [--output <dir>] [--max-pages <n>] [--delay <ms>] [--headless]
|
|
50
|
+
*/
|
|
51
|
+
const playwright_core_1 = require("playwright-core");
|
|
52
|
+
const fs = __importStar(require("fs"));
|
|
53
|
+
const path = __importStar(require("path"));
|
|
54
|
+
const cookie_extract_1 = require("../src/cookie-extract");
|
|
55
|
+
const PAGE_URL = 'https://jsls.jinritemai.com/im/shop/main/data/historyConversation';
|
|
56
|
+
const API_PATTERN = 'fuzzySearchConversation';
|
|
57
|
+
// ========== CLI 参数解析 ==========
|
|
58
|
+
const args = process.argv.slice(2);

/**
 * Returns the value that follows `--<name>` on the command line.
 *
 * @param {string} name - Option name without the leading dashes.
 * @param {string} defaultValue - Returned when the option is absent or has no value.
 * @param {string[]} [argv=args] - Argument list to search (defaults to the process arguments).
 * @returns {string} The option value, or `defaultValue`.
 */
function getArg(name, defaultValue, argv = args) {
    const idx = argv.indexOf(`--${name}`);
    if (idx < 0) {
        return defaultValue;
    }
    const value = argv[idx + 1];
    // A following token that starts with "--" is the next option, not this option's
    // value (e.g. `--output --headless`), so treat it as "no value given".
    if (!value || value.startsWith('--')) {
        return defaultValue;
    }
    return value;
}

/**
 * Reports whether the boolean switch `--<name>` is present.
 *
 * @param {string} name - Flag name without the leading dashes.
 * @param {string[]} [argv=args] - Argument list to search.
 * @returns {boolean}
 */
function hasFlag(name, argv = args) {
    return argv.includes(`--${name}`);
}

/**
 * Reads an integer option, falling back to the default (with a warning) when the
 * supplied text is not an integer or is below `min`. Without this check a typo such
 * as `--delay abc` would propagate NaN into the paging loop and the wait calls.
 *
 * @param {string} name - Option name without the leading dashes.
 * @param {number} defaultValue - Value used when the option is absent or invalid.
 * @param {number} [min=0] - Smallest accepted value.
 * @param {string[]} [argv=args] - Argument list to search.
 * @returns {number}
 */
function getIntArg(name, defaultValue, min = 0, argv = args) {
    const raw = getArg(name, String(defaultValue), argv);
    const parsed = Number.parseInt(raw, 10);
    if (Number.isNaN(parsed) || parsed < min) {
        console.warn(`⚠️ 参数 --${name} 的值无效 ("${raw}"),使用默认值 ${defaultValue}`);
        return defaultValue;
    }
    return parsed;
}

const outputDir = getArg('output', 'output');
const maxPages = getIntArg('max-pages', 100, 1);
const delayMs = getIntArg('delay', 1000, 0);
const headless = hasFlag('headless');
|
|
70
|
+
// ========== 主逻辑 ==========
|
|
71
|
+
/**
 * Collects the Feige "history conversation" list by driving the page in a browser
 * and recording every intercepted `fuzzySearchConversation` response.
 */
class FeigeHistoryCollector {
    constructor() {
        // Every conversation captured so far, in arrival order.
        this.allConversations = [];
        // Conversations carried by the most recent intercepted response.
        this.currentPageData = [];
        // Set by the route handler once a response has been processed.
        this.responseReceived = false;
        // Last positive total reported by the API (0 = unknown).
        this.totalFromApi = 0;
    }

    /**
     * Entry point: launches the browser, runs the collection and always closes the
     * browser again. Exits the process with code 1 when no API response arrives.
     *
     * @returns {Promise<void>}
     */
    async run() {
        console.log('飞鸽历史会话数据采集');
        console.log(` 页面: ${PAGE_URL}`);
        console.log(` 最大页数: ${maxPages}`);
        console.log(` 请求间隔: ${delayMs}ms`);
        console.log(` 输出目录: ${outputDir}`);
        console.log('');
        const browser = await this.launchBrowser();
        let succeeded = false;
        try {
            succeeded = await this.collect(browser);
        }
        finally {
            // Release the browser on every path, including exceptions. A failure
            // while closing must not mask the original error, hence the swallow.
            await browser.close().catch(() => { });
        }
        if (!succeeded) {
            process.exit(1);
        }
    }

    /** Launches the locally installed Chrome channel, falling back to the default Chromium launch. */
    async launchBrowser() {
        const launchOpts = {
            headless,
            handleSIGINT: false,
            handleSIGTERM: false,
            handleSIGHUP: false,
        };
        try {
            return await playwright_core_1.chromium.launch({ ...launchOpts, channel: 'chrome' });
        }
        catch {
            // Deliberate fallback: the 'chrome' channel launch failed, try the default.
            return await playwright_core_1.chromium.launch(launchOpts);
        }
    }

    /** Copies the cookies extracted for PAGE_URL into the context, one by one. */
    async injectCookies(context) {
        console.log('🍪 从本地 Chrome 提取 Cookie...');
        const cookies = (0, cookie_extract_1.extractCookiesForUrl)(PAGE_URL);
        if (cookies.length === 0) {
            console.log(' ⚠️ 未找到 Cookie,可能需要先在 Chrome 中登录飞鸽');
            return;
        }
        let injected = 0;
        for (const cookie of cookies) {
            try {
                await context.addCookies([cookie]);
                injected++;
            }
            catch {
                // Best effort: a rejected cookie is skipped; the summary below
                // reports how many were accepted.
            }
        }
        console.log(` ✅ 已注入 ${injected}/${cookies.length} 个 Cookie`);
    }

    /**
     * Runs the whole collection inside an already launched browser.
     *
     * @param browser - Playwright browser returned by `launchBrowser()`.
     * @returns {Promise<boolean>} false when no API response was received at all.
     */
    async collect(browser) {
        const context = await browser.newContext({ viewport: { width: 1440, height: 900 } });
        await this.injectCookies(context);
        this.page = await context.newPage();
        await this.setupApiIntercept();

        console.log('\n🌐 正在打开飞鸽数据页面...');
        try {
            await this.page.goto(PAGE_URL, { waitUntil: 'networkidle', timeout: 30000 });
        }
        catch (e) {
            console.log(` ⚠️ 页面加载超时,继续尝试: ${e.message?.split('\n')[0]}`);
        }
        await this.page.waitForTimeout(2000);

        console.log('\n📅 点击"近30天"按钮...');
        if (await this.clickLast30DaysButton()) {
            console.log(' ✅ 已选择"近30天"');
            await this.page.waitForTimeout(500);
        }
        else {
            console.log(' ⚠️ 未找到"近30天"按钮,使用默认时间范围');
        }

        console.log('\n🔍 点击查询按钮...');
        // Anything intercepted so far (page load, quick-range click) belongs to a
        // different query. Keeping it would duplicate rows and could satisfy the
        // "all rows fetched" check too early, so start from a clean state. The
        // reset happens BEFORE the click so a fast response is not lost.
        const preloaded = {
            conversations: this.allConversations,
            pageData: this.currentPageData,
            received: this.responseReceived,
            total: this.totalFromApi,
        };
        this.allConversations = [];
        this.currentPageData = [];
        this.responseReceived = false;
        this.totalFromApi = 0;
        const searched = await this.clickSearchButton();
        if (!searched) {
            console.log(' ⚠️ 未找到查询按钮,尝试等待数据自动加载...');
            // No query was triggered, so auto-loaded data is all we have: restore it.
            this.allConversations = [...preloaded.conversations, ...this.allConversations];
            if (this.currentPageData.length === 0) {
                this.currentPageData = preloaded.pageData;
            }
            this.responseReceived = this.responseReceived || preloaded.received;
            this.totalFromApi = this.totalFromApi || preloaded.total;
        }

        await this.waitForApiResponse(10000);
        if (!this.responseReceived) {
            console.log(' ⚠️ 未收到 API 响应,请确认页面已正确登录');
            return false;
        }

        const pageNum = await this.paginate();
        const outputFile = this.saveResult(pageNum);
        console.log(`\n✅ 采集完成`);
        console.log(` 共 ${this.allConversations.length} 条会话, ${pageNum} 页`);
        console.log(` 文件: ${outputFile}`);
        return true;
    }

    /**
     * Clicks "next page" until everything is fetched, a page is empty, there is no
     * next page, a response is missing, or `maxPages` is reached.
     *
     * @returns {Promise<number>} Number of pages whose response was actually received.
     */
    async paginate() {
        let pageNum = 1;
        console.log(`\n📄 开始翻页采集 (已获取第 1 页)...`);
        for (;;) {
            if (this.totalFromApi > 0 && this.allConversations.length >= this.totalFromApi) {
                console.log(` ✅ 已获取全部 ${this.totalFromApi} 条数据`);
                return pageNum;
            }
            if (this.currentPageData.length === 0 && pageNum > 1) {
                console.log(` ✅ 当前页无数据,采集结束`);
                return pageNum;
            }
            if (pageNum >= maxPages) {
                console.log(` ⚠️ 达到最大页数限制 (${maxPages})`);
                return pageNum;
            }
            this.responseReceived = false;
            this.currentPageData = [];
            const hasNext = await this.clickNextPage();
            if (!hasNext) {
                console.log(` ✅ 没有下一页了,采集结束`);
                return pageNum;
            }
            await this.waitForApiResponse(10000);
            if (!this.responseReceived) {
                console.log(` ⚠️ 第 ${pageNum + 1} 页未收到响应,停止翻页`);
                return pageNum;
            }
            // Count the page only after its response has been confirmed, so the
            // reported page total never includes a page that was never received.
            pageNum++;
            await this.page.waitForTimeout(delayMs);
        }
    }

    /** Writes the collected data to `<outputDir>/feige-history_<timestamp>.json` and returns the path. */
    saveResult(pageNum) {
        const result = {
            conversations: this.allConversations,
            totalPages: pageNum,
            totalConversations: this.allConversations.length,
            collectTime: new Date().toISOString(),
        };
        fs.mkdirSync(outputDir, { recursive: true });
        const timestamp = new Date().toISOString().replace(/[:.]/g, '-').slice(0, 19);
        const outputFile = path.join(outputDir, `feige-history_${timestamp}.json`);
        fs.writeFileSync(outputFile, JSON.stringify(result, null, 2));
        return outputFile;
    }

    /**
     * Registers a route handler that forwards every matching request, records the
     * conversations found in its JSON response and passes the response on to the page.
     */
    async setupApiIntercept() {
        await this.page.route(`**/${API_PATTERN}*`, async (route) => {
            let response;
            let body;
            try {
                response = await route.fetch();
                body = await response.text();
            }
            catch (e) {
                // A failed forward must not escape the handler as an unhandled
                // rejection; fail the request and let the caller's wait time out.
                console.log(` ⚠️ 请求转发失败: ${e.message?.split('\n')[0]}`);
                await route.abort().catch(() => { });
                return;
            }
            try {
                const data = JSON.parse(body);
                const conversations = this.extractConversationList(data);
                const total = this.extractTotal(data);
                // Keep the last known total when a response does not report one.
                if (total > 0) {
                    this.totalFromApi = total;
                }
                this.currentPageData = conversations;
                if (conversations.length > 0) {
                    this.allConversations.push(...conversations);
                    console.log(` 📡 获取到 ${conversations.length} 条会话 (累计: ${this.allConversations.length}/${this.totalFromApi || '?'})`);
                }
                else {
                    console.log(` 📡 该页无数据`);
                }
            }
            catch (e) {
                console.log(` ⚠️ 解析响应失败: ${e.message}`);
                this.currentPageData = [];
            }
            this.responseReceived = true;
            await route.fulfill({ response });
        });
    }

    /**
     * Polls the `responseReceived` flag until it is set or `timeout` ms elapse, then
     * waits another 300 ms.
     *
     * @param {number} timeout - Maximum polling time in milliseconds.
     */
    async waitForApiResponse(timeout) {
        const start = Date.now();
        while (Date.now() - start < timeout && !this.responseReceived) {
            await this.page.waitForTimeout(200);
        }
        await this.page.waitForTimeout(300);
    }

    /**
     * Clicks the first visible element matched by one of `selectors`.
     *
     * @param {string[]} selectors - Candidate selectors, tried in order.
     * @param {{ skipDisabled?: boolean }} [options] - When `skipDisabled` is set,
     *   elements carrying `disabled` or `aria-disabled="true"` are passed over.
     * @returns {Promise<boolean>} true when an element was clicked.
     */
    async clickFirstVisible(selectors, { skipDisabled = false } = {}) {
        for (const selector of selectors) {
            try {
                const target = this.page.locator(selector).first();
                // NOTE(review): recent Playwright versions document the timeout
                // option of isVisible() as ignored, so this probe is likely immediate.
                if (!(await target.isVisible({ timeout: 1000 }))) {
                    continue;
                }
                if (skipDisabled) {
                    const disabled = await target.getAttribute('disabled');
                    const ariaDisabled = await target.getAttribute('aria-disabled');
                    if (disabled !== null || ariaDisabled === 'true') {
                        continue;
                    }
                }
                await target.click();
                return true;
            }
            catch {
                // This selector did not work out; fall through to the next candidate.
            }
        }
        return false;
    }

    /** Clicks the "last 30 days" quick-range button; returns false when none is found. */
    async clickLast30DaysButton() {
        return this.clickFirstVisible([
            'button:has-text("近30天")',
            'button:has-text("最近30天")',
            'button:has-text("近三十天")',
            'span:has-text("近30天")',
            'a:has-text("近30天")',
            'div[role="button"]:has-text("近30天")',
            '[class*="tag"]:has-text("近30天")',
            '[class*="quick"]:has-text("近30天")',
            'label:has-text("近30天")',
            // Last resort: clickable elements whose text contains "30天".
            'button:has-text("30天")',
            'span[class*="btn"]:has-text("30天")',
        ]);
    }

    /** Clicks the query/search button; returns false when none is found. */
    async clickSearchButton() {
        return this.clickFirstVisible([
            'button:has-text("查询")',
            'button:has-text("搜索")',
            'button:has-text("查找")',
            '.ant-btn-primary:has-text("查询")',
            '.ant-btn-primary:has-text("搜索")',
            '[class*="search"] button',
            '[class*="query"] button',
            'button[type="submit"]',
        ]);
    }

    /** Clicks the enabled "next page" control; returns false when there is none. */
    async clickNextPage() {
        return this.clickFirstVisible([
            '.ant-pagination-next:not(.ant-pagination-disabled)',
            'li.ant-pagination-next:not(.ant-pagination-disabled) button',
            'li.ant-pagination-next:not(.ant-pagination-disabled) a',
            'button:has-text("下一页")',
            'a:has-text("下一页")',
            '.arco-pagination-item-next:not(.arco-pagination-item-disabled)',
            '[class*="next"]:not([class*="disabled"]):not([disabled])',
        ], { skipDisabled: true });
    }

    /**
     * Extracts the conversation array from an API payload: well-known paths first,
     * then a recursive search for the first non-empty array of objects.
     *
     * @param {*} data - Parsed JSON response.
     * @returns {Array} The conversation list, or an empty array.
     */
    extractConversationList(data) {
        const candidates = [
            data?.data?.conversations,
            data?.data?.list,
            data?.data?.data,
            data?.data?.records,
            data?.data?.items,
            data?.result?.conversations,
            data?.result?.list,
            data?.result?.data,
            data?.conversations,
            data?.list,
        ];
        const known = candidates.find((c) => Array.isArray(c) && c.length > 0);
        return known ?? this.findFirstArray(data, 0);
    }

    /**
     * Searches `obj` for the first non-empty array whose first element is an object:
     * direct properties first, then nested plain objects, up to depth 5.
     *
     * @param {*} obj - Value to search.
     * @param {number} depth - Current recursion depth.
     * @returns {Array} The array found, or an empty array.
     */
    findFirstArray(obj, depth) {
        if (depth > 5 || !obj || typeof obj !== 'object') {
            return [];
        }
        const isObjectArray = (v) => Array.isArray(v) && v.length > 0 && typeof v[0] === 'object';
        if (isObjectArray(obj)) {
            return obj;
        }
        const values = Object.values(obj);
        const direct = values.find(isObjectArray);
        if (direct) {
            return direct;
        }
        for (const val of values) {
            if (val && typeof val === 'object' && !Array.isArray(val)) {
                const found = this.findFirstArray(val, depth + 1);
                if (found.length > 0) {
                    return found;
                }
            }
        }
        return [];
    }

    /**
     * Extracts the total row count from an API payload.
     *
     * @param {*} data - Parsed JSON response.
     * @returns {number} First positive numeric total found, or 0.
     */
    extractTotal(data) {
        const candidates = [
            data?.data?.total,
            data?.data?.totalCount,
            data?.data?.total_count,
            data?.data?.pagination?.total,
            data?.data?.page_info?.total,
            data?.result?.total,
            data?.total,
        ];
        return candidates.find((c) => typeof c === 'number' && c > 0) ?? 0;
    }
}
|
|
369
|
+
// ========== 启动 ==========
|
|
370
|
+
const collector = new FeigeHistoryCollector();
|
|
371
|
+
collector.run().catch((err) => {
|
|
372
|
+
console.error('采集出错:', err);
|
|
373
|
+
process.exit(1);
|
|
374
|
+
});
|
|
375
|
+
//# sourceMappingURL=feige-history-collect.js.map
|
|
@@ -0,0 +1,499 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
"use strict";
|
|
3
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
4
|
+
if (k2 === undefined) k2 = k;
|
|
5
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
6
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
7
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
8
|
+
}
|
|
9
|
+
Object.defineProperty(o, k2, desc);
|
|
10
|
+
}) : (function(o, m, k, k2) {
|
|
11
|
+
if (k2 === undefined) k2 = k;
|
|
12
|
+
o[k2] = m[k];
|
|
13
|
+
}));
|
|
14
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
15
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
16
|
+
}) : function(o, v) {
|
|
17
|
+
o["default"] = v;
|
|
18
|
+
});
|
|
19
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
20
|
+
var ownKeys = function(o) {
|
|
21
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
22
|
+
var ar = [];
|
|
23
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
24
|
+
return ar;
|
|
25
|
+
};
|
|
26
|
+
return ownKeys(o);
|
|
27
|
+
};
|
|
28
|
+
return function (mod) {
|
|
29
|
+
if (mod && mod.__esModule) return mod;
|
|
30
|
+
var result = {};
|
|
31
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
32
|
+
__setModuleDefault(result, mod);
|
|
33
|
+
return result;
|
|
34
|
+
};
|
|
35
|
+
})();
|
|
36
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
37
|
+
/**
|
|
38
|
+
* 拼多多客服后台历史会话数据采集脚本
|
|
39
|
+
*
|
|
40
|
+
* 页面:https://mms.pinduoduo.com/mms-chat/search?msfrom=mms_sidenav
|
|
41
|
+
* 接口:POST https://mms.pinduoduo.com/latitude/search/message/getMessages
|
|
42
|
+
*
|
|
43
|
+
* 实际页面结构(已通过浏览器验证):
|
|
44
|
+
* - 顶部表单:查询方式 / 客服账号 / 时间 / 查询按钮
|
|
45
|
+
* - 时间输入框 placeholder="开始时间 ~ 结束时间", readonly
|
|
46
|
+
* - 时间浮层快捷按钮:今天 / 近7天 / 近30天 (点击后自动应用并关闭浮层,无需确认)
|
|
47
|
+
* - 查询按钮:button.BTN_primary_xxx 文字"查询"
|
|
48
|
+
* - 查询结果:左侧账号列表 .cs-list > .user-item (每页 10 条) + 右侧 .message-list
|
|
49
|
+
* - 分页器:.PGT_outerWrapper_xxx,下一页 .PGT_next_xxx
|
|
50
|
+
* - 接口返回的消息一次性全量渲染到 .message-list,无懒加载
|
|
51
|
+
*
|
|
52
|
+
* 流程:
|
|
53
|
+
* 1. 注入 Cookie 实现免登录
|
|
54
|
+
* 2. 打开页面
|
|
55
|
+
* 3. 点击时间输入框 → 点击"近30天" (自动应用 + 关闭浮层)
|
|
56
|
+
* 4. 点击"查询" → 左侧出现 .user-item 列表
|
|
57
|
+
* 5. 遍历当前页所有 .user-item,逐个点击 → 拦截 getMessages 响应
|
|
58
|
+
* 6. 点击分页器"下一页",重复步骤 5,直到没有下一页
|
|
59
|
+
* 7. 增量保存到 output/pdd-history_<timestamp>.json
|
|
60
|
+
*
|
|
61
|
+
* 用法:
|
|
62
|
+
* npx ts-node scripts/pdd-history-collect.ts [--output <dir>] [--max-accounts <n>]
|
|
63
|
+
* [--max-msg-pages <n>] [--delay <ms>] [--headless]
|
|
64
|
+
*/
|
|
65
|
+
const playwright_core_1 = require("playwright-core");
|
|
66
|
+
const fs = __importStar(require("fs"));
|
|
67
|
+
const path = __importStar(require("path"));
|
|
68
|
+
const cookie_extract_1 = require("../src/cookie-extract");
|
|
69
|
+
const PAGE_URL = 'https://mms.pinduoduo.com/mms-chat/search?msfrom=mms_sidenav';
|
|
70
|
+
const API_PATTERN = '/latitude/search/message/getMessages';
|
|
71
|
+
// ========== CLI 参数 ==========
|
|
72
|
+
const args = process.argv.slice(2);

/**
 * Returns the value that follows `--<name>`, or `def` when the option is absent or
 * has no value. A following token that starts with "--" is the next option, not a
 * value (e.g. `--output --headless`).
 *
 * @param {string} name - Option name without the leading dashes.
 * @param {string} def - Fallback value.
 * @param {string[]} [argv=args] - Argument list to search.
 * @returns {string}
 */
const getArg = (name, def, argv = args) => {
    const idx = argv.indexOf(`--${name}`);
    if (idx < 0) {
        return def;
    }
    const value = argv[idx + 1];
    if (!value || value.startsWith('--')) {
        return def;
    }
    return value;
};

/** Reports whether the boolean switch `--<name>` is present in `argv`. */
const hasFlag = (name, argv = args) => argv.includes(`--${name}`);

/**
 * Reads an integer option, falling back to `def` (with a warning) when the text is
 * not an integer or is below `min`, so NaN never reaches the loops or wait calls.
 *
 * @param {string} name - Option name without the leading dashes.
 * @param {number} def - Value used when the option is absent or invalid.
 * @param {number} [min=0] - Smallest accepted value.
 * @param {string[]} [argv=args] - Argument list to search.
 * @returns {number}
 */
const getIntArg = (name, def, min = 0, argv = args) => {
    const raw = getArg(name, String(def), argv);
    const parsed = Number.parseInt(raw, 10);
    if (Number.isNaN(parsed) || parsed < min) {
        console.warn(`⚠️ 参数 --${name} 的值无效 ("${raw}"),使用默认值 ${def}`);
        return def;
    }
    return parsed;
};

const outputDir = getArg('output', 'output');
const maxAccounts = getIntArg('max-accounts', 500, 1);
const maxMsgPages = getIntArg('max-msg-pages', 20, 1);
const delayMs = getIntArg('delay', 600, 0);
const headless = hasFlag('headless');
|
|
83
|
+
class PddHistoryCollector {
|
|
84
|
+
constructor() {
|
|
85
|
+
this.currentMessages = new Map(); // 当前账号的消息(按 msg_id 去重)
|
|
86
|
+
this.currentResponseCount = 0;
|
|
87
|
+
this.results = [];
|
|
88
|
+
this.outputFile = '';
|
|
89
|
+
}
|
|
90
|
+
async run() {
|
|
91
|
+
console.log('拼多多客服历史会话采集');
|
|
92
|
+
console.log(` 页面: ${PAGE_URL}`);
|
|
93
|
+
console.log(` 最大账号数: ${maxAccounts}`);
|
|
94
|
+
console.log(` 每账号最大消息分页: ${maxMsgPages}`);
|
|
95
|
+
console.log(` 请求间隔: ${delayMs}ms`);
|
|
96
|
+
console.log(` 输出目录: ${outputDir}`);
|
|
97
|
+
console.log('');
|
|
98
|
+
fs.mkdirSync(outputDir, { recursive: true });
|
|
99
|
+
const ts = new Date().toISOString().replace(/[:.]/g, '-').slice(0, 19);
|
|
100
|
+
this.outputFile = path.join(outputDir, `pdd-history_${ts}.json`);
|
|
101
|
+
const launchOpts = {
|
|
102
|
+
headless,
|
|
103
|
+
handleSIGINT: false,
|
|
104
|
+
handleSIGTERM: false,
|
|
105
|
+
handleSIGHUP: false,
|
|
106
|
+
};
|
|
107
|
+
let browser;
|
|
108
|
+
try {
|
|
109
|
+
browser = await playwright_core_1.chromium.launch({ ...launchOpts, channel: 'chrome' });
|
|
110
|
+
}
|
|
111
|
+
catch {
|
|
112
|
+
browser = await playwright_core_1.chromium.launch(launchOpts);
|
|
113
|
+
}
|
|
114
|
+
const context = await browser.newContext({ viewport: { width: 1440, height: 900 } });
|
|
115
|
+
// Cookie 注入
|
|
116
|
+
console.log('🍪 从本地 Chrome 提取 Cookie...');
|
|
117
|
+
const cookies = (0, cookie_extract_1.extractCookiesForUrl)(PAGE_URL);
|
|
118
|
+
if (cookies.length > 0) {
|
|
119
|
+
let injected = 0;
|
|
120
|
+
for (const c of cookies) {
|
|
121
|
+
try {
|
|
122
|
+
await context.addCookies([c]);
|
|
123
|
+
injected++;
|
|
124
|
+
}
|
|
125
|
+
catch { }
|
|
126
|
+
}
|
|
127
|
+
console.log(` ✅ 已注入 ${injected}/${cookies.length} 个 Cookie`);
|
|
128
|
+
}
|
|
129
|
+
else {
|
|
130
|
+
console.log(' ⚠️ 未找到 Cookie,请先在 Chrome 中登录拼多多商家后台');
|
|
131
|
+
}
|
|
132
|
+
this.page = await context.newPage();
|
|
133
|
+
await this.setupApiIntercept();
|
|
134
|
+
// 1. 导航
|
|
135
|
+
console.log('\n🌐 打开页面...');
|
|
136
|
+
try {
|
|
137
|
+
await this.page.goto(PAGE_URL, { waitUntil: 'networkidle', timeout: 30000 });
|
|
138
|
+
}
|
|
139
|
+
catch (e) {
|
|
140
|
+
console.log(` ⚠️ 加载超时,继续: ${e.message?.split('\n')[0]}`);
|
|
141
|
+
}
|
|
142
|
+
await this.page.waitForTimeout(2000);
|
|
143
|
+
// 2. 选择"近30天"
|
|
144
|
+
console.log('\n📅 选择时间范围"近30天"...');
|
|
145
|
+
const ok30 = await this.selectLast30Days();
|
|
146
|
+
if (!ok30) {
|
|
147
|
+
console.log(' ⚠️ 时间选择失败,使用页面默认');
|
|
148
|
+
}
|
|
149
|
+
else {
|
|
150
|
+
console.log(' ✅ 已选择近30天');
|
|
151
|
+
}
|
|
152
|
+
// 3. 点击查询
|
|
153
|
+
console.log('\n🔍 点击查询按钮...');
|
|
154
|
+
const searched = await this.clickSearchButton();
|
|
155
|
+
if (!searched) {
|
|
156
|
+
console.log(' ❌ 未找到查询按钮');
|
|
157
|
+
await browser.close();
|
|
158
|
+
process.exit(1);
|
|
159
|
+
}
|
|
160
|
+
// 等待左侧 user-item 列表出现
|
|
161
|
+
try {
|
|
162
|
+
await this.page.waitForSelector('.user-item', { timeout: 15000 });
|
|
163
|
+
}
|
|
164
|
+
catch {
|
|
165
|
+
console.log(' ⚠️ 等待 .user-item 超时,可能查询无结果');
|
|
166
|
+
}
|
|
167
|
+
await this.page.waitForTimeout(1500);
|
|
168
|
+
// 4. 遍历左侧账号列表(左侧没有分页器,可能通过滚动加载;这里先取全部已渲染项,
|
|
169
|
+
// 若数量增长则继续滚动加载更多)
|
|
170
|
+
console.log('\n👥 开始遍历消费者账号...');
|
|
171
|
+
let processed = 0;
|
|
172
|
+
let lastSeenCount = 0;
|
|
173
|
+
let stableRounds = 0;
|
|
174
|
+
while (processed < maxAccounts) {
|
|
175
|
+
const itemCount = await this.page.locator('.user-item').count();
|
|
176
|
+
if (itemCount === 0) {
|
|
177
|
+
console.log(` ⚠️ 无账号,结束`);
|
|
178
|
+
break;
|
|
179
|
+
}
|
|
180
|
+
// 处理新出现的账号
|
|
181
|
+
for (let i = lastSeenCount; i < itemCount && processed < maxAccounts; i++) {
|
|
182
|
+
await this.processOneAccount(i, 1, processed + 1);
|
|
183
|
+
processed++;
|
|
184
|
+
this.saveResults();
|
|
185
|
+
await this.page.waitForTimeout(delayMs);
|
|
186
|
+
}
|
|
187
|
+
lastSeenCount = itemCount;
|
|
188
|
+
if (processed >= maxAccounts) {
|
|
189
|
+
console.log(` ⚠️ 达到最大账号数限制 (${maxAccounts})`);
|
|
190
|
+
break;
|
|
191
|
+
}
|
|
192
|
+
// 滚动 .cs-list 到底部尝试加载更多
|
|
193
|
+
const grew = await this.scrollAccountListAndCheck();
|
|
194
|
+
if (!grew) {
|
|
195
|
+
stableRounds++;
|
|
196
|
+
if (stableRounds >= 2) {
|
|
197
|
+
console.log(' ✅ 账号列表无更多项,采集结束');
|
|
198
|
+
break;
|
|
199
|
+
}
|
|
200
|
+
}
|
|
201
|
+
else {
|
|
202
|
+
stableRounds = 0;
|
|
203
|
+
}
|
|
204
|
+
await this.page.waitForTimeout(800);
|
|
205
|
+
}
|
|
206
|
+
this.saveResults();
|
|
207
|
+
const total = this.results.reduce((s, r) => s + r.messages.length, 0);
|
|
208
|
+
console.log(`\n✅ 采集完成`);
|
|
209
|
+
console.log(` 共 ${this.results.length} 个账号, ${total} 条消息`);
|
|
210
|
+
console.log(` 文件: ${this.outputFile}`);
|
|
211
|
+
await browser.close();
|
|
212
|
+
}
|
|
213
|
+
// ========== 单个账号 ==========
|
|
214
|
+
async processOneAccount(itemIdx, pageIdx, globalIdx) {
|
|
215
|
+
this.currentMessages.clear();
|
|
216
|
+
this.currentResponseCount = 0;
|
|
217
|
+
const items = this.page.locator('.user-item');
|
|
218
|
+
if (itemIdx >= (await items.count()))
|
|
219
|
+
return;
|
|
220
|
+
const item = items.nth(itemIdx);
|
|
221
|
+
const label = (await item.innerText().catch(() => '')).trim().replace(/\s+/g, ' ').slice(0, 80);
|
|
222
|
+
console.log(`\n[#${globalIdx} 页${pageIdx}/项${itemIdx + 1}] ${label || '(无文本)'}`);
|
|
223
|
+
try {
|
|
224
|
+
await item.scrollIntoViewIfNeeded({ timeout: 3000 });
|
|
225
|
+
}
|
|
226
|
+
catch { }
|
|
227
|
+
// 第一页第一个账号默认已被选中并已加载消息(等于无需点击就有响应)。
|
|
228
|
+
// 我们仍然点击一次,强制重新触发请求,以保证拦截能稳定捕获。
|
|
229
|
+
try {
|
|
230
|
+
await item.click({ timeout: 5000 });
|
|
231
|
+
}
|
|
232
|
+
catch (e) {
|
|
233
|
+
console.log(` ⚠️ 点击失败: ${e.message?.split('\n')[0]}`);
|
|
234
|
+
return;
|
|
235
|
+
}
|
|
236
|
+
// 等 getMessages 响应(首页第一项可能没有新请求 — 因为页面初始化时已请求过;做兜底)
|
|
237
|
+
const got = await this.waitForResponse(8000);
|
|
238
|
+
if (!got && pageIdx === 1 && itemIdx === 0) {
|
|
239
|
+
console.log(` ℹ️ 首项无新请求,跳过`);
|
|
240
|
+
}
|
|
241
|
+
// 遍历该账号的消息分页(拼多多右侧消息分页,每个账号页数不同)
|
|
242
|
+
const msgPagesVisited = await this.iterateMessagePages();
|
|
243
|
+
const messages = Array.from(this.currentMessages.values());
|
|
244
|
+
this.results.push({
|
|
245
|
+
accountKey: `p${pageIdx}_i${itemIdx + 1}`,
|
|
246
|
+
accountLabel: label,
|
|
247
|
+
pageIndex: pageIdx,
|
|
248
|
+
itemIndex: itemIdx + 1,
|
|
249
|
+
messagePagesVisited: msgPagesVisited,
|
|
250
|
+
messages,
|
|
251
|
+
responseCount: this.currentResponseCount,
|
|
252
|
+
});
|
|
253
|
+
console.log(` ✅ ${messages.length} 条消息 (${this.currentResponseCount} 次响应)`);
|
|
254
|
+
}
|
|
255
|
+
// ========== 时间选择 ==========
|
|
256
|
+
/** 选择"近30天"(点击输入框 → 点近30天,浮层会自动关闭) */
|
|
257
|
+
async selectLast30Days() {
|
|
258
|
+
const input = this.page.locator('input[placeholder*="开始时间"][readonly]').first();
|
|
259
|
+
try {
|
|
260
|
+
if (!(await input.isVisible({ timeout: 3000 })))
|
|
261
|
+
return false;
|
|
262
|
+
await input.click({ timeout: 2000 });
|
|
263
|
+
}
|
|
264
|
+
catch {
|
|
265
|
+
return false;
|
|
266
|
+
}
|
|
267
|
+
// 等浮层
|
|
268
|
+
try {
|
|
269
|
+
await this.page.waitForSelector('.RPR_quickPickerWrapper_1198e34, [class*="RPR_quickPickerWrapper"]', { timeout: 3000 });
|
|
270
|
+
}
|
|
271
|
+
catch {
|
|
272
|
+
// 浮层 class 可能变化,继续尝试找按钮
|
|
273
|
+
}
|
|
274
|
+
await this.page.waitForTimeout(300);
|
|
275
|
+
// 点击"近30天"
|
|
276
|
+
const last30 = this.page.locator('button', { hasText: /^近30天$/ }).first();
|
|
277
|
+
try {
|
|
278
|
+
if (await last30.isVisible({ timeout: 2000 })) {
|
|
279
|
+
await last30.click({ timeout: 2000 });
|
|
280
|
+
await this.page.waitForTimeout(400);
|
|
281
|
+
return true;
|
|
282
|
+
}
|
|
283
|
+
}
|
|
284
|
+
catch { }
|
|
285
|
+
return false;
|
|
286
|
+
}
|
|
287
|
+
// ========== 查询按钮 ==========
|
|
288
|
+
async clickSearchButton() {
|
|
289
|
+
// 优先 BTN_primary 中的"查询"
|
|
290
|
+
const candidates = [
|
|
291
|
+
this.page.locator('button[class*="BTN_primary"]', { hasText: /^查询$/ }),
|
|
292
|
+
this.page.locator('button', { hasText: /^查询$/ }),
|
|
293
|
+
];
|
|
294
|
+
for (const c of candidates) {
|
|
295
|
+
try {
|
|
296
|
+
const btn = c.first();
|
|
297
|
+
if (await btn.isVisible({ timeout: 1500 })) {
|
|
298
|
+
await btn.click({ timeout: 2000 });
|
|
299
|
+
return true;
|
|
300
|
+
}
|
|
301
|
+
}
|
|
302
|
+
catch { }
|
|
303
|
+
}
|
|
304
|
+
return false;
|
|
305
|
+
}
|
|
306
|
+
// ========== 左侧账号列表滚动加载 ==========
|
|
307
|
+
/** 滚动 .cs-list 到底部,返回是否新增了项 */
|
|
308
|
+
async scrollAccountListAndCheck() {
|
|
309
|
+
return await this.page.evaluate(() => {
|
|
310
|
+
const list = document.querySelector('.cs-list');
|
|
311
|
+
if (!list)
|
|
312
|
+
return false;
|
|
313
|
+
const before = list.querySelectorAll('.user-item').length;
|
|
314
|
+
list.scrollTop = list.scrollHeight;
|
|
315
|
+
return new Promise(resolve => {
|
|
316
|
+
setTimeout(() => {
|
|
317
|
+
const after = list.querySelectorAll('.user-item').length;
|
|
318
|
+
resolve(after > before);
|
|
319
|
+
}, 1500);
|
|
320
|
+
});
|
|
321
|
+
});
|
|
322
|
+
}
|
|
323
|
+
// ========== 右侧消息分页(每个账号内部) ==========
|
|
324
|
+
/**
|
|
325
|
+
* 遍历当前选中账号的所有消息分页。
|
|
326
|
+
* 拼多多分页器结构:
|
|
327
|
+
* - .table-section-right-footer 内的 [class*="PGT_outerWrapper"]
|
|
328
|
+
* - 当前激活:[class*="PGT_pagerItemActive"]
|
|
329
|
+
* - 下一页按钮:[class*="PGT_next"],禁用时含 PGT_disabled
|
|
330
|
+
*
|
|
331
|
+
* 策略:循环点击下一页,直到 next 按钮 disabled 或没有新响应。
|
|
332
|
+
* 返回访问过的页数(含第 1 页)。
|
|
333
|
+
*/
|
|
334
|
+
async iterateMessagePages() {
|
|
335
|
+
let visited = 1; // 当前已经在第 1 页
|
|
336
|
+
while (visited < maxMsgPages) {
|
|
337
|
+
// 检查右侧分页器是否存在 + 下一页是否可用
|
|
338
|
+
const status = await this.page.evaluate(() => {
|
|
339
|
+
const footer = document.querySelector('.table-section-right-footer');
|
|
340
|
+
if (!footer)
|
|
341
|
+
return { exists: false, disabled: true };
|
|
342
|
+
const next = footer.querySelector('[class*="PGT_next"]');
|
|
343
|
+
if (!next)
|
|
344
|
+
return { exists: false, disabled: true };
|
|
345
|
+
const cls = next.className || '';
|
|
346
|
+
const ariaDisabled = next.getAttribute('aria-disabled');
|
|
347
|
+
const r = next.getBoundingClientRect();
|
|
348
|
+
return {
|
|
349
|
+
exists: r.width > 0 && r.height > 0,
|
|
350
|
+
disabled: /PGT_disabled|disabled/.test(cls) || ariaDisabled === 'true',
|
|
351
|
+
};
|
|
352
|
+
});
|
|
353
|
+
if (!status.exists || status.disabled)
|
|
354
|
+
break;
|
|
355
|
+
// 点击下一页
|
|
356
|
+
const baseline = this.currentResponseCount;
|
|
357
|
+
const clicked = await this.page.evaluate(() => {
|
|
358
|
+
const footer = document.querySelector('.table-section-right-footer');
|
|
359
|
+
if (!footer)
|
|
360
|
+
return false;
|
|
361
|
+
const next = footer.querySelector('[class*="PGT_next"]');
|
|
362
|
+
if (!next)
|
|
363
|
+
return false;
|
|
364
|
+
next.click();
|
|
365
|
+
return true;
|
|
366
|
+
});
|
|
367
|
+
if (!clicked)
|
|
368
|
+
break;
|
|
369
|
+
// 等响应
|
|
370
|
+
const got = await this.waitForResponseSince(baseline, 8000);
|
|
371
|
+
if (!got) {
|
|
372
|
+
// 如果没新响应(可能数据已缓存或异常),仍然让 visited++ 但只重试 1 次
|
|
373
|
+
console.log(` ⚠️ 第 ${visited + 1} 页消息无响应,停止该账号翻页`);
|
|
374
|
+
break;
|
|
375
|
+
}
|
|
376
|
+
visited++;
|
|
377
|
+
console.log(` 📄 已翻到消息第 ${visited} 页`);
|
|
378
|
+
await this.page.waitForTimeout(delayMs);
|
|
379
|
+
}
|
|
380
|
+
if (visited >= maxMsgPages) {
|
|
381
|
+
console.log(` ⚠️ 该账号达到消息分页上限 ${maxMsgPages},停止翻页`);
|
|
382
|
+
}
|
|
383
|
+
return visited;
|
|
384
|
+
}
|
|
385
|
+
// ========== API 拦截 ==========
|
|
386
|
+
async setupApiIntercept() {
|
|
387
|
+
await this.page.route(`**${API_PATTERN}*`, async (route) => {
|
|
388
|
+
const response = await route.fetch();
|
|
389
|
+
const body = await response.text();
|
|
390
|
+
try {
|
|
391
|
+
const data = JSON.parse(body);
|
|
392
|
+
const messages = this.extractMessages(data);
|
|
393
|
+
for (const m of messages) {
|
|
394
|
+
const id = this.getMessageId(m);
|
|
395
|
+
if (id && !this.currentMessages.has(id)) {
|
|
396
|
+
this.currentMessages.set(id, m);
|
|
397
|
+
}
|
|
398
|
+
}
|
|
399
|
+
this.currentResponseCount++;
|
|
400
|
+
if (messages.length > 0) {
|
|
401
|
+
console.log(` 📡 +${messages.length} 条消息 (累计 ${this.currentMessages.size})`);
|
|
402
|
+
}
|
|
403
|
+
}
|
|
404
|
+
catch (e) {
|
|
405
|
+
console.log(` ⚠️ 解析响应失败: ${e.message?.split('\n')[0]}`);
|
|
406
|
+
}
|
|
407
|
+
await route.fulfill({ response });
|
|
408
|
+
});
|
|
409
|
+
}
|
|
410
|
+
/** 等待至少一次新的 getMessages 响应 */
|
|
411
|
+
async waitForResponse(timeout) {
|
|
412
|
+
return this.waitForResponseSince(this.currentResponseCount, timeout);
|
|
413
|
+
}
|
|
414
|
+
/** 从指定 baseline 等待新响应 */
|
|
415
|
+
async waitForResponseSince(baseline, timeout) {
|
|
416
|
+
const start = Date.now();
|
|
417
|
+
while (Date.now() - start < timeout) {
|
|
418
|
+
if (this.currentResponseCount > baseline) {
|
|
419
|
+
await this.page.waitForTimeout(300); // 等数据落地
|
|
420
|
+
return true;
|
|
421
|
+
}
|
|
422
|
+
await this.page.waitForTimeout(150);
|
|
423
|
+
}
|
|
424
|
+
return false;
|
|
425
|
+
}
|
|
426
|
+
// ========== 响应解析 ==========
|
|
427
|
+
extractMessages(data) {
|
|
428
|
+
const paths = [
|
|
429
|
+
data?.result?.messages,
|
|
430
|
+
data?.result?.list,
|
|
431
|
+
data?.result?.data,
|
|
432
|
+
data?.result?.message_list,
|
|
433
|
+
data?.result?.records,
|
|
434
|
+
data?.result?.msg_list,
|
|
435
|
+
data?.data?.messages,
|
|
436
|
+
data?.data?.list,
|
|
437
|
+
data?.data?.message_list,
|
|
438
|
+
data?.messages,
|
|
439
|
+
data?.list,
|
|
440
|
+
];
|
|
441
|
+
for (const p of paths) {
|
|
442
|
+
if (Array.isArray(p) && p.length > 0)
|
|
443
|
+
return p;
|
|
444
|
+
}
|
|
445
|
+
return this.findMessageArray(data, 0);
|
|
446
|
+
}
|
|
447
|
+
findMessageArray(obj, depth) {
|
|
448
|
+
if (depth > 6 || !obj || typeof obj !== 'object')
|
|
449
|
+
return [];
|
|
450
|
+
if (Array.isArray(obj) && obj.length > 0 && this.looksLikeMessages(obj))
|
|
451
|
+
return obj;
|
|
452
|
+
for (const val of Object.values(obj)) {
|
|
453
|
+
if (Array.isArray(val) && val.length > 0 && this.looksLikeMessages(val)) {
|
|
454
|
+
return val;
|
|
455
|
+
}
|
|
456
|
+
}
|
|
457
|
+
for (const val of Object.values(obj)) {
|
|
458
|
+
if (val && typeof val === 'object' && !Array.isArray(val)) {
|
|
459
|
+
const found = this.findMessageArray(val, depth + 1);
|
|
460
|
+
if (found.length > 0)
|
|
461
|
+
return found;
|
|
462
|
+
}
|
|
463
|
+
}
|
|
464
|
+
return [];
|
|
465
|
+
}
|
|
466
|
+
looksLikeMessages(arr) {
|
|
467
|
+
const sample = arr[0];
|
|
468
|
+
if (!sample || typeof sample !== 'object')
|
|
469
|
+
return false;
|
|
470
|
+
return (sample.msg_id !== undefined ||
|
|
471
|
+
sample.message_id !== undefined ||
|
|
472
|
+
sample.msgId !== undefined ||
|
|
473
|
+
sample.id !== undefined ||
|
|
474
|
+
sample.content !== undefined ||
|
|
475
|
+
sample.from !== undefined ||
|
|
476
|
+
sample.ts !== undefined);
|
|
477
|
+
}
|
|
478
|
+
getMessageId(m) {
|
|
479
|
+
return String(m?.msg_id ?? m?.message_id ?? m?.msgId ?? m?.id ?? m?.seq ?? m?.sequence ??
|
|
480
|
+
`auto_${JSON.stringify(m).slice(0, 200)}`);
|
|
481
|
+
}
|
|
482
|
+
// ========== 持久化 ==========
|
|
483
|
+
saveResults() {
|
|
484
|
+
const data = {
|
|
485
|
+
collectTime: new Date().toISOString(),
|
|
486
|
+
totalAccounts: this.results.length,
|
|
487
|
+
totalMessages: this.results.reduce((s, r) => s + r.messages.length, 0),
|
|
488
|
+
accounts: this.results,
|
|
489
|
+
};
|
|
490
|
+
fs.writeFileSync(this.outputFile, JSON.stringify(data, null, 2));
|
|
491
|
+
}
|
|
492
|
+
}
|
|
493
|
+
// ========== 启动 ==========
|
|
494
|
+
// Script entry point: build the collector and run it; any rejection from
// run() is reported and the process exits with status 1.
const collector = new PddHistoryCollector();
function reportFatalAndExit(err) {
    console.error('采集出错:', err);
    process.exit(1);
}
collector.run().catch(reportFatalAndExit);
|
|
499
|
+
//# sourceMappingURL=pdd-history-collect.js.map
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "benchmark-collector",
|
|
3
|
-
"version": "1.5.2",
|
|
3
|
+
"version": "1.6.0",
|
|
4
4
|
"description": "Playwright-based benchmark data collector for web agent training",
|
|
5
5
|
"main": "dist/src/index.js",
|
|
6
6
|
"types": "dist/src/index.d.ts",
|
|
@@ -18,6 +18,8 @@
|
|
|
18
18
|
],
|
|
19
19
|
"scripts": {
|
|
20
20
|
"collect": "ts-node scripts/collect.ts",
|
|
21
|
+
"feige-history": "ts-node scripts/feige-history-collect.ts",
|
|
22
|
+
"pdd-history": "ts-node scripts/pdd-history-collect.ts",
|
|
21
23
|
"build": "tsc",
|
|
22
24
|
"prepublishOnly": "npm run build",
|
|
23
25
|
"postinstall": "node dist/scripts/postinstall.js",
|