benchmark-collector 1.3.0 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/scripts/collect.js +4 -0
- package/dist/src/api-conversation-capture.d.ts +115 -0
- package/dist/src/api-conversation-capture.js +518 -0
- package/dist/src/api-conversation-capture.js.map +1 -0
- package/dist/src/collector.d.ts +37 -0
- package/dist/src/collector.js +361 -10
- package/dist/src/collector.js.map +1 -1
- package/dist/src/conversation-capture.d.ts +3 -2
- package/dist/src/conversation-capture.js +508 -99
- package/dist/src/conversation-capture.js.map +1 -1
- package/dist/src/index.d.ts +2 -1
- package/dist/src/index.js +3 -1
- package/dist/src/index.js.map +1 -1
- package/dist/src/types.d.ts +21 -0
- package/package.json +1 -1
package/dist/scripts/collect.js
CHANGED
|
@@ -13,6 +13,7 @@ function hasFlag(name) {
|
|
|
13
13
|
const task = getArg('task', '未命名任务');
|
|
14
14
|
const url = getArg('url', 'https://www.example.com');
|
|
15
15
|
const chatSelector = getArg('chat-selector', '');
|
|
16
|
+
const chatApi = getArg('chat-api', '');
|
|
16
17
|
const cookie = hasFlag('cookie');
|
|
17
18
|
const trace = hasFlag('trace');
|
|
18
19
|
console.log(`Benchmark 数据采集器`);
|
|
@@ -20,12 +21,15 @@ console.log(` 任务: ${task}`);
|
|
|
20
21
|
console.log(` URL: ${url}`);
|
|
21
22
|
if (chatSelector)
|
|
22
23
|
console.log(` 对话选择器: ${chatSelector}`);
|
|
24
|
+
if (chatApi)
|
|
25
|
+
console.log(` 对话接口: ${chatApi}`);
|
|
23
26
|
if (cookie)
|
|
24
27
|
console.log(` Cookie: 自动从本地 Chrome 提取并注入`);
|
|
25
28
|
if (trace)
|
|
26
29
|
console.log(` Trace: 开启(⚠️ 长时间采集可能产生大文件)`);
|
|
27
30
|
const collector = new collector_1.BenchmarkCollector(task, url, 'output', {
|
|
28
31
|
chatSelector: chatSelector || undefined,
|
|
32
|
+
chatApiPattern: chatApi || undefined,
|
|
29
33
|
cookie,
|
|
30
34
|
trace,
|
|
31
35
|
});
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* 通过拦截网络 API 响应来采集对话数据。
|
|
3
|
+
*
|
|
4
|
+
* 适用场景:
|
|
5
|
+
* - DOM 检测不准确时,直接从接口获取结构化数据
|
|
6
|
+
* - 接口分页加载(如 拼多多 plateau/chat/list),需要累积多次响应
|
|
7
|
+
* - 数据比 DOM 更完整(含历史消息、被折叠的消息等)
|
|
8
|
+
*
|
|
9
|
+
* 自动翻页:
|
|
10
|
+
* 检测到首次请求后,自动读取 start_msg_id 对应字段,
|
|
11
|
+
* 用响应中最早消息的 msg_id 作为下一页的 start_msg_id,
|
|
12
|
+
* 循环拉取直到没有更多消息。
|
|
13
|
+
*
|
|
14
|
+
* 使用方式:
|
|
15
|
+
* --chat-api "plateau/chat/list" 匹配 URL 包含该字符串的响应
|
|
16
|
+
* --chat-api "/api/im/messages" 支持多种 IM 接口
|
|
17
|
+
*/
|
|
18
|
+
import { ChatMessage } from './types';
|
|
19
|
+
/** Captured request template (used to replay the request when paginating) */
|
|
20
|
+
export interface CapturedRequest {
|
|
21
|
+
url: string;
|
|
22
|
+
method: string;
|
|
23
|
+
headers: Record<string, string>;
|
|
24
|
+
postData?: string;
|
|
25
|
+
/** Conversation identifier (e.g. data.list.with.id) */
|
|
26
|
+
conversationId?: string;
|
|
27
|
+
}
|
|
28
|
+
/** Result returned by processResponse */
|
|
29
|
+
export interface ProcessResult {
|
|
30
|
+
/** Number of messages newly added by this response */
|
|
31
|
+
newMessageCount: number;
|
|
32
|
+
/** Total number of messages in this response (including already-seen duplicates) */
|
|
33
|
+
responseMessageCount: number;
|
|
34
|
+
/** ID of the oldest message (used as the next page's start_msg_id) */
|
|
35
|
+
oldestMsgId?: string;
|
|
36
|
+
/** Whether more history remains (newCount > 0 and not all duplicates) */
|
|
37
|
+
hasMore: boolean;
|
|
38
|
+
}
|
|
39
|
+
/** API conversation collector */
|
|
40
|
+
export declare class ApiConversationCapture {
|
|
41
|
+
/** URL match pattern */
|
|
42
|
+
private pattern;
|
|
43
|
+
/** Collected messages (de-duplicated by ID) */
|
|
44
|
+
private messageMap;
|
|
45
|
+
/** Raw response cache (for debugging) */
|
|
46
|
+
private rawResponses;
|
|
47
|
+
/** Change callback */
|
|
48
|
+
private onChange?;
|
|
49
|
+
/** Captured request template (the first matching request, replayed for pagination) */
|
|
50
|
+
capturedRequest?: CapturedRequest;
|
|
51
|
+
/** Whether automatic pagination is currently in progress */
|
|
52
|
+
private _paginating;
|
|
53
|
+
constructor(pattern: string, onChange?: () => void);
|
|
54
|
+
/** Check whether a URL matches */
|
|
55
|
+
matches(url: string): boolean;
|
|
56
|
+
get isPaginating(): boolean;
|
|
57
|
+
set isPaginating(v: boolean);
|
|
58
|
+
/** Save the request template */
|
|
59
|
+
saveRequestTemplate(req: CapturedRequest): void;
|
|
60
|
+
/** Process one API response body and return pagination info */
|
|
61
|
+
processResponse(url: string, body: string): ProcessResult;
|
|
62
|
+
/**
|
|
63
|
+
* Build the postData for a pagination request.
|
|
64
|
+
* Replaces start_msg_id in the original request body with the new cursor.
|
|
65
|
+
*/
|
|
66
|
+
buildPageRequestBody(oldestMsgId: string): string | undefined;
|
|
67
|
+
/** Recursively replace the pagination cursor field */
|
|
68
|
+
private replacePageCursor;
|
|
69
|
+
/** Get the accumulated full conversation (sorted by time, de-duplicated) */
|
|
70
|
+
getMessages(): ChatMessage[];
|
|
71
|
+
/** Get the message count */
|
|
72
|
+
get messageCount(): number;
|
|
73
|
+
/** Get diagnostic information */
|
|
74
|
+
getDiagnostics(): {
|
|
75
|
+
pattern: string;
|
|
76
|
+
responseCount: number;
|
|
77
|
+
messageCount: number;
|
|
78
|
+
conversationId: string | undefined;
|
|
79
|
+
lastResponseTime: number | null;
|
|
80
|
+
};
|
|
81
|
+
/** Detect whether the response carries a has_more field */
|
|
82
|
+
private detectHasMore;
|
|
83
|
+
/** Find the message array in the raw JSON (unparsed raw objects), used to extract pre_msg_id etc. */
|
|
84
|
+
private findRawMessageArray;
|
|
85
|
+
/**
|
|
86
|
+
* Recursively search the JSON for the message array.
|
|
87
|
+
* Supports common IM API response formats:
|
|
88
|
+
* { data: { list: [...] } }
|
|
89
|
+
* { result: { messages: [...] } }
|
|
90
|
+
* { data: { chat_list: [...] } }
|
|
91
|
+
* { messages: [...] }
|
|
92
|
+
* [...] (a bare array)
|
|
93
|
+
*/
|
|
94
|
+
private extractMessages;
|
|
95
|
+
/** Try to parse an array as a message list */
|
|
96
|
+
private tryParseMessageArray;
|
|
97
|
+
/** Check whether an object looks like a message */
|
|
98
|
+
private looksLikeMessage;
|
|
99
|
+
/** Parse a single message */
|
|
100
|
+
private parseOneMessage;
|
|
101
|
+
/** Extract the message text content */
|
|
102
|
+
private extractContent;
|
|
103
|
+
/** Extract the role */
|
|
104
|
+
private extractRole;
|
|
105
|
+
/** Extract the time */
|
|
106
|
+
private extractTime;
|
|
107
|
+
/** Extract media attachments from an API message object */
|
|
108
|
+
private extractMediaFromApi;
|
|
109
|
+
}
|
|
110
|
+
/**
|
|
111
|
+
* Extract the conversation ID from a request body.
|
|
112
|
+
* Supports: data.list.with.id (Pinduoduo), conversation_id, session_id, etc.
|
|
113
|
+
*/
|
|
114
|
+
export declare function extractConversationId(postData: string): string | undefined;
|
|
115
|
+
//# sourceMappingURL=api-conversation-capture.d.ts.map
|
|
@@ -0,0 +1,518 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.ApiConversationCapture = void 0;
|
|
4
|
+
exports.extractConversationId = extractConversationId;
|
|
5
|
+
/** API 对话采集器 */
|
|
6
|
+
class ApiConversationCapture {
|
|
7
|
+
constructor(pattern, onChange) {
|
|
8
|
+
/** 已采集的消息(按 ID 去重) */
|
|
9
|
+
this.messageMap = new Map();
|
|
10
|
+
/** 原始响应缓存(用于调试) */
|
|
11
|
+
this.rawResponses = [];
|
|
12
|
+
/** 当前正在自动翻页 */
|
|
13
|
+
this._paginating = false;
|
|
14
|
+
this.pattern = pattern;
|
|
15
|
+
this.onChange = onChange;
|
|
16
|
+
}
|
|
17
|
+
/** 检查 URL 是否匹配 */
|
|
18
|
+
matches(url) {
|
|
19
|
+
return url.includes(this.pattern);
|
|
20
|
+
}
|
|
21
|
+
get isPaginating() { return this._paginating; }
|
|
22
|
+
set isPaginating(v) { this._paginating = v; }
|
|
23
|
+
/** 保存请求模板 */
|
|
24
|
+
saveRequestTemplate(req) {
|
|
25
|
+
// 只保存第一个匹配的请求(包含正确的 Cookie 和 headers)
|
|
26
|
+
if (!this.capturedRequest) {
|
|
27
|
+
this.capturedRequest = req;
|
|
28
|
+
}
|
|
29
|
+
}
|
|
30
|
+
/** 处理一个 API 响应体,返回翻页信息 */
|
|
31
|
+
processResponse(url, body) {
|
|
32
|
+
const empty = { newMessageCount: 0, responseMessageCount: 0, hasMore: false };
|
|
33
|
+
let data;
|
|
34
|
+
try {
|
|
35
|
+
data = JSON.parse(body);
|
|
36
|
+
}
|
|
37
|
+
catch {
|
|
38
|
+
return empty;
|
|
39
|
+
}
|
|
40
|
+
this.rawResponses.push({ url, timestamp: Date.now(), bodySize: body.length });
|
|
41
|
+
// 检测响应级 has_more 字段
|
|
42
|
+
const responseHasMore = this.detectHasMore(data);
|
|
43
|
+
// 从响应中提取消息数组(返回的是解析后的 RawApiMessage)
|
|
44
|
+
const messages = this.extractMessages(data);
|
|
45
|
+
if (messages.length === 0)
|
|
46
|
+
return empty;
|
|
47
|
+
// 同时保留原始消息数组,用于提取 pre_msg_id 等原始字段
|
|
48
|
+
const rawMessages = this.findRawMessageArray(data);
|
|
49
|
+
let newCount = 0;
|
|
50
|
+
for (const msg of messages) {
|
|
51
|
+
if (!this.messageMap.has(msg.id)) {
|
|
52
|
+
this.messageMap.set(msg.id, msg);
|
|
53
|
+
newCount++;
|
|
54
|
+
}
|
|
55
|
+
}
|
|
56
|
+
if (newCount > 0 && this.onChange) {
|
|
57
|
+
this.onChange();
|
|
58
|
+
}
|
|
59
|
+
// 确定翻页 cursor:取最后一条(最早的)消息的 msg_id
|
|
60
|
+
// 接口通常返回按时间倒序(最新在前),最后一条是最早的
|
|
61
|
+
let oldestMsgId;
|
|
62
|
+
if (rawMessages && rawMessages.length > 0) {
|
|
63
|
+
const lastRaw = rawMessages[rawMessages.length - 1];
|
|
64
|
+
// 优先用 pre_msg_id(指向下一页的起始点,避免重复)
|
|
65
|
+
oldestMsgId = String(lastRaw.pre_msg_id || lastRaw.msg_id || lastRaw.message_id || lastRaw.id || '');
|
|
66
|
+
}
|
|
67
|
+
if (!oldestMsgId && messages.length > 0) {
|
|
68
|
+
oldestMsgId = messages[messages.length - 1].id;
|
|
69
|
+
}
|
|
70
|
+
return {
|
|
71
|
+
newMessageCount: newCount,
|
|
72
|
+
responseMessageCount: messages.length,
|
|
73
|
+
oldestMsgId: oldestMsgId || undefined,
|
|
74
|
+
// 使用接口的 has_more 字段,或根据新消息数判断
|
|
75
|
+
hasMore: responseHasMore !== undefined ? (responseHasMore && newCount > 0) : (newCount > 0 && messages.length > 1),
|
|
76
|
+
};
|
|
77
|
+
}
|
|
78
|
+
/**
|
|
79
|
+
* 构建翻页请求的 postData。
|
|
80
|
+
* 在原始请求体中替换 start_msg_id 为新的 cursor。
|
|
81
|
+
*/
|
|
82
|
+
buildPageRequestBody(oldestMsgId) {
|
|
83
|
+
if (!this.capturedRequest?.postData)
|
|
84
|
+
return undefined;
|
|
85
|
+
try {
|
|
86
|
+
const body = JSON.parse(this.capturedRequest.postData);
|
|
87
|
+
// 递归查找并替换 start_msg_id / startMsgId / cursor / pageToken 等
|
|
88
|
+
const replaced = this.replacePageCursor(body, oldestMsgId);
|
|
89
|
+
if (replaced)
|
|
90
|
+
return JSON.stringify(body);
|
|
91
|
+
// 如果没找到已知字段,尝试在顶层 data 中查找
|
|
92
|
+
return JSON.stringify(body);
|
|
93
|
+
}
|
|
94
|
+
catch {
|
|
95
|
+
return undefined;
|
|
96
|
+
}
|
|
97
|
+
}
|
|
98
|
+
/** 递归替换分页 cursor 字段 */
|
|
99
|
+
replacePageCursor(obj, newValue, depth = 0) {
|
|
100
|
+
if (depth > 10 || !obj || typeof obj !== 'object')
|
|
101
|
+
return false;
|
|
102
|
+
// 已知的分页 cursor 字段名
|
|
103
|
+
const cursorFields = [
|
|
104
|
+
'start_msg_id', 'startMsgId', 'start_message_id',
|
|
105
|
+
'cursor', 'pageToken', 'page_token', 'next_cursor',
|
|
106
|
+
'before_id', 'beforeId', 'min_id', 'minId',
|
|
107
|
+
'last_msg_id', 'lastMsgId', 'offset_msg_id',
|
|
108
|
+
];
|
|
109
|
+
let found = false;
|
|
110
|
+
for (const field of cursorFields) {
|
|
111
|
+
if (obj[field] !== undefined) {
|
|
112
|
+
obj[field] = newValue;
|
|
113
|
+
found = true;
|
|
114
|
+
}
|
|
115
|
+
}
|
|
116
|
+
// 递归进入子对象
|
|
117
|
+
if (!found) {
|
|
118
|
+
for (const val of Object.values(obj)) {
|
|
119
|
+
if (val && typeof val === 'object' && !Array.isArray(val)) {
|
|
120
|
+
if (this.replacePageCursor(val, newValue, depth + 1)) {
|
|
121
|
+
found = true;
|
|
122
|
+
}
|
|
123
|
+
}
|
|
124
|
+
}
|
|
125
|
+
}
|
|
126
|
+
return found;
|
|
127
|
+
}
|
|
128
|
+
/** 获取累积的完整对话(按时间排序,去重后) */
|
|
129
|
+
getMessages() {
|
|
130
|
+
const sorted = Array.from(this.messageMap.values())
|
|
131
|
+
.sort((a, b) => (a.sortTime ?? 0) - (b.sortTime ?? 0));
|
|
132
|
+
return sorted.map(m => ({
|
|
133
|
+
role: m.role,
|
|
134
|
+
content: m.content,
|
|
135
|
+
...(m.timestamp ? { timestamp: m.timestamp } : {}),
|
|
136
|
+
...(m.media?.length ? { media: m.media } : {}),
|
|
137
|
+
}));
|
|
138
|
+
}
|
|
139
|
+
/** 获取消息数量 */
|
|
140
|
+
get messageCount() {
|
|
141
|
+
return this.messageMap.size;
|
|
142
|
+
}
|
|
143
|
+
/** 获取诊断信息 */
|
|
144
|
+
getDiagnostics() {
|
|
145
|
+
return {
|
|
146
|
+
pattern: this.pattern,
|
|
147
|
+
responseCount: this.rawResponses.length,
|
|
148
|
+
messageCount: this.messageMap.size,
|
|
149
|
+
conversationId: this.capturedRequest?.conversationId,
|
|
150
|
+
lastResponseTime: this.rawResponses.length > 0
|
|
151
|
+
? this.rawResponses[this.rawResponses.length - 1].timestamp
|
|
152
|
+
: null,
|
|
153
|
+
};
|
|
154
|
+
}
|
|
155
|
+
// ========== 响应级字段解析 ==========
|
|
156
|
+
/** 检测响应中是否有 has_more 字段 */
|
|
157
|
+
detectHasMore(data, depth = 0) {
|
|
158
|
+
if (depth > 6 || !data || typeof data !== 'object')
|
|
159
|
+
return undefined;
|
|
160
|
+
const fields = ['has_more', 'hasMore', 'has_next', 'hasNext', 'more'];
|
|
161
|
+
for (const f of fields) {
|
|
162
|
+
if (data[f] !== undefined)
|
|
163
|
+
return !!data[f];
|
|
164
|
+
}
|
|
165
|
+
// 递归进入 result / data
|
|
166
|
+
for (const key of ['result', 'data']) {
|
|
167
|
+
if (data[key] && typeof data[key] === 'object') {
|
|
168
|
+
const found = this.detectHasMore(data[key], depth + 1);
|
|
169
|
+
if (found !== undefined)
|
|
170
|
+
return found;
|
|
171
|
+
}
|
|
172
|
+
}
|
|
173
|
+
return undefined;
|
|
174
|
+
}
|
|
175
|
+
/** 在原始 JSON 中找到消息数组(未经解析的原始对象),用于提取 pre_msg_id 等 */
|
|
176
|
+
findRawMessageArray(data, depth = 0) {
|
|
177
|
+
if (depth > 8 || !data || typeof data !== 'object')
|
|
178
|
+
return null;
|
|
179
|
+
if (Array.isArray(data) && data.length > 0 && data[0]?.msg_id)
|
|
180
|
+
return data;
|
|
181
|
+
const keys = ['messages', 'message_list', 'msg_list', 'chat_list', 'list', 'data', 'result', 'records', 'items'];
|
|
182
|
+
for (const key of keys) {
|
|
183
|
+
if (data[key] !== undefined) {
|
|
184
|
+
const found = this.findRawMessageArray(data[key], depth + 1);
|
|
185
|
+
if (found)
|
|
186
|
+
return found;
|
|
187
|
+
}
|
|
188
|
+
}
|
|
189
|
+
return null;
|
|
190
|
+
}
|
|
191
|
+
// ========== 消息提取:自适应多种 API 格式 ==========
|
|
192
|
+
/**
|
|
193
|
+
* 递归搜索 JSON 中的消息数组。
|
|
194
|
+
* 支持常见 IM 接口返回格式:
|
|
195
|
+
* { data: { list: [...] } }
|
|
196
|
+
* { result: { messages: [...] } }
|
|
197
|
+
* { data: { chat_list: [...] } }
|
|
198
|
+
* { messages: [...] }
|
|
199
|
+
* [...] (直接数组)
|
|
200
|
+
*/
|
|
201
|
+
extractMessages(data, depth = 0) {
|
|
202
|
+
if (depth > 8)
|
|
203
|
+
return [];
|
|
204
|
+
if (Array.isArray(data)) {
|
|
205
|
+
const msgs = this.tryParseMessageArray(data);
|
|
206
|
+
if (msgs.length > 0)
|
|
207
|
+
return msgs;
|
|
208
|
+
}
|
|
209
|
+
if (data && typeof data === 'object' && !Array.isArray(data)) {
|
|
210
|
+
const msgKeys = [
|
|
211
|
+
'messages', 'message_list', 'msg_list', 'chat_list',
|
|
212
|
+
'list', 'data', 'result', 'records', 'items',
|
|
213
|
+
'chat_messages', 'im_messages', 'conversations',
|
|
214
|
+
];
|
|
215
|
+
for (const key of msgKeys) {
|
|
216
|
+
if (data[key] !== undefined) {
|
|
217
|
+
const found = this.extractMessages(data[key], depth + 1);
|
|
218
|
+
if (found.length > 0)
|
|
219
|
+
return found;
|
|
220
|
+
}
|
|
221
|
+
}
|
|
222
|
+
for (const [key, val] of Object.entries(data)) {
|
|
223
|
+
if (msgKeys.includes(key))
|
|
224
|
+
continue;
|
|
225
|
+
if (val && typeof val === 'object') {
|
|
226
|
+
const found = this.extractMessages(val, depth + 1);
|
|
227
|
+
if (found.length > 0)
|
|
228
|
+
return found;
|
|
229
|
+
}
|
|
230
|
+
}
|
|
231
|
+
}
|
|
232
|
+
return [];
|
|
233
|
+
}
|
|
234
|
+
/** 尝试将数组解析为消息列表 */
|
|
235
|
+
tryParseMessageArray(arr) {
|
|
236
|
+
if (arr.length === 0)
|
|
237
|
+
return [];
|
|
238
|
+
let looksLikeMessages = 0;
|
|
239
|
+
for (const item of arr) {
|
|
240
|
+
if (item && typeof item === 'object' && this.looksLikeMessage(item)) {
|
|
241
|
+
looksLikeMessages++;
|
|
242
|
+
}
|
|
243
|
+
}
|
|
244
|
+
if (looksLikeMessages < 2 || looksLikeMessages / arr.length < 0.4) {
|
|
245
|
+
return [];
|
|
246
|
+
}
|
|
247
|
+
const results = [];
|
|
248
|
+
for (const item of arr) {
|
|
249
|
+
if (!item || typeof item !== 'object')
|
|
250
|
+
continue;
|
|
251
|
+
const msg = this.parseOneMessage(item);
|
|
252
|
+
if (msg)
|
|
253
|
+
results.push(msg);
|
|
254
|
+
}
|
|
255
|
+
return results;
|
|
256
|
+
}
|
|
257
|
+
/** 检查一个对象是否像消息 */
|
|
258
|
+
looksLikeMessage(obj) {
|
|
259
|
+
const contentFields = ['content', 'text', 'msg', 'body', 'message', 'msg_content',
|
|
260
|
+
'chat_content', 'im_content', 'richContent', 'rich_content'];
|
|
261
|
+
const hasContent = contentFields.some(f => obj[f] !== undefined);
|
|
262
|
+
const typeFields = ['msg_type', 'message_type', 'type', 'msgType', 'contentType'];
|
|
263
|
+
const hasType = typeFields.some(f => obj[f] !== undefined);
|
|
264
|
+
const roleFields = ['role', 'sender', 'from', 'sender_id', 'send_id', 'user_id',
|
|
265
|
+
'from_id', 'direction', 'is_self', 'isSelf', 'senderId'];
|
|
266
|
+
const hasRole = roleFields.some(f => obj[f] !== undefined);
|
|
267
|
+
// msg_id 是消息的强信号
|
|
268
|
+
const hasId = obj.msg_id !== undefined || obj.message_id !== undefined || obj.msgId !== undefined;
|
|
269
|
+
return (hasContent && hasRole) || (hasContent && hasType) || (hasRole && hasType) || (hasId && hasContent);
|
|
270
|
+
}
|
|
271
|
+
/** 解析单条消息 */
|
|
272
|
+
parseOneMessage(obj) {
|
|
273
|
+
const id = String(obj.id || obj.msg_id || obj.message_id || obj.msgId || obj.messageId ||
|
|
274
|
+
obj.seq || obj.sequence || obj._id || `auto_${JSON.stringify(obj).substring(0, 100)}`);
|
|
275
|
+
const content = this.extractContent(obj);
|
|
276
|
+
const role = this.extractRole(obj);
|
|
277
|
+
const { timestamp, sortTime } = this.extractTime(obj);
|
|
278
|
+
const media = this.extractMediaFromApi(obj);
|
|
279
|
+
if (!content && media.length === 0)
|
|
280
|
+
return null;
|
|
281
|
+
return { id, role, content, timestamp, sortTime, media: media.length > 0 ? media : undefined };
|
|
282
|
+
}
|
|
283
|
+
/** 提取消息文本内容 */
|
|
284
|
+
extractContent(obj) {
|
|
285
|
+
const textFields = ['content', 'text', 'msg', 'body', 'message', 'msg_content',
|
|
286
|
+
'chat_content', 'im_content'];
|
|
287
|
+
for (const f of textFields) {
|
|
288
|
+
if (typeof obj[f] === 'string' && obj[f].trim()) {
|
|
289
|
+
return obj[f].trim();
|
|
290
|
+
}
|
|
291
|
+
}
|
|
292
|
+
const richFields = ['richContent', 'rich_content', 'content'];
|
|
293
|
+
for (const f of richFields) {
|
|
294
|
+
if (obj[f] && typeof obj[f] === 'object') {
|
|
295
|
+
const rich = obj[f];
|
|
296
|
+
if (typeof rich.text === 'string')
|
|
297
|
+
return rich.text.trim();
|
|
298
|
+
if (Array.isArray(rich)) {
|
|
299
|
+
const texts = rich
|
|
300
|
+
.filter((r) => r.type === 'text' && typeof r.content === 'string')
|
|
301
|
+
.map((r) => r.content.trim());
|
|
302
|
+
if (texts.length > 0)
|
|
303
|
+
return texts.join(' ');
|
|
304
|
+
}
|
|
305
|
+
}
|
|
306
|
+
}
|
|
307
|
+
return '';
|
|
308
|
+
}
|
|
309
|
+
/** 提取角色 */
|
|
310
|
+
extractRole(obj) {
|
|
311
|
+
// ① 最高优先级:from.role(拼多多等 IM 接口的标准格式)
|
|
312
|
+
const fromRole = (obj.from?.role || '').toLowerCase();
|
|
313
|
+
if (fromRole) {
|
|
314
|
+
if (/system|notice|tip|notify|robot|bot/.test(fromRole))
|
|
315
|
+
return 'system';
|
|
316
|
+
// mall_cs / cs / agent / operator / staff / service 等 → agent
|
|
317
|
+
if (/cs|agent|operator|staff|service|kefu|客服|坐席|merchant|seller|mall/.test(fromRole))
|
|
318
|
+
return 'agent';
|
|
319
|
+
// user / buyer / customer / visitor 等 → customer
|
|
320
|
+
if (/user|customer|buyer|visitor|guest|客户|买家|访客/.test(fromRole))
|
|
321
|
+
return 'customer';
|
|
322
|
+
}
|
|
323
|
+
// ② 顶层 role / sender_type 等字段
|
|
324
|
+
const roleVal = String(obj.role || obj.sender_type || obj.senderType || obj.user_type || obj.userType || '').toLowerCase();
|
|
325
|
+
if (/system|notice|tip|notify/.test(roleVal))
|
|
326
|
+
return 'system';
|
|
327
|
+
if (/cs|agent|operator|staff|service|kefu|客服|坐席|merchant|seller|mall/.test(roleVal))
|
|
328
|
+
return 'agent';
|
|
329
|
+
if (/user|customer|buyer|visitor|guest|客户|买家|访客/.test(roleVal))
|
|
330
|
+
return 'customer';
|
|
331
|
+
// ③ direction 字段(0/1 或 in/out)
|
|
332
|
+
const dir = obj.direction ?? obj.dir ?? obj.flow;
|
|
333
|
+
if (dir === 'in' || dir === 'receive' || dir === 0 || dir === '0')
|
|
334
|
+
return 'customer';
|
|
335
|
+
if (dir === 'out' || dir === 'send' || dir === 1 || dir === '1')
|
|
336
|
+
return 'agent';
|
|
337
|
+
// ④ is_self 字段
|
|
338
|
+
if (obj.is_self === true || obj.isSelf === true || obj.is_mine === true)
|
|
339
|
+
return 'agent';
|
|
340
|
+
if (obj.is_self === false || obj.isSelf === false)
|
|
341
|
+
return 'customer';
|
|
342
|
+
// ⑤ to.role 反推(如果 to 是 user,说明 from 是 agent)
|
|
343
|
+
const toRole = (obj.to?.role || '').toLowerCase();
|
|
344
|
+
if (/user|customer|buyer/.test(toRole))
|
|
345
|
+
return 'agent';
|
|
346
|
+
if (/cs|agent|operator|mall/.test(toRole))
|
|
347
|
+
return 'customer';
|
|
348
|
+
return 'agent';
|
|
349
|
+
}
|
|
350
|
+
/** 提取时间 */
|
|
351
|
+
extractTime(obj) {
|
|
352
|
+
const timeFields = ['ts', 'timestamp', 'time', 'created_at', 'createdAt', 'create_time',
|
|
353
|
+
'createTime', 'send_time', 'sendTime', 'msg_time', 'msgTime', 'date'];
|
|
354
|
+
for (const f of timeFields) {
|
|
355
|
+
const val = obj[f];
|
|
356
|
+
if (val === undefined || val === null)
|
|
357
|
+
continue;
|
|
358
|
+
if (typeof val === 'number') {
|
|
359
|
+
const ms = val > 1e12 ? val : val * 1000;
|
|
360
|
+
return { timestamp: new Date(ms).toISOString(), sortTime: ms };
|
|
361
|
+
}
|
|
362
|
+
if (typeof val === 'string') {
|
|
363
|
+
const num = Number(val);
|
|
364
|
+
if (!isNaN(num) && num > 1e9) {
|
|
365
|
+
const ms = num > 1e12 ? num : num * 1000;
|
|
366
|
+
return { timestamp: new Date(ms).toISOString(), sortTime: ms };
|
|
367
|
+
}
|
|
368
|
+
const d = new Date(val);
|
|
369
|
+
if (!isNaN(d.getTime())) {
|
|
370
|
+
return { timestamp: val, sortTime: d.getTime() };
|
|
371
|
+
}
|
|
372
|
+
return { timestamp: val, sortTime: undefined };
|
|
373
|
+
}
|
|
374
|
+
}
|
|
375
|
+
return {};
|
|
376
|
+
}
|
|
377
|
+
/** 从 API 消息对象中提取媒体附件 */
|
|
378
|
+
extractMediaFromApi(obj) {
|
|
379
|
+
const media = [];
|
|
380
|
+
const seenUrls = new Set();
|
|
381
|
+
const addMedia = (item) => {
|
|
382
|
+
if (item.url && !seenUrls.has(item.url)) {
|
|
383
|
+
seenUrls.add(item.url);
|
|
384
|
+
media.push(item);
|
|
385
|
+
}
|
|
386
|
+
};
|
|
387
|
+
// ① 图片字段
|
|
388
|
+
const imgFields = ['image', 'img', 'img_url', 'image_url', 'imageUrl', 'imgUrl',
|
|
389
|
+
'pic', 'pic_url', 'picUrl', 'photo', 'photo_url', 'thumbnail', 'thumb'];
|
|
390
|
+
for (const f of imgFields) {
|
|
391
|
+
const val = obj[f];
|
|
392
|
+
if (typeof val === 'string' && val.startsWith('http')) {
|
|
393
|
+
addMedia({ type: 'image', url: val });
|
|
394
|
+
}
|
|
395
|
+
else if (val && typeof val === 'object' && val.url) {
|
|
396
|
+
addMedia({ type: 'image', url: val.url, width: val.width, height: val.height });
|
|
397
|
+
}
|
|
398
|
+
}
|
|
399
|
+
// ② 视频字段
|
|
400
|
+
const videoFields = ['video', 'video_url', 'videoUrl'];
|
|
401
|
+
for (const f of videoFields) {
|
|
402
|
+
const val = obj[f];
|
|
403
|
+
if (typeof val === 'string' && val.startsWith('http')) {
|
|
404
|
+
addMedia({ type: 'video', url: val });
|
|
405
|
+
}
|
|
406
|
+
else if (val && typeof val === 'object' && val.url) {
|
|
407
|
+
addMedia({ type: 'video', url: val.url });
|
|
408
|
+
}
|
|
409
|
+
}
|
|
410
|
+
// ③ 文件字段
|
|
411
|
+
const fileFields = ['file', 'attachment', 'attachments', 'file_url', 'fileUrl'];
|
|
412
|
+
for (const f of fileFields) {
|
|
413
|
+
const val = obj[f];
|
|
414
|
+
if (typeof val === 'string' && val.startsWith('http')) {
|
|
415
|
+
addMedia({ type: 'file', url: val });
|
|
416
|
+
}
|
|
417
|
+
else if (val && typeof val === 'object' && val.url) {
|
|
418
|
+
addMedia({
|
|
419
|
+
type: 'file', url: val.url,
|
|
420
|
+
alt: val.name || val.fileName || val.file_name,
|
|
421
|
+
fileSize: val.size ? String(val.size) : undefined,
|
|
422
|
+
mimeType: val.mime || val.mimeType || val.type,
|
|
423
|
+
});
|
|
424
|
+
}
|
|
425
|
+
else if (Array.isArray(val)) {
|
|
426
|
+
for (const item of val) {
|
|
427
|
+
if (item && typeof item === 'object' && item.url) {
|
|
428
|
+
addMedia({
|
|
429
|
+
type: 'file', url: item.url,
|
|
430
|
+
alt: item.name || item.fileName || item.file_name,
|
|
431
|
+
fileSize: item.size ? String(item.size) : undefined,
|
|
432
|
+
mimeType: item.mime || item.mimeType || item.type,
|
|
433
|
+
});
|
|
434
|
+
}
|
|
435
|
+
}
|
|
436
|
+
}
|
|
437
|
+
}
|
|
438
|
+
// ④ 富文本中的媒体
|
|
439
|
+
const richFields = ['richContent', 'rich_content', 'content'];
|
|
440
|
+
for (const f of richFields) {
|
|
441
|
+
const val = obj[f];
|
|
442
|
+
if (Array.isArray(val)) {
|
|
443
|
+
for (const part of val) {
|
|
444
|
+
if (!part || typeof part !== 'object')
|
|
445
|
+
continue;
|
|
446
|
+
const partType = (part.type || '').toLowerCase();
|
|
447
|
+
if (partType === 'image' && part.url) {
|
|
448
|
+
addMedia({ type: 'image', url: part.url, width: part.width, height: part.height });
|
|
449
|
+
}
|
|
450
|
+
else if (partType === 'video' && part.url) {
|
|
451
|
+
addMedia({ type: 'video', url: part.url });
|
|
452
|
+
}
|
|
453
|
+
else if (partType === 'file' && part.url) {
|
|
454
|
+
addMedia({ type: 'file', url: part.url, alt: part.name, fileSize: part.size ? String(part.size) : undefined });
|
|
455
|
+
}
|
|
456
|
+
else if (partType === 'audio' && part.url) {
|
|
457
|
+
addMedia({ type: 'audio', url: part.url });
|
|
458
|
+
}
|
|
459
|
+
}
|
|
460
|
+
}
|
|
461
|
+
}
|
|
462
|
+
// ⑤ 音频字段
|
|
463
|
+
const audioFields = ['audio', 'audio_url', 'audioUrl', 'voice', 'voice_url'];
|
|
464
|
+
for (const f of audioFields) {
|
|
465
|
+
const val = obj[f];
|
|
466
|
+
if (typeof val === 'string' && val.startsWith('http')) {
|
|
467
|
+
addMedia({ type: 'audio', url: val });
|
|
468
|
+
}
|
|
469
|
+
else if (val && typeof val === 'object' && val.url) {
|
|
470
|
+
addMedia({ type: 'audio', url: val.url });
|
|
471
|
+
}
|
|
472
|
+
}
|
|
473
|
+
return media;
|
|
474
|
+
}
|
|
475
|
+
}
|
|
476
|
+
exports.ApiConversationCapture = ApiConversationCapture;
|
|
477
|
+
// ========== 工具函数 ==========
|
|
478
|
+
/**
|
|
479
|
+
* 从请求体中提取会话 ID。
|
|
480
|
+
* 支持:data.list.with.id(拼多多), conversation_id, session_id 等
|
|
481
|
+
*/
|
|
482
|
+
function extractConversationId(postData) {
|
|
483
|
+
try {
|
|
484
|
+
const body = JSON.parse(postData);
|
|
485
|
+
return findConversationId(body);
|
|
486
|
+
}
|
|
487
|
+
catch {
|
|
488
|
+
return undefined;
|
|
489
|
+
}
|
|
490
|
+
}
|
|
491
|
+
function findConversationId(obj, depth = 0) {
|
|
492
|
+
if (depth > 6 || !obj || typeof obj !== 'object')
|
|
493
|
+
return undefined;
|
|
494
|
+
// 已知字段名
|
|
495
|
+
const idFields = [
|
|
496
|
+
'conversation_id', 'conversationId', 'session_id', 'sessionId',
|
|
497
|
+
'chat_id', 'chatId', 'thread_id', 'threadId', 'room_id', 'roomId',
|
|
498
|
+
];
|
|
499
|
+
for (const f of idFields) {
|
|
500
|
+
if (obj[f] !== undefined)
|
|
501
|
+
return String(obj[f]);
|
|
502
|
+
}
|
|
503
|
+
// 拼多多特殊结构:data.list.with.id
|
|
504
|
+
if (obj.with?.id !== undefined)
|
|
505
|
+
return String(obj.with.id);
|
|
506
|
+
if (obj.list?.with?.id !== undefined)
|
|
507
|
+
return String(obj.list.with.id);
|
|
508
|
+
// 递归
|
|
509
|
+
for (const val of Object.values(obj)) {
|
|
510
|
+
if (val && typeof val === 'object' && !Array.isArray(val)) {
|
|
511
|
+
const found = findConversationId(val, depth + 1);
|
|
512
|
+
if (found)
|
|
513
|
+
return found;
|
|
514
|
+
}
|
|
515
|
+
}
|
|
516
|
+
return undefined;
|
|
517
|
+
}
|
|
518
|
+
//# sourceMappingURL=api-conversation-capture.js.map
|