hyper-agent-browser 0.1.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,215 @@
1
+ import type { Page } from "patchright";
2
+
3
/**
 * Structured result of extracting a single table from a page.
 */
export interface TableData {
  /** Discriminator marking this payload as table data. */
  type: "table";
  /** URL of the page the table was read from. */
  url: string;
  /** Selector used to locate the table ("table" when the caller supplied none). */
  selector: string;
  /** Epoch milliseconds captured when the extraction started. */
  timestamp: number;
  /** Column names, in column order. */
  headers: Array<string>;
  /** Number of extracted rows; equals `data.length`. */
  rows: number;
  /** One record per extracted row, keyed by column name. */
  data: Array<Record<string, string>>;
}
12
+
13
export class TableExtractor {
  /**
   * Extracts one table from the page as row records keyed by column header.
   *
   * Lookup order inside the page:
   * 1. With `selector`: the matched element if it is a `<table>`, otherwise the
   *    first `<table>` nested inside it.
   * 2. Without `selector`: the first `<table>` whose computed style is neither
   *    `display: none` nor `visibility: hidden`.
   * 3. If no HTML table was found: the element matched by `selector`, or the
   *    first `[role="table"]` / `[role="grid"]` element, read as an ARIA table.
   *
   * @param page - Page to read from.
   * @param selector - Optional CSS selector for the table or a container of it.
   * @param options - `maxRows` caps the number of returned rows (unbounded by
   *   default). `includeHeaders` is accepted for compatibility but is not
   *   consulted by this implementation.
   * @returns The extracted headers and rows together with page metadata.
   * @throws Error "No table found on page" (raised inside the page) when
   *   neither an HTML table nor an ARIA table can be located.
   */
  async extract(
    page: Page,
    selector?: string,
    options: { includeHeaders?: boolean; maxRows?: number } = {},
  ): Promise<TableData> {
    const maxRows = options.maxRows ?? Number.POSITIVE_INFINITY;

    const url = page.url();
    const timestamp = Date.now();

    // The callback below is serialized and executed in the browser, so it cannot
    // reference anything from this module's scope. Every helper it needs is
    // therefore declared inside the callback itself.
    const result = await page.evaluate(
      ({ selector, maxRows }) => {
        type Extracted = { headers: string[]; data: Record<string, string>[] };

        // Every role a child of an ARIA row may carry as a column cell.
        const ARIA_CELLS =
          '[role="columnheader"], [role="rowheader"], [role="gridcell"], [role="cell"]';

        const isHidden = (element: Element): boolean => {
          const style = window.getComputedStyle(element);
          return style.display === "none" || style.visibility === "hidden";
        };

        const textOf = (cell: Element): string => cell.textContent?.trim() ?? "";

        // Blank header cells receive a positional placeholder name.
        const labelHeaders = (cells: Element[]): string[] =>
          cells.map((cell, index) => textOf(cell) || `column_${index + 1}`);

        const hasContent = (record: Record<string, string>): boolean =>
          Object.values(record).some((value) => value !== "");

        // Missing, invalid or non-positive colspan values count as a single column.
        const spanOf = (cell: Element): number => {
          const parsed = Number.parseInt(cell.getAttribute("colspan") ?? "1", 10);
          return parsed > 1 ? parsed : 1;
        };

        // Reads a native <table>: the header comes from <thead> (or the first row).
        const extractHtmlTable = (table: HTMLTableElement, limit: number): Extracted => {
          const thead = table.querySelector("thead");
          const headerRow = thead?.querySelector("tr") ?? table.querySelector("tr");
          const headers = headerRow
            ? labelHeaders(Array.from(headerRow.querySelectorAll("th, td")))
            : [];

          const body = table.querySelector("tbody") ?? table;
          const data: Record<string, string>[] = [];

          for (const row of Array.from(body.querySelectorAll("tr"))) {
            if (data.length >= limit) break;
            // Never report the header row (or any other <thead> row) as data,
            // regardless of whether rows were read from <tbody> or the table.
            if (row === headerRow || thead?.contains(row) || isHidden(row)) continue;

            const record: Record<string, string> = {};
            // Track the column position separately from the cell position so a
            // cell following a colspan lands in the correct column.
            let column = 0;
            for (const cell of Array.from(row.querySelectorAll("td, th"))) {
              if (column >= headers.length) break;
              const text = textOf(cell);
              for (let remaining = spanOf(cell); remaining > 0; remaining--) {
                const key = headers[column];
                if (key === undefined) break;
                record[key] = text;
                column++;
              }
            }

            // Rows without any text are dropped.
            if (hasContent(record)) data.push(record);
          }

          return { headers, data };
        };

        // Reads an ARIA table/grid: the first row in document order is the header.
        const extractAriaTable = (root: Element, limit: number): Extracted => {
          const [headerRow, ...bodyRows] = Array.from(root.querySelectorAll('[role="row"]'));
          const headers = headerRow
            ? labelHeaders(Array.from(headerRow.querySelectorAll(ARIA_CELLS)))
            : [];
          const data: Record<string, string>[] = [];

          for (const row of bodyRows) {
            if (data.length >= limit) break;
            if (isHidden(row)) continue;

            const record: Record<string, string> = {};
            Array.from(row.querySelectorAll(ARIA_CELLS)).forEach((cell, index) => {
              const key = headers[index];
              if (key !== undefined) record[key] = textOf(cell);
            });

            if (hasContent(record)) data.push(record);
          }

          return { headers, data };
        };

        const target = selector ? document.querySelector(selector) : null;
        let table: HTMLTableElement | null = null;

        if (selector) {
          if (target instanceof HTMLTableElement) {
            table = target;
          } else if (target) {
            // The selector matched a container: use the first table inside it.
            table = target.querySelector("table");
          }
        } else {
          table =
            Array.from(document.querySelectorAll("table")).find(
              (candidate) => !isHidden(candidate),
            ) ?? null;
        }

        if (table) {
          return extractHtmlTable(table, maxRows);
        }

        // No native table: fall back to an ARIA table or grid.
        const ariaRoot = selector
          ? target
          : document.querySelector('[role="table"], [role="grid"]');
        if (ariaRoot) {
          return extractAriaTable(ariaRoot, maxRows);
        }

        throw new Error("No table found on page");
      },
      { selector: selector ?? "", maxRows },
    );

    return {
      type: "table",
      url,
      selector: selector || "table",
      timestamp,
      headers: result.headers,
      rows: result.data.length,
      data: result.data,
    };
  }
}
@@ -0,0 +1,247 @@
1
+ import { existsSync } from "node:fs";
2
+ import { appendFile, mkdir, readFile, writeFile } from "node:fs/promises";
3
+ import { join } from "node:path";
4
+ import type { Page } from "patchright";
5
+
6
/**
 * Settings and state of one network listener. The whole object is persisted
 * as the listener's `meta.json`.
 */
export interface NetworkListenerConfig {
  /** Listener identifier; also the name of its directory under `<sessionDir>/network`. */
  id: string;
  /** Session directory under which the listener stores its files. */
  sessionDir: string;
  /** Criteria a request must satisfy to be recorded. */
  filter: {
    /** Resource types to record (for example "xhr" or "fetch"). */
    types: Array<string>;
    /** Optional glob matched against the request URL (`*` and `?` wildcards). */
    urlPattern?: string;
    /** Optional list of HTTP methods; an absent or empty list accepts every method. */
    methods?: Array<string>;
  };
  /** Start time of the listener, supplied by the creator of the config. */
  startTime: number;
  /** Lifecycle state; switched to "stopped" when the listener is stopped. */
  status: "active" | "stopped";
}
17
+
18
/**
 * One recorded request, stored as a single line of the listener's
 * `requests.jsonl` file.
 */
export interface NetworkRequest {
  /** Sequential identifier of the form `req_<n>`. */
  id: string;
  /** Requested URL. */
  url: string;
  /** HTTP method of the request. */
  method: string;
  /** Data captured from the outgoing request. */
  request: {
    /** Request headers. */
    headers: Record<string, string>;
    /** Request body as reported by the browser, when there is one. */
    postData?: any;
  };
  /** Data captured from the response; absent when no response was obtained. */
  response?: {
    /** HTTP status code. */
    status: number;
    /** HTTP status text. */
    statusText: string;
    /** Response headers. */
    headers: Record<string, string>;
    /** Parsed JSON or text body; only recorded for JSON and `text/*` responses. */
    body?: any;
  };
  /** Timestamps in epoch milliseconds. */
  timing: {
    /** When the request was observed. */
    startTime: number;
    /** When the response had been read. */
    endTime?: number;
    /** `endTime - startTime`. */
    duration?: number;
  };
}
38
+
39
/** Minimal shape of the response object the listener reads from. */
interface ObservedResponse {
  status(): number;
  statusText(): string;
  headers(): Record<string, string>;
  json(): Promise<unknown>;
  text(): Promise<string>;
}

/** Minimal shape of the object delivered with the page's "request" event. */
interface ObservedRequest {
  resourceType(): string;
  method(): string;
  url(): string;
  headers(): Record<string, string>;
  postData(): string | null;
  response(): Promise<ObservedResponse | null>;
}

/**
 * Records the requests of a page that match a filter.
 *
 * Files are written to `<sessionDir>/network/<id>/`: `meta.json` holds the
 * listener configuration and `requests.jsonl` receives one JSON document per
 * recorded request.
 */
export class NetworkListener {
  /** Listeners stop themselves after one hour. */
  private static readonly AUTO_STOP_MS = 60 * 60 * 1000;

  /** Resource types the listener is able to record. */
  private static readonly CAPTURABLE_TYPES: ReadonlySet<string> = new Set([
    "xhr",
    "fetch",
    "document",
    "script",
    "image",
    "font",
    "stylesheet",
  ]);

  private config: NetworkListenerConfig;
  private networkDir: string;
  private metaFile: string;
  private requestsFile: string;
  private requestCount = 0;
  private cleanup?: () => void;
  private autoStopTimer?: ReturnType<typeof setTimeout>;
  /** Compiled form of the most recently used `filter.urlPattern`. */
  private urlMatcher?: { pattern: string; regex: RegExp };

  constructor(config: NetworkListenerConfig) {
    this.config = config;
    this.networkDir = join(config.sessionDir, "network", config.id);
    this.metaFile = join(this.networkDir, "meta.json");
    this.requestsFile = join(this.networkDir, "requests.jsonl");
  }

  /**
   * Starts recording the page's requests.
   *
   * Creates the listener directory, writes `meta.json`, subscribes to the
   * page's "request" event and schedules the automatic stop. Calling `start`
   * again replaces the previous subscription and timer.
   *
   * @param page - Page whose requests are recorded.
   */
  async start(page: Page): Promise<void> {
    this.detach();

    // A recursive mkdir succeeds when the directory already exists.
    await mkdir(this.networkDir, { recursive: true, mode: 0o700 });
    await this.saveMeta();

    // The event emitter does not await the handler, so failures are handled
    // here rather than surfacing as unhandled promise rejections.
    const requestHandler = (request: ObservedRequest): Promise<void> =>
      this.capture(request).catch((error: unknown) => {
        console.error(`[network-listener ${this.config.id}] failed to record request:`, error);
      });

    page.on("request", requestHandler);
    this.cleanup = () => {
      page.off("request", requestHandler);
    };

    this.autoStopTimer = setTimeout(() => {
      if (this.config.status === "active") {
        this.stop().catch((error: unknown) => {
          console.error(`[network-listener ${this.config.id}] automatic stop failed:`, error);
        });
      }
    }, NetworkListener.AUTO_STOP_MS);
  }

  /**
   * Stops recording: unsubscribes from the page, cancels the automatic stop,
   * marks the listener "stopped" and rewrites `meta.json`.
   *
   * The subscription and the timer are released first so they are freed even
   * when writing the metadata fails.
   */
  async stop(): Promise<void> {
    this.detach();
    this.config.status = "stopped";
    await this.saveMeta();
  }

  /**
   * Reads every recorded request back from `requests.jsonl`.
   *
   * @returns The recorded requests in file order; an empty array when nothing
   *   has been recorded yet.
   */
  async getRequests(): Promise<NetworkRequest[]> {
    if (!existsSync(this.requestsFile)) {
      return [];
    }

    const content = await readFile(this.requestsFile, "utf-8");
    return content
      .split("\n")
      .filter((line) => line.trim() !== "")
      .map((line) => JSON.parse(line) as NetworkRequest);
  }

  /**
   * Returns a summary of the listener: id, start time, status and filter.
   */
  getInfo(): {
    listenerId: string;
    startTime: number;
    status: string;
    filter: any;
  } {
    return {
      listenerId: this.config.id,
      startTime: this.config.startTime,
      status: this.config.status,
      filter: this.config.filter,
    };
  }

  /** Releases the page subscription and the automatic-stop timer, if any. */
  private detach(): void {
    if (this.autoStopTimer !== undefined) {
      clearTimeout(this.autoStopTimer);
      this.autoStopTimer = undefined;
    }
    this.cleanup?.();
    this.cleanup = undefined;
  }

  /** Records one request, including its response when one arrives. */
  private async capture(request: ObservedRequest): Promise<void> {
    const method = request.method();
    const url = request.url();
    if (!this.shouldCapture(request.resourceType(), method, url)) {
      return;
    }

    const record: NetworkRequest = {
      id: `req_${++this.requestCount}`,
      url,
      method,
      request: {
        headers: request.headers(),
        postData: request.postData(),
      },
      timing: {
        startTime: Date.now(),
      },
    };

    try {
      const response = await request.response();
      if (response) {
        const headers = response.headers();
        record.response = {
          status: response.status(),
          statusText: response.statusText(),
          headers,
        };

        const body = await this.readBody(response, headers["content-type"] ?? "");
        if (body !== undefined) {
          record.response.body = body;
        }

        record.timing.endTime = Date.now();
        record.timing.duration = record.timing.endTime - record.timing.startTime;
      }
    } catch {
      // The request failed; it is still recorded, without a response.
    }

    await this.appendRequest(record);
  }

  /**
   * Reads the response body for JSON and `text/*` content types.
   *
   * @returns The parsed body, or `undefined` for other content types and when
   *   the body cannot be read (reading is best-effort).
   */
  private async readBody(response: ObservedResponse, contentType: string): Promise<unknown> {
    try {
      if (contentType.includes("application/json")) {
        return await response.json();
      }
      if (contentType.includes("text/")) {
        return await response.text();
      }
    } catch {
      // Body unavailable or malformed: record the response without it.
    }
    return undefined;
  }

  /**
   * Decides whether a request passes the configured filter.
   *
   * @param resourceType - Resource type reported for the request.
   * @param method - HTTP method of the request.
   * @param url - Requested URL.
   * @returns `true` when the type is supported and listed in `filter.types`,
   *   the method is accepted by `filter.methods` and the URL matches
   *   `filter.urlPattern` (the latter two only when configured).
   */
  private shouldCapture(resourceType: string, method: string, url: string): boolean {
    const { types, methods, urlPattern } = this.config.filter;

    if (!NetworkListener.CAPTURABLE_TYPES.has(resourceType) || !types.includes(resourceType)) {
      return false;
    }

    if (methods && methods.length > 0 && !methods.includes(method)) {
      return false;
    }

    if (urlPattern && !this.urlRegexFor(urlPattern).test(url)) {
      return false;
    }

    return true;
  }

  /**
   * Compiles a glob into a regular expression, reusing the previous result
   * while the pattern is unchanged.
   *
   * Regular-expression metacharacters are escaped so they match literally;
   * afterwards `*` matches any run of characters and `?` a single character.
   * The match is not anchored: the pattern may match anywhere in the URL.
   */
  private urlRegexFor(pattern: string): RegExp {
    let matcher = this.urlMatcher;
    if (!matcher || matcher.pattern !== pattern) {
      const source = pattern
        .replace(/[.+^${}()|[\]\\]/g, "\\$&")
        .replace(/\*/g, ".*")
        .replace(/\?/g, ".");
      matcher = { pattern, regex: new RegExp(source) };
      this.urlMatcher = matcher;
    }
    return matcher.regex;
  }

  /** Writes the current configuration to `meta.json`. */
  private async saveMeta(): Promise<void> {
    await writeFile(this.metaFile, JSON.stringify(this.config, null, 2));
  }

  /** Appends one request as a single line to `requests.jsonl`. */
  private async appendRequest(request: NetworkRequest): Promise<void> {
    const line = `${JSON.stringify(request)}\n`;
    await appendFile(this.requestsFile, line);
  }
}
239
+
240
/**
 * Generates a listener identifier of the form `listener_<time>_<random>`.
 *
 * `<time>` is the current epoch time in base 36 and `<random>` is always six
 * base-36 characters. The random part is padded because the base-36 rendering
 * of `Math.random()` can be shorter than six digits (or empty for 0), which
 * would otherwise shorten the identifier and weaken its uniqueness.
 *
 * @returns The generated identifier. It is not cryptographically random.
 */
export function generateListenerId(): string {
  const timestamp = Date.now().toString(36);
  const random = Math.random().toString(36).slice(2, 8).padEnd(6, "0");
  return `listener_${timestamp}_${random}`;
}