hyper-agent-browser 0.1.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +334 -92
- package/package.json +2 -2
- package/src/browser/manager.ts +61 -3
- package/src/cli.ts +151 -1
- package/src/commands/extract.ts +76 -0
- package/src/commands/network.ts +111 -0
- package/src/commands/wait.ts +226 -0
- package/src/daemon/browser-pool.ts +90 -17
- package/src/daemon/server.ts +66 -0
- package/src/extractors/form-extractor.ts +153 -0
- package/src/extractors/list-extractor.ts +213 -0
- package/src/extractors/meta-extractor.ts +139 -0
- package/src/extractors/table-extractor.ts +215 -0
- package/src/snapshot/dom-extractor.ts +28 -15
- package/src/utils/network-listener.ts +247 -0
|
@@ -2,55 +2,110 @@ import { BrowserManager } from "../browser/manager";
|
|
|
2
2
|
import type { BrowserManagerOptions } from "../browser/manager";
|
|
3
3
|
import type { Session } from "../session/store";
|
|
4
4
|
|
|
5
|
+
/**
 * One pool entry: a browser manager together with the options that were
 * passed to the pool when that manager was created.
 */
interface CachedBrowser {
  /** Manager instance the pool hands out for a session. */
  manager: BrowserManager;

  /** Options recorded at creation time; compared against later requests. */
  options: BrowserManagerOptions;
}
|
|
9
|
+
|
|
5
10
|
/**
|
|
6
11
|
* BrowserPool 管理多个 Session 的浏览器实例
|
|
7
12
|
*/
|
|
8
13
|
export class BrowserPool {
|
|
9
|
-
private browsers: Map<string,
|
|
14
|
+
private browsers: Map<string, CachedBrowser> = new Map();
|
|
10
15
|
|
|
11
16
|
/**
 * Return a connected browser manager for `session`, reusing the pooled one
 * when possible.
 *
 * Resolution order for an existing entry:
 *  1. options differ (see `hasOptionsMismatch`) -> close it and build a new one;
 *  2. still connected -> return it as-is;
 *  3. disconnected -> try `connect()` once; on failure drop it and build a new one.
 *
 * @param session Session whose `name` is the pool key.
 * @param options Launch options for the manager; a shallow copy is stored with
 *                the new entry.
 * @returns The pooled or freshly created manager.
 * @throws Whatever `connect()` throws while creating a brand-new manager.
 */
async get(session: Session, options: BrowserManagerOptions = {}): Promise<BrowserManager> {
  const key = session.name;
  const entry = this.browsers.get(key);

  if (entry !== undefined) {
    const existing = entry.manager;

    if (this.hasOptionsMismatch(entry.options, options)) {
      console.log(
        `Options mismatch for session ${key} (headed: ${entry.options.headed} -> ${options.headed}), recreating browser`,
      );
      // Best effort: a failed close must not prevent the rebuild below.
      try {
        await existing.close();
      } catch (error) {
        console.error("Error closing browser for options change:", error);
      }
      this.browsers.delete(key);
    } else {
      if (existing.isConnected()) {
        return existing;
      }

      // Disconnected: give the same manager one chance to reconnect.
      try {
        await existing.connect();
        if (existing.isConnected()) {
          return existing;
        }
      } catch (error) {
        console.error(`Failed to reconnect browser for session ${key}:`, error);
      }

      // NOTE(review): the stale manager is dropped without calling close();
      // confirm BrowserManager holds no resources in this state.
      console.log(`Removing stale browser instance for session ${key}`);
      this.browsers.delete(key);
    }
  }

  console.log(
    `Creating new browser instance for session ${key} (headed: ${options.headed ?? false})`,
  );
  const created = new BrowserManager(session, options);
  await created.connect();
  this.browsers.set(key, { manager: created, options: { ...options } });

  return created;
}
|
|
40
67
|
|
|
68
|
+
/**
 * Decide whether a pooled browser has to be rebuilt for the requested options.
 *
 * A rebuild is needed when the effective `headed` flag differs (a missing
 * value counts as `false`), or when both sides name a `channel` and the names
 * differ. A channel that is unset on either side never forces a rebuild.
 *
 * @param cached    Options stored with the pooled browser.
 * @param requested Options of the current request.
 * @returns `true` when the browser must be recreated.
 */
private hasOptionsMismatch(
  cached: BrowserManagerOptions,
  requested: BrowserManagerOptions,
): boolean {
  const headedChanged = (cached.headed ?? false) !== (requested.headed ?? false);

  const bothChannelsSet = Boolean(cached.channel) && Boolean(requested.channel);
  const channelChanged = bothChannelsSet && cached.channel !== requested.channel;

  return headedChanged || channelChanged;
}
|
|
85
|
+
|
|
41
86
|
/**
 * Close and forget the pooled browser of one session.
 *
 * @param sessionName Pool key (the session name).
 * @returns `false` when the session has no pool entry, otherwise `true`;
 *          the entry is removed even if closing the manager throws.
 */
async close(sessionName: string): Promise<boolean> {
  const entry = this.browsers.get(sessionName);
  if (entry === undefined) {
    return false;
  }

  // A close failure is logged only; the entry is still dropped afterwards.
  try {
    await entry.manager.close();
  } catch (error) {
    console.error(`Error closing browser for session ${sessionName}:`, error);
  }

  this.browsers.delete(sessionName);
  return true;
}
|
|
51
100
|
|
|
52
101
|
/**
 * Close every pooled browser concurrently and then empty the pool.
 * Individual close failures are logged and do not stop the others.
 */
async closeAll(): Promise<void> {
  const closeQuietly = async (name: string, entry: CachedBrowser): Promise<void> => {
    try {
      await entry.manager.close();
    } catch (error) {
      console.error(`Error closing browser for session ${name}:`, error);
    }
  };

  const pending: Promise<void>[] = [];
  for (const [name, entry] of this.browsers.entries()) {
    pending.push(closeQuietly(name, entry));
  }

  await Promise.all(pending);
  this.browsers.clear();
}
|
|
@@ -62,4 +117,22 @@ export class BrowserPool {
|
|
|
62
117
|
/** Number of entries currently held in the pool. */
size(): number {
  const { size } = this.browsers;
  return size;
}
|
|
120
|
+
|
|
121
|
+
/**
 * Drop every pool entry whose manager reports it is no longer connected.
 *
 * Entries are only removed from the map; the managers are not closed here.
 */
async cleanup(): Promise<void> {
  // Decide first, delete afterwards, so all connection checks run before any removal.
  const staleNames = Array.from(this.browsers.entries())
    .filter(([, entry]) => !entry.manager.isConnected())
    .map(([name]) => name);

  for (const name of staleNames) {
    console.log(`Cleaning up disconnected browser: ${name}`);
    this.browsers.delete(name);
  }
}
|
|
65
138
|
}
|
package/src/daemon/server.ts
CHANGED
|
@@ -1,8 +1,11 @@
|
|
|
1
1
|
import * as actionCommands from "../commands/actions";
|
|
2
2
|
import * as advancedCommands from "../commands/advanced";
|
|
3
|
+
import * as extractCommands from "../commands/extract";
|
|
3
4
|
import * as getterCommands from "../commands/getters";
|
|
4
5
|
import * as infoCommands from "../commands/info";
|
|
5
6
|
import * as navigationCommands from "../commands/navigation";
|
|
7
|
+
import * as networkCommands from "../commands/network";
|
|
8
|
+
import * as waitCommands from "../commands/wait";
|
|
6
9
|
import { SessionManager } from "../session/manager";
|
|
7
10
|
import { ReferenceStore } from "../snapshot/reference-store";
|
|
8
11
|
import { formatError } from "../utils/errors";
|
|
@@ -529,6 +532,69 @@ export class DaemonServer {
|
|
|
529
532
|
result = `Highlighted: ${args.selector}`;
|
|
530
533
|
break;
|
|
531
534
|
|
|
535
|
+
// Wait commands
|
|
536
|
+
case "wait-idle":
|
|
537
|
+
await waitCommands.waitIdle(page, {
|
|
538
|
+
timeout: args.timeout,
|
|
539
|
+
strategy: args.strategy,
|
|
540
|
+
networkIdleTime: args.networkIdleTime,
|
|
541
|
+
domStableTime: args.domStableTime,
|
|
542
|
+
ignoreSelectors: args.ignoreSelectors,
|
|
543
|
+
});
|
|
544
|
+
result = "Page is idle";
|
|
545
|
+
break;
|
|
546
|
+
|
|
547
|
+
case "wait-element":
|
|
548
|
+
await waitCommands.waitElement(page, args.selector, {
|
|
549
|
+
state: args.state,
|
|
550
|
+
timeout: args.timeout,
|
|
551
|
+
});
|
|
552
|
+
result = `Element ${args.state || "visible"}: ${args.selector}`;
|
|
553
|
+
break;
|
|
554
|
+
|
|
555
|
+
// Extract commands
|
|
556
|
+
case "extract-table":
|
|
557
|
+
result = await extractCommands.extractTable(page, {
|
|
558
|
+
selector: args.selector,
|
|
559
|
+
includeHeaders: args.includeHeaders,
|
|
560
|
+
maxRows: args.maxRows,
|
|
561
|
+
});
|
|
562
|
+
break;
|
|
563
|
+
|
|
564
|
+
case "extract-list":
|
|
565
|
+
result = await extractCommands.extractList(page, {
|
|
566
|
+
selector: args.selector,
|
|
567
|
+
pattern: args.pattern,
|
|
568
|
+
maxItems: args.maxItems,
|
|
569
|
+
});
|
|
570
|
+
break;
|
|
571
|
+
|
|
572
|
+
case "extract-form":
|
|
573
|
+
result = await extractCommands.extractForm(page, {
|
|
574
|
+
selector: args.selector,
|
|
575
|
+
includeDisabled: args.includeDisabled,
|
|
576
|
+
});
|
|
577
|
+
break;
|
|
578
|
+
|
|
579
|
+
case "extract-meta":
|
|
580
|
+
result = await extractCommands.extractMeta(page, {
|
|
581
|
+
include: args.include,
|
|
582
|
+
});
|
|
583
|
+
break;
|
|
584
|
+
|
|
585
|
+
// Network commands
|
|
586
|
+
case "network-start":
|
|
587
|
+
result = await networkCommands.networkStart(page, session, {
|
|
588
|
+
filter: args.filter,
|
|
589
|
+
urlPattern: args.urlPattern,
|
|
590
|
+
methods: args.methods,
|
|
591
|
+
});
|
|
592
|
+
break;
|
|
593
|
+
|
|
594
|
+
case "network-stop":
|
|
595
|
+
result = await networkCommands.networkStop(args.listenerId);
|
|
596
|
+
break;
|
|
597
|
+
|
|
532
598
|
default:
|
|
533
599
|
throw new Error(`Unknown command: ${command}`);
|
|
534
600
|
}
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
import type { Page } from "patchright";
|
|
2
|
+
|
|
3
|
+
/**
 * Result envelope produced by `FormExtractor.extract`.
 *
 * NOTE(review): this name shadows the DOM/global `FormData` type inside any
 * module that imports it.
 */
export interface FormData {
  /** Discriminator; always the literal `"form"`. */
  type: "form";
  /** Page URL read when extraction started. */
  url: string;
  /** Selector the caller supplied, or `"form"` when none was given. */
  selector: string;
  /** `Date.now()` value taken when extraction started. */
  timestamp: number;
  /** One entry per extracted form control, in document order. */
  fields: FormField[];
}
|
|
10
|
+
|
|
11
|
+
/** Description of a single `<input>`, `<textarea>` or `<select>` element. */
export interface FormField {
  /** Optional element reference; not populated by `FormExtractor` itself. */
  ref?: string;
  /** The control's `name`, else its `id`, else an empty string. */
  name: string;
  /** `"select"` for select elements, otherwise the element's `type` (default `"text"`). */
  type: string;
  /** Current value; password inputs are reported as `"******"` or `""`. */
  value?: string;
  /** Checked state, set for checkbox and radio inputs only. */
  checked?: boolean;
  /** Trimmed text of the associated `<label>`, when one is found. */
  label?: string;
  /** Placeholder text, when the control has a non-empty one. */
  placeholder?: string;
  /** Whether the control is marked required. */
  required: boolean;
  /** Whether the control is disabled. */
  disabled: boolean;
  /** Read-only flag, for controls that expose `readOnly`. */
  readonly?: boolean;
  /** All option values of a select element. */
  options?: string[];
  /** Values of the currently selected options of a select element. */
  selectedOptions?: string[];
}
|
|
25
|
+
|
|
26
|
+
export class FormExtractor {
  /**
   * Extract the form controls found under a container element of the page.
   *
   * The container is the first match of `selector`; without a selector it is
   * the first `<form>` in the document, or `document.body` when the page has
   * no form. All `<input>`, `<textarea>` and `<select>` descendants are
   * reported. Password values are never returned: they are masked as
   * `"******"` (or `""` when empty).
   *
   * @param page     Page to run the extraction in.
   * @param selector Optional CSS selector of the container element.
   * @param options  `includeDisabled` (default `false`) also reports disabled controls.
   * @returns The extracted fields plus URL, selector and timestamp metadata.
   * @throws If `selector` matches nothing ("No form container found"), or if
   *         the page-side evaluation fails.
   */
  async extract(
    page: Page,
    selector?: string,
    options: { includeDisabled?: boolean } = {},
  ): Promise<FormData> {
    const includeDisabled = options.includeDisabled ?? false;

    const url = page.url();
    const timestamp = Date.now();

    // Everything inside this callback runs in the page, not in Node: it may
    // only use its argument and browser globals.
    const result = await page.evaluate(
      ({ selector, includeDisabled }) => {
        const container: Element | null = selector
          ? document.querySelector(selector)
          : document.querySelector("form") || document.body;

        if (!container) {
          throw new Error("No form container found");
        }

        const fields: FormField[] = [];
        const controls = Array.from(container.querySelectorAll("input, textarea, select"));

        for (const input of controls) {
          const isFormControl =
            input instanceof HTMLInputElement ||
            input instanceof HTMLTextAreaElement ||
            input instanceof HTMLSelectElement;
          if (!isFormControl) {
            continue;
          }

          // Disabled controls are skipped unless explicitly requested.
          if (!includeDisabled && input.disabled) {
            continue;
          }

          const field: FormField = {
            name: input.name || input.id || "",
            type:
              input instanceof HTMLSelectElement
                ? "select"
                : (input as HTMLInputElement).type || "text",
            required: input.required,
            disabled: input.disabled,
          };

          if ("placeholder" in input && input.placeholder) {
            field.placeholder = input.placeholder;
          }

          if ("readOnly" in input) {
            field.readonly = input.readOnly;
          }

          if (input instanceof HTMLInputElement) {
            if (input.type === "checkbox" || input.type === "radio") {
              field.checked = input.checked;
              field.value = input.value;
            } else if (input.type !== "password") {
              field.value = input.value;
            } else {
              // Never leak the password itself; only signal whether one is set.
              field.value = input.value ? "******" : "";
            }
          } else if (input instanceof HTMLTextAreaElement) {
            field.value = input.value;
          } else if (input instanceof HTMLSelectElement) {
            field.selectedOptions = Array.from(input.selectedOptions).map((opt) => opt.value);
            field.options = Array.from(input.options).map((opt) => opt.value);
          }

          // Associated label: an explicit `for` reference first, then a wrapping <label>.
          let label: HTMLLabelElement | null = null;

          if (input.id) {
            // The id is escaped so quotes or backslashes in it cannot produce an
            // invalid selector, which would throw and abort the whole extraction.
            label = document.querySelector<HTMLLabelElement>(
              `label[for="${CSS.escape(input.id)}"]`,
            );
          }

          if (!label) {
            label = input.closest("label");
          }

          if (label) {
            field.label = label.textContent?.trim();
          }

          fields.push(field);
        }

        return { fields };
      },
      { selector: selector || "", includeDisabled },
    );

    return {
      type: "form",
      url,
      selector: selector || "form",
      timestamp,
      fields: result.fields,
    };
  }
}
|
|
@@ -0,0 +1,213 @@
|
|
|
1
|
+
import type { Page } from "patchright";
|
|
2
|
+
|
|
3
|
+
/** Result envelope produced by `ListExtractor.extract`. */
export interface ListData {
  /** Discriminator; always the literal `"list"`. */
  type: "list";
  /** Page URL read when extraction started. */
  url: string;
  /** Selector the caller supplied, or `"auto"` when none was given. */
  selector: string;
  /** `Date.now()` value taken when extraction started. */
  timestamp: number;
  /** Item selector that was used: the caller's pattern or the auto-detected one. */
  pattern: string;
  /** Number of entries in `data`. */
  items: number;
  /** One record per list item; keys depend on what each item contains. */
  data: Record<string, any>[];
}
|
|
12
|
+
|
|
13
|
+
export class ListExtractor {
  /**
   * Extract a repeating list structure from the page.
   *
   * The container is the first match of `selector`; without a selector it is
   * the first `<ul>`, `<ol>` or `[role="list"]`, falling back to
   * `document.body`. With `pattern: "auto"` (the default) a fixed set of
   * common item selectors is tried and the first one matching more than one
   * element wins; if none does, direct children sharing the most frequent
   * `class` attribute are used. Any other `pattern` is used as the item
   * selector directly.
   *
   * @param page     Page to run the extraction in.
   * @param selector Optional CSS selector of the list container.
   * @param options  `pattern` (item selector or `"auto"`) and `maxItems`
   *                 (upper bound on returned items, unlimited by default).
   * @returns Extracted item records plus URL, selector and pattern metadata.
   * @throws If no container matches ("No list container found"), if no items
   *         are found ("No repeating items found"), or if the page-side
   *         evaluation fails.
   */
  async extract(
    page: Page,
    selector?: string,
    options: { pattern?: string; maxItems?: number } = {},
  ): Promise<ListData> {
    const pattern = options.pattern ?? "auto";
    const maxItems = options.maxItems ?? Number.POSITIVE_INFINITY;

    const url = page.url();
    const timestamp = Date.now();

    const result = await page.evaluate(
      ({ selector, pattern, maxItems }) => {
        // This callback is serialized and executed inside the page, so it cannot
        // reach anything declared in the Node module scope. The per-item helper
        // therefore has to live here; a module-level function would be
        // undefined in the page and every extraction would fail.
        const extractItemData = (item: Element): Record<string, string | undefined> => {
          const data: Record<string, string | undefined> = {};

          // data-* attributes, keyed by the part after "data-".
          if (item instanceof HTMLElement) {
            for (const attr of Array.from(item.attributes)) {
              if (attr.name.startsWith("data-")) {
                data[attr.name.substring(5)] = attr.value;
              }
            }
          }

          if (item.id) {
            data.id = item.id;
          }

          // All non-blank text nodes, trimmed and joined with single spaces.
          const textParts: string[] = [];
          const walker = document.createTreeWalker(item, NodeFilter.SHOW_TEXT, null);
          for (let node = walker.nextNode(); node; node = walker.nextNode()) {
            const text = node.textContent?.trim();
            if (text) {
              textParts.push(text);
            }
          }
          if (textParts.length > 0) {
            data.text = textParts.join(" ");
          }

          // First link inside the item.
          const link = item.querySelector("a");
          if (link instanceof HTMLAnchorElement) {
            data.link = link.href;
            if (!data.text) {
              data.text = link.textContent?.trim();
            }
          }

          // First image inside the item.
          const img = item.querySelector("img");
          if (img instanceof HTMLImageElement) {
            data.image = img.src;
            if (img.alt) {
              data.imageAlt = img.alt;
            }
          }

          // Common fields, located by class-name fragments or element type.
          const title = item.querySelector('[class*="title"], h1, h2, h3, h4, h5, h6');
          if (title) {
            data.title = title.textContent?.trim();
          }

          const price = item.querySelector('[class*="price"]');
          if (price) {
            data.price = price.textContent?.trim();
          }

          const description = item.querySelector('[class*="desc"], [class*="summary"], p');
          if (description && description !== title) {
            data.description = description.textContent?.trim();
          }

          return data;
        };

        let container: Element | null = null;

        if (selector) {
          container = document.querySelector(selector);
        } else {
          // Common list containers, falling back to the whole body.
          container =
            document.querySelector("ul") ||
            document.querySelector("ol") ||
            document.querySelector('[role="list"]') ||
            document.body;
        }

        if (!container) {
          throw new Error("No list container found");
        }

        let items: Element[] = [];
        let detectedPattern = "";

        if (pattern === "auto") {
          const candidates = [
            "li",
            '[role="listitem"]',
            ".item",
            ".product",
            ".card",
            "[class*='item']",
            "[class*='card']",
            "[class*='product']",
          ];

          // The first candidate that matches more than one element wins.
          for (const candidate of candidates) {
            const elements = Array.from(container.querySelectorAll(candidate));
            if (elements.length > 1) {
              items = elements;
              detectedPattern = candidate;
              break;
            }
          }

          // Fallback: direct children sharing the most frequent class attribute.
          if (items.length === 0) {
            const byClass = new Map<string, Element[]>();

            for (const child of Array.from(container.children)) {
              // Read the attribute instead of `className`: for SVG elements
              // `className` is an SVGAnimatedString, not a string.
              const className = child.getAttribute("class")?.trim();
              if (!className) {
                continue;
              }
              const group = byClass.get(className);
              if (group) {
                group.push(child);
              } else {
                byClass.set(className, [child]);
              }
            }

            let bestClass = "";
            let bestGroup: Element[] = [];
            byClass.forEach((group, className) => {
              if (group.length > bestGroup.length) {
                bestGroup = group;
                bestClass = className;
              }
            });

            if (bestGroup.length > 1) {
              items = bestGroup;
              detectedPattern = `.${bestClass.split(/\s+/)[0]}`;
            }
          }
        } else {
          items = Array.from(container.querySelectorAll(pattern));
          detectedPattern = pattern;
        }

        if (items.length === 0) {
          throw new Error("No repeating items found");
        }

        // Apply the caller's item limit before extracting.
        const data = items.slice(0, maxItems).map((item) => extractItemData(item));

        return {
          pattern: detectedPattern,
          data,
        };
      },
      { selector: selector || "", pattern, maxItems },
    );

    return {
      type: "list",
      url,
      selector: selector || "auto",
      timestamp,
      pattern: result.pattern,
      items: result.data.length,
      data: result.data,
    };
  }
}
|