web2json-agent 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent/__init__.py +14 -0
- agent/executor.py +545 -0
- agent/orchestrator.py +128 -0
- agent/planner.py +61 -0
- cli.py +195 -0
- config/__init__.py +4 -0
- config/settings.py +72 -0
- config/validator.py +409 -0
- main.py +137 -0
- prompts/__init__.py +14 -0
- prompts/code_generator.py +237 -0
- prompts/schema_extraction.py +169 -0
- prompts/schema_merge.py +168 -0
- tools/__init__.py +24 -0
- tools/code_generator.py +134 -0
- tools/html_simplifier.py +405 -0
- tools/schema_extraction.py +264 -0
- tools/webpage_screenshot.py +92 -0
- tools/webpage_source.py +46 -0
- utils/__init__.py +6 -0
- utils/llm_client.py +350 -0
- web2json_agent-1.0.0.dist-info/METADATA +348 -0
- web2json_agent-1.0.0.dist-info/RECORD +27 -0
- web2json_agent-1.0.0.dist-info/WHEEL +5 -0
- web2json_agent-1.0.0.dist-info/entry_points.txt +2 -0
- web2json_agent-1.0.0.dist-info/licenses/LICENSE +21 -0
- web2json_agent-1.0.0.dist-info/top_level.txt +7 -0
agent/__init__.py
ADDED
agent/executor.py
ADDED
|
@@ -0,0 +1,545 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Agent 执行器
|
|
3
|
+
负责执行具体的任务步骤
|
|
4
|
+
"""
|
|
5
|
+
import importlib.util
|
|
6
|
+
import json
|
|
7
|
+
import sys
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Dict, List
|
|
10
|
+
|
|
11
|
+
from loguru import logger
|
|
12
|
+
|
|
13
|
+
from config.settings import settings
|
|
14
|
+
from tools import (
|
|
15
|
+
get_html_from_file, # 从本地文件读取HTML工具
|
|
16
|
+
capture_html_file_screenshot, # 渲染本地HTML并截图工具
|
|
17
|
+
generate_parser_code, # 生成解析代码工具
|
|
18
|
+
extract_schema_from_html, # 从HTML提取Schema
|
|
19
|
+
extract_schema_from_image, # 从截图提取Schema
|
|
20
|
+
merge_html_and_visual_schema, # 合并单个HTML的两种Schema
|
|
21
|
+
merge_multiple_schemas, # 合并多个HTML的Schema
|
|
22
|
+
)
|
|
23
|
+
from tools.html_simplifier import simplify_html # HTML精简工具
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class AgentExecutor:
|
|
27
|
+
"""Agent执行器,负责执行具体任务"""
|
|
28
|
+
|
|
29
|
+
def __init__(self, output_dir: str = "output"):
    """Prepare the output directory tree used by the executor.

    Args:
        output_dir: Root directory for all generated artifacts. It is
            created (including missing parents) if it does not exist.
    """
    root = Path(output_dir)
    root.mkdir(parents=True, exist_ok=True)
    self.output_dir = root

    # One sub-directory per artifact type.
    self.screenshots_dir = root / "screenshots"
    self.parsers_dir = root / "parsers"
    self.html_original_dir = root / "html_original"      # untouched input HTML
    self.html_simplified_dir = root / "html_simplified"  # simplified HTML
    self.result_dir = root / "result"
    self.schemas_dir = root / "schemas"

    for directory in (
        self.screenshots_dir,
        self.parsers_dir,
        self.html_original_dir,
        self.html_simplified_dir,
        self.result_dir,
        self.schemas_dir,
    ):
        directory.mkdir(exist_ok=True)

    logger.info("输出目录已创建:")
    logger.info(f" - 原始HTML: {self.html_original_dir}")
    logger.info(f" - 精简HTML: {self.html_simplified_dir}")
|
|
51
|
+
|
|
52
|
+
def execute_plan(self, plan: Dict) -> Dict:
    """Run the two-phase pipeline described by ``plan``.

    Phase 1 (schema iteration) processes every entry of
    ``plan['sample_urls']`` and produces a final schema. Phase 2 (code
    iteration) reuses the per-round data of phase 1 to generate and refine
    parser code. When both phases succeed, the final parser is applied to
    the saved HTML files.

    Args:
        plan: Execution plan; must contain the key ``'sample_urls'``.

    Returns:
        A dict with the keys ``plan``, ``schema_phase``, ``code_phase``,
        ``final_parser`` and ``success``.
    """
    logger.info("开始执行计划...")

    results = {
        'plan': plan,
        'schema_phase': {'rounds': [], 'final_schema': None},
        'code_phase': {'rounds': [], 'parsers': []},
        'final_parser': None,
        'success': False,
    }

    # Every input entry takes part in the iteration.
    sample_urls = plan['sample_urls']
    num_urls = len(sample_urls)
    rule = '=' * 70

    # Phase 1: schema iteration.
    logger.info("\n" + rule)
    logger.info(f"阶段1: Schema迭代({num_urls}个URL,{num_urls}轮迭代)")
    logger.info(rule)

    schema_result = self._execute_schema_iteration_phase(sample_urls)
    results['schema_phase'] = schema_result
    if not schema_result['success']:
        logger.error("Schema迭代阶段失败")
        return results

    final_schema = schema_result['final_schema']
    logger.success(f"Schema迭代完成,最终Schema包含 {len(final_schema)} 个字段")

    # Phase 2: code iteration on the HTML gathered in phase 1.
    logger.info("\n" + rule)
    logger.info(f"阶段2: 代码迭代(使用Schema阶段的{num_urls}个HTML,{num_urls}轮迭代)")
    logger.info(rule)

    schema_rounds = schema_result['rounds']
    # sample_urls is passed along with the phase-1 rounds, which carry the HTML paths.
    code_result = self._execute_code_iteration_phase(
        sample_urls,
        final_schema,
        schema_rounds,
    )
    results['code_phase'] = code_result

    if not code_result['success']:
        logger.error("代码迭代阶段失败")
        return results

    results['final_parser'] = code_result['final_parser']
    results['success'] = True

    # Apply the final parser to the HTML files and write the JSON output.
    logger.info("\n" + rule)
    logger.info("使用最终解析器解析所有HTML")
    logger.info(rule)
    self._parse_all_html_with_final_parser(results, schema_rounds)

    return results
|
|
126
|
+
|
|
127
|
+
def _execute_schema_iteration_phase(self, html_files: List[str]) -> Dict:
    """Extract one schema per HTML file, then merge them into a final schema.

    Each round, for one entry of ``html_files``:
      1. reads the HTML with ``get_html_from_file`` and saves a copy in
         the original-HTML directory;
      2. simplifies the HTML (falling back to the original on failure);
      3. renders a screenshot of the input file;
      4. extracts a schema from the HTML text;
      5. extracts a schema from the screenshot;
      6. merges the two schemas.

    After all rounds, the per-file merged schemas are combined with
    ``merge_multiple_schemas`` and written to ``final_schema.json``.

    A failure in round 1 aborts the phase immediately; a failure in a
    later round is recorded in ``rounds`` and the loop continues.

    Args:
        html_files: Paths of the HTML files to process.

    Returns:
        A dict with ``rounds`` (one entry per processed file),
        ``final_schema``, ``success`` and, on success,
        ``final_schema_path``.
    """
    import traceback

    result = {
        'rounds': [],
        'final_schema': None,
        'success': False,
    }

    all_merged_schemas = []  # merged schema of every successful round

    for idx, html_file_path in enumerate(html_files, 1):
        logger.info(f"\n{'─'*70}")
        logger.info(f"Schema迭代 - 第 {idx}/{len(html_files)} 轮")
        logger.info(f"{'─'*70}")

        try:
            # 1. Read the HTML file.
            logger.info(f" [1/6] 读取本地HTML文件...")
            html_content = get_html_from_file.invoke({"file_path": html_file_path})
            logger.success(f" ✓ HTML文件已读取(长度: {len(html_content)} 字符)")

            # Keep a copy of the untouched HTML in the output directory.
            html_original_path = self.html_original_dir / f"schema_round_{idx}.html"
            with open(html_original_path, 'w', encoding='utf-8') as f:
                f.write(html_content)
            logger.success(f" ✓ 原始HTML已保存: {html_original_path}")

            # 2. Simplify the HTML; later steps use whatever this returns.
            logger.info(f" [2/6] 精简HTML...")
            html_path, html_for_processing = self._simplify_and_save_html(
                html_content, html_original_path, idx
            )

            # 3. Render the input file and take a screenshot.
            logger.info(f" [3/6] 渲染并截图本地HTML...")
            screenshot_path = str(self.screenshots_dir / f"schema_round_{idx}.png")
            screenshot_result = capture_html_file_screenshot.invoke({
                "html_file_path": html_file_path,
                "save_path": screenshot_path
            })
            logger.success(f" ✓ 截图已保存: {screenshot_path}")

            # 4. Schema from the HTML text.
            logger.info(f" [4/6] 从HTML提取Schema(包含xpath)...")
            html_schema = extract_schema_from_html.invoke({
                "html_content": html_for_processing
            })
            logger.success(f" ✓ HTML Schema已提取,包含 {len(html_schema)} 个字段")

            html_schema_path = self.schemas_dir / f"html_schema_round_{idx}.json"
            with open(html_schema_path, 'w', encoding='utf-8') as f:
                json.dump(html_schema, f, ensure_ascii=False, indent=2)
            logger.success(f" ✓ HTML Schema已保存: {html_schema_path}")

            # 5. Schema from the screenshot (the tool's return value is
            #    passed on as the image path).
            logger.info(f" [5/6] 从视觉提取Schema(包含视觉描述)...")
            visual_schema = extract_schema_from_image.invoke({
                "image_path": screenshot_result
            })
            logger.success(f" ✓ 视觉Schema已提取,包含 {len(visual_schema)} 个字段")

            visual_schema_path = self.schemas_dir / f"visual_schema_round_{idx}.json"
            with open(visual_schema_path, 'w', encoding='utf-8') as f:
                json.dump(visual_schema, f, ensure_ascii=False, indent=2)
            logger.success(f" ✓ 视觉Schema已保存: {visual_schema_path}")

            # 6. Merge both schemas of this file.
            logger.info(f" [6/6] 合并HTML和视觉Schema...")
            merged_schema = merge_html_and_visual_schema.invoke({
                "html_schema": html_schema,
                "visual_schema": visual_schema
            })
            logger.success(f" ✓ Schema已合并,包含 {len(merged_schema)} 个字段")

            merged_schema_path = self.schemas_dir / f"merged_schema_round_{idx}.json"
            with open(merged_schema_path, 'w', encoding='utf-8') as f:
                json.dump(merged_schema, f, ensure_ascii=False, indent=2)
            logger.success(f" ✓ 合并Schema已保存: {merged_schema_path}")

            # Collected for the cross-file merge after the loop.
            all_merged_schemas.append(merged_schema)

            round_result = {
                'round': idx,
                'html_file': html_file_path,
                'url': html_file_path,  # kept for compatibility
                'html_original_path': str(html_original_path),
                'html_path': str(html_path),
                'screenshot': screenshot_result,
                'html_schema': html_schema.copy(),
                'html_schema_path': str(html_schema_path),
                'visual_schema': visual_schema.copy(),
                'visual_schema_path': str(visual_schema_path),
                'merged_schema': merged_schema.copy(),
                'merged_schema_path': str(merged_schema_path),
                'schema': merged_schema.copy(),  # compatibility alias
                'schema_path': str(merged_schema_path),  # compatibility alias
                'groundtruth_schema': merged_schema.copy(),  # compatibility alias
                'success': True,
            }
            result['rounds'].append(round_result)
            logger.success(f"Schema迭代第 {idx} 轮完成")

        except Exception as e:
            logger.error(f"Schema迭代第 {idx} 轮失败: {str(e)}")
            logger.debug(traceback.format_exc())

            round_result = {
                'round': idx,
                'html_file': html_file_path,
                'url': html_file_path,  # kept for compatibility
                'error': str(e),
                'success': False,
            }
            result['rounds'].append(round_result)

            if idx == 1:
                # A failing first round aborts the whole phase.
                return result

    # Combine the per-file schemas into the final schema.
    if all_merged_schemas:
        logger.info(f"\n{'─'*70}")
        logger.info(f"合并 {len(all_merged_schemas)} 个Schema,生成最终Schema")
        logger.info(f"{'─'*70}")

        try:
            final_schema = merge_multiple_schemas.invoke({
                "schemas": all_merged_schemas
            })
            logger.success(f"最终Schema已生成,包含 {len(final_schema)} 个字段")

            final_schema_path = self.schemas_dir / "final_schema.json"
            with open(final_schema_path, 'w', encoding='utf-8') as f:
                json.dump(final_schema, f, ensure_ascii=False, indent=2)
            logger.success(f"最终Schema已保存: {final_schema_path}")

            result['final_schema'] = final_schema
            result['final_schema_path'] = str(final_schema_path)
            result['success'] = True

        except Exception as e:
            logger.error(f"合并多个Schema失败: {str(e)}")
            logger.debug(traceback.format_exc())

    return result

def _simplify_and_save_html(self, html_content: str, html_original_path: Path, idx: int):
    """Simplify ``html_content`` and save it; fall back to the original on error.

    Args:
        html_content: Raw HTML text of the current round.
        html_original_path: Where the raw HTML was saved; returned as the
            fallback path when simplification fails.
        idx: 1-based round number, used in the output file name.

    Returns:
        A ``(path, html)`` tuple naming the HTML that later steps should use.
    """
    try:
        # The attribute allow-list is only passed outside 'conservative' mode.
        mode = settings.html_simplify_mode
        keep_attrs = settings.html_keep_attrs if mode != 'conservative' else None

        simplified_html = simplify_html(
            html_content,
            mode=mode,
            keep_attrs=keep_attrs
        )

        html_simplified_path = self.html_simplified_dir / f"schema_round_{idx}.html"
        with open(html_simplified_path, 'w', encoding='utf-8') as f:
            f.write(simplified_html)

        # Guard against an empty input file: dividing by zero here used to
        # make a successful simplification look like a failure.
        original_len = len(html_content)
        if original_len:
            compression_rate = (1 - len(simplified_html) / original_len) * 100
        else:
            compression_rate = 0.0
        logger.success(f" ✓ 精简HTML已保存: {html_simplified_path}")
        logger.info(f" 压缩率: {compression_rate:.1f}% ({len(html_content)} -> {len(simplified_html)} 字符)")

        return html_simplified_path, simplified_html
    except Exception as e:
        # Best effort: simplification is an optimisation, not a requirement.
        logger.warning(f" ⚠ HTML精简失败: {e},使用原始HTML")
        return html_original_path, html_content
|
|
321
|
+
|
|
322
|
+
def _execute_code_iteration_phase(
    self,
    urls: List[str],
    final_schema: Dict,
    schema_phase_rounds: List[Dict]
) -> Dict:
    """Generate parser code and refine it once per successful schema round.

    The HTML saved during the schema phase is reused; nothing is fetched
    or rendered again. The first round that runs generates the initial
    parser from ``final_schema``; every later round passes the previous
    parser code back to ``generate_parser_code`` for refinement.

    Args:
        urls: Input list matching ``schema_phase_rounds``. Not read by
            this method; kept for interface compatibility.
        final_schema: Final schema produced by the schema phase.
        schema_phase_rounds: Per-round results of the schema phase; the
            ``html_path`` of each successful round is read from here.

    Returns:
        A dict with ``rounds``, ``parsers``, ``final_parser`` and
        ``success``. ``success`` is True when at least one round produced
        parser code.
    """
    import traceback

    result = {
        'rounds': [],
        'parsers': [],
        'final_parser': None,
        'success': False,
    }

    # Nothing to do without schema-phase data.
    if not schema_phase_rounds:
        logger.error("Schema阶段没有可用的数据")
        return result

    current_parser_code = None
    current_parser_path = None

    for idx, schema_round in enumerate(schema_phase_rounds, 1):
        if not schema_round.get('success'):
            logger.warning(f"Schema阶段第 {idx} 轮失败,跳过代码生成")
            continue

        logger.info(f"\n{'─'*70}")
        logger.info(f"代码迭代 - 第 {idx}/{len(schema_phase_rounds)} 轮")
        logger.info(f"{'─'*70}")

        try:
            # Reuse the (normally simplified) HTML saved by the schema phase.
            html_path = schema_round.get('html_path')
            html_original_path = schema_round.get('html_original_path')
            if not html_path:
                logger.error(f" ✗ Schema阶段第 {idx} 轮缺少HTML路径")
                continue

            logger.info(f" [1/2] 复用Schema阶段的精简HTML: {html_path}")
            if html_original_path:
                logger.debug(f" 原始HTML位置: {html_original_path}")
            with open(html_path, 'r', encoding='utf-8') as f:
                html_content = f.read()
            logger.success(f" ✓ 精简HTML已加载(长度: {len(html_content)} 字符)")

            request = {
                "html_content": html_content,
                "target_json": final_schema,
                "output_dir": str(self.parsers_dir),
            }

            # Decide by "is there a parser yet", not by the round number:
            # if round 1 was skipped, a later round must still generate
            # the initial parser instead of "refining" a missing one.
            if current_parser_code is None:
                logger.info(f" [2/2] 生成初始解析代码...")
                parser_result = generate_parser_code.invoke(request)
                logger.success(f" ✓ 初始解析代码已生成")
            else:
                logger.info(f" [2/2] 优化解析代码(基于上一轮)...")
                request.update({
                    "previous_parser_code": current_parser_code,
                    "previous_parser_path": current_parser_path,
                    "round_num": idx,
                })
                parser_result = generate_parser_code.invoke(request)
                logger.success(f" ✓ 解析代码已优化")

            # Save this round's parser code.
            parser_filename = f"parser_round_{idx}.py"
            code_parser_path = self.parsers_dir / parser_filename
            with open(code_parser_path, 'w', encoding='utf-8') as f:
                f.write(parser_result['code'])
            logger.success(f" ✓ 解析器已保存: {code_parser_path}")

            # This round's parser becomes the base for the next one.
            current_parser_code = parser_result['code']
            current_parser_path = str(code_parser_path)
            parser_result['parser_path'] = current_parser_path

            round_result = {
                'round': idx,
                'url': schema_round['url'],
                'html_path': html_path,
                'screenshot': schema_round.get('screenshot'),  # from the schema phase
                'groundtruth_schema': schema_round.get('groundtruth_schema'),  # from the schema phase
                'parser_path': current_parser_path,
                'parser_code': current_parser_code,
                'parser_result': parser_result,
                'success': True,
            }
            result['rounds'].append(round_result)
            result['parsers'].append(parser_result)
            logger.success(f"代码迭代第 {idx} 轮完成")

        except Exception as e:
            logger.error(f"代码迭代第 {idx} 轮失败: {str(e)}")
            logger.debug(traceback.format_exc())

            round_result = {
                'round': idx,
                'url': schema_round.get('url'),
                'error': str(e),
                'success': False,
            }
            result['rounds'].append(round_result)

            if idx == 1:
                # A failing first round aborts the whole phase.
                return result

    # The parser of the last successful round is the final parser.
    if current_parser_code:
        final_parser_path = self.output_dir / "final_parser.py"
        with open(final_parser_path, 'w', encoding='utf-8') as f:
            f.write(current_parser_code)
        logger.success(f"最终解析器已保存: {final_parser_path}")

        result['final_parser'] = {
            'parser_path': str(final_parser_path),
            'code': current_parser_code,
            'config_path': None,  # placeholder; no config file is written
            'config': final_schema,
        }
        result['success'] = True

    return result
|
|
468
|
+
|
|
469
|
+
def _parse_all_html_with_final_parser(self, results: Dict, all_rounds: List[Dict]) -> None:
    """Apply the final parser to this run's HTML files and save the JSON.

    The files to parse are the ``html_path`` entries of the successful
    rounds in ``all_rounds``. Only when none of them is usable does the
    method fall back to scanning the simplified-HTML directory and then
    the original-HTML directory. Using the round data avoids parsing
    stale files left in the output directory by an earlier run, and keeps
    rounds whose simplification failed (their ``html_path`` points at the
    original HTML).

    Errors are logged, never raised, except for a missing
    ``results['final_parser']['parser_path']``.

    Args:
        results: Execution results; ``results['final_parser']['parser_path']``
            names the parser file to load.
        all_rounds: Per-round data from the schema phase.
    """
    import traceback

    final_parser_path = results['final_parser']['parser_path']

    # Loading is handled separately so that the "failed to load" message
    # is only logged for real loading errors.
    try:
        logger.info(f" 加载最终解析器: {final_parser_path}")
        parser = self._load_parser(final_parser_path)
    except Exception as e:
        logger.error(f"加载最终解析器失败: {str(e)}")
        logger.debug(traceback.format_exc())
        return

    try:
        # Prefer the files recorded by this run's successful rounds.
        html_files = []
        for round_data in all_rounds or []:
            if not round_data.get('success'):
                continue
            recorded = round_data.get('html_path')
            if not recorded:
                continue
            candidate = Path(recorded)
            if candidate.is_file() and candidate not in html_files:
                html_files.append(candidate)

        if not html_files:
            # Fallback: scan the output directories.
            html_files = sorted(self.html_simplified_dir.glob("*.html"))

        if not html_files:
            logger.warning(f" 精简HTML目录下没有找到HTML文件: {self.html_simplified_dir}")
            html_files = sorted(self.html_original_dir.glob("*.html"))
            if html_files:
                logger.info(f" 使用原始HTML目录: {self.html_original_dir}")
            else:
                logger.error(" 原始HTML目录也没有找到HTML文件")
                return

        logger.info(f" 找到 {len(html_files)} 个HTML文件")

        for html_path in html_files:
            logger.info(f" 解析 {html_path.name}...")

            try:
                with open(html_path, 'r', encoding='utf-8') as f:
                    html_content = f.read()

                parsed_data = parser.parse(html_content)

                # The JSON file is named after the HTML file.
                json_filename = html_path.stem + '.json'
                json_path = self.result_dir / json_filename

                with open(json_path, 'w', encoding='utf-8') as f:
                    json.dump(parsed_data, f, ensure_ascii=False, indent=2)

                logger.success(f" ✓ JSON已保存: {json_path}")
                logger.info(f" 提取了 {len(parsed_data)} 个字段")

            except Exception as e:
                # One failing file must not stop the remaining files.
                logger.error(f" ✗ 解析 {html_path.name} 失败: {str(e)}")
                logger.debug(traceback.format_exc())

        logger.success(f"所有HTML解析完成,结果已保存到: {self.result_dir}")

    except Exception as e:
        logger.error(f"解析HTML文件失败: {str(e)}")
        logger.debug(traceback.format_exc())
|
|
533
|
+
|
|
534
|
+
def _load_parser(self, parser_path: str):
|
|
535
|
+
"""动态加载解析器类"""
|
|
536
|
+
spec = importlib.util.spec_from_file_location("parser_module", parser_path)
|
|
537
|
+
module = importlib.util.module_from_spec(spec)
|
|
538
|
+
sys.modules["parser_module"] = module
|
|
539
|
+
spec.loader.exec_module(module)
|
|
540
|
+
|
|
541
|
+
# 获取WebPageParser类
|
|
542
|
+
if hasattr(module, 'WebPageParser'):
|
|
543
|
+
return module.WebPageParser()
|
|
544
|
+
else:
|
|
545
|
+
raise Exception("解析器中未找到WebPageParser类")
|
agent/orchestrator.py
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Agent 编排器
|
|
3
|
+
整合规划器和执行器,提供统一的Agent接口
|
|
4
|
+
"""
|
|
5
|
+
from typing import List, Dict
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from loguru import logger
|
|
8
|
+
from .planner import AgentPlanner
|
|
9
|
+
from .executor import AgentExecutor
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class ParserAgent:
|
|
13
|
+
"""
|
|
14
|
+
HTML解析器生成Agent
|
|
15
|
+
|
|
16
|
+
通过给定一组HTML文件,自动生成能够解析这些页面的Python代码
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
def __init__(self, output_dir: str = "output"):
    """Create the planner and executor that make up the agent.

    Args:
        output_dir: Root directory for generated artifacts; it is also
            passed to the executor.
    """
    self.output_dir = Path(output_dir)
    self.planner = AgentPlanner()
    self.executor = AgentExecutor(output_dir)

    logger.info("ParserAgent 初始化完成")
|
|
31
|
+
|
|
32
|
+
def generate_parser(
    self,
    html_files: List[str],
    domain: str = None
) -> Dict:
    """Generate a parser for the given HTML files.

    Steps:
      1. Plan: build an execution plan from the HTML files.
      2. Execute: run the schema iteration and the code iteration.
      3. Summarise: build a text summary of the execution.

    Args:
        html_files: Paths of the sample HTML files.
        domain: Optional domain name passed on to the planner.

    Returns:
        On failure, a dict with ``success`` (False), ``error`` and
        ``execution_result``. On success, a dict with ``success`` (True),
        ``plan``, ``execution_result``, ``summary``, ``parser_path`` and
        ``config_path``.
    """
    separator = "=" * 70

    logger.info(separator)
    logger.info("开始生成解析器")
    logger.info(separator)

    # Step 1: planning.
    logger.info("\n[步骤 1/3] 任务规划")
    plan = self.planner.create_plan(html_files, domain)

    # Step 2: two-phase execution.
    logger.info("\n[步骤 2/3] 执行计划 - 两阶段迭代")
    execution_result = self.executor.execute_plan(plan)

    if not execution_result['success']:
        logger.error("执行失败,无法生成解析器")
        failure = {
            'success': False,
            'error': '执行失败',
            'execution_result': execution_result
        }
        return failure

    # Step 3: summary.
    logger.info("\n[步骤 3/3] 生成总结")
    summary = self._generate_summary(execution_result)

    logger.info(separator)
    logger.success("解析器生成完成!")
    logger.info(separator)

    outcome = {
        'success': True,
        'plan': plan,
        'execution_result': execution_result,
        'summary': summary,
    }
    outcome['parser_path'] = execution_result['final_parser']['parser_path']
    outcome['config_path'] = execution_result['final_parser'].get('config_path')
    return outcome
|
|
90
|
+
|
|
91
|
+
def _generate_summary(self, execution_result: Dict) -> str:
    """Build, log and return a text summary of an execution result.

    Args:
        execution_result: Result dict; the keys ``schema_phase``,
            ``code_phase`` and ``final_parser`` are read when present.

    Returns:
        The summary as one newline-joined string.
    """
    rule = "=" * 70
    lines = ["\n" + rule, "执行总结", rule]

    # Schema iteration phase.
    schema_phase = execution_result.get('schema_phase', {})
    schema_rounds = schema_phase.get('rounds', [])
    schema_ok = sum(1 for entry in schema_rounds if entry.get('success'))
    lines.append(f"\nSchema迭代阶段: {schema_ok}/{len(schema_rounds)} 轮成功")

    if schema_phase.get('final_schema'):
        final_schema_size = len(schema_phase['final_schema'])
        lines.append(f" 最终Schema字段数: {final_schema_size}")

    if schema_phase.get('final_schema_path'):
        lines.append(f" 最终Schema路径: {schema_phase['final_schema_path']}")

    # Code iteration phase.
    code_rounds = execution_result.get('code_phase', {}).get('rounds', [])
    code_ok = sum(1 for entry in code_rounds if entry.get('success'))
    lines.append(f"\n代码迭代阶段: {code_ok}/{len(code_rounds)} 轮成功")

    # Final parser location.
    if execution_result.get('final_parser'):
        parser_path = execution_result['final_parser']['parser_path']
        lines.append(f"\n最终解析器路径: {parser_path}")

    lines.append(rule)

    summary = "\n".join(lines)
    logger.info(summary)

    return summary
|
|
128
|
+
|