web2json-agent 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
agent/__init__.py ADDED
@@ -0,0 +1,14 @@
1
+ """
2
+ Agent 模块
3
+ 提供智能解析代码生成的Agent系统
4
+ """
5
+ from .planner import AgentPlanner
6
+ from .executor import AgentExecutor
7
+ from .orchestrator import ParserAgent
8
+
9
# Names re-exported as the public API of the ``agent`` package.
__all__ = ["AgentPlanner", "AgentExecutor", "ParserAgent"]
14
+
agent/executor.py ADDED
@@ -0,0 +1,545 @@
1
+ """
2
+ Agent 执行器
3
+ 负责执行具体的任务步骤
4
+ """
5
+ import importlib.util
6
+ import json
7
+ import sys
8
+ from pathlib import Path
9
+ from typing import Dict, List
10
+
11
+ from loguru import logger
12
+
13
+ from config.settings import settings
14
+ from tools import (
15
+ get_html_from_file, # 从本地文件读取HTML工具
16
+ capture_html_file_screenshot, # 渲染本地HTML并截图工具
17
+ generate_parser_code, # 生成解析代码工具
18
+ extract_schema_from_html, # 从HTML提取Schema
19
+ extract_schema_from_image, # 从截图提取Schema
20
+ merge_html_and_visual_schema, # 合并单个HTML的两种Schema
21
+ merge_multiple_schemas, # 合并多个HTML的Schema
22
+ )
23
+ from tools.html_simplifier import simplify_html # HTML精简工具
24
+
25
+
26
class AgentExecutor:
    """Run the concrete steps of a parser-generation plan.

    The executor works in two phases over the same list of local HTML files:

    1. Schema iteration: read each file, simplify it, take a screenshot,
       extract one schema from the HTML and one from the screenshot, merge
       the two, and finally merge the per-file schemas into a final schema.
    2. Code iteration: generate parser code from the final schema, refine it
       once per HTML file, then run the final parser over the HTML files.

    Every artefact is written below ``output_dir``.
    """

    # Name under which a dynamically loaded parser is registered in sys.modules.
    _PARSER_MODULE_NAME = "parser_module"

    def __init__(self, output_dir: str = "output"):
        """Create the output directory tree.

        Args:
            output_dir: Root directory for all generated artefacts.
        """
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # One sub-directory per kind of artefact.
        self.screenshots_dir = self.output_dir / "screenshots"
        self.parsers_dir = self.output_dir / "parsers"
        self.html_original_dir = self.output_dir / "html_original"  # raw HTML
        self.html_simplified_dir = self.output_dir / "html_simplified"  # simplified HTML
        self.result_dir = self.output_dir / "result"
        self.schemas_dir = self.output_dir / "schemas"

        for directory in (
            self.screenshots_dir,
            self.parsers_dir,
            self.html_original_dir,
            self.html_simplified_dir,
            self.result_dir,
            self.schemas_dir,
        ):
            directory.mkdir(exist_ok=True)

        logger.info("输出目录已创建:")
        logger.info(f" - 原始HTML: {self.html_original_dir}")
        logger.info(f" - 精简HTML: {self.html_simplified_dir}")

    # ------------------------------------------------------------------
    # Small private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _write_text(path, text: str) -> None:
        """Write ``text`` to ``path`` as UTF-8."""
        with open(path, 'w', encoding='utf-8') as f:
            f.write(text)

    @staticmethod
    def _write_json(path, data) -> None:
        """Dump ``data`` to ``path`` as indented, non-ASCII-escaped JSON."""
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)

    @staticmethod
    def _log_traceback() -> None:
        """Log the traceback of the exception currently being handled at DEBUG level."""
        import traceback  # local import, as in the original error handlers

        logger.debug(traceback.format_exc())

    # ------------------------------------------------------------------
    # Entry point
    # ------------------------------------------------------------------
    def execute_plan(self, plan: Dict) -> Dict:
        """Execute a plan in two phases (schema iteration, then code iteration).

        Args:
            plan: Execution plan; ``plan['sample_urls']`` is the list of HTML
                inputs handed to the schema phase.

        Returns:
            A dict with the keys ``plan``, ``schema_phase``, ``code_phase``,
            ``final_parser`` and ``success``. ``success`` is True only when
            both phases succeeded.
        """
        logger.info("开始执行计划...")

        results = {
            'plan': plan,
            'schema_phase': {
                'rounds': [],
                'final_schema': None,
            },
            'code_phase': {
                'rounds': [],
                'parsers': [],
            },
            'final_parser': None,
            'success': False,
        }

        # Every input is used in both phases.
        sample_urls = plan['sample_urls']
        num_urls = len(sample_urls)

        # Phase 1: schema iteration.
        logger.info(f"\n{'='*70}")
        logger.info(f"阶段1: Schema迭代({num_urls}个URL,{num_urls}轮迭代)")
        logger.info(f"{'='*70}")

        schema_result = self._execute_schema_iteration_phase(sample_urls)
        results['schema_phase'] = schema_result

        if not schema_result['success']:
            logger.error("Schema迭代阶段失败")
            return results

        final_schema = schema_result['final_schema']
        logger.success(f"Schema迭代完成,最终Schema包含 {len(final_schema)} 个字段")

        # Phase 2: code iteration, reusing the HTML saved by phase 1.
        logger.info(f"\n{'='*70}")
        logger.info(f"阶段2: 代码迭代(使用Schema阶段的{num_urls}个HTML,{num_urls}轮迭代)")
        logger.info(f"{'='*70}")

        code_result = self._execute_code_iteration_phase(
            sample_urls,  # not read by the code phase; the rounds carry the data
            final_schema,
            schema_result['rounds'],
        )
        results['code_phase'] = code_result

        if not code_result['success']:
            logger.error("代码迭代阶段失败")
            return results

        results['final_parser'] = code_result['final_parser']
        results['success'] = True

        # Run the final parser over the HTML of the schema-phase rounds.
        logger.info(f"\n{'='*70}")
        logger.info("使用最终解析器解析所有HTML")
        logger.info(f"{'='*70}")
        self._parse_all_html_with_final_parser(results, schema_result['rounds'])

        return results

    # ------------------------------------------------------------------
    # Phase 1: schema iteration
    # ------------------------------------------------------------------
    def _simplify_and_save_html(self, idx: int, html_content: str, html_original_path):
        """Simplify one HTML document and save the simplified copy.

        Returns:
            ``(html_path, html_for_processing)``: the simplified file and text
            on success, or the original path and text when simplification or
            saving fails (best effort, a warning is logged).
        """
        try:
            # The simplification mode comes from the settings object.
            mode = settings.html_simplify_mode
            keep_attrs = settings.html_keep_attrs if mode != 'conservative' else None

            simplified_html = simplify_html(
                html_content,
                mode=mode,
                keep_attrs=keep_attrs,
            )
            html_simplified_path = self.html_simplified_dir / f"schema_round_{idx}.html"
            self._write_text(html_simplified_path, simplified_html)
        except Exception as e:
            logger.warning(f" ⚠ HTML精简失败: {e},使用原始HTML")
            return html_original_path, html_content

        # Guard against empty input: the original computed this ratio inside
        # the try block, so an empty document raised ZeroDivisionError and was
        # misreported as a simplification failure.
        original_len = len(html_content)
        if original_len:
            compression_rate = (1 - len(simplified_html) / original_len) * 100
        else:
            compression_rate = 0.0
        logger.success(f" ✓ 精简HTML已保存: {html_simplified_path}")
        logger.info(f" 压缩率: {compression_rate:.1f}% ({len(html_content)} -> {len(simplified_html)} 字符)")

        return html_simplified_path, simplified_html

    def _run_schema_round(self, idx: int, html_file_path: str):
        """Process one HTML file of the schema phase.

        Returns:
            ``(round_result, merged_schema)`` where ``round_result`` is the
            record stored in the phase result and ``merged_schema`` is the
            object later passed to the multi-schema merge.

        Raises:
            Exception: whatever a tool invocation or file write raises; the
                caller records the round as failed.
        """
        # 1. Read the HTML file.
        logger.info(" [1/6] 读取本地HTML文件...")
        html_content = get_html_from_file.invoke({"file_path": html_file_path})
        logger.success(f" ✓ HTML文件已读取(长度: {len(html_content)} 字符)")

        # Keep a copy of the raw HTML in the output directory.
        html_original_path = self.html_original_dir / f"schema_round_{idx}.html"
        self._write_text(html_original_path, html_content)
        logger.success(f" ✓ 原始HTML已保存: {html_original_path}")

        # 2. Simplify the HTML (falls back to the raw HTML on failure).
        logger.info(" [2/6] 精简HTML...")
        html_path, html_for_processing = self._simplify_and_save_html(
            idx, html_content, html_original_path
        )

        # 3. Render and screenshot the *input* file.
        logger.info(" [3/6] 渲染并截图本地HTML...")
        screenshot_path = str(self.screenshots_dir / f"schema_round_{idx}.png")
        screenshot_result = capture_html_file_screenshot.invoke({
            "html_file_path": html_file_path,
            "save_path": screenshot_path,
        })
        logger.success(f" ✓ 截图已保存: {screenshot_path}")

        # 4. Schema from the HTML text.
        logger.info(" [4/6] 从HTML提取Schema(包含xpath)...")
        html_schema = extract_schema_from_html.invoke({
            "html_content": html_for_processing,
        })
        logger.success(f" ✓ HTML Schema已提取,包含 {len(html_schema)} 个字段")

        html_schema_path = self.schemas_dir / f"html_schema_round_{idx}.json"
        self._write_json(html_schema_path, html_schema)
        logger.success(f" ✓ HTML Schema已保存: {html_schema_path}")

        # 5. Schema from the screenshot.
        logger.info(" [5/6] 从视觉提取Schema(包含视觉描述)...")
        visual_schema = extract_schema_from_image.invoke({
            "image_path": screenshot_result,
        })
        logger.success(f" ✓ 视觉Schema已提取,包含 {len(visual_schema)} 个字段")

        visual_schema_path = self.schemas_dir / f"visual_schema_round_{idx}.json"
        self._write_json(visual_schema_path, visual_schema)
        logger.success(f" ✓ 视觉Schema已保存: {visual_schema_path}")

        # 6. Merge both schemas.
        logger.info(" [6/6] 合并HTML和视觉Schema...")
        merged_schema = merge_html_and_visual_schema.invoke({
            "html_schema": html_schema,
            "visual_schema": visual_schema,
        })
        logger.success(f" ✓ Schema已合并,包含 {len(merged_schema)} 个字段")

        merged_schema_path = self.schemas_dir / f"merged_schema_round_{idx}.json"
        self._write_json(merged_schema_path, merged_schema)
        logger.success(f" ✓ 合并Schema已保存: {merged_schema_path}")

        round_result = {
            'round': idx,
            'html_file': html_file_path,
            'url': html_file_path,  # kept for compatibility
            'html_original_path': str(html_original_path),
            'html_path': str(html_path),
            'screenshot': screenshot_result,
            'html_schema': html_schema.copy(),
            'html_schema_path': str(html_schema_path),
            'visual_schema': visual_schema.copy(),
            'visual_schema_path': str(visual_schema_path),
            'merged_schema': merged_schema.copy(),
            'merged_schema_path': str(merged_schema_path),
            'schema': merged_schema.copy(),  # compatibility alias
            'schema_path': str(merged_schema_path),  # compatibility alias
            'groundtruth_schema': merged_schema.copy(),  # compatibility alias
            'success': True,
        }
        return round_result, merged_schema

    def _execute_schema_iteration_phase(self, html_files: List[str]) -> Dict:
        """Run the schema phase over every HTML file.

        For each file a merged (HTML + visual) schema is produced; afterwards
        all per-file schemas are merged into the final schema.

        Args:
            html_files: Paths of the HTML files to process.

        Returns:
            A dict with ``rounds`` (one record per file), ``final_schema``,
            ``success`` and, on success, ``final_schema_path``. A failure in
            the first round aborts the phase; later failures are recorded and
            skipped.
        """
        result = {
            'rounds': [],
            'final_schema': None,
            'success': False,
        }

        all_merged_schemas = []  # merged schema of every successful round

        for idx, html_file_path in enumerate(html_files, 1):
            logger.info(f"\n{'─'*70}")
            logger.info(f"Schema迭代 - 第 {idx}/{len(html_files)} 轮")
            logger.info(f"{'─'*70}")

            try:
                round_result, merged_schema = self._run_schema_round(idx, html_file_path)
            except Exception as e:
                logger.error(f"Schema迭代第 {idx} 轮失败: {str(e)}")
                self._log_traceback()

                result['rounds'].append({
                    'round': idx,
                    'html_file': html_file_path,
                    'url': html_file_path,  # kept for compatibility
                    'error': str(e),
                    'success': False,
                })

                if idx == 1:
                    # Without a first round there is nothing to build on.
                    return result
                continue

            all_merged_schemas.append(merged_schema)
            result['rounds'].append(round_result)
            logger.success(f"Schema迭代第 {idx} 轮完成")

        if not all_merged_schemas:
            return result

        # Merge the per-file schemas into the final schema.
        logger.info(f"\n{'─'*70}")
        logger.info(f"合并 {len(all_merged_schemas)} 个Schema,生成最终Schema")
        logger.info(f"{'─'*70}")

        try:
            final_schema = merge_multiple_schemas.invoke({
                "schemas": all_merged_schemas,
            })
            logger.success(f"最终Schema已生成,包含 {len(final_schema)} 个字段")

            final_schema_path = self.schemas_dir / "final_schema.json"
            self._write_json(final_schema_path, final_schema)
            logger.success(f"最终Schema已保存: {final_schema_path}")

            result['final_schema'] = final_schema
            result['final_schema_path'] = str(final_schema_path)
            result['success'] = True
        except Exception as e:
            logger.error(f"合并多个Schema失败: {str(e)}")
            self._log_traceback()

        return result

    # ------------------------------------------------------------------
    # Phase 2: code iteration
    # ------------------------------------------------------------------
    def _execute_code_iteration_phase(
        self,
        urls: List[str],
        final_schema: Dict,
        schema_phase_rounds: List[Dict]
    ) -> Dict:
        """Generate parser code and refine it once per schema-phase round.

        The HTML saved by the schema phase is reused; nothing is fetched or
        rendered again. The first round that has no previous parser generates
        the initial code, every later round refines the previous code.

        Args:
            urls: Input list matching ``schema_phase_rounds``; not read here.
            final_schema: Final schema produced by the schema phase.
            schema_phase_rounds: Round records of the schema phase.

        Returns:
            A dict with ``rounds``, ``parsers``, ``final_parser`` and
            ``success``. A failure in round 1 aborts the phase.
        """
        result = {
            'rounds': [],
            'parsers': [],
            'final_parser': None,
            'success': False,
        }

        if not schema_phase_rounds:
            logger.error("Schema阶段没有可用的数据")
            return result

        current_parser_code = None
        current_parser_path = None

        for idx, schema_round in enumerate(schema_phase_rounds, 1):
            if not schema_round.get('success'):
                logger.warning(f"Schema阶段第 {idx} 轮失败,跳过代码生成")
                continue

            logger.info(f"\n{'─'*70}")
            logger.info(f"代码迭代 - 第 {idx}/{len(schema_phase_rounds)} 轮")
            logger.info(f"{'─'*70}")

            try:
                html_path = schema_round.get('html_path')
                html_original_path = schema_round.get('html_original_path')
                if not html_path:
                    logger.error(f" ✗ Schema阶段第 {idx} 轮缺少HTML路径")
                    continue

                logger.info(f" [1/2] 复用Schema阶段的精简HTML: {html_path}")
                if html_original_path:
                    logger.debug(f" 原始HTML位置: {html_original_path}")
                with open(html_path, 'r', encoding='utf-8') as f:
                    html_content = f.read()
                logger.success(f" ✓ 精简HTML已加载(长度: {len(html_content)} 字符)")

                request = {
                    "html_content": html_content,
                    "target_json": final_schema,
                    "output_dir": str(self.parsers_dir),
                }
                # Decide on "is there a previous parser" rather than on the
                # round number, so a skipped first round can never hand
                # ``None`` to the refinement call.
                if current_parser_code is None:
                    logger.info(" [2/2] 生成初始解析代码...")
                    parser_result = generate_parser_code.invoke(request)
                    logger.success(" ✓ 初始解析代码已生成")
                else:
                    logger.info(" [2/2] 优化解析代码(基于上一轮)...")
                    request.update({
                        "previous_parser_code": current_parser_code,
                        "previous_parser_path": current_parser_path,
                        "round_num": idx,
                    })
                    parser_result = generate_parser_code.invoke(request)
                    logger.success(" ✓ 解析代码已优化")

                # Persist this round's parser.
                code_parser_path = self.parsers_dir / f"parser_round_{idx}.py"
                self._write_text(code_parser_path, parser_result['code'])
                logger.success(f" ✓ 解析器已保存: {code_parser_path}")

                current_parser_code = parser_result['code']
                current_parser_path = str(code_parser_path)
                parser_result['parser_path'] = current_parser_path

                result['rounds'].append({
                    'round': idx,
                    'url': schema_round['url'],
                    'html_path': html_path,
                    'screenshot': schema_round.get('screenshot'),
                    'groundtruth_schema': schema_round.get('groundtruth_schema'),
                    'parser_path': current_parser_path,
                    'parser_code': current_parser_code,
                    'parser_result': parser_result,
                    'success': True,
                })
                result['parsers'].append(parser_result)
                logger.success(f"代码迭代第 {idx} 轮完成")

            except Exception as e:
                logger.error(f"代码迭代第 {idx} 轮失败: {str(e)}")
                self._log_traceback()

                result['rounds'].append({
                    'round': idx,
                    'url': schema_round.get('url'),
                    'error': str(e),
                    'success': False,
                })

                if idx == 1:
                    # A failed first round aborts the phase.
                    return result

        if current_parser_code:
            final_parser_path = self.output_dir / "final_parser.py"
            self._write_text(final_parser_path, current_parser_code)
            logger.success(f"最终解析器已保存: {final_parser_path}")

            result['final_parser'] = {
                'parser_path': str(final_parser_path),
                'code': current_parser_code,
                'config_path': None,  # placeholder for a config file path
                'config': final_schema,
            }
            result['success'] = True

        return result

    # ------------------------------------------------------------------
    # Final parsing
    # ------------------------------------------------------------------
    def _collect_html_files(self, all_rounds: List[Dict]) -> List[Path]:
        """Return the HTML files the final parser should be run on.

        The files recorded by the successful rounds are preferred, so that
        leftovers of an earlier run in the same output directory are not
        parsed and rounds whose simplification failed (raw HTML only) are not
        lost. When the rounds name no existing file, the simplified and then
        the original HTML directories are scanned, as before.
        """
        html_files = []
        seen = set()
        for round_data in all_rounds or []:
            if not round_data.get('success'):
                continue
            recorded = round_data.get('html_path')
            if not recorded:
                continue
            candidate = Path(recorded)
            if candidate not in seen and candidate.is_file():
                seen.add(candidate)
                html_files.append(candidate)
        if html_files:
            return html_files

        html_files = sorted(self.html_simplified_dir.glob("*.html"))
        if html_files:
            return html_files

        logger.warning(f" 精简HTML目录下没有找到HTML文件: {self.html_simplified_dir}")
        html_files = sorted(self.html_original_dir.glob("*.html"))
        if html_files:
            logger.info(f" 使用原始HTML目录: {self.html_original_dir}")
        else:
            logger.error(" 原始HTML目录也没有找到HTML文件")
        return html_files

    def _parse_all_html_with_final_parser(self, results: Dict, all_rounds: List[Dict]) -> None:
        """Parse the HTML files with the final parser and save one JSON per file.

        Errors are logged, never raised: a parser that cannot be loaded ends
        the method, a file that cannot be parsed is skipped.

        Args:
            results: Execution result; ``results['final_parser']['parser_path']``
                names the parser file to load.
            all_rounds: Round records whose ``html_path`` entries select the
                files to parse.
        """
        final_parser_path = results['final_parser']['parser_path']

        # Only the load itself is reported as a load failure.
        try:
            logger.info(f" 加载最终解析器: {final_parser_path}")
            parser = self._load_parser(final_parser_path)
        except Exception as e:
            logger.error(f"加载最终解析器失败: {str(e)}")
            self._log_traceback()
            return

        html_files = self._collect_html_files(all_rounds)
        if not html_files:
            return

        logger.info(f" 找到 {len(html_files)} 个HTML文件")

        for html_path in html_files:
            logger.info(f" 解析 {html_path.name}...")

            try:
                with open(html_path, 'r', encoding='utf-8') as f:
                    html_content = f.read()

                parsed_data = parser.parse(html_content)

                # The JSON file is named after the HTML file.
                json_path = self.result_dir / (html_path.stem + '.json')
                self._write_json(json_path, parsed_data)

                logger.success(f" ✓ JSON已保存: {json_path}")
                logger.info(f" 提取了 {len(parsed_data)} 个字段")

            except Exception as e:
                logger.error(f" ✗ 解析 {html_path.name} 失败: {str(e)}")
                self._log_traceback()

        logger.success(f"所有HTML解析完成,结果已保存到: {self.result_dir}")

    def _load_parser(self, parser_path: str):
        """Load a parser file dynamically and return a ``WebPageParser`` instance.

        Args:
            parser_path: Path of the Python file that defines ``WebPageParser``.

        Returns:
            A new instance of the module's ``WebPageParser`` class.

        Raises:
            ImportError: if no import spec/loader can be built for the path.
            Exception: if the module does not define ``WebPageParser``; any
                exception raised while executing the module propagates as is.
        """
        module_name = self._PARSER_MODULE_NAME
        spec = importlib.util.spec_from_file_location(module_name, parser_path)
        if spec is None or spec.loader is None:
            raise ImportError(f"Cannot load parser module from {parser_path!r}")

        module = importlib.util.module_from_spec(spec)

        # Register before executing (the module may refer to itself), but do
        # not leave a half-initialised module behind when execution fails.
        previous = sys.modules.get(module_name)
        sys.modules[module_name] = module
        try:
            spec.loader.exec_module(module)
        except BaseException:
            if previous is not None:
                sys.modules[module_name] = previous
            else:
                sys.modules.pop(module_name, None)
            raise

        if hasattr(module, 'WebPageParser'):
            return module.WebPageParser()
        raise Exception("解析器中未找到WebPageParser类")
agent/orchestrator.py ADDED
@@ -0,0 +1,128 @@
1
+ """
2
+ Agent 编排器
3
+ 整合规划器和执行器,提供统一的Agent接口
4
+ """
5
+ from typing import List, Dict
6
+ from pathlib import Path
7
+ from loguru import logger
8
+ from .planner import AgentPlanner
9
+ from .executor import AgentExecutor
10
+
11
+
12
class ParserAgent:
    """Agent that generates an HTML parser.

    Given a set of HTML files, it produces Python code able to parse those
    pages by combining a planner with an executor.
    """

    def __init__(self, output_dir: str = "output"):
        """Set up the planner and the executor.

        Args:
            output_dir: Output directory, handed to the executor and also kept
                on the agent as a ``Path``.
        """
        self.planner = AgentPlanner()
        self.executor = AgentExecutor(output_dir)
        self.output_dir = Path(output_dir)

        logger.info("ParserAgent 初始化完成")

    def generate_parser(
        self,
        html_files: List[str],
        domain: str = None
    ) -> Dict:
        """Plan, execute and summarise one parser-generation run.

        Steps:
            1. Planning: the planner builds a plan from the HTML files.
            2. Execution: the executor runs the plan (schema iteration, then
               code iteration).
            3. Summary: a textual summary of the execution is produced.

        Args:
            html_files: Paths of the HTML files.
            domain: Optional domain name, passed to the planner.

        Returns:
            On failure ``{'success': False, 'error', 'execution_result'}``;
            on success a dict with ``success``, ``plan``, ``execution_result``,
            ``summary``, ``parser_path`` and ``config_path``.
        """
        rule = "=" * 70

        for message in (rule, "开始生成解析器", rule):
            logger.info(message)

        # Step 1: planning.
        logger.info("\n[步骤 1/3] 任务规划")
        plan = self.planner.create_plan(html_files, domain)

        # Step 2: execution (two-phase iteration).
        logger.info("\n[步骤 2/3] 执行计划 - 两阶段迭代")
        execution_result = self.executor.execute_plan(plan)

        if not execution_result['success']:
            logger.error("执行失败,无法生成解析器")
            return dict(
                success=False,
                error='执行失败',
                execution_result=execution_result,
            )

        # Step 3: summary.
        logger.info("\n[步骤 3/3] 生成总结")
        summary = self._generate_summary(execution_result)

        logger.info(rule)
        logger.success("解析器生成完成!")
        logger.info(rule)

        final_parser = execution_result['final_parser']
        return dict(
            success=True,
            plan=plan,
            execution_result=execution_result,
            summary=summary,
            parser_path=final_parser['parser_path'],
            config_path=final_parser.get('config_path'),
        )

    def _generate_summary(self, execution_result: Dict) -> str:
        """Build the execution summary, log it at INFO level and return it."""
        rule = "=" * 70

        # Schema iteration phase.
        schema_phase = execution_result.get('schema_phase', {})
        schema_rounds = schema_phase.get('rounds', [])
        schema_ok = sum(1 for entry in schema_rounds if entry.get('success'))

        report = [
            "\n" + rule,
            "执行总结",
            rule,
            f"\nSchema迭代阶段: {schema_ok}/{len(schema_rounds)} 轮成功",
        ]

        final_schema = schema_phase.get('final_schema')
        if final_schema:
            report.append(f" 最终Schema字段数: {len(final_schema)}")

        final_schema_path = schema_phase.get('final_schema_path')
        if final_schema_path:
            report.append(f" 最终Schema路径: {final_schema_path}")

        # Code iteration phase.
        code_rounds = execution_result.get('code_phase', {}).get('rounds', [])
        code_ok = sum(1 for entry in code_rounds if entry.get('success'))
        report.append(f"\n代码迭代阶段: {code_ok}/{len(code_rounds)} 轮成功")

        # Final parser.
        final_parser = execution_result.get('final_parser')
        if final_parser:
            report.append(f"\n最终解析器路径: {final_parser['parser_path']}")

        report.append(rule)

        summary = "\n".join(report)
        logger.info(summary)
        return summary
128
+