web2json-agent 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent/__init__.py +14 -0
- agent/executor.py +545 -0
- agent/orchestrator.py +128 -0
- agent/planner.py +61 -0
- cli.py +195 -0
- config/__init__.py +4 -0
- config/settings.py +72 -0
- config/validator.py +409 -0
- main.py +137 -0
- prompts/__init__.py +14 -0
- prompts/code_generator.py +237 -0
- prompts/schema_extraction.py +169 -0
- prompts/schema_merge.py +168 -0
- tools/__init__.py +24 -0
- tools/code_generator.py +134 -0
- tools/html_simplifier.py +405 -0
- tools/schema_extraction.py +264 -0
- tools/webpage_screenshot.py +92 -0
- tools/webpage_source.py +46 -0
- utils/__init__.py +6 -0
- utils/llm_client.py +350 -0
- web2json_agent-1.0.0.dist-info/METADATA +348 -0
- web2json_agent-1.0.0.dist-info/RECORD +27 -0
- web2json_agent-1.0.0.dist-info/WHEEL +5 -0
- web2json_agent-1.0.0.dist-info/entry_points.txt +2 -0
- web2json_agent-1.0.0.dist-info/licenses/LICENSE +21 -0
- web2json_agent-1.0.0.dist-info/top_level.txt +7 -0
agent/planner.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Agent 规划器
|
|
3
|
+
负责分析任务并生成执行计划(简化版,无需 LLM)
|
|
4
|
+
"""
|
|
5
|
+
from typing import List, Dict
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from loguru import logger
|
|
8
|
+
from config.settings import settings
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class AgentPlanner:
    """Planner that analyses a parsing task and produces its execution plan.

    This is the simplified variant: the plan is assembled from a fixed
    template and no LLM call is made.
    """

    def __init__(self):
        """Create the planner; it holds no state (an LLM is no longer needed)."""

    def create_plan(self, html_files: List[str], domain: str = None) -> Dict:
        """Build the execution plan for a parsing task.

        Args:
            html_files: Paths of the HTML files to parse. Every file is used
                as a sample.
            domain: Optional domain name. Any falsy value is replaced by
                ``"local_html_files"``.

        Returns:
            The execution plan dictionary with the keys ``domain``,
            ``total_files``, ``sample_files``, ``sample_urls``,
            ``num_samples`` and ``steps``.
        """
        file_count = len(html_files)
        logger.info(f"正在为 {file_count} 个HTML文件创建执行计划...")

        # Fall back to a placeholder when the caller supplied no domain.
        domain = domain or "local_html_files"

        # Fixed pipeline, in execution order.
        pipeline = [
            "read_html_file",      # 1. read the HTML file
            "capture_screenshot",  # 2. render the page and take a screenshot
            "extract_schema",      # 3. extract the JSON Schema
            "generate_code",       # 4. generate the parsing code
        ]

        plan = {
            "domain": domain,
            "total_files": file_count,
            # All input files are used as samples (list of HTML file paths).
            "sample_files": html_files,
            # Same list again; this field is kept for compatibility.
            "sample_urls": html_files,
            "num_samples": file_count,
            "steps": pipeline,
        }

        logger.success("执行计划创建完成:")
        summary = (
            f" 域名: {domain}",
            f" HTML文件数量: {file_count}",
            f" Schema迭代: {file_count}轮",
            f" 代码迭代: {file_count}轮",
            f" 执行步骤: {len(pipeline)} 个",
        )
        for line in summary:
            logger.info(line)

        return plan
|
cli.py
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
"""
|
|
2
|
+
web2json-agent CLI 入口点
|
|
3
|
+
提供 pip 安装后的命令行接口,支持多个子命令
|
|
4
|
+
"""
|
|
5
|
+
import sys
|
|
6
|
+
import argparse
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from config.validator import ConfigValidator, check_config_or_guide
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def cmd_init(args):
    """Create the configuration file and print the follow-up steps.

    Args:
        args: Parsed CLI arguments. ``args.dir`` selects the target
            directory; when it is falsy the current working directory is
            used.
    """
    print("\n🚀 初始化 web2json-agent 配置\n")

    if args.dir:
        config_dir = Path(args.dir)
    else:
        config_dir = Path.cwd()
    created_env = ConfigValidator.create_env_file(config_dir)

    next_steps = (
        "\n下一步:",
        f" 1. 编辑 {created_env}",
        " 2. 填入你的 API 密钥(OPENAI_API_KEY 和 OPENAI_API_BASE)",
        " 3. 运行 'web2json check --test-api' 检查API响应",
    )
    for line in next_steps:
        print(line)
    print()
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def cmd_setup(args):
    """Run the interactive configuration wizard.

    Args:
        args: Parsed CLI arguments (not used by this command).
    """
    banner = "\n🚀 web2json-agent 交互式配置\n"
    print(banner)
    ConfigValidator.interactive_setup()
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def cmd_check(args):
    """Check the configuration and, on request, the API connection.

    Exits the process with status 1 when the configuration check fails or
    when ``args.test_api`` is set and the API connection test fails.

    Args:
        args: Parsed CLI arguments; ``args.test_api`` enables the API test.
    """
    print("\n🔍 检查配置...\n")
    config_ok, _missing = ConfigValidator.check_config(verbose=True)

    if not config_ok:
        for line in (
            "\n❌ 配置不完整",
            "\n解决方法:",
            " 1. 运行 'web2json init' 创建配置文件",
            " 2. 或运行 'web2json setup' 使用交互式配置向导",
        ):
            print(line)
        sys.exit(1)

    # The basic configuration passed; test the API only when asked to.
    if args.test_api:
        print("\n🔌 测试 API 连接...\n")
        api_ok, failures = ConfigValidator.test_api_connection(test_models=True)

        if not api_ok:
            print("\n❌ API 连接测试失败")
            for model_name, error in failures.items():
                print(f" ✗ {model_name}: {error}")
            for line in (
                "\n请检查:",
                " 1. API 密钥是否正确",
                " 2. API Base URL 是否可访问",
                " 3. 模型名称是否正确",
                " 4. 网络连接是否正常",
            ):
                print(line)
            sys.exit(1)

    for line in (
        "\n✅ 所有检查通过!可以开始使用了",
        "\n示例命令:",
        " web2json -d input_html/ -o output/blog",
    ):
        print(line)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def cmd_generate(args):
    """Generate a parser from a directory of HTML files (main feature).

    Runs the configuration check unless ``args.skip_config_check`` is set,
    reads the HTML files from ``args.directory``, and asks ``ParserAgent`` to
    generate the parser into ``args.output``. Exits with status 1 when the
    result reports failure.

    Args:
        args: Parsed CLI arguments (``directory``, ``output``, ``domain``,
            ``skip_config_check``).
    """
    # Validate the configuration before doing any real work.
    if not args.skip_config_check:
        check_config_or_guide()

    # Imported here, after the configuration check, rather than at module
    # level.
    from main import (  # noqa: F401
        main as main_func,
        setup_logger,
        read_html_files_from_directory,
    )
    from agent import ParserAgent
    from loguru import logger

    setup_logger()

    separator = "=" * 70
    logger.info(separator)
    logger.info("web2json-agent - 智能网页解析代码生成器")
    logger.info(separator)

    # Collect the HTML files.
    logger.info(f"从目录读取HTML文件: {args.directory}")
    html_files = read_html_files_from_directory(args.directory)
    logger.info(f"读取到 {len(html_files)} 个HTML文件")

    # Create the agent and generate the parser.
    parser_agent = ParserAgent(output_dir=args.output)
    result = parser_agent.generate_parser(html_files=html_files, domain=args.domain)

    # Report the outcome; the failure path terminates the process.
    if not result['success']:
        logger.error("\n✗ 解析器生成失败")
        if 'error' in result:
            logger.error(f" 错误: {result['error']}")
        sys.exit(1)

    logger.success("\n✓ 解析器生成成功!")
    logger.info(f" 解析器路径: {result['parser_path']}")
    logger.info(f" 配置路径: {result['config_path']}")

    logger.info("\n使用方法:")
    logger.info(f" python {result['parser_path']} <url_or_html_file>")
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def main():
    """CLI entry point: build the argument parser and dispatch the command.

    With a sub-command (``init``, ``setup``, ``check``) the matching handler
    is called. Without one, ``-d/--directory`` triggers parser generation;
    otherwise the help text and a first-use hint are printed.
    """
    usage_examples = """
示例:
# 首次使用:初始化配置
web2json init
web2json setup # 或使用交互式配置向导

# 检查配置
web2json check
web2json check --test-api

# 从目录读取HTML文件并生成解析器
web2json -d input_html/ -o output/blog

更多信息: https://github.com/ccprocessor/web2json-agent
"""
    cli = argparse.ArgumentParser(
        prog='web2json',
        description='web2json-agent - 智能网页解析代码生成器',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )

    commands = cli.add_subparsers(dest='command', help='子命令')

    # "init" command
    init_cmd = commands.add_parser('init', help='初始化配置文件')
    init_cmd.add_argument('--dir', help='配置文件目录(默认: 当前目录)')
    init_cmd.set_defaults(func=cmd_init)

    # "setup" command
    setup_cmd = commands.add_parser('setup', help='交互式配置向导')
    setup_cmd.set_defaults(func=cmd_setup)

    # "check" command
    check_cmd = commands.add_parser('check', help='检查配置')
    check_cmd.add_argument(
        '--test-api',
        action='store_true',
        help='测试 API 连接和模型可用性',
    )
    check_cmd.set_defaults(func=cmd_check)

    # Options of the main command (parser generation)
    cli.add_argument('-d', '--directory', help='HTML文件目录路径')
    cli.add_argument('-o', '--output', default='output', help='输出目录(默认: output)')
    cli.add_argument('--domain', help='域名(可选)')
    cli.add_argument(
        '--skip-config-check',
        action='store_true',
        help='跳过配置检查(不推荐)',
    )

    args = cli.parse_args()

    if args.command is not None:
        # A sub-command was given: run its handler.
        args.func(args)
    elif args.directory:
        # No sub-command but a directory: treat it as the generate command.
        cmd_generate(args)
    else:
        # Nothing to do: show the help text.
        cli.print_help()
        print("\n💡 提示: 首次使用请先运行 'web2json init' 或 'web2json setup'")
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
if __name__ == "__main__":
|
|
195
|
+
main()
|
config/__init__.py
ADDED
config/settings.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
"""
|
|
2
|
+
HtmlParserAgent 配置管理模块
|
|
3
|
+
"""
|
|
4
|
+
import os
|
|
5
|
+
from typing import Optional
|
|
6
|
+
from pydantic import BaseModel, Field
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from dotenv import load_dotenv
|
|
9
|
+
|
|
10
|
+
# Load environment variables from the ".env" file that sits in the parent of
# the directory containing this module.
# NOTE(review): for an installed package this is presumably not the user's
# working directory — confirm that the file is expected at this location.
env_path = Path(__file__).parent.parent.joinpath(".env")
load_dotenv(dotenv_path=env_path)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# Spellings (after trimming and lower-casing) that switch a boolean flag on.
_TRUE_VALUES = frozenset({"1", "true", "yes", "on"})


def _env_number(name, default, cast):
    """Read a numeric environment variable.

    A variable that is unset or blank falls back to *default* instead of
    making ``cast("")`` raise ``ValueError`` while the module is imported.
    A non-blank value that is not a valid number still raises ``ValueError``.
    """
    raw = os.getenv(name, "").strip()
    return cast(raw or default)


def _env_flag(name, default):
    """Read a boolean environment variable.

    The value is trimmed and compared case-insensitively against
    ``_TRUE_VALUES``; an unset or blank variable falls back to *default*.
    """
    raw = os.getenv(name, "").strip().lower()
    return (raw or default) in _TRUE_VALUES


def _env_csv(name, default):
    """Read a comma-separated environment variable as a list of strings.

    Items are trimmed and empty items are dropped, so a blank value gives an
    empty list rather than ``[""]``.
    """
    items = (item.strip() for item in os.getenv(name, default).split(","))
    return [item for item in items if item]


class Settings(BaseModel):
    """Global settings.

    Every field takes its value from an environment variable through a
    ``default_factory``, so the environment is read when an instance is
    created. String fields use the raw environment value; numeric, boolean
    and list fields are parsed by the helper functions above.
    """

    # ============================================
    # API settings
    # ============================================
    openai_api_key: str = Field(default_factory=lambda: os.getenv("OPENAI_API_KEY", ""))
    openai_api_base: str = Field(default_factory=lambda: os.getenv("OPENAI_API_BASE", "https://api.openai.com/v1"))

    # ============================================
    # Model settings
    # ============================================
    # Default model (general purpose)
    default_model: str = Field(default_factory=lambda: os.getenv("DEFAULT_MODEL", "claude-sonnet-4-5-20250929"))
    default_temperature: float = Field(default_factory=lambda: _env_number("DEFAULT_TEMPERATURE", "0.3", float))

    # Agent
    agent_model: str = Field(default_factory=lambda: os.getenv("AGENT_MODEL", "claude-sonnet-4-5-20250929"))
    agent_temperature: float = Field(default_factory=lambda: _env_number("AGENT_TEMPERATURE", "0", float))

    # Code generation
    code_gen_model: str = Field(default_factory=lambda: os.getenv("CODE_GEN_MODEL", "claude-sonnet-4-5-20250929"))
    code_gen_temperature: float = Field(default_factory=lambda: _env_number("CODE_GEN_TEMPERATURE", "0.3", float))
    code_gen_max_tokens: int = Field(default_factory=lambda: _env_number("CODE_GEN_MAX_TOKENS", "16384", int))

    # Vision understanding
    vision_model: str = Field(default_factory=lambda: os.getenv("VISION_MODEL", "qwen-vl-max"))
    vision_temperature: float = Field(default_factory=lambda: _env_number("VISION_TEMPERATURE", "0", float))
    vision_max_tokens: int = Field(default_factory=lambda: _env_number("VISION_MAX_TOKENS", "16384", int))

    # ============================================
    # Agent settings
    # ============================================
    # The number of iterations is determined by the number of input URLs,
    # so nothing is configured here.

    # ============================================
    # Browser settings
    # ============================================
    headless: bool = Field(default_factory=lambda: _env_flag("HEADLESS", "true"))
    timeout: int = Field(default_factory=lambda: _env_number("TIMEOUT", "30000", int))
    screenshot_full_page: bool = Field(default_factory=lambda: _env_flag("SCREENSHOT_FULL_PAGE", "true"))

    # ============================================
    # HTML simplification settings
    # ============================================
    html_simplify_mode: str = Field(default_factory=lambda: os.getenv("HTML_SIMPLIFY_MODE", "xpath"))
    html_keep_attrs: list = Field(
        default_factory=lambda: _env_csv("HTML_KEEP_ATTRS", "class,id,href,src,data-id")
    )

    class Config:
        """Pydantic configuration."""
        # NOTE(review): this model derives from BaseModel; these two options
        # look like settings-class options — confirm they have any effect.
        env_file = ".env"
        env_file_encoding = "utf-8"
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
# 全局配置实例
|
|
72
|
+
settings = Settings()
|