jarvis-ai-assistant: 0.1.131-py3-none-any.whl → 0.1.132-py3-none-any.whl
This diff shows the content changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- jarvis/__init__.py +1 -1
- jarvis/jarvis_agent/__init__.py +48 -29
- jarvis/jarvis_agent/patch.py +61 -43
- jarvis/jarvis_agent/shell_input_handler.py +1 -1
- jarvis/jarvis_code_agent/code_agent.py +87 -86
- jarvis/jarvis_dev/main.py +335 -626
- jarvis/jarvis_git_squash/main.py +10 -31
- jarvis/jarvis_multi_agent/__init__.py +19 -28
- jarvis/jarvis_platform/ai8.py +7 -32
- jarvis/jarvis_platform/base.py +2 -7
- jarvis/jarvis_platform/kimi.py +3 -144
- jarvis/jarvis_platform/ollama.py +54 -68
- jarvis/jarvis_platform/openai.py +0 -4
- jarvis/jarvis_platform/oyi.py +0 -75
- jarvis/jarvis_platform/yuanbao.py +264 -0
- jarvis/jarvis_rag/file_processors.py +138 -0
- jarvis/jarvis_rag/main.py +1305 -425
- jarvis/jarvis_tools/ask_codebase.py +205 -39
- jarvis/jarvis_tools/code_review.py +125 -99
- jarvis/jarvis_tools/execute_python_script.py +58 -0
- jarvis/jarvis_tools/execute_shell.py +13 -26
- jarvis/jarvis_tools/execute_shell_script.py +1 -1
- jarvis/jarvis_tools/file_analyzer.py +271 -0
- jarvis/jarvis_tools/file_operation.py +1 -1
- jarvis/jarvis_tools/find_caller.py +213 -0
- jarvis/jarvis_tools/find_symbol.py +211 -0
- jarvis/jarvis_tools/function_analyzer.py +248 -0
- jarvis/jarvis_tools/git_commiter.py +4 -4
- jarvis/jarvis_tools/methodology.py +89 -48
- jarvis/jarvis_tools/project_analyzer.py +220 -0
- jarvis/jarvis_tools/read_code.py +23 -2
- jarvis/jarvis_tools/read_webpage.py +195 -81
- jarvis/jarvis_tools/registry.py +132 -11
- jarvis/jarvis_tools/search_web.py +55 -10
- jarvis/jarvis_tools/tool_generator.py +6 -8
- jarvis/jarvis_utils/__init__.py +1 -0
- jarvis/jarvis_utils/config.py +67 -3
- jarvis/jarvis_utils/embedding.py +344 -45
- jarvis/jarvis_utils/git_utils.py +9 -1
- jarvis/jarvis_utils/input.py +7 -6
- jarvis/jarvis_utils/methodology.py +379 -7
- jarvis/jarvis_utils/output.py +5 -3
- jarvis/jarvis_utils/utils.py +59 -7
- {jarvis_ai_assistant-0.1.131.dist-info → jarvis_ai_assistant-0.1.132.dist-info}/METADATA +3 -2
- jarvis_ai_assistant-0.1.132.dist-info/RECORD +82 -0
- {jarvis_ai_assistant-0.1.131.dist-info → jarvis_ai_assistant-0.1.132.dist-info}/entry_points.txt +2 -0
- jarvis/jarvis_codebase/__init__.py +0 -0
- jarvis/jarvis_codebase/main.py +0 -1011
- jarvis/jarvis_tools/treesitter_analyzer.py +0 -331
- jarvis/jarvis_treesitter/README.md +0 -104
- jarvis/jarvis_treesitter/__init__.py +0 -20
- jarvis/jarvis_treesitter/database.py +0 -258
- jarvis/jarvis_treesitter/example.py +0 -115
- jarvis/jarvis_treesitter/grammar_builder.py +0 -182
- jarvis/jarvis_treesitter/language.py +0 -117
- jarvis/jarvis_treesitter/symbol.py +0 -31
- jarvis/jarvis_treesitter/tools_usage.md +0 -121
- jarvis_ai_assistant-0.1.131.dist-info/RECORD +0 -85
- {jarvis_ai_assistant-0.1.131.dist-info → jarvis_ai_assistant-0.1.132.dist-info}/LICENSE +0 -0
- {jarvis_ai_assistant-0.1.131.dist-info → jarvis_ai_assistant-0.1.132.dist-info}/WHEEL +0 -0
- {jarvis_ai_assistant-0.1.131.dist-info → jarvis_ai_assistant-0.1.132.dist-info}/top_level.txt +0 -0
jarvis/jarvis_tools/project_analyzer.py ADDED

@@ -0,0 +1,220 @@
+from typing import Dict, Any, List
+import os
+
+from jarvis.jarvis_agent import Agent
+from jarvis.jarvis_platform.registry import PlatformRegistry
+from jarvis.jarvis_utils.output import OutputType, PrettyOutput
+
+
+class ProjectAnalyzerTool:
+    """
+    Project analysis tool.
+    Uses an agent to analyze project structure, entry points, module layout, and related information.
+    """
+
+    name = "project_analyzer"
+    description = "Analyzes project structure, entry points, and module layout, and provides a project overview"
+    parameters = {
+        "type": "object",
+        "properties": {
+            "root_dir": {
+                "type": "string",
+                "description": "Project root directory path (optional)",
+                "default": "."
+            },
+            "focus_dirs": {
+                "type": "array",
+                "items": {
+                    "type": "string"
+                },
+                "description": "Directories to focus the analysis on (optional)",
+                "default": []
+            },
+            "exclude_dirs": {
+                "type": "array",
+                "items": {
+                    "type": "string"
+                },
+                "description": "Directories to exclude (optional)",
+                "default": []
+            },
+            "objective": {
+                "type": "string",
+                "description": "Goal and purpose of this analysis, e.g. 'understand the architecture before refactoring' or 'look for performance bottlenecks'",
+                "default": ""
+            }
+        },
+        "required": []
+    }
+
+    def execute(self, args: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Run the project analysis tool.
+
+        Args:
+            args: Dictionary of parameters
+
+        Returns:
+            Dictionary with the execution result
+        """
+        # Remember the original working directory
+        original_dir = os.getcwd()
+
+        try:
+            # Parse arguments
+            root_dir = args.get("root_dir", ".")
+            focus_dirs = args.get("focus_dirs", [])
+            exclude_dirs = args.get("exclude_dirs", [])
+            objective = args.get("objective", "")
+
+            # Build the agent's system prompt
+            system_prompt = self._create_system_prompt(
+                root_dir, focus_dirs, exclude_dirs, objective
+            )
+
+            # Build the agent's summary prompt
+            summary_prompt = self._create_summary_prompt(root_dir, objective)
+
+            # Change into the project root
+            os.chdir(root_dir)
+
+            # Assemble the tools the agent may use
+            from jarvis.jarvis_tools.registry import ToolRegistry
+            tool_registry = ToolRegistry()
+            tool_registry.use_tools([
+                "execute_shell",
+                "read_code",
+                "find_symbol",
+                "function_analyzer",
+                "find_caller",
+                "file_analyzer",
+                "ask_codebase"
+            ])
+
+            # Create and run the agent
+            analyzer_agent = Agent(
+                system_prompt=system_prompt,
+                name="ProjectAnalyzer",
+                description="Analyzes project structure, module layout, and key components",
+                summary_prompt=summary_prompt,
+                platform=PlatformRegistry().get_codegen_platform(),
+                output_handler=[tool_registry],
+                need_summary=True,
+                is_sub_agent=True,
+                use_methodology=False,
+                record_methodology=False,
+                execute_tool_confirm=False,
+                auto_complete=True
+            )
+
+            # Run the agent and collect the result
+            task_input = "Analyze the project structure, entry points, and module layout, and provide a project overview"
+            result = analyzer_agent.run(task_input)
+
+            return {
+                "success": True,
+                "stdout": result,
+                "stderr": ""
+            }
+
+        except Exception as e:
+            PrettyOutput.print(str(e), OutputType.ERROR)
+            return {
+                "success": False,
+                "stdout": "",
+                "stderr": f"Project analysis failed: {str(e)}"
+            }
+        finally:
+            # Restore the original working directory
+            os.chdir(original_dir)
+
+    def _create_system_prompt(self, root_dir: str, focus_dirs: List[str],
+                              exclude_dirs: List[str], objective: str) -> str:
+        """
+        Build the agent's system prompt.
+
+        Args:
+            root_dir: Project root directory
+            focus_dirs: Directories to focus on
+            exclude_dirs: Directories to exclude
+            objective: Analysis objective
+
+        Returns:
+            The system prompt text
+        """
+        focus_dirs_str = ", ".join(focus_dirs) if focus_dirs else "the whole project"
+        exclude_dirs_str = ", ".join(exclude_dirs) if exclude_dirs else "none"
+
+        objective_text = f"\n\n## Analysis objective\n{objective}" if objective else "\n\n## Analysis objective\nGain a full picture of the project structure, module layout, and key components"
+
+        return f"""# Project Architecture Analyst
+
+## Task
+Perform a targeted analysis of the project `{root_dir}`, focusing on what the analysis objective requires, and produce a focused, deep, and insightful project analysis report.{objective_text}
+
+## Scope
+- Project root: `{root_dir}`
+- Focus on: {focus_dirs_str}
+- Excluded directories: {exclude_dirs_str}
+
+## Strategy
+1. First understand the analysis objective and determine what information you need to find
+2. Choose whatever analysis approach fits the objective, without being constrained by a fixed framework
+3. Explore the project selectively, looking only at the parts directly relevant to the objective
+4. Judge the depth and breadth of the analysis by what the objective requires
+
+## Example exploration commands
+```bash
+# Get the project file structure
+find . -type f -not -path "*/\\.*" | sort
+
+# Look for likely entry points
+find . -name "main.*" -o -name "app.*" -o -name "index.*"
+
+# Inspect configuration files
+find . -name "*.json" -o -name "*.yaml" -o -name "*.toml" -o -name "*.ini" -o -name "*.conf"
+
+# Look for core modules
+find . -name "core.*" -o -name "*core*" -o -name "main.*" -o -name "api.*"
+```
+
+## Analysis tools
+- Use `file_analyzer` to analyze the structure and role of key files
+- Use `find_symbol` to locate and analyze important symbols and functions
+- Use `function_analyzer` to understand the implementation of complex functions
+- Use `find_caller` to trace call relationships and dependencies
+
+## Output requirements
+- Directly answer the key questions posed by the analysis objective
+- Provide deep insights relevant to the objective
+- Everything in the analysis should serve the analysis objective
+- Avoid redundant information unrelated to the objective
+- Back up conclusions with concrete code paths and examples
+- Give concrete suggestions and improvement directions for the analysis objective"""
+
+    def _create_summary_prompt(self, root_dir: str, objective: str) -> str:
+        """
+        Build the agent's summary prompt.
+
+        Args:
+            root_dir: Project root directory
+            objective: Analysis objective
+
+        Returns:
+            The summary prompt text
+        """
+        objective_text = f"\n\n## Specific analysis objective\n{objective}" if objective else ""
+
+        return f"""# Project Analysis Report: `{root_dir}`{objective_text}
+
+## Report requirements
+Produce a project analysis report driven entirely by the analysis objective. Do not follow a fixed report template; organize the content purely around the objective:
+
+- Focus on answering the questions posed by the analysis objective
+- Include only findings and insights directly relevant to the objective
+- Skip anything unrelated to the objective; no exhaustive survey is needed
+- Match the depth of analysis to what the objective actually requires
+- Support your points with concrete code paths and examples
+- Present the report in clear, concise Markdown
+
+Stay flexible and avoid fixed patterns of thinking. Your task is not to provide a comprehensive project overview but to directly address the specific questions raised in the analysis objective."""
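For orientation, here is a minimal sketch of how the new tool could be driven directly from Python. The argument names come from the `parameters` schema above; the import path mirrors the file location, and a configured jarvis environment (with a code-generation platform available) is assumed:

```python
# Hypothetical driver for the new project_analyzer tool.
# Argument names follow the `parameters` schema in the diff above.
from jarvis.jarvis_tools.project_analyzer import ProjectAnalyzerTool

tool = ProjectAnalyzerTool()
result = tool.execute({
    "root_dir": ".",                      # analyze the current project
    "exclude_dirs": [".git", "dist"],     # directories to skip
    "objective": "Understand the architecture before refactoring",
})

# execute() always returns a {"success", "stdout", "stderr"} dictionary
print(result["stdout"] if result["success"] else result["stderr"])
```

Note that `execute` switches into `root_dir` with `os.chdir` and restores the original directory in its `finally` block, so the caller's working directory survives even when the analysis fails.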
jarvis/jarvis_tools/read_code.py CHANGED

@@ -7,7 +7,7 @@ from jarvis.jarvis_utils.output import OutputType, PrettyOutput
 
 class ReadCodeTool:
     name = "read_code"
-    description = "…
+    description = "Code reading and analysis tool: reads source files and adds line numbers. Optimized for code files, with better formatted output and line-number display; suited to code analysis, review, and understanding an implementation"
     parameters = {
         "type": "object",
         "properties": {
@@ -29,6 +29,16 @@
     }
 
     def _handle_single_file(self, filepath: str, start_line: int = 1, end_line: int = -1) -> Dict[str, Any]:
+        """Read a single file.
+
+        Args:
+            filepath (str): Path to the file
+            start_line (int): Starting line number, defaults to 1
+            end_line (int): Ending line number; the default -1 means end of file
+
+        Returns:
+            Dict[str, Any]: Dictionary with success status, output content, and error message
+        """
         try:
             abs_path = os.path.abspath(filepath)
             with yaspin(text=f"Reading file: {abs_path}...", color="cyan") as spinner:
@@ -73,7 +83,7 @@
             # Add line numbers and build the output
             selected_lines = lines[start_line-1:end_line]
             numbered_content = "".join(
-                [f"{i:4d}…
+                [f"{i:4d}:{line}"
                 for i, line in enumerate(selected_lines, start=start_line)]
             )
@@ -86,6 +96,9 @@
             )
             spinner.text = f"File read complete: {abs_path}"
             spinner.ok("✅")
+
+            PrettyOutput.print(output, OutputType.SUCCESS)
+
             return {
                 "success": True,
                 "stdout": output,
@@ -101,6 +114,14 @@
             }
 
     def execute(self, args: Dict) -> Dict[str, Any]:
+        """Run the code reading operation.
+
+        Args:
+            args (Dict): Parameter dictionary containing the file list
+
+        Returns:
+            Dict[str, Any]: Dictionary with success status, output content, and error message
+        """
         try:
             if "files" not in args or not isinstance(args["files"], list):
                 return {
jarvis/jarvis_tools/read_webpage.py CHANGED

@@ -1,7 +1,8 @@
 from typing import Dict, Any
-import requests
-from bs4 import BeautifulSoup
-from …
+from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
+from bs4 import BeautifulSoup, Tag
+from urllib.parse import urlparse, urljoin
+import re
 
 from jarvis.jarvis_utils.output import OutputType, PrettyOutput
 
@@ -20,93 +21,206 @@ class WebpageTool:
     }
 
     def execute(self, args: Dict) -> Dict[str, Any]:
-        """Read webpage content"""
+        """Read webpage content using Playwright to handle JavaScript-rendered pages"""
         try:
-            url = args["url"].strip()
+            url = args["url"].strip()
 
-            … (truncated in the diff view)
-            with yaspin(text="Reading webpage...", color="cyan") as spinner:
-                response = requests.get(url, headers=headers, timeout=10)
-                response.raise_for_status()
-                spinner.text = "Webpage loaded"
-                spinner.ok("✅")
+            with sync_playwright() as p:
+                # Launch browser
+                browser = p.chromium.launch(
+                    headless=True,
+                    args=['--disable-gpu', '--no-sandbox', '--disable-dev-shm-usage']
+                )
 
-            … (truncated in the diff view)
-            # Parse HTML
-            with yaspin(text="Parsing webpage...", color="cyan") as spinner:
-                soup = BeautifulSoup(response.text, 'html.parser')
-                spinner.text = "Webpage parsed"
-                spinner.ok("✅")
-
-            # Remove script and style tags
-            with yaspin(text="Removing scripts and styles...", color="cyan") as spinner:
-                for script in soup(["script", "style"]):
-                    script.decompose()
-                spinner.text = "Scripts and styles removed"
-                spinner.ok("✅")
-
-            # Extract title
-            with yaspin(text="Extracting title...", color="cyan") as spinner:
-                title = soup.title.string if soup.title else ""
-                title = title.strip() if title else "No title"
-                spinner.text = "Title extracted"
-                spinner.ok("✅")
-
-            with yaspin(text="Extracting text and links...", color="cyan") as spinner:
-                # Extract text and links
-                text_parts = []
-                links = []
+                # Create a new page with appropriate settings
+                page = browser.new_page(
+                    user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
+                    viewport={'width': 1920, 'height': 1080}
+                )
 
-            # … (truncated in the diff view)
+                # Set timeout to avoid long waits
+                page.set_default_timeout(30000)  # 30 seconds
+
+                try:
+                    # Navigate to URL and wait for page to load
+                    response = page.goto(url, wait_until="domcontentloaded")
+
+                    # Additional wait for network to be idle (with a timeout)
+                    try:
+                        page.wait_for_load_state("networkidle", timeout=10000)
+                    except PlaywrightTimeoutError:
+                        # Continue even if network doesn't become completely idle
+                        pass
+
+                    # Make sure we got a valid response
+                    if not response or response.status >= 400:
+                        raise Exception(f"Failed to load page: HTTP {response.status if response else 'No response'}")
+
+                    # Get page title safely
+                    title = "No title"
+                    try:
+                        title = page.title()
+                    except Exception:
+                        # Try to extract title from content if direct method fails
+                        try:
+                            title_element = page.query_selector("title")
+                            if title_element:
+                                title = title_element.text_content() or "No title"
+                        except Exception:
+                            pass
+
+                    # Get the HTML content after JavaScript execution
+                    html_content = page.content()
+
+                except Exception as e:
+                    raise Exception(f"Error navigating to page: {str(e)}")
+                finally:
+                    # Always close browser
+                    browser.close()
+
+            # Parse with BeautifulSoup and convert to markdown
+            markdown_content = self._html_to_markdown(html_content, url)
+
+            # Build output in markdown format
             output = [
-                f"…
-                "",
-                … (truncated in the diff view)
-                "\n".join(text_parts),
-                "",
-                "Links found:",
-                "\n".join(links) if links else "No links found"
+                f"# {title}",
+                f"Url: {url}",
+                markdown_content
             ]
-            … (truncated in the diff view)
-                "stderr": ""
-            }
+
+            return {
+                "success": True,
+                "stdout": "\n".join(output),
+                "stderr": ""
+            }
 
-        except requests.RequestException as e:
-            return {
-                "success": False,
-                "stdout": "",
-                "stderr": f"Webpage request failed: {str(e)}"
-            }
         except Exception as e:
+            PrettyOutput.print(f"Failed to read webpage: {str(e)}", OutputType.ERROR)
             return {
                 "success": False,
                 "stdout": "",
                 "stderr": f"Failed to parse webpage: {str(e)}"
-            }
+            }
+
+    def _create_soup_element(self, content):
+        """Safely create a BeautifulSoup element, ensuring it's treated as markup"""
+        if isinstance(content, str):
+            # Create a wrapper tag to ensure proper parsing
+            soup_div = BeautifulSoup(f"<div>{content}</div>", 'html.parser').div
+            if soup_div is not None:
+                return soup_div.contents
+            # Return an empty list if the div is None
+            return []
+        return content
+
+    def _html_to_markdown(self, html_content: str, base_url: str) -> str:
+        """Convert HTML to Markdown format preserving the content structure"""
+        soup = BeautifulSoup(html_content, 'html.parser')
+
+        # Remove unwanted elements
+        for element in soup(['script', 'style', 'meta', 'noscript', 'head']):
+            element.decompose()
+
+        # Process headings
+        for level in range(1, 7):
+            for heading in soup.find_all(f'h{level}'):
+                text = heading.get_text().strip()
+                heading_md = "\n\n" + "#" * level + " " + text + "\n\n"
+                new_element = self._create_soup_element(heading_md)
+                heading.replace_with(*new_element)
+
+        # Process paragraphs
+        for p in soup.find_all('p'):
+            text = p.get_text().strip()
+            if text:
+                new_element = self._create_soup_element("\n\n" + text + "\n\n")
+                p.replace_with(*new_element)
+
+        # Process unordered lists
+        for ul in soup.find_all('ul'):
+            items = []
+            for li in ul.find_all('li', recursive=False):
+                items.append("* " + li.get_text().strip())
+            new_element = self._create_soup_element("\n\n" + "\n".join(items) + "\n\n")
+            ul.replace_with(*new_element)
+
+        # Process ordered lists
+        for ol in soup.find_all('ol'):
+            items = []
+            for i, li in enumerate(ol.find_all('li', recursive=False), 1):
+                items.append(str(i) + ". " + li.get_text().strip())
+            new_element = self._create_soup_element("\n\n" + "\n".join(items) + "\n\n")
+            ol.replace_with(*new_element)
+
+        # Process links (first pass)
+        for a in soup.find_all('a', href=True):
+            try:
+                href = a['href']
+                text = a.get_text().strip()
+                if text and href:
+                    # Convert relative URLs to absolute
+                    if href.startswith('/') and not href.startswith('//'):
+                        href = urljoin(base_url, href)
+                    link_md = "[" + text + "](" + href + ")"
+                    new_element = self._create_soup_element(link_md)
+                    a.replace_with(*new_element)
+            except (KeyError, AttributeError):
+                continue
+
+        # Process images
+        for img in soup.find_all('img', src=True):
+            try:
+                src = img['src']
+                alt = img.get('alt', 'Image').strip()
+                # Convert relative URLs to absolute
+                if src.startswith('/') and not src.startswith('//'):
+                    src = urljoin(base_url, src)
+                img_md = "![" + alt + "](" + src + ")"
+                new_element = self._create_soup_element(img_md)
+                img.replace_with(*new_element)
+            except (KeyError, AttributeError, UnboundLocalError):
+                continue
+
+        # Process code blocks
+        for pre in soup.find_all('pre'):
+            code = pre.get_text().strip()
+            pre_md = "\n\n```\n" + code + "\n```\n\n"
+            new_element = self._create_soup_element(pre_md)
+            pre.replace_with(*new_element)
+
+        # Process inline code
+        for code in soup.find_all('code'):
+            text = code.get_text().strip()
+            code_md = "`" + text + "`"
+            new_element = self._create_soup_element(code_md)
+            code.replace_with(*new_element)
+
+        # Process line breaks
+        for br in soup.find_all('br'):
+            new_element = self._create_soup_element('\n')
+            br.replace_with(*new_element)
+
+        # Get the full text
+        markdown_text = soup.get_text()
+
+        # Clean up extra whitespace and line breaks
+        markdown_text = re.sub(r'\n{3,}', '\n\n', markdown_text)
+        markdown_text = re.sub(r'\s{2,}', ' ', markdown_text)
+
+        # Process links again (for any that might have been missed)
+        link_pattern = r'\[([^\]]+)\]\(([^)]+)\)'
+        all_links = re.findall(link_pattern, markdown_text)
+
+        # Add a section with all links at the end
+        if all_links:
+            link_section = ["", "## Links", ""]
+            seen_links = set()
+            for text, href in all_links:
+                link_entry = "[" + text + "](" + href + ")"
+                if link_entry not in seen_links:
+                    link_section.append(link_entry)
+                    seen_links.add(link_entry)
+
+            markdown_text += "\n\n" + "\n".join(link_section)
+
+        return markdown_text.strip()