jarvis-ai-assistant 0.1.131__py3-none-any.whl → 0.1.132__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. jarvis/__init__.py +1 -1
  2. jarvis/jarvis_agent/__init__.py +48 -29
  3. jarvis/jarvis_agent/patch.py +61 -43
  4. jarvis/jarvis_agent/shell_input_handler.py +1 -1
  5. jarvis/jarvis_code_agent/code_agent.py +87 -86
  6. jarvis/jarvis_dev/main.py +335 -626
  7. jarvis/jarvis_git_squash/main.py +10 -31
  8. jarvis/jarvis_multi_agent/__init__.py +19 -28
  9. jarvis/jarvis_platform/ai8.py +7 -32
  10. jarvis/jarvis_platform/base.py +2 -7
  11. jarvis/jarvis_platform/kimi.py +3 -144
  12. jarvis/jarvis_platform/ollama.py +54 -68
  13. jarvis/jarvis_platform/openai.py +0 -4
  14. jarvis/jarvis_platform/oyi.py +0 -75
  15. jarvis/jarvis_platform/yuanbao.py +264 -0
  16. jarvis/jarvis_rag/file_processors.py +138 -0
  17. jarvis/jarvis_rag/main.py +1305 -425
  18. jarvis/jarvis_tools/ask_codebase.py +205 -39
  19. jarvis/jarvis_tools/code_review.py +125 -99
  20. jarvis/jarvis_tools/execute_python_script.py +58 -0
  21. jarvis/jarvis_tools/execute_shell.py +13 -26
  22. jarvis/jarvis_tools/execute_shell_script.py +1 -1
  23. jarvis/jarvis_tools/file_analyzer.py +271 -0
  24. jarvis/jarvis_tools/file_operation.py +1 -1
  25. jarvis/jarvis_tools/find_caller.py +213 -0
  26. jarvis/jarvis_tools/find_symbol.py +211 -0
  27. jarvis/jarvis_tools/function_analyzer.py +248 -0
  28. jarvis/jarvis_tools/git_commiter.py +4 -4
  29. jarvis/jarvis_tools/methodology.py +89 -48
  30. jarvis/jarvis_tools/project_analyzer.py +220 -0
  31. jarvis/jarvis_tools/read_code.py +23 -2
  32. jarvis/jarvis_tools/read_webpage.py +195 -81
  33. jarvis/jarvis_tools/registry.py +132 -11
  34. jarvis/jarvis_tools/search_web.py +55 -10
  35. jarvis/jarvis_tools/tool_generator.py +6 -8
  36. jarvis/jarvis_utils/__init__.py +1 -0
  37. jarvis/jarvis_utils/config.py +67 -3
  38. jarvis/jarvis_utils/embedding.py +344 -45
  39. jarvis/jarvis_utils/git_utils.py +9 -1
  40. jarvis/jarvis_utils/input.py +7 -6
  41. jarvis/jarvis_utils/methodology.py +379 -7
  42. jarvis/jarvis_utils/output.py +5 -3
  43. jarvis/jarvis_utils/utils.py +59 -7
  44. {jarvis_ai_assistant-0.1.131.dist-info → jarvis_ai_assistant-0.1.132.dist-info}/METADATA +3 -2
  45. jarvis_ai_assistant-0.1.132.dist-info/RECORD +82 -0
  46. {jarvis_ai_assistant-0.1.131.dist-info → jarvis_ai_assistant-0.1.132.dist-info}/entry_points.txt +2 -0
  47. jarvis/jarvis_codebase/__init__.py +0 -0
  48. jarvis/jarvis_codebase/main.py +0 -1011
  49. jarvis/jarvis_tools/treesitter_analyzer.py +0 -331
  50. jarvis/jarvis_treesitter/README.md +0 -104
  51. jarvis/jarvis_treesitter/__init__.py +0 -20
  52. jarvis/jarvis_treesitter/database.py +0 -258
  53. jarvis/jarvis_treesitter/example.py +0 -115
  54. jarvis/jarvis_treesitter/grammar_builder.py +0 -182
  55. jarvis/jarvis_treesitter/language.py +0 -117
  56. jarvis/jarvis_treesitter/symbol.py +0 -31
  57. jarvis/jarvis_treesitter/tools_usage.md +0 -121
  58. jarvis_ai_assistant-0.1.131.dist-info/RECORD +0 -85
  59. {jarvis_ai_assistant-0.1.131.dist-info → jarvis_ai_assistant-0.1.132.dist-info}/LICENSE +0 -0
  60. {jarvis_ai_assistant-0.1.131.dist-info → jarvis_ai_assistant-0.1.132.dist-info}/WHEEL +0 -0
  61. {jarvis_ai_assistant-0.1.131.dist-info → jarvis_ai_assistant-0.1.132.dist-info}/top_level.txt +0 -0
jarvis/jarvis_tools/project_analyzer.py
@@ -0,0 +1,220 @@
+from typing import Dict, Any, List
+import os
+
+from jarvis.jarvis_agent import Agent
+from jarvis.jarvis_platform.registry import PlatformRegistry
+from jarvis.jarvis_utils.output import OutputType, PrettyOutput
+
+
+class ProjectAnalyzerTool:
+    """
+    Project analysis tool.
+    Uses an agent to analyze project structure, entry points, and module layout.
+    """
+
+    name = "project_analyzer"
+    description = "Analyzes project structure, entry points, and module layout to provide a project overview"
+    parameters = {
+        "type": "object",
+        "properties": {
+            "root_dir": {
+                "type": "string",
+                "description": "Project root directory path (optional)",
+                "default": "."
+            },
+            "focus_dirs": {
+                "type": "array",
+                "items": {
+                    "type": "string"
+                },
+                "description": "Directories to focus the analysis on (optional)",
+                "default": []
+            },
+            "exclude_dirs": {
+                "type": "array",
+                "items": {
+                    "type": "string"
+                },
+                "description": "Directories to exclude (optional)",
+                "default": []
+            },
+            "objective": {
+                "type": "string",
+                "description": "Goal and purpose of this analysis, e.g. 'understand the project architecture to prepare a refactor' or 'look for performance bottlenecks'",
+                "default": ""
+            }
+        },
+        "required": []
+    }
+
+    def execute(self, args: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Run the project analysis tool.
+
+        Args:
+            args: Dictionary of parameters
+
+        Returns:
+            Dictionary containing the execution result
+        """
+        # Remember the original working directory
+        original_dir = os.getcwd()
+
+        try:
+            # Parse arguments
+            root_dir = args.get("root_dir", ".")
+            focus_dirs = args.get("focus_dirs", [])
+            exclude_dirs = args.get("exclude_dirs", [])
+            objective = args.get("objective", "")
+
+            # Build the agent's system prompt
+            system_prompt = self._create_system_prompt(
+                root_dir, focus_dirs, exclude_dirs, objective
+            )
+
+            # Build the agent's summary prompt
+            summary_prompt = self._create_summary_prompt(root_dir, objective)
+
+            # Change into the root directory
+            os.chdir(root_dir)
+
+            # Set up the tools the agent may use
+            from jarvis.jarvis_tools.registry import ToolRegistry
+            tool_registry = ToolRegistry()
+            tool_registry.use_tools([
+                "execute_shell",
+                "read_code",
+                "find_symbol",
+                "function_analyzer",
+                "find_caller",
+                "file_analyzer",
+                "ask_codebase"
+            ])
+
+            # Create and run the agent
+            analyzer_agent = Agent(
+                system_prompt=system_prompt,
+                name="ProjectAnalyzer",
+                description="Analyzes project structure, module layout, and key components",
+                summary_prompt=summary_prompt,
+                platform=PlatformRegistry().get_codegen_platform(),
+                output_handler=[tool_registry],
+                need_summary=True,
+                is_sub_agent=True,
+                use_methodology=False,
+                record_methodology=False,
+                execute_tool_confirm=False,
+                auto_complete=True
+            )
+
+            # Run the agent and collect the result
+            task_input = "Analyze the project structure, entry points, and module layout to provide a project overview"
+            result = analyzer_agent.run(task_input)
+
+            return {
+                "success": True,
+                "stdout": result,
+                "stderr": ""
+            }
+
+        except Exception as e:
+            PrettyOutput.print(str(e), OutputType.ERROR)
+            return {
+                "success": False,
+                "stdout": "",
+                "stderr": f"Project analysis failed: {str(e)}"
+            }
+        finally:
+            # Restore the original working directory
+            os.chdir(original_dir)
+
+    def _create_system_prompt(self, root_dir: str, focus_dirs: List[str],
+                              exclude_dirs: List[str], objective: str) -> str:
+        """
+        Build the agent's system prompt.
+
+        Args:
+            root_dir: Project root directory
+            focus_dirs: Directories to focus on
+            exclude_dirs: Directories to exclude
+            objective: Analysis objective
+
+        Returns:
+            System prompt text
+        """
+        focus_dirs_str = ", ".join(focus_dirs) if focus_dirs else "the whole project"
+        exclude_dirs_str = ", ".join(exclude_dirs) if exclude_dirs else "none"
+
+        objective_text = f"\n\n## Analysis objective\n{objective}" if objective else "\n\n## Analysis objective\nGain a full picture of the project structure, module layout, and key components"
+
+        return f"""# Project architecture analysis expert
+
+## Task description
+Perform a targeted analysis of the project `{root_dir}`, concentrating on what the analysis objective requires, and produce a focused, in-depth, and insightful analysis report.{objective_text}
+
+## Analysis scope
+- Project root: `{root_dir}`
+- Focus on: {focus_dirs_str}
+- Excluded directories: {exclude_dirs_str}
+
+## Analysis strategy
+1. First understand the analysis objective and decide what information you need to find
+2. Choose analysis methods suited to the objective; do not be constrained by a preset framework
+3. Explore the project selectively, attending only to the parts directly relevant to the objective
+4. Judge the depth and breadth of the analysis from what the objective needs
+
+## Example exploration commands
+```bash
+# Get the project file structure
+find . -type f -not -path "*/\\.*" | sort
+
+# Find likely entry points
+find . -name "main.*" -o -name "app.*" -o -name "index.*"
+
+# Inspect configuration files
+find . -name "*.json" -o -name "*.yaml" -o -name "*.toml" -o -name "*.ini" -o -name "*.conf"
+
+# Find core modules
+find . -name "core.*" -o -name "*core*" -o -name "main.*" -o -name "api.*"
+```
+
+## Using the analysis tools
+- Use `file_analyzer` to analyze the structure and purpose of key files
+- Use `find_symbol` to locate and analyze important symbols and functions
+- Use `function_analyzer` to understand the implementation of complex functions
+- Use `find_caller` to trace call relationships and dependencies
+
+## Output requirements
+- Directly answer the key questions posed by the analysis objective
+- Provide in-depth insights relevant to the objective
+- Keep the analysis in direct service of the objective
+- Avoid redundant information unrelated to the objective
+- Support conclusions with concrete code paths and examples
+- Offer concrete suggestions and improvement directions for the objective"""
+
+    def _create_summary_prompt(self, root_dir: str, objective: str) -> str:
+        """
+        Build the agent's summary prompt.
+
+        Args:
+            root_dir: Project root directory
+            objective: Analysis objective
+
+        Returns:
+            Summary prompt text
+        """
+        objective_text = f"\n\n## Specific analysis objective\n{objective}" if objective else ""
+
+        return f"""# Project analysis report: `{root_dir}`{objective_text}
+
+## Report requirements
+Produce a project analysis report driven entirely by the analysis objective. Do not follow a fixed report template; organize the content around the objective:
+
+- Focus on answering the questions the analysis objective raises
+- Include only findings and insights directly relevant to the objective
+- Skip anything unrelated to the objective; a comprehensive survey is not required
+- Match the depth of analysis to what the objective actually needs
+- Support your points with concrete code paths and examples
+- Present the report in clear, concise Markdown
+
+Stay flexible and avoid fixed patterns of thinking. Your task is not to provide a comprehensive project overview, but to directly address the specific questions in the analysis objective."""
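
Read as a whole, the new file is a self-contained tool: `execute` builds the two prompts, registers a sub-toolset, and delegates the exploration to a one-shot `Agent`. A minimal sketch of driving it directly, with illustrative argument values and assuming the platform registry is already configured:

```python
# Hypothetical invocation; the keys follow the `parameters` schema above.
from jarvis.jarvis_tools.project_analyzer import ProjectAnalyzerTool

tool = ProjectAnalyzerTool()
result = tool.execute({
    "root_dir": ".",                   # project to analyze
    "exclude_dirs": ["tests"],         # optional: skip these trees
    "objective": "Understand the module layout before refactoring",
})

# The tool returns the standard result dict used throughout this release.
print(result["stdout"] if result["success"] else result["stderr"])
```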
jarvis/jarvis_tools/read_code.py
@@ -7,7 +7,7 @@ from jarvis.jarvis_utils.output import OutputType, PrettyOutput
 
 class ReadCodeTool:
     name = "read_code"
-    description = "Tool that reads a code file and prefixes each line with its line number"
+    description = "Code reading and analysis tool: reads source files and adds line numbers, with formatting optimized for code; suited to code analysis, review, and understanding an implementation"
     parameters = {
         "type": "object",
         "properties": {
@@ -29,6 +29,16 @@ class ReadCodeTool:
         }
 
     def _handle_single_file(self, filepath: str, start_line: int = 1, end_line: int = -1) -> Dict[str, Any]:
+        """Read a single file.
+
+        Args:
+            filepath (str): Path to the file
+            start_line (int): Starting line number, defaults to 1
+            end_line (int): Ending line number; the default of -1 means end of file
+
+        Returns:
+            Dict[str, Any]: Dictionary with success status, output content, and error message
+        """
         try:
             abs_path = os.path.abspath(filepath)
             with yaspin(text=f"Reading file: {abs_path}...", color="cyan") as spinner:
@@ -73,7 +83,7 @@ class ReadCodeTool:
                 # Add line numbers and build the output content
                 selected_lines = lines[start_line-1:end_line]
                 numbered_content = "".join(
-                    [f"{i:4d} | {line}"
+                    [f"{i:4d}:{line}"
                      for i, line in enumerate(selected_lines, start=start_line)]
                 )
 
@@ -86,6 +96,9 @@ class ReadCodeTool:
                 )
                 spinner.text = f"File read complete: {abs_path}"
                 spinner.ok("✅")
+
+                PrettyOutput.print(output, OutputType.SUCCESS)
+
                 return {
                     "success": True,
                     "stdout": output,
@@ -101,6 +114,14 @@ class ReadCodeTool:
             }
 
     def execute(self, args: Dict) -> Dict[str, Any]:
+        """Run the code-reading operation.
+
+        Args:
+            args (Dict): Parameter dictionary containing the list of files
+
+        Returns:
+            Dict[str, Any]: Dictionary with success status, output content, and error message
+        """
        try:
            if "files" not in args or not isinstance(args["files"], list):
                return {
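
The only behavioral changes in this file are the `PrettyOutput` echo and the tighter line-number separator, from `"{i:4d} | {line}"` to `"{i:4d}:{line}"`. A standalone sketch of the two formats (just the f-strings from the hunk, not the tool itself):

```python
lines = ["def main():\n", "    pass\n"]

# Old format: four-digit line number, space-pipe-space separator
old = "".join(f"{i:4d} | {line}" for i, line in enumerate(lines, start=1))
# New format: four-digit line number, colon separator (3 characters narrower)
new = "".join(f"{i:4d}:{line}" for i, line in enumerate(lines, start=1))

print(old)  #    1 | def main():
print(new)  #    1:def main():
```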
jarvis/jarvis_tools/read_webpage.py
@@ -1,7 +1,8 @@
 from typing import Dict, Any
-import requests
-from bs4 import BeautifulSoup
-from yaspin import yaspin
+from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
+from bs4 import BeautifulSoup, Tag
+from urllib.parse import urlparse, urljoin
+import re
 
 from jarvis.jarvis_utils.output import OutputType, PrettyOutput
 
@@ -20,93 +21,206 @@ class WebpageTool:
     }
 
     def execute(self, args: Dict) -> Dict[str, Any]:
-        """Read webpage content"""
+        """Read webpage content using Playwright to handle JavaScript-rendered pages"""
         try:
-            url = args["url"].strip()
+            url = args["url"].strip()
 
-            # Set request headers
-            headers = {
-                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
-            }
-
-            # Send request
-            with yaspin(text="Reading webpage...", color="cyan") as spinner:
-                response = requests.get(url, headers=headers, timeout=10)
-                response.raise_for_status()
-                spinner.text = "Webpage read complete"
-                spinner.ok("✅")
+            with sync_playwright() as p:
+                # Launch browser
+                browser = p.chromium.launch(
+                    headless=True,
+                    args=['--disable-gpu', '--no-sandbox', '--disable-dev-shm-usage']
+                )
 
-
-            # Use correct encoding
-
-            response.encoding = response.apparent_encoding
-
-            # Parse HTML
-            with yaspin(text="Parsing webpage...", color="cyan") as spinner:
-                soup = BeautifulSoup(response.text, 'html.parser')
-                spinner.text = "Webpage parsed"
-                spinner.ok("✅")
-
-            # Remove script and style tags
-            with yaspin(text="Removing scripts and styles...", color="cyan") as spinner:
-                for script in soup(["script", "style"]):
-                    script.decompose()
-                spinner.text = "Scripts and styles removed"
-                spinner.ok("✅")
-
-            # Extract title
-            with yaspin(text="Extracting title...", color="cyan") as spinner:
-                title = soup.title.string if soup.title else ""
-                title = title.strip() if title else "No title"
-                spinner.text = "Title extracted"
-                spinner.ok("✅")
-
-            with yaspin(text="Extracting text and links...", color="cyan") as spinner:
-                # Extract text and links
-                text_parts = []
-                links = []
+                # Create a new page with appropriate settings
+                page = browser.new_page(
+                    user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
+                    viewport={'width': 1920, 'height': 1080}
+                )
 
-                # Process content and collect links
-                for element in soup.descendants:
-                    if element.name == 'a' and element.get('href'):  # type: ignore
-                        href = element.get('href')  # type: ignore
-                        text = element.get_text(strip=True)
-                        if text and href:
-                            links.append(f"[{text}]({href})")
-                    elif isinstance(element, str) and element.strip():
-                        text_parts.append(element.strip())
-                spinner.text = "Text and links extracted"
-                spinner.ok("✅")
-
-            # Build output
-            with yaspin(text="Building output...", color="cyan") as spinner:
+                # Set timeout to avoid long waits
+                page.set_default_timeout(30000)  # 30 seconds
+
+                try:
+                    # Navigate to URL and wait for page to load
+                    response = page.goto(url, wait_until="domcontentloaded")
+
+                    # Additional wait for network to be idle (with a timeout)
+                    try:
+                        page.wait_for_load_state("networkidle", timeout=10000)
+                    except PlaywrightTimeoutError:
+                        # Continue even if network doesn't become completely idle
+                        pass
+
+                    # Make sure we got a valid response
+                    if not response or response.status >= 400:
+                        raise Exception(f"Failed to load page: HTTP {response.status if response else 'No response'}")
+
+                    # Get page title safely
+                    title = "No title"
+                    try:
+                        title = page.title()
+                    except Exception:
+                        # Try to extract title from content if direct method fails
+                        try:
+                            title_element = page.query_selector("title")
+                            if title_element:
+                                title = title_element.text_content() or "No title"
+                        except Exception:
+                            pass
+
+                    # Get the HTML content after JavaScript execution
+                    html_content = page.content()
+
+                except Exception as e:
+                    raise Exception(f"Error navigating to page: {str(e)}")
+                finally:
+                    # Always close browser
+                    browser.close()
+
+            # Parse with BeautifulSoup and convert to markdown
+            markdown_content = self._html_to_markdown(html_content, url)
+
+            # Build output in markdown format
             output = [
-                f"Title: {title}",
-                "",
-                "Text content:",
-                "\n".join(text_parts),
-                "",
-                "Links found:",
-                "\n".join(links) if links else "No links found"
+                f"# {title}",
+                f"Url: {url}",
+                markdown_content
             ]
-                spinner.text = "Output built"
-                spinner.ok("✅")
-
-            return {
-                "success": True,
-                "stdout": "\n".join(output),
-                "stderr": ""
-            }
+
+            return {
+                "success": True,
+                "stdout": "\n".join(output),
+                "stderr": ""
+            }
 
-        except requests.RequestException as e:
-            return {
-                "success": False,
-                "stdout": "",
-                "stderr": f"Webpage request failed: {str(e)}"
-            }
         except Exception as e:
+            PrettyOutput.print(f"Failed to read webpage: {str(e)}", OutputType.ERROR)
             return {
                 "success": False,
                 "stdout": "",
                 "stderr": f"Failed to parse webpage: {str(e)}"
-            }
+            }
+
+    def _create_soup_element(self, content):
+        """Safely create a BeautifulSoup element, ensuring it's treated as markup"""
+        if isinstance(content, str):
+            # Create a wrapper tag to ensure proper parsing
+            soup_div = BeautifulSoup(f"<div>{content}</div>", 'html.parser').div
+            if soup_div is not None:
+                return soup_div.contents
+            # Return an empty list if the div is None
+            return []
+        return content
+
+    def _html_to_markdown(self, html_content: str, base_url: str) -> str:
+        """Convert HTML to Markdown format preserving the content structure"""
+        soup = BeautifulSoup(html_content, 'html.parser')
+
+        # Remove unwanted elements
+        for element in soup(['script', 'style', 'meta', 'noscript', 'head']):
+            element.decompose()
+
+        # Process headings
+        for level in range(1, 7):
+            for heading in soup.find_all(f'h{level}'):
+                text = heading.get_text().strip()
+                heading_md = "\n\n" + "#" * level + " " + text + "\n\n"
+                new_element = self._create_soup_element(heading_md)
+                heading.replace_with(*new_element)
+
+        # Process paragraphs
+        for p in soup.find_all('p'):
+            text = p.get_text().strip()
+            if text:
+                new_element = self._create_soup_element("\n\n" + text + "\n\n")
+                p.replace_with(*new_element)
+
+        # Process unordered lists
+        for ul in soup.find_all('ul'):
+            items = []
+            for li in ul.find_all('li', recursive=False):
+                items.append("* " + li.get_text().strip())
+            new_element = self._create_soup_element("\n\n" + "\n".join(items) + "\n\n")
+            ul.replace_with(*new_element)
+
+        # Process ordered lists
+        for ol in soup.find_all('ol'):
+            items = []
+            for i, li in enumerate(ol.find_all('li', recursive=False), 1):
+                items.append(str(i) + ". " + li.get_text().strip())
+            new_element = self._create_soup_element("\n\n" + "\n".join(items) + "\n\n")
+            ol.replace_with(*new_element)
+
+        # Process links (first pass)
+        for a in soup.find_all('a', href=True):
+            try:
+                href = a['href']
+                text = a.get_text().strip()
+                if text and href:
+                    # Convert relative URLs to absolute
+                    if href.startswith('/') and not href.startswith('//'):
+                        href = urljoin(base_url, href)
+                    link_md = "[" + text + "](" + href + ")"
+                    new_element = self._create_soup_element(link_md)
+                    a.replace_with(*new_element)
+            except (KeyError, AttributeError):
+                continue
+
+        # Process images
+        for img in soup.find_all('img', src=True):
+            try:
+                src = img['src']
+                alt = img.get('alt', 'Image').strip()
+                # Convert relative URLs to absolute
+                if src.startswith('/') and not src.startswith('//'):
+                    src = urljoin(base_url, src)
+                img_md = "![" + alt + "](" + src + ")"
+                new_element = self._create_soup_element(img_md)
+                img.replace_with(*new_element)
+            except (KeyError, AttributeError, UnboundLocalError):
+                continue
+
+        # Process code blocks
+        for pre in soup.find_all('pre'):
+            code = pre.get_text().strip()
+            pre_md = "\n\n```\n" + code + "\n```\n\n"
+            new_element = self._create_soup_element(pre_md)
+            pre.replace_with(*new_element)
+
+        # Process inline code
+        for code in soup.find_all('code'):
+            text = code.get_text().strip()
+            code_md = "`" + text + "`"
+            new_element = self._create_soup_element(code_md)
+            code.replace_with(*new_element)
+
+        # Process line breaks
+        for br in soup.find_all('br'):
+            new_element = self._create_soup_element('\n')
+            br.replace_with(*new_element)
+
+        # Get the full text
+        markdown_text = soup.get_text()
+
+        # Clean up extra whitespace and line breaks
+        markdown_text = re.sub(r'\n{3,}', '\n\n', markdown_text)
+        markdown_text = re.sub(r'\s{2,}', ' ', markdown_text)
+
+        # Process links again (for any that might have been missed)
+        link_pattern = r'\[([^\]]+)\]\(([^)]+)\)'
+        all_links = re.findall(link_pattern, markdown_text)
+
+        # Add a section with all links at the end
+        if all_links:
+            link_section = ["", "## Links", ""]
+            seen_links = set()
+            for text, href in all_links:
+                link_entry = "[" + text + "](" + href + ")"
+                if link_entry not in seen_links:
+                    link_section.append(link_entry)
+                    seen_links.add(link_entry)
+
+            markdown_text += "\n\n" + "\n".join(link_section)
+
+        return markdown_text.strip()
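
Because `_html_to_markdown` is pure BeautifulSoup plus regex, the new conversion path can be exercised without launching a browser. A minimal sketch against a made-up snippet, assuming `WebpageTool` takes no constructor arguments, as with the other tools in this release:

```python
from jarvis.jarvis_tools.read_webpage import WebpageTool

html = """
<html><head><title>Demo</title></head><body>
<h1>Example</h1>
<a href="/docs">docs</a>
<pre>pip install jarvis-ai-assistant</pre>
</body></html>
"""

md = WebpageTool()._html_to_markdown(html, base_url="https://example.com")
print(md)
# The relative link is rewritten against base_url and repeated in the trailing
# "## Links" section; note that the `\s{2,}` cleanup pass also collapses the
# blank lines around headings and code fences into single spaces.
```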