jarvis-ai-assistant 0.1.148 (py3-none-any.whl) → 0.1.150 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of jarvis-ai-assistant might be problematic. See the associated security advisory for more details.

Files changed (41)
  1. jarvis/__init__.py +1 -1
  2. jarvis/jarvis_agent/jarvis.py +7 -5
  3. jarvis/jarvis_agent/main.py +0 -1
  4. jarvis/jarvis_agent/patch.py +42 -7
  5. jarvis/jarvis_code_analysis/code_review.py +0 -1
  6. jarvis/jarvis_git_utils/git_commiter.py +0 -1
  7. jarvis/jarvis_lsp/base.py +1 -1
  8. jarvis/jarvis_lsp/cpp.py +1 -1
  9. jarvis/jarvis_lsp/go.py +1 -1
  10. jarvis/jarvis_lsp/python.py +1 -1
  11. jarvis/jarvis_lsp/registry.py +2 -1
  12. jarvis/jarvis_lsp/rust.py +1 -1
  13. jarvis/jarvis_mcp/__init__.py +36 -0
  14. jarvis/jarvis_mcp/local_mcp_client.py +241 -0
  15. jarvis/jarvis_mcp/remote_mcp_client.py +230 -0
  16. jarvis/jarvis_platform/base.py +1 -1
  17. jarvis/jarvis_platform/kimi.py +3 -3
  18. jarvis/jarvis_platform/registry.py +2 -4
  19. jarvis/jarvis_platform/yuanbao.py +4 -4
  20. jarvis/jarvis_tools/code_plan.py +0 -1
  21. jarvis/jarvis_tools/file_analyzer.py +1 -1
  22. jarvis/jarvis_tools/file_operation.py +7 -26
  23. jarvis/jarvis_tools/methodology.py +2 -1
  24. jarvis/jarvis_tools/read_code.py +0 -1
  25. jarvis/jarvis_tools/registry.py +108 -3
  26. jarvis/jarvis_tools/search_web.py +0 -1
  27. jarvis/jarvis_utils/config.py +14 -8
  28. jarvis/jarvis_utils/embedding.py +4 -8
  29. jarvis/jarvis_utils/file_processors.py +0 -262
  30. jarvis/jarvis_utils/git_utils.py +19 -8
  31. jarvis/jarvis_utils/input.py +7 -1
  32. jarvis/jarvis_utils/methodology.py +2 -2
  33. jarvis/jarvis_utils/output.py +0 -1
  34. jarvis/jarvis_utils/utils.py +23 -221
  35. {jarvis_ai_assistant-0.1.148.dist-info → jarvis_ai_assistant-0.1.150.dist-info}/METADATA +31 -38
  36. {jarvis_ai_assistant-0.1.148.dist-info → jarvis_ai_assistant-0.1.150.dist-info}/RECORD +40 -38
  37. jarvis/jarvis_platform_manager/openai_test.py +0 -138
  38. {jarvis_ai_assistant-0.1.148.dist-info → jarvis_ai_assistant-0.1.150.dist-info}/LICENSE +0 -0
  39. {jarvis_ai_assistant-0.1.148.dist-info → jarvis_ai_assistant-0.1.150.dist-info}/WHEEL +0 -0
  40. {jarvis_ai_assistant-0.1.148.dist-info → jarvis_ai_assistant-0.1.150.dist-info}/entry_points.txt +0 -0
  41. {jarvis_ai_assistant-0.1.148.dist-info → jarvis_ai_assistant-0.1.150.dist-info}/top_level.txt +0 -0
@@ -1,8 +1,3 @@
1
- from pathlib import Path
2
- import fitz # PyMuPDF for PDF files
3
- from docx import Document as DocxDocument # python-docx for DOCX files
4
- from pptx import Presentation
5
- import pandas as pd
6
1
  import unicodedata
7
2
 
8
3
  class FileProcessor:
@@ -84,260 +79,3 @@ class TextFileProcessor(FileProcessor):
84
79
 
85
80
  except Exception as e:
86
81
  raise Exception(f"Failed to read file: {str(e)}")
87
-
88
- class PDFProcessor(FileProcessor):
89
- """PDF file processor"""
90
- @staticmethod
91
- def can_handle(file_path: str) -> bool:
92
- return Path(file_path).suffix.lower() == '.pdf'
93
-
94
- @staticmethod
95
- def extract_text(file_path: str) -> str:
96
- """提取PDF文件中的所有文本内容,包括页码、图片描述等"""
97
- try:
98
- text_parts = []
99
- with fitz.open(file_path) as doc: # type: ignore
100
- # 添加文档信息
101
- info = doc.metadata
102
- if info:
103
- meta_text = []
104
- if info.get("title"):
105
- meta_text.append(f"标题: {info['title']}")
106
- if info.get("author"):
107
- meta_text.append(f"作者: {info['author']}")
108
- if info.get("subject"):
109
- meta_text.append(f"主题: {info['subject']}")
110
- if info.get("keywords"):
111
- meta_text.append(f"关键词: {info['keywords']}")
112
-
113
- if meta_text:
114
- text_parts.append("=== 文档信息 ===")
115
- text_parts.append("\n".join(meta_text))
116
-
117
- # 提取目录结构(如果有)
118
- toc = doc.get_toc() # type: ignore
119
- if toc:
120
- text_parts.append("\n=== 目录结构 ===")
121
- for level, title, page in toc:
122
- indent = " " * (level - 1)
123
- text_parts.append(f"{indent}- {title} (第{page}页)")
124
-
125
- # 处理各页内容
126
- text_parts.append("\n=== 页面内容 ===")
127
- for page_index in range(len(doc)): # 使用范围遍历而不是直接枚举文档对象
128
- # 添加页码标记
129
- text_parts.append(f"\n--- 第{page_index+1}页 ---")
130
-
131
- # 获取页面
132
- page = doc[page_index]
133
-
134
- # 提取页面文本(包括结构信息)
135
- try:
136
- # 尝试使用结构化提取(保留段落和块结构)
137
- text = page.get_text("text") # type: ignore
138
- text = text.strip()
139
- if text:
140
- text_parts.append(text)
141
- except Exception:
142
- # 如果结构化提取失败,回退到简单文本提取
143
- text = page.get_text() # type: ignore
144
- if text.strip():
145
- text_parts.append(text.strip())
146
-
147
- # 提取图像信息(如果需要)
148
- # 注意:这可能会增加处理时间,可根据需要启用
149
- """
150
- image_list = page.get_images()
151
- if image_list:
152
- text_parts.append(f"本页包含 {len(image_list)} 个图像")
153
- """
154
-
155
- # 合并所有文本
156
- return "\n".join(text_parts)
157
-
158
- except Exception as e:
159
- # 处理可能的异常
160
- return f"PDF处理错误: {str(e)}"
161
-
162
- class DocxProcessor(FileProcessor):
163
- """DOCX file processor"""
164
- @staticmethod
165
- def can_handle(file_path: str) -> bool:
166
- return Path(file_path).suffix.lower() == '.docx'
167
-
168
- @staticmethod
169
- def extract_text(file_path: str) -> str:
170
- """提取 DOCX 文件中的所有文本内容,包括段落、表格、页眉页脚等"""
171
- doc = DocxDocument(file_path)
172
- full_text = []
173
-
174
- # 提取段落文本
175
- for para in doc.paragraphs:
176
- if para.text.strip(): # 跳过空段落
177
- full_text.append(para.text)
178
-
179
- # 提取表格文本
180
- for table in doc.tables:
181
- for row in table.rows:
182
- row_texts = []
183
- for cell in row.cells:
184
- # 每个单元格可能包含多个段落
185
- cell_text = "\n".join([p.text for p in cell.paragraphs if p.text.strip()])
186
- if cell_text:
187
- row_texts.append(cell_text)
188
- if row_texts:
189
- full_text.append(" | ".join(row_texts))
190
-
191
- # 提取页眉页脚(如果有节)
192
- try:
193
- for section in doc.sections:
194
- # 提取页眉
195
- if section.header:
196
- header_text = "\n".join([p.text for p in section.header.paragraphs if p.text.strip()])
197
- if header_text:
198
- full_text.append(f"页眉: {header_text}")
199
-
200
- # 提取页脚
201
- if section.footer:
202
- footer_text = "\n".join([p.text for p in section.footer.paragraphs if p.text.strip()])
203
- if footer_text:
204
- full_text.append(f"页脚: {footer_text}")
205
- except:
206
- # 如果提取页眉页脚失败,忽略错误继续
207
- pass
208
-
209
- # 合并所有文本
210
- return "\n\n".join(full_text)
211
-
212
- class PPTProcessor(FileProcessor):
213
- """PPT file processor"""
214
- @staticmethod
215
- def can_handle(file_path: str) -> bool:
216
- return Path(file_path).suffix.lower() in ['.ppt', '.pptx']
217
-
218
- @staticmethod
219
- def extract_text(file_path: str) -> str:
220
- """提取PPT文件中的所有文本内容,包括标题、文本框、备注等"""
221
- prs = Presentation(file_path)
222
- all_text = []
223
-
224
- # 遍历所有幻灯片
225
- for slide_index, slide in enumerate(prs.slides, 1):
226
- slide_text = []
227
-
228
- # 添加幻灯片编号
229
- slide_text.append(f"=== 幻灯片 {slide_index} ===")
230
-
231
- # 提取幻灯片中所有形状的文本
232
- for shape in slide.shapes:
233
- # 提取带有文本的形状
234
- try:
235
- if hasattr(shape, "text_frame") and shape.text_frame: # type: ignore
236
- for paragraph in shape.text_frame.paragraphs: # type: ignore
237
- text = paragraph.text.strip()
238
- if text:
239
- slide_text.append(text)
240
- except AttributeError:
241
- pass
242
-
243
- # 提取表格内容
244
- try:
245
- if hasattr(shape, "table") and shape.table: # type: ignore
246
- for row in shape.table.rows: # type: ignore
247
- row_texts = []
248
- for cell in row.cells:
249
- if hasattr(cell, "text_frame") and cell.text_frame:
250
- cell_paragraphs = cell.text_frame.paragraphs # type: ignore
251
- cell_text = " ".join([p.text.strip() for p in cell_paragraphs if p.text.strip()])
252
- if cell_text:
253
- row_texts.append(cell_text)
254
- if row_texts:
255
- slide_text.append(" | ".join(row_texts))
256
- except AttributeError:
257
- pass
258
-
259
- # 提取幻灯片备注
260
- try:
261
- if hasattr(slide, "has_notes_slide") and slide.has_notes_slide:
262
- notes_slide = slide.notes_slide
263
- if notes_slide and hasattr(notes_slide, "notes_text_frame") and notes_slide.notes_text_frame:
264
- notes_text = notes_slide.notes_text_frame.text.strip() # type: ignore
265
- if notes_text:
266
- slide_text.append(f"备注: {notes_text}")
267
- except AttributeError:
268
- pass
269
-
270
- # 合并当前幻灯片的所有文本
271
- if len(slide_text) > 1: # 如果除了幻灯片编号外还有其他内容
272
- all_text.append("\n".join(slide_text))
273
-
274
- # 返回所有幻灯片的文本内容
275
- return "\n\n".join(all_text)
276
-
277
- class ExcelProcessor(FileProcessor):
278
- """Excel file processor"""
279
- @staticmethod
280
- def can_handle(file_path: str) -> bool:
281
- return Path(file_path).suffix.lower() in ['.xls', '.xlsx']
282
-
283
- @staticmethod
284
- def extract_text(file_path: str) -> str:
285
- """提取 Excel 文件中的所有文本内容,包括多个工作表及格式化内容"""
286
- try:
287
- # 读取所有工作表
288
- excel_file = pd.ExcelFile(file_path)
289
- sheets_text = []
290
-
291
- # 处理每个工作表
292
- for sheet_name in excel_file.sheet_names:
293
- # 读取当前工作表
294
- df = pd.read_excel(file_path, sheet_name=sheet_name)
295
-
296
- # 如果是空表格,跳过
297
- if df.empty:
298
- continue
299
-
300
- # 添加工作表标题
301
- sheet_text = [f"=== 工作表: {sheet_name} ==="]
302
-
303
- # 填充空单元格,避免NaN显示
304
- df = df.fillna("")
305
-
306
- # 提取表格头信息
307
- if not df.columns.empty:
308
- headers = [str(col) for col in df.columns]
309
- sheet_text.append("列标题: " + " | ".join(headers))
310
-
311
- # 尝试提取表格中可能的关键信息
312
- # 1. 表格内容概述
313
- row_count, col_count = df.shape
314
- sheet_text.append(f"表格大小: {row_count}行 x {col_count}列")
315
-
316
- # 2. 表格数据,使用更友好的格式
317
- try:
318
- # 转换数据框为字符串表示
319
- # 设置最大行数和列数,避免过大的表格
320
- max_rows = min(500, row_count) # 最多显示500行
321
- if row_count > max_rows:
322
- sheet_text.append(f"注意: 表格太大,仅显示前{max_rows}行")
323
-
324
- # 将DataFrame转换为字符串表格
325
- table_str = df.head(max_rows).to_string(index=True, max_rows=max_rows, max_cols=None)
326
- sheet_text.append(table_str)
327
-
328
- except Exception as e:
329
- sheet_text.append(f"表格数据提取错误: {str(e)}")
330
-
331
- # 合并当前工作表的文本
332
- sheets_text.append("\n".join(sheet_text))
333
-
334
- # 如果没有提取到任何内容,返回一个提示信息
335
- if not sheets_text:
336
- return "Excel文件为空或无法提取内容"
337
-
338
- # 合并所有工作表的文本
339
- return "\n\n".join(sheets_text)
340
-
341
- except Exception as e:
342
- # 处理可能的异常,返回错误信息
343
- return f"Excel文件处理错误: {str(e)}"
@@ -74,14 +74,16 @@ def get_commits_between(start_hash: str, end_hash: str) -> List[Tuple[str, str]]
74
74
  ['git', 'log', f'{start_hash}..{end_hash}', '--pretty=format:%H|%s'],
75
75
  stdout=subprocess.PIPE,
76
76
  stderr=subprocess.PIPE,
77
- text=True
77
+ text=False # 禁用自动文本解码
78
78
  )
79
79
  if result.returncode != 0:
80
- PrettyOutput.print(f"获取commit历史失败: {result.stderr}", OutputType.ERROR)
80
+ error_msg = result.stderr.decode('utf-8', errors='replace')
81
+ PrettyOutput.print(f"获取commit历史失败: {error_msg}", OutputType.ERROR)
81
82
  return []
82
83
 
84
+ output = result.stdout.decode('utf-8', errors='replace')
83
85
  commits = []
84
- for line in result.stdout.splitlines():
86
+ for line in output.splitlines():
85
87
  if '|' in line:
86
88
  commit_hash, message = line.split('|', 1)
87
89
  commits.append((commit_hash, message))
@@ -94,18 +96,27 @@ def get_latest_commit_hash() -> str:
94
96
  """获取当前Git仓库的最新提交哈希值
95
97
 
96
98
  返回:
97
- str: 提交哈希值,如果不在Git仓库或发生错误则返回空字符串
99
+ str: 提交哈希值,如果不在Git仓库、空仓库或发生错误则返回空字符串
98
100
  """
99
101
  try:
102
+ # 首先检查是否存在HEAD引用
103
+ head_check = subprocess.run(
104
+ ['git', 'rev-parse', '--verify', 'HEAD'],
105
+ stdout=subprocess.PIPE,
106
+ stderr=subprocess.PIPE,
107
+ text=False
108
+ )
109
+ if head_check.returncode != 0:
110
+ return "" # 空仓库或无效HEAD
111
+
112
+ # 获取HEAD的完整哈希值
100
113
  result = subprocess.run(
101
114
  ['git', 'rev-parse', 'HEAD'],
102
115
  stdout=subprocess.PIPE,
103
116
  stderr=subprocess.PIPE,
104
- text=True
117
+ text=False
105
118
  )
106
- if result.returncode == 0:
107
- return result.stdout.strip()
108
- return ""
119
+ return result.stdout.decode('utf-8', errors='replace').strip() if result.returncode == 0 else ""
109
120
  except Exception:
110
121
  return ""
111
122
  def get_modified_line_ranges() -> Dict[str, Tuple[int, int]]:
@@ -158,8 +158,14 @@ def get_multiline_input(tip: str) -> str:
158
158
  'prompt': 'ansicyan',
159
159
  })
160
160
  try:
161
+ from prompt_toolkit.history import FileHistory
162
+ from jarvis.jarvis_utils.config import get_data_dir
163
+ import os
164
+ # 获取数据目录路径
165
+ history_dir = get_data_dir()
166
+ # 初始化带历史记录的会话
161
167
  session = PromptSession(
162
- history=None,
168
+ history=FileHistory(os.path.join(history_dir, 'multiline_input_history')),
163
169
  completer=FileCompleter(),
164
170
  key_bindings=bindings,
165
171
  complete_while_typing=True,
@@ -11,7 +11,7 @@ import json
11
11
  import tempfile
12
12
  from typing import Dict, Optional
13
13
 
14
- from jarvis.jarvis_utils.config import INPUT_WINDOW_REVERSE_SIZE, get_max_input_token_count
14
+ from jarvis.jarvis_utils.config import INPUT_WINDOW_REVERSE_SIZE, get_max_input_token_count, get_data_dir
15
15
  from jarvis.jarvis_utils.embedding import get_context_token_count
16
16
  from jarvis.jarvis_utils.output import PrettyOutput, OutputType
17
17
  from jarvis.jarvis_platform.registry import PlatformRegistry
@@ -23,7 +23,7 @@ def _get_methodology_directory() -> str:
23
23
  返回:
24
24
  str: 方法论目录的路径
25
25
  """
26
- methodology_dir = os.path.expanduser("~/.jarvis/methodologies")
26
+ methodology_dir = os.path.join(get_data_dir(), "methodologies")
27
27
  if not os.path.exists(methodology_dir):
28
28
  try:
29
29
  os.makedirs(methodology_dir, exist_ok=True)
@@ -11,7 +11,6 @@ from enum import Enum
11
11
  from datetime import datetime
12
12
  from typing import Optional
13
13
  from rich.panel import Panel
14
- from rich.box import HEAVY
15
14
  from rich.text import Text
16
15
  from rich.syntax import Syntax
17
16
  from rich.style import Style as RichStyle
@@ -2,24 +2,23 @@ import os
2
2
  import time
3
3
  import hashlib
4
4
  from pathlib import Path
5
- from typing import Union, List, Dict, Any, Callable, cast
6
- from bs4 import BeautifulSoup, Tag
7
- from jarvis.jarvis_utils.config import get_max_input_token_count
5
+ from typing import List, Any, Callable
6
+ from jarvis.jarvis_utils.config import get_max_input_token_count, get_data_dir
8
7
  from jarvis.jarvis_utils.embedding import get_context_token_count
9
8
  from jarvis.jarvis_utils.input import get_single_line_input
10
9
  from jarvis.jarvis_utils.output import PrettyOutput, OutputType
11
10
  def init_env() -> None:
12
- """初始化环境变量从~/.jarvis/env文件
11
+ """初始化环境变量从jarvis_data/env文件
13
12
 
14
13
  功能:
15
- 1. 创建不存在的.jarvis目录
14
+ 1. 创建不存在的jarvis_data目录
16
15
  2. 加载环境变量到os.environ
17
16
  3. 处理文件读取异常
18
17
  """
19
- jarvis_dir = Path.home() / ".jarvis"
18
+ jarvis_dir = Path(get_data_dir())
20
19
  env_file = jarvis_dir / "env"
21
20
 
22
- # Check if ~/.jarvis directory exists
21
+ # 检查jarvis_data目录是否存在
23
22
  if not jarvis_dir.exists():
24
23
  jarvis_dir.mkdir(parents=True)
25
24
  if env_file.exists():
@@ -53,7 +52,7 @@ def while_success(func: Callable[[], Any], sleep_time: float = 0.1) -> Any:
53
52
  time.sleep(sleep_time)
54
53
  continue
55
54
  def while_true(func: Callable[[], bool], sleep_time: float = 0.1) -> Any:
56
- """Loop execution function, until the function returns True"""
55
+ """循环执行函数直到返回True"""
57
56
  while True:
58
57
  ret = func()
59
58
  if ret:
@@ -62,37 +61,37 @@ def while_true(func: Callable[[], bool], sleep_time: float = 0.1) -> Any:
62
61
  time.sleep(sleep_time)
63
62
  return ret
64
63
  def get_file_md5(filepath: str)->str:
65
- """Calculate the MD5 hash of a file's content.
64
+ """计算文件内容的MD5哈希值
66
65
 
67
- Args:
68
- filepath: Path to the file to hash
66
+ 参数:
67
+ filepath: 要计算哈希的文件路径
69
68
 
70
- Returns:
71
- str: MD5 hash of the file's content
69
+ 返回:
70
+ str: 文件内容的MD5哈希值
72
71
  """
73
72
  return hashlib.md5(open(filepath, "rb").read(100*1024*1024)).hexdigest()
74
73
  def user_confirm(tip: str, default: bool = True) -> bool:
75
- """Prompt the user for confirmation with a yes/no question.
74
+ """提示用户确认是/否问题
76
75
 
77
- Args:
78
- tip: The message to show to the user
79
- default: The default response if user hits enter
76
+ 参数:
77
+ tip: 显示给用户的消息
78
+ default: 用户直接回车时的默认响应
80
79
 
81
- Returns:
82
- bool: True if user confirmed, False otherwise
80
+ 返回:
81
+ bool: 用户确认返回True,否则返回False
83
82
  """
84
83
  suffix = "[Y/n]" if default else "[y/N]"
85
84
  ret = get_single_line_input(f"{tip} {suffix}: ")
86
85
  return default if ret == "" else ret.lower() == "y"
87
86
 
88
87
  def get_file_line_count(filename: str) -> int:
89
- """Count the number of lines in a file.
88
+ """计算文件中的行数
90
89
 
91
- Args:
92
- filename: Path to the file to count lines for
90
+ 参数:
91
+ filename: 要计算行数的文件路径
93
92
 
94
- Returns:
95
- int: Number of lines in the file, 0 if file cannot be read
93
+ 返回:
94
+ int: 文件中的行数,如果文件无法读取则返回0
96
95
  """
97
96
  try:
98
97
  return len(open(filename, "r", encoding="utf-8", errors="ignore").readlines())
@@ -153,200 +152,3 @@ def ct(tag_name: str) -> str:
153
152
  """
154
153
  return f"</{tag_name}>"
155
154
 
156
-
157
- def create_soup_element(content: Union[str, Tag, List[Any]]) -> List[Union[Tag, str]]:
158
- """Safely create a BeautifulSoup element, ensuring it's treated as markup
159
-
160
- Args:
161
- content: Input content to convert to BeautifulSoup elements
162
- Returns:
163
- List of BeautifulSoup elements or strings
164
- """
165
- if isinstance(content, str):
166
- # Create a wrapper tag to ensure proper parsing
167
- soup_div = BeautifulSoup(f"<div>{content}</div>", 'html.parser').div
168
- if soup_div is not None:
169
- return [cast(Union[Tag, str], el) for el in soup_div.contents]
170
- return []
171
- elif isinstance(content, list):
172
- return content
173
- return [content]
174
-
175
- def extract_interactive_elements(html_content: str) -> List[Dict[str, Any]]:
176
- """Extract all interactive elements from HTML content with their properties.
177
-
178
- Args:
179
- html_content: HTML content to parse
180
-
181
- Returns:
182
- List of dictionaries containing element properties:
183
- - xpath: XPath of the element
184
- - tag: HTML tag name
185
- - text: Text content
186
- - is_clickable: Whether element is clickable
187
- - is_input: Whether element is an input field
188
- - is_select: Whether element is a select dropdown
189
- """
190
- soup = BeautifulSoup(html_content, 'html.parser')
191
- interactive_elements = []
192
-
193
- # Define interactive tags
194
- clickable_tags = {'a', 'button', 'input', 'select', 'textarea'}
195
- input_tags = {'input', 'textarea', 'select'}
196
-
197
- def get_xpath(element: Tag) -> str:
198
- """Generate XPath for an element"""
199
- components = []
200
- current = element
201
-
202
- while current and current.name:
203
- siblings = current.find_previous_siblings(current.name)
204
- index = len(siblings) + 1
205
- components.append(f"{current.name}[{index}]")
206
- current = current.parent
207
-
208
- return "/".join(reversed(components))
209
-
210
- def process_element(element: Tag) -> None:
211
- """Process a single element and add it to interactive_elements if it's interactive"""
212
- tag_name = element.name.lower()
213
-
214
- # Skip non-interactive elements
215
- if tag_name not in clickable_tags and not element.find_parent(clickable_tags):
216
- return
217
-
218
- # Get element properties
219
- element_info = {
220
- 'xpath': get_xpath(element),
221
- 'tag': tag_name,
222
- 'text': element.get_text().strip(),
223
- 'is_clickable': tag_name in clickable_tags or bool(element.find_parent('a')) or bool(element.find_parent('button')),
224
- 'is_input': tag_name in input_tags,
225
- 'is_select': tag_name == 'select'
226
- }
227
-
228
- # Add additional properties for input elements
229
- if element_info['is_input']:
230
- element_info['input_type'] = element.get('type', 'text')
231
- element_info['name'] = element.get('name', '')
232
- element_info['value'] = element.get('value', '')
233
-
234
- # Add options for select elements
235
- if element_info['is_select']:
236
- element_info['options'] = [
237
- {'value': opt.get('value', ''), 'text': opt.get_text().strip()}
238
- for opt in element.find_all('option')
239
- if isinstance(opt, Tag)
240
- ]
241
-
242
- interactive_elements.append(element_info)
243
-
244
- # Process all elements
245
- for element in soup.find_all():
246
- if isinstance(element, Tag):
247
- process_element(element)
248
-
249
- return interactive_elements
250
-
251
- def extract_display_elements(html_content: str) -> List[Dict[str, Any]]:
252
- """Extract all display elements from HTML content with their properties.
253
-
254
- Args:
255
- html_content: HTML content to parse
256
-
257
- Returns:
258
- List of dictionaries containing element properties:
259
- - xpath: XPath of the element
260
- - tag: HTML tag name
261
- - text: Text content
262
- - heading_level: Heading level (1-6) if the element is a heading
263
- - is_list: Whether the element is a list
264
- - is_list_item: Whether the element is a list item
265
- - is_table: Whether the element is a table
266
- - is_table_cell: Whether the element is a table cell
267
- """
268
- soup = BeautifulSoup(html_content, 'html.parser')
269
- display_elements = []
270
-
271
- # Define display tags
272
- display_tags = {
273
- 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', # Headings
274
- 'p', 'div', 'span', # Text containers
275
- 'ul', 'ol', 'li', # Lists
276
- 'table', 'tr', 'td', 'th', # Tables
277
- 'article', 'section', 'main', # Content sections
278
- 'header', 'footer', 'nav', # Layout sections
279
- 'aside', 'figure', 'figcaption' # Side content
280
- }
281
-
282
- # Define interactive tags to exclude
283
- interactive_tags = {'a', 'button', 'input', 'select', 'textarea', 'form'}
284
-
285
- def get_xpath(element: Tag) -> str:
286
- """Generate XPath for an element"""
287
- components = []
288
- current = element
289
-
290
- while current and current.name:
291
- siblings = current.find_previous_siblings(current.name)
292
- index = len(siblings) + 1
293
- components.append(f"{current.name}[{index}]")
294
- current = current.parent
295
-
296
- return "/".join(reversed(components))
297
-
298
- def process_element(element: Tag) -> None:
299
- """Process a single element and add it to display_elements if it's a display element"""
300
- tag_name = element.name.lower()
301
-
302
- # Skip non-display elements and interactive elements
303
- if tag_name not in display_tags or element.find_parent(interactive_tags):
304
- return
305
-
306
- # Get text content
307
- text = element.get_text().strip()
308
- if not text: # Skip empty elements
309
- return
310
-
311
- # Get element properties
312
- element_info = {
313
- 'xpath': get_xpath(element),
314
- 'tag': tag_name,
315
- 'text': text,
316
- 'heading_level': int(tag_name[1]) if tag_name.startswith('h') and len(tag_name) == 2 else None,
317
- 'is_list': tag_name in {'ul', 'ol'},
318
- 'is_list_item': tag_name == 'li',
319
- 'is_table': tag_name == 'table',
320
- 'is_table_cell': tag_name in {'td', 'th'}
321
- }
322
-
323
- # Add list-specific properties
324
- if element_info['is_list']:
325
- element_info['list_items'] = [
326
- {'text': li.get_text().strip()}
327
- for li in element.find_all('li')
328
- if isinstance(li, Tag)
329
- ]
330
-
331
- # Add table-specific properties
332
- if element_info['is_table']:
333
- element_info['table_rows'] = [
334
- {
335
- 'cells': [
336
- {'text': cell.get_text().strip(), 'is_header': cell.name == 'th'}
337
- for cell in row.find_all(['td', 'th'])
338
- if isinstance(cell, Tag)
339
- ]
340
- }
341
- for row in element.find_all('tr')
342
- if isinstance(row, Tag)
343
- ]
344
-
345
- display_elements.append(element_info)
346
-
347
- # Process all elements
348
- for element in soup.find_all():
349
- if isinstance(element, Tag):
350
- process_element(element)
351
-
352
- return display_elements