jarvis-ai-assistant 0.1.148__py3-none-any.whl → 0.1.150__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of jarvis-ai-assistant might be problematic. Click here for more details.
- jarvis/__init__.py +1 -1
- jarvis/jarvis_agent/jarvis.py +7 -5
- jarvis/jarvis_agent/main.py +0 -1
- jarvis/jarvis_agent/patch.py +42 -7
- jarvis/jarvis_code_analysis/code_review.py +0 -1
- jarvis/jarvis_git_utils/git_commiter.py +0 -1
- jarvis/jarvis_lsp/base.py +1 -1
- jarvis/jarvis_lsp/cpp.py +1 -1
- jarvis/jarvis_lsp/go.py +1 -1
- jarvis/jarvis_lsp/python.py +1 -1
- jarvis/jarvis_lsp/registry.py +2 -1
- jarvis/jarvis_lsp/rust.py +1 -1
- jarvis/jarvis_mcp/__init__.py +36 -0
- jarvis/jarvis_mcp/local_mcp_client.py +241 -0
- jarvis/jarvis_mcp/remote_mcp_client.py +230 -0
- jarvis/jarvis_platform/base.py +1 -1
- jarvis/jarvis_platform/kimi.py +3 -3
- jarvis/jarvis_platform/registry.py +2 -4
- jarvis/jarvis_platform/yuanbao.py +4 -4
- jarvis/jarvis_tools/code_plan.py +0 -1
- jarvis/jarvis_tools/file_analyzer.py +1 -1
- jarvis/jarvis_tools/file_operation.py +7 -26
- jarvis/jarvis_tools/methodology.py +2 -1
- jarvis/jarvis_tools/read_code.py +0 -1
- jarvis/jarvis_tools/registry.py +108 -3
- jarvis/jarvis_tools/search_web.py +0 -1
- jarvis/jarvis_utils/config.py +14 -8
- jarvis/jarvis_utils/embedding.py +4 -8
- jarvis/jarvis_utils/file_processors.py +0 -262
- jarvis/jarvis_utils/git_utils.py +19 -8
- jarvis/jarvis_utils/input.py +7 -1
- jarvis/jarvis_utils/methodology.py +2 -2
- jarvis/jarvis_utils/output.py +0 -1
- jarvis/jarvis_utils/utils.py +23 -221
- {jarvis_ai_assistant-0.1.148.dist-info → jarvis_ai_assistant-0.1.150.dist-info}/METADATA +31 -38
- {jarvis_ai_assistant-0.1.148.dist-info → jarvis_ai_assistant-0.1.150.dist-info}/RECORD +40 -38
- jarvis/jarvis_platform_manager/openai_test.py +0 -138
- {jarvis_ai_assistant-0.1.148.dist-info → jarvis_ai_assistant-0.1.150.dist-info}/LICENSE +0 -0
- {jarvis_ai_assistant-0.1.148.dist-info → jarvis_ai_assistant-0.1.150.dist-info}/WHEEL +0 -0
- {jarvis_ai_assistant-0.1.148.dist-info → jarvis_ai_assistant-0.1.150.dist-info}/entry_points.txt +0 -0
- {jarvis_ai_assistant-0.1.148.dist-info → jarvis_ai_assistant-0.1.150.dist-info}/top_level.txt +0 -0
|
@@ -1,8 +1,3 @@
|
|
|
1
|
-
from pathlib import Path
|
|
2
|
-
import fitz # PyMuPDF for PDF files
|
|
3
|
-
from docx import Document as DocxDocument # python-docx for DOCX files
|
|
4
|
-
from pptx import Presentation
|
|
5
|
-
import pandas as pd
|
|
6
1
|
import unicodedata
|
|
7
2
|
|
|
8
3
|
class FileProcessor:
|
|
@@ -84,260 +79,3 @@ class TextFileProcessor(FileProcessor):
|
|
|
84
79
|
|
|
85
80
|
except Exception as e:
|
|
86
81
|
raise Exception(f"Failed to read file: {str(e)}")
|
|
87
|
-
|
|
88
|
-
class PDFProcessor(FileProcessor):
|
|
89
|
-
"""PDF file processor"""
|
|
90
|
-
@staticmethod
|
|
91
|
-
def can_handle(file_path: str) -> bool:
|
|
92
|
-
return Path(file_path).suffix.lower() == '.pdf'
|
|
93
|
-
|
|
94
|
-
@staticmethod
|
|
95
|
-
def extract_text(file_path: str) -> str:
|
|
96
|
-
"""提取PDF文件中的所有文本内容,包括页码、图片描述等"""
|
|
97
|
-
try:
|
|
98
|
-
text_parts = []
|
|
99
|
-
with fitz.open(file_path) as doc: # type: ignore
|
|
100
|
-
# 添加文档信息
|
|
101
|
-
info = doc.metadata
|
|
102
|
-
if info:
|
|
103
|
-
meta_text = []
|
|
104
|
-
if info.get("title"):
|
|
105
|
-
meta_text.append(f"标题: {info['title']}")
|
|
106
|
-
if info.get("author"):
|
|
107
|
-
meta_text.append(f"作者: {info['author']}")
|
|
108
|
-
if info.get("subject"):
|
|
109
|
-
meta_text.append(f"主题: {info['subject']}")
|
|
110
|
-
if info.get("keywords"):
|
|
111
|
-
meta_text.append(f"关键词: {info['keywords']}")
|
|
112
|
-
|
|
113
|
-
if meta_text:
|
|
114
|
-
text_parts.append("=== 文档信息 ===")
|
|
115
|
-
text_parts.append("\n".join(meta_text))
|
|
116
|
-
|
|
117
|
-
# 提取目录结构(如果有)
|
|
118
|
-
toc = doc.get_toc() # type: ignore
|
|
119
|
-
if toc:
|
|
120
|
-
text_parts.append("\n=== 目录结构 ===")
|
|
121
|
-
for level, title, page in toc:
|
|
122
|
-
indent = " " * (level - 1)
|
|
123
|
-
text_parts.append(f"{indent}- {title} (第{page}页)")
|
|
124
|
-
|
|
125
|
-
# 处理各页内容
|
|
126
|
-
text_parts.append("\n=== 页面内容 ===")
|
|
127
|
-
for page_index in range(len(doc)): # 使用范围遍历而不是直接枚举文档对象
|
|
128
|
-
# 添加页码标记
|
|
129
|
-
text_parts.append(f"\n--- 第{page_index+1}页 ---")
|
|
130
|
-
|
|
131
|
-
# 获取页面
|
|
132
|
-
page = doc[page_index]
|
|
133
|
-
|
|
134
|
-
# 提取页面文本(包括结构信息)
|
|
135
|
-
try:
|
|
136
|
-
# 尝试使用结构化提取(保留段落和块结构)
|
|
137
|
-
text = page.get_text("text") # type: ignore
|
|
138
|
-
text = text.strip()
|
|
139
|
-
if text:
|
|
140
|
-
text_parts.append(text)
|
|
141
|
-
except Exception:
|
|
142
|
-
# 如果结构化提取失败,回退到简单文本提取
|
|
143
|
-
text = page.get_text() # type: ignore
|
|
144
|
-
if text.strip():
|
|
145
|
-
text_parts.append(text.strip())
|
|
146
|
-
|
|
147
|
-
# 提取图像信息(如果需要)
|
|
148
|
-
# 注意:这可能会增加处理时间,可根据需要启用
|
|
149
|
-
"""
|
|
150
|
-
image_list = page.get_images()
|
|
151
|
-
if image_list:
|
|
152
|
-
text_parts.append(f"本页包含 {len(image_list)} 个图像")
|
|
153
|
-
"""
|
|
154
|
-
|
|
155
|
-
# 合并所有文本
|
|
156
|
-
return "\n".join(text_parts)
|
|
157
|
-
|
|
158
|
-
except Exception as e:
|
|
159
|
-
# 处理可能的异常
|
|
160
|
-
return f"PDF处理错误: {str(e)}"
|
|
161
|
-
|
|
162
|
-
class DocxProcessor(FileProcessor):
|
|
163
|
-
"""DOCX file processor"""
|
|
164
|
-
@staticmethod
|
|
165
|
-
def can_handle(file_path: str) -> bool:
|
|
166
|
-
return Path(file_path).suffix.lower() == '.docx'
|
|
167
|
-
|
|
168
|
-
@staticmethod
|
|
169
|
-
def extract_text(file_path: str) -> str:
|
|
170
|
-
"""提取 DOCX 文件中的所有文本内容,包括段落、表格、页眉页脚等"""
|
|
171
|
-
doc = DocxDocument(file_path)
|
|
172
|
-
full_text = []
|
|
173
|
-
|
|
174
|
-
# 提取段落文本
|
|
175
|
-
for para in doc.paragraphs:
|
|
176
|
-
if para.text.strip(): # 跳过空段落
|
|
177
|
-
full_text.append(para.text)
|
|
178
|
-
|
|
179
|
-
# 提取表格文本
|
|
180
|
-
for table in doc.tables:
|
|
181
|
-
for row in table.rows:
|
|
182
|
-
row_texts = []
|
|
183
|
-
for cell in row.cells:
|
|
184
|
-
# 每个单元格可能包含多个段落
|
|
185
|
-
cell_text = "\n".join([p.text for p in cell.paragraphs if p.text.strip()])
|
|
186
|
-
if cell_text:
|
|
187
|
-
row_texts.append(cell_text)
|
|
188
|
-
if row_texts:
|
|
189
|
-
full_text.append(" | ".join(row_texts))
|
|
190
|
-
|
|
191
|
-
# 提取页眉页脚(如果有节)
|
|
192
|
-
try:
|
|
193
|
-
for section in doc.sections:
|
|
194
|
-
# 提取页眉
|
|
195
|
-
if section.header:
|
|
196
|
-
header_text = "\n".join([p.text for p in section.header.paragraphs if p.text.strip()])
|
|
197
|
-
if header_text:
|
|
198
|
-
full_text.append(f"页眉: {header_text}")
|
|
199
|
-
|
|
200
|
-
# 提取页脚
|
|
201
|
-
if section.footer:
|
|
202
|
-
footer_text = "\n".join([p.text for p in section.footer.paragraphs if p.text.strip()])
|
|
203
|
-
if footer_text:
|
|
204
|
-
full_text.append(f"页脚: {footer_text}")
|
|
205
|
-
except:
|
|
206
|
-
# 如果提取页眉页脚失败,忽略错误继续
|
|
207
|
-
pass
|
|
208
|
-
|
|
209
|
-
# 合并所有文本
|
|
210
|
-
return "\n\n".join(full_text)
|
|
211
|
-
|
|
212
|
-
class PPTProcessor(FileProcessor):
|
|
213
|
-
"""PPT file processor"""
|
|
214
|
-
@staticmethod
|
|
215
|
-
def can_handle(file_path: str) -> bool:
|
|
216
|
-
return Path(file_path).suffix.lower() in ['.ppt', '.pptx']
|
|
217
|
-
|
|
218
|
-
@staticmethod
|
|
219
|
-
def extract_text(file_path: str) -> str:
|
|
220
|
-
"""提取PPT文件中的所有文本内容,包括标题、文本框、备注等"""
|
|
221
|
-
prs = Presentation(file_path)
|
|
222
|
-
all_text = []
|
|
223
|
-
|
|
224
|
-
# 遍历所有幻灯片
|
|
225
|
-
for slide_index, slide in enumerate(prs.slides, 1):
|
|
226
|
-
slide_text = []
|
|
227
|
-
|
|
228
|
-
# 添加幻灯片编号
|
|
229
|
-
slide_text.append(f"=== 幻灯片 {slide_index} ===")
|
|
230
|
-
|
|
231
|
-
# 提取幻灯片中所有形状的文本
|
|
232
|
-
for shape in slide.shapes:
|
|
233
|
-
# 提取带有文本的形状
|
|
234
|
-
try:
|
|
235
|
-
if hasattr(shape, "text_frame") and shape.text_frame: # type: ignore
|
|
236
|
-
for paragraph in shape.text_frame.paragraphs: # type: ignore
|
|
237
|
-
text = paragraph.text.strip()
|
|
238
|
-
if text:
|
|
239
|
-
slide_text.append(text)
|
|
240
|
-
except AttributeError:
|
|
241
|
-
pass
|
|
242
|
-
|
|
243
|
-
# 提取表格内容
|
|
244
|
-
try:
|
|
245
|
-
if hasattr(shape, "table") and shape.table: # type: ignore
|
|
246
|
-
for row in shape.table.rows: # type: ignore
|
|
247
|
-
row_texts = []
|
|
248
|
-
for cell in row.cells:
|
|
249
|
-
if hasattr(cell, "text_frame") and cell.text_frame:
|
|
250
|
-
cell_paragraphs = cell.text_frame.paragraphs # type: ignore
|
|
251
|
-
cell_text = " ".join([p.text.strip() for p in cell_paragraphs if p.text.strip()])
|
|
252
|
-
if cell_text:
|
|
253
|
-
row_texts.append(cell_text)
|
|
254
|
-
if row_texts:
|
|
255
|
-
slide_text.append(" | ".join(row_texts))
|
|
256
|
-
except AttributeError:
|
|
257
|
-
pass
|
|
258
|
-
|
|
259
|
-
# 提取幻灯片备注
|
|
260
|
-
try:
|
|
261
|
-
if hasattr(slide, "has_notes_slide") and slide.has_notes_slide:
|
|
262
|
-
notes_slide = slide.notes_slide
|
|
263
|
-
if notes_slide and hasattr(notes_slide, "notes_text_frame") and notes_slide.notes_text_frame:
|
|
264
|
-
notes_text = notes_slide.notes_text_frame.text.strip() # type: ignore
|
|
265
|
-
if notes_text:
|
|
266
|
-
slide_text.append(f"备注: {notes_text}")
|
|
267
|
-
except AttributeError:
|
|
268
|
-
pass
|
|
269
|
-
|
|
270
|
-
# 合并当前幻灯片的所有文本
|
|
271
|
-
if len(slide_text) > 1: # 如果除了幻灯片编号外还有其他内容
|
|
272
|
-
all_text.append("\n".join(slide_text))
|
|
273
|
-
|
|
274
|
-
# 返回所有幻灯片的文本内容
|
|
275
|
-
return "\n\n".join(all_text)
|
|
276
|
-
|
|
277
|
-
class ExcelProcessor(FileProcessor):
|
|
278
|
-
"""Excel file processor"""
|
|
279
|
-
@staticmethod
|
|
280
|
-
def can_handle(file_path: str) -> bool:
|
|
281
|
-
return Path(file_path).suffix.lower() in ['.xls', '.xlsx']
|
|
282
|
-
|
|
283
|
-
@staticmethod
|
|
284
|
-
def extract_text(file_path: str) -> str:
|
|
285
|
-
"""提取 Excel 文件中的所有文本内容,包括多个工作表及格式化内容"""
|
|
286
|
-
try:
|
|
287
|
-
# 读取所有工作表
|
|
288
|
-
excel_file = pd.ExcelFile(file_path)
|
|
289
|
-
sheets_text = []
|
|
290
|
-
|
|
291
|
-
# 处理每个工作表
|
|
292
|
-
for sheet_name in excel_file.sheet_names:
|
|
293
|
-
# 读取当前工作表
|
|
294
|
-
df = pd.read_excel(file_path, sheet_name=sheet_name)
|
|
295
|
-
|
|
296
|
-
# 如果是空表格,跳过
|
|
297
|
-
if df.empty:
|
|
298
|
-
continue
|
|
299
|
-
|
|
300
|
-
# 添加工作表标题
|
|
301
|
-
sheet_text = [f"=== 工作表: {sheet_name} ==="]
|
|
302
|
-
|
|
303
|
-
# 填充空单元格,避免NaN显示
|
|
304
|
-
df = df.fillna("")
|
|
305
|
-
|
|
306
|
-
# 提取表格头信息
|
|
307
|
-
if not df.columns.empty:
|
|
308
|
-
headers = [str(col) for col in df.columns]
|
|
309
|
-
sheet_text.append("列标题: " + " | ".join(headers))
|
|
310
|
-
|
|
311
|
-
# 尝试提取表格中可能的关键信息
|
|
312
|
-
# 1. 表格内容概述
|
|
313
|
-
row_count, col_count = df.shape
|
|
314
|
-
sheet_text.append(f"表格大小: {row_count}行 x {col_count}列")
|
|
315
|
-
|
|
316
|
-
# 2. 表格数据,使用更友好的格式
|
|
317
|
-
try:
|
|
318
|
-
# 转换数据框为字符串表示
|
|
319
|
-
# 设置最大行数和列数,避免过大的表格
|
|
320
|
-
max_rows = min(500, row_count) # 最多显示500行
|
|
321
|
-
if row_count > max_rows:
|
|
322
|
-
sheet_text.append(f"注意: 表格太大,仅显示前{max_rows}行")
|
|
323
|
-
|
|
324
|
-
# 将DataFrame转换为字符串表格
|
|
325
|
-
table_str = df.head(max_rows).to_string(index=True, max_rows=max_rows, max_cols=None)
|
|
326
|
-
sheet_text.append(table_str)
|
|
327
|
-
|
|
328
|
-
except Exception as e:
|
|
329
|
-
sheet_text.append(f"表格数据提取错误: {str(e)}")
|
|
330
|
-
|
|
331
|
-
# 合并当前工作表的文本
|
|
332
|
-
sheets_text.append("\n".join(sheet_text))
|
|
333
|
-
|
|
334
|
-
# 如果没有提取到任何内容,返回一个提示信息
|
|
335
|
-
if not sheets_text:
|
|
336
|
-
return "Excel文件为空或无法提取内容"
|
|
337
|
-
|
|
338
|
-
# 合并所有工作表的文本
|
|
339
|
-
return "\n\n".join(sheets_text)
|
|
340
|
-
|
|
341
|
-
except Exception as e:
|
|
342
|
-
# 处理可能的异常,返回错误信息
|
|
343
|
-
return f"Excel文件处理错误: {str(e)}"
|
jarvis/jarvis_utils/git_utils.py
CHANGED
|
@@ -74,14 +74,16 @@ def get_commits_between(start_hash: str, end_hash: str) -> List[Tuple[str, str]]
|
|
|
74
74
|
['git', 'log', f'{start_hash}..{end_hash}', '--pretty=format:%H|%s'],
|
|
75
75
|
stdout=subprocess.PIPE,
|
|
76
76
|
stderr=subprocess.PIPE,
|
|
77
|
-
text=
|
|
77
|
+
text=False # 禁用自动文本解码
|
|
78
78
|
)
|
|
79
79
|
if result.returncode != 0:
|
|
80
|
-
|
|
80
|
+
error_msg = result.stderr.decode('utf-8', errors='replace')
|
|
81
|
+
PrettyOutput.print(f"获取commit历史失败: {error_msg}", OutputType.ERROR)
|
|
81
82
|
return []
|
|
82
83
|
|
|
84
|
+
output = result.stdout.decode('utf-8', errors='replace')
|
|
83
85
|
commits = []
|
|
84
|
-
for line in
|
|
86
|
+
for line in output.splitlines():
|
|
85
87
|
if '|' in line:
|
|
86
88
|
commit_hash, message = line.split('|', 1)
|
|
87
89
|
commits.append((commit_hash, message))
|
|
@@ -94,18 +96,27 @@ def get_latest_commit_hash() -> str:
|
|
|
94
96
|
"""获取当前Git仓库的最新提交哈希值
|
|
95
97
|
|
|
96
98
|
返回:
|
|
97
|
-
str: 提交哈希值,如果不在Git
|
|
99
|
+
str: 提交哈希值,如果不在Git仓库、空仓库或发生错误则返回空字符串
|
|
98
100
|
"""
|
|
99
101
|
try:
|
|
102
|
+
# 首先检查是否存在HEAD引用
|
|
103
|
+
head_check = subprocess.run(
|
|
104
|
+
['git', 'rev-parse', '--verify', 'HEAD'],
|
|
105
|
+
stdout=subprocess.PIPE,
|
|
106
|
+
stderr=subprocess.PIPE,
|
|
107
|
+
text=False
|
|
108
|
+
)
|
|
109
|
+
if head_check.returncode != 0:
|
|
110
|
+
return "" # 空仓库或无效HEAD
|
|
111
|
+
|
|
112
|
+
# 获取HEAD的完整哈希值
|
|
100
113
|
result = subprocess.run(
|
|
101
114
|
['git', 'rev-parse', 'HEAD'],
|
|
102
115
|
stdout=subprocess.PIPE,
|
|
103
116
|
stderr=subprocess.PIPE,
|
|
104
|
-
text=
|
|
117
|
+
text=False
|
|
105
118
|
)
|
|
106
|
-
if result.returncode == 0
|
|
107
|
-
return result.stdout.strip()
|
|
108
|
-
return ""
|
|
119
|
+
return result.stdout.decode('utf-8', errors='replace').strip() if result.returncode == 0 else ""
|
|
109
120
|
except Exception:
|
|
110
121
|
return ""
|
|
111
122
|
def get_modified_line_ranges() -> Dict[str, Tuple[int, int]]:
|
jarvis/jarvis_utils/input.py
CHANGED
|
@@ -158,8 +158,14 @@ def get_multiline_input(tip: str) -> str:
|
|
|
158
158
|
'prompt': 'ansicyan',
|
|
159
159
|
})
|
|
160
160
|
try:
|
|
161
|
+
from prompt_toolkit.history import FileHistory
|
|
162
|
+
from jarvis.jarvis_utils.config import get_data_dir
|
|
163
|
+
import os
|
|
164
|
+
# 获取数据目录路径
|
|
165
|
+
history_dir = get_data_dir()
|
|
166
|
+
# 初始化带历史记录的会话
|
|
161
167
|
session = PromptSession(
|
|
162
|
-
history=
|
|
168
|
+
history=FileHistory(os.path.join(history_dir, 'multiline_input_history')),
|
|
163
169
|
completer=FileCompleter(),
|
|
164
170
|
key_bindings=bindings,
|
|
165
171
|
complete_while_typing=True,
|
|
@@ -11,7 +11,7 @@ import json
|
|
|
11
11
|
import tempfile
|
|
12
12
|
from typing import Dict, Optional
|
|
13
13
|
|
|
14
|
-
from jarvis.jarvis_utils.config import INPUT_WINDOW_REVERSE_SIZE, get_max_input_token_count
|
|
14
|
+
from jarvis.jarvis_utils.config import INPUT_WINDOW_REVERSE_SIZE, get_max_input_token_count, get_data_dir
|
|
15
15
|
from jarvis.jarvis_utils.embedding import get_context_token_count
|
|
16
16
|
from jarvis.jarvis_utils.output import PrettyOutput, OutputType
|
|
17
17
|
from jarvis.jarvis_platform.registry import PlatformRegistry
|
|
@@ -23,7 +23,7 @@ def _get_methodology_directory() -> str:
|
|
|
23
23
|
返回:
|
|
24
24
|
str: 方法论目录的路径
|
|
25
25
|
"""
|
|
26
|
-
methodology_dir = os.path.
|
|
26
|
+
methodology_dir = os.path.join(get_data_dir(), "methodologies")
|
|
27
27
|
if not os.path.exists(methodology_dir):
|
|
28
28
|
try:
|
|
29
29
|
os.makedirs(methodology_dir, exist_ok=True)
|
jarvis/jarvis_utils/output.py
CHANGED
jarvis/jarvis_utils/utils.py
CHANGED
|
@@ -2,24 +2,23 @@ import os
|
|
|
2
2
|
import time
|
|
3
3
|
import hashlib
|
|
4
4
|
from pathlib import Path
|
|
5
|
-
from typing import
|
|
6
|
-
from
|
|
7
|
-
from jarvis.jarvis_utils.config import get_max_input_token_count
|
|
5
|
+
from typing import List, Any, Callable
|
|
6
|
+
from jarvis.jarvis_utils.config import get_max_input_token_count, get_data_dir
|
|
8
7
|
from jarvis.jarvis_utils.embedding import get_context_token_count
|
|
9
8
|
from jarvis.jarvis_utils.input import get_single_line_input
|
|
10
9
|
from jarvis.jarvis_utils.output import PrettyOutput, OutputType
|
|
11
10
|
def init_env() -> None:
|
|
12
|
-
"""
|
|
11
|
+
"""初始化环境变量从jarvis_data/env文件
|
|
13
12
|
|
|
14
13
|
功能:
|
|
15
|
-
1.
|
|
14
|
+
1. 创建不存在的jarvis_data目录
|
|
16
15
|
2. 加载环境变量到os.environ
|
|
17
16
|
3. 处理文件读取异常
|
|
18
17
|
"""
|
|
19
|
-
jarvis_dir = Path
|
|
18
|
+
jarvis_dir = Path(get_data_dir())
|
|
20
19
|
env_file = jarvis_dir / "env"
|
|
21
20
|
|
|
22
|
-
#
|
|
21
|
+
# 检查jarvis_data目录是否存在
|
|
23
22
|
if not jarvis_dir.exists():
|
|
24
23
|
jarvis_dir.mkdir(parents=True)
|
|
25
24
|
if env_file.exists():
|
|
@@ -53,7 +52,7 @@ def while_success(func: Callable[[], Any], sleep_time: float = 0.1) -> Any:
|
|
|
53
52
|
time.sleep(sleep_time)
|
|
54
53
|
continue
|
|
55
54
|
def while_true(func: Callable[[], bool], sleep_time: float = 0.1) -> Any:
|
|
56
|
-
"""
|
|
55
|
+
"""循环执行函数直到返回True"""
|
|
57
56
|
while True:
|
|
58
57
|
ret = func()
|
|
59
58
|
if ret:
|
|
@@ -62,37 +61,37 @@ def while_true(func: Callable[[], bool], sleep_time: float = 0.1) -> Any:
|
|
|
62
61
|
time.sleep(sleep_time)
|
|
63
62
|
return ret
|
|
64
63
|
def get_file_md5(filepath: str)->str:
|
|
65
|
-
"""
|
|
64
|
+
"""计算文件内容的MD5哈希值
|
|
66
65
|
|
|
67
|
-
|
|
68
|
-
filepath:
|
|
66
|
+
参数:
|
|
67
|
+
filepath: 要计算哈希的文件路径
|
|
69
68
|
|
|
70
|
-
|
|
71
|
-
str: MD5
|
|
69
|
+
返回:
|
|
70
|
+
str: 文件内容的MD5哈希值
|
|
72
71
|
"""
|
|
73
72
|
return hashlib.md5(open(filepath, "rb").read(100*1024*1024)).hexdigest()
|
|
74
73
|
def user_confirm(tip: str, default: bool = True) -> bool:
|
|
75
|
-
"""
|
|
74
|
+
"""提示用户确认是/否问题
|
|
76
75
|
|
|
77
|
-
|
|
78
|
-
tip:
|
|
79
|
-
default:
|
|
76
|
+
参数:
|
|
77
|
+
tip: 显示给用户的消息
|
|
78
|
+
default: 用户直接回车时的默认响应
|
|
80
79
|
|
|
81
|
-
|
|
82
|
-
bool: True
|
|
80
|
+
返回:
|
|
81
|
+
bool: 用户确认返回True,否则返回False
|
|
83
82
|
"""
|
|
84
83
|
suffix = "[Y/n]" if default else "[y/N]"
|
|
85
84
|
ret = get_single_line_input(f"{tip} {suffix}: ")
|
|
86
85
|
return default if ret == "" else ret.lower() == "y"
|
|
87
86
|
|
|
88
87
|
def get_file_line_count(filename: str) -> int:
|
|
89
|
-
"""
|
|
88
|
+
"""计算文件中的行数
|
|
90
89
|
|
|
91
|
-
|
|
92
|
-
filename:
|
|
90
|
+
参数:
|
|
91
|
+
filename: 要计算行数的文件路径
|
|
93
92
|
|
|
94
|
-
|
|
95
|
-
int:
|
|
93
|
+
返回:
|
|
94
|
+
int: 文件中的行数,如果文件无法读取则返回0
|
|
96
95
|
"""
|
|
97
96
|
try:
|
|
98
97
|
return len(open(filename, "r", encoding="utf-8", errors="ignore").readlines())
|
|
@@ -153,200 +152,3 @@ def ct(tag_name: str) -> str:
|
|
|
153
152
|
"""
|
|
154
153
|
return f"</{tag_name}>"
|
|
155
154
|
|
|
156
|
-
|
|
157
|
-
def create_soup_element(content: Union[str, Tag, List[Any]]) -> List[Union[Tag, str]]:
|
|
158
|
-
"""Safely create a BeautifulSoup element, ensuring it's treated as markup
|
|
159
|
-
|
|
160
|
-
Args:
|
|
161
|
-
content: Input content to convert to BeautifulSoup elements
|
|
162
|
-
Returns:
|
|
163
|
-
List of BeautifulSoup elements or strings
|
|
164
|
-
"""
|
|
165
|
-
if isinstance(content, str):
|
|
166
|
-
# Create a wrapper tag to ensure proper parsing
|
|
167
|
-
soup_div = BeautifulSoup(f"<div>{content}</div>", 'html.parser').div
|
|
168
|
-
if soup_div is not None:
|
|
169
|
-
return [cast(Union[Tag, str], el) for el in soup_div.contents]
|
|
170
|
-
return []
|
|
171
|
-
elif isinstance(content, list):
|
|
172
|
-
return content
|
|
173
|
-
return [content]
|
|
174
|
-
|
|
175
|
-
def extract_interactive_elements(html_content: str) -> List[Dict[str, Any]]:
|
|
176
|
-
"""Extract all interactive elements from HTML content with their properties.
|
|
177
|
-
|
|
178
|
-
Args:
|
|
179
|
-
html_content: HTML content to parse
|
|
180
|
-
|
|
181
|
-
Returns:
|
|
182
|
-
List of dictionaries containing element properties:
|
|
183
|
-
- xpath: XPath of the element
|
|
184
|
-
- tag: HTML tag name
|
|
185
|
-
- text: Text content
|
|
186
|
-
- is_clickable: Whether element is clickable
|
|
187
|
-
- is_input: Whether element is an input field
|
|
188
|
-
- is_select: Whether element is a select dropdown
|
|
189
|
-
"""
|
|
190
|
-
soup = BeautifulSoup(html_content, 'html.parser')
|
|
191
|
-
interactive_elements = []
|
|
192
|
-
|
|
193
|
-
# Define interactive tags
|
|
194
|
-
clickable_tags = {'a', 'button', 'input', 'select', 'textarea'}
|
|
195
|
-
input_tags = {'input', 'textarea', 'select'}
|
|
196
|
-
|
|
197
|
-
def get_xpath(element: Tag) -> str:
|
|
198
|
-
"""Generate XPath for an element"""
|
|
199
|
-
components = []
|
|
200
|
-
current = element
|
|
201
|
-
|
|
202
|
-
while current and current.name:
|
|
203
|
-
siblings = current.find_previous_siblings(current.name)
|
|
204
|
-
index = len(siblings) + 1
|
|
205
|
-
components.append(f"{current.name}[{index}]")
|
|
206
|
-
current = current.parent
|
|
207
|
-
|
|
208
|
-
return "/".join(reversed(components))
|
|
209
|
-
|
|
210
|
-
def process_element(element: Tag) -> None:
|
|
211
|
-
"""Process a single element and add it to interactive_elements if it's interactive"""
|
|
212
|
-
tag_name = element.name.lower()
|
|
213
|
-
|
|
214
|
-
# Skip non-interactive elements
|
|
215
|
-
if tag_name not in clickable_tags and not element.find_parent(clickable_tags):
|
|
216
|
-
return
|
|
217
|
-
|
|
218
|
-
# Get element properties
|
|
219
|
-
element_info = {
|
|
220
|
-
'xpath': get_xpath(element),
|
|
221
|
-
'tag': tag_name,
|
|
222
|
-
'text': element.get_text().strip(),
|
|
223
|
-
'is_clickable': tag_name in clickable_tags or bool(element.find_parent('a')) or bool(element.find_parent('button')),
|
|
224
|
-
'is_input': tag_name in input_tags,
|
|
225
|
-
'is_select': tag_name == 'select'
|
|
226
|
-
}
|
|
227
|
-
|
|
228
|
-
# Add additional properties for input elements
|
|
229
|
-
if element_info['is_input']:
|
|
230
|
-
element_info['input_type'] = element.get('type', 'text')
|
|
231
|
-
element_info['name'] = element.get('name', '')
|
|
232
|
-
element_info['value'] = element.get('value', '')
|
|
233
|
-
|
|
234
|
-
# Add options for select elements
|
|
235
|
-
if element_info['is_select']:
|
|
236
|
-
element_info['options'] = [
|
|
237
|
-
{'value': opt.get('value', ''), 'text': opt.get_text().strip()}
|
|
238
|
-
for opt in element.find_all('option')
|
|
239
|
-
if isinstance(opt, Tag)
|
|
240
|
-
]
|
|
241
|
-
|
|
242
|
-
interactive_elements.append(element_info)
|
|
243
|
-
|
|
244
|
-
# Process all elements
|
|
245
|
-
for element in soup.find_all():
|
|
246
|
-
if isinstance(element, Tag):
|
|
247
|
-
process_element(element)
|
|
248
|
-
|
|
249
|
-
return interactive_elements
|
|
250
|
-
|
|
251
|
-
def extract_display_elements(html_content: str) -> List[Dict[str, Any]]:
|
|
252
|
-
"""Extract all display elements from HTML content with their properties.
|
|
253
|
-
|
|
254
|
-
Args:
|
|
255
|
-
html_content: HTML content to parse
|
|
256
|
-
|
|
257
|
-
Returns:
|
|
258
|
-
List of dictionaries containing element properties:
|
|
259
|
-
- xpath: XPath of the element
|
|
260
|
-
- tag: HTML tag name
|
|
261
|
-
- text: Text content
|
|
262
|
-
- heading_level: Heading level (1-6) if the element is a heading
|
|
263
|
-
- is_list: Whether the element is a list
|
|
264
|
-
- is_list_item: Whether the element is a list item
|
|
265
|
-
- is_table: Whether the element is a table
|
|
266
|
-
- is_table_cell: Whether the element is a table cell
|
|
267
|
-
"""
|
|
268
|
-
soup = BeautifulSoup(html_content, 'html.parser')
|
|
269
|
-
display_elements = []
|
|
270
|
-
|
|
271
|
-
# Define display tags
|
|
272
|
-
display_tags = {
|
|
273
|
-
'h1', 'h2', 'h3', 'h4', 'h5', 'h6', # Headings
|
|
274
|
-
'p', 'div', 'span', # Text containers
|
|
275
|
-
'ul', 'ol', 'li', # Lists
|
|
276
|
-
'table', 'tr', 'td', 'th', # Tables
|
|
277
|
-
'article', 'section', 'main', # Content sections
|
|
278
|
-
'header', 'footer', 'nav', # Layout sections
|
|
279
|
-
'aside', 'figure', 'figcaption' # Side content
|
|
280
|
-
}
|
|
281
|
-
|
|
282
|
-
# Define interactive tags to exclude
|
|
283
|
-
interactive_tags = {'a', 'button', 'input', 'select', 'textarea', 'form'}
|
|
284
|
-
|
|
285
|
-
def get_xpath(element: Tag) -> str:
|
|
286
|
-
"""Generate XPath for an element"""
|
|
287
|
-
components = []
|
|
288
|
-
current = element
|
|
289
|
-
|
|
290
|
-
while current and current.name:
|
|
291
|
-
siblings = current.find_previous_siblings(current.name)
|
|
292
|
-
index = len(siblings) + 1
|
|
293
|
-
components.append(f"{current.name}[{index}]")
|
|
294
|
-
current = current.parent
|
|
295
|
-
|
|
296
|
-
return "/".join(reversed(components))
|
|
297
|
-
|
|
298
|
-
def process_element(element: Tag) -> None:
|
|
299
|
-
"""Process a single element and add it to display_elements if it's a display element"""
|
|
300
|
-
tag_name = element.name.lower()
|
|
301
|
-
|
|
302
|
-
# Skip non-display elements and interactive elements
|
|
303
|
-
if tag_name not in display_tags or element.find_parent(interactive_tags):
|
|
304
|
-
return
|
|
305
|
-
|
|
306
|
-
# Get text content
|
|
307
|
-
text = element.get_text().strip()
|
|
308
|
-
if not text: # Skip empty elements
|
|
309
|
-
return
|
|
310
|
-
|
|
311
|
-
# Get element properties
|
|
312
|
-
element_info = {
|
|
313
|
-
'xpath': get_xpath(element),
|
|
314
|
-
'tag': tag_name,
|
|
315
|
-
'text': text,
|
|
316
|
-
'heading_level': int(tag_name[1]) if tag_name.startswith('h') and len(tag_name) == 2 else None,
|
|
317
|
-
'is_list': tag_name in {'ul', 'ol'},
|
|
318
|
-
'is_list_item': tag_name == 'li',
|
|
319
|
-
'is_table': tag_name == 'table',
|
|
320
|
-
'is_table_cell': tag_name in {'td', 'th'}
|
|
321
|
-
}
|
|
322
|
-
|
|
323
|
-
# Add list-specific properties
|
|
324
|
-
if element_info['is_list']:
|
|
325
|
-
element_info['list_items'] = [
|
|
326
|
-
{'text': li.get_text().strip()}
|
|
327
|
-
for li in element.find_all('li')
|
|
328
|
-
if isinstance(li, Tag)
|
|
329
|
-
]
|
|
330
|
-
|
|
331
|
-
# Add table-specific properties
|
|
332
|
-
if element_info['is_table']:
|
|
333
|
-
element_info['table_rows'] = [
|
|
334
|
-
{
|
|
335
|
-
'cells': [
|
|
336
|
-
{'text': cell.get_text().strip(), 'is_header': cell.name == 'th'}
|
|
337
|
-
for cell in row.find_all(['td', 'th'])
|
|
338
|
-
if isinstance(cell, Tag)
|
|
339
|
-
]
|
|
340
|
-
}
|
|
341
|
-
for row in element.find_all('tr')
|
|
342
|
-
if isinstance(row, Tag)
|
|
343
|
-
]
|
|
344
|
-
|
|
345
|
-
display_elements.append(element_info)
|
|
346
|
-
|
|
347
|
-
# Process all elements
|
|
348
|
-
for element in soup.find_all():
|
|
349
|
-
if isinstance(element, Tag):
|
|
350
|
-
process_element(element)
|
|
351
|
-
|
|
352
|
-
return display_elements
|