jarvis-ai-assistant 0.1.138__py3-none-any.whl → 0.1.141__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of jarvis-ai-assistant might be problematic.
Files changed (85)
  1. jarvis/__init__.py +1 -1
  2. jarvis/jarvis_agent/__init__.py +62 -14
  3. jarvis/jarvis_agent/builtin_input_handler.py +4 -14
  4. jarvis/jarvis_agent/main.py +1 -1
  5. jarvis/jarvis_agent/patch.py +37 -40
  6. jarvis/jarvis_agent/shell_input_handler.py +2 -3
  7. jarvis/jarvis_code_agent/code_agent.py +23 -30
  8. jarvis/jarvis_code_analysis/checklists/__init__.py +3 -0
  9. jarvis/jarvis_code_analysis/checklists/c_cpp.py +50 -0
  10. jarvis/jarvis_code_analysis/checklists/csharp.py +75 -0
  11. jarvis/jarvis_code_analysis/checklists/data_format.py +82 -0
  12. jarvis/jarvis_code_analysis/checklists/devops.py +107 -0
  13. jarvis/jarvis_code_analysis/checklists/docs.py +87 -0
  14. jarvis/jarvis_code_analysis/checklists/go.py +52 -0
  15. jarvis/jarvis_code_analysis/checklists/infrastructure.py +98 -0
  16. jarvis/jarvis_code_analysis/checklists/java.py +66 -0
  17. jarvis/jarvis_code_analysis/checklists/javascript.py +73 -0
  18. jarvis/jarvis_code_analysis/checklists/kotlin.py +107 -0
  19. jarvis/jarvis_code_analysis/checklists/loader.py +76 -0
  20. jarvis/jarvis_code_analysis/checklists/php.py +77 -0
  21. jarvis/jarvis_code_analysis/checklists/python.py +56 -0
  22. jarvis/jarvis_code_analysis/checklists/ruby.py +107 -0
  23. jarvis/jarvis_code_analysis/checklists/rust.py +58 -0
  24. jarvis/jarvis_code_analysis/checklists/shell.py +75 -0
  25. jarvis/jarvis_code_analysis/checklists/sql.py +72 -0
  26. jarvis/jarvis_code_analysis/checklists/swift.py +77 -0
  27. jarvis/jarvis_code_analysis/checklists/web.py +97 -0
  28. jarvis/jarvis_code_analysis/code_review.py +660 -0
  29. jarvis/jarvis_dev/main.py +61 -88
  30. jarvis/jarvis_git_squash/main.py +3 -3
  31. jarvis/jarvis_git_utils/git_commiter.py +242 -0
  32. jarvis/jarvis_init/main.py +62 -0
  33. jarvis/jarvis_platform/base.py +4 -0
  34. jarvis/jarvis_platform/kimi.py +173 -5
  35. jarvis/jarvis_platform/openai.py +3 -0
  36. jarvis/jarvis_platform/registry.py +1 -0
  37. jarvis/jarvis_platform/yuanbao.py +275 -5
  38. jarvis/jarvis_tools/ask_codebase.py +6 -9
  39. jarvis/jarvis_tools/ask_user.py +17 -5
  40. jarvis/jarvis_tools/base.py +3 -1
  41. jarvis/jarvis_tools/chdir.py +1 -0
  42. jarvis/jarvis_tools/create_code_agent.py +4 -3
  43. jarvis/jarvis_tools/create_sub_agent.py +1 -0
  44. jarvis/jarvis_tools/execute_script.py +170 -0
  45. jarvis/jarvis_tools/file_analyzer.py +90 -239
  46. jarvis/jarvis_tools/file_operation.py +99 -31
  47. jarvis/jarvis_tools/{find_methodolopy.py → find_methodology.py} +2 -1
  48. jarvis/jarvis_tools/lsp_get_diagnostics.py +2 -0
  49. jarvis/jarvis_tools/methodology.py +11 -11
  50. jarvis/jarvis_tools/read_code.py +2 -0
  51. jarvis/jarvis_tools/read_webpage.py +33 -196
  52. jarvis/jarvis_tools/registry.py +68 -131
  53. jarvis/jarvis_tools/search_web.py +14 -6
  54. jarvis/jarvis_tools/virtual_tty.py +399 -0
  55. jarvis/jarvis_utils/config.py +29 -3
  56. jarvis/jarvis_utils/embedding.py +0 -317
  57. jarvis/jarvis_utils/file_processors.py +343 -0
  58. jarvis/jarvis_utils/input.py +0 -1
  59. jarvis/jarvis_utils/methodology.py +94 -435
  60. jarvis/jarvis_utils/utils.py +207 -9
  61. {jarvis_ai_assistant-0.1.138.dist-info → jarvis_ai_assistant-0.1.141.dist-info}/METADATA +4 -4
  62. jarvis_ai_assistant-0.1.141.dist-info/RECORD +94 -0
  63. {jarvis_ai_assistant-0.1.138.dist-info → jarvis_ai_assistant-0.1.141.dist-info}/entry_points.txt +4 -4
  64. jarvis/jarvis_code_agent/file_select.py +0 -202
  65. jarvis/jarvis_platform/ai8.py +0 -268
  66. jarvis/jarvis_platform/ollama.py +0 -137
  67. jarvis/jarvis_platform/oyi.py +0 -307
  68. jarvis/jarvis_rag/file_processors.py +0 -138
  69. jarvis/jarvis_rag/main.py +0 -1734
  70. jarvis/jarvis_tools/code_review.py +0 -333
  71. jarvis/jarvis_tools/execute_python_script.py +0 -58
  72. jarvis/jarvis_tools/execute_shell.py +0 -97
  73. jarvis/jarvis_tools/execute_shell_script.py +0 -58
  74. jarvis/jarvis_tools/find_caller.py +0 -278
  75. jarvis/jarvis_tools/find_symbol.py +0 -295
  76. jarvis/jarvis_tools/function_analyzer.py +0 -331
  77. jarvis/jarvis_tools/git_commiter.py +0 -167
  78. jarvis/jarvis_tools/project_analyzer.py +0 -304
  79. jarvis/jarvis_tools/rag.py +0 -143
  80. jarvis/jarvis_tools/tool_generator.py +0 -221
  81. jarvis_ai_assistant-0.1.138.dist-info/RECORD +0 -85
  82. /jarvis/{jarvis_rag → jarvis_init}/__init__.py +0 -0
  83. {jarvis_ai_assistant-0.1.138.dist-info → jarvis_ai_assistant-0.1.141.dist-info}/LICENSE +0 -0
  84. {jarvis_ai_assistant-0.1.138.dist-info → jarvis_ai_assistant-0.1.141.dist-info}/WHEEL +0 -0
  85. {jarvis_ai_assistant-0.1.138.dist-info → jarvis_ai_assistant-0.1.141.dist-info}/top_level.txt +0 -0
@@ -33,181 +33,6 @@ def get_context_token_count(text: str) -> int:
      # Fall back to a rough character-based estimate
      return len(text) // 4  # Rough estimate of about 4 characters per token
  
- @functools.lru_cache(maxsize=1)
- def load_embedding_model() -> SentenceTransformer:
-     """
-     Load the sentence embedding model, using a cache to avoid repeated loading.
- 
-     Returns:
-         SentenceTransformer: the loaded embedding model
-     """
-     model_name = "BAAI/bge-m3"
-     cache_dir = os.path.expanduser("~/.cache/huggingface/hub")
- 
-     # Check whether the model is already in the global cache
-     if model_name in _global_models:
-         return _global_models[model_name]
- 
-     try:
-         embedding_model = SentenceTransformer(
-             model_name,
-             cache_folder=cache_dir,
-             local_files_only=True
-         )
-     except Exception:
-         embedding_model = SentenceTransformer(
-             model_name,
-             cache_folder=cache_dir,
-             local_files_only=False
-         )
- 
-     # Move the model to the GPU if one is available
-     if torch.cuda.is_available():
-         embedding_model.to(torch.device("cuda"))
- 
-     # Save to the global cache
-     _global_models[model_name] = embedding_model
- 
-     return embedding_model
- 
- def get_embedding(embedding_model: Any, text: str) -> np.ndarray:
-     """
-     Generate an embedding vector for the given text.
- 
-     Args:
-         embedding_model: the embedding model to use
-         text: the input text to embed
- 
-     Returns:
-         np.ndarray: the embedding vector
-     """
-     embedding = embedding_model.encode(text,
-                                        normalize_embeddings=True,
-                                        show_progress_bar=False)
-     return np.array(embedding, dtype=np.float32)
- 
- def get_embedding_batch(embedding_model: Any, prefix: str, texts: List[str], spinner: Optional[Yaspin] = None, batch_size: int = 8) -> np.ndarray:
-     """
-     Generate embedding vectors for a batch of texts with efficient batching, optimized for RAG.
- 
-     Args:
-         embedding_model: the embedding model to use
-         prefix: prefix for the progress indicator
-         texts: the list of texts to embed
-         spinner: optional progress indicator
-         batch_size: batch size; larger values may be faster but need more memory
- 
-     Returns:
-         np.ndarray: the stacked embedding vectors
-     """
-     # Simple embedding cache to avoid recomputing identical text chunks
-     embedding_cache = {}
-     cache_hits = 0
- 
-     try:
-         # Preprocessing: split all texts into chunks
-         all_chunks = []
-         chunk_indices = []  # Track the chunk index range for each original text
- 
-         for i, text in enumerate(texts):
-             if spinner:
-                 spinner.text = f"{prefix} preprocessing ({i+1}/{len(texts)}) ..."
- 
-             # Preprocess the text: remove extra whitespace and normalize
-             text = ' '.join(text.split()) if text else ""
- 
-             # Use the more optimized chunking function
-             chunks = split_text_into_chunks(text, 512)
-             start_idx = len(all_chunks)
-             all_chunks.extend(chunks)
-             end_idx = len(all_chunks)
-             chunk_indices.append((start_idx, end_idx))
- 
-         if not all_chunks:
-             return np.zeros((0, embedding_model.get_sentence_embedding_dimension()), dtype=np.float32)
- 
-         # Process all chunks in batches
-         all_vectors = []
-         for i in range(0, len(all_chunks), batch_size):
-             if spinner:
-                 spinner.text = f"{prefix} batch embedding ({i+1}/{len(all_chunks)}) ..."
- 
-             batch = all_chunks[i:i+batch_size]
-             batch_to_process = []
-             batch_indices = []
- 
-             # Check the cache to avoid recomputation
-             for j, chunk in enumerate(batch):
-                 chunk_hash = hash(chunk)
-                 if chunk_hash in embedding_cache:
-                     all_vectors.append(embedding_cache[chunk_hash])
-                     cache_hits += 1
-                 else:
-                     batch_to_process.append(chunk)
-                     batch_indices.append(j)
- 
-             if batch_to_process:
-                 # Process the chunks that are not cached
-                 batch_vectors = embedding_model.encode(
-                     batch_to_process,
-                     normalize_embeddings=True,
-                     show_progress_bar=False,
-                     convert_to_numpy=True,
-                 )
- 
-                 # Handle the results and update the cache
-                 if len(batch_to_process) == 1:
-                     vec = batch_vectors
-                     chunk_hash = hash(batch_to_process[0])
-                     embedding_cache[chunk_hash] = vec
-                     all_vectors.append(vec)
-                 else:
-                     for j, vec in enumerate(batch_vectors):
-                         chunk_hash = hash(batch_to_process[j])
-                         embedding_cache[chunk_hash] = vec
-                         all_vectors.append(vec)
- 
-         # Reassemble the results in the original text order
-         result_vectors = []
-         for start_idx, end_idx in chunk_indices:
-             text_vectors = []
-             for j in range(start_idx, end_idx):
-                 if j < len(all_vectors):
-                     text_vectors.append(all_vectors[j])
- 
-             if text_vectors:
-                 # When a text was split into multiple chunks, take a weighted average
-                 if len(text_vectors) > 1:
-                     # RAG optimization: weighted average over the chunks, earlier chunks weighted slightly higher
-                     weights = np.linspace(1.0, 0.8, len(text_vectors))
-                     weights = weights / weights.sum()  # Normalize the weights
- 
-                     # Apply the weights and sum
-                     weighted_sum = np.zeros_like(text_vectors[0])
-                     for i, vec in enumerate(text_vectors):
-                         # Make sure vector shapes are consistent to handle possible dimension mismatches
-                         vec_array = np.asarray(vec).reshape(weighted_sum.shape)
-                         weighted_sum += vec_array * weights[i]
- 
-                     # Normalize the resulting vector
-                     norm = np.linalg.norm(weighted_sum)
-                     if norm > 0:
-                         weighted_sum = weighted_sum / norm
- 
-                     result_vectors.append(weighted_sum)
-                 else:
-                     # A single chunk is used directly
-                     result_vectors.append(text_vectors[0])
- 
-         if spinner and cache_hits > 0:
-             spinner.text = f"{prefix} cache hits: {cache_hits}/{len(all_chunks)} chunks"
- 
-         return np.vstack(result_vectors)
- 
-     except Exception as e:
-         PrettyOutput.print(f"Batch embedding failed: {str(e)}", OutputType.ERROR)
-         return np.zeros((0, embedding_model.get_sentence_embedding_dimension()), dtype=np.float32)
- 
  def split_text_into_chunks(text: str, max_length: int = 512, min_length: int = 50) -> List[str]:
      """Split text into chunks with overlapping windows, optimized for RAG retrieval.
  
@@ -357,145 +182,3 @@ def load_tokenizer() -> AutoTokenizer:
      _global_tokenizers[model_name] = tokenizer
  
      return tokenizer  # type: ignore
- 
- @functools.lru_cache(maxsize=1)
- def load_rerank_model() -> Tuple[AutoModelForSequenceClassification, AutoTokenizer]:
-     """
-     Load the reranker model and tokenizer, using a cache to avoid repeated loading.
- 
-     Returns:
-         Tuple[AutoModelForSequenceClassification, AutoTokenizer]: the loaded model and tokenizer
-     """
-     model_name = "BAAI/bge-reranker-v2-m3"
-     cache_dir = os.path.expanduser("~/.cache/huggingface/hub")
- 
-     # Check the global cache
-     key = f"rerank_{model_name}"
-     if key in _global_models and f"{key}_tokenizer" in _global_tokenizers:
-         return _global_models[key], _global_tokenizers[f"{key}_tokenizer"]
- 
-     try:
-         tokenizer = AutoTokenizer.from_pretrained(
-             model_name,
-             cache_dir=cache_dir,
-             local_files_only=True
-         )
-         model = AutoModelForSequenceClassification.from_pretrained(
-             model_name,
-             cache_dir=cache_dir,
-             local_files_only=True
-         )
-     except Exception:
-         tokenizer = AutoTokenizer.from_pretrained(
-             model_name,
-             cache_dir=cache_dir,
-             local_files_only=False
-         )
-         model = AutoModelForSequenceClassification.from_pretrained(
-             model_name,
-             cache_dir=cache_dir,
-             local_files_only=False
-         )
- 
-     if torch.cuda.is_available():
-         model = model.cuda()
-     model.eval()
- 
-     # Save to the global cache
-     _global_models[key] = model
-     _global_tokenizers[f"{key}_tokenizer"] = tokenizer
- 
-     return model, tokenizer  # type: ignore
- 
- def rerank_results(query: str, documents: List[str], initial_scores: Optional[List[float]] = None,
-                    batch_size: int = 8, spinner: Optional[Yaspin] = None) -> List[float]:
-     """
-     Rerank retrieval results with a cross-encoder to improve RAG precision.
- 
-     Args:
-         query: the query text
-         documents: the list of document contents to rerank
-         initial_scores: optional initial retrieval scores; if provided, they are fused with the rerank scores
-         batch_size: batch size
-         spinner: optional progress indicator
- 
-     Returns:
-         List[float]: the reranked scores, aligned with the input documents
-     """
-     try:
-         if not documents:
-             return []
- 
-         # Load the reranker model
-         if spinner:
-             spinner.text = "Loading reranker model..."
-         model, tokenizer = load_rerank_model()
- 
-         # Prepare scoring
-         all_scores = []
- 
-         # Process in batches
-         for i in range(0, len(documents), batch_size):
-             if spinner:
-                 spinner.text = f"Reranking progress: {i}/{len(documents)}..."
- 
-             # Prepare the current batch
-             batch_docs = documents[i:i+batch_size]
-             pairs = [(query, doc) for doc in batch_docs]
- 
-             # Encode the inputs
-             with torch.no_grad():
-                 # Type-ignore to avoid mypy errors
-                 inputs = tokenizer(  # type: ignore
-                     pairs,
-                     padding=True,
-                     truncation=True,
-                     return_tensors="pt",
-                     max_length=512
-                 )
- 
-                 # Use GPU acceleration if available
-                 if torch.cuda.is_available():
-                     inputs = {k: v.cuda() for k, v in inputs.items()}
- 
-                 # Get the scores
-                 outputs = model(**inputs)  # type: ignore
-                 scores = outputs.logits.squeeze(-1).cpu().tolist()
- 
-                 # If there is only one document, make sure a list is returned
-                 if len(batch_docs) == 1:
-                     all_scores.append(float(scores))
-                 else:
-                     all_scores.extend(scores)
- 
-         # Normalize the scores to the 0-1 range
-         if all_scores:
-             min_score = min(all_scores)
-             max_score = max(all_scores)
-             if max_score > min_score:
-                 normalized_scores = [(score - min_score) / (max_score - min_score) for score in all_scores]
-             else:
-                 normalized_scores = [0.5] * len(all_scores)
- 
-             # Fuse with the initial scores if provided
-             if initial_scores and len(initial_scores) == len(normalized_scores):
-                 # Weighted-average fusion: initial score weight 0.3, rerank score weight 0.7
-                 final_scores = [0.3 * init_score + 0.7 * rerank_score
-                                 for init_score, rerank_score in zip(initial_scores, normalized_scores)]
-                 return final_scores
- 
-             return normalized_scores
- 
-         if spinner:
-             spinner.text = "Reranking complete"
- 
-         # If reranking failed, return the initial scores or default scores
-         return initial_scores if initial_scores else [0.5] * len(documents)
- 
-     except Exception as e:
-         PrettyOutput.print(f"Reranking failed: {str(e)}", OutputType.ERROR)
-         if spinner:
-             spinner.text = f"Reranking failed: {str(e)}"
- 
-         # Fall back to the initial scores when an error occurs
-         return initial_scores if initial_scores else [0.5] * len(documents)
@@ -0,0 +1,343 @@
+ from pathlib import Path
+ import fitz  # PyMuPDF for PDF files
+ from docx import Document as DocxDocument  # python-docx for DOCX files
+ from pptx import Presentation
+ import pandas as pd
+ import unicodedata
+ 
+ class FileProcessor:
+     """Base class for file processors"""
+     @staticmethod
+     def can_handle(file_path: str) -> bool:
+         """Determine whether the file can be processed"""
+         raise NotImplementedError
+ 
+     @staticmethod
+     def extract_text(file_path: str) -> str:
+         """Extract the file's text content"""
+         raise NotImplementedError
+ 
+ class TextFileProcessor(FileProcessor):
+     """Text file processor"""
+     ENCODINGS = ['utf-8', 'gbk', 'gb2312', 'latin1']
+     SAMPLE_SIZE = 8192  # Read the first 8KB to detect the encoding
+ 
+     @staticmethod
+     def can_handle(file_path: str) -> bool:
+         """Determine whether the file is a text file by trying to decode it"""
+         try:
+             # Read the first part of the file to detect the encoding
+             with open(file_path, 'rb') as f:
+                 sample = f.read(TextFileProcessor.SAMPLE_SIZE)
+ 
+             # Check for null bytes (usually indicates a binary file)
+             if b'\x00' in sample:
+                 return False
+ 
+             # Check for too many non-printable characters (usually indicates a binary file)
+             non_printable = sum(1 for byte in sample if byte < 32 and byte not in (9, 10, 13))  # tab, newline, carriage return
+             if non_printable / len(sample) > 0.3:  # If non-printable characters exceed 30%, treat it as a binary file
+                 return False
+ 
+             # Try to decode with different encodings
+             for encoding in TextFileProcessor.ENCODINGS:
+                 try:
+                     sample.decode(encoding)
+                     return True
+                 except UnicodeDecodeError:
+                     continue
+ 
+             return False
+ 
+         except Exception:
+             return False
+ 
+     @staticmethod
+     def extract_text(file_path: str) -> str:
+         """Extract the text content, using the detected encoding"""
+         detected_encoding = None
+         try:
+             # First try to detect the encoding
+             with open(file_path, 'rb') as f:
+                 raw_data = f.read()
+ 
+             # Try different encodings
+             for encoding in TextFileProcessor.ENCODINGS:
+                 try:
+                     raw_data.decode(encoding)
+                     detected_encoding = encoding
+                     break
+                 except UnicodeDecodeError:
+                     continue
+ 
+             if not detected_encoding:
+                 raise UnicodeDecodeError(f"Failed to decode file with supported encodings: {file_path}")  # type: ignore
+ 
+             # Read the file with the detected encoding
+             with open(file_path, 'r', encoding=detected_encoding, errors='ignore') as f:
+                 content = f.read()
+ 
+             # Normalize Unicode characters
+             content = unicodedata.normalize('NFKC', content)
+ 
+             return content
+ 
+         except Exception as e:
+             raise Exception(f"Failed to read file: {str(e)}")
+ 
+ class PDFProcessor(FileProcessor):
+     """PDF file processor"""
+     @staticmethod
+     def can_handle(file_path: str) -> bool:
+         return Path(file_path).suffix.lower() == '.pdf'
+ 
+     @staticmethod
+     def extract_text(file_path: str) -> str:
+         """Extract all text content from a PDF file, including page numbers, image descriptions, etc."""
+         try:
+             text_parts = []
+             with fitz.open(file_path) as doc:  # type: ignore
+                 # Add document information
+                 info = doc.metadata
+                 if info:
+                     meta_text = []
+                     if info.get("title"):
+                         meta_text.append(f"Title: {info['title']}")
+                     if info.get("author"):
+                         meta_text.append(f"Author: {info['author']}")
+                     if info.get("subject"):
+                         meta_text.append(f"Subject: {info['subject']}")
+                     if info.get("keywords"):
+                         meta_text.append(f"Keywords: {info['keywords']}")
+ 
+                     if meta_text:
+                         text_parts.append("=== Document information ===")
+                         text_parts.append("\n".join(meta_text))
+ 
+                 # Extract the table of contents, if any
+                 toc = doc.get_toc()  # type: ignore
+                 if toc:
+                     text_parts.append("\n=== Table of contents ===")
+                     for level, title, page in toc:
+                         indent = "  " * (level - 1)
+                         text_parts.append(f"{indent}- {title} (page {page})")
+ 
+                 # Process each page
+                 text_parts.append("\n=== Page content ===")
+                 for page_index in range(len(doc)):  # Iterate by index instead of enumerating the document object directly
+                     # Add a page marker
+                     text_parts.append(f"\n--- Page {page_index+1} ---")
+ 
+                     # Get the page
+                     page = doc[page_index]
+ 
+                     # Extract the page text (including structural information)
+                     try:
+                         # Try structured extraction (preserves paragraph and block structure)
+                         text = page.get_text("text")  # type: ignore
+                         text = text.strip()
+                         if text:
+                             text_parts.append(text)
+                     except Exception:
+                         # If structured extraction fails, fall back to plain text extraction
+                         text = page.get_text()  # type: ignore
+                         if text.strip():
+                             text_parts.append(text.strip())
+ 
+                     # Extract image information if needed
+                     # Note: this may increase processing time; enable it on demand
+                     """
+                     image_list = page.get_images()
+                     if image_list:
+                         text_parts.append(f"This page contains {len(image_list)} images")
+                     """
+ 
+             # Merge all the text
+             return "\n".join(text_parts)
+ 
+         except Exception as e:
+             # Handle any exceptions
+             return f"PDF processing error: {str(e)}"
+ 
+ class DocxProcessor(FileProcessor):
+     """DOCX file processor"""
+     @staticmethod
+     def can_handle(file_path: str) -> bool:
+         return Path(file_path).suffix.lower() == '.docx'
+ 
+     @staticmethod
+     def extract_text(file_path: str) -> str:
+         """Extract all text content from a DOCX file, including paragraphs, tables, headers and footers, etc."""
+         doc = DocxDocument(file_path)
+         full_text = []
+ 
+         # Extract paragraph text
+         for para in doc.paragraphs:
+             if para.text.strip():  # Skip empty paragraphs
+                 full_text.append(para.text)
+ 
+         # Extract table text
+         for table in doc.tables:
+             for row in table.rows:
+                 row_texts = []
+                 for cell in row.cells:
+                     # Each cell may contain multiple paragraphs
+                     cell_text = "\n".join([p.text for p in cell.paragraphs if p.text.strip()])
+                     if cell_text:
+                         row_texts.append(cell_text)
+                 if row_texts:
+                     full_text.append(" | ".join(row_texts))
+ 
+         # Extract headers and footers (if there are sections)
+         try:
+             for section in doc.sections:
+                 # Extract the header
+                 if section.header:
+                     header_text = "\n".join([p.text for p in section.header.paragraphs if p.text.strip()])
+                     if header_text:
+                         full_text.append(f"Header: {header_text}")
+ 
+                 # Extract the footer
+                 if section.footer:
+                     footer_text = "\n".join([p.text for p in section.footer.paragraphs if p.text.strip()])
+                     if footer_text:
+                         full_text.append(f"Footer: {footer_text}")
+         except:
+             # If header/footer extraction fails, ignore the error and continue
+             pass
+ 
+         # Merge all the text
+         return "\n\n".join(full_text)
+ 
+ class PPTProcessor(FileProcessor):
+     """PPT file processor"""
+     @staticmethod
+     def can_handle(file_path: str) -> bool:
+         return Path(file_path).suffix.lower() in ['.ppt', '.pptx']
+ 
+     @staticmethod
+     def extract_text(file_path: str) -> str:
+         """Extract all text content from a PPT file, including titles, text boxes, notes, etc."""
+         prs = Presentation(file_path)
+         all_text = []
+ 
+         # Iterate over all slides
+         for slide_index, slide in enumerate(prs.slides, 1):
+             slide_text = []
+ 
+             # Add the slide number
+             slide_text.append(f"=== Slide {slide_index} ===")
+ 
+             # Extract text from all shapes on the slide
+             for shape in slide.shapes:
+                 # Shapes that carry text
+                 try:
+                     if hasattr(shape, "text_frame") and shape.text_frame:  # type: ignore
+                         for paragraph in shape.text_frame.paragraphs:  # type: ignore
+                             text = paragraph.text.strip()
+                             if text:
+                                 slide_text.append(text)
+                 except AttributeError:
+                     pass
+ 
+                 # Extract table content
+                 try:
+                     if hasattr(shape, "table") and shape.table:  # type: ignore
+                         for row in shape.table.rows:  # type: ignore
+                             row_texts = []
+                             for cell in row.cells:
+                                 if hasattr(cell, "text_frame") and cell.text_frame:
+                                     cell_paragraphs = cell.text_frame.paragraphs  # type: ignore
+                                     cell_text = " ".join([p.text.strip() for p in cell_paragraphs if p.text.strip()])
+                                     if cell_text:
+                                         row_texts.append(cell_text)
+                             if row_texts:
+                                 slide_text.append(" | ".join(row_texts))
+                 except AttributeError:
+                     pass
+ 
+             # Extract slide notes
+             try:
+                 if hasattr(slide, "has_notes_slide") and slide.has_notes_slide:
+                     notes_slide = slide.notes_slide
+                     if notes_slide and hasattr(notes_slide, "notes_text_frame") and notes_slide.notes_text_frame:
+                         notes_text = notes_slide.notes_text_frame.text.strip()  # type: ignore
+                         if notes_text:
+                             slide_text.append(f"Notes: {notes_text}")
+             except AttributeError:
+                 pass
+ 
+             # Merge all text for the current slide
+             if len(slide_text) > 1:  # There is content besides the slide number
+                 all_text.append("\n".join(slide_text))
+ 
+         # Return the text content of all slides
+         return "\n\n".join(all_text)
+ 
+ class ExcelProcessor(FileProcessor):
+     """Excel file processor"""
+     @staticmethod
+     def can_handle(file_path: str) -> bool:
+         return Path(file_path).suffix.lower() in ['.xls', '.xlsx']
+ 
+     @staticmethod
+     def extract_text(file_path: str) -> str:
+         """Extract all text content from an Excel file, covering multiple worksheets and formatted content"""
+         try:
+             # Read all worksheets
+             excel_file = pd.ExcelFile(file_path)
+             sheets_text = []
+ 
+             # Process each worksheet
+             for sheet_name in excel_file.sheet_names:
+                 # Read the current worksheet
+                 df = pd.read_excel(file_path, sheet_name=sheet_name)
+ 
+                 # Skip empty sheets
+                 if df.empty:
+                     continue
+ 
+                 # Add the worksheet title
+                 sheet_text = [f"=== Worksheet: {sheet_name} ==="]
+ 
+                 # Fill empty cells to avoid displaying NaN
+                 df = df.fillna("")
+ 
+                 # Extract the column header information
+                 if not df.columns.empty:
+                     headers = [str(col) for col in df.columns]
+                     sheet_text.append("Column headers: " + " | ".join(headers))
+ 
+                 # Try to extract potentially key information from the table
+                 # 1. Table content overview
+                 row_count, col_count = df.shape
+                 sheet_text.append(f"Table size: {row_count} rows x {col_count} columns")
+ 
+                 # 2. Table data, in a friendlier format
+                 try:
+                     # Convert the dataframe to a string representation
+                     # Cap the number of rows and columns to avoid oversized tables
+                     max_rows = min(500, row_count)  # Show at most 500 rows
+                     if row_count > max_rows:
+                         sheet_text.append(f"Note: the table is too large, only the first {max_rows} rows are shown")
+ 
+                     # Convert the DataFrame to a string table
+                     table_str = df.head(max_rows).to_string(index=True, max_rows=max_rows, max_cols=None)
+                     sheet_text.append(table_str)
+ 
+                 except Exception as e:
+                     sheet_text.append(f"Table data extraction error: {str(e)}")
+ 
+                 # Merge the text for the current worksheet
+                 sheets_text.append("\n".join(sheet_text))
+ 
+             # If nothing was extracted, return a hint
+             if not sheets_text:
+                 return "The Excel file is empty or no content could be extracted"
+ 
+             # Merge the text of all worksheets
+             return "\n\n".join(sheets_text)
+ 
+         except Exception as e:
+             # Handle any exceptions and return an error message
+             return f"Excel file processing error: {str(e)}"
@@ -78,7 +78,6 @@ class FileCompleter(Completer):
          default_suggestions = [
              (ot("CodeBase"), 'query the codebase'),
              (ot("Web"), 'web search'),
-             (ot("RAG"), 'knowledge base retrieval'),
              (ot("Summary"), 'summarize'),
              (ot("Clear"), 'clear history'),
              (ot("Methodology"), 'find relevant methodologies'),