jarvis-ai-assistant 0.1.96__py3-none-any.whl → 0.1.98__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of jarvis-ai-assistant might be problematic.

Files changed (41)
  1. jarvis/__init__.py +1 -1
  2. jarvis/agent.py +138 -144
  3. jarvis/jarvis_codebase/main.py +87 -54
  4. jarvis/jarvis_coder/git_utils.py +22 -25
  5. jarvis/jarvis_coder/main.py +166 -171
  6. jarvis/jarvis_coder/patch_handler.py +153 -453
  7. jarvis/jarvis_coder/plan_generator.py +76 -48
  8. jarvis/jarvis_platform/main.py +39 -39
  9. jarvis/jarvis_rag/main.py +182 -182
  10. jarvis/jarvis_smart_shell/main.py +34 -34
  11. jarvis/main.py +24 -24
  12. jarvis/models/ai8.py +22 -22
  13. jarvis/models/base.py +17 -13
  14. jarvis/models/kimi.py +31 -31
  15. jarvis/models/ollama.py +28 -28
  16. jarvis/models/openai.py +22 -24
  17. jarvis/models/oyi.py +25 -25
  18. jarvis/models/registry.py +33 -34
  19. jarvis/tools/ask_user.py +5 -5
  20. jarvis/tools/base.py +2 -2
  21. jarvis/tools/chdir.py +9 -9
  22. jarvis/tools/codebase_qa.py +4 -4
  23. jarvis/tools/coder.py +4 -4
  24. jarvis/tools/file_ops.py +1 -1
  25. jarvis/tools/generator.py +23 -23
  26. jarvis/tools/methodology.py +4 -4
  27. jarvis/tools/rag.py +4 -4
  28. jarvis/tools/registry.py +38 -38
  29. jarvis/tools/search.py +42 -42
  30. jarvis/tools/shell.py +13 -13
  31. jarvis/tools/sub_agent.py +16 -16
  32. jarvis/tools/thinker.py +41 -41
  33. jarvis/tools/webpage.py +17 -17
  34. jarvis/utils.py +59 -60
  35. {jarvis_ai_assistant-0.1.96.dist-info → jarvis_ai_assistant-0.1.98.dist-info}/METADATA +1 -1
  36. jarvis_ai_assistant-0.1.98.dist-info/RECORD +47 -0
  37. jarvis_ai_assistant-0.1.96.dist-info/RECORD +0 -47
  38. {jarvis_ai_assistant-0.1.96.dist-info → jarvis_ai_assistant-0.1.98.dist-info}/LICENSE +0 -0
  39. {jarvis_ai_assistant-0.1.96.dist-info → jarvis_ai_assistant-0.1.98.dist-info}/WHEEL +0 -0
  40. {jarvis_ai_assistant-0.1.96.dist-info → jarvis_ai_assistant-0.1.98.dist-info}/entry_points.txt +0 -0
  41. {jarvis_ai_assistant-0.1.96.dist-info → jarvis_ai_assistant-0.1.98.dist-info}/top_level.txt +0 -0
jarvis/jarvis_rag/main.py CHANGED
@@ -19,46 +19,46 @@ from threading import Lock
 
 @dataclass
 class Document:
-    """文档类,用于存储文档内容和元数据"""
-    content: str  # 文档内容
-    metadata: Dict  # 元数据(文件路径、位置等)
-    md5: str = ""  # 文件MD5值,用于增量更新检测
+    """Document class, for storing document content and metadata"""
+    content: str  # Document content
+    metadata: Dict  # Metadata (file path, position, etc.)
+    md5: str = ""  # File MD5 value, for incremental update detection
 
 class FileProcessor:
-    """文件处理器基类"""
+    """Base class for file processor"""
     @staticmethod
     def can_handle(file_path: str) -> bool:
-        """判断是否可以处理该文件"""
+        """Determine if the file can be processed"""
         raise NotImplementedError
 
     @staticmethod
     def extract_text(file_path: str) -> str:
-        """提取文件文本内容"""
+        """Extract file text content"""
         raise NotImplementedError
 
 class TextFileProcessor(FileProcessor):
-    """文本文件处理器"""
+    """Text file processor"""
     ENCODINGS = ['utf-8', 'gbk', 'gb2312', 'latin1']
-    SAMPLE_SIZE = 8192  # 读取前8KB来检测编码
+    SAMPLE_SIZE = 8192  # Read the first 8KB to detect encoding
 
     @staticmethod
     def can_handle(file_path: str) -> bool:
-        """判断文件是否为文本文件,通过尝试解码来判断"""
+        """Determine if the file is a text file by trying to decode it"""
         try:
-            # 读取文件开头的一小部分来检测
+            # Read the first part of the file to detect encoding
             with open(file_path, 'rb') as f:
                 sample = f.read(TextFileProcessor.SAMPLE_SIZE)
 
-            # 检查是否包含空字节(通常表示二进制文件)
+            # Check if it contains null bytes (usually represents a binary file)
            if b'\x00' in sample:
                return False
 
-            # 检查是否包含过多的非打印字符(通常表示二进制文件)
+            # Check if it contains too many non-printable characters (usually represents a binary file)
            non_printable = sum(1 for byte in sample if byte < 32 and byte not in (9, 10, 13))  # tab, newline, carriage return
-            if non_printable / len(sample) > 0.3:  # 如果非打印字符超过30%,认为是二进制文件
+            if non_printable / len(sample) > 0.3:  # If non-printable characters exceed 30%, it is considered a binary file
                return False
 
-            # 尝试用不同编码解码
+            # Try to decode with different encodings
            for encoding in TextFileProcessor.ENCODINGS:
                try:
                    sample.decode(encoding)
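This hunk translates the comments around the text-vs-binary heuristic: a null-byte check plus a 30% non-printable-character threshold over an 8 KB sample, followed by trial decoding. As a reading aid, a minimal standalone sketch of the same logic (the empty-file guard is an addition of this sketch, not the package's code):

SAMPLE_SIZE = 8192  # read the first 8KB, as in TextFileProcessor
ENCODINGS = ['utf-8', 'gbk', 'gb2312', 'latin1']

def _decodes(data: bytes, encoding: str) -> bool:
    try:
        data.decode(encoding)
        return True
    except UnicodeDecodeError:
        return False

def looks_like_text(file_path: str) -> bool:
    with open(file_path, 'rb') as f:
        sample = f.read(SAMPLE_SIZE)
    if not sample:
        return True  # assumption: treat empty files as text
    if b'\x00' in sample:  # null bytes usually indicate a binary file
        return False
    non_printable = sum(1 for b in sample if b < 32 and b not in (9, 10, 13))
    if non_printable / len(sample) > 0.3:  # >30% non-printable: treat as binary
        return False
    return any(_decodes(sample, enc) for enc in ENCODINGS)

Note that latin1 decodes any byte sequence, so the final loop effectively always succeeds; the null-byte and non-printable checks are what actually reject binaries.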
@@ -73,14 +73,14 @@ class TextFileProcessor(FileProcessor):
 
     @staticmethod
     def extract_text(file_path: str) -> str:
-        """提取文本内容,使用检测到的正确编码"""
+        """Extract text content, using the detected correct encoding"""
         detected_encoding = None
         try:
-            # 首先尝试检测编码
+            # First try to detect encoding
             with open(file_path, 'rb') as f:
                 raw_data = f.read()
 
-            # 尝试不同的编码
+            # Try different encodings
             for encoding in TextFileProcessor.ENCODINGS:
                 try:
                     raw_data.decode(encoding)
@@ -90,23 +90,23 @@ class TextFileProcessor(FileProcessor):
                     continue
 
             if not detected_encoding:
-                raise UnicodeDecodeError(f"无法用支持的编码解码文件: {file_path}")
+                raise UnicodeDecodeError(f"Failed to decode file with supported encodings: {file_path}")
 
-            # 使用检测到的编码读取文件
+            # Use the detected encoding to read the file
             with open(file_path, 'r', encoding=detected_encoding, errors='replace') as f:
                 content = f.read()
 
-            # 规范化Unicode字符
+            # Normalize Unicode characters
             import unicodedata
             content = unicodedata.normalize('NFKC', content)
 
             return content
 
         except Exception as e:
-            raise Exception(f"读取文件失败: {str(e)}")
+            raise Exception(f"Failed to read file: {str(e)}")
 
 class PDFProcessor(FileProcessor):
-    """PDF文件处理器"""
+    """PDF file processor"""
     @staticmethod
     def can_handle(file_path: str) -> bool:
         return Path(file_path).suffix.lower() == '.pdf'
@@ -120,7 +120,7 @@ class PDFProcessor(FileProcessor):
         return "\n".join(text_parts)
 
 class DocxProcessor(FileProcessor):
-    """DOCX文件处理器"""
+    """DOCX file processor"""
     @staticmethod
     def can_handle(file_path: str) -> bool:
         return Path(file_path).suffix.lower() == '.docx'
@@ -132,58 +132,58 @@ class DocxProcessor(FileProcessor):
 
 class RAGTool:
     def __init__(self, root_dir: str):
-        """初始化RAG工具
+        """Initialize RAG tool
 
         Args:
-            root_dir: 项目根目录
+            root_dir: Project root directory
         """
         load_env_from_file()
         self.root_dir = root_dir
         os.chdir(self.root_dir)
 
-        # 初始化配置
-        self.min_paragraph_length = int(os.environ.get("JARVIS_MIN_PARAGRAPH_LENGTH", "50"))  # 最小段落长度
-        self.max_paragraph_length = int(os.environ.get("JARVIS_MAX_PARAGRAPH_LENGTH", "1000"))  # 最大段落长度
-        self.context_window = int(os.environ.get("JARVIS_CONTEXT_WINDOW", "5"))  # 上下文窗口大小,默认前后各5个片段
+        # Initialize configuration
+        self.min_paragraph_length = int(os.environ.get("JARVIS_MIN_PARAGRAPH_LENGTH", "50"))  # Minimum paragraph length
+        self.max_paragraph_length = int(os.environ.get("JARVIS_MAX_PARAGRAPH_LENGTH", "1000"))  # Maximum paragraph length
+        self.context_window = int(os.environ.get("JARVIS_CONTEXT_WINDOW", "5"))  # Context window size, default 5 fragments before and after
         self.max_context_length = int(get_max_context_length() * 0.8)
 
-        # 初始化数据目录
+        # Initialize data directory
         self.data_dir = os.path.join(self.root_dir, ".jarvis-rag")
         if not os.path.exists(self.data_dir):
             os.makedirs(self.data_dir)
 
-        # 初始化嵌入模型
+        # Initialize embedding model
         try:
             self.embedding_model = load_embedding_model()
             self.vector_dim = self.embedding_model.get_sentence_embedding_dimension()
-            PrettyOutput.print("模型加载完成", output_type=OutputType.SUCCESS)
+            PrettyOutput.print("Model loaded", output_type=OutputType.SUCCESS)
         except Exception as e:
-            PrettyOutput.print(f"加载模型失败: {str(e)}", output_type=OutputType.ERROR)
+            PrettyOutput.print(f"Failed to load model: {str(e)}", output_type=OutputType.ERROR)
             raise
 
-        # 初始化缓存和索引
+        # Initialize cache and index
         self.cache_path = os.path.join(self.data_dir, "cache.pkl")
         self.documents: List[Document] = []
-        self.index = None  # 用于搜索的IVF索引
-        self.flat_index = None  # 用于存储原始向量
-        self.file_md5_cache = {}  # 用于存储文件的MD5
+        self.index = None  # IVF index for search
+        self.flat_index = None  # Store original vectors
+        self.file_md5_cache = {}  # Store file MD5 values
 
-        # 加载缓存
+        # Load cache
         self._load_cache()
 
-        # 注册文件处理器
+        # Register file processors
         self.file_processors = [
             TextFileProcessor(),
             PDFProcessor(),
             DocxProcessor()
         ]
 
-        # 添加线程相关配置
+        # Add thread related configuration
         self.thread_count = int(os.environ.get("JARVIS_THREAD_COUNT", os.cpu_count() or 4))
-        self.vector_lock = Lock()  # 用于保护向量列表的并发访问
+        self.vector_lock = Lock()  # Protect vector list concurrency
 
     def _load_cache(self):
-        """加载缓存数据"""
+        """Load cache data"""
         if os.path.exists(self.cache_path):
             try:
                 with lzma.open(self.cache_path, 'rb') as f:
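The constructor above pulls its chunking and threading knobs from environment variables. A short usage sketch; the variable names are the ones read in __init__, the values are purely illustrative:

import os

os.environ["JARVIS_MIN_PARAGRAPH_LENGTH"] = "80"    # default 50
os.environ["JARVIS_MAX_PARAGRAPH_LENGTH"] = "1200"  # default 1000
os.environ["JARVIS_CONTEXT_WINDOW"] = "5"           # default 5
os.environ["JARVIS_THREAD_COUNT"] = "8"             # default: os.cpu_count() or 4

# Must be set before constructing RAGTool, since they are read in __init__.
# rag = RAGTool("/path/to/project")  # illustrative path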
@@ -195,10 +195,10 @@ class RAGTool:
                     # Rebuild index
                     if vectors is not None:
                         self._build_index(vectors)
-                    PrettyOutput.print(f"加载了 {len(self.documents)} 个文档片段",
+                    PrettyOutput.print(f"Loaded {len(self.documents)} document fragments",
                                        output_type=OutputType.INFO)
             except Exception as e:
-                PrettyOutput.print(f"加载缓存失败: {str(e)}",
+                PrettyOutput.print(f"Failed to load cache: {str(e)}",
                                    output_type=OutputType.WARNING)
                 self.documents = []
                 self.index = None
@@ -206,14 +206,14 @@ class RAGTool:
                 self.file_md5_cache = {}
 
     def _save_cache(self, vectors: np.ndarray):
-        """优化缓存保存"""
+        """Optimize cache saving"""
         try:
             cache_data = {
                 "version": "1.0",
                 "timestamp": datetime.now().isoformat(),
                 "documents": self.documents,
-                "vectors": vectors.copy() if vectors is not None else None,  # 创建数组的副本
-                "file_md5_cache": dict(self.file_md5_cache),  # 创建字典的副本
+                "vectors": vectors.copy() if vectors is not None else None,  # Create a copy of the array
+                "file_md5_cache": dict(self.file_md5_cache),  # Create a copy of the dictionary
                 "metadata": {
                     "vector_dim": self.vector_dim,
                     "total_docs": len(self.documents),
@@ -221,56 +221,56 @@ class RAGTool:
                 }
             }
 
-            # 先将数据序列化为字节流
+            # First serialize the data to a byte stream
             data = pickle.dumps(cache_data, protocol=pickle.HIGHEST_PROTOCOL)
 
-            # 然后使用 LZMA 压缩字节流
+            # Then use LZMA to compress the byte stream
             with lzma.open(self.cache_path, 'wb') as f:
                 f.write(data)
 
-            # 创建备份
+            # Create a backup
             backup_path = f"{self.cache_path}.backup"
             shutil.copy2(self.cache_path, backup_path)
 
-            PrettyOutput.print(f"缓存已保存: {len(self.documents)} 个文档片段",
+            PrettyOutput.print(f"Cache saved: {len(self.documents)} document fragments",
                                output_type=OutputType.INFO)
         except Exception as e:
-            PrettyOutput.print(f"保存缓存失败: {str(e)}",
+            PrettyOutput.print(f"Failed to save cache: {str(e)}",
                                output_type=OutputType.ERROR)
             raise
 
     def _build_index(self, vectors: np.ndarray):
-        """构建FAISS索引"""
+        """Build FAISS index"""
         if vectors.shape[0] == 0:
             self.index = None
             self.flat_index = None
             return
 
-        # 创建扁平索引存储原始向量,用于重建
+        # Create a flat index to store original vectors, for reconstruction
         self.flat_index = faiss.IndexFlatIP(self.vector_dim)
         self.flat_index.add(vectors)
 
-        # 创建IVF索引用于快速搜索
+        # Create an IVF index for fast search
         nlist = max(4, int(vectors.shape[0] / 1000))  # One cluster centroid per 1000 vectors
         quantizer = faiss.IndexFlatIP(self.vector_dim)
         self.index = faiss.IndexIVFFlat(quantizer, self.vector_dim, nlist, faiss.METRIC_INNER_PRODUCT)
 
-        # 训练并添加向量
+        # Train and add vectors
         self.index.train(vectors)
         self.index.add(vectors)
-        # 设置搜索时探测的聚类数
+        # Set the number of clusters to probe during search
         self.index.nprobe = min(nlist, 10)
 
     def _split_text(self, text: str) -> List[str]:
-        """使用更智能的分块策略"""
-        # 添加重叠分块以保持上下文连贯性
+        """Use a more intelligent splitting strategy"""
+        # Add overlapping blocks to maintain context consistency
         overlap_size = min(200, self.max_paragraph_length // 4)
 
         paragraphs = []
         current_chunk = []
         current_length = 0
 
-        # 首先按句子分割
+        # First split by sentence
         sentences = []
         current_sentence = []
         sentence_ends = {'。', '!', '?', '…', '.', '!', '?'}
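The _build_index changes in this hunk pair a flat inner-product index (kept so vectors can be reconstructed later) with an IVF index for search. A self-contained sketch of the same construction on random data, assuming faiss and numpy are installed; the dimension is illustrative:

import faiss
import numpy as np

dim = 384  # illustrative embedding dimension
vectors = np.random.rand(5000, dim).astype(np.float32)
faiss.normalize_L2(vectors)  # inner product on unit vectors equals cosine similarity

# Flat index: stores raw vectors, used for reconstruction.
flat_index = faiss.IndexFlatIP(dim)
flat_index.add(vectors)

# IVF index: one cluster centroid per 1000 vectors, at least 4, as in the diff.
nlist = max(4, vectors.shape[0] // 1000)
quantizer = faiss.IndexFlatIP(dim)
index = faiss.IndexIVFFlat(quantizer, dim, nlist, faiss.METRIC_INNER_PRODUCT)
index.train(vectors)           # IVF indexes must be trained before adding
index.add(vectors)
index.nprobe = min(nlist, 10)  # clusters probed per query

distances, indices = index.search(vectors[:1], 5)
print(indices[0])  # ids of the nearest neighbours

Raising nprobe trades speed for recall; the diff caps it at 10.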
@@ -288,7 +288,7 @@ class RAGTool:
             if sentence.strip():
                 sentences.append(sentence)
 
-        # 基于句子构建重叠块
+        # Build overlapping blocks based on sentences
         for sentence in sentences:
             if current_length + len(sentence) > self.max_paragraph_length:
                 if current_chunk:
@@ -296,8 +296,8 @@ class RAGTool:
                     chunk_text = ' '.join(current_chunk)
                     if len(chunk_text) >= self.min_paragraph_length:
                         paragraphs.append(chunk_text)
 
-                    # 保留部分内容作为重叠
-                    overlap_text = ' '.join(current_chunk[-2:])  # 保留最后两句
+                    # Keep some content as overlap
+                    overlap_text = ' '.join(current_chunk[-2:])  # Keep the last two sentences
                     current_chunk = []
                     if overlap_text:
                         current_chunk.append(overlap_text)
@@ -308,7 +308,7 @@ class RAGTool:
             current_chunk.append(sentence)
             current_length += len(sentence)
 
-        # 处理最后一个chunk
+        # Process the last chunk
         if current_chunk:
             chunk_text = ' '.join(current_chunk)
             if len(chunk_text) >= self.min_paragraph_length:
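The _split_text changes above describe sentence-based chunks with a two-sentence tail carried over as overlap. A simplified standalone version of the same idea (the sentence terminators and the two-sentence overlap come from the diff; the length limits are the defaults):

from typing import List

SENTENCE_ENDS = {'。', '!', '?', '…', '.', '!', '?'}

def split_sentences(text: str) -> List[str]:
    sentences, current = [], []
    for ch in text:
        current.append(ch)
        if ch in SENTENCE_ENDS:
            sentences.append(''.join(current).strip())
            current = []
    if current:
        sentences.append(''.join(current).strip())
    return [s for s in sentences if s]

def chunk_with_overlap(text: str, max_len: int = 1000, min_len: int = 50) -> List[str]:
    chunks, current, length = [], [], 0
    for sentence in split_sentences(text):
        if length + len(sentence) > max_len and current:
            chunk = ' '.join(current)
            if len(chunk) >= min_len:
                chunks.append(chunk)
            current = current[-2:]  # keep the last two sentences as overlap
            length = sum(len(s) for s in current)
        current.append(sentence)
        length += len(sentence)
    if current:
        chunk = ' '.join(current)
        if len(chunk) >= min_len:
            chunks.append(chunk)
    return chunks

The overlap means a fact straddling a chunk boundary still appears whole in at least one chunk, at the cost of some duplicated text in the index.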
@@ -317,65 +317,65 @@ class RAGTool:
         return paragraphs
 
     def _get_embedding(self, text: str) -> np.ndarray:
-        """获取文本的向量表示"""
+        """Get the vector representation of the text"""
         embedding = self.embedding_model.encode(text,
                                                 normalize_embeddings=True,
                                                 show_progress_bar=False)
         return np.array(embedding, dtype=np.float32)
 
     def _get_embedding_batch(self, texts: List[str]) -> np.ndarray:
-        """批量获取文本的向量表示
+        """Get the vector representation of the text batch
 
         Args:
-            texts: 文本列表
+            texts: Text list
 
         Returns:
-            np.ndarray: 向量表示数组
+            np.ndarray: Vector representation array
         """
         try:
             embeddings = self.embedding_model.encode(texts,
                                                      normalize_embeddings=True,
                                                      show_progress_bar=False,
-                                                     batch_size=32)  # 使用批处理提高效率
+                                                     batch_size=32)  # Use batch processing to improve efficiency
             return np.array(embeddings, dtype=np.float32)
         except Exception as e:
-            PrettyOutput.print(f"获取向量表示失败: {str(e)}",
+            PrettyOutput.print(f"Failed to get vector representation: {str(e)}",
                                output_type=OutputType.ERROR)
             return np.zeros((len(texts), self.vector_dim), dtype=np.float32)
 
     def _process_document_batch(self, documents: List[Document]) -> List[np.ndarray]:
-        """处理一批文档的向量化
+        """Process a batch of documents vectorization
 
         Args:
-            documents: 文档列表
+            documents: Document list
 
         Returns:
-            List[np.ndarray]: 向量列表
+            List[np.ndarray]: Vector list
         """
         texts = []
         for doc in documents:
-            # 组合文档信息
+            # Combine document information
             combined_text = f"""
-文件: {doc.metadata['file_path']}
-内容: {doc.content}
+File: {doc.metadata['file_path']}
+Content: {doc.content}
             """
             texts.append(combined_text)
 
         return self._get_embedding_batch(texts)
 
     def _process_file(self, file_path: str) -> List[Document]:
-        """处理单个文件"""
+        """Process a single file"""
         try:
-            # 计算文件MD5
+            # Calculate file MD5
             current_md5 = get_file_md5(file_path)
             if not current_md5:
                 return []
 
-            # 检查文件是否需要重新处理
+            # Check if the file needs to be reprocessed
             if file_path in self.file_md5_cache and self.file_md5_cache[file_path] == current_md5:
                 return []
 
-            # 查找合适的处理器
+            # Find the appropriate processor
             processor = None
             for p in self.file_processors:
                 if p.can_handle(file_path):
@@ -383,18 +383,18 @@ class RAGTool:
                     break
 
             if not processor:
-                # 如果找不到合适的处理器,则返回一个空的文档
+                # If no appropriate processor is found, return an empty document
                 return []
 
-            # 提取文本内容
+            # Extract text content
             content = processor.extract_text(file_path)
             if not content.strip():
                 return []
 
-            # 分割文本
+            # Split text
             chunks = self._split_text(content)
 
-            # 创建文档对象
+            # Create document objects
             documents = []
             for i, chunk in enumerate(chunks):
                 doc = Document(
@@ -409,18 +409,18 @@ class RAGTool:
                 )
                 documents.append(doc)
 
-            # 更新MD5缓存
+            # Update MD5 cache
             self.file_md5_cache[file_path] = current_md5
             return documents
 
         except Exception as e:
-            PrettyOutput.print(f"处理文件失败 {file_path}: {str(e)}",
+            PrettyOutput.print(f"Failed to process file {file_path}: {str(e)}",
                                output_type=OutputType.ERROR)
             return []
 
     def build_index(self, dir: str):
-        """构建文档索引"""
-        # 获取所有文件
+        """Build document index"""
+        # Get all files
         all_files = []
         for root, _, files in os.walk(dir):
             if any(ignored in root for ignored in ['.git', '__pycache__', 'node_modules']) or \
@@ -432,69 +432,69 @@ class RAGTool:
 
                 file_path = os.path.join(root, file)
                 if os.path.getsize(file_path) > 100 * 1024 * 1024:  # 100MB
-                    PrettyOutput.print(f"跳过大文件: {file_path}",
+                    PrettyOutput.print(f"Skip large file: {file_path}",
                                        output_type=OutputType.WARNING)
                     continue
                 all_files.append(file_path)
 
-        # 清理已删除文件的缓存
+        # Clean up cache for deleted files
         deleted_files = set(self.file_md5_cache.keys()) - set(all_files)
         for file_path in deleted_files:
             del self.file_md5_cache[file_path]
-            # 移除相关的文档
+            # Remove related documents
             self.documents = [doc for doc in self.documents if doc.metadata['file_path'] != file_path]
 
-        # 检查文件变化
+        # Check file changes
         files_to_process = []
         unchanged_files = []
 
-        with tqdm(total=len(all_files), desc="检查文件状态") as pbar:
+        with tqdm(total=len(all_files), desc="Check file status") as pbar:
             for file_path in all_files:
                 current_md5 = get_file_md5(file_path)
-                if current_md5:  # 只处理能成功计算MD5的文件
+                if current_md5:  # Only process files that can successfully calculate MD5
                     if file_path in self.file_md5_cache and self.file_md5_cache[file_path] == current_md5:
-                        # 文件未变化,记录但不重新处理
+                        # File unchanged, record it but do not reprocess
                         unchanged_files.append(file_path)
                     else:
-                        # 新文件或已修改的文件
+                        # New file or modified file
                         files_to_process.append(file_path)
                 pbar.update(1)
 
-        # 保留未变化文件的文档
+        # Keep documents for unchanged files
         unchanged_documents = [doc for doc in self.documents
                                if doc.metadata['file_path'] in unchanged_files]
 
-        # 处理新文件和修改的文件
+        # Process new files and modified files
         new_documents = []
         if files_to_process:
-            with tqdm(total=len(files_to_process), desc="处理文件") as pbar:
+            with tqdm(total=len(files_to_process), desc="Process files") as pbar:
                 for file_path in files_to_process:
                     try:
                         docs = self._process_file(file_path)
                         if len(docs) > 0:
                             new_documents.extend(docs)
                     except Exception as e:
-                        PrettyOutput.print(f"处理文件失败 {file_path}: {str(e)}",
+                        PrettyOutput.print(f"Failed to process file {file_path}: {str(e)}",
                                            output_type=OutputType.ERROR)
                     pbar.update(1)
 
-        # 更新文档列表
+        # Update document list
         self.documents = unchanged_documents + new_documents
 
         if not self.documents:
-            PrettyOutput.print("没有需要处理的文档", output_type=OutputType.WARNING)
+            PrettyOutput.print("No documents to process", output_type=OutputType.WARNING)
             return
 
-        # 只对新文档进行向量化
+        # Only vectorize new documents
         if new_documents:
-            PrettyOutput.print(f"开始处理 {len(new_documents)} 个新文档",
+            PrettyOutput.print(f"Start processing {len(new_documents)} new documents",
                                output_type=OutputType.INFO)
 
-            # 使用线程池并发处理向量化
+            # Use thread pool to process vectorization
             batch_size = 32
             new_vectors = []
 
-            with tqdm(total=len(new_documents), desc="生成向量") as pbar:
+            with tqdm(total=len(new_documents), desc="Generating vectors") as pbar:
                 with ThreadPoolExecutor(max_workers=self.thread_count) as executor:
                     for i in range(0, len(new_documents), batch_size):
                         batch = new_documents[i:i + batch_size]
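The incremental logic above hinges on comparing each file's current MD5 against file_md5_cache. get_file_md5 lives in jarvis/utils.py and is not shown in this diff; a plausible hashlib equivalent, labeled as an assumption rather than the package's code:

import hashlib

def file_md5(file_path: str) -> str:
    """Hypothetical stand-in for get_file_md5 from jarvis/utils.py."""
    h = hashlib.md5()
    with open(file_path, 'rb') as f:
        for block in iter(lambda: f.read(8192), b''):
            h.update(block)
    return h.hexdigest()

md5_cache = {}  # file path -> last seen MD5, mirroring file_md5_cache

def needs_processing(file_path: str) -> bool:
    current = file_md5(file_path)
    if md5_cache.get(file_path) == current:
        return False  # unchanged: reuse previously indexed documents
    md5_cache[file_path] = current
    return True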
@@ -506,16 +506,16 @@ class RAGTool:
 
                         pbar.update(len(batch))
 
-            # 合并新旧向量
+            # Merge new and old vectors
             if self.flat_index is not None:
-                # 获取未变化文档的向量
+                # Get vectors for unchanged documents
                 unchanged_vectors = []
                 for doc in unchanged_documents:
-                    # 从现有索引中提取向量
+                    # Get vectors from existing index
                     doc_idx = next((i for i, d in enumerate(self.documents)
                                     if d.metadata['file_path'] == doc.metadata['file_path']), None)
                     if doc_idx is not None:
-                        # 从扁平索引中重建向量
+                        # Reconstruct vectors from flat index
                         vector = np.zeros((1, self.vector_dim), dtype=np.float32)
                         self.flat_index.reconstruct(doc_idx, vector.ravel())
                         unchanged_vectors.append(vector)
@@ -528,21 +528,21 @@ class RAGTool:
             else:
                 vectors = np.vstack(new_vectors)
 
-        # 构建索引
+        # Build index
         self._build_index(vectors)
-        # 保存缓存
+        # Save cache
         self._save_cache(vectors)
 
-        PrettyOutput.print(f"成功索引了 {len(self.documents)} 个文档片段 (新增/修改: {len(new_documents)}, 未变化: {len(unchanged_documents)})",
+        PrettyOutput.print(f"Successfully indexed {len(self.documents)} document fragments (Added/Modified: {len(new_documents)}, Unchanged: {len(unchanged_documents)})",
                            output_type=OutputType.SUCCESS)
 
     def search(self, query: str, top_k: int = 30) -> List[Tuple[Document, float]]:
-        """优化搜索策略"""
+        """Optimize search strategy"""
         if not self.index:
-            PrettyOutput.print("索引未构建,正在构建...", output_type=OutputType.INFO)
+            PrettyOutput.print("Index not built, building...", output_type=OutputType.INFO)
             self.build_index(self.root_dir)
 
-        # 实现MMR (Maximal Marginal Relevance) 来增加结果多样性
+        # Implement MMR (Maximal Marginal Relevance) to increase result diversity
         def mmr(query_vec, doc_vecs, doc_ids, lambda_param=0.5, n_docs=top_k):
             selected = []
             selected_ids = []
@@ -552,10 +552,10 @@ class RAGTool:
                 best_idx = -1
 
                 for i, (doc_vec, doc_id) in enumerate(zip(doc_vecs, doc_ids)):
-                    # 计算与查询的相似度
+                    # Calculate similarity with query
                     query_sim = float(np.dot(query_vec, doc_vec))
 
-                    # 计算与已选文档的最大相似度
+                    # Calculate maximum similarity with selected documents
                     if selected:
                         doc_sims = [float(np.dot(doc_vec, selected_doc)) for selected_doc in selected]
                         max_doc_sim = max(doc_sims)
@@ -579,22 +579,22 @@ class RAGTool:
 
             return selected_ids
 
-        # 获取查询向量
+        # Get query vector
         query_vector = self._get_embedding(query)
         query_vector = query_vector.reshape(1, -1)
 
-        # 初始搜索更多结果用于MMR
+        # Initial search more results for MMR
         initial_k = min(top_k * 2, len(self.documents))
         distances, indices = self.index.search(query_vector, initial_k)
 
-        # 获取有效结果
+        # Get valid results
         valid_indices = indices[0][indices[0] != -1]
         valid_vectors = np.vstack([self._get_embedding(self.documents[idx].content) for idx in valid_indices])
 
-        # 应用MMR
+        # Apply MMR
         final_indices = mmr(query_vector[0], valid_vectors, valid_indices, n_docs=top_k)
 
-        # 构建结果
+        # Build results
         results = []
         for idx in final_indices:
             doc = self.documents[idx]
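The mmr helper balances relevance to the query against redundancy with already-selected documents: score = lambda * sim(query, d) - (1 - lambda) * max sim(d, selected). A compact standalone version of the same greedy loop, using the diff's lambda_param default of 0.5:

import numpy as np

def mmr_select(query_vec, doc_vecs, doc_ids, lambda_param=0.5, n_docs=5):
    selected_vecs, selected_ids = [], []
    remaining = list(range(len(doc_ids)))
    while remaining and len(selected_ids) < n_docs:
        best_i, best_score = None, -np.inf
        for i in remaining:
            query_sim = float(np.dot(query_vec, doc_vecs[i]))
            # Penalize similarity to anything already picked.
            max_doc_sim = max((float(np.dot(doc_vecs[i], s)) for s in selected_vecs),
                              default=0.0)
            score = lambda_param * query_sim - (1 - lambda_param) * max_doc_sim
            if score > best_score:
                best_i, best_score = i, score
        selected_vecs.append(doc_vecs[best_i])
        selected_ids.append(doc_ids[best_i])
        remaining.remove(best_i)
    return selected_ids

With lambda_param = 1.0 this degenerates to plain top-k by query similarity; lower values spread the results across the corpus.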
@@ -604,22 +604,22 @@ class RAGTool:
 
         return results
 
     def _rerank_results(self, query: str, initial_results: List[Tuple[Document, float]]) -> List[Tuple[Document, float]]:
-        """使用 rerank 模型重新排序搜索结果"""
+        """Use rerank model to rerank search results"""
         try:
             import torch
             model, tokenizer = load_rerank_model()
 
-            # 准备数据
+            # Prepare data
             pairs = []
             for doc, _ in initial_results:
-                # 组合文档信息
+                # Combine document information
                 doc_content = f"""
-文件: {doc.metadata['file_path']}
-内容: {doc.content}
+File: {doc.metadata['file_path']}
+Content: {doc.content}
                 """
                 pairs.append([query, doc_content])
 
-            # 对每个文档对进行打分
+            # Score each document pair
             scores = []
             batch_size = 8
@@ -641,133 +641,133 @@ class RAGTool:
                     batch_scores = outputs.logits.squeeze(-1).cpu().numpy()
                     scores.extend(batch_scores.tolist())
 
-            # 归一化分数到 0-1 范围
+            # Normalize scores to 0-1 range
             if scores:
                 min_score = min(scores)
                 max_score = max(scores)
                 if max_score > min_score:
                     scores = [(s - min_score) / (max_score - min_score) for s in scores]
 
-            # 将分数与文档组合并排序
+            # Combine scores with documents and sort
             scored_results = []
             for (doc, _), score in zip(initial_results, scores):
-                if score >= 0.5:  # 只保留关联度大于 0.5 的结果
+                if score >= 0.5:  # Only keep results with a score greater than 0.5
                     scored_results.append((doc, float(score)))
 
-            # 按分数降序排序
+            # Sort by score in descending order
             scored_results.sort(key=lambda x: x[1], reverse=True)
 
             return scored_results
 
         except Exception as e:
-            PrettyOutput.print(f"重排序失败,使用原始排序: {str(e)}", output_type=OutputType.WARNING)
+            PrettyOutput.print(f"Failed to rerank, using original sorting: {str(e)}", output_type=OutputType.WARNING)
             return initial_results
 
     def is_index_built(self):
-        """检查索引是否已构建"""
+        """Check if index is built"""
         return self.index is not None
 
     def query(self, query: str) -> List[Document]:
-        """查询相关文档
+        """Query related documents
 
         Args:
-            query: 查询文本
+            query: Query text
 
         Returns:
-            相关文档列表,包含上下文
+            List[Document]: Related documents, including context
         """
         results = self.search(query)
         return [doc for doc, _ in results]
 
     def ask(self, question: str) -> Optional[str]:
-        """询问关于文档的问题
+        """Ask about documents
 
         Args:
-            question: 用户问题
+            question: User question
 
         Returns:
-            模型回答,如果失败则返回 None
+            Model answer, return None if failed
         """
         try:
-            # 搜索相关文档片段
+            # Search related document fragments
             results = self.query(question)
             if not results:
                 return None
 
-            # 显示找到的文档片段
+            # Display found document fragments
             for doc in results:
-                PrettyOutput.print(f"文件: {doc.metadata['file_path']}", output_type=OutputType.INFO)
-                PrettyOutput.print(f"片段 {doc.metadata['chunk_index'] + 1}/{doc.metadata['total_chunks']}",
+                PrettyOutput.print(f"File: {doc.metadata['file_path']}", output_type=OutputType.INFO)
+                PrettyOutput.print(f"Fragment {doc.metadata['chunk_index'] + 1}/{doc.metadata['total_chunks']}",
                                    output_type=OutputType.INFO)
-                PrettyOutput.print("\n内容:", output_type=OutputType.INFO)
+                PrettyOutput.print("\nContent:", output_type=OutputType.INFO)
                 content = doc.content.encode('utf-8', errors='replace').decode('utf-8')
                 PrettyOutput.print(content, output_type=OutputType.INFO)
 
-            # 构建基础提示词
-            base_prompt = f"""请基于以下文档片段回答用户的问题。如果文档内容不足以完整回答问题,请明确指出。
+            # Build base prompt
+            base_prompt = f"""Please answer the user's question based on the following document fragments. If the document content is not sufficient to answer the question completely, please clearly indicate.
 
-用户问题: {question}
+User question: {question}
 
-相关文档片段:
+Related document fragments:
 """
-            end_prompt = "\n请提供准确、简洁的回答,如果文档内容不足以完整回答问题,请明确指出。"
+            end_prompt = "\nPlease provide an accurate and concise answer. If the document content is not sufficient to answer the question completely, please clearly indicate."
 
-            # 计算可用于文档内容的最大长度
-            # 预留一些空间给模型回答
+            # Calculate the maximum length that can be used for document content
+            # Leave some space for the model's answer
             available_length = self.max_context_length - len(base_prompt) - len(end_prompt) - 500
 
-            # 构建上下文,同时控制总长度
+            # Build context, while controlling the total length
             context = []
             current_length = 0
 
             for doc in results:
-                # 计算这个文档片段的内容长度
+                # Calculate the length of this document fragment's content
                 doc_content = f"""
-来源文件: {doc.metadata['file_path']}
-内容:
+Source file: {doc.metadata['file_path']}
+Content:
 {doc.content}
 ---
 """
                 content_length = len(doc_content)
 
-                # 如果添加这个片段会超出限制,就停止添加
+                # If adding this fragment would exceed the limit, stop adding
                 if current_length + content_length > available_length:
-                    PrettyOutput.print("由于上下文长度限制,部分相关文档片段被省略",
+                    PrettyOutput.print("Due to context length limit, some related document fragments were omitted",
                                        output_type=OutputType.WARNING)
                     break
 
                 context.append(doc_content)
                 current_length += content_length
 
-            # 构建完整的提示词
+            # Build complete prompt
             prompt = base_prompt + ''.join(context) + end_prompt
 
-            # 获取模型实例并生成回答
+            # Get model instance and generate answer
             model = PlatformRegistry.get_global_platform_registry().get_normal_platform()
-            response = model.chat(prompt)
+            response = model.chat_until_success(prompt)
 
             return response
 
         except Exception as e:
-            PrettyOutput.print(f"问答失败: {str(e)}", output_type=OutputType.ERROR)
+            PrettyOutput.print(f"Failed to answer: {str(e)}", output_type=OutputType.ERROR)
             return None
 
 def main():
-    """主函数"""
+    """Main function"""
     import argparse
     import sys
 
-    # 设置标准输出编码为UTF-8
+    # Set standard output encoding to UTF-8
     if sys.stdout.encoding != 'utf-8':
         import codecs
         sys.stdout = codecs.getwriter('utf-8')(sys.stdout.buffer, 'strict')
         sys.stderr = codecs.getwriter('utf-8')(sys.stderr.buffer, 'strict')
 
-    parser = argparse.ArgumentParser(description='文档检索和分析工具')
-    parser.add_argument('--dir', type=str, help='要处理的文档目录')
-    parser.add_argument('--build', action='store_true', help='构建文档索引')
-    parser.add_argument('--search', type=str, help='搜索文档内容')
-    parser.add_argument('--ask', type=str, help='询问关于文档的问题')
+    parser = argparse.ArgumentParser(description='Document retrieval and analysis tool')
+    parser.add_argument('--dir', type=str, help='Directory to process')
+    parser.add_argument('--build', action='store_true', help='Build document index')
+    parser.add_argument('--search', type=str, help='Search document content')
+    parser.add_argument('--ask', type=str, help='Ask about documents')
     args = parser.parse_args()
 
     try:
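Putting the pieces together, main() just routes the CLI flags to the methods above. The equivalent programmatic flow, as a sketch (the path is illustrative, and constructing RAGTool loads the embedding model, so the model assets must be available):

from jarvis.jarvis_rag.main import RAGTool

rag = RAGTool("/path/to/project")   # illustrative path; also chdirs into it
rag.build_index(rag.root_dir)       # --build: index the directory incrementally

docs = rag.query("how is the cache saved?")               # --search
answer = rag.ask("How does incremental indexing work?")   # --ask
if answer:
    print(answer)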
@@ -778,7 +778,7 @@ def main():
             args.dir = current_dir
 
         if args.dir and args.build:
-            PrettyOutput.print(f"正在处理目录: {args.dir}", output_type=OutputType.INFO)
+            PrettyOutput.print(f"Processing directory: {args.dir}", output_type=OutputType.INFO)
             rag.build_index(args.dir)
             return 0
 
@@ -787,35 +787,35 @@ def main():
         if args.search:
             results = rag.query(args.search)
             if not results:
-                PrettyOutput.print("未找到相关内容", output_type=OutputType.WARNING)
+                PrettyOutput.print("No related content found", output_type=OutputType.WARNING)
                 return 1
 
             for doc in results:
-                PrettyOutput.print(f"\n文件: {doc.metadata['file_path']}", output_type=OutputType.INFO)
-                PrettyOutput.print(f"片段 {doc.metadata['chunk_index'] + 1}/{doc.metadata['total_chunks']}",
+                PrettyOutput.print(f"\nFile: {doc.metadata['file_path']}", output_type=OutputType.INFO)
+                PrettyOutput.print(f"Fragment {doc.metadata['chunk_index'] + 1}/{doc.metadata['total_chunks']}",
                                    output_type=OutputType.INFO)
-                PrettyOutput.print("\n内容:", output_type=OutputType.INFO)
+                PrettyOutput.print("\nContent:", output_type=OutputType.INFO)
                 content = doc.content.encode('utf-8', errors='replace').decode('utf-8')
                 PrettyOutput.print(content, output_type=OutputType.INFO)
             return 0
 
         if args.ask:
-            # 调用 ask 方法
+            # Call ask method
            response = rag.ask(args.ask)
            if not response:
-                PrettyOutput.print("未能获取答案", output_type=OutputType.WARNING)
+                PrettyOutput.print("Failed to get answer", output_type=OutputType.WARNING)
                return 1
 
-            # 显示回答
-            PrettyOutput.print("\n回答:", output_type=OutputType.INFO)
+            # Display answer
+            PrettyOutput.print("\nAnswer:", output_type=OutputType.INFO)
            PrettyOutput.print(response, output_type=OutputType.INFO)
            return 0
 
-        PrettyOutput.print("请指定操作参数。使用 -h 查看帮助。", output_type=OutputType.WARNING)
+        PrettyOutput.print("Please specify operation parameters. Use -h to view help.", output_type=OutputType.WARNING)
        return 1
 
    except Exception as e:
-        PrettyOutput.print(f"执行失败: {str(e)}", output_type=OutputType.ERROR)
+        PrettyOutput.print(f"Failed to execute: {str(e)}", output_type=OutputType.ERROR)
        return 1
 
 if __name__ == "__main__":