auto-coder 0.1.400__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of auto-coder might be problematic.

Files changed (48)
  1. {auto_coder-0.1.400.dist-info → auto_coder-1.0.0.dist-info}/METADATA +1 -1
  2. {auto_coder-0.1.400.dist-info → auto_coder-1.0.0.dist-info}/RECORD +48 -31
  3. autocoder/agent/agentic_filter.py +1 -1
  4. autocoder/agent/base_agentic/tools/read_file_tool_resolver.py +1 -1
  5. autocoder/auto_coder_runner.py +120 -26
  6. autocoder/chat_auto_coder.py +81 -22
  7. autocoder/commands/auto_command.py +1 -1
  8. autocoder/common/__init__.py +2 -2
  9. autocoder/common/file_monitor/test_file_monitor.py +307 -0
  10. autocoder/common/git_utils.py +7 -2
  11. autocoder/common/pruner/__init__.py +0 -0
  12. autocoder/common/pruner/agentic_conversation_pruner.py +197 -0
  13. autocoder/common/pruner/context_pruner.py +574 -0
  14. autocoder/common/pruner/conversation_pruner.py +132 -0
  15. autocoder/common/pruner/test_agentic_conversation_pruner.py +342 -0
  16. autocoder/common/pruner/test_context_pruner.py +546 -0
  17. autocoder/common/tokens/__init__.py +15 -0
  18. autocoder/common/tokens/counter.py +20 -0
  19. autocoder/common/v2/agent/agentic_edit.py +372 -538
  20. autocoder/common/v2/agent/agentic_edit_tools/__init__.py +8 -1
  21. autocoder/common/v2/agent/agentic_edit_tools/ac_mod_read_tool_resolver.py +40 -0
  22. autocoder/common/v2/agent/agentic_edit_tools/ac_mod_write_tool_resolver.py +43 -0
  23. autocoder/common/v2/agent/agentic_edit_tools/ask_followup_question_tool_resolver.py +8 -0
  24. autocoder/common/v2/agent/agentic_edit_tools/execute_command_tool_resolver.py +1 -1
  25. autocoder/common/v2/agent/agentic_edit_tools/read_file_tool_resolver.py +1 -1
  26. autocoder/common/v2/agent/agentic_edit_tools/search_files_tool_resolver.py +33 -88
  27. autocoder/common/v2/agent/agentic_edit_tools/test_write_to_file_tool_resolver.py +8 -8
  28. autocoder/common/v2/agent/agentic_edit_tools/todo_read_tool_resolver.py +118 -0
  29. autocoder/common/v2/agent/agentic_edit_tools/todo_write_tool_resolver.py +324 -0
  30. autocoder/common/v2/agent/agentic_edit_types.py +46 -4
  31. autocoder/common/v2/agent/runner/__init__.py +31 -0
  32. autocoder/common/v2/agent/runner/base_runner.py +106 -0
  33. autocoder/common/v2/agent/runner/event_runner.py +216 -0
  34. autocoder/common/v2/agent/runner/sdk_runner.py +40 -0
  35. autocoder/common/v2/agent/runner/terminal_runner.py +283 -0
  36. autocoder/common/v2/agent/runner/tool_display.py +191 -0
  37. autocoder/index/entry.py +1 -1
  38. autocoder/plugins/token_helper_plugin.py +107 -7
  39. autocoder/run_context.py +9 -0
  40. autocoder/sdk/__init__.py +114 -81
  41. autocoder/sdk/cli/main.py +5 -0
  42. autocoder/sdk/core/auto_coder_core.py +0 -158
  43. autocoder/sdk/core/bridge.py +2 -4
  44. autocoder/version.py +1 -1
  45. {auto_coder-0.1.400.dist-info → auto_coder-1.0.0.dist-info}/WHEEL +0 -0
  46. {auto_coder-0.1.400.dist-info → auto_coder-1.0.0.dist-info}/entry_points.txt +0 -0
  47. {auto_coder-0.1.400.dist-info → auto_coder-1.0.0.dist-info}/licenses/LICENSE +0 -0
  48. {auto_coder-0.1.400.dist-info → auto_coder-1.0.0.dist-info}/top_level.txt +0 -0
autocoder/common/pruner/context_pruner.py
@@ -0,0 +1,574 @@
+ from typing import List, Dict, Any, Union
+ from typing import Tuple
+ from pathlib import Path
+ import json
+ from loguru import logger
+ from autocoder.common.tokens import count_string_tokens as count_tokens
+ from autocoder.common import AutoCoderArgs, SourceCode
+ from byzerllm.utils.client.code_utils import extract_code
+ from autocoder.index.types import VerifyFileRelevance
+ import byzerllm
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+
+ from autocoder.common.printer import Printer
+ from autocoder.common.auto_coder_lang import get_message_with_format
+
+
+ class PruneContext:
+     def __init__(self, max_tokens: int, args: AutoCoderArgs, llm: Union[byzerllm.ByzerLLM, byzerllm.SimpleByzerLLM], verbose: bool = False):
+         self.max_tokens = max_tokens
+         self.args = args
+         self.llm = llm
+         self.printer = Printer()
+         self.verbose = verbose
+
+     def _split_content_with_sliding_window(self, content: str, window_size=100, overlap=20) -> List[Tuple[int, int, str]]:
+         """Split large file content with a sliding window, returning chunks annotated with line numbers.
+
+         Args:
+             content: the file content to split
+             window_size: number of lines per window
+             overlap: number of overlapping lines between adjacent windows
+
+         Returns:
+             List[Tuple[int, int, str]]: a list of tuples, each containing:
+                 - start line number (1-based), absolute within the original file
+                 - end line number, absolute within the original file
+                 - the chunk text with line numbers prepended
+         """
+         # Split the content into lines
+         lines = content.splitlines()
+         chunks = []
+         start = 0
+
+         while start < len(lines):
+             # Compute the end position of the current window
+             end = min(start + window_size, len(lines))
+
+             # Compute the actual start position (accounting for overlap)
+             actual_start = max(0, start - overlap)
+
+             # Extract the lines of the current window
+             chunk_lines = lines[actual_start:end]
+
+             # Prepend a line number to every line.
+             # Numbering starts at actual_start+1 so it matches the absolute line numbers of the original file.
+             chunk_content = "\n".join([
+                 f"{i+1} {line}" for i, line in enumerate(chunk_lines, start=actual_start)
+             ])
+
+             # Record the chunk info: (start line, end line, numbered content).
+             # Line numbers are 1-based.
+             chunks.append((actual_start + 1, end, chunk_content))
+
+             # Advance to the start of the next window.
+             # Subtracting overlap keeps adjacent windows overlapping.
+             start += (window_size - overlap)
+
+         return chunks
+
+
+     def _delete_overflow_files(self, file_sources: List[SourceCode]) -> List[SourceCode]:
+         """Simply drop the files that exceed the token limit"""
+         total_tokens = 0
+         selected_files = []
+         token_count = 0
+         for file_source in file_sources:
+             try:
+                 token_count = file_source.tokens
+                 if token_count <= 0:
+                     token_count = count_tokens(file_source.source_code)
+
+                 if total_tokens + token_count <= self.max_tokens:
+                     total_tokens += token_count
+                     print(f"{file_source.module_name} {token_count}")
+                     selected_files.append(file_source)
+                 else:
+                     break
+             except Exception as e:
+                 logger.error(f"Failed to read file {file_source.module_name}: {e}")
+                 selected_files.append(file_source)
+
+         return selected_files
+
+     def _extract_code_snippets(self, file_sources: List[SourceCode], conversations: List[Dict[str, str]]) -> List[SourceCode]:
+         """Strategy: extract only the key code snippets"""
+         token_count = 0
+         selected_files = []
+         full_file_tokens = int(self.max_tokens * 0.8)
+
+         if self.verbose:
+             total_input_tokens = sum(f.tokens for f in file_sources)
+             self.printer.print_str_in_terminal(f"🚀 Starting code snippet extraction: {len(file_sources)} files, total tokens: {total_input_tokens}")
+             self.printer.print_str_in_terminal(f"📋 Strategy: whole-file priority threshold={full_file_tokens}, max token limit={self.max_tokens}")
+
+         @byzerllm.prompt()
+         def extract_code_snippets(conversations: List[Dict[str, str]], content: str, is_partial_content: bool = False) -> str:
+             """
+             Extract relevant code snippets from the given code file based on the conversation history.
+
+             Worked examples:
+             <examples>
+             1. Code file:
+             <code_file>
+             1 def add(a, b):
+             2     return a + b
+             3 def sub(a, b):
+             4     return a - b
+             </code_file>
+             <conversation_history>
+             <user>: How do I implement addition?
+             </conversation_history>
+
+             Output:
+             ```json
+             [
+                 {"start_line": 1, "end_line": 2}
+             ]
+             ```
+
+             2. Code file:
+             <code_file>
+             1 class User:
+             2     def __init__(self, name):
+             3         self.name = name
+             4     def greet(self):
+             5         return f"Hello, {self.name}"
+             </code_file>
+             <conversation_history>
+             <user>: How do I create a User object?
+             </conversation_history>
+
+             Output:
+             ```json
+             [
+                 {"start_line": 1, "end_line": 3}
+             ]
+             ```
+
+             3. Code file:
+             <code_file>
+             1 def foo():
+             2     pass
+             </code_file>
+             <conversation_history>
+             <user>: How do I implement subtraction?
+             </conversation_history>
+
+             Output:
+             ```json
+             []
+             ```
+             </examples>
+
+             Input:
+             1. Code file content:
+             <code_file>
+             {{ content }}
+             </code_file>
+
+             <% if is_partial_content: %>
+             <partial_content_process_note>
+             The current input is only a partial slice of the file (lines {start_line}-{end_line});
+             judge relevance from the visible content only and return the annotated line ranges.
+             </partial_content_process_note>
+             <% endif %>
+
+             2. Conversation history:
+             <conversation_history>
+             {% for msg in conversations %}
+             <{{ msg.role }}>: {{ msg.content }}
+             {% endfor %}
+             </conversation_history>
+
+             Task:
+             1. Analyze the last user question and its context.
+             2. Find one or more important code segments in the code file that are relevant to the question.
+             3. For each relevant segment, determine its start line number (start_line) and end line number (end_line).
+             4. Return at most 4 segments.
+
+             Output requirements:
+             1. Return a JSON array in which every element contains "start_line" and "end_line".
+             2. start_line and end_line must be integers denoting line numbers in the code file.
+             3. Line numbers are 1-based.
+             4. If there is no relevant segment, return the empty array [].
+
+             Output format:
+             A strict JSON array, with no extra text or explanation.
+
+             ```json
+             [
+                 {"start_line": start line of the first segment, "end_line": end line of the first segment},
+                 {"start_line": start line of the second segment, "end_line": end line of the second segment}
+             ]
+             ```
+
+             """
+
+         for file_source in file_sources:
+             try:
+                 # Whole files take priority
+                 tokens = file_source.tokens
+                 if token_count + tokens <= full_file_tokens:
+                     selected_files.append(SourceCode(
+                         module_name=file_source.module_name, source_code=file_source.source_code, tokens=tokens))
+                     token_count += tokens
+                     if self.verbose:
+                         self.printer.print_str_in_terminal(f"✅ File {file_source.module_name} kept in full (tokens: {tokens}, running total: {token_count})")
+                     continue
+
+                 # If a single file is too large, split it with the sliding window first,
+                 # then extract code snippets from each window
+                 if tokens > self.max_tokens:
+                     self.printer.print_in_terminal(
+                         "file_sliding_window_processing", file_path=file_source.module_name, tokens=tokens)
+
+                     chunks = self._split_content_with_sliding_window(file_source.source_code,
+                                                                      self.args.context_prune_sliding_window_size,
+                                                                      self.args.context_prune_sliding_window_overlap)
+
+                     if self.verbose:
+                         self.printer.print_str_in_terminal(f"📊 File {file_source.module_name} split into {len(chunks)} chunks by the sliding window")
+
+                     all_snippets = []
+                     chunk_with_results = 0
+                     for chunk_idx, (chunk_start, chunk_end, chunk_content) in enumerate(chunks):
+                         if self.verbose:
+                             self.printer.print_str_in_terminal(f"  🔍 Processing chunk {chunk_idx + 1}/{len(chunks)} (lines {chunk_start}-{chunk_end})")
+
+                         extracted = extract_code_snippets.with_llm(self.llm).run(
+                             conversations=conversations,
+                             content=chunk_content,
+                             is_partial_content=True
+                         )
+                         if extracted:
+                             json_str = extract_code(extracted)[0][1]
+                             snippets = json.loads(json_str)
+
+                             if snippets:  # extraction produced results
+                                 chunk_with_results += 1
+                                 if self.verbose:
+                                     self.printer.print_str_in_terminal(f"    ✅ chunk {chunk_idx + 1} yielded {len(snippets)} snippets: {snippets}")
+
+                                 # The returned line numbers are already absolute line numbers in the original file.
+                                 # When the snippet content is built later, line numbers are adapted for array indexing, so nothing to adjust here.
+                                 adjusted_snippets = [{
+                                     "start_line": snippet["start_line"],
+                                     "end_line": snippet["end_line"]
+                                 } for snippet in snippets]
+                                 all_snippets.extend(adjusted_snippets)
+                             else:
+                                 if self.verbose:
+                                     self.printer.print_str_in_terminal(f"    ❌ chunk {chunk_idx + 1} yielded no relevant snippets")
+                         else:
+                             if self.verbose:
+                                 self.printer.print_str_in_terminal(f"    ❌ chunk {chunk_idx + 1} extraction failed, no result returned")
+
+                     if self.verbose:
+                         self.printer.print_str_in_terminal(f"📈 Sliding-window pass finished: {chunk_with_results}/{len(chunks)} chunks produced results, {len(all_snippets)} snippets collected")
+
+                     merged_snippets = self._merge_overlapping_snippets(all_snippets)
+
+                     if self.verbose:
+                         self.printer.print_str_in_terminal(f"🔄 Merged overlapping snippets: {len(all_snippets)} -> {len(merged_snippets)}")
+                         if merged_snippets:
+                             self.printer.print_str_in_terminal(f"  Merged snippets: {merged_snippets}")
+
+                     # Only proceed when there are snippets
+                     if merged_snippets:
+                         content_snippets = self._build_snippet_content(
+                             file_source.module_name, file_source.source_code, merged_snippets)
+                         snippet_tokens = count_tokens(content_snippets)
+
+                         if token_count + snippet_tokens <= self.max_tokens:
+                             selected_files.append(SourceCode(
+                                 module_name=file_source.module_name, source_code=content_snippets, tokens=snippet_tokens))
+                             token_count += snippet_tokens
+                             self.printer.print_in_terminal("file_snippet_procesed", file_path=file_source.module_name,
+                                                            total_tokens=token_count,
+                                                            tokens=tokens,
+                                                            snippet_tokens=snippet_tokens)
+                             if self.verbose:
+                                 self.printer.print_str_in_terminal(f"✅ File {file_source.module_name} processed via sliding window, snippets kept")
+                             continue
+                         else:
+                             if self.verbose:
+                                 self.printer.print_str_in_terminal(f"❌ File {file_source.module_name} exceeds the token limit after sliding-window processing ({token_count + snippet_tokens} > {self.max_tokens}), stopping")
+                             break
+                     else:
+                         # No relevant snippets after sliding-window processing; skip this file
+                         if self.verbose:
+                             self.printer.print_str_in_terminal(f"⏭️ File {file_source.module_name} has no relevant snippets after sliding-window processing, skipping")
+                         continue
+
+                 # Extract the key snippets
+                 lines = file_source.source_code.splitlines()
+                 new_content = ""
+
+                 # Number every line of the file
+                 for index, line in enumerate(lines):
+                     new_content += f"{index+1} {line}\n"
+
+                 # Extract code snippets
+                 self.printer.print_in_terminal(
+                     "file_snippet_processing", file_path=file_source.module_name)
+
+                 if self.verbose:
+                     self.printer.print_str_in_terminal(f"🔍 Starting whole-file snippet extraction for {file_source.module_name} ({len(lines)} lines)")
+
+                 extracted = extract_code_snippets.with_llm(self.llm).run(
+                     conversations=conversations,
+                     content=new_content
+                 )
+
+                 # Build the snippet content
+                 if extracted:
+                     json_str = extract_code(extracted)[0][1]
+                     snippets = json.loads(json_str)
+
+                     if self.verbose:
+                         if snippets:
+                             self.printer.print_str_in_terminal(f"  ✅ Extracted {len(snippets)} snippets: {snippets}")
+                         else:
+                             self.printer.print_str_in_terminal(f"  ❌ No relevant snippets extracted")
+
+                     # Only proceed when there are snippets
+                     if snippets:
+                         content_snippets = self._build_snippet_content(
+                             file_source.module_name, file_source.source_code, snippets)
+
+                         snippet_tokens = count_tokens(content_snippets)
+                         if token_count + snippet_tokens <= self.max_tokens:
+                             selected_files.append(SourceCode(module_name=file_source.module_name,
+                                                              source_code=content_snippets,
+                                                              tokens=snippet_tokens))
+                             token_count += snippet_tokens
+                             self.printer.print_in_terminal("file_snippet_procesed", file_path=file_source.module_name,
+                                                            total_tokens=token_count,
+                                                            tokens=tokens,
+                                                            snippet_tokens=snippet_tokens)
+                             if self.verbose:
+                                 self.printer.print_str_in_terminal(f"✅ File {file_source.module_name} processed via whole-file extraction, snippets kept")
+                         else:
+                             if self.verbose:
+                                 self.printer.print_str_in_terminal(f"❌ File {file_source.module_name} exceeds the token limit after whole-file extraction ({token_count + snippet_tokens} > {self.max_tokens}), stopping")
+                             break
+                     else:
+                         # No relevant snippets; skip this file
+                         if self.verbose:
+                             self.printer.print_str_in_terminal(f"⏭️ File {file_source.module_name} has no relevant snippets, skipping")
+                 else:
+                     if self.verbose:
+                         self.printer.print_str_in_terminal(f"❌ Whole-file extraction for {file_source.module_name} failed, no result returned")
+             except Exception as e:
+                 logger.error(f"Failed to process {file_source.module_name}: {e}")
+                 if self.verbose:
+                     self.printer.print_str_in_terminal(f"❌ Exception while processing {file_source.module_name}: {e}")
+                 continue
+
+         if self.verbose:
+             total_input_tokens = sum(f.tokens for f in file_sources)
+             final_tokens = sum(f.tokens for f in selected_files)
+             self.printer.print_str_in_terminal(f"🎯 Code snippet extraction finished")
+             self.printer.print_str_in_terminal(f"📊 Result summary:")
+             self.printer.print_str_in_terminal(f"  • Input files: {len(file_sources)}, input tokens: {total_input_tokens}")
+             self.printer.print_str_in_terminal(f"  • Output files: {len(selected_files)}, output tokens: {final_tokens}")
+             self.printer.print_str_in_terminal(f"  • Token compression: {((total_input_tokens - final_tokens) / total_input_tokens * 100):.1f}%")
+
+             # Count how each file was handled
+             complete_files = 0
+             snippet_files = 0
+             for i, file_source in enumerate(file_sources):
+                 if i < len(selected_files):
+                     if selected_files[i].source_code == file_source.source_code:
+                         complete_files += 1
+                     else:
+                         snippet_files += 1
+
+             self.printer.print_str_in_terminal(f"  • Files kept in full: {complete_files}")
+             self.printer.print_str_in_terminal(f"  • Files reduced to snippets: {snippet_files}")
+             self.printer.print_str_in_terminal(f"  • Files skipped: {len(file_sources) - len(selected_files)}")
+
+         return selected_files
+
+     def _merge_overlapping_snippets(self, snippets: List[dict]) -> List[dict]:
+         if not snippets:
+             return []
+
+         # Sort by start line
+         sorted_snippets = sorted(snippets, key=lambda x: x["start_line"])
+
+         merged = [sorted_snippets[0]]
+         for current in sorted_snippets[1:]:
+             last = merged[-1]
+             if current["start_line"] <= last["end_line"] + 1:  # allow a 1-line gap
+                 # Merge the ranges
+                 merged[-1] = {
+                     "start_line": min(last["start_line"], current["start_line"]),
+                     "end_line": max(last["end_line"], current["end_line"])
+                 }
+             else:
+                 merged.append(current)
+
+         return merged
+
+     def _build_snippet_content(self, file_path: str, full_content: str, snippets: List[dict]) -> str:
+         """Build the file content made up of the selected code snippets"""
+         lines = full_content.splitlines()
+         header = f"Snippets:\n"
+
+         content = []
+         for snippet in snippets:
+             start = max(0, snippet["start_line"] - 1)
+             end = min(len(lines), snippet["end_line"])
+             content.append(
+                 f"# Lines {start+1}-{end} ({snippet.get('reason','')})")
+             content.extend(lines[start:end])
+
+         return header + "\n".join(content)
+
+     def handle_overflow(
+         self,
+         file_sources: List[SourceCode],
+         conversations: List[Dict[str, str]],
+         strategy: str = "score"
+     ) -> List[SourceCode]:
+         """
+         Handle files that exceed the token limit.
+         :param file_sources: the files to process
+         :param conversations: conversation context (used by the extract strategy)
+         :param strategy: pruning strategy (delete/extract/score)
+         """
+         file_paths = [file_source.module_name for file_source in file_sources]
+         total_tokens, sources = self._count_tokens(file_sources=file_sources)
+         if total_tokens <= self.max_tokens:
+             return sources
+
+         self.printer.print_in_terminal(
+             "context_pruning_reason",
+             total_tokens=total_tokens,
+             max_tokens=self.max_tokens,
+             style="yellow"
+         )
+
+         self.printer.print_in_terminal(
+             "sorted_files_message",
+             files=file_paths
+         )
+
+         self.printer.print_in_terminal(
+             "context_pruning_start",
+             total_tokens=total_tokens,
+             max_tokens=self.max_tokens,
+             strategy=strategy
+         )
+
+         if strategy == "score":
+             return self._score_and_filter_files(sources, conversations)
+         if strategy == "delete":
+             return self._delete_overflow_files(sources)
+         elif strategy == "extract":
+             return self._extract_code_snippets(sources, conversations)
+         else:
+             raise ValueError(f"Invalid strategy: {strategy}. Valid options: delete/extract/score")
+
+     def _count_tokens(self, file_sources: List[SourceCode]) -> Tuple[int, List[SourceCode]]:
+         """Count the total number of tokens across the files"""
+         total_tokens = 0
+         sources = []
+         for file_source in file_sources:
+             try:
+                 if file_source.tokens > 0:
+                     tokens = file_source.tokens
+                     total_tokens += file_source.tokens
+                 else:
+                     tokens = count_tokens(file_source.source_code)
+                     total_tokens += tokens
+
+                 sources.append(SourceCode(module_name=file_source.module_name,
+                                           source_code=file_source.source_code, tokens=tokens))
+
+             except Exception as e:
+                 logger.error(f"Failed to count tokens for {file_source.module_name}: {e}")
+                 sources.append(SourceCode(module_name=file_source.module_name,
+                                           source_code=file_source.source_code, tokens=0))
+         return total_tokens, sources
+
+     def _score_and_filter_files(self, file_sources: List[SourceCode], conversations: List[Dict[str, str]]) -> List[SourceCode]:
+         """Score files by relevance and keep appending until the token count would exceed max_tokens"""
+         selected_files = []
+         total_tokens = 0
+         scored_files = []
+
+         @byzerllm.prompt()
+         def verify_file_relevance(file_content: str, conversations: List[Dict[str, str]]) -> str:
+             """
+             Please verify whether the file content below is relevant to the user conversation:
+
+             File content:
+             {{ file_content }}
+
+             Conversation history:
+             <conversation_history>
+             {% for msg in conversations %}
+             <{{ msg.role }}>: {{ msg.content }}
+             {% endfor %}
+             </conversation_history>
+
+             "Relevant" means this file is needed as context, or must be modified, to solve the user's problem.
+             Give a likelihood score from 0-10 with a reason of at most 50 characters tied to the user's question. Format:
+
+             ```json
+             {
+                 "relevant_score": 0-10,
+                 "reason": "why this is relevant (no more than 10 characters)..."
+             }
+             ```
+             """
+
+         def _score_file(file_source: SourceCode) -> dict:
+             try:
+                 result = verify_file_relevance.with_llm(self.llm).with_return_type(VerifyFileRelevance).run(
+                     file_content=file_source.source_code,
+                     conversations=conversations
+                 )
+                 return {
+                     "file_path": file_source.module_name,
+                     "score": result.relevant_score,
+                     "tokens": file_source.tokens,
+                     "content": file_source.source_code
+                 }
+             except Exception as e:
+                 logger.error(f"Failed to score file {file_source.module_name}: {e}")
+                 return None
+
+         # Score the files in parallel with a thread pool
+         with ThreadPoolExecutor() as executor:
+             futures = [executor.submit(_score_file, file_source)
+                        for file_source in file_sources]
+             for future in as_completed(futures):
+                 result = future.result()
+                 if result:
+                     self.printer.print_str_in_terminal(
+                         get_message_with_format(
+                             "file_scored_message",
+                             file_path=result["file_path"],
+                             score=result["score"]
+                         )
+                     )
+                     scored_files.append(result)
+
+         # Step 2: sort by score, highest first
+         scored_files.sort(key=lambda x: x["score"], reverse=True)
+
+         # Step 3: take files from the highest score down, stopping once the token count would exceed max_tokens
+         for file_info in scored_files:
+             if total_tokens + file_info["tokens"] <= self.max_tokens:
+                 selected_files.append(SourceCode(
+                     module_name=file_info["file_path"],
+                     source_code=file_info["content"],
+                     tokens=file_info["tokens"]
+                 ))
+                 total_tokens += file_info["tokens"]
+             else:
+                 break
+
+         return selected_files
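
The sliding-window splitter above advances by `window_size - overlap` lines per step while each chunk reaches back `overlap` lines, so adjacent chunks overlap and every line keeps its absolute number. A minimal standalone sketch of the same chunking arithmetic, with illustrative window values rather than the release defaults:

```python
# Standalone sketch of the chunking done by _split_content_with_sliding_window.
# window_size/overlap values here are illustrative; the release takes them from AutoCoderArgs.
from typing import List, Tuple

def split_with_window(content: str, window_size: int = 4, overlap: int = 1) -> List[Tuple[int, int, str]]:
    lines = content.splitlines()
    chunks, start = [], 0
    while start < len(lines):
        end = min(start + window_size, len(lines))
        actual_start = max(0, start - overlap)  # reach back `overlap` lines
        numbered = "\n".join(f"{i+1} {line}" for i, line in enumerate(lines[actual_start:end], start=actual_start))
        chunks.append((actual_start + 1, end, numbered))
        start += window_size - overlap  # advance by the stride
    return chunks

# Ten lines with window_size=4, overlap=1 yield chunks covering
# lines 1-4, 3-7, 6-10, and 9-10: each range overlaps its neighbor.
for s, e, _ in split_with_window("\n".join(f"line{i}" for i in range(1, 11))):
    print(s, e)
```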
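
And a hedged sketch of how the new pruner might be driven end to end. The `PruneContext` constructor, the `SourceCode` fields, and the `handle_overflow` signature come from the diff above; the LLM and `AutoCoderArgs` setup shown here are assumptions for illustration, not part of this release:

```python
# Hypothetical usage of PruneContext (illustrative; not shipped in this release).
import byzerllm

from autocoder.common import AutoCoderArgs, SourceCode
from autocoder.common.pruner.context_pruner import PruneContext

llm = byzerllm.ByzerLLM()   # assumed to be configured/deployed elsewhere
args = AutoCoderArgs()      # assumed defaults; real runs populate many more fields

pruner = PruneContext(max_tokens=8000, args=args, llm=llm, verbose=True)

files = [
    SourceCode(module_name="src/calc.py",
               source_code="def add(a, b):\n    return a + b\n",
               tokens=0),  # tokens <= 0 makes _count_tokens recount the file
]
conversations = [{"role": "user", "content": "How do I implement addition?"}]

# strategy is one of "delete", "extract", "score"; it only kicks in once the
# total token count exceeds max_tokens (see handle_overflow above)
kept = pruner.handle_overflow(files, conversations, strategy="score")
for f in kept:
    print(f.module_name, f.tokens)
```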