auto-coder 0.1.353__py3-none-any.whl → 0.1.355__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of auto-coder might be problematic.

Files changed (60)
  1. {auto_coder-0.1.353.dist-info → auto_coder-0.1.355.dist-info}/METADATA +1 -1
  2. {auto_coder-0.1.353.dist-info → auto_coder-0.1.355.dist-info}/RECORD +60 -45
  3. autocoder/agent/agentic_filter.py +1 -1
  4. autocoder/auto_coder.py +8 -0
  5. autocoder/auto_coder_rag.py +37 -1
  6. autocoder/auto_coder_runner.py +58 -77
  7. autocoder/chat/conf_command.py +270 -0
  8. autocoder/chat/models_command.py +485 -0
  9. autocoder/chat_auto_coder.py +29 -24
  10. autocoder/chat_auto_coder_lang.py +26 -2
  11. autocoder/commands/auto_command.py +60 -132
  12. autocoder/commands/auto_web.py +1 -1
  13. autocoder/commands/tools.py +1 -1
  14. autocoder/common/__init__.py +3 -1
  15. autocoder/common/command_completer.py +58 -12
  16. autocoder/common/command_completer_v2.py +576 -0
  17. autocoder/common/conversations/__init__.py +52 -0
  18. autocoder/common/conversations/compatibility.py +303 -0
  19. autocoder/common/conversations/conversation_manager.py +502 -0
  20. autocoder/common/conversations/example.py +152 -0
  21. autocoder/common/file_monitor/__init__.py +5 -0
  22. autocoder/common/file_monitor/monitor.py +383 -0
  23. autocoder/common/global_cancel.py +53 -16
  24. autocoder/common/ignorefiles/__init__.py +4 -0
  25. autocoder/common/ignorefiles/ignore_file_utils.py +103 -0
  26. autocoder/common/ignorefiles/test_ignore_file_utils.py +91 -0
  27. autocoder/common/rulefiles/__init__.py +15 -0
  28. autocoder/common/rulefiles/autocoderrules_utils.py +173 -0
  29. autocoder/common/save_formatted_log.py +54 -0
  30. autocoder/common/v2/agent/agentic_edit.py +10 -39
  31. autocoder/common/v2/agent/agentic_edit_tools/list_files_tool_resolver.py +1 -1
  32. autocoder/common/v2/agent/agentic_edit_tools/search_files_tool_resolver.py +73 -43
  33. autocoder/common/v2/code_agentic_editblock_manager.py +9 -9
  34. autocoder/common/v2/code_diff_manager.py +2 -2
  35. autocoder/common/v2/code_editblock_manager.py +31 -18
  36. autocoder/common/v2/code_strict_diff_manager.py +3 -2
  37. autocoder/dispacher/actions/action.py +6 -6
  38. autocoder/dispacher/actions/plugins/action_regex_project.py +2 -2
  39. autocoder/events/event_manager_singleton.py +1 -1
  40. autocoder/index/index.py +3 -3
  41. autocoder/models.py +22 -9
  42. autocoder/rag/api_server.py +14 -2
  43. autocoder/rag/cache/local_byzer_storage_cache.py +1 -1
  44. autocoder/rag/cache/local_duckdb_storage_cache.py +8 -0
  45. autocoder/rag/cache/simple_cache.py +63 -33
  46. autocoder/rag/loaders/docx_loader.py +1 -1
  47. autocoder/rag/loaders/filter_utils.py +133 -76
  48. autocoder/rag/loaders/image_loader.py +15 -3
  49. autocoder/rag/loaders/pdf_loader.py +2 -2
  50. autocoder/rag/long_context_rag.py +11 -0
  51. autocoder/rag/qa_conversation_strategy.py +5 -31
  52. autocoder/rag/utils.py +21 -2
  53. autocoder/utils/_markitdown.py +66 -25
  54. autocoder/utils/auto_coder_utils/chat_stream_out.py +4 -4
  55. autocoder/utils/thread_utils.py +9 -27
  56. autocoder/version.py +1 -1
  57. {auto_coder-0.1.353.dist-info → auto_coder-0.1.355.dist-info}/LICENSE +0 -0
  58. {auto_coder-0.1.353.dist-info → auto_coder-0.1.355.dist-info}/WHEEL +0 -0
  59. {auto_coder-0.1.353.dist-info → auto_coder-0.1.355.dist-info}/entry_points.txt +0 -0
  60. {auto_coder-0.1.353.dist-info → auto_coder-0.1.355.dist-info}/top_level.txt +0 -0

autocoder/rag/cache/simple_cache.py CHANGED
@@ -24,6 +24,7 @@ from .failed_files_utils import load_failed_files, save_failed_files
 from autocoder.common import AutoCoderArgs
 from byzerllm import SimpleByzerLLM, ByzerLLM
 from autocoder.utils.llms import get_llm_names
+from autocoder.common.file_monitor.monitor import get_file_monitor, Change
 
 
 default_ignore_dirs = [
@@ -50,7 +51,7 @@ def generate_content_md5(content: Union[str, bytes]) -> str:
 
 
 class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
-    def __init__(self, path: str, ignore_spec, required_exts: list, update_interval: int = 5, args: Optional[AutoCoderArgs] = None, llm: Optional[Union[ByzerLLM, SimpleByzerLLM, str]] = None):
+    def __init__(self, path: str, ignore_spec, required_exts: list, args: Optional[AutoCoderArgs] = None, llm: Optional[Union[ByzerLLM, SimpleByzerLLM, str]] = None):
         """
         Initialize the async update queue that manages the code-file cache.
 
@@ -58,7 +59,8 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
         path: root directory of the code base to index
        ignore_spec: rules for which files/directories should be ignored
        required_exts: list of file extensions to process
-        update_interval: interval in seconds between automatic update checks, default 5
+        args: AutoCoderArgs object carrying the configuration
+        llm: LLM instance used for code analysis
 
        Cache structure (self.cache):
        self.cache is a dict laid out as follows:
@@ -99,7 +101,6 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
         self.args = args
         self.llm = llm
         self.product_mode = args.product_mode or "lite"
-        self.update_interval = update_interval
         self.queue = []
         self.cache = {}  # starts empty; filled later by read_cache()
         self.lock = threading.Lock()
@@ -115,10 +116,16 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
         self.queue_thread.daemon = True
         self.queue_thread.start()
 
-        # Start the thread that periodically triggers updates
-        self.update_thread = threading.Thread(target=self._periodic_update)
-        self.update_thread.daemon = True
-        self.update_thread.start()
+        # Register the file-monitor callback
+        self.file_monitor = get_file_monitor(self.path)
+        # Watch the root directory so changes in all subdirectories and files are captured
+        self.file_monitor.register(self.path, self._on_file_change)
+        # Make sure the monitor is running
+        if not self.file_monitor.is_running():
+            self.file_monitor.start()
+            logger.info(f"Started file monitor for {self.path}")
+        else:
+            logger.info(f"File monitor already running for {self.path}")
 
         self.cache = self.read_cache()
 
@@ -130,37 +137,57 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
                 logger.error(f"Error in process_queue: {e}")
             time.sleep(1)  # avoid overly frequent checks
 
-    def _periodic_update(self):
-        """Trigger the file-update check on a timer"""
-        while not self.stop_event.is_set():
-            try:
-                logger.debug(
-                    f"Periodic update triggered (every {self.update_interval}s)")
-                # Do not trigger incrementally before the cache is initialized
-                if not self.cache:
-                    time.sleep(self.update_interval)
-                    continue
-                self.trigger_update()
-            except Exception as e:
-                logger.error(f"Error in periodic update: {e}")
-            time.sleep(self.update_interval)
+    def _on_file_change(self, change_type: Change, file_path: str):
+        """
+        File-monitor callback; triggers an update when a file changes.
+
+        Args:
+            change_type: type of change (Change.added, Change.modified, Change.deleted)
+            file_path: path of the file that changed
+        """
+        try:
+            # Skip triggering while the cache has not been initialized yet
+            if not self.cache:
+                return
+
+            # Skip files whose extension is not in the required list
+            if self.required_exts and not any(file_path.endswith(ext) for ext in self.required_exts):
+                return
+
+            # Skip files matched by the ignore rules
+            if self.ignore_spec and self.ignore_spec.match_file(os.path.relpath(file_path, self.path)):
+                return
+
+            logger.info(f"File change detected: {change_type} - {file_path}")
+            self.trigger_update()
+        except Exception as e:
+            logger.error(f"Error in file change handler: {e}")
+            logger.exception(e)
 
     def stop(self):
         self.stop_event.set()
-        self.queue_thread.join()
-        self.update_thread.join()
+        # Unregister the file-monitor callback
+        try:
+            self.file_monitor.unregister(self.path, self._on_file_change)
+            logger.info(f"Unregistered file monitor callback for {self.path}")
+        except Exception as e:
+            logger.error(f"Error unregistering file monitor callback: {e}")
+        # Only wait for the queue-processing thread to finish
+        if hasattr(self, 'queue_thread') and self.queue_thread.is_alive():
+            self.queue_thread.join(timeout=2.0)
 
     def fileinfo_to_tuple(self, file_info: FileInfo) -> Tuple[str, str, float, str]:
         return (file_info.file_path, file_info.relative_path, file_info.modify_time, file_info.file_md5)
 
     def __del__(self):
+        # Make sure monitoring is stopped and resources are cleaned up when the object is destroyed
         self.stop()
 
     def load_first(self):
         with self.lock:
             if self.cache:
                 return
-            files_to_process = []
+            files_to_process = []
             for file_info in self.get_all_files():
                 file_path, _, modify_time, file_md5 = file_info
                 if (
@@ -175,7 +202,7 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
             #     [process_file.remote(file_info) for file_info in files_to_process]
             # )
             from autocoder.rag.token_counter import initialize_tokenizer
-            llm_name = get_llm_names(self.llm)[0] if self.llm else None
+            llm_name = get_llm_names(self.llm)[0] if self.llm else None
             with Pool(
                 processes=os.cpu_count(),
                 initializer=initialize_tokenizer,
@@ -184,8 +211,8 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
 
                 worker_func = functools.partial(
                     process_file_in_multi_process, llm=llm_name, product_mode=self.product_mode)
-                results = pool.map(worker_func, files_to_process)
-
+                results = pool.map(worker_func, files_to_process)
+
                 for file_info, result in zip(files_to_process, results):
                     if result:  # update the cache only when result is non-empty
                         self.update_cache(file_info, result)
@@ -203,16 +230,15 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
             file_path, relative_path, modify_time, file_md5 = file_info
             current_files.add(file_path)
             # Skip this incremental update if the file previously failed to parse
-            if file_path in self.failed_files:
-                # logger.info(f"File {file_path} previously failed to parse, skipping this update")
+            if file_path in self.failed_files:
                 continue
-            # Change detection
+            # Change detection
             if (
                 file_path not in self.cache
                 or self.cache[file_path].get("md5", "") != file_md5
             ):
                 files_to_process.append(
-                    (file_path, relative_path, modify_time, file_md5))
+                    (file_path, relative_path, modify_time, file_md5))
 
         deleted_files = set(self.cache.keys()) - current_files
         logger.info(f"files_to_process: {files_to_process}")
@@ -289,6 +315,8 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
                 for line in f:
                     data = json.loads(line)
                     cache[data["file_path"]] = data
+        else:
+            self.load_first()
         return cache
 
     def write_cache(self):
@@ -366,6 +394,9 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
             dirs[:] = [d for d in dirs if not d.startswith(
                 ".") and d not in default_ignore_dirs]
 
+            # Filter out files that start with a dot
+            files[:] = [f for f in files if not f.startswith(".")]
+
             if self.ignore_spec:
                 relative_root = os.path.relpath(root, self.path)
                 dirs[:] = [
@@ -390,6 +421,5 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
                 modify_time = os.path.getmtime(file_path)
                 file_md5 = generate_file_md5(file_path)
                 all_files.append(
-                    (file_path, relative_path, modify_time, file_md5))
-
+                    (file_path, relative_path, modify_time, file_md5))
         return all_files
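
With this release, AutoCoderRAGAsyncUpdateQueue stops polling on a fixed update_interval and instead reacts to file-system events through the new autocoder.common.file_monitor module. A minimal sketch of the registration pattern, assuming only the API visible in this diff (get_file_monitor, register, is_running, start, unregister; the watched path is hypothetical):

# Sketch of the event-driven update pattern introduced above; assumes the
# file_monitor API exactly as it appears in this diff.
from autocoder.common.file_monitor.monitor import get_file_monitor, Change

def on_change(change_type: Change, file_path: str) -> None:
    # change_type is Change.added, Change.modified, or Change.deleted,
    # per the _on_file_change docstring above
    print(f"{change_type}: {file_path}")

repo_root = "/path/to/repo"  # hypothetical root directory
monitor = get_file_monitor(repo_root)   # one monitor per root directory
monitor.register(repo_root, on_change)  # registering the root captures all subpaths
if not monitor.is_running():
    monitor.start()
# ... later, on shutdown, mirror AutoCoderRAGAsyncUpdateQueue.stop():
monitor.unregister(repo_root, on_change)
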

autocoder/rag/loaders/docx_loader.py CHANGED
@@ -1,5 +1,4 @@
 from io import BytesIO
-from autocoder.utils._markitdown import MarkItDown
 import traceback
 
 def extract_text_from_docx_old(docx_path):
@@ -13,6 +12,7 @@ def extract_text_from_docx_old(docx_path):
 
 def extract_text_from_docx(docx_path):
     try:
+        from autocoder.utils._markitdown import MarkItDown
         md_converter = MarkItDown()
         result = md_converter.convert(docx_path)
         return result.text_content
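
The pdf_loader.py change further below applies the same fix: the MarkItDown import moves from module scope into the function body, so merely importing the loader no longer pays for (or fails on) markitdown's transitive dependencies. The general shape of the pattern, as a sketch:

def extract_text(path: str) -> str:
    try:
        # Deferred import: the heavy dependency is resolved only when the
        # function actually runs, keeping module import cheap and optional.
        from autocoder.utils._markitdown import MarkItDown
        return MarkItDown().convert(path).text_content
    except Exception:
        import traceback
        traceback.print_exc()
        return ""
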

autocoder/rag/loaders/filter_utils.py CHANGED
@@ -1,106 +1,163 @@
-
 import os
 import json
-from typing import Dict, Optional
+import threading
+from typing import Dict, Optional, List
 from loguru import logger
+from functools import lru_cache
 
 class FilterRuleManager:
     '''
+    Singleton filter-rule manager; supports separate filter rules per file type.
+
+    Supported rule format:
     {
-        "whitelist": [
-            "glob:*.png",
-            "regex:^/tmp/.*hidden.*"
-        ],
-        "blacklist": [
-            "glob:*/private/*",
-            "regex:.*/secret/.*\\.jpg$"
-        ]
+        "image": {
+            "whitelist": ["*.png", "*.jpg"],
+            "blacklist": ["*/private/*"]
+        },
+        "document": {
+            "whitelist": ["*.pdf", "*.docx"],
+            "blacklist": ["*/tmp/*"]
+        },
+        "default": {
+            "whitelist": [],
+            "blacklist": ["*/node_modules/*", "*/.*"]
         }
-    '''
-    _cache_rules: Optional[Dict] = None
-    _cache_mtime: Optional[float] = None
-
-    def __init__(self, llm, source_dir: str):
-        """
-        Initialize the filter-rule manager.
-
-        Args:
-            llm: LLM object; currently unused, reserved
-            source_dir: project root directory
-        """
-        self.llm = llm
-        self.source_dir = source_dir
-        self.filter_rules_path = os.path.join(self.source_dir, ".cache", "filterrules")
+    }
+    '''
+    _instance = None
+    _lock = threading.RLock()  # reentrant lock to avoid deadlocks
+
+    def __new__(cls, *args, **kwargs):
+        if cls._instance is None:
+            with cls._lock:
+                if cls._instance is None:  # double-checked locking
+                    cls._instance = super(FilterRuleManager, cls).__new__(cls)
+                    cls._instance._initialized = False
+        return cls._instance
+
+    @classmethod
+    def get_instance(cls):
+        return cls()  # goes straight through __new__, no extra locking needed
+
+    def __init__(self):
+        with self._lock:
+            if hasattr(self, '_initialized') and self._initialized:
+                return
+
+            self.source_dir = os.getcwd()
+            self.filter_rules_path = os.path.join(self.source_dir, ".cache", "filterrules")
+            self._cache_rules: Optional[Dict] = None
+            self._cache_mtime: Optional[float] = None
+            self._rule_lock = threading.RLock()  # separate lock for rule access
+            self._initialized = True
 
     def load_filter_rules(self) -> Dict:
-        try:
-            current_mtime = os.path.getmtime(self.filter_rules_path) if os.path.exists(self.filter_rules_path) else None
-        except Exception:
-            current_mtime = None
-
+        # First check whether a reload is needed, without holding the lock
+        current_mtime = self._get_file_mtime()
         need_reload = False
 
-        # Reload when the cache is empty or the file has been updated
-        if FilterRuleManager._cache_rules is None:
+        if self._cache_rules is None:
             need_reload = True
-        elif current_mtime is not None and FilterRuleManager._cache_mtime != current_mtime:
+        elif current_mtime is not None and self._cache_mtime != current_mtime:
             need_reload = True
 
+        # Take the lock only when a reload is actually needed
         if need_reload:
-            FilterRuleManager._cache_rules = {"whitelist": [], "blacklist": []}
-            try:
-                if os.path.exists(self.filter_rules_path):
-                    with open(self.filter_rules_path, "r", encoding="utf-8") as f:
-                        FilterRuleManager._cache_rules = json.load(f)
-                FilterRuleManager._cache_mtime = current_mtime
-            except Exception as e:
-                logger.warning(f"Failed to load filterrules: {e}")
-
-        return FilterRuleManager._cache_rules or {"whitelist": [], "blacklist": []}
+            with self._rule_lock:
+                # Double-check to avoid redundant reloads across threads
+                current_mtime = self._get_file_mtime()
+                if self._cache_rules is None or (current_mtime is not None and self._cache_mtime != current_mtime):
+                    self._load_rules_from_file(current_mtime)
+
+        # Return a copy of the rules so external mutation cannot corrupt the cache
+        with self._rule_lock:
+            return self._cache_rules.copy() if self._cache_rules else self._get_default_rules()
+
+    def _get_file_mtime(self) -> Optional[float]:
+        """Get the rules file's mtime; the IO-related work is factored out here"""
+        try:
+            return os.path.getmtime(self.filter_rules_path) if os.path.exists(self.filter_rules_path) else None
+        except Exception:
+            logger.warning(f"Failed to get mtime for {self.filter_rules_path}")
+            return None
+
+    def _get_default_rules(self) -> Dict:
+        """Return the default rule structure"""
+        return {
+            "default": {
+                "whitelist": [],
+                "blacklist": []
+            }
+        }
 
-    def should_parse_image(self, file_path: str) -> bool:
+    def _load_rules_from_file(self, current_mtime: Optional[float]) -> None:
+        """Load rules from the file; must only be called while holding the lock"""
+        self._cache_rules = self._get_default_rules()
+        try:
+            if os.path.exists(self.filter_rules_path):
+                with open(self.filter_rules_path, "r", encoding="utf-8") as f:
+                    file_rules = json.load(f)
+
+                # Convert old-format rules to the new format if necessary
+                if "whitelist" in file_rules or "blacklist" in file_rules:
+                    # Old format: wrap it under "default"
+                    self._cache_rules = {
+                        "default": {
+                            "whitelist": file_rules.get("whitelist", []),
+                            "blacklist": file_rules.get("blacklist", [])
+                        }
+                    }
+                    logger.info("Converted old format rules to new format")
+                else:
+                    # New format: use as-is
+                    self._cache_rules = file_rules
+            self._cache_mtime = current_mtime
+        except Exception as e:
+            logger.warning(f"Failed to load filterrules: {e}")
+
+    @lru_cache(maxsize=1024)  # cache the verdicts for frequently checked paths
+    def should_parse_file(self, file_path: str, file_type: str = "default") -> bool:
         """
-        Decide whether images in a file should be parsed.
-
-        Supported rule formats:
-        - glob patterns, e.g. "glob:*.png" or "*.png"
-        - regular expressions, e.g. "regex:^/tmp/.*hidden.*"
-
+        Decide whether a file should be parsed.
+
+        Args:
+            file_path: the file path
+            file_type: the file type (e.g. "image", "document"), defaults to "default"
+
         Returns:
             True means the file should be parsed
             False means it should not be parsed
         """
         import fnmatch
-        import re
-
+
         rules = self.load_filter_rules()
-        whitelist = rules.get("whitelist", [])
-        blacklist = rules.get("blacklist", [])
-
-        def match_pattern(pattern: str, path: str) -> bool:
-            if pattern.startswith("glob:"):
-                pat = pattern[len("glob:"):]
-                return fnmatch.fnmatch(path, pat)
-            elif pattern.startswith("regex:"):
-                pat = pattern[len("regex:"):]
-                try:
-                    return re.search(pat, path) is not None
-                except re.error:
-                    logger.warning(f"Invalid regex pattern: {pat}")
-                    return False
-            else:
-                # Fall back to glob matching
-                return fnmatch.fnmatch(path, pattern)
-
+
+        # Look up the rules for the given type, falling back to the defaults
+        type_rules = rules.get(file_type, rules.get("default", {"whitelist": [], "blacklist": []}))
+        whitelist = type_rules.get("whitelist", [])
+        blacklist = type_rules.get("blacklist", [])
+
         # The blacklist takes precedence
         for pattern in blacklist:
-            if match_pattern(pattern, file_path):
+            if fnmatch.fnmatch(file_path, pattern):
                 return False
-
-        # Then match against the whitelist
+
+        # An empty whitelist lets every file through (unless blacklisted)
+        if not whitelist:
+            return True
+
+        # Match against the whitelist
         for pattern in whitelist:
-            if match_pattern(pattern, file_path):
+            if fnmatch.fnmatch(file_path, pattern):
                 return True
-
-        # Do not parse by default
+
+        # A whitelist exists but nothing matched: do not parse
         return False
+
+    # Kept for backward compatibility
+    def should_parse_image(self, file_path: str) -> bool:
+        """
+        Decide whether an image file should be parsed (legacy API)
+        """
+        return self.should_parse_file(file_path, "image")
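
Taken together: rules now live in <cwd>/.cache/filterrules, patterns are plain fnmatch globs (the glob:/regex: prefixes are gone), and the manager is a process-wide singleton. A usage sketch with hypothetical file contents:

import json, os
from autocoder.rag.loaders.filter_utils import FilterRuleManager

# Hypothetical rules file, written in the new per-type format shown above.
os.makedirs(".cache", exist_ok=True)
with open(".cache/filterrules", "w", encoding="utf-8") as f:
    json.dump({
        "image": {"whitelist": ["*.png", "*.jpg"], "blacklist": ["*/private/*"]},
        "default": {"whitelist": [], "blacklist": ["*/node_modules/*"]},
    }, f)

manager = FilterRuleManager.get_instance()
print(manager.should_parse_file("docs/a.png", "image"))       # True: whitelist match
print(manager.should_parse_file("x/private/a.png", "image"))  # False: blacklist wins
print(manager.should_parse_image("docs/a.png"))               # back-compat wrapper

One consequence of the lru_cache decorator: verdicts are memoized per (file_path, file_type), so a path checked before the rules file changes keeps its cached answer afterward.
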

autocoder/rag/loaders/image_loader.py CHANGED
@@ -538,7 +538,7 @@ class ImageLoader:
     def image_to_markdown(
         image_path: str,
         llm,
-        engine: str = "vl",
+        engine: str = "paddle",
         product_mode: str = "lite",
         paddle_kwargs: dict = None
     ) -> str:
@@ -554,6 +554,13 @@ class ImageLoader:
         Returns:
             the markdown content as a string
         """
+        logger.info(f"image_path: {image_path} engine: {engine} product_mode: {product_mode} paddle_kwargs: {paddle_kwargs}")
+
+        # New: if engine is paddle but PaddleOCR is None, return an empty string immediately
+        if engine == "paddle" and PaddleOCR is None:
+            logger.warning("PaddleOCR is not installed; cannot extract image content, returning an empty string.")
+            return ""
+
         md_content = ImageLoader.extract_text_from_image(
             image_path,
             llm,
@@ -561,8 +568,13 @@ class ImageLoader:
             product_mode=product_mode,
             paddle_kwargs=paddle_kwargs
         )
-
-        md_path = os.path.splitext(image_path)[0] + ".md"
+
+        # Get directory and filename separately
+        dir_name = os.path.dirname(image_path)
+        file_name = os.path.basename(image_path)
+        base_name = os.path.splitext(file_name)[0]
+        # Create the new path with a dot before the filename
+        md_path = os.path.join(dir_name, f".{base_name}.md")
         try:
             with open(md_path, "w", encoding="utf-8") as f:
                 f.write(md_content)
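
The OCR sidecar markdown now gets a leading dot (photo.png produces .photo.md), which pairs with the new dot-file filter added to get_all_files in simple_cache.py above, presumably so the cache scanner never re-indexes its own OCR output. The path derivation in isolation:

import os

def markdown_sidecar_path(image_path: str) -> str:
    # New behavior: /data/imgs/photo.png -> /data/imgs/.photo.md
    # (previously /data/imgs/photo.md, which the cache scanner would pick up)
    dir_name = os.path.dirname(image_path)
    base_name = os.path.splitext(os.path.basename(image_path))[0]
    return os.path.join(dir_name, f".{base_name}.md")

assert markdown_sidecar_path("/data/imgs/photo.png") == "/data/imgs/.photo.md"
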

autocoder/rag/loaders/pdf_loader.py CHANGED
@@ -1,6 +1,5 @@
 from io import BytesIO
 from pypdf import PdfReader
-from autocoder.utils._markitdown import MarkItDown
 import traceback
 
 
@@ -15,7 +14,8 @@ def extract_text_from_pdf_old(file_path):
     return text
 
 def extract_text_from_pdf(file_path, llm=None, product_mode="lite"):
-    try:
+    try:
+        from autocoder.utils._markitdown import MarkItDown
         md_converter = MarkItDown(llm=llm, product_mode=product_mode)
         result = md_converter.convert(file_path)
         return result.text_content

autocoder/rag/long_context_rag.py CHANGED
@@ -41,6 +41,8 @@ from autocoder.rag.qa_conversation_strategy import get_qa_strategy
 from autocoder.rag.searchable import SearchableResults
 from autocoder.rag.conversation_to_queries import extract_search_queries
 from autocoder.common import openai_content as OpenAIContentProcessor
+from autocoder.common.save_formatted_log import save_formatted_log
+import json, os
 try:
     from autocoder_pro.rag.llm_compute import LLMComputeEngine
     pro_version = version("auto-coder-pro")
@@ -849,6 +851,15 @@ class LongContextRAG:
             conversations=conversations, local_image_host=self.args.local_image_host
         )
 
+        # Persist new_conversations
+        try:
+            logger.info(f"Saving new_conversations log to {self.args.source_dir}/.cache/logs")
+            project_root = self.args.source_dir
+            json_text = json.dumps(new_conversations, ensure_ascii=False)
+            save_formatted_log(project_root, json_text, "rag_conversation")
+        except Exception as e:
+            logger.warning(f"Failed to save new_conversations log: {e}")
+
         chunks = target_llm.stream_chat_oai(
             conversations=new_conversations,
             model=model,
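
save_formatted_log is new in this release (autocoder/common/save_formatted_log.py, +54 lines in the list above). Only the call shape is visible here; a sketch mirroring it with hypothetical data (per the log line, output lands under <source_dir>/.cache/logs):

import json
from autocoder.common.save_formatted_log import save_formatted_log

# Hypothetical conversation payload; mirrors the call site in long_context_rag.py.
conversations = [
    {"role": "user", "content": "How does cache invalidation work here?"},
    {"role": "assistant", "content": "..."},
]
save_formatted_log(
    "/path/to/project",                             # hypothetical project root
    json.dumps(conversations, ensure_ascii=False),  # serialized payload
    "rag_conversation",                             # log name, as in the diff
)
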

autocoder/rag/qa_conversation_strategy.py CHANGED
@@ -2,6 +2,7 @@ from abc import ABC, abstractmethod
 from typing import List, Dict, Any, Generator
 import byzerllm
 from autocoder.common import AutoCoderArgs
+from autocoder.common.rulefiles.autocoderrules_utils import get_rules
 
 class QAConversationStrategy(ABC):
     """
@@ -124,22 +125,8 @@ class MultiRoundStrategy(QAConversationStrategy):
         {% endfor %}
         {% endif %}
 
-        """
-
-        import os
-        extra_docs = {}
-        rules_dir = os.path.join(self.args.source_dir, ".autocoderrules")
-        if os.path.isdir(rules_dir):
-            for fname in os.listdir(rules_dir):
-                if fname.endswith(".md"):
-                    fpath = os.path.join(rules_dir, fname)
-                    try:
-                        with open(fpath, "r", encoding="utf-8") as f:
-                            content = f.read()
-                        key = os.path.splitext(fname)[0]
-                        extra_docs[key] = content
-                    except Exception:
-                        continue
+        """
+        extra_docs = get_rules()
         return {"extra_docs": extra_docs}
 
 class SingleRoundStrategy(QAConversationStrategy):
@@ -253,21 +240,8 @@ class SingleRoundStrategy(QAConversationStrategy):
         {% endfor %}
         {% endif %}
 
-        """
-        import os
-        extra_docs = {}
-        rules_dir = os.path.join(getattr(self, 'args', None).source_dir if getattr(self, 'args', None) else ".", ".autocoderrules")
-        if os.path.isdir(rules_dir):
-            for fname in os.listdir(rules_dir):
-                if fname.endswith(".md"):
-                    fpath = os.path.join(rules_dir, fname)
-                    try:
-                        with open(fpath, "r", encoding="utf-8") as f:
-                            content = f.read()
-                        key = os.path.splitext(fname)[0]
-                        extra_docs[key] = content
-                    except Exception:
-                        continue
+        """
+        extra_docs = extra_docs = get_rules()
         return {"extra_docs": extra_docs}
 
 def get_qa_strategy(args: AutoCoderArgs) -> QAConversationStrategy:
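
Both strategies now delegate rule loading to the new autocoder.common.rulefiles package instead of walking .autocoderrules by hand (the doubled extra_docs = extra_docs = in the single-round hunk is redundant but harmless). Judging from the loop it replaces, get_rules() presumably returns the same {rule_name: markdown_content} mapping; a sketch of the equivalent behavior under that assumption:

import os

def get_rules_equivalent(source_dir: str = ".") -> dict:
    # Assumed behavior of autocoder.common.rulefiles.autocoderrules_utils.get_rules,
    # reconstructed from the inline loop it replaces above; not the actual implementation.
    extra_docs = {}
    rules_dir = os.path.join(source_dir, ".autocoderrules")
    if os.path.isdir(rules_dir):
        for fname in os.listdir(rules_dir):
            if fname.endswith(".md"):
                try:
                    with open(os.path.join(rules_dir, fname), "r", encoding="utf-8") as f:
                        extra_docs[os.path.splitext(fname)[0]] = f.read()
                except Exception:
                    continue  # skip unreadable rule files
    return extra_docs
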

autocoder/rag/utils.py CHANGED
@@ -2,8 +2,9 @@ from autocoder.common import SourceCode
 from autocoder.rag.token_counter import count_tokens_worker, count_tokens
 from autocoder.rag.loaders.pdf_loader import extract_text_from_pdf
 from autocoder.rag.loaders.docx_loader import extract_text_from_docx
-from autocoder.rag.loaders.excel_loader import extract_text_from_excel
+from autocoder.rag.loaders.excel_loader import extract_text_from_excel
 from autocoder.rag.loaders.ppt_loader import extract_text_from_ppt
+from autocoder.rag.loaders.image_loader import ImageLoader
 from typing import List, Tuple, Optional, Union
 import time
 from loguru import logger
@@ -21,7 +22,7 @@ def process_file_in_multi_process(
     llm = get_single_llm(llm,product_mode)
 
     start_time = time.time()
-    file_path, relative_path, _, _ = file_info
+    file_path, relative_path, _, _ = file_info
     try:
         if file_path.endswith(".pdf"):
             content = extract_text_from_pdf(file_path, llm, product_mode)
@@ -61,6 +62,15 @@ def process_file_in_multi_process(
                     tokens=count_tokens_worker(content),
                 )
             ]
+        elif file_path.lower().endswith((".png", ".jpg", ".jpeg", ".bmp", ".gif")):
+            content = ImageLoader.image_to_markdown(file_path, llm=llm, product_mode=product_mode)
+            v = [
+                SourceCode(
+                    module_name=f"##File: {file_path}",
+                    source_code=content,
+                    tokens=count_tokens_worker(content),
+                )
+            ]
         else:
             with open(file_path, "r", encoding="utf-8") as f:
                 content = f.read()
@@ -126,6 +136,15 @@ def process_file_local(
                     tokens=count_tokens(content),
                 )
             ]
+        elif file_path.lower().endswith((".png", ".jpg", ".jpeg", ".bmp", ".gif")):
+            content = ImageLoader.image_to_markdown(file_path, llm=llm, product_mode=product_mode)
+            v = [
+                SourceCode(
+                    module_name=f"##File: {file_path}",
+                    source_code=content,
+                    tokens=count_tokens(content),
+                )
+            ]
         else:
             with open(file_path, "r", encoding="utf-8") as f:
                 content = f.read()
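
With these branches, both process_file_in_multi_process and process_file_local route common image extensions through ImageLoader.image_to_markdown, so images are indexed for RAG like PDFs or DOCX files. The dispatch, reduced to a sketch (loader imports as in the diff; llm is optional):

IMAGE_EXTS = (".png", ".jpg", ".jpeg", ".bmp", ".gif")

def load_content(file_path: str, llm=None, product_mode: str = "lite") -> str:
    # Mirrors the branch order in autocoder/rag/utils.py after this diff;
    # extract_text_from_pdf / ImageLoader are the real loaders used above.
    from autocoder.rag.loaders.pdf_loader import extract_text_from_pdf
    from autocoder.rag.loaders.image_loader import ImageLoader
    if file_path.endswith(".pdf"):
        return extract_text_from_pdf(file_path, llm, product_mode)
    if file_path.lower().endswith(IMAGE_EXTS):
        return ImageLoader.image_to_markdown(file_path, llm=llm, product_mode=product_mode)
    # Fallback: treat everything else as plain text
    with open(file_path, "r", encoding="utf-8") as f:
        return f.read()
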