auto-coder 0.1.353__py3-none-any.whl → 0.1.355__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of auto-coder might be problematic.
- {auto_coder-0.1.353.dist-info → auto_coder-0.1.355.dist-info}/METADATA +1 -1
- {auto_coder-0.1.353.dist-info → auto_coder-0.1.355.dist-info}/RECORD +60 -45
- autocoder/agent/agentic_filter.py +1 -1
- autocoder/auto_coder.py +8 -0
- autocoder/auto_coder_rag.py +37 -1
- autocoder/auto_coder_runner.py +58 -77
- autocoder/chat/conf_command.py +270 -0
- autocoder/chat/models_command.py +485 -0
- autocoder/chat_auto_coder.py +29 -24
- autocoder/chat_auto_coder_lang.py +26 -2
- autocoder/commands/auto_command.py +60 -132
- autocoder/commands/auto_web.py +1 -1
- autocoder/commands/tools.py +1 -1
- autocoder/common/__init__.py +3 -1
- autocoder/common/command_completer.py +58 -12
- autocoder/common/command_completer_v2.py +576 -0
- autocoder/common/conversations/__init__.py +52 -0
- autocoder/common/conversations/compatibility.py +303 -0
- autocoder/common/conversations/conversation_manager.py +502 -0
- autocoder/common/conversations/example.py +152 -0
- autocoder/common/file_monitor/__init__.py +5 -0
- autocoder/common/file_monitor/monitor.py +383 -0
- autocoder/common/global_cancel.py +53 -16
- autocoder/common/ignorefiles/__init__.py +4 -0
- autocoder/common/ignorefiles/ignore_file_utils.py +103 -0
- autocoder/common/ignorefiles/test_ignore_file_utils.py +91 -0
- autocoder/common/rulefiles/__init__.py +15 -0
- autocoder/common/rulefiles/autocoderrules_utils.py +173 -0
- autocoder/common/save_formatted_log.py +54 -0
- autocoder/common/v2/agent/agentic_edit.py +10 -39
- autocoder/common/v2/agent/agentic_edit_tools/list_files_tool_resolver.py +1 -1
- autocoder/common/v2/agent/agentic_edit_tools/search_files_tool_resolver.py +73 -43
- autocoder/common/v2/code_agentic_editblock_manager.py +9 -9
- autocoder/common/v2/code_diff_manager.py +2 -2
- autocoder/common/v2/code_editblock_manager.py +31 -18
- autocoder/common/v2/code_strict_diff_manager.py +3 -2
- autocoder/dispacher/actions/action.py +6 -6
- autocoder/dispacher/actions/plugins/action_regex_project.py +2 -2
- autocoder/events/event_manager_singleton.py +1 -1
- autocoder/index/index.py +3 -3
- autocoder/models.py +22 -9
- autocoder/rag/api_server.py +14 -2
- autocoder/rag/cache/local_byzer_storage_cache.py +1 -1
- autocoder/rag/cache/local_duckdb_storage_cache.py +8 -0
- autocoder/rag/cache/simple_cache.py +63 -33
- autocoder/rag/loaders/docx_loader.py +1 -1
- autocoder/rag/loaders/filter_utils.py +133 -76
- autocoder/rag/loaders/image_loader.py +15 -3
- autocoder/rag/loaders/pdf_loader.py +2 -2
- autocoder/rag/long_context_rag.py +11 -0
- autocoder/rag/qa_conversation_strategy.py +5 -31
- autocoder/rag/utils.py +21 -2
- autocoder/utils/_markitdown.py +66 -25
- autocoder/utils/auto_coder_utils/chat_stream_out.py +4 -4
- autocoder/utils/thread_utils.py +9 -27
- autocoder/version.py +1 -1
- {auto_coder-0.1.353.dist-info → auto_coder-0.1.355.dist-info}/LICENSE +0 -0
- {auto_coder-0.1.353.dist-info → auto_coder-0.1.355.dist-info}/WHEEL +0 -0
- {auto_coder-0.1.353.dist-info → auto_coder-0.1.355.dist-info}/entry_points.txt +0 -0
- {auto_coder-0.1.353.dist-info → auto_coder-0.1.355.dist-info}/top_level.txt +0 -0
autocoder/rag/cache/simple_cache.py CHANGED

```diff
@@ -24,6 +24,7 @@ from .failed_files_utils import load_failed_files, save_failed_files
 from autocoder.common import AutoCoderArgs
 from byzerllm import SimpleByzerLLM, ByzerLLM
 from autocoder.utils.llms import get_llm_names
+from autocoder.common.file_monitor.monitor import get_file_monitor, Change
 
 
 default_ignore_dirs = [
@@ -50,7 +51,7 @@ def generate_content_md5(content: Union[str, bytes]) -> str:
 
 
 class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
-    def __init__(self, path: str, ignore_spec, required_exts: list,
+    def __init__(self, path: str, ignore_spec, required_exts: list, args: Optional[AutoCoderArgs] = None, llm: Optional[Union[ByzerLLM, SimpleByzerLLM, str]] = None):
         """
         Initialize the asynchronous update queue that manages the code-file cache.
 
@@ -58,7 +59,8 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
         path: root directory of the codebase to index
         ignore_spec: rules describing which files/directories should be ignored
         required_exts: list of file extensions to process
-
+        args: AutoCoderArgs object carrying the configuration
+        llm: LLM instance used for code analysis
 
         Cache structure (self.cache):
         self.cache is a dictionary structured as follows:
@@ -99,7 +101,6 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
         self.args = args
         self.llm = llm
         self.product_mode = args.product_mode or "lite"
-        self.update_interval = update_interval
         self.queue = []
         self.cache = {}  # starts empty; populated later via read_cache()
         self.lock = threading.Lock()
@@ -115,10 +116,16 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
         self.queue_thread.daemon = True
         self.queue_thread.start()
 
-        #
-        self.
-
-        self.
+        # Register the file-monitor callback.
+        self.file_monitor = get_file_monitor(self.path)
+        # Register on the root directory so changes to all subdirectories and files are captured.
+        self.file_monitor.register(self.path, self._on_file_change)
+        # Make sure the monitor is running.
+        if not self.file_monitor.is_running():
+            self.file_monitor.start()
+            logger.info(f"Started file monitor for {self.path}")
+        else:
+            logger.info(f"File monitor already running for {self.path}")
 
         self.cache = self.read_cache()
 
@@ -130,37 +137,57 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
                 logger.error(f"Error in process_queue: {e}")
                 time.sleep(1)  # avoid checking too frequently
 
-    def
-        """
-
-
-
-
-
-
-
-
-
-
-
+    def _on_file_change(self, change_type: Change, file_path: str):
+        """
+        File-monitor callback; triggers an update when a file changes.
+
+        Args:
+            change_type: type of change (Change.added, Change.modified, Change.deleted)
+            file_path: path of the file that changed
+        """
+        try:
+            # Skip triggering if the cache has not been initialized yet.
+            if not self.cache:
+                return
+
+            # Skip files whose extension is not in the required list.
+            if self.required_exts and not any(file_path.endswith(ext) for ext in self.required_exts):
+                return
+
+            # Skip files matched by the ignore rules.
+            if self.ignore_spec and self.ignore_spec.match_file(os.path.relpath(file_path, self.path)):
+                return
+
+            logger.info(f"File change detected: {change_type} - {file_path}")
+            self.trigger_update()
+        except Exception as e:
+            logger.error(f"Error in file change handler: {e}")
+            logger.exception(e)
 
     def stop(self):
         self.stop_event.set()
-
-
+        # Unregister the file-monitor callback.
+        try:
+            self.file_monitor.unregister(self.path, self._on_file_change)
+            logger.info(f"Unregistered file monitor callback for {self.path}")
+        except Exception as e:
+            logger.error(f"Error unregistering file monitor callback: {e}")
+        # Only wait for the queue-processing thread to finish.
+        if hasattr(self, 'queue_thread') and self.queue_thread.is_alive():
+            self.queue_thread.join(timeout=2.0)
 
     def fileinfo_to_tuple(self, file_info: FileInfo) -> Tuple[str, str, float, str]:
         return (file_info.file_path, file_info.relative_path, file_info.modify_time, file_info.file_md5)
 
     def __del__(self):
+        # Make sure monitoring is stopped and resources are cleaned up when the object is destroyed.
         self.stop()
 
     def load_first(self):
         with self.lock:
             if self.cache:
                 return
-            files_to_process = []
+            files_to_process = []
             for file_info in self.get_all_files():
                 file_path, _, modify_time, file_md5 = file_info
                 if (
@@ -175,7 +202,7 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
             #     [process_file.remote(file_info) for file_info in files_to_process]
             # )
             from autocoder.rag.token_counter import initialize_tokenizer
-            llm_name = get_llm_names(self.llm)[0] if self.llm else None
+            llm_name = get_llm_names(self.llm)[0] if self.llm else None
             with Pool(
                 processes=os.cpu_count(),
                 initializer=initialize_tokenizer,
@@ -184,8 +211,8 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
 
                 worker_func = functools.partial(
                     process_file_in_multi_process, llm=llm_name, product_mode=self.product_mode)
-                results = pool.map(worker_func, files_to_process)
-
+                results = pool.map(worker_func, files_to_process)
+
                 for file_info, result in zip(files_to_process, results):
                     if result:  # update the cache only when result is non-empty
                         self.update_cache(file_info, result)
@@ -203,16 +230,15 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
             file_path, relative_path, modify_time, file_md5 = file_info
             current_files.add(file_path)
             # If the file failed to parse before, skip it in this incremental update.
-            if file_path in self.failed_files:
-                # logger.info(f"File {file_path} failed to parse before; skipping this update")
+            if file_path in self.failed_files:
                 continue
-            # change detection
+            # change detection
             if (
                 file_path not in self.cache
                 or self.cache[file_path].get("md5", "") != file_md5
             ):
                 files_to_process.append(
-                    (file_path, relative_path, modify_time, file_md5))
+                    (file_path, relative_path, modify_time, file_md5))
 
         deleted_files = set(self.cache.keys()) - current_files
         logger.info(f"files_to_process: {files_to_process}")
@@ -289,6 +315,8 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
                 for line in f:
                     data = json.loads(line)
                     cache[data["file_path"]] = data
+        else:
+            self.load_first()
         return cache
 
     def write_cache(self):
@@ -366,6 +394,9 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
             dirs[:] = [d for d in dirs if not d.startswith(
                 ".") and d not in default_ignore_dirs]
 
+            # Filter out files that start with a dot
+            files[:] = [f for f in files if not f.startswith(".")]
+
             if self.ignore_spec:
                 relative_root = os.path.relpath(root, self.path)
                 dirs[:] = [
@@ -390,6 +421,5 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
                 modify_time = os.path.getmtime(file_path)
                 file_md5 = generate_file_md5(file_path)
                 all_files.append(
-                    (file_path, relative_path, modify_time, file_md5))
-
+                    (file_path, relative_path, modify_time, file_md5))
         return all_files
```
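The net effect of this file's changes: cache refreshing is now event-driven. The interval-based `update_interval` field is gone, and the queue registers a callback with the new `file_monitor` package so that adds, edits, and deletions trigger `trigger_update()` directly. Below is a minimal sketch of the same registration pattern; it uses only the monitor calls visible in the diff (`get_file_monitor`, `register`, `is_running`, `start`, `unregister`), while the `CacheWatcher` class itself is a hypothetical consumer, not part of auto-coder.

```python
# Sketch of the callback-registration pattern adopted above.
from autocoder.common.file_monitor.monitor import get_file_monitor, Change

class CacheWatcher:
    def __init__(self, root: str, required_exts: list):
        self.root = root
        self.required_exts = required_exts
        # One registration on the root is enough: the monitor reports changes
        # for every file and subdirectory beneath it.
        self.monitor = get_file_monitor(root)
        self.monitor.register(root, self.on_change)
        if not self.monitor.is_running():
            self.monitor.start()

    def on_change(self, change_type: Change, file_path: str) -> None:
        # Mirror _on_file_change: ignore extensions outside the allow-list.
        if self.required_exts and not any(file_path.endswith(e) for e in self.required_exts):
            return
        print(f"{change_type}: {file_path}")  # the real code calls trigger_update() here

    def close(self) -> None:
        # Mirror stop(): drop the callback so the shared monitor can outlive us.
        self.monitor.unregister(self.root, self.on_change)
```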
autocoder/rag/loaders/docx_loader.py CHANGED

```diff
@@ -1,5 +1,4 @@
 from io import BytesIO
-from autocoder.utils._markitdown import MarkItDown
 import traceback
 
 def extract_text_from_docx_old(docx_path):
@@ -13,6 +12,7 @@ def extract_text_from_docx_old(docx_path):
 
 def extract_text_from_docx(docx_path):
     try:
+        from autocoder.utils._markitdown import MarkItDown
         md_converter = MarkItDown()
         result = md_converter.convert(docx_path)
         return result.text_content
```
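The loader now defers the `MarkItDown` import into the function body. The likely motivation (an inference; the diff only shows the import moving) is to keep importing `docx_loader` cheap and to confine a failure of the heavy `_markitdown` dependency to the `try` block that already guards conversion. The pattern in isolation:

```python
# Sketch of the deferred-import pattern used in extract_text_from_docx.
# MarkItDown is only imported when the function runs, so a broken or slow
# optional dependency fails inside the existing try/except, not at import time.
def extract_text(path: str) -> str:
    try:
        from autocoder.utils._markitdown import MarkItDown  # deferred import
        return MarkItDown().convert(path).text_content
    except Exception:
        import traceback
        traceback.print_exc()
        return ""
```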
autocoder/rag/loaders/filter_utils.py CHANGED

```diff
@@ -1,106 +1,163 @@
-
 import os
 import json
-
+import threading
+from typing import Dict, Optional, List
 from loguru import logger
+from functools import lru_cache
 
 class FilterRuleManager:
     '''
+    Singleton filter-rule manager. Supports per-file-type filter rules.
+
+    Supported rule format:
     {
-    "
-    "
-    "
-
-    "
-    "
-    "
-
+        "image": {
+            "whitelist": ["*.png", "*.jpg"],
+            "blacklist": ["*/private/*"]
+        },
+        "document": {
+            "whitelist": ["*.pdf", "*.docx"],
+            "blacklist": ["*/tmp/*"]
+        },
+        "default": {
+            "whitelist": [],
+            "blacklist": ["*/node_modules/*", "*/.*"]
         }
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    }
+    '''
+    _instance = None
+    _lock = threading.RLock()  # re-entrant lock to avoid deadlocks
+
+    def __new__(cls, *args, **kwargs):
+        if cls._instance is None:
+            with cls._lock:
+                if cls._instance is None:  # double-checked locking
+                    cls._instance = super(FilterRuleManager, cls).__new__(cls)
+                    cls._instance._initialized = False
+        return cls._instance
+
+    @classmethod
+    def get_instance(cls):
+        return cls()  # goes through __new__ directly; no extra locking needed
+
+    def __init__(self):
+        with self._lock:
+            if hasattr(self, '_initialized') and self._initialized:
+                return
+
+            self.source_dir = os.getcwd()
+            self.filter_rules_path = os.path.join(self.source_dir, ".cache", "filterrules")
+            self._cache_rules: Optional[Dict] = None
+            self._cache_mtime: Optional[float] = None
+            self._rule_lock = threading.RLock()  # separate lock for rule access
+            self._initialized = True
 
     def load_filter_rules(self) -> Dict:
-
-
-        except Exception:
-            current_mtime = None
-
+        # First check whether a reload is needed, without holding the lock.
+        current_mtime = self._get_file_mtime()
         need_reload = False
 
-
-        if FilterRuleManager._cache_rules is None:
+        if self._cache_rules is None:
             need_reload = True
-        elif current_mtime is not None and
+        elif current_mtime is not None and self._cache_mtime != current_mtime:
             need_reload = True
 
+        # Acquire the lock only when a reload is needed.
         if need_reload:
-
-
-
-
-
-
-
-
-
-
+            with self._rule_lock:
+                # Double-check to avoid duplicate loads across threads.
+                current_mtime = self._get_file_mtime()
+                if self._cache_rules is None or (current_mtime is not None and self._cache_mtime != current_mtime):
+                    self._load_rules_from_file(current_mtime)
+
+        # Return a copy of the rules so external mutation cannot affect the cache.
+        with self._rule_lock:
+            return self._cache_rules.copy() if self._cache_rules else self._get_default_rules()
+
+    def _get_file_mtime(self) -> Optional[float]:
+        """Get the rules file's modification time; the IO-related work is factored out here."""
+        try:
+            return os.path.getmtime(self.filter_rules_path) if os.path.exists(self.filter_rules_path) else None
+        except Exception:
+            logger.warning(f"Failed to get mtime for {self.filter_rules_path}")
+            return None
+
+    def _get_default_rules(self) -> Dict:
+        """Return the default rule structure."""
+        return {
+            "default": {
+                "whitelist": [],
+                "blacklist": []
+            }
+        }
 
-    def
+    def _load_rules_from_file(self, current_mtime: Optional[float]) -> None:
+        """Load rules from the file; call only while holding the lock."""
+        self._cache_rules = self._get_default_rules()
+        try:
+            if os.path.exists(self.filter_rules_path):
+                with open(self.filter_rules_path, "r", encoding="utf-8") as f:
+                    file_rules = json.load(f)
+
+                # Convert old-format rules to the new format (if needed).
+                if "whitelist" in file_rules or "blacklist" in file_rules:
+                    # Old format is converted to the new format.
+                    self._cache_rules = {
+                        "default": {
+                            "whitelist": file_rules.get("whitelist", []),
+                            "blacklist": file_rules.get("blacklist", [])
+                        }
+                    }
+                    logger.info("Converted old format rules to new format")
+                else:
+                    # New format is used as-is.
+                    self._cache_rules = file_rules
+            self._cache_mtime = current_mtime
+        except Exception as e:
+            logger.warning(f"Failed to load filterrules: {e}")
+
+    @lru_cache(maxsize=1024)  # cache frequently used path decisions
+    def should_parse_file(self, file_path: str, file_type: str = "default") -> bool:
         """
-
-
-
-
-
-
+        Decide whether a given file should be parsed.
+
+        Args:
+            file_path: path of the file
+            file_type: file type (e.g. "image", "document"); defaults to "default"
+
         Returns:
             True means the file should be parsed
             False means it should not be parsed
         """
         import fnmatch
-
-
+
         rules = self.load_filter_rules()
-
-
-
-
-
-
-            return fnmatch.fnmatch(path, pat)
-        elif pattern.startswith("regex:"):
-            pat = pattern[len("regex:"):]
-            try:
-                return re.search(pat, path) is not None
-            except re.error:
-                logger.warning(f"Invalid regex pattern: {pat}")
-                return False
-        else:
-            # treated as glob by default
-            return fnmatch.fnmatch(path, pattern)
-
+
+        # Get the rules for the given type, falling back to the default rules.
+        type_rules = rules.get(file_type, rules.get("default", {"whitelist": [], "blacklist": []}))
+        whitelist = type_rules.get("whitelist", [])
+        blacklist = type_rules.get("blacklist", [])
+
         # Match the blacklist first.
         for pattern in blacklist:
-            if
+            if fnmatch.fnmatch(file_path, pattern):
                 return False
-
-        #
+
+        # An empty whitelist means every file passes (unless blacklisted).
+        if not whitelist:
+            return True
+
+        # Match the whitelist.
         for pattern in whitelist:
-            if
+            if fnmatch.fnmatch(file_path, pattern):
                 return True
-
-        #
+
+        # A whitelist exists but nothing matched: do not parse.
         return False
+
+    # Kept for backward compatibility.
+    def should_parse_image(self, file_path: str) -> bool:
+        """
+        Decide whether an image file should be parsed (legacy API compatibility).
+        """
+        return self.should_parse_file(file_path, "image")
```
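Because the manager reads plain JSON from `<cwd>/.cache/filterrules`, the new per-type format can be exercised end to end. A usage sketch, assuming the process runs from the project root (the manager resolves the path via `os.getcwd()`); the patterns and paths are examples:

```python
import json
import os

from autocoder.rag.loaders.filter_utils import FilterRuleManager

# Write a rules file in the new per-type format (patterns are examples).
os.makedirs(".cache", exist_ok=True)
rules = {
    "image": {"whitelist": ["*.png", "*.jpg"], "blacklist": ["*/private/*"]},
    "default": {"whitelist": [], "blacklist": ["*/node_modules/*"]},
}
with open(os.path.join(".cache", "filterrules"), "w", encoding="utf-8") as f:
    json.dump(rules, f)

manager = FilterRuleManager.get_instance()
print(manager.should_parse_file("assets/logo.png", "image"))   # True: whitelisted
print(manager.should_parse_file("a/private/x.png", "image"))   # False: blacklisted
print(manager.should_parse_image("assets/logo.png"))           # legacy wrapper, True
```

One caveat worth noting: `@lru_cache` on a bound method keys results by `(self, file_path, file_type)` and holds them for the life of the process, so a later edit to `filterrules` refreshes `load_filter_rules` via the mtime check but not decisions `should_parse_file` has already cached.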
autocoder/rag/loaders/image_loader.py CHANGED

```diff
@@ -538,7 +538,7 @@ class ImageLoader:
     def image_to_markdown(
         image_path: str,
         llm,
-        engine: str = "
+        engine: str = "paddle",
         product_mode: str = "lite",
         paddle_kwargs: dict = None
     ) -> str:
@@ -554,6 +554,13 @@ class ImageLoader:
         Returns:
             the markdown content string
         """
+        logger.info(f"image_path: {image_path} engine: {engine} product_mode: {product_mode} paddle_kwargs: {paddle_kwargs}")
+
+        # New: if the engine is paddle and PaddleOCR is None, return an empty string directly.
+        if engine == "paddle" and PaddleOCR is None:
+            logger.warning("PaddleOCR is not installed; image content cannot be extracted, returning an empty string.")
+            return ""
+
         md_content = ImageLoader.extract_text_from_image(
             image_path,
             llm,
@@ -561,8 +568,13 @@ class ImageLoader:
             product_mode=product_mode,
             paddle_kwargs=paddle_kwargs
         )
-
-
+
+        # Get directory and filename separately
+        dir_name = os.path.dirname(image_path)
+        file_name = os.path.basename(image_path)
+        base_name = os.path.splitext(file_name)[0]
+        # Create new path with dot before filename
+        md_path = os.path.join(dir_name, f".{base_name}.md")
         try:
             with open(md_path, "w", encoding="utf-8") as f:
                 f.write(md_content)
```
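The extracted markdown is now written as a dot-prefixed sidecar next to the image. That naming lines up with the new dot-file filtering in `simple_cache.py`'s `get_all_files` above, so the sidecar itself is not re-indexed (an inference from the two changes; the diff does not state the motivation). The derivation with an example path:

```python
import os

image_path = "/data/docs/diagram.png"  # example path
dir_name = os.path.dirname(image_path)
base_name = os.path.splitext(os.path.basename(image_path))[0]
md_path = os.path.join(dir_name, f".{base_name}.md")
print(md_path)  # /data/docs/.diagram.md
```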
autocoder/rag/loaders/pdf_loader.py CHANGED

```diff
@@ -1,6 +1,5 @@
 from io import BytesIO
 from pypdf import PdfReader
-from autocoder.utils._markitdown import MarkItDown
 import traceback
 
 
@@ -15,7 +14,8 @@ def extract_text_from_pdf_old(file_path):
     return text
 
 def extract_text_from_pdf(file_path, llm=None, product_mode="lite"):
-    try:
+    try:
+        from autocoder.utils._markitdown import MarkItDown
         md_converter = MarkItDown(llm=llm, product_mode=product_mode)
         result = md_converter.convert(file_path)
         return result.text_content
```
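Same deferral as in `docx_loader.py`, with one difference: the PDF path threads the LLM and product mode into `MarkItDown`. A call sketch using the signature shown above; passing `llm=None` assumes the converter tolerates running without a model, which this diff does not confirm:

```python
from autocoder.rag.loaders.pdf_loader import extract_text_from_pdf

# Converts via MarkItDown inside the function's try block; the except path
# is not shown in this diff, so the failure value is not guaranteed here.
text = extract_text_from_pdf("/path/to/report.pdf", llm=None, product_mode="lite")
print((text or "")[:200])
```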
autocoder/rag/long_context_rag.py CHANGED

```diff
@@ -41,6 +41,8 @@ from autocoder.rag.qa_conversation_strategy import get_qa_strategy
 from autocoder.rag.searchable import SearchableResults
 from autocoder.rag.conversation_to_queries import extract_search_queries
 from autocoder.common import openai_content as OpenAIContentProcessor
+from autocoder.common.save_formatted_log import save_formatted_log
+import json, os
 try:
     from autocoder_pro.rag.llm_compute import LLMComputeEngine
     pro_version = version("auto-coder-pro")
@@ -849,6 +851,15 @@ class LongContextRAG:
                 conversations=conversations, local_image_host=self.args.local_image_host
             )
 
+            # Save new_conversations.
+            try:
+                logger.info(f"Saving new_conversations log to {self.args.source_dir}/.cache/logs")
+                project_root = self.args.source_dir
+                json_text = json.dumps(new_conversations, ensure_ascii=False)
+                save_formatted_log(project_root, json_text, "rag_conversation")
+            except Exception as e:
+                logger.warning(f"Failed to save new_conversations log: {e}")
+
             chunks = target_llm.stream_chat_oai(
                 conversations=new_conversations,
                 model=model,
```
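The new block is deliberately best-effort: a failure to write the log is downgraded to a warning and never interrupts the streaming chat call that follows. A minimal sketch of the hook itself; the `(project_root, text, name)` argument order is taken from the call site, and the exact file layout under `<project_root>/.cache/logs` is inferred from the log message:

```python
import json

from autocoder.common.save_formatted_log import save_formatted_log

conversations = [{"role": "user", "content": "What changed in 0.1.355?"}]  # example payload
json_text = json.dumps(conversations, ensure_ascii=False)
save_formatted_log("/path/to/project", json_text, "rag_conversation")
```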
autocoder/rag/qa_conversation_strategy.py CHANGED

```diff
@@ -2,6 +2,7 @@ from abc import ABC, abstractmethod
 from typing import List, Dict, Any, Generator
 import byzerllm
 from autocoder.common import AutoCoderArgs
+from autocoder.common.rulefiles.autocoderrules_utils import get_rules
 
 class QAConversationStrategy(ABC):
     """
@@ -124,22 +125,8 @@ class MultiRoundStrategy(QAConversationStrategy):
         {% endfor %}
         {% endif %}
 
-        """
-
-        import os
-        extra_docs = {}
-        rules_dir = os.path.join(self.args.source_dir, ".autocoderrules")
-        if os.path.isdir(rules_dir):
-            for fname in os.listdir(rules_dir):
-                if fname.endswith(".md"):
-                    fpath = os.path.join(rules_dir, fname)
-                    try:
-                        with open(fpath, "r", encoding="utf-8") as f:
-                            content = f.read()
-                        key = os.path.splitext(fname)[0]
-                        extra_docs[key] = content
-                    except Exception:
-                        continue
+        """
+        extra_docs = get_rules()
         return {"extra_docs": extra_docs}
 
 class SingleRoundStrategy(QAConversationStrategy):
@@ -253,21 +240,8 @@ class SingleRoundStrategy(QAConversationStrategy):
         {% endfor %}
         {% endif %}
 
-        """
-
-        extra_docs = {}
-        rules_dir = os.path.join(getattr(self, 'args', None).source_dir if getattr(self, 'args', None) else ".", ".autocoderrules")
-        if os.path.isdir(rules_dir):
-            for fname in os.listdir(rules_dir):
-                if fname.endswith(".md"):
-                    fpath = os.path.join(rules_dir, fname)
-                    try:
-                        with open(fpath, "r", encoding="utf-8") as f:
-                            content = f.read()
-                        key = os.path.splitext(fname)[0]
-                        extra_docs[key] = content
-                    except Exception:
-                        continue
+        """
+        extra_docs = extra_docs = get_rules()
         return {"extra_docs": extra_docs}
 
 def get_qa_strategy(args: AutoCoderArgs) -> QAConversationStrategy:
```
autocoder/rag/utils.py CHANGED

```diff
@@ -2,8 +2,9 @@ from autocoder.common import SourceCode
 from autocoder.rag.token_counter import count_tokens_worker, count_tokens
 from autocoder.rag.loaders.pdf_loader import extract_text_from_pdf
 from autocoder.rag.loaders.docx_loader import extract_text_from_docx
-from autocoder.rag.loaders.excel_loader import extract_text_from_excel
+from autocoder.rag.loaders.excel_loader import extract_text_from_excel
 from autocoder.rag.loaders.ppt_loader import extract_text_from_ppt
+from autocoder.rag.loaders.image_loader import ImageLoader
 from typing import List, Tuple, Optional, Union
 import time
 from loguru import logger
@@ -21,7 +22,7 @@ def process_file_in_multi_process(
     llm = get_single_llm(llm,product_mode)
 
     start_time = time.time()
-    file_path, relative_path, _, _ = file_info
+    file_path, relative_path, _, _ = file_info
     try:
         if file_path.endswith(".pdf"):
             content = extract_text_from_pdf(file_path, llm, product_mode)
@@ -61,6 +62,15 @@ def process_file_in_multi_process(
                     tokens=count_tokens_worker(content),
                 )
             ]
+        elif file_path.lower().endswith((".png", ".jpg", ".jpeg", ".bmp", ".gif")):
+            content = ImageLoader.image_to_markdown(file_path, llm=llm, product_mode=product_mode)
+            v = [
+                SourceCode(
+                    module_name=f"##File: {file_path}",
+                    source_code=content,
+                    tokens=count_tokens_worker(content),
+                )
+            ]
         else:
             with open(file_path, "r", encoding="utf-8") as f:
                 content = f.read()
@@ -126,6 +136,15 @@ def process_file_local(
                     tokens=count_tokens(content),
                 )
             ]
+        elif file_path.lower().endswith((".png", ".jpg", ".jpeg", ".bmp", ".gif")):
+            content = ImageLoader.image_to_markdown(file_path, llm=llm, product_mode=product_mode)
+            v = [
+                SourceCode(
+                    module_name=f"##File: {file_path}",
+                    source_code=content,
+                    tokens=count_tokens(content),
+                )
+            ]
         else:
             with open(file_path, "r", encoding="utf-8") as f:
                 content = f.read()
```
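With this change, raster images flow through the same `SourceCode` pipeline as PDFs and Office documents in both the multi-process and local paths. The dispatch itself is a case-insensitive extension check, isolated here for clarity (the constant name is ours):

```python
IMAGE_EXTS = (".png", ".jpg", ".jpeg", ".bmp", ".gif")

def is_image(file_path: str) -> bool:
    # str.endswith accepts a tuple, and lower() makes the match case-insensitive.
    return file_path.lower().endswith(IMAGE_EXTS)

print(is_image("Scan.PNG"))   # True
print(is_image("notes.txt"))  # False
```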