auto-coder 0.1.348__py3-none-any.whl → 0.1.350__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of auto-coder has been flagged as potentially problematic.
- {auto_coder-0.1.348.dist-info → auto_coder-0.1.350.dist-info}/METADATA +1 -1
- {auto_coder-0.1.348.dist-info → auto_coder-0.1.350.dist-info}/RECORD +35 -26
- autocoder/auto_coder_runner.py +14 -10
- autocoder/chat_auto_coder_lang.py +5 -3
- autocoder/common/model_speed_tester.py +392 -0
- autocoder/common/printer.py +7 -8
- autocoder/common/run_cmd.py +247 -0
- autocoder/common/test_run_cmd.py +110 -0
- autocoder/common/v2/agent/agentic_edit.py +61 -11
- autocoder/common/v2/agent/agentic_edit_conversation.py +9 -0
- autocoder/common/v2/agent/agentic_edit_tools/execute_command_tool_resolver.py +21 -36
- autocoder/common/v2/agent/agentic_edit_tools/list_files_tool_resolver.py +4 -7
- autocoder/common/v2/agent/agentic_edit_tools/search_files_tool_resolver.py +2 -5
- autocoder/helper/rag_doc_creator.py +141 -0
- autocoder/ignorefiles/__init__.py +4 -0
- autocoder/ignorefiles/ignore_file_utils.py +63 -0
- autocoder/ignorefiles/test_ignore_file_utils.py +91 -0
- autocoder/models.py +48 -8
- autocoder/rag/cache/byzer_storage_cache.py +10 -4
- autocoder/rag/cache/file_monitor_cache.py +27 -24
- autocoder/rag/cache/local_byzer_storage_cache.py +11 -5
- autocoder/rag/cache/local_duckdb_storage_cache.py +203 -128
- autocoder/rag/cache/simple_cache.py +56 -37
- autocoder/rag/loaders/filter_utils.py +106 -0
- autocoder/rag/loaders/image_loader.py +45 -23
- autocoder/rag/loaders/pdf_loader.py +3 -3
- autocoder/rag/loaders/test_image_loader.py +209 -0
- autocoder/rag/qa_conversation_strategy.py +3 -5
- autocoder/rag/utils.py +20 -9
- autocoder/utils/_markitdown.py +35 -0
- autocoder/version.py +1 -1
- {auto_coder-0.1.348.dist-info → auto_coder-0.1.350.dist-info}/LICENSE +0 -0
- {auto_coder-0.1.348.dist-info → auto_coder-0.1.350.dist-info}/WHEEL +0 -0
- {auto_coder-0.1.348.dist-info → auto_coder-0.1.350.dist-info}/entry_points.txt +0 -0
- {auto_coder-0.1.348.dist-info → auto_coder-0.1.350.dist-info}/top_level.txt +0 -0
```diff
--- a/autocoder/rag/cache/simple_cache.py
+++ b/autocoder/rag/cache/simple_cache.py
@@ -1,4 +1,5 @@
 from multiprocessing import Pool
+import functools
 from autocoder.common import SourceCode
 from autocoder.rag.cache.base_cache import (
     BaseCacheManager, DeleteEvent, AddOrUpdateEvent,
@@ -20,6 +21,9 @@ from autocoder.rag.utils import process_file_in_multi_process, process_file_local
 from autocoder.rag.variable_holder import VariableHolder
 import hashlib
 from .failed_files_utils import load_failed_files, save_failed_files
+from autocoder.common import AutoCoderArgs
+from byzerllm import SimpleByzerLLM, ByzerLLM
+from autocoder.utils.llms import get_llm_names
 
 
 default_ignore_dirs = [
@@ -46,16 +50,16 @@ def generate_content_md5(content: Union[str, bytes]) -> str:
 
 
 class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
-    def __init__(self, path: str, ignore_spec, required_exts: list, update_interval: int = 5, args=None, llm=None):
+    def __init__(self, path: str, ignore_spec, required_exts: list, update_interval: int = 5, args: Optional[AutoCoderArgs] = None, llm: Optional[Union[ByzerLLM, SimpleByzerLLM, str]] = None):
         """
         Initialize the async update queue that manages the cache of code files.
-
+
         Args:
             path: root directory of the codebase to index
             ignore_spec: rules specifying which files/directories should be ignored
             required_exts: list of file extensions to process
             update_interval: interval in seconds for automatically triggered updates (default: 5)
-
+
         Cache structure (self.cache):
         self.cache is a dict with the following structure:
         {
@@ -69,23 +73,23 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
             "file_path2": { ... },
             ...
         }
-
+
         The cache is persisted to the .cache/cache.jsonl file under the project root, in JSONL format.
         It is loaded from disk on every startup and updated asynchronously when files change.
-
+
         Source-code processing functions:
         Two key functions are used while updating the cache:
-
+
         1. process_file_in_multi_process: processes files in a multi-process pool
            - Args: file_info (tuple of file metadata)
            - Returns: List[SourceCode] or None
            - Purpose: process multiple files in parallel during the initial load
-
+
         2. process_file_local: processes a single file in the current process
            - Args: file_path (path of the file)
            - Returns: List[SourceCode] or None
            - Purpose: process a single file when an update is detected
-
+
         The SourceCode objects returned by these functions are serialized to dicts via model_dump()
         and stored in the cache's "content" field. If the result is empty, the cache update is skipped.
         """
@@ -94,6 +98,7 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
         self.required_exts = required_exts
         self.args = args
         self.llm = llm
+        self.product_mode = args.product_mode or "lite"
         self.update_interval = update_interval
         self.queue = []
         self.cache = {}  # initialized as an empty dict; filled later via read_cache()
@@ -101,7 +106,8 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
         self.stop_event = threading.Event()
 
         # Set of file paths that failed to parse
-        self.failed_files_path = os.path.join(self.path, ".cache", "failed_files.json")
+        self.failed_files_path = os.path.join(
+            self.path, ".cache", "failed_files.json")
         self.failed_files = load_failed_files(self.failed_files_path)
 
         # Start the thread that processes the queue
@@ -116,7 +122,6 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
 
         self.cache = self.read_cache()
 
-
     def _process_queue(self):
         while not self.stop_event.is_set():
             try:
@@ -124,12 +129,13 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
             except Exception as e:
                 logger.error(f"Error in process_queue: {e}")
             time.sleep(1)  # avoid polling too frequently
-
+
     def _periodic_update(self):
         """Periodically trigger file update checks."""
-        while not self.stop_event.is_set():
+        while not self.stop_event.is_set():
             try:
-                logger.debug(f"Periodic update triggered (every {self.update_interval}s)")
+                logger.debug(
+                    f"Periodic update triggered (every {self.update_interval}s)")
                 # If the cache has never been initialized, do not trigger incremental updates
                 if not self.cache:
                     time.sleep(self.update_interval)
@@ -145,7 +151,7 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
         self.update_thread.join()
 
     def fileinfo_to_tuple(self, file_info: FileInfo) -> Tuple[str, str, float, str]:
-        return (file_info.file_path, file_info.relative_path, file_info.modify_time, file_info.file_md5)
+        return (file_info.file_path, file_info.relative_path, file_info.modify_time, file_info.file_md5)
 
     def __del__(self):
         self.stop()
@@ -159,7 +165,7 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
             file_path, _, modify_time, file_md5 = file_info
             if (
                 file_path not in self.cache
-                or self.cache[file_path].get("md5","") != file_md5
+                or self.cache[file_path].get("md5", "") != file_md5
             ):
                 files_to_process.append(file_info)
         if not files_to_process:
@@ -169,20 +175,23 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
         #     [process_file.remote(file_info) for file_info in files_to_process]
         # )
         from autocoder.rag.token_counter import initialize_tokenizer
-
+        llm_name = get_llm_names(self.llm)[0] if self.llm else None
         with Pool(
             processes=os.cpu_count(),
             initializer=initialize_tokenizer,
             initargs=(VariableHolder.TOKENIZER_PATH,),
         ) as pool:
-            results = pool.map(
-                process_file_in_multi_process, files_to_process)
+
+            worker_func = functools.partial(
+                process_file_in_multi_process, llm=llm_name, product_mode=self.product_mode)
+            results = pool.map(worker_func, files_to_process)
 
         for file_info, result in zip(files_to_process, results):
             if result:  # only update the cache when the result is non-empty
                 self.update_cache(file_info, result)
             else:
-                logger.warning(f"Empty result for file: {file_info[0]}, skipping cache update")
+                logger.warning(
+                    f"Empty result for file: {file_info[0]}, skipping cache update")
 
         self.write_cache()
 
@@ -195,14 +204,15 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
             current_files.add(file_path)
             # If this file failed to parse before, skip it in this incremental update
             if file_path in self.failed_files:
-                logger.info(f"文件 {file_path} 之前解析失败,跳过此次更新")
+                # logger.info(f"文件 {file_path} 之前解析失败,跳过此次更新")
                 continue
             # Change detection
             if (
                 file_path not in self.cache
                 or self.cache[file_path].get("md5", "") != file_md5
             ):
-                files_to_process.append((file_path, relative_path, modify_time, file_md5))
+                files_to_process.append(
+                    (file_path, relative_path, modify_time, file_md5))
 
         deleted_files = set(self.cache.keys()) - current_files
         logger.info(f"files_to_process: {files_to_process}")
@@ -231,29 +241,38 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
                 # Also remove deleted files from the failed list (in case the file has been fixed)
                 if item in self.failed_files:
                     self.failed_files.remove(item)
-                    save_failed_files(self.failed_files_path, self.failed_files)
+                    save_failed_files(
+                        self.failed_files_path, self.failed_files)
         elif isinstance(file_list, AddOrUpdateEvent):
             for file_info in file_list.file_infos:
-                logger.info(f"{file_info.file_path} is detected to be updated")
+                logger.info(
+                    f"{file_info.file_path} is detected to be updated")
                 try:
-                    result = process_file_local(file_info.file_path)
+                    result = process_file_local(
+                        file_info.file_path, llm=self.llm, product_mode=self.product_mode)
                     if result:
                         # Parsed successfully and the result is non-empty
-                        self.update_cache(self.fileinfo_to_tuple(file_info), result)
+                        self.update_cache(
+                            self.fileinfo_to_tuple(file_info), result)
                         # If it failed before but succeeded this time, drop the failure record
                         if file_info.file_path in self.failed_files:
                             self.failed_files.remove(file_info.file_path)
-                            save_failed_files(self.failed_files_path, self.failed_files)
+                            save_failed_files(
+                                self.failed_files_path, self.failed_files)
                     else:
                         # An empty result also counts as a parse failure; add it to the failed list
-                        logger.warning(f"Empty result for file: {file_info.file_path}, treat as parse failed, skipping cache update")
+                        logger.warning(
+                            f"Empty result for file: {file_info.file_path}, treat as parse failed, skipping cache update")
                         self.failed_files.add(file_info.file_path)
-                        save_failed_files(self.failed_files_path, self.failed_files)
+                        save_failed_files(
+                            self.failed_files_path, self.failed_files)
                 except Exception as e:
-                    logger.error(f"SimpleCache Error in process_queue: {e}")
+                    logger.error(
+                        f"SimpleCache Error in process_queue: {e}")
                     # On parse failure, add the file to the failed list
                     self.failed_files.add(file_info.file_path)
-                    save_failed_files(self.failed_files_path, self.failed_files)
+                    save_failed_files(
+                        self.failed_files_path, self.failed_files)
 
         self.write_cache()
 
@@ -266,7 +285,7 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
 
         cache = {}
         if os.path.exists(cache_file):
-            with open(cache_file, "r",encoding="utf-8") as f:
+            with open(cache_file, "r", encoding="utf-8") as f:
                 for line in f:
                     data = json.loads(line)
                     cache[data["file_path"]] = data
@@ -277,7 +296,7 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
         cache_file = os.path.join(cache_dir, "cache.jsonl")
 
         if not fcntl:
-            with open(cache_file, "w",encoding="utf-8") as f:
+            with open(cache_file, "w", encoding="utf-8") as f:
                 for data in self.cache.values():
                     try:
                         json.dump(data, f, ensure_ascii=False)
@@ -287,12 +306,12 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
                             f"Failed to write {data['file_path']} to .cache/cache.jsonl: {e}")
         else:
             lock_file = cache_file + ".lock"
-            with open(lock_file, "w",encoding="utf-8") as lockf:
+            with open(lock_file, "w", encoding="utf-8") as lockf:
                 try:
                     # Acquire the file lock
                     fcntl.flock(lockf, fcntl.LOCK_EX | fcntl.LOCK_NB)
                     # Write the cache file
-                    with open(cache_file, "w",encoding="utf-8") as f:
+                    with open(cache_file, "w", encoding="utf-8") as f:
                         for data in self.cache.values():
                             try:
                                 json.dump(data, f, ensure_ascii=False)
@@ -310,11 +329,11 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
     ):
         """
         Update the cached information for a file.
-
+
         Args:
             file_info: tuple of file metadata (file_path, relative_path, modify_time, file_md5)
             content: parsed file content, a list of SourceCode objects
-
+
         Notes:
         This method writes the file's latest content into the cache. A cache entry has the structure:
         {
@@ -324,7 +343,7 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
             "modify_time": float,  # timestamp of the file's last modification
             "md5": str  # MD5 hash of the file content, used to detect changes
         }
-
+
         This method does not write to disk immediately; call write_cache() to persist the updated cache.
         """
         file_path, relative_path, modify_time, file_md5 = file_info
```
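The notable change in `simple_cache.py` is how extra arguments reach the multiprocessing workers: `Pool.map` passes exactly one item to each worker call, so the new code binds `llm` and `product_mode` ahead of time with `functools.partial`. A minimal sketch of the pattern, with a placeholder worker and model name (not auto-coder's actual API):

```python
# Minimal sketch of the functools.partial pattern used in simple_cache.py.
# The worker function and model name below are placeholders.
import functools
from multiprocessing import Pool


def process_file(file_info, llm=None, product_mode="lite"):
    # Stand-in for process_file_in_multi_process: handle one file tuple.
    return f"parsed {file_info[0]} with llm={llm} ({product_mode})"


if __name__ == "__main__":
    files = [("a.py",), ("b.py",)]
    # Bind the keyword arguments up front; Pool.map then supplies
    # only the single per-item argument.
    worker = functools.partial(process_file, llm="v3_chat", product_mode="lite")
    with Pool(processes=2) as pool:
        print(pool.map(worker, files))
```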
```diff
--- /dev/null
+++ b/autocoder/rag/loaders/filter_utils.py
@@ -0,0 +1,106 @@
+
+import os
+import json
+from typing import Dict, Optional
+from loguru import logger
+
+class FilterRuleManager:
+    '''
+    {
+        "whitelist": [
+            "glob:*.png",
+            "regex:^/tmp/.*hidden.*"
+        ],
+        "blacklist": [
+            "glob:*/private/*",
+            "regex:.*/secret/.*\\.jpg$"
+        ]
+    }
+    '''
+    _cache_rules: Optional[Dict] = None
+    _cache_mtime: Optional[float] = None
+
+    def __init__(self, llm, source_dir: str):
+        """
+        Initialize the filter rule manager.
+
+        Args:
+            llm: LLM object; currently unused, reserved for future use
+            source_dir: path of the project root directory
+        """
+        self.llm = llm
+        self.source_dir = source_dir
+        self.filter_rules_path = os.path.join(self.source_dir, ".cache", "filterrules")
+
+    def load_filter_rules(self) -> Dict:
+        try:
+            current_mtime = os.path.getmtime(self.filter_rules_path) if os.path.exists(self.filter_rules_path) else None
+        except Exception:
+            current_mtime = None
+
+        need_reload = False
+
+        # Reload if the cache is empty or the rules file has been updated
+        if FilterRuleManager._cache_rules is None:
+            need_reload = True
+        elif current_mtime is not None and FilterRuleManager._cache_mtime != current_mtime:
+            need_reload = True
+
+        if need_reload:
+            FilterRuleManager._cache_rules = {"whitelist": [], "blacklist": []}
+            try:
+                if os.path.exists(self.filter_rules_path):
+                    with open(self.filter_rules_path, "r", encoding="utf-8") as f:
+                        FilterRuleManager._cache_rules = json.load(f)
+                FilterRuleManager._cache_mtime = current_mtime
+            except Exception as e:
+                logger.warning(f"Failed to load filterrules: {e}")
+
+        return FilterRuleManager._cache_rules or {"whitelist": [], "blacklist": []}
+
+    def should_parse_image(self, file_path: str) -> bool:
+        """
+        Decide whether image parsing should be performed for a given file.
+
+        Supported rule formats:
+        - glob wildcard matching, e.g. "glob:*.png" or "*.png"
+        - regular-expression matching, e.g. "regex:^/tmp/.*hidden.*"
+
+        Returns:
+            True if the file should be parsed
+            False otherwise
+        """
+        import fnmatch
+        import re
+
+        rules = self.load_filter_rules()
+        whitelist = rules.get("whitelist", [])
+        blacklist = rules.get("blacklist", [])
+
+        def match_pattern(pattern: str, path: str) -> bool:
+            if pattern.startswith("glob:"):
+                pat = pattern[len("glob:"):]
+                return fnmatch.fnmatch(path, pat)
+            elif pattern.startswith("regex:"):
+                pat = pattern[len("regex:"):]
+                try:
+                    return re.search(pat, path) is not None
+                except re.error:
+                    logger.warning(f"Invalid regex pattern: {pat}")
+                    return False
+            else:
+                # Fall back to glob matching by default
+                return fnmatch.fnmatch(path, pattern)
+
+        # The blacklist is checked first
+        for pattern in blacklist:
+            if match_pattern(pattern, file_path):
+                return False
+
+        # Then the whitelist
+        for pattern in whitelist:
+            if match_pattern(pattern, file_path):
+                return True
+
+        # By default, do not parse
+        return False
```
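`filter_utils.py` is a new module: `FilterRuleManager` loads glob/regex rules from `.cache/filterrules` under the project root, checks the blacklist first, then the whitelist, and defaults to not parsing. A hypothetical usage sketch (the paths and rules are invented for illustration):

```python
# Hypothetical usage of FilterRuleManager; paths and rules are examples only.
import json
import os
from autocoder.rag.loaders.filter_utils import FilterRuleManager

source_dir = "/tmp/demo_project"
os.makedirs(os.path.join(source_dir, ".cache"), exist_ok=True)
rules = {
    "whitelist": ["glob:*.png"],
    "blacklist": ["regex:.*/secret/.*\\.png$"],
}
# Write a sample rules file where the manager expects it.
with open(os.path.join(source_dir, ".cache", "filterrules"), "w", encoding="utf-8") as f:
    json.dump(rules, f)

manager = FilterRuleManager(llm=None, source_dir=source_dir)
print(manager.should_parse_image("/tmp/demo_project/docs/chart.png"))  # True  (whitelist match)
print(manager.should_parse_image("/tmp/demo_project/secret/key.png"))  # False (blacklist wins)
print(manager.should_parse_image("/tmp/demo_project/readme.md"))       # False (no match: default)
```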
```diff
--- a/autocoder/rag/loaders/image_loader.py
+++ b/autocoder/rag/loaders/image_loader.py
@@ -18,6 +18,7 @@ from byzerllm.utils.client import code_utils
 from autocoder.utils.llms import get_single_llm
 from loguru import logger
 from typing import List, Tuple, Optional
+from autocoder.common.text import TextSimilarity
 from pydantic import BaseModel
 
 
@@ -280,6 +281,20 @@ class ImageLoader:
         except Exception:
             traceback.print_exc()
             return ""
+
+    @staticmethod
+    def extract_replace_in_file_tools(response)->List[ReplaceInFileTool]:
+        tools = []
+        # Pattern to match replace_in_file tool blocks
+        pattern = r'<replace_in_file>\s*<path>(.*?)</path>\s*<diff>(.*?)</diff>\s*</replace_in_file>'
+        matches = re.finditer(pattern, response, re.DOTALL)
+
+        for match in matches:
+            path = match.group(1).strip()
+            diff = match.group(2).strip()
+            tools.append(ReplaceInFileTool(path=path, diff=diff))
+
+        return tools
 
     @staticmethod
     def format_table_in_content(content: str, llm=None) -> str:
@@ -406,35 +421,42 @@ class ImageLoader:
         '''
 
         # Run the prompt with the provided content
-        tool_response = _format_table.with_llm(llm).run(content)
-
-        # Parse the tool response to extract replace_in_file tool calls
-        def extract_replace_in_file_tools(response):
-            tools = []
-            # Pattern to match replace_in_file tool blocks
-            pattern = r'<replace_in_file>\s*<path>(.*?)</path>\s*<diff>(.*?)</diff>\s*</replace_in_file>'
-            matches = re.finditer(pattern, response, re.DOTALL)
-
-            for match in matches:
-                path = match.group(1).strip()
-                diff = match.group(2).strip()
-                tools.append(ReplaceInFileTool(path=path, diff=diff))
-
-            return tools
+        tool_response = _format_table.with_llm(llm).run(content)
 
         # Extract tools from the response
-        tools = extract_replace_in_file_tools(tool_response)
+        tools = ImageLoader.extract_replace_in_file_tools(tool_response)
 
         # Process each tool to apply the replacements
         formatted_content = content
         for tool in tools:
-            # For in-memory content replacement (not actual file modification)
-
-
-
-
-
-
+            # For in-memory content replacement (not actual file modification)
+            # Parse the diff to get search/replace blocks
+            blocks = ImageLoader.parse_diff(tool.diff)
+            # Apply each replacement to the content
+            for search_block, replace_block in blocks:
+                # Check if the search_block exists in the content
+                if search_block in formatted_content:
+                    # Replace and verify the replacement occurred
+                    new_content = formatted_content.replace(search_block, replace_block)
+                    if new_content == formatted_content:
+                        logger.warning(f"Replacement failed despite search block found. Search block length: {len(search_block)}")
+                        print(f"\n=== FAILED SEARCH BLOCK ===\n{search_block}\n=== END FAILED SEARCH BLOCK ===\n")
+                    formatted_content = new_content
+                else:
+                    # Fallback to similarity matching when exact match fails
+                    logger.warning(f"Search block not found in content. Trying similarity matching. Search block length: {len(search_block)}")
+                    print(f"\n=== NOT FOUND SEARCH BLOCK (trying similarity) ===\n{search_block}\n=== END NOT FOUND SEARCH BLOCK ===\n")
+
+                    # Use TextSimilarity to find the best matching window
+                    similarity, best_window = TextSimilarity(search_block, formatted_content).get_best_matching_window()
+                    similarity_threshold = 0.8  # Can be adjusted based on needs
+
+                    if similarity > similarity_threshold:
+                        logger.info(f"Found similar block with similarity {similarity:.2f}")
+                        print(f"\n=== SIMILAR BLOCK FOUND (similarity: {similarity:.2f}) ===\n{best_window}\n=== END SIMILAR BLOCK ===\n")
+                        formatted_content = formatted_content.replace(best_window, replace_block, 1)
+                    else:
+                        logger.warning(f"No similar block found. Best similarity: {similarity:.2f}")
 
         return formatted_content
 
```
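In `image_loader.py`, the inline helper that parsed `<replace_in_file>` blocks out of the LLM response is promoted to a `@staticmethod`, and an exact search/replace now falls back to `TextSimilarity` window matching (threshold 0.8) when the search block is not found verbatim. A self-contained sketch of just the extraction step, using a plain dataclass in place of `ReplaceInFileTool` so it runs without auto-coder installed:

```python
# Standalone sketch of the extraction logic; ReplaceInFileTool is replaced
# by a plain dataclass so the snippet runs without auto-coder installed.
import re
from dataclasses import dataclass
from typing import List


@dataclass
class ReplaceInFileTool:
    path: str
    diff: str


def extract_replace_in_file_tools(response: str) -> List[ReplaceInFileTool]:
    # Same pattern as in ImageLoader: capture <path> and <diff> per block.
    pattern = r'<replace_in_file>\s*<path>(.*?)</path>\s*<diff>(.*?)</diff>\s*</replace_in_file>'
    return [
        ReplaceInFileTool(path=m.group(1).strip(), diff=m.group(2).strip())
        for m in re.finditer(pattern, response, re.DOTALL)
    ]


response = """<replace_in_file>
<path>report.md</path>
<diff>old cell | 1
new cell | 2</diff>
</replace_in_file>"""
print(extract_replace_in_file_tools(response))
```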
```diff
--- a/autocoder/rag/loaders/pdf_loader.py
+++ b/autocoder/rag/loaders/pdf_loader.py
@@ -14,9 +14,9 @@ def extract_text_from_pdf_old(file_path):
         text += page.extract_text()
     return text
 
-def extract_text_from_pdf(file_path):
-    try:
-        md_converter = MarkItDown()
+def extract_text_from_pdf(file_path, llm=None, product_mode="lite"):
+    try:
+        md_converter = MarkItDown(llm=llm, product_mode=product_mode)
         result = md_converter.convert(file_path)
         return result.text_content
     except (BaseException, Exception) as e:
```