auto-coder 0.1.200__tar.gz → 0.1.201__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of auto-coder might be problematic. Click here for more details.
- {auto_coder-0.1.200 → auto_coder-0.1.201}/PKG-INFO +9 -1
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/auto_coder.egg-info/PKG-INFO +9 -1
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/auto_coder.egg-info/SOURCES.txt +1 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/auto_coder.egg-info/requires.txt +8 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/rag/cache/base_cache.py +1 -1
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/rag/cache/byzer_storage_cache.py +33 -10
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/rag/cache/simple_cache.py +65 -24
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/rag/document_retriever.py +5 -10
- auto_coder-0.1.201/src/autocoder/rag/loaders/docx_loader.py +20 -0
- auto_coder-0.1.201/src/autocoder/rag/loaders/pdf_loader.py +22 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/rag/long_context_rag.py +3 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/rag/utils.py +9 -13
- auto_coder-0.1.201/src/autocoder/utils/_markitdown.py +1298 -0
- auto_coder-0.1.201/src/autocoder/version.py +1 -0
- auto_coder-0.1.200/src/autocoder/rag/loaders/docx_loader.py +0 -7
- auto_coder-0.1.200/src/autocoder/rag/loaders/pdf_loader.py +0 -10
- auto_coder-0.1.200/src/autocoder/version.py +0 -1
- {auto_coder-0.1.200 → auto_coder-0.1.201}/LICENSE +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/README.md +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/setup.cfg +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/setup.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/auto_coder.egg-info/dependency_links.txt +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/auto_coder.egg-info/entry_points.txt +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/auto_coder.egg-info/top_level.txt +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/__init__.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/agent/__init__.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/agent/auto_tool.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/agent/coder.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/agent/designer.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/agent/planner.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/agent/project_reader.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/auto_coder.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/auto_coder_lang.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/auto_coder_rag.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/auto_coder_server.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/chat/__init__.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/chat_auto_coder.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/chat_auto_coder_lang.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/command_args.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/common/JupyterClient.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/common/ShellClient.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/common/__init__.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/common/anything2images.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/common/audio.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/common/cleaner.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/common/code_auto_execute.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/common/code_auto_generate.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/common/code_auto_generate_diff.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/common/code_auto_generate_editblock.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/common/code_auto_generate_strict_diff.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/common/code_auto_merge.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/common/code_auto_merge_diff.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/common/code_auto_merge_editblock.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/common/code_auto_merge_strict_diff.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/common/command_completer.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/common/command_generator.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/common/command_templates.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/common/const.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/common/git_utils.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/common/image_to_page.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/common/interpreter.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/common/llm_rerank.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/common/screenshots.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/common/search.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/common/search_replace.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/common/sys_prompt.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/common/text.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/common/types.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/db/__init__.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/db/store.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/dispacher/__init__.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/dispacher/actions/__init__.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/dispacher/actions/action.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/dispacher/actions/copilot.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/dispacher/actions/plugins/__init__.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/dispacher/actions/plugins/action_regex_project.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/dispacher/actions/plugins/action_translate.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/index/__init__.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/index/for_command.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/index/index.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/index/symbols_utils.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/lang.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/pyproject/__init__.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/rag/__init__.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/rag/api_server.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/rag/cache/__init__.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/rag/cache/file_monitor_cache.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/rag/doc_filter.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/rag/llm_wrapper.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/rag/loaders/__init__.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/rag/loaders/excel_loader.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/rag/loaders/ppt_loader.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/rag/rag_config.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/rag/rag_entry.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/rag/raw_rag.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/rag/relevant_utils.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/rag/simple_directory_reader.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/rag/simple_rag.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/rag/stream_event/__init__.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/rag/stream_event/event_writer.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/rag/stream_event/types.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/rag/token_checker.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/rag/token_counter.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/rag/token_limiter.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/rag/types.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/rag/variable_holder.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/regexproject/__init__.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/suffixproject/__init__.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/tsproject/__init__.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/utils/__init__.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/utils/conversation_store.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/utils/llm_client_interceptors.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/utils/log_capture.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/utils/multi_turn.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/utils/print_table.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/utils/queue_communicate.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/utils/request_event_queue.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/utils/request_queue.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/utils/rest.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/src/autocoder/utils/tests.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/tests/test_action_regex_project.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/tests/test_chat_auto_coder.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/tests/test_code_auto_merge_editblock.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/tests/test_command_completer.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/tests/test_planner.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/tests/test_queue_communicate.py +0 -0
- {auto_coder-0.1.200 → auto_coder-0.1.201}/tests/test_symbols_utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: auto-coder
|
|
3
|
-
Version: 0.1.200
|
|
3
|
+
Version: 0.1.201
|
|
4
4
|
Summary: AutoCoder: AutoCoder
|
|
5
5
|
Author: allwefantasy
|
|
6
6
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
@@ -49,6 +49,14 @@ Requires-Dist: python-pptx
|
|
|
49
49
|
Requires-Dist: watchfiles
|
|
50
50
|
Requires-Dist: cairosvg
|
|
51
51
|
Requires-Dist: matplotlib
|
|
52
|
+
Requires-Dist: mammoth
|
|
53
|
+
Requires-Dist: markdownify
|
|
54
|
+
Requires-Dist: pdfminer.six
|
|
55
|
+
Requires-Dist: puremagic
|
|
56
|
+
Requires-Dist: pydub
|
|
57
|
+
Requires-Dist: youtube-transcript-api
|
|
58
|
+
Requires-Dist: SpeechRecognition
|
|
59
|
+
Requires-Dist: pathvalidate
|
|
52
60
|
|
|
53
61
|
<p align="center">
|
|
54
62
|
<picture>
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: auto-coder
|
|
3
|
-
Version: 0.1.200
|
|
3
|
+
Version: 0.1.201
|
|
4
4
|
Summary: AutoCoder: AutoCoder
|
|
5
5
|
Author: allwefantasy
|
|
6
6
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
@@ -49,6 +49,14 @@ Requires-Dist: python-pptx
|
|
|
49
49
|
Requires-Dist: watchfiles
|
|
50
50
|
Requires-Dist: cairosvg
|
|
51
51
|
Requires-Dist: matplotlib
|
|
52
|
+
Requires-Dist: mammoth
|
|
53
|
+
Requires-Dist: markdownify
|
|
54
|
+
Requires-Dist: pdfminer.six
|
|
55
|
+
Requires-Dist: puremagic
|
|
56
|
+
Requires-Dist: pydub
|
|
57
|
+
Requires-Dist: youtube-transcript-api
|
|
58
|
+
Requires-Dist: SpeechRecognition
|
|
59
|
+
Requires-Dist: pathvalidate
|
|
52
60
|
|
|
53
61
|
<p align="center">
|
|
54
62
|
<picture>
|
|
@@ -102,6 +102,7 @@ src/autocoder/regexproject/__init__.py
|
|
|
102
102
|
src/autocoder/suffixproject/__init__.py
|
|
103
103
|
src/autocoder/tsproject/__init__.py
|
|
104
104
|
src/autocoder/utils/__init__.py
|
|
105
|
+
src/autocoder/utils/_markitdown.py
|
|
105
106
|
src/autocoder/utils/conversation_store.py
|
|
106
107
|
src/autocoder/utils/llm_client_interceptors.py
|
|
107
108
|
src/autocoder/utils/log_capture.py
|
|
@@ -23,13 +23,34 @@ from multiprocessing import Pool
|
|
|
23
23
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
24
24
|
from autocoder.rag.variable_holder import VariableHolder
|
|
25
25
|
import platform
|
|
26
|
+
import hashlib
|
|
27
|
+
from typing import Union
|
|
26
28
|
|
|
27
29
|
if platform.system() != "Windows":
|
|
28
30
|
import fcntl
|
|
29
31
|
else:
|
|
30
32
|
fcntl = None
|
|
31
33
|
|
|
32
|
-
|
|
34
|
+
def generate_file_md5(file_path: str) -> str:
|
|
35
|
+
md5_hash = hashlib.md5()
|
|
36
|
+
with open(file_path, "rb") as f:
|
|
37
|
+
for chunk in iter(lambda: f.read(4096), b""):
|
|
38
|
+
md5_hash.update(chunk)
|
|
39
|
+
return md5_hash.hexdigest()
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def generate_content_md5(content: Union[str, bytes]) -> str:
|
|
43
|
+
if isinstance(content, str):
|
|
44
|
+
content = content.encode("utf-8")
|
|
45
|
+
md5_hash = hashlib.md5()
|
|
46
|
+
md5_hash.update(content)
|
|
47
|
+
return md5_hash.hexdigest()
|
|
48
|
+
|
|
49
|
+
default_ignore_dirs = [
|
|
50
|
+
"__pycache__",
|
|
51
|
+
"node_modules",
|
|
52
|
+
"_images"
|
|
53
|
+
]
|
|
33
54
|
class ByzerStorageCache(BaseCacheManager):
|
|
34
55
|
def __init__(
|
|
35
56
|
self,
|
|
@@ -154,10 +175,10 @@ class ByzerStorageCache(BaseCacheManager):
|
|
|
154
175
|
|
|
155
176
|
files_to_process = []
|
|
156
177
|
for file_info in self.get_all_files():
|
|
157
|
-
file_path, _, modify_time = file_info
|
|
178
|
+
file_path, _, modify_time, file_md5 = file_info
|
|
158
179
|
if (
|
|
159
|
-
file_path not in self.cache
|
|
160
|
-
or self.cache[file_path]["
|
|
180
|
+
file_path not in self.cache
|
|
181
|
+
or self.cache[file_path]["md5"] != file_md5
|
|
161
182
|
):
|
|
162
183
|
files_to_process.append(file_info)
|
|
163
184
|
|
|
@@ -175,13 +196,14 @@ class ByzerStorageCache(BaseCacheManager):
|
|
|
175
196
|
|
|
176
197
|
items = []
|
|
177
198
|
for file_info, result in zip(files_to_process, results):
|
|
178
|
-
file_path, relative_path, modify_time = file_info
|
|
199
|
+
file_path, relative_path, modify_time, file_md5 = file_info
|
|
179
200
|
content: List[SourceCode] = result
|
|
180
201
|
self.cache[file_path] = {
|
|
181
202
|
"file_path": file_path,
|
|
182
203
|
"relative_path": relative_path,
|
|
183
204
|
"content": [c.model_dump() for c in content],
|
|
184
205
|
"modify_time": modify_time,
|
|
206
|
+
"md5": file_md5,
|
|
185
207
|
}
|
|
186
208
|
|
|
187
209
|
for doc in content:
|
|
@@ -295,11 +317,11 @@ class ByzerStorageCache(BaseCacheManager):
|
|
|
295
317
|
files_to_process = []
|
|
296
318
|
current_files = set()
|
|
297
319
|
for file_info in self.get_all_files():
|
|
298
|
-
file_path, _, modify_time = file_info
|
|
320
|
+
file_path, _, _, file_md5 = file_info
|
|
299
321
|
current_files.add(file_path)
|
|
300
322
|
if (
|
|
301
323
|
file_path not in self.cache
|
|
302
|
-
or self.cache[file_path]["
|
|
324
|
+
or self.cache[file_path]["md5"] != file_md5
|
|
303
325
|
):
|
|
304
326
|
files_to_process.append(file_info)
|
|
305
327
|
|
|
@@ -362,10 +384,10 @@ class ByzerStorageCache(BaseCacheManager):
|
|
|
362
384
|
|
|
363
385
|
|
|
364
386
|
|
|
365
|
-
def get_all_files(self) -> List[Tuple[str, str, float]]:
|
|
387
|
+
def get_all_files(self) -> List[Tuple[str, str, float, str]]:
|
|
366
388
|
all_files = []
|
|
367
389
|
for root, dirs, files in os.walk(self.path,followlinks=True):
|
|
368
|
-
dirs[:] = [d for d in dirs if not d.startswith(".")]
|
|
390
|
+
dirs[:] = [d for d in dirs if not d.startswith(".") and d not in default_ignore_dirs]
|
|
369
391
|
|
|
370
392
|
if self.ignore_spec:
|
|
371
393
|
relative_root = os.path.relpath(root, self.path)
|
|
@@ -389,6 +411,7 @@ class ByzerStorageCache(BaseCacheManager):
|
|
|
389
411
|
file_path = os.path.join(root, file)
|
|
390
412
|
relative_path = os.path.relpath(file_path, self.path)
|
|
391
413
|
modify_time = os.path.getmtime(file_path)
|
|
392
|
-
|
|
414
|
+
file_md5 = generate_file_md5(file_path)
|
|
415
|
+
all_files.append((file_path, relative_path, modify_time, file_md5))
|
|
393
416
|
|
|
394
417
|
return all_files
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
|
|
2
|
-
from multiprocessing import Pool
|
|
3
|
-
from autocoder.common import SourceCode
|
|
4
|
-
from autocoder.rag.cache.base_cache import BaseCacheManager,DeleteEvent,AddOrUpdateEvent
|
|
5
|
-
from typing import Dict, List, Tuple,Any,Optional
|
|
2
|
+
from multiprocessing import Pool
|
|
3
|
+
from autocoder.common import SourceCode
|
|
4
|
+
from autocoder.rag.cache.base_cache import BaseCacheManager, DeleteEvent, AddOrUpdateEvent
|
|
5
|
+
from typing import Dict, List, Tuple, Any, Optional, Union
|
|
6
6
|
import os
|
|
7
7
|
import threading
|
|
8
8
|
import json
|
|
@@ -13,8 +13,31 @@ else:
|
|
|
13
13
|
fcntl = None
|
|
14
14
|
import time
|
|
15
15
|
from loguru import logger
|
|
16
|
-
from autocoder.rag.utils import process_file_in_multi_process,process_file_local
|
|
16
|
+
from autocoder.rag.utils import process_file_in_multi_process, process_file_local
|
|
17
17
|
from autocoder.rag.variable_holder import VariableHolder
|
|
18
|
+
import hashlib
|
|
19
|
+
|
|
20
|
+
default_ignore_dirs = [
|
|
21
|
+
"__pycache__",
|
|
22
|
+
"node_modules",
|
|
23
|
+
"_images"
|
|
24
|
+
]
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def generate_file_md5(file_path: str) -> str:
|
|
28
|
+
md5_hash = hashlib.md5()
|
|
29
|
+
with open(file_path, "rb") as f:
|
|
30
|
+
for chunk in iter(lambda: f.read(4096), b""):
|
|
31
|
+
md5_hash.update(chunk)
|
|
32
|
+
return md5_hash.hexdigest()
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def generate_content_md5(content: Union[str, bytes]) -> str:
|
|
36
|
+
if isinstance(content, str):
|
|
37
|
+
content = content.encode("utf-8")
|
|
38
|
+
md5_hash = hashlib.md5()
|
|
39
|
+
md5_hash.update(content)
|
|
40
|
+
return md5_hash.hexdigest()
|
|
18
41
|
|
|
19
42
|
|
|
20
43
|
class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
|
|
@@ -52,10 +75,10 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
|
|
|
52
75
|
return
|
|
53
76
|
files_to_process = []
|
|
54
77
|
for file_info in self.get_all_files():
|
|
55
|
-
file_path, _, modify_time = file_info
|
|
78
|
+
file_path, _, modify_time, file_md5 = file_info
|
|
56
79
|
if (
|
|
57
80
|
file_path not in self.cache
|
|
58
|
-
or self.cache[file_path]
|
|
81
|
+
or self.cache[file_path].get("md5","") != file_md5
|
|
59
82
|
):
|
|
60
83
|
files_to_process.append(file_info)
|
|
61
84
|
if not files_to_process:
|
|
@@ -71,7 +94,8 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
|
|
|
71
94
|
initializer=initialize_tokenizer,
|
|
72
95
|
initargs=(VariableHolder.TOKENIZER_PATH,),
|
|
73
96
|
) as pool:
|
|
74
|
-
results = pool.map(process_file_in_multi_process, files_to_process)
|
|
97
|
+
results = pool.map(
|
|
98
|
+
process_file_in_multi_process, files_to_process)
|
|
75
99
|
|
|
76
100
|
for file_info, result in zip(files_to_process, results):
|
|
77
101
|
self.update_cache(file_info, result)
|
|
@@ -83,11 +107,11 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
|
|
|
83
107
|
files_to_process = []
|
|
84
108
|
current_files = set()
|
|
85
109
|
for file_info in self.get_all_files():
|
|
86
|
-
file_path, _, modify_time = file_info
|
|
110
|
+
file_path, _, _, file_md5 = file_info
|
|
87
111
|
current_files.add(file_path)
|
|
88
112
|
if (
|
|
89
113
|
file_path not in self.cache
|
|
90
|
-
or self.cache[file_path]
|
|
114
|
+
or self.cache[file_path].get("md5","") != file_md5
|
|
91
115
|
):
|
|
92
116
|
files_to_process.append(file_info)
|
|
93
117
|
|
|
@@ -99,7 +123,8 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
|
|
|
99
123
|
self.queue.append(DeleteEvent(file_paths=deleted_files))
|
|
100
124
|
if files_to_process:
|
|
101
125
|
with self.lock:
|
|
102
|
-
self.queue.append(AddOrUpdateEvent(file_infos=files_to_process))
|
|
126
|
+
self.queue.append(AddOrUpdateEvent(
|
|
127
|
+
file_infos=files_to_process))
|
|
103
128
|
|
|
104
129
|
def process_queue(self):
|
|
105
130
|
while self.queue:
|
|
@@ -111,8 +136,12 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
|
|
|
111
136
|
elif isinstance(file_list, AddOrUpdateEvent):
|
|
112
137
|
for file_info in file_list.file_infos:
|
|
113
138
|
logger.info(f"{file_info[0]} is detected to be updated")
|
|
114
|
-
|
|
115
|
-
|
|
139
|
+
try:
|
|
140
|
+
result = process_file_local(file_info[0])
|
|
141
|
+
self.update_cache(file_info, result)
|
|
142
|
+
except Exception as e:
|
|
143
|
+
logger.error(
|
|
144
|
+
f"SimpleCache Error in process_queue: {e}")
|
|
116
145
|
|
|
117
146
|
self.write_cache()
|
|
118
147
|
|
|
@@ -138,8 +167,12 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
|
|
|
138
167
|
if not fcntl:
|
|
139
168
|
with open(cache_file, "w") as f:
|
|
140
169
|
for data in self.cache.values():
|
|
141
|
-
|
|
142
|
-
|
|
170
|
+
try:
|
|
171
|
+
json.dump(data, f, ensure_ascii=False)
|
|
172
|
+
f.write("\n")
|
|
173
|
+
except Exception as e:
|
|
174
|
+
logger.error(
|
|
175
|
+
f"Failed to write {data['file_path']} to .cache/cache.jsonl: {e}")
|
|
143
176
|
else:
|
|
144
177
|
lock_file = cache_file + ".lock"
|
|
145
178
|
with open(lock_file, "w") as lockf:
|
|
@@ -149,33 +182,39 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
|
|
|
149
182
|
# 写入缓存文件
|
|
150
183
|
with open(cache_file, "w") as f:
|
|
151
184
|
for data in self.cache.values():
|
|
152
|
-
|
|
153
|
-
|
|
185
|
+
try:
|
|
186
|
+
json.dump(data, f, ensure_ascii=False)
|
|
187
|
+
f.write("\n")
|
|
188
|
+
except Exception as e:
|
|
189
|
+
logger.error(
|
|
190
|
+
f"Failed to write {data['file_path']} to .cache/cache.jsonl: {e}")
|
|
154
191
|
|
|
155
192
|
finally:
|
|
156
193
|
# 释放文件锁
|
|
157
194
|
fcntl.flock(lockf, fcntl.LOCK_UN)
|
|
158
195
|
|
|
159
196
|
def update_cache(
|
|
160
|
-
self, file_info: Tuple[str, str, float], content: List[SourceCode]
|
|
197
|
+
self, file_info: Tuple[str, str, float, str], content: List[SourceCode]
|
|
161
198
|
):
|
|
162
|
-
file_path, relative_path, modify_time = file_info
|
|
199
|
+
file_path, relative_path, modify_time, file_md5 = file_info
|
|
163
200
|
self.cache[file_path] = {
|
|
164
201
|
"file_path": file_path,
|
|
165
202
|
"relative_path": relative_path,
|
|
166
203
|
"content": [c.model_dump() for c in content],
|
|
167
204
|
"modify_time": modify_time,
|
|
205
|
+
"md5": file_md5,
|
|
168
206
|
}
|
|
169
207
|
|
|
170
|
-
def get_cache(self,options:Optional[Dict[str,Any]]=None):
|
|
208
|
+
def get_cache(self, options: Optional[Dict[str, Any]] = None):
|
|
171
209
|
self.load_first()
|
|
172
210
|
self.trigger_update()
|
|
173
211
|
return self.cache
|
|
174
212
|
|
|
175
213
|
def get_all_files(self) -> List[Tuple[str, str, float]]:
|
|
176
214
|
all_files = []
|
|
177
|
-
for root, dirs, files in os.walk(self.path,followlinks=True):
|
|
178
|
-
dirs[:] = [d for d in dirs if not d.startswith(".")]
|
|
215
|
+
for root, dirs, files in os.walk(self.path, followlinks=True):
|
|
216
|
+
dirs[:] = [d for d in dirs if not d.startswith(
|
|
217
|
+
".") and d not in default_ignore_dirs]
|
|
179
218
|
|
|
180
219
|
if self.ignore_spec:
|
|
181
220
|
relative_root = os.path.relpath(root, self.path)
|
|
@@ -199,6 +238,8 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
|
|
|
199
238
|
file_path = os.path.join(root, file)
|
|
200
239
|
relative_path = os.path.relpath(file_path, self.path)
|
|
201
240
|
modify_time = os.path.getmtime(file_path)
|
|
202
|
-
|
|
241
|
+
file_md5 = generate_file_md5(file_path)
|
|
242
|
+
all_files.append(
|
|
243
|
+
(file_path, relative_path, modify_time, file_md5))
|
|
203
244
|
|
|
204
|
-
return all_files
|
|
245
|
+
return all_files
|
|
@@ -13,13 +13,6 @@ from loguru import logger
|
|
|
13
13
|
from pydantic import BaseModel
|
|
14
14
|
|
|
15
15
|
from autocoder.common import SourceCode
|
|
16
|
-
from autocoder.rag.loaders import (
|
|
17
|
-
extract_text_from_docx,
|
|
18
|
-
extract_text_from_excel,
|
|
19
|
-
extract_text_from_pdf,
|
|
20
|
-
extract_text_from_ppt,
|
|
21
|
-
)
|
|
22
|
-
from autocoder.rag.token_counter import count_tokens_worker, count_tokens
|
|
23
16
|
from uuid import uuid4
|
|
24
17
|
from autocoder.rag.variable_holder import VariableHolder
|
|
25
18
|
from abc import ABC, abstractmethod
|
|
@@ -119,7 +112,8 @@ class LocalDocumentRetriever(BaseDocumentRetriever):
|
|
|
119
112
|
logger.info(f"DocumentRetriever initialized with:")
|
|
120
113
|
logger.info(f" Path: {self.path}")
|
|
121
114
|
logger.info(f" Diable auto window: {self.disable_auto_window} ")
|
|
122
|
-
logger.info(f" Single file token limit: {self.single_file_token_limit}")
|
|
115
|
+
logger.info(
|
|
116
|
+
f" Single file token limit: {self.single_file_token_limit}")
|
|
123
117
|
logger.info(f" Small file token limit: {self.small_file_token_limit}")
|
|
124
118
|
logger.info(f" Small file merge limit: {self.small_file_merge_limit}")
|
|
125
119
|
logger.info(f" Enable hybrid index: {self.enable_hybrid_index}")
|
|
@@ -200,9 +194,10 @@ class LocalDocumentRetriever(BaseDocumentRetriever):
|
|
|
200
194
|
) -> Generator[SourceCode, None, None]:
|
|
201
195
|
chunk_size = self.single_file_token_limit
|
|
202
196
|
total_chunks = (doc.tokens + chunk_size - 1) // chunk_size
|
|
203
|
-
logger.info(f"Splitting document {doc.module_name} into {total_chunks} chunks")
|
|
197
|
+
logger.info(
|
|
198
|
+
f"Splitting document {doc.module_name} into {total_chunks} chunks")
|
|
204
199
|
for i in range(0, doc.tokens, chunk_size):
|
|
205
|
-
chunk_content = doc.source_code[i
|
|
200
|
+
chunk_content = doc.source_code[i: i + chunk_size]
|
|
206
201
|
chunk_tokens = min(chunk_size, doc.tokens - i)
|
|
207
202
|
chunk_name = f"{doc.module_name}#chunk{i//chunk_size+1}"
|
|
208
203
|
# logger.debug(f" Created chunk: {chunk_name} (tokens: {chunk_tokens})")
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
from io import BytesIO
|
|
2
|
+
import docx2txt
|
|
3
|
+
from autocoder.utils._markitdown import MarkItDown
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def extract_text_from_docx_old(docx_path):
|
|
7
|
+
with open(docx_path, "rb") as f:
|
|
8
|
+
docx_content = f.read()
|
|
9
|
+
docx_file = BytesIO(docx_content)
|
|
10
|
+
text = docx2txt.process(docx_file)
|
|
11
|
+
return text
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def extract_text_from_docx(docx_path):
|
|
15
|
+
try:
|
|
16
|
+
md_converter = MarkItDown()
|
|
17
|
+
result = md_converter.convert(docx_path)
|
|
18
|
+
return result.text_content
|
|
19
|
+
except Exception as e:
|
|
20
|
+
return extract_text_from_docx_old(docx_path)
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
from io import BytesIO
|
|
2
|
+
from pypdf import PdfReader
|
|
3
|
+
from autocoder.utils._markitdown import MarkItDown
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def extract_text_from_pdf_old(file_path):
|
|
7
|
+
with open(file_path, "rb") as f:
|
|
8
|
+
pdf_content = f.read()
|
|
9
|
+
pdf_file = BytesIO(pdf_content)
|
|
10
|
+
pdf_reader = PdfReader(pdf_file)
|
|
11
|
+
text = ""
|
|
12
|
+
for page in pdf_reader.pages:
|
|
13
|
+
text += page.extract_text()
|
|
14
|
+
return text
|
|
15
|
+
|
|
16
|
+
def extract_text_from_pdf(file_path):
|
|
17
|
+
try:
|
|
18
|
+
md_converter = MarkItDown()
|
|
19
|
+
result = md_converter.convert(file_path)
|
|
20
|
+
return result.text_content
|
|
21
|
+
except Exception as e:
|
|
22
|
+
return extract_text_from_pdf_old(file_path)
|
|
@@ -13,11 +13,10 @@ def process_file_in_multi_process(
|
|
|
13
13
|
file_info: Tuple[str, str, float]
|
|
14
14
|
) -> List[SourceCode]:
|
|
15
15
|
start_time = time.time()
|
|
16
|
-
file_path, relative_path, _ = file_info
|
|
16
|
+
file_path, relative_path, _, _ = file_info
|
|
17
17
|
try:
|
|
18
|
-
if file_path.endswith(".pdf"):
|
|
19
|
-
|
|
20
|
-
content = extract_text_from_pdf(f.read())
|
|
18
|
+
if file_path.endswith(".pdf"):
|
|
19
|
+
content = extract_text_from_pdf(file_path)
|
|
21
20
|
v = [
|
|
22
21
|
SourceCode(
|
|
23
22
|
module_name=file_path,
|
|
@@ -25,9 +24,8 @@ def process_file_in_multi_process(
|
|
|
25
24
|
tokens=count_tokens_worker(content),
|
|
26
25
|
)
|
|
27
26
|
]
|
|
28
|
-
elif file_path.endswith(".docx"):
|
|
29
|
-
|
|
30
|
-
content = extract_text_from_docx(f.read())
|
|
27
|
+
elif file_path.endswith(".docx"):
|
|
28
|
+
content = extract_text_from_docx(file_path)
|
|
31
29
|
v = [
|
|
32
30
|
SourceCode(
|
|
33
31
|
module_name=f"##File: {file_path}",
|
|
@@ -75,9 +73,8 @@ def process_file_in_multi_process(
|
|
|
75
73
|
def process_file_local(file_path: str) -> List[SourceCode]:
|
|
76
74
|
start_time = time.time()
|
|
77
75
|
try:
|
|
78
|
-
if file_path.endswith(".pdf"):
|
|
79
|
-
|
|
80
|
-
content = extract_text_from_pdf(f.read())
|
|
76
|
+
if file_path.endswith(".pdf"):
|
|
77
|
+
content = extract_text_from_pdf(file_path)
|
|
81
78
|
v = [
|
|
82
79
|
SourceCode(
|
|
83
80
|
module_name=file_path,
|
|
@@ -85,9 +82,8 @@ def process_file_local(file_path: str) -> List[SourceCode]:
|
|
|
85
82
|
tokens=count_tokens(content),
|
|
86
83
|
)
|
|
87
84
|
]
|
|
88
|
-
elif file_path.endswith(".docx"):
|
|
89
|
-
|
|
90
|
-
content = extract_text_from_docx(f.read())
|
|
85
|
+
elif file_path.endswith(".docx"):
|
|
86
|
+
content = extract_text_from_docx(file_path)
|
|
91
87
|
v = [
|
|
92
88
|
SourceCode(
|
|
93
89
|
module_name=f"##File: {file_path}",
|