auto-coder 0.1.173__tar.gz → 0.1.176__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of auto-coder might be problematic. Click here for more details.
- {auto-coder-0.1.173 → auto-coder-0.1.176}/PKG-INFO +1 -1
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/auto_coder.egg-info/PKG-INFO +1 -1
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/auto_coder.egg-info/SOURCES.txt +1 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/auto_coder.py +8 -9
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/auto_coder_rag.py +46 -13
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/common/__init__.py +11 -3
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/pyproject/__init__.py +5 -1
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/rag/document_retriever.py +196 -55
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/rag/long_context_rag.py +81 -23
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/rag/token_counter.py +31 -9
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/rag/token_limiter.py +66 -13
- auto-coder-0.1.176/src/autocoder/rag/variable_holder.py +2 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/suffixproject/__init__.py +5 -1
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/tsproject/__init__.py +5 -1
- auto-coder-0.1.176/src/autocoder/version.py +1 -0
- auto-coder-0.1.173/src/autocoder/version.py +0 -1
- {auto-coder-0.1.173 → auto-coder-0.1.176}/LICENSE +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/README.md +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/setup.cfg +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/setup.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/auto_coder.egg-info/dependency_links.txt +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/auto_coder.egg-info/entry_points.txt +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/auto_coder.egg-info/requires.txt +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/auto_coder.egg-info/top_level.txt +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/__init__.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/agent/__init__.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/agent/auto_tool.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/agent/coder.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/agent/designer.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/agent/planner.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/agent/project_reader.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/auto_coder_lang.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/auto_coder_server.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/chat/__init__.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/chat_auto_coder.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/chat_auto_coder_lang.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/command_args.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/common/JupyterClient.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/common/ShellClient.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/common/anything2images.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/common/audio.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/common/cleaner.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/common/code_auto_execute.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/common/code_auto_generate.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/common/code_auto_generate_diff.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/common/code_auto_generate_editblock.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/common/code_auto_generate_strict_diff.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/common/code_auto_merge.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/common/code_auto_merge_diff.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/common/code_auto_merge_editblock.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/common/code_auto_merge_strict_diff.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/common/command_completer.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/common/command_generator.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/common/command_templates.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/common/const.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/common/git_utils.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/common/image_to_page.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/common/interpreter.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/common/llm_rerank.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/common/screenshots.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/common/search.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/common/search_replace.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/common/text.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/common/types.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/db/__init__.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/db/store.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/dispacher/__init__.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/dispacher/actions/__init__.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/dispacher/actions/action.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/dispacher/actions/copilot.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/dispacher/actions/plugins/__init__.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/dispacher/actions/plugins/action_regex_project.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/dispacher/actions/plugins/action_translate.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/index/__init__.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/index/for_command.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/index/index.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/index/symbols_utils.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/lang.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/rag/__init__.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/rag/api_server.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/rag/doc_filter.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/rag/llm_wrapper.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/rag/loaders/__init__.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/rag/loaders/docx_loader.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/rag/loaders/excel_loader.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/rag/loaders/pdf_loader.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/rag/loaders/ppt_loader.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/rag/rag_config.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/rag/rag_entry.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/rag/raw_rag.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/rag/relevant_utils.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/rag/simple_directory_reader.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/rag/simple_rag.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/rag/token_checker.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/rag/types.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/regexproject/__init__.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/utils/__init__.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/utils/conversation_store.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/utils/llm_client_interceptors.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/utils/log_capture.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/utils/multi_turn.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/utils/print_table.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/utils/queue_communicate.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/utils/request_event_queue.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/utils/request_queue.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/utils/rest.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/utils/tests.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/tests/test_action_regex_project.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/tests/test_chat_auto_coder.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/tests/test_code_auto_merge_editblock.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/tests/test_command_completer.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/tests/test_planner.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/tests/test_queue_communicate.py +0 -0
- {auto-coder-0.1.173 → auto-coder-0.1.176}/tests/test_symbols_utils.py +0 -0
|
@@ -82,6 +82,7 @@ src/autocoder/rag/token_checker.py
|
|
|
82
82
|
src/autocoder/rag/token_counter.py
|
|
83
83
|
src/autocoder/rag/token_limiter.py
|
|
84
84
|
src/autocoder/rag/types.py
|
|
85
|
+
src/autocoder/rag/variable_holder.py
|
|
85
86
|
src/autocoder/rag/loaders/__init__.py
|
|
86
87
|
src/autocoder/rag/loaders/docx_loader.py
|
|
87
88
|
src/autocoder/rag/loaders/excel_loader.py
|
|
@@ -813,15 +813,7 @@ def main(input_args: Optional[List[str]] = None):
|
|
|
813
813
|
llm, args, code_auto_execute.Mode.SINGLE_ROUND
|
|
814
814
|
)
|
|
815
815
|
executor.run(query=args.query, context=s, source_code="")
|
|
816
|
-
return
|
|
817
|
-
elif raw_args.agent_command == "chat":
|
|
818
|
-
from autocoder.rag.rag_entry import RAGFactory
|
|
819
|
-
|
|
820
|
-
rag = RAGFactory.get_rag(llm=llm, args=args, path="")
|
|
821
|
-
rag.stream_chat_repl(args.query)
|
|
822
|
-
return
|
|
823
|
-
|
|
824
|
-
|
|
816
|
+
return
|
|
825
817
|
elif raw_args.doc_command == "serve":
|
|
826
818
|
|
|
827
819
|
from autocoder.rag.llm_wrapper import LLWrapper
|
|
@@ -846,6 +838,13 @@ def main(input_args: Optional[List[str]] = None):
|
|
|
846
838
|
llm_wrapper = LLWrapper(llm=llm, rag=rag)
|
|
847
839
|
serve(llm=llm_wrapper, args=server_args)
|
|
848
840
|
return
|
|
841
|
+
|
|
842
|
+
elif raw_args.doc_command == "chat":
|
|
843
|
+
from autocoder.rag.rag_entry import RAGFactory
|
|
844
|
+
|
|
845
|
+
rag = RAGFactory.get_rag(llm=llm, args=args, path="")
|
|
846
|
+
rag.stream_chat_repl(args.query)
|
|
847
|
+
return
|
|
849
848
|
|
|
850
849
|
else:
|
|
851
850
|
http_doc = HttpDoc(args=args, llm=llm, urls=None)
|
|
@@ -18,7 +18,7 @@ from rich.console import Console
|
|
|
18
18
|
from rich.table import Table
|
|
19
19
|
import os
|
|
20
20
|
|
|
21
|
-
from autocoder.rag.document_retriever import
|
|
21
|
+
from autocoder.rag.document_retriever import process_file_local
|
|
22
22
|
from autocoder.rag.token_counter import TokenCounter
|
|
23
23
|
|
|
24
24
|
if platform.system() == "Windows":
|
|
@@ -90,16 +90,24 @@ def initialize_system():
|
|
|
90
90
|
|
|
91
91
|
if choice == "1":
|
|
92
92
|
print_status(get_message("deploying_model").format("Deepseek官方"), "")
|
|
93
|
-
|
|
93
|
+
|
|
94
94
|
deploy_cmd = [
|
|
95
|
-
"byzerllm",
|
|
96
|
-
"
|
|
97
|
-
"--
|
|
98
|
-
"
|
|
99
|
-
"--
|
|
100
|
-
"
|
|
101
|
-
"--
|
|
102
|
-
"
|
|
95
|
+
"byzerllm",
|
|
96
|
+
"deploy",
|
|
97
|
+
"--pretrained_model_type",
|
|
98
|
+
"saas/openai",
|
|
99
|
+
"--cpus_per_worker",
|
|
100
|
+
"0.001",
|
|
101
|
+
"--gpus_per_worker",
|
|
102
|
+
"0",
|
|
103
|
+
"--worker_concurrency",
|
|
104
|
+
"1000",
|
|
105
|
+
"--num_workers",
|
|
106
|
+
"1",
|
|
107
|
+
"--infer_params",
|
|
108
|
+
f"saas.base_url=https://api.deepseek.com/v1 saas.api_key={api_key} saas.model=deepseek-chat",
|
|
109
|
+
"--model",
|
|
110
|
+
"deepseek_chat",
|
|
103
111
|
]
|
|
104
112
|
|
|
105
113
|
try:
|
|
@@ -138,7 +146,9 @@ def main(input_args: Optional[List[str]] = None):
|
|
|
138
146
|
|
|
139
147
|
# Serve command
|
|
140
148
|
serve_parser = subparsers.add_parser("serve", help="Start the RAG server")
|
|
141
|
-
serve_parser.add_argument(
|
|
149
|
+
serve_parser.add_argument(
|
|
150
|
+
"--quick", action="store_true", help="Skip system initialization"
|
|
151
|
+
)
|
|
142
152
|
serve_parser.add_argument("--file", default="", help=desc["file"])
|
|
143
153
|
serve_parser.add_argument("--model", default="deepseek_chat", help=desc["model"])
|
|
144
154
|
serve_parser.add_argument("--index_model", default="", help=desc["index_model"])
|
|
@@ -160,7 +170,19 @@ def main(input_args: Optional[List[str]] = None):
|
|
|
160
170
|
"--rag_context_window_limit",
|
|
161
171
|
type=int,
|
|
162
172
|
default=110000,
|
|
163
|
-
help="",
|
|
173
|
+
help="The input context window limit for RAG",
|
|
174
|
+
)
|
|
175
|
+
serve_parser.add_argument(
|
|
176
|
+
"--full_text_ratio",
|
|
177
|
+
type=float,
|
|
178
|
+
default=0.7,
|
|
179
|
+
help="The ratio of full text area in the input context window (0.0 to 1.0)",
|
|
180
|
+
)
|
|
181
|
+
serve_parser.add_argument(
|
|
182
|
+
"--segment_ratio",
|
|
183
|
+
type=float,
|
|
184
|
+
default=0.2,
|
|
185
|
+
help="The ratio of segment area in the input context window (0.0 to 1.0)",
|
|
164
186
|
)
|
|
165
187
|
serve_parser.add_argument(
|
|
166
188
|
"--required_exts", default="", help=desc["doc_build_parse_required_exts"]
|
|
@@ -198,6 +220,17 @@ def main(input_args: Optional[List[str]] = None):
|
|
|
198
220
|
help="Monitor mode for the doc update",
|
|
199
221
|
)
|
|
200
222
|
|
|
223
|
+
serve_parser.add_argument(
|
|
224
|
+
"--disable_auto_window",
|
|
225
|
+
action="store_true",
|
|
226
|
+
help="Disable automatic window adaptation for documents",
|
|
227
|
+
)
|
|
228
|
+
serve_parser.add_argument(
|
|
229
|
+
"--disable_segment_reorder",
|
|
230
|
+
action="store_true",
|
|
231
|
+
help="Disable reordering of document segments after retrieval",
|
|
232
|
+
)
|
|
233
|
+
|
|
201
234
|
# Tools command
|
|
202
235
|
tools_parser = subparsers.add_parser("tools", help="Various tools")
|
|
203
236
|
tools_subparsers = tools_parser.add_subparsers(dest="tool", help="Available tools")
|
|
@@ -255,7 +288,7 @@ def main(input_args: Optional[List[str]] = None):
|
|
|
255
288
|
|
|
256
289
|
def count_tokens(tokenizer_path: str, file_path: str):
|
|
257
290
|
token_counter = TokenCounter(tokenizer_path)
|
|
258
|
-
source_codes =
|
|
291
|
+
source_codes = process_file_local(file_path)
|
|
259
292
|
|
|
260
293
|
console = Console()
|
|
261
294
|
table = Table(title="Token Count Results")
|
|
@@ -11,6 +11,8 @@ class SourceCode(pydantic.BaseModel):
|
|
|
11
11
|
module_name: str
|
|
12
12
|
source_code: str
|
|
13
13
|
tag: str = ""
|
|
14
|
+
tokens: int = -1
|
|
15
|
+
metadata: Dict[str, Any] = {}
|
|
14
16
|
|
|
15
17
|
|
|
16
18
|
class TranslateReadme(pydantic.BaseModel):
|
|
@@ -281,9 +283,11 @@ class AutoCoderArgs(pydantic.BaseModel):
|
|
|
281
283
|
doc_command: Optional[str] = None
|
|
282
284
|
required_exts: Optional[str] = None
|
|
283
285
|
|
|
284
|
-
monitor_mode:
|
|
285
|
-
|
|
286
|
-
|
|
286
|
+
monitor_mode: bool = False
|
|
287
|
+
disable_auto_window: bool = False
|
|
288
|
+
disable_segment_reorder: bool = False
|
|
289
|
+
rag_doc_filter_relevance: int = 5
|
|
290
|
+
tokenizer_path: Optional[str] = None
|
|
287
291
|
skip_confirm: Optional[bool] = False
|
|
288
292
|
silence: Optional[bool] = False
|
|
289
293
|
exclude_files: Optional[Union[str, List[str]]] = ""
|
|
@@ -304,5 +308,9 @@ class AutoCoderArgs(pydantic.BaseModel):
|
|
|
304
308
|
|
|
305
309
|
agent_designer_mode: Optional[str] = "svg"
|
|
306
310
|
|
|
311
|
+
full_text_ratio: Optional[float] = 0.7
|
|
312
|
+
segment_ratio: Optional[float] = 0.2
|
|
313
|
+
buff_ratio: Optional[float] = 0.1
|
|
314
|
+
|
|
307
315
|
class Config:
|
|
308
316
|
protected_namespaces = ()
|
|
@@ -187,7 +187,11 @@ class PyProject:
|
|
|
187
187
|
|
|
188
188
|
def convert_to_source_code(self, file_path):
|
|
189
189
|
module_name = file_path
|
|
190
|
-
|
|
190
|
+
try:
|
|
191
|
+
source_code = self.read_file_content(file_path)
|
|
192
|
+
except Exception as e:
|
|
193
|
+
logger.warning(f"Failed to read file: {file_path}. Error: {str(e)}")
|
|
194
|
+
return None
|
|
191
195
|
return SourceCode(module_name=module_name, source_code=source_code)
|
|
192
196
|
|
|
193
197
|
def get_package_source_codes(
|
|
@@ -18,10 +18,15 @@ from loguru import logger
|
|
|
18
18
|
from pydantic import BaseModel
|
|
19
19
|
|
|
20
20
|
from autocoder.common import SourceCode
|
|
21
|
-
from autocoder.rag.loaders import (
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
21
|
+
from autocoder.rag.loaders import (
|
|
22
|
+
extract_text_from_docx,
|
|
23
|
+
extract_text_from_excel,
|
|
24
|
+
extract_text_from_pdf,
|
|
25
|
+
extract_text_from_ppt,
|
|
26
|
+
)
|
|
27
|
+
from autocoder.rag import variable_holder
|
|
28
|
+
from autocoder.rag.token_counter import count_tokens_worker, count_tokens
|
|
29
|
+
from uuid import uuid4
|
|
25
30
|
|
|
26
31
|
cache_lock = threading.Lock()
|
|
27
32
|
|
|
@@ -34,72 +39,62 @@ class AddOrUpdateEvent(BaseModel):
|
|
|
34
39
|
file_infos: List[Tuple[str, str, float]]
|
|
35
40
|
|
|
36
41
|
|
|
37
|
-
|
|
38
|
-
|
|
42
|
+
def process_file_in_multi_process(
|
|
43
|
+
file_info: Tuple[str, str, float]
|
|
44
|
+
) -> List[SourceCode]:
|
|
39
45
|
start_time = time.time()
|
|
40
46
|
file_path, relative_path, _ = file_info
|
|
41
47
|
try:
|
|
42
48
|
if file_path.endswith(".pdf"):
|
|
43
49
|
with open(file_path, "rb") as f:
|
|
44
50
|
content = extract_text_from_pdf(f.read())
|
|
45
|
-
v = [SourceCode(module_name=file_path, source_code=content)]
|
|
46
|
-
elif file_path.endswith(".docx"):
|
|
47
|
-
with open(file_path, "rb") as f:
|
|
48
|
-
content = extract_text_from_docx(f.read())
|
|
49
|
-
v = [SourceCode(module_name=f"##File: {file_path}", source_code=content)]
|
|
50
|
-
elif file_path.endswith(".xlsx") or file_path.endswith(".xls"):
|
|
51
|
-
sheets = extract_text_from_excel(file_path)
|
|
52
51
|
v = [
|
|
53
52
|
SourceCode(
|
|
54
|
-
module_name=
|
|
55
|
-
source_code=
|
|
53
|
+
module_name=file_path,
|
|
54
|
+
source_code=content,
|
|
55
|
+
tokens=count_tokens_worker(content),
|
|
56
56
|
)
|
|
57
|
-
for sheet in sheets
|
|
58
57
|
]
|
|
59
|
-
elif file_path.endswith(".pptx"):
|
|
60
|
-
slides = extract_text_from_ppt(file_path)
|
|
61
|
-
content = "".join(f"#{slide[0]}\n{slide[1]}\n\n" for slide in slides)
|
|
62
|
-
v = [SourceCode(module_name=f"##File: {file_path}", source_code=content)]
|
|
63
|
-
else:
|
|
64
|
-
with open(file_path, "r", encoding="utf-8") as f:
|
|
65
|
-
content = f.read()
|
|
66
|
-
v = [SourceCode(module_name=f"##File: {file_path}", source_code=content)]
|
|
67
|
-
logger.info(f"Load file {file_path} in {time.time() - start_time}")
|
|
68
|
-
return v
|
|
69
|
-
except Exception as e:
|
|
70
|
-
logger.error(f"Error processing file {file_path}: {str(e)}")
|
|
71
|
-
return []
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
def process_file2(file_info: Tuple[str, str, float]) -> List[SourceCode]:
|
|
75
|
-
start_time = time.time()
|
|
76
|
-
file_path, relative_path, _ = file_info
|
|
77
|
-
try:
|
|
78
|
-
if file_path.endswith(".pdf"):
|
|
79
|
-
with open(file_path, "rb") as f:
|
|
80
|
-
content = extract_text_from_pdf(f.read())
|
|
81
|
-
v = [SourceCode(module_name=file_path, source_code=content)]
|
|
82
58
|
elif file_path.endswith(".docx"):
|
|
83
59
|
with open(file_path, "rb") as f:
|
|
84
60
|
content = extract_text_from_docx(f.read())
|
|
85
|
-
v = [
|
|
61
|
+
v = [
|
|
62
|
+
SourceCode(
|
|
63
|
+
module_name=f"##File: {file_path}",
|
|
64
|
+
source_code=content,
|
|
65
|
+
tokens=count_tokens_worker(content),
|
|
66
|
+
)
|
|
67
|
+
]
|
|
86
68
|
elif file_path.endswith(".xlsx") or file_path.endswith(".xls"):
|
|
87
69
|
sheets = extract_text_from_excel(file_path)
|
|
88
70
|
v = [
|
|
89
71
|
SourceCode(
|
|
90
72
|
module_name=f"##File: {file_path}#{sheet[0]}",
|
|
91
73
|
source_code=sheet[1],
|
|
74
|
+
tokens=count_tokens_worker(sheet[1]),
|
|
92
75
|
)
|
|
93
76
|
for sheet in sheets
|
|
94
77
|
]
|
|
95
78
|
elif file_path.endswith(".pptx"):
|
|
96
79
|
slides = extract_text_from_ppt(file_path)
|
|
97
80
|
content = "".join(f"#{slide[0]}\n{slide[1]}\n\n" for slide in slides)
|
|
98
|
-
v = [
|
|
81
|
+
v = [
|
|
82
|
+
SourceCode(
|
|
83
|
+
module_name=f"##File: {file_path}",
|
|
84
|
+
source_code=content,
|
|
85
|
+
tokens=count_tokens_worker(content),
|
|
86
|
+
)
|
|
87
|
+
]
|
|
99
88
|
else:
|
|
100
89
|
with open(file_path, "r", encoding="utf-8") as f:
|
|
101
90
|
content = f.read()
|
|
102
|
-
v = [
|
|
91
|
+
v = [
|
|
92
|
+
SourceCode(
|
|
93
|
+
module_name=f"##File: {file_path}",
|
|
94
|
+
source_code=content,
|
|
95
|
+
tokens=count_tokens_worker(content),
|
|
96
|
+
)
|
|
97
|
+
]
|
|
103
98
|
logger.info(f"Load file {file_path} in {time.time() - start_time}")
|
|
104
99
|
return v
|
|
105
100
|
except Exception as e:
|
|
@@ -107,34 +102,59 @@ def process_file2(file_info: Tuple[str, str, float]) -> List[SourceCode]:
|
|
|
107
102
|
return []
|
|
108
103
|
|
|
109
104
|
|
|
110
|
-
def
|
|
105
|
+
def process_file_local(file_path: str) -> List[SourceCode]:
|
|
111
106
|
start_time = time.time()
|
|
112
107
|
try:
|
|
113
108
|
if file_path.endswith(".pdf"):
|
|
114
109
|
with open(file_path, "rb") as f:
|
|
115
110
|
content = extract_text_from_pdf(f.read())
|
|
116
|
-
v = [
|
|
111
|
+
v = [
|
|
112
|
+
SourceCode(
|
|
113
|
+
module_name=file_path,
|
|
114
|
+
source_code=content,
|
|
115
|
+
tokens=count_tokens(content),
|
|
116
|
+
)
|
|
117
|
+
]
|
|
117
118
|
elif file_path.endswith(".docx"):
|
|
118
119
|
with open(file_path, "rb") as f:
|
|
119
120
|
content = extract_text_from_docx(f.read())
|
|
120
|
-
v = [
|
|
121
|
+
v = [
|
|
122
|
+
SourceCode(
|
|
123
|
+
module_name=f"##File: {file_path}",
|
|
124
|
+
source_code=content,
|
|
125
|
+
tokens=count_tokens(content),
|
|
126
|
+
)
|
|
127
|
+
]
|
|
121
128
|
elif file_path.endswith(".xlsx") or file_path.endswith(".xls"):
|
|
122
129
|
sheets = extract_text_from_excel(file_path)
|
|
123
130
|
v = [
|
|
124
131
|
SourceCode(
|
|
125
132
|
module_name=f"##File: {file_path}#{sheet[0]}",
|
|
126
133
|
source_code=sheet[1],
|
|
134
|
+
tokens=count_tokens(sheet[1]),
|
|
127
135
|
)
|
|
128
136
|
for sheet in sheets
|
|
129
137
|
]
|
|
130
138
|
elif file_path.endswith(".pptx"):
|
|
131
139
|
slides = extract_text_from_ppt(file_path)
|
|
132
140
|
content = "".join(f"#{slide[0]}\n{slide[1]}\n\n" for slide in slides)
|
|
133
|
-
v = [
|
|
141
|
+
v = [
|
|
142
|
+
SourceCode(
|
|
143
|
+
module_name=f"##File: {file_path}",
|
|
144
|
+
source_code=content,
|
|
145
|
+
tokens=count_tokens(content),
|
|
146
|
+
)
|
|
147
|
+
]
|
|
134
148
|
else:
|
|
135
149
|
with open(file_path, "r", encoding="utf-8") as f:
|
|
136
150
|
content = f.read()
|
|
137
|
-
v = [
|
|
151
|
+
v = [
|
|
152
|
+
SourceCode(
|
|
153
|
+
module_name=f"##File: {file_path}",
|
|
154
|
+
source_code=content,
|
|
155
|
+
tokens=count_tokens(content),
|
|
156
|
+
)
|
|
157
|
+
]
|
|
138
158
|
logger.info(f"Load file {file_path} in {time.time() - start_time}")
|
|
139
159
|
return v
|
|
140
160
|
except Exception as e:
|
|
@@ -205,7 +225,7 @@ class AutoCoderRAGDocListener:
|
|
|
205
225
|
self.update_cache(item)
|
|
206
226
|
|
|
207
227
|
def update_cache(self, file_path):
|
|
208
|
-
source_code =
|
|
228
|
+
source_code = process_file_local(file_path)
|
|
209
229
|
self.cache[file_path] = {
|
|
210
230
|
"file_path": file_path,
|
|
211
231
|
"content": [c.model_dump() for c in source_code],
|
|
@@ -220,7 +240,9 @@ class AutoCoderRAGDocListener:
|
|
|
220
240
|
|
|
221
241
|
def open_watch(self):
|
|
222
242
|
logger.info(f"start monitor: {self.path}...")
|
|
223
|
-
for changes in watch(
|
|
243
|
+
for changes in watch(
|
|
244
|
+
self.path, watch_filter=self.file_filter, stop_event=self.stop_event
|
|
245
|
+
):
|
|
224
246
|
for change in changes:
|
|
225
247
|
(action, path) = change
|
|
226
248
|
if action == Change.added or action == Change.modified:
|
|
@@ -290,7 +312,6 @@ class AutoCoderRAGAsyncUpdateQueue:
|
|
|
290
312
|
self.thread.start()
|
|
291
313
|
self.cache = self.read_cache()
|
|
292
314
|
|
|
293
|
-
|
|
294
315
|
def _process_queue(self):
|
|
295
316
|
while not self.stop_event.is_set():
|
|
296
317
|
try:
|
|
@@ -324,8 +345,14 @@ class AutoCoderRAGAsyncUpdateQueue:
|
|
|
324
345
|
# results = ray.get(
|
|
325
346
|
# [process_file.remote(file_info) for file_info in files_to_process]
|
|
326
347
|
# )
|
|
327
|
-
|
|
328
|
-
|
|
348
|
+
from autocoder.rag.token_counter import initialize_tokenizer
|
|
349
|
+
|
|
350
|
+
with Pool(
|
|
351
|
+
processes=os.cpu_count(),
|
|
352
|
+
initializer=initialize_tokenizer,
|
|
353
|
+
initargs=(variable_holder.TOKENIZER_PATH,),
|
|
354
|
+
) as pool:
|
|
355
|
+
results = pool.map(process_file_in_multi_process, files_to_process)
|
|
329
356
|
|
|
330
357
|
for file_info, result in zip(files_to_process, results):
|
|
331
358
|
self.update_cache(file_info, result)
|
|
@@ -365,7 +392,7 @@ class AutoCoderRAGAsyncUpdateQueue:
|
|
|
365
392
|
elif isinstance(file_list, AddOrUpdateEvent):
|
|
366
393
|
for file_info in file_list.file_infos:
|
|
367
394
|
logger.info(f"{file_info[0]} is detected to be updated")
|
|
368
|
-
result =
|
|
395
|
+
result = process_file_local(file_info)
|
|
369
396
|
self.update_cache(file_info, result)
|
|
370
397
|
|
|
371
398
|
self.write_cache()
|
|
@@ -410,7 +437,9 @@ class AutoCoderRAGAsyncUpdateQueue:
|
|
|
410
437
|
# 释放文件锁
|
|
411
438
|
fcntl.flock(lockf, fcntl.LOCK_UN)
|
|
412
439
|
|
|
413
|
-
def update_cache(
|
|
440
|
+
def update_cache(
|
|
441
|
+
self, file_info: Tuple[str, str, float], content: List[SourceCode]
|
|
442
|
+
):
|
|
414
443
|
file_path, relative_path, modify_time = file_info
|
|
415
444
|
self.cache[file_path] = {
|
|
416
445
|
"file_path": file_path,
|
|
@@ -485,11 +514,20 @@ class DocumentRetriever:
|
|
|
485
514
|
required_exts: list,
|
|
486
515
|
on_ray: bool = False,
|
|
487
516
|
monitor_mode: bool = False,
|
|
517
|
+
single_file_token_limit: int = 60000,
|
|
518
|
+
disable_auto_window: bool = False,
|
|
488
519
|
) -> None:
|
|
489
520
|
self.path = path
|
|
490
521
|
self.ignore_spec = ignore_spec
|
|
491
522
|
self.required_exts = required_exts
|
|
492
523
|
self.monitor_mode = monitor_mode
|
|
524
|
+
self.single_file_token_limit = single_file_token_limit
|
|
525
|
+
self.disable_auto_window = disable_auto_window
|
|
526
|
+
|
|
527
|
+
# 多小的文件会被合并
|
|
528
|
+
self.small_file_token_limit = self.single_file_token_limit / 4
|
|
529
|
+
# 合并后的最大文件大小
|
|
530
|
+
self.small_file_merge_limit = self.single_file_token_limit / 2
|
|
493
531
|
|
|
494
532
|
self.on_ray = on_ray
|
|
495
533
|
if self.on_ray:
|
|
@@ -502,6 +540,13 @@ class DocumentRetriever:
|
|
|
502
540
|
path, ignore_spec, required_exts
|
|
503
541
|
)
|
|
504
542
|
|
|
543
|
+
logger.info(f"DocumentRetriever initialized with:")
|
|
544
|
+
logger.info(f" Path: {self.path}")
|
|
545
|
+
logger.info(f" Diable auto window: {self.disable_auto_window} ")
|
|
546
|
+
logger.info(f" Single file token limit: {self.single_file_token_limit}")
|
|
547
|
+
logger.info(f" Small file token limit: {self.small_file_token_limit}")
|
|
548
|
+
logger.info(f" Small file merge limit: {self.small_file_merge_limit}")
|
|
549
|
+
|
|
505
550
|
def get_cache(self):
|
|
506
551
|
if self.on_ray:
|
|
507
552
|
return ray.get(self.cacher.get_cache.remote())
|
|
@@ -509,6 +554,102 @@ class DocumentRetriever:
|
|
|
509
554
|
return self.cacher.get_cache()
|
|
510
555
|
|
|
511
556
|
def retrieve_documents(self) -> Generator[SourceCode, None, None]:
|
|
557
|
+
logger.info("Starting document retrieval process")
|
|
558
|
+
waiting_list = []
|
|
559
|
+
waiting_tokens = 0
|
|
512
560
|
for _, data in self.get_cache().items():
|
|
513
561
|
for source_code in data["content"]:
|
|
514
|
-
|
|
562
|
+
doc = SourceCode.model_validate(source_code)
|
|
563
|
+
if self.disable_auto_window:
|
|
564
|
+
yield doc
|
|
565
|
+
else:
|
|
566
|
+
if doc.tokens <= 0:
|
|
567
|
+
yield doc
|
|
568
|
+
elif doc.tokens < self.small_file_token_limit:
|
|
569
|
+
waiting_list, waiting_tokens = self._add_to_waiting_list(
|
|
570
|
+
doc, waiting_list, waiting_tokens
|
|
571
|
+
)
|
|
572
|
+
if waiting_tokens >= self.small_file_merge_limit:
|
|
573
|
+
yield from self._process_waiting_list(waiting_list)
|
|
574
|
+
waiting_list = []
|
|
575
|
+
waiting_tokens = 0
|
|
576
|
+
elif doc.tokens > self.single_file_token_limit:
|
|
577
|
+
yield from self._split_large_document(doc)
|
|
578
|
+
else:
|
|
579
|
+
yield doc
|
|
580
|
+
if waiting_list and not self.disable_auto_window:
|
|
581
|
+
yield from self._process_waiting_list(waiting_list)
|
|
582
|
+
|
|
583
|
+
logger.info("Document retrieval process completed")
|
|
584
|
+
|
|
585
|
+
def _add_to_waiting_list(
|
|
586
|
+
self, doc: SourceCode, waiting_list: List[SourceCode], waiting_tokens: int
|
|
587
|
+
) -> Tuple[List[SourceCode], int]:
|
|
588
|
+
waiting_list.append(doc)
|
|
589
|
+
return waiting_list, waiting_tokens + doc.tokens
|
|
590
|
+
|
|
591
|
+
def _process_waiting_list(
|
|
592
|
+
self, waiting_list: List[SourceCode]
|
|
593
|
+
) -> Generator[SourceCode, None, None]:
|
|
594
|
+
if len(waiting_list) == 1:
|
|
595
|
+
yield waiting_list[0]
|
|
596
|
+
elif len(waiting_list) > 1:
|
|
597
|
+
yield self._merge_documents(waiting_list)
|
|
598
|
+
|
|
599
|
+
def _merge_documents(self, docs: List[SourceCode]) -> SourceCode:
|
|
600
|
+
merged_content = "\n".join(
|
|
601
|
+
[f"#File: {doc.module_name}\n{doc.source_code}" for doc in docs]
|
|
602
|
+
)
|
|
603
|
+
merged_tokens = sum([doc.tokens for doc in docs])
|
|
604
|
+
merged_name = f"Merged_{len(docs)}_docs_{str(uuid4())}"
|
|
605
|
+
logger.info(
|
|
606
|
+
f"Merged {len(docs)} documents into {merged_name} (tokens: {merged_tokens})."
|
|
607
|
+
)
|
|
608
|
+
return SourceCode(
|
|
609
|
+
module_name=merged_name,
|
|
610
|
+
source_code=merged_content,
|
|
611
|
+
tokens=merged_tokens,
|
|
612
|
+
metadata={"original_docs": [doc.module_name for doc in docs]},
|
|
613
|
+
)
|
|
614
|
+
|
|
615
|
+
def _split_large_document(
|
|
616
|
+
self, doc: SourceCode
|
|
617
|
+
) -> Generator[SourceCode, None, None]:
|
|
618
|
+
chunk_size = self.single_file_token_limit
|
|
619
|
+
total_chunks = (doc.tokens + chunk_size - 1) // chunk_size
|
|
620
|
+
logger.info(f"Splitting document {doc.module_name} into {total_chunks} chunks")
|
|
621
|
+
for i in range(0, doc.tokens, chunk_size):
|
|
622
|
+
chunk_content = doc.source_code[i : i + chunk_size]
|
|
623
|
+
chunk_tokens = min(chunk_size, doc.tokens - i)
|
|
624
|
+
chunk_name = f"{doc.module_name}#chunk{i//chunk_size+1}"
|
|
625
|
+
# logger.debug(f" Created chunk: {chunk_name} (tokens: {chunk_tokens})")
|
|
626
|
+
yield SourceCode(
|
|
627
|
+
module_name=chunk_name,
|
|
628
|
+
source_code=chunk_content,
|
|
629
|
+
tokens=chunk_tokens,
|
|
630
|
+
metadata={
|
|
631
|
+
"original_doc": doc.module_name,
|
|
632
|
+
"chunk_index": i // chunk_size + 1,
|
|
633
|
+
},
|
|
634
|
+
)
|
|
635
|
+
|
|
636
|
+
def _split_document(
|
|
637
|
+
self, doc: SourceCode, token_limit: int
|
|
638
|
+
) -> Generator[SourceCode, None, None]:
|
|
639
|
+
remaining_tokens = doc.tokens
|
|
640
|
+
chunk_number = 1
|
|
641
|
+
start_index = 0
|
|
642
|
+
|
|
643
|
+
while remaining_tokens > 0:
|
|
644
|
+
end_index = start_index + token_limit
|
|
645
|
+
chunk_content = doc.source_code[start_index:end_index]
|
|
646
|
+
chunk_tokens = min(token_limit, remaining_tokens)
|
|
647
|
+
|
|
648
|
+
chunk_name = f"{doc.module_name}#{chunk_number:06d}"
|
|
649
|
+
yield SourceCode(
|
|
650
|
+
module_name=chunk_name, source_code=chunk_content, tokens=chunk_tokens
|
|
651
|
+
)
|
|
652
|
+
|
|
653
|
+
start_index = end_index
|
|
654
|
+
remaining_tokens -= chunk_tokens
|
|
655
|
+
chunk_number += 1
|