auto-coder 0.1.173__py3-none-any.whl → 0.1.175__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of auto-coder might be problematic.
- {auto_coder-0.1.173.dist-info → auto_coder-0.1.175.dist-info}/METADATA +1 -1
- {auto_coder-0.1.173.dist-info → auto_coder-0.1.175.dist-info}/RECORD +18 -17
- autocoder/auto_coder.py +8 -9
- autocoder/auto_coder_rag.py +41 -13
- autocoder/common/__init__.py +8 -1
- autocoder/pyproject/__init__.py +5 -1
- autocoder/rag/document_retriever.py +196 -55
- autocoder/rag/long_context_rag.py +80 -23
- autocoder/rag/token_counter.py +31 -9
- autocoder/rag/token_limiter.py +34 -9
- autocoder/rag/variable_holder.py +2 -0
- autocoder/suffixproject/__init__.py +5 -1
- autocoder/tsproject/__init__.py +5 -1
- autocoder/version.py +1 -1
- {auto_coder-0.1.173.dist-info → auto_coder-0.1.175.dist-info}/LICENSE +0 -0
- {auto_coder-0.1.173.dist-info → auto_coder-0.1.175.dist-info}/WHEEL +0 -0
- {auto_coder-0.1.173.dist-info → auto_coder-0.1.175.dist-info}/entry_points.txt +0 -0
- {auto_coder-0.1.173.dist-info → auto_coder-0.1.175.dist-info}/top_level.txt +0 -0
{auto_coder-0.1.173.dist-info → auto_coder-0.1.175.dist-info}/RECORD
CHANGED
@@ -1,13 +1,13 @@
 autocoder/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-autocoder/auto_coder.py,sha256=
+autocoder/auto_coder.py,sha256=HmgKa_ZApFlCsqo6BvuVeCPuncBT_Dh29ayZxxGR6lo,32216
 autocoder/auto_coder_lang.py,sha256=4qIS1tbEI8mpbtt6ThppTwKOM6MLuJTWJdgs5jIDGE0,2301
-autocoder/auto_coder_rag.py,sha256=
+autocoder/auto_coder_rag.py,sha256=y-iZkb7Zw2JORbxGYaOoIhVQXBohGyUBFXN675qsgXg,11280
 autocoder/auto_coder_server.py,sha256=qRY88mkBnqSGFDcwYE5gwpe2WPhIw1nEH6LdbjCQhQk,20306
 autocoder/chat_auto_coder.py,sha256=i5xIuWlTqF0pJz8kXoa-_bW3Ic3SfCFvU2WJIMxrUHU,81798
 autocoder/chat_auto_coder_lang.py,sha256=QYtu5gWEQmWKVovR_qUZ8plySZarNFX_Onk-1vN9IiA,8524
 autocoder/command_args.py,sha256=ftWw6HnFUZPiQPt1oV-SfpHQe69XN3knaFy1lpROBcU,26854
 autocoder/lang.py,sha256=e-07rYTgimpxS8sm-AxKSmH4kKQX4N05YFHJBg9trVs,12598
-autocoder/version.py,sha256=
+autocoder/version.py,sha256=7VuhzksZrpUCv5FBnLSYtIkaqKki-s7Gt9mfU67bB6A,23
 autocoder/agent/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 autocoder/agent/auto_tool.py,sha256=DBzip-P_T6ZtT2eHexPcusmKYD0h7ufzp7TLwXAY10E,11554
 autocoder/agent/coder.py,sha256=dnITYHqkcOip8zV4lywbkYNH9w7Q3qyYaUArJ4WPrTs,866
@@ -17,7 +17,7 @@ autocoder/agent/project_reader.py,sha256=-MWRqsr7O4mvU0PIpAhOUBb29htZAvA37pa_GeE
 autocoder/chat/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 autocoder/common/JupyterClient.py,sha256=O-wi6pXeAEYhAY24kDa0BINrLYvKS6rKyWe98pDClS0,2816
 autocoder/common/ShellClient.py,sha256=fM1q8t_XMSbLBl2zkCNC2J9xuyKN3eXzGm6hHhqL2WY,2286
-autocoder/common/__init__.py,sha256=
+autocoder/common/__init__.py,sha256=FB0MdcGtI60-jPWXurNvOS9dZUZCg6HmgzHNVuHiW6c,10076
 autocoder/common/anything2images.py,sha256=0ILBbWzY02M-CiWB-vzuomb_J1hVdxRcenAfIrAXq9M,25283
 autocoder/common/audio.py,sha256=Kn9nWKQddWnUrAz0a_ZUgjcu4VUU_IcZBigT7n3N3qc,7439
 autocoder/common/cleaner.py,sha256=NU72i8C6o9m0vXExab7nao5bstBUsfJFcj11cXa9l4U,1089
@@ -56,13 +56,13 @@ autocoder/index/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 autocoder/index/for_command.py,sha256=zfbvQnhHjsAqBc4Ce1kMGIu0jPEk_rtH7fntg89_4z0,3092
 autocoder/index/index.py,sha256=6uakPXThpDWxAyOAP-7AbMuXaXJJkBKctL5RkNWGdGw,22485
 autocoder/index/symbols_utils.py,sha256=CjcjUVajmJZB75Ty3a7kMv1BZphrm-tIBAdOJv6uo-0,2037
-autocoder/pyproject/__init__.py,sha256
+autocoder/pyproject/__init__.py,sha256=-2-ImQVw6e3NQZQOyDlHEP5b4xVs5ur2G5izB-JCa-A,13160
 autocoder/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 autocoder/rag/api_server.py,sha256=zokIlDJlk7ucRorSLQm80uICO1mecfmn4J2zVqEBskE,6786
 autocoder/rag/doc_filter.py,sha256=LqU8Wi6klwpY9WTHVtkioSHpmo9IWhRz39dzV1gvp6E,9315
-autocoder/rag/document_retriever.py,sha256=
+autocoder/rag/document_retriever.py,sha256=plwm8BpC55VJTUWCZyG4HsXYm-niqUsXaBMDLrLgYj0,23348
 autocoder/rag/llm_wrapper.py,sha256=xRbTBpLUH43Ah5jplL8WWWU-kjKfNgEJoUntLGBq5F4,2484
-autocoder/rag/long_context_rag.py,sha256=
+autocoder/rag/long_context_rag.py,sha256=qKuXBuwuI_eoBh7XNApbDfVYtG4sPqBbh3s7yscIUuI,19391
 autocoder/rag/rag_config.py,sha256=8LwFcTd8OJWWwi1_WY4IzjqgtT6RyE2j4PjxS5cCTDE,802
 autocoder/rag/rag_entry.py,sha256=V1RJ8RGqM30DNPmzymv64rZjNRGWn6kfc8sRy_LECg0,2451
 autocoder/rag/raw_rag.py,sha256=yS2Ur6kG0IRjhCj2_VonwxjY_xls_E62jO5Gz5j2nqE,2952
@@ -70,9 +70,10 @@ autocoder/rag/relevant_utils.py,sha256=OGfp98OXG4jr3jNmtHIeXGPF8mOlIbTnolPIVTZzY
 autocoder/rag/simple_directory_reader.py,sha256=LkKreCkNdEOoL4fNhc3_hDoyyWTQUte4uqextISRz4U,24485
 autocoder/rag/simple_rag.py,sha256=I902EUqOK1WM0Y2WFd7RzDJYofElvTZNLVCBtX5A9rc,14885
 autocoder/rag/token_checker.py,sha256=jc76x6KWmvVxds6W8juZfQGaoErudc2HenG3sNQfSLs,2819
-autocoder/rag/token_counter.py,sha256=
-autocoder/rag/token_limiter.py,sha256=
+autocoder/rag/token_counter.py,sha256=9ujfI5xQvwzKpN9XFWQGnXpm0h1sL7kgIJxgposcxNo,2096
+autocoder/rag/token_limiter.py,sha256=dGSjKWwP_3rMwr8Yq06xqK2BrHpxW8Trn1gQLfnDOA8,8749
 autocoder/rag/types.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+autocoder/rag/variable_holder.py,sha256=pDayuCnlKj7-bkn4iUHX5gea9UObddbi3ZnXotmxCs4,45
 autocoder/rag/loaders/__init__.py,sha256=EQHEZ5Cmz-mGP2SllUTvcIbYCnF7W149dNpNItfs0yE,304
 autocoder/rag/loaders/docx_loader.py,sha256=g6Ta8rMUbfgwB8N1qiajhyO6wpaWl7zygAZiKShuioI,174
 autocoder/rag/loaders/excel_loader.py,sha256=Ue8YB1z_kBs8SjIPuBskyM08Q1JiONs_BJZPrzi59oo,896
@@ -80,8 +81,8 @@ autocoder/rag/loaders/pdf_loader.py,sha256=CGfXOja7QZ7mHN-U5MsTiVMFzjP322rTj3dkY
 autocoder/rag/loaders/ppt_loader.py,sha256=7VEYc-bqgK8VHCoGC3DIUcqbpda-E5jQF9lYLqP256I,1681
 autocoder/regex_project/__init__.py,sha256=EBZeCL5ORyD_9_5u_UuG4s7XtpXOu0y1sWDmxWFtufE,6781
 autocoder/regexproject/__init__.py,sha256=ThuvVFdpw1EgWv4aIRkhg3ZclKPxMVharUKWppFpQ8o,8436
-autocoder/suffixproject/__init__.py,sha256=
-autocoder/tsproject/__init__.py,sha256=
+autocoder/suffixproject/__init__.py,sha256=EaQoumMzZ2COxMiI_GnL3SG4LGzRj0Qw7UpqLfNLCw8,9823
+autocoder/tsproject/__init__.py,sha256=QmEpNZYUJq1o0lGMs3UuUIUU-2aq_3eh1VxqnIc-hME,10431
 autocoder/utils/__init__.py,sha256=O3n6cpsgkIbbMuwmBHSQ1dls_IBD7_7YKFFaeKNo_tc,1193
 autocoder/utils/coder.py,sha256=rK8e0svQBe0NOP26dIGToUXgha_hUDgxlWoC_p_r7oc,5698
 autocoder/utils/conversation_store.py,sha256=sz-hhY7sttPAUOAQU6Pze-5zJc3j0_Emj22dM_0l5ro,1161
@@ -94,9 +95,9 @@ autocoder/utils/request_event_queue.py,sha256=r3lo5qGsB1dIjzVQ05dnr0z_9Z3zOkBdP1
 autocoder/utils/request_queue.py,sha256=nwp6PMtgTCiuwJI24p8OLNZjUiprC-TsefQrhMI-yPE,3889
 autocoder/utils/rest.py,sha256=3tXA8KZG6jKz_tddHNLGx77Icee88WcUeesfNsgPno4,8790
 autocoder/utils/tests.py,sha256=BqphrwyycGAvs-5mhH8pKtMZdObwhFtJ5MC_ZAOiLq8,1340
-auto_coder-0.1.
-auto_coder-0.1.
-auto_coder-0.1.
-auto_coder-0.1.
-auto_coder-0.1.
-auto_coder-0.1.
+auto_coder-0.1.175.dist-info/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
+auto_coder-0.1.175.dist-info/METADATA,sha256=JrgeRETDy_kU_7g-1UNJfKkxWYlSgamCpiihAYl04Yw,2352
+auto_coder-0.1.175.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+auto_coder-0.1.175.dist-info/entry_points.txt,sha256=0nzHtHH4pNcM7xq4EBA2toS28Qelrvcbrr59GqD_0Ak,350
+auto_coder-0.1.175.dist-info/top_level.txt,sha256=Jqc0_uJSw2GwoFQAa9iJxYns-2mWla-9ok_Y3Gcznjk,10
+auto_coder-0.1.175.dist-info/RECORD,,
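Note on the RECORD churn above: every entry follows the wheel convention path,sha256=<urlsafe base64 digest without padding>,<size in bytes>, so any file whose bytes change (the modified modules, the new autocoder/rag/variable_holder.py and the regenerated dist-info files) gets a new hash and size. A minimal sketch for recomputing such an entry; the helper name is illustrative, not part of the package:

import base64
import hashlib
from pathlib import Path

def record_entry(path: str) -> str:
    # RECORD format: path,sha256=<urlsafe b64 digest, '=' padding stripped>,<size>
    data = Path(path).read_bytes()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=")
    return f"{path},sha256={digest.decode('ascii')},{len(data)}"

# e.g. record_entry("autocoder/version.py") should reproduce the line added above.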
autocoder/auto_coder.py
CHANGED
@@ -813,15 +813,7 @@ def main(input_args: Optional[List[str]] = None):
 llm, args, code_auto_execute.Mode.SINGLE_ROUND
 )
 executor.run(query=args.query, context=s, source_code="")
-return
-elif raw_args.agent_command == "chat":
-from autocoder.rag.rag_entry import RAGFactory
-
-rag = RAGFactory.get_rag(llm=llm, args=args, path="")
-rag.stream_chat_repl(args.query)
-return
-
-
+return
 elif raw_args.doc_command == "serve":
 
 from autocoder.rag.llm_wrapper import LLWrapper
@@ -846,6 +838,13 @@ def main(input_args: Optional[List[str]] = None):
 llm_wrapper = LLWrapper(llm=llm, rag=rag)
 serve(llm=llm_wrapper, args=server_args)
 return
+
+elif raw_args.doc_command == "chat":
+from autocoder.rag.rag_entry import RAGFactory
+
+rag = RAGFactory.get_rag(llm=llm, args=args, path="")
+rag.stream_chat_repl(args.query)
+return
 
 else:
 http_doc = HttpDoc(args=args, llm=llm, urls=None)
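Net effect of the two hunks above: the interactive RAG chat REPL moves from the agent sub-command branch to the doc sub-command branch, next to doc serve. A condensed sketch of the resulting routing, assembled from the added lines; it is not a drop-in excerpt of main() and the serve branch is elided:

def dispatch_doc_command(doc_command: str, llm, args):
    # Sketch of how 0.1.175 routes the doc sub-commands.
    if doc_command == "serve":
        ...  # build the RAG, wrap it with LLWrapper and call serve()
    elif doc_command == "chat":
        from autocoder.rag.rag_entry import RAGFactory

        rag = RAGFactory.get_rag(llm=llm, args=args, path="")
        rag.stream_chat_repl(args.query)
        return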
autocoder/auto_coder_rag.py
CHANGED
@@ -18,7 +18,7 @@ from rich.console import Console
 from rich.table import Table
 import os
 
-from autocoder.rag.document_retriever import
+from autocoder.rag.document_retriever import process_file_local
 from autocoder.rag.token_counter import TokenCounter
 
 if platform.system() == "Windows":
@@ -90,16 +90,24 @@ def initialize_system():
 
 if choice == "1":
 print_status(get_message("deploying_model").format("Deepseek官方"), "")
-
+
 deploy_cmd = [
-"byzerllm",
-"
-"--
-"
-"--
-"
-"--
-"
+"byzerllm",
+"deploy",
+"--pretrained_model_type",
+"saas/openai",
+"--cpus_per_worker",
+"0.001",
+"--gpus_per_worker",
+"0",
+"--worker_concurrency",
+"1000",
+"--num_workers",
+"1",
+"--infer_params",
+f"saas.base_url=https://api.deepseek.com/v1 saas.api_key={api_key} saas.model=deepseek-chat",
+"--model",
+"deepseek_chat",
 ]
 
 try:
@@ -138,7 +146,9 @@ def main(input_args: Optional[List[str]] = None):
 
 # Serve command
 serve_parser = subparsers.add_parser("serve", help="Start the RAG server")
-serve_parser.add_argument(
+serve_parser.add_argument(
+"--quick", action="store_true", help="Skip system initialization"
+)
 serve_parser.add_argument("--file", default="", help=desc["file"])
 serve_parser.add_argument("--model", default="deepseek_chat", help=desc["model"])
 serve_parser.add_argument("--index_model", default="", help=desc["index_model"])
@@ -160,7 +170,19 @@ def main(input_args: Optional[List[str]] = None):
 "--rag_context_window_limit",
 type=int,
 default=110000,
-help="",
+help="The input context window limit for RAG",
+)
+serve_parser.add_argument(
+"--full_text_ratio",
+type=float,
+default=0.7,
+help="The ratio of full text area in the input context window (0.0 to 1.0)",
+)
+serve_parser.add_argument(
+"--segment_ratio",
+type=float,
+default=0.2,
+help="The ratio of segment area in the input context window (0.0 to 1.0)",
 )
 serve_parser.add_argument(
 "--required_exts", default="", help=desc["doc_build_parse_required_exts"]
@@ -198,6 +220,12 @@ def main(input_args: Optional[List[str]] = None):
 help="Monitor mode for the doc update",
 )
 
+serve_parser.add_argument(
+"--disable_auto_window",
+action="store_true",
+help="Disable automatic window adaptation for documents",
+)
+
 # Tools command
 tools_parser = subparsers.add_parser("tools", help="Various tools")
 tools_subparsers = tools_parser.add_subparsers(dest="tool", help="Available tools")
@@ -255,7 +283,7 @@ def main(input_args: Optional[List[str]] = None):
 
 def count_tokens(tokenizer_path: str, file_path: str):
 token_counter = TokenCounter(tokenizer_path)
-source_codes =
+source_codes = process_file_local(file_path)
 
 console = Console()
 table = Table(title="Token Count Results")
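The new --full_text_ratio and --segment_ratio options split --rag_context_window_limit into a full-text area, a segment area and an implicit buffer made of whatever ratio is left over; LongContextRAG (further below) rejects configurations where the two ratios sum to more than 1.0. A small sketch of that arithmetic using the defaults shown above; the function name is illustrative:

def split_context_window(window_limit: int = 110000,
                         full_text_ratio: float = 0.7,
                         segment_ratio: float = 0.2):
    # Split the RAG context window into full-text, segment and buffer budgets.
    buff_ratio = 1 - full_text_ratio - segment_ratio
    if buff_ratio < 0:
        raise ValueError("full_text_ratio + segment_ratio must be <= 1.0")
    return (int(window_limit * full_text_ratio),
            int(window_limit * segment_ratio),
            int(window_limit * buff_ratio))

# With the defaults: roughly 77000 tokens of full text, 22000 of segments, 11000 of buffer.
print(split_context_window())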
autocoder/common/__init__.py
CHANGED
@@ -11,6 +11,8 @@ class SourceCode(pydantic.BaseModel):
 module_name: str
 source_code: str
 tag: str = ""
+tokens: int = -1
+metadata: Dict[str, Any] = {}
 
 
 class TranslateReadme(pydantic.BaseModel):
@@ -281,7 +283,8 @@ class AutoCoderArgs(pydantic.BaseModel):
 doc_command: Optional[str] = None
 required_exts: Optional[str] = None
 
-monitor_mode:
+monitor_mode: bool = False
+disable_auto_window: bool = False
 
 description: Optional[str] = ""
 skip_confirm: Optional[bool] = False
@@ -304,5 +307,9 @@ class AutoCoderArgs(pydantic.BaseModel):
 
 agent_designer_mode: Optional[str] = "svg"
 
+full_text_ratio: Optional[float] = 0.7
+segment_ratio: Optional[float] = 0.2
+buff_ratio: Optional[float] = 0.1
+
 class Config:
 protected_namespaces = ()
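SourceCode now carries a cached tokens count (-1 meaning "not yet counted") and a free-form metadata dict that the retriever and limiter below fill with keys such as original_docs, original_doc, chunk_index and chunk_ranges. A minimal standalone sketch of the extended model, written in the pydantic v2 style implied by the model_dump()/model_validate() calls in the diffs (pydantic copies mutable defaults per instance, so the {} default is safe here):

from typing import Any, Dict
import pydantic

class SourceCode(pydantic.BaseModel):
    module_name: str
    source_code: str
    tag: str = ""
    tokens: int = -1               # -1: token count not computed yet
    metadata: Dict[str, Any] = {}  # e.g. {"original_docs": [...]} or {"chunk_ranges": [...]}

doc = SourceCode(module_name="##File: /tmp/a.md", source_code="hello world", tokens=2)
print(doc.model_dump())  # round-trips through model_validate() in the cache code below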
autocoder/pyproject/__init__.py
CHANGED
@@ -187,7 +187,11 @@ class PyProject:
 
 def convert_to_source_code(self, file_path):
 module_name = file_path
-
+try:
+source_code = self.read_file_content(file_path)
+except Exception as e:
+logger.warning(f"Failed to read file: {file_path}. Error: {str(e)}")
+return None
 return SourceCode(module_name=module_name, source_code=source_code)
 
 def get_package_source_codes(
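The same defensive pattern is applied in PyProject here and again in SuffixProject and TSProject (see the last two diffs below): if a file cannot be read, the converter logs a warning and returns None instead of letting one unreadable file abort the whole scan. A minimal sketch of the pattern, with a plain file reader standing in for the project classes:

from typing import Optional
from loguru import logger

def convert_to_source_code(file_path: str) -> Optional[dict]:
    # Return a SourceCode-like dict, or None when the file cannot be read.
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            source_code = f.read()
    except Exception as e:
        logger.warning(f"Failed to read file: {file_path}. Error: {str(e)}")
        return None  # callers are expected to skip None entries
    return {"module_name": file_path, "source_code": source_code}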
autocoder/rag/document_retriever.py
CHANGED
@@ -18,10 +18,15 @@ from loguru import logger
 from pydantic import BaseModel
 
 from autocoder.common import SourceCode
-from autocoder.rag.loaders import (
-
-
-
+from autocoder.rag.loaders import (
+extract_text_from_docx,
+extract_text_from_excel,
+extract_text_from_pdf,
+extract_text_from_ppt,
+)
+from autocoder.rag import variable_holder
+from autocoder.rag.token_counter import count_tokens_worker, count_tokens
+from uuid import uuid4
 
 cache_lock = threading.Lock()
 
@@ -34,72 +39,62 @@ class AddOrUpdateEvent(BaseModel):
 file_infos: List[Tuple[str, str, float]]
 
 
-
-
+def process_file_in_multi_process(
+file_info: Tuple[str, str, float]
+) -> List[SourceCode]:
 start_time = time.time()
 file_path, relative_path, _ = file_info
 try:
 if file_path.endswith(".pdf"):
 with open(file_path, "rb") as f:
 content = extract_text_from_pdf(f.read())
-v = [SourceCode(module_name=file_path, source_code=content)]
-elif file_path.endswith(".docx"):
-with open(file_path, "rb") as f:
-content = extract_text_from_docx(f.read())
-v = [SourceCode(module_name=f"##File: {file_path}", source_code=content)]
-elif file_path.endswith(".xlsx") or file_path.endswith(".xls"):
-sheets = extract_text_from_excel(file_path)
 v = [
 SourceCode(
-module_name=
-source_code=
+module_name=file_path,
+source_code=content,
+tokens=count_tokens_worker(content),
 )
-for sheet in sheets
 ]
-elif file_path.endswith(".pptx"):
-slides = extract_text_from_ppt(file_path)
-content = "".join(f"#{slide[0]}\n{slide[1]}\n\n" for slide in slides)
-v = [SourceCode(module_name=f"##File: {file_path}", source_code=content)]
-else:
-with open(file_path, "r", encoding="utf-8") as f:
-content = f.read()
-v = [SourceCode(module_name=f"##File: {file_path}", source_code=content)]
-logger.info(f"Load file {file_path} in {time.time() - start_time}")
-return v
-except Exception as e:
-logger.error(f"Error processing file {file_path}: {str(e)}")
-return []
-
-
-def process_file2(file_info: Tuple[str, str, float]) -> List[SourceCode]:
-start_time = time.time()
-file_path, relative_path, _ = file_info
-try:
-if file_path.endswith(".pdf"):
-with open(file_path, "rb") as f:
-content = extract_text_from_pdf(f.read())
-v = [SourceCode(module_name=file_path, source_code=content)]
 elif file_path.endswith(".docx"):
 with open(file_path, "rb") as f:
 content = extract_text_from_docx(f.read())
-v = [
+v = [
+SourceCode(
+module_name=f"##File: {file_path}",
+source_code=content,
+tokens=count_tokens_worker(content),
+)
+]
 elif file_path.endswith(".xlsx") or file_path.endswith(".xls"):
 sheets = extract_text_from_excel(file_path)
 v = [
 SourceCode(
 module_name=f"##File: {file_path}#{sheet[0]}",
 source_code=sheet[1],
+tokens=count_tokens_worker(sheet[1]),
 )
 for sheet in sheets
 ]
 elif file_path.endswith(".pptx"):
 slides = extract_text_from_ppt(file_path)
 content = "".join(f"#{slide[0]}\n{slide[1]}\n\n" for slide in slides)
-v = [
+v = [
+SourceCode(
+module_name=f"##File: {file_path}",
+source_code=content,
+tokens=count_tokens_worker(content),
+)
+]
 else:
 with open(file_path, "r", encoding="utf-8") as f:
 content = f.read()
-v = [
+v = [
+SourceCode(
+module_name=f"##File: {file_path}",
+source_code=content,
+tokens=count_tokens_worker(content),
+)
+]
 logger.info(f"Load file {file_path} in {time.time() - start_time}")
 return v
 except Exception as e:
@@ -107,34 +102,59 @@ def process_file2(file_info: Tuple[str, str, float]) -> List[SourceCode]:
 return []
 
 
-def
+def process_file_local(file_path: str) -> List[SourceCode]:
 start_time = time.time()
 try:
 if file_path.endswith(".pdf"):
 with open(file_path, "rb") as f:
 content = extract_text_from_pdf(f.read())
-v = [
+v = [
+SourceCode(
+module_name=file_path,
+source_code=content,
+tokens=count_tokens(content),
+)
+]
 elif file_path.endswith(".docx"):
 with open(file_path, "rb") as f:
 content = extract_text_from_docx(f.read())
-v = [
+v = [
+SourceCode(
+module_name=f"##File: {file_path}",
+source_code=content,
+tokens=count_tokens(content),
+)
+]
 elif file_path.endswith(".xlsx") or file_path.endswith(".xls"):
 sheets = extract_text_from_excel(file_path)
 v = [
 SourceCode(
 module_name=f"##File: {file_path}#{sheet[0]}",
 source_code=sheet[1],
+tokens=count_tokens(sheet[1]),
 )
 for sheet in sheets
 ]
 elif file_path.endswith(".pptx"):
 slides = extract_text_from_ppt(file_path)
 content = "".join(f"#{slide[0]}\n{slide[1]}\n\n" for slide in slides)
-v = [
+v = [
+SourceCode(
+module_name=f"##File: {file_path}",
+source_code=content,
+tokens=count_tokens(content),
+)
+]
 else:
 with open(file_path, "r", encoding="utf-8") as f:
 content = f.read()
-v = [
+v = [
+SourceCode(
+module_name=f"##File: {file_path}",
+source_code=content,
+tokens=count_tokens(content),
+)
+]
 logger.info(f"Load file {file_path} in {time.time() - start_time}")
 return v
 except Exception as e:
@@ -205,7 +225,7 @@ class AutoCoderRAGDocListener:
 self.update_cache(item)
 
 def update_cache(self, file_path):
-source_code =
+source_code = process_file_local(file_path)
 self.cache[file_path] = {
 "file_path": file_path,
 "content": [c.model_dump() for c in source_code],
@@ -220,7 +240,9 @@ class AutoCoderRAGDocListener:
 
 def open_watch(self):
 logger.info(f"start monitor: {self.path}...")
-for changes in watch(
+for changes in watch(
+self.path, watch_filter=self.file_filter, stop_event=self.stop_event
+):
 for change in changes:
 (action, path) = change
 if action == Change.added or action == Change.modified:
@@ -290,7 +312,6 @@ class AutoCoderRAGAsyncUpdateQueue:
 self.thread.start()
 self.cache = self.read_cache()
 
-
 def _process_queue(self):
 while not self.stop_event.is_set():
 try:
@@ -324,8 +345,14 @@ class AutoCoderRAGAsyncUpdateQueue:
 # results = ray.get(
 # [process_file.remote(file_info) for file_info in files_to_process]
 # )
-
-
+from autocoder.rag.token_counter import initialize_tokenizer
+
+with Pool(
+processes=os.cpu_count(),
+initializer=initialize_tokenizer,
+initargs=(variable_holder.TOKENIZER_PATH,),
+) as pool:
+results = pool.map(process_file_in_multi_process, files_to_process)
 
 for file_info, result in zip(files_to_process, results):
 self.update_cache(file_info, result)
@@ -365,7 +392,7 @@ class AutoCoderRAGAsyncUpdateQueue:
 elif isinstance(file_list, AddOrUpdateEvent):
 for file_info in file_list.file_infos:
 logger.info(f"{file_info[0]} is detected to be updated")
-result =
+result = process_file_local(file_info)
 self.update_cache(file_info, result)
 
 self.write_cache()
@@ -410,7 +437,9 @@ class AutoCoderRAGAsyncUpdateQueue:
 # Release the file lock
 fcntl.flock(lockf, fcntl.LOCK_UN)
 
-def update_cache(
+def update_cache(
+self, file_info: Tuple[str, str, float], content: List[SourceCode]
+):
 file_path, relative_path, modify_time = file_info
 self.cache[file_path] = {
 "file_path": file_path,
@@ -485,11 +514,20 @@ class DocumentRetriever:
 required_exts: list,
 on_ray: bool = False,
 monitor_mode: bool = False,
+single_file_token_limit: int = 60000,
+disable_auto_window: bool = False,
 ) -> None:
 self.path = path
 self.ignore_spec = ignore_spec
 self.required_exts = required_exts
 self.monitor_mode = monitor_mode
+self.single_file_token_limit = single_file_token_limit
+self.disable_auto_window = disable_auto_window
+
+# Files smaller than this will be merged
+self.small_file_token_limit = self.single_file_token_limit / 4
+# Maximum size of a merged file
+self.small_file_merge_limit = self.single_file_token_limit / 2
 
 self.on_ray = on_ray
 if self.on_ray:
@@ -502,6 +540,13 @@ class DocumentRetriever:
 path, ignore_spec, required_exts
 )
 
+logger.info(f"DocumentRetriever initialized with:")
+logger.info(f"  Path: {self.path}")
+logger.info(f"  Diable auto window: {self.disable_auto_window} ")
+logger.info(f"  Single file token limit: {self.single_file_token_limit}")
+logger.info(f"  Small file token limit: {self.small_file_token_limit}")
+logger.info(f"  Small file merge limit: {self.small_file_merge_limit}")
+
 def get_cache(self):
 if self.on_ray:
 return ray.get(self.cacher.get_cache.remote())
@@ -509,6 +554,102 @@ class DocumentRetriever:
 return self.cacher.get_cache()
 
 def retrieve_documents(self) -> Generator[SourceCode, None, None]:
+logger.info("Starting document retrieval process")
+waiting_list = []
+waiting_tokens = 0
 for _, data in self.get_cache().items():
 for source_code in data["content"]:
-
+doc = SourceCode.model_validate(source_code)
+if self.disable_auto_window:
+yield doc
+else:
+if doc.tokens <= 0:
+yield doc
+elif doc.tokens < self.small_file_token_limit:
+waiting_list, waiting_tokens = self._add_to_waiting_list(
+doc, waiting_list, waiting_tokens
+)
+if waiting_tokens >= self.small_file_merge_limit:
+yield from self._process_waiting_list(waiting_list)
+waiting_list = []
+waiting_tokens = 0
+elif doc.tokens > self.single_file_token_limit:
+yield from self._split_large_document(doc)
+else:
+yield doc
+if waiting_list and not self.disable_auto_window:
+yield from self._process_waiting_list(waiting_list)
+
+logger.info("Document retrieval process completed")
+
+def _add_to_waiting_list(
+self, doc: SourceCode, waiting_list: List[SourceCode], waiting_tokens: int
+) -> Tuple[List[SourceCode], int]:
+waiting_list.append(doc)
+return waiting_list, waiting_tokens + doc.tokens
+
+def _process_waiting_list(
+self, waiting_list: List[SourceCode]
+) -> Generator[SourceCode, None, None]:
+if len(waiting_list) == 1:
+yield waiting_list[0]
+elif len(waiting_list) > 1:
+yield self._merge_documents(waiting_list)
+
+def _merge_documents(self, docs: List[SourceCode]) -> SourceCode:
+merged_content = "\n".join(
+[f"#File: {doc.module_name}\n{doc.source_code}" for doc in docs]
+)
+merged_tokens = sum([doc.tokens for doc in docs])
+merged_name = f"Merged_{len(docs)}_docs_{str(uuid4())}"
+logger.info(
+f"Merged {len(docs)} documents into {merged_name} (tokens: {merged_tokens})."
+)
+return SourceCode(
+module_name=merged_name,
+source_code=merged_content,
+tokens=merged_tokens,
+metadata={"original_docs": [doc.module_name for doc in docs]},
+)
+
+def _split_large_document(
+self, doc: SourceCode
+) -> Generator[SourceCode, None, None]:
+chunk_size = self.single_file_token_limit
+total_chunks = (doc.tokens + chunk_size - 1) // chunk_size
+logger.info(f"Splitting document {doc.module_name} into {total_chunks} chunks")
+for i in range(0, doc.tokens, chunk_size):
+chunk_content = doc.source_code[i : i + chunk_size]
+chunk_tokens = min(chunk_size, doc.tokens - i)
+chunk_name = f"{doc.module_name}#chunk{i//chunk_size+1}"
+# logger.debug(f"  Created chunk: {chunk_name} (tokens: {chunk_tokens})")
+yield SourceCode(
+module_name=chunk_name,
+source_code=chunk_content,
+tokens=chunk_tokens,
+metadata={
+"original_doc": doc.module_name,
+"chunk_index": i // chunk_size + 1,
+},
+)
+
+def _split_document(
+self, doc: SourceCode, token_limit: int
+) -> Generator[SourceCode, None, None]:
+remaining_tokens = doc.tokens
+chunk_number = 1
+start_index = 0
+
+while remaining_tokens > 0:
+end_index = start_index + token_limit
+chunk_content = doc.source_code[start_index:end_index]
+chunk_tokens = min(token_limit, remaining_tokens)
+
+chunk_name = f"{doc.module_name}#{chunk_number:06d}"
+yield SourceCode(
+module_name=chunk_name, source_code=chunk_content, tokens=chunk_tokens
+)
+
+start_index = end_index
+remaining_tokens -= chunk_tokens
+chunk_number += 1
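Taken together, these hunks implement the "auto window" behavior that --disable_auto_window turns off: each cached document now carries a token count; documents smaller than a quarter of single_file_token_limit are queued and merged once the queue reaches half of that limit; documents larger than the limit are split into fixed-size chunks; everything else passes through untouched. A simplified, self-contained sketch of the grouping decision, with the thresholds copied from the constructor above and plain dicts standing in for SourceCode:

from typing import Dict, Iterable, Iterator, List

def merge(docs: List[Dict]) -> Dict:
    # Counterpart of _merge_documents: concatenate small docs into one.
    return {"module_name": f"Merged_{len(docs)}_docs",
            "source_code": "\n".join(d["source_code"] for d in docs),
            "tokens": sum(d["tokens"] for d in docs)}

def split(doc: Dict, chunk: int) -> Iterator[Dict]:
    # Counterpart of _split_large_document: slice an oversized doc into chunks.
    for i in range(0, doc["tokens"], chunk):
        yield {"module_name": f"{doc['module_name']}#chunk{i // chunk + 1}",
               "source_code": doc["source_code"][i:i + chunk],
               "tokens": min(chunk, doc["tokens"] - i)}

def auto_window(docs: Iterable[Dict], single_file_token_limit: int = 60000) -> Iterator[Dict]:
    small_limit = single_file_token_limit / 4   # below this, a doc waits to be merged
    merge_limit = single_file_token_limit / 2   # flush the waiting list at this size
    waiting, waiting_tokens = [], 0
    for doc in docs:
        if doc["tokens"] <= 0:
            yield doc
        elif doc["tokens"] < small_limit:
            waiting.append(doc)
            waiting_tokens += doc["tokens"]
            if waiting_tokens >= merge_limit:
                yield merge(waiting)
                waiting, waiting_tokens = [], 0
        elif doc["tokens"] > single_file_token_limit:
            yield from split(doc, single_file_token_limit)
        else:
            yield doc
    if waiting:
        yield merge(waiting)

As in the original code, chunk boundaries index source_code by token count, which effectively treats one token as one character; a rough but cheap approximation.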
autocoder/rag/long_context_rag.py
CHANGED
@@ -13,16 +13,22 @@ from openai import OpenAI
 from rich.console import Console
 from rich.panel import Panel
 from rich.table import Table
-
+import statistics
 
 from autocoder.common import AutoCoderArgs, SourceCode
 from autocoder.rag.doc_filter import DocFilter
 from autocoder.rag.document_retriever import DocumentRetriever
-from autocoder.rag.relevant_utils import (
-
+from autocoder.rag.relevant_utils import (
+DocRelevance,
+FilterDoc,
+TaskTiming,
+parse_relevance,
+)
 from autocoder.rag.token_checker import check_token_limit
 from autocoder.rag.token_counter import RemoteTokenCounter, TokenCounter
 from autocoder.rag.token_limiter import TokenLimiter
+from tokenizers import Tokenizer
+from autocoder.rag import variable_holder
 
 
 class LongContextRAG:
@@ -44,11 +50,26 @@ class LongContextRAG:
 self.path = path
 self.relevant_score = self.args.rag_doc_filter_relevance or 5
 
+self.full_text_ratio = args.full_text_ratio
+self.segment_ratio = args.segment_ratio
+self.buff_ratio = 1 - self.full_text_ratio - self.segment_ratio
+
+if self.buff_ratio < 0:
+raise ValueError(
+"The sum of full_text_ratio and segment_ratio must be less than or equal to 1.0"
+)
+
+self.full_text_limit = int(args.rag_context_window_limit * self.full_text_ratio)
+self.segment_limit = int(args.rag_context_window_limit * self.segment_ratio)
+self.buff_limit = int(args.rag_context_window_limit * self.buff_ratio)
+
 self.tokenizer = None
 self.tokenizer_path = tokenizer_path
 self.on_ray = False
 
 if self.tokenizer_path:
+variable_holder.TOKENIZER_PATH = self.tokenizer_path
+variable_holder.TOKENIZER_MODEL = Tokenizer.from_file(self.tokenizer_path)
 self.tokenizer = TokenCounter(self.tokenizer_path)
 else:
 if llm.is_model_exist("deepseek_tokenizer"):
@@ -96,24 +117,41 @@ class LongContextRAG:
 self.required_exts,
 self.on_ray,
 self.monitor_mode,
+## Make sure the full-text area can hold at least one file
+single_file_token_limit=self.full_text_limit - 100,
+disable_auto_window=self.args.disable_auto_window
 )
 
 self.doc_filter = DocFilter(
 self.index_model, self.args, on_ray=self.on_ray, path=self.path
 )
-
-
-
-
-
-
-
-
-
-
+
+doc_num = 0
+token_num = 0
+token_counts = []
+for doc in self._retrieve_documents():
+doc_num += 1
+doc_tokens = doc.tokens
+token_num += doc_tokens
+token_counts.append(doc_tokens)
+
+avg_tokens = statistics.mean(token_counts) if token_counts else 0
+median_tokens = statistics.median(token_counts) if token_counts else 0
 
 logger.info(
-
+"RAG Configuration:\n"
+f"  Total docs: {doc_num}\n"
+f"  Total tokens: {token_num}\n"
+f"  Tokenizer path: {self.tokenizer_path}\n"
+f"  Relevant score: {self.relevant_score}\n"
+f"  Token limit: {self.token_limit}\n"
+f"  Full text limit: {self.full_text_limit}\n"
+f"  Segment limit: {self.segment_limit}\n"
+f"  Buff limit: {self.buff_limit}\n"
+f"  Max doc tokens: {max(token_counts) if token_counts else 0}\n"
+f"  Min doc tokens: {min(token_counts) if token_counts else 0}\n"
+f"  Avg doc tokens: {avg_tokens:.2f}\n"
+f"  Median doc tokens: {median_tokens:.2f}\n"
 )
 
 def count_tokens(self, text: str) -> int:
@@ -350,9 +388,15 @@ class LongContextRAG:
 query_table.add_row("Relevant docs", str(len(relevant_docs)))
 
 # Add relevant docs information
-relevant_docs_info =
-
-
+relevant_docs_info = []
+for doc in relevant_docs:
+info = f"- {doc.module_name.replace(self.path,'',1)}"
+if 'original_docs' in doc.metadata:
+original_docs = ", ".join([doc.replace(self.path,"",1) for doc in doc.metadata['original_docs']])
+info += f" (Original docs: {original_docs})"
+relevant_docs_info.append(info)
+
+relevant_docs_info = "\n".join(relevant_docs_info)
 query_table.add_row("Relevant docs list", relevant_docs_info)
 
 first_round_full_docs = []
@@ -363,7 +407,9 @@ class LongContextRAG:
 
 token_limiter = TokenLimiter(
 count_tokens=self.count_tokens,
-
+full_text_limit=self.full_text_limit,
+segment_limit=self.segment_limit,
+buff_limit=self.buff_limit,
 llm=self.llm,
 )
 final_relevant_docs = token_limiter.limit_tokens(
@@ -395,9 +441,18 @@ class LongContextRAG:
 )
 
 # Add relevant docs information
-final_relevant_docs_info =
-
-
+final_relevant_docs_info = []
+for doc in relevant_docs:
+info = f"- {doc.module_name.replace(self.path,'',1)}"
+if 'original_docs' in doc.metadata:
+original_docs = ", ".join([doc.replace(self.path,"",1) for doc in doc.metadata['original_docs']])
+info += f" (Original docs: {original_docs})"
+if "chunk_ranges" in doc.metadata:
+chunk_ranges = json.dumps(doc.metadata['chunk_ranges'],ensure_ascii=False)
+info += f" (Chunk ranges: {chunk_ranges})"
+final_relevant_docs_info.append(info)
+
+final_relevant_docs_info = "\n".join(final_relevant_docs_info)
 query_table.add_row("Final Relevant docs list", final_relevant_docs_info)
 
 # Create a panel to contain the table
@@ -409,8 +464,10 @@ class LongContextRAG:
 
 # Log the panel using rich
 console.print(panel)
-
-
+
+request_tokens = sum([doc.tokens for doc in relevant_docs])
+target_model = model or self.llm.default_model_name
+logger.info(f"Start to send to model {target_model} with {request_tokens} tokens")
 
 new_conversations = conversations[:-1] + [
 {
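Besides wiring the new budgets into TokenLimiter, the constructor now walks the corpus once and logs corpus-wide statistics (document count, total, min/max, mean and median tokens) before serving. A tiny sketch of that accounting with the standard statistics module; the function name is illustrative:

import statistics

def corpus_stats(token_counts):
    # Summarise per-document token counts the way the startup log does.
    if not token_counts:
        return {"docs": 0, "total": 0, "max": 0, "min": 0, "avg": 0.0, "median": 0.0}
    return {
        "docs": len(token_counts),
        "total": sum(token_counts),
        "max": max(token_counts),
        "min": min(token_counts),
        "avg": statistics.mean(token_counts),
        "median": statistics.median(token_counts),
    }

print(corpus_stats([120, 4500, 880]))  # {'docs': 3, 'total': 5500, ...}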
autocoder/rag/token_counter.py
CHANGED
@@ -2,29 +2,46 @@ import time
 from loguru import logger
 from tokenizers import Tokenizer
 from multiprocessing import Pool, cpu_count
+from autocoder.rag.variable_holder import TOKENIZER_MODEL
+
 
 class RemoteTokenCounter:
-def __init__(self,tokenizer) -> None:
+def __init__(self, tokenizer) -> None:
 self.tokenizer = tokenizer
 
-def count_tokens(self, text: str) -> int:
-try:
+def count_tokens(self, text: str) -> int:
+try:
 v = self.tokenizer.chat_oai(
 conversations=[{"role": "user", "content": text}]
-)
+)
 return int(v[0].output)
 except Exception as e:
 logger.error(f"Error counting tokens: {str(e)}")
 return -1
-
+
+
 def initialize_tokenizer(tokenizer_path):
-global tokenizer_model
+global tokenizer_model
 tokenizer_model = Tokenizer.from_file(tokenizer_path)
 
+
+def count_tokens(text: str) -> int:
+try:
+# start_time = time.time_ns()
+encoded = TOKENIZER_MODEL.encode('{"role":"user","content":"' + text + '"}')
+v = len(encoded.ids)
+# elapsed_time = time.time_ns() - start_time
+# logger.info(f"Token counting took {elapsed_time/1000000} ms")
+return v
+except Exception as e:
+logger.error(f"Error counting tokens: {str(e)}")
+return -1
+
+
 def count_tokens_worker(text: str) -> int:
 try:
 # start_time = time.time_ns()
-encoded = tokenizer_model.encode('{"role":"user","content":"'+text+'"}')
+encoded = tokenizer_model.encode('{"role":"user","content":"' + text + '"}')
 v = len(encoded.ids)
 # elapsed_time = time.time_ns() - start_time
 # logger.info(f"Token counting took {elapsed_time/1000000} ms")
@@ -33,11 +50,16 @@ def count_tokens_worker(text: str) -> int:
 logger.error(f"Error counting tokens: {str(e)}")
 return -1
 
+
 class TokenCounter:
 def __init__(self, tokenizer_path: str):
 self.tokenizer_path = tokenizer_path
 self.num_processes = cpu_count() - 1 if cpu_count() > 1 else 1
-self.pool = Pool(
+self.pool = Pool(
+processes=self.num_processes,
+initializer=initialize_tokenizer,
+initargs=(self.tokenizer_path,),
+)
 
 def count_tokens(self, text: str) -> int:
-return self.pool.apply(count_tokens_worker, (text,))
+return self.pool.apply(count_tokens_worker, (text,))
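Token counting now comes in three flavors: count_tokens uses the tokenizer that LongContextRAG stores in variable_holder.TOKENIZER_MODEL; count_tokens_worker relies on a per-process global primed by initialize_tokenizer; and TokenCounter keeps a multiprocessing Pool whose workers are all initialized that way. A self-contained sketch of the pool-initializer pattern; the whitespace tokenizer is only a stand-in for tokenizers.Tokenizer.from_file:

from multiprocessing import Pool, cpu_count

_tokenizer = None  # one per worker process, set by the initializer

def initialize_tokenizer(tokenizer_path: str) -> None:
    global _tokenizer
    # Stand-in: a real implementation would call Tokenizer.from_file(tokenizer_path).
    _tokenizer = lambda text: text.split()

def count_tokens_worker(text: str) -> int:
    return len(_tokenizer(text))

class TokenCounter:
    def __init__(self, tokenizer_path: str):
        processes = cpu_count() - 1 if cpu_count() > 1 else 1
        self.pool = Pool(processes=processes,
                         initializer=initialize_tokenizer,
                         initargs=(tokenizer_path,))

    def count_tokens(self, text: str) -> int:
        # apply() ships the text to one worker, which already holds a tokenizer.
        return self.pool.apply(count_tokens_worker, (text,))

if __name__ == "__main__":
    counter = TokenCounter("tokenizer.json")  # the path is only passed through in this sketch
    print(counter.count_tokens("hello rag world"))  # 3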
autocoder/rag/token_limiter.py
CHANGED
@@ -13,11 +13,15 @@ class TokenLimiter:
 def __init__(
 self,
 count_tokens: Callable[[str], int],
-
+full_text_limit: int,
+segment_limit: int,
+buff_limit: int,
 llm,
 ):
 self.count_tokens = count_tokens
-self.
+self.full_text_limit = full_text_limit
+self.segment_limit = segment_limit
+self.buff_limit = buff_limit
 self.llm = llm
 self.first_round_full_docs = []
 self.second_round_extracted_docs = []
@@ -88,19 +92,22 @@ class TokenLimiter:
 final_relevant_docs = []
 token_count = 0
 doc_num_count = 0
+
+## Non-windowed partition pass
 for doc in relevant_docs:
 doc_tokens = self.count_tokens(doc.source_code)
 doc_num_count += 1
-if token_count + doc_tokens <= self.
+if token_count + doc_tokens <= self.full_text_limit + self.segment_limit:
 final_relevant_docs.append(doc)
 token_count += doc_tokens
 else:
 break
 
+## If the window cannot hold all the relevant docs, partitioning is needed
 if len(final_relevant_docs) < len(relevant_docs):
-
+## Fill the full_text partition first
 token_count = 0
-new_token_limit = self.
+new_token_limit = self.full_text_limit
 doc_num_count = 0
 for doc in relevant_docs:
 doc_tokens = self.count_tokens(doc.source_code)
@@ -111,8 +118,18 @@ class TokenLimiter:
 else:
 break
 
+if len(self.first_round_full_docs) > 0:
+remaining_tokens = (
+self.full_text_limit + self.segment_limit - token_count
+)
+else:
+logger.warning(
+"Full text area is empty, this is may caused by the single doc is too long"
+)
+remaining_tokens = self.full_text_limit + self.segment_limit
+
+## Continue filling the segment partition
 sencond_round_start_time = time.time()
-remaining_tokens = self.token_limit - new_token_limit
 remaining_docs = relevant_docs[len(self.first_round_full_docs) :]
 logger.info(
 f"first round docs: {len(self.first_round_full_docs)} remaining docs: {len(remaining_docs)} index_filter_workers: {index_filter_workers}"
@@ -130,7 +147,7 @@ class TokenLimiter:
 result = future.result()
 if result and remaining_tokens > 0:
 self.second_round_extracted_docs.append(result)
-tokens =
+tokens = result.tokens
 if tokens > 0:
 remaining_tokens -= tokens
 else:
@@ -184,7 +201,13 @@ class TokenLimiter:
 content += chunk + "\n"
 
 return SourceCode(
-module_name=doc.module_name,
+module_name=doc.module_name,
+source_code=content.strip(),
+tokens=self.count_tokens(content),
+metadata={
+"original_doc": doc.module_name,
+"chunk_ranges": json_objs,
+},
 )
 except Exception as e:
 if attempt < max_retries - 1:
@@ -196,5 +219,7 @@ class TokenLimiter:
 f"Failed to process doc {doc.module_name} after {max_retries} attempts: {str(e)}"
 )
 return SourceCode(
-module_name=doc.module_name,
+module_name=doc.module_name,
+source_code="",
+tokens= 0
 )
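The limiter now budgets in two phases: it first tries to fit whole documents into full_text_limit + segment_limit; if they do not all fit, it re-fills whole documents using only full_text_limit and spends the remainder of the combined window on LLM-extracted snippets from the leftover documents, while buff_limit is kept aside as head-room. A compact sketch of just the budget bookkeeping; the extraction step itself is out of scope here:

from typing import List, Tuple

def plan_budget(doc_tokens: List[int], full_text_limit: int, segment_limit: int) -> Tuple[int, int]:
    # Return (number of docs kept whole, tokens left for extracted segments).
    window = full_text_limit + segment_limit
    # Phase 1: try to fit whole docs into the combined window.
    used, kept = 0, 0
    for t in doc_tokens:
        if used + t > window:
            break
        used, kept = used + t, kept + 1
    if kept == len(doc_tokens):
        return kept, 0  # everything fits, no second phase needed
    # Phase 2: keep whole docs only up to the full-text budget...
    used, kept = 0, 0
    for t in doc_tokens:
        if used + t > full_text_limit:
            break
        used, kept = used + t, kept + 1
    # ...and hand the rest of the combined window to segment extraction.
    return kept, window - used

print(plan_budget([30000, 50000, 40000], full_text_limit=60000, segment_limit=25000))
# -> (1, 55000): one doc kept whole, 55000 tokens left for extracted segments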
autocoder/suffixproject/__init__.py
CHANGED
@@ -121,7 +121,11 @@ class SuffixProject:
 
 def convert_to_source_code(self, file_path):
 module_name = file_path
-
+try:
+source_code = self.read_file_content(file_path)
+except Exception as e:
+logger.warning(f"Failed to read file: {file_path}. Error: {str(e)}")
+return None
 return SourceCode(module_name=module_name, source_code=source_code)
 
 def get_source_codes(self) -> Generator[SourceCode, None, None]:
autocoder/tsproject/__init__.py
CHANGED
@@ -152,7 +152,11 @@ class TSProject:
 return None
 
 module_name = file_path
-
+try:
+source_code = self.read_file_content(file_path)
+except Exception as e:
+logger.warning(f"Failed to read file: {file_path}. Error: {str(e)}")
+return None
 
 if not FileUtils.has_sufficient_content(source_code, min_line_count=1):
 return None
autocoder/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.1.
+__version__ = "0.1.175"
{auto_coder-0.1.173.dist-info → auto_coder-0.1.175.dist-info}/LICENSE, WHEEL, entry_points.txt and top_level.txt: files without changes.