auto-coder 0.1.173__py3-none-any.whl → 0.1.176__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of auto-coder might be problematic.
- {auto_coder-0.1.173.dist-info → auto_coder-0.1.176.dist-info}/METADATA +1 -1
- {auto_coder-0.1.173.dist-info → auto_coder-0.1.176.dist-info}/RECORD +18 -17
- autocoder/auto_coder.py +8 -9
- autocoder/auto_coder_rag.py +46 -13
- autocoder/common/__init__.py +11 -3
- autocoder/pyproject/__init__.py +5 -1
- autocoder/rag/document_retriever.py +196 -55
- autocoder/rag/long_context_rag.py +81 -23
- autocoder/rag/token_counter.py +31 -9
- autocoder/rag/token_limiter.py +66 -13
- autocoder/rag/variable_holder.py +2 -0
- autocoder/suffixproject/__init__.py +5 -1
- autocoder/tsproject/__init__.py +5 -1
- autocoder/version.py +1 -1
- {auto_coder-0.1.173.dist-info → auto_coder-0.1.176.dist-info}/LICENSE +0 -0
- {auto_coder-0.1.173.dist-info → auto_coder-0.1.176.dist-info}/WHEEL +0 -0
- {auto_coder-0.1.173.dist-info → auto_coder-0.1.176.dist-info}/entry_points.txt +0 -0
- {auto_coder-0.1.173.dist-info → auto_coder-0.1.176.dist-info}/top_level.txt +0 -0
{auto_coder-0.1.173.dist-info → auto_coder-0.1.176.dist-info}/RECORD CHANGED

@@ -1,13 +1,13 @@
 autocoder/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-autocoder/auto_coder.py,sha256=
+autocoder/auto_coder.py,sha256=HmgKa_ZApFlCsqo6BvuVeCPuncBT_Dh29ayZxxGR6lo,32216
 autocoder/auto_coder_lang.py,sha256=4qIS1tbEI8mpbtt6ThppTwKOM6MLuJTWJdgs5jIDGE0,2301
-autocoder/auto_coder_rag.py,sha256=
+autocoder/auto_coder_rag.py,sha256=V82EyeslAO2Z8qkMrwkyC11f1Cz6Ccjo9c867f0J_x8,11455
 autocoder/auto_coder_server.py,sha256=qRY88mkBnqSGFDcwYE5gwpe2WPhIw1nEH6LdbjCQhQk,20306
 autocoder/chat_auto_coder.py,sha256=i5xIuWlTqF0pJz8kXoa-_bW3Ic3SfCFvU2WJIMxrUHU,81798
 autocoder/chat_auto_coder_lang.py,sha256=QYtu5gWEQmWKVovR_qUZ8plySZarNFX_Onk-1vN9IiA,8524
 autocoder/command_args.py,sha256=ftWw6HnFUZPiQPt1oV-SfpHQe69XN3knaFy1lpROBcU,26854
 autocoder/lang.py,sha256=e-07rYTgimpxS8sm-AxKSmH4kKQX4N05YFHJBg9trVs,12598
-autocoder/version.py,sha256=
+autocoder/version.py,sha256=yiACry4Tn-v8T0DYTTygfQmb9WG4pVkXXkB6IB4a1yg,23
 autocoder/agent/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 autocoder/agent/auto_tool.py,sha256=DBzip-P_T6ZtT2eHexPcusmKYD0h7ufzp7TLwXAY10E,11554
 autocoder/agent/coder.py,sha256=dnITYHqkcOip8zV4lywbkYNH9w7Q3qyYaUArJ4WPrTs,866
@@ -17,7 +17,7 @@ autocoder/agent/project_reader.py,sha256=-MWRqsr7O4mvU0PIpAhOUBb29htZAvA37pa_GeE
 autocoder/chat/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 autocoder/common/JupyterClient.py,sha256=O-wi6pXeAEYhAY24kDa0BINrLYvKS6rKyWe98pDClS0,2816
 autocoder/common/ShellClient.py,sha256=fM1q8t_XMSbLBl2zkCNC2J9xuyKN3eXzGm6hHhqL2WY,2286
-autocoder/common/__init__.py,sha256=
+autocoder/common/__init__.py,sha256=wKrFLZk9BMl755nL1gvPjXU-3uWKEnYBP8xsObIjM4g,10156
 autocoder/common/anything2images.py,sha256=0ILBbWzY02M-CiWB-vzuomb_J1hVdxRcenAfIrAXq9M,25283
 autocoder/common/audio.py,sha256=Kn9nWKQddWnUrAz0a_ZUgjcu4VUU_IcZBigT7n3N3qc,7439
 autocoder/common/cleaner.py,sha256=NU72i8C6o9m0vXExab7nao5bstBUsfJFcj11cXa9l4U,1089
@@ -56,13 +56,13 @@ autocoder/index/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 autocoder/index/for_command.py,sha256=zfbvQnhHjsAqBc4Ce1kMGIu0jPEk_rtH7fntg89_4z0,3092
 autocoder/index/index.py,sha256=6uakPXThpDWxAyOAP-7AbMuXaXJJkBKctL5RkNWGdGw,22485
 autocoder/index/symbols_utils.py,sha256=CjcjUVajmJZB75Ty3a7kMv1BZphrm-tIBAdOJv6uo-0,2037
-autocoder/pyproject/__init__.py,sha256
+autocoder/pyproject/__init__.py,sha256=-2-ImQVw6e3NQZQOyDlHEP5b4xVs5ur2G5izB-JCa-A,13160
 autocoder/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 autocoder/rag/api_server.py,sha256=zokIlDJlk7ucRorSLQm80uICO1mecfmn4J2zVqEBskE,6786
 autocoder/rag/doc_filter.py,sha256=LqU8Wi6klwpY9WTHVtkioSHpmo9IWhRz39dzV1gvp6E,9315
-autocoder/rag/document_retriever.py,sha256=
+autocoder/rag/document_retriever.py,sha256=plwm8BpC55VJTUWCZyG4HsXYm-niqUsXaBMDLrLgYj0,23348
 autocoder/rag/llm_wrapper.py,sha256=xRbTBpLUH43Ah5jplL8WWWU-kjKfNgEJoUntLGBq5F4,2484
-autocoder/rag/long_context_rag.py,sha256=
+autocoder/rag/long_context_rag.py,sha256=626f5-XFyTxmnbUJ_a9GiaMPuqWhTDVMcg0b0ePW_mQ,19471
 autocoder/rag/rag_config.py,sha256=8LwFcTd8OJWWwi1_WY4IzjqgtT6RyE2j4PjxS5cCTDE,802
 autocoder/rag/rag_entry.py,sha256=V1RJ8RGqM30DNPmzymv64rZjNRGWn6kfc8sRy_LECg0,2451
 autocoder/rag/raw_rag.py,sha256=yS2Ur6kG0IRjhCj2_VonwxjY_xls_E62jO5Gz5j2nqE,2952
@@ -70,9 +70,10 @@ autocoder/rag/relevant_utils.py,sha256=OGfp98OXG4jr3jNmtHIeXGPF8mOlIbTnolPIVTZzY
 autocoder/rag/simple_directory_reader.py,sha256=LkKreCkNdEOoL4fNhc3_hDoyyWTQUte4uqextISRz4U,24485
 autocoder/rag/simple_rag.py,sha256=I902EUqOK1WM0Y2WFd7RzDJYofElvTZNLVCBtX5A9rc,14885
 autocoder/rag/token_checker.py,sha256=jc76x6KWmvVxds6W8juZfQGaoErudc2HenG3sNQfSLs,2819
-autocoder/rag/token_counter.py,sha256=
-autocoder/rag/token_limiter.py,sha256=
+autocoder/rag/token_counter.py,sha256=9ujfI5xQvwzKpN9XFWQGnXpm0h1sL7kgIJxgposcxNo,2096
+autocoder/rag/token_limiter.py,sha256=nUxaaKJTWEi4J5c5Tz4BkwU4G1B74VxLlMinqu5s41A,10660
 autocoder/rag/types.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+autocoder/rag/variable_holder.py,sha256=pDayuCnlKj7-bkn4iUHX5gea9UObddbi3ZnXotmxCs4,45
 autocoder/rag/loaders/__init__.py,sha256=EQHEZ5Cmz-mGP2SllUTvcIbYCnF7W149dNpNItfs0yE,304
 autocoder/rag/loaders/docx_loader.py,sha256=g6Ta8rMUbfgwB8N1qiajhyO6wpaWl7zygAZiKShuioI,174
 autocoder/rag/loaders/excel_loader.py,sha256=Ue8YB1z_kBs8SjIPuBskyM08Q1JiONs_BJZPrzi59oo,896
@@ -80,8 +81,8 @@ autocoder/rag/loaders/pdf_loader.py,sha256=CGfXOja7QZ7mHN-U5MsTiVMFzjP322rTj3dkY
 autocoder/rag/loaders/ppt_loader.py,sha256=7VEYc-bqgK8VHCoGC3DIUcqbpda-E5jQF9lYLqP256I,1681
 autocoder/regex_project/__init__.py,sha256=EBZeCL5ORyD_9_5u_UuG4s7XtpXOu0y1sWDmxWFtufE,6781
 autocoder/regexproject/__init__.py,sha256=ThuvVFdpw1EgWv4aIRkhg3ZclKPxMVharUKWppFpQ8o,8436
-autocoder/suffixproject/__init__.py,sha256=
-autocoder/tsproject/__init__.py,sha256=
+autocoder/suffixproject/__init__.py,sha256=EaQoumMzZ2COxMiI_GnL3SG4LGzRj0Qw7UpqLfNLCw8,9823
+autocoder/tsproject/__init__.py,sha256=QmEpNZYUJq1o0lGMs3UuUIUU-2aq_3eh1VxqnIc-hME,10431
 autocoder/utils/__init__.py,sha256=O3n6cpsgkIbbMuwmBHSQ1dls_IBD7_7YKFFaeKNo_tc,1193
 autocoder/utils/coder.py,sha256=rK8e0svQBe0NOP26dIGToUXgha_hUDgxlWoC_p_r7oc,5698
 autocoder/utils/conversation_store.py,sha256=sz-hhY7sttPAUOAQU6Pze-5zJc3j0_Emj22dM_0l5ro,1161
@@ -94,9 +95,9 @@ autocoder/utils/request_event_queue.py,sha256=r3lo5qGsB1dIjzVQ05dnr0z_9Z3zOkBdP1
 autocoder/utils/request_queue.py,sha256=nwp6PMtgTCiuwJI24p8OLNZjUiprC-TsefQrhMI-yPE,3889
 autocoder/utils/rest.py,sha256=3tXA8KZG6jKz_tddHNLGx77Icee88WcUeesfNsgPno4,8790
 autocoder/utils/tests.py,sha256=BqphrwyycGAvs-5mhH8pKtMZdObwhFtJ5MC_ZAOiLq8,1340
-auto_coder-0.1.
-auto_coder-0.1.
-auto_coder-0.1.
-auto_coder-0.1.
-auto_coder-0.1.
-auto_coder-0.1.
+auto_coder-0.1.176.dist-info/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
+auto_coder-0.1.176.dist-info/METADATA,sha256=-Jm1GW-7-Htzi_6l3MGRGTvl0ytk1ZyMGB2ZpiZoYa8,2352
+auto_coder-0.1.176.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+auto_coder-0.1.176.dist-info/entry_points.txt,sha256=0nzHtHH4pNcM7xq4EBA2toS28Qelrvcbrr59GqD_0Ak,350
+auto_coder-0.1.176.dist-info/top_level.txt,sha256=Jqc0_uJSw2GwoFQAa9iJxYns-2mWla-9ok_Y3Gcznjk,10
+auto_coder-0.1.176.dist-info/RECORD,,
autocoder/auto_coder.py CHANGED

@@ -813,15 +813,7 @@ def main(input_args: Optional[List[str]] = None):
                 llm, args, code_auto_execute.Mode.SINGLE_ROUND
             )
             executor.run(query=args.query, context=s, source_code="")
-            return
-    elif raw_args.agent_command == "chat":
-        from autocoder.rag.rag_entry import RAGFactory
-
-        rag = RAGFactory.get_rag(llm=llm, args=args, path="")
-        rag.stream_chat_repl(args.query)
-        return
-
-
+            return
     elif raw_args.doc_command == "serve":

         from autocoder.rag.llm_wrapper import LLWrapper
@@ -846,6 +838,13 @@ def main(input_args: Optional[List[str]] = None):
         llm_wrapper = LLWrapper(llm=llm, rag=rag)
         serve(llm=llm_wrapper, args=server_args)
         return
+
+    elif raw_args.doc_command == "chat":
+        from autocoder.rag.rag_entry import RAGFactory
+
+        rag = RAGFactory.get_rag(llm=llm, args=args, path="")
+        rag.stream_chat_repl(args.query)
+        return

     else:
         http_doc = HttpDoc(args=args, llm=llm, urls=None)
autocoder/auto_coder_rag.py CHANGED

@@ -18,7 +18,7 @@ from rich.console import Console
 from rich.table import Table
 import os

-from autocoder.rag.document_retriever import
+from autocoder.rag.document_retriever import process_file_local
 from autocoder.rag.token_counter import TokenCounter

 if platform.system() == "Windows":
@@ -90,16 +90,24 @@ def initialize_system():

     if choice == "1":
         print_status(get_message("deploying_model").format("Deepseek官方"), "")
-
+
         deploy_cmd = [
-            "byzerllm",
-            "
-            "--
-            "
-            "--
-            "
-            "--
-            "
+            "byzerllm",
+            "deploy",
+            "--pretrained_model_type",
+            "saas/openai",
+            "--cpus_per_worker",
+            "0.001",
+            "--gpus_per_worker",
+            "0",
+            "--worker_concurrency",
+            "1000",
+            "--num_workers",
+            "1",
+            "--infer_params",
+            f"saas.base_url=https://api.deepseek.com/v1 saas.api_key={api_key} saas.model=deepseek-chat",
+            "--model",
+            "deepseek_chat",
         ]

         try:
@@ -138,7 +146,9 @@ def main(input_args: Optional[List[str]] = None):

     # Serve command
     serve_parser = subparsers.add_parser("serve", help="Start the RAG server")
-    serve_parser.add_argument(
+    serve_parser.add_argument(
+        "--quick", action="store_true", help="Skip system initialization"
+    )
     serve_parser.add_argument("--file", default="", help=desc["file"])
     serve_parser.add_argument("--model", default="deepseek_chat", help=desc["model"])
     serve_parser.add_argument("--index_model", default="", help=desc["index_model"])
@@ -160,7 +170,19 @@ def main(input_args: Optional[List[str]] = None):
         "--rag_context_window_limit",
         type=int,
         default=110000,
-        help="",
+        help="The input context window limit for RAG",
+    )
+    serve_parser.add_argument(
+        "--full_text_ratio",
+        type=float,
+        default=0.7,
+        help="The ratio of full text area in the input context window (0.0 to 1.0)",
+    )
+    serve_parser.add_argument(
+        "--segment_ratio",
+        type=float,
+        default=0.2,
+        help="The ratio of segment area in the input context window (0.0 to 1.0)",
     )
     serve_parser.add_argument(
         "--required_exts", default="", help=desc["doc_build_parse_required_exts"]
@@ -198,6 +220,17 @@ def main(input_args: Optional[List[str]] = None):
         help="Monitor mode for the doc update",
     )

+    serve_parser.add_argument(
+        "--disable_auto_window",
+        action="store_true",
+        help="Disable automatic window adaptation for documents",
+    )
+    serve_parser.add_argument(
+        "--disable_segment_reorder",
+        action="store_true",
+        help="Disable reordering of document segments after retrieval",
+    )
+
     # Tools command
     tools_parser = subparsers.add_parser("tools", help="Various tools")
     tools_subparsers = tools_parser.add_subparsers(dest="tool", help="Available tools")
@@ -255,7 +288,7 @@ def main(input_args: Optional[List[str]] = None):

 def count_tokens(tokenizer_path: str, file_path: str):
     token_counter = TokenCounter(tokenizer_path)
-    source_codes =
+    source_codes = process_file_local(file_path)

     console = Console()
     table = Table(title="Token Count Results")
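
For orientation, the new window-related flags added to the serve subcommand above can be exercised through main(input_args=...), which this module already exposes. The sketch below is hypothetical: the flag names come from this diff, but everything else (a deployed model, document sources, any other required options) must already be configured in your installation.

# Hypothetical invocation sketch; only the flags shown in the diff are used here.
from autocoder.auto_coder_rag import main

main(input_args=[
    "serve",
    "--model", "deepseek_chat",
    "--rag_context_window_limit", "110000",
    "--full_text_ratio", "0.7",        # share of the window reserved for whole documents
    "--segment_ratio", "0.2",          # share of the window reserved for retrieved segments
    "--disable_segment_reorder",       # keep segments in relevance order instead of source order
])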
autocoder/common/__init__.py CHANGED

@@ -11,6 +11,8 @@ class SourceCode(pydantic.BaseModel):
     module_name: str
     source_code: str
     tag: str = ""
+    tokens: int = -1
+    metadata: Dict[str, Any] = {}


 class TranslateReadme(pydantic.BaseModel):
@@ -281,9 +283,11 @@ class AutoCoderArgs(pydantic.BaseModel):
     doc_command: Optional[str] = None
     required_exts: Optional[str] = None

-    monitor_mode:
-
-
+    monitor_mode: bool = False
+    disable_auto_window: bool = False
+    disable_segment_reorder: bool = False
+    rag_doc_filter_relevance: int = 5
+    tokenizer_path: Optional[str] = None
     skip_confirm: Optional[bool] = False
     silence: Optional[bool] = False
     exclude_files: Optional[Union[str, List[str]]] = ""
@@ -304,5 +308,9 @@ class AutoCoderArgs(pydantic.BaseModel):

     agent_designer_mode: Optional[str] = "svg"

+    full_text_ratio: Optional[float] = 0.7
+    segment_ratio: Optional[float] = 0.2
+    buff_ratio: Optional[float] = 0.1
+
     class Config:
         protected_namespaces = ()
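
The two new SourceCode fields above (tokens and metadata) are what the RAG pipeline uses to carry per-document token counts and chunk provenance. A minimal sketch, assuming only the fields visible in this diff; the example values are illustrative:

from autocoder.common import SourceCode

chunk = SourceCode(
    module_name="##File: /docs/guide.md#chunk2",
    source_code="...segment text...",
    tokens=1375,  # pre-computed token count; the default -1 means "not counted"
    metadata={"original_doc": "##File: /docs/guide.md", "chunk_index": 2},
)

# model_dump()/model_validate() round-trips are what DocumentRetriever relies on
# when it caches entries and later re-hydrates them.
data = chunk.model_dump()
restored = SourceCode.model_validate(data)
assert restored.tokens == 1375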
autocoder/pyproject/__init__.py CHANGED

@@ -187,7 +187,11 @@ class PyProject:

     def convert_to_source_code(self, file_path):
         module_name = file_path
-
+        try:
+            source_code = self.read_file_content(file_path)
+        except Exception as e:
+            logger.warning(f"Failed to read file: {file_path}. Error: {str(e)}")
+            return None
         return SourceCode(module_name=module_name, source_code=source_code)

     def get_package_source_codes(
autocoder/rag/document_retriever.py CHANGED

@@ -18,10 +18,15 @@ from loguru import logger
 from pydantic import BaseModel

 from autocoder.common import SourceCode
-from autocoder.rag.loaders import (
-
-
-
+from autocoder.rag.loaders import (
+    extract_text_from_docx,
+    extract_text_from_excel,
+    extract_text_from_pdf,
+    extract_text_from_ppt,
+)
+from autocoder.rag import variable_holder
+from autocoder.rag.token_counter import count_tokens_worker, count_tokens
+from uuid import uuid4

 cache_lock = threading.Lock()
@@ -34,72 +39,62 @@ class AddOrUpdateEvent(BaseModel):
     file_infos: List[Tuple[str, str, float]]


-
-
+def process_file_in_multi_process(
+    file_info: Tuple[str, str, float]
+) -> List[SourceCode]:
     start_time = time.time()
     file_path, relative_path, _ = file_info
     try:
         if file_path.endswith(".pdf"):
             with open(file_path, "rb") as f:
                 content = extract_text_from_pdf(f.read())
-            v = [SourceCode(module_name=file_path, source_code=content)]
-        elif file_path.endswith(".docx"):
-            with open(file_path, "rb") as f:
-                content = extract_text_from_docx(f.read())
-            v = [SourceCode(module_name=f"##File: {file_path}", source_code=content)]
-        elif file_path.endswith(".xlsx") or file_path.endswith(".xls"):
-            sheets = extract_text_from_excel(file_path)
             v = [
                 SourceCode(
-                    module_name=
-                    source_code=
+                    module_name=file_path,
+                    source_code=content,
+                    tokens=count_tokens_worker(content),
                 )
-                for sheet in sheets
             ]
-        elif file_path.endswith(".pptx"):
-            slides = extract_text_from_ppt(file_path)
-            content = "".join(f"#{slide[0]}\n{slide[1]}\n\n" for slide in slides)
-            v = [SourceCode(module_name=f"##File: {file_path}", source_code=content)]
-        else:
-            with open(file_path, "r", encoding="utf-8") as f:
-                content = f.read()
-            v = [SourceCode(module_name=f"##File: {file_path}", source_code=content)]
-        logger.info(f"Load file {file_path} in {time.time() - start_time}")
-        return v
-    except Exception as e:
-        logger.error(f"Error processing file {file_path}: {str(e)}")
-        return []
-
-
-def process_file2(file_info: Tuple[str, str, float]) -> List[SourceCode]:
-    start_time = time.time()
-    file_path, relative_path, _ = file_info
-    try:
-        if file_path.endswith(".pdf"):
-            with open(file_path, "rb") as f:
-                content = extract_text_from_pdf(f.read())
-            v = [SourceCode(module_name=file_path, source_code=content)]
         elif file_path.endswith(".docx"):
             with open(file_path, "rb") as f:
                 content = extract_text_from_docx(f.read())
-            v = [
+            v = [
+                SourceCode(
+                    module_name=f"##File: {file_path}",
+                    source_code=content,
+                    tokens=count_tokens_worker(content),
+                )
+            ]
         elif file_path.endswith(".xlsx") or file_path.endswith(".xls"):
             sheets = extract_text_from_excel(file_path)
             v = [
                 SourceCode(
                     module_name=f"##File: {file_path}#{sheet[0]}",
                     source_code=sheet[1],
+                    tokens=count_tokens_worker(sheet[1]),
                 )
                 for sheet in sheets
             ]
         elif file_path.endswith(".pptx"):
             slides = extract_text_from_ppt(file_path)
             content = "".join(f"#{slide[0]}\n{slide[1]}\n\n" for slide in slides)
-            v = [
+            v = [
+                SourceCode(
+                    module_name=f"##File: {file_path}",
+                    source_code=content,
+                    tokens=count_tokens_worker(content),
+                )
+            ]
         else:
             with open(file_path, "r", encoding="utf-8") as f:
                 content = f.read()
-            v = [
+            v = [
+                SourceCode(
+                    module_name=f"##File: {file_path}",
+                    source_code=content,
+                    tokens=count_tokens_worker(content),
+                )
+            ]
         logger.info(f"Load file {file_path} in {time.time() - start_time}")
         return v
     except Exception as e:
@@ -107,34 +102,59 @@ def process_file2(file_info: Tuple[str, str, float]) -> List[SourceCode]:
         return []


-def
+def process_file_local(file_path: str) -> List[SourceCode]:
     start_time = time.time()
     try:
         if file_path.endswith(".pdf"):
             with open(file_path, "rb") as f:
                 content = extract_text_from_pdf(f.read())
-            v = [
+            v = [
+                SourceCode(
+                    module_name=file_path,
+                    source_code=content,
+                    tokens=count_tokens(content),
+                )
+            ]
         elif file_path.endswith(".docx"):
             with open(file_path, "rb") as f:
                 content = extract_text_from_docx(f.read())
-            v = [
+            v = [
+                SourceCode(
+                    module_name=f"##File: {file_path}",
+                    source_code=content,
+                    tokens=count_tokens(content),
+                )
+            ]
         elif file_path.endswith(".xlsx") or file_path.endswith(".xls"):
             sheets = extract_text_from_excel(file_path)
             v = [
                 SourceCode(
                     module_name=f"##File: {file_path}#{sheet[0]}",
                     source_code=sheet[1],
+                    tokens=count_tokens(sheet[1]),
                 )
                 for sheet in sheets
             ]
         elif file_path.endswith(".pptx"):
             slides = extract_text_from_ppt(file_path)
             content = "".join(f"#{slide[0]}\n{slide[1]}\n\n" for slide in slides)
-            v = [
+            v = [
+                SourceCode(
+                    module_name=f"##File: {file_path}",
+                    source_code=content,
+                    tokens=count_tokens(content),
+                )
+            ]
         else:
             with open(file_path, "r", encoding="utf-8") as f:
                 content = f.read()
-            v = [
+            v = [
+                SourceCode(
+                    module_name=f"##File: {file_path}",
+                    source_code=content,
+                    tokens=count_tokens(content),
+                )
+            ]
         logger.info(f"Load file {file_path} in {time.time() - start_time}")
         return v
     except Exception as e:
@@ -205,7 +225,7 @@ class AutoCoderRAGDocListener:
                 self.update_cache(item)

     def update_cache(self, file_path):
-        source_code =
+        source_code = process_file_local(file_path)
         self.cache[file_path] = {
             "file_path": file_path,
             "content": [c.model_dump() for c in source_code],
@@ -220,7 +240,9 @@ class AutoCoderRAGDocListener:

     def open_watch(self):
         logger.info(f"start monitor: {self.path}...")
-        for changes in watch(
+        for changes in watch(
+            self.path, watch_filter=self.file_filter, stop_event=self.stop_event
+        ):
             for change in changes:
                 (action, path) = change
                 if action == Change.added or action == Change.modified:
@@ -290,7 +312,6 @@ class AutoCoderRAGAsyncUpdateQueue:
         self.thread.start()
         self.cache = self.read_cache()

-
     def _process_queue(self):
         while not self.stop_event.is_set():
             try:
@@ -324,8 +345,14 @@ class AutoCoderRAGAsyncUpdateQueue:
         # results = ray.get(
         #     [process_file.remote(file_info) for file_info in files_to_process]
         # )
-
-
+        from autocoder.rag.token_counter import initialize_tokenizer
+
+        with Pool(
+            processes=os.cpu_count(),
+            initializer=initialize_tokenizer,
+            initargs=(variable_holder.TOKENIZER_PATH,),
+        ) as pool:
+            results = pool.map(process_file_in_multi_process, files_to_process)

         for file_info, result in zip(files_to_process, results):
             self.update_cache(file_info, result)
@@ -365,7 +392,7 @@ class AutoCoderRAGAsyncUpdateQueue:
         elif isinstance(file_list, AddOrUpdateEvent):
             for file_info in file_list.file_infos:
                 logger.info(f"{file_info[0]} is detected to be updated")
-                result =
+                result = process_file_local(file_info)
                 self.update_cache(file_info, result)

         self.write_cache()
@@ -410,7 +437,9 @@ class AutoCoderRAGAsyncUpdateQueue:
             # Release the file lock
             fcntl.flock(lockf, fcntl.LOCK_UN)

-    def update_cache(
+    def update_cache(
+        self, file_info: Tuple[str, str, float], content: List[SourceCode]
+    ):
         file_path, relative_path, modify_time = file_info
         self.cache[file_path] = {
             "file_path": file_path,
@@ -485,11 +514,20 @@ class DocumentRetriever:
         required_exts: list,
         on_ray: bool = False,
         monitor_mode: bool = False,
+        single_file_token_limit: int = 60000,
+        disable_auto_window: bool = False,
     ) -> None:
         self.path = path
         self.ignore_spec = ignore_spec
         self.required_exts = required_exts
         self.monitor_mode = monitor_mode
+        self.single_file_token_limit = single_file_token_limit
+        self.disable_auto_window = disable_auto_window
+
+        # Files smaller than this will be merged
+        self.small_file_token_limit = self.single_file_token_limit / 4
+        # Maximum size of a merged file
+        self.small_file_merge_limit = self.single_file_token_limit / 2

         self.on_ray = on_ray
         if self.on_ray:
@@ -502,6 +540,13 @@ class DocumentRetriever:
                 path, ignore_spec, required_exts
             )

+        logger.info(f"DocumentRetriever initialized with:")
+        logger.info(f"  Path: {self.path}")
+        logger.info(f"  Diable auto window: {self.disable_auto_window} ")
+        logger.info(f"  Single file token limit: {self.single_file_token_limit}")
+        logger.info(f"  Small file token limit: {self.small_file_token_limit}")
+        logger.info(f"  Small file merge limit: {self.small_file_merge_limit}")
+
     def get_cache(self):
         if self.on_ray:
             return ray.get(self.cacher.get_cache.remote())
@@ -509,6 +554,102 @@ class DocumentRetriever:
             return self.cacher.get_cache()

     def retrieve_documents(self) -> Generator[SourceCode, None, None]:
+        logger.info("Starting document retrieval process")
+        waiting_list = []
+        waiting_tokens = 0
         for _, data in self.get_cache().items():
             for source_code in data["content"]:
-
+                doc = SourceCode.model_validate(source_code)
+                if self.disable_auto_window:
+                    yield doc
+                else:
+                    if doc.tokens <= 0:
+                        yield doc
+                    elif doc.tokens < self.small_file_token_limit:
+                        waiting_list, waiting_tokens = self._add_to_waiting_list(
+                            doc, waiting_list, waiting_tokens
+                        )
+                        if waiting_tokens >= self.small_file_merge_limit:
+                            yield from self._process_waiting_list(waiting_list)
+                            waiting_list = []
+                            waiting_tokens = 0
+                    elif doc.tokens > self.single_file_token_limit:
+                        yield from self._split_large_document(doc)
+                    else:
+                        yield doc
+        if waiting_list and not self.disable_auto_window:
+            yield from self._process_waiting_list(waiting_list)
+
+        logger.info("Document retrieval process completed")
+
+    def _add_to_waiting_list(
+        self, doc: SourceCode, waiting_list: List[SourceCode], waiting_tokens: int
+    ) -> Tuple[List[SourceCode], int]:
+        waiting_list.append(doc)
+        return waiting_list, waiting_tokens + doc.tokens
+
+    def _process_waiting_list(
+        self, waiting_list: List[SourceCode]
+    ) -> Generator[SourceCode, None, None]:
+        if len(waiting_list) == 1:
+            yield waiting_list[0]
+        elif len(waiting_list) > 1:
+            yield self._merge_documents(waiting_list)
+
+    def _merge_documents(self, docs: List[SourceCode]) -> SourceCode:
+        merged_content = "\n".join(
+            [f"#File: {doc.module_name}\n{doc.source_code}" for doc in docs]
+        )
+        merged_tokens = sum([doc.tokens for doc in docs])
+        merged_name = f"Merged_{len(docs)}_docs_{str(uuid4())}"
+        logger.info(
+            f"Merged {len(docs)} documents into {merged_name} (tokens: {merged_tokens})."
+        )
+        return SourceCode(
+            module_name=merged_name,
+            source_code=merged_content,
+            tokens=merged_tokens,
+            metadata={"original_docs": [doc.module_name for doc in docs]},
+        )
+
+    def _split_large_document(
+        self, doc: SourceCode
+    ) -> Generator[SourceCode, None, None]:
+        chunk_size = self.single_file_token_limit
+        total_chunks = (doc.tokens + chunk_size - 1) // chunk_size
+        logger.info(f"Splitting document {doc.module_name} into {total_chunks} chunks")
+        for i in range(0, doc.tokens, chunk_size):
+            chunk_content = doc.source_code[i : i + chunk_size]
+            chunk_tokens = min(chunk_size, doc.tokens - i)
+            chunk_name = f"{doc.module_name}#chunk{i//chunk_size+1}"
+            # logger.debug(f"  Created chunk: {chunk_name} (tokens: {chunk_tokens})")
+            yield SourceCode(
+                module_name=chunk_name,
+                source_code=chunk_content,
+                tokens=chunk_tokens,
+                metadata={
+                    "original_doc": doc.module_name,
+                    "chunk_index": i // chunk_size + 1,
+                },
+            )
+
+    def _split_document(
+        self, doc: SourceCode, token_limit: int
+    ) -> Generator[SourceCode, None, None]:
+        remaining_tokens = doc.tokens
+        chunk_number = 1
+        start_index = 0
+
+        while remaining_tokens > 0:
+            end_index = start_index + token_limit
+            chunk_content = doc.source_code[start_index:end_index]
+            chunk_tokens = min(token_limit, remaining_tokens)
+
+            chunk_name = f"{doc.module_name}#{chunk_number:06d}"
+            yield SourceCode(
+                module_name=chunk_name, source_code=chunk_content, tokens=chunk_tokens
+            )
+
+            start_index = end_index
+            remaining_tokens -= chunk_tokens
+            chunk_number += 1
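
For orientation, every auto-window threshold introduced above derives from a single knob, single_file_token_limit. The sketch below is illustrative only (not package code); it mirrors the branch order of retrieve_documents when auto-window is enabled, using the default limit from the diff:

single_file_token_limit = 60000
small_file_token_limit = single_file_token_limit / 4   # below this, docs wait to be merged
small_file_merge_limit = single_file_token_limit / 2   # flush the waiting list at this size

def route(doc_tokens: int) -> str:
    """Mirror of the branch order in retrieve_documents (disable_auto_window off)."""
    if doc_tokens <= 0:
        return "yield as-is (unknown token count)"
    if doc_tokens < small_file_token_limit:
        return "buffer for merging"
    if doc_tokens > single_file_token_limit:
        return "split into chunks"
    return "yield as-is"

for t in (0, 900, 15000, 59999, 120001):
    print(t, "->", route(t))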
autocoder/rag/long_context_rag.py CHANGED

@@ -13,16 +13,22 @@ from openai import OpenAI
 from rich.console import Console
 from rich.panel import Panel
 from rich.table import Table
-
+import statistics

 from autocoder.common import AutoCoderArgs, SourceCode
 from autocoder.rag.doc_filter import DocFilter
 from autocoder.rag.document_retriever import DocumentRetriever
-from autocoder.rag.relevant_utils import (
-
+from autocoder.rag.relevant_utils import (
+    DocRelevance,
+    FilterDoc,
+    TaskTiming,
+    parse_relevance,
+)
 from autocoder.rag.token_checker import check_token_limit
 from autocoder.rag.token_counter import RemoteTokenCounter, TokenCounter
 from autocoder.rag.token_limiter import TokenLimiter
+from tokenizers import Tokenizer
+from autocoder.rag import variable_holder


 class LongContextRAG:
@@ -44,11 +50,26 @@ class LongContextRAG:
         self.path = path
         self.relevant_score = self.args.rag_doc_filter_relevance or 5

+        self.full_text_ratio = args.full_text_ratio
+        self.segment_ratio = args.segment_ratio
+        self.buff_ratio = 1 - self.full_text_ratio - self.segment_ratio
+
+        if self.buff_ratio < 0:
+            raise ValueError(
+                "The sum of full_text_ratio and segment_ratio must be less than or equal to 1.0"
+            )
+
+        self.full_text_limit = int(args.rag_context_window_limit * self.full_text_ratio)
+        self.segment_limit = int(args.rag_context_window_limit * self.segment_ratio)
+        self.buff_limit = int(args.rag_context_window_limit * self.buff_ratio)
+
         self.tokenizer = None
         self.tokenizer_path = tokenizer_path
         self.on_ray = False

         if self.tokenizer_path:
+            variable_holder.TOKENIZER_PATH = self.tokenizer_path
+            variable_holder.TOKENIZER_MODEL = Tokenizer.from_file(self.tokenizer_path)
             self.tokenizer = TokenCounter(self.tokenizer_path)
         else:
             if llm.is_model_exist("deepseek_tokenizer"):
@@ -96,24 +117,41 @@ class LongContextRAG:
             self.required_exts,
             self.on_ray,
             self.monitor_mode,
+            ## Make sure the full-text area can hold at least one file
+            single_file_token_limit=self.full_text_limit - 100,
+            disable_auto_window=self.args.disable_auto_window
         )

         self.doc_filter = DocFilter(
             self.index_model, self.args, on_ray=self.on_ray, path=self.path
         )
-
-
-
-
-
-
-
-
-
-
+
+        doc_num = 0
+        token_num = 0
+        token_counts = []
+        for doc in self._retrieve_documents():
+            doc_num += 1
+            doc_tokens = doc.tokens
+            token_num += doc_tokens
+            token_counts.append(doc_tokens)
+
+        avg_tokens = statistics.mean(token_counts) if token_counts else 0
+        median_tokens = statistics.median(token_counts) if token_counts else 0

         logger.info(
-
+            "RAG Configuration:\n"
+            f"  Total docs: {doc_num}\n"
+            f"  Total tokens: {token_num}\n"
+            f"  Tokenizer path: {self.tokenizer_path}\n"
+            f"  Relevant score: {self.relevant_score}\n"
+            f"  Token limit: {self.token_limit}\n"
+            f"  Full text limit: {self.full_text_limit}\n"
+            f"  Segment limit: {self.segment_limit}\n"
+            f"  Buff limit: {self.buff_limit}\n"
+            f"  Max doc tokens: {max(token_counts) if token_counts else 0}\n"
+            f"  Min doc tokens: {min(token_counts) if token_counts else 0}\n"
+            f"  Avg doc tokens: {avg_tokens:.2f}\n"
+            f"  Median doc tokens: {median_tokens:.2f}\n"
        )

    def count_tokens(self, text: str) -> int:
@@ -350,9 +388,15 @@ class LongContextRAG:
         query_table.add_row("Relevant docs", str(len(relevant_docs)))

         # Add relevant docs information
-        relevant_docs_info =
-
-
+        relevant_docs_info = []
+        for doc in relevant_docs:
+            info = f"- {doc.module_name.replace(self.path,'',1)}"
+            if 'original_docs' in doc.metadata:
+                original_docs = ", ".join([doc.replace(self.path,"",1) for doc in doc.metadata['original_docs']])
+                info += f" (Original docs: {original_docs})"
+            relevant_docs_info.append(info)
+
+        relevant_docs_info = "\n".join(relevant_docs_info)
         query_table.add_row("Relevant docs list", relevant_docs_info)

         first_round_full_docs = []
@@ -363,8 +407,11 @@ class LongContextRAG:

         token_limiter = TokenLimiter(
             count_tokens=self.count_tokens,
-
+            full_text_limit=self.full_text_limit,
+            segment_limit=self.segment_limit,
+            buff_limit=self.buff_limit,
             llm=self.llm,
+            disable_segment_reorder = self.args.disable_segment_reorder
         )
         final_relevant_docs = token_limiter.limit_tokens(
             relevant_docs=relevant_docs,
@@ -395,9 +442,18 @@ class LongContextRAG:
         )

         # Add relevant docs information
-        final_relevant_docs_info =
-
-
+        final_relevant_docs_info = []
+        for doc in relevant_docs:
+            info = f"- {doc.module_name.replace(self.path,'',1)}"
+            if 'original_docs' in doc.metadata:
+                original_docs = ", ".join([doc.replace(self.path,"",1) for doc in doc.metadata['original_docs']])
+                info += f" (Original docs: {original_docs})"
+            if "chunk_ranges" in doc.metadata:
+                chunk_ranges = json.dumps(doc.metadata['chunk_ranges'],ensure_ascii=False)
+                info += f" (Chunk ranges: {chunk_ranges})"
+            final_relevant_docs_info.append(info)
+
+        final_relevant_docs_info = "\n".join(final_relevant_docs_info)
         query_table.add_row("Final Relevant docs list", final_relevant_docs_info)

         # Create a panel to contain the table
@@ -409,8 +465,10 @@ class LongContextRAG:

         # Log the panel using rich
         console.print(panel)
-
-
+
+        request_tokens = sum([doc.tokens for doc in relevant_docs])
+        target_model = model or self.llm.default_model_name
+        logger.info(f"Start to send to model {target_model} with {request_tokens} tokens")

         new_conversations = conversations[:-1] + [
             {
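
A worked example of the window partition computed in LongContextRAG.__init__ above (a sketch, not package code), using this release's CLI defaults: a 110000-token window split by full_text_ratio=0.7 and segment_ratio=0.2, with the remainder as a buffer.

rag_context_window_limit = 110000
full_text_ratio = 0.7
segment_ratio = 0.2
buff_ratio = 1 - full_text_ratio - segment_ratio   # must not go negative, or __init__ raises

full_text_limit = int(rag_context_window_limit * full_text_ratio)   # ~70% of the window
segment_limit = int(rag_context_window_limit * segment_ratio)       # ~20% of the window
buff_limit = int(rag_context_window_limit * buff_ratio)             # whatever is left over

print(full_text_limit, segment_limit, buff_limit)
# DocumentRetriever then receives full_text_limit - 100 as single_file_token_limit,
# so the full-text area can always hold at least one (possibly merged) document.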
autocoder/rag/token_counter.py CHANGED

@@ -2,29 +2,46 @@ import time
 from loguru import logger
 from tokenizers import Tokenizer
 from multiprocessing import Pool, cpu_count
+from autocoder.rag.variable_holder import TOKENIZER_MODEL
+

 class RemoteTokenCounter:
-    def __init__(self,tokenizer) -> None:
+    def __init__(self, tokenizer) -> None:
        self.tokenizer = tokenizer

-    def count_tokens(self, text: str) -> int:
-        try:
+    def count_tokens(self, text: str) -> int:
+        try:
            v = self.tokenizer.chat_oai(
                conversations=[{"role": "user", "content": text}]
-            )
+            )
            return int(v[0].output)
        except Exception as e:
            logger.error(f"Error counting tokens: {str(e)}")
            return -1
-
+
+
 def initialize_tokenizer(tokenizer_path):
-    global tokenizer_model
+    global tokenizer_model
    tokenizer_model = Tokenizer.from_file(tokenizer_path)

+
+def count_tokens(text: str) -> int:
+    try:
+        # start_time = time.time_ns()
+        encoded = TOKENIZER_MODEL.encode('{"role":"user","content":"' + text + '"}')
+        v = len(encoded.ids)
+        # elapsed_time = time.time_ns() - start_time
+        # logger.info(f"Token counting took {elapsed_time/1000000} ms")
+        return v
+    except Exception as e:
+        logger.error(f"Error counting tokens: {str(e)}")
+        return -1
+
+
 def count_tokens_worker(text: str) -> int:
    try:
        # start_time = time.time_ns()
-        encoded = tokenizer_model.encode('{"role":"user","content":"'+text+'"}')
+        encoded = tokenizer_model.encode('{"role":"user","content":"' + text + '"}')
        v = len(encoded.ids)
        # elapsed_time = time.time_ns() - start_time
        # logger.info(f"Token counting took {elapsed_time/1000000} ms")
@@ -33,11 +50,16 @@ def count_tokens_worker(text: str) -> int:
        logger.error(f"Error counting tokens: {str(e)}")
        return -1

+
 class TokenCounter:
    def __init__(self, tokenizer_path: str):
        self.tokenizer_path = tokenizer_path
        self.num_processes = cpu_count() - 1 if cpu_count() > 1 else 1
-        self.pool = Pool(
+        self.pool = Pool(
+            processes=self.num_processes,
+            initializer=initialize_tokenizer,
+            initargs=(self.tokenizer_path,),
+        )

    def count_tokens(self, text: str) -> int:
-        return self.pool.apply(count_tokens_worker, (text,))
+        return self.pool.apply(count_tokens_worker, (text,))
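
The rebuilt TokenCounter above relies on the multiprocessing worker-initializer pattern: each pool worker loads the tokenizers.Tokenizer once in initialize_tokenizer instead of re-loading (or pickling) it per task. A minimal standalone sketch, assuming a tokenizer JSON file at a placeholder path:

from multiprocessing import Pool

from autocoder.rag.token_counter import initialize_tokenizer, count_tokens_worker

if __name__ == "__main__":
    tokenizer_path = "/path/to/tokenizer.json"  # hypothetical path
    with Pool(
        processes=4,
        initializer=initialize_tokenizer,   # runs once per worker process
        initargs=(tokenizer_path,),
    ) as pool:
        texts = ["hello world", "def f(x): return x"]
        print(pool.map(count_tokens_worker, texts))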
autocoder/rag/token_limiter.py CHANGED

@@ -13,15 +13,21 @@ class TokenLimiter:
    def __init__(
        self,
        count_tokens: Callable[[str], int],
-
+        full_text_limit: int,
+        segment_limit: int,
+        buff_limit: int,
        llm,
+        disable_segment_reorder:bool
    ):
        self.count_tokens = count_tokens
-        self.
+        self.full_text_limit = full_text_limit
+        self.segment_limit = segment_limit
+        self.buff_limit = buff_limit
        self.llm = llm
        self.first_round_full_docs = []
        self.second_round_extracted_docs = []
        self.sencond_round_time = 0
+        self.disable_segment_reorder = disable_segment_reorder

    @byzerllm.prompt()
    def extract_relevance_range_from_docs_with_conversation(
@@ -88,21 +94,50 @@ class TokenLimiter:
        final_relevant_docs = []
        token_count = 0
        doc_num_count = 0
-
+
+        reorder_relevant_docs = []
+        added_docs = set()
+
+        ## Segmentation (when a single document is too large) and reordering logic
+        ## 1. Background: during retrieval, many documents are split into multiple segments
+        ## 2. Problem: recalled segments are ranked by relevance score, so they may be out of order
+        ##    relative to the original text, which reinforces LLM hallucination.
+        ## 3. Goal: reorder the segments so that segments from the same document stay contiguous
+        ##    and appear in the correct order.
+        ## 4. Implementation options:
+        ##    a) Option 1 (keep positions): swap a document's segments into place according to chunk_index
+        ##    b) Option 2 (current implementation): while iterating, when segment A of a document is found,
+        ##       immediately collect all other segments of that document, sort them, and insert the sorted
+        ##       segments at segment A's position.
+        ## TODO:
+        ##     1. Later, decide via parameters whether to enable reordering and which strategy to use
+        if not self.disable_segment_reorder:
+            for doc in relevant_docs:
+                if doc.metadata.get('original_doc') and doc.metadata.get('chunk_index'):
+                    if doc.metadata['original_doc'] not in added_docs:
+                        original_doc = doc.metadata['original_doc']
+                        chunks = [d for d in relevant_docs if d.metadata.get('original_doc') == original_doc]
+                        chunks.sort(key=lambda x: x.metadata['chunk_index'])
+                        reorder_relevant_docs.extend(chunks)
+                        added_docs.add(original_doc)
+                elif doc not in added_docs:
+                    reorder_relevant_docs.append(doc)
+                    added_docs.add(doc.module_name)
+
+        ## Path without window partitioning
+        for doc in reorder_relevant_docs:
            doc_tokens = self.count_tokens(doc.source_code)
            doc_num_count += 1
-            if token_count + doc_tokens <= self.
+            if token_count + doc_tokens <= self.full_text_limit + self.segment_limit:
                final_relevant_docs.append(doc)
                token_count += doc_tokens
            else:
                break

-
-
+        ## If the window cannot hold all relevant documents, partition it
+        if len(final_relevant_docs) < len(reorder_relevant_docs):
+            ## Fill the full_text partition first
            token_count = 0
-            new_token_limit = self.
+            new_token_limit = self.full_text_limit
            doc_num_count = 0
-            for doc in
+            for doc in reorder_relevant_docs:
                doc_tokens = self.count_tokens(doc.source_code)
                doc_num_count += 1
                if token_count + doc_tokens <= new_token_limit:
@@ -111,9 +146,19 @@ class TokenLimiter:
                else:
                    break

+            if len(self.first_round_full_docs) > 0:
+                remaining_tokens = (
+                    self.full_text_limit + self.segment_limit - token_count
+                )
+            else:
+                logger.warning(
+                    "Full text area is empty, this is may caused by the single doc is too long"
+                )
+                remaining_tokens = self.full_text_limit + self.segment_limit
+
+            ## Then fill the segment partition
            sencond_round_start_time = time.time()
-
-            remaining_docs = relevant_docs[len(self.first_round_full_docs) :]
+            remaining_docs = reorder_relevant_docs[len(self.first_round_full_docs) :]
            logger.info(
                f"first round docs: {len(self.first_round_full_docs)} remaining docs: {len(remaining_docs)} index_filter_workers: {index_filter_workers}"
            )
@@ -130,7 +175,7 @@ class TokenLimiter:
                    result = future.result()
                    if result and remaining_tokens > 0:
                        self.second_round_extracted_docs.append(result)
-                        tokens =
+                        tokens = result.tokens
                        if tokens > 0:
                            remaining_tokens -= tokens
                        else:
@@ -184,7 +229,13 @@ class TokenLimiter:
                    content += chunk + "\n"

                return SourceCode(
-                    module_name=doc.module_name,
+                    module_name=doc.module_name,
+                    source_code=content.strip(),
+                    tokens=self.count_tokens(content),
+                    metadata={
+                        "original_doc": doc.module_name,
+                        "chunk_ranges": json_objs,
+                    },
                )
            except Exception as e:
                if attempt < max_retries - 1:
@@ -196,5 +247,7 @@ class TokenLimiter:
                        f"Failed to process doc {doc.module_name} after {max_retries} attempts: {str(e)}"
                    )
                    return SourceCode(
-                        module_name=doc.module_name,
+                        module_name=doc.module_name,
+                        source_code="",
+                        tokens= 0
                    )
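
A toy illustration (a sketch, not package code) of the segment re-ordering described in the comments above: chunks of the same original document are grouped and sorted by chunk_index at the position of their first occurrence, while non-chunked documents keep their relative place. The else branch is simplified compared to the code in the diff.

from autocoder.common import SourceCode

relevant_docs = [
    SourceCode(module_name="a#chunk2", source_code="a2", tokens=10,
               metadata={"original_doc": "a", "chunk_index": 2}),
    SourceCode(module_name="b", source_code="b", tokens=10, metadata={}),
    SourceCode(module_name="a#chunk1", source_code="a1", tokens=10,
               metadata={"original_doc": "a", "chunk_index": 1}),
]

reordered, added = [], set()
for doc in relevant_docs:
    if doc.metadata.get("original_doc") and doc.metadata.get("chunk_index"):
        if doc.metadata["original_doc"] not in added:
            original = doc.metadata["original_doc"]
            chunks = [d for d in relevant_docs if d.metadata.get("original_doc") == original]
            chunks.sort(key=lambda d: d.metadata["chunk_index"])
            reordered.extend(chunks)
            added.add(original)
    else:
        reordered.append(doc)

print([d.module_name for d in reordered])  # ['a#chunk1', 'a#chunk2', 'b']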
autocoder/suffixproject/__init__.py CHANGED

@@ -121,7 +121,11 @@ class SuffixProject:

    def convert_to_source_code(self, file_path):
        module_name = file_path
-
+        try:
+            source_code = self.read_file_content(file_path)
+        except Exception as e:
+            logger.warning(f"Failed to read file: {file_path}. Error: {str(e)}")
+            return None
        return SourceCode(module_name=module_name, source_code=source_code)

    def get_source_codes(self) -> Generator[SourceCode, None, None]:
autocoder/tsproject/__init__.py CHANGED

@@ -152,7 +152,11 @@ class TSProject:
            return None

        module_name = file_path
-
+        try:
+            source_code = self.read_file_content(file_path)
+        except Exception as e:
+            logger.warning(f"Failed to read file: {file_path}. Error: {str(e)}")
+            return None

        if not FileUtils.has_sufficient_content(source_code, min_line_count=1):
            return None
autocoder/version.py CHANGED

@@ -1 +1 @@
-__version__ = "0.1.
+__version__ = "0.1.176"