auto-coder 0.1.173__py3-none-any.whl → 0.1.175__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of auto-coder might be problematic. Click here for more details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: auto-coder
3
- Version: 0.1.173
3
+ Version: 0.1.175
4
4
  Summary: AutoCoder: AutoCoder
5
5
  Author: allwefantasy
6
6
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
@@ -1,13 +1,13 @@
1
1
  autocoder/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- autocoder/auto_coder.py,sha256=eKwIakuHyjagdDcuMMFqcfu05N9nNOCoVd2fmsVjyLA,32211
2
+ autocoder/auto_coder.py,sha256=HmgKa_ZApFlCsqo6BvuVeCPuncBT_Dh29ayZxxGR6lo,32216
3
3
  autocoder/auto_coder_lang.py,sha256=4qIS1tbEI8mpbtt6ThppTwKOM6MLuJTWJdgs5jIDGE0,2301
4
- autocoder/auto_coder_rag.py,sha256=kd62w64-MD2zHzEYgh_9HUeDUJPUn4pwRQy10WrN-1o,10583
4
+ autocoder/auto_coder_rag.py,sha256=y-iZkb7Zw2JORbxGYaOoIhVQXBohGyUBFXN675qsgXg,11280
5
5
  autocoder/auto_coder_server.py,sha256=qRY88mkBnqSGFDcwYE5gwpe2WPhIw1nEH6LdbjCQhQk,20306
6
6
  autocoder/chat_auto_coder.py,sha256=i5xIuWlTqF0pJz8kXoa-_bW3Ic3SfCFvU2WJIMxrUHU,81798
7
7
  autocoder/chat_auto_coder_lang.py,sha256=QYtu5gWEQmWKVovR_qUZ8plySZarNFX_Onk-1vN9IiA,8524
8
8
  autocoder/command_args.py,sha256=ftWw6HnFUZPiQPt1oV-SfpHQe69XN3knaFy1lpROBcU,26854
9
9
  autocoder/lang.py,sha256=e-07rYTgimpxS8sm-AxKSmH4kKQX4N05YFHJBg9trVs,12598
10
- autocoder/version.py,sha256=9_6c2OzVoNP3LHOtNcm5gMxCqJ1Fv1Ql0AoMYGqXy88,23
10
+ autocoder/version.py,sha256=7VuhzksZrpUCv5FBnLSYtIkaqKki-s7Gt9mfU67bB6A,23
11
11
  autocoder/agent/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
12
  autocoder/agent/auto_tool.py,sha256=DBzip-P_T6ZtT2eHexPcusmKYD0h7ufzp7TLwXAY10E,11554
13
13
  autocoder/agent/coder.py,sha256=dnITYHqkcOip8zV4lywbkYNH9w7Q3qyYaUArJ4WPrTs,866
@@ -17,7 +17,7 @@ autocoder/agent/project_reader.py,sha256=-MWRqsr7O4mvU0PIpAhOUBb29htZAvA37pa_GeE
17
17
  autocoder/chat/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
18
  autocoder/common/JupyterClient.py,sha256=O-wi6pXeAEYhAY24kDa0BINrLYvKS6rKyWe98pDClS0,2816
19
19
  autocoder/common/ShellClient.py,sha256=fM1q8t_XMSbLBl2zkCNC2J9xuyKN3eXzGm6hHhqL2WY,2286
20
- autocoder/common/__init__.py,sha256=n9NwLwM8Rej_wgAvTXB6owPtGkITOrP5Y69Bg2PGYfY,9870
20
+ autocoder/common/__init__.py,sha256=FB0MdcGtI60-jPWXurNvOS9dZUZCg6HmgzHNVuHiW6c,10076
21
21
  autocoder/common/anything2images.py,sha256=0ILBbWzY02M-CiWB-vzuomb_J1hVdxRcenAfIrAXq9M,25283
22
22
  autocoder/common/audio.py,sha256=Kn9nWKQddWnUrAz0a_ZUgjcu4VUU_IcZBigT7n3N3qc,7439
23
23
  autocoder/common/cleaner.py,sha256=NU72i8C6o9m0vXExab7nao5bstBUsfJFcj11cXa9l4U,1089
@@ -56,13 +56,13 @@ autocoder/index/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
56
56
  autocoder/index/for_command.py,sha256=zfbvQnhHjsAqBc4Ce1kMGIu0jPEk_rtH7fntg89_4z0,3092
57
57
  autocoder/index/index.py,sha256=6uakPXThpDWxAyOAP-7AbMuXaXJJkBKctL5RkNWGdGw,22485
58
58
  autocoder/index/symbols_utils.py,sha256=CjcjUVajmJZB75Ty3a7kMv1BZphrm-tIBAdOJv6uo-0,2037
59
- autocoder/pyproject/__init__.py,sha256=oTGAy6sV4ua7l3nRxfzZtZrwq_YhQOKqLzbdVWDN7yY,13007
59
+ autocoder/pyproject/__init__.py,sha256=-2-ImQVw6e3NQZQOyDlHEP5b4xVs5ur2G5izB-JCa-A,13160
60
60
  autocoder/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
61
61
  autocoder/rag/api_server.py,sha256=zokIlDJlk7ucRorSLQm80uICO1mecfmn4J2zVqEBskE,6786
62
62
  autocoder/rag/doc_filter.py,sha256=LqU8Wi6klwpY9WTHVtkioSHpmo9IWhRz39dzV1gvp6E,9315
63
- autocoder/rag/document_retriever.py,sha256=04Vhbr1jZgPbTefFWK1TI-9MrwjGqb_Ls7QOAU3CPjw,18479
63
+ autocoder/rag/document_retriever.py,sha256=plwm8BpC55VJTUWCZyG4HsXYm-niqUsXaBMDLrLgYj0,23348
64
64
  autocoder/rag/llm_wrapper.py,sha256=xRbTBpLUH43Ah5jplL8WWWU-kjKfNgEJoUntLGBq5F4,2484
65
- autocoder/rag/long_context_rag.py,sha256=H-2N_lgBbCqcWKlh0yZIYDDm2p_Y0WRLJG6-m0wNRlU,16380
65
+ autocoder/rag/long_context_rag.py,sha256=qKuXBuwuI_eoBh7XNApbDfVYtG4sPqBbh3s7yscIUuI,19391
66
66
  autocoder/rag/rag_config.py,sha256=8LwFcTd8OJWWwi1_WY4IzjqgtT6RyE2j4PjxS5cCTDE,802
67
67
  autocoder/rag/rag_entry.py,sha256=V1RJ8RGqM30DNPmzymv64rZjNRGWn6kfc8sRy_LECg0,2451
68
68
  autocoder/rag/raw_rag.py,sha256=yS2Ur6kG0IRjhCj2_VonwxjY_xls_E62jO5Gz5j2nqE,2952
@@ -70,9 +70,10 @@ autocoder/rag/relevant_utils.py,sha256=OGfp98OXG4jr3jNmtHIeXGPF8mOlIbTnolPIVTZzY
70
70
  autocoder/rag/simple_directory_reader.py,sha256=LkKreCkNdEOoL4fNhc3_hDoyyWTQUte4uqextISRz4U,24485
71
71
  autocoder/rag/simple_rag.py,sha256=I902EUqOK1WM0Y2WFd7RzDJYofElvTZNLVCBtX5A9rc,14885
72
72
  autocoder/rag/token_checker.py,sha256=jc76x6KWmvVxds6W8juZfQGaoErudc2HenG3sNQfSLs,2819
73
- autocoder/rag/token_counter.py,sha256=8bcnDPpYkbq_KGhw4xIQAYmZwpVqPyMizZmoh7FsUnA,1592
74
- autocoder/rag/token_limiter.py,sha256=6rNsR0iQPfh8Vk4LSiF81Vnp3bVNOg6WTsFBR5tPYic,7749
73
+ autocoder/rag/token_counter.py,sha256=9ujfI5xQvwzKpN9XFWQGnXpm0h1sL7kgIJxgposcxNo,2096
74
+ autocoder/rag/token_limiter.py,sha256=dGSjKWwP_3rMwr8Yq06xqK2BrHpxW8Trn1gQLfnDOA8,8749
75
75
  autocoder/rag/types.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
76
+ autocoder/rag/variable_holder.py,sha256=pDayuCnlKj7-bkn4iUHX5gea9UObddbi3ZnXotmxCs4,45
76
77
  autocoder/rag/loaders/__init__.py,sha256=EQHEZ5Cmz-mGP2SllUTvcIbYCnF7W149dNpNItfs0yE,304
77
78
  autocoder/rag/loaders/docx_loader.py,sha256=g6Ta8rMUbfgwB8N1qiajhyO6wpaWl7zygAZiKShuioI,174
78
79
  autocoder/rag/loaders/excel_loader.py,sha256=Ue8YB1z_kBs8SjIPuBskyM08Q1JiONs_BJZPrzi59oo,896
@@ -80,8 +81,8 @@ autocoder/rag/loaders/pdf_loader.py,sha256=CGfXOja7QZ7mHN-U5MsTiVMFzjP322rTj3dkY
80
81
  autocoder/rag/loaders/ppt_loader.py,sha256=7VEYc-bqgK8VHCoGC3DIUcqbpda-E5jQF9lYLqP256I,1681
81
82
  autocoder/regex_project/__init__.py,sha256=EBZeCL5ORyD_9_5u_UuG4s7XtpXOu0y1sWDmxWFtufE,6781
82
83
  autocoder/regexproject/__init__.py,sha256=ThuvVFdpw1EgWv4aIRkhg3ZclKPxMVharUKWppFpQ8o,8436
83
- autocoder/suffixproject/__init__.py,sha256=L_xgbsiQAJev0N3RIFp5w2cRGHptoL2sF7Omzp6Z6NU,9670
84
- autocoder/tsproject/__init__.py,sha256=FLwH14wSr3w0Ul7MfRggv6ol2rzctB4EN_qQFUM_Xag,10278
84
+ autocoder/suffixproject/__init__.py,sha256=EaQoumMzZ2COxMiI_GnL3SG4LGzRj0Qw7UpqLfNLCw8,9823
85
+ autocoder/tsproject/__init__.py,sha256=QmEpNZYUJq1o0lGMs3UuUIUU-2aq_3eh1VxqnIc-hME,10431
85
86
  autocoder/utils/__init__.py,sha256=O3n6cpsgkIbbMuwmBHSQ1dls_IBD7_7YKFFaeKNo_tc,1193
86
87
  autocoder/utils/coder.py,sha256=rK8e0svQBe0NOP26dIGToUXgha_hUDgxlWoC_p_r7oc,5698
87
88
  autocoder/utils/conversation_store.py,sha256=sz-hhY7sttPAUOAQU6Pze-5zJc3j0_Emj22dM_0l5ro,1161
@@ -94,9 +95,9 @@ autocoder/utils/request_event_queue.py,sha256=r3lo5qGsB1dIjzVQ05dnr0z_9Z3zOkBdP1
94
95
  autocoder/utils/request_queue.py,sha256=nwp6PMtgTCiuwJI24p8OLNZjUiprC-TsefQrhMI-yPE,3889
95
96
  autocoder/utils/rest.py,sha256=3tXA8KZG6jKz_tddHNLGx77Icee88WcUeesfNsgPno4,8790
96
97
  autocoder/utils/tests.py,sha256=BqphrwyycGAvs-5mhH8pKtMZdObwhFtJ5MC_ZAOiLq8,1340
97
- auto_coder-0.1.173.dist-info/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
98
- auto_coder-0.1.173.dist-info/METADATA,sha256=2sQqrmtnneZ55RPdg6SMblMmbtMD7fv7-IbLSFAcmiU,2352
99
- auto_coder-0.1.173.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
100
- auto_coder-0.1.173.dist-info/entry_points.txt,sha256=0nzHtHH4pNcM7xq4EBA2toS28Qelrvcbrr59GqD_0Ak,350
101
- auto_coder-0.1.173.dist-info/top_level.txt,sha256=Jqc0_uJSw2GwoFQAa9iJxYns-2mWla-9ok_Y3Gcznjk,10
102
- auto_coder-0.1.173.dist-info/RECORD,,
98
+ auto_coder-0.1.175.dist-info/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
99
+ auto_coder-0.1.175.dist-info/METADATA,sha256=JrgeRETDy_kU_7g-1UNJfKkxWYlSgamCpiihAYl04Yw,2352
100
+ auto_coder-0.1.175.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
101
+ auto_coder-0.1.175.dist-info/entry_points.txt,sha256=0nzHtHH4pNcM7xq4EBA2toS28Qelrvcbrr59GqD_0Ak,350
102
+ auto_coder-0.1.175.dist-info/top_level.txt,sha256=Jqc0_uJSw2GwoFQAa9iJxYns-2mWla-9ok_Y3Gcznjk,10
103
+ auto_coder-0.1.175.dist-info/RECORD,,
autocoder/auto_coder.py CHANGED
@@ -813,15 +813,7 @@ def main(input_args: Optional[List[str]] = None):
813
813
  llm, args, code_auto_execute.Mode.SINGLE_ROUND
814
814
  )
815
815
  executor.run(query=args.query, context=s, source_code="")
816
- return
817
- elif raw_args.agent_command == "chat":
818
- from autocoder.rag.rag_entry import RAGFactory
819
-
820
- rag = RAGFactory.get_rag(llm=llm, args=args, path="")
821
- rag.stream_chat_repl(args.query)
822
- return
823
-
824
-
816
+ return
825
817
  elif raw_args.doc_command == "serve":
826
818
 
827
819
  from autocoder.rag.llm_wrapper import LLWrapper
@@ -846,6 +838,13 @@ def main(input_args: Optional[List[str]] = None):
846
838
  llm_wrapper = LLWrapper(llm=llm, rag=rag)
847
839
  serve(llm=llm_wrapper, args=server_args)
848
840
  return
841
+
842
+ elif raw_args.doc_command == "chat":
843
+ from autocoder.rag.rag_entry import RAGFactory
844
+
845
+ rag = RAGFactory.get_rag(llm=llm, args=args, path="")
846
+ rag.stream_chat_repl(args.query)
847
+ return
849
848
 
850
849
  else:
851
850
  http_doc = HttpDoc(args=args, llm=llm, urls=None)
@@ -18,7 +18,7 @@ from rich.console import Console
18
18
  from rich.table import Table
19
19
  import os
20
20
 
21
- from autocoder.rag.document_retriever import process_file3
21
+ from autocoder.rag.document_retriever import process_file_local
22
22
  from autocoder.rag.token_counter import TokenCounter
23
23
 
24
24
  if platform.system() == "Windows":
@@ -90,16 +90,24 @@ def initialize_system():
90
90
 
91
91
  if choice == "1":
92
92
  print_status(get_message("deploying_model").format("Deepseek官方"), "")
93
-
93
+
94
94
  deploy_cmd = [
95
- "byzerllm", "deploy",
96
- "--pretrained_model_type", "saas/openai",
97
- "--cpus_per_worker", "0.001",
98
- "--gpus_per_worker", "0",
99
- "--worker_concurrency", "1000",
100
- "--num_workers", "1",
101
- "--infer_params", f"saas.base_url=https://api.deepseek.com/v1 saas.api_key={api_key} saas.model=deepseek-chat",
102
- "--model", "deepseek_chat"
95
+ "byzerllm",
96
+ "deploy",
97
+ "--pretrained_model_type",
98
+ "saas/openai",
99
+ "--cpus_per_worker",
100
+ "0.001",
101
+ "--gpus_per_worker",
102
+ "0",
103
+ "--worker_concurrency",
104
+ "1000",
105
+ "--num_workers",
106
+ "1",
107
+ "--infer_params",
108
+ f"saas.base_url=https://api.deepseek.com/v1 saas.api_key={api_key} saas.model=deepseek-chat",
109
+ "--model",
110
+ "deepseek_chat",
103
111
  ]
104
112
 
105
113
  try:
@@ -138,7 +146,9 @@ def main(input_args: Optional[List[str]] = None):
138
146
 
139
147
  # Serve command
140
148
  serve_parser = subparsers.add_parser("serve", help="Start the RAG server")
141
- serve_parser.add_argument("--quick", action="store_true", help="Skip system initialization")
149
+ serve_parser.add_argument(
150
+ "--quick", action="store_true", help="Skip system initialization"
151
+ )
142
152
  serve_parser.add_argument("--file", default="", help=desc["file"])
143
153
  serve_parser.add_argument("--model", default="deepseek_chat", help=desc["model"])
144
154
  serve_parser.add_argument("--index_model", default="", help=desc["index_model"])
@@ -160,7 +170,19 @@ def main(input_args: Optional[List[str]] = None):
160
170
  "--rag_context_window_limit",
161
171
  type=int,
162
172
  default=110000,
163
- help="",
173
+ help="The input context window limit for RAG",
174
+ )
175
+ serve_parser.add_argument(
176
+ "--full_text_ratio",
177
+ type=float,
178
+ default=0.7,
179
+ help="The ratio of full text area in the input context window (0.0 to 1.0)",
180
+ )
181
+ serve_parser.add_argument(
182
+ "--segment_ratio",
183
+ type=float,
184
+ default=0.2,
185
+ help="The ratio of segment area in the input context window (0.0 to 1.0)",
164
186
  )
165
187
  serve_parser.add_argument(
166
188
  "--required_exts", default="", help=desc["doc_build_parse_required_exts"]
@@ -198,6 +220,12 @@ def main(input_args: Optional[List[str]] = None):
198
220
  help="Monitor mode for the doc update",
199
221
  )
200
222
 
223
+ serve_parser.add_argument(
224
+ "--disable_auto_window",
225
+ action="store_true",
226
+ help="Disable automatic window adaptation for documents",
227
+ )
228
+
201
229
  # Tools command
202
230
  tools_parser = subparsers.add_parser("tools", help="Various tools")
203
231
  tools_subparsers = tools_parser.add_subparsers(dest="tool", help="Available tools")
@@ -255,7 +283,7 @@ def main(input_args: Optional[List[str]] = None):
255
283
 
256
284
  def count_tokens(tokenizer_path: str, file_path: str):
257
285
  token_counter = TokenCounter(tokenizer_path)
258
- source_codes = process_file3(file_path)
286
+ source_codes = process_file_local(file_path)
259
287
 
260
288
  console = Console()
261
289
  table = Table(title="Token Count Results")
@@ -11,6 +11,8 @@ class SourceCode(pydantic.BaseModel):
11
11
  module_name: str
12
12
  source_code: str
13
13
  tag: str = ""
14
+ tokens: int = -1
15
+ metadata: Dict[str, Any] = {}
14
16
 
15
17
 
16
18
  class TranslateReadme(pydantic.BaseModel):
@@ -281,7 +283,8 @@ class AutoCoderArgs(pydantic.BaseModel):
281
283
  doc_command: Optional[str] = None
282
284
  required_exts: Optional[str] = None
283
285
 
284
- monitor_mode: Optional[bool] = False
286
+ monitor_mode: bool = False
287
+ disable_auto_window: bool = False
285
288
 
286
289
  description: Optional[str] = ""
287
290
  skip_confirm: Optional[bool] = False
@@ -304,5 +307,9 @@ class AutoCoderArgs(pydantic.BaseModel):
304
307
 
305
308
  agent_designer_mode: Optional[str] = "svg"
306
309
 
310
+ full_text_ratio: Optional[float] = 0.7
311
+ segment_ratio: Optional[float] = 0.2
312
+ buff_ratio: Optional[float] = 0.1
313
+
307
314
  class Config:
308
315
  protected_namespaces = ()
@@ -187,7 +187,11 @@ class PyProject:
187
187
 
188
188
  def convert_to_source_code(self, file_path):
189
189
  module_name = file_path
190
- source_code = self.read_file_content(file_path)
190
+ try:
191
+ source_code = self.read_file_content(file_path)
192
+ except Exception as e:
193
+ logger.warning(f"Failed to read file: {file_path}. Error: {str(e)}")
194
+ return None
191
195
  return SourceCode(module_name=module_name, source_code=source_code)
192
196
 
193
197
  def get_package_source_codes(
@@ -18,10 +18,15 @@ from loguru import logger
18
18
  from pydantic import BaseModel
19
19
 
20
20
  from autocoder.common import SourceCode
21
- from autocoder.rag.loaders import (extract_text_from_docx,
22
- extract_text_from_excel,
23
- extract_text_from_pdf,
24
- extract_text_from_ppt)
21
+ from autocoder.rag.loaders import (
22
+ extract_text_from_docx,
23
+ extract_text_from_excel,
24
+ extract_text_from_pdf,
25
+ extract_text_from_ppt,
26
+ )
27
+ from autocoder.rag import variable_holder
28
+ from autocoder.rag.token_counter import count_tokens_worker, count_tokens
29
+ from uuid import uuid4
25
30
 
26
31
  cache_lock = threading.Lock()
27
32
 
@@ -34,72 +39,62 @@ class AddOrUpdateEvent(BaseModel):
34
39
  file_infos: List[Tuple[str, str, float]]
35
40
 
36
41
 
37
- @ray.remote
38
- def process_file(file_info: Tuple[str, str, float]) -> List[SourceCode]:
42
+ def process_file_in_multi_process(
43
+ file_info: Tuple[str, str, float]
44
+ ) -> List[SourceCode]:
39
45
  start_time = time.time()
40
46
  file_path, relative_path, _ = file_info
41
47
  try:
42
48
  if file_path.endswith(".pdf"):
43
49
  with open(file_path, "rb") as f:
44
50
  content = extract_text_from_pdf(f.read())
45
- v = [SourceCode(module_name=file_path, source_code=content)]
46
- elif file_path.endswith(".docx"):
47
- with open(file_path, "rb") as f:
48
- content = extract_text_from_docx(f.read())
49
- v = [SourceCode(module_name=f"##File: {file_path}", source_code=content)]
50
- elif file_path.endswith(".xlsx") or file_path.endswith(".xls"):
51
- sheets = extract_text_from_excel(file_path)
52
51
  v = [
53
52
  SourceCode(
54
- module_name=f"##File: {file_path}#{sheet[0]}",
55
- source_code=sheet[1],
53
+ module_name=file_path,
54
+ source_code=content,
55
+ tokens=count_tokens_worker(content),
56
56
  )
57
- for sheet in sheets
58
57
  ]
59
- elif file_path.endswith(".pptx"):
60
- slides = extract_text_from_ppt(file_path)
61
- content = "".join(f"#{slide[0]}\n{slide[1]}\n\n" for slide in slides)
62
- v = [SourceCode(module_name=f"##File: {file_path}", source_code=content)]
63
- else:
64
- with open(file_path, "r", encoding="utf-8") as f:
65
- content = f.read()
66
- v = [SourceCode(module_name=f"##File: {file_path}", source_code=content)]
67
- logger.info(f"Load file {file_path} in {time.time() - start_time}")
68
- return v
69
- except Exception as e:
70
- logger.error(f"Error processing file {file_path}: {str(e)}")
71
- return []
72
-
73
-
74
- def process_file2(file_info: Tuple[str, str, float]) -> List[SourceCode]:
75
- start_time = time.time()
76
- file_path, relative_path, _ = file_info
77
- try:
78
- if file_path.endswith(".pdf"):
79
- with open(file_path, "rb") as f:
80
- content = extract_text_from_pdf(f.read())
81
- v = [SourceCode(module_name=file_path, source_code=content)]
82
58
  elif file_path.endswith(".docx"):
83
59
  with open(file_path, "rb") as f:
84
60
  content = extract_text_from_docx(f.read())
85
- v = [SourceCode(module_name=f"##File: {file_path}", source_code=content)]
61
+ v = [
62
+ SourceCode(
63
+ module_name=f"##File: {file_path}",
64
+ source_code=content,
65
+ tokens=count_tokens_worker(content),
66
+ )
67
+ ]
86
68
  elif file_path.endswith(".xlsx") or file_path.endswith(".xls"):
87
69
  sheets = extract_text_from_excel(file_path)
88
70
  v = [
89
71
  SourceCode(
90
72
  module_name=f"##File: {file_path}#{sheet[0]}",
91
73
  source_code=sheet[1],
74
+ tokens=count_tokens_worker(sheet[1]),
92
75
  )
93
76
  for sheet in sheets
94
77
  ]
95
78
  elif file_path.endswith(".pptx"):
96
79
  slides = extract_text_from_ppt(file_path)
97
80
  content = "".join(f"#{slide[0]}\n{slide[1]}\n\n" for slide in slides)
98
- v = [SourceCode(module_name=f"##File: {file_path}", source_code=content)]
81
+ v = [
82
+ SourceCode(
83
+ module_name=f"##File: {file_path}",
84
+ source_code=content,
85
+ tokens=count_tokens_worker(content),
86
+ )
87
+ ]
99
88
  else:
100
89
  with open(file_path, "r", encoding="utf-8") as f:
101
90
  content = f.read()
102
- v = [SourceCode(module_name=f"##File: {file_path}", source_code=content)]
91
+ v = [
92
+ SourceCode(
93
+ module_name=f"##File: {file_path}",
94
+ source_code=content,
95
+ tokens=count_tokens_worker(content),
96
+ )
97
+ ]
103
98
  logger.info(f"Load file {file_path} in {time.time() - start_time}")
104
99
  return v
105
100
  except Exception as e:
@@ -107,34 +102,59 @@ def process_file2(file_info: Tuple[str, str, float]) -> List[SourceCode]:
107
102
  return []
108
103
 
109
104
 
110
- def process_file3(file_path: str) -> List[SourceCode]:
105
+ def process_file_local(file_path: str) -> List[SourceCode]:
111
106
  start_time = time.time()
112
107
  try:
113
108
  if file_path.endswith(".pdf"):
114
109
  with open(file_path, "rb") as f:
115
110
  content = extract_text_from_pdf(f.read())
116
- v = [SourceCode(module_name=file_path, source_code=content)]
111
+ v = [
112
+ SourceCode(
113
+ module_name=file_path,
114
+ source_code=content,
115
+ tokens=count_tokens(content),
116
+ )
117
+ ]
117
118
  elif file_path.endswith(".docx"):
118
119
  with open(file_path, "rb") as f:
119
120
  content = extract_text_from_docx(f.read())
120
- v = [SourceCode(module_name=f"##File: {file_path}", source_code=content)]
121
+ v = [
122
+ SourceCode(
123
+ module_name=f"##File: {file_path}",
124
+ source_code=content,
125
+ tokens=count_tokens(content),
126
+ )
127
+ ]
121
128
  elif file_path.endswith(".xlsx") or file_path.endswith(".xls"):
122
129
  sheets = extract_text_from_excel(file_path)
123
130
  v = [
124
131
  SourceCode(
125
132
  module_name=f"##File: {file_path}#{sheet[0]}",
126
133
  source_code=sheet[1],
134
+ tokens=count_tokens(sheet[1]),
127
135
  )
128
136
  for sheet in sheets
129
137
  ]
130
138
  elif file_path.endswith(".pptx"):
131
139
  slides = extract_text_from_ppt(file_path)
132
140
  content = "".join(f"#{slide[0]}\n{slide[1]}\n\n" for slide in slides)
133
- v = [SourceCode(module_name=f"##File: {file_path}", source_code=content)]
141
+ v = [
142
+ SourceCode(
143
+ module_name=f"##File: {file_path}",
144
+ source_code=content,
145
+ tokens=count_tokens(content),
146
+ )
147
+ ]
134
148
  else:
135
149
  with open(file_path, "r", encoding="utf-8") as f:
136
150
  content = f.read()
137
- v = [SourceCode(module_name=f"##File: {file_path}", source_code=content)]
151
+ v = [
152
+ SourceCode(
153
+ module_name=f"##File: {file_path}",
154
+ source_code=content,
155
+ tokens=count_tokens(content),
156
+ )
157
+ ]
138
158
  logger.info(f"Load file {file_path} in {time.time() - start_time}")
139
159
  return v
140
160
  except Exception as e:
@@ -205,7 +225,7 @@ class AutoCoderRAGDocListener:
205
225
  self.update_cache(item)
206
226
 
207
227
  def update_cache(self, file_path):
208
- source_code = process_file3(file_path)
228
+ source_code = process_file_local(file_path)
209
229
  self.cache[file_path] = {
210
230
  "file_path": file_path,
211
231
  "content": [c.model_dump() for c in source_code],
@@ -220,7 +240,9 @@ class AutoCoderRAGDocListener:
220
240
 
221
241
  def open_watch(self):
222
242
  logger.info(f"start monitor: {self.path}...")
223
- for changes in watch(self.path, watch_filter=self.file_filter, stop_event=self.stop_event):
243
+ for changes in watch(
244
+ self.path, watch_filter=self.file_filter, stop_event=self.stop_event
245
+ ):
224
246
  for change in changes:
225
247
  (action, path) = change
226
248
  if action == Change.added or action == Change.modified:
@@ -290,7 +312,6 @@ class AutoCoderRAGAsyncUpdateQueue:
290
312
  self.thread.start()
291
313
  self.cache = self.read_cache()
292
314
 
293
-
294
315
  def _process_queue(self):
295
316
  while not self.stop_event.is_set():
296
317
  try:
@@ -324,8 +345,14 @@ class AutoCoderRAGAsyncUpdateQueue:
324
345
  # results = ray.get(
325
346
  # [process_file.remote(file_info) for file_info in files_to_process]
326
347
  # )
327
- with Pool(processes=os.cpu_count()) as pool:
328
- results = pool.map(process_file2, files_to_process)
348
+ from autocoder.rag.token_counter import initialize_tokenizer
349
+
350
+ with Pool(
351
+ processes=os.cpu_count(),
352
+ initializer=initialize_tokenizer,
353
+ initargs=(variable_holder.TOKENIZER_PATH,),
354
+ ) as pool:
355
+ results = pool.map(process_file_in_multi_process, files_to_process)
329
356
 
330
357
  for file_info, result in zip(files_to_process, results):
331
358
  self.update_cache(file_info, result)
@@ -365,7 +392,7 @@ class AutoCoderRAGAsyncUpdateQueue:
365
392
  elif isinstance(file_list, AddOrUpdateEvent):
366
393
  for file_info in file_list.file_infos:
367
394
  logger.info(f"{file_info[0]} is detected to be updated")
368
- result = process_file2(file_info)
395
+ result = process_file_local(file_info)
369
396
  self.update_cache(file_info, result)
370
397
 
371
398
  self.write_cache()
@@ -410,7 +437,9 @@ class AutoCoderRAGAsyncUpdateQueue:
410
437
  # 释放文件锁
411
438
  fcntl.flock(lockf, fcntl.LOCK_UN)
412
439
 
413
- def update_cache(self, file_info: Tuple[str, str, float], content: List[SourceCode]):
440
+ def update_cache(
441
+ self, file_info: Tuple[str, str, float], content: List[SourceCode]
442
+ ):
414
443
  file_path, relative_path, modify_time = file_info
415
444
  self.cache[file_path] = {
416
445
  "file_path": file_path,
@@ -485,11 +514,20 @@ class DocumentRetriever:
485
514
  required_exts: list,
486
515
  on_ray: bool = False,
487
516
  monitor_mode: bool = False,
517
+ single_file_token_limit: int = 60000,
518
+ disable_auto_window: bool = False,
488
519
  ) -> None:
489
520
  self.path = path
490
521
  self.ignore_spec = ignore_spec
491
522
  self.required_exts = required_exts
492
523
  self.monitor_mode = monitor_mode
524
+ self.single_file_token_limit = single_file_token_limit
525
+ self.disable_auto_window = disable_auto_window
526
+
527
+ # 多小的文件会被合并
528
+ self.small_file_token_limit = self.single_file_token_limit / 4
529
+ # 合并后的最大文件大小
530
+ self.small_file_merge_limit = self.single_file_token_limit / 2
493
531
 
494
532
  self.on_ray = on_ray
495
533
  if self.on_ray:
@@ -502,6 +540,13 @@ class DocumentRetriever:
502
540
  path, ignore_spec, required_exts
503
541
  )
504
542
 
543
+ logger.info(f"DocumentRetriever initialized with:")
544
+ logger.info(f" Path: {self.path}")
545
+ logger.info(f" Diable auto window: {self.disable_auto_window} ")
546
+ logger.info(f" Single file token limit: {self.single_file_token_limit}")
547
+ logger.info(f" Small file token limit: {self.small_file_token_limit}")
548
+ logger.info(f" Small file merge limit: {self.small_file_merge_limit}")
549
+
505
550
  def get_cache(self):
506
551
  if self.on_ray:
507
552
  return ray.get(self.cacher.get_cache.remote())
@@ -509,6 +554,102 @@ class DocumentRetriever:
509
554
  return self.cacher.get_cache()
510
555
 
511
556
  def retrieve_documents(self) -> Generator[SourceCode, None, None]:
557
+ logger.info("Starting document retrieval process")
558
+ waiting_list = []
559
+ waiting_tokens = 0
512
560
  for _, data in self.get_cache().items():
513
561
  for source_code in data["content"]:
514
- yield SourceCode.model_validate(source_code)
562
+ doc = SourceCode.model_validate(source_code)
563
+ if self.disable_auto_window:
564
+ yield doc
565
+ else:
566
+ if doc.tokens <= 0:
567
+ yield doc
568
+ elif doc.tokens < self.small_file_token_limit:
569
+ waiting_list, waiting_tokens = self._add_to_waiting_list(
570
+ doc, waiting_list, waiting_tokens
571
+ )
572
+ if waiting_tokens >= self.small_file_merge_limit:
573
+ yield from self._process_waiting_list(waiting_list)
574
+ waiting_list = []
575
+ waiting_tokens = 0
576
+ elif doc.tokens > self.single_file_token_limit:
577
+ yield from self._split_large_document(doc)
578
+ else:
579
+ yield doc
580
+ if waiting_list and not self.disable_auto_window:
581
+ yield from self._process_waiting_list(waiting_list)
582
+
583
+ logger.info("Document retrieval process completed")
584
+
585
+ def _add_to_waiting_list(
586
+ self, doc: SourceCode, waiting_list: List[SourceCode], waiting_tokens: int
587
+ ) -> Tuple[List[SourceCode], int]:
588
+ waiting_list.append(doc)
589
+ return waiting_list, waiting_tokens + doc.tokens
590
+
591
+ def _process_waiting_list(
592
+ self, waiting_list: List[SourceCode]
593
+ ) -> Generator[SourceCode, None, None]:
594
+ if len(waiting_list) == 1:
595
+ yield waiting_list[0]
596
+ elif len(waiting_list) > 1:
597
+ yield self._merge_documents(waiting_list)
598
+
599
+ def _merge_documents(self, docs: List[SourceCode]) -> SourceCode:
600
+ merged_content = "\n".join(
601
+ [f"#File: {doc.module_name}\n{doc.source_code}" for doc in docs]
602
+ )
603
+ merged_tokens = sum([doc.tokens for doc in docs])
604
+ merged_name = f"Merged_{len(docs)}_docs_{str(uuid4())}"
605
+ logger.info(
606
+ f"Merged {len(docs)} documents into {merged_name} (tokens: {merged_tokens})."
607
+ )
608
+ return SourceCode(
609
+ module_name=merged_name,
610
+ source_code=merged_content,
611
+ tokens=merged_tokens,
612
+ metadata={"original_docs": [doc.module_name for doc in docs]},
613
+ )
614
+
615
+ def _split_large_document(
616
+ self, doc: SourceCode
617
+ ) -> Generator[SourceCode, None, None]:
618
+ chunk_size = self.single_file_token_limit
619
+ total_chunks = (doc.tokens + chunk_size - 1) // chunk_size
620
+ logger.info(f"Splitting document {doc.module_name} into {total_chunks} chunks")
621
+ for i in range(0, doc.tokens, chunk_size):
622
+ chunk_content = doc.source_code[i : i + chunk_size]
623
+ chunk_tokens = min(chunk_size, doc.tokens - i)
624
+ chunk_name = f"{doc.module_name}#chunk{i//chunk_size+1}"
625
+ # logger.debug(f" Created chunk: {chunk_name} (tokens: {chunk_tokens})")
626
+ yield SourceCode(
627
+ module_name=chunk_name,
628
+ source_code=chunk_content,
629
+ tokens=chunk_tokens,
630
+ metadata={
631
+ "original_doc": doc.module_name,
632
+ "chunk_index": i // chunk_size + 1,
633
+ },
634
+ )
635
+
636
+ def _split_document(
637
+ self, doc: SourceCode, token_limit: int
638
+ ) -> Generator[SourceCode, None, None]:
639
+ remaining_tokens = doc.tokens
640
+ chunk_number = 1
641
+ start_index = 0
642
+
643
+ while remaining_tokens > 0:
644
+ end_index = start_index + token_limit
645
+ chunk_content = doc.source_code[start_index:end_index]
646
+ chunk_tokens = min(token_limit, remaining_tokens)
647
+
648
+ chunk_name = f"{doc.module_name}#{chunk_number:06d}"
649
+ yield SourceCode(
650
+ module_name=chunk_name, source_code=chunk_content, tokens=chunk_tokens
651
+ )
652
+
653
+ start_index = end_index
654
+ remaining_tokens -= chunk_tokens
655
+ chunk_number += 1
@@ -13,16 +13,22 @@ from openai import OpenAI
13
13
  from rich.console import Console
14
14
  from rich.panel import Panel
15
15
  from rich.table import Table
16
- from rich.text import Text
16
+ import statistics
17
17
 
18
18
  from autocoder.common import AutoCoderArgs, SourceCode
19
19
  from autocoder.rag.doc_filter import DocFilter
20
20
  from autocoder.rag.document_retriever import DocumentRetriever
21
- from autocoder.rag.relevant_utils import (DocRelevance, FilterDoc, TaskTiming,
22
- parse_relevance)
21
+ from autocoder.rag.relevant_utils import (
22
+ DocRelevance,
23
+ FilterDoc,
24
+ TaskTiming,
25
+ parse_relevance,
26
+ )
23
27
  from autocoder.rag.token_checker import check_token_limit
24
28
  from autocoder.rag.token_counter import RemoteTokenCounter, TokenCounter
25
29
  from autocoder.rag.token_limiter import TokenLimiter
30
+ from tokenizers import Tokenizer
31
+ from autocoder.rag import variable_holder
26
32
 
27
33
 
28
34
  class LongContextRAG:
@@ -44,11 +50,26 @@ class LongContextRAG:
44
50
  self.path = path
45
51
  self.relevant_score = self.args.rag_doc_filter_relevance or 5
46
52
 
53
+ self.full_text_ratio = args.full_text_ratio
54
+ self.segment_ratio = args.segment_ratio
55
+ self.buff_ratio = 1 - self.full_text_ratio - self.segment_ratio
56
+
57
+ if self.buff_ratio < 0:
58
+ raise ValueError(
59
+ "The sum of full_text_ratio and segment_ratio must be less than or equal to 1.0"
60
+ )
61
+
62
+ self.full_text_limit = int(args.rag_context_window_limit * self.full_text_ratio)
63
+ self.segment_limit = int(args.rag_context_window_limit * self.segment_ratio)
64
+ self.buff_limit = int(args.rag_context_window_limit * self.buff_ratio)
65
+
47
66
  self.tokenizer = None
48
67
  self.tokenizer_path = tokenizer_path
49
68
  self.on_ray = False
50
69
 
51
70
  if self.tokenizer_path:
71
+ variable_holder.TOKENIZER_PATH = self.tokenizer_path
72
+ variable_holder.TOKENIZER_MODEL = Tokenizer.from_file(self.tokenizer_path)
52
73
  self.tokenizer = TokenCounter(self.tokenizer_path)
53
74
  else:
54
75
  if llm.is_model_exist("deepseek_tokenizer"):
@@ -96,24 +117,41 @@ class LongContextRAG:
96
117
  self.required_exts,
97
118
  self.on_ray,
98
119
  self.monitor_mode,
120
+ ## 确保全文区至少能放下一个文件
121
+ single_file_token_limit=self.full_text_limit - 100,
122
+ disable_auto_window=self.args.disable_auto_window
99
123
  )
100
124
 
101
125
  self.doc_filter = DocFilter(
102
126
  self.index_model, self.args, on_ray=self.on_ray, path=self.path
103
127
  )
104
-
105
- # 检查当前目录下所有文件是否超过 120k tokens ,并且打印出来
106
- self.token_exceed_files = []
107
- if self.tokenizer is not None:
108
- self.token_exceed_files = check_token_limit(
109
- count_tokens=self.count_tokens,
110
- token_limit=self.token_limit,
111
- retrieve_documents=self._retrieve_documents,
112
- max_workers=self.args.index_filter_workers or 5,
113
- )
128
+
129
+ doc_num = 0
130
+ token_num = 0
131
+ token_counts = []
132
+ for doc in self._retrieve_documents():
133
+ doc_num += 1
134
+ doc_tokens = doc.tokens
135
+ token_num += doc_tokens
136
+ token_counts.append(doc_tokens)
137
+
138
+ avg_tokens = statistics.mean(token_counts) if token_counts else 0
139
+ median_tokens = statistics.median(token_counts) if token_counts else 0
114
140
 
115
141
  logger.info(
116
- f"Tokenizer path: {self.tokenizer_path} relevant_score: {self.relevant_score} token_limit: {self.token_limit}"
142
+ "RAG Configuration:\n"
143
+ f" Total docs: {doc_num}\n"
144
+ f" Total tokens: {token_num}\n"
145
+ f" Tokenizer path: {self.tokenizer_path}\n"
146
+ f" Relevant score: {self.relevant_score}\n"
147
+ f" Token limit: {self.token_limit}\n"
148
+ f" Full text limit: {self.full_text_limit}\n"
149
+ f" Segment limit: {self.segment_limit}\n"
150
+ f" Buff limit: {self.buff_limit}\n"
151
+ f" Max doc tokens: {max(token_counts) if token_counts else 0}\n"
152
+ f" Min doc tokens: {min(token_counts) if token_counts else 0}\n"
153
+ f" Avg doc tokens: {avg_tokens:.2f}\n"
154
+ f" Median doc tokens: {median_tokens:.2f}\n"
117
155
  )
118
156
 
119
157
  def count_tokens(self, text: str) -> int:
@@ -350,9 +388,15 @@ class LongContextRAG:
350
388
  query_table.add_row("Relevant docs", str(len(relevant_docs)))
351
389
 
352
390
  # Add relevant docs information
353
- relevant_docs_info = "\n".join(
354
- [f"- {doc.module_name}" for doc in relevant_docs]
355
- )
391
+ relevant_docs_info = []
392
+ for doc in relevant_docs:
393
+ info = f"- {doc.module_name.replace(self.path,'',1)}"
394
+ if 'original_docs' in doc.metadata:
395
+ original_docs = ", ".join([doc.replace(self.path,"",1) for doc in doc.metadata['original_docs']])
396
+ info += f" (Original docs: {original_docs})"
397
+ relevant_docs_info.append(info)
398
+
399
+ relevant_docs_info = "\n".join(relevant_docs_info)
356
400
  query_table.add_row("Relevant docs list", relevant_docs_info)
357
401
 
358
402
  first_round_full_docs = []
@@ -363,7 +407,9 @@ class LongContextRAG:
363
407
 
364
408
  token_limiter = TokenLimiter(
365
409
  count_tokens=self.count_tokens,
366
- token_limit=self.token_limit,
410
+ full_text_limit=self.full_text_limit,
411
+ segment_limit=self.segment_limit,
412
+ buff_limit=self.buff_limit,
367
413
  llm=self.llm,
368
414
  )
369
415
  final_relevant_docs = token_limiter.limit_tokens(
@@ -395,9 +441,18 @@ class LongContextRAG:
395
441
  )
396
442
 
397
443
  # Add relevant docs information
398
- final_relevant_docs_info = "\n".join(
399
- [f"- {doc.module_name}" for doc in relevant_docs]
400
- )
444
+ final_relevant_docs_info = []
445
+ for doc in relevant_docs:
446
+ info = f"- {doc.module_name.replace(self.path,'',1)}"
447
+ if 'original_docs' in doc.metadata:
448
+ original_docs = ", ".join([doc.replace(self.path,"",1) for doc in doc.metadata['original_docs']])
449
+ info += f" (Original docs: {original_docs})"
450
+ if "chunk_ranges" in doc.metadata:
451
+ chunk_ranges = json.dumps(doc.metadata['chunk_ranges'],ensure_ascii=False)
452
+ info += f" (Chunk ranges: {chunk_ranges})"
453
+ final_relevant_docs_info.append(info)
454
+
455
+ final_relevant_docs_info = "\n".join(final_relevant_docs_info)
401
456
  query_table.add_row("Final Relevant docs list", final_relevant_docs_info)
402
457
 
403
458
  # Create a panel to contain the table
@@ -409,8 +464,10 @@ class LongContextRAG:
409
464
 
410
465
  # Log the panel using rich
411
466
  console.print(panel)
412
-
413
- logger.info(f"Start to send to model {model}")
467
+
468
+ request_tokens = sum([doc.tokens for doc in relevant_docs])
469
+ target_model = model or self.llm.default_model_name
470
+ logger.info(f"Start to send to model {target_model} with {request_tokens} tokens")
414
471
 
415
472
  new_conversations = conversations[:-1] + [
416
473
  {
@@ -2,29 +2,46 @@ import time
2
2
  from loguru import logger
3
3
  from tokenizers import Tokenizer
4
4
  from multiprocessing import Pool, cpu_count
5
+ from autocoder.rag.variable_holder import TOKENIZER_MODEL
6
+
5
7
 
6
8
  class RemoteTokenCounter:
7
- def __init__(self,tokenizer) -> None:
9
+ def __init__(self, tokenizer) -> None:
8
10
  self.tokenizer = tokenizer
9
11
 
10
- def count_tokens(self, text: str) -> int:
11
- try:
12
+ def count_tokens(self, text: str) -> int:
13
+ try:
12
14
  v = self.tokenizer.chat_oai(
13
15
  conversations=[{"role": "user", "content": text}]
14
- )
16
+ )
15
17
  return int(v[0].output)
16
18
  except Exception as e:
17
19
  logger.error(f"Error counting tokens: {str(e)}")
18
20
  return -1
19
-
21
+
22
+
20
23
  def initialize_tokenizer(tokenizer_path):
21
- global tokenizer_model
24
+ global tokenizer_model
22
25
  tokenizer_model = Tokenizer.from_file(tokenizer_path)
23
26
 
27
+
28
+ def count_tokens(text: str) -> int:
29
+ try:
30
+ # start_time = time.time_ns()
31
+ encoded = TOKENIZER_MODEL.encode('{"role":"user","content":"' + text + '"}')
32
+ v = len(encoded.ids)
33
+ # elapsed_time = time.time_ns() - start_time
34
+ # logger.info(f"Token counting took {elapsed_time/1000000} ms")
35
+ return v
36
+ except Exception as e:
37
+ logger.error(f"Error counting tokens: {str(e)}")
38
+ return -1
39
+
40
+
24
41
  def count_tokens_worker(text: str) -> int:
25
42
  try:
26
43
  # start_time = time.time_ns()
27
- encoded = tokenizer_model.encode('{"role":"user","content":"'+text+'"}')
44
+ encoded = tokenizer_model.encode('{"role":"user","content":"' + text + '"}')
28
45
  v = len(encoded.ids)
29
46
  # elapsed_time = time.time_ns() - start_time
30
47
  # logger.info(f"Token counting took {elapsed_time/1000000} ms")
@@ -33,11 +50,16 @@ def count_tokens_worker(text: str) -> int:
33
50
  logger.error(f"Error counting tokens: {str(e)}")
34
51
  return -1
35
52
 
53
+
36
54
  class TokenCounter:
37
55
  def __init__(self, tokenizer_path: str):
38
56
  self.tokenizer_path = tokenizer_path
39
57
  self.num_processes = cpu_count() - 1 if cpu_count() > 1 else 1
40
- self.pool = Pool(processes=self.num_processes, initializer=initialize_tokenizer, initargs=(self.tokenizer_path,))
58
+ self.pool = Pool(
59
+ processes=self.num_processes,
60
+ initializer=initialize_tokenizer,
61
+ initargs=(self.tokenizer_path,),
62
+ )
41
63
 
42
64
  def count_tokens(self, text: str) -> int:
43
- return self.pool.apply(count_tokens_worker, (text,))
65
+ return self.pool.apply(count_tokens_worker, (text,))
@@ -13,11 +13,15 @@ class TokenLimiter:
13
13
  def __init__(
14
14
  self,
15
15
  count_tokens: Callable[[str], int],
16
- token_limit: int,
16
+ full_text_limit: int,
17
+ segment_limit: int,
18
+ buff_limit: int,
17
19
  llm,
18
20
  ):
19
21
  self.count_tokens = count_tokens
20
- self.token_limit = token_limit
22
+ self.full_text_limit = full_text_limit
23
+ self.segment_limit = segment_limit
24
+ self.buff_limit = buff_limit
21
25
  self.llm = llm
22
26
  self.first_round_full_docs = []
23
27
  self.second_round_extracted_docs = []
@@ -88,19 +92,22 @@ class TokenLimiter:
88
92
  final_relevant_docs = []
89
93
  token_count = 0
90
94
  doc_num_count = 0
95
+
96
+ ## 非窗口分区实现
91
97
  for doc in relevant_docs:
92
98
  doc_tokens = self.count_tokens(doc.source_code)
93
99
  doc_num_count += 1
94
- if token_count + doc_tokens <= self.token_limit:
100
+ if token_count + doc_tokens <= self.full_text_limit + self.segment_limit:
95
101
  final_relevant_docs.append(doc)
96
102
  token_count += doc_tokens
97
103
  else:
98
104
  break
99
105
 
106
+ ## 如果窗口无法放下所有的相关文档,则需要分区
100
107
  if len(final_relevant_docs) < len(relevant_docs):
101
-
108
+ ## 先填充full_text分区
102
109
  token_count = 0
103
- new_token_limit = self.token_limit * 0.8
110
+ new_token_limit = self.full_text_limit
104
111
  doc_num_count = 0
105
112
  for doc in relevant_docs:
106
113
  doc_tokens = self.count_tokens(doc.source_code)
@@ -111,8 +118,18 @@ class TokenLimiter:
111
118
  else:
112
119
  break
113
120
 
121
+ if len(self.first_round_full_docs) > 0:
122
+ remaining_tokens = (
123
+ self.full_text_limit + self.segment_limit - token_count
124
+ )
125
+ else:
126
+ logger.warning(
127
+ "Full text area is empty, this is may caused by the single doc is too long"
128
+ )
129
+ remaining_tokens = self.full_text_limit + self.segment_limit
130
+
131
+ ## 继续填充segment分区
114
132
  sencond_round_start_time = time.time()
115
- remaining_tokens = self.token_limit - new_token_limit
116
133
  remaining_docs = relevant_docs[len(self.first_round_full_docs) :]
117
134
  logger.info(
118
135
  f"first round docs: {len(self.first_round_full_docs)} remaining docs: {len(remaining_docs)} index_filter_workers: {index_filter_workers}"
@@ -130,7 +147,7 @@ class TokenLimiter:
130
147
  result = future.result()
131
148
  if result and remaining_tokens > 0:
132
149
  self.second_round_extracted_docs.append(result)
133
- tokens = self.count_tokens(result.source_code)
150
+ tokens = result.tokens
134
151
  if tokens > 0:
135
152
  remaining_tokens -= tokens
136
153
  else:
@@ -184,7 +201,13 @@ class TokenLimiter:
184
201
  content += chunk + "\n"
185
202
 
186
203
  return SourceCode(
187
- module_name=doc.module_name, source_code=content.strip()
204
+ module_name=doc.module_name,
205
+ source_code=content.strip(),
206
+ tokens=self.count_tokens(content),
207
+ metadata={
208
+ "original_doc": doc.module_name,
209
+ "chunk_ranges": json_objs,
210
+ },
188
211
  )
189
212
  except Exception as e:
190
213
  if attempt < max_retries - 1:
@@ -196,5 +219,7 @@ class TokenLimiter:
196
219
  f"Failed to process doc {doc.module_name} after {max_retries} attempts: {str(e)}"
197
220
  )
198
221
  return SourceCode(
199
- module_name=doc.module_name, source_code=content.strip()
222
+ module_name=doc.module_name,
223
+ source_code="",
224
+ tokens= 0
200
225
  )
@@ -0,0 +1,2 @@
1
+ TOKENIZER_PATH = None
2
+ TOKENIZER_MODEL = None
@@ -121,7 +121,11 @@ class SuffixProject:
121
121
 
122
122
  def convert_to_source_code(self, file_path):
123
123
  module_name = file_path
124
- source_code = self.read_file_content(file_path)
124
+ try:
125
+ source_code = self.read_file_content(file_path)
126
+ except Exception as e:
127
+ logger.warning(f"Failed to read file: {file_path}. Error: {str(e)}")
128
+ return None
125
129
  return SourceCode(module_name=module_name, source_code=source_code)
126
130
 
127
131
  def get_source_codes(self) -> Generator[SourceCode, None, None]:
@@ -152,7 +152,11 @@ class TSProject:
152
152
  return None
153
153
 
154
154
  module_name = file_path
155
- source_code = self.read_file_content(file_path)
155
+ try:
156
+ source_code = self.read_file_content(file_path)
157
+ except Exception as e:
158
+ logger.warning(f"Failed to read file: {file_path}. Error: {str(e)}")
159
+ return None
156
160
 
157
161
  if not FileUtils.has_sufficient_content(source_code, min_line_count=1):
158
162
  return None
autocoder/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.1.173"
1
+ __version__ = "0.1.175"