auto-coder 0.1.173__py3-none-any.whl → 0.1.176__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of auto-coder might be problematic.
- {auto_coder-0.1.173.dist-info → auto_coder-0.1.176.dist-info}/METADATA +1 -1
- {auto_coder-0.1.173.dist-info → auto_coder-0.1.176.dist-info}/RECORD +18 -17
- autocoder/auto_coder.py +8 -9
- autocoder/auto_coder_rag.py +46 -13
- autocoder/common/__init__.py +11 -3
- autocoder/pyproject/__init__.py +5 -1
- autocoder/rag/document_retriever.py +196 -55
- autocoder/rag/long_context_rag.py +81 -23
- autocoder/rag/token_counter.py +31 -9
- autocoder/rag/token_limiter.py +66 -13
- autocoder/rag/variable_holder.py +2 -0
- autocoder/suffixproject/__init__.py +5 -1
- autocoder/tsproject/__init__.py +5 -1
- autocoder/version.py +1 -1
- {auto_coder-0.1.173.dist-info → auto_coder-0.1.176.dist-info}/LICENSE +0 -0
- {auto_coder-0.1.173.dist-info → auto_coder-0.1.176.dist-info}/WHEEL +0 -0
- {auto_coder-0.1.173.dist-info → auto_coder-0.1.176.dist-info}/entry_points.txt +0 -0
- {auto_coder-0.1.173.dist-info → auto_coder-0.1.176.dist-info}/top_level.txt +0 -0
{auto_coder-0.1.173.dist-info → auto_coder-0.1.176.dist-info}/RECORD CHANGED

@@ -1,13 +1,13 @@
 autocoder/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-autocoder/auto_coder.py,sha256=
+autocoder/auto_coder.py,sha256=HmgKa_ZApFlCsqo6BvuVeCPuncBT_Dh29ayZxxGR6lo,32216
 autocoder/auto_coder_lang.py,sha256=4qIS1tbEI8mpbtt6ThppTwKOM6MLuJTWJdgs5jIDGE0,2301
-autocoder/auto_coder_rag.py,sha256=
+autocoder/auto_coder_rag.py,sha256=V82EyeslAO2Z8qkMrwkyC11f1Cz6Ccjo9c867f0J_x8,11455
 autocoder/auto_coder_server.py,sha256=qRY88mkBnqSGFDcwYE5gwpe2WPhIw1nEH6LdbjCQhQk,20306
 autocoder/chat_auto_coder.py,sha256=i5xIuWlTqF0pJz8kXoa-_bW3Ic3SfCFvU2WJIMxrUHU,81798
 autocoder/chat_auto_coder_lang.py,sha256=QYtu5gWEQmWKVovR_qUZ8plySZarNFX_Onk-1vN9IiA,8524
 autocoder/command_args.py,sha256=ftWw6HnFUZPiQPt1oV-SfpHQe69XN3knaFy1lpROBcU,26854
 autocoder/lang.py,sha256=e-07rYTgimpxS8sm-AxKSmH4kKQX4N05YFHJBg9trVs,12598
-autocoder/version.py,sha256=
+autocoder/version.py,sha256=yiACry4Tn-v8T0DYTTygfQmb9WG4pVkXXkB6IB4a1yg,23
 autocoder/agent/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 autocoder/agent/auto_tool.py,sha256=DBzip-P_T6ZtT2eHexPcusmKYD0h7ufzp7TLwXAY10E,11554
 autocoder/agent/coder.py,sha256=dnITYHqkcOip8zV4lywbkYNH9w7Q3qyYaUArJ4WPrTs,866
@@ -17,7 +17,7 @@ autocoder/agent/project_reader.py,sha256=-MWRqsr7O4mvU0PIpAhOUBb29htZAvA37pa_GeE
 autocoder/chat/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 autocoder/common/JupyterClient.py,sha256=O-wi6pXeAEYhAY24kDa0BINrLYvKS6rKyWe98pDClS0,2816
 autocoder/common/ShellClient.py,sha256=fM1q8t_XMSbLBl2zkCNC2J9xuyKN3eXzGm6hHhqL2WY,2286
-autocoder/common/__init__.py,sha256=
+autocoder/common/__init__.py,sha256=wKrFLZk9BMl755nL1gvPjXU-3uWKEnYBP8xsObIjM4g,10156
 autocoder/common/anything2images.py,sha256=0ILBbWzY02M-CiWB-vzuomb_J1hVdxRcenAfIrAXq9M,25283
 autocoder/common/audio.py,sha256=Kn9nWKQddWnUrAz0a_ZUgjcu4VUU_IcZBigT7n3N3qc,7439
 autocoder/common/cleaner.py,sha256=NU72i8C6o9m0vXExab7nao5bstBUsfJFcj11cXa9l4U,1089
@@ -56,13 +56,13 @@ autocoder/index/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 autocoder/index/for_command.py,sha256=zfbvQnhHjsAqBc4Ce1kMGIu0jPEk_rtH7fntg89_4z0,3092
 autocoder/index/index.py,sha256=6uakPXThpDWxAyOAP-7AbMuXaXJJkBKctL5RkNWGdGw,22485
 autocoder/index/symbols_utils.py,sha256=CjcjUVajmJZB75Ty3a7kMv1BZphrm-tIBAdOJv6uo-0,2037
-autocoder/pyproject/__init__.py,sha256
+autocoder/pyproject/__init__.py,sha256=-2-ImQVw6e3NQZQOyDlHEP5b4xVs5ur2G5izB-JCa-A,13160
 autocoder/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 autocoder/rag/api_server.py,sha256=zokIlDJlk7ucRorSLQm80uICO1mecfmn4J2zVqEBskE,6786
 autocoder/rag/doc_filter.py,sha256=LqU8Wi6klwpY9WTHVtkioSHpmo9IWhRz39dzV1gvp6E,9315
-autocoder/rag/document_retriever.py,sha256=
+autocoder/rag/document_retriever.py,sha256=plwm8BpC55VJTUWCZyG4HsXYm-niqUsXaBMDLrLgYj0,23348
 autocoder/rag/llm_wrapper.py,sha256=xRbTBpLUH43Ah5jplL8WWWU-kjKfNgEJoUntLGBq5F4,2484
-autocoder/rag/long_context_rag.py,sha256=
+autocoder/rag/long_context_rag.py,sha256=626f5-XFyTxmnbUJ_a9GiaMPuqWhTDVMcg0b0ePW_mQ,19471
 autocoder/rag/rag_config.py,sha256=8LwFcTd8OJWWwi1_WY4IzjqgtT6RyE2j4PjxS5cCTDE,802
 autocoder/rag/rag_entry.py,sha256=V1RJ8RGqM30DNPmzymv64rZjNRGWn6kfc8sRy_LECg0,2451
 autocoder/rag/raw_rag.py,sha256=yS2Ur6kG0IRjhCj2_VonwxjY_xls_E62jO5Gz5j2nqE,2952
@@ -70,9 +70,10 @@ autocoder/rag/relevant_utils.py,sha256=OGfp98OXG4jr3jNmtHIeXGPF8mOlIbTnolPIVTZzY
 autocoder/rag/simple_directory_reader.py,sha256=LkKreCkNdEOoL4fNhc3_hDoyyWTQUte4uqextISRz4U,24485
 autocoder/rag/simple_rag.py,sha256=I902EUqOK1WM0Y2WFd7RzDJYofElvTZNLVCBtX5A9rc,14885
 autocoder/rag/token_checker.py,sha256=jc76x6KWmvVxds6W8juZfQGaoErudc2HenG3sNQfSLs,2819
-autocoder/rag/token_counter.py,sha256=
-autocoder/rag/token_limiter.py,sha256=
+autocoder/rag/token_counter.py,sha256=9ujfI5xQvwzKpN9XFWQGnXpm0h1sL7kgIJxgposcxNo,2096
+autocoder/rag/token_limiter.py,sha256=nUxaaKJTWEi4J5c5Tz4BkwU4G1B74VxLlMinqu5s41A,10660
 autocoder/rag/types.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+autocoder/rag/variable_holder.py,sha256=pDayuCnlKj7-bkn4iUHX5gea9UObddbi3ZnXotmxCs4,45
 autocoder/rag/loaders/__init__.py,sha256=EQHEZ5Cmz-mGP2SllUTvcIbYCnF7W149dNpNItfs0yE,304
 autocoder/rag/loaders/docx_loader.py,sha256=g6Ta8rMUbfgwB8N1qiajhyO6wpaWl7zygAZiKShuioI,174
 autocoder/rag/loaders/excel_loader.py,sha256=Ue8YB1z_kBs8SjIPuBskyM08Q1JiONs_BJZPrzi59oo,896
@@ -80,8 +81,8 @@ autocoder/rag/loaders/pdf_loader.py,sha256=CGfXOja7QZ7mHN-U5MsTiVMFzjP322rTj3dkY
 autocoder/rag/loaders/ppt_loader.py,sha256=7VEYc-bqgK8VHCoGC3DIUcqbpda-E5jQF9lYLqP256I,1681
 autocoder/regex_project/__init__.py,sha256=EBZeCL5ORyD_9_5u_UuG4s7XtpXOu0y1sWDmxWFtufE,6781
 autocoder/regexproject/__init__.py,sha256=ThuvVFdpw1EgWv4aIRkhg3ZclKPxMVharUKWppFpQ8o,8436
-autocoder/suffixproject/__init__.py,sha256=
-autocoder/tsproject/__init__.py,sha256=
+autocoder/suffixproject/__init__.py,sha256=EaQoumMzZ2COxMiI_GnL3SG4LGzRj0Qw7UpqLfNLCw8,9823
+autocoder/tsproject/__init__.py,sha256=QmEpNZYUJq1o0lGMs3UuUIUU-2aq_3eh1VxqnIc-hME,10431
 autocoder/utils/__init__.py,sha256=O3n6cpsgkIbbMuwmBHSQ1dls_IBD7_7YKFFaeKNo_tc,1193
 autocoder/utils/coder.py,sha256=rK8e0svQBe0NOP26dIGToUXgha_hUDgxlWoC_p_r7oc,5698
 autocoder/utils/conversation_store.py,sha256=sz-hhY7sttPAUOAQU6Pze-5zJc3j0_Emj22dM_0l5ro,1161
@@ -94,9 +95,9 @@ autocoder/utils/request_event_queue.py,sha256=r3lo5qGsB1dIjzVQ05dnr0z_9Z3zOkBdP1
 autocoder/utils/request_queue.py,sha256=nwp6PMtgTCiuwJI24p8OLNZjUiprC-TsefQrhMI-yPE,3889
 autocoder/utils/rest.py,sha256=3tXA8KZG6jKz_tddHNLGx77Icee88WcUeesfNsgPno4,8790
 autocoder/utils/tests.py,sha256=BqphrwyycGAvs-5mhH8pKtMZdObwhFtJ5MC_ZAOiLq8,1340
-auto_coder-0.1.
-auto_coder-0.1.
-auto_coder-0.1.
-auto_coder-0.1.
-auto_coder-0.1.
-auto_coder-0.1.
+auto_coder-0.1.176.dist-info/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
+auto_coder-0.1.176.dist-info/METADATA,sha256=-Jm1GW-7-Htzi_6l3MGRGTvl0ytk1ZyMGB2ZpiZoYa8,2352
+auto_coder-0.1.176.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+auto_coder-0.1.176.dist-info/entry_points.txt,sha256=0nzHtHH4pNcM7xq4EBA2toS28Qelrvcbrr59GqD_0Ak,350
+auto_coder-0.1.176.dist-info/top_level.txt,sha256=Jqc0_uJSw2GwoFQAa9iJxYns-2mWla-9ok_Y3Gcznjk,10
+auto_coder-0.1.176.dist-info/RECORD,,
autocoder/auto_coder.py CHANGED

@@ -813,15 +813,7 @@ def main(input_args: Optional[List[str]] = None):
                 llm, args, code_auto_execute.Mode.SINGLE_ROUND
             )
             executor.run(query=args.query, context=s, source_code="")
-            return
-    elif raw_args.agent_command == "chat":
-        from autocoder.rag.rag_entry import RAGFactory
-
-        rag = RAGFactory.get_rag(llm=llm, args=args, path="")
-        rag.stream_chat_repl(args.query)
-        return
-
-
+            return
     elif raw_args.doc_command == "serve":

         from autocoder.rag.llm_wrapper import LLWrapper
@@ -846,6 +838,13 @@ def main(input_args: Optional[List[str]] = None):
         llm_wrapper = LLWrapper(llm=llm, rag=rag)
         serve(llm=llm_wrapper, args=server_args)
         return
+
+    elif raw_args.doc_command == "chat":
+        from autocoder.rag.rag_entry import RAGFactory
+
+        rag = RAGFactory.get_rag(llm=llm, args=args, path="")
+        rag.stream_chat_repl(args.query)
+        return

     else:
         http_doc = HttpDoc(args=args, llm=llm, urls=None)
autocoder/auto_coder_rag.py CHANGED

@@ -18,7 +18,7 @@ from rich.console import Console
 from rich.table import Table
 import os

-from autocoder.rag.document_retriever import
+from autocoder.rag.document_retriever import process_file_local
 from autocoder.rag.token_counter import TokenCounter

 if platform.system() == "Windows":
@@ -90,16 +90,24 @@ def initialize_system():

     if choice == "1":
         print_status(get_message("deploying_model").format("Deepseek官方"), "")
-
+
         deploy_cmd = [
-            "byzerllm",
-            "
-            "--
-            "
-            "--
-            "
-            "--
-            "
+            "byzerllm",
+            "deploy",
+            "--pretrained_model_type",
+            "saas/openai",
+            "--cpus_per_worker",
+            "0.001",
+            "--gpus_per_worker",
+            "0",
+            "--worker_concurrency",
+            "1000",
+            "--num_workers",
+            "1",
+            "--infer_params",
+            f"saas.base_url=https://api.deepseek.com/v1 saas.api_key={api_key} saas.model=deepseek-chat",
+            "--model",
+            "deepseek_chat",
         ]

         try:
@@ -138,7 +146,9 @@ def main(input_args: Optional[List[str]] = None):

     # Serve command
     serve_parser = subparsers.add_parser("serve", help="Start the RAG server")
-    serve_parser.add_argument(
+    serve_parser.add_argument(
+        "--quick", action="store_true", help="Skip system initialization"
+    )
     serve_parser.add_argument("--file", default="", help=desc["file"])
     serve_parser.add_argument("--model", default="deepseek_chat", help=desc["model"])
     serve_parser.add_argument("--index_model", default="", help=desc["index_model"])
@@ -160,7 +170,19 @@ def main(input_args: Optional[List[str]] = None):
         "--rag_context_window_limit",
         type=int,
         default=110000,
-        help="",
+        help="The input context window limit for RAG",
+    )
+    serve_parser.add_argument(
+        "--full_text_ratio",
+        type=float,
+        default=0.7,
+        help="The ratio of full text area in the input context window (0.0 to 1.0)",
+    )
+    serve_parser.add_argument(
+        "--segment_ratio",
+        type=float,
+        default=0.2,
+        help="The ratio of segment area in the input context window (0.0 to 1.0)",
     )
     serve_parser.add_argument(
         "--required_exts", default="", help=desc["doc_build_parse_required_exts"]
@@ -198,6 +220,17 @@ def main(input_args: Optional[List[str]] = None):
         help="Monitor mode for the doc update",
     )

+    serve_parser.add_argument(
+        "--disable_auto_window",
+        action="store_true",
+        help="Disable automatic window adaptation for documents",
+    )
+    serve_parser.add_argument(
+        "--disable_segment_reorder",
+        action="store_true",
+        help="Disable reordering of document segments after retrieval",
+    )
+
     # Tools command
     tools_parser = subparsers.add_parser("tools", help="Various tools")
     tools_subparsers = tools_parser.add_subparsers(dest="tool", help="Available tools")
@@ -255,7 +288,7 @@ def main(input_args: Optional[List[str]] = None):

 def count_tokens(tokenizer_path: str, file_path: str):
     token_counter = TokenCounter(tokenizer_path)
-    source_codes =
+    source_codes = process_file_local(file_path)

     console = Console()
     table = Table(title="Token Count Results")
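
For orientation, the new window-related flags added to the serve subcommand above can be exercised through main(input_args=...), which this module already exposes. The sketch below is hypothetical: the flag names come from this diff, but everything else (a deployed model, document sources, any other required options) must already be configured in your installation.

# Hypothetical invocation sketch; only the flags shown in the diff are used here.
from autocoder.auto_coder_rag import main

main(input_args=[
    "serve",
    "--model", "deepseek_chat",
    "--rag_context_window_limit", "110000",
    "--full_text_ratio", "0.7",        # share of the window reserved for whole documents
    "--segment_ratio", "0.2",          # share of the window reserved for retrieved segments
    "--disable_segment_reorder",       # keep segments in relevance order instead of source order
])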
autocoder/common/__init__.py CHANGED

@@ -11,6 +11,8 @@ class SourceCode(pydantic.BaseModel):
     module_name: str
     source_code: str
     tag: str = ""
+    tokens: int = -1
+    metadata: Dict[str, Any] = {}


 class TranslateReadme(pydantic.BaseModel):
@@ -281,9 +283,11 @@ class AutoCoderArgs(pydantic.BaseModel):
     doc_command: Optional[str] = None
     required_exts: Optional[str] = None

-    monitor_mode:
-
-
+    monitor_mode: bool = False
+    disable_auto_window: bool = False
+    disable_segment_reorder: bool = False
+    rag_doc_filter_relevance: int = 5
+    tokenizer_path: Optional[str] = None
     skip_confirm: Optional[bool] = False
     silence: Optional[bool] = False
     exclude_files: Optional[Union[str, List[str]]] = ""
@@ -304,5 +308,9 @@ class AutoCoderArgs(pydantic.BaseModel):

     agent_designer_mode: Optional[str] = "svg"

+    full_text_ratio: Optional[float] = 0.7
+    segment_ratio: Optional[float] = 0.2
+    buff_ratio: Optional[float] = 0.1
+
     class Config:
         protected_namespaces = ()
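
The two new SourceCode fields above (tokens and metadata) are what the RAG pipeline uses to carry per-document token counts and chunk provenance. A minimal sketch, assuming only the fields visible in this diff; the example values are illustrative:

from autocoder.common import SourceCode

chunk = SourceCode(
    module_name="##File: /docs/guide.md#chunk2",
    source_code="...segment text...",
    tokens=1375,  # pre-computed token count; the default -1 means "not counted"
    metadata={"original_doc": "##File: /docs/guide.md", "chunk_index": 2},
)

# model_dump()/model_validate() round-trips are what DocumentRetriever relies on
# when it caches entries and later re-hydrates them.
data = chunk.model_dump()
restored = SourceCode.model_validate(data)
assert restored.tokens == 1375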
autocoder/pyproject/__init__.py CHANGED

@@ -187,7 +187,11 @@ class PyProject:

     def convert_to_source_code(self, file_path):
         module_name = file_path
-
+        try:
+            source_code = self.read_file_content(file_path)
+        except Exception as e:
+            logger.warning(f"Failed to read file: {file_path}. Error: {str(e)}")
+            return None
         return SourceCode(module_name=module_name, source_code=source_code)

     def get_package_source_codes(
autocoder/rag/document_retriever.py CHANGED

@@ -18,10 +18,15 @@ from loguru import logger
 from pydantic import BaseModel

 from autocoder.common import SourceCode
-from autocoder.rag.loaders import (
-
-
-
+from autocoder.rag.loaders import (
+    extract_text_from_docx,
+    extract_text_from_excel,
+    extract_text_from_pdf,
+    extract_text_from_ppt,
+)
+from autocoder.rag import variable_holder
+from autocoder.rag.token_counter import count_tokens_worker, count_tokens
+from uuid import uuid4

 cache_lock = threading.Lock()
@@ -34,72 +39,62 @@ class AddOrUpdateEvent(BaseModel):
     file_infos: List[Tuple[str, str, float]]


-
-
+def process_file_in_multi_process(
+    file_info: Tuple[str, str, float]
+) -> List[SourceCode]:
     start_time = time.time()
     file_path, relative_path, _ = file_info
     try:
         if file_path.endswith(".pdf"):
             with open(file_path, "rb") as f:
                 content = extract_text_from_pdf(f.read())
-            v = [SourceCode(module_name=file_path, source_code=content)]
-        elif file_path.endswith(".docx"):
-            with open(file_path, "rb") as f:
-                content = extract_text_from_docx(f.read())
-            v = [SourceCode(module_name=f"##File: {file_path}", source_code=content)]
-        elif file_path.endswith(".xlsx") or file_path.endswith(".xls"):
-            sheets = extract_text_from_excel(file_path)
             v = [
                 SourceCode(
-                    module_name=
-                    source_code=
+                    module_name=file_path,
+                    source_code=content,
+                    tokens=count_tokens_worker(content),
                 )
-                for sheet in sheets
             ]
-        elif file_path.endswith(".pptx"):
-            slides = extract_text_from_ppt(file_path)
-            content = "".join(f"#{slide[0]}\n{slide[1]}\n\n" for slide in slides)
-            v = [SourceCode(module_name=f"##File: {file_path}", source_code=content)]
-        else:
-            with open(file_path, "r", encoding="utf-8") as f:
-                content = f.read()
-            v = [SourceCode(module_name=f"##File: {file_path}", source_code=content)]
-        logger.info(f"Load file {file_path} in {time.time() - start_time}")
-        return v
-    except Exception as e:
-        logger.error(f"Error processing file {file_path}: {str(e)}")
-        return []
-
-
-def process_file2(file_info: Tuple[str, str, float]) -> List[SourceCode]:
-    start_time = time.time()
-    file_path, relative_path, _ = file_info
-    try:
-        if file_path.endswith(".pdf"):
-            with open(file_path, "rb") as f:
-                content = extract_text_from_pdf(f.read())
-            v = [SourceCode(module_name=file_path, source_code=content)]
         elif file_path.endswith(".docx"):
             with open(file_path, "rb") as f:
                 content = extract_text_from_docx(f.read())
-            v = [
+            v = [
+                SourceCode(
+                    module_name=f"##File: {file_path}",
+                    source_code=content,
+                    tokens=count_tokens_worker(content),
+                )
+            ]
         elif file_path.endswith(".xlsx") or file_path.endswith(".xls"):
             sheets = extract_text_from_excel(file_path)
             v = [
                 SourceCode(
                     module_name=f"##File: {file_path}#{sheet[0]}",
                     source_code=sheet[1],
+                    tokens=count_tokens_worker(sheet[1]),
                 )
                 for sheet in sheets
             ]
         elif file_path.endswith(".pptx"):
             slides = extract_text_from_ppt(file_path)
             content = "".join(f"#{slide[0]}\n{slide[1]}\n\n" for slide in slides)
-            v = [
+            v = [
+                SourceCode(
+                    module_name=f"##File: {file_path}",
+                    source_code=content,
+                    tokens=count_tokens_worker(content),
+                )
+            ]
         else:
             with open(file_path, "r", encoding="utf-8") as f:
                 content = f.read()
-            v = [
+            v = [
+                SourceCode(
+                    module_name=f"##File: {file_path}",
+                    source_code=content,
+                    tokens=count_tokens_worker(content),
+                )
+            ]
         logger.info(f"Load file {file_path} in {time.time() - start_time}")
         return v
     except Exception as e:
@@ -107,34 +102,59 @@ def process_file2(file_info: Tuple[str, str, float]) -> List[SourceCode]:
         return []


-def
+def process_file_local(file_path: str) -> List[SourceCode]:
     start_time = time.time()
     try:
         if file_path.endswith(".pdf"):
             with open(file_path, "rb") as f:
                 content = extract_text_from_pdf(f.read())
-            v = [
+            v = [
+                SourceCode(
+                    module_name=file_path,
+                    source_code=content,
+                    tokens=count_tokens(content),
+                )
+            ]
         elif file_path.endswith(".docx"):
             with open(file_path, "rb") as f:
                 content = extract_text_from_docx(f.read())
-            v = [
+            v = [
+                SourceCode(
+                    module_name=f"##File: {file_path}",
+                    source_code=content,
+                    tokens=count_tokens(content),
+                )
+            ]
         elif file_path.endswith(".xlsx") or file_path.endswith(".xls"):
             sheets = extract_text_from_excel(file_path)
             v = [
                 SourceCode(
                     module_name=f"##File: {file_path}#{sheet[0]}",
                     source_code=sheet[1],
+                    tokens=count_tokens(sheet[1]),
                 )
                 for sheet in sheets
             ]
         elif file_path.endswith(".pptx"):
             slides = extract_text_from_ppt(file_path)
             content = "".join(f"#{slide[0]}\n{slide[1]}\n\n" for slide in slides)
-            v = [
+            v = [
+                SourceCode(
+                    module_name=f"##File: {file_path}",
+                    source_code=content,
+                    tokens=count_tokens(content),
+                )
+            ]
         else:
             with open(file_path, "r", encoding="utf-8") as f:
                 content = f.read()
-            v = [
+            v = [
+                SourceCode(
+                    module_name=f"##File: {file_path}",
+                    source_code=content,
+                    tokens=count_tokens(content),
+                )
+            ]
         logger.info(f"Load file {file_path} in {time.time() - start_time}")
         return v
     except Exception as e:
@@ -205,7 +225,7 @@ class AutoCoderRAGDocListener:
                 self.update_cache(item)

     def update_cache(self, file_path):
-        source_code =
+        source_code = process_file_local(file_path)
         self.cache[file_path] = {
             "file_path": file_path,
             "content": [c.model_dump() for c in source_code],
@@ -220,7 +240,9 @@ class AutoCoderRAGDocListener:

     def open_watch(self):
         logger.info(f"start monitor: {self.path}...")
-        for changes in watch(
+        for changes in watch(
+            self.path, watch_filter=self.file_filter, stop_event=self.stop_event
+        ):
             for change in changes:
                 (action, path) = change
                 if action == Change.added or action == Change.modified:
@@ -290,7 +312,6 @@ class AutoCoderRAGAsyncUpdateQueue:
         self.thread.start()
         self.cache = self.read_cache()

-
     def _process_queue(self):
         while not self.stop_event.is_set():
             try:
@@ -324,8 +345,14 @@ class AutoCoderRAGAsyncUpdateQueue:
         # results = ray.get(
         #     [process_file.remote(file_info) for file_info in files_to_process]
         # )
-
-
+        from autocoder.rag.token_counter import initialize_tokenizer
+
+        with Pool(
+            processes=os.cpu_count(),
+            initializer=initialize_tokenizer,
+            initargs=(variable_holder.TOKENIZER_PATH,),
+        ) as pool:
+            results = pool.map(process_file_in_multi_process, files_to_process)

         for file_info, result in zip(files_to_process, results):
             self.update_cache(file_info, result)
@@ -365,7 +392,7 @@ class AutoCoderRAGAsyncUpdateQueue:
         elif isinstance(file_list, AddOrUpdateEvent):
             for file_info in file_list.file_infos:
                 logger.info(f"{file_info[0]} is detected to be updated")
-                result =
+                result = process_file_local(file_info)
                 self.update_cache(file_info, result)

         self.write_cache()
@@ -410,7 +437,9 @@ class AutoCoderRAGAsyncUpdateQueue:
             # Release the file lock
             fcntl.flock(lockf, fcntl.LOCK_UN)

-    def update_cache(
+    def update_cache(
+        self, file_info: Tuple[str, str, float], content: List[SourceCode]
+    ):
         file_path, relative_path, modify_time = file_info
         self.cache[file_path] = {
             "file_path": file_path,
@@ -485,11 +514,20 @@ class DocumentRetriever:
         required_exts: list,
         on_ray: bool = False,
         monitor_mode: bool = False,
+        single_file_token_limit: int = 60000,
+        disable_auto_window: bool = False,
     ) -> None:
         self.path = path
         self.ignore_spec = ignore_spec
         self.required_exts = required_exts
         self.monitor_mode = monitor_mode
+        self.single_file_token_limit = single_file_token_limit
+        self.disable_auto_window = disable_auto_window
+
+        # Files smaller than this will be merged
+        self.small_file_token_limit = self.single_file_token_limit / 4
+        # Maximum size of a merged file
+        self.small_file_merge_limit = self.single_file_token_limit / 2

         self.on_ray = on_ray
         if self.on_ray:
@@ -502,6 +540,13 @@ class DocumentRetriever:
                 path, ignore_spec, required_exts
             )

+        logger.info(f"DocumentRetriever initialized with:")
+        logger.info(f"  Path: {self.path}")
+        logger.info(f"  Diable auto window: {self.disable_auto_window} ")
+        logger.info(f"  Single file token limit: {self.single_file_token_limit}")
+        logger.info(f"  Small file token limit: {self.small_file_token_limit}")
+        logger.info(f"  Small file merge limit: {self.small_file_merge_limit}")
+
     def get_cache(self):
         if self.on_ray:
             return ray.get(self.cacher.get_cache.remote())
@@ -509,6 +554,102 @@ class DocumentRetriever:
             return self.cacher.get_cache()

     def retrieve_documents(self) -> Generator[SourceCode, None, None]:
+        logger.info("Starting document retrieval process")
+        waiting_list = []
+        waiting_tokens = 0
         for _, data in self.get_cache().items():
             for source_code in data["content"]:
-
+                doc = SourceCode.model_validate(source_code)
+                if self.disable_auto_window:
+                    yield doc
+                else:
+                    if doc.tokens <= 0:
+                        yield doc
+                    elif doc.tokens < self.small_file_token_limit:
+                        waiting_list, waiting_tokens = self._add_to_waiting_list(
+                            doc, waiting_list, waiting_tokens
+                        )
+                        if waiting_tokens >= self.small_file_merge_limit:
+                            yield from self._process_waiting_list(waiting_list)
+                            waiting_list = []
+                            waiting_tokens = 0
+                    elif doc.tokens > self.single_file_token_limit:
+                        yield from self._split_large_document(doc)
+                    else:
+                        yield doc
+        if waiting_list and not self.disable_auto_window:
+            yield from self._process_waiting_list(waiting_list)
+
+        logger.info("Document retrieval process completed")
+
+    def _add_to_waiting_list(
+        self, doc: SourceCode, waiting_list: List[SourceCode], waiting_tokens: int
+    ) -> Tuple[List[SourceCode], int]:
+        waiting_list.append(doc)
+        return waiting_list, waiting_tokens + doc.tokens
+
+    def _process_waiting_list(
+        self, waiting_list: List[SourceCode]
+    ) -> Generator[SourceCode, None, None]:
+        if len(waiting_list) == 1:
+            yield waiting_list[0]
+        elif len(waiting_list) > 1:
+            yield self._merge_documents(waiting_list)
+
+    def _merge_documents(self, docs: List[SourceCode]) -> SourceCode:
+        merged_content = "\n".join(
+            [f"#File: {doc.module_name}\n{doc.source_code}" for doc in docs]
+        )
+        merged_tokens = sum([doc.tokens for doc in docs])
+        merged_name = f"Merged_{len(docs)}_docs_{str(uuid4())}"
+        logger.info(
+            f"Merged {len(docs)} documents into {merged_name} (tokens: {merged_tokens})."
+        )
+        return SourceCode(
+            module_name=merged_name,
+            source_code=merged_content,
+            tokens=merged_tokens,
+            metadata={"original_docs": [doc.module_name for doc in docs]},
+        )
+
+    def _split_large_document(
+        self, doc: SourceCode
+    ) -> Generator[SourceCode, None, None]:
+        chunk_size = self.single_file_token_limit
+        total_chunks = (doc.tokens + chunk_size - 1) // chunk_size
+        logger.info(f"Splitting document {doc.module_name} into {total_chunks} chunks")
+        for i in range(0, doc.tokens, chunk_size):
+            chunk_content = doc.source_code[i : i + chunk_size]
+            chunk_tokens = min(chunk_size, doc.tokens - i)
+            chunk_name = f"{doc.module_name}#chunk{i//chunk_size+1}"
+            # logger.debug(f"  Created chunk: {chunk_name} (tokens: {chunk_tokens})")
+            yield SourceCode(
+                module_name=chunk_name,
+                source_code=chunk_content,
+                tokens=chunk_tokens,
+                metadata={
+                    "original_doc": doc.module_name,
+                    "chunk_index": i // chunk_size + 1,
+                },
+            )
+
+    def _split_document(
+        self, doc: SourceCode, token_limit: int
+    ) -> Generator[SourceCode, None, None]:
+        remaining_tokens = doc.tokens
+        chunk_number = 1
+        start_index = 0
+
+        while remaining_tokens > 0:
+            end_index = start_index + token_limit
+            chunk_content = doc.source_code[start_index:end_index]
+            chunk_tokens = min(token_limit, remaining_tokens)
+
+            chunk_name = f"{doc.module_name}#{chunk_number:06d}"
+            yield SourceCode(
+                module_name=chunk_name, source_code=chunk_content, tokens=chunk_tokens
+            )
+
+            start_index = end_index
+            remaining_tokens -= chunk_tokens
+            chunk_number += 1
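
For orientation, every auto-window threshold introduced above derives from a single knob, single_file_token_limit. The sketch below is illustrative only (not package code); it mirrors the branch order of retrieve_documents when auto-window is enabled, using the default limit from the diff:

single_file_token_limit = 60000
small_file_token_limit = single_file_token_limit / 4   # below this, docs wait to be merged
small_file_merge_limit = single_file_token_limit / 2   # flush the waiting list at this size

def route(doc_tokens: int) -> str:
    """Mirror of the branch order in retrieve_documents (disable_auto_window off)."""
    if doc_tokens <= 0:
        return "yield as-is (unknown token count)"
    if doc_tokens < small_file_token_limit:
        return "buffer for merging"
    if doc_tokens > single_file_token_limit:
        return "split into chunks"
    return "yield as-is"

for t in (0, 900, 15000, 59999, 120001):
    print(t, "->", route(t))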
autocoder/rag/long_context_rag.py CHANGED

@@ -13,16 +13,22 @@ from openai import OpenAI
 from rich.console import Console
 from rich.panel import Panel
 from rich.table import Table
-
+import statistics

 from autocoder.common import AutoCoderArgs, SourceCode
 from autocoder.rag.doc_filter import DocFilter
 from autocoder.rag.document_retriever import DocumentRetriever
-from autocoder.rag.relevant_utils import (
-
+from autocoder.rag.relevant_utils import (
+    DocRelevance,
+    FilterDoc,
+    TaskTiming,
+    parse_relevance,
+)
 from autocoder.rag.token_checker import check_token_limit
 from autocoder.rag.token_counter import RemoteTokenCounter, TokenCounter
 from autocoder.rag.token_limiter import TokenLimiter
+from tokenizers import Tokenizer
+from autocoder.rag import variable_holder


 class LongContextRAG:
@@ -44,11 +50,26 @@ class LongContextRAG:
         self.path = path
         self.relevant_score = self.args.rag_doc_filter_relevance or 5

+        self.full_text_ratio = args.full_text_ratio
+        self.segment_ratio = args.segment_ratio
+        self.buff_ratio = 1 - self.full_text_ratio - self.segment_ratio
+
+        if self.buff_ratio < 0:
+            raise ValueError(
+                "The sum of full_text_ratio and segment_ratio must be less than or equal to 1.0"
+            )
+
+        self.full_text_limit = int(args.rag_context_window_limit * self.full_text_ratio)
+        self.segment_limit = int(args.rag_context_window_limit * self.segment_ratio)
+        self.buff_limit = int(args.rag_context_window_limit * self.buff_ratio)
+
         self.tokenizer = None
         self.tokenizer_path = tokenizer_path
         self.on_ray = False

         if self.tokenizer_path:
+            variable_holder.TOKENIZER_PATH = self.tokenizer_path
+            variable_holder.TOKENIZER_MODEL = Tokenizer.from_file(self.tokenizer_path)
             self.tokenizer = TokenCounter(self.tokenizer_path)
         else:
             if llm.is_model_exist("deepseek_tokenizer"):
@@ -96,24 +117,41 @@ class LongContextRAG:
             self.required_exts,
             self.on_ray,
             self.monitor_mode,
+            ## Make sure the full-text area can hold at least one file
+            single_file_token_limit=self.full_text_limit - 100,
+            disable_auto_window=self.args.disable_auto_window
         )

         self.doc_filter = DocFilter(
             self.index_model, self.args, on_ray=self.on_ray, path=self.path
         )
-
-
-
-
-
-
-
-
-
-
+
+        doc_num = 0
+        token_num = 0
+        token_counts = []
+        for doc in self._retrieve_documents():
+            doc_num += 1
+            doc_tokens = doc.tokens
+            token_num += doc_tokens
+            token_counts.append(doc_tokens)
+
+        avg_tokens = statistics.mean(token_counts) if token_counts else 0
+        median_tokens = statistics.median(token_counts) if token_counts else 0

         logger.info(
-
+            "RAG Configuration:\n"
+            f"  Total docs: {doc_num}\n"
+            f"  Total tokens: {token_num}\n"
+            f"  Tokenizer path: {self.tokenizer_path}\n"
+            f"  Relevant score: {self.relevant_score}\n"
+            f"  Token limit: {self.token_limit}\n"
+            f"  Full text limit: {self.full_text_limit}\n"
+            f"  Segment limit: {self.segment_limit}\n"
+            f"  Buff limit: {self.buff_limit}\n"
+            f"  Max doc tokens: {max(token_counts) if token_counts else 0}\n"
+            f"  Min doc tokens: {min(token_counts) if token_counts else 0}\n"
+            f"  Avg doc tokens: {avg_tokens:.2f}\n"
+            f"  Median doc tokens: {median_tokens:.2f}\n"
        )

    def count_tokens(self, text: str) -> int:
@@ -350,9 +388,15 @@ class LongContextRAG:
         query_table.add_row("Relevant docs", str(len(relevant_docs)))

         # Add relevant docs information
-        relevant_docs_info =
-
-
+        relevant_docs_info = []
+        for doc in relevant_docs:
+            info = f"- {doc.module_name.replace(self.path,'',1)}"
+            if 'original_docs' in doc.metadata:
+                original_docs = ", ".join([doc.replace(self.path,"",1) for doc in doc.metadata['original_docs']])
+                info += f" (Original docs: {original_docs})"
+            relevant_docs_info.append(info)
+
+        relevant_docs_info = "\n".join(relevant_docs_info)
         query_table.add_row("Relevant docs list", relevant_docs_info)

         first_round_full_docs = []
@@ -363,8 +407,11 @@ class LongContextRAG:

         token_limiter = TokenLimiter(
             count_tokens=self.count_tokens,
-
+            full_text_limit=self.full_text_limit,
+            segment_limit=self.segment_limit,
+            buff_limit=self.buff_limit,
             llm=self.llm,
+            disable_segment_reorder = self.args.disable_segment_reorder
         )
         final_relevant_docs = token_limiter.limit_tokens(
             relevant_docs=relevant_docs,
@@ -395,9 +442,18 @@ class LongContextRAG:
         )

         # Add relevant docs information
-        final_relevant_docs_info =
-
-
+        final_relevant_docs_info = []
+        for doc in relevant_docs:
+            info = f"- {doc.module_name.replace(self.path,'',1)}"
+            if 'original_docs' in doc.metadata:
+                original_docs = ", ".join([doc.replace(self.path,"",1) for doc in doc.metadata['original_docs']])
+                info += f" (Original docs: {original_docs})"
+            if "chunk_ranges" in doc.metadata:
+                chunk_ranges = json.dumps(doc.metadata['chunk_ranges'],ensure_ascii=False)
+                info += f" (Chunk ranges: {chunk_ranges})"
+            final_relevant_docs_info.append(info)
+
+        final_relevant_docs_info = "\n".join(final_relevant_docs_info)
         query_table.add_row("Final Relevant docs list", final_relevant_docs_info)

         # Create a panel to contain the table
@@ -409,8 +465,10 @@ class LongContextRAG:

         # Log the panel using rich
         console.print(panel)
-
-
+
+        request_tokens = sum([doc.tokens for doc in relevant_docs])
+        target_model = model or self.llm.default_model_name
+        logger.info(f"Start to send to model {target_model} with {request_tokens} tokens")

         new_conversations = conversations[:-1] + [
             {
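
A worked example of the window partition computed in LongContextRAG.__init__ above (a sketch, not package code), using this release's CLI defaults: a 110000-token window split by full_text_ratio=0.7 and segment_ratio=0.2, with the remainder as a buffer.

rag_context_window_limit = 110000
full_text_ratio = 0.7
segment_ratio = 0.2
buff_ratio = 1 - full_text_ratio - segment_ratio   # must not go negative, or __init__ raises

full_text_limit = int(rag_context_window_limit * full_text_ratio)   # ~70% of the window
segment_limit = int(rag_context_window_limit * segment_ratio)       # ~20% of the window
buff_limit = int(rag_context_window_limit * buff_ratio)             # whatever is left over

print(full_text_limit, segment_limit, buff_limit)
# DocumentRetriever then receives full_text_limit - 100 as single_file_token_limit,
# so the full-text area can always hold at least one (possibly merged) document.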
autocoder/rag/token_counter.py CHANGED

@@ -2,29 +2,46 @@ import time
 from loguru import logger
 from tokenizers import Tokenizer
 from multiprocessing import Pool, cpu_count
+from autocoder.rag.variable_holder import TOKENIZER_MODEL
+

 class RemoteTokenCounter:
-    def __init__(self,tokenizer) -> None:
+    def __init__(self, tokenizer) -> None:
        self.tokenizer = tokenizer

-    def count_tokens(self, text: str) -> int:
-        try:
+    def count_tokens(self, text: str) -> int:
+        try:
            v = self.tokenizer.chat_oai(
                conversations=[{"role": "user", "content": text}]
-            )
+            )
            return int(v[0].output)
        except Exception as e:
            logger.error(f"Error counting tokens: {str(e)}")
            return -1
-
+
+
 def initialize_tokenizer(tokenizer_path):
-    global tokenizer_model
+    global tokenizer_model
    tokenizer_model = Tokenizer.from_file(tokenizer_path)

+
+def count_tokens(text: str) -> int:
+    try:
+        # start_time = time.time_ns()
+        encoded = TOKENIZER_MODEL.encode('{"role":"user","content":"' + text + '"}')
+        v = len(encoded.ids)
+        # elapsed_time = time.time_ns() - start_time
+        # logger.info(f"Token counting took {elapsed_time/1000000} ms")
+        return v
+    except Exception as e:
+        logger.error(f"Error counting tokens: {str(e)}")
+        return -1
+
+
 def count_tokens_worker(text: str) -> int:
    try:
        # start_time = time.time_ns()
-        encoded = tokenizer_model.encode('{"role":"user","content":"'+text+'"}')
+        encoded = tokenizer_model.encode('{"role":"user","content":"' + text + '"}')
        v = len(encoded.ids)
        # elapsed_time = time.time_ns() - start_time
        # logger.info(f"Token counting took {elapsed_time/1000000} ms")
@@ -33,11 +50,16 @@ def count_tokens_worker(text: str) -> int:
        logger.error(f"Error counting tokens: {str(e)}")
        return -1

+
 class TokenCounter:
    def __init__(self, tokenizer_path: str):
        self.tokenizer_path = tokenizer_path
        self.num_processes = cpu_count() - 1 if cpu_count() > 1 else 1
-        self.pool = Pool(
+        self.pool = Pool(
+            processes=self.num_processes,
+            initializer=initialize_tokenizer,
+            initargs=(self.tokenizer_path,),
+        )

    def count_tokens(self, text: str) -> int:
-        return self.pool.apply(count_tokens_worker, (text,))
+        return self.pool.apply(count_tokens_worker, (text,))
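
The rebuilt TokenCounter above relies on the multiprocessing worker-initializer pattern: each pool worker loads the tokenizers.Tokenizer once in initialize_tokenizer instead of re-loading (or pickling) it per task. A minimal standalone sketch, assuming a tokenizer JSON file at a placeholder path:

from multiprocessing import Pool

from autocoder.rag.token_counter import initialize_tokenizer, count_tokens_worker

if __name__ == "__main__":
    tokenizer_path = "/path/to/tokenizer.json"  # hypothetical path
    with Pool(
        processes=4,
        initializer=initialize_tokenizer,   # runs once per worker process
        initargs=(tokenizer_path,),
    ) as pool:
        texts = ["hello world", "def f(x): return x"]
        print(pool.map(count_tokens_worker, texts))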
autocoder/rag/token_limiter.py CHANGED

@@ -13,15 +13,21 @@ class TokenLimiter:
    def __init__(
        self,
        count_tokens: Callable[[str], int],
-
+        full_text_limit: int,
+        segment_limit: int,
+        buff_limit: int,
        llm,
+        disable_segment_reorder:bool
    ):
        self.count_tokens = count_tokens
-        self.
+        self.full_text_limit = full_text_limit
+        self.segment_limit = segment_limit
+        self.buff_limit = buff_limit
        self.llm = llm
        self.first_round_full_docs = []
        self.second_round_extracted_docs = []
        self.sencond_round_time = 0
+        self.disable_segment_reorder = disable_segment_reorder

    @byzerllm.prompt()
    def extract_relevance_range_from_docs_with_conversation(
@@ -88,21 +94,50 @@ class TokenLimiter:
        final_relevant_docs = []
        token_count = 0
        doc_num_count = 0
-
+
+        reorder_relevant_docs = []
+        added_docs = set()
+
+        ## Segmentation (when a single document is too large) and reordering logic
+        ## 1. Background: during retrieval, many documents are split into multiple segments
+        ## 2. Problem: recalled segments are ranked by relevance score, so they may be out of order
+        ##    relative to the original text, which reinforces LLM hallucination.
+        ## 3. Goal: reorder the segments so that segments from the same document stay contiguous
+        ##    and appear in the correct order.
+        ## 4. Implementation options:
+        ##    a) Option 1 (keep positions): swap a document's segments into place according to chunk_index
+        ##    b) Option 2 (current implementation): while iterating, when segment A of a document is found,
+        ##       immediately collect all other segments of that document, sort them, and insert the sorted
+        ##       segments at segment A's position.
+        ## TODO:
+        ##     1. Later, decide via parameters whether to enable reordering and which strategy to use
+        if not self.disable_segment_reorder:
+            for doc in relevant_docs:
+                if doc.metadata.get('original_doc') and doc.metadata.get('chunk_index'):
+                    if doc.metadata['original_doc'] not in added_docs:
+                        original_doc = doc.metadata['original_doc']
+                        chunks = [d for d in relevant_docs if d.metadata.get('original_doc') == original_doc]
+                        chunks.sort(key=lambda x: x.metadata['chunk_index'])
+                        reorder_relevant_docs.extend(chunks)
+                        added_docs.add(original_doc)
+                elif doc not in added_docs:
+                    reorder_relevant_docs.append(doc)
+                    added_docs.add(doc.module_name)
+
+        ## Path without window partitioning
+        for doc in reorder_relevant_docs:
            doc_tokens = self.count_tokens(doc.source_code)
            doc_num_count += 1
-            if token_count + doc_tokens <= self.
+            if token_count + doc_tokens <= self.full_text_limit + self.segment_limit:
                final_relevant_docs.append(doc)
                token_count += doc_tokens
            else:
                break

-
-
+        ## If the window cannot hold all relevant documents, partition it
+        if len(final_relevant_docs) < len(reorder_relevant_docs):
+            ## Fill the full_text partition first
            token_count = 0
-            new_token_limit = self.
+            new_token_limit = self.full_text_limit
            doc_num_count = 0
-            for doc in
+            for doc in reorder_relevant_docs:
                doc_tokens = self.count_tokens(doc.source_code)
                doc_num_count += 1
                if token_count + doc_tokens <= new_token_limit:
@@ -111,9 +146,19 @@ class TokenLimiter:
                else:
                    break

+            if len(self.first_round_full_docs) > 0:
+                remaining_tokens = (
+                    self.full_text_limit + self.segment_limit - token_count
+                )
+            else:
+                logger.warning(
+                    "Full text area is empty, this is may caused by the single doc is too long"
+                )
+                remaining_tokens = self.full_text_limit + self.segment_limit
+
+            ## Then fill the segment partition
            sencond_round_start_time = time.time()
-
-            remaining_docs = relevant_docs[len(self.first_round_full_docs) :]
+            remaining_docs = reorder_relevant_docs[len(self.first_round_full_docs) :]
            logger.info(
                f"first round docs: {len(self.first_round_full_docs)} remaining docs: {len(remaining_docs)} index_filter_workers: {index_filter_workers}"
            )
@@ -130,7 +175,7 @@ class TokenLimiter:
                    result = future.result()
                    if result and remaining_tokens > 0:
                        self.second_round_extracted_docs.append(result)
-                        tokens =
+                        tokens = result.tokens
                        if tokens > 0:
                            remaining_tokens -= tokens
                        else:
@@ -184,7 +229,13 @@ class TokenLimiter:
                    content += chunk + "\n"

                return SourceCode(
-                    module_name=doc.module_name,
+                    module_name=doc.module_name,
+                    source_code=content.strip(),
+                    tokens=self.count_tokens(content),
+                    metadata={
+                        "original_doc": doc.module_name,
+                        "chunk_ranges": json_objs,
+                    },
                )
            except Exception as e:
                if attempt < max_retries - 1:
@@ -196,5 +247,7 @@ class TokenLimiter:
                        f"Failed to process doc {doc.module_name} after {max_retries} attempts: {str(e)}"
                    )
                    return SourceCode(
-                        module_name=doc.module_name,
+                        module_name=doc.module_name,
+                        source_code="",
+                        tokens= 0
                    )
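
A toy illustration (a sketch, not package code) of the segment re-ordering described in the comments above: chunks of the same original document are grouped and sorted by chunk_index at the position of their first occurrence, while non-chunked documents keep their relative place. The else branch is simplified compared to the code in the diff.

from autocoder.common import SourceCode

relevant_docs = [
    SourceCode(module_name="a#chunk2", source_code="a2", tokens=10,
               metadata={"original_doc": "a", "chunk_index": 2}),
    SourceCode(module_name="b", source_code="b", tokens=10, metadata={}),
    SourceCode(module_name="a#chunk1", source_code="a1", tokens=10,
               metadata={"original_doc": "a", "chunk_index": 1}),
]

reordered, added = [], set()
for doc in relevant_docs:
    if doc.metadata.get("original_doc") and doc.metadata.get("chunk_index"):
        if doc.metadata["original_doc"] not in added:
            original = doc.metadata["original_doc"]
            chunks = [d for d in relevant_docs if d.metadata.get("original_doc") == original]
            chunks.sort(key=lambda d: d.metadata["chunk_index"])
            reordered.extend(chunks)
            added.add(original)
    else:
        reordered.append(doc)

print([d.module_name for d in reordered])  # ['a#chunk1', 'a#chunk2', 'b']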
autocoder/suffixproject/__init__.py CHANGED

@@ -121,7 +121,11 @@ class SuffixProject:

    def convert_to_source_code(self, file_path):
        module_name = file_path
-
+        try:
+            source_code = self.read_file_content(file_path)
+        except Exception as e:
+            logger.warning(f"Failed to read file: {file_path}. Error: {str(e)}")
+            return None
        return SourceCode(module_name=module_name, source_code=source_code)

    def get_source_codes(self) -> Generator[SourceCode, None, None]:
autocoder/tsproject/__init__.py CHANGED

@@ -152,7 +152,11 @@ class TSProject:
            return None

        module_name = file_path
-
+        try:
+            source_code = self.read_file_content(file_path)
+        except Exception as e:
+            logger.warning(f"Failed to read file: {file_path}. Error: {str(e)}")
+            return None

        if not FileUtils.has_sufficient_content(source_code, min_line_count=1):
            return None
autocoder/version.py CHANGED

@@ -1 +1 @@
-__version__ = "0.1.
+__version__ = "0.1.176"