PyPI - auto-coder - Versions diffs - 0.1.173__tar.gz → 0.1.176__tar.gz - Mend

auto-coder 0.1.173__tar.gz → 0.1.176__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of auto-coder might be problematic. Click here for more details.

Files changed (114) hide show

{auto-coder-0.1.173 → auto-coder-0.1.176}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: auto-coder
-Version: 0.1.173
+Version: 0.1.176
 Summary: AutoCoder: AutoCoder
 Author: allwefantasy
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence

{auto-coder-0.1.173 → auto-coder-0.1.176}/src/auto_coder.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: auto-coder
-Version: 0.1.173
+Version: 0.1.176
 Summary: AutoCoder: AutoCoder
 Author: allwefantasy
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence

{auto-coder-0.1.173 → auto-coder-0.1.176}/src/auto_coder.egg-info/SOURCES.txt RENAMED Viewed

@@ -82,6 +82,7 @@ src/autocoder/rag/token_checker.py
 src/autocoder/rag/token_counter.py
 src/autocoder/rag/token_limiter.py
 src/autocoder/rag/types.py
+src/autocoder/rag/variable_holder.py
 src/autocoder/rag/loaders/__init__.py
 src/autocoder/rag/loaders/docx_loader.py
 src/autocoder/rag/loaders/excel_loader.py

{auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/auto_coder.py RENAMED Viewed

@@ -813,15 +813,7 @@ def main(input_args: Optional[List[str]] = None):
                     llm, args, code_auto_execute.Mode.SINGLE_ROUND
                 )
                 executor.run(query=args.query, context=s, source_code="")
-            return
-        elif raw_args.agent_command == "chat":
-            from autocoder.rag.rag_entry import RAGFactory
-            rag = RAGFactory.get_rag(llm=llm, args=args, path="")
-            rag.stream_chat_repl(args.query)
-            return
+            return
         elif raw_args.doc_command == "serve":
             from autocoder.rag.llm_wrapper import LLWrapper
@@ -846,6 +838,13 @@ def main(input_args: Optional[List[str]] = None):
             llm_wrapper = LLWrapper(llm=llm, rag=rag)
             serve(llm=llm_wrapper, args=server_args)
             return
+        elif raw_args.doc_command == "chat":
+            from autocoder.rag.rag_entry import RAGFactory
+            rag = RAGFactory.get_rag(llm=llm, args=args, path="")
+            rag.stream_chat_repl(args.query)
+            return
         else:
             http_doc = HttpDoc(args=args, llm=llm, urls=None)

{auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/auto_coder_rag.py RENAMED Viewed

@@ -18,7 +18,7 @@ from rich.console import Console
 from rich.table import Table
 import os
-from autocoder.rag.document_retriever import process_file3
+from autocoder.rag.document_retriever import process_file_local
 from autocoder.rag.token_counter import TokenCounter
 if platform.system() == "Windows":
@@ -90,16 +90,24 @@ def initialize_system():
     if choice == "1":
         print_status(get_message("deploying_model").format("Deepseek官方"), "")
         deploy_cmd = [
-            "byzerllm", "deploy",
-            "--pretrained_model_type", "saas/openai",
-            "--cpus_per_worker", "0.001",
-            "--gpus_per_worker", "0",
-            "--worker_concurrency", "1000",
-            "--num_workers", "1",
-            "--infer_params", f"saas.base_url=https://api.deepseek.com/v1 saas.api_key={api_key} saas.model=deepseek-chat",
-            "--model", "deepseek_chat"
+            "byzerllm",
+            "deploy",
+            "--pretrained_model_type",
+            "saas/openai",
+            "--cpus_per_worker",
+            "0.001",
+            "--gpus_per_worker",
+            "0",
+            "--worker_concurrency",
+            "1000",
+            "--num_workers",
+            "1",
+            "--infer_params",
+            f"saas.base_url=https://api.deepseek.com/v1 saas.api_key={api_key} saas.model=deepseek-chat",
+            "--model",
+            "deepseek_chat",
         ]
     try:
@@ -138,7 +146,9 @@ def main(input_args: Optional[List[str]] = None):
     # Serve command
     serve_parser = subparsers.add_parser("serve", help="Start the RAG server")
-    serve_parser.add_argument("--quick", action="store_true", help="Skip system initialization")
+    serve_parser.add_argument(
+        "--quick", action="store_true", help="Skip system initialization"
+    )
     serve_parser.add_argument("--file", default="", help=desc["file"])
     serve_parser.add_argument("--model", default="deepseek_chat", help=desc["model"])
     serve_parser.add_argument("--index_model", default="", help=desc["index_model"])
@@ -160,7 +170,19 @@ def main(input_args: Optional[List[str]] = None):
         "--rag_context_window_limit",
         type=int,
         default=110000,
-        help="",
+        help="The input context window limit for RAG",
+    )
+    serve_parser.add_argument(
+        "--full_text_ratio",
+        type=float,
+        default=0.7,
+        help="The ratio of full text area in the input context window (0.0 to 1.0)",
+    )
+    serve_parser.add_argument(
+        "--segment_ratio",
+        type=float,
+        default=0.2,
+        help="The ratio of segment area in the input context window (0.0 to 1.0)",
     )
     serve_parser.add_argument(
         "--required_exts", default="", help=desc["doc_build_parse_required_exts"]
@@ -198,6 +220,17 @@ def main(input_args: Optional[List[str]] = None):
         help="Monitor mode for the doc update",
     )
+    serve_parser.add_argument(
+        "--disable_auto_window",
+        action="store_true",
+        help="Disable automatic window adaptation for documents",
+    )
+    serve_parser.add_argument(
+        "--disable_segment_reorder",
+        action="store_true",
+        help="Disable reordering of document segments after retrieval",
+    )
     # Tools command
     tools_parser = subparsers.add_parser("tools", help="Various tools")
     tools_subparsers = tools_parser.add_subparsers(dest="tool", help="Available tools")
@@ -255,7 +288,7 @@ def main(input_args: Optional[List[str]] = None):
 def count_tokens(tokenizer_path: str, file_path: str):
     token_counter = TokenCounter(tokenizer_path)
-    source_codes = process_file3(file_path)
+    source_codes = process_file_local(file_path)
     console = Console()
     table = Table(title="Token Count Results")

{auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/common/__init__.py RENAMED Viewed

@@ -11,6 +11,8 @@ class SourceCode(pydantic.BaseModel):
     module_name: str
     source_code: str
     tag: str = ""
+    tokens: int = -1
+    metadata: Dict[str, Any] = {}
 class TranslateReadme(pydantic.BaseModel):
@@ -281,9 +283,11 @@ class AutoCoderArgs(pydantic.BaseModel):
     doc_command: Optional[str] = None
     required_exts: Optional[str] = None
-    monitor_mode: Optional[bool] = False
-    description: Optional[str] = ""
+    monitor_mode: bool = False
+    disable_auto_window: bool = False
+    disable_segment_reorder: bool = False
+    rag_doc_filter_relevance: int = 5
+    tokenizer_path: Optional[str] = None
     skip_confirm: Optional[bool] = False
     silence: Optional[bool] = False
     exclude_files: Optional[Union[str, List[str]]] = ""
@@ -304,5 +308,9 @@ class AutoCoderArgs(pydantic.BaseModel):
     agent_designer_mode: Optional[str] = "svg"
+    full_text_ratio: Optional[float] = 0.7
+    segment_ratio: Optional[float] = 0.2
+    buff_ratio: Optional[float] = 0.1
     class Config:
         protected_namespaces = ()

{auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/pyproject/__init__.py RENAMED Viewed

@@ -187,7 +187,11 @@ class PyProject:
     def convert_to_source_code(self, file_path):
         module_name = file_path
-        source_code = self.read_file_content(file_path)
+        try:
+            source_code = self.read_file_content(file_path)
+        except Exception as e:
+            logger.warning(f"Failed to read file: {file_path}. Error: {str(e)}")
+            return None
         return SourceCode(module_name=module_name, source_code=source_code)
     def get_package_source_codes(

{auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/rag/document_retriever.py RENAMED Viewed

@@ -18,10 +18,15 @@ from loguru import logger
 from pydantic import BaseModel
 from autocoder.common import SourceCode
-from autocoder.rag.loaders import (extract_text_from_docx,
-                                   extract_text_from_excel,
-                                   extract_text_from_pdf,
-                                   extract_text_from_ppt)
+from autocoder.rag.loaders import (
+    extract_text_from_docx,
+    extract_text_from_excel,
+    extract_text_from_pdf,
+    extract_text_from_ppt,
+)
+from autocoder.rag import variable_holder
+from autocoder.rag.token_counter import count_tokens_worker, count_tokens
+from uuid import uuid4
 cache_lock = threading.Lock()
@@ -34,72 +39,62 @@ class AddOrUpdateEvent(BaseModel):
     file_infos: List[Tuple[str, str, float]]
-@ray.remote
-def process_file(file_info: Tuple[str, str, float]) -> List[SourceCode]:
+def process_file_in_multi_process(
+    file_info: Tuple[str, str, float]
+) -> List[SourceCode]:
     start_time = time.time()
     file_path, relative_path, _ = file_info
     try:
         if file_path.endswith(".pdf"):
             with open(file_path, "rb") as f:
                 content = extract_text_from_pdf(f.read())
-            v = [SourceCode(module_name=file_path, source_code=content)]
-        elif file_path.endswith(".docx"):
-            with open(file_path, "rb") as f:
-                content = extract_text_from_docx(f.read())
-            v = [SourceCode(module_name=f"##File: {file_path}", source_code=content)]
-        elif file_path.endswith(".xlsx") or file_path.endswith(".xls"):
-            sheets = extract_text_from_excel(file_path)
             v = [
                 SourceCode(
-                    module_name=f"##File: {file_path}#{sheet[0]}",
-                    source_code=sheet[1],
+                    module_name=file_path,
+                    source_code=content,
+                    tokens=count_tokens_worker(content),
                 )
-                for sheet in sheets
             ]
-        elif file_path.endswith(".pptx"):
-            slides = extract_text_from_ppt(file_path)
-            content = "".join(f"#{slide[0]}\n{slide[1]}\n\n" for slide in slides)
-            v = [SourceCode(module_name=f"##File: {file_path}", source_code=content)]
-        else:
-            with open(file_path, "r", encoding="utf-8") as f:
-                content = f.read()
-            v = [SourceCode(module_name=f"##File: {file_path}", source_code=content)]
-        logger.info(f"Load file {file_path} in {time.time() - start_time}")
-        return v
-    except Exception as e:
-        logger.error(f"Error processing file {file_path}: {str(e)}")
-        return []
-def process_file2(file_info: Tuple[str, str, float]) -> List[SourceCode]:
-    start_time = time.time()
-    file_path, relative_path, _ = file_info
-    try:
-        if file_path.endswith(".pdf"):
-            with open(file_path, "rb") as f:
-                content = extract_text_from_pdf(f.read())
-            v = [SourceCode(module_name=file_path, source_code=content)]
         elif file_path.endswith(".docx"):
             with open(file_path, "rb") as f:
                 content = extract_text_from_docx(f.read())
-            v = [SourceCode(module_name=f"##File: {file_path}", source_code=content)]
+            v = [
+                SourceCode(
+                    module_name=f"##File: {file_path}",
+                    source_code=content,
+                    tokens=count_tokens_worker(content),
+                )
+            ]
         elif file_path.endswith(".xlsx") or file_path.endswith(".xls"):
             sheets = extract_text_from_excel(file_path)
             v = [
                 SourceCode(
                     module_name=f"##File: {file_path}#{sheet[0]}",
                     source_code=sheet[1],
+                    tokens=count_tokens_worker(sheet[1]),
                 )
                 for sheet in sheets
             ]
         elif file_path.endswith(".pptx"):
             slides = extract_text_from_ppt(file_path)
             content = "".join(f"#{slide[0]}\n{slide[1]}\n\n" for slide in slides)
-            v = [SourceCode(module_name=f"##File: {file_path}", source_code=content)]
+            v = [
+                SourceCode(
+                    module_name=f"##File: {file_path}",
+                    source_code=content,
+                    tokens=count_tokens_worker(content),
+                )
+            ]
         else:
             with open(file_path, "r", encoding="utf-8") as f:
                 content = f.read()
-            v = [SourceCode(module_name=f"##File: {file_path}", source_code=content)]
+            v = [
+                SourceCode(
+                    module_name=f"##File: {file_path}",
+                    source_code=content,
+                    tokens=count_tokens_worker(content),
+                )
+            ]
         logger.info(f"Load file {file_path} in {time.time() - start_time}")
         return v
     except Exception as e:
@@ -107,34 +102,59 @@ def process_file2(file_info: Tuple[str, str, float]) -> List[SourceCode]:
         return []
-def process_file3(file_path: str) -> List[SourceCode]:
+def process_file_local(file_path: str) -> List[SourceCode]:
     start_time = time.time()
     try:
         if file_path.endswith(".pdf"):
             with open(file_path, "rb") as f:
                 content = extract_text_from_pdf(f.read())
-            v = [SourceCode(module_name=file_path, source_code=content)]
+            v = [
+                SourceCode(
+                    module_name=file_path,
+                    source_code=content,
+                    tokens=count_tokens(content),
+                )
+            ]
         elif file_path.endswith(".docx"):
             with open(file_path, "rb") as f:
                 content = extract_text_from_docx(f.read())
-            v = [SourceCode(module_name=f"##File: {file_path}", source_code=content)]
+            v = [
+                SourceCode(
+                    module_name=f"##File: {file_path}",
+                    source_code=content,
+                    tokens=count_tokens(content),
+                )
+            ]
         elif file_path.endswith(".xlsx") or file_path.endswith(".xls"):
             sheets = extract_text_from_excel(file_path)
             v = [
                 SourceCode(
                     module_name=f"##File: {file_path}#{sheet[0]}",
                     source_code=sheet[1],
+                    tokens=count_tokens(sheet[1]),
                 )
                 for sheet in sheets
             ]
         elif file_path.endswith(".pptx"):
             slides = extract_text_from_ppt(file_path)
             content = "".join(f"#{slide[0]}\n{slide[1]}\n\n" for slide in slides)
-            v = [SourceCode(module_name=f"##File: {file_path}", source_code=content)]
+            v = [
+                SourceCode(
+                    module_name=f"##File: {file_path}",
+                    source_code=content,
+                    tokens=count_tokens(content),
+                )
+            ]
         else:
             with open(file_path, "r", encoding="utf-8") as f:
                 content = f.read()
-            v = [SourceCode(module_name=f"##File: {file_path}", source_code=content)]
+            v = [
+                SourceCode(
+                    module_name=f"##File: {file_path}",
+                    source_code=content,
+                    tokens=count_tokens(content),
+                )
+            ]
         logger.info(f"Load file {file_path} in {time.time() - start_time}")
         return v
     except Exception as e:
@@ -205,7 +225,7 @@ class AutoCoderRAGDocListener:
             self.update_cache(item)
     def update_cache(self, file_path):
-        source_code = process_file3(file_path)
+        source_code = process_file_local(file_path)
         self.cache[file_path] = {
             "file_path": file_path,
             "content": [c.model_dump() for c in source_code],
@@ -220,7 +240,9 @@ class AutoCoderRAGDocListener:
     def open_watch(self):
         logger.info(f"start monitor: {self.path}...")
-        for changes in watch(self.path, watch_filter=self.file_filter, stop_event=self.stop_event):
+        for changes in watch(
+            self.path, watch_filter=self.file_filter, stop_event=self.stop_event
+        ):
             for change in changes:
                 (action, path) = change
                 if action == Change.added or action == Change.modified:
@@ -290,7 +312,6 @@ class AutoCoderRAGAsyncUpdateQueue:
         self.thread.start()
         self.cache = self.read_cache()
     def _process_queue(self):
         while not self.stop_event.is_set():
             try:
@@ -324,8 +345,14 @@ class AutoCoderRAGAsyncUpdateQueue:
             # results = ray.get(
             #     [process_file.remote(file_info) for file_info in files_to_process]
             # )
-            with Pool(processes=os.cpu_count()) as pool:
-                results = pool.map(process_file2, files_to_process)
+            from autocoder.rag.token_counter import initialize_tokenizer
+            with Pool(
+                processes=os.cpu_count(),
+                initializer=initialize_tokenizer,
+                initargs=(variable_holder.TOKENIZER_PATH,),
+            ) as pool:
+                results = pool.map(process_file_in_multi_process, files_to_process)
             for file_info, result in zip(files_to_process, results):
                 self.update_cache(file_info, result)
@@ -365,7 +392,7 @@ class AutoCoderRAGAsyncUpdateQueue:
             elif isinstance(file_list, AddOrUpdateEvent):
                 for file_info in file_list.file_infos:
                     logger.info(f"{file_info[0]} is detected to be updated")
-                    result = process_file2(file_info)
+                    result = process_file_local(file_info)
                     self.update_cache(file_info, result)
             self.write_cache()
@@ -410,7 +437,9 @@ class AutoCoderRAGAsyncUpdateQueue:
                     # 释放文件锁
                     fcntl.flock(lockf, fcntl.LOCK_UN)
-    def update_cache(self, file_info: Tuple[str, str, float], content: List[SourceCode]):
+    def update_cache(
+        self, file_info: Tuple[str, str, float], content: List[SourceCode]
+    ):
         file_path, relative_path, modify_time = file_info
         self.cache[file_path] = {
             "file_path": file_path,
@@ -485,11 +514,20 @@ class DocumentRetriever:
         required_exts: list,
         on_ray: bool = False,
         monitor_mode: bool = False,
+        single_file_token_limit: int = 60000,
+        disable_auto_window: bool = False,
     ) -> None:
         self.path = path
         self.ignore_spec = ignore_spec
         self.required_exts = required_exts
         self.monitor_mode = monitor_mode
+        self.single_file_token_limit = single_file_token_limit
+        self.disable_auto_window = disable_auto_window
+        # 多小的文件会被合并
+        self.small_file_token_limit = self.single_file_token_limit / 4
+        # 合并后的最大文件大小
+        self.small_file_merge_limit = self.single_file_token_limit / 2
         self.on_ray = on_ray
         if self.on_ray:
@@ -502,6 +540,13 @@ class DocumentRetriever:
                     path, ignore_spec, required_exts
                 )
+        logger.info(f"DocumentRetriever initialized with:")
+        logger.info(f"  Path: {self.path}")
+        logger.info(f"  Diable auto window: {self.disable_auto_window} ")
+        logger.info(f"  Single file token limit: {self.single_file_token_limit}")
+        logger.info(f"  Small file token limit: {self.small_file_token_limit}")
+        logger.info(f"  Small file merge limit: {self.small_file_merge_limit}")
     def get_cache(self):
         if self.on_ray:
             return ray.get(self.cacher.get_cache.remote())
@@ -509,6 +554,102 @@ class DocumentRetriever:
             return self.cacher.get_cache()
     def retrieve_documents(self) -> Generator[SourceCode, None, None]:
+        logger.info("Starting document retrieval process")
+        waiting_list = []
+        waiting_tokens = 0
         for _, data in self.get_cache().items():
             for source_code in data["content"]:
-                yield SourceCode.model_validate(source_code)
+                doc = SourceCode.model_validate(source_code)
+                if self.disable_auto_window:
+                    yield doc
+                else:
+                    if doc.tokens <= 0:
+                        yield doc
+                    elif doc.tokens < self.small_file_token_limit:
+                        waiting_list, waiting_tokens = self._add_to_waiting_list(
+                            doc, waiting_list, waiting_tokens
+                        )
+                        if waiting_tokens >= self.small_file_merge_limit:
+                            yield from self._process_waiting_list(waiting_list)
+                            waiting_list = []
+                            waiting_tokens = 0
+                    elif doc.tokens > self.single_file_token_limit:
+                        yield from self._split_large_document(doc)
+                    else:
+                        yield doc
+        if waiting_list and not self.disable_auto_window:
+            yield from self._process_waiting_list(waiting_list)
+        logger.info("Document retrieval process completed")
+    def _add_to_waiting_list(
+        self, doc: SourceCode, waiting_list: List[SourceCode], waiting_tokens: int
+    ) -> Tuple[List[SourceCode], int]:
+        waiting_list.append(doc)
+        return waiting_list, waiting_tokens + doc.tokens
+    def _process_waiting_list(
+        self, waiting_list: List[SourceCode]
+    ) -> Generator[SourceCode, None, None]:
+        if len(waiting_list) == 1:
+            yield waiting_list[0]
+        elif len(waiting_list) > 1:
+            yield self._merge_documents(waiting_list)
+    def _merge_documents(self, docs: List[SourceCode]) -> SourceCode:
+        merged_content = "\n".join(
+            [f"#File: {doc.module_name}\n{doc.source_code}" for doc in docs]
+        )
+        merged_tokens = sum([doc.tokens for doc in docs])
+        merged_name = f"Merged_{len(docs)}_docs_{str(uuid4())}"
+        logger.info(
+            f"Merged {len(docs)} documents into {merged_name} (tokens: {merged_tokens})."
+        )
+        return SourceCode(
+            module_name=merged_name,
+            source_code=merged_content,
+            tokens=merged_tokens,
+            metadata={"original_docs": [doc.module_name for doc in docs]},
+        )
+    def _split_large_document(
+        self, doc: SourceCode
+    ) -> Generator[SourceCode, None, None]:
+        chunk_size = self.single_file_token_limit
+        total_chunks = (doc.tokens + chunk_size - 1) // chunk_size
+        logger.info(f"Splitting document {doc.module_name} into {total_chunks} chunks")
+        for i in range(0, doc.tokens, chunk_size):
+            chunk_content = doc.source_code[i : i + chunk_size]
+            chunk_tokens = min(chunk_size, doc.tokens - i)
+            chunk_name = f"{doc.module_name}#chunk{i//chunk_size+1}"
+            # logger.debug(f"  Created chunk: {chunk_name} (tokens: {chunk_tokens})")
+            yield SourceCode(
+                module_name=chunk_name,
+                source_code=chunk_content,
+                tokens=chunk_tokens,
+                metadata={
+                    "original_doc": doc.module_name,
+                    "chunk_index": i // chunk_size + 1,
+                },
+            )
+    def _split_document(
+        self, doc: SourceCode, token_limit: int
+    ) -> Generator[SourceCode, None, None]:
+        remaining_tokens = doc.tokens
+        chunk_number = 1
+        start_index = 0
+        while remaining_tokens > 0:
+            end_index = start_index + token_limit
+            chunk_content = doc.source_code[start_index:end_index]
+            chunk_tokens = min(token_limit, remaining_tokens)
+            chunk_name = f"{doc.module_name}#{chunk_number:06d}"
+            yield SourceCode(
+                module_name=chunk_name, source_code=chunk_content, tokens=chunk_tokens
+            )
+            start_index = end_index
+            remaining_tokens -= chunk_tokens
+            chunk_number += 1

auto-coder 0.1.173__tar.gz → 0.1.176__tar.gz

Potentially problematic release.

auto-coder 0.1.173__tar.gz → 0.1.176__tar.gz