auto-coder 0.1.172__py3-none-any.whl → 0.1.175__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {auto_coder-0.1.172.dist-info → auto_coder-0.1.175.dist-info}/METADATA +3 -1
- {auto_coder-0.1.172.dist-info → auto_coder-0.1.175.dist-info}/RECORD +26 -24
- autocoder/agent/designer.py +385 -0
- autocoder/auto_coder.py +32 -8
- autocoder/auto_coder_lang.py +2 -0
- autocoder/auto_coder_rag.py +41 -13
- autocoder/chat_auto_coder.py +144 -21
- autocoder/chat_auto_coder_lang.py +3 -0
- autocoder/command_args.py +12 -2
- autocoder/common/__init__.py +11 -1
- autocoder/common/command_completer.py +4 -0
- autocoder/common/command_generator.py +4 -5
- autocoder/lang.py +2 -0
- autocoder/pyproject/__init__.py +5 -1
- autocoder/rag/document_retriever.py +196 -55
- autocoder/rag/long_context_rag.py +80 -23
- autocoder/rag/token_counter.py +31 -9
- autocoder/rag/token_limiter.py +34 -9
- autocoder/rag/variable_holder.py +2 -0
- autocoder/suffixproject/__init__.py +5 -1
- autocoder/tsproject/__init__.py +5 -1
- autocoder/version.py +1 -1
- {auto_coder-0.1.172.dist-info → auto_coder-0.1.175.dist-info}/LICENSE +0 -0
- {auto_coder-0.1.172.dist-info → auto_coder-0.1.175.dist-info}/WHEEL +0 -0
- {auto_coder-0.1.172.dist-info → auto_coder-0.1.175.dist-info}/entry_points.txt +0 -0
- {auto_coder-0.1.172.dist-info → auto_coder-0.1.175.dist-info}/top_level.txt +0 -0
autocoder/rag/document_retriever.py
CHANGED

@@ -18,10 +18,15 @@ from loguru import logger
 from pydantic import BaseModel

 from autocoder.common import SourceCode
-from autocoder.rag.loaders import (
-
-
-
+from autocoder.rag.loaders import (
+    extract_text_from_docx,
+    extract_text_from_excel,
+    extract_text_from_pdf,
+    extract_text_from_ppt,
+)
+from autocoder.rag import variable_holder
+from autocoder.rag.token_counter import count_tokens_worker, count_tokens
+from uuid import uuid4

 cache_lock = threading.Lock()

@@ -34,72 +39,62 @@ class AddOrUpdateEvent(BaseModel):
     file_infos: List[Tuple[str, str, float]]


-
-
+def process_file_in_multi_process(
+    file_info: Tuple[str, str, float]
+) -> List[SourceCode]:
     start_time = time.time()
     file_path, relative_path, _ = file_info
     try:
         if file_path.endswith(".pdf"):
             with open(file_path, "rb") as f:
                 content = extract_text_from_pdf(f.read())
-            v = [SourceCode(module_name=file_path, source_code=content)]
-        elif file_path.endswith(".docx"):
-            with open(file_path, "rb") as f:
-                content = extract_text_from_docx(f.read())
-            v = [SourceCode(module_name=f"##File: {file_path}", source_code=content)]
-        elif file_path.endswith(".xlsx") or file_path.endswith(".xls"):
-            sheets = extract_text_from_excel(file_path)
             v = [
                 SourceCode(
-                    module_name=
-                    source_code=
+                    module_name=file_path,
+                    source_code=content,
+                    tokens=count_tokens_worker(content),
                 )
-                for sheet in sheets
             ]
-        elif file_path.endswith(".pptx"):
-            slides = extract_text_from_ppt(file_path)
-            content = "".join(f"#{slide[0]}\n{slide[1]}\n\n" for slide in slides)
-            v = [SourceCode(module_name=f"##File: {file_path}", source_code=content)]
-        else:
-            with open(file_path, "r", encoding="utf-8") as f:
-                content = f.read()
-            v = [SourceCode(module_name=f"##File: {file_path}", source_code=content)]
-        logger.info(f"Load file {file_path} in {time.time() - start_time}")
-        return v
-    except Exception as e:
-        logger.error(f"Error processing file {file_path}: {str(e)}")
-        return []
-
-
-def process_file2(file_info: Tuple[str, str, float]) -> List[SourceCode]:
-    start_time = time.time()
-    file_path, relative_path, _ = file_info
-    try:
-        if file_path.endswith(".pdf"):
-            with open(file_path, "rb") as f:
-                content = extract_text_from_pdf(f.read())
-            v = [SourceCode(module_name=file_path, source_code=content)]
         elif file_path.endswith(".docx"):
             with open(file_path, "rb") as f:
                 content = extract_text_from_docx(f.read())
-            v = [
+            v = [
+                SourceCode(
+                    module_name=f"##File: {file_path}",
+                    source_code=content,
+                    tokens=count_tokens_worker(content),
+                )
+            ]
         elif file_path.endswith(".xlsx") or file_path.endswith(".xls"):
             sheets = extract_text_from_excel(file_path)
             v = [
                 SourceCode(
                     module_name=f"##File: {file_path}#{sheet[0]}",
                     source_code=sheet[1],
+                    tokens=count_tokens_worker(sheet[1]),
                 )
                 for sheet in sheets
             ]
         elif file_path.endswith(".pptx"):
             slides = extract_text_from_ppt(file_path)
             content = "".join(f"#{slide[0]}\n{slide[1]}\n\n" for slide in slides)
-            v = [
+            v = [
+                SourceCode(
+                    module_name=f"##File: {file_path}",
+                    source_code=content,
+                    tokens=count_tokens_worker(content),
+                )
+            ]
         else:
             with open(file_path, "r", encoding="utf-8") as f:
                 content = f.read()
-            v = [
+            v = [
+                SourceCode(
+                    module_name=f"##File: {file_path}",
+                    source_code=content,
+                    tokens=count_tokens_worker(content),
+                )
+            ]
         logger.info(f"Load file {file_path} in {time.time() - start_time}")
         return v
     except Exception as e:
@@ -107,34 +102,59 @@ def process_file2(file_info: Tuple[str, str, float]) -> List[SourceCode]:
         return []


-def
+def process_file_local(file_path: str) -> List[SourceCode]:
     start_time = time.time()
     try:
         if file_path.endswith(".pdf"):
             with open(file_path, "rb") as f:
                 content = extract_text_from_pdf(f.read())
-            v = [
+            v = [
+                SourceCode(
+                    module_name=file_path,
+                    source_code=content,
+                    tokens=count_tokens(content),
+                )
+            ]
         elif file_path.endswith(".docx"):
             with open(file_path, "rb") as f:
                 content = extract_text_from_docx(f.read())
-            v = [
+            v = [
+                SourceCode(
+                    module_name=f"##File: {file_path}",
+                    source_code=content,
+                    tokens=count_tokens(content),
+                )
+            ]
         elif file_path.endswith(".xlsx") or file_path.endswith(".xls"):
             sheets = extract_text_from_excel(file_path)
             v = [
                 SourceCode(
                     module_name=f"##File: {file_path}#{sheet[0]}",
                     source_code=sheet[1],
+                    tokens=count_tokens(sheet[1]),
                 )
                 for sheet in sheets
             ]
         elif file_path.endswith(".pptx"):
             slides = extract_text_from_ppt(file_path)
             content = "".join(f"#{slide[0]}\n{slide[1]}\n\n" for slide in slides)
-            v = [
+            v = [
+                SourceCode(
+                    module_name=f"##File: {file_path}",
+                    source_code=content,
+                    tokens=count_tokens(content),
+                )
+            ]
         else:
             with open(file_path, "r", encoding="utf-8") as f:
                 content = f.read()
-            v = [
+            v = [
+                SourceCode(
+                    module_name=f"##File: {file_path}",
+                    source_code=content,
+                    tokens=count_tokens(content),
+                )
+            ]
         logger.info(f"Load file {file_path} in {time.time() - start_time}")
         return v
     except Exception as e:
@@ -205,7 +225,7 @@ class AutoCoderRAGDocListener:
             self.update_cache(item)

     def update_cache(self, file_path):
-        source_code =
+        source_code = process_file_local(file_path)
         self.cache[file_path] = {
             "file_path": file_path,
             "content": [c.model_dump() for c in source_code],
@@ -220,7 +240,9 @@ class AutoCoderRAGDocListener:

     def open_watch(self):
         logger.info(f"start monitor: {self.path}...")
-        for changes in watch(
+        for changes in watch(
+            self.path, watch_filter=self.file_filter, stop_event=self.stop_event
+        ):
             for change in changes:
                 (action, path) = change
                 if action == Change.added or action == Change.modified:
@@ -290,7 +312,6 @@ class AutoCoderRAGAsyncUpdateQueue:
         self.thread.start()
         self.cache = self.read_cache()

-
     def _process_queue(self):
         while not self.stop_event.is_set():
             try:
@@ -324,8 +345,14 @@ class AutoCoderRAGAsyncUpdateQueue:
         # results = ray.get(
         #     [process_file.remote(file_info) for file_info in files_to_process]
         # )
-
-
+        from autocoder.rag.token_counter import initialize_tokenizer
+
+        with Pool(
+            processes=os.cpu_count(),
+            initializer=initialize_tokenizer,
+            initargs=(variable_holder.TOKENIZER_PATH,),
+        ) as pool:
+            results = pool.map(process_file_in_multi_process, files_to_process)

         for file_info, result in zip(files_to_process, results):
             self.update_cache(file_info, result)
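Note on the hunk above: the commented-out Ray call is replaced by a local multiprocessing.Pool whose initializer loads the tokenizer once per worker process, so process_file_in_multi_process can call count_tokens_worker without reloading the tokenizer for every file. Below is a minimal standalone sketch of that initializer pattern; the names init_worker and handle_task are hypothetical and not part of auto-coder.

import os
from multiprocessing import Pool

_model = None  # per-process state, analogous to tokenizer_model in token_counter.py

def init_worker(model_path: str) -> None:
    # Runs once in each worker process, before it receives any task.
    global _model
    _model = f"model loaded from {model_path}"  # stand-in for Tokenizer.from_file(...)

def handle_task(item: str) -> str:
    # Tasks reuse the per-process _model instead of reloading it each time.
    return f"{_model}: processed {item}"

if __name__ == "__main__":
    with Pool(processes=os.cpu_count(), initializer=init_worker,
              initargs=("tokenizer.json",)) as pool:  # hypothetical path
        print(pool.map(handle_task, ["a.md", "b.pdf"]))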
@@ -365,7 +392,7 @@ class AutoCoderRAGAsyncUpdateQueue:
         elif isinstance(file_list, AddOrUpdateEvent):
             for file_info in file_list.file_infos:
                 logger.info(f"{file_info[0]} is detected to be updated")
-                result =
+                result = process_file_local(file_info)
                 self.update_cache(file_info, result)

         self.write_cache()
@@ -410,7 +437,9 @@ class AutoCoderRAGAsyncUpdateQueue:
         # Release the file lock
         fcntl.flock(lockf, fcntl.LOCK_UN)

-    def update_cache(
+    def update_cache(
+        self, file_info: Tuple[str, str, float], content: List[SourceCode]
+    ):
         file_path, relative_path, modify_time = file_info
         self.cache[file_path] = {
             "file_path": file_path,
@@ -485,11 +514,20 @@ class DocumentRetriever:
         required_exts: list,
         on_ray: bool = False,
         monitor_mode: bool = False,
+        single_file_token_limit: int = 60000,
+        disable_auto_window: bool = False,
     ) -> None:
         self.path = path
         self.ignore_spec = ignore_spec
         self.required_exts = required_exts
         self.monitor_mode = monitor_mode
+        self.single_file_token_limit = single_file_token_limit
+        self.disable_auto_window = disable_auto_window
+
+        # Files smaller than this will be merged
+        self.small_file_token_limit = self.single_file_token_limit / 4
+        # Maximum size of a merged file
+        self.small_file_merge_limit = self.single_file_token_limit / 2

         self.on_ray = on_ray
         if self.on_ray:
@@ -502,6 +540,13 @@ class DocumentRetriever:
             path, ignore_spec, required_exts
         )

+        logger.info(f"DocumentRetriever initialized with:")
+        logger.info(f"  Path: {self.path}")
+        logger.info(f"  Diable auto window: {self.disable_auto_window} ")
+        logger.info(f"  Single file token limit: {self.single_file_token_limit}")
+        logger.info(f"  Small file token limit: {self.small_file_token_limit}")
+        logger.info(f"  Small file merge limit: {self.small_file_merge_limit}")
+
     def get_cache(self):
         if self.on_ray:
             return ray.get(self.cacher.get_cache.remote())
@@ -509,6 +554,102 @@ class DocumentRetriever:
             return self.cacher.get_cache()

     def retrieve_documents(self) -> Generator[SourceCode, None, None]:
+        logger.info("Starting document retrieval process")
+        waiting_list = []
+        waiting_tokens = 0
         for _, data in self.get_cache().items():
             for source_code in data["content"]:
-
+                doc = SourceCode.model_validate(source_code)
+                if self.disable_auto_window:
+                    yield doc
+                else:
+                    if doc.tokens <= 0:
+                        yield doc
+                    elif doc.tokens < self.small_file_token_limit:
+                        waiting_list, waiting_tokens = self._add_to_waiting_list(
+                            doc, waiting_list, waiting_tokens
+                        )
+                        if waiting_tokens >= self.small_file_merge_limit:
+                            yield from self._process_waiting_list(waiting_list)
+                            waiting_list = []
+                            waiting_tokens = 0
+                    elif doc.tokens > self.single_file_token_limit:
+                        yield from self._split_large_document(doc)
+                    else:
+                        yield doc
+        if waiting_list and not self.disable_auto_window:
+            yield from self._process_waiting_list(waiting_list)
+
+        logger.info("Document retrieval process completed")
+
+    def _add_to_waiting_list(
+        self, doc: SourceCode, waiting_list: List[SourceCode], waiting_tokens: int
+    ) -> Tuple[List[SourceCode], int]:
+        waiting_list.append(doc)
+        return waiting_list, waiting_tokens + doc.tokens
+
+    def _process_waiting_list(
+        self, waiting_list: List[SourceCode]
+    ) -> Generator[SourceCode, None, None]:
+        if len(waiting_list) == 1:
+            yield waiting_list[0]
+        elif len(waiting_list) > 1:
+            yield self._merge_documents(waiting_list)
+
+    def _merge_documents(self, docs: List[SourceCode]) -> SourceCode:
+        merged_content = "\n".join(
+            [f"#File: {doc.module_name}\n{doc.source_code}" for doc in docs]
+        )
+        merged_tokens = sum([doc.tokens for doc in docs])
+        merged_name = f"Merged_{len(docs)}_docs_{str(uuid4())}"
+        logger.info(
+            f"Merged {len(docs)} documents into {merged_name} (tokens: {merged_tokens})."
+        )
+        return SourceCode(
+            module_name=merged_name,
+            source_code=merged_content,
+            tokens=merged_tokens,
+            metadata={"original_docs": [doc.module_name for doc in docs]},
+        )
+
+    def _split_large_document(
+        self, doc: SourceCode
+    ) -> Generator[SourceCode, None, None]:
+        chunk_size = self.single_file_token_limit
+        total_chunks = (doc.tokens + chunk_size - 1) // chunk_size
+        logger.info(f"Splitting document {doc.module_name} into {total_chunks} chunks")
+        for i in range(0, doc.tokens, chunk_size):
+            chunk_content = doc.source_code[i : i + chunk_size]
+            chunk_tokens = min(chunk_size, doc.tokens - i)
+            chunk_name = f"{doc.module_name}#chunk{i//chunk_size+1}"
+            # logger.debug(f"  Created chunk: {chunk_name} (tokens: {chunk_tokens})")
+            yield SourceCode(
+                module_name=chunk_name,
+                source_code=chunk_content,
+                tokens=chunk_tokens,
+                metadata={
+                    "original_doc": doc.module_name,
+                    "chunk_index": i // chunk_size + 1,
+                },
+            )
+
+    def _split_document(
+        self, doc: SourceCode, token_limit: int
+    ) -> Generator[SourceCode, None, None]:
+        remaining_tokens = doc.tokens
+        chunk_number = 1
+        start_index = 0
+
+        while remaining_tokens > 0:
+            end_index = start_index + token_limit
+            chunk_content = doc.source_code[start_index:end_index]
+            chunk_tokens = min(token_limit, remaining_tokens)
+
+            chunk_name = f"{doc.module_name}#{chunk_number:06d}"
+            yield SourceCode(
+                module_name=chunk_name, source_code=chunk_content, tokens=chunk_tokens
+            )
+
+            start_index = end_index
+            remaining_tokens -= chunk_tokens
+            chunk_number += 1
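The retrieve_documents rewrite above implements an "auto window" policy: documents under small_file_token_limit are buffered and merged once the buffer reaches small_file_merge_limit, documents over single_file_token_limit are split into chunks, and everything in between is yielded unchanged. A rough standalone sketch of that policy follows, using (name, tokens) pairs instead of SourceCode; the limits are derived from the 60000-token default shown above, and integer division is used for simplicity.

from typing import Iterator, List, Tuple

SINGLE_FILE_LIMIT = 60000                  # default single_file_token_limit above
SMALL_FILE_LIMIT = SINGLE_FILE_LIMIT // 4  # files below this are candidates for merging
MERGE_LIMIT = SINGLE_FILE_LIMIT // 2       # flush the merge buffer once it reaches this

Doc = Tuple[str, int]  # (name, token count) stand-in for SourceCode

def auto_window(docs: List[Doc]) -> Iterator[Doc]:
    waiting: List[Doc] = []
    waiting_tokens = 0
    for name, tokens in docs:
        if tokens < SMALL_FILE_LIMIT:
            waiting.append((name, tokens))
            waiting_tokens += tokens
            if waiting_tokens >= MERGE_LIMIT:
                yield (f"Merged_{len(waiting)}_docs", waiting_tokens)
                waiting, waiting_tokens = [], 0
        elif tokens > SINGLE_FILE_LIMIT:
            # Split into ceil(tokens / SINGLE_FILE_LIMIT) chunks, like _split_large_document.
            for i in range(0, tokens, SINGLE_FILE_LIMIT):
                yield (f"{name}#chunk{i // SINGLE_FILE_LIMIT + 1}",
                       min(SINGLE_FILE_LIMIT, tokens - i))
        else:
            yield (name, tokens)
    if waiting:  # flush whatever is still buffered at the end
        yield (f"Merged_{len(waiting)}_docs", waiting_tokens)

print(list(auto_window([("a.md", 14000), ("b.md", 14000), ("c.md", 5000), ("big.md", 130000)])))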
autocoder/rag/long_context_rag.py
CHANGED

@@ -13,16 +13,22 @@ from openai import OpenAI
 from rich.console import Console
 from rich.panel import Panel
 from rich.table import Table
-
+import statistics

 from autocoder.common import AutoCoderArgs, SourceCode
 from autocoder.rag.doc_filter import DocFilter
 from autocoder.rag.document_retriever import DocumentRetriever
-from autocoder.rag.relevant_utils import (
-
+from autocoder.rag.relevant_utils import (
+    DocRelevance,
+    FilterDoc,
+    TaskTiming,
+    parse_relevance,
+)
 from autocoder.rag.token_checker import check_token_limit
 from autocoder.rag.token_counter import RemoteTokenCounter, TokenCounter
 from autocoder.rag.token_limiter import TokenLimiter
+from tokenizers import Tokenizer
+from autocoder.rag import variable_holder


 class LongContextRAG:
@@ -44,11 +50,26 @@ class LongContextRAG:
         self.path = path
         self.relevant_score = self.args.rag_doc_filter_relevance or 5

+        self.full_text_ratio = args.full_text_ratio
+        self.segment_ratio = args.segment_ratio
+        self.buff_ratio = 1 - self.full_text_ratio - self.segment_ratio
+
+        if self.buff_ratio < 0:
+            raise ValueError(
+                "The sum of full_text_ratio and segment_ratio must be less than or equal to 1.0"
+            )
+
+        self.full_text_limit = int(args.rag_context_window_limit * self.full_text_ratio)
+        self.segment_limit = int(args.rag_context_window_limit * self.segment_ratio)
+        self.buff_limit = int(args.rag_context_window_limit * self.buff_ratio)
+
         self.tokenizer = None
         self.tokenizer_path = tokenizer_path
         self.on_ray = False

         if self.tokenizer_path:
+            variable_holder.TOKENIZER_PATH = self.tokenizer_path
+            variable_holder.TOKENIZER_MODEL = Tokenizer.from_file(self.tokenizer_path)
             self.tokenizer = TokenCounter(self.tokenizer_path)
         else:
             if llm.is_model_exist("deepseek_tokenizer"):
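The constructor changes above split the RAG context window into a full-text zone, a segment zone, and a buffer zone by ratio, and reject configurations whose ratios sum to more than 1. A small worked example with hypothetical values (the window size and ratios below are illustrative, not the package defaults):

rag_context_window_limit = 120_000  # hypothetical window size
full_text_ratio = 0.5               # hypothetical
segment_ratio = 0.25                # hypothetical
buff_ratio = 1 - full_text_ratio - segment_ratio  # 0.25; must not go negative

full_text_limit = int(rag_context_window_limit * full_text_ratio)  # 60000
segment_limit = int(rag_context_window_limit * segment_ratio)      # 30000
buff_limit = int(rag_context_window_limit * buff_ratio)            # 30000

# DocumentRetriever is then given single_file_token_limit = full_text_limit - 100,
# so at least one whole file can always fit in the full-text zone.
print(full_text_limit, segment_limit, buff_limit)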
@@ -96,24 +117,41 @@ class LongContextRAG:
             self.required_exts,
             self.on_ray,
             self.monitor_mode,
+            ## Make sure the full-text zone can hold at least one file
+            single_file_token_limit=self.full_text_limit - 100,
+            disable_auto_window=self.args.disable_auto_window
         )

         self.doc_filter = DocFilter(
             self.index_model, self.args, on_ray=self.on_ray, path=self.path
         )
-
-
-
-
-
-
-
-
-
-
+
+        doc_num = 0
+        token_num = 0
+        token_counts = []
+        for doc in self._retrieve_documents():
+            doc_num += 1
+            doc_tokens = doc.tokens
+            token_num += doc_tokens
+            token_counts.append(doc_tokens)
+
+        avg_tokens = statistics.mean(token_counts) if token_counts else 0
+        median_tokens = statistics.median(token_counts) if token_counts else 0

         logger.info(
-
+            "RAG Configuration:\n"
+            f"  Total docs: {doc_num}\n"
+            f"  Total tokens: {token_num}\n"
+            f"  Tokenizer path: {self.tokenizer_path}\n"
+            f"  Relevant score: {self.relevant_score}\n"
+            f"  Token limit: {self.token_limit}\n"
+            f"  Full text limit: {self.full_text_limit}\n"
+            f"  Segment limit: {self.segment_limit}\n"
+            f"  Buff limit: {self.buff_limit}\n"
+            f"  Max doc tokens: {max(token_counts) if token_counts else 0}\n"
+            f"  Min doc tokens: {min(token_counts) if token_counts else 0}\n"
+            f"  Avg doc tokens: {avg_tokens:.2f}\n"
+            f"  Median doc tokens: {median_tokens:.2f}\n"
         )

     def count_tokens(self, text: str) -> int:
@@ -350,9 +388,15 @@ class LongContextRAG:
             query_table.add_row("Relevant docs", str(len(relevant_docs)))

             # Add relevant docs information
-            relevant_docs_info =
-
-
+            relevant_docs_info = []
+            for doc in relevant_docs:
+                info = f"- {doc.module_name.replace(self.path,'',1)}"
+                if 'original_docs' in doc.metadata:
+                    original_docs = ", ".join([doc.replace(self.path,"",1) for doc in doc.metadata['original_docs']])
+                    info += f" (Original docs: {original_docs})"
+                relevant_docs_info.append(info)
+
+            relevant_docs_info = "\n".join(relevant_docs_info)
             query_table.add_row("Relevant docs list", relevant_docs_info)

             first_round_full_docs = []
@@ -363,7 +407,9 @@ class LongContextRAG:

         token_limiter = TokenLimiter(
             count_tokens=self.count_tokens,
-
+            full_text_limit=self.full_text_limit,
+            segment_limit=self.segment_limit,
+            buff_limit=self.buff_limit,
             llm=self.llm,
         )
         final_relevant_docs = token_limiter.limit_tokens(
@@ -395,9 +441,18 @@ class LongContextRAG:
             )

             # Add relevant docs information
-            final_relevant_docs_info =
-
-
+            final_relevant_docs_info = []
+            for doc in relevant_docs:
+                info = f"- {doc.module_name.replace(self.path,'',1)}"
+                if 'original_docs' in doc.metadata:
+                    original_docs = ", ".join([doc.replace(self.path,"",1) for doc in doc.metadata['original_docs']])
+                    info += f" (Original docs: {original_docs})"
+                if "chunk_ranges" in doc.metadata:
+                    chunk_ranges = json.dumps(doc.metadata['chunk_ranges'],ensure_ascii=False)
+                    info += f" (Chunk ranges: {chunk_ranges})"
+                final_relevant_docs_info.append(info)
+
+            final_relevant_docs_info = "\n".join(final_relevant_docs_info)
             query_table.add_row("Final Relevant docs list", final_relevant_docs_info)

             # Create a panel to contain the table
@@ -409,8 +464,10 @@ class LongContextRAG:

             # Log the panel using rich
             console.print(panel)
-
-
+
+        request_tokens = sum([doc.tokens for doc in relevant_docs])
+        target_model = model or self.llm.default_model_name
+        logger.info(f"Start to send to model {target_model} with {request_tokens} tokens")

         new_conversations = conversations[:-1] + [
             {
autocoder/rag/token_counter.py
CHANGED
@@ -2,29 +2,46 @@ import time
 from loguru import logger
 from tokenizers import Tokenizer
 from multiprocessing import Pool, cpu_count
+from autocoder.rag.variable_holder import TOKENIZER_MODEL
+

 class RemoteTokenCounter:
-    def __init__(self,tokenizer) -> None:
+    def __init__(self, tokenizer) -> None:
         self.tokenizer = tokenizer

-    def count_tokens(self, text: str) -> int:
-        try:
+    def count_tokens(self, text: str) -> int:
+        try:
             v = self.tokenizer.chat_oai(
                 conversations=[{"role": "user", "content": text}]
-            )
+            )
             return int(v[0].output)
         except Exception as e:
             logger.error(f"Error counting tokens: {str(e)}")
             return -1
-
+
+
 def initialize_tokenizer(tokenizer_path):
-    global tokenizer_model
+    global tokenizer_model
     tokenizer_model = Tokenizer.from_file(tokenizer_path)

+
+def count_tokens(text: str) -> int:
+    try:
+        # start_time = time.time_ns()
+        encoded = TOKENIZER_MODEL.encode('{"role":"user","content":"' + text + '"}')
+        v = len(encoded.ids)
+        # elapsed_time = time.time_ns() - start_time
+        # logger.info(f"Token counting took {elapsed_time/1000000} ms")
+        return v
+    except Exception as e:
+        logger.error(f"Error counting tokens: {str(e)}")
+        return -1
+
+
 def count_tokens_worker(text: str) -> int:
     try:
         # start_time = time.time_ns()
-        encoded = tokenizer_model.encode('{"role":"user","content":"'+text+'"}')
+        encoded = tokenizer_model.encode('{"role":"user","content":"' + text + '"}')
         v = len(encoded.ids)
         # elapsed_time = time.time_ns() - start_time
         # logger.info(f"Token counting took {elapsed_time/1000000} ms")
@@ -33,11 +50,16 @@ def count_tokens_worker(text: str) -> int:
         logger.error(f"Error counting tokens: {str(e)}")
         return -1

+
 class TokenCounter:
     def __init__(self, tokenizer_path: str):
         self.tokenizer_path = tokenizer_path
         self.num_processes = cpu_count() - 1 if cpu_count() > 1 else 1
-        self.pool = Pool(
+        self.pool = Pool(
+            processes=self.num_processes,
+            initializer=initialize_tokenizer,
+            initargs=(self.tokenizer_path,),
+        )

     def count_tokens(self, text: str) -> int:
-        return self.pool.apply(count_tokens_worker, (text,))
+        return self.pool.apply(count_tokens_worker, (text,))