auto-coder 0.1.173__tar.gz → 0.1.176__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of auto-coder might be problematic. Click here for more details.

Files changed (114)
  1. {auto-coder-0.1.173 → auto-coder-0.1.176}/PKG-INFO +1 -1
  2. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/auto_coder.egg-info/PKG-INFO +1 -1
  3. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/auto_coder.egg-info/SOURCES.txt +1 -0
  4. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/auto_coder.py +8 -9
  5. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/auto_coder_rag.py +46 -13
  6. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/common/__init__.py +11 -3
  7. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/pyproject/__init__.py +5 -1
  8. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/rag/document_retriever.py +196 -55
  9. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/rag/long_context_rag.py +81 -23
  10. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/rag/token_counter.py +31 -9
  11. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/rag/token_limiter.py +66 -13
  12. auto-coder-0.1.176/src/autocoder/rag/variable_holder.py +2 -0
  13. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/suffixproject/__init__.py +5 -1
  14. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/tsproject/__init__.py +5 -1
  15. auto-coder-0.1.176/src/autocoder/version.py +1 -0
  16. auto-coder-0.1.173/src/autocoder/version.py +0 -1
  17. {auto-coder-0.1.173 → auto-coder-0.1.176}/LICENSE +0 -0
  18. {auto-coder-0.1.173 → auto-coder-0.1.176}/README.md +0 -0
  19. {auto-coder-0.1.173 → auto-coder-0.1.176}/setup.cfg +0 -0
  20. {auto-coder-0.1.173 → auto-coder-0.1.176}/setup.py +0 -0
  21. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/auto_coder.egg-info/dependency_links.txt +0 -0
  22. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/auto_coder.egg-info/entry_points.txt +0 -0
  23. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/auto_coder.egg-info/requires.txt +0 -0
  24. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/auto_coder.egg-info/top_level.txt +0 -0
  25. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/__init__.py +0 -0
  26. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/agent/__init__.py +0 -0
  27. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/agent/auto_tool.py +0 -0
  28. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/agent/coder.py +0 -0
  29. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/agent/designer.py +0 -0
  30. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/agent/planner.py +0 -0
  31. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/agent/project_reader.py +0 -0
  32. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/auto_coder_lang.py +0 -0
  33. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/auto_coder_server.py +0 -0
  34. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/chat/__init__.py +0 -0
  35. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/chat_auto_coder.py +0 -0
  36. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/chat_auto_coder_lang.py +0 -0
  37. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/command_args.py +0 -0
  38. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/common/JupyterClient.py +0 -0
  39. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/common/ShellClient.py +0 -0
  40. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/common/anything2images.py +0 -0
  41. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/common/audio.py +0 -0
  42. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/common/cleaner.py +0 -0
  43. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/common/code_auto_execute.py +0 -0
  44. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/common/code_auto_generate.py +0 -0
  45. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/common/code_auto_generate_diff.py +0 -0
  46. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/common/code_auto_generate_editblock.py +0 -0
  47. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/common/code_auto_generate_strict_diff.py +0 -0
  48. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/common/code_auto_merge.py +0 -0
  49. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/common/code_auto_merge_diff.py +0 -0
  50. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/common/code_auto_merge_editblock.py +0 -0
  51. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/common/code_auto_merge_strict_diff.py +0 -0
  52. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/common/command_completer.py +0 -0
  53. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/common/command_generator.py +0 -0
  54. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/common/command_templates.py +0 -0
  55. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/common/const.py +0 -0
  56. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/common/git_utils.py +0 -0
  57. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/common/image_to_page.py +0 -0
  58. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/common/interpreter.py +0 -0
  59. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/common/llm_rerank.py +0 -0
  60. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/common/screenshots.py +0 -0
  61. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/common/search.py +0 -0
  62. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/common/search_replace.py +0 -0
  63. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/common/text.py +0 -0
  64. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/common/types.py +0 -0
  65. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/db/__init__.py +0 -0
  66. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/db/store.py +0 -0
  67. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/dispacher/__init__.py +0 -0
  68. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/dispacher/actions/__init__.py +0 -0
  69. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/dispacher/actions/action.py +0 -0
  70. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/dispacher/actions/copilot.py +0 -0
  71. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/dispacher/actions/plugins/__init__.py +0 -0
  72. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/dispacher/actions/plugins/action_regex_project.py +0 -0
  73. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/dispacher/actions/plugins/action_translate.py +0 -0
  74. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/index/__init__.py +0 -0
  75. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/index/for_command.py +0 -0
  76. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/index/index.py +0 -0
  77. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/index/symbols_utils.py +0 -0
  78. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/lang.py +0 -0
  79. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/rag/__init__.py +0 -0
  80. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/rag/api_server.py +0 -0
  81. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/rag/doc_filter.py +0 -0
  82. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/rag/llm_wrapper.py +0 -0
  83. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/rag/loaders/__init__.py +0 -0
  84. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/rag/loaders/docx_loader.py +0 -0
  85. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/rag/loaders/excel_loader.py +0 -0
  86. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/rag/loaders/pdf_loader.py +0 -0
  87. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/rag/loaders/ppt_loader.py +0 -0
  88. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/rag/rag_config.py +0 -0
  89. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/rag/rag_entry.py +0 -0
  90. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/rag/raw_rag.py +0 -0
  91. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/rag/relevant_utils.py +0 -0
  92. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/rag/simple_directory_reader.py +0 -0
  93. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/rag/simple_rag.py +0 -0
  94. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/rag/token_checker.py +0 -0
  95. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/rag/types.py +0 -0
  96. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/regexproject/__init__.py +0 -0
  97. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/utils/__init__.py +0 -0
  98. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/utils/conversation_store.py +0 -0
  99. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/utils/llm_client_interceptors.py +0 -0
  100. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/utils/log_capture.py +0 -0
  101. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/utils/multi_turn.py +0 -0
  102. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/utils/print_table.py +0 -0
  103. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/utils/queue_communicate.py +0 -0
  104. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/utils/request_event_queue.py +0 -0
  105. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/utils/request_queue.py +0 -0
  106. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/utils/rest.py +0 -0
  107. {auto-coder-0.1.173 → auto-coder-0.1.176}/src/autocoder/utils/tests.py +0 -0
  108. {auto-coder-0.1.173 → auto-coder-0.1.176}/tests/test_action_regex_project.py +0 -0
  109. {auto-coder-0.1.173 → auto-coder-0.1.176}/tests/test_chat_auto_coder.py +0 -0
  110. {auto-coder-0.1.173 → auto-coder-0.1.176}/tests/test_code_auto_merge_editblock.py +0 -0
  111. {auto-coder-0.1.173 → auto-coder-0.1.176}/tests/test_command_completer.py +0 -0
  112. {auto-coder-0.1.173 → auto-coder-0.1.176}/tests/test_planner.py +0 -0
  113. {auto-coder-0.1.173 → auto-coder-0.1.176}/tests/test_queue_communicate.py +0 -0
  114. {auto-coder-0.1.173 → auto-coder-0.1.176}/tests/test_symbols_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: auto-coder
3
- Version: 0.1.173
3
+ Version: 0.1.176
4
4
  Summary: AutoCoder: AutoCoder
5
5
  Author: allwefantasy
6
6
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: auto-coder
3
- Version: 0.1.173
3
+ Version: 0.1.176
4
4
  Summary: AutoCoder: AutoCoder
5
5
  Author: allwefantasy
6
6
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
@@ -82,6 +82,7 @@ src/autocoder/rag/token_checker.py
82
82
  src/autocoder/rag/token_counter.py
83
83
  src/autocoder/rag/token_limiter.py
84
84
  src/autocoder/rag/types.py
85
+ src/autocoder/rag/variable_holder.py
85
86
  src/autocoder/rag/loaders/__init__.py
86
87
  src/autocoder/rag/loaders/docx_loader.py
87
88
  src/autocoder/rag/loaders/excel_loader.py
@@ -813,15 +813,7 @@ def main(input_args: Optional[List[str]] = None):
813
813
  llm, args, code_auto_execute.Mode.SINGLE_ROUND
814
814
  )
815
815
  executor.run(query=args.query, context=s, source_code="")
816
- return
817
- elif raw_args.agent_command == "chat":
818
- from autocoder.rag.rag_entry import RAGFactory
819
-
820
- rag = RAGFactory.get_rag(llm=llm, args=args, path="")
821
- rag.stream_chat_repl(args.query)
822
- return
823
-
824
-
816
+ return
825
817
  elif raw_args.doc_command == "serve":
826
818
 
827
819
  from autocoder.rag.llm_wrapper import LLWrapper
@@ -846,6 +838,13 @@ def main(input_args: Optional[List[str]] = None):
846
838
  llm_wrapper = LLWrapper(llm=llm, rag=rag)
847
839
  serve(llm=llm_wrapper, args=server_args)
848
840
  return
841
+
842
+ elif raw_args.doc_command == "chat":
843
+ from autocoder.rag.rag_entry import RAGFactory
844
+
845
+ rag = RAGFactory.get_rag(llm=llm, args=args, path="")
846
+ rag.stream_chat_repl(args.query)
847
+ return
849
848
 
850
849
  else:
851
850
  http_doc = HttpDoc(args=args, llm=llm, urls=None)
@@ -18,7 +18,7 @@ from rich.console import Console
18
18
  from rich.table import Table
19
19
  import os
20
20
 
21
- from autocoder.rag.document_retriever import process_file3
21
+ from autocoder.rag.document_retriever import process_file_local
22
22
  from autocoder.rag.token_counter import TokenCounter
23
23
 
24
24
  if platform.system() == "Windows":
@@ -90,16 +90,24 @@ def initialize_system():
90
90
 
91
91
  if choice == "1":
92
92
  print_status(get_message("deploying_model").format("Deepseek官方"), "")
93
-
93
+
94
94
  deploy_cmd = [
95
- "byzerllm", "deploy",
96
- "--pretrained_model_type", "saas/openai",
97
- "--cpus_per_worker", "0.001",
98
- "--gpus_per_worker", "0",
99
- "--worker_concurrency", "1000",
100
- "--num_workers", "1",
101
- "--infer_params", f"saas.base_url=https://api.deepseek.com/v1 saas.api_key={api_key} saas.model=deepseek-chat",
102
- "--model", "deepseek_chat"
95
+ "byzerllm",
96
+ "deploy",
97
+ "--pretrained_model_type",
98
+ "saas/openai",
99
+ "--cpus_per_worker",
100
+ "0.001",
101
+ "--gpus_per_worker",
102
+ "0",
103
+ "--worker_concurrency",
104
+ "1000",
105
+ "--num_workers",
106
+ "1",
107
+ "--infer_params",
108
+ f"saas.base_url=https://api.deepseek.com/v1 saas.api_key={api_key} saas.model=deepseek-chat",
109
+ "--model",
110
+ "deepseek_chat",
103
111
  ]
104
112
 
105
113
  try:
@@ -138,7 +146,9 @@ def main(input_args: Optional[List[str]] = None):
138
146
 
139
147
  # Serve command
140
148
  serve_parser = subparsers.add_parser("serve", help="Start the RAG server")
141
- serve_parser.add_argument("--quick", action="store_true", help="Skip system initialization")
149
+ serve_parser.add_argument(
150
+ "--quick", action="store_true", help="Skip system initialization"
151
+ )
142
152
  serve_parser.add_argument("--file", default="", help=desc["file"])
143
153
  serve_parser.add_argument("--model", default="deepseek_chat", help=desc["model"])
144
154
  serve_parser.add_argument("--index_model", default="", help=desc["index_model"])
@@ -160,7 +170,19 @@ def main(input_args: Optional[List[str]] = None):
160
170
  "--rag_context_window_limit",
161
171
  type=int,
162
172
  default=110000,
163
- help="",
173
+ help="The input context window limit for RAG",
174
+ )
175
+ serve_parser.add_argument(
176
+ "--full_text_ratio",
177
+ type=float,
178
+ default=0.7,
179
+ help="The ratio of full text area in the input context window (0.0 to 1.0)",
180
+ )
181
+ serve_parser.add_argument(
182
+ "--segment_ratio",
183
+ type=float,
184
+ default=0.2,
185
+ help="The ratio of segment area in the input context window (0.0 to 1.0)",
164
186
  )
165
187
  serve_parser.add_argument(
166
188
  "--required_exts", default="", help=desc["doc_build_parse_required_exts"]
@@ -198,6 +220,17 @@ def main(input_args: Optional[List[str]] = None):
198
220
  help="Monitor mode for the doc update",
199
221
  )
200
222
 
223
+ serve_parser.add_argument(
224
+ "--disable_auto_window",
225
+ action="store_true",
226
+ help="Disable automatic window adaptation for documents",
227
+ )
228
+ serve_parser.add_argument(
229
+ "--disable_segment_reorder",
230
+ action="store_true",
231
+ help="Disable reordering of document segments after retrieval",
232
+ )
233
+
201
234
  # Tools command
202
235
  tools_parser = subparsers.add_parser("tools", help="Various tools")
203
236
  tools_subparsers = tools_parser.add_subparsers(dest="tool", help="Available tools")
@@ -255,7 +288,7 @@ def main(input_args: Optional[List[str]] = None):
255
288
 
256
289
  def count_tokens(tokenizer_path: str, file_path: str):
257
290
  token_counter = TokenCounter(tokenizer_path)
258
- source_codes = process_file3(file_path)
291
+ source_codes = process_file_local(file_path)
259
292
 
260
293
  console = Console()
261
294
  table = Table(title="Token Count Results")
@@ -11,6 +11,8 @@ class SourceCode(pydantic.BaseModel):
11
11
  module_name: str
12
12
  source_code: str
13
13
  tag: str = ""
14
+ tokens: int = -1
15
+ metadata: Dict[str, Any] = {}
14
16
 
15
17
 
16
18
  class TranslateReadme(pydantic.BaseModel):
@@ -281,9 +283,11 @@ class AutoCoderArgs(pydantic.BaseModel):
281
283
  doc_command: Optional[str] = None
282
284
  required_exts: Optional[str] = None
283
285
 
284
- monitor_mode: Optional[bool] = False
285
-
286
- description: Optional[str] = ""
286
+ monitor_mode: bool = False
287
+ disable_auto_window: bool = False
288
+ disable_segment_reorder: bool = False
289
+ rag_doc_filter_relevance: int = 5
290
+ tokenizer_path: Optional[str] = None
287
291
  skip_confirm: Optional[bool] = False
288
292
  silence: Optional[bool] = False
289
293
  exclude_files: Optional[Union[str, List[str]]] = ""
@@ -304,5 +308,9 @@ class AutoCoderArgs(pydantic.BaseModel):
304
308
 
305
309
  agent_designer_mode: Optional[str] = "svg"
306
310
 
311
+ full_text_ratio: Optional[float] = 0.7
312
+ segment_ratio: Optional[float] = 0.2
313
+ buff_ratio: Optional[float] = 0.1
314
+
307
315
  class Config:
308
316
  protected_namespaces = ()
@@ -187,7 +187,11 @@ class PyProject:
187
187
 
188
188
  def convert_to_source_code(self, file_path):
189
189
  module_name = file_path
190
- source_code = self.read_file_content(file_path)
190
+ try:
191
+ source_code = self.read_file_content(file_path)
192
+ except Exception as e:
193
+ logger.warning(f"Failed to read file: {file_path}. Error: {str(e)}")
194
+ return None
191
195
  return SourceCode(module_name=module_name, source_code=source_code)
192
196
 
193
197
  def get_package_source_codes(
@@ -18,10 +18,15 @@ from loguru import logger
18
18
  from pydantic import BaseModel
19
19
 
20
20
  from autocoder.common import SourceCode
21
- from autocoder.rag.loaders import (extract_text_from_docx,
22
- extract_text_from_excel,
23
- extract_text_from_pdf,
24
- extract_text_from_ppt)
21
+ from autocoder.rag.loaders import (
22
+ extract_text_from_docx,
23
+ extract_text_from_excel,
24
+ extract_text_from_pdf,
25
+ extract_text_from_ppt,
26
+ )
27
+ from autocoder.rag import variable_holder
28
+ from autocoder.rag.token_counter import count_tokens_worker, count_tokens
29
+ from uuid import uuid4
25
30
 
26
31
  cache_lock = threading.Lock()
27
32
 
@@ -34,72 +39,62 @@ class AddOrUpdateEvent(BaseModel):
34
39
  file_infos: List[Tuple[str, str, float]]
35
40
 
36
41
 
37
- @ray.remote
38
- def process_file(file_info: Tuple[str, str, float]) -> List[SourceCode]:
42
+ def process_file_in_multi_process(
43
+ file_info: Tuple[str, str, float]
44
+ ) -> List[SourceCode]:
39
45
  start_time = time.time()
40
46
  file_path, relative_path, _ = file_info
41
47
  try:
42
48
  if file_path.endswith(".pdf"):
43
49
  with open(file_path, "rb") as f:
44
50
  content = extract_text_from_pdf(f.read())
45
- v = [SourceCode(module_name=file_path, source_code=content)]
46
- elif file_path.endswith(".docx"):
47
- with open(file_path, "rb") as f:
48
- content = extract_text_from_docx(f.read())
49
- v = [SourceCode(module_name=f"##File: {file_path}", source_code=content)]
50
- elif file_path.endswith(".xlsx") or file_path.endswith(".xls"):
51
- sheets = extract_text_from_excel(file_path)
52
51
  v = [
53
52
  SourceCode(
54
- module_name=f"##File: {file_path}#{sheet[0]}",
55
- source_code=sheet[1],
53
+ module_name=file_path,
54
+ source_code=content,
55
+ tokens=count_tokens_worker(content),
56
56
  )
57
- for sheet in sheets
58
57
  ]
59
- elif file_path.endswith(".pptx"):
60
- slides = extract_text_from_ppt(file_path)
61
- content = "".join(f"#{slide[0]}\n{slide[1]}\n\n" for slide in slides)
62
- v = [SourceCode(module_name=f"##File: {file_path}", source_code=content)]
63
- else:
64
- with open(file_path, "r", encoding="utf-8") as f:
65
- content = f.read()
66
- v = [SourceCode(module_name=f"##File: {file_path}", source_code=content)]
67
- logger.info(f"Load file {file_path} in {time.time() - start_time}")
68
- return v
69
- except Exception as e:
70
- logger.error(f"Error processing file {file_path}: {str(e)}")
71
- return []
72
-
73
-
74
- def process_file2(file_info: Tuple[str, str, float]) -> List[SourceCode]:
75
- start_time = time.time()
76
- file_path, relative_path, _ = file_info
77
- try:
78
- if file_path.endswith(".pdf"):
79
- with open(file_path, "rb") as f:
80
- content = extract_text_from_pdf(f.read())
81
- v = [SourceCode(module_name=file_path, source_code=content)]
82
58
  elif file_path.endswith(".docx"):
83
59
  with open(file_path, "rb") as f:
84
60
  content = extract_text_from_docx(f.read())
85
- v = [SourceCode(module_name=f"##File: {file_path}", source_code=content)]
61
+ v = [
62
+ SourceCode(
63
+ module_name=f"##File: {file_path}",
64
+ source_code=content,
65
+ tokens=count_tokens_worker(content),
66
+ )
67
+ ]
86
68
  elif file_path.endswith(".xlsx") or file_path.endswith(".xls"):
87
69
  sheets = extract_text_from_excel(file_path)
88
70
  v = [
89
71
  SourceCode(
90
72
  module_name=f"##File: {file_path}#{sheet[0]}",
91
73
  source_code=sheet[1],
74
+ tokens=count_tokens_worker(sheet[1]),
92
75
  )
93
76
  for sheet in sheets
94
77
  ]
95
78
  elif file_path.endswith(".pptx"):
96
79
  slides = extract_text_from_ppt(file_path)
97
80
  content = "".join(f"#{slide[0]}\n{slide[1]}\n\n" for slide in slides)
98
- v = [SourceCode(module_name=f"##File: {file_path}", source_code=content)]
81
+ v = [
82
+ SourceCode(
83
+ module_name=f"##File: {file_path}",
84
+ source_code=content,
85
+ tokens=count_tokens_worker(content),
86
+ )
87
+ ]
99
88
  else:
100
89
  with open(file_path, "r", encoding="utf-8") as f:
101
90
  content = f.read()
102
- v = [SourceCode(module_name=f"##File: {file_path}", source_code=content)]
91
+ v = [
92
+ SourceCode(
93
+ module_name=f"##File: {file_path}",
94
+ source_code=content,
95
+ tokens=count_tokens_worker(content),
96
+ )
97
+ ]
103
98
  logger.info(f"Load file {file_path} in {time.time() - start_time}")
104
99
  return v
105
100
  except Exception as e:
@@ -107,34 +102,59 @@ def process_file2(file_info: Tuple[str, str, float]) -> List[SourceCode]:
107
102
  return []
108
103
 
109
104
 
110
- def process_file3(file_path: str) -> List[SourceCode]:
105
+ def process_file_local(file_path: str) -> List[SourceCode]:
111
106
  start_time = time.time()
112
107
  try:
113
108
  if file_path.endswith(".pdf"):
114
109
  with open(file_path, "rb") as f:
115
110
  content = extract_text_from_pdf(f.read())
116
- v = [SourceCode(module_name=file_path, source_code=content)]
111
+ v = [
112
+ SourceCode(
113
+ module_name=file_path,
114
+ source_code=content,
115
+ tokens=count_tokens(content),
116
+ )
117
+ ]
117
118
  elif file_path.endswith(".docx"):
118
119
  with open(file_path, "rb") as f:
119
120
  content = extract_text_from_docx(f.read())
120
- v = [SourceCode(module_name=f"##File: {file_path}", source_code=content)]
121
+ v = [
122
+ SourceCode(
123
+ module_name=f"##File: {file_path}",
124
+ source_code=content,
125
+ tokens=count_tokens(content),
126
+ )
127
+ ]
121
128
  elif file_path.endswith(".xlsx") or file_path.endswith(".xls"):
122
129
  sheets = extract_text_from_excel(file_path)
123
130
  v = [
124
131
  SourceCode(
125
132
  module_name=f"##File: {file_path}#{sheet[0]}",
126
133
  source_code=sheet[1],
134
+ tokens=count_tokens(sheet[1]),
127
135
  )
128
136
  for sheet in sheets
129
137
  ]
130
138
  elif file_path.endswith(".pptx"):
131
139
  slides = extract_text_from_ppt(file_path)
132
140
  content = "".join(f"#{slide[0]}\n{slide[1]}\n\n" for slide in slides)
133
- v = [SourceCode(module_name=f"##File: {file_path}", source_code=content)]
141
+ v = [
142
+ SourceCode(
143
+ module_name=f"##File: {file_path}",
144
+ source_code=content,
145
+ tokens=count_tokens(content),
146
+ )
147
+ ]
134
148
  else:
135
149
  with open(file_path, "r", encoding="utf-8") as f:
136
150
  content = f.read()
137
- v = [SourceCode(module_name=f"##File: {file_path}", source_code=content)]
151
+ v = [
152
+ SourceCode(
153
+ module_name=f"##File: {file_path}",
154
+ source_code=content,
155
+ tokens=count_tokens(content),
156
+ )
157
+ ]
138
158
  logger.info(f"Load file {file_path} in {time.time() - start_time}")
139
159
  return v
140
160
  except Exception as e:
@@ -205,7 +225,7 @@ class AutoCoderRAGDocListener:
205
225
  self.update_cache(item)
206
226
 
207
227
  def update_cache(self, file_path):
208
- source_code = process_file3(file_path)
228
+ source_code = process_file_local(file_path)
209
229
  self.cache[file_path] = {
210
230
  "file_path": file_path,
211
231
  "content": [c.model_dump() for c in source_code],
@@ -220,7 +240,9 @@ class AutoCoderRAGDocListener:
220
240
 
221
241
  def open_watch(self):
222
242
  logger.info(f"start monitor: {self.path}...")
223
- for changes in watch(self.path, watch_filter=self.file_filter, stop_event=self.stop_event):
243
+ for changes in watch(
244
+ self.path, watch_filter=self.file_filter, stop_event=self.stop_event
245
+ ):
224
246
  for change in changes:
225
247
  (action, path) = change
226
248
  if action == Change.added or action == Change.modified:
@@ -290,7 +312,6 @@ class AutoCoderRAGAsyncUpdateQueue:
290
312
  self.thread.start()
291
313
  self.cache = self.read_cache()
292
314
 
293
-
294
315
  def _process_queue(self):
295
316
  while not self.stop_event.is_set():
296
317
  try:
@@ -324,8 +345,14 @@ class AutoCoderRAGAsyncUpdateQueue:
324
345
  # results = ray.get(
325
346
  # [process_file.remote(file_info) for file_info in files_to_process]
326
347
  # )
327
- with Pool(processes=os.cpu_count()) as pool:
328
- results = pool.map(process_file2, files_to_process)
348
+ from autocoder.rag.token_counter import initialize_tokenizer
349
+
350
+ with Pool(
351
+ processes=os.cpu_count(),
352
+ initializer=initialize_tokenizer,
353
+ initargs=(variable_holder.TOKENIZER_PATH,),
354
+ ) as pool:
355
+ results = pool.map(process_file_in_multi_process, files_to_process)
329
356
 
330
357
  for file_info, result in zip(files_to_process, results):
331
358
  self.update_cache(file_info, result)
@@ -365,7 +392,7 @@ class AutoCoderRAGAsyncUpdateQueue:
365
392
  elif isinstance(file_list, AddOrUpdateEvent):
366
393
  for file_info in file_list.file_infos:
367
394
  logger.info(f"{file_info[0]} is detected to be updated")
368
- result = process_file2(file_info)
395
+ result = process_file_local(file_info)
369
396
  self.update_cache(file_info, result)
370
397
 
371
398
  self.write_cache()
@@ -410,7 +437,9 @@ class AutoCoderRAGAsyncUpdateQueue:
410
437
  # 释放文件锁
411
438
  fcntl.flock(lockf, fcntl.LOCK_UN)
412
439
 
413
- def update_cache(self, file_info: Tuple[str, str, float], content: List[SourceCode]):
440
+ def update_cache(
441
+ self, file_info: Tuple[str, str, float], content: List[SourceCode]
442
+ ):
414
443
  file_path, relative_path, modify_time = file_info
415
444
  self.cache[file_path] = {
416
445
  "file_path": file_path,
@@ -485,11 +514,20 @@ class DocumentRetriever:
485
514
  required_exts: list,
486
515
  on_ray: bool = False,
487
516
  monitor_mode: bool = False,
517
+ single_file_token_limit: int = 60000,
518
+ disable_auto_window: bool = False,
488
519
  ) -> None:
489
520
  self.path = path
490
521
  self.ignore_spec = ignore_spec
491
522
  self.required_exts = required_exts
492
523
  self.monitor_mode = monitor_mode
524
+ self.single_file_token_limit = single_file_token_limit
525
+ self.disable_auto_window = disable_auto_window
526
+
527
+ # 多小的文件会被合并
528
+ self.small_file_token_limit = self.single_file_token_limit / 4
529
+ # 合并后的最大文件大小
530
+ self.small_file_merge_limit = self.single_file_token_limit / 2
493
531
 
494
532
  self.on_ray = on_ray
495
533
  if self.on_ray:
@@ -502,6 +540,13 @@ class DocumentRetriever:
502
540
  path, ignore_spec, required_exts
503
541
  )
504
542
 
543
+ logger.info(f"DocumentRetriever initialized with:")
544
+ logger.info(f" Path: {self.path}")
545
+ logger.info(f" Diable auto window: {self.disable_auto_window} ")
546
+ logger.info(f" Single file token limit: {self.single_file_token_limit}")
547
+ logger.info(f" Small file token limit: {self.small_file_token_limit}")
548
+ logger.info(f" Small file merge limit: {self.small_file_merge_limit}")
549
+
505
550
  def get_cache(self):
506
551
  if self.on_ray:
507
552
  return ray.get(self.cacher.get_cache.remote())
@@ -509,6 +554,102 @@ class DocumentRetriever:
509
554
  return self.cacher.get_cache()
510
555
 
511
556
  def retrieve_documents(self) -> Generator[SourceCode, None, None]:
557
+ logger.info("Starting document retrieval process")
558
+ waiting_list = []
559
+ waiting_tokens = 0
512
560
  for _, data in self.get_cache().items():
513
561
  for source_code in data["content"]:
514
- yield SourceCode.model_validate(source_code)
562
+ doc = SourceCode.model_validate(source_code)
563
+ if self.disable_auto_window:
564
+ yield doc
565
+ else:
566
+ if doc.tokens <= 0:
567
+ yield doc
568
+ elif doc.tokens < self.small_file_token_limit:
569
+ waiting_list, waiting_tokens = self._add_to_waiting_list(
570
+ doc, waiting_list, waiting_tokens
571
+ )
572
+ if waiting_tokens >= self.small_file_merge_limit:
573
+ yield from self._process_waiting_list(waiting_list)
574
+ waiting_list = []
575
+ waiting_tokens = 0
576
+ elif doc.tokens > self.single_file_token_limit:
577
+ yield from self._split_large_document(doc)
578
+ else:
579
+ yield doc
580
+ if waiting_list and not self.disable_auto_window:
581
+ yield from self._process_waiting_list(waiting_list)
582
+
583
+ logger.info("Document retrieval process completed")
584
+
585
+ def _add_to_waiting_list(
586
+ self, doc: SourceCode, waiting_list: List[SourceCode], waiting_tokens: int
587
+ ) -> Tuple[List[SourceCode], int]:
588
+ waiting_list.append(doc)
589
+ return waiting_list, waiting_tokens + doc.tokens
590
+
591
+ def _process_waiting_list(
592
+ self, waiting_list: List[SourceCode]
593
+ ) -> Generator[SourceCode, None, None]:
594
+ if len(waiting_list) == 1:
595
+ yield waiting_list[0]
596
+ elif len(waiting_list) > 1:
597
+ yield self._merge_documents(waiting_list)
598
+
599
+ def _merge_documents(self, docs: List[SourceCode]) -> SourceCode:
600
+ merged_content = "\n".join(
601
+ [f"#File: {doc.module_name}\n{doc.source_code}" for doc in docs]
602
+ )
603
+ merged_tokens = sum([doc.tokens for doc in docs])
604
+ merged_name = f"Merged_{len(docs)}_docs_{str(uuid4())}"
605
+ logger.info(
606
+ f"Merged {len(docs)} documents into {merged_name} (tokens: {merged_tokens})."
607
+ )
608
+ return SourceCode(
609
+ module_name=merged_name,
610
+ source_code=merged_content,
611
+ tokens=merged_tokens,
612
+ metadata={"original_docs": [doc.module_name for doc in docs]},
613
+ )
614
+
615
+ def _split_large_document(
616
+ self, doc: SourceCode
617
+ ) -> Generator[SourceCode, None, None]:
618
+ chunk_size = self.single_file_token_limit
619
+ total_chunks = (doc.tokens + chunk_size - 1) // chunk_size
620
+ logger.info(f"Splitting document {doc.module_name} into {total_chunks} chunks")
621
+ for i in range(0, doc.tokens, chunk_size):
622
+ chunk_content = doc.source_code[i : i + chunk_size]
623
+ chunk_tokens = min(chunk_size, doc.tokens - i)
624
+ chunk_name = f"{doc.module_name}#chunk{i//chunk_size+1}"
625
+ # logger.debug(f" Created chunk: {chunk_name} (tokens: {chunk_tokens})")
626
+ yield SourceCode(
627
+ module_name=chunk_name,
628
+ source_code=chunk_content,
629
+ tokens=chunk_tokens,
630
+ metadata={
631
+ "original_doc": doc.module_name,
632
+ "chunk_index": i // chunk_size + 1,
633
+ },
634
+ )
635
+
636
+ def _split_document(
637
+ self, doc: SourceCode, token_limit: int
638
+ ) -> Generator[SourceCode, None, None]:
639
+ remaining_tokens = doc.tokens
640
+ chunk_number = 1
641
+ start_index = 0
642
+
643
+ while remaining_tokens > 0:
644
+ end_index = start_index + token_limit
645
+ chunk_content = doc.source_code[start_index:end_index]
646
+ chunk_tokens = min(token_limit, remaining_tokens)
647
+
648
+ chunk_name = f"{doc.module_name}#{chunk_number:06d}"
649
+ yield SourceCode(
650
+ module_name=chunk_name, source_code=chunk_content, tokens=chunk_tokens
651
+ )
652
+
653
+ start_index = end_index
654
+ remaining_tokens -= chunk_tokens
655
+ chunk_number += 1