auto-coder 0.1.199__tar.gz → 0.1.201__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of auto-coder might be problematic. Click here for more details.

Files changed (127)
  1. {auto_coder-0.1.199 → auto_coder-0.1.201}/PKG-INFO +9 -1
  2. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/auto_coder.egg-info/PKG-INFO +9 -1
  3. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/auto_coder.egg-info/SOURCES.txt +1 -0
  4. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/auto_coder.egg-info/requires.txt +8 -0
  5. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/rag/cache/base_cache.py +1 -1
  6. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/rag/cache/byzer_storage_cache.py +33 -10
  7. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/rag/cache/simple_cache.py +65 -24
  8. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/rag/document_retriever.py +5 -10
  9. auto_coder-0.1.201/src/autocoder/rag/loaders/docx_loader.py +20 -0
  10. auto_coder-0.1.201/src/autocoder/rag/loaders/pdf_loader.py +22 -0
  11. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/rag/long_context_rag.py +3 -0
  12. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/rag/token_limiter.py +2 -2
  13. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/rag/utils.py +9 -13
  14. auto_coder-0.1.201/src/autocoder/utils/_markitdown.py +1298 -0
  15. auto_coder-0.1.201/src/autocoder/version.py +1 -0
  16. auto_coder-0.1.199/src/autocoder/rag/loaders/docx_loader.py +0 -7
  17. auto_coder-0.1.199/src/autocoder/rag/loaders/pdf_loader.py +0 -10
  18. auto_coder-0.1.199/src/autocoder/version.py +0 -1
  19. {auto_coder-0.1.199 → auto_coder-0.1.201}/LICENSE +0 -0
  20. {auto_coder-0.1.199 → auto_coder-0.1.201}/README.md +0 -0
  21. {auto_coder-0.1.199 → auto_coder-0.1.201}/setup.cfg +0 -0
  22. {auto_coder-0.1.199 → auto_coder-0.1.201}/setup.py +0 -0
  23. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/auto_coder.egg-info/dependency_links.txt +0 -0
  24. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/auto_coder.egg-info/entry_points.txt +0 -0
  25. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/auto_coder.egg-info/top_level.txt +0 -0
  26. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/__init__.py +0 -0
  27. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/agent/__init__.py +0 -0
  28. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/agent/auto_tool.py +0 -0
  29. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/agent/coder.py +0 -0
  30. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/agent/designer.py +0 -0
  31. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/agent/planner.py +0 -0
  32. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/agent/project_reader.py +0 -0
  33. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/auto_coder.py +0 -0
  34. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/auto_coder_lang.py +0 -0
  35. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/auto_coder_rag.py +0 -0
  36. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/auto_coder_server.py +0 -0
  37. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/chat/__init__.py +0 -0
  38. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/chat_auto_coder.py +0 -0
  39. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/chat_auto_coder_lang.py +0 -0
  40. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/command_args.py +0 -0
  41. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/common/JupyterClient.py +0 -0
  42. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/common/ShellClient.py +0 -0
  43. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/common/__init__.py +0 -0
  44. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/common/anything2images.py +0 -0
  45. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/common/audio.py +0 -0
  46. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/common/cleaner.py +0 -0
  47. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/common/code_auto_execute.py +0 -0
  48. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/common/code_auto_generate.py +0 -0
  49. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/common/code_auto_generate_diff.py +0 -0
  50. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/common/code_auto_generate_editblock.py +0 -0
  51. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/common/code_auto_generate_strict_diff.py +0 -0
  52. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/common/code_auto_merge.py +0 -0
  53. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/common/code_auto_merge_diff.py +0 -0
  54. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/common/code_auto_merge_editblock.py +0 -0
  55. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/common/code_auto_merge_strict_diff.py +0 -0
  56. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/common/command_completer.py +0 -0
  57. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/common/command_generator.py +0 -0
  58. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/common/command_templates.py +0 -0
  59. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/common/const.py +0 -0
  60. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/common/git_utils.py +0 -0
  61. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/common/image_to_page.py +0 -0
  62. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/common/interpreter.py +0 -0
  63. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/common/llm_rerank.py +0 -0
  64. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/common/screenshots.py +0 -0
  65. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/common/search.py +0 -0
  66. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/common/search_replace.py +0 -0
  67. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/common/sys_prompt.py +0 -0
  68. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/common/text.py +0 -0
  69. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/common/types.py +0 -0
  70. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/db/__init__.py +0 -0
  71. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/db/store.py +0 -0
  72. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/dispacher/__init__.py +0 -0
  73. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/dispacher/actions/__init__.py +0 -0
  74. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/dispacher/actions/action.py +0 -0
  75. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/dispacher/actions/copilot.py +0 -0
  76. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/dispacher/actions/plugins/__init__.py +0 -0
  77. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/dispacher/actions/plugins/action_regex_project.py +0 -0
  78. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/dispacher/actions/plugins/action_translate.py +0 -0
  79. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/index/__init__.py +0 -0
  80. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/index/for_command.py +0 -0
  81. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/index/index.py +0 -0
  82. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/index/symbols_utils.py +0 -0
  83. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/lang.py +0 -0
  84. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/pyproject/__init__.py +0 -0
  85. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/rag/__init__.py +0 -0
  86. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/rag/api_server.py +0 -0
  87. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/rag/cache/__init__.py +0 -0
  88. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/rag/cache/file_monitor_cache.py +0 -0
  89. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/rag/doc_filter.py +0 -0
  90. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/rag/llm_wrapper.py +0 -0
  91. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/rag/loaders/__init__.py +0 -0
  92. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/rag/loaders/excel_loader.py +0 -0
  93. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/rag/loaders/ppt_loader.py +0 -0
  94. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/rag/rag_config.py +0 -0
  95. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/rag/rag_entry.py +0 -0
  96. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/rag/raw_rag.py +0 -0
  97. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/rag/relevant_utils.py +0 -0
  98. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/rag/simple_directory_reader.py +0 -0
  99. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/rag/simple_rag.py +0 -0
  100. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/rag/stream_event/__init__.py +0 -0
  101. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/rag/stream_event/event_writer.py +0 -0
  102. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/rag/stream_event/types.py +0 -0
  103. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/rag/token_checker.py +0 -0
  104. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/rag/token_counter.py +0 -0
  105. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/rag/types.py +0 -0
  106. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/rag/variable_holder.py +0 -0
  107. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/regexproject/__init__.py +0 -0
  108. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/suffixproject/__init__.py +0 -0
  109. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/tsproject/__init__.py +0 -0
  110. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/utils/__init__.py +0 -0
  111. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/utils/conversation_store.py +0 -0
  112. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/utils/llm_client_interceptors.py +0 -0
  113. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/utils/log_capture.py +0 -0
  114. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/utils/multi_turn.py +0 -0
  115. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/utils/print_table.py +0 -0
  116. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/utils/queue_communicate.py +0 -0
  117. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/utils/request_event_queue.py +0 -0
  118. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/utils/request_queue.py +0 -0
  119. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/utils/rest.py +0 -0
  120. {auto_coder-0.1.199 → auto_coder-0.1.201}/src/autocoder/utils/tests.py +0 -0
  121. {auto_coder-0.1.199 → auto_coder-0.1.201}/tests/test_action_regex_project.py +0 -0
  122. {auto_coder-0.1.199 → auto_coder-0.1.201}/tests/test_chat_auto_coder.py +0 -0
  123. {auto_coder-0.1.199 → auto_coder-0.1.201}/tests/test_code_auto_merge_editblock.py +0 -0
  124. {auto_coder-0.1.199 → auto_coder-0.1.201}/tests/test_command_completer.py +0 -0
  125. {auto_coder-0.1.199 → auto_coder-0.1.201}/tests/test_planner.py +0 -0
  126. {auto_coder-0.1.199 → auto_coder-0.1.201}/tests/test_queue_communicate.py +0 -0
  127. {auto_coder-0.1.199 → auto_coder-0.1.201}/tests/test_symbols_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: auto-coder
3
- Version: 0.1.199
3
+ Version: 0.1.201
4
4
  Summary: AutoCoder: AutoCoder
5
5
  Author: allwefantasy
6
6
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
@@ -49,6 +49,14 @@ Requires-Dist: python-pptx
49
49
  Requires-Dist: watchfiles
50
50
  Requires-Dist: cairosvg
51
51
  Requires-Dist: matplotlib
52
+ Requires-Dist: mammoth
53
+ Requires-Dist: markdownify
54
+ Requires-Dist: pdfminer.six
55
+ Requires-Dist: puremagic
56
+ Requires-Dist: pydub
57
+ Requires-Dist: youtube-transcript-api
58
+ Requires-Dist: SpeechRecognition
59
+ Requires-Dist: pathvalidate
52
60
 
53
61
  <p align="center">
54
62
  <picture>
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: auto-coder
3
- Version: 0.1.199
3
+ Version: 0.1.201
4
4
  Summary: AutoCoder: AutoCoder
5
5
  Author: allwefantasy
6
6
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
@@ -49,6 +49,14 @@ Requires-Dist: python-pptx
49
49
  Requires-Dist: watchfiles
50
50
  Requires-Dist: cairosvg
51
51
  Requires-Dist: matplotlib
52
+ Requires-Dist: mammoth
53
+ Requires-Dist: markdownify
54
+ Requires-Dist: pdfminer.six
55
+ Requires-Dist: puremagic
56
+ Requires-Dist: pydub
57
+ Requires-Dist: youtube-transcript-api
58
+ Requires-Dist: SpeechRecognition
59
+ Requires-Dist: pathvalidate
52
60
 
53
61
  <p align="center">
54
62
  <picture>
@@ -102,6 +102,7 @@ src/autocoder/regexproject/__init__.py
102
102
  src/autocoder/suffixproject/__init__.py
103
103
  src/autocoder/tsproject/__init__.py
104
104
  src/autocoder/utils/__init__.py
105
+ src/autocoder/utils/_markitdown.py
105
106
  src/autocoder/utils/conversation_store.py
106
107
  src/autocoder/utils/llm_client_interceptors.py
107
108
  src/autocoder/utils/log_capture.py
@@ -39,3 +39,11 @@ python-pptx
39
39
  watchfiles
40
40
  cairosvg
41
41
  matplotlib
42
+ mammoth
43
+ markdownify
44
+ pdfminer.six
45
+ puremagic
46
+ pydub
47
+ youtube-transcript-api
48
+ SpeechRecognition
49
+ pathvalidate
@@ -6,7 +6,7 @@ class DeleteEvent(BaseModel):
6
6
  file_paths: List[str]
7
7
 
8
8
  class AddOrUpdateEvent(BaseModel):
9
- file_infos: List[Tuple[str, str, float]]
9
+ file_infos: List[Tuple[str, str, float, str]]
10
10
 
11
11
  class BaseCacheManager(ABC):
12
12
  @abstractmethod
@@ -23,13 +23,34 @@ from multiprocessing import Pool
23
23
  from concurrent.futures import ThreadPoolExecutor, as_completed
24
24
  from autocoder.rag.variable_holder import VariableHolder
25
25
  import platform
26
+ import hashlib
27
+ from typing import Union
26
28
 
27
29
  if platform.system() != "Windows":
28
30
  import fcntl
29
31
  else:
30
32
  fcntl = None
31
33
 
32
-
34
+ def generate_file_md5(file_path: str) -> str:
35
+ md5_hash = hashlib.md5()
36
+ with open(file_path, "rb") as f:
37
+ for chunk in iter(lambda: f.read(4096), b""):
38
+ md5_hash.update(chunk)
39
+ return md5_hash.hexdigest()
40
+
41
+
42
+ def generate_content_md5(content: Union[str, bytes]) -> str:
43
+ if isinstance(content, str):
44
+ content = content.encode("utf-8")
45
+ md5_hash = hashlib.md5()
46
+ md5_hash.update(content)
47
+ return md5_hash.hexdigest()
48
+
49
+ default_ignore_dirs = [
50
+ "__pycache__",
51
+ "node_modules",
52
+ "_images"
53
+ ]
33
54
  class ByzerStorageCache(BaseCacheManager):
34
55
  def __init__(
35
56
  self,
@@ -154,10 +175,10 @@ class ByzerStorageCache(BaseCacheManager):
154
175
 
155
176
  files_to_process = []
156
177
  for file_info in self.get_all_files():
157
- file_path, _, modify_time = file_info
178
+ file_path, _, modify_time, file_md5 = file_info
158
179
  if (
159
- file_path not in self.cache
160
- or self.cache[file_path]["modify_time"] < modify_time
180
+ file_path not in self.cache
181
+ or self.cache[file_path]["md5"] != file_md5
161
182
  ):
162
183
  files_to_process.append(file_info)
163
184
 
@@ -175,13 +196,14 @@ class ByzerStorageCache(BaseCacheManager):
175
196
 
176
197
  items = []
177
198
  for file_info, result in zip(files_to_process, results):
178
- file_path, relative_path, modify_time = file_info
199
+ file_path, relative_path, modify_time, file_md5 = file_info
179
200
  content: List[SourceCode] = result
180
201
  self.cache[file_path] = {
181
202
  "file_path": file_path,
182
203
  "relative_path": relative_path,
183
204
  "content": [c.model_dump() for c in content],
184
205
  "modify_time": modify_time,
206
+ "md5": file_md5,
185
207
  }
186
208
 
187
209
  for doc in content:
@@ -295,11 +317,11 @@ class ByzerStorageCache(BaseCacheManager):
295
317
  files_to_process = []
296
318
  current_files = set()
297
319
  for file_info in self.get_all_files():
298
- file_path, _, modify_time = file_info
320
+ file_path, _, _, file_md5 = file_info
299
321
  current_files.add(file_path)
300
322
  if (
301
323
  file_path not in self.cache
302
- or self.cache[file_path]["modify_time"] < modify_time
324
+ or self.cache[file_path]["md5"] != file_md5
303
325
  ):
304
326
  files_to_process.append(file_info)
305
327
 
@@ -362,10 +384,10 @@ class ByzerStorageCache(BaseCacheManager):
362
384
 
363
385
 
364
386
 
365
- def get_all_files(self) -> List[Tuple[str, str, float]]:
387
+ def get_all_files(self) -> List[Tuple[str, str, float, str]]:
366
388
  all_files = []
367
389
  for root, dirs, files in os.walk(self.path,followlinks=True):
368
- dirs[:] = [d for d in dirs if not d.startswith(".")]
390
+ dirs[:] = [d for d in dirs if not d.startswith(".") and d not in default_ignore_dirs]
369
391
 
370
392
  if self.ignore_spec:
371
393
  relative_root = os.path.relpath(root, self.path)
@@ -389,6 +411,7 @@ class ByzerStorageCache(BaseCacheManager):
389
411
  file_path = os.path.join(root, file)
390
412
  relative_path = os.path.relpath(file_path, self.path)
391
413
  modify_time = os.path.getmtime(file_path)
392
- all_files.append((file_path, relative_path, modify_time))
414
+ file_md5 = generate_file_md5(file_path)
415
+ all_files.append((file_path, relative_path, modify_time, file_md5))
393
416
 
394
417
  return all_files
@@ -1,8 +1,8 @@
1
1
 
2
- from multiprocessing import Pool
3
- from autocoder.common import SourceCode
4
- from autocoder.rag.cache.base_cache import BaseCacheManager,DeleteEvent,AddOrUpdateEvent
5
- from typing import Dict, List, Tuple,Any,Optional
2
+ from multiprocessing import Pool
3
+ from autocoder.common import SourceCode
4
+ from autocoder.rag.cache.base_cache import BaseCacheManager, DeleteEvent, AddOrUpdateEvent
5
+ from typing import Dict, List, Tuple, Any, Optional, Union
6
6
  import os
7
7
  import threading
8
8
  import json
@@ -13,8 +13,31 @@ else:
13
13
  fcntl = None
14
14
  import time
15
15
  from loguru import logger
16
- from autocoder.rag.utils import process_file_in_multi_process,process_file_local
16
+ from autocoder.rag.utils import process_file_in_multi_process, process_file_local
17
17
  from autocoder.rag.variable_holder import VariableHolder
18
+ import hashlib
19
+
20
+ default_ignore_dirs = [
21
+ "__pycache__",
22
+ "node_modules",
23
+ "_images"
24
+ ]
25
+
26
+
27
+ def generate_file_md5(file_path: str) -> str:
28
+ md5_hash = hashlib.md5()
29
+ with open(file_path, "rb") as f:
30
+ for chunk in iter(lambda: f.read(4096), b""):
31
+ md5_hash.update(chunk)
32
+ return md5_hash.hexdigest()
33
+
34
+
35
+ def generate_content_md5(content: Union[str, bytes]) -> str:
36
+ if isinstance(content, str):
37
+ content = content.encode("utf-8")
38
+ md5_hash = hashlib.md5()
39
+ md5_hash.update(content)
40
+ return md5_hash.hexdigest()
18
41
 
19
42
 
20
43
  class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
@@ -52,10 +75,10 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
52
75
  return
53
76
  files_to_process = []
54
77
  for file_info in self.get_all_files():
55
- file_path, _, modify_time = file_info
78
+ file_path, _, modify_time, file_md5 = file_info
56
79
  if (
57
80
  file_path not in self.cache
58
- or self.cache[file_path]["modify_time"] < modify_time
81
+ or self.cache[file_path].get("md5","") != file_md5
59
82
  ):
60
83
  files_to_process.append(file_info)
61
84
  if not files_to_process:
@@ -71,7 +94,8 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
71
94
  initializer=initialize_tokenizer,
72
95
  initargs=(VariableHolder.TOKENIZER_PATH,),
73
96
  ) as pool:
74
- results = pool.map(process_file_in_multi_process, files_to_process)
97
+ results = pool.map(
98
+ process_file_in_multi_process, files_to_process)
75
99
 
76
100
  for file_info, result in zip(files_to_process, results):
77
101
  self.update_cache(file_info, result)
@@ -83,11 +107,11 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
83
107
  files_to_process = []
84
108
  current_files = set()
85
109
  for file_info in self.get_all_files():
86
- file_path, _, modify_time = file_info
110
+ file_path, _, _, file_md5 = file_info
87
111
  current_files.add(file_path)
88
112
  if (
89
113
  file_path not in self.cache
90
- or self.cache[file_path]["modify_time"] < modify_time
114
+ or self.cache[file_path].get("md5","") != file_md5
91
115
  ):
92
116
  files_to_process.append(file_info)
93
117
 
@@ -99,7 +123,8 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
99
123
  self.queue.append(DeleteEvent(file_paths=deleted_files))
100
124
  if files_to_process:
101
125
  with self.lock:
102
- self.queue.append(AddOrUpdateEvent(file_infos=files_to_process))
126
+ self.queue.append(AddOrUpdateEvent(
127
+ file_infos=files_to_process))
103
128
 
104
129
  def process_queue(self):
105
130
  while self.queue:
@@ -111,8 +136,12 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
111
136
  elif isinstance(file_list, AddOrUpdateEvent):
112
137
  for file_info in file_list.file_infos:
113
138
  logger.info(f"{file_info[0]} is detected to be updated")
114
- result = process_file_local(file_info[0])
115
- self.update_cache(file_info, result)
139
+ try:
140
+ result = process_file_local(file_info[0])
141
+ self.update_cache(file_info, result)
142
+ except Exception as e:
143
+ logger.error(
144
+ f"SimpleCache Error in process_queue: {e}")
116
145
 
117
146
  self.write_cache()
118
147
 
@@ -138,8 +167,12 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
138
167
  if not fcntl:
139
168
  with open(cache_file, "w") as f:
140
169
  for data in self.cache.values():
141
- json.dump(data, f, ensure_ascii=False)
142
- f.write("\n")
170
+ try:
171
+ json.dump(data, f, ensure_ascii=False)
172
+ f.write("\n")
173
+ except Exception as e:
174
+ logger.error(
175
+ f"Failed to write {data['file_path']} to .cache/cache.jsonl: {e}")
143
176
  else:
144
177
  lock_file = cache_file + ".lock"
145
178
  with open(lock_file, "w") as lockf:
@@ -149,33 +182,39 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
149
182
  # 写入缓存文件
150
183
  with open(cache_file, "w") as f:
151
184
  for data in self.cache.values():
152
- json.dump(data, f, ensure_ascii=False)
153
- f.write("\n")
185
+ try:
186
+ json.dump(data, f, ensure_ascii=False)
187
+ f.write("\n")
188
+ except Exception as e:
189
+ logger.error(
190
+ f"Failed to write {data['file_path']} to .cache/cache.jsonl: {e}")
154
191
 
155
192
  finally:
156
193
  # 释放文件锁
157
194
  fcntl.flock(lockf, fcntl.LOCK_UN)
158
195
 
159
196
  def update_cache(
160
- self, file_info: Tuple[str, str, float], content: List[SourceCode]
197
+ self, file_info: Tuple[str, str, float, str], content: List[SourceCode]
161
198
  ):
162
- file_path, relative_path, modify_time = file_info
199
+ file_path, relative_path, modify_time, file_md5 = file_info
163
200
  self.cache[file_path] = {
164
201
  "file_path": file_path,
165
202
  "relative_path": relative_path,
166
203
  "content": [c.model_dump() for c in content],
167
204
  "modify_time": modify_time,
205
+ "md5": file_md5,
168
206
  }
169
207
 
170
- def get_cache(self,options:Optional[Dict[str,Any]]=None):
208
+ def get_cache(self, options: Optional[Dict[str, Any]] = None):
171
209
  self.load_first()
172
210
  self.trigger_update()
173
211
  return self.cache
174
212
 
175
213
  def get_all_files(self) -> List[Tuple[str, str, float]]:
176
214
  all_files = []
177
- for root, dirs, files in os.walk(self.path,followlinks=True):
178
- dirs[:] = [d for d in dirs if not d.startswith(".")]
215
+ for root, dirs, files in os.walk(self.path, followlinks=True):
216
+ dirs[:] = [d for d in dirs if not d.startswith(
217
+ ".") and d not in default_ignore_dirs]
179
218
 
180
219
  if self.ignore_spec:
181
220
  relative_root = os.path.relpath(root, self.path)
@@ -199,6 +238,8 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
199
238
  file_path = os.path.join(root, file)
200
239
  relative_path = os.path.relpath(file_path, self.path)
201
240
  modify_time = os.path.getmtime(file_path)
202
- all_files.append((file_path, relative_path, modify_time))
241
+ file_md5 = generate_file_md5(file_path)
242
+ all_files.append(
243
+ (file_path, relative_path, modify_time, file_md5))
203
244
 
204
- return all_files
245
+ return all_files
@@ -13,13 +13,6 @@ from loguru import logger
13
13
  from pydantic import BaseModel
14
14
 
15
15
  from autocoder.common import SourceCode
16
- from autocoder.rag.loaders import (
17
- extract_text_from_docx,
18
- extract_text_from_excel,
19
- extract_text_from_pdf,
20
- extract_text_from_ppt,
21
- )
22
- from autocoder.rag.token_counter import count_tokens_worker, count_tokens
23
16
  from uuid import uuid4
24
17
  from autocoder.rag.variable_holder import VariableHolder
25
18
  from abc import ABC, abstractmethod
@@ -119,7 +112,8 @@ class LocalDocumentRetriever(BaseDocumentRetriever):
119
112
  logger.info(f"DocumentRetriever initialized with:")
120
113
  logger.info(f" Path: {self.path}")
121
114
  logger.info(f" Diable auto window: {self.disable_auto_window} ")
122
- logger.info(f" Single file token limit: {self.single_file_token_limit}")
115
+ logger.info(
116
+ f" Single file token limit: {self.single_file_token_limit}")
123
117
  logger.info(f" Small file token limit: {self.small_file_token_limit}")
124
118
  logger.info(f" Small file merge limit: {self.small_file_merge_limit}")
125
119
  logger.info(f" Enable hybrid index: {self.enable_hybrid_index}")
@@ -200,9 +194,10 @@ class LocalDocumentRetriever(BaseDocumentRetriever):
200
194
  ) -> Generator[SourceCode, None, None]:
201
195
  chunk_size = self.single_file_token_limit
202
196
  total_chunks = (doc.tokens + chunk_size - 1) // chunk_size
203
- logger.info(f"Splitting document {doc.module_name} into {total_chunks} chunks")
197
+ logger.info(
198
+ f"Splitting document {doc.module_name} into {total_chunks} chunks")
204
199
  for i in range(0, doc.tokens, chunk_size):
205
- chunk_content = doc.source_code[i : i + chunk_size]
200
+ chunk_content = doc.source_code[i: i + chunk_size]
206
201
  chunk_tokens = min(chunk_size, doc.tokens - i)
207
202
  chunk_name = f"{doc.module_name}#chunk{i//chunk_size+1}"
208
203
  # logger.debug(f" Created chunk: {chunk_name} (tokens: {chunk_tokens})")
@@ -0,0 +1,20 @@
1
+ from io import BytesIO
2
+ import docx2txt
3
+ from autocoder.utils._markitdown import MarkItDown
4
+
5
+
6
+ def extract_text_from_docx_old(docx_path):
7
+ with open(docx_path, "rb") as f:
8
+ docx_content = f.read()
9
+ docx_file = BytesIO(docx_content)
10
+ text = docx2txt.process(docx_file)
11
+ return text
12
+
13
+
14
+ def extract_text_from_docx(docx_path):
15
+ try:
16
+ md_converter = MarkItDown()
17
+ result = md_converter.convert(docx_path)
18
+ return result.text_content
19
+ except Exception as e:
20
+ return extract_text_from_docx_old(docx_path)
@@ -0,0 +1,22 @@
1
+ from io import BytesIO
2
+ from pypdf import PdfReader
3
+ from autocoder.utils._markitdown import MarkItDown
4
+
5
+
6
+ def extract_text_from_pdf_old(file_path):
7
+ with open(file_path, "rb") as f:
8
+ pdf_content = f.read()
9
+ pdf_file = BytesIO(pdf_content)
10
+ pdf_reader = PdfReader(pdf_file)
11
+ text = ""
12
+ for page in pdf_reader.pages:
13
+ text += page.extract_text()
14
+ return text
15
+
16
+ def extract_text_from_pdf(file_path):
17
+ try:
18
+ md_converter = MarkItDown()
19
+ result = md_converter.convert(file_path)
20
+ return result.text_content
21
+ except Exception as e:
22
+ return extract_text_from_pdf_old(file_path)
@@ -210,6 +210,9 @@ class LongContextRAG:
210
210
 
211
211
  问题:{{ query }}
212
212
 
213
+ 要求:
214
+ 1. 注意相应的markdown图片如果存在也要输出,尽可能图文并茂
215
+
213
216
  回答:
214
217
  """
215
218
 
@@ -113,7 +113,7 @@ class TokenLimiter:
113
113
  num_count += 1
114
114
  reorder_relevant_docs.append(doc)
115
115
  if "original_doc" in doc.metadata and "chunk_index" in doc.metadata:
116
- original_doc_name = doc.metadata["original_doc"].module_name
116
+ original_doc_name = doc.metadata["original_doc"]
117
117
 
118
118
  temp_docs = []
119
119
  for temp_doc in relevant_docs[num_count:]:
@@ -122,7 +122,7 @@ class TokenLimiter:
122
122
  and "chunk_index" in temp_doc.metadata
123
123
  ):
124
124
  if (
125
- temp_doc.metadata["original_doc"].module_name
125
+ temp_doc.metadata["original_doc"]
126
126
  == original_doc_name
127
127
  ):
128
128
  if temp_doc not in reorder_relevant_docs:
@@ -13,11 +13,10 @@ def process_file_in_multi_process(
13
13
  file_info: Tuple[str, str, float]
14
14
  ) -> List[SourceCode]:
15
15
  start_time = time.time()
16
- file_path, relative_path, _ = file_info
16
+ file_path, relative_path, _, _ = file_info
17
17
  try:
18
- if file_path.endswith(".pdf"):
19
- with open(file_path, "rb") as f:
20
- content = extract_text_from_pdf(f.read())
18
+ if file_path.endswith(".pdf"):
19
+ content = extract_text_from_pdf(file_path)
21
20
  v = [
22
21
  SourceCode(
23
22
  module_name=file_path,
@@ -25,9 +24,8 @@ def process_file_in_multi_process(
25
24
  tokens=count_tokens_worker(content),
26
25
  )
27
26
  ]
28
- elif file_path.endswith(".docx"):
29
- with open(file_path, "rb") as f:
30
- content = extract_text_from_docx(f.read())
27
+ elif file_path.endswith(".docx"):
28
+ content = extract_text_from_docx(file_path)
31
29
  v = [
32
30
  SourceCode(
33
31
  module_name=f"##File: {file_path}",
@@ -75,9 +73,8 @@ def process_file_in_multi_process(
75
73
  def process_file_local(file_path: str) -> List[SourceCode]:
76
74
  start_time = time.time()
77
75
  try:
78
- if file_path.endswith(".pdf"):
79
- with open(file_path, "rb") as f:
80
- content = extract_text_from_pdf(f.read())
76
+ if file_path.endswith(".pdf"):
77
+ content = extract_text_from_pdf(file_path)
81
78
  v = [
82
79
  SourceCode(
83
80
  module_name=file_path,
@@ -85,9 +82,8 @@ def process_file_local(file_path: str) -> List[SourceCode]:
85
82
  tokens=count_tokens(content),
86
83
  )
87
84
  ]
88
- elif file_path.endswith(".docx"):
89
- with open(file_path, "rb") as f:
90
- content = extract_text_from_docx(f.read())
85
+ elif file_path.endswith(".docx"):
86
+ content = extract_text_from_docx(file_path)
91
87
  v = [
92
88
  SourceCode(
93
89
  module_name=f"##File: {file_path}",