auto-coder 0.1.375__py3-none-any.whl → 0.1.376__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of auto-coder might be problematic.

Files changed (51)
  1. {auto_coder-0.1.375.dist-info → auto_coder-0.1.376.dist-info}/METADATA +1 -1
  2. {auto_coder-0.1.375.dist-info → auto_coder-0.1.376.dist-info}/RECORD +17 -51
  3. autocoder/agent/base_agentic/base_agent.py +9 -8
  4. autocoder/auto_coder_rag.py +12 -0
  5. autocoder/models.py +2 -2
  6. autocoder/rag/cache/local_duckdb_storage_cache.py +63 -33
  7. autocoder/rag/conversation_to_queries.py +37 -5
  8. autocoder/rag/long_context_rag.py +161 -41
  9. autocoder/rag/tools/recall_tool.py +2 -1
  10. autocoder/rag/tools/search_tool.py +2 -1
  11. autocoder/rag/types.py +36 -0
  12. autocoder/utils/_markitdown.py +59 -13
  13. autocoder/version.py +1 -1
  14. autocoder/agent/agentic_edit.py +0 -833
  15. autocoder/agent/agentic_edit_tools/__init__.py +0 -28
  16. autocoder/agent/agentic_edit_tools/ask_followup_question_tool_resolver.py +0 -32
  17. autocoder/agent/agentic_edit_tools/attempt_completion_tool_resolver.py +0 -29
  18. autocoder/agent/agentic_edit_tools/base_tool_resolver.py +0 -29
  19. autocoder/agent/agentic_edit_tools/execute_command_tool_resolver.py +0 -84
  20. autocoder/agent/agentic_edit_tools/list_code_definition_names_tool_resolver.py +0 -75
  21. autocoder/agent/agentic_edit_tools/list_files_tool_resolver.py +0 -62
  22. autocoder/agent/agentic_edit_tools/plan_mode_respond_tool_resolver.py +0 -30
  23. autocoder/agent/agentic_edit_tools/read_file_tool_resolver.py +0 -36
  24. autocoder/agent/agentic_edit_tools/replace_in_file_tool_resolver.py +0 -95
  25. autocoder/agent/agentic_edit_tools/search_files_tool_resolver.py +0 -70
  26. autocoder/agent/agentic_edit_tools/use_mcp_tool_resolver.py +0 -55
  27. autocoder/agent/agentic_edit_tools/write_to_file_tool_resolver.py +0 -98
  28. autocoder/agent/agentic_edit_types.py +0 -124
  29. autocoder/auto_coder_lang.py +0 -60
  30. autocoder/auto_coder_rag_client_mcp.py +0 -170
  31. autocoder/auto_coder_rag_mcp.py +0 -193
  32. autocoder/common/llm_rerank.py +0 -84
  33. autocoder/common/model_speed_test.py +0 -392
  34. autocoder/common/v2/agent/agentic_edit_conversation.py +0 -188
  35. autocoder/common/v2/agent/ignore_utils.py +0 -50
  36. autocoder/dispacher/actions/plugins/action_translate.py +0 -214
  37. autocoder/ignorefiles/__init__.py +0 -4
  38. autocoder/ignorefiles/ignore_file_utils.py +0 -63
  39. autocoder/ignorefiles/test_ignore_file_utils.py +0 -91
  40. autocoder/linters/code_linter.py +0 -588
  41. autocoder/rag/loaders/test_image_loader.py +0 -209
  42. autocoder/rag/raw_rag.py +0 -96
  43. autocoder/rag/simple_directory_reader.py +0 -646
  44. autocoder/rag/simple_rag.py +0 -404
  45. autocoder/regex_project/__init__.py +0 -162
  46. autocoder/utils/coder.py +0 -125
  47. autocoder/utils/tests.py +0 -37
  48. {auto_coder-0.1.375.dist-info → auto_coder-0.1.376.dist-info}/LICENSE +0 -0
  49. {auto_coder-0.1.375.dist-info → auto_coder-0.1.376.dist-info}/WHEEL +0 -0
  50. {auto_coder-0.1.375.dist-info → auto_coder-0.1.376.dist-info}/entry_points.txt +0 -0
  51. {auto_coder-0.1.375.dist-info → auto_coder-0.1.376.dist-info}/top_level.txt +0 -0
@@ -29,6 +29,9 @@ from autocoder.rag.searchable import SearchableResults
29
29
  from autocoder.rag.conversation_to_queries import extract_search_queries
30
30
  from autocoder.common import openai_content as OpenAIContentProcessor
31
31
  from autocoder.common.save_formatted_log import save_formatted_log
32
+ from autocoder.rag.types import (
33
+ RecallStat,ChunkStat,AnswerStat,OtherStat,RAGStat
34
+ )
32
35
  import json, os
33
36
  try:
34
37
  from autocoder_pro.rag.llm_compute import LLMComputeEngine
@@ -42,29 +45,6 @@ except ImportError:
42
45
  LLMComputeEngine = None
43
46
 
44
47
 
45
- class RecallStat(BaseModel):
46
- total_input_tokens: int
47
- total_generated_tokens: int
48
- model_name: str = "unknown"
49
-
50
-
51
- class ChunkStat(BaseModel):
52
- total_input_tokens: int
53
- total_generated_tokens: int
54
- model_name: str = "unknown"
55
-
56
-
57
- class AnswerStat(BaseModel):
58
- total_input_tokens: int
59
- total_generated_tokens: int
60
- model_name: str = "unknown"
61
-
62
-
63
- class RAGStat(BaseModel):
64
- recall_stat: RecallStat
65
- chunk_stat: ChunkStat
66
- answer_stat: AnswerStat
67
-
68
48
 
69
49
  class LongContextRAG:
70
50
  def __init__(
@@ -690,7 +670,7 @@ class LongContextRAG:
690
670
  yield gen_item
691
671
 
692
672
  # 打印最终的统计信息
693
- self._print_rag_stats(rag_stat)
673
+ self._print_rag_stats(rag_stat, conversations)
694
674
  return
695
675
 
696
676
  def _process_document_retrieval(self, conversations,
@@ -716,7 +696,7 @@ class LongContextRAG:
716
696
 
717
697
  # 提取查询并检索候选文档
718
698
  queries = extract_search_queries(
719
- conversations=conversations, args=self.args, llm=self.llm, max_queries=self.args.rag_recall_max_queries)
699
+ conversations=conversations, args=self.args, llm=self.llm, max_queries=self.args.rag_recall_max_queries,rag_stat=rag_stat)
720
700
  documents = self._retrieve_documents(
721
701
  options={"queries": [query] + [query.query for query in queries]})
722
702
 
@@ -913,7 +893,7 @@ class LongContextRAG:
913
893
  rag_stat.answer_stat.total_generated_tokens
914
894
  yield chunk
915
895
 
916
- def _print_rag_stats(self, rag_stat: RAGStat) -> None:
896
+ def _print_rag_stats(self, rag_stat: RAGStat, conversations: Optional[List[Dict[str, str]]] = None) -> None:
917
897
  """打印RAG执行的详细统计信息"""
918
898
  total_input_tokens = (
919
899
  rag_stat.recall_stat.total_input_tokens +
@@ -937,12 +917,46 @@ class LongContextRAG:
937
917
  rag_stat.chunk_stat.total_generated_tokens) / total_tokens * 100
938
918
  answer_percent = (rag_stat.answer_stat.total_input_tokens +
939
919
  rag_stat.answer_stat.total_generated_tokens) / total_tokens * 100
940
-
941
- logger.info(
942
- f"=== RAG 执行统计信息 ===\n"
920
+
921
+ # 计算其他阶段的令牌占比
922
+ other_percents = []
923
+ if total_tokens > 0 and rag_stat.other_stats:
924
+ for other_stat in rag_stat.other_stats:
925
+ other_percent = (other_stat.total_input_tokens +
926
+ other_stat.total_generated_tokens) / total_tokens * 100
927
+ other_percents.append(other_percent)
928
+
929
+ # 计算成本分布百分比
930
+ if rag_stat.cost == 0:
931
+ recall_cost_percent = chunk_cost_percent = answer_cost_percent = 0
932
+ else:
933
+ recall_cost_percent = rag_stat.recall_stat.cost / rag_stat.cost * 100
934
+ chunk_cost_percent = rag_stat.chunk_stat.cost / rag_stat.cost * 100
935
+ answer_cost_percent = rag_stat.answer_stat.cost / rag_stat.cost * 100
936
+
937
+ # 计算其他阶段的成本占比
938
+ other_costs_percent = []
939
+ if rag_stat.cost > 0 and rag_stat.other_stats:
940
+ for other_stat in rag_stat.other_stats:
941
+ other_costs_percent.append(other_stat.cost / rag_stat.cost * 100)
942
+
943
+ ## 这里会计算每个阶段的成本
944
+ estimated_cost = self._estimate_token_cost(rag_stat)
945
+ # 构建统计信息字符串
946
+ query_content = ""
947
+ if conversations and len(conversations) > 0:
948
+ query_content = conversations[-1].get("content", "")
949
+ if len(query_content) > 100:
950
+ query_content = query_content[:100] + "..."
951
+ query_content = f"查询内容: {query_content}\n"
952
+
953
+ stats_str = (
954
+ f"=== (RAG 执行统计信息) ===\n"
955
+ f"{query_content}"
943
956
  f"总令牌使用: {total_tokens} 令牌\n"
944
957
  f" * 输入令牌总数: {total_input_tokens}\n"
945
958
  f" * 生成令牌总数: {total_generated_tokens}\n"
959
+ f" * 总成本: {rag_stat.cost:.6f}\n"
946
960
  f"\n"
947
961
  f"阶段统计:\n"
948
962
  f" 1. 文档检索阶段:\n"
@@ -950,40 +964,146 @@ class LongContextRAG:
950
964
  f" - 输入令牌: {rag_stat.recall_stat.total_input_tokens}\n"
951
965
  f" - 生成令牌: {rag_stat.recall_stat.total_generated_tokens}\n"
952
966
  f" - 阶段总计: {rag_stat.recall_stat.total_input_tokens + rag_stat.recall_stat.total_generated_tokens}\n"
967
+ f" - 阶段成本: {rag_stat.recall_stat.cost:.6f}\n"
953
968
  f"\n"
954
969
  f" 2. 文档分块阶段:\n"
955
970
  f" - 模型: {rag_stat.chunk_stat.model_name}\n"
956
971
  f" - 输入令牌: {rag_stat.chunk_stat.total_input_tokens}\n"
957
972
  f" - 生成令牌: {rag_stat.chunk_stat.total_generated_tokens}\n"
958
973
  f" - 阶段总计: {rag_stat.chunk_stat.total_input_tokens + rag_stat.chunk_stat.total_generated_tokens}\n"
974
+ f" - 阶段成本: {rag_stat.chunk_stat.cost:.6f}\n"
959
975
  f"\n"
960
976
  f" 3. 答案生成阶段:\n"
961
977
  f" - 模型: {rag_stat.answer_stat.model_name}\n"
962
978
  f" - 输入令牌: {rag_stat.answer_stat.total_input_tokens}\n"
963
979
  f" - 生成令牌: {rag_stat.answer_stat.total_generated_tokens}\n"
964
980
  f" - 阶段总计: {rag_stat.answer_stat.total_input_tokens + rag_stat.answer_stat.total_generated_tokens}\n"
981
+ f" - 阶段成本: {rag_stat.answer_stat.cost:.6f}\n"
965
982
  f"\n"
983
+ )
984
+
985
+ # 如果存在 other_stats,添加其统计信息
986
+ if rag_stat.other_stats:
987
+ for i, other_stat in enumerate(rag_stat.other_stats):
988
+ stats_str += (
989
+ f" {i+4}. 其他阶段 {i+1}:\n"
990
+ f" - 模型: {other_stat.model_name}\n"
991
+ f" - 输入令牌: {other_stat.total_input_tokens}\n"
992
+ f" - 生成令牌: {other_stat.total_generated_tokens}\n"
993
+ f" - 阶段总计: {other_stat.total_input_tokens + other_stat.total_generated_tokens}\n"
994
+ f" - 阶段成本: {other_stat.cost:.6f}\n"
995
+ f"\n"
996
+ )
997
+
998
+ # 添加令牌分布百分比
999
+ stats_str += (
966
1000
  f"令牌分布百分比:\n"
967
1001
  f" - 文档检索: {recall_percent:.1f}%\n"
968
1002
  f" - 文档分块: {chunk_percent:.1f}%\n"
969
1003
  f" - 答案生成: {answer_percent:.1f}%\n"
970
1004
  )
1005
+
1006
+ # 如果存在 other_stats,添加其令牌占比
1007
+ if rag_stat.other_stats:
1008
+ for i, other_percent in enumerate(other_percents):
1009
+ if other_percent > 0:
1010
+ stats_str += f" - 其他阶段 {i+1}: {other_percent:.1f}%\n"
1011
+
1012
+ # 添加成本分布百分比
1013
+ stats_str += (
1014
+ f"\n"
1015
+ f"成本分布百分比:\n"
1016
+ f" - 文档检索: {recall_cost_percent:.1f}%\n"
1017
+ f" - 文档分块: {chunk_cost_percent:.1f}%\n"
1018
+ f" - 答案生成: {answer_cost_percent:.1f}%\n"
1019
+ )
1020
+
1021
+ # 如果存在 other_stats,添加其成本占比
1022
+ if rag_stat.other_stats:
1023
+ for i, other_cost_percent in enumerate(other_costs_percent):
1024
+ if other_cost_percent > 0:
1025
+ stats_str += f" - 其他阶段 {i+1}: {other_cost_percent:.1f}%\n"
1026
+
1027
+ # 输出统计信息
1028
+ logger.info(stats_str)
971
1029
 
972
1030
  # 记录原始统计数据,以便调试
973
1031
  logger.debug(f"RAG Stat 原始数据: {rag_stat}")
974
1032
 
975
- # 返回成本估算
976
- estimated_cost = self._estimate_token_cost(
977
- total_input_tokens, total_generated_tokens)
1033
+
978
1034
  if estimated_cost > 0:
979
- logger.info(f"估计成本: 约 ${estimated_cost:.4f} 人民币")
1035
+ logger.info(f"估计成本: 约 {estimated_cost:.4f} ")
980
1036
 
981
- def _estimate_token_cost(self, input_tokens: int, output_tokens: int) -> float:
1037
+ def _estimate_token_cost(self, rag_stat: RAGStat) -> float:
982
1038
  """估算当前请求的令牌成本(人民币)"""
983
- # 实际应用中,可以根据不同模型设置不同价格
984
- input_cost_per_1m = 2.0/1000000 # 每百万输入令牌的成本
985
- output_cost_per_1m = 8.0/100000 # 每百万输出令牌的成本
986
-
987
- cost = (input_tokens * input_cost_per_1m / 1000000) + \
988
- (output_tokens * output_cost_per_1m/1000000)
989
- return cost
1039
+ from autocoder.models import get_model_by_name
1040
+
1041
+ total_cost = 0.0
1042
+
1043
+ # 计算召回阶段成本
1044
+ if rag_stat.recall_stat.model_name != "unknown":
1045
+ try:
1046
+ recall_model = get_model_by_name(rag_stat.recall_stat.model_name)
1047
+ input_cost = recall_model.get("input_price", 0.0) / 1000000
1048
+ output_cost = recall_model.get("output_price", 0.0) / 1000000
1049
+ recall_cost = (rag_stat.recall_stat.total_input_tokens * input_cost) + \
1050
+ (rag_stat.recall_stat.total_generated_tokens * output_cost)
1051
+ total_cost += recall_cost
1052
+ except Exception as e:
1053
+ logger.warning(f"计算召回阶段成本时出错: {str(e)}")
1054
+ recall_cost = 0.0
1055
+ total_cost += recall_cost
1056
+ rag_stat.recall_stat.cost = recall_cost
1057
+
1058
+ # 计算分块阶段成本
1059
+ if rag_stat.chunk_stat.model_name != "unknown":
1060
+ try:
1061
+ chunk_model = get_model_by_name(rag_stat.chunk_stat.model_name)
1062
+ input_cost = chunk_model.get("input_price", 0.0) / 1000000
1063
+ output_cost = chunk_model.get("output_price", 0.0) / 1000000
1064
+ chunk_cost = (rag_stat.chunk_stat.total_input_tokens * input_cost) + \
1065
+ (rag_stat.chunk_stat.total_generated_tokens * output_cost)
1066
+ total_cost += chunk_cost
1067
+ except Exception as e:
1068
+ logger.warning(f"计算分块阶段成本时出错: {str(e)}")
1069
+ # 使用默认值
1070
+ chunk_cost = 0.0
1071
+ total_cost += chunk_cost
1072
+ rag_stat.chunk_stat.cost = chunk_cost
1073
+
1074
+ # 计算答案生成阶段成本
1075
+ if rag_stat.answer_stat.model_name != "unknown":
1076
+ try:
1077
+ answer_model = get_model_by_name(rag_stat.answer_stat.model_name)
1078
+ input_cost = answer_model.get("input_price", 0.0) / 1000000
1079
+ output_cost = answer_model.get("output_price", 0.0) / 1000000
1080
+ answer_cost = (rag_stat.answer_stat.total_input_tokens * input_cost) + \
1081
+ (rag_stat.answer_stat.total_generated_tokens * output_cost)
1082
+ total_cost += answer_cost
1083
+ except Exception as e:
1084
+ logger.warning(f"计算答案生成阶段成本时出错: {str(e)}")
1085
+ # 使用默认值
1086
+ answer_cost = 0.0
1087
+ total_cost += answer_cost
1088
+ rag_stat.answer_stat.cost = answer_cost
1089
+
1090
+ # 计算其他阶段成本(如果存在)
1091
+ for i, other_stat in enumerate(rag_stat.other_stats):
1092
+ if other_stat.model_name != "unknown":
1093
+ try:
1094
+ other_model = get_model_by_name(other_stat.model_name)
1095
+ input_cost = other_model.get("input_price", 0.0) / 1000000
1096
+ output_cost = other_model.get("output_price", 0.0) / 1000000
1097
+ other_cost = (other_stat.total_input_tokens * input_cost) + \
1098
+ (other_stat.total_generated_tokens * output_cost)
1099
+ total_cost += other_cost
1100
+ except Exception as e:
1101
+ logger.warning(f"计算其他阶段 {i+1} 成本时出错: {str(e)}")
1102
+ # 使用默认值
1103
+ other_cost = 0.0
1104
+ total_cost += other_cost
1105
+ rag_stat.other_stats[i].cost = other_cost
1106
+
1107
+ # 将总成本保存到 rag_stat
1108
+ rag_stat.cost = total_cost
1109
+ return total_cost
@@ -16,7 +16,8 @@ from autocoder.agent.base_agentic.tool_registry import ToolRegistry
16
16
  from autocoder.agent.base_agentic.tools.base_tool_resolver import BaseToolResolver
17
17
  from autocoder.agent.base_agentic.types import ToolDescription, ToolExample
18
18
  from autocoder.common import AutoCoderArgs
19
- from autocoder.rag.long_context_rag import LongContextRAG, RecallStat, ChunkStat, AnswerStat, RAGStat
19
+ from autocoder.rag.long_context_rag import LongContextRAG
20
+ from autocoder.rag.types import RecallStat, ChunkStat, AnswerStat, RAGStat
20
21
  from autocoder.rag.relevant_utils import FilterDoc, DocRelevance, DocFilterResult
21
22
  from autocoder.common import SourceCode
22
23
  from autocoder.rag.relevant_utils import TaskTiming
@@ -15,7 +15,8 @@ from autocoder.agent.base_agentic.tool_registry import ToolRegistry
15
15
  from autocoder.agent.base_agentic.tools.base_tool_resolver import BaseToolResolver
16
16
  from autocoder.agent.base_agentic.types import ToolDescription, ToolExample
17
17
  from autocoder.common import AutoCoderArgs
18
- from autocoder.rag.long_context_rag import LongContextRAG, RecallStat, ChunkStat, AnswerStat, RAGStat
18
+ from autocoder.rag.long_context_rag import LongContextRAG
19
+ from autocoder.rag.types import RecallStat, ChunkStat, AnswerStat, RAGStat
19
20
  from autocoder.rag.relevant_utils import FilterDoc, DocRelevance, DocFilterResult
20
21
 
21
22
 
autocoder/rag/types.py CHANGED
@@ -3,10 +3,46 @@ import os
3
3
  import json
4
4
  import time
5
5
  import pydantic
6
+ from pydantic import BaseModel
6
7
  from typing import Dict, Any, Optional, List
7
8
  import psutil
8
9
  import glob
9
10
 
11
+ class RecallStat(BaseModel):
12
+ total_input_tokens: int
13
+ total_generated_tokens: int
14
+ model_name: str = "unknown"
15
+ cost:float = 0.0
16
+
17
+
18
+ class ChunkStat(BaseModel):
19
+ total_input_tokens: int
20
+ total_generated_tokens: int
21
+ model_name: str = "unknown"
22
+ cost:float = 0.0
23
+
24
+
25
+ class AnswerStat(BaseModel):
26
+ total_input_tokens: int
27
+ total_generated_tokens: int
28
+ model_name: str = "unknown"
29
+ cost:float = 0.0
30
+
31
+
32
+ class OtherStat(BaseModel):
33
+ total_input_tokens: int = 0
34
+ total_generated_tokens: int = 0
35
+ model_name: str = "unknown"
36
+ cost:float = 0.0
37
+
38
+
39
+ class RAGStat(BaseModel):
40
+ recall_stat: RecallStat
41
+ chunk_stat: ChunkStat
42
+ answer_stat: AnswerStat
43
+ other_stats: List[OtherStat] = []
44
+ cost:float = 0.0
45
+
10
46
  class RAGServiceInfo(pydantic.BaseModel):
11
47
  host: str
12
48
  port: int
@@ -151,7 +151,31 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
151
151
  return "![%s](%s%s)" % (alt, src, title_part)
152
152
 
153
153
  def convert_soup(self, soup: Any) -> str:
154
- return super().convert_soup(soup) # type: ignore
154
+ try:
155
+ # 设置递归深度限制,避免复杂文档导致的递归错误
156
+ import sys
157
+ original_limit = sys.getrecursionlimit()
158
+ try:
159
+ # 增加递归深度限制
160
+ sys.setrecursionlimit(10000) # 设置更高的递归限制
161
+ return super().convert_soup(soup) # type: ignore
162
+ finally:
163
+ # 恢复原始递归深度限制
164
+ sys.setrecursionlimit(original_limit)
165
+ except RecursionError:
166
+ # 处理递归错误,尝试简化处理
167
+ logger.warning("RecursionError in convert_soup, falling back to simplified conversion")
168
+ # 返回简化的文本内容
169
+ return self._simplified_convert(soup)
170
+
171
+ def _simplified_convert(self, soup: Any) -> str:
172
+ """简化的转换方法,用于处理复杂文档时的回退方案"""
173
+ # 提取纯文本内容
174
+ text = soup.get_text(separator="\n", strip=True)
175
+ # 基本清理
176
+ text = re.sub(r'\s+', ' ', text)
177
+ text = re.sub(r'\n{3,}', '\n\n', text)
178
+ return text
155
179
 
156
180
 
157
181
  class DocumentConverterResult:
@@ -224,20 +248,42 @@ class HtmlConverter(DocumentConverter):
224
248
  for script in soup(["script", "style"]):
225
249
  script.extract()
226
250
 
227
- # Print only the main content
228
- body_elm = soup.find("body")
229
- webpage_text = ""
230
- if body_elm:
231
- webpage_text = _CustomMarkdownify().convert_soup(body_elm)
232
- else:
233
- webpage_text = _CustomMarkdownify().convert_soup(soup)
251
+ try:
252
+ # Print only the main content
253
+ body_elm = soup.find("body")
254
+ webpage_text = ""
255
+ if body_elm:
256
+ webpage_text = _CustomMarkdownify().convert_soup(body_elm)
257
+ else:
258
+ webpage_text = _CustomMarkdownify().convert_soup(soup)
234
259
 
235
- assert isinstance(webpage_text, str)
260
+ assert isinstance(webpage_text, str)
236
261
 
237
- return DocumentConverterResult(
238
- title=None if soup.title is None else soup.title.string,
239
- text_content=webpage_text,
240
- )
262
+ return DocumentConverterResult(
263
+ title=None if soup.title is None else soup.title.string,
264
+ text_content=webpage_text,
265
+ )
266
+ except Exception as e:
267
+ # 如果转换过程中出现任何错误,尝试使用简化的方法提取文本
268
+ logger.warning(f"Error in HTML conversion: {str(e)}. Falling back to simplified text extraction.")
269
+ try:
270
+ # 简化的文本提取
271
+ text = soup.get_text(separator="\n", strip=True)
272
+ # 基本清理
273
+ text = re.sub(r'\s+', ' ', text)
274
+ text = re.sub(r'\n{3,}', '\n\n', text)
275
+
276
+ return DocumentConverterResult(
277
+ title=None if soup.title is None else soup.title.string,
278
+ text_content=text,
279
+ )
280
+ except Exception as inner_e:
281
+ # 如果简化提取也失败,记录错误并返回空结果
282
+ logger.error(f"Failed to extract text with simplified method: {str(inner_e)}")
283
+ return DocumentConverterResult(
284
+ title=None,
285
+ text_content=f"[文档转换失败] 无法提取内容: {str(e)}",
286
+ )
241
287
 
242
288
 
243
289
  class WikipediaConverter(DocumentConverter):
autocoder/version.py CHANGED
@@ -1,2 +1,2 @@
1
1
 
2
- __version__ = "0.1.375"
2
+ __version__ = "0.1.376"