AstrBot: astrbot-4.3.5-py3-none-any.whl → astrbot-4.5.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. astrbot/core/agent/runners/tool_loop_agent_runner.py +31 -2
  2. astrbot/core/astrbot_config_mgr.py +23 -51
  3. astrbot/core/config/default.py +132 -12
  4. astrbot/core/conversation_mgr.py +36 -1
  5. astrbot/core/core_lifecycle.py +24 -5
  6. astrbot/core/db/migration/helper.py +6 -3
  7. astrbot/core/db/migration/migra_45_to_46.py +44 -0
  8. astrbot/core/db/vec_db/base.py +33 -2
  9. astrbot/core/db/vec_db/faiss_impl/document_storage.py +310 -52
  10. astrbot/core/db/vec_db/faiss_impl/embedding_storage.py +31 -3
  11. astrbot/core/db/vec_db/faiss_impl/vec_db.py +81 -23
  12. astrbot/core/file_token_service.py +6 -1
  13. astrbot/core/initial_loader.py +6 -3
  14. astrbot/core/knowledge_base/chunking/__init__.py +11 -0
  15. astrbot/core/knowledge_base/chunking/base.py +24 -0
  16. astrbot/core/knowledge_base/chunking/fixed_size.py +57 -0
  17. astrbot/core/knowledge_base/chunking/recursive.py +155 -0
  18. astrbot/core/knowledge_base/kb_db_sqlite.py +299 -0
  19. astrbot/core/knowledge_base/kb_helper.py +348 -0
  20. astrbot/core/knowledge_base/kb_mgr.py +287 -0
  21. astrbot/core/knowledge_base/models.py +114 -0
  22. astrbot/core/knowledge_base/parsers/__init__.py +15 -0
  23. astrbot/core/knowledge_base/parsers/base.py +50 -0
  24. astrbot/core/knowledge_base/parsers/markitdown_parser.py +25 -0
  25. astrbot/core/knowledge_base/parsers/pdf_parser.py +100 -0
  26. astrbot/core/knowledge_base/parsers/text_parser.py +41 -0
  27. astrbot/core/knowledge_base/parsers/util.py +13 -0
  28. astrbot/core/knowledge_base/retrieval/__init__.py +16 -0
  29. astrbot/core/knowledge_base/retrieval/hit_stopwords.txt +767 -0
  30. astrbot/core/knowledge_base/retrieval/manager.py +273 -0
  31. astrbot/core/knowledge_base/retrieval/rank_fusion.py +138 -0
  32. astrbot/core/knowledge_base/retrieval/sparse_retriever.py +130 -0
  33. astrbot/core/pipeline/process_stage/method/llm_request.py +29 -7
  34. astrbot/core/pipeline/process_stage/utils.py +80 -0
  35. astrbot/core/platform/astr_message_event.py +8 -7
  36. astrbot/core/platform/sources/dingtalk/dingtalk_adapter.py +5 -2
  37. astrbot/core/platform/sources/misskey/misskey_adapter.py +380 -44
  38. astrbot/core/platform/sources/misskey/misskey_api.py +581 -45
  39. astrbot/core/platform/sources/misskey/misskey_event.py +76 -41
  40. astrbot/core/platform/sources/misskey/misskey_utils.py +254 -43
  41. astrbot/core/platform/sources/qqofficial_webhook/qo_webhook_server.py +2 -1
  42. astrbot/core/platform/sources/satori/satori_adapter.py +27 -1
  43. astrbot/core/platform/sources/satori/satori_event.py +270 -99
  44. astrbot/core/provider/manager.py +22 -9
  45. astrbot/core/provider/provider.py +67 -0
  46. astrbot/core/provider/sources/anthropic_source.py +4 -4
  47. astrbot/core/provider/sources/dashscope_source.py +10 -9
  48. astrbot/core/provider/sources/dify_source.py +6 -8
  49. astrbot/core/provider/sources/gemini_embedding_source.py +1 -2
  50. astrbot/core/provider/sources/openai_embedding_source.py +1 -2
  51. astrbot/core/provider/sources/openai_source.py +43 -15
  52. astrbot/core/provider/sources/openai_tts_api_source.py +1 -1
  53. astrbot/core/provider/sources/xinference_rerank_source.py +108 -0
  54. astrbot/core/provider/sources/xinference_stt_provider.py +187 -0
  55. astrbot/core/star/context.py +19 -13
  56. astrbot/core/star/star.py +6 -0
  57. astrbot/core/star/star_manager.py +13 -7
  58. astrbot/core/umop_config_router.py +81 -0
  59. astrbot/core/updator.py +1 -1
  60. astrbot/core/utils/io.py +23 -12
  61. astrbot/dashboard/routes/__init__.py +2 -0
  62. astrbot/dashboard/routes/config.py +137 -9
  63. astrbot/dashboard/routes/knowledge_base.py +1065 -0
  64. astrbot/dashboard/routes/plugin.py +24 -5
  65. astrbot/dashboard/routes/update.py +1 -1
  66. astrbot/dashboard/server.py +6 -0
  67. astrbot/dashboard/utils.py +161 -0
  68. {astrbot-4.3.5.dist-info → astrbot-4.5.1.dist-info}/METADATA +30 -13
  69. {astrbot-4.3.5.dist-info → astrbot-4.5.1.dist-info}/RECORD +72 -46
  70. {astrbot-4.3.5.dist-info → astrbot-4.5.1.dist-info}/WHEEL +0 -0
  71. {astrbot-4.3.5.dist-info → astrbot-4.5.1.dist-info}/entry_points.txt +0 -0
  72. {astrbot-4.3.5.dist-info → astrbot-4.5.1.dist-info}/licenses/LICENSE +0 -0

astrbot/core/db/vec_db/faiss_impl/vec_db.py

@@ -1,11 +1,12 @@
  import uuid
- import json
+ import time
  import numpy as np
  from .document_storage import DocumentStorage
  from .embedding_storage import EmbeddingStorage
  from ..base import Result, BaseVecDB
  from astrbot.core.provider.provider import EmbeddingProvider
  from astrbot.core.provider.provider import RerankProvider
+ from astrbot import logger


  class FaissVecDB(BaseVecDB):
@@ -44,18 +45,56 @@ class FaissVecDB(BaseVecDB):

          vector = await self.embedding_provider.get_embedding(content)
          vector = np.array(vector, dtype=np.float32)
-         async with self.document_storage.connection.cursor() as cursor:
-             await cursor.execute(
-                 "INSERT INTO documents (doc_id, text, metadata) VALUES (?, ?, ?)",
-                 (str_id, content, json.dumps(metadata)),
-             )
-             await self.document_storage.connection.commit()
-         result = await self.document_storage.get_document_by_doc_id(str_id)
-         int_id = result["id"]

-         # Insert the vector into FAISS
-         await self.embedding_storage.insert(vector, int_id)
-         return int_id
+         # Insert the document via DocumentStorage
+         int_id = await self.document_storage.insert_document(str_id, content, metadata)
+
+         # Insert the vector into FAISS
+         await self.embedding_storage.insert(vector, int_id)
+         return int_id
+
+     async def insert_batch(
+         self,
+         contents: list[str],
+         metadatas: list[dict] | None = None,
+         ids: list[str] | None = None,
+         batch_size: int = 32,
+         tasks_limit: int = 3,
+         max_retries: int = 3,
+         progress_callback=None,
+     ) -> list[int]:
+         """
+         Insert texts and their corresponding vectors in bulk, generating IDs automatically and keeping them consistent.
+
+         Args:
+             progress_callback: progress callback invoked with (current, total)
+         """
+         metadatas = metadatas or [{} for _ in contents]
+         ids = ids or [str(uuid.uuid4()) for _ in contents]
+
+         start = time.time()
+         logger.debug(f"Generating embeddings for {len(contents)} contents...")
+         vectors = await self.embedding_provider.get_embeddings_batch(
+             contents,
+             batch_size=batch_size,
+             tasks_limit=tasks_limit,
+             max_retries=max_retries,
+             progress_callback=progress_callback,
+         )
+         end = time.time()
+         logger.debug(
+             f"Generated embeddings for {len(contents)} contents in {end - start:.2f} seconds."
+         )
+
+         # Bulk-insert documents via DocumentStorage
+         int_ids = await self.document_storage.insert_documents_batch(
+             ids, contents, metadatas
+         )
+
+         # Bulk-insert vectors into FAISS
+         vectors_array = np.array(vectors).astype("float32")
+         await self.embedding_storage.insert_batch(vectors_array, int_ids)
+         return int_ids

      async def retrieve(
          self,
@@ -119,23 +158,42 @@ class FaissVecDB(BaseVecDB):

          return top_k_results

-     async def delete(self, doc_id: int):
+     async def delete(self, doc_id: str):
          """
-         Delete a single document
+         Delete a single document chunk
          """
-         await self.document_storage.connection.execute(
-             "DELETE FROM documents WHERE doc_id = ?", (doc_id,)
-         )
-         await self.document_storage.connection.commit()
+         # Look up the corresponding int id
+         result = await self.document_storage.get_document_by_doc_id(doc_id)
+         int_id = result["id"] if result else None
+         if int_id is None:
+             return
+
+         # Delete via DocumentStorage's delete method
+         await self.document_storage.delete_document_by_doc_id(doc_id)
+         await self.embedding_storage.delete([int_id])

      async def close(self):
          await self.document_storage.close()

-     async def count_documents(self) -> int:
+     async def count_documents(self, metadata_filter: dict | None = None) -> int:
          """
          Count the number of documents
+
+         Args:
+             metadata_filter (dict | None): metadata filter
          """
-         async with self.document_storage.connection.cursor() as cursor:
-             await cursor.execute("SELECT COUNT(*) FROM documents")
-             count = await cursor.fetchone()
-             return count[0] if count else 0
+         count = await self.document_storage.count_documents(
+             metadata_filters=metadata_filter or {}
+         )
+         return count
+
+     async def delete_documents(self, metadata_filters: dict):
+         """
+         Delete documents matching a metadata filter
+         """
+         docs = await self.document_storage.get_documents(
+             metadata_filters=metadata_filters, offset=None, limit=None
+         )
+         doc_ids: list[int] = [doc["id"] for doc in docs]
+         await self.embedding_storage.delete(doc_ids)
+         await self.document_storage.delete_documents(metadata_filters=metadata_filters)
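
Taken together, the FaissVecDB changes move the raw SQL into DocumentStorage and add batch insertion plus metadata-filtered counting and deletion. A minimal usage sketch (not part of the diff), assuming an already-constructed FaissVecDB instance `db`; the constructor arguments and the "collection" metadata key are assumptions for illustration:

    async def demo(db):  # db: a FaissVecDB built elsewhere; construction is not shown in this diff
        # Batch-insert chunks, tagging each with a hypothetical collection id
        int_ids = await db.insert_batch(
            contents=["first chunk", "second chunk"],
            metadatas=[{"collection": "kb_demo"}, {"collection": "kb_demo"}],
            batch_size=32,
            tasks_limit=3,
            progress_callback=lambda current, total: print(f"{current}/{total}"),
        )

        # Count only the documents carrying that metadata
        count = await db.count_documents(metadata_filter={"collection": "kb_demo"})
        print(count, int_ids)

        # Remove everything in the hypothetical collection (documents and vectors)
        await db.delete_documents(metadata_filters={"collection": "kb_demo"})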

astrbot/core/file_token_service.py

@@ -23,7 +23,12 @@ class FileTokenService:
          for token in expired_tokens:
              self.staged_files.pop(token, None)

-     async def register_file(self, file_path: str, timeout: float = None) -> str:
+     async def check_token_expired(self, file_token: str) -> bool:
+         async with self.lock:
+             await self._cleanup_expired_tokens()
+             return file_token not in self.staged_files
+
+     async def register_file(self, file_path: str, timeout: float | None = None) -> str:
          """Register a file with the token service.

          Args:
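
A short sketch (not part of the diff) of how the new check_token_expired helper could be used alongside register_file; the file path and timeout value are made up, and the timeout semantics are assumed:

    async def stage_and_check(svc, path="/tmp/example.png"):
        # svc: a FileTokenService instance; construction is not shown in this diff
        token = await svc.register_file(path, timeout=60)  # stage the file, assumed to expire after 60 s
        expired = await svc.check_token_expired(token)     # False right after registration, if the timeout has not elapsed
        return token, expired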

astrbot/core/initial_loader.py

@@ -41,10 +41,13 @@ class InitialLoader:
          self.dashboard_server = AstrBotDashboard(
              core_lifecycle, self.db, core_lifecycle.dashboard_shutdown_event, webui_dir
          )
-         task = asyncio.gather(
-             core_task, self.dashboard_server.run()
-         )  # Start the core task and the dashboard server

+         coro = self.dashboard_server.run()
+         if coro:
+             # Start the core task and the dashboard server
+             task = asyncio.gather(core_task, coro)
+         else:
+             task = core_task
          try:
              await task  # The whole of AstrBot runs here
          except asyncio.CancelledError:

astrbot/core/knowledge_base/chunking/__init__.py

@@ -0,0 +1,11 @@
+ """
+ Document chunking module
+ """
+
+ from .base import BaseChunker
+ from .fixed_size import FixedSizeChunker
+
+ __all__ = [
+     "BaseChunker",
+     "FixedSizeChunker",
+ ]

astrbot/core/knowledge_base/chunking/base.py

@@ -0,0 +1,24 @@
+ """Base class for document chunkers
+
+ Defines the abstract interface for document chunking.
+ """
+
+ from abc import ABC, abstractmethod
+
+
+ class BaseChunker(ABC):
+     """Chunker base class
+
+     All chunkers should inherit from this class and implement the chunk method.
+     """
+
+     @abstractmethod
+     async def chunk(self, text: str, **kwargs) -> list[str]:
+         """Split text into chunks
+
+         Args:
+             text: input text
+
+         Returns:
+             list[str]: list of text chunks
+         """

astrbot/core/knowledge_base/chunking/fixed_size.py

@@ -0,0 +1,57 @@
+ """Fixed-size chunker
+
+ Splits text into chunks of a fixed number of characters, with support for overlap.
+ """
+
+ from .base import BaseChunker
+
+
+ class FixedSizeChunker(BaseChunker):
+     """Fixed-size chunker
+
+     Chunks by a fixed character count and supports overlap between chunks.
+     """
+
+     def __init__(self, chunk_size: int = 512, chunk_overlap: int = 50):
+         """Initialize the chunker
+
+         Args:
+             chunk_size: size of each chunk (in characters)
+             chunk_overlap: number of overlapping characters between chunks
+         """
+         self.chunk_size = chunk_size
+         self.chunk_overlap = chunk_overlap
+
+     async def chunk(self, text: str, **kwargs) -> list[str]:
+         """Fixed-size chunking
+
+         Args:
+             text: input text
+             chunk_size: maximum size of each text chunk
+             chunk_overlap: size of the overlap between consecutive chunks
+
+         Returns:
+             list[str]: list of text chunks
+         """
+         chunk_size = kwargs.get("chunk_size", self.chunk_size)
+         chunk_overlap = kwargs.get("chunk_overlap", self.chunk_overlap)
+
+         chunks = []
+         start = 0
+         text_len = len(text)
+
+         while start < text_len:
+             end = start + chunk_size
+             chunk = text[start:end]
+
+             if chunk:
+                 chunks.append(chunk)
+
+             # Slide the window, keeping the overlap
+             start = end - chunk_overlap
+
+             # Guard against an infinite loop: if the overlap is too large, jump straight to end
+             if start >= end or chunk_overlap >= chunk_size:
+                 start = end
+
+         return chunks
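
A quick usage sketch (not part of the diff) showing the sliding-window behaviour of this chunker; the sample text and sizes are arbitrary:

    import asyncio

    from astrbot.core.knowledge_base.chunking import FixedSizeChunker


    async def main():
        chunker = FixedSizeChunker(chunk_size=10, chunk_overlap=3)
        chunks = await chunker.chunk("abcdefghijklmnopqrstuvwxyz")
        # Each chunk is up to 10 characters and repeats the last 3 characters of the previous one:
        # ['abcdefghij', 'hijklmnopq', 'opqrstuvwx', 'vwxyz']
        print(chunks)


    asyncio.run(main())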

astrbot/core/knowledge_base/chunking/recursive.py

@@ -0,0 +1,155 @@
+ from collections.abc import Callable
+ from .base import BaseChunker
+
+
+ class RecursiveCharacterChunker(BaseChunker):
+     def __init__(
+         self,
+         chunk_size: int = 500,
+         chunk_overlap: int = 100,
+         length_function: Callable[[str], int] = len,
+         is_separator_regex: bool = False,
+         separators: list[str] | None = None,
+     ):
+         """
+         Initialize the recursive character text splitter
+
+         Args:
+             chunk_size: maximum size of each text chunk
+             chunk_overlap: size of the overlap between consecutive chunks
+             length_function: function used to measure text length
+             is_separator_regex: whether the separators are regular expressions
+             separators: list of separators to split on, ordered by priority
+         """
+         self.chunk_size = chunk_size
+         self.chunk_overlap = chunk_overlap
+         self.length_function = length_function
+         self.is_separator_regex = is_separator_regex
+
+         # Default separators, from highest to lowest priority
+         self.separators = separators or [
+             "\n\n",  # paragraphs
+             "\n",  # line breaks
+             "。",  # Chinese sentences
+             ",",  # Chinese commas
+             ". ",  # sentences
+             ", ",  # comma-separated clauses
+             " ",  # words
+             "",  # characters
+         ]
+
+     async def chunk(self, text: str, **kwargs) -> list[str]:
+         """
+         Recursively split text into chunks
+
+         Args:
+             text: text to split
+             chunk_size: maximum size of each text chunk
+             chunk_overlap: size of the overlap between consecutive chunks
+
+         Returns:
+             list of text chunks
+         """
+         if not text:
+             return []
+
+         overlap = kwargs.get("chunk_overlap", self.chunk_overlap)
+         chunk_size = kwargs.get("chunk_size", self.chunk_size)
+
+         text_length = self.length_function(text)
+         if text_length <= chunk_size:
+             return [text]
+
+         for separator in self.separators:
+             if separator == "":
+                 return self._split_by_character(text, chunk_size, overlap)
+
+             if separator in text:
+                 splits = text.split(separator)
+                 # Re-append the separator (except to the last fragment)
+                 splits = [s + separator for s in splits[:-1]] + [splits[-1]]
+                 splits = [s for s in splits if s]
+                 if len(splits) == 1:
+                     continue
+
+                 # Recursively merge the split fragments into chunks
+                 final_chunks = []
+                 current_chunk = []
+                 current_chunk_length = 0
+
+                 for split in splits:
+                     split_length = self.length_function(split)
+
+                     # If a single fragment already exceeds chunk_size, split it recursively
+                     if split_length > chunk_size:
+                         # First flush the chunk accumulated so far
+                         if current_chunk:
+                             combined_text = "".join(current_chunk)
+                             final_chunks.extend(
+                                 await self.chunk(
+                                     combined_text,
+                                     chunk_size=chunk_size,
+                                     chunk_overlap=overlap,
+                                 )
+                             )
+                             current_chunk = []
+                             current_chunk_length = 0
+
+                         # Recursively split the oversized fragment
+                         final_chunks.extend(
+                             await self.chunk(
+                                 split, chunk_size=chunk_size, chunk_overlap=overlap
+                             )
+                         )
+                     # If adding this fragment would push the current chunk past chunk_size
+                     elif current_chunk_length + split_length > chunk_size:
+                         # Merge the current chunk and add it to the results
+                         combined_text = "".join(current_chunk)
+                         final_chunks.append(combined_text)
+
+                         # Handle the overlap
+                         overlap_start = max(0, len(combined_text) - overlap)
+                         if overlap_start > 0:
+                             overlap_text = combined_text[overlap_start:]
+                             current_chunk = [overlap_text, split]
+                             current_chunk_length = (
+                                 self.length_function(overlap_text) + split_length
+                             )
+                         else:
+                             current_chunk = [split]
+                             current_chunk_length = split_length
+                     else:
+                         # Add the fragment to the current chunk
+                         current_chunk.append(split)
+                         current_chunk_length += split_length
+
+                 # Handle whatever is left over
+                 if current_chunk:
+                     final_chunks.append("".join(current_chunk))
+
+                 return final_chunks
+
+         return [text]
+
+     def _split_by_character(
+         self, text: str, chunk_size: int | None = None, overlap: int | None = None
+     ) -> list[str]:
+         """
+         Split text at the character level
+
+         Args:
+             text: text to split
+
+         Returns:
+             list of text chunks
+         """
+         chunk_size = chunk_size or self.chunk_size
+         overlap = overlap or self.chunk_overlap
+         result = []
+         for i in range(0, len(text), chunk_size - overlap):
+             end = min(i + chunk_size, len(text))
+             result.append(text[i:end])
+             if end == len(text):
+                 break
+
+         return result
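
A short usage sketch (not part of the diff) for the recursive splitter; the sample text and sizes are arbitrary. Since this version's chunking/__init__.py only re-exports BaseChunker and FixedSizeChunker, the class is imported from its own module here:

    import asyncio

    from astrbot.core.knowledge_base.chunking.recursive import RecursiveCharacterChunker


    async def main():
        chunker = RecursiveCharacterChunker(chunk_size=80, chunk_overlap=20)
        text = (
            "AstrBot stores knowledge base documents as chunks.\n\n"
            "Each chunk is embedded and indexed in FAISS. "
            "Long paragraphs are split at paragraph, sentence, and word boundaries before "
            "falling back to raw characters."
        )
        # Splits first on "\n\n", then recursively on finer separators for oversized parts
        for i, chunk in enumerate(await chunker.chunk(text)):
            print(i, repr(chunk))


    asyncio.run(main())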