AstrBot 4.5.1__py3-none-any.whl → 4.5.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- astrbot/api/__init__.py +10 -11
- astrbot/api/event/__init__.py +5 -6
- astrbot/api/event/filter/__init__.py +37 -36
- astrbot/api/platform/__init__.py +7 -8
- astrbot/api/provider/__init__.py +7 -7
- astrbot/api/star/__init__.py +3 -4
- astrbot/api/util/__init__.py +2 -2
- astrbot/cli/__main__.py +5 -5
- astrbot/cli/commands/__init__.py +3 -3
- astrbot/cli/commands/cmd_conf.py +19 -16
- astrbot/cli/commands/cmd_init.py +3 -2
- astrbot/cli/commands/cmd_plug.py +8 -10
- astrbot/cli/commands/cmd_run.py +5 -6
- astrbot/cli/utils/__init__.py +6 -6
- astrbot/cli/utils/basic.py +14 -14
- astrbot/cli/utils/plugin.py +24 -15
- astrbot/cli/utils/version_comparator.py +10 -12
- astrbot/core/__init__.py +8 -6
- astrbot/core/agent/agent.py +3 -2
- astrbot/core/agent/handoff.py +6 -2
- astrbot/core/agent/hooks.py +9 -6
- astrbot/core/agent/mcp_client.py +50 -15
- astrbot/core/agent/message.py +168 -0
- astrbot/core/agent/response.py +2 -1
- astrbot/core/agent/run_context.py +2 -3
- astrbot/core/agent/runners/base.py +10 -13
- astrbot/core/agent/runners/tool_loop_agent_runner.py +52 -51
- astrbot/core/agent/tool.py +60 -41
- astrbot/core/agent/tool_executor.py +9 -3
- astrbot/core/astr_agent_context.py +3 -1
- astrbot/core/astrbot_config_mgr.py +29 -9
- astrbot/core/config/__init__.py +2 -2
- astrbot/core/config/astrbot_config.py +28 -26
- astrbot/core/config/default.py +4 -6
- astrbot/core/conversation_mgr.py +105 -36
- astrbot/core/core_lifecycle.py +68 -54
- astrbot/core/db/__init__.py +33 -18
- astrbot/core/db/migration/helper.py +12 -10
- astrbot/core/db/migration/migra_3_to_4.py +53 -34
- astrbot/core/db/migration/migra_45_to_46.py +1 -1
- astrbot/core/db/migration/shared_preferences_v3.py +2 -1
- astrbot/core/db/migration/sqlite_v3.py +26 -23
- astrbot/core/db/po.py +27 -18
- astrbot/core/db/sqlite.py +74 -45
- astrbot/core/db/vec_db/base.py +10 -14
- astrbot/core/db/vec_db/faiss_impl/document_storage.py +90 -77
- astrbot/core/db/vec_db/faiss_impl/embedding_storage.py +9 -3
- astrbot/core/db/vec_db/faiss_impl/vec_db.py +36 -31
- astrbot/core/event_bus.py +8 -6
- astrbot/core/file_token_service.py +6 -5
- astrbot/core/initial_loader.py +7 -5
- astrbot/core/knowledge_base/chunking/__init__.py +1 -3
- astrbot/core/knowledge_base/chunking/base.py +1 -0
- astrbot/core/knowledge_base/chunking/fixed_size.py +2 -0
- astrbot/core/knowledge_base/chunking/recursive.py +16 -10
- astrbot/core/knowledge_base/kb_db_sqlite.py +50 -48
- astrbot/core/knowledge_base/kb_helper.py +30 -17
- astrbot/core/knowledge_base/kb_mgr.py +6 -7
- astrbot/core/knowledge_base/models.py +10 -4
- astrbot/core/knowledge_base/parsers/__init__.py +3 -5
- astrbot/core/knowledge_base/parsers/base.py +1 -0
- astrbot/core/knowledge_base/parsers/markitdown_parser.py +2 -1
- astrbot/core/knowledge_base/parsers/pdf_parser.py +2 -1
- astrbot/core/knowledge_base/parsers/text_parser.py +1 -0
- astrbot/core/knowledge_base/parsers/util.py +1 -1
- astrbot/core/knowledge_base/retrieval/__init__.py +6 -8
- astrbot/core/knowledge_base/retrieval/manager.py +17 -14
- astrbot/core/knowledge_base/retrieval/rank_fusion.py +7 -3
- astrbot/core/knowledge_base/retrieval/sparse_retriever.py +11 -5
- astrbot/core/log.py +21 -13
- astrbot/core/message/components.py +123 -217
- astrbot/core/message/message_event_result.py +24 -24
- astrbot/core/persona_mgr.py +20 -11
- astrbot/core/pipeline/__init__.py +7 -7
- astrbot/core/pipeline/content_safety_check/stage.py +13 -9
- astrbot/core/pipeline/content_safety_check/strategies/__init__.py +1 -2
- astrbot/core/pipeline/content_safety_check/strategies/baidu_aip.py +12 -13
- astrbot/core/pipeline/content_safety_check/strategies/keywords.py +1 -0
- astrbot/core/pipeline/content_safety_check/strategies/strategy.py +6 -6
- astrbot/core/pipeline/context.py +4 -1
- astrbot/core/pipeline/context_utils.py +77 -7
- astrbot/core/pipeline/preprocess_stage/stage.py +12 -9
- astrbot/core/pipeline/process_stage/method/llm_request.py +125 -72
- astrbot/core/pipeline/process_stage/method/star_request.py +19 -17
- astrbot/core/pipeline/process_stage/stage.py +13 -10
- astrbot/core/pipeline/process_stage/utils.py +6 -5
- astrbot/core/pipeline/rate_limit_check/stage.py +37 -36
- astrbot/core/pipeline/respond/stage.py +23 -20
- astrbot/core/pipeline/result_decorate/stage.py +31 -23
- astrbot/core/pipeline/scheduler.py +12 -8
- astrbot/core/pipeline/session_status_check/stage.py +12 -8
- astrbot/core/pipeline/stage.py +10 -4
- astrbot/core/pipeline/waking_check/stage.py +24 -18
- astrbot/core/pipeline/whitelist_check/stage.py +10 -7
- astrbot/core/platform/__init__.py +6 -6
- astrbot/core/platform/astr_message_event.py +76 -110
- astrbot/core/platform/astrbot_message.py +11 -13
- astrbot/core/platform/manager.py +16 -15
- astrbot/core/platform/message_session.py +5 -3
- astrbot/core/platform/platform.py +16 -24
- astrbot/core/platform/platform_metadata.py +4 -4
- astrbot/core/platform/register.py +8 -8
- astrbot/core/platform/sources/aiocqhttp/aiocqhttp_message_event.py +23 -15
- astrbot/core/platform/sources/aiocqhttp/aiocqhttp_platform_adapter.py +51 -33
- astrbot/core/platform/sources/dingtalk/dingtalk_adapter.py +42 -27
- astrbot/core/platform/sources/dingtalk/dingtalk_event.py +7 -3
- astrbot/core/platform/sources/discord/client.py +9 -6
- astrbot/core/platform/sources/discord/components.py +18 -14
- astrbot/core/platform/sources/discord/discord_platform_adapter.py +45 -30
- astrbot/core/platform/sources/discord/discord_platform_event.py +38 -30
- astrbot/core/platform/sources/lark/lark_adapter.py +23 -17
- astrbot/core/platform/sources/lark/lark_event.py +21 -14
- astrbot/core/platform/sources/misskey/misskey_adapter.py +107 -67
- astrbot/core/platform/sources/misskey/misskey_api.py +153 -129
- astrbot/core/platform/sources/misskey/misskey_event.py +20 -15
- astrbot/core/platform/sources/misskey/misskey_utils.py +74 -62
- astrbot/core/platform/sources/qqofficial/qqofficial_message_event.py +63 -44
- astrbot/core/platform/sources/qqofficial/qqofficial_platform_adapter.py +41 -26
- astrbot/core/platform/sources/qqofficial_webhook/qo_webhook_adapter.py +36 -17
- astrbot/core/platform/sources/qqofficial_webhook/qo_webhook_event.py +3 -1
- astrbot/core/platform/sources/qqofficial_webhook/qo_webhook_server.py +12 -7
- astrbot/core/platform/sources/satori/satori_adapter.py +56 -38
- astrbot/core/platform/sources/satori/satori_event.py +34 -25
- astrbot/core/platform/sources/slack/client.py +11 -9
- astrbot/core/platform/sources/slack/slack_adapter.py +52 -36
- astrbot/core/platform/sources/slack/slack_event.py +34 -24
- astrbot/core/platform/sources/telegram/tg_adapter.py +38 -18
- astrbot/core/platform/sources/telegram/tg_event.py +32 -18
- astrbot/core/platform/sources/webchat/webchat_adapter.py +27 -17
- astrbot/core/platform/sources/webchat/webchat_event.py +14 -10
- astrbot/core/platform/sources/wechatpadpro/wechatpadpro_adapter.py +115 -120
- astrbot/core/platform/sources/wechatpadpro/wechatpadpro_message_event.py +9 -8
- astrbot/core/platform/sources/wechatpadpro/xml_data_parser.py +15 -16
- astrbot/core/platform/sources/wecom/wecom_adapter.py +35 -18
- astrbot/core/platform/sources/wecom/wecom_event.py +55 -48
- astrbot/core/platform/sources/wecom/wecom_kf.py +34 -44
- astrbot/core/platform/sources/wecom/wecom_kf_message.py +26 -10
- astrbot/core/platform/sources/wecom_ai_bot/WXBizJsonMsgCrypt.py +18 -10
- astrbot/core/platform/sources/wecom_ai_bot/__init__.py +3 -5
- astrbot/core/platform/sources/wecom_ai_bot/ierror.py +0 -1
- astrbot/core/platform/sources/wecom_ai_bot/wecomai_adapter.py +61 -37
- astrbot/core/platform/sources/wecom_ai_bot/wecomai_api.py +67 -28
- astrbot/core/platform/sources/wecom_ai_bot/wecomai_event.py +8 -9
- astrbot/core/platform/sources/wecom_ai_bot/wecomai_queue_mgr.py +18 -9
- astrbot/core/platform/sources/wecom_ai_bot/wecomai_server.py +14 -12
- astrbot/core/platform/sources/wecom_ai_bot/wecomai_utils.py +22 -12
- astrbot/core/platform/sources/weixin_official_account/weixin_offacc_adapter.py +40 -26
- astrbot/core/platform/sources/weixin_official_account/weixin_offacc_event.py +47 -45
- astrbot/core/platform_message_history_mgr.py +5 -3
- astrbot/core/provider/__init__.py +2 -3
- astrbot/core/provider/entites.py +8 -8
- astrbot/core/provider/entities.py +61 -75
- astrbot/core/provider/func_tool_manager.py +59 -55
- astrbot/core/provider/manager.py +32 -22
- astrbot/core/provider/provider.py +72 -46
- astrbot/core/provider/register.py +7 -7
- astrbot/core/provider/sources/anthropic_source.py +48 -30
- astrbot/core/provider/sources/azure_tts_source.py +17 -13
- astrbot/core/provider/sources/coze_api_client.py +27 -17
- astrbot/core/provider/sources/coze_source.py +104 -87
- astrbot/core/provider/sources/dashscope_source.py +18 -11
- astrbot/core/provider/sources/dashscope_tts.py +36 -23
- astrbot/core/provider/sources/dify_source.py +25 -20
- astrbot/core/provider/sources/edge_tts_source.py +21 -17
- astrbot/core/provider/sources/fishaudio_tts_api_source.py +22 -14
- astrbot/core/provider/sources/gemini_embedding_source.py +12 -13
- astrbot/core/provider/sources/gemini_source.py +72 -58
- astrbot/core/provider/sources/gemini_tts_source.py +8 -6
- astrbot/core/provider/sources/gsv_selfhosted_source.py +17 -14
- astrbot/core/provider/sources/gsvi_tts_source.py +11 -7
- astrbot/core/provider/sources/minimax_tts_api_source.py +50 -40
- astrbot/core/provider/sources/openai_embedding_source.py +6 -8
- astrbot/core/provider/sources/openai_source.py +77 -69
- astrbot/core/provider/sources/openai_tts_api_source.py +14 -6
- astrbot/core/provider/sources/sensevoice_selfhosted_source.py +13 -11
- astrbot/core/provider/sources/vllm_rerank_source.py +10 -4
- astrbot/core/provider/sources/volcengine_tts.py +38 -31
- astrbot/core/provider/sources/whisper_api_source.py +14 -12
- astrbot/core/provider/sources/whisper_selfhosted_source.py +15 -11
- astrbot/core/provider/sources/xinference_rerank_source.py +16 -8
- astrbot/core/provider/sources/xinference_stt_provider.py +35 -25
- astrbot/core/star/__init__.py +16 -11
- astrbot/core/star/config.py +10 -15
- astrbot/core/star/context.py +97 -75
- astrbot/core/star/filter/__init__.py +4 -3
- astrbot/core/star/filter/command.py +30 -28
- astrbot/core/star/filter/command_group.py +27 -24
- astrbot/core/star/filter/custom_filter.py +6 -5
- astrbot/core/star/filter/event_message_type.py +4 -2
- astrbot/core/star/filter/permission.py +4 -2
- astrbot/core/star/filter/platform_adapter_type.py +4 -2
- astrbot/core/star/filter/regex.py +4 -2
- astrbot/core/star/register/__init__.py +19 -19
- astrbot/core/star/register/star.py +6 -2
- astrbot/core/star/register/star_handler.py +96 -73
- astrbot/core/star/session_llm_manager.py +48 -14
- astrbot/core/star/session_plugin_manager.py +29 -15
- astrbot/core/star/star.py +1 -2
- astrbot/core/star/star_handler.py +13 -8
- astrbot/core/star/star_manager.py +151 -59
- astrbot/core/star/star_tools.py +44 -37
- astrbot/core/star/updator.py +10 -10
- astrbot/core/umop_config_router.py +10 -4
- astrbot/core/updator.py +13 -5
- astrbot/core/utils/astrbot_path.py +3 -5
- astrbot/core/utils/dify_api_client.py +33 -15
- astrbot/core/utils/io.py +66 -42
- astrbot/core/utils/log_pipe.py +1 -1
- astrbot/core/utils/metrics.py +7 -7
- astrbot/core/utils/path_util.py +15 -16
- astrbot/core/utils/pip_installer.py +5 -5
- astrbot/core/utils/session_waiter.py +19 -20
- astrbot/core/utils/shared_preferences.py +45 -20
- astrbot/core/utils/t2i/__init__.py +4 -1
- astrbot/core/utils/t2i/network_strategy.py +35 -26
- astrbot/core/utils/t2i/renderer.py +11 -5
- astrbot/core/utils/t2i/template_manager.py +14 -15
- astrbot/core/utils/tencent_record_helper.py +19 -13
- astrbot/core/utils/version_comparator.py +10 -13
- astrbot/core/zip_updator.py +43 -40
- astrbot/dashboard/routes/__init__.py +18 -18
- astrbot/dashboard/routes/auth.py +10 -8
- astrbot/dashboard/routes/chat.py +30 -21
- astrbot/dashboard/routes/config.py +92 -75
- astrbot/dashboard/routes/conversation.py +46 -39
- astrbot/dashboard/routes/file.py +4 -2
- astrbot/dashboard/routes/knowledge_base.py +47 -40
- astrbot/dashboard/routes/log.py +9 -4
- astrbot/dashboard/routes/persona.py +19 -16
- astrbot/dashboard/routes/plugin.py +69 -55
- astrbot/dashboard/routes/route.py +3 -1
- astrbot/dashboard/routes/session_management.py +130 -116
- astrbot/dashboard/routes/stat.py +34 -34
- astrbot/dashboard/routes/t2i.py +15 -12
- astrbot/dashboard/routes/tools.py +56 -53
- astrbot/dashboard/routes/update.py +32 -28
- astrbot/dashboard/server.py +30 -26
- astrbot/dashboard/utils.py +8 -4
- {astrbot-4.5.1.dist-info → astrbot-4.5.3.dist-info}/METADATA +2 -1
- astrbot-4.5.3.dist-info/RECORD +261 -0
- astrbot-4.5.1.dist-info/RECORD +0 -260
- {astrbot-4.5.1.dist-info → astrbot-4.5.3.dist-info}/WHEEL +0 -0
- {astrbot-4.5.1.dist-info → astrbot-4.5.3.dist-info}/entry_points.txt +0 -0
- {astrbot-4.5.1.dist-info → astrbot-4.5.3.dist-info}/licenses/LICENSE +0 -0
|
@@ -18,6 +18,7 @@ class FixedSizeChunker(BaseChunker):
|
|
|
18
18
|
Args:
|
|
19
19
|
chunk_size: 块的大小(字符数)
|
|
20
20
|
chunk_overlap: 块之间的重叠字符数
|
|
21
|
+
|
|
21
22
|
"""
|
|
22
23
|
self.chunk_size = chunk_size
|
|
23
24
|
self.chunk_overlap = chunk_overlap
|
|
@@ -32,6 +33,7 @@ class FixedSizeChunker(BaseChunker):
|
|
|
32
33
|
|
|
33
34
|
Returns:
|
|
34
35
|
list[str]: 分块后的文本列表
|
|
36
|
+
|
|
35
37
|
"""
|
|
36
38
|
chunk_size = kwargs.get("chunk_size", self.chunk_size)
|
|
37
39
|
chunk_overlap = kwargs.get("chunk_overlap", self.chunk_overlap)
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
from collections.abc import Callable
|
|
2
|
+
|
|
2
3
|
from .base import BaseChunker
|
|
3
4
|
|
|
4
5
|
|
|
@@ -11,8 +12,7 @@ class RecursiveCharacterChunker(BaseChunker):
|
|
|
11
12
|
is_separator_regex: bool = False,
|
|
12
13
|
separators: list[str] | None = None,
|
|
13
14
|
):
|
|
14
|
-
"""
|
|
15
|
-
初始化递归字符文本分割器
|
|
15
|
+
"""初始化递归字符文本分割器
|
|
16
16
|
|
|
17
17
|
Args:
|
|
18
18
|
chunk_size: 每个文本块的最大大小
|
|
@@ -20,6 +20,7 @@ class RecursiveCharacterChunker(BaseChunker):
|
|
|
20
20
|
length_function: 计算文本长度的函数
|
|
21
21
|
is_separator_regex: 分隔符是否为正则表达式
|
|
22
22
|
separators: 用于分割文本的分隔符列表,按优先级排序
|
|
23
|
+
|
|
23
24
|
"""
|
|
24
25
|
self.chunk_size = chunk_size
|
|
25
26
|
self.chunk_overlap = chunk_overlap
|
|
@@ -39,8 +40,7 @@ class RecursiveCharacterChunker(BaseChunker):
|
|
|
39
40
|
]
|
|
40
41
|
|
|
41
42
|
async def chunk(self, text: str, **kwargs) -> list[str]:
|
|
42
|
-
"""
|
|
43
|
-
递归地将文本分割成块
|
|
43
|
+
"""递归地将文本分割成块
|
|
44
44
|
|
|
45
45
|
Args:
|
|
46
46
|
text: 要分割的文本
|
|
@@ -49,6 +49,7 @@ class RecursiveCharacterChunker(BaseChunker):
|
|
|
49
49
|
|
|
50
50
|
Returns:
|
|
51
51
|
分割后的文本块列表
|
|
52
|
+
|
|
52
53
|
"""
|
|
53
54
|
if not text:
|
|
54
55
|
return []
|
|
@@ -90,7 +91,7 @@ class RecursiveCharacterChunker(BaseChunker):
|
|
|
90
91
|
combined_text,
|
|
91
92
|
chunk_size=chunk_size,
|
|
92
93
|
chunk_overlap=overlap,
|
|
93
|
-
)
|
|
94
|
+
),
|
|
94
95
|
)
|
|
95
96
|
current_chunk = []
|
|
96
97
|
current_chunk_length = 0
|
|
@@ -98,8 +99,10 @@ class RecursiveCharacterChunker(BaseChunker):
|
|
|
98
99
|
# 递归分割过大的部分
|
|
99
100
|
final_chunks.extend(
|
|
100
101
|
await self.chunk(
|
|
101
|
-
split,
|
|
102
|
-
|
|
102
|
+
split,
|
|
103
|
+
chunk_size=chunk_size,
|
|
104
|
+
chunk_overlap=overlap,
|
|
105
|
+
),
|
|
103
106
|
)
|
|
104
107
|
# 如果添加这部分会使当前块超过chunk_size
|
|
105
108
|
elif current_chunk_length + split_length > chunk_size:
|
|
@@ -132,16 +135,19 @@ class RecursiveCharacterChunker(BaseChunker):
|
|
|
132
135
|
return [text]
|
|
133
136
|
|
|
134
137
|
def _split_by_character(
|
|
135
|
-
self,
|
|
138
|
+
self,
|
|
139
|
+
text: str,
|
|
140
|
+
chunk_size: int | None = None,
|
|
141
|
+
overlap: int | None = None,
|
|
136
142
|
) -> list[str]:
|
|
137
|
-
"""
|
|
138
|
-
按字符级别分割文本
|
|
143
|
+
"""按字符级别分割文本
|
|
139
144
|
|
|
140
145
|
Args:
|
|
141
146
|
text: 要分割的文本
|
|
142
147
|
|
|
143
148
|
Returns:
|
|
144
149
|
分割后的文本块列表
|
|
150
|
+
|
|
145
151
|
"""
|
|
146
152
|
chunk_size = chunk_size or self.chunk_size
|
|
147
153
|
overlap = overlap or self.chunk_overlap
|
|
@@ -1,18 +1,18 @@
|
|
|
1
1
|
from contextlib import asynccontextmanager
|
|
2
2
|
from pathlib import Path
|
|
3
3
|
|
|
4
|
-
from sqlmodel import col, desc
|
|
5
|
-
from sqlalchemy import text, func, select, update, delete
|
|
4
|
+
from sqlalchemy import delete, func, select, text, update
|
|
6
5
|
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine
|
|
6
|
+
from sqlmodel import col, desc
|
|
7
7
|
|
|
8
8
|
from astrbot.core import logger
|
|
9
|
+
from astrbot.core.db.vec_db.faiss_impl import FaissVecDB
|
|
9
10
|
from astrbot.core.knowledge_base.models import (
|
|
10
11
|
BaseKBModel,
|
|
11
12
|
KBDocument,
|
|
12
13
|
KBMedia,
|
|
13
14
|
KnowledgeBase,
|
|
14
15
|
)
|
|
15
|
-
from astrbot.core.db.vec_db.faiss_impl import FaissVecDB
|
|
16
16
|
|
|
17
17
|
|
|
18
18
|
class KBSQLiteDatabase:
|
|
@@ -21,6 +21,7 @@ class KBSQLiteDatabase:
|
|
|
21
21
|
|
|
22
22
|
Args:
|
|
23
23
|
db_path: 数据库文件路径, 默认为 data/knowledge_base/kb.db
|
|
24
|
+
|
|
24
25
|
"""
|
|
25
26
|
self.db_path = db_path
|
|
26
27
|
self.DATABASE_URL = f"sqlite+aiosqlite:///{db_path}"
|
|
@@ -85,77 +86,77 @@ class KBSQLiteDatabase:
|
|
|
85
86
|
await session.execute(
|
|
86
87
|
text(
|
|
87
88
|
"CREATE INDEX IF NOT EXISTS idx_kb_kb_id "
|
|
88
|
-
"ON knowledge_bases(kb_id)"
|
|
89
|
-
)
|
|
89
|
+
"ON knowledge_bases(kb_id)",
|
|
90
|
+
),
|
|
90
91
|
)
|
|
91
92
|
await session.execute(
|
|
92
93
|
text(
|
|
93
94
|
"CREATE INDEX IF NOT EXISTS idx_kb_name "
|
|
94
|
-
"ON knowledge_bases(kb_name)"
|
|
95
|
-
)
|
|
95
|
+
"ON knowledge_bases(kb_name)",
|
|
96
|
+
),
|
|
96
97
|
)
|
|
97
98
|
await session.execute(
|
|
98
99
|
text(
|
|
99
100
|
"CREATE INDEX IF NOT EXISTS idx_kb_created_at "
|
|
100
|
-
"ON knowledge_bases(created_at)"
|
|
101
|
-
)
|
|
101
|
+
"ON knowledge_bases(created_at)",
|
|
102
|
+
),
|
|
102
103
|
)
|
|
103
104
|
|
|
104
105
|
# 创建文档表索引
|
|
105
106
|
await session.execute(
|
|
106
107
|
text(
|
|
107
108
|
"CREATE INDEX IF NOT EXISTS idx_doc_doc_id "
|
|
108
|
-
"ON kb_documents(doc_id)"
|
|
109
|
-
)
|
|
109
|
+
"ON kb_documents(doc_id)",
|
|
110
|
+
),
|
|
110
111
|
)
|
|
111
112
|
await session.execute(
|
|
112
113
|
text(
|
|
113
114
|
"CREATE INDEX IF NOT EXISTS idx_doc_kb_id "
|
|
114
|
-
"ON kb_documents(kb_id)"
|
|
115
|
-
)
|
|
115
|
+
"ON kb_documents(kb_id)",
|
|
116
|
+
),
|
|
116
117
|
)
|
|
117
118
|
await session.execute(
|
|
118
119
|
text(
|
|
119
120
|
"CREATE INDEX IF NOT EXISTS idx_doc_name "
|
|
120
|
-
"ON kb_documents(doc_name)"
|
|
121
|
-
)
|
|
121
|
+
"ON kb_documents(doc_name)",
|
|
122
|
+
),
|
|
122
123
|
)
|
|
123
124
|
await session.execute(
|
|
124
125
|
text(
|
|
125
126
|
"CREATE INDEX IF NOT EXISTS idx_doc_type "
|
|
126
|
-
"ON kb_documents(file_type)"
|
|
127
|
-
)
|
|
127
|
+
"ON kb_documents(file_type)",
|
|
128
|
+
),
|
|
128
129
|
)
|
|
129
130
|
await session.execute(
|
|
130
131
|
text(
|
|
131
132
|
"CREATE INDEX IF NOT EXISTS idx_doc_created_at "
|
|
132
|
-
"ON kb_documents(created_at)"
|
|
133
|
-
)
|
|
133
|
+
"ON kb_documents(created_at)",
|
|
134
|
+
),
|
|
134
135
|
)
|
|
135
136
|
|
|
136
137
|
# 创建多媒体表索引
|
|
137
138
|
await session.execute(
|
|
138
139
|
text(
|
|
139
140
|
"CREATE INDEX IF NOT EXISTS idx_media_media_id "
|
|
140
|
-
"ON kb_media(media_id)"
|
|
141
|
-
)
|
|
141
|
+
"ON kb_media(media_id)",
|
|
142
|
+
),
|
|
142
143
|
)
|
|
143
144
|
await session.execute(
|
|
144
145
|
text(
|
|
145
146
|
"CREATE INDEX IF NOT EXISTS idx_media_doc_id "
|
|
146
|
-
"ON kb_media(doc_id)"
|
|
147
|
-
)
|
|
147
|
+
"ON kb_media(doc_id)",
|
|
148
|
+
),
|
|
148
149
|
)
|
|
149
150
|
await session.execute(
|
|
150
151
|
text(
|
|
151
|
-
"CREATE INDEX IF NOT EXISTS idx_media_kb_id ON kb_media(kb_id)"
|
|
152
|
-
)
|
|
152
|
+
"CREATE INDEX IF NOT EXISTS idx_media_kb_id ON kb_media(kb_id)",
|
|
153
|
+
),
|
|
153
154
|
)
|
|
154
155
|
await session.execute(
|
|
155
156
|
text(
|
|
156
157
|
"CREATE INDEX IF NOT EXISTS idx_media_type "
|
|
157
|
-
"ON kb_media(media_type)"
|
|
158
|
-
)
|
|
158
|
+
"ON kb_media(media_type)",
|
|
159
|
+
),
|
|
159
160
|
)
|
|
160
161
|
|
|
161
162
|
await session.commit()
|
|
@@ -208,7 +209,10 @@ class KBSQLiteDatabase:
|
|
|
208
209
|
return result.scalar_one_or_none()
|
|
209
210
|
|
|
210
211
|
async def list_documents_by_kb(
|
|
211
|
-
self,
|
|
212
|
+
self,
|
|
213
|
+
kb_id: str,
|
|
214
|
+
offset: int = 0,
|
|
215
|
+
limit: int = 100,
|
|
212
216
|
) -> list[KBDocument]:
|
|
213
217
|
"""列出知识库的所有文档"""
|
|
214
218
|
async with self.get_db() as session:
|
|
@@ -226,7 +230,7 @@ class KBSQLiteDatabase:
|
|
|
226
230
|
"""统计知识库的文档数量"""
|
|
227
231
|
async with self.get_db() as session:
|
|
228
232
|
stmt = select(func.count(col(KBDocument.id))).where(
|
|
229
|
-
col(KBDocument.kb_id) == kb_id
|
|
233
|
+
col(KBDocument.kb_id) == kb_id,
|
|
230
234
|
)
|
|
231
235
|
result = await session.execute(stmt)
|
|
232
236
|
return result.scalar() or 0
|
|
@@ -252,12 +256,11 @@ class KBSQLiteDatabase:
|
|
|
252
256
|
async def delete_document_by_id(self, doc_id: str, vec_db: FaissVecDB):
|
|
253
257
|
"""删除单个文档及其相关数据"""
|
|
254
258
|
# 在知识库表中删除
|
|
255
|
-
async with self.get_db() as session:
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
await session.commit()
|
|
259
|
+
async with self.get_db() as session, session.begin():
|
|
260
|
+
# 删除文档记录
|
|
261
|
+
delete_stmt = delete(KBDocument).where(col(KBDocument.doc_id) == doc_id)
|
|
262
|
+
await session.execute(delete_stmt)
|
|
263
|
+
await session.commit()
|
|
261
264
|
|
|
262
265
|
# 在 vec db 中删除相关向量
|
|
263
266
|
await vec_db.delete_documents(metadata_filters={"kb_doc_id": doc_id})
|
|
@@ -282,18 +285,17 @@ class KBSQLiteDatabase:
|
|
|
282
285
|
"""更新知识库统计信息"""
|
|
283
286
|
chunk_cnt = await vec_db.count_documents()
|
|
284
287
|
|
|
285
|
-
async with self.get_db() as session:
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
.
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
chunk_count=chunk_cnt,
|
|
295
|
-
)
|
|
288
|
+
async with self.get_db() as session, session.begin():
|
|
289
|
+
update_stmt = (
|
|
290
|
+
update(KnowledgeBase)
|
|
291
|
+
.where(col(KnowledgeBase.kb_id) == kb_id)
|
|
292
|
+
.values(
|
|
293
|
+
doc_count=select(func.count(col(KBDocument.id)))
|
|
294
|
+
.where(col(KBDocument.kb_id) == kb_id)
|
|
295
|
+
.scalar_subquery(),
|
|
296
|
+
chunk_count=chunk_cnt,
|
|
296
297
|
)
|
|
298
|
+
)
|
|
297
299
|
|
|
298
|
-
|
|
299
|
-
|
|
300
|
+
await session.execute(update_stmt)
|
|
301
|
+
await session.commit()
|
|
@@ -1,16 +1,19 @@
|
|
|
1
|
-
import uuid
|
|
2
|
-
import aiofiles
|
|
3
1
|
import json
|
|
2
|
+
import uuid
|
|
4
3
|
from pathlib import Path
|
|
5
|
-
|
|
6
|
-
|
|
4
|
+
|
|
5
|
+
import aiofiles
|
|
6
|
+
|
|
7
|
+
from astrbot.core import logger
|
|
7
8
|
from astrbot.core.db.vec_db.base import BaseVecDB
|
|
8
9
|
from astrbot.core.db.vec_db.faiss_impl.vec_db import FaissVecDB
|
|
9
|
-
from astrbot.core.provider.provider import EmbeddingProvider, RerankProvider
|
|
10
10
|
from astrbot.core.provider.manager import ProviderManager
|
|
11
|
-
from .
|
|
11
|
+
from astrbot.core.provider.provider import EmbeddingProvider, RerankProvider
|
|
12
|
+
|
|
12
13
|
from .chunking.base import BaseChunker
|
|
13
|
-
from
|
|
14
|
+
from .kb_db_sqlite import KBSQLiteDatabase
|
|
15
|
+
from .models import KBDocument, KBMedia, KnowledgeBase
|
|
16
|
+
from .parsers.util import select_parser
|
|
14
17
|
|
|
15
18
|
|
|
16
19
|
class KBHelper:
|
|
@@ -45,11 +48,11 @@ class KBHelper:
|
|
|
45
48
|
if not self.kb.embedding_provider_id:
|
|
46
49
|
raise ValueError(f"知识库 {self.kb.kb_name} 未配置 Embedding Provider")
|
|
47
50
|
ep: EmbeddingProvider = await self.prov_mgr.get_provider_by_id(
|
|
48
|
-
self.kb.embedding_provider_id
|
|
51
|
+
self.kb.embedding_provider_id,
|
|
49
52
|
) # type: ignore
|
|
50
53
|
if not ep:
|
|
51
54
|
raise ValueError(
|
|
52
|
-
f"无法找到 ID 为 {self.kb.embedding_provider_id} 的 Embedding Provider"
|
|
55
|
+
f"无法找到 ID 为 {self.kb.embedding_provider_id} 的 Embedding Provider",
|
|
53
56
|
)
|
|
54
57
|
return ep
|
|
55
58
|
|
|
@@ -57,11 +60,11 @@ class KBHelper:
|
|
|
57
60
|
if not self.kb.rerank_provider_id:
|
|
58
61
|
return None
|
|
59
62
|
rp: RerankProvider = await self.prov_mgr.get_provider_by_id(
|
|
60
|
-
self.kb.rerank_provider_id
|
|
63
|
+
self.kb.rerank_provider_id,
|
|
61
64
|
) # type: ignore
|
|
62
65
|
if not rp:
|
|
63
66
|
raise ValueError(
|
|
64
|
-
f"无法找到 ID 为 {self.kb.rerank_provider_id} 的 Rerank Provider"
|
|
67
|
+
f"无法找到 ID 为 {self.kb.rerank_provider_id} 的 Rerank Provider",
|
|
65
68
|
)
|
|
66
69
|
return rp
|
|
67
70
|
|
|
@@ -122,6 +125,7 @@ class KBHelper:
|
|
|
122
125
|
- stage: 当前阶段 ('parsing', 'chunking', 'embedding')
|
|
123
126
|
- current: 当前进度
|
|
124
127
|
- total: 总数
|
|
128
|
+
|
|
125
129
|
"""
|
|
126
130
|
await self._ensure_vec_db()
|
|
127
131
|
doc_id = str(uuid.uuid4())
|
|
@@ -162,7 +166,9 @@ class KBHelper:
|
|
|
162
166
|
await progress_callback("chunking", 0, 100)
|
|
163
167
|
|
|
164
168
|
chunks_text = await self.chunker.chunk(
|
|
165
|
-
text_content,
|
|
169
|
+
text_content,
|
|
170
|
+
chunk_size=chunk_size,
|
|
171
|
+
chunk_overlap=chunk_overlap,
|
|
166
172
|
)
|
|
167
173
|
contents = []
|
|
168
174
|
metadatas = []
|
|
@@ -173,7 +179,7 @@ class KBHelper:
|
|
|
173
179
|
"kb_id": self.kb.kb_id,
|
|
174
180
|
"kb_doc_id": doc_id,
|
|
175
181
|
"chunk_index": idx,
|
|
176
|
-
}
|
|
182
|
+
},
|
|
177
183
|
)
|
|
178
184
|
|
|
179
185
|
if progress_callback:
|
|
@@ -234,7 +240,9 @@ class KBHelper:
|
|
|
234
240
|
raise e
|
|
235
241
|
|
|
236
242
|
async def list_documents(
|
|
237
|
-
self,
|
|
243
|
+
self,
|
|
244
|
+
offset: int = 0,
|
|
245
|
+
limit: int = 100,
|
|
238
246
|
) -> list[KBDocument]:
|
|
239
247
|
"""列出知识库的所有文档"""
|
|
240
248
|
docs = await self.kb_db.list_documents_by_kb(self.kb.kb_id, offset, limit)
|
|
@@ -288,12 +296,17 @@ class KBHelper:
|
|
|
288
296
|
await session.refresh(doc)
|
|
289
297
|
|
|
290
298
|
async def get_chunks_by_doc_id(
|
|
291
|
-
self,
|
|
299
|
+
self,
|
|
300
|
+
doc_id: str,
|
|
301
|
+
offset: int = 0,
|
|
302
|
+
limit: int = 100,
|
|
292
303
|
) -> list[dict]:
|
|
293
304
|
"""获取文档的所有块及其元数据"""
|
|
294
305
|
vec_db: FaissVecDB = self.vec_db # type: ignore
|
|
295
306
|
chunks = await vec_db.document_storage.get_documents(
|
|
296
|
-
metadata_filters={"kb_doc_id": doc_id},
|
|
307
|
+
metadata_filters={"kb_doc_id": doc_id},
|
|
308
|
+
offset=offset,
|
|
309
|
+
limit=limit,
|
|
297
310
|
)
|
|
298
311
|
result = []
|
|
299
312
|
for chunk in chunks:
|
|
@@ -306,7 +319,7 @@ class KBHelper:
|
|
|
306
319
|
"chunk_index": chunk_md["chunk_index"],
|
|
307
320
|
"content": chunk["text"],
|
|
308
321
|
"char_count": len(chunk["text"]),
|
|
309
|
-
}
|
|
322
|
+
},
|
|
310
323
|
)
|
|
311
324
|
return result
|
|
312
325
|
|
|
@@ -1,19 +1,17 @@
|
|
|
1
1
|
import traceback
|
|
2
2
|
from pathlib import Path
|
|
3
|
+
|
|
3
4
|
from astrbot.core import logger
|
|
4
5
|
from astrbot.core.provider.manager import ProviderManager
|
|
5
6
|
|
|
6
|
-
from .retrieval.manager import RetrievalManager, RetrievalResult
|
|
7
|
-
from .retrieval.sparse_retriever import SparseRetriever
|
|
8
|
-
from .retrieval.rank_fusion import RankFusion
|
|
9
|
-
from .kb_db_sqlite import KBSQLiteDatabase
|
|
10
|
-
|
|
11
7
|
# from .chunking.fixed_size import FixedSizeChunker
|
|
12
8
|
from .chunking.recursive import RecursiveCharacterChunker
|
|
9
|
+
from .kb_db_sqlite import KBSQLiteDatabase
|
|
13
10
|
from .kb_helper import KBHelper
|
|
14
|
-
|
|
15
11
|
from .models import KnowledgeBase
|
|
16
|
-
|
|
12
|
+
from .retrieval.manager import RetrievalManager, RetrievalResult
|
|
13
|
+
from .retrieval.rank_fusion import RankFusion
|
|
14
|
+
from .retrieval.sparse_retriever import SparseRetriever
|
|
17
15
|
|
|
18
16
|
FILES_PATH = "data/knowledge_base"
|
|
19
17
|
DB_PATH = Path(FILES_PATH) / "kb.db"
|
|
@@ -257,6 +255,7 @@ class KnowledgeBaseManager:
|
|
|
257
255
|
|
|
258
256
|
Returns:
|
|
259
257
|
str: 格式化的上下文文本
|
|
258
|
+
|
|
260
259
|
"""
|
|
261
260
|
lines = ["以下是相关的知识库内容,请参考这些信息回答用户的问题:\n"]
|
|
262
261
|
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import uuid
|
|
2
2
|
from datetime import datetime, timezone
|
|
3
3
|
|
|
4
|
-
from sqlmodel import Field, SQLModel, Text, UniqueConstraint
|
|
4
|
+
from sqlmodel import Field, MetaData, SQLModel, Text, UniqueConstraint
|
|
5
5
|
|
|
6
6
|
|
|
7
7
|
class BaseKBModel(SQLModel, table=False):
|
|
@@ -17,7 +17,9 @@ class KnowledgeBase(BaseKBModel, table=True):
|
|
|
17
17
|
__tablename__ = "knowledge_bases" # type: ignore
|
|
18
18
|
|
|
19
19
|
id: int | None = Field(
|
|
20
|
-
primary_key=True,
|
|
20
|
+
primary_key=True,
|
|
21
|
+
sa_column_kwargs={"autoincrement": True},
|
|
22
|
+
default=None,
|
|
21
23
|
)
|
|
22
24
|
kb_id: str = Field(
|
|
23
25
|
max_length=36,
|
|
@@ -63,7 +65,9 @@ class KBDocument(BaseKBModel, table=True):
|
|
|
63
65
|
__tablename__ = "kb_documents" # type: ignore
|
|
64
66
|
|
|
65
67
|
id: int | None = Field(
|
|
66
|
-
primary_key=True,
|
|
68
|
+
primary_key=True,
|
|
69
|
+
sa_column_kwargs={"autoincrement": True},
|
|
70
|
+
default=None,
|
|
67
71
|
)
|
|
68
72
|
doc_id: str = Field(
|
|
69
73
|
max_length=36,
|
|
@@ -95,7 +99,9 @@ class KBMedia(BaseKBModel, table=True):
|
|
|
95
99
|
__tablename__ = "kb_media" # type: ignore
|
|
96
100
|
|
|
97
101
|
id: int | None = Field(
|
|
98
|
-
primary_key=True,
|
|
102
|
+
primary_key=True,
|
|
103
|
+
sa_column_kwargs={"autoincrement": True},
|
|
104
|
+
default=None,
|
|
99
105
|
)
|
|
100
106
|
media_id: str = Field(
|
|
101
107
|
max_length=36,
|
|
@@ -1,15 +1,13 @@
|
|
|
1
|
-
"""
|
|
2
|
-
文档解析器模块
|
|
3
|
-
"""
|
|
1
|
+
"""文档解析器模块"""
|
|
4
2
|
|
|
5
3
|
from .base import BaseParser, MediaItem, ParseResult
|
|
6
|
-
from .text_parser import TextParser
|
|
7
4
|
from .pdf_parser import PDFParser
|
|
5
|
+
from .text_parser import TextParser
|
|
8
6
|
|
|
9
7
|
__all__ = [
|
|
10
8
|
"BaseParser",
|
|
11
9
|
"MediaItem",
|
|
10
|
+
"PDFParser",
|
|
12
11
|
"ParseResult",
|
|
13
12
|
"TextParser",
|
|
14
|
-
"PDFParser",
|
|
15
13
|
]
|
|
@@ -1,11 +1,12 @@
|
|
|
1
1
|
import io
|
|
2
2
|
import os
|
|
3
3
|
|
|
4
|
+
from markitdown_no_magika import MarkItDown, StreamInfo
|
|
5
|
+
|
|
4
6
|
from astrbot.core.knowledge_base.parsers.base import (
|
|
5
7
|
BaseParser,
|
|
6
8
|
ParseResult,
|
|
7
9
|
)
|
|
8
|
-
from markitdown_no_magika import MarkItDown, StreamInfo
|
|
9
10
|
|
|
10
11
|
|
|
11
12
|
class MarkitdownParser(BaseParser):
|
|
@@ -29,6 +29,7 @@ class PDFParser(BaseParser):
|
|
|
29
29
|
|
|
30
30
|
Returns:
|
|
31
31
|
ParseResult: 包含文本和图片的解析结果
|
|
32
|
+
|
|
32
33
|
"""
|
|
33
34
|
pdf_file = io.BytesIO(file_content)
|
|
34
35
|
reader = PdfReader(pdf_file)
|
|
@@ -87,7 +88,7 @@ class PDFParser(BaseParser):
|
|
|
87
88
|
file_name=f"page_{page_num}_img_{image_counter}.{ext}",
|
|
88
89
|
content=image_data,
|
|
89
90
|
mime_type=mime_type,
|
|
90
|
-
)
|
|
91
|
+
),
|
|
91
92
|
)
|
|
92
93
|
except Exception:
|
|
93
94
|
# 单个图片提取失败不影响整体
|
|
@@ -1,16 +1,14 @@
|
|
|
1
|
-
"""
|
|
2
|
-
检索模块
|
|
3
|
-
"""
|
|
1
|
+
"""检索模块"""
|
|
4
2
|
|
|
5
3
|
from .manager import RetrievalManager, RetrievalResult
|
|
6
|
-
from .sparse_retriever import SparseRetriever, SparseResult
|
|
7
|
-
from .rank_fusion import RankFusion, FusedResult
|
|
4
|
+
from .rank_fusion import FusedResult, RankFusion
|
|
5
|
+
from .sparse_retriever import SparseResult, SparseRetriever
|
|
8
6
|
|
|
9
7
|
__all__ = [
|
|
8
|
+
"FusedResult",
|
|
9
|
+
"RankFusion",
|
|
10
10
|
"RetrievalManager",
|
|
11
11
|
"RetrievalResult",
|
|
12
|
-
"SparseRetriever",
|
|
13
12
|
"SparseResult",
|
|
14
|
-
"RankFusion",
|
|
15
|
-
"FusedResult",
|
|
13
|
+
"SparseRetriever",
|
|
16
14
|
]
|