AstrBot 3.5.6__py3-none-any.whl → 4.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- astrbot/api/__init__.py +16 -4
- astrbot/api/all.py +2 -1
- astrbot/api/event/__init__.py +5 -6
- astrbot/api/event/filter/__init__.py +37 -34
- astrbot/api/platform/__init__.py +7 -8
- astrbot/api/provider/__init__.py +8 -7
- astrbot/api/star/__init__.py +3 -4
- astrbot/api/util/__init__.py +2 -2
- astrbot/cli/__init__.py +1 -0
- astrbot/cli/__main__.py +18 -197
- astrbot/cli/commands/__init__.py +6 -0
- astrbot/cli/commands/cmd_conf.py +209 -0
- astrbot/cli/commands/cmd_init.py +56 -0
- astrbot/cli/commands/cmd_plug.py +245 -0
- astrbot/cli/commands/cmd_run.py +62 -0
- astrbot/cli/utils/__init__.py +18 -0
- astrbot/cli/utils/basic.py +76 -0
- astrbot/cli/utils/plugin.py +246 -0
- astrbot/cli/utils/version_comparator.py +90 -0
- astrbot/core/__init__.py +17 -19
- astrbot/core/agent/agent.py +14 -0
- astrbot/core/agent/handoff.py +38 -0
- astrbot/core/agent/hooks.py +30 -0
- astrbot/core/agent/mcp_client.py +385 -0
- astrbot/core/agent/message.py +175 -0
- astrbot/core/agent/response.py +14 -0
- astrbot/core/agent/run_context.py +22 -0
- astrbot/core/agent/runners/__init__.py +3 -0
- astrbot/core/agent/runners/base.py +65 -0
- astrbot/core/agent/runners/coze/coze_agent_runner.py +367 -0
- astrbot/core/agent/runners/coze/coze_api_client.py +324 -0
- astrbot/core/agent/runners/dashscope/dashscope_agent_runner.py +403 -0
- astrbot/core/agent/runners/dify/dify_agent_runner.py +336 -0
- astrbot/core/agent/runners/dify/dify_api_client.py +195 -0
- astrbot/core/agent/runners/tool_loop_agent_runner.py +400 -0
- astrbot/core/agent/tool.py +285 -0
- astrbot/core/agent/tool_executor.py +17 -0
- astrbot/core/astr_agent_context.py +19 -0
- astrbot/core/astr_agent_hooks.py +36 -0
- astrbot/core/astr_agent_run_util.py +80 -0
- astrbot/core/astr_agent_tool_exec.py +246 -0
- astrbot/core/astrbot_config_mgr.py +275 -0
- astrbot/core/config/__init__.py +2 -2
- astrbot/core/config/astrbot_config.py +60 -20
- astrbot/core/config/default.py +1972 -453
- astrbot/core/config/i18n_utils.py +110 -0
- astrbot/core/conversation_mgr.py +285 -75
- astrbot/core/core_lifecycle.py +167 -62
- astrbot/core/db/__init__.py +305 -102
- astrbot/core/db/migration/helper.py +69 -0
- astrbot/core/db/migration/migra_3_to_4.py +357 -0
- astrbot/core/db/migration/migra_45_to_46.py +44 -0
- astrbot/core/db/migration/migra_webchat_session.py +131 -0
- astrbot/core/db/migration/shared_preferences_v3.py +48 -0
- astrbot/core/db/migration/sqlite_v3.py +497 -0
- astrbot/core/db/po.py +259 -55
- astrbot/core/db/sqlite.py +773 -528
- astrbot/core/db/vec_db/base.py +73 -0
- astrbot/core/db/vec_db/faiss_impl/__init__.py +3 -0
- astrbot/core/db/vec_db/faiss_impl/document_storage.py +392 -0
- astrbot/core/db/vec_db/faiss_impl/embedding_storage.py +93 -0
- astrbot/core/db/vec_db/faiss_impl/sqlite_init.sql +17 -0
- astrbot/core/db/vec_db/faiss_impl/vec_db.py +204 -0
- astrbot/core/event_bus.py +26 -22
- astrbot/core/exceptions.py +9 -0
- astrbot/core/file_token_service.py +98 -0
- astrbot/core/initial_loader.py +19 -10
- astrbot/core/knowledge_base/chunking/__init__.py +9 -0
- astrbot/core/knowledge_base/chunking/base.py +25 -0
- astrbot/core/knowledge_base/chunking/fixed_size.py +59 -0
- astrbot/core/knowledge_base/chunking/recursive.py +161 -0
- astrbot/core/knowledge_base/kb_db_sqlite.py +301 -0
- astrbot/core/knowledge_base/kb_helper.py +642 -0
- astrbot/core/knowledge_base/kb_mgr.py +330 -0
- astrbot/core/knowledge_base/models.py +120 -0
- astrbot/core/knowledge_base/parsers/__init__.py +13 -0
- astrbot/core/knowledge_base/parsers/base.py +51 -0
- astrbot/core/knowledge_base/parsers/markitdown_parser.py +26 -0
- astrbot/core/knowledge_base/parsers/pdf_parser.py +101 -0
- astrbot/core/knowledge_base/parsers/text_parser.py +42 -0
- astrbot/core/knowledge_base/parsers/url_parser.py +103 -0
- astrbot/core/knowledge_base/parsers/util.py +13 -0
- astrbot/core/knowledge_base/prompts.py +65 -0
- astrbot/core/knowledge_base/retrieval/__init__.py +14 -0
- astrbot/core/knowledge_base/retrieval/hit_stopwords.txt +767 -0
- astrbot/core/knowledge_base/retrieval/manager.py +276 -0
- astrbot/core/knowledge_base/retrieval/rank_fusion.py +142 -0
- astrbot/core/knowledge_base/retrieval/sparse_retriever.py +136 -0
- astrbot/core/log.py +21 -15
- astrbot/core/message/components.py +413 -287
- astrbot/core/message/message_event_result.py +35 -24
- astrbot/core/persona_mgr.py +192 -0
- astrbot/core/pipeline/__init__.py +14 -14
- astrbot/core/pipeline/content_safety_check/stage.py +13 -9
- astrbot/core/pipeline/content_safety_check/strategies/__init__.py +1 -2
- astrbot/core/pipeline/content_safety_check/strategies/baidu_aip.py +13 -14
- astrbot/core/pipeline/content_safety_check/strategies/keywords.py +2 -1
- astrbot/core/pipeline/content_safety_check/strategies/strategy.py +6 -6
- astrbot/core/pipeline/context.py +7 -1
- astrbot/core/pipeline/context_utils.py +107 -0
- astrbot/core/pipeline/preprocess_stage/stage.py +63 -36
- astrbot/core/pipeline/process_stage/method/agent_request.py +48 -0
- astrbot/core/pipeline/process_stage/method/agent_sub_stages/internal.py +464 -0
- astrbot/core/pipeline/process_stage/method/agent_sub_stages/third_party.py +202 -0
- astrbot/core/pipeline/process_stage/method/star_request.py +26 -32
- astrbot/core/pipeline/process_stage/stage.py +21 -15
- astrbot/core/pipeline/process_stage/utils.py +125 -0
- astrbot/core/pipeline/rate_limit_check/stage.py +34 -36
- astrbot/core/pipeline/respond/stage.py +142 -101
- astrbot/core/pipeline/result_decorate/stage.py +124 -57
- astrbot/core/pipeline/scheduler.py +21 -16
- astrbot/core/pipeline/session_status_check/stage.py +37 -0
- astrbot/core/pipeline/stage.py +11 -76
- astrbot/core/pipeline/waking_check/stage.py +69 -33
- astrbot/core/pipeline/whitelist_check/stage.py +10 -7
- astrbot/core/platform/__init__.py +6 -6
- astrbot/core/platform/astr_message_event.py +107 -129
- astrbot/core/platform/astrbot_message.py +32 -12
- astrbot/core/platform/manager.py +62 -18
- astrbot/core/platform/message_session.py +30 -0
- astrbot/core/platform/platform.py +16 -24
- astrbot/core/platform/platform_metadata.py +9 -4
- astrbot/core/platform/register.py +12 -7
- astrbot/core/platform/sources/aiocqhttp/aiocqhttp_message_event.py +136 -60
- astrbot/core/platform/sources/aiocqhttp/aiocqhttp_platform_adapter.py +126 -46
- astrbot/core/platform/sources/dingtalk/dingtalk_adapter.py +63 -31
- astrbot/core/platform/sources/dingtalk/dingtalk_event.py +30 -26
- astrbot/core/platform/sources/discord/client.py +129 -0
- astrbot/core/platform/sources/discord/components.py +139 -0
- astrbot/core/platform/sources/discord/discord_platform_adapter.py +473 -0
- astrbot/core/platform/sources/discord/discord_platform_event.py +313 -0
- astrbot/core/platform/sources/lark/lark_adapter.py +27 -18
- astrbot/core/platform/sources/lark/lark_event.py +39 -13
- astrbot/core/platform/sources/misskey/misskey_adapter.py +770 -0
- astrbot/core/platform/sources/misskey/misskey_api.py +964 -0
- astrbot/core/platform/sources/misskey/misskey_event.py +163 -0
- astrbot/core/platform/sources/misskey/misskey_utils.py +550 -0
- astrbot/core/platform/sources/qqofficial/qqofficial_message_event.py +149 -33
- astrbot/core/platform/sources/qqofficial/qqofficial_platform_adapter.py +41 -26
- astrbot/core/platform/sources/qqofficial_webhook/qo_webhook_adapter.py +36 -17
- astrbot/core/platform/sources/qqofficial_webhook/qo_webhook_event.py +3 -1
- astrbot/core/platform/sources/qqofficial_webhook/qo_webhook_server.py +14 -8
- astrbot/core/platform/sources/satori/satori_adapter.py +792 -0
- astrbot/core/platform/sources/satori/satori_event.py +432 -0
- astrbot/core/platform/sources/slack/client.py +164 -0
- astrbot/core/platform/sources/slack/slack_adapter.py +416 -0
- astrbot/core/platform/sources/slack/slack_event.py +253 -0
- astrbot/core/platform/sources/telegram/tg_adapter.py +100 -43
- astrbot/core/platform/sources/telegram/tg_event.py +136 -36
- astrbot/core/platform/sources/webchat/webchat_adapter.py +72 -22
- astrbot/core/platform/sources/webchat/webchat_event.py +46 -22
- astrbot/core/platform/sources/webchat/webchat_queue_mgr.py +35 -0
- astrbot/core/platform/sources/wechatpadpro/wechatpadpro_adapter.py +926 -0
- astrbot/core/platform/sources/wechatpadpro/wechatpadpro_message_event.py +178 -0
- astrbot/core/platform/sources/wechatpadpro/xml_data_parser.py +159 -0
- astrbot/core/platform/sources/wecom/wecom_adapter.py +169 -27
- astrbot/core/platform/sources/wecom/wecom_event.py +162 -77
- astrbot/core/platform/sources/wecom/wecom_kf.py +279 -0
- astrbot/core/platform/sources/wecom/wecom_kf_message.py +196 -0
- astrbot/core/platform/sources/wecom_ai_bot/WXBizJsonMsgCrypt.py +297 -0
- astrbot/core/platform/sources/wecom_ai_bot/__init__.py +15 -0
- astrbot/core/platform/sources/wecom_ai_bot/ierror.py +19 -0
- astrbot/core/platform/sources/wecom_ai_bot/wecomai_adapter.py +472 -0
- astrbot/core/platform/sources/wecom_ai_bot/wecomai_api.py +417 -0
- astrbot/core/platform/sources/wecom_ai_bot/wecomai_event.py +152 -0
- astrbot/core/platform/sources/wecom_ai_bot/wecomai_queue_mgr.py +153 -0
- astrbot/core/platform/sources/wecom_ai_bot/wecomai_server.py +168 -0
- astrbot/core/platform/sources/wecom_ai_bot/wecomai_utils.py +209 -0
- astrbot/core/platform/sources/weixin_official_account/weixin_offacc_adapter.py +306 -0
- astrbot/core/platform/sources/weixin_official_account/weixin_offacc_event.py +186 -0
- astrbot/core/platform_message_history_mgr.py +49 -0
- astrbot/core/provider/__init__.py +2 -3
- astrbot/core/provider/entites.py +8 -8
- astrbot/core/provider/entities.py +154 -98
- astrbot/core/provider/func_tool_manager.py +446 -458
- astrbot/core/provider/manager.py +345 -207
- astrbot/core/provider/provider.py +188 -73
- astrbot/core/provider/register.py +9 -7
- astrbot/core/provider/sources/anthropic_source.py +295 -115
- astrbot/core/provider/sources/azure_tts_source.py +224 -0
- astrbot/core/provider/sources/bailian_rerank_source.py +236 -0
- astrbot/core/provider/sources/dashscope_tts.py +138 -14
- astrbot/core/provider/sources/edge_tts_source.py +24 -19
- astrbot/core/provider/sources/fishaudio_tts_api_source.py +58 -13
- astrbot/core/provider/sources/gemini_embedding_source.py +61 -0
- astrbot/core/provider/sources/gemini_source.py +310 -132
- astrbot/core/provider/sources/gemini_tts_source.py +81 -0
- astrbot/core/provider/sources/groq_source.py +15 -0
- astrbot/core/provider/sources/gsv_selfhosted_source.py +151 -0
- astrbot/core/provider/sources/gsvi_tts_source.py +14 -7
- astrbot/core/provider/sources/minimax_tts_api_source.py +159 -0
- astrbot/core/provider/sources/openai_embedding_source.py +40 -0
- astrbot/core/provider/sources/openai_source.py +241 -145
- astrbot/core/provider/sources/openai_tts_api_source.py +18 -7
- astrbot/core/provider/sources/sensevoice_selfhosted_source.py +13 -11
- astrbot/core/provider/sources/vllm_rerank_source.py +71 -0
- astrbot/core/provider/sources/volcengine_tts.py +115 -0
- astrbot/core/provider/sources/whisper_api_source.py +18 -13
- astrbot/core/provider/sources/whisper_selfhosted_source.py +19 -12
- astrbot/core/provider/sources/xinference_rerank_source.py +116 -0
- astrbot/core/provider/sources/xinference_stt_provider.py +197 -0
- astrbot/core/provider/sources/zhipu_source.py +6 -73
- astrbot/core/star/__init__.py +43 -11
- astrbot/core/star/config.py +17 -18
- astrbot/core/star/context.py +362 -138
- astrbot/core/star/filter/__init__.py +4 -3
- astrbot/core/star/filter/command.py +111 -35
- astrbot/core/star/filter/command_group.py +46 -34
- astrbot/core/star/filter/custom_filter.py +6 -5
- astrbot/core/star/filter/event_message_type.py +4 -2
- astrbot/core/star/filter/permission.py +4 -2
- astrbot/core/star/filter/platform_adapter_type.py +45 -12
- astrbot/core/star/filter/regex.py +4 -2
- astrbot/core/star/register/__init__.py +19 -15
- astrbot/core/star/register/star.py +41 -13
- astrbot/core/star/register/star_handler.py +236 -86
- astrbot/core/star/session_llm_manager.py +280 -0
- astrbot/core/star/session_plugin_manager.py +170 -0
- astrbot/core/star/star.py +36 -43
- astrbot/core/star/star_handler.py +47 -85
- astrbot/core/star/star_manager.py +442 -260
- astrbot/core/star/star_tools.py +167 -45
- astrbot/core/star/updator.py +17 -20
- astrbot/core/umop_config_router.py +106 -0
- astrbot/core/updator.py +38 -13
- astrbot/core/utils/astrbot_path.py +39 -0
- astrbot/core/utils/command_parser.py +1 -1
- astrbot/core/utils/io.py +119 -60
- astrbot/core/utils/log_pipe.py +1 -1
- astrbot/core/utils/metrics.py +11 -10
- astrbot/core/utils/migra_helper.py +73 -0
- astrbot/core/utils/path_util.py +63 -62
- astrbot/core/utils/pip_installer.py +37 -15
- astrbot/core/utils/session_lock.py +29 -0
- astrbot/core/utils/session_waiter.py +19 -20
- astrbot/core/utils/shared_preferences.py +174 -34
- astrbot/core/utils/t2i/__init__.py +4 -1
- astrbot/core/utils/t2i/local_strategy.py +386 -238
- astrbot/core/utils/t2i/network_strategy.py +109 -49
- astrbot/core/utils/t2i/renderer.py +29 -14
- astrbot/core/utils/t2i/template/astrbot_powershell.html +184 -0
- astrbot/core/utils/t2i/template_manager.py +111 -0
- astrbot/core/utils/tencent_record_helper.py +115 -1
- astrbot/core/utils/version_comparator.py +10 -13
- astrbot/core/zip_updator.py +112 -65
- astrbot/dashboard/routes/__init__.py +20 -13
- astrbot/dashboard/routes/auth.py +20 -9
- astrbot/dashboard/routes/chat.py +297 -141
- astrbot/dashboard/routes/config.py +652 -55
- astrbot/dashboard/routes/conversation.py +107 -37
- astrbot/dashboard/routes/file.py +26 -0
- astrbot/dashboard/routes/knowledge_base.py +1244 -0
- astrbot/dashboard/routes/log.py +27 -2
- astrbot/dashboard/routes/persona.py +202 -0
- astrbot/dashboard/routes/plugin.py +197 -139
- astrbot/dashboard/routes/route.py +27 -7
- astrbot/dashboard/routes/session_management.py +354 -0
- astrbot/dashboard/routes/stat.py +85 -18
- astrbot/dashboard/routes/static_file.py +5 -2
- astrbot/dashboard/routes/t2i.py +233 -0
- astrbot/dashboard/routes/tools.py +184 -120
- astrbot/dashboard/routes/update.py +59 -36
- astrbot/dashboard/server.py +96 -36
- astrbot/dashboard/utils.py +165 -0
- astrbot-4.7.0.dist-info/METADATA +294 -0
- astrbot-4.7.0.dist-info/RECORD +274 -0
- {astrbot-3.5.6.dist-info → astrbot-4.7.0.dist-info}/WHEEL +1 -1
- astrbot/core/db/plugin/sqlite_impl.py +0 -112
- astrbot/core/db/sqlite_init.sql +0 -50
- astrbot/core/pipeline/platform_compatibility/stage.py +0 -56
- astrbot/core/pipeline/process_stage/method/llm_request.py +0 -606
- astrbot/core/platform/sources/gewechat/client.py +0 -806
- astrbot/core/platform/sources/gewechat/downloader.py +0 -55
- astrbot/core/platform/sources/gewechat/gewechat_event.py +0 -255
- astrbot/core/platform/sources/gewechat/gewechat_platform_adapter.py +0 -103
- astrbot/core/platform/sources/gewechat/xml_data_parser.py +0 -110
- astrbot/core/provider/sources/dashscope_source.py +0 -203
- astrbot/core/provider/sources/dify_source.py +0 -281
- astrbot/core/provider/sources/llmtuner_source.py +0 -132
- astrbot/core/rag/embedding/openai_source.py +0 -20
- astrbot/core/rag/knowledge_db_mgr.py +0 -94
- astrbot/core/rag/store/__init__.py +0 -9
- astrbot/core/rag/store/chroma_db.py +0 -42
- astrbot/core/utils/dify_api_client.py +0 -152
- astrbot-3.5.6.dist-info/METADATA +0 -249
- astrbot-3.5.6.dist-info/RECORD +0 -158
- {astrbot-3.5.6.dist-info → astrbot-4.7.0.dist-info}/entry_points.txt +0 -0
- {astrbot-3.5.6.dist-info → astrbot-4.7.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
|
|
3
|
+
import aiohttp
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class URLExtractor:
    """Extract the main text of a web page through the Tavily extract API.

    The extractor owns a list of Tavily API keys and hands them out in
    round-robin order, one key per request.
    """

    def __init__(self, tavily_keys: list[str]):
        """Create an extractor.

        Args:
            tavily_keys: Tavily API keys to rotate through.

        Raises:
            ValueError: If ``tavily_keys`` is empty.
        """
        if not tavily_keys:
            raise ValueError("Error: Tavily API keys are not configured.")

        self.tavily_keys = tavily_keys
        self.tavily_key_index = 0
        self.tavily_key_lock = asyncio.Lock()

    async def _get_tavily_key(self) -> str:
        """Return the next API key and advance the rotation index.

        The read and the index update happen while holding
        ``tavily_key_lock`` so concurrent callers each get a distinct turn.
        """
        async with self.tavily_key_lock:
            key = self.tavily_keys[self.tavily_key_index]
            self.tavily_key_index = (self.tavily_key_index + 1) % len(self.tavily_keys)
            return key

    async def extract_text_from_url(self, url: str) -> str:
        """Fetch the main text content of ``url`` via the Tavily extract API.

        This is a simplified version of the ``tavily_extract_web_page`` method
        of the web_searcher plugin, written for the knowledge-base module so
        that it does not depend on ``AstrMessageEvent``.

        Args:
            url: Address of the web page to extract.

        Returns:
            The ``raw_content`` of the first result, or an empty string when
            that field is missing or null.

        Raises:
            ValueError: If ``url`` is empty.
            OSError: If the request fails, the API answers with a non-200
                status, or no result is returned. Every exception raised
                while talking to the API (including the internal
                "no content" ``ValueError``) is re-raised as ``OSError``
                with the original exception chained as its cause.
        """
        if not url:
            raise ValueError("Error: url must be a non-empty string.")

        tavily_key = await self._get_tavily_key()
        api_url = "https://api.tavily.com/extract"
        headers = {
            "Authorization": f"Bearer {tavily_key}",
            "Content-Type": "application/json",
        }

        payload = {
            "urls": [url],
            "extract_depth": "basic",  # basic extraction depth
        }

        # Generous total timeout: content extraction can take a while.
        # ClientTimeout is the documented form; a bare float is only kept by
        # aiohttp for backward compatibility.
        request_timeout = aiohttp.ClientTimeout(total=30.0)

        try:
            async with aiohttp.ClientSession(trust_env=True) as session:
                async with session.post(
                    api_url,
                    json=payload,
                    headers=headers,
                    timeout=request_timeout,
                ) as response:
                    if response.status != 200:
                        reason = await response.text()
                        raise OSError(
                            f"Tavily web extraction failed: {reason}, status: {response.status}"
                        )

                    data = await response.json()
                    results = data.get("results", [])

                    if not results:
                        raise ValueError(f"No content extracted from URL: {url}")

                    # Only one URL was submitted, so the first result is the page.
                    # `or ""` also covers an explicit null so the return stays a str.
                    return results[0].get("raw_content") or ""

        except aiohttp.ClientError as e:
            raise OSError(f"Failed to fetch URL {url}: {e}") from e
        except Exception as e:
            # Deliberately broad: callers only need to handle OSError.
            raise OSError(f"Failed to extract content from URL {url}: {e}") from e
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
# Kept for backward compatibility: a plain function wrapper around URLExtractor.
async def extract_text_from_url(url: str, tavily_keys: list[str]) -> str:
    """Extract the text content of a web page with a one-off extractor.

    A new ``URLExtractor`` is built on every call, so key rotation always
    starts from the first key in ``tavily_keys``.

    Args:
        url: Address of the web page to extract.
        tavily_keys: Tavily API keys handed to the extractor.

    Returns:
        The extracted text content.
    """
    return await URLExtractor(tavily_keys).extract_text_from_url(url)
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
from .base import BaseParser
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
async def select_parser(ext: str) -> BaseParser:
    """Return a parser instance for the given file extension.

    Parser modules are imported lazily, only when their extension is requested.

    Args:
        ext: File extension including the leading dot (matched exactly,
            so it is case-sensitive).

    Returns:
        A ``PDFParser`` for ``.pdf``, or a ``MarkitdownParser`` for the
        text and office formats listed below.

    Raises:
        ValueError: If the extension is not supported.
    """
    if ext == ".pdf":
        from .pdf_parser import PDFParser

        return PDFParser()

    markitdown_extensions = {".md", ".txt", ".markdown", ".xlsx", ".docx", ".xls"}
    if ext in markitdown_extensions:
        from .markitdown_parser import MarkitdownParser

        return MarkitdownParser()

    raise ValueError(f"暂时不支持的文件格式: {ext}")
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
# System prompt for the text-repair step: it asks the model to separate
# content from page noise in a raw chunk and to answer only with
# <repaired_text>...</repaired_text> blocks or a single <discard_chunk /> tag.
# The text below is sent verbatim; do not reformat it.
TEXT_REPAIR_SYSTEM_PROMPT = """You are a meticulous digital archivist. Your mission is to reconstruct a clean, readable article from raw, noisy text chunks.

**Core Task:**
1.  **Analyze:** Examine the text chunk to separate "signal" (substantive information) from "noise" (UI elements, ads, navigation, footers).
2.  **Process:** Clean and repair the signal. **Do not translate it.** Keep the original language.

**Crucial Rules:**
- **NEVER discard a chunk if it contains ANY valuable information.** Your primary duty is to salvage content.
- **If a chunk contains multiple distinct topics, split them.** Enclose each topic in its own `<repaired_text>` tag.
- Your output MUST be ONLY `<repaired_text>...</repaired_text>` tags or a single `<discard_chunk />` tag.

---
**Example 1: Chunk with Noise and Signal**

*Input Chunk:*
"Home | About | Products | **The Llama is a domesticated South American camelid.** | © 2025 ACME Corp."

*Your Thought Process:*
1. "Home | About | Products..." and "© 2025 ACME Corp." are noise.
2. "The Llama is a domesticated..." is the signal.
3. I must extract the signal and wrap it.

*Your Output:*
<repaired_text>
The Llama is a domesticated South American camelid.
</repaired_text>

---
**Example 2: Chunk with ONLY Noise**

*Input Chunk:*
"Next Page > | Subscribe to our newsletter | Follow us on X"

*Your Thought Process:*
1. This entire chunk is noise. There is no signal.
2. I must discard this.

*Your Output:*
<discard_chunk />

---
**Example 3: Chunk with Multiple Topics (Requires Splitting)**

*Input Chunk:*
"## Chapter 1: The Sun
The Sun is the star at the center of the Solar System.

## Chapter 2: The Moon
The Moon is Earth's only natural satellite."

*Your Thought Process:*
1. This chunk contains two distinct topics.
2. I must process them separately to maintain semantic integrity.
3. I will create two `<repaired_text>` blocks.

*Your Output:*
<repaired_text>
## Chapter 1: The Sun
The Sun is the star at the center of the Solar System.
</repaired_text>
<repaired_text>
## Chapter 2: The Moon
The Moon is Earth's only natural satellite.
</repaired_text>
"""
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
"""检索模块"""
|
|
2
|
+
|
|
3
|
+
from .manager import RetrievalManager, RetrievalResult
|
|
4
|
+
from .rank_fusion import FusedResult, RankFusion
|
|
5
|
+
from .sparse_retriever import SparseResult, SparseRetriever
|
|
6
|
+
|
|
7
|
+
# Public names of this package, re-exported from the sibling modules imported above.
__all__ = [
    "FusedResult",
    "RankFusion",
    "RetrievalManager",
    "RetrievalResult",
    "SparseResult",
    "SparseRetriever",
]
|