@agentunion/kite 1.3.2 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +302 -0
- package/cli.js +119 -4
- package/core/dependency_checker.py +250 -0
- package/core/env_checker.py +490 -0
- package/dependencies_lock.json +128 -0
- package/extensions/agents/assistant/entry.py +111 -1
- package/extensions/agents/assistant/server.py +279 -215
- package/extensions/channels/acp_channel/entry.py +111 -1
- package/extensions/channels/acp_channel/module.md +23 -22
- package/extensions/channels/acp_channel/server.py +279 -215
- package/extensions/event_hub_bench/entry.py +107 -1
- package/extensions/services/backup/entry.py +306 -21
- package/extensions/services/backup/module.md +24 -22
- package/extensions/services/evol/auth_manager.py +443 -0
- package/extensions/services/evol/config.yaml +149 -0
- package/extensions/services/evol/config_loader.py +117 -0
- package/extensions/services/evol/entry.py +406 -0
- package/extensions/services/evol/evol_api.py +173 -0
- package/extensions/services/evol/evol_config.json5 +29 -0
- package/extensions/services/evol/migrate_tokens.py +122 -0
- package/extensions/services/evol/module.md +32 -0
- package/extensions/services/evol/pairing.py +250 -0
- package/extensions/services/evol/pairing_codes.jsonl +1 -0
- package/extensions/services/evol/relay.py +682 -0
- package/extensions/services/evol/relay_config.json5 +67 -0
- package/extensions/services/evol/routes/__init__.py +1 -0
- package/extensions/services/evol/routes/routes_management_ws.py +127 -0
- package/extensions/services/evol/routes/routes_rpc.py +89 -0
- package/extensions/services/evol/routes/routes_test.py +61 -0
- package/extensions/services/evol/server.py +875 -0
- package/extensions/services/evol/static/css/style.css +1200 -0
- package/extensions/services/evol/static/index.html +781 -0
- package/extensions/services/evol/static/index_evol.html +14 -0
- package/extensions/services/evol/static/js/app.js +6304 -0
- package/extensions/services/evol/static/js/auth.js +326 -0
- package/extensions/services/evol/static/js/dialog.js +285 -0
- package/extensions/services/evol/static/js/evol-app-fixed.js +50 -0
- package/extensions/services/evol/static/js/evol-app.js +1949 -0
- package/extensions/services/evol/static/js/evol-app.js.bak +1800 -0
- package/extensions/services/evol/static/js/kernel-client-example.js +228 -0
- package/extensions/services/evol/static/js/kernel-client.js +396 -0
- package/extensions/services/evol/static/js/main.js +141 -0
- package/extensions/services/evol/static/js/registry-tests.js +585 -0
- package/extensions/services/evol/static/js/stats.js +217 -0
- package/extensions/services/evol/static/js/token-manager.js +175 -0
- package/extensions/services/evol/static/pairing.html +248 -0
- package/extensions/services/evol/static/test_registry.html +262 -0
- package/extensions/services/evol/static/test_relay.html +462 -0
- package/extensions/services/evol/stats_manager.py +240 -0
- package/extensions/services/model_service/entry.py +167 -19
- package/extensions/services/model_service/module.md +21 -22
- package/extensions/services/proxy/.claude/settings.local.json +13 -0
- package/extensions/services/proxy/CHANGELOG_20260308.md +258 -0
- package/extensions/services/proxy/_fix_prints.py +133 -0
- package/extensions/services/proxy/_fix_prints2.py +87 -0
- package/extensions/services/proxy/agentcp/LICENCE +178 -0
- package/extensions/services/proxy/agentcp/README copy.md +85 -0
- package/extensions/services/proxy/agentcp/README.md +260 -0
- package/extensions/services/proxy/agentcp/__init__.py +16 -0
- package/extensions/services/proxy/agentcp/agent.py +4 -0
- package/extensions/services/proxy/agentcp/agentcp.py +2494 -0
- package/extensions/services/proxy/agentcp/agentprofile.json +89 -0
- package/extensions/services/proxy/agentcp/ap/__init__.py +16 -0
- package/extensions/services/proxy/agentcp/ap/ap_client.py +316 -0
- package/extensions/services/proxy/agentcp/assets/images/wechat_qr.png +0 -0
- package/extensions/services/proxy/agentcp/backup/metrics.json +31 -0
- package/extensions/services/proxy/agentcp/base/__init__.py +20 -0
- package/extensions/services/proxy/agentcp/base/auth_client.py +257 -0
- package/extensions/services/proxy/agentcp/base/client.py +112 -0
- package/extensions/services/proxy/agentcp/base/env.py +34 -0
- package/extensions/services/proxy/agentcp/base/html_util.py +336 -0
- package/extensions/services/proxy/agentcp/base/log.py +98 -0
- package/extensions/services/proxy/agentcp/ca/__init__.py +17 -0
- package/extensions/services/proxy/agentcp/ca/ca_client.py +414 -0
- package/extensions/services/proxy/agentcp/ca/ca_root.py +74 -0
- package/extensions/services/proxy/agentcp/context/__init__.py +20 -0
- package/extensions/services/proxy/agentcp/context/context.py +73 -0
- package/extensions/services/proxy/agentcp/context/exceptions.py +114 -0
- package/extensions/services/proxy/agentcp/create_profile.py +125 -0
- package/extensions/services/proxy/agentcp/create_profile_weather.py +125 -0
- package/extensions/services/proxy/agentcp/db/__init__.py +15 -0
- package/extensions/services/proxy/agentcp/db/db_mananger.py +550 -0
- package/extensions/services/proxy/agentcp/docs/UDP_HEARTBEAT_FIX_REPORT.md +265 -0
- package/extensions/services/proxy/agentcp/docs/heartbeat_issue_analysis.md +291 -0
- package/extensions/services/proxy/agentcp/file/__init__.py +16 -0
- package/extensions/services/proxy/agentcp/file/file_client.py +141 -0
- package/extensions/services/proxy/agentcp/file/wss_binary_message.py +137 -0
- package/extensions/services/proxy/agentcp/hcp.py +299 -0
- package/extensions/services/proxy/agentcp/heartbeat/__init__.py +16 -0
- package/extensions/services/proxy/agentcp/heartbeat/heartbeat_client.py +360 -0
- package/extensions/services/proxy/agentcp/improved_scheduler.py +498 -0
- package/extensions/services/proxy/agentcp/llm_agent_utils.py +249 -0
- package/extensions/services/proxy/agentcp/llm_server.py +172 -0
- package/extensions/services/proxy/agentcp/mermaid.py +210 -0
- package/extensions/services/proxy/agentcp/message.py +149 -0
- package/extensions/services/proxy/agentcp/metrics.py +256 -0
- package/extensions/services/proxy/agentcp/monitoring/__init__.py +20 -0
- package/extensions/services/proxy/agentcp/monitoring/global_monitor.py +27 -0
- package/extensions/services/proxy/agentcp/monitoring/metrics_store.py +325 -0
- package/extensions/services/proxy/agentcp/monitoring/monitoring_service.py +269 -0
- package/extensions/services/proxy/agentcp/monitoring/sliding_window.py +222 -0
- package/extensions/services/proxy/agentcp/monitoring/standalone_reader.py +224 -0
- package/extensions/services/proxy/agentcp/msg/__init__.py +21 -0
- package/extensions/services/proxy/agentcp/msg/connection_manager.py +456 -0
- package/extensions/services/proxy/agentcp/msg/message_client.py +2058 -0
- package/extensions/services/proxy/agentcp/msg/message_serialize.py +263 -0
- package/extensions/services/proxy/agentcp/msg/open_ai_message.py +88 -0
- package/extensions/services/proxy/agentcp/msg/session_manager.py +1062 -0
- package/extensions/services/proxy/agentcp/msg/stream_client.py +267 -0
- package/extensions/services/proxy/agentcp/msg/websocket_file_receiver.py +89 -0
- package/extensions/services/proxy/agentcp/msg/ws_logger.py +685 -0
- package/extensions/services/proxy/agentcp/msg/wss_binary_message.py +137 -0
- package/extensions/services/proxy/agentcp/requirements.txt +7 -0
- package/extensions/services/proxy/agentcp/samples/agent_graph/README.md +37 -0
- package/extensions/services/proxy/agentcp/samples/agent_graph/agentprofile.json +89 -0
- package/extensions/services/proxy/agentcp/samples/agent_graph/create_profile.py +138 -0
- package/extensions/services/proxy/agentcp/samples/agent_graph/main.py +164 -0
- package/extensions/services/proxy/agentcp/samples/agent_use/create_profile.py +123 -0
- package/extensions/services/proxy/agentcp/samples/agent_use/llm/create_profile.py +129 -0
- package/extensions/services/proxy/agentcp/samples/agent_use/llm/env.json +5 -0
- package/extensions/services/proxy/agentcp/samples/agent_use/llm/main.py +146 -0
- package/extensions/services/proxy/agentcp/samples/agent_use/main.py +123 -0
- package/extensions/services/proxy/agentcp/samples/agent_use/readme.md +379 -0
- package/extensions/services/proxy/agentcp/samples/agent_use/search/create_profile.py +129 -0
- package/extensions/services/proxy/agentcp/samples/agent_use/search/main.py +28 -0
- package/extensions/services/proxy/agentcp/samples/agent_use/tool/create_profile.py +129 -0
- package/extensions/services/proxy/agentcp/samples/agent_use/tool/main.py +20 -0
- package/extensions/services/proxy/agentcp/samples/ali_amap/README.md +97 -0
- package/extensions/services/proxy/agentcp/samples/ali_amap/amap_agent.py +88 -0
- package/extensions/services/proxy/agentcp/samples/ali_amap/create_profile.py +125 -0
- package/extensions/services/proxy/agentcp/samples/compute_agent/agent/powershell.py +228 -0
- package/extensions/services/proxy/agentcp/samples/compute_agent/agent/software.py +63 -0
- package/extensions/services/proxy/agentcp/samples/compute_agent/agent/tools.py +36 -0
- package/extensions/services/proxy/agentcp/samples/compute_agent/browser_user.py +41 -0
- package/extensions/services/proxy/agentcp/samples/deepseek/README.md +79 -0
- package/extensions/services/proxy/agentcp/samples/deepseek/create_profile.py +126 -0
- package/extensions/services/proxy/agentcp/samples/deepseek/deepseek.py +42 -0
- package/extensions/services/proxy/agentcp/samples/dify_chat/README.md +78 -0
- package/extensions/services/proxy/agentcp/samples/dify_chat/create_profile.py +126 -0
- package/extensions/services/proxy/agentcp/samples/dify_chat/dify_chat.py +47 -0
- package/extensions/services/proxy/agentcp/samples/dify_workflow/README.md +78 -0
- package/extensions/services/proxy/agentcp/samples/dify_workflow/create_profile.py +126 -0
- package/extensions/services/proxy/agentcp/samples/dify_workflow/dify_workflow.py +46 -0
- package/extensions/services/proxy/agentcp/samples/executor/README.md +44 -0
- package/extensions/services/proxy/agentcp/samples/executor/agentprofile.json +89 -0
- package/extensions/services/proxy/agentcp/samples/executor/create_profile.py +139 -0
- package/extensions/services/proxy/agentcp/samples/executor/main.py +160 -0
- package/extensions/services/proxy/agentcp/samples/filereader/README.md +45 -0
- package/extensions/services/proxy/agentcp/samples/filereader/agentprofile.json +90 -0
- package/extensions/services/proxy/agentcp/samples/filereader/create_profile.py +137 -0
- package/extensions/services/proxy/agentcp/samples/filereader/main.py +253 -0
- package/extensions/services/proxy/agentcp/samples/filewriter/README.md +38 -0
- package/extensions/services/proxy/agentcp/samples/filewriter/agentprofile.json +91 -0
- package/extensions/services/proxy/agentcp/samples/filewriter/create_profile.py +138 -0
- package/extensions/services/proxy/agentcp/samples/filewriter/main.py +289 -0
- package/extensions/services/proxy/agentcp/samples/hcp/README.md +85 -0
- package/extensions/services/proxy/agentcp/samples/hcp/acp_weather_agent.zip +0 -0
- package/extensions/services/proxy/agentcp/samples/hcp/create_profile.py +125 -0
- package/extensions/services/proxy/agentcp/samples/hcp/hcp.py +237 -0
- package/extensions/services/proxy/agentcp/samples/helloworld/README.md +68 -0
- package/extensions/services/proxy/agentcp/samples/helloworld/hello_world.py +40 -0
- package/extensions/services/proxy/agentcp/samples/llm_agent/MEADME.md +117 -0
- package/extensions/services/proxy/agentcp/samples/llm_agent/create_profile.py +125 -0
- package/extensions/services/proxy/agentcp/samples/llm_agent/qwen_agent.py +136 -0
- package/extensions/services/proxy/agentcp/samples/local_llm_agent/README.md +90 -0
- package/extensions/services/proxy/agentcp/samples/local_llm_agent/create_profile.py +125 -0
- package/extensions/services/proxy/agentcp/samples/local_llm_agent/main.py +49 -0
- package/extensions/services/proxy/agentcp/samples/query_llm_from_agent/README.md +55 -0
- package/extensions/services/proxy/agentcp/samples/query_llm_from_agent/create_profile.py +125 -0
- package/extensions/services/proxy/agentcp/samples/query_llm_from_agent/main.py +23 -0
- package/extensions/services/proxy/agentcp/samples/query_weather_api_agent/README.md +103 -0
- package/extensions/services/proxy/agentcp/samples/query_weather_api_agent/create_profile.py +125 -0
- package/extensions/services/proxy/agentcp/samples/query_weather_api_agent/main.py +69 -0
- package/extensions/services/proxy/agentcp/samples/query_weather_from_agent/README.md +58 -0
- package/extensions/services/proxy/agentcp/samples/query_weather_from_agent/create_profile.py +125 -0
- package/extensions/services/proxy/agentcp/samples/query_weather_from_agent/main.py +25 -0
- package/extensions/services/proxy/agentcp/samples/qwen3/README.md +71 -0
- package/extensions/services/proxy/agentcp/samples/qwen3/create_profile.py +126 -0
- package/extensions/services/proxy/agentcp/samples/qwen3/qwen3.py +37 -0
- package/extensions/services/proxy/agentcp/samples/qwen3_tools/README.md +133 -0
- package/extensions/services/proxy/agentcp/samples/qwen3_tools/create_profile.py +126 -0
- package/extensions/services/proxy/agentcp/samples/qwen3_tools/qwen3_tools.py +98 -0
- package/extensions/services/proxy/agentcp/samples/search/create_profile_qwen.py +125 -0
- package/extensions/services/proxy/agentcp/samples/search/create_profile_search.py +125 -0
- package/extensions/services/proxy/agentcp/samples/search/qwen_agent.py +136 -0
- package/extensions/services/proxy/agentcp/samples/search/search_agent.py +170 -0
- package/extensions/services/proxy/agentcp/samples/wrapper_agently_to_agent/README.md +89 -0
- package/extensions/services/proxy/agentcp/samples/wrapper_agently_to_agent/create_profile.py +125 -0
- package/extensions/services/proxy/agentcp/samples/wrapper_agently_to_agent/main.py +44 -0
- package/extensions/services/proxy/agentcp/utils/__init__.py +15 -0
- package/extensions/services/proxy/agentcp/utils/file_util.py +117 -0
- package/extensions/services/proxy/agentcp/utils/proxy_bypass.py +99 -0
- package/extensions/services/proxy/agentcp/workflow.py +203 -0
- package/extensions/services/proxy/console_auth.py +109 -0
- package/extensions/services/proxy/evol/__init__.py +1 -0
- package/extensions/services/proxy/evol/config.py +37 -0
- package/extensions/services/proxy/evol/http/__init__.py +1 -0
- package/extensions/services/proxy/evol/http/async_http.py +551 -0
- package/extensions/services/proxy/evol/log.py +28 -0
- package/extensions/services/proxy/evol/presenter/__init__.py +2 -0
- package/extensions/services/proxy/evol/presenter/agentIdPresenter.py +1031 -0
- package/extensions/services/proxy/evol/presenter/apikeyPresenter.py +106 -0
- package/extensions/services/proxy/evol/presenter/configPresenter.py +1281 -0
- package/extensions/services/proxy/evol/presenter/userPresenter.py +477 -0
- package/extensions/services/proxy/evol/server/__init__.py +1 -0
- package/extensions/services/proxy/evol/server/claude_proxy_async.py +3430 -0
- package/extensions/services/proxy/evol/server/openclaw_proxy.py +1861 -0
- package/extensions/services/proxy/evol/server/proxy_config.py +15 -0
- package/extensions/services/proxy/evol/server/proxy_engine.py +501 -0
- package/extensions/services/proxy/evol/version.py +24 -0
- package/extensions/services/proxy/logs/websocket.log +260 -0
- package/extensions/services/proxy/main.py +240 -0
- package/extensions/services/proxy/requirements.txt +13 -0
- package/extensions/services/proxy/server.py +271 -0
- package/extensions/services/watchdog/entry.py +215 -26
- package/extensions/services/watchdog/module.md +1 -0
- package/extensions/services/watchdog/monitor.py +178 -38
- package/extensions/services/web/WEBSOCKET_STATUS.md +143 -0
- package/extensions/services/web/config_example.py +35 -0
- package/extensions/services/web/config_loader.py +110 -0
- package/extensions/services/web/entry.py +114 -26
- package/extensions/services/web/module.md +35 -24
- package/extensions/services/web/pairing.py +250 -0
- package/extensions/services/web/pairing_codes.jsonl +16 -0
- package/extensions/services/web/relay.py +643 -0
- package/extensions/services/web/relay_config.json5 +67 -0
- package/extensions/services/web/routes/routes_management_ws.py +127 -0
- package/extensions/services/web/routes/routes_rpc.py +89 -0
- package/extensions/services/web/routes/routes_test.py +61 -0
- package/extensions/services/web/routes/schemas.py +0 -22
- package/extensions/services/web/server.py +434 -99
- package/extensions/services/web/static/css/style.css +67 -28
- package/extensions/services/web/static/index.html +234 -44
- package/extensions/services/web/static/js/app.js +1335 -48
- package/extensions/services/web/static/js/kernel-client-example.js +161 -0
- package/extensions/services/web/static/js/kernel-client.js +383 -0
- package/extensions/services/web/static/js/registry-tests.js +558 -0
- package/extensions/services/web/static/js/token-manager.js +175 -0
- package/extensions/services/web/static/pairing.html +248 -0
- package/extensions/services/web/static/test_registry.html +262 -0
- package/extensions/services/web/web_config.json5 +29 -0
- package/kernel/entry.py +120 -32
- package/kernel/event_hub.py +141 -16
- package/kernel/module.md +60 -33
- package/kernel/registry_store.py +45 -36
- package/kernel/rpc_router.py +152 -59
- package/kernel/server.py +322 -26
- package/kite_cli/__init__.py +3 -0
- package/kite_cli/__main__.py +5 -0
- package/kite_cli/commands/__init__.py +1 -0
- package/kite_cli/commands/clean.py +101 -0
- package/kite_cli/commands/deps_install.py +67 -0
- package/kite_cli/commands/doctor.py +35 -0
- package/kite_cli/commands/env_check.py +45 -0
- package/kite_cli/commands/history.py +111 -0
- package/kite_cli/commands/info.py +96 -0
- package/kite_cli/commands/install.py +313 -0
- package/kite_cli/commands/list.py +143 -0
- package/kite_cli/commands/log.py +81 -0
- package/kite_cli/commands/prepare.py +49 -0
- package/kite_cli/commands/rollback.py +88 -0
- package/kite_cli/commands/search.py +73 -0
- package/kite_cli/commands/uninstall.py +85 -0
- package/kite_cli/commands/update.py +118 -0
- package/kite_cli/commands/venv_setup.py +56 -0
- package/kite_cli/core/__init__.py +1 -0
- package/kite_cli/core/checker.py +142 -0
- package/kite_cli/core/dependency.py +229 -0
- package/kite_cli/core/downloader.py +209 -0
- package/kite_cli/core/install_info.py +40 -0
- package/kite_cli/core/tool_installer.py +397 -0
- package/kite_cli/core/validator.py +78 -0
- package/kite_cli/main.py +317 -0
- package/kite_cli/utils/__init__.py +1 -0
- package/kite_cli/utils/i18n.py +252 -0
- package/kite_cli/utils/interactive.py +63 -0
- package/kite_cli/utils/operation_log.py +77 -0
- package/kite_cli/utils/paths.py +34 -0
- package/kite_cli/utils/version.py +308 -0
- package/launcher/entry.py +1124 -178
- package/launcher/logging_setup.py +104 -0
- package/launcher/module.md +46 -37
- package/launcher/module_scanner.py +11 -1
- package/main.py +4 -1
- package/package.json +9 -1
- package/python_version.json +4 -0
- package/requirements.txt +38 -0
- package/scripts/env-manager.js +328 -0
- package/scripts/plan_manager.py +315 -0
- package/scripts/python-env.js +79 -0
- package/scripts/scan_dependencies.py +461 -0
- package/scripts/setup-python-env.js +191 -0
- package/extensions/services/web/routes/routes_modules.py +0 -249
|
@@ -0,0 +1,2058 @@
|
|
|
1
|
+
# Copyright 2025 AgentUnion Inc.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import asyncio
|
|
16
|
+
import json
|
|
17
|
+
import queue
|
|
18
|
+
import ssl
|
|
19
|
+
import threading
|
|
20
|
+
import time
|
|
21
|
+
from enum import Enum
|
|
22
|
+
from typing import Callable, Dict, Optional, Union
|
|
23
|
+
|
|
24
|
+
import websockets
|
|
25
|
+
from websockets.exceptions import ConnectionClosed, ConnectionClosedError, ConnectionClosedOK
|
|
26
|
+
from websockets.exceptions import InvalidMessage, PayloadTooBig, ProtocolError
|
|
27
|
+
from websockets.protocol import State as WsState
|
|
28
|
+
from websockets.frames import Frame, Opcode
|
|
29
|
+
|
|
30
|
+
from agentcp.utils.proxy_bypass import ensure_no_proxy_for_local_env, is_local_url, pop_proxy_env, restore_proxy_env
|
|
31
|
+
from agentcp.base.auth_client import AuthClient
|
|
32
|
+
from agentcp.base.client import IClient
|
|
33
|
+
from agentcp.base.log import log_debug, log_error, log_exception, log_info, log_warning
|
|
34
|
+
|
|
35
|
+
from ..context import ErrorContext, exceptions
|
|
36
|
+
|
|
37
|
+
ensure_no_proxy_for_local_env()
|
|
38
|
+
from .ws_logger import get_ws_logger # ✅ 导入 WebSocket 专用日志
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class ConnectionState(Enum):
    """Connection states tracked by the message client.

    Each member's value is the lowercase string label for that state.
    """

    DISCONNECTED = "disconnected"
    CONNECTING = "connecting"
    CONNECTED = "connected"
    RECONNECTING = "reconnecting"
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class MessageClientConfig:
    """Configuration class for MessageClient.

    Settings:
        max_queue_size: Maximum capacity of the message queue; messages are
            buffered there while the connection is down.
        connection_timeout: Timeout for establishing the WebSocket connection.
        ping_interval: Heartbeat interval used to detect whether the
            connection is still alive.
        reconnect_base_interval: Wait time before the first reconnect attempt.
        reconnect_max_interval: Maximum reconnect wait time (upper bound of
            the exponential backoff).
        reconnect_backoff_factor: Exponential backoff factor.
        max_message_size: Maximum size of a single message; larger messages
            are dropped.

    Any setting may be overridden at construction time by keyword, e.g.
    ``MessageClientConfig(ping_interval=10)``. Calling the constructor with no
    arguments yields the same defaults as before.

    Raises:
        TypeError: If a keyword does not name an existing setting.
    """

    def __init__(self, **overrides):
        # Message queue: enlarged to reduce message loss while disconnected
        # (previously 30).
        self.max_queue_size: int = 5000

        # Connection timeout: shortened to notice connection failures sooner
        # (previously 5.0).
        self.connection_timeout: float = 3.0

        self.retry_interval: float = 4.0
        self.max_retry_attempts: int = 0  # 0 means reconnect without limit
        self.send_retry_attempts: int = 5
        self.send_retry_delay: float = 0.01

        # Heartbeat: more frequent to detect a "hung" connection faster
        # (previously 5).
        self.ping_interval: int = 3

        # Auto-reconnect: shorter intervals for faster recovery.
        self.auto_reconnect: bool = True
        self.reconnect_base_interval: float = 0.5  # seconds (previously 2.0)
        self.reconnect_max_interval: float = 10.0  # seconds (previously 60.0)
        self.reconnect_backoff_factor: float = 1.5

        # Message size limit: 10 MB (previously 64 MB).
        self.max_message_size: int = 10 * 1024 * 1024

        # Apply caller overrides last; reject unknown names so a typo cannot
        # silently create a setting nobody reads.
        known_settings = vars(self)
        for name, value in overrides.items():
            if name not in known_settings:
                raise TypeError(
                    f"MessageClientConfig got an unexpected setting {name!r}"
                )
            setattr(self, name, value)
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
class MessageClient(IClient):
|
|
87
|
+
"""WebSocket-based message client using websockets library.
|
|
88
|
+
|
|
89
|
+
使用 websockets 库替代 websocket-client,更好地处理协议扩展和错误。
|
|
90
|
+
"""
|
|
91
|
+
|
|
92
|
+
# 类级别的速率限制标志
|
|
93
|
+
_last_rate_limit_log_time = 0
|
|
94
|
+
_rate_limit_log_interval = 30
|
|
95
|
+
|
|
96
|
+
def __init__(
    self,
    agent_id: str,
    server_url: str,
    aid_path: str,
    seed_password: str,
    cache_auth_client: Optional[AuthClient] = None,
    config: Optional[MessageClientConfig] = None,
    agent_id_ref=None,
):
    """Set up client state; no connection is opened here.

    Args:
        agent_id: Identifier of the agent this client acts for.
        server_url: Server base URL; stored with any trailing "/" removed.
        aid_path: Passed to ``AuthClient`` when no cached client is given.
        seed_password: Passed to ``AuthClient`` when no cached client is given.
        cache_auth_client: Existing auth client to reuse. When ``None`` a new
            ``AuthClient`` is created from the arguments above.
        config: Client settings. Defaults to ``MessageClientConfig()``.
        agent_id_ref: Optional object that may expose
            ``get_use_system_proxy()``.
    """
    self.agent_id = agent_id
    self.server_url = server_url.rstrip("/")
    self.config = config or MessageClientConfig()
    self._agent_id_ref = agent_id_ref

    # Initialize auth client (note: it receives the URL as passed in, not the
    # stripped copy stored on self).
    if cache_auth_client is None:
        self.auth_client = AuthClient(agent_id, server_url, aid_path, seed_password)
    else:
        self.auth_client = cache_auth_client

    # Thread synchronization
    self.lock = threading.Lock()
    self.connected_event = threading.Event()

    # WebSocket related
    self.ws: Optional[websockets.WebSocketClientProtocol] = None
    self.ws_thread: Optional[threading.Thread] = None
    self.ws_url: Optional[str] = None

    # Asyncio event loop for websockets
    self._loop: Optional[asyncio.AbstractEventLoop] = None

    # Message handling
    self.queue = queue.Queue(maxsize=self.config.max_queue_size)
    self.message_handler: Optional[object] = None

    # Connection state
    self._connection_state = ConnectionState.DISCONNECTED
    self._is_retrying = False
    self._shutdown_requested = False
    self.stream_queue_map = {}
    self._stream_queue_lock = threading.Lock()  # guards access to stream_queue_map

    # Stream queue cleanup
    self._cleanup_thread: Optional[threading.Thread] = None
    self._cleanup_running = False

    # Reconnect state
    self._current_reconnect_interval = self.config.reconnect_base_interval
    self._reconnect_attempt_count = 0

    # Connection health check
    self._health_check_thread: Optional[threading.Thread] = None
    self._health_check_running = False
    self._last_pong_time: float = 0

    # Unique connection id, used for tracing and to prevent duplicate connections
    self._connection_id: int = 0

    # CONNECTING tracking: avoid stuck connection attempts
    self._connecting_since: float = 0.0
    self._connecting_conn_id: int = 0

    # Disconnect callback: notifies the outside when the WebSocket drops.
    # Annotated with Callable; the builtin function `callable` is not a type.
    self._on_disconnect_callback: Optional[Callable[..., object]] = None

    # Reconnect callback: notifies the outside when the WebSocket is restored.
    self._on_reconnect_callback: Optional[Callable[..., object]] = None
|
|
165
|
+
|
|
166
|
+
@property
def connection_state(self) -> ConnectionState:
    """Return the current connection state, read while holding ``self.lock``."""
    with self.lock:
        current = self._connection_state
    return current
|
|
171
|
+
|
|
172
|
+
def _set_connection_state(self, state: ConnectionState) -> None:
    """Store ``state`` and keep the dependent fields in step, under ``self.lock``.

    ``connected_event`` is set only for CONNECTED and cleared for every other
    state. The CONNECTING bookkeeping (``_connecting_since`` and
    ``_connecting_conn_id``) is reset for any state other than CONNECTING.
    """
    with self.lock:
        self._connection_state = state

        event = self.connected_event
        signal = event.set if state == ConnectionState.CONNECTED else event.clear
        signal()

        if state != ConnectionState.CONNECTING:
            self._connecting_since, self._connecting_conn_id = 0.0, 0
|
|
183
|
+
|
|
184
|
+
def _get_use_system_proxy(self) -> bool:
|
|
185
|
+
"""获取是否使用系统代理"""
|
|
186
|
+
if self._agent_id_ref and hasattr(self._agent_id_ref, 'get_use_system_proxy'):
|
|
187
|
+
return self._agent_id_ref.get_use_system_proxy()
|
|
188
|
+
return False
|
|
189
|
+
|
|
190
|
+
def _is_ws_open(self) -> bool:
    """Return True when ``self.ws`` exists and its state is ``WsState.OPEN``.

    Any exception raised while inspecting the socket is treated as "not open".
    """
    try:
        sock = self.ws
        if sock is None:
            return False
        return sock.state == WsState.OPEN
    except Exception:
        return False
|
|
196
|
+
|
|
197
|
+
# ==================== 连接状态查询 API ====================
|
|
198
|
+
|
|
199
|
+
def is_healthy(self) -> bool:
    """Check whether the connection is healthy and usable.

    All of the following must hold, checked in this order:
      1. The WebSocket is open.
      2. ``connected_event`` is set.
      3. The connection state is CONNECTED.
      4. No reconnect is in progress.

    Returns:
        True if the connection is healthy and messages can be sent,
        False otherwise.
    """
    if not self._is_ws_open():
        return False
    if not self.connected_event.is_set():
        return False
    state_ok = self.connection_state == ConnectionState.CONNECTED
    if not state_ok:
        return False
    return not self._is_retrying
|
|
218
|
+
|
|
219
|
+
def get_connection_info(self) -> dict:
    """Collect a snapshot of the connection status.

    Returns:
        A dict with identity, state, reconnect counters, queue usage and
        pending stream count.
    """
    info = {}
    info["agent_id"] = self.agent_id
    info["server_url"] = self.server_url
    info["state"] = self.connection_state.value
    info["ws_open"] = self._is_ws_open()
    info["is_healthy"] = self.is_healthy()
    info["is_retrying"] = self._is_retrying
    info["reconnect_attempts"] = self._reconnect_attempt_count
    info["current_reconnect_interval"] = self._current_reconnect_interval
    info["connection_id"] = self._connection_id
    info["last_pong_time"] = self._last_pong_time
    info["queue_size"] = self.queue.qsize()
    info["queue_capacity"] = self.config.max_queue_size
    info["pending_streams"] = self.get_pending_stream_count()
    return info
|
|
240
|
+
|
|
241
|
+
def get_health_summary(self) -> str:
    """Build a one-line health summary for logging and debugging.

    Returns:
        A string with the health marker, state, socket-open flag, retry flag
        and queue usage, taken from ``get_connection_info()``.
    """
    info = self.get_connection_info()
    if info["is_healthy"]:
        status = "🟢 健康"
    else:
        status = "🔴 不健康"
    summary = (
        f"{status} | state={info['state']} | "
        f"ws_open={info['ws_open']} | "
        f"retrying={info['is_retrying']} | "
        f"queue={info['queue_size']}/{info['queue_capacity']}"
    )
    return summary
|
|
255
|
+
|
|
256
|
+
def set_reconnect_callback(self, callback: Optional[Callable[..., object]]) -> None:
    """Register the callback for a restored WebSocket connection.

    Documented callback signature: ``callback(agent_id: str, server_url: str)``.

    Args:
        callback: Callable stored as ``_on_reconnect_callback``. The value is
            stored as given, without validation.
    """
    # The parameter was annotated with the builtin function `callable`, which
    # is not a type; Callable is the correct annotation.
    self._on_reconnect_callback = callback
    log_info(f"[MessageClient] 已设置连接恢复回调: {callback}")
|
|
267
|
+
|
|
268
|
+
# ==================== 原有方法 ====================
|
|
269
|
+
|
|
270
|
+
def initialize(self) -> None:
    """Initialize the client by signing in through the auth client.

    The sign-in result is discarded and exceptions are not caught here.
    """
    authenticator = self.auth_client
    authenticator.sign_in()
|
|
273
|
+
|
|
274
|
+
def sign_in(self) -> bool:
    """Sign in through the auth client.

    Returns:
        True when the auth client returns anything other than ``None``.
        False when it returns ``None`` or raises; the exception is logged.
    """
    try:
        outcome = self.auth_client.sign_in()
    except Exception as e:
        log_exception(f"Failed to sign in: {e}")
        return False
    return outcome is not None
|
|
282
|
+
|
|
283
|
+
def get_headers(self) -> Dict[str, str]:
    """Return the request headers: a single ``User-Agent`` entry.

    The value embeds the ``agentcp`` package version and this client's
    agent id.
    """
    import agentcp

    version = agentcp.__version__
    return {"User-Agent": f"AgentCP/{version} (AuthClient; {self.agent_id})"}
|
|
286
|
+
|
|
287
|
+
def sign_out(self) -> None:
    """Sign out through the auth client; exceptions are not caught here."""
    authenticator = self.auth_client
    authenticator.sign_out()
|
|
290
|
+
|
|
291
|
+
def set_message_handler(self, message_handler: object) -> None:
    """Store the handler object used for incoming messages.

    Args:
        message_handler: Stored as ``self.message_handler`` without validation.
    """
    self.message_handler = message_handler
|
|
294
|
+
|
|
295
|
+
def set_disconnect_callback(self, callback: Optional[Callable[..., object]]) -> None:
    """Register the callback for a dropped WebSocket connection.

    Documented callback signature:
    ``callback(agent_id: str, server_url: str, code: int, reason: str)``.

    Args:
        callback: Callable stored as ``_on_disconnect_callback``. The value is
            stored as given, without validation.
    """
    # The parameter was annotated with the builtin function `callable`, which
    # is not a type; Callable is the correct annotation.
    self._on_disconnect_callback = callback
    log_info(f"[MessageClient] 已设置断开回调: {callback}")
|
|
306
|
+
|
|
307
|
+
def _build_websocket_url(self) -> str:
|
|
308
|
+
"""Build WebSocket URL with proper protocol and parameters."""
|
|
309
|
+
ws_url = self.server_url.replace("https://", "wss://").replace("http://", "ws://")
|
|
310
|
+
return f"{ws_url}/session?agent_id={self.agent_id}&signature={self.auth_client.signature}"
|
|
311
|
+
|
|
312
|
+
def start_websocket_client(self) -> bool:
    """Ensure a WebSocket connection exists, starting one only if needed.

    An already-open socket is reused (and a non-CONNECTED state is corrected).
    If the state is CONNECTING and that attempt is not stale, this call only
    waits for it. Otherwise a new connection id is allocated, the previous
    connection is cleaned up and a new handler thread is started.

    Returns:
        True if the socket is already open or ``connected_event`` is set
        within the configured timeout; False on timeout, after a shutdown
        request, or while the interpreter is finalizing.
    """
    # Never start connections while the interpreter is shutting down.
    import sys
    if hasattr(sys, 'is_finalizing') and sys.is_finalizing():
        log_debug("Interpreter is shutting down, skipping connection")
        self._shutdown_requested = True
        return False

    if self._shutdown_requested:
        return False

    restart = False
    conn_id = 0
    now = time.time()

    with self.lock:
        ws_open = self._is_ws_open()

        # An open socket never needs a reconnect, whatever the recorded state.
        if ws_open:
            if self._connection_state != ConnectionState.CONNECTED:
                log_info(f"[conn:{self._connection_id}] WebSocket is open, fixing state from {self._connection_state.value} to connected")
                self._connection_state = ConnectionState.CONNECTED
                self.connected_event.set()
            return True

        if self._connection_state != ConnectionState.CONNECTING:
            # The socket is really unavailable: allocate a new connection.
            restart = True
            self._connection_id += 1
            conn_id = self._connection_id
            log_info(f"[conn:{conn_id}] Creating new connection: state={self._connection_state.value}, ws_open={ws_open}")
        else:
            # Another thread is (or was) connecting; take over only if stale.
            conn_id = self._connection_id
            is_stale = (
                self._connecting_conn_id == conn_id
                and self._connecting_since > 0
                and (now - self._connecting_since) > max(self.config.connection_timeout * 2, 10.0)
            )
            if not is_stale:
                log_debug(f"[conn:{conn_id}] Another thread is connecting, waiting...")
            else:
                log_warning(
                    f"[conn:{conn_id}] Stale CONNECTING detected "
                    f"(elapsed={now - self._connecting_since:.1f}s), restarting connection"
                )
                restart = True
                self._connection_id += 1
                conn_id = self._connection_id
                log_info(f"[conn:{conn_id}] Creating new connection: state=connecting(stale), ws_open={ws_open}")

        if restart:
            self._connection_state = ConnectionState.CONNECTING
            self._connecting_since = now
            self._connecting_conn_id = conn_id
            self.connected_event.clear()

    # Blocking work happens outside the lock.
    if restart:
        self._cleanup_old_connection_unlocked()

        self.ws_url = self._build_websocket_url()
        log_debug(f"[conn:{conn_id}] Connecting to WebSocket URL: {self.ws_url}")

        # Record the attempt in the dedicated WebSocket log.
        ws_logger = get_ws_logger()
        ws_logger.log_connection_attempt(conn_id, self.ws_url, "new_connection")

        # The handler thread runs its own asyncio loop.
        self.ws_thread = threading.Thread(
            target=self._ws_handler,
            args=(conn_id,),
            daemon=True,
            name=f"WebSocketHandler-{conn_id}"
        )
        self.ws_thread.start()

    return self._wait_for_connection()
|
|
404
|
+
|
|
405
|
+
def _cleanup_old_connection_unlocked(self) -> None:
    """Tear down the previous connection's socket, event loop and thread.

    Must be called WITHOUT ``self.lock`` held, because it blocks (up to
    about two seconds each) on closing the socket and joining the thread.
    Every step is best-effort: failures are swallowed so that a new
    connection can still be created.
    """
    log_info(f"[cleanup] 开始清理旧连接状态...")

    # Ask the auxiliary threads to stop.
    self._cleanup_running = False
    self._health_check_running = False

    # Notify waiting stream requests before the old connection goes away.
    waiting = self.get_pending_stream_count()
    if waiting > 0:
        log_warning(f"[cleanup] 通知 {waiting} 个等待中的 stream 请求...")
        self._notify_pending_stream_requests("创建新连接,旧请求已取消")

    # Snapshot the old references under the lock. They are deliberately not
    # cleared here; the new connection overwrites them.
    with self.lock:
        prev_loop, prev_ws, prev_thread = self._loop, self.ws, self.ws_thread

    # Close the old WebSocket on its own loop.
    if prev_loop and prev_ws:
        try:
            if prev_loop.is_running():
                close_future = asyncio.run_coroutine_threadsafe(
                    self._graceful_close_ws(prev_ws),
                    prev_loop
                )
                close_future.result(timeout=2.0)
        except Exception:
            pass

    # Stop the old event loop.
    if prev_loop:
        try:
            if prev_loop.is_running():
                prev_loop.call_soon_threadsafe(prev_loop.stop)
        except Exception:
            pass

    # Wait briefly for the old handler thread to finish.
    if prev_thread and prev_thread.is_alive():
        try:
            prev_thread.join(timeout=2.0)
        except Exception:
            pass
|
|
456
|
+
|
|
457
|
+
async def _graceful_close_ws(self, ws) -> None:
|
|
458
|
+
"""Gracefully close WebSocket connection."""
|
|
459
|
+
if ws is None:
|
|
460
|
+
return
|
|
461
|
+
try:
|
|
462
|
+
await asyncio.wait_for(ws.close(), timeout=1.0)
|
|
463
|
+
except asyncio.TimeoutError:
|
|
464
|
+
pass
|
|
465
|
+
except Exception:
|
|
466
|
+
pass
|
|
467
|
+
|
|
468
|
+
def _cleanup_old_connection(self) -> None:
|
|
469
|
+
"""Clean up old connection (legacy method, calls unlocked version)."""
|
|
470
|
+
self._cleanup_old_connection_unlocked()
|
|
471
|
+
|
|
472
|
+
def _wait_for_connection(self) -> bool:
|
|
473
|
+
"""Wait for connection to be established."""
|
|
474
|
+
result = self.connected_event.wait(timeout=self.config.connection_timeout)
|
|
475
|
+
if not result:
|
|
476
|
+
# 超时了,检查状态
|
|
477
|
+
with self.lock:
|
|
478
|
+
if self._connection_state == ConnectionState.CONNECTING:
|
|
479
|
+
# 连接超时,但线程可能还在运行,让它继续
|
|
480
|
+
# 下次调用会重新等待或创建新连接
|
|
481
|
+
log_debug("Connection wait timeout, connection still in progress")
|
|
482
|
+
if self._connecting_since > 0 and (time.time() - self._connecting_since) > self.config.connection_timeout:
|
|
483
|
+
log_warning("Connection appears stalled, marking DISCONNECTED to allow reconnect")
|
|
484
|
+
self._connection_state = ConnectionState.DISCONNECTED
|
|
485
|
+
self._connecting_since = 0.0
|
|
486
|
+
self._connecting_conn_id = 0
|
|
487
|
+
self.connected_event.clear()
|
|
488
|
+
return result
|
|
489
|
+
|
|
490
|
+
def stop_websocket_client(self) -> None:
    """Shut the WebSocket client down and mark it DISCONNECTED.

    Sets the shutdown flag, stops the cleanup and health-check threads,
    closes the socket on its event loop, stops that loop, joins the handler
    thread (waiting up to two seconds) and finally sets the connection
    state to DISCONNECTED. Socket and loop teardown are best-effort.
    """
    self._shutdown_requested = True

    # Stop the auxiliary threads first.
    self._stop_cleanup_thread()
    self._stop_health_check_thread()

    # Close the WebSocket on its own loop.
    if self._loop and self.ws:
        try:
            if self._loop.is_running():
                close_future = asyncio.run_coroutine_threadsafe(
                    self._graceful_close_ws(self.ws),
                    self._loop
                )
                close_future.result(timeout=2.0)
        except Exception:
            pass

    # Stop the event loop.
    if self._loop and self._loop.is_running():
        try:
            self._loop.call_soon_threadsafe(self._loop.stop)
        except Exception:
            pass

    handler_thread = self.ws_thread
    if handler_thread and handler_thread.is_alive():
        handler_thread.join(timeout=2.0)
        self.ws_thread = None

    self._set_connection_state(ConnectionState.DISCONNECTED)
|
|
527
|
+
|
|
528
|
+
def send_msg(self, msg: Union[str, Dict]) -> bool:
    """Send a message over the WebSocket, queueing it when it cannot be sent now.

    The size limit is enforced before any connection or queueing logic, so
    an oversized message can never reach the offline queue (from which it
    would later be flushed without a size check).

    Args:
        msg: A ready-to-send string, or a dict that is JSON-serialized.

    Returns:
        True only when the send coroutine completed on the event loop.
        False when the message was discarded for exceeding
        ``config.max_message_size`` or was handed to ``_queue_message``.

    Raises:
        TypeError, ValueError: If ``msg`` is not a string and cannot be
            JSON-serialized.
    """
    message_str = json.dumps(msg) if not isinstance(msg, str) else msg

    # Discard oversized messages up front, on both the online and offline paths.
    msg_size = len(message_str.encode('utf-8'))
    if msg_size > self.config.max_message_size:
        log_error(f"[conn:{self._connection_id}] ❌ 发送消息过大,已丢弃: {msg_size/1024/1024:.2f}MB > {self.config.max_message_size/1024/1024:.0f}MB 限制")
        # Record the discard in the dedicated WebSocket log.
        ws_logger = get_ws_logger()
        ws_logger.log_abnormal_data(
            conn_id=self._connection_id,
            data=None,
            error=f"发送消息大小 {msg_size/1024/1024:.2f}MB ({msg_size} bytes) 超过限制 {self.config.max_message_size/1024/1024:.0f}MB,已丢弃",
            data_type="oversized_send_discarded"
        )
        return False  # discarded, reported as a failure

    if not self._ensure_connection():
        return self._queue_message(msg)

    try:
        # The connection may have dropped since it was ensured.
        if not self._is_ws_open():
            log_debug("WebSocket connection invalid, queueing message")
            # The state is left alone: the connection recovers by itself or
            # is handled by the health check.
            return self._queue_message(msg)

        # Hand the send to the handler thread's event loop.
        # NOTE: _async_send is a no-op when the socket is no longer open, in
        # which case True is still returned here.
        if self._loop and self._loop.is_running():
            future = asyncio.run_coroutine_threadsafe(
                self._async_send(message_str),
                self._loop
            )
            future.result(timeout=5.0)
            return True
        else:
            return self._queue_message(msg)

    except ConnectionClosed as e:
        log_debug(f"WebSocket connection closed during send: {e}")
        # The socket is gone: record the state so a reconnect can happen.
        with self.lock:
            if self._connection_state == ConnectionState.CONNECTED:
                self._connection_state = ConnectionState.DISCONNECTED
                self.connected_event.clear()
        return self._queue_message(msg)
    except Exception as e:
        log_debug(f"Failed to send message: {e}")
        trace_id = msg.get("trace_id", "") if isinstance(msg, dict) else ""
        ErrorContext.publish(exceptions.SendMsgError(message=f"Error sending message: {e}", trace_id=trace_id))
        # A failed send does not necessarily mean the connection is down, so
        # the state is not changed here.
        return self._queue_message(msg)
|
|
581
|
+
|
|
582
|
+
async def _async_send(self, message: str) -> None:
|
|
583
|
+
"""Async send message."""
|
|
584
|
+
if self._is_ws_open():
|
|
585
|
+
await self.ws.send(message)
|
|
586
|
+
|
|
587
|
+
def _ensure_connection(self) -> bool:
|
|
588
|
+
"""Ensure WebSocket connection is established."""
|
|
589
|
+
# 快速路径:如果已连接且有效,直接返回
|
|
590
|
+
if self._is_ws_open():
|
|
591
|
+
# 只在状态是 DISCONNECTED 时修正为 CONNECTED
|
|
592
|
+
# 不要修改 CONNECTING 状态,避免干扰正在进行的连接
|
|
593
|
+
with self.lock:
|
|
594
|
+
if self._connection_state == ConnectionState.DISCONNECTED:
|
|
595
|
+
self._connection_state = ConnectionState.CONNECTED
|
|
596
|
+
self.connected_event.set()
|
|
597
|
+
return True
|
|
598
|
+
|
|
599
|
+
# 需要建立连接
|
|
600
|
+
retry_count = 0
|
|
601
|
+
while retry_count < self.config.send_retry_attempts:
|
|
602
|
+
if self.start_websocket_client():
|
|
603
|
+
return True
|
|
604
|
+
|
|
605
|
+
retry_count += 1
|
|
606
|
+
if retry_count < self.config.send_retry_attempts:
|
|
607
|
+
time.sleep(self.config.send_retry_delay)
|
|
608
|
+
|
|
609
|
+
log_error(f"Failed to establish connection after {self.config.send_retry_attempts} attempts")
|
|
610
|
+
return False
|
|
611
|
+
|
|
612
|
+
def _queue_message(self, msg: Union[str, Dict]) -> bool:
    """Store a message in the offline queue for later sending.

    When the queue is full the oldest entry is dropped to make room.

    Args:
        msg: A string, or a dict that is JSON-serialized before queueing.

    Returns:
        Always False (the message was not sent), whether or not queueing
        itself succeeded.
    """
    try:
        if self.queue.full():
            # Drop the oldest entry to make room for the new one.
            try:
                self.queue.get_nowait()
                self.queue.task_done()
            except queue.Empty:
                pass

        payload = msg if isinstance(msg, str) else json.dumps(msg)
        self.queue.put(payload, timeout=1)
        log_debug("Message queued for later sending")
    except (queue.Full, queue.Empty) as e:
        log_error(f"Failed to queue message: {e}")
    return False
|
|
630
|
+
|
|
631
|
+
def _handle_reconnection(self) -> None:
    """Reconnect with exponential backoff until success, give-up or shutdown.

    Returns immediately when the interpreter is finalizing, shutdown was
    requested, auto-reconnect is disabled, or another reconnection is
    already in progress (``_is_retrying``).

    Each attempt calls ``start_websocket_client``. A successful attempt is
    then verified, followed by the system recovery check and the optional
    reconnect callback; the attempt counter and interval are reset before
    returning. After a failed attempt the method sleeps for the current
    interval and multiplies it by ``config.reconnect_backoff_factor``, capped
    at ``config.reconnect_max_interval``. A ``config.max_retry_attempts`` of
    0 or less means the number of attempts is unlimited.

    On exit ``_is_retrying`` is cleared, and the state is set to
    DISCONNECTED unless the connection is CONNECTED.
    """
    # Do nothing while the interpreter is shutting down.
    import sys
    if hasattr(sys, 'is_finalizing') and sys.is_finalizing():
        log_debug("Interpreter is shutting down, skipping reconnection")
        self._shutdown_requested = True
        return

    if self._shutdown_requested:
        return

    if not self.config.auto_reconnect:
        log_debug("Auto-reconnect is disabled, skipping reconnection")
        return

    # The _is_retrying flag is checked and set under the lock so that only
    # one reconnection loop runs at a time.
    with self.lock:
        if self._is_retrying:
            log_debug("Reconnection already in progress, skipping")
            return
        self._is_retrying = True
        # No RECONNECTING state is set here; start_websocket_client sets
        # CONNECTING itself.

    reconnect_start_time = time.time()
    ws_logger = get_ws_logger()

    try:
        # A fresh reconnection sequence starts from the base interval.
        if self._reconnect_attempt_count == 0:
            self._current_reconnect_interval = self.config.reconnect_base_interval

        while not self._shutdown_requested:
            self._reconnect_attempt_count += 1

            if self.config.max_retry_attempts > 0 and self._reconnect_attempt_count > self.config.max_retry_attempts:
                log_error(f"Reconnection failed after {self.config.max_retry_attempts} attempts, giving up")
                # Record the reconnection failure.
                ws_logger.log_reconnect_fail(
                    conn_id=self._connection_id,
                    attempt=self._reconnect_attempt_count,
                    reason=f"达到最大重试次数 {self.config.max_retry_attempts}"
                )
                break

            # Record the start of this attempt.
            ws_logger.log_reconnect_start(
                conn_id=self._connection_id,
                attempt=self._reconnect_attempt_count,
                interval=self._current_reconnect_interval
            )

            # Log at info level only for the first and every tenth attempt.
            if self._reconnect_attempt_count == 1 or self._reconnect_attempt_count % 10 == 0:
                log_info(f"🔄 Reconnecting... attempt {self._reconnect_attempt_count} (interval: {self._current_reconnect_interval:.1f}s)")
            else:
                log_debug(f"Reconnecting attempt {self._reconnect_attempt_count}")

            if self.start_websocket_client():
                reconnect_duration = time.time() - reconnect_start_time
                log_info("✅ Reconnection successful!")

                # Record the success (note: this happens before verification).
                ws_logger.log_reconnect_success(
                    conn_id=self._connection_id,
                    attempt=self._reconnect_attempt_count,
                    duration=reconnect_duration,
                    pending_recovered=0  # waiting requests were already notified at disconnect, so 0 here
                )

                # Verify that the connection is really usable.
                if not self._verify_connection_after_reconnect():
                    log_warning("⚠️ 重连后连接验证失败,继续重试...")
                    # Retry after the current interval; backoff is not applied here.
                    time.sleep(self._current_reconnect_interval)
                    continue

                # Run the system recovery check.
                self._perform_system_recovery_check()

                # Fire the connection-restored callback; its errors are only logged.
                if self._on_reconnect_callback:
                    try:
                        log_info(f"[conn:{self._connection_id}] 触发连接恢复回调...")
                        self._on_reconnect_callback(
                            agent_id=self.agent_id,
                            server_url=self.server_url
                        )
                    except Exception as e:
                        log_error(f"[conn:{self._connection_id}] 连接恢复回调执行异常: {e}")

                self._reconnect_attempt_count = 0
                self._current_reconnect_interval = self.config.reconnect_base_interval
                return

            time.sleep(self._current_reconnect_interval)

            # Exponential backoff, capped at the configured maximum.
            self._current_reconnect_interval = min(
                self._current_reconnect_interval * self.config.reconnect_backoff_factor,
                self.config.reconnect_max_interval
            )

        # Reached when the loop ends without success (give-up or shutdown).
        if self.config.max_retry_attempts > 0:
            log_error(f"Reconnection failed after {self.config.max_retry_attempts} attempts")

    finally:
        self._is_retrying = False
        if self.connection_state != ConnectionState.CONNECTED:
            self._set_connection_state(ConnectionState.DISCONNECTED)
|
|
737
|
+
|
|
738
|
+
def _verify_connection_after_reconnect(self) -> bool:
    """Check that a freshly re-established connection is actually usable.

    After a short settling delay these are verified in order: the socket
    reports open, the event loop exists and is running, ``connected_event``
    is set, and the connection state is CONNECTED.

    Returns:
        True if every check passes; False on the first failing check
        (logged as a warning) or if a check raises (logged as an error).
    """
    try:
        # Give the new connection a moment to settle.
        time.sleep(0.2)

        if not self._is_ws_open():
            failure = f"[验证] WebSocket 状态不是 OPEN"
        elif self._loop is None or not self._loop.is_running():
            failure = f"[验证] 事件循环未运行"
        elif not self.connected_event.is_set():
            failure = f"[验证] connected_event 未设置"
        elif self.connection_state != ConnectionState.CONNECTED:
            failure = f"[验证] 连接状态不是 CONNECTED: {self.connection_state.value}"
        else:
            log_info(f"[验证] ✅ 连接验证通过")
            return True

        log_warning(failure)
        return False

    except Exception as e:
        log_error(f"[验证] 连接验证异常: {e}")
        return False
|
|
780
|
+
|
|
781
|
+
def _perform_system_recovery_check(self) -> None:
    """Collect and log a health snapshot after a reconnect, restarting helpers.

    The snapshot covers the socket, the event loop, the offline queue size,
    the pending stream request count, the cleanup and health-check threads,
    and ``connected_event``. Overall status is HEALTHY only when the socket
    is open, the loop is running and the event is set. Helper threads that
    are not alive are restarted. Any exception is logged and swallowed.
    """
    try:
        ws_logger = get_ws_logger()

        # Gather all probes first, in the same order they are reported.
        ws_open = self._is_ws_open()
        loop_running = self._loop is not None and self._loop.is_running()
        queue_size = self.queue.qsize() if self.queue else 0
        pending_streams = self.get_pending_stream_count()
        cleanup_running = self._cleanup_thread and self._cleanup_thread.is_alive()
        health_check_running = self._health_check_thread and self._health_check_thread.is_alive()
        connected_event_set = self.connected_event.is_set()

        # Helper-thread status does not affect the overall verdict.
        all_ok = (
            ws_open and
            loop_running and
            connected_event_set
        )

        recovery_status = {
            "ws_connection": "OK" if ws_open else "FAILED",
            "event_loop": "OK" if loop_running else "FAILED",
            "message_queue_size": queue_size,
            "message_queue": "OK",
            "pending_stream_requests": pending_streams,
            "cleanup_thread": "OK" if cleanup_running else "RESTARTING",
            "health_check_thread": "OK" if health_check_running else "RESTARTING",
            "connected_event": "OK" if connected_event_set else "FAILED",
            "overall_status": "HEALTHY" if all_ok else "DEGRADED",
        }

        # Record the snapshot in the dedicated WebSocket log.
        ws_logger.log_system_recovery(
            conn_id=self._connection_id,
            recovery_status=recovery_status
        )

        if all_ok:
            log_info(f"✅ [系统恢复] 所有检查通过,系统已完全恢复")
        else:
            log_warning(f"⚠️ [系统恢复] 部分检查未通过: {recovery_status}")

        # Restart helper threads that are not running.
        if not cleanup_running:
            log_info("🔧 重启清理线程...")
            self._start_cleanup_thread()

        if not health_check_running:
            log_info("🔧 重启健康检查线程...")
            self._start_health_check_thread()

    except Exception as e:
        log_error(f"❌ 系统恢复检查失败: {e}")
|
|
851
|
+
|
|
852
|
+
async def _process_queued_messages(self) -> None:
|
|
853
|
+
"""Process messages that were queued during disconnection."""
|
|
854
|
+
try:
|
|
855
|
+
while not self.queue.empty():
|
|
856
|
+
try:
|
|
857
|
+
message = self.queue.get_nowait()
|
|
858
|
+
if self._is_ws_open():
|
|
859
|
+
await self.ws.send(message)
|
|
860
|
+
self.queue.task_done()
|
|
861
|
+
except queue.Empty:
|
|
862
|
+
break
|
|
863
|
+
except Exception as e:
|
|
864
|
+
log_error(f"Failed to send queued message: {e}")
|
|
865
|
+
break
|
|
866
|
+
except Exception as e:
|
|
867
|
+
log_error(f"Error processing queued messages: {e}")
|
|
868
|
+
|
|
869
|
+
def _cleanup_stale_stream_queues(self, owner_conn_id: int) -> None:
    """Thread body that periodically evicts stale stream-queue entries.

    Runs until ``_cleanup_running`` is cleared, shutdown is requested, or
    ``_connection_id`` no longer equals ``owner_conn_id``. Roughly every 30
    seconds, entries of ``stream_queue_map`` older than 15 seconds are
    removed under ``_stream_queue_lock``; afterwards, outside the lock, a
    timeout error item is scheduled onto each evicted entry's queue if that
    queue is empty and the entry carries a loop.

    Args:
        owner_conn_id: Connection id this thread was started for.
    """
    log_info(f"[conn:{owner_conn_id}] 🧹 流队列清理线程已启动")
    cleanup_interval = 30  # seconds between cleanup passes
    last_cleanup_time = time.time()

    while self._cleanup_running and not self._shutdown_requested:
        try:
            # Sleep in short steps so a stop request is noticed quickly.
            time.sleep(1.0)

            # Exit if a newer connection has replaced the one we belong to.
            if self._connection_id != owner_conn_id:
                log_debug(f"[conn:{owner_conn_id}] 清理线程: 连接已被取代,退出")
                break

            if not self._cleanup_running or self._shutdown_requested:
                break

            # Only do the actual work once per cleanup interval.
            now = time.time()
            if now - last_cleanup_time < cleanup_interval:
                continue
            last_cleanup_time = now

            stale_requests = []

            # The map is scanned and modified under the lock.
            with self._stream_queue_lock:
                for request_id, entry in list(self.stream_queue_map.items()):
                    # An entry without a timestamp gets age 0 and is never stale.
                    timestamp = entry.get("timestamp", now)
                    age = now - timestamp

                    if age > 15.0:
                        stale_requests.append({
                            "request_id": request_id,
                            "age": age,
                            "receiver": entry.get("receiver", "unknown"),
                            "entry": entry  # keep the full entry for the notification below
                        })

                # Remove the stale requests while still holding the lock.
                for req in stale_requests:
                    self.stream_queue_map.pop(req["request_id"], None)
                remaining_count = len(self.stream_queue_map)

            # Notifications are sent after the lock has been released.
            if stale_requests:
                log_info(f"🧹 发现 {len(stale_requests)} 个过期流请求,开始清理...")

                for req in stale_requests:
                    request_id = req["request_id"]
                    queue_entry = req["entry"]

                    log_error(f"⚠️ 清理过期流请求: request_id={request_id[:8]}... "
                              f"receiver={req['receiver']} 等待时间={req['age']:.1f}s")

                    try:
                        temp_queue = queue_entry["queue"]
                        loop = queue_entry.get("loop")

                        # Deliver a timeout marker on the entry's own loop,
                        # but only if nothing has been queued for it yet.
                        if temp_queue.empty() and loop:
                            error_data = {"error": "timeout", "message": "流创建超时"}
                            loop.call_soon_threadsafe(temp_queue.put_nowait, error_data)
                    except Exception as e:
                        log_debug(f"清理队列时异常(可忽略): {e}")

                log_info(f"✅ 清理完成,剩余等待请求: {remaining_count}")

        except Exception as e:
            log_error(f"❌ 流队列清理异常: {e}")

    log_info(f"[conn:{owner_conn_id}] 🧹 流队列清理线程已停止")
|
|
942
|
+
|
|
943
|
+
def _start_cleanup_thread(self) -> None:
|
|
944
|
+
"""启动清理线程"""
|
|
945
|
+
# 如果旧线程还在运行,先等待它停止
|
|
946
|
+
if self._cleanup_thread and self._cleanup_thread.is_alive():
|
|
947
|
+
if self._cleanup_running:
|
|
948
|
+
return # 线程正常运行中,不需要重启
|
|
949
|
+
# 等待旧线程结束
|
|
950
|
+
self._cleanup_thread.join(timeout=2.0)
|
|
951
|
+
|
|
952
|
+
self._cleanup_running = True
|
|
953
|
+
|
|
954
|
+
# 传递当前连接 ID
|
|
955
|
+
current_conn_id = self._connection_id
|
|
956
|
+
|
|
957
|
+
self._cleanup_thread = threading.Thread(
|
|
958
|
+
target=self._cleanup_stale_stream_queues,
|
|
959
|
+
args=(current_conn_id,),
|
|
960
|
+
daemon=True,
|
|
961
|
+
name=f"StreamQueueCleanup-{current_conn_id}"
|
|
962
|
+
)
|
|
963
|
+
self._cleanup_thread.start()
|
|
964
|
+
log_debug(f"[conn:{current_conn_id}] 流队列清理线程已启动")
|
|
965
|
+
|
|
966
|
+
def _stop_cleanup_thread(self) -> None:
|
|
967
|
+
"""停止清理线程"""
|
|
968
|
+
if not self._cleanup_thread:
|
|
969
|
+
return
|
|
970
|
+
|
|
971
|
+
self._cleanup_running = False
|
|
972
|
+
|
|
973
|
+
if self._cleanup_thread.is_alive():
|
|
974
|
+
self._cleanup_thread.join(timeout=2.0)
|
|
975
|
+
|
|
976
|
+
self._cleanup_thread = None
|
|
977
|
+
log_debug("流队列清理线程已停止")
|
|
978
|
+
|
|
979
|
+
def _start_health_check_thread(self) -> None:
|
|
980
|
+
"""启动连接健康检查线程"""
|
|
981
|
+
# 如果旧线程还在运行,先等待它停止
|
|
982
|
+
if self._health_check_thread and self._health_check_thread.is_alive():
|
|
983
|
+
if self._health_check_running:
|
|
984
|
+
return # 线程正常运行中,不需要重启
|
|
985
|
+
# 等待旧线程结束
|
|
986
|
+
self._health_check_thread.join(timeout=2.0)
|
|
987
|
+
|
|
988
|
+
self._health_check_running = True
|
|
989
|
+
self._last_pong_time = time.time()
|
|
990
|
+
|
|
991
|
+
# 传递当前连接 ID,让线程知道它属于哪个连接
|
|
992
|
+
current_conn_id = self._connection_id
|
|
993
|
+
|
|
994
|
+
self._health_check_thread = threading.Thread(
|
|
995
|
+
target=self._health_check_loop,
|
|
996
|
+
args=(current_conn_id,),
|
|
997
|
+
daemon=True,
|
|
998
|
+
name=f"WebSocketHealthCheck-{current_conn_id}"
|
|
999
|
+
)
|
|
1000
|
+
self._health_check_thread.start()
|
|
1001
|
+
log_debug(f"[conn:{current_conn_id}] 连接健康检查线程已启动")
|
|
1002
|
+
|
|
1003
|
+
def _stop_health_check_thread(self) -> None:
    """Stop the connection health-check thread.

    The running flag is always cleared. A live thread is joined for up to
    two seconds, then the reference is dropped.
    """
    self._health_check_running = False

    worker = self._health_check_thread
    if worker and worker.is_alive():
        worker.join(timeout=2.0)

    self._health_check_thread = None
    log_debug("连接健康检查线程已停止")
|
|
1012
|
+
|
|
1013
|
+
def _health_check_loop(self, owner_conn_id: int) -> None:
    """Thread body that watches the connection and triggers reconnects.

    Per the original note, the websockets library handles ping/pong itself
    and closes unresponsive connections, so this loop only inspects the
    socket and connection state rather than judging pong timeouts.

    Runs until ``_health_check_running`` is cleared, shutdown is requested,
    or ``_connection_id`` no longer equals ``owner_conn_id``. Every
    ``config.ping_interval * 2`` seconds it checks the state: if it is
    DISCONNECTED, or the socket is not open, pending stream requests are
    notified and ``_handle_reconnection`` is started on a new daemon thread
    unless a reconnection is already in progress.

    Args:
        owner_conn_id: Connection id this thread was started for.
    """
    # Shortened check interval to detect connection problems sooner.
    check_interval = self.config.ping_interval * 2  # changed from *3 to *2 (6 s if ping_interval is 3 — TODO confirm)
    ws_logger = get_ws_logger()

    log_debug(f"[conn:{owner_conn_id}] 健康检查线程启动: 检查间隔={check_interval}s")

    last_check_time = time.time()

    while self._health_check_running and not self._shutdown_requested:
        try:
            # Sleep in short steps so a stop request is noticed quickly.
            time.sleep(1.0)

            # Exit if a newer connection has replaced ours (keeps old
            # threads from lingering).
            if self._connection_id != owner_conn_id:
                log_debug(f"[conn:{owner_conn_id}] 健康检查线程: 连接已被取代 (当前: {self._connection_id}),退出")
                break

            if not self._health_check_running or self._shutdown_requested:
                break

            # Only run the actual check once per check interval.
            now = time.time()
            if now - last_check_time < check_interval:
                continue
            last_check_time = now

            # Re-check the connection id right before inspecting state.
            if self._connection_id != owner_conn_id:
                log_debug(f"[conn:{owner_conn_id}] 健康检查线程: 连接已被取代,退出")
                break

            # Snapshot the current state.
            ws_open = self._is_ws_open()
            conn_state = self.connection_state.value

            # Case 1: the recorded state is already DISCONNECTED.
            if self.connection_state == ConnectionState.DISCONNECTED:
                log_debug(f"[conn:{owner_conn_id}] 健康检查: 检测到连接状态为 DISCONNECTED")
                # The dedicated log is written only when a reconnect is triggered.
                ws_logger.log_health_check(
                    conn_id=owner_conn_id,
                    ws_open=ws_open,
                    connection_state=conn_state,
                    action="trigger_reconnect_state_disconnected"
                )
                # Notify waiting requests before triggering the reconnect.
                self._notify_pending_stream_requests("健康检查检测到连接断开")
                if not self._is_retrying:
                    threading.Thread(target=self._handle_reconnection, daemon=True).start()
                continue

            # Case 2: the socket itself is no longer open.
            if not ws_open:
                log_debug(f"[conn:{owner_conn_id}] 健康检查: WebSocket 连接已关闭")
                # The dedicated log is written only when a reconnect is triggered.
                ws_logger.log_health_check(
                    conn_id=owner_conn_id,
                    ws_open=ws_open,
                    connection_state=conn_state,
                    action="trigger_reconnect_ws_closed"
                )
                # Notify waiting requests before triggering the reconnect.
                self._notify_pending_stream_requests("健康检查检测到WebSocket关闭")
                self._set_connection_state(ConnectionState.DISCONNECTED)
                if not self._is_retrying:
                    threading.Thread(target=self._handle_reconnection, daemon=True).start()
                continue

            # Healthy: refresh the pong timestamp (statistics only, not used
            # to detect disconnects). Nothing is logged, to keep volume low.
            self._last_pong_time = time.time()

        except Exception as e:
            log_error(f"[conn:{owner_conn_id}] 健康检查异常: {e}")

    log_debug(f"[conn:{owner_conn_id}] 健康检查线程已退出")
|
|
1096
|
+
|
|
1097
|
+
def _ws_handler(self, conn_id: int) -> None:
    """Thread entry point: run one connection's asyncio loop to completion.

    Creates a new event loop, publishes it as ``self._loop`` and runs
    ``_ws_connect_and_receive(conn_id)`` on it. Errors whose message
    mentions interpreter shutdown set ``_shutdown_requested``; other errors
    are only logged at debug level.

    On exit, the shared state is reset to DISCONNECTED (and ``self.ws``
    cleared) only if this handler still owns the current connection id.
    Pending tasks are then cancelled and the loop is closed.

    Args:
        conn_id: Id of the connection this handler thread serves.
    """
    loop = None
    try:
        # Do nothing while the interpreter is shutting down.
        import sys
        if hasattr(sys, 'is_finalizing') and sys.is_finalizing():
            log_debug(f"[conn:{conn_id}] Interpreter is shutting down, skipping connection")
            self._shutdown_requested = True
            return

        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        self._loop = loop

        loop.run_until_complete(self._ws_connect_and_receive(conn_id))

    except RuntimeError as e:
        error_str = str(e).lower()
        # Detect errors caused by interpreter shutdown.
        if "interpreter shutdown" in error_str or "cannot schedule" in error_str:
            log_warning(f"[conn:{conn_id}] Interpreter shutting down, stopping reconnection")
            self._shutdown_requested = True  # prevents further reconnects
        else:
            log_debug(f"[conn:{conn_id}] WebSocket handler RuntimeError: {e}")
    except Exception as e:
        error_str = str(e).lower()
        # Shutdown-related errors may also arrive as other exception types.
        if "interpreter shutdown" in error_str or "cannot schedule" in error_str:
            log_warning(f"[conn:{conn_id}] Interpreter shutting down, stopping reconnection")
            self._shutdown_requested = True
        else:
            log_debug(f"[conn:{conn_id}] WebSocket handler error: {e}")
    finally:
        # Only the handler of the current connection may reset shared state.
        with self.lock:
            if self._connection_id == conn_id:
                log_debug(f"[conn:{conn_id}] Handler exiting, setting DISCONNECTED")
                self._connection_state = ConnectionState.DISCONNECTED
                self._connecting_since = 0.0
                self._connecting_conn_id = 0
                self.connected_event.clear()
                self.ws = None
            else:
                log_debug(f"[conn:{conn_id}] Handler exiting, but superseded by conn:{self._connection_id}")

        # Shut the event loop down safely; failures here are ignored.
        if loop and not loop.is_closed():
            try:
                # Tasks can only be cancelled safely while the loop is not running.
                if not loop.is_running():
                    # Cancel every pending task.
                    pending = asyncio.all_tasks(loop)
                    for task in pending:
                        task.cancel()

                    # Let the cancellations run to completion.
                    if pending:
                        loop.run_until_complete(
                            asyncio.gather(*pending, return_exceptions=True)
                        )

                # Close the loop.
                if not loop.is_closed():
                    loop.close()
            except Exception:
                pass
|
|
1164
|
+
|
|
1165
|
+
async def _ws_connect_and_receive(self, conn_id: int) -> None:
    """Open the WebSocket for ``conn_id`` and pump incoming messages until it ends.

    The coroutine builds the TLS and proxy settings, connects, publishes the
    socket on ``self.ws``, starts the helper threads, calls the handler's
    ``on_open``, flushes queued outgoing messages and then loops on ``recv()``,
    handing every message to the handler's ``on_message``.

    Every exit path except "superseded by a newer connection" ends in
    ``_handle_connection_close``. Proxy environment variables that were removed
    for the handshake are restored on every path, including failed handshakes.

    Args:
        conn_id: Identifier of this connection attempt. As soon as
            ``self._connection_id`` differs from it, the attempt stops without
            touching the shared connection state.
    """
    ssl_context = None
    if self.ws_url and self.ws_url.startswith("wss://"):
        # NOTE(review): hostname and certificate verification are switched off, so
        # wss:// connections are not protected against man-in-the-middle attacks.
        # Left unchanged for compatibility; confirm whether this is still required.
        ssl_context = ssl.create_default_context()
        ssl_context.check_hostname = False
        ssl_context.verify_mode = ssl.CERT_NONE

    # Local URLs always connect directly so a global proxy/VPN cannot hijack them.
    use_proxy = self._get_use_system_proxy() and (not is_local_url(self.ws_url))
    saved_proxy_env = None

    def restore_saved_proxy_env() -> None:
        """Put back the proxy variables removed for the handshake (idempotent)."""
        nonlocal saved_proxy_env
        if saved_proxy_env:
            restore_proxy_env(saved_proxy_env)
            saved_proxy_env = None

    # Diagnostics are initialised up front so the exception handlers can report
    # them even when the failure happens before the receive loop starts.
    loop_start_time = None
    messages_received = 0
    recent_msg_types = []  # command names of the most recent messages
    max_msg_size = 0       # largest message seen, in len() units
    total_bytes = 0        # cumulative size of all received messages
    large_msg_count = 0    # messages above 1 MiB, including discarded oversized ones

    try:
        import platform

        if not use_proxy:
            # Temporarily drop the proxy variables (for this handshake only) so a
            # direct connection is guaranteed.
            saved_proxy_env = pop_proxy_env()

        # Arguments for websockets.connect. On macOS (Darwin) websockets 14.2+ was
        # observed to reject the ``proxy`` keyword with "BaseEventLoop.create_connection()
        # got an unexpected keyword argument 'proxy'", so it is only passed elsewhere.
        ws_connect_kwargs = {
            "ssl": ssl_context,
            "open_timeout": self.config.connection_timeout,
            "ping_interval": self.config.ping_interval,
            "ping_timeout": self.config.ping_interval * 10,
            "close_timeout": 5,
            "max_size": None,  # no protocol-level limit; oversized messages are handled below
            "compression": "deflate",  # negotiate the compression extension with the server
        }

        if platform.system() != "Darwin":
            # Explicitly disable proxying (complements the cleared environment).
            ws_connect_kwargs["proxy"] = None

        async with websockets.connect(
            self.ws_url,
            **ws_connect_kwargs
        ) as ws:
            # Handshake done: restore the proxy variables right away so other
            # requests in this process are not affected.
            restore_saved_proxy_env()

            # Make sure this attempt has not been replaced while connecting.
            with self.lock:
                if self._connection_id != conn_id:
                    log_debug(f"[conn:{conn_id}] Connection superseded by conn:{self._connection_id}, closing")
                    ws_logger = get_ws_logger()
                    ws_logger.log_connection_superseded(conn_id, self._connection_id, "_ws_connect_and_receive:after_connect")
                    return

                self.ws = ws

            # Connection established.
            log_info(f"[conn:{conn_id}] WebSocket connection established")
            self._set_connection_state(ConnectionState.CONNECTED)
            with self.lock:
                self._is_retrying = False
                self._reconnect_attempt_count = 0
                self._current_reconnect_interval = self.config.reconnect_base_interval
                self._last_pong_time = time.time()

            # Record the new connection in the dedicated WebSocket log.
            ws_logger = get_ws_logger()
            ws_logger.log_connection_established(
                conn_id=conn_id,
                ws_url=self.ws_url,
                extra_info={
                    "agent_id": self.agent_id,
                    "ping_interval": self.config.ping_interval,
                    "has_handler": self.message_handler is not None
                }
            )

            # Start the helper threads; a failure here must not abort the connection.
            try:
                self._start_cleanup_thread()
                ws_logger.log_helper_thread(conn_id, "cleanup", "started")
            except Exception as e:
                log_error(f"[conn:{conn_id}] 启动清理线程失败: {e}")
                ws_logger.log_helper_thread(conn_id, "cleanup", "start_failed", success=False, error=str(e))

            try:
                self._start_health_check_thread()
                ws_logger.log_helper_thread(conn_id, "health_check", "started")
            except Exception as e:
                log_error(f"[conn:{conn_id}] 启动健康检查线程失败: {e}")
                ws_logger.log_helper_thread(conn_id, "health_check", "start_failed", success=False, error=str(e))

            # Let the message handler know the connection is open.
            if self.message_handler and hasattr(self.message_handler, "on_open"):
                try:
                    self.message_handler.on_open(ws)
                    ws_logger.log_on_open_callback(
                        conn_id=conn_id,
                        success=True,
                        handler_type=type(self.message_handler).__name__
                    )
                except Exception as e:
                    log_exception(f"[conn:{conn_id}] Error in message handler on_open: {e}")
                    ws_logger.log_on_open_callback(
                        conn_id=conn_id,
                        success=False,
                        error=str(e),
                        handler_type=type(self.message_handler).__name__
                    )

            # Send whatever was queued while disconnected.
            await self._process_queued_messages()

            # Receive loop bookkeeping.
            loop_start_time = time.time()
            last_stats_time = time.time()
            stats_interval = 60.0  # seconds between statistics records
            max_recent = 20        # how many recent message types to remember
            interval_bytes = 0     # bytes received since the last statistics record

            # recv() is used instead of ``async for`` so that a protocol error on a
            # single frame can be caught right where it happens.
            protocol_error_count = 0  # reserved-bit (RSV) errors, for logging only

            while True:
                # Stop as soon as a newer connection has taken over.
                if self._connection_id != conn_id:
                    log_debug(f"[conn:{conn_id}] Connection superseded, exiting message loop")
                    ws_logger.log_connection_superseded(conn_id, self._connection_id, "message_loop")
                    ws_logger.log_message_loop_exit(
                        conn_id=conn_id,
                        reason="connection_superseded",
                        messages_received=messages_received,
                        duration=time.time() - loop_start_time
                    )
                    return

                # The connection object exposes ``state`` (compared with WsState.OPEN).
                if ws.state != WsState.OPEN:
                    log_debug(f"[conn:{conn_id}] WebSocket connection not open (state={ws.state}), exiting message loop")
                    break

                try:
                    message = await ws.recv()
                    protocol_error_count = 0  # a successful receive resets the counter

                except ProtocolError as e:
                    error_str = str(e).lower()
                    if "reserved bits" in error_str or "rsv" in error_str:
                        protocol_error_count += 1
                        log_warning(f"[conn:{conn_id}] ⚠️ RSV 位错误 (第 {protocol_error_count} 次): {e}")
                        ws_logger.log_abnormal_data(
                            conn_id=conn_id,
                            data=None,
                            error=f"RSV位错误: {e}",
                            data_type="rsv_bit_error"
                        )

                        # The connection cannot continue after an RSV error; convert it
                        # into ConnectionClosedError so the outer handler cleans up and
                        # reconnects through the normal path.
                        log_info(f"[conn:{conn_id}] RSV 位错误导致连接关闭,触发快速重连")
                        from websockets.frames import Close
                        raise ConnectionClosedError(
                            Close(1006, f"RSV位错误: {str(e)[:80]}"),
                            None
                        ) from e
                    else:
                        # Any other protocol error is handled by the outer handler.
                        raise

                except ConnectionClosed:
                    # Leave the loop and let the outer handler deal with the close.
                    raise

                try:
                    self._last_pong_time = time.time()
                    self._set_connection_state(ConnectionState.CONNECTED)
                    messages_received += 1

                    # Track message sizes for diagnostics.
                    msg_size = len(message) if message else 0
                    total_bytes += msg_size
                    interval_bytes += msg_size
                    if msg_size > max_msg_size:
                        max_msg_size = msg_size

                    # Application-level size limit: drop the message but keep the
                    # connection alive.
                    if msg_size > self.config.max_message_size:
                        large_msg_count += 1
                        log_error(f"[conn:{conn_id}] ❌ 收到超大消息,已丢弃: {msg_size/1024/1024:.1f}MB > {self.config.max_message_size/1024/1024:.0f}MB 限制")
                        # Only the size is logged, never the content.
                        ws_logger.log_abnormal_data(
                            conn_id=conn_id,
                            data=None,
                            error=f"消息大小 {msg_size/1024/1024:.2f}MB ({msg_size} bytes) 超过限制 {self.config.max_message_size/1024/1024:.0f}MB,已丢弃",
                            data_type="oversized_message_discarded"
                        )
                        continue

                    if msg_size > 1 * 1024 * 1024:  # > 1 MiB
                        large_msg_count += 1
                        log_warning(f"[conn:{conn_id}] ⚠️ 收到大消息: {msg_size/1024/1024:.1f}MB")

                    # Periodic statistics (every ``stats_interval`` seconds).
                    now = time.time()
                    if now - last_stats_time >= stats_interval:
                        interval_time = now - last_stats_time
                        avg_msg_size = total_bytes / messages_received if messages_received > 0 else 0
                        # Throughput is computed from the bytes of this interval only;
                        # dividing the cumulative total by the interval would inflate
                        # the figure the longer the connection lives.
                        throughput_kb = (interval_bytes / 1024) / interval_time  # KB/s

                        # Flag abnormally high traffic.
                        if throughput_kb > 10000:  # > ~10 MB/s
                            log_error(f"[conn:{conn_id}] ⚠️ 异常高流量: {throughput_kb:.0f}KB/s, 平均消息大小: {avg_msg_size/1024:.1f}KB")

                        ws_logger.log_message_received(
                            conn_id=conn_id,
                            message_type="stats",
                            message_size=0,
                            cmd=None,
                            extra_info={
                                "total_messages": messages_received,
                                "interval_seconds": int(interval_time),
                                "loop_duration": int(now - loop_start_time),
                                "avg_msg_size_kb": f"{avg_msg_size/1024:.1f}",
                                "throughput_kb_s": f"{throughput_kb:.0f}",
                                "total_bytes_mb": f"{total_bytes/1024/1024:.1f}",
                                "large_msg_count": large_msg_count
                            }
                        )
                        last_stats_time = now
                        interval_bytes = 0

                    if isinstance(message, bytes):
                        # Binary frame: try to interpret it as UTF-8 text.
                        try:
                            message = message.decode('utf-8')
                        except UnicodeDecodeError as e:
                            ws_logger.log_abnormal_data(
                                conn_id=conn_id,
                                data=message,
                                error=f"二进制消息解码失败: {e}",
                                data_type="binary"
                            )
                            log_warning(f"[conn:{conn_id}] Failed to decode binary message (discarded): {e}")
                            continue

                    # Remember the message's command name for close-time diagnostics.
                    msg_cmd = "unknown"
                    try:
                        msg_json = json.loads(message) if isinstance(message, str) else {}
                        msg_cmd = msg_json.get("cmd", "no_cmd")
                    except Exception:
                        msg_cmd = "parse_error"

                    recent_msg_types.append(msg_cmd)
                    if len(recent_msg_types) > max_recent:
                        recent_msg_types.pop(0)

                    # Dispatch to the message handler.
                    if self.message_handler and hasattr(self.message_handler, "on_message"):
                        try:
                            self.message_handler.on_message(ws, message)
                        except Exception as e:
                            ws_logger.log_message_error(
                                conn_id=conn_id,
                                message=message,
                                error=str(e)
                            )
                            log_exception(f"[conn:{conn_id}] Error in message handler: {e}")

                except Exception as e:
                    # A failure while processing one message must not end the loop.
                    ws_logger.log_abnormal_data(
                        conn_id=conn_id,
                        data=message,
                        error=f"消息处理异常: {e}",
                        data_type="unknown"
                    )
                    log_warning(f"[conn:{conn_id}] Error processing message (discarded): {e}")
                    continue

            # The loop ended because the socket is no longer open.
            log_debug(f"[conn:{conn_id}] WebSocket message loop ended normally")
            ws_logger.log_message_loop_exit(
                conn_id=conn_id,
                reason="loop_ended_normally",
                messages_received=messages_received,
                duration=time.time() - loop_start_time
            )
            self._handle_connection_close(conn_id, None, "connection ended")

    except ConnectionClosed as e:
        restore_saved_proxy_env()
        # Zero duration means the close happened before the receive loop started.
        connection_duration = time.time() - loop_start_time if loop_start_time is not None else 0

        log_warning(f"[conn:{conn_id}] WebSocket connection closed: code={e.code}, reason={e.reason}, "
                    f"duration={connection_duration:.1f}s, messages={messages_received}, max_size={max_msg_size/1024:.1f}KB")

        # Record the close, with diagnostics, in the dedicated WebSocket log.
        ws_logger = get_ws_logger()
        ws_logger.log_connection_closed(
            conn_id=conn_id,
            code=e.code,
            reason=e.reason or "(empty)",
            connection_duration=connection_duration,
            messages_received=messages_received,
            last_pong_time=self._last_pong_time,
            extra_info={
                "ws_url": self.ws_url[:80] if self.ws_url else "N/A",
                "agent_id": self.agent_id,
                "code_meaning": self._get_close_code_meaning(e.code),
                "recent_msg_types": recent_msg_types[-10:] if recent_msg_types else [],
                "max_msg_size_kb": f"{max_msg_size/1024:.1f}",
                "total_bytes_kb": f"{total_bytes/1024:.1f}",
                "large_msg_count": large_msg_count,
                "exception_type": type(e).__name__,
                "exception_detail": str(e)[:200] if str(e) else "(none)"
            }
        )
        self._handle_connection_close(conn_id, e.code, e.reason)

    except asyncio.TimeoutError:
        restore_saved_proxy_env()
        log_warning(f"[conn:{conn_id}] WebSocket connection timeout")
        self._handle_connection_close(conn_id, None, "timeout")

    except PayloadTooBig as e:
        restore_saved_proxy_env()
        # Defensive only: with max_size=None this is not expected to fire.
        log_error(f"[conn:{conn_id}] ❌ 收到的消息太大,超过限制: {e}")
        ws_logger = get_ws_logger()
        ws_logger.log_abnormal_data(
            conn_id=conn_id,
            data=None,
            error=f"PayloadTooBig: {e}",
            data_type="payload_too_big"
        )
        self._handle_connection_close(conn_id, None, f"消息太大: {e}")

    except ProtocolError as e:
        restore_saved_proxy_env()
        # Protocol-level failure (invalid frame and similar).
        log_error(f"[conn:{conn_id}] ❌ WebSocket 协议错误: {e}")
        ws_logger = get_ws_logger()
        ws_logger.log_abnormal_data(
            conn_id=conn_id,
            data=None,
            error=f"ProtocolError: {e}",
            data_type="protocol_error"
        )
        # A misbehaving server: back off more aggressively before reconnecting.
        # NOTE(review): _handle_connection_close(code=None) goes on to call
        # _partial_reset_for_reconnect, which sets the interval back to the base
        # value, so this backoff looks like it is overwritten - confirm intent.
        self._current_reconnect_interval = min(
            self._current_reconnect_interval * 3,
            self.config.reconnect_max_interval
        )
        self._handle_connection_close(conn_id, None, f"协议错误: {e}")

    except InvalidMessage as e:
        restore_saved_proxy_env()
        # Malformed message.
        log_error(f"[conn:{conn_id}] ❌ 无效的 WebSocket 消息: {e}")
        ws_logger = get_ws_logger()
        ws_logger.log_abnormal_data(
            conn_id=conn_id,
            data=None,
            error=f"InvalidMessage: {e}",
            data_type="invalid_message"
        )
        self._handle_connection_close(conn_id, None, f"无效消息: {e}")

    except Exception as e:
        restore_saved_proxy_env()
        error_str = str(e)

        # Heuristic detection of the server's connection-limit rejection.
        is_rate_limit = (
            "400" in error_str or
            "超过连接数限制" in error_str or
            "connection limit" in error_str.lower()
        )

        if is_rate_limit:
            # Throttle this warning so repeated rejections do not flood the log.
            current_time = time.time()
            if current_time - MessageClient._last_rate_limit_log_time > MessageClient._rate_limit_log_interval:
                MessageClient._last_rate_limit_log_time = current_time
                log_warning(f"[conn:{conn_id}] WebSocket rate limit: 超过连接数限制")
            self._current_reconnect_interval = min(
                self._current_reconnect_interval * 2,
                self.config.reconnect_max_interval
            )
        else:
            log_debug(f"[conn:{conn_id}] WebSocket connection error: {e}")
            ws_logger = get_ws_logger()
            ws_logger.log_abnormal_data(
                conn_id=conn_id,
                data=None,
                error=f"WebSocket异常: {error_str}",
                data_type="exception"
            )

        self._handle_connection_close(conn_id, None, str(e))

    finally:
        # Safety net for paths no handler covers (for example task cancellation).
        restore_saved_proxy_env()
|
|
1589
|
+
|
|
1590
|
+
def _handle_connection_close(self, conn_id: int, code: Optional[int], reason: str, received_data: any = None) -> None:
|
|
1591
|
+
"""Handle connection close event."""
|
|
1592
|
+
# 检查连接ID是否仍然有效
|
|
1593
|
+
is_current_connection = False
|
|
1594
|
+
current_conn_id = 0
|
|
1595
|
+
with self.lock:
|
|
1596
|
+
current_conn_id = self._connection_id
|
|
1597
|
+
if self._connection_id != conn_id:
|
|
1598
|
+
log_warning(f"[conn:{conn_id}] 连接已被取代 (当前: {self._connection_id}),仍执行清理")
|
|
1599
|
+
# ✅ 不直接 return,异常断开时仍需清理
|
|
1600
|
+
else:
|
|
1601
|
+
is_current_connection = True
|
|
1602
|
+
log_info(f"[conn:{conn_id}] 当前连接断开: code={code}, reason={reason}")
|
|
1603
|
+
self._connection_state = ConnectionState.DISCONNECTED
|
|
1604
|
+
self._connecting_since = 0.0
|
|
1605
|
+
self._connecting_conn_id = 0
|
|
1606
|
+
self.connected_event.clear()
|
|
1607
|
+
self.ws = None
|
|
1608
|
+
|
|
1609
|
+
# ✅ 记录到专用 WebSocket 日志(无论是否是当前连接)
|
|
1610
|
+
with self._stream_queue_lock:
|
|
1611
|
+
pending_count = len(self.stream_queue_map)
|
|
1612
|
+
|
|
1613
|
+
ws_logger = get_ws_logger()
|
|
1614
|
+
|
|
1615
|
+
# ✅ 记录连接被取代事件
|
|
1616
|
+
if not is_current_connection:
|
|
1617
|
+
ws_logger.log_connection_superseded(conn_id, current_conn_id, "_handle_connection_close")
|
|
1618
|
+
|
|
1619
|
+
try:
|
|
1620
|
+
ws_logger.log_disconnect(
|
|
1621
|
+
conn_id=conn_id,
|
|
1622
|
+
reason=reason,
|
|
1623
|
+
code=code,
|
|
1624
|
+
received_data=received_data,
|
|
1625
|
+
pending_requests=pending_count,
|
|
1626
|
+
extra_info={
|
|
1627
|
+
"agent_id": self.agent_id,
|
|
1628
|
+
"server_url": self.server_url,
|
|
1629
|
+
"is_current_connection": is_current_connection,
|
|
1630
|
+
"current_conn_id": current_conn_id
|
|
1631
|
+
}
|
|
1632
|
+
)
|
|
1633
|
+
except Exception as e:
|
|
1634
|
+
log_error(f"记录 WebSocket 断开日志失败: {e}")
|
|
1635
|
+
|
|
1636
|
+
# ✅ 修复:连接断开时立即通知所有等待中的 stream 请求
|
|
1637
|
+
self._notify_pending_stream_requests(f"连接断开: {reason}")
|
|
1638
|
+
|
|
1639
|
+
# ✅ 只有当前连接断开时才执行重置和重连(旧连接断开不处理,因为已有新连接)
|
|
1640
|
+
if not is_current_connection:
|
|
1641
|
+
log_debug(f"[conn:{conn_id}] 旧连接断开,跳过重置和重连(当前连接: {self._connection_id})")
|
|
1642
|
+
return
|
|
1643
|
+
|
|
1644
|
+
# ✅ 触发断开回调通知外部(仅当前连接的异常断开才通知)
|
|
1645
|
+
if code != 1000 and self._on_disconnect_callback:
|
|
1646
|
+
try:
|
|
1647
|
+
log_info(f"[conn:{conn_id}] 触发断开回调通知外部...")
|
|
1648
|
+
self._on_disconnect_callback(
|
|
1649
|
+
agent_id=self.agent_id,
|
|
1650
|
+
server_url=self.server_url,
|
|
1651
|
+
code=code,
|
|
1652
|
+
reason=reason
|
|
1653
|
+
)
|
|
1654
|
+
except Exception as e:
|
|
1655
|
+
log_error(f"[conn:{conn_id}] 断开回调执行异常: {e}")
|
|
1656
|
+
|
|
1657
|
+
# ✅ 异常断开时执行完全重置(模拟重启应用的效果)
|
|
1658
|
+
# 注意:_full_reset 会清理状态,但不能在当前线程(WebSocket线程)中停止事件循环
|
|
1659
|
+
need_full_reset = code == 1006 or code == 1002 or code is None or "400" in str(reason) or "protocol" in str(reason).lower()
|
|
1660
|
+
if need_full_reset:
|
|
1661
|
+
log_warning(f"[conn:{conn_id}] 检测到异常断开(code={code}),执行部分重置...")
|
|
1662
|
+
# ✅ 修复:不调用 _full_reset(会尝试停止当前线程的事件循环导致问题)
|
|
1663
|
+
# 只清理必要的状态,让 _handle_reconnection 处理重连
|
|
1664
|
+
self._partial_reset_for_reconnect(conn_id)
|
|
1665
|
+
|
|
1666
|
+
if not self._shutdown_requested and self.config.auto_reconnect:
|
|
1667
|
+
if code != 1000: # 非正常关闭
|
|
1668
|
+
with self.lock:
|
|
1669
|
+
if not self._is_retrying:
|
|
1670
|
+
log_debug(f"[conn:{conn_id}] Triggering reconnection")
|
|
1671
|
+
# ✅ 给重连一点时间让当前线程完成清理
|
|
1672
|
+
def delayed_reconnect():
|
|
1673
|
+
time.sleep(0.5) # 等待当前 WebSocket 线程完全结束
|
|
1674
|
+
self._handle_reconnection()
|
|
1675
|
+
threading.Thread(target=delayed_reconnect, daemon=True, name=f"Reconnect-{conn_id}").start()
|
|
1676
|
+
|
|
1677
|
+
def _full_reset(self, conn_id: int) -> None:
    """Reset the whole connection state, similar in effect to restarting the app.

    Clears the connection id and state, signals the helper threads to stop,
    drains the outgoing queue, clears the stream-request map, closes the old
    WebSocket, stops the old event loop, joins the old threads (never the
    calling thread) and resets the reconnect bookkeeping. Failures are logged
    and never raised.

    Args:
        conn_id: Connection identifier used to tag the log output.
    """
    ws_logger = get_ws_logger()
    log_warning(f"[conn:{conn_id}] ========== 开始完全重置 ==========")
    ws_logger.log_full_reset_detail(conn_id, "start", "开始完全重置流程")

    try:
        # 0. Reset the connection state first so no other thread builds on it.
        old_conn_id = 0
        with self.lock:
            old_conn_id = self._connection_id
            self._connection_id = 0
            self._is_retrying = False
            self._connection_state = ConnectionState.DISCONNECTED
            self._connecting_since = 0.0
            self._connecting_conn_id = 0
            self.connected_event.clear()
        log_info(f"[conn:{conn_id}] ✅ 连接ID重置: {old_conn_id} → 0")
        ws_logger.log_full_reset_detail(conn_id, "reset_conn_id", f"old={old_conn_id} -> new=0")

        # 1. Ask the helper threads to stop so they cannot interfere.
        log_info(f"[conn:{conn_id}] 🛑 停止辅助线程...")
        self._cleanup_running = False
        self._health_check_running = False
        ws_logger.log_full_reset_detail(conn_id, "stop_threads", "设置线程停止标志")

        # 2. Drain the outgoing message queue.
        queue_size = self.queue.qsize()
        cleared_count = 0
        while not self.queue.empty():
            try:
                self.queue.get_nowait()
                self.queue.task_done()
                cleared_count += 1
            except queue.Empty:
                break

        log_info(f"[conn:{conn_id}] 🧹 清空消息队列: {cleared_count}/{queue_size} 条消息已丢弃")
        ws_logger.log_full_reset_detail(conn_id, "clear_queue", f"cleared={cleared_count}, total={queue_size}")

        # 3. Clear the stream-request map (no notification is sent from here).
        with self._stream_queue_lock:
            stream_count = len(self.stream_queue_map)
            self.stream_queue_map.clear()
        log_info(f"[conn:{conn_id}] 🧹 清空流请求映射: {stream_count} 个请求已清理")
        ws_logger.log_full_reset_detail(conn_id, "clear_streams", f"cleared={stream_count}")

        # 4. Close the old WebSocket on its own event loop (best effort).
        old_ws = self.ws
        old_loop = self._loop
        if old_loop and old_ws:
            try:
                if old_loop.is_running():
                    future = asyncio.run_coroutine_threadsafe(
                        self._graceful_close_ws(old_ws),
                        old_loop
                    )
                    try:
                        future.result(timeout=1.0)
                    except Exception as close_err:
                        # Best effort: a timeout or close failure must not stop the reset.
                        log_debug(f"[conn:{conn_id}] Graceful close of old WebSocket did not complete (ignored): {close_err!r}")
            except Exception as schedule_err:
                log_debug(f"[conn:{conn_id}] Could not schedule close of old WebSocket (ignored): {schedule_err!r}")
            log_info(f"[conn:{conn_id}] 🔌 旧 WebSocket 连接已关闭")
            ws_logger.log_full_reset_detail(conn_id, "close_ws", "旧WebSocket已关闭")

        # 5. Stop the old event loop.
        if old_loop and old_loop.is_running():
            try:
                old_loop.call_soon_threadsafe(old_loop.stop)
                log_info(f"[conn:{conn_id}] ⏹️ 旧事件循环已停止")
                ws_logger.log_full_reset_detail(conn_id, "stop_loop", "事件循环已停止")
            except Exception as stop_err:
                log_debug(f"[conn:{conn_id}] Could not stop old event loop (ignored): {stop_err!r}")

        # 6. Wait for the old threads; joining the current thread would deadlock.
        current_thread = threading.current_thread()
        if self._cleanup_thread and self._cleanup_thread.is_alive() and self._cleanup_thread != current_thread:
            self._cleanup_thread.join(timeout=1.0)
        if self._health_check_thread and self._health_check_thread.is_alive() and self._health_check_thread != current_thread:
            self._health_check_thread.join(timeout=1.0)
        # The WebSocket thread is often the caller itself, hence the same guard.
        if self.ws_thread and self.ws_thread.is_alive() and self.ws_thread != current_thread:
            self.ws_thread.join(timeout=1.0)
        ws_logger.log_full_reset_detail(conn_id, "join_threads", "等待旧线程结束完成")

        # 7. Drop every reference to the old connection objects.
        with self.lock:
            self.ws = None
            self._loop = None
            self.ws_thread = None
            self._cleanup_thread = None
            self._health_check_thread = None
        ws_logger.log_full_reset_detail(conn_id, "clear_refs", "清空所有引用")

        # 8. Reset the reconnect bookkeeping.
        self._reconnect_attempt_count = 0
        self._current_reconnect_interval = self.config.reconnect_base_interval
        self._last_pong_time = 0
        ws_logger.log_full_reset_detail(conn_id, "reset_reconnect", "重置重连状态")

        # 9. Record the summary.
        ws_logger.log_full_reset(
            conn_id=conn_id,
            queue_cleared=cleared_count,
            streams_cleared=stream_count
        )

        log_info(f"[conn:{conn_id}] ✅ 完全重置完成,系统状态已清理,准备重新连接")
        ws_logger.log_full_reset_detail(conn_id, "complete", "完全重置流程完成")

    except Exception as e:
        import traceback
        error_detail = traceback.format_exc()
        log_error(f"[conn:{conn_id}] ❌ 完全重置失败: {e}\n{error_detail}")
        ws_logger.log_full_reset_detail(conn_id, "error", f"重置失败: {str(e)}")
|
|
1796
|
+
|
|
1797
|
+
def _partial_reset_for_reconnect(self, conn_id: int) -> None:
    """Prepare for a reconnect after an abnormal close without touching threads or loops.

    In contrast to ``_full_reset`` this method never stops an event loop and
    never joins a thread. It only rewinds the reconnect counters, empties the
    stream-request map, lowers the helper-thread run flags and marks the
    connection as disconnected. Queued outgoing messages and the ``ws`` /
    ``_loop`` references are left alone.

    Args:
        conn_id: Connection identifier used to tag the log output.
    """
    detail_logger = get_ws_logger()
    log_info(f"[conn:{conn_id}] 🔄 开始部分重置(为重连准备)...")

    try:
        # Start the reconnect schedule over from the beginning.
        self._reconnect_attempt_count = 0
        self._current_reconnect_interval = self.config.reconnect_base_interval

        # Empty the stream-request map, remembering how many entries it held.
        with self._stream_queue_lock:
            stream_count = len(self.stream_queue_map)
            self.stream_queue_map.clear()
        if stream_count:
            log_info(f"[conn:{conn_id}] 🧹 清空流请求映射: {stream_count} 个请求已清理")

        # The outgoing queue is only inspected, never drained, at this point.
        queue_size = self.queue.qsize()
        if queue_size:
            log_info(f"[conn:{conn_id}] 📦 消息队列有 {queue_size} 条待发送消息,重连后自动发送")

        # Lower the run flags; the helper threads are expected to exit by themselves.
        self._cleanup_running = self._health_check_running = False

        # Mark the connection as down and allow a new retry; ws/_loop stay as they are.
        with self.lock:
            self._connection_state = ConnectionState.DISCONNECTED
            self._connecting_since = 0.0
            self._connecting_conn_id = 0
            self.connected_event.clear()
            self._is_retrying = False

        log_info(f"[conn:{conn_id}] ✅ 部分重置完成,准备重连")
        detail_logger.log_full_reset_detail(conn_id, "partial_reset_complete", "部分重置完成,准备重连")

    except Exception as e:
        log_error(f"[conn:{conn_id}] ❌ 部分重置异常: {e}")
|
|
1846
|
+
|
|
1847
|
+
def _notify_pending_stream_requests(self, reason: str) -> None:
    """Tell every waiting stream request that the connection was lost.

    The pending map is copied and cleared under the lock. Afterwards, outside
    the lock, a ``connection_lost`` error item is put on each request's queue
    through that request's own event loop, so waiters can fail fast instead of
    running into their timeout.

    Args:
        reason: Description of the disconnect, embedded in the error message.
    """
    # Snapshot and clear under the lock, then release it before notifying.
    with self._stream_queue_lock:
        if not self.stream_queue_map:
            return
        pending_items = list(self.stream_queue_map.items())
        self.stream_queue_map.clear()

    pending_count = len(pending_items)
    log_warning(f"🔔 通知 {pending_count} 个等待中的流请求: {reason}")

    notified_count = 0
    failed_count = 0
    for request_id, queue_entry in pending_items:
        try:
            temp_queue = queue_entry.get("queue")
            loop = queue_entry.get("loop")
            receiver = queue_entry.get("receiver", "unknown")

            if not (temp_queue and loop):
                # Nothing to deliver to; count it so the summary adds up.
                failed_count += 1
                log_debug(f"Stream request has no queue/loop, cannot notify: request_id={request_id}")
                continue

            error_data = {
                "error": "connection_lost",
                "message": f"WebSocket 连接断开: {reason},请重试"
            }
            try:
                # Only a running loop can deliver the item.
                if loop.is_running():
                    # Hand the item over in a thread-safe way.
                    loop.call_soon_threadsafe(temp_queue.put_nowait, error_data)
                    notified_count += 1
                    log_debug(f"📢 已通知: request_id={request_id[:8]}... receiver={receiver}")
                else:
                    failed_count += 1
                    log_debug(f"事件循环已停止,跳过: request_id={request_id[:8]}...")
            except RuntimeError as e:
                failed_count += 1
                log_debug(f"事件循环已关闭: {e}")
            except Exception as e:
                failed_count += 1
                log_debug(f"通知失败: {e}")

        except Exception as e:
            # Reading the entry itself failed (for example it is not a mapping).
            failed_count += 1
            log_error(f"❌ 通知等待请求时异常: {e}")

    # Summary.
    log_info(f"🔔 流请求通知完成: 成功={notified_count}, 失败={failed_count}, 总数={pending_count}")
|
|
1902
|
+
|
|
1903
|
+
def _get_close_code_meaning(self, code: int) -> str:
|
|
1904
|
+
"""获取 WebSocket 关闭代码的含义"""
|
|
1905
|
+
close_codes = {
|
|
1906
|
+
1000: "正常关闭 (Normal Closure)",
|
|
1907
|
+
1001: "端点离开 (Going Away) - 服务器关闭或浏览器导航离开",
|
|
1908
|
+
1002: "协议错误 (Protocol Error)",
|
|
1909
|
+
1003: "不支持的数据类型 (Unsupported Data)",
|
|
1910
|
+
1005: "未收到状态码 (No Status Received)",
|
|
1911
|
+
1006: "异常关闭 (Abnormal Closure) - 连接意外断开,未收到关闭帧。常见原因:网络中断、服务器崩溃、防火墙/代理断开、心跳超时",
|
|
1912
|
+
1007: "无效的帧数据 (Invalid Frame Payload Data)",
|
|
1913
|
+
1008: "策略违规 (Policy Violation)",
|
|
1914
|
+
1009: "消息太大 (Message Too Big)",
|
|
1915
|
+
1010: "必需的扩展 (Mandatory Extension)",
|
|
1916
|
+
1011: "内部服务器错误 (Internal Server Error)",
|
|
1917
|
+
1012: "服务重启 (Service Restart)",
|
|
1918
|
+
1013: "稍后重试 (Try Again Later)",
|
|
1919
|
+
1014: "错误的网关 (Bad Gateway)",
|
|
1920
|
+
1015: "TLS握手失败 (TLS Handshake Failure)",
|
|
1921
|
+
}
|
|
1922
|
+
return close_codes.get(code, f"未知代码 (Unknown Code: {code})")
|
|
1923
|
+
|
|
1924
|
+
# 兼容性方法 - 保持与旧 API 的兼容
|
|
1925
|
+
def on_open(self, ws) -> None:
    """Compatibility hook for the connection-open event; intentionally does nothing."""
    return None
|
|
1928
|
+
|
|
1929
|
+
def on_message(self, ws, message: str) -> None:
    """Compatibility hook for the incoming-message callback.

    Kept for the legacy callback API; the message is ignored and nothing
    is returned.
    """
    return None
|
|
1932
|
+
|
|
1933
|
+
def on_error(self, ws, error: Exception) -> None:
    """Compatibility hook for the error callback.

    Kept for the legacy callback API; the error is ignored and nothing
    is returned.
    """
    return None
|
|
1936
|
+
|
|
1937
|
+
def on_close(self, ws, close_status_code: int, close_msg: str) -> None:
    """Compatibility hook for the connection-close callback.

    Kept for the legacy callback API; the status code and message are
    ignored and nothing is returned.
    """
    return None
|
|
1940
|
+
|
|
1941
|
+
def on_ping(self, ws, message: bytes) -> None:
    """Compatibility hook for the ping callback.

    Stores the current wall-clock time in ``_last_pong_time``; the socket
    and payload arguments are not used.
    """
    received_at = time.time()
    self._last_pong_time = received_at
|
|
1944
|
+
|
|
1945
|
+
def on_pong(self, ws, message: bytes) -> None:
    """Compatibility hook for the pong callback.

    Stores the current wall-clock time in ``_last_pong_time``; the socket
    and payload arguments are not used.
    """
    received_at = time.time()
    self._last_pong_time = received_at
|
|
1948
|
+
|
|
1949
|
+
# ✅ 线程安全的 stream_queue_map 访问方法
|
|
1950
|
+
def register_stream_request(self, request_id: str, queue_entry: dict) -> None:
    """Record a stream request in ``stream_queue_map``.

    Args:
        request_id: Key under which the entry is stored.
        queue_entry: Value stored for that request; an existing entry
            with the same key is overwritten.

    The write happens while ``_stream_queue_lock`` is held.
    """
    guard = self._stream_queue_lock
    with guard:
        pending = self.stream_queue_map
        pending[request_id] = queue_entry
|
|
1954
|
+
|
|
1955
|
+
def unregister_stream_request(self, request_id: str) -> Optional[dict]:
    """Remove a stream request from ``stream_queue_map``.

    Args:
        request_id: Key of the entry to remove.

    Returns:
        The removed entry, or ``None`` when no entry exists for the key.

    The removal happens while ``_stream_queue_lock`` is held.
    """
    with self._stream_queue_lock:
        removed = self.stream_queue_map.pop(request_id, None)
    return removed
|
|
1959
|
+
|
|
1960
|
+
def get_stream_request(self, request_id: str) -> Optional[dict]:
    """Look up a stream request in ``stream_queue_map``.

    Args:
        request_id: Key of the entry to fetch.

    Returns:
        The stored entry, or ``None`` when the key is absent. The map is
        not modified.

    The lookup happens while ``_stream_queue_lock`` is held.
    """
    with self._stream_queue_lock:
        found = self.stream_queue_map.get(request_id)
    return found
|
|
1964
|
+
|
|
1965
|
+
def get_pending_stream_count(self) -> int:
    """Return how many entries ``stream_queue_map`` currently holds.

    The size is read while ``_stream_queue_lock`` is held.
    """
    with self._stream_queue_lock:
        pending_total = len(self.stream_queue_map)
    return pending_total
|
|
1969
|
+
|
|
1970
|
+
def full_reset(self) -> None:
    """
    Fully reset the MessageClient and release all of its resources.

    More thorough than _full_reset; intended to be called explicitly by
    external code. After the reset, start_websocket_client() can be called
    again to establish a new connection.

    Any exception raised during the reset is logged and swallowed; the
    shutdown flag is cleared on both the success and the failure path.
    """
    log_info(f"[MessageClient] 开始完全重置: agent_id={self.agent_id}")

    try:
        # 1. Set the shutdown flag (blocks reconnects and new operations)
        self._shutdown_requested = True
        log_debug("[MessageClient] ✓ 已设置关闭标志")

        # 2. Clear the run flags of the helper threads
        self._cleanup_running = False
        self._health_check_running = False
        log_debug("[MessageClient] ✓ 已设置线程停止标志")

        # 3. Notify every pending stream request before tearing things down
        pending_count = self.get_pending_stream_count()
        if pending_count > 0:
            log_info(f"[MessageClient] 通知 {pending_count} 个等待中的流请求...")
            self._notify_pending_stream_requests("MessageClient 正在完全重置")

        # 4. Stop the WebSocket connection; a failure here is only logged so
        #    the remaining reset steps still run
        log_debug("[MessageClient] 正在停止 WebSocket...")
        try:
            self.stop_websocket_client()
        except Exception as e:
            log_warning(f"[MessageClient] 停止 WebSocket 失败: {e}")

        # 5. Empty stream_queue_map
        with self._stream_queue_lock:
            self.stream_queue_map.clear()
        log_debug("[MessageClient] ✓ stream_queue_map 已清空")

        # 6. Drain the outgoing message queue, counting discarded items
        cleared_count = 0
        while not self.queue.empty():
            try:
                self.queue.get_nowait()
                self.queue.task_done()
                cleared_count += 1
            except queue.Empty:
                # The queue emptied between the empty() check and the get
                break
        log_debug(f"[MessageClient] ✓ 已清空 {cleared_count} 条待发送消息")

        # 7. Wait for the helper threads to finish (each join is bounded
        #    by a 2-second timeout)
        if self._cleanup_thread and self._cleanup_thread.is_alive():
            self._cleanup_thread.join(timeout=2.0)
        if self._health_check_thread and self._health_check_thread.is_alive():
            self._health_check_thread.join(timeout=2.0)
        log_debug("[MessageClient] ✓ 辅助线程已停止")

        # 8. Reset the connection state
        with self.lock:
            self._connection_state = ConnectionState.DISCONNECTED
            self._connecting_since = 0.0
            self._connecting_conn_id = 0
            self.connected_event.clear()
            self._is_retrying = False
            self._reconnect_attempt_count = 0
            self._current_reconnect_interval = self.config.reconnect_base_interval
            self._connection_id = 0
            self._last_pong_time = 0

        log_debug("[MessageClient] ✓ 连接状态已重置")

        # 9. Drop object references
        self.ws = None
        self._loop = None
        self.ws_thread = None
        self._cleanup_thread = None
        self._health_check_thread = None
        log_debug("[MessageClient] ✓ 对象引用已清空")

        # 10. Clear the shutdown flag (allows a later restart)
        self._shutdown_requested = False
        log_debug("[MessageClient] ✓ 关闭标志已重置")

        log_info(f"[MessageClient] ✅ 完全重置完成: agent_id={self.agent_id}")

    except Exception as e:
        log_error(f"[MessageClient] ❌ 完全重置失败: {e}")
        import traceback
        traceback.print_exc()
        # Make sure the shutdown flag is cleared so a retry is possible
        self._shutdown_requested = False
|