@agentunion/kite 1.3.2 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (293) hide show
  1. package/CHANGELOG.md +302 -0
  2. package/cli.js +119 -4
  3. package/core/dependency_checker.py +250 -0
  4. package/core/env_checker.py +490 -0
  5. package/dependencies_lock.json +128 -0
  6. package/extensions/agents/assistant/entry.py +111 -1
  7. package/extensions/agents/assistant/server.py +279 -215
  8. package/extensions/channels/acp_channel/entry.py +111 -1
  9. package/extensions/channels/acp_channel/module.md +23 -22
  10. package/extensions/channels/acp_channel/server.py +279 -215
  11. package/extensions/event_hub_bench/entry.py +107 -1
  12. package/extensions/services/backup/entry.py +306 -21
  13. package/extensions/services/backup/module.md +24 -22
  14. package/extensions/services/evol/auth_manager.py +443 -0
  15. package/extensions/services/evol/config.yaml +149 -0
  16. package/extensions/services/evol/config_loader.py +117 -0
  17. package/extensions/services/evol/entry.py +406 -0
  18. package/extensions/services/evol/evol_api.py +173 -0
  19. package/extensions/services/evol/evol_config.json5 +29 -0
  20. package/extensions/services/evol/migrate_tokens.py +122 -0
  21. package/extensions/services/evol/module.md +32 -0
  22. package/extensions/services/evol/pairing.py +250 -0
  23. package/extensions/services/evol/pairing_codes.jsonl +1 -0
  24. package/extensions/services/evol/relay.py +682 -0
  25. package/extensions/services/evol/relay_config.json5 +67 -0
  26. package/extensions/services/evol/routes/__init__.py +1 -0
  27. package/extensions/services/evol/routes/routes_management_ws.py +127 -0
  28. package/extensions/services/evol/routes/routes_rpc.py +89 -0
  29. package/extensions/services/evol/routes/routes_test.py +61 -0
  30. package/extensions/services/evol/server.py +875 -0
  31. package/extensions/services/evol/static/css/style.css +1200 -0
  32. package/extensions/services/evol/static/index.html +781 -0
  33. package/extensions/services/evol/static/index_evol.html +14 -0
  34. package/extensions/services/evol/static/js/app.js +6304 -0
  35. package/extensions/services/evol/static/js/auth.js +326 -0
  36. package/extensions/services/evol/static/js/dialog.js +285 -0
  37. package/extensions/services/evol/static/js/evol-app-fixed.js +50 -0
  38. package/extensions/services/evol/static/js/evol-app.js +1949 -0
  39. package/extensions/services/evol/static/js/evol-app.js.bak +1800 -0
  40. package/extensions/services/evol/static/js/kernel-client-example.js +228 -0
  41. package/extensions/services/evol/static/js/kernel-client.js +396 -0
  42. package/extensions/services/evol/static/js/main.js +141 -0
  43. package/extensions/services/evol/static/js/registry-tests.js +585 -0
  44. package/extensions/services/evol/static/js/stats.js +217 -0
  45. package/extensions/services/evol/static/js/token-manager.js +175 -0
  46. package/extensions/services/evol/static/pairing.html +248 -0
  47. package/extensions/services/evol/static/test_registry.html +262 -0
  48. package/extensions/services/evol/static/test_relay.html +462 -0
  49. package/extensions/services/evol/stats_manager.py +240 -0
  50. package/extensions/services/model_service/entry.py +167 -19
  51. package/extensions/services/model_service/module.md +21 -22
  52. package/extensions/services/proxy/.claude/settings.local.json +13 -0
  53. package/extensions/services/proxy/CHANGELOG_20260308.md +258 -0
  54. package/extensions/services/proxy/_fix_prints.py +133 -0
  55. package/extensions/services/proxy/_fix_prints2.py +87 -0
  56. package/extensions/services/proxy/agentcp/LICENCE +178 -0
  57. package/extensions/services/proxy/agentcp/README copy.md +85 -0
  58. package/extensions/services/proxy/agentcp/README.md +260 -0
  59. package/extensions/services/proxy/agentcp/__init__.py +16 -0
  60. package/extensions/services/proxy/agentcp/agent.py +4 -0
  61. package/extensions/services/proxy/agentcp/agentcp.py +2494 -0
  62. package/extensions/services/proxy/agentcp/agentprofile.json +89 -0
  63. package/extensions/services/proxy/agentcp/ap/__init__.py +16 -0
  64. package/extensions/services/proxy/agentcp/ap/ap_client.py +316 -0
  65. package/extensions/services/proxy/agentcp/assets/images/wechat_qr.png +0 -0
  66. package/extensions/services/proxy/agentcp/backup/metrics.json +31 -0
  67. package/extensions/services/proxy/agentcp/base/__init__.py +20 -0
  68. package/extensions/services/proxy/agentcp/base/auth_client.py +257 -0
  69. package/extensions/services/proxy/agentcp/base/client.py +112 -0
  70. package/extensions/services/proxy/agentcp/base/env.py +34 -0
  71. package/extensions/services/proxy/agentcp/base/html_util.py +336 -0
  72. package/extensions/services/proxy/agentcp/base/log.py +98 -0
  73. package/extensions/services/proxy/agentcp/ca/__init__.py +17 -0
  74. package/extensions/services/proxy/agentcp/ca/ca_client.py +414 -0
  75. package/extensions/services/proxy/agentcp/ca/ca_root.py +74 -0
  76. package/extensions/services/proxy/agentcp/context/__init__.py +20 -0
  77. package/extensions/services/proxy/agentcp/context/context.py +73 -0
  78. package/extensions/services/proxy/agentcp/context/exceptions.py +114 -0
  79. package/extensions/services/proxy/agentcp/create_profile.py +125 -0
  80. package/extensions/services/proxy/agentcp/create_profile_weather.py +125 -0
  81. package/extensions/services/proxy/agentcp/db/__init__.py +15 -0
  82. package/extensions/services/proxy/agentcp/db/db_mananger.py +550 -0
  83. package/extensions/services/proxy/agentcp/docs/UDP_HEARTBEAT_FIX_REPORT.md +265 -0
  84. package/extensions/services/proxy/agentcp/docs/heartbeat_issue_analysis.md +291 -0
  85. package/extensions/services/proxy/agentcp/file/__init__.py +16 -0
  86. package/extensions/services/proxy/agentcp/file/file_client.py +141 -0
  87. package/extensions/services/proxy/agentcp/file/wss_binary_message.py +137 -0
  88. package/extensions/services/proxy/agentcp/hcp.py +299 -0
  89. package/extensions/services/proxy/agentcp/heartbeat/__init__.py +16 -0
  90. package/extensions/services/proxy/agentcp/heartbeat/heartbeat_client.py +360 -0
  91. package/extensions/services/proxy/agentcp/improved_scheduler.py +498 -0
  92. package/extensions/services/proxy/agentcp/llm_agent_utils.py +249 -0
  93. package/extensions/services/proxy/agentcp/llm_server.py +172 -0
  94. package/extensions/services/proxy/agentcp/mermaid.py +210 -0
  95. package/extensions/services/proxy/agentcp/message.py +149 -0
  96. package/extensions/services/proxy/agentcp/metrics.py +256 -0
  97. package/extensions/services/proxy/agentcp/monitoring/__init__.py +20 -0
  98. package/extensions/services/proxy/agentcp/monitoring/global_monitor.py +27 -0
  99. package/extensions/services/proxy/agentcp/monitoring/metrics_store.py +325 -0
  100. package/extensions/services/proxy/agentcp/monitoring/monitoring_service.py +269 -0
  101. package/extensions/services/proxy/agentcp/monitoring/sliding_window.py +222 -0
  102. package/extensions/services/proxy/agentcp/monitoring/standalone_reader.py +224 -0
  103. package/extensions/services/proxy/agentcp/msg/__init__.py +21 -0
  104. package/extensions/services/proxy/agentcp/msg/connection_manager.py +456 -0
  105. package/extensions/services/proxy/agentcp/msg/message_client.py +2058 -0
  106. package/extensions/services/proxy/agentcp/msg/message_serialize.py +263 -0
  107. package/extensions/services/proxy/agentcp/msg/open_ai_message.py +88 -0
  108. package/extensions/services/proxy/agentcp/msg/session_manager.py +1062 -0
  109. package/extensions/services/proxy/agentcp/msg/stream_client.py +267 -0
  110. package/extensions/services/proxy/agentcp/msg/websocket_file_receiver.py +89 -0
  111. package/extensions/services/proxy/agentcp/msg/ws_logger.py +685 -0
  112. package/extensions/services/proxy/agentcp/msg/wss_binary_message.py +137 -0
  113. package/extensions/services/proxy/agentcp/requirements.txt +7 -0
  114. package/extensions/services/proxy/agentcp/samples/agent_graph/README.md +37 -0
  115. package/extensions/services/proxy/agentcp/samples/agent_graph/agentprofile.json +89 -0
  116. package/extensions/services/proxy/agentcp/samples/agent_graph/create_profile.py +138 -0
  117. package/extensions/services/proxy/agentcp/samples/agent_graph/main.py +164 -0
  118. package/extensions/services/proxy/agentcp/samples/agent_use/create_profile.py +123 -0
  119. package/extensions/services/proxy/agentcp/samples/agent_use/llm/create_profile.py +129 -0
  120. package/extensions/services/proxy/agentcp/samples/agent_use/llm/env.json +5 -0
  121. package/extensions/services/proxy/agentcp/samples/agent_use/llm/main.py +146 -0
  122. package/extensions/services/proxy/agentcp/samples/agent_use/main.py +123 -0
  123. package/extensions/services/proxy/agentcp/samples/agent_use/readme.md +379 -0
  124. package/extensions/services/proxy/agentcp/samples/agent_use/search/create_profile.py +129 -0
  125. package/extensions/services/proxy/agentcp/samples/agent_use/search/main.py +28 -0
  126. package/extensions/services/proxy/agentcp/samples/agent_use/tool/create_profile.py +129 -0
  127. package/extensions/services/proxy/agentcp/samples/agent_use/tool/main.py +20 -0
  128. package/extensions/services/proxy/agentcp/samples/ali_amap/README.md +97 -0
  129. package/extensions/services/proxy/agentcp/samples/ali_amap/amap_agent.py +88 -0
  130. package/extensions/services/proxy/agentcp/samples/ali_amap/create_profile.py +125 -0
  131. package/extensions/services/proxy/agentcp/samples/compute_agent/agent/powershell.py +228 -0
  132. package/extensions/services/proxy/agentcp/samples/compute_agent/agent/software.py +63 -0
  133. package/extensions/services/proxy/agentcp/samples/compute_agent/agent/tools.py +36 -0
  134. package/extensions/services/proxy/agentcp/samples/compute_agent/browser_user.py +41 -0
  135. package/extensions/services/proxy/agentcp/samples/deepseek/README.md +79 -0
  136. package/extensions/services/proxy/agentcp/samples/deepseek/create_profile.py +126 -0
  137. package/extensions/services/proxy/agentcp/samples/deepseek/deepseek.py +42 -0
  138. package/extensions/services/proxy/agentcp/samples/dify_chat/README.md +78 -0
  139. package/extensions/services/proxy/agentcp/samples/dify_chat/create_profile.py +126 -0
  140. package/extensions/services/proxy/agentcp/samples/dify_chat/dify_chat.py +47 -0
  141. package/extensions/services/proxy/agentcp/samples/dify_workflow/README.md +78 -0
  142. package/extensions/services/proxy/agentcp/samples/dify_workflow/create_profile.py +126 -0
  143. package/extensions/services/proxy/agentcp/samples/dify_workflow/dify_workflow.py +46 -0
  144. package/extensions/services/proxy/agentcp/samples/executor/README.md +44 -0
  145. package/extensions/services/proxy/agentcp/samples/executor/agentprofile.json +89 -0
  146. package/extensions/services/proxy/agentcp/samples/executor/create_profile.py +139 -0
  147. package/extensions/services/proxy/agentcp/samples/executor/main.py +160 -0
  148. package/extensions/services/proxy/agentcp/samples/filereader/README.md +45 -0
  149. package/extensions/services/proxy/agentcp/samples/filereader/agentprofile.json +90 -0
  150. package/extensions/services/proxy/agentcp/samples/filereader/create_profile.py +137 -0
  151. package/extensions/services/proxy/agentcp/samples/filereader/main.py +253 -0
  152. package/extensions/services/proxy/agentcp/samples/filewriter/README.md +38 -0
  153. package/extensions/services/proxy/agentcp/samples/filewriter/agentprofile.json +91 -0
  154. package/extensions/services/proxy/agentcp/samples/filewriter/create_profile.py +138 -0
  155. package/extensions/services/proxy/agentcp/samples/filewriter/main.py +289 -0
  156. package/extensions/services/proxy/agentcp/samples/hcp/README.md +85 -0
  157. package/extensions/services/proxy/agentcp/samples/hcp/acp_weather_agent.zip +0 -0
  158. package/extensions/services/proxy/agentcp/samples/hcp/create_profile.py +125 -0
  159. package/extensions/services/proxy/agentcp/samples/hcp/hcp.py +237 -0
  160. package/extensions/services/proxy/agentcp/samples/helloworld/README.md +68 -0
  161. package/extensions/services/proxy/agentcp/samples/helloworld/hello_world.py +40 -0
  162. package/extensions/services/proxy/agentcp/samples/llm_agent/MEADME.md +117 -0
  163. package/extensions/services/proxy/agentcp/samples/llm_agent/create_profile.py +125 -0
  164. package/extensions/services/proxy/agentcp/samples/llm_agent/qwen_agent.py +136 -0
  165. package/extensions/services/proxy/agentcp/samples/local_llm_agent/README.md +90 -0
  166. package/extensions/services/proxy/agentcp/samples/local_llm_agent/create_profile.py +125 -0
  167. package/extensions/services/proxy/agentcp/samples/local_llm_agent/main.py +49 -0
  168. package/extensions/services/proxy/agentcp/samples/query_llm_from_agent/README.md +55 -0
  169. package/extensions/services/proxy/agentcp/samples/query_llm_from_agent/create_profile.py +125 -0
  170. package/extensions/services/proxy/agentcp/samples/query_llm_from_agent/main.py +23 -0
  171. package/extensions/services/proxy/agentcp/samples/query_weather_api_agent/README.md +103 -0
  172. package/extensions/services/proxy/agentcp/samples/query_weather_api_agent/create_profile.py +125 -0
  173. package/extensions/services/proxy/agentcp/samples/query_weather_api_agent/main.py +69 -0
  174. package/extensions/services/proxy/agentcp/samples/query_weather_from_agent/README.md +58 -0
  175. package/extensions/services/proxy/agentcp/samples/query_weather_from_agent/create_profile.py +125 -0
  176. package/extensions/services/proxy/agentcp/samples/query_weather_from_agent/main.py +25 -0
  177. package/extensions/services/proxy/agentcp/samples/qwen3/README.md +71 -0
  178. package/extensions/services/proxy/agentcp/samples/qwen3/create_profile.py +126 -0
  179. package/extensions/services/proxy/agentcp/samples/qwen3/qwen3.py +37 -0
  180. package/extensions/services/proxy/agentcp/samples/qwen3_tools/README.md +133 -0
  181. package/extensions/services/proxy/agentcp/samples/qwen3_tools/create_profile.py +126 -0
  182. package/extensions/services/proxy/agentcp/samples/qwen3_tools/qwen3_tools.py +98 -0
  183. package/extensions/services/proxy/agentcp/samples/search/create_profile_qwen.py +125 -0
  184. package/extensions/services/proxy/agentcp/samples/search/create_profile_search.py +125 -0
  185. package/extensions/services/proxy/agentcp/samples/search/qwen_agent.py +136 -0
  186. package/extensions/services/proxy/agentcp/samples/search/search_agent.py +170 -0
  187. package/extensions/services/proxy/agentcp/samples/wrapper_agently_to_agent/README.md +89 -0
  188. package/extensions/services/proxy/agentcp/samples/wrapper_agently_to_agent/create_profile.py +125 -0
  189. package/extensions/services/proxy/agentcp/samples/wrapper_agently_to_agent/main.py +44 -0
  190. package/extensions/services/proxy/agentcp/utils/__init__.py +15 -0
  191. package/extensions/services/proxy/agentcp/utils/file_util.py +117 -0
  192. package/extensions/services/proxy/agentcp/utils/proxy_bypass.py +99 -0
  193. package/extensions/services/proxy/agentcp/workflow.py +203 -0
  194. package/extensions/services/proxy/console_auth.py +109 -0
  195. package/extensions/services/proxy/evol/__init__.py +1 -0
  196. package/extensions/services/proxy/evol/config.py +37 -0
  197. package/extensions/services/proxy/evol/http/__init__.py +1 -0
  198. package/extensions/services/proxy/evol/http/async_http.py +551 -0
  199. package/extensions/services/proxy/evol/log.py +28 -0
  200. package/extensions/services/proxy/evol/presenter/__init__.py +2 -0
  201. package/extensions/services/proxy/evol/presenter/agentIdPresenter.py +1031 -0
  202. package/extensions/services/proxy/evol/presenter/apikeyPresenter.py +106 -0
  203. package/extensions/services/proxy/evol/presenter/configPresenter.py +1281 -0
  204. package/extensions/services/proxy/evol/presenter/userPresenter.py +477 -0
  205. package/extensions/services/proxy/evol/server/__init__.py +1 -0
  206. package/extensions/services/proxy/evol/server/claude_proxy_async.py +3430 -0
  207. package/extensions/services/proxy/evol/server/openclaw_proxy.py +1861 -0
  208. package/extensions/services/proxy/evol/server/proxy_config.py +15 -0
  209. package/extensions/services/proxy/evol/server/proxy_engine.py +501 -0
  210. package/extensions/services/proxy/evol/version.py +24 -0
  211. package/extensions/services/proxy/logs/websocket.log +260 -0
  212. package/extensions/services/proxy/main.py +240 -0
  213. package/extensions/services/proxy/requirements.txt +13 -0
  214. package/extensions/services/proxy/server.py +271 -0
  215. package/extensions/services/watchdog/entry.py +215 -26
  216. package/extensions/services/watchdog/module.md +1 -0
  217. package/extensions/services/watchdog/monitor.py +178 -38
  218. package/extensions/services/web/WEBSOCKET_STATUS.md +143 -0
  219. package/extensions/services/web/config_example.py +35 -0
  220. package/extensions/services/web/config_loader.py +110 -0
  221. package/extensions/services/web/entry.py +114 -26
  222. package/extensions/services/web/module.md +35 -24
  223. package/extensions/services/web/pairing.py +250 -0
  224. package/extensions/services/web/pairing_codes.jsonl +16 -0
  225. package/extensions/services/web/relay.py +643 -0
  226. package/extensions/services/web/relay_config.json5 +67 -0
  227. package/extensions/services/web/routes/routes_management_ws.py +127 -0
  228. package/extensions/services/web/routes/routes_rpc.py +89 -0
  229. package/extensions/services/web/routes/routes_test.py +61 -0
  230. package/extensions/services/web/routes/schemas.py +0 -22
  231. package/extensions/services/web/server.py +434 -99
  232. package/extensions/services/web/static/css/style.css +67 -28
  233. package/extensions/services/web/static/index.html +234 -44
  234. package/extensions/services/web/static/js/app.js +1335 -48
  235. package/extensions/services/web/static/js/kernel-client-example.js +161 -0
  236. package/extensions/services/web/static/js/kernel-client.js +383 -0
  237. package/extensions/services/web/static/js/registry-tests.js +558 -0
  238. package/extensions/services/web/static/js/token-manager.js +175 -0
  239. package/extensions/services/web/static/pairing.html +248 -0
  240. package/extensions/services/web/static/test_registry.html +262 -0
  241. package/extensions/services/web/web_config.json5 +29 -0
  242. package/kernel/entry.py +120 -32
  243. package/kernel/event_hub.py +141 -16
  244. package/kernel/module.md +60 -33
  245. package/kernel/registry_store.py +45 -36
  246. package/kernel/rpc_router.py +152 -59
  247. package/kernel/server.py +322 -26
  248. package/kite_cli/__init__.py +3 -0
  249. package/kite_cli/__main__.py +5 -0
  250. package/kite_cli/commands/__init__.py +1 -0
  251. package/kite_cli/commands/clean.py +101 -0
  252. package/kite_cli/commands/deps_install.py +67 -0
  253. package/kite_cli/commands/doctor.py +35 -0
  254. package/kite_cli/commands/env_check.py +45 -0
  255. package/kite_cli/commands/history.py +111 -0
  256. package/kite_cli/commands/info.py +96 -0
  257. package/kite_cli/commands/install.py +313 -0
  258. package/kite_cli/commands/list.py +143 -0
  259. package/kite_cli/commands/log.py +81 -0
  260. package/kite_cli/commands/prepare.py +49 -0
  261. package/kite_cli/commands/rollback.py +88 -0
  262. package/kite_cli/commands/search.py +73 -0
  263. package/kite_cli/commands/uninstall.py +85 -0
  264. package/kite_cli/commands/update.py +118 -0
  265. package/kite_cli/commands/venv_setup.py +56 -0
  266. package/kite_cli/core/__init__.py +1 -0
  267. package/kite_cli/core/checker.py +142 -0
  268. package/kite_cli/core/dependency.py +229 -0
  269. package/kite_cli/core/downloader.py +209 -0
  270. package/kite_cli/core/install_info.py +40 -0
  271. package/kite_cli/core/tool_installer.py +397 -0
  272. package/kite_cli/core/validator.py +78 -0
  273. package/kite_cli/main.py +317 -0
  274. package/kite_cli/utils/__init__.py +1 -0
  275. package/kite_cli/utils/i18n.py +252 -0
  276. package/kite_cli/utils/interactive.py +63 -0
  277. package/kite_cli/utils/operation_log.py +77 -0
  278. package/kite_cli/utils/paths.py +34 -0
  279. package/kite_cli/utils/version.py +308 -0
  280. package/launcher/entry.py +1124 -178
  281. package/launcher/logging_setup.py +104 -0
  282. package/launcher/module.md +46 -37
  283. package/launcher/module_scanner.py +11 -1
  284. package/main.py +4 -1
  285. package/package.json +9 -1
  286. package/python_version.json +4 -0
  287. package/requirements.txt +38 -0
  288. package/scripts/env-manager.js +328 -0
  289. package/scripts/plan_manager.py +315 -0
  290. package/scripts/python-env.js +79 -0
  291. package/scripts/scan_dependencies.py +461 -0
  292. package/scripts/setup-python-env.js +191 -0
  293. package/extensions/services/web/routes/routes_modules.py +0 -249
@@ -0,0 +1,2058 @@
1
+ # Copyright 2025 AgentUnion Inc.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import asyncio
16
+ import json
17
+ import queue
18
+ import ssl
19
+ import threading
20
+ import time
21
+ from enum import Enum
22
+ from typing import Dict, Optional, Union
23
+
24
+ import websockets
25
+ from websockets.exceptions import ConnectionClosed, ConnectionClosedError, ConnectionClosedOK
26
+ from websockets.exceptions import InvalidMessage, PayloadTooBig, ProtocolError
27
+ from websockets.protocol import State as WsState
28
+ from websockets.frames import Frame, Opcode
29
+
30
+ from agentcp.utils.proxy_bypass import ensure_no_proxy_for_local_env, is_local_url, pop_proxy_env, restore_proxy_env
31
+ from agentcp.base.auth_client import AuthClient
32
+ from agentcp.base.client import IClient
33
+ from agentcp.base.log import log_debug, log_error, log_exception, log_info, log_warning
34
+
35
+ from ..context import ErrorContext, exceptions
36
+
37
+ ensure_no_proxy_for_local_env()
38
+ from .ws_logger import get_ws_logger # ✅ 导入 WebSocket 专用日志
39
+
40
+
41
class ConnectionState(Enum):
    """Connection states tracked by the message client.

    Each member's string value is what ``get_connection_info`` reports under
    the ``"state"`` key and what the connection log lines print.
    """

    # No usable connection.
    DISCONNECTED = "disconnected"
    # A connection attempt has been started and has not finished yet.
    CONNECTING = "connecting"
    # The connection is established; entering this state sets ``connected_event``.
    CONNECTED = "connected"
    # Reconnect in progress (NOTE(review): not referenced in the visible part of the file).
    RECONNECTING = "reconnecting"
46
+
47
+
48
class MessageClientConfig:
    """Configuration values for MessageClient.

    Documented settings:
        max_queue_size: Maximum capacity of the message queue; messages are
            buffered there while the connection is down.
        connection_timeout: Timeout for establishing the WebSocket connection.
        ping_interval: Heartbeat interval used to detect whether the
            connection is still alive.
        reconnect_base_interval: Wait time before the first reconnect attempt.
        reconnect_max_interval: Upper bound on the reconnect wait time (the cap
            of the exponential backoff).
        reconnect_backoff_factor: Exponential backoff factor.
        max_message_size: Maximum size of a single message; larger messages
            are dropped.
    """

    def __init__(self):
        # Message queue: enlarged (30 -> 5000) to lose fewer messages while disconnected.
        self.max_queue_size: int = 5_000

        # Connection timeout: shortened (5.0 -> 3.0) to notice failed connects sooner.
        self.connection_timeout: float = 3.0

        # Retry settings; max_retry_attempts == 0 means "reconnect forever".
        self.retry_interval: float = 4.0
        self.max_retry_attempts: int = 0
        self.send_retry_attempts: int = 5
        self.send_retry_delay: float = 0.01

        # Heartbeat: more frequent (5 -> 3) to detect a "zombie" connection faster.
        self.ping_interval: int = 3

        # Automatic reconnect: shorter intervals so service recovers faster.
        self.auto_reconnect: bool = True
        # First reconnect waits only 0.5 seconds (was 2.0).
        self.reconnect_base_interval: float = 0.5
        # Never wait longer than 10 seconds (was 60.0).
        self.reconnect_max_interval: float = 10.0
        # Backoff factor is unchanged.
        self.reconnect_backoff_factor: float = 1.5

        # Message size limit: 10 MB (was 64 MB).
        self.max_message_size: int = 10 << 20
84
+
85
+
86
+ class MessageClient(IClient):
87
+ """WebSocket-based message client using websockets library.
88
+
89
+ 使用 websockets 库替代 websocket-client,更好地处理协议扩展和错误。
90
+ """
91
+
92
+ # 类级别的速率限制标志
93
+ _last_rate_limit_log_time = 0
94
+ _rate_limit_log_interval = 30
95
+
96
def __init__(
    self,
    agent_id: str,
    server_url: str,
    aid_path: str,
    seed_password: str,
    cache_auth_client: Optional[AuthClient] = None,
    config: Optional[MessageClientConfig] = None,
    agent_id_ref=None,
):
    """Store identity/configuration and create all connection bookkeeping.

    Args:
        agent_id: Identifier of the agent this client acts for.
        server_url: Server base URL; stored with trailing "/" stripped.
        aid_path: Passed to ``AuthClient`` when no cached client is given.
        seed_password: Passed to ``AuthClient`` when no cached client is given.
        cache_auth_client: Existing auth client to reuse; when ``None`` a new
            ``AuthClient`` is built from the (unstripped) ``server_url``.
        config: Settings to use; a default ``MessageClientConfig`` is created
            when this is falsy.
        agent_id_ref: Optional object consulted by ``_get_use_system_proxy``.
    """
    self.agent_id = agent_id
    self.server_url = server_url.rstrip("/")
    self.config = config if config else MessageClientConfig()
    self._agent_id_ref = agent_id_ref

    # Auth client: reuse the cached one when supplied, otherwise build our own.
    self.auth_client = (
        cache_auth_client
        if cache_auth_client is not None
        else AuthClient(agent_id, server_url, aid_path, seed_password)
    )

    # Thread synchronization primitives.
    self.lock = threading.Lock()
    self.connected_event = threading.Event()

    # WebSocket handles.
    self.ws: Optional[websockets.WebSocketClientProtocol] = None
    self.ws_thread: Optional[threading.Thread] = None
    self.ws_url: Optional[str] = None

    # asyncio event loop used for the websockets connection.
    self._loop: Optional[asyncio.AbstractEventLoop] = None

    # Message handling: bounded queue sized from the configuration.
    self.queue = queue.Queue(maxsize=self.config.max_queue_size)
    self.message_handler: Optional[object] = None

    # Connection state flags.
    self._connection_state = ConnectionState.DISCONNECTED
    self._is_retrying = False
    self._shutdown_requested = False
    self.stream_queue_map = {}
    # Guards access to stream_queue_map.
    self._stream_queue_lock = threading.Lock()

    # Stream queue cleanup worker.
    self._cleanup_thread: Optional[threading.Thread] = None
    self._cleanup_running = False

    # Reconnect state: current backoff interval and attempt counter.
    self._current_reconnect_interval = self.config.reconnect_base_interval
    self._reconnect_attempt_count = 0

    # Connection health check worker.
    self._health_check_thread: Optional[threading.Thread] = None
    self._health_check_running = False
    self._last_pong_time: float = 0

    # Unique connection id, used for tracing and to prevent duplicate connections.
    self._connection_id: int = 0

    # CONNECTING tracking: lets callers detect a connection attempt that got stuck.
    self._connecting_since: float = 0.0
    self._connecting_conn_id: int = 0

    # Disconnect callback: notifies the outside when the WebSocket drops.
    self._on_disconnect_callback: Optional[callable] = None

    # Reconnect callback: notifies the outside when the WebSocket is restored.
    self._on_reconnect_callback: Optional[callable] = None
165
+
166
@property
def connection_state(self) -> ConnectionState:
    """Return the current connection state, read while holding ``self.lock``."""
    with self.lock:
        snapshot = self._connection_state
    return snapshot
171
+
172
def _set_connection_state(self, state: ConnectionState) -> None:
    """Record a new connection state while holding ``self.lock``.

    ``connected_event`` is set only for CONNECTED and cleared for every other
    state. Leaving CONNECTING (any state other than CONNECTING) also resets
    the stuck-attempt tracking fields.

    Args:
        state: The state to store.
    """
    with self.lock:
        self._connection_state = state

        # Keep the event in lock-step with the state.
        update_event = (
            self.connected_event.set
            if state == ConnectionState.CONNECTED
            else self.connected_event.clear
        )
        update_event()

        if state == ConnectionState.CONNECTING:
            # Still connecting: keep the attempt's start time and id.
            return
        self._connecting_since = 0.0
        self._connecting_conn_id = 0
183
+
184
+ def _get_use_system_proxy(self) -> bool:
185
+ """获取是否使用系统代理"""
186
+ if self._agent_id_ref and hasattr(self._agent_id_ref, 'get_use_system_proxy'):
187
+ return self._agent_id_ref.get_use_system_proxy()
188
+ return False
189
+
190
+ def _is_ws_open(self) -> bool:
191
+ """Check if WebSocket connection is open."""
192
+ try:
193
+ return self.ws is not None and self.ws.state == WsState.OPEN
194
+ except Exception:
195
+ return False
196
+
197
+ # ==================== 连接状态查询 API ====================
198
+
199
def is_healthy(self) -> bool:
    """Check whether the connection is healthy and usable.

    All of the following must hold:
        1. The WebSocket connection state is OPEN.
        2. ``connected_event`` is set.
        3. The connection state is CONNECTED.
        4. No reconnect is in progress.

    Returns:
        True when the connection is healthy and messages can be sent,
        False when the connection is unavailable.
    """
    # Checks run in the same order as listed above and stop at the first failure.
    if not self._is_ws_open():
        return False
    if not self.connected_event.is_set():
        return False
    if not (self.connection_state == ConnectionState.CONNECTED):
        return False
    return not self._is_retrying
218
+
219
def get_connection_info(self) -> dict:
    """Collect a snapshot of the connection status.

    Returns:
        A dictionary with identity (``agent_id``, ``server_url``), state
        (``state``, ``ws_open``, ``is_healthy``, ``is_retrying``), reconnect
        bookkeeping, the connection id, the last pong time, queue usage and
        the number of pending streams.
    """
    # Keyword arguments are evaluated left to right, so the fields are read in
    # the same order in which they appear in the result.
    return dict(
        agent_id=self.agent_id,
        server_url=self.server_url,
        state=self.connection_state.value,
        ws_open=self._is_ws_open(),
        is_healthy=self.is_healthy(),
        is_retrying=self._is_retrying,
        reconnect_attempts=self._reconnect_attempt_count,
        current_reconnect_interval=self._current_reconnect_interval,
        connection_id=self._connection_id,
        last_pong_time=self._last_pong_time,
        queue_size=self.queue.qsize(),
        queue_capacity=self.config.max_queue_size,
        pending_streams=self.get_pending_stream_count(),
    )
240
+
241
def get_health_summary(self) -> str:
    """Build a one-line health summary for logging and debugging.

    Returns:
        A " | "-separated string with the health marker, state, socket-open
        flag, retry flag and queue usage taken from ``get_connection_info``.
    """
    info = self.get_connection_info()
    if info["is_healthy"]:
        status = "🟢 健康"
    else:
        status = "🔴 不健康"
    fields = [
        status,
        f"state={info['state']}",
        f"ws_open={info['ws_open']}",
        f"retrying={info['is_retrying']}",
        f"queue={info['queue_size']}/{info['queue_capacity']}",
    ]
    return " | ".join(fields)
255
+
256
def set_reconnect_callback(self, callback: callable) -> None:
    """Register the callback to invoke when the WebSocket connection is restored.

    Expected callback signature: ``callback(agent_id: str, server_url: str)``.

    Args:
        callback: Function to call when the connection is restored.
    """
    self._on_reconnect_callback = callback
    notice = f"[MessageClient] 已设置连接恢复回调: {callback}"
    log_info(notice)
267
+
268
+ # ==================== 原有方法 ====================
269
+
270
def initialize(self) -> None:
    """Initialize the client by signing in through the auth client.

    The sign-in result is discarded and any exception propagates to the caller.
    """
    auth = self.auth_client
    auth.sign_in()
273
+
274
def sign_in(self) -> bool:
    """Sign in through the auth client.

    Returns:
        True when the auth client returns anything other than ``None``;
        False when it returns ``None`` or raises (the exception is logged).
    """
    try:
        outcome = self.auth_client.sign_in()
    except Exception as e:
        log_exception(f"Failed to sign in: {e}")
        return False
    return outcome is not None
282
+
283
def get_headers(self) -> Dict[str, str]:
    """Return the request headers: a single ``User-Agent`` entry.

    The value embeds the ``agentcp`` package version and this client's agent id.
    """
    package_version = __import__('agentcp').__version__
    user_agent = f"AgentCP/{package_version} (AuthClient; {self.agent_id})"
    return {"User-Agent": user_agent}
286
+
287
def sign_out(self) -> None:
    """Sign out through the auth client; exceptions propagate to the caller."""
    auth = self.auth_client
    auth.sign_out()
290
+
291
def set_message_handler(self, message_handler: object) -> None:
    """Store the handler object that will receive incoming messages.

    Args:
        message_handler: Handler to keep on ``self.message_handler``; it
            replaces any handler stored earlier.
    """
    handler = message_handler
    self.message_handler = handler
294
+
295
def set_disconnect_callback(self, callback: callable) -> None:
    """Register the callback to invoke when the WebSocket connection drops.

    Expected callback signature:
    ``callback(agent_id: str, server_url: str, code: int, reason: str)``.

    Args:
        callback: Function to call on disconnect.
    """
    self._on_disconnect_callback = callback
    notice = f"[MessageClient] 已设置断开回调: {callback}"
    log_info(notice)
306
+
307
+ def _build_websocket_url(self) -> str:
308
+ """Build WebSocket URL with proper protocol and parameters."""
309
+ ws_url = self.server_url.replace("https://", "wss://").replace("http://", "ws://")
310
+ return f"{ws_url}/session?agent_id={self.agent_id}&signature={self.auth_client.signature}"
311
+
312
def start_websocket_client(self) -> bool:
    """Start the WebSocket client connection, reusing a live one if present.

    Under ``self.lock`` the method picks one of three paths:

    * the socket is already open -> fix the state to CONNECTED if needed
      and return True without creating anything;
    * another attempt is in the CONNECTING state -> wait for it, unless it
      has been connecting for longer than
      ``max(connection_timeout * 2, 10.0)`` seconds, in which case it is
      treated as stale and replaced;
    * otherwise -> bump ``_connection_id`` and start a new connection.

    Cleanup of the previous connection and the start of the new
    ``_ws_handler`` thread happen after the lock is released.

    Returns:
        False when the interpreter is finalizing or shutdown was requested;
        True when the socket is already open; otherwise the result of
        ``_wait_for_connection()``.
    """
    # Refuse to connect while the interpreter is shutting down.
    import sys
    if hasattr(sys, 'is_finalizing') and sys.is_finalizing():
        log_debug("Interpreter is shutting down, skipping connection")
        self._shutdown_requested = True
        return False

    if self._shutdown_requested:
        return False

    need_cleanup = False
    need_start = False
    conn_id = 0
    now = time.time()

    with self.lock:
        ws_open = self._is_ws_open()

        # If the WebSocket is actually open, return True regardless of the
        # recorded state: no reconnect is needed.
        if ws_open:
            # Repair the state (it may have been wrongly set to DISCONNECTED).
            if self._connection_state != ConnectionState.CONNECTED:
                log_info(f"[conn:{self._connection_id}] WebSocket is open, fixing state from {self._connection_state.value} to connected")
                self._connection_state = ConnectionState.CONNECTED
                self.connected_event.set()
            return True

        # Another thread is already creating a connection: wait for its result.
        if self._connection_state == ConnectionState.CONNECTING:
            conn_id = self._connection_id
            if (
                self._connecting_conn_id == conn_id
                and self._connecting_since > 0
                and (now - self._connecting_since) > max(self.config.connection_timeout * 2, 10.0)
            ):
                # The in-flight attempt has been CONNECTING for too long;
                # supersede it with a new connection id.
                log_warning(
                    f"[conn:{conn_id}] Stale CONNECTING detected "
                    f"(elapsed={now - self._connecting_since:.1f}s), restarting connection"
                )
                need_cleanup = True
                need_start = True
                self._connection_id += 1
                conn_id = self._connection_id
                log_info(f"[conn:{conn_id}] Creating new connection: state=connecting(stale), ws_open={ws_open}")
                self._connection_state = ConnectionState.CONNECTING
                self._connecting_since = now
                self._connecting_conn_id = conn_id
                self.connected_event.clear()
            else:
                log_debug(f"[conn:{conn_id}] Another thread is connecting, waiting...")
        else:
            # Only create a new connection when the socket is really unusable.
            need_cleanup = True
            need_start = True
            self._connection_id += 1
            conn_id = self._connection_id
            # Record why a new connection is needed.
            log_info(f"[conn:{conn_id}] Creating new connection: state={self._connection_state.value}, ws_open={ws_open}")
            self._connection_state = ConnectionState.CONNECTING
            self._connecting_since = now
            self._connecting_conn_id = conn_id
            self.connected_event.clear()

    # Blocking work is done outside the lock.
    if need_cleanup:
        self._cleanup_old_connection_unlocked()

    if need_start:
        self.ws_url = self._build_websocket_url()
        log_debug(f"[conn:{conn_id}] Connecting to WebSocket URL: {self.ws_url}")

        # Record the connection attempt in the dedicated WebSocket log.
        ws_logger = get_ws_logger()
        ws_logger.log_connection_attempt(conn_id, self.ws_url, "new_connection")

        # Start WebSocket thread with asyncio loop
        self.ws_thread = threading.Thread(
            target=self._ws_handler,
            args=(conn_id,),
            daemon=True,
            name=f"WebSocketHandler-{conn_id}"
        )
        self.ws_thread.start()

    return self._wait_for_connection()
404
+
405
def _cleanup_old_connection_unlocked(self) -> None:
    """Clean up old connection. Called WITHOUT lock held to avoid blocking.

    Steps, in order: clear the helper-thread run flags, notify pending
    stream requests, snapshot the current loop/socket/thread under
    ``self.lock``, then (outside the lock) close the socket, stop the
    loop and join the thread. Each blocking wait is capped at 2 seconds
    and every failure in the teardown steps is deliberately ignored.
    """
    log_info(f"[cleanup] 开始清理旧连接状态...")

    # Signal the helper threads to stop.
    self._cleanup_running = False
    self._health_check_running = False

    # Notify every waiting stream request (old state is cleared before a
    # new connection is created).
    pending_count = self.get_pending_stream_count()  # thread-safe accessor, per the author's note
    if pending_count > 0:
        log_warning(f"[cleanup] 通知 {pending_count} 个等待中的 stream 请求...")
        self._notify_pending_stream_requests("创建新连接,旧请求已取消")

    # Snapshot the old references while holding the lock.
    with self.lock:
        old_loop = self._loop
        old_ws = self.ws
        old_thread = self.ws_thread
        # Note: the attributes are intentionally NOT cleared here; the new
        # connection overwrites them, which the author notes avoids a race.

    # Close the old WebSocket.
    if old_loop and old_ws:
        try:
            if old_loop.is_running():
                future = asyncio.run_coroutine_threadsafe(
                    self._graceful_close_ws(old_ws),
                    old_loop
                )
                try:
                    future.result(timeout=2.0)
                except Exception:
                    pass
        except Exception:
            pass

    # Stop the old event loop.
    if old_loop:
        try:
            if old_loop.is_running():
                old_loop.call_soon_threadsafe(old_loop.stop)
        except Exception:
            pass

    # Wait for the old thread to finish.
    if old_thread and old_thread.is_alive():
        try:
            old_thread.join(timeout=2.0)
        except Exception:
            pass
456
+
457
+ async def _graceful_close_ws(self, ws) -> None:
458
+ """Gracefully close WebSocket connection."""
459
+ if ws is None:
460
+ return
461
+ try:
462
+ await asyncio.wait_for(ws.close(), timeout=1.0)
463
+ except asyncio.TimeoutError:
464
+ pass
465
+ except Exception:
466
+ pass
467
+
468
+ def _cleanup_old_connection(self) -> None:
469
+ """Clean up old connection (legacy method, calls unlocked version)."""
470
+ self._cleanup_old_connection_unlocked()
471
+
472
+ def _wait_for_connection(self) -> bool:
473
+ """Wait for connection to be established."""
474
+ result = self.connected_event.wait(timeout=self.config.connection_timeout)
475
+ if not result:
476
+ # 超时了,检查状态
477
+ with self.lock:
478
+ if self._connection_state == ConnectionState.CONNECTING:
479
+ # 连接超时,但线程可能还在运行,让它继续
480
+ # 下次调用会重新等待或创建新连接
481
+ log_debug("Connection wait timeout, connection still in progress")
482
+ if self._connecting_since > 0 and (time.time() - self._connecting_since) > self.config.connection_timeout:
483
+ log_warning("Connection appears stalled, marking DISCONNECTED to allow reconnect")
484
+ self._connection_state = ConnectionState.DISCONNECTED
485
+ self._connecting_since = 0.0
486
+ self._connecting_conn_id = 0
487
+ self.connected_event.clear()
488
+ return result
489
+
490
def stop_websocket_client(self) -> None:
    """Stop the WebSocket client and mark the connection DISCONNECTED.

    Sets the shutdown flag, stops the helper threads, closes the socket,
    stops the event loop and joins the handler thread (each blocking wait
    is capped at 2 seconds; teardown failures are ignored).

    The method is safe to call from the WebSocket handler thread itself
    (for example from a message-handler callback): in that case it neither
    blocks on the close future, which would stall the very loop that has
    to run it, nor joins the current thread, which ``Thread.join`` rejects
    with ``RuntimeError``.
    """
    self._shutdown_requested = True

    # Stop the stream-queue cleanup thread.
    self._stop_cleanup_thread()

    # Stop the health-check thread.
    self._stop_health_check_thread()

    # Snapshot the references once so a concurrent handler exit (which
    # resets ``self.ws``) cannot change them between a check and its use.
    loop = self._loop
    ws = self.ws
    ws_thread = self.ws_thread
    on_ws_thread = ws_thread is threading.current_thread()

    # Close the WebSocket.
    if loop and ws:
        try:
            if loop.is_running():
                future = asyncio.run_coroutine_threadsafe(
                    self._graceful_close_ws(ws),
                    loop
                )
                if not on_ws_thread:
                    try:
                        future.result(timeout=2.0)
                    except Exception:
                        pass
        except Exception:
            pass

    # Stop the event loop.
    if loop and loop.is_running():
        try:
            loop.call_soon_threadsafe(loop.stop)
        except Exception:
            pass

    # Wait for the handler thread, unless we ARE the handler thread.
    if ws_thread and ws_thread.is_alive() and not on_ws_thread:
        try:
            ws_thread.join(timeout=2.0)
        except RuntimeError:
            pass
    self.ws_thread = None

    self._set_connection_state(ConnectionState.DISCONNECTED)
527
+
528
def send_msg(self, msg: Union[str, Dict]) -> bool:
    """Send a message over the WebSocket, queueing it when it cannot be sent.

    Args:
        msg: A ready-made string, or a dict that is serialised with
            ``json.dumps``.

    Returns:
        True only when the message was handed to the socket. False when it
        was queued for later delivery (no connection, closed socket, loop
        not running, or a send error) or discarded because its UTF-8 size
        exceeds ``config.max_message_size``.
    """
    # Function-scope import: before Python 3.11 the futures timeout is not
    # the builtin TimeoutError, so it has to be named explicitly.
    from concurrent.futures import TimeoutError as FutureTimeoutError

    if not self._ensure_connection():
        return self._queue_message(msg)

    try:
        # Re-check the socket: it may have closed since _ensure_connection().
        if not self._is_ws_open():
            log_debug("WebSocket connection invalid, queueing message")
            # The state is intentionally not set to DISCONNECTED here; the
            # connection recovers on its own or via the health check.
            return self._queue_message(msg)

        message_str = json.dumps(msg) if not isinstance(msg, str) else msg

        # Check the size before sending; oversized messages are dropped.
        msg_size = len(message_str.encode('utf-8'))
        if msg_size > self.config.max_message_size:
            log_error(f"[conn:{self._connection_id}] ❌ 发送消息过大,已丢弃: {msg_size/1024/1024:.2f}MB > {self.config.max_message_size/1024/1024:.0f}MB 限制")
            # Record it in the dedicated WebSocket log.
            ws_logger = get_ws_logger()
            ws_logger.log_abnormal_data(
                conn_id=self._connection_id,
                data=None,
                error=f"发送消息大小 {msg_size/1024/1024:.2f}MB ({msg_size} bytes) 超过限制 {self.config.max_message_size/1024/1024:.0f}MB,已丢弃",
                data_type="oversized_send_discarded"
            )
            return False  # discarded, reported as a failure

        # Hand the send over to the connection's event loop.
        loop = self._loop
        if not (loop and loop.is_running()):
            return self._queue_message(msg)

        future = asyncio.run_coroutine_threadsafe(
            self._async_send(message_str),
            loop
        )
        try:
            future.result(timeout=5.0)
        except FutureTimeoutError:
            # Cancel the still-pending send before the message is queued
            # below; otherwise it could go out now AND again from the queue.
            if future.cancel():
                raise
            # cancel() failed, so the send finished in the meantime:
            # surface its real outcome instead of the timeout.
            future.result()
        return True

    except ConnectionClosed as e:
        log_debug(f"WebSocket connection closed during send: {e}")
        # The connection is gone; record it (reconnection is automatic).
        with self.lock:
            if self._connection_state == ConnectionState.CONNECTED:
                self._connection_state = ConnectionState.DISCONNECTED
                self.connected_event.clear()
        return self._queue_message(msg)
    except Exception as e:
        log_debug(f"Failed to send message: {e}")
        trace_id = msg.get("trace_id", "") if isinstance(msg, dict) else ""
        ErrorContext.publish(exceptions.SendMsgError(message=f"Error sending message: {e}", trace_id=trace_id))
        # A failed send does not necessarily mean the connection is down,
        # so the state is left untouched.
        return self._queue_message(msg)
581
+
582
+ async def _async_send(self, message: str) -> None:
583
+ """Async send message."""
584
+ if self._is_ws_open():
585
+ await self.ws.send(message)
586
+
587
+ def _ensure_connection(self) -> bool:
588
+ """Ensure WebSocket connection is established."""
589
+ # 快速路径:如果已连接且有效,直接返回
590
+ if self._is_ws_open():
591
+ # 只在状态是 DISCONNECTED 时修正为 CONNECTED
592
+ # 不要修改 CONNECTING 状态,避免干扰正在进行的连接
593
+ with self.lock:
594
+ if self._connection_state == ConnectionState.DISCONNECTED:
595
+ self._connection_state = ConnectionState.CONNECTED
596
+ self.connected_event.set()
597
+ return True
598
+
599
+ # 需要建立连接
600
+ retry_count = 0
601
+ while retry_count < self.config.send_retry_attempts:
602
+ if self.start_websocket_client():
603
+ return True
604
+
605
+ retry_count += 1
606
+ if retry_count < self.config.send_retry_attempts:
607
+ time.sleep(self.config.send_retry_delay)
608
+
609
+ log_error(f"Failed to establish connection after {self.config.send_retry_attempts} attempts")
610
+ return False
611
+
612
def _queue_message(self, msg: Union[str, Dict]) -> bool:
    """Queue a message so it can be sent once the connection is back.

    When the queue is full the oldest entry is dropped to make room.

    Args:
        msg: A string, or a dict that is serialised with ``json.dumps``.

    Returns:
        Always False: the message was not sent now (it was either queued
        or could not be queued, which is logged).
    """
    # Serialise first: an unserialisable message must neither raise out of
    # this bool-returning helper nor evict a valid queued message.
    try:
        message_str = json.dumps(msg) if not isinstance(msg, str) else msg
    except (TypeError, ValueError) as e:
        log_error(f"Failed to queue message: {e}")
        return False

    try:
        if self.queue.full():
            # Drop the oldest entry; another consumer may have emptied the
            # queue in between, which is fine.
            try:
                self.queue.get_nowait()
                self.queue.task_done()
            except queue.Empty:
                pass

        self.queue.put(message_str, timeout=1)
        log_debug("Message queued for later sending")
        return False

    except (queue.Full, queue.Empty) as e:
        log_error(f"Failed to queue message: {e}")
        return False
630
+
631
def _handle_reconnection(self) -> None:
    """Handle reconnection logic with exponential backoff.

    Returns immediately when the interpreter is finalizing, shutdown was
    requested, auto-reconnect is disabled, or another reconnection is
    already running (``_is_retrying``). Otherwise it loops until shutdown,
    calling ``start_websocket_client()`` and sleeping
    ``_current_reconnect_interval`` seconds between failed attempts; the
    interval is multiplied by ``config.reconnect_backoff_factor`` after
    each failure and capped at ``config.reconnect_max_interval``.
    ``config.max_retry_attempts > 0`` bounds the number of attempts.

    On exit ``_is_retrying`` is cleared and, unless connected, the state is
    set to DISCONNECTED.
    """
    # Do nothing while the interpreter is shutting down.
    import sys
    if hasattr(sys, 'is_finalizing') and sys.is_finalizing():
        log_debug("Interpreter is shutting down, skipping reconnection")
        self._shutdown_requested = True
        return

    if self._shutdown_requested:
        return

    if not self.config.auto_reconnect:
        log_debug("Auto-reconnect is disabled, skipping reconnection")
        return

    # The _is_retrying flag is tested and set under the lock so only one
    # reconnection loop runs at a time.
    with self.lock:
        if self._is_retrying:
            log_debug("Reconnection already in progress, skipping")
            return
        self._is_retrying = True
        # No RECONNECTING state is set here; start_websocket_client sets CONNECTING.

    reconnect_start_time = time.time()
    ws_logger = get_ws_logger()

    try:
        # A fresh reconnection sequence starts from the base interval.
        if self._reconnect_attempt_count == 0:
            self._current_reconnect_interval = self.config.reconnect_base_interval

        while not self._shutdown_requested:
            self._reconnect_attempt_count += 1

            # max_retry_attempts <= 0 means "retry without limit".
            if self.config.max_retry_attempts > 0 and self._reconnect_attempt_count > self.config.max_retry_attempts:
                log_error(f"Reconnection failed after {self.config.max_retry_attempts} attempts, giving up")
                # Record the reconnection failure.
                ws_logger.log_reconnect_fail(
                    conn_id=self._connection_id,
                    attempt=self._reconnect_attempt_count,
                    reason=f"达到最大重试次数 {self.config.max_retry_attempts}"
                )
                break

            # Record the start of this reconnection attempt.
            ws_logger.log_reconnect_start(
                conn_id=self._connection_id,
                attempt=self._reconnect_attempt_count,
                interval=self._current_reconnect_interval
            )

            # Log at info level only for the first and every tenth attempt.
            if self._reconnect_attempt_count == 1 or self._reconnect_attempt_count % 10 == 0:
                log_info(f"🔄 Reconnecting... attempt {self._reconnect_attempt_count} (interval: {self._current_reconnect_interval:.1f}s)")
            else:
                log_debug(f"Reconnecting attempt {self._reconnect_attempt_count}")

            if self.start_websocket_client():
                reconnect_duration = time.time() - reconnect_start_time
                log_info("✅ Reconnection successful!")

                # Record the successful reconnection.
                ws_logger.log_reconnect_success(
                    conn_id=self._connection_id,
                    attempt=self._reconnect_attempt_count,
                    duration=reconnect_duration,
                    pending_recovered=0  # waiting requests were already notified on disconnect
                )

                # Actively verify the connection is really usable.
                # NOTE(review): success is logged above before this check,
                # and a failed check retries without growing the interval.
                if not self._verify_connection_after_reconnect():
                    log_warning("⚠️ 重连后连接验证失败,继续重试...")
                    time.sleep(self._current_reconnect_interval)
                    continue

                # Run the post-reconnect system recovery check.
                self._perform_system_recovery_check()

                # Fire the connection-restored callback, if one is set.
                if self._on_reconnect_callback:
                    try:
                        log_info(f"[conn:{self._connection_id}] 触发连接恢复回调...")
                        self._on_reconnect_callback(
                            agent_id=self.agent_id,
                            server_url=self.server_url
                        )
                    except Exception as e:
                        log_error(f"[conn:{self._connection_id}] 连接恢复回调执行异常: {e}")

                self._reconnect_attempt_count = 0
                self._current_reconnect_interval = self.config.reconnect_base_interval
                return

            time.sleep(self._current_reconnect_interval)

            # Exponential backoff, capped at the configured maximum.
            self._current_reconnect_interval = min(
                self._current_reconnect_interval * self.config.reconnect_backoff_factor,
                self.config.reconnect_max_interval
            )

        # NOTE(review): also reached when the loop ends because shutdown was
        # requested, not only when the attempt limit is exhausted.
        if self.config.max_retry_attempts > 0:
            log_error(f"Reconnection failed after {self.config.max_retry_attempts} attempts")

    finally:
        self._is_retrying = False
        if self.connection_state != ConnectionState.CONNECTED:
            self._set_connection_state(ConnectionState.DISCONNECTED)
737
+
738
def _verify_connection_after_reconnect(self) -> bool:
    """Actively verify that a freshly re-established connection is usable.

    After a 0.2 s settling pause the following are checked in order,
    stopping at the first failure:

    1. the WebSocket reports open;
    2. the event loop exists and is running;
    3. ``connected_event`` is set;
    4. the connection state is CONNECTED.

    Returns:
        True when every check passes; False when one fails (a warning
        names it) or when the verification itself raises.
    """
    try:
        # Give the connection a moment to settle.
        time.sleep(0.2)

        # (predicate, failure message) pairs, evaluated lazily and in order.
        checks = (
            (
                lambda: self._is_ws_open(),
                lambda: "[验证] WebSocket 状态不是 OPEN",
            ),
            (
                lambda: self._loop is not None and self._loop.is_running(),
                lambda: "[验证] 事件循环未运行",
            ),
            (
                lambda: self.connected_event.is_set(),
                lambda: "[验证] connected_event 未设置",
            ),
            (
                lambda: self.connection_state == ConnectionState.CONNECTED,
                lambda: f"[验证] 连接状态不是 CONNECTED: {self.connection_state.value}",
            ),
        )
        for passes, failure_message in checks:
            if not passes():
                log_warning(failure_message())
                return False

        log_info("[验证] ✅ 连接验证通过")
        return True

    except Exception as e:
        log_error(f"[验证] 连接验证异常: {e}")
        return False
780
+
781
def _perform_system_recovery_check(self) -> None:
    """Run a post-reconnect health survey and restart missing helper threads.

    Collects the state of the WebSocket, the event loop, the message
    queue, the pending stream requests, the two helper threads and
    ``connected_event`` into a status dict, logs it through the WebSocket
    logger, and starts the cleanup / health-check thread when it is not
    alive. Overall health is HEALTHY only when the socket is open, the
    loop is running and ``connected_event`` is set. Any exception is
    logged and swallowed.
    """
    try:
        ws_logger = get_ws_logger()

        def label(ok, otherwise="FAILED"):
            return "OK" if ok else otherwise

        # Gather the raw observations first, in the original order.
        ws_open = self._is_ws_open()
        loop_running = self._loop is not None and self._loop.is_running()
        queue_size = self.queue.qsize() if self.queue else 0
        pending_streams = self.get_pending_stream_count()
        cleanup_running = self._cleanup_thread and self._cleanup_thread.is_alive()
        health_check_running = self._health_check_thread and self._health_check_thread.is_alive()
        connected_event_set = self.connected_event.is_set()

        # Helper-thread state does not count towards overall health.
        all_ok = (
            ws_open and
            loop_running and
            connected_event_set
        )

        recovery_status = {
            "ws_connection": label(ws_open),
            "event_loop": label(loop_running),
            "message_queue_size": queue_size,
            "message_queue": "OK",
            "pending_stream_requests": pending_streams,
            "cleanup_thread": label(cleanup_running, "RESTARTING"),
            "health_check_thread": label(health_check_running, "RESTARTING"),
            "connected_event": label(connected_event_set),
            "overall_status": "HEALTHY" if all_ok else "DEGRADED",
        }

        # Record the recovery status.
        ws_logger.log_system_recovery(
            conn_id=self._connection_id,
            recovery_status=recovery_status
        )

        if all_ok:
            log_info("✅ [系统恢复] 所有检查通过,系统已完全恢复")
        else:
            log_warning(f"⚠️ [系统恢复] 部分检查未通过: {recovery_status}")

        # Repair step: bring back any helper thread that is not alive.
        if not cleanup_running:
            log_info("🔧 重启清理线程...")
            self._start_cleanup_thread()

        if not health_check_running:
            log_info("🔧 重启健康检查线程...")
            self._start_health_check_thread()

    except Exception as e:
        log_error(f"❌ 系统恢复检查失败: {e}")
851
+
852
+ async def _process_queued_messages(self) -> None:
853
+ """Process messages that were queued during disconnection."""
854
+ try:
855
+ while not self.queue.empty():
856
+ try:
857
+ message = self.queue.get_nowait()
858
+ if self._is_ws_open():
859
+ await self.ws.send(message)
860
+ self.queue.task_done()
861
+ except queue.Empty:
862
+ break
863
+ except Exception as e:
864
+ log_error(f"Failed to send queued message: {e}")
865
+ break
866
+ except Exception as e:
867
+ log_error(f"Error processing queued messages: {e}")
868
+
869
def _cleanup_stale_stream_queues(self, owner_conn_id: int) -> None:
    """Thread body that periodically evicts stale stream-queue entries.

    Wakes every second so stop signals are honoured quickly, but only
    scans every 30 seconds. Entries of ``self.stream_queue_map`` older
    than 15 seconds are removed under ``self._stream_queue_lock``; after
    the lock is released a timeout error item is pushed to each removed
    entry's queue if that queue is empty and the entry has a loop.

    The loop exits when ``_cleanup_running`` is cleared, shutdown is
    requested, or ``self._connection_id`` no longer equals *owner_conn_id*.

    Args:
        owner_conn_id: Id of the connection this thread was started for.
    """
    log_info(f"[conn:{owner_conn_id}] 🧹 流队列清理线程已启动")
    cleanup_interval = 30
    last_cleanup_time = time.time()

    while self._cleanup_running and not self._shutdown_requested:
        try:
            # Short sleep so a stop signal is noticed quickly.
            time.sleep(1.0)

            # Exit if this thread's connection has been superseded.
            if self._connection_id != owner_conn_id:
                log_debug(f"[conn:{owner_conn_id}] 清理线程: 连接已被取代,退出")
                break

            if not self._cleanup_running or self._shutdown_requested:
                break

            # Skip until the cleanup interval has elapsed.
            now = time.time()
            if now - last_cleanup_time < cleanup_interval:
                continue
            last_cleanup_time = now

            stale_requests = []

            # Iterate and mutate the map only while holding its lock.
            with self._stream_queue_lock:
                for request_id, entry in list(self.stream_queue_map.items()):
                    # Entries without a timestamp get age 0 and are kept.
                    timestamp = entry.get("timestamp", now)
                    age = now - timestamp

                    if age > 15.0:
                        stale_requests.append({
                            "request_id": request_id,
                            "age": age,
                            "receiver": entry.get("receiver", "unknown"),
                            "entry": entry  # keep the full entry for the notification below
                        })

                # Remove the stale requests while still inside the lock.
                for req in stale_requests:
                    self.stream_queue_map.pop(req["request_id"], None)
                remaining_count = len(self.stream_queue_map)

            # Notifications are sent after the lock has been released.
            if stale_requests:
                log_info(f"🧹 发现 {len(stale_requests)} 个过期流请求,开始清理...")

                for req in stale_requests:
                    request_id = req["request_id"]
                    queue_entry = req["entry"]

                    log_error(f"⚠️ 清理过期流请求: request_id={request_id[:8]}... "
                              f"receiver={req['receiver']} 等待时间={req['age']:.1f}s")

                    try:
                        temp_queue = queue_entry["queue"]
                        loop = queue_entry.get("loop")

                        # Only an empty queue gets the timeout marker, and
                        # it is scheduled on the entry's own loop.
                        if temp_queue.empty() and loop:
                            error_data = {"error": "timeout", "message": "流创建超时"}
                            loop.call_soon_threadsafe(temp_queue.put_nowait, error_data)
                    except Exception as e:
                        log_debug(f"清理队列时异常(可忽略): {e}")

                log_info(f"✅ 清理完成,剩余等待请求: {remaining_count}")

        except Exception as e:
            log_error(f"❌ 流队列清理异常: {e}")

    log_info(f"[conn:{owner_conn_id}] 🧹 流队列清理线程已停止")
942
+
943
+ def _start_cleanup_thread(self) -> None:
944
+ """启动清理线程"""
945
+ # 如果旧线程还在运行,先等待它停止
946
+ if self._cleanup_thread and self._cleanup_thread.is_alive():
947
+ if self._cleanup_running:
948
+ return # 线程正常运行中,不需要重启
949
+ # 等待旧线程结束
950
+ self._cleanup_thread.join(timeout=2.0)
951
+
952
+ self._cleanup_running = True
953
+
954
+ # 传递当前连接 ID
955
+ current_conn_id = self._connection_id
956
+
957
+ self._cleanup_thread = threading.Thread(
958
+ target=self._cleanup_stale_stream_queues,
959
+ args=(current_conn_id,),
960
+ daemon=True,
961
+ name=f"StreamQueueCleanup-{current_conn_id}"
962
+ )
963
+ self._cleanup_thread.start()
964
+ log_debug(f"[conn:{current_conn_id}] 流队列清理线程已启动")
965
+
966
+ def _stop_cleanup_thread(self) -> None:
967
+ """停止清理线程"""
968
+ if not self._cleanup_thread:
969
+ return
970
+
971
+ self._cleanup_running = False
972
+
973
+ if self._cleanup_thread.is_alive():
974
+ self._cleanup_thread.join(timeout=2.0)
975
+
976
+ self._cleanup_thread = None
977
+ log_debug("流队列清理线程已停止")
978
+
979
+ def _start_health_check_thread(self) -> None:
980
+ """启动连接健康检查线程"""
981
+ # 如果旧线程还在运行,先等待它停止
982
+ if self._health_check_thread and self._health_check_thread.is_alive():
983
+ if self._health_check_running:
984
+ return # 线程正常运行中,不需要重启
985
+ # 等待旧线程结束
986
+ self._health_check_thread.join(timeout=2.0)
987
+
988
+ self._health_check_running = True
989
+ self._last_pong_time = time.time()
990
+
991
+ # 传递当前连接 ID,让线程知道它属于哪个连接
992
+ current_conn_id = self._connection_id
993
+
994
+ self._health_check_thread = threading.Thread(
995
+ target=self._health_check_loop,
996
+ args=(current_conn_id,),
997
+ daemon=True,
998
+ name=f"WebSocketHealthCheck-{current_conn_id}"
999
+ )
1000
+ self._health_check_thread.start()
1001
+ log_debug(f"[conn:{current_conn_id}] 连接健康检查线程已启动")
1002
+
1003
def _stop_health_check_thread(self) -> None:
    """Stop the connection health-check thread.

    Always clears the run flag, joins a live thread for up to 2 seconds,
    drops the reference and logs the stop.
    """
    self._health_check_running = False

    worker = self._health_check_thread
    if worker and worker.is_alive():
        worker.join(timeout=2.0)

    self._health_check_thread = None
    log_debug("连接健康检查线程已停止")
1012
+
1013
def _health_check_loop(self, owner_conn_id: int) -> None:
    """Thread body that watches the connection and triggers reconnection.

    Wakes every second so stop signals are honoured quickly, but only
    inspects the connection every ``config.ping_interval * 2`` seconds.
    When the state is DISCONNECTED, or the socket is no longer open,
    pending stream requests are notified and ``_handle_reconnection`` is
    started on a new daemon thread (unless a retry is already running).

    Per the original author's note, the websockets library handles
    ping/pong itself and closes unresponsive connections, so only the
    socket state is checked here, not pong timeouts.

    The loop exits when ``_health_check_running`` is cleared, shutdown is
    requested, or ``self._connection_id`` no longer equals *owner_conn_id*.

    Args:
        owner_conn_id: Id of the connection this thread was started for.
    """
    # Shortened interval (was ping_interval * 3) to notice problems sooner.
    # The author's note says this gives one check every 6 seconds, which
    # presumably assumes ping_interval == 3 — TODO confirm.
    check_interval = self.config.ping_interval * 2
    ws_logger = get_ws_logger()

    log_debug(f"[conn:{owner_conn_id}] 健康检查线程启动: 检查间隔={check_interval}s")

    last_check_time = time.time()

    while self._health_check_running and not self._shutdown_requested:
        try:
            # Short sleep so a stop signal is noticed quickly.
            time.sleep(1.0)

            # Exit if this thread's connection has been superseded (keeps
            # old threads from lingering).
            if self._connection_id != owner_conn_id:
                log_debug(f"[conn:{owner_conn_id}] 健康检查线程: 连接已被取代 (当前: {self._connection_id}),退出")
                break

            if not self._health_check_running or self._shutdown_requested:
                break

            # Skip until the check interval has elapsed.
            now = time.time()
            if now - last_check_time < check_interval:
                continue
            last_check_time = now

            # Re-check the connection id right before inspecting state.
            if self._connection_id != owner_conn_id:
                log_debug(f"[conn:{owner_conn_id}] 健康检查线程: 连接已被取代,退出")
                break

            # Snapshot the current status.
            ws_open = self._is_ws_open()
            conn_state = self.connection_state.value

            # Case 1: the recorded state is DISCONNECTED.
            if self.connection_state == ConnectionState.DISCONNECTED:
                log_debug(f"[conn:{owner_conn_id}] 健康检查: 检测到连接状态为 DISCONNECTED")
                # The health-check log is written only when a reconnect is triggered.
                ws_logger.log_health_check(
                    conn_id=owner_conn_id,
                    ws_open=ws_open,
                    connection_state=conn_state,
                    action="trigger_reconnect_state_disconnected"
                )
                # Notify all waiting requests before triggering the reconnect.
                self._notify_pending_stream_requests("健康检查检测到连接断开")
                if not self._is_retrying:
                    threading.Thread(target=self._handle_reconnection, daemon=True).start()
                continue

            # Case 2: the state looks fine but the socket is not open.
            if not ws_open:
                log_debug(f"[conn:{owner_conn_id}] 健康检查: WebSocket 连接已关闭")
                # The health-check log is written only when a reconnect is triggered.
                ws_logger.log_health_check(
                    conn_id=owner_conn_id,
                    ws_open=ws_open,
                    connection_state=conn_state,
                    action="trigger_reconnect_ws_closed"
                )
                # Notify all waiting requests before triggering the reconnect.
                self._notify_pending_stream_requests("健康检查检测到WebSocket关闭")
                self._set_connection_state(ConnectionState.DISCONNECTED)
                if not self._is_retrying:
                    threading.Thread(target=self._handle_reconnection, daemon=True).start()
                continue

            # Healthy: refresh the pong timestamp (statistics only; it is
            # not used to decide a disconnect). Nothing is logged here to
            # keep the log volume down.
            self._last_pong_time = time.time()

        except Exception as e:
            log_error(f"[conn:{owner_conn_id}] 健康检查异常: {e}")

    log_debug(f"[conn:{owner_conn_id}] 健康检查线程已退出")
1096
+
1097
def _ws_handler(self, conn_id: int) -> None:
    """WebSocket handler thread function with asyncio loop.

    Creates a new event loop for this thread, publishes it as
    ``self._loop`` and runs ``_ws_connect_and_receive(conn_id)`` to
    completion. Exceptions are logged, never propagated; errors whose
    text mentions interpreter shutdown additionally set
    ``_shutdown_requested`` so no reconnect is attempted.

    On exit the connection is marked DISCONNECTED and ``self.ws`` is
    cleared, but only if *conn_id* is still the current connection;
    then the loop's remaining tasks are cancelled and the loop is closed.

    Args:
        conn_id: Id of the connection this handler thread serves.
    """
    loop = None
    try:
        # Do nothing while the interpreter is shutting down.
        import sys
        if hasattr(sys, 'is_finalizing') and sys.is_finalizing():
            log_debug(f"[conn:{conn_id}] Interpreter is shutting down, skipping connection")
            self._shutdown_requested = True
            return

        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        self._loop = loop

        loop.run_until_complete(self._ws_connect_and_receive(conn_id))

    except RuntimeError as e:
        error_str = str(e).lower()
        # Detect errors caused by interpreter shutdown.
        if "interpreter shutdown" in error_str or "cannot schedule" in error_str:
            log_warning(f"[conn:{conn_id}] Interpreter shutting down, stopping reconnection")
            self._shutdown_requested = True  # prevents reconnection
        else:
            log_debug(f"[conn:{conn_id}] WebSocket handler RuntimeError: {e}")
    except Exception as e:
        error_str = str(e).lower()
        # Interpreter-shutdown errors can also surface as other exception types.
        if "interpreter shutdown" in error_str or "cannot schedule" in error_str:
            log_warning(f"[conn:{conn_id}] Interpreter shutting down, stopping reconnection")
            self._shutdown_requested = True
        else:
            log_debug(f"[conn:{conn_id}] WebSocket handler error: {e}")
    finally:
        # Only the current connection may set the DISCONNECTED state; a
        # superseded handler must not clobber its successor's state.
        with self.lock:
            if self._connection_id == conn_id:
                log_debug(f"[conn:{conn_id}] Handler exiting, setting DISCONNECTED")
                self._connection_state = ConnectionState.DISCONNECTED
                self._connecting_since = 0.0
                self._connecting_conn_id = 0
                self.connected_event.clear()
                self.ws = None
            else:
                log_debug(f"[conn:{conn_id}] Handler exiting, but superseded by conn:{self._connection_id}")

        # Shut the event loop down safely.
        if loop and not loop.is_closed():
            try:
                # Tasks can only be cancelled safely while the loop is not running.
                if not loop.is_running():
                    # Cancel every pending task.
                    pending = asyncio.all_tasks(loop)
                    for task in pending:
                        task.cancel()

                    # Wait for the cancellations to complete.
                    if pending:
                        loop.run_until_complete(
                            asyncio.gather(*pending, return_exceptions=True)
                        )

                # Close the loop.
                if not loop.is_closed():
                    loop.close()
            except Exception:
                pass
1164
+
1165
+ async def _ws_connect_and_receive(self, conn_id: int) -> None:
1166
+ """Async WebSocket connection and message receiving loop."""
1167
+ ssl_context = None
1168
+ if self.ws_url and self.ws_url.startswith("wss://"):
1169
+ ssl_context = ssl.create_default_context()
1170
+ ssl_context.check_hostname = False
1171
+ ssl_context.verify_mode = ssl.CERT_NONE
1172
+
1173
+ # 准备代理配置(localhost 永远直连,避免全局代理/VPN 劫持)
1174
+ use_proxy = self._get_use_system_proxy() and (not is_local_url(self.ws_url))
1175
+ extra_headers = {}
1176
+ saved_proxy_env = None
1177
+
1178
+ try:
1179
+ # websockets库通过环境变量支持代理,但我们可以通过extra_headers传递代理信息
1180
+ # 如果不使用代理,确保不会使用环境变量中的代理设置
1181
+ import os
1182
+ import platform
1183
+ if not use_proxy:
1184
+ # 临时清除代理环境变量(只影响本次握手),确保 localhost 不会走代理
1185
+ saved_proxy_env = pop_proxy_env()
1186
+
1187
+ # 准备 websockets.connect 参数
1188
+ # 注意:websockets 14.2+ 在某些平台(macOS/Darwin)上不支持 proxy 参数
1189
+ # 会抛出 "BaseEventLoop.create_connection() got an unexpected keyword argument 'proxy'"
1190
+ ws_connect_kwargs = {
1191
+ "ssl": ssl_context,
1192
+ "open_timeout": self.config.connection_timeout,
1193
+ "ping_interval": self.config.ping_interval,
1194
+ "ping_timeout": self.config.ping_interval * 10,
1195
+ "close_timeout": 5,
1196
+ "max_size": None, # ✅ 禁用协议层大小限制,在应用层处理超大消息
1197
+ "compression": "deflate", # ✅ 启用压缩,与服务器协商压缩扩展
1198
+ }
1199
+
1200
+ # macOS (Darwin) 上 websockets 14.2+ 不支持 proxy 参数
1201
+ # 其他平台显式禁用代理(配合环境变量清除)
1202
+ if platform.system() != "Darwin":
1203
+ ws_connect_kwargs["proxy"] = None
1204
+
1205
+ async with websockets.connect(
1206
+ self.ws_url,
1207
+ **ws_connect_kwargs
1208
+ ) as ws:
1209
+ # 连接建立后立即恢复代理环境变量(避免影响进程内其他请求)
1210
+ if saved_proxy_env:
1211
+ restore_proxy_env(saved_proxy_env)
1212
+ saved_proxy_env = None
1213
+ # 检查连接ID是否仍然有效(防止旧连接继续处理)
1214
+ with self.lock:
1215
+ if self._connection_id != conn_id:
1216
+ log_debug(f"[conn:{conn_id}] Connection superseded by conn:{self._connection_id}, closing")
1217
+ # ✅ 记录连接被取代到专用日志
1218
+ ws_logger = get_ws_logger()
1219
+ ws_logger.log_connection_superseded(conn_id, self._connection_id, "_ws_connect_and_receive:after_connect")
1220
+ return
1221
+
1222
+ self.ws = ws
1223
+
1224
+ # 连接成功
1225
+ log_info(f"[conn:{conn_id}] WebSocket connection established")
1226
+ self._set_connection_state(ConnectionState.CONNECTED)
1227
+ with self.lock:
1228
+ self._is_retrying = False
1229
+ self._reconnect_attempt_count = 0
1230
+ self._current_reconnect_interval = self.config.reconnect_base_interval
1231
+ self._last_pong_time = time.time()
1232
+
1233
+ # ✅ 记录连接建立到专用日志
1234
+ ws_logger = get_ws_logger()
1235
+ ws_logger.log_connection_established(
1236
+ conn_id=conn_id,
1237
+ ws_url=self.ws_url,
1238
+ extra_info={
1239
+ "agent_id": self.agent_id,
1240
+ "ping_interval": self.config.ping_interval,
1241
+ "has_handler": self.message_handler is not None
1242
+ }
1243
+ )
1244
+
1245
+ # 启动辅助线程(异常不影响主流程)
1246
+ try:
1247
+ self._start_cleanup_thread()
1248
+ ws_logger.log_helper_thread(conn_id, "cleanup", "started")
1249
+ except Exception as e:
1250
+ log_error(f"[conn:{conn_id}] 启动清理线程失败: {e}")
1251
+ ws_logger.log_helper_thread(conn_id, "cleanup", "start_failed", success=False, error=str(e))
1252
+
1253
+ try:
1254
+ self._start_health_check_thread()
1255
+ ws_logger.log_helper_thread(conn_id, "health_check", "started")
1256
+ except Exception as e:
1257
+ log_error(f"[conn:{conn_id}] 启动健康检查线程失败: {e}")
1258
+ ws_logger.log_helper_thread(conn_id, "health_check", "start_failed", success=False, error=str(e))
1259
+
1260
+ # 调用消息处理器的 on_open
1261
+ if self.message_handler and hasattr(self.message_handler, "on_open"):
1262
+ try:
1263
+ self.message_handler.on_open(ws)
1264
+ ws_logger.log_on_open_callback(
1265
+ conn_id=conn_id,
1266
+ success=True,
1267
+ handler_type=type(self.message_handler).__name__
1268
+ )
1269
+ except Exception as e:
1270
+ log_exception(f"[conn:{conn_id}] Error in message handler on_open: {e}")
1271
+ ws_logger.log_on_open_callback(
1272
+ conn_id=conn_id,
1273
+ success=False,
1274
+ error=str(e),
1275
+ handler_type=type(self.message_handler).__name__
1276
+ )
1277
+
1278
+ # 处理队列中的消息
1279
+ await self._process_queued_messages()
1280
+
1281
+ # 消息接收循环
1282
+ loop_start_time = time.time()
1283
+ messages_received = 0
1284
+ last_stats_time = time.time()
1285
+ stats_interval = 60.0 # 每60秒记录一次统计
1286
+
1287
+ # ✅ 新增:记录最近的消息类型(用于诊断)
1288
+ recent_msg_types = [] # 保存最近20条消息的类型
1289
+ max_recent = 20
1290
+
1291
+ # ✅ 新增:追踪消息大小
1292
+ max_msg_size = 0 # 最大消息大小
1293
+ total_bytes = 0 # 总字节数
1294
+ large_msg_count = 0 # 大消息计数(>100KB)
1295
+
1296
+ # ✅ 修改:使用 while True + recv() 代替 async for,以便捕获单条消息的协议错误
1297
+ protocol_error_count = 0 # RSV 位错误计数(用于日志)
1298
+
1299
+ while True:
1300
+ # 检查连接是否仍然有效
1301
+ if self._connection_id != conn_id:
1302
+ log_debug(f"[conn:{conn_id}] Connection superseded, exiting message loop")
1303
+ ws_logger.log_connection_superseded(conn_id, self._connection_id, "message_loop")
1304
+ ws_logger.log_message_loop_exit(
1305
+ conn_id=conn_id,
1306
+ reason="connection_superseded",
1307
+ messages_received=messages_received,
1308
+ duration=time.time() - loop_start_time
1309
+ )
1310
+ return
1311
+
1312
+ # 检查连接状态(websockets 15.x 使用 state 而不是 closed)
1313
+ if ws.state != WsState.OPEN:
1314
+ log_debug(f"[conn:{conn_id}] WebSocket connection not open (state={ws.state}), exiting message loop")
1315
+ break
1316
+
1317
+ try:
1318
+ # ✅ 使用 recv() 接收消息,可以在这里捕获单条消息的错误
1319
+ message = await ws.recv()
1320
+ protocol_error_count = 0 # 成功接收,重置错误计数
1321
+
1322
+ except ProtocolError as e:
1323
+ error_str = str(e).lower()
1324
+ # ✅ 检查是否是 RSV 位错误
1325
+ if "reserved bits" in error_str or "rsv" in error_str:
1326
+ protocol_error_count += 1
1327
+ log_warning(f"[conn:{conn_id}] ⚠️ RSV 位错误 (第 {protocol_error_count} 次): {e}")
1328
+ ws_logger.log_abnormal_data(
1329
+ conn_id=conn_id,
1330
+ data=None,
1331
+ error=f"RSV位错误: {e}",
1332
+ data_type="rsv_bit_error"
1333
+ )
1334
+
1335
+ # ✅ RSV 位错误时,websockets 库已经发送了关闭帧,连接无法继续
1336
+ # 抛出 ConnectionClosedError 让外层统一处理(正确清理资源后重连)
1337
+ log_info(f"[conn:{conn_id}] RSV 位错误导致连接关闭,触发快速重连")
1338
+ from websockets.frames import Close
1339
+ # 创建一个带有清晰原因的 ConnectionClosedError
1340
+ raise ConnectionClosedError(
1341
+ Close(1006, f"RSV位错误: {str(e)[:80]}"),
1342
+ None
1343
+ )
1344
+ else:
1345
+ # 其他协议错误,向上抛出
1346
+ raise
1347
+
1348
+ except ConnectionClosed:
1349
+ # 连接关闭,退出循环让外层处理
1350
+ raise
1351
+
1352
+ try:
1353
+ self._last_pong_time = time.time()
1354
+ self._set_connection_state(ConnectionState.CONNECTED)
1355
+ messages_received += 1
1356
+
1357
+ # ✅ 新增:追踪消息大小
1358
+ msg_size = len(message) if message else 0
1359
+ total_bytes += msg_size
1360
+ if msg_size > max_msg_size:
1361
+ max_msg_size = msg_size
1362
+
1363
+ # ✅ 应用层消息大小检查:超过阈值直接丢弃,不影响WebSocket连接
1364
+ if msg_size > self.config.max_message_size:
1365
+ large_msg_count += 1
1366
+ log_error(f"[conn:{conn_id}] ❌ 收到超大消息,已丢弃: {msg_size/1024/1024:.1f}MB > {self.config.max_message_size/1024/1024:.0f}MB 限制")
1367
+ # 记录到专用日志(只记录大小,不记录内容)
1368
+ ws_logger.log_abnormal_data(
1369
+ conn_id=conn_id,
1370
+ data=None,
1371
+ error=f"消息大小 {msg_size/1024/1024:.2f}MB ({msg_size} bytes) 超过限制 {self.config.max_message_size/1024/1024:.0f}MB,已丢弃",
1372
+ data_type="oversized_message_discarded"
1373
+ )
1374
+ continue # ✅ 丢弃消息,继续处理下一条,不断开连接
1375
+
1376
+ if msg_size > 1 * 1024 * 1024: # >1MB
1377
+ large_msg_count += 1
1378
+ log_warning(f"[conn:{conn_id}] ⚠️ 收到大消息: {msg_size/1024/1024:.1f}MB")
1379
+
1380
+ # 定期记录消息统计(每60秒)
1381
+ now = time.time()
1382
+ if now - last_stats_time >= stats_interval:
1383
+ interval_time = now - last_stats_time
1384
+ avg_msg_size = total_bytes / messages_received if messages_received > 0 else 0
1385
+ throughput_kb = (total_bytes / 1024) / interval_time # KB/s
1386
+
1387
+ # ✅ 检测异常流量
1388
+ if throughput_kb > 10000: # >10MB/s
1389
+ log_error(f"[conn:{conn_id}] ⚠️ 异常高流量: {throughput_kb:.0f}KB/s, 平均消息大小: {avg_msg_size/1024:.1f}KB")
1390
+
1391
+ ws_logger.log_message_received(
1392
+ conn_id=conn_id,
1393
+ message_type="stats",
1394
+ message_size=0,
1395
+ cmd=None,
1396
+ extra_info={
1397
+ "total_messages": messages_received,
1398
+ "interval_seconds": int(interval_time),
1399
+ "loop_duration": int(now - loop_start_time),
1400
+ "avg_msg_size_kb": f"{avg_msg_size/1024:.1f}",
1401
+ "throughput_kb_s": f"{throughput_kb:.0f}",
1402
+ "total_bytes_mb": f"{total_bytes/1024/1024:.1f}",
1403
+ "large_msg_count": large_msg_count
1404
+ }
1405
+ )
1406
+ last_stats_time = now
1407
+
1408
+ if isinstance(message, bytes):
1409
+ # 二进制消息,尝试解码
1410
+ try:
1411
+ message = message.decode('utf-8')
1412
+ except UnicodeDecodeError as e:
1413
+ # ✅ 记录异常数据到专用日志
1414
+ ws_logger.log_abnormal_data(
1415
+ conn_id=conn_id,
1416
+ data=message,
1417
+ error=f"二进制消息解码失败: {e}",
1418
+ data_type="binary"
1419
+ )
1420
+ log_warning(f"[conn:{conn_id}] Failed to decode binary message (discarded): {e}")
1421
+ continue
1422
+
1423
+ # ✅ 新增:提取并记录消息类型
1424
+ msg_cmd = "unknown"
1425
+ try:
1426
+ msg_json = json.loads(message) if isinstance(message, str) else {}
1427
+ msg_cmd = msg_json.get("cmd", "no_cmd")
1428
+ except Exception:
1429
+ msg_cmd = "parse_error"
1430
+
1431
+ recent_msg_types.append(msg_cmd)
1432
+ if len(recent_msg_types) > max_recent:
1433
+ recent_msg_types.pop(0)
1434
+
1435
+ # 处理消息
1436
+ if self.message_handler and hasattr(self.message_handler, "on_message"):
1437
+ try:
1438
+ self.message_handler.on_message(ws, message)
1439
+ except Exception as e:
1440
+ # ✅ 记录消息处理错误到专用日志
1441
+ ws_logger.log_message_error(
1442
+ conn_id=conn_id,
1443
+ message=message,
1444
+ error=str(e)
1445
+ )
1446
+ log_exception(f"[conn:{conn_id}] Error in message handler: {e}")
1447
+
1448
+ except Exception as e:
1449
+ # ✅ 记录异常数据到专用日志
1450
+ ws_logger.log_abnormal_data(
1451
+ conn_id=conn_id,
1452
+ data=message if 'message' in locals() else None,
1453
+ error=f"消息处理异常: {e}",
1454
+ data_type="unknown"
1455
+ )
1456
+ log_warning(f"[conn:{conn_id}] Error processing message (discarded): {e}")
1457
+ continue
1458
+
1459
+ # while True 循环正常结束(ws.state != OPEN)
1460
+ log_debug(f"[conn:{conn_id}] WebSocket message loop ended normally")
1461
+ ws_logger.log_message_loop_exit(
1462
+ conn_id=conn_id,
1463
+ reason="loop_ended_normally",
1464
+ messages_received=messages_received,
1465
+ duration=time.time() - loop_start_time
1466
+ )
1467
+ self._handle_connection_close(conn_id, None, "connection ended")
1468
+
1469
+ except ConnectionClosed as e:
1470
+ # ✅ 增强日志:记录更多诊断信息
1471
+ connection_duration = time.time() - loop_start_time if 'loop_start_time' in locals() else 0
1472
+ msgs_count = messages_received if 'messages_received' in locals() else 0
1473
+ recent_types = recent_msg_types if 'recent_msg_types' in locals() else []
1474
+ max_size = max_msg_size if 'max_msg_size' in locals() else 0
1475
+ total = total_bytes if 'total_bytes' in locals() else 0
1476
+ large_count = large_msg_count if 'large_msg_count' in locals() else 0
1477
+
1478
+ log_warning(f"[conn:{conn_id}] WebSocket connection closed: code={e.code}, reason={e.reason}, "
1479
+ f"duration={connection_duration:.1f}s, messages={msgs_count}, max_size={max_size/1024:.1f}KB")
1480
+
1481
+ # ✅ 记录连接关闭异常到专用日志(包含诊断信息)
1482
+ ws_logger = get_ws_logger()
1483
+ ws_logger.log_connection_closed(
1484
+ conn_id=conn_id,
1485
+ code=e.code,
1486
+ reason=e.reason or "(empty)",
1487
+ connection_duration=connection_duration,
1488
+ messages_received=msgs_count,
1489
+ last_pong_time=self._last_pong_time,
1490
+ extra_info={
1491
+ "ws_url": self.ws_url[:80] if self.ws_url else "N/A",
1492
+ "agent_id": self.agent_id,
1493
+ "code_meaning": self._get_close_code_meaning(e.code),
1494
+ "recent_msg_types": recent_types[-10:] if recent_types else [],
1495
+ "max_msg_size_kb": f"{max_size/1024:.1f}",
1496
+ "total_bytes_kb": f"{total/1024:.1f}",
1497
+ "large_msg_count": large_count,
1498
+ "exception_type": type(e).__name__,
1499
+ "exception_detail": str(e)[:200] if str(e) else "(none)"
1500
+ }
1501
+ )
1502
+ self._handle_connection_close(conn_id, e.code, e.reason)
1503
+
1504
+ except asyncio.TimeoutError:
1505
+ log_warning(f"[conn:{conn_id}] WebSocket connection timeout")
1506
+ self._handle_connection_close(conn_id, None, "timeout")
1507
+
1508
+ except PayloadTooBig as e:
1509
+ if saved_proxy_env:
1510
+ restore_proxy_env(saved_proxy_env)
1511
+ saved_proxy_env = None
1512
+ # ✅ 备用处理:max_size=None时此异常不应触发,保留作为防御性编程
1513
+ log_error(f"[conn:{conn_id}] ❌ 收到的消息太大,超过限制: {e}")
1514
+ ws_logger = get_ws_logger()
1515
+ ws_logger.log_abnormal_data(
1516
+ conn_id=conn_id,
1517
+ data=None,
1518
+ error=f"PayloadTooBig: {e}",
1519
+ data_type="payload_too_big"
1520
+ )
1521
+ self._handle_connection_close(conn_id, None, f"消息太大: {e}")
1522
+
1523
+ except ProtocolError as e:
1524
+ if saved_proxy_env:
1525
+ restore_proxy_env(saved_proxy_env)
1526
+ saved_proxy_env = None
1527
+ # ✅ 协议错误(如无效的帧、RSV位错误等)
1528
+ log_error(f"[conn:{conn_id}] ❌ WebSocket 协议错误: {e}")
1529
+ ws_logger = get_ws_logger()
1530
+ ws_logger.log_abnormal_data(
1531
+ conn_id=conn_id,
1532
+ data=None,
1533
+ error=f"ProtocolError: {e}",
1534
+ data_type="protocol_error"
1535
+ )
1536
+ # 协议错误通常表示服务器行为异常,增加重连间隔
1537
+ self._current_reconnect_interval = min(
1538
+ self._current_reconnect_interval * 3,
1539
+ self.config.reconnect_max_interval
1540
+ )
1541
+ self._handle_connection_close(conn_id, None, f"协议错误: {e}")
1542
+
1543
+ except InvalidMessage as e:
1544
+ if saved_proxy_env:
1545
+ restore_proxy_env(saved_proxy_env)
1546
+ saved_proxy_env = None
1547
+ # ✅ 无效的消息格式
1548
+ log_error(f"[conn:{conn_id}] ❌ 无效的 WebSocket 消息: {e}")
1549
+ ws_logger = get_ws_logger()
1550
+ ws_logger.log_abnormal_data(
1551
+ conn_id=conn_id,
1552
+ data=None,
1553
+ error=f"InvalidMessage: {e}",
1554
+ data_type="invalid_message"
1555
+ )
1556
+ self._handle_connection_close(conn_id, None, f"无效消息: {e}")
1557
+
1558
+ except Exception as e:
1559
+ error_str = str(e)
1560
+
1561
+ # 检查是否为连接数限制错误
1562
+ is_rate_limit = (
1563
+ "400" in error_str or
1564
+ "超过连接数限制" in error_str or
1565
+ "connection limit" in error_str.lower()
1566
+ )
1567
+
1568
+ if is_rate_limit:
1569
+ current_time = time.time()
1570
+ if current_time - MessageClient._last_rate_limit_log_time > MessageClient._rate_limit_log_interval:
1571
+ MessageClient._last_rate_limit_log_time = current_time
1572
+ log_warning(f"[conn:{conn_id}] WebSocket rate limit: 超过连接数限制")
1573
+ self._current_reconnect_interval = min(
1574
+ self._current_reconnect_interval * 2,
1575
+ self.config.reconnect_max_interval
1576
+ )
1577
+ else:
1578
+ log_debug(f"[conn:{conn_id}] WebSocket connection error: {e}")
1579
+ # ✅ 记录异常到专用日志
1580
+ ws_logger = get_ws_logger()
1581
+ ws_logger.log_abnormal_data(
1582
+ conn_id=conn_id,
1583
+ data=None,
1584
+ error=f"WebSocket异常: {error_str}",
1585
+ data_type="exception"
1586
+ )
1587
+
1588
+ self._handle_connection_close(conn_id, None, str(e))
1589
+
1590
def _handle_connection_close(self, conn_id: int, code: Optional[int], reason: str, received_data: any = None) -> None:
    """Handle a WebSocket connection close event.

    Logs the disconnect, notifies every pending stream request, and - only
    when ``conn_id`` is still the current connection - marks the client as
    disconnected, invokes the disconnect callback, performs a partial reset
    for abnormal closes and schedules a reconnect.

    Args:
        conn_id: ID of the connection that closed; compared with
            ``self._connection_id`` to detect a superseded connection.
        code: WebSocket close code, or ``None`` when no code is available.
            ``1000`` is treated as a normal closure (no callback, no reconnect).
        reason: Close reason; used in log messages and in the notification
            sent to pending stream requests.
        received_data: Forwarded unchanged to ``ws_logger.log_disconnect``.
    """
    # NOTE(review): the `any` annotation on received_data is the builtin
    # function, not typing.Any - harmless at runtime but probably unintended.

    # Check whether this connection ID is still the current one.
    is_current_connection = False
    current_conn_id = 0
    with self.lock:
        current_conn_id = self._connection_id
        if self._connection_id != conn_id:
            log_warning(f"[conn:{conn_id}] 连接已被取代 (当前: {self._connection_id}),仍执行清理")
            # Deliberately no early return: logging and stream notification
            # below must still run for a superseded connection.
        else:
            is_current_connection = True
            log_info(f"[conn:{conn_id}] 当前连接断开: code={code}, reason={reason}")
            self._connection_state = ConnectionState.DISCONNECTED
            self._connecting_since = 0.0
            self._connecting_conn_id = 0
            self.connected_event.clear()
            self.ws = None

    # Record to the dedicated WebSocket log (whether or not this is the
    # current connection). The count is taken before the notification below
    # empties the map.
    with self._stream_queue_lock:
        pending_count = len(self.stream_queue_map)

    ws_logger = get_ws_logger()

    # Record that the connection had already been superseded.
    if not is_current_connection:
        ws_logger.log_connection_superseded(conn_id, current_conn_id, "_handle_connection_close")

    try:
        ws_logger.log_disconnect(
            conn_id=conn_id,
            reason=reason,
            code=code,
            received_data=received_data,
            pending_requests=pending_count,
            extra_info={
                "agent_id": self.agent_id,
                "server_url": self.server_url,
                "is_current_connection": is_current_connection,
                "current_conn_id": current_conn_id
            }
        )
    except Exception as e:
        # A failure to write the diagnostic log must not block the cleanup.
        log_error(f"记录 WebSocket 断开日志失败: {e}")

    # Notify all waiting stream requests immediately on disconnect.
    self._notify_pending_stream_requests(f"连接断开: {reason}")

    # Reset and reconnect only when the current connection closed; an old
    # connection closing is ignored because a newer one already exists.
    if not is_current_connection:
        log_debug(f"[conn:{conn_id}] 旧连接断开,跳过重置和重连(当前连接: {self._connection_id})")
        return

    # Fire the disconnect callback to inform external code (only for a
    # non-1000 close of the current connection).
    if code != 1000 and self._on_disconnect_callback:
        try:
            log_info(f"[conn:{conn_id}] 触发断开回调通知外部...")
            self._on_disconnect_callback(
                agent_id=self.agent_id,
                server_url=self.server_url,
                code=code,
                reason=reason
            )
        except Exception as e:
            log_error(f"[conn:{conn_id}] 断开回调执行异常: {e}")

    # On an abnormal disconnect, reset state (emulating an application restart).
    # Note: _full_reset cleans up state, but the event loop must not be stopped
    # from the current (WebSocket) thread.
    need_full_reset = code == 1006 or code == 1002 or code is None or "400" in str(reason) or "protocol" in str(reason).lower()
    if need_full_reset:
        log_warning(f"[conn:{conn_id}] 检测到异常断开(code={code}),执行部分重置...")
        # _full_reset is intentionally not called here (it would try to stop
        # the current thread's event loop); only the necessary state is
        # cleared and _handle_reconnection takes care of reconnecting.
        # NOTE(review): the partial reset sets _current_reconnect_interval back
        # to the base interval, which appears to discard any back-off a caller
        # raised just before invoking this method - confirm that is intended.
        self._partial_reset_for_reconnect(conn_id)

    if not self._shutdown_requested and self.config.auto_reconnect:
        if code != 1000:  # Not a normal closure.
            with self.lock:
                if not self._is_retrying:
                    log_debug(f"[conn:{conn_id}] Triggering reconnection")
                    # Give the current thread a moment to finish cleaning up
                    # before the reconnect starts.
                    def delayed_reconnect():
                        time.sleep(0.5)  # Wait for the current WebSocket thread to finish.
                        self._handle_reconnection()
                    threading.Thread(target=delayed_reconnect, daemon=True, name=f"Reconnect-{conn_id}").start()
1676
+
1677
def _full_reset(self, conn_id: int) -> None:
    """Fully reset connection state (emulating an application restart).

    Clears all connection state after an abnormal disconnect so that the
    system can operate normally once it reconnects. Any exception raised
    during the reset is logged and swallowed.

    Args:
        conn_id: Connection ID used to tag the log output of this reset.
    """
    ws_logger = get_ws_logger()
    log_warning(f"[conn:{conn_id}] ========== 开始完全重置 ==========")
    ws_logger.log_full_reset_detail(conn_id, "start", "开始完全重置流程")

    try:
        # 0. Reset the connection state first (critical: keeps other threads
        #    from creating a new connection).
        old_conn_id = 0
        with self.lock:
            old_conn_id = self._connection_id
            self._connection_id = 0
            self._is_retrying = False
            self._connection_state = ConnectionState.DISCONNECTED
            self._connecting_since = 0.0
            self._connecting_conn_id = 0
            self.connected_event.clear()
        log_info(f"[conn:{conn_id}] ✅ 连接ID重置: {old_conn_id} → 0")
        ws_logger.log_full_reset_detail(conn_id, "reset_conn_id", f"old={old_conn_id} -> new=0")

        # 1. Stop all helper threads (critical: keeps them from interfering).
        #    Only the stop flags are set here; the threads are joined in step 6.
        log_info(f"[conn:{conn_id}] 🛑 停止辅助线程...")
        self._cleanup_running = False
        self._health_check_running = False
        ws_logger.log_full_reset_detail(conn_id, "stop_threads", "设置线程停止标志")

        # 2. Drain the outgoing message queue; drained messages are discarded.
        queue_size = self.queue.qsize()
        cleared_count = 0
        while not self.queue.empty():
            try:
                self.queue.get_nowait()
                self.queue.task_done()
                cleared_count += 1
            except queue.Empty:
                # Another consumer emptied the queue between empty() and get_nowait().
                break

        log_info(f"[conn:{conn_id}] 🧹 清空消息队列: {cleared_count}/{queue_size} 条消息已丢弃")
        ws_logger.log_full_reset_detail(conn_id, "clear_queue", f"cleared={cleared_count}, total={queue_size}")

        # 3. Clear stream_queue_map (waiters were already notified in
        #    _notify_pending_stream_requests).
        with self._stream_queue_lock:
            stream_count = len(self.stream_queue_map)
            self.stream_queue_map.clear()
        log_info(f"[conn:{conn_id}] 🧹 清空流请求映射: {stream_count} 个请求已清理")
        ws_logger.log_full_reset_detail(conn_id, "clear_streams", f"cleared={stream_count}")

        # 4. Close the old WebSocket connection on its own event loop, waiting
        #    at most one second; failures are ignored (best effort).
        old_ws = self.ws
        old_loop = self._loop
        if old_loop and old_ws:
            try:
                if old_loop.is_running():
                    future = asyncio.run_coroutine_threadsafe(
                        self._graceful_close_ws(old_ws),
                        old_loop
                    )
                    try:
                        future.result(timeout=1.0)
                    except Exception:
                        pass
            except Exception:
                pass
            log_info(f"[conn:{conn_id}] 🔌 旧 WebSocket 连接已关闭")
            ws_logger.log_full_reset_detail(conn_id, "close_ws", "旧WebSocket已关闭")

        # 5. Ask the old event loop to stop (scheduled thread-safely).
        if old_loop and old_loop.is_running():
            try:
                old_loop.call_soon_threadsafe(old_loop.stop)
                log_info(f"[conn:{conn_id}] ⏹️ 旧事件循环已停止")
                ws_logger.log_full_reset_detail(conn_id, "stop_loop", "事件循环已停止")
            except Exception:
                pass

        # 6. Wait for the old threads to finish (never join the current
        #    thread - that would deadlock).
        current_thread = threading.current_thread()
        if self._cleanup_thread and self._cleanup_thread.is_alive() and self._cleanup_thread != current_thread:
            self._cleanup_thread.join(timeout=1.0)
        if self._health_check_thread and self._health_check_thread.is_alive() and self._health_check_thread != current_thread:
            self._health_check_thread.join(timeout=1.0)
        # The WebSocket thread is usually the current thread; do not join ourselves.
        if self.ws_thread and self.ws_thread.is_alive() and self.ws_thread != current_thread:
            self.ws_thread.join(timeout=1.0)
        ws_logger.log_full_reset_detail(conn_id, "join_threads", "等待旧线程结束完成")

        # 7. Drop all references.
        with self.lock:
            self.ws = None
            self._loop = None
            self.ws_thread = None
            self._cleanup_thread = None
            self._health_check_thread = None
        ws_logger.log_full_reset_detail(conn_id, "clear_refs", "清空所有引用")

        # 8. Reset the reconnect bookkeeping.
        self._reconnect_attempt_count = 0
        self._current_reconnect_interval = self.config.reconnect_base_interval
        self._last_pong_time = 0
        ws_logger.log_full_reset_detail(conn_id, "reset_reconnect", "重置重连状态")

        # 9. Write the reset summary to the log.
        ws_logger.log_full_reset(
            conn_id=conn_id,
            queue_cleared=cleared_count,
            streams_cleared=stream_count
        )

        log_info(f"[conn:{conn_id}] ✅ 完全重置完成,系统状态已清理,准备重新连接")
        ws_logger.log_full_reset_detail(conn_id, "complete", "完全重置流程完成")

    except Exception as e:
        import traceback
        error_detail = traceback.format_exc()
        log_error(f"[conn:{conn_id}] ❌ 完全重置失败: {e}\n{error_detail}")
        ws_logger.log_full_reset_detail(conn_id, "error", f"重置失败: {str(e)}")
1796
+
1797
def _partial_reset_for_reconnect(self, conn_id: int) -> None:
    """Partially reset state after an abnormal disconnect, ready for a reconnect.

    Unlike _full_reset, this method:
    1. does not try to stop the current thread's event loop (avoids deadlock);
    2. does not join the current thread (avoids deadlock);
    3. clears only the necessary state and leaves it to the reconnect thread
       to create the new connection.

    It is safe to call from the WebSocket handling thread. Any exception
    raised during the reset is logged and swallowed.

    Args:
        conn_id: Connection ID used to tag the log output of this reset.
    """
    ws_logger = get_ws_logger()
    log_info(f"[conn:{conn_id}] 🔄 开始部分重置(为重连准备)...")

    try:
        # 1. Reset the reconnect counters so reconnecting starts from scratch.
        # NOTE(review): this also overwrites any enlarged reconnect interval a
        # caller set just before the close handling - confirm that is intended.
        self._reconnect_attempt_count = 0
        self._current_reconnect_interval = self.config.reconnect_base_interval

        # 2. Clear stream_queue_map (waiters were already notified in
        #    _notify_pending_stream_requests).
        # NOTE(review): an entry registered after that notification would be
        # dropped here without being notified - verify whether that can happen.
        with self._stream_queue_lock:
            stream_count = len(self.stream_queue_map)
            self.stream_queue_map.clear()
        if stream_count > 0:
            log_info(f"[conn:{conn_id}] 🧹 清空流请求映射: {stream_count} 个请求已清理")

        # 3. The outgoing message queue is intentionally NOT drained here, so
        #    queued messages can be sent automatically after the reconnect.
        queue_size = self.queue.qsize()
        if queue_size > 0:
            log_info(f"[conn:{conn_id}] 📦 消息队列有 {queue_size} 条待发送消息,重连后自动发送")

        # 4. Clear the helper-thread run flags (the threads exit on their own).
        self._cleanup_running = False
        self._health_check_running = False

        # 5. Mark the connection state (key step: lets start_websocket_client
        #    know a new connection must be created).
        with self.lock:
            self._connection_state = ConnectionState.DISCONNECTED
            self._connecting_since = 0.0
            self._connecting_conn_id = 0
            self.connected_event.clear()
            self._is_retrying = False  # Reset the retry flag so a new reconnect is allowed.
            # Note: ws and _loop are not cleared; they are simply replaced later.

        log_info(f"[conn:{conn_id}] ✅ 部分重置完成,准备重连")
        ws_logger.log_full_reset_detail(conn_id, "partial_reset_complete", "部分重置完成,准备重连")

    except Exception as e:
        log_error(f"[conn:{conn_id}] ❌ 部分重置异常: {e}")
1846
+
1847
+ def _notify_pending_stream_requests(self, reason: str) -> None:
1848
+ """✅ 通知所有等待中的 stream 请求连接已断开
1849
+
1850
+ 当 WebSocket 连接断开时,立即通知所有等待响应的 create_stream 请求,
1851
+ 避免它们继续等待到 15 秒超时。这样调用方可以更快地重试。
1852
+ """
1853
+ # ✅ 使用锁保护,复制后立即释放锁
1854
+ with self._stream_queue_lock:
1855
+ if not self.stream_queue_map:
1856
+ return
1857
+ pending_items = list(self.stream_queue_map.items())
1858
+ pending_count = len(pending_items)
1859
+ self.stream_queue_map.clear() # 在锁内清空
1860
+
1861
+ if pending_count == 0:
1862
+ return
1863
+
1864
+ log_warning(f"🔔 通知 {pending_count} 个等待中的流请求: {reason}")
1865
+
1866
+ # ✅ 释放锁后再处理通知(避免长时间持锁)
1867
+ notified_count = 0
1868
+ failed_count = 0
1869
+ for request_id, queue_entry in pending_items:
1870
+ try:
1871
+ temp_queue = queue_entry.get("queue")
1872
+ loop = queue_entry.get("loop")
1873
+ receiver = queue_entry.get("receiver", "unknown")
1874
+
1875
+ if temp_queue and loop:
1876
+ error_data = {
1877
+ "error": "connection_lost",
1878
+ "message": f"WebSocket 连接断开: {reason},请重试"
1879
+ }
1880
+ try:
1881
+ # 检查事件循环是否仍在运行
1882
+ if loop.is_running():
1883
+ # 使用线程安全的方式放入错误通知
1884
+ loop.call_soon_threadsafe(temp_queue.put_nowait, error_data)
1885
+ notified_count += 1
1886
+ log_debug(f"📢 已通知: request_id={request_id[:8]}... receiver={receiver}")
1887
+ else:
1888
+ failed_count += 1
1889
+ log_debug(f"事件循环已停止,跳过: request_id={request_id[:8]}...")
1890
+ except RuntimeError as e:
1891
+ failed_count += 1
1892
+ log_debug(f"事件循环已关闭: {e}")
1893
+ except Exception as e:
1894
+ failed_count += 1
1895
+ log_debug(f"通知失败: {e}")
1896
+
1897
+ except Exception as e:
1898
+ log_error(f"❌ 通知等待请求时异常: {e}")
1899
+
1900
+ # 汇总日志
1901
+ log_info(f"🔔 流请求通知完成: 成功={notified_count}, 失败={failed_count}, 总数={pending_count}")
1902
+
1903
+ def _get_close_code_meaning(self, code: int) -> str:
1904
+ """获取 WebSocket 关闭代码的含义"""
1905
+ close_codes = {
1906
+ 1000: "正常关闭 (Normal Closure)",
1907
+ 1001: "端点离开 (Going Away) - 服务器关闭或浏览器导航离开",
1908
+ 1002: "协议错误 (Protocol Error)",
1909
+ 1003: "不支持的数据类型 (Unsupported Data)",
1910
+ 1005: "未收到状态码 (No Status Received)",
1911
+ 1006: "异常关闭 (Abnormal Closure) - 连接意外断开,未收到关闭帧。常见原因:网络中断、服务器崩溃、防火墙/代理断开、心跳超时",
1912
+ 1007: "无效的帧数据 (Invalid Frame Payload Data)",
1913
+ 1008: "策略违规 (Policy Violation)",
1914
+ 1009: "消息太大 (Message Too Big)",
1915
+ 1010: "必需的扩展 (Mandatory Extension)",
1916
+ 1011: "内部服务器错误 (Internal Server Error)",
1917
+ 1012: "服务重启 (Service Restart)",
1918
+ 1013: "稍后重试 (Try Again Later)",
1919
+ 1014: "错误的网关 (Bad Gateway)",
1920
+ 1015: "TLS握手失败 (TLS Handshake Failure)",
1921
+ }
1922
+ return close_codes.get(code, f"未知代码 (Unknown Code: {code})")
1923
+
1924
+ # 兼容性方法 - 保持与旧 API 的兼容
1925
def on_open(self, ws) -> None:
    """Compatibility hook for the legacy API, invoked on connection open.

    Intentionally does nothing.
    """
    return None
1928
+
1929
def on_message(self, ws, message: str) -> None:
    """Compatibility hook for the legacy API, invoked on an incoming message.

    Intentionally does nothing; the message is ignored.
    """
    return None
1932
+
1933
def on_error(self, ws, error: Exception) -> None:
    """Compatibility hook for the legacy API, invoked on a WebSocket error.

    Intentionally does nothing; the error is ignored.
    """
    return None
1936
+
1937
def on_close(self, ws, close_status_code: int, close_msg: str) -> None:
    """Compatibility hook for the legacy API, invoked on connection close.

    Intentionally does nothing; status code and message are ignored.
    """
    return None
1940
+
1941
def on_ping(self, ws, message: bytes) -> None:
    """Compatibility hook for a WebSocket ping.

    Stores the current time in ``_last_pong_time``; ``ws`` and ``message``
    are not used.
    """
    received_at = time.time()
    self._last_pong_time = received_at
1944
+
1945
def on_pong(self, ws, message: bytes) -> None:
    """Compatibility hook for a WebSocket pong.

    Stores the current time in ``_last_pong_time``; ``ws`` and ``message``
    are not used.
    """
    received_at = time.time()
    self._last_pong_time = received_at
1948
+
1949
+ # ✅ 线程安全的 stream_queue_map 访问方法
1950
def register_stream_request(self, request_id: str, queue_entry: dict) -> None:
    """Store ``queue_entry`` under ``request_id`` in ``stream_queue_map``.

    The write happens while holding ``_stream_queue_lock``; an existing
    entry with the same id is replaced.
    """
    with self._stream_queue_lock:
        pending = self.stream_queue_map
        pending[request_id] = queue_entry
1954
+
1955
def unregister_stream_request(self, request_id: str) -> Optional[dict]:
    """Remove the entry for ``request_id`` from ``stream_queue_map``.

    The removal happens while holding ``_stream_queue_lock``.

    Returns:
        The removed entry, or ``None`` when the id was not registered.
    """
    with self._stream_queue_lock:
        removed = self.stream_queue_map.pop(request_id, None)
    return removed
1959
+
1960
def get_stream_request(self, request_id: str) -> Optional[dict]:
    """Look up the entry for ``request_id`` in ``stream_queue_map``.

    The lookup happens while holding ``_stream_queue_lock``; the map is
    not modified.

    Returns:
        The registered entry, or ``None`` when the id is unknown.
    """
    with self._stream_queue_lock:
        entry = self.stream_queue_map.get(request_id)
    return entry
1964
+
1965
def get_pending_stream_count(self) -> int:
    """Return how many entries ``stream_queue_map`` currently holds.

    The size is read while holding ``_stream_queue_lock``.
    """
    with self._stream_queue_lock:
        count = len(self.stream_queue_map)
    return count
1969
+
1970
def full_reset(self) -> None:
    """Fully reset the MessageClient and release all of its resources.

    More thorough than ``_full_reset`` and meant to be called explicitly
    from outside. After it returns, ``start_websocket_client()`` can be
    called again to establish a new connection.

    The shutdown flag is raised for the duration of the reset (blocking
    reconnects and new operations) and lowered again at the end, including
    when the reset fails. Exceptions are logged and swallowed.
    """
    log_info(f"[MessageClient] 开始完全重置: agent_id={self.agent_id}")

    try:
        # 1. Raise the shutdown flag (blocks reconnects and new operations).
        self._shutdown_requested = True
        log_debug("[MessageClient] ✓ 已设置关闭标志")

        # 2. Clear the helper-thread run flags.
        self._cleanup_running = False
        self._health_check_running = False
        log_debug("[MessageClient] ✓ 已设置线程停止标志")

        # 3. Notify every request that is still waiting.
        pending_count = self.get_pending_stream_count()
        if pending_count > 0:
            log_info(f"[MessageClient] 通知 {pending_count} 个等待中的流请求...")
            self._notify_pending_stream_requests("MessageClient 正在完全重置")

        # 4. Stop the WebSocket connection (best effort).
        log_debug("[MessageClient] 正在停止 WebSocket...")
        try:
            self.stop_websocket_client()
        except Exception as e:
            log_warning(f"[MessageClient] 停止 WebSocket 失败: {e}")

        # 5. Clear stream_queue_map.
        with self._stream_queue_lock:
            self.stream_queue_map.clear()
        log_debug("[MessageClient] ✓ stream_queue_map 已清空")

        # 6. Drain the outgoing message queue; drained messages are discarded.
        cleared_count = 0
        while not self.queue.empty():
            try:
                self.queue.get_nowait()
                self.queue.task_done()
                cleared_count += 1
            except queue.Empty:
                # Another consumer emptied the queue between empty() and get_nowait().
                break
        log_debug(f"[MessageClient] ✓ 已清空 {cleared_count} 条待发送消息")

        # 7. Wait for the helper threads to finish. Never join the calling
        #    thread: Thread.join() raises RuntimeError in that case, which
        #    would abort the reset halfway (same guard as in _full_reset).
        current_thread = threading.current_thread()
        for helper_thread in (self._cleanup_thread, self._health_check_thread):
            if helper_thread and helper_thread.is_alive() and helper_thread is not current_thread:
                helper_thread.join(timeout=2.0)
        log_debug("[MessageClient] ✓ 辅助线程已停止")

        # 8. Reset the connection state.
        with self.lock:
            self._connection_state = ConnectionState.DISCONNECTED
            self._connecting_since = 0.0
            self._connecting_conn_id = 0
            self.connected_event.clear()
            self._is_retrying = False
            self._reconnect_attempt_count = 0
            self._current_reconnect_interval = self.config.reconnect_base_interval
            self._connection_id = 0
            self._last_pong_time = 0

        log_debug("[MessageClient] ✓ 连接状态已重置")

        # 9. Drop object references (under the lock, as _full_reset does).
        with self.lock:
            self.ws = None
            self._loop = None
            self.ws_thread = None
            self._cleanup_thread = None
            self._health_check_thread = None
        log_debug("[MessageClient] ✓ 对象引用已清空")

        # 10. Lower the shutdown flag so the client can be started again.
        self._shutdown_requested = False
        log_debug("[MessageClient] ✓ 关闭标志已重置")

        log_info(f"[MessageClient] ✅ 完全重置完成: agent_id={self.agent_id}")

    except Exception as e:
        log_error(f"[MessageClient] ❌ 完全重置失败: {e}")
        import traceback
        traceback.print_exc()
        # Make sure the shutdown flag is lowered so a retry is possible.
        self._shutdown_requested = False