aiohomematic 2026.1.29__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (188) hide show
  1. aiohomematic/__init__.py +110 -0
  2. aiohomematic/_log_context_protocol.py +29 -0
  3. aiohomematic/api.py +410 -0
  4. aiohomematic/async_support.py +250 -0
  5. aiohomematic/backend_detection.py +462 -0
  6. aiohomematic/central/__init__.py +103 -0
  7. aiohomematic/central/async_rpc_server.py +760 -0
  8. aiohomematic/central/central_unit.py +1152 -0
  9. aiohomematic/central/config.py +463 -0
  10. aiohomematic/central/config_builder.py +772 -0
  11. aiohomematic/central/connection_state.py +160 -0
  12. aiohomematic/central/coordinators/__init__.py +38 -0
  13. aiohomematic/central/coordinators/cache.py +414 -0
  14. aiohomematic/central/coordinators/client.py +480 -0
  15. aiohomematic/central/coordinators/connection_recovery.py +1141 -0
  16. aiohomematic/central/coordinators/device.py +1166 -0
  17. aiohomematic/central/coordinators/event.py +514 -0
  18. aiohomematic/central/coordinators/hub.py +532 -0
  19. aiohomematic/central/decorators.py +184 -0
  20. aiohomematic/central/device_registry.py +229 -0
  21. aiohomematic/central/events/__init__.py +104 -0
  22. aiohomematic/central/events/bus.py +1392 -0
  23. aiohomematic/central/events/integration.py +424 -0
  24. aiohomematic/central/events/types.py +194 -0
  25. aiohomematic/central/health.py +762 -0
  26. aiohomematic/central/rpc_server.py +353 -0
  27. aiohomematic/central/scheduler.py +794 -0
  28. aiohomematic/central/state_machine.py +391 -0
  29. aiohomematic/client/__init__.py +203 -0
  30. aiohomematic/client/_rpc_errors.py +187 -0
  31. aiohomematic/client/backends/__init__.py +48 -0
  32. aiohomematic/client/backends/base.py +335 -0
  33. aiohomematic/client/backends/capabilities.py +138 -0
  34. aiohomematic/client/backends/ccu.py +487 -0
  35. aiohomematic/client/backends/factory.py +116 -0
  36. aiohomematic/client/backends/homegear.py +294 -0
  37. aiohomematic/client/backends/json_ccu.py +252 -0
  38. aiohomematic/client/backends/protocol.py +316 -0
  39. aiohomematic/client/ccu.py +1857 -0
  40. aiohomematic/client/circuit_breaker.py +459 -0
  41. aiohomematic/client/config.py +64 -0
  42. aiohomematic/client/handlers/__init__.py +40 -0
  43. aiohomematic/client/handlers/backup.py +157 -0
  44. aiohomematic/client/handlers/base.py +79 -0
  45. aiohomematic/client/handlers/device_ops.py +1085 -0
  46. aiohomematic/client/handlers/firmware.py +144 -0
  47. aiohomematic/client/handlers/link_mgmt.py +199 -0
  48. aiohomematic/client/handlers/metadata.py +436 -0
  49. aiohomematic/client/handlers/programs.py +144 -0
  50. aiohomematic/client/handlers/sysvars.py +100 -0
  51. aiohomematic/client/interface_client.py +1304 -0
  52. aiohomematic/client/json_rpc.py +2068 -0
  53. aiohomematic/client/request_coalescer.py +282 -0
  54. aiohomematic/client/rpc_proxy.py +629 -0
  55. aiohomematic/client/state_machine.py +324 -0
  56. aiohomematic/const.py +2207 -0
  57. aiohomematic/context.py +275 -0
  58. aiohomematic/converter.py +270 -0
  59. aiohomematic/decorators.py +390 -0
  60. aiohomematic/exceptions.py +185 -0
  61. aiohomematic/hmcli.py +997 -0
  62. aiohomematic/i18n.py +193 -0
  63. aiohomematic/interfaces/__init__.py +407 -0
  64. aiohomematic/interfaces/central.py +1067 -0
  65. aiohomematic/interfaces/client.py +1096 -0
  66. aiohomematic/interfaces/coordinators.py +63 -0
  67. aiohomematic/interfaces/model.py +1921 -0
  68. aiohomematic/interfaces/operations.py +217 -0
  69. aiohomematic/logging_context.py +134 -0
  70. aiohomematic/metrics/__init__.py +125 -0
  71. aiohomematic/metrics/_protocols.py +140 -0
  72. aiohomematic/metrics/aggregator.py +534 -0
  73. aiohomematic/metrics/dataclasses.py +489 -0
  74. aiohomematic/metrics/emitter.py +292 -0
  75. aiohomematic/metrics/events.py +183 -0
  76. aiohomematic/metrics/keys.py +300 -0
  77. aiohomematic/metrics/observer.py +563 -0
  78. aiohomematic/metrics/stats.py +172 -0
  79. aiohomematic/model/__init__.py +189 -0
  80. aiohomematic/model/availability.py +65 -0
  81. aiohomematic/model/calculated/__init__.py +89 -0
  82. aiohomematic/model/calculated/climate.py +276 -0
  83. aiohomematic/model/calculated/data_point.py +315 -0
  84. aiohomematic/model/calculated/field.py +147 -0
  85. aiohomematic/model/calculated/operating_voltage_level.py +286 -0
  86. aiohomematic/model/calculated/support.py +232 -0
  87. aiohomematic/model/custom/__init__.py +214 -0
  88. aiohomematic/model/custom/capabilities/__init__.py +67 -0
  89. aiohomematic/model/custom/capabilities/climate.py +41 -0
  90. aiohomematic/model/custom/capabilities/light.py +87 -0
  91. aiohomematic/model/custom/capabilities/lock.py +44 -0
  92. aiohomematic/model/custom/capabilities/siren.py +63 -0
  93. aiohomematic/model/custom/climate.py +1130 -0
  94. aiohomematic/model/custom/cover.py +722 -0
  95. aiohomematic/model/custom/data_point.py +360 -0
  96. aiohomematic/model/custom/definition.py +300 -0
  97. aiohomematic/model/custom/field.py +89 -0
  98. aiohomematic/model/custom/light.py +1174 -0
  99. aiohomematic/model/custom/lock.py +322 -0
  100. aiohomematic/model/custom/mixins.py +445 -0
  101. aiohomematic/model/custom/profile.py +945 -0
  102. aiohomematic/model/custom/registry.py +251 -0
  103. aiohomematic/model/custom/siren.py +462 -0
  104. aiohomematic/model/custom/switch.py +195 -0
  105. aiohomematic/model/custom/text_display.py +289 -0
  106. aiohomematic/model/custom/valve.py +78 -0
  107. aiohomematic/model/data_point.py +1416 -0
  108. aiohomematic/model/device.py +1840 -0
  109. aiohomematic/model/event.py +216 -0
  110. aiohomematic/model/generic/__init__.py +327 -0
  111. aiohomematic/model/generic/action.py +40 -0
  112. aiohomematic/model/generic/action_select.py +62 -0
  113. aiohomematic/model/generic/binary_sensor.py +30 -0
  114. aiohomematic/model/generic/button.py +31 -0
  115. aiohomematic/model/generic/data_point.py +177 -0
  116. aiohomematic/model/generic/dummy.py +150 -0
  117. aiohomematic/model/generic/number.py +76 -0
  118. aiohomematic/model/generic/select.py +56 -0
  119. aiohomematic/model/generic/sensor.py +76 -0
  120. aiohomematic/model/generic/switch.py +54 -0
  121. aiohomematic/model/generic/text.py +33 -0
  122. aiohomematic/model/hub/__init__.py +100 -0
  123. aiohomematic/model/hub/binary_sensor.py +24 -0
  124. aiohomematic/model/hub/button.py +28 -0
  125. aiohomematic/model/hub/connectivity.py +190 -0
  126. aiohomematic/model/hub/data_point.py +342 -0
  127. aiohomematic/model/hub/hub.py +864 -0
  128. aiohomematic/model/hub/inbox.py +135 -0
  129. aiohomematic/model/hub/install_mode.py +393 -0
  130. aiohomematic/model/hub/metrics.py +208 -0
  131. aiohomematic/model/hub/number.py +42 -0
  132. aiohomematic/model/hub/select.py +52 -0
  133. aiohomematic/model/hub/sensor.py +37 -0
  134. aiohomematic/model/hub/switch.py +43 -0
  135. aiohomematic/model/hub/text.py +30 -0
  136. aiohomematic/model/hub/update.py +221 -0
  137. aiohomematic/model/support.py +592 -0
  138. aiohomematic/model/update.py +140 -0
  139. aiohomematic/model/week_profile.py +1827 -0
  140. aiohomematic/property_decorators.py +719 -0
  141. aiohomematic/py.typed +0 -0
  142. aiohomematic/rega_scripts/accept_device_in_inbox.fn +51 -0
  143. aiohomematic/rega_scripts/create_backup_start.fn +28 -0
  144. aiohomematic/rega_scripts/create_backup_status.fn +89 -0
  145. aiohomematic/rega_scripts/fetch_all_device_data.fn +97 -0
  146. aiohomematic/rega_scripts/get_backend_info.fn +25 -0
  147. aiohomematic/rega_scripts/get_inbox_devices.fn +61 -0
  148. aiohomematic/rega_scripts/get_program_descriptions.fn +31 -0
  149. aiohomematic/rega_scripts/get_serial.fn +44 -0
  150. aiohomematic/rega_scripts/get_service_messages.fn +83 -0
  151. aiohomematic/rega_scripts/get_system_update_info.fn +39 -0
  152. aiohomematic/rega_scripts/get_system_variable_descriptions.fn +31 -0
  153. aiohomematic/rega_scripts/set_program_state.fn +17 -0
  154. aiohomematic/rega_scripts/set_system_variable.fn +19 -0
  155. aiohomematic/rega_scripts/trigger_firmware_update.fn +67 -0
  156. aiohomematic/schemas.py +256 -0
  157. aiohomematic/store/__init__.py +55 -0
  158. aiohomematic/store/dynamic/__init__.py +43 -0
  159. aiohomematic/store/dynamic/command.py +250 -0
  160. aiohomematic/store/dynamic/data.py +175 -0
  161. aiohomematic/store/dynamic/details.py +187 -0
  162. aiohomematic/store/dynamic/ping_pong.py +416 -0
  163. aiohomematic/store/persistent/__init__.py +71 -0
  164. aiohomematic/store/persistent/base.py +285 -0
  165. aiohomematic/store/persistent/device.py +233 -0
  166. aiohomematic/store/persistent/incident.py +380 -0
  167. aiohomematic/store/persistent/paramset.py +241 -0
  168. aiohomematic/store/persistent/session.py +556 -0
  169. aiohomematic/store/serialization.py +150 -0
  170. aiohomematic/store/storage.py +689 -0
  171. aiohomematic/store/types.py +526 -0
  172. aiohomematic/store/visibility/__init__.py +40 -0
  173. aiohomematic/store/visibility/parser.py +141 -0
  174. aiohomematic/store/visibility/registry.py +722 -0
  175. aiohomematic/store/visibility/rules.py +307 -0
  176. aiohomematic/strings.json +237 -0
  177. aiohomematic/support.py +706 -0
  178. aiohomematic/tracing.py +236 -0
  179. aiohomematic/translations/de.json +237 -0
  180. aiohomematic/translations/en.json +237 -0
  181. aiohomematic/type_aliases.py +51 -0
  182. aiohomematic/validator.py +128 -0
  183. aiohomematic-2026.1.29.dist-info/METADATA +296 -0
  184. aiohomematic-2026.1.29.dist-info/RECORD +188 -0
  185. aiohomematic-2026.1.29.dist-info/WHEEL +5 -0
  186. aiohomematic-2026.1.29.dist-info/entry_points.txt +2 -0
  187. aiohomematic-2026.1.29.dist-info/licenses/LICENSE +21 -0
  188. aiohomematic-2026.1.29.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1141 @@
1
+ # SPDX-License-Identifier: MIT
2
+ # Copyright (c) 2021-2026
3
+ """
4
+ Unified connection recovery coordinator.
5
+
6
+ Overview
7
+ --------
8
+ This module provides the ConnectionRecoveryCoordinator which consolidates:
9
+ - SelfHealingCoordinator (event-driven triggers)
10
+ - RecoveryCoordinator (retry tracking, state transitions)
11
+ - BackgroundScheduler._check_connection (staged reconnection)
12
+
13
+ Into a single, event-driven recovery system.
14
+
15
+ Architecture
16
+ ------------
17
+ The coordinator:
18
+ 1. Subscribes to connection-related events (ConnectionLostEvent, CircuitBreakerTrippedEvent)
19
+ 2. Executes staged recovery (TCP check → RPC check → warmup → reconnect → data load)
20
+ 3. Tracks retry attempts with exponential backoff
21
+ 4. Manages central state transitions (RECOVERING, RUNNING, DEGRADED, FAILED)
22
+ 5. Provides heartbeat retry in FAILED state
23
+
24
+ Event Flow
25
+ ----------
26
+ ::
27
+
28
+ ConnectionLostEvent / CircuitBreakerTrippedEvent
29
+
30
+
31
+ ConnectionRecoveryCoordinator
32
+
33
+ ├─► RecoveryStageChangedEvent (per stage transition)
34
+
35
+ ├─► RecoveryAttemptedEvent (per attempt)
36
+
37
+ └─► RecoveryCompletedEvent / RecoveryFailedEvent
38
+
39
+ Public API
40
+ ----------
41
+ - ConnectionRecoveryCoordinator: Main coordinator class
42
+ - MAX_RECOVERY_ATTEMPTS: Maximum retry attempts before FAILED state
43
+ - HEARTBEAT_RETRY_INTERVAL: Interval between heartbeat retries
44
+ - MAX_CONCURRENT_RECOVERIES: Maximum parallel recovery operations
45
+ """
46
+
47
+ from __future__ import annotations
48
+
49
+ import asyncio
50
+ from dataclasses import dataclass, field
51
+ from datetime import datetime
52
+ import logging
53
+ import time
54
+ from typing import TYPE_CHECKING, Final
55
+
56
+ from aiohomematic.central.events import (
57
+ CircuitBreakerStateChangedEvent,
58
+ CircuitBreakerTrippedEvent,
59
+ ConnectionLostEvent,
60
+ HeartbeatTimerFiredEvent,
61
+ RecoveryAttemptedEvent,
62
+ RecoveryCompletedEvent,
63
+ RecoveryFailedEvent,
64
+ RecoveryStageChangedEvent,
65
+ SystemStatusChangedEvent,
66
+ )
67
+ from aiohomematic.client import CircuitState
68
+ from aiohomematic.const import (
69
+ INTERFACES_REQUIRING_JSON_RPC_CLIENT,
70
+ INTERFACES_REQUIRING_XML_RPC,
71
+ CentralState,
72
+ FailureReason,
73
+ RecoveryStage,
74
+ get_json_rpc_default_port,
75
+ )
76
+ from aiohomematic.store.types import IncidentSeverity, IncidentType
77
+
78
+ if TYPE_CHECKING:
79
+ from collections.abc import Callable
80
+
81
+ from aiohomematic.central.events import EventBus
82
+ from aiohomematic.central.state_machine import CentralStateMachine
83
+ from aiohomematic.interfaces import (
84
+ CentralInfoProtocol,
85
+ ClientProviderProtocol,
86
+ ConfigProviderProtocol,
87
+ CoordinatorProviderProtocol,
88
+ DeviceDataRefresherProtocol,
89
+ IncidentRecorderProtocol,
90
+ TaskSchedulerProtocol,
91
+ )
92
+
93
+ _LOGGER: Final = logging.getLogger(__name__)
94
+
95
+ # Maximum number of recovery attempts before transitioning to FAILED
96
+ MAX_RECOVERY_ATTEMPTS: Final[int] = 8
97
+
98
+ # Interval between heartbeat retries in FAILED state (seconds)
99
+ HEARTBEAT_RETRY_INTERVAL: Final[float] = 60.0
100
+
101
+ # Base delay between recovery attempts (seconds)
102
+ BASE_RETRY_DELAY: Final[float] = 5.0
103
+
104
+ # Maximum delay between recovery attempts (seconds)
105
+ MAX_RETRY_DELAY: Final[float] = 60.0
106
+
107
+ # Maximum concurrent recovery operations
108
+ MAX_CONCURRENT_RECOVERIES: Final[int] = 2
109
+
110
+
111
@dataclass(slots=True)
class InterfaceRecoveryState:
    """
    Per-interface bookkeeping for connection recovery.

    Records attempt counters, timing information, the current recovery
    stage, and the list of stages completed in the running cycle.
    """

    interface_id: str
    attempt_count: int = 0
    last_attempt: datetime | None = None
    last_success: datetime | None = None
    consecutive_failures: int = 0
    current_stage: RecoveryStage = RecoveryStage.IDLE
    stage_entered_at: datetime = field(default_factory=datetime.now)
    stages_completed: list[RecoveryStage] = field(default_factory=list)
    recovery_start_time: float | None = None

    @property
    def can_retry(self) -> bool:
        """Return True while the attempt budget is not exhausted."""
        return self.attempt_count < MAX_RECOVERY_ATTEMPTS

    @property
    def next_retry_delay(self) -> float:
        """Return the exponential-backoff delay before the next retry."""
        failures = self.consecutive_failures
        if not failures:
            return BASE_RETRY_DELAY
        # BASE * 2^(failures-1), clamped at MAX_RETRY_DELAY.
        backoff = BASE_RETRY_DELAY * (2 ** (failures - 1))
        return float(min(backoff, MAX_RETRY_DELAY))

    def record_failure(self) -> None:
        """Book-keep a failed recovery attempt."""
        self.consecutive_failures += 1
        self.last_attempt = datetime.now()
        self.attempt_count += 1

    def record_success(self) -> None:
        """Book-keep a successful recovery attempt."""
        self.consecutive_failures = 0
        self.last_success = datetime.now()
        self.last_attempt = datetime.now()
        self.attempt_count += 1

    def reset(self) -> None:
        """Clear counters and stage tracking after a completed recovery."""
        self.attempt_count = 0
        self.consecutive_failures = 0
        self.current_stage = RecoveryStage.IDLE
        self.stages_completed.clear()
        self.recovery_start_time = None

    def start_recovery(self) -> None:
        """Begin a fresh recovery cycle and start the duration clock."""
        self.recovery_start_time = time.perf_counter()
        self.stages_completed.clear()

    def transition_to_stage(self, *, new_stage: RecoveryStage) -> float:
        """
        Move this interface to *new_stage*.

        Args:
            new_stage: Stage to enter.

        Returns:
            Milliseconds spent in the stage being left.

        """
        elapsed_ms = (datetime.now() - self.stage_entered_at).total_seconds() * 1000
        # Terminal and idle stages never count as "completed" work stages.
        if self.current_stage not in (RecoveryStage.IDLE, RecoveryStage.RECOVERED, RecoveryStage.FAILED):
            self.stages_completed.append(self.current_stage)
        self.current_stage = new_stage
        self.stage_entered_at = datetime.now()
        return elapsed_ms
186
+
187
+
188
+ class ConnectionRecoveryCoordinator:
189
+ """
190
+ Unified coordinator for connection recovery.
191
+
192
+ Consolidates:
193
+ - SelfHealingCoordinator (event-driven triggers)
194
+ - RecoveryCoordinator (retry tracking, state transitions)
195
+ - BackgroundScheduler._check_connection (staged reconnection)
196
+
197
+ Thread Safety
198
+ -------------
199
+ This class is designed for single-threaded asyncio use.
200
+ All event handlers and recovery operations run in the same event loop.
201
+
202
+ Example Usage
203
+ -------------
204
+ coordinator = ConnectionRecoveryCoordinator(
205
+ central_info=central,
206
+ config_provider=central,
207
+ client_provider=central,
208
+ coordinator_provider=central,
209
+ device_data_refresher=central,
210
+ event_bus=central.event_bus,
211
+ task_scheduler=central,
212
+ state_machine=central.state_machine,
213
+ )
214
+
215
+ # Recovery happens automatically via events
216
+
217
+ # To stop:
218
+ coordinator.stop()
219
+
220
+ """
221
+
222
+ __slots__ = (
223
+ "_active_recoveries",
224
+ "_central_info",
225
+ "_client_provider",
226
+ "_config_provider",
227
+ "_coordinator_provider",
228
+ "_device_data_refresher",
229
+ "_event_bus",
230
+ "_heartbeat_task",
231
+ "_incident_recorder",
232
+ "_in_failed_state",
233
+ "_recovery_semaphore",
234
+ "_recovery_states",
235
+ "_shutdown",
236
+ "_state_machine",
237
+ "_task_scheduler",
238
+ "_unsubscribers",
239
+ )
240
+
241
    def __init__(
        self,
        *,
        central_info: CentralInfoProtocol,
        config_provider: ConfigProviderProtocol,
        client_provider: ClientProviderProtocol,
        coordinator_provider: CoordinatorProviderProtocol,
        device_data_refresher: DeviceDataRefresherProtocol,
        event_bus: EventBus,
        task_scheduler: TaskSchedulerProtocol,
        state_machine: CentralStateMachine | None = None,
        incident_recorder: IncidentRecorderProtocol | None = None,
    ) -> None:
        """
        Initialize the connection recovery coordinator.

        Event subscriptions are set up at the end of construction, so the
        coordinator reacts to connection events as soon as it exists.

        Args:
            central_info: Central system information
            config_provider: Configuration provider
            client_provider: Client lookup provider
            coordinator_provider: Coordinator provider for client coordinator access
            device_data_refresher: Device data refresh operations
            event_bus: Event bus for subscriptions and publishing
            task_scheduler: Task scheduler for async operations
            state_machine: Optional central state machine
            incident_recorder: Optional incident recorder for diagnostic events

        """
        self._central_info: Final = central_info
        self._config_provider: Final = config_provider
        self._client_provider: Final = client_provider
        self._coordinator_provider: Final = coordinator_provider
        self._device_data_refresher: Final = device_data_refresher
        self._event_bus: Final = event_bus
        self._task_scheduler: Final = task_scheduler
        # Not Final: may be (re)assigned later via set_state_machine().
        self._state_machine = state_machine
        self._incident_recorder = incident_recorder

        # Recovery state tracking
        self._recovery_states: dict[str, InterfaceRecoveryState] = {}
        self._active_recoveries: set[str] = set()
        # Caps the number of recoveries running in parallel.
        self._recovery_semaphore = asyncio.Semaphore(MAX_CONCURRENT_RECOVERIES)
        self._in_failed_state: bool = False
        self._shutdown: bool = False
        self._heartbeat_task: asyncio.Task[None] | None = None
        self._unsubscribers: list[Callable[[], None]] = []

        # Subscribe to connection-related events
        self._subscribe_to_events()

        _LOGGER.debug("CONNECTION_RECOVERY: Coordinator initialized for %s", self._central_info.name)
292
+
293
+ @property
294
+ def in_recovery(self) -> bool:
295
+ """Return True if any recovery is in progress."""
296
+ return bool(self._active_recoveries)
297
+
298
+ @property
299
+ def recovery_states(self) -> dict[str, InterfaceRecoveryState]:
300
+ """Return recovery states for all tracked interfaces."""
301
+ return self._recovery_states.copy()
302
+
303
+ def get_recovery_state(self, *, interface_id: str) -> InterfaceRecoveryState | None:
304
+ """Return recovery state for a specific interface."""
305
+ return self._recovery_states.get(interface_id)
306
+
307
+ def set_state_machine(self, *, state_machine: CentralStateMachine) -> None:
308
+ """Set the state machine reference."""
309
+ self._state_machine = state_machine
310
+
311
+ def stop(self) -> None:
312
+ """Stop the coordinator and unsubscribe from events."""
313
+ self._shutdown = True
314
+
315
+ # Cancel heartbeat task if running
316
+ if self._heartbeat_task and not self._heartbeat_task.done():
317
+ self._heartbeat_task.cancel()
318
+
319
+ # Unsubscribe from all events
320
+ for unsub in self._unsubscribers:
321
+ unsub()
322
+ self._unsubscribers.clear()
323
+
324
+ _LOGGER.debug("CONNECTION_RECOVERY: Coordinator stopped for %s", self._central_info.name)
325
+
326
    async def _check_rpc_available(self, *, interface_id: str) -> bool:
        """
        Check if RPC interface is available.

        JSON-RPC-only interfaces are probed through the client's
        availability check; XML-RPC interfaces are probed with
        ``system.listMethods`` on the underlying proxy. Any exception
        during the probe is treated as "unavailable".

        Returns:
            True if the interface responded, False otherwise.

        """
        try:
            client = self._client_provider.get_client(interface_id=interface_id)

            # For JSON-RPC-only interfaces (CUxD, CCU-Jack), use check_connection_availability
            # which internally calls Interface.isPresent via JSON-RPC
            if client.interface in INTERFACES_REQUIRING_JSON_RPC_CLIENT - INTERFACES_REQUIRING_XML_RPC:
                return await client.check_connection_availability(handle_ping_pong=False)

            # For XML-RPC interfaces, use system.listMethods via proxy
            # pylint: disable=protected-access
            # Get the proxy - it may be directly on the client or on the backend
            proxy = None
            if hasattr(client, "_proxy"):
                proxy = client._proxy
            elif hasattr(client, "_backend") and hasattr(client._backend, "_proxy"):
                proxy = client._backend._proxy

            if proxy is not None and hasattr(proxy, "system"):
                # Reset the transport before checking - the HTTP connection may be
                # in an inconsistent state (e.g., ResponseNotReady) after connection loss.
                # This forces a fresh TCP connection for the RPC check.
                if hasattr(proxy, "_reset_transport"):
                    proxy._reset_transport()

                result = await proxy.system.listMethods()
                return bool(result)

            # Neither a JSON-RPC-only interface nor a usable proxy was found.
            _LOGGER.debug(
                "CONNECTION_RECOVERY: No suitable proxy found for RPC check on %s",
                interface_id,
            )
        except Exception as ex:  # noqa: BLE001
            _LOGGER.debug(
                "CONNECTION_RECOVERY: RPC check failed for %s: %s",
                interface_id,
                ex,
            )
        # Reached when no probe was possible or the probe raised.
        return False
366
+
367
+ async def _check_tcp_port_available(self, *, host: str, port: int) -> bool:
368
+ """Check if a TCP port is available (non-invasive connectivity check)."""
369
+ try:
370
+ reader, writer = await asyncio.wait_for(
371
+ asyncio.open_connection(host, port),
372
+ timeout=2.0,
373
+ )
374
+ writer.close()
375
+ await writer.wait_closed()
376
+ except (TimeoutError, OSError):
377
+ return False
378
+ return True
379
+
380
+ async def _emit_recovery_attempt(
381
+ self,
382
+ *,
383
+ interface_id: str,
384
+ state: InterfaceRecoveryState,
385
+ success: bool,
386
+ error_message: str | None = None,
387
+ ) -> None:
388
+ """Emit a recovery attempt event."""
389
+ await self._event_bus.publish(
390
+ event=RecoveryAttemptedEvent(
391
+ timestamp=datetime.now(),
392
+ interface_id=interface_id,
393
+ attempt_number=state.attempt_count,
394
+ max_attempts=MAX_RECOVERY_ATTEMPTS,
395
+ stage_reached=state.current_stage,
396
+ success=success,
397
+ error_message=error_message,
398
+ )
399
+ )
400
+
401
+ async def _emit_recovery_completed(
402
+ self,
403
+ *,
404
+ interface_id: str,
405
+ state: InterfaceRecoveryState,
406
+ ) -> None:
407
+ """Emit a recovery completed event."""
408
+ duration_ms = (time.perf_counter() - state.recovery_start_time) * 1000 if state.recovery_start_time else 0.0
409
+
410
+ await self._event_bus.publish(
411
+ event=RecoveryCompletedEvent(
412
+ timestamp=datetime.now(),
413
+ interface_id=interface_id,
414
+ central_name=self._central_info.name,
415
+ total_attempts=state.attempt_count,
416
+ total_duration_ms=duration_ms,
417
+ stages_completed=tuple(state.stages_completed),
418
+ )
419
+ )
420
+
421
+ async def _emit_recovery_failed(
422
+ self,
423
+ *,
424
+ interface_id: str,
425
+ state: InterfaceRecoveryState,
426
+ ) -> None:
427
+ """Emit a recovery failed event."""
428
+ duration_ms = (time.perf_counter() - state.recovery_start_time) * 1000 if state.recovery_start_time else 0.0
429
+
430
+ await self._event_bus.publish(
431
+ event=RecoveryFailedEvent(
432
+ timestamp=datetime.now(),
433
+ interface_id=interface_id,
434
+ central_name=self._central_info.name,
435
+ total_attempts=state.attempt_count,
436
+ total_duration_ms=duration_ms,
437
+ last_stage_reached=state.current_stage,
438
+ failure_reason=FailureReason.UNKNOWN,
439
+ requires_manual_intervention=True,
440
+ )
441
+ )
442
+
443
    async def _execute_recovery_stages(self, *, interface_id: str) -> bool:
        """
        Execute staged recovery for an interface.

        Runs the fixed pipeline COOLDOWN -> TCP_CHECKING -> RPC_CHECKING ->
        WARMING_UP -> STABILITY_CHECK -> RECONNECTING -> DATA_LOADING ->
        RECOVERED, aborting at the first stage that fails. Cooldown and
        warmup durations come from the configured timeout_config.

        Returns True if recovery succeeded, False otherwise.
        """
        # Only interfaces that were registered for recovery are processed.
        if interface_id not in self._recovery_states:
            return False

        timeout_config = self._config_provider.config.timeout_config

        try:
            # Stage: DETECTING → COOLDOWN
            await self._transition_stage(interface_id=interface_id, new_stage=RecoveryStage.COOLDOWN)
            await asyncio.sleep(timeout_config.reconnect_initial_cooldown)

            # Stage: COOLDOWN → TCP_CHECKING
            await self._transition_stage(interface_id=interface_id, new_stage=RecoveryStage.TCP_CHECKING)
            if not await self._stage_tcp_check(interface_id=interface_id):
                return False

            # Stage: TCP_CHECKING → RPC_CHECKING
            await self._transition_stage(interface_id=interface_id, new_stage=RecoveryStage.RPC_CHECKING)
            if not await self._stage_rpc_check(interface_id=interface_id):
                return False

            # Stage: RPC_CHECKING → WARMING_UP
            await self._transition_stage(interface_id=interface_id, new_stage=RecoveryStage.WARMING_UP)
            await asyncio.sleep(timeout_config.reconnect_warmup_delay)

            # Stage: WARMING_UP → STABILITY_CHECK
            await self._transition_stage(interface_id=interface_id, new_stage=RecoveryStage.STABILITY_CHECK)
            if not await self._stage_stability_check(interface_id=interface_id):
                return False

            # Stage: STABILITY_CHECK → RECONNECTING
            await self._transition_stage(interface_id=interface_id, new_stage=RecoveryStage.RECONNECTING)
            if not await self._stage_reconnect(interface_id=interface_id):
                return False

            # Stage: RECONNECTING → DATA_LOADING
            await self._transition_stage(interface_id=interface_id, new_stage=RecoveryStage.DATA_LOADING)
            if not await self._stage_data_load(interface_id=interface_id):
                return False

            # Stage: DATA_LOADING → RECOVERED
            await self._transition_stage(interface_id=interface_id, new_stage=RecoveryStage.RECOVERED)

        except asyncio.CancelledError:
            # Propagate cancellation so the owning task can clean up.
            _LOGGER.debug("CONNECTION_RECOVERY: Recovery cancelled for %s", interface_id)
            raise
        except Exception:
            _LOGGER.exception(  # i18n-log: ignore
                "CONNECTION_RECOVERY: Exception during recovery of %s",
                interface_id,
            )
            return False
        else:
            return True
502
+
503
+ def _get_client_port(self, *, interface_id: str) -> int | None:
504
+ """Get the port for a client."""
505
+ try:
506
+ client = self._client_provider.get_client(interface_id=interface_id)
507
+ # Access internal config to get port - pylint: disable=protected-access
508
+ # InterfaceClient stores config in _interface_config directly
509
+ if hasattr(client, "_interface_config"):
510
+ port = client._interface_config.port
511
+ return port if isinstance(port, int) else None
512
+ # ClientCCU stores config in _config.interface_config
513
+ if hasattr(client, "_config") and hasattr(client._config, "interface_config"):
514
+ port = client._config.interface_config.port
515
+ return port if isinstance(port, int) else None
516
+ except Exception: # noqa: BLE001
517
+ pass
518
+ return None
519
+
520
    async def _handle_max_retries_reached(self, *, interface_id: str) -> None:
        """
        Handle when max retries are reached for an interface.

        Marks the coordinator as failed, transitions the interface to the
        FAILED state, emits a RecoveryFailedEvent and arms the heartbeat
        timer that periodically re-enables retries.
        """
        self._in_failed_state = True
        self._transition_to_failed(interface_id=interface_id)

        # Only emit the failure event if we are still tracking the interface.
        if state := self._recovery_states.get(interface_id):
            await self._emit_recovery_failed(interface_id=interface_id, state=state)

        # Start heartbeat timer if not already running
        self._start_heartbeat_timer()

        _LOGGER.error(  # i18n-log: ignore
            "CONNECTION_RECOVERY: FAILED state entered for %s - max retries reached. "
            "Will retry every %d seconds via heartbeat.",
            interface_id,
            int(HEARTBEAT_RETRY_INTERVAL),
        )
537
+
538
    async def _heartbeat_loop(self) -> None:
        """
        Heartbeat loop for FAILED state retries.

        While the coordinator is in FAILED state, wakes every
        HEARTBEAT_RETRY_INTERVAL seconds, grants interfaces that exhausted
        their retry budget one more attempt, and publishes a
        HeartbeatTimerFiredEvent so recovery can be re-triggered.
        """
        while self._in_failed_state and not self._shutdown:
            await asyncio.sleep(HEARTBEAT_RETRY_INTERVAL)

            # Re-check conditions after sleep (state may have changed during await)
            if not self._in_failed_state or self._shutdown:
                return  # type: ignore[unreachable]

            # Get failed interfaces
            failed_interfaces = tuple(iid for iid, state in self._recovery_states.items() if not state.can_retry)

            if failed_interfaces:
                # Reset attempt counts to allow retry
                # (MAX - 1 leaves exactly one attempt before FAILED again).
                for iid in failed_interfaces:
                    if (state := self._recovery_states.get(iid)) is not None:
                        state.attempt_count = MAX_RECOVERY_ATTEMPTS - 1

                # Emit heartbeat event
                await self._event_bus.publish(
                    event=HeartbeatTimerFiredEvent(
                        timestamp=datetime.now(),
                        central_name=self._central_info.name,
                        interface_ids=failed_interfaces,
                    )
                )
564
+
565
    def _on_circuit_breaker_state_changed(self, *, event: CircuitBreakerStateChangedEvent) -> None:
        """
        Handle circuit breaker state change event.

        Only the HALF_OPEN -> CLOSED transition (breaker fully recovered)
        is acted upon: a data refresh is scheduled for the interface.
        """
        if self._shutdown:
            return

        # Only act on recovery: HALF_OPEN → CLOSED
        if event.old_state == CircuitState.HALF_OPEN and event.new_state == CircuitState.CLOSED:
            _LOGGER.info(  # i18n-log: ignore
                "CONNECTION_RECOVERY: Circuit breaker recovered for %s, triggering data refresh",
                event.interface_id,
            )
            # Schedule data refresh for the recovered interface.
            # Bind the id to a local so the closure does not hold `event`.
            iid = event.interface_id

            async def refresh_data() -> None:
                await self._refresh_interface_data(interface_id=iid)

            self._task_scheduler.create_task(
                target=refresh_data,
                name=f"recovery_refresh_{event.interface_id}",
            )
586
+
587
+ def _on_circuit_breaker_tripped(self, *, event: CircuitBreakerTrippedEvent) -> None:
588
+ """Handle circuit breaker tripped event."""
589
+ if self._shutdown:
590
+ return
591
+
592
+ interface_id = event.interface_id
593
+
594
+ _LOGGER.warning( # i18n-log: ignore
595
+ "CONNECTION_RECOVERY: Circuit breaker tripped for %s after %d failures",
596
+ interface_id,
597
+ event.failure_count,
598
+ )
599
+
600
+ # Circuit breaker trip indicates connection issues - start recovery if not already
601
+ if interface_id not in self._active_recoveries:
602
+
603
+ async def start_recovery_cb() -> None:
604
+ await self._start_recovery(interface_id=interface_id)
605
+
606
+ self._task_scheduler.create_task(
607
+ target=start_recovery_cb,
608
+ name=f"recovery_cb_{interface_id}",
609
+ )
610
+
611
+ def _on_connection_lost(self, *, event: ConnectionLostEvent) -> None:
612
+ """Handle connection lost event."""
613
+ if self._shutdown:
614
+ return
615
+
616
+ # Skip if already recovering this interface
617
+ if (interface_id := event.interface_id) in self._active_recoveries:
618
+ _LOGGER.debug(
619
+ "CONNECTION_RECOVERY: %s already recovering, skipping duplicate event",
620
+ interface_id,
621
+ )
622
+ return
623
+
624
+ _LOGGER.info( # i18n-log: ignore
625
+ "CONNECTION_RECOVERY: Connection lost for %s (reason: %s), starting recovery",
626
+ interface_id,
627
+ event.reason,
628
+ )
629
+
630
+ # Record incident for diagnostic purposes
631
+ self._record_connection_lost_incident(event=event)
632
+
633
+ # Start recovery for this interface
634
+ async def start_recovery() -> None:
635
+ await self._start_recovery(interface_id=interface_id)
636
+
637
+ self._task_scheduler.create_task(
638
+ target=start_recovery,
639
+ name=f"recovery_{interface_id}",
640
+ )
641
+
642
+ def _on_heartbeat_timer_fired(self, *, event: HeartbeatTimerFiredEvent) -> None:
643
+ """Handle heartbeat timer fired event."""
644
+ if self._shutdown or not self._in_failed_state:
645
+ return
646
+
647
+ _LOGGER.info( # i18n-log: ignore
648
+ "CONNECTION_RECOVERY: Heartbeat retry for %s with %d failed interfaces",
649
+ event.central_name,
650
+ len(event.interface_ids),
651
+ )
652
+
653
+ # Start recovery for all failed interfaces
654
+ self._task_scheduler.create_task(
655
+ target=lambda: self._recover_all_interfaces(interface_ids=list(event.interface_ids)),
656
+ name="heartbeat_recovery",
657
+ )
658
+
659
+ def _record_connection_lost_incident(self, *, event: ConnectionLostEvent) -> None:
660
+ """Record a CONNECTION_LOST incident for diagnostics."""
661
+ if (incident_recorder := self._incident_recorder) is None:
662
+ return
663
+
664
+ interface_id = event.interface_id
665
+ reason = event.reason
666
+ detected_at = event.detected_at.isoformat() if event.detected_at else None
667
+
668
+ # Gather client state information if available
669
+ client_state: str | None = None
670
+ circuit_breaker_state: str | None = None
671
+ try:
672
+ if client := self._client_provider.get_client(interface_id=interface_id):
673
+ client_state = client.state.state.value if hasattr(client.state, "state") else None
674
+ # pylint: disable=protected-access
675
+ if hasattr(client, "_circuit_breaker") and client._circuit_breaker: # noqa: SLF001
676
+ circuit_breaker_state = client._circuit_breaker.state.value # noqa: SLF001
677
+ # pylint: enable=protected-access
678
+ except Exception: # noqa: BLE001
679
+ pass # Don't fail incident recording if client info unavailable
680
+
681
+ # Get recovery state if available
682
+ recovery_attempt_count = 0
683
+ if (recovery_state := self._recovery_states.get(interface_id)) is not None:
684
+ recovery_attempt_count = recovery_state.attempt_count
685
+
686
+ context = {
687
+ "reason": reason,
688
+ "detected_at": detected_at,
689
+ "client_state": client_state,
690
+ "circuit_breaker_state": circuit_breaker_state,
691
+ "recovery_attempt_count": recovery_attempt_count,
692
+ "active_recoveries": list(self._active_recoveries),
693
+ "in_failed_state": self._in_failed_state,
694
+ }
695
+
696
+ async def _record() -> None:
697
+ try:
698
+ await incident_recorder.record_incident(
699
+ incident_type=IncidentType.CONNECTION_LOST,
700
+ severity=IncidentSeverity.ERROR,
701
+ message=f"Connection lost for {interface_id}: {reason}",
702
+ interface_id=interface_id,
703
+ context=context,
704
+ )
705
+ except Exception as err: # pragma: no cover
706
+ _LOGGER.debug(
707
+ "CONNECTION_RECOVERY: Failed to record connection lost incident for %s: %s",
708
+ interface_id,
709
+ err,
710
+ )
711
+
712
+ # Schedule the async recording via task scheduler
713
+ self._task_scheduler.create_task(
714
+ target=_record(),
715
+ name=f"record_connection_lost_incident_{interface_id}",
716
+ )
717
+
718
+ def _record_connection_restored_incident(
719
+ self,
720
+ *,
721
+ interface_id: str,
722
+ state: InterfaceRecoveryState,
723
+ ) -> None:
724
+ """Record a CONNECTION_RESTORED incident for diagnostics."""
725
+ if (incident_recorder := self._incident_recorder) is None:
726
+ return
727
+
728
+ # Calculate recovery duration
729
+ duration_ms = (time.perf_counter() - state.recovery_start_time) * 1000 if state.recovery_start_time else 0.0
730
+
731
+ # Gather client state information if available
732
+ client_state: str | None = None
733
+ circuit_breaker_state: str | None = None
734
+ try:
735
+ if client := self._client_provider.get_client(interface_id=interface_id):
736
+ client_state = client.state.state.value if hasattr(client.state, "state") else None
737
+ # pylint: disable=protected-access
738
+ if hasattr(client, "_circuit_breaker") and client._circuit_breaker: # noqa: SLF001
739
+ circuit_breaker_state = client._circuit_breaker.state.value # noqa: SLF001
740
+ # pylint: enable=protected-access
741
+ except Exception: # noqa: BLE001
742
+ pass # Don't fail incident recording if client info unavailable
743
+
744
+ context = {
745
+ "total_attempts": state.attempt_count,
746
+ "total_duration_ms": round(duration_ms, 2),
747
+ "stages_completed": [s.value for s in state.stages_completed],
748
+ "client_state": client_state,
749
+ "circuit_breaker_state": circuit_breaker_state,
750
+ "was_in_failed_state": self._in_failed_state,
751
+ }
752
+
753
+ async def _record() -> None:
754
+ try:
755
+ await incident_recorder.record_incident(
756
+ incident_type=IncidentType.CONNECTION_RESTORED,
757
+ severity=IncidentSeverity.INFO,
758
+ message=f"Connection restored for {interface_id} after {state.attempt_count} attempt(s)",
759
+ interface_id=interface_id,
760
+ context=context,
761
+ )
762
+ except Exception as err: # pragma: no cover
763
+ _LOGGER.debug(
764
+ "CONNECTION_RECOVERY: Failed to record connection restored incident for %s: %s",
765
+ interface_id,
766
+ err,
767
+ )
768
+
769
+ # Schedule the async recording via task scheduler
770
+ self._task_scheduler.create_task(
771
+ target=_record(),
772
+ name=f"record_connection_restored_incident_{interface_id}",
773
+ )
774
+
775
+ async def _recover_all_interfaces(self, *, interface_ids: list[str]) -> None:
776
+ """Recover multiple interfaces with throttling."""
777
+ if self._shutdown:
778
+ return
779
+
780
+ async def throttled_recovery(interface_id: str) -> bool:
781
+ async with self._recovery_semaphore:
782
+ return await self._execute_recovery_stages(interface_id=interface_id)
783
+
784
+ # Run recoveries in parallel with throttling
785
+ tasks = [throttled_recovery(iid) for iid in interface_ids]
786
+ results = await asyncio.gather(*tasks, return_exceptions=True)
787
+
788
+ # Process results
789
+ success_count = sum(1 for r in results if r is True)
790
+ failed_count = len(interface_ids) - success_count
791
+
792
+ if success_count == len(interface_ids):
793
+ self._in_failed_state = False
794
+ self._transition_to_running()
795
+ elif success_count > 0:
796
+ self._transition_to_degraded(failed_count=failed_count)
797
+
798
+ async def _refresh_interface_data(self, *, interface_id: str) -> None:
799
+ """Refresh data for a specific interface after recovery."""
800
+ try:
801
+ client = self._client_provider.get_client(interface_id=interface_id)
802
+ await self._device_data_refresher.load_and_refresh_data_point_data(interface=client.interface)
803
+ _LOGGER.debug("CONNECTION_RECOVERY: Data refresh completed for %s", interface_id)
804
+ except Exception:
805
+ _LOGGER.exception( # i18n-log: ignore
806
+ "CONNECTION_RECOVERY: Data refresh failed for %s",
807
+ interface_id,
808
+ )
809
+
810
+ async def _stage_data_load(self, *, interface_id: str) -> bool:
811
+ """Stage: Load device and paramset data."""
812
+ try:
813
+ client = self._client_provider.get_client(interface_id=interface_id)
814
+ interface = client.interface
815
+ await self._device_data_refresher.load_and_refresh_data_point_data(interface=interface)
816
+ except Exception:
817
+ _LOGGER.exception( # i18n-log: ignore
818
+ "CONNECTION_RECOVERY: Data load failed for %s",
819
+ interface_id,
820
+ )
821
+ return False
822
+
823
+ _LOGGER.info( # i18n-log: ignore
824
+ "CONNECTION_RECOVERY: Data load completed for %s",
825
+ interface_id,
826
+ )
827
+ return True
828
+
829
+ async def _stage_reconnect(self, *, interface_id: str) -> bool:
830
+ """Stage: Perform full client reconnection."""
831
+ try:
832
+ client = self._client_provider.get_client(interface_id=interface_id)
833
+ await client.reconnect()
834
+ except Exception:
835
+ _LOGGER.exception( # i18n-log: ignore
836
+ "CONNECTION_RECOVERY: Reconnect exception for %s",
837
+ interface_id,
838
+ )
839
+ return False
840
+
841
+ if client.available:
842
+ _LOGGER.info( # i18n-log: ignore
843
+ "CONNECTION_RECOVERY: Reconnect succeeded for %s",
844
+ interface_id,
845
+ )
846
+ return True
847
+
848
+ _LOGGER.warning( # i18n-log: ignore
849
+ "CONNECTION_RECOVERY: Reconnect failed for %s - client not available",
850
+ interface_id,
851
+ )
852
+ return False
853
+
854
+ async def _stage_rpc_check(self, *, interface_id: str) -> bool:
855
+ """Stage: Check RPC service availability."""
856
+ if await self._check_rpc_available(interface_id=interface_id):
857
+ _LOGGER.info( # i18n-log: ignore
858
+ "CONNECTION_RECOVERY: RPC service available for %s",
859
+ interface_id,
860
+ )
861
+ return True
862
+
863
+ _LOGGER.warning( # i18n-log: ignore
864
+ "CONNECTION_RECOVERY: RPC service not available for %s",
865
+ interface_id,
866
+ )
867
+ return False
868
+
869
+ async def _stage_stability_check(self, *, interface_id: str) -> bool:
870
+ """Stage: Confirm RPC stability after warmup."""
871
+ if await self._check_rpc_available(interface_id=interface_id):
872
+ _LOGGER.info( # i18n-log: ignore
873
+ "CONNECTION_RECOVERY: RPC service stable for %s",
874
+ interface_id,
875
+ )
876
+ return True
877
+
878
+ _LOGGER.warning( # i18n-log: ignore
879
+ "CONNECTION_RECOVERY: RPC unstable after warmup for %s",
880
+ interface_id,
881
+ )
882
+ return False
883
+
884
+ async def _stage_tcp_check(self, *, interface_id: str) -> bool:
885
+ """Stage: Check TCP port availability."""
886
+ timeout_config = self._config_provider.config.timeout_config
887
+ config = self._config_provider.config
888
+ host = config.host
889
+
890
+ # Get the port to check
891
+ port = self._get_client_port(interface_id=interface_id)
892
+
893
+ # For JSON-RPC-only interfaces (CUxD, CCU-Jack), use the JSON-RPC port instead
894
+ # These interfaces don't have their own XML-RPC port
895
+ if port is None or port == 0:
896
+ client = self._client_provider.get_client(interface_id=interface_id)
897
+ if client.interface in INTERFACES_REQUIRING_JSON_RPC_CLIENT - INTERFACES_REQUIRING_XML_RPC:
898
+ port = get_json_rpc_default_port(tls=config.tls)
899
+ _LOGGER.debug(
900
+ "CONNECTION_RECOVERY: Using JSON-RPC port %d for %s",
901
+ port,
902
+ interface_id,
903
+ )
904
+ else:
905
+ # Non-JSON-RPC interface without a port - can't check
906
+ _LOGGER.warning( # i18n-log: ignore
907
+ "CONNECTION_RECOVERY: No port configured for %s, skipping TCP check",
908
+ interface_id,
909
+ )
910
+ return False
911
+
912
+ start_time = time.perf_counter()
913
+ while (time.perf_counter() - start_time) < timeout_config.reconnect_tcp_check_timeout:
914
+ if await self._check_tcp_port_available(host=host, port=port):
915
+ _LOGGER.info( # i18n-log: ignore
916
+ "CONNECTION_RECOVERY: TCP port available for %s (%s:%d)",
917
+ interface_id,
918
+ host,
919
+ port,
920
+ )
921
+ return True
922
+ await asyncio.sleep(2.0) # Check every 2 seconds
923
+
924
+ _LOGGER.warning( # i18n-log: ignore
925
+ "CONNECTION_RECOVERY: TCP check timeout for %s",
926
+ interface_id,
927
+ )
928
+ return False
929
+
930
+ def _start_heartbeat_timer(self) -> None:
931
+ """Start the heartbeat timer for FAILED state retries."""
932
+ if self._heartbeat_task and not self._heartbeat_task.done():
933
+ return # Already running
934
+
935
+ self._task_scheduler.create_task(
936
+ target=self._heartbeat_loop,
937
+ name="heartbeat_timer",
938
+ )
939
+
940
    async def _start_recovery(self, *, interface_id: str) -> None:
        """Start recovery for a single interface.

        Looks up (or creates) the per-interface recovery state, aborts once the
        retry budget is exhausted, and otherwise runs the staged recovery under
        the shared semaphore. On success the state machine and event bus are
        updated and the state reset; on failure a retry is scheduled with
        backoff until the attempt count exceeds MAX_RECOVERY_ATTEMPTS.
        """
        if self._shutdown:
            return

        # Get or create recovery state
        if (state := self._recovery_states.get(interface_id)) is None:
            state = InterfaceRecoveryState(interface_id=interface_id)
            self._recovery_states[interface_id] = state

        # Check if max retries reached
        if not state.can_retry:
            _LOGGER.warning(  # i18n-log: ignore
                "CONNECTION_RECOVERY: Max retries (%d) reached for %s",
                MAX_RECOVERY_ATTEMPTS,
                interface_id,
            )
            await self._handle_max_retries_reached(interface_id=interface_id)
            return

        # Mark as active recovery
        self._active_recoveries.add(interface_id)
        state.start_recovery()

        # Transition central to RECOVERING
        self._transition_to_recovering()

        # Emit connection_state event to notify integration of connection issue
        # This ensures users see a repair notification immediately when recovery starts
        await self._event_bus.publish(
            event=SystemStatusChangedEvent(
                timestamp=datetime.now(),
                connection_state=(interface_id, False),
            )
        )

        # Clear JSON-RPC session to force re-authentication
        # This prevents auth errors from stale sessions during recovery
        if client := self._client_provider.get_client(interface_id=interface_id):
            client.clear_json_rpc_session()

        try:
            async with self._recovery_semaphore:
                success = await self._execute_recovery_stages(interface_id=interface_id)

            if success:
                state.record_success()
                # Record incident before reset (preserves recovery metrics)
                self._record_connection_restored_incident(interface_id=interface_id, state=state)
                await self._emit_recovery_completed(interface_id=interface_id, state=state)
                state.reset()
                # Emit connection_state event to notify integration of connection restored
                # This clears the repair notification created when recovery started
                await self._event_bus.publish(
                    event=SystemStatusChangedEvent(
                        timestamp=datetime.now(),
                        connection_state=(interface_id, True),
                    )
                )
                # Remove from active recoveries BEFORE checking transition state
                # This ensures _transition_after_recovery() sees correct active_recoveries count
                self._active_recoveries.discard(interface_id)
                self._transition_after_recovery()
            else:
                state.record_failure()
                await self._emit_recovery_attempt(interface_id=interface_id, state=state, success=False)

                # Note: record_failure() above incremented attempt_count, may now exceed max
                if not state.can_retry:
                    await self._handle_max_retries_reached(  # type: ignore[unreachable]
                        interface_id=interface_id
                    )
                else:
                    # Schedule retry with backoff
                    delay = state.next_retry_delay
                    _LOGGER.info(  # i18n-log: ignore
                        "CONNECTION_RECOVERY: Scheduling retry for %s in %.1fs",
                        interface_id,
                        delay,
                    )
                    # NOTE: the semaphore was released when the `async with`
                    # block above exited, so sleeping/recursing here does not
                    # block other interfaces' recoveries.
                    await asyncio.sleep(delay)
                    if not self._shutdown:
                        await self._start_recovery(interface_id=interface_id)

        finally:
            # Ensure cleanup on failure/exception (safe to call twice, discard is idempotent)
            self._active_recoveries.discard(interface_id)
1027
+
1028
+ def _stop_heartbeat_timer(self) -> None:
1029
+ """Stop the heartbeat timer."""
1030
+ if self._heartbeat_task and not self._heartbeat_task.done():
1031
+ self._heartbeat_task.cancel()
1032
+ self._heartbeat_task = None
1033
+
1034
+ def _subscribe_to_events(self) -> None:
1035
+ """Subscribe to connection-related events."""
1036
+ self._unsubscribers.append(
1037
+ self._event_bus.subscribe(
1038
+ event_type=ConnectionLostEvent,
1039
+ event_key=None,
1040
+ handler=self._on_connection_lost,
1041
+ )
1042
+ )
1043
+ self._unsubscribers.append(
1044
+ self._event_bus.subscribe(
1045
+ event_type=CircuitBreakerTrippedEvent,
1046
+ event_key=None,
1047
+ handler=self._on_circuit_breaker_tripped,
1048
+ )
1049
+ )
1050
+ self._unsubscribers.append(
1051
+ self._event_bus.subscribe(
1052
+ event_type=CircuitBreakerStateChangedEvent,
1053
+ event_key=None,
1054
+ handler=self._on_circuit_breaker_state_changed,
1055
+ )
1056
+ )
1057
+ self._unsubscribers.append(
1058
+ self._event_bus.subscribe(
1059
+ event_type=HeartbeatTimerFiredEvent,
1060
+ event_key=None,
1061
+ handler=self._on_heartbeat_timer_fired,
1062
+ )
1063
+ )
1064
+
1065
+ def _transition_after_recovery(self) -> None:
1066
+ """Transition central state after successful recovery."""
1067
+ if self._state_machine is None:
1068
+ return
1069
+
1070
+ # Check if all active recoveries are complete
1071
+ if not self._active_recoveries:
1072
+ self._transition_to_running()
1073
+ self._in_failed_state = False
1074
+ self._stop_heartbeat_timer()
1075
+
1076
+ async def _transition_stage(self, *, interface_id: str, new_stage: RecoveryStage) -> None:
1077
+ """Transition to a new recovery stage and emit event."""
1078
+ if (state := self._recovery_states.get(interface_id)) is None:
1079
+ return
1080
+
1081
+ if (old_stage := state.current_stage) == new_stage:
1082
+ return
1083
+
1084
+ duration_ms = state.transition_to_stage(new_stage=new_stage)
1085
+
1086
+ await self._event_bus.publish(
1087
+ event=RecoveryStageChangedEvent(
1088
+ timestamp=datetime.now(),
1089
+ interface_id=interface_id,
1090
+ old_stage=old_stage,
1091
+ new_stage=new_stage,
1092
+ duration_in_old_stage_ms=duration_ms,
1093
+ attempt_number=state.attempt_count + 1,
1094
+ )
1095
+ )
1096
+
1097
+ def _transition_to_degraded(self, *, failed_count: int) -> None:
1098
+ """Transition central to DEGRADED state."""
1099
+ if self._state_machine is None:
1100
+ return
1101
+
1102
+ if self._state_machine.can_transition_to(target=CentralState.DEGRADED):
1103
+ self._state_machine.transition_to(
1104
+ target=CentralState.DEGRADED,
1105
+ reason=f"Partial recovery: {failed_count} interface(s) still failed",
1106
+ )
1107
+
1108
+ def _transition_to_failed(self, *, interface_id: str) -> None:
1109
+ """Transition central to FAILED state."""
1110
+ if self._state_machine is None:
1111
+ return
1112
+
1113
+ if self._state_machine.can_transition_to(target=CentralState.FAILED):
1114
+ self._state_machine.transition_to(
1115
+ target=CentralState.FAILED,
1116
+ reason=f"Max retries reached for {interface_id}",
1117
+ failure_reason=FailureReason.UNKNOWN,
1118
+ failure_interface_id=interface_id,
1119
+ )
1120
+
1121
+ def _transition_to_recovering(self) -> None:
1122
+ """Transition central to RECOVERING state."""
1123
+ if self._state_machine is None:
1124
+ return
1125
+
1126
+ if self._state_machine.can_transition_to(target=CentralState.RECOVERING):
1127
+ self._state_machine.transition_to(
1128
+ target=CentralState.RECOVERING,
1129
+ reason="Connection recovery in progress",
1130
+ )
1131
+
1132
+ def _transition_to_running(self) -> None:
1133
+ """Transition central to RUNNING state."""
1134
+ if self._state_machine is None:
1135
+ return
1136
+
1137
+ if self._state_machine.can_transition_to(target=CentralState.RUNNING):
1138
+ self._state_machine.transition_to(
1139
+ target=CentralState.RUNNING,
1140
+ reason="All interfaces recovered successfully",
1141
+ )