cgse-core 0.17.3__tar.gz → 0.17.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cgse_core-0.17.3 → cgse_core-0.17.4}/PKG-INFO +1 -1
- {cgse_core-0.17.3 → cgse_core-0.17.4}/pyproject.toml +1 -1
- {cgse_core-0.17.3 → cgse_core-0.17.4}/src/cgse_core/_stop.py +1 -2
- cgse_core-0.17.4/src/egse/connect.py +528 -0
- {cgse_core-0.17.3 → cgse_core-0.17.4}/src/egse/control.py +2 -1
- {cgse_core-0.17.3 → cgse_core-0.17.4}/src/egse/dummy.py +36 -11
- {cgse_core-0.17.3 → cgse_core-0.17.4}/src/egse/logger/__init__.py +2 -2
- {cgse_core-0.17.3 → cgse_core-0.17.4}/src/egse/notifyhub/server.py +3 -1
- {cgse_core-0.17.3 → cgse_core-0.17.4}/src/egse/procman/procman_cs.py +1 -1
- {cgse_core-0.17.3 → cgse_core-0.17.4}/src/egse/procman/procman_ui.py +5 -2
- {cgse_core-0.17.3 → cgse_core-0.17.4}/src/egse/registry/client.py +16 -6
- {cgse_core-0.17.3 → cgse_core-0.17.4}/src/egse/registry/server.py +10 -2
- {cgse_core-0.17.3 → cgse_core-0.17.4}/src/egse/registry/service.py +11 -5
- cgse_core-0.17.3/src/egse/connect.py +0 -55
- {cgse_core-0.17.3 → cgse_core-0.17.4}/.gitignore +0 -0
- {cgse_core-0.17.3 → cgse_core-0.17.4}/README.md +0 -0
- {cgse_core-0.17.3 → cgse_core-0.17.4}/src/cgse_core/__init__.py +0 -0
- {cgse_core-0.17.3 → cgse_core-0.17.4}/src/cgse_core/_start.py +0 -0
- {cgse_core-0.17.3 → cgse_core-0.17.4}/src/cgse_core/_status.py +0 -0
- {cgse_core-0.17.3 → cgse_core-0.17.4}/src/cgse_core/cgse_explore.py +0 -0
- {cgse_core-0.17.3 → cgse_core-0.17.4}/src/cgse_core/services.py +0 -0
- {cgse_core-0.17.3 → cgse_core-0.17.4}/src/cgse_core/settings.yaml +0 -0
- {cgse_core-0.17.3 → cgse_core-0.17.4}/src/egse/_setup_core.py +0 -0
- {cgse_core-0.17.3 → cgse_core-0.17.4}/src/egse/command.py +0 -0
- {cgse_core-0.17.3 → cgse_core-0.17.4}/src/egse/confman/__init__.py +0 -0
- {cgse_core-0.17.3 → cgse_core-0.17.4}/src/egse/confman/__main__.py +0 -0
- {cgse_core-0.17.3 → cgse_core-0.17.4}/src/egse/confman/confman.yaml +0 -0
- {cgse_core-0.17.3 → cgse_core-0.17.4}/src/egse/confman/confman_cs.py +0 -0
- {cgse_core-0.17.3 → cgse_core-0.17.4}/src/egse/icons/busy.svg +0 -0
- {cgse_core-0.17.3 → cgse_core-0.17.4}/src/egse/icons/operational-mode.svg +0 -0
- {cgse_core-0.17.3 → cgse_core-0.17.4}/src/egse/icons/pm_ui.svg +0 -0
- {cgse_core-0.17.3 → cgse_core-0.17.4}/src/egse/icons/simulator-mode.svg +0 -0
- {cgse_core-0.17.3 → cgse_core-0.17.4}/src/egse/icons/start-process-button.svg +0 -0
- {cgse_core-0.17.3 → cgse_core-0.17.4}/src/egse/icons/stop-process-button.svg +0 -0
- {cgse_core-0.17.3 → cgse_core-0.17.4}/src/egse/icons/user-interface.svg +0 -0
- {cgse_core-0.17.3 → cgse_core-0.17.4}/src/egse/listener.py +0 -0
- {cgse_core-0.17.3 → cgse_core-0.17.4}/src/egse/logger/__main__.py +0 -0
- {cgse_core-0.17.3 → cgse_core-0.17.4}/src/egse/logger/log_cs.py +0 -0
- {cgse_core-0.17.3 → cgse_core-0.17.4}/src/egse/mixin.py +0 -0
- {cgse_core-0.17.3 → cgse_core-0.17.4}/src/egse/monitoring.py +0 -0
- {cgse_core-0.17.3 → cgse_core-0.17.4}/src/egse/notifyhub/__init__.py +0 -0
- {cgse_core-0.17.3 → cgse_core-0.17.4}/src/egse/notifyhub/client.py +0 -0
- {cgse_core-0.17.3 → cgse_core-0.17.4}/src/egse/notifyhub/event.py +0 -0
- {cgse_core-0.17.3 → cgse_core-0.17.4}/src/egse/notifyhub/services.py +0 -0
- {cgse_core-0.17.3 → cgse_core-0.17.4}/src/egse/procman/__init__.py +0 -0
- {cgse_core-0.17.3 → cgse_core-0.17.4}/src/egse/procman/procman.yaml +0 -0
- {cgse_core-0.17.3 → cgse_core-0.17.4}/src/egse/procman/procman_protocol.py +0 -0
- {cgse_core-0.17.3 → cgse_core-0.17.4}/src/egse/protocol.py +0 -0
- {cgse_core-0.17.3 → cgse_core-0.17.4}/src/egse/proxy.py +0 -0
- {cgse_core-0.17.3 → cgse_core-0.17.4}/src/egse/registry/__init__.py +0 -0
- {cgse_core-0.17.3 → cgse_core-0.17.4}/src/egse/registry/backend.py +0 -0
- {cgse_core-0.17.3 → cgse_core-0.17.4}/src/egse/services.py +0 -0
- {cgse_core-0.17.3 → cgse_core-0.17.4}/src/egse/services.yaml +0 -0
- {cgse_core-0.17.3 → cgse_core-0.17.4}/src/egse/storage/__init__.py +0 -0
- {cgse_core-0.17.3 → cgse_core-0.17.4}/src/egse/storage/__main__.py +0 -0
- {cgse_core-0.17.3 → cgse_core-0.17.4}/src/egse/storage/persistence.py +0 -0
- {cgse_core-0.17.3 → cgse_core-0.17.4}/src/egse/storage/storage.yaml +0 -0
- {cgse_core-0.17.3 → cgse_core-0.17.4}/src/egse/storage/storage_cs.py +0 -0
|
@@ -0,0 +1,528 @@
|
|
|
1
|
+
import random
|
|
2
|
+
import threading
|
|
3
|
+
import time
|
|
4
|
+
from enum import Enum
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from egse.env import bool_env
|
|
8
|
+
from egse.log import logging
|
|
9
|
+
from egse.system import type_name
|
|
10
|
+
from egse.zmq_ser import connect_address
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger("egse.connect")
|
|
13
|
+
|
|
14
|
+
# random.seed(time.monotonic()) # uncomment for testing only, main application should set a seed.
|
|
15
|
+
|
|
16
|
+
VERBOSE_DEBUG = bool_env("VERBOSE_DEBUG")
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def get_endpoint(
    service_type: str | None = None,
    protocol: str = "tcp",
    hostname: str = "localhost",
    port: int = 0,
) -> str:
    """
    Returns the endpoint for a service, either from the registry or by constructing
    it from protocol, hostname and port.

    If port is 0 (the default), the endpoint is retrieved from the service registry.
    Otherwise the registry is not consulted and the endpoint is constructed from the
    given protocol, hostname and port.

    Args:
        service_type: The service type to look up in the registry.
        protocol: Protocol to use if constructing the endpoint, defaults to tcp.
        hostname: Hostname to use if constructing the endpoint, defaults to localhost.
        port: Port to use if constructing the endpoint, defaults to 0.

    Returns:
        The endpoint string.

    Raises:
        RuntimeError: If port is 0 and the registry has no endpoint for the service type.
    """
    if port == 0:
        # Imported lazily and only on the registry path: callers that provide an
        # explicit port do not need the registry client to be importable at all.
        from egse.registry.client import RegistryClient

        with RegistryClient() as reg:
            endpoint = reg.get_endpoint(service_type)

        if not endpoint:
            logger.warning(f"No endpoint for '{service_type}' found in registry.")
            raise RuntimeError(f"No service registered as '{service_type}' and no port provided.")

        if VERBOSE_DEBUG:
            logger.debug(f"Endpoint for '{service_type}' found in registry: {endpoint}")

        return endpoint

    endpoint = connect_address(protocol, hostname, port)
    if VERBOSE_DEBUG:
        logger.debug(f"Endpoint constructed from protocol/hostname/port: {endpoint}")

    return endpoint
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
class ConnectionState(Enum):
    """Connection states tracked by the service connector classes in this module."""

    # No connection is established; a new attempt is allowed once the retry interval has elapsed.
    DISCONNECTED = "disconnected"
    # Set by attempt_connection() right before connect_to_service() is called.
    CONNECTING = "connecting"
    # Set when connect_to_service() returned a truthy value.
    CONNECTED = "connected"
    # Set after too many consecutive failures; no attempts are made until the
    # circuit break duration has elapsed.
    CIRCUIT_OPEN = "circuit_open"  # Temporarily stopped trying
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
class BackoffStrategy(Enum):
    """
    Specifies the strategy for increasing the delay between retry attempts
    in backoff algorithms to reduce load and avoid overwhelming services.

    The strategy is interpreted by `calculate_retry_interval()`.

    Strategies:
        EXPONENTIAL:
            The delay doubles with each retry attempt (e.g., 1s, 2s, 4s, 8s),
            i.e. `base_interval * 2**attempt_number`, limited to the maximum interval.
            This is the most widely used approach because it quickly reduces load on struggling systems.
        LINEAR:
            The delay increases by a fixed amount each time (e.g., 1s, 2s, 3s, 4s).
            As implemented, one second is added per attempt (`base_interval + attempt_number`),
            limited to the maximum interval.
            This provides a more gradual reduction in request rate.
        FIXED:
            Uses the same delay (the base interval) between all retry attempts.
            Simple but less adaptive to system conditions.

    References:
        - AWS Architecture Blog: Exponential Backoff And Jitter
    """

    EXPONENTIAL = "exponential"
    """The delay doubles with each retry attempt (e.g., 1s, 2s, 4s, 8s).
    This is the most widely used approach because it quickly reduces load on struggling systems."""
    LINEAR = "linear"
    """The delay increases by a fixed amount each time (e.g., 1s, 2s, 3s, 4s).
    This provides a more gradual reduction in request rate."""
    FIXED = "fixed"
    """Uses the same delay between all retry attempts. Simple but less adaptive to system conditions."""
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
class JitterStrategy(Enum):
    """
    Specifies the strategy for applying jitter (randomization) to retry intervals
    in backoff algorithms to avoid synchronized retries and reduce load spikes.

    The strategy is interpreted by `calculate_retry_interval()`, which applies the jitter
    to the interval computed by the backoff strategy.

    Strategies:
        NONE:
            No jitter is applied. The retry interval is deterministic.
        FULL:
            Applies full jitter by selecting a random value uniformly between 0 and the calculated interval.
            This maximizes randomness but can result in very short delays.
        EQUAL:
            Applies "equal jitter" as described in the AWS Architecture Blog.
            The interval is randomized within [interval/2, interval], ensuring a minimum delay of half the interval.
            Note: This is not the same as "a jitter of 50% around interval" (which would be [0.5 * interval, 1.5 * interval]).
        PERCENT_10:
            Applies a jitter of ±10% around the base interval, resulting in a random interval within [0.9 * interval, 1.1 * interval].
            Because the jitter is applied after the interval was limited to the maximum,
            the result can exceed the maximum interval by up to 10%.

    References:
        - AWS Architecture Blog: Exponential Backoff And Jitter
    """

    NONE = "none"
    """No jitter is applied to the backoff."""
    FULL = "full"
    """Maximum distribution but can be too random with very short intervals."""
    EQUAL = "equal"
    """Best balance, maintains backoff properties while preventing synchronization."""
    PERCENT_10 = "10%"
    """Add a jitter of 10% around the base interval."""
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def calculate_retry_interval(
    attempt_number: int,
    base_interval: float,
    max_interval: float,
    backoff_strategy: BackoffStrategy = BackoffStrategy.EXPONENTIAL,
    jitter_strategy: JitterStrategy = JitterStrategy.EQUAL,
) -> float:
    """
    Calculates the next retry interval based on the given backoff and jitter strategies.

    The backoff strategy determines the un-jittered interval, which never exceeds
    `max_interval`. The jitter strategy then randomizes that interval.

    Args:
        attempt_number (int): The current retry attempt (starting from 0).
        base_interval (float): The initial interval in seconds.
        max_interval (float): The maximum allowed interval in seconds.
        backoff_strategy (BackoffStrategy): Strategy for increasing the delay (exponential, linear, or fixed).
        jitter_strategy (JitterStrategy): Strategy for randomizing the delay to avoid synchronization.

    Returns:
        float: The computed retry interval in seconds.

    Notes:
        - See the docstrings for BackoffStrategy and JitterStrategy for details on each strategy.
        - Based on best practices from the AWS Architecture Blog: Exponential Backoff And Jitter.
        - With JitterStrategy.PERCENT_10 the returned value can exceed `max_interval` by up to 10%,
          because the jitter is applied around the already limited interval.
    """

    if backoff_strategy == BackoffStrategy.EXPONENTIAL:
        try:
            interval = min(base_interval * (2**attempt_number), max_interval)
        except OverflowError:
            # A float base interval multiplied by a very large power of two cannot be
            # represented as a float. Such a delay is beyond the maximum anyway.
            interval = max_interval
    elif backoff_strategy == BackoffStrategy.LINEAR:
        # One second is added per attempt.
        interval = min(base_interval + attempt_number, max_interval)
    else:
        # FIXED (or an unknown strategy): constant delay, still honoring the maximum.
        interval = min(base_interval, max_interval)

    if jitter_strategy == JitterStrategy.NONE:
        return interval
    elif jitter_strategy == JitterStrategy.FULL:
        return random.uniform(0, interval)
    elif jitter_strategy == JitterStrategy.EQUAL:
        return interval / 2 + random.uniform(0, interval / 2)
    elif jitter_strategy == JitterStrategy.PERCENT_10:
        jitter_amount = interval * 0.1
        return interval + random.uniform(-jitter_amount, jitter_amount)

    # Unknown jitter strategy: fall back to the deterministic interval.
    return interval
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
class AsyncServiceConnector:
    """
    Asynchronous base class for robust service connection management with retry, backoff, and circuit breaker logic.

    This class is intended to be subclassed for managing persistent connections to external services
    (such as devices, databases, or remote APIs) that may be unreliable or temporarily unavailable.

    Features:
        - Automatic retry with configurable backoff and jitter strategies.
        - Circuit breaker to prevent repeated connection attempts after multiple failures.
        - Connection state tracking (disconnected, connecting, connected, circuit open).

    Usage:
        1. Subclass `AsyncServiceConnector` and override the `connect_to_service()` coroutine with your
           actual connection logic. Override `health_check()` for custom health verification: the default
           implementation returns False, which makes `attempt_connection()` disconnect and reconnect a
           connector that is in the CONNECTED state.
        2. Store the actual connection object (e.g., socket, transport) as an instance attribute in your subclass.
        3. Use `attempt_connection()` to initiate connection attempts; it will handle retries and backoff automatically.
        4. Use `is_connected()` to check connection status.

    Example:
        class MyConnector(AsyncServiceConnector):
            async def connect_to_service(self):
                self.connection = await create_socket()
                return self.connection is not None

            def get_connection(self):
                return self.connection

    Note:
        The base class does not manage or expose the underlying connection object.
        Your subclass should provide a method or property to access it as needed.
    """

    def __init__(
        self,
        service_name: str,
        backoff_strategy: BackoffStrategy = BackoffStrategy.EXPONENTIAL,
        jitter_strategy: JitterStrategy = JitterStrategy.EQUAL,
    ):
        """
        Args:
            service_name: Name of the service, used in log messages.
            backoff_strategy: How the retry interval grows after each failure.
            jitter_strategy: How the retry interval is randomized.
        """
        self.state = ConnectionState.DISCONNECTED
        self.last_attempt = 0  # time.monotonic() timestamp of the last connection attempt
        self.base_interval = 1
        self.retry_interval = self.base_interval  # Start with the base interval (1 second)
        self.max_retry_interval = 300  # Max 5 minutes
        self.failure_count = 0
        self.max_failures_before_circuit_break = 5
        self.circuit_break_duration = 60  # 1 minute
        self.circuit_opened_at = None  # time.monotonic() timestamp, set when the circuit opens
        self.backoff_strategy = backoff_strategy
        self.jitter_strategy = jitter_strategy

        self.service_name = service_name

    async def connect_to_service(self) -> bool:
        """
        Establish the connection to the service. Subclasses must override this coroutine.

        Returns:
            True when the connection succeeded, False otherwise. The default implementation
            logs a warning and always returns False.
        """
        logger.warning(
            f"The connect_to_service() method is not implemented for {self.service_name}, connection will always fail."
        )
        return False

    async def disconnect_from_service(self) -> None:
        """
        Optional hook to cleanly disconnect / release resources for the service.
        The default implementation only sets the state to DISCONNECTED. Subclasses should override to:
          - close async transports
          - cancel background tasks
          - set state to ConnectionState.DISCONNECTED
          - call device.disconnect()
        """
        logger.debug(f"{self.service_name}: default async disconnect_from_service(): no-op")
        self.state = ConnectionState.DISCONNECTED
        return

    async def health_check(self) -> bool:
        """
        Verify that an established connection is still usable. Subclasses should override this coroutine.

        Returns:
            True when the connection is healthy. The default implementation logs a warning
            and always returns False.
        """
        logger.warning(
            f"The health_check() method is not implemented for {self.service_name}, check will always return false."
        )
        return False

    def should_attempt_connection(self) -> bool:
        """Return True if we should attempt a new connection."""
        now = time.monotonic()

        # If circuit is open, check if we should close it
        if self.state == ConnectionState.CIRCUIT_OPEN:
            assert self.circuit_opened_at is not None
            circuit_break_open_since = now - self.circuit_opened_at
            logger.debug(f"{circuit_break_open_since=}")
            if circuit_break_open_since > self.circuit_break_duration:
                self.state = ConnectionState.DISCONNECTED
                self.failure_count = 0
                # Restart the backoff from the configured base interval.
                self.retry_interval = self.base_interval
                return True
            return False

        # Regular backoff logic
        return now - self.last_attempt >= self.retry_interval

    async def attempt_connection(self):
        """Try to connect to the service.

        This will execute the `connect_to_service()` that was overridden by the subclass.
        That function shall return True when the connection succeeded, False otherwise.

        When the state is CONNECTED, the connection is first validated with `health_check()`.
        A healthy connection is left untouched; an unhealthy one is disconnected and, if the
        backoff allows it, re-established. Nothing is attempted while the retry interval has
        not elapsed or while the circuit is open.
        """
        if self.state == ConnectionState.CONNECTED:
            # ensure the CONNECTED state is validated before skipping reconnection attempts
            # even if state is CONNECTED, the underlying connection could be stale or broken
            # or closed externally and unless you check the health here, you will never attempt
            # recovery.
            try:
                healthy = await self.health_check()
            except Exception as exc:
                logger.debug(f"health_check raised: {type_name(exc)} – {exc}")
                healthy = False

            if healthy:
                if VERBOSE_DEBUG:
                    logger.debug(f"{self.service_name} already connected and healthy")
                return

            logger.info(
                f"{self.service_name} marked CONNECTED but health_check failed — disconnecting and reconnecting"
            )
            # Set the state here as well, an overridden disconnect hook might not do it or might raise.
            self.state = ConnectionState.DISCONNECTED
            try:
                # ensure the state is updated by disconnect hook (disconnect_from_service should set DISCONNECTED)
                await self.disconnect_from_service()
            except Exception as exc:
                # Best effort: a failing disconnect must not prevent the reconnection attempt.
                if VERBOSE_DEBUG:
                    logger.debug(f"Couldn't disconnect from {self.service_name}: {type_name(exc)} – {exc}")

        if not self.should_attempt_connection():
            logger.debug("Not time yet to attempt new connection")
            return

        self.state = ConnectionState.CONNECTING
        self.last_attempt = time.monotonic()

        try:
            success = await self.connect_to_service()

            if success:
                self.state = ConnectionState.CONNECTED
                self.failure_count = 0
                self.retry_interval = self.base_interval  # Reset backoff
                logger.info(f"Successfully connected to service {self.service_name}")
            else:
                # warning should have been logged by the connect_to_service() callable.
                self.handle_connection_failure()

        except Exception as exc:
            logger.warning(f"Failed to connect to service {self.service_name}: {exc}")
            self.handle_connection_failure()

    def handle_connection_failure(self):
        """
        Register a failed connection attempt.

        Increments the failure count and either opens the circuit breaker (when the maximum
        number of failures is reached) or computes the next retry interval.
        """
        self.failure_count += 1

        # Open circuit breaker if too many failures
        if self.failure_count >= self.max_failures_before_circuit_break:
            self.state = ConnectionState.CIRCUIT_OPEN
            self.circuit_opened_at = time.monotonic()
            logger.warning(
                f"Circuit breaker opened for service {self.service_name} after {self.failure_count} failures"
            )
        else:
            self.state = ConnectionState.DISCONNECTED
            # NOTE(review): failure_count is already incremented, so the first computed interval
            # uses attempt_number=1 (2 x base_interval before jitter for exponential backoff),
            # while calculate_retry_interval() documents attempts as starting from 0. Confirm intent.
            self.retry_interval = calculate_retry_interval(
                self.failure_count,
                self.base_interval,
                self.max_retry_interval,
                self.backoff_strategy,
                self.jitter_strategy,
            )
            logger.debug(f"retry_interval={self.retry_interval}")

    def is_connected(self) -> bool:
        """Return True when the state is CONNECTED. This does not run a health check."""
        if VERBOSE_DEBUG:
            logger.debug(f"Checking if {self.service_name} is connected: {self.state.name}")
        return self.state == ConnectionState.CONNECTED

    def get_connection(self) -> Any:
        """
        Optional method to return the underlying connection object.
        Subclasses should override this method to return the actual connection
        (e.g., socket, transport) if needed.
        """
        logger.warning(f"The get_connection() method is not implemented for {self.service_name}, returning None.")
        return None
|
|
366
|
+
|
|
367
|
+
|
|
368
|
+
class ServiceConnector:
    """
    Synchronous base class for robust service connection management with retry, backoff, and circuit breaker logic.

    This class is intended to be subclassed for managing persistent connections to external services
    (such as devices, databases, or remote APIs) that may be unreliable or temporarily unavailable.

    Features:
        - Automatic retry with configurable backoff and jitter strategies.
        - Circuit breaker to prevent repeated connection attempts after multiple failures.
        - Connection state tracking (disconnected, connecting, connected, circuit open).
        - Thread-safe operation using a lock for all state changes.

    Usage:
        1. Subclass `ServiceConnector` and override the `connect_to_service()` method with your
           actual connection logic. Override `health_check()` for custom health verification: the default
           implementation returns False, which makes `attempt_connection()` disconnect and reconnect a
           connector that is in the CONNECTED state.
        2. Store the actual connection object (e.g., socket, transport) as an instance attribute in your subclass.
        3. Use `attempt_connection()` to initiate connection attempts; it will handle retries and backoff automatically.
        4. Use `is_connected()` to check connection status.

    Example:
        class MyConnector(ServiceConnector):
            def connect_to_service(self):
                self.connection = create_socket()
                return self.connection is not None

            def get_connection(self):
                return self.connection

    Note:
        The base class does not manage or expose the underlying connection object.
        Your subclass should provide a method or property to access it as needed.
    """

    def __init__(
        self,
        service_name: str,
        backoff_strategy: BackoffStrategy = BackoffStrategy.EXPONENTIAL,
        jitter_strategy: JitterStrategy = JitterStrategy.EQUAL,
    ):
        """
        Args:
            service_name: Name of the service, used in log messages.
            backoff_strategy: How the retry interval grows after each failure.
            jitter_strategy: How the retry interval is randomized.
        """
        self.state = ConnectionState.DISCONNECTED
        self.last_attempt = 0  # time.monotonic() timestamp of the last connection attempt
        self.base_interval = 1
        self.retry_interval = self.base_interval
        self.max_retry_interval = 300
        self.failure_count = 0
        self.max_failures_before_circuit_break = 5
        self.circuit_break_duration = 60
        self.circuit_opened_at = None  # time.monotonic() timestamp, set when the circuit opens
        self.service_name = service_name
        self.backoff_strategy = backoff_strategy
        self.jitter_strategy = jitter_strategy

        # Re-entrant, because methods that hold the lock call other methods that take it again.
        self._lock = threading.RLock()

    def connect_to_service(self) -> bool:
        """
        Establish the connection to the service. Subclasses must override this method.

        Returns:
            True when the connection succeeded, False otherwise. The default implementation
            logs a warning and always returns False.
        """
        logger.warning(
            f"The connect_to_service() method is not implemented for {self.service_name}, connection will always fail."
        )
        return False

    def disconnect_from_service(self) -> None:
        """
        Optional hook to cleanly disconnect / release resources for the service. The default implementation
        only sets the state to DISCONNECTED.
        Subclasses should override and must be careful about thread-safety; the base class holds _lock which can be
        used.
        """
        with self._lock:
            logger.debug(f"{self.service_name}: default disconnect_from_service(): no-op")
            self.state = ConnectionState.DISCONNECTED
        return

    def health_check(self) -> bool:
        """
        Verify that an established connection is still usable. Subclasses should override this method.

        Returns:
            True when the connection is healthy. The default implementation logs a warning
            and always returns False.
        """
        logger.warning(
            f"The health_check() method is not implemented for {self.service_name}, check will always return false."
        )
        return False

    def should_attempt_connection(self) -> bool:
        """Return True if we should attempt a new connection."""
        now = time.monotonic()
        with self._lock:
            # If the circuit is open, check if it has been open long enough to close it.
            if self.state == ConnectionState.CIRCUIT_OPEN:
                assert self.circuit_opened_at is not None
                if now - self.circuit_opened_at > self.circuit_break_duration:
                    self.state = ConnectionState.DISCONNECTED
                    self.failure_count = 0
                    # Restart the backoff from the configured base interval.
                    self.retry_interval = self.base_interval
                    return True
                return False
            # Regular backoff logic
            return now - self.last_attempt >= self.retry_interval

    def attempt_connection(self):
        """Try to connect to the service.

        This will execute the `connect_to_service()` that was overridden by the subclass.
        That function shall return True when the connection succeeded, False otherwise.

        When the state is CONNECTED, the connection is first validated with `health_check()`.
        A healthy connection is left untouched; an unhealthy one is disconnected and, if the
        backoff allows it, re-established. The lock is not held while the subclass hooks
        (`health_check()`, `disconnect_from_service()`, `connect_to_service()`) are running.
        """
        with self._lock:
            current_state = self.state

        if current_state == ConnectionState.CONNECTED:
            # ensure the CONNECTED state is validated before skipping reconnection attempts
            try:
                healthy = self.health_check()
            except Exception as exc:
                logger.debug(f"health_check raised: {type_name(exc)} – {exc}")
                healthy = False

            if healthy:
                logger.debug(f"{self.service_name} already connected and healthy")
                return

            logger.info(
                f"{self.service_name} marked CONNECTED but health_check failed — disconnecting and reconnecting"
            )
            # Set the state here as well, an overridden disconnect hook might not do it or might raise.
            with self._lock:
                self.state = ConnectionState.DISCONNECTED
            try:
                # ensure the state is updated by disconnect hook (disconnect_from_service should set DISCONNECTED)
                self.disconnect_from_service()
            except Exception as exc:
                # Best effort: a failing disconnect must not prevent the reconnection attempt.
                if VERBOSE_DEBUG:
                    logger.debug(f"Couldn't disconnect from {self.service_name}: {type_name(exc)} – {exc}")

        with self._lock:
            if not self.should_attempt_connection():
                return
            self.state = ConnectionState.CONNECTING
            self.last_attempt = time.monotonic()

        try:
            success = self.connect_to_service()
            with self._lock:
                if success:
                    self.state = ConnectionState.CONNECTED
                    self.failure_count = 0
                    self.retry_interval = self.base_interval  # Reset backoff
                    logger.debug(f"Successfully connected to service {self.service_name}")
                else:
                    self.handle_connection_failure()
        except Exception as exc:
            logger.error(f"Failed to connect to service {self.service_name}: {exc}")
            with self._lock:
                self.handle_connection_failure()

    def handle_connection_failure(self):
        """
        Register a failed connection attempt.

        Increments the failure count and either opens the circuit breaker (when the maximum
        number of failures is reached) or computes the next retry interval.
        """
        # Take the (re-entrant) lock here too, so the state stays consistent when this
        # method is called directly instead of from attempt_connection().
        with self._lock:
            self.failure_count += 1
            if self.failure_count >= self.max_failures_before_circuit_break:
                self.state = ConnectionState.CIRCUIT_OPEN
                self.circuit_opened_at = time.monotonic()
                logger.warning(
                    f"Circuit breaker opened for service {self.service_name} after {self.failure_count} failures"
                )
            else:
                self.state = ConnectionState.DISCONNECTED
                # NOTE(review): failure_count is already incremented, so the first computed interval
                # uses attempt_number=1 (2 x base_interval before jitter for exponential backoff),
                # while calculate_retry_interval() documents attempts as starting from 0. Confirm intent.
                self.retry_interval = calculate_retry_interval(
                    self.failure_count,
                    self.base_interval,
                    self.max_retry_interval,
                    self.backoff_strategy,
                    self.jitter_strategy,
                )
                logger.debug(f"retry_interval={self.retry_interval}")

    def is_connected(self) -> bool:
        """Return True when the state is CONNECTED. This does not run a health check."""
        with self._lock:
            return self.state == ConnectionState.CONNECTED
|
|
@@ -558,7 +558,8 @@ class ControlServer(metaclass=abc.ABCMeta):
|
|
|
558
558
|
try:
|
|
559
559
|
hk_dict = save_average_execution_time(self.device_protocol.get_housekeeping)
|
|
560
560
|
|
|
561
|
-
|
|
561
|
+
if storage_manager:
|
|
562
|
+
self.store_housekeeping_information(hk_dict)
|
|
562
563
|
self.propagate_metrics(hk_dict)
|
|
563
564
|
except Exception as exc:
|
|
564
565
|
logger.error(
|
|
@@ -18,9 +18,9 @@ and stop the server with:
|
|
|
18
18
|
|
|
19
19
|
Commands that can be used with the proxy:
|
|
20
20
|
|
|
21
|
-
* info
|
|
22
|
-
* get_value
|
|
23
|
-
* division
|
|
21
|
+
* info - returns an info message from the dummy device, e.g. "Dummy Device <__version__>"
|
|
22
|
+
* get_value - returns a random float between 0.0 and 1.0
|
|
23
|
+
* division - returns the result of the division between arguments 'a' and 'b'.
|
|
24
24
|
This can be used also to induce a ZeroDivisionError that should return a Failure
|
|
25
25
|
object.
|
|
26
26
|
|
|
@@ -35,6 +35,7 @@ and stopped with:
|
|
|
35
35
|
|
|
36
36
|
from __future__ import annotations
|
|
37
37
|
|
|
38
|
+
import contextlib
|
|
38
39
|
import multiprocessing
|
|
39
40
|
import random
|
|
40
41
|
import select
|
|
@@ -52,12 +53,14 @@ from egse.device import DeviceConnectionError
|
|
|
52
53
|
from egse.device import DeviceConnectionInterface
|
|
53
54
|
from egse.device import DeviceTimeoutError
|
|
54
55
|
from egse.device import DeviceTransport
|
|
56
|
+
from egse.env import bool_env
|
|
55
57
|
from egse.log import logger
|
|
56
58
|
from egse.protocol import CommandProtocol
|
|
57
59
|
from egse.proxy import Proxy
|
|
58
60
|
from egse.system import SignalCatcher
|
|
59
61
|
from egse.system import attrdict
|
|
60
62
|
from egse.system import format_datetime
|
|
63
|
+
from egse.system import type_name
|
|
61
64
|
from egse.zmq_ser import bind_address
|
|
62
65
|
from egse.zmq_ser import connect_address
|
|
63
66
|
|
|
@@ -77,6 +80,9 @@ WRITE_TIMEOUT = 1.0
|
|
|
77
80
|
CONNECT_TIMEOUT = 3.0
|
|
78
81
|
"""The maximum time in seconds to wait for establishing a socket connect."""
|
|
79
82
|
|
|
83
|
+
|
|
84
|
+
VERBOSE_DEBUG = bool_env("VERBOSE_DEBUG", default=False)
|
|
85
|
+
|
|
80
86
|
# Especially DummyCommand and DummyController need to be defined in a known module
|
|
81
87
|
# because those objects are pickled and when de-pickled at the clients side the class
|
|
82
88
|
# definition must be known.
|
|
@@ -116,14 +122,17 @@ def is_dummy_cs_active() -> bool:
|
|
|
116
122
|
|
|
117
123
|
|
|
118
124
|
def is_dummy_dev_active() -> bool:
    """
    Check if the dummy device answers a ping.

    Connects to the dummy device at DEV_HOST / DEV_PORT, sends a 'ping' command and
    disconnects again.

    Returns:
        True when the device responded with 'pong', False when the response was different
        or when a connection error, connection reset or timeout occurred.
    """
    if VERBOSE_DEBUG:
        logger.debug("Checking if dummy device is active...")
    try:
        dev = DummyDeviceEthernetInterface(DEV_HOST, DEV_PORT)
        dev.connect()
        try:
            rc = dev.trans("ping\n")
        finally:
            # Always release the connection, also when the ping times out or the
            # connection is reset, otherwise the socket stays open.
            dev.disconnect()
        return rc.decode().strip() == "pong"
    except (DeviceConnectionError, ConnectionResetError, DeviceTimeoutError) as exc:
        if VERBOSE_DEBUG:
            logger.debug(f"Caught {type_name(exc)}: {exc} - returning False")
        return False
|
|
128
137
|
|
|
129
138
|
|
|
@@ -309,11 +318,10 @@ class DummyDeviceEthernetInterface(DeviceConnectionInterface, DeviceTransport):
|
|
|
309
318
|
Args:
|
|
310
319
|
hostname (str): the IP address or fully qualified hostname of the Dummy Device
|
|
311
320
|
controller.
|
|
312
|
-
|
|
313
321
|
port (int): the IP port number to connect to.
|
|
314
322
|
"""
|
|
315
323
|
|
|
316
|
-
def __init__(self, hostname: str = None, port: int = None):
|
|
324
|
+
def __init__(self, hostname: str | None = None, port: int | None = None):
|
|
317
325
|
super().__init__()
|
|
318
326
|
|
|
319
327
|
# Basic connection settings, loaded from the configuration YAML file
|
|
@@ -350,7 +358,8 @@ class DummyDeviceEthernetInterface(DeviceConnectionInterface, DeviceTransport):
|
|
|
350
358
|
try:
|
|
351
359
|
self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
|
|
352
360
|
except socket.error as exc:
|
|
353
|
-
self.sock
|
|
361
|
+
if self.sock is not None:
|
|
362
|
+
self.sock.close()
|
|
354
363
|
raise DeviceConnectionError("Dummy Device", "Failed to create socket.") from exc
|
|
355
364
|
|
|
356
365
|
# Attempt to establish a connection to the remote host
|
|
@@ -403,7 +412,8 @@ class DummyDeviceEthernetInterface(DeviceConnectionInterface, DeviceTransport):
|
|
|
403
412
|
"""
|
|
404
413
|
try:
|
|
405
414
|
logger.debug(f"Disconnecting from {self.hostname}")
|
|
406
|
-
self.sock
|
|
415
|
+
if self.sock is not None:
|
|
416
|
+
self.sock.close()
|
|
407
417
|
self.is_connection_open = False
|
|
408
418
|
except Exception as exc:
|
|
409
419
|
raise DeviceConnectionError(DEV_NAME, f"Could not close socket to {self.hostname}") from exc
|
|
@@ -439,6 +449,8 @@ class DummyDeviceEthernetInterface(DeviceConnectionInterface, DeviceTransport):
|
|
|
439
449
|
buf_size = 1024 * 10
|
|
440
450
|
response = bytes()
|
|
441
451
|
|
|
452
|
+
assert self.sock is not None
|
|
453
|
+
|
|
442
454
|
# Set a timeout of READ_TIMEOUT to the socket.recv
|
|
443
455
|
|
|
444
456
|
saved_timeout = self.sock.gettimeout()
|
|
@@ -478,6 +490,8 @@ class DummyDeviceEthernetInterface(DeviceConnectionInterface, DeviceTransport):
|
|
|
478
490
|
there was a socket related error.
|
|
479
491
|
"""
|
|
480
492
|
|
|
493
|
+
assert self.sock is not None
|
|
494
|
+
|
|
481
495
|
# logger.debug(f"{command.encode() = }")
|
|
482
496
|
|
|
483
497
|
try:
|
|
@@ -505,6 +519,9 @@ class DummyDeviceEthernetInterface(DeviceConnectionInterface, DeviceTransport):
|
|
|
505
519
|
DeviceTimeoutError: when the sendall() timed out, and a DeviceConnectionError if
|
|
506
520
|
there was a socket related error.
|
|
507
521
|
"""
|
|
522
|
+
|
|
523
|
+
assert self.sock is not None
|
|
524
|
+
|
|
508
525
|
# logger.debug(f"{command.encode() = }")
|
|
509
526
|
|
|
510
527
|
try:
|
|
@@ -580,11 +597,18 @@ def start_dev():
|
|
|
580
597
|
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
|
|
581
598
|
s.bind((DEV_HOST, DEV_PORT))
|
|
582
599
|
s.listen()
|
|
600
|
+
s.settimeout(CONNECT_TIMEOUT)
|
|
583
601
|
logger.info(f"Ready to accept connection on {DEV_HOST}:{DEV_PORT}...")
|
|
584
|
-
|
|
602
|
+
while True:
|
|
603
|
+
with contextlib.suppress(socket.timeout):
|
|
604
|
+
conn, addr = s.accept()
|
|
605
|
+
break
|
|
606
|
+
if killer.term_signal_received:
|
|
607
|
+
return
|
|
585
608
|
with conn:
|
|
586
609
|
logger.info(f"Accepted connection from {addr}")
|
|
587
610
|
conn.sendall(f"Dummy Device {__version__}".encode())
|
|
611
|
+
conn.settimeout(READ_TIMEOUT)
|
|
588
612
|
try:
|
|
589
613
|
while True:
|
|
590
614
|
error_msg = ""
|
|
@@ -641,7 +665,8 @@ def process_command(command_string: str) -> str | None:
|
|
|
641
665
|
|
|
642
666
|
try:
|
|
643
667
|
action, response = COMMAND_ACTIONS_RESPONSES[command_string]
|
|
644
|
-
|
|
668
|
+
if action:
|
|
669
|
+
action()
|
|
645
670
|
if error_msg:
|
|
646
671
|
return error_msg
|
|
647
672
|
else:
|
|
@@ -57,7 +57,7 @@ COMMANDER_PORT = settings.get("COMMANDER_PORT", 0) # dynamically assigned by th
|
|
|
57
57
|
_initialised = False # will be set to True in the setup_logging() function
|
|
58
58
|
|
|
59
59
|
|
|
60
|
-
def get_log_file_name():
|
|
60
|
+
def get_log_file_name() -> str:
|
|
61
61
|
"""
|
|
62
62
|
Returns the filename of the log file as defined in the Settings or return the default name 'general.log'.
|
|
63
63
|
"""
|
|
@@ -315,7 +315,7 @@ def send_request(command_request: str):
|
|
|
315
315
|
"""Sends a request to the Logger Control Server and waits for a response."""
|
|
316
316
|
|
|
317
317
|
if COMMANDER_PORT == 0:
|
|
318
|
-
endpoint = get_endpoint_from_registry()
|
|
318
|
+
endpoint = get_endpoint_from_registry(SERVICE_TYPE)
|
|
319
319
|
else:
|
|
320
320
|
endpoint = f"{PROTOCOL}://{HOSTNAME}:{COMMANDER_PORT}"
|
|
321
321
|
|
|
@@ -23,10 +23,12 @@ from egse.notifyhub import SERVICE_TYPE
|
|
|
23
23
|
from egse.notifyhub import STATS_INTERVAL
|
|
24
24
|
from egse.notifyhub.client import AsyncNotificationHubClient
|
|
25
25
|
from egse.registry import MessageType
|
|
26
|
-
from egse.registry.client import
|
|
26
|
+
from egse.registry.client import REQUEST_TIMEOUT
|
|
27
|
+
from egse.registry.client import AsyncRegistryClient
|
|
27
28
|
from egse.system import TyperAsyncCommand
|
|
28
29
|
from egse.system import get_host_ip
|
|
29
30
|
from egse.zmq_ser import get_port_number
|
|
31
|
+
|
|
30
32
|
from .event import NotificationEvent
|
|
31
33
|
|
|
32
34
|
REQUEST_POLL_TIMEOUT = 1.0
|
|
@@ -154,7 +154,7 @@ def stop():
|
|
|
154
154
|
|
|
155
155
|
if COMMANDING_PORT == 0:
|
|
156
156
|
with RegistryClient() as reg:
|
|
157
|
-
service = reg.discover_service(
|
|
157
|
+
service = reg.discover_service(SERVICE_TYPE)
|
|
158
158
|
rich.print("service = ", service)
|
|
159
159
|
if service:
|
|
160
160
|
hostname = service["host"]
|
|
@@ -44,6 +44,9 @@ from egse.zmq_ser import set_address_port
|
|
|
44
44
|
|
|
45
45
|
MAX_SLEEP = 10
|
|
46
46
|
|
|
47
|
+
DEVICE_CMD_ENTRY_POINT = "cgse.service.device_command"
|
|
48
|
+
GUI_SCRIPTS_ENTRY_POINT = "gui_scripts"
|
|
49
|
+
|
|
47
50
|
|
|
48
51
|
class ControlServerStatus(Enum):
|
|
49
52
|
"""Status of the Control Server of a device."""
|
|
@@ -78,7 +81,7 @@ def get_cgse_cmd(device_proxy: str) -> str:
|
|
|
78
81
|
|
|
79
82
|
module_name = device_proxy[7:].rsplit(".", 1)[0]
|
|
80
83
|
entry_point_values = []
|
|
81
|
-
for ep in sorted(entry_points(
|
|
84
|
+
for ep in sorted(entry_points(DEVICE_CMD_ENTRY_POINT), key=lambda x: x.name):
|
|
82
85
|
entry_point_values.append(ep.value)
|
|
83
86
|
|
|
84
87
|
similarity_scores = [
|
|
@@ -108,7 +111,7 @@ def get_cgse_ui(device_proxy: str) -> Union[str, None]:
|
|
|
108
111
|
|
|
109
112
|
module_name = device_proxy[7:].rsplit(".", 1)[0]
|
|
110
113
|
entry_point_values = []
|
|
111
|
-
for ep in sorted(entry_points(
|
|
114
|
+
for ep in sorted(entry_points(GUI_SCRIPTS_ENTRY_POINT), key=lambda x: x.name):
|
|
112
115
|
entry_point_values.append(ep.name)
|
|
113
116
|
|
|
114
117
|
similarity_scores = [
|
|
@@ -639,6 +639,8 @@ class AsyncRegistryClient:
|
|
|
639
639
|
The response from the registry as a dictionary.
|
|
640
640
|
"""
|
|
641
641
|
|
|
642
|
+
assert self.req_socket is not None, "REQ socket is not connected, cannot send request."
|
|
643
|
+
|
|
642
644
|
timeout = timeout or self.timeout
|
|
643
645
|
try:
|
|
644
646
|
self.logger.debug(f"Sending request: {request}")
|
|
@@ -687,6 +689,8 @@ class AsyncRegistryClient:
|
|
|
687
689
|
The response from the registry as a dictionary.
|
|
688
690
|
"""
|
|
689
691
|
|
|
692
|
+
assert self.hb_socket is not None, "HB socket is not connected, cannot send heartbeat request."
|
|
693
|
+
|
|
690
694
|
try:
|
|
691
695
|
self.logger.debug(f"Sending heartbeat request: {request}")
|
|
692
696
|
await self.hb_socket.send_string(json.dumps(request))
|
|
@@ -867,7 +871,8 @@ class AsyncRegistryClient:
|
|
|
867
871
|
await self.reregister()
|
|
868
872
|
|
|
869
873
|
else:
|
|
870
|
-
VERBOSE_DEBUG
|
|
874
|
+
if VERBOSE_DEBUG:
|
|
875
|
+
self.logger.debug(f"Heartbeat succeeded: {response.get('message')}")
|
|
871
876
|
|
|
872
877
|
except Exception as exc:
|
|
873
878
|
self.logger.error(f"Error in heartbeat loop: {exc}", exc_info=True)
|
|
@@ -893,13 +898,15 @@ class AsyncRegistryClient:
|
|
|
893
898
|
"""Stop the running heartbeat task."""
|
|
894
899
|
|
|
895
900
|
if self._heartbeat_task is None:
|
|
896
|
-
VERBOSE_DEBUG
|
|
901
|
+
if VERBOSE_DEBUG:
|
|
902
|
+
self.logger.debug("Couldn't stop heartbeat, heartbeat_task is None")
|
|
897
903
|
return
|
|
898
904
|
|
|
899
905
|
self._heartbeat_task.cancel()
|
|
900
906
|
try:
|
|
901
907
|
await self._heartbeat_task
|
|
902
908
|
except asyncio.CancelledError:
|
|
909
|
+
self.logger.info("Heartbeat task cancelled")
|
|
903
910
|
pass
|
|
904
911
|
self._tasks.discard(self._heartbeat_task)
|
|
905
912
|
self._heartbeat_task = None
|
|
@@ -909,7 +916,8 @@ class AsyncRegistryClient:
|
|
|
909
916
|
"""Stop the running event listener task."""
|
|
910
917
|
|
|
911
918
|
if self._event_listener_task is None:
|
|
912
|
-
VERBOSE_DEBUG
|
|
919
|
+
if VERBOSE_DEBUG:
|
|
920
|
+
self.logger.debug("Couldn't stop event_listener, event_listener_task is None")
|
|
913
921
|
return
|
|
914
922
|
|
|
915
923
|
self._event_listener_task.cancel()
|
|
@@ -1142,12 +1150,14 @@ class AsyncRegistryClient:
|
|
|
1142
1150
|
self.sub_socket.close()
|
|
1143
1151
|
|
|
1144
1152
|
# We can not terminate the context, because we use a global instance, i.e. a singleton context.
|
|
1145
|
-
# When we try to terminate it, even after checking if it was closed,
|
|
1153
|
+
# When we try to terminate it, even after checking if it was closed, it raises an exception.
|
|
1146
1154
|
if hasattr(self, "context") and self.context:
|
|
1147
1155
|
self.logger.info(f"{self.context = !r}")
|
|
1148
1156
|
self.logger.info(f"{self.context._sockets = !r}")
|
|
1149
|
-
|
|
1150
|
-
|
|
1157
|
+
# The zmq context instance is the global singleton instance.
|
|
1158
|
+
# Terminating it here would affect other parts of the application using zmq.
|
|
1159
|
+
# if not self.context.closed:
|
|
1160
|
+
# self.context.term()
|
|
1151
1161
|
except Exception as exc:
|
|
1152
1162
|
self.logger.error(f"Error during cleanup: {exc}")
|
|
1153
1163
|
|
|
@@ -218,13 +218,15 @@ class AsyncRegistryServer:
|
|
|
218
218
|
"""Task that handles incoming requests."""
|
|
219
219
|
self.logger.info("Started request handler task")
|
|
220
220
|
|
|
221
|
+
assert self.req_socket is not None, "REQ socket is not connected, cannot handle requests."
|
|
222
|
+
|
|
221
223
|
try:
|
|
222
224
|
message_parts = None
|
|
223
225
|
while self._running:
|
|
224
226
|
try:
|
|
225
227
|
# Wait for a request with timeout to allow checking if still running
|
|
226
228
|
try:
|
|
227
|
-
# self.logger.
|
|
229
|
+
# self.logger.debug("Waiting for a request with 1s timeout...")
|
|
228
230
|
message_parts = await asyncio.wait_for(self.req_socket.recv_multipart(), timeout=1.0)
|
|
229
231
|
except asyncio.TimeoutError:
|
|
230
232
|
# self.logger.debug("waiting for command request...")
|
|
@@ -241,6 +243,9 @@ class AsyncRegistryServer:
|
|
|
241
243
|
response = await self._process_request(message_data)
|
|
242
244
|
|
|
243
245
|
await self._send_response(client_id, message_type, response)
|
|
246
|
+
else:
|
|
247
|
+
self.logger.warning("Request handler: message corrupted, check debug messages.")
|
|
248
|
+
self.logger.debug(f"{message_parts=}")
|
|
244
249
|
|
|
245
250
|
except zmq.ZMQError as exc:
|
|
246
251
|
self.logger.error(f"ZMQ error: {exc}", exc_info=True)
|
|
@@ -397,6 +402,8 @@ class AsyncRegistryServer:
|
|
|
397
402
|
"""Task that handles heartbeat messages."""
|
|
398
403
|
self.logger.info("Started heartbeats handler task")
|
|
399
404
|
|
|
405
|
+
assert self.hb_socket is not None, "HB socket is not connected, cannot handle heartbeat messages."
|
|
406
|
+
|
|
400
407
|
try:
|
|
401
408
|
message_parts = None
|
|
402
409
|
while self._running:
|
|
@@ -425,7 +432,8 @@ class AsyncRegistryServer:
|
|
|
425
432
|
self.logger.warning("Heartbeat request: message corrupted, check debug messages.")
|
|
426
433
|
|
|
427
434
|
except asyncio.TimeoutError:
|
|
428
|
-
VERBOSE_DEBUG
|
|
435
|
+
if VERBOSE_DEBUG:
|
|
436
|
+
self.logger.debug("waiting for heartbeat...")
|
|
429
437
|
continue
|
|
430
438
|
|
|
431
439
|
except Exception as exc:
|
|
@@ -2,7 +2,6 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
import asyncio
|
|
4
4
|
import json
|
|
5
|
-
import logging
|
|
6
5
|
import time
|
|
7
6
|
from typing import Any
|
|
8
7
|
from typing import Callable
|
|
@@ -10,13 +9,14 @@ from typing import Callable
|
|
|
10
9
|
import zmq
|
|
11
10
|
import zmq.asyncio
|
|
12
11
|
|
|
12
|
+
from egse.log import logging
|
|
13
13
|
from egse.registry import DEFAULT_RS_PUB_PORT
|
|
14
14
|
from egse.registry import DEFAULT_RS_REQ_PORT
|
|
15
15
|
from egse.registry.client import AsyncRegistryClient
|
|
16
16
|
from egse.system import get_host_ip
|
|
17
17
|
from egse.zmq_ser import get_port_number
|
|
18
18
|
|
|
19
|
-
module_module_logger_name = "async_microservice"
|
|
19
|
+
module_module_logger_name = "egse.async_microservice"
|
|
20
20
|
module_logger = logging.getLogger(module_module_logger_name)
|
|
21
21
|
|
|
22
22
|
|
|
@@ -64,7 +64,7 @@ class ZMQMicroservice:
|
|
|
64
64
|
self.registry_sub_endpoint = registry_sub_endpoint or f"tcp://localhost:{DEFAULT_RS_PUB_PORT}"
|
|
65
65
|
self.metadata = metadata or {}
|
|
66
66
|
|
|
67
|
-
self.host_ip = get_host_ip()
|
|
67
|
+
self.host_ip = get_host_ip() or "localhost"
|
|
68
68
|
|
|
69
69
|
# Service ID will be set when registered
|
|
70
70
|
self.service_id = None
|
|
@@ -164,6 +164,7 @@ class ZMQMicroservice:
|
|
|
164
164
|
|
|
165
165
|
if not self.service_id:
|
|
166
166
|
module_logger.error("Failed to register with the service registry")
|
|
167
|
+
await self._cleanup()
|
|
167
168
|
return True
|
|
168
169
|
|
|
169
170
|
module_logger.info(f"Registered with service ID: {self.service_id}")
|
|
@@ -175,12 +176,17 @@ class ZMQMicroservice:
|
|
|
175
176
|
# Start request handler
|
|
176
177
|
request_task = asyncio.create_task(self._handle_requests())
|
|
177
178
|
self._tasks.add(request_task)
|
|
178
|
-
request_task.add_done_callback(self._tasks.discard)
|
|
179
|
+
# request_task.add_done_callback(self._tasks.discard)
|
|
179
180
|
|
|
180
181
|
# Wait for shutdown signal
|
|
181
182
|
await self._shutdown.wait()
|
|
182
183
|
|
|
183
|
-
#
|
|
184
|
+
# request_task.cancel()
|
|
185
|
+
# try:
|
|
186
|
+
# await request_task
|
|
187
|
+
# except asyncio.CancelledError:
|
|
188
|
+
# module_logger.info("Request handler task cancelled during shutdown")
|
|
189
|
+
|
|
184
190
|
await self._cleanup()
|
|
185
191
|
|
|
186
192
|
return False
|
|
@@ -1,55 +0,0 @@
|
|
|
1
|
-
from egse.env import bool_env
|
|
2
|
-
from egse.log import logging
|
|
3
|
-
from egse.zmq_ser import connect_address
|
|
4
|
-
|
|
5
|
-
logger = logging.getLogger("egse.connect")
|
|
6
|
-
|
|
7
|
-
# random.seed(time.monotonic()) # uncomment for testing only, main application should set a seed.
|
|
8
|
-
|
|
9
|
-
VERBOSE_DEBUG = bool_env("VERBOSE_DEBUG")
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
def get_endpoint(
|
|
13
|
-
service_type: str,
|
|
14
|
-
protocol: str = "tcp",
|
|
15
|
-
hostname: str = "localhost",
|
|
16
|
-
port: int = 0,
|
|
17
|
-
):
|
|
18
|
-
"""
|
|
19
|
-
Returns the endpoint for a service, either from the registry or by constructing
|
|
20
|
-
it from protocol, hostname and port.
|
|
21
|
-
|
|
22
|
-
If port is 0 (the default), attempt to retrieve the endpoint from the service registry.
|
|
23
|
-
|
|
24
|
-
Args:
|
|
25
|
-
service_type: The service type to look up in the registry.
|
|
26
|
-
protocol: Protocol to use if constructing the endpoint, defaults to tcp.
|
|
27
|
-
hostname: Hostname to use if constructing the endpoint, defaults to localhost.
|
|
28
|
-
port: Port to use if constructing the endpoint, defaults to 0.
|
|
29
|
-
|
|
30
|
-
Returns:
|
|
31
|
-
The endpoint string.
|
|
32
|
-
|
|
33
|
-
Raises:
|
|
34
|
-
RuntimeError: If no endpoint can be determined.
|
|
35
|
-
"""
|
|
36
|
-
endpoint = None
|
|
37
|
-
from egse.registry.client import RegistryClient
|
|
38
|
-
|
|
39
|
-
if port == 0:
|
|
40
|
-
with RegistryClient() as reg:
|
|
41
|
-
endpoint = reg.get_endpoint(service_type)
|
|
42
|
-
if endpoint:
|
|
43
|
-
if VERBOSE_DEBUG:
|
|
44
|
-
logger.debug(f"Endpoint for {service_type} found in registry: {endpoint}")
|
|
45
|
-
else:
|
|
46
|
-
logger.warning(f"No endpoint for {service_type} found in registry.")
|
|
47
|
-
|
|
48
|
-
if not endpoint:
|
|
49
|
-
if port == 0:
|
|
50
|
-
raise RuntimeError(f"No service registered as {service_type} and no port provided.")
|
|
51
|
-
endpoint = connect_address(protocol, hostname, port)
|
|
52
|
-
if VERBOSE_DEBUG:
|
|
53
|
-
logger.debug(f"Endpoint constructed from protocol/hostname/port: {endpoint}")
|
|
54
|
-
|
|
55
|
-
return endpoint
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|