kstlib 0.0.1a0__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kstlib/__init__.py +266 -1
- kstlib/__main__.py +16 -0
- kstlib/alerts/__init__.py +110 -0
- kstlib/alerts/channels/__init__.py +36 -0
- kstlib/alerts/channels/base.py +197 -0
- kstlib/alerts/channels/email.py +227 -0
- kstlib/alerts/channels/slack.py +389 -0
- kstlib/alerts/exceptions.py +72 -0
- kstlib/alerts/manager.py +651 -0
- kstlib/alerts/models.py +142 -0
- kstlib/alerts/throttle.py +263 -0
- kstlib/auth/__init__.py +139 -0
- kstlib/auth/callback.py +399 -0
- kstlib/auth/config.py +502 -0
- kstlib/auth/errors.py +127 -0
- kstlib/auth/models.py +316 -0
- kstlib/auth/providers/__init__.py +14 -0
- kstlib/auth/providers/base.py +393 -0
- kstlib/auth/providers/oauth2.py +645 -0
- kstlib/auth/providers/oidc.py +821 -0
- kstlib/auth/session.py +338 -0
- kstlib/auth/token.py +482 -0
- kstlib/cache/__init__.py +50 -0
- kstlib/cache/decorator.py +261 -0
- kstlib/cache/strategies.py +516 -0
- kstlib/cli/__init__.py +8 -0
- kstlib/cli/app.py +195 -0
- kstlib/cli/commands/__init__.py +5 -0
- kstlib/cli/commands/auth/__init__.py +39 -0
- kstlib/cli/commands/auth/common.py +122 -0
- kstlib/cli/commands/auth/login.py +325 -0
- kstlib/cli/commands/auth/logout.py +74 -0
- kstlib/cli/commands/auth/providers.py +57 -0
- kstlib/cli/commands/auth/status.py +291 -0
- kstlib/cli/commands/auth/token.py +199 -0
- kstlib/cli/commands/auth/whoami.py +106 -0
- kstlib/cli/commands/config.py +89 -0
- kstlib/cli/commands/ops/__init__.py +39 -0
- kstlib/cli/commands/ops/attach.py +49 -0
- kstlib/cli/commands/ops/common.py +269 -0
- kstlib/cli/commands/ops/list_sessions.py +252 -0
- kstlib/cli/commands/ops/logs.py +49 -0
- kstlib/cli/commands/ops/start.py +98 -0
- kstlib/cli/commands/ops/status.py +138 -0
- kstlib/cli/commands/ops/stop.py +60 -0
- kstlib/cli/commands/rapi/__init__.py +60 -0
- kstlib/cli/commands/rapi/call.py +341 -0
- kstlib/cli/commands/rapi/list.py +99 -0
- kstlib/cli/commands/rapi/show.py +206 -0
- kstlib/cli/commands/secrets/__init__.py +35 -0
- kstlib/cli/commands/secrets/common.py +425 -0
- kstlib/cli/commands/secrets/decrypt.py +88 -0
- kstlib/cli/commands/secrets/doctor.py +743 -0
- kstlib/cli/commands/secrets/encrypt.py +242 -0
- kstlib/cli/commands/secrets/shred.py +96 -0
- kstlib/cli/common.py +86 -0
- kstlib/config/__init__.py +76 -0
- kstlib/config/exceptions.py +110 -0
- kstlib/config/export.py +225 -0
- kstlib/config/loader.py +963 -0
- kstlib/config/sops.py +287 -0
- kstlib/db/__init__.py +54 -0
- kstlib/db/aiosqlcipher.py +137 -0
- kstlib/db/cipher.py +112 -0
- kstlib/db/database.py +367 -0
- kstlib/db/exceptions.py +25 -0
- kstlib/db/pool.py +302 -0
- kstlib/helpers/__init__.py +35 -0
- kstlib/helpers/exceptions.py +11 -0
- kstlib/helpers/time_trigger.py +396 -0
- kstlib/kstlib.conf.yml +890 -0
- kstlib/limits.py +963 -0
- kstlib/logging/__init__.py +108 -0
- kstlib/logging/manager.py +633 -0
- kstlib/mail/__init__.py +42 -0
- kstlib/mail/builder.py +626 -0
- kstlib/mail/exceptions.py +27 -0
- kstlib/mail/filesystem.py +248 -0
- kstlib/mail/transport.py +224 -0
- kstlib/mail/transports/__init__.py +19 -0
- kstlib/mail/transports/gmail.py +268 -0
- kstlib/mail/transports/resend.py +324 -0
- kstlib/mail/transports/smtp.py +326 -0
- kstlib/meta.py +72 -0
- kstlib/metrics/__init__.py +88 -0
- kstlib/metrics/decorators.py +1090 -0
- kstlib/metrics/exceptions.py +14 -0
- kstlib/monitoring/__init__.py +116 -0
- kstlib/monitoring/_styles.py +163 -0
- kstlib/monitoring/cell.py +57 -0
- kstlib/monitoring/config.py +424 -0
- kstlib/monitoring/delivery.py +579 -0
- kstlib/monitoring/exceptions.py +63 -0
- kstlib/monitoring/image.py +220 -0
- kstlib/monitoring/kv.py +79 -0
- kstlib/monitoring/list.py +69 -0
- kstlib/monitoring/metric.py +88 -0
- kstlib/monitoring/monitoring.py +341 -0
- kstlib/monitoring/renderer.py +139 -0
- kstlib/monitoring/service.py +392 -0
- kstlib/monitoring/table.py +129 -0
- kstlib/monitoring/types.py +56 -0
- kstlib/ops/__init__.py +86 -0
- kstlib/ops/base.py +148 -0
- kstlib/ops/container.py +577 -0
- kstlib/ops/exceptions.py +209 -0
- kstlib/ops/manager.py +407 -0
- kstlib/ops/models.py +176 -0
- kstlib/ops/tmux.py +372 -0
- kstlib/ops/validators.py +287 -0
- kstlib/py.typed +0 -0
- kstlib/rapi/__init__.py +118 -0
- kstlib/rapi/client.py +875 -0
- kstlib/rapi/config.py +861 -0
- kstlib/rapi/credentials.py +887 -0
- kstlib/rapi/exceptions.py +213 -0
- kstlib/resilience/__init__.py +101 -0
- kstlib/resilience/circuit_breaker.py +440 -0
- kstlib/resilience/exceptions.py +95 -0
- kstlib/resilience/heartbeat.py +491 -0
- kstlib/resilience/rate_limiter.py +506 -0
- kstlib/resilience/shutdown.py +417 -0
- kstlib/resilience/watchdog.py +637 -0
- kstlib/secrets/__init__.py +29 -0
- kstlib/secrets/exceptions.py +19 -0
- kstlib/secrets/models.py +62 -0
- kstlib/secrets/providers/__init__.py +79 -0
- kstlib/secrets/providers/base.py +58 -0
- kstlib/secrets/providers/environment.py +66 -0
- kstlib/secrets/providers/keyring.py +107 -0
- kstlib/secrets/providers/kms.py +223 -0
- kstlib/secrets/providers/kwargs.py +101 -0
- kstlib/secrets/providers/sops.py +209 -0
- kstlib/secrets/resolver.py +221 -0
- kstlib/secrets/sensitive.py +130 -0
- kstlib/secure/__init__.py +23 -0
- kstlib/secure/fs.py +194 -0
- kstlib/secure/permissions.py +70 -0
- kstlib/ssl.py +347 -0
- kstlib/ui/__init__.py +23 -0
- kstlib/ui/exceptions.py +26 -0
- kstlib/ui/panels.py +484 -0
- kstlib/ui/spinner.py +864 -0
- kstlib/ui/tables.py +382 -0
- kstlib/utils/__init__.py +48 -0
- kstlib/utils/dict.py +36 -0
- kstlib/utils/formatting.py +338 -0
- kstlib/utils/http_trace.py +237 -0
- kstlib/utils/lazy.py +49 -0
- kstlib/utils/secure_delete.py +205 -0
- kstlib/utils/serialization.py +247 -0
- kstlib/utils/text.py +56 -0
- kstlib/utils/validators.py +124 -0
- kstlib/websocket/__init__.py +97 -0
- kstlib/websocket/exceptions.py +214 -0
- kstlib/websocket/manager.py +1102 -0
- kstlib/websocket/models.py +361 -0
- kstlib-1.0.1.dist-info/METADATA +201 -0
- kstlib-1.0.1.dist-info/RECORD +163 -0
- {kstlib-0.0.1a0.dist-info → kstlib-1.0.1.dist-info}/WHEEL +1 -1
- kstlib-1.0.1.dist-info/entry_points.txt +2 -0
- kstlib-1.0.1.dist-info/licenses/LICENSE.md +9 -0
- kstlib-0.0.1a0.dist-info/METADATA +0 -29
- kstlib-0.0.1a0.dist-info/RECORD +0 -6
- kstlib-0.0.1a0.dist-info/licenses/LICENSE.md +0 -5
- {kstlib-0.0.1a0.dist-info → kstlib-1.0.1.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,637 @@
|
|
|
1
|
+
"""Watchdog for detecting thread/process freezes and hangs.
|
|
2
|
+
|
|
3
|
+
Provides configurable timeout monitoring for long-running operations,
|
|
4
|
+
with automatic callback invocation when activity stops.
|
|
5
|
+
|
|
6
|
+
Examples:
|
|
7
|
+
Basic usage with callback:
|
|
8
|
+
|
|
9
|
+
>>> def on_freeze(): # doctest: +SKIP
|
|
10
|
+
... print("Thread frozen!")
|
|
11
|
+
>>> watchdog = Watchdog(timeout=30, on_timeout=on_freeze) # doctest: +SKIP
|
|
12
|
+
>>> watchdog.start() # doctest: +SKIP
|
|
13
|
+
>>> while running: # doctest: +SKIP
|
|
14
|
+
... watchdog.ping() # Reset timer
|
|
15
|
+
... do_work()
|
|
16
|
+
>>> watchdog.stop() # doctest: +SKIP
|
|
17
|
+
|
|
18
|
+
As context manager:
|
|
19
|
+
|
|
20
|
+
>>> with Watchdog(timeout=30) as wd: # doctest: +SKIP
|
|
21
|
+
... for item in items:
|
|
22
|
+
... wd.ping()
|
|
23
|
+
... process(item)
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
from __future__ import annotations
|
|
27
|
+
|
|
28
|
+
import asyncio
|
|
29
|
+
import contextlib
|
|
30
|
+
import inspect
|
|
31
|
+
import json
|
|
32
|
+
import logging
|
|
33
|
+
import threading
|
|
34
|
+
import time
|
|
35
|
+
from collections.abc import Awaitable, Callable, Mapping
|
|
36
|
+
from dataclasses import dataclass
|
|
37
|
+
from datetime import datetime, timezone
|
|
38
|
+
from pathlib import Path
|
|
39
|
+
from typing import Any
|
|
40
|
+
|
|
41
|
+
from typing_extensions import Self
|
|
42
|
+
|
|
43
|
+
from kstlib.limits import (
|
|
44
|
+
DEFAULT_WATCHDOG_TIMEOUT,
|
|
45
|
+
HARD_MAX_WATCHDOG_TIMEOUT,
|
|
46
|
+
HARD_MIN_WATCHDOG_TIMEOUT,
|
|
47
|
+
clamp_with_limits,
|
|
48
|
+
get_resilience_limits,
|
|
49
|
+
)
|
|
50
|
+
from kstlib.resilience.exceptions import WatchdogTimeoutError
|
|
51
|
+
|
|
52
|
+
log = logging.getLogger(__name__)
|
|
53
|
+
|
|
54
|
+
# Type alias for alert callback
|
|
55
|
+
OnAlertCallback = Callable[[str, str, Mapping[str, Any]], Awaitable[None] | None]
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
@dataclass
class WatchdogStats:
    """Counters and timestamps collected while a watchdog is monitoring.

    All timestamps come from ``time.monotonic()`` and are therefore only
    meaningful relative to each other, not as wall-clock times.

    Attributes:
        pings_total: Total number of ping calls.
        timeouts_triggered: Number of timeout events detected.
        last_ping_time: Timestamp of last activity (monotonic).
        start_time: Timestamp when watchdog started (monotonic).

    Examples:
        >>> stats = WatchdogStats()
        >>> stats.record_ping()
        >>> stats.pings_total
        1
    """

    pings_total: int = 0
    timeouts_triggered: int = 0
    last_ping_time: float | None = None
    start_time: float | None = None

    def record_ping(self) -> None:
        """Count one ping and remember when it happened."""
        self.pings_total += 1
        self.last_ping_time = time.monotonic()

    def record_timeout(self) -> None:
        """Count one detected timeout."""
        self.timeouts_triggered += 1

    def record_start(self) -> None:
        """Record the start of monitoring.

        Starting also counts as the most recent activity, so both timestamps
        are set from a single clock reading and are exactly equal.
        """
        now = time.monotonic()
        self.start_time = now
        self.last_ping_time = now

    @property
    def uptime(self) -> float:
        """Return seconds elapsed since ``record_start``; 0.0 if never started."""
        if self.start_time is None:
            return 0.0
        return time.monotonic() - self.start_time
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
class Watchdog:
    """Monitor thread/process health and detect freezes or hangs.

    Implements a watchdog timer that must be periodically "pinged" to
    prevent timeout. If no ping is received within the timeout period,
    the on_timeout callback is invoked once; the watchdog stays in the
    triggered state until ``reset()`` is called or monitoring is restarted.

    A watchdog built with ``from_state_file`` ignores pings and instead
    checks the freshness of a heartbeat JSON file.

    Args:
        timeout: Seconds of inactivity before triggering timeout.
            If None, uses the configured resilience limit and falls back
            to ``DEFAULT_WATCHDOG_TIMEOUT`` when that cannot be loaded.
        on_timeout: Callback invoked when timeout is detected.
            Can be sync or async function.
        on_alert: Callback for alerting (channel, message, context).
            Only invoked for stale heartbeat state files.
        name: Optional identifier for logging and monitoring.

    Examples:
        Basic usage:

        >>> watchdog = Watchdog(timeout=30)
        >>> watchdog.timeout
        30

        With callback:

        >>> def alert():
        ...     print("Watchdog triggered!")
        >>> wd = Watchdog(timeout=10, on_timeout=alert, name="worker")
        >>> wd.name
        'worker'

        As context manager:

        >>> with Watchdog(timeout=30) as wd:  # doctest: +SKIP
        ...     wd.ping()
        ...     do_work()
    """

    def __init__(
        self,
        *,
        timeout: float | None = None,
        on_timeout: Callable[[], None] | Callable[[], Awaitable[None]] | None = None,
        on_alert: OnAlertCallback | None = None,
        name: str | None = None,
    ) -> None:
        """Initialize watchdog.

        Args:
            timeout: Seconds before timeout triggers. Clamped to
                [HARD_MIN_WATCHDOG_TIMEOUT, HARD_MAX_WATCHDOG_TIMEOUT].
            on_timeout: Callback for timeout events (sync or async).
            on_alert: Callback for alerting (channel, message, context).
            name: Optional identifier.
        """
        if timeout is None:
            try:
                timeout = get_resilience_limits().watchdog_timeout
            except Exception:
                # Deliberate best-effort: any failure while loading the
                # configuration falls back to the built-in default.
                timeout = DEFAULT_WATCHDOG_TIMEOUT

        self._timeout = clamp_with_limits(timeout, HARD_MIN_WATCHDOG_TIMEOUT, HARD_MAX_WATCHDOG_TIMEOUT)
        self._on_timeout = on_timeout
        self._on_alert = on_alert
        self._name = name
        self._stats = WatchdogStats()

        # State
        self._last_ping = time.monotonic()
        self._running = False
        self._triggered = False
        self._shutdown_requested = False
        self._lock = threading.Lock()
        self._stop_event = threading.Event()
        self._thread: threading.Thread | None = None
        self._async_task: asyncio.Task[None] | None = None
        # Most recent callback task scheduled on an already-running loop.
        self._callback_task: asyncio.Task[Any] | None = None
        # Strong references to in-flight callback tasks so they cannot be
        # garbage-collected before they finish.
        self._callback_tasks: set[asyncio.Task[Any]] = set()

        # State file monitoring (when using from_state_file)
        self._state_file: Path | None = None
        self._max_age: float = 30.0

    # ------------------------------------------------------------------
    # Read-only state
    # ------------------------------------------------------------------

    @property
    def timeout(self) -> float:
        """Timeout duration in seconds."""
        return self._timeout

    @property
    def name(self) -> str | None:
        """Watchdog identifier."""
        return self._name

    @property
    def stats(self) -> WatchdogStats:
        """Statistics for this watchdog."""
        return self._stats

    @property
    def is_running(self) -> bool:
        """Return True if watchdog is actively monitoring."""
        return self._running

    @property
    def is_triggered(self) -> bool:
        """Return True if timeout has been triggered."""
        return self._triggered

    @property
    def seconds_since_ping(self) -> float:
        """Return seconds since last ping."""
        with self._lock:
            return time.monotonic() - self._last_ping

    @property
    def is_shutdown(self) -> bool:
        """Check if shutdown has been requested."""
        return self._shutdown_requested

    @property
    def state_file(self) -> Path | None:
        """Return the state file path if monitoring a heartbeat file."""
        return self._state_file

    # ------------------------------------------------------------------
    # Construction helpers
    # ------------------------------------------------------------------

    @classmethod
    def from_state_file(
        cls,
        state_file: str | Path,
        *,
        check_interval: float | None = None,
        max_age: float = 30.0,
        on_timeout: Callable[[], None] | Callable[[], Awaitable[None]] | None = None,
        on_alert: OnAlertCallback | None = None,
        name: str | None = None,
    ) -> Self:
        """Create a watchdog that monitors a heartbeat state file.

        Instead of requiring periodic ping() calls, this watchdog checks
        if a heartbeat state file is being updated regularly. The file must
        contain a JSON object with an ISO-8601 ``"timestamp"`` entry.

        Args:
            state_file: Path to the heartbeat JSON state file.
            check_interval: Passed to the constructor as ``timeout``
                (defaults to max_age/2), so it is clamped like any timeout.
                The monitor loops poll every ``min(1.0, timeout / 4)``
                seconds.
            max_age: Maximum age in seconds before triggering timeout (default: 30s).
            on_timeout: Callback for timeout events.
            on_alert: Callback for alerting (channel, message, context).
            name: Optional identifier.

        Returns:
            Configured Watchdog instance.

        Examples:
            >>> wd = Watchdog.from_state_file(  # doctest: +SKIP
            ...     "/tmp/bot.heartbeat",
            ...     max_age=30.0,  # Trigger if no heartbeat for 30 seconds
            ...     on_timeout=restart_bot,
            ... )
            >>> await wd.astart()  # doctest: +SKIP
        """
        interval = check_interval if check_interval is not None else max_age / 2
        instance = cls(
            timeout=interval,
            on_timeout=on_timeout,
            on_alert=on_alert,
            name=name or f"state_file_watcher:{state_file}",
        )
        instance._state_file = Path(state_file)
        instance._max_age = max_age
        return instance

    # ------------------------------------------------------------------
    # Public control surface
    # ------------------------------------------------------------------

    def shutdown(self) -> None:
        """Signal shutdown and stop gracefully.

        The shutdown flag is never cleared, so the monitor loops exit
        immediately if the watchdog is started again afterwards.
        """
        log.info("Watchdog shutdown requested")
        self._shutdown_requested = True
        self.stop()

    async def ashutdown(self) -> None:
        """Signal shutdown and stop gracefully (async version)."""
        log.info("Watchdog shutdown requested")
        self._shutdown_requested = True
        await self.astop()

    def ping(self) -> None:
        """Reset the watchdog timer.

        Call this periodically to indicate the monitored code is still alive.
        Must be called more frequently than the timeout interval.

        Examples:
            >>> watchdog = Watchdog(timeout=30)
            >>> watchdog.ping()  # Reset timer
        """
        with self._lock:
            self._last_ping = time.monotonic()
            # Updated under the lock so concurrent pings cannot lose counts.
            self._stats.record_ping()

    async def aping(self) -> None:
        """Async version of ping().

        Examples:
            >>> import asyncio
            >>> async def example():
            ...     watchdog = Watchdog(timeout=30)
            ...     await watchdog.aping()
            >>> asyncio.run(example())
        """
        self.ping()

    def start(self) -> None:
        """Start watchdog monitoring in a background thread.

        Raises:
            RuntimeError: If watchdog is already running.

        Examples:
            >>> watchdog = Watchdog(timeout=30)
            >>> watchdog.start()
            >>> watchdog.is_running
            True
            >>> watchdog.stop()
        """
        self._mark_started()
        self._thread = threading.Thread(target=self._monitor_loop, daemon=True)
        self._thread.start()

    def stop(self) -> None:
        """Stop watchdog monitoring.

        Safe to call multiple times or when not running. Also safe to call
        from inside the on_timeout callback: the monitor thread is never
        asked to join itself.

        Examples:
            >>> watchdog = Watchdog(timeout=30)
            >>> watchdog.start()
            >>> watchdog.stop()
            >>> watchdog.is_running
            False
        """
        if not self._mark_stopped():
            return

        thread, self._thread = self._thread, None
        # Thread.join() raises RuntimeError when a thread joins itself.
        if thread is not None and thread is not threading.current_thread():
            thread.join(timeout=1.0)

    async def astart(self) -> None:
        """Start watchdog monitoring asynchronously.

        Raises:
            RuntimeError: If watchdog is already running.
        """
        self._mark_started()
        self._async_task = asyncio.create_task(self._async_monitor_loop())

    async def astop(self) -> None:
        """Stop watchdog monitoring asynchronously.

        Safe to call multiple times or when not running.
        """
        if not self._mark_stopped():
            return

        task, self._async_task = self._async_task, None
        if task is None:
            return
        task.cancel()
        # A task cannot await itself; this happens when astop() is called
        # from the on_timeout callback running inside the monitor task.
        if task is not asyncio.current_task():
            with contextlib.suppress(asyncio.CancelledError):
                await task

    def reset(self) -> None:
        """Reset watchdog state without stopping.

        Clears triggered flag and resets timer.
        """
        with self._lock:
            self._last_ping = time.monotonic()
            self._triggered = False

    # ------------------------------------------------------------------
    # Lifecycle helpers
    # ------------------------------------------------------------------

    def _mark_started(self) -> None:
        """Switch to the running state, or raise RuntimeError if already running."""
        with self._lock:
            if self._running:
                raise RuntimeError("Watchdog is already running")

            self._running = True
            self._triggered = False
            self._stop_event.clear()
            self._last_ping = time.monotonic()
            self._stats.record_start()

    def _mark_stopped(self) -> bool:
        """Leave the running state; return False if the watchdog was not running."""
        with self._lock:
            if not self._running:
                return False
            self._running = False

        self._stop_event.set()
        return True

    def _poll_interval(self) -> float:
        """Return the delay between checks: a quarter of the timeout, capped at 1s."""
        return min(1.0, self._timeout / 4)

    def _monitor_loop(self) -> None:
        """Background thread monitoring loop."""
        check_interval = self._poll_interval()

        # Event.wait() returns True as soon as stop() sets the event.
        while not self._stop_event.wait(timeout=check_interval):
            if self._shutdown_requested:
                break
            self._check_timeout()

    async def _async_monitor_loop(self) -> None:
        """Async monitoring loop."""
        check_interval = self._poll_interval()

        while self._running and not self._shutdown_requested:
            await asyncio.sleep(check_interval)
            # A synchronous stop() does not cancel this task, so re-check
            # the flags after sleeping instead of firing one late check.
            if not self._running or self._shutdown_requested:
                break
            await self._async_check_timeout()

    # ------------------------------------------------------------------
    # Timeout detection
    # ------------------------------------------------------------------

    def _claim_ping_timeout(self) -> bool:
        """Return True exactly once when the ping deadline has been missed."""
        with self._lock:
            if self._triggered:
                return False
            if time.monotonic() - self._last_ping < self._timeout:
                return False

            self._triggered = True
            self._stats.record_timeout()
            return True

    def _claim_state_file_timeout(self, is_alive: bool) -> bool:
        """Update the triggered flag from a heartbeat check.

        Returns True only on the transition from healthy to stale; a fresh
        heartbeat clears the flag so a later outage triggers again.
        """
        with self._lock:
            if is_alive:
                self._triggered = False
                return False
            if self._triggered:
                return False

            self._triggered = True
            self._stats.record_timeout()
            return True

    def _check_timeout(self) -> None:
        """Check for timeout and invoke callback if needed."""
        # If monitoring a state file, check that instead of ping time
        if self._state_file is not None:
            self._check_state_file_sync()
            return

        if self._claim_ping_timeout():
            self._dispatch_sync(stale_state_file=False)

    async def _async_check_timeout(self) -> None:
        """Async version of timeout check."""
        # If monitoring a state file, check that instead of ping time
        if self._state_file is not None:
            await self._check_state_file_async()
            return

        if self._claim_ping_timeout():
            await self._dispatch_async(stale_state_file=False)

    def _check_state_file_sync(self) -> None:
        """Check heartbeat state file (sync version)."""
        if self._state_file is None:
            return

        if self._claim_state_file_timeout(self._is_state_file_alive()):
            self._dispatch_sync(stale_state_file=True)

    async def _check_state_file_async(self) -> None:
        """Check heartbeat state file (async version)."""
        if self._state_file is None:
            return

        # Run file check in executor to avoid blocking
        loop = asyncio.get_running_loop()
        is_alive = await loop.run_in_executor(None, self._is_state_file_alive)

        if self._claim_state_file_timeout(is_alive):
            await self._dispatch_async(stale_state_file=True)

    def _is_state_file_alive(self) -> bool:
        """Check if heartbeat state file is recent enough.

        Returns False when the file is missing or unreadable, is not a JSON
        object, has no usable ``"timestamp"`` entry, or is older than the
        configured maximum age.
        """
        state_path = self._state_file
        if state_path is None:
            return False

        try:
            payload = json.loads(state_path.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # Missing/unreadable file, undecodable bytes or invalid JSON.
            return False

        # Valid JSON that is not an object (list, number, ...) has no
        # timestamp to read.
        if not isinstance(payload, dict):
            return False

        stamp = payload.get("timestamp")
        if not stamp:
            return False

        try:
            beat_time = datetime.fromisoformat(stamp)
            age = (datetime.now(timezone.utc) - beat_time).total_seconds()
        except (TypeError, ValueError):
            # Non-string or malformed timestamp, or a timezone-naive value
            # that cannot be subtracted from the aware "now".
            return False
        return age <= self._max_age

    # ------------------------------------------------------------------
    # Callback dispatch
    # ------------------------------------------------------------------

    def _stale_alert_args(self) -> tuple[str, str, dict[str, Any]]:
        """Build the (channel, message, context) arguments for a stale-file alert."""
        return (
            "watchdog",
            f"Heartbeat state file is stale: {self._state_file}",
            {"state_file": str(self._state_file), "max_age": self._max_age},
        )

    def _dispatch_sync(self, *, stale_state_file: bool) -> None:
        """Invoke the alert (stale state file only) and timeout callbacks."""
        if stale_state_file and self._on_alert is not None:
            self._call_sync("on_alert", self._on_alert, *self._stale_alert_args())
        if self._on_timeout is not None:
            self._call_sync("on_timeout", self._on_timeout)

    async def _dispatch_async(self, *, stale_state_file: bool) -> None:
        """Async counterpart of ``_dispatch_sync``."""
        if stale_state_file and self._on_alert is not None:
            await self._call_async("on_alert", self._on_alert, *self._stale_alert_args())
        if self._on_timeout is not None:
            await self._call_async("on_timeout", self._on_timeout)

    def _call_sync(self, label: str, callback: Callable[..., Any], *args: Any) -> None:
        """Run a callback from synchronous code without letting it crash the monitor."""
        try:
            outcome = callback(*args)
            if inspect.iscoroutine(outcome):
                self._drive_coroutine(outcome)
        except Exception:
            # Failures are logged, not raised, so monitoring keeps running.
            log.exception("Watchdog %r: %s callback failed", self._name, label)

    async def _call_async(self, label: str, callback: Callable[..., Any], *args: Any) -> None:
        """Run a callback from a coroutine without letting it crash the monitor."""
        try:
            outcome = callback(*args)
            if inspect.isawaitable(outcome):
                await outcome
        except Exception:
            # CancelledError is a BaseException and still propagates.
            log.exception("Watchdog %r: %s callback failed", self._name, label)

    def _drive_coroutine(self, coro: Any) -> None:
        """Execute a coroutine returned by a callback in a synchronous context."""
        try:
            loop = asyncio.get_running_loop()
        except RuntimeError:
            # No loop is running in this thread, so a private loop can
            # drive the coroutine to completion.
            asyncio.run(coro)
            return

        task = loop.create_task(coro)
        self._callback_task = task
        self._callback_tasks.add(task)
        task.add_done_callback(self._callback_tasks.discard)

    # ------------------------------------------------------------------
    # Context manager protocol
    # ------------------------------------------------------------------

    def __enter__(self) -> Self:
        """Enter context manager, starting watchdog."""
        self.start()
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: object,
    ) -> None:
        """Exit context manager, stopping watchdog."""
        self.stop()

    async def __aenter__(self) -> Self:
        """Enter async context manager, starting watchdog."""
        await self.astart()
        return self

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: object,
    ) -> None:
        """Exit async context manager, stopping watchdog."""
        await self.astop()

    def __repr__(self) -> str:
        """Return string representation."""
        name_part = f", name={self._name!r}" if self._name else ""
        status = "running" if self._running else "stopped"
        return f"Watchdog(timeout={self._timeout}, status={status}{name_part})"
|
|
588
|
+
|
|
589
|
+
|
|
590
|
+
def watchdog_context(
    timeout: float | None = None,
    on_timeout: Callable[[], None] | Callable[[], Awaitable[None]] | None = None,
    *,
    raise_on_timeout: bool = False,
    name: str | None = None,
) -> Watchdog:
    """Create a watchdog context for monitoring code blocks.

    This is a convenience function that creates a Watchdog instance.
    Use with 'with' statement for automatic start/stop.

    Args:
        timeout: Seconds before timeout triggers.
        on_timeout: Optional callback for timeout events.
        raise_on_timeout: If True and no ``on_timeout`` is given, install a
            callback that raises WatchdogTimeoutError on timeout. Ignored
            when ``on_timeout`` is provided. The error is raised inside the
            watchdog's callback invocation, which runs under an exception
            guard, so it does not propagate into the monitored code; check
            ``is_triggered`` to detect the timeout.
        name: Optional identifier.

    Returns:
        Watchdog instance for use as context manager.

    Examples:
        >>> with watchdog_context(timeout=30) as wd:  # doctest: +SKIP
        ...     for item in items:
        ...         wd.ping()
        ...         process(item)
    """
    if on_timeout is not None or not raise_on_timeout:
        return Watchdog(timeout=timeout, on_timeout=on_timeout, name=name)

    def raise_timeout() -> None:
        # Read the timeout from the watchdog itself so the error reports the
        # value actually in force, never "None".
        effective = watchdog.timeout
        raise WatchdogTimeoutError(
            f"Watchdog timeout after {effective}s",
            seconds_inactive=effective,
        )

    # The closure above only dereferences ``watchdog`` when it is called,
    # which cannot happen before the instance exists and has been started.
    watchdog = Watchdog(timeout=timeout, on_timeout=raise_timeout, name=name)
    return watchdog
|
|
630
|
+
|
|
631
|
+
|
|
632
|
+
__all__ = [
|
|
633
|
+
"OnAlertCallback",
|
|
634
|
+
"Watchdog",
|
|
635
|
+
"WatchdogStats",
|
|
636
|
+
"watchdog_context",
|
|
637
|
+
]
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""Secrets subsystem public exports.
|
|
2
|
+
|
|
3
|
+
The secrets package exposes factories and models that orchestrate credential
|
|
4
|
+
resolution across multiple providers such as kwargs, configuration files,
|
|
5
|
+
keyring backends, and SOPS encrypted payloads.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from kstlib.secrets.exceptions import (
|
|
9
|
+
SecretDecryptionError,
|
|
10
|
+
SecretError,
|
|
11
|
+
SecretNotFoundError,
|
|
12
|
+
)
|
|
13
|
+
from kstlib.secrets.models import SecretRecord, SecretRequest, SecretSource
|
|
14
|
+
from kstlib.secrets.resolver import SecretResolver, get_secret_resolver, resolve_secret
|
|
15
|
+
from kstlib.secrets.sensitive import CachePurgeProtocol, sensitive
|
|
16
|
+
|
|
17
|
+
__all__ = [
|
|
18
|
+
"CachePurgeProtocol",
|
|
19
|
+
"SecretDecryptionError",
|
|
20
|
+
"SecretError",
|
|
21
|
+
"SecretNotFoundError",
|
|
22
|
+
"SecretRecord",
|
|
23
|
+
"SecretRequest",
|
|
24
|
+
"SecretResolver",
|
|
25
|
+
"SecretSource",
|
|
26
|
+
"get_secret_resolver",
|
|
27
|
+
"resolve_secret",
|
|
28
|
+
"sensitive",
|
|
29
|
+
]
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""Custom exceptions raised by the secrets subsystem."""
|
|
2
|
+
|
|
3
|
+
__all__ = [
|
|
4
|
+
"SecretDecryptionError",
|
|
5
|
+
"SecretError",
|
|
6
|
+
"SecretNotFoundError",
|
|
7
|
+
]
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class SecretError(RuntimeError):
    """Root of the secrets exception hierarchy.

    Catch this type to handle any error raised by the secrets subsystem.
    """
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class SecretNotFoundError(SecretError):
    """Signal that none of the providers could supply the requested secret."""
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class SecretDecryptionError(SecretError):
    """Signal that decrypting a secret payload failed."""
|