ominfra 0.0.0.dev126__py3-none-any.whl → 0.0.0.dev128__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. ominfra/clouds/aws/auth.py +1 -1
  2. ominfra/deploy/_executor.py +1 -1
  3. ominfra/deploy/poly/_main.py +1 -1
  4. ominfra/pyremote/_runcommands.py +1 -1
  5. ominfra/scripts/journald2aws.py +2 -2
  6. ominfra/scripts/supervisor.py +4736 -4166
  7. ominfra/supervisor/configs.py +34 -11
  8. ominfra/supervisor/context.py +7 -345
  9. ominfra/supervisor/dispatchers.py +21 -324
  10. ominfra/supervisor/dispatchersimpl.py +343 -0
  11. ominfra/supervisor/groups.py +33 -111
  12. ominfra/supervisor/groupsimpl.py +86 -0
  13. ominfra/supervisor/inject.py +45 -20
  14. ominfra/supervisor/main.py +3 -3
  15. ominfra/supervisor/pipes.py +85 -0
  16. ominfra/supervisor/poller.py +42 -38
  17. ominfra/supervisor/privileges.py +65 -0
  18. ominfra/supervisor/process.py +6 -742
  19. ominfra/supervisor/processimpl.py +516 -0
  20. ominfra/supervisor/setup.py +38 -0
  21. ominfra/supervisor/setupimpl.py +262 -0
  22. ominfra/supervisor/spawning.py +32 -0
  23. ominfra/supervisor/spawningimpl.py +350 -0
  24. ominfra/supervisor/supervisor.py +67 -84
  25. ominfra/supervisor/types.py +101 -47
  26. ominfra/supervisor/utils/__init__.py +0 -0
  27. ominfra/supervisor/utils/collections.py +52 -0
  28. ominfra/supervisor/utils/diag.py +31 -0
  29. ominfra/supervisor/utils/fds.py +46 -0
  30. ominfra/supervisor/utils/fs.py +47 -0
  31. ominfra/supervisor/utils/os.py +45 -0
  32. ominfra/supervisor/utils/ostypes.py +9 -0
  33. ominfra/supervisor/utils/signals.py +60 -0
  34. ominfra/supervisor/utils/strings.py +105 -0
  35. ominfra/supervisor/utils/users.py +67 -0
  36. {ominfra-0.0.0.dev126.dist-info → ominfra-0.0.0.dev128.dist-info}/METADATA +3 -3
  37. {ominfra-0.0.0.dev126.dist-info → ominfra-0.0.0.dev128.dist-info}/RECORD +41 -25
  38. ominfra/supervisor/datatypes.py +0 -175
  39. ominfra/supervisor/signals.py +0 -52
  40. ominfra/supervisor/utils.py +0 -206
  41. {ominfra-0.0.0.dev126.dist-info → ominfra-0.0.0.dev128.dist-info}/LICENSE +0 -0
  42. {ominfra-0.0.0.dev126.dist-info → ominfra-0.0.0.dev128.dist-info}/WHEEL +0 -0
  43. {ominfra-0.0.0.dev126.dist-info → ominfra-0.0.0.dev128.dist-info}/entry_points.txt +0 -0
  44. {ominfra-0.0.0.dev126.dist-info → ominfra-0.0.0.dev128.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,516 @@
1
+ # ruff: noqa: UP006 UP007
2
+ import errno
3
+ import os.path
4
+ import signal
5
+ import time
6
+ import traceback
7
+ import typing as ta
8
+
9
+ from omlish.lite.check import check_isinstance
10
+ from omlish.lite.logs import log
11
+ from omlish.lite.typing import Func1
12
+
13
+ from .configs import ProcessConfig
14
+ from .configs import RestartUnconditionally
15
+ from .dispatchers import Dispatchers
16
+ from .events import PROCESS_STATE_EVENT_MAP
17
+ from .events import EventCallbacks
18
+ from .pipes import ProcessPipes
19
+ from .pipes import close_parent_pipes
20
+ from .process import ProcessStateError
21
+ from .spawning import ProcessSpawnError
22
+ from .spawning import ProcessSpawning
23
+ from .states import ProcessState
24
+ from .states import SupervisorState
25
+ from .types import InputDispatcher
26
+ from .types import Process
27
+ from .types import ProcessGroup
28
+ from .types import ServerContext
29
+ from .utils.os import decode_wait_status
30
+ from .utils.ostypes import Pid
31
+ from .utils.ostypes import Rc
32
+ from .utils.signals import sig_name
33
+
34
+
35
class ProcessSpawningFactory(Func1[Process, ProcessSpawning]):
    """
    Callable wrapper typed as taking a `Process` and returning a `ProcessSpawning`. `ProcessImpl.__init__` calls it
    with the process being constructed to obtain that process's spawner.
    """
    pass
37
+
38
+
39
+ ##
40
+
41
+
42
+ class ProcessImpl(Process):
43
+ """A class to manage a subprocess."""
44
+
45
def __init__(
        self,
        config: ProcessConfig,
        group: ProcessGroup,
        *,
        context: ServerContext,
        event_callbacks: EventCallbacks,
        process_spawning_factory: ProcessSpawningFactory,
) -> None:
    """
    Create a process record in the STOPPED state with no pid, no pipes and no dispatchers.

    Args:
        config: Configuration of this process (name, startsecs, stopsignal, ... are read from it elsewhere in this
            class).
        group: The process group this process is exposed as belonging to.
        context: Server context; `transition()` reads its `state`.
        event_callbacks: Receiver of the state-change events emitted by `change_state()`.
        process_spawning_factory: Called once, with this instance, to build the spawner used by `spawn()`.
    """
    super().__init__()

    self._config = config
    self._group = group

    self._context = context
    self._event_callbacks = event_callbacks

    # NOTE: the factory receives `self` before any of the runtime fields below exist, so it must not read them during
    # construction.
    self._spawning = process_spawning_factory(self)

    #

    self._dispatchers = Dispatchers([])
    self._pipes = ProcessPipes()

    self._state = ProcessState.STOPPED
    self._pid = Pid(0)  # 0 when not running

    self._last_start = 0.  # Last time the subprocess was started; 0 if never
    self._last_stop = 0.  # Last time the subprocess was stopped; 0 if never
    self._last_stop_report = 0.  # Last time "waiting for x to stop" logged, to throttle
    self._delay = 0.  # If nonzero, delay starting or killing until this time

    self._administrative_stop = False  # true if process has been stopped by an admin
    self._system_stop = False  # true if process has been stopped by the system

    self._killing = False  # true if we are trying to kill this process

    self._backoff = 0  # backoff counter (to startretries)

    self._exitstatus: ta.Optional[Rc] = None  # status attached to dead process by finish()
    self._spawn_err: ta.Optional[str] = None  # error message attached by spawn() if any
86
+
87
+ #
88
+
89
+ def __repr__(self) -> str:
90
+ return f'<Subprocess at {id(self)} with name {self._config.name} in state {self.get_state().name}>'
91
+
92
+ #
93
+
94
@property
def name(self) -> str:
    """The process name, read from its config."""
    return self._config.name
97
+
98
@property
def config(self) -> ProcessConfig:
    """The `ProcessConfig` this process was constructed with."""
    return self._config
101
+
102
@property
def group(self) -> ProcessGroup:
    """The `ProcessGroup` this process was constructed with."""
    return self._group
105
+
106
@property
def pid(self) -> Pid:
    """The pid recorded by the last successful `spawn()`; 0 when not running."""
    return self._pid
109
+
110
+ #
111
+
112
@property
def context(self) -> ServerContext:
    """The `ServerContext` this process was constructed with."""
    return self._context
115
+
116
@property
def state(self) -> ProcessState:
    """The current `ProcessState`, as last set by `change_state()`."""
    return self._state
119
+
120
@property
def backoff(self) -> int:
    """The backoff counter: incremented on every entry into BACKOFF, reset to 0 on RUNNING, EXITED or give-up."""
    return self._backoff
123
+
124
+ #
125
+
126
def spawn(self) -> ta.Optional[Pid]:
    """
    Start the subprocess through the configured spawner.

    Returns the new pid on success. Returns None if a pid is already recorded, or if the spawner raised
    `ProcessSpawnError` - in that case the error's first argument is kept as the spawn error and the process moves
    STARTING -> BACKOFF.

    Raises `ProcessStateError` (via `check_in_state`) if the process is not in a state it may be started from.
    """
    if self.pid:
        log.warning('process \'%s\' already running', self.name)
        return None

    startable_states = (
        ProcessState.EXITED,
        ProcessState.FATAL,
        ProcessState.BACKOFF,
        ProcessState.STOPPED,
    )
    self.check_in_state(*startable_states)

    # Wipe everything left over from the previous run before stamping the new start time.
    self._killing = False
    self._spawn_err = None
    self._exitstatus = None
    self._system_stop = False
    self._administrative_stop = False

    self._last_start = time.time()

    self.change_state(ProcessState.STARTING)

    try:
        spawned = self._spawning.spawn()
    except ProcessSpawnError as spawn_exc:
        log.exception('Spawn error')
        self._spawn_err = spawn_exc.args[0]
        self.check_in_state(ProcessState.STARTING)
        self.change_state(ProcessState.BACKOFF)
        return None

    log.info("Spawned: '%s' with pid %s", self.name, spawned.pid)

    self._pid = spawned.pid
    self._pipes = spawned.pipes
    self._dispatchers = spawned.dispatchers

    # transition() will not promote STARTING -> RUNNING before startsecs have elapsed.
    self._delay = time.time() + self.config.startsecs

    return spawned.pid
166
+
167
def get_dispatchers(self) -> Dispatchers:
    """Return the dispatchers of the current run; an empty `Dispatchers` before spawn() and after finish()."""
    return self._dispatchers
169
+
170
+ def write(self, chars: ta.Union[bytes, str]) -> None:
171
+ if not self.pid or self._killing:
172
+ raise OSError(errno.EPIPE, 'Process already closed')
173
+
174
+ stdin_fd = self._pipes.stdin
175
+ if stdin_fd is None:
176
+ raise OSError(errno.EPIPE, 'Process has no stdin channel')
177
+
178
+ dispatcher = check_isinstance(self._dispatchers[stdin_fd], InputDispatcher)
179
+ if dispatcher.closed:
180
+ raise OSError(errno.EPIPE, "Process' stdin channel is closed")
181
+
182
+ dispatcher.write(chars)
183
+ dispatcher.flush() # this must raise EPIPE if the pipe is closed
184
+
185
+ #
186
+
187
+ def change_state(self, new_state: ProcessState, expected: bool = True) -> bool:
188
+ old_state = self._state
189
+ if new_state is old_state:
190
+ return False
191
+
192
+ self._state = new_state
193
+ if new_state == ProcessState.BACKOFF:
194
+ now = time.time()
195
+ self._backoff += 1
196
+ self._delay = now + self._backoff
197
+
198
+ event_class = PROCESS_STATE_EVENT_MAP.get(new_state)
199
+ if event_class is not None:
200
+ event = event_class(self, old_state, expected)
201
+ self._event_callbacks.notify(event)
202
+
203
+ return True
204
+
205
+ def check_in_state(self, *states: ProcessState) -> None:
206
+ if self._state not in states:
207
+ raise ProcessStateError(
208
+ f'Check failed for {self._config.name}: '
209
+ f'{self._state.name} not in {" ".join(s.name for s in states)}',
210
+ )
211
+
212
+ #
213
+
214
def _check_and_adjust_for_system_clock_rollback(self, test_time):
    """
    Re-anchor stored timestamps when `test_time` shows the system clock has gone backwards past them.

    Which fields are adjusted depends on the current state:
      STARTING - last start is capped at test_time; a pending delay is re-based on startsecs
      RUNNING  - last start is moved back so that startsecs count as already elapsed
      STOPPING - last stop report is capped at test_time; a pending delay is re-based on stopwaitsecs
      BACKOFF  - a pending delay is re-based on the backoff counter
    Nothing is changed in any other state.
    """
    cfg = self._config
    state = self._state

    if state == ProcessState.STARTING:
        self._last_start = min(test_time, self._last_start)
        if self._delay > 0 and test_time < (self._delay - cfg.startsecs):
            self._delay = test_time + cfg.startsecs
        return

    if state == ProcessState.RUNNING:
        if self._last_start < test_time < (self._last_start + cfg.startsecs):
            self._last_start = test_time - cfg.startsecs
        return

    if state == ProcessState.STOPPING:
        self._last_stop_report = min(test_time, self._last_stop_report)
        if self._delay > 0 and test_time < (self._delay - cfg.stopwaitsecs):
            self._delay = test_time + cfg.stopwaitsecs
        return

    if state == ProcessState.BACKOFF:
        if self._delay > 0 and test_time < (self._delay - self._backoff):
            self._delay = test_time + self._backoff
236
+
237
def stop(self) -> ta.Optional[str]:
    """
    Administratively stop the process by sending it the configured stop signal.

    Returns whatever `kill()` returns: None if the signal was sent, otherwise an error message.
    """
    self._administrative_stop = True
    self._last_stop_report = 0  # reset the throttle so the next stop_report() logs right away
    return self.kill(self._config.stopsignal)
241
+
242
def stop_report(self) -> None:
    """Log a 'waiting for x to stop' message with throttling; does nothing unless the process is STOPPING."""
    if self._state != ProcessState.STOPPING:
        return

    now = time.time()
    self._check_and_adjust_for_system_clock_rollback(now)

    # at most one message every 2 seconds
    if now <= (self._last_stop_report + 2):
        return

    log.info('waiting for %s to stop', self.name)
    self._last_stop_report = now
253
+
254
def give_up(self) -> None:
    """
    Stop retrying: clear delay and backoff, mark the stop as system-initiated and move BACKOFF -> FATAL.

    Raises `ProcessStateError` (via `check_in_state`) if the process is not in BACKOFF; note the fields above the
    check have already been reset by then.
    """
    self._delay = 0
    self._backoff = 0
    self._system_stop = True
    self.check_in_state(ProcessState.BACKOFF)
    self.change_state(ProcessState.FATAL)
260
+
261
def kill(self, sig: int) -> ta.Optional[str]:
    """
    Send a signal to the subprocess with the intention to kill it (to make it exit). This may or may not actually
    kill it.

    Return None if the signal was sent, or an error message string if an error occurred or if the subprocess is not
    running.

    A process in BACKOFF is not signalled at all: it is moved straight to STOPPED and None is returned. Otherwise the
    process is put into STOPPING, its delay is set to now + stopwaitsecs, and the signal goes either to the pid or,
    when the relevant *asgroup option is set, to the negated pid (i.e. the whole process group).

    Raises `ProcessStateError` (via `check_in_state`) if a pid is recorded but the process is not RUNNING, STARTING
    or STOPPING.
    """
    now = time.time()

    # If the process is in BACKOFF and we want to stop or kill it, then BACKOFF -> STOPPED. This is needed because
    # if startretries is a large number and the process isn't starting successfully, the stop request would be
    # blocked for a long time waiting for the retries.
    if self._state == ProcessState.BACKOFF:
        log.debug('Attempted to kill %s, which is in BACKOFF state.', self.name)
        self.change_state(ProcessState.STOPPED)
        return None

    args: tuple
    if not self.pid:
        fmt, args = "attempted to kill %s with sig %s but it wasn't running", (self.name, sig_name(sig))
        log.debug(fmt, *args)
        return fmt % args

    # If we're in the stopping state, then we've already sent the stop signal and this is the kill signal
    if self._state == ProcessState.STOPPING:
        killasgroup = self._config.killasgroup
    else:
        killasgroup = self._config.stopasgroup

    as_group = ''
    if killasgroup:
        as_group = 'process group '

    # `as_group` already ends with a space (or is empty), so the format must not insert another one before 'with'.
    log.debug('killing %s (pid %s) %swith signal %s', self.name, self.pid, as_group, sig_name(sig))

    # RUNNING/STARTING/STOPPING -> STOPPING
    self._killing = True
    self._delay = now + self._config.stopwaitsecs
    # we will already be in the STOPPING state if we're doing a SIGKILL as a result of overrunning stopwaitsecs
    self.check_in_state(ProcessState.RUNNING, ProcessState.STARTING, ProcessState.STOPPING)
    self.change_state(ProcessState.STOPPING)

    kpid = int(self.pid)
    if killasgroup:
        # send to the whole process group instead
        kpid = -kpid

    try:
        try:
            os.kill(kpid, sig)
        except OSError as exc:
            if exc.errno == errno.ESRCH:
                log.debug('unable to signal %s (pid %s), it probably just exited on its own: %s', self.name, self.pid, str(exc))  # noqa
                # we could change the state here but we intentionally do not. we will do it during normal SIGCHLD
                # processing.
                return None
            raise
    except Exception:  # noqa
        # Anything else (including a re-raised non-ESRCH OSError) leaves the process in an undetermined condition.
        tb = traceback.format_exc()
        fmt, args = 'unknown problem killing %s (%s):%s', (self.name, self.pid, tb)
        log.critical(fmt, *args)
        self.change_state(ProcessState.UNKNOWN)
        self._killing = False
        self._delay = 0
        return fmt % args

    return None
329
+
330
def signal(self, sig: int) -> ta.Optional[str]:
    """
    Deliver `sig` to the subprocess without any intention of stopping it.

    Returns None when the signal was sent (or the pid had already vanished), otherwise an error message string - this
    includes the case where no pid is recorded. Unlike `kill()`, neither the killing flag nor the delay is touched.

    Raises `ProcessStateError` (via `check_in_state`) if a pid is recorded but the process is not RUNNING, STARTING
    or STOPPING.
    """
    args: tuple

    if not self.pid:
        fmt = "Attempted to send %s sig %s but it wasn't running"
        args = (self.name, sig_name(sig))
        log.debug(fmt, *args)
        return fmt % args

    log.debug('sending %s (pid %s) sig %s', self.name, self.pid, sig_name(sig))

    signalable_states = (ProcessState.RUNNING, ProcessState.STARTING, ProcessState.STOPPING)
    self.check_in_state(*signalable_states)

    try:
        try:
            os.kill(self.pid, sig)
        except OSError as os_err:
            if os_err.errno != errno.ESRCH:
                raise
            log.debug(
                'unable to signal %s (pid %s), it probably just now exited on its own: %s',
                self.name,
                self.pid,
                str(os_err),
            )
            # we could change the state here but we intentionally do not. we will do it during normal SIGCHLD
            # processing.
            return None
    except Exception:  # noqa
        tb = traceback.format_exc()
        fmt = 'unknown problem sending sig %s (%s):%s'
        args = (self.name, self.pid, tb)
        log.critical(fmt, *args)
        self.change_state(ProcessState.UNKNOWN)
        return fmt % args

    return None
370
+
371
def finish(self, sts: Rc) -> None:
    """
    The process was reaped and we need to report and manage its state.

    `sts` is decoded with `decode_wait_status` into an exit status and a message. Depending on how the process got
    here it ends up in one of:
      STOPPED - it was being killed (STOPPING -> STOPPED)
      BACKOFF - it exited before `startsecs` had elapsed (STARTING -> BACKOFF)
      EXITED  - it had been running; the event is flagged expected or not by whether the exit status is in the
                configured `exitcodes`
    Afterwards the pid is reset to 0, the parent ends of the pipes are closed, and pipes and dispatchers are replaced
    by empty ones.

    Raises `ProcessStateError` (via `check_in_state`) if the current state does not match the branch taken.
    """

    # Let the dispatchers drain before the pipes get closed at the end of this method.
    self._dispatchers.drain()

    es, msg = decode_wait_status(sts)

    now = time.time()

    self._check_and_adjust_for_system_clock_rollback(now)

    self._last_stop = now

    if now > self._last_start:
        too_quickly = now - self._last_start < self._config.startsecs
    else:
        too_quickly = False
        log.warning(
            "process '%s' (%s) last_start time is in the future, don't know how long process was running so "
            "assuming it did not exit too quickly",
            self.name,
            self.pid,
        )

    exit_expected = es in self._config.exitcodes

    if self._killing:
        # likely the result of a stop request implies STOPPING -> STOPPED
        self._killing = False
        self._delay = 0
        self._exitstatus = Rc(es)

        fmt, args = 'stopped: %s (%s)', (self.name, msg)
        self.check_in_state(ProcessState.STOPPING)
        self.change_state(ProcessState.STOPPED)
        # Same message either way; only the log level reflects whether the exit code was an expected one.
        if exit_expected:
            log.info(fmt, *args)
        else:
            log.warning(fmt, *args)

    elif too_quickly:
        # the program did not stay up long enough to make it to RUNNING implies STARTING -> BACKOFF
        self._exitstatus = None
        self._spawn_err = 'Exited too quickly (process log may have details)'
        self.check_in_state(ProcessState.STARTING)
        self.change_state(ProcessState.BACKOFF)
        log.warning('exited: %s (%s)', self.name, msg + '; not expected')

    else:
        # this finish was not the result of a stop request, the program was in the RUNNING state but exited implies
        # RUNNING -> EXITED normally but see next comment
        self._delay = 0
        self._backoff = 0
        self._exitstatus = es

        # if the process was STARTING but a system time change causes self.last_start to be in the future, the
        # normal STARTING->RUNNING transition can be subverted so we perform the transition here.
        if self._state == ProcessState.STARTING:
            self.change_state(ProcessState.RUNNING)

        self.check_in_state(ProcessState.RUNNING)

        if exit_expected:
            # expected exit code
            self.change_state(ProcessState.EXITED, expected=True)
            log.info('exited: %s (%s)', self.name, msg + '; expected')
        else:
            # unexpected exit code
            self._spawn_err = f'Bad exit code {es}'
            self.change_state(ProcessState.EXITED, expected=False)
            log.warning('exited: %s (%s)', self.name, msg + '; not expected')

    # Back to the "not running" baseline: no pid, no pipes, no dispatchers.
    self._pid = Pid(0)
    close_parent_pipes(self._pipes)
    self._pipes = ProcessPipes()
    self._dispatchers = Dispatchers([])
447
+
448
def get_state(self) -> ProcessState:
    """Return the current `ProcessState` (same value as the `state` property)."""
    return self._state
450
+
451
def transition(self) -> None:
    """
    Advance the process state machine by one step, based on the state observed on entry.

    Unless the supervisor state is RESTARTING or lower, this may (re)start the process:
      EXITED  -> STARTING when autorestart is unconditional, or set and the exit status is not an expected one
      STOPPED -> STARTING when autostart is set and the process has never been started
      BACKOFF -> STARTING when retries remain and the backoff delay has passed
    Independently of the supervisor state:
      STARTING -> RUNNING once the process has stayed up for more than startsecs
      BACKOFF  -> FATAL once the backoff counter exceeds startretries
      STOPPING: send SIGKILL once the stop delay has run out
    """
    now = time.time()
    state = self._state  # snapshot: the checks below deliberately do not see changes made by spawn()

    self._check_and_adjust_for_system_clock_rollback(now)

    cfg = self._config

    # dont start any processes if supervisor is shutting down
    if self.context.state > SupervisorState.RESTARTING:
        if state == ProcessState.EXITED:
            # EXITED -> STARTING
            if cfg.autorestart and (
                    cfg.autorestart is RestartUnconditionally or
                    self._exitstatus not in cfg.exitcodes
            ):
                self.spawn()

        elif state == ProcessState.STOPPED and not self._last_start:
            # STOPPED -> STARTING
            if cfg.autostart:
                self.spawn()

        elif state == ProcessState.BACKOFF:
            # BACKOFF -> STARTING
            if self._backoff <= cfg.startretries and now > self._delay:
                self.spawn()

    if state == ProcessState.STARTING:
        if now - self._last_start > cfg.startsecs:
            # STARTING -> RUNNING if the proc has started successfully and it has stayed up for at least
            # proc.config.startsecs,
            self._delay = 0
            self._backoff = 0
            self.check_in_state(ProcessState.STARTING)
            self.change_state(ProcessState.RUNNING)
            running_msg = 'entered RUNNING state, process has stayed up for > than %s seconds (startsecs)' % cfg.startsecs  # noqa
            log.info('success: %s %s', self.name, running_msg)

    if state == ProcessState.BACKOFF:
        if self._backoff > cfg.startretries:
            # BACKOFF -> FATAL if the proc has exceeded its number of retries
            self.give_up()
            fatal_msg = 'entered FATAL state, too many start retries too quickly'
            log.info('gave up: %s %s', self.name, fatal_msg)

    elif state == ProcessState.STOPPING:
        if self._delay - now <= 0:
            # kill processes which are taking too long to stop with a final sigkill. if this doesn't kill it, the
            # process will be stuck in the STOPPING state forever.
            log.warning('killing \'%s\' (%s) with SIGKILL', self.name, self.pid)
            self.kill(signal.SIGKILL)
506
+
507
def after_setuid(self) -> None:
    """Currently a no-op: the automatic child-logfile naming below is left commented out."""
    # temporary logfiles which are erased at start time
    # get_autoname = self.context.get_auto_child_log_name  # noqa
    # sid = self.context.config.identifier  # noqa
    # name = self._config.name  # noqa
    # if self.stdout_logfile is Automatic:
    #     self.stdout_logfile = get_autoname(name, sid, 'stdout')
    # if self.stderr_logfile is Automatic:
    #     self.stderr_logfile = get_autoname(name, sid, 'stderr')
    pass
@@ -0,0 +1,38 @@
1
+ # ruff: noqa: UP006 UP007
2
+ import abc
3
+ import typing as ta
4
+
5
+ from .utils.users import User
6
+
7
+
8
+ ##
9
+
10
+
11
# Distinct static type over `User`, so the supervisor's own user can be told apart from any other `User` by type
# checkers (presumably also used as an injection key - confirm against inject.py).
SupervisorUser = ta.NewType('SupervisorUser', User)
12
+
13
+
14
+ ##
15
+
16
+
17
class DaemonizeListener(abc.ABC):  # noqa
    """
    Hook interface with two optional callbacks. Both default to doing nothing, so subclasses override only what they
    need. (Presumably invoked around the supervisor's daemonization step - the caller is not part of this module.)
    """

    def before_daemonize(self) -> None:  # noqa
        """Hook with a no-op default; by its name, intended to run before daemonizing."""
        pass

    def after_daemonize(self) -> None:  # noqa
        """Hook with a no-op default; by its name, intended to run after daemonizing."""
        pass
23
+
24
+
25
# Distinct static type for a sequence of `DaemonizeListener`s.
DaemonizeListeners = ta.NewType('DaemonizeListeners', ta.Sequence[DaemonizeListener])
26
+
27
+
28
+ ##
29
+
30
+
31
class SupervisorSetup(abc.ABC):
    """Abstract interface pairing a `setup()` step with a matching `cleanup()` step; both must be implemented."""

    @abc.abstractmethod
    def setup(self) -> None:
        """Perform the setup step. Must be overridden by concrete implementations."""
        raise NotImplementedError

    @abc.abstractmethod
    def cleanup(self) -> None:
        """Perform the cleanup step. Must be overridden by concrete implementations."""
        raise NotImplementedError