ominfra 0.0.0.dev126__py3-none-any.whl → 0.0.0.dev128__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (44) hide show
  1. ominfra/clouds/aws/auth.py +1 -1
  2. ominfra/deploy/_executor.py +1 -1
  3. ominfra/deploy/poly/_main.py +1 -1
  4. ominfra/pyremote/_runcommands.py +1 -1
  5. ominfra/scripts/journald2aws.py +2 -2
  6. ominfra/scripts/supervisor.py +4736 -4166
  7. ominfra/supervisor/configs.py +34 -11
  8. ominfra/supervisor/context.py +7 -345
  9. ominfra/supervisor/dispatchers.py +21 -324
  10. ominfra/supervisor/dispatchersimpl.py +343 -0
  11. ominfra/supervisor/groups.py +33 -111
  12. ominfra/supervisor/groupsimpl.py +86 -0
  13. ominfra/supervisor/inject.py +45 -20
  14. ominfra/supervisor/main.py +3 -3
  15. ominfra/supervisor/pipes.py +85 -0
  16. ominfra/supervisor/poller.py +42 -38
  17. ominfra/supervisor/privileges.py +65 -0
  18. ominfra/supervisor/process.py +6 -742
  19. ominfra/supervisor/processimpl.py +516 -0
  20. ominfra/supervisor/setup.py +38 -0
  21. ominfra/supervisor/setupimpl.py +262 -0
  22. ominfra/supervisor/spawning.py +32 -0
  23. ominfra/supervisor/spawningimpl.py +350 -0
  24. ominfra/supervisor/supervisor.py +67 -84
  25. ominfra/supervisor/types.py +101 -47
  26. ominfra/supervisor/utils/__init__.py +0 -0
  27. ominfra/supervisor/utils/collections.py +52 -0
  28. ominfra/supervisor/utils/diag.py +31 -0
  29. ominfra/supervisor/utils/fds.py +46 -0
  30. ominfra/supervisor/utils/fs.py +47 -0
  31. ominfra/supervisor/utils/os.py +45 -0
  32. ominfra/supervisor/utils/ostypes.py +9 -0
  33. ominfra/supervisor/utils/signals.py +60 -0
  34. ominfra/supervisor/utils/strings.py +105 -0
  35. ominfra/supervisor/utils/users.py +67 -0
  36. {ominfra-0.0.0.dev126.dist-info → ominfra-0.0.0.dev128.dist-info}/METADATA +3 -3
  37. {ominfra-0.0.0.dev126.dist-info → ominfra-0.0.0.dev128.dist-info}/RECORD +41 -25
  38. ominfra/supervisor/datatypes.py +0 -175
  39. ominfra/supervisor/signals.py +0 -52
  40. ominfra/supervisor/utils.py +0 -206
  41. {ominfra-0.0.0.dev126.dist-info → ominfra-0.0.0.dev128.dist-info}/LICENSE +0 -0
  42. {ominfra-0.0.0.dev126.dist-info → ominfra-0.0.0.dev128.dist-info}/WHEEL +0 -0
  43. {ominfra-0.0.0.dev126.dist-info → ominfra-0.0.0.dev128.dist-info}/entry_points.txt +0 -0
  44. {ominfra-0.0.0.dev126.dist-info → ominfra-0.0.0.dev128.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,516 @@
1
+ # ruff: noqa: UP006 UP007
2
+ import errno
3
+ import os.path
4
+ import signal
5
+ import time
6
+ import traceback
7
+ import typing as ta
8
+
9
+ from omlish.lite.check import check_isinstance
10
+ from omlish.lite.logs import log
11
+ from omlish.lite.typing import Func1
12
+
13
+ from .configs import ProcessConfig
14
+ from .configs import RestartUnconditionally
15
+ from .dispatchers import Dispatchers
16
+ from .events import PROCESS_STATE_EVENT_MAP
17
+ from .events import EventCallbacks
18
+ from .pipes import ProcessPipes
19
+ from .pipes import close_parent_pipes
20
+ from .process import ProcessStateError
21
+ from .spawning import ProcessSpawnError
22
+ from .spawning import ProcessSpawning
23
+ from .states import ProcessState
24
+ from .states import SupervisorState
25
+ from .types import InputDispatcher
26
+ from .types import Process
27
+ from .types import ProcessGroup
28
+ from .types import ServerContext
29
+ from .utils.os import decode_wait_status
30
+ from .utils.ostypes import Pid
31
+ from .utils.ostypes import Rc
32
+ from .utils.signals import sig_name
33
+
34
+
35
class ProcessSpawningFactory(Func1[Process, ProcessSpawning]):
    """Factory callable typed as taking a Process and returning a ProcessSpawning."""
    pass
37
+
38
+
39
+ ##
40
+
41
+
42
+ class ProcessImpl(Process):
43
+ """A class to manage a subprocess."""
44
+
45
def __init__(
        self,
        config: ProcessConfig,
        group: ProcessGroup,
        *,
        context: ServerContext,
        event_callbacks: EventCallbacks,
        process_spawning_factory: ProcessSpawningFactory,
) -> None:
    """
    Build a process in the STOPPED state with no pid, no pipes and no dispatchers.

    config and group are stored as-is and exposed through the same-named properties. context is read by transition()
    for the supervisor state, and event_callbacks is notified on state changes. process_spawning_factory is called
    once, with this object, to obtain the spawner used by spawn().
    """
    super().__init__()

    self._config = config
    self._group = group

    self._context = context
    self._event_callbacks = event_callbacks

    # The factory receives self while the attributes below do not exist yet, so it must not read them eagerly.
    self._spawning = process_spawning_factory(self)

    #

    self._dispatchers = Dispatchers([])
    self._pipes = ProcessPipes()

    self._state = ProcessState.STOPPED
    self._pid = Pid(0)  # 0 when not running

    self._last_start = 0.  # Last time the subprocess was started; 0 if never
    self._last_stop = 0.  # Last time the subprocess was stopped; 0 if never
    self._last_stop_report = 0.  # Last time "waiting for x to stop" logged, to throttle
    self._delay = 0.  # If nonzero, delay starting or killing until this time

    self._administrative_stop = False  # true if process has been stopped by an admin
    self._system_stop = False  # true if process has been stopped by the system

    self._killing = False  # true if we are trying to kill this process

    self._backoff = 0  # backoff counter (to startretries)

    self._exitstatus: ta.Optional[Rc] = None  # status attached to dead process by finish()
    self._spawn_err: ta.Optional[str] = None  # error message attached by spawn() if any
86
+
87
+ #
88
+
89
def __repr__(self) -> str:
    """Return a debug string holding this object's id, its configured name and its current state name."""
    state_name = self.get_state().name
    return f'<Subprocess at {id(self)} with name {self._config.name} in state {state_name}>'
91
+
92
+ #
93
+
94
@property
def name(self) -> str:
    """The process name, read from its config."""
    return self._config.name
97
+
98
@property
def config(self) -> ProcessConfig:
    """The ProcessConfig this process was constructed with."""
    return self._config
101
+
102
@property
def group(self) -> ProcessGroup:
    """The ProcessGroup this process was constructed with."""
    return self._group
105
+
106
@property
def pid(self) -> Pid:
    """The recorded subprocess pid; 0 (falsy) when no subprocess is running."""
    return self._pid
109
+
110
+ #
111
+
112
@property
def context(self) -> ServerContext:
    """The ServerContext this process was constructed with."""
    return self._context
115
+
116
@property
def state(self) -> ProcessState:
    """The current ProcessState of this process."""
    return self._state
119
+
120
@property
def backoff(self) -> int:
    """The backoff counter: incremented on each entry to BACKOFF and reset to 0 elsewhere in this class."""
    return self._backoff
123
+
124
+ #
125
+
126
def spawn(self) -> ta.Optional[Pid]:
    """
    Start the subprocess through the configured spawner and record its pid, pipes and dispatchers.

    Returns the new pid. Returns None if a pid is already recorded, or if spawning raised ProcessSpawnError - in that
    case the process goes STARTING -> BACKOFF and the error message is kept in _spawn_err.

    Raises ProcessStateError (via check_in_state) when called outside EXITED, FATAL, BACKOFF or STOPPED.
    """
    if self.pid:
        log.warning('process \'%s\' already running', self.name)
        return None

    self.check_in_state(
        ProcessState.EXITED,
        ProcessState.FATAL,
        ProcessState.BACKOFF,
        ProcessState.STOPPED,
    )

    # Clear bookkeeping left over from any previous run.
    self._killing = False
    self._spawn_err = None
    self._exitstatus = None
    self._system_stop = False
    self._administrative_stop = False

    self._last_start = time.time()

    self.change_state(ProcessState.STARTING)

    try:
        sp = self._spawning.spawn()
    except ProcessSpawnError as err:
        log.exception('Spawn error')
        # An argument-less exception must not raise IndexError from inside this handler: that would skip the
        # BACKOFF transition below and leave the process in STARTING without a pid.
        self._spawn_err = err.args[0] if err.args else type(err).__name__
        self.check_in_state(ProcessState.STARTING)
        self.change_state(ProcessState.BACKOFF)
        return None

    log.info("Spawned: '%s' with pid %s", self.name, sp.pid)

    self._pid = sp.pid
    self._pipes = sp.pipes
    self._dispatchers = sp.dispatchers

    # transition() compares against _last_start; _delay records when the startsecs window ends.
    self._delay = time.time() + self.config.startsecs

    return sp.pid
166
+
167
def get_dispatchers(self) -> Dispatchers:
    """Return the dispatchers currently attached to this process (replaced by spawn() and finish())."""
    return self._dispatchers
169
+
170
def write(self, chars: ta.Union[bytes, str]) -> None:
    """
    Write chars to the subprocess's stdin through its input dispatcher and flush.

    Raises OSError with errno EPIPE when there is no pid, a kill is in progress, there is no stdin fd, or the stdin
    dispatcher is already closed.
    """
    def broken_pipe(reason: str) -> OSError:
        return OSError(errno.EPIPE, reason)

    if not self.pid or self._killing:
        raise broken_pipe('Process already closed')

    fd = self._pipes.stdin
    if fd is None:
        raise broken_pipe('Process has no stdin channel')

    stdin_dispatcher = check_isinstance(self._dispatchers[fd], InputDispatcher)
    if stdin_dispatcher.closed:
        raise broken_pipe("Process' stdin channel is closed")

    stdin_dispatcher.write(chars)
    stdin_dispatcher.flush()  # this must raise EPIPE if the pipe is closed
184
+
185
+ #
186
+
187
def change_state(self, new_state: ProcessState, expected: bool = True) -> bool:
    """
    Move to new_state and notify the event callbacks if an event class is mapped to that state.

    Returns False (doing nothing) when new_state is already the current state, True otherwise. Entering BACKOFF also
    increments the backoff counter and sets _delay to now plus that counter.
    """
    previous = self._state
    if previous is new_state:
        return False

    self._state = new_state

    if new_state == ProcessState.BACKOFF:
        entered_at = time.time()
        self._backoff += 1
        self._delay = entered_at + self._backoff

    event_cls = PROCESS_STATE_EVENT_MAP.get(new_state)
    if event_cls is None:
        return True

    self._event_callbacks.notify(event_cls(self, previous, expected))
    return True
204
+
205
def check_in_state(self, *states: ProcessState) -> None:
    """Raise ProcessStateError unless the current state is one of the given states."""
    if self._state in states:
        return

    allowed = ' '.join(s.name for s in states)
    raise ProcessStateError(f'Check failed for {self._config.name}: {self._state.name} not in {allowed}')
211
+
212
+ #
213
+
214
def _check_and_adjust_for_system_clock_rollback(self, test_time):
    """
    Re-anchor this process's timestamps if the system clock has rolled backward beyond test_time.

    Which timestamps are touched depends on the current state. In states other than STARTING, RUNNING, STOPPING and
    BACKOFF nothing is changed.
    """
    current = self._state

    if current == ProcessState.STARTING:
        self._last_start = min(test_time, self._last_start)
        window = self._config.startsecs

    elif current == ProcessState.RUNNING:
        # No pending delay in this state; only last_start is pulled back when test_time falls inside the
        # startsecs window that follows it.
        startsecs = self._config.startsecs
        if self._last_start < test_time < (self._last_start + startsecs):
            self._last_start = test_time - startsecs
        return

    elif current == ProcessState.STOPPING:
        self._last_stop_report = min(test_time, self._last_stop_report)
        window = self._config.stopwaitsecs

    elif current == ProcessState.BACKOFF:
        window = self._backoff

    else:
        return

    # _delay is always set as "time of scheduling + window". If test_time is earlier than that scheduling time the
    # clock went backward, so the deadline is rebuilt relative to test_time.
    if self._delay > 0 and test_time < (self._delay - window):
        self._delay = test_time + window
236
+
237
def stop(self) -> ta.Optional[str]:
    """
    Administratively stop the process by sending the configured stopsignal through kill().

    Marks the stop as administrative and resets the stop-report throttle. Returns whatever kill() returns.
    """
    self._last_stop_report = 0
    self._administrative_stop = True
    stop_sig = self._config.stopsignal
    return self.kill(stop_sig)
241
+
242
def stop_report(self) -> None:
    """Log a 'waiting for x to stop' message while STOPPING, at most once every 2 seconds."""
    if self._state != ProcessState.STOPPING:
        return

    now = time.time()

    self._check_and_adjust_for_system_clock_rollback(now)

    if now <= (self._last_stop_report + 2):  # throttle: every 2 seconds
        return

    log.info('waiting for %s to stop', self.name)
    self._last_stop_report = now
253
+
254
def give_up(self) -> None:
    """
    Abandon start retries: BACKOFF -> FATAL, flagged as a system stop.

    The counters and the flag are reset before the state check, so they are changed even when check_in_state raises
    because the process is not in BACKOFF.
    """
    self._delay, self._backoff = 0, 0
    self._system_stop = True
    self.check_in_state(ProcessState.BACKOFF)
    self.change_state(ProcessState.FATAL)
260
+
261
def kill(self, sig: int) -> ta.Optional[str]:
    """
    Send a signal to the subprocess with the intention to kill it (to make it exit). This may or may not actually
    kill it.

    Return None if the signal was sent, or an error message string if an error occurred or if the subprocess is not
    running. None is also returned when the process was in BACKOFF (it is simply moved to STOPPED) and when the
    target pid no longer exists (ESRCH).

    Raises ProcessStateError (via check_in_state) if a pid is recorded but the state is not RUNNING, STARTING or
    STOPPING.
    """
    now = time.time()

    # If the process is in BACKOFF and we want to stop or kill it, then BACKOFF -> STOPPED. This is needed because
    # if startretries is a large number and the process isn't starting successfully, the stop request would be
    # blocked for a long time waiting for the retries.
    if self._state == ProcessState.BACKOFF:
        log.debug('Attempted to kill %s, which is in BACKOFF state.', self.name)
        self.change_state(ProcessState.STOPPED)
        return None

    args: tuple
    if not self.pid:
        fmt, args = "attempted to kill %s with sig %s but it wasn't running", (self.name, sig_name(sig))
        log.debug(fmt, *args)
        return fmt % args

    # If we're in the stopping state, then we've already sent the stop signal and this is the kill signal
    if self._state == ProcessState.STOPPING:
        killasgroup = self._config.killasgroup
    else:
        killasgroup = self._config.stopasgroup

    # as_group carries its own trailing space, so the format below has no space between '%s' and 'with';
    # otherwise every message would contain a doubled space.
    as_group = 'process group ' if killasgroup else ''

    log.debug('killing %s (pid %s) %swith signal %s', self.name, self.pid, as_group, sig_name(sig))

    # RUNNING/STARTING/STOPPING -> STOPPING
    self._killing = True
    self._delay = now + self._config.stopwaitsecs
    # we will already be in the STOPPING state if we're doing a SIGKILL as a result of overrunning stopwaitsecs
    self.check_in_state(ProcessState.RUNNING, ProcessState.STARTING, ProcessState.STOPPING)
    self.change_state(ProcessState.STOPPING)

    kpid = int(self.pid)
    if killasgroup:
        # send to the whole process group instead
        kpid = -kpid

    try:
        try:
            os.kill(kpid, sig)
        except OSError as exc:
            if exc.errno == errno.ESRCH:
                log.debug('unable to signal %s (pid %s), it probably just exited on its own: %s', self.name, self.pid, str(exc))  # noqa
                # we could change the state here but we intentionally do not. we will do it during normal SIGCHLD
                # processing.
                return None
            raise
    except Exception:  # noqa
        # Any other failure is reported to the caller as a string rather than propagated; the process is parked in
        # UNKNOWN and the pending-kill bookkeeping is undone.
        tb = traceback.format_exc()
        fmt, args = 'unknown problem killing %s (%s):%s', (self.name, self.pid, tb)
        log.critical(fmt, *args)
        self.change_state(ProcessState.UNKNOWN)
        self._killing = False
        self._delay = 0
        return fmt % args

    return None
329
+
330
def signal(self, sig: int) -> ta.Optional[str]:
    """
    Deliver sig to the subprocess without the intention of killing it.

    Returns None when the signal was sent (or the pid had already gone away), otherwise an error message string -
    either because no subprocess is running or because sending failed unexpectedly, in which case the process is
    moved to UNKNOWN.
    """
    message_args: tuple
    if not self.pid:
        template = "Attempted to send %s sig %s but it wasn't running"
        message_args = (self.name, sig_name(sig))
        log.debug(template, *message_args)
        return template % message_args

    log.debug('sending %s (pid %s) sig %s', self.name, self.pid, sig_name(sig))

    self.check_in_state(ProcessState.RUNNING, ProcessState.STARTING, ProcessState.STOPPING)

    # The two-level try is deliberate: a non-ESRCH OSError is re-raised so it reaches the generic handler below.
    try:
        try:
            os.kill(self.pid, sig)
        except OSError as exc:
            if exc.errno != errno.ESRCH:
                raise
            log.debug(
                'unable to signal %s (pid %s), it probably just now exited on its own: %s',
                self.name,
                self.pid,
                str(exc),
            )
            # we could change the state here but we intentionally do not. we will do it during normal SIGCHLD
            # processing.
            return None
    except Exception:  # noqa
        template = 'unknown problem sending sig %s (%s):%s'
        message_args = (self.name, self.pid, traceback.format_exc())
        log.critical(template, *message_args)
        self.change_state(ProcessState.UNKNOWN)
        return template % message_args

    return None
370
+
371
def finish(self, sts: Rc) -> None:
    """
    The process was reaped and we need to report and manage its state.

    sts is passed to decode_wait_status to obtain the exit status and a message. Depending on whether a kill was in
    progress and on how long the process ran, the state becomes STOPPED, BACKOFF or EXITED. Finally the pid is reset
    to 0, the parent-side pipes are closed, and pipes and dispatchers are replaced by empty ones.

    Raises ProcessStateError (via check_in_state) if the current state does not match the branch taken.
    """

    # Let the dispatchers consume whatever output is still pending before they are discarded below.
    self._dispatchers.drain()

    es, msg = decode_wait_status(sts)

    now = time.time()

    self._check_and_adjust_for_system_clock_rollback(now)

    self._last_stop = now

    if now > self._last_start:
        too_quickly = now - self._last_start < self._config.startsecs
    else:
        too_quickly = False
        log.warning(
            "process '%s' (%s) last_start time is in the future, don't know how long process was running so "
            "assuming it did not exit too quickly",
            self.name,
            self.pid,
        )

    exit_expected = es in self._config.exitcodes

    if self._killing:
        # likely the result of a stop request implies STOPPING -> STOPPED
        self._killing = False
        self._delay = 0
        self._exitstatus = Rc(es)

        fmt, args = 'stopped: %s (%s)', (self.name, msg)
        self.check_in_state(ProcessState.STOPPING)
        self.change_state(ProcessState.STOPPED)
        if exit_expected:
            log.info(fmt, *args)
        else:
            log.warning(fmt, *args)

    elif too_quickly:
        # the program did not stay up long enough to make it to RUNNING implies STARTING -> BACKOFF
        self._exitstatus = None
        self._spawn_err = 'Exited too quickly (process log may have details)'
        self.check_in_state(ProcessState.STARTING)
        self.change_state(ProcessState.BACKOFF)
        log.warning('exited: %s (%s)', self.name, msg + '; not expected')

    else:
        # this finish was not the result of a stop request, the program was in the RUNNING state but exited implies
        # RUNNING -> EXITED normally but see next comment
        self._delay = 0
        self._backoff = 0
        # NOTE(review): the killing branch above stores Rc(es) while this one stores es unwrapped - confirm
        # decode_wait_status already returns an Rc, or wrap here for consistency.
        self._exitstatus = es

        # if the process was STARTING but a system time change causes self.last_start to be in the future, the
        # normal STARTING->RUNNING transition can be subverted so we perform the transition here.
        if self._state == ProcessState.STARTING:
            self.change_state(ProcessState.RUNNING)

        self.check_in_state(ProcessState.RUNNING)

        if exit_expected:
            # expected exit code
            self.change_state(ProcessState.EXITED, expected=True)
            log.info('exited: %s (%s)', self.name, msg + '; expected')
        else:
            # unexpected exit code
            self._spawn_err = f'Bad exit code {es}'
            self.change_state(ProcessState.EXITED, expected=False)
            log.warning('exited: %s (%s)', self.name, msg + '; not expected')

    # Common teardown for every branch: forget the pid and release the parent side of the pipes.
    self._pid = Pid(0)
    close_parent_pipes(self._pipes)
    self._pipes = ProcessPipes()
    self._dispatchers = Dispatchers([])
447
+
448
def get_state(self) -> ProcessState:
    """Return the current ProcessState (method form of the state property)."""
    return self._state
450
+
451
def transition(self) -> None:
    """
    Advance this process's state machine by one step based on the current time.

    Depending on the state sampled at entry this may spawn the process (EXITED, never-started STOPPED, or BACKOFF
    whose delay has elapsed), promote STARTING to RUNNING once startsecs have passed, give up on a BACKOFF process
    that exceeded startretries, or send SIGKILL to a STOPPING process whose stop deadline has passed.
    """
    now = time.time()
    # Sampled once: a process spawned below is still evaluated by the checks that follow using this earlier state,
    # so it is not treated as STARTING until the next call.
    state = self._state

    self._check_and_adjust_for_system_clock_rollback(now)

    # NOTE(review): plain alias of the module logger; both `logger` and `log` are used below.
    logger = log

    # presumably supervisor states above RESTARTING are the non-shutdown ones - verify against SupervisorState
    if self.context.state > SupervisorState.RESTARTING:
        # dont start any processes if supervisor is shutting down
        if state == ProcessState.EXITED:
            if self._config.autorestart:
                if self._config.autorestart is RestartUnconditionally:
                    # EXITED -> STARTING
                    self.spawn()
                elif self._exitstatus not in self._config.exitcodes:
                    # EXITED -> STARTING
                    self.spawn()

        elif state == ProcessState.STOPPED and not self._last_start:
            # _last_start is still 0, i.e. this process has never been started
            if self._config.autostart:
                # STOPPED -> STARTING
                self.spawn()

        elif state == ProcessState.BACKOFF:
            if self._backoff <= self._config.startretries:
                if now > self._delay:
                    # BACKOFF -> STARTING
                    self.spawn()

    if state == ProcessState.STARTING:
        if now - self._last_start > self._config.startsecs:
            # STARTING -> RUNNING if the proc has started successfully and it has stayed up for at least
            # proc.config.startsecs,
            self._delay = 0
            self._backoff = 0
            self.check_in_state(ProcessState.STARTING)
            self.change_state(ProcessState.RUNNING)
            msg = ('entered RUNNING state, process has stayed up for > than %s seconds (startsecs)' % self._config.startsecs)  # noqa
            logger.info('success: %s %s', self.name, msg)

    if state == ProcessState.BACKOFF:
        if self._backoff > self._config.startretries:
            # BACKOFF -> FATAL if the proc has exceeded its number of retries
            self.give_up()
            msg = ('entered FATAL state, too many start retries too quickly')
            logger.info('gave up: %s %s', self.name, msg)

    elif state == ProcessState.STOPPING:
        time_left = self._delay - now
        if time_left <= 0:
            # kill processes which are taking too long to stop with a final sigkill. if this doesn't kill it, the
            # process will be stuck in the STOPPING state forever.
            log.warning('killing \'%s\' (%s) with SIGKILL', self.name, self.pid)
            self.kill(signal.SIGKILL)
506
+
507
def after_setuid(self) -> None:
    """Do nothing for now; the automatic child-logfile naming sketched below is disabled."""
    # Disabled sketch - temporary logfiles which are erased at start time:
    #   get_autoname = self.context.get_auto_child_log_name  # noqa
    #   sid = self.context.config.identifier  # noqa
    #   name = self._config.name  # noqa
    #   if self.stdout_logfile is Automatic:
    #       self.stdout_logfile = get_autoname(name, sid, 'stdout')
    #   if self.stderr_logfile is Automatic:
    #       self.stderr_logfile = get_autoname(name, sid, 'stderr')
    return None
@@ -0,0 +1,38 @@
1
+ # ruff: noqa: UP006 UP007
2
+ import abc
3
+ import typing as ta
4
+
5
+ from .utils.users import User
6
+
7
+
8
+ ##
9
+
10
+
11
# Distinct static type over User; presumably lets the supervisor's own user be told apart from other User values
# (e.g. for injection) - confirm against callers.
SupervisorUser = ta.NewType('SupervisorUser', User)
12
+
13
+
14
+ ##
15
+
16
+
17
class DaemonizeListener(abc.ABC):  # noqa
    """
    Base class offering two optional hooks, before_daemonize and after_daemonize.

    Neither hook is abstract: both do nothing unless overridden. When they are invoked is decided by callers that are
    not part of this module.
    """

    def before_daemonize(self) -> None:  # noqa
        """Default hook implementation: do nothing."""
        return None

    def after_daemonize(self) -> None:  # noqa
        """Default hook implementation: do nothing."""
        return None
23
+
24
+
25
# Distinct static type for a sequence of DaemonizeListener objects.
DaemonizeListeners = ta.NewType('DaemonizeListeners', ta.Sequence[DaemonizeListener])
26
+
27
+
28
+ ##
29
+
30
+
31
class SupervisorSetup(abc.ABC):
    """Abstract interface exposing a setup() / cleanup() pair; what each does is defined by implementations."""

    @abc.abstractmethod
    def setup(self) -> None:
        """Must be overridden; the base implementation always raises NotImplementedError."""
        raise NotImplementedError

    @abc.abstractmethod
    def cleanup(self) -> None:
        """Must be overridden; the base implementation always raises NotImplementedError."""
        raise NotImplementedError