salt-api-cli 1.4.3__tar.gz → 1.4.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: salt-api-cli
3
- Version: 1.4.3
3
+ Version: 1.4.4
4
4
  Summary: CLI to access salt-api
5
5
  Author-email: Pradish Bijukchhe <pradish@sandbox.com.np>
6
6
  License-Expression: MIT
@@ -41,7 +41,7 @@ from rich.spinner import Spinner
41
41
  from rich.table import Table
42
42
  from rich.text import Text
43
43
 
44
- from salt_api_cli.lowlevel import split_args
44
+ from salt_api_cli.lowlevel import SaltApiError, split_args
45
45
 
46
46
  console = Console()
47
47
 
@@ -270,15 +270,36 @@ def _print_state_result(result: dict[str, Any]) -> None:
270
270
  # before giving up on minions that never reported. Each poll is a fast,
271
271
  # self-contained request, so the proxy/gateway connection cap never bites.
272
272
  #
273
- # We don't probe minion liveness (saltutil.find_job): an empty probe is
274
- # ambiguous — a busy-but-alive Windows minion mid-highstate can simply fail to
275
- # answer in time and look identical to a down one, so probing wrongly dropped
276
- # live minions. Instead we just poll until every targeted minion has returned
277
- # or _POLL_DEADLINE trips, then render whatever came back. The job keeps
278
- # running on the minions regardless; results stay fetchable later by jid. Press
279
- # Ctrl+C to stop waiting early and render the partial results gathered so far.
273
+ # We can't probe minion liveness mid-job (saltutil.find_job): an empty probe
274
+ # is ambiguous — a busy-but-alive Windows minion mid-highstate can simply fail
275
+ # to answer in time and look identical to a down one, so probing wrongly
276
+ # dropped live minions. Connection-level presence (the manage.present runner)
277
+ # is no better here: it matches connection source IPs against cached minion
278
+ # addresses, which NAT breaks, so it reports every minion absent. What does
279
+ # work is liveness pings (test.ping): one published immediately *before* the
280
+ # real job, while each minion's job loop is still idle, then re-published to
281
+ # whoever stays silent every _PING_INTERVAL. A minion that answers any ping —
282
+ # late answers count too — is provably alive and worth waiting for. One that
283
+ # has ignored the job plus _OFFLINE_PINGS pings (each at least _PING_GRACE
284
+ # old) is tagged "silent", and after _OFFLINE_AFTER of unbroken silence it is
285
+ # presumed offline; once every minion still outstanding is offline we stop
286
+ # waiting. The long fuse matters because of the NAT half-dead case: a minion
287
+ # whose stale connection swallowed the publish (so it never received the job)
288
+ # typically reconnects within a couple of minutes — TCP keepalive bounds it —
289
+ # and answers a fresh ping after ignoring several. On that signature the job
290
+ # is re-sent to it (published jobs are not queued for disconnected minions,
291
+ # so this is never a double run), up to _MAX_RESEND times. Otherwise we poll
292
+ # until every targeted minion has returned or _POLL_DEADLINE trips, then
293
+ # render whatever came back. The job keeps running on the minions regardless;
294
+ # results stay fetchable later by jid. Press Ctrl+C to stop waiting early and
295
+ # render the partial results gathered so far.
280
296
  _POLL_INTERVAL = 3.0
281
297
  _POLL_DEADLINE = 1800.0 # 30 minutes (hard backstop)
298
+ _PING_INTERVAL = 20.0 # re-ping still-silent minions this often
299
+ _PING_GRACE = 15.0 # how long a ping may go unanswered before it counts missed
300
+ _OFFLINE_PINGS = 3 # missed pings before a silent minion is tagged as such
301
+ _OFFLINE_AFTER = 180.0 # unbroken silence before "silent" hardens to offline
302
+ _MAX_RESEND = 1 # times the job is re-sent to a minion that reconnects
282
303
 
283
304
 
284
305
  def _first_return(resp: dict[str, Any]) -> Any:
@@ -303,6 +324,21 @@ def _lookup_returns(raw: Any) -> dict[str, Any]:
303
324
  return cast("dict[str, Any]", inner) if isinstance(inner, dict) else data
304
325
 
305
326
 
327
+ def _submit_async(call: Callable[..., dict[str, Any]], **payload: Any) -> str | None:
328
+ """Submit a ``local_async`` job and return its jid, or ``None`` if the
329
+ submission failed or matched nothing — callers treat that as "no job",
330
+ never as an error worth aborting the run for (these are the auxiliary
331
+ liveness pings and re-sends, not the main job)."""
332
+ try:
333
+ info: Any = _first_return(call("local_async", **payload))
334
+ except SaltApiError:
335
+ return None
336
+ if not isinstance(info, dict):
337
+ return None
338
+ jid = cast("dict[str, Any]", info).get("jid")
339
+ return str(jid) if jid else None
340
+
341
+
306
342
  def _count_cells(counts: dict[str, int]) -> list[Text]:
307
343
  """One right-padded cell per status category, for column alignment in the
308
344
  live view. ``ok``/``failed`` always render; the rest blank when zero so
@@ -341,6 +377,8 @@ def _live_view(
341
377
  returns: dict[str, Any],
342
378
  done: set[str],
343
379
  missing: set[str],
380
+ quiet: set[str],
381
+ offline: set[str],
344
382
  spinner: Spinner,
345
383
  *,
346
384
  n_cells: int,
@@ -348,20 +386,34 @@ def _live_view(
348
386
  ) -> Group:
349
387
  """A live checklist: a tick for finished minions (with ``cells_for`` of
350
388
  their reply in aligned columns), a spinner for the ones still running, an x
351
- for those that never reported, under a one-line status header. ``missing``
352
- is only populated in the final frame (after the deadline or a Ctrl+C); while
353
- polling it's empty, so still-pending minions show a spinner. ``n_cells`` is
354
- how many trailing columns ``cells_for`` produces (so blank rows stay
355
- aligned)."""
389
+ for those that never reported, under a one-line status header. ``quiet``
390
+ is the targeted minions that have ignored several liveness pings (a ? with
391
+ a ``silent`` tag — each might yet be a reconnecting NAT drop); after enough
392
+ unbroken silence they harden into ``offline`` (an x with an ``offline``
393
+ tag). Both sets stay inside the outstanding minions — a returned minion is
394
+ neither. ``missing`` is only populated in the final frame (after the
395
+ deadline or a Ctrl+C); while polling it's empty, so still-pending minions
396
+ show a spinner. ``n_cells`` is how many trailing columns ``cells_for``
397
+ produces (so blank rows stay aligned)."""
356
398
  blanks = [Text("")] * n_cells
399
+ quiet_cells = [Text("silent", style="yellow"), *[Text("")] * (n_cells - 1)]
400
+ offline_cells = [Text("offline", style="red"), *[Text("")] * (n_cells - 1)]
357
401
  grid = Table.grid(padding=(0, 1))
358
402
  grid.add_column(no_wrap=True) # marker
359
403
  grid.add_column(no_wrap=True) # minion id
360
404
  for _ in range(n_cells): # per-command trailing columns
361
405
  grid.add_column(no_wrap=True, justify="left")
362
406
  for minion in targeted:
363
- if minion in missing:
407
+ if minion in offline:
408
+ grid.add_row(
409
+ Text("X", style="red"), Text(minion, style="dim"), *offline_cells
410
+ )
411
+ elif minion in missing:
364
412
  grid.add_row(Text("X", style="red"), Text(minion, style="dim"), *blanks)
413
+ elif minion in quiet:
414
+ grid.add_row(
415
+ Text("?", style="yellow"), Text(minion, style="dim"), *quiet_cells
416
+ )
365
417
  elif minion in done:
366
418
  grid.add_row(
367
419
  Text("+", style="green"), Text(minion), *cells_for(returns.get(minion))
@@ -369,12 +421,18 @@ def _live_view(
369
421
  else:
370
422
  grid.add_row(spinner, Text(minion, style="dim"), *blanks)
371
423
 
372
- pending = len(targeted) - len(done) - len(missing)
424
+ n_missing = len(missing - offline)
425
+ n_quiet = len(quiet - missing - offline)
426
+ pending = len(targeted) - len(done) - n_missing - n_quiet - len(offline)
373
427
  bits = [f"{len(done)}/{len(targeted)} done"]
374
428
  if pending:
375
429
  bits.append(f"{pending} running")
376
- if missing:
377
- bits.append(f"[red]{len(missing)} no response[/]")
430
+ if n_quiet:
431
+ bits.append(f"[yellow]{n_quiet} silent[/]")
432
+ if offline:
433
+ bits.append(f"[red]{len(offline)} offline[/]")
434
+ if n_missing:
435
+ bits.append(f"[red]{n_missing} no response[/]")
378
436
  header = Text.from_markup(f"[dim]{' '.join(bits)}[/]")
379
437
  return Group(header, grid)
380
438
 
@@ -385,19 +443,27 @@ def _stream_job(
385
443
  *,
386
444
  n_cells: int,
387
445
  cells_for: Callable[[Any], list[Text]],
388
- ) -> tuple[dict[str, Any], set[str], float, bool] | None:
446
+ ) -> tuple[dict[str, Any], set[str], set[str], float, bool] | None:
389
447
  """Fire a job async, show a live checklist, and return its raw results.
390
448
 
391
- Submits ``payload`` via the ``local_async`` client (returns a job id at
392
- once), then polls ``runner jobs.lookup_jid`` until every targeted minion
393
- has returned, the deadline trips, or the user hits Ctrl+C. While polling it
394
- shows a live per-minion checklist (spinner -> tick), whose trailing columns
395
- come from ``cells_for(value)`` (``n_cells`` of them). In every case it then
396
- renders the final checklist frame and returns ``(returns, outstanding,
397
- start, interrupted)`` ``outstanding`` being the targeted minions that
398
- never reported for the caller to render, or ``None`` if no job started
399
- (already reported). ``call(name, **kw)`` invokes the named salt-api
400
- client."""
449
+ Submits a liveness ping then ``payload`` via the ``local_async`` client
450
+ (returns a job id at once), then polls ``runner jobs.lookup_jid`` until
451
+ every targeted minion has returned, everyone still outstanding is presumed
452
+ offline (ignored the job plus _OFFLINE_PINGS liveness pings, for at least
453
+ _OFFLINE_AFTER), the deadline trips, or the user hits Ctrl+C. Silent
454
+ minions are re-pinged every _PING_INTERVAL, and one that ignores several
455
+ pings then answers a later one just reconnected after missing the publish —
456
+ the job is re-sent to it (see the comment above _POLL_INTERVAL). While
457
+ polling it shows a live per-minion checklist (spinner -> tick, silent and
458
+ offline minions tagged), whose trailing
459
+ columns come from ``cells_for(value)`` (``n_cells`` of them). In
460
+ every case it then renders the final checklist frame and returns
461
+ ``(returns, outstanding, offline, start, interrupted)`` — ``outstanding``
462
+ being the targeted minions that never reported and ``offline`` the subset
463
+ of those presumed unreachable — for the caller to render, or ``None`` if
464
+ no job started (already reported). ``call(name, **kw)`` invokes the named
465
+ salt-api client."""
466
+ ping_jid = _submit_async(call, tgt=payload["tgt"], fun="test.ping")
401
467
  submit = call("local_async", **payload)
402
468
  info: Any = _first_return(submit)
403
469
  jid = info.get("jid")
@@ -421,6 +487,8 @@ def _stream_job(
421
487
  console.print(f"[dim]job {jid} -> {len(targeted)} minion(s)[/]")
422
488
  start = time.monotonic()
423
489
  returns: dict[str, Any] = {}
490
+ quiet: set[str] = set()
491
+ offline: set[str] = set()
424
492
  spinner = Spinner("dots", style="cyan")
425
493
 
426
494
  def view(missing: set[str] | None = None) -> Group:
@@ -430,30 +498,109 @@ def _stream_job(
430
498
  returns,
431
499
  done,
432
500
  missing or set(),
501
+ quiet,
502
+ offline,
433
503
  spinner,
434
504
  n_cells=n_cells,
435
505
  cells_for=cells_for,
436
506
  )
437
507
 
438
- # Poll lookup_jid until everyone's back or the deadline trips; Ctrl+C stops
439
- # waiting early. The job keeps running on the minions either way — we just
440
- # stop watching and render whatever was gathered. transient=False keeps the
441
- # finished checklist on screen above the rendered tables.
508
+ # Poll lookup_jid until everyone's back, everyone left is offline, or the
509
+ # deadline trips; Ctrl+C stops waiting early. The job keeps running on the
510
+ # minions either way — we just stop watching and render whatever was
511
+ # gathered. transient=False keeps the finished checklist on screen above
512
+ # the rendered tables.
442
513
  interrupted = False
514
+ jids = [jid] # the job, plus any re-sends to reconnected minions
515
+ alive: set[str] = set() # answered some liveness ping
516
+ resent: dict[str, int] = {} # minion -> times the job was re-sent to it
517
+ reset_at: dict[str, float] = {} # ignore pings before this (post re-send)
518
+ # Each ping round: its jid, publish time, and who it targeted. Round 0 is
519
+ # the pre-job ping at the original target expression.
520
+ rounds: list[tuple[str, float, set[str]]] = []
521
+ if ping_jid:
522
+ rounds.append((ping_jid, start, expected))
443
523
  with Live(console=console, refresh_per_second=12, transient=False) as live:
444
524
  try:
445
525
  while True:
446
526
  # lookup_jid is cumulative: each poll returns every minion that
447
- # has reported so far, so we just keep the latest snapshot.
448
- returns = _lookup_returns(
449
- _first_return(
450
- call("runner", fun="jobs.lookup_jid", kwarg={"jid": jid})
527
+ # has reported so far; merge the snapshots across all jids.
528
+ for j in jids:
529
+ returns.update(
530
+ _lookup_returns(
531
+ _first_return(
532
+ call("runner", fun="jobs.lookup_jid", kwarg={"jid": j})
533
+ )
534
+ )
451
535
  )
452
- )
536
+ outstanding = expected - set(returns)
537
+ now = time.monotonic()
538
+ # Collect liveness answers from the newest ping rounds (late
539
+ # answers count; a reconnected minion only ever receives the
540
+ # newest, so polling further back buys nothing).
541
+ for rjid, _, targets in rounds[-2:]:
542
+ if targets - alive - set(returns):
543
+ answers = _lookup_returns(
544
+ _first_return(
545
+ call(
546
+ "runner",
547
+ fun="jobs.lookup_jid",
548
+ kwarg={"jid": rjid},
549
+ )
550
+ )
551
+ )
552
+ alive |= expected & set(answers)
553
+ # The reconnect signature: tagged silent (ignored several
554
+ # pings and the job — the publish never reached it), now
555
+ # answering. Re-send the job to it and make it re-prove
556
+ # liveness from scratch, so a second drop re-runs this cycle.
557
+ recovered = {m for m in quiet & alive if resent.get(m, 0) < _MAX_RESEND}
558
+ if recovered:
559
+ rejid = _submit_async(
560
+ call,
561
+ **{**payload, "tgt": sorted(recovered), "tgt_type": "list"},
562
+ )
563
+ if rejid:
564
+ jids.append(rejid)
565
+ for m in recovered:
566
+ resent[m] = resent.get(m, 0) + 1
567
+ reset_at[m] = now
568
+ alive -= recovered
569
+ # Silent: missed _OFFLINE_PINGS pings (each old enough that an
570
+ # answer would have arrived) plus the job. Offline: silent for
571
+ # _OFFLINE_AFTER straight — long enough to have reconnected
572
+ # and answered a fresh ping, were it a NAT-dropped connection.
573
+ quiet = {
574
+ m
575
+ for m in outstanding - alive
576
+ if sum(
577
+ 1
578
+ for _, t, targets in rounds
579
+ if m in targets
580
+ and t >= reset_at.get(m, -1.0)
581
+ and now - t >= _PING_GRACE
582
+ )
583
+ >= _OFFLINE_PINGS
584
+ }
585
+ offline = {
586
+ m for m in quiet if now - reset_at.get(m, start) >= _OFFLINE_AFTER
587
+ }
588
+ # Re-ping whoever is still silent, so slow answers, reconnects,
589
+ # and genuinely-down minions keep accumulating evidence.
590
+ silent = outstanding - alive
591
+ last_round = rounds[-1][1] if rounds else start
592
+ if silent and now - last_round >= _PING_INTERVAL:
593
+ rjid = _submit_async(
594
+ call, tgt=sorted(silent), tgt_type="list", fun="test.ping"
595
+ )
596
+ if rjid:
597
+ rounds.append((rjid, now, set(silent)))
453
598
  live.update(view())
454
- if not expected - set(returns):
599
+ if not outstanding:
600
+ break
601
+ if offline == outstanding:
455
602
  break
456
- if time.monotonic() - start > _POLL_DEADLINE:
603
+ if now - start > _POLL_DEADLINE:
457
604
  break
458
605
  time.sleep(_POLL_INTERVAL)
459
606
  except KeyboardInterrupt:
@@ -461,23 +608,42 @@ def _stream_job(
461
608
  # Final frame: mark whoever never reported so the persisted checklist
462
609
  # reflects the true end state rather than a frozen spinner.
463
610
  outstanding = expected - set(returns)
611
+ offline &= outstanding
464
612
  live.update(view(outstanding))
465
613
 
466
- return returns, expected - set(returns), start, interrupted
614
+ if resent:
615
+ names = ", ".join(sorted(resent, key=_natural_key))
616
+ console.print(
617
+ f"[dim]re-sent the job to {names} - reconnected after missing the "
618
+ f"original publish[/]"
619
+ )
620
+ return returns, expected - set(returns), offline, start, interrupted
467
621
 
468
622
 
469
- def _print_outstanding(outstanding: set[str], interrupted: bool) -> None:
623
+ def _print_outstanding(
624
+ outstanding: set[str], offline: set[str], interrupted: bool
625
+ ) -> None:
470
626
  """Trailer naming the minions that hadn't reported when we stopped waiting
471
- — because the user interrupted, or the deadline tripped."""
627
+ — because the user interrupted, everyone left was offline, or the deadline
628
+ tripped."""
472
629
  if not outstanding:
473
630
  return
474
- names = ", ".join(sorted(outstanding, key=_natural_key))
475
631
  if interrupted:
632
+ names = ", ".join(sorted(outstanding, key=_natural_key))
476
633
  console.print(
477
634
  f"[yellow]stopped waiting (Ctrl+C); no result yet from: {names} "
478
635
  f"- the job may still be running on them[/]"
479
636
  )
480
- else:
637
+ return
638
+ if offline:
639
+ names = ", ".join(sorted(offline, key=_natural_key))
640
+ console.print(
641
+ f"[yellow]no result from: {names} - ignored the job and repeated "
642
+ f"liveness pings, so presumed offline; the job never reached them[/]"
643
+ )
644
+ waiting = outstanding - offline
645
+ if waiting:
646
+ names = ", ".join(sorted(waiting, key=_natural_key))
481
647
  console.print(
482
648
  f"[yellow]no result from: {names} within the "
483
649
  f"{int(_POLL_DEADLINE)}s deadline (still running, or down)[/]"
@@ -490,11 +656,11 @@ def _stream_state(call: Callable[..., dict[str, Any]], payload: dict[str, Any])
490
656
  result = _stream_job(call, payload, n_cells=5, cells_for=_state_cells)
491
657
  if result is None:
492
658
  return
493
- returns, outstanding, start, interrupted = result
659
+ returns, outstanding, offline, start, interrupted = result
494
660
 
495
661
  # Live view cleared — render the coloured tables, one block per minion.
496
662
  _print_state_result({"return": [returns]})
497
- _print_outstanding(outstanding, interrupted)
663
+ _print_outstanding(outstanding, offline, interrupted)
498
664
 
499
665
  # Fleet-wide summary: totals across all minions + wall-clock elapsed.
500
666
  totals, n = _grand_totals(returns)
@@ -654,10 +820,10 @@ def _stream_cmd(call: Callable[..., dict[str, Any]], payload: dict[str, Any]) ->
654
820
  result = _stream_job(call, payload, n_cells=1, cells_for=_cmd_cells)
655
821
  if result is None:
656
822
  return
657
- returns, outstanding, start, interrupted = result
823
+ returns, outstanding, offline, start, interrupted = result
658
824
 
659
825
  _print_cmd_result({"return": [returns]})
660
- _print_outstanding(outstanding, interrupted)
826
+ _print_outstanding(outstanding, offline, interrupted)
661
827
 
662
828
  n = len(returns)
663
829
  if n:
@@ -0,0 +1 @@
1
+ __version__ = "1.4.4"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: salt-api-cli
3
- Version: 1.4.3
3
+ Version: 1.4.4
4
4
  Summary: CLI to access salt-api
5
5
  Author-email: Pradish Bijukchhe <pradish@sandbox.com.np>
6
6
  License-Expression: MIT
@@ -1 +0,0 @@
1
- __version__ = "1.4.3"
File without changes
File without changes
File without changes