loki-mode 7.61.0 → 7.63.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/SKILL.md CHANGED
@@ -3,7 +3,7 @@ name: loki-mode
3
3
  description: Autonomous spec-driven build system with a built-in trust layer. It does not call work done until it is verified (RARV-C closure loop, 8 quality gates, completion council, verified-completion evidence gate). Triggers on "Loki Mode". Takes a spec (PRD, GitHub issue, OpenAPI doc, etc.) to deployed product with minimal human intervention. Provider-agnostic. Requires --dangerously-skip-permissions flag.
4
4
  ---
5
5
 
6
- # Loki Mode v7.61.0
6
+ # Loki Mode v7.63.0
7
7
 
8
8
  **You are an autonomous agent. You make decisions. You do not ask questions. You do not stop.**
9
9
 
@@ -406,4 +406,4 @@ See `CHANGELOG.md` entries [7.5.7], [7.5.8], [7.5.13] for the per-fix list and r
406
406
 
407
407
  ---
408
408
 
409
- **v7.61.0 | [Autonomi](https://www.autonomi.dev/) flagship product | ~260 lines core**
409
+ **v7.63.0 | [Autonomi](https://www.autonomi.dev/) flagship product | ~260 lines core**
package/VERSION CHANGED
@@ -1 +1 @@
1
- 7.61.0
1
+ 7.63.0
@@ -1374,20 +1374,59 @@ app_runner_health_check() {
1374
1374
  return 1
1375
1375
  fi
1376
1376
 
1377
- # For HTTP apps, try an HTTP health check
1377
+ # For HTTP apps, try an HTTP health check.
1378
1378
  if [ -n "$_APP_RUNNER_PORT" ] && [ "$_APP_RUNNER_PORT" -gt 0 ] 2>/dev/null; then
1379
- if curl -sf -o /dev/null -m 5 "http://localhost:${_APP_RUNNER_PORT}/" 2>/dev/null; then
1379
+ # The health signal is "is the server answering HTTP at all", NOT "does /
1380
+ # return 2xx". Loki generates plenty of apps that legitimately serve a
1381
+ # non-2xx on the root path (an API-only FastAPI/Express backend 404s on
1382
+ # `/`, anything behind auth 401s). Those are serving correctly, so a
1383
+ # status-strict probe (curl -f, which fails on >=400) would mark a healthy
1384
+ # backend unhealthy and trigger a needless restart -> a restart storm /
1385
+ # false crash. What genuinely means "no longer serving" -- a hung event
1386
+ # loop, a deadlock, a wedged dev server -- is a connection that times out
1387
+ # or is refused, i.e. NO HTTP response at all. So we read the HTTP status
1388
+ # code: any code returned (2xx/3xx/4xx/5xx) means the server answered and
1389
+ # is alive; "000" is curl's sentinel for connect-failure/timeout/reset
1390
+ # and is the only thing we treat as a crash.
1391
+ # If curl is unavailable we cannot probe HTTP at all; fall back to the
1392
+ # old, more tolerant signal (PID alive == healthy) rather than declaring
1393
+ # every HTTP app wedged and triggering a restart storm. curl is the only
1394
+ # HTTP client this function uses.
1395
+ if ! command -v curl >/dev/null 2>&1; then
1380
1396
  _write_health "true"
1381
1397
  _write_app_state "running"
1382
1398
  return 0
1383
- else
1384
- # HTTP failed but process alive -- may be a non-HTTP app or still starting
1399
+ fi
1400
+ # On connect-failure/timeout curl already prints "000" via %{http_code}
1401
+ # and exits non-zero; do NOT append our own "000" (a `|| echo 000` would
1402
+ # concatenate to "000000"). The trailing `|| true` swallows the non-zero
1403
+ # exit (matching this file's guarded command-substitution convention, e.g.
1404
+ # the _GIT_DIFF_HASH / port reads) so the watchdog never aborts under a
1405
+ # future `set -e`; the empty fallback then maps to "000".
1406
+ local _http_code
1407
+ _http_code=$(curl -s -o /dev/null -m 5 -w '%{http_code}' \
1408
+ "http://localhost:${_APP_RUNNER_PORT}/" 2>/dev/null || true)
1409
+ _http_code="${_http_code:-000}"
1410
+ if [ "$_http_code" != "000" ]; then
1385
1411
  _write_health "true"
1412
+ _write_app_state "running"
1386
1413
  return 0
1414
+ else
1415
+ # No HTTP response: the process is alive (kill -0 passed above) but is
1416
+ # not serving on its declared port -- a wedged/hung/deadlocked server.
1417
+ # Previously this branch wrote ok:true unconditionally, so the HTTP
1418
+ # signal could never report a failure and a wedged server stayed
1419
+ # "healthy" forever. Report the failure honestly so the watchdog can
1420
+ # act on it. We deliberately do NOT flip state.json to "crashed" here
1421
+ # (mirroring the dead-PID precedent above at the kill -0 check); the
1422
+ # watchdog owns the crashed transition after its circuit breaker, so a
1423
+ # single transient blip does not prematurely mark the app crashed.
1424
+ _write_health "false"
1425
+ return 1
1387
1426
  fi
1388
1427
  fi
1389
1428
 
1390
- # Non-HTTP: PID alive is sufficient
1429
+ # Non-HTTP: PID alive is sufficient (no URL/port to probe)
1391
1430
  _write_health "true"
1392
1431
  return 0
1393
1432
  }
@@ -1477,17 +1516,35 @@ app_runner_watchdog() {
1477
1516
  return 0
1478
1517
  fi
1479
1518
 
1480
- # Process alive, nothing to do
1519
+ # Process alive: kill -0 only proves the PID exists, not that the app is
1520
+ # actually serving. A hung event loop, a deadlock, or a wedged dev server
1521
+ # all pass kill -0 forever while never answering a request, so the old
1522
+ # "alive == healthy" shortcut let a wedged HTTP app run un-restarted and
1523
+ # left health.json stale. Mirror the compose branch: defer to
1524
+ # app_runner_health_check (HTTP-aware for apps that declared a port), and
1525
+ # treat an unhealthy-but-alive process as a crash so the same circuit
1526
+ # breaker + backoff + restart path handles it.
1481
1527
  if kill -0 "$_APP_RUNNER_PID" 2>/dev/null; then
1482
- # BUG 3 fix: a confirmed-alive observation clears the accumulated crash
1483
- # count so the breaker fires only on 5 CONSECUTIVE deaths, not on 5
1484
- # cumulative crashes that were each successfully recovered over a long
1485
- # session (which would trip the breaker on a HEALTHY app).
1486
- _APP_RUNNER_CRASH_COUNT=0
1487
- return 0
1528
+ if app_runner_health_check; then
1529
+ # BUG 3 fix: a confirmed-healthy observation clears the accumulated
1530
+ # crash count so the breaker fires only on 5 CONSECUTIVE failures,
1531
+ # not on 5 cumulative crashes that were each successfully recovered
1532
+ # over a long session (which would trip the breaker on a HEALTHY app).
1533
+ _APP_RUNNER_CRASH_COUNT=0
1534
+ return 0
1535
+ fi
1536
+ # Alive but not healthy (e.g. HTTP probe failed for an app that declared
1537
+ # a port). Fall through to the crash path below, but first terminate the
1538
+ # wedged process: it is still bound to the port, so app_runner_start's
1539
+ # port-conflict guard would otherwise refuse to start and the breaker
1540
+ # would trip while the orphan keeps serving hung responses (a restart
1541
+ # storm). app_runner_stop performs a full process-tree teardown and
1542
+ # clears _APP_RUNNER_PID / app.pid, leaving a clean slate for restart.
1543
+ log_warn "App Runner: process alive but unhealthy (not serving) -- treating as crash"
1544
+ app_runner_stop
1488
1545
  fi
1489
1546
 
1490
- # Process is dead
1547
+ # Process is dead (or was just torn down because it was alive-but-wedged)
1491
1548
  _APP_RUNNER_CRASH_COUNT=$(( _APP_RUNNER_CRASH_COUNT + 1 ))
1492
1549
  log_warn "App Runner: process died (crash #$_APP_RUNNER_CRASH_COUNT)"
1493
1550
 
@@ -199,19 +199,11 @@ loki_docker_build_argv() {
199
199
  argv+=(-e "LOKI_SKIP_PROJECT_REGISTRY=1")
200
200
  # Deterministic per-host-path container name: two repos get two distinct
201
201
  # concurrent containers (multi-repo parity with the host CLI) and a stable
202
- # handle for `loki docker stop`. Hash the workspace path; fall back to a
203
- # sanitized basename if no sha tool is present.
204
- local _name_hash=""
205
- if command -v shasum >/dev/null 2>&1; then
206
- _name_hash="$(printf '%s' "$workspace" | shasum -a 256 2>/dev/null | cut -c1-12)"
207
- elif command -v sha256sum >/dev/null 2>&1; then
208
- _name_hash="$(printf '%s' "$workspace" | sha256sum 2>/dev/null | cut -c1-12)"
209
- fi
210
- if [ -n "$_name_hash" ]; then
211
- argv+=(--name "loki-${_name_hash}")
212
- else
213
- argv+=(--name "loki-$(basename "$workspace" | tr -c 'A-Za-z0-9_.-' '_')")
214
- fi
202
+ # handle for `loki docker stop`. Computed by loki_docker_container_name
203
+ # (single source of truth so cmd_stop reaps the exact same name): sha12 of
204
+ # the workspace path, with a sanitized-basename fallback if no sha tool is
205
+ # present.
206
+ argv+=(--name "$(loki_docker_container_name "$workspace")")
215
207
  # The container IS the session boundary, so the runner must NOT setsid into
216
208
  # a new, detached session: setsid detach inside a `--rm` container makes the
217
209
  # `docker run` exit code report 0 even when the runner failed (a user with
@@ -241,3 +233,213 @@ loki_docker_build_argv() {
241
233
 
242
234
  printf '%s\n' "${argv[@]}"
243
235
  }
236
+
237
+ #===============================================================================
238
+ # Wave-4 Docker helpers (FEAT-DOCKER-DASH / FEAT-DOCKER-PRUNE / FIX-DOCKER-STOP)
239
+ #
240
+ # These are called by autonomy/loki (cmd_docker, cmd_stop). The CLI guards each
241
+ # call with `declare -F <name>` so missing helpers degrade gracefully, but the
242
+ # NAMES below are part of the contract and must not change.
243
+ #
244
+ # Call contracts (Agent A call sites must match exactly):
245
+ #
246
+ # loki_docker_pick_host_port
247
+ # args: none
248
+ # stdout: a single free host port number (nothing else)
249
+ # port precedence: ${DASHBOARD_DEFAULT_PORT:-${LOKI_DASHBOARD_PORT:-57374}};
250
+ # if that port is bound, increments to the next free port (up to 50 tries).
251
+ #
252
+ # loki_docker_pull_and_prune
253
+ # args: none (reads $LOKI_DOCKER_IMAGE, $LOKI_DOCKER_PRUNE)
254
+ # gate: LOKI_DOCKER_PRUNE (default 1). When 0 -> returns 0 immediately,
255
+ # no docker pull and no prune.
256
+ # side effects: docker pull, then best-effort rmi of OLD/unused
257
+ # asklokesh/loki-mode images only. Prints an honest summary.
258
+ # return: always 0 (best-effort; partial rmi failure is non-fatal).
259
+ #
260
+ # loki_docker_write_runstate <container> <image> [project_dir]
261
+ # args: $1 container name, $2 image ref, $3 project dir (default $(pwd))
262
+ # side effect: atomically writes <project_dir>/.loki/docker/run.json:
263
+ # {"container","image","project_dir","started_at"(ISO8601 UTC)}
264
+ # return: 0 on success, non-zero if the file could not be written.
265
+ #
266
+ # loki_docker_clear_runstate [project_dir]
267
+ # args: $1 project dir (default $(pwd))
268
+ # side effect: rm -f <project_dir>/.loki/docker/run.json (no error if absent)
269
+ # return: always 0.
270
+ #
271
+ # loki_docker_container_name [workspace_path]
272
+ # args: $1 workspace path (default $(pwd))
273
+ # stdout: the deterministic container name loki-<sha12 of path>, identical
274
+ # to the name loki_docker_build_argv assigns (basename fallback if no
275
+ # sha tool is present). Nothing else on stdout.
276
+ #===============================================================================
277
+
278
# Reduce a docker image identifier to the short form used for comparisons.
#
# A leading "sha256:" prefix is dropped when present and at most the first 12
# characters of the remainder are kept. The prune logic receives ids in both
# the long prefixed form and the short form, so both sides of every comparison
# must pass through here; otherwise the ":latest" / in-use exclusions would
# never match and the wrong image could be removed.
#
#   $1      image id, with or without the "sha256:" prefix
#   stdout  the normalized id (no trailing newline)
_loki_docker_norm_id() {
    local raw_id="$1"
    local bare_id="${raw_id#sha256:}"
    printf '%s' "${bare_id:0:12}"
}
288
+
289
# Probe seam: succeeds (status 0) when host port $1 is FREE, fails when it is
# bound. Kept as its own function so tests can override it without binding a
# real socket.
#
# Probe order: lsof, then nc, then a bash /dev/tcp connect, so there is no hard
# dependency on lsof.
#
#   $1      host TCP port number to probe
#   return  0 if nothing is listening on the port, non-zero otherwise
_loki_docker_port_free() {
    local port="$1"
    if command -v lsof >/dev/null 2>&1; then
        # Only a LISTENing TCP socket makes the port unavailable for binding.
        # A bare `lsof -i :PORT` also matches established/outbound connections
        # (and UDP sockets) that merely involve the same port number, which
        # would report a free port as busy. -nP skips host/service name
        # lookups so the probe cannot stall on DNS.
        ! lsof -nP -iTCP:"$port" -sTCP:LISTEN >/dev/null 2>&1
    elif command -v nc >/dev/null 2>&1; then
        # nc -z succeeds when a connection is accepted -> port busy.
        ! nc -z 127.0.0.1 "$port" >/dev/null 2>&1
    else
        # /dev/tcp connect: success means something is listening -> port busy.
        # The subshell confines fd 3 so nothing leaks into the caller.
        ! (exec 3<>"/dev/tcp/127.0.0.1/${port}") >/dev/null 2>&1
    fi
}
304
+
305
# Echo a host port for the dashboard: the preferred port if it is free,
# otherwise the next free port above it.
#
# Port precedence: ${DASHBOARD_DEFAULT_PORT:-${LOKI_DASHBOARD_PORT:-57374}}.
# The value comes from the environment, so it is validated before it reaches
# arithmetic: an empty, non-numeric or out-of-range (not 1..65535) value falls
# back to 57374, and a leading zero is read as decimal rather than octal.
#
#   args    none
#   stdout  exactly one port number
#   stderr  a warning when no free port was found within the search window
#   return  always 0 (the last probed port is still printed on exhaustion, so
#           callers that only read stdout keep working)
loki_docker_pick_host_port() {
    local default_port=57374
    local port="${DASHBOARD_DEFAULT_PORT:-${LOKI_DASHBOARD_PORT:-$default_port}}"

    case "$port" in
        ''|*[!0-9]*) port="$default_port" ;;   # empty or non-digit -> default
        *) port=$((10#$port)) ;;               # force base 10 ("08080" -> 8080)
    esac
    if [ "$port" -lt 1 ] || [ "$port" -gt 65535 ]; then
        port="$default_port"
    fi

    local attempts=0
    until _loki_docker_port_free "$port"; do
        # Give up after 50 increments or at the top of the port range.
        if [ "$attempts" -ge 50 ] || [ "$port" -ge 65535 ]; then
            echo "loki_docker_pick_host_port: no free port found; using ${port} anyway" >&2
            break
        fi
        port=$((port + 1))
        attempts=$((attempts + 1))
    done
    printf '%s\n' "$port"
}
315
+
316
# Pull the loki-mode image, then remove ONLY old/unused asklokesh/loki-mode
# images.
#
# Triple-scoped safety: (1) a reference filter limits enumeration to
# asklokesh/loki-mode, (2) the just-pulled image id is excluded, (3) any image
# referenced by a running container is excluded. NEVER `docker image prune -a`.
#
# The sets are kept as space-delimited strings plus an indexed array instead of
# associative arrays, so the function also works on bash 3.2 (where `local -A`
# is not available).
#
#   args    none; reads $LOKI_DOCKER_IMAGE (default asklokesh/loki-mode:latest)
#           and $LOKI_DOCKER_PRUNE (default 1; 0 skips both pull and prune)
#   stdout  progress line and an honest summary of what was reclaimed
#   stderr  the reason whenever the pull or prune is skipped
#   return  always 0 (best-effort; a failed rmi is non-fatal)
loki_docker_pull_and_prune() {
    local image="${LOKI_DOCKER_IMAGE:-asklokesh/loki-mode:latest}"
    local prune="${LOKI_DOCKER_PRUNE:-1}"

    # Opt-out: skip the explicit pull AND the prune entirely.
    if [ "$prune" = "0" ]; then
        return 0
    fi

    if ! command -v docker >/dev/null 2>&1; then
        echo "loki_docker_pull_and_prune: docker not found; skipping" >&2
        return 0
    fi

    echo "Pulling ${image} ..."
    docker pull "$image" >/dev/null 2>&1 || {
        echo "loki_docker_pull_and_prune: pull failed; skipping prune" >&2
        return 0
    }

    # Resolve the just-pulled image id. Without it the fresh image cannot be
    # excluded from the removal set, so bail rather than risk deleting it.
    local latest_raw latest_id
    latest_raw="$(docker inspect --format '{{.Id}}' "$image" 2>/dev/null)"
    if [ -z "$latest_raw" ]; then
        echo "loki_docker_pull_and_prune: could not resolve pulled image id; skipping prune" >&2
        return 0
    fi
    latest_id="$(_loki_docker_norm_id "$latest_raw")"

    # In-use set: normalized image id AND image reference of every running
    # container, stored as " item item ... " so membership is a literal
    # substring test on " item ".
    local in_use=" "
    local _iid _iname
    while IFS=' ' read -r _iid _iname; do
        [ -n "$_iid" ] && in_use="${in_use}$(_loki_docker_norm_id "$_iid") "
        [ -n "$_iname" ] && in_use="${in_use}${_iname} "
    done < <(docker ps --format '{{.ImageID}} {{.Image}}' 2>/dev/null)

    # Candidates: asklokesh/loki-mode images only (tagged + dangling), each
    # normalized id listed once. `seen` is the dedup set, `candidates` keeps
    # the ids for iteration.
    local seen=" "
    local candidates=()
    local _id _norm
    while read -r _id; do
        [ -n "$_id" ] || continue
        _norm="$(_loki_docker_norm_id "$_id")"
        [ -n "$_norm" ] || continue
        case "$seen" in
            *" ${_norm} "*) ;;
            *)
                seen="${seen}${_norm} "
                candidates+=("$_norm")
                ;;
        esac
    done < <(
        docker images --filter 'reference=asklokesh/loki-mode' --format '{{.ID}}' 2>/dev/null
        docker images --filter 'reference=asklokesh/loki-mode' --filter 'dangling=true' -q 2>/dev/null
    )

    # rmi each candidate that is NOT the just-pulled image AND NOT in use.
    # The ${arr[@]+...} form keeps an empty array safe under `set -u`.
    local reclaimed=0 cand
    for cand in ${candidates[@]+"${candidates[@]}"}; do
        if [ -n "$latest_id" ] && [ "$cand" = "$latest_id" ]; then
            continue
        fi
        case "$in_use" in
            *" ${cand} "*) continue ;;
        esac
        if docker rmi "$cand" >/dev/null 2>&1; then
            reclaimed=$((reclaimed + 1))
        fi
    done

    if [ "$reclaimed" -gt 0 ]; then
        echo "Reclaimed ${reclaimed} old loki-mode image(s)."
    else
        echo "Image cleanup: nothing to reclaim."
    fi
    return 0
}
391
+
392
# Escape a string for use inside a JSON double-quoted string: backslash and
# double quote are backslash-escaped, and newline / carriage return / tab are
# written as \n, \r, \t. Prints the result without a trailing newline.
_loki_docker_json_escape() {
    local s="$1"
    s=${s//\\/\\\\}        # backslash first, so later escapes are not doubled
    s=${s//\"/\\\"}
    s=${s//$'\n'/\\n}
    s=${s//$'\r'/\\r}
    s=${s//$'\t'/\\t}
    printf '%s' "$s"
}

# Write <project_dir>/.loki/docker/run.json atomically.
#
# The file holds {"container","image","project_dir","started_at"}, where
# started_at is an ISO8601 UTC timestamp. Every value is JSON-escaped, so a
# project path containing a double quote or a backslash still produces a
# document that a JSON parser can read back.
#
#   $1      container name (required)
#   $2      image reference (required)
#   $3      project directory (default: $(pwd))
#   return  0 on success, 2 if an argument is missing or the file could not be
#           written
loki_docker_write_runstate() {
    local container="$1"
    local image="$2"
    local project_dir="${3:-$(pwd)}"
    [ -n "$container" ] || { echo "loki_docker_write_runstate: missing container" >&2; return 2; }
    [ -n "$image" ] || { echo "loki_docker_write_runstate: missing image" >&2; return 2; }

    local dir="${project_dir}/.loki/docker"
    mkdir -p "$dir" || return 2

    local started_at
    started_at="$(date -u +%Y-%m-%dT%H:%M:%SZ 2>/dev/null)"

    # The tmp file lives in the SAME dir as the target so `mv` is an atomic
    # rename, not a cross-device copy.
    local tmp="${dir}/.run.json.$$"
    {
        printf '{\n'
        printf ' "container": "%s",\n' "$(_loki_docker_json_escape "$container")"
        printf ' "image": "%s",\n' "$(_loki_docker_json_escape "$image")"
        printf ' "project_dir": "%s",\n' "$(_loki_docker_json_escape "$project_dir")"
        printf ' "started_at": "%s"\n' "$(_loki_docker_json_escape "$started_at")"
        printf '}\n'
    } > "$tmp" || { rm -f "$tmp" 2>/dev/null; return 2; }

    mv -f "$tmp" "${dir}/run.json" || { rm -f "$tmp" 2>/dev/null; return 2; }
    return 0
}
421
+
422
# Delete the docker run-state record (<project_dir>/.loki/docker/run.json).
# A record that is already gone is not an error.
#
#   $1      project directory (default: $(pwd))
#   return  always 0
loki_docker_clear_runstate() {
    local project_dir="${1:-$(pwd)}"
    local state_file="${project_dir}/.loki/docker/run.json"
    rm -f "$state_file" 2>/dev/null
    return 0
}
428
+
429
# Echo the deterministic container name for a workspace path.
#
# The name is "loki-" followed by the first 12 hex characters of the SHA-256 of
# the path. shasum is preferred and sha256sum is used only when shasum is not
# installed. When no digest could be produced, the name falls back to "loki-"
# plus the path's basename with every character outside A-Za-z0-9_.- replaced
# by "_". Note that this replacement also covers the newline basename prints,
# so a fallback name ends in "_".
#
# Callers rely on this output being stable for a given path: it is how a
# container started for a folder is later found again by name.
#
#   $1      workspace path (default: $(pwd))
#   stdout  the container name followed by a newline, nothing else
loki_docker_container_name() {
    local workspace="${1:-$(pwd)}"
    local digest=""

    if command -v shasum >/dev/null 2>&1; then
        digest="$(printf '%s' "$workspace" | shasum -a 256 2>/dev/null | cut -c1-12)"
    elif command -v sha256sum >/dev/null 2>&1; then
        digest="$(printf '%s' "$workspace" | sha256sum 2>/dev/null | cut -c1-12)"
    fi

    # No usable digest: derive the name from the sanitized basename instead.
    if [ -z "$digest" ]; then
        local safe_base
        safe_base="$(basename "$workspace" | tr -c 'A-Za-z0-9_.-' '_')"
        printf 'loki-%s\n' "$safe_base"
        return
    fi

    printf 'loki-%s\n' "$digest"
}
package/autonomy/loki CHANGED
@@ -2304,6 +2304,48 @@ cmd_stop() {
2304
2304
  return 0
2305
2305
  fi
2306
2306
 
2307
+ # FIX-DOCKER-STOP (reap side): a `loki docker start` in this folder runs the
2308
+ # build inside a container named loki-<sha12 of workspace>; the host .loki has
2309
+ # no live pid for it, so the legacy "No active session" path would fire while
2310
+ # the container is Up. Reconcile + reap it here, before the session check.
2311
+ # Folder-scoped: the container name derives from THIS folder's run.json (or a
2312
+ # recompute over $(pwd)), so a run in another folder is never touched (AC18).
2313
+ local _docker_reaped=0
2314
+ if command -v docker >/dev/null 2>&1; then
2315
+ local _docker_runstate="$LOKI_DIR/docker/run.json"
2316
+ local _docker_container=""
2317
+ if [ -f "$_docker_runstate" ] && command -v python3 >/dev/null 2>&1; then
2318
+ _docker_container=$(python3 -c "import json,sys; print(json.load(open(sys.argv[1])).get('container',''))" "$_docker_runstate" 2>/dev/null || true)
2319
+ fi
2320
+ # Fallback: recompute the deterministic name via Agent B's helper. cmd_stop
2321
+ # does NOT source docker-run.sh itself (only cmd_docker does), so source it
2322
+ # here, gated on the module existing, to make the helper available.
2323
+ if [ -z "$_docker_container" ]; then
2324
+ local _docker_mod="$_LOKI_SCRIPT_DIR/docker-run.sh"
2325
+ if [ -f "$_docker_mod" ]; then
2326
+ # shellcheck source=/dev/null
2327
+ source "$_docker_mod" 2>/dev/null || true
2328
+ if declare -F loki_docker_container_name >/dev/null 2>&1; then
2329
+ _docker_container="$(loki_docker_container_name 2>/dev/null || true)"
2330
+ fi
2331
+ fi
2332
+ fi
2333
+ if [ -n "$_docker_container" ]; then
2334
+ # Only reap if a container with exactly this name is actually running.
2335
+ local _running_id
2336
+ _running_id=$(docker ps -q -f "name=^${_docker_container}$" 2>/dev/null || true)
2337
+ if [ -n "$_running_id" ]; then
2338
+ docker stop "$_docker_container" >/dev/null 2>&1 || true
2339
+ # --rm may auto-remove on stop; rm is best-effort (already-gone is fine).
2340
+ docker rm "$_docker_container" >/dev/null 2>&1 || true
2341
+ _docker_reaped=1
2342
+ echo -e "${RED}Stopped docker run: ${_docker_container}${NC}"
2343
+ fi
2344
+ fi
2345
+ # Clear our run-state record regardless (the container is gone or absent).
2346
+ rm -f "$_docker_runstate" 2>/dev/null || true
2347
+ fi
2348
+
2307
2349
  # No session ID given -- stop all running sessions
2308
2350
  if is_session_running; then
2309
2351
  # Stop per-session PIDs first
@@ -2535,7 +2577,7 @@ PYDASH
2535
2577
  else
2536
2578
  echo -e "${RED}STOP signal sent. Execution will halt immediately.${NC}"
2537
2579
  fi
2538
- else
2580
+ elif [ "$_docker_reaped" != "1" ]; then
2539
2581
  echo -e "${YELLOW}No active session running.${NC}"
2540
2582
  if [ -f "$LOKI_DIR/STOP" ]; then
2541
2583
  echo "STOP signal already present."
@@ -2560,6 +2602,20 @@ PYDASH
2560
2602
  sleep 0.5
2561
2603
  pkill -9 -f "$_stop_all_pat" 2>/dev/null || true
2562
2604
  echo -e "${RED}--all: signalled all loki-run-* processes on this machine.${NC}"
2605
+
2606
+ # FIX-DOCKER-STOP --all: also reap EVERY loki-mode container on this
2607
+ # machine (parity with the machine-wide PID kill above). Best-effort.
2608
+ if command -v docker >/dev/null 2>&1; then
2609
+ local _all_ids
2610
+ _all_ids=$(docker ps -q --filter ancestor=asklokesh/loki-mode 2>/dev/null || true)
2611
+ if [ -n "$_all_ids" ]; then
2612
+ # shellcheck disable=SC2086
2613
+ docker stop $_all_ids >/dev/null 2>&1 || true
2614
+ # shellcheck disable=SC2086
2615
+ docker rm $_all_ids >/dev/null 2>&1 || true
2616
+ echo -e "${RED}--all: stopped all loki-mode docker containers on this machine.${NC}"
2617
+ fi
2618
+ fi
2563
2619
  fi
2564
2620
  }
2565
2621
 
@@ -8084,9 +8140,11 @@ cmd_doctor() {
8084
8140
  local _install
8085
8141
  _install=$(doctor_provider_install_cmd "$_cmd")
8086
8142
  # Route the per-provider install hint to STDERR (fd 2), mirroring the
8087
- # doctor_probe_note stderr-only pattern above. This keeps the
8088
- # parity-captured STDOUT byte-identical to the Bun route (which emits
8089
- # no per-provider Install line) while the user still sees the hint.
8143
+ # doctor_probe_note stderr-only pattern above. The Bun route emits the
8144
+ # same hint on STDOUT immediately after the provider WARN line; under
8145
+ # the parity harness's 2>&1 capture both land adjacent to the WARN, so
8146
+ # the two routes stay byte-identical when an optional provider is absent
8147
+ # (v7.63.0: closed the cline-absent parity gap by adding the Bun hint).
8090
8148
  [ -n "$_install" ] && echo -e " ${YELLOW}Install: ${_install}${NC}" >&2
8091
8149
  fi
8092
8150
  }
@@ -28857,6 +28915,68 @@ cmd_docker() {
28857
28915
  return 0
28858
28916
  fi
28859
28917
 
28918
+ # Detect the forwarded loki subcommand (first non-dash token in fwd). This
28919
+ # handles `--api start ...` where the wrapper-kept --api precedes `start`.
28920
+ local _fwd_sub=""
28921
+ local _ft
28922
+ for _ft in "${fwd[@]}"; do
28923
+ case "$_ft" in
28924
+ -*) continue ;;
28925
+ *) _fwd_sub="$_ft"; break ;;
28926
+ esac
28927
+ done
28928
+
28929
+ # Is this docker run launched in background mode? Mirrors the loki `start`
28930
+ # background flags (--bg/--background/--detach/-d) that get forwarded into
28931
+ # the container. Used to gate dashboard auto-open exactly like run.sh:~10088.
28932
+ local _docker_bg=0
28933
+ for _ft in "${fwd[@]}"; do
28934
+ case "$_ft" in
28935
+ --bg|--background|--detach|-d) _docker_bg=1; break ;;
28936
+ esac
28937
+ done
28938
+
28939
+ # FEAT-DOCKER-DASH / FEAT-DOCKER-PRUNE / FIX-DOCKER-STOP (start path only):
28940
+ # the container itself stays dashboard-OFF (no published container port).
28941
+ # Instead we drive the HOST dashboard (which already aggregates local +
28942
+ # docker runs) so `loki docker start` feels exactly like local `loki start`.
28943
+ if [ "$_fwd_sub" = "start" ]; then
28944
+ # F3 FEAT-DOCKER-PRUNE: pull latest + prune old loki-mode images before
28945
+ # the run. Gated by LOKI_DOCKER_PRUNE (default 1); =0 opts out (and the
28946
+ # helper also skips the explicit pull). Best-effort, never blocks a run.
28947
+ if [ "${LOKI_DOCKER_PRUNE:-1}" != "0" ] && declare -F loki_docker_pull_and_prune >/dev/null 2>&1; then
28948
+ loki_docker_pull_and_prune || true
28949
+ fi
28950
+
28951
+ # F2 FEAT-DOCKER-DASH: start/reuse the host dashboard (idempotent: an
28952
+ # already-running dashboard short-circuits in cmd_dashboard_start before
28953
+ # the port-in-use check, so the picked port is ignored on reuse). Run in
28954
+ # a subshell so its internal `exit` paths (already-running -> exit 0,
28955
+ # errors -> exit 1) never abort the blocking docker run.
28956
+ local _dash_port="57374"
28957
+ if declare -F loki_docker_pick_host_port >/dev/null 2>&1; then
28958
+ _dash_port="$(loki_docker_pick_host_port 2>/dev/null || echo 57374)"
28959
+ fi
28960
+ ( cmd_dashboard_start --port "$_dash_port" ) || true
28961
+
28962
+ # Auto-open the dashboard in the browser, gated EXACTLY like
28963
+ # run.sh:~10088: a TTY on stdout, not background, and not opted out via
28964
+ # LOKI_NO_AUTO_OPEN=1. cmd_dashboard_open reads the persisted port file,
28965
+ # so the URL is correct even when an existing dashboard was reused.
28966
+ if [ -t 1 ] && [ "$_docker_bg" != "1" ] && [ "${LOKI_NO_AUTO_OPEN:-0}" != "1" ]; then
28967
+ ( cmd_dashboard_open ) || true
28968
+ fi
28969
+
28970
+ # F4 FIX-DOCKER-STOP (write side): record this run's container/state so
28971
+ # `loki stop` in this folder can reap the container. Cleared after the
28972
+ # blocking run returns (mirrors the register_host running/stopped bracket).
28973
+ # The helper requires the container name + image; compute the deterministic
28974
+ # name once (byte-identical to the build-time --name) and pass the resolved image.
28975
+ if declare -F loki_docker_write_runstate >/dev/null 2>&1 && declare -F loki_docker_container_name >/dev/null 2>&1; then
28976
+ loki_docker_write_runstate "$(loki_docker_container_name)" "${LOKI_DOCKER_IMAGE:-asklokesh/loki-mode:latest}" || true
28977
+ fi
28978
+ fi
28979
+
28860
28980
  # Multi-repo unified dashboard (Option B): register THIS project on the host
28861
28981
  # with the real cwd and NO pid, so the existing host dashboard
28862
28982
  # (`loki dashboard`) lists it alongside host `loki start` projects and reads
@@ -28866,6 +28986,11 @@ cmd_docker() {
28866
28986
  "${docker_argv[@]}"
28867
28987
  local rc=$?
28868
28988
  _loki_docker_register_host stopped
28989
+ # F4 FIX-DOCKER-STOP (clear side): runs unconditionally after the blocking
28990
+ # call (not gated on rc), same as register_host stopped.
28991
+ if [ "$_fwd_sub" = "start" ]; then
28992
+ declare -F loki_docker_clear_runstate >/dev/null 2>&1 && loki_docker_clear_runstate || true
28993
+ fi
28869
28994
  [ "$cleanup_creds" = "1" ] && rm -f "$creds" 2>/dev/null || true
28870
28995
  return $rc
28871
28996
  }
@@ -33,13 +33,33 @@
33
33
# Configuration
CHECKLIST_ENABLED=${LOKI_CHECKLIST_ENABLED:-true}

# Normalize an env-supplied setting to a strictly positive base-10 integer.
#
# This file is sourced into run.sh, which runs under `set -uo pipefail`, and
# the interval is later used as a modulo operand in checklist_should_verify.
# Anything that is not a plain positive decimal number is therefore replaced by
# the default before it can reach arithmetic:
#   - empty or containing a non-digit ("abc", "-3", "1.5") -> default
#     (arithmetic expansion would treat a word as a variable name)
#   - leading zeros are stripped, because arithmetic reads "08" as invalid
#     octal and "010" as 8
#   - zero (after stripping)                               -> default
#     (division by zero in the modulo)
#   - more than 18 digits                                  -> default
#     (would not fit a 64-bit integer)
# The checks are pure string operations, so this function cannot itself fail
# on bad input.
#
#   $1      raw value
#   $2      default to use when $1 is unusable
#   stdout  the normalized value followed by a newline
_checklist_positive_int() {
    local value="$1"
    local fallback="$2"

    case "$value" in
        ''|*[!0-9]*)
            printf '%s\n' "$fallback"
            return 0
            ;;
    esac

    # Strip leading zeros; an all-zero value ends up empty.
    while [ "${value#0}" != "$value" ]; do
        value="${value#0}"
    done

    if [ -z "$value" ] || [ "${#value}" -gt 18 ]; then
        printf '%s\n' "$fallback"
        return 0
    fi

    printf '%s\n' "$value"
}

# Verification interval in iterations (default 5).
CHECKLIST_INTERVAL=$(_checklist_positive_int "${LOKI_CHECKLIST_INTERVAL:-5}" 5)

# Verification timeout (default 30). It is normalized the same way so that a
# malformed value is never handed on to the downstream verifier.
CHECKLIST_TIMEOUT=$(_checklist_positive_int "${LOKI_CHECKLIST_TIMEOUT:-30}" 30)
45
65
 
@@ -393,6 +413,15 @@ checklist_should_verify() {
393
413
 
394
414
  # Check iteration interval
395
415
  local current_iteration="${ITERATION_COUNT:-0}"
416
+ # Defensive numeric guard: current_iteration feeds the modulo below, the
417
+ # single choke point for both operands under `set -uo pipefail`. A
418
+ # non-numeric value would be treated as an unbound variable name by
419
+ # arithmetic expansion and abort the sourced host loop. ITERATION_COUNT is
420
+ # loki-internal (lower risk than the env-driven interval), but normalize
421
+ # here so the modulo can never crash. Pure string match, never errors.
422
+ case "$current_iteration" in
423
+ ''|*[!0-9]*) current_iteration=0 ;; # empty or non-digit -> treat as 0
424
+ esac
396
425
  if [ "$current_iteration" -eq 0 ]; then
397
426
  return 1
398
427
  fi