@keepur/hive 0.8.3 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/service/deploy.sh CHANGED
@@ -127,10 +127,32 @@ notify() {
127
127
  > /dev/null
128
128
  }
129
129
 
130
# log_size_before <log_file>
# Prints the current byte size of <log_file> on stdout. Intended to be
# captured immediately before `launchctl bootstrap` so health_check can scan
# only the bytes written by the new boot and ignore any stale
# "Hive is running" line left by a prior run.
#
# Arguments: $1 - path to the log file
# Outputs:   exactly one non-negative integer, with no padding
# Returns:   always 0; prints 0 when the file is missing, is not a regular
#            file, or cannot be read (e.g. it vanished or is unreadable
#            between the existence test and the read)
log_size_before() {
  local log_file="$1"
  local size=0
  if [[ -f "$log_file" ]]; then
    # A failed read must degrade to 0 rather than to an empty string (or a
    # non-zero status that would abort a caller running under `set -e`).
    size=$(wc -c < "$log_file") || size=0
  fi
  # BSD/macOS wc left-pads its count with spaces; strip them.
  size="${size//[[:space:]]/}"
  # Anything that is not a plain decimal count falls back to the documented 0.
  [[ "$size" =~ ^[0-9]+$ ]] || size=0
  echo "$size"
}
143
+
144
+ # health_check <log_file> [start_offset]
145
+ # KPR-240: anchor the marker scan to the boot we just kicked off.
146
+ # Reads bytes after $start_offset and succeeds iff "Hive is running"
147
+ # appears after "Hive starting up" in that window. Avoids the tail -5
148
+ # race on busy boots (12 agents + scheduler + memory lifecycle can push
149
+ # the marker out of the last 5 lines within 1s) and refuses to match a
150
+ # stale marker from a previous run.
130
151
  health_check() {
131
152
  local log_file="$1"
153
+ local start_offset="${2:-0}"
132
154
  if $DRY_RUN; then
133
- echo "[DRY RUN] health_check: would check $log_file"
155
+ echo "[DRY RUN] health_check: would check $log_file (offset $start_offset)"
134
156
  return 0
135
157
  fi
136
158
  for attempt in $(seq 1 "$HEALTH_CHECK_RETRIES"); do
@@ -140,7 +162,7 @@ health_check() {
140
162
  fi
141
163
  for _ in $(seq 1 "$HEALTH_CHECK_WINDOW"); do
142
164
  sleep 1
143
- if tail -5 "$log_file" 2>/dev/null | grep -q '"Hive is running"'; then
165
+ if _scan_new_boot "$log_file" "$start_offset"; then
144
166
  return 0
145
167
  fi
146
168
  done
@@ -149,6 +171,20 @@ health_check() {
149
171
  return 1
150
172
  }
151
173
 
174
# _scan_new_boot <log_file> [start_offset]
# True iff the bytes of <log_file> after <start_offset> contain a
# "Hive is running" line that comes after a "Hive starting up" line.
#
# Arguments: $1 - path to the log file
#            $2 - byte offset captured before the boot (default 0);
#                 surrounding whitespace is ignored and a non-numeric value
#                 is treated as 0
# Returns:   0 when the new boot reached "Hive is running", 1 otherwise
#            (including when the log file is missing or unreadable)
_scan_new_boot() {
  local log_file="$1"
  local start_offset="${2:-0}"
  local current_size

  [[ -f "$log_file" ]] || return 1

  # Sanitise the offset before it reaches arithmetic expansion. The 10#
  # prefix forces base 10 so a value with leading zeros is not read as octal.
  start_offset="${start_offset//[[:space:]]/}"
  [[ "$start_offset" =~ ^[0-9]+$ ]] || start_offset=0
  start_offset=$((10#$start_offset))

  # If the log was rotated or truncated after the offset was captured, the
  # file is now shorter than the offset and everything in it is new. Scan it
  # from the beginning instead of reading zero bytes until health_check
  # times out.
  current_size=$(wc -c < "$log_file") || return 1
  current_size="${current_size//[[:space:]]/}"
  [[ "$current_size" =~ ^[0-9]+$ ]] || return 1
  if (( start_offset > current_size )); then
    start_offset=0
  fi

  # tail -c +N is 1-indexed (start at byte N), hence the +1.
  # awk deliberately reads to end of input rather than exiting on the first
  # match: an early exit closes the pipe while tail may still be writing,
  # tail dies with SIGPIPE, and under `set -o pipefail` the pipeline would
  # report failure for a healthy (busy) boot.
  tail -c "+$((start_offset + 1))" "$log_file" 2>/dev/null | awk '
    /"Hive starting up"/ { started = 1; next }
    started && /"Hive is running"/ { found = 1 }
    END { exit (found ? 0 : 1) }
  '
}
187
+
152
188
  kill_ports() {
153
189
  local ports_str="$1"
154
190
  if $DRY_RUN; then
@@ -373,8 +409,10 @@ if $ROLLBACK; then
373
409
  notify "Rollback FAILED for \`$id\`: no previous engine (.hive.prev missing)."
374
410
  exit 1
375
411
  fi
412
+ health_log="$instance_root/$logs_dir/hive.log"
413
+ health_offset=$(log_size_before "$health_log")
376
414
  run_cmd launchctl bootstrap "gui/$(id -u)" "$plist_path"
377
- if health_check "$instance_root/$logs_dir/hive.log"; then
415
+ if health_check "$health_log" "$health_offset"; then
378
416
  rollback_version=$(jq -r .version < "$instance_root/.hive/package.json" 2>/dev/null || echo "unknown")
379
417
  notify "Rollback succeeded for \`$id\` → \`$rollback_version\`."
380
418
  echo "Rollback complete."
@@ -529,11 +567,13 @@ for inst in "${INSTANCES[@]}"; do
529
567
  echo " Swapping engine..."
530
568
  swap_engine "$instance_root"
531
569
 
570
+ health_log="$instance_root/$logs_dir/hive.log"
571
+ health_offset=$(log_size_before "$health_log")
532
572
  echo " Restarting $label..."
533
573
  run_cmd launchctl bootstrap "gui/$(id -u)" "$plist_path"
534
574
 
535
575
  echo " Checking health..."
536
- if ! health_check "$instance_root/$logs_dir/hive.log"; then
576
+ if ! health_check "$health_log" "$health_offset"; then
537
577
  echo " Health check FAILED for $id — rolling back"
538
578
  # New engine bound the port and failed health check — bootout it before swap.
539
579
  run_cmd launchctl bootout "gui/$(id -u)/$label" 2>/dev/null || true
@@ -65,11 +65,15 @@ echo '{"name":"@keepur/hive","version":"0.2.0-dev"}' > "$BUILD_DIR/package.json"
65
65
  # function bodies via sed and source them in isolation. The inner `/!p` drops
66
66
  # the closing delimiter line so we don't capture the `if $ROLLBACK; then` line
67
67
  # (which would trip set -u on undefined ROLLBACK when sourced).
68
- sed -n '/^# --- Engine fetch\/swap\/rollback/,/^# --- Short-circuit:/{/^# --- Short-circuit:/!p;}' \
68
+ sed -n '/^# --- Helpers ---/,/^# --- Short-circuit:/{/^# --- Short-circuit:/!p;}' \
69
69
  "$SCRIPT_DIR/deploy.sh" > "$TESTROOT/helpers.sh"
70
70
  # Helper bodies reference $DRY_RUN (added so --dry-run skips the destructive
71
71
  # ops); set it false here so the helpers actually execute under set -u.
72
72
  DRY_RUN=false
73
+ # Keep health_check's retry/window/wait small so the new tests don't burn 90s.
74
+ HEALTH_CHECK_RETRIES=1
75
+ HEALTH_CHECK_WINDOW=2
76
+ HEALTH_CHECK_WAIT_BETWEEN=0
73
77
  # shellcheck source=/dev/null
74
78
  source "$TESTROOT/helpers.sh"
75
79
 
@@ -205,4 +209,88 @@ if install_engine_deps "$DEPLOY_DIR" >/dev/null 2>&1; then
205
209
  exit 1
206
210
  fi
207
211
 
212
+ # --- Health-check tests (KPR-240) ---
213
+ # Anchor the marker scan to a captured byte offset so a busy boot logging
214
+ # many lines after "Hive is running" can't be falsely flagged failed, and
215
+ # so a stale marker from a previous run can't be falsely flagged healthy.
216
+ LOG_DIR=$(mktemp -d -t hive-health-test.XXXXXX)
217
+ trap 'rm -rf "$TESTROOT" "$LOG_DIR"' EXIT
218
+
219
+ # --- Test 11: busy boot logs many lines after marker, still passes ---
220
+ echo "test 11: busy boot passes despite marker scrolling out of tail -5"
221
+ LOG_FILE="$LOG_DIR/hive-busy.log"
222
+ : > "$LOG_FILE"
223
+ OFFSET=$(log_size_before "$LOG_FILE")
224
+ # Simulate the new boot: starting marker, running marker, then a flood of
225
+ # log lines (12 agents + scheduler + memory lifecycle) that pushes the
226
+ # marker far out of `tail -5`.
227
+ {
228
+ echo '{"ts":"2026-05-25T00:00:00Z","level":"info","component":"hive","msg":"Hive starting up","instance":"test"}'
229
+ echo '{"ts":"2026-05-25T00:00:00Z","level":"info","component":"hive","msg":"Hive is running"}'
230
+ for i in $(seq 1 50); do
231
+ echo '{"ts":"2026-05-25T00:00:00Z","level":"info","component":"agent","msg":"agent-'"$i"' ready"}'
232
+ done
233
+ } >> "$LOG_FILE"
234
+ if ! health_check "$LOG_FILE" "$OFFSET" >/dev/null; then
235
+ echo "FAIL: healthy boot with flood after marker should pass"
236
+ exit 1
237
+ fi
238
+ # Sanity: with the legacy tail -5 strategy this would have failed.
239
+ if tail -5 "$LOG_FILE" | grep -q '"Hive is running"'; then
240
+ echo "FAIL: tail -5 unexpectedly still contains the marker — test setup wrong"
241
+ exit 1
242
+ fi
243
+
244
+ # --- Test 12: genuine boot failure (never reaches marker) fails ---
245
+ echo "test 12: genuine boot failure fails health_check"
246
+ LOG_FILE="$LOG_DIR/hive-fail.log"
247
+ : > "$LOG_FILE"
248
+ OFFSET=$(log_size_before "$LOG_FILE")
249
+ {
250
+ echo '{"ts":"2026-05-25T00:00:00Z","level":"info","component":"hive","msg":"Hive starting up","instance":"test"}'
251
+ echo '{"ts":"2026-05-25T00:00:00Z","level":"error","component":"hive","msg":"Mongo unreachable, exiting"}'
252
+ } >> "$LOG_FILE"
253
+ if health_check "$LOG_FILE" "$OFFSET" >/dev/null 2>&1; then
254
+ echo "FAIL: boot that never reached 'Hive is running' should fail"
255
+ exit 1
256
+ fi
257
+
258
+ # --- Test 13: stale "Hive is running" before offset is ignored ---
259
+ echo "test 13: stale marker from previous boot is not matched"
260
+ LOG_FILE="$LOG_DIR/hive-stale.log"
261
+ : > "$LOG_FILE"
262
+ # Previous boot: full happy-path markers land in the file.
263
+ {
264
+ echo '{"ts":"2026-05-24T00:00:00Z","level":"info","component":"hive","msg":"Hive starting up","instance":"test"}'
265
+ echo '{"ts":"2026-05-24T00:00:00Z","level":"info","component":"hive","msg":"Hive is running"}'
266
+ } >> "$LOG_FILE"
267
+ # Capture offset AFTER the prior boot's markers — mimics the deploy.sh
268
+ # call site that snapshots wc -c right before launchctl bootstrap.
269
+ OFFSET=$(log_size_before "$LOG_FILE")
270
+ # New boot crashes before "Hive is running". The stale marker is still
271
+ # physically in the log, but past start_offset there is no marker.
272
+ {
273
+ echo '{"ts":"2026-05-25T00:00:00Z","level":"info","component":"hive","msg":"Hive starting up","instance":"test"}'
274
+ echo '{"ts":"2026-05-25T00:00:00Z","level":"error","component":"hive","msg":"crashed"}'
275
+ } >> "$LOG_FILE"
276
+ if health_check "$LOG_FILE" "$OFFSET" >/dev/null 2>&1; then
277
+ echo "FAIL: stale marker from previous boot should not satisfy health_check"
278
+ exit 1
279
+ fi
280
+
281
+ # --- Test 14: log file absent at start (fresh install), then created ---
282
+ echo "test 14: missing log at offset capture, populated by boot, passes"
283
+ LOG_FILE="$LOG_DIR/hive-fresh.log"
284
+ # No file yet — offset is 0 by contract.
285
+ OFFSET=$(log_size_before "$LOG_FILE")
286
+ [[ "$OFFSET" == "0" ]] || { echo "FAIL: log_size_before should return 0 for missing file (got '$OFFSET')"; exit 1; }
287
+ {
288
+ echo '{"ts":"2026-05-25T00:00:00Z","level":"info","component":"hive","msg":"Hive starting up","instance":"test"}'
289
+ echo '{"ts":"2026-05-25T00:00:00Z","level":"info","component":"hive","msg":"Hive is running"}'
290
+ } > "$LOG_FILE"
291
+ if ! health_check "$LOG_FILE" "$OFFSET" >/dev/null; then
292
+ echo "FAIL: fresh-install boot should pass health_check"
293
+ exit 1
294
+ fi
295
+
208
296
  echo "all tests passed."