plc-checkweigher 1.32.2 → 1.33.0

This diff shows the changes between publicly available versions of the package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
@@ -174,12 +174,13 @@ restart)
174
174
  need_sudo
175
175
  banner "Restarting services"
176
176
  echo ""
177
- spin_start "Restarting plc_watcher and plc_web"
178
- sudo systemctl restart plc_watcher plc_web
177
+ spin_start "Restarting plc_watcher, plc_web and plc_selfheal"
178
+ sudo systemctl restart plc_watcher plc_web plc_selfheal 2>/dev/null \
179
+ || sudo systemctl restart plc_watcher plc_web
179
180
  sleep 2
180
181
  spin_ok
181
182
  echo ""
182
- systemctl status plc_watcher plc_web --no-pager | grep -E 'Active|Main PID'
183
+ systemctl status plc_watcher plc_web plc_selfheal --no-pager 2>/dev/null | grep -E 'Active|Main PID'
183
184
  echo ""
184
185
  ;;
185
186
 
@@ -187,8 +188,9 @@ start)
187
188
  need_sudo
188
189
  banner "Starting services"
189
190
  echo ""
190
- spin_start "Starting plc_watcher and plc_web"
191
- sudo systemctl start plc_watcher plc_web
191
+ spin_start "Starting plc_watcher, plc_web and plc_selfheal"
192
+ sudo systemctl start plc_watcher plc_web plc_selfheal 2>/dev/null \
193
+ || sudo systemctl start plc_watcher plc_web
192
194
  sleep 1
193
195
  spin_ok
194
196
  echo ""
@@ -198,12 +200,15 @@ stop)
198
200
  need_sudo
199
201
  banner "Stopping services"
200
202
  echo ""
201
- spin_start "Stopping plc_watcher and plc_web"
203
+ # Stop the self-healing daemon FIRST — otherwise it would detect the
204
+ # watcher/web going down and immediately restart them, fighting this stop.
205
+ spin_start "Stopping plc_selfheal, plc_watcher and plc_web"
206
+ sudo systemctl stop plc_selfheal 2>/dev/null || true
202
207
  sudo systemctl stop plc_watcher plc_web
203
208
  spin_ok
204
209
  echo ""
205
210
  warn "Services stopped — will restart automatically on next boot"
206
- warn "To disable auto-start: sudo systemctl disable plc_watcher plc_web"
211
+ warn "To disable auto-start: sudo systemctl disable plc_watcher plc_web plc_selfheal"
207
212
  echo ""
208
213
  ;;
209
214
 
@@ -235,6 +240,87 @@ queue)
235
240
  echo ""
236
241
  ;;
237
242
 
243
+ # ── Self-healing daemon ────────────────────────────────────────────────────────
244
+ selfheal)
245
+ SUBCMD="${1:-status}"
246
+ shift || true
247
+ case "$SUBCMD" in
248
+ status)
249
+ banner "Self-Healing Daemon"
250
+ echo ""
251
+ ACTIVE=$(systemctl is-active plc_selfheal 2>/dev/null || true)
252
+ ENABLED=$(systemctl is-enabled plc_selfheal 2>/dev/null || true)
253
+ if [[ "$ACTIVE" == "active" ]]; then
254
+ ok "plc_selfheal: RUNNING (auto-start: ${ENABLED})"
255
+ else
256
+ warn "plc_selfheal: ${ACTIVE} (auto-start: ${ENABLED})"
257
+ info "Start it: sudo systemctl start plc_selfheal"
258
+ fi
259
+ echo ""
260
+ # Last-cycle / throttle state
261
+ _ST="/home/pi/reports/health/.selfheal_state.json"
262
+ if [[ -f "$_ST" ]]; then
263
+ "${PYTHON}" - "$_ST" << 'PYEOF' 2>/dev/null || true
264
+ import json, sys, time
265
+ try:
266
+ d = json.load(open(sys.argv[1]))
267
+ except Exception:
268
+ sys.exit(0)
269
+ lc = d.get("last_cycle", 0)
270
+ if lc:
271
+ age = int(time.time() - lc)
272
+ print(f" Last self-heal sweep: {age}s ago")
273
+ rep = d.get("reported", {})
274
+ if rep:
275
+ print(f" Active unresolved problems: {len(rep)}")
276
+ for k in rep: print(f" • {k}")
277
+ else:
278
+ print(" No unresolved problems")
279
+ PYEOF
280
+ else
281
+ info "No sweep has run yet (daemon may have just started)"
282
+ fi
283
+ echo ""
284
+ # Pending health reports on disk
285
+ _HD="/home/pi/reports/health"
286
+ if [[ -d "$_HD" ]]; then
287
+ _N=$(find "$_HD" -maxdepth 1 -name 'health_*.txt' 2>/dev/null | wc -l)
288
+ [[ "${_N:-0}" -gt 0 ]] && info "${_N} health report(s) in ${_HD}"
289
+ fi
290
+ echo ""
291
+ info "Recent activity: plc_checkweigher selfheal logs"
292
+ echo ""
293
+ ;;
294
+ logs)
295
+ banner "Self-Heal Logs"
296
+ echo ""
297
+ exec journalctl -u plc_selfheal -n 50 -f --no-pager
298
+ ;;
299
+ now|run)
300
+ banner "Self-Heal — Immediate Sweep"
301
+ need_sudo
302
+ echo ""
303
+ spin_start "Running one self-heal cycle"
304
+ sudo "${PYTHON}" -c "
305
+ import sys; sys.path.insert(0, '${INSTALL_DIR}')
306
+ import selfheal
307
+ healed, failed, env = selfheal.run_cycle()
308
+ selfheal.maybe_report(healed, failed, env)
309
+ print('HEALED:', len(healed), '| UNRESOLVED:', len(failed))
310
+ for k,d in healed: print(' [HEALED]', k, '-', d)
311
+ for k,d in failed: print(' [FAIL] ', k, '-', d)
312
+ " && spin_ok || spin_warn "Cycle completed with issues"
313
+ echo ""
314
+ ;;
315
+ *)
316
+ echo "Usage: plc_checkweigher selfheal [status|logs|now]"
317
+ echo " status — daemon state, last sweep, unresolved problems (default)"
318
+ echo " logs — follow the self-heal journal"
319
+ echo " now — run one self-heal sweep immediately"
320
+ ;;
321
+ esac
322
+ ;;
323
+
238
324
  # ── Push test ─────────────────────────────────────────────────────────────────
239
325
  push-test)
240
326
  banner "SMB Push Test"
@@ -1088,18 +1174,56 @@ SMBC
1088
1174
  echo ""
1089
1175
  fi
1090
1176
 
1091
- # ── 8. plc_live.json staleness ────────────────────────────────────────
1177
+ # ── 8. Live state file (/tmp/plc_live.json) service-aware repair ─────
1178
+ # The watcher writes this file continuously. If it is missing or stale
1179
+ # we decide the fix by the service state, not by the file alone:
1180
+ # • watcher inactive → start it
1181
+ # • watcher active + missing → stuck before first write → restart
1182
+ # • watcher active + stale → lost PLC / stuck loop → restart
1183
+ # After any restart we wait briefly and confirm the file reappears.
1092
1184
  spin_start "Live state file (/tmp/plc_live.json)"
1093
- if [[ -f "/tmp/plc_live.json" ]]; then
1094
- _AGE=$(( $(date +%s) - $(stat -c %Y /tmp/plc_live.json 2>/dev/null || echo 0) ))
1095
- if [[ "${_AGE:-0}" -gt 10 ]]; then
1096
- ffix_warn "plc_live.json is ${_AGE}s old — watcher may be stuck or offline"
1097
- flog "WARN: plc_live.json stale (${_AGE}s old)"
1185
+ _LIVE="/tmp/plc_live.json"
1186
+ _WATCHER_ACTIVE=0
1187
+ systemctl is-active --quiet plc_watcher 2>/dev/null && _WATCHER_ACTIVE=1
1188
+
1189
+ _LIVE_PROBLEM="" # empty = healthy
1190
+ if [[ -f "$_LIVE" ]]; then
1191
+ _AGE=$(( $(date +%s) - $(stat -c %Y "$_LIVE" 2>/dev/null || echo 0) ))
1192
+ [[ "${_AGE:-0}" -gt 10 ]] && _LIVE_PROBLEM="stale (${_AGE}s old)"
1193
+ else
1194
+ _LIVE_PROBLEM="missing"
1195
+ fi
1196
+
1197
+ if [[ -z "$_LIVE_PROBLEM" ]]; then
1198
+ ffix_info "plc_live.json updated ${_AGE}s ago (OK)"
1199
+ elif [[ $_WATCHER_ACTIVE -eq 0 ]]; then
1200
+ ffix_warn "plc_live.json ${_LIVE_PROBLEM} — plc_watcher is not running"
1201
+ flog "WARN: live state ${_LIVE_PROBLEM}; plc_watcher inactive"
1202
+ spin_start "Starting plc_watcher"
1203
+ sudo systemctl start plc_watcher 2>/dev/null || true
1204
+ sleep 4
1205
+ if [[ -f "$_LIVE" ]]; then
1206
+ ffix_ok "plc_watcher started — live state now being written"
1098
1207
  else
1099
- ffix_info "plc_live.json updated ${_AGE}s ago (OK)"
1208
+ ffix_err "plc_watcher started but live state still missing — check: journalctl -u plc_watcher"
1100
1209
  fi
1101
1210
  else
1102
- ffix_info "plc_live.json absent (normal created when watcher connects)"
1211
+ # Service is up but the file is missing/stale → watcher stuck. Restart.
1212
+ ffix_warn "plc_live.json ${_LIVE_PROBLEM} while plc_watcher is running — restarting watcher"
1213
+ flog "WARN: live state ${_LIVE_PROBLEM}; watcher active → restart"
1214
+ spin_start "Restarting plc_watcher"
1215
+ sudo systemctl restart plc_watcher 2>/dev/null || true
1216
+ sleep 4
1217
+ if [[ -f "$_LIVE" ]]; then
1218
+ _AGE2=$(( $(date +%s) - $(stat -c %Y "$_LIVE" 2>/dev/null || echo 0) ))
1219
+ if [[ "${_AGE2:-99}" -le 10 ]]; then
1220
+ ffix_ok "Watcher restarted — live state fresh (${_AGE2}s old)"
1221
+ else
1222
+ ffix_warn "Watcher restarted but live state still stale — PLC may be unreachable"
1223
+ fi
1224
+ else
1225
+ ffix_err "Watcher restarted but live state still missing — check: journalctl -u plc_watcher"
1226
+ fi
1103
1227
  fi
1104
1228
 
1105
1229
  # ── 9. reader script path ─────────────────────────────────────────────
@@ -1258,7 +1382,8 @@ update)
1258
1382
  # ── Update systemd unit files if they changed ─────────────────────────────
1259
1383
  _UNITS_UPDATED=0
1260
1384
  for _svc_src in "${INSTALL_DIR}/plc_watcher.service" \
1261
- "${INSTALL_DIR}/web/plc_web.service"; do
1385
+ "${INSTALL_DIR}/web/plc_web.service" \
1386
+ "${INSTALL_DIR}/plc_selfheal.service"; do
1262
1387
  [[ ! -f "$_svc_src" ]] && continue
1263
1388
  _svc_dst="/etc/systemd/system/$(basename "$_svc_src")"
1264
1389
  if [[ ! -f "$_svc_dst" ]] || ! diff -q "$_svc_src" "$_svc_dst" &>/dev/null; then
@@ -1271,6 +1396,22 @@ update)
1271
1396
  spin_ok "systemd reloaded"
1272
1397
  fi
1273
1398
 
1399
+ # ── Self-healing daemon present + enabled (retrofit older installs) ────────
1400
+ spin_start "Self-healing daemon"
1401
+ if [[ -f "${INSTALL_DIR}/selfheal.py" && -f /etc/systemd/system/plc_selfheal.service ]]; then
1402
+ if ! systemctl is-enabled --quiet plc_selfheal 2>/dev/null; then
1403
+ sudo systemctl enable plc_selfheal 2>/dev/null || true
1404
+ spin_ok "plc_selfheal enabled"
1405
+ elif ! systemctl is-active --quiet plc_selfheal 2>/dev/null; then
1406
+ sudo systemctl start plc_selfheal 2>/dev/null || true
1407
+ spin_ok "plc_selfheal started"
1408
+ else
1409
+ spin_ok "plc_selfheal active"
1410
+ fi
1411
+ else
1412
+ spin_warn "selfheal.py or unit missing — will be installed on next full setup"
1413
+ fi
1414
+
1274
1415
  # ── Update Plymouth boot splash ───────────────────────────────────────────
1275
1416
  _THEME_DIR="/usr/share/plymouth/themes/saismruth"
1276
1417
  _LOGO_SRC="${INSTALL_DIR}/assets/logo.png"
@@ -1528,6 +1669,7 @@ SUDOEOF
1528
1669
  if [[ $_CODE_CHANGED -eq 1 || $_UNITS_UPDATED -eq 1 ]]; then
1529
1670
  spin_start "Restarting services"
1530
1671
  sudo systemctl restart plc_watcher plc_web 2>/dev/null
1672
+ sudo systemctl restart plc_selfheal 2>/dev/null || true
1531
1673
  sleep 2
1532
1674
  spin_ok "Services restarted"
1533
1675
  else
@@ -1661,12 +1803,13 @@ uninstall)
1661
1803
  # ── 1. Stop and disable services ─────────────────────────────────────────
1662
1804
  echo ""
1663
1805
  ustep "Stopping and disabling services"
1664
- for SVC in plc_watcher plc_web; do
1806
+ for SVC in plc_selfheal plc_watcher plc_web; do
1665
1807
  systemctl is-active --quiet "$SVC" 2>/dev/null && sudo systemctl stop "$SVC" 2>/dev/null || true
1666
1808
  systemctl is-enabled --quiet "$SVC" 2>/dev/null && sudo systemctl disable "$SVC" 2>/dev/null || true
1667
1809
  done
1668
1810
  sudo rm -f /etc/systemd/system/plc_watcher.service \
1669
- /etc/systemd/system/plc_web.service
1811
+ /etc/systemd/system/plc_web.service \
1812
+ /etc/systemd/system/plc_selfheal.service
1670
1813
  spin_ok "Services removed"
1671
1814
 
1672
1815
  # ── 2. System drop-ins ───────────────────────────────────────────────────
@@ -1820,11 +1963,14 @@ help|--help|-h)
1820
1963
  echo " fix -errors Scan journal, verify RT/permissions/PLC/config"
1821
1964
  echo " logs Stream live logs (plc_watcher + plc_web)"
1822
1965
  echo " queue Show SMB pending queue and delivery ledger"
1966
+ echo " selfheal status Self-healing daemon state + unresolved problems"
1967
+ echo " selfheal logs Follow the self-heal journal"
1968
+ echo " selfheal now Run one self-heal sweep immediately"
1823
1969
  echo ""
1824
1970
  echo -e " ${W}Services${NC}"
1825
- echo " start Start plc_watcher and plc_web"
1826
- echo " stop Stop both services"
1827
- echo " restart Restart both services"
1971
+ echo " start Start watcher, web and self-heal"
1972
+ echo " stop Stop all services"
1973
+ echo " restart Restart all services"
1828
1974
  echo ""
1829
1975
  echo -e " ${W}Network${NC}"
1830
1976
  echo " wifi Scan and switch WiFi network"
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "plc-checkweigher",
3
- "version": "1.32.2",
3
+ "version": "1.33.0",
4
4
  "description": "One-command installer for the PLC Check-Weigher system on Raspberry Pi (PREEMPT_RT kernel, Python stack, WiFi, SMB, systemd RT services)",
5
5
  "scripts": {
6
6
  "postinstall": "node bin/cli.js"
package/setup.sh CHANGED
@@ -432,9 +432,37 @@ WantedBy=multi-user.target
432
432
  EOF
433
433
  ok "plc_web.service (Nice=-10)"
434
434
 
435
+ # ── Self-healing daemon — auto-repairs services/files/configs at runtime ──
436
+ cat > /etc/systemd/system/plc_selfheal.service << EOF
437
+ [Unit]
438
+ Description=PLC Check-Weigher Self-Healing Daemon
439
+ After=network.target plc_watcher.service plc_web.service
440
+ Wants=plc_watcher.service plc_web.service
441
+
442
+ [Service]
443
+ Type=simple
444
+ User=root
445
+ WorkingDirectory=${INSTALL_DIR}
446
+ Environment=PYTHONUNBUFFERED=1
447
+ ExecStart=${VENV_DIR}/bin/python3 -u ${INSTALL_DIR}/selfheal.py
448
+ Restart=always
449
+ RestartSec=10
450
+ Nice=10
451
+ CPUAffinity=0 1 2
452
+ IOSchedulingClass=idle
453
+ StandardOutput=journal
454
+ StandardError=journal
455
+
456
+ [Install]
457
+ WantedBy=multi-user.target
458
+ EOF
459
+ cp /etc/systemd/system/plc_selfheal.service "${INSTALL_DIR}/plc_selfheal.service"
460
+ chown "${PI_USER}:${PI_USER}" "${INSTALL_DIR}/plc_selfheal.service"
461
+ ok "plc_selfheal.service (auto-repair · cores 0-2 · Nice=10)"
462
+
435
463
  systemctl daemon-reload
436
- systemctl enable plc_watcher.service plc_web.service
437
- ok "Both services enabled — start automatically after reboot"
464
+ systemctl enable plc_watcher.service plc_web.service plc_selfheal.service
465
+ ok "All services enabled — start automatically after reboot"
438
466
 
439
467
  cp /etc/systemd/system/plc_watcher.service "${INSTALL_DIR}/plc_watcher.service"
440
468
  chown "${PI_USER}:${PI_USER}" "${INSTALL_DIR}/plc_watcher.service"