@misterhuydo/sentinel 1.2.5 → 1.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,219 @@
+ """
+ health_checker.py — Poll HEALTH_URL for each configured repo.
+
+ Distinguishes deliberate admin stops (502/503 + no startup errors in logs)
+ from crash restarts (502/503 + startup failure patterns detected).
+
+ State machine per repo:
+     None      — normal monitoring
+     pending   — first stop detected, one alert sent asking human to confirm
+     confirmed — human confirmed deliberate stop, silent monitoring
+     (cleared) — when app recovers: state removed, "back online" notification sent
+ """
+ import logging
+ import re
+ from dataclasses import dataclass, field
+ from pathlib import Path
+
+ import requests
+
+ from .config_loader import RepoConfig
+
+ logger = logging.getLogger(__name__)
+
+ # Spring Boot / Java app startup failure patterns
+ _START_FAILURE_RE = re.compile(
+     r"APPLICATION FAILED TO START"
+     r"|Error starting ApplicationContext"
+     r"|BeanCreationException"
+     r"|Failed to load ApplicationContext"
+     r"|java\.lang\.NoSuchMethodError"
+     r"|java\.lang\.ClassNotFoundException"
+     r"|java\.lang\.NoClassDefFoundError"
+     r"|could not start embedded"
+     r"|Caused by:.*Spring"
+     r"|startup failed",
+     re.IGNORECASE,
+ )
+
+ # How many bytes from the end of each synced file to scan
+ _SCAN_TAIL_BYTES = 200_000  # ~200KB / ~2000 lines
+
+
+ @dataclass
+ class HealthResult:
+     status: str  # "healthy" | "stopped" | "failing" | "unknown"
+     http_code: int = 0
+     message: str = ""
+     raw: dict = field(default_factory=dict)
+
+
+ def check_health(repo: RepoConfig, timeout: int = 10) -> HealthResult:
+     """
+     Poll repo.health_url and return a HealthResult.
+
+     Status values:
+         healthy — HTTP 200 + Status=true
+         failing — HTTP 200 + Status!=true
+         stopped — HTTP 502/503/504 or connection refused
+         unknown — no URL, unexpected HTTP code, or parse error
+     """
+     if not repo.health_url:
+         return HealthResult(status="unknown", message="No HEALTH_URL configured")
+     try:
+         r = requests.get(repo.health_url, timeout=timeout)
+         if r.status_code in (502, 503, 504):
+             return HealthResult(
+                 status="stopped",
+                 http_code=r.status_code,
+                 message=f"HTTP {r.status_code}",
+             )
+         if r.status_code == 200:
+             try:
+                 data = r.json()
+             except Exception:
+                 return HealthResult(status="healthy", http_code=200)
+             if str(data.get("Status", "")).lower() == "true":
+                 return HealthResult(status="healthy", http_code=200, raw=data)
+             return HealthResult(
+                 status="failing",
+                 http_code=200,
+                 raw=data,
+                 message=f"Status={data.get('Status')} DEFCON={data.get('DEFCON', '?')}",
+             )
+         return HealthResult(
+             status="unknown",
+             http_code=r.status_code,
+             message=f"Unexpected HTTP {r.status_code}",
+         )
+     except requests.exceptions.ConnectionError:
+         return HealthResult(status="stopped", message="Connection refused")
+     except requests.exceptions.Timeout:
+         return HealthResult(status="unknown", message="Request timed out")
+     except Exception as e:
+         return HealthResult(status="unknown", message=str(e))
+
+
+ def scan_startup_failure(workspace_dir: str, source_names: list[str]) -> tuple[bool, str]:
+     """
+     Search the synced log files of the given sources for startup failure patterns.
+     Scans the last ~200KB of each file (most recent entries).
+     Returns (found: bool, first_matching_line: str).
+     """
+     synced_base = Path(workspace_dir) / "synced"
+     for source_name in source_names:
+         source_dir = synced_base / source_name
+         if not source_dir.exists():
+             continue
+         for log_file in sorted(source_dir.glob("*")):
+             if not log_file.is_file():
+                 continue
+             try:
+                 size = log_file.stat().st_size
+                 with open(log_file, encoding="utf-8", errors="replace") as fh:
+                     if size > _SCAN_TAIL_BYTES:
+                         fh.seek(size - _SCAN_TAIL_BYTES)
+                         fh.readline()  # skip partial line at seek boundary
+                     for line in fh:
+                         if _START_FAILURE_RE.search(line):
+                             return True, line.strip()[:300]
+             except Exception as e:
+                 logger.debug("health_checker: scan error %s: %s", log_file, e)
+     return False, ""
+
+
+ def evaluate_repos(repos: dict, log_sources: dict, workspace_dir: str, store=None) -> list[dict]:
+     """
+     Run health checks for all repos that have HEALTH_URL configured.
+     Uses store to track maintenance state and suppress repeated alerts.
+
+     Returns a list of result dicts, one per repo checked. Each dict has:
+         repo_name, status, http_code, message,
+         action ("none" | "fix" | "alert_once" | "recovered"),
+         startup_failure (bool), startup_failure_line (str), was_in_maintenance (bool)
+     """
+     results = []
+     for repo in repos.values():
+         if not repo.health_url:
+             continue
+
+         health = check_health(repo)
+         state = store.get_health_state(repo.repo_name) if store else None
+
+         result = {
+             "repo_name": repo.repo_name,
+             "status": health.status,
+             "http_code": health.http_code,
+             "message": health.message,
+             "action": "none",
+             "startup_failure": False,
+             "startup_failure_line": "",
+             "was_in_maintenance": state is not None,
+         }
+
+         # App recovered — clear state and notify
+         if health.status == "healthy":
+             if state is not None:
+                 logger.info(
+                     "health_checker: %s is back online (was %s since %s)",
+                     repo.repo_name, state["status"], state["since"],
+                 )
+                 if store:
+                     store.clear_health_state(repo.repo_name)
+                 result["action"] = "recovered"
+                 result["message"] = f"Back online (was {state['status']} since {state['since']})"
+             results.append(result)
+             continue
+
+         if health.status == "unknown":
+             results.append(result)
+             continue
+
+         # App is stopped or failing — check for startup errors first
+         sources_for_repo = [
+             s.name for s in log_sources.values()
+             if getattr(s, "target_repo", "auto") in ("auto", repo.repo_name)
+         ]
+         if not sources_for_repo:
+             sources_for_repo = list(log_sources.keys())
+
+         found, line = scan_startup_failure(workspace_dir, sources_for_repo)
+         result["startup_failure"] = found
+         result["startup_failure_line"] = line
+
+         if found:
+             # Startup crash overrides any maintenance state — app needs a fix
+             result["action"] = "fix"
+             result["message"] = f"{health.message} — startup failure found in logs"
+             if store and state:
+                 store.clear_health_state(repo.repo_name)
+             logger.warning(
+                 "health_checker: %s stopped + startup error → fix needed: %s",
+                 repo.repo_name, line[:120],
+             )
+             results.append(result)
+             continue
+
+         # No startup errors. Apply maintenance state machine.
+         if state is None:
+             # First time we've seen this — ask once, set pending
+             result["action"] = "alert_once"
+             result["message"] = health.message
+             if store:
+                 store.set_health_state(repo.repo_name, "pending")
+             logger.info(
+                 "health_checker: %s stopped with no startup errors — alerting once, "
+                 "set state=pending",
+                 repo.repo_name,
+             )
+         else:
+             # pending or confirmed — stay silent
+             result["action"] = "none"
+             logger.debug(
+                 "health_checker: %s still stopped (%s since %s) — silent",
+                 repo.repo_name, state["status"], state["since"],
+             )
+
+         results.append(result)
+
+     return results
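
The maintenance state machine in the module docstring reduces to a small decision function over the health status, the startup-failure scan, and the stored per-repo state. The following is a self-contained sketch of that decision logic only; the names are illustrative, and the released implementation is evaluate_repos together with the StateStore methods it calls (get/set/clear_health_state).

states: dict[str, str] = {}  # repo_name -> "pending" | "confirmed" (absent means "not stored")

def on_health(repo: str, status: str, startup_failure: bool) -> str:
    """Mirror of the action evaluate_repos would report for one repo."""
    if status == "healthy":
        # Recovery clears any stored state and notifies once
        return "recovered" if states.pop(repo, None) else "none"
    if status == "unknown":
        return "none"
    if startup_failure:
        # A crash restart overrides maintenance: the app needs a fix
        states.pop(repo, None)
        return "fix"
    if repo not in states:
        # First clean stop: ask the human once, then go quiet
        states[repo] = "pending"
        return "alert_once"
    return "none"  # pending/confirmed: silent monitoring

assert on_health("billing-api", "stopped", False) == "alert_once"
assert on_health("billing-api", "stopped", False) == "none"
assert on_health("billing-api", "healthy", False) == "recovered"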
@@ -0,0 +1,164 @@
+ """
+ log_syncer.py — Periodically rsync log files from remote SSH servers to local workspace.
+
+ Synced location: workspace/synced/<source-name>/<log-filename>-node<N>
+
+ Uses rsync --append-verify (delta sync — only new bytes transferred).
+ Full log history is preserved locally for instant grep without SSH.
+ """
+ import logging
+ import shutil
+ import subprocess
+ from pathlib import Path
+
+ from .config_loader import LogSourceConfig, SentinelConfig
+
+ logger = logging.getLogger(__name__)
+
+ RSYNC_TIMEOUT = 120  # seconds per file
+
+
+ def _resolve_key(key: str, config_dir: Path) -> str | None:
+     import os
+     for candidate in [
+         config_dir / key,
+         Path(os.path.expanduser("~/.ssh")) / key,
+         Path(key),
+     ]:
+         if Path(candidate).exists():
+             return str(candidate)
+     return None
+
+
+ def sync_source(source: LogSourceConfig, cfg: SentinelConfig, config_dir: Path) -> dict:
+     """
+     rsync all log files for one SSH source from all hosts.
+     Returns {"synced": N, "failed": N, "skipped": N}.
+     """
+     if source.source_type != "ssh":
+         return {"synced": 0, "failed": 0, "skipped": 1}
+     if not getattr(source, "sync_enabled", True):
+         return {"synced": 0, "failed": 0, "skipped": 1}
+     if not shutil.which("rsync"):
+         logger.warning("log_syncer: rsync not found — install rsync to enable local sync")
+         return {"synced": 0, "failed": 0, "skipped": 1}
+
+     key_path = _resolve_key(source.key, config_dir)
+     if not key_path:
+         logger.warning("log_syncer: SSH key not found for %s: %s", source.name, source.key)
+         return {"synced": 0, "failed": 1, "skipped": 0}
+
+     dest_dir = Path(cfg.workspace_dir) / "synced" / source.name
+     dest_dir.mkdir(parents=True, exist_ok=True)
+
+     counts = {"synced": 0, "failed": 0, "skipped": 0}
+     for node_idx, raw_host in enumerate(source.hosts):
+         raw_host = raw_host.strip()
+         if "@" in raw_host:
+             ssh_user, ssh_host = raw_host.split("@", 1)
+         else:
+             ssh_user = "ec2-user"
+             ssh_host = raw_host
+
+         for log_path in source.logs:
+             log_path = log_path.strip()
+             dest_file = dest_dir / f"{Path(log_path).name}-node{node_idx}"
+             ssh_opts = (
+                 f"ssh -i {key_path} "
+                 "-o StrictHostKeyChecking=no "
+                 "-o ConnectTimeout=15 "
+                 "-o BatchMode=yes"
+             )
+             cmd = [
+                 "rsync", "--append-verify", "-az", "--timeout=30",
+                 "-e", ssh_opts,
+                 f"{ssh_user}@{ssh_host}:{log_path}",
+                 str(dest_file),
+             ]
+             try:
+                 r = subprocess.run(
+                     cmd, capture_output=True, text=True, timeout=RSYNC_TIMEOUT
+                 )
+                 if r.returncode == 0:
+                     counts["synced"] += 1
+                     logger.debug("Synced %s@%s:%s → %s", ssh_user, ssh_host, log_path, dest_file.name)
+                 else:
+                     counts["failed"] += 1
+                     logger.warning(
+                         "Sync failed %s@%s:%s (rc=%d): %s",
+                         ssh_user, ssh_host, log_path, r.returncode, r.stderr.strip()[:200],
+                     )
+             except subprocess.TimeoutExpired:
+                 counts["failed"] += 1
+                 logger.warning("Sync timed out: %s@%s:%s", ssh_user, ssh_host, log_path)
+             except Exception as e:
+                 counts["failed"] += 1
+                 logger.warning("Sync error %s@%s:%s: %s", ssh_user, ssh_host, log_path, e)
+
+     return counts
+
+
+ def prune_synced_logs(cfg: SentinelConfig) -> dict:
+     """
+     Prune synced log files:
+     - Delete files not modified for sync_retention_days days (stale)
+     - Truncate files exceeding sync_max_file_mb MB (drop oldest half of lines)
+     """
+     import time
+     synced_base = Path(cfg.workspace_dir) / "synced"
+     if not synced_base.exists():
+         return {"deleted": 0, "truncated": 0}
+     retention_secs = cfg.sync_retention_days * 86400
+     max_bytes = cfg.sync_max_file_mb * 1024 * 1024
+     now = time.time()
+     deleted = truncated = 0
+     for log_file in synced_base.rglob("*"):
+         if not log_file.is_file():
+             continue
+         try:
+             age = now - log_file.stat().st_mtime
+             if age > retention_secs:
+                 log_file.unlink()
+                 deleted += 1
+                 logger.info(
+                     "log_syncer: pruned stale file %s (%.1f days old)",
+                     log_file.name, age / 86400,
+                 )
+                 continue
+             if log_file.stat().st_size > max_bytes:
+                 lines = log_file.read_text(encoding="utf-8", errors="replace").splitlines()
+                 keep = lines[len(lines) // 2:]
+                 log_file.write_text("\n".join(keep) + "\n", encoding="utf-8")
+                 truncated += 1
+                 logger.info(
+                     "log_syncer: trimmed %s — dropped %d lines (exceeded %dMB cap)",
+                     log_file.name, len(lines) - len(keep), cfg.sync_max_file_mb,
+                 )
+         except Exception as e:
+             logger.warning("log_syncer: prune error for %s: %s", log_file, e)
+     if deleted or truncated:
+         logger.info("log_syncer: pruning done — %d deleted, %d truncated", deleted, truncated)
+     return {"deleted": deleted, "truncated": truncated}
+
+
+ def sync_all(log_sources: dict, cfg: SentinelConfig, config_dir: Path) -> dict:
+     """Sync all SSH log sources, then prune old/oversized files. Returns summary dict."""
+     total = {"synced": 0, "failed": 0, "skipped": 0}
+     for source in log_sources.values():
+         r = sync_source(source, cfg, config_dir)
+         for k in total:
+             total[k] += r.get(k, 0)
+     logger.info(
+         "Log sync complete — %d file(s) synced, %d failed, %d skipped",
+         total["synced"], total["failed"], total["skipped"],
+     )
+     prune_synced_logs(cfg)
+     return total
+
+
+ def get_synced_files(source_name: str, workspace_dir: str) -> list[Path]:
+     """Return list of synced log files for a given source name, or [] if none."""
+     d = Path(workspace_dir) / "synced" / source_name
+     if not d.exists():
+         return []
+     return sorted(d.glob("*"))
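
Between them, the two new modules expose a small surface to the rest of the package: sync_all pulls the remote logs, get_synced_files lists what has landed locally, and health_checker.scan_startup_failure greps the tails. A rough usage sketch follows; the import path, workspace path, and source name are placeholders, not values from this release.

from sentinel.log_syncer import get_synced_files        # import path assumed
from sentinel.health_checker import scan_startup_failure

WORKSPACE = "/var/sentinel/workspace"                    # placeholder workspace_dir
SOURCE = "billing-api-logs"                              # placeholder log source name

# Files land under <workspace_dir>/synced/<source-name>/<log-filename>-node<N>
for f in get_synced_files(SOURCE, WORKSPACE):
    print(f.name)                                        # e.g. app.log-node0, app.log-node1

# Scan the most recent ~200KB of each synced file for crash signatures
found, first_line = scan_startup_failure(WORKSPACE, [SOURCE])
if found:
    print("startup failure detected:", first_line)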
@@ -28,6 +28,7 @@ from .log_parser import parse_all, scan_all_for_markers, ErrorEvent
  from .issue_watcher import scan_issues, mark_done, IssueEvent
  from .repo_router import route
  from .reporter import build_and_send, send_fix_notification, send_failure_notification, send_confirmed_notification, send_regression_notification, send_startup_notification, send_upgrade_notification
+ from .health_checker import evaluate_repos
  from .state_store import StateStore

  logging.basicConfig(
@@ -299,6 +300,48 @@ async def poll_cycle(cfg_loader: ConfigLoader, store: StateStore):
          return_exceptions=True,
      )

+
+     # -- Health URL checks -------------------------------------------------------
+     if cfg_loader.repos:
+         health_results = evaluate_repos(
+             cfg_loader.repos, cfg_loader.log_sources, cfg_loader.sentinel.workspace_dir,
+             store=store,
+         )
+         for hr in health_results:
+             if hr["action"] == "fix":
+                 fp = f"health-{hr['repo_name']}"
+                 store.record_error(fp, f"health_checker/{hr['repo_name']}", hr["message"])
+                 if not store.fix_attempted_recently(fp, hours=6):
+                     synth = ErrorEvent(
+                         source=f"health_checker/{hr['repo_name']}",
+                         severity="ERROR",
+                         message=f"App startup failure: {hr['message']}",
+                         raw_lines=[hr["startup_failure_line"]],
+                         timestamp=None,
+                     )
+                     synth.fingerprint = fp
+                     await _handle_error(synth, cfg_loader, store)
+             elif hr["action"] == "alert_once":
+                 from .notify import slack_alert
+                 slack_alert(
+                     cfg_loader.sentinel.slack_bot_token,
+                     cfg_loader.sentinel.slack_channel,
+                     (
+                         f":question: *{hr['repo_name']}* health returned {hr['message']}"
+                         " with no startup errors in logs.\n"
+                         "If this is deliberate maintenance, tell Boss: "
+                         f"`maintenance {hr['repo_name']}` \n"
+                         "I'll silently monitor until it's back online."
+                     ),
+                 )
+             elif hr["action"] == "recovered":
+                 from .notify import slack_alert
+                 slack_alert(
+                     cfg_loader.sentinel.slack_bot_token,
+                     cfg_loader.sentinel.slack_channel,
+                     f":white_check_mark: *{hr['repo_name']}* is back online.",
+                 )
+
      if cfg_loader.sentinel.send_health and (_report_requested or _report_due(cfg_loader, store)):
          _report_requested = False
          logger.info("Sending health digest...")
@@ -539,6 +582,23 @@ async def _upgrade_check_loop(cfg_loader: ConfigLoader):
          await asyncio.sleep(cfg_loader.sentinel.upgrade_check_hours * 3600)


+ async def _sync_loop(cfg_loader: ConfigLoader):
+     """Background task: rsync remote logs to local workspace/synced/."""
+     from .log_syncer import sync_all
+     # Short initial delay — let startup checks finish first
+     await asyncio.sleep(30)
+     while True:
+         try:
+             sync_all(
+                 cfg_loader.log_sources,
+                 cfg_loader.sentinel,
+                 cfg_loader.config_dir,
+             )
+         except Exception as e:
+             logger.warning("Log sync loop error: %s", e)
+         await asyncio.sleep(cfg_loader.sentinel.sync_interval_seconds)
+
+
  # ── Entry point ──────────────────────────────────────────────────────────────────────────────────

  def _log_auth_status(cfg: SentinelConfig) -> None:
@@ -605,6 +665,8 @@ async def run_loop(cfg_loader: ConfigLoader, store: StateStore):
      asyncio.ensure_future(_config_poll_loop(cfg_loader))
      if cfg_loader.sentinel.auto_upgrade:
          asyncio.ensure_future(_upgrade_check_loop(cfg_loader))
+     if cfg_loader.sentinel.sync_enabled:
+         asyncio.ensure_future(_sync_loop(cfg_loader))
      if cfg_loader.sentinel.slack_bot_token:
          from .slack_bot import run_slack_bot
          asyncio.ensure_future(run_slack_bot(cfg_loader, store))
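
For reference, the new code paths read a handful of settings from SentinelConfig (workspace_dir, sync_enabled, sync_interval_seconds, sync_retention_days, sync_max_file_mb, plus the existing slack_bot_token and slack_channel) and a per-repo HEALTH_URL. A hypothetical settings sketch follows; the key names are taken from the attribute accesses in this diff, while the values and the actual configuration format (defined in config_loader, which is not part of this diff) are assumptions.

sentinel_settings = {
    "workspace_dir": "/var/sentinel/workspace",  # synced logs land under <workspace_dir>/synced/
    "sync_enabled": True,                        # gates the _sync_loop background task in run_loop()
    "sync_interval_seconds": 300,                # sleep between sync_all() passes
    "sync_retention_days": 14,                   # prune_synced_logs() deletes files older than this
    "sync_max_file_mb": 50,                      # oversized files are trimmed to their newest half
    "slack_bot_token": "xoxb-placeholder",       # used by slack_alert() for the health notices
    "slack_channel": "#ops-alerts",
}

repo_settings = {
    "billing-api": {"HEALTH_URL": "https://billing.example.com/health"},  # polled by check_health()
}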