@misterhuydo/sentinel 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/sentinel.js +39 -0
- package/lib/add.js +57 -0
- package/lib/generate.js +111 -0
- package/lib/init.js +206 -0
- package/package.json +21 -0
- package/python/requirements.txt +5 -0
- package/python/sentinel/__init__.py +0 -0
- package/python/sentinel/cairn_client.py +45 -0
- package/python/sentinel/cicd_trigger.py +66 -0
- package/python/sentinel/config_loader.py +174 -0
- package/python/sentinel/fix_engine.py +123 -0
- package/python/sentinel/git_manager.py +227 -0
- package/python/sentinel/log_fetcher.py +200 -0
- package/python/sentinel/log_parser.py +149 -0
- package/python/sentinel/main.py +223 -0
- package/python/sentinel/repo_router.py +24 -0
- package/python/sentinel/reporter.py +173 -0
- package/python/sentinel/state_store.py +164 -0
- package/templates/log-configs/_example.properties +47 -0
- package/templates/repo-configs/_example.properties +37 -0
- package/templates/sentinel.properties +31 -0
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
"""
|
|
2
|
+
log_fetcher.py — Fetch logs from SSH servers and Cloudflare workers.
|
|
3
|
+
|
|
4
|
+
Each source maintains a single rolling log file:
|
|
5
|
+
workspace/fetched/<source-name>.log
|
|
6
|
+
|
|
7
|
+
New content is appended each poll cycle; lines older than
|
|
8
|
+
LOG_RETENTION_HOURS are pruned so Claude always has a meaningful
|
|
9
|
+
window of history without unbounded disk growth.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import asyncio
|
|
13
|
+
import logging
|
|
14
|
+
import os
|
|
15
|
+
import re
|
|
16
|
+
import subprocess
|
|
17
|
+
from datetime import datetime, timedelta, timezone
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
|
|
20
|
+
import requests
|
|
21
|
+
|
|
22
|
+
from .config_loader import LogSourceConfig, SentinelConfig
|
|
23
|
+
|
|
24
|
+
# Module-level logger named after this module's import path.
logger = logging.getLogger(__name__)

# Directory two levels above this file; helper shell scripts are looked up in
# its "scripts" sub-directory.
_PACKAGE_PARENT = Path(__file__).parent.parent
SCRIPTS_DIR = _PACKAGE_PARENT / "scripts"
# Shell script that _fetch_ssh runs once per SSH log source.
FETCH_LOG_SH = SCRIPTS_DIR.joinpath("fetch_log.sh")
|
|
28
|
+
|
|
29
|
+
# Matches common Java/Spring log timestamps at the start of a line:
|
|
30
|
+
# 2026-03-21 01:36:56,173 or 2026-03-21T01:36:56.173Z
|
|
31
|
+
_TS_RE = re.compile(r"^(\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2})")
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _parse_line_ts(line):
|
|
35
|
+
m = _TS_RE.match(line)
|
|
36
|
+
if not m:
|
|
37
|
+
return None
|
|
38
|
+
try:
|
|
39
|
+
return datetime.strptime(m.group(1), "%Y-%m-%dT%H:%M:%S").replace(tzinfo=timezone.utc)
|
|
40
|
+
except ValueError:
|
|
41
|
+
pass
|
|
42
|
+
try:
|
|
43
|
+
return datetime.strptime(m.group(1), "%Y-%m-%d %H:%M:%S").replace(tzinfo=timezone.utc)
|
|
44
|
+
except ValueError:
|
|
45
|
+
return None
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _prune_lines(lines, cutoff):
    """
    Return the lines whose owning timestamp is at or after *cutoff*.

    A line that starts with a parseable timestamp decides its own fate and
    that of the untimestamped lines that follow it (stack traces, wrapped
    messages). Untimestamped lines before the first timestamped line are kept.
    """
    survivors = []
    retaining = True  # lines before any timestamp have no owner, so keep them
    for entry in lines:
        stamp = _parse_line_ts(entry)
        if stamp is not None:
            retaining = stamp >= cutoff
        if not retaining:
            continue
        survivors.append(entry)
    return survivors
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _rolling_update(rolling_path, new_content, retention_hours):
    """Merge *new_content* into the rolling log file and drop expired lines.

    The current file content (if the file exists) and the stripped new content
    are concatenated, pruned with _prune_lines against "now minus
    retention_hours" in UTC, and the file is rewritten in full with a trailing
    newline.
    """
    if rolling_path.exists():
        previous = rolling_path.read_text(encoding="utf-8", errors="replace")
    else:
        previous = ""

    fresh = new_content.strip()
    if previous:
        # Exactly one newline between the old tail and the new head.
        merged = previous.rstrip("\n") + "\n" + fresh
    else:
        merged = fresh
    all_lines = merged.splitlines()

    oldest_allowed = datetime.now(timezone.utc) - timedelta(hours=retention_hours)
    retained = _prune_lines(all_lines, oldest_allowed)

    rolling_path.write_text("\n".join(retained) + "\n", encoding="utf-8")
    logger.debug(
        "Rolling log %s: %d -> %d lines (kept %dh window)",
        rolling_path.name, len(all_lines), len(retained), retention_hours,
    )
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
# -- Public API ----------------------------------------------------------------
|
|
85
|
+
|
|
86
|
+
async def fetch_all(sources, sentinel_cfg):
    """Fetch every log source concurrently.

    Args:
        sources: iterable of log-source configs; each must expose ``name``
            plus whatever ``_fetch_one`` reads from it.
        sentinel_cfg: global configuration, forwarded unchanged to
            ``_fetch_one``.

    Returns:
        dict mapping each source name to the value ``_fetch_one`` returned for
        it, or to ``[]`` when that fetch raised or was cancelled.
    """
    # Materialise once: the sources are walked twice (task creation and result
    # pairing), which would silently yield nothing for a one-shot iterator.
    sources = list(sources)
    results = await asyncio.gather(
        *(_fetch_one(src, sentinel_cfg) for src in sources),
        return_exceptions=True,
    )

    output = {}
    for src, result in zip(sources, results):
        # BaseException, not Exception: a cancelled fetch comes back as
        # asyncio.CancelledError, which does not derive from Exception and
        # would otherwise be stored as if it were the list of fetched files.
        if isinstance(result, BaseException):
            logger.error("Fetch failed for %s: %s", src.name, result)
            output[src.name] = []
        else:
            output[src.name] = result
    return output
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
async def _fetch_one(src, cfg):
    """Run the blocking fetcher matching ``src.source_type`` in the default executor.

    Returns whatever the selected fetcher (_fetch_ssh or _fetch_cloudflare)
    returns.

    Raises:
        ValueError: ``src.source_type`` is neither "ssh" nor "cloudflare".
    """
    loop = asyncio.get_running_loop()
    kind = src.source_type
    if kind not in ("ssh", "cloudflare"):
        raise ValueError(f"Unknown SOURCE_TYPE '{src.source_type}' for {src.name}")

    worker = _fetch_ssh if kind == "ssh" else _fetch_cloudflare
    return await loop.run_in_executor(None, worker, src, cfg)
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
# -- SSH -----------------------------------------------------------------------
|
|
110
|
+
|
|
111
|
+
def _find_props_file(src):
|
|
112
|
+
candidates = [
|
|
113
|
+
Path("config/log-configs") / f"{src.name}.properties",
|
|
114
|
+
Path(__file__).parent.parent / "config" / "log-configs" / f"{src.name}.properties",
|
|
115
|
+
]
|
|
116
|
+
for p in candidates:
|
|
117
|
+
if p.exists():
|
|
118
|
+
return p
|
|
119
|
+
return None
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def _fetch_ssh(src, cfg):
    """Fetch logs for an SSH source via fetch_log.sh and fold them into its rolling log.

    The script runs with OUTPUT_DIR set to a per-source scratch directory.
    Every ``*.log`` file found there afterwards is read in sorted path order
    and deleted, and the collected lines are merged into
    ``<cfg.workspace_dir>/fetched/<src.name>.log``.

    Returns:
        A one-element list holding the rolling log path, or ``[]`` when nothing
        was fetched and no rolling log exists yet.

    Raises:
        FileNotFoundError: no properties file could be located for ``src``.
        subprocess.TimeoutExpired: the script ran longer than 120 seconds.
    """
    props_file = _find_props_file(src)
    if not props_file:
        raise FileNotFoundError(f"Properties file not found for {src.name}")

    fetched_root = Path(cfg.workspace_dir) / "fetched"
    fetched_root.mkdir(parents=True, exist_ok=True)

    scratch = fetched_root / f"_tmp_{src.name}"
    scratch.mkdir(exist_ok=True)

    child_env = dict(os.environ)
    child_env["OUTPUT_DIR"] = str(scratch)
    proc = subprocess.run(
        ["bash", str(FETCH_LOG_SH), str(props_file)],
        env=child_env,
        capture_output=True,
        text=True,
        timeout=120,
    )
    if proc.returncode != 0:
        # Not fatal: whatever the script managed to write is still collected.
        logger.warning("fetch_log.sh stderr for %s:\n%s", src.name, proc.stderr)

    new_lines = []
    for host_log in sorted(scratch.rglob("*.log")):
        new_lines += host_log.read_text(encoding="utf-8", errors="replace").splitlines()
        host_log.unlink()

    def _remove_if_empty(directory):
        # rmdir refuses non-empty directories; those are simply left in place.
        try:
            directory.rmdir()
        except OSError:
            pass

    # Reverse-sorted order visits children before their parents.
    for entry in sorted(scratch.rglob("*"), reverse=True):
        if entry.is_dir():
            _remove_if_empty(entry)
    _remove_if_empty(scratch)

    rolling_path = fetched_root / f"{src.name}.log"
    if new_lines:
        _rolling_update(rolling_path, "\n".join(new_lines), cfg.log_retention_hours)
        logger.info("SSH fetch %s: %d new lines -> %s", src.name, len(new_lines), rolling_path)
        return [rolling_path]

    logger.info("SSH fetch %s: no lines fetched", src.name)
    return [rolling_path] if rolling_path.exists() else []
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
# -- Cloudflare ----------------------------------------------------------------
|
|
169
|
+
|
|
170
|
+
def _fetch_cloudflare(src, cfg):
    """Download logs from a Cloudflare worker endpoint into the source's rolling log.

    The endpoint is requested with a Bearer token. JSON responses are read as
    pages: lines come from the ``lines`` or ``logs`` key and the next page is
    requested with the ``next_cursor`` (or ``cursor``) value. Any other content
    type is treated as a single plain-text page.

    Returns:
        A one-element list holding ``<cfg.workspace_dir>/fetched/<src.name>.log``.

    Raises:
        ValueError: ``src.cf_url`` or ``src.cf_token`` is missing.
        requests.HTTPError: a page request returned an error status.
    """
    if not src.cf_url or not src.cf_token:
        raise ValueError(f"CF_URL or CF_TOKEN missing for {src.name}")

    workspace = Path(cfg.workspace_dir) / "fetched"
    workspace.mkdir(parents=True, exist_ok=True)

    headers = {"Authorization": f"Bearer {src.cf_token}"}
    lines = []
    cursor = None
    seen_cursors = set()

    while True:
        params = {"cursor": cursor} if cursor else {}
        resp = requests.get(src.cf_url, headers=headers, params=params, timeout=30)
        resp.raise_for_status()

        # Media types are case-insensitive, so compare in lower case.
        content_type = resp.headers.get("content-type", "").lower()
        if not content_type.startswith("application/json"):
            lines.extend(resp.text.splitlines())
            break

        data = resp.json()
        batch = data.get("lines") or data.get("logs") or []
        lines.extend(str(entry) for entry in batch)

        cursor = data.get("next_cursor") or data.get("cursor")
        if not cursor:
            break

        # An endpoint that echoes the cursor it was given (or cycles between
        # cursors) would otherwise keep this loop running forever.
        marker = repr(cursor)
        if marker in seen_cursors:
            logger.warning(
                "Cloudflare fetch %s: cursor %s repeated, stopping pagination",
                src.name, marker,
            )
            break
        seen_cursors.add(marker)

    rolling_path = workspace / f"{src.name}.log"
    _rolling_update(rolling_path, "\n".join(lines), cfg.log_retention_hours)
    logger.info("Cloudflare fetch %s: %d new lines -> %s", src.name, len(lines), rolling_path)
    return [rolling_path]
|
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
"""
|
|
2
|
+
log_parser.py — Parse fetched log files into ErrorEvent objects.
|
|
3
|
+
|
|
4
|
+
Handles Java-style logs (Spring Boot / Logback format):
|
|
5
|
+
2024-01-15 12:34:56.789 ERROR [thread] class.ClassName - Message
|
|
6
|
+
followed by optional stack trace lines (^\tat ...)
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import hashlib
|
|
10
|
+
import re
|
|
11
|
+
import logging
|
|
12
|
+
from dataclasses import dataclass, field
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)

# First line of a log record. Adjacent raw strings are concatenated into a
# single pattern.
_LOG_HEADER = re.compile(
    r"^(?P<ts>\d{4}-\d{2}-\d{2}[\sT]\d{2}:\d{2}:\d{2}[.,\d]*)\s+"  # timestamp
    r"(?P<level>CRITICAL|ERROR|WARN(?:ING)?|INFO|DEBUG)\s+"  # level keyword
    r"(?:\[(?P<thread>[^\]]*)\]\s+)?"  # optional [thread]
    r"(?P<logger>\S+)\s+-\s+"  # logger name, then " - "
    r"(?P<message>.+)$"  # rest of the line
)

# Lines that belong to a stack trace following a header line.
_STACK_LINE = re.compile(
    "|".join(
        (
            r"^\s+at ",  # indented frame
            r"\s+\.\.\. \d+ more",  # elided frames
            r"^Caused by:",  # chained cause
        )
    )
)

# Level keyword as matched by _LOG_HEADER -> level stored on the event
# ("WARNING" is folded into "WARN").
SEVERITY_MAP = dict(
    CRITICAL="CRITICAL",
    ERROR="ERROR",
    WARN="WARN",
    WARNING="WARN",
    INFO="INFO",
    DEBUG="DEBUG",
)

# Substrings that make ErrorEvent.severity report "CRITICAL".
_CRITICAL_PATTERNS = re.compile(
    "|".join(("OutOfMemoryError", "StackOverflowError", "OOMKilled")),
    re.IGNORECASE,
)
# Substrings that make ErrorEvent.is_infra_issue true.
_INFRA_PATTERNS = re.compile(
    "|".join(("ConnectException", "TimeoutException", "ConnectionRefused", "SocketTimeout")),
    re.IGNORECASE,
)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@dataclass
class ErrorEvent:
    """One parsed log record together with the stack-trace lines that followed it."""

    source: str  # log-source name (e.g. "SSOLWA")
    log_file: str  # path of the file the record was read from
    timestamp: str  # timestamp text as captured from the log line
    level: str  # CRITICAL / ERROR / WARN
    thread: str
    logger_name: str
    message: str
    stack_trace: list[str] = field(default_factory=list)
    fingerprint: str = ""  # filled in from message + stack when left empty

    def __post_init__(self):
        # A caller-supplied fingerprint always wins.
        if self.fingerprint:
            return
        self.fingerprint = _fingerprint(self.message, self.stack_trace)

    @property
    def severity(self) -> str:
        """Return "CRITICAL" when the message or stack trace matches
        _CRITICAL_PATTERNS, otherwise the recorded level."""
        if _CRITICAL_PATTERNS.search(self.message):
            return "CRITICAL"
        if _CRITICAL_PATTERNS.search("\n".join(self.stack_trace)):
            return "CRITICAL"
        return self.level

    @property
    def is_infra_issue(self) -> bool:
        """True when the message (not the stack trace) matches _INFRA_PATTERNS."""
        return _INFRA_PATTERNS.search(self.message) is not None

    def short_summary(self) -> str:
        """Return at most the first 120 characters of the message."""
        return self.message[:120]

    def full_text(self) -> str:
        """Rebuild the record as text: one header line, then the stack-trace lines."""
        header = f"{self.timestamp} {self.level} [{self.thread}] {self.logger_name} - {self.message}"
        return "\n".join([header, *self.stack_trace])
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _normalize_message(msg: str) -> str:
|
|
83
|
+
msg = re.sub(r"0x[0-9a-fA-F]+", "0xADDR", msg)
|
|
84
|
+
msg = re.sub(r"\b[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\b", "UUID", msg)
|
|
85
|
+
msg = re.sub(r"\b\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}[.,\d]*\b", "TIMESTAMP", msg)
|
|
86
|
+
msg = re.sub(r"\b\d+\b", "N", msg)
|
|
87
|
+
return msg.strip()
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def _fingerprint(message: str, stack_trace: list[str]) -> str:
    """Return a 16-hex-character identifier for an error.

    The identifier is the start of the SHA-1 digest of the normalised message
    followed by the first three stack lines that begin with "at " once
    stripped.
    """

    def is_frame(row):
        return row.strip().startswith("at ")

    leading_frames = list(filter(is_frame, stack_trace))[:3]
    payload = _normalize_message(message) + "\n" + "\n".join(leading_frames)
    digest = hashlib.sha1(payload.encode()).hexdigest()
    return digest[:16]
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def parse_log_file(path: Path, source_name: str) -> list[ErrorEvent]:
    """Read *path* and return an ErrorEvent for each WARN/ERROR/CRITICAL record.

    A record starts at a line matching _LOG_HEADER. Following lines that match
    _STACK_LINE are attached as its stack trace, and all other lines are
    ignored. Returns ``[]`` (after logging an error) when the file cannot be
    read.
    """
    try:
        text = path.read_text(encoding="utf-8", errors="replace")
    except OSError as e:
        logger.error("Cannot read %s: %s", path, e)
        return []

    events: list[ErrorEvent] = []

    def emit(header, stack):
        # Turn one completed record into an event, unless it is INFO/DEBUG.
        level = SEVERITY_MAP.get(header.group("level").upper(), "WARN")
        if level not in ("ERROR", "WARN", "CRITICAL"):
            return
        events.append(
            ErrorEvent(
                source=source_name,
                log_file=str(path),
                timestamp=header.group("ts"),
                level=level,
                thread=header.group("thread") or "",
                logger_name=header.group("logger"),
                message=header.group("message"),
                stack_trace=list(stack),
            )
        )

    pending = None  # header match of the record currently being collected
    pending_stack = []
    for raw in text.splitlines():
        header = _LOG_HEADER.match(raw)
        if header:
            # A new header closes the previous record.
            if pending is not None:
                emit(pending, pending_stack)
            pending, pending_stack = header, []
        elif pending and _STACK_LINE.match(raw):
            pending_stack.append(raw)

    # The last record has no following header to close it.
    if pending is not None:
        emit(pending, pending_stack)

    logger.debug("Parsed %s: %d error/warn events", path.name, len(events))
    return events
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def parse_all(
    fetched_files: dict[str, list[Path]],
    log_sources,  # dict[str, LogSourceConfig]
) -> list[ErrorEvent]:
    """Parse every fetched file of every source into one flat event list.

    Events keep the order of ``fetched_files`` and of each source's file list.
    ``log_sources`` is accepted but not used by this function.
    """
    collected: list[ErrorEvent] = []
    for source_name in fetched_files:
        for log_path in fetched_files[source_name]:
            collected += parse_log_file(log_path, source_name)
    return collected
|
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
"""
|
|
2
|
+
main.py — Sentinel entry point and watch loop.
|
|
3
|
+
|
|
4
|
+
Usage:
|
|
5
|
+
python -m sentinel.main # run watch loop
|
|
6
|
+
python -m sentinel.main --init # first-time setup
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import argparse
|
|
10
|
+
import asyncio
|
|
11
|
+
import logging
|
|
12
|
+
import signal
|
|
13
|
+
import subprocess
|
|
14
|
+
import sys
|
|
15
|
+
from datetime import datetime, timezone
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
|
|
18
|
+
from .cairn_client import ensure_installed as cairn_installed, index_repo
|
|
19
|
+
from .config_loader import ConfigLoader
|
|
20
|
+
from .fix_engine import generate_fix
|
|
21
|
+
from .git_manager import apply_and_commit, publish
|
|
22
|
+
from .cicd_trigger import trigger as cicd_trigger
|
|
23
|
+
from .log_fetcher import fetch_all
|
|
24
|
+
from .log_parser import parse_all, ErrorEvent
|
|
25
|
+
from .repo_router import route
|
|
26
|
+
from .reporter import build_and_send
|
|
27
|
+
from .state_store import StateStore
|
|
28
|
+
|
|
29
|
+
# logging.FileHandler opens its file when it is constructed, which happens
# here at import time, before main() has had a chance to create the directory.
# Without this, the first run in a fresh working directory fails with
# FileNotFoundError.
Path("logs").mkdir(parents=True, exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)-7s %(name)s — %(message)s",
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler("logs/sentinel.log", encoding="utf-8"),
    ],
)
logger = logging.getLogger("sentinel")

# Set by the SIGUSR1 handler; poll_cycle() sends a report and clears it.
_report_requested = False
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _on_sigusr1(*_):
    """Signal handler: raise the module flag that poll_cycle() checks to send a report.

    The arguments passed by the signal machinery are ignored.
    """
    global _report_requested
    _report_requested = True
    logger.info("SIGUSR1 received — health report queued")
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _register_signals():
    """Install _on_sigusr1 as the SIGUSR1 handler, if that is possible here."""
    try:
        signal.signal(signal.SIGUSR1, _on_sigusr1)
    except (OSError, AttributeError):
        # Best effort: e.g. the signal module has no SIGUSR1 attribute on this
        # platform. On-demand reports are then simply unavailable.
        return
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
# ── Fix pipeline ──────────────────────────────────────────────────────────────
|
|
56
|
+
|
|
57
|
+
async def _handle_error(event: ErrorEvent, cfg_loader: ConfigLoader, store: StateStore):
    """Run one event through the fix pipeline: route, gate, generate, commit, publish.

    Returns without doing anything when the event cannot be routed to a repo
    or a ``SENTINEL_PAUSE`` file exists in the working directory. Infra issues,
    CRITICAL events in auto-publish repos, and failed or skipped fix steps are
    recorded in *store* and end the pipeline early. An event whose fix was
    attempted in the last 24 hours is left alone.
    """
    sentinel = cfg_loader.sentinel

    repo = route(event, cfg_loader.repos)
    if not repo:
        return

    if Path("SENTINEL_PAUSE").exists():
        logger.info("SENTINEL_PAUSE present — fix activity halted")
        return

    fingerprint = event.fingerprint

    def note(outcome):
        # Record an outcome that carries no patch/commit details.
        store.record_fix(fingerprint, outcome, repo_name=repo.repo_name)

    if event.is_infra_issue:
        logger.info("Infra issue for %s — log only", fingerprint)
        note("skipped")
        return

    if event.severity == "CRITICAL" and repo.auto_publish:
        # No automatic fix here: report it so a person can look at it.
        logger.warning("CRITICAL in auto-publish repo '%s' — flagging for human review", repo.repo_name)
        note("skipped")
        build_and_send(sentinel, store)
        return

    if store.fix_attempted_recently(fingerprint, hours=24):
        logger.debug("Fix already attempted recently for %s", fingerprint)
        return

    patches_dir = Path(sentinel.workspace_dir) / "patches"
    status, patch_path = generate_fix(event, repo, sentinel, patches_dir)
    if status != "patch" or patch_path is None:
        note("skipped" if status == "skip" else "failed")
        return

    commit_status, commit_hash = apply_and_commit(event, patch_path, repo, sentinel)
    if commit_status != "committed":
        note("failed")
        return

    branch, pr_url = publish(event, repo, sentinel, commit_hash)
    auto_publish = repo.auto_publish
    store.record_fix(
        fingerprint,
        "applied" if auto_publish else "pending",
        patch_path=str(patch_path),
        commit_hash=commit_hash,
        branch=branch,
        pr_url=pr_url,
        repo_name=repo.repo_name,
    )

    if auto_publish:
        cicd_trigger(repo, store, fingerprint)
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
# ── Poll cycle ────────────────────────────────────────────────────────────────
|
|
111
|
+
|
|
112
|
+
async def poll_cycle(cfg_loader: ConfigLoader, store: StateStore):
    """Run one fetch -> parse -> fix -> report cycle.

    Every parsed event is recorded in *store*. Events with no recent fix
    attempt are handed to ``_handle_error``, at most once per
    (source, fingerprint) pair per cycle. A health report is sent afterwards
    when one was requested via SIGUSR1 or is due.

    Returns early, without reporting, when no log sources are configured.
    """
    global _report_requested

    sources = list(cfg_loader.log_sources.values())
    if not sources:
        logger.warning("No log-configs found")
        return

    logger.info("Fetching logs from %d source(s)...", len(sources))
    fetched = await fetch_all(sources, cfg_loader.sentinel)

    events = parse_all(fetched, cfg_loader.log_sources)
    logger.info("Parsed %d error/warn events", len(events))

    new_events = []
    queued = set()
    for event in events:
        store.record_error(event.fingerprint, event.source, event.message)
        # The same error usually appears many times in one log window; queue
        # it once so it cannot trigger repeated reports or fix attempts within
        # a single cycle.
        key = (event.source, event.fingerprint)
        if key in queued:
            continue
        if not store.fix_attempted_recently(event.fingerprint):
            queued.add(key)
            new_events.append(event)

    logger.info("%d new event(s) to process", len(new_events))
    outcomes = await asyncio.gather(
        *(_handle_error(e, cfg_loader, store) for e in new_events),
        return_exceptions=True,
    )
    # return_exceptions=True hands failures back as values; log them so a
    # broken fix pipeline does not go unnoticed.
    for event, outcome in zip(new_events, outcomes):
        if isinstance(outcome, BaseException):
            logger.error(
                "Fix pipeline failed for %s (source %s): %s",
                event.fingerprint, event.source, outcome,
                exc_info=outcome,
            )

    if _report_requested or _report_due(cfg_loader, store):
        _report_requested = False
        logger.info("Sending health report...")
        build_and_send(cfg_loader.sentinel, store)
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def _report_due(cfg_loader: ConfigLoader, store: StateStore) -> bool:
    """Tell whether the periodic health report should be sent now.

    True when the store has no previous report time, or when at least
    ``report_interval_hours`` have passed since it.
    """
    previous = store.last_report_time()
    if previous is None:
        return True

    # NOTE(review): assumes last_report_time() returns a timezone-aware
    # datetime; subtracting a naive one from an aware "now" raises TypeError.
    seconds_since = (datetime.now(timezone.utc) - previous).total_seconds()
    threshold = cfg_loader.sentinel.report_interval_hours * 3600
    return seconds_since >= threshold
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
# ── Init ──────────────────────────────────────────────────────────────────────
|
|
150
|
+
|
|
151
|
+
def run_init(cfg_loader: ConfigLoader):
    """First-time setup: check Cairn, clone and index repos, probe SSH, send a test email.

    Every step is best effort. A failure is logged and the remaining steps
    still run, so one unreachable host or repository does not abort the whole
    initialisation.
    """
    sentinel = cfg_loader.sentinel
    logger.info("=== Sentinel --init ===")

    if not cairn_installed():
        logger.error("Cairn not installed. Run: npm install -g @misterhuydo/cairn-mcp")

    for name, repo in cfg_loader.repos.items():
        local = Path(repo.local_path)
        if not local.exists():
            logger.info("Cloning %s → %s", repo.repo_url, repo.local_path)
            try:
                r = subprocess.run(["git", "clone", repo.repo_url, str(local)], capture_output=True, text=True)
            except OSError as e:
                # e.g. git is not on PATH; report it like any other clone failure.
                logger.error("Clone failed for %s: %s", name, e)
                continue
            if r.returncode != 0:
                logger.error("Clone failed for %s: %s", name, r.stderr)
                continue
        index_repo(repo)

    for src_name, src in cfg_loader.log_sources.items():
        if src.source_type == "ssh" and src.hosts:
            # Only the first configured host is probed.
            host = src.hosts[0]
            logger.info("Testing SSH to %s (%s)...", src_name, host)
            try:
                r = subprocess.run(
                    ["ssh", "-i", src.key, "-o", "StrictHostKeyChecking=no",
                     "-o", "ConnectTimeout=5", f"ec2-user@{host}", "echo ok"],
                    capture_output=True, text=True, timeout=15,
                )
            except subprocess.TimeoutExpired:
                # A hung probe must not stop the remaining hosts or the test email.
                verdict = "FAILED — timed out after 15s"
            except OSError as e:
                verdict = f"FAILED — {e}"
            else:
                verdict = "OK" if r.returncode == 0 else f"FAILED — {r.stderr.strip()}"
            logger.info(" SSH %s: %s", host, verdict)

    logger.info("Sending test email...")
    try:
        build_and_send(sentinel, StateStore(sentinel.state_db))
    except Exception as e:
        logger.error("Test email failed: %s", e)

    logger.info("=== Init complete ===")
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
# ── Entry point ───────────────────────────────────────────────────────────────
|
|
189
|
+
|
|
190
|
+
async def run_loop(cfg_loader: ConfigLoader, store: StateStore):
    """Run poll_cycle() forever, sleeping ``poll_interval_seconds`` after each cycle.

    An exception escaping a cycle is logged with its traceback, and the loop
    carries on with the next cycle after the usual sleep.
    """
    interval = cfg_loader.sentinel.poll_interval_seconds
    repo_names = list(cfg_loader.repos)
    logger.info("Sentinel starting — poll interval: %ds, repos: %s", interval, repo_names)

    while True:
        try:
            await poll_cycle(cfg_loader, store)
        except Exception as e:
            logger.exception("Unhandled error in poll cycle: %s", e)
        await asyncio.sleep(interval)
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def main():
    """CLI entry point: create working directories, load config, then run --init or the watch loop."""
    for needed in ("logs", "workspace/fetched", "workspace/patches"):
        Path(needed).mkdir(parents=True, exist_ok=True)

    parser = argparse.ArgumentParser(description="Sentinel — Autonomous DevOps Agent")
    parser.add_argument("--init", action="store_true", help="First-time setup")
    parser.add_argument("--config", default="./config", help="Config directory path")
    args = parser.parse_args()

    cfg_loader = ConfigLoader(config_dir=args.config)
    store = StateStore(cfg_loader.sentinel.state_db)
    _register_signals()

    if not args.init:
        asyncio.run(run_loop(cfg_loader, store))
        return

    run_init(cfg_loader)
|
|
218
|
+
|
|
219
|
+
asyncio.run(run_loop(cfg_loader, store))
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
if __name__ == "__main__":
|
|
223
|
+
main()
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
"""
|
|
2
|
+
repo_router.py — Map an ErrorEvent to its RepoConfig by filename stem.
|
|
3
|
+
|
|
4
|
+
Convention: log-configs/<name>.properties links to repo-configs/<name>.properties.
|
|
5
|
+
If no repo-config exists for a given log-config stem, the error is unroutable
|
|
6
|
+
and the fix is skipped.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import logging
|
|
10
|
+
from .log_parser import ErrorEvent
|
|
11
|
+
from .config_loader import RepoConfig
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def route(event: ErrorEvent, repos: dict[str, RepoConfig]) -> RepoConfig | None:
    """Look up the repo-config registered under the event's log-source name.

    Returns the matching RepoConfig. When there is none, logs a warning and
    returns None.
    """
    match = repos.get(event.source)
    if match is not None:
        return match

    logger.warning(
        "No repo-config found for log-source '%s' (fingerprint: %s) — skipping fix",
        event.source, event.fingerprint,
    )
    return None
|