@misterhuydo/sentinel 1.0.6 → 1.0.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/sentinel.js +42 -39
- package/lib/add.js +415 -57
- package/lib/generate.js +14 -23
- package/lib/init.js +21 -7
- package/package.json +1 -1
- package/python/sentinel/__pycache__/issue_watcher.cpython-313.pyc +0 -0
- package/python/sentinel/config_loader.py +15 -3
- package/python/sentinel/fix_engine.py +49 -14
- package/python/sentinel/issue_watcher.py +146 -131
- package/python/sentinel/log_parser.py +175 -149
- package/python/sentinel/main.py +110 -32
- package/python/sentinel/reporter.py +159 -0
- package/python/sentinel/state_store.py +275 -164
- package/templates/sentinel.properties +20 -32
- package/templates/workspace-sentinel.properties +20 -0
|
@@ -1,149 +1,175 @@
|
|
|
1
|
-
"""
|
|
2
|
-
log_parser.py — Parse fetched log files into ErrorEvent objects.
|
|
3
|
-
|
|
4
|
-
Handles Java-style logs (Spring Boot / Logback format):
|
|
5
|
-
2024-01-15 12:34:56.789 ERROR [thread] class.ClassName - Message
|
|
6
|
-
followed by optional stack trace lines (^\tat ...)
|
|
7
|
-
"""
|
|
8
|
-
|
|
9
|
-
import hashlib
|
|
10
|
-
import re
|
|
11
|
-
import logging
|
|
12
|
-
from dataclasses import dataclass, field
|
|
13
|
-
from pathlib import Path
|
|
14
|
-
|
|
15
|
-
logger = logging.getLogger(__name__)
|
|
16
|
-
|
|
17
|
-
_LOG_HEADER = re.compile(
|
|
18
|
-
r"^(?P<ts>\d{4}-\d{2}-\d{2}[\sT]\d{2}:\d{2}:\d{2}[.,\d]*)\s+"
|
|
19
|
-
r"(?P<level>CRITICAL|ERROR|WARN(?:ING)?|INFO|DEBUG)\s+"
|
|
20
|
-
r"(?:\[(?P<thread>[^\]]*)\]\s+)?"
|
|
21
|
-
r"(?P<logger>\S+)\s+-\s+"
|
|
22
|
-
r"(?P<message>.+)$"
|
|
23
|
-
)
|
|
24
|
-
|
|
25
|
-
_STACK_LINE = re.compile(r"^\s+at |\s+\.\.\. \d+ more|^Caused by:")
|
|
26
|
-
|
|
27
|
-
SEVERITY_MAP = {
|
|
28
|
-
"CRITICAL": "CRITICAL",
|
|
29
|
-
"ERROR": "ERROR",
|
|
30
|
-
"WARN": "WARN",
|
|
31
|
-
"WARNING": "WARN",
|
|
32
|
-
"INFO": "INFO",
|
|
33
|
-
"DEBUG": "DEBUG",
|
|
34
|
-
}
|
|
35
|
-
|
|
36
|
-
_CRITICAL_PATTERNS = re.compile(
|
|
37
|
-
r"OutOfMemoryError|StackOverflowError|OOMKilled", re.IGNORECASE
|
|
38
|
-
)
|
|
39
|
-
_INFRA_PATTERNS = re.compile(
|
|
40
|
-
r"ConnectException|TimeoutException|ConnectionRefused|SocketTimeout",
|
|
41
|
-
re.IGNORECASE,
|
|
42
|
-
)
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
@dataclass
|
|
46
|
-
class ErrorEvent:
|
|
47
|
-
source: str # log-source name (e.g. "SSOLWA")
|
|
48
|
-
log_file: str
|
|
49
|
-
timestamp: str
|
|
50
|
-
level: str # CRITICAL / ERROR / WARN
|
|
51
|
-
thread: str
|
|
52
|
-
logger_name: str
|
|
53
|
-
message: str
|
|
54
|
-
stack_trace: list[str] = field(default_factory=list)
|
|
55
|
-
fingerprint: str = ""
|
|
56
|
-
|
|
57
|
-
def __post_init__(self):
|
|
58
|
-
if not self.fingerprint:
|
|
59
|
-
self.fingerprint = _fingerprint(self.message, self.stack_trace)
|
|
60
|
-
|
|
61
|
-
@property
|
|
62
|
-
def severity(self) -> str:
|
|
63
|
-
if _CRITICAL_PATTERNS.search(self.message) or _CRITICAL_PATTERNS.search(
|
|
64
|
-
"\n".join(self.stack_trace)
|
|
65
|
-
):
|
|
66
|
-
return "CRITICAL"
|
|
67
|
-
return self.level
|
|
68
|
-
|
|
69
|
-
@property
|
|
70
|
-
def is_infra_issue(self) -> bool:
|
|
71
|
-
return bool(_INFRA_PATTERNS.search(self.message))
|
|
72
|
-
|
|
73
|
-
def short_summary(self) -> str:
|
|
74
|
-
return self.message[:120]
|
|
75
|
-
|
|
76
|
-
def full_text(self) -> str:
|
|
77
|
-
lines = [f"{self.timestamp} {self.level} [{self.thread}] {self.logger_name} - {self.message}"]
|
|
78
|
-
lines.extend(self.stack_trace)
|
|
79
|
-
return "\n".join(lines)
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
def _normalize_message(msg: str) -> str:
|
|
83
|
-
msg = re.sub(r"0x[0-9a-fA-F]+", "0xADDR", msg)
|
|
84
|
-
msg = re.sub(r"\b[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\b", "UUID", msg)
|
|
85
|
-
msg = re.sub(r"\b\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}[.,\d]*\b", "TIMESTAMP", msg)
|
|
86
|
-
msg = re.sub(r"\b\d+\b", "N", msg)
|
|
87
|
-
return msg.strip()
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
def _fingerprint(message: str, stack_trace: list[str]) -> str:
|
|
91
|
-
top_frames = [l for l in stack_trace if l.strip().startswith("at ")][:3]
|
|
92
|
-
raw = _normalize_message(message) + "\n" + "\n".join(top_frames)
|
|
93
|
-
return hashlib.sha1(raw.encode()).hexdigest()[:16]
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
def parse_log_file(path: Path, source_name: str) -> list[ErrorEvent]:
|
|
97
|
-
"""Parse a single log file and return all ERROR/WARN events."""
|
|
98
|
-
events: list[ErrorEvent] = []
|
|
99
|
-
current_header: re.Match | None = None
|
|
100
|
-
current_stack: list[str] = []
|
|
101
|
-
|
|
102
|
-
def flush():
|
|
103
|
-
if current_header is None:
|
|
104
|
-
return
|
|
105
|
-
level = SEVERITY_MAP.get(current_header.group("level").upper(), "WARN")
|
|
106
|
-
if level not in ("ERROR", "WARN", "CRITICAL"):
|
|
107
|
-
return
|
|
108
|
-
event = ErrorEvent(
|
|
109
|
-
source=source_name,
|
|
110
|
-
log_file=str(path),
|
|
111
|
-
timestamp=current_header.group("ts"),
|
|
112
|
-
level=level,
|
|
113
|
-
thread=current_header.group("thread") or "",
|
|
114
|
-
logger_name=current_header.group("logger"),
|
|
115
|
-
message=current_header.group("message"),
|
|
116
|
-
stack_trace=list(current_stack),
|
|
117
|
-
)
|
|
118
|
-
events.append(event)
|
|
119
|
-
|
|
120
|
-
try:
|
|
121
|
-
text = path.read_text(encoding="utf-8", errors="replace")
|
|
122
|
-
except OSError as e:
|
|
123
|
-
logger.error("Cannot read %s: %s", path, e)
|
|
124
|
-
return []
|
|
125
|
-
|
|
126
|
-
for line in text.splitlines():
|
|
127
|
-
m = _LOG_HEADER.match(line)
|
|
128
|
-
if m:
|
|
129
|
-
flush()
|
|
130
|
-
current_header = m
|
|
131
|
-
current_stack = []
|
|
132
|
-
elif current_header and _STACK_LINE.match(line):
|
|
133
|
-
current_stack.append(line)
|
|
134
|
-
|
|
135
|
-
flush()
|
|
136
|
-
logger.debug("Parsed %s: %d error/warn events", path.name, len(events))
|
|
137
|
-
return events
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
def parse_all(
|
|
141
|
-
fetched_files: dict[str, list[Path]],
|
|
142
|
-
log_sources, # dict[str, LogSourceConfig]
|
|
143
|
-
) -> list[ErrorEvent]:
|
|
144
|
-
"""Parse all fetched log files across all sources."""
|
|
145
|
-
all_events: list[ErrorEvent] = []
|
|
146
|
-
for source_name, files in fetched_files.items():
|
|
147
|
-
for f in files:
|
|
148
|
-
all_events.extend(parse_log_file(f, source_name))
|
|
149
|
-
return all_events
|
|
1
|
+
"""
|
|
2
|
+
log_parser.py — Parse fetched log files into ErrorEvent objects.
|
|
3
|
+
|
|
4
|
+
Handles Java-style logs (Spring Boot / Logback format):
|
|
5
|
+
2024-01-15 12:34:56.789 ERROR [thread] class.ClassName - Message
|
|
6
|
+
followed by optional stack trace lines (^\tat ...)
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import hashlib
|
|
10
|
+
import re
|
|
11
|
+
import logging
|
|
12
|
+
from dataclasses import dataclass, field
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
# One Logback/Spring Boot record header, matched against a single line.
# The pattern is assembled from adjacent literals, one per field.
_LOG_HEADER = re.compile(
    # timestamp: date, a space or "T", time, optional fractional part
    r"^(?P<ts>\d{4}-\d{2}-\d{2}[\sT]\d{2}:\d{2}:\d{2}[.,\d]*)\s+"
    # level keyword (WARN and WARNING are both accepted)
    r"(?P<level>CRITICAL|ERROR|WARN(?:ING)?|INFO|DEBUG)\s+"
    # optional "[thread-name]"
    r"(?:\[(?P<thread>[^\]]*)\]\s+)?"
    # logger name, then the " - " separator
    r"(?P<logger>\S+)\s+-\s+"
    # everything else on the line is the message
    r"(?P<message>.+)$"
)

# Lines that belong to a stack trace: indented "at ..." frames,
# "... N more" elisions, and "Caused by:" chain headers.
_STACK_LINE = re.compile(r"^\s+at |\s+\.\.\. \d+ more|^Caused by:")

# Maps the level keyword found in the log to the level name stored on events.
SEVERITY_MAP = dict(
    CRITICAL="CRITICAL",
    ERROR="ERROR",
    WARN="WARN",
    WARNING="WARN",
    INFO="INFO",
    DEBUG="DEBUG",
)

# Text that marks an event as CRITICAL (see ErrorEvent.severity).
_CRITICAL_PATTERNS = re.compile(
    r"OutOfMemoryError|StackOverflowError|OOMKilled",
    re.IGNORECASE,
)

# Text that marks an event as an infrastructure problem (see ErrorEvent.is_infra_issue).
_INFRA_PATTERNS = re.compile(
    r"ConnectException|TimeoutException|ConnectionRefused|SocketTimeout",
    re.IGNORECASE,
)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@dataclass
class ErrorEvent:
    """A single parsed log record together with its stack-trace lines.

    If no ``fingerprint`` is supplied, one is derived from the message and
    stack trace right after construction.
    """

    source: str  # log-source name (e.g. "SSOLWA")
    log_file: str
    timestamp: str
    level: str  # CRITICAL / ERROR / WARN
    thread: str
    logger_name: str
    message: str
    stack_trace: list[str] = field(default_factory=list)
    fingerprint: str = ""

    def __post_init__(self):
        # A caller-provided fingerprint wins; only compute one when it is empty.
        if self.fingerprint:
            return
        self.fingerprint = _fingerprint(self.message, self.stack_trace)

    @property
    def severity(self) -> str:
        """Return "CRITICAL" if the message or trace matches a critical pattern, else ``level``."""
        trace_text = "\n".join(self.stack_trace)
        for haystack in (self.message, trace_text):
            if _CRITICAL_PATTERNS.search(haystack):
                return "CRITICAL"
        return self.level

    @property
    def is_infra_issue(self) -> bool:
        """Return True when the message matches an infrastructure pattern.

        NOTE(review): only the message is inspected here, whereas ``severity``
        also inspects the stack trace — confirm whether that is intentional.
        """
        return _INFRA_PATTERNS.search(self.message) is not None

    def short_summary(self) -> str:
        """Return the first 120 characters of the message."""
        return self.message[:120]

    def full_text(self) -> str:
        """Return the header line followed by the stack-trace lines, newline-joined."""
        header = (
            f"{self.timestamp} {self.level} [{self.thread}] "
            f"{self.logger_name} - {self.message}"
        )
        return "\n".join([header, *self.stack_trace])
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
# Compiled once at import time instead of on every call.
_HEX_ADDR_RE = re.compile(r"0x[0-9a-fA-F]+")
# IGNORECASE: UUIDs are frequently logged in upper or mixed case; without it
# they survive normalization and every occurrence gets its own fingerprint.
_UUID_RE = re.compile(
    r"\b[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\b",
    re.IGNORECASE,
)
_TIMESTAMP_RE = re.compile(r"\b\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}[.,\d]*\b")
_NUMBER_RE = re.compile(r"\b\d+\b")


def _normalize_message(msg: str) -> str:
    """Replace volatile tokens in *msg* with fixed placeholders.

    Substitutions are applied in this order, so that the generic number rule
    does not break up the more specific tokens first:

    1. hex addresses (``0x...``)      -> ``0xADDR``
    2. UUIDs (any letter case)        -> ``UUID``
    3. ISO-like date-time stamps      -> ``TIMESTAMP``
    4. standalone runs of digits      -> ``N``

    Returns the result with leading and trailing whitespace stripped.
    """
    msg = _HEX_ADDR_RE.sub("0xADDR", msg)
    msg = _UUID_RE.sub("UUID", msg)
    msg = _TIMESTAMP_RE.sub("TIMESTAMP", msg)
    msg = _NUMBER_RE.sub("N", msg)
    return msg.strip()
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def _fingerprint(message: str, stack_trace: list[str]) -> str:
    """Return a 16-hex-character identifier for an error.

    The identifier is the first 16 hex digits of the SHA-1 of the normalized
    message, a newline, and the first three ``at ...`` frames of
    *stack_trace* joined by newlines. Frames are used verbatim; only the
    message goes through ``_normalize_message``.
    """
    top_frames = [
        frame for frame in stack_trace if frame.strip().startswith("at ")
    ][:3]
    raw = _normalize_message(message) + "\n" + "\n".join(top_frames)
    # SHA-1 is used here only to bucket similar errors, not for security.
    # Declaring that keeps the call working on FIPS-restricted builds; the
    # resulting digest is unchanged.
    return hashlib.sha1(raw.encode(), usedforsecurity=False).hexdigest()[:16]
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def parse_log_file(path: Path, source_name: str) -> list[ErrorEvent]:
    """Parse one log file into ErrorEvent objects.

    Only records whose mapped level is ERROR, WARN or CRITICAL are returned.
    Stack-trace lines that follow a header are attached to that header's
    event. If the file cannot be read, the error is logged and an empty list
    is returned.
    """
    try:
        content = path.read_text(encoding="utf-8", errors="replace")
    except OSError as exc:
        logger.error("Cannot read %s: %s", path, exc)
        return []

    events: list[ErrorEvent] = []

    def emit(header, frames):
        # Turn the record collected so far into an event, if it qualifies.
        if header is None:
            return
        level = SEVERITY_MAP.get(header.group("level").upper(), "WARN")
        if level in ("ERROR", "WARN", "CRITICAL"):
            events.append(
                ErrorEvent(
                    source=source_name,
                    log_file=str(path),
                    timestamp=header.group("ts"),
                    level=level,
                    thread=header.group("thread") or "",
                    logger_name=header.group("logger"),
                    message=header.group("message"),
                    stack_trace=list(frames),
                )
            )

    pending_header = None
    pending_frames: list[str] = []

    for raw_line in content.splitlines():
        header_match = _LOG_HEADER.match(raw_line)
        if header_match is not None:
            # A new header closes the previous record.
            emit(pending_header, pending_frames)
            pending_header, pending_frames = header_match, []
            continue
        if pending_header is not None and _STACK_LINE.match(raw_line):
            pending_frames.append(raw_line)

    # The last record has no following header to close it.
    emit(pending_header, pending_frames)

    logger.debug("Parsed %s: %d error/warn events", path.name, len(events))
    return events
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def parse_all(
    fetched_files: dict[str, list[Path]],
    log_sources,  # dict[str, LogSourceConfig]
) -> list[ErrorEvent]:
    """Parse every fetched file of every source and return all events in one list.

    ``fetched_files`` maps a source name to the files fetched for it; each
    file is parsed with ``parse_log_file`` under that source name.
    ``log_sources`` is accepted but not consulted by this function.
    """
    collected: list[ErrorEvent] = []
    for source_name, paths in fetched_files.items():
        collected.extend(
            event
            for log_path in paths
            for event in parse_log_file(log_path, source_name)
        )
    return collected
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
# -- Sentinel marker detection -------------------------------------------------
|
|
153
|
+
|
|
154
|
+
_SENTINEL_MARKER_RE = re.compile(r'SENTINEL:#([0-9a-f]{16})')


def scan_for_markers(path: Path) -> list[str]:
    """
    Scan a single log file for SENTINEL:#<fingerprint> markers injected by fix_engine.

    Returns a list of full marker strings (e.g. ['SENTINEL:#abc123de45678901']),
    in file order and including repeats. If the file cannot be read, a warning
    is logged and an empty list is returned.
    """
    try:
        text = path.read_text(encoding='utf-8', errors='replace')
    except OSError as exc:
        # Log rather than swallow: otherwise an unreadable file looks exactly
        # like a file that contains no markers.
        logger.warning("Cannot scan %s for SENTINEL markers: %s", path, exc)
        return []
    return [match.group(0) for match in _SENTINEL_MARKER_RE.finditer(text)]
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def scan_all_for_markers(fetched_files: dict[str, list[Path]]) -> list[str]:
    """Return every SENTINEL marker found across all fetched log files.

    Files are scanned with ``scan_for_markers`` in the order they appear in
    ``fetched_files``; results are concatenated without de-duplication.
    """
    found: list[str] = []
    for paths in fetched_files.values():
        found.extend(
            marker
            for log_path in paths
            for marker in scan_for_markers(log_path)
        )
    return found
|
package/python/sentinel/main.py
CHANGED
|
@@ -21,10 +21,10 @@ from .fix_engine import generate_fix
|
|
|
21
21
|
from .git_manager import apply_and_commit, publish
|
|
22
22
|
from .cicd_trigger import trigger as cicd_trigger
|
|
23
23
|
from .log_fetcher import fetch_all
|
|
24
|
-
from .log_parser import parse_all, ErrorEvent
|
|
24
|
+
from .log_parser import parse_all, scan_all_for_markers, ErrorEvent
|
|
25
25
|
from .issue_watcher import scan_issues, mark_done, IssueEvent
|
|
26
26
|
from .repo_router import route
|
|
27
|
-
from .reporter import build_and_send, send_fix_notification, send_failure_notification
|
|
27
|
+
from .reporter import build_and_send, send_fix_notification, send_failure_notification, send_confirmed_notification, send_regression_notification, send_startup_notification
|
|
28
28
|
from .state_store import StateStore
|
|
29
29
|
|
|
30
30
|
logging.basicConfig(
|
|
@@ -81,7 +81,7 @@ async def _handle_error(event: ErrorEvent, cfg_loader: ConfigLoader, store: Stat
|
|
|
81
81
|
return
|
|
82
82
|
|
|
83
83
|
patches_dir = Path(sentinel.workspace_dir) / "patches"
|
|
84
|
-
status, patch_path = generate_fix(event, repo, sentinel, patches_dir)
|
|
84
|
+
status, patch_path, marker = generate_fix(event, repo, sentinel, patches_dir, store)
|
|
85
85
|
|
|
86
86
|
if status != "patch" or patch_path is None:
|
|
87
87
|
outcome = "skipped" if status == "skip" else "failed"
|
|
@@ -116,6 +116,7 @@ async def _handle_error(event: ErrorEvent, cfg_loader: ConfigLoader, store: Stat
|
|
|
116
116
|
branch=branch,
|
|
117
117
|
pr_url=pr_url,
|
|
118
118
|
repo_name=repo.repo_name,
|
|
119
|
+
sentinel_marker=marker,
|
|
119
120
|
)
|
|
120
121
|
|
|
121
122
|
send_fix_notification(sentinel, {
|
|
@@ -172,7 +173,7 @@ async def _handle_issue(event: IssueEvent, cfg_loader: ConfigLoader, store: Stat
|
|
|
172
173
|
return # Leave the file so admin can add the header
|
|
173
174
|
|
|
174
175
|
patches_dir = Path(sentinel.workspace_dir) / "patches"
|
|
175
|
-
status, patch_path = generate_fix(event, repo, sentinel, patches_dir)
|
|
176
|
+
status, patch_path, marker = generate_fix(event, repo, sentinel, patches_dir, store)
|
|
176
177
|
|
|
177
178
|
if status != "patch" or patch_path is None:
|
|
178
179
|
store.record_fix(event.fingerprint, "skipped" if status == "skip" else "failed",
|
|
@@ -209,6 +210,7 @@ async def _handle_issue(event: IssueEvent, cfg_loader: ConfigLoader, store: Stat
|
|
|
209
210
|
branch=branch,
|
|
210
211
|
pr_url=pr_url,
|
|
211
212
|
repo_name=repo.repo_name,
|
|
213
|
+
sentinel_marker=marker,
|
|
212
214
|
)
|
|
213
215
|
send_fix_notification(sentinel, {
|
|
214
216
|
"source": event.source,
|
|
@@ -232,6 +234,8 @@ async def _handle_issue(event: IssueEvent, cfg_loader: ConfigLoader, store: Stat
|
|
|
232
234
|
|
|
233
235
|
async def poll_cycle(cfg_loader: ConfigLoader, store: StateStore):
|
|
234
236
|
global _report_requested
|
|
237
|
+
events: list = []
|
|
238
|
+
fetched: dict = {}
|
|
235
239
|
|
|
236
240
|
# ── Log sources (optional) ────────────────────────────────────────────────
|
|
237
241
|
sources = list(cfg_loader.log_sources.values())
|
|
@@ -254,6 +258,36 @@ async def poll_cycle(cfg_loader: ConfigLoader, store: StateStore):
|
|
|
254
258
|
return_exceptions=True,
|
|
255
259
|
)
|
|
256
260
|
|
|
261
|
+
# ── SENTINEL marker scanning (phase 1: record first seen in prod logs) ────
|
|
262
|
+
if sources and fetched:
|
|
263
|
+
for marker in set(scan_all_for_markers(fetched)):
|
|
264
|
+
fix = store.mark_marker_seen(marker)
|
|
265
|
+
if fix:
|
|
266
|
+
logger.info("Marker seen in production: %s repo=%s — quiet period started",
|
|
267
|
+
marker, fix.get("repo_name"))
|
|
268
|
+
|
|
269
|
+
# ── Regression detection (error recurred before quiet period elapsed) ──────
|
|
270
|
+
if sources:
|
|
271
|
+
for event in events:
|
|
272
|
+
pending = store.get_marker_seen_fix(event.fingerprint)
|
|
273
|
+
if pending:
|
|
274
|
+
logger.warning("Regression: %s recurred after marker seen", event.fingerprint)
|
|
275
|
+
store.mark_regressed(event.fingerprint)
|
|
276
|
+
send_regression_notification(cfg_loader.sentinel, pending, {
|
|
277
|
+
"source": event.source,
|
|
278
|
+
"message": event.message,
|
|
279
|
+
"body": event.full_text()[:500],
|
|
280
|
+
})
|
|
281
|
+
|
|
282
|
+
# ── Phase 2: confirm fixes whose quiet period has elapsed ────────────────
|
|
283
|
+
quiet_hours = cfg_loader.sentinel.marker_confirm_hours
|
|
284
|
+
for fix in store.get_fixes_pending_confirmation(quiet_hours):
|
|
285
|
+
confirmed = store.confirm_fix(fix["fingerprint"])
|
|
286
|
+
if confirmed:
|
|
287
|
+
logger.info("Fix confirmed after %dh quiet period: %s repo=%s",
|
|
288
|
+
quiet_hours, fix["fingerprint"], fix.get("repo_name"))
|
|
289
|
+
send_confirmed_notification(cfg_loader.sentinel, confirmed)
|
|
290
|
+
|
|
257
291
|
# ── Issues directory (always checked) ────────────────────────────────────
|
|
258
292
|
issues = scan_issues(Path("."))
|
|
259
293
|
if issues:
|
|
@@ -277,50 +311,99 @@ def _report_due(cfg_loader: ConfigLoader, store: StateStore) -> bool:
|
|
|
277
311
|
return elapsed >= cfg_loader.sentinel.report_interval_hours * 3600
|
|
278
312
|
|
|
279
313
|
|
|
280
|
-
# ──
|
|
314
|
+
# ── Startup checks (runs automatically on every start) ───────────────────────────
|
|
281
315
|
|
|
282
|
-
def
|
|
283
|
-
|
|
284
|
-
|
|
316
|
+
async def _startup_checks(cfg_loader: ConfigLoader) -> dict:
|
|
317
|
+
"""
|
|
318
|
+
Clone missing repos, index with Cairn, test SSH sources.
|
|
319
|
+
Returns a results dict passed to the startup email.
|
|
320
|
+
"""
|
|
321
|
+
results = {
|
|
322
|
+
"repos": [], # {name, status, message}
|
|
323
|
+
"cairn": [], # {name, status, message}
|
|
324
|
+
"ssh": [], # {name, host, status, message}
|
|
325
|
+
"warnings": [],
|
|
326
|
+
}
|
|
285
327
|
|
|
286
328
|
if not cairn_installed():
|
|
287
|
-
|
|
329
|
+
results["warnings"].append("Cairn not found — run: npm install -g @misterhuydo/cairn-mcp")
|
|
288
330
|
|
|
289
331
|
for name, repo in cfg_loader.repos.items():
|
|
290
332
|
local = Path(repo.local_path)
|
|
291
333
|
if not local.exists():
|
|
292
334
|
logger.info("Cloning %s → %s", repo.repo_url, repo.local_path)
|
|
293
|
-
r = subprocess.run(
|
|
335
|
+
r = subprocess.run(
|
|
336
|
+
["git", "clone", repo.repo_url, str(local)],
|
|
337
|
+
capture_output=True, text=True,
|
|
338
|
+
)
|
|
294
339
|
if r.returncode != 0:
|
|
295
|
-
|
|
340
|
+
msg = r.stderr.strip()
|
|
341
|
+
logger.error("Clone failed for %s: %s", name, msg)
|
|
342
|
+
results["repos"].append({"name": name, "status": "error", "message": msg})
|
|
296
343
|
continue
|
|
297
|
-
|
|
344
|
+
results["repos"].append({"name": name, "status": "cloned", "message": repo.repo_url})
|
|
345
|
+
else:
|
|
346
|
+
results["repos"].append({"name": name, "status": "exists", "message": str(local)})
|
|
347
|
+
|
|
348
|
+
ok = index_repo(repo)
|
|
349
|
+
results["cairn"].append({
|
|
350
|
+
"name": name,
|
|
351
|
+
"status": "ok" if ok else "error",
|
|
352
|
+
"message": "indexed" if ok else "cairn index failed — check logs",
|
|
353
|
+
})
|
|
298
354
|
|
|
299
355
|
for src_name, src in cfg_loader.log_sources.items():
|
|
300
356
|
if src.source_type == "ssh" and src.hosts:
|
|
301
357
|
host = src.hosts[0]
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
358
|
+
try:
|
|
359
|
+
r = subprocess.run(
|
|
360
|
+
["ssh", "-i", src.key, "-o", "StrictHostKeyChecking=no",
|
|
361
|
+
"-o", "ConnectTimeout=5", f"ec2-user@{host}", "echo ok"],
|
|
362
|
+
capture_output=True, text=True, timeout=15,
|
|
363
|
+
)
|
|
364
|
+
ok = r.returncode == 0
|
|
365
|
+
results["ssh"].append({
|
|
366
|
+
"name": src_name, "host": host,
|
|
367
|
+
"status": "ok" if ok else "error",
|
|
368
|
+
"message": "" if ok else r.stderr.strip(),
|
|
369
|
+
})
|
|
370
|
+
except Exception as e:
|
|
371
|
+
results["ssh"].append({"name": src_name, "host": host,
|
|
372
|
+
"status": "error", "message": str(e)})
|
|
373
|
+
|
|
374
|
+
return results
|
|
375
|
+
|
|
376
|
+
|
|
377
|
+
async def _send_startup_email_delayed(cfg, results: dict, delay: int = 300):
|
|
378
|
+
"""Wait delay seconds then send startup summary email."""
|
|
379
|
+
await asyncio.sleep(delay)
|
|
311
380
|
try:
|
|
312
|
-
|
|
381
|
+
send_startup_notification(cfg, results)
|
|
313
382
|
except Exception as e:
|
|
314
|
-
logger.error("
|
|
315
|
-
|
|
316
|
-
logger.info("=== Init complete ===")
|
|
383
|
+
logger.error("Failed to send startup notification: %s", e)
|
|
317
384
|
|
|
318
385
|
|
|
319
|
-
# ── Entry point
|
|
386
|
+
# ── Entry point ──────────────────────────────────────────────────────────────────────────────────
|
|
320
387
|
|
|
321
388
|
async def run_loop(cfg_loader: ConfigLoader, store: StateStore):
|
|
322
389
|
interval = cfg_loader.sentinel.poll_interval_seconds
|
|
323
|
-
logger.info("Sentinel starting — poll interval: %ds, repos: %s",
|
|
390
|
+
logger.info("Sentinel starting — poll interval: %ds, repos: %s",
|
|
391
|
+
interval, list(cfg_loader.repos.keys()))
|
|
392
|
+
|
|
393
|
+
results = await _startup_checks(cfg_loader)
|
|
394
|
+
|
|
395
|
+
has_errors = any(
|
|
396
|
+
item["status"] == "error"
|
|
397
|
+
for key in ("repos", "cairn", "ssh")
|
|
398
|
+
for item in results[key]
|
|
399
|
+
)
|
|
400
|
+
if has_errors:
|
|
401
|
+
logger.warning("Startup completed with errors — check config and logs")
|
|
402
|
+
else:
|
|
403
|
+
logger.info("Startup checks passed — startup email in 5 minutes")
|
|
404
|
+
|
|
405
|
+
asyncio.ensure_future(_send_startup_email_delayed(cfg_loader.sentinel, results))
|
|
406
|
+
|
|
324
407
|
while True:
|
|
325
408
|
try:
|
|
326
409
|
await poll_cycle(cfg_loader, store)
|
|
@@ -336,7 +419,6 @@ def main():
|
|
|
336
419
|
Path("issues").mkdir(exist_ok=True)
|
|
337
420
|
|
|
338
421
|
parser = argparse.ArgumentParser(description="Sentinel — Autonomous DevOps Agent")
|
|
339
|
-
parser.add_argument("--init", action="store_true", help="First-time setup")
|
|
340
422
|
parser.add_argument("--config", default="./config", help="Config directory path")
|
|
341
423
|
args = parser.parse_args()
|
|
342
424
|
|
|
@@ -344,10 +426,6 @@ def main():
|
|
|
344
426
|
store = StateStore(cfg_loader.sentinel.state_db)
|
|
345
427
|
_register_signals()
|
|
346
428
|
|
|
347
|
-
if args.init:
|
|
348
|
-
run_init(cfg_loader)
|
|
349
|
-
return
|
|
350
|
-
|
|
351
429
|
asyncio.run(run_loop(cfg_loader, store))
|
|
352
430
|
|
|
353
431
|
|