@misterhuydo/sentinel 1.0.6 → 1.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,149 +1,175 @@
1
- """
2
- log_parser.py — Parse fetched log files into ErrorEvent objects.
3
-
4
- Handles Java-style logs (Spring Boot / Logback format):
5
- 2024-01-15 12:34:56.789 ERROR [thread] class.ClassName - Message
6
- followed by optional stack trace lines (^\tat ...)
7
- """
8
-
9
- import hashlib
10
- import re
11
- import logging
12
- from dataclasses import dataclass, field
13
- from pathlib import Path
14
-
15
- logger = logging.getLogger(__name__)
16
-
17
- _LOG_HEADER = re.compile(
18
- r"^(?P<ts>\d{4}-\d{2}-\d{2}[\sT]\d{2}:\d{2}:\d{2}[.,\d]*)\s+"
19
- r"(?P<level>CRITICAL|ERROR|WARN(?:ING)?|INFO|DEBUG)\s+"
20
- r"(?:\[(?P<thread>[^\]]*)\]\s+)?"
21
- r"(?P<logger>\S+)\s+-\s+"
22
- r"(?P<message>.+)$"
23
- )
24
-
25
- _STACK_LINE = re.compile(r"^\s+at |\s+\.\.\. \d+ more|^Caused by:")
26
-
27
- SEVERITY_MAP = {
28
- "CRITICAL": "CRITICAL",
29
- "ERROR": "ERROR",
30
- "WARN": "WARN",
31
- "WARNING": "WARN",
32
- "INFO": "INFO",
33
- "DEBUG": "DEBUG",
34
- }
35
-
36
- _CRITICAL_PATTERNS = re.compile(
37
- r"OutOfMemoryError|StackOverflowError|OOMKilled", re.IGNORECASE
38
- )
39
- _INFRA_PATTERNS = re.compile(
40
- r"ConnectException|TimeoutException|ConnectionRefused|SocketTimeout",
41
- re.IGNORECASE,
42
- )
43
-
44
-
45
- @dataclass
46
- class ErrorEvent:
47
- source: str # log-source name (e.g. "SSOLWA")
48
- log_file: str
49
- timestamp: str
50
- level: str # CRITICAL / ERROR / WARN
51
- thread: str
52
- logger_name: str
53
- message: str
54
- stack_trace: list[str] = field(default_factory=list)
55
- fingerprint: str = ""
56
-
57
- def __post_init__(self):
58
- if not self.fingerprint:
59
- self.fingerprint = _fingerprint(self.message, self.stack_trace)
60
-
61
- @property
62
- def severity(self) -> str:
63
- if _CRITICAL_PATTERNS.search(self.message) or _CRITICAL_PATTERNS.search(
64
- "\n".join(self.stack_trace)
65
- ):
66
- return "CRITICAL"
67
- return self.level
68
-
69
- @property
70
- def is_infra_issue(self) -> bool:
71
- return bool(_INFRA_PATTERNS.search(self.message))
72
-
73
- def short_summary(self) -> str:
74
- return self.message[:120]
75
-
76
- def full_text(self) -> str:
77
- lines = [f"{self.timestamp} {self.level} [{self.thread}] {self.logger_name} - {self.message}"]
78
- lines.extend(self.stack_trace)
79
- return "\n".join(lines)
80
-
81
-
82
- def _normalize_message(msg: str) -> str:
83
- msg = re.sub(r"0x[0-9a-fA-F]+", "0xADDR", msg)
84
- msg = re.sub(r"\b[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\b", "UUID", msg)
85
- msg = re.sub(r"\b\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}[.,\d]*\b", "TIMESTAMP", msg)
86
- msg = re.sub(r"\b\d+\b", "N", msg)
87
- return msg.strip()
88
-
89
-
90
- def _fingerprint(message: str, stack_trace: list[str]) -> str:
91
- top_frames = [l for l in stack_trace if l.strip().startswith("at ")][:3]
92
- raw = _normalize_message(message) + "\n" + "\n".join(top_frames)
93
- return hashlib.sha1(raw.encode()).hexdigest()[:16]
94
-
95
-
96
- def parse_log_file(path: Path, source_name: str) -> list[ErrorEvent]:
97
- """Parse a single log file and return all ERROR/WARN events."""
98
- events: list[ErrorEvent] = []
99
- current_header: re.Match | None = None
100
- current_stack: list[str] = []
101
-
102
- def flush():
103
- if current_header is None:
104
- return
105
- level = SEVERITY_MAP.get(current_header.group("level").upper(), "WARN")
106
- if level not in ("ERROR", "WARN", "CRITICAL"):
107
- return
108
- event = ErrorEvent(
109
- source=source_name,
110
- log_file=str(path),
111
- timestamp=current_header.group("ts"),
112
- level=level,
113
- thread=current_header.group("thread") or "",
114
- logger_name=current_header.group("logger"),
115
- message=current_header.group("message"),
116
- stack_trace=list(current_stack),
117
- )
118
- events.append(event)
119
-
120
- try:
121
- text = path.read_text(encoding="utf-8", errors="replace")
122
- except OSError as e:
123
- logger.error("Cannot read %s: %s", path, e)
124
- return []
125
-
126
- for line in text.splitlines():
127
- m = _LOG_HEADER.match(line)
128
- if m:
129
- flush()
130
- current_header = m
131
- current_stack = []
132
- elif current_header and _STACK_LINE.match(line):
133
- current_stack.append(line)
134
-
135
- flush()
136
- logger.debug("Parsed %s: %d error/warn events", path.name, len(events))
137
- return events
138
-
139
-
140
- def parse_all(
141
- fetched_files: dict[str, list[Path]],
142
- log_sources, # dict[str, LogSourceConfig]
143
- ) -> list[ErrorEvent]:
144
- """Parse all fetched log files across all sources."""
145
- all_events: list[ErrorEvent] = []
146
- for source_name, files in fetched_files.items():
147
- for f in files:
148
- all_events.extend(parse_log_file(f, source_name))
149
- return all_events
1
+ """
2
+ log_parser.py — Parse fetched log files into ErrorEvent objects.
3
+
4
+ Handles Java-style logs (Spring Boot / Logback format):
5
+ 2024-01-15 12:34:56.789 ERROR [thread] class.ClassName - Message
6
+ followed by optional stack trace lines (^\tat ...)
7
+ """
8
+
9
+ import hashlib
10
+ import re
11
+ import logging
12
+ from dataclasses import dataclass, field
13
+ from pathlib import Path
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
# Header of a single log record, e.g.
#   2024-01-15 12:34:56.789 ERROR [thread] class.ClassName - Message
_LOG_HEADER = re.compile(
    # date and time (separated by whitespace or "T"), optional fraction digits
    r"^(?P<ts>\d{4}-\d{2}-\d{2}[\sT]\d{2}:\d{2}:\d{2}[.,\d]*)\s+"
    # level keyword
    r"(?P<level>CRITICAL|ERROR|WARN(?:ING)?|INFO|DEBUG)\s+"
    # optional "[thread]" segment
    r"(?:\[(?P<thread>[^\]]*)\]\s+)?"
    # logger name followed by the " - " separator
    r"(?P<logger>\S+)\s+-\s+"
    # remainder of the line
    r"(?P<message>.+)$"
)

# Lines that belong to a stack trace: "  at ...", "  ... N more", "Caused by:".
_STACK_LINE = re.compile(r"^\s+at |\s+\.\.\. \d+ more|^Caused by:")

# Level keyword as captured by _LOG_HEADER -> level name used on events.
# "WARNING" is folded into "WARN"; every other keyword maps to itself.
SEVERITY_MAP = {
    "CRITICAL": "CRITICAL",
    "ERROR": "ERROR",
    "WARN": "WARN",
    "WARNING": "WARN",
    "INFO": "INFO",
    "DEBUG": "DEBUG",
}

# Text that makes ErrorEvent.severity report "CRITICAL" (case-insensitive).
_CRITICAL_PATTERNS = re.compile(
    r"OutOfMemoryError|StackOverflowError|OOMKilled",
    flags=re.IGNORECASE,
)

# Text that makes ErrorEvent.is_infra_issue true (case-insensitive).
_INFRA_PATTERNS = re.compile(
    r"ConnectException|TimeoutException|ConnectionRefused|SocketTimeout",
    flags=re.IGNORECASE,
)
43
+
44
+
45
@dataclass
class ErrorEvent:
    """One log record extracted from a fetched log file.

    ``fingerprint`` is derived from ``message`` and ``stack_trace`` when it is
    left empty at construction time; a non-empty value is kept as given.
    """

    source: str  # log-source name (e.g. "SSOLWA")
    log_file: str
    timestamp: str
    level: str  # CRITICAL / ERROR / WARN
    thread: str
    logger_name: str
    message: str
    stack_trace: list[str] = field(default_factory=list)
    fingerprint: str = ""

    def __post_init__(self):
        # Keep a caller-supplied fingerprint; only compute one when missing.
        if self.fingerprint:
            return
        self.fingerprint = _fingerprint(self.message, self.stack_trace)

    @property
    def severity(self) -> str:
        """Return "CRITICAL" if the message or stack trace matches
        ``_CRITICAL_PATTERNS``; otherwise return the recorded level."""
        searched = (self.message, "\n".join(self.stack_trace))
        if any(_CRITICAL_PATTERNS.search(text) for text in searched):
            return "CRITICAL"
        return self.level

    @property
    def is_infra_issue(self) -> bool:
        """True when the message (not the stack trace) matches ``_INFRA_PATTERNS``."""
        return _INFRA_PATTERNS.search(self.message) is not None

    def short_summary(self) -> str:
        """Return the first 120 characters of the message."""
        return self.message[:120]

    def full_text(self) -> str:
        """Return the record as text: a header line followed by the stack lines."""
        header = (
            f"{self.timestamp} {self.level} [{self.thread}] "
            f"{self.logger_name} - {self.message}"
        )
        return "\n".join([header, *self.stack_trace])
80
+
81
+
82
def _normalize_message(msg: str) -> str:
    """Replace volatile tokens in *msg* with fixed placeholders.

    Substitutions are applied in this order, because later patterns would
    otherwise consume pieces of earlier ones:

    1. hexadecimal addresses (``0x...``)      -> ``0xADDR``
    2. UUIDs (any letter case)                -> ``UUID``
    3. ``YYYY-MM-DD hh:mm:ss[.fff]`` stamps   -> ``TIMESTAMP``
    4. remaining standalone integers          -> ``N``

    Returns the result with surrounding whitespace stripped.
    """
    msg = re.sub(r"0x[0-9a-fA-F]+", "0xADDR", msg)
    # Case-insensitive: an upper-case UUID would otherwise survive this step
    # and be only partially rewritten by the integer rule below, leaving
    # per-occurrence noise in the normalized text.
    msg = re.sub(
        r"\b[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\b",
        "UUID",
        msg,
        flags=re.IGNORECASE,
    )
    msg = re.sub(r"\b\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}[.,\d]*\b", "TIMESTAMP", msg)
    msg = re.sub(r"\b\d+\b", "N", msg)
    return msg.strip()
88
+
89
+
90
def _fingerprint(message: str, stack_trace: list[str]) -> str:
    """Return a 16-hex-character identifier for an error.

    The identifier is the truncated SHA-1 of the normalized message, a
    newline, and the first three stack lines that start with ``at `` once
    stripped (joined by newlines).
    """
    frame_lines = [
        frame for frame in stack_trace if frame.strip().startswith("at ")
    ]
    joined_frames = "\n".join(frame_lines[:3])
    # The newline separator is present even when there are no frames.
    digest_input = f"{_normalize_message(message)}\n{joined_frames}"
    return hashlib.sha1(digest_input.encode()).hexdigest()[:16]
94
+
95
+
96
def parse_log_file(path: Path, source_name: str) -> list[ErrorEvent]:
    """Parse one log file into events for its ERROR/WARN/CRITICAL records.

    A record starts at a line matching ``_LOG_HEADER``; following lines that
    match ``_STACK_LINE`` are attached as its stack trace. Any other line is
    ignored, as are stack-like lines that appear before the first header.

    Returns an empty list (after logging an error) if the file cannot be read.
    """
    try:
        content = path.read_text(encoding="utf-8", errors="replace")
    except OSError as exc:
        logger.error("Cannot read %s: %s", path, exc)
        return []

    events: list[ErrorEvent] = []

    def emit(header, frames) -> None:
        # Turn the finished record into an event, unless there is no record
        # yet or its level is below WARN.
        if header is None:
            return
        level = SEVERITY_MAP.get(header.group("level").upper(), "WARN")
        if level not in ("ERROR", "WARN", "CRITICAL"):
            return
        events.append(
            ErrorEvent(
                source=source_name,
                log_file=str(path),
                timestamp=header.group("ts"),
                level=level,
                thread=header.group("thread") or "",
                logger_name=header.group("logger"),
                message=header.group("message"),
                stack_trace=list(frames),
            )
        )

    open_header = None
    open_frames: list[str] = []
    for raw_line in content.splitlines():
        header_match = _LOG_HEADER.match(raw_line)
        if header_match is not None:
            # A new header closes the record collected so far.
            emit(open_header, open_frames)
            open_header = header_match
            open_frames = []
            continue
        if open_header is not None and _STACK_LINE.match(raw_line):
            open_frames.append(raw_line)

    # The last record in the file has no following header to close it.
    emit(open_header, open_frames)
    logger.debug("Parsed %s: %d error/warn events", path.name, len(events))
    return events
138
+
139
+
140
def parse_all(
    fetched_files: dict[str, list[Path]],
    log_sources,  # dict[str, LogSourceConfig]
) -> list[ErrorEvent]:
    """Parse every fetched file of every source into one flat event list.

    ``fetched_files`` maps a source name to the paths fetched for it; events
    keep that iteration order. ``log_sources`` is accepted but not read here.
    """
    return [
        event
        for source_name, paths in fetched_files.items()
        for log_path in paths
        for event in parse_log_file(log_path, source_name)
    ]
150
+
151
+
152
+ # -- Sentinel marker detection -------------------------------------------------
153
+
154
_SENTINEL_MARKER_RE = re.compile(r'SENTINEL:#([0-9a-f]{16})')


def scan_for_markers(path: Path) -> list[str]:
    """
    Scan a single log file for SENTINEL:#<fingerprint> markers injected by fix_engine.

    Returns a list of full marker strings (e.g. ['SENTINEL:#abc123de45678901']),
    in file order and including repeats. If the file cannot be read, the
    failure is logged and an empty list is returned.
    """
    try:
        text = path.read_text(encoding='utf-8', errors='replace')
    except OSError as exc:
        # Still best-effort, but no longer silent: an unreadable file would
        # otherwise be indistinguishable from a file with no markers.
        logger.warning('Cannot scan %s for markers: %s', path, exc)
        return []
    # group(0) is the whole match, i.e. 'SENTINEL:#' plus the 16 hex digits.
    return [found.group(0) for found in _SENTINEL_MARKER_RE.finditer(text)]
167
+
168
+
169
def scan_all_for_markers(fetched_files: dict[str, list[Path]]) -> list[str]:
    """Return every SENTINEL marker found across all fetched log files.

    Files are scanned in the mapping's iteration order; duplicates are kept.
    """
    return [
        marker
        for paths in fetched_files.values()
        for log_path in paths
        for marker in scan_for_markers(log_path)
    ]
@@ -21,10 +21,10 @@ from .fix_engine import generate_fix
21
21
  from .git_manager import apply_and_commit, publish
22
22
  from .cicd_trigger import trigger as cicd_trigger
23
23
  from .log_fetcher import fetch_all
24
- from .log_parser import parse_all, ErrorEvent
24
+ from .log_parser import parse_all, scan_all_for_markers, ErrorEvent
25
25
  from .issue_watcher import scan_issues, mark_done, IssueEvent
26
26
  from .repo_router import route
27
- from .reporter import build_and_send, send_fix_notification, send_failure_notification
27
+ from .reporter import build_and_send, send_fix_notification, send_failure_notification, send_confirmed_notification, send_regression_notification
28
28
  from .state_store import StateStore
29
29
 
30
30
  logging.basicConfig(
@@ -81,7 +81,7 @@ async def _handle_error(event: ErrorEvent, cfg_loader: ConfigLoader, store: Stat
81
81
  return
82
82
 
83
83
  patches_dir = Path(sentinel.workspace_dir) / "patches"
84
- status, patch_path = generate_fix(event, repo, sentinel, patches_dir)
84
+ status, patch_path, marker = generate_fix(event, repo, sentinel, patches_dir, store)
85
85
 
86
86
  if status != "patch" or patch_path is None:
87
87
  outcome = "skipped" if status == "skip" else "failed"
@@ -116,6 +116,7 @@ async def _handle_error(event: ErrorEvent, cfg_loader: ConfigLoader, store: Stat
116
116
  branch=branch,
117
117
  pr_url=pr_url,
118
118
  repo_name=repo.repo_name,
119
+ sentinel_marker=marker,
119
120
  )
120
121
 
121
122
  send_fix_notification(sentinel, {
@@ -172,7 +173,7 @@ async def _handle_issue(event: IssueEvent, cfg_loader: ConfigLoader, store: Stat
172
173
  return # Leave the file so admin can add the header
173
174
 
174
175
  patches_dir = Path(sentinel.workspace_dir) / "patches"
175
- status, patch_path = generate_fix(event, repo, sentinel, patches_dir)
176
+ status, patch_path, marker = generate_fix(event, repo, sentinel, patches_dir, store)
176
177
 
177
178
  if status != "patch" or patch_path is None:
178
179
  store.record_fix(event.fingerprint, "skipped" if status == "skip" else "failed",
@@ -209,6 +210,7 @@ async def _handle_issue(event: IssueEvent, cfg_loader: ConfigLoader, store: Stat
209
210
  branch=branch,
210
211
  pr_url=pr_url,
211
212
  repo_name=repo.repo_name,
213
+ sentinel_marker=marker,
212
214
  )
213
215
  send_fix_notification(sentinel, {
214
216
  "source": event.source,
@@ -232,6 +234,8 @@ async def _handle_issue(event: IssueEvent, cfg_loader: ConfigLoader, store: Stat
232
234
 
233
235
  async def poll_cycle(cfg_loader: ConfigLoader, store: StateStore):
234
236
  global _report_requested
237
+ events: list = []
238
+ fetched: dict = {}
235
239
 
236
240
  # ── Log sources (optional) ────────────────────────────────────────────────
237
241
  sources = list(cfg_loader.log_sources.values())
@@ -254,6 +258,36 @@ async def poll_cycle(cfg_loader: ConfigLoader, store: StateStore):
254
258
  return_exceptions=True,
255
259
  )
256
260
 
261
+ # ── SENTINEL marker scanning (phase 1: record first seen in prod logs) ────
262
+ if sources and fetched:
263
+ for marker in set(scan_all_for_markers(fetched)):
264
+ fix = store.mark_marker_seen(marker)
265
+ if fix:
266
+ logger.info("Marker seen in production: %s repo=%s — quiet period started",
267
+ marker, fix.get("repo_name"))
268
+
269
+ # ── Regression detection (error recurred before quiet period elapsed) ──────
270
+ if sources:
271
+ for event in events:
272
+ pending = store.get_marker_seen_fix(event.fingerprint)
273
+ if pending:
274
+ logger.warning("Regression: %s recurred after marker seen", event.fingerprint)
275
+ store.mark_regressed(event.fingerprint)
276
+ send_regression_notification(cfg_loader.sentinel, pending, {
277
+ "source": event.source,
278
+ "message": event.message,
279
+ "body": event.full_text()[:500],
280
+ })
281
+
282
+ # ── Phase 2: confirm fixes whose quiet period has elapsed ────────────────
283
+ quiet_hours = cfg_loader.sentinel.marker_confirm_hours
284
+ for fix in store.get_fixes_pending_confirmation(quiet_hours):
285
+ confirmed = store.confirm_fix(fix["fingerprint"])
286
+ if confirmed:
287
+ logger.info("Fix confirmed after %dh quiet period: %s repo=%s",
288
+ quiet_hours, fix["fingerprint"], fix.get("repo_name"))
289
+ send_confirmed_notification(cfg_loader.sentinel, confirmed)
290
+
257
291
  # ── Issues directory (always checked) ────────────────────────────────────
258
292
  issues = scan_issues(Path("."))
259
293
  if issues:
@@ -186,3 +186,89 @@ def send_failure_notification(cfg: SentinelConfig, details: dict):
186
186
  _send_email(cfg, subject, html)
187
187
  logger.info('Failure notification sent for %s', source)
188
188
 
189
+
190
+
191
+ # ---- Confirmed fix notification ----------------------------------------------
192
+
193
def send_confirmed_notification(cfg: SentinelConfig, fix: dict):
    """Notify admins that a fix has been confirmed running in production.

    Does nothing when ``cfg.mails`` is empty. ``fix`` is read with ``.get``
    for the keys ``repo_name``, ``fingerprint``, ``sentinel_marker``,
    ``commit_hash``, ``branch`` and ``confirmed_at``. Every value placed in
    the HTML body is HTML-escaped; the subject line is plain text and is not.
    """
    if not cfg.mails:
        return
    # Imported locally under an alias: the local variable ``html`` below
    # would otherwise shadow the module name.
    from html import escape as _esc

    repo_name = fix.get('repo_name', 'unknown')
    fingerprint = fix.get('fingerprint', '')
    marker = fix.get('sentinel_marker', '')
    ts = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')
    subject = f'[Sentinel] ✅ Fix confirmed in production: {repo_name} ({fingerprint[:8]})'

    # Escape once, up front, so a stray "<" or "&" in a stored value cannot
    # break the markup of the mail.
    repo_html = _esc(str(repo_name))
    fingerprint_html = _esc(str(fingerprint))
    marker_html = _esc(str(marker))
    commit_html = _esc(str(fix.get("commit_hash", "")))
    branch_html = _esc(str(fix.get("branch", "")))
    confirmed_html = _esc(str(fix.get("confirmed_at", ts)))

    html = (
        '<!DOCTYPE html><html><head><meta charset="utf-8">'
        '<style>'
        'body{font-family:Arial,sans-serif;font-size:14px;color:#222}'
        'h2{color:#2e7d32}'
        'table{border-collapse:collapse;width:100%;margin-bottom:16px}'
        'th{background:#f1f3f4;text-align:left;padding:6px 10px}'
        'td{padding:5px 10px;border-bottom:1px solid #eee;vertical-align:top}'
        '.label{font-weight:bold;width:160px}'
        '.mono{font-family:monospace;font-size:12px}'
        '</style></head><body>'
        '<h2>✅ Fix confirmed running in production</h2>'
        f'<p><strong>{repo_html}</strong> &middot; {ts}</p>'
        '<table>'
        f'<tr><td class="label">Fingerprint</td><td class="mono">{fingerprint_html}</td></tr>'
        f'<tr><td class="label">Sentinel marker</td><td class="mono">{marker_html}</td></tr>'
        f'<tr><td class="label">Commit</td><td class="mono">{commit_html}</td></tr>'
        f'<tr><td class="label">Branch</td><td class="mono">{branch_html}</td></tr>'
        f'<tr><td class="label">Confirmed at</td><td>{confirmed_html}</td></tr>'
        '</table>'
        '<p>The marker log line was detected in production logs, confirming the fix is live and the fixed code path executed.</p>'
        '<hr><small>Sentinel &mdash; Autonomous DevOps Agent</small>'
        '</body></html>'
    )
    _send_email(cfg, subject, html)
    logger.info('Confirmed notification sent for %s', fingerprint)
228
+
229
+
230
+ # ---- Regression notification ------------------------------------------------
231
+
232
def send_regression_notification(cfg: SentinelConfig, fix: dict, event: dict):
    """Notify admins that a confirmed fix did not resolve the issue.

    Does nothing when ``cfg.mails`` is empty. ``fix`` is read with ``.get``
    for ``repo_name``, ``fingerprint``, ``commit_hash``, ``branch`` and
    ``confirmed_at``; ``event`` for ``source``, ``message`` and ``body``.

    ``event`` carries text taken from log files, so every value placed in the
    HTML body is HTML-escaped. The subject line is plain text and is not.
    """
    if not cfg.mails:
        return
    # Imported locally under an alias: the local variable ``html`` below
    # would otherwise shadow the module name.
    from html import escape as _esc

    repo_name = fix.get('repo_name', 'unknown')
    fingerprint = fix.get('fingerprint', '')
    ts = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')
    subject = f'[Sentinel] ⚠ Regression: fix did not resolve issue in {repo_name}'

    repo_html = _esc(str(repo_name))
    fingerprint_html = _esc(str(fingerprint))
    commit_html = _esc(str(fix.get("commit_hash", "")))
    branch_html = _esc(str(fix.get("branch", "")))
    confirmed_html = _esc(str(fix.get("confirmed_at", "")))
    # Log-derived text: may contain "<", ">" or "&" (generics, XML payloads,
    # URLs) that would otherwise be interpreted as markup by the mail client.
    source_html = _esc(str(event.get("source", "")))
    message_html = _esc(str(event.get("message", "")))
    body_html = _esc(str(event.get("body", "")))

    html = (
        '<!DOCTYPE html><html><head><meta charset="utf-8">'
        '<style>'
        'body{font-family:Arial,sans-serif;font-size:14px;color:#222}'
        'h2{color:#c62828}'
        'h3{color:#444;border-bottom:1px solid #ddd;padding-bottom:4px}'
        'table{border-collapse:collapse;width:100%;margin-bottom:16px}'
        'th{background:#f1f3f4;text-align:left;padding:6px 10px}'
        'td{padding:5px 10px;border-bottom:1px solid #eee;vertical-align:top}'
        '.label{font-weight:bold;width:160px}'
        '.mono{font-family:monospace;font-size:12px}'
        'pre{background:#f8f8f8;border:1px solid #ddd;padding:10px;font-size:12px;white-space:pre-wrap}'
        '</style></head><body>'
        '<h2>⚠ Regression detected &mdash; fix did not resolve the issue</h2>'
        f'<p><strong>{repo_html}</strong> &middot; {ts}</p>'
        '<p>The original error recurred in production logs after the Sentinel fix was confirmed deployed.</p>'
        '<h3>Fix Details</h3>'
        '<table>'
        f'<tr><td class="label">Fingerprint</td><td class="mono">{fingerprint_html}</td></tr>'
        f'<tr><td class="label">Commit</td><td class="mono">{commit_html}</td></tr>'
        f'<tr><td class="label">Branch</td><td class="mono">{branch_html}</td></tr>'
        f'<tr><td class="label">Confirmed at</td><td>{confirmed_html}</td></tr>'
        '</table>'
        '<h3>Recurring Error</h3>'
        '<table>'
        f'<tr><td class="label">Source</td><td class="mono">{source_html}</td></tr>'
        f'<tr><td class="label">Message</td><td class="mono">{message_html}</td></tr>'
        '</table>'
        f'<pre>{body_html}</pre>'
        '<p>Sentinel will not attempt another automatic fix. Please investigate manually.</p>'
        '<hr><small>Sentinel &mdash; Autonomous DevOps Agent</small>'
        '</body></html>'
    )
    _send_email(cfg, subject, html)
    logger.info('Regression notification sent for %s', fingerprint)