debugger-help 3.0.2__tar.gz → 4.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {debugger_help-3.0.2 → debugger_help-4.0.0}/PKG-INFO +1 -1
- {debugger_help-3.0.2 → debugger_help-4.0.0}/debugger_help/__init__.py +1 -1
- {debugger_help-3.0.2 → debugger_help-4.0.0}/debugger_help/agent.py +370 -234
- {debugger_help-3.0.2 → debugger_help-4.0.0}/debugger_help.egg-info/PKG-INFO +1 -1
- {debugger_help-3.0.2 → debugger_help-4.0.0}/pyproject.toml +1 -1
- {debugger_help-3.0.2 → debugger_help-4.0.0}/README.md +0 -0
- {debugger_help-3.0.2 → debugger_help-4.0.0}/debugger_help.egg-info/SOURCES.txt +0 -0
- {debugger_help-3.0.2 → debugger_help-4.0.0}/debugger_help.egg-info/dependency_links.txt +0 -0
- {debugger_help-3.0.2 → debugger_help-4.0.0}/debugger_help.egg-info/entry_points.txt +0 -0
- {debugger_help-3.0.2 → debugger_help-4.0.0}/debugger_help.egg-info/requires.txt +0 -0
- {debugger_help-3.0.2 → debugger_help-4.0.0}/debugger_help.egg-info/top_level.txt +0 -0
- {debugger_help-3.0.2 → debugger_help-4.0.0}/setup.cfg +0 -0
|
@@ -1,2 +1,2 @@
|
|
|
1
1
|
"""debugger.help VPS Agent — Deep system monitoring."""
|
|
2
|
-
__version__ = "
|
|
2
|
+
__version__ = "4.0.0"
|
|
@@ -1,29 +1,11 @@
|
|
|
1
1
|
#!/usr/bin/env python3
|
|
2
2
|
"""
|
|
3
|
-
debugger.help VPS Agent
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
-
|
|
7
|
-
-
|
|
8
|
-
-
|
|
9
|
-
- Memory: RAM, swap, shared, buffers, cached
|
|
10
|
-
- Disk: usage per mount, I/O read/write rates, inode usage
|
|
11
|
-
- Network: per-interface stats, connection counts by state, open ports, DNS resolution
|
|
12
|
-
- Processes: top CPU/memory consumers, zombie/defunct processes, open file descriptors
|
|
13
|
-
- Docker containers (if running): status, CPU, memory, restart counts
|
|
14
|
-
- Systemd services: failed units
|
|
15
|
-
- SSL certificates: expiry checks
|
|
16
|
-
- File watchers: key log files (syslog, dmesg, pm2 logs, custom)
|
|
17
|
-
- Image generation: Flux/SDXL/ComfyUI pipeline errors, model load times, inference timing
|
|
18
|
-
- Environment: all relevant env vars (sanitized), Python packages, Node packages
|
|
19
|
-
|
|
20
|
-
Usage:
|
|
21
|
-
pip install psutil requests
|
|
22
|
-
# Optional: pip install pynvml docker
|
|
23
|
-
|
|
24
|
-
export DEBUGGER_API_KEY="sk_your_api_key_here"
|
|
25
|
-
export DEBUGGER_INGEST_URL="https://your-project.supabase.co/functions/v1/ingest"
|
|
26
|
-
python debugger_agent.py
|
|
3
|
+
debugger.help VPS Agent v4.0.0 — Action-Based Execution
|
|
4
|
+
|
|
5
|
+
Key changes from v3:
|
|
6
|
+
- Action-based command system: only predefined actions execute, no raw commands
|
|
7
|
+
- Fixed error/warning log flood: deduplication + excluded agent's own logs
|
|
8
|
+
- Deterministic execution: AI selects from action registry, agent maps to commands
|
|
27
9
|
"""
|
|
28
10
|
|
|
29
11
|
import os
|
|
@@ -67,7 +49,7 @@ try:
|
|
|
67
49
|
GPU_DRIVER_VERSION = GPU_DRIVER_VERSION.decode()
|
|
68
50
|
try:
|
|
69
51
|
GPU_CUDA_VERSION = pynvml.nvmlSystemGetCudaDriverVersion_v2()
|
|
70
|
-
GPU_CUDA_VERSION =
|
|
52
|
+
GPU_CUDA_VERSION = "{}.{}".format(GPU_CUDA_VERSION // 1000, (GPU_CUDA_VERSION % 1000) // 10)
|
|
71
53
|
except Exception:
|
|
72
54
|
GPU_CUDA_VERSION = "unknown"
|
|
73
55
|
except (ImportError, Exception):
|
|
@@ -93,10 +75,13 @@ logger = logging.getLogger("debugger-agent")
|
|
|
93
75
|
# Configuration
|
|
94
76
|
API_KEY = os.environ.get("DEBUGGER_API_KEY", "")
|
|
95
77
|
INGEST_URL = os.environ.get("DEBUGGER_INGEST_URL", "")
|
|
96
|
-
SOURCE_NAME = os.environ.get("DEBUGGER_SOURCE",
|
|
78
|
+
SOURCE_NAME = os.environ.get("DEBUGGER_SOURCE", "vps-{}".format(socket.gethostname()))
|
|
97
79
|
PLATFORM = os.environ.get("DEBUGGER_PLATFORM", "Python (VPS)")
|
|
98
80
|
INTERVAL = int(os.environ.get("DEBUGGER_INTERVAL", "10"))
|
|
99
|
-
VERSION = "
|
|
81
|
+
VERSION = "4.0.0"
|
|
82
|
+
|
|
83
|
+
# Derive poll-commands URL from ingest URL
|
|
84
|
+
POLL_COMMANDS_URL = INGEST_URL.replace("/ingest", "/poll-commands") if INGEST_URL else ""
|
|
100
85
|
|
|
101
86
|
# Additional log files to watch
|
|
102
87
|
WATCH_LOG_FILES = [
|
|
@@ -123,22 +108,24 @@ session.mount("https://", HTTPAdapter(max_retries=retry))
|
|
|
123
108
|
session.mount("http://", HTTPAdapter(max_retries=retry))
|
|
124
109
|
|
|
125
110
|
HEADERS = {
|
|
126
|
-
"Authorization":
|
|
111
|
+
"Authorization": "Bearer {}".format(API_KEY),
|
|
127
112
|
"Content-Type": "application/json",
|
|
128
113
|
}
|
|
129
114
|
|
|
130
115
|
|
|
131
|
-
def send(payload
|
|
116
|
+
def send(payload):
|
|
132
117
|
"""Send payload with auto-retry."""
|
|
133
118
|
try:
|
|
134
119
|
resp = session.post(INGEST_URL, json=payload, headers=HEADERS, timeout=15)
|
|
120
|
+
if resp.status_code != 200:
|
|
121
|
+
logger.warning("Send failed [%s %s]: %s", payload.get("type"), resp.status_code, resp.text[:200])
|
|
135
122
|
return resp.status_code == 200
|
|
136
123
|
except Exception as e:
|
|
137
|
-
logger.
|
|
124
|
+
logger.warning("Send error [%s]: %s", payload.get("type"), e)
|
|
138
125
|
return False
|
|
139
126
|
|
|
140
127
|
|
|
141
|
-
def run_cmd(cmd
|
|
128
|
+
def run_cmd(cmd, timeout=10):
|
|
142
129
|
"""Run shell command and return output."""
|
|
143
130
|
try:
|
|
144
131
|
result = subprocess.run(
|
|
@@ -146,26 +133,64 @@ def run_cmd(cmd: str, timeout: int = 10) -> str:
|
|
|
146
133
|
)
|
|
147
134
|
return (result.stdout + result.stderr).strip()
|
|
148
135
|
except subprocess.TimeoutExpired:
|
|
149
|
-
return
|
|
136
|
+
return "[timeout after {}s]".format(timeout)
|
|
150
137
|
except Exception as e:
|
|
151
|
-
return
|
|
138
|
+
return "[error: {}]".format(e)
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
# =============================================================================
|
|
142
|
+
# ACTION REGISTRY — the ONLY commands that can be executed remotely
|
|
143
|
+
# =============================================================================
|
|
144
|
+
|
|
145
|
+
ACTION_COMMANDS = {
|
|
146
|
+
"check_gpu": "nvidia-smi",
|
|
147
|
+
"check_gpu_detailed": "nvidia-smi --query-gpu=memory.used,memory.total,utilization.gpu,temperature.gpu --format=csv,noheader,nounits",
|
|
148
|
+
"check_docker": "systemctl status docker",
|
|
149
|
+
"restart_docker": "sudo systemctl restart docker",
|
|
150
|
+
"check_comfy": "pm2 logs yourstudio-gpu --nostream --lines 100 2>/dev/null || pm2 logs --nostream --lines 100 2>/dev/null",
|
|
151
|
+
"restart_comfy": "pm2 restart yourstudio-gpu 2>/dev/null || pm2 restart all",
|
|
152
|
+
"check_processes": "ps aux --sort=-%cpu | head -30",
|
|
153
|
+
"check_ports": "ss -tulnp",
|
|
154
|
+
"check_health": "curl -fsS --connect-timeout 5 --max-time 10 http://127.0.0.1:8188/health 2>&1 || echo 'Health check failed'",
|
|
155
|
+
"check_disk": "df -h",
|
|
156
|
+
"check_memory": "free -h",
|
|
157
|
+
"check_uptime": "uptime",
|
|
158
|
+
"check_pm2": "pm2 list",
|
|
159
|
+
"check_systemd": "systemctl --failed --no-pager --plain 2>/dev/null | head -20",
|
|
160
|
+
"check_journal_errors": "journalctl -p err --since '1 hour ago' --no-pager -q 2>/dev/null | tail -50",
|
|
161
|
+
"check_dmesg": "dmesg --time-format iso -T 2>/dev/null | tail -50",
|
|
162
|
+
"check_network": "ip addr show",
|
|
163
|
+
}
|
|
152
164
|
|
|
165
|
+
# Actions that are never auto-executed even if the server says so
|
|
166
|
+
DANGEROUS_ACTIONS = {"restart_docker", "restart_comfy"}
|
|
153
167
|
|
|
154
|
-
|
|
168
|
+
|
|
169
|
+
# =============================================================================
|
|
170
|
+
# Stdout/Stderr Capture (with deduplication)
|
|
171
|
+
# =============================================================================
|
|
155
172
|
|
|
156
173
|
class StreamCapture(io.TextIOBase):
|
|
157
|
-
"""Captures writes to stdout/stderr and
|
|
174
|
+
"""Captures writes to stdout/stderr and buffers them for batch sending."""
|
|
158
175
|
|
|
159
176
|
def __init__(self, original_stream, level="info", max_buffer=500):
|
|
160
177
|
self.original = original_stream
|
|
161
178
|
self.level = level
|
|
162
179
|
self.buffer = deque(maxlen=max_buffer)
|
|
180
|
+
self.pending = deque(maxlen=100)
|
|
163
181
|
self.lock = threading.Lock()
|
|
182
|
+
self._sending = False
|
|
183
|
+
self._recent_hashes = deque(maxlen=200)
|
|
164
184
|
|
|
165
185
|
def write(self, text):
|
|
166
186
|
self.original.write(text)
|
|
167
|
-
if text.strip():
|
|
187
|
+
if text.strip() and not self._sending:
|
|
188
|
+
msg_hash = hash(text.strip()[:200])
|
|
168
189
|
with self.lock:
|
|
190
|
+
# Deduplicate: skip if we've seen this exact message recently
|
|
191
|
+
if msg_hash in self._recent_hashes:
|
|
192
|
+
return len(text)
|
|
193
|
+
self._recent_hashes.append(msg_hash)
|
|
169
194
|
self.buffer.append(text.strip())
|
|
170
195
|
lower = text.lower()
|
|
171
196
|
detected_level = self.level
|
|
@@ -176,29 +201,43 @@ class StreamCapture(io.TextIOBase):
|
|
|
176
201
|
detected_level = "error"
|
|
177
202
|
elif any(kw in lower for kw in ["warning", "warn", "deprecat"]):
|
|
178
203
|
detected_level = "warn"
|
|
204
|
+
self.pending.append((detected_level, text.strip()[:2000]))
|
|
205
|
+
return len(text)
|
|
206
|
+
|
|
207
|
+
def flush(self):
|
|
208
|
+
self.original.flush()
|
|
179
209
|
|
|
210
|
+
def flush_pending(self):
|
|
211
|
+
"""Send buffered messages from main loop to avoid recursion."""
|
|
212
|
+
items = []
|
|
213
|
+
with self.lock:
|
|
214
|
+
while self.pending:
|
|
215
|
+
items.append(self.pending.popleft())
|
|
216
|
+
self._sending = True
|
|
217
|
+
try:
|
|
218
|
+
for level, msg in items:
|
|
180
219
|
send({
|
|
181
220
|
"type": "log",
|
|
182
221
|
"source": SOURCE_NAME,
|
|
183
222
|
"platform": PLATFORM,
|
|
184
223
|
"version": VERSION,
|
|
185
|
-
"level":
|
|
186
|
-
"message":
|
|
187
|
-
"context": {"capturedFrom":
|
|
224
|
+
"level": level,
|
|
225
|
+
"message": "[std{}] {}".format(self.level, msg),
|
|
226
|
+
"context": {"capturedFrom": "std{}".format(self.level)},
|
|
188
227
|
})
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
def flush(self):
|
|
192
|
-
self.original.flush()
|
|
228
|
+
finally:
|
|
229
|
+
self._sending = False
|
|
193
230
|
|
|
194
231
|
def get_recent(self, n=100):
|
|
195
232
|
with self.lock:
|
|
196
233
|
return list(self.buffer)[-n:]
|
|
197
234
|
|
|
198
235
|
|
|
199
|
-
#
|
|
236
|
+
# =============================================================================
|
|
237
|
+
# GPU Metrics (Deep)
|
|
238
|
+
# =============================================================================
|
|
200
239
|
|
|
201
|
-
def get_gpu_metrics()
|
|
240
|
+
def get_gpu_metrics():
|
|
202
241
|
if not GPU_AVAILABLE:
|
|
203
242
|
return {}
|
|
204
243
|
try:
|
|
@@ -227,45 +266,33 @@ def get_gpu_metrics() -> dict:
|
|
|
227
266
|
"power_w": round(power, 1),
|
|
228
267
|
}
|
|
229
268
|
|
|
230
|
-
# Power limit
|
|
231
269
|
try:
|
|
232
270
|
power_limit = pynvml.nvmlDeviceGetPowerManagementLimit(handle) / 1000
|
|
233
271
|
gpu_info["power_limit_w"] = round(power_limit, 1)
|
|
234
272
|
except Exception:
|
|
235
273
|
pass
|
|
236
274
|
|
|
237
|
-
# Throttle reasons
|
|
238
275
|
try:
|
|
239
276
|
throttle = pynvml.nvmlDeviceGetCurrentClocksThrottleReasons(handle)
|
|
240
277
|
reasons = []
|
|
241
|
-
if throttle & 0x0000000000000002:
|
|
242
|
-
|
|
243
|
-
if throttle &
|
|
244
|
-
|
|
245
|
-
if throttle &
|
|
246
|
-
|
|
247
|
-
if throttle &
|
|
248
|
-
|
|
249
|
-
if throttle & 0x0000000000000040:
|
|
250
|
-
reasons.append("sync_boost")
|
|
251
|
-
if throttle & 0x0000000000000080:
|
|
252
|
-
reasons.append("sw_thermal")
|
|
253
|
-
if throttle & 0x0000000000000100:
|
|
254
|
-
reasons.append("hw_thermal")
|
|
255
|
-
if throttle & 0x0000000000000200:
|
|
256
|
-
reasons.append("hw_power_brake")
|
|
278
|
+
if throttle & 0x0000000000000002: reasons.append("idle")
|
|
279
|
+
if throttle & 0x0000000000000004: reasons.append("app_clocks")
|
|
280
|
+
if throttle & 0x0000000000000008: reasons.append("sw_power_cap")
|
|
281
|
+
if throttle & 0x0000000000000020: reasons.append("hw_slowdown")
|
|
282
|
+
if throttle & 0x0000000000000040: reasons.append("sync_boost")
|
|
283
|
+
if throttle & 0x0000000000000080: reasons.append("sw_thermal")
|
|
284
|
+
if throttle & 0x0000000000000100: reasons.append("hw_thermal")
|
|
285
|
+
if throttle & 0x0000000000000200: reasons.append("hw_power_brake")
|
|
257
286
|
gpu_info["throttle_reasons"] = reasons if reasons else ["none"]
|
|
258
287
|
except Exception:
|
|
259
288
|
pass
|
|
260
289
|
|
|
261
|
-
# Clock speeds
|
|
262
290
|
try:
|
|
263
291
|
gpu_info["clock_graphics_mhz"] = pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_GRAPHICS)
|
|
264
292
|
gpu_info["clock_mem_mhz"] = pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_MEM)
|
|
265
293
|
except Exception:
|
|
266
294
|
pass
|
|
267
295
|
|
|
268
|
-
# Running processes on GPU
|
|
269
296
|
try:
|
|
270
297
|
procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
|
|
271
298
|
gpu_procs = []
|
|
@@ -284,7 +311,6 @@ def get_gpu_metrics() -> dict:
|
|
|
284
311
|
except Exception:
|
|
285
312
|
pass
|
|
286
313
|
|
|
287
|
-
# ECC errors
|
|
288
314
|
try:
|
|
289
315
|
ecc_single = pynvml.nvmlDeviceGetTotalEccErrors(handle, pynvml.NVML_SINGLE_BIT_ECC, pynvml.NVML_VOLATILE_ECC)
|
|
290
316
|
ecc_double = pynvml.nvmlDeviceGetTotalEccErrors(handle, pynvml.NVML_DOUBLE_BIT_ECC, pynvml.NVML_VOLATILE_ECC)
|
|
@@ -300,7 +326,6 @@ def get_gpu_metrics() -> dict:
|
|
|
300
326
|
"cuda_version": GPU_CUDA_VERSION,
|
|
301
327
|
"device_count": device_count,
|
|
302
328
|
"gpus": gpus,
|
|
303
|
-
# Flatten primary GPU for metrics table
|
|
304
329
|
"gpu_temp": gpus[0]["temp_c"] if gpus else None,
|
|
305
330
|
"gpu_vram": gpus[0]["vram_used_gb"] if gpus else None,
|
|
306
331
|
}
|
|
@@ -308,10 +333,11 @@ def get_gpu_metrics() -> dict:
|
|
|
308
333
|
return {"error": str(e)}
|
|
309
334
|
|
|
310
335
|
|
|
311
|
-
#
|
|
336
|
+
# =============================================================================
|
|
337
|
+
# PM2 Integration
|
|
338
|
+
# =============================================================================
|
|
312
339
|
|
|
313
|
-
def get_pm2_status()
|
|
314
|
-
"""Get full pm2 process list with details."""
|
|
340
|
+
def get_pm2_status():
|
|
315
341
|
raw = run_cmd("pm2 jlist 2>/dev/null")
|
|
316
342
|
if not raw or raw.startswith("[error") or raw.startswith("[timeout"):
|
|
317
343
|
return {"available": False, "raw": raw}
|
|
@@ -340,37 +366,22 @@ def get_pm2_status() -> dict:
|
|
|
340
366
|
"instances": env.get("instances"),
|
|
341
367
|
"exit_code": env.get("exit_code"),
|
|
342
368
|
}
|
|
343
|
-
# Get recent logs for this process
|
|
344
|
-
recent_logs = run_cmd(f"pm2 logs {p.get('name')} --nostream --lines 30 2>/dev/null", timeout=5)
|
|
345
|
-
if recent_logs and not recent_logs.startswith("["):
|
|
346
|
-
proc_info["recent_logs"] = recent_logs[-3000:] # Last 3KB
|
|
347
|
-
|
|
348
|
-
# Get recent error logs
|
|
349
|
-
err_log_path = env.get("pm_err_log_path")
|
|
350
|
-
if err_log_path and os.path.exists(err_log_path):
|
|
351
|
-
try:
|
|
352
|
-
with open(err_log_path, "r") as f:
|
|
353
|
-
lines = f.readlines()
|
|
354
|
-
proc_info["recent_errors"] = "".join(lines[-50:])[-3000:]
|
|
355
|
-
except Exception:
|
|
356
|
-
pass
|
|
357
|
-
|
|
358
369
|
result["processes"].append(proc_info)
|
|
359
370
|
return result
|
|
360
371
|
except json.JSONDecodeError:
|
|
361
372
|
return {"available": True, "raw_output": raw[:2000]}
|
|
362
373
|
|
|
363
374
|
|
|
364
|
-
def get_pm2_logs(lines
|
|
365
|
-
|
|
366
|
-
output = run_cmd(f"pm2 logs --nostream --lines {lines} 2>/dev/null", timeout=10)
|
|
375
|
+
def get_pm2_logs(lines=100):
|
|
376
|
+
output = run_cmd("pm2 logs --nostream --lines {} 2>/dev/null".format(lines), timeout=10)
|
|
367
377
|
return output[-5000:] if output else ""
|
|
368
378
|
|
|
369
379
|
|
|
370
|
-
#
|
|
380
|
+
# =============================================================================
|
|
381
|
+
# System Metrics (Deep)
|
|
382
|
+
# =============================================================================
|
|
371
383
|
|
|
372
|
-
def get_disk_io()
|
|
373
|
-
"""Get disk I/O stats."""
|
|
384
|
+
def get_disk_io():
|
|
374
385
|
try:
|
|
375
386
|
io_counters = psutil.disk_io_counters(perdisk=False)
|
|
376
387
|
if io_counters:
|
|
@@ -387,11 +398,8 @@ def get_disk_io() -> dict:
|
|
|
387
398
|
return {}
|
|
388
399
|
|
|
389
400
|
|
|
390
|
-
def get_network_details()
|
|
391
|
-
"""Get detailed network info."""
|
|
401
|
+
def get_network_details():
|
|
392
402
|
result = {}
|
|
393
|
-
|
|
394
|
-
# Per-interface stats
|
|
395
403
|
try:
|
|
396
404
|
net_io = psutil.net_io_counters(pernic=True)
|
|
397
405
|
interfaces = {}
|
|
@@ -412,7 +420,6 @@ def get_network_details() -> dict:
|
|
|
412
420
|
except Exception:
|
|
413
421
|
pass
|
|
414
422
|
|
|
415
|
-
# Connection states
|
|
416
423
|
try:
|
|
417
424
|
connections = psutil.net_connections(kind="tcp")
|
|
418
425
|
states = {}
|
|
@@ -424,7 +431,6 @@ def get_network_details() -> dict:
|
|
|
424
431
|
except Exception:
|
|
425
432
|
pass
|
|
426
433
|
|
|
427
|
-
# Listening ports
|
|
428
434
|
try:
|
|
429
435
|
listening = []
|
|
430
436
|
for conn in psutil.net_connections(kind="tcp"):
|
|
@@ -438,7 +444,6 @@ def get_network_details() -> dict:
|
|
|
438
444
|
except Exception:
|
|
439
445
|
pass
|
|
440
446
|
|
|
441
|
-
# DNS check
|
|
442
447
|
try:
|
|
443
448
|
start = time.time()
|
|
444
449
|
socket.getaddrinfo("google.com", 80)
|
|
@@ -449,8 +454,7 @@ def get_network_details() -> dict:
|
|
|
449
454
|
return result
|
|
450
455
|
|
|
451
456
|
|
|
452
|
-
def get_top_processes(n
|
|
453
|
-
"""Get top N processes by CPU and memory."""
|
|
457
|
+
def get_top_processes(n=15):
|
|
454
458
|
procs = []
|
|
455
459
|
for p in psutil.process_iter(["pid", "name", "cpu_percent", "memory_percent", "status", "num_fds", "num_threads", "create_time", "cmdline"]):
|
|
456
460
|
try:
|
|
@@ -473,8 +477,7 @@ def get_top_processes(n: int = 15) -> list:
|
|
|
473
477
|
return procs[:n]
|
|
474
478
|
|
|
475
479
|
|
|
476
|
-
def get_zombie_processes()
|
|
477
|
-
"""Find zombie/defunct processes."""
|
|
480
|
+
def get_zombie_processes():
|
|
478
481
|
zombies = []
|
|
479
482
|
for p in psutil.process_iter(["pid", "name", "status", "ppid"]):
|
|
480
483
|
try:
|
|
@@ -489,8 +492,7 @@ def get_zombie_processes() -> list:
|
|
|
489
492
|
return zombies
|
|
490
493
|
|
|
491
494
|
|
|
492
|
-
def get_disk_details()
|
|
493
|
-
"""Get all mount point usage and inode info."""
|
|
495
|
+
def get_disk_details():
|
|
494
496
|
mounts = []
|
|
495
497
|
for part in psutil.disk_partitions(all=False):
|
|
496
498
|
try:
|
|
@@ -504,9 +506,8 @@ def get_disk_details() -> list:
|
|
|
504
506
|
"free_gb": round(usage.free / 1e9, 1),
|
|
505
507
|
"pct": usage.percent,
|
|
506
508
|
}
|
|
507
|
-
# Inode usage (Linux)
|
|
508
509
|
try:
|
|
509
|
-
inode_output = run_cmd(
|
|
510
|
+
inode_output = run_cmd("df -i {} | tail -1".format(part.mountpoint), timeout=3)
|
|
510
511
|
parts = inode_output.split()
|
|
511
512
|
if len(parts) >= 5:
|
|
512
513
|
info["inodes_used"] = parts[2]
|
|
@@ -520,12 +521,10 @@ def get_disk_details() -> list:
|
|
|
520
521
|
return mounts
|
|
521
522
|
|
|
522
523
|
|
|
523
|
-
def get_systemd_failed()
|
|
524
|
-
"""Get failed systemd units."""
|
|
524
|
+
def get_systemd_failed():
|
|
525
525
|
output = run_cmd("systemctl --failed --no-pager --plain 2>/dev/null", timeout=5)
|
|
526
526
|
if not output or "0 loaded" in output:
|
|
527
527
|
return []
|
|
528
|
-
|
|
529
528
|
failed = []
|
|
530
529
|
for line in output.split("\n"):
|
|
531
530
|
line = line.strip()
|
|
@@ -536,8 +535,7 @@ def get_systemd_failed() -> list:
|
|
|
536
535
|
return failed[:20]
|
|
537
536
|
|
|
538
537
|
|
|
539
|
-
def get_docker_status()
|
|
540
|
-
"""Get Docker container info."""
|
|
538
|
+
def get_docker_status():
|
|
541
539
|
if not DOCKER_AVAILABLE:
|
|
542
540
|
return {"available": False}
|
|
543
541
|
try:
|
|
@@ -559,7 +557,6 @@ def get_docker_status() -> dict:
|
|
|
559
557
|
}
|
|
560
558
|
except Exception:
|
|
561
559
|
pass
|
|
562
|
-
|
|
563
560
|
containers.append({
|
|
564
561
|
"name": c.name,
|
|
565
562
|
"image": c.image.tags[0] if c.image.tags else str(c.image.id)[:12],
|
|
@@ -572,8 +569,7 @@ def get_docker_status() -> dict:
|
|
|
572
569
|
return {"available": False, "error": str(e)}
|
|
573
570
|
|
|
574
571
|
|
|
575
|
-
def check_ssl_certs()
|
|
576
|
-
"""Check SSL certificate expiry for configured domains."""
|
|
572
|
+
def check_ssl_certs():
|
|
577
573
|
results = []
|
|
578
574
|
for domain in SSL_CHECK_DOMAINS:
|
|
579
575
|
try:
|
|
@@ -596,45 +592,24 @@ def check_ssl_certs() -> list:
|
|
|
596
592
|
return results
|
|
597
593
|
|
|
598
594
|
|
|
599
|
-
def get_system_logs()
|
|
600
|
-
"""Get recent system logs (dmesg, syslog, auth)."""
|
|
595
|
+
def get_system_logs():
|
|
601
596
|
logs = {}
|
|
602
|
-
|
|
603
|
-
# dmesg (kernel messages — GPU errors, OOM kills show up here)
|
|
604
597
|
dmesg = run_cmd("dmesg --time-format iso -T 2>/dev/null | tail -50", timeout=5)
|
|
605
598
|
if dmesg and not dmesg.startswith("[error"):
|
|
606
599
|
logs["dmesg"] = dmesg[-3000:]
|
|
607
|
-
|
|
608
|
-
# Check for OOM kills specifically
|
|
609
600
|
oom = run_cmd("dmesg | grep -i 'oom\\|killed process\\|out of memory' | tail -10 2>/dev/null", timeout=5)
|
|
610
601
|
if oom and not oom.startswith("[error"):
|
|
611
602
|
logs["oom_kills"] = oom
|
|
612
|
-
|
|
613
|
-
# journalctl recent errors
|
|
614
603
|
journal = run_cmd("journalctl -p err --since '1 hour ago' --no-pager -q 2>/dev/null | tail -30", timeout=5)
|
|
615
604
|
if journal and not journal.startswith("[error"):
|
|
616
605
|
logs["journal_errors"] = journal[-2000:]
|
|
617
|
-
|
|
618
|
-
# Auth log (failed logins, suspicious activity)
|
|
619
606
|
auth = run_cmd("tail -20 /var/log/auth.log 2>/dev/null || tail -20 /var/log/secure 2>/dev/null", timeout=5)
|
|
620
607
|
if auth and not auth.startswith("[error"):
|
|
621
608
|
logs["auth_log"] = auth[-1000:]
|
|
622
|
-
|
|
623
609
|
return logs
|
|
624
610
|
|
|
625
611
|
|
|
626
|
-
def
|
|
627
|
-
"""Get firewall rules summary."""
|
|
628
|
-
# Try ufw first, then iptables
|
|
629
|
-
ufw = run_cmd("ufw status verbose 2>/dev/null", timeout=5)
|
|
630
|
-
if ufw and "Status:" in ufw:
|
|
631
|
-
return ufw[:2000]
|
|
632
|
-
ipt = run_cmd("iptables -L -n --line-numbers 2>/dev/null | head -40", timeout=5)
|
|
633
|
-
return ipt[:2000] if ipt else ""
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
def get_env_sanitized() -> dict:
|
|
637
|
-
"""Get relevant environment variables with secrets redacted."""
|
|
612
|
+
def get_env_sanitized():
|
|
638
613
|
relevant_prefixes = [
|
|
639
614
|
"NODE_", "PYTHON", "PATH", "HOME", "USER", "SHELL", "LANG",
|
|
640
615
|
"CUDA", "NVIDIA", "GPU", "LD_LIBRARY", "VIRTUAL_ENV", "CONDA",
|
|
@@ -643,7 +618,6 @@ def get_env_sanitized() -> dict:
|
|
|
643
618
|
env = {}
|
|
644
619
|
for key, value in os.environ.items():
|
|
645
620
|
if any(key.startswith(p) for p in relevant_prefixes):
|
|
646
|
-
# Redact anything that looks like a secret
|
|
647
621
|
if any(s in key.upper() for s in ["KEY", "SECRET", "TOKEN", "PASS", "AUTH"]):
|
|
648
622
|
env[key] = "[REDACTED]"
|
|
649
623
|
else:
|
|
@@ -651,19 +625,14 @@ def get_env_sanitized() -> dict:
|
|
|
651
625
|
return env
|
|
652
626
|
|
|
653
627
|
|
|
654
|
-
def get_installed_packages()
|
|
655
|
-
"""Get installed Python and Node packages."""
|
|
628
|
+
def get_installed_packages():
|
|
656
629
|
pkgs = {}
|
|
657
|
-
|
|
658
|
-
# Python packages
|
|
659
630
|
pip_list = run_cmd("pip list --format=json 2>/dev/null", timeout=10)
|
|
660
631
|
if pip_list and pip_list.startswith("["):
|
|
661
632
|
try:
|
|
662
633
|
pkgs["python"] = {p["name"]: p["version"] for p in json.loads(pip_list)}
|
|
663
634
|
except Exception:
|
|
664
635
|
pass
|
|
665
|
-
|
|
666
|
-
# Node packages (global)
|
|
667
636
|
npm_list = run_cmd("npm list -g --depth=0 --json 2>/dev/null", timeout=10)
|
|
668
637
|
if npm_list and npm_list.startswith("{"):
|
|
669
638
|
try:
|
|
@@ -671,27 +640,32 @@ def get_installed_packages() -> dict:
|
|
|
671
640
|
pkgs["node_global"] = {k: v.get("version", "?") for k, v in deps.items()}
|
|
672
641
|
except Exception:
|
|
673
642
|
pass
|
|
674
|
-
|
|
675
|
-
# pm2 ecosystem config
|
|
676
643
|
pm2_conf = run_cmd("cat ecosystem.config.js 2>/dev/null || cat ecosystem.config.cjs 2>/dev/null", timeout=3)
|
|
677
644
|
if pm2_conf and not pm2_conf.startswith("[error"):
|
|
678
645
|
pkgs["pm2_ecosystem"] = pm2_conf[:3000]
|
|
679
|
-
|
|
680
646
|
return pkgs
|
|
681
647
|
|
|
682
648
|
|
|
683
|
-
|
|
649
|
+
def get_firewall_rules():
|
|
650
|
+
ufw = run_cmd("ufw status verbose 2>/dev/null", timeout=5)
|
|
651
|
+
if ufw and "Status:" in ufw:
|
|
652
|
+
return ufw[:2000]
|
|
653
|
+
ipt = run_cmd("iptables -L -n --line-numbers 2>/dev/null | head -40", timeout=5)
|
|
654
|
+
return ipt[:2000] if ipt else ""
|
|
684
655
|
|
|
685
|
-
class LogFileWatcher(threading.Thread):
|
|
686
|
-
"""Watch log files for new lines and send them."""
|
|
687
656
|
|
|
688
|
-
|
|
657
|
+
# =============================================================================
|
|
658
|
+
# File Watcher Thread (with deduplication + agent log exclusion)
|
|
659
|
+
# =============================================================================
|
|
660
|
+
|
|
661
|
+
class LogFileWatcher(threading.Thread):
|
|
662
|
+
def __init__(self, files):
|
|
689
663
|
super().__init__(daemon=True)
|
|
690
664
|
self.files = files
|
|
691
665
|
self.positions = {}
|
|
666
|
+
self._recent_hashes = deque(maxlen=500)
|
|
692
667
|
|
|
693
668
|
def run(self):
|
|
694
|
-
# Initialize positions to end of files
|
|
695
669
|
for f in self.files:
|
|
696
670
|
try:
|
|
697
671
|
self.positions[f] = os.path.getsize(f)
|
|
@@ -705,10 +679,16 @@ class LogFileWatcher(threading.Thread):
|
|
|
705
679
|
if size > self.positions.get(filepath, 0):
|
|
706
680
|
with open(filepath, "r") as fh:
|
|
707
681
|
fh.seek(self.positions[filepath])
|
|
708
|
-
new_lines = fh.read(10000)
|
|
682
|
+
new_lines = fh.read(10000)
|
|
709
683
|
self.positions[filepath] = fh.tell()
|
|
710
684
|
|
|
711
685
|
if new_lines.strip():
|
|
686
|
+
# Deduplicate
|
|
687
|
+
msg_hash = hash(new_lines.strip()[:300])
|
|
688
|
+
if msg_hash in self._recent_hashes:
|
|
689
|
+
continue
|
|
690
|
+
self._recent_hashes.append(msg_hash)
|
|
691
|
+
|
|
712
692
|
level = "info"
|
|
713
693
|
lower = new_lines.lower()
|
|
714
694
|
if any(kw in lower for kw in ["error", "exception", "traceback", "failed", "critical"]):
|
|
@@ -722,38 +702,35 @@ class LogFileWatcher(threading.Thread):
|
|
|
722
702
|
"platform": PLATFORM,
|
|
723
703
|
"version": VERSION,
|
|
724
704
|
"level": level,
|
|
725
|
-
"message":
|
|
705
|
+
"message": "[file:{}] {}".format(os.path.basename(filepath), new_lines.strip()[:2000]),
|
|
726
706
|
"context": {"capturedFrom": "file_watcher", "file": filepath},
|
|
727
707
|
})
|
|
728
708
|
elif size < self.positions.get(filepath, 0):
|
|
729
|
-
# File was truncated/rotated
|
|
730
709
|
self.positions[filepath] = 0
|
|
731
710
|
except Exception:
|
|
732
711
|
pass
|
|
733
712
|
time.sleep(2)
|
|
734
713
|
|
|
735
714
|
|
|
736
|
-
#
|
|
715
|
+
# =============================================================================
|
|
716
|
+
# Collect Metrics
|
|
717
|
+
# =============================================================================
|
|
737
718
|
|
|
738
|
-
def collect_full_metrics()
|
|
739
|
-
"""Collect absolutely everything."""
|
|
719
|
+
def collect_full_metrics():
|
|
740
720
|
cpu = psutil.cpu_percent(interval=1)
|
|
741
721
|
cpu_per_core = psutil.cpu_percent(interval=0, percpu=True)
|
|
742
722
|
mem = psutil.virtual_memory()
|
|
743
723
|
swap = psutil.swap_memory()
|
|
744
|
-
|
|
724
|
+
|
|
745
725
|
data = {
|
|
746
726
|
"cpu": cpu,
|
|
747
727
|
"memory": mem.percent,
|
|
748
728
|
"network_latency": 0,
|
|
749
729
|
"custom": {
|
|
750
|
-
# CPU deep
|
|
751
730
|
"cpu_per_core": cpu_per_core,
|
|
752
731
|
"cpu_count_logical": psutil.cpu_count(),
|
|
753
732
|
"cpu_count_physical": psutil.cpu_count(logical=False),
|
|
754
733
|
"load_avg": list(os.getloadavg()) if hasattr(os, "getloadavg") else [],
|
|
755
|
-
|
|
756
|
-
# Memory deep
|
|
757
734
|
"memory_used_gb": round(mem.used / 1e9, 2),
|
|
758
735
|
"memory_total_gb": round(mem.total / 1e9, 2),
|
|
759
736
|
"memory_available_gb": round(mem.available / 1e9, 2),
|
|
@@ -762,12 +739,8 @@ def collect_full_metrics() -> dict:
|
|
|
762
739
|
"swap_used_gb": round(swap.used / 1e9, 2),
|
|
763
740
|
"swap_total_gb": round(swap.total / 1e9, 2),
|
|
764
741
|
"swap_pct": swap.percent,
|
|
765
|
-
|
|
766
|
-
# Disk I/O
|
|
767
742
|
"disk_io": get_disk_io(),
|
|
768
|
-
|
|
769
|
-
# OS info
|
|
770
|
-
"os": f"{os.uname().sysname} {os.uname().release}" if hasattr(os, "uname") else "unknown",
|
|
743
|
+
"os": "{} {}".format(os.uname().sysname, os.uname().release) if hasattr(os, "uname") else "unknown",
|
|
771
744
|
"hostname": socket.gethostname(),
|
|
772
745
|
"python_version": sys.version.split()[0],
|
|
773
746
|
"node_version": run_cmd("node --version 2>/dev/null"),
|
|
@@ -775,7 +748,6 @@ def collect_full_metrics() -> dict:
|
|
|
775
748
|
},
|
|
776
749
|
}
|
|
777
750
|
|
|
778
|
-
# CPU frequency
|
|
779
751
|
try:
|
|
780
752
|
freq = psutil.cpu_freq()
|
|
781
753
|
if freq:
|
|
@@ -784,7 +756,6 @@ def collect_full_metrics() -> dict:
|
|
|
784
756
|
except Exception:
|
|
785
757
|
pass
|
|
786
758
|
|
|
787
|
-
# Context switches
|
|
788
759
|
try:
|
|
789
760
|
ctx = psutil.cpu_stats()
|
|
790
761
|
data["custom"]["ctx_switches"] = ctx.ctx_switches
|
|
@@ -792,7 +763,6 @@ def collect_full_metrics() -> dict:
|
|
|
792
763
|
except Exception:
|
|
793
764
|
pass
|
|
794
765
|
|
|
795
|
-
# GPU
|
|
796
766
|
gpu = get_gpu_metrics()
|
|
797
767
|
if gpu:
|
|
798
768
|
data["gpu_temp"] = gpu.get("gpu_temp")
|
|
@@ -802,8 +772,7 @@ def collect_full_metrics() -> dict:
|
|
|
802
772
|
return data
|
|
803
773
|
|
|
804
774
|
|
|
805
|
-
def collect_deep_snapshot()
|
|
806
|
-
"""Full system snapshot for variable inspector."""
|
|
775
|
+
def collect_deep_snapshot():
|
|
807
776
|
snapshot = {
|
|
808
777
|
"system": {
|
|
809
778
|
"hostname": socket.gethostname(),
|
|
@@ -811,7 +780,7 @@ def collect_deep_snapshot() -> dict:
|
|
|
811
780
|
"cpu_count": psutil.cpu_count(),
|
|
812
781
|
"memory_total_gb": round(psutil.virtual_memory().total / 1e9, 2),
|
|
813
782
|
"python_version": sys.version.split()[0],
|
|
814
|
-
"os":
|
|
783
|
+
"os": "{} {}".format(os.uname().sysname, os.uname().release) if hasattr(os, "uname") else "unknown",
|
|
815
784
|
"kernel": run_cmd("uname -r 2>/dev/null"),
|
|
816
785
|
},
|
|
817
786
|
"gpu": get_gpu_metrics() if GPU_AVAILABLE else {"available": False},
|
|
@@ -825,28 +794,23 @@ def collect_deep_snapshot() -> dict:
|
|
|
825
794
|
"firewall": get_firewall_rules(),
|
|
826
795
|
"environment": get_env_sanitized(),
|
|
827
796
|
}
|
|
828
|
-
|
|
829
|
-
# Job state
|
|
830
797
|
state = job_tracker.get_state()
|
|
831
798
|
if state:
|
|
832
799
|
snapshot["current_job"] = state
|
|
833
|
-
|
|
834
800
|
return snapshot
|
|
835
801
|
|
|
836
802
|
|
|
837
|
-
#
|
|
803
|
+
# =============================================================================
|
|
804
|
+
# Job State Tracking
|
|
805
|
+
# =============================================================================
|
|
838
806
|
|
|
839
807
|
class JobTracker:
|
|
840
|
-
"""Track job states from BullMQ or similar queue systems."""
|
|
841
|
-
|
|
842
808
|
def __init__(self):
|
|
843
809
|
self.current_job = None
|
|
844
810
|
self.job_history = deque(maxlen=50)
|
|
845
811
|
self.lock = threading.Lock()
|
|
846
812
|
|
|
847
|
-
def update(self, job_id
|
|
848
|
-
last_action: str = "", error: str | None = None,
|
|
849
|
-
metadata: dict | None = None):
|
|
813
|
+
def update(self, job_id, status, progress=0, last_action="", error=None, metadata=None):
|
|
850
814
|
with self.lock:
|
|
851
815
|
job = {
|
|
852
816
|
"job_id": job_id,
|
|
@@ -860,11 +824,11 @@ class JobTracker:
|
|
|
860
824
|
self.current_job = job
|
|
861
825
|
self.job_history.append(job)
|
|
862
826
|
|
|
863
|
-
def get_state(self)
|
|
827
|
+
def get_state(self):
|
|
864
828
|
with self.lock:
|
|
865
829
|
return self.current_job.copy() if self.current_job else None
|
|
866
830
|
|
|
867
|
-
def get_history(self, n
|
|
831
|
+
def get_history(self, n=20):
|
|
868
832
|
with self.lock:
|
|
869
833
|
return list(self.job_history)[-n:]
|
|
870
834
|
|
|
@@ -881,14 +845,16 @@ class JobTracker:
|
|
|
881
845
|
})
|
|
882
846
|
|
|
883
847
|
|
|
884
|
-
#
|
|
848
|
+
# =============================================================================
|
|
849
|
+
# Public API
|
|
850
|
+
# =============================================================================
|
|
885
851
|
|
|
886
852
|
job_tracker = JobTracker()
|
|
887
853
|
stdout_capture = None
|
|
888
854
|
stderr_capture = None
|
|
889
855
|
|
|
890
856
|
|
|
891
|
-
def send_log(level
|
|
857
|
+
def send_log(level, message, context=None):
|
|
892
858
|
send({
|
|
893
859
|
"type": "log",
|
|
894
860
|
"source": SOURCE_NAME,
|
|
@@ -900,7 +866,7 @@ def send_log(level: str, message: str, context: dict = None):
|
|
|
900
866
|
})
|
|
901
867
|
|
|
902
868
|
|
|
903
|
-
def send_error(title
|
|
869
|
+
def send_error(title, stack_trace="", context=None):
|
|
904
870
|
send({
|
|
905
871
|
"type": "error",
|
|
906
872
|
"source": SOURCE_NAME,
|
|
@@ -912,74 +878,236 @@ def send_error(title: str, stack_trace: str = "", context: dict = None):
|
|
|
912
878
|
})
|
|
913
879
|
|
|
914
880
|
|
|
915
|
-
def update_job(job_id
|
|
916
|
-
last_action: str = "", error: str = None, metadata: dict = None):
|
|
881
|
+
def update_job(job_id, status, progress=0, last_action="", error=None, metadata=None):
    """Record a job state change, publish it, and emit an error event on failure.

    The new state is stored in the module-level ``job_tracker`` and pushed
    with ``job_tracker.send_update()``. When ``error`` is truthy an error
    event titled ``Job <job_id> failed: <error>`` is sent as well.

    Args:
        job_id: Identifier of the job being updated.
        status: New status value for the job.
        progress: Progress value forwarded to the tracker (default 0).
        last_action: Description of the most recent step (default "").
        error: Error description; any truthy value triggers ``send_error``.
        metadata: Optional mapping forwarded to the tracker and merged into
            the error context. Its keys are applied last, so they override
            ``job_id`` / ``status`` / ``last_action`` on a name clash.
    """
    job_tracker.update(job_id, status, progress, last_action, error, metadata)
    job_tracker.send_update()
    if not error:
        return

    title = "Job {} failed: {}".format(job_id, error)
    error_context = {
        "job_id": job_id,
        "status": status,
        "last_action": last_action,
        **(metadata or {}),
    }
    send_error(title, context=error_context)
|
|
925
889
|
|
|
926
890
|
|
|
927
|
-
def capture_image_gen(job_id
|
|
928
|
-
|
|
929
|
-
duration_s: float = None):
|
|
930
|
-
"""Specialized capture for image generation pipelines."""
|
|
931
|
-
context = {
|
|
932
|
-
"job_id": job_id,
|
|
933
|
-
"model": model,
|
|
934
|
-
"params": params,
|
|
935
|
-
"duration_s": duration_s,
|
|
936
|
-
}
|
|
891
|
+
def capture_image_gen(job_id, model, params, result=None, error=None, duration_s=None):
    """Report the outcome of one image-generation run and update its job state.

    A truthy ``result`` is logged at info level; a truthy ``error`` is sent
    as an error event. The job is then marked ``completed`` (progress 100)
    when ``result`` is truthy and ``failed`` (progress 0) otherwise.

    Args:
        job_id: Identifier of the job the generation belongs to.
        model: Model name, used in messages and in ``last_action``.
        params: Generation parameters, passed through in the context.
        result: Generation result; truthiness decides success.
        error: Error description, if any.
        duration_s: Run time in seconds; ``None`` is displayed as 0.0.
    """
    context = dict(job_id=job_id, model=model, params=params, duration_s=duration_s)
    succeeded = bool(result)

    if succeeded:
        context["result"] = result
        summary = "Image gen complete: {} ({:.1f}s)".format(model, duration_s or 0)
        send_log("info", summary, context)

    if error:
        context["error"] = error
        # NOTE(review): update_job() below also calls send_error() when
        # ``error`` is set, so a failure appears to be reported twice with
        # different titles — confirm this is intended.
        send_error("Image gen failed: {} — {}".format(model, error), context=context)

    if succeeded:
        status, progress = "completed", 100
    else:
        status, progress = "failed", 0

    update_job(
        job_id,
        status,
        progress=progress,
        last_action="generate:{}".format(model),
        error=error,
        metadata=context,
    )
|
|
903
|
+
|
|
904
|
+
|
|
905
|
+
# =============================================================================
|
|
906
|
+
# Action-Based Command Execution
|
|
907
|
+
# =============================================================================
|
|
908
|
+
|
|
909
|
+
def execute_action(action_id, timeout=30):
    """Execute a predefined action and return its result.

    The command text is looked up in the module-level ``ACTION_COMMANDS``
    mapping; the caller only supplies the key, never the command itself.

    Args:
        action_id: Key to look up in ``ACTION_COMMANDS``.
        timeout: Maximum run time in seconds, handed to ``subprocess.run``.

    Returns:
        A dict with three keys:

        ``output``
            stdout, followed by stderr under a ``--- stderr ---`` separator
            when both are present, stripped and capped at 50000 characters;
            or a bracketed diagnostic message on failure.
        ``exit_code``
            The command's return code, or -1 when the action is unknown,
            timed out, or could not be run.
        ``duration_ms``
            Elapsed time in whole milliseconds (0 for an unknown action).
    """
    cmd = ACTION_COMMANDS.get(action_id)
    if not cmd:
        return {
            "output": "[Unknown action: {}]".format(action_id),
            "exit_code": -1,
            "duration_ms": 0,
        }

    # Monotonic clock: a wall-clock adjustment during the run must not yield
    # a negative or inflated duration.
    start = time.monotonic()

    def elapsed_ms():
        return int((time.monotonic() - start) * 1000)

    try:
        # NOTE(review): shell=True is only safe while every ACTION_COMMANDS
        # value is a fixed string — confirm none is built from external input.
        result = subprocess.run(
            cmd,
            shell=True,
            capture_output=True,
            text=True,
            # Command output is not guaranteed to be valid text in the locale
            # encoding; substitute undecodable bytes instead of losing the
            # whole result to a UnicodeDecodeError.
            errors="replace",
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        return {
            "output": "[Action timed out after {}s]".format(timeout),
            "exit_code": -1,
            "duration_ms": elapsed_ms(),
        }
    except Exception as e:
        return {
            "output": "[Execution error: {}]".format(e),
            "exit_code": -1,
            "duration_ms": elapsed_ms(),
        }

    duration_ms = elapsed_ms()
    output = result.stdout
    if result.stderr:
        output += ("\n--- stderr ---\n" + result.stderr) if output else result.stderr
    return {
        "output": output.strip()[:50000],
        "exit_code": result.returncode,
        "duration_ms": duration_ms,
    }
|
|
949
947
|
|
|
950
948
|
|
|
951
|
-
|
|
949
|
+
def poll_and_execute_commands():
    """Poll the command endpoint once and run any predefined actions it returns.

    Does nothing when ``POLL_COMMANDS_URL`` is unset or the poll does not
    answer with HTTP 200. Each returned entry must carry a command of the
    form ``ACTION:<action_id>`` whose ``<action_id>`` is registered in
    ``ACTION_COMMANDS``; anything else is reported back with exit code -2
    and never executed. Accepted actions are claimed, run through
    ``execute_action``, and their result is posted back and logged.

    Any exception (network failure, unexpected payload) ends this poll cycle
    and is logged at debug level; nothing is raised to the caller.
    """
    if not POLL_COMMANDS_URL:
        return

    try:
        resp = session.post(
            POLL_COMMANDS_URL,
            json={"action": "poll", "source_name": SOURCE_NAME},
            headers=HEADERS,
            timeout=10,
        )
        if resp.status_code != 200:
            return

        data = resp.json()
        # ``or`` also covers an explicit null in the payload.
        commands = data.get("commands") or []
        settings = data.get("settings") or {}

        # The limit comes from the server: fall back to the default when it
        # is not a positive number, then cap it at 60 seconds.
        max_timeout = settings.get("max_timeout_s", 30)
        usable = (
            isinstance(max_timeout, (int, float))
            and not isinstance(max_timeout, bool)
            and max_timeout > 0
        )
        timeout = min(max_timeout if usable else 30, 60)

        for cmd_entry in commands:
            _process_polled_command(cmd_entry, timeout)

    except Exception as e:
        logger.debug("Command poll error: %s", e)


def _report_command_result(cmd_id, output, exit_code, duration_ms):
    """POST the outcome of one polled command back to the command endpoint."""
    session.post(
        POLL_COMMANDS_URL,
        json={
            "action": "result",
            "command_id": cmd_id,
            "output": output,
            "exit_code": exit_code,
            "duration_ms": duration_ms,
            "source_name": SOURCE_NAME,
        },
        headers=HEADERS,
        timeout=10,
    )


def _process_polled_command(cmd_entry, timeout):
    """Validate, claim, run and report a single entry of the poll response."""
    prefix = "ACTION:"

    cmd_id = cmd_entry.get("id") if isinstance(cmd_entry, dict) else None
    if cmd_id is None:
        # Without an id there is nothing to claim or report against; skip the
        # entry so it cannot block the rest of the batch.
        logger.warning("Skipping malformed command entry: %s", repr(cmd_entry)[:100])
        return
    cmd_str = cmd_entry.get("command")

    # Only accept ACTION: prefixed commands
    if not isinstance(cmd_str, str) or not cmd_str.startswith(prefix):
        logger.warning("Rejected non-action command: %s", str(cmd_str)[:50])
        _report_command_result(
            cmd_id,
            "[REJECTED] Only predefined actions are allowed. Raw commands are disabled.",
            -2,
            0,
        )
        return

    # Drop the leading prefix only; str.replace() would also remove any later
    # occurrence of "ACTION:" inside the identifier.
    action_id = cmd_str[len(prefix):]

    if action_id not in ACTION_COMMANDS:
        logger.warning("Unknown action: %s", action_id)
        _report_command_result(
            cmd_id,
            "[REJECTED] Unknown action: {}".format(action_id),
            -2,
            0,
        )
        return

    # str() so a non-string id (e.g. an integer) can still be abbreviated.
    logger.info("Executing action: %s (id=%s...)", action_id, str(cmd_id)[:8])

    # Claim
    # NOTE(review): the claim response is not inspected — confirm whether the
    # server can refuse a claim (e.g. already taken by another agent).
    session.post(
        POLL_COMMANDS_URL,
        json={"action": "claim", "command_id": cmd_id},
        headers=HEADERS,
        timeout=10,
    )

    # Execute the mapped command
    result = execute_action(action_id, timeout=timeout)
    logger.info(
        "Action %s completed: exit_code=%s (%sms)",
        action_id, result["exit_code"], result["duration_ms"],
    )

    # Report result
    _report_command_result(
        cmd_id, result["output"], result["exit_code"], result["duration_ms"]
    )

    # Log execution
    send_log(
        "info" if result["exit_code"] == 0 else "warn",
        "[action] {} -> exit {} ({}ms)".format(
            action_id, result["exit_code"], result["duration_ms"]
        ),
        {"action": action_id, "exit_code": result["exit_code"], "command_id": cmd_id},
    )
|
|
1048
|
+
|
|
1049
|
+
|
|
1050
|
+
# =============================================================================
|
|
1051
|
+
# PM2 Log Tracker (prevents re-sending same lines)
|
|
1052
|
+
# =============================================================================
|
|
1053
|
+
|
|
1054
|
+
class PM2LogTracker:
    """Tracks which PM2 log lines have been sent to prevent duplicates.

    Only hashes of the (stripped, 200-character-capped) lines are kept, in a
    bounded window: once ``max_seen`` distinct error lines have been
    recorded, the oldest one is forgotten and would be reported again.
    """

    # Substrings, matched case-insensitively, that mark a line as an error.
    ERROR_KEYWORDS = ("error", "exception", "failed", "crash", "enoent", "eacces", "killed")

    def __init__(self, max_seen=1000):
        """Create a tracker remembering at most ``max_seen`` error lines."""
        max_seen = max(1, int(max_seen))
        # The deque fixes the eviction order; the set mirrors its content so
        # membership tests do not scan the whole window for every line.
        self._seen_hashes = deque(maxlen=max_seen)
        self._seen_lookup = set()

    def get_new_errors(self, pm2_output):
        """Return only error lines that haven't been seen before.

        Args:
            pm2_output: Raw PM2 log text; ``None`` or an empty string yields
                an empty list.

        Returns:
            The error lines not reported by an earlier call, in order of
            appearance, each stripped and truncated to 500 characters. A line
            repeated within ``pm2_output`` is returned once.
        """
        if not pm2_output:
            return []

        new_errors = []
        for line in pm2_output.split("\n"):
            stripped = line.strip()
            lower = stripped.lower()
            if not any(kw in lower for kw in self.ERROR_KEYWORDS):
                continue

            line_hash = hash(stripped[:200])
            if line_hash in self._seen_lookup:
                continue

            # Appending to a full deque silently drops its oldest entry;
            # drop it from the lookup set as well to keep the two in sync.
            if len(self._seen_hashes) == self._seen_hashes.maxlen:
                self._seen_lookup.discard(self._seen_hashes[0])
            self._seen_hashes.append(line_hash)
            self._seen_lookup.add(line_hash)
            new_errors.append(stripped[:500])
        return new_errors


pm2_log_tracker = PM2LogTracker()
|
|
1073
|
+
|
|
1074
|
+
|
|
1075
|
+
# =============================================================================
|
|
1076
|
+
# Main Loop
|
|
1077
|
+
# =============================================================================
|
|
952
1078
|
|
|
953
1079
|
def main():
|
|
954
1080
|
global stdout_capture, stderr_capture
|
|
955
1081
|
|
|
956
|
-
# Capture stdout/stderr
|
|
957
1082
|
stdout_capture = StreamCapture(sys.stdout, "info")
|
|
958
1083
|
stderr_capture = StreamCapture(sys.stderr, "error")
|
|
959
1084
|
sys.stdout = stdout_capture
|
|
960
1085
|
sys.stderr = stderr_capture
|
|
961
1086
|
|
|
962
|
-
logger.info(
|
|
963
|
-
logger.info(
|
|
964
|
-
logger.info(
|
|
1087
|
+
logger.info("debugger.help Agent v%s — Action-Based Execution", VERSION)
|
|
1088
|
+
logger.info("Source: %s | GPU: %s | Docker: %s", SOURCE_NAME, "yes" if GPU_AVAILABLE else "no", "yes" if DOCKER_AVAILABLE else "no")
|
|
1089
|
+
logger.info("Interval: %ss | Endpoint: %s", INTERVAL, INGEST_URL)
|
|
1090
|
+
logger.info("Registered actions: %s", ", ".join(sorted(ACTION_COMMANDS.keys())))
|
|
965
1091
|
|
|
966
|
-
# Start file watchers
|
|
1092
|
+
# Start file watchers — exclude agent's own log files
|
|
967
1093
|
watch_files = list(WATCH_LOG_FILES)
|
|
968
|
-
# Auto-discover pm2 log files
|
|
969
1094
|
pm2_log_dir = os.path.expanduser("~/.pm2/logs")
|
|
970
1095
|
if os.path.isdir(pm2_log_dir):
|
|
971
1096
|
pm2_logs = glob.glob(os.path.join(pm2_log_dir, "*.log"))
|
|
1097
|
+
# Exclude the debugger agent's own log files to prevent feedback loop
|
|
1098
|
+
pm2_logs = [f for f in pm2_logs if "debugger-agent" not in os.path.basename(f).lower()
|
|
1099
|
+
and "debugger_agent" not in os.path.basename(f).lower()]
|
|
972
1100
|
watch_files.extend(pm2_logs)
|
|
973
|
-
logger.info(
|
|
974
|
-
|
|
1101
|
+
logger.info("Watching %d pm2 log files (excluded agent logs)", len(pm2_logs))
|
|
1102
|
+
|
|
975
1103
|
if watch_files:
|
|
976
1104
|
watcher = LogFileWatcher(watch_files)
|
|
977
1105
|
watcher.start()
|
|
978
|
-
logger.info(
|
|
1106
|
+
logger.info("File watcher started for %d files", len(watch_files))
|
|
979
1107
|
|
|
980
1108
|
# Initial connection
|
|
981
1109
|
send({"type": "heartbeat", "source": SOURCE_NAME, "platform": PLATFORM, "version": VERSION})
|
|
982
|
-
send_log("info",
|
|
1110
|
+
send_log("info", "Agent v{} started on {}".format(VERSION, socket.gethostname()), {
|
|
983
1111
|
"hostname": socket.gethostname(),
|
|
984
1112
|
"python_version": sys.version,
|
|
985
1113
|
"gpu_available": GPU_AVAILABLE,
|
|
@@ -987,9 +1115,10 @@ def main():
|
|
|
987
1115
|
"pid": os.getpid(),
|
|
988
1116
|
"gpu_driver": GPU_DRIVER_VERSION,
|
|
989
1117
|
"cuda_version": GPU_CUDA_VERSION,
|
|
1118
|
+
"available_actions": list(ACTION_COMMANDS.keys()),
|
|
990
1119
|
})
|
|
991
1120
|
|
|
992
|
-
# Send initial deep snapshot
|
|
1121
|
+
# Send initial deep snapshot
|
|
993
1122
|
try:
|
|
994
1123
|
pkgs = get_installed_packages()
|
|
995
1124
|
send({
|
|
@@ -1022,10 +1151,9 @@ def main():
|
|
|
1022
1151
|
else:
|
|
1023
1152
|
consecutive_failures += 1
|
|
1024
1153
|
|
|
1025
|
-
# Auto-reconnect backoff
|
|
1026
1154
|
if consecutive_failures > 5:
|
|
1027
1155
|
backoff = min(consecutive_failures * 5, 60)
|
|
1028
|
-
logger.warning(
|
|
1156
|
+
logger.warning("Connection issues. Backing off %ss...", backoff)
|
|
1029
1157
|
time.sleep(backoff)
|
|
1030
1158
|
send({"type": "heartbeat", "source": SOURCE_NAME, "platform": PLATFORM, "version": VERSION})
|
|
1031
1159
|
consecutive_failures = 0
|
|
@@ -1042,15 +1170,13 @@ def main():
|
|
|
1042
1170
|
"variables": snapshot,
|
|
1043
1171
|
})
|
|
1044
1172
|
|
|
1045
|
-
# PM2 logs check every ~
|
|
1173
|
+
# PM2 logs check every ~30s (with deduplication)
|
|
1046
1174
|
if tick % 3 == 0:
|
|
1047
1175
|
pm2_logs = get_pm2_logs(50)
|
|
1048
1176
|
if pm2_logs:
|
|
1049
|
-
|
|
1050
|
-
for line in
|
|
1051
|
-
|
|
1052
|
-
if any(kw in lower for kw in ["error", "exception", "failed", "crash", "enoent", "eacces", "killed"]):
|
|
1053
|
-
send_log("error", f"[pm2] {line.strip()[:500]}", {"capturedFrom": "pm2_logs"})
|
|
1177
|
+
new_errors = pm2_log_tracker.get_new_errors(pm2_logs)
|
|
1178
|
+
for line in new_errors[:5]: # Max 5 new errors per check
|
|
1179
|
+
send_log("error", "[pm2] {}".format(line), {"capturedFrom": "pm2_logs"})
|
|
1054
1180
|
|
|
1055
1181
|
# System logs check every ~2 min
|
|
1056
1182
|
if tick % 12 == 0:
|
|
@@ -1070,8 +1196,8 @@ def main():
|
|
|
1070
1196
|
for r in ssl_results:
|
|
1071
1197
|
if r.get("warning") or r.get("error"):
|
|
1072
1198
|
days = r.get("days_left")
|
|
1073
|
-
detail = r.get("error") or
|
|
1074
|
-
send_log("warn",
|
|
1199
|
+
detail = r.get("error") or "{} days left".format(days)
|
|
1200
|
+
send_log("warn", "SSL cert issue: {} — {}".format(r.get("domain"), detail), r)
|
|
1075
1201
|
send({
|
|
1076
1202
|
"type": "inspect",
|
|
1077
1203
|
"source": SOURCE_NAME,
|
|
@@ -1080,18 +1206,19 @@ def main():
|
|
|
1080
1206
|
"variables": {"ssl_certs": ssl_results},
|
|
1081
1207
|
})
|
|
1082
1208
|
|
|
1083
|
-
# GPU warnings every ~
|
|
1209
|
+
# GPU warnings every ~30s (only when thresholds exceeded)
|
|
1084
1210
|
if GPU_AVAILABLE and tick % 3 == 0:
|
|
1085
1211
|
gpu = get_gpu_metrics()
|
|
1086
1212
|
gpus = gpu.get("gpus", [])
|
|
1087
1213
|
for g in gpus:
|
|
1088
1214
|
if g.get("temp_c", 0) > 85:
|
|
1089
|
-
send_log("warn",
|
|
1215
|
+
send_log("warn", "GPU {} temperature critical: {}C".format(g["index"], g["temp_c"]), g)
|
|
1090
1216
|
if g.get("vram_pct", 0) > 90:
|
|
1091
|
-
send_log("warn",
|
|
1217
|
+
send_log("warn", "GPU {} VRAM critical: {}% ({}/{} GB)".format(
|
|
1218
|
+
g["index"], g["vram_pct"], g["vram_used_gb"], g["vram_total_gb"]), g)
|
|
1092
1219
|
throttle = g.get("throttle_reasons", [])
|
|
1093
1220
|
if throttle and throttle != ["none"] and throttle != ["idle"]:
|
|
1094
|
-
send_log("warn",
|
|
1221
|
+
send_log("warn", "GPU {} throttling: {}".format(g["index"], ", ".join(throttle)), g)
|
|
1095
1222
|
|
|
1096
1223
|
# Heartbeat every ~5 min
|
|
1097
1224
|
if tick % 30 == 0 and tick > 0:
|
|
@@ -1108,6 +1235,15 @@ def main():
|
|
|
1108
1235
|
"variables": {"installed_packages": pkgs},
|
|
1109
1236
|
})
|
|
1110
1237
|
|
|
1238
|
+
# Poll for remote actions every tick
|
|
1239
|
+
poll_and_execute_commands()
|
|
1240
|
+
|
|
1241
|
+
# Flush captured stdout/stderr
|
|
1242
|
+
if stdout_capture:
|
|
1243
|
+
stdout_capture.flush_pending()
|
|
1244
|
+
if stderr_capture:
|
|
1245
|
+
stderr_capture.flush_pending()
|
|
1246
|
+
|
|
1111
1247
|
tick += 1
|
|
1112
1248
|
time.sleep(INTERVAL)
|
|
1113
1249
|
|
|
@@ -1118,7 +1254,7 @@ def main():
|
|
|
1118
1254
|
sys.stderr = stderr_capture.original
|
|
1119
1255
|
break
|
|
1120
1256
|
except Exception as e:
|
|
1121
|
-
logger.error(
|
|
1257
|
+
logger.error("Main loop error: %s", e)
|
|
1122
1258
|
send_error(str(e), traceback.format_exc())
|
|
1123
1259
|
time.sleep(INTERVAL)
|
|
1124
1260
|
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "debugger-help"
|
|
7
|
-
version = "
|
|
7
|
+
version = "4.0.0"
|
|
8
8
|
description = "debugger.help VPS Agent — Deep system monitoring for logs, GPU, PM2, Docker, and more"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = {text = "MIT"}
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|