debugger-help 3.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
debugger_help/agent.py
ADDED
|
@@ -0,0 +1,1125 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
debugger.help VPS Agent v3 — Ultimate Deep Debugger
|
|
4
|
+
|
|
5
|
+
Captures EVERYTHING from your VPS:
|
|
6
|
+
- pm2 logs, status, process list, restart counts
|
|
7
|
+
- GPU: VRAM, temp, power, utilization, ECC errors, throttling, driver version, CUDA version
|
|
8
|
+
- CPU: per-core usage, load average, frequency, context switches
|
|
9
|
+
- Memory: RAM, swap, shared, buffers, cached
|
|
10
|
+
- Disk: usage per mount, I/O read/write rates, inode usage
|
|
11
|
+
- Network: per-interface stats, connection counts by state, open ports, DNS resolution
|
|
12
|
+
- Processes: top CPU/memory consumers, zombie/defunct processes, open file descriptors
|
|
13
|
+
- Docker containers (if running): status, CPU, memory, restart counts
|
|
14
|
+
- Systemd services: failed units
|
|
15
|
+
- SSL certificates: expiry checks
|
|
16
|
+
- File watchers: key log files (syslog, dmesg, pm2 logs, custom)
|
|
17
|
+
- Image generation: Flux/SDXL/ComfyUI pipeline errors, model load times, inference timing
|
|
18
|
+
- Environment: all relevant env vars (sanitized), Python packages, Node packages
|
|
19
|
+
|
|
20
|
+
Usage:
|
|
21
|
+
pip install psutil requests
|
|
22
|
+
# Optional: pip install pynvml docker
|
|
23
|
+
|
|
24
|
+
export DEBUGGER_API_KEY="sk_your_api_key_here"
|
|
25
|
+
export DEBUGGER_INGEST_URL="https://your-project.supabase.co/functions/v1/ingest"
|
|
26
|
+
python debugger_agent.py
|
|
27
|
+
"""
|
|
28
|
+
|
|
29
|
+
import os
|
|
30
|
+
import sys
|
|
31
|
+
import time
|
|
32
|
+
import json
|
|
33
|
+
import socket
|
|
34
|
+
import logging
|
|
35
|
+
import traceback
|
|
36
|
+
import threading
|
|
37
|
+
import subprocess
|
|
38
|
+
import io
|
|
39
|
+
import re
|
|
40
|
+
import glob
|
|
41
|
+
import ssl
|
|
42
|
+
from datetime import datetime, timezone
|
|
43
|
+
from collections import deque
|
|
44
|
+
from pathlib import Path
|
|
45
|
+
|
|
46
|
+
try:
|
|
47
|
+
import psutil
|
|
48
|
+
except ImportError:
|
|
49
|
+
print("ERROR: psutil required. Install with: pip install psutil")
|
|
50
|
+
sys.exit(1)
|
|
51
|
+
|
|
52
|
+
try:
|
|
53
|
+
import requests
|
|
54
|
+
from requests.adapters import HTTPAdapter
|
|
55
|
+
from urllib3.util.retry import Retry
|
|
56
|
+
except ImportError:
|
|
57
|
+
print("ERROR: requests required. Install with: pip install requests")
|
|
58
|
+
sys.exit(1)
|
|
59
|
+
|
|
60
|
+
# Optional GPU monitoring
|
|
61
|
+
try:
    import pynvml
    pynvml.nvmlInit()
    GPU_AVAILABLE = True
    GPU_DRIVER_VERSION = pynvml.nvmlSystemGetDriverVersion()
    if isinstance(GPU_DRIVER_VERSION, bytes):  # older pynvml returns bytes
        GPU_DRIVER_VERSION = GPU_DRIVER_VERSION.decode()
    try:
        # NVML reports CUDA as an int like 12040 -> "12.4".
        GPU_CUDA_VERSION = pynvml.nvmlSystemGetCudaDriverVersion_v2()
        GPU_CUDA_VERSION = f"{GPU_CUDA_VERSION // 1000}.{(GPU_CUDA_VERSION % 1000) // 10}"
    except Exception:
        GPU_CUDA_VERSION = "unknown"
# `Exception` already covers ImportError (the original listed both, which is
# redundant); NVML init failures raise pynvml.NVMLError, also an Exception.
except Exception:
    GPU_AVAILABLE = False
    GPU_DRIVER_VERSION = None
    GPU_CUDA_VERSION = None
|
|
77
|
+
|
|
78
|
+
# Optional Docker monitoring
|
|
79
|
+
try:
|
|
80
|
+
import docker
|
|
81
|
+
DOCKER_CLIENT = docker.from_env()
|
|
82
|
+
DOCKER_AVAILABLE = True
|
|
83
|
+
except Exception:
|
|
84
|
+
DOCKER_CLIENT = None
|
|
85
|
+
DOCKER_AVAILABLE = False
|
|
86
|
+
|
|
87
|
+
logging.basicConfig(
|
|
88
|
+
level=logging.INFO,
|
|
89
|
+
format="%(asctime)s [%(levelname)s] %(message)s"
|
|
90
|
+
)
|
|
91
|
+
logger = logging.getLogger("debugger-agent")
|
|
92
|
+
|
|
93
|
+
# Configuration
|
|
94
|
+
API_KEY = os.environ.get("DEBUGGER_API_KEY", "")
|
|
95
|
+
INGEST_URL = os.environ.get("DEBUGGER_INGEST_URL", "")
|
|
96
|
+
SOURCE_NAME = os.environ.get("DEBUGGER_SOURCE", f"vps-{socket.gethostname()}")
|
|
97
|
+
PLATFORM = os.environ.get("DEBUGGER_PLATFORM", "Python (VPS)")
|
|
98
|
+
INTERVAL = int(os.environ.get("DEBUGGER_INTERVAL", "10"))
|
|
99
|
+
VERSION = "3.0.0"
|
|
100
|
+
|
|
101
|
+
# Additional log files to watch
|
|
102
|
+
WATCH_LOG_FILES = [
|
|
103
|
+
p for p in os.environ.get("DEBUGGER_WATCH_LOGS", "").split(",") if p.strip()
|
|
104
|
+
] or []
|
|
105
|
+
|
|
106
|
+
# SSL domains to check
|
|
107
|
+
SSL_CHECK_DOMAINS = [
|
|
108
|
+
d for d in os.environ.get("DEBUGGER_SSL_DOMAINS", "").split(",") if d.strip()
|
|
109
|
+
] or []
|
|
110
|
+
|
|
111
|
+
if not API_KEY:
|
|
112
|
+
logger.error("DEBUGGER_API_KEY not set.")
|
|
113
|
+
sys.exit(1)
|
|
114
|
+
if not INGEST_URL:
|
|
115
|
+
logger.error("DEBUGGER_INGEST_URL not set.")
|
|
116
|
+
sys.exit(1)
|
|
117
|
+
|
|
118
|
+
# --- HTTP Session with Retry ---
|
|
119
|
+
|
|
120
|
+
session = requests.Session()
|
|
121
|
+
retry = Retry(total=3, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
|
|
122
|
+
session.mount("https://", HTTPAdapter(max_retries=retry))
|
|
123
|
+
session.mount("http://", HTTPAdapter(max_retries=retry))
|
|
124
|
+
|
|
125
|
+
HEADERS = {
|
|
126
|
+
"Authorization": f"Bearer {API_KEY}",
|
|
127
|
+
"Content-Type": "application/json",
|
|
128
|
+
}
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def send(payload: dict) -> bool:
    """POST *payload* to the ingest endpoint.

    Returns True on any 2xx response.  Transport errors are swallowed and
    logged at debug level so a flaky network never crashes the agent (the
    mounted HTTPAdapter already retries 5xx responses).
    """
    try:
        resp = session.post(INGEST_URL, json=payload, headers=HEADERS, timeout=15)
        # Accept the whole 2xx range, not just 200 — ingest endpoints commonly
        # answer 201/202 for accepted payloads, which the original mis-counted
        # as failure.
        return 200 <= resp.status_code < 300
    except Exception as e:
        logger.debug(f"Send failed (will retry): {e}")
        return False
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def run_cmd(cmd: str, timeout: int = 10) -> str:
    """Execute *cmd* through the shell and return combined stdout+stderr.

    Never raises: timeouts and failures come back as bracketed diagnostic
    strings so callers can treat every outcome as plain text.
    """
    try:
        completed = subprocess.run(
            cmd,
            shell=True,  # commands here rely on pipes/redirects, so a shell is required
            capture_output=True,
            text=True,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        return f"[timeout after {timeout}s]"
    except Exception as exc:
        return f"[error: {exc}]"
    combined = completed.stdout + completed.stderr
    return combined.strip()
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
# --- Stdout/Stderr Capture ---
|
|
155
|
+
|
|
156
|
+
class StreamCapture(io.TextIOBase):
    """Tee for stdout/stderr: every write passes through to the wrapped real
    stream, is buffered in a local ring buffer, and is forwarded to the
    ingest endpoint as a log event.

    NOTE(review): send() performs a synchronous HTTP POST, so every print
    adds network latency to the writing thread.  If stderr is captured, a
    send() failure must not log to a captured stream or it could recurse;
    logger.debug is below the configured INFO level, which avoids that here
    — confirm if log levels change.
    """

    def __init__(self, original_stream, level="info", max_buffer=500):
        self.original = original_stream          # the real stdout/stderr
        self.level = level                       # default severity label for this stream
        self.buffer = deque(maxlen=max_buffer)   # ring buffer of recent stripped lines
        self.lock = threading.Lock()             # guards buffer across writer threads

    def write(self, text):
        # Always pass through first so local/console output is never lost.
        self.original.write(text)
        if text.strip():
            with self.lock:
                self.buffer.append(text.strip())
            # Heuristic severity sniffing on the message content: escalate to
            # error/warn when known keywords appear, else keep the stream default.
            lower = text.lower()
            detected_level = self.level
            if any(kw in lower for kw in [
                "error", "exception", "traceback", "failed", "critical",
                "cuda", "oom", "killed", "segfault", "abort", "panic"
            ]):
                detected_level = "error"
            elif any(kw in lower for kw in ["warning", "warn", "deprecat"]):
                detected_level = "warn"

            # Forward as a log event; message is capped at 2000 chars.
            send({
                "type": "log",
                "source": SOURCE_NAME,
                "platform": PLATFORM,
                "version": VERSION,
                "level": detected_level,
                "message": f"[std{self.level}] {text.strip()[:2000]}",
                "context": {"capturedFrom": f"std{self.level}"},
            })
        return len(text)

    def flush(self):
        # Delegate to the wrapped stream; nothing local needs flushing.
        self.original.flush()

    def get_recent(self, n=100):
        # Snapshot the last *n* buffered lines under the lock.
        with self.lock:
            return list(self.buffer)[-n:]
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
# --- GPU Metrics (Deep) ---
|
|
200
|
+
|
|
201
|
+
def get_gpu_metrics() -> dict:
    """Collect deep per-GPU telemetry via NVML.

    Returns {} when pynvml is unavailable, or {"error": ...} if a call
    outside the per-feature guards fails.  Optional sections (power limit,
    throttling, clocks, processes, ECC) are best-effort: unsupported
    GPUs/drivers raise NVML errors there, so each is wrapped separately.
    """
    if not GPU_AVAILABLE:
        return {}
    try:
        device_count = pynvml.nvmlDeviceGetCount()
        gpus = []
        for i in range(device_count):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
            mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
            power = pynvml.nvmlDeviceGetPowerUsage(handle) / 1000  # milliwatts -> watts
            util = pynvml.nvmlDeviceGetUtilizationRates(handle)
            name = pynvml.nvmlDeviceGetName(handle)
            if isinstance(name, bytes):  # older pynvml returns bytes
                name = name.decode("utf-8")

            gpu_info = {
                "index": i,
                "name": name,
                "temp_c": temp,
                "vram_used_gb": round(mem.used / 1e9, 2),
                "vram_total_gb": round(mem.total / 1e9, 2),
                "vram_free_gb": round(mem.free / 1e9, 2),
                "vram_pct": round((mem.used / mem.total) * 100, 1) if mem.total > 0 else 0,
                "gpu_util_pct": util.gpu,
                "mem_util_pct": util.memory,
                "power_w": round(power, 1),
            }

            # Power limit (not supported on all boards).
            try:
                power_limit = pynvml.nvmlDeviceGetPowerManagementLimit(handle) / 1000
                gpu_info["power_limit_w"] = round(power_limit, 1)
            except Exception:
                pass

            # Throttle reasons — bitmask decoded against hard-coded values that
            # appear to mirror NVML's nvmlClocksThrottleReason* constants;
            # TODO confirm against the NVML header for the installed driver.
            try:
                throttle = pynvml.nvmlDeviceGetCurrentClocksThrottleReasons(handle)
                reasons = []
                if throttle & 0x0000000000000002:
                    reasons.append("idle")
                if throttle & 0x0000000000000004:
                    reasons.append("app_clocks")
                if throttle & 0x0000000000000008:
                    reasons.append("sw_power_cap")
                if throttle & 0x0000000000000020:
                    reasons.append("hw_slowdown")
                if throttle & 0x0000000000000040:
                    reasons.append("sync_boost")
                if throttle & 0x0000000000000080:
                    reasons.append("sw_thermal")
                if throttle & 0x0000000000000100:
                    reasons.append("hw_thermal")
                if throttle & 0x0000000000000200:
                    reasons.append("hw_power_brake")
                gpu_info["throttle_reasons"] = reasons if reasons else ["none"]
            except Exception:
                pass

            # Current graphics/memory clock speeds.
            try:
                gpu_info["clock_graphics_mhz"] = pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_GRAPHICS)
                gpu_info["clock_mem_mhz"] = pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_MEM)
            except Exception:
                pass

            # Up to 10 compute processes on this GPU; enrich with psutil when
            # the PID is still visible, otherwise record pid + VRAM only.
            try:
                procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
                gpu_procs = []
                for p in procs[:10]:
                    try:
                        proc = psutil.Process(p.pid)
                        gpu_procs.append({
                            "pid": p.pid,
                            "name": proc.name(),
                            "vram_mb": round(p.usedGpuMemory / 1e6, 1) if p.usedGpuMemory else 0,
                            "cmdline": " ".join(proc.cmdline()[:5])
                        })
                    except Exception:
                        # Process exited or access denied — keep the bare NVML facts.
                        gpu_procs.append({"pid": p.pid, "vram_mb": round(p.usedGpuMemory / 1e6, 1) if p.usedGpuMemory else 0})
                gpu_info["processes"] = gpu_procs
            except Exception:
                pass

            # Volatile (since-boot) ECC error counters — ECC-capable boards only.
            try:
                ecc_single = pynvml.nvmlDeviceGetTotalEccErrors(handle, pynvml.NVML_SINGLE_BIT_ECC, pynvml.NVML_VOLATILE_ECC)
                ecc_double = pynvml.nvmlDeviceGetTotalEccErrors(handle, pynvml.NVML_DOUBLE_BIT_ECC, pynvml.NVML_VOLATILE_ECC)
                gpu_info["ecc_single_bit"] = ecc_single
                gpu_info["ecc_double_bit"] = ecc_double
            except Exception:
                pass

            gpus.append(gpu_info)

        return {
            "driver_version": GPU_DRIVER_VERSION,
            "cuda_version": GPU_CUDA_VERSION,
            "device_count": device_count,
            "gpus": gpus,
            # Flatten primary GPU for metrics table
            "gpu_temp": gpus[0]["temp_c"] if gpus else None,
            "gpu_vram": gpus[0]["vram_used_gb"] if gpus else None,
        }
    except Exception as e:
        return {"error": str(e)}
|
|
309
|
+
|
|
310
|
+
|
|
311
|
+
# --- PM2 Integration ---
|
|
312
|
+
|
|
313
|
+
def get_pm2_status() -> dict:
    """Parse `pm2 jlist` into a structured per-process list.

    Returns {"available": False, ...} when pm2 is missing or unresponsive,
    and falls back to truncated raw text if the JSON cannot be parsed.
    NOTE(review): the per-process `pm2 logs` call inside the loop costs up
    to 5s per process — with many processes this collector gets slow.
    """
    raw = run_cmd("pm2 jlist 2>/dev/null")
    # run_cmd encodes failures as bracketed strings — treat those as "no pm2".
    if not raw or raw.startswith("[error") or raw.startswith("[timeout"):
        return {"available": False, "raw": raw}

    try:
        processes = json.loads(raw)
        result = {"available": True, "processes": []}
        for p in processes:
            env = p.get("pm2_env", {})    # pm2's per-process metadata blob
            monit = p.get("monit", {})    # live cpu/memory sampling
            proc_info = {
                "name": p.get("name"),
                "pid": p.get("pid"),
                "pm_id": p.get("pm_id"),
                "status": env.get("status"),
                "restart_count": env.get("restart_time", 0),
                "unstable_restarts": env.get("unstable_restarts", 0),
                # pm_uptime is an epoch-ms start timestamp, not a duration.
                "uptime_ms": int(time.time() * 1000) - env.get("pm_uptime", 0) if env.get("pm_uptime") else None,
                "cpu_pct": monit.get("cpu"),
                "memory_mb": round(monit.get("memory", 0) / 1e6, 1),
                "exec_mode": env.get("exec_mode"),
                "node_version": env.get("node_version"),
                "script": env.get("pm_exec_path"),
                "cwd": env.get("pm_cwd"),
                "created_at": env.get("created_at"),
                "instances": env.get("instances"),
                "exit_code": env.get("exit_code"),
            }
            # Get recent logs for this process ("[" prefix means run_cmd failed).
            recent_logs = run_cmd(f"pm2 logs {p.get('name')} --nostream --lines 30 2>/dev/null", timeout=5)
            if recent_logs and not recent_logs.startswith("["):
                proc_info["recent_logs"] = recent_logs[-3000:]  # Last 3KB

            # Tail the dedicated error-log file directly, if pm2 reports one.
            err_log_path = env.get("pm_err_log_path")
            if err_log_path and os.path.exists(err_log_path):
                try:
                    with open(err_log_path, "r") as f:
                        lines = f.readlines()
                    proc_info["recent_errors"] = "".join(lines[-50:])[-3000:]
                except Exception:
                    pass

            result["processes"].append(proc_info)
        return result
    except json.JSONDecodeError:
        # pm2 exists but printed something non-JSON; ship it raw (truncated).
        return {"available": True, "raw_output": raw[:2000]}
|
|
362
|
+
|
|
363
|
+
|
|
364
|
+
def get_pm2_logs(lines: int = 100) -> str:
    """Return the tail of the combined pm2 log stream, capped at 5 KB."""
    out = run_cmd(f"pm2 logs --nostream --lines {lines} 2>/dev/null", timeout=10)
    if not out:
        return ""
    return out[-5000:]
|
|
368
|
+
|
|
369
|
+
|
|
370
|
+
# --- System Metrics (Deep) ---
|
|
371
|
+
|
|
372
|
+
def get_disk_io() -> dict:
    """Return cumulative system-wide disk I/O counters, or {} when the
    platform provides none or the query fails."""
    try:
        counters = psutil.disk_io_counters(perdisk=False)
        if counters is not None:
            return {
                "read_mb": round(counters.read_bytes / 1e6, 1),
                "write_mb": round(counters.write_bytes / 1e6, 1),
                "read_count": counters.read_count,
                "write_count": counters.write_count,
                "read_time_ms": counters.read_time,
                "write_time_ms": counters.write_time,
            }
    except Exception:
        pass
    return {}
|
|
388
|
+
|
|
389
|
+
|
|
390
|
+
def get_network_details() -> dict:
    """Collect per-NIC counters, a TCP-state histogram, listening ports and
    DNS resolution latency.  Each section is best-effort and independent.
    """
    result: dict = {}

    # Per-interface traffic/error counters (loopback excluded).
    try:
        interfaces = {}
        for nic, c in psutil.net_io_counters(pernic=True).items():
            if nic == "lo":
                continue
            interfaces[nic] = {
                "sent_mb": round(c.bytes_sent / 1e6, 1),
                "recv_mb": round(c.bytes_recv / 1e6, 1),
                "packets_sent": c.packets_sent,
                "packets_recv": c.packets_recv,
                "errors_in": c.errin,
                "errors_out": c.errout,
                "drops_in": c.dropin,
                "drops_out": c.dropout,
            }
        result["interfaces"] = interfaces
    except Exception:
        pass

    # One net_connections() scan feeds both the state histogram and the
    # listening-port list — the original scanned the connection table twice,
    # which is slow (it walks /proc) and can disagree between passes.
    try:
        connections = psutil.net_connections(kind="tcp")
        states: dict = {}
        listening = []
        for conn in connections:
            states[conn.status] = states.get(conn.status, 0) + 1
            if conn.status == "LISTEN":
                listening.append({
                    "port": conn.laddr.port,
                    "addr": conn.laddr.ip,
                    "pid": conn.pid,
                })
        result["tcp_states"] = states
        result["total_connections"] = len(connections)
        result["listening_ports"] = listening[:30]
    except Exception:
        pass

    # DNS health: time a single resolution of a well-known name.
    try:
        start = time.time()
        socket.getaddrinfo("google.com", 80)
        result["dns_resolve_ms"] = round((time.time() - start) * 1000, 1)
    except Exception as e:
        result["dns_error"] = str(e)

    return result
|
|
450
|
+
|
|
451
|
+
|
|
452
|
+
def get_top_processes(n: int = 15) -> list:
    """Return the *n* heaviest processes, sorted by CPU percentage descending."""
    attrs = ["pid", "name", "cpu_percent", "memory_percent", "status",
             "num_fds", "num_threads", "create_time", "cmdline"]
    snapshot = []
    for proc in psutil.process_iter(attrs):
        try:
            info = proc.info
            # First-call cpu_percent can be None; skip those samples.
            if info["cpu_percent"] is None:
                continue
            cmdline = info.get("cmdline") or []
            snapshot.append({
                "pid": info["pid"],
                "name": info["name"],
                "cpu_pct": info["cpu_percent"],
                "mem_pct": round(info["memory_percent"] or 0, 1),
                "status": info["status"],
                "fds": info.get("num_fds"),
                "threads": info.get("num_threads"),
                "cmd": " ".join(cmdline[:5])[:200],
            })
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            continue
    return sorted(snapshot, key=lambda entry: entry["cpu_pct"], reverse=True)[:n]
|
|
474
|
+
|
|
475
|
+
|
|
476
|
+
def get_zombie_processes() -> list:
    """List zombie (defunct) processes as {pid, name, ppid} dicts."""
    found = []
    for proc in psutil.process_iter(["pid", "name", "status", "ppid"]):
        try:
            info = proc.info
            if info["status"] != psutil.STATUS_ZOMBIE:
                continue
            found.append({
                "pid": info["pid"],
                "name": info["name"],
                "ppid": info["ppid"],
            })
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            continue
    return found
|
|
490
|
+
|
|
491
|
+
|
|
492
|
+
def get_disk_details() -> list:
    """Per-mount usage plus (on Linux) inode statistics parsed from `df -i`."""
    mounts = []
    for part in psutil.disk_partitions(all=False):
        try:
            usage = psutil.disk_usage(part.mountpoint)
        except Exception:
            continue  # unreadable mount (permissions, stale NFS, ...)
        entry = {
            "mount": part.mountpoint,
            "device": part.device,
            "fstype": part.fstype,
            "total_gb": round(usage.total / 1e9, 1),
            "used_gb": round(usage.used / 1e9, 1),
            "free_gb": round(usage.free / 1e9, 1),
            "pct": usage.percent,
        }
        # Inode exhaustion can break a box even with free bytes left,
        # so also capture `df -i` output: IUsed / IFree / IUse%.
        try:
            fields = run_cmd(f"df -i {part.mountpoint} | tail -1", timeout=3).split()
            if len(fields) >= 5:
                entry["inodes_used"] = fields[2]
                entry["inodes_free"] = fields[3]
                entry["inodes_pct"] = fields[4]
        except Exception:
            pass
        mounts.append(entry)
    return mounts
|
|
521
|
+
|
|
522
|
+
|
|
523
|
+
def get_systemd_failed() -> list:
    """Return the names of failed systemd units (capped at 20)."""
    output = run_cmd("systemctl --failed --no-pager --plain 2>/dev/null", timeout=5)
    if not output or "0 loaded" in output:
        return []

    failed = []
    for raw_line in output.split("\n"):
        line = raw_line.strip()
        # Skip the header row and summary/legend lines.
        if not line or line.startswith("UNIT") or line.startswith("LOAD") or "loaded" in line.lower():
            continue
        fields = line.split()
        if fields:
            failed.append(fields[0])
    return failed[:20]
|
|
537
|
+
|
|
538
|
+
|
|
539
|
+
def get_docker_status() -> dict:
    """Inventory Docker containers (all states) with per-container stats.

    Returns {"available": False} when the docker SDK/daemon is absent, or
    {"available": False, "error": ...} on an unexpected failure.
    """
    if not DOCKER_AVAILABLE:
        return {"available": False}
    try:
        containers = []
        for c in DOCKER_CLIENT.containers.list(all=True):
            stats = {}
            if c.status == "running":
                try:
                    s = c.stats(stream=False)
                    cpu_delta = s["cpu_stats"]["cpu_usage"]["total_usage"] - s["precpu_stats"]["cpu_usage"]["total_usage"]
                    system_delta = s["cpu_stats"]["system_cpu_usage"] - s["precpu_stats"]["system_cpu_usage"]
                    # Docker's documented CPU% formula scales the delta ratio by
                    # the number of online CPUs; the original omitted this,
                    # understating usage on multi-core hosts.
                    online_cpus = (
                        s["cpu_stats"].get("online_cpus")
                        or len(s["cpu_stats"]["cpu_usage"].get("percpu_usage") or [])
                        or 1
                    )
                    cpu_pct = round((cpu_delta / system_delta) * online_cpus * 100, 1) if system_delta > 0 else 0
                    mem_usage = s["memory_stats"].get("usage", 0)
                    mem_limit = s["memory_stats"].get("limit", 1)
                    stats = {
                        "cpu_pct": cpu_pct,
                        "memory_mb": round(mem_usage / 1e6, 1),
                        "memory_limit_mb": round(mem_limit / 1e6, 1),
                    }
                except Exception:
                    pass  # stats are best-effort; keep the container entry anyway

            containers.append({
                "name": c.name,
                "image": c.image.tags[0] if c.image.tags else str(c.image.id)[:12],
                "status": c.status,
                "restart_count": c.attrs.get("RestartCount", 0),
                **stats,
            })
        return {"available": True, "containers": containers}
    except Exception as e:
        return {"available": False, "error": str(e)}
|
|
573
|
+
|
|
574
|
+
|
|
575
|
+
def check_ssl_certs() -> list:
    """Check certificate expiry for each domain in SSL_CHECK_DOMAINS.

    Bug fix: the certificate's `notAfter` field is expressed in GMT, so the
    remaining-days arithmetic must compare against an aware UTC "now".  The
    original compared a (naive, GMT) parse against naive *local* time,
    skewing `days_left` by the host's UTC offset.
    """
    results = []
    for domain in SSL_CHECK_DOMAINS:
        try:
            ctx = ssl.create_default_context()
            with ctx.wrap_socket(socket.socket(), server_hostname=domain) as s:
                s.settimeout(5)
                s.connect((domain, 443))
                cert = s.getpeercert()
            # notAfter format: "Jun  1 12:00:00 2025 GMT" — parse then pin to UTC.
            expires = datetime.strptime(cert["notAfter"], "%b %d %H:%M:%S %Y %Z").replace(tzinfo=timezone.utc)
            days_left = (expires - datetime.now(timezone.utc)).days
            results.append({
                "domain": domain,
                "expires": cert["notAfter"],
                "days_left": days_left,
                "issuer": dict(x[0] for x in cert.get("issuer", [])).get("organizationName", "unknown"),
                "warning": days_left < 30,
            })
        except Exception as e:
            # Record the failure per-domain so one bad host doesn't hide others.
            results.append({"domain": domain, "error": str(e)})
    return results
|
|
597
|
+
|
|
598
|
+
|
|
599
|
+
def get_system_logs() -> dict:
    """Tail kernel, journal and auth logs; any source that fails is omitted.

    Each section keys off run_cmd's bracketed failure strings and truncates
    output to keep the payload small.
    """
    logs = {}

    # Kernel ring buffer — GPU faults and OOM kills land here.
    # Bug fix: the original passed both `-T` (ctime) and `--time-format iso`,
    # which util-linux dmesg rejects as mutually exclusive, silently losing
    # this section; keep only the iso form.
    dmesg = run_cmd("dmesg --time-format iso 2>/dev/null | tail -50", timeout=5)
    if dmesg and not dmesg.startswith("[error"):
        logs["dmesg"] = dmesg[-3000:]

    # OOM-killer evidence, specifically.
    oom = run_cmd("dmesg | grep -i 'oom\\|killed process\\|out of memory' | tail -10 2>/dev/null", timeout=5)
    if oom and not oom.startswith("[error"):
        logs["oom_kills"] = oom

    # systemd journal errors from the last hour.
    journal = run_cmd("journalctl -p err --since '1 hour ago' --no-pager -q 2>/dev/null | tail -30", timeout=5)
    if journal and not journal.startswith("[error"):
        logs["journal_errors"] = journal[-2000:]

    # Auth log: Debian/Ubuntu path first, RHEL fallback.
    auth = run_cmd("tail -20 /var/log/auth.log 2>/dev/null || tail -20 /var/log/secure 2>/dev/null", timeout=5)
    if auth and not auth.startswith("[error"):
        logs["auth_log"] = auth[-1000:]

    return logs
|
|
624
|
+
|
|
625
|
+
|
|
626
|
+
def get_firewall_rules() -> str:
    """Summarize firewall configuration: ufw if present, else raw iptables."""
    ufw_out = run_cmd("ufw status verbose 2>/dev/null", timeout=5)
    if ufw_out and "Status:" in ufw_out:
        return ufw_out[:2000]
    iptables_out = run_cmd("iptables -L -n --line-numbers 2>/dev/null | head -40", timeout=5)
    if not iptables_out:
        return ""
    return iptables_out[:2000]
|
|
634
|
+
|
|
635
|
+
|
|
636
|
+
def get_env_sanitized() -> dict:
    """Snapshot diagnostically useful env vars, redacting secret-looking ones.

    Only variables matching a known prefix are included; any whose name
    contains a secret marker is replaced with "[REDACTED]"; values are
    truncated to 500 characters.
    """
    prefixes = (
        "NODE_", "PYTHON", "PATH", "HOME", "USER", "SHELL", "LANG",
        "CUDA", "NVIDIA", "GPU", "LD_LIBRARY", "VIRTUAL_ENV", "CONDA",
        "PM2_", "PORT", "HOST", "DISPLAY", "XDG_", "DBUS",
    )
    secret_markers = ("KEY", "SECRET", "TOKEN", "PASS", "AUTH")
    sanitized = {}
    for name, value in os.environ.items():
        # str.startswith accepts a tuple — one call covers every prefix.
        if not name.startswith(prefixes):
            continue
        if any(marker in name.upper() for marker in secret_markers):
            sanitized[name] = "[REDACTED]"
        else:
            sanitized[name] = value[:500]
    return sanitized
|
|
652
|
+
|
|
653
|
+
|
|
654
|
+
def get_installed_packages() -> dict:
    """Collect installed Python packages, global npm packages, and the raw
    pm2 ecosystem config (if one sits in the working directory)."""
    inventory = {}

    # Python packages via pip's JSON output ("[" guard filters run_cmd errors).
    pip_out = run_cmd("pip list --format=json 2>/dev/null", timeout=10)
    if pip_out and pip_out.startswith("["):
        try:
            inventory["python"] = {item["name"]: item["version"] for item in json.loads(pip_out)}
        except Exception:
            pass

    # Globally installed Node packages.
    npm_out = run_cmd("npm list -g --depth=0 --json 2>/dev/null", timeout=10)
    if npm_out and npm_out.startswith("{"):
        try:
            deps = json.loads(npm_out).get("dependencies", {})
            inventory["node_global"] = {name: meta.get("version", "?") for name, meta in deps.items()}
        except Exception:
            pass

    # pm2 ecosystem config, .js first with .cjs fallback (truncated to 3 KB).
    ecosystem = run_cmd("cat ecosystem.config.js 2>/dev/null || cat ecosystem.config.cjs 2>/dev/null", timeout=3)
    if ecosystem and not ecosystem.startswith("[error"):
        inventory["pm2_ecosystem"] = ecosystem[:3000]

    return inventory
|
|
681
|
+
|
|
682
|
+
|
|
683
|
+
# --- File Watcher Thread ---
|
|
684
|
+
|
|
685
|
+
class LogFileWatcher(threading.Thread):
    """Daemon thread that tails a set of log files and ships new lines.

    Byte positions start at end-of-file so only lines written after startup
    are forwarded; a file that shrinks is treated as rotated/truncated and
    re-read from offset 0.  Polls every 2 seconds.
    """

    def __init__(self, files: list):
        super().__init__(daemon=True)
        self.files = files        # paths to tail
        self.positions = {}       # path -> byte offset already consumed

    def run(self):
        # Initialize positions to end of files (skip historical content).
        for f in self.files:
            try:
                self.positions[f] = os.path.getsize(f)
            except Exception:
                self.positions[f] = 0  # missing file: start from the top once it appears

        while True:
            for filepath in self.files:
                try:
                    size = os.path.getsize(filepath)
                    if size > self.positions.get(filepath, 0):
                        with open(filepath, "r") as fh:
                            fh.seek(self.positions[filepath])
                            new_lines = fh.read(10000)  # Max 10KB per read
                            self.positions[filepath] = fh.tell()

                        if new_lines.strip():
                            # Severity sniffing mirrors StreamCapture's heuristic.
                            level = "info"
                            lower = new_lines.lower()
                            if any(kw in lower for kw in ["error", "exception", "traceback", "failed", "critical"]):
                                level = "error"
                            elif "warn" in lower:
                                level = "warn"

                            # Ship the chunk as one log event (capped at 2000 chars).
                            send({
                                "type": "log",
                                "source": SOURCE_NAME,
                                "platform": PLATFORM,
                                "version": VERSION,
                                "level": level,
                                "message": f"[file:{os.path.basename(filepath)}] {new_lines.strip()[:2000]}",
                                "context": {"capturedFrom": "file_watcher", "file": filepath},
                            })
                    elif size < self.positions.get(filepath, 0):
                        # File was truncated/rotated — restart from the top.
                        self.positions[filepath] = 0
                except Exception:
                    pass  # best-effort: a vanished file must not kill the thread
            time.sleep(2)  # poll interval
|
|
734
|
+
|
|
735
|
+
|
|
736
|
+
# --- Collect Everything ---
|
|
737
|
+
|
|
738
|
+
def collect_full_metrics() -> dict:
    """Build the periodic metrics payload.

    Top-level keys ("cpu", "memory", "gpu_temp", ...) match the ingest
    schema's metrics table; everything exploratory lives under "custom".
    The cpu_percent(interval=1) call blocks for one second per invocation.
    """
    cpu = psutil.cpu_percent(interval=1)                 # blocking 1s system-wide sample
    cpu_per_core = psutil.cpu_percent(interval=0, percpu=True)  # non-blocking, since last call
    mem = psutil.virtual_memory()
    swap = psutil.swap_memory()

    data = {
        "cpu": cpu,
        "memory": mem.percent,
        "network_latency": 0,  # placeholder — not measured by this collector
        "custom": {
            # CPU deep
            "cpu_per_core": cpu_per_core,
            "cpu_count_logical": psutil.cpu_count(),
            "cpu_count_physical": psutil.cpu_count(logical=False),
            "load_avg": list(os.getloadavg()) if hasattr(os, "getloadavg") else [],

            # Memory deep (cached/buffers fields are Linux-only; default to 0)
            "memory_used_gb": round(mem.used / 1e9, 2),
            "memory_total_gb": round(mem.total / 1e9, 2),
            "memory_available_gb": round(mem.available / 1e9, 2),
            "memory_cached_gb": round(getattr(mem, "cached", 0) / 1e9, 2),
            "memory_buffers_gb": round(getattr(mem, "buffers", 0) / 1e9, 2),
            "swap_used_gb": round(swap.used / 1e9, 2),
            "swap_total_gb": round(swap.total / 1e9, 2),
            "swap_pct": swap.percent,

            # Disk I/O
            "disk_io": get_disk_io(),

            # OS info
            "os": f"{os.uname().sysname} {os.uname().release}" if hasattr(os, "uname") else "unknown",
            "hostname": socket.gethostname(),
            "python_version": sys.version.split()[0],
            "node_version": run_cmd("node --version 2>/dev/null"),
            "uptime_hours": round((time.time() - psutil.boot_time()) / 3600, 1),
        },
    }

    # CPU frequency (unavailable in some VMs/containers).
    try:
        freq = psutil.cpu_freq()
        if freq:
            data["custom"]["cpu_freq_mhz"] = round(freq.current, 0)
            data["custom"]["cpu_freq_max_mhz"] = round(freq.max, 0)
    except Exception:
        pass

    # Context switches / interrupts (cumulative since boot).
    try:
        ctx = psutil.cpu_stats()
        data["custom"]["ctx_switches"] = ctx.ctx_switches
        data["custom"]["interrupts"] = ctx.interrupts
    except Exception:
        pass

    # GPU: flatten primary-GPU temp/VRAM to top level, keep full detail nested.
    gpu = get_gpu_metrics()
    if gpu:
        data["gpu_temp"] = gpu.get("gpu_temp")
        data["gpu_vram"] = gpu.get("gpu_vram")
        data["custom"]["gpu"] = gpu

    return data
|
|
803
|
+
|
|
804
|
+
|
|
805
|
+
def collect_deep_snapshot() -> dict:
    """Assemble the full system snapshot pushed to the variable inspector.

    Gathers host facts plus the output of every subsystem probe (GPU, pm2,
    disks, network, processes, docker, systemd, firewall, environment) and,
    when a job is in flight, the current job state from the shared tracker.
    """
    host_info = {
        "hostname": socket.gethostname(),
        "uptime_hours": round((time.time() - psutil.boot_time()) / 3600, 1),
        "cpu_count": psutil.cpu_count(),
        "memory_total_gb": round(psutil.virtual_memory().total / 1e9, 2),
        "python_version": sys.version.split()[0],
        # os.uname() is POSIX-only; fall back gracefully elsewhere.
        "os": f"{os.uname().sysname} {os.uname().release}" if hasattr(os, "uname") else "unknown",
        "kernel": run_cmd("uname -r 2>/dev/null"),
    }

    snapshot = {"system": host_info}
    snapshot["gpu"] = get_gpu_metrics() if GPU_AVAILABLE else {"available": False}
    snapshot["pm2"] = get_pm2_status()
    snapshot["disks"] = get_disk_details()
    snapshot["network"] = get_network_details()
    snapshot["top_processes"] = get_top_processes(15)
    snapshot["zombie_processes"] = get_zombie_processes()
    snapshot["docker"] = get_docker_status()
    snapshot["systemd_failed"] = get_systemd_failed()
    snapshot["firewall"] = get_firewall_rules()
    snapshot["environment"] = get_env_sanitized()

    # Attach the in-flight job, if any.
    current = job_tracker.get_state()
    if current:
        snapshot["current_job"] = current

    return snapshot
|
|
835
|
+
|
|
836
|
+
|
|
837
|
+
# --- Job State Tracking ---
|
|
838
|
+
|
|
839
|
+
class JobTracker:
|
|
840
|
+
"""Track job states from BullMQ or similar queue systems."""
|
|
841
|
+
|
|
842
|
+
def __init__(self):
|
|
843
|
+
self.current_job = None
|
|
844
|
+
self.job_history = deque(maxlen=50)
|
|
845
|
+
self.lock = threading.Lock()
|
|
846
|
+
|
|
847
|
+
def update(self, job_id: str, status: str, progress: float = 0,
|
|
848
|
+
last_action: str = "", error: str | None = None,
|
|
849
|
+
metadata: dict | None = None):
|
|
850
|
+
with self.lock:
|
|
851
|
+
job = {
|
|
852
|
+
"job_id": job_id,
|
|
853
|
+
"status": status,
|
|
854
|
+
"progress": progress,
|
|
855
|
+
"last_action": last_action,
|
|
856
|
+
"error": error,
|
|
857
|
+
"metadata": metadata or {},
|
|
858
|
+
"updated_at": datetime.now(timezone.utc).isoformat(),
|
|
859
|
+
}
|
|
860
|
+
self.current_job = job
|
|
861
|
+
self.job_history.append(job)
|
|
862
|
+
|
|
863
|
+
def get_state(self) -> dict | None:
|
|
864
|
+
with self.lock:
|
|
865
|
+
return self.current_job.copy() if self.current_job else None
|
|
866
|
+
|
|
867
|
+
def get_history(self, n: int = 20) -> list:
|
|
868
|
+
with self.lock:
|
|
869
|
+
return list(self.job_history)[-n:]
|
|
870
|
+
|
|
871
|
+
def send_update(self):
|
|
872
|
+
state = self.get_state()
|
|
873
|
+
if not state:
|
|
874
|
+
return
|
|
875
|
+
send({
|
|
876
|
+
"type": "inspect",
|
|
877
|
+
"source": SOURCE_NAME,
|
|
878
|
+
"platform": PLATFORM,
|
|
879
|
+
"version": VERSION,
|
|
880
|
+
"variables": {"current_job": state, "job_history": self.get_history()},
|
|
881
|
+
})
|
|
882
|
+
|
|
883
|
+
|
|
884
|
+
# --- Public API for External Use ---

# Module-level singletons shared by the helper functions below.
job_tracker = JobTracker()
# Populated in main() once stdout/stderr are wrapped by StreamCapture;
# kept here so the originals can be restored on shutdown.
stdout_capture = None
stderr_capture = None
|
|
889
|
+
|
|
890
|
+
|
|
891
|
+
def send_log(level: str, message: str, context: dict | None = None):
    """Send a log event to the ingest endpoint.

    Args:
        level: Log severity tag (e.g. "info", "warn", "error").
        message: Human-readable log line.
        context: Optional structured fields attached to the event.
    """
    # NOTE: annotation fixed from implicit-Optional `context: dict = None`
    # (disallowed by PEP 484) to match the `X | None` style used elsewhere.
    send({
        "type": "log",
        "source": SOURCE_NAME,
        "platform": PLATFORM,
        "version": VERSION,
        "level": level,
        "message": message,
        "context": context or {},
    })
|
|
901
|
+
|
|
902
|
+
|
|
903
|
+
def send_error(title: str, stack_trace: str = "", context: dict | None = None):
    """Send an error event to the ingest endpoint.

    Args:
        title: Short description of the error.
        stack_trace: Optional formatted traceback text.
        context: Optional structured fields attached to the event.
    """
    # NOTE: annotation fixed from implicit-Optional `context: dict = None`
    # (disallowed by PEP 484) to match the `X | None` style used elsewhere.
    send({
        "type": "error",
        "source": SOURCE_NAME,
        "platform": PLATFORM,
        "version": VERSION,
        "title": title,
        "stackTrace": stack_trace,
        "context": context or {},
    })
|
|
913
|
+
|
|
914
|
+
|
|
915
|
+
def update_job(job_id: str, status: str, progress: float = 0,
               last_action: str = "", error: str | None = None,
               metadata: dict | None = None):
    """Record a job state on the shared tracker and push it upstream.

    Also emits an error event when *error* is set, so failures surface in
    the dashboard's error stream as well as the job inspector.

    Args:
        job_id: Queue job identifier.
        status: Job status string (e.g. "running", "completed", "failed").
        progress: Completion percentage, 0–100.
        last_action: Short label for the most recent pipeline step.
        error: Error message when the job failed, else None.
        metadata: Optional extra fields merged into the error context.
    """
    # NOTE: annotations fixed from implicit-Optional `str = None` /
    # `dict = None` (disallowed by PEP 484) to the file's `X | None` style.
    job_tracker.update(job_id, status, progress, last_action, error, metadata)
    job_tracker.send_update()

    if error:
        send_error(f"Job {job_id} failed: {error}", context={
            "job_id": job_id, "status": status, "last_action": last_action,
            **(metadata or {}),
        })
|
|
925
|
+
|
|
926
|
+
|
|
927
|
+
def capture_image_gen(job_id: str, model: str, params: dict,
                      result: dict | None = None, error: str | None = None,
                      duration_s: float | None = None):
    """Specialized capture for image generation pipelines.

    Logs success/failure of a generation run and records the outcome on the
    job tracker.

    Args:
        job_id: Queue job identifier.
        model: Model name (e.g. "flux", "sdxl").
        params: Generation parameters passed to the pipeline.
        result: Result payload on success, else None.
        error: Error message on failure, else None.
        duration_s: Wall-clock generation time in seconds, if measured.
    """
    context = {
        "job_id": job_id,
        "model": model,
        "params": params,
        "duration_s": duration_s,
    }
    if result:
        context["result"] = result
        # BUGFIX: duration_s defaults to None, and None does not support the
        # ":.1f" format spec — the original crashed with TypeError whenever a
        # result was reported without a timing. Format only when present.
        timing = f"{duration_s:.1f}s" if duration_s is not None else "n/a"
        send_log("info", f"Image gen complete: {model} ({timing})", context)
    if error:
        context["error"] = error
        send_error(f"Image gen failed: {model} — {error}", context=context)

    update_job(job_id, "completed" if result else "failed",
               progress=100 if result else 0,
               last_action=f"generate:{model}",
               error=error,
               metadata=context)
|
|
949
|
+
|
|
950
|
+
|
|
951
|
+
# --- Main Loop ---
|
|
952
|
+
|
|
953
|
+
def main():
    """Agent entry point: wire up capture, watchers, and the collection loop.

    Runs forever, collecting metrics every INTERVAL seconds and emitting
    deeper snapshots/log scans on slower cadences keyed off the tick counter.
    Exits cleanly on Ctrl-C; all other exceptions are reported and retried.
    """
    global stdout_capture, stderr_capture

    # Mirror stdout/stderr through StreamCapture so prints become log events.
    stdout_capture = StreamCapture(sys.stdout, "info")
    stderr_capture = StreamCapture(sys.stderr, "error")
    sys.stdout = stdout_capture
    sys.stderr = stderr_capture

    logger.info(f"debugger.help Agent v{VERSION} — Ultimate Deep Debugger")
    logger.info(f"Source: {SOURCE_NAME} | GPU: {'yes' if GPU_AVAILABLE else 'no'} | Docker: {'yes' if DOCKER_AVAILABLE else 'no'}")
    logger.info(f"Interval: {INTERVAL}s | Endpoint: {INGEST_URL}")

    # Start file watchers: configured files plus auto-discovered pm2 logs.
    watch_files = list(WATCH_LOG_FILES)
    pm2_log_dir = os.path.expanduser("~/.pm2/logs")
    if os.path.isdir(pm2_log_dir):
        pm2_logs = glob.glob(os.path.join(pm2_log_dir, "*.log"))
        watch_files.extend(pm2_logs)
        logger.info(f"Watching {len(pm2_logs)} pm2 log files")

    if watch_files:
        watcher = LogFileWatcher(watch_files)
        watcher.start()
        logger.info(f"File watcher started for {len(watch_files)} files")

    # Initial connection: heartbeat + startup announcement.
    send({"type": "heartbeat", "source": SOURCE_NAME, "platform": PLATFORM, "version": VERSION})
    send_log("info", f"Agent v{VERSION} started on {socket.gethostname()}", {
        "hostname": socket.gethostname(),
        "python_version": sys.version,
        "gpu_available": GPU_AVAILABLE,
        "docker_available": DOCKER_AVAILABLE,
        "pid": os.getpid(),
        "gpu_driver": GPU_DRIVER_VERSION,
        "cuda_version": GPU_CUDA_VERSION,
    })

    # Send initial deep snapshot with installed packages (best-effort).
    try:
        pkgs = get_installed_packages()
        send({
            "type": "inspect",
            "source": SOURCE_NAME,
            "platform": PLATFORM,
            "version": VERSION,
            "variables": {"installed_packages": pkgs},
        })
    except Exception:
        pass

    tick = 0
    consecutive_failures = 0

    while True:
        try:
            # Metrics every tick.
            metrics = collect_full_metrics()
            ok = send({
                "type": "metric",
                "source": SOURCE_NAME,
                "platform": PLATFORM,
                "version": VERSION,
                **metrics,
            })

            if ok:
                consecutive_failures = 0
            else:
                consecutive_failures += 1

            # Auto-reconnect backoff: after repeated send failures, pause
            # (capped at 60s), probe with a heartbeat, and restart the cycle.
            if consecutive_failures > 5:
                backoff = min(consecutive_failures * 5, 60)
                logger.warning(f"Connection issues. Backing off {backoff}s...")
                time.sleep(backoff)
                send({"type": "heartbeat", "source": SOURCE_NAME, "platform": PLATFORM, "version": VERSION})
                consecutive_failures = 0
                continue

            # Deep snapshot every ~1 min (6 ticks at the default interval).
            if tick % 6 == 0:
                snapshot = collect_deep_snapshot()
                send({
                    "type": "inspect",
                    "source": SOURCE_NAME,
                    "platform": PLATFORM,
                    "version": VERSION,
                    "variables": snapshot,
                })

            # PM2 logs check every ~30 seconds: forward error-like lines.
            if tick % 3 == 0:
                pm2_logs = get_pm2_logs(50)
                if pm2_logs:
                    for line in pm2_logs.split("\n"):
                        lower = line.lower()
                        if any(kw in lower for kw in ["error", "exception", "failed", "crash", "enoent", "eacces", "killed"]):
                            send_log("error", f"[pm2] {line.strip()[:500]}", {"capturedFrom": "pm2_logs"})

            # System logs check every ~2 min.
            if tick % 12 == 0:
                sys_logs = get_system_logs()
                if sys_logs:
                    send({
                        "type": "inspect",
                        "source": SOURCE_NAME,
                        "platform": PLATFORM,
                        "version": VERSION,
                        "variables": {"system_logs": sys_logs},
                    })

            # SSL cert check every ~10 min.
            if tick % 60 == 0 and SSL_CHECK_DOMAINS:
                ssl_results = check_ssl_certs()
                for r in ssl_results:
                    if r.get("warning") or r.get("error"):
                        # BUGFIX: the original used backslash escapes (\") inside
                        # an f-string expression, which is a SyntaxError on every
                        # Python < 3.12 despite Requires-Python >= 3.8. Build the
                        # detail string first instead.
                        detail = r.get("error") or f"{r.get('days_left')} days left"
                        send_log("warn", f"SSL cert issue: {r.get('domain')} — {detail}", r)
                send({
                    "type": "inspect",
                    "source": SOURCE_NAME,
                    "platform": PLATFORM,
                    "version": VERSION,
                    "variables": {"ssl_certs": ssl_results},
                })

            # GPU warnings every ~30 seconds: temp, VRAM, throttling.
            if GPU_AVAILABLE and tick % 3 == 0:
                gpu = get_gpu_metrics()
                gpus = gpu.get("gpus", [])
                for g in gpus:
                    if g.get("temp_c", 0) > 85:
                        send_log("warn", f"GPU {g['index']} temperature critical: {g['temp_c']}°C", g)
                    if g.get("vram_pct", 0) > 90:
                        send_log("warn", f"GPU {g['index']} VRAM critical: {g['vram_pct']}% ({g['vram_used_gb']}/{g['vram_total_gb']} GB)", g)
                    throttle = g.get("throttle_reasons", [])
                    if throttle and throttle != ["none"] and throttle != ["idle"]:
                        send_log("warn", f"GPU {g['index']} throttling: {', '.join(throttle)}", g)

            # Heartbeat every ~5 min.
            if tick % 30 == 0 and tick > 0:
                send({"type": "heartbeat", "source": SOURCE_NAME, "platform": PLATFORM, "version": VERSION})

            # Packages update every ~30 min.
            if tick % 180 == 0 and tick > 0:
                pkgs = get_installed_packages()
                send({
                    "type": "inspect",
                    "source": SOURCE_NAME,
                    "platform": PLATFORM,
                    "version": VERSION,
                    "variables": {"installed_packages": pkgs},
                })

            tick += 1
            time.sleep(INTERVAL)

        except KeyboardInterrupt:
            send_log("info", "Agent shutting down gracefully")
            logger.info("Shutting down...")
            # Restore the real streams so post-shutdown output is not captured.
            sys.stdout = stdout_capture.original
            sys.stderr = stderr_capture.original
            break
        except Exception as e:
            # Report the failure and keep the loop alive.
            logger.error(f"Main loop error: {e}")
            send_error(str(e), traceback.format_exc())
            time.sleep(INTERVAL)
|
|
1122
|
+
|
|
1123
|
+
|
|
1124
|
+
# Standard script entry point — start the agent loop when run directly
# (also wired to the `debugger-agent` console script in entry_points.txt).
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: debugger-help
|
|
3
|
+
Version: 3.0.0
|
|
4
|
+
Summary: debugger.help VPS Agent — Deep system monitoring for logs, GPU, PM2, Docker, and more
|
|
5
|
+
Author: debugger.help
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://debugger.help
|
|
8
|
+
Project-URL: Repository, https://github.com/YOUR_ORG/debugger-help
|
|
9
|
+
Keywords: debugger,monitoring,logging,gpu,vps,observability
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Topic :: System :: Monitoring
|
|
15
|
+
Requires-Python: >=3.8
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
Requires-Dist: psutil>=5.9.0
|
|
18
|
+
Requires-Dist: requests>=2.28.0
|
|
19
|
+
Provides-Extra: gpu
|
|
20
|
+
Requires-Dist: pynvml>=11.0.0; extra == "gpu"
|
|
21
|
+
Provides-Extra: docker
|
|
22
|
+
Requires-Dist: docker>=6.0.0; extra == "docker"
|
|
23
|
+
Provides-Extra: all
|
|
24
|
+
Requires-Dist: pynvml>=11.0.0; extra == "all"
|
|
25
|
+
Requires-Dist: docker>=6.0.0; extra == "all"
|
|
26
|
+
|
|
27
|
+
# debugger-help
|
|
28
|
+
|
|
29
|
+
Deep VPS monitoring agent for [debugger.help](https://debugger.help). Captures PM2, GPU metrics, Docker containers, system health, and streams everything to your dashboard in real time.
|
|
30
|
+
|
|
31
|
+
## Install
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
pip install debugger-help
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
With GPU monitoring:
|
|
38
|
+
```bash
|
|
39
|
+
pip install debugger-help[gpu]
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
With everything:
|
|
43
|
+
```bash
|
|
44
|
+
pip install debugger-help[all]
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## Quick Start
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
export DEBUGGER_API_KEY="your-api-key"
|
|
51
|
+
export DEBUGGER_INGEST_URL="your-ingest-url"
|
|
52
|
+
debugger-agent
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
Or keep it running with PM2:
|
|
56
|
+
```bash
|
|
57
|
+
pm2 start "debugger-agent" --name debugger-agent
|
|
58
|
+
pm2 save
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
## What it captures
|
|
62
|
+
|
|
63
|
+
- PM2 process states, restart counts, logs, error logs
|
|
64
|
+
- GPU: VRAM per-process, temperature, power, utilization, clock speeds, throttle reasons, ECC errors
|
|
65
|
+
- CPU: per-core usage, load average, frequency, context switches
|
|
66
|
+
- Memory: RAM, swap, shared, buffers, cached
|
|
67
|
+
- Disk: usage per mount, I/O rates, inode usage
|
|
68
|
+
- Network: per-interface stats, TCP connection states, open ports, DNS resolution
|
|
69
|
+
- Processes: top CPU/memory consumers, zombies, open file descriptors
|
|
70
|
+
- Docker containers: status, CPU, memory, restart counts
|
|
71
|
+
- Systemd: failed units
|
|
72
|
+
- SSL certificates: expiry checks
|
|
73
|
+
- System logs: dmesg, OOM kills, journalctl errors
|
|
74
|
+
- File watchers: syslog, PM2 logs, custom log files
|
|
75
|
+
|
|
76
|
+
## Environment Variables
|
|
77
|
+
|
|
78
|
+
| Variable | Required | Description |
|
|
79
|
+
|----------|----------|-------------|
|
|
80
|
+
| `DEBUGGER_API_KEY` | Yes | Your debugger.help API key |
|
|
81
|
+
| `DEBUGGER_INGEST_URL` | Yes | Your ingest endpoint URL |
|
|
82
|
+
| `DEBUGGER_SOURCE` | No | Source name (default: `vps-{hostname}`) |
|
|
83
|
+
| `DEBUGGER_INTERVAL` | No | Collection interval in seconds (default: `10`) |
|
|
84
|
+
| `DEBUGGER_WATCH_LOGS` | No | Comma-separated extra log file paths |
|
|
85
|
+
| `DEBUGGER_SSL_DOMAINS` | No | Comma-separated domains for SSL checks |
|
|
86
|
+
|
|
87
|
+
## License
|
|
88
|
+
|
|
89
|
+
MIT
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
debugger_help/__init__.py,sha256=LvFFmKY3_tMH7u9l7pif6OQX9YLTzXD-lAtpT16bB60,80
|
|
2
|
+
debugger_help/agent.py,sha256=NWwOZlL-0PafD1ZfRNGnr6j1DGLJ7x_W-a3JcYuVgkU,40545
|
|
3
|
+
debugger_help-3.0.0.dist-info/METADATA,sha256=QFtQNHPDsFCGntsmXJB7ZQBjf_bqaI65qsTZNPUKfZo,2803
|
|
4
|
+
debugger_help-3.0.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
5
|
+
debugger_help-3.0.0.dist-info/entry_points.txt,sha256=TbCJip8NfrK6ArjhHyLBAJLMDVtKTxm-IYwRNULNXqo,60
|
|
6
|
+
debugger_help-3.0.0.dist-info/top_level.txt,sha256=Mvew_obR30M1IogknxotpcHXyI4x4EqwWUMUyG94MaU,14
|
|
7
|
+
debugger_help-3.0.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
debugger_help
|