debugger-help 3.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,2 @@
1
+ """debugger.help VPS Agent — Deep system monitoring."""
2
+ __version__ = "3.0.0"
debugger_help/agent.py ADDED
@@ -0,0 +1,1125 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ debugger.help VPS Agent v3 — Ultimate Deep Debugger
4
+
5
+ Captures EVERYTHING from your VPS:
6
+ - pm2 logs, status, process list, restart counts
7
+ - GPU: VRAM, temp, power, utilization, ECC errors, throttling, driver version, CUDA version
8
+ - CPU: per-core usage, load average, frequency, context switches
9
+ - Memory: RAM, swap, shared, buffers, cached
10
+ - Disk: usage per mount, I/O read/write rates, inode usage
11
+ - Network: per-interface stats, connection counts by state, open ports, DNS resolution
12
+ - Processes: top CPU/memory consumers, zombie/defunct processes, open file descriptors
13
+ - Docker containers (if running): status, CPU, memory, restart counts
14
+ - Systemd services: failed units
15
+ - SSL certificates: expiry checks
16
+ - File watchers: key log files (syslog, dmesg, pm2 logs, custom)
17
+ - Image generation: Flux/SDXL/ComfyUI pipeline errors, model load times, inference timing
18
+ - Environment: all relevant env vars (sanitized), Python packages, Node packages
19
+
20
+ Usage:
21
+ pip install psutil requests
22
+ # Optional: pip install pynvml docker
23
+
24
+ export DEBUGGER_API_KEY="sk_your_api_key_here"
25
+ export DEBUGGER_INGEST_URL="https://your-project.supabase.co/functions/v1/ingest"
26
+ python debugger_agent.py
27
+ """
28
+
29
+ import os
30
+ import sys
31
+ import time
32
+ import json
33
+ import socket
34
+ import logging
35
+ import traceback
36
+ import threading
37
+ import subprocess
38
+ import io
39
+ import re
40
+ import glob
41
+ import ssl
42
+ from datetime import datetime, timezone
43
+ from collections import deque
44
+ from pathlib import Path
45
+
46
+ try:
47
+ import psutil
48
+ except ImportError:
49
+ print("ERROR: psutil required. Install with: pip install psutil")
50
+ sys.exit(1)
51
+
52
+ try:
53
+ import requests
54
+ from requests.adapters import HTTPAdapter
55
+ from urllib3.util.retry import Retry
56
+ except ImportError:
57
+ print("ERROR: requests required. Install with: pip install requests")
58
+ sys.exit(1)
59
+
60
+ # Optional GPU monitoring
61
+ try:
62
+ import pynvml
63
+ pynvml.nvmlInit()
64
+ GPU_AVAILABLE = True
65
+ GPU_DRIVER_VERSION = pynvml.nvmlSystemGetDriverVersion()
66
+ if isinstance(GPU_DRIVER_VERSION, bytes):
67
+ GPU_DRIVER_VERSION = GPU_DRIVER_VERSION.decode()
68
+ try:
69
+ GPU_CUDA_VERSION = pynvml.nvmlSystemGetCudaDriverVersion_v2()
70
+ GPU_CUDA_VERSION = f"{GPU_CUDA_VERSION // 1000}.{(GPU_CUDA_VERSION % 1000) // 10}"
71
+ except Exception:
72
+ GPU_CUDA_VERSION = "unknown"
73
+ except (ImportError, Exception):
74
+ GPU_AVAILABLE = False
75
+ GPU_DRIVER_VERSION = None
76
+ GPU_CUDA_VERSION = None
77
+
78
+ # Optional Docker monitoring
79
+ try:
80
+ import docker
81
+ DOCKER_CLIENT = docker.from_env()
82
+ DOCKER_AVAILABLE = True
83
+ except Exception:
84
+ DOCKER_CLIENT = None
85
+ DOCKER_AVAILABLE = False
86
+
87
+ logging.basicConfig(
88
+ level=logging.INFO,
89
+ format="%(asctime)s [%(levelname)s] %(message)s"
90
+ )
91
+ logger = logging.getLogger("debugger-agent")
92
+
93
+ # Configuration
94
+ API_KEY = os.environ.get("DEBUGGER_API_KEY", "")
95
+ INGEST_URL = os.environ.get("DEBUGGER_INGEST_URL", "")
96
+ SOURCE_NAME = os.environ.get("DEBUGGER_SOURCE", f"vps-{socket.gethostname()}")
97
+ PLATFORM = os.environ.get("DEBUGGER_PLATFORM", "Python (VPS)")
98
+ INTERVAL = int(os.environ.get("DEBUGGER_INTERVAL", "10"))
99
+ VERSION = "3.0.0"
100
+
101
+ # Additional log files to watch
102
+ WATCH_LOG_FILES = [
103
+ p for p in os.environ.get("DEBUGGER_WATCH_LOGS", "").split(",") if p.strip()
104
+ ] or []
105
+
106
+ # SSL domains to check
107
+ SSL_CHECK_DOMAINS = [
108
+ d for d in os.environ.get("DEBUGGER_SSL_DOMAINS", "").split(",") if d.strip()
109
+ ] or []
110
+
111
+ if not API_KEY:
112
+ logger.error("DEBUGGER_API_KEY not set.")
113
+ sys.exit(1)
114
+ if not INGEST_URL:
115
+ logger.error("DEBUGGER_INGEST_URL not set.")
116
+ sys.exit(1)
117
+
118
+ # --- HTTP Session with Retry ---
119
+
120
+ session = requests.Session()
121
+ retry = Retry(total=3, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
122
+ session.mount("https://", HTTPAdapter(max_retries=retry))
123
+ session.mount("http://", HTTPAdapter(max_retries=retry))
124
+
125
+ HEADERS = {
126
+ "Authorization": f"Bearer {API_KEY}",
127
+ "Content-Type": "application/json",
128
+ }
129
+
130
+
131
def send(payload: dict) -> bool:
    """POST *payload* to the ingest endpoint; True iff the server answered 200."""
    try:
        response = session.post(
            INGEST_URL, json=payload, headers=HEADERS, timeout=15
        )
    except Exception as exc:
        # Transport-level failure; the periodic loop will simply try again.
        logger.debug(f"Send failed (will retry): {exc}")
        return False
    return response.status_code == 200
139
+
140
+
141
def run_cmd(cmd: str, timeout: int = 10) -> str:
    """Execute *cmd* through the shell, returning combined stdout+stderr, stripped.

    Failures are reported in-band rather than raised: a timeout yields
    "[timeout after Ns]" and any other launch error yields "[error: ...]".
    """
    try:
        proc = subprocess.run(
            cmd,
            shell=True,
            capture_output=True,
            text=True,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        return f"[timeout after {timeout}s]"
    except Exception as exc:
        return f"[error: {exc}]"
    return (proc.stdout + proc.stderr).strip()
152
+
153
+
154
+ # --- Stdout/Stderr Capture ---
155
+
156
class StreamCapture(io.TextIOBase):
    """Captures writes to stdout/stderr and sends them as logs.

    Wraps an existing stream: every write is mirrored to the original
    stream, buffered locally in a bounded deque, and forwarded to the
    ingest endpoint with a keyword-detected severity.

    NOTE(review): write() performs a synchronous HTTP POST per written
    chunk; if `send` (or the requests stack) ever writes to the wrapped
    stream this could recurse — confirm `send` never prints.
    """

    def __init__(self, original_stream, level="info", max_buffer=500):
        # The real sys.stdout / sys.stderr being wrapped.
        self.original = original_stream
        # Default severity tag ("info" for stdout, "error" for stderr).
        self.level = level
        # Rolling window of the most recent stripped writes.
        self.buffer = deque(maxlen=max_buffer)
        # Guards self.buffer (written here, read in get_recent()).
        self.lock = threading.Lock()

    def write(self, text):
        # Always mirror to the real stream first so local output is intact.
        self.original.write(text)
        if text.strip():
            with self.lock:
                self.buffer.append(text.strip())
            lower = text.lower()
            detected_level = self.level
            # Keyword heuristics escalate the severity based on content.
            if any(kw in lower for kw in [
                "error", "exception", "traceback", "failed", "critical",
                "cuda", "oom", "killed", "segfault", "abort", "panic"
            ]):
                detected_level = "error"
            elif any(kw in lower for kw in ["warning", "warn", "deprecat"]):
                detected_level = "warn"

            # Ship the chunk (capped at 2000 chars) to the ingest endpoint.
            send({
                "type": "log",
                "source": SOURCE_NAME,
                "platform": PLATFORM,
                "version": VERSION,
                "level": detected_level,
                "message": f"[std{self.level}] {text.strip()[:2000]}",
                "context": {"capturedFrom": f"std{self.level}"},
            })
        return len(text)

    def flush(self):
        # Delegate flushing to the wrapped stream.
        self.original.flush()

    def get_recent(self, n=100):
        # Thread-safe snapshot of the last *n* buffered writes.
        with self.lock:
            return list(self.buffer)[-n:]
197
+
198
+
199
+ # --- GPU Metrics (Deep) ---
200
+
201
def get_gpu_metrics() -> dict:
    """Collect per-GPU telemetry via NVML.

    Returns {} when pynvml is unavailable, otherwise a dict with driver
    and CUDA versions, a per-device list, plus the first device's temp
    and VRAM flattened out for the metrics table.  Each optional probe
    (power limit, throttling, clocks, processes, ECC) fails silently so
    one unsupported query never loses the rest.
    """
    if not GPU_AVAILABLE:
        return {}
    try:
        device_count = pynvml.nvmlDeviceGetCount()
        gpus = []
        for i in range(device_count):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
            mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
            # NVML reports milliwatts; convert to watts.
            power = pynvml.nvmlDeviceGetPowerUsage(handle) / 1000
            util = pynvml.nvmlDeviceGetUtilizationRates(handle)
            name = pynvml.nvmlDeviceGetName(handle)
            # Older NVML bindings return bytes for the device name.
            if isinstance(name, bytes):
                name = name.decode("utf-8")

            gpu_info = {
                "index": i,
                "name": name,
                "temp_c": temp,
                "vram_used_gb": round(mem.used / 1e9, 2),
                "vram_total_gb": round(mem.total / 1e9, 2),
                "vram_free_gb": round(mem.free / 1e9, 2),
                "vram_pct": round((mem.used / mem.total) * 100, 1) if mem.total > 0 else 0,
                "gpu_util_pct": util.gpu,
                "mem_util_pct": util.memory,
                "power_w": round(power, 1),
            }

            # Power limit (not supported on all boards).
            try:
                power_limit = pynvml.nvmlDeviceGetPowerManagementLimit(handle) / 1000
                gpu_info["power_limit_w"] = round(power_limit, 1)
            except Exception:
                pass

            # Throttle reasons, decoded from the NVML bitmask.
            # NOTE(review): the literal bit values are assumed to mirror
            # NVML's nvmlClocksThrottleReason* constants — confirm against
            # the installed driver's headers.
            try:
                throttle = pynvml.nvmlDeviceGetCurrentClocksThrottleReasons(handle)
                reasons = []
                if throttle & 0x0000000000000002:
                    reasons.append("idle")
                if throttle & 0x0000000000000004:
                    reasons.append("app_clocks")
                if throttle & 0x0000000000000008:
                    reasons.append("sw_power_cap")
                if throttle & 0x0000000000000020:
                    reasons.append("hw_slowdown")
                if throttle & 0x0000000000000040:
                    reasons.append("sync_boost")
                if throttle & 0x0000000000000080:
                    reasons.append("sw_thermal")
                if throttle & 0x0000000000000100:
                    reasons.append("hw_thermal")
                if throttle & 0x0000000000000200:
                    reasons.append("hw_power_brake")
                gpu_info["throttle_reasons"] = reasons if reasons else ["none"]
            except Exception:
                pass

            # Current graphics / memory clock speeds.
            try:
                gpu_info["clock_graphics_mhz"] = pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_GRAPHICS)
                gpu_info["clock_mem_mhz"] = pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_MEM)
            except Exception:
                pass

            # Up to 10 compute processes on this GPU, enriched via psutil
            # when the PID is still alive and accessible.
            try:
                procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
                gpu_procs = []
                for p in procs[:10]:
                    try:
                        proc = psutil.Process(p.pid)
                        gpu_procs.append({
                            "pid": p.pid,
                            "name": proc.name(),
                            "vram_mb": round(p.usedGpuMemory / 1e6, 1) if p.usedGpuMemory else 0,
                            "cmdline": " ".join(proc.cmdline()[:5])
                        })
                    except Exception:
                        # Fall back to the bare NVML record (pid + VRAM).
                        gpu_procs.append({"pid": p.pid, "vram_mb": round(p.usedGpuMemory / 1e6, 1) if p.usedGpuMemory else 0})
                gpu_info["processes"] = gpu_procs
            except Exception:
                pass

            # Volatile ECC error counters (ECC-capable boards only).
            try:
                ecc_single = pynvml.nvmlDeviceGetTotalEccErrors(handle, pynvml.NVML_SINGLE_BIT_ECC, pynvml.NVML_VOLATILE_ECC)
                ecc_double = pynvml.nvmlDeviceGetTotalEccErrors(handle, pynvml.NVML_DOUBLE_BIT_ECC, pynvml.NVML_VOLATILE_ECC)
                gpu_info["ecc_single_bit"] = ecc_single
                gpu_info["ecc_double_bit"] = ecc_double
            except Exception:
                pass

            gpus.append(gpu_info)

        return {
            "driver_version": GPU_DRIVER_VERSION,
            "cuda_version": GPU_CUDA_VERSION,
            "device_count": device_count,
            "gpus": gpus,
            # Flatten primary GPU for metrics table
            "gpu_temp": gpus[0]["temp_c"] if gpus else None,
            "gpu_vram": gpus[0]["vram_used_gb"] if gpus else None,
        }
    except Exception as e:
        # Any unexpected NVML failure is reported in-band.
        return {"error": str(e)}
309
+
310
+
311
+ # --- PM2 Integration ---
312
+
313
def get_pm2_status() -> dict:
    """Get full pm2 process list with details.

    Runs `pm2 jlist` and flattens each process record (status, restart
    counts, resource usage, paths) plus a tail of its recent stdout and
    error logs.  Returns {"available": False, ...} when pm2 is missing
    or the command failed.
    """
    raw = run_cmd("pm2 jlist 2>/dev/null")
    # run_cmd reports failures in-band via bracketed placeholder strings.
    if not raw or raw.startswith("[error") or raw.startswith("[timeout"):
        return {"available": False, "raw": raw}

    try:
        processes = json.loads(raw)
        result = {"available": True, "processes": []}
        for p in processes:
            env = p.get("pm2_env", {})
            monit = p.get("monit", {})
            proc_info = {
                "name": p.get("name"),
                "pid": p.get("pid"),
                "pm_id": p.get("pm_id"),
                "status": env.get("status"),
                "restart_count": env.get("restart_time", 0),
                "unstable_restarts": env.get("unstable_restarts", 0),
                # pm_uptime is presumably an epoch-ms start timestamp, so
                # this derives elapsed ms — TODO confirm against pm2 docs.
                "uptime_ms": int(time.time() * 1000) - env.get("pm_uptime", 0) if env.get("pm_uptime") else None,
                "cpu_pct": monit.get("cpu"),
                "memory_mb": round(monit.get("memory", 0) / 1e6, 1),
                "exec_mode": env.get("exec_mode"),
                "node_version": env.get("node_version"),
                "script": env.get("pm_exec_path"),
                "cwd": env.get("pm_cwd"),
                "created_at": env.get("created_at"),
                "instances": env.get("instances"),
                "exit_code": env.get("exit_code"),
            }
            # Get recent logs for this process.
            # NOTE(review): the process name is interpolated into a shell
            # command unquoted — a name containing shell metacharacters
            # would be mis-parsed; confirm pm2 app names are restricted.
            recent_logs = run_cmd(f"pm2 logs {p.get('name')} --nostream --lines 30 2>/dev/null", timeout=5)
            # Drops run_cmd placeholders; also drops logs starting with "[".
            if recent_logs and not recent_logs.startswith("["):
                proc_info["recent_logs"] = recent_logs[-3000:]  # Last 3KB

            # Tail of the process's dedicated error-log file, if present.
            err_log_path = env.get("pm_err_log_path")
            if err_log_path and os.path.exists(err_log_path):
                try:
                    with open(err_log_path, "r") as f:
                        lines = f.readlines()
                        proc_info["recent_errors"] = "".join(lines[-50:])[-3000:]
                except Exception:
                    pass

            result["processes"].append(proc_info)
        return result
    except json.JSONDecodeError:
        # pm2 emitted something that isn't JSON; pass a capped raw sample.
        return {"available": True, "raw_output": raw[:2000]}
362
+
363
+
364
def get_pm2_logs(lines: int = 100) -> str:
    """Return the tail (max 5 KB) of the combined pm2 log stream."""
    command = f"pm2 logs --nostream --lines {lines} 2>/dev/null"
    captured = run_cmd(command, timeout=10)
    if not captured:
        return ""
    return captured[-5000:]
368
+
369
+
370
+ # --- System Metrics (Deep) ---
371
+
372
def get_disk_io() -> dict:
    """Aggregate disk I/O counters since boot; empty dict when unavailable."""
    try:
        counters = psutil.disk_io_counters(perdisk=False)
        if not counters:
            return {}
        return {
            "read_mb": round(counters.read_bytes / 1e6, 1),
            "write_mb": round(counters.write_bytes / 1e6, 1),
            "read_count": counters.read_count,
            "write_count": counters.write_count,
            "read_time_ms": counters.read_time,
            "write_time_ms": counters.write_time,
        }
    except Exception:
        # Some platforms/filesystems expose no I/O counters at all.
        return {}
388
+
389
+
390
def get_network_details() -> dict:
    """Get detailed network info.

    Collects per-interface byte/packet/error counters, TCP connection
    counts by state, listening sockets, and a DNS-resolution timing
    probe.  Each section fails independently and silently.
    """
    result = {}

    # Per-interface stats (loopback excluded).
    try:
        net_io = psutil.net_io_counters(pernic=True)
        interfaces = {}
        for name, counters in net_io.items():
            if name == "lo":
                continue
            interfaces[name] = {
                "sent_mb": round(counters.bytes_sent / 1e6, 1),
                "recv_mb": round(counters.bytes_recv / 1e6, 1),
                "packets_sent": counters.packets_sent,
                "packets_recv": counters.packets_recv,
                "errors_in": counters.errin,
                "errors_out": counters.errout,
                "drops_in": counters.dropin,
                "drops_out": counters.dropout,
            }
        result["interfaces"] = interfaces
    except Exception:
        pass

    # TCP connection counts grouped by state (ESTABLISHED, TIME_WAIT, ...).
    try:
        connections = psutil.net_connections(kind="tcp")
        states = {}
        for conn in connections:
            s = conn.status
            states[s] = states.get(s, 0) + 1
        result["tcp_states"] = states
        result["total_connections"] = len(connections)
    except Exception:
        pass

    # Listening sockets, capped at 30 entries.
    try:
        listening = []
        for conn in psutil.net_connections(kind="tcp"):
            if conn.status == "LISTEN":
                listening.append({
                    "port": conn.laddr.port,
                    "addr": conn.laddr.ip,
                    "pid": conn.pid,
                })
        result["listening_ports"] = listening[:30]
    except Exception:
        pass

    # DNS health probe: time a single lookup of a well-known name.
    try:
        start = time.time()
        socket.getaddrinfo("google.com", 80)
        result["dns_resolve_ms"] = round((time.time() - start) * 1000, 1)
    except Exception as e:
        result["dns_error"] = str(e)

    return result
450
+
451
+
452
def get_top_processes(n: int = 15) -> list:
    """Return the *n* heaviest processes, ranked by CPU percentage."""
    wanted_attrs = ["pid", "name", "cpu_percent", "memory_percent", "status",
                    "num_fds", "num_threads", "create_time", "cmdline"]
    collected = []
    for proc in psutil.process_iter(wanted_attrs):
        try:
            snap = proc.info
            # A process seen for the first time has no CPU reading yet.
            if snap["cpu_percent"] is None:
                continue
            argv = snap.get("cmdline") or []
            collected.append({
                "pid": snap["pid"],
                "name": snap["name"],
                "cpu_pct": snap["cpu_percent"],
                "mem_pct": round(snap["memory_percent"] or 0, 1),
                "status": snap["status"],
                "fds": snap.get("num_fds"),
                "threads": snap.get("num_threads"),
                "cmd": " ".join(argv[:5])[:200],
            })
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            # Raced process exit / permission wall — skip it.
            pass
    return sorted(collected, key=lambda entry: entry["cpu_pct"], reverse=True)[:n]
474
+
475
+
476
def get_zombie_processes() -> list:
    """List zombie (defunct) processes as {pid, name, ppid} dicts."""
    found = []
    for proc in psutil.process_iter(["pid", "name", "status", "ppid"]):
        try:
            details = proc.info
            if details["status"] != psutil.STATUS_ZOMBIE:
                continue
            found.append({
                "pid": details["pid"],
                "name": details["name"],
                "ppid": details["ppid"],
            })
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            # Process vanished or is inaccessible — ignore.
            pass
    return found
490
+
491
+
492
def get_disk_details() -> list:
    """Get all mount point usage and inode info.

    Returns one dict per physical mount with capacity figures; on Linux
    the inode columns parsed from `df -i` are attached as raw strings.
    """
    mounts = []
    for part in psutil.disk_partitions(all=False):
        try:
            usage = psutil.disk_usage(part.mountpoint)
            info = {
                "mount": part.mountpoint,
                "device": part.device,
                "fstype": part.fstype,
                "total_gb": round(usage.total / 1e9, 1),
                "used_gb": round(usage.used / 1e9, 1),
                "free_gb": round(usage.free / 1e9, 1),
                "pct": usage.percent,
            }
            # Inode usage (Linux): parse the data row of `df -i`.
            try:
                inode_output = run_cmd(f"df -i {part.mountpoint} | tail -1", timeout=3)
                parts = inode_output.split()
                if len(parts) >= 5:
                    info["inodes_used"] = parts[2]
                    info["inodes_free"] = parts[3]
                    info["inodes_pct"] = parts[4]
            except Exception:
                pass
            mounts.append(info)
        except Exception:
            # Inaccessible mounts (permissions, stale NFS) are skipped.
            pass
    return mounts
521
+
522
+
523
def get_systemd_failed() -> list:
    """Return the names of failed systemd units (max 20).

    Runs `systemctl --failed --plain` and extracts the unit-name column.

    Bug fix: the previous filter excluded any line containing "loaded",
    but every unit row of `systemctl --failed` reads
    "<unit> loaded failed failed <description>" — so no failed unit was
    ever reported.  We now match rows by their LOAD column instead.
    """
    output = run_cmd("systemctl --failed --no-pager --plain 2>/dev/null", timeout=5)
    if not output or "0 loaded" in output:
        return []

    failed = []
    for line in output.split("\n"):
        tokens = line.strip().split()
        # Drop a leading bullet marker some systemctl versions emit.
        if tokens and tokens[0] in ("\u25cf", "*", "x"):
            tokens = tokens[1:]
        # A unit row is "<name>.<suffix> loaded failed failed ..."; the
        # dot requirement rejects the header and the "N loaded units
        # listed." summary line.
        if len(tokens) >= 2 and tokens[1] == "loaded" and "." in tokens[0]:
            failed.append(tokens[0])
    return failed[:20]
537
+
538
+
539
def get_docker_status() -> dict:
    """Get Docker container info.

    Lists all containers (including stopped ones); for running ones a
    single blocking stats sample yields CPU% and memory usage.
    """
    if not DOCKER_AVAILABLE:
        return {"available": False}
    try:
        containers = []
        for c in DOCKER_CLIENT.containers.list(all=True):
            stats = {}
            if c.status == "running":
                try:
                    # stream=False returns exactly one stats snapshot.
                    s = c.stats(stream=False)
                    # CPU% from the delta between the current and previous
                    # samples embedded in the snapshot.
                    cpu_delta = s["cpu_stats"]["cpu_usage"]["total_usage"] - s["precpu_stats"]["cpu_usage"]["total_usage"]
                    system_delta = s["cpu_stats"]["system_cpu_usage"] - s["precpu_stats"]["system_cpu_usage"]
                    cpu_pct = round((cpu_delta / system_delta) * 100, 1) if system_delta > 0 else 0
                    mem_usage = s["memory_stats"].get("usage", 0)
                    mem_limit = s["memory_stats"].get("limit", 1)
                    stats = {
                        "cpu_pct": cpu_pct,
                        "memory_mb": round(mem_usage / 1e6, 1),
                        "memory_limit_mb": round(mem_limit / 1e6, 1),
                    }
                except Exception:
                    pass

            containers.append({
                "name": c.name,
                # Prefer the first tag; fall back to a short image id.
                "image": c.image.tags[0] if c.image.tags else str(c.image.id)[:12],
                "status": c.status,
                "restart_count": c.attrs.get("RestartCount", 0),
                **stats,
            })
        return {"available": True, "containers": containers}
    except Exception as e:
        # Daemon unreachable or API error — report in-band.
        return {"available": False, "error": str(e)}
573
+
574
+
575
def check_ssl_certs() -> list:
    """Check SSL certificate expiry for configured domains.

    Returns one dict per domain in SSL_CHECK_DOMAINS with the raw
    expiry string, days remaining, issuer organization and a
    <30-days warning flag, or {"domain", "error"} when the TLS
    handshake fails.

    Bug fix: the certificate's notAfter field is expressed in GMT, but
    it was previously compared against naive local `datetime.now()`,
    skewing days_left by the host's UTC offset.  Both sides are now UTC.
    """
    results = []
    for domain in SSL_CHECK_DOMAINS:
        try:
            ctx = ssl.create_default_context()
            with ctx.wrap_socket(socket.socket(), server_hostname=domain) as s:
                s.settimeout(5)
                s.connect((domain, 443))
                cert = s.getpeercert()
            # notAfter looks like "Jun  1 12:00:00 2030 GMT" — parse and
            # pin it to UTC before comparing.
            expires = datetime.strptime(
                cert["notAfter"], "%b %d %H:%M:%S %Y %Z"
            ).replace(tzinfo=timezone.utc)
            days_left = (expires - datetime.now(timezone.utc)).days
            issuer = dict(x[0] for x in cert.get("issuer", []))
            results.append({
                "domain": domain,
                "expires": cert["notAfter"],
                "days_left": days_left,
                "issuer": issuer.get("organizationName", "unknown"),
                "warning": days_left < 30,
            })
        except Exception as e:
            results.append({"domain": domain, "error": str(e)})
    return results
597
+
598
+
599
def get_system_logs() -> dict:
    """Collect recent kernel/journal/auth log tails into one dict.

    Each source is included only when its command produced output and
    did not fail; values are truncated to a per-source byte cap.
    """
    # (key, shell command, byte cap or None for uncapped)
    sources = [
        ("dmesg",
         "dmesg --time-format iso -T 2>/dev/null | tail -50", 3000),
        ("oom_kills",
         "dmesg | grep -i 'oom\\|killed process\\|out of memory' | tail -10 2>/dev/null", None),
        ("journal_errors",
         "journalctl -p err --since '1 hour ago' --no-pager -q 2>/dev/null | tail -30", 2000),
        ("auth_log",
         "tail -20 /var/log/auth.log 2>/dev/null || tail -20 /var/log/secure 2>/dev/null", 1000),
    ]
    logs = {}
    for key, command, cap in sources:
        output = run_cmd(command, timeout=5)
        if output and not output.startswith("[error"):
            logs[key] = output[-cap:] if cap else output
    return logs
624
+
625
+
626
def get_firewall_rules() -> str:
    """Summarize active firewall rules, preferring ufw over raw iptables."""
    ufw_output = run_cmd("ufw status verbose 2>/dev/null", timeout=5)
    if ufw_output and "Status:" in ufw_output:
        return ufw_output[:2000]
    # ufw absent or unhelpful — fall back to the raw iptables listing.
    iptables_output = run_cmd("iptables -L -n --line-numbers 2>/dev/null | head -40", timeout=5)
    if not iptables_output:
        return ""
    return iptables_output[:2000]
634
+
635
+
636
def get_env_sanitized() -> dict:
    """Snapshot relevant environment variables with secrets redacted.

    Only names starting with a known-relevant prefix are included; any
    name containing KEY/SECRET/TOKEN/PASS/AUTH is masked, and values
    are capped at 500 characters.
    """
    prefixes = (
        "NODE_", "PYTHON", "PATH", "HOME", "USER", "SHELL", "LANG",
        "CUDA", "NVIDIA", "GPU", "LD_LIBRARY", "VIRTUAL_ENV", "CONDA",
        "PM2_", "PORT", "HOST", "DISPLAY", "XDG_", "DBUS",
    )
    secret_markers = ("KEY", "SECRET", "TOKEN", "PASS", "AUTH")
    snapshot = {}
    for name, value in os.environ.items():
        if not name.startswith(prefixes):
            continue
        upper = name.upper()
        if any(marker in upper for marker in secret_markers):
            snapshot[name] = "[REDACTED]"
        else:
            snapshot[name] = value[:500]
    return snapshot
652
+
653
+
654
def get_installed_packages() -> dict:
    """Inventory Python packages, global npm packages and the pm2 config."""
    inventory = {}

    # Python packages via `pip list --format=json`.
    pip_output = run_cmd("pip list --format=json 2>/dev/null", timeout=10)
    if pip_output and pip_output.startswith("["):
        try:
            inventory["python"] = {
                entry["name"]: entry["version"] for entry in json.loads(pip_output)
            }
        except Exception:
            pass

    # Globally installed Node packages.
    npm_output = run_cmd("npm list -g --depth=0 --json 2>/dev/null", timeout=10)
    if npm_output and npm_output.startswith("{"):
        try:
            dependencies = json.loads(npm_output).get("dependencies", {})
            inventory["node_global"] = {
                pkg: meta.get("version", "?") for pkg, meta in dependencies.items()
            }
        except Exception:
            pass

    # pm2 ecosystem config file, if one exists in the working directory.
    ecosystem = run_cmd("cat ecosystem.config.js 2>/dev/null || cat ecosystem.config.cjs 2>/dev/null", timeout=3)
    if ecosystem and not ecosystem.startswith("[error"):
        inventory["pm2_ecosystem"] = ecosystem[:3000]

    return inventory
681
+
682
+
683
+ # --- File Watcher Thread ---
684
+
685
class LogFileWatcher(threading.Thread):
    """Watch log files for new lines and send them.

    Daemon thread that tails a fixed list of files by byte offset,
    polling every 2 seconds, classifying new content by keyword and
    forwarding it to the ingest endpoint.  Handles truncation/rotation
    by resetting the offset to 0.
    """

    def __init__(self, files: list):
        super().__init__(daemon=True)
        self.files = files
        # filepath -> byte offset of the last-read position.
        self.positions = {}

    def run(self):
        # Initialize positions to end of files: only NEW content after
        # startup is shipped, never the existing backlog.
        for f in self.files:
            try:
                self.positions[f] = os.path.getsize(f)
            except Exception:
                self.positions[f] = 0

        while True:
            for filepath in self.files:
                try:
                    size = os.path.getsize(filepath)
                    if size > self.positions.get(filepath, 0):
                        with open(filepath, "r") as fh:
                            fh.seek(self.positions[filepath])
                            new_lines = fh.read(10000)  # Max 10KB per read
                            self.positions[filepath] = fh.tell()

                        if new_lines.strip():
                            # Keyword heuristics set the severity level.
                            level = "info"
                            lower = new_lines.lower()
                            if any(kw in lower for kw in ["error", "exception", "traceback", "failed", "critical"]):
                                level = "error"
                            elif "warn" in lower:
                                level = "warn"

                            send({
                                "type": "log",
                                "source": SOURCE_NAME,
                                "platform": PLATFORM,
                                "version": VERSION,
                                "level": level,
                                "message": f"[file:{os.path.basename(filepath)}] {new_lines.strip()[:2000]}",
                                "context": {"capturedFrom": "file_watcher", "file": filepath},
                            })
                    elif size < self.positions.get(filepath, 0):
                        # File was truncated/rotated
                        self.positions[filepath] = 0
                except Exception:
                    # Missing/unreadable file this cycle — retry next poll.
                    pass
            time.sleep(2)
734
+
735
+
736
+ # --- Collect Everything ---
737
+
738
def collect_full_metrics() -> dict:
    """Collect absolutely everything.

    Builds the per-tick metric payload: top-level cpu/memory numbers
    plus a "custom" dict of deep CPU, memory, disk, OS and GPU details.
    The 1-second cpu_percent interval makes this call block ~1s.
    """
    cpu = psutil.cpu_percent(interval=1)
    # interval=0 reuses the window since the previous call — no extra wait.
    cpu_per_core = psutil.cpu_percent(interval=0, percpu=True)
    mem = psutil.virtual_memory()
    swap = psutil.swap_memory()

    data = {
        "cpu": cpu,
        "memory": mem.percent,
        "network_latency": 0,
        "custom": {
            # CPU deep
            "cpu_per_core": cpu_per_core,
            "cpu_count_logical": psutil.cpu_count(),
            "cpu_count_physical": psutil.cpu_count(logical=False),
            "load_avg": list(os.getloadavg()) if hasattr(os, "getloadavg") else [],

            # Memory deep (cached/buffers only exist on Linux — default 0)
            "memory_used_gb": round(mem.used / 1e9, 2),
            "memory_total_gb": round(mem.total / 1e9, 2),
            "memory_available_gb": round(mem.available / 1e9, 2),
            "memory_cached_gb": round(getattr(mem, "cached", 0) / 1e9, 2),
            "memory_buffers_gb": round(getattr(mem, "buffers", 0) / 1e9, 2),
            "swap_used_gb": round(swap.used / 1e9, 2),
            "swap_total_gb": round(swap.total / 1e9, 2),
            "swap_pct": swap.percent,

            # Disk I/O
            "disk_io": get_disk_io(),

            # OS info
            "os": f"{os.uname().sysname} {os.uname().release}" if hasattr(os, "uname") else "unknown",
            "hostname": socket.gethostname(),
            "python_version": sys.version.split()[0],
            "node_version": run_cmd("node --version 2>/dev/null"),
            "uptime_hours": round((time.time() - psutil.boot_time()) / 3600, 1),
        },
    }

    # CPU frequency (unsupported on some VMs — skipped silently).
    try:
        freq = psutil.cpu_freq()
        if freq:
            data["custom"]["cpu_freq_mhz"] = round(freq.current, 0)
            data["custom"]["cpu_freq_max_mhz"] = round(freq.max, 0)
    except Exception:
        pass

    # Context switches / interrupts since boot.
    try:
        ctx = psutil.cpu_stats()
        data["custom"]["ctx_switches"] = ctx.ctx_switches
        data["custom"]["interrupts"] = ctx.interrupts
    except Exception:
        pass

    # GPU: flatten primary-device temp/VRAM to the top level, keep the
    # full structure under custom.gpu.
    gpu = get_gpu_metrics()
    if gpu:
        data["gpu_temp"] = gpu.get("gpu_temp")
        data["gpu_vram"] = gpu.get("gpu_vram")
        data["custom"]["gpu"] = gpu

    return data
803
+
804
+
805
def collect_deep_snapshot() -> dict:
    """Full system snapshot for variable inspector.

    Aggregates the expensive, infrequent probes (pm2, disks, network,
    docker, systemd, firewall, sanitized env) into one nested dict and
    attaches the currently tracked job when one exists.
    """
    snapshot = {
        "system": {
            "hostname": socket.gethostname(),
            "uptime_hours": round((time.time() - psutil.boot_time()) / 3600, 1),
            "cpu_count": psutil.cpu_count(),
            "memory_total_gb": round(psutil.virtual_memory().total / 1e9, 2),
            "python_version": sys.version.split()[0],
            "os": f"{os.uname().sysname} {os.uname().release}" if hasattr(os, "uname") else "unknown",
            "kernel": run_cmd("uname -r 2>/dev/null"),
        },
        "gpu": get_gpu_metrics() if GPU_AVAILABLE else {"available": False},
        "pm2": get_pm2_status(),
        "disks": get_disk_details(),
        "network": get_network_details(),
        "top_processes": get_top_processes(15),
        "zombie_processes": get_zombie_processes(),
        "docker": get_docker_status(),
        "systemd_failed": get_systemd_failed(),
        "firewall": get_firewall_rules(),
        "environment": get_env_sanitized(),
    }

    # Job state: include the most recent tracked job, if any.
    state = job_tracker.get_state()
    if state:
        snapshot["current_job"] = state

    return snapshot
835
+
836
+
837
+ # --- Job State Tracking ---
838
+
839
+ class JobTracker:
840
+ """Track job states from BullMQ or similar queue systems."""
841
+
842
+ def __init__(self):
843
+ self.current_job = None
844
+ self.job_history = deque(maxlen=50)
845
+ self.lock = threading.Lock()
846
+
847
+ def update(self, job_id: str, status: str, progress: float = 0,
848
+ last_action: str = "", error: str | None = None,
849
+ metadata: dict | None = None):
850
+ with self.lock:
851
+ job = {
852
+ "job_id": job_id,
853
+ "status": status,
854
+ "progress": progress,
855
+ "last_action": last_action,
856
+ "error": error,
857
+ "metadata": metadata or {},
858
+ "updated_at": datetime.now(timezone.utc).isoformat(),
859
+ }
860
+ self.current_job = job
861
+ self.job_history.append(job)
862
+
863
+ def get_state(self) -> dict | None:
864
+ with self.lock:
865
+ return self.current_job.copy() if self.current_job else None
866
+
867
+ def get_history(self, n: int = 20) -> list:
868
+ with self.lock:
869
+ return list(self.job_history)[-n:]
870
+
871
+ def send_update(self):
872
+ state = self.get_state()
873
+ if not state:
874
+ return
875
+ send({
876
+ "type": "inspect",
877
+ "source": SOURCE_NAME,
878
+ "platform": PLATFORM,
879
+ "version": VERSION,
880
+ "variables": {"current_job": state, "job_history": self.get_history()},
881
+ })
882
+
883
+
884
+ # --- Public API for External Use ---
885
+
886
+ job_tracker = JobTracker()
887
+ stdout_capture = None
888
+ stderr_capture = None
889
+
890
+
891
def send_log(level: str, message: str, context: dict = None):
    """Emit a single log record to the ingest endpoint."""
    payload = {
        "type": "log",
        "source": SOURCE_NAME,
        "platform": PLATFORM,
        "version": VERSION,
        "level": level,
        "message": message,
        "context": context or {},
    }
    send(payload)
901
+
902
+
903
def send_error(title: str, stack_trace: str = "", context: dict = None):
    """Emit an error event to the ingest endpoint."""
    payload = {
        "type": "error",
        "source": SOURCE_NAME,
        "platform": PLATFORM,
        "version": VERSION,
        "title": title,
        "stackTrace": stack_trace,
        "context": context or {},
    }
    send(payload)
913
+
914
+
915
def update_job(job_id: str, status: str, progress: float = 0,
               last_action: str = "", error: str = None, metadata: dict = None):
    """Record a job state change, push it upstream, and report failures."""
    job_tracker.update(job_id, status, progress, last_action, error, metadata)
    job_tracker.send_update()

    if not error:
        return
    failure_context = {
        "job_id": job_id,
        "status": status,
        "last_action": last_action,
        **(metadata or {}),
    }
    send_error(f"Job {job_id} failed: {error}", context=failure_context)
925
+
926
+
927
def capture_image_gen(job_id: str, model: str, params: dict,
                      result: dict = None, error: str = None,
                      duration_s: float = None):
    """Specialized capture for image generation pipelines.

    Logs a success line when *result* is given, reports an error event
    when *error* is given, and records the final job state either way.

    Bug fix: the success message previously formatted duration_s with
    ':.1f' unconditionally, raising TypeError whenever the caller did
    not pass a duration (its default is None).
    """
    context = {
        "job_id": job_id,
        "model": model,
        "params": params,
        "duration_s": duration_s,
    }
    if result:
        context["result"] = result
        # Only format a duration when one was actually supplied.
        took = f"{duration_s:.1f}s" if duration_s is not None else "?s"
        send_log("info", f"Image gen complete: {model} ({took})", context)
    if error:
        context["error"] = error
        send_error(f"Image gen failed: {model} — {error}", context=context)

    update_job(job_id, "completed" if result else "failed",
               progress=100 if result else 0,
               last_action=f"generate:{model}",
               error=error,
               metadata=context)
949
+
950
+
951
+ # --- Main Loop ---
952
+
953
def main():
    """Run the agent: capture stdio, start log watchers, then loop forever.

    The loop body runs every INTERVAL seconds ("tick"); slower collectors are
    scheduled on tick multiples: deep snapshot ~1 min, pm2/GPU checks ~30 s,
    system logs ~2 min, SSL ~10 min, heartbeat ~5 min, packages ~30 min
    (multiples assume the default INTERVAL of 10 s).

    Note: the original release built the SSL warning message with a backslash
    inside an f-string expression part, which is a SyntaxError on every Python
    before 3.12 (PEP 701) despite the package declaring >=3.8. The detail is
    now hoisted into a local variable; the emitted message is unchanged.
    """
    global stdout_capture, stderr_capture

    # Mirror stdout/stderr through capture streams so plain prints are
    # forwarded as log events as well as written to the terminal.
    stdout_capture = StreamCapture(sys.stdout, "info")
    stderr_capture = StreamCapture(sys.stderr, "error")
    sys.stdout = stdout_capture
    sys.stderr = stderr_capture

    logger.info(f"debugger.help Agent v{VERSION} — Ultimate Deep Debugger")
    logger.info(f"Source: {SOURCE_NAME} | GPU: {'yes' if GPU_AVAILABLE else 'no'} | Docker: {'yes' if DOCKER_AVAILABLE else 'no'}")
    logger.info(f"Interval: {INTERVAL}s | Endpoint: {INGEST_URL}")

    # Start file watchers: configured paths plus auto-discovered pm2 logs.
    watch_files = list(WATCH_LOG_FILES)
    pm2_log_dir = os.path.expanduser("~/.pm2/logs")
    if os.path.isdir(pm2_log_dir):
        pm2_logs = glob.glob(os.path.join(pm2_log_dir, "*.log"))
        watch_files.extend(pm2_logs)
        logger.info(f"Watching {len(pm2_logs)} pm2 log files")

    if watch_files:
        watcher = LogFileWatcher(watch_files)
        watcher.start()
        logger.info(f"File watcher started for {len(watch_files)} files")

    # Initial connection: heartbeat plus a startup log with host/GPU details.
    send({"type": "heartbeat", "source": SOURCE_NAME, "platform": PLATFORM, "version": VERSION})
    send_log("info", f"Agent v{VERSION} started on {socket.gethostname()}", {
        "hostname": socket.gethostname(),
        "python_version": sys.version,
        "gpu_available": GPU_AVAILABLE,
        "docker_available": DOCKER_AVAILABLE,
        "pid": os.getpid(),
        "gpu_driver": GPU_DRIVER_VERSION,
        "cuda_version": GPU_CUDA_VERSION,
    })

    # Initial deep snapshot with installed packages (best-effort; package
    # enumeration can be slow or fail, and startup must not).
    try:
        pkgs = get_installed_packages()
        send({
            "type": "inspect",
            "source": SOURCE_NAME,
            "platform": PLATFORM,
            "version": VERSION,
            "variables": {"installed_packages": pkgs},
        })
    except Exception:
        pass

    tick = 0
    consecutive_failures = 0

    while True:
        try:
            # Metrics every tick.
            metrics = collect_full_metrics()
            ok = send({
                "type": "metric",
                "source": SOURCE_NAME,
                "platform": PLATFORM,
                "version": VERSION,
                **metrics,
            })

            if ok:
                consecutive_failures = 0
            else:
                consecutive_failures += 1

            # Auto-reconnect backoff: after 5 straight failures, sleep up to
            # 60s, re-handshake with a heartbeat, and skip this tick's work.
            if consecutive_failures > 5:
                backoff = min(consecutive_failures * 5, 60)
                logger.warning(f"Connection issues. Backing off {backoff}s...")
                time.sleep(backoff)
                send({"type": "heartbeat", "source": SOURCE_NAME, "platform": PLATFORM, "version": VERSION})
                consecutive_failures = 0
                continue

            # Deep snapshot every ~1 min.
            if tick % 6 == 0:
                snapshot = collect_deep_snapshot()
                send({
                    "type": "inspect",
                    "source": SOURCE_NAME,
                    "platform": PLATFORM,
                    "version": VERSION,
                    "variables": snapshot,
                })

            # PM2 logs check every ~30 seconds: scan recent lines for
            # error-ish keywords and forward matches as error logs.
            if tick % 3 == 0:
                pm2_logs = get_pm2_logs(50)
                if pm2_logs:
                    for line in pm2_logs.split("\n"):
                        lower = line.lower()
                        if any(kw in lower for kw in ["error", "exception", "failed", "crash", "enoent", "eacces", "killed"]):
                            send_log("error", f"[pm2] {line.strip()[:500]}", {"capturedFrom": "pm2_logs"})

            # System logs check every ~2 min.
            if tick % 12 == 0:
                sys_logs = get_system_logs()
                if sys_logs:
                    send({
                        "type": "inspect",
                        "source": SOURCE_NAME,
                        "platform": PLATFORM,
                        "version": VERSION,
                        "variables": {"system_logs": sys_logs},
                    })

            # SSL cert check every ~10 min.
            if tick % 60 == 0 and SSL_CHECK_DOMAINS:
                ssl_results = check_ssl_certs()
                for r in ssl_results:
                    if r.get("warning") or r.get("error"):
                        # Hoisted out of the f-string: a backslash-escaped
                        # quote inside an expression part is a SyntaxError
                        # before Python 3.12.
                        detail = r.get("error") or f"{r.get('days_left')} days left"
                        send_log("warn", f"SSL cert issue: {r.get('domain')} — {detail}", r)
                send({
                    "type": "inspect",
                    "source": SOURCE_NAME,
                    "platform": PLATFORM,
                    "version": VERSION,
                    "variables": {"ssl_certs": ssl_results},
                })

            # GPU warnings every ~30 seconds: temperature, VRAM pressure,
            # and any active (non-idle) throttle reasons.
            if GPU_AVAILABLE and tick % 3 == 0:
                gpu = get_gpu_metrics()
                gpus = gpu.get("gpus", [])
                for g in gpus:
                    if g.get("temp_c", 0) > 85:
                        send_log("warn", f"GPU {g['index']} temperature critical: {g['temp_c']}°C", g)
                    if g.get("vram_pct", 0) > 90:
                        send_log("warn", f"GPU {g['index']} VRAM critical: {g['vram_pct']}% ({g['vram_used_gb']}/{g['vram_total_gb']} GB)", g)
                    throttle = g.get("throttle_reasons", [])
                    if throttle and throttle != ["none"] and throttle != ["idle"]:
                        send_log("warn", f"GPU {g['index']} throttling: {', '.join(throttle)}", g)

            # Heartbeat every ~5 min (skip tick 0 — already sent at startup).
            if tick % 30 == 0 and tick > 0:
                send({"type": "heartbeat", "source": SOURCE_NAME, "platform": PLATFORM, "version": VERSION})

            # Packages update every ~30 min (skip tick 0 — sent at startup).
            if tick % 180 == 0 and tick > 0:
                pkgs = get_installed_packages()
                send({
                    "type": "inspect",
                    "source": SOURCE_NAME,
                    "platform": PLATFORM,
                    "version": VERSION,
                    "variables": {"installed_packages": pkgs},
                })

            tick += 1
            time.sleep(INTERVAL)

        except KeyboardInterrupt:
            # Graceful shutdown: announce, restore the real stdio streams.
            send_log("info", "Agent shutting down gracefully")
            logger.info("Shutting down...")
            sys.stdout = stdout_capture.original
            sys.stderr = stderr_capture.original
            break
        except Exception as e:
            # Never let one bad tick kill the agent: report and keep looping.
            logger.error(f"Main loop error: {e}")
            send_error(str(e), traceback.format_exc())
            time.sleep(INTERVAL)
1122
+
1123
+
1124
# Run the agent when executed directly (the console script `debugger-agent`
# also points at this `main`, per entry_points.txt).
if __name__ == "__main__":
    main()
@@ -0,0 +1,89 @@
1
+ Metadata-Version: 2.4
2
+ Name: debugger-help
3
+ Version: 3.0.0
4
+ Summary: debugger.help VPS Agent — Deep system monitoring for logs, GPU, PM2, Docker, and more
5
+ Author: debugger.help
6
+ License: MIT
7
+ Project-URL: Homepage, https://debugger.help
8
+ Project-URL: Repository, https://github.com/YOUR_ORG/debugger-help
9
+ Keywords: debugger,monitoring,logging,gpu,vps,observability
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Topic :: System :: Monitoring
15
+ Requires-Python: >=3.8
16
+ Description-Content-Type: text/markdown
17
+ Requires-Dist: psutil>=5.9.0
18
+ Requires-Dist: requests>=2.28.0
19
+ Provides-Extra: gpu
20
+ Requires-Dist: pynvml>=11.0.0; extra == "gpu"
21
+ Provides-Extra: docker
22
+ Requires-Dist: docker>=6.0.0; extra == "docker"
23
+ Provides-Extra: all
24
+ Requires-Dist: pynvml>=11.0.0; extra == "all"
25
+ Requires-Dist: docker>=6.0.0; extra == "all"
26
+
27
+ # debugger-help
28
+
29
+ Deep VPS monitoring agent for [debugger.help](https://debugger.help). Captures PM2, GPU metrics, Docker containers, system health, and streams everything to your dashboard in real time.
30
+
31
+ ## Install
32
+
33
+ ```bash
34
+ pip install debugger-help
35
+ ```
36
+
37
+ With GPU monitoring:
38
+ ```bash
39
+ pip install "debugger-help[gpu]"
40
+ ```
41
+
42
+ With everything:
43
+ ```bash
44
+ pip install "debugger-help[all]"
45
+ ```
46
+
47
+ ## Quick Start
48
+
49
+ ```bash
50
+ export DEBUGGER_API_KEY="your-api-key"
51
+ export DEBUGGER_INGEST_URL="your-ingest-url"
52
+ debugger-agent
53
+ ```
54
+
55
+ Or keep it running with PM2:
56
+ ```bash
57
+ pm2 start "debugger-agent" --name debugger-agent
58
+ pm2 save
59
+ ```
60
+
61
+ ## What it captures
62
+
63
+ - PM2 process states, restart counts, logs, error logs
64
+ - GPU: VRAM per-process, temperature, power, utilization, clock speeds, throttle reasons, ECC errors
65
+ - CPU: per-core usage, load average, frequency, context switches
66
+ - Memory: RAM, swap, shared, buffers, cached
67
+ - Disk: usage per mount, I/O rates, inode usage
68
+ - Network: per-interface stats, TCP connection states, open ports, DNS resolution
69
+ - Processes: top CPU/memory consumers, zombies, open file descriptors
70
+ - Docker containers: status, CPU, memory, restart counts
71
+ - Systemd: failed units
72
+ - SSL certificates: expiry checks
73
+ - System logs: dmesg, OOM kills, journalctl errors
74
+ - File watchers: syslog, PM2 logs, custom log files
75
+
76
+ ## Environment Variables
77
+
78
+ | Variable | Required | Description |
79
+ |----------|----------|-------------|
80
+ | `DEBUGGER_API_KEY` | Yes | Your debugger.help API key |
81
+ | `DEBUGGER_INGEST_URL` | Yes | Your ingest endpoint URL |
82
+ | `DEBUGGER_SOURCE` | No | Source name (default: `vps-{hostname}`) |
83
+ | `DEBUGGER_INTERVAL` | No | Collection interval in seconds (default: `10`) |
84
+ | `DEBUGGER_WATCH_LOGS` | No | Comma-separated extra log file paths |
85
+ | `DEBUGGER_SSL_DOMAINS` | No | Comma-separated domains for SSL checks |
86
+
87
+ ## License
88
+
89
+ MIT
@@ -0,0 +1,7 @@
1
+ debugger_help/__init__.py,sha256=LvFFmKY3_tMH7u9l7pif6OQX9YLTzXD-lAtpT16bB60,80
2
+ debugger_help/agent.py,sha256=NWwOZlL-0PafD1ZfRNGnr6j1DGLJ7x_W-a3JcYuVgkU,40545
3
+ debugger_help-3.0.0.dist-info/METADATA,sha256=QFtQNHPDsFCGntsmXJB7ZQBjf_bqaI65qsTZNPUKfZo,2803
4
+ debugger_help-3.0.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
5
+ debugger_help-3.0.0.dist-info/entry_points.txt,sha256=TbCJip8NfrK6ArjhHyLBAJLMDVtKTxm-IYwRNULNXqo,60
6
+ debugger_help-3.0.0.dist-info/top_level.txt,sha256=Mvew_obR30M1IogknxotpcHXyI4x4EqwWUMUyG94MaU,14
7
+ debugger_help-3.0.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ debugger-agent = debugger_help.agent:main
@@ -0,0 +1 @@
1
+ debugger_help