debugger-help 3.0.2__tar.gz → 4.0.0__tar.gz

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: debugger-help
3
- Version: 3.0.2
3
+ Version: 4.0.0
4
4
  Summary: debugger.help VPS Agent — Deep system monitoring for logs, GPU, PM2, Docker, and more
5
5
  Author: debugger.help
6
6
  License: MIT
@@ -1,2 +1,2 @@
1
1
  """debugger.help VPS Agent — Deep system monitoring."""
2
- __version__ = "3.0.2"
2
+ __version__ = "4.0.0"
@@ -1,29 +1,11 @@
1
1
  #!/usr/bin/env python3
2
2
  """
3
- debugger.help VPS Agent v3 — Ultimate Deep Debugger
4
-
5
- Captures EVERYTHING from your VPS:
6
- - pm2 logs, status, process list, restart counts
7
- - GPU: VRAM, temp, power, utilization, ECC errors, throttling, driver version, CUDA version
8
- - CPU: per-core usage, load average, frequency, context switches
9
- - Memory: RAM, swap, shared, buffers, cached
10
- - Disk: usage per mount, I/O read/write rates, inode usage
11
- - Network: per-interface stats, connection counts by state, open ports, DNS resolution
12
- - Processes: top CPU/memory consumers, zombie/defunct processes, open file descriptors
13
- - Docker containers (if running): status, CPU, memory, restart counts
14
- - Systemd services: failed units
15
- - SSL certificates: expiry checks
16
- - File watchers: key log files (syslog, dmesg, pm2 logs, custom)
17
- - Image generation: Flux/SDXL/ComfyUI pipeline errors, model load times, inference timing
18
- - Environment: all relevant env vars (sanitized), Python packages, Node packages
19
-
20
- Usage:
21
- pip install psutil requests
22
- # Optional: pip install pynvml docker
23
-
24
- export DEBUGGER_API_KEY="sk_your_api_key_here"
25
- export DEBUGGER_INGEST_URL="https://your-project.supabase.co/functions/v1/ingest"
26
- python debugger_agent.py
3
+ debugger.help VPS Agent v4.0.0 — Action-Based Execution
4
+
5
+ Key changes from v3:
6
+ - Action-based command system: only predefined actions execute, no raw commands
7
+ - Fixed error/warning log flood: deduplication + excluded agent's own logs
8
+ - Deterministic execution: AI selects from action registry, agent maps to commands
27
9
  """
28
10
 
29
11
  import os
@@ -67,7 +49,7 @@ try:
67
49
  GPU_DRIVER_VERSION = GPU_DRIVER_VERSION.decode()
68
50
  try:
69
51
  GPU_CUDA_VERSION = pynvml.nvmlSystemGetCudaDriverVersion_v2()
70
- GPU_CUDA_VERSION = f"{GPU_CUDA_VERSION // 1000}.{(GPU_CUDA_VERSION % 1000) // 10}"
52
+ GPU_CUDA_VERSION = "{}.{}".format(GPU_CUDA_VERSION // 1000, (GPU_CUDA_VERSION % 1000) // 10)
71
53
  except Exception:
72
54
  GPU_CUDA_VERSION = "unknown"
73
55
  except (ImportError, Exception):
@@ -93,10 +75,13 @@ logger = logging.getLogger("debugger-agent")
93
75
  # Configuration
94
76
  API_KEY = os.environ.get("DEBUGGER_API_KEY", "")
95
77
  INGEST_URL = os.environ.get("DEBUGGER_INGEST_URL", "")
96
- SOURCE_NAME = os.environ.get("DEBUGGER_SOURCE", f"vps-{socket.gethostname()}")
78
+ SOURCE_NAME = os.environ.get("DEBUGGER_SOURCE", "vps-{}".format(socket.gethostname()))
97
79
  PLATFORM = os.environ.get("DEBUGGER_PLATFORM", "Python (VPS)")
98
80
  INTERVAL = int(os.environ.get("DEBUGGER_INTERVAL", "10"))
99
- VERSION = "3.0.2"
81
+ VERSION = "4.0.0"
82
+
83
+ # Derive poll-commands URL from ingest URL
84
+ POLL_COMMANDS_URL = INGEST_URL.replace("/ingest", "/poll-commands") if INGEST_URL else ""
100
85
 
101
86
  # Additional log files to watch
102
87
  WATCH_LOG_FILES = [
@@ -123,22 +108,24 @@ session.mount("https://", HTTPAdapter(max_retries=retry))
123
108
  session.mount("http://", HTTPAdapter(max_retries=retry))
124
109
 
125
110
  HEADERS = {
126
- "Authorization": f"Bearer {API_KEY}",
111
+ "Authorization": "Bearer {}".format(API_KEY),
127
112
  "Content-Type": "application/json",
128
113
  }
129
114
 
130
115
 
131
- def send(payload: dict) -> bool:
116
+ def send(payload):
132
117
  """Send payload with auto-retry."""
133
118
  try:
134
119
  resp = session.post(INGEST_URL, json=payload, headers=HEADERS, timeout=15)
120
+ if resp.status_code != 200:
121
+ logger.warning("Send failed [%s %s]: %s", payload.get("type"), resp.status_code, resp.text[:200])
135
122
  return resp.status_code == 200
136
123
  except Exception as e:
137
- logger.debug(f"Send failed (will retry): {e}")
124
+ logger.warning("Send error [%s]: %s", payload.get("type"), e)
138
125
  return False
139
126
 
140
127
 
141
- def run_cmd(cmd: str, timeout: int = 10) -> str:
128
+ def run_cmd(cmd, timeout=10):
142
129
  """Run shell command and return output."""
143
130
  try:
144
131
  result = subprocess.run(
@@ -146,26 +133,64 @@ def run_cmd(cmd: str, timeout: int = 10) -> str:
146
133
  )
147
134
  return (result.stdout + result.stderr).strip()
148
135
  except subprocess.TimeoutExpired:
149
- return f"[timeout after {timeout}s]"
136
+ return "[timeout after {}s]".format(timeout)
150
137
  except Exception as e:
151
- return f"[error: {e}]"
138
+ return "[error: {}]".format(e)
139
+
140
+
141
+ # =============================================================================
142
+ # ACTION REGISTRY — the ONLY commands that can be executed remotely
143
+ # =============================================================================
144
+
145
+ ACTION_COMMANDS = {
146
+ "check_gpu": "nvidia-smi",
147
+ "check_gpu_detailed": "nvidia-smi --query-gpu=memory.used,memory.total,utilization.gpu,temperature.gpu --format=csv,noheader,nounits",
148
+ "check_docker": "systemctl status docker",
149
+ "restart_docker": "sudo systemctl restart docker",
150
+ "check_comfy": "pm2 logs yourstudio-gpu --nostream --lines 100 2>/dev/null || pm2 logs --nostream --lines 100 2>/dev/null",
151
+ "restart_comfy": "pm2 restart yourstudio-gpu 2>/dev/null || pm2 restart all",
152
+ "check_processes": "ps aux --sort=-%cpu | head -30",
153
+ "check_ports": "ss -tulnp",
154
+ "check_health": "curl -fsS --connect-timeout 5 --max-time 10 http://127.0.0.1:8188/health 2>&1 || echo 'Health check failed'",
155
+ "check_disk": "df -h",
156
+ "check_memory": "free -h",
157
+ "check_uptime": "uptime",
158
+ "check_pm2": "pm2 list",
159
+ "check_systemd": "systemctl --failed --no-pager --plain 2>/dev/null | head -20",
160
+ "check_journal_errors": "journalctl -p err --since '1 hour ago' --no-pager -q 2>/dev/null | tail -50",
161
+ "check_dmesg": "dmesg --time-format iso -T 2>/dev/null | tail -50",
162
+ "check_network": "ip addr show",
163
+ }
152
164
 
165
+ # Actions that are never auto-executed even if the server says so
166
+ DANGEROUS_ACTIONS = {"restart_docker", "restart_comfy"}
153
167
 
154
- # --- Stdout/Stderr Capture ---
168
+
169
+ # =============================================================================
170
+ # Stdout/Stderr Capture (with deduplication)
171
+ # =============================================================================
155
172
 
156
173
  class StreamCapture(io.TextIOBase):
157
- """Captures writes to stdout/stderr and sends them as logs."""
174
+ """Captures writes to stdout/stderr and buffers them for batch sending."""
158
175
 
159
176
  def __init__(self, original_stream, level="info", max_buffer=500):
160
177
  self.original = original_stream
161
178
  self.level = level
162
179
  self.buffer = deque(maxlen=max_buffer)
180
+ self.pending = deque(maxlen=100)
163
181
  self.lock = threading.Lock()
182
+ self._sending = False
183
+ self._recent_hashes = deque(maxlen=200)
164
184
 
165
185
  def write(self, text):
166
186
  self.original.write(text)
167
- if text.strip():
187
+ if text.strip() and not self._sending:
188
+ msg_hash = hash(text.strip()[:200])
168
189
  with self.lock:
190
+ # Deduplicate: skip if we've seen this exact message recently
191
+ if msg_hash in self._recent_hashes:
192
+ return len(text)
193
+ self._recent_hashes.append(msg_hash)
169
194
  self.buffer.append(text.strip())
170
195
  lower = text.lower()
171
196
  detected_level = self.level
@@ -176,29 +201,43 @@ class StreamCapture(io.TextIOBase):
176
201
  detected_level = "error"
177
202
  elif any(kw in lower for kw in ["warning", "warn", "deprecat"]):
178
203
  detected_level = "warn"
204
+ self.pending.append((detected_level, text.strip()[:2000]))
205
+ return len(text)
206
+
207
+ def flush(self):
208
+ self.original.flush()
179
209
 
210
+ def flush_pending(self):
211
+ """Send buffered messages from main loop to avoid recursion."""
212
+ items = []
213
+ with self.lock:
214
+ while self.pending:
215
+ items.append(self.pending.popleft())
216
+ self._sending = True
217
+ try:
218
+ for level, msg in items:
180
219
  send({
181
220
  "type": "log",
182
221
  "source": SOURCE_NAME,
183
222
  "platform": PLATFORM,
184
223
  "version": VERSION,
185
- "level": detected_level,
186
- "message": f"[std{self.level}] {text.strip()[:2000]}",
187
- "context": {"capturedFrom": f"std{self.level}"},
224
+ "level": level,
225
+ "message": "[std{}] {}".format(self.level, msg),
226
+ "context": {"capturedFrom": "std{}".format(self.level)},
188
227
  })
189
- return len(text)
190
-
191
- def flush(self):
192
- self.original.flush()
228
+ finally:
229
+ self._sending = False
193
230
 
194
231
  def get_recent(self, n=100):
195
232
  with self.lock:
196
233
  return list(self.buffer)[-n:]
197
234
 
198
235
 
199
- # --- GPU Metrics (Deep) ---
236
+ # =============================================================================
237
+ # GPU Metrics (Deep)
238
+ # =============================================================================
200
239
 
201
- def get_gpu_metrics() -> dict:
240
+ def get_gpu_metrics():
202
241
  if not GPU_AVAILABLE:
203
242
  return {}
204
243
  try:
@@ -227,45 +266,33 @@ def get_gpu_metrics() -> dict:
227
266
  "power_w": round(power, 1),
228
267
  }
229
268
 
230
- # Power limit
231
269
  try:
232
270
  power_limit = pynvml.nvmlDeviceGetPowerManagementLimit(handle) / 1000
233
271
  gpu_info["power_limit_w"] = round(power_limit, 1)
234
272
  except Exception:
235
273
  pass
236
274
 
237
- # Throttle reasons
238
275
  try:
239
276
  throttle = pynvml.nvmlDeviceGetCurrentClocksThrottleReasons(handle)
240
277
  reasons = []
241
- if throttle & 0x0000000000000002:
242
- reasons.append("idle")
243
- if throttle & 0x0000000000000004:
244
- reasons.append("app_clocks")
245
- if throttle & 0x0000000000000008:
246
- reasons.append("sw_power_cap")
247
- if throttle & 0x0000000000000020:
248
- reasons.append("hw_slowdown")
249
- if throttle & 0x0000000000000040:
250
- reasons.append("sync_boost")
251
- if throttle & 0x0000000000000080:
252
- reasons.append("sw_thermal")
253
- if throttle & 0x0000000000000100:
254
- reasons.append("hw_thermal")
255
- if throttle & 0x0000000000000200:
256
- reasons.append("hw_power_brake")
278
+ if throttle & 0x0000000000000002: reasons.append("idle")
279
+ if throttle & 0x0000000000000004: reasons.append("app_clocks")
280
+ if throttle & 0x0000000000000008: reasons.append("sw_power_cap")
281
+ if throttle & 0x0000000000000020: reasons.append("hw_slowdown")
282
+ if throttle & 0x0000000000000040: reasons.append("sync_boost")
283
+ if throttle & 0x0000000000000080: reasons.append("sw_thermal")
284
+ if throttle & 0x0000000000000100: reasons.append("hw_thermal")
285
+ if throttle & 0x0000000000000200: reasons.append("hw_power_brake")
257
286
  gpu_info["throttle_reasons"] = reasons if reasons else ["none"]
258
287
  except Exception:
259
288
  pass
260
289
 
261
- # Clock speeds
262
290
  try:
263
291
  gpu_info["clock_graphics_mhz"] = pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_GRAPHICS)
264
292
  gpu_info["clock_mem_mhz"] = pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_MEM)
265
293
  except Exception:
266
294
  pass
267
295
 
268
- # Running processes on GPU
269
296
  try:
270
297
  procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
271
298
  gpu_procs = []
@@ -284,7 +311,6 @@ def get_gpu_metrics() -> dict:
284
311
  except Exception:
285
312
  pass
286
313
 
287
- # ECC errors
288
314
  try:
289
315
  ecc_single = pynvml.nvmlDeviceGetTotalEccErrors(handle, pynvml.NVML_SINGLE_BIT_ECC, pynvml.NVML_VOLATILE_ECC)
290
316
  ecc_double = pynvml.nvmlDeviceGetTotalEccErrors(handle, pynvml.NVML_DOUBLE_BIT_ECC, pynvml.NVML_VOLATILE_ECC)
@@ -300,7 +326,6 @@ def get_gpu_metrics() -> dict:
300
326
  "cuda_version": GPU_CUDA_VERSION,
301
327
  "device_count": device_count,
302
328
  "gpus": gpus,
303
- # Flatten primary GPU for metrics table
304
329
  "gpu_temp": gpus[0]["temp_c"] if gpus else None,
305
330
  "gpu_vram": gpus[0]["vram_used_gb"] if gpus else None,
306
331
  }
@@ -308,10 +333,11 @@ def get_gpu_metrics() -> dict:
308
333
  return {"error": str(e)}
309
334
 
310
335
 
311
- # --- PM2 Integration ---
336
+ # =============================================================================
337
+ # PM2 Integration
338
+ # =============================================================================
312
339
 
313
- def get_pm2_status() -> dict:
314
- """Get full pm2 process list with details."""
340
+ def get_pm2_status():
315
341
  raw = run_cmd("pm2 jlist 2>/dev/null")
316
342
  if not raw or raw.startswith("[error") or raw.startswith("[timeout"):
317
343
  return {"available": False, "raw": raw}
@@ -340,37 +366,22 @@ def get_pm2_status() -> dict:
340
366
  "instances": env.get("instances"),
341
367
  "exit_code": env.get("exit_code"),
342
368
  }
343
- # Get recent logs for this process
344
- recent_logs = run_cmd(f"pm2 logs {p.get('name')} --nostream --lines 30 2>/dev/null", timeout=5)
345
- if recent_logs and not recent_logs.startswith("["):
346
- proc_info["recent_logs"] = recent_logs[-3000:] # Last 3KB
347
-
348
- # Get recent error logs
349
- err_log_path = env.get("pm_err_log_path")
350
- if err_log_path and os.path.exists(err_log_path):
351
- try:
352
- with open(err_log_path, "r") as f:
353
- lines = f.readlines()
354
- proc_info["recent_errors"] = "".join(lines[-50:])[-3000:]
355
- except Exception:
356
- pass
357
-
358
369
  result["processes"].append(proc_info)
359
370
  return result
360
371
  except json.JSONDecodeError:
361
372
  return {"available": True, "raw_output": raw[:2000]}
362
373
 
363
374
 
364
- def get_pm2_logs(lines: int = 100) -> str:
365
- """Get combined pm2 logs."""
366
- output = run_cmd(f"pm2 logs --nostream --lines {lines} 2>/dev/null", timeout=10)
375
+ def get_pm2_logs(lines=100):
376
+ output = run_cmd("pm2 logs --nostream --lines {} 2>/dev/null".format(lines), timeout=10)
367
377
  return output[-5000:] if output else ""
368
378
 
369
379
 
370
- # --- System Metrics (Deep) ---
380
+ # =============================================================================
381
+ # System Metrics (Deep)
382
+ # =============================================================================
371
383
 
372
- def get_disk_io() -> dict:
373
- """Get disk I/O stats."""
384
+ def get_disk_io():
374
385
  try:
375
386
  io_counters = psutil.disk_io_counters(perdisk=False)
376
387
  if io_counters:
@@ -387,11 +398,8 @@ def get_disk_io() -> dict:
387
398
  return {}
388
399
 
389
400
 
390
- def get_network_details() -> dict:
391
- """Get detailed network info."""
401
+ def get_network_details():
392
402
  result = {}
393
-
394
- # Per-interface stats
395
403
  try:
396
404
  net_io = psutil.net_io_counters(pernic=True)
397
405
  interfaces = {}
@@ -412,7 +420,6 @@ def get_network_details() -> dict:
412
420
  except Exception:
413
421
  pass
414
422
 
415
- # Connection states
416
423
  try:
417
424
  connections = psutil.net_connections(kind="tcp")
418
425
  states = {}
@@ -424,7 +431,6 @@ def get_network_details() -> dict:
424
431
  except Exception:
425
432
  pass
426
433
 
427
- # Listening ports
428
434
  try:
429
435
  listening = []
430
436
  for conn in psutil.net_connections(kind="tcp"):
@@ -438,7 +444,6 @@ def get_network_details() -> dict:
438
444
  except Exception:
439
445
  pass
440
446
 
441
- # DNS check
442
447
  try:
443
448
  start = time.time()
444
449
  socket.getaddrinfo("google.com", 80)
@@ -449,8 +454,7 @@ def get_network_details() -> dict:
449
454
  return result
450
455
 
451
456
 
452
- def get_top_processes(n: int = 15) -> list:
453
- """Get top N processes by CPU and memory."""
457
+ def get_top_processes(n=15):
454
458
  procs = []
455
459
  for p in psutil.process_iter(["pid", "name", "cpu_percent", "memory_percent", "status", "num_fds", "num_threads", "create_time", "cmdline"]):
456
460
  try:
@@ -473,8 +477,7 @@ def get_top_processes(n: int = 15) -> list:
473
477
  return procs[:n]
474
478
 
475
479
 
476
- def get_zombie_processes() -> list:
477
- """Find zombie/defunct processes."""
480
+ def get_zombie_processes():
478
481
  zombies = []
479
482
  for p in psutil.process_iter(["pid", "name", "status", "ppid"]):
480
483
  try:
@@ -489,8 +492,7 @@ def get_zombie_processes() -> list:
489
492
  return zombies
490
493
 
491
494
 
492
- def get_disk_details() -> list:
493
- """Get all mount point usage and inode info."""
495
+ def get_disk_details():
494
496
  mounts = []
495
497
  for part in psutil.disk_partitions(all=False):
496
498
  try:
@@ -504,9 +506,8 @@ def get_disk_details() -> list:
504
506
  "free_gb": round(usage.free / 1e9, 1),
505
507
  "pct": usage.percent,
506
508
  }
507
- # Inode usage (Linux)
508
509
  try:
509
- inode_output = run_cmd(f"df -i {part.mountpoint} | tail -1", timeout=3)
510
+ inode_output = run_cmd("df -i {} | tail -1".format(part.mountpoint), timeout=3)
510
511
  parts = inode_output.split()
511
512
  if len(parts) >= 5:
512
513
  info["inodes_used"] = parts[2]
@@ -520,12 +521,10 @@ def get_disk_details() -> list:
520
521
  return mounts
521
522
 
522
523
 
523
- def get_systemd_failed() -> list:
524
- """Get failed systemd units."""
524
+ def get_systemd_failed():
525
525
  output = run_cmd("systemctl --failed --no-pager --plain 2>/dev/null", timeout=5)
526
526
  if not output or "0 loaded" in output:
527
527
  return []
528
-
529
528
  failed = []
530
529
  for line in output.split("\n"):
531
530
  line = line.strip()
@@ -536,8 +535,7 @@ def get_systemd_failed() -> list:
536
535
  return failed[:20]
537
536
 
538
537
 
539
- def get_docker_status() -> dict:
540
- """Get Docker container info."""
538
+ def get_docker_status():
541
539
  if not DOCKER_AVAILABLE:
542
540
  return {"available": False}
543
541
  try:
@@ -559,7 +557,6 @@ def get_docker_status() -> dict:
559
557
  }
560
558
  except Exception:
561
559
  pass
562
-
563
560
  containers.append({
564
561
  "name": c.name,
565
562
  "image": c.image.tags[0] if c.image.tags else str(c.image.id)[:12],
@@ -572,8 +569,7 @@ def get_docker_status() -> dict:
572
569
  return {"available": False, "error": str(e)}
573
570
 
574
571
 
575
- def check_ssl_certs() -> list:
576
- """Check SSL certificate expiry for configured domains."""
572
+ def check_ssl_certs():
577
573
  results = []
578
574
  for domain in SSL_CHECK_DOMAINS:
579
575
  try:
@@ -596,45 +592,24 @@ def check_ssl_certs() -> list:
596
592
  return results
597
593
 
598
594
 
599
- def get_system_logs() -> dict:
600
- """Get recent system logs (dmesg, syslog, auth)."""
595
+ def get_system_logs():
601
596
  logs = {}
602
-
603
- # dmesg (kernel messages — GPU errors, OOM kills show up here)
604
597
  dmesg = run_cmd("dmesg --time-format iso -T 2>/dev/null | tail -50", timeout=5)
605
598
  if dmesg and not dmesg.startswith("[error"):
606
599
  logs["dmesg"] = dmesg[-3000:]
607
-
608
- # Check for OOM kills specifically
609
600
  oom = run_cmd("dmesg | grep -i 'oom\\|killed process\\|out of memory' | tail -10 2>/dev/null", timeout=5)
610
601
  if oom and not oom.startswith("[error"):
611
602
  logs["oom_kills"] = oom
612
-
613
- # journalctl recent errors
614
603
  journal = run_cmd("journalctl -p err --since '1 hour ago' --no-pager -q 2>/dev/null | tail -30", timeout=5)
615
604
  if journal and not journal.startswith("[error"):
616
605
  logs["journal_errors"] = journal[-2000:]
617
-
618
- # Auth log (failed logins, suspicious activity)
619
606
  auth = run_cmd("tail -20 /var/log/auth.log 2>/dev/null || tail -20 /var/log/secure 2>/dev/null", timeout=5)
620
607
  if auth and not auth.startswith("[error"):
621
608
  logs["auth_log"] = auth[-1000:]
622
-
623
609
  return logs
624
610
 
625
611
 
626
- def get_firewall_rules() -> str:
627
- """Get firewall rules summary."""
628
- # Try ufw first, then iptables
629
- ufw = run_cmd("ufw status verbose 2>/dev/null", timeout=5)
630
- if ufw and "Status:" in ufw:
631
- return ufw[:2000]
632
- ipt = run_cmd("iptables -L -n --line-numbers 2>/dev/null | head -40", timeout=5)
633
- return ipt[:2000] if ipt else ""
634
-
635
-
636
- def get_env_sanitized() -> dict:
637
- """Get relevant environment variables with secrets redacted."""
612
+ def get_env_sanitized():
638
613
  relevant_prefixes = [
639
614
  "NODE_", "PYTHON", "PATH", "HOME", "USER", "SHELL", "LANG",
640
615
  "CUDA", "NVIDIA", "GPU", "LD_LIBRARY", "VIRTUAL_ENV", "CONDA",
@@ -643,7 +618,6 @@ def get_env_sanitized() -> dict:
643
618
  env = {}
644
619
  for key, value in os.environ.items():
645
620
  if any(key.startswith(p) for p in relevant_prefixes):
646
- # Redact anything that looks like a secret
647
621
  if any(s in key.upper() for s in ["KEY", "SECRET", "TOKEN", "PASS", "AUTH"]):
648
622
  env[key] = "[REDACTED]"
649
623
  else:
@@ -651,19 +625,14 @@ def get_env_sanitized() -> dict:
651
625
  return env
652
626
 
653
627
 
654
- def get_installed_packages() -> dict:
655
- """Get installed Python and Node packages."""
628
+ def get_installed_packages():
656
629
  pkgs = {}
657
-
658
- # Python packages
659
630
  pip_list = run_cmd("pip list --format=json 2>/dev/null", timeout=10)
660
631
  if pip_list and pip_list.startswith("["):
661
632
  try:
662
633
  pkgs["python"] = {p["name"]: p["version"] for p in json.loads(pip_list)}
663
634
  except Exception:
664
635
  pass
665
-
666
- # Node packages (global)
667
636
  npm_list = run_cmd("npm list -g --depth=0 --json 2>/dev/null", timeout=10)
668
637
  if npm_list and npm_list.startswith("{"):
669
638
  try:
@@ -671,27 +640,32 @@ def get_installed_packages() -> dict:
671
640
  pkgs["node_global"] = {k: v.get("version", "?") for k, v in deps.items()}
672
641
  except Exception:
673
642
  pass
674
-
675
- # pm2 ecosystem config
676
643
  pm2_conf = run_cmd("cat ecosystem.config.js 2>/dev/null || cat ecosystem.config.cjs 2>/dev/null", timeout=3)
677
644
  if pm2_conf and not pm2_conf.startswith("[error"):
678
645
  pkgs["pm2_ecosystem"] = pm2_conf[:3000]
679
-
680
646
  return pkgs
681
647
 
682
648
 
683
- # --- File Watcher Thread ---
649
+ def get_firewall_rules():
650
+ ufw = run_cmd("ufw status verbose 2>/dev/null", timeout=5)
651
+ if ufw and "Status:" in ufw:
652
+ return ufw[:2000]
653
+ ipt = run_cmd("iptables -L -n --line-numbers 2>/dev/null | head -40", timeout=5)
654
+ return ipt[:2000] if ipt else ""
684
655
 
685
- class LogFileWatcher(threading.Thread):
686
- """Watch log files for new lines and send them."""
687
656
 
688
- def __init__(self, files: list):
657
+ # =============================================================================
658
+ # File Watcher Thread (with deduplication + agent log exclusion)
659
+ # =============================================================================
660
+
661
+ class LogFileWatcher(threading.Thread):
662
+ def __init__(self, files):
689
663
  super().__init__(daemon=True)
690
664
  self.files = files
691
665
  self.positions = {}
666
+ self._recent_hashes = deque(maxlen=500)
692
667
 
693
668
  def run(self):
694
- # Initialize positions to end of files
695
669
  for f in self.files:
696
670
  try:
697
671
  self.positions[f] = os.path.getsize(f)
@@ -705,10 +679,16 @@ class LogFileWatcher(threading.Thread):
705
679
  if size > self.positions.get(filepath, 0):
706
680
  with open(filepath, "r") as fh:
707
681
  fh.seek(self.positions[filepath])
708
- new_lines = fh.read(10000) # Max 10KB per read
682
+ new_lines = fh.read(10000)
709
683
  self.positions[filepath] = fh.tell()
710
684
 
711
685
  if new_lines.strip():
686
+ # Deduplicate
687
+ msg_hash = hash(new_lines.strip()[:300])
688
+ if msg_hash in self._recent_hashes:
689
+ continue
690
+ self._recent_hashes.append(msg_hash)
691
+
712
692
  level = "info"
713
693
  lower = new_lines.lower()
714
694
  if any(kw in lower for kw in ["error", "exception", "traceback", "failed", "critical"]):
@@ -722,38 +702,35 @@ class LogFileWatcher(threading.Thread):
722
702
  "platform": PLATFORM,
723
703
  "version": VERSION,
724
704
  "level": level,
725
- "message": f"[file:{os.path.basename(filepath)}] {new_lines.strip()[:2000]}",
705
+ "message": "[file:{}] {}".format(os.path.basename(filepath), new_lines.strip()[:2000]),
726
706
  "context": {"capturedFrom": "file_watcher", "file": filepath},
727
707
  })
728
708
  elif size < self.positions.get(filepath, 0):
729
- # File was truncated/rotated
730
709
  self.positions[filepath] = 0
731
710
  except Exception:
732
711
  pass
733
712
  time.sleep(2)
734
713
 
735
714
 
736
- # --- Collect Everything ---
715
+ # =============================================================================
716
+ # Collect Metrics
717
+ # =============================================================================
737
718
 
738
- def collect_full_metrics() -> dict:
739
- """Collect absolutely everything."""
719
+ def collect_full_metrics():
740
720
  cpu = psutil.cpu_percent(interval=1)
741
721
  cpu_per_core = psutil.cpu_percent(interval=0, percpu=True)
742
722
  mem = psutil.virtual_memory()
743
723
  swap = psutil.swap_memory()
744
-
724
+
745
725
  data = {
746
726
  "cpu": cpu,
747
727
  "memory": mem.percent,
748
728
  "network_latency": 0,
749
729
  "custom": {
750
- # CPU deep
751
730
  "cpu_per_core": cpu_per_core,
752
731
  "cpu_count_logical": psutil.cpu_count(),
753
732
  "cpu_count_physical": psutil.cpu_count(logical=False),
754
733
  "load_avg": list(os.getloadavg()) if hasattr(os, "getloadavg") else [],
755
-
756
- # Memory deep
757
734
  "memory_used_gb": round(mem.used / 1e9, 2),
758
735
  "memory_total_gb": round(mem.total / 1e9, 2),
759
736
  "memory_available_gb": round(mem.available / 1e9, 2),
@@ -762,12 +739,8 @@ def collect_full_metrics() -> dict:
762
739
  "swap_used_gb": round(swap.used / 1e9, 2),
763
740
  "swap_total_gb": round(swap.total / 1e9, 2),
764
741
  "swap_pct": swap.percent,
765
-
766
- # Disk I/O
767
742
  "disk_io": get_disk_io(),
768
-
769
- # OS info
770
- "os": f"{os.uname().sysname} {os.uname().release}" if hasattr(os, "uname") else "unknown",
743
+ "os": "{} {}".format(os.uname().sysname, os.uname().release) if hasattr(os, "uname") else "unknown",
771
744
  "hostname": socket.gethostname(),
772
745
  "python_version": sys.version.split()[0],
773
746
  "node_version": run_cmd("node --version 2>/dev/null"),
@@ -775,7 +748,6 @@ def collect_full_metrics() -> dict:
775
748
  },
776
749
  }
777
750
 
778
- # CPU frequency
779
751
  try:
780
752
  freq = psutil.cpu_freq()
781
753
  if freq:
@@ -784,7 +756,6 @@ def collect_full_metrics() -> dict:
784
756
  except Exception:
785
757
  pass
786
758
 
787
- # Context switches
788
759
  try:
789
760
  ctx = psutil.cpu_stats()
790
761
  data["custom"]["ctx_switches"] = ctx.ctx_switches
@@ -792,7 +763,6 @@ def collect_full_metrics() -> dict:
792
763
  except Exception:
793
764
  pass
794
765
 
795
- # GPU
796
766
  gpu = get_gpu_metrics()
797
767
  if gpu:
798
768
  data["gpu_temp"] = gpu.get("gpu_temp")
@@ -802,8 +772,7 @@ def collect_full_metrics() -> dict:
802
772
  return data
803
773
 
804
774
 
805
- def collect_deep_snapshot() -> dict:
806
- """Full system snapshot for variable inspector."""
775
+ def collect_deep_snapshot():
807
776
  snapshot = {
808
777
  "system": {
809
778
  "hostname": socket.gethostname(),
@@ -811,7 +780,7 @@ def collect_deep_snapshot() -> dict:
811
780
  "cpu_count": psutil.cpu_count(),
812
781
  "memory_total_gb": round(psutil.virtual_memory().total / 1e9, 2),
813
782
  "python_version": sys.version.split()[0],
814
- "os": f"{os.uname().sysname} {os.uname().release}" if hasattr(os, "uname") else "unknown",
783
+ "os": "{} {}".format(os.uname().sysname, os.uname().release) if hasattr(os, "uname") else "unknown",
815
784
  "kernel": run_cmd("uname -r 2>/dev/null"),
816
785
  },
817
786
  "gpu": get_gpu_metrics() if GPU_AVAILABLE else {"available": False},
@@ -825,28 +794,23 @@ def collect_deep_snapshot() -> dict:
825
794
  "firewall": get_firewall_rules(),
826
795
  "environment": get_env_sanitized(),
827
796
  }
828
-
829
- # Job state
830
797
  state = job_tracker.get_state()
831
798
  if state:
832
799
  snapshot["current_job"] = state
833
-
834
800
  return snapshot
835
801
 
836
802
 
837
- # --- Job State Tracking ---
803
+ # =============================================================================
804
+ # Job State Tracking
805
+ # =============================================================================
838
806
 
839
807
  class JobTracker:
840
- """Track job states from BullMQ or similar queue systems."""
841
-
842
808
  def __init__(self):
843
809
  self.current_job = None
844
810
  self.job_history = deque(maxlen=50)
845
811
  self.lock = threading.Lock()
846
812
 
847
- def update(self, job_id: str, status: str, progress: float = 0,
848
- last_action: str = "", error: str | None = None,
849
- metadata: dict | None = None):
813
+ def update(self, job_id, status, progress=0, last_action="", error=None, metadata=None):
850
814
  with self.lock:
851
815
  job = {
852
816
  "job_id": job_id,
@@ -860,11 +824,11 @@ class JobTracker:
860
824
  self.current_job = job
861
825
  self.job_history.append(job)
862
826
 
863
- def get_state(self) -> dict | None:
827
+ def get_state(self):
864
828
  with self.lock:
865
829
  return self.current_job.copy() if self.current_job else None
866
830
 
867
- def get_history(self, n: int = 20) -> list:
831
+ def get_history(self, n=20):
868
832
  with self.lock:
869
833
  return list(self.job_history)[-n:]
870
834
 
@@ -881,14 +845,16 @@ class JobTracker:
881
845
  })
882
846
 
883
847
 
884
- # --- Public API for External Use ---
848
+ # =============================================================================
849
+ # Public API
850
+ # =============================================================================
885
851
 
886
852
  job_tracker = JobTracker()
887
853
  stdout_capture = None
888
854
  stderr_capture = None
889
855
 
890
856
 
891
- def send_log(level: str, message: str, context: dict = None):
857
+ def send_log(level, message, context=None):
892
858
  send({
893
859
  "type": "log",
894
860
  "source": SOURCE_NAME,
@@ -900,7 +866,7 @@ def send_log(level: str, message: str, context: dict = None):
900
866
  })
901
867
 
902
868
 
903
- def send_error(title: str, stack_trace: str = "", context: dict = None):
869
+ def send_error(title, stack_trace="", context=None):
904
870
  send({
905
871
  "type": "error",
906
872
  "source": SOURCE_NAME,
@@ -912,74 +878,236 @@ def send_error(title: str, stack_trace: str = "", context: dict = None):
912
878
  })
913
879
 
914
880
 
915
- def update_job(job_id: str, status: str, progress: float = 0,
916
- last_action: str = "", error: str = None, metadata: dict = None):
881
+ def update_job(job_id, status, progress=0, last_action="", error=None, metadata=None):
917
882
  job_tracker.update(job_id, status, progress, last_action, error, metadata)
918
883
  job_tracker.send_update()
919
-
920
884
  if error:
921
- send_error(f"Job {job_id} failed: {error}", context={
885
+ send_error("Job {} failed: {}".format(job_id, error), context={
922
886
  "job_id": job_id, "status": status, "last_action": last_action,
923
887
  **(metadata or {}),
924
888
  })
925
889
 
926
890
 
927
- def capture_image_gen(job_id: str, model: str, params: dict,
928
- result: dict = None, error: str = None,
929
- duration_s: float = None):
930
- """Specialized capture for image generation pipelines."""
931
- context = {
932
- "job_id": job_id,
933
- "model": model,
934
- "params": params,
935
- "duration_s": duration_s,
936
- }
891
+ def capture_image_gen(job_id, model, params, result=None, error=None, duration_s=None):
892
+ context = {"job_id": job_id, "model": model, "params": params, "duration_s": duration_s}
937
893
  if result:
938
894
  context["result"] = result
939
- send_log("info", f"Image gen complete: {model} ({duration_s:.1f}s)", context)
895
+ send_log("info", "Image gen complete: {} ({:.1f}s)".format(model, duration_s or 0), context)
940
896
  if error:
941
897
  context["error"] = error
942
- send_error(f"Image gen failed: {model} — {error}", context=context)
943
-
898
+ send_error("Image gen failed: {} — {}".format(model, error), context=context)
944
899
  update_job(job_id, "completed" if result else "failed",
945
900
  progress=100 if result else 0,
946
- last_action=f"generate:{model}",
947
- error=error,
948
- metadata=context)
901
+ last_action="generate:{}".format(model),
902
+ error=error, metadata=context)
903
+
904
+
905
+ # =============================================================================
906
+ # Action-Based Command Execution
907
+ # =============================================================================
908
+
909
+ def execute_action(action_id, timeout=30):
910
+ """Execute a predefined action and return result."""
911
+ cmd = ACTION_COMMANDS.get(action_id)
912
+ if not cmd:
913
+ return {
914
+ "output": "[Unknown action: {}]".format(action_id),
915
+ "exit_code": -1,
916
+ "duration_ms": 0,
917
+ }
918
+
919
+ start = time.time()
920
+ try:
921
+ result = subprocess.run(
922
+ cmd, shell=True, capture_output=True, text=True, timeout=timeout
923
+ )
924
+ duration_ms = int((time.time() - start) * 1000)
925
+ output = result.stdout
926
+ if result.stderr:
927
+ output += ("\n--- stderr ---\n" + result.stderr) if output else result.stderr
928
+ return {
929
+ "output": output.strip()[:50000],
930
+ "exit_code": result.returncode,
931
+ "duration_ms": duration_ms,
932
+ }
933
+ except subprocess.TimeoutExpired:
934
+ duration_ms = int((time.time() - start) * 1000)
935
+ return {
936
+ "output": "[Action timed out after {}s]".format(timeout),
937
+ "exit_code": -1,
938
+ "duration_ms": duration_ms,
939
+ }
940
+ except Exception as e:
941
+ duration_ms = int((time.time() - start) * 1000)
942
+ return {
943
+ "output": "[Execution error: {}]".format(e),
944
+ "exit_code": -1,
945
+ "duration_ms": duration_ms,
946
+ }
949
947
 
950
948
 
951
- # --- Main Loop ---
949
+ def poll_and_execute_commands():
950
+ """Poll for pending actions and execute them."""
951
+ if not POLL_COMMANDS_URL:
952
+ return
953
+
954
+ try:
955
+ resp = session.post(
956
+ POLL_COMMANDS_URL,
957
+ json={"action": "poll", "source_name": SOURCE_NAME},
958
+ headers=HEADERS,
959
+ timeout=10,
960
+ )
961
+ if resp.status_code != 200:
962
+ return
963
+
964
+ data = resp.json()
965
+ commands = data.get("commands", [])
966
+ settings = data.get("settings") or {}
967
+ max_timeout = settings.get("max_timeout_s", 30)
968
+
969
+ for cmd_entry in commands:
970
+ cmd_id = cmd_entry["id"]
971
+ cmd_str = cmd_entry["command"]
972
+
973
+ # Only accept ACTION: prefixed commands
974
+ if not cmd_str.startswith("ACTION:"):
975
+ logger.warning("Rejected non-action command: %s", cmd_str[:50])
976
+ session.post(
977
+ POLL_COMMANDS_URL,
978
+ json={
979
+ "action": "result",
980
+ "command_id": cmd_id,
981
+ "output": "[REJECTED] Only predefined actions are allowed. Raw commands are disabled.",
982
+ "exit_code": -2,
983
+ "duration_ms": 0,
984
+ "source_name": SOURCE_NAME,
985
+ },
986
+ headers=HEADERS,
987
+ timeout=10,
988
+ )
989
+ continue
990
+
991
+ action_id = cmd_str.replace("ACTION:", "")
992
+
993
+ if action_id not in ACTION_COMMANDS:
994
+ logger.warning("Unknown action: %s", action_id)
995
+ session.post(
996
+ POLL_COMMANDS_URL,
997
+ json={
998
+ "action": "result",
999
+ "command_id": cmd_id,
1000
+ "output": "[REJECTED] Unknown action: {}".format(action_id),
1001
+ "exit_code": -2,
1002
+ "duration_ms": 0,
1003
+ "source_name": SOURCE_NAME,
1004
+ },
1005
+ headers=HEADERS,
1006
+ timeout=10,
1007
+ )
1008
+ continue
1009
+
1010
+ logger.info("Executing action: %s (id=%s...)", action_id, cmd_id[:8])
1011
+
1012
+ # Claim
1013
+ session.post(
1014
+ POLL_COMMANDS_URL,
1015
+ json={"action": "claim", "command_id": cmd_id},
1016
+ headers=HEADERS,
1017
+ timeout=10,
1018
+ )
1019
+
1020
+ # Execute the mapped command
1021
+ result = execute_action(action_id, timeout=min(max_timeout, 60))
1022
+ logger.info("Action %s completed: exit_code=%s (%sms)", action_id, result["exit_code"], result["duration_ms"])
1023
+
1024
+ # Report result
1025
+ session.post(
1026
+ POLL_COMMANDS_URL,
1027
+ json={
1028
+ "action": "result",
1029
+ "command_id": cmd_id,
1030
+ "output": result["output"],
1031
+ "exit_code": result["exit_code"],
1032
+ "duration_ms": result["duration_ms"],
1033
+ "source_name": SOURCE_NAME,
1034
+ },
1035
+ headers=HEADERS,
1036
+ timeout=10,
1037
+ )
1038
+
1039
+ # Log execution
1040
+ send_log(
1041
+ "info" if result["exit_code"] == 0 else "warn",
1042
+ "[action] {} -> exit {} ({}ms)".format(action_id, result["exit_code"], result["duration_ms"]),
1043
+ {"action": action_id, "exit_code": result["exit_code"], "command_id": cmd_id},
1044
+ )
1045
+
1046
+ except Exception as e:
1047
+ logger.debug("Command poll error: %s", e)
1048
+
1049
+
1050
+ # =============================================================================
1051
+ # PM2 Log Tracker (prevents re-sending same lines)
1052
+ # =============================================================================
1053
+
1054
+ class PM2LogTracker:
1055
+ """Tracks which PM2 log lines have been sent to prevent duplicates."""
1056
+ def __init__(self):
1057
+ self._seen_hashes = deque(maxlen=1000)
1058
+
1059
+ def get_new_errors(self, pm2_output):
1060
+ """Return only error lines that haven't been seen before."""
1061
+ new_errors = []
1062
+ for line in pm2_output.split("\n"):
1063
+ lower = line.lower()
1064
+ if any(kw in lower for kw in ["error", "exception", "failed", "crash", "enoent", "eacces", "killed"]):
1065
+ line_hash = hash(line.strip()[:200])
1066
+ if line_hash not in self._seen_hashes:
1067
+ self._seen_hashes.append(line_hash)
1068
+ new_errors.append(line.strip()[:500])
1069
+ return new_errors
1070
+
1071
+
1072
+ pm2_log_tracker = PM2LogTracker()
1073
+
1074
+
1075
+ # =============================================================================
1076
+ # Main Loop
1077
+ # =============================================================================
952
1078
 
953
1079
  def main():
954
1080
  global stdout_capture, stderr_capture
955
1081
 
956
- # Capture stdout/stderr
957
1082
  stdout_capture = StreamCapture(sys.stdout, "info")
958
1083
  stderr_capture = StreamCapture(sys.stderr, "error")
959
1084
  sys.stdout = stdout_capture
960
1085
  sys.stderr = stderr_capture
961
1086
 
962
- logger.info(f"debugger.help Agent v{VERSION}Ultimate Deep Debugger")
963
- logger.info(f"Source: {SOURCE_NAME} | GPU: {'yes' if GPU_AVAILABLE else 'no'} | Docker: {'yes' if DOCKER_AVAILABLE else 'no'}")
964
- logger.info(f"Interval: {INTERVAL}s | Endpoint: {INGEST_URL}")
1087
+ logger.info("debugger.help Agent v%sAction-Based Execution", VERSION)
1088
+ logger.info("Source: %s | GPU: %s | Docker: %s", SOURCE_NAME, "yes" if GPU_AVAILABLE else "no", "yes" if DOCKER_AVAILABLE else "no")
1089
+ logger.info("Interval: %ss | Endpoint: %s", INTERVAL, INGEST_URL)
1090
+ logger.info("Registered actions: %s", ", ".join(sorted(ACTION_COMMANDS.keys())))
965
1091
 
966
- # Start file watchers
1092
+ # Start file watchers — exclude agent's own log files
967
1093
  watch_files = list(WATCH_LOG_FILES)
968
- # Auto-discover pm2 log files
969
1094
  pm2_log_dir = os.path.expanduser("~/.pm2/logs")
970
1095
  if os.path.isdir(pm2_log_dir):
971
1096
  pm2_logs = glob.glob(os.path.join(pm2_log_dir, "*.log"))
1097
+ # Exclude the debugger agent's own log files to prevent feedback loop
1098
+ pm2_logs = [f for f in pm2_logs if "debugger-agent" not in os.path.basename(f).lower()
1099
+ and "debugger_agent" not in os.path.basename(f).lower()]
972
1100
  watch_files.extend(pm2_logs)
973
- logger.info(f"Watching {len(pm2_logs)} pm2 log files")
974
-
1101
+ logger.info("Watching %d pm2 log files (excluded agent logs)", len(pm2_logs))
1102
+
975
1103
  if watch_files:
976
1104
  watcher = LogFileWatcher(watch_files)
977
1105
  watcher.start()
978
- logger.info(f"File watcher started for {len(watch_files)} files")
1106
+ logger.info("File watcher started for %d files", len(watch_files))
979
1107
 
980
1108
  # Initial connection
981
1109
  send({"type": "heartbeat", "source": SOURCE_NAME, "platform": PLATFORM, "version": VERSION})
982
- send_log("info", f"Agent v{VERSION} started on {socket.gethostname()}", {
1110
+ send_log("info", "Agent v{} started on {}".format(VERSION, socket.gethostname()), {
983
1111
  "hostname": socket.gethostname(),
984
1112
  "python_version": sys.version,
985
1113
  "gpu_available": GPU_AVAILABLE,
@@ -987,9 +1115,10 @@ def main():
987
1115
  "pid": os.getpid(),
988
1116
  "gpu_driver": GPU_DRIVER_VERSION,
989
1117
  "cuda_version": GPU_CUDA_VERSION,
1118
+ "available_actions": list(ACTION_COMMANDS.keys()),
990
1119
  })
991
1120
 
992
- # Send initial deep snapshot with packages
1121
+ # Send initial deep snapshot
993
1122
  try:
994
1123
  pkgs = get_installed_packages()
995
1124
  send({
@@ -1022,10 +1151,9 @@ def main():
1022
1151
  else:
1023
1152
  consecutive_failures += 1
1024
1153
 
1025
- # Auto-reconnect backoff
1026
1154
  if consecutive_failures > 5:
1027
1155
  backoff = min(consecutive_failures * 5, 60)
1028
- logger.warning(f"Connection issues. Backing off {backoff}s...")
1156
+ logger.warning("Connection issues. Backing off %ss...", backoff)
1029
1157
  time.sleep(backoff)
1030
1158
  send({"type": "heartbeat", "source": SOURCE_NAME, "platform": PLATFORM, "version": VERSION})
1031
1159
  consecutive_failures = 0
@@ -1042,15 +1170,13 @@ def main():
1042
1170
  "variables": snapshot,
1043
1171
  })
1044
1172
 
1045
- # PM2 logs check every ~30 seconds
1173
+ # PM2 logs check every ~30s (with deduplication)
1046
1174
  if tick % 3 == 0:
1047
1175
  pm2_logs = get_pm2_logs(50)
1048
1176
  if pm2_logs:
1049
- # Check for errors in pm2 logs
1050
- for line in pm2_logs.split("\n"):
1051
- lower = line.lower()
1052
- if any(kw in lower for kw in ["error", "exception", "failed", "crash", "enoent", "eacces", "killed"]):
1053
- send_log("error", f"[pm2] {line.strip()[:500]}", {"capturedFrom": "pm2_logs"})
1177
+ new_errors = pm2_log_tracker.get_new_errors(pm2_logs)
1178
+ for line in new_errors[:5]: # Max 5 new errors per check
1179
+ send_log("error", "[pm2] {}".format(line), {"capturedFrom": "pm2_logs"})
1054
1180
 
1055
1181
  # System logs check every ~2 min
1056
1182
  if tick % 12 == 0:
@@ -1070,8 +1196,8 @@ def main():
1070
1196
  for r in ssl_results:
1071
1197
  if r.get("warning") or r.get("error"):
1072
1198
  days = r.get("days_left")
1073
- detail = r.get("error") or f"{days} days left"
1074
- send_log("warn", f"SSL cert issue: {r.get('domain')} — {detail}", r)
1199
+ detail = r.get("error") or "{} days left".format(days)
1200
+ send_log("warn", "SSL cert issue: {} — {}".format(r.get("domain"), detail), r)
1075
1201
  send({
1076
1202
  "type": "inspect",
1077
1203
  "source": SOURCE_NAME,
@@ -1080,18 +1206,19 @@ def main():
1080
1206
  "variables": {"ssl_certs": ssl_results},
1081
1207
  })
1082
1208
 
1083
- # GPU warnings every ~30 seconds
1209
+ # GPU warnings every ~30s (only when thresholds exceeded)
1084
1210
  if GPU_AVAILABLE and tick % 3 == 0:
1085
1211
  gpu = get_gpu_metrics()
1086
1212
  gpus = gpu.get("gpus", [])
1087
1213
  for g in gpus:
1088
1214
  if g.get("temp_c", 0) > 85:
1089
- send_log("warn", f"GPU {g['index']} temperature critical: {g['temp_c']}°C", g)
1215
+ send_log("warn", "GPU {} temperature critical: {}C".format(g["index"], g["temp_c"]), g)
1090
1216
  if g.get("vram_pct", 0) > 90:
1091
- send_log("warn", f"GPU {g['index']} VRAM critical: {g['vram_pct']}% ({g['vram_used_gb']}/{g['vram_total_gb']} GB)", g)
1217
+ send_log("warn", "GPU {} VRAM critical: {}% ({}/{} GB)".format(
1218
+ g["index"], g["vram_pct"], g["vram_used_gb"], g["vram_total_gb"]), g)
1092
1219
  throttle = g.get("throttle_reasons", [])
1093
1220
  if throttle and throttle != ["none"] and throttle != ["idle"]:
1094
- send_log("warn", f"GPU {g['index']} throttling: {', '.join(throttle)}", g)
1221
+ send_log("warn", "GPU {} throttling: {}".format(g["index"], ", ".join(throttle)), g)
1095
1222
 
1096
1223
  # Heartbeat every ~5 min
1097
1224
  if tick % 30 == 0 and tick > 0:
@@ -1108,6 +1235,15 @@ def main():
1108
1235
  "variables": {"installed_packages": pkgs},
1109
1236
  })
1110
1237
 
1238
+ # Poll for remote actions every tick
1239
+ poll_and_execute_commands()
1240
+
1241
+ # Flush captured stdout/stderr
1242
+ if stdout_capture:
1243
+ stdout_capture.flush_pending()
1244
+ if stderr_capture:
1245
+ stderr_capture.flush_pending()
1246
+
1111
1247
  tick += 1
1112
1248
  time.sleep(INTERVAL)
1113
1249
 
@@ -1118,7 +1254,7 @@ def main():
1118
1254
  sys.stderr = stderr_capture.original
1119
1255
  break
1120
1256
  except Exception as e:
1121
- logger.error(f"Main loop error: {e}")
1257
+ logger.error("Main loop error: %s", e)
1122
1258
  send_error(str(e), traceback.format_exc())
1123
1259
  time.sleep(INTERVAL)
1124
1260
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: debugger-help
3
- Version: 3.0.2
3
+ Version: 4.0.0
4
4
  Summary: debugger.help VPS Agent — Deep system monitoring for logs, GPU, PM2, Docker, and more
5
5
  Author: debugger.help
6
6
  License: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "debugger-help"
7
- version = "3.0.2"
7
+ version = "4.0.0"
8
8
  description = "debugger.help VPS Agent — Deep system monitoring for logs, GPU, PM2, Docker, and more"
9
9
  readme = "README.md"
10
10
  license = {text = "MIT"}
File without changes
File without changes