souleyez 2.26.0-py3-none-any.whl → 2.27.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of souleyez might be problematic.

@@ -25,6 +25,7 @@ import subprocess
  import threading
  import inspect
  import traceback
+ import fcntl
  from typing import List, Dict, Optional, Any
  from souleyez.log_config import get_logger
  from .log_sanitizer import LogSanitizer
@@ -41,7 +42,12 @@ JOBS_DIR = os.path.join(DATA_DIR, "jobs")
  LOGS_DIR = os.path.join(DATA_DIR, "logs")
  JOBS_FILE = os.path.join(JOBS_DIR, "jobs.json")
  WORKER_LOG = os.path.join(LOGS_DIR, "worker.log")
+ HEARTBEAT_FILE = os.path.join(JOBS_DIR, ".worker_heartbeat")
  JOB_TIMEOUT_SECONDS = 3600 # 1 hour (changed from 300s/5min)
+ HEARTBEAT_INTERVAL = 10 # seconds between heartbeat writes
+ HEARTBEAT_STALE_THRESHOLD = 30 # seconds before heartbeat considered stale
+ JOB_HUNG_THRESHOLD = 300 # 5 minutes with no output = possibly hung
+ JOBS_BACKUP_COUNT = 3 # Number of rotating backups to keep

  _lock = threading.RLock() # Reentrant lock allows nested acquisition by same thread

@@ -51,6 +57,63 @@ def _ensure_dirs():
      os.makedirs(LOGS_DIR, exist_ok=True)


+ def _get_backup_files() -> List[str]:
+     """Get list of backup files sorted by modification time (newest first)."""
+     backups = []
+     for i in range(1, JOBS_BACKUP_COUNT + 1):
+         backup_path = f"{JOBS_FILE}.bak.{i}"
+         if os.path.exists(backup_path):
+             backups.append((os.path.getmtime(backup_path), backup_path))
+     # Sort by mtime descending (newest first)
+     backups.sort(reverse=True)
+     return [path for _, path in backups]
+
+
+ def _rotate_backups():
+     """Rotate backup files, keeping only JOBS_BACKUP_COUNT backups."""
+     # Shift existing backups: .bak.2 -> .bak.3, .bak.1 -> .bak.2
+     for i in range(JOBS_BACKUP_COUNT, 1, -1):
+         src = f"{JOBS_FILE}.bak.{i - 1}"
+         dst = f"{JOBS_FILE}.bak.{i}"
+         if os.path.exists(src):
+             try:
+                 shutil.move(src, dst)
+             except Exception:
+                 pass
+
+     # Create new .bak.1 from current jobs.json
+     if os.path.exists(JOBS_FILE):
+         try:
+             shutil.copy2(JOBS_FILE, f"{JOBS_FILE}.bak.1")
+         except Exception:
+             pass
+
+
+ def _recover_from_backup() -> List[Dict[str, Any]]:
+     """
+     Attempt to recover jobs from backup files.
+
+     Returns:
+         List of jobs from the first valid backup, or empty list if no valid backup found
+     """
+     backups = _get_backup_files()
+     for backup_path in backups:
+         try:
+             with open(backup_path, "r", encoding="utf-8") as fh:
+                 jobs = json.load(fh)
+             if isinstance(jobs, list):
+                 _append_worker_log(f"recovered {len(jobs)} jobs from backup: {backup_path}")
+                 logger.info("Jobs recovered from backup", extra={
+                     "backup_path": backup_path,
+                     "job_count": len(jobs)
+                 })
+                 return jobs
+         except Exception as e:
+             _append_worker_log(f"backup {backup_path} also corrupt: {e}")
+             continue
+     return []
+
+
  def _read_jobs() -> List[Dict[str, Any]]:
      _ensure_dirs()
      if not os.path.exists(JOBS_FILE):
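
The three helpers above give jobs.json a small rotating safety net: every write pushes the previous copies down one slot and keeps at most three of them. A minimal standalone sketch of the same rotation scheme, using a throwaway directory instead of souleyez's JOBS_FILE (paths and names here are illustrative only):

    import json
    import os
    import shutil
    import tempfile

    BACKUP_COUNT = 3  # mirrors JOBS_BACKUP_COUNT above

    def rotate_backups(path: str) -> None:
        # Shift .bak.2 -> .bak.3 and .bak.1 -> .bak.2, then copy the current file to .bak.1
        for i in range(BACKUP_COUNT, 1, -1):
            src, dst = f"{path}.bak.{i - 1}", f"{path}.bak.{i}"
            if os.path.exists(src):
                shutil.move(src, dst)
        if os.path.exists(path):
            shutil.copy2(path, f"{path}.bak.1")

    workdir = tempfile.mkdtemp()
    jobs_file = os.path.join(workdir, "jobs.json")
    for n in range(4):
        rotate_backups(jobs_file)  # rotate before each write, as _write_jobs now does
        with open(jobs_file, "w", encoding="utf-8") as fh:
            json.dump([{"id": n}], fh)
    print(sorted(os.listdir(workdir)))
    # ['jobs.json', 'jobs.json.bak.1', 'jobs.json.bak.2', 'jobs.json.bak.3']
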
@@ -58,18 +121,42 @@ def _read_jobs() -> List[Dict[str, Any]]:
      try:
          with open(JOBS_FILE, "r", encoding="utf-8") as fh:
              return json.load(fh)
-     except Exception:
+     except Exception as e:
+         # Log corruption event
+         _append_worker_log(f"jobs.json corrupt: {e}")
+         logger.error("Jobs file corrupted", extra={
+             "error": str(e),
+             "jobs_file": JOBS_FILE
+         })
+
+         # Try to recover from backup
+         recovered_jobs = _recover_from_backup()
+
+         # Move corrupt file aside
          try:
              corrupt = JOBS_FILE + ".corrupt." + str(int(time.time()))
              shutil.move(JOBS_FILE, corrupt)
-             _append_worker_log(f"jobs file corrupt; moved to {corrupt}")
+             _append_worker_log(f"corrupt jobs file moved to {corrupt}")
          except Exception:
              pass
-         return []
+
+         # If we recovered jobs, write them back
+         if recovered_jobs:
+             try:
+                 _write_jobs(recovered_jobs)
+                 _append_worker_log(f"restored {len(recovered_jobs)} jobs from backup")
+             except Exception as write_err:
+                 _append_worker_log(f"failed to restore jobs: {write_err}")
+
+         return recovered_jobs


  def _write_jobs(jobs: List[Dict[str, Any]]):
      _ensure_dirs()
+
+     # Rotate backups before writing (keeps last 3 good copies)
+     _rotate_backups()
+
      tmp = tempfile.NamedTemporaryFile("w", delete=False, dir=JOBS_DIR, encoding="utf-8")
      try:
          json.dump(jobs, tmp, indent=2, ensure_ascii=False)
@@ -93,36 +180,135 @@ def _append_worker_log(msg: str):
          fh.write(line)


+ def _update_heartbeat():
+     """Write current timestamp to heartbeat file for health monitoring."""
+     _ensure_dirs()
+     try:
+         with open(HEARTBEAT_FILE, 'w') as fh:
+             fh.write(str(time.time()))
+     except Exception:
+         pass # Non-critical, don't crash worker
+
+
+ def get_heartbeat_age() -> Optional[float]:
+     """
+     Get age of worker heartbeat in seconds.
+
+     Returns:
+         Age in seconds, or None if heartbeat file doesn't exist
+     """
+     try:
+         if os.path.exists(HEARTBEAT_FILE):
+             with open(HEARTBEAT_FILE, 'r') as fh:
+                 last_beat = float(fh.read().strip())
+                 return time.time() - last_beat
+         return None
+     except Exception:
+         return None
+
+
+ def is_heartbeat_stale() -> bool:
+     """Check if worker heartbeat is stale (older than threshold)."""
+     age = get_heartbeat_age()
+     if age is None:
+         return True # No heartbeat = stale
+     return age > HEARTBEAT_STALE_THRESHOLD
+
+
+ def _get_process_start_time(pid: int) -> Optional[float]:
+     """
+     Get process start time from /proc filesystem (Linux only).
+
+     Returns:
+         Process start time as Unix timestamp, or None if not available
+     """
+     try:
+         stat_path = f"/proc/{pid}/stat"
+         if not os.path.exists(stat_path):
+             return None
+
+         with open(stat_path, 'r') as f:
+             stat = f.read()
+
+         # Parse stat file - field 22 is starttime (in clock ticks since boot)
+         # Format: pid (comm) state ppid pgrp session tty_nr ... starttime ...
+         # Need to handle comm field which may contain spaces/parentheses
+         parts = stat.rsplit(')', 1)
+         if len(parts) < 2:
+             return None
+
+         fields = parts[1].split()
+         if len(fields) < 20:
+             return None
+
+         starttime_ticks = int(fields[19]) # 0-indexed, field 22 is at index 19 after comm
+
+         # Convert to timestamp using system boot time and clock ticks per second
+         with open('/proc/stat', 'r') as f:
+             for line in f:
+                 if line.startswith('btime'):
+                     boot_time = int(line.split()[1])
+                     break
+             else:
+                 return None
+
+         # Get clock ticks per second (usually 100)
+         ticks_per_sec = os.sysconf(os.sysconf_names['SC_CLK_TCK'])
+
+         return boot_time + (starttime_ticks / ticks_per_sec)
+     except Exception:
+         return None
+
+
  def _next_job_id(jobs: List[Dict[str, Any]]) -> int:
      """
-     Get next available job ID.
-
-     Uses a persistent counter to ensure IDs are never reused, even after
-     jobs are purged. This prevents worker confusion when job IDs overlap.
+     Get next available job ID with file locking.
+
+     Uses a persistent counter with fcntl locking to ensure IDs are never
+     reused, even across multiple processes. This prevents duplicate job IDs
+     when multiple jobs are enqueued concurrently.
      """
      counter_file = os.path.join(JOBS_DIR, ".job_counter")
-
+     lock_file = os.path.join(JOBS_DIR, ".job_counter.lock")
+
      try:
-         # Try to read persistent counter
-         if os.path.exists(counter_file):
-             with open(counter_file, 'r') as f:
-                 next_id = int(f.read().strip())
-         else:
-             # Initialize from existing jobs
-             maxid = 0
-             for j in jobs:
-                 try:
-                     if isinstance(j.get("id"), int) and j["id"] > maxid:
-                         maxid = j["id"]
-                 except Exception:
-                     continue
-             next_id = maxid + 1
-
-         # Write incremented counter for next time
-         with open(counter_file, 'w') as f:
-             f.write(str(next_id + 1))
-
-         return next_id
+         _ensure_dirs()
+
+         # Use a separate lock file to allow atomic read-modify-write
+         with open(lock_file, 'w') as lock_fh:
+             # Acquire exclusive lock (blocks until available)
+             fcntl.flock(lock_fh.fileno(), fcntl.LOCK_EX)
+
+             try:
+                 # Read current counter
+                 if os.path.exists(counter_file):
+                     with open(counter_file, 'r') as f:
+                         next_id = int(f.read().strip())
+                 else:
+                     # Initialize from existing jobs
+                     maxid = 0
+                     for j in jobs:
+                         try:
+                             if isinstance(j.get("id"), int) and j["id"] > maxid:
+                                 maxid = j["id"]
+                         except Exception:
+                             continue
+                     next_id = maxid + 1
+
+                 # Write incremented counter atomically
+                 tmp_file = counter_file + '.tmp'
+                 with open(tmp_file, 'w') as f:
+                     f.write(str(next_id + 1))
+                     f.flush()
+                     os.fsync(f.fileno())
+                 os.replace(tmp_file, counter_file)
+
+                 return next_id
+
+             finally:
+                 # Release lock
+                 fcntl.flock(lock_fh.fileno(), fcntl.LOCK_UN)
+
      except Exception:
          # Fallback to old behavior if file operations fail
          maxid = 0
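
The locking change is the heart of this hunk: the counter update is serialized with an exclusive fcntl.flock on a sidecar lock file, and the new value is written to a temp file and swapped in with os.replace so a crash can never leave a half-written counter. A minimal sketch of that pattern in isolation (paths below are illustrative; fcntl is POSIX-only, matching the new import at the top of the diff):

    import fcntl
    import os
    import tempfile

    def next_id(counter_file: str, lock_file: str) -> int:
        """Return the current counter value and atomically store value + 1."""
        with open(lock_file, "w") as lock_fh:
            fcntl.flock(lock_fh.fileno(), fcntl.LOCK_EX)   # serialize across processes
            try:
                try:
                    with open(counter_file, "r") as f:
                        value = int(f.read().strip())
                except (OSError, ValueError):
                    value = 1                              # first use: start at 1
                tmp = counter_file + ".tmp"
                with open(tmp, "w") as f:                  # write-then-rename = atomic update
                    f.write(str(value + 1))
                    f.flush()
                    os.fsync(f.fileno())
                os.replace(tmp, counter_file)
                return value
            finally:
                fcntl.flock(lock_fh.fileno(), fcntl.LOCK_UN)

    d = tempfile.mkdtemp()
    ids = [next_id(os.path.join(d, ".counter"), os.path.join(d, ".counter.lock")) for _ in range(3)]
    print(ids)  # [1, 2, 3] -- no duplicates even if several processes race on the same files
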
@@ -409,14 +595,36 @@ def purge_all_jobs() -> int:
      return purge_jobs(status_filter=['done', 'error', 'killed'])


- def _update_job(jid: int, **fields):
+ def _update_job(jid: int, respect_killed: bool = True, **fields):
+     """
+     Update job fields atomically.
+
+     Args:
+         jid: Job ID to update
+         respect_killed: If True (default), don't overwrite status if job is killed.
+             This prevents race condition where job is killed while completing.
+         **fields: Fields to update
+     """
      with _lock:
          jobs = _read_jobs()
          changed = False
          for j in jobs:
              if j.get("id") == jid:
-                 j.update(fields)
-                 changed = True
+                 # Race condition protection: don't change status of killed jobs
+                 if respect_killed and j.get("status") == STATUS_KILLED and "status" in fields:
+                     # Job was killed - don't overwrite status, but allow other updates
+                     fields_copy = dict(fields)
+                     del fields_copy["status"]
+                     if fields_copy:
+                         j.update(fields_copy)
+                         changed = True
+                     logger.debug("Skipped status update for killed job", extra={
+                         "job_id": jid,
+                         "attempted_status": fields.get("status")
+                     })
+                 else:
+                     j.update(fields)
+                     changed = True
                  break
          if changed:
              _write_jobs(jobs)
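
The respect_killed flag closes a small race: a worker that finishes a job the user has already killed would otherwise flip the status back to done. The guard can be illustrated on a plain dict; the status strings 'killed' and 'done' below match the ones used in the purge filter earlier in this file:

    def apply_update(job: dict, respect_killed: bool = True, **fields) -> dict:
        # Reduced version of the guard in _update_job: drop a status change aimed at a killed job
        if respect_killed and job.get("status") == "killed" and "status" in fields:
            fields = {k: v for k, v in fields.items() if k != "status"}
        job.update(fields)
        return job

    job = {"id": 7, "status": "killed"}
    apply_update(job, status="done", exit_code=0)
    print(job)  # {'id': 7, 'status': 'killed', 'exit_code': 0} -- the kill wins, other fields still land
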
@@ -479,10 +687,27 @@ def _process_pending_chains():
      # Get parse results from job
      parse_result = job_to_chain.get('parse_result', {})

-     if not parse_result or 'error' in parse_result:
-         # No results or parse error - skip chaining
-         _update_job(jid, chained=True)
-         _append_worker_log(f"job {jid}: no parse results, skipping chain")
+     if not parse_result:
+         # No parse results - this shouldn't happen if job was properly marked chainable
+         # Log warning and store reason for debugging
+         logger.warning("Job marked chainable but has no parse_result", extra={
+             "job_id": jid,
+             "tool": tool,
+             "status": job_to_chain.get('status')
+         })
+         _append_worker_log(f"job {jid}: WARNING - marked chainable but parse_result is empty/missing")
+         _update_job(jid, chained=True, chain_skip_reason="parse_result missing")
+         return 1
+
+     if 'error' in parse_result:
+         # Parse had an error - log and skip
+         logger.warning("Job has parse error, skipping chaining", extra={
+             "job_id": jid,
+             "tool": tool,
+             "parse_error": parse_result.get('error')
+         })
+         _append_worker_log(f"job {jid}: parse error '{parse_result.get('error')}', skipping chain")
+         _update_job(jid, chained=True, chain_skip_reason=f"parse_error: {parse_result.get('error')}")
          return 1

      # Process auto-chaining
@@ -571,10 +796,35 @@ def _try_run_plugin(tool: str, target: str, args: List[str], label: str, log_pat
      cmd_spec = build_command_method(target, args or [], label or "", log_path)

      if cmd_spec is None:
-         # Plugin validation failed
-         with open(log_path, "a", encoding="utf-8", errors="replace") as fh:
-             fh.write("ERROR: Plugin validation failed (build_command returned None)\n")
-         return (True, 1)
+         # build_command returned None - check if plugin has run() method
+         # This allows plugins to signal "use run() instead" by returning None
+         run_method = getattr(plugin, "run", None)
+         if callable(run_method):
+             # Plugin wants to handle execution itself via run() method
+             sig = inspect.signature(run_method)
+             params = list(sig.parameters.keys())
+
+             try:
+                 if "log_path" in params:
+                     rc = run_method(target, args or [], label or "", log_path)
+                 elif "label" in params:
+                     rc = run_method(target, args or [], label or "")
+                 elif "args" in params:
+                     rc = run_method(target, args or [])
+                 else:
+                     rc = run_method(target)
+                 return (True, rc if isinstance(rc, int) else 0)
+             except Exception as e:
+                 with open(log_path, "a", encoding="utf-8", errors="replace") as fh:
+                     fh.write(f"\n=== PLUGIN RUN ERROR ===\n")
+                     fh.write(f"{type(e).__name__}: {e}\n")
+                     fh.write(f"\n{traceback.format_exc()}\n")
+                 return (True, 1)
+         else:
+             # No run() method either - actual validation failure
+             with open(log_path, "a", encoding="utf-8", errors="replace") as fh:
+                 fh.write("ERROR: Plugin validation failed (build_command returned None)\n")
+             return (True, 1)

      # Execute using new subprocess handler with PID tracking
      rc = _run_subprocess_with_spec(cmd_spec, log_path, jid=jid, plugin=plugin)
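
With this change a plugin may return None from build_command and expose a run() method instead; the dispatcher inspects run()'s parameters to decide how many arguments to pass. A hedged sketch of a plugin that would take this path — the class shape is an assumption for illustration, only the signature-based dispatch mirrors the diff:

    import inspect
    import tempfile
    from typing import List

    class EchoPlugin:
        def build_command(self, target: str, args: List[str], label: str, log_path: str):
            return None  # signal: "call run() instead of building a command line"

        def run(self, target: str, args: List[str], label: str, log_path: str) -> int:
            with open(log_path, "a", encoding="utf-8") as fh:
                fh.write(f"echo plugin ran against {target} ({label})\n")
            return 0

    def dispatch_run(plugin, target, args, label, log_path) -> int:
        # Core of the fallback above: match the call to whatever run() accepts
        run_method = getattr(plugin, "run", None)
        if not callable(run_method):
            return 1
        params = list(inspect.signature(run_method).parameters.keys())
        if "log_path" in params:
            return run_method(target, args or [], label or "", log_path)
        if "label" in params:
            return run_method(target, args or [], label or "")
        if "args" in params:
            return run_method(target, args or [])
        return run_method(target)

    tmp = tempfile.NamedTemporaryFile(suffix=".log", delete=False)
    tmp.close()
    print(dispatch_run(EchoPlugin(), "198.51.100.10", [], "demo", tmp.name))  # 0
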
@@ -773,6 +1023,55 @@ def _store_msf_session(jid: int, target: str, exploit_path: str, session_id: str
          _append_worker_log(f"job {jid}: session storage error: {e}")


+ # Cache stdbuf availability check
+ _stdbuf_available = None
+
+
+ def _is_stdbuf_available() -> bool:
+     """Check if stdbuf is available for line-buffered output."""
+     global _stdbuf_available
+     if _stdbuf_available is None:
+         _stdbuf_available = shutil.which('stdbuf') is not None
+     return _stdbuf_available
+
+
+ def _wrap_cmd_for_line_buffering(cmd: List[str]) -> List[str]:
+     """
+     Wrap a command with stdbuf for line-buffered output when available.
+
+     This ensures output is written line-by-line instead of in 4-8KB blocks,
+     improving real-time log monitoring and ensuring output is captured
+     before process termination.
+
+     Args:
+         cmd: Command to wrap
+
+     Returns:
+         Command wrapped with stdbuf if available, original command otherwise
+     """
+     if not cmd:
+         return cmd
+
+     if _is_stdbuf_available():
+         # stdbuf -oL = line-buffered stdout, -eL = line-buffered stderr
+         return ['stdbuf', '-oL', '-eL'] + cmd
+
+     return cmd
+
+
+ def _get_subprocess_env() -> Dict[str, str]:
+     """
+     Get environment for subprocess with buffering disabled.
+
+     Sets PYTHONUNBUFFERED=1 for Python subprocesses and TERM=dumb
+     to prevent interactive terminal issues.
+     """
+     env = os.environ.copy()
+     env['TERM'] = 'dumb' # Prevent stty errors from interactive tools
+     env['PYTHONUNBUFFERED'] = '1' # Disable Python output buffering
+     return env
+
+
  def _run_subprocess_with_spec(cmd_spec: Dict[str, Any], log_path: str, jid: int = None, plugin=None) -> int:
      """
      Execute a command specification with proper PID tracking.
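
Both helpers are small enough to show their effect directly; stdbuf wrapping only happens when the binary is actually on PATH, so the wrapped form below is what you would see on a typical Linux host (the reimplementation here is a sketch of the same idea, not souleyez's own module):

    import os
    import shutil

    def wrap_for_line_buffering(cmd):
        # Same idea as _wrap_cmd_for_line_buffering above
        if cmd and shutil.which("stdbuf"):
            return ["stdbuf", "-oL", "-eL"] + cmd
        return cmd

    def subprocess_env():
        # Same idea as _get_subprocess_env above
        env = os.environ.copy()
        env["TERM"] = "dumb"           # keep interactive tools from probing the terminal
        env["PYTHONUNBUFFERED"] = "1"  # Python children flush output as they go
        return env

    print(wrap_for_line_buffering(["ping", "-c", "1", "127.0.0.1"]))
    # ['stdbuf', '-oL', '-eL', 'ping', '-c', '1', '127.0.0.1'] when stdbuf is installed
    env = subprocess_env()
    print(env["TERM"], env["PYTHONUNBUFFERED"])  # dumb 1
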
@@ -814,32 +1113,35 @@ def _run_subprocess_with_spec(cmd_spec: Dict[str, Any], log_path: str, jid: int
          with open(log_path, "a", encoding="utf-8", errors="replace") as fh:
              fh.write("ERROR: No command provided in spec\n")
          return 1
-
+
      timeout = cmd_spec.get('timeout', JOB_TIMEOUT_SECONDS)
-     env = cmd_spec.get('env')
+     spec_env = cmd_spec.get('env')
      cwd = cmd_spec.get('cwd')
      needs_shell = cmd_spec.get('needs_shell', False)
-
+
      _append_worker_log(f"_run_subprocess_with_spec: timeout={timeout}s for job {jid}")
-
-     # Prepare environment
-     # Set TERM=dumb to prevent stty errors from interactive tools like msfconsole
-     proc_env = os.environ.copy()
-     proc_env['TERM'] = 'dumb'
-     if env:
-         proc_env.update(env)
-
+
+     # Wrap command with stdbuf for line-buffered output (unless shell mode)
+     original_cmd = cmd
+     if not needs_shell:
+         cmd = _wrap_cmd_for_line_buffering(cmd)
+
+     # Prepare environment with PYTHONUNBUFFERED=1 and TERM=dumb
+     proc_env = _get_subprocess_env()
+     if spec_env:
+         proc_env.update(spec_env)
+
      with open(log_path, "a", encoding="utf-8", errors="replace") as fh:
          fh.write("=== Command Execution (build_command) ===\n")
-         fh.write(f"Command: {' '.join(cmd)}\n")
+         fh.write(f"Command: {' '.join(original_cmd)}\n")
          fh.write(f"Timeout: {timeout} seconds\n")
-         if env:
-             fh.write(f"Environment: {env}\n")
+         if spec_env:
+             fh.write(f"Environment: {spec_env}\n")
          if cwd:
              fh.write(f"Working Dir: {cwd}\n")
          fh.write(f"Started: {time.strftime('%Y-%m-%d %H:%M:%S UTC', time.gmtime())}\n\n")
          fh.flush()
-
+
          try:
              # Create new process group so all children can be killed together
              # Redirect stdin to /dev/null to prevent password prompts from hanging
@@ -849,16 +1151,17 @@ def _run_subprocess_with_spec(cmd_spec: Dict[str, Any], log_path: str, jid: int
                  stdout=fh,
                  stderr=subprocess.STDOUT,
                  preexec_fn=os.setsid, # Creates new session
-                 env=proc_env, # Always use proc_env (includes TERM=dumb)
+                 env=proc_env,
                  cwd=cwd,
                  shell=needs_shell # nosec B602 - intentional for security tool command execution
              )

-             # Store PID if job ID provided
+             # Store PID and process start time for stale detection
              if jid is not None:
-                 _update_job(jid, pid=proc.pid)
+                 proc_start_time = _get_process_start_time(proc.pid)
+                 _update_job(jid, pid=proc.pid, process_start_time=proc_start_time)
                  _append_worker_log(f"job {jid}: running with PID {proc.pid}")
-
+
              # Wait for process with timeout
              try:
                  proc.wait(timeout=timeout)
@@ -890,6 +1193,7 @@ def _run_subprocess_with_spec(cmd_spec: Dict[str, Any], log_path: str, jid: int
                      return 0
                  else:
                      fh.write(f"\nERROR: Command timed out after {timeout} seconds\n")
+                     fh.flush()
                      return 124

              # Check if job was killed externally during execution
@@ -912,17 +1216,21 @@ def _run_subprocess_with_spec(cmd_spec: Dict[str, Any], log_path: str, jid: int
                      proc.wait(timeout=5)
                  except:
                      pass
+                 fh.flush()
                  return 143 # 128 + 15 (SIGTERM)
-
+
              fh.write(f"\n=== Completed: {time.strftime('%Y-%m-%d %H:%M:%S UTC', time.gmtime())} ===\n")
              fh.write(f"Exit Code: {proc.returncode}\n")
+             fh.flush()
              return proc.returncode
-
+
          except FileNotFoundError:
              fh.write(f"\nERROR: Tool not found: {cmd[0]}\n")
+             fh.flush()
              return 127
          except Exception as e:
              fh.write(f"\nERROR: {type(e).__name__}: {e}\n")
+             fh.flush()
              return 1

@@ -937,9 +1245,14 @@ def _run_subprocess(tool: str, target: str, args: List[str], log_path: str, jid:
      cmd = [tool] + (args or [])
      cmd = [c.replace("<target>", target) for c in cmd]

+     # Wrap command with stdbuf for line-buffered output
+     cmd = _wrap_cmd_for_line_buffering(cmd)
+
      with open(log_path, "a", encoding="utf-8", errors="replace") as fh:
+         # Log original command (without stdbuf wrapper for clarity)
+         original_cmd = cmd[3:] if cmd[:3] == ['stdbuf', '-oL', '-eL'] else cmd
          fh.write("=== Subprocess Execution ===\n")
-         fh.write(f"Command: {' '.join(cmd)}\n")
+         fh.write(f"Command: {' '.join(original_cmd)}\n")
          fh.write(f"Timeout: {timeout} seconds\n")
          fh.write(f"Started: {time.strftime('%Y-%m-%d %H:%M:%S UTC', time.gmtime())}\n\n")
          fh.flush()
@@ -947,9 +1260,8 @@ def _run_subprocess(tool: str, target: str, args: List[str], log_path: str, jid:
          try:
              # Create new process group so all children can be killed together
              # Redirect stdin to /dev/null to prevent password prompts from hanging
-             # Set TERM=dumb to prevent stty errors from interactive tools like msfconsole
-             env = os.environ.copy()
-             env['TERM'] = 'dumb'
+             # Use env with PYTHONUNBUFFERED=1 and TERM=dumb
+             env = _get_subprocess_env()

              proc = subprocess.Popen(
                  cmd,
@@ -960,9 +1272,10 @@ def _run_subprocess(tool: str, target: str, args: List[str], log_path: str, jid:
                  env=env
              )

-             # Store PID if job ID provided
+             # Store PID and process start time for stale detection
              if jid is not None:
-                 _update_job(jid, pid=proc.pid)
+                 proc_start_time = _get_process_start_time(proc.pid)
+                 _update_job(jid, pid=proc.pid, process_start_time=proc_start_time)
                  _append_worker_log(f"job {jid}: running with PID {proc.pid}")

              # Wait for process with timeout
@@ -977,6 +1290,7 @@ def _run_subprocess(tool: str, target: str, args: List[str], log_path: str, jid:
                  proc.kill() # Fallback to single process
                  proc.wait()
                  fh.write(f"\nERROR: Command timed out after {timeout} seconds\n")
+                 fh.flush()
                  return 124

              # Check if job was killed externally during execution
@@ -999,17 +1313,21 @@ def _run_subprocess(tool: str, target: str, args: List[str], log_path: str, jid:
                      proc.wait(timeout=5)
                  except:
                      pass
+                 fh.flush()
                  return 143 # 128 + 15 (SIGTERM)

              fh.write(f"\n=== Completed: {time.strftime('%Y-%m-%d %H:%M:%S UTC', time.gmtime())} ===\n")
              fh.write(f"Exit Code: {proc.returncode}\n")
+             fh.flush()
              return proc.returncode

          except FileNotFoundError:
              fh.write(f"\nERROR: Tool not found: {cmd[0]}\n")
+             fh.flush()
              return 127
          except Exception as e:
              fh.write(f"\nERROR: {type(e).__name__}: {e}\n")
+             fh.flush()
              return 1

@@ -1145,77 +1463,110 @@ def run_job(jid: int) -> None:
          # Re-fetch job to get updated data
          job = get_job(jid)
          parse_result = handle_job_result(job)
-         if parse_result:
-             if 'error' in parse_result:
-                 logger.warning("Job parse error", extra={
+
+         # Handle parse failure cases
+         if parse_result is None:
+             # Parser returned None - likely missing log file, no parser for tool, or missing engagement
+             logger.error("Job parse returned None - results may be lost", extra={
+                 "job_id": jid,
+                 "tool": job.get('tool'),
+                 "log_exists": os.path.exists(job.get('log', '')) if job.get('log') else False
+             })
+             _append_worker_log(f"job {jid} parse returned None (tool={job.get('tool')}) - check if parser exists")
+             # Update job to indicate parse failure
+             _update_job(jid, status=STATUS_WARNING, parse_result={'error': 'Parser returned None - no results extracted'})
+             # Mark as chained to prevent infinite retry
+             _update_job(jid, chained=True)
+             return
+
+         if 'error' in parse_result:
+             logger.error("Job parse error - results may be incomplete", extra={
+                 "job_id": jid,
+                 "error": parse_result['error']
+             })
+             _append_worker_log(f"job {jid} parse error: {parse_result['error']}")
+             # Update job status to warning with the error
+             _update_job(jid, status=STATUS_WARNING, parse_result=parse_result)
+             # Mark as chained to prevent infinite retry
+             _update_job(jid, chained=True)
+             return
+
+         # Parse succeeded
+         logger.info("Job parsed successfully", extra={
+             "job_id": jid,
+             "parse_result": parse_result
+         })
+         _append_worker_log(f"job {jid} parsed: {parse_result}")
+
+         # Determine chainable status BEFORE updating to avoid race condition
+         # We must set parse_result and chainable in a single atomic update
+         try:
+             from souleyez.core.tool_chaining import ToolChaining
+             chaining = ToolChaining()
+
+             # Get current job to check status
+             job = get_job(jid)
+             job_status = job.get('status', STATUS_ERROR)
+
+             # Determine final status from parser if provided
+             final_status = parse_result.get('status', job_status)
+
+             # Check if job should be chainable
+             should_chain = (
+                 chaining.is_enabled() and
+                 parse_result and
+                 'error' not in parse_result and
+                 is_chainable(final_status)
+             )
+
+             # Build update dict - ATOMIC update of parse_result + chainable
+             update_fields = {'parse_result': parse_result}
+
+             if 'status' in parse_result:
+                 update_fields['status'] = final_status
+                 logger.info("Job status updated from parser", extra={
                      "job_id": jid,
-                     "error": parse_result['error']
+                     "status": final_status
                  })
-                 _append_worker_log(f"job {jid} parse error: {parse_result['error']}")
+                 _append_worker_log(f"job {jid} status updated to: {final_status}")
+
+             if should_chain:
+                 update_fields['chainable'] = True
              else:
-                 logger.info("Job parsed successfully", extra={
-                     "job_id": jid,
-                     "parse_result": parse_result
-                 })
-                 _append_worker_log(f"job {jid} parsed: {parse_result}")
+                 # Not chainable - mark as chained to skip
+                 update_fields['chained'] = True

-             # Update status based on parse result if provided
-             if 'status' in parse_result:
-                 final_status = parse_result['status']
-                 _update_job(jid, status=final_status, parse_result=parse_result)
-                 logger.info("Job status updated from parser", extra={
+             # Single atomic update to prevent race condition
+             _update_job(jid, **update_fields)
+
+             # Log chaining decision
+             if should_chain:
+                 if final_status == STATUS_WARNING:
+                     logger.info("Job with warning status marked for chaining", extra={
                          "job_id": jid,
-                     "status": final_status
+                         "tool": job.get('tool'),
+                         "wildcard_detected": parse_result.get('wildcard_detected', False)
                      })
-                 _append_worker_log(f"job {jid} status updated to: {final_status}")
+                     _append_worker_log(f"job {jid} (status=warning) marked as chainable")
                  else:
-                 # Store parse result in job for dashboard display (no status update)
-                 _update_job(jid, parse_result=parse_result)
-
-         # Mark job as chainable instead of chaining immediately
-         # Worker loop will process it when database is idle
-         try:
-             from souleyez.core.tool_chaining import ToolChaining
-             chaining = ToolChaining()
-
-             # Re-fetch job to get updated status
-             job = get_job(jid)
-             job_status = job.get('status', STATUS_ERROR)
-
-             # Check if status is chainable (done, no_results, warning)
-             if chaining.is_enabled() and parse_result and 'error' not in parse_result and is_chainable(job_status):
-                 # Mark for deferred chaining
-                 _update_job(jid, chainable=True)
-
-                 # Log special handling for warning status
-                 if job_status == STATUS_WARNING:
-                     logger.info("Job with warning status marked for chaining", extra={
-                         "job_id": jid,
-                         "tool": job.get('tool'),
-                         "wildcard_detected": parse_result.get('wildcard_detected', False)
-                     })
-                     _append_worker_log(f"job {jid} (status=warning) marked as chainable")
-                 else:
-                     logger.info("Job marked as chainable", extra={
-                         "job_id": jid,
-                         "tool": job.get('tool'),
-                         "status": job_status
-                     })
-                     _append_worker_log(f"job {jid} marked as chainable (status={job_status})")
-             else:
-                 # Chaining disabled or job has errors - mark as chained (skip)
-                 _update_job(jid, chained=True)
-                 reason = f"chaining_disabled={not chaining.is_enabled()}, has_error={'error' in parse_result}, status={job_status}"
-                 _append_worker_log(f"job {jid} not chainable ({reason})")
-
-         except Exception as chain_err:
-             logger.error("Failed to mark job as chainable", extra={
+                     logger.info("Job marked as chainable", extra={
                          "job_id": jid,
-                 "error": str(chain_err)
+                         "tool": job.get('tool'),
+                         "status": final_status
                      })
-             _append_worker_log(f"job {jid} chainable marking error: {chain_err}")
-             # Mark as chained to prevent retry loops
-             _update_job(jid, chained=True, chain_error=str(chain_err))
+                     _append_worker_log(f"job {jid} marked as chainable (status={final_status})")
+             else:
+                 reason = f"chaining_disabled={not chaining.is_enabled()}, has_error={'error' in parse_result}, status={final_status}"
+                 _append_worker_log(f"job {jid} not chainable ({reason})")
+
+         except Exception as chain_err:
+             logger.error("Failed to mark job as chainable", extra={
+                 "job_id": jid,
+                 "error": str(chain_err)
+             })
+             _append_worker_log(f"job {jid} chainable marking error: {chain_err}")
+             # Mark as chained to prevent retry loops
+             _update_job(jid, chained=True, chain_error=str(chain_err))

      except Exception as e:
          logger.error("Job parse exception", extra={
@@ -1378,18 +1729,46 @@ def _detect_and_recover_stale_jobs() -> int:
          pid = job.get('pid')
          tool = job.get('tool', 'unknown')
          log_path = job.get('log')
+         stored_start_time = job.get('process_start_time')

-         # Skip if PID is still alive
+         # Check if PID is alive
          if _is_pid_alive(pid):
-             continue
-
-         # PID is dead - this is a stale job
-         _append_worker_log(f"job {jid}: detected stale (PID {pid} is dead)")
-         logger.warning("Stale job detected", extra={
-             "job_id": jid,
-             "tool": tool,
-             "pid": pid
-         })
+             # PID is alive - but check for PID reuse
+             if stored_start_time is not None:
+                 current_start_time = _get_process_start_time(pid)
+                 if current_start_time is not None:
+                     # Allow 2 second tolerance for timing differences
+                     if abs(current_start_time - stored_start_time) > 2:
+                         # PID reused by different process
+                         _append_worker_log(
+                             f"job {jid}: PID {pid} reused (stored start: {stored_start_time:.0f}, "
+                             f"current: {current_start_time:.0f})"
+                         )
+                         logger.warning("PID reuse detected", extra={
+                             "job_id": jid,
+                             "tool": tool,
+                             "pid": pid,
+                             "stored_start_time": stored_start_time,
+                             "current_start_time": current_start_time
+                         })
+                         # Fall through to stale job handling
+                     else:
+                         # Same process, still running
+                         continue
+                 else:
+                     # Can't get current start time, assume still valid
+                     continue
+             else:
+                 # No stored start time (old job), assume still valid
+                 continue
+         else:
+             # PID is dead - definitely stale
+             _append_worker_log(f"job {jid}: detected stale (PID {pid} is dead)")
+             logger.warning("Stale job detected", extra={
+                 "job_id": jid,
+                 "tool": tool,
+                 "pid": pid
+             })

          # Check if log shows completion
          completed, exit_code = _check_log_for_completion(log_path, tool)
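
The start-time comparison is what separates "our process is still running" from "the kernel recycled that PID for something else". A condensed sketch of the check (Linux only; for brevity it compares seconds-since-boot straight from /proc rather than converting to a Unix timestamp as the diff does, which does not change the comparison):

    import os
    from typing import Optional

    def start_seconds_since_boot(pid: int) -> Optional[float]:
        # starttime is field 22 of /proc/<pid>/stat; split after the ')' that ends the comm field
        try:
            with open(f"/proc/{pid}/stat", "r") as f:
                fields = f.read().rsplit(")", 1)[1].split()
            return int(fields[19]) / os.sysconf(os.sysconf_names["SC_CLK_TCK"])
        except (OSError, IndexError, ValueError):
            return None

    def same_process(pid: int, recorded_start: float, tolerance: float = 2.0) -> bool:
        current = start_seconds_since_boot(pid)
        return current is not None and abs(current - recorded_start) <= tolerance

    me = os.getpid()
    recorded = start_seconds_since_boot(me)   # what a job record would have stored at spawn time
    print(recorded is not None and same_process(me, recorded))
    # True; a recycled PID would fall far outside the 2-second window
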
@@ -1412,6 +1791,8 @@ def _detect_and_recover_stale_jobs() -> int:
          # Try to parse results
          try:
              from .result_handler import handle_job_result
+             from souleyez.core.tool_chaining import ToolChaining
+
              job = get_job(jid)
              parse_result = handle_job_result(job)

@@ -1419,36 +1800,34 @@ def _detect_and_recover_stale_jobs() -> int:
                  if 'error' in parse_result:
                      _append_worker_log(f"job {jid} stale recovery parse error: {parse_result['error']}")
                  else:
-                     # Update status from parser if provided
+                     # Determine final status and chainable in one check
+                     final_status = parse_result.get('status', status)
+                     chaining = ToolChaining()
+                     should_chain = chaining.is_enabled() and is_chainable(final_status)
+
+                     # Build atomic update - parse_result + status + chainable together
+                     update_fields = {'parse_result': parse_result}
                      if 'status' in parse_result:
-                         status = parse_result['status']
-                         _update_job(jid, status=status, parse_result=parse_result)
-                     else:
-                         _update_job(jid, parse_result=parse_result)
+                         update_fields['status'] = final_status
+                     if should_chain:
+                         update_fields['chainable'] = True
+
+                     # Single atomic update to prevent race condition
+                     _update_job(jid, **update_fields)

                      _append_worker_log(f"job {jid} stale recovery parsed: {parse_result.get('findings_added', 0)} findings")

                      logger.info("Stale job recovered with results", extra={
                          "job_id": jid,
                          "tool": tool,
-                         "status": status,
-                         "parse_result": parse_result
+                         "status": final_status,
+                         "parse_result": parse_result,
+                         "chainable": should_chain
                      })

-                     # Mark for auto-chaining if conditions are met
-                     try:
-                         from souleyez.core.tool_chaining import ToolChaining
-                         chaining = ToolChaining()
-                         if chaining.is_enabled() and is_chainable(status):
-                             _update_job(jid, chainable=True)
-                             _append_worker_log(f"job {jid} stale recovery marked as chainable")
-                             logger.info("Stale job marked as chainable", extra={
-                                 "job_id": jid,
-                                 "tool": tool,
-                                 "status": status
-                             })
-                     except Exception as chain_err:
-                         _append_worker_log(f"job {jid} stale recovery chainable error: {chain_err}")
+                     if should_chain:
+                         _append_worker_log(f"job {jid} stale recovery marked as chainable")
+
          except Exception as parse_err:
              _append_worker_log(f"job {jid} stale recovery parse exception: {parse_err}")

@@ -1608,26 +1987,85 @@ def _check_msf_exploitation_success():
      return 0


+ def _update_job_progress():
+     """
+     Update progress tracking for running jobs.
+
+     Checks log file modification times and flags jobs with no recent output
+     as possibly hung (no output for JOB_HUNG_THRESHOLD seconds).
+     """
+     try:
+         jobs = _read_jobs()
+         running_jobs = [j for j in jobs if j.get('status') == STATUS_RUNNING]
+
+         for job in running_jobs:
+             jid = job.get('id')
+             log_path = job.get('log')
+
+             if not log_path or not os.path.exists(log_path):
+                 continue
+
+             try:
+                 # Get log file modification time
+                 mtime = os.path.getmtime(log_path)
+                 current_time = time.time()
+                 time_since_output = current_time - mtime
+
+                 # Update last_output_at in job record
+                 updates = {'last_output_at': mtime}
+
+                 # Flag as possibly hung if no output for threshold
+                 was_hung = job.get('possibly_hung', False)
+                 is_hung = time_since_output > JOB_HUNG_THRESHOLD
+
+                 if is_hung != was_hung:
+                     updates['possibly_hung'] = is_hung
+                     if is_hung:
+                         _append_worker_log(
+                             f"job {jid}: no output for {int(time_since_output)}s, flagged as possibly hung"
+                         )
+                         logger.warning("Job possibly hung", extra={
+                             "job_id": jid,
+                             "tool": job.get('tool'),
+                             "time_since_output": int(time_since_output)
+                         })
+
+                 _update_job(jid, **updates)
+
+             except Exception as e:
+                 # Non-critical, just skip this job
+                 pass
+
+     except Exception as e:
+         logger.error("Job progress tracking error", extra={"error": str(e)})
+
+
  def worker_loop(poll_interval: float = 2.0):
      """
      Main worker loop that processes jobs and handles auto-chaining.

      Loop behavior:
-     1. Detect and recover stale jobs (dead PIDs)
-     2. Check for running jobs
-     3. If none running, start next queued job
-     4. Process one chainable job (if any)
-     5. Sleep poll_interval seconds, repeat
+     1. Update heartbeat for health monitoring
+     2. Detect and recover stale jobs (dead PIDs)
+     3. Update progress tracking for running jobs
+     4. Check for running jobs
+     5. If none running, start next queued job
+     6. Process one chainable job (if any)
+     7. Sleep poll_interval seconds, repeat

      Args:
          poll_interval: Seconds to sleep between iterations (default: 2.0)
      """
      _ensure_dirs()
+     _update_heartbeat() # Initial heartbeat
      _append_worker_log("souleyez background worker: starting loop")

-     # Track last stale job check time (check every 30 seconds, not every iteration)
+     # Track last stale job check time (check every 15 seconds, not every iteration)
      last_stale_check = 0
-     stale_check_interval = 30 # seconds
+     stale_check_interval = 15 # seconds (reduced from 30s for faster detection)
+
+     # Track last heartbeat time
+     last_heartbeat = time.time()

      # Run stale job detection on startup
      try:
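
Because both the heartbeat file and the possibly_hung flag end up on disk, an outside observer (a dashboard, a cron check) can judge worker health without touching the process. A hedged sketch of such a check — the file locations are passed in explicitly since the concrete souleyez data paths are not shown in this diff, and the "running" status string is an assumption:

    import json
    import time

    def worker_report(heartbeat_file: str, jobs_file: str, stale_after: float = 30) -> dict:
        # Heartbeat age: the loop above rewrites this file roughly every HEARTBEAT_INTERVAL seconds
        try:
            with open(heartbeat_file) as fh:
                heartbeat_age = time.time() - float(fh.read().strip())
        except (OSError, ValueError):
            heartbeat_age = None

        # Jobs that _update_job_progress flagged after seeing no log output for too long
        try:
            with open(jobs_file, encoding="utf-8") as fh:
                jobs = json.load(fh)
        except (OSError, ValueError):
            jobs = []
        hung = [j.get("id") for j in jobs
                if j.get("status") == "running" and j.get("possibly_hung")]

        return {
            "worker_alive": heartbeat_age is not None and heartbeat_age <= stale_after,
            "heartbeat_age_seconds": heartbeat_age,
            "possibly_hung_jobs": hung,
        }

    # Example: print(worker_report("/path/to/.worker_heartbeat", "/path/to/jobs.json"))
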
@@ -1639,8 +2077,14 @@ def worker_loop(poll_interval: float = 2.0):

      try:
          while True:
-             # Periodic stale job detection (every 30 seconds)
              current_time = time.time()
+
+             # Update heartbeat every HEARTBEAT_INTERVAL seconds
+             if current_time - last_heartbeat >= HEARTBEAT_INTERVAL:
+                 _update_heartbeat()
+                 last_heartbeat = current_time
+
+             # Periodic stale job detection (every 15 seconds)
              if current_time - last_stale_check >= stale_check_interval:
                  try:
                      recovered = _detect_and_recover_stale_jobs()
@@ -1650,6 +2094,12 @@ def worker_loop(poll_interval: float = 2.0):
                      _append_worker_log(f"stale job detection error: {e}")
                  last_stale_check = current_time

+             # Update progress tracking for running jobs
+             try:
+                 _update_job_progress()
+             except Exception as e:
+                 _append_worker_log(f"progress tracking error: {e}")
+
              # Check running MSF jobs for exploitation success (every iteration)
              try:
                  detected = _check_msf_exploitation_success()