souleyez-2.26.0-py3-none-any.whl → souleyez-2.28.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -25,6 +25,7 @@ import subprocess
  import threading
  import inspect
  import traceback
+ import fcntl
  from typing import List, Dict, Optional, Any
  from souleyez.log_config import get_logger
  from .log_sanitizer import LogSanitizer
@@ -41,7 +42,12 @@ JOBS_DIR = os.path.join(DATA_DIR, "jobs")
  LOGS_DIR = os.path.join(DATA_DIR, "logs")
  JOBS_FILE = os.path.join(JOBS_DIR, "jobs.json")
  WORKER_LOG = os.path.join(LOGS_DIR, "worker.log")
+ HEARTBEAT_FILE = os.path.join(JOBS_DIR, ".worker_heartbeat")
  JOB_TIMEOUT_SECONDS = 3600 # 1 hour (changed from 300s/5min)
+ HEARTBEAT_INTERVAL = 10 # seconds between heartbeat writes
+ HEARTBEAT_STALE_THRESHOLD = 30 # seconds before heartbeat considered stale
+ JOB_HUNG_THRESHOLD = 300 # 5 minutes with no output = possibly hung
+ JOBS_BACKUP_COUNT = 3 # Number of rotating backups to keep

  _lock = threading.RLock() # Reentrant lock allows nested acquisition by same thread

@@ -51,6 +57,63 @@ def _ensure_dirs():
  os.makedirs(LOGS_DIR, exist_ok=True)


+ def _get_backup_files() -> List[str]:
+ """Get list of backup files sorted by modification time (newest first)."""
+ backups = []
+ for i in range(1, JOBS_BACKUP_COUNT + 1):
+ backup_path = f"{JOBS_FILE}.bak.{i}"
+ if os.path.exists(backup_path):
+ backups.append((os.path.getmtime(backup_path), backup_path))
+ # Sort by mtime descending (newest first)
+ backups.sort(reverse=True)
+ return [path for _, path in backups]
+
+
+ def _rotate_backups():
+ """Rotate backup files, keeping only JOBS_BACKUP_COUNT backups."""
+ # Shift existing backups: .bak.2 -> .bak.3, .bak.1 -> .bak.2
+ for i in range(JOBS_BACKUP_COUNT, 1, -1):
+ src = f"{JOBS_FILE}.bak.{i - 1}"
+ dst = f"{JOBS_FILE}.bak.{i}"
+ if os.path.exists(src):
+ try:
+ shutil.move(src, dst)
+ except Exception:
+ pass
+
+ # Create new .bak.1 from current jobs.json
+ if os.path.exists(JOBS_FILE):
+ try:
+ shutil.copy2(JOBS_FILE, f"{JOBS_FILE}.bak.1")
+ except Exception:
+ pass
+
+
+ def _recover_from_backup() -> List[Dict[str, Any]]:
+ """
+ Attempt to recover jobs from backup files.
+
+ Returns:
+ List of jobs from the first valid backup, or empty list if no valid backup found
+ """
+ backups = _get_backup_files()
+ for backup_path in backups:
+ try:
+ with open(backup_path, "r", encoding="utf-8") as fh:
+ jobs = json.load(fh)
+ if isinstance(jobs, list):
+ _append_worker_log(f"recovered {len(jobs)} jobs from backup: {backup_path}")
+ logger.info("Jobs recovered from backup", extra={
+ "backup_path": backup_path,
+ "job_count": len(jobs)
+ })
+ return jobs
+ except Exception as e:
+ _append_worker_log(f"backup {backup_path} also corrupt: {e}")
+ continue
+ return []
+
+
  def _read_jobs() -> List[Dict[str, Any]]:
  _ensure_dirs()
  if not os.path.exists(JOBS_FILE):
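
The rotation scheme keeps the last JOBS_BACKUP_COUNT good copies, with `.bak.1` always the most recent snapshot and `.bak.3` the oldest. A minimal standalone sketch of the same pattern (hypothetical paths, not the package's own code):

    import os, shutil

    BACKUP_COUNT = 3  # mirrors JOBS_BACKUP_COUNT

    def rotate(path: str, count: int = BACKUP_COUNT) -> None:
        # Shift .bak.2 -> .bak.3, .bak.1 -> .bak.2, then snapshot the current file as .bak.1
        for i in range(count, 1, -1):
            src, dst = f"{path}.bak.{i - 1}", f"{path}.bak.{i}"
            if os.path.exists(src):
                shutil.move(src, dst)
        if os.path.exists(path):
            shutil.copy2(path, f"{path}.bak.1")
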
@@ -58,18 +121,42 @@ def _read_jobs() -> List[Dict[str, Any]]:
  try:
  with open(JOBS_FILE, "r", encoding="utf-8") as fh:
  return json.load(fh)
- except Exception:
+ except Exception as e:
+ # Log corruption event
+ _append_worker_log(f"jobs.json corrupt: {e}")
+ logger.error("Jobs file corrupted", extra={
+ "error": str(e),
+ "jobs_file": JOBS_FILE
+ })
+
+ # Try to recover from backup
+ recovered_jobs = _recover_from_backup()
+
+ # Move corrupt file aside
  try:
  corrupt = JOBS_FILE + ".corrupt." + str(int(time.time()))
  shutil.move(JOBS_FILE, corrupt)
- _append_worker_log(f"jobs file corrupt; moved to {corrupt}")
+ _append_worker_log(f"corrupt jobs file moved to {corrupt}")
  except Exception:
  pass
- return []
+
+ # If we recovered jobs, write them back
+ if recovered_jobs:
+ try:
+ _write_jobs(recovered_jobs)
+ _append_worker_log(f"restored {len(recovered_jobs)} jobs from backup")
+ except Exception as write_err:
+ _append_worker_log(f"failed to restore jobs: {write_err}")
+
+ return recovered_jobs


  def _write_jobs(jobs: List[Dict[str, Any]]):
  _ensure_dirs()
+
+ # Rotate backups before writing (keeps last 3 good copies)
+ _rotate_backups()
+
  tmp = tempfile.NamedTemporaryFile("w", delete=False, dir=JOBS_DIR, encoding="utf-8")
  try:
  json.dump(jobs, tmp, indent=2, ensure_ascii=False)
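
`_write_jobs` stages the JSON into a temp file in the jobs directory before swapping it into place; the swap itself is outside this hunk, so the sketch below only shows the usual write-temp-then-replace pattern under that assumption, not the package's exact implementation:

    import json, os, tempfile

    def atomic_write_json(path: str, data) -> None:
        # Write to a temp file in the same directory, then atomically swap it in.
        dir_ = os.path.dirname(path) or "."
        fd, tmp = tempfile.mkstemp(dir=dir_, suffix=".tmp")
        try:
            with os.fdopen(fd, "w", encoding="utf-8") as fh:
                json.dump(data, fh, indent=2, ensure_ascii=False)
                fh.flush()
                os.fsync(fh.fileno())
            os.replace(tmp, path)  # atomic rename on POSIX
        except Exception:
            os.unlink(tmp)
            raise
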
@@ -93,36 +180,135 @@ def _append_worker_log(msg: str):
  fh.write(line)


+ def _update_heartbeat():
+ """Write current timestamp to heartbeat file for health monitoring."""
+ _ensure_dirs()
+ try:
+ with open(HEARTBEAT_FILE, 'w') as fh:
+ fh.write(str(time.time()))
+ except Exception:
+ pass # Non-critical, don't crash worker
+
+
+ def get_heartbeat_age() -> Optional[float]:
+ """
+ Get age of worker heartbeat in seconds.
+
+ Returns:
+ Age in seconds, or None if heartbeat file doesn't exist
+ """
+ try:
+ if os.path.exists(HEARTBEAT_FILE):
+ with open(HEARTBEAT_FILE, 'r') as fh:
+ last_beat = float(fh.read().strip())
+ return time.time() - last_beat
+ return None
+ except Exception:
+ return None
+
+
+ def is_heartbeat_stale() -> bool:
+ """Check if worker heartbeat is stale (older than threshold)."""
+ age = get_heartbeat_age()
+ if age is None:
+ return True # No heartbeat = stale
+ return age > HEARTBEAT_STALE_THRESHOLD
+
+
+ def _get_process_start_time(pid: int) -> Optional[float]:
+ """
+ Get process start time from /proc filesystem (Linux only).
+
+ Returns:
+ Process start time as Unix timestamp, or None if not available
+ """
+ try:
+ stat_path = f"/proc/{pid}/stat"
+ if not os.path.exists(stat_path):
+ return None
+
+ with open(stat_path, 'r') as f:
+ stat = f.read()
+
+ # Parse stat file - field 22 is starttime (in clock ticks since boot)
+ # Format: pid (comm) state ppid pgrp session tty_nr ... starttime ...
+ # Need to handle comm field which may contain spaces/parentheses
+ parts = stat.rsplit(')', 1)
+ if len(parts) < 2:
+ return None
+
+ fields = parts[1].split()
+ if len(fields) < 20:
+ return None
+
+ starttime_ticks = int(fields[19]) # 0-indexed, field 22 is at index 19 after comm
+
+ # Convert to timestamp using system boot time and clock ticks per second
+ with open('/proc/stat', 'r') as f:
+ for line in f:
+ if line.startswith('btime'):
+ boot_time = int(line.split()[1])
+ break
+ else:
+ return None
+
+ # Get clock ticks per second (usually 100)
+ ticks_per_sec = os.sysconf(os.sysconf_names['SC_CLK_TCK'])
+
+ return boot_time + (starttime_ticks / ticks_per_sec)
+ except Exception:
+ return None
+
+
  def _next_job_id(jobs: List[Dict[str, Any]]) -> int:
  """
- Get next available job ID.
-
- Uses a persistent counter to ensure IDs are never reused, even after
- jobs are purged. This prevents worker confusion when job IDs overlap.
+ Get next available job ID with file locking.
+
+ Uses a persistent counter with fcntl locking to ensure IDs are never
+ reused, even across multiple processes. This prevents duplicate job IDs
+ when multiple jobs are enqueued concurrently.
  """
  counter_file = os.path.join(JOBS_DIR, ".job_counter")
-
+ lock_file = os.path.join(JOBS_DIR, ".job_counter.lock")
+
  try:
- # Try to read persistent counter
- if os.path.exists(counter_file):
- with open(counter_file, 'r') as f:
- next_id = int(f.read().strip())
- else:
- # Initialize from existing jobs
- maxid = 0
- for j in jobs:
- try:
- if isinstance(j.get("id"), int) and j["id"] > maxid:
- maxid = j["id"]
- except Exception:
- continue
- next_id = maxid + 1
-
- # Write incremented counter for next time
- with open(counter_file, 'w') as f:
- f.write(str(next_id + 1))
-
- return next_id
+ _ensure_dirs()
+
+ # Use a separate lock file to allow atomic read-modify-write
+ with open(lock_file, 'w') as lock_fh:
+ # Acquire exclusive lock (blocks until available)
+ fcntl.flock(lock_fh.fileno(), fcntl.LOCK_EX)
+
+ try:
+ # Read current counter
+ if os.path.exists(counter_file):
+ with open(counter_file, 'r') as f:
+ next_id = int(f.read().strip())
+ else:
+ # Initialize from existing jobs
+ maxid = 0
+ for j in jobs:
+ try:
+ if isinstance(j.get("id"), int) and j["id"] > maxid:
+ maxid = j["id"]
+ except Exception:
+ continue
+ next_id = maxid + 1
+
+ # Write incremented counter atomically
+ tmp_file = counter_file + '.tmp'
+ with open(tmp_file, 'w') as f:
+ f.write(str(next_id + 1))
+ f.flush()
+ os.fsync(f.fileno())
+ os.replace(tmp_file, counter_file)
+
+ return next_id
+
+ finally:
+ # Release lock
+ fcntl.flock(lock_fh.fileno(), fcntl.LOCK_UN)
+
  except Exception:
  # Fallback to old behavior if file operations fail
  maxid = 0
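
The new `_next_job_id` serialises the counter update across processes with an exclusive `fcntl.flock` on a separate lock file, then writes the incremented value through a temp file and `os.replace`. A condensed standalone sketch of the same flock-guarded read-modify-write, with hypothetical paths:

    import fcntl, os

    def next_sequence(counter_file: str, lock_file: str) -> int:
        # Exclusive flock serialises concurrent callers on the same host.
        with open(lock_file, "w") as lock_fh:
            fcntl.flock(lock_fh.fileno(), fcntl.LOCK_EX)
            try:
                try:
                    with open(counter_file, "r") as f:
                        value = int(f.read().strip())
                except (FileNotFoundError, ValueError):
                    value = 1
                tmp = counter_file + ".tmp"
                with open(tmp, "w") as f:
                    f.write(str(value + 1))
                    f.flush()
                    os.fsync(f.fileno())
                os.replace(tmp, counter_file)  # atomic counter update
                return value
            finally:
                fcntl.flock(lock_fh.fileno(), fcntl.LOCK_UN)
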
@@ -135,7 +321,7 @@ def _next_job_id(jobs: List[Dict[str, Any]]) -> int:
  return maxid + 1


- def enqueue_job(tool: str, target: str, args: List[str], label: str = "", engagement_id: int = None, metadata: Dict[str, Any] = None, parent_id: int = None, reason: str = None, rule_id: int = None) -> int:
+ def enqueue_job(tool: str, target: str, args: List[str], label: str = "", engagement_id: int = None, metadata: Dict[str, Any] = None, parent_id: int = None, reason: str = None, rule_id: int = None, skip_scope_check: bool = False) -> int:
  with _lock:
  jobs = _read_jobs()
  jid = _next_job_id(jobs)
@@ -153,6 +339,43 @@ def enqueue_job(tool: str, target: str, args: List[str], label: str = "", engage

  # Merge parent_id, reason, and rule_id into metadata
  job_metadata = metadata or {}
+
+ # Scope validation - check if target is within engagement scope
+ if not skip_scope_check and engagement_id:
+ try:
+ from souleyez.security.scope_validator import ScopeValidator, ScopeViolationError
+ validator = ScopeValidator(engagement_id)
+ result = validator.validate_target(target)
+ enforcement = validator.get_enforcement_mode()
+
+ if not result.is_in_scope and validator.has_scope_defined():
+ if enforcement == 'block':
+ validator.log_validation(target, result, 'blocked', job_id=jid)
+ raise ScopeViolationError(
+ f"Target '{target}' is out of scope. {result.reason}"
+ )
+ elif enforcement == 'warn':
+ validator.log_validation(target, result, 'warned', job_id=jid)
+ if 'warnings' not in job_metadata:
+ job_metadata['warnings'] = []
+ job_metadata['warnings'].append(
+ f"SCOPE WARNING: {target} may be out of scope. {result.reason}"
+ )
+ logger.warning("Out-of-scope target allowed (warn mode)", extra={
+ "target": target,
+ "engagement_id": engagement_id,
+ "reason": result.reason
+ })
+ else:
+ validator.log_validation(target, result, 'allowed', job_id=jid)
+ except ScopeViolationError:
+ raise # Re-raise scope violations
+ except Exception as e:
+ # Don't block jobs if scope validation fails unexpectedly
+ logger.warning("Scope validation error (allowing job)", extra={
+ "target": target,
+ "error": str(e)
+ })
  if parent_id is not None:
  job_metadata['parent_id'] = parent_id
  if reason:
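
For callers, the practical change is that enqueueing an out-of-scope target can now raise before the job is queued when the engagement's enforcement mode is 'block'. A hedged caller-side sketch (the target, engagement ID, and printed messages are illustrative only):

    from souleyez.security.scope_validator import ScopeViolationError

    try:
        jid = enqueue_job("nmap", "203.0.113.7", ["-sV"], engagement_id=42)
    except ScopeViolationError as exc:
        print(f"refused: {exc}")      # 'block' mode raises before the job is queued
    else:
        print(f"queued job {jid}")    # 'warn' mode queues the job and records a scope warning
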
@@ -409,14 +632,36 @@ def purge_all_jobs() -> int:
  return purge_jobs(status_filter=['done', 'error', 'killed'])


- def _update_job(jid: int, **fields):
+ def _update_job(jid: int, respect_killed: bool = True, **fields):
+ """
+ Update job fields atomically.
+
+ Args:
+ jid: Job ID to update
+ respect_killed: If True (default), don't overwrite status if job is killed.
+ This prevents race condition where job is killed while completing.
+ **fields: Fields to update
+ """
  with _lock:
  jobs = _read_jobs()
  changed = False
  for j in jobs:
  if j.get("id") == jid:
- j.update(fields)
- changed = True
+ # Race condition protection: don't change status of killed jobs
+ if respect_killed and j.get("status") == STATUS_KILLED and "status" in fields:
+ # Job was killed - don't overwrite status, but allow other updates
+ fields_copy = dict(fields)
+ del fields_copy["status"]
+ if fields_copy:
+ j.update(fields_copy)
+ changed = True
+ logger.debug("Skipped status update for killed job", extra={
+ "job_id": jid,
+ "attempted_status": fields.get("status")
+ })
+ else:
+ j.update(fields)
+ changed = True
  break
  if changed:
  _write_jobs(jobs)
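
A hedged illustration of the `respect_killed` guard, assuming job 7 has already been marked killed:

    _update_job(7, status="done", parse_result={"hosts": 3})
    # With respect_killed=True (the default), the killed status is preserved;
    # the non-status field (parse_result) is still stored.

    _update_job(7, respect_killed=False, status="done")
    # Explicitly overrides the killed status; intended only for deliberate fixes.
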
@@ -479,10 +724,27 @@ def _process_pending_chains():
  # Get parse results from job
  parse_result = job_to_chain.get('parse_result', {})

- if not parse_result or 'error' in parse_result:
- # No results or parse error - skip chaining
- _update_job(jid, chained=True)
- _append_worker_log(f"job {jid}: no parse results, skipping chain")
+ if not parse_result:
+ # No parse results - this shouldn't happen if job was properly marked chainable
+ # Log warning and store reason for debugging
+ logger.warning("Job marked chainable but has no parse_result", extra={
+ "job_id": jid,
+ "tool": tool,
+ "status": job_to_chain.get('status')
+ })
+ _append_worker_log(f"job {jid}: WARNING - marked chainable but parse_result is empty/missing")
+ _update_job(jid, chained=True, chain_skip_reason="parse_result missing")
+ return 1
+
+ if 'error' in parse_result:
+ # Parse had an error - log and skip
+ logger.warning("Job has parse error, skipping chaining", extra={
+ "job_id": jid,
+ "tool": tool,
+ "parse_error": parse_result.get('error')
+ })
+ _append_worker_log(f"job {jid}: parse error '{parse_result.get('error')}', skipping chain")
+ _update_job(jid, chained=True, chain_skip_reason=f"parse_error: {parse_result.get('error')}")
  return 1

  # Process auto-chaining
@@ -571,10 +833,35 @@ def _try_run_plugin(tool: str, target: str, args: List[str], label: str, log_pat
  cmd_spec = build_command_method(target, args or [], label or "", log_path)

  if cmd_spec is None:
- # Plugin validation failed
- with open(log_path, "a", encoding="utf-8", errors="replace") as fh:
- fh.write("ERROR: Plugin validation failed (build_command returned None)\n")
- return (True, 1)
+ # build_command returned None - check if plugin has run() method
+ # This allows plugins to signal "use run() instead" by returning None
+ run_method = getattr(plugin, "run", None)
+ if callable(run_method):
+ # Plugin wants to handle execution itself via run() method
+ sig = inspect.signature(run_method)
+ params = list(sig.parameters.keys())
+
+ try:
+ if "log_path" in params:
+ rc = run_method(target, args or [], label or "", log_path)
+ elif "label" in params:
+ rc = run_method(target, args or [], label or "")
+ elif "args" in params:
+ rc = run_method(target, args or [])
+ else:
+ rc = run_method(target)
+ return (True, rc if isinstance(rc, int) else 0)
+ except Exception as e:
+ with open(log_path, "a", encoding="utf-8", errors="replace") as fh:
+ fh.write(f"\n=== PLUGIN RUN ERROR ===\n")
+ fh.write(f"{type(e).__name__}: {e}\n")
+ fh.write(f"\n{traceback.format_exc()}\n")
+ return (True, 1)
+ else:
+ # No run() method either - actual validation failure
+ with open(log_path, "a", encoding="utf-8", errors="replace") as fh:
+ fh.write("ERROR: Plugin validation failed (build_command returned None)\n")
+ return (True, 1)

  # Execute using new subprocess handler with PID tracking
  rc = _run_subprocess_with_spec(cmd_spec, log_path, jid=jid, plugin=plugin)
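
A hypothetical plugin shape that exercises this fallback: returning None from build_command defers execution to run(), and the worker picks the call form based on the run() signature (here it includes log_path, so the four-argument form is used).

    class ExamplePlugin:
        """Hypothetical plugin: returning None from build_command defers to run()."""

        def build_command(self, target, args, label, log_path):
            return None  # signal: no external command, handle execution in run()

        def run(self, target, args, label, log_path):
            # Signature includes log_path, so the worker calls this 4-argument form.
            with open(log_path, "a", encoding="utf-8") as fh:
                fh.write(f"handled {target} internally\n")
            return 0  # exit code reported back to the worker
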
@@ -773,6 +1060,55 @@ def _store_msf_session(jid: int, target: str, exploit_path: str, session_id: str
  _append_worker_log(f"job {jid}: session storage error: {e}")


+ # Cache stdbuf availability check
+ _stdbuf_available = None
+
+
+ def _is_stdbuf_available() -> bool:
+ """Check if stdbuf is available for line-buffered output."""
+ global _stdbuf_available
+ if _stdbuf_available is None:
+ _stdbuf_available = shutil.which('stdbuf') is not None
+ return _stdbuf_available
+
+
+ def _wrap_cmd_for_line_buffering(cmd: List[str]) -> List[str]:
+ """
+ Wrap a command with stdbuf for line-buffered output when available.
+
+ This ensures output is written line-by-line instead of in 4-8KB blocks,
+ improving real-time log monitoring and ensuring output is captured
+ before process termination.
+
+ Args:
+ cmd: Command to wrap
+
+ Returns:
+ Command wrapped with stdbuf if available, original command otherwise
+ """
+ if not cmd:
+ return cmd
+
+ if _is_stdbuf_available():
+ # stdbuf -oL = line-buffered stdout, -eL = line-buffered stderr
+ return ['stdbuf', '-oL', '-eL'] + cmd
+
+ return cmd
+
+
+ def _get_subprocess_env() -> Dict[str, str]:
+ """
+ Get environment for subprocess with buffering disabled.
+
+ Sets PYTHONUNBUFFERED=1 for Python subprocesses and TERM=dumb
+ to prevent interactive terminal issues.
+ """
+ env = os.environ.copy()
+ env['TERM'] = 'dumb' # Prevent stty errors from interactive tools
+ env['PYTHONUNBUFFERED'] = '1' # Disable Python output buffering
+ return env
+
+
  def _run_subprocess_with_spec(cmd_spec: Dict[str, Any], log_path: str, jid: int = None, plugin=None) -> int:
  """
  Execute a command specification with proper PID tracking.
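
A hedged sketch of how these two helpers combine for a non-shell command; the tool, target, and log path are illustrative only:

    import subprocess

    cmd = _wrap_cmd_for_line_buffering(["nmap", "-sV", "203.0.113.7"])
    # -> ['stdbuf', '-oL', '-eL', 'nmap', '-sV', '203.0.113.7'] when stdbuf is installed

    with open("/tmp/job.log", "a") as fh:                      # hypothetical log path
        proc = subprocess.Popen(cmd, stdout=fh, stderr=subprocess.STDOUT,
                                env=_get_subprocess_env(), stdin=subprocess.DEVNULL)
        proc.wait()
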
@@ -814,32 +1150,35 @@ def _run_subprocess_with_spec(cmd_spec: Dict[str, Any], log_path: str, jid: int
  with open(log_path, "a", encoding="utf-8", errors="replace") as fh:
  fh.write("ERROR: No command provided in spec\n")
  return 1
-
+
  timeout = cmd_spec.get('timeout', JOB_TIMEOUT_SECONDS)
- env = cmd_spec.get('env')
+ spec_env = cmd_spec.get('env')
  cwd = cmd_spec.get('cwd')
  needs_shell = cmd_spec.get('needs_shell', False)
-
+
  _append_worker_log(f"_run_subprocess_with_spec: timeout={timeout}s for job {jid}")
-
- # Prepare environment
- # Set TERM=dumb to prevent stty errors from interactive tools like msfconsole
- proc_env = os.environ.copy()
- proc_env['TERM'] = 'dumb'
- if env:
- proc_env.update(env)
-
+
+ # Wrap command with stdbuf for line-buffered output (unless shell mode)
+ original_cmd = cmd
+ if not needs_shell:
+ cmd = _wrap_cmd_for_line_buffering(cmd)
+
+ # Prepare environment with PYTHONUNBUFFERED=1 and TERM=dumb
+ proc_env = _get_subprocess_env()
+ if spec_env:
+ proc_env.update(spec_env)
+
  with open(log_path, "a", encoding="utf-8", errors="replace") as fh:
  fh.write("=== Command Execution (build_command) ===\n")
- fh.write(f"Command: {' '.join(cmd)}\n")
+ fh.write(f"Command: {' '.join(original_cmd)}\n")
  fh.write(f"Timeout: {timeout} seconds\n")
- if env:
- fh.write(f"Environment: {env}\n")
+ if spec_env:
+ fh.write(f"Environment: {spec_env}\n")
  if cwd:
  fh.write(f"Working Dir: {cwd}\n")
  fh.write(f"Started: {time.strftime('%Y-%m-%d %H:%M:%S UTC', time.gmtime())}\n\n")
  fh.flush()
-
+
  try:
  # Create new process group so all children can be killed together
  # Redirect stdin to /dev/null to prevent password prompts from hanging
@@ -849,16 +1188,17 @@ def _run_subprocess_with_spec(cmd_spec: Dict[str, Any], log_path: str, jid: int
  stdout=fh,
  stderr=subprocess.STDOUT,
  preexec_fn=os.setsid, # Creates new session
- env=proc_env, # Always use proc_env (includes TERM=dumb)
+ env=proc_env,
  cwd=cwd,
  shell=needs_shell # nosec B602 - intentional for security tool command execution
  )

- # Store PID if job ID provided
+ # Store PID and process start time for stale detection
  if jid is not None:
- _update_job(jid, pid=proc.pid)
+ proc_start_time = _get_process_start_time(proc.pid)
+ _update_job(jid, pid=proc.pid, process_start_time=proc_start_time)
  _append_worker_log(f"job {jid}: running with PID {proc.pid}")
-
+
  # Wait for process with timeout
  try:
  proc.wait(timeout=timeout)
@@ -890,6 +1230,7 @@ def _run_subprocess_with_spec(cmd_spec: Dict[str, Any], log_path: str, jid: int
  return 0
  else:
  fh.write(f"\nERROR: Command timed out after {timeout} seconds\n")
+ fh.flush()
  return 124

  # Check if job was killed externally during execution
@@ -912,17 +1253,21 @@ def _run_subprocess_with_spec(cmd_spec: Dict[str, Any], log_path: str, jid: int
  proc.wait(timeout=5)
  except:
  pass
+ fh.flush()
  return 143 # 128 + 15 (SIGTERM)
-
+
  fh.write(f"\n=== Completed: {time.strftime('%Y-%m-%d %H:%M:%S UTC', time.gmtime())} ===\n")
  fh.write(f"Exit Code: {proc.returncode}\n")
+ fh.flush()
  return proc.returncode
-
+
  except FileNotFoundError:
  fh.write(f"\nERROR: Tool not found: {cmd[0]}\n")
+ fh.flush()
  return 127
  except Exception as e:
  fh.write(f"\nERROR: {type(e).__name__}: {e}\n")
+ fh.flush()
  return 1


@@ -937,9 +1282,14 @@ def _run_subprocess(tool: str, target: str, args: List[str], log_path: str, jid:
  cmd = [tool] + (args or [])
  cmd = [c.replace("<target>", target) for c in cmd]

+ # Wrap command with stdbuf for line-buffered output
+ cmd = _wrap_cmd_for_line_buffering(cmd)
+
  with open(log_path, "a", encoding="utf-8", errors="replace") as fh:
+ # Log original command (without stdbuf wrapper for clarity)
+ original_cmd = cmd[3:] if cmd[:3] == ['stdbuf', '-oL', '-eL'] else cmd
  fh.write("=== Subprocess Execution ===\n")
- fh.write(f"Command: {' '.join(cmd)}\n")
+ fh.write(f"Command: {' '.join(original_cmd)}\n")
  fh.write(f"Timeout: {timeout} seconds\n")
  fh.write(f"Started: {time.strftime('%Y-%m-%d %H:%M:%S UTC', time.gmtime())}\n\n")
  fh.flush()
@@ -947,9 +1297,8 @@ def _run_subprocess(tool: str, target: str, args: List[str], log_path: str, jid:
  try:
  # Create new process group so all children can be killed together
  # Redirect stdin to /dev/null to prevent password prompts from hanging
- # Set TERM=dumb to prevent stty errors from interactive tools like msfconsole
- env = os.environ.copy()
- env['TERM'] = 'dumb'
+ # Use env with PYTHONUNBUFFERED=1 and TERM=dumb
+ env = _get_subprocess_env()

  proc = subprocess.Popen(
  cmd,
@@ -960,9 +1309,10 @@ def _run_subprocess(tool: str, target: str, args: List[str], log_path: str, jid:
  env=env
  )

- # Store PID if job ID provided
+ # Store PID and process start time for stale detection
  if jid is not None:
- _update_job(jid, pid=proc.pid)
+ proc_start_time = _get_process_start_time(proc.pid)
+ _update_job(jid, pid=proc.pid, process_start_time=proc_start_time)
  _append_worker_log(f"job {jid}: running with PID {proc.pid}")

  # Wait for process with timeout
@@ -977,6 +1327,7 @@ def _run_subprocess(tool: str, target: str, args: List[str], log_path: str, jid:
  proc.kill() # Fallback to single process
  proc.wait()
  fh.write(f"\nERROR: Command timed out after {timeout} seconds\n")
+ fh.flush()
  return 124

  # Check if job was killed externally during execution
@@ -999,17 +1350,21 @@ def _run_subprocess(tool: str, target: str, args: List[str], log_path: str, jid:
  proc.wait(timeout=5)
  except:
  pass
+ fh.flush()
  return 143 # 128 + 15 (SIGTERM)

  fh.write(f"\n=== Completed: {time.strftime('%Y-%m-%d %H:%M:%S UTC', time.gmtime())} ===\n")
  fh.write(f"Exit Code: {proc.returncode}\n")
+ fh.flush()
  return proc.returncode

  except FileNotFoundError:
  fh.write(f"\nERROR: Tool not found: {cmd[0]}\n")
+ fh.flush()
  return 127
  except Exception as e:
  fh.write(f"\nERROR: {type(e).__name__}: {e}\n")
+ fh.flush()
  return 1


@@ -1145,77 +1500,110 @@ def run_job(jid: int) -> None:
  # Re-fetch job to get updated data
  job = get_job(jid)
  parse_result = handle_job_result(job)
- if parse_result:
- if 'error' in parse_result:
- logger.warning("Job parse error", extra={
+
+ # Handle parse failure cases
+ if parse_result is None:
+ # Parser returned None - likely missing log file, no parser for tool, or missing engagement
+ logger.error("Job parse returned None - results may be lost", extra={
+ "job_id": jid,
+ "tool": job.get('tool'),
+ "log_exists": os.path.exists(job.get('log', '')) if job.get('log') else False
+ })
+ _append_worker_log(f"job {jid} parse returned None (tool={job.get('tool')}) - check if parser exists")
+ # Update job to indicate parse failure
+ _update_job(jid, status=STATUS_WARNING, parse_result={'error': 'Parser returned None - no results extracted'})
+ # Mark as chained to prevent infinite retry
+ _update_job(jid, chained=True)
+ return
+
+ if 'error' in parse_result:
+ logger.error("Job parse error - results may be incomplete", extra={
+ "job_id": jid,
+ "error": parse_result['error']
+ })
+ _append_worker_log(f"job {jid} parse error: {parse_result['error']}")
+ # Update job status to warning with the error
+ _update_job(jid, status=STATUS_WARNING, parse_result=parse_result)
+ # Mark as chained to prevent infinite retry
+ _update_job(jid, chained=True)
+ return
+
+ # Parse succeeded
+ logger.info("Job parsed successfully", extra={
+ "job_id": jid,
+ "parse_result": parse_result
+ })
+ _append_worker_log(f"job {jid} parsed: {parse_result}")
+
+ # Determine chainable status BEFORE updating to avoid race condition
+ # We must set parse_result and chainable in a single atomic update
+ try:
+ from souleyez.core.tool_chaining import ToolChaining
+ chaining = ToolChaining()
+
+ # Get current job to check status
+ job = get_job(jid)
+ job_status = job.get('status', STATUS_ERROR)
+
+ # Determine final status from parser if provided
+ final_status = parse_result.get('status', job_status)
+
+ # Check if job should be chainable
+ should_chain = (
+ chaining.is_enabled() and
+ parse_result and
+ 'error' not in parse_result and
+ is_chainable(final_status)
+ )
+
+ # Build update dict - ATOMIC update of parse_result + chainable
+ update_fields = {'parse_result': parse_result}
+
+ if 'status' in parse_result:
+ update_fields['status'] = final_status
+ logger.info("Job status updated from parser", extra={
  "job_id": jid,
- "error": parse_result['error']
+ "status": final_status
  })
- _append_worker_log(f"job {jid} parse error: {parse_result['error']}")
+ _append_worker_log(f"job {jid} status updated to: {final_status}")
+
+ if should_chain:
+ update_fields['chainable'] = True
  else:
- logger.info("Job parsed successfully", extra={
- "job_id": jid,
- "parse_result": parse_result
- })
- _append_worker_log(f"job {jid} parsed: {parse_result}")
+ # Not chainable - mark as chained to skip
+ update_fields['chained'] = True

- # Update status based on parse result if provided
- if 'status' in parse_result:
- final_status = parse_result['status']
- _update_job(jid, status=final_status, parse_result=parse_result)
- logger.info("Job status updated from parser", extra={
+ # Single atomic update to prevent race condition
+ _update_job(jid, **update_fields)
+
+ # Log chaining decision
+ if should_chain:
+ if final_status == STATUS_WARNING:
+ logger.info("Job with warning status marked for chaining", extra={
  "job_id": jid,
- "status": final_status
+ "tool": job.get('tool'),
+ "wildcard_detected": parse_result.get('wildcard_detected', False)
  })
- _append_worker_log(f"job {jid} status updated to: {final_status}")
+ _append_worker_log(f"job {jid} (status=warning) marked as chainable")
  else:
- # Store parse result in job for dashboard display (no status update)
- _update_job(jid, parse_result=parse_result)
-
- # Mark job as chainable instead of chaining immediately
- # Worker loop will process it when database is idle
- try:
- from souleyez.core.tool_chaining import ToolChaining
- chaining = ToolChaining()
-
- # Re-fetch job to get updated status
- job = get_job(jid)
- job_status = job.get('status', STATUS_ERROR)
-
- # Check if status is chainable (done, no_results, warning)
- if chaining.is_enabled() and parse_result and 'error' not in parse_result and is_chainable(job_status):
- # Mark for deferred chaining
- _update_job(jid, chainable=True)
-
- # Log special handling for warning status
- if job_status == STATUS_WARNING:
- logger.info("Job with warning status marked for chaining", extra={
- "job_id": jid,
- "tool": job.get('tool'),
- "wildcard_detected": parse_result.get('wildcard_detected', False)
- })
- _append_worker_log(f"job {jid} (status=warning) marked as chainable")
- else:
- logger.info("Job marked as chainable", extra={
- "job_id": jid,
- "tool": job.get('tool'),
- "status": job_status
- })
- _append_worker_log(f"job {jid} marked as chainable (status={job_status})")
- else:
- # Chaining disabled or job has errors - mark as chained (skip)
- _update_job(jid, chained=True)
- reason = f"chaining_disabled={not chaining.is_enabled()}, has_error={'error' in parse_result}, status={job_status}"
- _append_worker_log(f"job {jid} not chainable ({reason})")
-
- except Exception as chain_err:
- logger.error("Failed to mark job as chainable", extra={
+ logger.info("Job marked as chainable", extra={
  "job_id": jid,
- "error": str(chain_err)
+ "tool": job.get('tool'),
+ "status": final_status
  })
- _append_worker_log(f"job {jid} chainable marking error: {chain_err}")
- # Mark as chained to prevent retry loops
- _update_job(jid, chained=True, chain_error=str(chain_err))
+ _append_worker_log(f"job {jid} marked as chainable (status={final_status})")
+ else:
+ reason = f"chaining_disabled={not chaining.is_enabled()}, has_error={'error' in parse_result}, status={final_status}"
+ _append_worker_log(f"job {jid} not chainable ({reason})")
+
+ except Exception as chain_err:
+ logger.error("Failed to mark job as chainable", extra={
+ "job_id": jid,
+ "error": str(chain_err)
+ })
+ _append_worker_log(f"job {jid} chainable marking error: {chain_err}")
+ # Mark as chained to prevent retry loops
+ _update_job(jid, chained=True, chain_error=str(chain_err))

  except Exception as e:
  logger.error("Job parse exception", extra={
@@ -1378,18 +1766,46 @@ def _detect_and_recover_stale_jobs() -> int:
  pid = job.get('pid')
  tool = job.get('tool', 'unknown')
  log_path = job.get('log')
+ stored_start_time = job.get('process_start_time')

- # Skip if PID is still alive
+ # Check if PID is alive
  if _is_pid_alive(pid):
- continue
-
- # PID is dead - this is a stale job
- _append_worker_log(f"job {jid}: detected stale (PID {pid} is dead)")
- logger.warning("Stale job detected", extra={
- "job_id": jid,
- "tool": tool,
- "pid": pid
- })
+ # PID is alive - but check for PID reuse
+ if stored_start_time is not None:
+ current_start_time = _get_process_start_time(pid)
+ if current_start_time is not None:
+ # Allow 2 second tolerance for timing differences
+ if abs(current_start_time - stored_start_time) > 2:
+ # PID reused by different process
+ _append_worker_log(
+ f"job {jid}: PID {pid} reused (stored start: {stored_start_time:.0f}, "
+ f"current: {current_start_time:.0f})"
+ )
+ logger.warning("PID reuse detected", extra={
+ "job_id": jid,
+ "tool": tool,
+ "pid": pid,
+ "stored_start_time": stored_start_time,
+ "current_start_time": current_start_time
+ })
+ # Fall through to stale job handling
+ else:
+ # Same process, still running
+ continue
+ else:
+ # Can't get current start time, assume still valid
+ continue
+ else:
+ # No stored start time (old job), assume still valid
+ continue
+ else:
+ # PID is dead - definitely stale
+ _append_worker_log(f"job {jid}: detected stale (PID {pid} is dead)")
+ logger.warning("Stale job detected", extra={
+ "job_id": jid,
+ "tool": tool,
+ "pid": pid
+ })

  # Check if log shows completion
  completed, exit_code = _check_log_for_completion(log_path, tool)
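
The reuse check boils down to one comparison: a live PID only counts as the job's own process if its /proc start time matches the value recorded when the job was launched. A condensed sketch of that predicate (hypothetical helper name, logic as above):

    def pid_belongs_to_job(pid: int, recorded_start: float, tolerance: float = 2.0) -> bool:
        # Compare the stored launch-time starttime against the current /proc value.
        current = _get_process_start_time(pid)
        if current is None or recorded_start is None:
            return True  # can't tell; err on the side of "still running", as the code above does
        return abs(current - recorded_start) <= tolerance
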
@@ -1412,6 +1828,8 @@ def _detect_and_recover_stale_jobs() -> int:
  # Try to parse results
  try:
  from .result_handler import handle_job_result
+ from souleyez.core.tool_chaining import ToolChaining
+
  job = get_job(jid)
  parse_result = handle_job_result(job)

@@ -1419,36 +1837,34 @@ def _detect_and_recover_stale_jobs() -> int:
  if 'error' in parse_result:
  _append_worker_log(f"job {jid} stale recovery parse error: {parse_result['error']}")
  else:
- # Update status from parser if provided
+ # Determine final status and chainable in one check
+ final_status = parse_result.get('status', status)
+ chaining = ToolChaining()
+ should_chain = chaining.is_enabled() and is_chainable(final_status)
+
+ # Build atomic update - parse_result + status + chainable together
+ update_fields = {'parse_result': parse_result}
  if 'status' in parse_result:
- status = parse_result['status']
- _update_job(jid, status=status, parse_result=parse_result)
- else:
- _update_job(jid, parse_result=parse_result)
+ update_fields['status'] = final_status
+ if should_chain:
+ update_fields['chainable'] = True
+
+ # Single atomic update to prevent race condition
+ _update_job(jid, **update_fields)

  _append_worker_log(f"job {jid} stale recovery parsed: {parse_result.get('findings_added', 0)} findings")

  logger.info("Stale job recovered with results", extra={
  "job_id": jid,
  "tool": tool,
- "status": status,
- "parse_result": parse_result
+ "status": final_status,
+ "parse_result": parse_result,
+ "chainable": should_chain
  })

- # Mark for auto-chaining if conditions are met
- try:
- from souleyez.core.tool_chaining import ToolChaining
- chaining = ToolChaining()
- if chaining.is_enabled() and is_chainable(status):
- _update_job(jid, chainable=True)
- _append_worker_log(f"job {jid} stale recovery marked as chainable")
- logger.info("Stale job marked as chainable", extra={
- "job_id": jid,
- "tool": tool,
- "status": status
- })
- except Exception as chain_err:
- _append_worker_log(f"job {jid} stale recovery chainable error: {chain_err}")
+ if should_chain:
+ _append_worker_log(f"job {jid} stale recovery marked as chainable")
+
  except Exception as parse_err:
  _append_worker_log(f"job {jid} stale recovery parse exception: {parse_err}")

@@ -1608,26 +2024,85 @@ def _check_msf_exploitation_success():
  return 0


+ def _update_job_progress():
+ """
+ Update progress tracking for running jobs.
+
+ Checks log file modification times and flags jobs with no recent output
+ as possibly hung (no output for JOB_HUNG_THRESHOLD seconds).
+ """
+ try:
+ jobs = _read_jobs()
+ running_jobs = [j for j in jobs if j.get('status') == STATUS_RUNNING]
+
+ for job in running_jobs:
+ jid = job.get('id')
+ log_path = job.get('log')
+
+ if not log_path or not os.path.exists(log_path):
+ continue
+
+ try:
+ # Get log file modification time
+ mtime = os.path.getmtime(log_path)
+ current_time = time.time()
+ time_since_output = current_time - mtime
+
+ # Update last_output_at in job record
+ updates = {'last_output_at': mtime}
+
+ # Flag as possibly hung if no output for threshold
+ was_hung = job.get('possibly_hung', False)
+ is_hung = time_since_output > JOB_HUNG_THRESHOLD
+
+ if is_hung != was_hung:
+ updates['possibly_hung'] = is_hung
+ if is_hung:
+ _append_worker_log(
+ f"job {jid}: no output for {int(time_since_output)}s, flagged as possibly hung"
+ )
+ logger.warning("Job possibly hung", extra={
+ "job_id": jid,
+ "tool": job.get('tool'),
+ "time_since_output": int(time_since_output)
+ })
+
+ _update_job(jid, **updates)
+
+ except Exception as e:
+ # Non-critical, just skip this job
+ pass
+
+ except Exception as e:
+ logger.error("Job progress tracking error", extra={"error": str(e)})
+
+
  def worker_loop(poll_interval: float = 2.0):
  """
  Main worker loop that processes jobs and handles auto-chaining.

  Loop behavior:
- 1. Detect and recover stale jobs (dead PIDs)
- 2. Check for running jobs
- 3. If none running, start next queued job
- 4. Process one chainable job (if any)
- 5. Sleep poll_interval seconds, repeat
+ 1. Update heartbeat for health monitoring
+ 2. Detect and recover stale jobs (dead PIDs)
+ 3. Update progress tracking for running jobs
+ 4. Check for running jobs
+ 5. If none running, start next queued job
+ 6. Process one chainable job (if any)
+ 7. Sleep poll_interval seconds, repeat

  Args:
  poll_interval: Seconds to sleep between iterations (default: 2.0)
  """
  _ensure_dirs()
+ _update_heartbeat() # Initial heartbeat
  _append_worker_log("souleyez background worker: starting loop")

- # Track last stale job check time (check every 30 seconds, not every iteration)
+ # Track last stale job check time (check every 15 seconds, not every iteration)
  last_stale_check = 0
- stale_check_interval = 30 # seconds
+ stale_check_interval = 15 # seconds (reduced from 30s for faster detection)
+
+ # Track last heartbeat time
+ last_heartbeat = time.time()

  # Run stale job detection on startup
  try:
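
The hung heuristic is simply the age of the job's log file: if the tool has not written anything for JOB_HUNG_THRESHOLD seconds (300 here), the job is flagged `possibly_hung` but not killed; the hard stop remains JOB_TIMEOUT_SECONDS. The check in isolation (hypothetical helper name):

    import os, time

    def seconds_since_output(log_path: str) -> float:
        # How long since the tool last appended to its log.
        return time.time() - os.path.getmtime(log_path)
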
@@ -1639,8 +2114,14 @@ def worker_loop(poll_interval: float = 2.0):

  try:
  while True:
- # Periodic stale job detection (every 30 seconds)
  current_time = time.time()
+
+ # Update heartbeat every HEARTBEAT_INTERVAL seconds
+ if current_time - last_heartbeat >= HEARTBEAT_INTERVAL:
+ _update_heartbeat()
+ last_heartbeat = current_time
+
+ # Periodic stale job detection (every 15 seconds)
  if current_time - last_stale_check >= stale_check_interval:
  try:
  recovered = _detect_and_recover_stale_jobs()
@@ -1650,6 +2131,12 @@ def worker_loop(poll_interval: float = 2.0):
  _append_worker_log(f"stale job detection error: {e}")
  last_stale_check = current_time

+ # Update progress tracking for running jobs
+ try:
+ _update_job_progress()
+ except Exception as e:
+ _append_worker_log(f"progress tracking error: {e}")
+
  # Check running MSF jobs for exploitation success (every iteration)
  try:
  detected = _check_msf_exploitation_success()
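
Taken together, the heartbeat writes every HEARTBEAT_INTERVAL seconds and the 30-second staleness threshold give external tooling a simple liveness probe. A hedged sketch of such a check built on the new public helpers (the printed messages and restart advice are illustrative, not part of the package):

    age = get_heartbeat_age()
    if age is None:
        print("worker has never written a heartbeat (not started, or crashed early)")
    elif is_heartbeat_stale():
        print(f"worker heartbeat is {age:.0f}s old (> {HEARTBEAT_STALE_THRESHOLD}s) - restart it")
    else:
        print(f"worker healthy, last beat {age:.1f}s ago")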