souleyez 2.26.0-py3-none-any.whl → 2.27.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of souleyez might be problematic.

@@ -25,6 +25,7 @@ import subprocess
  import threading
  import inspect
  import traceback
+ import fcntl
  from typing import List, Dict, Optional, Any
  from souleyez.log_config import get_logger
  from .log_sanitizer import LogSanitizer
@@ -41,7 +42,12 @@ JOBS_DIR = os.path.join(DATA_DIR, "jobs")
  LOGS_DIR = os.path.join(DATA_DIR, "logs")
  JOBS_FILE = os.path.join(JOBS_DIR, "jobs.json")
  WORKER_LOG = os.path.join(LOGS_DIR, "worker.log")
+ HEARTBEAT_FILE = os.path.join(JOBS_DIR, ".worker_heartbeat")
  JOB_TIMEOUT_SECONDS = 3600 # 1 hour (changed from 300s/5min)
+ HEARTBEAT_INTERVAL = 10 # seconds between heartbeat writes
+ HEARTBEAT_STALE_THRESHOLD = 30 # seconds before heartbeat considered stale
+ JOB_HUNG_THRESHOLD = 300 # 5 minutes with no output = possibly hung
+ JOBS_BACKUP_COUNT = 3 # Number of rotating backups to keep

  _lock = threading.RLock() # Reentrant lock allows nested acquisition by same thread

@@ -51,6 +57,63 @@ def _ensure_dirs():
      os.makedirs(LOGS_DIR, exist_ok=True)


+ def _get_backup_files() -> List[str]:
+     """Get list of backup files sorted by modification time (newest first)."""
+     backups = []
+     for i in range(1, JOBS_BACKUP_COUNT + 1):
+         backup_path = f"{JOBS_FILE}.bak.{i}"
+         if os.path.exists(backup_path):
+             backups.append((os.path.getmtime(backup_path), backup_path))
+     # Sort by mtime descending (newest first)
+     backups.sort(reverse=True)
+     return [path for _, path in backups]
+
+
+ def _rotate_backups():
+     """Rotate backup files, keeping only JOBS_BACKUP_COUNT backups."""
+     # Shift existing backups: .bak.2 -> .bak.3, .bak.1 -> .bak.2
+     for i in range(JOBS_BACKUP_COUNT, 1, -1):
+         src = f"{JOBS_FILE}.bak.{i - 1}"
+         dst = f"{JOBS_FILE}.bak.{i}"
+         if os.path.exists(src):
+             try:
+                 shutil.move(src, dst)
+             except Exception:
+                 pass
+
+     # Create new .bak.1 from current jobs.json
+     if os.path.exists(JOBS_FILE):
+         try:
+             shutil.copy2(JOBS_FILE, f"{JOBS_FILE}.bak.1")
+         except Exception:
+             pass
+
+
+ def _recover_from_backup() -> List[Dict[str, Any]]:
+     """
+     Attempt to recover jobs from backup files.
+
+     Returns:
+         List of jobs from the first valid backup, or empty list if no valid backup found
+     """
+     backups = _get_backup_files()
+     for backup_path in backups:
+         try:
+             with open(backup_path, "r", encoding="utf-8") as fh:
+                 jobs = json.load(fh)
+             if isinstance(jobs, list):
+                 _append_worker_log(f"recovered {len(jobs)} jobs from backup: {backup_path}")
+                 logger.info("Jobs recovered from backup", extra={
+                     "backup_path": backup_path,
+                     "job_count": len(jobs)
+                 })
+                 return jobs
+         except Exception as e:
+             _append_worker_log(f"backup {backup_path} also corrupt: {e}")
+             continue
+     return []
+
+
  def _read_jobs() -> List[Dict[str, Any]]:
      _ensure_dirs()
      if not os.path.exists(JOBS_FILE):
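
The three helpers above give jobs.json a small rotating safety net: every write pushes the previous copies down one slot and keeps at most three of them. A minimal standalone sketch of the same rotation scheme, using a throwaway directory instead of souleyez's JOBS_FILE (paths and names here are illustrative only):

    import json
    import os
    import shutil
    import tempfile

    BACKUP_COUNT = 3  # mirrors JOBS_BACKUP_COUNT above

    def rotate_backups(path: str) -> None:
        # Shift .bak.2 -> .bak.3 and .bak.1 -> .bak.2, then copy the current file to .bak.1
        for i in range(BACKUP_COUNT, 1, -1):
            src, dst = f"{path}.bak.{i - 1}", f"{path}.bak.{i}"
            if os.path.exists(src):
                shutil.move(src, dst)
        if os.path.exists(path):
            shutil.copy2(path, f"{path}.bak.1")

    workdir = tempfile.mkdtemp()
    jobs_file = os.path.join(workdir, "jobs.json")
    for n in range(4):
        rotate_backups(jobs_file)  # rotate before each write, as _write_jobs now does
        with open(jobs_file, "w", encoding="utf-8") as fh:
            json.dump([{"id": n}], fh)
    print(sorted(os.listdir(workdir)))
    # ['jobs.json', 'jobs.json.bak.1', 'jobs.json.bak.2', 'jobs.json.bak.3']
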
@@ -58,18 +121,42 @@ def _read_jobs() -> List[Dict[str, Any]]:
      try:
          with open(JOBS_FILE, "r", encoding="utf-8") as fh:
              return json.load(fh)
-     except Exception:
+     except Exception as e:
+         # Log corruption event
+         _append_worker_log(f"jobs.json corrupt: {e}")
+         logger.error("Jobs file corrupted", extra={
+             "error": str(e),
+             "jobs_file": JOBS_FILE
+         })
+
+         # Try to recover from backup
+         recovered_jobs = _recover_from_backup()
+
+         # Move corrupt file aside
          try:
              corrupt = JOBS_FILE + ".corrupt." + str(int(time.time()))
              shutil.move(JOBS_FILE, corrupt)
-             _append_worker_log(f"jobs file corrupt; moved to {corrupt}")
+             _append_worker_log(f"corrupt jobs file moved to {corrupt}")
          except Exception:
              pass
-         return []
+
+         # If we recovered jobs, write them back
+         if recovered_jobs:
+             try:
+                 _write_jobs(recovered_jobs)
+                 _append_worker_log(f"restored {len(recovered_jobs)} jobs from backup")
+             except Exception as write_err:
+                 _append_worker_log(f"failed to restore jobs: {write_err}")
+
+         return recovered_jobs


  def _write_jobs(jobs: List[Dict[str, Any]]):
      _ensure_dirs()
+
+     # Rotate backups before writing (keeps last 3 good copies)
+     _rotate_backups()
+
      tmp = tempfile.NamedTemporaryFile("w", delete=False, dir=JOBS_DIR, encoding="utf-8")
      try:
          json.dump(jobs, tmp, indent=2, ensure_ascii=False)
@@ -93,36 +180,135 @@ def _append_worker_log(msg: str):
          fh.write(line)


+ def _update_heartbeat():
+     """Write current timestamp to heartbeat file for health monitoring."""
+     _ensure_dirs()
+     try:
+         with open(HEARTBEAT_FILE, 'w') as fh:
+             fh.write(str(time.time()))
+     except Exception:
+         pass # Non-critical, don't crash worker
+
+
+ def get_heartbeat_age() -> Optional[float]:
+     """
+     Get age of worker heartbeat in seconds.
+
+     Returns:
+         Age in seconds, or None if heartbeat file doesn't exist
+     """
+     try:
+         if os.path.exists(HEARTBEAT_FILE):
+             with open(HEARTBEAT_FILE, 'r') as fh:
+                 last_beat = float(fh.read().strip())
+                 return time.time() - last_beat
+         return None
+     except Exception:
+         return None
+
+
+ def is_heartbeat_stale() -> bool:
+     """Check if worker heartbeat is stale (older than threshold)."""
+     age = get_heartbeat_age()
+     if age is None:
+         return True # No heartbeat = stale
+     return age > HEARTBEAT_STALE_THRESHOLD
+
+
+ def _get_process_start_time(pid: int) -> Optional[float]:
+     """
+     Get process start time from /proc filesystem (Linux only).
+
+     Returns:
+         Process start time as Unix timestamp, or None if not available
+     """
+     try:
+         stat_path = f"/proc/{pid}/stat"
+         if not os.path.exists(stat_path):
+             return None
+
+         with open(stat_path, 'r') as f:
+             stat = f.read()
+
+         # Parse stat file - field 22 is starttime (in clock ticks since boot)
+         # Format: pid (comm) state ppid pgrp session tty_nr ... starttime ...
+         # Need to handle comm field which may contain spaces/parentheses
+         parts = stat.rsplit(')', 1)
+         if len(parts) < 2:
+             return None
+
+         fields = parts[1].split()
+         if len(fields) < 20:
+             return None
+
+         starttime_ticks = int(fields[19]) # 0-indexed, field 22 is at index 19 after comm
+
+         # Convert to timestamp using system boot time and clock ticks per second
+         with open('/proc/stat', 'r') as f:
+             for line in f:
+                 if line.startswith('btime'):
+                     boot_time = int(line.split()[1])
+                     break
+             else:
+                 return None
+
+         # Get clock ticks per second (usually 100)
+         ticks_per_sec = os.sysconf(os.sysconf_names['SC_CLK_TCK'])
+
+         return boot_time + (starttime_ticks / ticks_per_sec)
+     except Exception:
+         return None
+
+
  def _next_job_id(jobs: List[Dict[str, Any]]) -> int:
      """
-     Get next available job ID.
-
-     Uses a persistent counter to ensure IDs are never reused, even after
-     jobs are purged. This prevents worker confusion when job IDs overlap.
+     Get next available job ID with file locking.
+
+     Uses a persistent counter with fcntl locking to ensure IDs are never
+     reused, even across multiple processes. This prevents duplicate job IDs
+     when multiple jobs are enqueued concurrently.
      """
      counter_file = os.path.join(JOBS_DIR, ".job_counter")
-
+     lock_file = os.path.join(JOBS_DIR, ".job_counter.lock")
+
      try:
-         # Try to read persistent counter
-         if os.path.exists(counter_file):
-             with open(counter_file, 'r') as f:
-                 next_id = int(f.read().strip())
-         else:
-             # Initialize from existing jobs
-             maxid = 0
-             for j in jobs:
-                 try:
-                     if isinstance(j.get("id"), int) and j["id"] > maxid:
-                         maxid = j["id"]
-                 except Exception:
-                     continue
-             next_id = maxid + 1
-
-         # Write incremented counter for next time
-         with open(counter_file, 'w') as f:
-             f.write(str(next_id + 1))
-
-         return next_id
+         _ensure_dirs()
+
+         # Use a separate lock file to allow atomic read-modify-write
+         with open(lock_file, 'w') as lock_fh:
+             # Acquire exclusive lock (blocks until available)
+             fcntl.flock(lock_fh.fileno(), fcntl.LOCK_EX)
+
+             try:
+                 # Read current counter
+                 if os.path.exists(counter_file):
+                     with open(counter_file, 'r') as f:
+                         next_id = int(f.read().strip())
+                 else:
+                     # Initialize from existing jobs
+                     maxid = 0
+                     for j in jobs:
+                         try:
+                             if isinstance(j.get("id"), int) and j["id"] > maxid:
+                                 maxid = j["id"]
+                         except Exception:
+                             continue
+                     next_id = maxid + 1
+
+                 # Write incremented counter atomically
+                 tmp_file = counter_file + '.tmp'
+                 with open(tmp_file, 'w') as f:
+                     f.write(str(next_id + 1))
+                     f.flush()
+                     os.fsync(f.fileno())
+                 os.replace(tmp_file, counter_file)
+
+                 return next_id
+
+             finally:
+                 # Release lock
+                 fcntl.flock(lock_fh.fileno(), fcntl.LOCK_UN)
+
      except Exception:
          # Fallback to old behavior if file operations fail
          maxid = 0
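
The locking change is the heart of this hunk: the counter update is serialized with an exclusive fcntl.flock on a sidecar lock file, and the new value is written to a temp file and swapped in with os.replace so a crash can never leave a half-written counter. A minimal sketch of that pattern in isolation (paths below are illustrative; fcntl is POSIX-only, matching the new import at the top of the diff):

    import fcntl
    import os
    import tempfile

    def next_id(counter_file: str, lock_file: str) -> int:
        """Return the current counter value and atomically store value + 1."""
        with open(lock_file, "w") as lock_fh:
            fcntl.flock(lock_fh.fileno(), fcntl.LOCK_EX)   # serialize across processes
            try:
                try:
                    with open(counter_file, "r") as f:
                        value = int(f.read().strip())
                except (OSError, ValueError):
                    value = 1                              # first use: start at 1
                tmp = counter_file + ".tmp"
                with open(tmp, "w") as f:                  # write-then-rename = atomic update
                    f.write(str(value + 1))
                    f.flush()
                    os.fsync(f.fileno())
                os.replace(tmp, counter_file)
                return value
            finally:
                fcntl.flock(lock_fh.fileno(), fcntl.LOCK_UN)

    d = tempfile.mkdtemp()
    ids = [next_id(os.path.join(d, ".counter"), os.path.join(d, ".counter.lock")) for _ in range(3)]
    print(ids)  # [1, 2, 3] -- no duplicates even if several processes race on the same files
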
@@ -409,14 +595,36 @@ def purge_all_jobs() -> int:
      return purge_jobs(status_filter=['done', 'error', 'killed'])


- def _update_job(jid: int, **fields):
+ def _update_job(jid: int, respect_killed: bool = True, **fields):
+     """
+     Update job fields atomically.
+
+     Args:
+         jid: Job ID to update
+         respect_killed: If True (default), don't overwrite status if job is killed.
+             This prevents race condition where job is killed while completing.
+         **fields: Fields to update
+     """
      with _lock:
          jobs = _read_jobs()
          changed = False
          for j in jobs:
              if j.get("id") == jid:
-                 j.update(fields)
-                 changed = True
+                 # Race condition protection: don't change status of killed jobs
+                 if respect_killed and j.get("status") == STATUS_KILLED and "status" in fields:
+                     # Job was killed - don't overwrite status, but allow other updates
+                     fields_copy = dict(fields)
+                     del fields_copy["status"]
+                     if fields_copy:
+                         j.update(fields_copy)
+                         changed = True
+                     logger.debug("Skipped status update for killed job", extra={
+                         "job_id": jid,
+                         "attempted_status": fields.get("status")
+                     })
+                 else:
+                     j.update(fields)
+                     changed = True
                  break
          if changed:
              _write_jobs(jobs)
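
The respect_killed flag closes a small race: a worker that finishes a job the user has already killed would otherwise flip the status back to done. The guard can be illustrated on a plain dict; the status strings 'killed' and 'done' below match the ones used in the purge filter earlier in this file:

    def apply_update(job: dict, respect_killed: bool = True, **fields) -> dict:
        # Reduced version of the guard in _update_job: drop a status change aimed at a killed job
        if respect_killed and job.get("status") == "killed" and "status" in fields:
            fields = {k: v for k, v in fields.items() if k != "status"}
        job.update(fields)
        return job

    job = {"id": 7, "status": "killed"}
    apply_update(job, status="done", exit_code=0)
    print(job)  # {'id': 7, 'status': 'killed', 'exit_code': 0} -- the kill wins, other fields still land
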
@@ -479,10 +687,27 @@ def _process_pending_chains():
      # Get parse results from job
      parse_result = job_to_chain.get('parse_result', {})

-     if not parse_result or 'error' in parse_result:
-         # No results or parse error - skip chaining
-         _update_job(jid, chained=True)
-         _append_worker_log(f"job {jid}: no parse results, skipping chain")
+     if not parse_result:
+         # No parse results - this shouldn't happen if job was properly marked chainable
+         # Log warning and store reason for debugging
+         logger.warning("Job marked chainable but has no parse_result", extra={
+             "job_id": jid,
+             "tool": tool,
+             "status": job_to_chain.get('status')
+         })
+         _append_worker_log(f"job {jid}: WARNING - marked chainable but parse_result is empty/missing")
+         _update_job(jid, chained=True, chain_skip_reason="parse_result missing")
+         return 1
+
+     if 'error' in parse_result:
+         # Parse had an error - log and skip
+         logger.warning("Job has parse error, skipping chaining", extra={
+             "job_id": jid,
+             "tool": tool,
+             "parse_error": parse_result.get('error')
+         })
+         _append_worker_log(f"job {jid}: parse error '{parse_result.get('error')}', skipping chain")
+         _update_job(jid, chained=True, chain_skip_reason=f"parse_error: {parse_result.get('error')}")
          return 1

      # Process auto-chaining
@@ -571,10 +796,35 @@ def _try_run_plugin(tool: str, target: str, args: List[str], label: str, log_pat
      cmd_spec = build_command_method(target, args or [], label or "", log_path)

      if cmd_spec is None:
-         # Plugin validation failed
-         with open(log_path, "a", encoding="utf-8", errors="replace") as fh:
-             fh.write("ERROR: Plugin validation failed (build_command returned None)\n")
-         return (True, 1)
+         # build_command returned None - check if plugin has run() method
+         # This allows plugins to signal "use run() instead" by returning None
+         run_method = getattr(plugin, "run", None)
+         if callable(run_method):
+             # Plugin wants to handle execution itself via run() method
+             sig = inspect.signature(run_method)
+             params = list(sig.parameters.keys())
+
+             try:
+                 if "log_path" in params:
+                     rc = run_method(target, args or [], label or "", log_path)
+                 elif "label" in params:
+                     rc = run_method(target, args or [], label or "")
+                 elif "args" in params:
+                     rc = run_method(target, args or [])
+                 else:
+                     rc = run_method(target)
+                 return (True, rc if isinstance(rc, int) else 0)
+             except Exception as e:
+                 with open(log_path, "a", encoding="utf-8", errors="replace") as fh:
+                     fh.write(f"\n=== PLUGIN RUN ERROR ===\n")
+                     fh.write(f"{type(e).__name__}: {e}\n")
+                     fh.write(f"\n{traceback.format_exc()}\n")
+                 return (True, 1)
+         else:
+             # No run() method either - actual validation failure
+             with open(log_path, "a", encoding="utf-8", errors="replace") as fh:
+                 fh.write("ERROR: Plugin validation failed (build_command returned None)\n")
+             return (True, 1)

      # Execute using new subprocess handler with PID tracking
      rc = _run_subprocess_with_spec(cmd_spec, log_path, jid=jid, plugin=plugin)
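
With this change a plugin may return None from build_command and expose a run() method instead; the dispatcher inspects run()'s parameters to decide how many arguments to pass. A hedged sketch of a plugin that would take this path — the class shape is an assumption for illustration, only the signature-based dispatch mirrors the diff:

    import inspect
    import tempfile
    from typing import List

    class EchoPlugin:
        def build_command(self, target: str, args: List[str], label: str, log_path: str):
            return None  # signal: "call run() instead of building a command line"

        def run(self, target: str, args: List[str], label: str, log_path: str) -> int:
            with open(log_path, "a", encoding="utf-8") as fh:
                fh.write(f"echo plugin ran against {target} ({label})\n")
            return 0

    def dispatch_run(plugin, target, args, label, log_path) -> int:
        # Core of the fallback above: match the call to whatever run() accepts
        run_method = getattr(plugin, "run", None)
        if not callable(run_method):
            return 1
        params = list(inspect.signature(run_method).parameters.keys())
        if "log_path" in params:
            return run_method(target, args or [], label or "", log_path)
        if "label" in params:
            return run_method(target, args or [], label or "")
        if "args" in params:
            return run_method(target, args or [])
        return run_method(target)

    tmp = tempfile.NamedTemporaryFile(suffix=".log", delete=False)
    tmp.close()
    print(dispatch_run(EchoPlugin(), "198.51.100.10", [], "demo", tmp.name))  # 0
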
@@ -773,6 +1023,55 @@ def _store_msf_session(jid: int, target: str, exploit_path: str, session_id: str
          _append_worker_log(f"job {jid}: session storage error: {e}")


+ # Cache stdbuf availability check
+ _stdbuf_available = None
+
+
+ def _is_stdbuf_available() -> bool:
+     """Check if stdbuf is available for line-buffered output."""
+     global _stdbuf_available
+     if _stdbuf_available is None:
+         _stdbuf_available = shutil.which('stdbuf') is not None
+     return _stdbuf_available
+
+
+ def _wrap_cmd_for_line_buffering(cmd: List[str]) -> List[str]:
+     """
+     Wrap a command with stdbuf for line-buffered output when available.
+
+     This ensures output is written line-by-line instead of in 4-8KB blocks,
+     improving real-time log monitoring and ensuring output is captured
+     before process termination.
+
+     Args:
+         cmd: Command to wrap
+
+     Returns:
+         Command wrapped with stdbuf if available, original command otherwise
+     """
+     if not cmd:
+         return cmd
+
+     if _is_stdbuf_available():
+         # stdbuf -oL = line-buffered stdout, -eL = line-buffered stderr
+         return ['stdbuf', '-oL', '-eL'] + cmd
+
+     return cmd
+
+
+ def _get_subprocess_env() -> Dict[str, str]:
+     """
+     Get environment for subprocess with buffering disabled.
+
+     Sets PYTHONUNBUFFERED=1 for Python subprocesses and TERM=dumb
+     to prevent interactive terminal issues.
+     """
+     env = os.environ.copy()
+     env['TERM'] = 'dumb' # Prevent stty errors from interactive tools
+     env['PYTHONUNBUFFERED'] = '1' # Disable Python output buffering
+     return env
+
+
  def _run_subprocess_with_spec(cmd_spec: Dict[str, Any], log_path: str, jid: int = None, plugin=None) -> int:
      """
      Execute a command specification with proper PID tracking.
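
Both helpers are small enough to show their effect directly; stdbuf wrapping only happens when the binary is actually on PATH, so the wrapped form below is what you would see on a typical Linux host (the reimplementation here is a sketch of the same idea, not souleyez's own module):

    import os
    import shutil

    def wrap_for_line_buffering(cmd):
        # Same idea as _wrap_cmd_for_line_buffering above
        if cmd and shutil.which("stdbuf"):
            return ["stdbuf", "-oL", "-eL"] + cmd
        return cmd

    def subprocess_env():
        # Same idea as _get_subprocess_env above
        env = os.environ.copy()
        env["TERM"] = "dumb"           # keep interactive tools from probing the terminal
        env["PYTHONUNBUFFERED"] = "1"  # Python children flush output as they go
        return env

    print(wrap_for_line_buffering(["ping", "-c", "1", "127.0.0.1"]))
    # ['stdbuf', '-oL', '-eL', 'ping', '-c', '1', '127.0.0.1'] when stdbuf is installed
    env = subprocess_env()
    print(env["TERM"], env["PYTHONUNBUFFERED"])  # dumb 1
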
@@ -814,32 +1113,35 @@ def _run_subprocess_with_spec(cmd_spec: Dict[str, Any], log_path: str, jid: int
          with open(log_path, "a", encoding="utf-8", errors="replace") as fh:
              fh.write("ERROR: No command provided in spec\n")
          return 1
-
+
      timeout = cmd_spec.get('timeout', JOB_TIMEOUT_SECONDS)
-     env = cmd_spec.get('env')
+     spec_env = cmd_spec.get('env')
      cwd = cmd_spec.get('cwd')
      needs_shell = cmd_spec.get('needs_shell', False)
-
+
      _append_worker_log(f"_run_subprocess_with_spec: timeout={timeout}s for job {jid}")
-
-     # Prepare environment
-     # Set TERM=dumb to prevent stty errors from interactive tools like msfconsole
-     proc_env = os.environ.copy()
-     proc_env['TERM'] = 'dumb'
-     if env:
-         proc_env.update(env)
-
+
+     # Wrap command with stdbuf for line-buffered output (unless shell mode)
+     original_cmd = cmd
+     if not needs_shell:
+         cmd = _wrap_cmd_for_line_buffering(cmd)
+
+     # Prepare environment with PYTHONUNBUFFERED=1 and TERM=dumb
+     proc_env = _get_subprocess_env()
+     if spec_env:
+         proc_env.update(spec_env)
+
      with open(log_path, "a", encoding="utf-8", errors="replace") as fh:
          fh.write("=== Command Execution (build_command) ===\n")
-         fh.write(f"Command: {' '.join(cmd)}\n")
+         fh.write(f"Command: {' '.join(original_cmd)}\n")
          fh.write(f"Timeout: {timeout} seconds\n")
-         if env:
-             fh.write(f"Environment: {env}\n")
+         if spec_env:
+             fh.write(f"Environment: {spec_env}\n")
          if cwd:
              fh.write(f"Working Dir: {cwd}\n")
          fh.write(f"Started: {time.strftime('%Y-%m-%d %H:%M:%S UTC', time.gmtime())}\n\n")
          fh.flush()
-
+
          try:
              # Create new process group so all children can be killed together
              # Redirect stdin to /dev/null to prevent password prompts from hanging
@@ -849,16 +1151,17 @@ def _run_subprocess_with_spec(cmd_spec: Dict[str, Any], log_path: str, jid: int
                  stdout=fh,
                  stderr=subprocess.STDOUT,
                  preexec_fn=os.setsid, # Creates new session
-                 env=proc_env, # Always use proc_env (includes TERM=dumb)
+                 env=proc_env,
                  cwd=cwd,
                  shell=needs_shell # nosec B602 - intentional for security tool command execution
              )

-             # Store PID if job ID provided
+             # Store PID and process start time for stale detection
              if jid is not None:
-                 _update_job(jid, pid=proc.pid)
+                 proc_start_time = _get_process_start_time(proc.pid)
+                 _update_job(jid, pid=proc.pid, process_start_time=proc_start_time)
                  _append_worker_log(f"job {jid}: running with PID {proc.pid}")
-
+
              # Wait for process with timeout
              try:
                  proc.wait(timeout=timeout)
@@ -890,6 +1193,7 @@ def _run_subprocess_with_spec(cmd_spec: Dict[str, Any], log_path: str, jid: int
                      return 0
                  else:
                      fh.write(f"\nERROR: Command timed out after {timeout} seconds\n")
+                     fh.flush()
                      return 124

              # Check if job was killed externally during execution
@@ -912,17 +1216,21 @@ def _run_subprocess_with_spec(cmd_spec: Dict[str, Any], log_path: str, jid: int
                      proc.wait(timeout=5)
                  except:
                      pass
+                 fh.flush()
                  return 143 # 128 + 15 (SIGTERM)
-
+
              fh.write(f"\n=== Completed: {time.strftime('%Y-%m-%d %H:%M:%S UTC', time.gmtime())} ===\n")
              fh.write(f"Exit Code: {proc.returncode}\n")
+             fh.flush()
              return proc.returncode
-
+
          except FileNotFoundError:
              fh.write(f"\nERROR: Tool not found: {cmd[0]}\n")
+             fh.flush()
              return 127
          except Exception as e:
              fh.write(f"\nERROR: {type(e).__name__}: {e}\n")
+             fh.flush()
              return 1

@@ -937,9 +1245,14 @@ def _run_subprocess(tool: str, target: str, args: List[str], log_path: str, jid:
      cmd = [tool] + (args or [])
      cmd = [c.replace("<target>", target) for c in cmd]

+     # Wrap command with stdbuf for line-buffered output
+     cmd = _wrap_cmd_for_line_buffering(cmd)
+
      with open(log_path, "a", encoding="utf-8", errors="replace") as fh:
+         # Log original command (without stdbuf wrapper for clarity)
+         original_cmd = cmd[3:] if cmd[:3] == ['stdbuf', '-oL', '-eL'] else cmd
          fh.write("=== Subprocess Execution ===\n")
-         fh.write(f"Command: {' '.join(cmd)}\n")
+         fh.write(f"Command: {' '.join(original_cmd)}\n")
          fh.write(f"Timeout: {timeout} seconds\n")
          fh.write(f"Started: {time.strftime('%Y-%m-%d %H:%M:%S UTC', time.gmtime())}\n\n")
          fh.flush()
@@ -947,9 +1260,8 @@ def _run_subprocess(tool: str, target: str, args: List[str], log_path: str, jid:
          try:
              # Create new process group so all children can be killed together
              # Redirect stdin to /dev/null to prevent password prompts from hanging
-             # Set TERM=dumb to prevent stty errors from interactive tools like msfconsole
-             env = os.environ.copy()
-             env['TERM'] = 'dumb'
+             # Use env with PYTHONUNBUFFERED=1 and TERM=dumb
+             env = _get_subprocess_env()

              proc = subprocess.Popen(
                  cmd,
@@ -960,9 +1272,10 @@ def _run_subprocess(tool: str, target: str, args: List[str], log_path: str, jid:
                  env=env
              )

-             # Store PID if job ID provided
+             # Store PID and process start time for stale detection
              if jid is not None:
-                 _update_job(jid, pid=proc.pid)
+                 proc_start_time = _get_process_start_time(proc.pid)
+                 _update_job(jid, pid=proc.pid, process_start_time=proc_start_time)
                  _append_worker_log(f"job {jid}: running with PID {proc.pid}")

              # Wait for process with timeout
@@ -977,6 +1290,7 @@ def _run_subprocess(tool: str, target: str, args: List[str], log_path: str, jid:
                  proc.kill() # Fallback to single process
                  proc.wait()
                  fh.write(f"\nERROR: Command timed out after {timeout} seconds\n")
+                 fh.flush()
                  return 124

              # Check if job was killed externally during execution
@@ -999,17 +1313,21 @@ def _run_subprocess(tool: str, target: str, args: List[str], log_path: str, jid:
                      proc.wait(timeout=5)
                  except:
                      pass
+                 fh.flush()
                  return 143 # 128 + 15 (SIGTERM)

              fh.write(f"\n=== Completed: {time.strftime('%Y-%m-%d %H:%M:%S UTC', time.gmtime())} ===\n")
              fh.write(f"Exit Code: {proc.returncode}\n")
+             fh.flush()
              return proc.returncode

          except FileNotFoundError:
              fh.write(f"\nERROR: Tool not found: {cmd[0]}\n")
+             fh.flush()
              return 127
          except Exception as e:
              fh.write(f"\nERROR: {type(e).__name__}: {e}\n")
+             fh.flush()
              return 1

@@ -1145,77 +1463,110 @@ def run_job(jid: int) -> None:
          # Re-fetch job to get updated data
          job = get_job(jid)
          parse_result = handle_job_result(job)
-         if parse_result:
-             if 'error' in parse_result:
-                 logger.warning("Job parse error", extra={
+
+         # Handle parse failure cases
+         if parse_result is None:
+             # Parser returned None - likely missing log file, no parser for tool, or missing engagement
+             logger.error("Job parse returned None - results may be lost", extra={
+                 "job_id": jid,
+                 "tool": job.get('tool'),
+                 "log_exists": os.path.exists(job.get('log', '')) if job.get('log') else False
+             })
+             _append_worker_log(f"job {jid} parse returned None (tool={job.get('tool')}) - check if parser exists")
+             # Update job to indicate parse failure
+             _update_job(jid, status=STATUS_WARNING, parse_result={'error': 'Parser returned None - no results extracted'})
+             # Mark as chained to prevent infinite retry
+             _update_job(jid, chained=True)
+             return
+
+         if 'error' in parse_result:
+             logger.error("Job parse error - results may be incomplete", extra={
+                 "job_id": jid,
+                 "error": parse_result['error']
+             })
+             _append_worker_log(f"job {jid} parse error: {parse_result['error']}")
+             # Update job status to warning with the error
+             _update_job(jid, status=STATUS_WARNING, parse_result=parse_result)
+             # Mark as chained to prevent infinite retry
+             _update_job(jid, chained=True)
+             return
+
+         # Parse succeeded
+         logger.info("Job parsed successfully", extra={
+             "job_id": jid,
+             "parse_result": parse_result
+         })
+         _append_worker_log(f"job {jid} parsed: {parse_result}")
+
+         # Determine chainable status BEFORE updating to avoid race condition
+         # We must set parse_result and chainable in a single atomic update
+         try:
+             from souleyez.core.tool_chaining import ToolChaining
+             chaining = ToolChaining()
+
+             # Get current job to check status
+             job = get_job(jid)
+             job_status = job.get('status', STATUS_ERROR)
+
+             # Determine final status from parser if provided
+             final_status = parse_result.get('status', job_status)
+
+             # Check if job should be chainable
+             should_chain = (
+                 chaining.is_enabled() and
+                 parse_result and
+                 'error' not in parse_result and
+                 is_chainable(final_status)
+             )
+
+             # Build update dict - ATOMIC update of parse_result + chainable
+             update_fields = {'parse_result': parse_result}
+
+             if 'status' in parse_result:
+                 update_fields['status'] = final_status
+                 logger.info("Job status updated from parser", extra={
                      "job_id": jid,
-                     "error": parse_result['error']
+                     "status": final_status
                  })
-                 _append_worker_log(f"job {jid} parse error: {parse_result['error']}")
+                 _append_worker_log(f"job {jid} status updated to: {final_status}")
+
+             if should_chain:
+                 update_fields['chainable'] = True
              else:
-                 logger.info("Job parsed successfully", extra={
-                     "job_id": jid,
-                     "parse_result": parse_result
-                 })
-                 _append_worker_log(f"job {jid} parsed: {parse_result}")
+                 # Not chainable - mark as chained to skip
+                 update_fields['chained'] = True

-             # Update status based on parse result if provided
-             if 'status' in parse_result:
-                 final_status = parse_result['status']
-                 _update_job(jid, status=final_status, parse_result=parse_result)
-                 logger.info("Job status updated from parser", extra={
+             # Single atomic update to prevent race condition
+             _update_job(jid, **update_fields)
+
+             # Log chaining decision
+             if should_chain:
+                 if final_status == STATUS_WARNING:
+                     logger.info("Job with warning status marked for chaining", extra={
                          "job_id": jid,
-                     "status": final_status
+                         "tool": job.get('tool'),
+                         "wildcard_detected": parse_result.get('wildcard_detected', False)
                      })
-                 _append_worker_log(f"job {jid} status updated to: {final_status}")
+                     _append_worker_log(f"job {jid} (status=warning) marked as chainable")
                  else:
-                 # Store parse result in job for dashboard display (no status update)
-                 _update_job(jid, parse_result=parse_result)
-
-         # Mark job as chainable instead of chaining immediately
-         # Worker loop will process it when database is idle
-         try:
-             from souleyez.core.tool_chaining import ToolChaining
-             chaining = ToolChaining()
-
-             # Re-fetch job to get updated status
-             job = get_job(jid)
-             job_status = job.get('status', STATUS_ERROR)
-
-             # Check if status is chainable (done, no_results, warning)
-             if chaining.is_enabled() and parse_result and 'error' not in parse_result and is_chainable(job_status):
-                 # Mark for deferred chaining
-                 _update_job(jid, chainable=True)
-
-                 # Log special handling for warning status
-                 if job_status == STATUS_WARNING:
-                     logger.info("Job with warning status marked for chaining", extra={
-                         "job_id": jid,
-                         "tool": job.get('tool'),
-                         "wildcard_detected": parse_result.get('wildcard_detected', False)
-                     })
-                     _append_worker_log(f"job {jid} (status=warning) marked as chainable")
-                 else:
-                     logger.info("Job marked as chainable", extra={
-                         "job_id": jid,
-                         "tool": job.get('tool'),
-                         "status": job_status
-                     })
-                     _append_worker_log(f"job {jid} marked as chainable (status={job_status})")
-             else:
-                 # Chaining disabled or job has errors - mark as chained (skip)
-                 _update_job(jid, chained=True)
-                 reason = f"chaining_disabled={not chaining.is_enabled()}, has_error={'error' in parse_result}, status={job_status}"
-                 _append_worker_log(f"job {jid} not chainable ({reason})")
-
-         except Exception as chain_err:
-             logger.error("Failed to mark job as chainable", extra={
+                     logger.info("Job marked as chainable", extra={
                          "job_id": jid,
-                 "error": str(chain_err)
+                         "tool": job.get('tool'),
+                         "status": final_status
                      })
-             _append_worker_log(f"job {jid} chainable marking error: {chain_err}")
-             # Mark as chained to prevent retry loops
-             _update_job(jid, chained=True, chain_error=str(chain_err))
+                     _append_worker_log(f"job {jid} marked as chainable (status={final_status})")
+             else:
+                 reason = f"chaining_disabled={not chaining.is_enabled()}, has_error={'error' in parse_result}, status={final_status}"
+                 _append_worker_log(f"job {jid} not chainable ({reason})")
+
+         except Exception as chain_err:
+             logger.error("Failed to mark job as chainable", extra={
+                 "job_id": jid,
+                 "error": str(chain_err)
+             })
+             _append_worker_log(f"job {jid} chainable marking error: {chain_err}")
+             # Mark as chained to prevent retry loops
+             _update_job(jid, chained=True, chain_error=str(chain_err))

      except Exception as e:
          logger.error("Job parse exception", extra={
@@ -1378,18 +1729,46 @@ def _detect_and_recover_stale_jobs() -> int:
          pid = job.get('pid')
          tool = job.get('tool', 'unknown')
          log_path = job.get('log')
+         stored_start_time = job.get('process_start_time')

-         # Skip if PID is still alive
+         # Check if PID is alive
          if _is_pid_alive(pid):
-             continue
-
-         # PID is dead - this is a stale job
-         _append_worker_log(f"job {jid}: detected stale (PID {pid} is dead)")
-         logger.warning("Stale job detected", extra={
-             "job_id": jid,
-             "tool": tool,
-             "pid": pid
-         })
+             # PID is alive - but check for PID reuse
+             if stored_start_time is not None:
+                 current_start_time = _get_process_start_time(pid)
+                 if current_start_time is not None:
+                     # Allow 2 second tolerance for timing differences
+                     if abs(current_start_time - stored_start_time) > 2:
+                         # PID reused by different process
+                         _append_worker_log(
+                             f"job {jid}: PID {pid} reused (stored start: {stored_start_time:.0f}, "
+                             f"current: {current_start_time:.0f})"
+                         )
+                         logger.warning("PID reuse detected", extra={
+                             "job_id": jid,
+                             "tool": tool,
+                             "pid": pid,
+                             "stored_start_time": stored_start_time,
+                             "current_start_time": current_start_time
+                         })
+                         # Fall through to stale job handling
+                     else:
+                         # Same process, still running
+                         continue
+                 else:
+                     # Can't get current start time, assume still valid
+                     continue
+             else:
+                 # No stored start time (old job), assume still valid
+                 continue
+         else:
+             # PID is dead - definitely stale
+             _append_worker_log(f"job {jid}: detected stale (PID {pid} is dead)")
+             logger.warning("Stale job detected", extra={
+                 "job_id": jid,
+                 "tool": tool,
+                 "pid": pid
+             })

          # Check if log shows completion
          completed, exit_code = _check_log_for_completion(log_path, tool)
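
The start-time comparison is what separates "our process is still running" from "the kernel recycled that PID for something else". A condensed sketch of the check (Linux only; for brevity it compares seconds-since-boot straight from /proc rather than converting to a Unix timestamp as the diff does, which does not change the comparison):

    import os
    from typing import Optional

    def start_seconds_since_boot(pid: int) -> Optional[float]:
        # starttime is field 22 of /proc/<pid>/stat; split after the ')' that ends the comm field
        try:
            with open(f"/proc/{pid}/stat", "r") as f:
                fields = f.read().rsplit(")", 1)[1].split()
            return int(fields[19]) / os.sysconf(os.sysconf_names["SC_CLK_TCK"])
        except (OSError, IndexError, ValueError):
            return None

    def same_process(pid: int, recorded_start: float, tolerance: float = 2.0) -> bool:
        current = start_seconds_since_boot(pid)
        return current is not None and abs(current - recorded_start) <= tolerance

    me = os.getpid()
    recorded = start_seconds_since_boot(me)   # what a job record would have stored at spawn time
    print(recorded is not None and same_process(me, recorded))
    # True; a recycled PID would fall far outside the 2-second window
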
@@ -1412,6 +1791,8 @@ def _detect_and_recover_stale_jobs() -> int:
          # Try to parse results
          try:
              from .result_handler import handle_job_result
+             from souleyez.core.tool_chaining import ToolChaining
+
              job = get_job(jid)
              parse_result = handle_job_result(job)

@@ -1419,36 +1800,34 @@ def _detect_and_recover_stale_jobs() -> int:
                  if 'error' in parse_result:
                      _append_worker_log(f"job {jid} stale recovery parse error: {parse_result['error']}")
                  else:
-                     # Update status from parser if provided
+                     # Determine final status and chainable in one check
+                     final_status = parse_result.get('status', status)
+                     chaining = ToolChaining()
+                     should_chain = chaining.is_enabled() and is_chainable(final_status)
+
+                     # Build atomic update - parse_result + status + chainable together
+                     update_fields = {'parse_result': parse_result}
                      if 'status' in parse_result:
-                         status = parse_result['status']
-                         _update_job(jid, status=status, parse_result=parse_result)
-                     else:
-                         _update_job(jid, parse_result=parse_result)
+                         update_fields['status'] = final_status
+                     if should_chain:
+                         update_fields['chainable'] = True
+
+                     # Single atomic update to prevent race condition
+                     _update_job(jid, **update_fields)

                      _append_worker_log(f"job {jid} stale recovery parsed: {parse_result.get('findings_added', 0)} findings")

                      logger.info("Stale job recovered with results", extra={
                          "job_id": jid,
                          "tool": tool,
-                         "status": status,
-                         "parse_result": parse_result
+                         "status": final_status,
+                         "parse_result": parse_result,
+                         "chainable": should_chain
                      })

-                     # Mark for auto-chaining if conditions are met
-                     try:
-                         from souleyez.core.tool_chaining import ToolChaining
-                         chaining = ToolChaining()
-                         if chaining.is_enabled() and is_chainable(status):
-                             _update_job(jid, chainable=True)
-                             _append_worker_log(f"job {jid} stale recovery marked as chainable")
-                             logger.info("Stale job marked as chainable", extra={
-                                 "job_id": jid,
-                                 "tool": tool,
-                                 "status": status
-                             })
-                     except Exception as chain_err:
-                         _append_worker_log(f"job {jid} stale recovery chainable error: {chain_err}")
+                     if should_chain:
+                         _append_worker_log(f"job {jid} stale recovery marked as chainable")
+
          except Exception as parse_err:
              _append_worker_log(f"job {jid} stale recovery parse exception: {parse_err}")

@@ -1608,26 +1987,85 @@ def _check_msf_exploitation_success():
      return 0


+ def _update_job_progress():
+     """
+     Update progress tracking for running jobs.
+
+     Checks log file modification times and flags jobs with no recent output
+     as possibly hung (no output for JOB_HUNG_THRESHOLD seconds).
+     """
+     try:
+         jobs = _read_jobs()
+         running_jobs = [j for j in jobs if j.get('status') == STATUS_RUNNING]
+
+         for job in running_jobs:
+             jid = job.get('id')
+             log_path = job.get('log')
+
+             if not log_path or not os.path.exists(log_path):
+                 continue
+
+             try:
+                 # Get log file modification time
+                 mtime = os.path.getmtime(log_path)
+                 current_time = time.time()
+                 time_since_output = current_time - mtime
+
+                 # Update last_output_at in job record
+                 updates = {'last_output_at': mtime}
+
+                 # Flag as possibly hung if no output for threshold
+                 was_hung = job.get('possibly_hung', False)
+                 is_hung = time_since_output > JOB_HUNG_THRESHOLD
+
+                 if is_hung != was_hung:
+                     updates['possibly_hung'] = is_hung
+                     if is_hung:
+                         _append_worker_log(
+                             f"job {jid}: no output for {int(time_since_output)}s, flagged as possibly hung"
+                         )
+                         logger.warning("Job possibly hung", extra={
+                             "job_id": jid,
+                             "tool": job.get('tool'),
+                             "time_since_output": int(time_since_output)
+                         })
+
+                 _update_job(jid, **updates)
+
+             except Exception as e:
+                 # Non-critical, just skip this job
+                 pass
+
+     except Exception as e:
+         logger.error("Job progress tracking error", extra={"error": str(e)})
+
+
  def worker_loop(poll_interval: float = 2.0):
      """
      Main worker loop that processes jobs and handles auto-chaining.

      Loop behavior:
-     1. Detect and recover stale jobs (dead PIDs)
-     2. Check for running jobs
-     3. If none running, start next queued job
-     4. Process one chainable job (if any)
-     5. Sleep poll_interval seconds, repeat
+     1. Update heartbeat for health monitoring
+     2. Detect and recover stale jobs (dead PIDs)
+     3. Update progress tracking for running jobs
+     4. Check for running jobs
+     5. If none running, start next queued job
+     6. Process one chainable job (if any)
+     7. Sleep poll_interval seconds, repeat

      Args:
          poll_interval: Seconds to sleep between iterations (default: 2.0)
      """
      _ensure_dirs()
+     _update_heartbeat() # Initial heartbeat
      _append_worker_log("souleyez background worker: starting loop")

-     # Track last stale job check time (check every 30 seconds, not every iteration)
+     # Track last stale job check time (check every 15 seconds, not every iteration)
      last_stale_check = 0
-     stale_check_interval = 30 # seconds
+     stale_check_interval = 15 # seconds (reduced from 30s for faster detection)
+
+     # Track last heartbeat time
+     last_heartbeat = time.time()

      # Run stale job detection on startup
      try:
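
Because both the heartbeat file and the possibly_hung flag end up on disk, an outside observer (a dashboard, a cron check) can judge worker health without touching the process. A hedged sketch of such a check — the file locations are passed in explicitly since the concrete souleyez data paths are not shown in this diff, and the "running" status string is an assumption:

    import json
    import time

    def worker_report(heartbeat_file: str, jobs_file: str, stale_after: float = 30) -> dict:
        # Heartbeat age: the loop above rewrites this file roughly every HEARTBEAT_INTERVAL seconds
        try:
            with open(heartbeat_file) as fh:
                heartbeat_age = time.time() - float(fh.read().strip())
        except (OSError, ValueError):
            heartbeat_age = None

        # Jobs that _update_job_progress flagged after seeing no log output for too long
        try:
            with open(jobs_file, encoding="utf-8") as fh:
                jobs = json.load(fh)
        except (OSError, ValueError):
            jobs = []
        hung = [j.get("id") for j in jobs
                if j.get("status") == "running" and j.get("possibly_hung")]

        return {
            "worker_alive": heartbeat_age is not None and heartbeat_age <= stale_after,
            "heartbeat_age_seconds": heartbeat_age,
            "possibly_hung_jobs": hung,
        }

    # Example: print(worker_report("/path/to/.worker_heartbeat", "/path/to/jobs.json"))
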
@@ -1639,8 +2077,14 @@ def worker_loop(poll_interval: float = 2.0):

      try:
          while True:
-             # Periodic stale job detection (every 30 seconds)
              current_time = time.time()
+
+             # Update heartbeat every HEARTBEAT_INTERVAL seconds
+             if current_time - last_heartbeat >= HEARTBEAT_INTERVAL:
+                 _update_heartbeat()
+                 last_heartbeat = current_time
+
+             # Periodic stale job detection (every 15 seconds)
              if current_time - last_stale_check >= stale_check_interval:
                  try:
                      recovered = _detect_and_recover_stale_jobs()
@@ -1650,6 +2094,12 @@ def worker_loop(poll_interval: float = 2.0):
                      _append_worker_log(f"stale job detection error: {e}")
                  last_stale_check = current_time

+             # Update progress tracking for running jobs
+             try:
+                 _update_job_progress()
+             except Exception as e:
+                 _append_worker_log(f"progress tracking error: {e}")
+
              # Check running MSF jobs for exploitation success (every iteration)
              try:
                  detected = _check_msf_exploitation_success()