souleyez 2.26.0__py3-none-any.whl → 2.27.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of souleyez might be problematic.
- souleyez/__init__.py +1 -1
- souleyez/docs/README.md +1 -1
- souleyez/docs/user-guide/configuration.md +1 -1
- souleyez/engine/background.py +617 -167
- souleyez/engine/result_handler.py +173 -1
- souleyez/engine/worker_manager.py +98 -2
- souleyez/main.py +1 -1
- souleyez/plugins/http_fingerprint.py +8 -2
- {souleyez-2.26.0.dist-info → souleyez-2.27.0.dist-info}/METADATA +3 -3
- {souleyez-2.26.0.dist-info → souleyez-2.27.0.dist-info}/RECORD +14 -14
- {souleyez-2.26.0.dist-info → souleyez-2.27.0.dist-info}/WHEEL +0 -0
- {souleyez-2.26.0.dist-info → souleyez-2.27.0.dist-info}/entry_points.txt +0 -0
- {souleyez-2.26.0.dist-info → souleyez-2.27.0.dist-info}/licenses/LICENSE +0 -0
- {souleyez-2.26.0.dist-info → souleyez-2.27.0.dist-info}/top_level.txt +0 -0
souleyez/engine/background.py
CHANGED
@@ -25,6 +25,7 @@ import subprocess
 import threading
 import inspect
 import traceback
+import fcntl
 from typing import List, Dict, Optional, Any
 from souleyez.log_config import get_logger
 from .log_sanitizer import LogSanitizer
@@ -41,7 +42,12 @@ JOBS_DIR = os.path.join(DATA_DIR, "jobs")
 LOGS_DIR = os.path.join(DATA_DIR, "logs")
 JOBS_FILE = os.path.join(JOBS_DIR, "jobs.json")
 WORKER_LOG = os.path.join(LOGS_DIR, "worker.log")
+HEARTBEAT_FILE = os.path.join(JOBS_DIR, ".worker_heartbeat")
 JOB_TIMEOUT_SECONDS = 3600  # 1 hour (changed from 300s/5min)
+HEARTBEAT_INTERVAL = 10  # seconds between heartbeat writes
+HEARTBEAT_STALE_THRESHOLD = 30  # seconds before heartbeat considered stale
+JOB_HUNG_THRESHOLD = 300  # 5 minutes with no output = possibly hung
+JOBS_BACKUP_COUNT = 3  # Number of rotating backups to keep
 
 _lock = threading.RLock()  # Reentrant lock allows nested acquisition by same thread
 
@@ -51,6 +57,63 @@ def _ensure_dirs():
     os.makedirs(LOGS_DIR, exist_ok=True)
 
 
+def _get_backup_files() -> List[str]:
+    """Get list of backup files sorted by modification time (newest first)."""
+    backups = []
+    for i in range(1, JOBS_BACKUP_COUNT + 1):
+        backup_path = f"{JOBS_FILE}.bak.{i}"
+        if os.path.exists(backup_path):
+            backups.append((os.path.getmtime(backup_path), backup_path))
+    # Sort by mtime descending (newest first)
+    backups.sort(reverse=True)
+    return [path for _, path in backups]
+
+
+def _rotate_backups():
+    """Rotate backup files, keeping only JOBS_BACKUP_COUNT backups."""
+    # Shift existing backups: .bak.2 -> .bak.3, .bak.1 -> .bak.2
+    for i in range(JOBS_BACKUP_COUNT, 1, -1):
+        src = f"{JOBS_FILE}.bak.{i - 1}"
+        dst = f"{JOBS_FILE}.bak.{i}"
+        if os.path.exists(src):
+            try:
+                shutil.move(src, dst)
+            except Exception:
+                pass
+
+    # Create new .bak.1 from current jobs.json
+    if os.path.exists(JOBS_FILE):
+        try:
+            shutil.copy2(JOBS_FILE, f"{JOBS_FILE}.bak.1")
+        except Exception:
+            pass
+
+
+def _recover_from_backup() -> List[Dict[str, Any]]:
+    """
+    Attempt to recover jobs from backup files.
+
+    Returns:
+        List of jobs from the first valid backup, or empty list if no valid backup found
+    """
+    backups = _get_backup_files()
+    for backup_path in backups:
+        try:
+            with open(backup_path, "r", encoding="utf-8") as fh:
+                jobs = json.load(fh)
+            if isinstance(jobs, list):
+                _append_worker_log(f"recovered {len(jobs)} jobs from backup: {backup_path}")
+                logger.info("Jobs recovered from backup", extra={
+                    "backup_path": backup_path,
+                    "job_count": len(jobs)
+                })
+                return jobs
+        except Exception as e:
+            _append_worker_log(f"backup {backup_path} also corrupt: {e}")
+            continue
+    return []
+
+
 def _read_jobs() -> List[Dict[str, Any]]:
     _ensure_dirs()
     if not os.path.exists(JOBS_FILE):
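The hunk above adds `_get_backup_files`, `_rotate_backups`, and `_recover_from_backup`, which keep up to `JOBS_BACKUP_COUNT` copies of `jobs.json` and fall back to the newest readable copy when the live file is corrupt. A minimal standalone sketch of the same rotate-then-recover pattern (hypothetical paths, not the package's own code):

```python
import json, os, shutil

JOBS_FILE = "/tmp/demo_jobs.json"   # hypothetical path, for illustration only
BACKUP_COUNT = 3

def rotate_backups():
    # Shift .bak.2 -> .bak.3, .bak.1 -> .bak.2, then copy the live file to .bak.1
    for i in range(BACKUP_COUNT, 1, -1):
        src, dst = f"{JOBS_FILE}.bak.{i - 1}", f"{JOBS_FILE}.bak.{i}"
        if os.path.exists(src):
            shutil.move(src, dst)
    if os.path.exists(JOBS_FILE):
        shutil.copy2(JOBS_FILE, f"{JOBS_FILE}.bak.1")

def recover():
    # Return the first backup that still parses as a JSON list, else an empty list
    for i in range(1, BACKUP_COUNT + 1):
        try:
            with open(f"{JOBS_FILE}.bak.{i}", "r", encoding="utf-8") as fh:
                jobs = json.load(fh)
            if isinstance(jobs, list):
                return jobs
        except Exception:
            continue
    return []
```

Calling `rotate_backups()` before every write (as `_write_jobs` now does) is what guarantees `recover()` always has a recent, known-good candidate.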
@@ -58,18 +121,42 @@ def _read_jobs() -> List[Dict[str, Any]]:
     try:
         with open(JOBS_FILE, "r", encoding="utf-8") as fh:
             return json.load(fh)
-    except Exception:
+    except Exception as e:
+        # Log corruption event
+        _append_worker_log(f"jobs.json corrupt: {e}")
+        logger.error("Jobs file corrupted", extra={
+            "error": str(e),
+            "jobs_file": JOBS_FILE
+        })
+
+        # Try to recover from backup
+        recovered_jobs = _recover_from_backup()
+
+        # Move corrupt file aside
         try:
             corrupt = JOBS_FILE + ".corrupt." + str(int(time.time()))
             shutil.move(JOBS_FILE, corrupt)
-            _append_worker_log(f"jobs file
+            _append_worker_log(f"corrupt jobs file moved to {corrupt}")
         except Exception:
             pass
-
+
+        # If we recovered jobs, write them back
+        if recovered_jobs:
+            try:
+                _write_jobs(recovered_jobs)
+                _append_worker_log(f"restored {len(recovered_jobs)} jobs from backup")
+            except Exception as write_err:
+                _append_worker_log(f"failed to restore jobs: {write_err}")
+
+        return recovered_jobs
 
 
 def _write_jobs(jobs: List[Dict[str, Any]]):
     _ensure_dirs()
+
+    # Rotate backups before writing (keeps last 3 good copies)
+    _rotate_backups()
+
     tmp = tempfile.NamedTemporaryFile("w", delete=False, dir=JOBS_DIR, encoding="utf-8")
     try:
         json.dump(jobs, tmp, indent=2, ensure_ascii=False)
@@ -93,36 +180,135 @@ def _append_worker_log(msg: str):
         fh.write(line)
 
 
+def _update_heartbeat():
+    """Write current timestamp to heartbeat file for health monitoring."""
+    _ensure_dirs()
+    try:
+        with open(HEARTBEAT_FILE, 'w') as fh:
+            fh.write(str(time.time()))
+    except Exception:
+        pass  # Non-critical, don't crash worker
+
+
+def get_heartbeat_age() -> Optional[float]:
+    """
+    Get age of worker heartbeat in seconds.
+
+    Returns:
+        Age in seconds, or None if heartbeat file doesn't exist
+    """
+    try:
+        if os.path.exists(HEARTBEAT_FILE):
+            with open(HEARTBEAT_FILE, 'r') as fh:
+                last_beat = float(fh.read().strip())
+            return time.time() - last_beat
+        return None
+    except Exception:
+        return None
+
+
+def is_heartbeat_stale() -> bool:
+    """Check if worker heartbeat is stale (older than threshold)."""
+    age = get_heartbeat_age()
+    if age is None:
+        return True  # No heartbeat = stale
+    return age > HEARTBEAT_STALE_THRESHOLD
+
+
+def _get_process_start_time(pid: int) -> Optional[float]:
+    """
+    Get process start time from /proc filesystem (Linux only).
+
+    Returns:
+        Process start time as Unix timestamp, or None if not available
+    """
+    try:
+        stat_path = f"/proc/{pid}/stat"
+        if not os.path.exists(stat_path):
+            return None
+
+        with open(stat_path, 'r') as f:
+            stat = f.read()
+
+        # Parse stat file - field 22 is starttime (in clock ticks since boot)
+        # Format: pid (comm) state ppid pgrp session tty_nr ... starttime ...
+        # Need to handle comm field which may contain spaces/parentheses
+        parts = stat.rsplit(')', 1)
+        if len(parts) < 2:
+            return None
+
+        fields = parts[1].split()
+        if len(fields) < 20:
+            return None
+
+        starttime_ticks = int(fields[19])  # 0-indexed, field 22 is at index 19 after comm
+
+        # Convert to timestamp using system boot time and clock ticks per second
+        with open('/proc/stat', 'r') as f:
+            for line in f:
+                if line.startswith('btime'):
+                    boot_time = int(line.split()[1])
+                    break
+            else:
+                return None
+
+        # Get clock ticks per second (usually 100)
+        ticks_per_sec = os.sysconf(os.sysconf_names['SC_CLK_TCK'])
+
+        return boot_time + (starttime_ticks / ticks_per_sec)
+    except Exception:
+        return None
+
+
 def _next_job_id(jobs: List[Dict[str, Any]]) -> int:
     """
-    Get next available job ID.
-
-    Uses a persistent counter to ensure IDs are never
-
+    Get next available job ID with file locking.
+
+    Uses a persistent counter with fcntl locking to ensure IDs are never
+    reused, even across multiple processes. This prevents duplicate job IDs
+    when multiple jobs are enqueued concurrently.
     """
     counter_file = os.path.join(JOBS_DIR, ".job_counter")
-
+    lock_file = os.path.join(JOBS_DIR, ".job_counter.lock")
+
     try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        _ensure_dirs()
+
+        # Use a separate lock file to allow atomic read-modify-write
+        with open(lock_file, 'w') as lock_fh:
+            # Acquire exclusive lock (blocks until available)
+            fcntl.flock(lock_fh.fileno(), fcntl.LOCK_EX)
+
+            try:
+                # Read current counter
+                if os.path.exists(counter_file):
+                    with open(counter_file, 'r') as f:
+                        next_id = int(f.read().strip())
+                else:
+                    # Initialize from existing jobs
+                    maxid = 0
+                    for j in jobs:
+                        try:
+                            if isinstance(j.get("id"), int) and j["id"] > maxid:
+                                maxid = j["id"]
+                        except Exception:
+                            continue
+                    next_id = maxid + 1
+
+                # Write incremented counter atomically
+                tmp_file = counter_file + '.tmp'
+                with open(tmp_file, 'w') as f:
+                    f.write(str(next_id + 1))
+                    f.flush()
+                    os.fsync(f.fileno())
+                os.replace(tmp_file, counter_file)
+
+                return next_id
+
+            finally:
+                # Release lock
+                fcntl.flock(lock_fh.fileno(), fcntl.LOCK_UN)
+
     except Exception:
         # Fallback to old behavior if file operations fail
         maxid = 0
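The rewritten `_next_job_id` serializes the read-increment-write of the `.job_counter` file behind an `fcntl.flock` on a companion lock file, so two processes enqueueing at once cannot hand out the same ID. A rough, Unix-only sketch of that locking pattern (hypothetical file names, not the package's paths):

```python
import fcntl, os

def next_id(counter_file="/tmp/demo_counter", lock_file="/tmp/demo_counter.lock"):
    with open(lock_file, "w") as lock_fh:
        fcntl.flock(lock_fh.fileno(), fcntl.LOCK_EX)  # blocks until the lock is free
        try:
            current = 1
            if os.path.exists(counter_file):
                with open(counter_file) as f:
                    current = int(f.read().strip())
            # Persist the incremented value atomically: temp file + os.replace
            tmp = counter_file + ".tmp"
            with open(tmp, "w") as f:
                f.write(str(current + 1))
                f.flush()
                os.fsync(f.fileno())
            os.replace(tmp, counter_file)
            return current
        finally:
            fcntl.flock(lock_fh.fileno(), fcntl.LOCK_UN)
```

Because the lock is held across both the read and the replace, concurrent callers see strictly increasing IDs; the `flock` approach does assume all writers run on the same host and filesystem.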
@@ -409,14 +595,36 @@ def purge_all_jobs() -> int:
     return purge_jobs(status_filter=['done', 'error', 'killed'])
 
 
-def _update_job(jid: int, **fields):
+def _update_job(jid: int, respect_killed: bool = True, **fields):
+    """
+    Update job fields atomically.
+
+    Args:
+        jid: Job ID to update
+        respect_killed: If True (default), don't overwrite status if job is killed.
+                        This prevents race condition where job is killed while completing.
+        **fields: Fields to update
+    """
     with _lock:
         jobs = _read_jobs()
         changed = False
         for j in jobs:
             if j.get("id") == jid:
-
-
+                # Race condition protection: don't change status of killed jobs
+                if respect_killed and j.get("status") == STATUS_KILLED and "status" in fields:
+                    # Job was killed - don't overwrite status, but allow other updates
+                    fields_copy = dict(fields)
+                    del fields_copy["status"]
+                    if fields_copy:
+                        j.update(fields_copy)
+                        changed = True
+                    logger.debug("Skipped status update for killed job", extra={
+                        "job_id": jid,
+                        "attempted_status": fields.get("status")
+                    })
+                else:
+                    j.update(fields)
+                    changed = True
                 break
         if changed:
            _write_jobs(jobs)
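The new `respect_killed` flag means a late completion update from the worker can no longer resurrect a job an operator already killed, while non-status fields still get applied. A self-contained toy version of that guard, under the assumption that the status constant is the string `"killed"` (the diff uses `STATUS_KILLED`):

```python
STATUS_KILLED = "killed"  # assumed value, for illustration only

def update_job(job: dict, respect_killed: bool = True, **fields) -> dict:
    """Apply field updates, refusing to overwrite a killed status by default."""
    if respect_killed and job.get("status") == STATUS_KILLED and "status" in fields:
        fields = {k: v for k, v in fields.items() if k != "status"}
    job.update(fields)
    return job

job = {"id": 42, "status": "killed"}
update_job(job, status="done", exit_code=0)
print(job["status"])   # still "killed"; only exit_code was written
```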
@@ -479,10 +687,27 @@ def _process_pending_chains():
     # Get parse results from job
     parse_result = job_to_chain.get('parse_result', {})
 
-    if not parse_result
-        # No results
-
-
+    if not parse_result:
+        # No parse results - this shouldn't happen if job was properly marked chainable
+        # Log warning and store reason for debugging
+        logger.warning("Job marked chainable but has no parse_result", extra={
+            "job_id": jid,
+            "tool": tool,
+            "status": job_to_chain.get('status')
+        })
+        _append_worker_log(f"job {jid}: WARNING - marked chainable but parse_result is empty/missing")
+        _update_job(jid, chained=True, chain_skip_reason="parse_result missing")
+        return 1
+
+    if 'error' in parse_result:
+        # Parse had an error - log and skip
+        logger.warning("Job has parse error, skipping chaining", extra={
+            "job_id": jid,
+            "tool": tool,
+            "parse_error": parse_result.get('error')
+        })
+        _append_worker_log(f"job {jid}: parse error '{parse_result.get('error')}', skipping chain")
+        _update_job(jid, chained=True, chain_skip_reason=f"parse_error: {parse_result.get('error')}")
         return 1
 
     # Process auto-chaining
@@ -571,10 +796,35 @@ def _try_run_plugin(tool: str, target: str, args: List[str], label: str, log_pat
     cmd_spec = build_command_method(target, args or [], label or "", log_path)
 
     if cmd_spec is None:
-        #
-
-
-
+        # build_command returned None - check if plugin has run() method
+        # This allows plugins to signal "use run() instead" by returning None
+        run_method = getattr(plugin, "run", None)
+        if callable(run_method):
+            # Plugin wants to handle execution itself via run() method
+            sig = inspect.signature(run_method)
+            params = list(sig.parameters.keys())
+
+            try:
+                if "log_path" in params:
+                    rc = run_method(target, args or [], label or "", log_path)
+                elif "label" in params:
+                    rc = run_method(target, args or [], label or "")
+                elif "args" in params:
+                    rc = run_method(target, args or [])
+                else:
+                    rc = run_method(target)
+                return (True, rc if isinstance(rc, int) else 0)
+            except Exception as e:
+                with open(log_path, "a", encoding="utf-8", errors="replace") as fh:
+                    fh.write(f"\n=== PLUGIN RUN ERROR ===\n")
+                    fh.write(f"{type(e).__name__}: {e}\n")
+                    fh.write(f"\n{traceback.format_exc()}\n")
+                return (True, 1)
+        else:
+            # No run() method either - actual validation failure
+            with open(log_path, "a", encoding="utf-8", errors="replace") as fh:
+                fh.write("ERROR: Plugin validation failed (build_command returned None)\n")
+            return (True, 1)
 
     # Execute using new subprocess handler with PID tracking
     rc = _run_subprocess_with_spec(cmd_spec, log_path, jid=jid, plugin=plugin)
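When `build_command` returns None, the fallback above inspects the plugin's `run()` with `inspect.signature` and passes only the arguments that method actually declares. A condensed, self-contained sketch of that dispatch-by-signature idea (the `DemoPlugin` class is hypothetical):

```python
import inspect

class DemoPlugin:                      # hypothetical plugin, for illustration only
    def run(self, target, args, label):
        print(f"scanning {target} with {args} ({label})")
        return 0

def call_run(plugin, target, args, label, log_path):
    run = getattr(plugin, "run", None)
    if not callable(run):
        return None
    params = list(inspect.signature(run).parameters.keys())
    # Pass only the arguments the plugin's run() actually declares
    if "log_path" in params:
        return run(target, args, label, log_path)
    if "label" in params:
        return run(target, args, label)
    if "args" in params:
        return run(target, args)
    return run(target)

call_run(DemoPlugin(), "10.0.0.5", ["-p", "80"], "demo", "/tmp/demo.log")
```

For bound methods `inspect.signature` already excludes `self`, which is why the membership checks above line up with the plugin-facing parameter names.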
@@ -773,6 +1023,55 @@ def _store_msf_session(jid: int, target: str, exploit_path: str, session_id: str
         _append_worker_log(f"job {jid}: session storage error: {e}")
 
 
+# Cache stdbuf availability check
+_stdbuf_available = None
+
+
+def _is_stdbuf_available() -> bool:
+    """Check if stdbuf is available for line-buffered output."""
+    global _stdbuf_available
+    if _stdbuf_available is None:
+        _stdbuf_available = shutil.which('stdbuf') is not None
+    return _stdbuf_available
+
+
+def _wrap_cmd_for_line_buffering(cmd: List[str]) -> List[str]:
+    """
+    Wrap a command with stdbuf for line-buffered output when available.
+
+    This ensures output is written line-by-line instead of in 4-8KB blocks,
+    improving real-time log monitoring and ensuring output is captured
+    before process termination.
+
+    Args:
+        cmd: Command to wrap
+
+    Returns:
+        Command wrapped with stdbuf if available, original command otherwise
+    """
+    if not cmd:
+        return cmd
+
+    if _is_stdbuf_available():
+        # stdbuf -oL = line-buffered stdout, -eL = line-buffered stderr
+        return ['stdbuf', '-oL', '-eL'] + cmd
+
+    return cmd
+
+
+def _get_subprocess_env() -> Dict[str, str]:
+    """
+    Get environment for subprocess with buffering disabled.
+
+    Sets PYTHONUNBUFFERED=1 for Python subprocesses and TERM=dumb
+    to prevent interactive terminal issues.
+    """
+    env = os.environ.copy()
+    env['TERM'] = 'dumb'  # Prevent stty errors from interactive tools
+    env['PYTHONUNBUFFERED'] = '1'  # Disable Python output buffering
+    return env
+
+
 def _run_subprocess_with_spec(cmd_spec: Dict[str, Any], log_path: str, jid: int = None, plugin=None) -> int:
     """
     Execute a command specification with proper PID tracking.
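The new helpers wrap external tools in `stdbuf -oL -eL` when the binary is on PATH and export `PYTHONUNBUFFERED=1` / `TERM=dumb`, so long-running scans stream into their log files line by line instead of in kernel-pipe-sized chunks. A minimal demonstration of the same wrapping, assuming a Linux host where `stdbuf` (coreutils) and `ping` are installed and `/tmp/demo_scan.log` is writable:

```python
import os
import shutil
import subprocess

def line_buffered(cmd):
    # Prepend stdbuf only when it exists; otherwise run the command unchanged
    if shutil.which("stdbuf"):
        return ["stdbuf", "-oL", "-eL"] + cmd
    return cmd

env = os.environ.copy()
env.update({"PYTHONUNBUFFERED": "1", "TERM": "dumb"})

with open("/tmp/demo_scan.log", "a") as log:     # hypothetical log path
    subprocess.run(line_buffered(["ping", "-c", "3", "127.0.0.1"]),
                   stdout=log, stderr=subprocess.STDOUT, env=env)
```

Tailing the log while this runs shows one line per reply rather than a single flush at exit, which is the behavior the diff is after for its job logs.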
@@ -814,32 +1113,35 @@ def _run_subprocess_with_spec(cmd_spec: Dict[str, Any], log_path: str, jid: int
         with open(log_path, "a", encoding="utf-8", errors="replace") as fh:
             fh.write("ERROR: No command provided in spec\n")
         return 1
-
+
     timeout = cmd_spec.get('timeout', JOB_TIMEOUT_SECONDS)
-
+    spec_env = cmd_spec.get('env')
     cwd = cmd_spec.get('cwd')
     needs_shell = cmd_spec.get('needs_shell', False)
-
+
     _append_worker_log(f"_run_subprocess_with_spec: timeout={timeout}s for job {jid}")
-
-    #
-
-
-
-
-
-
+
+    # Wrap command with stdbuf for line-buffered output (unless shell mode)
+    original_cmd = cmd
+    if not needs_shell:
+        cmd = _wrap_cmd_for_line_buffering(cmd)
+
+    # Prepare environment with PYTHONUNBUFFERED=1 and TERM=dumb
+    proc_env = _get_subprocess_env()
+    if spec_env:
+        proc_env.update(spec_env)
+
     with open(log_path, "a", encoding="utf-8", errors="replace") as fh:
         fh.write("=== Command Execution (build_command) ===\n")
-        fh.write(f"Command: {' '.join(
+        fh.write(f"Command: {' '.join(original_cmd)}\n")
         fh.write(f"Timeout: {timeout} seconds\n")
-        if
-            fh.write(f"Environment: {
+        if spec_env:
+            fh.write(f"Environment: {spec_env}\n")
        if cwd:
             fh.write(f"Working Dir: {cwd}\n")
         fh.write(f"Started: {time.strftime('%Y-%m-%d %H:%M:%S UTC', time.gmtime())}\n\n")
         fh.flush()
-
+
         try:
             # Create new process group so all children can be killed together
             # Redirect stdin to /dev/null to prevent password prompts from hanging
@@ -849,16 +1151,17 @@ def _run_subprocess_with_spec(cmd_spec: Dict[str, Any], log_path: str, jid: int
                 stdout=fh,
                 stderr=subprocess.STDOUT,
                 preexec_fn=os.setsid,  # Creates new session
-                env=proc_env,
+                env=proc_env,
                 cwd=cwd,
                 shell=needs_shell  # nosec B602 - intentional for security tool command execution
             )
 
-            # Store PID
+            # Store PID and process start time for stale detection
             if jid is not None:
-
+                proc_start_time = _get_process_start_time(proc.pid)
+                _update_job(jid, pid=proc.pid, process_start_time=proc_start_time)
                 _append_worker_log(f"job {jid}: running with PID {proc.pid}")
-
+
             # Wait for process with timeout
             try:
                 proc.wait(timeout=timeout)
@@ -890,6 +1193,7 @@ def _run_subprocess_with_spec(cmd_spec: Dict[str, Any], log_path: str, jid: int
                     return 0
                 else:
                     fh.write(f"\nERROR: Command timed out after {timeout} seconds\n")
+                    fh.flush()
                     return 124
 
             # Check if job was killed externally during execution
@@ -912,17 +1216,21 @@ def _run_subprocess_with_spec(cmd_spec: Dict[str, Any], log_path: str, jid: int
                     proc.wait(timeout=5)
                 except:
                     pass
+                fh.flush()
                 return 143  # 128 + 15 (SIGTERM)
-
+
             fh.write(f"\n=== Completed: {time.strftime('%Y-%m-%d %H:%M:%S UTC', time.gmtime())} ===\n")
             fh.write(f"Exit Code: {proc.returncode}\n")
+            fh.flush()
             return proc.returncode
-
+
         except FileNotFoundError:
             fh.write(f"\nERROR: Tool not found: {cmd[0]}\n")
+            fh.flush()
             return 127
         except Exception as e:
             fh.write(f"\nERROR: {type(e).__name__}: {e}\n")
+            fh.flush()
             return 1
 
 
@@ -937,9 +1245,14 @@ def _run_subprocess(tool: str, target: str, args: List[str], log_path: str, jid:
     cmd = [tool] + (args or [])
     cmd = [c.replace("<target>", target) for c in cmd]
 
+    # Wrap command with stdbuf for line-buffered output
+    cmd = _wrap_cmd_for_line_buffering(cmd)
+
     with open(log_path, "a", encoding="utf-8", errors="replace") as fh:
+        # Log original command (without stdbuf wrapper for clarity)
+        original_cmd = cmd[3:] if cmd[:3] == ['stdbuf', '-oL', '-eL'] else cmd
         fh.write("=== Subprocess Execution ===\n")
-        fh.write(f"Command: {' '.join(
+        fh.write(f"Command: {' '.join(original_cmd)}\n")
         fh.write(f"Timeout: {timeout} seconds\n")
         fh.write(f"Started: {time.strftime('%Y-%m-%d %H:%M:%S UTC', time.gmtime())}\n\n")
         fh.flush()
@@ -947,9 +1260,8 @@ def _run_subprocess(tool: str, target: str, args: List[str], log_path: str, jid:
         try:
             # Create new process group so all children can be killed together
             # Redirect stdin to /dev/null to prevent password prompts from hanging
-            #
-            env =
-            env['TERM'] = 'dumb'
+            # Use env with PYTHONUNBUFFERED=1 and TERM=dumb
+            env = _get_subprocess_env()
 
             proc = subprocess.Popen(
                 cmd,
@@ -960,9 +1272,10 @@ def _run_subprocess(tool: str, target: str, args: List[str], log_path: str, jid:
                 env=env
             )
 
-            # Store PID
+            # Store PID and process start time for stale detection
             if jid is not None:
-
+                proc_start_time = _get_process_start_time(proc.pid)
+                _update_job(jid, pid=proc.pid, process_start_time=proc_start_time)
                 _append_worker_log(f"job {jid}: running with PID {proc.pid}")
 
             # Wait for process with timeout
@@ -977,6 +1290,7 @@ def _run_subprocess(tool: str, target: str, args: List[str], log_path: str, jid:
                 proc.kill()  # Fallback to single process
                 proc.wait()
                 fh.write(f"\nERROR: Command timed out after {timeout} seconds\n")
+                fh.flush()
                 return 124
 
             # Check if job was killed externally during execution
@@ -999,17 +1313,21 @@ def _run_subprocess(tool: str, target: str, args: List[str], log_path: str, jid:
                     proc.wait(timeout=5)
                 except:
                     pass
+                fh.flush()
                 return 143  # 128 + 15 (SIGTERM)
 
             fh.write(f"\n=== Completed: {time.strftime('%Y-%m-%d %H:%M:%S UTC', time.gmtime())} ===\n")
             fh.write(f"Exit Code: {proc.returncode}\n")
+            fh.flush()
             return proc.returncode
 
         except FileNotFoundError:
             fh.write(f"\nERROR: Tool not found: {cmd[0]}\n")
+            fh.flush()
             return 127
         except Exception as e:
             fh.write(f"\nERROR: {type(e).__name__}: {e}\n")
+            fh.flush()
             return 1
 
 
@@ -1145,77 +1463,110 @@ def run_job(jid: int) -> None:
         # Re-fetch job to get updated data
         job = get_job(jid)
         parse_result = handle_job_result(job)
-
-
-
+
+        # Handle parse failure cases
+        if parse_result is None:
+            # Parser returned None - likely missing log file, no parser for tool, or missing engagement
+            logger.error("Job parse returned None - results may be lost", extra={
+                "job_id": jid,
+                "tool": job.get('tool'),
+                "log_exists": os.path.exists(job.get('log', '')) if job.get('log') else False
+            })
+            _append_worker_log(f"job {jid} parse returned None (tool={job.get('tool')}) - check if parser exists")
+            # Update job to indicate parse failure
+            _update_job(jid, status=STATUS_WARNING, parse_result={'error': 'Parser returned None - no results extracted'})
+            # Mark as chained to prevent infinite retry
+            _update_job(jid, chained=True)
+            return
+
+        if 'error' in parse_result:
+            logger.error("Job parse error - results may be incomplete", extra={
+                "job_id": jid,
+                "error": parse_result['error']
+            })
+            _append_worker_log(f"job {jid} parse error: {parse_result['error']}")
+            # Update job status to warning with the error
+            _update_job(jid, status=STATUS_WARNING, parse_result=parse_result)
+            # Mark as chained to prevent infinite retry
+            _update_job(jid, chained=True)
+            return
+
+        # Parse succeeded
+        logger.info("Job parsed successfully", extra={
+            "job_id": jid,
+            "parse_result": parse_result
+        })
+        _append_worker_log(f"job {jid} parsed: {parse_result}")
+
+        # Determine chainable status BEFORE updating to avoid race condition
+        # We must set parse_result and chainable in a single atomic update
+        try:
+            from souleyez.core.tool_chaining import ToolChaining
+            chaining = ToolChaining()
+
+            # Get current job to check status
+            job = get_job(jid)
+            job_status = job.get('status', STATUS_ERROR)
+
+            # Determine final status from parser if provided
+            final_status = parse_result.get('status', job_status)
+
+            # Check if job should be chainable
+            should_chain = (
+                chaining.is_enabled() and
+                parse_result and
+                'error' not in parse_result and
+                is_chainable(final_status)
+            )
+
+            # Build update dict - ATOMIC update of parse_result + chainable
+            update_fields = {'parse_result': parse_result}
+
+            if 'status' in parse_result:
+                update_fields['status'] = final_status
+                logger.info("Job status updated from parser", extra={
                     "job_id": jid,
-                    "
+                    "status": final_status
                 })
-                _append_worker_log(f"job {jid}
+                _append_worker_log(f"job {jid} status updated to: {final_status}")
+
+            if should_chain:
+                update_fields['chainable'] = True
             else:
-
-
-                    "parse_result": parse_result
-                })
-                _append_worker_log(f"job {jid} parsed: {parse_result}")
+                # Not chainable - mark as chained to skip
+                update_fields['chained'] = True
 
-
-
-
-
-
+            # Single atomic update to prevent race condition
+            _update_job(jid, **update_fields)
+
+            # Log chaining decision
+            if should_chain:
+                if final_status == STATUS_WARNING:
+                    logger.info("Job with warning status marked for chaining", extra={
                         "job_id": jid,
-                        "
+                        "tool": job.get('tool'),
+                        "wildcard_detected": parse_result.get('wildcard_detected', False)
                     })
-                    _append_worker_log(f"job {jid} status
+                    _append_worker_log(f"job {jid} (status=warning) marked as chainable")
                 else:
-
-            _update_job(jid, parse_result=parse_result)
-
-            # Mark job as chainable instead of chaining immediately
-            # Worker loop will process it when database is idle
-            try:
-                from souleyez.core.tool_chaining import ToolChaining
-                chaining = ToolChaining()
-
-                # Re-fetch job to get updated status
-                job = get_job(jid)
-                job_status = job.get('status', STATUS_ERROR)
-
-                # Check if status is chainable (done, no_results, warning)
-                if chaining.is_enabled() and parse_result and 'error' not in parse_result and is_chainable(job_status):
-                    # Mark for deferred chaining
-                    _update_job(jid, chainable=True)
-
-                    # Log special handling for warning status
-                    if job_status == STATUS_WARNING:
-                        logger.info("Job with warning status marked for chaining", extra={
-                            "job_id": jid,
-                            "tool": job.get('tool'),
-                            "wildcard_detected": parse_result.get('wildcard_detected', False)
-                        })
-                        _append_worker_log(f"job {jid} (status=warning) marked as chainable")
-                    else:
-                        logger.info("Job marked as chainable", extra={
-                            "job_id": jid,
-                            "tool": job.get('tool'),
-                            "status": job_status
-                        })
-                        _append_worker_log(f"job {jid} marked as chainable (status={job_status})")
-                else:
-                    # Chaining disabled or job has errors - mark as chained (skip)
-                    _update_job(jid, chained=True)
-                    reason = f"chaining_disabled={not chaining.is_enabled()}, has_error={'error' in parse_result}, status={job_status}"
-                    _append_worker_log(f"job {jid} not chainable ({reason})")
-
-            except Exception as chain_err:
-                logger.error("Failed to mark job as chainable", extra={
+                    logger.info("Job marked as chainable", extra={
                         "job_id": jid,
-                    "
+                        "tool": job.get('tool'),
+                        "status": final_status
                     })
-                _append_worker_log(f"job {jid}
-
-
+                    _append_worker_log(f"job {jid} marked as chainable (status={final_status})")
+            else:
+                reason = f"chaining_disabled={not chaining.is_enabled()}, has_error={'error' in parse_result}, status={final_status}"
+                _append_worker_log(f"job {jid} not chainable ({reason})")
+
+        except Exception as chain_err:
+            logger.error("Failed to mark job as chainable", extra={
+                "job_id": jid,
+                "error": str(chain_err)
+            })
+            _append_worker_log(f"job {jid} chainable marking error: {chain_err}")
+            # Mark as chained to prevent retry loops
+            _update_job(jid, chained=True, chain_error=str(chain_err))
 
     except Exception as e:
         logger.error("Job parse exception", extra={
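The rewrite above collapses what used to be two writes (`parse_result` first, then `chainable`) into one `_update_job(jid, **update_fields)` call, so a reader polling `jobs.json` can never observe a parsed-but-not-yet-chainable intermediate state. A small self-contained illustration of why a single atomic replace closes that window (hypothetical file and fields, using the same temp-file-plus-`os.replace` pattern the module relies on):

```python
import json
import os
import tempfile

def write_job(path, job):
    # Atomic write: temp file in the same directory, then os.replace
    fd, tmp = tempfile.mkstemp(dir=os.path.dirname(path))
    with os.fdopen(fd, "w") as fh:
        json.dump(job, fh)
    os.replace(tmp, path)

path = os.path.join(tempfile.gettempdir(), "demo_job.json")
job = {"id": 7, "status": "running"}

# Two-step update: a poller could read the file between these writes and see
# parse_result without the chainable flag.
# write_job(path, {**job, "parse_result": {"hosts": 3}})
# write_job(path, {**job, "parse_result": {"hosts": 3}, "chainable": True})

# Single-step update: the intermediate state never exists on disk.
write_job(path, {**job, "parse_result": {"hosts": 3}, "chainable": True})
```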
@@ -1378,18 +1729,46 @@ def _detect_and_recover_stale_jobs() -> int:
         pid = job.get('pid')
         tool = job.get('tool', 'unknown')
         log_path = job.get('log')
+        stored_start_time = job.get('process_start_time')
 
-        #
+        # Check if PID is alive
         if _is_pid_alive(pid):
-
-
-
-
-
-
-
-
-
+            # PID is alive - but check for PID reuse
+            if stored_start_time is not None:
+                current_start_time = _get_process_start_time(pid)
+                if current_start_time is not None:
+                    # Allow 2 second tolerance for timing differences
+                    if abs(current_start_time - stored_start_time) > 2:
+                        # PID reused by different process
+                        _append_worker_log(
+                            f"job {jid}: PID {pid} reused (stored start: {stored_start_time:.0f}, "
+                            f"current: {current_start_time:.0f})"
+                        )
+                        logger.warning("PID reuse detected", extra={
+                            "job_id": jid,
+                            "tool": tool,
+                            "pid": pid,
+                            "stored_start_time": stored_start_time,
+                            "current_start_time": current_start_time
+                        })
+                        # Fall through to stale job handling
+                    else:
+                        # Same process, still running
+                        continue
+                else:
+                    # Can't get current start time, assume still valid
+                    continue
+            else:
+                # No stored start time (old job), assume still valid
+                continue
+        else:
+            # PID is dead - definitely stale
+            _append_worker_log(f"job {jid}: detected stale (PID {pid} is dead)")
+            logger.warning("Stale job detected", extra={
+                "job_id": jid,
+                "tool": tool,
+                "pid": pid
+            })
 
         # Check if log shows completion
         completed, exit_code = _check_log_for_completion(log_path, tool)
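Stale-job detection now records the start time taken from `/proc/<pid>/stat` (field 22, clock ticks since boot) alongside the PID and compares it again later; if the PID is alive but its start time has moved by more than the two-second tolerance, the PID has been recycled by an unrelated process. A standalone Linux-only sketch of that comparison, run here against the current process:

```python
import os
import time

def proc_start_time(pid):
    # Field 22 of /proc/<pid>/stat is the start time in clock ticks since boot;
    # split after the ")" so a command name containing spaces cannot shift fields.
    try:
        with open(f"/proc/{pid}/stat") as f:
            fields = f.read().rsplit(")", 1)[1].split()
        ticks = int(fields[19])
        with open("/proc/stat") as f:
            btime = next(int(line.split()[1]) for line in f if line.startswith("btime"))
        return btime + ticks / os.sysconf(os.sysconf_names["SC_CLK_TCK"])
    except Exception:
        return None

stored = proc_start_time(os.getpid())
time.sleep(1)
current = proc_start_time(os.getpid())
if stored is not None and current is not None:
    # Same PID, same process: the readings agree within the 2-second tolerance
    print("PID reuse?", abs(current - stored) > 2)   # expected: False
```

On non-Linux hosts both readings come back `None`, which mirrors the diff's fallback of treating the PID as still valid when no start time is available.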
@@ -1412,6 +1791,8 @@ def _detect_and_recover_stale_jobs() -> int:
         # Try to parse results
         try:
             from .result_handler import handle_job_result
+            from souleyez.core.tool_chaining import ToolChaining
+
             job = get_job(jid)
             parse_result = handle_job_result(job)
 
@@ -1419,36 +1800,34 @@ def _detect_and_recover_stale_jobs() -> int:
             if 'error' in parse_result:
                 _append_worker_log(f"job {jid} stale recovery parse error: {parse_result['error']}")
             else:
-                #
+                # Determine final status and chainable in one check
+                final_status = parse_result.get('status', status)
+                chaining = ToolChaining()
+                should_chain = chaining.is_enabled() and is_chainable(final_status)
+
+                # Build atomic update - parse_result + status + chainable together
+                update_fields = {'parse_result': parse_result}
                 if 'status' in parse_result:
-
-
-
-
+                    update_fields['status'] = final_status
+                if should_chain:
+                    update_fields['chainable'] = True
+
+                # Single atomic update to prevent race condition
+                _update_job(jid, **update_fields)
 
                 _append_worker_log(f"job {jid} stale recovery parsed: {parse_result.get('findings_added', 0)} findings")
 
                 logger.info("Stale job recovered with results", extra={
                     "job_id": jid,
                     "tool": tool,
-                    "status":
-                    "parse_result": parse_result
+                    "status": final_status,
+                    "parse_result": parse_result,
+                    "chainable": should_chain
                 })
 
-
-
-
-                chaining = ToolChaining()
-                if chaining.is_enabled() and is_chainable(status):
-                    _update_job(jid, chainable=True)
-                    _append_worker_log(f"job {jid} stale recovery marked as chainable")
-                    logger.info("Stale job marked as chainable", extra={
-                        "job_id": jid,
-                        "tool": tool,
-                        "status": status
-                    })
-            except Exception as chain_err:
-                _append_worker_log(f"job {jid} stale recovery chainable error: {chain_err}")
+                if should_chain:
+                    _append_worker_log(f"job {jid} stale recovery marked as chainable")
 
         except Exception as parse_err:
             _append_worker_log(f"job {jid} stale recovery parse exception: {parse_err}")
 
@@ -1608,26 +1987,85 @@ def _check_msf_exploitation_success():
     return 0
 
 
+def _update_job_progress():
+    """
+    Update progress tracking for running jobs.
+
+    Checks log file modification times and flags jobs with no recent output
+    as possibly hung (no output for JOB_HUNG_THRESHOLD seconds).
+    """
+    try:
+        jobs = _read_jobs()
+        running_jobs = [j for j in jobs if j.get('status') == STATUS_RUNNING]
+
+        for job in running_jobs:
+            jid = job.get('id')
+            log_path = job.get('log')
+
+            if not log_path or not os.path.exists(log_path):
+                continue
+
+            try:
+                # Get log file modification time
+                mtime = os.path.getmtime(log_path)
+                current_time = time.time()
+                time_since_output = current_time - mtime
+
+                # Update last_output_at in job record
+                updates = {'last_output_at': mtime}
+
+                # Flag as possibly hung if no output for threshold
+                was_hung = job.get('possibly_hung', False)
+                is_hung = time_since_output > JOB_HUNG_THRESHOLD
+
+                if is_hung != was_hung:
+                    updates['possibly_hung'] = is_hung
+                    if is_hung:
+                        _append_worker_log(
+                            f"job {jid}: no output for {int(time_since_output)}s, flagged as possibly hung"
+                        )
+                        logger.warning("Job possibly hung", extra={
+                            "job_id": jid,
+                            "tool": job.get('tool'),
+                            "time_since_output": int(time_since_output)
+                        })
+
+                _update_job(jid, **updates)
+
+            except Exception as e:
+                # Non-critical, just skip this job
+                pass
+
+    except Exception as e:
+        logger.error("Job progress tracking error", extra={"error": str(e)})
+
+
 def worker_loop(poll_interval: float = 2.0):
     """
     Main worker loop that processes jobs and handles auto-chaining.
 
     Loop behavior:
-    1.
-    2.
-    3.
-    4.
-    5.
+    1. Update heartbeat for health monitoring
+    2. Detect and recover stale jobs (dead PIDs)
+    3. Update progress tracking for running jobs
+    4. Check for running jobs
+    5. If none running, start next queued job
+    6. Process one chainable job (if any)
+    7. Sleep poll_interval seconds, repeat
 
     Args:
         poll_interval: Seconds to sleep between iterations (default: 2.0)
     """
     _ensure_dirs()
+    _update_heartbeat()  # Initial heartbeat
     _append_worker_log("souleyez background worker: starting loop")
 
-    # Track last stale job check time (check every
+    # Track last stale job check time (check every 15 seconds, not every iteration)
     last_stale_check = 0
-    stale_check_interval =
+    stale_check_interval = 15  # seconds (reduced from 30s for faster detection)
+
+    # Track last heartbeat time
+    last_heartbeat = time.time()
 
     # Run stale job detection on startup
     try:
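With the heartbeat file in place, an external supervisor (a CLI status command, systemd unit, or cron check) can decide whether the worker loop is healthy by reading a single timestamp; inside the package that is what `get_heartbeat_age()` and `is_heartbeat_stale()` expose. A sketch of the consuming side, assuming the same 30-second staleness threshold and a hypothetical heartbeat path:

```python
import time

HEARTBEAT_FILE = "/tmp/demo_worker_heartbeat"  # the package writes jobs/.worker_heartbeat
STALE_AFTER = 30  # seconds, matching HEARTBEAT_STALE_THRESHOLD in the diff

def worker_is_healthy() -> bool:
    try:
        with open(HEARTBEAT_FILE) as fh:
            age = time.time() - float(fh.read().strip())
        return age <= STALE_AFTER
    except (OSError, ValueError):
        return False  # missing or unreadable heartbeat counts as stale

if not worker_is_healthy():
    print("worker heartbeat stale - consider restarting the background worker")
```

Because the worker refreshes the file every `HEARTBEAT_INTERVAL` (10 s) iterations, a 30-second threshold tolerates a couple of missed beats before flagging the process.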
@@ -1639,8 +2077,14 @@ def worker_loop(poll_interval: float = 2.0):
 
     try:
         while True:
-            # Periodic stale job detection (every 30 seconds)
             current_time = time.time()
+
+            # Update heartbeat every HEARTBEAT_INTERVAL seconds
+            if current_time - last_heartbeat >= HEARTBEAT_INTERVAL:
+                _update_heartbeat()
+                last_heartbeat = current_time
+
+            # Periodic stale job detection (every 15 seconds)
             if current_time - last_stale_check >= stale_check_interval:
                 try:
                     recovered = _detect_and_recover_stale_jobs()
@@ -1650,6 +2094,12 @@ def worker_loop(poll_interval: float = 2.0):
                     _append_worker_log(f"stale job detection error: {e}")
                 last_stale_check = current_time
 
+            # Update progress tracking for running jobs
+            try:
+                _update_job_progress()
+            except Exception as e:
+                _append_worker_log(f"progress tracking error: {e}")
+
             # Check running MSF jobs for exploitation success (every iteration)
             try:
                 detected = _check_msf_exploitation_success()