ralphx 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -139,6 +139,8 @@ async def list_items(
139
139
  source_step_id: Optional[int] = Query(None, description="Filter by source step"),
140
140
  limit: int = Query(50, ge=1, le=1000, description="Items per page"),
141
141
  offset: int = Query(0, ge=0, description="Offset for pagination"),
142
+ sort_by: str = Query("created_at", description="Column to sort by"),
143
+ sort_order: str = Query("desc", description="Sort order: asc or desc"),
142
144
  ):
143
145
  """List work items with optional filtering."""
144
146
  manager, project, project_db = get_project(slug)
@@ -151,6 +153,8 @@ async def list_items(
151
153
  source_step_id=source_step_id,
152
154
  limit=limit,
153
155
  offset=offset,
156
+ sort_by=sort_by,
157
+ sort_order=sort_order,
154
158
  )
155
159
 
156
160
  # Convert to response models
@@ -17,6 +17,7 @@ from ralphx.core.project_db import ProjectDatabase
17
17
  from ralphx.models.loop import LoopConfig, LoopType, ModeSelectionStrategy, ItemTypes
18
18
  from ralphx.models.run import Run, RunStatus
19
19
  from ralphx.core.logger import loop_log
20
+ from ralphx.core.checkpoint import kill_orphan_process
20
21
 
21
22
  router = APIRouter()
22
23
 
@@ -54,6 +55,9 @@ def detect_source_cycle(
54
55
  # Store for running loops
55
56
  _running_loops: dict[str, LoopExecutor] = {}
56
57
 
58
+ # Prevent concurrent stop attempts
59
+ _stopping_loops: set[str] = set()
60
+
57
61
  # Security: Validate loop names to prevent path traversal
58
62
  LOOP_NAME_PATTERN = re.compile(r'^[a-zA-Z0-9_-]+$')
59
63
 
@@ -428,34 +432,110 @@ async def start_loop(
428
432
 
429
433
@router.post("/{slug}/loops/{loop_name}/stop")
async def stop_loop(slug: str, loop_name: str):
    """Stop a loop, whether or not its executor is still held in memory.

    If an executor is registered for the loop it is asked to stop.
    Otherwise the first run the database reports as running/paused is
    treated as an orphan (e.g. left behind by a server restart or
    hot-reload): the PID recorded on that run is killed and the run is
    marked as aborted.

    Returns:
        A dict with a ``message`` and, for every reply except the
        "already in progress" one, a ``method`` naming how the stop was
        carried out (``executor``, ``database_only``, ``pid_kill`` or
        ``pid_kill_failed``).

    Raises:
        HTTPException: 404 when no executor is registered and the
            database lists no running or paused run for the loop.
    """
    _, _, project_db = get_managers(slug)
    loop_key = f"{slug}:{loop_name}"

    # Another request is already stopping this loop; don't start a second one.
    if loop_key in _stopping_loops:
        return {"message": f"Stop already in progress for {loop_name}"}

    def mark_aborted(run_id, message):
        # Record the run as aborted, stamped with the current UTC time.
        project_db.update_run(
            run_id,
            status="aborted",
            completed_at=datetime.utcnow().isoformat(),
            error_message=message,
        )

    _stopping_loops.add(loop_key)
    try:
        # Normal case: the executor is still tracked by this server process.
        live_executor = _running_loops.get(loop_key)
        if live_executor:
            await live_executor.stop()
            return {
                "message": f"Stop signal sent to {loop_name}",
                "method": "executor",
            }

        # Orphan case: fall back to whatever the database says is active.
        active_runs = project_db.list_runs(
            loop_name=loop_name, status=["running", "paused"]
        )
        if not active_runs:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail=f"Loop {loop_name} is not running",
            )

        # First entry is used; presumably list_runs returns newest first
        # (TODO confirm against ProjectDatabase.list_runs).
        orphan_run = active_runs[0]
        orphan_pid = orphan_run.get("executor_pid")

        if not orphan_pid:
            # Nothing to signal without a PID, so only the record is updated.
            mark_aborted(
                orphan_run["id"],
                "Stopped by user (no PID available for orphan process)",
            )
            return {
                "message": f"Marked {loop_name} as aborted (no PID available)",
                "method": "database_only",
                "warning": "Process may still be running",
            }

        killed, outcome = await kill_orphan_process(orphan_pid)

        if not killed:
            abort_note = f"Could not kill orphan process (PID {orphan_pid}): {outcome}"
        elif outcome == "already_dead":
            abort_note = f"Orphan process (PID {orphan_pid}) already terminated"
        else:
            abort_note = f"Killed orphan process (PID {orphan_pid}) after server restart"

        # The run is marked aborted whether or not the kill succeeded.
        mark_aborted(orphan_run["id"], abort_note)

        if killed:
            return {
                "message": f"Stopped orphan process for {loop_name}",
                "method": "pid_kill",
                "pid": orphan_pid,
                "detail": outcome,  # "killed" or "already_dead"
            }

        pid_reuse_warning = (
            "Process may not have been our process (PID reuse)"
            if outcome == "not_our_process"
            else None
        )
        return {
            "message": f"Could not kill process {orphan_pid}, marked as aborted",
            "method": "pid_kill_failed",
            "pid": orphan_pid,
            "reason": outcome,
            "warning": pid_reuse_warning,
        }
    finally:
        # Always release the guard so a later stop request can proceed.
        _stopping_loops.discard(loop_key)
447
521
 
448
522
 
449
523
  @router.post("/{slug}/loops/{loop_name}/pause")
450
524
  async def pause_loop(slug: str, loop_name: str):
451
525
  """Pause a running loop."""
452
- # Validate project exists first
453
- get_managers(slug)
526
+ manager, project, project_db = get_managers(slug)
454
527
 
455
528
  key = f"{slug}:{loop_name}"
456
529
  executor = _running_loops.get(key)
457
530
 
458
531
  if not executor:
532
+ # Check if there's an orphan process
533
+ runs = project_db.list_runs(loop_name=loop_name, status=["running", "paused"])
534
+ if runs:
535
+ raise HTTPException(
536
+ status_code=status.HTTP_409_CONFLICT,
537
+ detail=f"Loop {loop_name} is running as orphan process (server restarted). Use stop to terminate it.",
538
+ )
459
539
  raise HTTPException(
460
540
  status_code=status.HTTP_404_NOT_FOUND,
461
541
  detail=f"Loop {loop_name} is not running",
@@ -469,13 +549,19 @@ async def pause_loop(slug: str, loop_name: str):
469
549
  @router.post("/{slug}/loops/{loop_name}/resume")
470
550
  async def resume_loop(slug: str, loop_name: str):
471
551
  """Resume a paused loop."""
472
- # Validate project exists first
473
- get_managers(slug)
552
+ manager, project, project_db = get_managers(slug)
474
553
 
475
554
  key = f"{slug}:{loop_name}"
476
555
  executor = _running_loops.get(key)
477
556
 
478
557
  if not executor:
558
+ # Check if there's an orphan process
559
+ runs = project_db.list_runs(loop_name=loop_name, status=["running", "paused"])
560
+ if runs:
561
+ raise HTTPException(
562
+ status_code=status.HTTP_409_CONFLICT,
563
+ detail=f"Loop {loop_name} is orphaned (server restarted). Use stop to terminate, then start again.",
564
+ )
479
565
  raise HTTPException(
480
566
  status_code=status.HTTP_404_NOT_FOUND,
481
567
  detail=f"Loop {loop_name} is not running",
@@ -459,7 +459,7 @@ async def stream_planning_response(slug: str, workflow_id: str):
459
459
  async for event in service.stream_response(
460
460
  messages,
461
461
  model=model,
462
- tools=allowed_tools if allowed_tools else None,
462
+ tools=allowed_tools,
463
463
  timeout=timeout,
464
464
  ):
465
465
  if event.type == AdapterEvent.TEXT:
@@ -8,7 +8,7 @@ from fastapi import APIRouter, HTTPException, Query, status
8
8
  from fastapi.responses import StreamingResponse
9
9
 
10
10
  from ralphx.core.project import ProjectManager
11
- from ralphx.core.session import SessionEventType, SessionManager, SessionTailer
11
+ from ralphx.core.session import SessionManager
12
12
  from ralphx.models.run import RunStatus
13
13
 
14
14
  router = APIRouter()
@@ -179,13 +179,18 @@ async def _tail_session(
179
179
  run_id: Optional[str] = None,
180
180
  iteration: Optional[int] = None,
181
181
  ) -> AsyncGenerator[str, None]:
182
- """Tail a specific session file, storing events to DB for history.
182
+ """Stream session events from DB via polling.
183
+
184
+ Events are persisted to the session_events table by the executor's
185
+ persist_event callback. This function polls that table and yields
186
+ SSE events as they appear — same pattern as planning.py's
187
+ stream_iteration_progress().
183
188
 
184
189
  Args:
185
190
  session_manager: Session manager instance.
186
191
  session_id: Session UUID.
187
- project_path: Project directory path.
188
- project_db: ProjectDatabase for storing events.
192
+ project_path: Project directory path (used for optional file metadata).
193
+ project_db: ProjectDatabase for reading events.
189
194
  from_beginning: Start from file beginning.
190
195
  run_id: Run ID for this session.
191
196
  iteration: Iteration number for this session.
@@ -195,16 +200,12 @@ async def _tail_session(
195
200
  """
196
201
  from pathlib import Path
197
202
 
203
+ # Session file is optional metadata — streaming uses DB polling, not file tailing
198
204
  session_file = session_manager.find_session_file(
199
205
  session_id=session_id,
200
206
  project_path=Path(project_path),
201
207
  )
202
-
203
- if not session_file:
204
- yield await format_sse("error", {
205
- "message": f"Session file not found: {session_id}"
206
- })
207
- return
208
+ # Don't abort if file not found — we stream from DB
208
209
 
209
210
  # Get session info if not provided
210
211
  if run_id is None or iteration is None:
@@ -250,6 +251,16 @@ async def _tail_session(
250
251
  "message": db_event.get("error_message"),
251
252
  **event_meta,
252
253
  })
254
+ elif event_type == "thinking":
255
+ yield await format_sse("thinking", {
256
+ "content": db_event.get("content", ""),
257
+ **event_meta,
258
+ })
259
+ elif event_type == "usage":
260
+ yield await format_sse("usage", {
261
+ "data": db_event.get("raw_data"),
262
+ **event_meta,
263
+ })
253
264
  elif event_type == "init":
254
265
  yield await format_sse("init", {
255
266
  "data": db_event.get("raw_data"),
@@ -261,64 +272,95 @@ async def _tail_session(
261
272
 
262
273
  yield await format_sse("session_start", {
263
274
  "session_id": session_id,
264
- "file": str(session_file),
275
+ "file": str(session_file) if session_file else None,
265
276
  "history_events": len(existing_events),
266
277
  "run_id": run_id,
267
278
  "iteration": iteration,
268
279
  })
269
280
 
270
- # Now tail the file for new events, starting from where DB left off
271
- # If we have history, start from end of file to avoid duplicates
272
- tailer = SessionTailer(
273
- session_path=session_file,
274
- from_beginning=from_beginning and len(existing_events) == 0,
275
- )
276
-
281
+ # Poll DB for new events (same pattern as planning.py stream_iteration_progress)
282
+ # This replaces the SessionTailer file-tailing approach to unify streaming
277
283
  try:
278
- async for event in tailer.tail():
279
- # Skip UNKNOWN events (like queue-operation, user messages)
280
- if event.type == SessionEventType.UNKNOWN:
281
- continue
282
-
283
- # Stream events to client (persistence handled by executor)
284
- if event.type == SessionEventType.TEXT:
285
- yield await format_sse("text", {
286
- "content": event.text,
287
- **event_meta,
288
- })
289
-
290
- elif event.type == SessionEventType.TOOL_CALL:
291
- yield await format_sse("tool_call", {
292
- "name": event.tool_name,
293
- "input": event.tool_input,
294
- **event_meta,
295
- })
296
-
297
- elif event.type == SessionEventType.TOOL_RESULT:
298
- yield await format_sse("tool_result", {
299
- "name": event.tool_name,
300
- "result": event.tool_result[:1000] if event.tool_result else None,
301
- **event_meta,
302
- })
303
-
304
- elif event.type == SessionEventType.ERROR:
305
- yield await format_sse("error", {
306
- "message": event.error_message,
307
- **event_meta,
308
- })
309
-
310
- elif event.type == SessionEventType.COMPLETE:
311
- yield await format_sse("complete", event_meta)
284
+ while True:
285
+ # Fetch new events since last seen
286
+ new_events = project_db.get_session_events(session_id, after_id=last_db_event_id)
287
+
288
+ for db_event in new_events:
289
+ last_db_event_id = db_event.get("id", 0)
290
+ event_type = db_event.get("event_type", "unknown")
291
+
292
+ if event_type == "text":
293
+ yield await format_sse("text", {
294
+ "content": db_event.get("content", ""),
295
+ **event_meta,
296
+ })
297
+ elif event_type == "tool_call":
298
+ yield await format_sse("tool_call", {
299
+ "name": db_event.get("tool_name"),
300
+ "input": db_event.get("tool_input"),
301
+ **event_meta,
302
+ })
303
+ elif event_type == "tool_result":
304
+ yield await format_sse("tool_result", {
305
+ "name": db_event.get("tool_name"),
306
+ "result": db_event.get("tool_result"),
307
+ **event_meta,
308
+ })
309
+ elif event_type == "error":
310
+ yield await format_sse("error", {
311
+ "message": db_event.get("error_message"),
312
+ **event_meta,
313
+ })
314
+ elif event_type == "thinking":
315
+ yield await format_sse("thinking", {
316
+ "content": db_event.get("content", ""),
317
+ **event_meta,
318
+ })
319
+ elif event_type == "usage":
320
+ yield await format_sse("usage", {
321
+ "data": db_event.get("raw_data"),
322
+ **event_meta,
323
+ })
324
+ elif event_type == "complete":
325
+ yield await format_sse("complete", event_meta)
326
+ return # Session complete
327
+ elif event_type == "init":
328
+ yield await format_sse("init", {
329
+ "data": db_event.get("raw_data"),
330
+ **event_meta,
331
+ })
332
+
333
+ # Check if session is done (status updated by executor)
334
+ session_info = project_db.get_session(session_id)
335
+ if session_info and session_info.get("status") in ("completed", "error"):
336
+ # Drain any remaining events
337
+ final_events = project_db.get_session_events(session_id, after_id=last_db_event_id)
338
+ for db_event in final_events:
339
+ last_db_event_id = db_event.get("id", 0)
340
+ event_type = db_event.get("event_type", "unknown")
341
+ if event_type == "text":
342
+ yield await format_sse("text", {"content": db_event.get("content", ""), **event_meta})
343
+ elif event_type == "tool_call":
344
+ yield await format_sse("tool_call", {"name": db_event.get("tool_name"), "input": db_event.get("tool_input"), **event_meta})
345
+ elif event_type == "tool_result":
346
+ yield await format_sse("tool_result", {"name": db_event.get("tool_name"), "result": db_event.get("tool_result"), **event_meta})
347
+ elif event_type == "error":
348
+ yield await format_sse("error", {"message": db_event.get("error_message"), **event_meta})
349
+ elif event_type == "complete":
350
+ yield await format_sse("complete", event_meta)
351
+ elif event_type == "thinking":
352
+ yield await format_sse("thinking", {"content": db_event.get("content", ""), **event_meta})
353
+ elif event_type == "usage":
354
+ yield await format_sse("usage", {"data": db_event.get("raw_data"), **event_meta})
355
+ elif event_type == "init":
356
+ yield await format_sse("init", {"data": db_event.get("raw_data"), **event_meta})
312
357
  break
313
358
 
314
- elif event.type == SessionEventType.INIT:
315
- yield await format_sse("init", {
316
- "data": event.raw_data,
317
- **event_meta,
318
- })
359
+ # Heartbeat + poll interval (same as planning.py)
360
+ yield await format_sse("heartbeat", {})
361
+ await asyncio.sleep(0.5)
319
362
 
320
363
  except asyncio.CancelledError:
321
- tailer.stop()
322
364
  yield await format_sse("disconnected", {})
323
365
 
324
366
 
@@ -452,6 +494,7 @@ async def list_sessions(
452
494
  "status": s.status,
453
495
  "started_at": s.started_at.isoformat() if s.started_at else None,
454
496
  "duration_seconds": s.duration_seconds,
497
+ "account_email": s.account_email,
455
498
  }
456
499
  for s in sessions
457
500
  ]
@@ -484,6 +527,7 @@ async def get_session(
484
527
  "started_at": session.started_at.isoformat() if session.started_at else None,
485
528
  "duration_seconds": session.duration_seconds,
486
529
  "items_added": session.items_added,
530
+ "account_email": session.account_email,
487
531
  }
488
532
 
489
533
 
@@ -527,7 +571,7 @@ async def get_session_events(
527
571
  async def get_grouped_events(
528
572
  slug: str,
529
573
  loop_name: str,
530
- limit_runs: int = Query(5, ge=1, le=50, description="Max runs to return"),
574
+ limit_runs: int = Query(20, ge=1, le=50, description="Max runs to return"),
531
575
  limit_sessions: int = Query(20, ge=1, le=100, description="Max sessions per run"),
532
576
  limit_events: int = Query(200, ge=1, le=1000, description="Max events per session"),
533
577
  ):
@@ -579,6 +623,7 @@ async def get_grouped_events(
579
623
  "mode": session.mode,
580
624
  "status": session.status,
581
625
  "is_live": is_live,
626
+ "account_email": session.account_email,
582
627
  "events": events,
583
628
  "events_truncated": len(events) >= limit_events,
584
629
  }
@@ -42,6 +42,7 @@ class TemplateListItem(BaseModel):
42
42
  description: str
43
43
  type: str
44
44
  category: str
45
+ default_tools: Optional[list[str]] = None
45
46
 
46
47
 
47
48
  class TemplateDetail(BaseModel):
@@ -28,7 +28,7 @@ PROCESSING_TYPES = {
28
28
  "config": {
29
29
  "loopType": "generator",
30
30
  "template": "extractgen_requirements",
31
- "allowedTools": ["WebSearch", "WebFetch"],
31
+ "allowedTools": ["Read", "Glob", "Grep"],
32
32
  "model": "opus",
33
33
  "timeout": 600,
34
34
  "max_iterations": 100,
@@ -41,7 +41,7 @@ PROCESSING_TYPES = {
41
41
  "config": {
42
42
  "loopType": "generator",
43
43
  "template": "webgen_requirements",
44
- "allowedTools": ["WebSearch", "WebFetch"],
44
+ "allowedTools": ["Read", "Glob", "Grep", "WebSearch", "WebFetch"],
45
45
  "model": "opus",
46
46
  "timeout": 900,
47
47
  "max_iterations": 15,
ralphx/core/checkpoint.py CHANGED
@@ -7,9 +7,12 @@ Implements:
7
7
  - Recovery flow for resuming interrupted runs
8
8
  """
9
9
 
10
+ import asyncio
10
11
  import fcntl
11
12
  import json
12
13
  import os
14
+ import signal
15
+ import subprocess
13
16
  import sys
14
17
  from dataclasses import dataclass, field
15
18
  from datetime import datetime
@@ -197,6 +200,121 @@ def is_pid_running(pid: int) -> bool:
197
200
  return False
198
201
 
199
202
 
203
def is_our_claude_process(pid: int) -> bool:
    """Check that ``pid`` looks like one of our processes before killing it.

    Guards against PID reuse: once our process has exited, the same PID
    may belong to an unrelated program that must not be killed.

    The process description is obtained per platform (``tasklist`` on
    Windows, ``ps`` on macOS, ``/proc/<pid>/cmdline`` elsewhere) and
    searched, case-insensitively, for known substrings.

    Returns:
        False when ``pid`` is not positive, when the process description
        cannot be obtained, or when it contains none of the expected
        substrings ("claude"/"python" on Windows; "claude"/"python"/
        "ralphx" on the other platforms). True otherwise.

    Note:
        A time-of-check-to-time-of-use race remains between this check
        and the eventual kill. The check narrows that window; it does
        not eliminate it.
    """
    if pid <= 0:
        return False

    platform_name = sys.platform

    if platform_name not in ("win32", "darwin"):
        # Linux (and any other platform): read the kernel-provided command line.
        try:
            with open(f"/proc/{pid}/cmdline", "rb") as cmdline_file:
                # Arguments are NUL-separated; substring search is unaffected.
                description = cmdline_file.read().decode("utf-8", errors="replace").lower()
            return any(tag in description for tag in ("claude", "python", "ralphx"))
        except (OSError, IOError):
            return False

    try:
        if platform_name == "win32":
            # tasklist reports the image name; python covers multiprocessing spawn.
            query = ["tasklist", "/FI", f"PID eq {pid}", "/FO", "CSV", "/NH"]
            expected_tags = ("claude", "python")
        else:
            # macOS has no /proc filesystem, so ask ps for the command line.
            query = ["ps", "-p", str(pid), "-o", "command="]
            expected_tags = ("claude", "python", "ralphx")

        completed = subprocess.run(query, capture_output=True, text=True, timeout=5)
        description = completed.stdout.lower()
        return any(tag in description for tag in expected_tags)
    except Exception:
        # Any failure to inspect the process is treated as "not ours".
        return False
259
+
260
+
261
async def _wait_for_exit(pid: int, polls: int, interval: float = 0.1) -> bool:
    """Sleep-and-check up to ``polls`` times; return True once ``pid`` is gone."""
    for _ in range(polls):
        await asyncio.sleep(interval)
        if not is_pid_running(pid):
            return True
    return False


async def kill_orphan_process(pid: int, timeout: float = 5.0) -> tuple[bool, str]:
    """Kill an orphan Claude/RalphX process by PID.

    Args:
        pid: Process ID to terminate.
        timeout: Seconds to wait after SIGTERM before escalating to
            SIGKILL (Unix only; polled in 0.1 s steps).

    Returns:
        Tuple of (success, reason):
        - (True, "killed") - Process was terminated by us
        - (True, "already_dead") - Process was already dead
        - (False, "not_our_process") - PID exists but isn't our process
        - (False, "permission_denied") - Can't kill (permissions)
        - (False, "unknown_error") - Other failure, or still running
          after the forced kill

    Cross-platform notes:
        - Linux/macOS: SIGTERM for graceful, SIGKILL for force
        - Windows: os.kill() with SIGTERM calls TerminateProcess
    """
    # Forced termination (SIGKILL / TerminateProcess) is requested
    # asynchronously, so allow up to ~1 s for the process to disappear
    # instead of judging from a single 0.1 s check.
    force_kill_polls = 10

    # Nothing to do if the process is already gone.
    if not is_pid_running(pid):
        return (True, "already_dead")

    # Refuse to signal a PID that does not look like one of ours.
    if not is_our_claude_process(pid):
        return (False, "not_our_process")

    try:
        os.kill(pid, signal.SIGTERM)
        if sys.platform == "win32":
            # Windows has no graceful option: SIGTERM already forced termination.
            exited = await _wait_for_exit(pid, force_kill_polls)
        else:
            # Unix: give the process `timeout` seconds to shut down cleanly.
            exited = await _wait_for_exit(pid, int(timeout * 10))
            if not exited:
                os.kill(pid, signal.SIGKILL)
                exited = await _wait_for_exit(pid, force_kill_polls)
    except ProcessLookupError:
        # Process vanished before a signal could be delivered - success.
        return (True, "already_dead")
    except PermissionError:
        # Can't kill - likely not our process.
        return (False, "permission_denied")
    except OSError:
        return (False, "unknown_error")

    return (True, "killed") if exited else (False, "unknown_error")
316
+
317
+
200
318
  class ProjectLock:
201
319
  """Atomic file lock for a project.
202
320