draft-board 0.1.0-beta.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/app/backend/.env.example +9 -0
- package/app/backend/.smartkanban/evidence/8b383839-cbec-45af-86ee-c7708d075cbe/bddf2ed5-2e21-4d46-a62b-10b87f1642a6_patch.txt +195 -0
- package/app/backend/.smartkanban/evidence/8b383839-cbec-45af-86ee-c7708d075cbe/bddf2ed5-2e21-4d46-a62b-10b87f1642a6_stat.txt +6 -0
- package/app/backend/CURL_EXAMPLES.md +335 -0
- package/app/backend/ENV_SETUP.md +65 -0
- package/app/backend/alembic/env.py +71 -0
- package/app/backend/alembic/script.py.mako +28 -0
- package/app/backend/alembic/versions/001_initial_schema.py +104 -0
- package/app/backend/alembic/versions/002_add_jobs_table.py +52 -0
- package/app/backend/alembic/versions/003_add_workspace_table.py +48 -0
- package/app/backend/alembic/versions/004_add_evidence_table.py +56 -0
- package/app/backend/alembic/versions/005_add_verification_commands.py +32 -0
- package/app/backend/alembic/versions/006_add_planner_lock_table.py +39 -0
- package/app/backend/alembic/versions/007_add_revision_review_tables.py +126 -0
- package/app/backend/alembic/versions/008_add_revision_idempotency_and_traceability.py +52 -0
- package/app/backend/alembic/versions/009_add_job_health_fields.py +46 -0
- package/app/backend/alembic/versions/010_add_review_comment_line_content.py +36 -0
- package/app/backend/alembic/versions/011_add_analysis_cache.py +47 -0
- package/app/backend/alembic/versions/012_add_boards_table.py +102 -0
- package/app/backend/alembic/versions/013_add_ticket_blocking.py +45 -0
- package/app/backend/alembic/versions/014_add_agent_sessions.py +220 -0
- package/app/backend/alembic/versions/015_add_ticket_sort_order.py +33 -0
- package/app/backend/alembic/versions/03220f0b93ae_add_pr_fields_to_ticket.py +49 -0
- package/app/backend/alembic/versions/0c2d89fff3b1_seed_board_configs_from_yaml.py +206 -0
- package/app/backend/alembic/versions/3348e5cf54c1_add_merge_checklist_table.py +67 -0
- package/app/backend/alembic/versions/357c780ee445_add_goal_status.py +34 -0
- package/app/backend/alembic/versions/553340b7e26c_add_autonomy_fields_to_goal.py +65 -0
- package/app/backend/alembic/versions/774dc335c679_merge_migration_heads.py +23 -0
- package/app/backend/alembic/versions/7b307e847cbd_merge_heads.py +23 -0
- package/app/backend/alembic/versions/82ecd978cc70_add_missing_indexes.py +48 -0
- package/app/backend/alembic/versions/8ef5054dc280_add_normalized_log_entries.py +173 -0
- package/app/backend/alembic/versions/8f3e2bd8ea3b_merge_migration_heads.py +23 -0
- package/app/backend/alembic/versions/9d17f0698d3b_add_config_column_to_boards_table.py +30 -0
- package/app/backend/alembic/versions/add_agent_conversation_history.py +72 -0
- package/app/backend/alembic/versions/add_job_variant.py +34 -0
- package/app/backend/alembic/versions/add_performance_indexes.py +95 -0
- package/app/backend/alembic/versions/add_repos_and_board_repos.py +174 -0
- package/app/backend/alembic/versions/add_session_id_to_jobs.py +27 -0
- package/app/backend/alembic/versions/add_sqlite_backend_tables.py +104 -0
- package/app/backend/alembic/versions/b10fb0b62240_add_diff_content_to_revisions.py +34 -0
- package/app/backend/alembic.ini +89 -0
- package/app/backend/app/__init__.py +3 -0
- package/app/backend/app/data_dir.py +85 -0
- package/app/backend/app/database.py +70 -0
- package/app/backend/app/database_sync.py +64 -0
- package/app/backend/app/dependencies/__init__.py +5 -0
- package/app/backend/app/dependencies/auth.py +80 -0
- package/app/backend/app/dependencies.py +43 -0
- package/app/backend/app/exceptions.py +178 -0
- package/app/backend/app/executors/__init__.py +1 -0
- package/app/backend/app/executors/adapters/__init__.py +1 -0
- package/app/backend/app/executors/adapters/aider.py +152 -0
- package/app/backend/app/executors/adapters/amazon_q.py +103 -0
- package/app/backend/app/executors/adapters/amp.py +123 -0
- package/app/backend/app/executors/adapters/claude.py +177 -0
- package/app/backend/app/executors/adapters/cline.py +127 -0
- package/app/backend/app/executors/adapters/codex.py +167 -0
- package/app/backend/app/executors/adapters/copilot.py +202 -0
- package/app/backend/app/executors/adapters/cursor.py +87 -0
- package/app/backend/app/executors/adapters/droid.py +123 -0
- package/app/backend/app/executors/adapters/gemini.py +132 -0
- package/app/backend/app/executors/adapters/goose.py +131 -0
- package/app/backend/app/executors/adapters/opencode.py +123 -0
- package/app/backend/app/executors/adapters/qwen.py +123 -0
- package/app/backend/app/executors/plugins/__init__.py +1 -0
- package/app/backend/app/executors/registry.py +202 -0
- package/app/backend/app/executors/spec.py +226 -0
- package/app/backend/app/main.py +486 -0
- package/app/backend/app/middleware/__init__.py +13 -0
- package/app/backend/app/middleware/idempotency.py +426 -0
- package/app/backend/app/middleware/rate_limit.py +312 -0
- package/app/backend/app/middleware/security_headers.py +43 -0
- package/app/backend/app/middleware/timeout.py +37 -0
- package/app/backend/app/models/__init__.py +56 -0
- package/app/backend/app/models/agent_conversation_history.py +56 -0
- package/app/backend/app/models/agent_session.py +127 -0
- package/app/backend/app/models/analysis_cache.py +49 -0
- package/app/backend/app/models/base.py +9 -0
- package/app/backend/app/models/board.py +79 -0
- package/app/backend/app/models/board_repo.py +68 -0
- package/app/backend/app/models/cost_budget.py +42 -0
- package/app/backend/app/models/enums.py +40 -0
- package/app/backend/app/models/evidence.py +132 -0
- package/app/backend/app/models/goal.py +102 -0
- package/app/backend/app/models/idempotency_entry.py +30 -0
- package/app/backend/app/models/job.py +163 -0
- package/app/backend/app/models/job_queue.py +39 -0
- package/app/backend/app/models/kv_store.py +28 -0
- package/app/backend/app/models/merge_checklist.py +87 -0
- package/app/backend/app/models/normalized_log.py +100 -0
- package/app/backend/app/models/planner_lock.py +43 -0
- package/app/backend/app/models/rate_limit_entry.py +25 -0
- package/app/backend/app/models/repo.py +66 -0
- package/app/backend/app/models/review_comment.py +91 -0
- package/app/backend/app/models/review_summary.py +69 -0
- package/app/backend/app/models/revision.py +130 -0
- package/app/backend/app/models/ticket.py +223 -0
- package/app/backend/app/models/ticket_event.py +83 -0
- package/app/backend/app/models/user.py +47 -0
- package/app/backend/app/models/workspace.py +71 -0
- package/app/backend/app/redis_client.py +119 -0
- package/app/backend/app/routers/__init__.py +29 -0
- package/app/backend/app/routers/agents.py +296 -0
- package/app/backend/app/routers/auth.py +94 -0
- package/app/backend/app/routers/board.py +885 -0
- package/app/backend/app/routers/dashboard.py +351 -0
- package/app/backend/app/routers/debug.py +528 -0
- package/app/backend/app/routers/evidence.py +96 -0
- package/app/backend/app/routers/executors.py +324 -0
- package/app/backend/app/routers/goals.py +574 -0
- package/app/backend/app/routers/jobs.py +448 -0
- package/app/backend/app/routers/maintenance.py +172 -0
- package/app/backend/app/routers/merge.py +360 -0
- package/app/backend/app/routers/planner.py +537 -0
- package/app/backend/app/routers/pull_requests.py +382 -0
- package/app/backend/app/routers/repos.py +263 -0
- package/app/backend/app/routers/revisions.py +939 -0
- package/app/backend/app/routers/settings.py +267 -0
- package/app/backend/app/routers/tickets.py +2003 -0
- package/app/backend/app/routers/webhooks.py +143 -0
- package/app/backend/app/routers/websocket.py +249 -0
- package/app/backend/app/schemas/__init__.py +109 -0
- package/app/backend/app/schemas/board.py +87 -0
- package/app/backend/app/schemas/common.py +33 -0
- package/app/backend/app/schemas/evidence.py +87 -0
- package/app/backend/app/schemas/goal.py +90 -0
- package/app/backend/app/schemas/job.py +97 -0
- package/app/backend/app/schemas/merge.py +139 -0
- package/app/backend/app/schemas/planner.py +500 -0
- package/app/backend/app/schemas/repo.py +187 -0
- package/app/backend/app/schemas/review.py +137 -0
- package/app/backend/app/schemas/revision.py +114 -0
- package/app/backend/app/schemas/ticket.py +238 -0
- package/app/backend/app/schemas/ticket_event.py +72 -0
- package/app/backend/app/schemas/workspace.py +19 -0
- package/app/backend/app/services/__init__.py +31 -0
- package/app/backend/app/services/agent_memory_service.py +223 -0
- package/app/backend/app/services/agent_registry.py +346 -0
- package/app/backend/app/services/agent_session_manager.py +318 -0
- package/app/backend/app/services/agent_session_service.py +219 -0
- package/app/backend/app/services/agent_tools.py +379 -0
- package/app/backend/app/services/auth_service.py +98 -0
- package/app/backend/app/services/autonomy_service.py +380 -0
- package/app/backend/app/services/board_repo_service.py +201 -0
- package/app/backend/app/services/board_service.py +326 -0
- package/app/backend/app/services/cleanup_service.py +1085 -0
- package/app/backend/app/services/config_service.py +908 -0
- package/app/backend/app/services/context_gatherer.py +557 -0
- package/app/backend/app/services/cost_tracking_service.py +293 -0
- package/app/backend/app/services/cursor_log_normalizer.py +536 -0
- package/app/backend/app/services/delivery_pipeline.py +440 -0
- package/app/backend/app/services/executor_service.py +634 -0
- package/app/backend/app/services/git_host/__init__.py +11 -0
- package/app/backend/app/services/git_host/factory.py +87 -0
- package/app/backend/app/services/git_host/github.py +270 -0
- package/app/backend/app/services/git_host/gitlab.py +194 -0
- package/app/backend/app/services/git_host/protocol.py +75 -0
- package/app/backend/app/services/git_merge_simple.py +346 -0
- package/app/backend/app/services/git_ops.py +384 -0
- package/app/backend/app/services/github_service.py +233 -0
- package/app/backend/app/services/goal_service.py +113 -0
- package/app/backend/app/services/job_service.py +423 -0
- package/app/backend/app/services/job_watchdog_service.py +424 -0
- package/app/backend/app/services/langchain_adapter.py +122 -0
- package/app/backend/app/services/llm_provider_clients.py +351 -0
- package/app/backend/app/services/llm_service.py +285 -0
- package/app/backend/app/services/log_normalizer.py +342 -0
- package/app/backend/app/services/log_stream_service.py +276 -0
- package/app/backend/app/services/merge_checklist_service.py +264 -0
- package/app/backend/app/services/merge_service.py +784 -0
- package/app/backend/app/services/orchestrator_log.py +84 -0
- package/app/backend/app/services/planner_service.py +1662 -0
- package/app/backend/app/services/planner_tick_sync.py +1040 -0
- package/app/backend/app/services/queued_message_service.py +156 -0
- package/app/backend/app/services/reliability_wrapper.py +389 -0
- package/app/backend/app/services/repo_discovery_service.py +318 -0
- package/app/backend/app/services/review_service.py +334 -0
- package/app/backend/app/services/revision_service.py +389 -0
- package/app/backend/app/services/safe_autopilot.py +510 -0
- package/app/backend/app/services/sqlite_worker.py +372 -0
- package/app/backend/app/services/task_dispatch.py +135 -0
- package/app/backend/app/services/ticket_generation_service.py +1781 -0
- package/app/backend/app/services/ticket_service.py +486 -0
- package/app/backend/app/services/udar_planner_service.py +1007 -0
- package/app/backend/app/services/webhook_service.py +126 -0
- package/app/backend/app/services/workspace_service.py +465 -0
- package/app/backend/app/services/worktree_file_service.py +92 -0
- package/app/backend/app/services/worktree_validator.py +213 -0
- package/app/backend/app/sqlite_kv.py +278 -0
- package/app/backend/app/state_machine.py +128 -0
- package/app/backend/app/templates/__init__.py +5 -0
- package/app/backend/app/templates/registry.py +243 -0
- package/app/backend/app/utils/__init__.py +5 -0
- package/app/backend/app/utils/artifact_reader.py +87 -0
- package/app/backend/app/utils/circuit_breaker.py +229 -0
- package/app/backend/app/utils/db_retry.py +136 -0
- package/app/backend/app/utils/ignored_fields.py +123 -0
- package/app/backend/app/utils/validators.py +54 -0
- package/app/backend/app/websocket/__init__.py +5 -0
- package/app/backend/app/websocket/manager.py +179 -0
- package/app/backend/app/websocket/state_tracker.py +113 -0
- package/app/backend/app/worker.py +3190 -0
- package/app/backend/calculator_tickets.json +40 -0
- package/app/backend/canary_tests.sh +591 -0
- package/app/backend/celerybeat-schedule +0 -0
- package/app/backend/celerybeat-schedule-shm +0 -0
- package/app/backend/celerybeat-schedule-wal +0 -0
- package/app/backend/logs/.gitkeep +3 -0
- package/app/backend/multiplication_division_implementation_tickets.json +55 -0
- package/app/backend/multiplication_division_tickets.json +42 -0
- package/app/backend/pyproject.toml +45 -0
- package/app/backend/requirements-dev.txt +8 -0
- package/app/backend/requirements.txt +20 -0
- package/app/backend/run.sh +30 -0
- package/app/backend/run_with_logs.sh +10 -0
- package/app/backend/scientific_calculator_tickets.json +40 -0
- package/app/backend/scripts/extract_openapi.py +21 -0
- package/app/backend/scripts/seed_demo.py +187 -0
- package/app/backend/setup_demo_review.py +302 -0
- package/app/backend/test_actual_parse.py +41 -0
- package/app/backend/test_agent_streaming.py +61 -0
- package/app/backend/test_parse.py +51 -0
- package/app/backend/test_streaming.py +51 -0
- package/app/backend/test_subprocess_streaming.py +50 -0
- package/app/backend/tests/__init__.py +1 -0
- package/app/backend/tests/conftest.py +46 -0
- package/app/backend/tests/test_auth.py +341 -0
- package/app/backend/tests/test_autonomy_service.py +391 -0
- package/app/backend/tests/test_cleanup_service_safety.py +417 -0
- package/app/backend/tests/test_middleware.py +279 -0
- package/app/backend/tests/test_planner_providers.py +290 -0
- package/app/backend/tests/test_planner_unblock.py +183 -0
- package/app/backend/tests/test_revision_invariants.py +618 -0
- package/app/backend/tests/test_sqlite_kv.py +290 -0
- package/app/backend/tests/test_sqlite_worker.py +353 -0
- package/app/backend/tests/test_task_dispatch.py +100 -0
- package/app/backend/tests/test_ticket_validation.py +304 -0
- package/app/backend/tests/test_udar_agent.py +693 -0
- package/app/backend/tests/test_webhook_service.py +184 -0
- package/app/backend/tickets_output.json +59 -0
- package/app/backend/user_management_tickets.json +50 -0
- package/app/backend/uvicorn.log +0 -0
- package/app/draft.yaml +313 -0
- package/app/frontend/dist/assets/index-LcjCczu5.js +155 -0
- package/app/frontend/dist/assets/index-_FP_279e.css +1 -0
- package/app/frontend/dist/index.html +14 -0
- package/app/frontend/dist/vite.svg +1 -0
- package/app/frontend/package.json +101 -0
- package/bin/cli.js +527 -0
- package/package.json +37 -0
|
@@ -0,0 +1,424 @@
|
|
|
1
|
+
"""Service for monitoring and recovering stuck jobs."""
|
|
2
|
+
|
|
3
|
+
import logging
import time
from dataclasses import dataclass, field
from datetime import UTC, datetime, timedelta

from sqlalchemy import or_
from sqlalchemy.exc import OperationalError
from sqlalchemy.orm import Session

from app.database_sync import get_sync_db
from app.models.job import Job, JobStatus
from app.models.ticket import Ticket
from app.state_machine import TicketState
|
|
16
|
+
|
|
17
|
+
logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _ensure_utc(dt: datetime | None) -> datetime | None:
|
|
21
|
+
"""Ensure a datetime is UTC-aware. Returns None if input is None.
|
|
22
|
+
|
|
23
|
+
SQLite stores datetimes without timezone info. This helper ensures
|
|
24
|
+
we can safely compare database datetimes with timezone-aware now().
|
|
25
|
+
"""
|
|
26
|
+
if dt is None:
|
|
27
|
+
return None
|
|
28
|
+
if dt.tzinfo is None:
|
|
29
|
+
# Assume naive datetimes from DB are UTC
|
|
30
|
+
return dt.replace(tzinfo=UTC)
|
|
31
|
+
return dt
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _to_naive_utc(dt: datetime) -> datetime:
|
|
35
|
+
"""Convert a datetime to naive UTC for SQLite comparisons.
|
|
36
|
+
|
|
37
|
+
SQLite stores datetimes without timezone info. When comparing Python
|
|
38
|
+
datetimes with SQLite columns, we need naive datetimes to avoid
|
|
39
|
+
'can't subtract offset-naive and offset-aware datetimes' errors.
|
|
40
|
+
"""
|
|
41
|
+
if dt.tzinfo is not None:
|
|
42
|
+
# Convert to UTC and strip timezone
|
|
43
|
+
return dt.replace(tzinfo=None)
|
|
44
|
+
return dt
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
# Default thresholds used by run_job_watchdog().
HEARTBEAT_STALE_SECONDS = 120  # Job is stale if no heartbeat in 2 minutes
QUEUED_REENQUEUE_SECONDS = 30  # Re-enqueue lost tasks after 30 seconds
QUEUED_STALE_MINUTES = 15  # Job is stuck in queue if queued for 15 minutes (increased from 2 to handle busy workers)
DEFAULT_JOB_TIMEOUT_SECONDS = 900  # 15 minutes default; a per-job Job.timeout_seconds takes precedence

# SQLite retry config used by _fail_job_with_retry to ride out
# "database is locked" / SQLITE_BUSY errors from concurrent writers.
MAX_RETRIES = 3
RETRY_DELAY_SECONDS = 0.5
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
@dataclass
class WatchdogResult:
    """Aggregated outcome of a single watchdog run.

    Counters are incremented by run_job_watchdog(); ``details`` collects one
    human-readable line per action taken.
    """

    stale_jobs_recovered: int = 0  # RUNNING jobs failed for a stale heartbeat
    timed_out_jobs_recovered: int = 0  # RUNNING jobs failed for exceeding their timeout
    stuck_queued_jobs_failed: int = 0  # QUEUED jobs failed after waiting too long on an idle worker
    missing_started_at_jobs: int = 0  # RUNNING jobs with no started_at (data corruption)
    lost_tasks_reenqueued: int = 0  # Jobs re-enqueued due to lost Celery tasks
    tickets_blocked: int = 0  # Tickets transitioned to BLOCKED as a side effect
    # FIX: the original declared ``details: list[str] = None`` — a wrong type
    # annotation used as a mutable-default workaround. A default_factory is
    # the idiomatic dataclass solution and gives each instance its own list.
    details: list[str] = field(default_factory=list)

    def __post_init__(self):
        # Defensive: still accept an explicit ``details=None`` from legacy callers.
        if self.details is None:
            self.details = []
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def run_job_watchdog() -> WatchdogResult:
    """Run the job watchdog to recover stuck jobs.

    This function checks for:
    1. RUNNING jobs with stale heartbeat (no update in HEARTBEAT_STALE_SECONDS)
    2. RUNNING jobs missing started_at (data corruption or bug)
    3. RUNNING jobs that exceeded their timeout_seconds
    4. QUEUED jobs older than QUEUED_REENQUEUE_SECONDS while no job is
       running — their Celery task is presumed lost and is re-enqueued
    5. QUEUED jobs that still haven't been picked up in QUEUED_STALE_MINUTES
       while no job is running — failed as a final fallback

    For each stuck job:
    - Mark job as FAILED with reason
    - Transition associated ticket to BLOCKED (via proper transition function)
    - Uses transaction + retry for SQLite concurrency safety

    Returns:
        WatchdogResult with counts and details
    """
    result = WatchdogResult()

    with get_sync_db() as db:
        # Use UTC-aware datetime for Python operations
        now = datetime.now(UTC)
        # Use naive UTC for SQLite comparisons (SQLite stores naive datetimes)
        now_naive = _to_naive_utc(now)

        # 1. Find RUNNING jobs with stale heartbeat
        heartbeat_threshold = now_naive - timedelta(seconds=HEARTBEAT_STALE_SECONDS)
        stale_heartbeat_jobs = (
            db.query(Job)
            .filter(
                Job.status == JobStatus.RUNNING.value,
                # Must have started_at to be considered for stale heartbeat
                Job.started_at.isnot(None),
                # A job that never sent a heartbeat counts as stale too.
                or_(
                    Job.last_heartbeat_at.is_(None),
                    Job.last_heartbeat_at < heartbeat_threshold,
                ),
            )
            .all()
        )

        for job in stale_heartbeat_jobs:
            _fail_job_with_retry(
                db=db,
                job=job,
                reason="Stale heartbeat - worker may have crashed",
                result=result,
                now=now,
            )
            result.stale_jobs_recovered += 1

        # 2. Find RUNNING jobs missing started_at (data corruption)
        # These are separate from stale heartbeat - they indicate a bug
        missing_started_at_jobs = (
            db.query(Job)
            .filter(
                Job.status == JobStatus.RUNNING.value,
                Job.started_at.is_(None),
            )
            .all()
        )

        for job in missing_started_at_jobs:
            _fail_job_with_retry(
                db=db,
                job=job,
                reason="Running job missing started_at - possible data corruption",
                result=result,
                now=now,
            )
            result.missing_started_at_jobs += 1

        # 3. Find RUNNING jobs that exceeded timeout
        # Only check jobs that have valid started_at
        running_jobs_with_start = (
            db.query(Job)
            .filter(
                Job.status == JobStatus.RUNNING.value,
                Job.started_at.isnot(None),
            )
            .all()
        )

        for job in running_jobs_with_start:
            # Per-job timeout wins; fall back to the module default.
            timeout = job.timeout_seconds or DEFAULT_JOB_TIMEOUT_SECONDS
            # Safe: we know started_at is not None from the query
            # Use _ensure_utc to handle timezone-naive datetimes from SQLite
            elapsed = (now - _ensure_utc(job.started_at)).total_seconds()
            if elapsed > timeout:
                _fail_job_with_retry(
                    db=db,
                    job=job,
                    reason=f"Job timeout exceeded ({timeout}s, elapsed {int(elapsed)}s)",
                    result=result,
                    now=now,
                )
                result.timed_out_jobs_recovered += 1

        # 4. Re-enqueue jobs that may have lost their Celery tasks (30s threshold)
        # This catches jobs where the Celery task was lost from Redis but the DB shows queued
        # BUT: Only re-enqueue when no RUNNING jobs exist — otherwise the worker is just
        # busy and the jobs will be picked up when the current job completes.
        # Re-enqueueing while busy creates duplicate job_queue entries.
        running_jobs_count = (
            db.query(Job).filter(Job.status == JobStatus.RUNNING.value).count()
        )

        if running_jobs_count == 0:
            reenqueue_threshold = now_naive - timedelta(
                seconds=QUEUED_REENQUEUE_SECONDS
            )
            potentially_lost_jobs = (
                db.query(Job)
                .filter(
                    Job.status == JobStatus.QUEUED.value,
                    Job.created_at < reenqueue_threshold,
                )
                .all()
            )

            for job in potentially_lost_jobs:
                reenqueued = _reenqueue_lost_task(db, job, result)
                if reenqueued:
                    result.lost_tasks_reenqueued += 1
        else:
            logger.debug(
                f"Skipping re-enqueue: {running_jobs_count} jobs currently running"
            )

        # 5. Find QUEUED jobs that haven't been picked up even after re-enqueue attempts
        # This is the final fallback - fail jobs stuck for too long
        # BUT: Only fail if there are NO running jobs (worker is idle but not picking up work)
        # If there are running jobs, the worker is just busy - let queued jobs wait
        # NOTE(review): the RUNNING count is queried again here even though the
        # watchdog itself never sets a job to RUNNING between steps 4 and 5 —
        # presumably to observe concurrent workers; confirm before collapsing.
        running_jobs_count = (
            db.query(Job).filter(Job.status == JobStatus.RUNNING.value).count()
        )

        if running_jobs_count == 0:
            # Worker appears idle but jobs are stuck in queue - something is wrong
            queued_threshold = now_naive - timedelta(minutes=QUEUED_STALE_MINUTES)
            stuck_queued_jobs = (
                db.query(Job)
                .filter(
                    Job.status == JobStatus.QUEUED.value,
                    Job.created_at < queued_threshold,
                )
                .all()
            )

            for job in stuck_queued_jobs:
                _fail_job_with_retry(
                    db=db,
                    job=job,
                    reason=f"Stuck in queue for over {QUEUED_STALE_MINUTES} minutes with no active jobs - worker may be down",
                    result=result,
                    now=now,
                )
                result.stuck_queued_jobs_failed += 1
        else:
            logger.debug(
                f"Skipping stuck queue check: {running_jobs_count} jobs currently running"
            )

    logger.info(
        f"Watchdog completed: {result.stale_jobs_recovered} stale, "
        f"{result.timed_out_jobs_recovered} timed out, "
        f"{result.missing_started_at_jobs} missing started_at, "
        f"{result.lost_tasks_reenqueued} re-enqueued, "
        f"{result.stuck_queued_jobs_failed} stuck queued, "
        f"{result.tickets_blocked} tickets blocked"
    )

    return result
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
def _reenqueue_lost_task(db: Session, job: Job, result: WatchdogResult) -> bool:
    """Re-enqueue the Celery task for a QUEUED job whose task appears lost.

    There is no positive check against Celery/Redis here: the caller only
    invokes this for jobs that have sat QUEUED past the re-enqueue threshold
    while no job is running, which is taken as evidence the original task
    was lost. (The previous docstring claimed an active-task check that was
    never actually performed.)

    If the job's ticket is already in a terminal state (BLOCKED, DONE,
    ABANDONED), the job is failed instead of re-enqueued.

    Args:
        db: Database session
        job: The job to potentially re-enqueue
        result: WatchdogResult to update with details

    Returns:
        True if a new task was enqueued; False if the ticket was terminal,
        the job kind is unknown, or enqueueing failed.
    """
    # Local imports — presumably to avoid circular imports at module load;
    # confirm before hoisting to the top of the file.
    from app.models.job import JobKind
    from app.services.task_dispatch import enqueue_task

    # First check if the ticket is in a state where re-enqueueing makes sense
    ticket = db.query(Ticket).filter(Ticket.id == job.ticket_id).first()
    if ticket:
        terminal_states = [
            TicketState.BLOCKED.value,
            TicketState.DONE.value,
            TicketState.ABANDONED.value,
        ]
        if ticket.state in terminal_states:
            logger.info(
                f"Job {job.id} ticket is in {ticket.state} state - failing job instead of re-enqueueing"
            )
            job.status = JobStatus.FAILED.value
            job.finished_at = datetime.now(UTC)
            job.exit_code = -1
            db.commit()
            result.details.append(
                f"Job {job.id} ({job.kind}): Failed (ticket already {ticket.state})"
            )
            return False

    # Task is lost or never existed - re-enqueue via unified dispatch
    try:
        # Dispatch table mapping a job kind to its Celery task name
        # (replaces the previous if/elif chain; behavior unchanged).
        task_name_by_kind = {
            JobKind.EXECUTE.value: "execute_ticket",
            JobKind.VERIFY.value: "verify_ticket",
            JobKind.RESUME.value: "resume_ticket",
        }
        task_name = task_name_by_kind.get(job.kind)

        task = enqueue_task(task_name, args=[job.id]) if task_name else None

        if task:
            old_task_id = job.celery_task_id
            job.celery_task_id = task.id
            db.commit()
            logger.info(
                f"Re-enqueued lost Celery task for job {job.id} ({job.kind}): "
                f"old={old_task_id}, new={task.id}"
            )
            result.details.append(
                f"Job {job.id} ({job.kind}): Re-enqueued lost task (was {old_task_id})"
            )
            return True
        else:
            # Reached both for unknown kinds and for a falsy enqueue result.
            logger.warning(f"Unknown job kind for re-enqueue: {job.kind}")
            return False

    except Exception as e:
        logger.error(f"Failed to re-enqueue job {job.id}: {e}")
        result.details.append(f"Job {job.id} ({job.kind}): Failed to re-enqueue: {e}")
        return False
|
|
322
|
+
|
|
323
|
+
|
|
324
|
+
def _fail_job_with_retry(
    db: Session,
    job: Job,
    reason: str,
    result: WatchdogResult,
    now: datetime,
) -> None:
    """Mark a job as failed with SQLite retry logic.

    Uses transaction + retry to handle SQLite BUSY errors from
    concurrent worker/API writes.

    Args:
        db: Database session
        job: The job to fail
        reason: Reason for failure
        result: WatchdogResult to update
        now: Current timestamp
    """
    for attempt in range(MAX_RETRIES):
        try:
            _fail_job(db, job, reason, result, now)
            return
        except OperationalError as e:
            if "database is locked" in str(e) or "SQLITE_BUSY" in str(e):
                # FIX: roll back the failed transaction immediately — the
                # original slept while holding the dirty session, and on the
                # final attempt never rolled back at all, leaving the session
                # in a failed-transaction state for subsequent watchdog work.
                db.rollback()
                if attempt < MAX_RETRIES - 1:
                    logger.warning(
                        f"SQLite busy, retrying job {job.id} fail (attempt {attempt + 1})"
                    )
                    time.sleep(RETRY_DELAY_SECONDS)
                else:
                    logger.error(
                        f"Failed to fail job {job.id} after {MAX_RETRIES} attempts: {e}"
                    )
                    result.details.append(f"[RETRY FAILED] Job {job.id}: {reason}")
            else:
                # Non-lock operational errors are not retryable here.
                raise
|
|
362
|
+
|
|
363
|
+
|
|
364
|
+
def _fail_job(
    db: Session,
    job: Job,
    reason: str,
    result: WatchdogResult,
    now: datetime,
) -> None:
    """Mark a job as failed and transition ticket to BLOCKED.

    Uses the proper transition function to ensure:
    - State machine rules are respected
    - Events are created consistently
    - Side effects are handled

    The job's FAILED status is committed *before* the ticket transition so
    the job record is durable even if the transition raises.

    Args:
        db: Database session
        job: The job to fail
        reason: Reason for failure
        result: WatchdogResult to update
        now: Current timestamp (used as the job's finished_at)
    """
    # Mark job as failed; exit_code -1 flags a watchdog-forced failure
    # rather than a real process exit code.
    job.status = JobStatus.FAILED.value
    job.finished_at = now
    job.exit_code = -1

    result.details.append(f"Job {job.id} ({job.kind}): {reason}")

    # Get the ticket; a missing ticket still commits the job failure.
    ticket = db.query(Ticket).filter(Ticket.id == job.ticket_id).first()
    if not ticket:
        db.commit()
        return

    # Only transition if not in terminal state
    if ticket.state in [TicketState.DONE.value, TicketState.ABANDONED.value]:
        db.commit()
        return

    # Commit job status first
    db.commit()

    # Use the proper transition function to maintain invariants
    # This ensures state machine rules and event creation are consistent
    # NOTE(review): imported locally, presumably to avoid a circular import
    # between this service and app.worker — confirm before hoisting.
    from app.worker import transition_ticket_sync

    transition_ticket_sync(
        ticket_id=ticket.id,
        to_state=TicketState.BLOCKED,
        reason=f"Job {job.id} failed: {reason}",
        payload={
            "job_id": job.id,
            "job_kind": job.kind,
            "watchdog_reason": reason,
        },
        actor_id="job_watchdog",
        auto_verify=False,  # Don't auto-verify when blocking
    )

    result.tickets_blocked += 1
    result.details.append(f"Ticket {ticket.id} transitioned to BLOCKED")
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
"""LangChain adapter to bridge existing LLMService with LangChain's BaseLLM interface.
|
|
2
|
+
|
|
3
|
+
This adapter allows UDAR agent to use Draft's existing LLM infrastructure
|
|
4
|
+
(LiteLLM with multi-provider support) without refactoring.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from langchain_core.callbacks.manager import CallbackManagerForLLMRun
|
|
10
|
+
from langchain_core.language_models.llms import LLM
|
|
11
|
+
|
|
12
|
+
from app.services.llm_service import LLMService
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class LangChainLLMAdapter(LLM):
    """Adapter to use existing LLMService with LangChain.

    This allows UDAR agent to leverage Draft's existing LLM infrastructure
    while using LangGraph's state machine framework.

    Example:
        llm_service = LLMService()
        adapter = LangChainLLMAdapter(llm_service=llm_service)
        response = adapter.invoke("What tickets are needed?")
    """

    # Pydantic model fields (declared on the class body, populated by LLM's
    # pydantic machinery at construction time).
    llm_service: LLMService  # required: Draft's multi-provider LLM wrapper
    # NOTE(review): `model` is never forwarded to call_completion below —
    # presumably LLMService selects the model from its own configuration;
    # confirm this field actually has an effect.
    model: str = "claude-opus-4-6"  # Default model, can be overridden
    max_tokens: int = 2000
    temperature: float = 0.0  # Deterministic by default

    class Config:
        """Pydantic configuration (v1-style inner class)."""

        # Required so pydantic accepts the non-pydantic LLMService instance.
        arbitrary_types_allowed = True

    @property
    def _llm_type(self) -> str:
        """Return identifier for this LLM type."""
        return "smart_kanban_llm"

    def _call(
        self,
        prompt: str,
        stop: list[str] | None = None,
        run_manager: CallbackManagerForLLMRun | None = None,
        **kwargs: Any,
    ) -> str:
        """Call the LLM with a prompt and return the response.

        Args:
            prompt: The prompt to send to the LLM
            stop: Optional list of stop sequences
            run_manager: Optional callback manager
            **kwargs: Additional arguments (temperature, max_tokens, etc.)

        Returns:
            The LLM's response as a string
        """
        # Extract parameters with fallbacks to the configured field defaults
        max_tokens = kwargs.get("max_tokens", self.max_tokens)
        temperature = kwargs.get("temperature", self.temperature)

        # Call existing LLMService; the single user message wraps the prompt.
        response = self.llm_service.call_completion(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=max_tokens,
            temperature=temperature,
            stop=stop,
        )

        # Track token usage if callback manager provided
        # NOTE(review): LangChain normally fires on_llm_end itself (with an
        # LLMResult) after _call returns; invoking it here with the raw
        # LLMService response may double-fire callbacks or pass an
        # unexpected type — verify against the langchain_core callback
        # contract.
        if run_manager:
            run_manager.on_llm_end(response)

        return response.content

    async def _acall(
        self,
        prompt: str,
        stop: list[str] | None = None,
        run_manager: CallbackManagerForLLMRun | None = None,
        **kwargs: Any,
    ) -> str:
        """Async version of _call.

        Args:
            prompt: The prompt to send to the LLM
            stop: Optional list of stop sequences
            run_manager: Optional callback manager
            **kwargs: Additional arguments

        Returns:
            The LLM's response as a string
        """
        # Extract parameters with fallbacks to the configured field defaults
        max_tokens = kwargs.get("max_tokens", self.max_tokens)
        temperature = kwargs.get("temperature", self.temperature)

        # Call existing LLMService (async)
        response = await self.llm_service.call_completion_async(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=max_tokens,
            temperature=temperature,
            stop=stop,
        )

        # Track token usage if callback manager provided
        # NOTE(review): same on_llm_end concern as in _call above.
        if run_manager:
            run_manager.on_llm_end(response)

        return response.content

    @property
    def _identifying_params(self) -> dict[str, Any]:
        """Return identifying parameters for this LLM."""
        return {
            "model": self.model,
            "max_tokens": self.max_tokens,
            "temperature": self.temperature,
        }
|