draft-board 0.1.0-beta.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (250) hide show
  1. package/app/backend/.env.example +9 -0
  2. package/app/backend/.smartkanban/evidence/8b383839-cbec-45af-86ee-c7708d075cbe/bddf2ed5-2e21-4d46-a62b-10b87f1642a6_patch.txt +195 -0
  3. package/app/backend/.smartkanban/evidence/8b383839-cbec-45af-86ee-c7708d075cbe/bddf2ed5-2e21-4d46-a62b-10b87f1642a6_stat.txt +6 -0
  4. package/app/backend/CURL_EXAMPLES.md +335 -0
  5. package/app/backend/ENV_SETUP.md +65 -0
  6. package/app/backend/alembic/env.py +71 -0
  7. package/app/backend/alembic/script.py.mako +28 -0
  8. package/app/backend/alembic/versions/001_initial_schema.py +104 -0
  9. package/app/backend/alembic/versions/002_add_jobs_table.py +52 -0
  10. package/app/backend/alembic/versions/003_add_workspace_table.py +48 -0
  11. package/app/backend/alembic/versions/004_add_evidence_table.py +56 -0
  12. package/app/backend/alembic/versions/005_add_verification_commands.py +32 -0
  13. package/app/backend/alembic/versions/006_add_planner_lock_table.py +39 -0
  14. package/app/backend/alembic/versions/007_add_revision_review_tables.py +126 -0
  15. package/app/backend/alembic/versions/008_add_revision_idempotency_and_traceability.py +52 -0
  16. package/app/backend/alembic/versions/009_add_job_health_fields.py +46 -0
  17. package/app/backend/alembic/versions/010_add_review_comment_line_content.py +36 -0
  18. package/app/backend/alembic/versions/011_add_analysis_cache.py +47 -0
  19. package/app/backend/alembic/versions/012_add_boards_table.py +102 -0
  20. package/app/backend/alembic/versions/013_add_ticket_blocking.py +45 -0
  21. package/app/backend/alembic/versions/014_add_agent_sessions.py +220 -0
  22. package/app/backend/alembic/versions/015_add_ticket_sort_order.py +33 -0
  23. package/app/backend/alembic/versions/03220f0b93ae_add_pr_fields_to_ticket.py +49 -0
  24. package/app/backend/alembic/versions/0c2d89fff3b1_seed_board_configs_from_yaml.py +206 -0
  25. package/app/backend/alembic/versions/3348e5cf54c1_add_merge_checklist_table.py +67 -0
  26. package/app/backend/alembic/versions/357c780ee445_add_goal_status.py +34 -0
  27. package/app/backend/alembic/versions/553340b7e26c_add_autonomy_fields_to_goal.py +65 -0
  28. package/app/backend/alembic/versions/774dc335c679_merge_migration_heads.py +23 -0
  29. package/app/backend/alembic/versions/7b307e847cbd_merge_heads.py +23 -0
  30. package/app/backend/alembic/versions/82ecd978cc70_add_missing_indexes.py +48 -0
  31. package/app/backend/alembic/versions/8ef5054dc280_add_normalized_log_entries.py +173 -0
  32. package/app/backend/alembic/versions/8f3e2bd8ea3b_merge_migration_heads.py +23 -0
  33. package/app/backend/alembic/versions/9d17f0698d3b_add_config_column_to_boards_table.py +30 -0
  34. package/app/backend/alembic/versions/add_agent_conversation_history.py +72 -0
  35. package/app/backend/alembic/versions/add_job_variant.py +34 -0
  36. package/app/backend/alembic/versions/add_performance_indexes.py +95 -0
  37. package/app/backend/alembic/versions/add_repos_and_board_repos.py +174 -0
  38. package/app/backend/alembic/versions/add_session_id_to_jobs.py +27 -0
  39. package/app/backend/alembic/versions/add_sqlite_backend_tables.py +104 -0
  40. package/app/backend/alembic/versions/b10fb0b62240_add_diff_content_to_revisions.py +34 -0
  41. package/app/backend/alembic.ini +89 -0
  42. package/app/backend/app/__init__.py +3 -0
  43. package/app/backend/app/data_dir.py +85 -0
  44. package/app/backend/app/database.py +70 -0
  45. package/app/backend/app/database_sync.py +64 -0
  46. package/app/backend/app/dependencies/__init__.py +5 -0
  47. package/app/backend/app/dependencies/auth.py +80 -0
  48. package/app/backend/app/dependencies.py +43 -0
  49. package/app/backend/app/exceptions.py +178 -0
  50. package/app/backend/app/executors/__init__.py +1 -0
  51. package/app/backend/app/executors/adapters/__init__.py +1 -0
  52. package/app/backend/app/executors/adapters/aider.py +152 -0
  53. package/app/backend/app/executors/adapters/amazon_q.py +103 -0
  54. package/app/backend/app/executors/adapters/amp.py +123 -0
  55. package/app/backend/app/executors/adapters/claude.py +177 -0
  56. package/app/backend/app/executors/adapters/cline.py +127 -0
  57. package/app/backend/app/executors/adapters/codex.py +167 -0
  58. package/app/backend/app/executors/adapters/copilot.py +202 -0
  59. package/app/backend/app/executors/adapters/cursor.py +87 -0
  60. package/app/backend/app/executors/adapters/droid.py +123 -0
  61. package/app/backend/app/executors/adapters/gemini.py +132 -0
  62. package/app/backend/app/executors/adapters/goose.py +131 -0
  63. package/app/backend/app/executors/adapters/opencode.py +123 -0
  64. package/app/backend/app/executors/adapters/qwen.py +123 -0
  65. package/app/backend/app/executors/plugins/__init__.py +1 -0
  66. package/app/backend/app/executors/registry.py +202 -0
  67. package/app/backend/app/executors/spec.py +226 -0
  68. package/app/backend/app/main.py +486 -0
  69. package/app/backend/app/middleware/__init__.py +13 -0
  70. package/app/backend/app/middleware/idempotency.py +426 -0
  71. package/app/backend/app/middleware/rate_limit.py +312 -0
  72. package/app/backend/app/middleware/security_headers.py +43 -0
  73. package/app/backend/app/middleware/timeout.py +37 -0
  74. package/app/backend/app/models/__init__.py +56 -0
  75. package/app/backend/app/models/agent_conversation_history.py +56 -0
  76. package/app/backend/app/models/agent_session.py +127 -0
  77. package/app/backend/app/models/analysis_cache.py +49 -0
  78. package/app/backend/app/models/base.py +9 -0
  79. package/app/backend/app/models/board.py +79 -0
  80. package/app/backend/app/models/board_repo.py +68 -0
  81. package/app/backend/app/models/cost_budget.py +42 -0
  82. package/app/backend/app/models/enums.py +40 -0
  83. package/app/backend/app/models/evidence.py +132 -0
  84. package/app/backend/app/models/goal.py +102 -0
  85. package/app/backend/app/models/idempotency_entry.py +30 -0
  86. package/app/backend/app/models/job.py +163 -0
  87. package/app/backend/app/models/job_queue.py +39 -0
  88. package/app/backend/app/models/kv_store.py +28 -0
  89. package/app/backend/app/models/merge_checklist.py +87 -0
  90. package/app/backend/app/models/normalized_log.py +100 -0
  91. package/app/backend/app/models/planner_lock.py +43 -0
  92. package/app/backend/app/models/rate_limit_entry.py +25 -0
  93. package/app/backend/app/models/repo.py +66 -0
  94. package/app/backend/app/models/review_comment.py +91 -0
  95. package/app/backend/app/models/review_summary.py +69 -0
  96. package/app/backend/app/models/revision.py +130 -0
  97. package/app/backend/app/models/ticket.py +223 -0
  98. package/app/backend/app/models/ticket_event.py +83 -0
  99. package/app/backend/app/models/user.py +47 -0
  100. package/app/backend/app/models/workspace.py +71 -0
  101. package/app/backend/app/redis_client.py +119 -0
  102. package/app/backend/app/routers/__init__.py +29 -0
  103. package/app/backend/app/routers/agents.py +296 -0
  104. package/app/backend/app/routers/auth.py +94 -0
  105. package/app/backend/app/routers/board.py +885 -0
  106. package/app/backend/app/routers/dashboard.py +351 -0
  107. package/app/backend/app/routers/debug.py +528 -0
  108. package/app/backend/app/routers/evidence.py +96 -0
  109. package/app/backend/app/routers/executors.py +324 -0
  110. package/app/backend/app/routers/goals.py +574 -0
  111. package/app/backend/app/routers/jobs.py +448 -0
  112. package/app/backend/app/routers/maintenance.py +172 -0
  113. package/app/backend/app/routers/merge.py +360 -0
  114. package/app/backend/app/routers/planner.py +537 -0
  115. package/app/backend/app/routers/pull_requests.py +382 -0
  116. package/app/backend/app/routers/repos.py +263 -0
  117. package/app/backend/app/routers/revisions.py +939 -0
  118. package/app/backend/app/routers/settings.py +267 -0
  119. package/app/backend/app/routers/tickets.py +2003 -0
  120. package/app/backend/app/routers/webhooks.py +143 -0
  121. package/app/backend/app/routers/websocket.py +249 -0
  122. package/app/backend/app/schemas/__init__.py +109 -0
  123. package/app/backend/app/schemas/board.py +87 -0
  124. package/app/backend/app/schemas/common.py +33 -0
  125. package/app/backend/app/schemas/evidence.py +87 -0
  126. package/app/backend/app/schemas/goal.py +90 -0
  127. package/app/backend/app/schemas/job.py +97 -0
  128. package/app/backend/app/schemas/merge.py +139 -0
  129. package/app/backend/app/schemas/planner.py +500 -0
  130. package/app/backend/app/schemas/repo.py +187 -0
  131. package/app/backend/app/schemas/review.py +137 -0
  132. package/app/backend/app/schemas/revision.py +114 -0
  133. package/app/backend/app/schemas/ticket.py +238 -0
  134. package/app/backend/app/schemas/ticket_event.py +72 -0
  135. package/app/backend/app/schemas/workspace.py +19 -0
  136. package/app/backend/app/services/__init__.py +31 -0
  137. package/app/backend/app/services/agent_memory_service.py +223 -0
  138. package/app/backend/app/services/agent_registry.py +346 -0
  139. package/app/backend/app/services/agent_session_manager.py +318 -0
  140. package/app/backend/app/services/agent_session_service.py +219 -0
  141. package/app/backend/app/services/agent_tools.py +379 -0
  142. package/app/backend/app/services/auth_service.py +98 -0
  143. package/app/backend/app/services/autonomy_service.py +380 -0
  144. package/app/backend/app/services/board_repo_service.py +201 -0
  145. package/app/backend/app/services/board_service.py +326 -0
  146. package/app/backend/app/services/cleanup_service.py +1085 -0
  147. package/app/backend/app/services/config_service.py +908 -0
  148. package/app/backend/app/services/context_gatherer.py +557 -0
  149. package/app/backend/app/services/cost_tracking_service.py +293 -0
  150. package/app/backend/app/services/cursor_log_normalizer.py +536 -0
  151. package/app/backend/app/services/delivery_pipeline.py +440 -0
  152. package/app/backend/app/services/executor_service.py +634 -0
  153. package/app/backend/app/services/git_host/__init__.py +11 -0
  154. package/app/backend/app/services/git_host/factory.py +87 -0
  155. package/app/backend/app/services/git_host/github.py +270 -0
  156. package/app/backend/app/services/git_host/gitlab.py +194 -0
  157. package/app/backend/app/services/git_host/protocol.py +75 -0
  158. package/app/backend/app/services/git_merge_simple.py +346 -0
  159. package/app/backend/app/services/git_ops.py +384 -0
  160. package/app/backend/app/services/github_service.py +233 -0
  161. package/app/backend/app/services/goal_service.py +113 -0
  162. package/app/backend/app/services/job_service.py +423 -0
  163. package/app/backend/app/services/job_watchdog_service.py +424 -0
  164. package/app/backend/app/services/langchain_adapter.py +122 -0
  165. package/app/backend/app/services/llm_provider_clients.py +351 -0
  166. package/app/backend/app/services/llm_service.py +285 -0
  167. package/app/backend/app/services/log_normalizer.py +342 -0
  168. package/app/backend/app/services/log_stream_service.py +276 -0
  169. package/app/backend/app/services/merge_checklist_service.py +264 -0
  170. package/app/backend/app/services/merge_service.py +784 -0
  171. package/app/backend/app/services/orchestrator_log.py +84 -0
  172. package/app/backend/app/services/planner_service.py +1662 -0
  173. package/app/backend/app/services/planner_tick_sync.py +1040 -0
  174. package/app/backend/app/services/queued_message_service.py +156 -0
  175. package/app/backend/app/services/reliability_wrapper.py +389 -0
  176. package/app/backend/app/services/repo_discovery_service.py +318 -0
  177. package/app/backend/app/services/review_service.py +334 -0
  178. package/app/backend/app/services/revision_service.py +389 -0
  179. package/app/backend/app/services/safe_autopilot.py +510 -0
  180. package/app/backend/app/services/sqlite_worker.py +372 -0
  181. package/app/backend/app/services/task_dispatch.py +135 -0
  182. package/app/backend/app/services/ticket_generation_service.py +1781 -0
  183. package/app/backend/app/services/ticket_service.py +486 -0
  184. package/app/backend/app/services/udar_planner_service.py +1007 -0
  185. package/app/backend/app/services/webhook_service.py +126 -0
  186. package/app/backend/app/services/workspace_service.py +465 -0
  187. package/app/backend/app/services/worktree_file_service.py +92 -0
  188. package/app/backend/app/services/worktree_validator.py +213 -0
  189. package/app/backend/app/sqlite_kv.py +278 -0
  190. package/app/backend/app/state_machine.py +128 -0
  191. package/app/backend/app/templates/__init__.py +5 -0
  192. package/app/backend/app/templates/registry.py +243 -0
  193. package/app/backend/app/utils/__init__.py +5 -0
  194. package/app/backend/app/utils/artifact_reader.py +87 -0
  195. package/app/backend/app/utils/circuit_breaker.py +229 -0
  196. package/app/backend/app/utils/db_retry.py +136 -0
  197. package/app/backend/app/utils/ignored_fields.py +123 -0
  198. package/app/backend/app/utils/validators.py +54 -0
  199. package/app/backend/app/websocket/__init__.py +5 -0
  200. package/app/backend/app/websocket/manager.py +179 -0
  201. package/app/backend/app/websocket/state_tracker.py +113 -0
  202. package/app/backend/app/worker.py +3190 -0
  203. package/app/backend/calculator_tickets.json +40 -0
  204. package/app/backend/canary_tests.sh +591 -0
  205. package/app/backend/celerybeat-schedule +0 -0
  206. package/app/backend/celerybeat-schedule-shm +0 -0
  207. package/app/backend/celerybeat-schedule-wal +0 -0
  208. package/app/backend/logs/.gitkeep +3 -0
  209. package/app/backend/multiplication_division_implementation_tickets.json +55 -0
  210. package/app/backend/multiplication_division_tickets.json +42 -0
  211. package/app/backend/pyproject.toml +45 -0
  212. package/app/backend/requirements-dev.txt +8 -0
  213. package/app/backend/requirements.txt +20 -0
  214. package/app/backend/run.sh +30 -0
  215. package/app/backend/run_with_logs.sh +10 -0
  216. package/app/backend/scientific_calculator_tickets.json +40 -0
  217. package/app/backend/scripts/extract_openapi.py +21 -0
  218. package/app/backend/scripts/seed_demo.py +187 -0
  219. package/app/backend/setup_demo_review.py +302 -0
  220. package/app/backend/test_actual_parse.py +41 -0
  221. package/app/backend/test_agent_streaming.py +61 -0
  222. package/app/backend/test_parse.py +51 -0
  223. package/app/backend/test_streaming.py +51 -0
  224. package/app/backend/test_subprocess_streaming.py +50 -0
  225. package/app/backend/tests/__init__.py +1 -0
  226. package/app/backend/tests/conftest.py +46 -0
  227. package/app/backend/tests/test_auth.py +341 -0
  228. package/app/backend/tests/test_autonomy_service.py +391 -0
  229. package/app/backend/tests/test_cleanup_service_safety.py +417 -0
  230. package/app/backend/tests/test_middleware.py +279 -0
  231. package/app/backend/tests/test_planner_providers.py +290 -0
  232. package/app/backend/tests/test_planner_unblock.py +183 -0
  233. package/app/backend/tests/test_revision_invariants.py +618 -0
  234. package/app/backend/tests/test_sqlite_kv.py +290 -0
  235. package/app/backend/tests/test_sqlite_worker.py +353 -0
  236. package/app/backend/tests/test_task_dispatch.py +100 -0
  237. package/app/backend/tests/test_ticket_validation.py +304 -0
  238. package/app/backend/tests/test_udar_agent.py +693 -0
  239. package/app/backend/tests/test_webhook_service.py +184 -0
  240. package/app/backend/tickets_output.json +59 -0
  241. package/app/backend/user_management_tickets.json +50 -0
  242. package/app/backend/uvicorn.log +0 -0
  243. package/app/draft.yaml +313 -0
  244. package/app/frontend/dist/assets/index-LcjCczu5.js +155 -0
  245. package/app/frontend/dist/assets/index-_FP_279e.css +1 -0
  246. package/app/frontend/dist/index.html +14 -0
  247. package/app/frontend/dist/vite.svg +1 -0
  248. package/app/frontend/package.json +101 -0
  249. package/bin/cli.js +527 -0
  250. package/package.json +37 -0
@@ -0,0 +1,424 @@
1
+ """Service for monitoring and recovering stuck jobs."""
2
+
3
+ import logging
4
+ import time
5
+ from dataclasses import dataclass
6
+ from datetime import UTC, datetime, timedelta
7
+
8
+ from sqlalchemy import or_
9
+ from sqlalchemy.exc import OperationalError
10
+ from sqlalchemy.orm import Session
11
+
12
+ from app.database_sync import get_sync_db
13
+ from app.models.job import Job, JobStatus
14
+ from app.models.ticket import Ticket
15
+ from app.state_machine import TicketState
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
+ def _ensure_utc(dt: datetime | None) -> datetime | None:
21
+ """Ensure a datetime is UTC-aware. Returns None if input is None.
22
+
23
+ SQLite stores datetimes without timezone info. This helper ensures
24
+ we can safely compare database datetimes with timezone-aware now().
25
+ """
26
+ if dt is None:
27
+ return None
28
+ if dt.tzinfo is None:
29
+ # Assume naive datetimes from DB are UTC
30
+ return dt.replace(tzinfo=UTC)
31
+ return dt
32
+
33
+
34
+ def _to_naive_utc(dt: datetime) -> datetime:
35
+ """Convert a datetime to naive UTC for SQLite comparisons.
36
+
37
+ SQLite stores datetimes without timezone info. When comparing Python
38
+ datetimes with SQLite columns, we need naive datetimes to avoid
39
+ 'can't subtract offset-naive and offset-aware datetimes' errors.
40
+ """
41
+ if dt.tzinfo is not None:
42
+ # Convert to UTC and strip timezone
43
+ return dt.replace(tzinfo=None)
44
+ return dt
45
+
46
+
47
# Watchdog thresholds (defaults)
HEARTBEAT_STALE_SECONDS = 2 * 60  # RUNNING job counts as stale after 2 minutes without a heartbeat
QUEUED_REENQUEUE_SECONDS = 30  # QUEUED job is re-dispatched once it is older than this
QUEUED_STALE_MINUTES = 15  # QUEUED job is failed once it is older than this (was 2; raised to tolerate busy workers)
DEFAULT_JOB_TIMEOUT_SECONDS = 15 * 60  # Used when a job has no timeout_seconds of its own

# Retry policy for SQLite "busy"/"locked" errors
MAX_RETRIES = 3
RETRY_DELAY_SECONDS = 0.5
56
+
57
+
58
@dataclass
class WatchdogResult:
    """Counters and human-readable notes collected during one watchdog pass."""

    # RUNNING jobs failed because their heartbeat went stale
    stale_jobs_recovered: int = 0
    # RUNNING jobs failed because they ran past their timeout
    timed_out_jobs_recovered: int = 0
    # QUEUED jobs failed after waiting too long with no running jobs
    stuck_queued_jobs_failed: int = 0
    # RUNNING jobs failed because started_at was never recorded
    missing_started_at_jobs: int = 0
    # QUEUED jobs whose task was dispatched again
    lost_tasks_reenqueued: int = 0
    # Tickets moved to BLOCKED as a consequence of a failed job
    tickets_blocked: int = 0
    # One message per action taken; None is only a placeholder default
    details: list[str] = None

    def __post_init__(self):
        # Dataclasses reject a list literal as a default, so the None
        # placeholder is swapped for a fresh list on every instance.
        self.details = [] if self.details is None else self.details
73
+
74
+
75
def run_job_watchdog() -> WatchdogResult:
    """Run one watchdog pass that recovers stuck jobs.

    The pass performs these checks, in order:

    1. RUNNING jobs whose heartbeat is older than HEARTBEAT_STALE_SECONDS.
       A job that has not sent any heartbeat yet is only treated as stale
       once its ``started_at`` is older than the same threshold, so a job
       that has just started is not killed before its first heartbeat.
    2. RUNNING jobs that have no ``started_at`` at all.
    3. RUNNING jobs that have been running longer than their
       ``timeout_seconds`` (DEFAULT_JOB_TIMEOUT_SECONDS when unset).
    4. QUEUED jobs older than QUEUED_REENQUEUE_SECONDS are dispatched again,
       but only when no job is RUNNING.
    5. QUEUED jobs older than QUEUED_STALE_MINUTES are failed, again only
       when no job is RUNNING.

    Jobs failed by checks 1-3 and 5 go through ``_fail_job_with_retry``,
    which marks the job FAILED and moves its ticket to BLOCKED.

    Returns:
        WatchdogResult with per-category counts and detail messages.
    """
    # Function-scope import: only the stale-heartbeat query needs it.
    from sqlalchemy import and_

    result = WatchdogResult()

    with get_sync_db() as db:
        # Aware "now" for Python-side arithmetic ...
        now = datetime.now(UTC)
        # ... and a naive twin for comparisons against SQLite columns.
        now_naive = _to_naive_utc(now)

        # 1. RUNNING jobs with a stale heartbeat
        heartbeat_threshold = now_naive - timedelta(seconds=HEARTBEAT_STALE_SECONDS)
        stale_heartbeat_jobs = (
            db.query(Job)
            .filter(
                Job.status == JobStatus.RUNNING.value,
                # Jobs without started_at are handled separately in step 2
                Job.started_at.isnot(None),
                or_(
                    Job.last_heartbeat_at < heartbeat_threshold,
                    # No heartbeat recorded yet: give the job the same grace
                    # period, measured from when it started.
                    and_(
                        Job.last_heartbeat_at.is_(None),
                        Job.started_at < heartbeat_threshold,
                    ),
                ),
            )
            .all()
        )

        for job in stale_heartbeat_jobs:
            _fail_job_with_retry(
                db=db,
                job=job,
                reason="Stale heartbeat - worker may have crashed",
                result=result,
                now=now,
            )
            result.stale_jobs_recovered += 1

        # 2. RUNNING jobs missing started_at (data corruption or a bug)
        missing_started_at_jobs = (
            db.query(Job)
            .filter(
                Job.status == JobStatus.RUNNING.value,
                Job.started_at.is_(None),
            )
            .all()
        )

        for job in missing_started_at_jobs:
            _fail_job_with_retry(
                db=db,
                job=job,
                reason="Running job missing started_at - possible data corruption",
                result=result,
                now=now,
            )
            result.missing_started_at_jobs += 1

        # 3. RUNNING jobs that exceeded their timeout. Jobs already failed
        # by steps 1-2 are no longer RUNNING and are not fetched again.
        running_jobs_with_start = (
            db.query(Job)
            .filter(
                Job.status == JobStatus.RUNNING.value,
                Job.started_at.isnot(None),
            )
            .all()
        )

        for job in running_jobs_with_start:
            timeout = job.timeout_seconds or DEFAULT_JOB_TIMEOUT_SECONDS
            # started_at is non-NULL per the query; _ensure_utc makes the
            # naive DB value comparable with the aware ``now``.
            elapsed = (now - _ensure_utc(job.started_at)).total_seconds()
            if elapsed > timeout:
                _fail_job_with_retry(
                    db=db,
                    job=job,
                    reason=f"Job timeout exceeded ({timeout}s, elapsed {int(elapsed)}s)",
                    result=result,
                    now=now,
                )
                result.timed_out_jobs_recovered += 1

        # 4. Re-dispatch QUEUED jobs that may have lost their task.
        # Only done while nothing is RUNNING: if a job is running the worker
        # is simply busy, and re-enqueueing would create duplicate queue
        # entries.
        running_jobs_count = (
            db.query(Job).filter(Job.status == JobStatus.RUNNING.value).count()
        )

        if running_jobs_count == 0:
            reenqueue_threshold = now_naive - timedelta(
                seconds=QUEUED_REENQUEUE_SECONDS
            )
            potentially_lost_jobs = (
                db.query(Job)
                .filter(
                    Job.status == JobStatus.QUEUED.value,
                    Job.created_at < reenqueue_threshold,
                )
                .all()
            )

            for job in potentially_lost_jobs:
                reenqueued = _reenqueue_lost_task(db, job, result)
                if reenqueued:
                    result.lost_tasks_reenqueued += 1
        else:
            logger.debug(
                f"Skipping re-enqueue: {running_jobs_count} jobs currently running"
            )

        # 5. Final fallback: fail QUEUED jobs that stayed in the queue too
        # long. The RUNNING count is read again so the decision reflects the
        # state after step 4; with running jobs the queue is merely waiting.
        running_jobs_count = (
            db.query(Job).filter(Job.status == JobStatus.RUNNING.value).count()
        )

        if running_jobs_count == 0:
            # Nothing is running yet jobs sit in the queue - something is wrong
            queued_threshold = now_naive - timedelta(minutes=QUEUED_STALE_MINUTES)
            stuck_queued_jobs = (
                db.query(Job)
                .filter(
                    Job.status == JobStatus.QUEUED.value,
                    Job.created_at < queued_threshold,
                )
                .all()
            )

            for job in stuck_queued_jobs:
                _fail_job_with_retry(
                    db=db,
                    job=job,
                    reason=f"Stuck in queue for over {QUEUED_STALE_MINUTES} minutes with no active jobs - worker may be down",
                    result=result,
                    now=now,
                )
                result.stuck_queued_jobs_failed += 1
        else:
            logger.debug(
                f"Skipping stuck queue check: {running_jobs_count} jobs currently running"
            )

    logger.info(
        f"Watchdog completed: {result.stale_jobs_recovered} stale, "
        f"{result.timed_out_jobs_recovered} timed out, "
        f"{result.missing_started_at_jobs} missing started_at, "
        f"{result.lost_tasks_reenqueued} re-enqueued, "
        f"{result.stuck_queued_jobs_failed} stuck queued, "
        f"{result.tickets_blocked} tickets blocked"
    )

    return result
248
+
249
+
250
def _reenqueue_lost_task(db: Session, job: Job, result: WatchdogResult) -> bool:
    """Dispatch a fresh task for a queued job, or fail the job if its ticket is finished.

    This function does not inspect the task broker: it does not verify
    whether the job's previous task still exists. The caller decides that a
    job looks lost; this function only dispatches a replacement task.

    Behaviour:
    - If the job's ticket is BLOCKED, DONE or ABANDONED, the job is marked
      FAILED (exit code -1) instead of being dispatched again.
    - Otherwise a task matching ``job.kind`` is enqueued and the new task id
      is stored on the job.

    Args:
        db: Database session.
        job: The queued job to re-dispatch.
        result: WatchdogResult that receives a detail message.

    Returns:
        True if a new task was enqueued and recorded, False otherwise
        (ticket already finished, unknown job kind, dispatcher returned no
        task, or an error occurred).
    """
    from app.models.job import JobKind
    from app.services.task_dispatch import enqueue_task

    # Re-dispatching is pointless when the ticket can no longer make progress
    ticket = db.query(Ticket).filter(Ticket.id == job.ticket_id).first()
    if ticket:
        terminal_states = [
            TicketState.BLOCKED.value,
            TicketState.DONE.value,
            TicketState.ABANDONED.value,
        ]
        if ticket.state in terminal_states:
            logger.info(
                f"Job {job.id} ticket is in {ticket.state} state - failing job instead of re-enqueueing"
            )
            job.status = JobStatus.FAILED.value
            job.finished_at = datetime.now(UTC)
            job.exit_code = -1
            db.commit()
            result.details.append(
                f"Job {job.id} ({job.kind}): Failed (ticket already {ticket.state})"
            )
            return False

    # Map the job kind to the task name understood by the dispatcher
    task_names = {
        JobKind.EXECUTE.value: "execute_ticket",
        JobKind.VERIFY.value: "verify_ticket",
        JobKind.RESUME.value: "resume_ticket",
    }
    task_name = task_names.get(job.kind)
    if not task_name:
        logger.warning(f"Unknown job kind for re-enqueue: {job.kind}")
        return False

    try:
        task = enqueue_task(task_name, args=[job.id])
        if not task:
            # Known kind, but the dispatcher produced nothing to track
            logger.warning(
                f"Re-enqueue of job {job.id} ({job.kind}) returned no task"
            )
            return False

        old_task_id = job.celery_task_id
        job.celery_task_id = task.id
        db.commit()
        logger.info(
            f"Re-enqueued lost Celery task for job {job.id} ({job.kind}): "
            f"old={old_task_id}, new={task.id}"
        )
        result.details.append(
            f"Job {job.id} ({job.kind}): Re-enqueued lost task (was {old_task_id})"
        )
        return True

    except Exception as e:
        # Best effort: log, record, and keep the watchdog pass going.
        logger.exception(f"Failed to re-enqueue job {job.id}: {e}")
        result.details.append(f"Job {job.id} ({job.kind}): Failed to re-enqueue: {e}")
        # A failed commit leaves the session in an aborted transaction;
        # roll back so the caller can keep using it for the next job.
        db.rollback()
        return False
322
+
323
+
324
def _fail_job_with_retry(
    db: Session,
    job: Job,
    reason: str,
    result: WatchdogResult,
    now: datetime,
) -> None:
    """Mark a job as failed, retrying when SQLite reports the database as busy.

    ``_fail_job`` is attempted up to MAX_RETRIES times. An OperationalError
    whose message contains "database is locked" or "SQLITE_BUSY" triggers a
    rollback and, unless it was the last attempt, a pause of
    RETRY_DELAY_SECONDS before the next try. When every attempt fails the
    error is logged and a "[RETRY FAILED]" entry is added to
    ``result.details``; no exception is raised in that case.

    Args:
        db: Database session
        job: The job to fail
        reason: Reason for failure
        result: WatchdogResult to update
        now: Current timestamp

    Raises:
        OperationalError: For any operational error that is not a
            busy/locked condition.
    """
    # Read the id up front: a rollback expires ORM attributes, and reading
    # them afterwards would issue another query against the busy database.
    job_id = job.id

    for attempt in range(MAX_RETRIES):
        try:
            _fail_job(db, job, reason, result, now)
            return
        except OperationalError as e:
            message = str(e)
            if "database is locked" not in message and "SQLITE_BUSY" not in message:
                raise

            # Roll back before anything else: it ends the aborted transaction
            # (so the session is usable again, including after the final
            # attempt) and avoids holding it open while sleeping.
            db.rollback()

            if attempt < MAX_RETRIES - 1:
                logger.warning(
                    f"SQLite busy, retrying job {job_id} fail (attempt {attempt + 1})"
                )
                time.sleep(RETRY_DELAY_SECONDS)
            else:
                logger.error(
                    f"Failed to fail job {job_id} after {MAX_RETRIES} attempts: {e}"
                )
                result.details.append(f"[RETRY FAILED] Job {job_id}: {reason}")
362
+
363
+
364
def _fail_job(
    db: Session,
    job: Job,
    reason: str,
    result: WatchdogResult,
    now: datetime,
) -> None:
    """Mark a job as failed and transition its ticket to BLOCKED.

    The job is set to FAILED with ``finished_at = now`` and exit code -1,
    and that change is committed. If the job's ticket exists and is not
    DONE or ABANDONED, the ticket is then moved to BLOCKED through
    ``transition_ticket_sync`` rather than by writing the state directly.

    The detail message for the job is recorded only after the commit
    succeeded, so an attempt that fails at the commit (and is retried by
    the caller) does not leave a duplicate entry in ``result.details``.

    Args:
        db: Database session
        job: The job to fail
        reason: Reason for failure
        result: WatchdogResult to update
        now: Current timestamp
    """
    # Snapshot identifiers before committing; they are needed afterwards for
    # messages and the transition payload.
    job_id = job.id
    job_kind = job.kind

    # Mark job as failed
    job.status = JobStatus.FAILED.value
    job.finished_at = now
    job.exit_code = -1

    # Look up the ticket and capture what is needed from it before the commit
    ticket = db.query(Ticket).filter(Ticket.id == job.ticket_id).first()
    ticket_id = ticket.id if ticket else None
    ticket_state = ticket.state if ticket else None

    # Commit the job status first, independently of the ticket transition
    db.commit()
    result.details.append(f"Job {job_id} ({job_kind}): {reason}")

    if not ticket:
        return

    # Only transition if not in terminal state
    if ticket_state in [TicketState.DONE.value, TicketState.ABANDONED.value]:
        return

    # Use the proper transition function to maintain invariants
    # (state machine rules and event creation stay consistent)
    from app.worker import transition_ticket_sync

    transition_ticket_sync(
        ticket_id=ticket_id,
        to_state=TicketState.BLOCKED,
        reason=f"Job {job_id} failed: {reason}",
        payload={
            "job_id": job_id,
            "job_kind": job_kind,
            "watchdog_reason": reason,
        },
        actor_id="job_watchdog",
        auto_verify=False,  # Don't auto-verify when blocking
    )

    result.tickets_blocked += 1
    result.details.append(f"Ticket {ticket_id} transitioned to BLOCKED")
@@ -0,0 +1,122 @@
1
+ """LangChain adapter to bridge existing LLMService with LangChain's BaseLLM interface.
2
+
3
+ This adapter allows UDAR agent to use Draft's existing LLM infrastructure
4
+ (LiteLLM with multi-provider support) without refactoring.
5
+ """
6
+
7
+ from typing import Any
8
+
9
+ from langchain_core.callbacks.manager import CallbackManagerForLLMRun
10
+ from langchain_core.language_models.llms import LLM
11
+
12
+ from app.services.llm_service import LLMService
13
+
14
+
15
class LangChainLLMAdapter(LLM):
    """Adapter to use existing LLMService with LangChain.

    This allows UDAR agent to leverage Draft's existing LLM infrastructure
    while using LangGraph's state machine framework.

    Callback notifications (``on_llm_end`` etc.) are left to the LangChain
    base class, which fires them around ``_call`` / ``_acall``; this adapter
    must not call them itself.

    Example:
        llm_service = LLMService()
        adapter = LangChainLLMAdapter(llm_service=llm_service)
        response = adapter.invoke("What tickets are needed?")
    """

    llm_service: LLMService
    # NOTE(review): ``model`` is reported in _identifying_params but is not
    # forwarded to LLMService by _call/_acall - confirm this is intended.
    model: str = "claude-opus-4-6"  # Default model, can be overridden
    max_tokens: int = 2000
    temperature: float = 0.0  # Deterministic by default

    class Config:
        """Pydantic configuration."""

        # LLMService is a plain class, not a pydantic model
        arbitrary_types_allowed = True

    @property
    def _llm_type(self) -> str:
        """Return identifier for this LLM type."""
        return "smart_kanban_llm"

    def _call(
        self,
        prompt: str,
        stop: list[str] | None = None,
        run_manager: CallbackManagerForLLMRun | None = None,
        **kwargs: Any,
    ) -> str:
        """Call the LLM with a prompt and return the response.

        Args:
            prompt: The prompt to send to the LLM
            stop: Optional list of stop sequences
            run_manager: Callback manager supplied by LangChain. Unused here:
                the base class reports run completion itself, and its
                ``on_llm_end`` expects an ``LLMResult``, not the LLMService
                response object.
            **kwargs: Per-call overrides; ``max_tokens`` and ``temperature``
                are honoured, falling back to the instance defaults.

        Returns:
            The LLM's response as a string
        """
        # Extract parameters with fallbacks
        max_tokens = kwargs.get("max_tokens", self.max_tokens)
        temperature = kwargs.get("temperature", self.temperature)

        # Call existing LLMService with the prompt as a single user message
        response = self.llm_service.call_completion(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=max_tokens,
            temperature=temperature,
            stop=stop,
        )

        return response.content

    async def _acall(
        self,
        prompt: str,
        stop: list[str] | None = None,
        run_manager: CallbackManagerForLLMRun | None = None,
        **kwargs: Any,
    ) -> str:
        """Async version of _call.

        Args:
            prompt: The prompt to send to the LLM
            stop: Optional list of stop sequences
            run_manager: Callback manager supplied by LangChain (the async
                variant at runtime). Unused here for the same reason as in
                ``_call``; additionally its ``on_llm_end`` is a coroutine,
                so a plain call would never run.
            **kwargs: Per-call overrides; ``max_tokens`` and ``temperature``
                are honoured, falling back to the instance defaults.

        Returns:
            The LLM's response as a string
        """
        # Extract parameters with fallbacks
        max_tokens = kwargs.get("max_tokens", self.max_tokens)
        temperature = kwargs.get("temperature", self.temperature)

        # Call existing LLMService (async)
        response = await self.llm_service.call_completion_async(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=max_tokens,
            temperature=temperature,
            stop=stop,
        )

        return response.content

    @property
    def _identifying_params(self) -> dict[str, Any]:
        """Return identifying parameters for this LLM."""
        return {
            "model": self.model,
            "max_tokens": self.max_tokens,
            "temperature": self.temperature,
        }