draft-board 0.1.0-beta.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (250) hide show
  1. package/app/backend/.env.example +9 -0
  2. package/app/backend/.smartkanban/evidence/8b383839-cbec-45af-86ee-c7708d075cbe/bddf2ed5-2e21-4d46-a62b-10b87f1642a6_patch.txt +195 -0
  3. package/app/backend/.smartkanban/evidence/8b383839-cbec-45af-86ee-c7708d075cbe/bddf2ed5-2e21-4d46-a62b-10b87f1642a6_stat.txt +6 -0
  4. package/app/backend/CURL_EXAMPLES.md +335 -0
  5. package/app/backend/ENV_SETUP.md +65 -0
  6. package/app/backend/alembic/env.py +71 -0
  7. package/app/backend/alembic/script.py.mako +28 -0
  8. package/app/backend/alembic/versions/001_initial_schema.py +104 -0
  9. package/app/backend/alembic/versions/002_add_jobs_table.py +52 -0
  10. package/app/backend/alembic/versions/003_add_workspace_table.py +48 -0
  11. package/app/backend/alembic/versions/004_add_evidence_table.py +56 -0
  12. package/app/backend/alembic/versions/005_add_verification_commands.py +32 -0
  13. package/app/backend/alembic/versions/006_add_planner_lock_table.py +39 -0
  14. package/app/backend/alembic/versions/007_add_revision_review_tables.py +126 -0
  15. package/app/backend/alembic/versions/008_add_revision_idempotency_and_traceability.py +52 -0
  16. package/app/backend/alembic/versions/009_add_job_health_fields.py +46 -0
  17. package/app/backend/alembic/versions/010_add_review_comment_line_content.py +36 -0
  18. package/app/backend/alembic/versions/011_add_analysis_cache.py +47 -0
  19. package/app/backend/alembic/versions/012_add_boards_table.py +102 -0
  20. package/app/backend/alembic/versions/013_add_ticket_blocking.py +45 -0
  21. package/app/backend/alembic/versions/014_add_agent_sessions.py +220 -0
  22. package/app/backend/alembic/versions/015_add_ticket_sort_order.py +33 -0
  23. package/app/backend/alembic/versions/03220f0b93ae_add_pr_fields_to_ticket.py +49 -0
  24. package/app/backend/alembic/versions/0c2d89fff3b1_seed_board_configs_from_yaml.py +206 -0
  25. package/app/backend/alembic/versions/3348e5cf54c1_add_merge_checklist_table.py +67 -0
  26. package/app/backend/alembic/versions/357c780ee445_add_goal_status.py +34 -0
  27. package/app/backend/alembic/versions/553340b7e26c_add_autonomy_fields_to_goal.py +65 -0
  28. package/app/backend/alembic/versions/774dc335c679_merge_migration_heads.py +23 -0
  29. package/app/backend/alembic/versions/7b307e847cbd_merge_heads.py +23 -0
  30. package/app/backend/alembic/versions/82ecd978cc70_add_missing_indexes.py +48 -0
  31. package/app/backend/alembic/versions/8ef5054dc280_add_normalized_log_entries.py +173 -0
  32. package/app/backend/alembic/versions/8f3e2bd8ea3b_merge_migration_heads.py +23 -0
  33. package/app/backend/alembic/versions/9d17f0698d3b_add_config_column_to_boards_table.py +30 -0
  34. package/app/backend/alembic/versions/add_agent_conversation_history.py +72 -0
  35. package/app/backend/alembic/versions/add_job_variant.py +34 -0
  36. package/app/backend/alembic/versions/add_performance_indexes.py +95 -0
  37. package/app/backend/alembic/versions/add_repos_and_board_repos.py +174 -0
  38. package/app/backend/alembic/versions/add_session_id_to_jobs.py +27 -0
  39. package/app/backend/alembic/versions/add_sqlite_backend_tables.py +104 -0
  40. package/app/backend/alembic/versions/b10fb0b62240_add_diff_content_to_revisions.py +34 -0
  41. package/app/backend/alembic.ini +89 -0
  42. package/app/backend/app/__init__.py +3 -0
  43. package/app/backend/app/data_dir.py +85 -0
  44. package/app/backend/app/database.py +70 -0
  45. package/app/backend/app/database_sync.py +64 -0
  46. package/app/backend/app/dependencies/__init__.py +5 -0
  47. package/app/backend/app/dependencies/auth.py +80 -0
  48. package/app/backend/app/dependencies.py +43 -0
  49. package/app/backend/app/exceptions.py +178 -0
  50. package/app/backend/app/executors/__init__.py +1 -0
  51. package/app/backend/app/executors/adapters/__init__.py +1 -0
  52. package/app/backend/app/executors/adapters/aider.py +152 -0
  53. package/app/backend/app/executors/adapters/amazon_q.py +103 -0
  54. package/app/backend/app/executors/adapters/amp.py +123 -0
  55. package/app/backend/app/executors/adapters/claude.py +177 -0
  56. package/app/backend/app/executors/adapters/cline.py +127 -0
  57. package/app/backend/app/executors/adapters/codex.py +167 -0
  58. package/app/backend/app/executors/adapters/copilot.py +202 -0
  59. package/app/backend/app/executors/adapters/cursor.py +87 -0
  60. package/app/backend/app/executors/adapters/droid.py +123 -0
  61. package/app/backend/app/executors/adapters/gemini.py +132 -0
  62. package/app/backend/app/executors/adapters/goose.py +131 -0
  63. package/app/backend/app/executors/adapters/opencode.py +123 -0
  64. package/app/backend/app/executors/adapters/qwen.py +123 -0
  65. package/app/backend/app/executors/plugins/__init__.py +1 -0
  66. package/app/backend/app/executors/registry.py +202 -0
  67. package/app/backend/app/executors/spec.py +226 -0
  68. package/app/backend/app/main.py +486 -0
  69. package/app/backend/app/middleware/__init__.py +13 -0
  70. package/app/backend/app/middleware/idempotency.py +426 -0
  71. package/app/backend/app/middleware/rate_limit.py +312 -0
  72. package/app/backend/app/middleware/security_headers.py +43 -0
  73. package/app/backend/app/middleware/timeout.py +37 -0
  74. package/app/backend/app/models/__init__.py +56 -0
  75. package/app/backend/app/models/agent_conversation_history.py +56 -0
  76. package/app/backend/app/models/agent_session.py +127 -0
  77. package/app/backend/app/models/analysis_cache.py +49 -0
  78. package/app/backend/app/models/base.py +9 -0
  79. package/app/backend/app/models/board.py +79 -0
  80. package/app/backend/app/models/board_repo.py +68 -0
  81. package/app/backend/app/models/cost_budget.py +42 -0
  82. package/app/backend/app/models/enums.py +40 -0
  83. package/app/backend/app/models/evidence.py +132 -0
  84. package/app/backend/app/models/goal.py +102 -0
  85. package/app/backend/app/models/idempotency_entry.py +30 -0
  86. package/app/backend/app/models/job.py +163 -0
  87. package/app/backend/app/models/job_queue.py +39 -0
  88. package/app/backend/app/models/kv_store.py +28 -0
  89. package/app/backend/app/models/merge_checklist.py +87 -0
  90. package/app/backend/app/models/normalized_log.py +100 -0
  91. package/app/backend/app/models/planner_lock.py +43 -0
  92. package/app/backend/app/models/rate_limit_entry.py +25 -0
  93. package/app/backend/app/models/repo.py +66 -0
  94. package/app/backend/app/models/review_comment.py +91 -0
  95. package/app/backend/app/models/review_summary.py +69 -0
  96. package/app/backend/app/models/revision.py +130 -0
  97. package/app/backend/app/models/ticket.py +223 -0
  98. package/app/backend/app/models/ticket_event.py +83 -0
  99. package/app/backend/app/models/user.py +47 -0
  100. package/app/backend/app/models/workspace.py +71 -0
  101. package/app/backend/app/redis_client.py +119 -0
  102. package/app/backend/app/routers/__init__.py +29 -0
  103. package/app/backend/app/routers/agents.py +296 -0
  104. package/app/backend/app/routers/auth.py +94 -0
  105. package/app/backend/app/routers/board.py +885 -0
  106. package/app/backend/app/routers/dashboard.py +351 -0
  107. package/app/backend/app/routers/debug.py +528 -0
  108. package/app/backend/app/routers/evidence.py +96 -0
  109. package/app/backend/app/routers/executors.py +324 -0
  110. package/app/backend/app/routers/goals.py +574 -0
  111. package/app/backend/app/routers/jobs.py +448 -0
  112. package/app/backend/app/routers/maintenance.py +172 -0
  113. package/app/backend/app/routers/merge.py +360 -0
  114. package/app/backend/app/routers/planner.py +537 -0
  115. package/app/backend/app/routers/pull_requests.py +382 -0
  116. package/app/backend/app/routers/repos.py +263 -0
  117. package/app/backend/app/routers/revisions.py +939 -0
  118. package/app/backend/app/routers/settings.py +267 -0
  119. package/app/backend/app/routers/tickets.py +2003 -0
  120. package/app/backend/app/routers/webhooks.py +143 -0
  121. package/app/backend/app/routers/websocket.py +249 -0
  122. package/app/backend/app/schemas/__init__.py +109 -0
  123. package/app/backend/app/schemas/board.py +87 -0
  124. package/app/backend/app/schemas/common.py +33 -0
  125. package/app/backend/app/schemas/evidence.py +87 -0
  126. package/app/backend/app/schemas/goal.py +90 -0
  127. package/app/backend/app/schemas/job.py +97 -0
  128. package/app/backend/app/schemas/merge.py +139 -0
  129. package/app/backend/app/schemas/planner.py +500 -0
  130. package/app/backend/app/schemas/repo.py +187 -0
  131. package/app/backend/app/schemas/review.py +137 -0
  132. package/app/backend/app/schemas/revision.py +114 -0
  133. package/app/backend/app/schemas/ticket.py +238 -0
  134. package/app/backend/app/schemas/ticket_event.py +72 -0
  135. package/app/backend/app/schemas/workspace.py +19 -0
  136. package/app/backend/app/services/__init__.py +31 -0
  137. package/app/backend/app/services/agent_memory_service.py +223 -0
  138. package/app/backend/app/services/agent_registry.py +346 -0
  139. package/app/backend/app/services/agent_session_manager.py +318 -0
  140. package/app/backend/app/services/agent_session_service.py +219 -0
  141. package/app/backend/app/services/agent_tools.py +379 -0
  142. package/app/backend/app/services/auth_service.py +98 -0
  143. package/app/backend/app/services/autonomy_service.py +380 -0
  144. package/app/backend/app/services/board_repo_service.py +201 -0
  145. package/app/backend/app/services/board_service.py +326 -0
  146. package/app/backend/app/services/cleanup_service.py +1085 -0
  147. package/app/backend/app/services/config_service.py +908 -0
  148. package/app/backend/app/services/context_gatherer.py +557 -0
  149. package/app/backend/app/services/cost_tracking_service.py +293 -0
  150. package/app/backend/app/services/cursor_log_normalizer.py +536 -0
  151. package/app/backend/app/services/delivery_pipeline.py +440 -0
  152. package/app/backend/app/services/executor_service.py +634 -0
  153. package/app/backend/app/services/git_host/__init__.py +11 -0
  154. package/app/backend/app/services/git_host/factory.py +87 -0
  155. package/app/backend/app/services/git_host/github.py +270 -0
  156. package/app/backend/app/services/git_host/gitlab.py +194 -0
  157. package/app/backend/app/services/git_host/protocol.py +75 -0
  158. package/app/backend/app/services/git_merge_simple.py +346 -0
  159. package/app/backend/app/services/git_ops.py +384 -0
  160. package/app/backend/app/services/github_service.py +233 -0
  161. package/app/backend/app/services/goal_service.py +113 -0
  162. package/app/backend/app/services/job_service.py +423 -0
  163. package/app/backend/app/services/job_watchdog_service.py +424 -0
  164. package/app/backend/app/services/langchain_adapter.py +122 -0
  165. package/app/backend/app/services/llm_provider_clients.py +351 -0
  166. package/app/backend/app/services/llm_service.py +285 -0
  167. package/app/backend/app/services/log_normalizer.py +342 -0
  168. package/app/backend/app/services/log_stream_service.py +276 -0
  169. package/app/backend/app/services/merge_checklist_service.py +264 -0
  170. package/app/backend/app/services/merge_service.py +784 -0
  171. package/app/backend/app/services/orchestrator_log.py +84 -0
  172. package/app/backend/app/services/planner_service.py +1662 -0
  173. package/app/backend/app/services/planner_tick_sync.py +1040 -0
  174. package/app/backend/app/services/queued_message_service.py +156 -0
  175. package/app/backend/app/services/reliability_wrapper.py +389 -0
  176. package/app/backend/app/services/repo_discovery_service.py +318 -0
  177. package/app/backend/app/services/review_service.py +334 -0
  178. package/app/backend/app/services/revision_service.py +389 -0
  179. package/app/backend/app/services/safe_autopilot.py +510 -0
  180. package/app/backend/app/services/sqlite_worker.py +372 -0
  181. package/app/backend/app/services/task_dispatch.py +135 -0
  182. package/app/backend/app/services/ticket_generation_service.py +1781 -0
  183. package/app/backend/app/services/ticket_service.py +486 -0
  184. package/app/backend/app/services/udar_planner_service.py +1007 -0
  185. package/app/backend/app/services/webhook_service.py +126 -0
  186. package/app/backend/app/services/workspace_service.py +465 -0
  187. package/app/backend/app/services/worktree_file_service.py +92 -0
  188. package/app/backend/app/services/worktree_validator.py +213 -0
  189. package/app/backend/app/sqlite_kv.py +278 -0
  190. package/app/backend/app/state_machine.py +128 -0
  191. package/app/backend/app/templates/__init__.py +5 -0
  192. package/app/backend/app/templates/registry.py +243 -0
  193. package/app/backend/app/utils/__init__.py +5 -0
  194. package/app/backend/app/utils/artifact_reader.py +87 -0
  195. package/app/backend/app/utils/circuit_breaker.py +229 -0
  196. package/app/backend/app/utils/db_retry.py +136 -0
  197. package/app/backend/app/utils/ignored_fields.py +123 -0
  198. package/app/backend/app/utils/validators.py +54 -0
  199. package/app/backend/app/websocket/__init__.py +5 -0
  200. package/app/backend/app/websocket/manager.py +179 -0
  201. package/app/backend/app/websocket/state_tracker.py +113 -0
  202. package/app/backend/app/worker.py +3190 -0
  203. package/app/backend/calculator_tickets.json +40 -0
  204. package/app/backend/canary_tests.sh +591 -0
  205. package/app/backend/celerybeat-schedule +0 -0
  206. package/app/backend/celerybeat-schedule-shm +0 -0
  207. package/app/backend/celerybeat-schedule-wal +0 -0
  208. package/app/backend/logs/.gitkeep +3 -0
  209. package/app/backend/multiplication_division_implementation_tickets.json +55 -0
  210. package/app/backend/multiplication_division_tickets.json +42 -0
  211. package/app/backend/pyproject.toml +45 -0
  212. package/app/backend/requirements-dev.txt +8 -0
  213. package/app/backend/requirements.txt +20 -0
  214. package/app/backend/run.sh +30 -0
  215. package/app/backend/run_with_logs.sh +10 -0
  216. package/app/backend/scientific_calculator_tickets.json +40 -0
  217. package/app/backend/scripts/extract_openapi.py +21 -0
  218. package/app/backend/scripts/seed_demo.py +187 -0
  219. package/app/backend/setup_demo_review.py +302 -0
  220. package/app/backend/test_actual_parse.py +41 -0
  221. package/app/backend/test_agent_streaming.py +61 -0
  222. package/app/backend/test_parse.py +51 -0
  223. package/app/backend/test_streaming.py +51 -0
  224. package/app/backend/test_subprocess_streaming.py +50 -0
  225. package/app/backend/tests/__init__.py +1 -0
  226. package/app/backend/tests/conftest.py +46 -0
  227. package/app/backend/tests/test_auth.py +341 -0
  228. package/app/backend/tests/test_autonomy_service.py +391 -0
  229. package/app/backend/tests/test_cleanup_service_safety.py +417 -0
  230. package/app/backend/tests/test_middleware.py +279 -0
  231. package/app/backend/tests/test_planner_providers.py +290 -0
  232. package/app/backend/tests/test_planner_unblock.py +183 -0
  233. package/app/backend/tests/test_revision_invariants.py +618 -0
  234. package/app/backend/tests/test_sqlite_kv.py +290 -0
  235. package/app/backend/tests/test_sqlite_worker.py +353 -0
  236. package/app/backend/tests/test_task_dispatch.py +100 -0
  237. package/app/backend/tests/test_ticket_validation.py +304 -0
  238. package/app/backend/tests/test_udar_agent.py +693 -0
  239. package/app/backend/tests/test_webhook_service.py +184 -0
  240. package/app/backend/tickets_output.json +59 -0
  241. package/app/backend/user_management_tickets.json +50 -0
  242. package/app/backend/uvicorn.log +0 -0
  243. package/app/draft.yaml +313 -0
  244. package/app/frontend/dist/assets/index-LcjCczu5.js +155 -0
  245. package/app/frontend/dist/assets/index-_FP_279e.css +1 -0
  246. package/app/frontend/dist/index.html +14 -0
  247. package/app/frontend/dist/vite.svg +1 -0
  248. package/app/frontend/package.json +101 -0
  249. package/bin/cli.js +527 -0
  250. package/package.json +37 -0
@@ -0,0 +1,156 @@
1
+ """Queued message service for chaining prompts during execution.
2
+
3
+ Allows users to queue the next prompt while an execution is in progress.
4
+ When the current execution completes, the queued message is automatically
5
+ executed.
6
+
7
+ Uses SQLite kv_store for persistence.
8
+ """
9
+
10
+ import json
11
+ import logging
12
+ from dataclasses import dataclass
13
+ from datetime import UTC, datetime
14
+
15
# Logger named after this module.
logger = logging.getLogger(__name__)

# Key prefixes used when building key/value-store keys; the ticket id is
# appended directly after the prefix.
QUEUE_KEY_PREFIX = "queued_message:"
FOLLOWUP_KEY_PREFIX = "followup_prompt:"

# TTL (seconds) passed along when a queued message is stored: 24 hours.
QUEUE_TTL = 24 * 60 * 60
20
+
21
+
22
@dataclass
class QueuedMessage:
    """Follow-up message held for a ticket until it is consumed.

    Instances round-trip through plain dicts with ``to_dict`` and
    ``from_dict``; in dict form the timestamp is an ISO-8601 string.
    """

    ticket_id: str
    message: str
    queued_at: datetime

    def to_dict(self) -> dict:
        """Return a dict form with ``queued_at`` rendered via ``isoformat()``."""
        timestamp_text = self.queued_at.isoformat()
        return dict(
            ticket_id=self.ticket_id,
            message=self.message,
            queued_at=timestamp_text,
        )

    @classmethod
    def from_dict(cls, data: dict) -> "QueuedMessage":
        """Rebuild an instance from a mapping shaped like ``to_dict`` output.

        Raises:
            KeyError: if ``ticket_id``, ``message`` or ``queued_at`` is missing.
            ValueError: if ``queued_at`` is not a valid ISO-8601 string.
        """
        # Keys are read in declaration order so the first missing key is the
        # one reported.
        ticket_id = data["ticket_id"]
        message = data["message"]
        queued_at = datetime.fromisoformat(data["queued_at"])
        return cls(ticket_id, message, queued_at)
46
+
47
+
48
class QueuedMessageService:
    """Service for queuing follow-up messages during execution.

    One queued message per ticket, stored in the key/value store under
    ``QUEUE_KEY_PREFIX + ticket_id``; queuing again for the same ticket
    writes to the same key. A separate per-ticket slot under
    ``FOLLOWUP_KEY_PREFIX`` holds a raw prompt string for the worker.

    The ``app.sqlite_kv`` helpers are imported inside each method, as in the
    rest of this module.
    """

    # TTL (seconds) passed to kv_set for follow-up prompts: 1 hour.
    _FOLLOWUP_TTL_SECONDS = 3600

    def _get_key(self, ticket_id: str) -> str:
        """Get key for a ticket's queued message."""
        return f"{QUEUE_KEY_PREFIX}{ticket_id}"

    def _decode(self, ticket_id: str, data) -> QueuedMessage | None:
        """Parse a stored payload, returning None (and logging) if it is malformed.

        Besides invalid JSON and missing keys, this also covers a payload
        whose ``queued_at`` is not ISO-8601 (``ValueError``) and JSON that is
        not an object with string fields (``TypeError``). Previously those two
        cases propagated to the caller instead of being treated as "no
        message".
        """
        try:
            return QueuedMessage.from_dict(json.loads(data))
        # json.JSONDecodeError is a ValueError subclass, so it is still caught.
        except (ValueError, KeyError, TypeError) as e:
            logger.warning(f"Invalid queued message for {ticket_id}: {e}")
            return None

    def queue_message(self, ticket_id: str, message: str) -> QueuedMessage:
        """Queue a follow-up message for a ticket.

        Replaces any existing queued message for this ticket. The entry is
        written with ``ttl_seconds=QUEUE_TTL``.

        Returns:
            The ``QueuedMessage`` that was stored, stamped with the current
            UTC time.
        """
        from app.sqlite_kv import kv_set

        queued = QueuedMessage(
            ticket_id=ticket_id,
            message=message,
            queued_at=datetime.now(UTC),
        )

        kv_set(
            self._get_key(ticket_id),
            json.dumps(queued.to_dict()),
            ttl_seconds=QUEUE_TTL,
        )

        # Only the first 50 characters are logged to keep log lines short.
        logger.info(f"Queued message for ticket {ticket_id}: {message[:50]}...")
        return queued

    def get_queued(self, ticket_id: str) -> QueuedMessage | None:
        """Get the queued message for a ticket without removing it.

        Returns:
            The stored message, or None if there is none or the stored
            payload cannot be decoded.
        """
        from app.sqlite_kv import kv_get

        data = kv_get(self._get_key(ticket_id))
        if data is None:
            return None
        return self._decode(ticket_id, data)

    def take_queued(self, ticket_id: str) -> QueuedMessage | None:
        """Take (remove and return) the queued message for a ticket.

        Used by the planner to consume queued messages after execution.

        Returns:
            The stored message, or None if there is none or the payload
            cannot be decoded. NOTE(review): the entry is fetched with
            ``kv_take``, so an undecodable payload is presumably already
            removed from the store when None is returned - confirm against
            ``app.sqlite_kv``.
        """
        from app.sqlite_kv import kv_take

        data = kv_take(self._get_key(ticket_id))
        if data is None:
            return None

        msg = self._decode(ticket_id, data)
        if msg is not None:
            logger.info(f"Consumed queued message for ticket {ticket_id}")
        return msg

    def cancel_queued(self, ticket_id: str) -> bool:
        """Cancel/remove a queued message for a ticket.

        Returns:
            Whatever ``kv_delete`` reports for the key (truthy when something
            was deleted).
        """
        from app.sqlite_kv import kv_delete

        deleted = kv_delete(self._get_key(ticket_id))
        if deleted:
            logger.info(f"Cancelled queued message for ticket {ticket_id}")
        return deleted

    def has_queued(self, ticket_id: str) -> bool:
        """Check if a ticket has a queued message."""
        from app.sqlite_kv import kv_exists

        return kv_exists(self._get_key(ticket_id))

    # ========== Follow-up prompt storage (for worker) ==========

    def set_followup_prompt(self, ticket_id: str, prompt: str) -> None:
        """Set a follow-up prompt for the worker to pick up.

        The prompt is stored as-is (not JSON) with a one-hour TTL.
        """
        from app.sqlite_kv import kv_set

        key = f"{FOLLOWUP_KEY_PREFIX}{ticket_id}"
        kv_set(key, prompt, ttl_seconds=self._FOLLOWUP_TTL_SECONDS)
        logger.info(f"Set follow-up prompt for ticket {ticket_id}")

    def get_followup_prompt(self, ticket_id: str) -> str | None:
        """Get and clear the follow-up prompt for a ticket.

        Returns:
            The prompt as ``str`` (bytes values are decoded), or None when
            nothing is stored or the stored value is empty.
        """
        from app.sqlite_kv import kv_take

        key = f"{FOLLOWUP_KEY_PREFIX}{ticket_id}"
        prompt = kv_take(key)

        if not prompt:
            return None

        logger.info(f"Retrieved follow-up prompt for ticket {ticket_id}")
        return prompt.decode() if isinstance(prompt, bytes) else prompt


# Global singleton
queued_message_service = QueuedMessageService()
@@ -0,0 +1,389 @@
1
+ """Reliability wrapper for autonomous execution with retry, checkpointing, and recovery."""
2
+
3
+ import asyncio
4
+ import time
5
+ from collections.abc import Callable
6
+ from dataclasses import dataclass
7
+ from datetime import datetime
8
+ from enum import StrEnum
9
+ from typing import Any
10
+
11
+ from sqlalchemy.ext.asyncio import AsyncSession
12
+
13
+ from app.exceptions import ExecutorError, ExecutorTimeoutError
14
+
15
+
16
+ class CheckpointType(StrEnum):
17
+ """Types of execution checkpoints."""
18
+
19
+ START = "start"
20
+ PROGRESS = "progress"
21
+ VALIDATION = "validation"
22
+ COMPLETION = "completion"
23
+ FAILURE = "failure"
24
+
25
+
26
@dataclass
class ExecutionCheckpoint:
    """Snapshot describing where an execution stood at one moment in time."""

    # Identifier for this particular checkpoint record.
    checkpoint_id: str
    # Ticket the execution belongs to.
    ticket_id: str
    # Job the execution belongs to, when there is one.
    job_id: str | None
    # Which kind of checkpoint this is.
    checkpoint_type: CheckpointType
    # When the checkpoint was taken.
    timestamp: datetime
    # Retry attempt number associated with the checkpoint.
    retry_count: int
    # Free-form details captured alongside the checkpoint.
    state_snapshot: dict[str, Any]
    # Error text, for checkpoints that record a failure or a retry.
    error_message: str | None = None
38
+
39
+
40
@dataclass
class RetryConfig:
    """Configuration for retry behavior.

    The delay before retry ``n`` (0-based) is
    ``initial_delay_seconds * exponential_base ** n``, capped at
    ``max_delay_seconds`` and optionally randomized by +/-20%.
    """

    max_retries: int = 3
    initial_delay_seconds: float = 2.0
    max_delay_seconds: float = 60.0
    exponential_base: float = 2.0
    jitter: bool = True

    def get_delay(self, retry_attempt: int) -> float:
        """Calculate delay for given retry attempt with exponential backoff.

        Args:
            retry_attempt: 0-based index of the retry about to be scheduled.

        Returns:
            Seconds to wait, never negative. Without jitter the value never
            exceeds ``max_delay_seconds``; with jitter it may exceed it by up
            to 20%.
        """
        try:
            backoff = self.initial_delay_seconds * (
                self.exponential_base**retry_attempt
            )
        except OverflowError:
            # Very large attempt numbers overflow float exponentiation
            # (e.g. 2.0 ** 5000); the backoff has long since hit the cap.
            backoff = self.max_delay_seconds

        delay = min(backoff, self.max_delay_seconds)

        if self.jitter:
            # Add random jitter of +/-20% to prevent thundering herd
            import random

            jitter_amount = delay * 0.2
            delay += random.uniform(-jitter_amount, jitter_amount)

        return max(0.0, delay)
65
+
66
+
67
class ReliabilityWrapper:
    """
    Wraps execution with reliability features:
    - Automatic retry with exponential backoff (delays come from RetryConfig)
    - A checkpoint recorded at start, before each retry, periodically while
      an async callable runs, and on completion or failure
    - Classification of errors into retryable and non-retryable

    Checkpoints are held in this instance's memory only, and only the most
    recent checkpoint per checkpoint key is kept.
    """

    # Lower-cased substrings that mark an ExecutorError message as transient.
    _RETRYABLE_MESSAGE_PATTERNS = (
        "rate limit",
        "timeout",
        "temporary",
        "unavailable",
        "too many requests",
        "service unavailable",
        "connection",
    )

    def __init__(
        self,
        db: AsyncSession,
        retry_config: RetryConfig | None = None,
        checkpoint_interval_seconds: int = 300,  # 5 minutes
    ):
        """Store configuration and create empty checkpoint bookkeeping.

        Args:
            db: Session kept on ``self.db``; not otherwise used by the
                methods of this class as written.
            retry_config: Retry policy; a default ``RetryConfig()`` is used
                when omitted.
            checkpoint_interval_seconds: Minimum time between periodic
                progress checkpoints while an async callable is running.
        """
        self.db = db
        self.retry_config = retry_config or RetryConfig()
        self.checkpoint_interval_seconds = checkpoint_interval_seconds
        # Latest checkpoint per checkpoint key.
        self._checkpoints: dict[str, ExecutionCheckpoint] = {}
        # time.time() of the most recent periodic checkpoint per key.
        self._last_checkpoint_time: dict[str, float] = {}

    async def execute_with_reliability(
        self,
        func: Callable,
        *args,
        ticket_id: str,
        job_id: str | None = None,
        validation_func: Callable[[Any], bool] | None = None,
        checkpoint_key: str | None = None,
        **kwargs,
    ) -> Any:
        """
        Execute a function with automatic retry, checkpointing, and recovery.

        Args:
            func: The function to execute (sync or ``async def``)
            *args: Positional arguments for func
            ticket_id: ID of the ticket being executed
            job_id: Optional job ID for tracking
            validation_func: Optional function to validate result before
                accepting. A falsy verdict raises ``ValueError``, which is
                classified as non-retryable, so a validation failure ends the
                run immediately.
            checkpoint_key: Optional key for checkpoint storage; defaults to
                ``"<ticket_id>:<job_id or 'default'>"``
            **kwargs: Keyword arguments for func

        Returns:
            The result of the function execution

        Raises:
            asyncio.CancelledError: re-raised without retrying.
            Exception: the error from the failing attempt, when it is
                non-retryable or when all retries are exhausted.
        """
        checkpoint_key = checkpoint_key or f"{ticket_id}:{job_id or 'default'}"

        async def record(
            checkpoint_type: CheckpointType,
            retry_count: int,
            state_snapshot: dict[str, Any],
            error_message: str | None = None,
        ) -> None:
            # Every checkpoint in this call shares the same key/ticket/job.
            await self._create_checkpoint(
                checkpoint_key=checkpoint_key,
                ticket_id=ticket_id,
                job_id=job_id,
                checkpoint_type=checkpoint_type,
                retry_count=retry_count,
                state_snapshot=state_snapshot,
                error_message=error_message,
            )

        # Create initial checkpoint
        await record(
            CheckpointType.START,
            0,
            {"args": str(args), "kwargs": str(kwargs)},
        )

        # A negative max_retries would otherwise skip execution entirely and
        # return None; treat it as "no retries" so func always runs once.
        max_retries = max(self.retry_config.max_retries, 0)

        for attempt in range(max_retries + 1):
            try:
                # The bookkeeping arguments are passed positionally so that
                # *args lands in _execute_with_monitoring's own *args. Mixing
                # func=... keywords with *args made the first positional
                # argument collide with ``func`` and raise TypeError.
                result = await self._execute_with_monitoring(
                    func,
                    checkpoint_key,
                    ticket_id,
                    job_id,
                    attempt,
                    *args,
                    **kwargs,
                )

                # Validate result if validation function provided
                if validation_func and not await self._validate_result(
                    result, validation_func
                ):
                    raise ValueError("Result validation failed")

                # Success - create completion checkpoint
                await record(CheckpointType.COMPLETION, attempt, {"success": True})
                return result

            except asyncio.CancelledError:
                # Don't retry on cancellation
                await record(
                    CheckpointType.FAILURE,
                    attempt,
                    {"cancelled": True},
                    "Execution cancelled",
                )
                raise

            except Exception as e:
                # Check if error is retryable
                if not self._is_retryable_error(e):
                    await record(
                        CheckpointType.FAILURE,
                        attempt,
                        {"non_retryable": True},
                        str(e),
                    )
                    raise

                # Last attempt failed
                if attempt >= max_retries:
                    await record(
                        CheckpointType.FAILURE,
                        attempt,
                        {"exhausted_retries": True},
                        str(e),
                    )
                    raise

                # Calculate delay and retry
                delay = self.retry_config.get_delay(attempt)

                await record(
                    CheckpointType.PROGRESS,
                    attempt,
                    {
                        "retry_in_seconds": delay,
                        "error": str(e),
                        "attempt": attempt + 1,
                    },
                    str(e),
                )

                await asyncio.sleep(delay)

        # Not reached: the final attempt either returns or re-raises above.

    async def _execute_with_monitoring(
        self,
        func: Callable,
        checkpoint_key: str,
        ticket_id: str,
        job_id: str | None,
        retry_count: int,
        *args,
        **kwargs,
    ) -> Any:
        """Execute function with progress monitoring and periodic checkpointing.

        A coroutine function is run as a task; while it is pending, a
        PROGRESS checkpoint is recorded each time
        ``checkpoint_interval_seconds`` has elapsed since the previous one.
        A plain function is called directly, with no monitoring.

        Returns:
            Whatever ``func(*args, **kwargs)`` returns.

        Raises:
            Whatever ``func`` raises. If this coroutine is cancelled while
            waiting, the child task is cancelled as well before the
            cancellation propagates.
        """
        start_time = time.time()
        self._last_checkpoint_time[checkpoint_key] = start_time

        if not asyncio.iscoroutinefunction(func):
            # Sync function - execute directly
            return func(*args, **kwargs)

        # Create a task so we can monitor it
        task = asyncio.create_task(func(*args, **kwargs))

        try:
            while not task.done():
                # Wake up as soon as the task finishes, or after one second
                # to re-check the checkpoint interval. (A fixed sleep here
                # delayed every result by up to a second.)
                await asyncio.wait({task}, timeout=1)
                if task.done():
                    break

                elapsed = time.time() - self._last_checkpoint_time[checkpoint_key]
                if elapsed >= self.checkpoint_interval_seconds:
                    await self._create_checkpoint(
                        checkpoint_key=checkpoint_key,
                        ticket_id=ticket_id,
                        job_id=job_id,
                        checkpoint_type=CheckpointType.PROGRESS,
                        retry_count=retry_count,
                        state_snapshot={
                            "elapsed_seconds": time.time() - start_time,
                            "still_running": True,
                        },
                    )
                    self._last_checkpoint_time[checkpoint_key] = time.time()
        except asyncio.CancelledError:
            # Don't leave the child running unattended once we are cancelled.
            task.cancel()
            raise

        return await task

    async def _validate_result(self, result: Any, validation_func: Callable) -> bool:
        """Validate execution result.

        Calls ``validation_func(result)``, awaiting it when it is a coroutine
        function. Any exception raised by the validator is treated as a
        failed validation (returns False) rather than propagated.
        """
        try:
            if asyncio.iscoroutinefunction(validation_func):
                return await validation_func(result)
            return validation_func(result)
        except Exception:
            # Deliberate: a crashing validator counts as "not valid".
            return False

    def _is_retryable_error(self, error: Exception) -> bool:
        """Determine if an error is retryable.

        Retryable: connection/timeout errors, ``ExecutorTimeoutError``, and
        any ``ExecutorError`` whose message contains one of the transient
        patterns (case-insensitive). Everything else - including
        ValueError/TypeError/KeyError/AttributeError and unknown error types -
        is not retried.
        """
        # Network/connection errors and executor timeouts - retryable
        if isinstance(
            error,
            (ConnectionError, TimeoutError, asyncio.TimeoutError, ExecutorTimeoutError),
        ):
            return True

        # Some executor errors are retryable (transient failures)
        if isinstance(error, ExecutorError):
            error_msg = str(error).lower()
            return any(
                pattern in error_msg for pattern in self._RETRYABLE_MESSAGE_PATTERNS
            )

        # Validation errors, logic errors, unknown errors - not retryable
        return False

    async def _create_checkpoint(
        self,
        checkpoint_key: str,
        ticket_id: str,
        job_id: str | None,
        checkpoint_type: CheckpointType,
        retry_count: int,
        state_snapshot: dict[str, Any],
        error_message: str | None = None,
    ):
        """Create an execution checkpoint.

        The checkpoint replaces any earlier one stored under the same
        ``checkpoint_key``. Its id is
        ``"<checkpoint_key>:<type>:<unix seconds>"``, so two checkpoints of
        the same type within one second share an id.
        """
        checkpoint = ExecutionCheckpoint(
            checkpoint_id=f"{checkpoint_key}:{checkpoint_type.value}:{int(time.time())}",
            ticket_id=ticket_id,
            job_id=job_id,
            checkpoint_type=checkpoint_type,
            # NOTE(review): utcnow() yields a naive datetime and is deprecated
            # since Python 3.12; switching to an aware timestamp would change
            # the stored value, so confirm consumers before migrating.
            timestamp=datetime.utcnow(),
            retry_count=retry_count,
            state_snapshot=state_snapshot,
            error_message=error_message,
        )

        self._checkpoints[checkpoint_key] = checkpoint

        # TODO: Persist checkpoint to database for true resumability
        # For now, keeping in memory is sufficient for single-session reliability

    async def get_last_checkpoint(
        self, checkpoint_key: str
    ) -> ExecutionCheckpoint | None:
        """Get the last checkpoint for a given key, or None if there is none."""
        return self._checkpoints.get(checkpoint_key)

    async def list_checkpoints(self, ticket_id: str) -> list[ExecutionCheckpoint]:
        """List the stored checkpoints (latest per key) for a ticket."""
        return [cp for cp in self._checkpoints.values() if cp.ticket_id == ticket_id]

    async def cleanup_checkpoints(self, ticket_id: str):
        """Clean up checkpoints and their timing entries for a ticket."""
        # Collect keys first; the dict must not change size while iterated.
        keys_to_remove = [
            key for key, cp in self._checkpoints.items() if cp.ticket_id == ticket_id
        ]
        for key in keys_to_remove:
            del self._checkpoints[key]
            self._last_checkpoint_time.pop(key, None)
357
+
358
+
359
async def with_retry(
    func: Callable, *args, max_retries: int = 3, initial_delay: float = 2.0, **kwargs
) -> Any:
    """
    Simple retry helper for functions that don't need full reliability wrapper.

    Calls ``func(*args, **kwargs)`` (awaiting it when ``func`` is a coroutine
    function) and retries on any ``Exception``, sleeping between attempts for
    the delay computed by ``RetryConfig.get_delay``. Unlike
    ``ReliabilityWrapper`` it does not distinguish retryable from
    non-retryable errors and records no checkpoints.

    Args:
        func: Sync or async callable to invoke.
        *args: Positional arguments for func.
        max_retries: Number of retries after the first attempt. Negative
            values are treated as 0, so func always runs at least once.
        initial_delay: Delay in seconds before the first retry.
        **kwargs: Keyword arguments for func.

    Returns:
        The result of the first successful call.

    Raises:
        Exception: the error from the final attempt once retries run out.

    Usage:
        result = await with_retry(some_async_func, arg1, arg2, max_retries=5)
    """
    # A negative value used to make the loop body never run, returning None
    # without ever calling func.
    max_retries = max(max_retries, 0)
    retry_config = RetryConfig(
        max_retries=max_retries, initial_delay_seconds=initial_delay
    )
    is_async = asyncio.iscoroutinefunction(func)

    for attempt in range(max_retries + 1):
        try:
            if is_async:
                return await func(*args, **kwargs)
            return func(*args, **kwargs)
        except Exception:
            if attempt >= max_retries:
                raise

        # Sleep outside the except block so the failed attempt's exception
        # is not kept alive during the wait.
        await asyncio.sleep(retry_config.get_delay(attempt))

    # Not reached: the final attempt either returns or re-raises above.