horsies-0.1.0a1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- horsies/__init__.py +115 -0
- horsies/core/__init__.py +0 -0
- horsies/core/app.py +552 -0
- horsies/core/banner.py +144 -0
- horsies/core/brokers/__init__.py +5 -0
- horsies/core/brokers/listener.py +444 -0
- horsies/core/brokers/postgres.py +864 -0
- horsies/core/cli.py +624 -0
- horsies/core/codec/serde.py +575 -0
- horsies/core/errors.py +535 -0
- horsies/core/logging.py +90 -0
- horsies/core/models/__init__.py +0 -0
- horsies/core/models/app.py +268 -0
- horsies/core/models/broker.py +79 -0
- horsies/core/models/queues.py +23 -0
- horsies/core/models/recovery.py +101 -0
- horsies/core/models/schedule.py +229 -0
- horsies/core/models/task_pg.py +307 -0
- horsies/core/models/tasks.py +332 -0
- horsies/core/models/workflow.py +1988 -0
- horsies/core/models/workflow_pg.py +245 -0
- horsies/core/registry/tasks.py +101 -0
- horsies/core/scheduler/__init__.py +26 -0
- horsies/core/scheduler/calculator.py +267 -0
- horsies/core/scheduler/service.py +569 -0
- horsies/core/scheduler/state.py +260 -0
- horsies/core/task_decorator.py +615 -0
- horsies/core/types/status.py +38 -0
- horsies/core/utils/imports.py +203 -0
- horsies/core/utils/loop_runner.py +44 -0
- horsies/core/worker/current.py +17 -0
- horsies/core/worker/worker.py +1967 -0
- horsies/core/workflows/__init__.py +23 -0
- horsies/core/workflows/engine.py +2344 -0
- horsies/core/workflows/recovery.py +501 -0
- horsies/core/workflows/registry.py +97 -0
- horsies/py.typed +0 -0
- horsies-0.1.0a1.dist-info/METADATA +31 -0
- horsies-0.1.0a1.dist-info/RECORD +42 -0
- horsies-0.1.0a1.dist-info/WHEEL +5 -0
- horsies-0.1.0a1.dist-info/entry_points.txt +2 -0
- horsies-0.1.0a1.dist-info/top_level.txt +1 -0
horsies/core/workflows/recovery.py
ADDED
@@ -0,0 +1,501 @@
"""Workflow recovery logic.

This module handles recovery of stuck workflows:
- PENDING tasks with all dependencies terminal (race condition during parallel completion)
- READY tasks that weren't enqueued (crash after READY, before INSERT into tasks)
- READY SubWorkflowNodes that weren't started (sub_workflow_id is NULL)
- Child workflows completed but parent node not updated
- RUNNING workflows with no active tasks (all tasks done but workflow not updated)
- Stale RUNNING workflows (no progress for threshold period)
"""

from __future__ import annotations

from typing import TYPE_CHECKING, Any, cast

from sqlalchemy import text

from horsies.core.codec.serde import loads_json, task_result_from_json
from horsies.core.logging import get_logger
from horsies.core.models.workflow import WORKFLOW_TASK_TERMINAL_STATES

if TYPE_CHECKING:
    from sqlalchemy.ext.asyncio import AsyncSession
    from horsies.core.brokers.postgres import PostgresBroker
    from horsies.core.models.tasks import TaskResult, TaskError

logger = get_logger('workflow.recovery')

_WF_TASK_TERMINAL_VALUES: list[str] = [s.value for s in WORKFLOW_TASK_TERMINAL_STATES]


async def recover_stuck_workflows(
    session: 'AsyncSession',
    broker: 'PostgresBroker | None' = None,
) -> int:
    """
    Find and recover workflows in inconsistent states.

    Recovery cases:
    0. PENDING tasks with all deps terminal - race condition during parallel completion
    1. READY tasks that weren't enqueued (task_id is NULL) - crash after READY, before INSERT
    2. RUNNING workflows with all tasks complete - workflow status not updated
    3. Workflows stuck in RUNNING with no progress

    Args:
        session: Database session (caller manages commit)

    Returns:
        Count of recovered workflow tasks.
    """
    recovered = 0

    from horsies.core.workflows.engine import _get_dependency_results

    # Case 0: PENDING tasks with all dependencies terminal (race condition during parallel completion)
    # This happens when multiple dependencies complete concurrently and the PENDING→READY
    # transition is missed due to timing
    pending_ready = await session.execute(
        text("""
            SELECT wt.workflow_id, wt.task_index, wt.dependencies, wt.allow_failed_deps
            FROM horsies_workflow_tasks wt
            JOIN horsies_workflows w ON w.id = wt.workflow_id
            WHERE wt.status = 'PENDING'
            AND w.status = 'RUNNING'
            AND NOT EXISTS (
                SELECT 1 FROM horsies_workflow_tasks dep
                WHERE dep.workflow_id = wt.workflow_id
                AND dep.task_index = ANY(wt.dependencies)
                AND NOT (dep.status = ANY(:wf_task_terminal_states))
            )
        """),
        {'wf_task_terminal_states': _WF_TASK_TERMINAL_VALUES},
    )

    for row in pending_ready.fetchall():
        workflow_id = row[0]
        task_index = row[1]
        raw_deps = row[2]
        allow_failed_deps = row[3] if row[3] is not None else False
        dependencies: list[int] = (
            cast(list[int], raw_deps) if isinstance(raw_deps, list) else []
        )

        # Check if any dependency failed/skipped
        failed_check = await session.execute(
            text("""
                SELECT COUNT(*) FROM horsies_workflow_tasks
                WHERE workflow_id = :wf_id
                AND task_index = ANY(:deps)
                AND status IN ('FAILED', 'SKIPPED')
            """),
            {'wf_id': workflow_id, 'deps': dependencies},
        )
        failed_count = failed_check.scalar() or 0

        if failed_count > 0 and not allow_failed_deps:
            # Skip this task (propagate failure)
            await session.execute(
                text("""
                    UPDATE horsies_workflow_tasks
                    SET status = 'SKIPPED'
                    WHERE workflow_id = :wf_id AND task_index = :idx AND status = 'PENDING'
                """),
                {'wf_id': workflow_id, 'idx': task_index},
            )
            logger.info(
                f'Recovered stuck PENDING task (skipped due to failed deps): '
                f'workflow={workflow_id}, task_index={task_index}'
            )
            recovered += 1
        else:
            # Mark READY and enqueue
            await session.execute(
                text("""
                    UPDATE horsies_workflow_tasks
                    SET status = 'READY'
                    WHERE workflow_id = :wf_id AND task_index = :idx AND status = 'PENDING'
                """),
                {'wf_id': workflow_id, 'idx': task_index},
            )
            dep_results: dict[
                int, 'TaskResult[Any, TaskError]'
            ] = await _get_dependency_results(session, workflow_id, dependencies)

            from horsies.core.workflows.engine import _enqueue_workflow_task

            task_id = await _enqueue_workflow_task(
                session, workflow_id, task_index, dep_results
            )
            if task_id:
                logger.info(
                    f'Recovered stuck PENDING task: workflow={workflow_id}, '
                    f'task_index={task_index}, new_task_id={task_id}'
                )
                recovered += 1

    # Case 1: READY tasks not enqueued (task_id is NULL but status is READY)
    # This happens if worker crashed after marking READY but before creating task
    # Excludes SubWorkflowNodes (handled separately)
    ready_not_enqueued = await session.execute(
        text("""
            SELECT wt.workflow_id, wt.task_index, wt.dependencies
            FROM horsies_workflow_tasks wt
            JOIN horsies_workflows w ON w.id = wt.workflow_id
            WHERE wt.status = 'READY'
            AND wt.task_id IS NULL
            AND wt.is_subworkflow = FALSE
            AND w.status = 'RUNNING'
        """)
    )

    for row in ready_not_enqueued.fetchall():
        workflow_id = row[0]
        task_index = row[1]
        raw_deps = row[2]
        dependencies: list[int] = (
            cast(list[int], raw_deps) if isinstance(raw_deps, list) else []
        )

        # Fetch dependency results and re-enqueue
        dep_results: dict[
            int, 'TaskResult[Any, TaskError]'
        ] = await _get_dependency_results(session, workflow_id, dependencies)

        from horsies.core.workflows.engine import _enqueue_workflow_task

        task_id = await _enqueue_workflow_task(
            session, workflow_id, task_index, dep_results
        )
        if task_id:
            logger.info(
                f'Recovered stuck READY task: workflow={workflow_id}, '
                f'task_index={task_index}, new_task_id={task_id}'
            )
            recovered += 1

    # Case 1.5: READY SubWorkflowNodes not started (sub_workflow_id is NULL)
    # This happens if worker crashed after marking READY but before starting child workflow
    # NOTE: This requires broker to start the child workflow, so we just mark them for retry
    ready_subworkflows = await session.execute(
        text("""
            SELECT wt.workflow_id, wt.task_index, wt.dependencies, w.depth, w.root_workflow_id
            FROM horsies_workflow_tasks wt
            JOIN horsies_workflows w ON w.id = wt.workflow_id
            WHERE wt.status = 'READY'
            AND wt.is_subworkflow = TRUE
            AND wt.sub_workflow_id IS NULL
            AND w.status = 'RUNNING'
        """)
    )

    for row in ready_subworkflows.fetchall():
        workflow_id = row[0]
        task_index = row[1]
        dependencies = row[2]
        depth = row[3] or 0
        root_wf_id = row[4] or workflow_id

        if broker is not None:
            from horsies.core.workflows.engine import (
                _enqueue_subworkflow_task,
                _get_dependency_results,
            )

            dep_indices: list[int] = (
                cast(list[int], dependencies) if isinstance(dependencies, list) else []
            )
            dep_results = await _get_dependency_results(
                session, workflow_id, dep_indices
            )
            await _enqueue_subworkflow_task(
                session, broker, workflow_id, task_index, dep_results, depth, root_wf_id
            )
            logger.info(
                f'Recovered stuck READY subworkflow (started): '
                f'workflow={workflow_id}, task_index={task_index}'
            )
        else:
            # Reset to PENDING so a future evaluation can start it
            await session.execute(
                text("""
                    UPDATE horsies_workflow_tasks
                    SET status = 'PENDING'
                    WHERE workflow_id = :wf_id AND task_index = :idx AND status = 'READY'
                """),
                {'wf_id': workflow_id, 'idx': task_index},
            )
            logger.info(
                f'Recovered stuck READY subworkflow (reset to PENDING): '
                f'workflow={workflow_id}, task_index={task_index}'
            )
        recovered += 1

    # Case 1.6: Child workflows completed but parent node not updated
    # This happens if the _on_subworkflow_complete callback failed or was interrupted
    completed_children = await session.execute(
        text("""
            SELECT child.id, child.parent_workflow_id, child.parent_task_index, child.status
            FROM horsies_workflows child
            JOIN horsies_workflows parent ON parent.id = child.parent_workflow_id
            JOIN horsies_workflow_tasks wt ON wt.workflow_id = parent.id AND wt.task_index = child.parent_task_index
            WHERE child.status IN ('COMPLETED', 'FAILED')
            AND wt.status = 'RUNNING'
            AND parent.status = 'RUNNING'
        """)
    )

    for row in completed_children.fetchall():
        child_id = row[0]
        parent_wf_id = row[1]
        parent_task_idx = row[2]
        child_status = row[3]

        # Re-trigger the subworkflow completion callback
        from horsies.core.workflows.engine import _on_subworkflow_complete

        await _on_subworkflow_complete(session, child_id, broker)
        logger.info(
            f'Recovered stuck child workflow completion: child={child_id}, '
            f'parent={parent_wf_id}:{parent_task_idx}, child_status={child_status}'
        )
        recovered += 1

    # Case 1.7: workflow_tasks stuck non-terminal but underlying task is already terminal.
    # This happens when a worker crashes mid-execution:
    # - Reaper marks tasks.status = FAILED (WORKER_CRASHED)
    # - on_workflow_task_complete() was never called (worker died)
    # - workflow_tasks row stays RUNNING/ENQUEUED indefinitely
    crashed_worker_tasks = await session.execute(
        text("""
            SELECT wt.workflow_id, wt.task_index, wt.task_id,
                   UPPER(t.status) as task_status, t.result as task_result
            FROM horsies_workflow_tasks wt
            JOIN horsies_tasks t ON t.id = wt.task_id
            JOIN horsies_workflows w ON w.id = wt.workflow_id
            WHERE NOT (wt.status = ANY(:wf_task_terminal_states))
            AND wt.task_id IS NOT NULL
            AND wt.is_subworkflow = FALSE
            AND w.status = 'RUNNING'
            AND UPPER(t.status) IN ('COMPLETED', 'FAILED', 'CANCELLED')
        """),
        {'wf_task_terminal_states': _WF_TASK_TERMINAL_VALUES},
    )

    for row in crashed_worker_tasks.fetchall():
        workflow_id = row[0]
        task_index = row[1]
        task_id = row[2]
        task_status = row[3]  # uppercase: COMPLETED, FAILED, or CANCELLED
        raw_task_result = row[4]

        from horsies.core.models.tasks import TaskResult, TaskError, LibraryErrorCode

        # Deserialize TaskResult from tasks.result, or build a synthetic one
        if raw_task_result is not None:
            result: TaskResult[Any, TaskError] = task_result_from_json(
                loads_json(raw_task_result),
            )
        else:
            # No result stored (e.g. crash before result, DB issue, or cancellation)
            if task_status == 'CANCELLED':
                error_code = LibraryErrorCode.TASK_CANCELLED
                message = 'Task was cancelled before producing a result'
            elif task_status == 'COMPLETED':
                error_code = LibraryErrorCode.RESULT_NOT_AVAILABLE
                message = 'Task completed but result is missing'
            else:
                error_code = LibraryErrorCode.WORKER_CRASHED
                message = (
                    'Worker crashed during task execution '
                    f'(task_status={task_status}, no result stored)'
                )

            result = TaskResult(
                err=TaskError(
                    error_code=error_code,
                    message=message,
                    data={
                        'task_id': task_id,
                        'task_status': task_status,
                        'recovery': 'case_1_7',
                    },
                ),
            )

        # Reuse the existing completion handler to update workflow_tasks,
        # apply on_error policy, process dependents, and check workflow completion
        from horsies.core.workflows.engine import on_workflow_task_complete

        await on_workflow_task_complete(session, task_id, result, broker)
        logger.info(
            f'Recovered crashed worker workflow task: workflow={workflow_id}, '
            f'task_index={task_index}, task_id={task_id}, task_status={task_status}'
        )
        recovered += 1

    # Case 2+3: Workflows with all tasks terminal but workflow still RUNNING
    # This handles both completed and failed workflows, respecting success_policy.
    # This happens if worker crashed after completing last task but before updating workflow
    terminal_candidates = await session.execute(
        text("""
            SELECT w.id, w.error, w.success_policy,
                   COUNT(*) FILTER (WHERE wt.status = 'FAILED') as failed_count
            FROM horsies_workflows w
            LEFT JOIN horsies_workflow_tasks wt ON wt.workflow_id = w.id
            WHERE w.status = 'RUNNING'
            AND NOT EXISTS (
                SELECT 1 FROM horsies_workflow_tasks wt2
                WHERE wt2.workflow_id = w.id
                AND NOT (wt2.status = ANY(:wf_task_terminal_states))
            )
            GROUP BY w.id, w.error, w.success_policy
        """),
        {'wf_task_terminal_states': _WF_TASK_TERMINAL_VALUES},
    )

    for row in terminal_candidates.fetchall():
        workflow_id = row[0]
        existing_error = row[1]
        success_policy_data = row[2]
        failed_count = row[3] or 0

        # Compute final result
        from horsies.core.workflows.engine import (
            _get_workflow_final_result,
            _evaluate_workflow_success,
            _get_workflow_failure_error,
        )

        final_result = await _get_workflow_final_result(session, workflow_id)

        # Evaluate success using success_policy (or default behavior)
        has_error = existing_error is not None
        workflow_succeeded = await _evaluate_workflow_success(
            session, workflow_id, success_policy_data, has_error, failed_count
        )

        if workflow_succeeded:
            await session.execute(
                text("""
                    UPDATE horsies_workflows
                    SET status = 'COMPLETED', result = :result, completed_at = NOW(), updated_at = NOW()
                    WHERE id = :wf_id AND status = 'RUNNING'
                """),
                {'wf_id': workflow_id, 'result': final_result},
            )
            logger.info(f'Recovered stuck COMPLETED workflow: {workflow_id}')
        else:
            # Compute error if not already set
            error_payload = existing_error
            if error_payload is None:
                error_payload = await _get_workflow_failure_error(
                    session, workflow_id, success_policy_data
                )

            await session.execute(
                text("""
                    UPDATE horsies_workflows
                    SET status = 'FAILED', result = :result, error = :error,
                        completed_at = NOW(), updated_at = NOW()
                    WHERE id = :wf_id AND status = 'RUNNING'
                """),
                {'wf_id': workflow_id, 'result': final_result, 'error': error_payload},
            )
            logger.info(f'Recovered stuck FAILED workflow: {workflow_id}')

        # Send NOTIFY for workflow completion
        await session.execute(
            text("SELECT pg_notify('workflow_done', :wf_id)"),
            {'wf_id': workflow_id},
        )
        recovered += 1

    return recovered


async def _get_first_failed_task_error(
    session: 'AsyncSession',
    workflow_id: str,
) -> str | None:
    """
    Get the error payload from the first failed task in a workflow.

    Returns the serialized TaskError from the first FAILED task (ordered by task_index),
    or None if no failed task has an error.
    """
    from horsies.core.codec.serde import dumps_json

    result = await session.execute(
        text("""
            SELECT result
            FROM horsies_workflow_tasks
            WHERE workflow_id = :wf_id
            AND status = 'FAILED'
            ORDER BY task_index ASC
            LIMIT 1
        """),
        {'wf_id': workflow_id},
    )

    row = result.fetchone()
    if row is None or row[0] is None:
        return None

    task_result = task_result_from_json(loads_json(row[0]))
    if task_result.is_err() and task_result.err:
        return dumps_json(task_result.err)

    return None


async def _get_dependency_results(
    session: 'AsyncSession',
    workflow_id: str,
    dependency_indices: list[int],
) -> dict[int, 'TaskResult[Any, TaskError]']:
    """
    Fetch TaskResults for dependencies in terminal states.

    - COMPLETED/FAILED: returns actual TaskResult from stored result
    - SKIPPED: returns sentinel TaskResult with UPSTREAM_SKIPPED error
    """
    from horsies.core.models.tasks import TaskError, LibraryErrorCode, TaskResult

    if not dependency_indices:
        return {}

    result = await session.execute(
        text("""
            SELECT task_index, status, result
            FROM horsies_workflow_tasks
            WHERE workflow_id = :wf_id
            AND task_index = ANY(:indices)
            AND status = ANY(:wf_task_terminal_states)
        """),
        {
            'wf_id': workflow_id,
            'indices': dependency_indices,
            'wf_task_terminal_states': _WF_TASK_TERMINAL_VALUES,
        },
    )

    results: dict[int, TaskResult[Any, TaskError]] = {}
    for row in result.fetchall():
        task_index = row[0]
        status = row[1]
        stored_result = row[2]

        if status == 'SKIPPED':
            # Inject sentinel TaskResult for SKIPPED dependencies
            results[task_index] = TaskResult(
                err=TaskError(
                    error_code=LibraryErrorCode.UPSTREAM_SKIPPED,
                    message='Upstream dependency was SKIPPED',
                    data={'dependency_index': task_index},
                )
            )
        elif stored_result:
            results[task_index] = task_result_from_json(loads_json(stored_result))

    return results
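The recovery pass above is an ordinary async function over a SQLAlchemy session, so it can be driven by any periodic runner. Below is a minimal usage sketch, not part of the wheel: the DSN, the 30-second interval, and the absence of a broker are illustrative assumptions. With broker=None, Case 1.5 resets stuck READY sub-workflow nodes to PENDING instead of starting them.

import asyncio

from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine

from horsies.core.workflows.recovery import recover_stuck_workflows


async def recovery_loop() -> None:
    # Hypothetical DSN; point it at the database the workers use.
    engine = create_async_engine('postgresql+psycopg://app:app@localhost:5432/horsies')
    session_factory = async_sessionmaker(engine, expire_on_commit=False)
    while True:
        async with session_factory() as session:
            recovered = await recover_stuck_workflows(session, broker=None)
            await session.commit()  # the function leaves commit to the caller
            if recovered:
                print(f'recovered {recovered} workflow task(s)')
        await asyncio.sleep(30)  # illustrative polling interval


asyncio.run(recovery_loop())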
horsies/core/workflows/registry.py
ADDED
@@ -0,0 +1,97 @@
"""Registry for WorkflowSpec objects.

Enables condition evaluation by providing access to TaskNode and SubWorkflowNode
objects at runtime. Workers that import workflow modules will automatically
register the specs.

NOTE: For conditions to work, the workflow module must be imported
in the worker process so specs are registered.
"""

from __future__ import annotations

from typing import TYPE_CHECKING, Any
from weakref import WeakValueDictionary

if TYPE_CHECKING:
    from horsies.core.models.workflow import TaskNode, SubWorkflowNode, WorkflowSpec

# Registry: (workflow_name, task_index) -> TaskNode | SubWorkflowNode
# Use weak references so specs can be garbage collected
_nodes_by_spec: WeakValueDictionary[tuple[str, int], Any] = WeakValueDictionary()

# Strong reference to keep specs alive during execution
_active_specs: dict[str, 'WorkflowSpec'] = {}


def register_workflow_spec(spec: 'WorkflowSpec') -> None:
    """
    Register a WorkflowSpec for condition evaluation and subworkflow lookup.

    Called automatically when WorkflowSpec is created.
    Workers need to import the same module to have access to conditions.
    """
    _active_specs[spec.name] = spec
    for node in spec.tasks:
        if node.index is not None:
            _nodes_by_spec[(spec.name, node.index)] = node


def unregister_workflow_spec(name: str) -> None:
    """Remove a workflow spec from the registry."""
    if name in _active_specs:
        spec = _active_specs[name]
        for node in spec.tasks:
            if node.index is not None:
                key = (name, node.index)
                if key in _nodes_by_spec:
                    del _nodes_by_spec[key]
        del _active_specs[name]


def get_task_node(workflow_name: str, task_index: int) -> 'TaskNode[Any] | None':
    """
    Look up a TaskNode by workflow name and task index.

    Returns None if not found (workflow not registered in this process).
    """
    from typing import cast
    from horsies.core.models.workflow import TaskNode

    node = _nodes_by_spec.get((workflow_name, task_index))
    if node is not None and isinstance(node, TaskNode):
        return cast('TaskNode[Any]', node)
    return None


def get_subworkflow_node(
    workflow_name: str, task_index: int
) -> 'SubWorkflowNode[Any] | None':
    """
    Look up a SubWorkflowNode by workflow name and task index.

    Returns None if not found (workflow not registered in this process).
    """
    from typing import cast
    from horsies.core.models.workflow import SubWorkflowNode

    node = _nodes_by_spec.get((workflow_name, task_index))
    if node is not None and isinstance(node, SubWorkflowNode):
        return cast('SubWorkflowNode[Any]', node)
    return None


def get_node(
    workflow_name: str, task_index: int
) -> 'TaskNode[Any] | SubWorkflowNode[Any] | None':
    """
    Look up any node (TaskNode or SubWorkflowNode) by workflow name and task index.

    Returns None if not found (workflow not registered in this process).
    """
    return _nodes_by_spec.get((workflow_name, task_index))


def is_workflow_registered(name: str) -> bool:
    """Check if a workflow is registered."""
    return name in _active_specs
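Because registration happens as an import side effect, a worker-side caller only needs to import the workflow module before consulting the registry. A minimal sketch, not part of the wheel; the module path 'my_project.workflows' and the workflow name 'nightly_etl' are hypothetical:

import importlib

from horsies.core.workflows.registry import get_node, is_workflow_registered

# Importing the module that builds the WorkflowSpec registers it as a side effect.
importlib.import_module('my_project.workflows')

if is_workflow_registered('nightly_etl'):
    node = get_node('nightly_etl', task_index=0)
    # node is a TaskNode, a SubWorkflowNode, or None if the index is unknown
    print(type(node).__name__ if node is not None else 'no node at index 0')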
horsies/py.typed
ADDED
File without changes
horsies-0.1.0a1.dist-info/METADATA
ADDED
@@ -0,0 +1,31 @@
Metadata-Version: 2.4
Name: horsies
Version: 0.1.0a1
Summary: A Python library for distributed task execution
Author: Suleyman Ozkeskin
License-Expression: MIT
Project-URL: Homepage, https://github.com/suleymanozkeskin/horsies
Project-URL: Repository, https://github.com/suleymanozkeskin/horsies
Project-URL: Issues, https://github.com/suleymanozkeskin/horsies/issues
Keywords: task-queue,workflow-engine,dag,scheduling,distributed,postgres,async
Classifier: Development Status :: 3 - Alpha
Classifier: Intended Audience :: Developers
Classifier: Programming Language :: Python :: 3.13
Classifier: Topic :: Software Development :: Libraries
Classifier: Topic :: System :: Distributed Computing
Classifier: Topic :: System :: Networking
Classifier: Typing :: Typed
Requires-Python: >=3.13
Description-Content-Type: text/markdown
Requires-Dist: greenlet>=3.3.0
Requires-Dist: psutil>=7.2.1
Requires-Dist: psycopg>=3.3.2
Requires-Dist: psycopg-pool>=3.2.0
Requires-Dist: pydantic>=2.12.5
Requires-Dist: sqlalchemy>=2.0.46

# Horsies

A PostgreSQL-backed distributed task queue and workflow engine for Python.

Documentation: [horsies docs](https://github.com/suleymanozkeskin/horsies/tree/main/website)
horsies-0.1.0a1.dist-info/RECORD
ADDED
@@ -0,0 +1,42 @@
horsies/__init__.py,sha256=Fr_4G82B68cs3p7wShUum2fQlmey1hBf4pAxDz7M6D0,2739
horsies/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
horsies/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
horsies/core/app.py,sha256=sRLVto5oFH0Mb3cwrzJXd2WoPEjoYL2ByJc764DW238,21523
horsies/core/banner.py,sha256=eCp8crZZWS6JjWoTn6RtCTEpvGmkpr7hWke_Av5-bvI,5716
horsies/core/cli.py,sha256=XPMRRvZqNg3ph96gWCfo0NJDOsqqe0Cuj_8Wx80sQP0,21057
horsies/core/errors.py,sha256=M0PMSdsqbOMPOsZsEOmfKTvxdAD5E7BZ0IuDus7HWtk,16640
horsies/core/logging.py,sha256=p2utHDeOHgvwtSzKTKXh0RwUNCk38ODmgQQkmsD1tWA,2934
horsies/core/task_decorator.py,sha256=kn1xD76dqF4mjXAut5GWUHwHZ4JIBsavNvW1-8Fr-OA,22449
horsies/core/brokers/__init__.py,sha256=a_5xkHhRKY-1PRq6UidMBGq1bX-zeuSdxIvI6GdSiSQ,94
horsies/core/brokers/listener.py,sha256=pxnnAgLAWqODPV_J0XwUqAhBSrHstL77PSHYEfGoVhc,18637
horsies/core/brokers/postgres.py,sha256=n_WjM7-Fbg9R5Vao9RJgDSy7eBIL1_pVvnKrjMayzd4,35134
horsies/core/codec/serde.py,sha256=kTU8d0TGqcv70fgMwJc1uGedqKMiLOFamXAYlZMYQaw,19182
horsies/core/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
horsies/core/models/app.py,sha256=I3HKztjNzii75bDwlbB7Jl9CPrFMfjLInUJRacQ39_s,11863
horsies/core/models/broker.py,sha256=yNf-tKSvJP1oIDis-jZwFMsT5exDCe27oAcnLRFSrRA,3041
horsies/core/models/queues.py,sha256=brB8rk_PQZds_VgavpZH3fkgqzsBAvRTySNNAtzikSw,699
horsies/core/models/recovery.py,sha256=sI9h7Kbx_Jv23-J_0fTZRrrMgwNqmkX0-dWKTnpikvA,4964
horsies/core/models/schedule.py,sha256=5mCMsX-12OPz6Gs-nr_GtaJgjxDzuyjuhSTil2XIq50,8113
horsies/core/models/task_pg.py,sha256=v5U1huJwASh2Saf8GDaRX-IM4mXcD8Fu43kfrFSFDLg,12304
horsies/core/models/tasks.py,sha256=NBGBw80XniOG3_r1nOB4SE_-N83h8WHQ7FH0tPlr41M,10863
horsies/core/models/workflow.py,sha256=KZE7l3cjePpTyKqjTOq5erKyptcAia08ai4v59LVtNo,71688
horsies/core/models/workflow_pg.py,sha256=Nd6JDy4Htft9qQ4xKW4U2vhKNe-uUMYYZfg14t-yJnk,9315
horsies/core/registry/tasks.py,sha256=mm1xb-f2HLUcZJrLgx2ZS-FQtCkuJbfsR-SW2qiMsts,3708
horsies/core/scheduler/__init__.py,sha256=m0GqCrdTbQNDUV1Fn3UZD5IewAYsV5olMrDRolg3a1I,699
horsies/core/scheduler/calculator.py,sha256=F3_9WoDKKq-1WH9Gkkne9HE4QnC_CtfsmoONPyIaXIU,8821
horsies/core/scheduler/service.py,sha256=DNixYHbEcztHHrOK3Ud_tVIfLyFJrujCyHlqN-peBpI,21711
horsies/core/scheduler/state.py,sha256=PAG2buNI2_jCIIFz4ofU4EaOTKbyNtHStM8vlmvRpIc,9211
horsies/core/types/status.py,sha256=lxepSeVJjYvAl6kaCuaeHjHf-_IrXhsfGfm1kpFkSUc,1074
horsies/core/utils/imports.py,sha256=srIgHxxQeguMK2pNr6gama7AIqSB2embJlPpaor6hyI,6457
horsies/core/utils/loop_runner.py,sha256=0DlbmtD8TefhyAg9edaZBHEvmze-sDj1behLKjuH4xY,1460
horsies/core/worker/current.py,sha256=UBkgevE9ulG4LFJoDlEXgpaco34qbV9Esk0tYdaBbMo,415
horsies/core/worker/worker.py,sha256=POhtTwvTjNogaXkBv4uOSD3ogb5LvuQz86A2jWmyW7I,76900
horsies/core/workflows/__init__.py,sha256=JA-8wcYUHp-wF5OCEqJwKElT8PaZZB1G7iglDZ7WNiI,579
horsies/core/workflows/engine.py,sha256=8Gjmch52g1IObKqzGVMWxZaZDC0diB6TuAtRfqr67l0,86927
horsies/core/workflows/recovery.py,sha256=xRKQPiYaLcX4vLuvcNGy5cV14eW4Cp_sewD1nXv8gc8,19445
horsies/core/workflows/registry.py,sha256=ItpjTN8yK9NNSV8Q8OwDnHdQSO-hxDzxeWAyGvExpRk,3194
horsies-0.1.0a1.dist-info/METADATA,sha256=lhzFDdWeU-Ss1nMFmU9nnY6Lh1ffXYqBDBcHsd6JwVM,1200
horsies-0.1.0a1.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
horsies-0.1.0a1.dist-info/entry_points.txt,sha256=6b_OhuNbJ1ky9WSOt3UfNQXETG2YuH0lKimcibILrEE,50
horsies-0.1.0a1.dist-info/top_level.txt,sha256=ZQ_NurgT-yr3pE4a1svlO_VAhGJ6NFA31vJDLT1yyOA,8
horsies-0.1.0a1.dist-info/RECORD,,
horsies-0.1.0a1.dist-info/top_level.txt
ADDED
@@ -0,0 +1 @@
horsies