horsies 0.1.0a4__py3-none-any.whl → 0.1.0a5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- horsies/core/app.py +67 -47
- horsies/core/banner.py +27 -27
- horsies/core/brokers/postgres.py +315 -288
- horsies/core/cli.py +7 -2
- horsies/core/errors.py +3 -0
- horsies/core/models/app.py +87 -64
- horsies/core/models/recovery.py +30 -21
- horsies/core/models/schedule.py +30 -19
- horsies/core/models/tasks.py +1 -0
- horsies/core/models/workflow.py +489 -202
- horsies/core/models/workflow_pg.py +3 -1
- horsies/core/scheduler/service.py +5 -1
- horsies/core/scheduler/state.py +39 -27
- horsies/core/task_decorator.py +138 -0
- horsies/core/types/status.py +7 -5
- horsies/core/utils/imports.py +10 -10
- horsies/core/worker/worker.py +197 -139
- horsies/core/workflows/engine.py +487 -352
- horsies/core/workflows/recovery.py +148 -119
- {horsies-0.1.0a4.dist-info → horsies-0.1.0a5.dist-info}/METADATA +1 -1
- horsies-0.1.0a5.dist-info/RECORD +42 -0
- horsies-0.1.0a4.dist-info/RECORD +0 -42
- {horsies-0.1.0a4.dist-info → horsies-0.1.0a5.dist-info}/WHEEL +0 -0
- {horsies-0.1.0a4.dist-info → horsies-0.1.0a5.dist-info}/entry_points.txt +0 -0
- {horsies-0.1.0a4.dist-info → horsies-0.1.0a5.dist-info}/top_level.txt +0 -0
horsies/core/worker/worker.py
CHANGED
|
@@ -202,6 +202,7 @@ def _child_initializer(
|
|
|
202
202
|
|
|
203
203
|
# Set log level for this child process before any logging
|
|
204
204
|
from horsies.core.logging import set_default_level
|
|
205
|
+
|
|
205
206
|
set_default_level(loglevel)
|
|
206
207
|
|
|
207
208
|
# Mark child process to adjust logging behavior during module import
|
|
@@ -236,9 +237,7 @@ def _child_initializer(
|
|
|
236
237
|
except Exception:
|
|
237
238
|
pass
|
|
238
239
|
combined_imports = _dedupe_paths(combined_imports)
|
|
239
|
-
_debug_imports_log(
|
|
240
|
-
f'[child {os.getpid()}] import_modules={combined_imports}'
|
|
241
|
-
)
|
|
240
|
+
_debug_imports_log(f'[child {os.getpid()}] import_modules={combined_imports}')
|
|
242
241
|
for m in combined_imports:
|
|
243
242
|
if m.endswith('.py') or os.path.sep in m:
|
|
244
243
|
m_abs = os.path.abspath(m)
|
|
@@ -328,7 +327,12 @@ def _heartbeat_worker(
|
|
|
328
327
|
|
|
329
328
|
def _is_retryable_db_error(exc: BaseException) -> bool:
|
|
330
329
|
match exc:
|
|
331
|
-
case
|
|
330
|
+
case (
|
|
331
|
+
OperationalError()
|
|
332
|
+
| InterfaceError()
|
|
333
|
+
| SerializationFailure()
|
|
334
|
+
| DeadlockDetected()
|
|
335
|
+
):
|
|
332
336
|
return True
|
|
333
337
|
case _:
|
|
334
338
|
return False
|
|
@@ -807,6 +811,166 @@ class WorkerConfig:
|
|
|
807
811
|
loglevel: int = 20 # logging.INFO
|
|
808
812
|
|
|
809
813
|
|
|
814
|
+
# ---------- Worker SQL constants ----------
|
|
815
|
+
|
|
816
|
+
CLAIM_ADVISORY_LOCK_SQL = text("""
|
|
817
|
+
SELECT pg_advisory_xact_lock(CAST(:key AS BIGINT))
|
|
818
|
+
""")
|
|
819
|
+
|
|
820
|
+
COUNT_GLOBAL_IN_FLIGHT_SQL = text("""
|
|
821
|
+
SELECT COUNT(*) FROM horsies_tasks WHERE status IN ('RUNNING', 'CLAIMED')
|
|
822
|
+
""")
|
|
823
|
+
|
|
824
|
+
COUNT_QUEUE_IN_FLIGHT_HARD_SQL = text("""
|
|
825
|
+
SELECT COUNT(*) FROM horsies_tasks WHERE status IN ('RUNNING', 'CLAIMED') AND queue_name = :q
|
|
826
|
+
""")
|
|
827
|
+
|
|
828
|
+
COUNT_QUEUE_IN_FLIGHT_SOFT_SQL = text("""
|
|
829
|
+
SELECT COUNT(*) FROM horsies_tasks WHERE status = 'RUNNING' AND queue_name = :q
|
|
830
|
+
""")
|
|
831
|
+
|
|
832
|
+
COUNT_CLAIMED_FOR_WORKER_SQL = text("""
|
|
833
|
+
SELECT COUNT(*)
|
|
834
|
+
FROM horsies_tasks
|
|
835
|
+
WHERE claimed_by_worker_id = CAST(:wid AS VARCHAR)
|
|
836
|
+
AND status = 'CLAIMED'
|
|
837
|
+
""")
|
|
838
|
+
|
|
839
|
+
COUNT_RUNNING_FOR_WORKER_SQL = text("""
|
|
840
|
+
SELECT COUNT(*)
|
|
841
|
+
FROM horsies_tasks
|
|
842
|
+
WHERE claimed_by_worker_id = CAST(:wid AS VARCHAR)
|
|
843
|
+
AND status = 'RUNNING'
|
|
844
|
+
""")
|
|
845
|
+
|
|
846
|
+
COUNT_IN_FLIGHT_FOR_WORKER_SQL = text("""
|
|
847
|
+
SELECT COUNT(*)
|
|
848
|
+
FROM horsies_tasks
|
|
849
|
+
WHERE claimed_by_worker_id = CAST(:wid AS VARCHAR)
|
|
850
|
+
AND status IN ('RUNNING', 'CLAIMED')
|
|
851
|
+
""")
|
|
852
|
+
|
|
853
|
+
COUNT_RUNNING_IN_QUEUE_SQL = text("""
|
|
854
|
+
SELECT COUNT(*)
|
|
855
|
+
FROM horsies_tasks
|
|
856
|
+
WHERE status = 'RUNNING'
|
|
857
|
+
AND queue_name = :q
|
|
858
|
+
""")
|
|
859
|
+
|
|
860
|
+
GET_PAUSED_WORKFLOW_TASK_IDS_SQL = text("""
|
|
861
|
+
SELECT t.id
|
|
862
|
+
FROM horsies_tasks t
|
|
863
|
+
JOIN horsies_workflow_tasks wt ON wt.task_id = t.id
|
|
864
|
+
JOIN horsies_workflows w ON w.id = wt.workflow_id
|
|
865
|
+
WHERE t.id = ANY(:ids)
|
|
866
|
+
AND w.status = 'PAUSED'
|
|
867
|
+
""")
|
|
868
|
+
|
|
869
|
+
UNCLAIM_PAUSED_TASKS_SQL = text("""
|
|
870
|
+
UPDATE horsies_tasks
|
|
871
|
+
SET status = 'PENDING',
|
|
872
|
+
claimed = FALSE,
|
|
873
|
+
claimed_at = NULL,
|
|
874
|
+
claimed_by_worker_id = NULL,
|
|
875
|
+
updated_at = NOW()
|
|
876
|
+
WHERE id = ANY(:ids)
|
|
877
|
+
""")
|
|
878
|
+
|
|
879
|
+
RESET_PAUSED_WORKFLOW_TASKS_SQL = text("""
|
|
880
|
+
UPDATE horsies_workflow_tasks
|
|
881
|
+
SET status = 'READY', task_id = NULL, started_at = NULL
|
|
882
|
+
WHERE task_id = ANY(:ids)
|
|
883
|
+
""")
|
|
884
|
+
|
|
885
|
+
MARK_TASK_FAILED_WORKER_SQL = text("""
|
|
886
|
+
UPDATE horsies_tasks
|
|
887
|
+
SET status='FAILED',
|
|
888
|
+
failed_at = :now,
|
|
889
|
+
failed_reason = :reason,
|
|
890
|
+
updated_at = :now
|
|
891
|
+
WHERE id = :id
|
|
892
|
+
""")
|
|
893
|
+
|
|
894
|
+
MARK_TASK_FAILED_SQL = text("""
|
|
895
|
+
UPDATE horsies_tasks
|
|
896
|
+
SET status='FAILED',
|
|
897
|
+
failed_at = :now,
|
|
898
|
+
result = :result_json,
|
|
899
|
+
updated_at = :now
|
|
900
|
+
WHERE id = :id
|
|
901
|
+
""")
|
|
902
|
+
|
|
903
|
+
MARK_TASK_COMPLETED_SQL = text("""
|
|
904
|
+
UPDATE horsies_tasks
|
|
905
|
+
SET status='COMPLETED',
|
|
906
|
+
completed_at = :now,
|
|
907
|
+
result = :result_json,
|
|
908
|
+
updated_at = :now
|
|
909
|
+
WHERE id = :id
|
|
910
|
+
""")
|
|
911
|
+
|
|
912
|
+
GET_TASK_QUEUE_NAME_SQL = text("""
|
|
913
|
+
SELECT queue_name FROM horsies_tasks WHERE id = :id
|
|
914
|
+
""")
|
|
915
|
+
|
|
916
|
+
NOTIFY_TASK_NEW_SQL = text("""
|
|
917
|
+
SELECT pg_notify(:c1, :p)
|
|
918
|
+
""")
|
|
919
|
+
|
|
920
|
+
NOTIFY_TASK_QUEUE_SQL = text("""
|
|
921
|
+
SELECT pg_notify(:c2, :p)
|
|
922
|
+
""")
|
|
923
|
+
|
|
924
|
+
CHECK_WORKFLOW_TASK_EXISTS_SQL = text("""
|
|
925
|
+
SELECT 1 FROM horsies_workflow_tasks WHERE task_id = :tid LIMIT 1
|
|
926
|
+
""")
|
|
927
|
+
|
|
928
|
+
GET_TASK_RETRY_INFO_SQL = text("""
|
|
929
|
+
SELECT retry_count, max_retries, task_options FROM horsies_tasks WHERE id = :id
|
|
930
|
+
""")
|
|
931
|
+
|
|
932
|
+
GET_TASK_RETRY_CONFIG_SQL = text("""
|
|
933
|
+
SELECT retry_count, task_options FROM horsies_tasks WHERE id = :id
|
|
934
|
+
""")
|
|
935
|
+
|
|
936
|
+
SCHEDULE_TASK_RETRY_SQL = text("""
|
|
937
|
+
UPDATE horsies_tasks
|
|
938
|
+
SET status = 'PENDING',
|
|
939
|
+
retry_count = :retry_count,
|
|
940
|
+
next_retry_at = :next_retry_at,
|
|
941
|
+
sent_at = :next_retry_at,
|
|
942
|
+
updated_at = now()
|
|
943
|
+
WHERE id = :id
|
|
944
|
+
""")
|
|
945
|
+
|
|
946
|
+
NOTIFY_DELAYED_SQL = text("""
|
|
947
|
+
SELECT pg_notify(:channel, :payload)
|
|
948
|
+
""")
|
|
949
|
+
|
|
950
|
+
INSERT_CLAIMER_HEARTBEAT_SQL = text("""
|
|
951
|
+
INSERT INTO horsies_heartbeats (task_id, sender_id, role, sent_at, hostname, pid)
|
|
952
|
+
SELECT id, CAST(:wid AS VARCHAR), 'claimer', NOW(), :host, :pid
|
|
953
|
+
FROM horsies_tasks
|
|
954
|
+
WHERE status = 'CLAIMED' AND claimed_by_worker_id = CAST(:wid AS VARCHAR)
|
|
955
|
+
""")
|
|
956
|
+
|
|
957
|
+
INSERT_WORKER_STATE_SQL = text("""
|
|
958
|
+
INSERT INTO horsies_worker_states (
|
|
959
|
+
worker_id, snapshot_at, hostname, pid,
|
|
960
|
+
processes, max_claim_batch, max_claim_per_worker,
|
|
961
|
+
cluster_wide_cap, queues, queue_priorities, queue_max_concurrency,
|
|
962
|
+
recovery_config, tasks_running, tasks_claimed,
|
|
963
|
+
memory_usage_mb, memory_percent, cpu_percent,
|
|
964
|
+
worker_started_at
|
|
965
|
+
)
|
|
966
|
+
VALUES (
|
|
967
|
+
:wid, NOW(), :host, :pid, :procs, :mcb, :mcpw, :cwc,
|
|
968
|
+
:queues, :qp, :qmc, :recovery, :running, :claimed,
|
|
969
|
+
:mem_mb, :mem_pct, :cpu_pct, :started
|
|
970
|
+
)
|
|
971
|
+
""")
|
|
972
|
+
|
|
973
|
+
|
|
810
974
|
class Worker:
|
|
811
975
|
"""
|
|
812
976
|
Async master that:
|
|
@@ -846,7 +1010,9 @@ class Worker:
|
|
|
846
1010
|
|
|
847
1011
|
# Create the process pool AFTER successful preload so initializer runs in children only
|
|
848
1012
|
# Compute the plain psycopg database URL for child processes
|
|
849
|
-
child_database_url = self.cfg.dsn.replace('+asyncpg', '').replace(
|
|
1013
|
+
child_database_url = self.cfg.dsn.replace('+asyncpg', '').replace(
|
|
1014
|
+
'+psycopg', ''
|
|
1015
|
+
)
|
|
850
1016
|
self._executor = ProcessPoolExecutor(
|
|
851
1017
|
max_workers=self.cfg.processes,
|
|
852
1018
|
initializer=_child_initializer,
|
|
@@ -1081,7 +1247,7 @@ class Worker:
|
|
|
1081
1247
|
async with self.sf() as s:
|
|
1082
1248
|
# Take a cluster-wide transaction-scoped advisory lock to serialize claiming
|
|
1083
1249
|
await s.execute(
|
|
1084
|
-
|
|
1250
|
+
CLAIM_ADVISORY_LOCK_SQL,
|
|
1085
1251
|
{'key': self._advisory_key_global()},
|
|
1086
1252
|
)
|
|
1087
1253
|
|
|
@@ -1105,11 +1271,7 @@ class Worker:
|
|
|
1105
1271
|
if self.cfg.cluster_wide_cap is not None:
|
|
1106
1272
|
# Hard cap mode: count RUNNING + CLAIMED globally
|
|
1107
1273
|
# (Note: prefetch_buffer must be 0 when cluster_wide_cap is set, enforced by config validation)
|
|
1108
|
-
res = await s.execute(
|
|
1109
|
-
text(
|
|
1110
|
-
"SELECT COUNT(*) FROM horsies_tasks WHERE status IN ('RUNNING', 'CLAIMED')"
|
|
1111
|
-
)
|
|
1112
|
-
)
|
|
1274
|
+
res = await s.execute(COUNT_GLOBAL_IN_FLIGHT_SQL)
|
|
1113
1275
|
row = res.fetchone()
|
|
1114
1276
|
if row:
|
|
1115
1277
|
in_flight_global = int(row[0])
|
|
@@ -1144,16 +1306,12 @@ class Worker:
|
|
|
1144
1306
|
# Soft cap mode: count only RUNNING
|
|
1145
1307
|
if hard_cap_mode:
|
|
1146
1308
|
resq = await s.execute(
|
|
1147
|
-
|
|
1148
|
-
"SELECT COUNT(*) FROM horsies_tasks WHERE status IN ('RUNNING', 'CLAIMED') AND queue_name = :q"
|
|
1149
|
-
),
|
|
1309
|
+
COUNT_QUEUE_IN_FLIGHT_HARD_SQL,
|
|
1150
1310
|
{'q': qname},
|
|
1151
1311
|
)
|
|
1152
1312
|
else:
|
|
1153
1313
|
resq = await s.execute(
|
|
1154
|
-
|
|
1155
|
-
"SELECT COUNT(*) FROM horsies_tasks WHERE status = 'RUNNING' AND queue_name = :q"
|
|
1156
|
-
),
|
|
1314
|
+
COUNT_QUEUE_IN_FLIGHT_SOFT_SQL,
|
|
1157
1315
|
{'q': qname},
|
|
1158
1316
|
)
|
|
1159
1317
|
row = resq.fetchone()
|
|
@@ -1222,14 +1380,7 @@ class Worker:
|
|
|
1222
1380
|
# Find which tasks belong to PAUSED workflows
|
|
1223
1381
|
async with self.sf() as s:
|
|
1224
1382
|
res = await s.execute(
|
|
1225
|
-
|
|
1226
|
-
SELECT t.id
|
|
1227
|
-
FROM horsies_tasks t
|
|
1228
|
-
JOIN horsies_workflow_tasks wt ON wt.task_id = t.id
|
|
1229
|
-
JOIN horsies_workflows w ON w.id = wt.workflow_id
|
|
1230
|
-
WHERE t.id = ANY(:ids)
|
|
1231
|
-
AND w.status = 'PAUSED'
|
|
1232
|
-
"""),
|
|
1383
|
+
GET_PAUSED_WORKFLOW_TASK_IDS_SQL,
|
|
1233
1384
|
{'ids': task_ids},
|
|
1234
1385
|
)
|
|
1235
1386
|
paused_task_ids = {row[0] for row in res.fetchall()}
|
|
@@ -1237,25 +1388,13 @@ class Worker:
|
|
|
1237
1388
|
if paused_task_ids:
|
|
1238
1389
|
# Unclaim these tasks: set back to PENDING so they can be picked up on resume
|
|
1239
1390
|
await s.execute(
|
|
1240
|
-
|
|
1241
|
-
UPDATE horsies_tasks
|
|
1242
|
-
SET status = 'PENDING',
|
|
1243
|
-
claimed = FALSE,
|
|
1244
|
-
claimed_at = NULL,
|
|
1245
|
-
claimed_by_worker_id = NULL,
|
|
1246
|
-
updated_at = NOW()
|
|
1247
|
-
WHERE id = ANY(:ids)
|
|
1248
|
-
"""),
|
|
1391
|
+
UNCLAIM_PAUSED_TASKS_SQL,
|
|
1249
1392
|
{'ids': list(paused_task_ids)},
|
|
1250
1393
|
)
|
|
1251
1394
|
# Also reset workflow_tasks back to READY for consistency
|
|
1252
1395
|
# (they were ENQUEUED, but the task is now unclaimed)
|
|
1253
1396
|
await s.execute(
|
|
1254
|
-
|
|
1255
|
-
UPDATE horsies_workflow_tasks
|
|
1256
|
-
SET status = 'READY', task_id = NULL, started_at = NULL
|
|
1257
|
-
WHERE task_id = ANY(:ids)
|
|
1258
|
-
"""),
|
|
1397
|
+
RESET_PAUSED_WORKFLOW_TASKS_SQL,
|
|
1259
1398
|
{'ids': list(paused_task_ids)},
|
|
1260
1399
|
)
|
|
1261
1400
|
await s.commit()
|
|
@@ -1300,14 +1439,7 @@ class Worker:
|
|
|
1300
1439
|
"""Count only CLAIMED tasks for this worker (not yet RUNNING)."""
|
|
1301
1440
|
async with self.sf() as s:
|
|
1302
1441
|
res = await s.execute(
|
|
1303
|
-
|
|
1304
|
-
"""
|
|
1305
|
-
SELECT COUNT(*)
|
|
1306
|
-
FROM horsies_tasks
|
|
1307
|
-
WHERE claimed_by_worker_id = CAST(:wid AS VARCHAR)
|
|
1308
|
-
AND status = 'CLAIMED'
|
|
1309
|
-
"""
|
|
1310
|
-
),
|
|
1442
|
+
COUNT_CLAIMED_FOR_WORKER_SQL,
|
|
1311
1443
|
{'wid': self.worker_instance_id},
|
|
1312
1444
|
)
|
|
1313
1445
|
row = res.fetchone()
|
|
@@ -1317,14 +1449,7 @@ class Worker:
|
|
|
1317
1449
|
"""Count only RUNNING tasks for this worker (excludes CLAIMED)."""
|
|
1318
1450
|
async with self.sf() as s:
|
|
1319
1451
|
res = await s.execute(
|
|
1320
|
-
|
|
1321
|
-
"""
|
|
1322
|
-
SELECT COUNT(*)
|
|
1323
|
-
FROM horsies_tasks
|
|
1324
|
-
WHERE claimed_by_worker_id = CAST(:wid AS VARCHAR)
|
|
1325
|
-
AND status = 'RUNNING'
|
|
1326
|
-
"""
|
|
1327
|
-
),
|
|
1452
|
+
COUNT_RUNNING_FOR_WORKER_SQL,
|
|
1328
1453
|
{'wid': self.worker_instance_id},
|
|
1329
1454
|
)
|
|
1330
1455
|
row = res.fetchone()
|
|
@@ -1334,14 +1459,7 @@ class Worker:
|
|
|
1334
1459
|
"""Count RUNNING + CLAIMED tasks for this worker (hard cap mode)."""
|
|
1335
1460
|
async with self.sf() as s:
|
|
1336
1461
|
res = await s.execute(
|
|
1337
|
-
|
|
1338
|
-
"""
|
|
1339
|
-
SELECT COUNT(*)
|
|
1340
|
-
FROM horsies_tasks
|
|
1341
|
-
WHERE claimed_by_worker_id = CAST(:wid AS VARCHAR)
|
|
1342
|
-
AND status IN ('RUNNING', 'CLAIMED')
|
|
1343
|
-
"""
|
|
1344
|
-
),
|
|
1462
|
+
COUNT_IN_FLIGHT_FOR_WORKER_SQL,
|
|
1345
1463
|
{'wid': self.worker_instance_id},
|
|
1346
1464
|
)
|
|
1347
1465
|
row = res.fetchone()
|
|
@@ -1351,14 +1469,7 @@ class Worker:
|
|
|
1351
1469
|
"""Count RUNNING tasks in a given queue across the cluster."""
|
|
1352
1470
|
async with self.sf() as s:
|
|
1353
1471
|
res = await s.execute(
|
|
1354
|
-
|
|
1355
|
-
"""
|
|
1356
|
-
SELECT COUNT(*)
|
|
1357
|
-
FROM horsies_tasks
|
|
1358
|
-
WHERE status = 'RUNNING'
|
|
1359
|
-
AND queue_name = :q
|
|
1360
|
-
"""
|
|
1361
|
-
),
|
|
1472
|
+
COUNT_RUNNING_IN_QUEUE_SQL,
|
|
1362
1473
|
{'q': queue_name},
|
|
1363
1474
|
)
|
|
1364
1475
|
row = res.fetchone()
|
|
@@ -1453,14 +1564,7 @@ class Worker:
|
|
|
1453
1564
|
|
|
1454
1565
|
# worker-level failure (rare): mark FAILED with reason
|
|
1455
1566
|
await s.execute(
|
|
1456
|
-
|
|
1457
|
-
UPDATE horsies_tasks
|
|
1458
|
-
SET status='FAILED',
|
|
1459
|
-
failed_at = :now,
|
|
1460
|
-
failed_reason = :reason,
|
|
1461
|
-
updated_at = :now
|
|
1462
|
-
WHERE id = :id
|
|
1463
|
-
"""),
|
|
1567
|
+
MARK_TASK_FAILED_WORKER_SQL,
|
|
1464
1568
|
{
|
|
1465
1569
|
'now': now,
|
|
1466
1570
|
'reason': failed_reason or 'Worker failure',
|
|
@@ -1492,26 +1596,12 @@ class Worker:
|
|
|
1492
1596
|
|
|
1493
1597
|
# Mark as failed if no retry
|
|
1494
1598
|
await s.execute(
|
|
1495
|
-
|
|
1496
|
-
UPDATE horsies_tasks
|
|
1497
|
-
SET status='FAILED',
|
|
1498
|
-
failed_at = :now,
|
|
1499
|
-
result = :result_json, -- or result_json JSONB if you add that column
|
|
1500
|
-
updated_at = :now
|
|
1501
|
-
WHERE id = :id
|
|
1502
|
-
"""),
|
|
1599
|
+
MARK_TASK_FAILED_SQL,
|
|
1503
1600
|
{'now': now, 'result_json': result_json_str, 'id': task_id},
|
|
1504
1601
|
)
|
|
1505
1602
|
else:
|
|
1506
1603
|
await s.execute(
|
|
1507
|
-
|
|
1508
|
-
UPDATE horsies_tasks
|
|
1509
|
-
SET status='COMPLETED',
|
|
1510
|
-
completed_at = :now,
|
|
1511
|
-
result = :result_json, -- or result_json JSONB
|
|
1512
|
-
updated_at = :now
|
|
1513
|
-
WHERE id = :id
|
|
1514
|
-
"""),
|
|
1604
|
+
MARK_TASK_COMPLETED_SQL,
|
|
1515
1605
|
{'now': now, 'result_json': result_json_str, 'id': task_id},
|
|
1516
1606
|
)
|
|
1517
1607
|
|
|
@@ -1523,16 +1613,15 @@ class Worker:
|
|
|
1523
1613
|
# Notify workers globally and on the specific queue to wake claims
|
|
1524
1614
|
# Fetch queue name for this task
|
|
1525
1615
|
resq = await s.execute(
|
|
1526
|
-
|
|
1616
|
+
GET_TASK_QUEUE_NAME_SQL,
|
|
1617
|
+
{'id': task_id},
|
|
1527
1618
|
)
|
|
1528
1619
|
rowq = resq.fetchone()
|
|
1529
1620
|
qname = str(rowq[0]) if rowq and rowq[0] else 'default'
|
|
1530
1621
|
payload = f'capacity:{task_id}'
|
|
1622
|
+
await s.execute(NOTIFY_TASK_NEW_SQL, {'c1': 'task_new', 'p': payload})
|
|
1531
1623
|
await s.execute(
|
|
1532
|
-
|
|
1533
|
-
)
|
|
1534
|
-
await s.execute(
|
|
1535
|
-
text('SELECT pg_notify(:c2, :p)'),
|
|
1624
|
+
NOTIFY_TASK_QUEUE_SQL,
|
|
1536
1625
|
{'c2': f'task_queue_{qname}', 'p': payload},
|
|
1537
1626
|
)
|
|
1538
1627
|
except Exception:
|
|
@@ -1558,7 +1647,7 @@ class Worker:
|
|
|
1558
1647
|
|
|
1559
1648
|
# Quick check: is this task linked to a workflow?
|
|
1560
1649
|
check = await session.execute(
|
|
1561
|
-
|
|
1650
|
+
CHECK_WORKFLOW_TASK_EXISTS_SQL,
|
|
1562
1651
|
{'tid': task_id},
|
|
1563
1652
|
)
|
|
1564
1653
|
|
|
@@ -1573,9 +1662,7 @@ class Worker:
|
|
|
1573
1662
|
"""Check if a task should be retried based on its configuration and current retry count."""
|
|
1574
1663
|
async with self.sf() as s:
|
|
1575
1664
|
result = await s.execute(
|
|
1576
|
-
|
|
1577
|
-
'SELECT retry_count, max_retries, task_options FROM horsies_tasks WHERE id = :id'
|
|
1578
|
-
),
|
|
1665
|
+
GET_TASK_RETRY_INFO_SQL,
|
|
1579
1666
|
{'id': task_id},
|
|
1580
1667
|
)
|
|
1581
1668
|
row = result.fetchone()
|
|
@@ -1632,7 +1719,7 @@ class Worker:
|
|
|
1632
1719
|
"""Schedule a task for retry by updating its status and next retry time."""
|
|
1633
1720
|
# Get current retry configuration
|
|
1634
1721
|
result = await session.execute(
|
|
1635
|
-
|
|
1722
|
+
GET_TASK_RETRY_CONFIG_SQL,
|
|
1636
1723
|
{'id': task_id},
|
|
1637
1724
|
)
|
|
1638
1725
|
row = result.fetchone()
|
|
@@ -1660,15 +1747,7 @@ class Worker:
|
|
|
1660
1747
|
|
|
1661
1748
|
# Update task for retry
|
|
1662
1749
|
await session.execute(
|
|
1663
|
-
|
|
1664
|
-
UPDATE horsies_tasks
|
|
1665
|
-
SET status = 'PENDING',
|
|
1666
|
-
retry_count = :retry_count,
|
|
1667
|
-
next_retry_at = :next_retry_at,
|
|
1668
|
-
sent_at = :next_retry_at,
|
|
1669
|
-
updated_at = now()
|
|
1670
|
-
WHERE id = :id
|
|
1671
|
-
"""),
|
|
1750
|
+
SCHEDULE_TASK_RETRY_SQL,
|
|
1672
1751
|
{'id': task_id, 'retry_count': retry_count, 'next_retry_at': next_retry_at},
|
|
1673
1752
|
)
|
|
1674
1753
|
|
|
@@ -1726,7 +1805,7 @@ class Worker:
|
|
|
1726
1805
|
# Send notification to trigger retry processing
|
|
1727
1806
|
async with self.sf() as session:
|
|
1728
1807
|
await session.execute(
|
|
1729
|
-
|
|
1808
|
+
NOTIFY_DELAYED_SQL,
|
|
1730
1809
|
{'channel': channel, 'payload': payload},
|
|
1731
1810
|
)
|
|
1732
1811
|
await session.commit()
|
|
@@ -1741,7 +1820,7 @@ class Worker:
|
|
|
1741
1820
|
"""Fetch the queue_name for a given task id."""
|
|
1742
1821
|
async with self.sf() as session:
|
|
1743
1822
|
res = await session.execute(
|
|
1744
|
-
|
|
1823
|
+
GET_TASK_QUEUE_NAME_SQL,
|
|
1745
1824
|
{'id': task_id},
|
|
1746
1825
|
)
|
|
1747
1826
|
row = res.fetchone()
|
|
@@ -1761,14 +1840,7 @@ class Worker:
|
|
|
1761
1840
|
try:
|
|
1762
1841
|
async with self.sf() as s:
|
|
1763
1842
|
await s.execute(
|
|
1764
|
-
|
|
1765
|
-
"""
|
|
1766
|
-
INSERT INTO horsies_heartbeats (task_id, sender_id, role, sent_at, hostname, pid)
|
|
1767
|
-
SELECT id, CAST(:wid AS VARCHAR), 'claimer', NOW(), :host, :pid
|
|
1768
|
-
FROM horsies_tasks
|
|
1769
|
-
WHERE status = 'CLAIMED' AND claimed_by_worker_id = CAST(:wid AS VARCHAR)
|
|
1770
|
-
"""
|
|
1771
|
-
),
|
|
1843
|
+
INSERT_CLAIMER_HEARTBEAT_SQL,
|
|
1772
1844
|
{
|
|
1773
1845
|
'wid': self.worker_instance_id,
|
|
1774
1846
|
'host': socket.gethostname(),
|
|
@@ -1810,21 +1882,7 @@ class Worker:
|
|
|
1810
1882
|
|
|
1811
1883
|
async with self.sf() as s:
|
|
1812
1884
|
await s.execute(
|
|
1813
|
-
|
|
1814
|
-
INSERT INTO horsies_worker_states (
|
|
1815
|
-
worker_id, snapshot_at, hostname, pid,
|
|
1816
|
-
processes, max_claim_batch, max_claim_per_worker,
|
|
1817
|
-
cluster_wide_cap, queues, queue_priorities, queue_max_concurrency,
|
|
1818
|
-
recovery_config, tasks_running, tasks_claimed,
|
|
1819
|
-
memory_usage_mb, memory_percent, cpu_percent,
|
|
1820
|
-
worker_started_at
|
|
1821
|
-
)
|
|
1822
|
-
VALUES (
|
|
1823
|
-
:wid, NOW(), :host, :pid, :procs, :mcb, :mcpw, :cwc,
|
|
1824
|
-
:queues, :qp, :qmc, :recovery, :running, :claimed,
|
|
1825
|
-
:mem_mb, :mem_pct, :cpu_pct, :started
|
|
1826
|
-
)
|
|
1827
|
-
"""),
|
|
1885
|
+
INSERT_WORKER_STATE_SQL,
|
|
1828
1886
|
{
|
|
1829
1887
|
'wid': self.worker_instance_id,
|
|
1830
1888
|
'host': socket.gethostname(),
|