horsies 0.1.0a3__py3-none-any.whl → 0.1.0a5__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
@@ -202,6 +202,7 @@ def _child_initializer(
202
202
 
203
203
  # Set log level for this child process before any logging
204
204
  from horsies.core.logging import set_default_level
205
+
205
206
  set_default_level(loglevel)
206
207
 
207
208
  # Mark child process to adjust logging behavior during module import
@@ -236,9 +237,7 @@ def _child_initializer(
236
237
  except Exception:
237
238
  pass
238
239
  combined_imports = _dedupe_paths(combined_imports)
239
- _debug_imports_log(
240
- f'[child {os.getpid()}] import_modules={combined_imports}'
241
- )
240
+ _debug_imports_log(f'[child {os.getpid()}] import_modules={combined_imports}')
242
241
  for m in combined_imports:
243
242
  if m.endswith('.py') or os.path.sep in m:
244
243
  m_abs = os.path.abspath(m)
@@ -328,7 +327,12 @@ def _heartbeat_worker(
328
327
 
329
328
  def _is_retryable_db_error(exc: BaseException) -> bool:
330
329
  match exc:
331
- case OperationalError() | InterfaceError() | SerializationFailure() | DeadlockDetected():
330
+ case (
331
+ OperationalError()
332
+ | InterfaceError()
333
+ | SerializationFailure()
334
+ | DeadlockDetected()
335
+ ):
332
336
  return True
333
337
  case _:
334
338
  return False
@@ -807,6 +811,166 @@ class WorkerConfig:
807
811
  loglevel: int = 20 # logging.INFO
808
812
 
809
813
 
814
+ # ---------- Worker SQL constants ----------
815
+
816
+ CLAIM_ADVISORY_LOCK_SQL = text("""
817
+ SELECT pg_advisory_xact_lock(CAST(:key AS BIGINT))
818
+ """)
819
+
820
+ COUNT_GLOBAL_IN_FLIGHT_SQL = text("""
821
+ SELECT COUNT(*) FROM horsies_tasks WHERE status IN ('RUNNING', 'CLAIMED')
822
+ """)
823
+
824
+ COUNT_QUEUE_IN_FLIGHT_HARD_SQL = text("""
825
+ SELECT COUNT(*) FROM horsies_tasks WHERE status IN ('RUNNING', 'CLAIMED') AND queue_name = :q
826
+ """)
827
+
828
+ COUNT_QUEUE_IN_FLIGHT_SOFT_SQL = text("""
829
+ SELECT COUNT(*) FROM horsies_tasks WHERE status = 'RUNNING' AND queue_name = :q
830
+ """)
831
+
832
+ COUNT_CLAIMED_FOR_WORKER_SQL = text("""
833
+ SELECT COUNT(*)
834
+ FROM horsies_tasks
835
+ WHERE claimed_by_worker_id = CAST(:wid AS VARCHAR)
836
+ AND status = 'CLAIMED'
837
+ """)
838
+
839
+ COUNT_RUNNING_FOR_WORKER_SQL = text("""
840
+ SELECT COUNT(*)
841
+ FROM horsies_tasks
842
+ WHERE claimed_by_worker_id = CAST(:wid AS VARCHAR)
843
+ AND status = 'RUNNING'
844
+ """)
845
+
846
+ COUNT_IN_FLIGHT_FOR_WORKER_SQL = text("""
847
+ SELECT COUNT(*)
848
+ FROM horsies_tasks
849
+ WHERE claimed_by_worker_id = CAST(:wid AS VARCHAR)
850
+ AND status IN ('RUNNING', 'CLAIMED')
851
+ """)
852
+
853
+ COUNT_RUNNING_IN_QUEUE_SQL = text("""
854
+ SELECT COUNT(*)
855
+ FROM horsies_tasks
856
+ WHERE status = 'RUNNING'
857
+ AND queue_name = :q
858
+ """)
859
+
860
+ GET_PAUSED_WORKFLOW_TASK_IDS_SQL = text("""
861
+ SELECT t.id
862
+ FROM horsies_tasks t
863
+ JOIN horsies_workflow_tasks wt ON wt.task_id = t.id
864
+ JOIN horsies_workflows w ON w.id = wt.workflow_id
865
+ WHERE t.id = ANY(:ids)
866
+ AND w.status = 'PAUSED'
867
+ """)
868
+
869
+ UNCLAIM_PAUSED_TASKS_SQL = text("""
870
+ UPDATE horsies_tasks
871
+ SET status = 'PENDING',
872
+ claimed = FALSE,
873
+ claimed_at = NULL,
874
+ claimed_by_worker_id = NULL,
875
+ updated_at = NOW()
876
+ WHERE id = ANY(:ids)
877
+ """)
878
+
879
+ RESET_PAUSED_WORKFLOW_TASKS_SQL = text("""
880
+ UPDATE horsies_workflow_tasks
881
+ SET status = 'READY', task_id = NULL, started_at = NULL
882
+ WHERE task_id = ANY(:ids)
883
+ """)
884
+
885
+ MARK_TASK_FAILED_WORKER_SQL = text("""
886
+ UPDATE horsies_tasks
887
+ SET status='FAILED',
888
+ failed_at = :now,
889
+ failed_reason = :reason,
890
+ updated_at = :now
891
+ WHERE id = :id
892
+ """)
893
+
894
+ MARK_TASK_FAILED_SQL = text("""
895
+ UPDATE horsies_tasks
896
+ SET status='FAILED',
897
+ failed_at = :now,
898
+ result = :result_json,
899
+ updated_at = :now
900
+ WHERE id = :id
901
+ """)
902
+
903
+ MARK_TASK_COMPLETED_SQL = text("""
904
+ UPDATE horsies_tasks
905
+ SET status='COMPLETED',
906
+ completed_at = :now,
907
+ result = :result_json,
908
+ updated_at = :now
909
+ WHERE id = :id
910
+ """)
911
+
912
+ GET_TASK_QUEUE_NAME_SQL = text("""
913
+ SELECT queue_name FROM horsies_tasks WHERE id = :id
914
+ """)
915
+
916
+ NOTIFY_TASK_NEW_SQL = text("""
917
+ SELECT pg_notify(:c1, :p)
918
+ """)
919
+
920
+ NOTIFY_TASK_QUEUE_SQL = text("""
921
+ SELECT pg_notify(:c2, :p)
922
+ """)
923
+
924
+ CHECK_WORKFLOW_TASK_EXISTS_SQL = text("""
925
+ SELECT 1 FROM horsies_workflow_tasks WHERE task_id = :tid LIMIT 1
926
+ """)
927
+
928
+ GET_TASK_RETRY_INFO_SQL = text("""
929
+ SELECT retry_count, max_retries, task_options FROM horsies_tasks WHERE id = :id
930
+ """)
931
+
932
+ GET_TASK_RETRY_CONFIG_SQL = text("""
933
+ SELECT retry_count, task_options FROM horsies_tasks WHERE id = :id
934
+ """)
935
+
936
+ SCHEDULE_TASK_RETRY_SQL = text("""
937
+ UPDATE horsies_tasks
938
+ SET status = 'PENDING',
939
+ retry_count = :retry_count,
940
+ next_retry_at = :next_retry_at,
941
+ sent_at = :next_retry_at,
942
+ updated_at = now()
943
+ WHERE id = :id
944
+ """)
945
+
946
+ NOTIFY_DELAYED_SQL = text("""
947
+ SELECT pg_notify(:channel, :payload)
948
+ """)
949
+
950
+ INSERT_CLAIMER_HEARTBEAT_SQL = text("""
951
+ INSERT INTO horsies_heartbeats (task_id, sender_id, role, sent_at, hostname, pid)
952
+ SELECT id, CAST(:wid AS VARCHAR), 'claimer', NOW(), :host, :pid
953
+ FROM horsies_tasks
954
+ WHERE status = 'CLAIMED' AND claimed_by_worker_id = CAST(:wid AS VARCHAR)
955
+ """)
956
+
957
+ INSERT_WORKER_STATE_SQL = text("""
958
+ INSERT INTO horsies_worker_states (
959
+ worker_id, snapshot_at, hostname, pid,
960
+ processes, max_claim_batch, max_claim_per_worker,
961
+ cluster_wide_cap, queues, queue_priorities, queue_max_concurrency,
962
+ recovery_config, tasks_running, tasks_claimed,
963
+ memory_usage_mb, memory_percent, cpu_percent,
964
+ worker_started_at
965
+ )
966
+ VALUES (
967
+ :wid, NOW(), :host, :pid, :procs, :mcb, :mcpw, :cwc,
968
+ :queues, :qp, :qmc, :recovery, :running, :claimed,
969
+ :mem_mb, :mem_pct, :cpu_pct, :started
970
+ )
971
+ """)
972
+
973
+
810
974
  class Worker:
811
975
  """
812
976
  Async master that:
@@ -846,7 +1010,9 @@ class Worker:
846
1010
 
847
1011
  # Create the process pool AFTER successful preload so initializer runs in children only
848
1012
  # Compute the plain psycopg database URL for child processes
849
- child_database_url = self.cfg.dsn.replace('+asyncpg', '').replace('+psycopg', '')
1013
+ child_database_url = self.cfg.dsn.replace('+asyncpg', '').replace(
1014
+ '+psycopg', ''
1015
+ )
850
1016
  self._executor = ProcessPoolExecutor(
851
1017
  max_workers=self.cfg.processes,
852
1018
  initializer=_child_initializer,
@@ -1081,7 +1247,7 @@ class Worker:
1081
1247
  async with self.sf() as s:
1082
1248
  # Take a cluster-wide transaction-scoped advisory lock to serialize claiming
1083
1249
  await s.execute(
1084
- text('SELECT pg_advisory_xact_lock(CAST(:key AS BIGINT))'),
1250
+ CLAIM_ADVISORY_LOCK_SQL,
1085
1251
  {'key': self._advisory_key_global()},
1086
1252
  )
1087
1253
 
@@ -1105,11 +1271,7 @@ class Worker:
1105
1271
  if self.cfg.cluster_wide_cap is not None:
1106
1272
  # Hard cap mode: count RUNNING + CLAIMED globally
1107
1273
  # (Note: prefetch_buffer must be 0 when cluster_wide_cap is set, enforced by config validation)
1108
- res = await s.execute(
1109
- text(
1110
- "SELECT COUNT(*) FROM horsies_tasks WHERE status IN ('RUNNING', 'CLAIMED')"
1111
- )
1112
- )
1274
+ res = await s.execute(COUNT_GLOBAL_IN_FLIGHT_SQL)
1113
1275
  row = res.fetchone()
1114
1276
  if row:
1115
1277
  in_flight_global = int(row[0])
@@ -1144,16 +1306,12 @@ class Worker:
1144
1306
  # Soft cap mode: count only RUNNING
1145
1307
  if hard_cap_mode:
1146
1308
  resq = await s.execute(
1147
- text(
1148
- "SELECT COUNT(*) FROM horsies_tasks WHERE status IN ('RUNNING', 'CLAIMED') AND queue_name = :q"
1149
- ),
1309
+ COUNT_QUEUE_IN_FLIGHT_HARD_SQL,
1150
1310
  {'q': qname},
1151
1311
  )
1152
1312
  else:
1153
1313
  resq = await s.execute(
1154
- text(
1155
- "SELECT COUNT(*) FROM horsies_tasks WHERE status = 'RUNNING' AND queue_name = :q"
1156
- ),
1314
+ COUNT_QUEUE_IN_FLIGHT_SOFT_SQL,
1157
1315
  {'q': qname},
1158
1316
  )
1159
1317
  row = resq.fetchone()
@@ -1222,14 +1380,7 @@ class Worker:
1222
1380
  # Find which tasks belong to PAUSED workflows
1223
1381
  async with self.sf() as s:
1224
1382
  res = await s.execute(
1225
- text("""
1226
- SELECT t.id
1227
- FROM horsies_tasks t
1228
- JOIN horsies_workflow_tasks wt ON wt.task_id = t.id
1229
- JOIN horsies_workflows w ON w.id = wt.workflow_id
1230
- WHERE t.id = ANY(:ids)
1231
- AND w.status = 'PAUSED'
1232
- """),
1383
+ GET_PAUSED_WORKFLOW_TASK_IDS_SQL,
1233
1384
  {'ids': task_ids},
1234
1385
  )
1235
1386
  paused_task_ids = {row[0] for row in res.fetchall()}
@@ -1237,25 +1388,13 @@ class Worker:
1237
1388
  if paused_task_ids:
1238
1389
  # Unclaim these tasks: set back to PENDING so they can be picked up on resume
1239
1390
  await s.execute(
1240
- text("""
1241
- UPDATE horsies_tasks
1242
- SET status = 'PENDING',
1243
- claimed = FALSE,
1244
- claimed_at = NULL,
1245
- claimed_by_worker_id = NULL,
1246
- updated_at = NOW()
1247
- WHERE id = ANY(:ids)
1248
- """),
1391
+ UNCLAIM_PAUSED_TASKS_SQL,
1249
1392
  {'ids': list(paused_task_ids)},
1250
1393
  )
1251
1394
  # Also reset workflow_tasks back to READY for consistency
1252
1395
  # (they were ENQUEUED, but the task is now unclaimed)
1253
1396
  await s.execute(
1254
- text("""
1255
- UPDATE horsies_workflow_tasks
1256
- SET status = 'READY', task_id = NULL, started_at = NULL
1257
- WHERE task_id = ANY(:ids)
1258
- """),
1397
+ RESET_PAUSED_WORKFLOW_TASKS_SQL,
1259
1398
  {'ids': list(paused_task_ids)},
1260
1399
  )
1261
1400
  await s.commit()
@@ -1300,14 +1439,7 @@ class Worker:
1300
1439
  """Count only CLAIMED tasks for this worker (not yet RUNNING)."""
1301
1440
  async with self.sf() as s:
1302
1441
  res = await s.execute(
1303
- text(
1304
- """
1305
- SELECT COUNT(*)
1306
- FROM horsies_tasks
1307
- WHERE claimed_by_worker_id = CAST(:wid AS VARCHAR)
1308
- AND status = 'CLAIMED'
1309
- """
1310
- ),
1442
+ COUNT_CLAIMED_FOR_WORKER_SQL,
1311
1443
  {'wid': self.worker_instance_id},
1312
1444
  )
1313
1445
  row = res.fetchone()
@@ -1317,14 +1449,7 @@ class Worker:
1317
1449
  """Count only RUNNING tasks for this worker (excludes CLAIMED)."""
1318
1450
  async with self.sf() as s:
1319
1451
  res = await s.execute(
1320
- text(
1321
- """
1322
- SELECT COUNT(*)
1323
- FROM horsies_tasks
1324
- WHERE claimed_by_worker_id = CAST(:wid AS VARCHAR)
1325
- AND status = 'RUNNING'
1326
- """
1327
- ),
1452
+ COUNT_RUNNING_FOR_WORKER_SQL,
1328
1453
  {'wid': self.worker_instance_id},
1329
1454
  )
1330
1455
  row = res.fetchone()
@@ -1334,14 +1459,7 @@ class Worker:
1334
1459
  """Count RUNNING + CLAIMED tasks for this worker (hard cap mode)."""
1335
1460
  async with self.sf() as s:
1336
1461
  res = await s.execute(
1337
- text(
1338
- """
1339
- SELECT COUNT(*)
1340
- FROM horsies_tasks
1341
- WHERE claimed_by_worker_id = CAST(:wid AS VARCHAR)
1342
- AND status IN ('RUNNING', 'CLAIMED')
1343
- """
1344
- ),
1462
+ COUNT_IN_FLIGHT_FOR_WORKER_SQL,
1345
1463
  {'wid': self.worker_instance_id},
1346
1464
  )
1347
1465
  row = res.fetchone()
@@ -1351,14 +1469,7 @@ class Worker:
1351
1469
  """Count RUNNING tasks in a given queue across the cluster."""
1352
1470
  async with self.sf() as s:
1353
1471
  res = await s.execute(
1354
- text(
1355
- """
1356
- SELECT COUNT(*)
1357
- FROM horsies_tasks
1358
- WHERE status = 'RUNNING'
1359
- AND queue_name = :q
1360
- """
1361
- ),
1472
+ COUNT_RUNNING_IN_QUEUE_SQL,
1362
1473
  {'q': queue_name},
1363
1474
  )
1364
1475
  row = res.fetchone()
@@ -1453,14 +1564,7 @@ class Worker:
1453
1564
 
1454
1565
  # worker-level failure (rare): mark FAILED with reason
1455
1566
  await s.execute(
1456
- text("""
1457
- UPDATE horsies_tasks
1458
- SET status='FAILED',
1459
- failed_at = :now,
1460
- failed_reason = :reason,
1461
- updated_at = :now
1462
- WHERE id = :id
1463
- """),
1567
+ MARK_TASK_FAILED_WORKER_SQL,
1464
1568
  {
1465
1569
  'now': now,
1466
1570
  'reason': failed_reason or 'Worker failure',
@@ -1492,26 +1596,12 @@ class Worker:
1492
1596
 
1493
1597
  # Mark as failed if no retry
1494
1598
  await s.execute(
1495
- text("""
1496
- UPDATE horsies_tasks
1497
- SET status='FAILED',
1498
- failed_at = :now,
1499
- result = :result_json, -- or result_json JSONB if you add that column
1500
- updated_at = :now
1501
- WHERE id = :id
1502
- """),
1599
+ MARK_TASK_FAILED_SQL,
1503
1600
  {'now': now, 'result_json': result_json_str, 'id': task_id},
1504
1601
  )
1505
1602
  else:
1506
1603
  await s.execute(
1507
- text("""
1508
- UPDATE horsies_tasks
1509
- SET status='COMPLETED',
1510
- completed_at = :now,
1511
- result = :result_json, -- or result_json JSONB
1512
- updated_at = :now
1513
- WHERE id = :id
1514
- """),
1604
+ MARK_TASK_COMPLETED_SQL,
1515
1605
  {'now': now, 'result_json': result_json_str, 'id': task_id},
1516
1606
  )
1517
1607
 
@@ -1523,16 +1613,15 @@ class Worker:
1523
1613
  # Notify workers globally and on the specific queue to wake claims
1524
1614
  # Fetch queue name for this task
1525
1615
  resq = await s.execute(
1526
- text('SELECT queue_name FROM horsies_tasks WHERE id = :id'), {'id': task_id}
1616
+ GET_TASK_QUEUE_NAME_SQL,
1617
+ {'id': task_id},
1527
1618
  )
1528
1619
  rowq = resq.fetchone()
1529
1620
  qname = str(rowq[0]) if rowq and rowq[0] else 'default'
1530
1621
  payload = f'capacity:{task_id}'
1622
+ await s.execute(NOTIFY_TASK_NEW_SQL, {'c1': 'task_new', 'p': payload})
1531
1623
  await s.execute(
1532
- text('SELECT pg_notify(:c1, :p)'), {'c1': 'task_new', 'p': payload}
1533
- )
1534
- await s.execute(
1535
- text('SELECT pg_notify(:c2, :p)'),
1624
+ NOTIFY_TASK_QUEUE_SQL,
1536
1625
  {'c2': f'task_queue_{qname}', 'p': payload},
1537
1626
  )
1538
1627
  except Exception:
@@ -1558,7 +1647,7 @@ class Worker:
1558
1647
 
1559
1648
  # Quick check: is this task linked to a workflow?
1560
1649
  check = await session.execute(
1561
- text('SELECT 1 FROM horsies_workflow_tasks WHERE task_id = :tid LIMIT 1'),
1650
+ CHECK_WORKFLOW_TASK_EXISTS_SQL,
1562
1651
  {'tid': task_id},
1563
1652
  )
1564
1653
 
@@ -1573,9 +1662,7 @@ class Worker:
1573
1662
  """Check if a task should be retried based on its configuration and current retry count."""
1574
1663
  async with self.sf() as s:
1575
1664
  result = await s.execute(
1576
- text(
1577
- 'SELECT retry_count, max_retries, task_options FROM horsies_tasks WHERE id = :id'
1578
- ),
1665
+ GET_TASK_RETRY_INFO_SQL,
1579
1666
  {'id': task_id},
1580
1667
  )
1581
1668
  row = result.fetchone()
@@ -1632,7 +1719,7 @@ class Worker:
1632
1719
  """Schedule a task for retry by updating its status and next retry time."""
1633
1720
  # Get current retry configuration
1634
1721
  result = await session.execute(
1635
- text('SELECT retry_count, task_options FROM horsies_tasks WHERE id = :id'),
1722
+ GET_TASK_RETRY_CONFIG_SQL,
1636
1723
  {'id': task_id},
1637
1724
  )
1638
1725
  row = result.fetchone()
@@ -1660,15 +1747,7 @@ class Worker:
1660
1747
 
1661
1748
  # Update task for retry
1662
1749
  await session.execute(
1663
- text("""
1664
- UPDATE horsies_tasks
1665
- SET status = 'PENDING',
1666
- retry_count = :retry_count,
1667
- next_retry_at = :next_retry_at,
1668
- sent_at = :next_retry_at,
1669
- updated_at = now()
1670
- WHERE id = :id
1671
- """),
1750
+ SCHEDULE_TASK_RETRY_SQL,
1672
1751
  {'id': task_id, 'retry_count': retry_count, 'next_retry_at': next_retry_at},
1673
1752
  )
1674
1753
 
@@ -1726,7 +1805,7 @@ class Worker:
1726
1805
  # Send notification to trigger retry processing
1727
1806
  async with self.sf() as session:
1728
1807
  await session.execute(
1729
- text('SELECT pg_notify(:channel, :payload)'),
1808
+ NOTIFY_DELAYED_SQL,
1730
1809
  {'channel': channel, 'payload': payload},
1731
1810
  )
1732
1811
  await session.commit()
@@ -1741,7 +1820,7 @@ class Worker:
1741
1820
  """Fetch the queue_name for a given task id."""
1742
1821
  async with self.sf() as session:
1743
1822
  res = await session.execute(
1744
- text('SELECT queue_name FROM horsies_tasks WHERE id = :id'),
1823
+ GET_TASK_QUEUE_NAME_SQL,
1745
1824
  {'id': task_id},
1746
1825
  )
1747
1826
  row = res.fetchone()
@@ -1761,14 +1840,7 @@ class Worker:
1761
1840
  try:
1762
1841
  async with self.sf() as s:
1763
1842
  await s.execute(
1764
- text(
1765
- """
1766
- INSERT INTO horsies_heartbeats (task_id, sender_id, role, sent_at, hostname, pid)
1767
- SELECT id, CAST(:wid AS VARCHAR), 'claimer', NOW(), :host, :pid
1768
- FROM horsies_tasks
1769
- WHERE status = 'CLAIMED' AND claimed_by_worker_id = CAST(:wid AS VARCHAR)
1770
- """
1771
- ),
1843
+ INSERT_CLAIMER_HEARTBEAT_SQL,
1772
1844
  {
1773
1845
  'wid': self.worker_instance_id,
1774
1846
  'host': socket.gethostname(),
@@ -1810,21 +1882,7 @@ class Worker:
1810
1882
 
1811
1883
  async with self.sf() as s:
1812
1884
  await s.execute(
1813
- text("""
1814
- INSERT INTO horsies_worker_states (
1815
- worker_id, snapshot_at, hostname, pid,
1816
- processes, max_claim_batch, max_claim_per_worker,
1817
- cluster_wide_cap, queues, queue_priorities, queue_max_concurrency,
1818
- recovery_config, tasks_running, tasks_claimed,
1819
- memory_usage_mb, memory_percent, cpu_percent,
1820
- worker_started_at
1821
- )
1822
- VALUES (
1823
- :wid, NOW(), :host, :pid, :procs, :mcb, :mcpw, :cwc,
1824
- :queues, :qp, :qmc, :recovery, :running, :claimed,
1825
- :mem_mb, :mem_pct, :cpu_pct, :started
1826
- )
1827
- """),
1885
+ INSERT_WORKER_STATE_SQL,
1828
1886
  {
1829
1887
  'wid': self.worker_instance_id,
1830
1888
  'host': socket.gethostname(),