dbos 1.2.0a2__py3-none-any.whl → 1.2.0a5__py3-none-any.whl

This diff compares the contents of two publicly released versions of this package as they appear in their respective public registries, and is provided for informational purposes only.
dbos/_client.py CHANGED
@@ -109,6 +109,7 @@ class DBOSClient:
             },
             sys_db_name=system_database,
         )
+        self._sys_db.check_connection()
         self._app_db = ApplicationDatabase(
             database_url=database_url,
             engine_kwargs={
@@ -231,7 +232,7 @@ class DBOSClient:
             "workflow_deadline_epoch_ms": None,
         }
         with self._sys_db.engine.begin() as conn:
-            self._sys_db.insert_workflow_status(
+            self._sys_db._insert_workflow_status(
                 status, conn, max_recovery_attempts=None
             )
         self._sys_db.send(status["workflow_uuid"], 0, destination_id, message, topic)
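
Note: with the new check_connection() call, constructing a DBOSClient now fails fast when the system database is unreachable, instead of deferring the error to the first operation. A minimal sketch of the caller-visible behavior (the connection string and the exact constructor usage are illustrative assumptions, not part of this diff):

from dbos._client import DBOSClient

try:
    # check_connection() runs a trivial query during construction,
    # so a bad database_url raises here rather than on first use.
    client = DBOSClient("postgresql://postgres:wrong-password@localhost:5432/app")
except Exception as e:
    # The connection error is logged by DBOS and then re-raised.
    print(f"Could not reach the DBOS system database: {e}")
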
dbos/_core.py CHANGED
@@ -20,8 +20,10 @@ from typing import (
     cast,
 )
 
+import psycopg
+
 from dbos._outcome import Immediate, NoResult, Outcome, Pending
-from dbos._utils import GlobalParams
+from dbos._utils import GlobalParams, retriable_postgres_exception
 
 from ._app_db import ApplicationDatabase, TransactionResultInternal
 
@@ -931,12 +933,18 @@ def decorate_transaction(
                         )
                         break
                     except DBAPIError as dbapi_error:
-                        if dbapi_error.orig.sqlstate == "40001":  # type: ignore
+                        driver_error = cast(
+                            Optional[psycopg.OperationalError], dbapi_error.orig
+                        )
+                        if retriable_postgres_exception(dbapi_error) or (
+                            driver_error is not None
+                            and driver_error.sqlstate == "40001"
+                        ):
                             # Retry on serialization failure
                             span = ctx.get_current_span()
                             if span:
                                 span.add_event(
-                                    "Transaction Serialization Failure",
+                                    "Transaction Failure",
                                     {"retry_wait_seconds": retry_wait_seconds},
                                 )
                             time.sleep(retry_wait_seconds)
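
Note: the transaction retry condition is broadened here. Previously only SQLSTATE 40001 (serialization failure) triggered a retry; now any error classified as retriable by retriable_postgres_exception does too. An illustrative sketch of the predicate, extracted from the diff above for readability (the function name is mine):

from typing import Optional, cast

import psycopg
from sqlalchemy.exc import DBAPIError

from dbos._utils import retriable_postgres_exception


def should_retry_transaction(dbapi_error: DBAPIError) -> bool:
    # Retry on transient connection problems or on serialization failures (40001).
    driver_error = cast(Optional[psycopg.OperationalError], dbapi_error.orig)
    return retriable_postgres_exception(dbapi_error) or (
        driver_error is not None and driver_error.sqlstate == "40001"
    )
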
dbos/_dbos.py CHANGED
@@ -90,7 +90,6 @@ from ._context import (
 from ._dbos_config import (
     ConfigFile,
     DBOSConfig,
-    check_config_consistency,
     overwrite_config,
     process_config,
     set_env_vars,
@@ -324,7 +323,6 @@ class DBOS:
         unvalidated_config = translate_dbos_config_to_config_file(config)
         if os.environ.get("DBOS__CLOUD") == "true":
            unvalidated_config = overwrite_config(unvalidated_config)
-            check_config_consistency(name=unvalidated_config["name"])
 
         if unvalidated_config is not None:
             self._config: ConfigFile = process_config(data=unvalidated_config)
dbos/_dbos_config.py CHANGED
@@ -407,6 +407,7 @@ def configure_db_engine_parameters(
         "pool_timeout": 30,
         "max_overflow": 0,
         "pool_size": 20,
+        "pool_pre_ping": True,
     }
     # If user-provided kwargs are present, use them instead
     user_kwargs = data.get("db_engine_kwargs")
@@ -529,26 +530,3 @@ def overwrite_config(provided_config: ConfigFile) -> ConfigFile:
         del provided_config["env"]
 
     return provided_config
-
-
-def check_config_consistency(
-    *,
-    name: str,
-    config_file_path: str = DBOS_CONFIG_PATH,
-) -> None:
-    # First load the config file and check whether it is present
-    try:
-        config = load_config(config_file_path, silent=True, run_process_config=False)
-    except FileNotFoundError:
-        dbos_logger.debug(
-            f"No configuration file {config_file_path} found. Skipping consistency check with provided config."
-        )
-        return
-    except Exception as e:
-        raise e
-
-    # Check the name
-    if name != config["name"]:
-        raise DBOSInitializationError(
-            f"Provided app name '{name}' does not match the app name '{config['name']}' in {config_file_path}."
-        )
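
Note: the new pool_pre_ping: True default makes SQLAlchemy test each pooled connection with a lightweight ping on checkout and transparently replace stale ones, which complements the connection-retry work elsewhere in this release. A sketch of the equivalent engine configuration (the URL is a placeholder):

from sqlalchemy import create_engine

engine = create_engine(
    "postgresql+psycopg://postgres:dbos@localhost:5432/app",  # placeholder URL
    pool_timeout=30,
    max_overflow=0,
    pool_size=20,
    pool_pre_ping=True,  # ping connections on checkout; recycle dead ones
)
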
dbos/_queue.py CHANGED
@@ -5,6 +5,7 @@ from typing import TYPE_CHECKING, Any, Callable, Coroutine, Optional, TypedDict
 from psycopg import errors
 from sqlalchemy.exc import OperationalError
 
+from dbos._logger import dbos_logger
 from dbos._utils import GlobalParams
 
 from ._core import P, R, execute_workflow_by_id, start_workflow, start_workflow_async
@@ -56,6 +57,8 @@ class Queue:
         from ._dbos import _get_or_create_dbos_registry
 
         registry = _get_or_create_dbos_registry()
+        if self.name in registry.queue_info_map:
+            dbos_logger.warning(f"Queue {name} has already been declared")
         registry.queue_info_map[self.name] = self
 
     def enqueue(
@@ -95,12 +98,8 @@ def queue_thread(stop_event: threading.Event, dbos: "DBOS") -> None:
             if not isinstance(
                 e.orig, (errors.SerializationFailure, errors.LockNotAvailable)
             ):
-                dbos.logger.warning(
-                    f"Exception encountered in queue thread: {traceback.format_exc()}"
-                )
-        except Exception:
+                dbos.logger.warning(f"Exception encountered in queue thread: {e}")
+        except Exception as e:
             if not stop_event.is_set():
                 # Only print the error if the thread is not stopping
-                dbos.logger.warning(
-                    f"Exception encountered in queue thread: {traceback.format_exc()}"
-                )
+                dbos.logger.warning(f"Exception encountered in queue thread: {e}")
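
Note: with the new registry check, declaring two queues with the same name now logs a warning instead of silently overwriting the earlier declaration (the last declaration still wins). A minimal sketch, assuming the public Queue API:

from dbos import Queue

queue_a = Queue("tasks")
queue_b = Queue("tasks")  # logs: Queue tasks has already been declared
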
dbos/_sys_db.py CHANGED
@@ -1,7 +1,9 @@
 import datetime
+import functools
 import json
 import logging
 import os
+import random
 import re
 import threading
 import time
@@ -17,6 +19,7 @@ from typing import (
     Sequence,
     TypedDict,
     TypeVar,
+    cast,
 )
 
 import psycopg
@@ -27,7 +30,7 @@ from alembic.config import Config
 from sqlalchemy.exc import DBAPIError
 from sqlalchemy.sql import func
 
-from dbos._utils import INTERNAL_QUEUE_NAME
+from dbos._utils import INTERNAL_QUEUE_NAME, retriable_postgres_exception
 
 from . import _serialization
 from ._context import get_local_dbos_context
@@ -268,6 +271,51 @@ class ThreadSafeConditionDict:
             dbos_logger.warning(f"Key {key} not found in condition dictionary.")
 
 
+F = TypeVar("F", bound=Callable[..., Any])
+
+
+def db_retry(
+    initial_backoff: float = 1.0, max_backoff: float = 60.0
+) -> Callable[[F], F]:
+    """
+    If a workflow encounters a database connection issue while performing an operation,
+    block the workflow and retry the operation until it reconnects and succeeds.
+
+    In other words, if DBOS loses its database connection, everything pauses until the connection is recovered,
+    trading off availability for correctness.
+    """
+
+    def decorator(func: F) -> F:
+        @functools.wraps(func)
+        def wrapper(*args: Any, **kwargs: Any) -> Any:
+            retries: int = 0
+            backoff: float = initial_backoff
+            while True:
+                try:
+                    return func(*args, **kwargs)
+                except DBAPIError as e:
+
+                    # Determine if this is a retriable exception
+                    if not retriable_postgres_exception(e):
+                        raise
+
+                    retries += 1
+                    # Calculate backoff with jitter
+                    actual_backoff: float = backoff * (0.5 + random.random())
+                    dbos_logger.warning(
+                        f"Database connection failed: {str(e)}. "
+                        f"Retrying in {actual_backoff:.2f}s (attempt {retries})"
+                    )
+                    # Sleep with backoff
+                    time.sleep(actual_backoff)
+                    # Increase backoff for next attempt (exponential)
+                    backoff = min(backoff * 2, max_backoff)
+
+        return cast(F, wrapper)
+
+    return decorator
+
+
 class SystemDatabase:
 
     def __init__(
@@ -365,7 +413,7 @@ class SystemDatabase:
             self.notification_conn.close()
         self.engine.dispose()
 
-    def insert_workflow_status(
+    def _insert_workflow_status(
         self,
         status: WorkflowStatusInternal,
         conn: sa.Connection,
@@ -474,53 +522,46 @@
 
         return wf_status, workflow_deadline_epoch_ms
 
+    @db_retry()
     def update_workflow_status(
         self,
         status: WorkflowStatusInternal,
-        *,
-        conn: Optional[sa.Connection] = None,
     ) -> None:
         if self._debug_mode:
             raise Exception("called update_workflow_status in debug mode")
         wf_status: WorkflowStatuses = status["status"]
-
-        cmd = (
-            pg.insert(SystemSchema.workflow_status)
-            .values(
-                workflow_uuid=status["workflow_uuid"],
-                status=status["status"],
-                name=status["name"],
-                class_name=status["class_name"],
-                config_name=status["config_name"],
-                output=status["output"],
-                error=status["error"],
-                executor_id=status["executor_id"],
-                application_version=status["app_version"],
-                application_id=status["app_id"],
-                authenticated_user=status["authenticated_user"],
-                authenticated_roles=status["authenticated_roles"],
-                assumed_role=status["assumed_role"],
-                queue_name=status["queue_name"],
-                recovery_attempts=(
-                    1 if wf_status != WorkflowStatusString.ENQUEUED.value else 0
-                ),
-            )
-            .on_conflict_do_update(
-                index_elements=["workflow_uuid"],
-                set_=dict(
-                    status=status["status"],
-                    output=status["output"],
-                    error=status["error"],
-                    updated_at=func.extract("epoch", func.now()) * 1000,
-                ),
-            )
-        )
-
-        if conn is not None:
-            conn.execute(cmd)
-        else:
-            with self.engine.begin() as c:
-                c.execute(cmd)
+        with self.engine.begin() as c:
+            c.execute(
+                pg.insert(SystemSchema.workflow_status)
+                .values(
+                    workflow_uuid=status["workflow_uuid"],
+                    status=status["status"],
+                    name=status["name"],
+                    class_name=status["class_name"],
+                    config_name=status["config_name"],
+                    output=status["output"],
+                    error=status["error"],
+                    executor_id=status["executor_id"],
+                    application_version=status["app_version"],
+                    application_id=status["app_id"],
+                    authenticated_user=status["authenticated_user"],
+                    authenticated_roles=status["authenticated_roles"],
+                    assumed_role=status["assumed_role"],
+                    queue_name=status["queue_name"],
+                    recovery_attempts=(
+                        1 if wf_status != WorkflowStatusString.ENQUEUED.value else 0
+                    ),
+                )
+                .on_conflict_do_update(
+                    index_elements=["workflow_uuid"],
+                    set_=dict(
+                        status=status["status"],
+                        output=status["output"],
+                        error=status["error"],
+                        updated_at=func.extract("epoch", func.now()) * 1000,
+                    ),
+                )
            )
 
     def cancel_workflow(
         self,
@@ -686,6 +727,7 @@ class SystemDatabase:
             )
         return forked_workflow_id
 
+    @db_retry()
     def get_workflow_status(
         self, workflow_uuid: str
     ) -> Optional[WorkflowStatusInternal]:
@@ -735,6 +777,7 @@
             }
             return status
 
+    @db_retry()
     def await_workflow_result(self, workflow_id: str) -> Any:
         while True:
             with self.engine.begin() as c:
@@ -761,7 +804,7 @@
                 pass  # CB: I guess we're assuming the WF will show up eventually.
             time.sleep(1)
 
-    def update_workflow_inputs(
+    def _update_workflow_inputs(
         self, workflow_uuid: str, inputs: str, conn: sa.Connection
     ) -> None:
         if self._debug_mode:
@@ -791,6 +834,7 @@
 
         return
 
+    @db_retry()
     def get_workflow_inputs(
         self, workflow_uuid: str
     ) -> Optional[_serialization.WorkflowInputs]:
@@ -1084,8 +1128,8 @@
             for row in rows
         ]
 
-    def record_operation_result(
-        self, result: OperationResultInternal, conn: Optional[sa.Connection] = None
+    def _record_operation_result_txn(
+        self, result: OperationResultInternal, conn: sa.Connection
     ) -> None:
         if self._debug_mode:
             raise Exception("called record_operation_result in debug mode")
@@ -1100,16 +1144,18 @@
             error=error,
         )
         try:
-            if conn is not None:
-                conn.execute(sql)
-            else:
-                with self.engine.begin() as c:
-                    c.execute(sql)
+            conn.execute(sql)
         except DBAPIError as dbapi_error:
             if dbapi_error.orig.sqlstate == "23505":  # type: ignore
                 raise DBOSWorkflowConflictIDError(result["workflow_uuid"])
             raise
 
+    @db_retry()
+    def record_operation_result(self, result: OperationResultInternal) -> None:
+        with self.engine.begin() as c:
+            self._record_operation_result_txn(result, c)
+
+    @db_retry()
     def record_get_result(
         self, result_workflow_id: str, output: Optional[str], error: Optional[str]
     ) -> None:
@@ -1135,6 +1181,7 @@
         with self.engine.begin() as c:
             c.execute(sql)
 
+    @db_retry()
     def record_child_workflow(
         self,
         parentUUID: str,
@@ -1159,13 +1206,12 @@
                 raise DBOSWorkflowConflictIDError(parentUUID)
             raise
 
-    def check_operation_execution(
+    def _check_operation_execution_txn(
         self,
         workflow_id: str,
         function_id: int,
         function_name: str,
-        *,
-        conn: Optional[sa.Connection] = None,
+        conn: sa.Connection,
     ) -> Optional[RecordedResult]:
         # First query: Retrieve the workflow status
         workflow_status_sql = sa.select(
@@ -1183,13 +1229,8 @@
         )
 
         # Execute both queries
-        if conn is not None:
-            workflow_status_rows = conn.execute(workflow_status_sql).all()
-            operation_output_rows = conn.execute(operation_output_sql).all()
-        else:
-            with self.engine.begin() as c:
-                workflow_status_rows = c.execute(workflow_status_sql).all()
-                operation_output_rows = c.execute(operation_output_sql).all()
+        workflow_status_rows = conn.execute(workflow_status_sql).all()
+        operation_output_rows = conn.execute(operation_output_sql).all()
 
         # Check if the workflow exists
         assert (
@@ -1231,6 +1272,16 @@
         }
         return result
 
+    @db_retry()
+    def check_operation_execution(
+        self, workflow_id: str, function_id: int, function_name: str
+    ) -> Optional[RecordedResult]:
+        with self.engine.begin() as c:
+            return self._check_operation_execution_txn(
+                workflow_id, function_id, function_name, c
+            )
+
+    @db_retry()
     def check_child_workflow(
         self, workflow_uuid: str, function_id: int
     ) -> Optional[str]:
@@ -1248,6 +1299,7 @@
             return None
         return str(row[0])
 
+    @db_retry()
     def send(
         self,
         workflow_uuid: str,
@@ -1259,7 +1311,7 @@
         function_name = "DBOS.send"
         topic = topic if topic is not None else _dbos_null_topic
         with self.engine.begin() as c:
-            recorded_output = self.check_operation_execution(
+            recorded_output = self._check_operation_execution_txn(
                 workflow_uuid, function_id, function_name, conn=c
             )
             if self._debug_mode and recorded_output is None:
@@ -1297,8 +1349,9 @@
                 "output": None,
                 "error": None,
             }
-            self.record_operation_result(output, conn=c)
+            self._record_operation_result_txn(output, conn=c)
 
+    @db_retry()
     def recv(
         self,
         workflow_uuid: str,
@@ -1391,7 +1444,7 @@
             message: Any = None
             if len(rows) > 0:
                 message = _serialization.deserialize(rows[0][0])
-            self.record_operation_result(
+            self._record_operation_result_txn(
                 {
                     "workflow_uuid": workflow_uuid,
                     "function_id": function_id,
@@ -1455,13 +1508,14 @@
                         dbos_logger.error(f"Unknown channel: {channel}")
             except Exception as e:
                 if self._run_background_processes:
-                    dbos_logger.error(f"Notification listener error: {e}")
+                    dbos_logger.warning(f"Notification listener error: {e}")
                     time.sleep(1)
                     # Then the loop will try to reconnect and restart the listener
             finally:
                 if self.notification_conn is not None:
                     self.notification_conn.close()
 
+    @db_retry()
     def sleep(
         self,
         workflow_uuid: str,
@@ -1501,6 +1555,7 @@
             time.sleep(duration)
         return duration
 
+    @db_retry()
     def set_event(
         self,
         workflow_uuid: str,
@@ -1510,7 +1565,7 @@
     ) -> None:
         function_name = "DBOS.setEvent"
         with self.engine.begin() as c:
-            recorded_output = self.check_operation_execution(
+            recorded_output = self._check_operation_execution_txn(
                 workflow_uuid, function_id, function_name, conn=c
             )
             if self._debug_mode and recorded_output is None:
@@ -1542,8 +1597,9 @@
                 "output": None,
                 "error": None,
             }
-            self.record_operation_result(output, conn=c)
+            self._record_operation_result_txn(output, conn=c)
 
+    @db_retry()
     def get_event(
         self,
         target_uuid: str,
@@ -1634,7 +1690,7 @@
             )
         return value
 
-    def enqueue(
+    def _enqueue(
         self,
         workflow_id: str,
         queue_name: str,
@@ -1710,13 +1766,8 @@
             if num_recent_queries >= queue.limiter["limit"]:
                 return []
 
-            # Dequeue functions eligible for this worker and ordered by the time at which they were enqueued.
-            # If there is a global or local concurrency limit N, select only the N oldest enqueued
-            # functions, else select all of them.
-
-            # First lets figure out how many tasks are eligible for dequeue.
-            # This means figuring out how many unstarted tasks are within the local and global concurrency limits
-            running_tasks_query = (
+            # Count how many workflows on this queue are currently PENDING both locally and globally.
+            pending_tasks_query = (
                 sa.select(
                     SystemSchema.workflow_status.c.executor_id,
                     sa.func.count().label("task_count"),
@@ -1730,41 +1781,37 @@
                 )
                 .where(SystemSchema.workflow_queue.c.queue_name == queue.name)
                 .where(
-                    SystemSchema.workflow_queue.c.started_at_epoch_ms.isnot(
-                        None
-                    )  # Task is started
-                )
-                .where(
-                    SystemSchema.workflow_queue.c.completed_at_epoch_ms.is_(
-                        None
-                    )  # Task is not completed.
+                    SystemSchema.workflow_status.c.status
+                    == WorkflowStatusString.PENDING.value
                 )
                 .group_by(SystemSchema.workflow_status.c.executor_id)
             )
-            running_tasks_result = c.execute(running_tasks_query).fetchall()
-            running_tasks_result_dict = {row[0]: row[1] for row in running_tasks_result}
-            running_tasks_for_this_worker = running_tasks_result_dict.get(
-                executor_id, 0
-            )  # Get count for current executor
+            pending_workflows = c.execute(pending_tasks_query).fetchall()
+            pending_workflows_dict = {row[0]: row[1] for row in pending_workflows}
+            local_pending_workflows = pending_workflows_dict.get(executor_id, 0)
 
+            # Compute max_tasks, the number of workflows that can be dequeued given local and global concurrency limits,
             max_tasks = float("inf")
             if queue.worker_concurrency is not None:
-                max_tasks = max(
-                    0, queue.worker_concurrency - running_tasks_for_this_worker
-                )
+                # Print a warning if the local concurrency limit is violated
+                if local_pending_workflows > queue.worker_concurrency:
+                    dbos_logger.warning(
+                        f"The number of local pending workflows ({local_pending_workflows}) on queue {queue.name} exceeds the local concurrency limit ({queue.worker_concurrency})"
+                    )
+                max_tasks = max(0, queue.worker_concurrency - local_pending_workflows)
+
             if queue.concurrency is not None:
-                total_running_tasks = sum(running_tasks_result_dict.values())
-                # Queue global concurrency limit should always be >= running_tasks_count
-                # This should never happen but a check + warning doesn't hurt
-                if total_running_tasks > queue.concurrency:
+                global_pending_workflows = sum(pending_workflows_dict.values())
+                # Print a warning if the global concurrency limit is violated
+                if global_pending_workflows > queue.concurrency:
                     dbos_logger.warning(
-                        f"Total running tasks ({total_running_tasks}) exceeds the global concurrency limit ({queue.concurrency})"
+                        f"The total number of pending workflows ({global_pending_workflows}) on queue {queue.name} exceeds the global concurrency limit ({queue.concurrency})"
                    )
-                available_tasks = max(0, queue.concurrency - total_running_tasks)
+                available_tasks = max(0, queue.concurrency - global_pending_workflows)
                 max_tasks = min(max_tasks, available_tasks)
 
             # Retrieve the first max_tasks workflows in the queue.
-            # Only retrieve workflows of the appropriate version (or without version set)
+            # Only retrieve workflows of the local version (or without version set)
             query = (
                 sa.select(
                     SystemSchema.workflow_queue.c.workflow_uuid,
@@ -1777,8 +1824,10 @@
                     )
                 )
                 .where(SystemSchema.workflow_queue.c.queue_name == queue.name)
-                .where(SystemSchema.workflow_queue.c.started_at_epoch_ms == None)
-                .where(SystemSchema.workflow_queue.c.completed_at_epoch_ms == None)
+                .where(
+                    SystemSchema.workflow_status.c.status
+                    == WorkflowStatusString.ENQUEUED.value
+                )
                 .where(
                     sa.or_(
                         SystemSchema.workflow_status.c.application_version
@@ -1807,20 +1856,16 @@
             ret_ids: list[str] = []
 
             for id in dequeued_ids:
-                # If we have a limiter, stop starting functions when the number
-                # of functions started this period exceeds the limit.
+                # If we have a limiter, stop dequeueing workflows when the number
+                # of workflows started this period exceeds the limit.
                 if queue.limiter is not None:
                     if len(ret_ids) + num_recent_queries >= queue.limiter["limit"]:
                         break
 
-                # To start a function, first set its status to PENDING and update its executor ID
-                res = c.execute(
+                # To start a workflow, first set its status to PENDING and update its executor ID
+                c.execute(
                     SystemSchema.workflow_status.update()
                     .where(SystemSchema.workflow_status.c.workflow_uuid == id)
-                    .where(
-                        SystemSchema.workflow_status.c.status
-                        == WorkflowStatusString.ENQUEUED.value
-                    )
                     .values(
                         status=WorkflowStatusString.PENDING.value,
                         application_version=app_version,
@@ -1843,16 +1888,15 @@
                         ),
                     )
                 )
-                if res.rowcount > 0:
-                    # Then give it a start time and assign the executor ID
-                    c.execute(
-                        SystemSchema.workflow_queue.update()
-                        .where(SystemSchema.workflow_queue.c.workflow_uuid == id)
-                        .values(started_at_epoch_ms=start_time_ms)
-                    )
-                    ret_ids.append(id)
+                # Then give it a start time
+                c.execute(
+                    SystemSchema.workflow_queue.update()
+                    .where(SystemSchema.workflow_queue.c.workflow_uuid == id)
+                    .values(started_at_epoch_ms=start_time_ms)
+                )
+                ret_ids.append(id)
 
-            # If we have a limiter, garbage-collect all completed functions started
+            # If we have a limiter, garbage-collect all completed workflows started
             # before the period. If there's no limiter, there's no need--they were
             # deleted on completion.
             if queue.limiter is not None:
@@ -1869,6 +1913,7 @@
             # Return the IDs of all functions we started
             return ret_ids
 
+    @db_retry()
     def remove_from_queue(self, workflow_id: str, queue: "Queue") -> None:
         if self._debug_mode:
             raise Exception("called remove_from_queue in debug mode")
@@ -1957,6 +2002,7 @@
             )
             return result
 
+    @db_retry()
     def init_workflow(
         self,
         status: WorkflowStatusInternal,
@@ -1969,17 +2015,17 @@
         Synchronously record the status and inputs for workflows in a single transaction
         """
         with self.engine.begin() as conn:
-            wf_status, workflow_deadline_epoch_ms = self.insert_workflow_status(
+            wf_status, workflow_deadline_epoch_ms = self._insert_workflow_status(
                 status, conn, max_recovery_attempts=max_recovery_attempts
            )
             # TODO: Modify the inputs if they were changed by `update_workflow_inputs`
-            self.update_workflow_inputs(status["workflow_uuid"], inputs, conn)
+            self._update_workflow_inputs(status["workflow_uuid"], inputs, conn)
 
             if (
                 status["queue_name"] is not None
                 and wf_status == WorkflowStatusString.ENQUEUED.value
             ):
-                self.enqueue(
+                self._enqueue(
                     status["workflow_uuid"],
                     status["queue_name"],
                     conn,
@@ -1987,6 +2033,14 @@
             )
         return wf_status, workflow_deadline_epoch_ms
 
+    def check_connection(self) -> None:
+        try:
+            with self.engine.begin() as conn:
+                conn.execute(sa.text("SELECT 1")).fetchall()
+        except Exception as e:
+            dbos_logger.error(f"Error connecting to the DBOS system database: {e}")
+            raise
+
 
 def reset_system_database(postgres_db_url: sa.URL, sysdb_name: str) -> None:
     try:
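
Note: for a feel of db_retry()'s schedule, the base delay doubles from 1s toward the 60s cap, and each actual sleep is jittered uniformly within [0.5x, 1.5x] of the base. A quick back-of-the-envelope reproducing that arithmetic (without the database calls):

base = 1.0
for attempt in range(1, 9):
    # attempt 1 sleeps 0.5-1.5s, attempt 2 sleeps 1-3s, and so on
    print(f"attempt {attempt}: base {base:5.1f}s, sleep in [{0.5 * base:.1f}s, {1.5 * base:.1f}s]")
    base = min(base * 2, 60.0)  # exponential growth, capped at 60s
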
dbos/_utils.py CHANGED
@@ -1,6 +1,9 @@
 import importlib.metadata
 import os
 
+import psycopg
+from sqlalchemy.exc import DBAPIError
+
 INTERNAL_QUEUE_NAME = "_dbos_internal_queue"
 
 request_id_header = "x-request-id"
@@ -15,3 +18,33 @@ class GlobalParams:
     except importlib.metadata.PackageNotFoundError:
         # If package is not installed or during development
         dbos_version = "unknown"
+
+
+def retriable_postgres_exception(e: DBAPIError) -> bool:
+    if e.connection_invalidated:
+        return True
+    if isinstance(e.orig, psycopg.OperationalError):
+        driver_error: psycopg.OperationalError = e.orig
+        pgcode = driver_error.sqlstate or ""
+        # Failure to establish connection
+        if "connection failed" in str(driver_error):
+            return True
+        # Error within database transaction
+        elif "server closed the connection unexpectedly" in str(driver_error):
+            return True
+        # Connection timeout
+        if isinstance(driver_error, psycopg.errors.ConnectionTimeout):
+            return True
+        # Insufficient resources
+        elif pgcode.startswith("53"):
+            return True
+        # Connection exception
+        elif pgcode.startswith("08"):
+            return True
+        # Operator intervention
+        elif pgcode.startswith("57"):
+            return True
+        else:
+            return False
+    else:
+        return False
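
Note: a hypothetical check of the classifier, wrapping a psycopg ConnectionTimeout the way SQLAlchemy surfaces driver errors, shows it is treated as retriable (constructing DBAPIError directly like this is for illustration only):

import psycopg
from sqlalchemy.exc import DBAPIError

from dbos._utils import retriable_postgres_exception

driver_error = psycopg.errors.ConnectionTimeout("connection timed out")
wrapped = DBAPIError("SELECT 1", None, driver_error)
assert retriable_postgres_exception(wrapped)  # ConnectionTimeout is retriable
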
dbos-1.2.0a5.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: dbos
-Version: 1.2.0a2
+Version: 1.2.0a5
 Summary: Ultra-lightweight durable execution in Python
 Author-Email: "DBOS, Inc." <contact@dbos.dev>
 License: MIT
dbos-1.2.0a5.dist-info/RECORD CHANGED
@@ -1,20 +1,20 @@
-dbos-1.2.0a2.dist-info/METADATA,sha256=at-2zS4N-BoXxmKlOeZt7HRA5shIG9pu3pb98t8VFNs,13267
-dbos-1.2.0a2.dist-info/WHEEL,sha256=tSfRZzRHthuv7vxpI4aehrdN9scLjk-dCJkPLzkHxGg,90
-dbos-1.2.0a2.dist-info/entry_points.txt,sha256=_QOQ3tVfEjtjBlr1jS4sHqHya9lI2aIEIWkz8dqYp14,58
-dbos-1.2.0a2.dist-info/licenses/LICENSE,sha256=VGZit_a5-kdw9WT6fY5jxAWVwGQzgLFyPWrcVVUhVNU,1067
+dbos-1.2.0a5.dist-info/METADATA,sha256=PeZezLDhF3k-2FR5-9qCP0_BVWuEVSvo41QucKSXD8o,13267
+dbos-1.2.0a5.dist-info/WHEEL,sha256=tSfRZzRHthuv7vxpI4aehrdN9scLjk-dCJkPLzkHxGg,90
+dbos-1.2.0a5.dist-info/entry_points.txt,sha256=_QOQ3tVfEjtjBlr1jS4sHqHya9lI2aIEIWkz8dqYp14,58
+dbos-1.2.0a5.dist-info/licenses/LICENSE,sha256=VGZit_a5-kdw9WT6fY5jxAWVwGQzgLFyPWrcVVUhVNU,1067
 dbos/__init__.py,sha256=NssPCubaBxdiKarOWa-wViz1hdJSkmBGcpLX_gQ4NeA,891
 dbos/__main__.py,sha256=G7Exn-MhGrVJVDbgNlpzhfh8WMX_72t3_oJaFT9Lmt8,653
 dbos/_admin_server.py,sha256=TWXi4drrzKFpKkUmEJpJkQBZxAtOalnhtYicEn2nDK0,10618
 dbos/_app_db.py,sha256=0PKqpxJ3EbIaak3Wl0lNl3hXvhBfz4EEHaCw1bUOvIM,9937
 dbos/_classproperty.py,sha256=f0X-_BySzn3yFDRKB2JpCbLYQ9tLwt1XftfshvY7CBs,626
-dbos/_client.py,sha256=-nK2GjS9D0qnD2DkRDs7gKxNECwYlsvW6hFCjADlnv0,14186
+dbos/_client.py,sha256=mGDuQRcSdkyEHf1s0rJuqHQiWbqIBt85qijNJSYmBik,14227
 dbos/_conductor/conductor.py,sha256=o0IaZjwnZ2TOyHeP2H4iSX6UnXLXQ4uODvWAKD9hHMs,21703
 dbos/_conductor/protocol.py,sha256=wgOFZxmS81bv0WCB9dAyg0s6QzldpzVKQDoSPeaX0Ws,6967
 dbos/_context.py,sha256=5ajoWAmToAfzzmMLylnJZoL4Ny9rBwZWuG05sXadMIA,24798
-dbos/_core.py,sha256=7ukQH_KClBaMFy0sVTSR5tWylW-RqI9qaReBY-LDKrk,48316
+dbos/_core.py,sha256=m2i9lsHjNKTi8BQyiSOUBrAVH5OvMoBswNZPRpMVIC0,48662
 dbos/_croniter.py,sha256=XHAyUyibs_59sJQfSNWkP7rqQY6_XrlfuuCxk4jYqek,47559
-dbos/_dbos.py,sha256=MuMYbtqUyk2uihCH8aMVDeHmn_P8X8-udqeNT1RLesY,47365
-dbos/_dbos_config.py,sha256=IufNrIC-M2xSNTXyT_KXlEdfB3j03pPLv_nE0fEq4_U,20955
+dbos/_dbos.py,sha256=1EhH7r6v2vwW3Z74nK6_Zw8InE1jSXedEsztz0I4ggA,47269
+dbos/_dbos_config.py,sha256=JYtEbhjcCxLUhktMgqIEBz7i5nk1Ryg0vqSJHXqdGOo,20264
 dbos/_debug.py,sha256=MNlQVZ6TscGCRQeEEL0VE8Uignvr6dPeDDDefS3xgIE,1823
 dbos/_docker_pg_helper.py,sha256=tLJXWqZ4S-ExcaPnxg_i6cVxL6ZxrYlZjaGsklY-s2I,6115
 dbos/_error.py,sha256=q0OQJZTbR8FFHV9hEpAGpz9oWBT5L509zUhmyff7FJw,8500
@@ -38,7 +38,7 @@ dbos/_migrations/versions/d76646551a6c_workflow_queue.py,sha256=G942nophZ2uC2vc4
 dbos/_migrations/versions/eab0cc1d9a14_job_queue.py,sha256=uvhFOtqbBreCePhAxZfIT0qCAI7BiZTou9wt6QnbY7c,1412
 dbos/_migrations/versions/f4b9b32ba814_functionname_childid_op_outputs.py,sha256=m90Lc5YH0ZISSq1MyxND6oq3RZrZKrIqEsZtwJ1jWxA,1049
 dbos/_outcome.py,sha256=EXxBg4jXCVJsByDQ1VOCIedmbeq_03S6d-p1vqQrLFU,6810
-dbos/_queue.py,sha256=aKCGahWBGJOLOv5PCOOId96Va3YQ4ICuHWXy-eQXohE,3526
+dbos/_queue.py,sha256=oDQcydDwYM68U5KQKN6iZiSC-4LXye6KFmSJ7ohG048,3558
 dbos/_recovery.py,sha256=jVMexjfCCNopzyn8gVQzJCmGJaP9G3C1EFaoCQ_Nh7g,2564
 dbos/_registrations.py,sha256=CZt1ElqDjCT7hz6iyT-1av76Yu-iuwu_c9lozO87wvM,7303
 dbos/_roles.py,sha256=iOsgmIAf1XVzxs3gYWdGRe1B880YfOw5fpU7Jwx8_A8,2271
@@ -47,7 +47,7 @@ dbos/_schemas/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 dbos/_schemas/application_database.py,sha256=SypAS9l9EsaBHFn9FR8jmnqt01M74d9AF1AMa4m2hhI,1040
 dbos/_schemas/system_database.py,sha256=3Z0L72bOgHnusK1hBaETWU9RfiLBP0QnS-fdu41i0yY,5835
 dbos/_serialization.py,sha256=bWuwhXSQcGmiazvhJHA5gwhrRWxtmFmcCFQSDJnqqkU,3666
-dbos/_sys_db.py,sha256=IMmRbeIcrsOFJfVcBhMkDWiA3_SvxeKbOGipFiplHPM,83735
+dbos/_sys_db.py,sha256=T02hZbe-4tpsK4hGVlatft06ybu86SJ4w6-anaf55KQ,85528
 dbos/_templates/dbos-db-starter/README.md,sha256=GhxhBj42wjTt1fWEtwNriHbJuKb66Vzu89G4pxNHw2g,930
 dbos/_templates/dbos-db-starter/__package/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 dbos/_templates/dbos-db-starter/__package/main.py.dbos,sha256=aQnBPSSQpkB8ERfhf7gB7P9tsU6OPKhZscfeh0yiaD8,2702
@@ -59,7 +59,7 @@ dbos/_templates/dbos-db-starter/migrations/script.py.mako,sha256=MEqL-2qATlST9TA
 dbos/_templates/dbos-db-starter/migrations/versions/2024_07_31_180642_init.py,sha256=MpS7LGaJS0CpvsjhfDkp9EJqvMvVCjRPfUp4c0aE2ys,941
 dbos/_templates/dbos-db-starter/start_postgres_docker.py,sha256=lQVLlYO5YkhGPEgPqwGc7Y8uDKse9HsWv5fynJEFJHM,1681
 dbos/_tracer.py,sha256=yN6GRDKu_1p-EqtQLNarMocPfga2ZuqpzStzzSPYhzo,2732
-dbos/_utils.py,sha256=UbpMYRBSyvJqdXeWAnfSw8xXM1R1mfnyl1oTunhEjJM,513
+dbos/_utils.py,sha256=uywq1QrjMwy17btjxW4bES49povlQwYwYbvKwMT6C2U,1575
 dbos/_workflow_commands.py,sha256=UCpHWvCEXjVZtf5FNanFvtJpgUJDSI1EFBqQP0x_2A0,3346
 dbos/cli/_github_init.py,sha256=Y_bDF9gfO2jB1id4FV5h1oIxEJRWyqVjhb7bNEa5nQ0,3224
 dbos/cli/_template_init.py,sha256=7JBcpMqP1r2mfCnvWatu33z8ctEGHJarlZYKgB83cXE,2972
@@ -67,4 +67,4 @@ dbos/cli/cli.py,sha256=HinoCGrAUTiSeq7AAoCFfhdiE0uDw7vLMuDMN1_YTLI,20705
 dbos/dbos-config.schema.json,sha256=CjaspeYmOkx6Ip_pcxtmfXJTn_YGdSx_0pcPBF7KZmo,6060
 dbos/py.typed,sha256=QfzXT1Ktfk3Rj84akygc7_42z0lRpCq0Ilh8OXI6Zas,44
 version/__init__.py,sha256=L4sNxecRuqdtSFdpUGX3TtBi9KL3k7YsZVIvv-fv9-A,1678
-dbos-1.2.0a2.dist-info/RECORD,,
+dbos-1.2.0a5.dist-info/RECORD,,