dbos 1.1.0a3 → 1.2.0 (py3-none-any.whl)

dbos/_sys_db.py CHANGED
@@ -1,7 +1,9 @@
 import datetime
+import functools
 import json
 import logging
 import os
+import random
 import re
 import threading
 import time
@@ -17,6 +19,7 @@ from typing import (
     Sequence,
     TypedDict,
     TypeVar,
+    cast,
 )

 import psycopg
@@ -27,11 +30,12 @@ from alembic.config import Config
 from sqlalchemy.exc import DBAPIError
 from sqlalchemy.sql import func

-from dbos._utils import INTERNAL_QUEUE_NAME
+from dbos._utils import INTERNAL_QUEUE_NAME, retriable_postgres_exception

 from . import _serialization
 from ._context import get_local_dbos_context
 from ._error import (
+    DBOSAwaitedWorkflowCancelledError,
     DBOSConflictingWorkflowError,
     DBOSDeadLetterQueueError,
     DBOSNonExistentWorkflowError,
@@ -96,6 +100,10 @@ class WorkflowStatus:
     executor_id: Optional[str]
     # The application version on which this workflow was started
     app_version: Optional[str]
+    # The start-to-close timeout of the workflow in ms
+    workflow_timeout_ms: Optional[int]
+    # The deadline of a workflow, computed by adding its timeout to its start time.
+    workflow_deadline_epoch_ms: Optional[int]

     # INTERNAL FIELDS

@@ -263,6 +271,51 @@ class ThreadSafeConditionDict:
                 dbos_logger.warning(f"Key {key} not found in condition dictionary.")


+F = TypeVar("F", bound=Callable[..., Any])
+
+
+def db_retry(
+    initial_backoff: float = 1.0, max_backoff: float = 60.0
+) -> Callable[[F], F]:
+    """
+    If a workflow encounters a database connection issue while performing an operation,
+    block the workflow and retry the operation until it reconnects and succeeds.
+
+    In other words, if DBOS loses its database connection, everything pauses until the connection is recovered,
+    trading off availability for correctness.
+    """
+
+    def decorator(func: F) -> F:
+        @functools.wraps(func)
+        def wrapper(*args: Any, **kwargs: Any) -> Any:
+            retries: int = 0
+            backoff: float = initial_backoff
+            while True:
+                try:
+                    return func(*args, **kwargs)
+                except DBAPIError as e:
+
+                    # Determine if this is a retriable exception
+                    if not retriable_postgres_exception(e):
+                        raise
+
+                    retries += 1
+                    # Calculate backoff with jitter
+                    actual_backoff: float = backoff * (0.5 + random.random())
+                    dbos_logger.warning(
+                        f"Database connection failed: {str(e)}. "
+                        f"Retrying in {actual_backoff:.2f}s (attempt {retries})"
+                    )
+                    # Sleep with backoff
+                    time.sleep(actual_backoff)
+                    # Increase backoff for next attempt (exponential)
+                    backoff = min(backoff * 2, max_backoff)
+
+        return cast(F, wrapper)
+
+    return decorator
+
+
 class SystemDatabase:

     def __init__(
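A quick way to see what `db_retry` does between attempts: the sleep for attempt n is drawn from a jittered exponential schedule. The snippet below reproduces that arithmetic standalone; the helper name is ours, not part of the package.

# Illustrative only: db_retry's backoff arithmetic, outside the decorator.
import random

def backoff_schedule(attempts: int, initial: float = 1.0, cap: float = 60.0) -> list[float]:
    backoff, sleeps = initial, []
    for _ in range(attempts):
        sleeps.append(backoff * (0.5 + random.random()))  # jitter in [0.5x, 1.5x)
        backoff = min(backoff * 2, cap)  # double each attempt, capped at max_backoff
    return sleeps

print(backoff_schedule(5))  # e.g. [0.71, 1.89, 2.35, 7.02, 11.48]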
@@ -360,7 +413,7 @@ class SystemDatabase:
             self.notification_conn.close()
         self.engine.dispose()

-    def insert_workflow_status(
+    def _insert_workflow_status(
         self,
         status: WorkflowStatusInternal,
         conn: sa.Connection,
@@ -372,6 +425,15 @@ class SystemDatabase:
         wf_status: WorkflowStatuses = status["status"]
         workflow_deadline_epoch_ms: Optional[int] = status["workflow_deadline_epoch_ms"]

+        # Values to update when a row already exists for this workflow
+        update_values: dict[str, Any] = {
+            "recovery_attempts": SystemSchema.workflow_status.c.recovery_attempts + 1,
+            "updated_at": func.extract("epoch", func.now()) * 1000,
+        }
+        # Don't update an existing executor ID when enqueueing a workflow.
+        if wf_status != WorkflowStatusString.ENQUEUED.value:
+            update_values["executor_id"] = status["executor_id"]
+
         cmd = (
             pg.insert(SystemSchema.workflow_status)
             .values(
@@ -397,13 +459,7 @@ class SystemDatabase:
             )
             .on_conflict_do_update(
                 index_elements=["workflow_uuid"],
-                set_=dict(
-                    executor_id=status["executor_id"],
-                    recovery_attempts=(
-                        SystemSchema.workflow_status.c.recovery_attempts + 1
-                    ),
-                    updated_at=func.extract("epoch", func.now()) * 1000,
-                ),
+                set_=update_values,
             )
         )

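The effect of routing the conflict clause through `update_values`: re-inserting a row for an existing workflow bumps `recovery_attempts` and `updated_at`, and overwrites `executor_id` only when the workflow is not merely being enqueued. A compilable sketch of the same upsert shape, using a toy table rather than the real `SystemSchema`:

# Illustrative only: the ON CONFLICT DO UPDATE shape, compiled against a toy table.
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql as pg

meta = sa.MetaData()
toy = sa.Table(
    "workflow_status",
    meta,
    sa.Column("workflow_uuid", sa.Text, primary_key=True),
    sa.Column("recovery_attempts", sa.Integer),
)
stmt = pg.insert(toy).values(workflow_uuid="wf-1", recovery_attempts=1)
stmt = stmt.on_conflict_do_update(
    index_elements=["workflow_uuid"],  # conflict target: the primary key
    set_={"recovery_attempts": toy.c.recovery_attempts + 1},
)
print(stmt.compile(dialect=pg.dialect()))
# INSERT INTO workflow_status (...) VALUES (...)
# ON CONFLICT (workflow_uuid) DO UPDATE SET recovery_attempts = workflow_status.recovery_attempts + 1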
@@ -469,53 +525,46 @@ class SystemDatabase:

         return wf_status, workflow_deadline_epoch_ms

+    @db_retry()
     def update_workflow_status(
         self,
         status: WorkflowStatusInternal,
-        *,
-        conn: Optional[sa.Connection] = None,
     ) -> None:
         if self._debug_mode:
             raise Exception("called update_workflow_status in debug mode")
         wf_status: WorkflowStatuses = status["status"]
-
-        cmd = (
-            pg.insert(SystemSchema.workflow_status)
-            .values(
-                workflow_uuid=status["workflow_uuid"],
-                status=status["status"],
-                name=status["name"],
-                class_name=status["class_name"],
-                config_name=status["config_name"],
-                output=status["output"],
-                error=status["error"],
-                executor_id=status["executor_id"],
-                application_version=status["app_version"],
-                application_id=status["app_id"],
-                authenticated_user=status["authenticated_user"],
-                authenticated_roles=status["authenticated_roles"],
-                assumed_role=status["assumed_role"],
-                queue_name=status["queue_name"],
-                recovery_attempts=(
-                    1 if wf_status != WorkflowStatusString.ENQUEUED.value else 0
-                ),
-            )
-            .on_conflict_do_update(
-                index_elements=["workflow_uuid"],
-                set_=dict(
-                    status=status["status"],
-                    output=status["output"],
-                    error=status["error"],
-                    updated_at=func.extract("epoch", func.now()) * 1000,
-                ),
-            )
-        )
-
-        if conn is not None:
-            conn.execute(cmd)
-        else:
-            with self.engine.begin() as c:
-                c.execute(cmd)
+        with self.engine.begin() as c:
+            c.execute(
+                pg.insert(SystemSchema.workflow_status)
+                .values(
+                    workflow_uuid=status["workflow_uuid"],
+                    status=status["status"],
+                    name=status["name"],
+                    class_name=status["class_name"],
+                    config_name=status["config_name"],
+                    output=status["output"],
+                    error=status["error"],
+                    executor_id=status["executor_id"],
+                    application_version=status["app_version"],
+                    application_id=status["app_id"],
+                    authenticated_user=status["authenticated_user"],
+                    authenticated_roles=status["authenticated_roles"],
+                    assumed_role=status["assumed_role"],
+                    queue_name=status["queue_name"],
+                    recovery_attempts=(
+                        1 if wf_status != WorkflowStatusString.ENQUEUED.value else 0
+                    ),
+                )
+                .on_conflict_do_update(
+                    index_elements=["workflow_uuid"],
+                    set_=dict(
+                        status=status["status"],
+                        output=status["output"],
+                        error=status["error"],
+                        updated_at=func.extract("epoch", func.now()) * 1000,
+                    ),
+                )
+            )

     def cancel_workflow(
         self,
@@ -596,18 +645,6 @@ class SystemDatabase:
                 )
             )

-    def get_max_function_id(self, workflow_uuid: str) -> Optional[int]:
-        with self.engine.begin() as conn:
-            max_function_id_row = conn.execute(
-                sa.select(
-                    sa.func.max(SystemSchema.operation_outputs.c.function_id)
-                ).where(SystemSchema.operation_outputs.c.workflow_uuid == workflow_uuid)
-            ).fetchone()
-
-            max_function_id = max_function_id_row[0] if max_function_id_row else None
-
-            return max_function_id
-
     def fork_workflow(
         self,
         original_workflow_id: str,
@@ -693,6 +730,7 @@ class SystemDatabase:
             )
         return forked_workflow_id

+    @db_retry()
     def get_workflow_status(
         self, workflow_uuid: str
     ) -> Optional[WorkflowStatusInternal]:
@@ -742,6 +780,7 @@ class SystemDatabase:
         }
         return status

+    @db_retry()
     def await_workflow_result(self, workflow_id: str) -> Any:
         while True:
             with self.engine.begin() as c:
@@ -761,14 +800,14 @@ class SystemDatabase:
                         error = row[2]
                         raise _serialization.deserialize_exception(error)
                     elif status == WorkflowStatusString.CANCELLED.value:
-                        # Raise a normal exception here, not the cancellation exception
+                        # Raise AwaitedWorkflowCancelledError here, not the cancellation exception
                         # because the awaiting workflow is not being cancelled.
-                        raise Exception(f"Awaited workflow {workflow_id} was cancelled")
+                        raise DBOSAwaitedWorkflowCancelledError(workflow_id)
                 else:
                     pass  # CB: I guess we're assuming the WF will show up eventually.
             time.sleep(1)

-    def update_workflow_inputs(
+    def _update_workflow_inputs(
         self, workflow_uuid: str, inputs: str, conn: sa.Connection
     ) -> None:
         if self._debug_mode:
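For callers, the change above means awaiting a cancelled workflow now surfaces a typed error instead of a bare `Exception`. A hedged usage sketch; `DBOS.retrieve_workflow` and `get_result` are the public handle API in dbos-transact-py, but the error's import path is an internal module and may move:

# Illustrative only: reacting to cancellation of an awaited workflow.
from dbos import DBOS
from dbos._error import DBOSAwaitedWorkflowCancelledError  # internal path, subject to change

handle = DBOS.retrieve_workflow("some-workflow-id")
try:
    result = handle.get_result()
except DBOSAwaitedWorkflowCancelledError:
    result = None  # the awaited workflow was cancelled; the awaiting workflow keeps running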
@@ -798,6 +837,7 @@ class SystemDatabase:

         return

+    @db_retry()
     def get_workflow_inputs(
         self, workflow_uuid: str
     ) -> Optional[_serialization.WorkflowInputs]:
@@ -837,6 +877,8 @@ class SystemDatabase:
             SystemSchema.workflow_inputs.c.inputs,
             SystemSchema.workflow_status.c.output,
             SystemSchema.workflow_status.c.error,
+            SystemSchema.workflow_status.c.workflow_deadline_epoch_ms,
+            SystemSchema.workflow_status.c.workflow_timeout_ms,
         ).join(
             SystemSchema.workflow_inputs,
             SystemSchema.workflow_status.c.workflow_uuid
@@ -918,6 +960,8 @@ class SystemDatabase:
             info.input = inputs
             info.output = output
             info.error = exception
+            info.workflow_deadline_epoch_ms = row[18]
+            info.workflow_timeout_ms = row[19]

             infos.append(info)
         return infos
@@ -947,6 +991,8 @@ class SystemDatabase:
             SystemSchema.workflow_inputs.c.inputs,
             SystemSchema.workflow_status.c.output,
             SystemSchema.workflow_status.c.error,
+            SystemSchema.workflow_status.c.workflow_deadline_epoch_ms,
+            SystemSchema.workflow_status.c.workflow_timeout_ms,
         ).select_from(
             SystemSchema.workflow_queue.join(
                 SystemSchema.workflow_status,
@@ -1024,6 +1070,8 @@ class SystemDatabase:
             info.input = inputs
             info.output = output
             info.error = exception
+            info.workflow_deadline_epoch_ms = row[18]
+            info.workflow_timeout_ms = row[19]

             infos.append(info)

@@ -1083,8 +1131,8 @@ class SystemDatabase:
             for row in rows
         ]

-    def record_operation_result(
-        self, result: OperationResultInternal, conn: Optional[sa.Connection] = None
+    def _record_operation_result_txn(
+        self, result: OperationResultInternal, conn: sa.Connection
     ) -> None:
         if self._debug_mode:
             raise Exception("called record_operation_result in debug mode")
@@ -1099,16 +1147,18 @@ class SystemDatabase:
             error=error,
         )
         try:
-            if conn is not None:
-                conn.execute(sql)
-            else:
-                with self.engine.begin() as c:
-                    c.execute(sql)
+            conn.execute(sql)
         except DBAPIError as dbapi_error:
             if dbapi_error.orig.sqlstate == "23505":  # type: ignore
                 raise DBOSWorkflowConflictIDError(result["workflow_uuid"])
             raise

+    @db_retry()
+    def record_operation_result(self, result: OperationResultInternal) -> None:
+        with self.engine.begin() as c:
+            self._record_operation_result_txn(result, c)
+
+    @db_retry()
     def record_get_result(
         self, result_workflow_id: str, output: Optional[str], error: Optional[str]
     ) -> None:
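This hunk shows the refactoring pattern the diff applies repeatedly: a private `_..._txn` variant composes into a caller's open connection, while the public method owns its transaction and can safely carry `@db_retry()`, since a retry re-runs the whole transaction. A runnable toy version of the split (SQLite stands in for Postgres; names are ours):

# Illustrative only: the public/_txn split, reduced to a toy.
import sqlalchemy as sa

engine = sa.create_engine("sqlite:///:memory:")
with engine.begin() as c:
    c.execute(sa.text("CREATE TABLE outputs (id INTEGER PRIMARY KEY)"))

def _record_txn(conn: sa.Connection, i: int) -> None:
    conn.execute(sa.text("INSERT INTO outputs VALUES (:i)"), {"i": i})  # no commit here

def record(i: int) -> None:  # would carry @db_retry() in _sys_db.py
    with engine.begin() as c:  # owns (and on retry, re-runs) the transaction
        _record_txn(c, i)

record(1)  # standalone call: opens its own transaction
with engine.begin() as c:
    _record_txn(c, 2)  # composed call: joins an existing transaction, e.g. inside send()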
@@ -1134,6 +1184,7 @@ class SystemDatabase:
         with self.engine.begin() as c:
             c.execute(sql)

+    @db_retry()
     def record_child_workflow(
         self,
         parentUUID: str,
@@ -1158,13 +1209,12 @@ class SystemDatabase:
                 raise DBOSWorkflowConflictIDError(parentUUID)
             raise

-    def check_operation_execution(
+    def _check_operation_execution_txn(
         self,
         workflow_id: str,
         function_id: int,
         function_name: str,
-        *,
-        conn: Optional[sa.Connection] = None,
+        conn: sa.Connection,
     ) -> Optional[RecordedResult]:
         # First query: Retrieve the workflow status
         workflow_status_sql = sa.select(
@@ -1182,13 +1232,8 @@ class SystemDatabase:
            )
        )
         # Execute both queries
-        if conn is not None:
-            workflow_status_rows = conn.execute(workflow_status_sql).all()
-            operation_output_rows = conn.execute(operation_output_sql).all()
-        else:
-            with self.engine.begin() as c:
-                workflow_status_rows = c.execute(workflow_status_sql).all()
-                operation_output_rows = c.execute(operation_output_sql).all()
+        workflow_status_rows = conn.execute(workflow_status_sql).all()
+        operation_output_rows = conn.execute(operation_output_sql).all()

         # Check if the workflow exists
         assert (
@@ -1230,6 +1275,16 @@ class SystemDatabase:
         }
         return result

+    @db_retry()
+    def check_operation_execution(
+        self, workflow_id: str, function_id: int, function_name: str
+    ) -> Optional[RecordedResult]:
+        with self.engine.begin() as c:
+            return self._check_operation_execution_txn(
+                workflow_id, function_id, function_name, c
+            )
+
+    @db_retry()
     def check_child_workflow(
         self, workflow_uuid: str, function_id: int
     ) -> Optional[str]:
@@ -1247,6 +1302,7 @@ class SystemDatabase:
             return None
         return str(row[0])

+    @db_retry()
     def send(
         self,
         workflow_uuid: str,
@@ -1258,7 +1314,7 @@ class SystemDatabase:
         function_name = "DBOS.send"
         topic = topic if topic is not None else _dbos_null_topic
         with self.engine.begin() as c:
-            recorded_output = self.check_operation_execution(
+            recorded_output = self._check_operation_execution_txn(
                 workflow_uuid, function_id, function_name, conn=c
             )
             if self._debug_mode and recorded_output is None:
@@ -1296,8 +1352,9 @@ class SystemDatabase:
                 "output": None,
                 "error": None,
             }
-            self.record_operation_result(output, conn=c)
+            self._record_operation_result_txn(output, conn=c)

+    @db_retry()
     def recv(
         self,
         workflow_uuid: str,
@@ -1390,7 +1447,7 @@ class SystemDatabase:
             message: Any = None
             if len(rows) > 0:
                 message = _serialization.deserialize(rows[0][0])
-            self.record_operation_result(
+            self._record_operation_result_txn(
                 {
                     "workflow_uuid": workflow_uuid,
                     "function_id": function_id,
@@ -1454,13 +1511,14 @@ class SystemDatabase:
                            dbos_logger.error(f"Unknown channel: {channel}")
             except Exception as e:
                 if self._run_background_processes:
-                    dbos_logger.error(f"Notification listener error: {e}")
+                    dbos_logger.warning(f"Notification listener error: {e}")
                     time.sleep(1)
                     # Then the loop will try to reconnect and restart the listener
             finally:
                 if self.notification_conn is not None:
                     self.notification_conn.close()

+    @db_retry()
     def sleep(
         self,
         workflow_uuid: str,
@@ -1500,6 +1558,7 @@ class SystemDatabase:
             time.sleep(duration)
         return duration

+    @db_retry()
     def set_event(
         self,
         workflow_uuid: str,
@@ -1509,7 +1568,7 @@ class SystemDatabase:
     ) -> None:
         function_name = "DBOS.setEvent"
         with self.engine.begin() as c:
-            recorded_output = self.check_operation_execution(
+            recorded_output = self._check_operation_execution_txn(
                 workflow_uuid, function_id, function_name, conn=c
             )
             if self._debug_mode and recorded_output is None:
@@ -1541,8 +1600,9 @@ class SystemDatabase:
                 "output": None,
                 "error": None,
             }
-            self.record_operation_result(output, conn=c)
+            self._record_operation_result_txn(output, conn=c)

+    @db_retry()
     def get_event(
         self,
         target_uuid: str,
@@ -1633,7 +1693,7 @@ class SystemDatabase:
             )
         return value

-    def enqueue(
+    def _enqueue(
         self,
         workflow_id: str,
         queue_name: str,
@@ -1709,13 +1769,8 @@ class SystemDatabase:
             if num_recent_queries >= queue.limiter["limit"]:
                 return []

-            # Dequeue functions eligible for this worker and ordered by the time at which they were enqueued.
-            # If there is a global or local concurrency limit N, select only the N oldest enqueued
-            # functions, else select all of them.
-
-            # First lets figure out how many tasks are eligible for dequeue.
-            # This means figuring out how many unstarted tasks are within the local and global concurrency limits
-            running_tasks_query = (
+            # Count how many workflows on this queue are currently PENDING both locally and globally.
+            pending_tasks_query = (
                 sa.select(
                     SystemSchema.workflow_status.c.executor_id,
                     sa.func.count().label("task_count"),
@@ -1729,41 +1784,37 @@ class SystemDatabase:
                 )
                 .where(SystemSchema.workflow_queue.c.queue_name == queue.name)
                 .where(
-                    SystemSchema.workflow_queue.c.started_at_epoch_ms.isnot(
-                        None
-                    )  # Task is started
-                )
-                .where(
-                    SystemSchema.workflow_queue.c.completed_at_epoch_ms.is_(
-                        None
-                    )  # Task is not completed.
+                    SystemSchema.workflow_status.c.status
+                    == WorkflowStatusString.PENDING.value
                 )
                 .group_by(SystemSchema.workflow_status.c.executor_id)
             )
-            running_tasks_result = c.execute(running_tasks_query).fetchall()
-            running_tasks_result_dict = {row[0]: row[1] for row in running_tasks_result}
-            running_tasks_for_this_worker = running_tasks_result_dict.get(
-                executor_id, 0
-            )  # Get count for current executor
+            pending_workflows = c.execute(pending_tasks_query).fetchall()
+            pending_workflows_dict = {row[0]: row[1] for row in pending_workflows}
+            local_pending_workflows = pending_workflows_dict.get(executor_id, 0)

+            # Compute max_tasks, the number of workflows that can be dequeued given local and global concurrency limits.
             max_tasks = float("inf")
             if queue.worker_concurrency is not None:
-                max_tasks = max(
-                    0, queue.worker_concurrency - running_tasks_for_this_worker
-                )
+                # Print a warning if the local concurrency limit is violated
+                if local_pending_workflows > queue.worker_concurrency:
+                    dbos_logger.warning(
+                        f"The number of local pending workflows ({local_pending_workflows}) on queue {queue.name} exceeds the local concurrency limit ({queue.worker_concurrency})"
+                    )
+                max_tasks = max(0, queue.worker_concurrency - local_pending_workflows)
+
             if queue.concurrency is not None:
-                total_running_tasks = sum(running_tasks_result_dict.values())
-                # Queue global concurrency limit should always be >= running_tasks_count
-                # This should never happen but a check + warning doesn't hurt
-                if total_running_tasks > queue.concurrency:
+                global_pending_workflows = sum(pending_workflows_dict.values())
+                # Print a warning if the global concurrency limit is violated
+                if global_pending_workflows > queue.concurrency:
                     dbos_logger.warning(
-                        f"Total running tasks ({total_running_tasks}) exceeds the global concurrency limit ({queue.concurrency})"
+                        f"The total number of pending workflows ({global_pending_workflows}) on queue {queue.name} exceeds the global concurrency limit ({queue.concurrency})"
                     )
-                available_tasks = max(0, queue.concurrency - total_running_tasks)
+                available_tasks = max(0, queue.concurrency - global_pending_workflows)
                 max_tasks = min(max_tasks, available_tasks)

             # Retrieve the first max_tasks workflows in the queue.
-            # Only retrieve workflows of the appropriate version (or without version set)
+            # Only retrieve workflows of the local version (or without version set)
             query = (
                 sa.select(
                     SystemSchema.workflow_queue.c.workflow_uuid,
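The concurrency accounting above reduces to simple arithmetic over the PENDING counts per executor. A standalone paraphrase (function name and argument shapes are ours):

# Illustrative only: max_tasks from local and global concurrency limits.
from typing import Dict, Optional

def compute_max_tasks(
    worker_concurrency: Optional[int],
    concurrency: Optional[int],
    pending_by_executor: Dict[str, int],
    executor_id: str,
) -> float:
    max_tasks = float("inf")  # no limits configured: dequeue everything eligible
    local_pending = pending_by_executor.get(executor_id, 0)
    if worker_concurrency is not None:
        max_tasks = max(0, worker_concurrency - local_pending)
    if concurrency is not None:
        available = max(0, concurrency - sum(pending_by_executor.values()))
        max_tasks = min(max_tasks, available)
    return max_tasks

# Local limit 5 with 3 pending here; global limit 10 with 7 pending overall:
print(compute_max_tasks(5, 10, {"local": 3, "remote": 4}, "local"))  # 2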
@@ -1776,8 +1827,10 @@ class SystemDatabase:
                     )
                 )
                 .where(SystemSchema.workflow_queue.c.queue_name == queue.name)
-                .where(SystemSchema.workflow_queue.c.started_at_epoch_ms == None)
-                .where(SystemSchema.workflow_queue.c.completed_at_epoch_ms == None)
+                .where(
+                    SystemSchema.workflow_status.c.status
+                    == WorkflowStatusString.ENQUEUED.value
+                )
                 .where(
                     sa.or_(
                         SystemSchema.workflow_status.c.application_version
@@ -1806,20 +1859,16 @@ class SystemDatabase:
             ret_ids: list[str] = []

             for id in dequeued_ids:
-                # If we have a limiter, stop starting functions when the number
-                # of functions started this period exceeds the limit.
+                # If we have a limiter, stop dequeueing workflows when the number
+                # of workflows started this period exceeds the limit.
                 if queue.limiter is not None:
                     if len(ret_ids) + num_recent_queries >= queue.limiter["limit"]:
                         break

-                # To start a function, first set its status to PENDING and update its executor ID
-                res = c.execute(
+                # To start a workflow, first set its status to PENDING and update its executor ID
+                c.execute(
                     SystemSchema.workflow_status.update()
                     .where(SystemSchema.workflow_status.c.workflow_uuid == id)
-                    .where(
-                        SystemSchema.workflow_status.c.status
-                        == WorkflowStatusString.ENQUEUED.value
-                    )
                     .values(
                         status=WorkflowStatusString.PENDING.value,
                         application_version=app_version,
@@ -1827,8 +1876,13 @@ class SystemDatabase:
                         # If a timeout is set, set the deadline on dequeue
                         workflow_deadline_epoch_ms=sa.case(
                             (
-                                SystemSchema.workflow_status.c.workflow_timeout_ms.isnot(
-                                    None
+                                sa.and_(
+                                    SystemSchema.workflow_status.c.workflow_timeout_ms.isnot(
+                                        None
+                                    ),
+                                    SystemSchema.workflow_status.c.workflow_deadline_epoch_ms.is_(
+                                        None
+                                    ),
                                 ),
                                 sa.func.extract("epoch", sa.func.now()) * 1000
                                 + SystemSchema.workflow_status.c.workflow_timeout_ms,
@@ -1837,16 +1891,15 @@ class SystemDatabase:
                         ),
                     )
                 )
-                if res.rowcount > 0:
-                    # Then give it a start time and assign the executor ID
-                    c.execute(
-                        SystemSchema.workflow_queue.update()
-                        .where(SystemSchema.workflow_queue.c.workflow_uuid == id)
-                        .values(started_at_epoch_ms=start_time_ms)
-                    )
-                    ret_ids.append(id)
+                # Then give it a start time
+                c.execute(
+                    SystemSchema.workflow_queue.update()
+                    .where(SystemSchema.workflow_queue.c.workflow_uuid == id)
+                    .values(started_at_epoch_ms=start_time_ms)
+                )
+                ret_ids.append(id)

-            # If we have a limiter, garbage-collect all completed functions started
+            # If we have a limiter, garbage-collect all completed workflows started
             # before the period. If there's no limiter, there's no need--they were
             # deleted on completion.
             if queue.limiter is not None:
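The `sa.case` in the dequeue update now arms a deadline only when the workflow has a timeout and no deadline was already recorded; otherwise the stored value is kept (the `else_` branch sits outside these hunks, so treat that as our reading). In plain Python the rule looks like this:

# Illustrative only: a plain-Python paraphrase of the deadline case expression.
import time
from typing import Optional

def deadline_on_dequeue(timeout_ms: Optional[int], existing_ms: Optional[float]) -> Optional[float]:
    if timeout_ms is not None and existing_ms is None:
        return time.time() * 1000 + timeout_ms  # start the clock at dequeue
    return existing_ms  # keep a deadline carried over, e.g. across recovery

print(deadline_on_dequeue(30_000, None) is not None)  # True: fresh deadline armed
print(deadline_on_dequeue(30_000, 1234.0))            # 1234.0: existing deadline preserved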
@@ -1863,6 +1916,7 @@ class SystemDatabase:
             # Return the IDs of all functions we started
             return ret_ids

+    @db_retry()
     def remove_from_queue(self, workflow_id: str, queue: "Queue") -> None:
         if self._debug_mode:
             raise Exception("called remove_from_queue in debug mode")
@@ -1951,6 +2005,7 @@ class SystemDatabase:
             )
         return result

+    @db_retry()
     def init_workflow(
         self,
         status: WorkflowStatusInternal,
@@ -1963,17 +2018,17 @@ class SystemDatabase:
         Synchronously record the status and inputs for workflows in a single transaction
         """
         with self.engine.begin() as conn:
-            wf_status, workflow_deadline_epoch_ms = self.insert_workflow_status(
+            wf_status, workflow_deadline_epoch_ms = self._insert_workflow_status(
                 status, conn, max_recovery_attempts=max_recovery_attempts
             )
             # TODO: Modify the inputs if they were changed by `update_workflow_inputs`
-            self.update_workflow_inputs(status["workflow_uuid"], inputs, conn)
+            self._update_workflow_inputs(status["workflow_uuid"], inputs, conn)

             if (
                 status["queue_name"] is not None
                 and wf_status == WorkflowStatusString.ENQUEUED.value
             ):
-                self.enqueue(
+                self._enqueue(
                     status["workflow_uuid"],
                     status["queue_name"],
                     conn,
@@ -1981,6 +2036,14 @@ class SystemDatabase:
                 )
         return wf_status, workflow_deadline_epoch_ms

+    def check_connection(self) -> None:
+        try:
+            with self.engine.begin() as conn:
+                conn.execute(sa.text("SELECT 1")).fetchall()
+        except Exception as e:
+            dbos_logger.error(f"Error connecting to the DBOS system database: {e}")
+            raise
+

 def reset_system_database(postgres_db_url: sa.URL, sysdb_name: str) -> None:
     try:
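Finally, `check_connection` gives embedders a cheap readiness probe over the same engine. A hedged usage sketch; `sys_db` stands in for a constructed `SystemDatabase`, whose construction this diff does not show:

# Illustrative only: using check_connection() as a readiness probe.
def ready(sys_db: "SystemDatabase") -> bool:
    try:
        sys_db.check_connection()  # runs "SELECT 1"; logs and re-raises on failure
        return True
    except Exception:
        return False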