boilermaker-servicebus 1.0.0.dev2__tar.gz → 1.0.0.dev4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/PKG-INFO +4 -2
- {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/boilermaker/app.py +3 -2
- {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/boilermaker/evaluators/__init__.py +1 -2
- {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/boilermaker/evaluators/common.py +3 -5
- {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/boilermaker/evaluators/eval.py +1 -2
- {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/boilermaker/evaluators/task_graph.py +146 -13
- {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/boilermaker/exc.py +16 -0
- {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/boilermaker/service_bus.py +1 -1
- {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/boilermaker/storage/blob_storage.py +36 -34
- boilermaker_servicebus-1.0.0.dev4/boilermaker/task/__init__.py +20 -0
- {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/boilermaker/task/graph.py +265 -60
- {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/boilermaker/task/task.py +2 -1
- boilermaker_servicebus-1.0.0.dev4/boilermaker/task/types.py +18 -0
- {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/boilermaker_servicebus.egg-info/PKG-INFO +4 -2
- {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/boilermaker_servicebus.egg-info/SOURCES.txt +1 -0
- {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/boilermaker_servicebus.egg-info/requires.txt +4 -1
- boilermaker_servicebus-1.0.0.dev4/examples/task_graph_example.py +274 -0
- {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/pyproject.toml +14 -16
- boilermaker_servicebus-1.0.0.dev4/tests/evaluators/test_task_graphs.py +1169 -0
- {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/tests/storage/test_blob_storage.py +146 -5
- {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/tests/task/test_graph.py +789 -134
- {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/tests/test_app.py +6 -0
- boilermaker_servicebus-1.0.0.dev2/boilermaker/task/__init__.py +0 -6
- boilermaker_servicebus-1.0.0.dev2/examples/task_graph_example.py +0 -127
- boilermaker_servicebus-1.0.0.dev2/tests/evaluators/test_task_graphs.py +0 -564
- {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/LICENSE +0 -0
- {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/README.md +0 -0
- {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/boilermaker/__init__.py +0 -0
- {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/boilermaker/config.py +0 -0
- {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/boilermaker/evaluators/results_store.py +0 -0
- {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/boilermaker/evaluators/simple.py +0 -0
- {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/boilermaker/failure.py +0 -0
- {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/boilermaker/retries.py +0 -0
- {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/boilermaker/sample.py +0 -0
- {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/boilermaker/storage/__init__.py +0 -0
- {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/boilermaker/storage/base.py +0 -0
- {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/boilermaker/task/result.py +0 -0
- {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/boilermaker/task/task_id.py +0 -0
- {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/boilermaker/tracing.py +0 -0
- {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/boilermaker_servicebus.egg-info/dependency_links.txt +0 -0
- {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/boilermaker_servicebus.egg-info/top_level.txt +0 -0
- {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/examples/basic.py +0 -0
- {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/examples/callbacks.py +0 -0
- {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/setup.cfg +0 -0
- {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/tests/conftest.py +0 -0
- {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/tests/evaluators/conftest.py +0 -0
- {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/tests/evaluators/test_common.py +0 -0
- {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/tests/evaluators/test_eval.py +0 -0
- {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/tests/evaluators/test_eval_factory.py +0 -0
- {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/tests/evaluators/test_results_store.py +0 -0
- {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/tests/evaluators/test_simple.py +0 -0
- {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/tests/graph_factories.py +0 -0
- {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/tests/task/helpers.py +0 -0
- {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/tests/task/test_graph_cycle_detection.py +0 -0
- {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/tests/task/test_result.py +0 -0
- {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/tests/task/test_task.py +0 -0
- {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/tests/test_config.py +0 -0
- {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/tests/test_retries.py +0 -0
- {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/tests/test_sample.py +0 -0
- {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/tests/test_service_bus.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: boilermaker-servicebus
|
|
3
|
-
Version: 1.0.0.dev2
|
|
3
|
+
Version: 1.0.0.dev4
|
|
4
4
|
Summary: An async python Background task system using Azure Service Bus Queues
|
|
5
5
|
Author-email: Erik Aker <eaker@mulliganfunding.com>
|
|
6
6
|
License: Apache License
|
|
@@ -210,7 +210,7 @@ Project-URL: Issues, https://github.com/MulliganFunding/boilermaker-servicebus/i
|
|
|
210
210
|
Requires-Python: >=3.11
|
|
211
211
|
Description-Content-Type: text/markdown
|
|
212
212
|
License-File: LICENSE
|
|
213
|
-
Requires-Dist: aio-azure-clients-toolbox>=1.0
|
|
213
|
+
Requires-Dist: aio-azure-clients-toolbox>=1.1.0
|
|
214
214
|
Requires-Dist: anyio>=4.11.0
|
|
215
215
|
Requires-Dist: azure-core-tracing-opentelemetry>=1.0.0b12
|
|
216
216
|
Requires-Dist: azure-servicebus>=7.14.2
|
|
@@ -219,6 +219,8 @@ Requires-Dist: opentelemetry-api>=1.34.0
|
|
|
219
219
|
Requires-Dist: pydantic>=2.12.2
|
|
220
220
|
Requires-Dist: pydantic-settings>=2.11.0
|
|
221
221
|
Requires-Dist: uuid-utils>=0.11.1
|
|
222
|
+
Provides-Extra: repl
|
|
223
|
+
Requires-Dist: ipython; extra == "repl"
|
|
222
224
|
Dynamic: license-file
|
|
223
225
|
|
|
224
226
|
# Boilermaker
|
|
@@ -12,7 +12,7 @@ import typing
|
|
|
12
12
|
import weakref
|
|
13
13
|
from functools import wraps
|
|
14
14
|
|
|
15
|
-
from aio_azure_clients_toolbox import AzureServiceBus, ManagedAzureServiceBusSender
|
|
15
|
+
from aio_azure_clients_toolbox import AzureServiceBus, ManagedAzureServiceBusSender
|
|
16
16
|
from anyio import create_task_group, open_signal_receiver
|
|
17
17
|
from anyio.abc import CancelScope
|
|
18
18
|
from azure.servicebus import ServiceBusReceivedMessage
|
|
@@ -163,7 +163,7 @@ class Boilermaker:
|
|
|
163
163
|
raise ValueError(f"Function must be async: {fn_name}")
|
|
164
164
|
|
|
165
165
|
task = Task.default(fn_name, **options)
|
|
166
|
-
self.function_registry[fn_name] = fn
|
|
166
|
+
self.function_registry[fn_name] = typing.cast(TaskHandler, fn) # why must cast here
|
|
167
167
|
self.task_registry[fn_name] = task
|
|
168
168
|
logger.info(f"Registered background function fn={fn_name}")
|
|
169
169
|
return self
|
|
@@ -361,6 +361,7 @@ class Boilermaker:
|
|
|
361
361
|
results: list[int] = await self.service_bus_client.send_message(
|
|
362
362
|
task.model_dump_json(),
|
|
363
363
|
delay=delay,
|
|
364
|
+
unique_msg_id=str(task.task_id),
|
|
364
365
|
)
|
|
365
366
|
if results and len(results) == 1:
|
|
366
367
|
sequence_number = results[0]
|
|
@@ -4,12 +4,11 @@ import typing
|
|
|
4
4
|
from azure.servicebus.aio import ServiceBusReceiver
|
|
5
5
|
|
|
6
6
|
from boilermaker.storage.base import StorageInterface
|
|
7
|
-
from boilermaker.task import Task
|
|
7
|
+
from boilermaker.task import Task, TaskHandler
|
|
8
8
|
|
|
9
9
|
from .common import (
|
|
10
10
|
MessageActions,
|
|
11
11
|
TaskEvaluatorBase,
|
|
12
|
-
TaskHandler,
|
|
13
12
|
TaskHandlerRegistry,
|
|
14
13
|
TaskPublisher,
|
|
15
14
|
)
|
|
@@ -3,7 +3,7 @@ import logging
|
|
|
3
3
|
import traceback
|
|
4
4
|
import typing
|
|
5
5
|
from abc import abstractmethod
|
|
6
|
-
from collections.abc import Awaitable, Callable
|
|
6
|
+
from collections.abc import Awaitable
|
|
7
7
|
from functools import cached_property
|
|
8
8
|
from json.decoder import JSONDecodeError
|
|
9
9
|
|
|
@@ -21,14 +21,12 @@ from pydantic import ValidationError
|
|
|
21
21
|
from boilermaker import exc, sample
|
|
22
22
|
from boilermaker.storage import StorageInterface
|
|
23
23
|
from boilermaker.task import Task, TaskResult, TaskStatus
|
|
24
|
+
from boilermaker.task import types as task_types
|
|
24
25
|
|
|
25
26
|
tracer: trace.Tracer = trace.get_tracer(__name__)
|
|
26
27
|
logger = logging.getLogger("boilermaker.app")
|
|
27
28
|
|
|
28
|
-
|
|
29
|
-
# Common Types used when evaluating tasks
|
|
30
|
-
TaskHandler: typing.TypeAlias = Callable[..., Awaitable[typing.Any]]
|
|
31
|
-
TaskHandlerRegistry: typing.TypeAlias = dict[str, TaskHandler]
|
|
29
|
+
TaskHandlerRegistry: typing.TypeAlias = dict[str, task_types.TaskHandler]
|
|
32
30
|
|
|
33
31
|
|
|
34
32
|
class TaskPublisher(typing.Protocol):
|
|
@@ -7,8 +7,7 @@ from boilermaker.exc import BoilermakerUnregisteredFunction
|
|
|
7
7
|
from boilermaker.failure import TaskFailureResult
|
|
8
8
|
from boilermaker.retries import RetryException
|
|
9
9
|
from boilermaker.task import Task, TaskResult, TaskStatus
|
|
10
|
-
|
|
11
|
-
from .common import TaskHandler
|
|
10
|
+
from boilermaker.task.types import TaskHandler
|
|
12
11
|
|
|
13
12
|
logger = logging.getLogger("boilermaker.app")
|
|
14
13
|
|
|
@@ -1,10 +1,11 @@
|
|
|
1
|
+
import asyncio
|
|
1
2
|
import itertools
|
|
2
3
|
import logging
|
|
3
4
|
import typing
|
|
4
5
|
|
|
5
6
|
from azure.servicebus.aio import ServiceBusReceiver
|
|
6
7
|
|
|
7
|
-
from boilermaker import exc
|
|
8
|
+
from boilermaker import exc, retries
|
|
8
9
|
from boilermaker.storage import StorageInterface
|
|
9
10
|
from boilermaker.task import Task, TaskResult, TaskStatus
|
|
10
11
|
|
|
@@ -13,9 +14,24 @@ from .eval import eval_task
|
|
|
13
14
|
|
|
14
15
|
logger = logging.getLogger("boilermaker.app")
|
|
15
16
|
|
|
17
|
+
# Retry policy used when load_graph raises a transient exception.
|
|
18
|
+
# Up to 3 attempts total (initial + 2 retries) with exponential backoff.
|
|
19
|
+
_LOAD_GRAPH_RETRY_POLICY = retries.RetryPolicy(
|
|
20
|
+
max_tries=3,
|
|
21
|
+
delay=1,
|
|
22
|
+
delay_max=16,
|
|
23
|
+
retry_mode=retries.RetryMode.Exponential,
|
|
24
|
+
)
|
|
25
|
+
|
|
16
26
|
|
|
17
27
|
class TaskGraphEvaluator(TaskEvaluatorBase):
|
|
18
|
-
"""Evaluator for tasks that are part of a TaskGraph workflow.
|
|
28
|
+
"""Evaluator for tasks that are part of a TaskGraph workflow.
|
|
29
|
+
|
|
30
|
+
At-least-once delivery contract: any task in ``Scheduled`` status may be
|
|
31
|
+
published more than once. Workers must tolerate at-least-once delivery.
|
|
32
|
+
Re-publication on Service Bus redelivery is the intentional recovery
|
|
33
|
+
mechanism for the store-before-publish crash gap in ``continue_graph``.
|
|
34
|
+
"""
|
|
19
35
|
|
|
20
36
|
def __init__(
|
|
21
37
|
self,
|
|
@@ -114,8 +130,17 @@ class TaskGraphEvaluator(TaskEvaluatorBase):
|
|
|
114
130
|
result=None,
|
|
115
131
|
)
|
|
116
132
|
await self.storage_interface.store_task_result(task_result)
|
|
117
|
-
# Publish failure tasks which may be ready now
|
|
118
|
-
|
|
133
|
+
# Publish failure tasks which may be ready now.
|
|
134
|
+
# The message is already deadlettered at this point, so suppressing settlement
|
|
135
|
+
# is not possible — log and return gracefully if continue_graph fails.
|
|
136
|
+
try:
|
|
137
|
+
await self.continue_graph(task_result)
|
|
138
|
+
except exc.ContinueGraphError:
|
|
139
|
+
logger.error(
|
|
140
|
+
f"continue_graph failed after retries exhausted for task {self.task.task_id}; "
|
|
141
|
+
"failure callbacks may not be dispatched (message already deadlettered)",
|
|
142
|
+
exc_info=True,
|
|
143
|
+
)
|
|
119
144
|
return task_result
|
|
120
145
|
|
|
121
146
|
# Actually invoke the task here
|
|
@@ -128,7 +153,17 @@ class TaskGraphEvaluator(TaskEvaluatorBase):
|
|
|
128
153
|
await self.storage_interface.store_task_result(result)
|
|
129
154
|
|
|
130
155
|
if result.status.finished:
|
|
131
|
-
|
|
156
|
+
try:
|
|
157
|
+
await self.continue_graph(result)
|
|
158
|
+
except exc.ContinueGraphError:
|
|
159
|
+
# Transient load_graph failure — do NOT settle the message.
|
|
160
|
+
# Allow Service Bus redelivery so downstream dispatch can be retried.
|
|
161
|
+
logger.error(
|
|
162
|
+
f"continue_graph failed for task {self.task.task_id}; "
|
|
163
|
+
"suppressing message settlement to allow redelivery",
|
|
164
|
+
exc_info=True,
|
|
165
|
+
)
|
|
166
|
+
return result
|
|
132
167
|
elif result.status == TaskStatus.Retry:
|
|
133
168
|
# Retry requested: republish the same task with delay
|
|
134
169
|
delay = self.task.get_next_delay()
|
|
@@ -171,20 +206,81 @@ class TaskGraphEvaluator(TaskEvaluatorBase):
|
|
|
171
206
|
Continue evaluating TaskGraph workflow after a task completes successfully.
|
|
172
207
|
|
|
173
208
|
We always reload the graph from storage to get the latest state.
|
|
209
|
+
|
|
210
|
+
Transient ``load_graph`` failures (``BoilermakerStorageError`` with a
|
|
211
|
+
non-404 status code) are retried with exponential backoff up to
|
|
212
|
+
``_LOAD_GRAPH_RETRY_POLICY.max_tries`` attempts. If all attempts fail,
|
|
213
|
+
``ContinueGraphError`` is raised so that ``message_handler`` can suppress
|
|
214
|
+
message settlement and allow Service Bus redelivery.
|
|
215
|
+
|
|
216
|
+
Permanent failures (``BoilermakerStorageError`` with ``status_code=404``)
|
|
217
|
+
are logged at CRITICAL severity and ``None`` is returned. Settling the
|
|
218
|
+
message is correct in this case because redelivery will not help — the
|
|
219
|
+
graph blob is gone and downstream tasks cannot be dispatched.
|
|
220
|
+
|
|
221
|
+
Note: in practice ``load_graph`` never returns ``None`` for a missing
|
|
222
|
+
blob; the underlying library re-raises all ``HttpResponseError``s
|
|
223
|
+
(including 404) as ``AzureBlobError``, which ``load_graph`` wraps as
|
|
224
|
+
``BoilermakerStorageError(status_code=404)``. The ``if not graph`` guard
|
|
225
|
+
below is retained as a defensive fallback only.
|
|
226
|
+
|
|
227
|
+
At-least-once delivery: any task already in ``Scheduled`` status is
|
|
228
|
+
re-published without a second blob write (second pass below). This is
|
|
229
|
+
the crash-recovery path for the store-before-publish gap.
|
|
174
230
|
"""
|
|
175
231
|
graph_id = completed_task_result.graph_id
|
|
176
232
|
if not graph_id:
|
|
177
233
|
return None
|
|
178
234
|
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
235
|
+
# Attempt to load the graph, retrying on transient errors.
|
|
236
|
+
last_exc: Exception | None = None
|
|
237
|
+
for attempt in range(_LOAD_GRAPH_RETRY_POLICY.max_tries):
|
|
238
|
+
try:
|
|
239
|
+
graph = await self.storage_interface.load_graph(graph_id)
|
|
240
|
+
break # success
|
|
241
|
+
except exc.BoilermakerStorageError as e:
|
|
242
|
+
if getattr(e, "status_code", None) == 404:
|
|
243
|
+
# Permanent: graph blob does not exist. Redelivery will not help.
|
|
244
|
+
logger.critical(
|
|
245
|
+
f"Graph {graph_id} not found in storage (404); downstream tasks will not be dispatched. "
|
|
246
|
+
"This graph may have been deleted.",
|
|
247
|
+
exc_info=True,
|
|
248
|
+
)
|
|
249
|
+
return None
|
|
250
|
+
# Transient error — will retry or raise ContinueGraphError after max_tries
|
|
251
|
+
last_exc = e
|
|
252
|
+
if attempt < _LOAD_GRAPH_RETRY_POLICY.max_tries - 1:
|
|
253
|
+
delay = _LOAD_GRAPH_RETRY_POLICY.get_delay_interval(attempt)
|
|
254
|
+
logger.warning(
|
|
255
|
+
f"load_graph failed for graph {graph_id} "
|
|
256
|
+
f"(attempt {attempt + 1}/{_LOAD_GRAPH_RETRY_POLICY.max_tries}); "
|
|
257
|
+
f"retrying in {delay}s",
|
|
258
|
+
exc_info=True,
|
|
259
|
+
)
|
|
260
|
+
await asyncio.sleep(delay)
|
|
261
|
+
else:
|
|
262
|
+
logger.error(
|
|
263
|
+
f"load_graph failed for graph {graph_id} after "
|
|
264
|
+
f"{_LOAD_GRAPH_RETRY_POLICY.max_tries} attempts; "
|
|
265
|
+
"raising ContinueGraphError to suppress message settlement",
|
|
266
|
+
exc_info=True,
|
|
267
|
+
)
|
|
268
|
+
raise exc.ContinueGraphError(
|
|
269
|
+
f"load_graph failed for graph {graph_id} after "
|
|
270
|
+
f"{_LOAD_GRAPH_RETRY_POLICY.max_tries} attempts"
|
|
271
|
+
) from last_exc
|
|
272
|
+
else:
|
|
273
|
+
# Should only be reached if max_tries == 0 (not expected).
|
|
274
|
+
raise exc.ContinueGraphError(f"load_graph not attempted for graph {graph_id}")
|
|
185
275
|
|
|
186
276
|
if not graph:
|
|
187
|
-
|
|
277
|
+
# Permanent failure: graph blob does not exist. Redelivery will not help.
|
|
278
|
+
# Settling the upstream message is intentional here.
|
|
279
|
+
logger.critical(
|
|
280
|
+
f"Graph {graph_id} not found after task completion — "
|
|
281
|
+
"downstream tasks will never be dispatched. "
|
|
282
|
+
"This is a permanent data loss; redelivery will not recover it."
|
|
283
|
+
)
|
|
188
284
|
return None
|
|
189
285
|
|
|
190
286
|
# Sanity check: did we load the result that was *just* stored?
|
|
@@ -197,14 +293,19 @@ class TaskGraphEvaluator(TaskEvaluatorBase):
|
|
|
197
293
|
)
|
|
198
294
|
return None
|
|
199
295
|
|
|
296
|
+
# Snapshot tasks already in Scheduled status BEFORE the first pass.
|
|
297
|
+
# The second pass uses this snapshot so that tasks freshly scheduled
|
|
298
|
+
# in the first pass are not double-published.
|
|
299
|
+
already_scheduled_tasks = list(graph.generate_scheduled_tasks())
|
|
300
|
+
|
|
200
301
|
# Find and publish newly ready tasks
|
|
201
302
|
ready_count = 0
|
|
202
303
|
for ready_task in itertools.chain.from_iterable(
|
|
203
304
|
(graph.generate_ready_tasks(), graph.generate_failure_ready_tasks())
|
|
204
305
|
):
|
|
205
306
|
# Write that the task was *scheduled* back to Blob Storage with blob etag and then publish the task!
|
|
206
|
-
result = graph.schedule_task(ready_task.task_id)
|
|
207
307
|
try:
|
|
308
|
+
result = graph.schedule_task(ready_task.task_id)
|
|
208
309
|
await self.storage_interface.store_task_result(result, etag=result.etag)
|
|
209
310
|
except exc.BoilermakerStorageError:
|
|
210
311
|
logger.error(
|
|
@@ -213,6 +314,13 @@ class TaskGraphEvaluator(TaskEvaluatorBase):
|
|
|
213
314
|
exc_info=True,
|
|
214
315
|
)
|
|
215
316
|
continue
|
|
317
|
+
except ValueError:
|
|
318
|
+
logger.error(
|
|
319
|
+
f"schedule_task raised ValueError for task {ready_task.task_id} in graph {graph_id}. "
|
|
320
|
+
"Skipping to avoid double-scheduling.",
|
|
321
|
+
exc_info=True,
|
|
322
|
+
)
|
|
323
|
+
continue
|
|
216
324
|
|
|
217
325
|
ready_count += 1
|
|
218
326
|
await self.publish_task(ready_task)
|
|
@@ -223,4 +331,29 @@ class TaskGraphEvaluator(TaskEvaluatorBase):
|
|
|
223
331
|
f"No new tasks ready in graph {graph_id} after task {completed_task_result.task_id}"
|
|
224
332
|
)
|
|
225
333
|
|
|
334
|
+
# Second pass: re-publish tasks that were ALREADY in Scheduled status when the
|
|
335
|
+
# graph was loaded (crash-recovery).
|
|
336
|
+
#
|
|
337
|
+
# If a previous invocation of continue_graph wrote Scheduled to blob storage
|
|
338
|
+
# (store_task_result) but crashed before publishing the Service Bus message
|
|
339
|
+
# (publish_task), the task blob shows Scheduled but there is no SB message.
|
|
340
|
+
# generate_ready_tasks() skips Scheduled tasks (is_not_started == False), so
|
|
341
|
+
# without this pass the task would never be dispatched.
|
|
342
|
+
#
|
|
343
|
+
# We snapshot already-scheduled tasks BEFORE the first pass so that tasks
|
|
344
|
+
# scheduled in the first pass are not double-published here.
|
|
345
|
+
#
|
|
346
|
+
# On Service Bus redelivery, we detect these orphaned-Scheduled tasks here and
|
|
347
|
+
# re-publish them without a second blob write (the blob is already correct).
|
|
348
|
+
#
|
|
349
|
+
# NOTE: Workers must tolerate at-least-once delivery. A task in Scheduled
|
|
350
|
+
# status may be published more than once. This is the intentional recovery
|
|
351
|
+
# mechanism for the store-before-publish crash gap.
|
|
352
|
+
for scheduled_task in already_scheduled_tasks:
|
|
353
|
+
logger.info(
|
|
354
|
+
f"Re-publishing already-scheduled task {scheduled_task.task_id} "
|
|
355
|
+
f"in graph {graph_id} (crash recovery: blob written but message not published)"
|
|
356
|
+
)
|
|
357
|
+
await self.publish_task(scheduled_task)
|
|
358
|
+
|
|
226
359
|
return ready_count
|
|
@@ -1,6 +1,22 @@
|
|
|
1
1
|
from azure.servicebus.exceptions import ServiceBusError
|
|
2
2
|
|
|
3
3
|
|
|
4
|
+
class BoilermakerError(Exception):
|
|
5
|
+
"""Base class for Boilermaker-specific exceptions."""
|
|
6
|
+
|
|
7
|
+
pass
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class ContinueGraphError(BoilermakerError):
|
|
11
|
+
"""Raised when continue_graph cannot load the graph after retries.
|
|
12
|
+
|
|
13
|
+
Signals message_handler that settlement must be suppressed so that
|
|
14
|
+
Service Bus will redeliver the message and downstream dispatch can be retried.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
pass
|
|
18
|
+
|
|
19
|
+
|
|
4
20
|
class BoilermakerAppException(Exception):
|
|
5
21
|
def __init__(self, message: str, errors: list):
|
|
6
22
|
super().__init__(message + str(errors))
|
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
import datetime
|
|
2
1
|
import logging
|
|
3
2
|
import traceback
|
|
4
3
|
from functools import partial
|
|
@@ -13,7 +12,7 @@ from azure.core.exceptions import (
|
|
|
13
12
|
ResourceNotFoundError,
|
|
14
13
|
)
|
|
15
14
|
from azure.identity.aio import DefaultAzureCredential
|
|
16
|
-
from
|
|
15
|
+
from pydantic import ValidationError
|
|
17
16
|
|
|
18
17
|
from boilermaker.exc import BoilermakerStorageError
|
|
19
18
|
from boilermaker.storage import StorageInterface
|
|
@@ -50,7 +49,8 @@ class BlobClientStorage(AzureBlobStorageClient, StorageInterface):
|
|
|
50
49
|
Returns:
|
|
51
50
|
The loaded TaskGraph instance, or None if not found.
|
|
52
51
|
Raises:
|
|
53
|
-
|
|
52
|
+
BoilermakerStorageError: If the blob cannot be loaded or if TaskGraph/TaskResultSlim
|
|
53
|
+
data cannot be validated.
|
|
54
54
|
"""
|
|
55
55
|
if not graph_id:
|
|
56
56
|
raise ValueError("`graph_id` must be provided to load a TaskGraph.")
|
|
@@ -70,28 +70,38 @@ class BlobClientStorage(AzureBlobStorageClient, StorageInterface):
|
|
|
70
70
|
if graph_contents is None:
|
|
71
71
|
return None
|
|
72
72
|
|
|
73
|
-
|
|
73
|
+
try:
|
|
74
|
+
graph = TaskGraph.model_validate_json(graph_contents)
|
|
75
|
+
except ValidationError as e:
|
|
76
|
+
raise BoilermakerStorageError(
|
|
77
|
+
f"Failed to deserialize graph {graph_id}: {e}",
|
|
78
|
+
status_code=None,
|
|
79
|
+
) from e
|
|
74
80
|
|
|
75
81
|
# Load all TaskResultSlim instances associated with this graph
|
|
76
82
|
# We don't want to load *all* return values into memory. Just the statuses.
|
|
77
83
|
async for blob in self.list_blobs(prefix=graph_dir):
|
|
78
|
-
|
|
84
|
+
# DO NOT REDOWNLOAD GRAPH
|
|
85
|
+
if blob.name == graph_path:
|
|
86
|
+
continue
|
|
87
|
+
try:
|
|
88
|
+
tr = TaskResultSlim.model_validate_json(await self.download_blob(blob.name))
|
|
89
|
+
except ValidationError as e:
|
|
90
|
+
raise BoilermakerStorageError(
|
|
91
|
+
f"Failed to deserialize task result in graph {graph_id}: {e}",
|
|
92
|
+
status_code=None,
|
|
93
|
+
) from e
|
|
79
94
|
tr.etag = blob.etag
|
|
80
95
|
if tr.graph_id == graph_id:
|
|
81
96
|
graph.results[tr.task_id] = tr
|
|
82
97
|
else:
|
|
83
|
-
logger.warning(
|
|
84
|
-
f"TaskResult {tr.task_id} in graph {graph_dir} with wrong graph_id {tr.graph_id}!"
|
|
85
|
-
)
|
|
98
|
+
logger.warning(f"TaskResult {tr.task_id} in graph {graph_dir} with wrong graph_id {tr.graph_id}!")
|
|
86
99
|
return graph
|
|
87
100
|
|
|
88
101
|
async def store_graph(self, graph: TaskGraph) -> TaskGraph:
|
|
89
102
|
"""
|
|
90
103
|
Stores a TaskGraph to Azure Blob Storage and stores all children as pending tasks as well.
|
|
91
104
|
|
|
92
|
-
We use a lease on the container to make sure *only* one task is writing! This means
|
|
93
|
-
that we don't have to worry about concurrent writes causing data corruption.
|
|
94
|
-
|
|
95
105
|
We expect the *written graph* to be **immutable** (see the ImmutabilityPolicy below).
|
|
96
106
|
|
|
97
107
|
Args:
|
|
@@ -99,35 +109,21 @@ class BlobClientStorage(AzureBlobStorageClient, StorageInterface):
|
|
|
99
109
|
"""
|
|
100
110
|
lease = None
|
|
101
111
|
async with self.get_blob_service_client() as blob_service_client:
|
|
102
|
-
container_client = blob_service_client.get_container_client(
|
|
103
|
-
self.container_name
|
|
104
|
-
)
|
|
105
|
-
lease = await container_client.acquire_lease()
|
|
106
|
-
upload_kwargs = {
|
|
107
|
-
"lease": lease,
|
|
108
|
-
"blob_type": "BlockBlob",
|
|
109
|
-
"immutability_policy": ImmutabilityPolicy(
|
|
110
|
-
expiry_time=datetime.datetime.now(tz=datetime.UTC) + datetime.timedelta(hours=4),
|
|
111
|
-
policy_mode="LOCKED",
|
|
112
|
-
),
|
|
113
|
-
}
|
|
114
|
-
|
|
112
|
+
container_client = blob_service_client.get_container_client(self.container_name)
|
|
115
113
|
# Store the graph itself first
|
|
116
114
|
fname = f"{self.task_result_prefix}/{graph.storage_path}"
|
|
117
115
|
try:
|
|
118
116
|
_result = await container_client.upload_blob(
|
|
119
117
|
fname,
|
|
120
118
|
graph.model_dump_json(),
|
|
121
|
-
|
|
119
|
+
blob_type="BlockBlob",
|
|
122
120
|
)
|
|
123
121
|
except (
|
|
124
122
|
ResourceNotFoundError,
|
|
125
123
|
HttpResponseError,
|
|
126
124
|
ResourceExistsError,
|
|
127
125
|
) as exc:
|
|
128
|
-
logger.error(
|
|
129
|
-
f"Error occurred while storing TaskGraph {graph.graph_id}: {exc}"
|
|
130
|
-
)
|
|
126
|
+
logger.error(f"Error occurred while storing TaskGraph {graph.graph_id}: {exc}")
|
|
131
127
|
raise BoilermakerStorageError(
|
|
132
128
|
f"Failed to store TaskGraph {graph.graph_id}",
|
|
133
129
|
task_id=None,
|
|
@@ -139,15 +135,15 @@ class BlobClientStorage(AzureBlobStorageClient, StorageInterface):
|
|
|
139
135
|
pending_result = None
|
|
140
136
|
try:
|
|
141
137
|
async with create_task_group() as tg:
|
|
138
|
+
# don't let any tasks that get ahead accidentally clobber us
|
|
142
139
|
for pending_result in graph.generate_pending_results():
|
|
143
|
-
fname_pr =
|
|
144
|
-
|
|
145
|
-
)
|
|
140
|
+
fname_pr = f"{self.task_result_prefix}/{pending_result.storage_path}"
|
|
141
|
+
|
|
146
142
|
uploader = partial(
|
|
147
143
|
container_client.upload_blob,
|
|
148
144
|
fname_pr,
|
|
149
145
|
pending_result.model_dump_json(),
|
|
150
|
-
|
|
146
|
+
blob_type="BlockBlob",
|
|
151
147
|
)
|
|
152
148
|
tg.start_soon(uploader)
|
|
153
149
|
except* Exception as excgroup:
|
|
@@ -180,15 +176,21 @@ class BlobClientStorage(AzureBlobStorageClient, StorageInterface):
|
|
|
180
176
|
"graph_id": task_result.graph_id or "none",
|
|
181
177
|
"status": task_result.status,
|
|
182
178
|
}
|
|
183
|
-
concurrency_kwargs: dict[str, str | int] = {}
|
|
179
|
+
concurrency_kwargs: dict[str, str | int | MatchConditions] = {}
|
|
184
180
|
if etag:
|
|
185
181
|
concurrency_kwargs["etag"] = etag
|
|
186
|
-
concurrency_kwargs["
|
|
182
|
+
concurrency_kwargs["match_condition"] = MatchConditions.IfNotModified
|
|
187
183
|
|
|
188
184
|
try:
|
|
189
185
|
await self.upload_blob(
|
|
190
186
|
fname, task_result.model_dump_json(), tags=blob_tags, overwrite=True, **concurrency_kwargs
|
|
191
187
|
)
|
|
188
|
+
# SAFETY: This catch assumes aio_azure_clients_toolbox raises AzureBlobError
|
|
189
|
+
# (wrapping HTTP 412 Precondition Failed) when an ETag mismatch occurs.
|
|
190
|
+
# This is the primary guard against concurrent double-scheduling of downstream
|
|
191
|
+
# tasks. Verified against aio-azure-clients-toolbox v1.0.4 (see uv.lock):
|
|
192
|
+
# get_blob_client() catches all HttpResponseError (including 412) and re-raises
|
|
193
|
+
# as AzureBlobError. If the library behavior changes, this guard will silently break.
|
|
192
194
|
except AzureBlobError as exc:
|
|
193
195
|
raise BoilermakerStorageError(
|
|
194
196
|
f"Failed to store TaskResult {task_result.task_id}",
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
from .graph import LAST_ADDED, LastAddedSingleton, TaskChain, TaskGraph, TaskGraphBuilder
|
|
2
|
+
from .result import TaskResult, TaskResultSlim, TaskStatus
|
|
3
|
+
from .task import Task
|
|
4
|
+
from .task_id import GraphId, TaskId
|
|
5
|
+
from .types import TaskHandler
|
|
6
|
+
|
|
7
|
+
__all__ = [
|
|
8
|
+
"LAST_ADDED",
|
|
9
|
+
"LastAddedSingleton",
|
|
10
|
+
"TaskChain",
|
|
11
|
+
"TaskGraph",
|
|
12
|
+
"TaskGraphBuilder",
|
|
13
|
+
"TaskResult",
|
|
14
|
+
"TaskResultSlim",
|
|
15
|
+
"TaskStatus",
|
|
16
|
+
"Task",
|
|
17
|
+
"TaskId",
|
|
18
|
+
"GraphId",
|
|
19
|
+
"TaskHandler",
|
|
20
|
+
]
|