boilermaker-servicebus 1.0.0.dev2__tar.gz → 1.0.0.dev4__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/PKG-INFO +4 -2
  2. {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/boilermaker/app.py +3 -2
  3. {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/boilermaker/evaluators/__init__.py +1 -2
  4. {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/boilermaker/evaluators/common.py +3 -5
  5. {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/boilermaker/evaluators/eval.py +1 -2
  6. {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/boilermaker/evaluators/task_graph.py +146 -13
  7. {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/boilermaker/exc.py +16 -0
  8. {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/boilermaker/service_bus.py +1 -1
  9. {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/boilermaker/storage/blob_storage.py +36 -34
  10. boilermaker_servicebus-1.0.0.dev4/boilermaker/task/__init__.py +20 -0
  11. {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/boilermaker/task/graph.py +265 -60
  12. {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/boilermaker/task/task.py +2 -1
  13. boilermaker_servicebus-1.0.0.dev4/boilermaker/task/types.py +18 -0
  14. {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/boilermaker_servicebus.egg-info/PKG-INFO +4 -2
  15. {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/boilermaker_servicebus.egg-info/SOURCES.txt +1 -0
  16. {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/boilermaker_servicebus.egg-info/requires.txt +4 -1
  17. boilermaker_servicebus-1.0.0.dev4/examples/task_graph_example.py +274 -0
  18. {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/pyproject.toml +14 -16
  19. boilermaker_servicebus-1.0.0.dev4/tests/evaluators/test_task_graphs.py +1169 -0
  20. {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/tests/storage/test_blob_storage.py +146 -5
  21. {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/tests/task/test_graph.py +789 -134
  22. {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/tests/test_app.py +6 -0
  23. boilermaker_servicebus-1.0.0.dev2/boilermaker/task/__init__.py +0 -6
  24. boilermaker_servicebus-1.0.0.dev2/examples/task_graph_example.py +0 -127
  25. boilermaker_servicebus-1.0.0.dev2/tests/evaluators/test_task_graphs.py +0 -564
  26. {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/LICENSE +0 -0
  27. {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/README.md +0 -0
  28. {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/boilermaker/__init__.py +0 -0
  29. {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/boilermaker/config.py +0 -0
  30. {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/boilermaker/evaluators/results_store.py +0 -0
  31. {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/boilermaker/evaluators/simple.py +0 -0
  32. {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/boilermaker/failure.py +0 -0
  33. {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/boilermaker/retries.py +0 -0
  34. {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/boilermaker/sample.py +0 -0
  35. {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/boilermaker/storage/__init__.py +0 -0
  36. {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/boilermaker/storage/base.py +0 -0
  37. {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/boilermaker/task/result.py +0 -0
  38. {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/boilermaker/task/task_id.py +0 -0
  39. {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/boilermaker/tracing.py +0 -0
  40. {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/boilermaker_servicebus.egg-info/dependency_links.txt +0 -0
  41. {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/boilermaker_servicebus.egg-info/top_level.txt +0 -0
  42. {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/examples/basic.py +0 -0
  43. {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/examples/callbacks.py +0 -0
  44. {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/setup.cfg +0 -0
  45. {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/tests/conftest.py +0 -0
  46. {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/tests/evaluators/conftest.py +0 -0
  47. {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/tests/evaluators/test_common.py +0 -0
  48. {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/tests/evaluators/test_eval.py +0 -0
  49. {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/tests/evaluators/test_eval_factory.py +0 -0
  50. {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/tests/evaluators/test_results_store.py +0 -0
  51. {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/tests/evaluators/test_simple.py +0 -0
  52. {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/tests/graph_factories.py +0 -0
  53. {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/tests/task/helpers.py +0 -0
  54. {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/tests/task/test_graph_cycle_detection.py +0 -0
  55. {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/tests/task/test_result.py +0 -0
  56. {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/tests/task/test_task.py +0 -0
  57. {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/tests/test_config.py +0 -0
  58. {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/tests/test_retries.py +0 -0
  59. {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/tests/test_sample.py +0 -0
  60. {boilermaker_servicebus-1.0.0.dev2 → boilermaker_servicebus-1.0.0.dev4}/tests/test_service_bus.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: boilermaker-servicebus
3
- Version: 1.0.0.dev2
3
+ Version: 1.0.0.dev4
4
4
  Summary: An async python Background task system using Azure Service Bus Queues
5
5
  Author-email: Erik Aker <eaker@mulliganfunding.com>
6
6
  License: Apache License
@@ -210,7 +210,7 @@ Project-URL: Issues, https://github.com/MulliganFunding/boilermaker-servicebus/i
210
210
  Requires-Python: >=3.11
211
211
  Description-Content-Type: text/markdown
212
212
  License-File: LICENSE
213
- Requires-Dist: aio-azure-clients-toolbox>=1.0.4
213
+ Requires-Dist: aio-azure-clients-toolbox>=1.1.0
214
214
  Requires-Dist: anyio>=4.11.0
215
215
  Requires-Dist: azure-core-tracing-opentelemetry>=1.0.0b12
216
216
  Requires-Dist: azure-servicebus>=7.14.2
@@ -219,6 +219,8 @@ Requires-Dist: opentelemetry-api>=1.34.0
219
219
  Requires-Dist: pydantic>=2.12.2
220
220
  Requires-Dist: pydantic-settings>=2.11.0
221
221
  Requires-Dist: uuid-utils>=0.11.1
222
+ Provides-Extra: repl
223
+ Requires-Dist: ipython; extra == "repl"
222
224
  Dynamic: license-file
223
225
 
224
226
  # Boilermaker
@@ -12,7 +12,7 @@ import typing
12
12
  import weakref
13
13
  from functools import wraps
14
14
 
15
- from aio_azure_clients_toolbox import AzureServiceBus, ManagedAzureServiceBusSender # type: ignore
15
+ from aio_azure_clients_toolbox import AzureServiceBus, ManagedAzureServiceBusSender
16
16
  from anyio import create_task_group, open_signal_receiver
17
17
  from anyio.abc import CancelScope
18
18
  from azure.servicebus import ServiceBusReceivedMessage
@@ -163,7 +163,7 @@ class Boilermaker:
163
163
  raise ValueError(f"Function must be async: {fn_name}")
164
164
 
165
165
  task = Task.default(fn_name, **options)
166
- self.function_registry[fn_name] = fn
166
+ self.function_registry[fn_name] = typing.cast(TaskHandler, fn) # why must cast here
167
167
  self.task_registry[fn_name] = task
168
168
  logger.info(f"Registered background function fn={fn_name}")
169
169
  return self
@@ -361,6 +361,7 @@ class Boilermaker:
361
361
  results: list[int] = await self.service_bus_client.send_message(
362
362
  task.model_dump_json(),
363
363
  delay=delay,
364
+ unique_msg_id=str(task.task_id),
364
365
  )
365
366
  if results and len(results) == 1:
366
367
  sequence_number = results[0]
@@ -4,12 +4,11 @@ import typing
4
4
  from azure.servicebus.aio import ServiceBusReceiver
5
5
 
6
6
  from boilermaker.storage.base import StorageInterface
7
- from boilermaker.task import Task
7
+ from boilermaker.task import Task, TaskHandler
8
8
 
9
9
  from .common import (
10
10
  MessageActions,
11
11
  TaskEvaluatorBase,
12
- TaskHandler,
13
12
  TaskHandlerRegistry,
14
13
  TaskPublisher,
15
14
  )
@@ -3,7 +3,7 @@ import logging
3
3
  import traceback
4
4
  import typing
5
5
  from abc import abstractmethod
6
- from collections.abc import Awaitable, Callable
6
+ from collections.abc import Awaitable
7
7
  from functools import cached_property
8
8
  from json.decoder import JSONDecodeError
9
9
 
@@ -21,14 +21,12 @@ from pydantic import ValidationError
21
21
  from boilermaker import exc, sample
22
22
  from boilermaker.storage import StorageInterface
23
23
  from boilermaker.task import Task, TaskResult, TaskStatus
24
+ from boilermaker.task import types as task_types
24
25
 
25
26
  tracer: trace.Tracer = trace.get_tracer(__name__)
26
27
  logger = logging.getLogger("boilermaker.app")
27
28
 
28
-
29
- # Common Types used when evaluating tasks
30
- TaskHandler: typing.TypeAlias = Callable[..., Awaitable[typing.Any]]
31
- TaskHandlerRegistry: typing.TypeAlias = dict[str, TaskHandler]
29
+ TaskHandlerRegistry: typing.TypeAlias = dict[str, task_types.TaskHandler]
32
30
 
33
31
 
34
32
  class TaskPublisher(typing.Protocol):
@@ -7,8 +7,7 @@ from boilermaker.exc import BoilermakerUnregisteredFunction
7
7
  from boilermaker.failure import TaskFailureResult
8
8
  from boilermaker.retries import RetryException
9
9
  from boilermaker.task import Task, TaskResult, TaskStatus
10
-
11
- from .common import TaskHandler
10
+ from boilermaker.task.types import TaskHandler
12
11
 
13
12
  logger = logging.getLogger("boilermaker.app")
14
13
 
@@ -1,10 +1,11 @@
1
+ import asyncio
1
2
  import itertools
2
3
  import logging
3
4
  import typing
4
5
 
5
6
  from azure.servicebus.aio import ServiceBusReceiver
6
7
 
7
- from boilermaker import exc
8
+ from boilermaker import exc, retries
8
9
  from boilermaker.storage import StorageInterface
9
10
  from boilermaker.task import Task, TaskResult, TaskStatus
10
11
 
@@ -13,9 +14,24 @@ from .eval import eval_task
13
14
 
14
15
  logger = logging.getLogger("boilermaker.app")
15
16
 
17
+ # Retry policy used when load_graph raises a transient exception.
18
+ # Up to 3 attempts total (initial + 2 retries) with exponential backoff.
19
+ _LOAD_GRAPH_RETRY_POLICY = retries.RetryPolicy(
20
+ max_tries=3,
21
+ delay=1,
22
+ delay_max=16,
23
+ retry_mode=retries.RetryMode.Exponential,
24
+ )
25
+
16
26
 
17
27
  class TaskGraphEvaluator(TaskEvaluatorBase):
18
- """Evaluator for tasks that are part of a TaskGraph workflow."""
28
+ """Evaluator for tasks that are part of a TaskGraph workflow.
29
+
30
+ At-least-once delivery contract: any task in ``Scheduled`` status may be
31
+ published more than once. Workers must tolerate at-least-once delivery.
32
+ Re-publication on Service Bus redelivery is the intentional recovery
33
+ mechanism for the store-before-publish crash gap in ``continue_graph``.
34
+ """
19
35
 
20
36
  def __init__(
21
37
  self,
@@ -114,8 +130,17 @@ class TaskGraphEvaluator(TaskEvaluatorBase):
114
130
  result=None,
115
131
  )
116
132
  await self.storage_interface.store_task_result(task_result)
117
- # Publish failure tasks which may be ready now
118
- await self.continue_graph(task_result)
133
+ # Publish failure tasks which may be ready now.
134
+ # The message is already deadlettered at this point, so suppressing settlement
135
+ # is not possible — log and return gracefully if continue_graph fails.
136
+ try:
137
+ await self.continue_graph(task_result)
138
+ except exc.ContinueGraphError:
139
+ logger.error(
140
+ f"continue_graph failed after retries exhausted for task {self.task.task_id}; "
141
+ "failure callbacks may not be dispatched (message already deadlettered)",
142
+ exc_info=True,
143
+ )
119
144
  return task_result
120
145
 
121
146
  # Actually invoke the task here
@@ -128,7 +153,17 @@ class TaskGraphEvaluator(TaskEvaluatorBase):
128
153
  await self.storage_interface.store_task_result(result)
129
154
 
130
155
  if result.status.finished:
131
- await self.continue_graph(result)
156
+ try:
157
+ await self.continue_graph(result)
158
+ except exc.ContinueGraphError:
159
+ # Transient load_graph failure — do NOT settle the message.
160
+ # Allow Service Bus redelivery so downstream dispatch can be retried.
161
+ logger.error(
162
+ f"continue_graph failed for task {self.task.task_id}; "
163
+ "suppressing message settlement to allow redelivery",
164
+ exc_info=True,
165
+ )
166
+ return result
132
167
  elif result.status == TaskStatus.Retry:
133
168
  # Retry requested: republish the same task with delay
134
169
  delay = self.task.get_next_delay()
@@ -171,20 +206,81 @@ class TaskGraphEvaluator(TaskEvaluatorBase):
171
206
  Continue evaluating TaskGraph workflow after a task completes successfully.
172
207
 
173
208
  We always reload the graph from storage to get the latest state.
209
+
210
+ Transient ``load_graph`` failures (``BoilermakerStorageError`` with a
211
+ non-404 status code) are retried with exponential backoff up to
212
+ ``_LOAD_GRAPH_RETRY_POLICY.max_tries`` attempts. If all attempts fail,
213
+ ``ContinueGraphError`` is raised so that ``message_handler`` can suppress
214
+ message settlement and allow Service Bus redelivery.
215
+
216
+ Permanent failures (``BoilermakerStorageError`` with ``status_code=404``)
217
+ are logged at CRITICAL severity and ``None`` is returned. Settling the
218
+ message is correct in this case because redelivery will not help — the
219
+ graph blob is gone and downstream tasks cannot be dispatched.
220
+
221
+ Note: in practice ``load_graph`` never returns ``None`` for a missing
222
+ blob; the underlying library re-raises all ``HttpResponseError``s
223
+ (including 404) as ``AzureBlobError``, which ``load_graph`` wraps as
224
+ ``BoilermakerStorageError(status_code=404)``. The ``if not graph`` guard
225
+ below is retained as a defensive fallback only.
226
+
227
+ At-least-once delivery: any task already in ``Scheduled`` status is
228
+ re-published without a second blob write (second pass below). This is
229
+ the crash-recovery path for the store-before-publish gap.
174
230
  """
175
231
  graph_id = completed_task_result.graph_id
176
232
  if not graph_id:
177
233
  return None
178
234
 
179
- try:
180
- # Reload graph with latest results
181
- graph = await self.storage_interface.load_graph(graph_id)
182
- except Exception:
183
- logger.error(f"Exception in continue_graph for graph {graph_id}", exc_info=True)
184
- return None
235
+ # Attempt to load the graph, retrying on transient errors.
236
+ last_exc: Exception | None = None
237
+ for attempt in range(_LOAD_GRAPH_RETRY_POLICY.max_tries):
238
+ try:
239
+ graph = await self.storage_interface.load_graph(graph_id)
240
+ break # success
241
+ except exc.BoilermakerStorageError as e:
242
+ if getattr(e, "status_code", None) == 404:
243
+ # Permanent: graph blob does not exist. Redelivery will not help.
244
+ logger.critical(
245
+ f"Graph {graph_id} not found in storage (404); downstream tasks will not be dispatched. "
246
+ "This graph may have been deleted.",
247
+ exc_info=True,
248
+ )
249
+ return None
250
+ # Transient error — will retry or raise ContinueGraphError after max_tries
251
+ last_exc = e
252
+ if attempt < _LOAD_GRAPH_RETRY_POLICY.max_tries - 1:
253
+ delay = _LOAD_GRAPH_RETRY_POLICY.get_delay_interval(attempt)
254
+ logger.warning(
255
+ f"load_graph failed for graph {graph_id} "
256
+ f"(attempt {attempt + 1}/{_LOAD_GRAPH_RETRY_POLICY.max_tries}); "
257
+ f"retrying in {delay}s",
258
+ exc_info=True,
259
+ )
260
+ await asyncio.sleep(delay)
261
+ else:
262
+ logger.error(
263
+ f"load_graph failed for graph {graph_id} after "
264
+ f"{_LOAD_GRAPH_RETRY_POLICY.max_tries} attempts; "
265
+ "raising ContinueGraphError to suppress message settlement",
266
+ exc_info=True,
267
+ )
268
+ raise exc.ContinueGraphError(
269
+ f"load_graph failed for graph {graph_id} after "
270
+ f"{_LOAD_GRAPH_RETRY_POLICY.max_tries} attempts"
271
+ ) from last_exc
272
+ else:
273
+ # Should only be reached if max_tries == 0 (not expected).
274
+ raise exc.ContinueGraphError(f"load_graph not attempted for graph {graph_id}")
185
275
 
186
276
  if not graph:
187
- logger.error(f"Graph {graph_id} not found after task completion")
277
+ # Permanent failure: graph blob does not exist. Redelivery will not help.
278
+ # Settling the upstream message is intentional here.
279
+ logger.critical(
280
+ f"Graph {graph_id} not found after task completion — "
281
+ "downstream tasks will never be dispatched. "
282
+ "This is a permanent data loss; redelivery will not recover it."
283
+ )
188
284
  return None
189
285
 
190
286
  # Sanity check: did we load the result that was *just* stored?
@@ -197,14 +293,19 @@ class TaskGraphEvaluator(TaskEvaluatorBase):
197
293
  )
198
294
  return None
199
295
 
296
+ # Snapshot tasks already in Scheduled status BEFORE the first pass.
297
+ # The second pass uses this snapshot so that tasks freshly scheduled
298
+ # in the first pass are not double-published.
299
+ already_scheduled_tasks = list(graph.generate_scheduled_tasks())
300
+
200
301
  # Find and publish newly ready tasks
201
302
  ready_count = 0
202
303
  for ready_task in itertools.chain.from_iterable(
203
304
  (graph.generate_ready_tasks(), graph.generate_failure_ready_tasks())
204
305
  ):
205
306
  # Write that the task was *scheduled* back to Blob Storage with blob etag and then publish the task!
206
- result = graph.schedule_task(ready_task.task_id)
207
307
  try:
308
+ result = graph.schedule_task(ready_task.task_id)
208
309
  await self.storage_interface.store_task_result(result, etag=result.etag)
209
310
  except exc.BoilermakerStorageError:
210
311
  logger.error(
@@ -213,6 +314,13 @@ class TaskGraphEvaluator(TaskEvaluatorBase):
213
314
  exc_info=True,
214
315
  )
215
316
  continue
317
+ except ValueError:
318
+ logger.error(
319
+ f"schedule_task raised ValueError for task {ready_task.task_id} in graph {graph_id}. "
320
+ "Skipping to avoid double-scheduling.",
321
+ exc_info=True,
322
+ )
323
+ continue
216
324
 
217
325
  ready_count += 1
218
326
  await self.publish_task(ready_task)
@@ -223,4 +331,29 @@ class TaskGraphEvaluator(TaskEvaluatorBase):
223
331
  f"No new tasks ready in graph {graph_id} after task {completed_task_result.task_id}"
224
332
  )
225
333
 
334
+ # Second pass: re-publish tasks that were ALREADY in Scheduled status when the
335
+ # graph was loaded (crash-recovery).
336
+ #
337
+ # If a previous invocation of continue_graph wrote Scheduled to blob storage
338
+ # (store_task_result) but crashed before publishing the Service Bus message
339
+ # (publish_task), the task blob shows Scheduled but there is no SB message.
340
+ # generate_ready_tasks() skips Scheduled tasks (is_not_started == False), so
341
+ # without this pass the task would never be dispatched.
342
+ #
343
+ # We snapshot already-scheduled tasks BEFORE the first pass so that tasks
344
+ # scheduled in the first pass are not double-published here.
345
+ #
346
+ # On Service Bus redelivery, we detect these orphaned-Scheduled tasks here and
347
+ # re-publish them without a second blob write (the blob is already correct).
348
+ #
349
+ # NOTE: Workers must tolerate at-least-once delivery. A task in Scheduled
350
+ # status may be published more than once. This is the intentional recovery
351
+ # mechanism for the store-before-publish crash gap.
352
+ for scheduled_task in already_scheduled_tasks:
353
+ logger.info(
354
+ f"Re-publishing already-scheduled task {scheduled_task.task_id} "
355
+ f"in graph {graph_id} (crash recovery: blob written but message not published)"
356
+ )
357
+ await self.publish_task(scheduled_task)
358
+
226
359
  return ready_count
@@ -1,6 +1,22 @@
1
1
  from azure.servicebus.exceptions import ServiceBusError
2
2
 
3
3
 
4
+ class BoilermakerError(Exception):
5
+ """Base class for Boilermaker-specific exceptions."""
6
+
7
+ pass
8
+
9
+
10
+ class ContinueGraphError(BoilermakerError):
11
+ """Raised when continue_graph cannot load the graph after retries.
12
+
13
+ Signals message_handler that settlement must be suppressed so that
14
+ Service Bus will redeliver the message and downstream dispatch can be retried.
15
+ """
16
+
17
+ pass
18
+
19
+
4
20
  class BoilermakerAppException(Exception):
5
21
  def __init__(self, message: str, errors: list):
6
22
  super().__init__(message + str(errors))
@@ -7,7 +7,7 @@ allows sending messages or subscribing to a queue.
7
7
 
8
8
  from aio_azure_clients_toolbox import (
9
9
  CredentialFactory,
10
- ManagedAzureServiceBusSender, # type: ignore
10
+ ManagedAzureServiceBusSender,
11
11
  )
12
12
 
13
13
  from .config import Config
@@ -1,4 +1,3 @@
1
- import datetime
2
1
  import logging
3
2
  import traceback
4
3
  from functools import partial
@@ -13,7 +12,7 @@ from azure.core.exceptions import (
13
12
  ResourceNotFoundError,
14
13
  )
15
14
  from azure.identity.aio import DefaultAzureCredential
16
- from azure.storage.blob import ImmutabilityPolicy
15
+ from pydantic import ValidationError
17
16
 
18
17
  from boilermaker.exc import BoilermakerStorageError
19
18
  from boilermaker.storage import StorageInterface
@@ -50,7 +49,8 @@ class BlobClientStorage(AzureBlobStorageClient, StorageInterface):
50
49
  Returns:
51
50
  The loaded TaskGraph instance, or None if not found.
52
51
  Raises:
53
- ValidationError: If TaskGraph or TaskResultSlim data cannot be validated.
52
+ BoilermakerStorageError: If the blob cannot be loaded or if TaskGraph/TaskResultSlim
53
+ data cannot be validated.
54
54
  """
55
55
  if not graph_id:
56
56
  raise ValueError("`graph_id` must be provided to load a TaskGraph.")
@@ -70,28 +70,38 @@ class BlobClientStorage(AzureBlobStorageClient, StorageInterface):
70
70
  if graph_contents is None:
71
71
  return None
72
72
 
73
- graph = TaskGraph.model_validate_json(graph_contents)
73
+ try:
74
+ graph = TaskGraph.model_validate_json(graph_contents)
75
+ except ValidationError as e:
76
+ raise BoilermakerStorageError(
77
+ f"Failed to deserialize graph {graph_id}: {e}",
78
+ status_code=None,
79
+ ) from e
74
80
 
75
81
  # Load all TaskResultSlim instances associated with this graph
76
82
  # We don't want to load *all* return values into memory. Just the statuses.
77
83
  async for blob in self.list_blobs(prefix=graph_dir):
78
- tr = TaskResultSlim.model_validate_json(await self.download_blob(blob.name))
84
+ # DO NOT REDOWNLOAD GRAPH
85
+ if blob.name == graph_path:
86
+ continue
87
+ try:
88
+ tr = TaskResultSlim.model_validate_json(await self.download_blob(blob.name))
89
+ except ValidationError as e:
90
+ raise BoilermakerStorageError(
91
+ f"Failed to deserialize task result in graph {graph_id}: {e}",
92
+ status_code=None,
93
+ ) from e
79
94
  tr.etag = blob.etag
80
95
  if tr.graph_id == graph_id:
81
96
  graph.results[tr.task_id] = tr
82
97
  else:
83
- logger.warning(
84
- f"TaskResult {tr.task_id} in graph {graph_dir} with wrong graph_id {tr.graph_id}!"
85
- )
98
+ logger.warning(f"TaskResult {tr.task_id} in graph {graph_dir} with wrong graph_id {tr.graph_id}!")
86
99
  return graph
87
100
 
88
101
  async def store_graph(self, graph: TaskGraph) -> TaskGraph:
89
102
  """
90
103
  Stores a TaskGraph to Azure Blob Storage and stores all children as pending tasks as well.
91
104
 
92
- We use a lease on the container to make sure *only* one task is writing! This means
93
- that we don't have to worry about concurrent writes causing data corruption.
94
-
95
105
  We expect the *written graph* to be **immutable** (see the ImmutabilityPolicy below).
96
106
 
97
107
  Args:
@@ -99,35 +109,21 @@ class BlobClientStorage(AzureBlobStorageClient, StorageInterface):
99
109
  """
100
110
  lease = None
101
111
  async with self.get_blob_service_client() as blob_service_client:
102
- container_client = blob_service_client.get_container_client(
103
- self.container_name
104
- )
105
- lease = await container_client.acquire_lease()
106
- upload_kwargs = {
107
- "lease": lease,
108
- "blob_type": "BlockBlob",
109
- "immutability_policy": ImmutabilityPolicy(
110
- expiry_time=datetime.datetime.now(tz=datetime.UTC) + datetime.timedelta(hours=4),
111
- policy_mode="LOCKED",
112
- ),
113
- }
114
-
112
+ container_client = blob_service_client.get_container_client(self.container_name)
115
113
  # Store the graph itself first
116
114
  fname = f"{self.task_result_prefix}/{graph.storage_path}"
117
115
  try:
118
116
  _result = await container_client.upload_blob(
119
117
  fname,
120
118
  graph.model_dump_json(),
121
- **upload_kwargs,
119
+ blob_type="BlockBlob",
122
120
  )
123
121
  except (
124
122
  ResourceNotFoundError,
125
123
  HttpResponseError,
126
124
  ResourceExistsError,
127
125
  ) as exc:
128
- logger.error(
129
- f"Error occurred while storing TaskGraph {graph.graph_id}: {exc}"
130
- )
126
+ logger.error(f"Error occurred while storing TaskGraph {graph.graph_id}: {exc}")
131
127
  raise BoilermakerStorageError(
132
128
  f"Failed to store TaskGraph {graph.graph_id}",
133
129
  task_id=None,
@@ -139,15 +135,15 @@ class BlobClientStorage(AzureBlobStorageClient, StorageInterface):
139
135
  pending_result = None
140
136
  try:
141
137
  async with create_task_group() as tg:
138
+ # don't let any tasks that get ahead accidentally clobber us
142
139
  for pending_result in graph.generate_pending_results():
143
- fname_pr = (
144
- f"{self.task_result_prefix}/{pending_result.storage_path}"
145
- )
140
+ fname_pr = f"{self.task_result_prefix}/{pending_result.storage_path}"
141
+
146
142
  uploader = partial(
147
143
  container_client.upload_blob,
148
144
  fname_pr,
149
145
  pending_result.model_dump_json(),
150
- **upload_kwargs,
146
+ blob_type="BlockBlob",
151
147
  )
152
148
  tg.start_soon(uploader)
153
149
  except* Exception as excgroup:
@@ -180,15 +176,21 @@ class BlobClientStorage(AzureBlobStorageClient, StorageInterface):
180
176
  "graph_id": task_result.graph_id or "none",
181
177
  "status": task_result.status,
182
178
  }
183
- concurrency_kwargs: dict[str, str | int] = {}
179
+ concurrency_kwargs: dict[str, str | int | MatchConditions] = {}
184
180
  if etag:
185
181
  concurrency_kwargs["etag"] = etag
186
- concurrency_kwargs["if_match"] = MatchConditions.IfNotModified.value
182
+ concurrency_kwargs["match_condition"] = MatchConditions.IfNotModified
187
183
 
188
184
  try:
189
185
  await self.upload_blob(
190
186
  fname, task_result.model_dump_json(), tags=blob_tags, overwrite=True, **concurrency_kwargs
191
187
  )
188
+ # SAFETY: This catch assumes aio_azure_clients_toolbox raises AzureBlobError
189
+ # (wrapping HTTP 412 Precondition Failed) when an ETag mismatch occurs.
190
+ # This is the primary guard against concurrent double-scheduling of downstream
191
+ # tasks. Verified against aio-azure-clients-toolbox v1.0.4 (see uv.lock):
192
+ # get_blob_client() catches all HttpResponseError (including 412) and re-raises
193
+ # as AzureBlobError. If the library behavior changes, this guard will silently break.
192
194
  except AzureBlobError as exc:
193
195
  raise BoilermakerStorageError(
194
196
  f"Failed to store TaskResult {task_result.task_id}",
@@ -0,0 +1,20 @@
1
+ from .graph import LAST_ADDED, LastAddedSingleton, TaskChain, TaskGraph, TaskGraphBuilder
2
+ from .result import TaskResult, TaskResultSlim, TaskStatus
3
+ from .task import Task
4
+ from .task_id import GraphId, TaskId
5
+ from .types import TaskHandler
6
+
7
+ __all__ = [
8
+ "LAST_ADDED",
9
+ "LastAddedSingleton",
10
+ "TaskChain",
11
+ "TaskGraph",
12
+ "TaskGraphBuilder",
13
+ "TaskResult",
14
+ "TaskResultSlim",
15
+ "TaskStatus",
16
+ "Task",
17
+ "TaskId",
18
+ "GraphId",
19
+ "TaskHandler",
20
+ ]