soe-ai 0.1.1__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. soe/builtin_tools/__init__.py +39 -0
  2. soe/builtin_tools/soe_add_signal.py +82 -0
  3. soe/builtin_tools/soe_call_tool.py +111 -0
  4. soe/builtin_tools/soe_copy_context.py +80 -0
  5. soe/builtin_tools/soe_explore_docs.py +290 -0
  6. soe/builtin_tools/soe_get_available_tools.py +42 -0
  7. soe/builtin_tools/soe_get_context.py +50 -0
  8. soe/builtin_tools/soe_get_workflows.py +63 -0
  9. soe/builtin_tools/soe_inject_node.py +86 -0
  10. soe/builtin_tools/soe_inject_workflow.py +105 -0
  11. soe/builtin_tools/soe_list_contexts.py +73 -0
  12. soe/builtin_tools/soe_remove_node.py +72 -0
  13. soe/builtin_tools/soe_remove_workflow.py +62 -0
  14. soe/builtin_tools/soe_update_context.py +54 -0
  15. soe/docs/_config.yml +10 -0
  16. soe/docs/advanced_patterns/guide_fanout_and_aggregations.md +318 -0
  17. soe/docs/advanced_patterns/guide_inheritance.md +435 -0
  18. soe/docs/advanced_patterns/hybrid_intelligence.md +237 -0
  19. soe/docs/advanced_patterns/index.md +49 -0
  20. soe/docs/advanced_patterns/operational.md +781 -0
  21. soe/docs/advanced_patterns/self_evolving_workflows.md +385 -0
  22. soe/docs/advanced_patterns/swarm_intelligence.md +211 -0
  23. soe/docs/builtins/context.md +164 -0
  24. soe/docs/builtins/explore_docs.md +135 -0
  25. soe/docs/builtins/tools.md +164 -0
  26. soe/docs/builtins/workflows.md +199 -0
  27. soe/docs/guide_00_getting_started.md +341 -0
  28. soe/docs/guide_01_tool.md +206 -0
  29. soe/docs/guide_02_llm.md +143 -0
  30. soe/docs/guide_03_router.md +146 -0
  31. soe/docs/guide_04_patterns.md +475 -0
  32. soe/docs/guide_05_agent.md +159 -0
  33. soe/docs/guide_06_schema.md +397 -0
  34. soe/docs/guide_07_identity.md +540 -0
  35. soe/docs/guide_08_child.md +612 -0
  36. soe/docs/guide_09_ecosystem.md +690 -0
  37. soe/docs/guide_10_infrastructure.md +427 -0
  38. soe/docs/guide_11_builtins.md +118 -0
  39. soe/docs/index.md +104 -0
  40. soe/docs/primitives/backends.md +281 -0
  41. soe/docs/primitives/context.md +256 -0
  42. soe/docs/primitives/node_reference.md +259 -0
  43. soe/docs/primitives/primitives.md +331 -0
  44. soe/docs/primitives/signals.md +865 -0
  45. soe/docs_index.py +1 -1
  46. soe/lib/__init__.py +0 -0
  47. soe/lib/child_context.py +46 -0
  48. soe/lib/context_fields.py +51 -0
  49. soe/lib/inheritance.py +172 -0
  50. soe/lib/jinja_render.py +113 -0
  51. soe/lib/operational.py +51 -0
  52. soe/lib/parent_sync.py +71 -0
  53. soe/lib/register_event.py +75 -0
  54. soe/lib/schema_validation.py +134 -0
  55. soe/lib/yaml_parser.py +14 -0
  56. soe/local_backends/__init__.py +18 -0
  57. soe/local_backends/factory.py +124 -0
  58. soe/local_backends/in_memory/context.py +38 -0
  59. soe/local_backends/in_memory/conversation_history.py +60 -0
  60. soe/local_backends/in_memory/identity.py +52 -0
  61. soe/local_backends/in_memory/schema.py +40 -0
  62. soe/local_backends/in_memory/telemetry.py +38 -0
  63. soe/local_backends/in_memory/workflow.py +33 -0
  64. soe/local_backends/storage/context.py +57 -0
  65. soe/local_backends/storage/conversation_history.py +82 -0
  66. soe/local_backends/storage/identity.py +118 -0
  67. soe/local_backends/storage/schema.py +96 -0
  68. soe/local_backends/storage/telemetry.py +72 -0
  69. soe/local_backends/storage/workflow.py +56 -0
  70. soe/nodes/__init__.py +13 -0
  71. soe/nodes/agent/__init__.py +10 -0
  72. soe/nodes/agent/factory.py +134 -0
  73. soe/nodes/agent/lib/loop_handlers.py +150 -0
  74. soe/nodes/agent/lib/loop_state.py +157 -0
  75. soe/nodes/agent/lib/prompts.py +65 -0
  76. soe/nodes/agent/lib/tools.py +35 -0
  77. soe/nodes/agent/stages/__init__.py +12 -0
  78. soe/nodes/agent/stages/parameter.py +37 -0
  79. soe/nodes/agent/stages/response.py +54 -0
  80. soe/nodes/agent/stages/router.py +37 -0
  81. soe/nodes/agent/state.py +111 -0
  82. soe/nodes/agent/types.py +66 -0
  83. soe/nodes/agent/validation/__init__.py +11 -0
  84. soe/nodes/agent/validation/config.py +95 -0
  85. soe/nodes/agent/validation/operational.py +24 -0
  86. soe/nodes/child/__init__.py +3 -0
  87. soe/nodes/child/factory.py +61 -0
  88. soe/nodes/child/state.py +59 -0
  89. soe/nodes/child/validation/__init__.py +11 -0
  90. soe/nodes/child/validation/config.py +126 -0
  91. soe/nodes/child/validation/operational.py +28 -0
  92. soe/nodes/lib/conditions.py +71 -0
  93. soe/nodes/lib/context.py +24 -0
  94. soe/nodes/lib/conversation_history.py +77 -0
  95. soe/nodes/lib/identity.py +64 -0
  96. soe/nodes/lib/llm_resolver.py +142 -0
  97. soe/nodes/lib/output.py +68 -0
  98. soe/nodes/lib/response_builder.py +91 -0
  99. soe/nodes/lib/signal_emission.py +79 -0
  100. soe/nodes/lib/signals.py +54 -0
  101. soe/nodes/lib/tools.py +100 -0
  102. soe/nodes/llm/__init__.py +7 -0
  103. soe/nodes/llm/factory.py +103 -0
  104. soe/nodes/llm/state.py +76 -0
  105. soe/nodes/llm/types.py +12 -0
  106. soe/nodes/llm/validation/__init__.py +11 -0
  107. soe/nodes/llm/validation/config.py +89 -0
  108. soe/nodes/llm/validation/operational.py +23 -0
  109. soe/nodes/router/__init__.py +3 -0
  110. soe/nodes/router/factory.py +37 -0
  111. soe/nodes/router/state.py +32 -0
  112. soe/nodes/router/validation/__init__.py +11 -0
  113. soe/nodes/router/validation/config.py +58 -0
  114. soe/nodes/router/validation/operational.py +16 -0
  115. soe/nodes/tool/factory.py +66 -0
  116. soe/nodes/tool/lib/__init__.py +11 -0
  117. soe/nodes/tool/lib/conditions.py +35 -0
  118. soe/nodes/tool/lib/failure.py +28 -0
  119. soe/nodes/tool/lib/parameters.py +67 -0
  120. soe/nodes/tool/state.py +66 -0
  121. soe/nodes/tool/types.py +27 -0
  122. soe/nodes/tool/validation/__init__.py +15 -0
  123. soe/nodes/tool/validation/config.py +132 -0
  124. soe/nodes/tool/validation/operational.py +16 -0
  125. soe/validation/__init__.py +18 -0
  126. soe/validation/config.py +195 -0
  127. soe/validation/jinja.py +54 -0
  128. soe/validation/operational.py +110 -0
  129. {soe_ai-0.1.1.dist-info → soe_ai-0.1.2.dist-info}/METADATA +4 -4
  130. soe_ai-0.1.2.dist-info/RECORD +137 -0
  131. {soe_ai-0.1.1.dist-info → soe_ai-0.1.2.dist-info}/WHEEL +1 -1
  132. soe_ai-0.1.1.dist-info/RECORD +0 -10
  133. {soe_ai-0.1.1.dist-info → soe_ai-0.1.2.dist-info}/licenses/LICENSE +0 -0
  134. {soe_ai-0.1.1.dist-info → soe_ai-0.1.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,781 @@
1
+
2
+ # Appendix A: Operational Features
3
+
4
+ ## Introduction
5
+
6
+ SOE provides **operational context** and **infrastructure configurations** that give you fine-grained control over workflow execution. These features enable advanced patterns like:
7
+
8
+ - Waiting for multiple signals (AND logic)
9
+ - Limiting LLM calls
10
+ - Circuit breaker patterns
11
+ - Loop prevention
12
+ - Retry configurations
13
+
14
+ ## The Operational Context
15
+
16
+ Every workflow execution has a reserved `__operational__` namespace in context. This is **read-only** for your workflows but provides valuable runtime information.
17
+
18
+ ### Structure
19
+
20
+ ```python
21
+ context["__operational__"] = {
22
+ "signals": ["START", "TASK_A_DONE", ...], # All signals emitted
23
+ "nodes": {"NodeName": 3, ...}, # Execution count per node
24
+ "llm_calls": 5, # Total LLM calls
25
+ "tool_calls": 2, # Total tool calls
26
+ "errors": 0, # Total errors
27
+ "main_execution_id": "abc-123-...", # Root orchestration ID
28
+ }
29
+ ```
30
+
31
+ ### Main Execution ID
32
+
33
+ The `main_execution_id` is the root orchestration ID that persists across sub-orchestrations:
34
+
35
+ - For root workflows: `main_execution_id` equals the execution ID
36
+ - For child workflows: `main_execution_id` is inherited from the parent
37
+ - Used by conversation history to share state across the orchestration tree
38
+
39
+ This enables **persistent identity**, where children share conversation history with their parent (see the [Identity Guide](../guide_07_identity.md)).
40
+
41
+ ### Accessing in Jinja
42
+
43
+ Use `context.__operational__` in any condition:
44
+
45
+
46
+ ```yaml
47
+ condition: "{{ 'TASK_A_DONE' in context.__operational__.signals }}"
48
+ condition: "{{ context.__operational__.llm_calls < 10 }}"
49
+ condition: "{{ context.__operational__.tool_calls < 50 }}"
50
+ condition: "{{ context.__operational__.errors >= 3 }}"
51
+ condition: "{{ context.__operational__.nodes.get('MyNode', 0) < 5 }}"
52
+ ```
53
+
54
+
55
+ ---
56
+
57
+ ## broadcast_signals: Post-Execution Control
58
+
59
+ After `orchestrate()` returns, you can send additional signals to continue or manipulate the execution using `broadcast_signals`.
60
+
61
+ ### Understanding the Relationship
62
+
63
+ When you call `orchestrate()`, it:
64
+ 1. Generates a new `execution_id`
65
+ 2. Initializes clean operational context (`__operational__`)
66
+ 3. Runs the workflow until no more signals trigger nodes
67
+ 4. Returns the `execution_id`
68
+
69
+ The `broadcast_signals` function lets you send signals to that execution **after** `orchestrate()` returns.
70
+
71
+ ### Important: Avoid START with broadcast_signals
72
+
73
+ ```python
74
+ # ❌ WRONG - Don't use START with broadcast_signals
75
+ execution_id = orchestrate(
76
+ config=workflow,
77
+ initial_signals=["START"],
78
+ ...
79
+ )
80
+ broadcast_signals(execution_id, ["START"], nodes, backends) # BAD!
81
+ ```
82
+
83
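+ **Why this is wrong**: Sending `START` via `broadcast_signals` to an execution that has already processed `START` will double-process nodes and corrupt the operational context, because the operational counters were already incremented during the initial `orchestrate()` call. (Sending `START` to an execution that was created with `initial_signals=[]` is fine; see "Delayed Scheduling" below.)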
84
+
85
+ ### Proper Usage of broadcast_signals
86
+
87
+ ```python
88
+ # ✅ CORRECT - Use for continuation or specific signals
89
+ execution_id = orchestrate(
90
+ config=workflow,
91
+ initial_signals=["START"],
92
+ ...
93
+ )
94
+
95
+ # Later, send a specific signal to continue
96
+ broadcast_signals(execution_id, ["EXTERNAL_EVENT"], nodes, backends)
97
+ broadcast_signals(execution_id, ["RETRY_PHASE_2"], nodes, backends)
98
+ ```
99
+
100
+ ### Use Cases for broadcast_signals
101
+
102
+ 1. **Delayed Scheduling**: SOE is infrastructure-agnostic and doesn't include a scheduler. You can use any external scheduler by starting with no signals:
103
+ ```python
104
+ # Create the execution but don't start
105
+ execution_id = orchestrate(
106
+ config=workflow,
107
+ initial_signals=[], # No signals yet!
108
+ ...
109
+ )
110
+
111
+ # Later, via external scheduler (cron, AWS EventBridge, etc.)
112
+ broadcast_signals(execution_id, ["START"], nodes, backends)
113
+ ```
114
+
115
+ 2. **External Event Handling**: Continue a workflow based on external events:
116
+ ```python
117
+ # Workflow waiting for approval
118
+ broadcast_signals(execution_id, ["APPROVED"], nodes, backends)
119
+ ```
120
+
121
+ 3. **Retries and Remediation**: Trigger specific retry paths:
122
+ ```python
123
+ broadcast_signals(execution_id, ["RETRY_FAILED_STEP"], nodes, backends)
124
+ ```
125
+
126
+ ### For Clean Restarts, Use Inheritance
127
+
128
+ If you need to restart a workflow completely (fresh operational context), use [Config Inheritance](guide_inheritance.md) instead:
129
+
130
+ ```python
131
+ # Fresh execution inheriting config from previous run
132
+ new_execution_id = orchestrate(
133
+ config=None,
134
+ initial_signals=["START"],
135
+ initial_context={},
136
+ inherit_config_from_id=old_execution_id, # Reuse config
137
+ ...
138
+ )
139
+ ```
140
+
141
+ This creates a new `execution_id` with clean operational counters while reusing the workflow definitions.
142
+
143
+ ---
144
+
145
+ ## Wait for Multiple Signals (AND Logic)
146
+
147
+ By default, `event_triggers` uses OR logic—any listed signal triggers the node. To implement AND logic (wait for all signals), use a router with operational context:
148
+
149
+ ### The Pattern
150
+
151
+ ```yaml
152
+ example_workflow:
153
+ TaskA:
154
+ node_type: router
155
+ event_triggers: [START]
156
+ event_emissions:
157
+ - signal_name: A_DONE
158
+
159
+ TaskB:
160
+ node_type: router
161
+ event_triggers: [START]
162
+ event_emissions:
163
+ - signal_name: B_DONE
164
+
165
+ WaitForBoth:
166
+ node_type: router
167
+ event_triggers: [A_DONE, B_DONE]
168
+ event_emissions:
169
+ - signal_name: BOTH_COMPLETE
170
+ condition: "&#123;&#123; 'A_DONE' in context.__operational__.signals and 'B_DONE' in context.__operational__.signals &#125;&#125;"
171
+ - signal_name: WAITING
172
+ condition: "&#123;&#123; not ('A_DONE' in context.__operational__.signals and 'B_DONE' in context.__operational__.signals) &#125;&#125;"
173
+ ```
174
+
175
+ ### How It Works
176
+
177
+ 1. `TaskA` and `TaskB` both trigger on `START` (parallel execution).
178
+ 2. `WaitForBoth` triggers on either `A_DONE` OR `B_DONE`.
179
+ 3. Condition checks if BOTH signals are in `__operational__.signals`.
180
+ 4. First trigger: condition fails → emits `WAITING`.
181
+ 5. Second trigger: condition succeeds → emits `BOTH_COMPLETE`.
182
+
183
+ ## LLM Call Limiting
184
+
185
+ Control AI costs by checking `llm_calls`:
186
+
187
+ ### The Pattern
188
+
189
+ ```yaml
190
+ example_workflow:
191
+ FirstLLM:
192
+ node_type: llm
193
+ event_triggers: [START]
194
+ prompt: "First task: &#123;&#123; context.task &#125;&#125;"
195
+ output_field: firstResult
196
+ event_emissions:
197
+ - signal_name: FIRST_DONE
198
+
199
+ CheckLLMCount:
200
+ node_type: router
201
+ event_triggers: [FIRST_DONE]
202
+ event_emissions:
203
+ - signal_name: CONTINUE_LLM
204
+ condition: "&#123;&#123; context.__operational__.llm_calls < 3 &#125;&#125;"
205
+ - signal_name: LLM_LIMIT_REACHED
206
+ condition: "&#123;&#123; context.__operational__.llm_calls >= 3 &#125;&#125;"
207
+
208
+ SecondLLM:
209
+ node_type: llm
210
+ event_triggers: [CONTINUE_LLM]
211
+ prompt: "Second task based on: &#123;&#123; context.firstResult &#125;&#125;"
212
+ output_field: secondResult
213
+ event_emissions:
214
+ - signal_name: SECOND_DONE
215
+ ```
216
+
217
+ ### Use Cases
218
+
219
+ - **Budget control**: Stop after N LLM calls.
220
+ - **Rate limiting**: Prevent runaway agent loops.
221
+ - **Tiered processing**: Different paths based on usage.
222
+
223
+ ## Tool Call Limiting
224
+
225
+ Monitor and limit tool usage by checking `tool_calls`:
226
+
227
+ ### The Pattern
228
+
229
+ ```yaml
230
+ example_workflow:
231
+ FirstTool:
232
+ node_type: tool
233
+ event_triggers: [START]
234
+ tool_name: api_call
235
+ context_parameter_field: api_params
236
+ output_field: firstResult
237
+ event_emissions:
238
+ - signal_name: FIRST_DONE
239
+
240
+ CheckToolCount:
241
+ node_type: router
242
+ event_triggers: [FIRST_DONE]
243
+ event_emissions:
244
+ - signal_name: CONTINUE_TOOLS
245
+ condition: "&#123;&#123; context.__operational__.tool_calls < 10 &#125;&#125;"
246
+ - signal_name: TOOL_LIMIT_REACHED
247
+ condition: "&#123;&#123; context.__operational__.tool_calls >= 10 &#125;&#125;"
248
+
249
+ SecondTool:
250
+ node_type: tool
251
+ event_triggers: [CONTINUE_TOOLS]
252
+ tool_name: api_call
253
+ context_parameter_field: api_params
254
+ output_field: secondResult
255
+ event_emissions:
256
+ - signal_name: SECOND_DONE
257
+ ```
258
+
259
+ ### Use Cases
260
+
261
+ - **Rate limiting**: Prevent excessive API calls to external services.
262
+ - **Resource protection**: Limit database or file system operations.
263
+ - **Cost control**: Track tool usage for billing or quota management.
264
+
265
+ **Note**: `tool_calls` counts both standalone tool node executions and tool calls made by agent nodes.
266
+
267
+ ## Error Circuit Breaker
268
+
269
+ Implement a circuit breaker pattern using the `errors` count:
270
+
271
+ ### The Pattern
272
+
273
+ ```yaml
274
+ example_workflow:
275
+ ProcessData:
276
+ node_type: tool
277
+ event_triggers: [START]
278
+ tool_name: risky_operation
279
+ context_parameter_field: data
280
+ output_field: result
281
+ event_emissions:
282
+ - signal_name: SUCCESS
283
+
284
+ CheckErrors:
285
+ node_type: router
286
+ event_triggers: [FAILURE]
287
+ event_emissions:
288
+ - signal_name: RETRY
289
+ condition: "&#123;&#123; context.__operational__.errors < 3 &#125;&#125;"
290
+ - signal_name: CIRCUIT_OPEN
291
+ condition: "&#123;&#123; context.__operational__.errors >= 3 &#125;&#125;"
292
+
293
+ RetryHandler:
294
+ node_type: router
295
+ event_triggers: [RETRY]
296
+ event_emissions:
297
+ - signal_name: START
298
+ ```
299
+
300
+ ### How It Works
301
+
302
+ 1. `ProcessData` runs a risky tool.
303
+ 2. On failure, `CheckErrors` evaluates error count.
304
+ 3. Under threshold: emit `RETRY` → triggers `START` again.
305
+ 4. At or over threshold (`errors >= 3`): emit `CIRCUIT_OPEN` → stop retrying.
306
+
307
+ ## Loop Prevention
308
+
309
+ Prevent infinite loops by checking node execution count:
310
+
311
+ ### The Pattern
312
+
313
+ ```yaml
314
+ example_workflow:
315
+ LoopingNode:
316
+ node_type: router
317
+ event_triggers: [START, CONTINUE]
318
+ event_emissions:
319
+ - signal_name: CONTINUE
320
+ condition: "&#123;&#123; context.__operational__.nodes.get('LoopingNode', 0) < 5 &#125;&#125;"
321
+ - signal_name: LOOP_LIMIT_REACHED
322
+ condition: "&#123;&#123; context.__operational__.nodes.get('LoopingNode', 0) >= 5 &#125;&#125;"
323
+ ```
324
+
325
+ ### How It Works
326
+
327
+ 1. `LoopingNode` triggers on `START` or `CONTINUE`.
328
+ 2. Each execution increments `nodes.LoopingNode`.
329
+ 3. The condition checks whether the count has reached the limit.
330
+ 4. Under limit: emit `CONTINUE` (loop).
331
+ 5. At or over limit: emit `LOOP_LIMIT_REACHED` (break).
332
+
333
+ ## Retry Configuration
334
+
335
+ ### LLM Retries
336
+
337
+ LLM nodes support `retries` for handling validation failures:
338
+
339
+ ```yaml
340
+ example_workflow:
341
+ ReliableLLM:
342
+ node_type: llm
343
+ event_triggers: [START]
344
+ prompt: "Analyze: &#123;&#123; context.input &#125;&#125;"
345
+ output_field: result
346
+ retries: 5
347
+ event_emissions:
348
+ - signal_name: DONE
349
+ ```
350
+
351
+ When the LLM returns invalid JSON or fails Pydantic validation, SOE automatically retries up to `retries` times (default: 3).
352
+
353
+ ### LLM Failure Signal
354
+
355
+ When all retries are exhausted, the node raises an exception by default. Use `llm_failure_signal` to emit a signal instead, enabling graceful fallback:
356
+
357
+ ```yaml
358
+ example_workflow:
359
+ LLMWithFallback:
360
+ node_type: llm
361
+ event_triggers: [START]
362
+ prompt: "Analyze: &#123;&#123; context.input &#125;&#125;"
363
+ output_field: result
364
+ retries: 3
365
+ llm_failure_signal: LLM_FAILED
366
+ event_emissions:
367
+ - signal_name: DONE
368
+
369
+ HandleLLMFailure:
370
+ node_type: router
371
+ event_triggers: [LLM_FAILED]
372
+ event_emissions:
373
+ - signal_name: USE_FALLBACK
374
+ ```
375
+
376
+ This pattern enables:
377
+ - **Fallback paths**: Route to cached responses or simpler logic
378
+ - **Graceful degradation**: Continue workflow instead of crashing
379
+ - **Alerting**: Trigger notification workflows on failure
380
+
381
+ ### Agent Retries
382
+
383
+ Agent nodes also support `retries`:
384
+
385
+ ```yaml
386
+ example_workflow:
387
+ ReliableAgent:
388
+ node_type: agent
389
+ event_triggers: [START]
390
+ system_prompt: "You are a helpful assistant."
391
+ user_prompt: "Help with: &#123;&#123; context.request &#125;&#125;"
392
+ output_field: response
393
+ available_tools: [search]
394
+ retries: 2
395
+ event_emissions:
396
+ - signal_name: AGENT_DONE
397
+ ```
398
+
399
+ This controls how many times the agent's internal LLM calls retry on validation failure.
400
+
401
+ ### Agent Failure Signals
402
+
403
+ Agents emit the signal configured in `llm_failure_signal` when they exhaust all retries (terminal failure):
404
+
405
+ ```yaml
406
+ example_workflow:
407
+ RobustAgent:
408
+ node_type: agent
409
+ event_triggers: [START]
410
+ prompt: "Complete the task: &#123;&#123; context.task &#125;&#125;"
411
+ tools: [risky_operation]
412
+ output_field: result
413
+ retries: 3
414
+ llm_failure_signal: AGENT_EXHAUSTED
415
+ event_emissions:
416
+ - signal_name: DONE
417
+
418
+ HandleAgentExhausted:
419
+ node_type: router
420
+ event_triggers: [AGENT_EXHAUSTED]
421
+ event_emissions:
422
+ - signal_name: FALLBACK_REQUIRED
423
+ ```
424
+
425
+ **Note**: Tool failures are handled via the tool registry's `failure_signal` configuration (see Tool Retries below).
426
+
427
+ ### Tool Retries
428
+
429
+ For agent tools, retries are configured per-tool in the tools registry:
430
+
431
+ ```python
432
+ tools = [
433
+ {"function": risky_tool, "max_retries": 3},
434
+ {"function": reliable_tool, "max_retries": 0},
435
+ ]
436
+ ```
437
+
438
+ When a tool execution fails, the agent can retry up to `max_retries` times before reporting failure to the LLM.
439
+
440
+ ## Conditional Processing Based on State
441
+
442
+ Combine operational checks for smart routing:
443
+
444
+ ```yaml
445
+ example_workflow:
446
+ CheckState:
447
+ node_type: router
448
+ event_triggers: [START]
449
+ event_emissions:
450
+ - signal_name: NEEDS_LLM
451
+ condition: "&#123;&#123; context.__operational__.llm_calls == 0 &#125;&#125;"
452
+ - signal_name: USE_CACHED
453
+ condition: "&#123;&#123; context.__operational__.llm_calls > 0 &#125;&#125;"
454
+
455
+ CallLLM:
456
+ node_type: llm
457
+ event_triggers: [NEEDS_LLM]
458
+ prompt: "Process: &#123;&#123; context.input &#125;&#125;"
459
+ output_field: result
460
+ event_emissions:
461
+ - signal_name: COMPLETE
462
+
463
+ UseCached:
464
+ node_type: router
465
+ event_triggers: [USE_CACHED]
466
+ event_emissions:
467
+ - signal_name: COMPLETE
468
+ ```
469
+
470
+ ## Operational Context Fields Reference
471
+
472
+ | Field | Type | Description |
473
+ |-------|------|-------------|
474
+ | `signals` | `List[str]` | All signals emitted during execution |
475
+ | `nodes` | `Dict[str, int]` | Execution count per node name |
476
+ | `llm_calls` | `int` | Total LLM calls (LLM + Agent nodes) |
477
+ | `tool_calls` | `int` | Total tool calls (Tool nodes + Agent tool calls) |
478
+ | `errors` | `int` | Total errors encountered |
479
+ | `main_execution_id` | `str` | Root orchestration ID (persists to children) |
480
+
481
+ ## The Parent Context (`__parent__`)
482
+
483
+ Child workflows have a `__parent__` namespace in their context containing parent relationship metadata:
484
+
485
+ ```python
486
+ context["__parent__"] = {
487
+ "parent_execution_id": "parent-abc-123", # Immediate parent's execution ID
488
+ "main_execution_id": "root-abc-123", # Root orchestration ID
489
+ "signals_to_parent": ["DONE", "FAILED"], # Signals that propagate up
490
+ "context_updates_to_parent": ["result"], # Keys that sync to parent
491
+ }
492
+ ```
493
+
494
+ This is **read-only** and managed by SOE. It enables:
495
+ - Context updates propagating up the orchestration tree
496
+ - Signal forwarding from child to parent
497
+ - Shared conversation history across the entire tree
498
+
499
+ ## Infrastructure Guardrail Patterns
500
+
501
+ These patterns use routers as guardrails to control execution flow. They check operational context or external conditions **before** allowing expensive operations to proceed.
502
+
503
+ ### Execute Only Once
504
+
505
+ Prevent duplicate execution of expensive operations:
506
+
507
+ ```yaml
508
+ example_workflow:
509
+ OnceGuard:
510
+ node_type: router
511
+ event_triggers: [START, RETRY_REQUEST]
512
+ event_emissions:
513
+ - signal_name: PROCEED
514
+ condition: "&#123;&#123; context.__operational__.nodes.get('ExpensiveOperation', 0) == 0 &#125;&#125;"
515
+ - signal_name: ALREADY_EXECUTED
516
+ condition: "&#123;&#123; context.__operational__.nodes.get('ExpensiveOperation', 0) > 0 &#125;&#125;"
517
+
518
+ ExpensiveOperation:
519
+ node_type: tool
520
+ event_triggers: [PROCEED]
521
+ tool_name: expensive_api_call
522
+ context_parameter_field: api_params
523
+ output_field: api_result
524
+ event_emissions:
525
+ - signal_name: OPERATION_COMPLETE
526
+
527
+ SkipHandler:
528
+ node_type: router
529
+ event_triggers: [ALREADY_EXECUTED]
530
+ event_emissions:
531
+ - signal_name: OPERATION_COMPLETE
532
+ ```
533
+
534
+ **How It Works:**
535
+ 1. `OnceGuard` checks if `ExpensiveOperation` has already executed.
536
+ 2. First execution: `nodes.get('ExpensiveOperation', 0) == 0` → `PROCEED`.
537
+ 3. Subsequent triggers: `ALREADY_EXECUTED` → skip to handler.
538
+
539
+ **Use Cases:**
540
+ - Billing operations that must happen exactly once.
541
+ - Initialization tasks.
542
+ - Non-idempotent API calls (where repeating the call would duplicate its effect).
543
+
544
+ ### Health Check Guardrail
545
+
546
+ Validate external service health before proceeding:
547
+
548
+ ```yaml
549
+ example_workflow:
550
+ HealthCheckRouter:
551
+ node_type: router
552
+ event_triggers: [START]
553
+ event_emissions:
554
+ - signal_name: CHECK_SERVICE
555
+
556
+ ServiceHealthCheck:
557
+ node_type: tool
558
+ event_triggers: [CHECK_SERVICE]
559
+ tool_name: check_service_health
560
+ output_field: health_status
561
+ event_emissions:
562
+ - signal_name: HEALTH_CHECKED
563
+
564
+ HealthGuard:
565
+ node_type: router
566
+ event_triggers: [HEALTH_CHECKED]
567
+ event_emissions:
568
+ - signal_name: SERVICE_HEALTHY
569
+ condition: "&#123;&#123; context.health_status.is_healthy == true &#125;&#125;"
570
+ - signal_name: SERVICE_UNHEALTHY
571
+ condition: "&#123;&#123; context.health_status.is_healthy != true &#125;&#125;"
572
+
573
+ MainProcess:
574
+ node_type: llm
575
+ event_triggers: [SERVICE_HEALTHY]
576
+ prompt: "Process with healthy service: &#123;&#123; context.request &#125;&#125;"
577
+ output_field: result
578
+ event_emissions:
579
+ - signal_name: DONE
580
+
581
+ UnhealthyFallback:
582
+ node_type: router
583
+ event_triggers: [SERVICE_UNHEALTHY]
584
+ event_emissions:
585
+ - signal_name: DONE
586
+ ```
587
+
588
+ **How It Works:**
589
+ 1. Router triggers health check tool.
590
+ 2. Tool returns `health_status` with `is_healthy` field.
591
+ 3. Second router decides: healthy → proceed, unhealthy → fallback.
592
+
593
+ **Use Cases:**
594
+ - Check database connectivity before writes.
595
+ - Validate API availability before calls.
596
+ - Verify model endpoints before inference.
597
+
598
+ ### Rate Limiting
599
+
600
+ Throttle operations based on execution count:
601
+
602
+ ```yaml
603
+ example_workflow:
604
+ RateLimitGuard:
605
+ node_type: router
606
+ event_triggers: [REQUEST]
607
+ event_emissions:
608
+ - signal_name: ALLOWED
609
+ condition: "&#123;&#123; context.__operational__.nodes.get('APICall', 0) < context.rate_limit &#125;&#125;"
610
+ - signal_name: RATE_LIMITED
611
+ condition: "&#123;&#123; context.__operational__.nodes.get('APICall', 0) >= context.rate_limit &#125;&#125;"
612
+
613
+ APICall:
614
+ node_type: tool
615
+ event_triggers: [ALLOWED]
616
+ tool_name: external_api
617
+ context_parameter_field: api_params
618
+ output_field: api_response
619
+ event_emissions:
620
+ - signal_name: CALL_COMPLETE
621
+
622
+ RateLimitHandler:
623
+ node_type: router
624
+ event_triggers: [RATE_LIMITED]
625
+ event_emissions:
626
+ - signal_name: THROTTLED
627
+ ```
628
+
629
+ **How It Works:**
630
+ 1. Guard router checks if `APICall` count is under `rate_limit`.
631
+ 2. Under limit: `ALLOWED` → execute.
632
+ 3. At or over limit: `RATE_LIMITED` → throttle handler.
633
+
634
+ **Use Cases:**
635
+ - API rate limiting per execution.
636
+ - Cost control for LLM calls.
637
+ - Preventing runaway loops.
638
+
639
+ ### Kill Switch
640
+
641
+ Context-based execution suspension:
642
+
643
+ ```yaml
644
+ example_workflow:
645
+ KillSwitchGuard:
646
+ node_type: router
647
+ event_triggers: [START, CONTINUE]
648
+ event_emissions:
649
+ - signal_name: PROCEED
650
+ condition: "&#123;&#123; context.kill_switch != true &#125;&#125;"
651
+ - signal_name: SUSPENDED
652
+ condition: "&#123;&#123; context.kill_switch == true &#125;&#125;"
653
+
654
+ MainProcess:
655
+ node_type: llm
656
+ event_triggers: [PROCEED]
657
+ prompt: "Execute step: &#123;&#123; context.current_step &#125;&#125;"
658
+ output_field: step_result
659
+ event_emissions:
660
+ - signal_name: STEP_DONE
661
+
662
+ NextStep:
663
+ node_type: router
664
+ event_triggers: [STEP_DONE]
665
+ event_emissions:
666
+ - signal_name: CONTINUE
667
+ condition: "&#123;&#123; context.steps_remaining > 0 &#125;&#125;"
668
+ - signal_name: ALL_COMPLETE
669
+ condition: "&#123;&#123; context.steps_remaining <= 0 &#125;&#125;"
670
+
671
+ SuspendHandler:
672
+ node_type: router
673
+ event_triggers: [SUSPENDED]
674
+ event_emissions:
675
+ - signal_name: AWAITING_RESUME
676
+ ```
677
+
678
+ **How It Works:**
679
+ 1. Guard router checks `context.kill_switch` before each step.
680
+ 2. If `true`: emit `SUSPENDED`, execution stops.
681
+ 3. An external system can set `kill_switch` in context to suspend execution, then clear it and send a signal (e.g. `CONTINUE`) to resume.
682
+ 4. When resumed without kill switch: execution continues.
683
+
684
+ **Use Cases:**
685
+ - Emergency stop for runaway agents.
686
+ - Pause/resume long-running workflows.
687
+ - Admin override for production systems.
688
+
689
+ ### Production Guardrails (Combined Pattern)
690
+
691
+ Combine multiple guardrails for production-ready workflows:
692
+
693
+ ```yaml
694
+ example_workflow:
695
+ EntryGuard:
696
+ node_type: router
697
+ event_triggers: [START]
698
+ event_emissions:
699
+ - signal_name: CHECK_KILL_SWITCH
700
+
701
+ KillSwitchCheck:
702
+ node_type: router
703
+ event_triggers: [CHECK_KILL_SWITCH]
704
+ event_emissions:
705
+ - signal_name: CHECK_RATE
706
+ condition: "&#123;&#123; context.system_suspended != true &#125;&#125;"
707
+ - signal_name: SYSTEM_SUSPENDED
708
+ condition: "&#123;&#123; context.system_suspended == true &#125;&#125;"
709
+
710
+ RateLimitCheck:
711
+ node_type: router
712
+ event_triggers: [CHECK_RATE]
713
+ event_emissions:
714
+ - signal_name: CHECK_HEALTH
715
+ condition: "&#123;&#123; context.__operational__.nodes.get('CoreOperation', 0) < 100 &#125;&#125;"
716
+ - signal_name: RATE_EXCEEDED
717
+ condition: "&#123;&#123; context.__operational__.nodes.get('CoreOperation', 0) >= 100 &#125;&#125;"
718
+
719
+ HealthCheck:
720
+ node_type: tool
721
+ event_triggers: [CHECK_HEALTH]
722
+ tool_name: system_health_check
723
+ output_field: system_health
724
+ event_emissions:
725
+ - signal_name: HEALTH_RESULT
726
+
727
+ HealthDecision:
728
+ node_type: router
729
+ event_triggers: [HEALTH_RESULT]
730
+ event_emissions:
731
+ - signal_name: EXECUTE
732
+ condition: "&#123;&#123; context.system_health.ready == true &#125;&#125;"
733
+ - signal_name: SYSTEM_DEGRADED
734
+ condition: "&#123;&#123; context.system_health.ready != true &#125;&#125;"
735
+
736
+ CoreOperation:
737
+ node_type: llm
738
+ event_triggers: [EXECUTE]
739
+ prompt: "Process: &#123;&#123; context.request &#125;&#125;"
740
+ output_field: result
741
+ event_emissions:
742
+ - signal_name: DONE
743
+ ```
744
+
745
+ **The Guardrail Chain:**
746
+ 1. **Kill Switch Check** - Is the system suspended?
747
+ 2. **Rate Limit Check** - Are we under the limit?
748
+ 3. **Health Check** - Is the downstream service healthy?
749
+ 4. **Execute** - Only if all checks pass.
750
+
751
+ ## Infrastructure Configurations Reference
752
+
753
+ | Config | Node Types | Default | Description |
754
+ |--------|------------|---------|-------------|
755
+ | `retries` | LLM, Agent | 3 | Max validation retries for LLM response |
756
+ | `llm_failure_signal` | LLM, Agent | None | Signal to emit when all retries exhausted (instead of raising) |
757
+ | `max_retries` | Tool (in registry) | 1 | Max execution retries per tool |
758
+ | `failure_signal` | Tool (in registry) | None | Signal to emit when tool fails after all retries |
759
+
760
+ ## Best Practices
761
+
762
+ ### Do
763
+
764
+ - **Use operational context for control flow**: Circuit breakers, loop limits.
765
+ - **Check signals for AND logic**: `{{ 'A' in context.__operational__.signals and 'B' in context.__operational__.signals }}`.
766
+ - **Set retries appropriately**: Higher for unreliable LLMs, lower for deterministic.
767
+
768
+ ### Don't
769
+
770
+ - **Write to `__operational__`**: It's managed by SOE.
771
+ - **Rely on exact node execution counts**: Implementation may vary.
772
+ - **Use operational context for business logic**: Keep it for infrastructure decisions.
773
+
774
+ ## Key Points
775
+
776
+ - **`__operational__`** is a read-only namespace with runtime metadata.
777
+ - **AND logic** for signals requires a router checking `__operational__.signals`.
778
+ - **`llm_calls`**, **`tool_calls`**, and **`errors`** enable cost control, rate limiting, and circuit breakers.
779
+ - **`nodes`** counts enable loop prevention.
780
+ - **`retries`** config controls LLM validation retry attempts.
781
+ - **Failure signals** (`llm_failure_signal` for nodes, `failure_signal` for tools) enable graceful error handling instead of exceptions.