nullrun 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,319 @@
1
+ from enum import Enum
2
+ from typing import Any
3
+
4
+
5
class BreakerError(Exception):
    """Root of the Breaker SDK exception hierarchy.

    Carries no state of its own; it exists so that SDK errors deriving
    from it can be caught with a single ``except BreakerError`` clause.
    """
8
+
9
+
10
class TransportErrorSource(str, Enum):
    """Classification of where a transport failure originated.

    The classification is surfaced to the caller so that the
    `decision_source` audit trail can tell "server said block" apart
    from "server did not respond" -- ADR-008 has the full rationale.

    The same values flow through `decision_source` on the `execute` /
    `check` return dicts when the transport layer degrades to a
    fallback instead of raising.

    Members are ``str`` instances whose value equals their name.
    """

    # Connection-level failure: httpx.ConnectError, timeout, DNS.
    NETWORK_ERROR = "NETWORK_ERROR"
    # The gateway answered with a 5xx status.
    GATEWAY_ERROR = "GATEWAY_ERROR"
    # The circuit breaker tripped.
    BREAKER_OPEN = "BREAKER_OPEN"
    # The gateway answered with 401 / 403.
    AUTH_ERROR = "AUTH_ERROR"
26
+
27
+
28
class NullRunTransportError(BreakerError):
    """Signal that the transport layer could not reach the policy engine.

    Each instance records the failure classification (``source``) and
    the ``endpoint`` that failed, which lets callers pick an
    endpoint-specific recovery -- e.g. fail-CLOSED for sensitive tools,
    fail-OPEN for budget pre-checks -- per ADR-008.

    This exception replaces the earlier behavior of swallowing the
    failure and returning a synthetic ``allow`` / ``block`` response.
    That approach hid policy-engine outages from operators and was the
    root cause of bug #1 / #2 fixed in ADR-008.

    Attributes:
        source: The ``TransportErrorSource`` given to the constructor.
        endpoint: The endpoint string given to the constructor.
        details: Dict holding any extra keyword arguments.
    """

    def __init__(
        self,
        message: str,
        source: TransportErrorSource,
        endpoint: str,
        **details: Any,
    ) -> None:
        """Record the failure context and build the exception message.

        Args:
            message: Short description of what went wrong.
            source: Failure classification; its ``.value`` is embedded
                in the rendered message.
            endpoint: The endpoint that failed.
            **details: Free-form extra context, stored on ``details``
                and included in the rendered message.
        """
        self.source, self.endpoint, self.details = source, endpoint, details
        rendered = (
            f"Transport error on {endpoint}: {message} "
            f"(source={source.value}, details={details})"
        )
        super().__init__(rendered)
55
+
56
+
57
class RateLimitError(NullRunTransportError):
    """Raised when the gateway returns HTTP 429 with a ``Retry-After``
    header (or JSON body field).

    Phase 4: this is a subclass of ``NullRunTransportError`` so that an
    existing ``except NullRunTransportError`` keeps catching it. It
    exposes ``retry_after`` (seconds) and ``upgrade_url`` so callers
    can schedule a retry or surface a billing upgrade prompt.

    Attributes:
        retry_after: Seconds the server asks the client to wait before
            retrying. ``None`` when no ``Retry-After`` header.
        upgrade_url: Plan-upgrade URL from the 429 body. ``None`` when
            the response did not include one.
        body: Parsed JSON body (gateway's ``error`` / ``message``);
            an empty dict when no body (or an empty one) was supplied.
    """

    def __init__(
        self,
        message: str,
        source: TransportErrorSource,
        endpoint: str,
        retry_after: float | None = None,
        upgrade_url: str | None = None,
        body: dict[str, Any] | None = None,
        **details: Any,
    ) -> None:
        """Store the rate-limit fields and delegate to the parent.

        ``retry_after`` and ``upgrade_url`` are also copied into
        ``details`` when they are not ``None``, so they show up in the
        message rendered by the parent constructor.
        """
        self.retry_after = retry_after
        self.upgrade_url = upgrade_url
        self.body = body if body else {}

        # Same insertion order as the fields are declared, so the
        # rendered ``details`` dict is stable.
        typed_fields = (("retry_after", retry_after), ("upgrade_url", upgrade_url))
        for key, value in typed_fields:
            if value is not None:
                details.setdefault(key, value)

        super().__init__(message, source, endpoint, **details)
91
+
92
+
93
class BreakerTransportError(BreakerError):
    """
    Raised when the transport layer fails and events cannot be delivered.

    Indicates a critical transport failure in which events are being
    dropped after exceeding retry limits. The caller must handle this
    exception - events are NOT silently lost.

    Use cases:
    - After max_retries consecutive flush failures
    - Transport buffer full and circuit breaker triggered
    - Network connectivity issues preventing delivery

    Applications should implement retry logic or an alerting mechanism
    when this exception is raised, as budget protection may be
    compromised.

    Attributes:
        events_lost: Value of the ``events_lost`` constructor argument.
        buffer_size: Value of the ``buffer_size`` constructor argument.
        details: Dict holding any extra keyword arguments.
    """

    def __init__(
        self,
        message: str,
        events_lost: int = 0,
        buffer_size: int = 0,
        **details: Any,
    ) -> None:
        """Record the delivery-failure context and build the message.

        Args:
            message: Short description of the failure.
            events_lost: Number of events reported as lost (default 0).
            buffer_size: Reported transport buffer size (default 0).
            **details: Free-form extra context.
        """
        self.events_lost, self.buffer_size = events_lost, buffer_size
        self.details = details
        rendered = (
            f"Transport error: {message} "
            f"(events_lost={events_lost}, buffer_size={buffer_size}, details={details})"
        )
        super().__init__(rendered)
123
+
124
+
125
class InsecureTransportError(BreakerTransportError):
    """Raised when SDK is configured with insecure HTTP (non-localhost).

    Adds no behavior of its own; the constructor and attributes are
    inherited from ``BreakerTransportError``.
    """
128
+
129
+
130
class NullRunAuthenticationError(BreakerError):
    """
    Raised when authentication fails and safe mode is required.

    Signals that the SDK could not authenticate with the NullRun
    backend and will not operate in unprotected mode. Applications
    should handle this exception and provide valid credentials.

    Attributes:
        message: The message passed to the constructor (also the
            exception's string form).
    """

    def __init__(self, message: str):
        """Initialise the exception with *message* and keep it on ``message``."""
        super().__init__(message)
        self.message = message
141
+
142
+
143
class NullRunBlockedException(BreakerError):
    """
    Raised when the NullRun circuit breaker trips.

    This is the client-side enforcement exception: it stops a runaway
    agent immediately, without waiting for a network roundtrip to the
    backend.

    Use cases:
    - Budget exceeded
    - Loop detected (>6 same tool calls)
    - Retry storm (>5 retries)
    - Rate limit exceeded

    Attributes:
        workflow_id: Workflow that was blocked (may be a sentinel like
            "<unknown>" when the block fires outside a workflow context,
            e.g. the sensitive-tool pre-check).
        reason: Human-readable explanation of why the block fired.
        action: One of "block" / "kill" / "pause" — the suggested
            downstream action.
        tool_name: Optional name of the tool that triggered the block.
            Exposed as a first-class attribute (not just `details`) so
            cookbook examples and audit pipelines can read
            `exc.tool_name` without indexing into `**details`.
            `None` when the block is workflow-scoped rather than
            tool-scoped.
        details: Free-form structured payload forwarded by the caller.
    """

    def __init__(
        self,
        workflow_id: str,
        reason: str,
        action: str = "block",
        tool_name: str | None = None,
        **details: Any,
    ) -> None:
        """Record the block context and build the exception message.

        The ``tool=...`` fragment is added to the message only when
        ``tool_name`` is truthy.
        """
        self.workflow_id = workflow_id
        self.reason = reason
        self.action = action
        self.tool_name = tool_name
        self.details = details

        tool_fragment = ""
        if tool_name:
            tool_fragment = f", tool={tool_name}"

        rendered = (
            f"Workflow {workflow_id} blocked: {reason} "
            f"(action={action}{tool_fragment}, details={details})"
        )
        super().__init__(rendered)
190
+
191
+
192
+ # NOTE (Sprint 2.2): the following six exception classes were removed
193
+ # in 0.4.0 because they had no callers in the SDK or in any
194
+ # test. They were zombie public surface — defined but never raised.
195
+ # If a real use case emerges in the future, they should be re-added
196
+ # with at least one in-tree caller and a regression test that
197
+ # exercises the raise path:
198
+ # - CostLimitExceeded
199
+ # - ApprovalRequired
200
+ # - BreakerTimeout
201
+ # - LoopDetectedException
202
+ # - RetryStormException
203
+ # - RateLimitExceededException
204
+
205
+
206
class WorkflowPausedException(BreakerError):
    """
    Raised when workflow is paused by NullRun.

    This allows the workflow to be resumed later after
    human approval or automatic cooldown.

    Attributes:
        workflow_id: The workflow that was paused.
        reason: Explanation of why the pause happened.
        resume_after: Delay in seconds before the workflow may resume,
            or ``None`` when no delay was supplied.
    """

    def __init__(self, workflow_id: str, reason: str, resume_after: float | None = None) -> None:
        """Record the pause context and build the exception message.

        Args:
            workflow_id: The workflow that was paused.
            reason: Explanation of why the pause happened.
            resume_after: Optional delay in seconds. When given
                (including ``0``), it is appended to the message.
        """
        self.workflow_id = workflow_id
        self.reason = reason
        self.resume_after = resume_after
        msg = f"Workflow {workflow_id} paused: {reason}"
        # Compare against None rather than relying on truthiness: a
        # delay of 0 is a real value and must not be dropped from the
        # message as if it had never been supplied.
        if resume_after is not None:
            msg += f" (resume after {resume_after}s)"
        super().__init__(msg)
222
+
223
+
224
class WorkflowKilledException(BaseException):
    """
    DEPRECATED. Use :class:`WorkflowKilledInterrupt` instead.

    Kept for backward compatibility: this class is the *parent* of
    :class:`WorkflowKilledInterrupt`, so user code that does
    ``except WorkflowKilledException`` will still catch the new raises
    (``except X`` matches subclasses of ``X`` — and the new class is
    a subclass of this one).

    A ``DeprecationWarning`` is emitted on construction. The class will
    be removed in a future major release; migrate new code to
    :class:`WorkflowKilledInterrupt` and update existing
    ``except WorkflowKilledException`` clauses to
    ``except WorkflowKilledInterrupt``, or, if recovery is impossible,
    let the exception propagate to the top of the loop.

    This class is **not** an ``Exception`` subclass — kill is a
    non-recoverable signal and should not be caught by generic
    ``except Exception`` clauses. Only ``except BaseException`` or the
    explicit ``except WorkflowKilledInterrupt`` reliably stops the work.
    See ``docs/kill-contract.md`` §6 for the full rationale.

    Attributes:
        workflow_id: The workflow that was killed.
        reason: The reason supplied to the constructor.
    """

    def __init__(self, workflow_id: str, reason: str) -> None:
        """Warn about the deprecated name, then record the kill context.

        Args:
            workflow_id: The workflow that was killed.
            reason: Why the workflow was killed.
        """
        import warnings as _w
        _w.warn(
            "WorkflowKilledException is deprecated. Catch "
            "WorkflowKilledInterrupt (BaseException) instead. The class "
            "is preserved for backward-compatible `except` clauses but "
            "will be removed in a future major release.",
            DeprecationWarning,
            # Attribute the warning to the code constructing the exception.
            stacklevel=2,
        )
        self.workflow_id = workflow_id
        self.reason = reason
        super().__init__(f"Workflow {workflow_id} killed: {reason}")

    def __reduce__(self) -> tuple[Any, ...]:
        """Support ``copy`` and ``pickle`` round-trips.

        ``BaseException.__reduce__`` rebuilds an instance from
        ``self.args``, which here holds only the single formatted
        message. The constructor needs ``workflow_id`` and ``reason``,
        so the default reconstruction raises ``TypeError``. Rebuild from
        the original constructor arguments instead, and carry the
        instance dict along so extra attributes survive.

        Reconstruction runs ``__init__`` again, so rebuilding an
        instance of this deprecated class emits its
        ``DeprecationWarning`` once more.
        """
        return (type(self), (self.workflow_id, self.reason), dict(vars(self)))
261
+
262
+
263
class WorkflowKilledInterrupt(WorkflowKilledException):
    """
    Raised when a workflow is killed by the NullRun control plane.

    Derives from the deprecated :class:`WorkflowKilledException`
    (itself a ``BaseException`` subclass, not ``Exception``), which
    gives the following catch semantics:

    * ``except WorkflowKilledInterrupt`` (new code) catches new raises
      and only new raises.
    * ``except WorkflowKilledException`` (legacy user code) still
      catches new raises — back-compat.
    * ``except Exception`` does **not** catch this signal — kill is
      not a recoverable error. This mirrors the ``KeyboardInterrupt`` /
      ``SystemExit`` pattern from the standard library: user code
      that catches ``except Exception`` and re-runs the work would
      otherwise silently bypass the kill.
    * ``except BaseException`` catches it, like the stdlib interrupts.

    See ``docs/kill-contract.md`` §6 for the full rationale, including
    the four-level coverage model and the decision tree for users.

    Fields:
        workflow_id: The workflow that was killed.
        reason: Server-supplied reason (e.g. "killed via API",
            "budget exhausted", "circuit-breaker tripped").

    Catching in production
    ----------------------
    Because ``WorkflowKilledInterrupt`` is a ``BaseException`` subclass
    (NOT ``Exception``), a user-agent ``try / except Exception`` will
    not catch it. That is intentional — the kill signal must reach the
    top of the loop. A consequence is that Sentry / OpenTelemetry
    default error handlers (which filter on ``Exception``) will not
    record the kill event unless the user's code captures it under an
    ``except BaseException`` and re-raises:

        from sentry_sdk import capture_exception
        try:
            agent.run()
        except BaseException:
            capture_exception()  # records kill, ctrl-c, system-exit
            raise

    ``except Exception`` will swallow non-kill errors but let the
    kill through. ``except BaseException`` captures everything
    including the kill — recommended for the top of an agent loop.
    """

    def __init__(self, workflow_id: str, reason: str) -> None:
        """Record the kill context without running the parent constructor.

        The parent's ``__init__`` is skipped on purpose, so building the
        canonical class does NOT emit the parent's DeprecationWarning:
        the deprecation concerns the old *name*, not the
        BaseException-based hierarchy.
        """
        rendered = f"Workflow {workflow_id} killed: {reason}"
        self.workflow_id, self.reason = workflow_id, reason
        BaseException.__init__(self, rendered)
nullrun/context.py ADDED
@@ -0,0 +1,208 @@
1
+ """
2
+ Context management for NullRun SDK.
3
+
4
+ Provides workflow and trace context for automatic event correlation.
5
+
6
+ Sprint 2.7 (B27): the previously-defined ``_organization_id_var`` /
7
+ ``_api_key_id_var`` contextvars and the ``get_organization_id`` /
8
+ ``get_api_key_id`` getters were removed because:
9
+ 1. No code path ever wrote to them — both getters always
10
+ returned ``None``.
11
+ 2. ``observability.TenantFilter`` (the only consumer) was
12
+ removed in 0.3.1.
13
+ 3. The structured-logging tenant-isolation feature moved to
14
+ the backend in the same release.
15
+
16
+ If a future use case appears (e.g. per-API-key rate isolation),
17
+ re-introduce the contextvars AND a setter API (token-based like
18
+ ``set_attempt_index``) AND wire them in ``NullRunRuntime.__init__``
19
+ from the ``_authenticate`` response.
20
+ """
21
+
22
+ import uuid
23
+ from collections.abc import Generator
24
+ from contextlib import contextmanager
25
+ from contextvars import ContextVar
26
+
27
# Context variables for workflow/trace propagation.
# All of them are module-private. The identifier variables default to
# ``None`` (nothing active) and the attempt index defaults to 0.
_workflow_id_var: ContextVar[str | None] = ContextVar("workflow_id", default=None)
_trace_id_var: ContextVar[str | None] = ContextVar("trace_id", default=None)
_span_id_var: ContextVar[str | None] = ContextVar("span_id", default=None)
_agent_id_var: ContextVar[str | None] = ContextVar("agent_id", default=None)
# Retry correlation counter; read via get_attempt_index().
_attempt_index_var: ContextVar[int] = ContextVar("attempt_index", default=0)
33
+
34
+
35
+ # =============================================================================
36
+ # Workflow / trace getters
37
+ # =============================================================================
38
+
39
+
40
def get_workflow_id() -> str | None:
    """Return the workflow ID held in the current context.

    Returns:
        The active workflow ID, or ``None`` when none has been set.
    """
    active_workflow = _workflow_id_var.get()
    return active_workflow
43
+
44
+
45
def get_trace_id() -> str | None:
    """Return the trace ID held in the current context.

    Returns:
        The active trace ID, or ``None`` when none has been set.
    """
    active_trace = _trace_id_var.get()
    return active_trace
48
+
49
+
50
def get_span_id() -> str | None:
    """Return the span ID held in the current context.

    Returns:
        The active span ID, or ``None`` when none has been set.
    """
    active_span = _span_id_var.get()
    return active_span
53
+
54
+
55
def get_agent_id() -> str | None:
    """Return the agent ID held in the current context.

    Returns:
        The active agent ID, or ``None`` when none has been set.
    """
    active_agent = _agent_id_var.get()
    return active_agent
58
+
59
+
60
def get_attempt_index() -> int:
    """Return the attempt index held in the current context.

    Used for retry correlation.

    Returns:
        The current attempt index; ``0`` when none has been set.
    """
    current_attempt = _attempt_index_var.get()
    return current_attempt
63
+
64
+
65
def set_attempt_index(index: int) -> None:
    """Store *index* as the current attempt index (retry correlation).

    The token returned by the underlying ``ContextVar.set`` call is
    discarded, so this function offers no way to restore the previous
    value. Use the ``attempt`` context manager for scoped changes.

    Args:
        index: The attempt index to make current.
    """
    # Deliberately ignore the reset token; see the docstring.
    _ = _attempt_index_var.set(index)
68
+
69
+
70
def generate_trace_id() -> str:
    """Create a fresh trace ID.

    The result is a real UUID4 in canonical dashed form (e.g.
    ``95ca7c0b-8334-478a-af23-2788803ef3b8``). The backend's
    `cost_events.trace_id` column is uuid-typed, so the wire value must
    parse as a UUID. An earlier release shipped ``f"trace-{hex[:16]}"``,
    which silently dropped to NULL on insert because the handler's
    `Uuid::parse_str(...).ok()` returned None.

    Returns:
        The string form of a newly generated UUID4.
    """
    fresh = uuid.uuid4()
    return str(fresh)
80
+
81
+
82
def generate_span_id() -> str:
    """Create a fresh span ID.

    Like ``generate_trace_id``, the value is a real UUID4 in canonical
    dashed string form.

    Returns:
        The string form of a newly generated UUID4.
    """
    fresh = uuid.uuid4()
    return str(fresh)
85
+
86
+
87
@contextmanager
def workflow(name: str | None = None) -> Generator[str, None, None]:
    """
    Context manager that opens a workflow scope.

    For the duration of the ``with`` block the context holds the
    workflow_id (the given *name*, or an auto-generated one) together
    with a newly generated trace ID. Both are restored to their
    previous values on exit, including when the body raises. The SDK
    intends track() calls made inside the block to pick up this
    workflow_id automatically.

    Usage:
        from nullrun import workflow

        with workflow("my-agent"):
            # All events here auto-tagged with workflow_id
            track({"type": "llm_call", ...})
            agent.invoke(...)

    Args:
        name: Optional workflow name. When omitted (or empty) a UUID4
            string is generated instead.

    Yields:
        The workflow_id string
    """
    # Phase 5 #5.6: auto-generated ids are real UUID4s with dashes,
    # matching ``generate_trace_id``. The previous ``wf-{hex32}`` format
    # was inconsistent with the rest of the SDK's id generation.
    workflow_id = name if name else str(uuid.uuid4())
    trace_id = generate_trace_id()

    # ``set`` hands back tokens that remember the prior values.
    previous_workflow = _workflow_id_var.set(workflow_id)
    previous_trace = _trace_id_var.set(trace_id)
    try:
        yield workflow_id
    finally:
        # Put back whatever was active before this scope was entered.
        _workflow_id_var.reset(previous_workflow)
        _trace_id_var.reset(previous_trace)
125
+
126
+
127
@contextmanager
def span(name: str | None = None) -> Generator[str, None, None]:
    """
    Context manager that opens a span inside a workflow.

    The span ID is the given *name*, or a generated one when *name* is
    omitted (or empty). It is held in the context for the duration of
    the ``with`` block and the previous value is restored on exit.

    Usage:
        with workflow("my-agent"):
            with span("llm-call"):
                result = llm.invoke(prompt)
                track({"type": "llm_call", ...})

    Args:
        name: Optional span ID to use instead of a generated one.

    Yields:
        The span_id string
    """
    span_id = name if name else generate_span_id()
    previous_span = _span_id_var.set(span_id)
    try:
        yield span_id
    finally:
        _span_id_var.reset(previous_span)
145
+
146
+
147
@contextmanager
def agent(name: str | None = None) -> Generator[str, None, None]:
    """
    Context manager that opens an agent scope inside a workflow.

    The agent_id is the given *name*, or ``agent-<uuid4 hex>`` when
    *name* is omitted (or empty). It is held in the context for the
    duration of the ``with`` block and the previous value is restored
    on exit. The SDK intends track() calls made inside the block to
    pick up this agent_id for per-agent cost attribution.

    Usage:
        from nullrun import workflow, agent, track

        with workflow("my-workflow"):
            with agent("my-agent"):
                # All events here auto-tagged with agent_id
                track({"type": "llm_call", ...})
                my_agent.invoke(...)

    Args:
        name: Optional agent name/ID. Auto-generated if not provided.

    Yields:
        The agent_id string
    """
    agent_id = name if name else f"agent-{uuid.uuid4().hex}"
    previous_agent = _agent_id_var.set(agent_id)
    try:
        yield agent_id
    finally:
        _agent_id_var.reset(previous_agent)
178
+
179
+
180
@contextmanager
def attempt(attempt_index: int) -> Generator[int, None, None]:
    """
    Context manager that opens an attempt scope (retry correlation).

    The given attempt_index is held in the context for the duration of
    the ``with`` block and the previous value is restored on exit. The
    SDK intends track() calls made inside the block to include the
    attempt_index, linking retries to the same ExecutionAttempt in the
    backend.

    Usage:
        from nullrun import workflow, attempt, track

        with workflow("my-workflow"):
            for attempt_index in range(retries):
                with attempt(attempt_index):
                    track({"type": "llm_call", ...})
                    llm.invoke(prompt)

    Args:
        attempt_index: The attempt index (0 = first attempt, 1 = first retry, etc.)

    Yields:
        The attempt_index
    """
    previous_attempt = _attempt_index_var.set(attempt_index)
    try:
        yield attempt_index
    finally:
        _attempt_index_var.reset(previous_attempt)