nullrun 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nullrun/__init__.py +282 -0
- nullrun/__version__.py +4 -0
- nullrun/actions.py +455 -0
- nullrun/breaker/__init__.py +27 -0
- nullrun/breaker/circuit_breaker.py +402 -0
- nullrun/breaker/exceptions.py +319 -0
- nullrun/context.py +208 -0
- nullrun/decorators.py +649 -0
- nullrun/instrumentation/__init__.py +23 -0
- nullrun/instrumentation/_safe_patch.py +99 -0
- nullrun/instrumentation/auto.py +1095 -0
- nullrun/instrumentation/auto_requests.py +257 -0
- nullrun/instrumentation/autogen.py +163 -0
- nullrun/instrumentation/crewai.py +140 -0
- nullrun/instrumentation/langgraph.py +412 -0
- nullrun/instrumentation/llama_index.py +110 -0
- nullrun/observability.py +160 -0
- nullrun/py.typed +0 -0
- nullrun/runtime.py +1806 -0
- nullrun/toolbox/__init__.py +20 -0
- nullrun/toolbox/langgraph.py +94 -0
- nullrun/tracing.py +155 -0
- nullrun/transport.py +1509 -0
- nullrun/transport_websocket.py +627 -0
- nullrun-0.4.0.dist-info/METADATA +194 -0
- nullrun-0.4.0.dist-info/RECORD +28 -0
- nullrun-0.4.0.dist-info/WHEEL +4 -0
- nullrun-0.4.0.dist-info/licenses/LICENSE +201 -0
|
@@ -0,0 +1,1095 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Vendor-independent auto-instrumentation for NullRun SDK.
|
|
3
|
+
|
|
4
|
+
Phase D of the hardening plan: a single `nullrun.init(api_key=...)` call should
|
|
5
|
+
track every LLM call regardless of vendor. The user does not need to remember
|
|
6
|
+
to call `patch_openai()` or wire callbacks.
|
|
7
|
+
|
|
8
|
+
Three observation paths feed a single sink (`runtime.track`):
|
|
9
|
+
|
|
10
|
+
1. **httpx transport hook** — covers ~95% of LLM traffic. Every major vendor
|
|
11
|
+
SDK (openai, anthropic, mistral, google-genai, cohere) uses httpx under
|
|
12
|
+
the hood. The transport intercepts the response, picks an extractor by
|
|
13
|
+
URL host, and emits a `llm_call` event with raw usage.
|
|
14
|
+
|
|
15
|
+
2. **LangChain callback** — covers in-memory mock providers and callback-only
|
|
16
|
+
flows that do not hit the network.
|
|
17
|
+
|
|
18
|
+
3. **OpenAI Agents SDK tracer** — covers the `agents` package which has its
|
|
19
|
+
own tracing model.
|
|
20
|
+
|
|
21
|
+
Dedup happens at the `runtime.track` sink via a small LRU keyed by
|
|
22
|
+
`(host, body_hash)` — see `NullRunRuntime._seen_track_fingerprints`. Multiple
|
|
23
|
+
observation paths for the same LLM call collapse to a single
|
|
24
|
+
`/api/v1/track` POST.
|
|
25
|
+
|
|
26
|
+
Streaming handling: OpenAI v1.0+ (and friends) send `usage` only in the
|
|
27
|
+
final SSE chunk. The async transport accumulates chunks and runs the
|
|
28
|
+
extractor on the full buffer before forwarding. This is a deliberate UX
|
|
29
|
+
trade-off: streaming users get a buffered body so we can see the final
|
|
30
|
+
chunk, but the response content is identical.
|
|
31
|
+
|
|
32
|
+
For non-streaming responses (the common case) we read the complete body once and
|
|
33
|
+
return a reconstructed Response carrying the same decoded bytes — no visible UX change.
|
|
34
|
+
"""
|
|
35
|
+
|
|
36
|
+
from __future__ import annotations
|
|
37
|
+
|
|
38
|
+
import hashlib
|
|
39
|
+
import json
|
|
40
|
+
import logging
|
|
41
|
+
import threading
|
|
42
|
+
from collections import OrderedDict
|
|
43
|
+
from collections.abc import Callable
|
|
44
|
+
from typing import Any
|
|
45
|
+
|
|
46
|
+
import httpx
|
|
47
|
+
|
|
48
|
+
from nullrun.instrumentation.langgraph import NullRunCallback
|
|
49
|
+
|
|
50
|
+
logger = logging.getLogger(__name__)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
# ---------------------------------------------------------------------------
|
|
54
|
+
# D1: URL-keyed extractor table
|
|
55
|
+
# ---------------------------------------------------------------------------
|
|
56
|
+
# Each extractor receives the response body bytes + status code. It returns
|
|
57
|
+
# None when the body has no usage information (streaming mid-flight, non-LLM
|
|
58
|
+
# endpoint sharing the host, error response, etc.). The transport only emits
|
|
59
|
+
# a track event when the extractor returns a non-None dict.
|
|
60
|
+
|
|
61
|
+
# Normalised usage record returned by every extractor in this module. The
# extractors populate "prompt_tokens", "completion_tokens", "total_tokens"
# and "model" (the model value is whatever the provider payload carried and
# may be None).
ExtractedUsage = dict[str, Any]
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _openai_extractor(body: bytes, status: int) -> ExtractedUsage | None:
|
|
65
|
+
"""OpenAI / Azure OpenAI / Mistral / Ollama (OpenAI-compat) response shape.
|
|
66
|
+
|
|
67
|
+
Mistral and Ollama (when serving OpenAI-compat) follow the same schema:
|
|
68
|
+
response.usage.{prompt_tokens, completion_tokens, total_tokens}.
|
|
69
|
+
"""
|
|
70
|
+
if status >= 400 or not body:
|
|
71
|
+
return None
|
|
72
|
+
try:
|
|
73
|
+
payload = json.loads(body)
|
|
74
|
+
except (json.JSONDecodeError, ValueError):
|
|
75
|
+
return None
|
|
76
|
+
usage = payload.get("usage") if isinstance(payload, dict) else None
|
|
77
|
+
if not isinstance(usage, dict):
|
|
78
|
+
return None
|
|
79
|
+
prompt = int(usage.get("prompt_tokens", 0) or 0)
|
|
80
|
+
completion = int(usage.get("completion_tokens", 0) or 0)
|
|
81
|
+
total = int(usage.get("total_tokens", 0) or 0)
|
|
82
|
+
if total == 0 and (prompt or completion):
|
|
83
|
+
total = prompt + completion
|
|
84
|
+
if prompt == 0 and completion == 0 and total == 0:
|
|
85
|
+
return None
|
|
86
|
+
return {
|
|
87
|
+
"prompt_tokens": prompt,
|
|
88
|
+
"completion_tokens": completion,
|
|
89
|
+
"total_tokens": total,
|
|
90
|
+
"model": payload.get("model"),
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _anthropic_extractor(body: bytes, status: int) -> ExtractedUsage | None:
|
|
95
|
+
"""Anthropic Messages API response shape.
|
|
96
|
+
|
|
97
|
+
response.usage.{input_tokens, output_tokens}.
|
|
98
|
+
"""
|
|
99
|
+
if status >= 400 or not body:
|
|
100
|
+
return None
|
|
101
|
+
try:
|
|
102
|
+
payload = json.loads(body)
|
|
103
|
+
except (json.JSONDecodeError, ValueError):
|
|
104
|
+
return None
|
|
105
|
+
usage = payload.get("usage") if isinstance(payload, dict) else None
|
|
106
|
+
if not isinstance(usage, dict):
|
|
107
|
+
return None
|
|
108
|
+
inp = int(usage.get("input_tokens", 0) or 0)
|
|
109
|
+
out = int(usage.get("output_tokens", 0) or 0)
|
|
110
|
+
if inp == 0 and out == 0:
|
|
111
|
+
return None
|
|
112
|
+
return {
|
|
113
|
+
"prompt_tokens": inp,
|
|
114
|
+
"completion_tokens": out,
|
|
115
|
+
"total_tokens": inp + out,
|
|
116
|
+
"model": payload.get("model"),
|
|
117
|
+
}
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def _gemini_extractor(body: bytes, status: int) -> ExtractedUsage | None:
|
|
121
|
+
"""Google Gemini (Generative Language API) response shape.
|
|
122
|
+
|
|
123
|
+
response.usageMetadata.{promptTokenCount, candidatesTokenCount, totalTokenCount}.
|
|
124
|
+
"""
|
|
125
|
+
if status >= 400 or not body:
|
|
126
|
+
return None
|
|
127
|
+
try:
|
|
128
|
+
payload = json.loads(body)
|
|
129
|
+
except (json.JSONDecodeError, ValueError):
|
|
130
|
+
return None
|
|
131
|
+
usage = payload.get("usageMetadata") if isinstance(payload, dict) else None
|
|
132
|
+
if not isinstance(usage, dict):
|
|
133
|
+
return None
|
|
134
|
+
prompt = int(usage.get("promptTokenCount", 0) or 0)
|
|
135
|
+
completion = int(usage.get("candidatesTokenCount", 0) or 0)
|
|
136
|
+
total = int(usage.get("totalTokenCount", 0) or 0)
|
|
137
|
+
if prompt == 0 and completion == 0 and total == 0:
|
|
138
|
+
return None
|
|
139
|
+
return {
|
|
140
|
+
"prompt_tokens": prompt,
|
|
141
|
+
"completion_tokens": completion,
|
|
142
|
+
"total_tokens": total or (prompt + completion),
|
|
143
|
+
"model": payload.get("modelVersion"),
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def _cohere_extractor(body: bytes, status: int) -> ExtractedUsage | None:
|
|
148
|
+
"""Cohere v2 response shape.
|
|
149
|
+
|
|
150
|
+
response.usage.{tokens, input_tokens, output_tokens}.
|
|
151
|
+
Note: Cohere streaming has no usage in stream — only non-streaming
|
|
152
|
+
responses carry it. Documented in the plan.
|
|
153
|
+
"""
|
|
154
|
+
if status >= 400 or not body:
|
|
155
|
+
return None
|
|
156
|
+
try:
|
|
157
|
+
payload = json.loads(body)
|
|
158
|
+
except (json.JSONDecodeError, ValueError):
|
|
159
|
+
return None
|
|
160
|
+
usage = payload.get("usage") if isinstance(payload, dict) else None
|
|
161
|
+
if not isinstance(usage, dict):
|
|
162
|
+
return None
|
|
163
|
+
# v2 uses input_tokens/output_tokens; v1 used prompt_tokens/completion_tokens.
|
|
164
|
+
inp = int(
|
|
165
|
+
usage.get("input_tokens", 0) or usage.get("prompt_tokens", 0) or 0
|
|
166
|
+
)
|
|
167
|
+
out = int(
|
|
168
|
+
usage.get("output_tokens", 0) or usage.get("completion_tokens", 0) or 0
|
|
169
|
+
)
|
|
170
|
+
total = int(usage.get("tokens", 0) or 0) or (inp + out)
|
|
171
|
+
if total == 0 and inp == 0 and out == 0:
|
|
172
|
+
return None
|
|
173
|
+
return {
|
|
174
|
+
"prompt_tokens": inp,
|
|
175
|
+
"completion_tokens": out,
|
|
176
|
+
"total_tokens": total,
|
|
177
|
+
"model": payload.get("model"),
|
|
178
|
+
}
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def _bedrock_extractor(body: bytes, status: int) -> ExtractedUsage | None:
|
|
182
|
+
"""AWS Bedrock InvokeModel response shape.
|
|
183
|
+
|
|
184
|
+
Bedrock returns JSON whose usage is either top-level (`inputTokens` /
|
|
185
|
+
`outputTokens` on Anthropic-on-Bedrock) or nested under `usage`. We
|
|
186
|
+
handle both, since model adapter shapes vary.
|
|
187
|
+
"""
|
|
188
|
+
if status >= 400 or not body:
|
|
189
|
+
return None
|
|
190
|
+
try:
|
|
191
|
+
payload = json.loads(body)
|
|
192
|
+
except (json.JSONDecodeError, ValueError):
|
|
193
|
+
return None
|
|
194
|
+
if not isinstance(payload, dict):
|
|
195
|
+
return None
|
|
196
|
+
# Top-level (Anthropic-on-Bedrock, Mistral-on-Bedrock)
|
|
197
|
+
usage = payload.get("usage") if isinstance(payload.get("usage"), dict) else None
|
|
198
|
+
if usage is None:
|
|
199
|
+
# Some adapters put inputTokens/outputTokens at the top level
|
|
200
|
+
if "inputTokens" in payload or "outputTokens" in payload:
|
|
201
|
+
usage = payload
|
|
202
|
+
if not isinstance(usage, dict):
|
|
203
|
+
return None
|
|
204
|
+
inp = int(
|
|
205
|
+
usage.get("inputTokens", 0)
|
|
206
|
+
or usage.get("input_tokens", 0)
|
|
207
|
+
or 0
|
|
208
|
+
)
|
|
209
|
+
out = int(
|
|
210
|
+
usage.get("outputTokens", 0)
|
|
211
|
+
or usage.get("output_tokens", 0)
|
|
212
|
+
or 0
|
|
213
|
+
)
|
|
214
|
+
total = int(usage.get("totalTokens", 0) or 0) or (inp + out)
|
|
215
|
+
if inp == 0 and out == 0 and total == 0:
|
|
216
|
+
return None
|
|
217
|
+
return {
|
|
218
|
+
"prompt_tokens": inp,
|
|
219
|
+
"completion_tokens": out,
|
|
220
|
+
"total_tokens": total,
|
|
221
|
+
"model": payload.get("modelId") or payload.get("model"),
|
|
222
|
+
}
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
# Host -> extractor table consulted by `_match_extractor`, first by exact
# host and then by "." + key suffix.
# Order matters for suffix matching: more specific suffixes first.
PROVIDER_EXTRACTORS: dict[str, Callable[[bytes, int], ExtractedUsage | None]] = {
    "api.openai.com": _openai_extractor,
    "openai.azure.com": _openai_extractor,  # Azure OpenAI
    "api.mistral.ai": _openai_extractor,  # Mistral uses OpenAI-compat
    "api.anthropic.com": _anthropic_extractor,
    "generativelanguage.googleapis.com": _gemini_extractor,
    "api.cohere.com": _cohere_extractor,  # current Cohere API host
    "api.cohere.ai": _cohere_extractor,  # legacy Cohere API host
    "bedrock-runtime.amazonaws.com": _bedrock_extractor,
}
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def _match_extractor(host: str) -> Callable[[bytes, int], ExtractedUsage | None] | None:
    """Return the extractor for `host`, or None if the host is not a known
    LLM endpoint.

    Matching order:

    1. exact host (e.g. ``api.openai.com``);
    2. any subdomain of a known host (e.g. ``eu.api.openai.com`` still hits
       the OpenAI extractor);
    3. regional AWS Bedrock endpoints, whose region sits *between* the
       service label and the domain
       (``bedrock-runtime.us-east-1.amazonaws.com``) and therefore cannot
       match by suffix.
    """
    if not host:
        return None
    exact = PROVIDER_EXTRACTORS.get(host)
    if exact is not None:
        return exact
    # Subdomain match: a.b.openai.com still goes to the OpenAI extractor.
    for suffix, fn in PROVIDER_EXTRACTORS.items():
        if host.endswith("." + suffix):
            return fn
    # Regional / FIPS Bedrock runtime endpoints.
    service, _, rest = host.partition(".")
    if service in ("bedrock-runtime", "bedrock-runtime-fips") and rest.endswith(
        (".amazonaws.com", ".amazonaws.com.cn")
    ):
        return PROVIDER_EXTRACTORS.get("bedrock-runtime.amazonaws.com")
    return None
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
def _check_kill_before_send(runtime: Any, request: httpx.Request) -> None:
|
|
255
|
+
"""
|
|
256
|
+
L2 of the kill contract (see docs/kill-contract.md §2).
|
|
257
|
+
|
|
258
|
+
Pre-request gate: inspects the cached remote state for the workflow
|
|
259
|
+
bound to the current context / API key. If the workflow has been
|
|
260
|
+
killed (or paused) by the control plane, raise BEFORE the request
|
|
261
|
+
reaches the network — so a kill that lands between two LLM calls in
|
|
262
|
+
a long-running agent loop is honored on the *next* iteration, not
|
|
263
|
+
silently deferred until the next @protect entry or /track.
|
|
264
|
+
|
|
265
|
+
No-ops when:
|
|
266
|
+
- runtime is missing
|
|
267
|
+
- the request host is not a known LLM provider (out of scope)
|
|
268
|
+
- no workflow can be resolved (no active context, no API key binding)
|
|
269
|
+
- the cached state is anything other than Killed / Paused
|
|
270
|
+
|
|
271
|
+
Note: prior to T3-S2 (0.3.0) this also short-circuited in
|
|
272
|
+
`local_mode` (no api_key). The local_mode branch is gone because
|
|
273
|
+
api_key is now required at runtime construction — every runtime
|
|
274
|
+
has a remote control plane to consult.
|
|
275
|
+
|
|
276
|
+
Raises:
|
|
277
|
+
WorkflowKilledInterrupt: state == "Killed"
|
|
278
|
+
WorkflowPausedException: state == "Paused"
|
|
279
|
+
"""
|
|
280
|
+
if runtime is None:
|
|
281
|
+
return
|
|
282
|
+
# Defensive: test doubles (and any duck-typed runtime) may not
|
|
283
|
+
# implement `_resolve_workflow_id`. Skip the kill check silently
|
|
284
|
+
# rather than crashing the user's transport hook.
|
|
285
|
+
if not hasattr(runtime, "_resolve_workflow_id"):
|
|
286
|
+
return
|
|
287
|
+
# Phase 5 #5.8: the kill check is independent of which LLM host
|
|
288
|
+
# the user is talking to. Previously the check was gated on the
|
|
289
|
+
# extractor table, so a custom LLM endpoint silently bypassed the
|
|
290
|
+
# dashboard KILL switch. The kill state lives in `_remote_states`,
|
|
291
|
+
# which is keyed by workflow, not by host.
|
|
292
|
+
workflow_id = runtime._resolve_workflow_id(None)
|
|
293
|
+
if not workflow_id:
|
|
294
|
+
return
|
|
295
|
+
state = runtime._remote_state_for(workflow_id) if hasattr(runtime, "_remote_state_for") else getattr(runtime, "_remote_states", {}).get(workflow_id, {})
|
|
296
|
+
state_name = state.get("state", "Normal")
|
|
297
|
+
if state_name == "Killed":
|
|
298
|
+
from nullrun.breaker.exceptions import WorkflowKilledInterrupt
|
|
299
|
+
raise WorkflowKilledInterrupt(
|
|
300
|
+
workflow_id=workflow_id,
|
|
301
|
+
reason=state.get("reason", "remote kill"),
|
|
302
|
+
)
|
|
303
|
+
if state_name == "Paused":
|
|
304
|
+
from nullrun.breaker.exceptions import WorkflowPausedException
|
|
305
|
+
raise WorkflowPausedException(
|
|
306
|
+
workflow_id=workflow_id,
|
|
307
|
+
reason=state.get("reason", "remote pause"),
|
|
308
|
+
resume_after=None,
|
|
309
|
+
)
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
# ---------------------------------------------------------------------------
|
|
313
|
+
# D2: httpx transport hook
|
|
314
|
+
# ---------------------------------------------------------------------------
|
|
315
|
+
# The transport wraps the user's underlying transport (e.g. the default
|
|
316
|
+
# httpx transport). For every request, it consults the extractor table by
|
|
317
|
+
# host. If the host is a known LLM provider, the response body is consumed
|
|
318
|
+
# once, the extractor runs, and a fresh Response is returned with the same
|
|
319
|
+
# body bytes — callers see no behavioural change.
|
|
320
|
+
|
|
321
|
+
# NOTE (Sprint 2.3): the ``_STREAMING_CONTENT_TYPES`` constant was
|
|
322
|
+
# defined here but only consumed in ``auto_requests.py`` (same
|
|
323
|
+
# constant is re-defined there). The streaming branch in the
|
|
324
|
+
# httpx transport wrapper does not actually consult this table;
|
|
325
|
+
# it just reads the body and lets the extractors return ``None``
|
|
326
|
+
# for non-usage bodies. The constant is deleted to avoid the
|
|
327
|
+
# false impression that this module has streaming-specific
|
|
328
|
+
# behaviour. See auto.py module docstring §"Streaming".
|
|
329
|
+
|
|
330
|
+
class NullRunSyncTransport(httpx.BaseTransport):
    """Synchronous httpx transport that emits a `llm_call` event for known
    LLM provider responses.

    Wraps an inner transport. Requests to hosts without a registered
    extractor pass straight through. For known hosts the response body is
    read once, handed to the extractor, and a fresh Response carrying the
    same bytes is returned to the caller.
    """

    def __init__(
        self,
        inner: httpx.BaseTransport,
        runtime: Any,
    ) -> None:
        # Transport that actually performs the request.
        self._inner = inner
        # Runtime used for the kill check and as the `track` sink.
        self._runtime = runtime

    def handle_request(self, request: httpx.Request) -> httpx.Response:
        """Send `request` through the inner transport, tracking LLM usage.

        Raises whatever `_check_kill_before_send` raises (kill / pause) and
        whatever the inner transport raises. Failures in body reading,
        usage extraction or tracking are logged at debug level and never
        surface to the caller.
        """
        _check_kill_before_send(self._runtime, request)
        host = request.url.host
        extractor = _match_extractor(host)
        if extractor is None:
            return self._inner.handle_request(request)
        response = self._inner.handle_request(request)
        try:
            body = response.read()
        except Exception as e:  # pragma: no cover — defensive
            logger.debug("NullRun transport: failed to read body: %s", e)
            return response
        if not body:
            return response
        try:
            usage = extractor(body, response.status_code)
        except Exception as e:
            # Telemetry must never break the user's request: an extractor
            # bug or an unexpected payload shape counts as "no usage".
            logger.debug("NullRun transport: usage extraction failed: %s", e)
            usage = None
        if usage is not None:
            self._emit(request, host, usage, body, response.status_code)
        # Reconstruct the response so callers can still consume the body.
        return self._rebuild(response, body, request)

    @staticmethod
    def _rebuild(
        response: httpx.Response,
        body: bytes,
        request: httpx.Request,
    ) -> httpx.Response:
        """Build a fresh Response around the already-read `body`."""
        # `response.read()` above consumed the streamed body — and httpx
        # transparently decompresses gzip/br/zstd during that read. We
        # MUST strip the encoding header on the rebuilt response, otherwise
        # the downstream caller (e.g. openai/httpx) sees `content-encoding:
        # gzip` and tries to decompress an already-decompressed body,
        # raising `zlib.error: Error -3 while decompressing data:
        # incorrect header check`. content-length also has to be recomputed
        # against the post-decompression byte count.
        req = getattr(response, "_request", None) or request
        headers = response.headers.copy()
        # Phase 6 #6.2: also strip Transfer-Encoding so downstream
        # HTTP clients (and httpx itself) don't try to chunk-decode
        # an already-buffered body. httpx.Headers lookups are
        # case-insensitive, so the lowercase names cover every spelling.
        for name in ("content-encoding", "transfer-encoding"):
            if name in headers:
                del headers[name]
        if "content-length" in headers:
            headers["content-length"] = str(len(body))
        return httpx.Response(
            status_code=response.status_code,
            headers=headers,
            content=body,
            request=req,
            extensions=response.extensions,
        )

    def _emit(
        self,
        request: httpx.Request,
        host: str,
        usage: ExtractedUsage,
        body: bytes,
        status: int,
    ) -> None:
        """Send one `llm_call` event to `runtime.track`; never raises."""
        try:
            self._runtime.track(
                {
                    "type": "llm_call",
                    "provider": _provider_label(host),
                    "host": host,
                    "model": usage.get("model"),
                    "tokens": usage.get("total_tokens", 0),
                    "input_tokens": usage.get("prompt_tokens", 0),
                    "output_tokens": usage.get("completion_tokens", 0),
                    "has_usage": True,
                    "raw_usage": usage,
                    # Fingerprint for dedup at the track() sink.
                    "_fingerprint": _fingerprint_for(host, body, status),
                }
            )
        except Exception as e:
            logger.debug("NullRun transport: track failed: %s", e)

    def close(self) -> None:
        """Close the inner transport, logging (not raising) on failure."""
        try:
            self._inner.close()
        except Exception as e:  # pragma: no cover — defensive
            logger.debug("NullRun transport: inner close failed: %s", e)
|
|
439
|
+
|
|
440
|
+
|
|
441
|
+
class NullRunAsyncTransport(httpx.AsyncBaseTransport):
    """Asynchronous httpx transport. Mirrors `NullRunSyncTransport` for
    async httpx clients. The body is consumed in a single pass via
    `response.aread()`; for streamed responses, awaiting the body
    accumulates chunks so the complete buffer is visible to the extractor.
    """

    def __init__(
        self,
        inner: httpx.AsyncBaseTransport,
        runtime: Any,
    ) -> None:
        # Transport that actually performs the request.
        self._inner = inner
        # Runtime used for the kill check and as the `track` sink.
        self._runtime = runtime

    async def handle_async_request(self, request: httpx.Request) -> httpx.Response:
        """Send `request` through the inner transport, tracking LLM usage.

        Raises whatever `_check_kill_before_send` raises (kill / pause) and
        whatever the inner transport raises. Failures in body reading,
        usage extraction or tracking are logged at debug level and never
        surface to the caller.
        """
        _check_kill_before_send(self._runtime, request)
        host = request.url.host
        extractor = _match_extractor(host)
        if extractor is None:
            return await self._inner.handle_async_request(request)
        response = await self._inner.handle_async_request(request)
        try:
            body = await response.aread()
        except Exception as e:  # pragma: no cover — defensive
            logger.debug("NullRun transport: failed to read async body: %s", e)
            return response
        if not body:
            return response
        try:
            usage = extractor(body, response.status_code)
        except Exception as e:
            # Telemetry must never break the user's request: an extractor
            # bug or an unexpected payload shape counts as "no usage".
            logger.debug("NullRun transport: async usage extraction failed: %s", e)
            usage = None
        if usage is not None:
            self._emit(request, host, usage, body, response.status_code)
        # Reconstruct the response so callers can still consume the body.
        return self._rebuild(response, body, request)

    @staticmethod
    def _rebuild(
        response: httpx.Response,
        body: bytes,
        request: httpx.Request,
    ) -> httpx.Response:
        """Build a fresh Response around the already-read `body`."""
        # `response.aread()` consumed the streamed body, and httpx
        # transparently decompresses gzip/br/zstd during that read.
        # Without stripping content-encoding, the async openai/anthropic
        # clients re-decompress the already-decompressed body and raise
        # zlib.error. content-length is recomputed against the
        # post-decompression byte count for the same reason.
        req = getattr(response, "_request", None) or request
        headers = response.headers.copy()
        # Phase 6 #6.2: also strip Transfer-Encoding so downstream
        # HTTP clients (and httpx itself) don't try to chunk-decode
        # an already-buffered body. httpx.Headers lookups are
        # case-insensitive, so the lowercase names cover every spelling.
        for name in ("content-encoding", "transfer-encoding"):
            if name in headers:
                del headers[name]
        if "content-length" in headers:
            headers["content-length"] = str(len(body))
        return httpx.Response(
            status_code=response.status_code,
            headers=headers,
            content=body,
            request=req,
            extensions=response.extensions,
        )

    def _emit(
        self,
        request: httpx.Request,
        host: str,
        usage: ExtractedUsage,
        body: bytes,
        status: int,
    ) -> None:
        """Send one `llm_call` event to `runtime.track`; never raises."""
        try:
            self._runtime.track(
                {
                    "type": "llm_call",
                    "provider": _provider_label(host),
                    "host": host,
                    "model": usage.get("model"),
                    "tokens": usage.get("total_tokens", 0),
                    "input_tokens": usage.get("prompt_tokens", 0),
                    "output_tokens": usage.get("completion_tokens", 0),
                    "has_usage": True,
                    "raw_usage": usage,
                    # Fingerprint for dedup at the track() sink.
                    "_fingerprint": _fingerprint_for(host, body, status),
                }
            )
        except Exception as e:
            logger.debug("NullRun transport: async track failed: %s", e)

    async def aclose(self) -> None:
        """Close the inner transport, logging (not raising) on failure."""
        try:
            await self._inner.aclose()
        except Exception as e:  # pragma: no cover — defensive
            logger.debug("NullRun transport: inner aclose failed: %s", e)
|
|
547
|
+
|
|
548
|
+
|
|
549
|
+
def _provider_label(host: str) -> str:
|
|
550
|
+
"""Map a host to a short provider label for the `provider` event field."""
|
|
551
|
+
if "openai" in host:
|
|
552
|
+
return "openai"
|
|
553
|
+
if "anthropic" in host:
|
|
554
|
+
return "anthropic"
|
|
555
|
+
if "mistral" in host:
|
|
556
|
+
return "mistral"
|
|
557
|
+
if "googleapis" in host:
|
|
558
|
+
return "gemini"
|
|
559
|
+
if "cohere" in host:
|
|
560
|
+
return "cohere"
|
|
561
|
+
if "bedrock" in host or "amazonaws" in host:
|
|
562
|
+
return "bedrock"
|
|
563
|
+
return host or "unknown"
|
|
564
|
+
|
|
565
|
+
|
|
566
|
+
def _fingerprint_for(host: str, body: bytes, status: int) -> str:
|
|
567
|
+
"""Stable fingerprint for dedup. `sha256(host|status|body)[:16]` is
|
|
568
|
+
collision-resistant enough at the dedup-LRU scale (≤ a few hundred
|
|
569
|
+
entries) and short enough to keep memory bounded.
|
|
570
|
+
"""
|
|
571
|
+
h = hashlib.sha256()
|
|
572
|
+
h.update(host.encode("utf-8"))
|
|
573
|
+
h.update(b"|")
|
|
574
|
+
h.update(str(status).encode("ascii"))
|
|
575
|
+
h.update(b"|")
|
|
576
|
+
h.update(body)
|
|
577
|
+
return h.hexdigest()[:16]
|
|
578
|
+
|
|
579
|
+
|
|
580
|
+
def _fingerprint_for_event_dict(event: dict[str, Any]) -> str:
|
|
581
|
+
"""Stable fingerprint for a generic event dict.
|
|
582
|
+
|
|
583
|
+
Phase 3 of the production-readiness plan: ``runtime.track_event``
|
|
584
|
+
was the only emit path that did NOT set ``_fingerprint``, so two
|
|
585
|
+
observers firing for the same LLM call (the user's manual
|
|
586
|
+
``track_event`` plus the httpx transport hook) produced two
|
|
587
|
+
``/track`` POSTs. This helper gives the dedup LRU a stable key
|
|
588
|
+
derived from the event's content.
|
|
589
|
+
"""
|
|
590
|
+
try:
|
|
591
|
+
payload = json.dumps(event, sort_keys=True, default=str).encode("utf-8")
|
|
592
|
+
except (TypeError, ValueError):
|
|
593
|
+
payload = repr(event).encode("utf-8")
|
|
594
|
+
h = hashlib.sha256()
|
|
595
|
+
h.update(b"event|")
|
|
596
|
+
h.update(payload)
|
|
597
|
+
return h.hexdigest()[:16]
|
|
598
|
+
|
|
599
|
+
|
|
600
|
+
# ---------------------------------------------------------------------------
|
|
601
|
+
# D3: patch_httpx — idempotent __init__ wrap
|
|
602
|
+
# ---------------------------------------------------------------------------
|
|
603
|
+
# We wrap httpx.Client.__init__ / httpx.AsyncClient.__init__ so that ANY
|
|
604
|
+
# subsequent client construction automatically gets the NullRun transport
|
|
605
|
+
# applied to the user's chosen transport. This means the user does not need
|
|
606
|
+
# to do anything special — `openai.OpenAI(http_client=httpx.Client())` will
|
|
607
|
+
# be auto-instrumented.
|
|
608
|
+
|
|
609
|
+
# Module-level patch state, guarded by `_httpx_lock`.
_httpx_patched = False
_httpx_lock = threading.Lock()
# Originals are stashed on first patch so `reset_for_tests` can fully
# restore httpx.Client / AsyncClient to the un-patched state. Without
# this, a second `patch_httpx` would no-op (class marker still set)
# AND the closure inside the existing wrap would still reference the
# first runtime — silently losing track() calls from later test runs.
_orig_sync_init: Callable[..., Any] | None = None
_orig_async_init: Callable[..., Any] | None = None


def patch_httpx(runtime: Any) -> bool:
    """Wrap httpx.Client and httpx.AsyncClient so all new instances route
    responses through NullRun.

    After each client's original ``__init__`` runs, its ``_transport`` is
    wrapped in `NullRunSyncTransport` / `NullRunAsyncTransport` bound to
    `runtime`. Clients constructed before this call are not touched.

    Idempotent: once patched (module flag or class-level marker set),
    subsequent calls are no-ops — the wrappers stay bound to the `runtime`
    passed on the first call.

    Args:
        runtime: Object handed to the NullRun transports (kill check and
            `track` sink).

    Returns:
        True if httpx is (now or already) patched, False on import failure.
    """
    import functools

    global _httpx_patched, _orig_sync_init, _orig_async_init
    with _httpx_lock:
        if _httpx_patched:
            return True
        try:
            import httpx as _httpx  # noqa: F401 — already imported above; this is the safety net
        except ImportError:  # pragma: no cover
            logger.warning("httpx not available; auto-instrumentation skipped")
            return False

        # Idempotency marker on the class itself.
        if getattr(httpx.Client, "_nullrun_patched", False):
            # Already patched by an earlier import. The class-level marker
            # is the source of truth; mirror it into the module-level flag
            # so callers can introspect with is_auto_instrumented().
            _httpx_patched = True
            return True

        # Stash originals on first patch so `reset_for_tests` can restore.
        _orig_sync_init = httpx.Client.__init__
        _orig_async_init = httpx.AsyncClient.__init__

        # functools.wraps keeps the original __init__ name, docstring and
        # signature visible to help() / inspect.signature().
        @functools.wraps(_orig_sync_init)
        def _wrap_sync_init(self: httpx.Client, *args: Any, **kwargs: Any) -> None:
            _orig_sync_init(self, *args, **kwargs)
            current = self._transport
            # Never double-wrap a transport that is already ours.
            if not isinstance(current, NullRunSyncTransport):
                self._transport = NullRunSyncTransport(current, runtime)

        @functools.wraps(_orig_async_init)
        def _wrap_async_init(self: httpx.AsyncClient, *args: Any, **kwargs: Any) -> None:
            _orig_async_init(self, *args, **kwargs)
            current = self._transport
            if not isinstance(current, NullRunAsyncTransport):
                self._transport = NullRunAsyncTransport(current, runtime)

        httpx.Client.__init__ = _wrap_sync_init  # type: ignore[method-assign]
        httpx.AsyncClient.__init__ = _wrap_async_init  # type: ignore[method-assign]
        httpx.Client._nullrun_patched = True  # type: ignore[attr-defined]
        httpx.AsyncClient._nullrun_patched = True  # type: ignore[attr-defined]
        _httpx_patched = True
        logger.info("httpx auto-instrumentation installed (sync + async)")
        return True
|
|
666
|
+
|
|
667
|
+
|
|
668
|
+
# ---------------------------------------------------------------------------
|
|
669
|
+
# D4: patch_langchain_callback — in-memory mocks + callback-only flows
|
|
670
|
+
# ---------------------------------------------------------------------------
|
|
671
|
+
# The httpx hook covers langchain-openai (uses httpx) but NOT in-memory
|
|
672
|
+
# mock providers. Reusing NullRunCallback from langgraph.py is the right
|
|
673
|
+
# answer: it already extracts usage from LLMResult and emits via
|
|
674
|
+
# runtime.track.
|
|
675
|
+
|
|
676
|
+
# Module-level flag mirroring the class marker on BaseCallbackManager.
_langchain_patched = False


def patch_langchain_callback(runtime: Any) -> bool:
    """Install NullRunCallback into the LangChain callback manager so all
    LLM calls (including mock providers) flow through it.

    Wraps ``BaseCallbackManager.__init__``: after the original initialiser
    runs, a `NullRunCallback` bound to `runtime` is added unless the manager
    already holds one. Failures while adding the handler are logged at
    debug level and never propagate to the code constructing the manager.

    Idempotent: once patched (module flag or class-level marker set),
    subsequent calls are no-ops.

    Args:
        runtime: Passed to each `NullRunCallback` as ``runtime=``.

    Returns:
        True if LangChain is (now or already) patched, False when
        ``langchain_core`` cannot be imported.
    """
    import functools

    global _langchain_patched
    if _langchain_patched:
        return True
    try:
        from langchain_core.callbacks import BaseCallbackManager
    except ImportError:
        logger.debug("langchain-core not installed; LangChain callback path skipped")
        return False

    if getattr(BaseCallbackManager, "_nullrun_patched", False):
        _langchain_patched = True
        return True

    _orig_init = BaseCallbackManager.__init__

    # functools.wraps keeps the original __init__ name, docstring and
    # signature visible to help() / inspect.signature().
    @functools.wraps(_orig_init)
    def _wrap_init(self: Any, *args: Any, **kwargs: Any) -> None:
        _orig_init(self, *args, **kwargs)
        try:
            handlers = getattr(self, "handlers", None) or []
            # Skip managers that already carry a NullRun callback.
            if any(isinstance(h, NullRunCallback) for h in handlers):
                return
            # Add a NullRun callback for this manager. We use the
            # add_handler API when available; otherwise we set handlers
            # directly (older LangChain).
            if hasattr(self, "add_handler"):
                self.add_handler(NullRunCallback(runtime=runtime))
            else:
                handlers.append(NullRunCallback(runtime=runtime))
                self.handlers = handlers
        except Exception as e:  # pragma: no cover — defensive
            logger.debug("NullRun: failed to add callback to manager: %s", e)

    BaseCallbackManager.__init__ = _wrap_init  # type: ignore[method-assign]
    BaseCallbackManager._nullrun_patched = True  # type: ignore[attr-defined]
    _langchain_patched = True
    logger.info("LangChain callback auto-instrumentation installed")
    return True
|
|
720
|
+
|
|
721
|
+
|
|
722
|
+
# ---------------------------------------------------------------------------
|
|
723
|
+
# D5: patch_openai_agents — OpenAI Agents SDK tracer
|
|
724
|
+
# ---------------------------------------------------------------------------
|
|
725
|
+
# The `agents` package exposes a `Runner` whose `run` / `run_sync` returns
|
|
726
|
+
# an object that carries a `_trace_spans` list (private but stable across
|
|
727
|
+
# 0.1.x). We pull usage out of any `llm_call` span and emit a track event.
|
|
728
|
+
|
|
729
|
+
# Process-wide guard: True once Runner.run / Runner.run_sync have been wrapped.
_agents_patched = False


def patch_openai_agents(runtime: Any) -> bool:
    """Wrap ``Runner.run`` and ``Runner.run_sync`` to read llm_call spans.

    After a run finishes, its result is handed to
    ``_emit_from_agents_result`` so usage recorded on the result's spans
    reaches ``runtime.track``.

    ``Runner.run`` is a coroutine function in the Agents SDK, so its wrapper
    awaits the run and inspects the *awaited* result. Inspecting the
    un-awaited coroutine object (as a plain synchronous wrapper would) never
    finds any spans, which silently disables tracking for async runs.

    The wrappers are installed as static methods so that both
    ``Runner.run(...)`` and instance-style calls forward exactly the
    arguments the caller supplied.

    Idempotent: guarded by ``_agents_patched`` and by a ``_nullrun_patched``
    marker stored on ``Runner``.

    Returns:
        True on success (or if already patched), False if the ``agents``
        package is not installed.
    """
    global _agents_patched
    if _agents_patched:
        return True
    try:
        from agents import Runner  # type: ignore[import-not-found]
    except ImportError:
        logger.debug("openai-agents not installed; Agents SDK path skipped")
        return False

    if getattr(Runner, "_nullrun_patched", False):
        _agents_patched = True
        return True

    import inspect

    _orig_run = Runner.run
    _orig_run_sync = getattr(Runner, "run_sync", None)

    def _emit_safely(result: Any) -> None:
        # Instrumentation must never turn a successful run into a failure.
        try:
            _emit_from_agents_result(runtime, result)
        except Exception as e:  # pragma: no cover — defensive
            logger.debug("NullRun: agents result inspection failed: %s", e)

    async def _await_then_emit(pending: Any) -> Any:
        result = await pending
        _emit_safely(result)
        return result

    if inspect.iscoroutinefunction(_orig_run):
        # Keep the wrapper a coroutine function so introspection on
        # Runner.run still reports it as async.
        async def _wrap_run(*args: Any, **kwargs: Any) -> Any:
            return await _await_then_emit(_orig_run(*args, **kwargs))
    else:
        def _wrap_run(*args: Any, **kwargs: Any) -> Any:
            result = _orig_run(*args, **kwargs)
            if inspect.isawaitable(result):
                # A sync callable that hands back an awaitable: defer the
                # emit until the caller has awaited it.
                return _await_then_emit(result)
            _emit_safely(result)
            return result

    Runner.run = staticmethod(_wrap_run)

    if _orig_run_sync is not None:
        def _wrap_run_sync(*args: Any, **kwargs: Any) -> Any:
            result = _orig_run_sync(*args, **kwargs)
            _emit_safely(result)
            return result

        Runner.run_sync = staticmethod(_wrap_run_sync)

    Runner._nullrun_patched = True
    _agents_patched = True
    logger.info("openai-agents auto-instrumentation installed")
    return True
|
|
771
|
+
|
|
772
|
+
|
|
773
|
+
def _agents_token_count(value: Any) -> int:
    """Coerce one usage field to ``int``; missing or non-numeric values count as 0."""
    try:
        return int(value or 0)
    except (TypeError, ValueError, OverflowError):
        return 0


def _emit_from_agents_result(runtime: Any, result: Any) -> None:
    """Emit one ``llm_call`` track event per usage-bearing span on *result*.

    Spans are read from ``result._trace_spans``, falling back to
    ``result.trace_spans``. Only ``dict`` spans whose ``type`` is
    ``"llm_call"`` and whose ``usage`` is a ``dict`` are considered. Token
    counts accept both the ``input_tokens`` / ``output_tokens`` and the
    ``prompt_tokens`` / ``completion_tokens`` spellings; a missing total is
    derived as prompt + completion. Spans with no tokens at all are skipped.

    This runs inside the user's ``Runner.run`` call path, so it never
    raises: a non-iterable spans attribute is ignored, unparseable token
    values are treated as 0, and ``runtime.track`` failures are logged at
    DEBUG.

    Args:
        runtime: Object exposing ``track(event: dict)``.
        result: Whatever the wrapped ``Runner`` method returned.
    """
    spans = (
        getattr(result, "_trace_spans", None)
        or getattr(result, "trace_spans", None)
        or []
    )
    try:
        span_iter = iter(spans)
    except TypeError:
        # Attribute exists but is not a collection of spans; nothing to read.
        return

    for span in span_iter:
        if not isinstance(span, dict) or span.get("type") != "llm_call":
            continue
        usage = span.get("usage")
        if not isinstance(usage, dict):
            continue

        prompt = _agents_token_count(
            usage.get("input_tokens") or usage.get("prompt_tokens")
        )
        completion = _agents_token_count(
            usage.get("output_tokens") or usage.get("completion_tokens")
        )
        total = _agents_token_count(usage.get("total_tokens")) or (prompt + completion)
        if prompt == 0 and completion == 0 and total == 0:
            continue

        try:
            runtime.track(
                {
                    "type": "llm_call",
                    "provider": "openai_agents",
                    "model": span.get("model"),
                    "tokens": total,
                    "input_tokens": prompt,
                    "output_tokens": completion,
                    "has_usage": True,
                    "raw_usage": usage,
                    # Fingerprint feeds the runtime's duplicate-event filter;
                    # id(span) is only a fallback when the span has no id.
                    "_fingerprint": f"agents-{span.get('id', id(span))}",
                }
            )
        except Exception as e:  # pragma: no cover — defensive
            logger.debug("NullRun: agents track failed: %s", e)
|
|
812
|
+
|
|
813
|
+
|
|
814
|
+
# ---------------------------------------------------------------------------
|
|
815
|
+
# D5b: patch_langgraph_compiled — auto-attach callback to compiled LangGraph
|
|
816
|
+
# ---------------------------------------------------------------------------
|
|
817
|
+
# A compiled LangGraph `StateGraph.compile()` returns a `Pregel` instance.
|
|
818
|
+
# To capture every invoke/stream/ainvoke/astream call site we monkey-patch
|
|
819
|
+
# the *class* methods so a NullRunCallback is added to
|
|
820
|
+
# `config["callbacks"]` automatically — the user does not have to call
|
|
821
|
+
# `nullrun.toolbox.langgraph.wrapper` explicitly. The patch is global
|
|
822
|
+
# (process-wide) but idempotent and a no-op if `langgraph` is not
|
|
823
|
+
# importable. Users who want per-app control (e.g. multiple runtimes in
|
|
824
|
+
# the same process) should use `wrapper()` instead.
|
|
825
|
+
|
|
826
|
+
# Process-wide guard: True once the four Pregel entry points are wrapped.
_langgraph_compiled_patched = False
# Un-patched Pregel methods, recorded when the patch is installed so that
# reset_for_tests can put them back. The wrappers close over `runtime`, so
# without a restore a later runtime in the same process would never see
# events from compiled graphs.
_orig_pregel_invoke: Callable[..., Any] | None = None
_orig_pregel_stream: Callable[..., Any] | None = None
_orig_pregel_ainvoke: Callable[..., Any] | None = None
_orig_pregel_astream: Callable[..., Any] | None = None


def patch_langgraph_compiled(runtime: Any) -> bool:
    """Auto-attach a ``NullRunCallback`` to every compiled-graph call.

    Replaces ``Pregel.invoke``, ``Pregel.stream``, ``Pregel.ainvoke`` and
    ``Pregel.astream`` with wrappers that add a ``NullRunCallback`` bound to
    *runtime* to ``config["callbacks"]`` before delegating to the original
    method. A config that already contains a ``NullRunCallback`` is passed
    through untouched, as is a non-dict config or a ``callbacks`` value that
    cannot be iterated.

    Idempotent: guarded by ``_langgraph_compiled_patched`` and by a
    ``_nullrun_patched`` marker on ``Pregel``.

    Returns:
        True if the patch is (or already was) installed, False if
        ``langgraph`` is not importable.
    """
    global _langgraph_compiled_patched
    global _orig_pregel_invoke, _orig_pregel_stream
    global _orig_pregel_ainvoke, _orig_pregel_astream

    if _langgraph_compiled_patched:
        return True
    try:
        from langgraph.pregel import Pregel
    except ImportError:
        logger.debug("langgraph not installed; compiled-graph auto-patch skipped")
        return False

    if getattr(Pregel, "_nullrun_patched", False):
        _langgraph_compiled_patched = True
        return True

    def _ensure_callback(config: Any) -> dict[str, Any]:
        """Return *config* with a NullRunCallback appended to its callbacks.

        The caller's dict and callback list are never mutated: a shallow
        copy of the config and a new list are built, so user-supplied
        handlers are preserved and the original objects stay as they were.
        """
        cfg = {} if config is None else config
        if not isinstance(cfg, dict):
            return cfg

        existing = cfg.get("callbacks")
        if existing is None:
            existing = []
        else:
            try:
                already_present = any(
                    isinstance(cb, NullRunCallback) for cb in existing
                )
            except TypeError:
                # `callbacks` is not iterable (not a plain list of handlers);
                # leave the config exactly as supplied.
                return cfg
            if already_present:
                return cfg

        merged_callbacks = list(existing) + [NullRunCallback(runtime=runtime)]
        patched_cfg = dict(cfg)
        patched_cfg["callbacks"] = merged_callbacks
        return patched_cfg

    _orig_invoke, _orig_stream, _orig_ainvoke, _orig_astream = (
        Pregel.invoke,
        Pregel.stream,
        Pregel.ainvoke,
        Pregel.astream,
    )

    # Publish the originals at module level for reset_for_tests.
    _orig_pregel_invoke = _orig_invoke
    _orig_pregel_stream = _orig_stream
    _orig_pregel_ainvoke = _orig_ainvoke
    _orig_pregel_astream = _orig_astream

    def _wrap_invoke(self: Any, input: Any, config: Any = None, **kwargs: Any) -> Any:
        patched_cfg = _ensure_callback(config)
        return _orig_invoke(self, input, patched_cfg, **kwargs)

    def _wrap_stream(self: Any, input: Any, config: Any = None, **kwargs: Any) -> Any:
        patched_cfg = _ensure_callback(config)
        return _orig_stream(self, input, patched_cfg, **kwargs)

    async def _wrap_ainvoke(self: Any, input: Any, config: Any = None, **kwargs: Any) -> Any:
        patched_cfg = _ensure_callback(config)
        return await _orig_ainvoke(self, input, patched_cfg, **kwargs)

    async def _wrap_astream(self: Any, input: Any, config: Any = None, **kwargs: Any) -> Any:
        # Re-yield chunk by chunk so the wrapper stays an async generator.
        async for chunk in _orig_astream(self, input, _ensure_callback(config), **kwargs):
            yield chunk

    Pregel.invoke = _wrap_invoke  # type: ignore[method-assign]
    Pregel.stream = _wrap_stream  # type: ignore[method-assign]
    Pregel.ainvoke = _wrap_ainvoke  # type: ignore[method-assign]
    Pregel.astream = _wrap_astream  # type: ignore[method-assign]
    Pregel._nullrun_patched = True  # type: ignore[attr-defined]
    _langgraph_compiled_patched = True
    logger.info("LangGraph compiled-graph auto-instrumentation installed (Pregel.invoke/stream/ainvoke/astream)")
    return True
|
|
923
|
+
|
|
924
|
+
|
|
925
|
+
# ---------------------------------------------------------------------------
|
|
926
|
+
# D6: orchestrator
|
|
927
|
+
# ---------------------------------------------------------------------------
|
|
928
|
+
# `auto_instrument(runtime)` installs every observation path. Each
|
|
929
|
+
# patch is best-effort and silently no-ops if the underlying package is
|
|
930
|
+
# not installed. The user's `init()` call invokes this once.
|
|
931
|
+
|
|
932
|
+
# Set once auto_instrument has run; _auto_lock serializes concurrent callers.
_auto_installed = False
_auto_lock = threading.Lock()


def auto_instrument(runtime: Any) -> bool:
    """Install every auto-instrumentation path for *runtime*, once per process.

    Each patcher is run through ``safe_patch`` (imported from
    ``nullrun.instrumentation._safe_patch``), and the truthy results are
    counted as installed paths. Per the design note for Sprint 2.9 (B47),
    ``safe_patch`` is expected to log at WARNING when a patch raises a
    non-ImportError exception, so a vendor SDK breaking change does not
    silently disable cost tracking.

    The first call returns True when at least one path was installed and
    False otherwise. The module is marked as installed either way, so any
    later call returns True immediately without patching again.
    """
    global _auto_installed
    with _auto_lock:
        if _auto_installed:
            return True

        # Imported lazily: auto_requests needs `_safe_bump_coverage` from
        # this module at its own import time. The framework patchers are
        # silent no-ops when their packages are not installed.
        from nullrun.instrumentation._safe_patch import safe_patch
        from nullrun.instrumentation.auto_requests import patch_requests
        from nullrun.instrumentation.autogen import patch_autogen
        from nullrun.instrumentation.crewai import patch_crewai
        from nullrun.instrumentation.llama_index import patch_llama_index

        patchers = (
            ("httpx", patch_httpx),
            ("langchain_callback", patch_langchain_callback),
            ("openai_agents", patch_openai_agents),
            ("langgraph_compiled", patch_langgraph_compiled),
            ("requests", patch_requests),
            ("llama_index", patch_llama_index),
            ("crewai", patch_crewai),
            ("autogen", patch_autogen),
        )
        outcomes = [
            # Bind `patch` as a default so each thunk keeps its own patcher.
            safe_patch(label, lambda patch=patch: patch(runtime))
            for label, patch in patchers
        ]

        # Marked installed even when nothing succeeded: a second init() must
        # not redo the work or double-patch.
        _auto_installed = True

        installed = len([ok for ok in outcomes if ok])
        if not installed:
            logger.info(
                "NullRun auto-instrumentation: no LLM frameworks detected "
                "(install one of: openai, anthropic, langchain-core, openai-agents)"
            )
            return False
        logger.info("NullRun auto-instrumentation: %d path(s) installed", installed)
        return True
|
|
986
|
+
|
|
987
|
+
|
|
988
|
+
def is_auto_instrumented() -> bool:
    """Report whether `auto_instrument` has completed in this process.

    Mirrors the module-level ``_auto_installed`` flag. `auto_instrument`
    sets that flag even when none of the patches could be installed, and
    `reset_for_tests` clears it, so True means "already attempted", not
    "at least one path is active".
    """
    installed_flag = _auto_installed
    return installed_flag
|
|
991
|
+
|
|
992
|
+
|
|
993
|
+
def reset_for_tests() -> None:
    """Clear auto-instrumentation state between tests. Never call in production.

    Resets every ``*_patched`` / ``_auto_installed`` module flag, then
    restores ``httpx.Client.__init__``, ``httpx.AsyncClient.__init__`` and
    the four ``Pregel`` methods from the stashed originals (when present)
    and clears their ``_nullrun_patched`` class markers. The next
    `auto_instrument` call therefore installs fresh wrappers bound to the
    new runtime; the old wrappers' closures still point at the previous
    runtime and would otherwise swallow its events.

    NOTE(review): only the module flags are reset for the LangChain and
    Agents patches. The ``_nullrun_patched`` markers on
    ``BaseCallbackManager`` and ``Runner`` are left set and their wrappers
    are not removed, so re-running `auto_instrument` does not re-bind those
    two paths to a new runtime.
    """
    global _auto_installed, _httpx_patched, _langchain_patched, _agents_patched
    global _langgraph_compiled_patched
    global _orig_sync_init, _orig_async_init
    global _orig_pregel_invoke, _orig_pregel_stream
    global _orig_pregel_ainvoke, _orig_pregel_astream

    _auto_installed = _httpx_patched = _langchain_patched = False
    _agents_patched = _langgraph_compiled_patched = False

    # --- httpx: take the stashed initializers, then put them back ----------
    sync_init, async_init = _orig_sync_init, _orig_async_init
    _orig_sync_init = _orig_async_init = None

    if sync_init is not None:
        try:
            httpx.Client.__init__ = sync_init  # type: ignore[method-assign]
            httpx.Client._nullrun_patched = False  # type: ignore[attr-defined]
        except Exception as e:  # pragma: no cover — defensive
            logger.debug("reset_for_tests: failed to restore httpx.Client: %s", e)
    if async_init is not None:
        try:
            httpx.AsyncClient.__init__ = async_init  # type: ignore[method-assign]
            httpx.AsyncClient._nullrun_patched = False  # type: ignore[attr-defined]
        except Exception as e:  # pragma: no cover — defensive
            logger.debug("reset_for_tests: failed to restore httpx.AsyncClient: %s", e)

    # --- LangGraph: same treatment for the four Pregel entry points --------
    pregel_originals = {
        "invoke": _orig_pregel_invoke,
        "stream": _orig_pregel_stream,
        "ainvoke": _orig_pregel_ainvoke,
        "astream": _orig_pregel_astream,
    }
    _orig_pregel_invoke = _orig_pregel_stream = None
    _orig_pregel_ainvoke = _orig_pregel_astream = None

    # The four originals are stashed together, so `invoke` stands in for all.
    if pregel_originals["invoke"] is not None:
        try:
            from langgraph.pregel import Pregel

            for method_name, original in pregel_originals.items():
                setattr(Pregel, method_name, original)
            Pregel._nullrun_patched = False  # type: ignore[attr-defined]
        except Exception as e:  # pragma: no cover — defensive
            logger.debug("reset_for_tests: failed to restore Pregel: %s", e)
|
|
1043
|
+
|
|
1044
|
+
|
|
1045
|
+
# ---------------------------------------------------------------------------
|
|
1046
|
+
# Dedup helper
|
|
1047
|
+
# ---------------------------------------------------------------------------
|
|
1048
|
+
# `runtime.track` consults `_seen_track_fingerprints` to drop duplicate
|
|
1049
|
+
# events. This is exposed here so tests can introspect / clear the LRU
|
|
1050
|
+
# without poking into the runtime module.
|
|
1051
|
+
|
|
1052
|
+
# Capacity of the fingerprint LRU used to drop duplicate track events.
DEDUP_LRU_MAX = 4096  # Phase 6 #6.7: 4096 entries give a 410ms dedup window at 10K events/sec


def make_dedup_state() -> OrderedDict[str, None]:
    """Create an empty dedup LRU; each runtime instance stores its own."""
    fresh: OrderedDict[str, None] = OrderedDict()
    return fresh


def _fingerprint_is_seen(state: OrderedDict[str, None], fp: str) -> bool:
    """Record *fp* in *state* and report whether it was already present.

    An empty fingerprint is never recorded and always reports False. A
    known fingerprint is moved to the most-recent end and reports True. A
    new fingerprint is inserted and, if that pushes the LRU past
    ``DEDUP_LRU_MAX`` entries, the single oldest entry is evicted.
    """
    if fp and fp in state:
        # Refresh recency so frequently repeated fingerprints stay cached.
        state.move_to_end(fp)
        return True
    if fp:
        state[fp] = None
        if len(state) > DEDUP_LRU_MAX:
            state.popitem(last=False)
    return False
|
|
1070
|
+
|
|
1071
|
+
|
|
1072
|
+
def _safe_bump_coverage(runtime: Any, target_attr: str, host: str) -> None:
    """Increment the per-host counter stored on ``runtime.<target_attr>``.

    ``target_attr`` is one of ``_coverage_seen``,
    ``_coverage_streaming_skipped``. Stub runtimes (MagicMock, custom test
    doubles) that lack the attribute are tolerated: a missing or ``None``
    container is a silent no-op.

    For a ``dict`` container a host not seen before starts at 0; any other
    container is indexed directly. The whole bump is guarded, so a corrupt
    counter value or an uncooperative container is logged at DEBUG instead
    of raising into the instrumented HTTP call — this function never raises.

    Background: ``nullrun.instrumentation.auto_requests`` imports this
    helper but the original 0.3.0 release never defined it, so the entire
    ``requests`` auto-instrumentation path was unimportable. Defining it
    here unblocks that module and the dashboard's coverage tab.
    """
    target = getattr(runtime, target_attr, None)
    if target is None:
        return
    try:
        current = target.get(host, 0) if isinstance(target, dict) else target[host]
        target[host] = int(current) + 1
    except Exception as e:  # pragma: no cover — defensive
        logger.debug("_safe_bump_coverage: %s bump failed: %s", target_attr, e)
|