nullrun 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1095 @@
1
+ """
2
+ Vendor-independent auto-instrumentation for NullRun SDK.
3
+
4
+ Phase D of the hardening plan: a single `nullrun.init(api_key=...)` call should
5
+ track every LLM call regardless of vendor. The user does not need to remember
6
+ to call `patch_openai()` or wire callbacks.
7
+
8
+ Three observation paths feed a single sink (`runtime.track`):
9
+
10
+ 1. **httpx transport hook** — covers ~95% of LLM traffic. Every major vendor
11
+ SDK (openai, anthropic, mistral, google-genai, cohere) uses httpx under
12
+ the hood. The transport intercepts the response, picks an extractor by
13
+ URL host, and emits a `llm_call` event with raw usage.
14
+
15
+ 2. **LangChain callback** — covers in-memory mock providers and callback-only
16
+ flows that do not hit the network.
17
+
18
+ 3. **OpenAI Agents SDK tracer** — covers the `agents` package which has its
19
+ own tracing model.
20
+
21
+ Dedup happens at the `runtime.track` sink via a small LRU keyed by
22
+ `(host, body_hash)` — see `NullRunRuntime._seen_track_fingerprints`. Multiple
23
+ observation paths for the same LLM call collapse to a single
24
+ `/api/v1/track` POST.
25
+
26
+ Streaming handling: OpenAI v1.0+ (and friends) send `usage` only in the
27
+ final SSE chunk. The async transport accumulates chunks and runs the
28
+ extractor on the full buffer before forwarding. This is a deliberate UX
29
+ trade-off: streaming users get a buffered body so we can see the final
30
+ chunk, but the response content is identical.
31
+
32
+ For non-streaming responses (the common case) we read the body in-place and
33
+ return a reconstructed Response — no buffering, no UX change.
34
+ """
35
+
36
+ from __future__ import annotations
37
+
38
+ import hashlib
39
+ import json
40
+ import logging
41
+ import threading
42
+ from collections import OrderedDict
43
+ from collections.abc import Callable
44
+ from typing import Any
45
+
46
+ import httpx
47
+
48
+ from nullrun.instrumentation.langgraph import NullRunCallback
49
+
50
+ logger = logging.getLogger(__name__)
51
+
52
+
53
+ # ---------------------------------------------------------------------------
54
+ # D1: URL-keyed extractor table
55
+ # ---------------------------------------------------------------------------
56
+ # Each extractor receives the response body bytes + status code. It returns
57
+ # None when the body has no usage information (streaming mid-flight, non-LLM
58
+ # endpoint sharing the host, error response, etc.). The transport only emits
59
+ # a track event when the extractor returns a non-None dict.
60
+
61
+ ExtractedUsage = dict[str, Any]
62
+
63
+
64
+ def _openai_extractor(body: bytes, status: int) -> ExtractedUsage | None:
65
+ """OpenAI / Azure OpenAI / Mistral / Ollama (OpenAI-compat) response shape.
66
+
67
+ Mistral and Ollama (when serving OpenAI-compat) follow the same schema:
68
+ response.usage.{prompt_tokens, completion_tokens, total_tokens}.
69
+ """
70
+ if status >= 400 or not body:
71
+ return None
72
+ try:
73
+ payload = json.loads(body)
74
+ except (json.JSONDecodeError, ValueError):
75
+ return None
76
+ usage = payload.get("usage") if isinstance(payload, dict) else None
77
+ if not isinstance(usage, dict):
78
+ return None
79
+ prompt = int(usage.get("prompt_tokens", 0) or 0)
80
+ completion = int(usage.get("completion_tokens", 0) or 0)
81
+ total = int(usage.get("total_tokens", 0) or 0)
82
+ if total == 0 and (prompt or completion):
83
+ total = prompt + completion
84
+ if prompt == 0 and completion == 0 and total == 0:
85
+ return None
86
+ return {
87
+ "prompt_tokens": prompt,
88
+ "completion_tokens": completion,
89
+ "total_tokens": total,
90
+ "model": payload.get("model"),
91
+ }
92
+
93
+
94
+ def _anthropic_extractor(body: bytes, status: int) -> ExtractedUsage | None:
95
+ """Anthropic Messages API response shape.
96
+
97
+ response.usage.{input_tokens, output_tokens}.
98
+ """
99
+ if status >= 400 or not body:
100
+ return None
101
+ try:
102
+ payload = json.loads(body)
103
+ except (json.JSONDecodeError, ValueError):
104
+ return None
105
+ usage = payload.get("usage") if isinstance(payload, dict) else None
106
+ if not isinstance(usage, dict):
107
+ return None
108
+ inp = int(usage.get("input_tokens", 0) or 0)
109
+ out = int(usage.get("output_tokens", 0) or 0)
110
+ if inp == 0 and out == 0:
111
+ return None
112
+ return {
113
+ "prompt_tokens": inp,
114
+ "completion_tokens": out,
115
+ "total_tokens": inp + out,
116
+ "model": payload.get("model"),
117
+ }
118
+
119
+
120
+ def _gemini_extractor(body: bytes, status: int) -> ExtractedUsage | None:
121
+ """Google Gemini (Generative Language API) response shape.
122
+
123
+ response.usageMetadata.{promptTokenCount, candidatesTokenCount, totalTokenCount}.
124
+ """
125
+ if status >= 400 or not body:
126
+ return None
127
+ try:
128
+ payload = json.loads(body)
129
+ except (json.JSONDecodeError, ValueError):
130
+ return None
131
+ usage = payload.get("usageMetadata") if isinstance(payload, dict) else None
132
+ if not isinstance(usage, dict):
133
+ return None
134
+ prompt = int(usage.get("promptTokenCount", 0) or 0)
135
+ completion = int(usage.get("candidatesTokenCount", 0) or 0)
136
+ total = int(usage.get("totalTokenCount", 0) or 0)
137
+ if prompt == 0 and completion == 0 and total == 0:
138
+ return None
139
+ return {
140
+ "prompt_tokens": prompt,
141
+ "completion_tokens": completion,
142
+ "total_tokens": total or (prompt + completion),
143
+ "model": payload.get("modelVersion"),
144
+ }
145
+
146
+
147
+ def _cohere_extractor(body: bytes, status: int) -> ExtractedUsage | None:
148
+ """Cohere v2 response shape.
149
+
150
+ response.usage.{tokens, input_tokens, output_tokens}.
151
+ Note: Cohere streaming has no usage in stream — only non-streaming
152
+ responses carry it. Documented in the plan.
153
+ """
154
+ if status >= 400 or not body:
155
+ return None
156
+ try:
157
+ payload = json.loads(body)
158
+ except (json.JSONDecodeError, ValueError):
159
+ return None
160
+ usage = payload.get("usage") if isinstance(payload, dict) else None
161
+ if not isinstance(usage, dict):
162
+ return None
163
+ # v2 uses input_tokens/output_tokens; v1 used prompt_tokens/completion_tokens.
164
+ inp = int(
165
+ usage.get("input_tokens", 0) or usage.get("prompt_tokens", 0) or 0
166
+ )
167
+ out = int(
168
+ usage.get("output_tokens", 0) or usage.get("completion_tokens", 0) or 0
169
+ )
170
+ total = int(usage.get("tokens", 0) or 0) or (inp + out)
171
+ if total == 0 and inp == 0 and out == 0:
172
+ return None
173
+ return {
174
+ "prompt_tokens": inp,
175
+ "completion_tokens": out,
176
+ "total_tokens": total,
177
+ "model": payload.get("model"),
178
+ }
179
+
180
+
181
+ def _bedrock_extractor(body: bytes, status: int) -> ExtractedUsage | None:
182
+ """AWS Bedrock InvokeModel response shape.
183
+
184
+ Bedrock returns JSON whose usage is either top-level (`inputTokens` /
185
+ `outputTokens` on Anthropic-on-Bedrock) or nested under `usage`. We
186
+ handle both, since model adapter shapes vary.
187
+ """
188
+ if status >= 400 or not body:
189
+ return None
190
+ try:
191
+ payload = json.loads(body)
192
+ except (json.JSONDecodeError, ValueError):
193
+ return None
194
+ if not isinstance(payload, dict):
195
+ return None
196
+ # Top-level (Anthropic-on-Bedrock, Mistral-on-Bedrock)
197
+ usage = payload.get("usage") if isinstance(payload.get("usage"), dict) else None
198
+ if usage is None:
199
+ # Some adapters put inputTokens/outputTokens at the top level
200
+ if "inputTokens" in payload or "outputTokens" in payload:
201
+ usage = payload
202
+ if not isinstance(usage, dict):
203
+ return None
204
+ inp = int(
205
+ usage.get("inputTokens", 0)
206
+ or usage.get("input_tokens", 0)
207
+ or 0
208
+ )
209
+ out = int(
210
+ usage.get("outputTokens", 0)
211
+ or usage.get("output_tokens", 0)
212
+ or 0
213
+ )
214
+ total = int(usage.get("totalTokens", 0) or 0) or (inp + out)
215
+ if inp == 0 and out == 0 and total == 0:
216
+ return None
217
+ return {
218
+ "prompt_tokens": inp,
219
+ "completion_tokens": out,
220
+ "total_tokens": total,
221
+ "model": payload.get("modelId") or payload.get("model"),
222
+ }
223
+
224
+
225
# Host -> extractor table. An exact host match is tried first; otherwise a
# host matches when it is a subdomain of a key, and the first matching key
# in insertion order wins — so keep more specific suffixes first.
PROVIDER_EXTRACTORS: dict[str, Callable[[bytes, int], ExtractedUsage | None]] = {
    "api.openai.com": _openai_extractor,
    "openai.azure.com": _openai_extractor,  # Azure OpenAI
    "api.mistral.ai": _openai_extractor,  # Mistral uses OpenAI-compat
    "api.anthropic.com": _anthropic_extractor,
    "generativelanguage.googleapis.com": _gemini_extractor,
    "api.cohere.ai": _cohere_extractor,  # legacy Cohere host
    "api.cohere.com": _cohere_extractor,  # current Cohere API host
    "bedrock-runtime.amazonaws.com": _bedrock_extractor,
}
235
+
236
+
237
def _match_extractor(host: str) -> Callable[[bytes, int], ExtractedUsage | None] | None:
    """Return the usage extractor registered for `host`, if any.

    Matching order:

    1. exact host (``api.openai.com``);
    2. any subdomain of a registered host (``eu.api.openai.com``);
    3. regional AWS Bedrock runtime endpoints, whose hostnames carry the
       region *between* the service prefix and the AWS domain
       (``bedrock-runtime.us-east-1.amazonaws.com``) and therefore never
       satisfy the plain suffix rule.

    Args:
        host: Request host name. Compared case-insensitively; a trailing
            dot is ignored.

    Returns:
        The extractor callable, or ``None`` if the host is empty or not a
        known LLM endpoint.
    """
    if not host:
        return None
    host = host.lower().rstrip(".")

    exact = PROVIDER_EXTRACTORS.get(host)
    if exact is not None:
        return exact

    # Subdomain match: a.b.openai.com still goes to the OpenAI extractor.
    for suffix, extractor in PROVIDER_EXTRACTORS.items():
        if host.endswith("." + suffix):
            return extractor

    # Regional Bedrock endpoints: bedrock-runtime[-fips].<region>.amazonaws.com[.cn]
    if host.startswith(("bedrock-runtime.", "bedrock-runtime-fips.")) and host.endswith(
        (".amazonaws.com", ".amazonaws.com.cn")
    ):
        return PROVIDER_EXTRACTORS.get("bedrock-runtime.amazonaws.com")
    return None
252
+
253
+
254
+ def _check_kill_before_send(runtime: Any, request: httpx.Request) -> None:
255
+ """
256
+ L2 of the kill contract (see docs/kill-contract.md §2).
257
+
258
+ Pre-request gate: inspects the cached remote state for the workflow
259
+ bound to the current context / API key. If the workflow has been
260
+ killed (or paused) by the control plane, raise BEFORE the request
261
+ reaches the network — so a kill that lands between two LLM calls in
262
+ a long-running agent loop is honored on the *next* iteration, not
263
+ silently deferred until the next @protect entry or /track.
264
+
265
+ No-ops when:
266
+ - runtime is missing
267
+ - the request host is not a known LLM provider (out of scope)
268
+ - no workflow can be resolved (no active context, no API key binding)
269
+ - the cached state is anything other than Killed / Paused
270
+
271
+ Note: prior to T3-S2 (0.3.0) this also short-circuited in
272
+ `local_mode` (no api_key). The local_mode branch is gone because
273
+ api_key is now required at runtime construction — every runtime
274
+ has a remote control plane to consult.
275
+
276
+ Raises:
277
+ WorkflowKilledInterrupt: state == "Killed"
278
+ WorkflowPausedException: state == "Paused"
279
+ """
280
+ if runtime is None:
281
+ return
282
+ # Defensive: test doubles (and any duck-typed runtime) may not
283
+ # implement `_resolve_workflow_id`. Skip the kill check silently
284
+ # rather than crashing the user's transport hook.
285
+ if not hasattr(runtime, "_resolve_workflow_id"):
286
+ return
287
+ # Phase 5 #5.8: the kill check is independent of which LLM host
288
+ # the user is talking to. Previously the check was gated on the
289
+ # extractor table, so a custom LLM endpoint silently bypassed the
290
+ # dashboard KILL switch. The kill state lives in `_remote_states`,
291
+ # which is keyed by workflow, not by host.
292
+ workflow_id = runtime._resolve_workflow_id(None)
293
+ if not workflow_id:
294
+ return
295
+ state = runtime._remote_state_for(workflow_id) if hasattr(runtime, "_remote_state_for") else getattr(runtime, "_remote_states", {}).get(workflow_id, {})
296
+ state_name = state.get("state", "Normal")
297
+ if state_name == "Killed":
298
+ from nullrun.breaker.exceptions import WorkflowKilledInterrupt
299
+ raise WorkflowKilledInterrupt(
300
+ workflow_id=workflow_id,
301
+ reason=state.get("reason", "remote kill"),
302
+ )
303
+ if state_name == "Paused":
304
+ from nullrun.breaker.exceptions import WorkflowPausedException
305
+ raise WorkflowPausedException(
306
+ workflow_id=workflow_id,
307
+ reason=state.get("reason", "remote pause"),
308
+ resume_after=None,
309
+ )
310
+
311
+
312
+ # ---------------------------------------------------------------------------
313
+ # D2: httpx transport hook
314
+ # ---------------------------------------------------------------------------
315
+ # The transport wraps the user's underlying transport (e.g. the default
316
+ # httpx transport). For every request, it consults the extractor table by
317
+ # host. If the host is a known LLM provider, the response body is consumed
318
+ # once, the extractor runs, and a fresh Response is returned with the same
319
+ # body bytes — callers see no behavioural change.
320
+
321
+ # NOTE (Sprint 2.3): the ``_STREAMING_CONTENT_TYPES`` constant was
322
+ # defined here but only consumed in ``auto_requests.py`` (same
323
+ # constant is re-defined there). The streaming branch in the
324
+ # httpx transport wrapper does not actually consult this table;
325
+ # it just reads the body and lets the extractors return ``None``
326
+ # for non-usage bodies. The constant is deleted to avoid the
327
+ # false impression that this module has streaming-specific
328
+ # behaviour. See auto.py module docstring §"Streaming".
329
+
330
class NullRunSyncTransport(httpx.BaseTransport):
    """Synchronous httpx transport that emits a `llm_call` event for known
    LLM provider responses.

    Wraps another transport. Every request first passes the kill gate;
    responses from hosts with a registered extractor are read in full,
    inspected for token usage, and handed back as a rebuilt
    ``httpx.Response`` carrying the same body bytes.
    """

    def __init__(
        self,
        inner: httpx.BaseTransport,
        runtime: Any,
    ) -> None:
        self._inner = inner
        self._runtime = runtime

    def handle_request(self, request: httpx.Request) -> httpx.Response:
        """Forward `request` to the inner transport and track usage, if any.

        Exceptions from the kill gate and from the inner transport
        propagate. Failures while reading the body, extracting usage or
        tracking are logged at debug level and never raised.
        """
        _check_kill_before_send(self._runtime, request)
        host = request.url.host
        extractor = _match_extractor(host)
        response = self._inner.handle_request(request)
        if extractor is None:
            return response
        try:
            body = response.read()
        except Exception as e:  # pragma: no cover — defensive
            logger.debug("NullRun transport: failed to read body: %s", e)
            return response
        if not body:
            return response
        try:
            usage = extractor(body, response.status_code)
        except Exception as e:
            # Observability must never break the caller's request: a
            # misbehaving extractor is treated as "no usage found".
            logger.debug("NullRun transport: extractor failed: %s", e)
            usage = None
        if usage is not None:
            self._emit(request, host, usage, body, response.status_code)
        # The body was consumed above; rebuild so callers can still read it.
        return self._rebuild(response, body, request)

    @staticmethod
    def _rebuild(
        response: httpx.Response,
        body: bytes,
        request: httpx.Request,
    ) -> httpx.Response:
        """Return a fresh Response carrying the already-read `body`."""
        # `response.read()` consumed the streamed body — and httpx
        # transparently decompresses gzip/br/zstd during that read. We
        # MUST strip the encoding header on the rebuilt response, otherwise
        # the downstream caller sees `content-encoding: gzip` and tries to
        # decompress an already-decompressed body (zlib.error).
        # Transfer-Encoding is stripped for the same reason: the body is
        # already fully buffered. content-length is recomputed against the
        # post-decompression byte count.
        req = getattr(response, "_request", None) or request
        headers = response.headers.copy()
        # httpx.Headers lookups are case-insensitive, so one spelling of
        # each header name is sufficient.
        for name in ("content-encoding", "transfer-encoding"):
            if name in headers:
                del headers[name]
        if "content-length" in headers:
            headers["content-length"] = str(len(body))
        return httpx.Response(
            status_code=response.status_code,
            headers=headers,
            content=body,
            request=req,
            extensions=response.extensions,
        )

    def _emit(
        self,
        request: httpx.Request,
        host: str,
        usage: ExtractedUsage,
        body: bytes,
        status: int,
    ) -> None:
        """Send one `llm_call` event to the runtime; never raises."""
        try:
            self._runtime.track(
                {
                    "type": "llm_call",
                    "provider": _provider_label(host),
                    "host": host,
                    "model": usage.get("model"),
                    "tokens": usage.get("total_tokens", 0),
                    "input_tokens": usage.get("prompt_tokens", 0),
                    "output_tokens": usage.get("completion_tokens", 0),
                    "has_usage": True,
                    "raw_usage": usage,
                    # Fingerprint for dedup at the track() sink.
                    "_fingerprint": _fingerprint_for(host, body, status),
                }
            )
        except Exception as e:
            logger.debug("NullRun transport: track failed: %s", e)

    def close(self) -> None:
        """Close the wrapped transport, logging (not raising) on failure."""
        try:
            self._inner.close()
        except Exception as e:  # pragma: no cover — defensive
            logger.debug("NullRun transport: inner close failed: %s", e)
439
+
440
+
441
class NullRunAsyncTransport(httpx.AsyncBaseTransport):
    """Asynchronous httpx transport. Mirrors `NullRunSyncTransport` for
    async httpx clients. The body is consumed in a single pass via
    `response.aread()`, so for streamed responses the whole body is
    buffered before the extractor runs and before the response is
    returned to the caller.
    """

    def __init__(
        self,
        inner: httpx.AsyncBaseTransport,
        runtime: Any,
    ) -> None:
        self._inner = inner
        self._runtime = runtime

    async def handle_async_request(self, request: httpx.Request) -> httpx.Response:
        """Forward `request` to the inner transport and track usage, if any.

        Exceptions from the kill gate and from the inner transport
        propagate. Failures while reading the body, extracting usage or
        tracking are logged at debug level and never raised.
        """
        _check_kill_before_send(self._runtime, request)
        host = request.url.host
        extractor = _match_extractor(host)
        response = await self._inner.handle_async_request(request)
        if extractor is None:
            return response
        try:
            body = await response.aread()
        except Exception as e:  # pragma: no cover — defensive
            logger.debug("NullRun transport: failed to read async body: %s", e)
            return response
        if not body:
            return response
        try:
            usage = extractor(body, response.status_code)
        except Exception as e:
            # Observability must never break the caller's request: a
            # misbehaving extractor is treated as "no usage found".
            logger.debug("NullRun transport: async extractor failed: %s", e)
            usage = None
        if usage is not None:
            self._emit(request, host, usage, body, response.status_code)
        # The body was consumed above; rebuild so callers can still read it.
        return self._rebuild(response, body, request)

    @staticmethod
    def _rebuild(
        response: httpx.Response,
        body: bytes,
        request: httpx.Request,
    ) -> httpx.Response:
        """Return a fresh Response carrying the already-read `body`."""
        # httpx decompresses the body while reading it, so content-encoding
        # must be stripped from the rebuilt response — otherwise async
        # openai/anthropic clients re-decompress the already-decompressed
        # body and raise zlib.error. Transfer-Encoding is stripped because
        # the body is already fully buffered, and content-length is
        # recomputed against the buffered byte count.
        req = getattr(response, "_request", None) or request
        headers = response.headers.copy()
        # httpx.Headers lookups are case-insensitive, so one spelling of
        # each header name is sufficient.
        for name in ("content-encoding", "transfer-encoding"):
            if name in headers:
                del headers[name]
        if "content-length" in headers:
            headers["content-length"] = str(len(body))
        return httpx.Response(
            status_code=response.status_code,
            headers=headers,
            content=body,
            request=req,
            extensions=response.extensions,
        )

    def _emit(
        self,
        request: httpx.Request,
        host: str,
        usage: ExtractedUsage,
        body: bytes,
        status: int,
    ) -> None:
        """Send one `llm_call` event to the runtime; never raises."""
        try:
            self._runtime.track(
                {
                    "type": "llm_call",
                    "provider": _provider_label(host),
                    "host": host,
                    "model": usage.get("model"),
                    "tokens": usage.get("total_tokens", 0),
                    "input_tokens": usage.get("prompt_tokens", 0),
                    "output_tokens": usage.get("completion_tokens", 0),
                    "has_usage": True,
                    "raw_usage": usage,
                    # Fingerprint for dedup at the track() sink.
                    "_fingerprint": _fingerprint_for(host, body, status),
                }
            )
        except Exception as e:
            logger.debug("NullRun transport: async track failed: %s", e)

    async def aclose(self) -> None:
        """Close the wrapped transport, logging (not raising) on failure."""
        try:
            await self._inner.aclose()
        except Exception as e:  # pragma: no cover — defensive
            logger.debug("NullRun transport: inner aclose failed: %s", e)
547
+
548
+
549
+ def _provider_label(host: str) -> str:
550
+ """Map a host to a short provider label for the `provider` event field."""
551
+ if "openai" in host:
552
+ return "openai"
553
+ if "anthropic" in host:
554
+ return "anthropic"
555
+ if "mistral" in host:
556
+ return "mistral"
557
+ if "googleapis" in host:
558
+ return "gemini"
559
+ if "cohere" in host:
560
+ return "cohere"
561
+ if "bedrock" in host or "amazonaws" in host:
562
+ return "bedrock"
563
+ return host or "unknown"
564
+
565
+
566
+ def _fingerprint_for(host: str, body: bytes, status: int) -> str:
567
+ """Stable fingerprint for dedup. `sha256(host|status|body)[:16]` is
568
+ collision-resistant enough at the dedup-LRU scale (≤ a few hundred
569
+ entries) and short enough to keep memory bounded.
570
+ """
571
+ h = hashlib.sha256()
572
+ h.update(host.encode("utf-8"))
573
+ h.update(b"|")
574
+ h.update(str(status).encode("ascii"))
575
+ h.update(b"|")
576
+ h.update(body)
577
+ return h.hexdigest()[:16]
578
+
579
+
580
+ def _fingerprint_for_event_dict(event: dict[str, Any]) -> str:
581
+ """Stable fingerprint for a generic event dict.
582
+
583
+ Phase 3 of the production-readiness plan: ``runtime.track_event``
584
+ was the only emit path that did NOT set ``_fingerprint``, so two
585
+ observers firing for the same LLM call (the user's manual
586
+ ``track_event`` plus the httpx transport hook) produced two
587
+ ``/track`` POSTs. This helper gives the dedup LRU a stable key
588
+ derived from the event's content.
589
+ """
590
+ try:
591
+ payload = json.dumps(event, sort_keys=True, default=str).encode("utf-8")
592
+ except (TypeError, ValueError):
593
+ payload = repr(event).encode("utf-8")
594
+ h = hashlib.sha256()
595
+ h.update(b"event|")
596
+ h.update(payload)
597
+ return h.hexdigest()[:16]
598
+
599
+
600
+ # ---------------------------------------------------------------------------
601
+ # D3: patch_httpx — idempotent __init__ wrap
602
+ # ---------------------------------------------------------------------------
603
+ # We wrap httpx.Client.__init__ / httpx.AsyncClient.__init__ so that ANY
604
+ # subsequent client construction automatically gets the NullRun transport
605
+ # applied to the user's chosen transport. This means the user does not need
606
+ # to do anything special — `openai.OpenAI(http_client=httpx.Client())` will
607
+ # be auto-instrumented.
608
+
609
_httpx_patched = False
_httpx_lock = threading.Lock()
# Originals are stashed on first patch so `reset_for_tests` can fully
# restore httpx.Client / AsyncClient to the un-patched state. Without
# this, a second `patch_httpx` would no-op (class marker still set)
# AND the closure inside the existing wrap would still reference the
# first runtime — silently losing track() calls from later test runs.
_orig_sync_init: Callable[..., Any] | None = None
_orig_async_init: Callable[..., Any] | None = None


def patch_httpx(runtime: Any) -> bool:
    """Wrap httpx.Client and httpx.AsyncClient so all new instances route
    requests through the NullRun transports.

    After the original ``__init__`` runs, the client's default transport
    and every non-``None`` mounted transport (proxies, user ``mounts=``)
    are wrapped, so mounted routes get the same tracking and kill gate as
    the default route.

    Idempotent: subsequent calls are no-ops and keep the runtime captured
    by the first call.

    Args:
        runtime: The NullRun runtime handed to every wrapping transport.

    Returns:
        Always True. httpx is imported at module import time, so there is
        no import failure left to report here; the bool return is kept
        for interface compatibility.
    """
    global _httpx_patched, _orig_sync_init, _orig_async_init
    with _httpx_lock:
        if _httpx_patched:
            return True

        # Idempotency marker on the class itself.
        if getattr(httpx.Client, "_nullrun_patched", False):
            # Already patched by an earlier import. The class-level marker
            # is the source of truth; mirror it into the module-level flag.
            _httpx_patched = True
            return True

        # Capture the originals locally for the wrappers and stash them in
        # the module globals so `reset_for_tests` can restore them. The
        # wrappers use the local captures so they keep working even if the
        # globals are later cleared.
        orig_sync_init = httpx.Client.__init__
        orig_async_init = httpx.AsyncClient.__init__
        _orig_sync_init = orig_sync_init
        _orig_async_init = orig_async_init

        def _wrap_mounts(client: Any, transport_cls: type) -> None:
            """Wrap every mounted transport of `client` with `transport_cls`."""
            mounts = getattr(client, "_mounts", None)
            if not isinstance(mounts, dict):
                return
            for pattern, mounted in list(mounts.items()):
                # A None mount means "use the default transport".
                if mounted is not None and not isinstance(mounted, transport_cls):
                    mounts[pattern] = transport_cls(mounted, runtime)

        def _wrap_sync_init(self: httpx.Client, *args: Any, **kwargs: Any) -> None:
            orig_sync_init(self, *args, **kwargs)
            current = self._transport
            if not isinstance(current, NullRunSyncTransport):
                self._transport = NullRunSyncTransport(current, runtime)
            _wrap_mounts(self, NullRunSyncTransport)

        def _wrap_async_init(self: httpx.AsyncClient, *args: Any, **kwargs: Any) -> None:
            orig_async_init(self, *args, **kwargs)
            current = self._transport
            if not isinstance(current, NullRunAsyncTransport):
                self._transport = NullRunAsyncTransport(current, runtime)
            _wrap_mounts(self, NullRunAsyncTransport)

        httpx.Client.__init__ = _wrap_sync_init  # type: ignore[method-assign]
        httpx.AsyncClient.__init__ = _wrap_async_init  # type: ignore[method-assign]
        httpx.Client._nullrun_patched = True  # type: ignore[attr-defined]
        httpx.AsyncClient._nullrun_patched = True  # type: ignore[attr-defined]
        _httpx_patched = True
        logger.info("httpx auto-instrumentation installed (sync + async)")
        return True
666
+
667
+
668
+ # ---------------------------------------------------------------------------
669
+ # D4: patch_langchain_callback — in-memory mocks + callback-only flows
670
+ # ---------------------------------------------------------------------------
671
+ # The httpx hook covers langchain-openai (uses httpx) but NOT in-memory
672
+ # mock providers. Reusing NullRunCallback from langgraph.py is the right
673
+ # answer: it already extracts usage from LLMResult and emits via
674
+ # runtime.track.
675
+
676
_langchain_patched = False


def patch_langchain_callback(runtime: Any) -> bool:
    """Make every new LangChain callback manager carry a NullRunCallback.

    ``BaseCallbackManager.__init__`` is wrapped so that, after normal
    construction, a ``NullRunCallback`` bound to `runtime` is attached
    unless the manager already holds one. Idempotent.

    Returns:
        True when the patch is (or already was) installed, False when
        ``langchain_core`` cannot be imported.
    """
    global _langchain_patched
    if _langchain_patched:
        return True

    try:
        from langchain_core.callbacks import BaseCallbackManager
    except ImportError:
        logger.debug("langchain-core not installed; LangChain callback path skipped")
        return False

    # The class-level marker is authoritative; mirror it into the module flag.
    if getattr(BaseCallbackManager, "_nullrun_patched", False):
        _langchain_patched = True
        return True

    original_init = BaseCallbackManager.__init__

    def _wrap_init(self: Any, *args: Any, **kwargs: Any) -> None:
        original_init(self, *args, **kwargs)
        try:
            existing = getattr(self, "handlers", None) or []
            for handler in existing:
                if isinstance(handler, NullRunCallback):
                    # Already instrumented; do not attach a second callback.
                    return
            if not hasattr(self, "add_handler"):
                # Older LangChain: no add_handler API, set handlers directly.
                existing.append(NullRunCallback(runtime=runtime))
                self.handlers = existing
            else:
                self.add_handler(NullRunCallback(runtime=runtime))
        except Exception as e:  # pragma: no cover — defensive
            logger.debug("NullRun: failed to add callback to manager: %s", e)

    BaseCallbackManager.__init__ = _wrap_init  # type: ignore[method-assign]
    BaseCallbackManager._nullrun_patched = True  # type: ignore[attr-defined]
    _langchain_patched = True
    logger.info("LangChain callback auto-instrumentation installed")
    return True
720
+
721
+
722
+ # ---------------------------------------------------------------------------
723
+ # D5: patch_openai_agents — OpenAI Agents SDK tracer
724
+ # ---------------------------------------------------------------------------
725
+ # The `agents` package exposes a `Runner` whose `run` / `run_sync` returns
726
+ # an object that carries a `_trace_spans` list (private but stable across
727
+ # 0.1.x). We pull usage out of any `llm_call` span and emit a track event.
728
+
729
_agents_patched = False


def patch_openai_agents(runtime: Any) -> bool:
    """Wrap Runner.run and Runner.run_sync to read llm_call spans. Idempotent.

    The wrappers call the original method and pass its result to
    `_emit_from_agents_result`. When the original returns an awaitable
    (an ``async def`` ``Runner.run``), the wrapper returns a coroutine
    that awaits it first, so spans are read from the resolved result
    rather than from the un-awaited coroutine object.

    Args:
        runtime: The NullRun runtime whose `track` receives the events.

    Returns:
        True on success (or if already patched), False if the `agents`
        package is not installed.
    """
    global _agents_patched
    if _agents_patched:
        return True
    try:
        from agents import Runner  # type: ignore[import-not-found]
    except ImportError:
        logger.debug("openai-agents not installed; Agents SDK path skipped")
        return False

    if getattr(Runner, "_nullrun_patched", False):
        _agents_patched = True
        return True

    _orig_run = Runner.run
    _orig_run_sync = getattr(Runner, "run_sync", None)

    def _wrap_run(*args: Any, **kwargs: Any) -> Any:
        result = _orig_run(*args, **kwargs)
        if hasattr(result, "__await__"):
            # Async run: the spans only exist on the awaited result.
            async def _await_then_emit() -> Any:
                resolved = await result
                _emit_from_agents_result(runtime, resolved)
                return resolved

            return _await_then_emit()
        _emit_from_agents_result(runtime, result)
        return result

    def _wrap_run_sync(*args: Any, **kwargs: Any) -> Any:
        # Only installed when the original run_sync exists (see below).
        result = _orig_run_sync(*args, **kwargs)
        _emit_from_agents_result(runtime, result)
        return result

    Runner.run = _wrap_run
    if _orig_run_sync is not None:
        Runner.run_sync = _wrap_run_sync
    Runner._nullrun_patched = True
    _agents_patched = True
    logger.info("openai-agents auto-instrumentation installed")
    return True
771
+
772
+
773
+ def _emit_from_agents_result(runtime: Any, result: Any) -> None:
774
+ """Pull usage off a Runner.run result. The `agents` package stores
775
+ spans on the result's `_trace_spans` attribute (private; falls back
776
+ to `trace_spans` if exposed publicly in newer versions).
777
+ """
778
+ spans = (
779
+ getattr(result, "_trace_spans", None)
780
+ or getattr(result, "trace_spans", None)
781
+ or []
782
+ )
783
+ for span in spans:
784
+ if not isinstance(span, dict):
785
+ continue
786
+ if span.get("type") != "llm_call":
787
+ continue
788
+ usage = span.get("usage")
789
+ if not isinstance(usage, dict):
790
+ continue
791
+ prompt = int(usage.get("input_tokens", 0) or usage.get("prompt_tokens", 0) or 0)
792
+ completion = int(usage.get("output_tokens", 0) or usage.get("completion_tokens", 0) or 0)
793
+ total = int(usage.get("total_tokens", 0) or 0) or (prompt + completion)
794
+ if prompt == 0 and completion == 0 and total == 0:
795
+ continue
796
+ try:
797
+ runtime.track(
798
+ {
799
+ "type": "llm_call",
800
+ "provider": "openai_agents",
801
+ "model": span.get("model"),
802
+ "tokens": total,
803
+ "input_tokens": prompt,
804
+ "output_tokens": completion,
805
+ "has_usage": True,
806
+ "raw_usage": usage,
807
+ "_fingerprint": f"agents-{span.get('id', id(span))}",
808
+ }
809
+ )
810
+ except Exception as e: # pragma: no cover — defensive
811
+ logger.debug("NullRun: agents track failed: %s", e)
812
+
813
+
814
+ # ---------------------------------------------------------------------------
815
+ # D5b: patch_langgraph_compiled — auto-attach callback to compiled LangGraph
816
+ # ---------------------------------------------------------------------------
817
+ # A compiled LangGraph `StateGraph.compile()` returns a `Pregel` instance.
818
+ # To capture every invoke/stream/ainvoke/astream call site we monkey-patch
819
+ # the *class* methods so a NullRunCallback is added to
820
+ # `config["callbacks"]` automatically — the user does not have to call
821
+ # `nullrun.toolbox.langgraph.wrapper` explicitly. The patch is global
822
+ # (process-wide) but idempotent and a no-op if `langgraph` is not
823
+ # importable. Users who want per-app control (e.g. multiple runtimes in
824
+ # the same process) should use `wrapper()` instead.
825
+
826
# Module-level guard: set to True by `patch_langgraph_compiled` once the
# Pregel wrappers are installed (or found already installed), and cleared
# again by `reset_for_tests`.
_langgraph_compiled_patched = False
# Originals stashed on first patch so reset_for_tests can restore
# the un-patched class methods. The wrapped closures capture
# `runtime` in scope — without restoring, the wrappers would keep
# building callbacks bound to the first runtime, so a second test pass
# would silently drop events from later runtimes.
_orig_pregel_invoke: Callable[..., Any] | None = None
_orig_pregel_stream: Callable[..., Any] | None = None
_orig_pregel_ainvoke: Callable[..., Any] | None = None
_orig_pregel_astream: Callable[..., Any] | None = None
835
+
836
+
837
def patch_langgraph_compiled(runtime: Any) -> bool:
    """
    Monkey-patch `Pregel.invoke`, `Pregel.stream`, `Pregel.ainvoke` and
    `Pregel.astream` so each call carries a `NullRunCallback` in
    `config["callbacks"]`.

    The callback is only appended when the caller's list does not already
    contain a `NullRunCallback`; the caller's config and callbacks are
    copied, never mutated or replaced. Configs that are not dicts, and
    `callbacks` values that cannot be iterated, are passed through
    untouched.

    Safe to call repeatedly: once the module flag or the class marker
    `Pregel._nullrun_patched` is set, the call returns True without
    patching again. Returns False when `langgraph` cannot be imported.
    """
    global _langgraph_compiled_patched
    global _orig_pregel_invoke, _orig_pregel_stream
    global _orig_pregel_ainvoke, _orig_pregel_astream

    if _langgraph_compiled_patched:
        return True

    try:
        from langgraph.pregel import Pregel
    except ImportError:
        logger.debug("langgraph not installed; compiled-graph auto-patch skipped")
        return False

    # The class may already be wrapped even though this module's flag is
    # unset; just sync the flag.
    if getattr(Pregel, "_nullrun_patched", False):
        _langgraph_compiled_patched = True
        return True

    def _with_tracking(config: Any) -> dict[str, Any]:
        """Return `config` with a NullRunCallback appended, if still needed."""
        base = {} if config is None else config
        if not isinstance(base, dict):
            return base

        existing = base.get("callbacks")
        if existing is None:
            merged = [NullRunCallback(runtime=runtime)]
        else:
            try:
                already_tracked = any(
                    isinstance(handler, NullRunCallback) for handler in existing
                )
            except TypeError:
                # Not an iterable of handlers — leave the config alone.
                return base
            if already_tracked:
                return base
            merged = [*existing, NullRunCallback(runtime=runtime)]

        # Shallow copy so the caller's dict is left as it was.
        patched = dict(base)
        patched["callbacks"] = merged
        return patched

    base_invoke, base_stream = Pregel.invoke, Pregel.stream
    base_ainvoke, base_astream = Pregel.ainvoke, Pregel.astream

    # Keep module-level references so reset_for_tests can put the class
    # back; the wrappers below close over the locals, not these globals.
    _orig_pregel_invoke, _orig_pregel_stream = base_invoke, base_stream
    _orig_pregel_ainvoke, _orig_pregel_astream = base_ainvoke, base_astream

    def _wrap_invoke(self: Any, input: Any, config: Any = None, **extra: Any) -> Any:
        return base_invoke(self, input, _with_tracking(config), **extra)

    def _wrap_stream(self: Any, input: Any, config: Any = None, **extra: Any) -> Any:
        return base_stream(self, input, _with_tracking(config), **extra)

    async def _wrap_ainvoke(self: Any, input: Any, config: Any = None, **extra: Any) -> Any:
        return await base_ainvoke(self, input, _with_tracking(config), **extra)

    async def _wrap_astream(self: Any, input: Any, config: Any = None, **extra: Any) -> Any:
        async for item in base_astream(self, input, _with_tracking(config), **extra):
            yield item

    replacements = (
        ("invoke", _wrap_invoke),
        ("stream", _wrap_stream),
        ("ainvoke", _wrap_ainvoke),
        ("astream", _wrap_astream),
    )
    for method_name, wrapper in replacements:
        setattr(Pregel, method_name, wrapper)
    Pregel._nullrun_patched = True  # type: ignore[attr-defined]
    _langgraph_compiled_patched = True
    logger.info("LangGraph compiled-graph auto-instrumentation installed (Pregel.invoke/stream/ainvoke/astream)")
    return True
923
+
924
+
925
# ---------------------------------------------------------------------------
# D6: orchestrator
# ---------------------------------------------------------------------------
# `auto_instrument(runtime)` installs every observation path it knows about
# (httpx, LangChain callback, OpenAI Agents, compiled LangGraph, requests,
# LlamaIndex, CrewAI, AutoGen). Each patch is best-effort and silently
# no-ops if the underlying package is not installed. The user's `init()`
# call invokes this once.

# Set to True after the first `auto_instrument` run, even if no path was
# installed; later calls return early while it is set.
_auto_installed = False
# Held by `auto_instrument` around the check-and-install sequence.
_auto_lock = threading.Lock()
934
+
935
+
936
def auto_instrument(runtime: Any) -> bool:
    """Install every auto-instrumentation path, once per process.

    The first call attempts each patch through ``safe_patch`` and returns
    True when at least one of them reported success, False otherwise.
    Any later call (until ``reset_for_tests``) is a no-op that returns
    True, regardless of how many paths the first call installed.

    Sprint 2.9 (B47): routing every patch through ``safe_patch`` means a
    patch that raises a non-ImportError exception is logged at WARNING.
    Before that, the scattered ``try/except Exception: pass`` blocks let
    a vendor SDK breaking change (e.g. a renamed method) silently disable
    cost tracking with no log line.
    """
    global _auto_installed
    with _auto_lock:
        if _auto_installed:
            return True

        # Imported lazily: auto_requests needs `_safe_bump_coverage` from
        # this module at its own import time. The framework patches are
        # silent no-ops when their packages aren't installed.
        from nullrun.instrumentation._safe_patch import safe_patch
        from nullrun.instrumentation.auto_requests import patch_requests
        from nullrun.instrumentation.autogen import patch_autogen
        from nullrun.instrumentation.crewai import patch_crewai
        from nullrun.instrumentation.llama_index import patch_llama_index

        plan = (
            ("httpx", lambda: patch_httpx(runtime)),
            ("langchain_callback", lambda: patch_langchain_callback(runtime)),
            ("openai_agents", lambda: patch_openai_agents(runtime)),
            ("langgraph_compiled", lambda: patch_langgraph_compiled(runtime)),
            ("requests", lambda: patch_requests(runtime)),
            ("llama_index", lambda: patch_llama_index(runtime)),
            ("crewai", lambda: patch_crewai(runtime)),
            ("autogen", lambda: patch_autogen(runtime)),
        )
        outcomes = [safe_patch(label, attempt) for label, attempt in plan]

        # Flag the run as done even when nothing succeeded: a second
        # init() must not redo the work and risk double-patching.
        _auto_installed = True

        installed = sum(bool(outcome) for outcome in outcomes)
        if not installed:
            logger.info(
                "NullRun auto-instrumentation: no LLM frameworks detected "
                "(install one of: openai, anthropic, langchain-core, openai-agents)"
            )
            return False
        logger.info("NullRun auto-instrumentation: %d path(s) installed", installed)
        return True
986
+
987
+
988
def is_auto_instrumented() -> bool:
    """Report whether `auto_instrument` has completed a run.

    This mirrors the module flag, which is set after the first run even
    when no individual path could be installed.
    """
    already_ran = _auto_installed
    return already_ran
991
+
992
+
993
def reset_for_tests() -> None:
    """Clear all auto-instrumentation state. Test-only — never call from
    production code.

    Every "patched" flag is cleared, so a following `auto_instrument`
    will patch again. `httpx.Client.__init__`, `httpx.AsyncClient.__init__`
    and the four `Pregel` methods are put back to the implementations
    stashed at patch time, so the next patch installs fresh wrappers bound
    to the new runtime (the old wrappers' closures still reference the
    original runtime, which would silently drop events on a second test
    pass).

    Only httpx and Pregel are restored here: the langchain / agents flags
    are cleared, but nothing in this function un-patches those libraries,
    so re-running `auto_instrument` can double-wrap them in long-lived
    test processes.
    """
    global _auto_installed, _httpx_patched, _langchain_patched, _agents_patched
    global _langgraph_compiled_patched
    global _orig_sync_init, _orig_async_init
    global _orig_pregel_invoke, _orig_pregel_stream
    global _orig_pregel_ainvoke, _orig_pregel_astream

    _auto_installed = _httpx_patched = _langchain_patched = False
    _agents_patched = _langgraph_compiled_patched = False

    # Take the stashed originals, then clear the module-level slots.
    saved_sync, saved_async = _orig_sync_init, _orig_async_init
    _orig_sync_init = _orig_async_init = None

    if saved_sync is not None:
        try:
            httpx.Client.__init__ = saved_sync  # type: ignore[method-assign]
            httpx.Client._nullrun_patched = False  # type: ignore[attr-defined]
        except Exception as e:  # pragma: no cover — defensive
            logger.debug("reset_for_tests: failed to restore httpx.Client: %s", e)

    if saved_async is not None:
        try:
            httpx.AsyncClient.__init__ = saved_async  # type: ignore[method-assign]
            httpx.AsyncClient._nullrun_patched = False  # type: ignore[attr-defined]
        except Exception as e:  # pragma: no cover — defensive
            logger.debug("reset_for_tests: failed to restore httpx.AsyncClient: %s", e)

    saved_pregel = {
        "invoke": _orig_pregel_invoke,
        "stream": _orig_pregel_stream,
        "ainvoke": _orig_pregel_ainvoke,
        "astream": _orig_pregel_astream,
    }
    _orig_pregel_invoke = _orig_pregel_stream = None
    _orig_pregel_ainvoke = _orig_pregel_astream = None

    # The four originals are stashed together, so `invoke` alone tells us
    # whether Pregel was ever patched.
    if saved_pregel["invoke"] is None:
        return
    try:
        from langgraph.pregel import Pregel

        for method_name, original in saved_pregel.items():
            setattr(Pregel, method_name, original)
        Pregel._nullrun_patched = False  # type: ignore[attr-defined]
    except Exception as e:  # pragma: no cover — defensive
        logger.debug("reset_for_tests: failed to restore Pregel: %s", e)
1043
+
1044
+
1045
+ # ---------------------------------------------------------------------------
1046
+ # Dedup helper
1047
+ # ---------------------------------------------------------------------------
1048
+ # `runtime.track` consults `_seen_track_fingerprints` to drop duplicate
1049
+ # events. This is exposed here so tests can introspect / clear the LRU
1050
+ # without poking into the runtime module.
1051
+
1052
# Phase 6 #6.7: maximum number of fingerprints kept in a dedup LRU before the
# oldest entry is evicted. 4096 entries give a 410ms dedup window at
# 10K events/sec (4096 / 10_000 ≈ 0.41 s).
DEDUP_LRU_MAX = 4096
1053
+
1054
+
1055
def make_dedup_state() -> OrderedDict[str, None]:
    """Create an empty dedup LRU for a runtime instance to hold.

    Keys are event fingerprints in least- to most-recently-seen order;
    the values are unused placeholders.
    """
    fresh_state: OrderedDict[str, None] = OrderedDict()
    return fresh_state
1058
+
1059
+
1060
+ def _fingerprint_is_seen(state: OrderedDict[str, None], fp: str) -> bool:
1061
+ if not fp:
1062
+ return False
1063
+ if fp in state:
1064
+ state.move_to_end(fp)
1065
+ return True
1066
+ state[fp] = None
1067
+ if len(state) > DEDUP_LRU_MAX:
1068
+ state.popitem(last=False)
1069
+ return False
1070
+
1071
+
1072
def _safe_bump_coverage(runtime: Any, target_attr: str, host: str) -> None:
    """Increment the per-host counter stored on ``runtime.<target_attr>``.

    Tolerates stub runtimes (MagicMock, custom test doubles) that don't
    carry the attribute or carry something that is not a real counter
    mapping. Never raises: any failure to read, convert or store the
    count is logged at DEBUG and the counter is left as it was.

    ``nullrun.instrumentation.auto_requests`` imports this helper from
    this module.

    Args:
        runtime: Object expected to hold the counter mapping.
        target_attr: Attribute name of the mapping, one of
            ``_coverage_seen`` or ``_coverage_streaming_skipped``.
        host: Key whose count is incremented; a missing key counts as 0.
    """
    target = getattr(runtime, target_attr, None)
    if target is None:
        return
    try:
        if isinstance(target, dict):
            current = target.get(host, 0)
        else:
            # Non-dict containers: a missing key starts the count at zero,
            # the same as the dict path.
            try:
                current = target[host]
            except KeyError:
                current = 0
        target[host] = int(current) + 1
    except Exception as e:  # pragma: no cover — defensive
        logger.debug("_safe_bump_coverage: %s bump failed: %s", target_attr, e)