judgeval 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (96)
  1. judgeval/__init__.py +139 -12
  2. judgeval/api/__init__.py +501 -0
  3. judgeval/api/api_types.py +344 -0
  4. judgeval/cli.py +2 -4
  5. judgeval/constants.py +10 -26
  6. judgeval/data/evaluation_run.py +49 -26
  7. judgeval/data/example.py +2 -2
  8. judgeval/data/judgment_types.py +266 -82
  9. judgeval/data/result.py +4 -5
  10. judgeval/data/scorer_data.py +4 -2
  11. judgeval/data/tool.py +2 -2
  12. judgeval/data/trace.py +7 -50
  13. judgeval/data/trace_run.py +7 -4
  14. judgeval/{dataset.py → dataset/__init__.py} +43 -28
  15. judgeval/env.py +67 -0
  16. judgeval/{run_evaluation.py → evaluation/__init__.py} +29 -95
  17. judgeval/exceptions.py +27 -0
  18. judgeval/integrations/langgraph/__init__.py +788 -0
  19. judgeval/judges/__init__.py +2 -2
  20. judgeval/judges/litellm_judge.py +75 -15
  21. judgeval/judges/together_judge.py +86 -18
  22. judgeval/judges/utils.py +7 -21
  23. judgeval/{common/logger.py → logger.py} +8 -6
  24. judgeval/scorers/__init__.py +0 -4
  25. judgeval/scorers/agent_scorer.py +3 -7
  26. judgeval/scorers/api_scorer.py +8 -13
  27. judgeval/scorers/base_scorer.py +52 -32
  28. judgeval/scorers/example_scorer.py +1 -3
  29. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -14
  30. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +45 -20
  31. judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +2 -2
  32. judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +3 -3
  33. judgeval/scorers/score.py +21 -31
  34. judgeval/scorers/trace_api_scorer.py +5 -0
  35. judgeval/scorers/utils.py +1 -103
  36. judgeval/tracer/__init__.py +1075 -2
  37. judgeval/tracer/constants.py +1 -0
  38. judgeval/tracer/exporters/__init__.py +37 -0
  39. judgeval/tracer/exporters/s3.py +119 -0
  40. judgeval/tracer/exporters/store.py +43 -0
  41. judgeval/tracer/exporters/utils.py +32 -0
  42. judgeval/tracer/keys.py +67 -0
  43. judgeval/tracer/llm/__init__.py +1233 -0
  44. judgeval/{common/tracer → tracer/llm}/providers.py +5 -10
  45. judgeval/{local_eval_queue.py → tracer/local_eval_queue.py} +15 -10
  46. judgeval/tracer/managers.py +188 -0
  47. judgeval/tracer/processors/__init__.py +181 -0
  48. judgeval/tracer/utils.py +20 -0
  49. judgeval/trainer/__init__.py +5 -0
  50. judgeval/{common/trainer → trainer}/config.py +12 -9
  51. judgeval/{common/trainer → trainer}/console.py +2 -9
  52. judgeval/{common/trainer → trainer}/trainable_model.py +12 -7
  53. judgeval/{common/trainer → trainer}/trainer.py +119 -17
  54. judgeval/utils/async_utils.py +2 -3
  55. judgeval/utils/decorators.py +24 -0
  56. judgeval/utils/file_utils.py +37 -4
  57. judgeval/utils/guards.py +32 -0
  58. judgeval/utils/meta.py +14 -0
  59. judgeval/{common/api/json_encoder.py → utils/serialize.py} +7 -1
  60. judgeval/utils/testing.py +88 -0
  61. judgeval/utils/url.py +10 -0
  62. judgeval/{version_check.py → utils/version_check.py} +3 -3
  63. judgeval/version.py +5 -0
  64. judgeval/warnings.py +4 -0
  65. {judgeval-0.8.0.dist-info → judgeval-0.9.0.dist-info}/METADATA +12 -14
  66. judgeval-0.9.0.dist-info/RECORD +80 -0
  67. judgeval/clients.py +0 -35
  68. judgeval/common/__init__.py +0 -13
  69. judgeval/common/api/__init__.py +0 -3
  70. judgeval/common/api/api.py +0 -375
  71. judgeval/common/api/constants.py +0 -186
  72. judgeval/common/exceptions.py +0 -27
  73. judgeval/common/storage/__init__.py +0 -6
  74. judgeval/common/storage/s3_storage.py +0 -97
  75. judgeval/common/tracer/__init__.py +0 -31
  76. judgeval/common/tracer/constants.py +0 -22
  77. judgeval/common/tracer/core.py +0 -2427
  78. judgeval/common/tracer/otel_exporter.py +0 -108
  79. judgeval/common/tracer/otel_span_processor.py +0 -188
  80. judgeval/common/tracer/span_processor.py +0 -37
  81. judgeval/common/tracer/span_transformer.py +0 -207
  82. judgeval/common/tracer/trace_manager.py +0 -101
  83. judgeval/common/trainer/__init__.py +0 -5
  84. judgeval/common/utils.py +0 -948
  85. judgeval/integrations/langgraph.py +0 -844
  86. judgeval/judges/mixture_of_judges.py +0 -287
  87. judgeval/judgment_client.py +0 -267
  88. judgeval/rules.py +0 -521
  89. judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
  90. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
  91. judgeval/utils/alerts.py +0 -93
  92. judgeval/utils/requests.py +0 -50
  93. judgeval-0.8.0.dist-info/RECORD +0 -82
  94. {judgeval-0.8.0.dist-info → judgeval-0.9.0.dist-info}/WHEEL +0 -0
  95. {judgeval-0.8.0.dist-info → judgeval-0.9.0.dist-info}/entry_points.txt +0 -0
  96. {judgeval-0.8.0.dist-info → judgeval-0.9.0.dist-info}/licenses/LICENSE.md +0 -0
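
The file list reflects the 0.9.0 restructuring: the judgeval.common.* packages are removed and their contents move to top-level modules (judgeval.tracer, judgeval.api, judgeval.trainer, judgeval.env, judgeval.logger). A minimal import-migration sketch, based only on the paths visible in this diff (the old re-export removed in the hunk below and the new judgeval/tracer/__init__.py that replaces it); it is illustrative, not an official migration guide:

    # judgeval 0.8.0 (module removed in 0.9.0)
    # from judgeval.common.tracer import Tracer, wrap

    # judgeval 0.9.0: Tracer and wrap are defined directly in judgeval.tracer
    from judgeval.tracer import Tracer, wrap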
judgeval/tracer/__init__.py
@@ -1,3 +1,1076 @@
- from judgeval.common.tracer import Tracer, wrap, TraceClient, TraceManagerClient
+ from __future__ import annotations
+ import os
+ from contextvars import ContextVar
+ import atexit
+ import functools
+ import inspect
+ import random
+ from typing import (
+     Any,
+     Union,
+     Callable,
+     Dict,
+     List,
+     Optional,
+     Tuple,
+     Type,
+     TypeVar,
+     overload,
+     Literal,
+     TypedDict,
+     Iterator,
+     AsyncIterator,
+ )
+ from functools import partial
+ from warnings import warn

- __all__ = ["Tracer", "wrap", "TraceClient", "TraceManagerClient"]
+ from opentelemetry.sdk.trace import SpanProcessor, TracerProvider, Span
+ from opentelemetry.sdk.resources import Resource
+ from opentelemetry.trace import (
+     Status,
+     StatusCode,
+     TracerProvider as ABCTracerProvider,
+     NoOpTracerProvider,
+     Tracer as ABCTracer,
+     get_current_span,
+ )
+
+ from judgeval.data.evaluation_run import ExampleEvaluationRun, TraceEvaluationRun
+ from judgeval.data.example import Example
+ from judgeval.env import (
+     JUDGMENT_API_KEY,
+     JUDGMENT_DEFAULT_GPT_MODEL,
+     JUDGMENT_ORG_ID,
+ )
+ from judgeval.logger import judgeval_logger
+ from judgeval.scorers.api_scorer import APIScorerConfig
+ from judgeval.scorers.trace_api_scorer import TraceAPIScorerConfig
+ from judgeval.scorers.base_scorer import BaseScorer
+ from judgeval.tracer.constants import JUDGEVAL_TRACER_INSTRUMENTING_MODULE_NAME
+ from judgeval.tracer.managers import (
+     sync_span_context,
+     async_span_context,
+     sync_agent_context,
+     async_agent_context,
+ )
+ from judgeval.utils.serialize import safe_serialize
+ from judgeval.version import get_version
+ from judgeval.warnings import JudgmentWarning
+
+ from judgeval.tracer.keys import AttributeKeys, ResourceKeys, InternalAttributeKeys
+ from judgeval.api import JudgmentSyncClient
+ from judgeval.tracer.llm import wrap_provider
+ from judgeval.utils.url import url_for
+ from judgeval.tracer.local_eval_queue import LocalEvaluationQueue
+ from judgeval.tracer.processors import (
+     JudgmentSpanProcessor,
+     NoOpJudgmentSpanProcessor,
+ )
+ from judgeval.tracer.utils import set_span_attribute, TraceScorerConfig
+
+ C = TypeVar("C", bound=Callable)
+ Cls = TypeVar("Cls", bound=Type)
+ ApiClient = TypeVar("ApiClient", bound=Any)
+
+
+ class AgentContext(TypedDict):
+     agent_id: str
+     class_name: str | None
+     instance_name: str | None
+     track_state: bool
+     track_attributes: List[str] | None
+     field_mappings: Dict[str, str]
+     instance: Any
+     is_agent_entry_point: bool
+     parent_agent_id: str | None
+
+
+ def resolve_project_id(
+     api_key: str, organization_id: str, project_name: str
+ ) -> str | None:
+     try:
+         client = JudgmentSyncClient(
+             api_key=api_key,
+             organization_id=organization_id,
+         )
+         return client.projects_resolve({"project_name": project_name})["project_id"]
+     except Exception:
+         return None
+
+
+ class Tracer:
+     _active_tracers: List[Tracer] = []
+
+     __slots__ = (
+         "api_key",
+         "organization_id",
+         "project_name",
+         "api_url",
+         "deep_tracing",
+         "enable_monitoring",
+         "enable_evaluation",
+         "api_client",
+         "local_eval_queue",
+         # Otel
+         "judgment_processor",
+         "processors",
+         "provider",
+         "tracer",
+         # Agent
+         "agent_context",
+         "cost_context",
+     )
+
+     api_key: str
+     organization_id: str
+     project_name: str
+     api_url: str
+     deep_tracing: bool
+     enable_monitoring: bool
+     enable_evaluation: bool
+     api_client: JudgmentSyncClient
+     local_eval_queue: LocalEvaluationQueue
+
+     judgment_processor: JudgmentSpanProcessor
+     processors: List[SpanProcessor]
+     provider: ABCTracerProvider
+     tracer: ABCTracer
+
+     agent_context: ContextVar[Optional[AgentContext]]
+     cost_context: ContextVar[Optional[Dict[str, float]]]
+
+     def __init__(
+         self,
+         /,
+         *,
+         project_name: str,
+         api_key: Optional[str] = None,
+         organization_id: Optional[str] = None,
+         deep_tracing: bool = False,
+         enable_monitoring: bool = os.getenv(
+             "JUDGMENT_ENABLE_MONITORING", "true"
+         ).lower()
+         != "false",
+         enable_evaluation: bool = os.getenv(
+             "JUDGMENT_ENABLE_EVALUATIONS", "true"
+         ).lower()
+         != "false",
+         processors: List[SpanProcessor] = [],
+         resource_attributes: Optional[Dict[str, Any]] = None,
+     ):
+         _api_key = api_key or JUDGMENT_API_KEY
+         _organization_id = organization_id or JUDGMENT_ORG_ID
+
+         if _api_key is None:
+             raise ValueError(
+                 "API Key is not set, please set it in the environment variables or pass it as `api_key`"
+             )
+
+         if _organization_id is None:
+             raise ValueError(
+                 "Organization ID is not set, please set it in the environment variables or pass it as `organization_id`"
+             )
+
+         self.api_key = _api_key
+         self.organization_id = _organization_id
+         self.project_name = project_name
+         self.api_url = url_for("/otel/v1/traces")
+
+         self.deep_tracing = deep_tracing
+         self.enable_monitoring = enable_monitoring
+         self.enable_evaluation = enable_evaluation
+
+         self.judgment_processor = NoOpJudgmentSpanProcessor()
+         self.processors = processors
+         self.provider = NoOpTracerProvider()
+
+         self.agent_context = ContextVar("current_agent_context", default=None)
+         self.cost_context = ContextVar("current_cost_context", default=None)
+
+         if self.enable_monitoring:
+             project_id = resolve_project_id(
+                 self.api_key, self.organization_id, self.project_name
+             )
+
+             resource_attributes = resource_attributes or {}
+             resource_attributes.update(
+                 {
+                     ResourceKeys.SERVICE_NAME: self.project_name,
+                     ResourceKeys.TELEMETRY_SDK_NAME: "judgeval",
+                     ResourceKeys.TELEMETRY_SDK_VERSION: get_version(),
+                 }
+             )
+
+             if project_id is not None:
+                 resource_attributes[ResourceKeys.JUDGMENT_PROJECT_ID] = project_id
+             else:
+                 judgeval_logger.error(
+                     f"Failed to resolve project {self.project_name}, please create it first at https://app.judgmentlabs.ai/projects. Skipping Judgment export."
+                 )
+
+             resource = Resource.create(resource_attributes)
+
+             self.judgment_processor = JudgmentSpanProcessor(
+                 self,
+                 self.api_url,
+                 self.api_key,
+                 self.organization_id,
+                 max_queue_size=2**18,
+                 export_timeout_millis=30000,
+             )
+             self.processors.append(self.judgment_processor)
+             self.provider = TracerProvider(resource=resource)
+             for processor in self.processors:
+                 self.provider.add_span_processor(processor)
+
+         self.tracer = self.provider.get_tracer(
+             JUDGEVAL_TRACER_INSTRUMENTING_MODULE_NAME,
+             get_version(),
+         )
+         self.api_client = JudgmentSyncClient(
+             api_key=self.api_key,
+             organization_id=self.organization_id,
+         )
+         self.local_eval_queue = LocalEvaluationQueue()
+
+         if self.enable_evaluation and self.enable_monitoring:
+             self.local_eval_queue.start_workers()
+
+         Tracer._active_tracers.append(self)
+
+         # Register atexit handler to flush on program exit
+         atexit.register(self._atexit_flush)
+
+     def get_current_span(self):
+         return get_current_span()
+
+     def get_tracer(self):
+         return self.tracer
+
+     def get_current_agent_context(self):
+         return self.agent_context
+
+     def get_current_cost_context(self):
+         return self.cost_context
+
+     def set_customer_id(self, customer_id: str) -> None:
+         span = self.get_current_span()
+         if span and span.is_recording():
+             set_span_attribute(span, AttributeKeys.JUDGMENT_CUSTOMER_ID, customer_id)
+
+     def add_cost_to_current_context(self, cost: Optional[float]) -> None:
+         """Add cost to the current cost context and update span attribute."""
+         if cost is None:
+             return
+         current_cost_context = self.cost_context.get()
+         if current_cost_context is not None:
+             current_cumulative_cost = current_cost_context.get("cumulative_cost", 0.0)
+             new_cumulative_cost = float(current_cumulative_cost) + cost
+             current_cost_context["cumulative_cost"] = new_cumulative_cost
+
+             span = self.get_current_span()
+             if span and span.is_recording():
+                 set_span_attribute(
+                     span,
+                     AttributeKeys.JUDGMENT_CUMULATIVE_LLM_COST,
+                     new_cumulative_cost,
+                 )
+
+     def add_agent_attributes_to_span(self, span):
+         """Add agent ID, class name, and instance name to span if they exist in context"""
+         current_agent_context = self.agent_context.get()
+         if not current_agent_context:
+             return
+
+         set_span_attribute(
+             span, AttributeKeys.JUDGMENT_AGENT_ID, current_agent_context["agent_id"]
+         )
+         set_span_attribute(
+             span,
+             AttributeKeys.JUDGMENT_AGENT_CLASS_NAME,
+             current_agent_context["class_name"],
+         )
+         set_span_attribute(
+             span,
+             AttributeKeys.JUDGMENT_AGENT_INSTANCE_NAME,
+             current_agent_context["instance_name"],
+         )
+         set_span_attribute(
+             span,
+             AttributeKeys.JUDGMENT_PARENT_AGENT_ID,
+             current_agent_context["parent_agent_id"],
+         )
+         set_span_attribute(
+             span,
+             AttributeKeys.JUDGMENT_IS_AGENT_ENTRY_POINT,
+             current_agent_context["is_agent_entry_point"],
+         )
+         current_agent_context["is_agent_entry_point"] = False
+
+     def record_instance_state(self, record_point: Literal["before", "after"], span):
+         current_agent_context = self.agent_context.get()
+
+         if current_agent_context and current_agent_context.get("track_state"):
+             instance = current_agent_context.get("instance")
+             track_attributes = current_agent_context.get("track_attributes")
+             field_mappings = current_agent_context.get("field_mappings", {})
+
+             if track_attributes is not None:
+                 attributes = {
+                     field_mappings.get(attr, attr): getattr(instance, attr, None)
+                     for attr in track_attributes
+                 }
+             else:
+                 attributes = {
+                     field_mappings.get(k, k): v
+                     for k, v in instance.__dict__.items()
+                     if not k.startswith("_")
+                 }
+             set_span_attribute(
+                 span,
+                 (
+                     AttributeKeys.JUDGMENT_STATE_BEFORE
+                     if record_point == "before"
+                     else AttributeKeys.JUDGMENT_STATE_AFTER
+                 ),
+                 safe_serialize(attributes),
+             )
+
+     def _set_pending_trace_eval(
+         self,
+         span: Span,
+         scorer_config: TraceScorerConfig,
+         args: Tuple[Any, ...],
+         kwargs: Dict[str, Any],
+     ):
+         if not self.enable_evaluation:
+             return
+
+         scorer = scorer_config.scorer
+         model = scorer_config.model
+         run_condition = scorer_config.run_condition
+         sampling_rate = scorer_config.sampling_rate
+
+         if not isinstance(scorer, (TraceAPIScorerConfig)):
+             judgeval_logger.error(
+                 "Scorer must be an instance of TraceAPIScorerConfig, got %s, skipping evaluation."
+                 % type(scorer)
+             )
+             return
+
+         if run_condition is not None and not run_condition(*args, **kwargs):
+             return
+
+         if sampling_rate < 0 or sampling_rate > 1:
+             judgeval_logger.error(
+                 "Sampling rate must be between 0 and 1, got %s, skipping evaluation."
+                 % sampling_rate
+             )
+             return
+
+         percentage = random.uniform(0, 1)
+         if percentage > sampling_rate:
+             judgeval_logger.info(
+                 "Sampling rate is %s, skipping evaluation." % sampling_rate
+             )
+             return
+
+         span_context = span.get_span_context()
+         trace_id = format(span_context.trace_id, "032x")
+         span_id = format(span_context.span_id, "016x")
+         eval_run_name = f"async_trace_evaluate_{span_id}"
+
+         eval_run = TraceEvaluationRun(
+             organization_id=self.organization_id,
+             project_name=self.project_name,
+             eval_name=eval_run_name,
+             scorers=[scorer],
+             model=model,
+             trace_and_span_ids=[(trace_id, span_id)],
+         )
+         span.set_attribute(
+             AttributeKeys.PENDING_TRACE_EVAL,
+             safe_serialize(eval_run.model_dump(warnings=False)),
+         )
+
+     def _create_traced_sync_generator(
+         self,
+         generator: Iterator[Any],
+         main_span: Span,
+         base_name: str,
+         attributes: Optional[Dict[str, Any]],
+     ):
+         """Create a traced synchronous generator that wraps each yield in a span."""
+         try:
+             while True:
+                 yield_span_name = f"{base_name}_yield"
+                 yield_attributes = {
+                     AttributeKeys.JUDGMENT_SPAN_KIND: "generator_yield",
+                     **(attributes or {}),
+                 }
+
+                 with sync_span_context(
+                     self, yield_span_name, yield_attributes, disable_partial_emit=True
+                 ) as yield_span:
+                     self.add_agent_attributes_to_span(yield_span)
+
+                     try:
+                         value = next(generator)
+                     except StopIteration:
+                         # Mark span as cancelled so it won't be exported
+                         self.judgment_processor.set_internal_attribute(
+                             span_context=yield_span.get_span_context(),
+                             key=InternalAttributeKeys.CANCELLED,
+                             value=True,
+                         )
+                         break
+
+                     set_span_attribute(
+                         yield_span,
+                         AttributeKeys.JUDGMENT_OUTPUT,
+                         safe_serialize(value),
+                     )
+
+                     yield value
+         except Exception as e:
+             main_span.record_exception(e)
+             main_span.set_status(Status(StatusCode.ERROR, str(e)))
+             raise
+
+     async def _create_traced_async_generator(
+         self,
+         async_generator: AsyncIterator[Any],
+         main_span: Span,
+         base_name: str,
+         attributes: Optional[Dict[str, Any]],
+     ):
+         """Create a traced asynchronous generator that wraps each yield in a span."""
+         try:
+             while True:
+                 yield_span_name = f"{base_name}_yield"
+                 yield_attributes = {
+                     AttributeKeys.JUDGMENT_SPAN_KIND: "async_generator_yield",
+                     **(attributes or {}),
+                 }
+
+                 async with async_span_context(
+                     self, yield_span_name, yield_attributes, disable_partial_emit=True
+                 ) as yield_span:
+                     self.add_agent_attributes_to_span(yield_span)
+
+                     try:
+                         value = await async_generator.__anext__()
+                     except StopAsyncIteration:
+                         # Mark span as cancelled so it won't be exported
+                         self.judgment_processor.set_internal_attribute(
+                             span_context=yield_span.get_span_context(),
+                             key=InternalAttributeKeys.CANCELLED,
+                             value=True,
+                         )
+                         break
+
+                     set_span_attribute(
+                         yield_span,
+                         AttributeKeys.JUDGMENT_OUTPUT,
+                         safe_serialize(value),
+                     )
+
+                     yield value
+         except Exception as e:
+             main_span.record_exception(e)
+             main_span.set_status(Status(StatusCode.ERROR, str(e)))
+             raise
+
+     def _wrap_sync(
+         self,
+         f: Callable,
+         name: Optional[str],
+         attributes: Optional[Dict[str, Any]],
+         scorer_config: TraceScorerConfig | None = None,
+     ):
+         # Check if this is a generator function - if so, wrap it specially
+         if inspect.isgeneratorfunction(f):
+             return self._wrap_sync_generator_function(
+                 f, name, attributes, scorer_config
+             )
+
+         @functools.wraps(f)
+         def wrapper(*args, **kwargs):
+             n = name or f.__qualname__
+             with sync_span_context(self, n, attributes) as span:
+                 self.add_agent_attributes_to_span(span)
+                 self.record_instance_state("before", span)
+                 try:
+                     set_span_attribute(
+                         span,
+                         AttributeKeys.JUDGMENT_INPUT,
+                         safe_serialize(format_inputs(f, args, kwargs)),
+                     )
+
+                     if scorer_config:
+                         self._set_pending_trace_eval(span, scorer_config, args, kwargs)
+
+                     self.judgment_processor.emit_partial()
+
+                     result = f(*args, **kwargs)
+                 except Exception as user_exc:
+                     span.record_exception(user_exc)
+                     span.set_status(Status(StatusCode.ERROR, str(user_exc)))
+                     raise
+
+                 if inspect.isgenerator(result):
+                     set_span_attribute(
+                         span, AttributeKeys.JUDGMENT_OUTPUT, "<generator>"
+                     )
+                     self.record_instance_state("after", span)
+                     return self._create_traced_sync_generator(
+                         result, span, n, attributes
+                     )
+                 else:
+                     set_span_attribute(
+                         span, AttributeKeys.JUDGMENT_OUTPUT, safe_serialize(result)
+                     )
+                     self.record_instance_state("after", span)
+                     return result
+
+         return wrapper
+
+     def _wrap_sync_generator_function(
+         self,
+         f: Callable,
+         name: Optional[str],
+         attributes: Optional[Dict[str, Any]],
+         scorer_config: TraceScorerConfig | None = None,
+     ):
+         """Wrap a generator function to trace nested function calls within each yield."""
+
+         @functools.wraps(f)
+         def wrapper(*args, **kwargs):
+             n = name or f.__qualname__
+
+             with sync_span_context(self, n, attributes) as main_span:
+                 self.add_agent_attributes_to_span(main_span)
+                 self.record_instance_state("before", main_span)
+
+                 try:
+                     set_span_attribute(
+                         main_span,
+                         AttributeKeys.JUDGMENT_INPUT,
+                         safe_serialize(format_inputs(f, args, kwargs)),
+                     )
+
+                     if scorer_config:
+                         self._set_pending_trace_eval(
+                             main_span, scorer_config, args, kwargs
+                         )
+
+                     self.judgment_processor.emit_partial()
+
+                     generator = f(*args, **kwargs)
+                     set_span_attribute(
+                         main_span, AttributeKeys.JUDGMENT_OUTPUT, "<generator>"
+                     )
+                     self.record_instance_state("after", main_span)
+
+                     return self._create_traced_sync_generator(
+                         generator, main_span, n, attributes
+                     )
+
+                 except Exception as user_exc:
+                     main_span.record_exception(user_exc)
+                     main_span.set_status(Status(StatusCode.ERROR, str(user_exc)))
+                     raise
+
+         return wrapper
+
+     def _wrap_async(
+         self,
+         f: Callable,
+         name: Optional[str],
+         attributes: Optional[Dict[str, Any]],
+         scorer_config: TraceScorerConfig | None = None,
+     ):
+         # Check if this is an async generator function - if so, wrap it specially
+         if inspect.isasyncgenfunction(f):
+             return self._wrap_async_generator_function(
+                 f, name, attributes, scorer_config
+             )
+
+         @functools.wraps(f)
+         async def wrapper(*args, **kwargs):
+             n = name or f.__qualname__
+             async with async_span_context(self, n, attributes) as span:
+                 self.add_agent_attributes_to_span(span)
+                 self.record_instance_state("before", span)
+                 try:
+                     set_span_attribute(
+                         span,
+                         AttributeKeys.JUDGMENT_INPUT,
+                         safe_serialize(format_inputs(f, args, kwargs)),
+                     )
+
+                     if scorer_config:
+                         self._set_pending_trace_eval(span, scorer_config, args, kwargs)
+
+                     self.judgment_processor.emit_partial()
+
+                     result = await f(*args, **kwargs)
+                 except Exception as user_exc:
+                     span.record_exception(user_exc)
+                     span.set_status(Status(StatusCode.ERROR, str(user_exc)))
+                     raise
+
+                 if inspect.isasyncgen(result):
+                     set_span_attribute(
+                         span, AttributeKeys.JUDGMENT_OUTPUT, "<async_generator>"
+                     )
+                     self.record_instance_state("after", span)
+                     return self._create_traced_async_generator(
+                         result, span, n, attributes
+                     )
+                 else:
+                     set_span_attribute(
+                         span, AttributeKeys.JUDGMENT_OUTPUT, safe_serialize(result)
+                     )
+                     self.record_instance_state("after", span)
+                     return result
+
+         return wrapper
+
+     def _wrap_async_generator_function(
+         self,
+         f: Callable,
+         name: Optional[str],
+         attributes: Optional[Dict[str, Any]],
+         scorer_config: TraceScorerConfig | None = None,
+     ):
+         """Wrap an async generator function to trace nested function calls within each yield."""
+
+         @functools.wraps(f)
+         def wrapper(*args, **kwargs):
+             n = name or f.__qualname__
+
+             with sync_span_context(self, n, attributes) as main_span:
+                 self.add_agent_attributes_to_span(main_span)
+                 self.record_instance_state("before", main_span)
+
+                 try:
+                     set_span_attribute(
+                         main_span,
+                         AttributeKeys.JUDGMENT_INPUT,
+                         safe_serialize(format_inputs(f, args, kwargs)),
+                     )
+
+                     if scorer_config:
+                         self._set_pending_trace_eval(
+                             main_span, scorer_config, args, kwargs
+                         )
+
+                     self.judgment_processor.emit_partial()
+
+                     async_generator = f(*args, **kwargs)
+                     set_span_attribute(
+                         main_span, AttributeKeys.JUDGMENT_OUTPUT, "<async_generator>"
+                     )
+                     self.record_instance_state("after", main_span)
+
+                     return self._create_traced_async_generator(
+                         async_generator, main_span, n, attributes
+                     )
+
+                 except Exception as user_exc:
+                     main_span.record_exception(user_exc)
+                     main_span.set_status(Status(StatusCode.ERROR, str(user_exc)))
+                     raise
+
+         return wrapper
+
+     @overload
+     def observe(
+         self,
+         func: C,
+         /,
+         *,
+         span_type: str | None = None,
+         scorer_config: TraceScorerConfig | None = None,
+     ) -> C: ...
+
+     @overload
+     def observe(
+         self,
+         func: None = None,
+         /,
+         *,
+         span_type: str | None = None,
+         scorer_config: TraceScorerConfig | None = None,
+     ) -> Callable[[C], C]: ...
+
+     def observe(
+         self,
+         func: Callable | None = None,
+         /,
+         *,
+         span_type: str | None = "span",
+         span_name: str | None = None,
+         attributes: Optional[Dict[str, Any]] = None,
+         scorer_config: TraceScorerConfig | None = None,
+     ) -> Callable | None:
+         if func is None:
+             return partial(
+                 self.observe,
+                 span_type=span_type,
+                 span_name=span_name,
+                 attributes=attributes,
+                 scorer_config=scorer_config,
+             )
+
+         if not self.enable_monitoring:
+             return func
+
+         # Handle functions (including generator functions) - detect generators at runtime
+         name = span_name or getattr(func, "__qualname__", "function")
+         func_attributes: Dict[str, Any] = {
+             AttributeKeys.JUDGMENT_SPAN_KIND: span_type,
+             **(attributes or {}),
+         }
+
+         if inspect.iscoroutinefunction(func) or inspect.isasyncgenfunction(func):
+             return self._wrap_async(func, name, func_attributes, scorer_config)
+         else:
+             return self._wrap_sync(func, name, func_attributes, scorer_config)
+
+     @overload
+     def agent(
+         self,
+         func: C,
+         /,
+         *,
+         identifier: str | None = None,
+         track_state: bool = False,
+         track_attributes: List[str] | None = None,
+         field_mappings: Dict[str, str] = {},
+     ) -> C: ...
+
+     @overload
+     def agent(
+         self,
+         func: None = None,
+         /,
+         *,
+         identifier: str | None = None,
+         track_state: bool = False,
+         track_attributes: List[str] | None = None,
+         field_mappings: Dict[str, str] = {},
+     ) -> Callable[[C], C]: ...
+
+     def agent(
+         self,
+         func: Callable | None = None,
+         /,
+         *,
+         identifier: str | None = None,
+         track_state: bool = False,
+         track_attributes: List[str] | None = None,
+         field_mappings: Dict[str, str] = {},
+     ) -> Callable | None:
+         """
+         Agent decorator that creates an agent ID and propagates it to child spans.
+         Also captures and propagates the class name if the decorated function is a method.
+         Optionally captures instance name based on the specified identifier attribute.
+
+         This decorator should be used in combination with @observe decorator:
+
+         class MyAgent:
+             def __init__(self, name):
+                 self.name = name
+
+             @judgment.agent(identifier="name")
+             @judgment.observe(span_type="function")
+             def my_agent_method(self):
+                 # This span and all child spans will have:
+                 # - agent_id: auto-generated UUID
+                 # - class_name: "MyAgent"
+                 # - instance_name: self.name value
+                 pass
+
+         Args:
+             identifier: Name of the instance attribute to use as the instance name
+         """
+         if func is None:
+             return partial(
+                 self.agent,
+                 identifier=identifier,
+                 track_state=track_state,
+                 track_attributes=track_attributes,
+                 field_mappings=field_mappings,
+             )
+
+         if not self.enable_monitoring:
+             return func
+
+         class_name = None
+         if hasattr(func, "__qualname__") and "." in func.__qualname__:
+             parts = func.__qualname__.split(".")
+             if len(parts) >= 2:
+                 class_name = parts[-2]
+
+         if inspect.iscoroutinefunction(func):
+
+             @functools.wraps(func)
+             async def async_wrapper(*args, **kwargs):
+                 async with async_agent_context(
+                     tracer=self,
+                     args=args,
+                     class_name=class_name,
+                     identifier=identifier,
+                     track_state=track_state,
+                     track_attributes=track_attributes,
+                     field_mappings=field_mappings,
+                 ):
+                     return await func(*args, **kwargs)
+
+             return async_wrapper
+         else:
+
+             @functools.wraps(func)
+             def sync_wrapper(*args, **kwargs):
+                 with sync_agent_context(
+                     tracer=self,
+                     args=args,
+                     class_name=class_name,
+                     identifier=identifier,
+                     track_state=track_state,
+                     track_attributes=track_attributes,
+                     field_mappings=field_mappings,
+                 ):
+                     return func(*args, **kwargs)
+
+             return sync_wrapper
+
+     @overload
+     def observe_tools(
+         self,
+         cls: Cls,
+         /,
+         *,
+         exclude_methods: List[str] = [],
+         include_private: bool = False,
+     ) -> Cls: ...
+
+     @overload
+     def observe_tools(
+         self,
+         cls: None = None,
+         /,
+         *,
+         exclude_methods: List[str] = [],
+         include_private: bool = False,
+     ) -> Callable[[Cls], Cls]: ...
+
+     def observe_tools(
+         self,
+         cls: Cls | None = None,
+         /,
+         *,
+         exclude_methods: List[str] = [],
+         include_private: bool = False,
+     ) -> Cls | Callable[[Cls], Cls]:
+         if cls is None:
+             return partial(
+                 self.observe_tools,
+                 exclude_methods=exclude_methods,
+                 include_private=include_private,
+             )
+         return cls
+
+     def wrap(self, client: ApiClient) -> ApiClient:
+         return wrap_provider(self, client)
+
+     def force_flush(self, timeout_millis: int = 30000) -> bool:
+         """Force flush all pending spans and block until completion.
+
+         Args:
+             timeout_millis: Maximum time to wait for flush completion in milliseconds
+
+         Returns:
+             True if all processors flushed successfully within timeout, False otherwise
+         """
+         success = True
+         for processor in self.processors:
+             try:
+                 result = processor.force_flush(timeout_millis)
+                 if not result:
+                     success = False
+             except Exception as e:
+                 judgeval_logger.warning(f"Error flushing processor {processor}: {e}")
+                 success = False
+         return success
+
+     def _atexit_flush(self) -> None:
+         """Internal method called on program exit to flush remaining spans.
+
+         This blocks until all spans are flushed or timeout is reached to ensure
+         proper cleanup before program termination.
+         """
+         try:
+             success = self.force_flush(timeout_millis=30000)
+             if not success:
+                 judgeval_logger.warning(
+                     "Some spans may not have been exported before program exit"
+                 )
+         except Exception as e:
+             judgeval_logger.warning(f"Error during atexit flush: {e}")
+
+     def async_evaluate(
+         self,
+         /,
+         *,
+         scorer: Union[APIScorerConfig, BaseScorer],
+         example: Example,
+         model: str = JUDGMENT_DEFAULT_GPT_MODEL,
+         sampling_rate: float = 1.0,
+     ):
+         if not self.enable_evaluation or not self.enable_monitoring:
+             judgeval_logger.info("Evaluation is not enabled, skipping evaluation")
+             return
+
+         if not isinstance(scorer, (APIScorerConfig, BaseScorer)):
+             judgeval_logger.error(
+                 "Scorer must be an instance of APIScorerConfig or BaseScorer, got %s, skipping evaluation."
+                 % type(scorer)
+             )
+             return
+
+         if not isinstance(example, Example):
+             judgeval_logger.error(
+                 "Example must be an instance of Example, got %s, skipping evaluation."
+                 % type(example)
+             )
+             return
+
+         if sampling_rate < 0 or sampling_rate > 1:
+             judgeval_logger.error(
+                 "Sampling rate must be between 0 and 1, got %s, skipping evaluation."
+                 % sampling_rate
+             )
+             return
+
+         percentage = random.uniform(0, 1)
+         if percentage > sampling_rate:
+             judgeval_logger.info(
+                 "Sampling rate is %s, skipping evaluation." % sampling_rate
+             )
+             return
+
+         span_context = self.get_current_span().get_span_context()
+         trace_id = format(span_context.trace_id, "032x")
+         span_id = format(span_context.span_id, "016x")
+         hosted_scoring = isinstance(scorer, APIScorerConfig) or (
+             isinstance(scorer, BaseScorer) and scorer.server_hosted
+         )
+         eval_run_name = f"async_evaluate_{span_id}"  # note this name doesnt matter because we don't save the experiment only the example and scorer_data
+         if hosted_scoring:
+             eval_run = ExampleEvaluationRun(
+                 organization_id=self.organization_id,
+                 project_name=self.project_name,
+                 eval_name=eval_run_name,
+                 examples=[example],
+                 scorers=[scorer],
+                 model=model,
+                 trace_span_id=span_id,
+                 trace_id=trace_id,
+             )
+             self.api_client.add_to_run_eval_queue_examples(
+                 eval_run.model_dump(warnings=False)
+             )  # type: ignore
+         else:
+             # Handle custom scorers using local evaluation queue
+             eval_run = ExampleEvaluationRun(
+                 organization_id=self.organization_id,
+                 project_name=self.project_name,
+                 eval_name=eval_run_name,
+                 examples=[example],
+                 scorers=[scorer],
+                 model=model,
+                 trace_span_id=span_id,
+                 trace_id=trace_id,
+             )
+
+             # Enqueue the evaluation run to the local evaluation queue
+             self.local_eval_queue.enqueue(eval_run)
+
+     def wait_for_completion(self, timeout: Optional[float] = 30.0) -> bool:
+         """Wait for all evaluations and span processing to complete.
+
+         This method blocks until all queued evaluations are processed and
+         all pending spans are flushed to the server.
+
+         Args:
+             timeout: Maximum time to wait in seconds. Defaults to 30 seconds.
+                 None means wait indefinitely.
+
+         Returns:
+             True if all processing completed within the timeout, False otherwise.
+
+         """
+         try:
+             judgeval_logger.debug(
+                 "Waiting for all evaluations and spans to complete..."
+             )
+
+             # Wait for all queued evaluation work to complete
+             eval_completed = self.local_eval_queue.wait_for_completion()
+             if not eval_completed:
+                 judgeval_logger.warning(
+                     f"Local evaluation queue did not complete within {timeout} seconds"
+                 )
+                 return False
+
+             self.force_flush()
+
+             judgeval_logger.debug("All evaluations and spans completed successfully")
+             return True
+
+         except Exception as e:
+             judgeval_logger.warning(f"Error while waiting for completion: {e}")
+             return False
+
+
+ def wrap(client: ApiClient) -> ApiClient:
+     if not Tracer._active_tracers:
+         warn(
+             "No active tracers found, client will not be wrapped. "
+             "You can use the global `wrap` function after creating a tracer instance. "
+             "Or you can use the `wrap` method on the tracer instance to directly wrap the client. ",
+             JudgmentWarning,
+             stacklevel=2,
+         )
+
+     wrapped_client = client
+     for tracer in Tracer._active_tracers:
+         wrapped_client = tracer.wrap(wrapped_client)
+     return wrapped_client
+
+
+ def format_inputs(
+     f: Callable, args: Tuple[Any, ...], kwargs: Dict[str, Any]
+ ) -> Dict[str, Any]:
+     try:
+         params = list(inspect.signature(f).parameters.values())
+         inputs = {}
+         arg_i = 0
+         for param in params:
+             if param.kind == inspect.Parameter.POSITIONAL_OR_KEYWORD:
+                 if arg_i < len(args):
+                     inputs[param.name] = args[arg_i]
+                     arg_i += 1
+                 elif param.name in kwargs:
+                     inputs[param.name] = kwargs[param.name]
+             elif param.kind == inspect.Parameter.VAR_POSITIONAL:
+                 inputs[param.name] = args[arg_i:]
+                 arg_i = len(args)
+             elif param.kind == inspect.Parameter.VAR_KEYWORD:
+                 inputs[param.name] = kwargs
+         return inputs
+     except Exception:
+         return {}
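
A minimal usage sketch of the new Tracer API shown in the hunk above. The project name, function body, and return value are illustrative assumptions, not part of the package; only Tracer, observe, wrap, and force_flush come from the diff:

    from judgeval.tracer import Tracer

    # Falls back to the JUDGMENT_API_KEY / JUDGMENT_ORG_ID values from judgeval.env
    # when api_key / organization_id are not passed explicitly.
    judgment = Tracer(project_name="my-project")  # project name is hypothetical

    @judgment.observe(span_type="function")
    def answer(question: str) -> str:
        # Inputs, outputs, and exceptions are recorded on an OpenTelemetry span.
        return f"echo: {question}"

    # LLM clients can be instrumented via judgment.wrap(client) or the module-level
    # wrap(client); supported providers are determined by judgeval.tracer.llm, which
    # is not shown in this hunk.
    answer("What changed in judgeval 0.9.0?")
    judgment.force_flush()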