judgeval 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +139 -12
- judgeval/api/__init__.py +501 -0
- judgeval/api/api_types.py +344 -0
- judgeval/cli.py +2 -4
- judgeval/constants.py +10 -26
- judgeval/data/evaluation_run.py +49 -26
- judgeval/data/example.py +2 -2
- judgeval/data/judgment_types.py +266 -82
- judgeval/data/result.py +4 -5
- judgeval/data/scorer_data.py +4 -2
- judgeval/data/tool.py +2 -2
- judgeval/data/trace.py +7 -50
- judgeval/data/trace_run.py +7 -4
- judgeval/{dataset.py → dataset/__init__.py} +43 -28
- judgeval/env.py +67 -0
- judgeval/{run_evaluation.py → evaluation/__init__.py} +29 -95
- judgeval/exceptions.py +27 -0
- judgeval/integrations/langgraph/__init__.py +788 -0
- judgeval/judges/__init__.py +2 -2
- judgeval/judges/litellm_judge.py +75 -15
- judgeval/judges/together_judge.py +86 -18
- judgeval/judges/utils.py +7 -21
- judgeval/{common/logger.py → logger.py} +8 -6
- judgeval/scorers/__init__.py +0 -4
- judgeval/scorers/agent_scorer.py +3 -7
- judgeval/scorers/api_scorer.py +8 -13
- judgeval/scorers/base_scorer.py +52 -32
- judgeval/scorers/example_scorer.py +1 -3
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -14
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +45 -20
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +3 -3
- judgeval/scorers/score.py +21 -31
- judgeval/scorers/trace_api_scorer.py +5 -0
- judgeval/scorers/utils.py +1 -103
- judgeval/tracer/__init__.py +1075 -2
- judgeval/tracer/constants.py +1 -0
- judgeval/tracer/exporters/__init__.py +37 -0
- judgeval/tracer/exporters/s3.py +119 -0
- judgeval/tracer/exporters/store.py +43 -0
- judgeval/tracer/exporters/utils.py +32 -0
- judgeval/tracer/keys.py +67 -0
- judgeval/tracer/llm/__init__.py +1233 -0
- judgeval/{common/tracer → tracer/llm}/providers.py +5 -10
- judgeval/{local_eval_queue.py → tracer/local_eval_queue.py} +15 -10
- judgeval/tracer/managers.py +188 -0
- judgeval/tracer/processors/__init__.py +181 -0
- judgeval/tracer/utils.py +20 -0
- judgeval/trainer/__init__.py +5 -0
- judgeval/{common/trainer → trainer}/config.py +12 -9
- judgeval/{common/trainer → trainer}/console.py +2 -9
- judgeval/{common/trainer → trainer}/trainable_model.py +12 -7
- judgeval/{common/trainer → trainer}/trainer.py +119 -17
- judgeval/utils/async_utils.py +2 -3
- judgeval/utils/decorators.py +24 -0
- judgeval/utils/file_utils.py +37 -4
- judgeval/utils/guards.py +32 -0
- judgeval/utils/meta.py +14 -0
- judgeval/{common/api/json_encoder.py → utils/serialize.py} +7 -1
- judgeval/utils/testing.py +88 -0
- judgeval/utils/url.py +10 -0
- judgeval/{version_check.py → utils/version_check.py} +3 -3
- judgeval/version.py +5 -0
- judgeval/warnings.py +4 -0
- {judgeval-0.8.0.dist-info → judgeval-0.9.0.dist-info}/METADATA +12 -14
- judgeval-0.9.0.dist-info/RECORD +80 -0
- judgeval/clients.py +0 -35
- judgeval/common/__init__.py +0 -13
- judgeval/common/api/__init__.py +0 -3
- judgeval/common/api/api.py +0 -375
- judgeval/common/api/constants.py +0 -186
- judgeval/common/exceptions.py +0 -27
- judgeval/common/storage/__init__.py +0 -6
- judgeval/common/storage/s3_storage.py +0 -97
- judgeval/common/tracer/__init__.py +0 -31
- judgeval/common/tracer/constants.py +0 -22
- judgeval/common/tracer/core.py +0 -2427
- judgeval/common/tracer/otel_exporter.py +0 -108
- judgeval/common/tracer/otel_span_processor.py +0 -188
- judgeval/common/tracer/span_processor.py +0 -37
- judgeval/common/tracer/span_transformer.py +0 -207
- judgeval/common/tracer/trace_manager.py +0 -101
- judgeval/common/trainer/__init__.py +0 -5
- judgeval/common/utils.py +0 -948
- judgeval/integrations/langgraph.py +0 -844
- judgeval/judges/mixture_of_judges.py +0 -287
- judgeval/judgment_client.py +0 -267
- judgeval/rules.py +0 -521
- judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
- judgeval/utils/alerts.py +0 -93
- judgeval/utils/requests.py +0 -50
- judgeval-0.8.0.dist-info/RECORD +0 -82
- {judgeval-0.8.0.dist-info → judgeval-0.9.0.dist-info}/WHEEL +0 -0
- {judgeval-0.8.0.dist-info → judgeval-0.9.0.dist-info}/entry_points.txt +0 -0
- {judgeval-0.8.0.dist-info → judgeval-0.9.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/tracer/__init__.py
CHANGED
@@ -1,3 +1,1076 @@
|
|
1
|
-
from
|
1
|
+
from __future__ import annotations
|
2
|
+
import os
|
3
|
+
from contextvars import ContextVar
|
4
|
+
import atexit
|
5
|
+
import functools
|
6
|
+
import inspect
|
7
|
+
import random
|
8
|
+
from typing import (
|
9
|
+
Any,
|
10
|
+
Union,
|
11
|
+
Callable,
|
12
|
+
Dict,
|
13
|
+
List,
|
14
|
+
Optional,
|
15
|
+
Tuple,
|
16
|
+
Type,
|
17
|
+
TypeVar,
|
18
|
+
overload,
|
19
|
+
Literal,
|
20
|
+
TypedDict,
|
21
|
+
Iterator,
|
22
|
+
AsyncIterator,
|
23
|
+
)
|
24
|
+
from functools import partial
|
25
|
+
from warnings import warn
|
2
26
|
|
3
|
-
|
27
|
+
from opentelemetry.sdk.trace import SpanProcessor, TracerProvider, Span
|
28
|
+
from opentelemetry.sdk.resources import Resource
|
29
|
+
from opentelemetry.trace import (
|
30
|
+
Status,
|
31
|
+
StatusCode,
|
32
|
+
TracerProvider as ABCTracerProvider,
|
33
|
+
NoOpTracerProvider,
|
34
|
+
Tracer as ABCTracer,
|
35
|
+
get_current_span,
|
36
|
+
)
|
37
|
+
|
38
|
+
from judgeval.data.evaluation_run import ExampleEvaluationRun, TraceEvaluationRun
|
39
|
+
from judgeval.data.example import Example
|
40
|
+
from judgeval.env import (
|
41
|
+
JUDGMENT_API_KEY,
|
42
|
+
JUDGMENT_DEFAULT_GPT_MODEL,
|
43
|
+
JUDGMENT_ORG_ID,
|
44
|
+
)
|
45
|
+
from judgeval.logger import judgeval_logger
|
46
|
+
from judgeval.scorers.api_scorer import APIScorerConfig
|
47
|
+
from judgeval.scorers.trace_api_scorer import TraceAPIScorerConfig
|
48
|
+
from judgeval.scorers.base_scorer import BaseScorer
|
49
|
+
from judgeval.tracer.constants import JUDGEVAL_TRACER_INSTRUMENTING_MODULE_NAME
|
50
|
+
from judgeval.tracer.managers import (
|
51
|
+
sync_span_context,
|
52
|
+
async_span_context,
|
53
|
+
sync_agent_context,
|
54
|
+
async_agent_context,
|
55
|
+
)
|
56
|
+
from judgeval.utils.serialize import safe_serialize
|
57
|
+
from judgeval.version import get_version
|
58
|
+
from judgeval.warnings import JudgmentWarning
|
59
|
+
|
60
|
+
from judgeval.tracer.keys import AttributeKeys, ResourceKeys, InternalAttributeKeys
|
61
|
+
from judgeval.api import JudgmentSyncClient
|
62
|
+
from judgeval.tracer.llm import wrap_provider
|
63
|
+
from judgeval.utils.url import url_for
|
64
|
+
from judgeval.tracer.local_eval_queue import LocalEvaluationQueue
|
65
|
+
from judgeval.tracer.processors import (
|
66
|
+
JudgmentSpanProcessor,
|
67
|
+
NoOpJudgmentSpanProcessor,
|
68
|
+
)
|
69
|
+
from judgeval.tracer.utils import set_span_attribute, TraceScorerConfig
|
70
|
+
|
71
|
+
C = TypeVar("C", bound=Callable)
|
72
|
+
Cls = TypeVar("Cls", bound=Type)
|
73
|
+
ApiClient = TypeVar("ApiClient", bound=Any)
|
74
|
+
|
75
|
+
|
76
|
+
class AgentContext(TypedDict):
    """Per-agent state propagated through ``Tracer.agent_context`` so child
    spans can be tagged with the agent that produced them.

    Stored in a ContextVar; mutated in place (see
    ``add_agent_attributes_to_span`` flipping ``is_agent_entry_point``).
    """

    # Unique id for this agent invocation (set when entering an agent context).
    agent_id: str
    # Class name of the decorated method's owner, when detectable from __qualname__.
    class_name: str | None
    # Human-readable instance name, resolved from the `identifier` attribute.
    instance_name: str | None
    # When True, record_instance_state serializes instance state before/after calls.
    track_state: bool
    # Explicit allow-list of instance attributes to record; None means all public attrs.
    track_attributes: List[str] | None
    # Renames applied to recorded attribute keys (original name -> reported name).
    field_mappings: Dict[str, str]
    # The agent instance itself (typically the bound method's `self`).
    instance: Any
    # True only until the first span is emitted inside this agent context.
    is_agent_entry_point: bool
    # agent_id of the enclosing agent context, if nested.
    parent_agent_id: str | None
|
86
|
+
|
87
|
+
|
88
|
+
def resolve_project_id(
    api_key: str, organization_id: str, project_name: str
) -> str | None:
    """Best-effort resolution of a project name to its backend project id.

    Returns None on any failure (network error, unknown project, bad
    credentials) — the caller decides how to react to an unresolved project.
    """
    try:
        client = JudgmentSyncClient(
            api_key=api_key, organization_id=organization_id
        )
        response = client.projects_resolve({"project_name": project_name})
        return response["project_id"]
    except Exception:
        # Deliberately broad: resolution is optional and must never crash startup.
        return None
|
99
|
+
|
100
|
+
|
101
|
+
class Tracer:
    """Entry point for judgeval tracing: configures an OpenTelemetry tracer
    provider and exposes the ``observe`` / ``agent`` decorators."""

    # Every Tracer constructed in this process. NOTE(review): presumably
    # consumed by the `_atexit_flush` hook registered in __init__ (defined
    # elsewhere in this class) — confirm against the rest of the file.
    _active_tracers: List[Tracer] = []

    __slots__ = (
        "api_key",
        "organization_id",
        "project_name",
        "api_url",
        "deep_tracing",
        "enable_monitoring",
        "enable_evaluation",
        "api_client",
        "local_eval_queue",
        # Otel
        "judgment_processor",
        "processors",
        "provider",
        "tracer",
        # Agent
        "agent_context",
        "cost_context",
    )

    api_key: str
    organization_id: str
    project_name: str
    # OTLP traces endpoint, derived via url_for("/otel/v1/traces").
    api_url: str
    deep_tracing: bool
    enable_monitoring: bool
    enable_evaluation: bool
    api_client: JudgmentSyncClient
    local_eval_queue: LocalEvaluationQueue

    # Judgment-specific processor (NoOp when monitoring is disabled).
    judgment_processor: JudgmentSpanProcessor
    # All span processors attached to the provider, including judgment_processor.
    processors: List[SpanProcessor]
    provider: ABCTracerProvider
    tracer: ABCTracer

    # Per-task agent context (see AgentContext) and cumulative LLM cost context.
    agent_context: ContextVar[Optional[AgentContext]]
    cost_context: ContextVar[Optional[Dict[str, float]]]
|
141
|
+
|
142
|
+
def __init__(
    self,
    /,
    *,
    project_name: str,
    api_key: Optional[str] = None,
    organization_id: Optional[str] = None,
    deep_tracing: bool = False,
    enable_monitoring: bool = os.getenv(
        "JUDGMENT_ENABLE_MONITORING", "true"
    ).lower()
    != "false",
    enable_evaluation: bool = os.getenv(
        "JUDGMENT_ENABLE_EVALUATIONS", "true"
    ).lower()
    != "false",
    processors: Optional[List[SpanProcessor]] = None,
    resource_attributes: Optional[Dict[str, Any]] = None,
):
    """Configure tracing for *project_name*.

    Args:
        project_name: Judgment project the traces belong to.
        api_key: Overrides the JUDGMENT_API_KEY environment value.
        organization_id: Overrides the JUDGMENT_ORG_ID environment value.
        deep_tracing: Stored as-is for downstream use.
        enable_monitoring: When False, a NoOp provider is kept and nothing
            is exported (default read from JUDGMENT_ENABLE_MONITORING at
            import time).
        enable_evaluation: Gates the local evaluation queue workers
            (default read from JUDGMENT_ENABLE_EVALUATIONS at import time).
        processors: Extra span processors to attach in addition to the
            Judgment processor.
        resource_attributes: Extra OTel resource attributes.

    Raises:
        ValueError: If no API key or organization id can be resolved.
    """
    _api_key = api_key or JUDGMENT_API_KEY
    _organization_id = organization_id or JUDGMENT_ORG_ID

    if _api_key is None:
        raise ValueError(
            "API Key is not set, please set it in the environment variables or pass it as `api_key`"
        )

    if _organization_id is None:
        raise ValueError(
            "Organization ID is not set, please set it in the environment variables or pass it as `organization_id`"
        )

    self.api_key = _api_key
    self.organization_id = _organization_id
    self.project_name = project_name
    self.api_url = url_for("/otel/v1/traces")

    self.deep_tracing = deep_tracing
    self.enable_monitoring = enable_monitoring
    self.enable_evaluation = enable_evaluation

    self.judgment_processor = NoOpJudgmentSpanProcessor()
    # Copy the caller's list (and never use a shared mutable default):
    # we append our own processor below, and mutating a default-argument
    # list would leak processors across every Tracer instance.
    self.processors = list(processors) if processors is not None else []
    self.provider = NoOpTracerProvider()

    self.agent_context = ContextVar("current_agent_context", default=None)
    self.cost_context = ContextVar("current_cost_context", default=None)

    if self.enable_monitoring:
        project_id = resolve_project_id(
            self.api_key, self.organization_id, self.project_name
        )

        resource_attributes = resource_attributes or {}
        resource_attributes.update(
            {
                ResourceKeys.SERVICE_NAME: self.project_name,
                ResourceKeys.TELEMETRY_SDK_NAME: "judgeval",
                ResourceKeys.TELEMETRY_SDK_VERSION: get_version(),
            }
        )

        if project_id is not None:
            resource_attributes[ResourceKeys.JUDGMENT_PROJECT_ID] = project_id
        else:
            judgeval_logger.error(
                f"Failed to resolve project {self.project_name}, please create it first at https://app.judgmentlabs.ai/projects. Skipping Judgment export."
            )

        resource = Resource.create(resource_attributes)

        self.judgment_processor = JudgmentSpanProcessor(
            self,
            self.api_url,
            self.api_key,
            self.organization_id,
            max_queue_size=2**18,
            export_timeout_millis=30000,
        )
        self.processors.append(self.judgment_processor)
        self.provider = TracerProvider(resource=resource)
        for processor in self.processors:
            self.provider.add_span_processor(processor)

    # The provider is NoOp when monitoring is disabled, so this tracer is
    # always usable regardless of configuration.
    self.tracer = self.provider.get_tracer(
        JUDGEVAL_TRACER_INSTRUMENTING_MODULE_NAME,
        get_version(),
    )
    self.api_client = JudgmentSyncClient(
        api_key=self.api_key,
        organization_id=self.organization_id,
    )
    self.local_eval_queue = LocalEvaluationQueue()

    if self.enable_evaluation and self.enable_monitoring:
        self.local_eval_queue.start_workers()

    Tracer._active_tracers.append(self)

    # Register atexit handler to flush on program exit
    atexit.register(self._atexit_flush)
|
243
|
+
|
244
|
+
def get_current_span(self):
    """Return the span active in the current OpenTelemetry context
    (delegates to ``opentelemetry.trace.get_current_span``)."""
    return get_current_span()
|
246
|
+
|
247
|
+
def get_tracer(self):
    """Return the underlying OpenTelemetry tracer instance."""
    return self.tracer
|
249
|
+
|
250
|
+
def get_current_agent_context(self):
    """Return the agent-context ContextVar itself (not its current value);
    callers are expected to call ``.get()`` / ``.set()`` on it."""
    return self.agent_context
|
252
|
+
|
253
|
+
def get_current_cost_context(self):
    """Return the cost-context ContextVar itself (not its current value);
    callers are expected to call ``.get()`` / ``.set()`` on it."""
    return self.cost_context
|
255
|
+
|
256
|
+
def set_customer_id(self, customer_id: str) -> None:
    """Tag the currently recording span with *customer_id*.

    A no-op when there is no active span or it is not recording.
    """
    current = self.get_current_span()
    if current and current.is_recording():
        set_span_attribute(current, AttributeKeys.JUDGMENT_CUSTOMER_ID, customer_id)
|
260
|
+
|
261
|
+
def add_cost_to_current_context(self, cost: Optional[float]) -> None:
    """Add cost to the current cost context and update span attribute.

    No-op when *cost* is None or no cost context has been established
    for the current task.
    """
    if cost is None:
        return
    current_cost_context = self.cost_context.get()
    if current_cost_context is not None:
        # Accumulate into the shared per-context total ("cumulative_cost").
        current_cumulative_cost = current_cost_context.get("cumulative_cost", 0.0)
        new_cumulative_cost = float(current_cumulative_cost) + cost
        current_cost_context["cumulative_cost"] = new_cumulative_cost

        # Mirror the running total onto the active span, if one is recording.
        span = self.get_current_span()
        if span and span.is_recording():
            set_span_attribute(
                span,
                AttributeKeys.JUDGMENT_CUMULATIVE_LLM_COST,
                new_cumulative_cost,
            )
|
278
|
+
|
279
|
+
def add_agent_attributes_to_span(self, span):
    """Add agent ID, class name, and instance name to span if they exist in context"""
    ctx = self.agent_context.get()
    if not ctx:
        return

    # Copy the agent identity fields onto the span, in a fixed order.
    for attribute_key, context_key in (
        (AttributeKeys.JUDGMENT_AGENT_ID, "agent_id"),
        (AttributeKeys.JUDGMENT_AGENT_CLASS_NAME, "class_name"),
        (AttributeKeys.JUDGMENT_AGENT_INSTANCE_NAME, "instance_name"),
        (AttributeKeys.JUDGMENT_PARENT_AGENT_ID, "parent_agent_id"),
        (AttributeKeys.JUDGMENT_IS_AGENT_ENTRY_POINT, "is_agent_entry_point"),
    ):
        set_span_attribute(span, attribute_key, ctx[context_key])

    # Only the first span emitted inside an agent context is its entry point.
    ctx["is_agent_entry_point"] = False
|
309
|
+
|
310
|
+
def record_instance_state(self, record_point: Literal["before", "after"], span):
    """Serialize the tracked agent instance's state onto *span*.

    Only acts when the current agent context has ``track_state`` enabled.
    Records either the explicit ``track_attributes`` allow-list or every
    public (non-underscore) attribute, applying ``field_mappings`` renames.
    """
    current_agent_context = self.agent_context.get()

    if current_agent_context and current_agent_context.get("track_state"):
        # NOTE(review): assumes `instance` is set whenever track_state is —
        # getattr tolerates missing attrs, but instance.__dict__ below would
        # raise if instance were None. Confirm against the agent decorator.
        instance = current_agent_context.get("instance")
        track_attributes = current_agent_context.get("track_attributes")
        field_mappings = current_agent_context.get("field_mappings", {})

        if track_attributes is not None:
            # Explicit allow-list; missing attributes are recorded as None.
            attributes = {
                field_mappings.get(attr, attr): getattr(instance, attr, None)
                for attr in track_attributes
            }
        else:
            # Default: snapshot all public instance attributes.
            attributes = {
                field_mappings.get(k, k): v
                for k, v in instance.__dict__.items()
                if not k.startswith("_")
            }
        set_span_attribute(
            span,
            (
                AttributeKeys.JUDGMENT_STATE_BEFORE
                if record_point == "before"
                else AttributeKeys.JUDGMENT_STATE_AFTER
            ),
            safe_serialize(attributes),
        )
|
338
|
+
|
339
|
+
def _set_pending_trace_eval(
    self,
    span: Span,
    scorer_config: TraceScorerConfig,
    args: Tuple[Any, ...],
    kwargs: Dict[str, Any],
):
    """Attach a serialized TraceEvaluationRun to *span* for deferred scoring.

    Silently returns (with an error log where appropriate) when evaluation
    is disabled, the scorer has the wrong type, the run condition rejects
    the call, the sampling rate is invalid, or the call is sampled out.

    Args:
        span: The span the pending evaluation is attached to.
        scorer_config: Scorer, model, run condition and sampling rate.
        args/kwargs: The traced call's arguments, forwarded to run_condition.
    """
    if not self.enable_evaluation:
        return

    scorer = scorer_config.scorer
    model = scorer_config.model
    run_condition = scorer_config.run_condition
    sampling_rate = scorer_config.sampling_rate

    if not isinstance(scorer, TraceAPIScorerConfig):
        # Lazy %-args: formatting only happens if the record is emitted.
        judgeval_logger.error(
            "Scorer must be an instance of TraceAPIScorerConfig, got %s, skipping evaluation.",
            type(scorer),
        )
        return

    if run_condition is not None and not run_condition(*args, **kwargs):
        return

    if not (0 <= sampling_rate <= 1):
        judgeval_logger.error(
            "Sampling rate must be between 0 and 1, got %s, skipping evaluation.",
            sampling_rate,
        )
        return

    # Bernoulli sampling: keep the evaluation with probability sampling_rate.
    if random.uniform(0, 1) > sampling_rate:
        judgeval_logger.info(
            "Sampling rate is %s, skipping evaluation.", sampling_rate
        )
        return

    # Identify the span in W3C hex form (128-bit trace id, 64-bit span id).
    span_context = span.get_span_context()
    trace_id = format(span_context.trace_id, "032x")
    span_id = format(span_context.span_id, "016x")
    eval_run_name = f"async_trace_evaluate_{span_id}"

    eval_run = TraceEvaluationRun(
        organization_id=self.organization_id,
        project_name=self.project_name,
        eval_name=eval_run_name,
        scorers=[scorer],
        model=model,
        trace_and_span_ids=[(trace_id, span_id)],
    )
    span.set_attribute(
        AttributeKeys.PENDING_TRACE_EVAL,
        safe_serialize(eval_run.model_dump(warnings=False)),
    )
|
395
|
+
|
396
|
+
def _create_traced_sync_generator(
    self,
    generator: Iterator[Any],
    main_span: Span,
    base_name: str,
    attributes: Optional[Dict[str, Any]],
):
    """Create a traced synchronous generator that wraps each yield in a span.

    Each item pulled from *generator* gets its own "<base_name>_yield" span
    carrying the serialized value; exhaustion cancels the final (empty)
    yield span so it is never exported. Exceptions raised by the generator
    are recorded on *main_span* and re-raised.
    """
    try:
        while True:
            yield_span_name = f"{base_name}_yield"
            yield_attributes = {
                AttributeKeys.JUDGMENT_SPAN_KIND: "generator_yield",
                **(attributes or {}),
            }

            # disable_partial_emit: yield spans are short-lived, so no
            # intermediate partial export is wanted.
            with sync_span_context(
                self, yield_span_name, yield_attributes, disable_partial_emit=True
            ) as yield_span:
                self.add_agent_attributes_to_span(yield_span)

                try:
                    value = next(generator)
                except StopIteration:
                    # Mark span as cancelled so it won't be exported
                    self.judgment_processor.set_internal_attribute(
                        span_context=yield_span.get_span_context(),
                        key=InternalAttributeKeys.CANCELLED,
                        value=True,
                    )
                    break

                set_span_attribute(
                    yield_span,
                    AttributeKeys.JUDGMENT_OUTPUT,
                    safe_serialize(value),
                )

                # Yield inside the span context: the yield span also covers
                # the consumer's processing time until the next resumption.
                yield value
    except Exception as e:
        main_span.record_exception(e)
        main_span.set_status(Status(StatusCode.ERROR, str(e)))
        raise
|
439
|
+
|
440
|
+
async def _create_traced_async_generator(
    self,
    async_generator: AsyncIterator[Any],
    main_span: Span,
    base_name: str,
    attributes: Optional[Dict[str, Any]],
):
    """Create a traced asynchronous generator that wraps each yield in a span.

    Async counterpart of ``_create_traced_sync_generator``: each item pulled
    from *async_generator* gets its own "<base_name>_yield" span; exhaustion
    cancels the final (empty) yield span; exceptions are recorded on
    *main_span* and re-raised.
    """
    try:
        while True:
            yield_span_name = f"{base_name}_yield"
            yield_attributes = {
                AttributeKeys.JUDGMENT_SPAN_KIND: "async_generator_yield",
                **(attributes or {}),
            }

            async with async_span_context(
                self, yield_span_name, yield_attributes, disable_partial_emit=True
            ) as yield_span:
                self.add_agent_attributes_to_span(yield_span)

                try:
                    value = await async_generator.__anext__()
                except StopAsyncIteration:
                    # Mark span as cancelled so it won't be exported
                    self.judgment_processor.set_internal_attribute(
                        span_context=yield_span.get_span_context(),
                        key=InternalAttributeKeys.CANCELLED,
                        value=True,
                    )
                    break

                set_span_attribute(
                    yield_span,
                    AttributeKeys.JUDGMENT_OUTPUT,
                    safe_serialize(value),
                )

                # Yield inside the span context: the yield span also covers
                # the consumer's processing time until the next resumption.
                yield value
    except Exception as e:
        main_span.record_exception(e)
        main_span.set_status(Status(StatusCode.ERROR, str(e)))
        raise
|
483
|
+
|
484
|
+
def _wrap_sync(
    self,
    f: Callable,
    name: Optional[str],
    attributes: Optional[Dict[str, Any]],
    scorer_config: TraceScorerConfig | None = None,
):
    """Wrap a synchronous callable so each invocation runs inside a span.

    Records serialized inputs, optional pending evaluation, instance state
    before/after, and serialized output. If the call returns a generator,
    output is recorded as "<generator>" and the generator is re-wrapped so
    each yield is traced.
    """
    # Check if this is a generator function - if so, wrap it specially
    if inspect.isgeneratorfunction(f):
        return self._wrap_sync_generator_function(
            f, name, attributes, scorer_config
        )

    @functools.wraps(f)
    def wrapper(*args, **kwargs):
        n = name or f.__qualname__
        with sync_span_context(self, n, attributes) as span:
            self.add_agent_attributes_to_span(span)
            self.record_instance_state("before", span)
            try:
                # format_inputs is a module-level helper (defined elsewhere
                # in this file); it shapes args/kwargs for serialization.
                set_span_attribute(
                    span,
                    AttributeKeys.JUDGMENT_INPUT,
                    safe_serialize(format_inputs(f, args, kwargs)),
                )

                if scorer_config:
                    self._set_pending_trace_eval(span, scorer_config, args, kwargs)

                # Emit a partial span so long-running calls show up early.
                self.judgment_processor.emit_partial()

                result = f(*args, **kwargs)
            except Exception as user_exc:
                span.record_exception(user_exc)
                span.set_status(Status(StatusCode.ERROR, str(user_exc)))
                raise

            if inspect.isgenerator(result):
                set_span_attribute(
                    span, AttributeKeys.JUDGMENT_OUTPUT, "<generator>"
                )
                self.record_instance_state("after", span)
                return self._create_traced_sync_generator(
                    result, span, n, attributes
                )
            else:
                set_span_attribute(
                    span, AttributeKeys.JUDGMENT_OUTPUT, safe_serialize(result)
                )
                self.record_instance_state("after", span)
                return result

    return wrapper
|
537
|
+
|
538
|
+
def _wrap_sync_generator_function(
    self,
    f: Callable,
    name: Optional[str],
    attributes: Optional[Dict[str, Any]],
    scorer_config: TraceScorerConfig | None = None,
):
    """Wrap a generator function to trace nested function calls within each yield.

    Opens the main span eagerly (when the generator is *created*, not first
    iterated), records inputs and the "<generator>" output marker, then
    hands the generator to ``_create_traced_sync_generator`` for per-yield
    tracing.
    """

    @functools.wraps(f)
    def wrapper(*args, **kwargs):
        n = name or f.__qualname__

        with sync_span_context(self, n, attributes) as main_span:
            self.add_agent_attributes_to_span(main_span)
            self.record_instance_state("before", main_span)

            try:
                set_span_attribute(
                    main_span,
                    AttributeKeys.JUDGMENT_INPUT,
                    safe_serialize(format_inputs(f, args, kwargs)),
                )

                if scorer_config:
                    self._set_pending_trace_eval(
                        main_span, scorer_config, args, kwargs
                    )

                self.judgment_processor.emit_partial()

                # Calling a generator function only creates the generator;
                # "after" state is recorded before any item is produced.
                generator = f(*args, **kwargs)
                set_span_attribute(
                    main_span, AttributeKeys.JUDGMENT_OUTPUT, "<generator>"
                )
                self.record_instance_state("after", main_span)

                return self._create_traced_sync_generator(
                    generator, main_span, n, attributes
                )

            except Exception as user_exc:
                main_span.record_exception(user_exc)
                main_span.set_status(Status(StatusCode.ERROR, str(user_exc)))
                raise

    return wrapper
|
585
|
+
|
586
|
+
def _wrap_async(
    self,
    f: Callable,
    name: Optional[str],
    attributes: Optional[Dict[str, Any]],
    scorer_config: TraceScorerConfig | None = None,
):
    """Wrap an async callable so each invocation runs inside a span.

    Async counterpart of ``_wrap_sync``: records inputs, optional pending
    evaluation, instance state, and serialized output. Async-generator
    results are re-wrapped for per-yield tracing.
    """
    # Check if this is an async generator function - if so, wrap it specially
    if inspect.isasyncgenfunction(f):
        return self._wrap_async_generator_function(
            f, name, attributes, scorer_config
        )

    @functools.wraps(f)
    async def wrapper(*args, **kwargs):
        n = name or f.__qualname__
        async with async_span_context(self, n, attributes) as span:
            self.add_agent_attributes_to_span(span)
            self.record_instance_state("before", span)
            try:
                set_span_attribute(
                    span,
                    AttributeKeys.JUDGMENT_INPUT,
                    safe_serialize(format_inputs(f, args, kwargs)),
                )

                if scorer_config:
                    self._set_pending_trace_eval(span, scorer_config, args, kwargs)

                # Emit a partial span so long-running calls show up early.
                self.judgment_processor.emit_partial()

                result = await f(*args, **kwargs)
            except Exception as user_exc:
                span.record_exception(user_exc)
                span.set_status(Status(StatusCode.ERROR, str(user_exc)))
                raise

            if inspect.isasyncgen(result):
                set_span_attribute(
                    span, AttributeKeys.JUDGMENT_OUTPUT, "<async_generator>"
                )
                self.record_instance_state("after", span)
                return self._create_traced_async_generator(
                    result, span, n, attributes
                )
            else:
                set_span_attribute(
                    span, AttributeKeys.JUDGMENT_OUTPUT, safe_serialize(result)
                )
                self.record_instance_state("after", span)
                return result

    return wrapper
|
639
|
+
|
640
|
+
def _wrap_async_generator_function(
    self,
    f: Callable,
    name: Optional[str],
    attributes: Optional[Dict[str, Any]],
    scorer_config: TraceScorerConfig | None = None,
):
    """Wrap an async generator function to trace nested function calls within each yield.

    Like ``_wrap_sync_generator_function``, the main span opens when the
    async generator is created (a synchronous operation), and per-yield
    tracing is delegated to ``_create_traced_async_generator``.
    """

    # NOTE(review): the wrapper is a plain `def` using the *sync* span
    # context even though it produces an async generator — calling an
    # async-gen function is synchronous, so this works, but the main span's
    # context-manager exit happens when wrapper returns, before the
    # generator is consumed. Confirm this lifetime is intended.
    @functools.wraps(f)
    def wrapper(*args, **kwargs):
        n = name or f.__qualname__

        with sync_span_context(self, n, attributes) as main_span:
            self.add_agent_attributes_to_span(main_span)
            self.record_instance_state("before", main_span)

            try:
                set_span_attribute(
                    main_span,
                    AttributeKeys.JUDGMENT_INPUT,
                    safe_serialize(format_inputs(f, args, kwargs)),
                )

                if scorer_config:
                    self._set_pending_trace_eval(
                        main_span, scorer_config, args, kwargs
                    )

                self.judgment_processor.emit_partial()

                async_generator = f(*args, **kwargs)
                set_span_attribute(
                    main_span, AttributeKeys.JUDGMENT_OUTPUT, "<async_generator>"
                )
                self.record_instance_state("after", main_span)

                return self._create_traced_async_generator(
                    async_generator, main_span, n, attributes
                )

            except Exception as user_exc:
                main_span.record_exception(user_exc)
                main_span.set_status(Status(StatusCode.ERROR, str(user_exc)))
                raise

    return wrapper
|
687
|
+
|
688
|
+
@overload
def observe(
    self,
    func: C,
    /,
    *,
    span_type: str | None = None,
    span_name: str | None = None,
    attributes: Optional[Dict[str, Any]] = None,
    scorer_config: TraceScorerConfig | None = None,
) -> C: ...

@overload
def observe(
    self,
    func: None = None,
    /,
    *,
    span_type: str | None = None,
    span_name: str | None = None,
    attributes: Optional[Dict[str, Any]] = None,
    scorer_config: TraceScorerConfig | None = None,
) -> Callable[[C], C]: ...

# Fix: the overloads previously omitted `span_name` and `attributes`,
# which the implementation accepts — type checkers rejected valid calls.
# Overloads are typing-only; runtime behavior is unchanged.
def observe(
    self,
    func: Callable | None = None,
    /,
    *,
    span_type: str | None = "span",
    span_name: str | None = None,
    attributes: Optional[Dict[str, Any]] = None,
    scorer_config: TraceScorerConfig | None = None,
) -> Callable | None:
    """Decorator that traces each call to *func* in its own span.

    Usable bare (``@tracer.observe``) or with arguments
    (``@tracer.observe(span_type=...)``). Returns *func* unchanged when
    monitoring is disabled.

    Args:
        func: The callable to wrap (None when used with arguments).
        span_type: Recorded as the span-kind attribute.
        span_name: Overrides the span name (defaults to __qualname__).
        attributes: Extra attributes merged into every span.
        scorer_config: Optional trace-evaluation configuration.
    """
    if func is None:
        # Called with arguments: return a decorator bound to them.
        return partial(
            self.observe,
            span_type=span_type,
            span_name=span_name,
            attributes=attributes,
            scorer_config=scorer_config,
        )

    if not self.enable_monitoring:
        return func

    # Handle functions (including generator functions) - detect generators at runtime
    name = span_name or getattr(func, "__qualname__", "function")
    func_attributes: Dict[str, Any] = {
        AttributeKeys.JUDGMENT_SPAN_KIND: span_type,
        **(attributes or {}),
    }

    if inspect.iscoroutinefunction(func) or inspect.isasyncgenfunction(func):
        return self._wrap_async(func, name, func_attributes, scorer_config)
    else:
        return self._wrap_sync(func, name, func_attributes, scorer_config)
|
741
|
+
|
742
|
+
@overload
def agent(
    self,
    func: C,
    /,
    *,
    identifier: str | None = None,
    track_state: bool = False,
    track_attributes: List[str] | None = None,
    field_mappings: Dict[str, str] | None = None,
) -> C: ...

@overload
def agent(
    self,
    func: None = None,
    /,
    *,
    identifier: str | None = None,
    track_state: bool = False,
    track_attributes: List[str] | None = None,
    field_mappings: Dict[str, str] | None = None,
) -> Callable[[C], C]: ...

def agent(
    self,
    func: Callable | None = None,
    /,
    *,
    identifier: str | None = None,
    track_state: bool = False,
    track_attributes: List[str] | None = None,
    # Fix: was a mutable default (`= {}`) shared by every decorated
    # function; None is now the sentinel and is normalized below.
    field_mappings: Dict[str, str] | None = None,
) -> Callable | None:
    """
    Agent decorator that creates an agent ID and propagates it to child spans.
    Also captures and propagates the class name if the decorated function is a method.
    Optionally captures instance name based on the specified identifier attribute.

    This decorator should be used in combination with @observe decorator:

    class MyAgent:
        def __init__(self, name):
            self.name = name

        @judgment.agent(identifier="name")
        @judgment.observe(span_type="function")
        def my_agent_method(self):
            # This span and all child spans will have:
            # - agent_id: auto-generated UUID
            # - class_name: "MyAgent"
            # - instance_name: self.name value
            pass

    Args:
        identifier: Name of the instance attribute to use as the instance name
    """
    if func is None:
        return partial(
            self.agent,
            identifier=identifier,
            track_state=track_state,
            track_attributes=track_attributes,
            field_mappings=field_mappings,
        )

    if not self.enable_monitoring:
        return func

    # Normalize the sentinel into a fresh dict per decoration.
    mappings: Dict[str, str] = field_mappings if field_mappings is not None else {}

    # Infer the owning class name from the qualified name ("Class.method").
    class_name = None
    if hasattr(func, "__qualname__") and "." in func.__qualname__:
        parts = func.__qualname__.split(".")
        if len(parts) >= 2:
            class_name = parts[-2]

    if inspect.iscoroutinefunction(func):

        @functools.wraps(func)
        async def async_wrapper(*args, **kwargs):
            async with async_agent_context(
                tracer=self,
                args=args,
                class_name=class_name,
                identifier=identifier,
                track_state=track_state,
                track_attributes=track_attributes,
                field_mappings=mappings,
            ):
                return await func(*args, **kwargs)

        return async_wrapper
    else:

        @functools.wraps(func)
        def sync_wrapper(*args, **kwargs):
            with sync_agent_context(
                tracer=self,
                args=args,
                class_name=class_name,
                identifier=identifier,
                track_state=track_state,
                track_attributes=track_attributes,
                field_mappings=mappings,
            ):
                return func(*args, **kwargs)

        return sync_wrapper
|
849
|
+
|
850
|
+
# Overload: bare decoration form — `@judgment.observe_tools` applied directly
# to a class; returns the class (Cls -> Cls).
# NOTE(review): `exclude_methods=[]` is a mutable default, but @overload bodies
# never execute, so this stub's default is harmless at runtime.
@overload
def observe_tools(
    self,
    cls: Cls,
    /,
    *,
    exclude_methods: List[str] = [],
    include_private: bool = False,
) -> Cls: ...
|
859
|
+
|
860
|
+
# Overload: parenthesized form — `@judgment.observe_tools(...)` called with
# keyword options only; returns a class decorator (Callable[[Cls], Cls]).
# NOTE(review): `exclude_methods=[]` is a mutable default, but @overload bodies
# never execute, so this stub's default is harmless at runtime.
@overload
def observe_tools(
    self,
    cls: None = None,
    /,
    *,
    exclude_methods: List[str] = [],
    include_private: bool = False,
) -> Callable[[Cls], Cls]: ...
|
869
|
+
|
870
|
+
def observe_tools(
    self,
    cls: Cls | None = None,
    /,
    *,
    exclude_methods: List[str] | None = None,
    include_private: bool = False,
) -> Cls | Callable[[Cls], Cls]:
    """Class decorator entry point for observing a class's tool methods.

    Supports both bare (`@observe_tools`) and parenthesized
    (`@observe_tools(...)`) usage.

    Args:
        cls: The class being decorated, or None in the parenthesized form.
        exclude_methods: Method names to skip; defaults to an empty list
            (fix: was a mutable `= []` default shared across calls).
        include_private: Whether underscore-prefixed methods are included.

    Returns:
        The class itself, or a decorator that will receive the class.

    NOTE(review): in this version the decorator returns the class unchanged —
    `exclude_methods` and `include_private` are accepted but never acted on.
    Confirm whether method wrapping was intentionally removed here.
    """
    if cls is None:
        return partial(
            self.observe_tools,
            exclude_methods=exclude_methods,
            include_private=include_private,
        )
    return cls
|
885
|
+
|
886
|
+
def wrap(self, client: ApiClient) -> ApiClient:
    """Return *client* instrumented via ``wrap_provider`` for this tracer."""
    wrapped = wrap_provider(self, client)
    return wrapped
|
888
|
+
|
889
|
+
def force_flush(self, timeout_millis: int = 30000) -> bool:
    """Force flush all pending spans and block until completion.

    Each registered span processor is flushed in turn; a processor that
    raises is logged and counted as a failure rather than aborting the loop.

    Args:
        timeout_millis: Maximum time to wait for flush completion in milliseconds
            (passed to each processor individually).

    Returns:
        True if all processors flushed successfully within timeout, False otherwise
    """
    success = True
    for processor in self.processors:
        try:
            if not processor.force_flush(timeout_millis):
                success = False
        except Exception as e:
            judgeval_logger.warning(f"Error flushing processor {processor}: {e}")
            success = False
    return success
|
908
|
+
|
909
|
+
def _atexit_flush(self) -> None:
    """Internal atexit hook: flush any spans still pending at interpreter exit.

    Blocks (up to 30 s) so spans are exported before the process terminates;
    failures are logged rather than raised, since atexit handlers must not
    propagate exceptions.
    """
    try:
        if not self.force_flush(timeout_millis=30000):
            judgeval_logger.warning(
                "Some spans may not have been exported before program exit"
            )
    except Exception as e:
        judgeval_logger.warning(f"Error during atexit flush: {e}")
|
923
|
+
|
924
|
+
def async_evaluate(
    self,
    /,
    *,
    scorer: Union[APIScorerConfig, BaseScorer],
    example: Example,
    model: str = JUDGMENT_DEFAULT_GPT_MODEL,
    sampling_rate: float = 1.0,
):
    """Queue an asynchronous evaluation of *example* against the current span.

    Validates inputs, applies probabilistic sampling, then dispatches the run
    either to the hosted API queue (APIScorerConfig or server-hosted
    BaseScorer) or to the local evaluation queue.

    Args:
        scorer: An APIScorerConfig or BaseScorer instance.
        example: The Example to evaluate.
        model: Judge model name used for the evaluation.
        sampling_rate: Probability in [0, 1] that this call actually runs.

    Returns:
        None. Invalid inputs or a failed sample are logged and skipped.
    """
    if not self.enable_evaluation or not self.enable_monitoring:
        judgeval_logger.info("Evaluation is not enabled, skipping evaluation")
        return

    if not isinstance(scorer, (APIScorerConfig, BaseScorer)):
        judgeval_logger.error(
            "Scorer must be an instance of APIScorerConfig or BaseScorer, got %s, skipping evaluation."
            % type(scorer)
        )
        return

    if not isinstance(example, Example):
        judgeval_logger.error(
            "Example must be an instance of Example, got %s, skipping evaluation."
            % type(example)
        )
        return

    if sampling_rate < 0 or sampling_rate > 1:
        judgeval_logger.error(
            "Sampling rate must be between 0 and 1, got %s, skipping evaluation."
            % sampling_rate
        )
        return

    percentage = random.uniform(0, 1)
    if percentage > sampling_rate:
        judgeval_logger.info(
            "Sampling rate is %s, skipping evaluation." % sampling_rate
        )
        return

    span_context = self.get_current_span().get_span_context()
    trace_id = format(span_context.trace_id, "032x")
    span_id = format(span_context.span_id, "016x")
    hosted_scoring = isinstance(scorer, APIScorerConfig) or (
        isinstance(scorer, BaseScorer) and scorer.server_hosted
    )
    eval_run_name = f"async_evaluate_{span_id}"  # note this name doesnt matter because we don't save the experiment only the example and scorer_data
    # Fix: the run payload was previously built twice, once per branch, with
    # identical fields. Build it once; only the dispatch target differs.
    eval_run = ExampleEvaluationRun(
        organization_id=self.organization_id,
        project_name=self.project_name,
        eval_name=eval_run_name,
        examples=[example],
        scorers=[scorer],
        model=model,
        trace_span_id=span_id,
        trace_id=trace_id,
    )
    if hosted_scoring:
        self.api_client.add_to_run_eval_queue_examples(
            eval_run.model_dump(warnings=False)
        )  # type: ignore
    else:
        # Custom (non-hosted) scorers run through the local evaluation queue.
        self.local_eval_queue.enqueue(eval_run)
|
1001
|
+
|
1002
|
+
def wait_for_completion(self, timeout: Optional[float] = 30.0) -> bool:
    """Wait for all evaluations and span processing to complete.

    This method blocks until all queued evaluations are processed and
    all pending spans are flushed to the server.

    Args:
        timeout: Maximum time to wait in seconds. Defaults to 30 seconds.
            None means wait indefinitely.

    Returns:
        True if all processing completed within the timeout, False otherwise.

    """
    try:
        judgeval_logger.debug(
            "Waiting for all evaluations and spans to complete..."
        )

        # Wait for all queued evaluation work to complete.
        # NOTE(review): the queue's wait_for_completion() takes no timeout
        # here, so the warning below describes a budget the queue never
        # received — confirm whether the queue API accepts a timeout.
        eval_completed = self.local_eval_queue.wait_for_completion()
        if not eval_completed:
            judgeval_logger.warning(
                f"Local evaluation queue did not complete within {timeout} seconds"
            )
            return False

        # Fix: `timeout` was accepted but never used; honor it for the span
        # flush (force_flush takes milliseconds, None falls back to default).
        if timeout is not None:
            self.force_flush(timeout_millis=int(timeout * 1000))
        else:
            self.force_flush()

        judgeval_logger.debug("All evaluations and spans completed successfully")
        return True

    except Exception as e:
        judgeval_logger.warning(f"Error while waiting for completion: {e}")
        return False
|
1037
|
+
|
1038
|
+
|
1039
|
+
def wrap(client: ApiClient) -> ApiClient:
    """Wrap *client* with every active tracer, warning when none exist.

    When no tracer has been created yet, a JudgmentWarning is emitted and
    the client is returned unmodified.
    """
    active = Tracer._active_tracers
    if not active:
        warn(
            "No active tracers found, client will not be wrapped. "
            "You can use the global `wrap` function after creating a tracer instance. "
            "Or you can use the `wrap` method on the tracer instance to directly wrap the client. ",
            JudgmentWarning,
            stacklevel=2,
        )

    result = client
    for tracer in active:
        result = tracer.wrap(result)
    return result
|
1053
|
+
|
1054
|
+
|
1055
|
+
def format_inputs(
    f: Callable, args: Tuple[Any, ...], kwargs: Dict[str, Any]
) -> Dict[str, Any]:
    """Map the call arguments of *f* onto its parameter names.

    Binds positional args and kwargs to parameter names by inspecting the
    signature of *f*. ``*args``-style parameters receive the remaining
    positional tuple and ``**kwargs``-style parameters receive the whole
    kwargs dict (matching the original behavior, which does not remove keys
    already bound to named parameters).

    Fix: keyword-only and positional-only parameters were previously never
    captured, so their values silently vanished from the recorded inputs.

    Args:
        f: The callable whose signature is inspected.
        args: Positional arguments of the call.
        kwargs: Keyword arguments of the call.

    Returns:
        A dict of parameter name -> bound value; empty on any introspection
        failure (e.g. builtins without a signature).
    """
    try:
        params = inspect.signature(f).parameters.values()
        inputs: Dict[str, Any] = {}
        arg_i = 0
        for param in params:
            if param.kind == inspect.Parameter.POSITIONAL_ONLY:
                # Fix: positional-only params were dropped entirely.
                if arg_i < len(args):
                    inputs[param.name] = args[arg_i]
                    arg_i += 1
            elif param.kind == inspect.Parameter.POSITIONAL_OR_KEYWORD:
                if arg_i < len(args):
                    inputs[param.name] = args[arg_i]
                    arg_i += 1
                elif param.name in kwargs:
                    inputs[param.name] = kwargs[param.name]
            elif param.kind == inspect.Parameter.VAR_POSITIONAL:
                inputs[param.name] = args[arg_i:]
                arg_i = len(args)
            elif param.kind == inspect.Parameter.KEYWORD_ONLY:
                # Fix: keyword-only params were dropped entirely.
                if param.name in kwargs:
                    inputs[param.name] = kwargs[param.name]
            elif param.kind == inspect.Parameter.VAR_KEYWORD:
                inputs[param.name] = kwargs
        return inputs
    except Exception:
        return {}
|