judgeval 0.0.40__py3-none-any.whl → 0.0.42__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/s3_storage.py +3 -1
- judgeval/common/tracer.py +1079 -139
- judgeval/common/utils.py +6 -2
- judgeval/constants.py +5 -0
- judgeval/data/datasets/dataset.py +12 -6
- judgeval/data/datasets/eval_dataset_client.py +3 -1
- judgeval/data/trace.py +7 -2
- judgeval/integrations/langgraph.py +218 -34
- judgeval/judgment_client.py +9 -1
- judgeval/rules.py +60 -50
- judgeval/run_evaluation.py +53 -29
- judgeval/scorers/judgeval_scorer.py +4 -1
- judgeval/scorers/prompt_scorer.py +3 -0
- judgeval/utils/alerts.py +8 -0
- {judgeval-0.0.40.dist-info → judgeval-0.0.42.dist-info}/METADATA +48 -50
- {judgeval-0.0.40.dist-info → judgeval-0.0.42.dist-info}/RECORD +18 -18
- {judgeval-0.0.40.dist-info → judgeval-0.0.42.dist-info}/WHEEL +0 -0
- {judgeval-0.0.40.dist-info → judgeval-0.0.42.dist-info}/licenses/LICENSE.md +0 -0
judgeval/rules.py
CHANGED
@@ -9,13 +9,13 @@ import asyncio
 from concurrent.futures import ThreadPoolExecutor
 import time
 import uuid
+import os
+import re
+import json
+from datetime import datetime
 
 from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
-
-class AlertStatus(str, Enum):
-    """Status of an alert evaluation."""
-    TRIGGERED = "triggered"
-    NOT_TRIGGERED = "not_triggered"
+from judgeval.utils.alerts import AlertStatus, AlertResult
 
 class Condition(BaseModel):
     """
@@ -68,6 +68,36 @@ class Condition(BaseModel):
         # Fallback to default comparison (greater than or equal)
         return value >= self.threshold if self.threshold is not None else False
 
+class PagerDutyConfig(BaseModel):
+    """
+    Configuration for PagerDuty notifications.
+
+    Attributes:
+        routing_key: PagerDuty integration routing key
+        severity: Severity level (critical, error, warning, info)
+        source: Source of the alert (defaults to "judgeval")
+        component: Optional component that triggered the alert
+        group: Optional logical grouping for the alert
+        class_type: Optional class/type of alert event
+    """
+    routing_key: str
+    severity: str = "error" # critical, error, warning, info
+    source: str = "judgeval"
+    component: Optional[str] = None
+    group: Optional[str] = None
+    class_type: Optional[str] = None
+
+    def model_dump(self, **kwargs):
+        """Convert the PagerDutyConfig to a dictionary for JSON serialization."""
+        return {
+            "routing_key": self.routing_key,
+            "severity": self.severity,
+            "source": self.source,
+            "component": self.component,
+            "group": self.group,
+            "class_type": self.class_type
+        }
+
 class NotificationConfig(BaseModel):
     """
     Configuration for notifications when a rule is triggered.
@@ -75,8 +105,12 @@ class NotificationConfig(BaseModel):
     Example:
         {
             "enabled": true,
-            "communication_methods": ["email", "broadcast_slack", "broadcast_email"],
+            "communication_methods": ["email", "broadcast_slack", "broadcast_email", "pagerduty"],
             "email_addresses": ["user1@example.com", "user2@example.com"],
+            "pagerduty_config": {
+                "routing_key": "R0ABCD1234567890123456789",
+                "severity": "error"
+            },
             "send_at": 1632150000 # Unix timestamp (specific date/time)
         }
 
@@ -84,10 +118,12 @@ class NotificationConfig(BaseModel):
     - "email": Send emails to specified email addresses
    - "broadcast_slack": Send broadcast notifications to all configured Slack channels
     - "broadcast_email": Send broadcast emails to all organization emails
+    - "pagerduty": Send alerts to PagerDuty using the configured routing key
     """
     enabled: bool = True
     communication_methods: List[str] = []
     email_addresses: Optional[List[str]] = None
+    pagerduty_config: Optional[PagerDutyConfig] = None
     send_at: Optional[int] = None # Unix timestamp for scheduled notifications
 
     def model_dump(self, **kwargs):
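The new `pagerduty_config` field slots into the existing notification flow. A minimal sketch of wiring it up (the `Rule`/`Condition` constructor arguments and the scorer shown are illustrative assumptions, not part of this diff; only the `PagerDutyConfig` and `NotificationConfig` fields come from the change):

```python
# Illustrative sketch only: Rule/Condition argument names are assumed from this
# module's context; PagerDutyConfig and NotificationConfig fields match the diff.
from judgeval.rules import Rule, Condition, NotificationConfig, PagerDutyConfig
from judgeval.scorers import FaithfulnessScorer

rule = Rule(
    name="Quality Check",
    conditions=[Condition(metric=FaithfulnessScorer(threshold=0.7))],
    combine_type="all",
    notification=NotificationConfig(
        enabled=True,
        communication_methods=["email", "pagerduty"],
        email_addresses=["oncall@example.com"],
        pagerduty_config=PagerDutyConfig(
            routing_key="R0ABCD1234567890123456789",  # PagerDuty integration routing key
            severity="error",                         # critical, error, warning, info
        ),
    ),
)
```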
@@ -96,6 +132,7 @@ class NotificationConfig(BaseModel):
             "enabled": self.enabled,
             "communication_methods": self.communication_methods,
             "email_addresses": self.email_addresses,
+            "pagerduty_config": self.pagerduty_config.model_dump() if self.pagerduty_config else None,
             "send_at": self.send_at
         }
 
@@ -144,7 +181,8 @@ class Rule(BaseModel):
             # Create standardized metric representation needed by server API
             metric_data = {
                 "score_type": "",
-                "threshold": 0.0
+                "threshold": 0.0,
+                "name": ""
             }
 
             # First try to use object's own serialization methods
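With the extra key, each serialized condition now carries a display name next to its score type and threshold; the hunk below shows how that name is back-filled. Illustrative shape of the resulting payload (placeholder values):

```python
# Example of the serialized metric entry after this change (placeholder values).
metric_data = {
    "score_type": "faithfulness",
    "threshold": 0.7,
    "name": "FaithfulnessScorer",  # falls back to score_type or str(metric_obj) if unset
}
```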
@@ -182,6 +220,16 @@ class Rule(BaseModel):
                 # Use condition threshold if metric doesn't have one
                 metric_data['threshold'] = self.conditions[i].threshold
 
+            # Make sure name is set
+            if not metric_data.get('name'):
+                if hasattr(metric_obj, '__name__'):
+                    metric_data['name'] = metric_obj.__name__
+                elif hasattr(metric_obj, 'name'):
+                    metric_data['name'] = metric_obj.name
+                else:
+                    # Fallback to score_type if available
+                    metric_data['name'] = metric_data.get('score_type', str(metric_obj))
+
             # Update the condition with our properly serialized metric
             condition["metric"] = metric_data
 
@@ -199,47 +247,6 @@ class Rule(BaseModel):
             raise ValueError(f"combine_type must be 'all' or 'any', got: {v}")
         return v
 
-class AlertResult(BaseModel):
-    """
-    Result of evaluating a rule.
-
-    Example:
-        {
-            "status": "triggered",
-            "rule_name": "Quality Check",
-            "conditions_result": [
-                {"metric": "faithfulness", "value": 0.6, "threshold": 0.7, "passed": False},
-                {"metric": "relevancy", "value": 0.9, "threshold": 0.8, "passed": True}
-            ],
-            "rule_id": "123e4567-e89b-12d3-a456-426614174000",
-            "metadata": {
-                "example_id": "example_123",
-                "timestamp": "20240321_123456"
-            },
-            "notification": {
-                "enabled": true,
-                "communication_methods": ["slack", "email"],
-                "email_addresses": ["user1@example.com", "user2@example.com"]
-            }
-        }
-    """
-    status: AlertStatus
-    rule_id: Optional[str] = None # The unique identifier of the rule
-    rule_name: str
-    conditions_result: List[Dict[str, Any]]
-    metadata: Dict[str, Any] = {}
-    notification: Optional[NotificationConfig] = None # Configuration for notifications
-
-    @property
-    def example_id(self) -> Optional[str]:
-        """Get example_id from metadata for backward compatibility"""
-        return self.metadata.get("example_id")
-
-    @property
-    def timestamp(self) -> Optional[str]:
-        """Get timestamp from metadata for backward compatibility"""
-        return self.metadata.get("timestamp")
-
 class RulesEngine:
     """
     Engine for creating and evaluating rules against metrics.
@@ -406,7 +413,7 @@ class RulesEngine:
         # If rule has a notification config and the alert is triggered, include it in the result
         notification_config = rule.notification
 
-        # Set the alert status based on whether the rule was triggered
+        # Set the alert status based on whether the rule was triggered using proper enum values
         status = AlertStatus.TRIGGERED if triggered else AlertStatus.NOT_TRIGGERED
 
         # Create the alert result
@@ -416,7 +423,10 @@ class RulesEngine:
             rule_name=rule.name,
             conditions_result=condition_results,
             notification=notification_config,
-            metadata=example_metadata or {}
+            metadata=example_metadata or {},
+            combine_type=rule.combine_type,
+            project_id=example_metadata.get("project_id") if example_metadata else None,
+            trace_span_id=example_metadata.get("trace_span_id") if example_metadata else None
         )
 
         results[rule_id] = alert_result
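Net effect in `rules.py`: alert creation now forwards the rule's `combine_type` plus any `project_id`/`trace_span_id` found in `example_metadata`. A sketch of supplying that metadata; the `RulesEngine` constructor and `evaluate_rules` signature shown here are assumptions for illustration and are not confirmed by this diff:

```python
# Hypothetical call: method name and arguments are assumptions; only the
# example_metadata keys are grounded in the hunk above.
engine = RulesEngine(rules={"quality": rule})
alerts = engine.evaluate_rules(
    scores={"faithfulness": 0.62},
    example_metadata={
        "example_id": "example_123",
        "project_id": "proj_abc",      # copied onto AlertResult.project_id
        "trace_span_id": "span_456",   # copied onto AlertResult.trace_span_id
    },
)
```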
judgeval/run_evaluation.py
CHANGED
@@ -1,6 +1,7 @@
 import asyncio
 import requests
 import time
+import json
 import sys
 import itertools
 import threading
@@ -99,9 +100,9 @@ def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
         raise JudgmentAPIError(error_message)
     return response_data
 
-def execute_api_trace_eval(trace_run: TraceRun) ->
+def execute_api_trace_eval(trace_run: TraceRun) -> Dict:
     """
-    Executes an evaluation of a list of `
+    Executes an evaluation of a list of `Trace`s using one or more `JudgmentScorer`s via the Judgment API.
     """
 
     try:
@@ -145,46 +146,47 @@ def merge_results(api_results: List[ScoringResult], local_results: List[ScoringR
     """
     # No merge required
     if not local_results and api_results:
-        return api_results
+        return [result.model_copy() for result in api_results]
     if not api_results and local_results:
-        return local_results
+        return [result.model_copy() for result in local_results]
 
     if len(api_results) != len(local_results):
         # Results should be of same length because each ScoringResult is a 1-1 mapping to an Example
         raise ValueError(f"The number of API and local results do not match: {len(api_results)} vs {len(local_results)}")
 
+    # Create a copy of api_results to avoid modifying the input
+    merged_results = [result.model_copy() for result in api_results]
+
     # Each ScoringResult in api and local have all the same fields besides `scorers_data`
-    for
-    if not (
+    for merged_result, local_result in zip(merged_results, local_results):
+        if not (merged_result.data_object and local_result.data_object):
            raise ValueError("Data object is None in one of the results.")
-        if
+        if merged_result.data_object.input != local_result.data_object.input:
            raise ValueError("The API and local results are not aligned.")
-        if
+        if merged_result.data_object.actual_output != local_result.data_object.actual_output:
            raise ValueError("The API and local results are not aligned.")
-        if
+        if merged_result.data_object.expected_output != local_result.data_object.expected_output:
            raise ValueError("The API and local results are not aligned.")
-        if
+        if merged_result.data_object.context != local_result.data_object.context:
            raise ValueError("The API and local results are not aligned.")
-        if
+        if merged_result.data_object.retrieval_context != local_result.data_object.retrieval_context:
            raise ValueError("The API and local results are not aligned.")
-        if
+        if merged_result.data_object.additional_metadata != local_result.data_object.additional_metadata:
            raise ValueError("The API and local results are not aligned.")
-        if
+        if merged_result.data_object.tools_called != local_result.data_object.tools_called:
            raise ValueError("The API and local results are not aligned.")
-        if
+        if merged_result.data_object.expected_tools != local_result.data_object.expected_tools:
            raise ValueError("The API and local results are not aligned.")
 
-
         # Merge ScorerData from the API and local scorers together
-        api_scorer_data =
+        api_scorer_data = merged_result.scorers_data
         local_scorer_data = local_result.scorers_data
         if api_scorer_data is None and local_scorer_data is not None:
-
-
-
-            api_result.scorers_data = api_scorer_data + local_scorer_data
+            merged_result.scorers_data = local_scorer_data
+        elif api_scorer_data is not None and local_scorer_data is not None:
+            merged_result.scorers_data = api_scorer_data + local_scorer_data
 
-    return
+    return merged_results
 
 
 def check_missing_scorer_data(results: List[ScoringResult]) -> List[ScoringResult]:
@@ -362,14 +364,26 @@ def check_examples(examples: List[Example], scorers: List[Union[APIJudgmentScore
     """
     Checks if the example contains the necessary parameters for the scorer.
     """
+    prompt_user = False
     for scorer in scorers:
         for example in examples:
             missing_params = []
             for param in scorer.required_params:
                 if getattr(example, param.value) is None:
-                    missing_params.append(f"
+                    missing_params.append(f"{param.value}")
             if missing_params:
-
+                rprint(f"[yellow]⚠️ WARNING:[/yellow] Example is missing required parameters for scorer [bold]{scorer.score_type.value}[/bold]")
+                rprint(f"Missing parameters: {', '.join(missing_params)}")
+                rprint(f"Example: {json.dumps(example.model_dump(), indent=2)}")
+                rprint("-"*40)
+                prompt_user = True
+
+    if prompt_user:
+        user_input = input("Do you want to continue? (y/n)")
+        if user_input.lower() != "y":
+            sys.exit(0)
+        else:
+            rprint("[green]Continuing...[/green]")
 
 def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: bool = True, function: Optional[Callable] = None, tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None, examples: Optional[List[Example]] = None) -> List[ScoringResult]:
     # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
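`check_examples` now warns about every example that is missing one of a scorer's `required_params` and asks whether to continue (it is also called automatically from `run_eval`, see the hunk further down). A minimal sketch of calling it directly; it assumes the chosen scorer declares `required_params`, and the example values are illustrative:

```python
# Sketch: assumes FaithfulnessScorer declares required_params (e.g. retrieval_context).
from judgeval.data import Example
from judgeval.scorers import FaithfulnessScorer
from judgeval.run_evaluation import check_examples

example = Example(
    input="What is your refund policy?",
    actual_output="We offer a 30 day full refund at no extra cost.",
    # retrieval_context intentionally omitted to exercise the new warning path
)
check_examples([example], [FaithfulnessScorer(threshold=0.5)])
```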
@@ -392,8 +406,15 @@ def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: b
         )
     if function and tracer:
         new_traces: List[Trace] = []
-
-        tracer
+
+        # Handle case where tracer is actually a callback handler
+        actual_tracer = tracer
+        if hasattr(tracer, 'tracer') and hasattr(tracer.tracer, 'traces'):
+            # This is a callback handler, get the underlying tracer
+            actual_tracer = tracer.tracer
+
+        actual_tracer.offline_mode = True
+        actual_tracer.traces = []
         for example in examples:
             if example.input:
                 if isinstance(example.input, str):
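Because of the unwrapping above, `run_trace_eval` accepts either a `Tracer` or a framework callback handler that exposes the underlying tracer through a `.tracer` attribute. A sketch of both call styles; the handler class name and the `my_agent`/`trace_run`/`examples` placeholders are assumptions, only the `.tracer`/`.traces` attributes are implied by the diff:

```python
# Both now work as the `tracer` argument (handler name and placeholders assumed).
judgment = Tracer(project_name="agent_tests")
handler = JudgevalCallbackHandler(judgment)  # wraps the tracer and exposes .tracer

run_trace_eval(trace_run, function=my_agent, tracer=judgment, examples=examples)
run_trace_eval(trace_run, function=my_agent, tracer=handler, examples=examples)
```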
@@ -404,19 +425,21 @@ def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: b
                     raise ValueError(f"Input must be string or dict, got {type(example.input)}")
             else:
                 result = run_with_spinner("Running agent function: ", function)
-
+
+
+        for i, trace in enumerate(actual_tracer.traces):
             # We set the root-level trace span with the expected tools of the Trace
             trace = Trace(**trace)
-            trace.
+            trace.trace_spans[0].expected_tools = examples[i].expected_tools
             new_traces.append(trace)
         trace_run.traces = new_traces
-
+        actual_tracer.traces = []
 
     # Execute evaluation using Judgment API
     info("Starting API evaluation")
     try: # execute an EvaluationRun with just JudgmentScorers
         debug("Sending request to Judgment API")
-        response_data:
+        response_data: Dict = run_with_spinner("Running Trace Evaluation: ", execute_api_trace_eval, trace_run)
         scoring_results = [ScoringResult(**result) for result in response_data["results"]]
         info(f"Received {len(scoring_results)} results from API")
     except JudgmentAPIError as e:
@@ -894,6 +917,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
             f"Processing evaluation '{evaluation_run.eval_name}': "
         )
     else:
+        check_examples(evaluation_run.examples, evaluation_run.scorers)
         if judgment_scorers:
             # Execute evaluation using Judgment API
             info("Starting API evaluation")
judgeval/scorers/judgeval_scorer.py
CHANGED
@@ -12,7 +12,7 @@ from judgeval.common.logger import debug, info, warning, error
 from judgeval.judges import JudgevalJudge
 from judgeval.judges.utils import create_judge
 from judgeval.constants import UNBOUNDED_SCORERS
-
+from judgeval.data.example import ExampleParams
 class JudgevalScorer:
     """
     Base class for scorers in `judgeval`.
@@ -39,6 +39,7 @@ class JudgevalScorer:
     evaluation_cost: Optional[float] = None # The cost of running the scorer
     verbose_logs: Optional[str] = None # The verbose logs of the scorer
     additional_metadata: Optional[Dict] = None # Additional metadata for the scorer
+    required_params: Optional[List[ExampleParams]] = None # The required parameters for the scorer
     error: Optional[str] = None
     success: Optional[bool] = None
 
@@ -51,6 +52,7 @@ class JudgevalScorer:
         reason: Optional[str] = None,
         success: Optional[bool] = None,
         evaluation_model: Optional[str] = None,
+        required_params: Optional[List[ExampleParams]] = None,
         strict_mode: bool = False,
         async_mode: bool = True,
         verbose_mode: bool = True,
@@ -87,6 +89,7 @@ class JudgevalScorer:
         self.evaluation_cost = evaluation_cost
         self.verbose_logs = verbose_logs
         self.additional_metadata = additional_metadata
+        self.required_params = required_params
 
     def _add_model(self, model: Optional[Union[str, List[str], JudgevalJudge]] = None):
         """
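Custom scorers can now advertise which `Example` fields they need, which is exactly what the new `check_examples` pre-flight in `run_evaluation.py` reads. A minimal sketch of a custom scorer passing `required_params`; the `ExampleParams` member names and the `score_type`/`threshold` base-class keywords are assumptions beyond what this hunk adds:

```python
# Sketch of a custom scorer declaring required_params (enum members assumed).
from judgeval.scorers import JudgevalScorer
from judgeval.data.example import ExampleParams


class ExactMatchScorer(JudgevalScorer):
    def __init__(self, threshold: float = 1.0):
        super().__init__(
            score_type="exact_match",
            threshold=threshold,
            required_params=[
                ExampleParams.INPUT,
                ExampleParams.ACTUAL_OUTPUT,
                ExampleParams.EXPECTED_OUTPUT,
            ],
        )

    def score_example(self, example, *args, **kwargs):
        # Deterministic metric: exact string match against the expected output.
        self.score = float(example.actual_output == example.expected_output)
        self.success = self.score >= self.threshold
        return self.score
```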
judgeval/scorers/prompt_scorer.py
CHANGED
@@ -30,6 +30,7 @@ from typing import List, Optional, Tuple, Any, Mapping
 from pydantic import BaseModel, model_serializer, Field
 
 from judgeval.data import Example
+from judgeval.data.example import ExampleParams
 from judgeval.scorers import JudgevalScorer
 from judgeval.scorers.utils import (
     scorer_progress_meter,
@@ -64,6 +65,7 @@ class PromptScorer(JudgevalScorer, BaseModel):
         async_mode: bool = True,
         strict_mode: bool = False,
         verbose_mode: bool = False,
+        required_params: Optional[List[ExampleParams]] = None,
     ):
         # Initialize BaseModel first
         BaseModel.__init__(
@@ -85,6 +87,7 @@ class PromptScorer(JudgevalScorer, BaseModel):
             async_mode=async_mode,
             strict_mode=strict_mode,
             verbose_mode=verbose_mode,
+            required_params=required_params,
         )
 
     def score_example(
judgeval/utils/alerts.py
CHANGED
@@ -20,12 +20,20 @@ class AlertResult(BaseModel):
         status: Status of the alert (triggered or not)
         conditions_result: List of condition evaluation results
         metadata: Dictionary containing example_id, timestamp, and other metadata
+        notification: Optional notification configuration for triggered alerts
+        combine_type: The combination type used ("all" or "any")
+        project_id: Optional project identifier
+        trace_span_id: Optional trace span identifier
     """
     rule_name: str
     rule_id: Optional[str] = None # The unique identifier of the rule
     status: AlertStatus
     conditions_result: List[Dict[str, Any]] = []
     metadata: Dict[str, Any] = {}
+    notification: Optional[Any] = None # NotificationConfig when triggered, None otherwise
+    combine_type: Optional[str] = None # "all" or "any"
+    project_id: Optional[str] = None # Project identifier
+    trace_span_id: Optional[str] = None # Trace span identifier
 
     @property
     def example_id(self) -> Optional[str]:
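`AlertResult` now lives in `judgeval.utils.alerts` (re-exported into `rules.py` via the import added there) and carries the new fields. Constructing one directly looks like this; the values are illustrative and mirror the example docstring removed from `rules.py`:

```python
from judgeval.utils.alerts import AlertResult, AlertStatus

alert = AlertResult(
    rule_name="Quality Check",
    rule_id="123e4567-e89b-12d3-a456-426614174000",
    status=AlertStatus.TRIGGERED,
    conditions_result=[
        {"metric": "faithfulness", "value": 0.6, "threshold": 0.7, "passed": False},
    ],
    metadata={"example_id": "example_123", "timestamp": "20240321_123456"},
    combine_type="all",        # new field
    project_id="proj_abc",     # new field
    trace_span_id="span_456",  # new field
)
print(alert.example_id)  # "example_123", via the backward-compatibility property
```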
{judgeval-0.0.40.dist-info → judgeval-0.0.42.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.0.40
+Version: 0.0.42
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -18,6 +18,7 @@ Requires-Dist: langchain-core
 Requires-Dist: langchain-huggingface
 Requires-Dist: langchain-openai
 Requires-Dist: litellm==1.61.15
+Requires-Dist: matplotlib>=3.10.3
 Requires-Dist: nest-asyncio
 Requires-Dist: openai
 Requires-Dist: pandas
@@ -31,44 +32,47 @@ Description-Content-Type: text/markdown
 <img src="assets/new_lightmode.svg#gh-light-mode-only" alt="Judgment Logo" width="400" />
 <img src="assets/new_darkmode.svg#gh-dark-mode-only" alt="Judgment Logo" width="400" />
 
-
+<br>
+<div style="font-size: 1.5em;">
+Open source tracing, evals, and metrics to debug, test, and monitor LLM agents.
+</div>
 
-
+## [Judgment Cloud](https://app.judgmentlabs.ai/register) • [Self-Host](https://docs.judgmentlabs.ai/self-hosting/get_started) • [Demo](https://www.youtube.com/watch?v=1S4LixpVbcc)
 
-
+[Docs](https://docs.judgmentlabs.ai/introduction) • [Bug Reports](https://github.com/JudgmentLabs/judgeval/issues) • [Changelog](https://docs.judgmentlabs.ai/changelog/2025-04-21)
 
-
+We're hiring! Join us in our mission to unleash optimized agents.
 
 [](https://x.com/JudgmentLabs)
 [](https://www.linkedin.com/company/judgmentlabs)
-[](https://discord.gg/
+[](https://discord.gg/ZCnSXYug)
 
-
+<img src="assets/experiments_pagev2.png" alt="Judgment Platform Experiments Page" width="800" />
 
-
+</div>
 
-Judgeval offers robust tooling for evaluating and tracing LLM agent systems. It is dev-friendly and open-source (licensed under Apache 2.0).
 
-Judgeval
+Judgeval offers **robust open-source tooling** for tracing, evaluating, and monitoring LLM agents. It helps AI teams effectively **test and monitor** agents in development and production, **closing the agent feedback loop**.
 
-
+Judgeval can be set up **(cloud-hosted or self-hosted) in 5 minutes**!
+> 🎁 Generous monthly [free tier](https://judgmentlabs.ai/pricing) (10k traces, 1k evals) - No credit card required!
 
 Judgeval is created and maintained by [Judgment Labs](https://judgmentlabs.ai/).
 
 ## 📋 Table of Contents
-
-
-
-
-
-
-
-
-
-
-
-
-
+- [✨ Features](#-features)
+- [🛠️ Installation](#️-installation)
+- [🏁 Quickstarts](#-quickstarts)
+- [🛰️ Tracing](#️-tracing)
+- [📝 Offline Evaluations](#-offline-evaluations)
+- [📡 Online Evaluations](#-online-evaluations)
+- [🏢 Self-Hosting](#-self-hosting)
+- [Key Features](#key-features)
+- [Getting Started](#getting-started)
+- [📚 Cookbooks](#-cookbooks)
+- [💻 Development with Cursor](#-development-with-cursor)
+- [⭐ Star Us on GitHub](#-star-us-on-github)
+- [❤️ Contributors](#️-contributors)
 
 <!-- Created by https://github.com/ekalinin/github-markdown-toc -->
 
@@ -77,11 +81,10 @@ Judgeval is created and maintained by [Judgment Labs](https://judgmentlabs.ai/).
 
 | | |
 |:---|:---:|
-| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic): **tracking inputs/outputs, latency, and cost** at every step.<br><br>Online evals can be applied to traces to measure quality on production data in real-time.<br><br>Export trace data to the Judgment Platform or your own S3 buckets, {Parquet, JSON, YAML} files, or data warehouse.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 👤 Tracking user activity <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/trace_screenshot.png" alt="Tracing visualization" width="1200"/></p> |
-| <h3>🧪 Evals</h3>15+ research-backed metrics including tool call accuracy, hallucinations, instruction adherence, and retrieval context recall.<br><br>
-| <h3>📡 Monitoring</h3>
-| <h3>📊 Datasets</h3>Export trace data or import external testcases to datasets
-| <h3>💡 Insights</h3>Cluster on your data to reveal common use cases and failure modes.<br><br>Trace failures to their exact source with Judgment's Osiris agent, which localizes errors to specific components for precise fixes.<br><br> **Useful for:**<br>•🔮 Surfacing common inputs that lead to error<br>•🤖 Investigating agent/user behavior for optimization <br>| <p align="center"><img src="assets/dataset_clustering_screenshot_dm.png" alt="Insights dashboard" width="1200"/></p> |
+| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic): **tracking inputs/outputs, agent tool calls, latency, and cost** at every step.<br><br>Online evals can be applied to traces to measure quality on production data in real-time.<br><br>Export trace data to the Judgment Platform or your own S3 buckets, {Parquet, JSON, YAML} files, or data warehouse.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 👤 Tracking user activity <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/trace_screenshot.png" alt="Tracing visualization" width="1200"/></p> |
+| <h3>🧪 Evals</h3>Evals are the key to regression testing for agents. Judgeval provides 15+ research-backed metrics including tool call accuracy, hallucinations, instruction adherence, and retrieval context recall.<br><br>Judgeval supports LLM-as-a-judge, manual labeling, and custom evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 Experimental prompt testing<br>• 🛡️ Online guardrails | <p align="center"><img src="assets/experiments_page.png" alt="Evaluation metrics" width="800"/></p> |
+| <h3>📡 Monitoring</h3>Track all your agent metrics in production. **Catch production regressions early.**<br><br>Configure alerts to trigger automated actions when metric thresholds are exceeded (add agent trace to review queue/dataset, Slack notification, etc.).<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across agent versions and time | <p align="center"><img src="assets/monitoring_screenshot.png" alt="Monitoring Dashboard" width="1200"/></p> |
+| <h3>📊 Datasets</h3>Export trace data or import external testcases to datasets for scaled unit testing and structured experiments. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations. <br><br> **Useful for:**<br>• 🗃️ Filtered agent runtime data for fine tuning<br>• 🔄 Scaled analysis for A/B tests | <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |
 
 ## 🛠️ Installation
 
@@ -91,17 +94,19 @@ Get started with Judgeval by installing our SDK using pip:
 pip install judgeval
 ```
 
-Ensure you have your `JUDGMENT_API_KEY` and `JUDGMENT_ORG_ID` environment variables set to connect to the [Judgment platform](https://app.judgmentlabs.ai/).
+Ensure you have your `JUDGMENT_API_KEY` and `JUDGMENT_ORG_ID` environment variables set to connect to the [Judgment platform](https://app.judgmentlabs.ai/).
 
-
+```bash
+export JUDGMENT_API_KEY=...
+export JUDGMENT_ORG_ID=...
+```
 
-
+**If you don't have keys, [create an account](https://app.judgmentlabs.ai/register) on the platform!**
 
-
+## 🏁 Quickstarts
 
 ### 🛰️ Tracing
 
-Track your agent execution with full observability with just a few lines of code.
 Create a file named `traces.py` with the following code:
 
 ```python
@@ -126,12 +131,15 @@ def main():
 
 main()
 ```
+You'll see your trace exported to the Judgment Platform:
+
+<p align="center"><img src="assets/trace_demo.png" alt="Judgment Platform Trace Example" width="800" /></p>
+
 
 [Click here](https://docs.judgmentlabs.ai/getting-started#create-your-first-trace) for a more detailed explanation.
 
 ### 📝 Offline Evaluations
 
-You can evaluate your agent's execution to measure quality metrics such as hallucination.
 Create a file named `evaluate.py` with the following code:
 
 ```python evaluate.py
@@ -147,7 +155,7 @@ example = Example(
     retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
 )
 
-scorer = FaithfulnessScorer(threshold=0.5)
+scorer = FaithfulnessScorer(threshold=0.5) # Hallucination detector
 results = client.run_evaluation(
     examples=[example],
     scorers=[scorer],
@@ -196,6 +204,8 @@ def main():
 main()
 ```
 
+You should see an evaluation attached to your trace on the Judgment Platform.
+
 [Click here](https://docs.judgmentlabs.ai/getting-started#create-your-first-online-evaluation) for a more detailed explanation.
 
 ## 🏢 Self-Hosting
@@ -220,20 +230,8 @@ You can access our repo of cookbooks [here](https://github.com/JudgmentLabs/judg
 
 ### Sample Agents
 
-####
-A
-
-#### ✈️ [OpenAI Travel Agent](https://github.com/JudgmentLabs/judgment-cookbook/blob/main/cookbooks/openai_travel_agent/agent.py)
-A travel planning agent using OpenAI API calls, custom tool functions, and RAG with a vector database for up-to-date and contextual travel information. Evaluated for itinerary quality and information relevance.
-
-### Custom Evaluators
-
-#### 🔍 [PII Detection](https://github.com/JudgmentLabs/judgment-cookbook/blob/main/cookbooks/classifier_scorer/pii_checker.py)
-Detecting and evaluating Personal Identifiable Information (PII) leakage.
-
-#### 📧 [Cold Email Generation](https://github.com/JudgmentLabs/judgment-cookbook/blob/main/cookbooks/custom_scorers/cold_email_scorer.py)
-
-Evaluates if a cold email generator properly utilizes all relevant information about the target recipient.
+#### [Multi-Agent System](https://github.com/JudgmentLabs/judgment-cookbook/tree/main/cookbooks/agents/multi-agent)
+A multi-agent system augmented with tool calls designed for general purpose tasks like financial research and math. Traced and evaluated on Faithfulness (factual adherence to retrieval context).
 
 ## 💻 Development with Cursor
 When building agents and LLM workflows in Cursor, providing proper context to your coding assistant helps ensure seamless integration with Judgment. This rule file supplies the essential context your coding assistant needs for successful implementation.