judgeval 0.0.40__py3-none-any.whl → 0.0.42__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
judgeval/rules.py CHANGED
@@ -9,13 +9,13 @@ import asyncio
  from concurrent.futures import ThreadPoolExecutor
  import time
  import uuid
+ import os
+ import re
+ import json
+ from datetime import datetime

  from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
-
- class AlertStatus(str, Enum):
-     """Status of an alert evaluation."""
-     TRIGGERED = "triggered"
-     NOT_TRIGGERED = "not_triggered"
+ from judgeval.utils.alerts import AlertStatus, AlertResult

  class Condition(BaseModel):
      """
@@ -68,6 +68,36 @@ class Condition(BaseModel):
          # Fallback to default comparison (greater than or equal)
          return value >= self.threshold if self.threshold is not None else False

+ class PagerDutyConfig(BaseModel):
+     """
+     Configuration for PagerDuty notifications.
+
+     Attributes:
+         routing_key: PagerDuty integration routing key
+         severity: Severity level (critical, error, warning, info)
+         source: Source of the alert (defaults to "judgeval")
+         component: Optional component that triggered the alert
+         group: Optional logical grouping for the alert
+         class_type: Optional class/type of alert event
+     """
+     routing_key: str
+     severity: str = "error" # critical, error, warning, info
+     source: str = "judgeval"
+     component: Optional[str] = None
+     group: Optional[str] = None
+     class_type: Optional[str] = None
+
+     def model_dump(self, **kwargs):
+         """Convert the PagerDutyConfig to a dictionary for JSON serialization."""
+         return {
+             "routing_key": self.routing_key,
+             "severity": self.severity,
+             "source": self.source,
+             "component": self.component,
+             "group": self.group,
+             "class_type": self.class_type
+         }
+
  class NotificationConfig(BaseModel):
      """
      Configuration for notifications when a rule is triggered.
@@ -75,8 +105,12 @@ class NotificationConfig(BaseModel):
      Example:
          {
              "enabled": true,
-             "communication_methods": ["email", "broadcast_slack", "broadcast_email"],
+             "communication_methods": ["email", "broadcast_slack", "broadcast_email", "pagerduty"],
              "email_addresses": ["user1@example.com", "user2@example.com"],
+             "pagerduty_config": {
+                 "routing_key": "R0ABCD1234567890123456789",
+                 "severity": "error"
+             },
              "send_at": 1632150000 # Unix timestamp (specific date/time)
          }

@@ -84,10 +118,12 @@ class NotificationConfig(BaseModel):
      - "email": Send emails to specified email addresses
      - "broadcast_slack": Send broadcast notifications to all configured Slack channels
      - "broadcast_email": Send broadcast emails to all organization emails
+     - "pagerduty": Send alerts to PagerDuty using the configured routing key
      """
      enabled: bool = True
      communication_methods: List[str] = []
      email_addresses: Optional[List[str]] = None
+     pagerduty_config: Optional[PagerDutyConfig] = None
      send_at: Optional[int] = None # Unix timestamp for scheduled notifications

      def model_dump(self, **kwargs):
@@ -96,6 +132,7 @@ class NotificationConfig(BaseModel):
              "enabled": self.enabled,
              "communication_methods": self.communication_methods,
              "email_addresses": self.email_addresses,
+             "pagerduty_config": self.pagerduty_config.model_dump() if self.pagerduty_config else None,
              "send_at": self.send_at
          }
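For context, a minimal sketch of how the new PagerDuty fields fit together on the consumer side, using only the fields visible in the hunks above (the routing key is a placeholder and the surrounding `Rule` wiring is omitted):

```python
# Sketch: route triggered alerts to PagerDuty via the new pagerduty_config field.
from judgeval.rules import NotificationConfig, PagerDutyConfig

notification = NotificationConfig(
    enabled=True,
    communication_methods=["email", "pagerduty"],
    email_addresses=["oncall@example.com"],
    pagerduty_config=PagerDutyConfig(
        routing_key="<your-pagerduty-routing-key>",  # placeholder, not a real key
        severity="error",
        component="agent-pipeline",  # optional field, shown for illustration only
    ),
)

# model_dump() now nests the PagerDuty settings for the server API.
print(notification.model_dump()["pagerduty_config"])
```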
 
@@ -144,7 +181,8 @@ class Rule(BaseModel):
              # Create standardized metric representation needed by server API
              metric_data = {
                  "score_type": "",
-                 "threshold": 0.0
+                 "threshold": 0.0,
+                 "name": ""
              }

              # First try to use object's own serialization methods
@@ -182,6 +220,16 @@ class Rule(BaseModel):
                  # Use condition threshold if metric doesn't have one
                  metric_data['threshold'] = self.conditions[i].threshold

+             # Make sure name is set
+             if not metric_data.get('name'):
+                 if hasattr(metric_obj, '__name__'):
+                     metric_data['name'] = metric_obj.__name__
+                 elif hasattr(metric_obj, 'name'):
+                     metric_data['name'] = metric_obj.name
+                 else:
+                     # Fallback to score_type if available
+                     metric_data['name'] = metric_data.get('score_type', str(metric_obj))
+
              # Update the condition with our properly serialized metric
              condition["metric"] = metric_data
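Read in isolation, the name-resolution fallback added above behaves like the following simplified sketch, where `metric_obj` stands in for whatever scorer object the condition carries:

```python
def resolve_metric_name(metric_obj, metric_data: dict) -> str:
    """Prefer an explicit name, then __name__, then .name, then score_type."""
    if metric_data.get("name"):
        return metric_data["name"]
    if hasattr(metric_obj, "__name__"):
        return metric_obj.__name__
    if hasattr(metric_obj, "name"):
        return metric_obj.name
    return metric_data.get("score_type", str(metric_obj))
```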
 
@@ -199,47 +247,6 @@ class Rule(BaseModel):
              raise ValueError(f"combine_type must be 'all' or 'any', got: {v}")
          return v

- class AlertResult(BaseModel):
-     """
-     Result of evaluating a rule.
-
-     Example:
-         {
-             "status": "triggered",
-             "rule_name": "Quality Check",
-             "conditions_result": [
-                 {"metric": "faithfulness", "value": 0.6, "threshold": 0.7, "passed": False},
-                 {"metric": "relevancy", "value": 0.9, "threshold": 0.8, "passed": True}
-             ],
-             "rule_id": "123e4567-e89b-12d3-a456-426614174000",
-             "metadata": {
-                 "example_id": "example_123",
-                 "timestamp": "20240321_123456"
-             },
-             "notification": {
-                 "enabled": true,
-                 "communication_methods": ["slack", "email"],
-                 "email_addresses": ["user1@example.com", "user2@example.com"]
-             }
-         }
-     """
-     status: AlertStatus
-     rule_id: Optional[str] = None # The unique identifier of the rule
-     rule_name: str
-     conditions_result: List[Dict[str, Any]]
-     metadata: Dict[str, Any] = {}
-     notification: Optional[NotificationConfig] = None # Configuration for notifications
-
-     @property
-     def example_id(self) -> Optional[str]:
-         """Get example_id from metadata for backward compatibility"""
-         return self.metadata.get("example_id")
-
-     @property
-     def timestamp(self) -> Optional[str]:
-         """Get timestamp from metadata for backward compatibility"""
-         return self.metadata.get("timestamp")
-
  class RulesEngine:
      """
      Engine for creating and evaluating rules against metrics.
@@ -406,7 +413,7 @@ class RulesEngine:
              # If rule has a notification config and the alert is triggered, include it in the result
              notification_config = rule.notification

-             # Set the alert status based on whether the rule was triggered
+             # Set the alert status based on whether the rule was triggered using proper enum values
              status = AlertStatus.TRIGGERED if triggered else AlertStatus.NOT_TRIGGERED

              # Create the alert result
@@ -416,7 +423,10 @@ class RulesEngine:
                  rule_name=rule.name,
                  conditions_result=condition_results,
                  notification=notification_config,
-                 metadata=example_metadata or {}
+                 metadata=example_metadata or {},
+                 combine_type=rule.combine_type,
+                 project_id=example_metadata.get("project_id") if example_metadata else None,
+                 trace_span_id=example_metadata.get("trace_span_id") if example_metadata else None
              )

              results[rule_id] = alert_result
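With `AlertStatus` and `AlertResult` now defined in `judgeval.utils.alerts` and re-imported by `rules.py`, existing imports from `judgeval.rules` should keep resolving, but the canonical location has moved. A small sketch of the updated import:

```python
# New canonical import location for the alert types used by RulesEngine.
from judgeval.utils.alerts import AlertStatus, AlertResult

# The enum values themselves are unchanged by the move.
assert AlertStatus.TRIGGERED.value == "triggered"
assert AlertStatus.NOT_TRIGGERED.value == "not_triggered"
```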
@@ -1,6 +1,7 @@
  import asyncio
  import requests
  import time
+ import json
  import sys
  import itertools
  import threading
@@ -99,9 +100,9 @@ def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
          raise JudgmentAPIError(error_message)
      return response_data

- def execute_api_trace_eval(trace_run: TraceRun) -> List[Dict]:
+ def execute_api_trace_eval(trace_run: TraceRun) -> Dict:
      """
-     Executes an evaluation of a list of `Example`s using one or more `JudgmentScorer`s via the Judgment API.
+     Executes an evaluation of a list of `Trace`s using one or more `JudgmentScorer`s via the Judgment API.
      """

      try:
@@ -145,46 +146,47 @@ def merge_results(api_results: List[ScoringResult], local_results: List[ScoringR
      """
      # No merge required
      if not local_results and api_results:
-         return api_results
+         return [result.model_copy() for result in api_results]
      if not api_results and local_results:
-         return local_results
+         return [result.model_copy() for result in local_results]

      if len(api_results) != len(local_results):
          # Results should be of same length because each ScoringResult is a 1-1 mapping to an Example
          raise ValueError(f"The number of API and local results do not match: {len(api_results)} vs {len(local_results)}")

+     # Create a copy of api_results to avoid modifying the input
+     merged_results = [result.model_copy() for result in api_results]
+
      # Each ScoringResult in api and local have all the same fields besides `scorers_data`
-     for api_result, local_result in zip(api_results, local_results):
-         if not (api_result.data_object and local_result.data_object):
+     for merged_result, local_result in zip(merged_results, local_results):
+         if not (merged_result.data_object and local_result.data_object):
              raise ValueError("Data object is None in one of the results.")
-         if api_result.data_object.input != local_result.data_object.input:
+         if merged_result.data_object.input != local_result.data_object.input:
              raise ValueError("The API and local results are not aligned.")
-         if api_result.data_object.actual_output != local_result.data_object.actual_output:
+         if merged_result.data_object.actual_output != local_result.data_object.actual_output:
              raise ValueError("The API and local results are not aligned.")
-         if api_result.data_object.expected_output != local_result.data_object.expected_output:
+         if merged_result.data_object.expected_output != local_result.data_object.expected_output:
              raise ValueError("The API and local results are not aligned.")
-         if api_result.data_object.context != local_result.data_object.context:
+         if merged_result.data_object.context != local_result.data_object.context:
              raise ValueError("The API and local results are not aligned.")
-         if api_result.data_object.retrieval_context != local_result.data_object.retrieval_context:
+         if merged_result.data_object.retrieval_context != local_result.data_object.retrieval_context:
              raise ValueError("The API and local results are not aligned.")
-         if api_result.data_object.additional_metadata != local_result.data_object.additional_metadata:
+         if merged_result.data_object.additional_metadata != local_result.data_object.additional_metadata:
              raise ValueError("The API and local results are not aligned.")
-         if api_result.data_object.tools_called != local_result.data_object.tools_called:
+         if merged_result.data_object.tools_called != local_result.data_object.tools_called:
              raise ValueError("The API and local results are not aligned.")
-         if api_result.data_object.expected_tools != local_result.data_object.expected_tools:
+         if merged_result.data_object.expected_tools != local_result.data_object.expected_tools:
              raise ValueError("The API and local results are not aligned.")

-
          # Merge ScorerData from the API and local scorers together
-         api_scorer_data = api_result.scorers_data
+         api_scorer_data = merged_result.scorers_data
          local_scorer_data = local_result.scorers_data
          if api_scorer_data is None and local_scorer_data is not None:
-             api_result.scorers_data = local_scorer_data
-
-         if api_scorer_data is not None and local_scorer_data is not None:
-             api_result.scorers_data = api_scorer_data + local_scorer_data
+             merged_result.scorers_data = local_scorer_data
+         elif api_scorer_data is not None and local_scorer_data is not None:
+             merged_result.scorers_data = api_scorer_data + local_scorer_data

-     return api_results
+     return merged_results


  def check_missing_scorer_data(results: List[ScoringResult]) -> List[ScoringResult]:
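The switch to `model_copy()` means `merge_results` now returns fresh `ScoringResult` objects instead of mutating the ones passed in. The difference is the usual Pydantic copy-then-reassign pattern, illustrated here with a toy model rather than judgeval's own classes:

```python
from typing import List, Optional
from pydantic import BaseModel

class Result(BaseModel):
    scorers_data: Optional[List[str]] = None

api_result = Result(scorers_data=["api_scorer"])

merged = api_result.model_copy()                      # copy, don't mutate the input
merged.scorers_data = ["api_scorer", "local_scorer"]  # reassign on the copy only

print(api_result.scorers_data)  # ['api_scorer'] -- caller's object is untouched
print(merged.scorers_data)      # ['api_scorer', 'local_scorer']
```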
@@ -362,14 +364,26 @@ def check_examples(examples: List[Example], scorers: List[Union[APIJudgmentScore
      """
      Checks if the example contains the necessary parameters for the scorer.
      """
+     prompt_user = False
      for scorer in scorers:
          for example in examples:
              missing_params = []
              for param in scorer.required_params:
                  if getattr(example, param.value) is None:
-                     missing_params.append(f"'{param.value}'")
+                     missing_params.append(f"{param.value}")
              if missing_params:
-                 print(f"WARNING: Example {example.example_id} is missing the following parameters: {missing_params} for scorer {scorer.score_type.value}")
+                 rprint(f"[yellow]⚠️ WARNING:[/yellow] Example is missing required parameters for scorer [bold]{scorer.score_type.value}[/bold]")
+                 rprint(f"Missing parameters: {', '.join(missing_params)}")
+                 rprint(f"Example: {json.dumps(example.model_dump(), indent=2)}")
+                 rprint("-"*40)
+                 prompt_user = True
+
+     if prompt_user:
+         user_input = input("Do you want to continue? (y/n)")
+         if user_input.lower() != "y":
+             sys.exit(0)
+         else:
+             rprint("[green]Continuing...[/green]")

  def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: bool = True, function: Optional[Callable] = None, tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None, examples: Optional[List[Example]] = None) -> List[ScoringResult]:
      # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
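In practice, the new prompt fires when an `Example` is missing a field that one of the scorers lists in `required_params`. A hedged sketch of the scenario; the exact required parameters of `FaithfulnessScorer` and the minimal `run_evaluation` arguments are assumptions here:

```python
from judgeval import JudgmentClient
from judgeval.data import Example
from judgeval.scorers import FaithfulnessScorer

client = JudgmentClient()

# No retrieval_context supplied: assuming the scorer declares it as required,
# check_examples() now prints a rich-formatted warning with the offending example
# and asks "Do you want to continue? (y/n)" instead of emitting a plain-text
# WARNING and running anyway.
example = Example(
    input="What if these shoes don't fit?",
    actual_output="We offer a 30-day full refund at no extra cost.",
)

client.run_evaluation(examples=[example], scorers=[FaithfulnessScorer(threshold=0.5)])
```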
@@ -392,8 +406,15 @@ def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: b
      )
      if function and tracer:
          new_traces: List[Trace] = []
-         tracer.offline_mode = True
-         tracer.traces = []
+
+         # Handle case where tracer is actually a callback handler
+         actual_tracer = tracer
+         if hasattr(tracer, 'tracer') and hasattr(tracer.tracer, 'traces'):
+             # This is a callback handler, get the underlying tracer
+             actual_tracer = tracer.tracer
+
+         actual_tracer.offline_mode = True
+         actual_tracer.traces = []
          for example in examples:
              if example.input:
                  if isinstance(example.input, str):
@@ -404,19 +425,21 @@ def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: b
                      raise ValueError(f"Input must be string or dict, got {type(example.input)}")
              else:
                  result = run_with_spinner("Running agent function: ", function)
-         for i, trace in enumerate(tracer.traces):
+
+
+         for i, trace in enumerate(actual_tracer.traces):
              # We set the root-level trace span with the expected tools of the Trace
             trace = Trace(**trace)
-             trace.entries[0].expected_tools = examples[i].expected_tools
+             trace.trace_spans[0].expected_tools = examples[i].expected_tools
              new_traces.append(trace)
          trace_run.traces = new_traces
-         tracer.traces = []
+         actual_tracer.traces = []

      # Execute evaluation using Judgment API
      info("Starting API evaluation")
      try: # execute an EvaluationRun with just JudgmentScorers
          debug("Sending request to Judgment API")
-         response_data: List[Dict] = run_with_spinner("Running Trace Evaluation: ", execute_api_trace_eval, trace_run)
+         response_data: Dict = run_with_spinner("Running Trace Evaluation: ", execute_api_trace_eval, trace_run)
          scoring_results = [ScoringResult(**result) for result in response_data["results"]]
          info(f"Received {len(scoring_results)} results from API")
      except JudgmentAPIError as e:
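The unwrapping logic above exists because `run_trace_eval` accepts either a `Tracer` or a callback handler that wraps one. The same duck-typed probe, pulled out as a standalone sketch (the helper name is illustrative, not part of judgeval):

```python
def unwrap_tracer(tracer_or_handler):
    """Return the underlying tracer, whether given a Tracer directly or a
    callback handler that exposes one via a `.tracer` attribute."""
    if hasattr(tracer_or_handler, "tracer") and hasattr(tracer_or_handler.tracer, "traces"):
        return tracer_or_handler.tracer  # callback handler: use the wrapped tracer
    return tracer_or_handler             # already a tracer
```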
@@ -894,6 +917,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
              f"Processing evaluation '{evaluation_run.eval_name}': "
          )
      else:
+         check_examples(evaluation_run.examples, evaluation_run.scorers)
          if judgment_scorers:
              # Execute evaluation using Judgment API
              info("Starting API evaluation")
@@ -12,7 +12,7 @@ from judgeval.common.logger import debug, info, warning, error
  from judgeval.judges import JudgevalJudge
  from judgeval.judges.utils import create_judge
  from judgeval.constants import UNBOUNDED_SCORERS
-
+ from judgeval.data.example import ExampleParams
  class JudgevalScorer:
      """
      Base class for scorers in `judgeval`.
@@ -39,6 +39,7 @@ class JudgevalScorer:
      evaluation_cost: Optional[float] = None # The cost of running the scorer
      verbose_logs: Optional[str] = None # The verbose logs of the scorer
      additional_metadata: Optional[Dict] = None # Additional metadata for the scorer
+     required_params: Optional[List[ExampleParams]] = None # The required parameters for the scorer
      error: Optional[str] = None
      success: Optional[bool] = None

@@ -51,6 +52,7 @@ class JudgevalScorer:
          reason: Optional[str] = None,
          success: Optional[bool] = None,
          evaluation_model: Optional[str] = None,
+         required_params: Optional[List[ExampleParams]] = None,
          strict_mode: bool = False,
          async_mode: bool = True,
          verbose_mode: bool = True,
@@ -87,6 +89,7 @@ class JudgevalScorer:
          self.evaluation_cost = evaluation_cost
          self.verbose_logs = verbose_logs
          self.additional_metadata = additional_metadata
+         self.required_params = required_params

      def _add_model(self, model: Optional[Union[str, List[str], JudgevalJudge]] = None):
          """
@@ -30,6 +30,7 @@ from typing import List, Optional, Tuple, Any, Mapping
  from pydantic import BaseModel, model_serializer, Field

  from judgeval.data import Example
+ from judgeval.data.example import ExampleParams
  from judgeval.scorers import JudgevalScorer
  from judgeval.scorers.utils import (
      scorer_progress_meter,
@@ -64,6 +65,7 @@ class PromptScorer(JudgevalScorer, BaseModel):
          async_mode: bool = True,
          strict_mode: bool = False,
          verbose_mode: bool = False,
+         required_params: Optional[List[ExampleParams]] = None,
      ):
          # Initialize BaseModel first
          BaseModel.__init__(
@@ -85,6 +87,7 @@ class PromptScorer(JudgevalScorer, BaseModel):
              async_mode=async_mode,
              strict_mode=strict_mode,
              verbose_mode=verbose_mode,
+             required_params=required_params,
          )

      def score_example(
judgeval/utils/alerts.py CHANGED
@@ -20,12 +20,20 @@ class AlertResult(BaseModel):
          status: Status of the alert (triggered or not)
          conditions_result: List of condition evaluation results
          metadata: Dictionary containing example_id, timestamp, and other metadata
+         notification: Optional notification configuration for triggered alerts
+         combine_type: The combination type used ("all" or "any")
+         project_id: Optional project identifier
+         trace_span_id: Optional trace span identifier
      """
      rule_name: str
      rule_id: Optional[str] = None # The unique identifier of the rule
      status: AlertStatus
      conditions_result: List[Dict[str, Any]] = []
      metadata: Dict[str, Any] = {}
+     notification: Optional[Any] = None # NotificationConfig when triggered, None otherwise
+     combine_type: Optional[str] = None # "all" or "any"
+     project_id: Optional[str] = None # Project identifier
+     trace_span_id: Optional[str] = None # Trace span identifier

      @property
      def example_id(self) -> Optional[str]:
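A small sketch of reading the new fields off an evaluated alert; the identifiers are illustrative and the `RulesEngine` wiring that normally populates them is omitted:

```python
from judgeval.utils.alerts import AlertResult, AlertStatus

alert = AlertResult(
    rule_name="Quality Check",
    status=AlertStatus.TRIGGERED,
    combine_type="all",
    project_id="proj_123",       # illustrative identifier
    trace_span_id="span_456",    # illustrative identifier
    metadata={"example_id": "example_123", "timestamp": "20240321_123456"},
)

print(alert.example_id)    # "example_123" (property backed by metadata)
print(alert.combine_type)  # "all"
```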
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: judgeval
- Version: 0.0.40
+ Version: 0.0.42
  Summary: Judgeval Package
  Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
  Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -18,6 +18,7 @@ Requires-Dist: langchain-core
  Requires-Dist: langchain-huggingface
  Requires-Dist: langchain-openai
  Requires-Dist: litellm==1.61.15
+ Requires-Dist: matplotlib>=3.10.3
  Requires-Dist: nest-asyncio
  Requires-Dist: openai
  Requires-Dist: pandas
@@ -31,44 +32,47 @@ Description-Content-Type: text/markdown
  <img src="assets/new_lightmode.svg#gh-light-mode-only" alt="Judgment Logo" width="400" />
  <img src="assets/new_darkmode.svg#gh-dark-mode-only" alt="Judgment Logo" width="400" />

- **Build monitoring & evaluation pipelines for complex agents**
+ <br>
+ <div style="font-size: 1.5em;">
+ Open source tracing, evals, and metrics to debug, test, and monitor LLM agents.
+ </div>

- <img src="assets/experiments_pagev2.png" alt="Judgment Platform Experiments Page" width="800" />
+ ## [Judgment Cloud](https://app.judgmentlabs.ai/register) • [Self-Host](https://docs.judgmentlabs.ai/self-hosting/get_started) [Demo](https://www.youtube.com/watch?v=1S4LixpVbcc)

- <br>
+ [Docs](https://docs.judgmentlabs.ai/introduction) • [Bug Reports](https://github.com/JudgmentLabs/judgeval/issues) • [Changelog](https://docs.judgmentlabs.ai/changelog/2025-04-21)

- ## [🌐 Landing Page](https://www.judgmentlabs.ai/) • [📚 Docs](https://judgment.mintlify.app/getting_started) [🚀 Demos](https://www.youtube.com/@AlexShan-j3o)
+ We're hiring! Join us in our mission to unleash optimized agents.

  [![X](https://img.shields.io/badge/-X/Twitter-000?logo=x&logoColor=white)](https://x.com/JudgmentLabs)
  [![LinkedIn](https://custom-icon-badges.demolab.com/badge/LinkedIn%20-0A66C2?logo=linkedin-white&logoColor=fff)](https://www.linkedin.com/company/judgmentlabs)
- [![Discord](https://img.shields.io/badge/-Discord-5865F2?logo=discord&logoColor=white)](https://discord.gg/FMxHkYTtFE)
+ [![Discord](https://img.shields.io/badge/-Discord-5865F2?logo=discord&logoColor=white)](https://discord.gg/ZCnSXYug)

- </div>
+ <img src="assets/experiments_pagev2.png" alt="Judgment Platform Experiments Page" width="800" />

- ## Judgeval: open-source testing, monitoring, and optimization for AI agents
+ </div>

- Judgeval offers robust tooling for evaluating and tracing LLM agent systems. It is dev-friendly and open-source (licensed under Apache 2.0).

- Judgeval gets you started in five minutes, after which you'll be ready to use all of its features as your agent becomes more complex. Judgeval is natively connected to the [Judgment Platform](https://www.judgmentlabs.ai/) for free and you can export your data and self-host at any time.
+ Judgeval offers **robust open-source tooling** for tracing, evaluating, and monitoring LLM agents. It helps AI teams effectively **test and monitor** agents in development and production, **closing the agent feedback loop**.

- We support tracing agents built with LangGraph, OpenAI SDK, Anthropic, ... and allow custom eval integrations for any use case. Check out our quickstarts below or our [setup guide](https://docs.judgmentlabs.ai/getting-started) to get started.
+ Judgeval can be set up **(cloud-hosted or self-hosted) in 5 minutes**!
+ > 🎁 Generous monthly [free tier](https://judgmentlabs.ai/pricing) (10k traces, 1k evals) - No credit card required!

  Judgeval is created and maintained by [Judgment Labs](https://judgmentlabs.ai/).

  ## 📋 Table of Contents
- * [✨ Features](#-features)
- * [🔍 Tracing](#-tracing)
- * [🧪 Evals](#-evals)
- * [📡 Monitoring](#-monitoring)
- * [📊 Datasets](#-datasets)
- * [💡 Insights](#-insights)
- * [🛠️ Installation](#️-installation)
- * [🏁 Get Started](#-get-started)
- * [🏢 Self-Hosting](#-self-hosting)
- * [📚 Cookbooks](#-cookbooks)
- * [💻 Development with Cursor](#-development-with-cursor)
- * [⭐ Star Us on GitHub](#-star-us-on-github)
- * [❤️ Contributors](#️-contributors)
+ - [✨ Features](#-features)
+ - [🛠️ Installation](#️-installation)
+ - [🏁 Quickstarts](#-quickstarts)
+ - [🛰️ Tracing](#️-tracing)
+ - [📝 Offline Evaluations](#-offline-evaluations)
+ - [📡 Online Evaluations](#-online-evaluations)
+ - [🏢 Self-Hosting](#-self-hosting)
+ - [Key Features](#key-features)
+ - [Getting Started](#getting-started)
+ - [📚 Cookbooks](#-cookbooks)
+ - [💻 Development with Cursor](#-development-with-cursor)
+ - [⭐ Star Us on GitHub](#-star-us-on-github)
+ - [❤️ Contributors](#️-contributors)

  <!-- Created by https://github.com/ekalinin/github-markdown-toc -->

@@ -77,11 +81,10 @@ Judgeval is created and maintained by [Judgment Labs](https://judgmentlabs.ai/).

  | | |
  |:---|:---:|
- | <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic): **tracking inputs/outputs, latency, and cost** at every step.<br><br>Online evals can be applied to traces to measure quality on production data in real-time.<br><br>Export trace data to the Judgment Platform or your own S3 buckets, {Parquet, JSON, YAML} files, or data warehouse.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 👤 Tracking user activity <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/trace_screenshot.png" alt="Tracing visualization" width="1200"/></p> |
- | <h3>🧪 Evals</h3>15+ research-backed metrics including tool call accuracy, hallucinations, instruction adherence, and retrieval context recall.<br><br>Build custom evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 Experimental prompt testing<br>• 🛡️ Online guardrails <br><br> | <p align="center"><img src="assets/experiments_page.png" alt="Evaluation metrics" width="800"/></p> |
- | <h3>📡 Monitoring</h3>Real-time performance tracking of your agents in production environments. **Track all your metrics in one place.**<br><br>Set up **Slack/email alerts** for critical metrics and receive notifications when thresholds are exceeded.<br><br> **Useful for:** <br>•📉 Identifying degradation early <br>•📈 Visualizing performance trends across versions and time | <p align="center"><img src="assets/monitoring_screenshot.png" alt="Monitoring Dashboard" width="1200"/></p> |
- | <h3>📊 Datasets</h3>Export trace data or import external testcases to datasets hosted on Judgment's Platform. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations. <br><br> **Useful for:**<br>• 🔄 Scaled analysis for A/B tests <br>• 🗃️ Filtered collections of agent runtime data| <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |
- | <h3>💡 Insights</h3>Cluster on your data to reveal common use cases and failure modes.<br><br>Trace failures to their exact source with Judgment's Osiris agent, which localizes errors to specific components for precise fixes.<br><br> **Useful for:**<br>•🔮 Surfacing common inputs that lead to error<br>•🤖 Investigating agent/user behavior for optimization <br>| <p align="center"><img src="assets/dataset_clustering_screenshot_dm.png" alt="Insights dashboard" width="1200"/></p> |
+ | <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic): **tracking inputs/outputs, agent tool calls, latency, and cost** at every step.<br><br>Online evals can be applied to traces to measure quality on production data in real-time.<br><br>Export trace data to the Judgment Platform or your own S3 buckets, {Parquet, JSON, YAML} files, or data warehouse.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 👤 Tracking user activity <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/trace_screenshot.png" alt="Tracing visualization" width="1200"/></p> |
+ | <h3>🧪 Evals</h3>Evals are the key to regression testing for agents. Judgeval provides 15+ research-backed metrics including tool call accuracy, hallucinations, instruction adherence, and retrieval context recall.<br><br>Judgeval supports LLM-as-a-judge, manual labeling, and custom evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 Experimental prompt testing<br>• 🛡️ Online guardrails | <p align="center"><img src="assets/experiments_page.png" alt="Evaluation metrics" width="800"/></p> |
+ | <h3>📡 Monitoring</h3>Track all your agent metrics in production. **Catch production regressions early.**<br><br>Configure alerts to trigger automated actions when metric thresholds are exceeded (add agent trace to review queue/dataset, Slack notification, etc.).<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across agent versions and time | <p align="center"><img src="assets/monitoring_screenshot.png" alt="Monitoring Dashboard" width="1200"/></p> |
+ | <h3>📊 Datasets</h3>Export trace data or import external testcases to datasets for scaled unit testing and structured experiments. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations. <br><br> **Useful for:**<br>• 🗃️ Filtered agent runtime data for fine tuning<br>• 🔄 Scaled analysis for A/B tests | <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |

  ## 🛠️ Installation

@@ -91,17 +94,19 @@ Get started with Judgeval by installing our SDK using pip:
  pip install judgeval
  ```

- Ensure you have your `JUDGMENT_API_KEY` and `JUDGMENT_ORG_ID` environment variables set to connect to the [Judgment platform](https://app.judgmentlabs.ai/).
+ Ensure you have your `JUDGMENT_API_KEY` and `JUDGMENT_ORG_ID` environment variables set to connect to the [Judgment platform](https://app.judgmentlabs.ai/).

- **If you don't have keys, [create an account](https://app.judgmentlabs.ai/register) on the platform!**
+ ```bash
+ export JUDGMENT_API_KEY=...
+ export JUDGMENT_ORG_ID=...
+ ```

- ## 🏁 Get Started
+ **If you don't have keys, [create an account](https://app.judgmentlabs.ai/register) on the platform!**

- Here's how you can quickly start using Judgeval:
+ ## 🏁 Quickstarts

  ### 🛰️ Tracing

- Track your agent execution with full observability with just a few lines of code.
  Create a file named `traces.py` with the following code:

  ```python
@@ -126,12 +131,15 @@ def main():

  main()
  ```
+ You'll see your trace exported to the Judgment Platform:
+
+ <p align="center"><img src="assets/trace_demo.png" alt="Judgment Platform Trace Example" width="800" /></p>
+

  [Click here](https://docs.judgmentlabs.ai/getting-started#create-your-first-trace) for a more detailed explanation.

  ### 📝 Offline Evaluations

- You can evaluate your agent's execution to measure quality metrics such as hallucination.
  Create a file named `evaluate.py` with the following code:

  ```python evaluate.py
@@ -147,7 +155,7 @@ example = Example(
      retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
  )

- scorer = FaithfulnessScorer(threshold=0.5)
+ scorer = FaithfulnessScorer(threshold=0.5) # Hallucination detector
  results = client.run_evaluation(
      examples=[example],
      scorers=[scorer],
@@ -196,6 +204,8 @@ def main():

  main()
  ```

+ You should see an evaluation attached to your trace on the Judgment Platform.
+
  [Click here](https://docs.judgmentlabs.ai/getting-started#create-your-first-online-evaluation) for a more detailed explanation.

  ## 🏢 Self-Hosting

@@ -220,20 +230,8 @@ You can access our repo of cookbooks [here](https://github.com/JudgmentLabs/judg

  ### Sample Agents

- #### 💰 [LangGraph Financial QA Agent](https://github.com/JudgmentLabs/judgment-cookbook/blob/main/cookbooks/financial_agent/demo.py)
- A LangGraph-based agent for financial queries, featuring RAG capabilities with a vector database for contextual data retrieval and evaluation of its reasoning and data accuracy.
-
- #### ✈️ [OpenAI Travel Agent](https://github.com/JudgmentLabs/judgment-cookbook/blob/main/cookbooks/openai_travel_agent/agent.py)
- A travel planning agent using OpenAI API calls, custom tool functions, and RAG with a vector database for up-to-date and contextual travel information. Evaluated for itinerary quality and information relevance.
-
- ### Custom Evaluators
-
- #### 🔍 [PII Detection](https://github.com/JudgmentLabs/judgment-cookbook/blob/main/cookbooks/classifier_scorer/pii_checker.py)
- Detecting and evaluating Personal Identifiable Information (PII) leakage.
-
- #### 📧 [Cold Email Generation](https://github.com/JudgmentLabs/judgment-cookbook/blob/main/cookbooks/custom_scorers/cold_email_scorer.py)
-
- Evaluates if a cold email generator properly utilizes all relevant information about the target recipient.
+ #### [Multi-Agent System](https://github.com/JudgmentLabs/judgment-cookbook/tree/main/cookbooks/agents/multi-agent)
+ A multi-agent system augmented with tool calls designed for general purpose tasks like financial research and math. Traced and evaluated on Faithfulness (factual adherence to retrieval context).

  ## 💻 Development with Cursor
  When building agents and LLM workflows in Cursor, providing proper context to your coding assistant helps ensure seamless integration with Judgment. This rule file supplies the essential context your coding assistant needs for successful implementation.