judgeval 0.7.1__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff shows the content of publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
- judgeval/__init__.py +139 -12
- judgeval/api/__init__.py +501 -0
- judgeval/api/api_types.py +344 -0
- judgeval/cli.py +2 -4
- judgeval/constants.py +10 -26
- judgeval/data/evaluation_run.py +49 -26
- judgeval/data/example.py +2 -2
- judgeval/data/judgment_types.py +266 -82
- judgeval/data/result.py +4 -5
- judgeval/data/scorer_data.py +4 -2
- judgeval/data/tool.py +2 -2
- judgeval/data/trace.py +7 -50
- judgeval/data/trace_run.py +7 -4
- judgeval/{dataset.py → dataset/__init__.py} +43 -28
- judgeval/env.py +67 -0
- judgeval/{run_evaluation.py → evaluation/__init__.py} +29 -95
- judgeval/exceptions.py +27 -0
- judgeval/integrations/langgraph/__init__.py +788 -0
- judgeval/judges/__init__.py +2 -2
- judgeval/judges/litellm_judge.py +75 -15
- judgeval/judges/together_judge.py +86 -18
- judgeval/judges/utils.py +7 -21
- judgeval/{common/logger.py → logger.py} +8 -6
- judgeval/scorers/__init__.py +0 -4
- judgeval/scorers/agent_scorer.py +3 -7
- judgeval/scorers/api_scorer.py +8 -13
- judgeval/scorers/base_scorer.py +52 -32
- judgeval/scorers/example_scorer.py +1 -3
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -14
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +45 -20
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +3 -3
- judgeval/scorers/score.py +21 -31
- judgeval/scorers/trace_api_scorer.py +5 -0
- judgeval/scorers/utils.py +1 -103
- judgeval/tracer/__init__.py +1075 -2
- judgeval/tracer/constants.py +1 -0
- judgeval/tracer/exporters/__init__.py +37 -0
- judgeval/tracer/exporters/s3.py +119 -0
- judgeval/tracer/exporters/store.py +43 -0
- judgeval/tracer/exporters/utils.py +32 -0
- judgeval/tracer/keys.py +67 -0
- judgeval/tracer/llm/__init__.py +1233 -0
- judgeval/{common/tracer → tracer/llm}/providers.py +5 -10
- judgeval/{local_eval_queue.py → tracer/local_eval_queue.py} +15 -10
- judgeval/tracer/managers.py +188 -0
- judgeval/tracer/processors/__init__.py +181 -0
- judgeval/tracer/utils.py +20 -0
- judgeval/trainer/__init__.py +5 -0
- judgeval/{common/trainer → trainer}/config.py +12 -9
- judgeval/{common/trainer → trainer}/console.py +2 -9
- judgeval/{common/trainer → trainer}/trainable_model.py +12 -7
- judgeval/{common/trainer → trainer}/trainer.py +119 -17
- judgeval/utils/async_utils.py +2 -3
- judgeval/utils/decorators.py +24 -0
- judgeval/utils/file_utils.py +37 -4
- judgeval/utils/guards.py +32 -0
- judgeval/utils/meta.py +14 -0
- judgeval/{common/api/json_encoder.py → utils/serialize.py} +7 -1
- judgeval/utils/testing.py +88 -0
- judgeval/utils/url.py +10 -0
- judgeval/{version_check.py → utils/version_check.py} +3 -3
- judgeval/version.py +5 -0
- judgeval/warnings.py +4 -0
- {judgeval-0.7.1.dist-info → judgeval-0.9.0.dist-info}/METADATA +12 -14
- judgeval-0.9.0.dist-info/RECORD +80 -0
- judgeval/clients.py +0 -35
- judgeval/common/__init__.py +0 -13
- judgeval/common/api/__init__.py +0 -3
- judgeval/common/api/api.py +0 -375
- judgeval/common/api/constants.py +0 -186
- judgeval/common/exceptions.py +0 -27
- judgeval/common/storage/__init__.py +0 -6
- judgeval/common/storage/s3_storage.py +0 -97
- judgeval/common/tracer/__init__.py +0 -31
- judgeval/common/tracer/constants.py +0 -22
- judgeval/common/tracer/core.py +0 -2427
- judgeval/common/tracer/otel_exporter.py +0 -108
- judgeval/common/tracer/otel_span_processor.py +0 -188
- judgeval/common/tracer/span_processor.py +0 -37
- judgeval/common/tracer/span_transformer.py +0 -207
- judgeval/common/tracer/trace_manager.py +0 -101
- judgeval/common/trainer/__init__.py +0 -5
- judgeval/common/utils.py +0 -948
- judgeval/integrations/langgraph.py +0 -844
- judgeval/judges/mixture_of_judges.py +0 -287
- judgeval/judgment_client.py +0 -267
- judgeval/rules.py +0 -521
- judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
- judgeval/utils/alerts.py +0 -93
- judgeval/utils/requests.py +0 -50
- judgeval-0.7.1.dist-info/RECORD +0 -82
- {judgeval-0.7.1.dist-info → judgeval-0.9.0.dist-info}/WHEEL +0 -0
- {judgeval-0.7.1.dist-info → judgeval-0.9.0.dist-info}/entry_points.txt +0 -0
- {judgeval-0.7.1.dist-info → judgeval-0.9.0.dist-info}/licenses/LICENSE.md +0 -0
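Several modules moved out of the judgeval.common namespace in this release (tracer, trainer, logger), so downstream imports need updating. A minimal sketch of the import change, assuming the Tracer entry point keeps its 0.7.1 public name across the move:

# 0.7.1 layout (judgeval/common/tracer/__init__.py, deleted in 0.9.0):
# from judgeval.common.tracer import Tracer
# 0.9.0 layout (judgeval/tracer/__init__.py, +1075 lines):
from judgeval.tracer import Tracer  # class name assumed unchanged across the move

tracer = Tracer(project_name="my-project")  # constructor arguments assumed from 0.7.1 usage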
judgeval/rules.py
DELETED
@@ -1,521 +0,0 @@
"""
Rules system for Judgeval that enables alerts based on metric thresholds.
"""

from typing import Dict, List, Optional, Union, Any, Tuple
from pydantic import BaseModel, Field, ConfigDict
import asyncio
from concurrent.futures import ThreadPoolExecutor
import uuid

from judgeval.scorers import APIScorerConfig, BaseScorer
from judgeval.utils.alerts import AlertStatus, AlertResult


class Condition(BaseModel):
    """
    A single metric condition.

    Example:
    {
        "metric": FaithfulnessScorer(threshold=0.7)  # Must be a scorer object: APIScorerConfig, BaseScorer
    }

    The Condition class uses the scorer's threshold and success function internally.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    metric: Union[APIScorerConfig, BaseScorer]

    @property
    def metric_name(self) -> str:
        """Get the name of the metric for lookups in scores dictionary."""
        if hasattr(self.metric, "score_type"):
            # Handle APIScorerConfig and BaseScorer which have score_type
            return self.metric.score_type
        elif hasattr(self.metric, "__name__"):
            # Handle cases where metric has a __name__ attribute
            return self.metric.__name__
        # Fallback to string representation
        return str(self.metric)

    @property
    def threshold(self) -> float:
        """Get the threshold from the metric."""
        return self.metric.threshold if hasattr(self.metric, "threshold") else 0.5

    def evaluate(self, value: float) -> bool:
        """
        Evaluate the condition against a value.
        Returns True if the condition passes, False otherwise.
        Uses the scorer's success check function if available.
        """
        # Store the value in the scorer
        if hasattr(self.metric, "score"):
            self.metric.score = value

        # Use the scorer's success check function if available
        if hasattr(self.metric, "success_check"):
            return self.metric.success_check()
        elif hasattr(self.metric, "success_check"):
            return self.metric.success_check()
        else:
            # Fallback to default comparison (greater than or equal)
            return value >= self.threshold if self.threshold is not None else False


class PagerDutyConfig(BaseModel):
    """
    Configuration for PagerDuty notifications.

    Attributes:
        routing_key: PagerDuty integration routing key
        severity: Severity level (critical, error, warning, info)
        source: Source of the alert (defaults to "judgeval")
        component: Optional component that triggered the alert
        group: Optional logical grouping for the alert
        class_type: Optional class/type of alert event
    """

    routing_key: str
    severity: str = "error"  # critical, error, warning, info
    source: str = "judgeval"
    component: Optional[str] = None
    group: Optional[str] = None
    class_type: Optional[str] = None

    def model_dump(self, **kwargs):
        """Convert the PagerDutyConfig to a dictionary for JSON serialization."""
        return {
            "routing_key": self.routing_key,
            "severity": self.severity,
            "source": self.source,
            "component": self.component,
            "group": self.group,
            "class_type": self.class_type,
        }


class NotificationConfig(BaseModel):
    """
    Configuration for notifications when a rule is triggered.

    Example:
    {
        "enabled": true,
        "communication_methods": ["email", "broadcast_slack", "broadcast_email", "pagerduty"],
        "email_addresses": ["user1@example.com", "user2@example.com"],
        "pagerduty_config": {
            "routing_key": "R0ABCD1234567890123456789",
            "severity": "error"
        },
        "send_at": 1632150000  # Unix timestamp (specific date/time)
    }

    Communication Methods:
    - "email": Send emails to specified email addresses
    - "broadcast_slack": Send broadcast notifications to all configured Slack channels
    - "broadcast_email": Send broadcast emails to all organization emails
    - "pagerduty": Send alerts to PagerDuty using the configured routing key
    """

    enabled: bool = True
    communication_methods: List[str] = []
    email_addresses: Optional[List[str]] = None
    pagerduty_config: Optional[PagerDutyConfig] = None
    send_at: Optional[int] = None  # Unix timestamp for scheduled notifications

    def model_dump(self, **kwargs):
        """Convert the NotificationConfig to a dictionary for JSON serialization."""
        return {
            "enabled": self.enabled,
            "communication_methods": self.communication_methods,
            "email_addresses": self.email_addresses,
            "pagerduty_config": self.pagerduty_config.model_dump()
            if self.pagerduty_config
            else None,
            "send_at": self.send_at,
        }


class Rule(BaseModel):
    """
    Configuration for a single rule.

    Example:
    {
        "rule_id": "123e4567-e89b-12d3-a456-426614174000",
        "name": "Quality Check",
        "description": "Check if quality metrics meet thresholds",
        "conditions": [
            {"metric": FaithfulnessScorer(threshold=0.7)},
            {"metric": AnswerRelevancyScorer(threshold=0.8)}
        ],
        "combine_type": "all",  # "all" or "any"
        "notification": {
            "enabled": true,
            "communication_methods": ["slack", "email"],
            "email_addresses": ["user1@example.com", "user2@example.com"]
        }
    }
    """

    rule_id: str = Field(
        default_factory=lambda: str(uuid.uuid4())
    )  # Random UUID string as default value
    name: str
    description: Optional[str] = None
    conditions: List[Condition]
    combine_type: str = Field(..., pattern="^(all|any)$")  # all = AND, any = OR
    notification: Optional[NotificationConfig] = None  # Configuration for notifications

    def model_dump(self, **kwargs):
        """
        Custom serialization that properly handles condition serialization.
        """
        data = super().model_dump(**kwargs)

        # Special handling for conditions with complex metric objects
        if "conditions" in data:
            for i, condition in enumerate(data["conditions"]):
                if "metric" in condition:
                    # Get the actual metric object
                    metric_obj = self.conditions[i].metric

                    # Create standardized metric representation needed by server API
                    metric_data = {"score_type": "", "threshold": 0.0, "name": ""}

                    # First try to use object's own serialization methods
                    if hasattr(metric_obj, "to_dict"):
                        orig_data = metric_obj.to_dict()
                        # Copy any existing fields
                        for key, value in orig_data.items():
                            metric_data[key] = value
                    elif hasattr(metric_obj, "model_dump"):
                        orig_data = metric_obj.model_dump()
                        # Copy any existing fields
                        for key, value in orig_data.items():
                            metric_data[key] = value

                    # If we already have data from original serialization methods but missing required fields
                    if "name" in metric_data and "score_type" not in metric_data:
                        metric_data["score_type"] = metric_data["name"]

                    # Ensure required fields have values by checking various sources
                    if not metric_data["score_type"]:
                        # Try to get score_type from different possible attributes
                        if hasattr(metric_obj, "score_type"):
                            metric_data["score_type"] = metric_obj.score_type
                        elif hasattr(metric_obj, "name"):
                            metric_data["score_type"] = metric_obj.name
                        else:
                            # Last resort: use string representation
                            metric_data["score_type"] = str(metric_obj)

                    # Make sure threshold is set
                    if (
                        not metric_data.get("threshold")
                        and metric_data.get("threshold") != 0.0
                    ):
                        if hasattr(metric_obj, "threshold"):
                            metric_data["threshold"] = metric_obj.threshold
                        else:
                            # Use condition threshold if metric doesn't have one
                            metric_data["threshold"] = self.conditions[i].threshold

                    # Make sure name is set
                    if not metric_data.get("name"):
                        if hasattr(metric_obj, "__name__"):
                            metric_data["name"] = metric_obj.__name__
                        elif hasattr(metric_obj, "name"):
                            metric_data["name"] = metric_obj.name
                        else:
                            # Fallback to score_type if available
                            metric_data["name"] = metric_data.get(
                                "score_type", str(metric_obj)
                            )

                    # Update the condition with our properly serialized metric
                    condition["metric"] = metric_data

        return data


class RulesEngine:
    """
    Engine for creating and evaluating rules against metrics.

    Example:
    ```python
    # Define rules
    rules = {
        "1": Rule(
            name="Quality Check",
            description="Check if quality metrics meet thresholds",
            conditions=[
                Condition(metric=FaithfulnessScorer(threshold=0.7)),
                Condition(metric=AnswerRelevancyScorer(threshold=0.8))
            ],
            combine_type="all"
        )
    }

    # Create rules engine
    engine = RulesEngine(rules)

    # Configure notifications
    engine.configure_notification(
        rule_id="1",
        enabled=True,
        communication_methods=["slack", "email"],
        email_addresses=["user@example.com"]
    )

    # Evaluate rules
    scores = {"faithfulness": 0.65, "relevancy": 0.85}
    results = engine.evaluate_rules(scores, {"example_id": "example_123"})
    ```
    """

    def __init__(self, rules: Dict[str, Rule]):
        """
        Initialize the rules engine.

        Args:
            rules: Dictionary mapping rule IDs to Rule objects
        """
        self.rules = rules

    def configure_notification(
        self,
        rule_id: str,
        enabled: bool = True,
        communication_methods: List[str] | None = None,
        email_addresses: List[str] | None = None,
        send_at: Optional[int] = None,
    ) -> None:
        """
        Configure notification settings for a specific rule.

        Args:
            rule_id: ID of the rule to configure notifications for
            enabled: Whether notifications are enabled for this rule
            communication_methods: List of notification methods (e.g., ["slack", "email"])
            email_addresses: List of email addresses to send notifications to
            send_at: Optional Unix timestamp for when to send the notification
        """
        if rule_id not in self.rules:
            raise ValueError(f"Rule ID '{rule_id}' not found")

        rule = self.rules[rule_id]

        # Create notification configuration if it doesn't exist
        if rule.notification is None:
            rule.notification = NotificationConfig()

        # Set notification parameters
        rule.notification.enabled = enabled

        if communication_methods is not None:
            rule.notification.communication_methods = communication_methods

        if email_addresses is not None:
            rule.notification.email_addresses = email_addresses

        if send_at is not None:
            rule.notification.send_at = send_at

    def configure_all_notifications(
        self,
        enabled: bool = True,
        communication_methods: List[str] | None = None,
        email_addresses: List[str] | None = None,
        send_at: Optional[int] = None,
    ) -> None:
        """
        Configure notification settings for all rules.

        Args:
            enabled: Whether notifications are enabled
            communication_methods: List of notification methods (e.g., ["slack", "email"])
            email_addresses: List of email addresses to send notifications to
            send_at: Optional Unix timestamp for when to send the notification
        """
        for rule_id, rule in self.rules.items():
            self.configure_notification(
                rule_id=rule_id,
                enabled=enabled,
                communication_methods=communication_methods,
                email_addresses=email_addresses,
                send_at=send_at,
            )

    def evaluate_rules(
        self,
        scores: Dict[str, float],
        example_metadata: Optional[Dict[str, Any]] = None,
    ) -> Dict[str, AlertResult]:
        """
        Evaluate all rules against a set of scores.
        Returns mapping of rule IDs to their alert results.

        Args:
            scores: Dictionary of metric names to their score values
            example_metadata: Optional dictionary containing example metadata (example_id, timestamp)
        """
        results = {}

        for rule_id, rule in self.rules.items():
            # Evaluate each condition
            condition_results = []
            passed_conditions = []

            for condition in rule.conditions:
                # Get the metric name for lookup
                metric_name = condition.metric_name
                value = scores.get(metric_name)

                if value is None:
                    # Skip this condition instead of evaluating it as false
                    condition_results.append(
                        {
                            "metric": metric_name,
                            "value": None,
                            "threshold": condition.threshold,
                            "passed": None,  # Using None to indicate the condition was skipped
                            "skipped": True,  # Add a flag to indicate this condition was skipped
                        }
                    )
                    continue  # Skip adding to passed_conditions
                else:
                    passed = condition.evaluate(value)
                    condition_results.append(
                        {
                            "metric": metric_name,
                            "value": value,
                            "threshold": condition.threshold,
                            "passed": passed,
                            "skipped": False,  # Indicate this condition was evaluated
                        }
                    )
                    passed_conditions.append(passed)

            # Determine if alert should trigger - only consider conditions that weren't skipped
            if not passed_conditions:
                # If all conditions were skipped, the rule doesn't trigger
                triggered = False
            else:
                if rule.combine_type == "all":
                    # For "all" combine_type:
                    # - All evaluated conditions must pass
                    # - All conditions must have been evaluated (none skipped)
                    all_conditions_passed = all(passed_conditions)
                    all_conditions_evaluated = len(passed_conditions) == len(
                        rule.conditions
                    )
                    triggered = all_conditions_passed and all_conditions_evaluated
                else:
                    # For "any" combine_type, at least one condition must pass
                    triggered = any(passed_conditions)

            # Create alert result with example metadata
            notification_config = None
            if triggered and rule.notification:
                # If rule has a notification config and the alert is triggered, include it in the result
                notification_config = rule.notification

            # Set the alert status based on whether the rule was triggered using proper enum values
            status = AlertStatus.TRIGGERED if triggered else AlertStatus.NOT_TRIGGERED

            # Create the alert result
            alert_result = AlertResult(
                status=status,
                rule_id=rule.rule_id,
                rule_name=rule.name,
                conditions_result=condition_results,
                notification=notification_config,
                metadata=example_metadata or {},
                combine_type=rule.combine_type,
                project_id=example_metadata.get("project_id")
                if example_metadata
                else None,
                trace_span_id=example_metadata.get("trace_span_id")
                if example_metadata
                else None,
            )

            results[rule_id] = alert_result

        return results

    async def evaluate_rules_parallel(
        self,
        example_scores: Dict[str, Dict[str, float]],
        example_metadata: Dict[str, Dict[str, Any]],
        max_concurrent: int = 100,
    ) -> Dict[str, Dict[str, AlertResult]]:
        """
        Evaluate all rules against multiple examples in parallel.

        Args:
            example_scores: Dictionary mapping example_ids to their score dictionaries
            example_metadata: Dictionary mapping example_ids to their metadata
            max_concurrent: Maximum number of concurrent evaluations

        Returns:
            Dictionary mapping example_ids to dictionaries of rule_ids and their alert results
        """
        # Create semaphore to limit concurrent executions
        semaphore = asyncio.Semaphore(max_concurrent)
        results = {}
        tasks = []

        # Create a task for each example
        for example_id, scores in example_scores.items():
            metadata = example_metadata.get(example_id, {})
            task = self._evaluate_with_semaphore(
                semaphore=semaphore,
                example_id=example_id,
                scores=scores,
                metadata=metadata,
            )
            tasks.append(task)

        # Run all tasks and collect results
        example_results = await asyncio.gather(*tasks)

        # Organize results by example_id
        for example_id, result in example_results:
            results[example_id] = result

        return results

    async def _evaluate_with_semaphore(
        self,
        semaphore: asyncio.Semaphore,
        example_id: str,
        scores: Dict[str, float],
        metadata: Dict[str, Any],
    ) -> Tuple[str, Dict[str, AlertResult]]:
        """
        Helper method to evaluate rules for an example with semaphore control.

        Args:
            semaphore: Semaphore to control concurrency
            example_id: ID of the example being evaluated
            scores: Dictionary of scores for this example
            metadata: Metadata for this example

        Returns:
            Tuple of (example_id, rule_results)
        """
        async with semaphore:
            # Run the evaluation in a thread pool to avoid blocking the event loop
            # for CPU-bound operations
            with ThreadPoolExecutor() as executor:
                rule_results = await asyncio.get_event_loop().run_in_executor(
                    executor, self.evaluate_rules, scores, metadata
                )

            return (example_id, rule_results)
judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py
DELETED
@@ -1,52 +0,0 @@
"""
`judgeval` tool correctness scorer

TODO add link to docs page for this scorer

"""

# Internal imports
from judgeval.scorers.api_scorer import APIScorerConfig
from judgeval.constants import APIScorerType
from typing import Optional, Dict
from judgeval.data import ExampleParams


class ExecutionOrderScorer(APIScorerConfig):
    kwargs: Optional[Dict] = None

    def __init__(
        self,
        threshold: float,
        should_exact_match: bool = False,
        should_consider_ordering: bool = False,
    ):
        super().__init__(
            threshold=threshold,
            score_type=APIScorerType.EXECUTION_ORDER,
            required_params=[
                ExampleParams.ACTUAL_OUTPUT,
                ExampleParams.EXPECTED_OUTPUT,
            ],
        )
        self.kwargs = {
            "should_exact_match": should_exact_match,
            "should_consider_ordering": should_consider_ordering,
        }

    @property
    def __name__(self):
        return "Execution Order"

    def to_dict(self) -> dict:
        """
        Converts the scorer configuration to a dictionary format.

        Returns:
            dict: A dictionary containing the scorer's configuration
        """
        return {
            "score_type": self.score_type,
            "threshold": self.threshold,
            "kwargs": self.kwargs,
        }
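The removed scorer was pure configuration; a sketch of constructing it under 0.7.1, with flag values chosen for illustration:

from judgeval.scorers.judgeval_scorers.api_scorers.execution_order import (
    ExecutionOrderScorer,  # deleted in 0.9.0
)

scorer = ExecutionOrderScorer(
    threshold=1.0,
    should_exact_match=True,        # stored in kwargs, not interpreted locally
    should_consider_ordering=True,  # likewise forwarded verbatim
)
# to_dict() ships score_type (APIScorerType.EXECUTION_ORDER), threshold,
# and the kwargs above to the scoring API.
payload = scorer.to_dict()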
judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py
DELETED
@@ -1,28 +0,0 @@
"""
`judgeval` hallucination scorer

TODO add link to docs page for this scorer

"""

# Internal imports
from judgeval.scorers.api_scorer import APIScorerConfig
from judgeval.constants import APIScorerType
from judgeval.data import ExampleParams


class HallucinationScorer(APIScorerConfig):
    def __init__(self, threshold: float):
        super().__init__(
            threshold=threshold,
            score_type=APIScorerType.HALLUCINATION,
            required_params=[
                ExampleParams.INPUT,
                ExampleParams.ACTUAL_OUTPUT,
                ExampleParams.CONTEXT,
            ],
        )

    @property
    def __name__(self):
        return "Hallucination"
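The hallucination scorer followed the same pattern, with its required example fields fixed in the constructor; a 0.7.1-era construction sketch:

from judgeval.scorers.judgeval_scorers.api_scorers.hallucination import (
    HallucinationScorer,  # deleted in 0.9.0
)

# Only scores examples that carry INPUT, ACTUAL_OUTPUT, and CONTEXT fields.
scorer = HallucinationScorer(threshold=0.5)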
judgeval/utils/alerts.py
DELETED
@@ -1,93 +0,0 @@
"""
Handling alerts in Judgeval.
"""

from enum import Enum
from typing import Dict, Any, List, Optional
from pydantic import BaseModel


class AlertStatus(str, Enum):
    """Status of an alert evaluation."""

    TRIGGERED = "triggered"
    NOT_TRIGGERED = "not_triggered"


class AlertResult(BaseModel):
    """
    Result of a rule evaluation.

    Attributes:
        rule_name: Name of the rule that was evaluated
        rule_id: Unique identifier of the rule
        status: Status of the alert (triggered or not)
        conditions_result: List of condition evaluation results
        metadata: Dictionary containing example_id, timestamp, and other metadata
        notification: Optional notification configuration for triggered alerts
        combine_type: The combination type used ("all" or "any")
        project_id: Optional project identifier
        trace_span_id: Optional trace span identifier
    """

    rule_name: str
    rule_id: Optional[str] = None  # The unique identifier of the rule
    status: AlertStatus
    conditions_result: List[Dict[str, Any]] = []
    metadata: Dict[str, Any] = {}
    notification: Optional[Any] = (
        None  # NotificationConfig when triggered, None otherwise
    )
    combine_type: Optional[str] = None  # "all" or "any"
    project_id: Optional[str] = None  # Project identifier
    trace_span_id: Optional[str] = None  # Trace span identifier

    @property
    def example_id(self) -> Optional[str]:
        """Get example_id from metadata for backward compatibility"""
        return self.metadata.get("example_id")

    @property
    def timestamp(self) -> Optional[str]:
        """Get timestamp from metadata for backward compatibility"""
        return self.metadata.get("timestamp")

    @property
    def conditions_results(self) -> List[Dict[str, Any]]:
        """Backwards compatibility property for the conditions_result field"""
        return self.conditions_result

    def model_dump(self, **kwargs):
        """
        Convert the AlertResult to a dictionary for JSON serialization.

        Args:
            **kwargs: Additional arguments to pass to Pydantic's model_dump

        Returns:
            dict: Dictionary representation of the AlertResult
        """
        data = (
            super().model_dump(**kwargs)
            if hasattr(super(), "model_dump")
            else super().dict(**kwargs)
        )

        # Handle the NotificationConfig object if it exists
        if hasattr(self, "notification") and self.notification is not None:
            if hasattr(self.notification, "model_dump"):
                data["notification"] = self.notification.model_dump()
            elif hasattr(self.notification, "dict"):
                data["notification"] = self.notification.dict()
            else:
                # Manually convert the notification to a dictionary
                notif = self.notification
                data["notification"] = {
                    "enabled": notif.enabled,
                    "communication_methods": notif.communication_methods,
                    "email_addresses": notif.email_addresses,
                    "slack_channels": getattr(notif, "slack_channels", []),
                    "send_at": notif.send_at,
                }

        return data