judgeval 0.0.13__py3-none-any.whl → 0.0.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/tracer.py +126 -59
- judgeval/common/utils.py +12 -13
- judgeval/constants.py +61 -10
- judgeval/data/datasets/dataset.py +3 -2
- judgeval/data/datasets/eval_dataset_client.py +25 -14
- judgeval/data/example.py +8 -1
- judgeval/evaluation_run.py +9 -0
- judgeval/judges/together_judge.py +1 -1
- judgeval/judges/utils.py +1 -1
- judgeval/judgment_client.py +163 -28
- judgeval/rules.py +384 -0
- judgeval/run_evaluation.py +32 -14
- judgeval/scorers/api_scorer.py +11 -12
- judgeval/scorers/base_scorer.py +1 -1
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -1
- judgeval/utils/alerts.py +43 -0
- {judgeval-0.0.13.dist-info → judgeval-0.0.15.dist-info}/METADATA +1 -1
- {judgeval-0.0.13.dist-info → judgeval-0.0.15.dist-info}/RECORD +20 -18
- {judgeval-0.0.13.dist-info → judgeval-0.0.15.dist-info}/WHEEL +0 -0
- {judgeval-0.0.13.dist-info → judgeval-0.0.15.dist-info}/licenses/LICENSE.md +0 -0
judgeval/rules.py
ADDED
@@ -0,0 +1,384 @@
"""
Rules system for Judgeval that enables alerts based on metric thresholds.
"""

from typing import Dict, List, Optional, Union, Any, Set, Tuple
from pydantic import BaseModel, Field, field_validator, ConfigDict
from enum import Enum
from datetime import datetime
import asyncio
from concurrent.futures import ThreadPoolExecutor
import time
import uuid  # Add import for uuid module

from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
from judgeval.scorers.judgeval_scorers import ScorerWrapper  # Import from the correct module

class AlertStatus(str, Enum):
    """Status of an alert evaluation."""
    TRIGGERED = "triggered"
    NOT_TRIGGERED = "not_triggered"

class Operator(str, Enum):
    """Comparison operators for conditions."""
    GT = ">"
    GTE = ">="
    LT = "<"
    LTE = "<="
    EQ = "=="
    NEQ = "!="

class Condition(BaseModel):
    """
    A single metric condition.

    Example:
        {
            "metric": FaithfulnessScorer(threshold=0.7) # Must be a scorer object: APIJudgmentScorer, JudgevalScorer, or ScorerWrapper
            "operator": ">=",
            "threshold": 0.7
        }
    """
    model_config = ConfigDict(arbitrary_types_allowed=True)

    metric: Union[APIJudgmentScorer, JudgevalScorer, ScorerWrapper]
    operator: Operator
    threshold: float

    @property
    def metric_name(self) -> str:
        """Get the name of the metric for lookups in scores dictionary."""
        if isinstance(self.metric, ScorerWrapper):
            # Handle ScorerWrapper case specifically
            return self.metric.scorer.score_type if hasattr(self.metric.scorer, 'score_type') else str(self.metric.scorer)
        elif hasattr(self.metric, 'score_type'):
            # Handle APIJudgmentScorer and JudgevalScorer which have score_type
            return self.metric.score_type
        elif hasattr(self.metric, '__name__'):
            # Handle cases where metric has a __name__ attribute
            return self.metric.__name__
        # Fallback to string representation
        return str(self.metric)

    def evaluate(self, value: float) -> bool:
        """Evaluate this condition against a value."""
        if self.operator == Operator.GT:
            return value > self.threshold
        elif self.operator == Operator.GTE:
            return value >= self.threshold
        elif self.operator == Operator.LT:
            return value < self.threshold
        elif self.operator == Operator.LTE:
            return value <= self.threshold
        elif self.operator == Operator.EQ:
            return value == self.threshold
        elif self.operator == Operator.NEQ:
            return value != self.threshold
        else:
            raise ValueError(f"Unknown operator: {self.operator}")

class Rule(BaseModel):
    """
    Configuration for a single rule.

    Example:
        {
            "rule_id": "123e4567-e89b-12d3-a456-426614174000",
            "name": "Quality Check",
            "description": "Check if quality metrics meet thresholds",
            "conditions": [
                {"metric": FaithfulnessScorer(threshold=0.7), "operator": ">=", "threshold": 0.7},
                {"metric": AnswerRelevancyScorer(threshold=0.8), "operator": ">=", "threshold": 0.8}
            ],
            "combine_type": "all"  # "all" or "any"
        }
    """
    rule_id: str = Field(default_factory=lambda: str(uuid.uuid4()))  # Random UUID string as default value
    name: str
    description: Optional[str] = None
    conditions: List[Condition]
    combine_type: str = Field(..., pattern="^(all|any)$")  # all = AND, any = OR

    def model_dump(self, **kwargs):
        """
        Custom serialization that properly handles condition serialization.
        """
        data = super().model_dump(**kwargs)

        # Special handling for conditions with complex metric objects
        if "conditions" in data:
            for i, condition in enumerate(data["conditions"]):
                if "metric" in condition:
                    # Get the actual metric object
                    metric_obj = self.conditions[i].metric

                    # Create standardized metric representation needed by server API
                    metric_data = {
                        "score_type": "",
                        "threshold": 0.0
                    }

                    # First try to use object's own serialization methods
                    if hasattr(metric_obj, "to_dict"):
                        orig_data = metric_obj.to_dict()
                        # Copy any existing fields
                        for key, value in orig_data.items():
                            metric_data[key] = value
                    elif hasattr(metric_obj, "model_dump"):
                        orig_data = metric_obj.model_dump()
                        # Copy any existing fields
                        for key, value in orig_data.items():
                            metric_data[key] = value

                    # If we already have data from original serialization methods but missing required fields
                    if 'name' in metric_data and 'score_type' not in metric_data:
                        metric_data['score_type'] = metric_data['name']

                    # Ensure required fields have values by checking various sources
                    if not metric_data['score_type']:
                        # Try to get score_type from different possible attributes
                        if hasattr(metric_obj, 'score_type'):
                            metric_data['score_type'] = metric_obj.score_type
                        elif hasattr(metric_obj, 'name'):
                            metric_data['score_type'] = metric_obj.name
                        else:
                            # Last resort: use string representation
                            metric_data['score_type'] = str(metric_obj)

                    # Make sure threshold is set
                    if not metric_data.get('threshold') and metric_data.get('threshold') != 0.0:
                        if hasattr(metric_obj, 'threshold'):
                            metric_data['threshold'] = metric_obj.threshold
                        else:
                            # Use condition threshold if metric doesn't have one
                            metric_data['threshold'] = self.conditions[i].threshold

                    # Update the condition with our properly serialized metric
                    condition["metric"] = metric_data

        return data

    @field_validator('conditions')
    def validate_conditions_not_empty(cls, v):
        if not v:
            raise ValueError("Conditions list cannot be empty")
        return v

    @field_validator('combine_type')
    def validate_combine_type(cls, v):
        if v not in ["all", "any"]:
            raise ValueError(f"combine_type must be 'all' or 'any', got: {v}")
        return v


class AlertResult(BaseModel):
    """
    Result of evaluating a rule.

    Example:
        {
            "status": "triggered",
            "rule_name": "Quality Check",
            "conditions_result": [
                {"metric": "faithfulness", "value": 0.6, "threshold": 0.7, "passed": False},
                {"metric": "relevancy", "value": 0.9, "threshold": 0.8, "passed": True}
            ],
            "rule_id": "123e4567-e89b-12d3-a456-426614174000",
            "metadata": {
                "example_id": "example_123",
                "timestamp": "20240321_123456"
            }
        }
    """
    status: AlertStatus
    rule_id: Optional[str] = None  # The unique identifier of the rule
    rule_name: str
    conditions_result: List[Dict[str, Any]]
    metadata: Dict[str, Any] = {}

    @property
    def example_id(self) -> Optional[str]:
        """Get example_id from metadata for backward compatibility"""
        return self.metadata.get("example_id")

    @property
    def timestamp(self) -> Optional[str]:
        """Get timestamp from metadata for backward compatibility"""
        return self.metadata.get("timestamp")

class RulesEngine:
    """
    Engine for evaluating rules and managing alerts.

    Example usage:
        rules = {
            "quality_check": Rule(
                name="Quality Check",
                conditions=[
                    Condition(metric=FaithfulnessScorer(threshold=0.7), operator=">=", threshold=0.7),
                    Condition(metric=AnswerRelevancyScorer(threshold=0.8), operator=">=", threshold=0.8)
                ],
                combine_type="all"
            )
        }

        engine = RulesEngine(rules)
        scores = {"faithfulness": 0.8, "relevancy": 0.9}
        alerts = engine.evaluate_rules(scores, example_metadata={
            "example_id": "example_123",
            "timestamp": "20240321_123456"
        })
    """

    def __init__(self, rules: Dict[str, Rule]):
        """
        Initialize the RulesEngine with rules.

        Args:
            rules: Dictionary mapping rule IDs to rule configurations
        """
        self.rules = rules

    def evaluate_rules(self, scores: Dict[str, float], example_metadata: Optional[Dict[str, Any]] = None) -> Dict[str, AlertResult]:
        """
        Evaluate all rules against a set of scores.
        Returns mapping of rule IDs to their alert results.

        Args:
            scores: Dictionary of metric names to their score values
            example_metadata: Optional dictionary containing example metadata (example_id, timestamp)
        """
        results = {}

        for rule_id, rule in self.rules.items():
            # Evaluate each condition
            condition_results = []
            passed_conditions = []

            for condition in rule.conditions:
                # Get the metric name for lookup
                metric_name = condition.metric_name
                value = scores.get(metric_name)
                if value is None:
                    # Skip this condition instead of evaluating it as false
                    condition_results.append({
                        "metric": metric_name,
                        "value": None,
                        "threshold": condition.threshold,
                        "operator": condition.operator,
                        "passed": None,  # Using None to indicate the condition was skipped
                        "skipped": True  # Add a flag to indicate this condition was skipped
                    })
                    continue  # Skip adding to passed_conditions
                else:
                    passed = condition.evaluate(value)
                    condition_results.append({
                        "metric": metric_name,
                        "value": value,
                        "threshold": condition.threshold,
                        "operator": condition.operator,
                        "passed": passed,
                        "skipped": False  # Indicate this condition was evaluated
                    })
                    passed_conditions.append(passed)

            # Determine if alert should trigger - only consider conditions that weren't skipped
            if not passed_conditions:
                # If all conditions were skipped, the rule doesn't trigger
                triggered = False
            else:
                triggered = all(passed_conditions) if rule.combine_type == "all" else any(passed_conditions)

            # Create alert result with example metadata
            alert_result = AlertResult(
                status=AlertStatus.TRIGGERED if triggered else AlertStatus.NOT_TRIGGERED,
                rule_id=rule.rule_id,  # Include the rule's unique identifier
                rule_name=rule.name,
                conditions_result=condition_results
            )

            # Add example metadata if provided
            if example_metadata:
                if "example_id" in example_metadata:
                    alert_result.metadata["example_id"] = example_metadata["example_id"]
                if "timestamp" in example_metadata:
                    alert_result.metadata["timestamp"] = example_metadata["timestamp"]

            results[rule_id] = alert_result

        return results

    async def evaluate_rules_parallel(self,
                                      example_scores: Dict[str, Dict[str, float]],
                                      example_metadata: Dict[str, Dict[str, Any]],
                                      max_concurrent: int = 100) -> Dict[str, Dict[str, AlertResult]]:
        """
        Evaluate all rules against multiple examples in parallel.

        Args:
            example_scores: Dictionary mapping example_ids to their score dictionaries
            example_metadata: Dictionary mapping example_ids to their metadata
            max_concurrent: Maximum number of concurrent evaluations

        Returns:
            Dictionary mapping example_ids to dictionaries of rule_ids and their alert results
        """
        # Create semaphore to limit concurrent executions
        semaphore = asyncio.Semaphore(max_concurrent)
        results = {}
        tasks = []

        # Create a task for each example
        for example_id, scores in example_scores.items():
            metadata = example_metadata.get(example_id, {})
            task = self._evaluate_with_semaphore(
                semaphore=semaphore,
                example_id=example_id,
                scores=scores,
                metadata=metadata
            )
            tasks.append(task)

        # Run all tasks and collect results
        example_results = await asyncio.gather(*tasks)

        # Organize results by example_id
        for example_id, result in example_results:
            results[example_id] = result

        return results

    async def _evaluate_with_semaphore(self,
                                       semaphore: asyncio.Semaphore,
                                       example_id: str,
                                       scores: Dict[str, float],
                                       metadata: Dict[str, Any]) -> Tuple[str, Dict[str, AlertResult]]:
        """
        Helper method to evaluate rules for an example with semaphore control.

        Args:
            semaphore: Semaphore to control concurrency
            example_id: ID of the example being evaluated
            scores: Dictionary of scores for this example
            metadata: Metadata for this example

        Returns:
            Tuple of (example_id, rule_results)
        """
        async with semaphore:
            # Run the evaluation in a thread pool to avoid blocking the event loop
            # for CPU-bound operations
            with ThreadPoolExecutor() as executor:
                start_time = time.perf_counter()
                rule_results = await asyncio.get_event_loop().run_in_executor(
                    executor,
                    self.evaluate_rules,
                    scores,
                    metadata
                )
                end_time = time.perf_counter()

                # Could log performance metrics here if needed
                # debug(f"Rule evaluation for example {example_id} took {end_time - start_time:.4f} seconds")

                return (example_id, rule_results)
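For orientation, the sketch below shows how the new rules API might be used end to end. It is adapted from the RulesEngine docstring above; the scorer classes and the score keys ("faithfulness", "relevancy") are illustrative assumptions, since in practice the lookup keys come from each scorer's score_type.

# Minimal usage sketch of judgeval.rules, adapted from the RulesEngine docstring.
# Scorer imports and score keys are assumptions for illustration only.
from judgeval.rules import Rule, Condition, RulesEngine
from judgeval.scorers import FaithfulnessScorer, AnswerRelevancyScorer

rules = {
    "quality_check": Rule(
        name="Quality Check",
        conditions=[
            Condition(metric=FaithfulnessScorer(threshold=0.7), operator=">=", threshold=0.7),
            Condition(metric=AnswerRelevancyScorer(threshold=0.8), operator=">=", threshold=0.8),
        ],
        combine_type="all",  # trigger only if every evaluated condition passes
    )
}

engine = RulesEngine(rules)
alerts = engine.evaluate_rules(
    {"faithfulness": 0.8, "relevancy": 0.9},
    example_metadata={"example_id": "example_123", "timestamp": "20240321_123456"},
)
print(alerts["quality_check"].status)  # AlertStatus.TRIGGERED or AlertStatus.NOT_TRIGGERED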
judgeval/run_evaluation.py
CHANGED
@@ -20,6 +20,7 @@ from judgeval.constants import (
     ROOT_API,
     JUDGMENT_EVAL_API_URL,
     JUDGMENT_EVAL_LOG_API_URL,
+    MAX_CONCURRENT_EVALUATIONS
 )
 from judgeval.common.exceptions import JudgmentAPIError
 from judgeval.evaluation_run import EvaluationRun
@@ -30,6 +31,7 @@ from judgeval.common.logger import (
     error,
     example_logging_context
 )
+from judgeval.rules import RulesEngine, Rule, AlertResult, AlertStatus
 
 
 def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
@@ -50,9 +52,11 @@ def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
         response = requests.post(
             JUDGMENT_EVAL_API_URL, headers={
                 "Content-Type": "application/json",
-                "Authorization": f"Bearer {evaluation_run.judgment_api_key}"
+                "Authorization": f"Bearer {evaluation_run.judgment_api_key}",
+                "X-Organization-Id": evaluation_run.organization_id
             },
-            json=payload
+            json=payload,
+            verify=False)
         response_data = response.json()
     except Exception as e:
         error(f"Error: {e}")
@@ -140,7 +144,7 @@ def check_missing_scorer_data(results: List[ScoringResult]) -> List[ScoringResul
     return results
 
 
-def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_key: str) -> None:
+def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_key: str, organization_id: str) -> None:
     """
     Checks if an evaluation run name already exists for a given project.
 
@@ -158,13 +162,15 @@ def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_k
         f"{ROOT_API}/eval-run-name-exists/",
         headers={
             "Content-Type": "application/json",
-            "Authorization": f"Bearer {judgment_api_key}"
+            "Authorization": f"Bearer {judgment_api_key}",
+            "X-Organization-Id": organization_id
         },
         json={
             "eval_name": eval_name,
             "project_name": project_name,
             "judgment_api_key": judgment_api_key,
-        }
+        },
+        verify=False
     )
 
     if response.status_code == 409:
@@ -199,14 +205,15 @@ def log_evaluation_results(merged_results: List[ScoringResult], evaluation_run:
         JUDGMENT_EVAL_LOG_API_URL,
         headers={
             "Content-Type": "application/json",
-            "Authorization": f"Bearer {evaluation_run.judgment_api_key}"
+            "Authorization": f"Bearer {evaluation_run.judgment_api_key}",
+            "X-Organization-Id": evaluation_run.organization_id
         },
         json={
             "results": [result.to_dict() for result in merged_results],
-            "judgment_api_key": evaluation_run.judgment_api_key,
             "project_name": evaluation_run.project_name,
             "eval_name": evaluation_run.eval_name,
-        }
+        },
+        verify=False
     )
 
     if not res.ok:
@@ -226,6 +233,7 @@ def log_evaluation_results(merged_results: List[ScoringResult], evaluation_run:
         raise ValueError(f"Failed to save evaluation results to DB: {str(e)}")
 
 
+
 def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[ScoringResult]:
     """
     Executes an evaluation of `Example`s using one or more `Scorer`s
@@ -243,7 +251,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
         metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
         judgment_api_key (Optional[str]): The API key for running evaluations on the Judgment API
         log_results (bool): Whether to log the results to the Judgment API
-
+        rules (Optional[List[Rule]]): Rules to evaluate against scoring results
 
     Returns:
         List[ScoringResult]: The results of the evaluation. Each result is a dictionary containing the fields of a `ScoringResult` object.
@@ -254,7 +262,8 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
         check_eval_run_name_exists(
             evaluation_run.eval_name,
             evaluation_run.project_name,
-            evaluation_run.judgment_api_key
+            evaluation_run.judgment_api_key,
+            evaluation_run.organization_id
         )
 
     # Set example IDs if not already set
@@ -312,7 +321,9 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
             aggregator=evaluation_run.aggregator,
             metadata=evaluation_run.metadata,
             judgment_api_key=evaluation_run.judgment_api_key,
-
+            organization_id=evaluation_run.organization_id,
+            log_results=evaluation_run.log_results,
+            rules=evaluation_run.rules
         )
         debug("Sending request to Judgment API")
         response_data: List[Dict] = execute_api_eval(api_evaluation_run)  # Dicts are `ScoringResult` objs
@@ -342,7 +353,6 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
             ]
 
             api_results.append(ScoringResult(**filtered_result))
-
     # Run local evals
     if local_scorers:  # List[JudgevalScorer]
         info("Starting local evaluation")
@@ -360,12 +370,11 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
                 show_indicator=True,
                 _use_bar_indicator=True,
                 throttle_value=0,
-                max_concurrent=
+                max_concurrent=MAX_CONCURRENT_EVALUATIONS,
             )
         )
         local_results = results
        info(f"Local evaluation complete with {len(local_results)} results")
-
     # Aggregate the ScorerData from the API and local evaluations
     debug("Merging API and local results")
     merged_results: List[ScoringResult] = merge_results(api_results, local_results)
@@ -373,6 +382,15 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
 
     info(f"Successfully merged {len(merged_results)} results")
 
+    # Evaluate rules against local scoring results if rules exist (this cant be done just yet)
+    # if evaluation_run.rules and merged_results:
+    #     run_rules(
+    #         local_results=merged_results,
+    #         rules=evaluation_run.rules,
+    #         judgment_api_key=evaluation_run.judgment_api_key,
+    #         organization_id=evaluation_run.organization_id
+    #     )
+
     if evaluation_run.log_results:
         log_evaluation_results(merged_results, evaluation_run)
 
judgeval/scorers/api_scorer.py
CHANGED
@@ -34,22 +34,22 @@ class APIJudgmentScorer(BaseModel):
     @field_validator('score_type')
     def convert_to_enum_value(cls, v):
         """
-        Validates that the `score_type` is a valid `
-        Converts string values to `
+        Validates that the `score_type` is a valid `APIScorer` enum value.
+        Converts string values to `APIScorer` enum values.
         """
         debug(f"Attempting to convert score_type value: {v}")
         if isinstance(v, APIScorer):
-            info(f"Using existing
-            return v
+            info(f"Using existing APIScorer: {v}")
+            return v
         elif isinstance(v, str):
-            debug(f"Converting string value to
-            return APIScorer[v.upper()]
+            debug(f"Converting string value to APIScorer enum: {v}")
+            return APIScorer[v.upper()]
         error(f"Invalid score_type value: {v}")
         raise ValueError(f"Invalid value for score_type: {v}")
-
+
     def __str__(self):
-        return f"JudgmentScorer(score_type={self.score_type}, threshold={self.threshold})"
-
+        return f"JudgmentScorer(score_type={self.score_type.value}, threshold={self.threshold})"
+
     def to_dict(self) -> dict:
         """
         Converts the scorer configuration to a dictionary format.
@@ -58,7 +58,6 @@ class APIJudgmentScorer(BaseModel):
             dict: A dictionary containing the scorer's configuration
         """
         return {
-            "score_type": self.score_type,
+            "score_type": str(self.score_type.value), # Convert enum to string for serialization
             "threshold": self.threshold
-        }
-
+        }
judgeval/scorers/base_scorer.py
CHANGED
@@ -48,5 +48,5 @@ class APIJudgmentScorer(BaseModel):
         raise ValueError(f"Invalid value for score_type: {v}")
 
     def __str__(self):
-        return f"JudgmentScorer(score_type={self.score_type}, threshold={self.threshold})"
+        return f"JudgmentScorer(score_type={self.score_type.value}, threshold={self.threshold})"
 
judgeval/utils/alerts.py
ADDED
@@ -0,0 +1,43 @@
"""
Handling alerts in Judgeval.
"""
from enum import Enum
from typing import Dict, Any, List, Optional
from pydantic import BaseModel

class AlertStatus(str, Enum):
    """Status of an alert evaluation."""
    TRIGGERED = "triggered"
    NOT_TRIGGERED = "not_triggered"

class AlertResult(BaseModel):
    """
    Result of a rule evaluation.

    Attributes:
        rule_name: Name of the rule that was evaluated
        rule_id: Unique identifier of the rule
        status: Status of the alert (triggered or not)
        conditions_result: List of condition evaluation results
        metadata: Dictionary containing example_id, timestamp, and other metadata
    """
    rule_name: str
    rule_id: Optional[str] = None  # The unique identifier of the rule
    status: AlertStatus
    conditions_result: List[Dict[str, Any]] = []
    metadata: Dict[str, Any] = {}

    @property
    def example_id(self) -> Optional[str]:
        """Get example_id from metadata for backward compatibility"""
        return self.metadata.get("example_id")

    @property
    def timestamp(self) -> Optional[str]:
        """Get timestamp from metadata for backward compatibility"""
        return self.metadata.get("timestamp")

    @property
    def conditions_results(self) -> List[Dict[str, Any]]:
        """Backwards compatibility property for the conditions_result field"""
        return self.conditions_result
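A brief sketch of how this AlertResult model and its backward-compatibility properties behave; the field values below are illustrative only.

# Illustrative values; grounded in the AlertResult model added above.
from judgeval.utils.alerts import AlertResult, AlertStatus

result = AlertResult(
    rule_name="Quality Check",
    status=AlertStatus.TRIGGERED,
    conditions_result=[{"metric": "faithfulness", "value": 0.6, "threshold": 0.7, "passed": False}],
    metadata={"example_id": "example_123", "timestamp": "20240321_123456"},
)
print(result.example_id)                                       # "example_123", read from metadata
print(result.conditions_results == result.conditions_result)   # True, legacy alias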