judgeval 0.0.14__py3-none-any.whl → 0.0.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/tracer.py +104 -28
- judgeval/common/utils.py +12 -13
- judgeval/constants.py +61 -10
- judgeval/data/datasets/dataset.py +1 -1
- judgeval/data/datasets/eval_dataset_client.py +0 -1
- judgeval/evaluation_run.py +8 -0
- judgeval/judges/together_judge.py +1 -1
- judgeval/judges/utils.py +1 -1
- judgeval/judgment_client.py +139 -14
- judgeval/rules.py +384 -0
- judgeval/run_evaluation.py +16 -5
- judgeval/scorers/api_scorer.py +11 -12
- judgeval/scorers/base_scorer.py +1 -1
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -1
- judgeval/utils/alerts.py +43 -0
- {judgeval-0.0.14.dist-info → judgeval-0.0.16.dist-info}/METADATA +1 -1
- {judgeval-0.0.14.dist-info → judgeval-0.0.16.dist-info}/RECORD +19 -17
- {judgeval-0.0.14.dist-info → judgeval-0.0.16.dist-info}/WHEEL +0 -0
- {judgeval-0.0.14.dist-info → judgeval-0.0.16.dist-info}/licenses/LICENSE.md +0 -0
judgeval/judgment_client.py
CHANGED
@@ -15,7 +15,8 @@ from judgeval.scorers import (
     APIJudgmentScorer,
     JudgevalScorer,
     ClassifierScorer,
-    ScorerWrapper
+    ScorerWrapper,
+    score,
 )
 from judgeval.evaluation_run import EvaluationRun
 from judgeval.run_evaluation import (
@@ -26,6 +27,7 @@ from judgeval.judges import JudgevalJudge
 from judgeval.constants import JUDGMENT_EVAL_FETCH_API_URL, JUDGMENT_EVAL_DELETE_API_URL, JUDGMENT_EVAL_DELETE_PROJECT_API_URL
 from judgeval.common.exceptions import JudgmentAPIError
 from pydantic import BaseModel
+from judgeval.rules import Rule

 class EvalRunRequestBody(BaseModel):
     eval_name: str
@@ -34,7 +36,7 @@ class EvalRunRequestBody(BaseModel):


 class JudgmentClient:
-    def __init__(self, judgment_api_key: str = os.getenv("JUDGMENT_API_KEY"), organization_id: str = os.getenv("
+    def __init__(self, judgment_api_key: str = os.getenv("JUDGMENT_API_KEY"), organization_id: str = os.getenv("JUDGMENT_ORG_ID")):
         self.judgment_api_key = judgment_api_key
         self.organization_id = organization_id
         self.eval_dataset_client = EvalDatasetClient(judgment_api_key, organization_id)
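The constructor now defaults `organization_id` to the `JUDGMENT_ORG_ID` environment variable alongside `JUDGMENT_API_KEY`. A minimal usage sketch (the placeholder credential values are illustrative, not part of the diff):

from judgeval.judgment_client import JudgmentClient

# With JUDGMENT_API_KEY and JUDGMENT_ORG_ID exported in the shell, the defaults are picked up:
client = JudgmentClient()

# Or pass credentials explicitly (placeholder values shown):
client = JudgmentClient(judgment_api_key="<api-key>", organization_id="<org-id>")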
@@ -58,17 +60,69 @@ class JudgmentClient:
         project_name: str = "default_project",
         eval_run_name: str = "default_eval_run",
         override: bool = False,
-        use_judgment: bool = True
+        use_judgment: bool = True,
+        rules: Optional[List[Rule]] = None
     ) -> List[ScoringResult]:
         """
         Executes an evaluation of `Example`s using one or more `Scorer`s
+
+        Args:
+            examples (List[Example]): The examples to evaluate
+            scorers (List[Union[ScorerWrapper, JudgevalScorer]]): A list of scorers to use for evaluation
+            model (Union[str, List[str], JudgevalJudge]): The model used as a judge when using LLM as a Judge
+            aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
+            metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run
+            log_results (bool): Whether to log the results to the Judgment API
+            project_name (str): The name of the project the evaluation results belong to
+            eval_run_name (str): A name for this evaluation run
+            override (bool): Whether to override an existing evaluation run with the same name
+            use_judgment (bool): Whether to use Judgment API for evaluation
+            rules (Optional[List[Rule]]): Rules to evaluate against scoring results
+
+        Returns:
+            List[ScoringResult]: The results of the evaluation
         """
         try:
             # Load appropriate implementations for all scorers
-            loaded_scorers: List[Union[JudgevalScorer, APIJudgmentScorer]] = [
-
-
-
+            loaded_scorers: List[Union[JudgevalScorer, APIJudgmentScorer]] = []
+            for scorer in scorers:
+                try:
+                    if isinstance(scorer, ScorerWrapper):
+                        loaded_scorers.append(scorer.load_implementation(use_judgment=use_judgment))
+                    else:
+                        loaded_scorers.append(scorer)
+                except Exception as e:
+                    raise ValueError(f"Failed to load implementation for scorer {scorer}: {str(e)}")
+
+            # Prevent using JudgevalScorer with rules - only APIJudgmentScorer allowed with rules
+            if rules and any(isinstance(scorer, JudgevalScorer) for scorer in loaded_scorers):
+                raise ValueError("Cannot use Judgeval scorers (only API scorers) when using rules. Please either remove rules or use only APIJudgmentScorer types.")
+
+            # Convert ScorerWrapper in rules to their implementations
+            loaded_rules = None
+            if rules:
+                loaded_rules = []
+                for rule in rules:
+                    try:
+                        processed_conditions = []
+                        for condition in rule.conditions:
+                            # Convert metric if it's a ScorerWrapper
+                            if isinstance(condition.metric, ScorerWrapper):
+                                try:
+                                    condition_copy = condition.model_copy()
+                                    condition_copy.metric = condition.metric.load_implementation(use_judgment=use_judgment)
+                                    processed_conditions.append(condition_copy)
+                                except Exception as e:
+                                    raise ValueError(f"Failed to convert ScorerWrapper to implementation in rule '{rule.name}', condition metric '{condition.metric}': {str(e)}")
+                            else:
+                                processed_conditions.append(condition)
+
+                        # Create new rule with processed conditions
+                        new_rule = rule.model_copy()
+                        new_rule.conditions = processed_conditions
+                        loaded_rules.append(new_rule)
+                    except Exception as e:
+                        raise ValueError(f"Failed to process rule '{rule.name}': {str(e)}")

             eval = EvaluationRun(
                 log_results=log_results,
@@ -79,12 +133,15 @@ class JudgmentClient:
                 model=model,
                 aggregator=aggregator,
                 metadata=metadata,
-                judgment_api_key=self.judgment_api_key,
+                judgment_api_key=self.judgment_api_key,
+                rules=loaded_rules,
                 organization_id=self.organization_id
             )
             return run_eval(eval, override)
         except ValueError as e:
             raise ValueError(f"Please check your EvaluationRun object, one or more fields are invalid: \n{str(e)}")
+        except Exception as e:
+            raise Exception(f"An unexpected error occurred during evaluation: {str(e)}")

     def evaluate_dataset(
         self,
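Taken together, the `run_evaluation` changes accept a `rules` list, resolve any `ScorerWrapper` metrics inside each rule, and reject local `JudgevalScorer`s when rules are present. A hedged usage sketch (the `Example` and `FaithfulnessScorer` import paths and the judge model name are assumptions, not part of this diff; the scorer names follow the examples in `judgeval/rules.py` below):

from judgeval.judgment_client import JudgmentClient
from judgeval.data import Example                 # assumed import path
from judgeval.scorers import FaithfulnessScorer   # assumed import path
from judgeval.rules import Rule, Condition

client = JudgmentClient()
rule = Rule(
    name="Quality Check",
    conditions=[Condition(metric=FaithfulnessScorer(threshold=0.7), operator=">=", threshold=0.7)],
    combine_type="all",
)

# Rules are only allowed with API scorers; mixing in a JudgevalScorer raises ValueError.
results = client.run_evaluation(
    examples=[Example(input="What is the capital of France?",
                      actual_output="Paris is the capital of France.")],
    scorers=[FaithfulnessScorer(threshold=0.7)],
    model="gpt-4o",                               # illustrative judge model name
    eval_run_name="run_with_rules",
    rules=[rule],
)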
@@ -96,17 +153,68 @@ class JudgmentClient:
         project_name: str = "",
         eval_run_name: str = "",
         log_results: bool = False,
-        use_judgment: bool = True
+        use_judgment: bool = True,
+        rules: Optional[List[Rule]] = None
     ) -> List[ScoringResult]:
         """
         Executes an evaluation of a `EvalDataset` using one or more `Scorer`s
+
+        Args:
+            dataset (EvalDataset): The dataset containing examples to evaluate
+            scorers (List[Union[ScorerWrapper, JudgevalScorer]]): A list of scorers to use for evaluation
+            model (Union[str, List[str], JudgevalJudge]): The model used as a judge when using LLM as a Judge
+            aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
+            metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run
+            project_name (str): The name of the project the evaluation results belong to
+            eval_run_name (str): A name for this evaluation run
+            log_results (bool): Whether to log the results to the Judgment API
+            use_judgment (bool): Whether to use Judgment API for evaluation
+            rules (Optional[List[Rule]]): Rules to evaluate against scoring results
+
+        Returns:
+            List[ScoringResult]: The results of the evaluation
         """
         try:
             # Load appropriate implementations for all scorers
-            loaded_scorers: List[Union[JudgevalScorer, APIJudgmentScorer]] = [
-
-
-
+            loaded_scorers: List[Union[JudgevalScorer, APIJudgmentScorer]] = []
+            for scorer in scorers:
+                try:
+                    if isinstance(scorer, ScorerWrapper):
+                        loaded_scorers.append(scorer.load_implementation(use_judgment=use_judgment))
+                    else:
+                        loaded_scorers.append(scorer)
+                except Exception as e:
+                    raise ValueError(f"Failed to load implementation for scorer {scorer}: {str(e)}")
+
+            # Prevent using JudgevalScorer with rules - only APIJudgmentScorer allowed with rules
+            if rules and any(isinstance(scorer, JudgevalScorer) for scorer in loaded_scorers):
+                raise ValueError("Cannot use Judgeval scorers (only API scorers) when using rules. Please either remove rules or use only APIJudgmentScorer types.")
+
+            # Convert ScorerWrapper in rules to their implementations
+            loaded_rules = None
+            if rules:
+                loaded_rules = []
+                for rule in rules:
+                    try:
+                        processed_conditions = []
+                        for condition in rule.conditions:
+                            # Convert metric if it's a ScorerWrapper
+                            if isinstance(condition.metric, ScorerWrapper):
+                                try:
+                                    condition_copy = condition.model_copy()
+                                    condition_copy.metric = condition.metric.load_implementation(use_judgment=use_judgment)
+                                    processed_conditions.append(condition_copy)
+                                except Exception as e:
+                                    raise ValueError(f"Failed to convert ScorerWrapper to implementation in rule '{rule.name}', condition metric '{condition.metric}': {str(e)}")
+                            else:
+                                processed_conditions.append(condition)
+
+                        # Create new rule with processed conditions
+                        new_rule = rule.model_copy()
+                        new_rule.conditions = processed_conditions
+                        loaded_rules.append(new_rule)
+                    except Exception as e:
+                        raise ValueError(f"Failed to process rule '{rule.name}': {str(e)}")

             evaluation_run = EvaluationRun(
                 log_results=log_results,
@@ -118,11 +226,14 @@
                 aggregator=aggregator,
                 metadata=metadata,
                 judgment_api_key=self.judgment_api_key,
+                rules=loaded_rules,
                 organization_id=self.organization_id
             )
             return run_eval(evaluation_run)
         except ValueError as e:
             raise ValueError(f"Please check your EvaluationRun object, one or more fields are invalid: \n{str(e)}")
+        except Exception as e:
+            raise Exception(f"An unexpected error occurred during evaluation: {str(e)}")

     def create_dataset(self) -> EvalDataset:
         return self.eval_dataset_client.create_dataset()
@@ -364,9 +475,22 @@
         project_name: str = "default_project",
         eval_run_name: str = "default_eval_run",
         override: bool = False,
+        rules: Optional[List[Rule]] = None
     ) -> None:
         """
         Asserts a test by running the evaluation and checking the results for success
+
+        Args:
+            examples (List[Example]): The examples to evaluate
+            scorers (List[Union[APIJudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
+            model (Union[str, List[str], JudgevalJudge]): The model used as a judge when using LLM as a Judge
+            aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
+            metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run
+            log_results (bool): Whether to log the results to the Judgment API
+            project_name (str): The name of the project the evaluation results belong to
+            eval_run_name (str): A name for this evaluation run
+            override (bool): Whether to override an existing evaluation run with the same name
+            rules (Optional[List[Rule]]): Rules to evaluate against scoring results
         """
         results = self.run_evaluation(
             examples=examples,
@@ -377,7 +501,8 @@
             log_results=log_results,
             project_name=project_name,
             eval_run_name=eval_run_name,
-            override=override
+            override=override,
+            rules=rules
         )

         assert_test(results)
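`evaluate_dataset` and `assert_test` gain the same `rules` parameter; `assert_test` forwards it to `run_evaluation` and then asserts on the scoring results, so it can gate a test suite. Continuing the sketch above (same `client`, `rule`, assumed imports, and illustrative example fields):

client.assert_test(
    examples=[Example(input="What is the capital of France?",
                      actual_output="Paris is the capital of France.")],
    scorers=[FaithfulnessScorer(threshold=0.7)],
    model="gpt-4o",
    eval_run_name="ci_gate_with_rules",
    rules=[rule],
)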
judgeval/rules.py
ADDED
@@ -0,0 +1,384 @@
+"""
+Rules system for Judgeval that enables alerts based on metric thresholds.
+"""
+
+from typing import Dict, List, Optional, Union, Any, Set, Tuple
+from pydantic import BaseModel, Field, field_validator, ConfigDict
+from enum import Enum
+from datetime import datetime
+import asyncio
+from concurrent.futures import ThreadPoolExecutor
+import time
+import uuid  # Add import for uuid module
+
+from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
+from judgeval.scorers.judgeval_scorers import ScorerWrapper  # Import from the correct module
+
+class AlertStatus(str, Enum):
+    """Status of an alert evaluation."""
+    TRIGGERED = "triggered"
+    NOT_TRIGGERED = "not_triggered"
+
+class Operator(str, Enum):
+    """Comparison operators for conditions."""
+    GT = ">"
+    GTE = ">="
+    LT = "<"
+    LTE = "<="
+    EQ = "=="
+    NEQ = "!="
+
+class Condition(BaseModel):
+    """
+    A single metric condition.
+
+    Example:
+        {
+            "metric": FaithfulnessScorer(threshold=0.7)  # Must be a scorer object: APIJudgmentScorer, JudgevalScorer, or ScorerWrapper
+            "operator": ">=",
+            "threshold": 0.7
+        }
+    """
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    metric: Union[APIJudgmentScorer, JudgevalScorer, ScorerWrapper]
+    operator: Operator
+    threshold: float
+
+    @property
+    def metric_name(self) -> str:
+        """Get the name of the metric for lookups in scores dictionary."""
+        if isinstance(self.metric, ScorerWrapper):
+            # Handle ScorerWrapper case specifically
+            return self.metric.scorer.score_type if hasattr(self.metric.scorer, 'score_type') else str(self.metric.scorer)
+        elif hasattr(self.metric, 'score_type'):
+            # Handle APIJudgmentScorer and JudgevalScorer which have score_type
+            return self.metric.score_type
+        elif hasattr(self.metric, '__name__'):
+            # Handle cases where metric has a __name__ attribute
+            return self.metric.__name__
+        # Fallback to string representation
+        return str(self.metric)
+
+    def evaluate(self, value: float) -> bool:
+        """Evaluate this condition against a value."""
+        if self.operator == Operator.GT:
+            return value > self.threshold
+        elif self.operator == Operator.GTE:
+            return value >= self.threshold
+        elif self.operator == Operator.LT:
+            return value < self.threshold
+        elif self.operator == Operator.LTE:
+            return value <= self.threshold
+        elif self.operator == Operator.EQ:
+            return value == self.threshold
+        elif self.operator == Operator.NEQ:
+            return value != self.threshold
+        else:
+            raise ValueError(f"Unknown operator: {self.operator}")
+
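A quick sketch of `Condition` in isolation (assuming `FaithfulnessScorer` is importable from `judgeval.scorers`; the resolved metric name is indicative only):

from judgeval.rules import Condition, Operator
from judgeval.scorers import FaithfulnessScorer   # assumed import path

cond = Condition(metric=FaithfulnessScorer(threshold=0.7), operator=Operator.GTE, threshold=0.7)
print(cond.metric_name)     # the scorer's score_type, used as the key into the scores dict
print(cond.evaluate(0.82))  # True  (0.82 >= 0.7)
print(cond.evaluate(0.55))  # False (0.55 <  0.7)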
+class Rule(BaseModel):
+    """
+    Configuration for a single rule.
+
+    Example:
+        {
+            "rule_id": "123e4567-e89b-12d3-a456-426614174000",
+            "name": "Quality Check",
+            "description": "Check if quality metrics meet thresholds",
+            "conditions": [
+                {"metric": FaithfulnessScorer(threshold=0.7), "operator": ">=", "threshold": 0.7},
+                {"metric": AnswerRelevancyScorer(threshold=0.8), "operator": ">=", "threshold": 0.8}
+            ],
+            "combine_type": "all"  # "all" or "any"
+        }
+    """
+    rule_id: str = Field(default_factory=lambda: str(uuid.uuid4()))  # Random UUID string as default value
+    name: str
+    description: Optional[str] = None
+    conditions: List[Condition]
+    combine_type: str = Field(..., pattern="^(all|any)$")  # all = AND, any = OR
+
+    def model_dump(self, **kwargs):
+        """
+        Custom serialization that properly handles condition serialization.
+        """
+        data = super().model_dump(**kwargs)
+
+        # Special handling for conditions with complex metric objects
+        if "conditions" in data:
+            for i, condition in enumerate(data["conditions"]):
+                if "metric" in condition:
+                    # Get the actual metric object
+                    metric_obj = self.conditions[i].metric
+
+                    # Create standardized metric representation needed by server API
+                    metric_data = {
+                        "score_type": "",
+                        "threshold": 0.0
+                    }
+
+                    # First try to use object's own serialization methods
+                    if hasattr(metric_obj, "to_dict"):
+                        orig_data = metric_obj.to_dict()
+                        # Copy any existing fields
+                        for key, value in orig_data.items():
+                            metric_data[key] = value
+                    elif hasattr(metric_obj, "model_dump"):
+                        orig_data = metric_obj.model_dump()
+                        # Copy any existing fields
+                        for key, value in orig_data.items():
+                            metric_data[key] = value
+
+                    # If we already have data from original serialization methods but missing required fields
+                    if 'name' in metric_data and 'score_type' not in metric_data:
+                        metric_data['score_type'] = metric_data['name']
+
+                    # Ensure required fields have values by checking various sources
+                    if not metric_data['score_type']:
+                        # Try to get score_type from different possible attributes
+                        if hasattr(metric_obj, 'score_type'):
+                            metric_data['score_type'] = metric_obj.score_type
+                        elif hasattr(metric_obj, 'name'):
+                            metric_data['score_type'] = metric_obj.name
+                        else:
+                            # Last resort: use string representation
+                            metric_data['score_type'] = str(metric_obj)
+
+                    # Make sure threshold is set
+                    if not metric_data.get('threshold') and metric_data.get('threshold') != 0.0:
+                        if hasattr(metric_obj, 'threshold'):
+                            metric_data['threshold'] = metric_obj.threshold
+                        else:
+                            # Use condition threshold if metric doesn't have one
+                            metric_data['threshold'] = self.conditions[i].threshold
+
+                    # Update the condition with our properly serialized metric
+                    condition["metric"] = metric_data
+
+        return data
+
+    @field_validator('conditions')
+    def validate_conditions_not_empty(cls, v):
+        if not v:
+            raise ValueError("Conditions list cannot be empty")
+        return v
+
+    @field_validator('combine_type')
+    def validate_combine_type(cls, v):
+        if v not in ["all", "any"]:
+            raise ValueError(f"combine_type must be 'all' or 'any', got: {v}")
+        return v
+
+
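`Rule.model_dump()` overrides pydantic's default so each condition's metric is flattened into a {"score_type": ..., "threshold": ...} dict for the server API, and `rule_id` defaults to a fresh UUID string. A short sketch (same assumed scorer import as above):

from judgeval.rules import Rule, Condition
from judgeval.scorers import FaithfulnessScorer   # assumed import path

rule = Rule(
    name="Quality Check",
    description="Check if quality metrics meet thresholds",
    conditions=[Condition(metric=FaithfulnessScorer(threshold=0.7), operator=">=", threshold=0.7)],
    combine_type="all",
)

payload = rule.model_dump()
print(payload["rule_id"])                  # auto-generated UUID string
print(payload["conditions"][0]["metric"])  # normalized dict, e.g. {"score_type": ..., "threshold": 0.7, ...}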
+class AlertResult(BaseModel):
+    """
+    Result of evaluating a rule.
+
+    Example:
+        {
+            "status": "triggered",
+            "rule_name": "Quality Check",
+            "conditions_result": [
+                {"metric": "faithfulness", "value": 0.6, "threshold": 0.7, "passed": False},
+                {"metric": "relevancy", "value": 0.9, "threshold": 0.8, "passed": True}
+            ],
+            "rule_id": "123e4567-e89b-12d3-a456-426614174000",
+            "metadata": {
+                "example_id": "example_123",
+                "timestamp": "20240321_123456"
+            }
+        }
+    """
+    status: AlertStatus
+    rule_id: Optional[str] = None  # The unique identifier of the rule
+    rule_name: str
+    conditions_result: List[Dict[str, Any]]
+    metadata: Dict[str, Any] = {}
+
+    @property
+    def example_id(self) -> Optional[str]:
+        """Get example_id from metadata for backward compatibility"""
+        return self.metadata.get("example_id")
+
+    @property
+    def timestamp(self) -> Optional[str]:
+        """Get timestamp from metadata for backward compatibility"""
+        return self.metadata.get("timestamp")
+
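`AlertResult` stores `example_id` and `timestamp` inside `metadata` but still exposes them as properties for backward compatibility, for instance:

from judgeval.rules import AlertResult, AlertStatus

alert = AlertResult(
    status=AlertStatus.NOT_TRIGGERED,
    rule_id="123e4567-e89b-12d3-a456-426614174000",
    rule_name="Quality Check",
    conditions_result=[{"metric": "faithfulness", "value": 0.6, "threshold": 0.7, "passed": False}],
    metadata={"example_id": "example_123", "timestamp": "20240321_123456"},
)
print(alert.example_id, alert.timestamp)  # read through the backward-compatibility properties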
+class RulesEngine:
+    """
+    Engine for evaluating rules and managing alerts.
+
+    Example usage:
+        rules = {
+            "quality_check": Rule(
+                name="Quality Check",
+                conditions=[
+                    Condition(metric=FaithfulnessScorer(threshold=0.7), operator=">=", threshold=0.7),
+                    Condition(metric=AnswerRelevancyScorer(threshold=0.8), operator=">=", threshold=0.8)
+                ],
+                combine_type="all"
+            )
+        }
+
+        engine = RulesEngine(rules)
+        scores = {"faithfulness": 0.8, "relevancy": 0.9}
+        alerts = engine.evaluate_rules(scores, example_metadata={
+            "example_id": "example_123",
+            "timestamp": "20240321_123456"
+        })
+    """
+
+    def __init__(self, rules: Dict[str, Rule]):
+        """
+        Initialize the RulesEngine with rules.
+
+        Args:
+            rules: Dictionary mapping rule IDs to rule configurations
+        """
+        self.rules = rules
+
+    def evaluate_rules(self, scores: Dict[str, float], example_metadata: Optional[Dict[str, Any]] = None) -> Dict[str, AlertResult]:
+        """
+        Evaluate all rules against a set of scores.
+        Returns mapping of rule IDs to their alert results.
+
+        Args:
+            scores: Dictionary of metric names to their score values
+            example_metadata: Optional dictionary containing example metadata (example_id, timestamp)
+        """
+        results = {}
+
+        for rule_id, rule in self.rules.items():
+            # Evaluate each condition
+            condition_results = []
+            passed_conditions = []
+
+            for condition in rule.conditions:
+                # Get the metric name for lookup
+                metric_name = condition.metric_name
+                value = scores.get(metric_name)
+                if value is None:
+                    # Skip this condition instead of evaluating it as false
+                    condition_results.append({
+                        "metric": metric_name,
+                        "value": None,
+                        "threshold": condition.threshold,
+                        "operator": condition.operator,
+                        "passed": None,  # Using None to indicate the condition was skipped
+                        "skipped": True  # Add a flag to indicate this condition was skipped
+                    })
+                    continue  # Skip adding to passed_conditions
+                else:
+                    passed = condition.evaluate(value)
+                    condition_results.append({
+                        "metric": metric_name,
+                        "value": value,
+                        "threshold": condition.threshold,
+                        "operator": condition.operator,
+                        "passed": passed,
+                        "skipped": False  # Indicate this condition was evaluated
+                    })
+                    passed_conditions.append(passed)
+
+            # Determine if alert should trigger - only consider conditions that weren't skipped
+            if not passed_conditions:
+                # If all conditions were skipped, the rule doesn't trigger
+                triggered = False
+            else:
+                triggered = all(passed_conditions) if rule.combine_type == "all" else any(passed_conditions)
+
+            # Create alert result with example metadata
+            alert_result = AlertResult(
+                status=AlertStatus.TRIGGERED if triggered else AlertStatus.NOT_TRIGGERED,
+                rule_id=rule.rule_id,  # Include the rule's unique identifier
+                rule_name=rule.name,
+                conditions_result=condition_results
+            )
+
+            # Add example metadata if provided
+            if example_metadata:
+                if "example_id" in example_metadata:
+                    alert_result.metadata["example_id"] = example_metadata["example_id"]
+                if "timestamp" in example_metadata:
+                    alert_result.metadata["timestamp"] = example_metadata["timestamp"]
+
+            results[rule_id] = alert_result
+
+        return results
+
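Following the class docstring, a sketch of synchronous rule evaluation (assumed scorer import paths; score keys must match each condition's `metric_name`, and a missing key marks that condition as skipped rather than failed):

from judgeval.rules import RulesEngine, Rule, Condition
from judgeval.scorers import FaithfulnessScorer, AnswerRelevancyScorer   # assumed import paths

engine = RulesEngine({
    "quality_check": Rule(
        name="Quality Check",
        conditions=[
            Condition(metric=FaithfulnessScorer(threshold=0.7), operator=">=", threshold=0.7),
            Condition(metric=AnswerRelevancyScorer(threshold=0.8), operator=">=", threshold=0.8),
        ],
        combine_type="all",
    )
})

results = engine.evaluate_rules(
    {"faithfulness": 0.8, "relevancy": 0.9},   # key names follow the docstring example
    example_metadata={"example_id": "example_123", "timestamp": "20240321_123456"},
)
print(results["quality_check"].status)         # AlertStatus.TRIGGERED when all conditions pass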
+    async def evaluate_rules_parallel(self,
+                                      example_scores: Dict[str, Dict[str, float]],
+                                      example_metadata: Dict[str, Dict[str, Any]],
+                                      max_concurrent: int = 100) -> Dict[str, Dict[str, AlertResult]]:
+        """
+        Evaluate all rules against multiple examples in parallel.
+
+        Args:
+            example_scores: Dictionary mapping example_ids to their score dictionaries
+            example_metadata: Dictionary mapping example_ids to their metadata
+            max_concurrent: Maximum number of concurrent evaluations
+
+        Returns:
+            Dictionary mapping example_ids to dictionaries of rule_ids and their alert results
+        """
+        # Create semaphore to limit concurrent executions
+        semaphore = asyncio.Semaphore(max_concurrent)
+        results = {}
+        tasks = []
+
+        # Create a task for each example
+        for example_id, scores in example_scores.items():
+            metadata = example_metadata.get(example_id, {})
+            task = self._evaluate_with_semaphore(
+                semaphore=semaphore,
+                example_id=example_id,
+                scores=scores,
+                metadata=metadata
+            )
+            tasks.append(task)
+
+        # Run all tasks and collect results
+        example_results = await asyncio.gather(*tasks)
+
+        # Organize results by example_id
+        for example_id, result in example_results:
+            results[example_id] = result
+
+        return results
+
+    async def _evaluate_with_semaphore(self,
+                                       semaphore: asyncio.Semaphore,
+                                       example_id: str,
+                                       scores: Dict[str, float],
+                                       metadata: Dict[str, Any]) -> Tuple[str, Dict[str, AlertResult]]:
+        """
+        Helper method to evaluate rules for an example with semaphore control.
+
+        Args:
+            semaphore: Semaphore to control concurrency
+            example_id: ID of the example being evaluated
+            scores: Dictionary of scores for this example
+            metadata: Metadata for this example
+
+        Returns:
+            Tuple of (example_id, rule_results)
+        """
+        async with semaphore:
+            # Run the evaluation in a thread pool to avoid blocking the event loop
+            # for CPU-bound operations
+            with ThreadPoolExecutor() as executor:
+                start_time = time.perf_counter()
+                rule_results = await asyncio.get_event_loop().run_in_executor(
+                    executor,
+                    self.evaluate_rules,
+                    scores,
+                    metadata
+                )
+                end_time = time.perf_counter()
+
+                # Could log performance metrics here if needed
+                # debug(f"Rule evaluation for example {example_id} took {end_time - start_time:.4f} seconds")
+
+                return (example_id, rule_results)
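And a sketch of the parallel path, which fans each example's scores out to `evaluate_rules` in a thread pool bounded by `max_concurrent` (same assumed scorer import):

import asyncio
from judgeval.rules import RulesEngine, Rule, Condition
from judgeval.scorers import FaithfulnessScorer   # assumed import path

engine = RulesEngine({
    "quality_check": Rule(
        name="Quality Check",
        conditions=[Condition(metric=FaithfulnessScorer(threshold=0.7), operator=">=", threshold=0.7)],
        combine_type="all",
    )
})

example_scores = {"example_1": {"faithfulness": 0.9}, "example_2": {"faithfulness": 0.5}}
example_metadata = {
    "example_1": {"example_id": "example_1", "timestamp": "20240321_123456"},
    "example_2": {"example_id": "example_2", "timestamp": "20240321_123456"},
}

async def main():
    # Each example's rule evaluation runs in a worker thread, gated by the semaphore.
    results = await engine.evaluate_rules_parallel(example_scores, example_metadata, max_concurrent=10)
    for example_id, rule_results in results.items():
        print(example_id, {rule_id: r.status for rule_id, r in rule_results.items()})

asyncio.run(main())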