judgeval 0.0.14__py3-none-any.whl → 0.0.15__py3-none-any.whl

This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their public registries.
@@ -15,7 +15,8 @@ from judgeval.scorers import (
     APIJudgmentScorer,
     JudgevalScorer,
     ClassifierScorer,
-    ScorerWrapper
+    ScorerWrapper,
+    score,
 )
 from judgeval.evaluation_run import EvaluationRun
 from judgeval.run_evaluation import (
@@ -26,6 +27,7 @@ from judgeval.judges import JudgevalJudge
 from judgeval.constants import JUDGMENT_EVAL_FETCH_API_URL, JUDGMENT_EVAL_DELETE_API_URL, JUDGMENT_EVAL_DELETE_PROJECT_API_URL
 from judgeval.common.exceptions import JudgmentAPIError
 from pydantic import BaseModel
+from judgeval.rules import Rule

 class EvalRunRequestBody(BaseModel):
     eval_name: str
@@ -34,7 +36,7 @@ class EvalRunRequestBody(BaseModel):
 
 
 class JudgmentClient:
-    def __init__(self, judgment_api_key: str = os.getenv("JUDGMENT_API_KEY"), organization_id: str = os.getenv("ORGANIZATION_ID")):
+    def __init__(self, judgment_api_key: str = os.getenv("JUDGMENT_API_KEY"), organization_id: str = os.getenv("JUDGMENT_ORG_ID")):
         self.judgment_api_key = judgment_api_key
         self.organization_id = organization_id
         self.eval_dataset_client = EvalDatasetClient(judgment_api_key, organization_id)
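
In 0.0.15 the organization ID default is read from the JUDGMENT_ORG_ID environment variable instead of ORGANIZATION_ID. A minimal sketch of client construction under the new default follows; the import path and the placeholder values are assumptions, not taken from this diff.

import os

# Placeholder credentials for illustration only; set these before importing judgeval,
# since the defaults in __init__ are captured when the module is imported.
os.environ.setdefault("JUDGMENT_API_KEY", "sk-placeholder")
os.environ.setdefault("JUDGMENT_ORG_ID", "org-placeholder")

from judgeval.judgment_client import JudgmentClient  # import path assumed

# Relying on the defaults is then equivalent to passing the environment values explicitly:
client = JudgmentClient(
    judgment_api_key=os.getenv("JUDGMENT_API_KEY"),
    organization_id=os.getenv("JUDGMENT_ORG_ID"),
)
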
@@ -58,17 +60,69 @@ class JudgmentClient:
         project_name: str = "default_project",
         eval_run_name: str = "default_eval_run",
         override: bool = False,
-        use_judgment: bool = True
+        use_judgment: bool = True,
+        rules: Optional[List[Rule]] = None
     ) -> List[ScoringResult]:
         """
         Executes an evaluation of `Example`s using one or more `Scorer`s
+
+        Args:
+            examples (List[Example]): The examples to evaluate
+            scorers (List[Union[ScorerWrapper, JudgevalScorer]]): A list of scorers to use for evaluation
+            model (Union[str, List[str], JudgevalJudge]): The model used as a judge when using LLM as a Judge
+            aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
+            metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run
+            log_results (bool): Whether to log the results to the Judgment API
+            project_name (str): The name of the project the evaluation results belong to
+            eval_run_name (str): A name for this evaluation run
+            override (bool): Whether to override an existing evaluation run with the same name
+            use_judgment (bool): Whether to use Judgment API for evaluation
+            rules (Optional[List[Rule]]): Rules to evaluate against scoring results
+
+        Returns:
+            List[ScoringResult]: The results of the evaluation
         """
         try:
             # Load appropriate implementations for all scorers
-            loaded_scorers: List[Union[JudgevalScorer, APIJudgmentScorer]] = [
-                scorer.load_implementation(use_judgment=use_judgment) if isinstance(scorer, ScorerWrapper) else scorer
-                for scorer in scorers
-            ]
+            loaded_scorers: List[Union[JudgevalScorer, APIJudgmentScorer]] = []
+            for scorer in scorers:
+                try:
+                    if isinstance(scorer, ScorerWrapper):
+                        loaded_scorers.append(scorer.load_implementation(use_judgment=use_judgment))
+                    else:
+                        loaded_scorers.append(scorer)
+                except Exception as e:
+                    raise ValueError(f"Failed to load implementation for scorer {scorer}: {str(e)}")
+
+            # Prevent using JudgevalScorer with rules - only APIJudgmentScorer allowed with rules
+            if rules and any(isinstance(scorer, JudgevalScorer) for scorer in loaded_scorers):
+                raise ValueError("Cannot use Judgeval scorers (only API scorers) when using rules. Please either remove rules or use only APIJudgmentScorer types.")
+
+            # Convert ScorerWrapper in rules to their implementations
+            loaded_rules = None
+            if rules:
+                loaded_rules = []
+                for rule in rules:
+                    try:
+                        processed_conditions = []
+                        for condition in rule.conditions:
+                            # Convert metric if it's a ScorerWrapper
+                            if isinstance(condition.metric, ScorerWrapper):
+                                try:
+                                    condition_copy = condition.model_copy()
+                                    condition_copy.metric = condition.metric.load_implementation(use_judgment=use_judgment)
+                                    processed_conditions.append(condition_copy)
+                                except Exception as e:
+                                    raise ValueError(f"Failed to convert ScorerWrapper to implementation in rule '{rule.name}', condition metric '{condition.metric}': {str(e)}")
+                            else:
+                                processed_conditions.append(condition)
+
+                        # Create new rule with processed conditions
+                        new_rule = rule.model_copy()
+                        new_rule.conditions = processed_conditions
+                        loaded_rules.append(new_rule)
+                    except Exception as e:
+                        raise ValueError(f"Failed to process rule '{rule.name}': {str(e)}")
 
             eval = EvaluationRun(
                 log_results=log_results,
@@ -79,12 +133,15 @@ class JudgmentClient:
                 model=model,
                 aggregator=aggregator,
                 metadata=metadata,
-                judgment_api_key=self.judgment_api_key,
+                judgment_api_key=self.judgment_api_key,
+                rules=loaded_rules,
                 organization_id=self.organization_id
             )
             return run_eval(eval, override)
         except ValueError as e:
             raise ValueError(f"Please check your EvaluationRun object, one or more fields are invalid: \n{str(e)}")
+        except Exception as e:
+            raise Exception(f"An unexpected error occurred during evaluation: {str(e)}")
 
     def evaluate_dataset(
         self,
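
For orientation, a hedged sketch of calling run_evaluation with the new rules parameter follows. FaithfulnessScorer, the Example fields, the judge model string, and the import paths are assumptions drawn from the docstrings in this release, not verified against the published API; `client` is the JudgmentClient built in the earlier sketch.

from judgeval.data import Example                # import path assumed
from judgeval.scorers import FaithfulnessScorer  # scorer name and path assumed
from judgeval.rules import Rule, Condition

faithfulness = FaithfulnessScorer(threshold=0.7)

quality_rule = Rule(
    name="Quality Check",
    conditions=[Condition(metric=faithfulness, operator=">=", threshold=0.7)],
    combine_type="all",  # "all" = every condition must pass, "any" = at least one
)

results = client.run_evaluation(
    examples=[Example(input="What is the capital of France?", actual_output="Paris")],  # field names assumed
    scorers=[faithfulness],
    model="gpt-4o",        # placeholder judge model
    rules=[quality_rule],  # passing JudgevalScorer instances alongside rules raises ValueError; API scorers only
)
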
@@ -96,17 +153,68 @@ class JudgmentClient:
         project_name: str = "",
         eval_run_name: str = "",
         log_results: bool = False,
-        use_judgment: bool = True
+        use_judgment: bool = True,
+        rules: Optional[List[Rule]] = None
     ) -> List[ScoringResult]:
         """
         Executes an evaluation of a `EvalDataset` using one or more `Scorer`s
+
+        Args:
+            dataset (EvalDataset): The dataset containing examples to evaluate
+            scorers (List[Union[ScorerWrapper, JudgevalScorer]]): A list of scorers to use for evaluation
+            model (Union[str, List[str], JudgevalJudge]): The model used as a judge when using LLM as a Judge
+            aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
+            metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run
+            project_name (str): The name of the project the evaluation results belong to
+            eval_run_name (str): A name for this evaluation run
+            log_results (bool): Whether to log the results to the Judgment API
+            use_judgment (bool): Whether to use Judgment API for evaluation
+            rules (Optional[List[Rule]]): Rules to evaluate against scoring results
+
+        Returns:
+            List[ScoringResult]: The results of the evaluation
         """
         try:
             # Load appropriate implementations for all scorers
-            loaded_scorers: List[Union[JudgevalScorer, APIJudgmentScorer]] = [
-                scorer.load_implementation(use_judgment=use_judgment) if isinstance(scorer, ScorerWrapper) else scorer
-                for scorer in scorers
-            ]
+            loaded_scorers: List[Union[JudgevalScorer, APIJudgmentScorer]] = []
+            for scorer in scorers:
+                try:
+                    if isinstance(scorer, ScorerWrapper):
+                        loaded_scorers.append(scorer.load_implementation(use_judgment=use_judgment))
+                    else:
+                        loaded_scorers.append(scorer)
+                except Exception as e:
+                    raise ValueError(f"Failed to load implementation for scorer {scorer}: {str(e)}")
+
+            # Prevent using JudgevalScorer with rules - only APIJudgmentScorer allowed with rules
+            if rules and any(isinstance(scorer, JudgevalScorer) for scorer in loaded_scorers):
+                raise ValueError("Cannot use Judgeval scorers (only API scorers) when using rules. Please either remove rules or use only APIJudgmentScorer types.")
+
+            # Convert ScorerWrapper in rules to their implementations
+            loaded_rules = None
+            if rules:
+                loaded_rules = []
+                for rule in rules:
+                    try:
+                        processed_conditions = []
+                        for condition in rule.conditions:
+                            # Convert metric if it's a ScorerWrapper
+                            if isinstance(condition.metric, ScorerWrapper):
+                                try:
+                                    condition_copy = condition.model_copy()
+                                    condition_copy.metric = condition.metric.load_implementation(use_judgment=use_judgment)
+                                    processed_conditions.append(condition_copy)
+                                except Exception as e:
+                                    raise ValueError(f"Failed to convert ScorerWrapper to implementation in rule '{rule.name}', condition metric '{condition.metric}': {str(e)}")
+                            else:
+                                processed_conditions.append(condition)
+
+                        # Create new rule with processed conditions
+                        new_rule = rule.model_copy()
+                        new_rule.conditions = processed_conditions
+                        loaded_rules.append(new_rule)
+                    except Exception as e:
+                        raise ValueError(f"Failed to process rule '{rule.name}': {str(e)}")
 
             evaluation_run = EvaluationRun(
                 log_results=log_results,
@@ -118,11 +226,14 @@ class JudgmentClient:
                 aggregator=aggregator,
                 metadata=metadata,
                 judgment_api_key=self.judgment_api_key,
+                rules=loaded_rules,
                 organization_id=self.organization_id
             )
             return run_eval(evaluation_run)
         except ValueError as e:
             raise ValueError(f"Please check your EvaluationRun object, one or more fields are invalid: \n{str(e)}")
+        except Exception as e:
+            raise Exception(f"An unexpected error occurred during evaluation: {str(e)}")
 
     def create_dataset(self) -> EvalDataset:
         return self.eval_dataset_client.create_dataset()
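
evaluate_dataset gains the same rules plumbing. A brief sketch, reusing the hypothetical names from the previous example; populating the dataset is elided because the EvalDataset API is not shown in this diff.

dataset = client.create_dataset()
# ... populate the dataset with examples (EvalDataset API not shown here) ...

results = client.evaluate_dataset(
    dataset=dataset,
    scorers=[faithfulness],
    model="gpt-4o",        # placeholder judge model
    rules=[quality_rule],
    log_results=True,
)
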
@@ -195,7 +306,8 @@ class JudgmentClient:
                 "Authorization": f"Bearer {self.judgment_api_key}",
                 "X-Organization-Id": self.organization_id
             },
-            json=eval_run_request_body.model_dump())
+            json=eval_run_request_body.model_dump(),
+            verify=False)
         if eval_run.status_code != requests.codes.ok:
             raise ValueError(f"Error fetching eval results: {eval_run.json()}")
 
@@ -267,7 +379,8 @@ class JudgmentClient:
                 "Content-Type": "application/json",
                 "Authorization": f"Bearer {self.judgment_api_key}",
             },
-            json={}  # Empty body now
+            json={},  # Empty body now
+            verify=False
         )
         if response.status_code == 200:
             return True, response.json()
@@ -298,7 +411,8 @@ class JudgmentClient:
                 "Content-Type": "application/json",
                 "Authorization": f"Bearer {self.judgment_api_key}",
                 "X-Organization-Id": self.organization_id
-            }
+            },
+            verify=False
         )
 
         if response.status_code == 500:
@@ -341,7 +455,8 @@ class JudgmentClient:
                 "Content-Type": "application/json",
                 "Authorization": f"Bearer {self.judgment_api_key}",
                 "X-Organization-Id": self.organization_id
-            }
+            },
+            verify=False
         )
 
         if response.status_code == 500:
@@ -364,9 +479,22 @@ class JudgmentClient:
         project_name: str = "default_project",
         eval_run_name: str = "default_eval_run",
         override: bool = False,
+        rules: Optional[List[Rule]] = None
     ) -> None:
         """
         Asserts a test by running the evaluation and checking the results for success
+
+        Args:
+            examples (List[Example]): The examples to evaluate
+            scorers (List[Union[APIJudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
+            model (Union[str, List[str], JudgevalJudge]): The model used as a judge when using LLM as a Judge
+            aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
+            metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run
+            log_results (bool): Whether to log the results to the Judgment API
+            project_name (str): The name of the project the evaluation results belong to
+            eval_run_name (str): A name for this evaluation run
+            override (bool): Whether to override an existing evaluation run with the same name
+            rules (Optional[List[Rule]]): Rules to evaluate against scoring results
         """
         results = self.run_evaluation(
             examples=examples,
@@ -377,7 +505,8 @@ class JudgmentClient:
             log_results=log_results,
             project_name=project_name,
             eval_run_name=eval_run_name,
-            override=override
+            override=override,
+            rules=rules
         )
 
         assert_test(results)
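
assert_test forwards rules to run_evaluation and then asserts on the results. A short hedged sketch, again using the hypothetical names introduced above:

client.assert_test(
    examples=[Example(input="2 + 2", actual_output="4")],  # field names assumed
    scorers=[faithfulness],
    model="gpt-4o",
    rules=[quality_rule],  # forwarded unchanged to run_evaluation
)
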
judgeval/rules.py ADDED
@@ -0,0 +1,384 @@
+"""
+Rules system for Judgeval that enables alerts based on metric thresholds.
+"""
+
+from typing import Dict, List, Optional, Union, Any, Set, Tuple
+from pydantic import BaseModel, Field, field_validator, ConfigDict
+from enum import Enum
+from datetime import datetime
+import asyncio
+from concurrent.futures import ThreadPoolExecutor
+import time
+import uuid  # Add import for uuid module
+
+from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
+from judgeval.scorers.judgeval_scorers import ScorerWrapper  # Import from the correct module
+
+class AlertStatus(str, Enum):
+    """Status of an alert evaluation."""
+    TRIGGERED = "triggered"
+    NOT_TRIGGERED = "not_triggered"
+
+class Operator(str, Enum):
+    """Comparison operators for conditions."""
+    GT = ">"
+    GTE = ">="
+    LT = "<"
+    LTE = "<="
+    EQ = "=="
+    NEQ = "!="
+
+class Condition(BaseModel):
+    """
+    A single metric condition.
+
+    Example:
+        {
+            "metric": FaithfulnessScorer(threshold=0.7)  # Must be a scorer object: APIJudgmentScorer, JudgevalScorer, or ScorerWrapper
+            "operator": ">=",
+            "threshold": 0.7
+        }
+    """
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    metric: Union[APIJudgmentScorer, JudgevalScorer, ScorerWrapper]
+    operator: Operator
+    threshold: float
+
+    @property
+    def metric_name(self) -> str:
+        """Get the name of the metric for lookups in scores dictionary."""
+        if isinstance(self.metric, ScorerWrapper):
+            # Handle ScorerWrapper case specifically
+            return self.metric.scorer.score_type if hasattr(self.metric.scorer, 'score_type') else str(self.metric.scorer)
+        elif hasattr(self.metric, 'score_type'):
+            # Handle APIJudgmentScorer and JudgevalScorer which have score_type
+            return self.metric.score_type
+        elif hasattr(self.metric, '__name__'):
+            # Handle cases where metric has a __name__ attribute
+            return self.metric.__name__
+        # Fallback to string representation
+        return str(self.metric)
+
+    def evaluate(self, value: float) -> bool:
+        """Evaluate this condition against a value."""
+        if self.operator == Operator.GT:
+            return value > self.threshold
+        elif self.operator == Operator.GTE:
+            return value >= self.threshold
+        elif self.operator == Operator.LT:
+            return value < self.threshold
+        elif self.operator == Operator.LTE:
+            return value <= self.threshold
+        elif self.operator == Operator.EQ:
+            return value == self.threshold
+        elif self.operator == Operator.NEQ:
+            return value != self.threshold
+        else:
+            raise ValueError(f"Unknown operator: {self.operator}")
+
+class Rule(BaseModel):
+    """
+    Configuration for a single rule.
+
+    Example:
+        {
+            "rule_id": "123e4567-e89b-12d3-a456-426614174000",
+            "name": "Quality Check",
+            "description": "Check if quality metrics meet thresholds",
+            "conditions": [
+                {"metric": FaithfulnessScorer(threshold=0.7), "operator": ">=", "threshold": 0.7},
+                {"metric": AnswerRelevancyScorer(threshold=0.8), "operator": ">=", "threshold": 0.8}
+            ],
+            "combine_type": "all"  # "all" or "any"
+        }
+    """
+    rule_id: str = Field(default_factory=lambda: str(uuid.uuid4()))  # Random UUID string as default value
+    name: str
+    description: Optional[str] = None
+    conditions: List[Condition]
+    combine_type: str = Field(..., pattern="^(all|any)$")  # all = AND, any = OR
+
+    def model_dump(self, **kwargs):
+        """
+        Custom serialization that properly handles condition serialization.
+        """
+        data = super().model_dump(**kwargs)
+
+        # Special handling for conditions with complex metric objects
+        if "conditions" in data:
+            for i, condition in enumerate(data["conditions"]):
+                if "metric" in condition:
+                    # Get the actual metric object
+                    metric_obj = self.conditions[i].metric
+
+                    # Create standardized metric representation needed by server API
+                    metric_data = {
+                        "score_type": "",
+                        "threshold": 0.0
+                    }
+
+                    # First try to use object's own serialization methods
+                    if hasattr(metric_obj, "to_dict"):
+                        orig_data = metric_obj.to_dict()
+                        # Copy any existing fields
+                        for key, value in orig_data.items():
+                            metric_data[key] = value
+                    elif hasattr(metric_obj, "model_dump"):
+                        orig_data = metric_obj.model_dump()
+                        # Copy any existing fields
+                        for key, value in orig_data.items():
+                            metric_data[key] = value
+
+                    # If we already have data from original serialization methods but missing required fields
+                    if 'name' in metric_data and 'score_type' not in metric_data:
+                        metric_data['score_type'] = metric_data['name']
+
+                    # Ensure required fields have values by checking various sources
+                    if not metric_data['score_type']:
+                        # Try to get score_type from different possible attributes
+                        if hasattr(metric_obj, 'score_type'):
+                            metric_data['score_type'] = metric_obj.score_type
+                        elif hasattr(metric_obj, 'name'):
+                            metric_data['score_type'] = metric_obj.name
+                        else:
+                            # Last resort: use string representation
+                            metric_data['score_type'] = str(metric_obj)
+
+                    # Make sure threshold is set
+                    if not metric_data.get('threshold') and metric_data.get('threshold') != 0.0:
+                        if hasattr(metric_obj, 'threshold'):
+                            metric_data['threshold'] = metric_obj.threshold
+                        else:
+                            # Use condition threshold if metric doesn't have one
+                            metric_data['threshold'] = self.conditions[i].threshold
+
+                    # Update the condition with our properly serialized metric
+                    condition["metric"] = metric_data
+
+        return data
+
+    @field_validator('conditions')
+    def validate_conditions_not_empty(cls, v):
+        if not v:
+            raise ValueError("Conditions list cannot be empty")
+        return v
+
+    @field_validator('combine_type')
+    def validate_combine_type(cls, v):
+        if v not in ["all", "any"]:
+            raise ValueError(f"combine_type must be 'all' or 'any', got: {v}")
+        return v
+
+
+class AlertResult(BaseModel):
+    """
+    Result of evaluating a rule.
+
+    Example:
+        {
+            "status": "triggered",
+            "rule_name": "Quality Check",
+            "conditions_result": [
+                {"metric": "faithfulness", "value": 0.6, "threshold": 0.7, "passed": False},
+                {"metric": "relevancy", "value": 0.9, "threshold": 0.8, "passed": True}
+            ],
+            "rule_id": "123e4567-e89b-12d3-a456-426614174000",
+            "metadata": {
+                "example_id": "example_123",
+                "timestamp": "20240321_123456"
+            }
+        }
+    """
+    status: AlertStatus
+    rule_id: Optional[str] = None  # The unique identifier of the rule
+    rule_name: str
+    conditions_result: List[Dict[str, Any]]
+    metadata: Dict[str, Any] = {}
+
+    @property
+    def example_id(self) -> Optional[str]:
+        """Get example_id from metadata for backward compatibility"""
+        return self.metadata.get("example_id")
+
+    @property
+    def timestamp(self) -> Optional[str]:
+        """Get timestamp from metadata for backward compatibility"""
+        return self.metadata.get("timestamp")
+
+class RulesEngine:
+    """
+    Engine for evaluating rules and managing alerts.
+
+    Example usage:
+        rules = {
+            "quality_check": Rule(
+                name="Quality Check",
+                conditions=[
+                    Condition(metric=FaithfulnessScorer(threshold=0.7), operator=">=", threshold=0.7),
+                    Condition(metric=AnswerRelevancyScorer(threshold=0.8), operator=">=", threshold=0.8)
+                ],
+                combine_type="all"
+            )
+        }
+
+        engine = RulesEngine(rules)
+        scores = {"faithfulness": 0.8, "relevancy": 0.9}
+        alerts = engine.evaluate_rules(scores, example_metadata={
+            "example_id": "example_123",
+            "timestamp": "20240321_123456"
+        })
+    """
+
+    def __init__(self, rules: Dict[str, Rule]):
+        """
+        Initialize the RulesEngine with rules.
+
+        Args:
+            rules: Dictionary mapping rule IDs to rule configurations
+        """
+        self.rules = rules
+
+    def evaluate_rules(self, scores: Dict[str, float], example_metadata: Optional[Dict[str, Any]] = None) -> Dict[str, AlertResult]:
+        """
+        Evaluate all rules against a set of scores.
+        Returns mapping of rule IDs to their alert results.
+
+        Args:
+            scores: Dictionary of metric names to their score values
+            example_metadata: Optional dictionary containing example metadata (example_id, timestamp)
+        """
+        results = {}
+
+        for rule_id, rule in self.rules.items():
+            # Evaluate each condition
+            condition_results = []
+            passed_conditions = []
+
+            for condition in rule.conditions:
+                # Get the metric name for lookup
+                metric_name = condition.metric_name
+                value = scores.get(metric_name)
+                if value is None:
+                    # Skip this condition instead of evaluating it as false
+                    condition_results.append({
+                        "metric": metric_name,
+                        "value": None,
+                        "threshold": condition.threshold,
+                        "operator": condition.operator,
+                        "passed": None,  # Using None to indicate the condition was skipped
+                        "skipped": True  # Add a flag to indicate this condition was skipped
+                    })
+                    continue  # Skip adding to passed_conditions
+                else:
+                    passed = condition.evaluate(value)
+                    condition_results.append({
+                        "metric": metric_name,
+                        "value": value,
+                        "threshold": condition.threshold,
+                        "operator": condition.operator,
+                        "passed": passed,
+                        "skipped": False  # Indicate this condition was evaluated
+                    })
+                    passed_conditions.append(passed)
+
+            # Determine if alert should trigger - only consider conditions that weren't skipped
+            if not passed_conditions:
+                # If all conditions were skipped, the rule doesn't trigger
+                triggered = False
+            else:
+                triggered = all(passed_conditions) if rule.combine_type == "all" else any(passed_conditions)
+
+            # Create alert result with example metadata
+            alert_result = AlertResult(
+                status=AlertStatus.TRIGGERED if triggered else AlertStatus.NOT_TRIGGERED,
+                rule_id=rule.rule_id,  # Include the rule's unique identifier
+                rule_name=rule.name,
+                conditions_result=condition_results
+            )
+
+            # Add example metadata if provided
+            if example_metadata:
+                if "example_id" in example_metadata:
+                    alert_result.metadata["example_id"] = example_metadata["example_id"]
+                if "timestamp" in example_metadata:
+                    alert_result.metadata["timestamp"] = example_metadata["timestamp"]
+
+            results[rule_id] = alert_result
+
+        return results
+
+    async def evaluate_rules_parallel(self,
+                                      example_scores: Dict[str, Dict[str, float]],
+                                      example_metadata: Dict[str, Dict[str, Any]],
+                                      max_concurrent: int = 100) -> Dict[str, Dict[str, AlertResult]]:
+        """
+        Evaluate all rules against multiple examples in parallel.
+
+        Args:
+            example_scores: Dictionary mapping example_ids to their score dictionaries
+            example_metadata: Dictionary mapping example_ids to their metadata
+            max_concurrent: Maximum number of concurrent evaluations
+
+        Returns:
+            Dictionary mapping example_ids to dictionaries of rule_ids and their alert results
+        """
+        # Create semaphore to limit concurrent executions
+        semaphore = asyncio.Semaphore(max_concurrent)
+        results = {}
+        tasks = []
+
+        # Create a task for each example
+        for example_id, scores in example_scores.items():
+            metadata = example_metadata.get(example_id, {})
+            task = self._evaluate_with_semaphore(
+                semaphore=semaphore,
+                example_id=example_id,
+                scores=scores,
+                metadata=metadata
+            )
+            tasks.append(task)
+
+        # Run all tasks and collect results
+        example_results = await asyncio.gather(*tasks)
+
+        # Organize results by example_id
+        for example_id, result in example_results:
+            results[example_id] = result
+
+        return results
+
+    async def _evaluate_with_semaphore(self,
+                                       semaphore: asyncio.Semaphore,
+                                       example_id: str,
+                                       scores: Dict[str, float],
+                                       metadata: Dict[str, Any]) -> Tuple[str, Dict[str, AlertResult]]:
+        """
+        Helper method to evaluate rules for an example with semaphore control.
+
+        Args:
+            semaphore: Semaphore to control concurrency
+            example_id: ID of the example being evaluated
+            scores: Dictionary of scores for this example
+            metadata: Metadata for this example
+
+        Returns:
+            Tuple of (example_id, rule_results)
+        """
+        async with semaphore:
+            # Run the evaluation in a thread pool to avoid blocking the event loop
+            # for CPU-bound operations
+            with ThreadPoolExecutor() as executor:
+                start_time = time.perf_counter()
+                rule_results = await asyncio.get_event_loop().run_in_executor(
+                    executor,
+                    self.evaluate_rules,
+                    scores,
+                    metadata
+                )
+                end_time = time.perf_counter()
+
+                # Could log performance metrics here if needed
+                # debug(f"Rule evaluation for example {example_id} took {end_time - start_time:.4f} seconds")
+
+                return (example_id, rule_results)
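
Pulling the new module together, the sketch below exercises RulesEngine directly, mirroring the class docstrings above. The scorer imports and the score-dictionary keys are assumptions (the keys must match each scorer's score_type, as resolved by Condition.metric_name); everything else follows the documented usage.

import asyncio

from judgeval.rules import Rule, Condition, RulesEngine
from judgeval.scorers import FaithfulnessScorer, AnswerRelevancyScorer  # import path and names assumed

engine = RulesEngine({
    "quality_check": Rule(
        name="Quality Check",
        conditions=[
            Condition(metric=FaithfulnessScorer(threshold=0.7), operator=">=", threshold=0.7),
            Condition(metric=AnswerRelevancyScorer(threshold=0.8), operator=">=", threshold=0.8),
        ],
        combine_type="all",
    )
})

# Single example: the keys stand in for the scorers' score_type values.
alerts = engine.evaluate_rules(
    {"faithfulness": 0.8, "relevancy": 0.9},
    example_metadata={"example_id": "example_123", "timestamp": "20240321_123456"},
)
print(alerts["quality_check"].status)  # AlertStatus.TRIGGERED or AlertStatus.NOT_TRIGGERED

# Many examples, evaluated concurrently under a bounded semaphore.
batch = asyncio.run(engine.evaluate_rules_parallel(
    example_scores={"ex1": {"faithfulness": 0.9, "relevancy": 0.95}},
    example_metadata={"ex1": {"example_id": "ex1", "timestamp": "20240321_123456"}},
))
print(batch["ex1"]["quality_check"].status)
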