judgeval 0.0.44__py3-none-any.whl → 0.0.46__py3-none-any.whl
This diff compares the contents of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- judgeval/__init__.py +5 -4
- judgeval/clients.py +6 -6
- judgeval/common/__init__.py +7 -2
- judgeval/common/exceptions.py +2 -3
- judgeval/common/logger.py +74 -49
- judgeval/common/s3_storage.py +30 -23
- judgeval/common/tracer.py +1273 -939
- judgeval/common/utils.py +416 -244
- judgeval/constants.py +73 -61
- judgeval/data/__init__.py +1 -1
- judgeval/data/custom_example.py +3 -2
- judgeval/data/datasets/dataset.py +80 -54
- judgeval/data/datasets/eval_dataset_client.py +131 -181
- judgeval/data/example.py +67 -43
- judgeval/data/result.py +11 -9
- judgeval/data/scorer_data.py +4 -2
- judgeval/data/tool.py +25 -16
- judgeval/data/trace.py +57 -29
- judgeval/data/trace_run.py +5 -11
- judgeval/evaluation_run.py +22 -82
- judgeval/integrations/langgraph.py +546 -184
- judgeval/judges/base_judge.py +1 -2
- judgeval/judges/litellm_judge.py +33 -11
- judgeval/judges/mixture_of_judges.py +128 -78
- judgeval/judges/together_judge.py +22 -9
- judgeval/judges/utils.py +14 -5
- judgeval/judgment_client.py +259 -271
- judgeval/rules.py +169 -142
- judgeval/run_evaluation.py +462 -305
- judgeval/scorers/api_scorer.py +20 -11
- judgeval/scorers/exceptions.py +1 -0
- judgeval/scorers/judgeval_scorer.py +77 -58
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +46 -15
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +12 -11
- judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +7 -5
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +5 -2
- judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +2 -1
- judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +17 -8
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +8 -9
- judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +4 -4
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +5 -5
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +5 -2
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +9 -10
- judgeval/scorers/prompt_scorer.py +48 -37
- judgeval/scorers/score.py +86 -53
- judgeval/scorers/utils.py +11 -7
- judgeval/tracer/__init__.py +1 -1
- judgeval/utils/alerts.py +23 -12
- judgeval/utils/{data_utils.py → file_utils.py} +5 -9
- judgeval/utils/requests.py +29 -0
- judgeval/version_check.py +5 -2
- {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/METADATA +79 -135
- judgeval-0.0.46.dist-info/RECORD +69 -0
- judgeval-0.0.44.dist-info/RECORD +0 -68
- {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/WHEEL +0 -0
- {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/licenses/LICENSE.md +0 -0
judgeval/rules.py
CHANGED
@@ -2,43 +2,39 @@
 Rules system for Judgeval that enables alerts based on metric thresholds.
 """
 
-from typing import Dict, List, Optional, Union, Any,
+from typing import Dict, List, Optional, Union, Any, Tuple
 from pydantic import BaseModel, Field, field_validator, ConfigDict
-from enum import Enum
 import asyncio
 from concurrent.futures import ThreadPoolExecutor
-import time
 import uuid
-import os
-import re
-import json
-from datetime import datetime
 
 from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
 from judgeval.utils.alerts import AlertStatus, AlertResult
 
+
 class Condition(BaseModel):
     """
     A single metric condition.
-
+
     Example:
         {
             "metric": FaithfulnessScorer(threshold=0.7) # Must be a scorer object: APIJudgmentScorer, JudgevalScorer
         }
-
+
     The Condition class uses the scorer's threshold and success function internally.
     """
+
     model_config = ConfigDict(arbitrary_types_allowed=True)
-
+
     metric: Union[APIJudgmentScorer, JudgevalScorer]
 
     @property
     def metric_name(self) -> str:
         """Get the name of the metric for lookups in scores dictionary."""
-        if hasattr(self.metric,
+        if hasattr(self.metric, "score_type"):
             # Handle APIJudgmentScorer and JudgevalScorer which have score_type
             return self.metric.score_type
-        elif hasattr(self.metric,
+        elif hasattr(self.metric, "__name__"):
             # Handle cases where metric has a __name__ attribute
             return self.metric.__name__
         # Fallback to string representation
@@ -47,7 +43,7 @@ class Condition(BaseModel):
     @property
     def threshold(self) -> float:
         """Get the threshold from the metric."""
-        return self.metric.threshold if hasattr(self.metric,
+        return self.metric.threshold if hasattr(self.metric, "threshold") else 0.5
 
     def evaluate(self, value: float) -> bool:
         """
@@ -56,22 +52,23 @@ class Condition(BaseModel):
         Uses the scorer's success check function if available.
         """
         # Store the value in the scorer
-        if hasattr(self.metric,
+        if hasattr(self.metric, "score"):
             self.metric.score = value
-
+
         # Use the scorer's success check function if available
-        if hasattr(self.metric,
+        if hasattr(self.metric, "success_check"):
             return self.metric.success_check()
-        elif hasattr(self.metric,
+        elif hasattr(self.metric, "_success_check"):
             return self.metric._success_check()
         else:
             # Fallback to default comparison (greater than or equal)
             return value >= self.threshold if self.threshold is not None else False
 
+
 class PagerDutyConfig(BaseModel):
     """
     Configuration for PagerDuty notifications.
-
+
     Attributes:
         routing_key: PagerDuty integration routing key
         severity: Severity level (critical, error, warning, info)
@@ -80,13 +77,14 @@ class PagerDutyConfig(BaseModel):
         group: Optional logical grouping for the alert
         class_type: Optional class/type of alert event
     """
+
     routing_key: str
     severity: str = "error"  # critical, error, warning, info
     source: str = "judgeval"
     component: Optional[str] = None
     group: Optional[str] = None
     class_type: Optional[str] = None
-
+
     def model_dump(self, **kwargs):
         """Convert the PagerDutyConfig to a dictionary for JSON serialization."""
         return {
@@ -95,13 +93,14 @@ class PagerDutyConfig(BaseModel):
             "source": self.source,
             "component": self.component,
             "group": self.group,
-            "class_type": self.class_type
+            "class_type": self.class_type,
         }
 
+
 class NotificationConfig(BaseModel):
     """
     Configuration for notifications when a rule is triggered.
-
+
     Example:
         {
             "enabled": true,
@@ -113,33 +112,37 @@ class NotificationConfig(BaseModel):
             },
             "send_at": 1632150000  # Unix timestamp (specific date/time)
         }
-
+
     Communication Methods:
     - "email": Send emails to specified email addresses
     - "broadcast_slack": Send broadcast notifications to all configured Slack channels
     - "broadcast_email": Send broadcast emails to all organization emails
     - "pagerduty": Send alerts to PagerDuty using the configured routing key
     """
+
     enabled: bool = True
     communication_methods: List[str] = []
     email_addresses: Optional[List[str]] = None
     pagerduty_config: Optional[PagerDutyConfig] = None
     send_at: Optional[int] = None  # Unix timestamp for scheduled notifications
-
+
     def model_dump(self, **kwargs):
         """Convert the NotificationConfig to a dictionary for JSON serialization."""
         return {
             "enabled": self.enabled,
             "communication_methods": self.communication_methods,
             "email_addresses": self.email_addresses,
-            "pagerduty_config": self.pagerduty_config.model_dump()
-
+            "pagerduty_config": self.pagerduty_config.model_dump()
+            if self.pagerduty_config
+            else None,
+            "send_at": self.send_at,
         }
 
+
 class Rule(BaseModel):
     """
     Configuration for a single rule.
-
+
     Example:
         {
             "rule_id": "123e4567-e89b-12d3-a456-426614174000",
@@ -157,34 +160,32 @@ class Rule(BaseModel):
         }
     }
     """
-
+
+    rule_id: str = Field(
+        default_factory=lambda: str(uuid.uuid4())
+    )  # Random UUID string as default value
     name: str
     description: Optional[str] = None
     conditions: List[Condition]
     combine_type: str = Field(..., pattern="^(all|any)$")  # all = AND, any = OR
     notification: Optional[NotificationConfig] = None  # Configuration for notifications
-
 
     def model_dump(self, **kwargs):
         """
         Custom serialization that properly handles condition serialization.
         """
         data = super().model_dump(**kwargs)
-
+
         # Special handling for conditions with complex metric objects
         if "conditions" in data:
             for i, condition in enumerate(data["conditions"]):
                 if "metric" in condition:
                     # Get the actual metric object
                     metric_obj = self.conditions[i].metric
-
+
                     # Create standardized metric representation needed by server API
-                    metric_data = {
-
-                        "threshold": 0.0,
-                        "name": ""
-                    }
-
+                    metric_data = {"score_type": "", "threshold": 0.0, "name": ""}
+
                     # First try to use object's own serialization methods
                     if hasattr(metric_obj, "to_dict"):
                         orig_data = metric_obj.to_dict()
@@ -196,61 +197,67 @@ class Rule(BaseModel):
                         # Copy any existing fields
                         for key, value in orig_data.items():
                             metric_data[key] = value
-
+
                     # If we already have data from original serialization methods but missing required fields
-                    if
-                        metric_data[
-
+                    if "name" in metric_data and "score_type" not in metric_data:
+                        metric_data["score_type"] = metric_data["name"]
+
                     # Ensure required fields have values by checking various sources
-                    if not metric_data[
+                    if not metric_data["score_type"]:
                         # Try to get score_type from different possible attributes
-                        if hasattr(metric_obj,
-                            metric_data[
-                        elif hasattr(metric_obj,
-                            metric_data[
+                        if hasattr(metric_obj, "score_type"):
+                            metric_data["score_type"] = metric_obj.score_type
+                        elif hasattr(metric_obj, "name"):
+                            metric_data["score_type"] = metric_obj.name
                         else:
                             # Last resort: use string representation
-                            metric_data[
-
+                            metric_data["score_type"] = str(metric_obj)
+
                     # Make sure threshold is set
-                    if
-
-
+                    if (
+                        not metric_data.get("threshold")
+                        and metric_data.get("threshold") != 0.0
+                    ):
+                        if hasattr(metric_obj, "threshold"):
+                            metric_data["threshold"] = metric_obj.threshold
                         else:
                             # Use condition threshold if metric doesn't have one
-                            metric_data[
-
+                            metric_data["threshold"] = self.conditions[i].threshold
+
                     # Make sure name is set
-                    if not metric_data.get(
-                        if hasattr(metric_obj,
-                            metric_data[
-                        elif hasattr(metric_obj,
-                            metric_data[
+                    if not metric_data.get("name"):
+                        if hasattr(metric_obj, "__name__"):
+                            metric_data["name"] = metric_obj.__name__
+                        elif hasattr(metric_obj, "name"):
+                            metric_data["name"] = metric_obj.name
                         else:
                             # Fallback to score_type if available
-                            metric_data[
-
+                            metric_data["name"] = metric_data.get(
+                                "score_type", str(metric_obj)
+                            )
+
                     # Update the condition with our properly serialized metric
                     condition["metric"] = metric_data
-
+
         return data
 
-    @field_validator(
+    @field_validator("conditions")
     def validate_conditions_not_empty(cls, v):
         if not v:
             raise ValueError("Conditions list cannot be empty")
         return v
 
-    @field_validator(
+    @field_validator("combine_type")
    def validate_combine_type(cls, v):
        if v not in ["all", "any"]:
            raise ValueError(f"combine_type must be 'all' or 'any', got: {v}")
        return v
 
+
 class RulesEngine:
     """
     Engine for creating and evaluating rules against metrics.
-
+
     Example:
     ```python
     # Define rules
@@ -265,10 +272,10 @@ class RulesEngine:
             combine_type="all"
         )
     }
-
+
     # Create rules engine
     engine = RulesEngine(rules)
-
+
     # Configure notifications
     engine.configure_notification(
         rule_id="1",
@@ -276,29 +283,33 @@ class RulesEngine:
         communication_methods=["slack", "email"],
         email_addresses=["user@example.com"]
     )
-
+
     # Evaluate rules
     scores = {"faithfulness": 0.65, "relevancy": 0.85}
     results = engine.evaluate_rules(scores, {"example_id": "example_123"})
     ```
     """
-
+
     def __init__(self, rules: Dict[str, Rule]):
         """
         Initialize the rules engine.
-
+
         Args:
             rules: Dictionary mapping rule IDs to Rule objects
         """
         self.rules = rules
 
-    def configure_notification(
-
-
-
+    def configure_notification(
+        self,
+        rule_id: str,
+        enabled: bool = True,
+        communication_methods: List[str] | None = None,
+        email_addresses: List[str] | None = None,
+        send_at: Optional[int] = None,
+    ) -> None:
         """
         Configure notification settings for a specific rule.
-
+
         Args:
             rule_id: ID of the rule to configure notifications for
             enabled: Whether notifications are enabled for this rule
@@ -308,32 +319,35 @@ class RulesEngine:
         """
         if rule_id not in self.rules:
             raise ValueError(f"Rule ID '{rule_id}' not found")
-
+
         rule = self.rules[rule_id]
-
+
         # Create notification configuration if it doesn't exist
         if rule.notification is None:
             rule.notification = NotificationConfig()
-
+
         # Set notification parameters
         rule.notification.enabled = enabled
-
+
         if communication_methods is not None:
             rule.notification.communication_methods = communication_methods
-
+
         if email_addresses is not None:
             rule.notification.email_addresses = email_addresses
-
+
         if send_at is not None:
             rule.notification.send_at = send_at
-
-    def configure_all_notifications(
-
-
-
+
+    def configure_all_notifications(
+        self,
+        enabled: bool = True,
+        communication_methods: List[str] | None = None,
+        email_addresses: List[str] | None = None,
+        send_at: Optional[int] = None,
+    ) -> None:
         """
         Configure notification settings for all rules.
-
+
         Args:
             enabled: Whether notifications are enabled
             communication_methods: List of notification methods (e.g., ["slack", "email"])
@@ -346,14 +360,18 @@ class RulesEngine:
                 enabled=enabled,
                 communication_methods=communication_methods,
                 email_addresses=email_addresses,
-                send_at=send_at
+                send_at=send_at,
             )
-
-    def evaluate_rules(
+
+    def evaluate_rules(
+        self,
+        scores: Dict[str, float],
+        example_metadata: Optional[Dict[str, Any]] = None,
+    ) -> Dict[str, AlertResult]:
         """
         Evaluate all rules against a set of scores.
         Returns mapping of rule IDs to their alert results.
-
+
         Args:
             scores: Dictionary of metric names to their score values
             example_metadata: Optional dictionary containing example metadata (example_id, timestamp)
@@ -364,33 +382,37 @@ class RulesEngine:
             # Evaluate each condition
             condition_results = []
             passed_conditions = []
-
+
             for condition in rule.conditions:
                 # Get the metric name for lookup
                 metric_name = condition.metric_name
                 value = scores.get(metric_name)
-
+
                 if value is None:
                     # Skip this condition instead of evaluating it as false
-                    condition_results.append(
-
-
-
-
-
-
+                    condition_results.append(
+                        {
+                            "metric": metric_name,
+                            "value": None,
+                            "threshold": condition.threshold,
+                            "passed": None,  # Using None to indicate the condition was skipped
+                            "skipped": True,  # Add a flag to indicate this condition was skipped
+                        }
+                    )
                     continue  # Skip adding to passed_conditions
                 else:
                     passed = condition.evaluate(value)
-                    condition_results.append(
-
-
-
-
-
-
+                    condition_results.append(
+                        {
+                            "metric": metric_name,
+                            "value": value,
+                            "threshold": condition.threshold,
+                            "passed": passed,
+                            "skipped": False,  # Indicate this condition was evaluated
+                        }
+                    )
                     passed_conditions.append(passed)
-
+
             # Determine if alert should trigger - only consider conditions that weren't skipped
             if not passed_conditions:
                 # If all conditions were skipped, the rule doesn't trigger
@@ -401,21 +423,23 @@ class RulesEngine:
                 # - All evaluated conditions must pass
                 # - All conditions must have been evaluated (none skipped)
                 all_conditions_passed = all(passed_conditions)
-                all_conditions_evaluated = len(passed_conditions) == len(
+                all_conditions_evaluated = len(passed_conditions) == len(
+                    rule.conditions
+                )
                 triggered = all_conditions_passed and all_conditions_evaluated
             else:
                 # For "any" combine_type, at least one condition must pass
                 triggered = any(passed_conditions)
-
+
             # Create alert result with example metadata
             notification_config = None
             if triggered and rule.notification:
                 # If rule has a notification config and the alert is triggered, include it in the result
                 notification_config = rule.notification
-
+
             # Set the alert status based on whether the rule was triggered using proper enum values
             status = AlertStatus.TRIGGERED if triggered else AlertStatus.NOT_TRIGGERED
-
+
             # Create the alert result
             alert_result = AlertResult(
                 status=status,
@@ -425,26 +449,32 @@ class RulesEngine:
                 notification=notification_config,
                 metadata=example_metadata or {},
                 combine_type=rule.combine_type,
-                project_id=example_metadata.get("project_id")
-
+                project_id=example_metadata.get("project_id")
+                if example_metadata
+                else None,
+                trace_span_id=example_metadata.get("trace_span_id")
+                if example_metadata
+                else None,
             )
-
+
             results[rule_id] = alert_result
-
+
         return results
-
-    async def evaluate_rules_parallel(
-
-
-
+
+    async def evaluate_rules_parallel(
+        self,
+        example_scores: Dict[str, Dict[str, float]],
+        example_metadata: Dict[str, Dict[str, Any]],
+        max_concurrent: int = 100,
+    ) -> Dict[str, Dict[str, AlertResult]]:
         """
         Evaluate all rules against multiple examples in parallel.
-
+
         Args:
             example_scores: Dictionary mapping example_ids to their score dictionaries
             example_metadata: Dictionary mapping example_ids to their metadata
             max_concurrent: Maximum number of concurrent evaluations
-
+
         Returns:
             Dictionary mapping example_ids to dictionaries of rule_ids and their alert results
         """
@@ -452,7 +482,7 @@ class RulesEngine:
         semaphore = asyncio.Semaphore(max_concurrent)
         results = {}
         tasks = []
-
+
         # Create a task for each example
         for example_id, scores in example_scores.items():
             metadata = example_metadata.get(example_id, {})
@@ -460,33 +490,35 @@ class RulesEngine:
                 semaphore=semaphore,
                 example_id=example_id,
                 scores=scores,
-                metadata=metadata
+                metadata=metadata,
             )
             tasks.append(task)
-
+
         # Run all tasks and collect results
         example_results = await asyncio.gather(*tasks)
-
+
         # Organize results by example_id
         for example_id, result in example_results:
             results[example_id] = result
-
+
         return results
-
-    async def _evaluate_with_semaphore(
-
-
-
-
+
+    async def _evaluate_with_semaphore(
+        self,
+        semaphore: asyncio.Semaphore,
+        example_id: str,
+        scores: Dict[str, float],
+        metadata: Dict[str, Any],
+    ) -> Tuple[str, Dict[str, AlertResult]]:
         """
         Helper method to evaluate rules for an example with semaphore control.
-
+
         Args:
             semaphore: Semaphore to control concurrency
             example_id: ID of the example being evaluated
             scores: Dictionary of scores for this example
             metadata: Metadata for this example
-
+
        Returns:
            Tuple of (example_id, rule_results)
        """
@@ -494,13 +526,8 @@ class RulesEngine:
         # Run the evaluation in a thread pool to avoid blocking the event loop
         # for CPU-bound operations
         with ThreadPoolExecutor() as executor:
-            start_time = time.perf_counter()
             rule_results = await asyncio.get_event_loop().run_in_executor(
-                executor,
-                self.evaluate_rules,
-                scores,
-                metadata
+                executor, self.evaluate_rules, scores, metadata
             )
-
-
-        return (example_id, rule_results)
+
+        return (example_id, rule_results)
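
Taken together, the rules.py changes are largely formatter normalization (double-quoted strings, wrapped signatures, trailing commas, blank-line spacing) plus a few substantive ones: `Tuple` is imported while several apparently unused imports (`Enum`, `time`, `os`, `re`, `json`, `datetime`) are dropped, the `RulesEngine` methods gain explicit annotated signatures, and the `AlertResult` construction now passes `trace_span_id`. The sketch below exercises the 0.0.46 surface as described by the `RulesEngine` docstring above; the `FaithfulnessScorer` import path is an assumption based on the package's file list, not something this diff shows.

```python
# Minimal sketch of the 0.0.46 rules API, assembled from the RulesEngine
# docstring in the diff above. Assumption: FaithfulnessScorer is importable
# from judgeval.scorers (per the file list; not confirmed by this diff).
from judgeval.rules import Condition, Rule, RulesEngine
from judgeval.scorers import FaithfulnessScorer

rules = {
    "1": Rule(
        name="Quality Check",
        conditions=[Condition(metric=FaithfulnessScorer(threshold=0.7))],
        combine_type="all",  # "all" = AND over conditions, "any" = OR
    )
}

engine = RulesEngine(rules)

# configure_notification(...) now has an explicit keyword signature -> None
engine.configure_notification(
    rule_id="1",
    enabled=True,
    communication_methods=["slack", "email"],
    email_addresses=["user@example.com"],
)

# evaluate_rules(...) records conditions whose metric is absent from `scores`
# as skipped (passed=None, skipped=True) instead of failing them outright.
results = engine.evaluate_rules(
    {"faithfulness": 0.65}, {"example_id": "example_123"}
)
for rule_id, alert in results.items():
    print(rule_id, alert.status)  # AlertStatus.TRIGGERED or NOT_TRIGGERED
```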
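
The two `model_dump` overrides are the other load-bearing pieces. A short sketch, under the same import assumption, of what the new code returns; the exact `score_type` string is an assumption about the scorer:

```python
# Sketch of the serialization paths shown in the diff. Assumption: the
# faithfulness scorer's score_type serializes as "faithfulness".
from judgeval.rules import Condition, NotificationConfig, Rule
from judgeval.scorers import FaithfulnessScorer

# NotificationConfig.model_dump() builds exactly the dict in the new code:
# pagerduty_config serializes to None when unset, and send_at is included.
cfg = NotificationConfig(communication_methods=["email"], send_at=1632150000)
print(cfg.model_dump())
# {'enabled': True, 'communication_methods': ['email'],
#  'email_addresses': None, 'pagerduty_config': None, 'send_at': 1632150000}

# Rule.model_dump() normalizes each condition's metric into the
# {"score_type", "threshold", "name"} shape the server API expects.
rule = Rule(
    name="Quality Check",
    conditions=[Condition(metric=FaithfulnessScorer(threshold=0.7))],
    combine_type="all",
)
print(rule.model_dump()["conditions"][0]["metric"])
# e.g. {'score_type': 'faithfulness', 'threshold': 0.7, 'name': 'faithfulness', ...}
```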
|