judgeval 0.0.44__py3-none-any.whl → 0.0.46__py3-none-any.whl

This diff compares publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in those registries.
Files changed (64)
  1. judgeval/__init__.py +5 -4
  2. judgeval/clients.py +6 -6
  3. judgeval/common/__init__.py +7 -2
  4. judgeval/common/exceptions.py +2 -3
  5. judgeval/common/logger.py +74 -49
  6. judgeval/common/s3_storage.py +30 -23
  7. judgeval/common/tracer.py +1273 -939
  8. judgeval/common/utils.py +416 -244
  9. judgeval/constants.py +73 -61
  10. judgeval/data/__init__.py +1 -1
  11. judgeval/data/custom_example.py +3 -2
  12. judgeval/data/datasets/dataset.py +80 -54
  13. judgeval/data/datasets/eval_dataset_client.py +131 -181
  14. judgeval/data/example.py +67 -43
  15. judgeval/data/result.py +11 -9
  16. judgeval/data/scorer_data.py +4 -2
  17. judgeval/data/tool.py +25 -16
  18. judgeval/data/trace.py +57 -29
  19. judgeval/data/trace_run.py +5 -11
  20. judgeval/evaluation_run.py +22 -82
  21. judgeval/integrations/langgraph.py +546 -184
  22. judgeval/judges/base_judge.py +1 -2
  23. judgeval/judges/litellm_judge.py +33 -11
  24. judgeval/judges/mixture_of_judges.py +128 -78
  25. judgeval/judges/together_judge.py +22 -9
  26. judgeval/judges/utils.py +14 -5
  27. judgeval/judgment_client.py +259 -271
  28. judgeval/rules.py +169 -142
  29. judgeval/run_evaluation.py +462 -305
  30. judgeval/scorers/api_scorer.py +20 -11
  31. judgeval/scorers/exceptions.py +1 -0
  32. judgeval/scorers/judgeval_scorer.py +77 -58
  33. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +46 -15
  34. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +3 -2
  35. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +3 -2
  36. judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +12 -11
  37. judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +7 -5
  38. judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +3 -2
  39. judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +3 -2
  40. judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +5 -2
  41. judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +2 -1
  42. judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +17 -8
  43. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +3 -2
  44. judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +3 -2
  45. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +3 -2
  46. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +3 -2
  47. judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +8 -9
  48. judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +4 -4
  49. judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +5 -5
  50. judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +5 -2
  51. judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +9 -10
  52. judgeval/scorers/prompt_scorer.py +48 -37
  53. judgeval/scorers/score.py +86 -53
  54. judgeval/scorers/utils.py +11 -7
  55. judgeval/tracer/__init__.py +1 -1
  56. judgeval/utils/alerts.py +23 -12
  57. judgeval/utils/{data_utils.py → file_utils.py} +5 -9
  58. judgeval/utils/requests.py +29 -0
  59. judgeval/version_check.py +5 -2
  60. {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/METADATA +79 -135
  61. judgeval-0.0.46.dist-info/RECORD +69 -0
  62. judgeval-0.0.44.dist-info/RECORD +0 -68
  63. {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/WHEEL +0 -0
  64. {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/licenses/LICENSE.md +0 -0
judgeval/rules.py CHANGED
@@ -2,43 +2,39 @@
 Rules system for Judgeval that enables alerts based on metric thresholds.
 """
 
-from typing import Dict, List, Optional, Union, Any, Set, Tuple
+from typing import Dict, List, Optional, Union, Any, Tuple
 from pydantic import BaseModel, Field, field_validator, ConfigDict
-from enum import Enum
 import asyncio
 from concurrent.futures import ThreadPoolExecutor
-import time
 import uuid
-import os
-import re
-import json
-from datetime import datetime
 
 from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
 from judgeval.utils.alerts import AlertStatus, AlertResult
 
+
 class Condition(BaseModel):
     """
     A single metric condition.
-
+
     Example:
         {
             "metric": FaithfulnessScorer(threshold=0.7)  # Must be a scorer object: APIJudgmentScorer, JudgevalScorer
         }
-
+
     The Condition class uses the scorer's threshold and success function internally.
     """
+
     model_config = ConfigDict(arbitrary_types_allowed=True)
-
+
     metric: Union[APIJudgmentScorer, JudgevalScorer]
 
     @property
     def metric_name(self) -> str:
         """Get the name of the metric for lookups in scores dictionary."""
-        if hasattr(self.metric, 'score_type'):
+        if hasattr(self.metric, "score_type"):
             # Handle APIJudgmentScorer and JudgevalScorer which have score_type
             return self.metric.score_type
-        elif hasattr(self.metric, '__name__'):
+        elif hasattr(self.metric, "__name__"):
             # Handle cases where metric has a __name__ attribute
             return self.metric.__name__
         # Fallback to string representation
@@ -47,7 +43,7 @@ class Condition(BaseModel):
     @property
     def threshold(self) -> float:
         """Get the threshold from the metric."""
-        return self.metric.threshold if hasattr(self.metric, 'threshold') else 0.5
+        return self.metric.threshold if hasattr(self.metric, "threshold") else 0.5
 
     def evaluate(self, value: float) -> bool:
         """
@@ -56,22 +52,23 @@ class Condition(BaseModel):
         Uses the scorer's success check function if available.
         """
         # Store the value in the scorer
-        if hasattr(self.metric, 'score'):
+        if hasattr(self.metric, "score"):
             self.metric.score = value
-
+
         # Use the scorer's success check function if available
-        if hasattr(self.metric, 'success_check'):
+        if hasattr(self.metric, "success_check"):
             return self.metric.success_check()
-        elif hasattr(self.metric, '_success_check'):
+        elif hasattr(self.metric, "_success_check"):
             return self.metric._success_check()
         else:
             # Fallback to default comparison (greater than or equal)
             return value >= self.threshold if self.threshold is not None else False
 
+
 class PagerDutyConfig(BaseModel):
     """
     Configuration for PagerDuty notifications.
-
+
     Attributes:
         routing_key: PagerDuty integration routing key
         severity: Severity level (critical, error, warning, info)
@@ -80,13 +77,14 @@ class PagerDutyConfig(BaseModel):
         group: Optional logical grouping for the alert
         class_type: Optional class/type of alert event
     """
+
     routing_key: str
     severity: str = "error"  # critical, error, warning, info
     source: str = "judgeval"
     component: Optional[str] = None
     group: Optional[str] = None
     class_type: Optional[str] = None
-
+
     def model_dump(self, **kwargs):
         """Convert the PagerDutyConfig to a dictionary for JSON serialization."""
         return {
@@ -95,13 +93,14 @@ class PagerDutyConfig(BaseModel):
             "source": self.source,
             "component": self.component,
             "group": self.group,
-            "class_type": self.class_type
+            "class_type": self.class_type,
         }
 
+
 class NotificationConfig(BaseModel):
     """
     Configuration for notifications when a rule is triggered.
-
+
     Example:
         {
             "enabled": true,
@@ -113,33 +112,37 @@ class NotificationConfig(BaseModel):
             },
             "send_at": 1632150000  # Unix timestamp (specific date/time)
         }
-
+
     Communication Methods:
     - "email": Send emails to specified email addresses
    - "broadcast_slack": Send broadcast notifications to all configured Slack channels
     - "broadcast_email": Send broadcast emails to all organization emails
     - "pagerduty": Send alerts to PagerDuty using the configured routing key
     """
+
     enabled: bool = True
     communication_methods: List[str] = []
     email_addresses: Optional[List[str]] = None
     pagerduty_config: Optional[PagerDutyConfig] = None
     send_at: Optional[int] = None  # Unix timestamp for scheduled notifications
-
+
     def model_dump(self, **kwargs):
         """Convert the NotificationConfig to a dictionary for JSON serialization."""
         return {
             "enabled": self.enabled,
             "communication_methods": self.communication_methods,
             "email_addresses": self.email_addresses,
-            "pagerduty_config": self.pagerduty_config.model_dump() if self.pagerduty_config else None,
-            "send_at": self.send_at
+            "pagerduty_config": self.pagerduty_config.model_dump()
+            if self.pagerduty_config
+            else None,
+            "send_at": self.send_at,
         }
 
+
 class Rule(BaseModel):
     """
     Configuration for a single rule.
-
+
     Example:
         {
             "rule_id": "123e4567-e89b-12d3-a456-426614174000",
@@ -157,34 +160,32 @@ class Rule(BaseModel):
             }
         }
     """
-    rule_id: str = Field(default_factory=lambda: str(uuid.uuid4()))  # Random UUID string as default value
+
+    rule_id: str = Field(
+        default_factory=lambda: str(uuid.uuid4())
+    )  # Random UUID string as default value
     name: str
     description: Optional[str] = None
     conditions: List[Condition]
     combine_type: str = Field(..., pattern="^(all|any)$")  # all = AND, any = OR
     notification: Optional[NotificationConfig] = None  # Configuration for notifications
-
 
     def model_dump(self, **kwargs):
         """
         Custom serialization that properly handles condition serialization.
         """
         data = super().model_dump(**kwargs)
-
+
         # Special handling for conditions with complex metric objects
         if "conditions" in data:
             for i, condition in enumerate(data["conditions"]):
                 if "metric" in condition:
                     # Get the actual metric object
                     metric_obj = self.conditions[i].metric
-
+
                     # Create standardized metric representation needed by server API
-                    metric_data = {
-                        "score_type": "",
-                        "threshold": 0.0,
-                        "name": ""
-                    }
-
+                    metric_data = {"score_type": "", "threshold": 0.0, "name": ""}
+
                     # First try to use object's own serialization methods
                     if hasattr(metric_obj, "to_dict"):
                         orig_data = metric_obj.to_dict()
@@ -196,61 +197,67 @@ class Rule(BaseModel):
                     # Copy any existing fields
                     for key, value in orig_data.items():
                         metric_data[key] = value
-
+
                     # If we already have data from original serialization methods but missing required fields
-                    if 'name' in metric_data and 'score_type' not in metric_data:
-                        metric_data['score_type'] = metric_data['name']
-
+                    if "name" in metric_data and "score_type" not in metric_data:
+                        metric_data["score_type"] = metric_data["name"]
+
                     # Ensure required fields have values by checking various sources
-                    if not metric_data['score_type']:
+                    if not metric_data["score_type"]:
                         # Try to get score_type from different possible attributes
-                        if hasattr(metric_obj, 'score_type'):
-                            metric_data['score_type'] = metric_obj.score_type
-                        elif hasattr(metric_obj, 'name'):
-                            metric_data['score_type'] = metric_obj.name
+                        if hasattr(metric_obj, "score_type"):
+                            metric_data["score_type"] = metric_obj.score_type
+                        elif hasattr(metric_obj, "name"):
+                            metric_data["score_type"] = metric_obj.name
                         else:
                             # Last resort: use string representation
-                            metric_data['score_type'] = str(metric_obj)
-
+                            metric_data["score_type"] = str(metric_obj)
+
                     # Make sure threshold is set
-                    if not metric_data.get('threshold') and metric_data.get('threshold') != 0.0:
-                        if hasattr(metric_obj, 'threshold'):
-                            metric_data['threshold'] = metric_obj.threshold
+                    if (
+                        not metric_data.get("threshold")
+                        and metric_data.get("threshold") != 0.0
+                    ):
+                        if hasattr(metric_obj, "threshold"):
+                            metric_data["threshold"] = metric_obj.threshold
                         else:
                             # Use condition threshold if metric doesn't have one
-                            metric_data['threshold'] = self.conditions[i].threshold
-
+                            metric_data["threshold"] = self.conditions[i].threshold
+
                     # Make sure name is set
-                    if not metric_data.get('name'):
-                        if hasattr(metric_obj, '__name__'):
-                            metric_data['name'] = metric_obj.__name__
-                        elif hasattr(metric_obj, 'name'):
-                            metric_data['name'] = metric_obj.name
+                    if not metric_data.get("name"):
+                        if hasattr(metric_obj, "__name__"):
+                            metric_data["name"] = metric_obj.__name__
+                        elif hasattr(metric_obj, "name"):
+                            metric_data["name"] = metric_obj.name
                         else:
                             # Fallback to score_type if available
-                            metric_data['name'] = metric_data.get('score_type', str(metric_obj))
-
+                            metric_data["name"] = metric_data.get(
+                                "score_type", str(metric_obj)
+                            )
+
                     # Update the condition with our properly serialized metric
                     condition["metric"] = metric_data
-
+
         return data
 
-    @field_validator('conditions')
+    @field_validator("conditions")
     def validate_conditions_not_empty(cls, v):
         if not v:
             raise ValueError("Conditions list cannot be empty")
         return v
 
-    @field_validator('combine_type')
+    @field_validator("combine_type")
     def validate_combine_type(cls, v):
         if v not in ["all", "any"]:
             raise ValueError(f"combine_type must be 'all' or 'any', got: {v}")
         return v
 
+
 class RulesEngine:
     """
     Engine for creating and evaluating rules against metrics.
-
+
     Example:
         ```python
         # Define rules
@@ -265,10 +272,10 @@ class RulesEngine:
                 combine_type="all"
             )
         }
-
+
         # Create rules engine
         engine = RulesEngine(rules)
-
+
         # Configure notifications
         engine.configure_notification(
             rule_id="1",
@@ -276,29 +283,33 @@ class RulesEngine:
            communication_methods=["slack", "email"],
            email_addresses=["user@example.com"]
         )
-
+
         # Evaluate rules
         scores = {"faithfulness": 0.65, "relevancy": 0.85}
         results = engine.evaluate_rules(scores, {"example_id": "example_123"})
         ```
     """
-
+
     def __init__(self, rules: Dict[str, Rule]):
         """
        Initialize the rules engine.
-
+
         Args:
             rules: Dictionary mapping rule IDs to Rule objects
         """
         self.rules = rules
 
-    def configure_notification(self, rule_id: str, enabled: bool = True,
-                               communication_methods: List[str] = None,
-                               email_addresses: List[str] = None,
-                               send_at: Optional[int] = None) -> None:
+    def configure_notification(
+        self,
+        rule_id: str,
+        enabled: bool = True,
+        communication_methods: List[str] | None = None,
+        email_addresses: List[str] | None = None,
+        send_at: Optional[int] = None,
+    ) -> None:
         """
         Configure notification settings for a specific rule.
-
+
         Args:
             rule_id: ID of the rule to configure notifications for
             enabled: Whether notifications are enabled for this rule
@@ -308,32 +319,35 @@ class RulesEngine:
         """
         if rule_id not in self.rules:
             raise ValueError(f"Rule ID '{rule_id}' not found")
-
+
         rule = self.rules[rule_id]
-
+
         # Create notification configuration if it doesn't exist
         if rule.notification is None:
             rule.notification = NotificationConfig()
-
+
         # Set notification parameters
         rule.notification.enabled = enabled
-
+
         if communication_methods is not None:
             rule.notification.communication_methods = communication_methods
-
+
         if email_addresses is not None:
             rule.notification.email_addresses = email_addresses
-
+
         if send_at is not None:
             rule.notification.send_at = send_at
-
-    def configure_all_notifications(self, enabled: bool = True,
-                                    communication_methods: List[str] = None,
-                                    email_addresses: List[str] = None,
-                                    send_at: Optional[int] = None) -> None:
+
+    def configure_all_notifications(
+        self,
+        enabled: bool = True,
+        communication_methods: List[str] | None = None,
+        email_addresses: List[str] | None = None,
+        send_at: Optional[int] = None,
+    ) -> None:
         """
         Configure notification settings for all rules.
-
+
         Args:
             enabled: Whether notifications are enabled
             communication_methods: List of notification methods (e.g., ["slack", "email"])
@@ -346,14 +360,18 @@ class RulesEngine:
                 enabled=enabled,
                 communication_methods=communication_methods,
                 email_addresses=email_addresses,
-                send_at=send_at
+                send_at=send_at,
             )
-
-    def evaluate_rules(self, scores: Dict[str, float], example_metadata: Optional[Dict[str, Any]] = None) -> Dict[str, AlertResult]:
+
+    def evaluate_rules(
+        self,
+        scores: Dict[str, float],
+        example_metadata: Optional[Dict[str, Any]] = None,
+    ) -> Dict[str, AlertResult]:
         """
         Evaluate all rules against a set of scores.
         Returns mapping of rule IDs to their alert results.
-
+
         Args:
             scores: Dictionary of metric names to their score values
             example_metadata: Optional dictionary containing example metadata (example_id, timestamp)
@@ -364,33 +382,37 @@ class RulesEngine:
             # Evaluate each condition
            condition_results = []
             passed_conditions = []
-
+
             for condition in rule.conditions:
                 # Get the metric name for lookup
                 metric_name = condition.metric_name
                 value = scores.get(metric_name)
-
+
                 if value is None:
                     # Skip this condition instead of evaluating it as false
-                    condition_results.append({
-                        "metric": metric_name,
-                        "value": None,
-                        "threshold": condition.threshold,
-                        "passed": None,  # Using None to indicate the condition was skipped
-                        "skipped": True  # Add a flag to indicate this condition was skipped
-                    })
+                    condition_results.append(
+                        {
+                            "metric": metric_name,
+                            "value": None,
+                            "threshold": condition.threshold,
+                            "passed": None,  # Using None to indicate the condition was skipped
+                            "skipped": True,  # Add a flag to indicate this condition was skipped
+                        }
+                    )
                     continue  # Skip adding to passed_conditions
                 else:
                     passed = condition.evaluate(value)
-                    condition_results.append({
-                        "metric": metric_name,
-                        "value": value,
-                        "threshold": condition.threshold,
-                        "passed": passed,
-                        "skipped": False  # Indicate this condition was evaluated
-                    })
+                    condition_results.append(
+                        {
+                            "metric": metric_name,
+                            "value": value,
+                            "threshold": condition.threshold,
+                            "passed": passed,
+                            "skipped": False,  # Indicate this condition was evaluated
+                        }
+                    )
                     passed_conditions.append(passed)
-
+
             # Determine if alert should trigger - only consider conditions that weren't skipped
             if not passed_conditions:
                 # If all conditions were skipped, the rule doesn't trigger
@@ -401,21 +423,23 @@ class RulesEngine:
                 # - All evaluated conditions must pass
                 # - All conditions must have been evaluated (none skipped)
                 all_conditions_passed = all(passed_conditions)
-                all_conditions_evaluated = len(passed_conditions) == len(rule.conditions)
+                all_conditions_evaluated = len(passed_conditions) == len(
+                    rule.conditions
+                )
                 triggered = all_conditions_passed and all_conditions_evaluated
             else:
                 # For "any" combine_type, at least one condition must pass
                 triggered = any(passed_conditions)
-
+
             # Create alert result with example metadata
             notification_config = None
             if triggered and rule.notification:
                 # If rule has a notification config and the alert is triggered, include it in the result
                 notification_config = rule.notification
-
+
             # Set the alert status based on whether the rule was triggered using proper enum values
             status = AlertStatus.TRIGGERED if triggered else AlertStatus.NOT_TRIGGERED
-
+
             # Create the alert result
             alert_result = AlertResult(
                 status=status,
@@ -425,26 +449,32 @@ class RulesEngine:
                 notification=notification_config,
                 metadata=example_metadata or {},
                 combine_type=rule.combine_type,
-                project_id=example_metadata.get("project_id") if example_metadata else None,
-                trace_span_id=example_metadata.get("trace_span_id") if example_metadata else None
+                project_id=example_metadata.get("project_id")
+                if example_metadata
+                else None,
+                trace_span_id=example_metadata.get("trace_span_id")
+                if example_metadata
+                else None,
             )
-
+
             results[rule_id] = alert_result
-
+
         return results
-
-    async def evaluate_rules_parallel(self,
-                                      example_scores: Dict[str, Dict[str, float]],
-                                      example_metadata: Dict[str, Dict[str, Any]],
-                                      max_concurrent: int = 100) -> Dict[str, Dict[str, AlertResult]]:
+
+    async def evaluate_rules_parallel(
+        self,
+        example_scores: Dict[str, Dict[str, float]],
+        example_metadata: Dict[str, Dict[str, Any]],
+        max_concurrent: int = 100,
+    ) -> Dict[str, Dict[str, AlertResult]]:
         """
         Evaluate all rules against multiple examples in parallel.
-
+
         Args:
             example_scores: Dictionary mapping example_ids to their score dictionaries
             example_metadata: Dictionary mapping example_ids to their metadata
             max_concurrent: Maximum number of concurrent evaluations
-
+
         Returns:
             Dictionary mapping example_ids to dictionaries of rule_ids and their alert results
         """
@@ -452,7 +482,7 @@ class RulesEngine:
         semaphore = asyncio.Semaphore(max_concurrent)
         results = {}
         tasks = []
-
+
         # Create a task for each example
         for example_id, scores in example_scores.items():
             metadata = example_metadata.get(example_id, {})
@@ -460,33 +490,35 @@ class RulesEngine:
                 semaphore=semaphore,
                 example_id=example_id,
                 scores=scores,
-                metadata=metadata
+                metadata=metadata,
             )
             tasks.append(task)
-
+
         # Run all tasks and collect results
         example_results = await asyncio.gather(*tasks)
-
+
         # Organize results by example_id
         for example_id, result in example_results:
             results[example_id] = result
-
+
         return results
-
-    async def _evaluate_with_semaphore(self,
-                                       semaphore: asyncio.Semaphore,
-                                       example_id: str,
-                                       scores: Dict[str, float],
-                                       metadata: Dict[str, Any]) -> Tuple[str, Dict[str, AlertResult]]:
+
+    async def _evaluate_with_semaphore(
+        self,
+        semaphore: asyncio.Semaphore,
+        example_id: str,
+        scores: Dict[str, float],
+        metadata: Dict[str, Any],
+    ) -> Tuple[str, Dict[str, AlertResult]]:
         """
         Helper method to evaluate rules for an example with semaphore control.
-
+
         Args:
             semaphore: Semaphore to control concurrency
             example_id: ID of the example being evaluated
             scores: Dictionary of scores for this example
            metadata: Metadata for this example
-
+
         Returns:
             Tuple of (example_id, rule_results)
         """
@@ -494,13 +526,8 @@ class RulesEngine:
             # Run the evaluation in a thread pool to avoid blocking the event loop
             # for CPU-bound operations
             with ThreadPoolExecutor() as executor:
-                start_time = time.perf_counter()
                 rule_results = await asyncio.get_event_loop().run_in_executor(
-                    executor,
-                    self.evaluate_rules,
-                    scores,
-                    metadata
+                    executor, self.evaluate_rules, scores, metadata
                 )
-                end_time = time.perf_counter()
-
-                return (example_id, rule_results)
+
+            return (example_id, rule_results)
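
For orientation, here is a minimal usage sketch of the rules API as it stands after this reformat, assembled from the `RulesEngine` docstring example and the signatures above. The `FaithfulnessScorer` import path and the score values are illustrative assumptions, not taken from the wheel itself:

```python
from judgeval.rules import Condition, Rule, RulesEngine
from judgeval.scorers import FaithfulnessScorer  # assumed export path

# One rule with a single condition; combine_type "all" means every
# condition must pass (AND), "any" means at least one must pass (OR).
rules = {
    "1": Rule(
        name="Quality Check",
        conditions=[Condition(metric=FaithfulnessScorer(threshold=0.7))],
        combine_type="all",
    )
}

engine = RulesEngine(rules)

# The reformatted signature: the list parameters are now annotated
# List[str] | None and default to None instead of an untyped None default.
engine.configure_notification(
    rule_id="1",
    enabled=True,
    communication_methods=["email"],
    email_addresses=["user@example.com"],
)

# Metrics missing from `scores` are skipped rather than counted as failures,
# and a skipped condition prevents an "all" rule from triggering.
results = engine.evaluate_rules({"faithfulness": 0.65}, {"example_id": "example_123"})
print(results["1"].status)  # AlertStatus.TRIGGERED or AlertStatus.NOT_TRIGGERED
```

Beyond formatting, the `| None` annotations make the optional parameters honest to type checkers; the runtime behavior (treating `None` as "leave unchanged") is the same as before.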
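The batch path is worth a sketch as well: `evaluate_rules_parallel` bounds concurrency with an `asyncio.Semaphore`, and `_evaluate_with_semaphore` runs the synchronous `evaluate_rules` in a `ThreadPoolExecutor` so it does not block the event loop. This continues the `engine` from the sketch above; the example IDs and scores are made up:

```python
import asyncio

example_scores = {
    "ex_1": {"faithfulness": 0.91},
    "ex_2": {"faithfulness": 0.42},
}
example_metadata = {
    "ex_1": {"example_id": "ex_1", "project_id": "proj_demo"},
    "ex_2": {"example_id": "ex_2", "project_id": "proj_demo"},
}

# max_concurrent caps how many examples are evaluated at once (default 100).
batch_results = asyncio.run(
    engine.evaluate_rules_parallel(
        example_scores=example_scores,
        example_metadata=example_metadata,
        max_concurrent=10,
    )
)

# Results come back keyed by example_id, then by rule_id.
for example_id, rule_results in batch_results.items():
    for rule_id, alert in rule_results.items():
        print(example_id, rule_id, alert.status)
```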