judgeval 0.0.13__py3-none-any.whl → 0.0.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
judgeval/rules.py ADDED
@@ -0,0 +1,384 @@
+"""
+Rules system for Judgeval that enables alerts based on metric thresholds.
+"""
+
+from typing import Dict, List, Optional, Union, Any, Set, Tuple
+from pydantic import BaseModel, Field, field_validator, ConfigDict
+from enum import Enum
+from datetime import datetime
+import asyncio
+from concurrent.futures import ThreadPoolExecutor
+import time
+import uuid # Add import for uuid module
+
+from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
+from judgeval.scorers.judgeval_scorers import ScorerWrapper # Import from the correct module
+
+class AlertStatus(str, Enum):
+    """Status of an alert evaluation."""
+    TRIGGERED = "triggered"
+    NOT_TRIGGERED = "not_triggered"
+
+class Operator(str, Enum):
+    """Comparison operators for conditions."""
+    GT = ">"
+    GTE = ">="
+    LT = "<"
+    LTE = "<="
+    EQ = "=="
+    NEQ = "!="
+
+class Condition(BaseModel):
+    """
+    A single metric condition.
+
+    Example:
+        {
+            "metric": FaithfulnessScorer(threshold=0.7) # Must be a scorer object: APIJudgmentScorer, JudgevalScorer, or ScorerWrapper
+            "operator": ">=",
+            "threshold": 0.7
+        }
+    """
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    metric: Union[APIJudgmentScorer, JudgevalScorer, ScorerWrapper]
+    operator: Operator
+    threshold: float
+
+    @property
+    def metric_name(self) -> str:
+        """Get the name of the metric for lookups in scores dictionary."""
+        if isinstance(self.metric, ScorerWrapper):
+            # Handle ScorerWrapper case specifically
+            return self.metric.scorer.score_type if hasattr(self.metric.scorer, 'score_type') else str(self.metric.scorer)
+        elif hasattr(self.metric, 'score_type'):
+            # Handle APIJudgmentScorer and JudgevalScorer which have score_type
+            return self.metric.score_type
+        elif hasattr(self.metric, '__name__'):
+            # Handle cases where metric has a __name__ attribute
+            return self.metric.__name__
+        # Fallback to string representation
+        return str(self.metric)
+
+    def evaluate(self, value: float) -> bool:
+        """Evaluate this condition against a value."""
+        if self.operator == Operator.GT:
+            return value > self.threshold
+        elif self.operator == Operator.GTE:
+            return value >= self.threshold
+        elif self.operator == Operator.LT:
+            return value < self.threshold
+        elif self.operator == Operator.LTE:
+            return value <= self.threshold
+        elif self.operator == Operator.EQ:
+            return value == self.threshold
+        elif self.operator == Operator.NEQ:
+            return value != self.threshold
+        else:
+            raise ValueError(f"Unknown operator: {self.operator}")
+
+class Rule(BaseModel):
+    """
+    Configuration for a single rule.
+
+    Example:
+        {
+            "rule_id": "123e4567-e89b-12d3-a456-426614174000",
+            "name": "Quality Check",
+            "description": "Check if quality metrics meet thresholds",
+            "conditions": [
+                {"metric": FaithfulnessScorer(threshold=0.7), "operator": ">=", "threshold": 0.7},
+                {"metric": AnswerRelevancyScorer(threshold=0.8), "operator": ">=", "threshold": 0.8}
+            ],
+            "combine_type": "all" # "all" or "any"
+        }
+    """
+    rule_id: str = Field(default_factory=lambda: str(uuid.uuid4())) # Random UUID string as default value
+    name: str
+    description: Optional[str] = None
+    conditions: List[Condition]
+    combine_type: str = Field(..., pattern="^(all|any)$") # all = AND, any = OR
+
+    def model_dump(self, **kwargs):
+        """
+        Custom serialization that properly handles condition serialization.
+        """
+        data = super().model_dump(**kwargs)
+
+        # Special handling for conditions with complex metric objects
+        if "conditions" in data:
+            for i, condition in enumerate(data["conditions"]):
+                if "metric" in condition:
+                    # Get the actual metric object
+                    metric_obj = self.conditions[i].metric
+
+                    # Create standardized metric representation needed by server API
+                    metric_data = {
+                        "score_type": "",
+                        "threshold": 0.0
+                    }
+
+                    # First try to use object's own serialization methods
+                    if hasattr(metric_obj, "to_dict"):
+                        orig_data = metric_obj.to_dict()
+                        # Copy any existing fields
+                        for key, value in orig_data.items():
+                            metric_data[key] = value
+                    elif hasattr(metric_obj, "model_dump"):
+                        orig_data = metric_obj.model_dump()
+                        # Copy any existing fields
+                        for key, value in orig_data.items():
+                            metric_data[key] = value
+
+                    # If we already have data from original serialization methods but missing required fields
+                    if 'name' in metric_data and 'score_type' not in metric_data:
+                        metric_data['score_type'] = metric_data['name']
+
+                    # Ensure required fields have values by checking various sources
+                    if not metric_data['score_type']:
+                        # Try to get score_type from different possible attributes
+                        if hasattr(metric_obj, 'score_type'):
+                            metric_data['score_type'] = metric_obj.score_type
+                        elif hasattr(metric_obj, 'name'):
+                            metric_data['score_type'] = metric_obj.name
+                        else:
+                            # Last resort: use string representation
+                            metric_data['score_type'] = str(metric_obj)
+
+                    # Make sure threshold is set
+                    if not metric_data.get('threshold') and metric_data.get('threshold') != 0.0:
+                        if hasattr(metric_obj, 'threshold'):
+                            metric_data['threshold'] = metric_obj.threshold
+                        else:
+                            # Use condition threshold if metric doesn't have one
+                            metric_data['threshold'] = self.conditions[i].threshold
+
+                    # Update the condition with our properly serialized metric
+                    condition["metric"] = metric_data
+
+        return data
+
+    @field_validator('conditions')
+    def validate_conditions_not_empty(cls, v):
+        if not v:
+            raise ValueError("Conditions list cannot be empty")
+        return v
+
+    @field_validator('combine_type')
+    def validate_combine_type(cls, v):
+        if v not in ["all", "any"]:
+            raise ValueError(f"combine_type must be 'all' or 'any', got: {v}")
+        return v
+
+
+class AlertResult(BaseModel):
+    """
+    Result of evaluating a rule.
+
+    Example:
+        {
+            "status": "triggered",
+            "rule_name": "Quality Check",
+            "conditions_result": [
+                {"metric": "faithfulness", "value": 0.6, "threshold": 0.7, "passed": False},
+                {"metric": "relevancy", "value": 0.9, "threshold": 0.8, "passed": True}
+            ],
+            "rule_id": "123e4567-e89b-12d3-a456-426614174000",
+            "metadata": {
+                "example_id": "example_123",
+                "timestamp": "20240321_123456"
+            }
+        }
+    """
+    status: AlertStatus
+    rule_id: Optional[str] = None # The unique identifier of the rule
+    rule_name: str
+    conditions_result: List[Dict[str, Any]]
+    metadata: Dict[str, Any] = {}
+
+    @property
+    def example_id(self) -> Optional[str]:
+        """Get example_id from metadata for backward compatibility"""
+        return self.metadata.get("example_id")
+
+    @property
+    def timestamp(self) -> Optional[str]:
+        """Get timestamp from metadata for backward compatibility"""
+        return self.metadata.get("timestamp")
+
+class RulesEngine:
+    """
+    Engine for evaluating rules and managing alerts.
+
+    Example usage:
+        rules = {
+            "quality_check": Rule(
+                name="Quality Check",
+                conditions=[
+                    Condition(metric=FaithfulnessScorer(threshold=0.7), operator=">=", threshold=0.7),
+                    Condition(metric=AnswerRelevancyScorer(threshold=0.8), operator=">=", threshold=0.8)
+                ],
+                combine_type="all"
+            )
+        }
+
+        engine = RulesEngine(rules)
+        scores = {"faithfulness": 0.8, "relevancy": 0.9}
+        alerts = engine.evaluate_rules(scores, example_metadata={
+            "example_id": "example_123",
+            "timestamp": "20240321_123456"
+        })
+    """
+
+    def __init__(self, rules: Dict[str, Rule]):
+        """
+        Initialize the RulesEngine with rules.
+
+        Args:
+            rules: Dictionary mapping rule IDs to rule configurations
+        """
+        self.rules = rules
+
+    def evaluate_rules(self, scores: Dict[str, float], example_metadata: Optional[Dict[str, Any]] = None) -> Dict[str, AlertResult]:
+        """
+        Evaluate all rules against a set of scores.
+        Returns mapping of rule IDs to their alert results.
+
+        Args:
+            scores: Dictionary of metric names to their score values
+            example_metadata: Optional dictionary containing example metadata (example_id, timestamp)
+        """
+        results = {}
+
+        for rule_id, rule in self.rules.items():
+            # Evaluate each condition
+            condition_results = []
+            passed_conditions = []
+
+            for condition in rule.conditions:
+                # Get the metric name for lookup
+                metric_name = condition.metric_name
+                value = scores.get(metric_name)
+                if value is None:
+                    # Skip this condition instead of evaluating it as false
+                    condition_results.append({
+                        "metric": metric_name,
+                        "value": None,
+                        "threshold": condition.threshold,
+                        "operator": condition.operator,
+                        "passed": None, # Using None to indicate the condition was skipped
+                        "skipped": True # Add a flag to indicate this condition was skipped
+                    })
+                    continue # Skip adding to passed_conditions
+                else:
+                    passed = condition.evaluate(value)
+                    condition_results.append({
+                        "metric": metric_name,
+                        "value": value,
+                        "threshold": condition.threshold,
+                        "operator": condition.operator,
+                        "passed": passed,
+                        "skipped": False # Indicate this condition was evaluated
+                    })
+                    passed_conditions.append(passed)
+
+            # Determine if alert should trigger - only consider conditions that weren't skipped
+            if not passed_conditions:
+                # If all conditions were skipped, the rule doesn't trigger
+                triggered = False
+            else:
+                triggered = all(passed_conditions) if rule.combine_type == "all" else any(passed_conditions)
+
+            # Create alert result with example metadata
+            alert_result = AlertResult(
+                status=AlertStatus.TRIGGERED if triggered else AlertStatus.NOT_TRIGGERED,
+                rule_id=rule.rule_id, # Include the rule's unique identifier
+                rule_name=rule.name,
+                conditions_result=condition_results
+            )
+
+            # Add example metadata if provided
+            if example_metadata:
+                if "example_id" in example_metadata:
+                    alert_result.metadata["example_id"] = example_metadata["example_id"]
+                if "timestamp" in example_metadata:
+                    alert_result.metadata["timestamp"] = example_metadata["timestamp"]
+
+            results[rule_id] = alert_result
+
+        return results
+
+    async def evaluate_rules_parallel(self,
+                                      example_scores: Dict[str, Dict[str, float]],
+                                      example_metadata: Dict[str, Dict[str, Any]],
+                                      max_concurrent: int = 100) -> Dict[str, Dict[str, AlertResult]]:
+        """
+        Evaluate all rules against multiple examples in parallel.
+
+        Args:
+            example_scores: Dictionary mapping example_ids to their score dictionaries
+            example_metadata: Dictionary mapping example_ids to their metadata
+            max_concurrent: Maximum number of concurrent evaluations
+
+        Returns:
+            Dictionary mapping example_ids to dictionaries of rule_ids and their alert results
+        """
+        # Create semaphore to limit concurrent executions
+        semaphore = asyncio.Semaphore(max_concurrent)
+        results = {}
+        tasks = []
+
+        # Create a task for each example
+        for example_id, scores in example_scores.items():
+            metadata = example_metadata.get(example_id, {})
+            task = self._evaluate_with_semaphore(
+                semaphore=semaphore,
+                example_id=example_id,
+                scores=scores,
+                metadata=metadata
+            )
+            tasks.append(task)
+
+        # Run all tasks and collect results
+        example_results = await asyncio.gather(*tasks)
+
+        # Organize results by example_id
+        for example_id, result in example_results:
+            results[example_id] = result
+
+        return results
+
+    async def _evaluate_with_semaphore(self,
+                                       semaphore: asyncio.Semaphore,
+                                       example_id: str,
+                                       scores: Dict[str, float],
+                                       metadata: Dict[str, Any]) -> Tuple[str, Dict[str, AlertResult]]:
+        """
+        Helper method to evaluate rules for an example with semaphore control.
+
+        Args:
+            semaphore: Semaphore to control concurrency
+            example_id: ID of the example being evaluated
+            scores: Dictionary of scores for this example
+            metadata: Metadata for this example
+
+        Returns:
+            Tuple of (example_id, rule_results)
+        """
+        async with semaphore:
+            # Run the evaluation in a thread pool to avoid blocking the event loop
+            # for CPU-bound operations
+            with ThreadPoolExecutor() as executor:
+                start_time = time.perf_counter()
+                rule_results = await asyncio.get_event_loop().run_in_executor(
+                    executor,
+                    self.evaluate_rules,
+                    scores,
+                    metadata
+                )
+                end_time = time.perf_counter()
+
+                # Could log performance metrics here if needed
+                # debug(f"Rule evaluation for example {example_id} took {end_time - start_time:.4f} seconds")
+
+                return (example_id, rule_results)
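For orientation, here is a minimal usage sketch of the rules API added above, adapted from the RulesEngine docstring. The scorer import path and the score keys ("faithfulness", "relevancy") are assumptions carried over from the docstring examples, not verified against this release.

# Minimal sketch of the new rules API (assumed scorer imports and metric keys).
from judgeval.rules import Rule, Condition, RulesEngine
from judgeval.scorers import FaithfulnessScorer, AnswerRelevancyScorer  # assumed import path

# One rule with two conditions; combine_type="all" means every condition must pass.
rules = {
    "quality_check": Rule(
        name="Quality Check",
        conditions=[
            Condition(metric=FaithfulnessScorer(threshold=0.7), operator=">=", threshold=0.7),
            Condition(metric=AnswerRelevancyScorer(threshold=0.8), operator=">=", threshold=0.8),
        ],
        combine_type="all",
    )
}

engine = RulesEngine(rules)
alerts = engine.evaluate_rules(
    {"faithfulness": 0.8, "relevancy": 0.9},  # keys must match Condition.metric_name
    example_metadata={"example_id": "example_123", "timestamp": "20240321_123456"},
)
print(alerts["quality_check"].status)  # AlertStatus.TRIGGERED when both conditions pass

Note that conditions whose metric is missing from the scores dict are skipped rather than treated as failures, so a rule whose conditions are all skipped never triggers.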
@@ -20,6 +20,7 @@ from judgeval.constants import (
     ROOT_API,
     JUDGMENT_EVAL_API_URL,
     JUDGMENT_EVAL_LOG_API_URL,
+    MAX_CONCURRENT_EVALUATIONS
 )
 from judgeval.common.exceptions import JudgmentAPIError
 from judgeval.evaluation_run import EvaluationRun
@@ -30,6 +31,7 @@ from judgeval.common.logger import (
     error,
     example_logging_context
 )
+from judgeval.rules import RulesEngine, Rule, AlertResult, AlertStatus
 
 
 def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
@@ -50,9 +52,11 @@ def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
         response = requests.post(
             JUDGMENT_EVAL_API_URL, headers={
                 "Content-Type": "application/json",
-                "Authorization": f"Bearer {evaluation_run.judgment_api_key}"
+                "Authorization": f"Bearer {evaluation_run.judgment_api_key}",
+                "X-Organization-Id": evaluation_run.organization_id
             },
-            json=payload)
+            json=payload,
+            verify=False)
         response_data = response.json()
     except Exception as e:
         error(f"Error: {e}")
@@ -140,7 +144,7 @@ def check_missing_scorer_data(results: List[ScoringResult]) -> List[ScoringResul
     return results
 
 
-def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_key: str) -> None:
+def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_key: str, organization_id: str) -> None:
     """
     Checks if an evaluation run name already exists for a given project.
 
@@ -158,13 +162,15 @@ def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_k
         f"{ROOT_API}/eval-run-name-exists/",
         headers={
             "Content-Type": "application/json",
-            "Authorization": f"Bearer {judgment_api_key}"
+            "Authorization": f"Bearer {judgment_api_key}",
+            "X-Organization-Id": organization_id
         },
         json={
             "eval_name": eval_name,
             "project_name": project_name,
             "judgment_api_key": judgment_api_key,
-        }
+        },
+        verify=False
     )
 
     if response.status_code == 409:
@@ -199,14 +205,15 @@ def log_evaluation_results(merged_results: List[ScoringResult], evaluation_run:
         JUDGMENT_EVAL_LOG_API_URL,
         headers={
             "Content-Type": "application/json",
-            "Authorization": f"Bearer {evaluation_run.judgment_api_key}"
+            "Authorization": f"Bearer {evaluation_run.judgment_api_key}",
+            "X-Organization-Id": evaluation_run.organization_id
         },
         json={
             "results": [result.to_dict() for result in merged_results],
-            "judgment_api_key": evaluation_run.judgment_api_key,
             "project_name": evaluation_run.project_name,
             "eval_name": evaluation_run.eval_name,
-        }
+        },
+        verify=False
     )
 
     if not res.ok:
@@ -226,6 +233,7 @@ def log_evaluation_results(merged_results: List[ScoringResult], evaluation_run:
         raise ValueError(f"Failed to save evaluation results to DB: {str(e)}")
 
 
+
 def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[ScoringResult]:
     """
     Executes an evaluation of `Example`s using one or more `Scorer`s
@@ -243,7 +251,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
         metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
         judgment_api_key (Optional[str]): The API key for running evaluations on the Judgment API
         log_results (bool): Whether to log the results to the Judgment API
-
+        rules (Optional[List[Rule]]): Rules to evaluate against scoring results
 
     Returns:
         List[ScoringResult]: The results of the evaluation. Each result is a dictionary containing the fields of a `ScoringResult` object.
@@ -254,7 +262,8 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
         check_eval_run_name_exists(
             evaluation_run.eval_name,
             evaluation_run.project_name,
-            evaluation_run.judgment_api_key
+            evaluation_run.judgment_api_key,
+            evaluation_run.organization_id
         )
 
     # Set example IDs if not already set
@@ -312,7 +321,9 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
             aggregator=evaluation_run.aggregator,
             metadata=evaluation_run.metadata,
             judgment_api_key=evaluation_run.judgment_api_key,
-            log_results=evaluation_run.log_results
+            organization_id=evaluation_run.organization_id,
+            log_results=evaluation_run.log_results,
+            rules=evaluation_run.rules
         )
         debug("Sending request to Judgment API")
         response_data: List[Dict] = execute_api_eval(api_evaluation_run) # Dicts are `ScoringResult` objs
@@ -342,7 +353,6 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
             ]
 
             api_results.append(ScoringResult(**filtered_result))
-
     # Run local evals
     if local_scorers: # List[JudgevalScorer]
         info("Starting local evaluation")
@@ -360,12 +370,11 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
                 show_indicator=True,
                 _use_bar_indicator=True,
                 throttle_value=0,
-                max_concurrent=100,
+                max_concurrent=MAX_CONCURRENT_EVALUATIONS,
             )
         )
         local_results = results
         info(f"Local evaluation complete with {len(local_results)} results")
-
     # Aggregate the ScorerData from the API and local evaluations
     debug("Merging API and local results")
     merged_results: List[ScoringResult] = merge_results(api_results, local_results)
@@ -373,6 +382,15 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
 
     info(f"Successfully merged {len(merged_results)} results")
 
+    # Evaluate rules against local scoring results if rules exist (this cant be done just yet)
+    # if evaluation_run.rules and merged_results:
+    #     run_rules(
+    #         local_results=merged_results,
+    #         rules=evaluation_run.rules,
+    #         judgment_api_key=evaluation_run.judgment_api_key,
+    #         organization_id=evaluation_run.organization_id
+    #     )
+
     if evaluation_run.log_results:
         log_evaluation_results(merged_results, evaluation_run)
 
@@ -34,22 +34,22 @@ class APIJudgmentScorer(BaseModel):
     @field_validator('score_type')
     def convert_to_enum_value(cls, v):
         """
-        Validates that the `score_type` is a valid `JudgmentMetric` enum value.
-        Converts string values to `JudgmentMetric` enum values.
+        Validates that the `score_type` is a valid `APIScorer` enum value.
+        Converts string values to `APIScorer` enum values.
         """
         debug(f"Attempting to convert score_type value: {v}")
         if isinstance(v, APIScorer):
-            info(f"Using existing JudgmentMetric: {v.value}")
-            return v.value
+            info(f"Using existing APIScorer: {v}")
+            return v
         elif isinstance(v, str):
-            debug(f"Converting string value to JudgmentMetric enum: {v}")
-            return APIScorer[v.upper()].value
+            debug(f"Converting string value to APIScorer enum: {v}")
+            return APIScorer[v.upper()]
         error(f"Invalid score_type value: {v}")
         raise ValueError(f"Invalid value for score_type: {v}")
-
+
     def __str__(self):
-        return f"JudgmentScorer(score_type={self.score_type}, threshold={self.threshold})"
-
+        return f"JudgmentScorer(score_type={self.score_type.value}, threshold={self.threshold})"
+
     def to_dict(self) -> dict:
         """
         Converts the scorer configuration to a dictionary format.
@@ -58,7 +58,6 @@ class APIJudgmentScorer(BaseModel):
             dict: A dictionary containing the scorer's configuration
         """
         return {
-            "score_type": self.score_type,
+            "score_type": str(self.score_type.value), # Convert enum to string for serialization
             "threshold": self.threshold
-        }
-
+        }
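The hunks above are a behavioral change, not just a rename: the validator now returns the APIScorer enum member itself rather than its .value, which is why __str__ and to_dict read self.score_type.value explicitly. A self-contained sketch of the same validator pattern, using stand-in names rather than judgeval's real classes:

from enum import Enum
from pydantic import BaseModel, field_validator

class ScoreKind(str, Enum):  # stand-in for APIScorer
    FAITHFULNESS = "faithfulness"

class DemoScorer(BaseModel):  # stand-in for APIJudgmentScorer
    score_type: ScoreKind
    threshold: float

    @field_validator('score_type')
    def convert_to_enum_value(cls, v):
        if isinstance(v, ScoreKind):
            return v                     # keep the enum member (new behavior)
        elif isinstance(v, str):
            return ScoreKind[v.upper()]  # map a string to the enum member
        raise ValueError(f"Invalid value for score_type: {v}")

    def to_dict(self) -> dict:
        # Serialize the enum's string value, mirroring the updated to_dict above
        return {"score_type": str(self.score_type.value), "threshold": self.threshold}

print(DemoScorer(score_type="faithfulness", threshold=0.7).to_dict())
# {'score_type': 'faithfulness', 'threshold': 0.7}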
@@ -48,5 +48,5 @@ class APIJudgmentScorer(BaseModel):
         raise ValueError(f"Invalid value for score_type: {v}")
 
     def __str__(self):
-        return f"JudgmentScorer(score_type={self.score_type}, threshold={self.threshold})"
+        return f"JudgmentScorer(score_type={self.score_type.value}, threshold={self.threshold})"
 
@@ -46,7 +46,6 @@ class AnswerRelevancyScorer(JudgevalScorer):
         )
         self.model, self.using_native_model = create_judge(model)
         self.evaluation_model = self.model.get_model_name()
-        print(self.model)
 
     def score_example(
         self,
@@ -0,0 +1,43 @@
+"""
+Handling alerts in Judgeval.
+"""
+from enum import Enum
+from typing import Dict, Any, List, Optional
+from pydantic import BaseModel
+
+class AlertStatus(str, Enum):
+    """Status of an alert evaluation."""
+    TRIGGERED = "triggered"
+    NOT_TRIGGERED = "not_triggered"
+
+class AlertResult(BaseModel):
+    """
+    Result of a rule evaluation.
+
+    Attributes:
+        rule_name: Name of the rule that was evaluated
+        rule_id: Unique identifier of the rule
+        status: Status of the alert (triggered or not)
+        conditions_result: List of condition evaluation results
+        metadata: Dictionary containing example_id, timestamp, and other metadata
+    """
+    rule_name: str
+    rule_id: Optional[str] = None # The unique identifier of the rule
+    status: AlertStatus
+    conditions_result: List[Dict[str, Any]] = []
+    metadata: Dict[str, Any] = {}
+
+    @property
+    def example_id(self) -> Optional[str]:
+        """Get example_id from metadata for backward compatibility"""
+        return self.metadata.get("example_id")
+
+    @property
+    def timestamp(self) -> Optional[str]:
+        """Get timestamp from metadata for backward compatibility"""
+        return self.metadata.get("timestamp")
+
+    @property
+    def conditions_results(self) -> List[Dict[str, Any]]:
+        """Backwards compatibility property for the conditions_result field"""
+        return self.conditions_result
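Given the AlertResult model added above, a short sketch of how its backward-compatibility properties behave. The module path is not shown in this diff excerpt, so the sketch simply assumes AlertStatus and AlertResult are in scope:

# Construct an AlertResult as the rules engine would, then read the compatibility properties.
result = AlertResult(
    rule_name="Quality Check",
    rule_id="123e4567-e89b-12d3-a456-426614174000",
    status=AlertStatus.TRIGGERED,
    conditions_result=[{"metric": "faithfulness", "value": 0.6, "threshold": 0.7, "passed": False}],
    metadata={"example_id": "example_123", "timestamp": "20240321_123456"},
)

assert result.example_id == "example_123"           # read from metadata
assert result.timestamp == "20240321_123456"        # read from metadata
assert result.conditions_results == result.conditions_result  # backwards-compatibility alias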
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.0.13
+Version: 0.0.15
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues