judgeval 0.0.24__py3-none-any.whl → 0.0.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
judgeval/common/tracer.py CHANGED
@@ -20,6 +20,7 @@ from rich import print as rprint
20
20
  # Third-party imports
21
21
  import pika
22
22
  import requests
23
+ from litellm import cost_per_token
23
24
  from pydantic import BaseModel
24
25
  from rich import print as rprint
25
26
  from openai import OpenAI
@@ -332,6 +333,9 @@ class TraceClient:
332
333
  self.span_type = None
333
334
  self._current_span: Optional[TraceEntry] = None
334
335
  self.trace_manager_client = TraceManagerClient(tracer.api_key, tracer.organization_id) # Manages DB operations for trace data
336
+ self.visited_nodes = [] # Track nodes visited through langgraph_node spans
337
+ self.executed_tools = [] # Track tools executed through tool spans
338
+ self.executed_node_tools = [] # Track node:tool combinations
335
339
 
336
340
  @contextmanager
337
341
  def span(self, name: str, span_type: SpanType = "span"):
@@ -618,30 +622,70 @@ class TraceClient:
618
622
  total_completion_tokens = 0
619
623
  total_tokens = 0
620
624
 
625
+ total_prompt_tokens_cost = 0.0
626
+ total_completion_tokens_cost = 0.0
627
+ total_cost = 0.0
628
+
621
629
  for entry in condensed_entries:
622
630
  if entry.get("span_type") == "llm" and isinstance(entry.get("output"), dict):
623
- usage = entry["output"].get("usage", {})
631
+ output = entry["output"]
632
+ usage = output.get("usage", {})
633
+ model_name = entry.get("inputs", {}).get("model", "")
634
+ prompt_tokens = 0
635
+ completion_tokens = 0
636
+
624
637
  # Handle OpenAI/Together format
625
638
  if "prompt_tokens" in usage:
626
- total_prompt_tokens += usage.get("prompt_tokens", 0)
627
- total_completion_tokens += usage.get("completion_tokens", 0)
639
+ prompt_tokens = usage.get("prompt_tokens", 0)
640
+ completion_tokens = usage.get("completion_tokens", 0)
641
+ total_prompt_tokens += prompt_tokens
642
+ total_completion_tokens += completion_tokens
628
643
  # Handle Anthropic format
629
644
  elif "input_tokens" in usage:
630
- total_prompt_tokens += usage.get("input_tokens", 0)
631
- total_completion_tokens += usage.get("output_tokens", 0)
645
+ prompt_tokens = usage.get("input_tokens", 0)
646
+ completion_tokens = usage.get("output_tokens", 0)
647
+ total_prompt_tokens += prompt_tokens
648
+ total_completion_tokens += completion_tokens
649
+
632
650
  total_tokens += usage.get("total_tokens", 0)
651
+
652
+ # Calculate costs if model name is available
653
+ if model_name:
654
+ try:
655
+ prompt_cost, completion_cost = cost_per_token(
656
+ model=model_name,
657
+ prompt_tokens=prompt_tokens,
658
+ completion_tokens=completion_tokens
659
+ )
660
+ total_prompt_tokens_cost += prompt_cost
661
+ total_completion_tokens_cost += completion_cost
662
+ total_cost += prompt_cost + completion_cost
663
+
664
+ # Add cost information directly to the usage dictionary in the condensed entry
665
+ if "usage" not in output:
666
+ output["usage"] = {}
667
+ output["usage"]["prompt_tokens_cost_usd"] = prompt_cost
668
+ output["usage"]["completion_tokens_cost_usd"] = completion_cost
669
+ output["usage"]["total_cost_usd"] = prompt_cost + completion_cost
670
+ except Exception as e:
671
+ # If cost calculation fails, continue without adding costs
672
+ print(f"Error calculating cost for model '{model_name}': {str(e)}")
673
+ pass
633
674
 
634
675
  # Create trace document
635
676
  trace_data = {
636
677
  "trace_id": self.trace_id,
637
678
  "name": self.name,
638
679
  "project_name": self.project_name,
639
- "created_at": datetime.fromtimestamp(self.start_time).isoformat(),
680
+ "created_at": datetime.utcfromtimestamp(self.start_time).isoformat(),
640
681
  "duration": total_duration,
641
682
  "token_counts": {
642
683
  "prompt_tokens": total_prompt_tokens,
643
684
  "completion_tokens": total_completion_tokens,
644
685
  "total_tokens": total_tokens,
686
+ "prompt_tokens_cost_usd": total_prompt_tokens_cost,
687
+ "completion_tokens_cost_usd": total_completion_tokens_cost,
688
+ "total_cost_usd": total_cost
645
689
  },
646
690
  "entries": condensed_entries,
647
691
  "empty_save": empty_save,
@@ -697,7 +741,6 @@ class Tracer:
697
741
 
698
742
  if not organization_id:
699
743
  raise ValueError("Tracer must be configured with an Organization ID")
700
-
701
744
  self.api_key: str = api_key
702
745
  self.project_name: str = project_name
703
746
  self.client: JudgmentClient = JudgmentClient(judgment_api_key=api_key)
@@ -767,8 +810,9 @@ class Tracer:
767
810
  project_name: Optional project name override
768
811
  overwrite: Whether to overwrite existing traces
769
812
  """
813
+ # If monitoring is disabled, return the function as is
770
814
  if not self.enable_monitoring:
771
- return
815
+ return func if func else lambda f: f
772
816
 
773
817
  if func is None:
774
818
  return lambda f: self.observe(f, name=name, span_type=span_type, project_name=project_name, overwrite=overwrite)
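
The change above fixes a subtle bug: with monitoring disabled, `observe` previously returned `None`, which replaced the decorated function. A self-contained sketch of the pattern now used (simplified stand-in, not the package's actual wrapper logic):

```python
# Sketch of the no-op decorator pattern: when monitoring is off, return the
# function (or an identity decorator) instead of None.
def observe(func=None, *, enable_monitoring=True):
    if not enable_monitoring:
        return func if func else lambda f: f
    if func is None:
        return lambda f: observe(f, enable_monitoring=enable_monitoring)
    def wrapper(*args, **kwargs):
        print(f"tracing {func.__name__}")  # stand-in for span recording
        return func(*args, **kwargs)
    return wrapper

@observe(enable_monitoring=False)
def add(a, b):
    return a + b

assert add(2, 3) == 5  # still callable; before the fix the decorator returned None
```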
@@ -872,6 +916,9 @@ class Tracer:
872
916
  return wrapper
873
917
 
874
918
  def async_evaluate(self, *args, **kwargs):
919
+ if not self.enable_evaluations:
920
+ return
921
+
875
922
  if self._current_trace:
876
923
  self._current_trace.async_evaluate(*args, **kwargs)
877
924
  else:
judgeval/constants.py CHANGED
@@ -46,13 +46,14 @@ JUDGMENT_DATASETS_PULL_ALL_API_URL = f"{ROOT_API}/datasets/get_all_stats/"
46
46
  JUDGMENT_DATASETS_EDIT_API_URL = f"{ROOT_API}/datasets/edit/"
47
47
  JUDGMENT_EVAL_LOG_API_URL = f"{ROOT_API}/log_eval_results/"
48
48
  JUDGMENT_EVAL_FETCH_API_URL = f"{ROOT_API}/fetch_eval_results/"
49
- JUDGMENT_EVAL_DELETE_API_URL = f"{ROOT_API}/delete_eval_results_by_project_and_run_name/"
49
+ JUDGMENT_EVAL_DELETE_API_URL = f"{ROOT_API}/delete_eval_results_by_project_and_run_names/"
50
50
  JUDGMENT_EVAL_DELETE_PROJECT_API_URL = f"{ROOT_API}/delete_eval_results_by_project/"
51
51
  JUDGMENT_PROJECT_DELETE_API_URL = f"{ROOT_API}/projects/delete/"
52
52
  JUDGMENT_TRACES_FETCH_API_URL = f"{ROOT_API}/traces/fetch/"
53
53
  JUDGMENT_TRACES_SAVE_API_URL = f"{ROOT_API}/traces/save/"
54
54
  JUDGMENT_TRACES_DELETE_API_URL = f"{ROOT_API}/traces/delete/"
55
- JUDGMENT_TRACES_ADD_TO_EVAL_QUEUE_API_URL = f"{ROOT_API}/traces/add_to_eval_queue/"
55
+ JUDGMENT_TRACES_ADD_TO_EVAL_QUEUE_API_URL = f"{ROOT_API}/traces/add_to_trace_eval_queue/"
56
+ JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL = f"{ROOT_API}/add_to_run_eval_queue/"
56
57
  # RabbitMQ
57
58
  RABBITMQ_HOST = os.getenv("RABBITMQ_HOST", "rabbitmq-networklb-faa155df16ec9085.elb.us-west-1.amazonaws.com")
58
59
  RABBITMQ_PORT = os.getenv("RABBITMQ_PORT", 5672)
judgeval/data/datasets/dataset.py CHANGED
@@ -90,9 +90,18 @@ class EvalDataset:
90
90
  def add_from_csv(
91
91
  self,
92
92
  file_path: str,
93
+ header_mapping: dict,
94
+ primary_delimiter: str = ",",
95
+ secondary_delimiter: str = ";"
93
96
  ) -> None:
94
97
  """
95
98
  Add Examples from a CSV file.
99
+
100
+ Args:
101
+ file_path (str): Path to the CSV file
102
+ header_mapping (dict): Dictionary mapping Example headers to custom headers
103
+ primary_delimiter (str, optional): Main delimiter used in CSV file. Defaults to ","
104
+ secondary_delimiter (str, optional): Secondary delimiter for list fields. Defaults to ";"
96
105
  """
97
106
  try:
98
107
  import pandas as pd
@@ -102,9 +111,10 @@ class EvalDataset:
102
111
  )
103
112
 
104
113
  # Pandas naturally reads numbers in data files as ints, not strings (can lead to unexpected behavior)
105
- df = pd.read_csv(file_path, dtype={'trace_id': str})
114
+ df = pd.read_csv(file_path, dtype={'trace_id': str}, sep=primary_delimiter)
106
115
  """
107
- Expect the CSV to have headers
116
+ The user should pass in a dict mapping from Judgment Example headers to their custom defined headers.
117
+ Available headers for Example objects are as follows:
108
118
 
109
119
  "input", "actual_output", "expected_output", "context", \
110
120
  "retrieval_context", "additional_metadata", "tools_called", \
@@ -113,35 +123,48 @@ class EvalDataset:
113
123
 
114
124
  We want to collect the examples separately which can
115
125
  be determined by the "example" column. If the value is True, then it is an
116
- example
126
+ example, and we expect the `input` and `actual_output` fields to be non-null.
117
127
 
118
- We also assume that if there are multiple retrieval contexts or contexts, they are separated by semicolons.
119
- This can be adjusted using the `context_delimiter` and `retrieval_context_delimiter` parameters.
128
+ We also assume that if there are multiple retrieval contexts, contexts, or tools called, they are separated by semicolons.
129
+ This can be adjusted using the `secondary_delimiter` parameter.
120
130
  """
121
131
  examples = []
122
-
132
+
133
+ def process_csv_row(value, header):
134
+ """
135
+ Maps a singular value in the CSV file to the appropriate type based on the header.
136
+ If value exists and can be split into type List[*], we will split upon the user's provided secondary delimiter.
137
+ """
138
+ # check that the CSV value is not null for entry
139
+ null_replacement = dict() if header == 'additional_metadata' else None
140
+ if pd.isna(value) or value == '':
141
+ return null_replacement
142
+ try:
143
+ value = ast.literal_eval(value) if header == 'additional_metadata' else str(value)
144
+ except (ValueError, SyntaxError):
145
+ value = str(value)
146
+ if header in ["context", "retrieval_context", "tools_called", "expected_tools"]:
147
+ # attempt to split the value by the secondary delimiter
148
+ value = value.split(secondary_delimiter)
149
+
150
+ return value
151
+
123
152
  for _, row in df.iterrows():
124
153
  data = {
125
- "input": row["input"],
126
- "actual_output": row["actual_output"] if pd.notna(row["actual_output"]) else None,
127
- "expected_output": row["expected_output"] if pd.notna(row["expected_output"]) else None,
128
- "context": row["context"].split(";") if pd.notna(row["context"]) else [],
129
- "retrieval_context": row["retrieval_context"].split(";") if pd.notna(row["retrieval_context"]) else [],
130
- "additional_metadata": ast.literal_eval(row["additional_metadata"]) if pd.notna(row["additional_metadata"]) else dict(),
131
- "tools_called": row["tools_called"].split(";") if pd.notna(row["tools_called"]) else [],
132
- "expected_tools": row["expected_tools"].split(";") if pd.notna(row["expected_tools"]) else [],
133
- "trace_id": row["trace_id"] if pd.notna(row["trace_id"]) else None,
134
- "example_id": str(row["example_id"]) if pd.notna(row["example_id"]) else None
154
+ header: process_csv_row(
155
+ row[header_mapping[header]], header
156
+ )
157
+ for header in header_mapping
135
158
  }
136
- if row["example"]:
137
- data["name"] = row["name"] if pd.notna(row["name"]) else None
159
+ if "example" in header_mapping and row[header_mapping["example"]]:
160
+ if "name" in header_mapping:
161
+ data["name"] = row[header_mapping["name"]] if pd.notna(row[header_mapping["name"]]) else None
138
162
  # every Example has `input` and `actual_output` fields
139
163
  if data["input"] is not None and data["actual_output"] is not None:
140
164
  e = Example(**data)
141
165
  examples.append(e)
142
166
  else:
143
167
  raise ValueError("Every example must have an 'input' and 'actual_output' field.")
144
-
145
168
 
146
169
  for e in examples:
147
170
  self.add_example(e)
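
A hedged usage sketch of the new `add_from_csv` signature (the import path, `EvalDataset` construction, header mapping, delimiters, and CSV column names are assumptions for illustration):

```python
# Sketch: the caller now supplies a mapping from Example field names to their
# own CSV column names, plus primary/secondary delimiters.
from judgeval.data.datasets import EvalDataset  # assumed import path

dataset = EvalDataset()
dataset.add_from_csv(
    "examples.csv",
    header_mapping={
        "example": "is_example",                # marks rows that become Examples
        "input": "question",
        "actual_output": "model_answer",
        "expected_output": "reference_answer",
        "retrieval_context": "retrieved_docs",  # split on secondary_delimiter
        "name": "example_name",
    },
    primary_delimiter=",",    # CSV column separator
    secondary_delimiter=";",  # separator for list-valued fields
)
```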
judgeval/integrations/langgraph.py CHANGED
@@ -146,16 +146,17 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
146
146
 
147
147
  self.start_span("LangGraph", span_type="Main Function")
148
148
 
149
- node = metadata.get("langgraph_node")
150
- if node != None and node != self.previous_node:
151
- self.start_span(node, span_type="node")
152
- self.executed_node_tools.append(node)
153
- self.executed_nodes.append(node)
154
- self.trace_client.record_input({
155
- 'args': inputs,
156
- 'kwargs': kwargs
157
- })
158
- self.previous_node = node
149
+ metadata = kwargs.get("metadata", {})
150
+ if node := metadata.get("langgraph_node"):
151
+ if node != self.previous_node:
152
+ # Track node execution
153
+ self.trace_client.visited_nodes.append(node)
154
+ self.trace_client.executed_node_tools.append(node)
155
+ self.trace_client.record_input({
156
+ 'args': inputs,
157
+ 'kwargs': kwargs
158
+ })
159
+ self.previous_node = node
159
160
 
160
161
  def on_chain_end(
161
162
  self,
@@ -198,8 +199,11 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
198
199
  ):
199
200
  name = serialized["name"]
200
201
  self.start_span(name, span_type="tool")
201
- self.executed_node_tools.append(f"{self.previous_node}:{name}")
202
- self.executed_tools.append(name)
202
+ if name:
203
+ # Track tool execution
204
+ self.trace_client.executed_tools.append(name)
205
+ node_tool = f"{self.previous_node}:{name}" if self.previous_node else name
206
+ self.trace_client.executed_node_tools.append(node_tool)
203
207
  self.trace_client.record_input({
204
208
  'args': input_str,
205
209
  'kwargs': kwargs
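
The handler now records graph activity on the trace client itself rather than on the callback handler. A self-contained sketch of that bookkeeping (a stand-in class, not the real `TraceClient`):

```python
# Sketch: node and tool executions are appended as "node", "tool", and
# "node:tool" entries, mirroring the handler logic above.
class _TraceState:
    def __init__(self):
        self.visited_nodes = []
        self.executed_tools = []
        self.executed_node_tools = []

state = _TraceState()
previous_node = None

for event, name in [("node", "planner"), ("tool", "search"), ("node", "writer")]:
    if event == "node" and name != previous_node:
        state.visited_nodes.append(name)
        state.executed_node_tools.append(name)
        previous_node = name
    elif event == "tool":
        state.executed_tools.append(name)
        state.executed_node_tools.append(f"{previous_node}:{name}" if previous_node else name)

print(state.executed_node_tools)  # ['planner', 'planner:search', 'writer']
```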
judgeval/judgment_client.py CHANGED
@@ -38,6 +38,11 @@ class EvalRunRequestBody(BaseModel):
38
38
  project_name: str
39
39
  judgment_api_key: str
40
40
 
41
+ class DeleteEvalRunRequestBody(BaseModel):
42
+ eval_names: List[str]
43
+ project_name: str
44
+ judgment_api_key: str
45
+
41
46
 
42
47
  class JudgmentClient:
43
48
  def __init__(self, judgment_api_key: str = os.getenv("JUDGMENT_API_KEY"), organization_id: str = os.getenv("JUDGMENT_ORG_ID")):
@@ -52,7 +57,24 @@ class JudgmentClient:
52
57
  raise JudgmentAPIError(f"Issue with passed in Judgment API key: {response}")
53
58
  else:
54
59
  print(f"Successfully initialized JudgmentClient, welcome back {response.get('detail', {}).get('user_name', 'user')}!")
55
-
60
+
61
+ def a_run_evaluation(
62
+ self,
63
+ examples: List[Example],
64
+ scorers: List[Union[ScorerWrapper, JudgevalScorer]],
65
+ model: Union[str, List[str], JudgevalJudge],
66
+ aggregator: Optional[str] = None,
67
+ metadata: Optional[Dict[str, Any]] = None,
68
+ log_results: bool = True,
69
+ project_name: str = "default_project",
70
+ eval_run_name: str = "default_eval_run",
71
+ override: bool = False,
72
+ use_judgment: bool = True,
73
+ ignore_errors: bool = True,
74
+ rules: Optional[List[Rule]] = None
75
+ ) -> List[ScoringResult]:
76
+ return self.run_evaluation(examples, scorers, model, aggregator, metadata, log_results, project_name, eval_run_name, override, use_judgment, ignore_errors, True, rules)
77
+
56
78
  def run_evaluation(
57
79
  self,
58
80
  examples: List[Example],
@@ -65,6 +87,8 @@ class JudgmentClient:
65
87
  eval_run_name: str = "default_eval_run",
66
88
  override: bool = False,
67
89
  use_judgment: bool = True,
90
+ ignore_errors: bool = True,
91
+ async_execution: bool = False,
68
92
  rules: Optional[List[Rule]] = None
69
93
  ) -> List[ScoringResult]:
70
94
  """
@@ -81,6 +105,7 @@ class JudgmentClient:
81
105
  eval_run_name (str): A name for this evaluation run
82
106
  override (bool): Whether to override an existing evaluation run with the same name
83
107
  use_judgment (bool): Whether to use Judgment API for evaluation
108
+ ignore_errors (bool): Whether to ignore errors during evaluation (safely handled)
84
109
  rules (Optional[List[Rule]]): Rules to evaluate against scoring results
85
110
 
86
111
  Returns:
@@ -141,7 +166,7 @@ class JudgmentClient:
141
166
  rules=loaded_rules,
142
167
  organization_id=self.organization_id
143
168
  )
144
- return run_eval(eval, override)
169
+ return run_eval(eval, override, ignore_errors=ignore_errors, async_execution=async_execution)
145
170
  except ValueError as e:
146
171
  raise ValueError(f"Please check your EvaluationRun object, one or more fields are invalid: \n{str(e)}")
147
172
  except Exception as e:
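
A usage sketch of the new `ignore_errors` and async options (client construction, scorer choice, and import paths are assumptions; API keys come from the environment in the real client):

```python
# Sketch: run an evaluation while skipping scorer failures, or enqueue it
# asynchronously via a_run_evaluation (which sets async_execution=True).
from judgeval.judgment_client import JudgmentClient  # assumed import path
from judgeval.data import Example                    # assumed import path
from judgeval.scorers import FaithfulnessScorer      # assumed import path

client = JudgmentClient()  # reads JUDGMENT_API_KEY / JUDGMENT_ORG_ID
example = Example(input="What is the capital of France?", actual_output="Paris")

# Synchronous run; scorer errors are handled instead of aborting the run.
results = client.run_evaluation(
    examples=[example],
    scorers=[FaithfulnessScorer(threshold=0.7)],
    model="gpt-4o-mini",
    ignore_errors=True,
)

# Fire-and-forget: the run is posted to the evaluation queue instead of
# waiting for scoring results.
client.a_run_evaluation(
    examples=[example],
    scorers=[FaithfulnessScorer(threshold=0.7)],
    model="gpt-4o-mini",
)
```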
@@ -324,19 +349,22 @@ class JudgmentClient:
324
349
  eval_run_result[0]["results"] = [ScoringResult(**filtered_result)]
325
350
  return eval_run_result
326
351
 
327
- def delete_eval(self, project_name: str, eval_run_name: str) -> bool:
352
+ def delete_eval(self, project_name: str, eval_run_names: List[str]) -> bool:
328
353
  """
329
- Deletes an evaluation from the server by project and run name.
354
+ Deletes an evaluation from the server by project and run names.
330
355
 
331
356
  Args:
332
357
  project_name (str): Name of the project
333
- eval_run_name (str): Name of the evaluation run
358
+ eval_run_names (List[str]): List of names of the evaluation runs
334
359
 
335
360
  Returns:
336
361
  bool: Whether the evaluation was successfully deleted
337
362
  """
338
- eval_run_request_body = EvalRunRequestBody(project_name=project_name,
339
- eval_name=eval_run_name,
363
+ if not eval_run_names:
364
+ raise ValueError("No evaluation run names provided")
365
+
366
+ eval_run_request_body = DeleteEvalRunRequestBody(project_name=project_name,
367
+ eval_names=eval_run_names,
340
368
  judgment_api_key=self.judgment_api_key)
341
369
  response = requests.delete(JUDGMENT_EVAL_DELETE_API_URL,
342
370
  json=eval_run_request_body.model_dump(),
@@ -345,9 +373,11 @@ class JudgmentClient:
345
373
  "Authorization": f"Bearer {self.judgment_api_key}",
346
374
  "X-Organization-Id": self.organization_id
347
375
  })
348
- if response.status_code != requests.codes.ok:
376
+ if response.status_code == 404:
377
+ raise ValueError(f"Eval results not found: {response.json()}")
378
+ elif response.status_code == 500:
349
379
  raise ValueError(f"Error deleting eval results: {response.json()}")
350
- return response.json()
380
+ return bool(response.json())
351
381
 
352
382
  def delete_project_evals(self, project_name: str) -> bool:
353
383
  """
judgeval/rules.py CHANGED
@@ -17,15 +17,6 @@ class AlertStatus(str, Enum):
17
17
  TRIGGERED = "triggered"
18
18
  NOT_TRIGGERED = "not_triggered"
19
19
 
20
- class Operator(str, Enum):
21
- """Comparison operators for conditions."""
22
- GT = ">"
23
- GTE = ">="
24
- LT = "<"
25
- LTE = "<="
26
- EQ = "=="
27
- NEQ = "!="
28
-
29
20
  class Condition(BaseModel):
30
21
  """
31
22
  A single metric condition.
@@ -33,15 +24,13 @@ class Condition(BaseModel):
33
24
  Example:
34
25
  {
35
26
  "metric": FaithfulnessScorer(threshold=0.7) # Must be a scorer object: APIJudgmentScorer, JudgevalScorer, or ScorerWrapper
36
- "operator": ">=",
37
- "threshold": 0.7
38
27
  }
28
+
29
+ The Condition class uses the scorer's threshold and success function internally.
39
30
  """
40
31
  model_config = ConfigDict(arbitrary_types_allowed=True)
41
32
 
42
- metric: Union[APIJudgmentScorer, JudgevalScorer, ScorerWrapper]
43
- operator: Operator
44
- threshold: float
33
+ metric: Union[APIJudgmentScorer, JudgevalScorer, ScorerWrapper]
45
34
 
46
35
  @property
47
36
  def metric_name(self) -> str:
@@ -58,22 +47,60 @@ class Condition(BaseModel):
58
47
  # Fallback to string representation
59
48
  return str(self.metric)
60
49
 
50
+ @property
51
+ def threshold(self) -> float:
52
+ """Get the threshold from the metric."""
53
+ return self.metric.threshold if hasattr(self.metric, 'threshold') else 0.5
54
+
61
55
  def evaluate(self, value: float) -> bool:
62
- """Evaluate this condition against a value."""
63
- if self.operator == Operator.GT:
64
- return value > self.threshold
65
- elif self.operator == Operator.GTE:
66
- return value >= self.threshold
67
- elif self.operator == Operator.LT:
68
- return value < self.threshold
69
- elif self.operator == Operator.LTE:
70
- return value <= self.threshold
71
- elif self.operator == Operator.EQ:
72
- return value == self.threshold
73
- elif self.operator == Operator.NEQ:
74
- return value != self.threshold
56
+ """
57
+ Evaluate the condition against a value.
58
+ Returns True if the condition passes, False otherwise.
59
+ Uses the scorer's success check function if available.
60
+ """
61
+ # Store the value in the scorer
62
+ if hasattr(self.metric, 'score'):
63
+ self.metric.score = value
64
+
65
+ # Use the scorer's success check function if available
66
+ if hasattr(self.metric, 'success_check'):
67
+ return self.metric.success_check()
68
+ elif hasattr(self.metric, '_success_check'):
69
+ return self.metric._success_check()
75
70
  else:
76
- raise ValueError(f"Unknown operator: {self.operator}")
71
+ # Fallback to default comparison (greater than or equal)
72
+ return value >= self.threshold if self.threshold is not None else False
73
+
74
+ class NotificationConfig(BaseModel):
75
+ """
76
+ Configuration for notifications when a rule is triggered.
77
+
78
+ Example:
79
+ {
80
+ "enabled": true,
81
+ "communication_methods": ["email", "broadcast_slack", "broadcast_email"],
82
+ "email_addresses": ["user1@example.com", "user2@example.com"],
83
+ "send_at": 1632150000 # Unix timestamp (specific date/time)
84
+ }
85
+
86
+ Communication Methods:
87
+ - "email": Send emails to specified email addresses
88
+ - "broadcast_slack": Send broadcast notifications to all configured Slack channels
89
+ - "broadcast_email": Send broadcast emails to all organization emails
90
+ """
91
+ enabled: bool = True
92
+ communication_methods: List[str] = []
93
+ email_addresses: Optional[List[str]] = None
94
+ send_at: Optional[int] = None # Unix timestamp for scheduled notifications
95
+
96
+ def model_dump(self, **kwargs):
97
+ """Convert the NotificationConfig to a dictionary for JSON serialization."""
98
+ return {
99
+ "enabled": self.enabled,
100
+ "communication_methods": self.communication_methods,
101
+ "email_addresses": self.email_addresses,
102
+ "send_at": self.send_at
103
+ }
77
104
 
78
105
  class Rule(BaseModel):
79
106
  """
@@ -85,10 +112,15 @@ class Rule(BaseModel):
85
112
  "name": "Quality Check",
86
113
  "description": "Check if quality metrics meet thresholds",
87
114
  "conditions": [
88
- {"metric": FaithfulnessScorer(threshold=0.7), "operator": ">=", "threshold": 0.7},
89
- {"metric": AnswerRelevancyScorer(threshold=0.8), "operator": ">=", "threshold": 0.8}
115
+ {"metric": FaithfulnessScorer(threshold=0.7)},
116
+ {"metric": AnswerRelevancyScorer(threshold=0.8)}
90
117
  ],
91
- "combine_type": "all" # "all" or "any"
118
+ "combine_type": "all", # "all" or "any"
119
+ "notification": {
120
+ "enabled": true,
121
+ "communication_methods": ["slack", "email"],
122
+ "email_addresses": ["user1@example.com", "user2@example.com"]
123
+ }
92
124
  }
93
125
  """
94
126
  rule_id: str = Field(default_factory=lambda: str(uuid.uuid4())) # Random UUID string as default value
@@ -96,6 +128,8 @@ class Rule(BaseModel):
96
128
  description: Optional[str] = None
97
129
  conditions: List[Condition]
98
130
  combine_type: str = Field(..., pattern="^(all|any)$") # all = AND, any = OR
131
+ notification: Optional[NotificationConfig] = None # Configuration for notifications
132
+
99
133
 
100
134
  def model_dump(self, **kwargs):
101
135
  """
@@ -168,7 +202,6 @@ class Rule(BaseModel):
168
202
  raise ValueError(f"combine_type must be 'all' or 'any', got: {v}")
169
203
  return v
170
204
 
171
-
172
205
  class AlertResult(BaseModel):
173
206
  """
174
207
  Result of evaluating a rule.
@@ -185,6 +218,11 @@ class AlertResult(BaseModel):
185
218
  "metadata": {
186
219
  "example_id": "example_123",
187
220
  "timestamp": "20240321_123456"
221
+ },
222
+ "notification": {
223
+ "enabled": true,
224
+ "communication_methods": ["slack", "email"],
225
+ "email_addresses": ["user1@example.com", "user2@example.com"]
188
226
  }
189
227
  }
190
228
  """
@@ -193,6 +231,7 @@ class AlertResult(BaseModel):
193
231
  rule_name: str
194
232
  conditions_result: List[Dict[str, Any]]
195
233
  metadata: Dict[str, Any] = {}
234
+ notification: Optional[NotificationConfig] = None # Configuration for notifications
196
235
 
197
236
  @property
198
237
  def example_id(self) -> Optional[str]:
@@ -206,36 +245,105 @@ class AlertResult(BaseModel):
206
245
 
207
246
  class RulesEngine:
208
247
  """
209
- Engine for evaluating rules and managing alerts.
248
+ Engine for creating and evaluating rules against metrics.
210
249
 
211
- Example usage:
250
+ Example:
251
+ ```python
252
+ # Define rules
212
253
  rules = {
213
- "quality_check": Rule(
254
+ "1": Rule(
214
255
  name="Quality Check",
256
+ description="Check if quality metrics meet thresholds",
215
257
  conditions=[
216
- Condition(metric=FaithfulnessScorer(threshold=0.7), operator=">=", threshold=0.7),
217
- Condition(metric=AnswerRelevancyScorer(threshold=0.8), operator=">=", threshold=0.8)
258
+ Condition(metric=FaithfulnessScorer(threshold=0.7)),
259
+ Condition(metric=AnswerRelevancyScorer(threshold=0.8))
218
260
  ],
219
261
  combine_type="all"
220
262
  )
221
263
  }
222
264
 
265
+ # Create rules engine
223
266
  engine = RulesEngine(rules)
224
- scores = {"faithfulness": 0.8, "relevancy": 0.9}
225
- alerts = engine.evaluate_rules(scores, example_metadata={
226
- "example_id": "example_123",
227
- "timestamp": "20240321_123456"
228
- })
267
+
268
+ # Configure notifications
269
+ engine.configure_notification(
270
+ rule_id="1",
271
+ enabled=True,
272
+ communication_methods=["slack", "email"],
273
+ email_addresses=["user@example.com"]
274
+ )
275
+
276
+ # Evaluate rules
277
+ scores = {"faithfulness": 0.65, "relevancy": 0.85}
278
+ results = engine.evaluate_rules(scores, {"example_id": "example_123"})
279
+ ```
229
280
  """
230
281
 
231
282
  def __init__(self, rules: Dict[str, Rule]):
232
283
  """
233
- Initialize the RulesEngine with rules.
284
+ Initialize the rules engine.
234
285
 
235
286
  Args:
236
- rules: Dictionary mapping rule IDs to rule configurations
287
+ rules: Dictionary mapping rule IDs to Rule objects
237
288
  """
238
289
  self.rules = rules
290
+
291
+ def configure_notification(self, rule_id: str, enabled: bool = True,
292
+ communication_methods: List[str] = None,
293
+ email_addresses: List[str] = None,
294
+ send_at: Optional[int] = None) -> None:
295
+ """
296
+ Configure notification settings for a specific rule.
297
+
298
+ Args:
299
+ rule_id: ID of the rule to configure notifications for
300
+ enabled: Whether notifications are enabled for this rule
301
+ communication_methods: List of notification methods (e.g., ["slack", "email"])
302
+ email_addresses: List of email addresses to send notifications to
303
+ send_at: Optional Unix timestamp for when to send the notification
304
+ """
305
+ if rule_id not in self.rules:
306
+ raise ValueError(f"Rule ID '{rule_id}' not found")
307
+
308
+ rule = self.rules[rule_id]
309
+
310
+ # Create notification configuration if it doesn't exist
311
+ if rule.notification is None:
312
+ rule.notification = NotificationConfig()
313
+
314
+ # Set notification parameters
315
+ rule.notification.enabled = enabled
316
+
317
+ if communication_methods is not None:
318
+ rule.notification.communication_methods = communication_methods
319
+
320
+ if email_addresses is not None:
321
+ rule.notification.email_addresses = email_addresses
322
+
323
+ if send_at is not None:
324
+ rule.notification.send_at = send_at
325
+
326
+ def configure_all_notifications(self, enabled: bool = True,
327
+ communication_methods: List[str] = None,
328
+ email_addresses: List[str] = None,
329
+ send_at: Optional[int] = None) -> None:
330
+ """
331
+ Configure notification settings for all rules.
332
+
333
+ Args:
334
+ enabled: Whether notifications are enabled
335
+ communication_methods: List of notification methods (e.g., ["slack", "email"])
336
+ email_addresses: List of email addresses to send notifications to
337
+ send_at: Optional Unix timestamp for when to send the notification
338
+ """
339
+ for rule_id, rule in self.rules.items():
340
+ self.configure_notification(
341
+ rule_id=rule_id,
342
+ enabled=enabled,
343
+ communication_methods=communication_methods,
344
+ email_addresses=email_addresses,
345
+ send_at=send_at
346
+ )
239
347
 
240
348
  def evaluate_rules(self, scores: Dict[str, float], example_metadata: Optional[Dict[str, Any]] = None) -> Dict[str, AlertResult]:
241
349
  """
@@ -257,13 +365,13 @@ class RulesEngine:
257
365
  # Get the metric name for lookup
258
366
  metric_name = condition.metric_name
259
367
  value = scores.get(metric_name)
368
+
260
369
  if value is None:
261
370
  # Skip this condition instead of evaluating it as false
262
371
  condition_results.append({
263
372
  "metric": metric_name,
264
373
  "value": None,
265
374
  "threshold": condition.threshold,
266
- "operator": condition.operator,
267
375
  "passed": None, # Using None to indicate the condition was skipped
268
376
  "skipped": True # Add a flag to indicate this condition was skipped
269
377
  })
@@ -274,7 +382,6 @@ class RulesEngine:
274
382
  "metric": metric_name,
275
383
  "value": value,
276
384
  "threshold": condition.threshold,
277
- "operator": condition.operator,
278
385
  "passed": passed,
279
386
  "skipped": False # Indicate this condition was evaluated
280
387
  })
@@ -285,23 +392,36 @@ class RulesEngine:
285
392
  # If all conditions were skipped, the rule doesn't trigger
286
393
  triggered = False
287
394
  else:
288
- triggered = all(passed_conditions) if rule.combine_type == "all" else any(passed_conditions)
395
+ if rule.combine_type == "all":
396
+ # For "all" combine_type:
397
+ # - All evaluated conditions must pass
398
+ # - All conditions must have been evaluated (none skipped)
399
+ all_conditions_passed = all(passed_conditions)
400
+ all_conditions_evaluated = len(passed_conditions) == len(rule.conditions)
401
+ triggered = all_conditions_passed and all_conditions_evaluated
402
+ else:
403
+ # For "any" combine_type, at least one condition must pass
404
+ triggered = any(passed_conditions)
289
405
 
290
406
  # Create alert result with example metadata
407
+ notification_config = None
408
+ if triggered and rule.notification:
409
+ # If rule has a notification config and the alert is triggered, include it in the result
410
+ notification_config = rule.notification
411
+
412
+ # Set the alert status based on whether the rule was triggered
413
+ status = AlertStatus.TRIGGERED if triggered else AlertStatus.NOT_TRIGGERED
414
+
415
+ # Create the alert result
291
416
  alert_result = AlertResult(
292
- status=AlertStatus.TRIGGERED if triggered else AlertStatus.NOT_TRIGGERED,
293
- rule_id=rule.rule_id, # Include the rule's unique identifier
417
+ status=status,
418
+ rule_id=rule.rule_id,
294
419
  rule_name=rule.name,
295
- conditions_result=condition_results
420
+ conditions_result=condition_results,
421
+ notification=notification_config,
422
+ metadata=example_metadata or {}
296
423
  )
297
424
 
298
- # Add example metadata if provided
299
- if example_metadata:
300
- if "example_id" in example_metadata:
301
- alert_result.metadata["example_id"] = example_metadata["example_id"]
302
- if "timestamp" in example_metadata:
303
- alert_result.metadata["timestamp"] = example_metadata["timestamp"]
304
-
305
425
  results[rule_id] = alert_result
306
426
 
307
427
  return results
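
A self-contained sketch of the revised triggering semantics, where a skipped condition now prevents an "all" rule from firing:

```python
# Sketch: "all" requires every condition to be both evaluated and passing;
# "any" still fires if at least one evaluated condition passes.
def rule_triggered(combine_type, condition_results):
    passed = [r["passed"] for r in condition_results if not r["skipped"]]
    if not passed:               # every condition was skipped
        return False
    if combine_type == "all":
        return all(passed) and len(passed) == len(condition_results)
    return any(passed)           # "any"

results = [
    {"metric": "faithfulness", "passed": True, "skipped": False},
    {"metric": "relevancy", "passed": None, "skipped": True},  # score missing
]
print(rule_triggered("all", results))  # False: not every condition was evaluated
print(rule_triggered("any", results))  # True: one evaluated condition passed
```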
@@ -376,7 +496,4 @@ class RulesEngine:
376
496
  )
377
497
  end_time = time.perf_counter()
378
498
 
379
- # Could log performance metrics here if needed
380
- # debug(f"Rule evaluation for example {example_id} took {end_time - start_time:.4f} seconds")
381
-
382
499
  return (example_id, rule_results)
judgeval/run_evaluation.py CHANGED
@@ -23,17 +23,35 @@ from judgeval.constants import (
23
23
  ROOT_API,
24
24
  JUDGMENT_EVAL_API_URL,
25
25
  JUDGMENT_EVAL_LOG_API_URL,
26
- MAX_CONCURRENT_EVALUATIONS
26
+ MAX_CONCURRENT_EVALUATIONS,
27
+ JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL
27
28
  )
28
29
  from judgeval.common.exceptions import JudgmentAPIError
29
- from judgeval.evaluation_run import EvaluationRun
30
30
  from judgeval.common.logger import (
31
31
  debug,
32
32
  info,
33
33
  error,
34
34
  example_logging_context
35
35
  )
36
+ from judgeval.evaluation_run import EvaluationRun
37
+
36
38
 
39
+ def send_to_rabbitmq(evaluation_run: EvaluationRun) -> None:
40
+ """
41
+ Sends an evaluation run to the RabbitMQ evaluation queue.
42
+ """
43
+ payload = evaluation_run.model_dump(warnings=False)
44
+ response = requests.post(
45
+ JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL,
46
+ headers={
47
+ "Content-Type": "application/json",
48
+ "Authorization": f"Bearer {evaluation_run.judgment_api_key}",
49
+ "X-Organization-Id": evaluation_run.organization_id
50
+ },
51
+ json=payload,
52
+ verify=True
53
+ )
54
+ return response.json()
37
55
 
38
56
  def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
39
57
  """
@@ -51,13 +69,15 @@ def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
51
69
  # submit API request to execute evals
52
70
  payload = evaluation_run.model_dump(warnings=False)
53
71
  response = requests.post(
54
- JUDGMENT_EVAL_API_URL, headers={
55
- "Content-Type": "application/json",
56
- "Authorization": f"Bearer {evaluation_run.judgment_api_key}",
57
- "X-Organization-Id": evaluation_run.organization_id
58
- },
59
- json=payload,
60
- verify=True)
72
+ JUDGMENT_EVAL_API_URL,
73
+ headers={
74
+ "Content-Type": "application/json",
75
+ "Authorization": f"Bearer {evaluation_run.judgment_api_key}",
76
+ "X-Organization-Id": evaluation_run.organization_id
77
+ },
78
+ json=payload,
79
+ verify=True
80
+ )
61
81
  response_data = response.json()
62
82
  except Exception as e:
63
83
  error(f"Error: {e}")
@@ -281,13 +301,14 @@ def check_examples(examples: List[Example], scorers: List[APIJudgmentScorer]) ->
281
301
  # Example ID (usually random UUID) does not provide any helpful information for the user but printing the entire example is overdoing it
282
302
  print(f"WARNING: Example {example.example_id} is missing the following parameters: {missing_params} for scorer {scorer.score_type.value}")
283
303
 
284
-
285
- def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[ScoringResult]:
304
+ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_errors: bool = True, async_execution: bool = False) -> List[ScoringResult]:
286
305
  """
287
306
  Executes an evaluation of `Example`s using one or more `Scorer`s
288
307
 
289
308
  Args:
290
309
  evaluation_run (EvaluationRun): Stores example and evaluation together for running
310
+ override (bool, optional): Whether to override existing evaluation run with same name. Defaults to False.
311
+ ignore_errors (bool, optional): Whether to ignore scorer errors during evaluation. Defaults to True.
291
312
 
292
313
  Args:
293
314
  project_name (str): The name of the project the evaluation results belong to
@@ -354,101 +375,117 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
354
375
 
355
376
  api_results: List[ScoringResult] = []
356
377
  local_results: List[ScoringResult] = []
357
-
358
- # Execute evaluation using Judgment API
359
- if judgment_scorers:
378
+
379
+ if async_execution:
360
380
  check_examples(evaluation_run.examples, evaluation_run.scorers)
361
- info("Starting API evaluation")
362
- debug(f"Creating API evaluation run with {len(judgment_scorers)} scorers")
363
- try: # execute an EvaluationRun with just JudgmentScorers
364
- api_evaluation_run: EvaluationRun = EvaluationRun(
365
- eval_name=evaluation_run.eval_name,
366
- project_name=evaluation_run.project_name,
367
- examples=evaluation_run.examples,
368
- scorers=judgment_scorers,
369
- model=evaluation_run.model,
370
- aggregator=evaluation_run.aggregator,
371
- metadata=evaluation_run.metadata,
372
- judgment_api_key=evaluation_run.judgment_api_key,
373
- organization_id=evaluation_run.organization_id,
374
- log_results=evaluation_run.log_results,
375
- rules=evaluation_run.rules
381
+ info("Starting async evaluation")
382
+ payload = evaluation_run.model_dump(warnings=False)
383
+ requests.post(
384
+ JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL,
385
+ headers={
386
+ "Content-Type": "application/json",
387
+ "Authorization": f"Bearer {evaluation_run.judgment_api_key}",
388
+ "X-Organization-Id": evaluation_run.organization_id
389
+ },
390
+ json=payload,
391
+ verify=True
392
+ )
393
+ print("Successfully added evaluation to queue")
394
+ else:
395
+ if judgment_scorers:
396
+ # Execute evaluation using Judgment API
397
+ check_examples(evaluation_run.examples, evaluation_run.scorers)
398
+ info("Starting API evaluation")
399
+ debug(f"Creating API evaluation run with {len(judgment_scorers)} scorers")
400
+ try: # execute an EvaluationRun with just JudgmentScorers
401
+ api_evaluation_run: EvaluationRun = EvaluationRun(
402
+ eval_name=evaluation_run.eval_name,
403
+ project_name=evaluation_run.project_name,
404
+ examples=evaluation_run.examples,
405
+ scorers=judgment_scorers,
406
+ model=evaluation_run.model,
407
+ aggregator=evaluation_run.aggregator,
408
+ metadata=evaluation_run.metadata,
409
+ judgment_api_key=evaluation_run.judgment_api_key,
410
+ organization_id=evaluation_run.organization_id,
411
+ log_results=evaluation_run.log_results,
412
+ rules=evaluation_run.rules
413
+ )
414
+ debug("Sending request to Judgment API")
415
+ response_data: List[Dict] = run_with_spinner("Running Evaluation: ", execute_api_eval, api_evaluation_run)
416
+ info(f"Received {len(response_data['results'])} results from API")
417
+ except JudgmentAPIError as e:
418
+ error(f"An error occurred while executing the Judgment API request: {str(e)}")
419
+ raise JudgmentAPIError(f"An error occurred while executing the Judgment API request: {str(e)}")
420
+ except ValueError as e:
421
+ raise ValueError(f"Please check your EvaluationRun object, one or more fields are invalid: {str(e)}")
422
+
423
+ # Convert the response data to `ScoringResult` objects
424
+ debug("Processing API results")
425
+ for idx, result in enumerate(response_data["results"]):
426
+ with example_logging_context(evaluation_run.examples[idx].timestamp, evaluation_run.examples[idx].example_id):
427
+ for scorer in judgment_scorers:
428
+ debug(f"Processing API result for example {idx} and scorer {scorer.score_type}")
429
+ # filter for key-value pairs that are used to initialize ScoringResult
430
+ # there may be some stuff in here that doesn't belong in ScoringResult
431
+ # TODO: come back and refactor this to have ScoringResult take in **kwargs
432
+ filtered_result = {k: v for k, v in result.items() if k in ScoringResult.__annotations__}
433
+
434
+ # Convert scorers_data dicts to ScorerData objects
435
+ if "scorers_data" in filtered_result and filtered_result["scorers_data"]:
436
+ filtered_result["scorers_data"] = [
437
+ ScorerData(**scorer_dict)
438
+ for scorer_dict in filtered_result["scorers_data"]
439
+ ]
440
+
441
+ api_results.append(ScoringResult(**filtered_result))
442
+ # Run local evals
443
+ if local_scorers: # List[JudgevalScorer]
444
+ # We should be removing local scorers soon
445
+ info("Starting local evaluation")
446
+ for example in evaluation_run.examples:
447
+ with example_logging_context(example.timestamp, example.example_id):
448
+ debug(f"Processing example {example.example_id}: {example.input}")
449
+
450
+ results: List[ScoringResult] = asyncio.run(
451
+ a_execute_scoring(
452
+ evaluation_run.examples,
453
+ local_scorers,
454
+ model=evaluation_run.model,
455
+ ignore_errors=ignore_errors,
456
+ skip_on_missing_params=True,
457
+ show_indicator=True,
458
+ _use_bar_indicator=True,
459
+ throttle_value=0,
460
+ max_concurrent=MAX_CONCURRENT_EVALUATIONS,
461
+ )
376
462
  )
377
- debug("Sending request to Judgment API")
378
- response_data: List[Dict] = run_with_spinner("Running Evaluation: ", execute_api_eval, api_evaluation_run)
379
- info(f"Received {len(response_data['results'])} results from API")
380
- except JudgmentAPIError as e:
381
- error(f"An error occurred while executing the Judgment API request: {str(e)}")
382
- raise JudgmentAPIError(f"An error occurred while executing the Judgment API request: {str(e)}")
383
- except ValueError as e:
384
- raise ValueError(f"Please check your EvaluationRun object, one or more fields are invalid: {str(e)}")
463
+ local_results = results
464
+ info(f"Local evaluation complete with {len(local_results)} results")
465
+ # Aggregate the ScorerData from the API and local evaluations
466
+ debug("Merging API and local results")
467
+ merged_results: List[ScoringResult] = merge_results(api_results, local_results)
468
+ merged_results = check_missing_scorer_data(merged_results)
469
+
470
+ info(f"Successfully merged {len(merged_results)} results")
471
+
472
+ # Evaluate rules against local scoring results if rules exist (this cant be done just yet)
473
+ # if evaluation_run.rules and merged_results:
474
+ # run_rules(
475
+ # local_results=merged_results,
476
+ # rules=evaluation_run.rules,
477
+ # judgment_api_key=evaluation_run.judgment_api_key,
478
+ # organization_id=evaluation_run.organization_id
479
+ # )
385
480
 
386
- # Convert the response data to `ScoringResult` objects
387
- debug("Processing API results")
388
- for idx, result in enumerate(response_data["results"]):
389
- with example_logging_context(evaluation_run.examples[idx].timestamp, evaluation_run.examples[idx].example_id):
390
- for scorer in judgment_scorers:
391
- debug(f"Processing API result for example {idx} and scorer {scorer.score_type}")
392
- # filter for key-value pairs that are used to initialize ScoringResult
393
- # there may be some stuff in here that doesn't belong in ScoringResult
394
- # TODO: come back and refactor this to have ScoringResult take in **kwargs
395
- filtered_result = {k: v for k, v in result.items() if k in ScoringResult.__annotations__}
396
-
397
- # Convert scorers_data dicts to ScorerData objects
398
- if "scorers_data" in filtered_result and filtered_result["scorers_data"]:
399
- filtered_result["scorers_data"] = [
400
- ScorerData(**scorer_dict)
401
- for scorer_dict in filtered_result["scorers_data"]
402
- ]
403
-
404
- api_results.append(ScoringResult(**filtered_result))
405
- # Run local evals
406
- if local_scorers: # List[JudgevalScorer]
407
- # We should be removing local scorers soon
408
- info("Starting local evaluation")
409
- for example in evaluation_run.examples:
410
- with example_logging_context(example.timestamp, example.example_id):
411
- debug(f"Processing example {example.example_id}: {example.input}")
412
-
413
- results: List[ScoringResult] = asyncio.run(
414
- a_execute_scoring(
415
- evaluation_run.examples,
416
- local_scorers,
417
- model=evaluation_run.model,
418
- ignore_errors=True,
419
- skip_on_missing_params=True,
420
- show_indicator=True,
421
- _use_bar_indicator=True,
422
- throttle_value=0,
423
- max_concurrent=MAX_CONCURRENT_EVALUATIONS,
424
- )
425
- )
426
- local_results = results
427
- info(f"Local evaluation complete with {len(local_results)} results")
428
- # Aggregate the ScorerData from the API and local evaluations
429
- debug("Merging API and local results")
430
- merged_results: List[ScoringResult] = merge_results(api_results, local_results)
431
- merged_results = check_missing_scorer_data(merged_results)
432
-
433
- info(f"Successfully merged {len(merged_results)} results")
434
-
435
- # Evaluate rules against local scoring results if rules exist (this cant be done just yet)
436
- # if evaluation_run.rules and merged_results:
437
- # run_rules(
438
- # local_results=merged_results,
439
- # rules=evaluation_run.rules,
440
- # judgment_api_key=evaluation_run.judgment_api_key,
441
- # organization_id=evaluation_run.organization_id
442
- # )
443
-
444
- if evaluation_run.log_results:
445
- pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, merged_results, evaluation_run)
446
- rprint(pretty_str)
447
-
448
- for i, result in enumerate(merged_results):
449
- if not result.scorers_data: # none of the scorers could be executed on this example
450
- info(f"None of the scorers could be executed on example {i}. This is usually because the Example is missing the fields needed by the scorers. Try checking that the Example has the necessary fields for your scorers.")
451
- return merged_results
481
+ if evaluation_run.log_results:
482
+ pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, merged_results, evaluation_run)
483
+ rprint(pretty_str)
484
+
485
+ for i, result in enumerate(merged_results):
486
+ if not result.scorers_data: # none of the scorers could be executed on this example
487
+ info(f"None of the scorers could be executed on example {i}. This is usually because the Example is missing the fields needed by the scorers. Try checking that the Example has the necessary fields for your scorers.")
488
+ return merged_results
452
489
 
453
490
  def assert_test(scoring_results: List[ScoringResult]) -> None:
454
491
  """
judgeval/scorers/score.py CHANGED
@@ -274,15 +274,16 @@ async def a_execute_scoring(
274
274
  semaphore = asyncio.Semaphore(max_concurrent)
275
275
 
276
276
  async def execute_with_semaphore(func: Callable, *args, **kwargs):
277
- try:
278
- async with semaphore:
277
+ async with semaphore:
278
+ try:
279
279
  return await func(*args, **kwargs)
280
- except Exception as e:
281
- error(f"Error executing function: {e}")
282
- if kwargs.get('ignore_errors', False):
283
- # Return None when ignoring errors
284
- return None
285
- raise
280
+ except Exception as e:
281
+ print(f"Error executing function: {e}")
282
+ if kwargs.get('ignore_errors', False):
283
+ # Simply return None when ignoring errors, as expected by the test
284
+ return None
285
+ # If we're not ignoring errors, propagate the exception
286
+ raise
286
287
 
287
288
  if verbose_mode is not None:
288
289
  for scorer in scorers:
@@ -391,6 +392,7 @@ async def a_eval_examples_helper(
391
392
  Returns:
392
393
  None
393
394
  """
395
+
394
396
  show_metrics_indicator = show_indicator and not _use_bar_indicator
395
397
 
396
398
  for scorer in scorers:
@@ -416,12 +418,15 @@ async def a_eval_examples_helper(
416
418
  continue
417
419
  scorer_data = create_scorer_data(scorer) # Fetch scorer data from completed scorer evaluation
418
420
  process_example.update_scorer_data(scorer_data) # Update process example with the same scorer data
419
-
421
+
420
422
  test_end_time = time.perf_counter()
421
423
  run_duration = test_end_time - scoring_start_time
422
424
 
423
425
  process_example.update_run_duration(run_duration) # Update process example with execution time duration
424
- scoring_results[score_index] = generate_scoring_result(process_example) # Converts the outcomes of the executed test to a ScoringResult and saves it
425
-
426
+
427
+ # Generate the scoring result and store it safely (to avoid race conditions)
428
+ result = generate_scoring_result(process_example)
429
+ scoring_results[score_index] = result
430
+
426
431
  if pbar is not None:
427
432
  pbar.update(1)
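
The reordering above acquires the semaphore before entering the try block, so only the scorer call itself is wrapped in error handling. A self-contained sketch of the resulting behavior with `ignore_errors` (simplified signature, not the package's exact helper):

```python
# Sketch: acquire the semaphore first, then catch failures from the scorer
# call and either swallow them (ignore_errors) or re-raise.
import asyncio

async def execute_with_semaphore(semaphore, func, *args, ignore_errors=True, **kwargs):
    async with semaphore:                  # acquired outside the try block
        try:
            return await func(*args, **kwargs)
        except Exception as e:
            print(f"Error executing function: {e}")
            if ignore_errors:
                return None                # swallow the failure
            raise                          # otherwise propagate to the caller

async def flaky_scorer():
    raise RuntimeError("scorer failed")

async def main():
    sem = asyncio.Semaphore(2)
    result = await execute_with_semaphore(sem, flaky_scorer, ignore_errors=True)
    print(result)  # None: the error was logged and ignored

asyncio.run(main())
```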
judgeval/utils/alerts.py CHANGED
@@ -40,4 +40,35 @@ class AlertResult(BaseModel):
40
40
  @property
41
41
  def conditions_results(self) -> List[Dict[str, Any]]:
42
42
  """Backwards compatibility property for the conditions_result field"""
43
- return self.conditions_result
43
+ return self.conditions_result
44
+
45
+ def model_dump(self, **kwargs):
46
+ """
47
+ Convert the AlertResult to a dictionary for JSON serialization.
48
+
49
+ Args:
50
+ **kwargs: Additional arguments to pass to Pydantic's model_dump
51
+
52
+ Returns:
53
+ dict: Dictionary representation of the AlertResult
54
+ """
55
+ data = super().model_dump(**kwargs) if hasattr(super(), "model_dump") else super().dict(**kwargs)
56
+
57
+ # Handle the NotificationConfig object if it exists
58
+ if hasattr(self, "notification") and self.notification is not None:
59
+ if hasattr(self.notification, "model_dump"):
60
+ data["notification"] = self.notification.model_dump()
61
+ elif hasattr(self.notification, "dict"):
62
+ data["notification"] = self.notification.dict()
63
+ else:
64
+ # Manually convert the notification to a dictionary
65
+ notif = self.notification
66
+ data["notification"] = {
67
+ "enabled": notif.enabled,
68
+ "communication_methods": notif.communication_methods,
69
+ "email_addresses": notif.email_addresses,
70
+ "slack_channels": getattr(notif, "slack_channels", []),
71
+ "send_at": notif.send_at
72
+ }
73
+
74
+ return data
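
A self-contained sketch of the serialization pattern this override implements, expanding a nested notification config during `model_dump()` (simplified models, not the package's exact classes):

```python
# Sketch: AlertResult.model_dump() returns plain dicts, including the nested
# notification configuration when one is attached.
from typing import List, Optional
from pydantic import BaseModel

class NotificationConfig(BaseModel):
    enabled: bool = True
    communication_methods: List[str] = []
    email_addresses: Optional[List[str]] = None
    send_at: Optional[int] = None

class AlertResult(BaseModel):
    rule_name: str
    notification: Optional[NotificationConfig] = None

    def model_dump(self, **kwargs):
        data = super().model_dump(**kwargs)
        if self.notification is not None:
            data["notification"] = self.notification.model_dump()
        return data

result = AlertResult(
    rule_name="Quality Check",
    notification=NotificationConfig(communication_methods=["email"]),
)
print(result.model_dump()["notification"]["communication_methods"])  # ['email']
```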
judgeval-0.0.24.dist-info/METADATA → judgeval-0.0.26.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: judgeval
3
- Version: 0.0.24
3
+ Version: 0.0.26
4
4
  Summary: Judgeval Package
5
5
  Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
6
6
  Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
judgeval-0.0.24.dist-info/RECORD → judgeval-0.0.26.dist-info/RECORD CHANGED
@@ -1,14 +1,14 @@
1
1
  judgeval/__init__.py,sha256=dtXxsCmI4eEsZdGSUMy8P_pA0bc2-OSGAgb2C__yJoA,252
2
2
  judgeval/clients.py,sha256=6VQmEqmfCngUdS2MuPBIpHvtDFqOENm8-_BmMvjLyRQ,944
3
- judgeval/constants.py,sha256=VhJppAECTUDQwzC_FpzJw2wPlkYoogsadHxaJIY_J8U,5073
3
+ judgeval/constants.py,sha256=iTUro5SdXcYX00W18l32zL_EEEqHf5OT9uA5yZAme_s,5158
4
4
  judgeval/evaluation_run.py,sha256=RgJD60lJsunNQzObjo7iXnAzXWgubCLOAAuuamAAuoI,6354
5
- judgeval/judgment_client.py,sha256=e-2e4KK-xy8-WLgzg8H0D6pZC8By9IWdu2iK-lHe39A,24076
6
- judgeval/rules.py,sha256=ebsiDEBVAnYTQxwVNvh_RpmKeWBnjQXgHs8KofTjcAs,15526
7
- judgeval/run_evaluation.py,sha256=YOzkyeWl-r3vaz0jB5nM-1VULi7ALmJ9_f58ENqexXk,23827
5
+ judgeval/judgment_client.py,sha256=2z134M0GeW3CdOZDx688UXmqJUlU31hlcFlLwUhF_Tg,25429
6
+ judgeval/rules.py,sha256=B0ZL0pn72D4Jnlr0zMQ6CPHi7D8AQQRariXCVsiCMiI,20542
7
+ judgeval/run_evaluation.py,sha256=8FZ-shJ0120iTuT2S1rXzmVcoIHPsFPb0THTGOtKoHM,25772
8
8
  judgeval/common/__init__.py,sha256=7d24BRxtncpMj3AAJCj8RS7TqgjXmW777HVZH6-3sBs,289
9
9
  judgeval/common/exceptions.py,sha256=U-TxHLn7oVMezsMuoYouNDb2XuS8RCggfntYf5_6u4E,565
10
10
  judgeval/common/logger.py,sha256=KO75wWXCxhUHUMvLaTU31ZzOk6tkZBa7heQ7y0f-zFE,6062
11
- judgeval/common/tracer.py,sha256=cc_K1poBg3Vzl2Nf7yhHlklrOe6Fb_TEekvjAVAQFSc,39958
11
+ judgeval/common/tracer.py,sha256=Z87Q3pQrtfHYvE1vsTMdIUfR-iz_IM8dqvW9VwVdtMQ,42434
12
12
  judgeval/common/utils.py,sha256=LUQV5JfDr6wj7xHAJoNq-gofNZ6mjXbeKrGKzBME1KM,33533
13
13
  judgeval/data/__init__.py,sha256=YferxwmUqoBi18hrdgro0BD0h4pt20LAqISeUzGMcVU,474
14
14
  judgeval/data/api_example.py,sha256=dzkrQ0xno08y6qNfqL2djXbapUyc2B2aQ5iANn0o4CY,3667
@@ -16,9 +16,9 @@ judgeval/data/example.py,sha256=BhGBhamFWgH6wtvrRYM8dGtDfXh-cDxDhtNL5Gbdz_M,5892
16
16
  judgeval/data/result.py,sha256=4fgjKtUmT3br7K6fkRiNIxTGKUuwMeGyRLqzkpxwXKE,4436
17
17
  judgeval/data/scorer_data.py,sha256=JVlaTx1EP2jw2gh3Vgx1CSEsvIFABAN26IquKyxwiJQ,3273
18
18
  judgeval/data/datasets/__init__.py,sha256=IdNKhQv9yYZ_op0rdBacrFaFVmiiYQ3JTzXzxOTsEVQ,176
19
- judgeval/data/datasets/dataset.py,sha256=DjJNy-qvviXMGBl_JhiBzvgiJH1_3rYtAWeHP6Daw6E,11897
19
+ judgeval/data/datasets/dataset.py,sha256=AFYjksV_wXx5CqFYJsl3aN8yZ6hC50O1myRuOJ8s8_E,12867
20
20
  judgeval/data/datasets/eval_dataset_client.py,sha256=B4bRy0Di2oFlaBbvp4_hRx2g_9e6Cs0y3ZUT9reMyhw,10926
21
- judgeval/integrations/langgraph.py,sha256=yBbZrePkY19dLLgleeIYFVzakEPaiko6YuccLbwSYcE,10957
21
+ judgeval/integrations/langgraph.py,sha256=fGDZOTlVbxTO4ErC-m9OSg3h-RkOIIWXCfhjgkKRh4E,11187
22
22
  judgeval/judges/__init__.py,sha256=6X7VSwrwsdxGBNxCyapVRWGghhKOy3MVxFNMQ62kCXM,308
23
23
  judgeval/judges/base_judge.py,sha256=ch_S7uBB7lyv44Lf1d7mIGFpveOO58zOkkpImKgd9_4,994
24
24
  judgeval/judges/litellm_judge.py,sha256=EIL58Teptv8DzZUO3yP2RDQCDq-aoBB6HPZzPdK6KTg,2424
@@ -31,7 +31,7 @@ judgeval/scorers/base_scorer.py,sha256=xdUlY3CnLdCQ1Z5iUeY22Bim5v-OQruZmaVF_4Y1m
31
31
  judgeval/scorers/exceptions.py,sha256=eGW5CuJgZ5YJBFrE4FHDSF651PO1dKAZ379mJ8gOsfo,178
32
32
  judgeval/scorers/judgeval_scorer.py,sha256=jq_rzfTG0XBTuLCaa6TlaK4YcT-LlgsO1LEm6hpOYdg,6601
33
33
  judgeval/scorers/prompt_scorer.py,sha256=PaAs2qRolw1P3_I061Xvk9qzvF4O-JR8g_39RqXnHcM,17728
34
- judgeval/scorers/score.py,sha256=GALVmeApP1Cyih2vY93zRaU6RShtW4jJDG47Pm6yfnw,18657
34
+ judgeval/scorers/score.py,sha256=PhyAyMkc7KO_DZpFSN1HD_FS3BvdleQPZhYvQkNAdxI,18816
35
35
  judgeval/scorers/utils.py,sha256=iHQVTlIANbmCTXz9kTeSdOytgUZ_T74Re61ajqsk_WQ,6827
36
36
  judgeval/scorers/judgeval_scorers/__init__.py,sha256=xFRb62sp4JmBUSeuAB_pC_7kEGp-lGdqCRIu9--Bbdg,5992
37
37
  judgeval/scorers/judgeval_scorers/api_scorers/__init__.py,sha256=mZ6b_5Dl04k3PaG24ICBajB_j43ody1II1OJhO1DkXo,1648
@@ -86,8 +86,8 @@ judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.p
86
86
  judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py,sha256=6GnRz2h-6Fwt4sl__0RgQOyo3n3iDO4MNuHWxdu-rrM,10242
87
87
  judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py,sha256=Qk7lwHgRPYeGoxTOyclAh1VfGItfvHJ6l1t7Nk3SWFM,20927
88
88
  judgeval/tracer/__init__.py,sha256=wy3DYpH8U_z0GO_K_gOSkK0tTTD-u5eLDo0T5xIBoAc,147
89
- judgeval/utils/alerts.py,sha256=RgW5R9Dn3Jtim0OyAYDbNzjoX2s6SA4Mw16GyyaikjI,1424
90
- judgeval-0.0.24.dist-info/METADATA,sha256=YvmYQNZs3P1l5ggRe0ejgauUIYJWTsrxlxUZNffKDeI,5418
91
- judgeval-0.0.24.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
92
- judgeval-0.0.24.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
93
- judgeval-0.0.24.dist-info/RECORD,,
89
+ judgeval/utils/alerts.py,sha256=O19Xj7DA0YVjl8PWiuH4zfdZeu3yiLVvHfY8ah2wG0g,2759
90
+ judgeval-0.0.26.dist-info/METADATA,sha256=rhTpfY5GRclxtkkXU4RrUj1ckpuxd2xsgF53oQyK6qo,5418
91
+ judgeval-0.0.26.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
92
+ judgeval-0.0.26.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
93
+ judgeval-0.0.26.dist-info/RECORD,,