judgeval 0.0.24__py3-none-any.whl → 0.0.26__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/tracer.py +55 -8
- judgeval/constants.py +3 -2
- judgeval/data/datasets/dataset.py +42 -19
- judgeval/integrations/langgraph.py +16 -12
- judgeval/judgment_client.py +39 -9
- judgeval/rules.py +177 -60
- judgeval/run_evaluation.py +140 -103
- judgeval/scorers/score.py +16 -11
- judgeval/utils/alerts.py +32 -1
- {judgeval-0.0.24.dist-info → judgeval-0.0.26.dist-info}/METADATA +1 -1
- {judgeval-0.0.24.dist-info → judgeval-0.0.26.dist-info}/RECORD +13 -13
- {judgeval-0.0.24.dist-info → judgeval-0.0.26.dist-info}/WHEEL +0 -0
- {judgeval-0.0.24.dist-info → judgeval-0.0.26.dist-info}/licenses/LICENSE.md +0 -0
judgeval/common/tracer.py
CHANGED
@@ -20,6 +20,7 @@ from rich import print as rprint
 # Third-party imports
 import pika
 import requests
+from litellm import cost_per_token
 from pydantic import BaseModel
 from rich import print as rprint
 from openai import OpenAI
@@ -332,6 +333,9 @@ class TraceClient:
         self.span_type = None
         self._current_span: Optional[TraceEntry] = None
         self.trace_manager_client = TraceManagerClient(tracer.api_key, tracer.organization_id) # Manages DB operations for trace data
+        self.visited_nodes = [] # Track nodes visited through langgraph_node spans
+        self.executed_tools = [] # Track tools executed through tool spans
+        self.executed_node_tools = [] # Track node:tool combinations
 
     @contextmanager
     def span(self, name: str, span_type: SpanType = "span"):
@@ -618,30 +622,70 @@ class TraceClient:
         total_completion_tokens = 0
         total_tokens = 0
 
+        total_prompt_tokens_cost = 0.0
+        total_completion_tokens_cost = 0.0
+        total_cost = 0.0
+
         for entry in condensed_entries:
             if entry.get("span_type") == "llm" and isinstance(entry.get("output"), dict):
-
+                output = entry["output"]
+                usage = output.get("usage", {})
+                model_name = entry.get("inputs", {}).get("model", "")
+                prompt_tokens = 0
+                completion_tokens = 0
+
                 # Handle OpenAI/Together format
                 if "prompt_tokens" in usage:
-
-
+                    prompt_tokens = usage.get("prompt_tokens", 0)
+                    completion_tokens = usage.get("completion_tokens", 0)
+                    total_prompt_tokens += prompt_tokens
+                    total_completion_tokens += completion_tokens
                 # Handle Anthropic format
                 elif "input_tokens" in usage:
-
-
+                    prompt_tokens = usage.get("input_tokens", 0)
+                    completion_tokens = usage.get("output_tokens", 0)
+                    total_prompt_tokens += prompt_tokens
+                    total_completion_tokens += completion_tokens
 
                 total_tokens += usage.get("total_tokens", 0)
+
+                # Calculate costs if model name is available
+                if model_name:
+                    try:
+                        prompt_cost, completion_cost = cost_per_token(
+                            model=model_name,
+                            prompt_tokens=prompt_tokens,
+                            completion_tokens=completion_tokens
+                        )
+                        total_prompt_tokens_cost += prompt_cost
+                        total_completion_tokens_cost += completion_cost
+                        total_cost += prompt_cost + completion_cost
+
+                        # Add cost information directly to the usage dictionary in the condensed entry
+                        if "usage" not in output:
+                            output["usage"] = {}
+                        output["usage"]["prompt_tokens_cost_usd"] = prompt_cost
+                        output["usage"]["completion_tokens_cost_usd"] = completion_cost
+                        output["usage"]["total_cost_usd"] = prompt_cost + completion_cost
+                    except Exception as e:
+                        # If cost calculation fails, continue without adding costs
+                        print(f"Error calculating cost for model '{model_name}': {str(e)}")
+                        pass
 
         # Create trace document
         trace_data = {
             "trace_id": self.trace_id,
             "name": self.name,
             "project_name": self.project_name,
-            "created_at": datetime.
+            "created_at": datetime.utcfromtimestamp(self.start_time).isoformat(),
             "duration": total_duration,
             "token_counts": {
                 "prompt_tokens": total_prompt_tokens,
                 "completion_tokens": total_completion_tokens,
                 "total_tokens": total_tokens,
+                "prompt_tokens_cost_usd": total_prompt_tokens_cost,
+                "completion_tokens_cost_usd": total_completion_tokens_cost,
+                "total_cost_usd": total_cost
             },
             "entries": condensed_entries,
             "empty_save": empty_save,
@@ -697,7 +741,6 @@ class Tracer:
 
         if not organization_id:
             raise ValueError("Tracer must be configured with an Organization ID")
-
         self.api_key: str = api_key
         self.project_name: str = project_name
         self.client: JudgmentClient = JudgmentClient(judgment_api_key=api_key)
@@ -767,8 +810,9 @@ class Tracer:
             project_name: Optional project name override
            overwrite: Whether to overwrite existing traces
        """
+        # If monitoring is disabled, return the function as is
        if not self.enable_monitoring:
-            return
+            return func if func else lambda f: f
 
        if func is None:
            return lambda f: self.observe(f, name=name, span_type=span_type, project_name=project_name, overwrite=overwrite)
@@ -872,6 +916,9 @@ class Tracer:
            return wrapper
 
    def async_evaluate(self, *args, **kwargs):
+        if not self.enable_evaluations:
+            return
+
        if self._current_trace:
            self._current_trace.async_evaluate(*args, **kwargs)
        else:
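The trace save path now prices each LLM span with litellm's `cost_per_token` and rolls the totals up into `token_counts`. Below is a minimal standalone sketch of that roll-up, handling both OpenAI-style and Anthropic-style usage keys; the model names and token counts are illustrative values, not data from this diff.

```python
# Minimal sketch of the cost roll-up performed during trace saving (illustrative values only).
from litellm import cost_per_token

entries = [
    {"model": "gpt-4o-mini", "usage": {"prompt_tokens": 120, "completion_tokens": 48}},          # OpenAI-style keys
    {"model": "claude-3-haiku-20240307", "usage": {"input_tokens": 200, "output_tokens": 80}},   # Anthropic-style keys
]

total_prompt_cost = total_completion_cost = 0.0
for entry in entries:
    usage = entry["usage"]
    prompt_tokens = usage.get("prompt_tokens", usage.get("input_tokens", 0))
    completion_tokens = usage.get("completion_tokens", usage.get("output_tokens", 0))
    try:
        prompt_cost, completion_cost = cost_per_token(
            model=entry["model"],
            prompt_tokens=prompt_tokens,
            completion_tokens=completion_tokens,
        )
    except Exception:
        continue  # unknown model: skip cost attribution, keep the token counts
    total_prompt_cost += prompt_cost
    total_completion_cost += completion_cost

print(f"total_cost_usd={total_prompt_cost + total_completion_cost:.6f}")
```

Wrapping the pricing call in a try/except mirrors the diff's design choice: a model litellm cannot price should never break trace persistence.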
judgeval/constants.py
CHANGED
@@ -46,13 +46,14 @@ JUDGMENT_DATASETS_PULL_ALL_API_URL = f"{ROOT_API}/datasets/get_all_stats/"
 JUDGMENT_DATASETS_EDIT_API_URL = f"{ROOT_API}/datasets/edit/"
 JUDGMENT_EVAL_LOG_API_URL = f"{ROOT_API}/log_eval_results/"
 JUDGMENT_EVAL_FETCH_API_URL = f"{ROOT_API}/fetch_eval_results/"
-JUDGMENT_EVAL_DELETE_API_URL = f"{ROOT_API}/
+JUDGMENT_EVAL_DELETE_API_URL = f"{ROOT_API}/delete_eval_results_by_project_and_run_names/"
 JUDGMENT_EVAL_DELETE_PROJECT_API_URL = f"{ROOT_API}/delete_eval_results_by_project/"
 JUDGMENT_PROJECT_DELETE_API_URL = f"{ROOT_API}/projects/delete/"
 JUDGMENT_TRACES_FETCH_API_URL = f"{ROOT_API}/traces/fetch/"
 JUDGMENT_TRACES_SAVE_API_URL = f"{ROOT_API}/traces/save/"
 JUDGMENT_TRACES_DELETE_API_URL = f"{ROOT_API}/traces/delete/"
-JUDGMENT_TRACES_ADD_TO_EVAL_QUEUE_API_URL = f"{ROOT_API}/traces/
+JUDGMENT_TRACES_ADD_TO_EVAL_QUEUE_API_URL = f"{ROOT_API}/traces/add_to_trace_eval_queue/"
+JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL = f"{ROOT_API}/add_to_run_eval_queue/"
 # RabbitMQ
 RABBITMQ_HOST = os.getenv("RABBITMQ_HOST", "rabbitmq-networklb-faa155df16ec9085.elb.us-west-1.amazonaws.com")
 RABBITMQ_PORT = os.getenv("RABBITMQ_PORT", 5672)
judgeval/data/datasets/dataset.py
CHANGED
@@ -90,9 +90,18 @@ class EvalDataset:
     def add_from_csv(
         self,
         file_path: str,
+        header_mapping: dict,
+        primary_delimiter: str = ",",
+        secondary_delimiter: str = ";"
     ) -> None:
         """
         Add Examples from a CSV file.
+
+        Args:
+            file_path (str): Path to the CSV file
+            header_mapping (dict): Dictionary mapping Example headers to custom headers
+            primary_delimiter (str, optional): Main delimiter used in CSV file. Defaults to ","
+            secondary_delimiter (str, optional): Secondary delimiter for list fields. Defaults to ";"
         """
         try:
             import pandas as pd
@@ -102,9 +111,10 @@ class EvalDataset:
             )
 
         # Pandas naturally reads numbers in data files as ints, not strings (can lead to unexpected behavior)
-        df = pd.read_csv(file_path, dtype={'trace_id': str})
+        df = pd.read_csv(file_path, dtype={'trace_id': str}, sep=primary_delimiter)
         """
-
+        The user should pass in a dict mapping from Judgment Example headers to their custom defined headers.
+        Available headers for Example objects are as follows:
 
         "input", "actual_output", "expected_output", "context", \
         "retrieval_context", "additional_metadata", "tools_called", \
@@ -113,35 +123,48 @@ class EvalDataset:
 
         We want to collect the examples separately which can
         be determined by the "example" column. If the value is True, then it is an
-        example
+        example, and we expect the `input` and `actual_output` fields to be non-null.
 
-        We also assume that if there are multiple retrieval contexts or
-        This can be adjusted using the `
+        We also assume that if there are multiple retrieval contexts, contexts, or tools called, they are separated by semicolons.
+        This can be adjusted using the `secondary_delimiter` parameter.
         """
         examples = []
-
+
+        def process_csv_row(value, header):
+            """
+            Maps a singular value in the CSV file to the appropriate type based on the header.
+            If value exists and can be split into type List[*], we will split upon the user's provided secondary delimiter.
+            """
+            # check that the CSV value is not null for entry
+            null_replacement = dict() if header == 'additional_metadata' else None
+            if pd.isna(value) or value == '':
+                return null_replacement
+            try:
+                value = ast.literal_eval(value) if header == 'additional_metadata' else str(value)
+            except (ValueError, SyntaxError):
+                value = str(value)
+            if header in ["context", "retrieval_context", "tools_called", "expected_tools"]:
+                # attempt to split the value by the secondary delimiter
+                value = value.split(secondary_delimiter)
+
+            return value
+
         for _, row in df.iterrows():
             data = {
-
-
-
-
-                "retrieval_context": row["retrieval_context"].split(";") if pd.notna(row["retrieval_context"]) else [],
-                "additional_metadata": ast.literal_eval(row["additional_metadata"]) if pd.notna(row["additional_metadata"]) else dict(),
-                "tools_called": row["tools_called"].split(";") if pd.notna(row["tools_called"]) else [],
-                "expected_tools": row["expected_tools"].split(";") if pd.notna(row["expected_tools"]) else [],
-                "trace_id": row["trace_id"] if pd.notna(row["trace_id"]) else None,
-                "example_id": str(row["example_id"]) if pd.notna(row["example_id"]) else None
+                header: process_csv_row(
+                    row[header_mapping[header]], header
+                )
+                for header in header_mapping
             }
-            if row["example"]:
-
+            if "example" in header_mapping and row[header_mapping["example"]]:
+                if "name" in header_mapping:
+                    data["name"] = row[header_mapping["name"]] if pd.notna(row[header_mapping["name"]]) else None
                 # every Example has `input` and `actual_output` fields
                 if data["input"] is not None and data["actual_output"] is not None:
                     e = Example(**data)
                     examples.append(e)
                 else:
                     raise ValueError("Every example must have an 'input' and 'actual_output' field.")
-
 
         for e in examples:
             self.add_example(e)
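`add_from_csv` now takes an explicit `header_mapping` plus configurable delimiters instead of hard-coded column names. A hedged usage sketch follows; the `EvalDataset` import path, its constructor arguments, and the CSV column names (`question`, `answer`, `docs`, `is_example`) are assumptions for illustration, not taken from the diff.

```python
# Hypothetical usage of the reworked add_from_csv; import path and CSV columns are assumptions.
from judgeval.data.datasets import EvalDataset  # assumed public path

dataset = EvalDataset()  # constructor arguments omitted; assumed to default sensibly

dataset.add_from_csv(
    "examples.csv",
    header_mapping={
        "input": "question",          # Example field -> CSV column
        "actual_output": "answer",
        "retrieval_context": "docs",  # list field, split on secondary_delimiter
        "example": "is_example",      # truthy rows become Examples
    },
    primary_delimiter=",",    # cell separator, passed through to pandas.read_csv(sep=...)
    secondary_delimiter=";",  # separator inside list-valued cells such as retrieval_context
)
```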
judgeval/integrations/langgraph.py
CHANGED
@@ -146,16 +146,17 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
 
             self.start_span("LangGraph", span_type="Main Function")
 
-
-        if node
-        self.
-
-
-
-
-
-
-
+        metadata = kwargs.get("metadata", {})
+        if node := metadata.get("langgraph_node"):
+            if node != self.previous_node:
+                # Track node execution
+                self.trace_client.visited_nodes.append(node)
+                self.trace_client.executed_node_tools.append(node)
+                self.trace_client.record_input({
+                    'args': inputs,
+                    'kwargs': kwargs
+                })
+            self.previous_node = node
 
     def on_chain_end(
         self,
@@ -198,8 +199,11 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
     ):
         name = serialized["name"]
         self.start_span(name, span_type="tool")
-
-
+        if name:
+            # Track tool execution
+            self.trace_client.executed_tools.append(name)
+            node_tool = f"{self.previous_node}:{name}" if self.previous_node else name
+            self.trace_client.executed_node_tools.append(node_tool)
         self.trace_client.record_input({
             'args': input_str,
             'kwargs': kwargs
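The callback handler now mirrors LangGraph execution onto the three new lists on the trace client: `visited_nodes`, `executed_tools`, and combined `node:tool` keys in `executed_node_tools`. The standalone sketch below illustrates only that bookkeeping; it is not the judgeval handler itself, and the class and method names are made up for the illustration.

```python
# Standalone illustration of the node/tool bookkeeping; not the judgeval classes.
class _TrackingStub:
    def __init__(self):
        self.visited_nodes = []        # every langgraph_node entered
        self.executed_tools = []       # every tool span started
        self.executed_node_tools = []  # "node" and "node:tool" entries, in order

    def on_node(self, node, previous_node=None):
        if node != previous_node:
            self.visited_nodes.append(node)
            self.executed_node_tools.append(node)
        return node  # becomes the new previous_node

    def on_tool(self, name, previous_node=None):
        self.executed_tools.append(name)
        self.executed_node_tools.append(f"{previous_node}:{name}" if previous_node else name)


stub = _TrackingStub()
prev = stub.on_node("retrieve")
stub.on_tool("web_search", previous_node=prev)
prev = stub.on_node("answer", previous_node=prev)
print(stub.executed_node_tools)  # ['retrieve', 'retrieve:web_search', 'answer']
```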
judgeval/judgment_client.py
CHANGED
@@ -38,6 +38,11 @@ class EvalRunRequestBody(BaseModel):
     project_name: str
     judgment_api_key: str
 
+class DeleteEvalRunRequestBody(BaseModel):
+    eval_names: List[str]
+    project_name: str
+    judgment_api_key: str
+
 
 class JudgmentClient:
     def __init__(self, judgment_api_key: str = os.getenv("JUDGMENT_API_KEY"), organization_id: str = os.getenv("JUDGMENT_ORG_ID")):
@@ -52,7 +57,24 @@ class JudgmentClient:
             raise JudgmentAPIError(f"Issue with passed in Judgment API key: {response}")
         else:
             print(f"Successfully initialized JudgmentClient, welcome back {response.get('detail', {}).get('user_name', 'user')}!")
-
+
+    def a_run_evaluation(
+        self,
+        examples: List[Example],
+        scorers: List[Union[ScorerWrapper, JudgevalScorer]],
+        model: Union[str, List[str], JudgevalJudge],
+        aggregator: Optional[str] = None,
+        metadata: Optional[Dict[str, Any]] = None,
+        log_results: bool = True,
+        project_name: str = "default_project",
+        eval_run_name: str = "default_eval_run",
+        override: bool = False,
+        use_judgment: bool = True,
+        ignore_errors: bool = True,
+        rules: Optional[List[Rule]] = None
+    ) -> List[ScoringResult]:
+        return self.run_evaluation(examples, scorers, model, aggregator, metadata, log_results, project_name, eval_run_name, override, use_judgment, ignore_errors, True, rules)
+
     def run_evaluation(
         self,
         examples: List[Example],
@@ -65,6 +87,8 @@ class JudgmentClient:
         eval_run_name: str = "default_eval_run",
         override: bool = False,
         use_judgment: bool = True,
+        ignore_errors: bool = True,
+        async_execution: bool = False,
         rules: Optional[List[Rule]] = None
     ) -> List[ScoringResult]:
         """
@@ -81,6 +105,7 @@ class JudgmentClient:
             eval_run_name (str): A name for this evaluation run
             override (bool): Whether to override an existing evaluation run with the same name
             use_judgment (bool): Whether to use Judgment API for evaluation
+            ignore_errors (bool): Whether to ignore errors during evaluation (safely handled)
             rules (Optional[List[Rule]]): Rules to evaluate against scoring results
 
         Returns:
@@ -141,7 +166,7 @@ class JudgmentClient:
                 rules=loaded_rules,
                 organization_id=self.organization_id
             )
-            return run_eval(eval, override)
+            return run_eval(eval, override, ignore_errors=ignore_errors, async_execution=async_execution)
         except ValueError as e:
             raise ValueError(f"Please check your EvaluationRun object, one or more fields are invalid: \n{str(e)}")
         except Exception as e:
@@ -324,19 +349,22 @@ class JudgmentClient:
             eval_run_result[0]["results"] = [ScoringResult(**filtered_result)]
         return eval_run_result
 
-    def delete_eval(self, project_name: str,
+    def delete_eval(self, project_name: str, eval_run_names: List[str]) -> bool:
         """
-        Deletes an evaluation from the server by project and run
+        Deletes an evaluation from the server by project and run names.
 
         Args:
             project_name (str): Name of the project
-
+            eval_run_names (List[str]): List of names of the evaluation runs
 
         Returns:
             bool: Whether the evaluation was successfully deleted
         """
-
-
+        if not eval_run_names:
+            raise ValueError("No evaluation run names provided")
+
+        eval_run_request_body = DeleteEvalRunRequestBody(project_name=project_name,
+                                                         eval_names=eval_run_names,
                                                          judgment_api_key=self.judgment_api_key)
         response = requests.delete(JUDGMENT_EVAL_DELETE_API_URL,
                                    json=eval_run_request_body.model_dump(),
@@ -345,9 +373,11 @@ class JudgmentClient:
                                        "Authorization": f"Bearer {self.judgment_api_key}",
                                        "X-Organization-Id": self.organization_id
                                    })
-        if response.status_code
+        if response.status_code == 404:
+            raise ValueError(f"Eval results not found: {response.json()}")
+        elif response.status_code == 500:
             raise ValueError(f"Error deleting eval results: {response.json()}")
-        return response.json()
+        return bool(response.json())
 
     def delete_project_evals(self, project_name: str) -> bool:
         """
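Putting the new client surface together: `run_evaluation` gains `ignore_errors` and `async_execution`, `a_run_evaluation` is a thin wrapper that forces the async path, and `delete_eval` now deletes by a list of run names. The sketch below is a hedged usage example; only `judgeval.judgment_client.JudgmentClient` is taken directly from the module layout above, while the `Example` and scorer import paths are assumptions.

```python
# Hedged usage sketch; Example/scorer import paths are assumptions for illustration.
from judgeval.judgment_client import JudgmentClient
from judgeval.data import Example                # assumed public path
from judgeval.scorers import FaithfulnessScorer  # assumed public path

client = JudgmentClient()  # reads JUDGMENT_API_KEY / JUDGMENT_ORG_ID from the environment

example = Example(input="What is the capital of France?", actual_output="Paris.")

# Queue the run server-side instead of waiting for results locally.
client.run_evaluation(
    examples=[example],
    scorers=[FaithfulnessScorer(threshold=0.7)],
    model="gpt-4o-mini",
    project_name="demo_project",
    eval_run_name="demo_run",
    ignore_errors=True,       # new: scorer errors are handled instead of raised
    async_execution=True,     # new: push the run onto the eval queue and return
)

# Evaluation runs can now be deleted in bulk, by name.
client.delete_eval("demo_project", ["demo_run", "old_run"])
```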
judgeval/rules.py
CHANGED
@@ -17,15 +17,6 @@ class AlertStatus(str, Enum):
     TRIGGERED = "triggered"
     NOT_TRIGGERED = "not_triggered"
 
-class Operator(str, Enum):
-    """Comparison operators for conditions."""
-    GT = ">"
-    GTE = ">="
-    LT = "<"
-    LTE = "<="
-    EQ = "=="
-    NEQ = "!="
-
 class Condition(BaseModel):
     """
     A single metric condition.
@@ -33,15 +24,13 @@ class Condition(BaseModel):
     Example:
     {
         "metric": FaithfulnessScorer(threshold=0.7) # Must be a scorer object: APIJudgmentScorer, JudgevalScorer, or ScorerWrapper
-        "operator": ">=",
-        "threshold": 0.7
     }
+
+    The Condition class uses the scorer's threshold and success function internally.
     """
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
-    metric: Union[APIJudgmentScorer, JudgevalScorer, ScorerWrapper]
-    operator: Operator
-    threshold: float
+    metric: Union[APIJudgmentScorer, JudgevalScorer, ScorerWrapper]
 
     @property
     def metric_name(self) -> str:
@@ -58,22 +47,60 @@ class Condition(BaseModel):
         # Fallback to string representation
         return str(self.metric)
 
+    @property
+    def threshold(self) -> float:
+        """Get the threshold from the metric."""
+        return self.metric.threshold if hasattr(self.metric, 'threshold') else 0.5
+
     def evaluate(self, value: float) -> bool:
-        """
-
-
-
-
-
-
-
-
-
-
-
+        """
+        Evaluate the condition against a value.
+        Returns True if the condition passes, False otherwise.
+        Uses the scorer's success check function if available.
+        """
+        # Store the value in the scorer
+        if hasattr(self.metric, 'score'):
+            self.metric.score = value
+
+        # Use the scorer's success check function if available
+        if hasattr(self.metric, 'success_check'):
+            return self.metric.success_check()
+        elif hasattr(self.metric, '_success_check'):
+            return self.metric._success_check()
         else:
-
+            # Fallback to default comparison (greater than or equal)
+            return value >= self.threshold if self.threshold is not None else False
+
+class NotificationConfig(BaseModel):
+    """
+    Configuration for notifications when a rule is triggered.
+
+    Example:
+    {
+        "enabled": true,
+        "communication_methods": ["email", "broadcast_slack", "broadcast_email"],
+        "email_addresses": ["user1@example.com", "user2@example.com"],
+        "send_at": 1632150000  # Unix timestamp (specific date/time)
+    }
+
+    Communication Methods:
+    - "email": Send emails to specified email addresses
+    - "broadcast_slack": Send broadcast notifications to all configured Slack channels
+    - "broadcast_email": Send broadcast emails to all organization emails
+    """
+    enabled: bool = True
+    communication_methods: List[str] = []
+    email_addresses: Optional[List[str]] = None
+    send_at: Optional[int] = None  # Unix timestamp for scheduled notifications
+
+    def model_dump(self, **kwargs):
+        """Convert the NotificationConfig to a dictionary for JSON serialization."""
+        return {
+            "enabled": self.enabled,
+            "communication_methods": self.communication_methods,
+            "email_addresses": self.email_addresses,
+            "send_at": self.send_at
+        }
 
 class Rule(BaseModel):
     """
@@ -85,10 +112,15 @@ class Rule(BaseModel):
     "name": "Quality Check",
     "description": "Check if quality metrics meet thresholds",
     "conditions": [
-        {"metric": FaithfulnessScorer(threshold=0.7)
-        {"metric": AnswerRelevancyScorer(threshold=0.8)
+        {"metric": FaithfulnessScorer(threshold=0.7)},
+        {"metric": AnswerRelevancyScorer(threshold=0.8)}
     ],
-    "combine_type": "all"  # "all" or "any"
+    "combine_type": "all",  # "all" or "any"
+    "notification": {
+        "enabled": true,
+        "communication_methods": ["slack", "email"],
+        "email_addresses": ["user1@example.com", "user2@example.com"]
+    }
     }
     """
     rule_id: str = Field(default_factory=lambda: str(uuid.uuid4()))  # Random UUID string as default value
@@ -96,6 +128,8 @@ class Rule(BaseModel):
     description: Optional[str] = None
     conditions: List[Condition]
     combine_type: str = Field(..., pattern="^(all|any)$")  # all = AND, any = OR
+    notification: Optional[NotificationConfig] = None  # Configuration for notifications
+
 
     def model_dump(self, **kwargs):
         """
@@ -168,7 +202,6 @@ class Rule(BaseModel):
             raise ValueError(f"combine_type must be 'all' or 'any', got: {v}")
         return v
 
-
 class AlertResult(BaseModel):
     """
     Result of evaluating a rule.
@@ -185,6 +218,11 @@ class AlertResult(BaseModel):
         "metadata": {
             "example_id": "example_123",
             "timestamp": "20240321_123456"
+        },
+        "notification": {
+            "enabled": true,
+            "communication_methods": ["slack", "email"],
+            "email_addresses": ["user1@example.com", "user2@example.com"]
         }
     }
     """
@@ -193,6 +231,7 @@ class AlertResult(BaseModel):
     rule_name: str
     conditions_result: List[Dict[str, Any]]
     metadata: Dict[str, Any] = {}
+    notification: Optional[NotificationConfig] = None  # Configuration for notifications
 
     @property
     def example_id(self) -> Optional[str]:
@@ -206,36 +245,105 @@ class AlertResult(BaseModel):
 
 class RulesEngine:
     """
-    Engine for evaluating rules
+    Engine for creating and evaluating rules against metrics.
 
-    Example
+    Example:
+    ```python
+    # Define rules
     rules = {
-        "
+        "1": Rule(
             name="Quality Check",
+            description="Check if quality metrics meet thresholds",
             conditions=[
-                Condition(metric=FaithfulnessScorer(threshold=0.7)
-                Condition(metric=AnswerRelevancyScorer(threshold=0.8)
+                Condition(metric=FaithfulnessScorer(threshold=0.7)),
+                Condition(metric=AnswerRelevancyScorer(threshold=0.8))
            ],
            combine_type="all"
        )
    }
 
+    # Create rules engine
    engine = RulesEngine(rules)
-
-
-
-    "
-
+
+    # Configure notifications
+    engine.configure_notification(
+        rule_id="1",
+        enabled=True,
+        communication_methods=["slack", "email"],
+        email_addresses=["user@example.com"]
+    )
+
+    # Evaluate rules
+    scores = {"faithfulness": 0.65, "relevancy": 0.85}
+    results = engine.evaluate_rules(scores, {"example_id": "example_123"})
+    ```
     """
 
     def __init__(self, rules: Dict[str, Rule]):
         """
-        Initialize the
+        Initialize the rules engine.
 
         Args:
-            rules: Dictionary mapping rule IDs to
+            rules: Dictionary mapping rule IDs to Rule objects
         """
         self.rules = rules
+
+    def configure_notification(self, rule_id: str, enabled: bool = True,
+                               communication_methods: List[str] = None,
+                               email_addresses: List[str] = None,
+                               send_at: Optional[int] = None) -> None:
+        """
+        Configure notification settings for a specific rule.
+
+        Args:
+            rule_id: ID of the rule to configure notifications for
+            enabled: Whether notifications are enabled for this rule
+            communication_methods: List of notification methods (e.g., ["slack", "email"])
+            email_addresses: List of email addresses to send notifications to
+            send_at: Optional Unix timestamp for when to send the notification
+        """
+        if rule_id not in self.rules:
+            raise ValueError(f"Rule ID '{rule_id}' not found")
+
+        rule = self.rules[rule_id]
+
+        # Create notification configuration if it doesn't exist
+        if rule.notification is None:
+            rule.notification = NotificationConfig()
+
+        # Set notification parameters
+        rule.notification.enabled = enabled
+
+        if communication_methods is not None:
+            rule.notification.communication_methods = communication_methods
+
+        if email_addresses is not None:
+            rule.notification.email_addresses = email_addresses
+
+        if send_at is not None:
+            rule.notification.send_at = send_at
+
+    def configure_all_notifications(self, enabled: bool = True,
+                                    communication_methods: List[str] = None,
+                                    email_addresses: List[str] = None,
+                                    send_at: Optional[int] = None) -> None:
+        """
+        Configure notification settings for all rules.
+
+        Args:
+            enabled: Whether notifications are enabled
+            communication_methods: List of notification methods (e.g., ["slack", "email"])
+            email_addresses: List of email addresses to send notifications to
+            send_at: Optional Unix timestamp for when to send the notification
+        """
+        for rule_id, rule in self.rules.items():
+            self.configure_notification(
+                rule_id=rule_id,
+                enabled=enabled,
+                communication_methods=communication_methods,
+                email_addresses=email_addresses,
+                send_at=send_at
+            )
 
     def evaluate_rules(self, scores: Dict[str, float], example_metadata: Optional[Dict[str, Any]] = None) -> Dict[str, AlertResult]:
         """
@@ -257,13 +365,13 @@ class RulesEngine:
             # Get the metric name for lookup
             metric_name = condition.metric_name
             value = scores.get(metric_name)
+
             if value is None:
                 # Skip this condition instead of evaluating it as false
                 condition_results.append({
                     "metric": metric_name,
                     "value": None,
                     "threshold": condition.threshold,
-                    "operator": condition.operator,
                     "passed": None,  # Using None to indicate the condition was skipped
                     "skipped": True  # Add a flag to indicate this condition was skipped
                 })
@@ -274,7 +382,6 @@ class RulesEngine:
                     "metric": metric_name,
                     "value": value,
                     "threshold": condition.threshold,
-                    "operator": condition.operator,
                     "passed": passed,
                     "skipped": False  # Indicate this condition was evaluated
                 })
@@ -285,23 +392,36 @@ class RulesEngine:
                 # If all conditions were skipped, the rule doesn't trigger
                 triggered = False
             else:
-
+                if rule.combine_type == "all":
+                    # For "all" combine_type:
+                    # - All evaluated conditions must pass
+                    # - All conditions must have been evaluated (none skipped)
+                    all_conditions_passed = all(passed_conditions)
+                    all_conditions_evaluated = len(passed_conditions) == len(rule.conditions)
+                    triggered = all_conditions_passed and all_conditions_evaluated
+                else:
+                    # For "any" combine_type, at least one condition must pass
+                    triggered = any(passed_conditions)
 
             # Create alert result with example metadata
+            notification_config = None
+            if triggered and rule.notification:
+                # If rule has a notification config and the alert is triggered, include it in the result
+                notification_config = rule.notification
+
+            # Set the alert status based on whether the rule was triggered
+            status = AlertStatus.TRIGGERED if triggered else AlertStatus.NOT_TRIGGERED
+
+            # Create the alert result
            alert_result = AlertResult(
-                status=
-                rule_id=rule.rule_id,
+                status=status,
+                rule_id=rule.rule_id,
                rule_name=rule.name,
-                conditions_result=condition_results
+                conditions_result=condition_results,
+                notification=notification_config,
+                metadata=example_metadata or {}
            )
 
-            # Add example metadata if provided
-            if example_metadata:
-                if "example_id" in example_metadata:
-                    alert_result.metadata["example_id"] = example_metadata["example_id"]
-                if "timestamp" in example_metadata:
-                    alert_result.metadata["timestamp"] = example_metadata["timestamp"]
-
            results[rule_id] = alert_result
 
        return results
@@ -376,7 +496,4 @@ class RulesEngine:
         )
         end_time = time.perf_counter()
 
-        # Could log performance metrics here if needed
-        # debug(f"Rule evaluation for example {example_id} took {end_time - start_time:.4f} seconds")
-
         return (example_id, rule_results)
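A usage sketch assembled from the docstrings in this diff: a `Condition` now carries only a scorer (the threshold and pass/fail check come from the scorer itself, replacing the removed `Operator` enum), and notifications are configured per rule. The `judgeval.rules` import matches the module path above; the scorer import path is an assumption.

```python
# Usage sketch based on the docstrings in this file; scorer import path is an assumption.
from judgeval.rules import Rule, Condition, RulesEngine
from judgeval.scorers import FaithfulnessScorer, AnswerRelevancyScorer  # assumed path

rules = {
    "1": Rule(
        name="Quality Check",
        description="Check if quality metrics meet thresholds",
        conditions=[
            Condition(metric=FaithfulnessScorer(threshold=0.7)),
            Condition(metric=AnswerRelevancyScorer(threshold=0.8)),
        ],
        combine_type="all",  # every evaluated condition must pass, and none may be skipped
    )
}

engine = RulesEngine(rules)
engine.configure_notification(
    rule_id="1",
    enabled=True,
    communication_methods=["slack", "email"],
    email_addresses=["user@example.com"],
)

results = engine.evaluate_rules(
    {"faithfulness": 0.65, "relevancy": 0.85},
    {"example_id": "example_123"},
)
alert = results["1"]
print(alert.status, alert.notification)  # rule not triggered here, so no notification payload is attached
```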
judgeval/run_evaluation.py
CHANGED
@@ -23,17 +23,35 @@ from judgeval.constants import (
     ROOT_API,
     JUDGMENT_EVAL_API_URL,
     JUDGMENT_EVAL_LOG_API_URL,
-    MAX_CONCURRENT_EVALUATIONS
+    MAX_CONCURRENT_EVALUATIONS,
+    JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL
 )
 from judgeval.common.exceptions import JudgmentAPIError
-from judgeval.evaluation_run import EvaluationRun
 from judgeval.common.logger import (
     debug,
     info,
     error,
     example_logging_context
 )
+from judgeval.evaluation_run import EvaluationRun
+
 
+def send_to_rabbitmq(evaluation_run: EvaluationRun) -> None:
+    """
+    Sends an evaluation run to the RabbitMQ evaluation queue.
+    """
+    payload = evaluation_run.model_dump(warnings=False)
+    response = requests.post(
+        JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL,
+        headers={
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {evaluation_run.judgment_api_key}",
+            "X-Organization-Id": evaluation_run.organization_id
+        },
+        json=payload,
+        verify=True
+    )
+    return response.json()
 
 def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
     """
@@ -51,13 +69,15 @@ def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
         # submit API request to execute evals
         payload = evaluation_run.model_dump(warnings=False)
         response = requests.post(
-            JUDGMENT_EVAL_API_URL,
-
-
-
-
-
+            JUDGMENT_EVAL_API_URL,
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {evaluation_run.judgment_api_key}",
+                "X-Organization-Id": evaluation_run.organization_id
+            },
+            json=payload,
+            verify=True
+        )
         response_data = response.json()
     except Exception as e:
         error(f"Error: {e}")
@@ -281,13 +301,14 @@ def check_examples(examples: List[Example], scorers: List[APIJudgmentScorer]) ->
                 # Example ID (usually random UUID) does not provide any helpful information for the user but printing the entire example is overdoing it
                 print(f"WARNING: Example {example.example_id} is missing the following parameters: {missing_params} for scorer {scorer.score_type.value}")
 
-
-def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[ScoringResult]:
+def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_errors: bool = True, async_execution: bool = False) -> List[ScoringResult]:
     """
     Executes an evaluation of `Example`s using one or more `Scorer`s
 
     Args:
         evaluation_run (EvaluationRun): Stores example and evaluation together for running
+        override (bool, optional): Whether to override existing evaluation run with same name. Defaults to False.
+        ignore_errors (bool, optional): Whether to ignore scorer errors during evaluation. Defaults to True.
 
     Args:
         project_name (str): The name of the project the evaluation results belong to
@@ -354,101 +375,117 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
 
     api_results: List[ScoringResult] = []
     local_results: List[ScoringResult] = []
-
-
-    if judgment_scorers:
+
+    if async_execution:
         check_examples(evaluation_run.examples, evaluation_run.scorers)
-        info("Starting
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        info("Starting async evaluation")
+        payload = evaluation_run.model_dump(warnings=False)
+        requests.post(
+            JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL,
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {evaluation_run.judgment_api_key}",
+                "X-Organization-Id": evaluation_run.organization_id
+            },
+            json=payload,
+            verify=True
+        )
+        print("Successfully added evaluation to queue")
+    else:
+        if judgment_scorers:
+            # Execute evaluation using Judgment API
+            check_examples(evaluation_run.examples, evaluation_run.scorers)
+            info("Starting API evaluation")
+            debug(f"Creating API evaluation run with {len(judgment_scorers)} scorers")
+            try:  # execute an EvaluationRun with just JudgmentScorers
+                api_evaluation_run: EvaluationRun = EvaluationRun(
+                    eval_name=evaluation_run.eval_name,
+                    project_name=evaluation_run.project_name,
+                    examples=evaluation_run.examples,
+                    scorers=judgment_scorers,
+                    model=evaluation_run.model,
+                    aggregator=evaluation_run.aggregator,
+                    metadata=evaluation_run.metadata,
+                    judgment_api_key=evaluation_run.judgment_api_key,
+                    organization_id=evaluation_run.organization_id,
+                    log_results=evaluation_run.log_results,
+                    rules=evaluation_run.rules
+                )
+                debug("Sending request to Judgment API")
+                response_data: List[Dict] = run_with_spinner("Running Evaluation: ", execute_api_eval, api_evaluation_run)
+                info(f"Received {len(response_data['results'])} results from API")
+            except JudgmentAPIError as e:
+                error(f"An error occurred while executing the Judgment API request: {str(e)}")
+                raise JudgmentAPIError(f"An error occurred while executing the Judgment API request: {str(e)}")
+            except ValueError as e:
+                raise ValueError(f"Please check your EvaluationRun object, one or more fields are invalid: {str(e)}")
+
+            # Convert the response data to `ScoringResult` objects
+            debug("Processing API results")
+            for idx, result in enumerate(response_data["results"]):
+                with example_logging_context(evaluation_run.examples[idx].timestamp, evaluation_run.examples[idx].example_id):
+                    for scorer in judgment_scorers:
+                        debug(f"Processing API result for example {idx} and scorer {scorer.score_type}")
+                # filter for key-value pairs that are used to initialize ScoringResult
+                # there may be some stuff in here that doesn't belong in ScoringResult
+                # TODO: come back and refactor this to have ScoringResult take in **kwargs
+                filtered_result = {k: v for k, v in result.items() if k in ScoringResult.__annotations__}
+
+                # Convert scorers_data dicts to ScorerData objects
+                if "scorers_data" in filtered_result and filtered_result["scorers_data"]:
+                    filtered_result["scorers_data"] = [
+                        ScorerData(**scorer_dict)
+                        for scorer_dict in filtered_result["scorers_data"]
+                    ]
+
+                api_results.append(ScoringResult(**filtered_result))
+        # Run local evals
+        if local_scorers:  # List[JudgevalScorer]
+            # We should be removing local scorers soon
+            info("Starting local evaluation")
+            for example in evaluation_run.examples:
+                with example_logging_context(example.timestamp, example.example_id):
+                    debug(f"Processing example {example.example_id}: {example.input}")
+
+            results: List[ScoringResult] = asyncio.run(
+                a_execute_scoring(
+                    evaluation_run.examples,
+                    local_scorers,
+                    model=evaluation_run.model,
+                    ignore_errors=ignore_errors,
+                    skip_on_missing_params=True,
+                    show_indicator=True,
+                    _use_bar_indicator=True,
+                    throttle_value=0,
+                    max_concurrent=MAX_CONCURRENT_EVALUATIONS,
+                )
             )
-
-
-
-
-
-
-
-
+            local_results = results
+            info(f"Local evaluation complete with {len(local_results)} results")
+        # Aggregate the ScorerData from the API and local evaluations
+        debug("Merging API and local results")
+        merged_results: List[ScoringResult] = merge_results(api_results, local_results)
+        merged_results = check_missing_scorer_data(merged_results)
+
+        info(f"Successfully merged {len(merged_results)} results")
+
+        # Evaluate rules against local scoring results if rules exist (this cant be done just yet)
+        # if evaluation_run.rules and merged_results:
+        #     run_rules(
+        #         local_results=merged_results,
+        #         rules=evaluation_run.rules,
+        #         judgment_api_key=evaluation_run.judgment_api_key,
+        #         organization_id=evaluation_run.organization_id
+        #     )
 
-
-
-
-
-
-
-
-
-            # TODO: come back and refactor this to have ScoringResult take in **kwargs
-            filtered_result = {k: v for k, v in result.items() if k in ScoringResult.__annotations__}
-
-            # Convert scorers_data dicts to ScorerData objects
-            if "scorers_data" in filtered_result and filtered_result["scorers_data"]:
-                filtered_result["scorers_data"] = [
-                    ScorerData(**scorer_dict)
-                    for scorer_dict in filtered_result["scorers_data"]
-                ]
-
-            api_results.append(ScoringResult(**filtered_result))
-    # Run local evals
-    if local_scorers: # List[JudgevalScorer]
-        # We should be removing local scorers soon
-        info("Starting local evaluation")
-        for example in evaluation_run.examples:
-            with example_logging_context(example.timestamp, example.example_id):
-                debug(f"Processing example {example.example_id}: {example.input}")
-
-        results: List[ScoringResult] = asyncio.run(
-            a_execute_scoring(
-                evaluation_run.examples,
-                local_scorers,
-                model=evaluation_run.model,
-                ignore_errors=True,
-                skip_on_missing_params=True,
-                show_indicator=True,
-                _use_bar_indicator=True,
-                throttle_value=0,
-                max_concurrent=MAX_CONCURRENT_EVALUATIONS,
-            )
-        )
-        local_results = results
-        info(f"Local evaluation complete with {len(local_results)} results")
-    # Aggregate the ScorerData from the API and local evaluations
-    debug("Merging API and local results")
-    merged_results: List[ScoringResult] = merge_results(api_results, local_results)
-    merged_results = check_missing_scorer_data(merged_results)
-
-    info(f"Successfully merged {len(merged_results)} results")
-
-    # Evaluate rules against local scoring results if rules exist (this cant be done just yet)
-    # if evaluation_run.rules and merged_results:
-    #     run_rules(
-    #         local_results=merged_results,
-    #         rules=evaluation_run.rules,
-    #         judgment_api_key=evaluation_run.judgment_api_key,
-    #         organization_id=evaluation_run.organization_id
-    #     )
-
-    if evaluation_run.log_results:
-        pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, merged_results, evaluation_run)
-        rprint(pretty_str)
-
-    for i, result in enumerate(merged_results):
-        if not result.scorers_data: # none of the scorers could be executed on this example
-            info(f"None of the scorers could be executed on example {i}. This is usually because the Example is missing the fields needed by the scorers. Try checking that the Example has the necessary fields for your scorers.")
-    return merged_results
+        if evaluation_run.log_results:
+            pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, merged_results, evaluation_run)
+            rprint(pretty_str)
+
+        for i, result in enumerate(merged_results):
+            if not result.scorers_data:  # none of the scorers could be executed on this example
+                info(f"None of the scorers could be executed on example {i}. This is usually because the Example is missing the fields needed by the scorers. Try checking that the Example has the necessary fields for your scorers.")
+        return merged_results
 
 def assert_test(scoring_results: List[ScoringResult]) -> None:
     """
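When `async_execution=True`, the run is not scored locally at all; the serialized `EvaluationRun` is posted to the run-eval queue endpoint with bearer auth and an organization header. Below is a sketch of that request shape. The header names and the URL constant are taken from the diff; the credentials are placeholders and the payload is a trimmed stand-in (a real call sends `EvaluationRun.model_dump(warnings=False)`).

```python
# Sketch of the queue submission performed for async_execution=True (placeholder credentials).
import requests
from judgeval.constants import JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL

payload = {"eval_name": "demo_run", "project_name": "demo_project"}  # illustrative subset only

response = requests.post(
    JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL,
    headers={
        "Content-Type": "application/json",
        "Authorization": "Bearer <JUDGMENT_API_KEY>",   # placeholder
        "X-Organization-Id": "<JUDGMENT_ORG_ID>",       # placeholder
    },
    json=payload,
    verify=True,
)
response.raise_for_status()
```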
judgeval/scorers/score.py
CHANGED
@@ -274,15 +274,16 @@ async def a_execute_scoring(
     semaphore = asyncio.Semaphore(max_concurrent)
 
     async def execute_with_semaphore(func: Callable, *args, **kwargs):
-
-
+        async with semaphore:
+            try:
                 return await func(*args, **kwargs)
-
-
-
-
-
-
+            except Exception as e:
+                print(f"Error executing function: {e}")
+                if kwargs.get('ignore_errors', False):
+                    # Simply return None when ignoring errors, as expected by the test
+                    return None
+                # If we're not ignoring errors, propagate the exception
+                raise
 
     if verbose_mode is not None:
         for scorer in scorers:
@@ -391,6 +392,7 @@ async def a_eval_examples_helper(
     Returns:
         None
     """
+
     show_metrics_indicator = show_indicator and not _use_bar_indicator
 
     for scorer in scorers:
@@ -416,12 +418,15 @@ async def a_eval_examples_helper(
             continue
         scorer_data = create_scorer_data(scorer)  # Fetch scorer data from completed scorer evaluation
         process_example.update_scorer_data(scorer_data)  # Update process example with the same scorer data
-
+
     test_end_time = time.perf_counter()
     run_duration = test_end_time - scoring_start_time
 
     process_example.update_run_duration(run_duration)  # Update process example with execution time duration
-
-
+
+    # Generate the scoring result and store it safely (to avoid race conditions)
+    result = generate_scoring_result(process_example)
+    scoring_results[score_index] = result
+
     if pbar is not None:
         pbar.update(1)
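The semaphore wrapper above bounds scorer concurrency and, when `ignore_errors` is set, converts failures into `None` results instead of aborting the whole batch. Here is a self-contained asyncio sketch of the same pattern, with no judgeval imports; `flaky` and the literal values are made up for the demonstration.

```python
# Standalone sketch of the bounded-concurrency, error-tolerant execution pattern.
import asyncio

async def flaky(i: int) -> int:
    if i == 2:
        raise RuntimeError("scorer failed")
    await asyncio.sleep(0.01)
    return i * i

async def main(max_concurrent: int = 2, ignore_errors: bool = True):
    semaphore = asyncio.Semaphore(max_concurrent)

    async def execute_with_semaphore(func, *args, **kwargs):
        async with semaphore:                     # at most max_concurrent tasks run at once
            try:
                return await func(*args, **kwargs)
            except Exception as exc:
                print(f"Error executing function: {exc}")
                if ignore_errors:
                    return None                   # swallow the failure, keep the batch going
                raise

    tasks = [execute_with_semaphore(flaky, i) for i in range(5)]
    return await asyncio.gather(*tasks)

print(asyncio.run(main()))  # [0, 1, None, 9, 16]
```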
judgeval/utils/alerts.py
CHANGED
@@ -40,4 +40,35 @@ class AlertResult(BaseModel):
     @property
     def conditions_results(self) -> List[Dict[str, Any]]:
         """Backwards compatibility property for the conditions_result field"""
-        return self.conditions_result
+        return self.conditions_result
+
+    def model_dump(self, **kwargs):
+        """
+        Convert the AlertResult to a dictionary for JSON serialization.
+
+        Args:
+            **kwargs: Additional arguments to pass to Pydantic's model_dump
+
+        Returns:
+            dict: Dictionary representation of the AlertResult
+        """
+        data = super().model_dump(**kwargs) if hasattr(super(), "model_dump") else super().dict(**kwargs)
+
+        # Handle the NotificationConfig object if it exists
+        if hasattr(self, "notification") and self.notification is not None:
+            if hasattr(self.notification, "model_dump"):
+                data["notification"] = self.notification.model_dump()
+            elif hasattr(self.notification, "dict"):
+                data["notification"] = self.notification.dict()
+            else:
+                # Manually convert the notification to a dictionary
+                notif = self.notification
+                data["notification"] = {
+                    "enabled": notif.enabled,
+                    "communication_methods": notif.communication_methods,
+                    "email_addresses": notif.email_addresses,
+                    "slack_channels": getattr(notif, "slack_channels", []),
+                    "send_at": notif.send_at
+                }
+
+        return data
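The new `model_dump` defends against both pydantic v1 and v2 objects and serializes the nested notification config explicitly. Below is a standalone sketch of that fallback pattern using small stand-in models (not the judgeval classes), so the behaviour can be run and inspected in isolation.

```python
# Standalone sketch of the defensive serialization pattern, with stand-in models.
from typing import List, Optional
from pydantic import BaseModel

class Notification(BaseModel):
    enabled: bool = True
    email_addresses: Optional[List[str]] = None

class Alert(BaseModel):
    rule_name: str
    notification: Optional[Notification] = None

    def model_dump(self, **kwargs):
        # Prefer pydantic v2's model_dump, fall back to v1's dict(), then serialize
        # the nested notification explicitly so it always round-trips as a plain dict.
        data = super().model_dump(**kwargs) if hasattr(super(), "model_dump") else super().dict(**kwargs)
        if self.notification is not None:
            notif = self.notification
            data["notification"] = (
                notif.model_dump() if hasattr(notif, "model_dump") else notif.dict()
            )
        return data

alert = Alert(rule_name="Quality Check", notification=Notification(email_addresses=["user@example.com"]))
print(alert.model_dump())
```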
{judgeval-0.0.24.dist-info → judgeval-0.0.26.dist-info}/RECORD
CHANGED
@@ -1,14 +1,14 @@
 judgeval/__init__.py,sha256=dtXxsCmI4eEsZdGSUMy8P_pA0bc2-OSGAgb2C__yJoA,252
 judgeval/clients.py,sha256=6VQmEqmfCngUdS2MuPBIpHvtDFqOENm8-_BmMvjLyRQ,944
-judgeval/constants.py,sha256=
+judgeval/constants.py,sha256=iTUro5SdXcYX00W18l32zL_EEEqHf5OT9uA5yZAme_s,5158
 judgeval/evaluation_run.py,sha256=RgJD60lJsunNQzObjo7iXnAzXWgubCLOAAuuamAAuoI,6354
-judgeval/judgment_client.py,sha256=
-judgeval/rules.py,sha256=
-judgeval/run_evaluation.py,sha256=
+judgeval/judgment_client.py,sha256=2z134M0GeW3CdOZDx688UXmqJUlU31hlcFlLwUhF_Tg,25429
+judgeval/rules.py,sha256=B0ZL0pn72D4Jnlr0zMQ6CPHi7D8AQQRariXCVsiCMiI,20542
+judgeval/run_evaluation.py,sha256=8FZ-shJ0120iTuT2S1rXzmVcoIHPsFPb0THTGOtKoHM,25772
 judgeval/common/__init__.py,sha256=7d24BRxtncpMj3AAJCj8RS7TqgjXmW777HVZH6-3sBs,289
 judgeval/common/exceptions.py,sha256=U-TxHLn7oVMezsMuoYouNDb2XuS8RCggfntYf5_6u4E,565
 judgeval/common/logger.py,sha256=KO75wWXCxhUHUMvLaTU31ZzOk6tkZBa7heQ7y0f-zFE,6062
-judgeval/common/tracer.py,sha256=
+judgeval/common/tracer.py,sha256=Z87Q3pQrtfHYvE1vsTMdIUfR-iz_IM8dqvW9VwVdtMQ,42434
 judgeval/common/utils.py,sha256=LUQV5JfDr6wj7xHAJoNq-gofNZ6mjXbeKrGKzBME1KM,33533
 judgeval/data/__init__.py,sha256=YferxwmUqoBi18hrdgro0BD0h4pt20LAqISeUzGMcVU,474
 judgeval/data/api_example.py,sha256=dzkrQ0xno08y6qNfqL2djXbapUyc2B2aQ5iANn0o4CY,3667
@@ -16,9 +16,9 @@ judgeval/data/example.py,sha256=BhGBhamFWgH6wtvrRYM8dGtDfXh-cDxDhtNL5Gbdz_M,5892
 judgeval/data/result.py,sha256=4fgjKtUmT3br7K6fkRiNIxTGKUuwMeGyRLqzkpxwXKE,4436
 judgeval/data/scorer_data.py,sha256=JVlaTx1EP2jw2gh3Vgx1CSEsvIFABAN26IquKyxwiJQ,3273
 judgeval/data/datasets/__init__.py,sha256=IdNKhQv9yYZ_op0rdBacrFaFVmiiYQ3JTzXzxOTsEVQ,176
-judgeval/data/datasets/dataset.py,sha256=
+judgeval/data/datasets/dataset.py,sha256=AFYjksV_wXx5CqFYJsl3aN8yZ6hC50O1myRuOJ8s8_E,12867
 judgeval/data/datasets/eval_dataset_client.py,sha256=B4bRy0Di2oFlaBbvp4_hRx2g_9e6Cs0y3ZUT9reMyhw,10926
-judgeval/integrations/langgraph.py,sha256=
+judgeval/integrations/langgraph.py,sha256=fGDZOTlVbxTO4ErC-m9OSg3h-RkOIIWXCfhjgkKRh4E,11187
 judgeval/judges/__init__.py,sha256=6X7VSwrwsdxGBNxCyapVRWGghhKOy3MVxFNMQ62kCXM,308
 judgeval/judges/base_judge.py,sha256=ch_S7uBB7lyv44Lf1d7mIGFpveOO58zOkkpImKgd9_4,994
 judgeval/judges/litellm_judge.py,sha256=EIL58Teptv8DzZUO3yP2RDQCDq-aoBB6HPZzPdK6KTg,2424
@@ -31,7 +31,7 @@ judgeval/scorers/base_scorer.py,sha256=xdUlY3CnLdCQ1Z5iUeY22Bim5v-OQruZmaVF_4Y1m
 judgeval/scorers/exceptions.py,sha256=eGW5CuJgZ5YJBFrE4FHDSF651PO1dKAZ379mJ8gOsfo,178
 judgeval/scorers/judgeval_scorer.py,sha256=jq_rzfTG0XBTuLCaa6TlaK4YcT-LlgsO1LEm6hpOYdg,6601
 judgeval/scorers/prompt_scorer.py,sha256=PaAs2qRolw1P3_I061Xvk9qzvF4O-JR8g_39RqXnHcM,17728
-judgeval/scorers/score.py,sha256=
+judgeval/scorers/score.py,sha256=PhyAyMkc7KO_DZpFSN1HD_FS3BvdleQPZhYvQkNAdxI,18816
 judgeval/scorers/utils.py,sha256=iHQVTlIANbmCTXz9kTeSdOytgUZ_T74Re61ajqsk_WQ,6827
 judgeval/scorers/judgeval_scorers/__init__.py,sha256=xFRb62sp4JmBUSeuAB_pC_7kEGp-lGdqCRIu9--Bbdg,5992
 judgeval/scorers/judgeval_scorers/api_scorers/__init__.py,sha256=mZ6b_5Dl04k3PaG24ICBajB_j43ody1II1OJhO1DkXo,1648
@@ -86,8 +86,8 @@ judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.p
 judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py,sha256=6GnRz2h-6Fwt4sl__0RgQOyo3n3iDO4MNuHWxdu-rrM,10242
 judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py,sha256=Qk7lwHgRPYeGoxTOyclAh1VfGItfvHJ6l1t7Nk3SWFM,20927
 judgeval/tracer/__init__.py,sha256=wy3DYpH8U_z0GO_K_gOSkK0tTTD-u5eLDo0T5xIBoAc,147
-judgeval/utils/alerts.py,sha256=
-judgeval-0.0.
-judgeval-0.0.
-judgeval-0.0.
-judgeval-0.0.
+judgeval/utils/alerts.py,sha256=O19Xj7DA0YVjl8PWiuH4zfdZeu3yiLVvHfY8ah2wG0g,2759
+judgeval-0.0.26.dist-info/METADATA,sha256=rhTpfY5GRclxtkkXU4RrUj1ckpuxd2xsgF53oQyK6qo,5418
+judgeval-0.0.26.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+judgeval-0.0.26.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
+judgeval-0.0.26.dist-info/RECORD,,
{judgeval-0.0.24.dist-info → judgeval-0.0.26.dist-info}/WHEEL
File without changes
{judgeval-0.0.24.dist-info → judgeval-0.0.26.dist-info}/licenses/LICENSE.md
File without changes