judgeval 0.0.19__py3-none-any.whl → 0.0.21__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/tracer.py +65 -2
- judgeval/constants.py +2 -1
- judgeval/data/api_example.py +3 -16
- judgeval/data/datasets/dataset.py +114 -2
- judgeval/data/example.py +16 -15
- judgeval/data/result.py +3 -3
- judgeval/judgment_client.py +20 -3
- judgeval/run_evaluation.py +62 -8
- judgeval/scorers/__init__.py +2 -2
- judgeval/scorers/api_scorer.py +3 -1
- judgeval/scorers/judgeval_scorers/__init__.py +6 -6
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +10 -2
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +9 -2
- judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +10 -2
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +11 -2
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +11 -2
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +10 -3
- judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +43 -0
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +10 -2
- judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +10 -2
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +10 -2
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +9 -2
- judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +9 -2
- judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +10 -3
- judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +2 -2
- judgeval/scorers/judgeval_scorers/local_implementations/execution_order/__init__.py +3 -0
- judgeval/scorers/judgeval_scorers/local_implementations/{tool_correctness/tool_correctness_scorer.py → execution_order/execution_order.py} +3 -3
- {judgeval-0.0.19.dist-info → judgeval-0.0.21.dist-info}/METADATA +7 -3
- {judgeval-0.0.19.dist-info → judgeval-0.0.21.dist-info}/RECORD +32 -32
- judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py +0 -19
- judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py +0 -3
- {judgeval-0.0.19.dist-info → judgeval-0.0.21.dist-info}/WHEEL +0 -0
- {judgeval-0.0.19.dist-info → judgeval-0.0.21.dist-info}/licenses/LICENSE.md +0 -0
judgeval/common/tracer.py
CHANGED
@@ -10,7 +10,9 @@ import os
 import time
 import uuid
 import warnings
+from contextvars import ContextVar
 from contextlib import contextmanager
+from collections import defaultdict
 from dataclasses import dataclass, field
 from datetime import datetime
 from http import HTTPStatus
@@ -36,6 +38,7 @@ from judgeval.constants import (
     RABBITMQ_PORT,
     RABBITMQ_QUEUE,
     JUDGMENT_TRACES_DELETE_API_URL,
+    JUDGMENT_PROJECT_DELETE_API_URL,
     JUDGMENT_TRACES_ADD_TO_EVAL_QUEUE_API_URL
 )
 from judgeval.judgment_client import JudgmentClient
@@ -53,7 +56,7 @@ from langchain_core.utils.function_calling import convert_to_openai_tool
 from langchain_core.callbacks import CallbackManager, BaseCallbackHandler
 from langchain_core.agents import AgentAction, AgentFinish
 from langchain_core.outputs import LLMResult
-
+from langchain_core.tracers.context import register_configure_hook
 from langchain_core.messages.ai import AIMessage
 from langchain_core.messages.tool import ToolMessage
 from langchain_core.messages.base import BaseMessage
@@ -250,7 +253,8 @@ class TraceManagerClient:
             raise ValueError(f"Failed to save trace data: {response.text}")
 
         if not empty_save and "ui_results_url" in response.json():
-
+            pretty_str = f"\n🔍 You can view your trace data here: [rgb(106,0,255)][link={response.json()['ui_results_url']}]View Trace[/link]\n"
+            rprint(pretty_str)
 
     def delete_trace(self, trace_id: str):
         """
@@ -293,6 +297,27 @@ class TraceManagerClient:
             raise ValueError(f"Failed to delete trace: {response.text}")
 
         return response.json()
+
+    def delete_project(self, project_name: str):
+        """
+        Deletes a project from the server. Which also deletes all evaluations and traces associated with the project.
+        """
+        response = requests.delete(
+            JUDGMENT_PROJECT_DELETE_API_URL,
+            json={
+                "project_name": project_name,
+            },
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {self.judgment_api_key}",
+                "X-Organization-Id": self.organization_id
+            }
+        )
+
+        if response.status_code != HTTPStatus.OK:
+            raise ValueError(f"Failed to delete traces: {response.text}")
+
+        return response.json()
 
 
 class TraceClient:
@@ -962,6 +987,10 @@ def _format_output_data(client: ApiClient, response: Any) -> dict:
 class JudgevalCallbackHandler(BaseCallbackHandler):
     def __init__(self, trace_client: TraceClient):
         self.trace_client = trace_client
+        self.previous_node = "__start__"
+        self.executed_node_tools = []
+        self.executed_nodes = []
+        self.executed_tools = []
         self.openai_count = 1
 
     def start_span(self, name: str, span_type: SpanType = "span"):
@@ -1049,6 +1078,23 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         # End the retriever span
         self.end_span(self.trace_client._current_span, span_type="retriever")
 
+    def on_chain_start(
+        self,
+        serialized: Dict[str, Any],
+        inputs: Dict[str, Any],
+        *,
+        run_id: UUID,
+        parent_run_id: Optional[UUID] = None,
+        tags: Optional[List[str]] = None,
+        metadata: Optional[Dict[str, Any]] = None,
+        **kwargs: Any
+    ) -> None:
+        node = metadata.get("langgraph_node")
+        if node != None and node != "__start__" and node != self.previous_node:
+            self.executed_node_tools.append(node)
+            self.executed_nodes.append(node)
+        self.previous_node = node
+
     def on_tool_start(
         self,
         serialized: Optional[dict[str, Any]],
@@ -1060,6 +1106,8 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
     ):
         name = serialized["name"]
         self.start_span(name, span_type="tool")
+        self.executed_node_tools.append(f"{self.previous_node}:{name}")
+        self.executed_tools.append(name)
        self.trace_client.record_input({
             'args': input_str,
             'kwargs': kwargs
@@ -1128,3 +1176,18 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
             'args': str(messages),
             'kwargs': kwargs
         })
+
+judgeval_callback_handler_var: ContextVar[Optional[JudgevalCallbackHandler]] = ContextVar(
+    "judgeval_callback_handler", default=None
+)
+
+def set_global_handler(handler: JudgevalCallbackHandler):
+    judgeval_callback_handler_var.set(handler)
+
+def clear_global_handler():
+    judgeval_callback_handler_var.set(None)
+
+register_configure_hook(
+    context_var=judgeval_callback_handler_var,
+    inheritable=True,
+)
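
The tracer changes above add LangGraph-aware bookkeeping (executed_nodes, executed_tools, executed_node_tools) and a process-wide handler registered through langchain_core's register_configure_hook. A minimal usage sketch, not part of the diff; the graph, payload, and trace_client arguments are assumed to come from your own setup:

from judgeval.common.tracer import (
    JudgevalCallbackHandler,
    set_global_handler,
    clear_global_handler,
)

def run_graph_with_tracing(graph, payload, trace_client):
    # Hedged sketch: only the three imports above come from this diff.
    handler = JudgevalCallbackHandler(trace_client)
    set_global_handler(handler)  # picked up by LangChain via register_configure_hook,
                                 # no explicit callbacks=[...] argument needed
    try:
        result = graph.invoke(payload)
    finally:
        clear_global_handler()
    print("nodes visited:", handler.executed_nodes)          # recorded by on_chain_start
    print("node:tool pairs:", handler.executed_node_tools)   # recorded by on_tool_start
    return result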
judgeval/constants.py
CHANGED
@@ -22,7 +22,7 @@ class APIScorer(str, Enum):
     CONTEXTUAL_RELEVANCY = "contextual_relevancy"
     CONTEXTUAL_PRECISION = "contextual_precision"
     INSTRUCTION_ADHERENCE = "instruction_adherence"
-    TOOL_CORRECTNESS = "tool_correctness"
+    EXECUTION_ORDER = "execution_order"
     JSON_CORRECTNESS = "json_correctness"
     COMPARISON = "comparison"
     GROUNDEDNESS = "groundedness"
@@ -48,6 +48,7 @@ JUDGMENT_EVAL_LOG_API_URL = f"{ROOT_API}/log_eval_results/"
 JUDGMENT_EVAL_FETCH_API_URL = f"{ROOT_API}/fetch_eval_results/"
 JUDGMENT_EVAL_DELETE_API_URL = f"{ROOT_API}/delete_eval_results_by_project_and_run_name/"
 JUDGMENT_EVAL_DELETE_PROJECT_API_URL = f"{ROOT_API}/delete_eval_results_by_project/"
+JUDGMENT_PROJECT_DELETE_API_URL = f"{ROOT_API}/projects/delete/"
 JUDGMENT_TRACES_FETCH_API_URL = f"{ROOT_API}/traces/fetch/"
 JUDGMENT_TRACES_SAVE_API_URL = f"{ROOT_API}/traces/save/"
 JUDGMENT_TRACES_DELETE_API_URL = f"{ROOT_API}/traces/delete/"
judgeval/data/api_example.py
CHANGED
@@ -1,4 +1,4 @@
-from typing import List, Optional, Dict, Any
+from typing import List, Optional, Dict, Any, Union
 from pydantic import BaseModel, ConfigDict, model_validator
 
 from judgeval.data.example import Example
@@ -13,8 +13,8 @@ class ProcessExample(BaseModel):
     """
     name: str
     input: Optional[str] = None
-    actual_output: Optional[str] = None
-    expected_output: Optional[str] = None
+    actual_output: Optional[Union[str, List[str]]] = None
+    expected_output: Optional[Union[str, List[str]]] = None
     context: Optional[list] = None
     retrieval_context: Optional[list] = None
     tools_called: Optional[list] = None
@@ -57,19 +57,6 @@ class ProcessExample(BaseModel):
 
     def update_run_duration(self, run_duration: float):
         self.run_duration = run_duration
-
-    @model_validator(mode="before")
-    def check_input(cls, values: Dict[str, Any]):
-        input = values.get("input")
-        actual_output = values.get("actual_output")
-
-        if (input is None or actual_output is None):
-            error(f"Validation error: Required fields missing. input={input}, actual_output={actual_output}")
-            raise ValueError(
-                "'input' and 'actual_output' must be provided."
-            )
-
-        return values
 
 
 def create_process_example(

judgeval/data/datasets/dataset.py
CHANGED
@@ -3,6 +3,7 @@ import csv
 import datetime
 import json
 import os
+import yaml
 from dataclasses import dataclass, field
 from typing import List, Union, Literal
 
@@ -190,6 +191,76 @@ class EvalDataset:
         for g in ground_truths:
             self.add_ground_truth(g)
 
+    def add_from_yaml(self, file_path: str) -> None:
+        debug(f"Loading dataset from YAML file: {file_path}")
+        """
+        Adds examples and ground truths from a YAML file.
+
+        The format of the YAML file is expected to be a dictionary with two keys: "examples" and "ground_truths".
+        The value of each key is a list of dictionaries, where each dictionary represents an example or ground truth.
+
+        The YAML file is expected to have the following format:
+        ground_truths:
+        - input: "test input"
+          actual_output: null
+          expected_output: "expected output"
+          context:
+          - "context1"
+          retrieval_context:
+          - "retrieval1"
+          additional_metadata:
+            key: "value"
+          comments: "test comment"
+          tools_called:
+          - "tool1"
+          expected_tools:
+          - "tool1"
+          source_file: "test.py"
+          trace_id: "094121"
+        examples:
+        - input: "test input"
+          actual_output: "test output"
+          expected_output: "expected output"
+          context:
+          - "context1"
+          - "context2"
+          retrieval_context:
+          - "retrieval1"
+          additional_metadata:
+            key: "value"
+          tools_called:
+          - "tool1"
+          expected_tools:
+          - "tool1"
+          - "tool2"
+          name: "test example"
+          example_id: null
+          timestamp: "20241230_160117"
+          trace_id: "123"
+        """
+        try:
+            with open(file_path, "r") as file:
+                payload = yaml.safe_load(file)
+                if payload is None:
+                    raise ValueError("The YAML file is empty.")
+                examples = payload.get("examples", [])
+                ground_truths = payload.get("ground_truths", [])
+        except FileNotFoundError:
+            error(f"YAML file not found: {file_path}")
+            raise FileNotFoundError(f"The file {file_path} was not found.")
+        except yaml.YAMLError:
+            error(f"Invalid YAML file: {file_path}")
+            raise ValueError(f"The file {file_path} is not a valid YAML file.")
+
+        info(f"Added {len(examples)} examples and {len(ground_truths)} ground truths from YAML")
+        new_examples = [Example(**e) for e in examples]
+        for e in new_examples:
+            self.add_example(e)
+
+        new_ground_truths = [GroundTruthExample(**g) for g in ground_truths]
+        for g in new_ground_truths:
+            self.add_ground_truth(g)
+
     def add_example(self, e: Example) -> None:
         self.examples = self.examples + [e]
         # TODO if we need to add rank, then we need to do it here
@@ -197,7 +268,7 @@ class EvalDataset:
     def add_ground_truth(self, g: GroundTruthExample) -> None:
         self.ground_truths = self.ground_truths + [g]
 
-    def save_as(self, file_type: Literal["json", "csv"], dir_path: str, save_name: str = None) -> None:
+    def save_as(self, file_type: Literal["json", "csv", "yaml"], dir_path: str, save_name: str = None) -> None:
         """
         Saves the dataset as a file. Save both the ground truths and examples.
 
@@ -266,8 +337,49 @@ class EvalDataset:
                         g.trace_id
                     ]
                 )
+        elif file_type == "yaml":
+            with open(complete_path, "w") as file:
+                yaml_data = {
+                    "examples": [
+                        {
+                            "input": e.input,
+                            "actual_output": e.actual_output,
+                            "expected_output": e.expected_output,
+                            "context": e.context,
+                            "retrieval_context": e.retrieval_context,
+                            "additional_metadata": e.additional_metadata,
+                            "tools_called": e.tools_called,
+                            "expected_tools": e.expected_tools,
+                            "name": e.name,
+                            "comments": None,  # Example does not have comments
+                            "source_file": None,  # Example does not have source file
+                            "example": True,  # Adding an Example
+                            "trace_id": e.trace_id
+                        }
+                        for e in self.examples
+                    ],
+                    "ground_truths": [
+                        {
+                            "input": g.input,
+                            "actual_output": g.actual_output,
+                            "expected_output": g.expected_output,
+                            "context": g.context,
+                            "retrieval_context": g.retrieval_context,
+                            "additional_metadata": g.additional_metadata,
+                            "tools_called": g.tools_called,
+                            "expected_tools": g.expected_tools,
+                            "name": None,  # GroundTruthExample does not have name
+                            "comments": g.comments,
+                            "source_file": g.source_file,
+                            "example": False,  # Adding a GroundTruthExample, not an Example
+                            "trace_id": g.trace_id
+                        }
+                        for g in self.ground_truths
+                    ]
+                }
+                yaml.dump(yaml_data, file, default_flow_style=False)
         else:
-            ACCEPTABLE_FILE_TYPES = ["json", "csv"]
+            ACCEPTABLE_FILE_TYPES = ["json", "csv", "yaml"]
             raise TypeError(f"Invalid file type: {file_type}. Please choose from {ACCEPTABLE_FILE_TYPES}")
 
     def __iter__(self):
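
The dataset changes above add YAML import/export. A short sketch of loading a hand-written file with the new add_from_yaml and saving with the new "yaml" file type; the no-argument EvalDataset() constructor is an assumption, not something shown in this diff:

from judgeval.data.datasets.dataset import EvalDataset

# Hedged sketch: a minimal file following the format documented in add_from_yaml above.
yaml_text = """\
examples:
  - input: "test input"
    actual_output: "test output"
    expected_output: "expected output"
"""

with open("demo_dataset.yaml", "w") as f:
    f.write(yaml_text)

ds = EvalDataset()                      # assumption: default-constructible
ds.add_from_yaml("demo_dataset.yaml")   # builds Example objects and appends them
print(len(ds.examples))                 # -> 1

# Saving now also accepts the new "yaml" file type:
ds.save_as("yaml", dir_path=".", save_name="demo_copy")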
judgeval/data/example.py
CHANGED
@@ -2,11 +2,13 @@
 Classes for representing examples in a dataset.
 """
 
-
+
+from typing import Optional, Any, Dict, List, Union
 from uuid import uuid4
 from pydantic import BaseModel, Field, field_validator
 from enum import Enum
 from datetime import datetime
+import time
 
 
 class ExampleParams(Enum):
@@ -22,9 +24,9 @@ class ExampleParams(Enum):
 
 
 class Example(BaseModel):
-    input: str
-    actual_output: str
-    expected_output: Optional[str] = None
+    input: Optional[str] = None
+    actual_output: Optional[Union[str, List[str]]] = None
+    expected_output: Optional[Union[str, List[str]]] = None
     context: Optional[List[str]] = None
     retrieval_context: Optional[List[str]] = None
     additional_metadata: Optional[Dict[str, Any]] = None
@@ -37,12 +39,6 @@ class Example(BaseModel):
     trace_id: Optional[str] = None
 
     def __init__(self, **data):
-        # Check that required fields are provided
-        if 'input' not in data:
-            raise ValueError("Example must be initialized with 'input' field.")
-        if 'actual_output' not in data:
-            raise ValueError("Example must be initialized with 'actual_output' field.")
-
         if 'example_id' not in data:
             data['example_id'] = str(uuid4())
         # Set timestamp if not provided
@@ -53,22 +49,27 @@ class Example(BaseModel):
     @field_validator('input', mode='before')
     @classmethod
     def validate_input(cls, v):
-        if not v or not isinstance(v, str):
+        if v is not None and (not v or not isinstance(v, str)):
             raise ValueError(f"Input must be a non-empty string but got '{v}' of type {type(v)}")
         return v
 
     @field_validator('actual_output', mode='before')
     @classmethod
     def validate_actual_output(cls, v):
-        if not
-
+        if v is not None:
+            if not isinstance(v, (str, list)):
+                raise ValueError(f"Actual output must be a string or a list of strings but got {v} of type {type(v)}")
+            if isinstance(v, list) and not all(isinstance(item, str) for item in v):
+                raise ValueError(f"All items in actual_output must be strings but got {v}")
         return v
 
     @field_validator('expected_output', mode='before')
     @classmethod
     def validate_expected_output(cls, v):
-        if v is not None and not isinstance(v, str):
-            raise ValueError(f"Expected output must be a string or None but got {v} of type {type(v)}")
+        if v is not None and not isinstance(v, (str, list)):
+            raise ValueError(f"Expected output must be a string, a list of strings, or None but got {v} of type {type(v)}")
+        if isinstance(v, list) and not all(isinstance(item, str) for item in v):
+            raise ValueError(f"All items in expected_output must be strings but got {v}")
         return v
 
     @field_validator('context', 'retrieval_context', 'tools_called', 'expected_tools', mode='before')
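
Example now tolerates an omitted input and accepts list-valued outputs (in 0.0.19 both input and actual_output were required strings). A small illustration; the field values are made up:

from judgeval.data.example import Example

ex = Example(
    actual_output=["search_web", "summarize", "send_email"],  # list[str] is now accepted
    expected_output=["search_web", "send_email"],
    # `input` can be omitted entirely; in 0.0.19 this raised a ValueError
)
print(ex.example_id)  # still auto-filled with a uuid4 when not supplied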
judgeval/data/result.py
CHANGED
@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import List, Union, Optional, Dict, Any
+from typing import List, Union, Optional, Dict, Any, Union
 
 from judgeval.data import ScorerData, ProcessExample
 
@@ -30,8 +30,8 @@ class ScoringResult:
 
     # Inputs from the original example
     input: Optional[str] = None
-    actual_output: Optional[str] = None
-    expected_output: Optional[str] = None
+    actual_output: Optional[Union[str, List[str]]] = None
+    expected_output: Optional[Union[str, List[str]]] = None
     context: Optional[List[str]] = None
     retrieval_context: Optional[List[str]] = None
     additional_metadata: Optional[Dict[str, Any]] = None
judgeval/judgment_client.py
CHANGED
@@ -27,7 +27,8 @@ from judgeval.judges import JudgevalJudge
 from judgeval.constants import (
     JUDGMENT_EVAL_FETCH_API_URL,
     JUDGMENT_EVAL_DELETE_API_URL,
-    JUDGMENT_EVAL_DELETE_PROJECT_API_URL
+    JUDGMENT_EVAL_DELETE_PROJECT_API_URL,
+    JUDGMENT_PROJECT_DELETE_API_URL
 )
 from judgeval.common.exceptions import JudgmentAPIError
 from pydantic import BaseModel
@@ -156,7 +157,7 @@ class JudgmentClient:
         metadata: Optional[Dict[str, Any]] = None,
         project_name: str = "",
         eval_run_name: str = "",
-        log_results: bool =
+        log_results: bool = True,
         use_judgment: bool = True,
         rules: Optional[List[Rule]] = None
     ) -> List[ScoringResult]:
@@ -362,7 +363,6 @@ class JudgmentClient:
         response = requests.delete(JUDGMENT_EVAL_DELETE_PROJECT_API_URL,
                                    json={
                                        "project_name": project_name,
-                                       "judgment_api_key": self.judgment_api_key,
                                    },
                                    headers={
                                        "Content-Type": "application/json",
@@ -372,6 +372,23 @@ class JudgmentClient:
         if response.status_code != requests.codes.ok:
             raise ValueError(f"Error deleting eval results: {response.json()}")
         return response.json()
+
+    def delete_project(self, project_name: str) -> bool:
+        """
+        Deletes a project from the server. Which also deletes all evaluations and traces associated with the project.
+        """
+        response = requests.delete(JUDGMENT_PROJECT_DELETE_API_URL,
+                                   json={
+                                       "project_name": project_name,
+                                   },
+                                   headers={
+                                       "Content-Type": "application/json",
+                                       "Authorization": f"Bearer {self.judgment_api_key}",
+                                       "X-Organization-Id": self.organization_id
+                                   })
+        if response.status_code != requests.codes.ok:
+            raise ValueError(f"Error deleting project: {response.json()}")
+        return response.json()
 
     def _validate_api_key(self):
         """
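
JudgmentClient gains delete_project, mirroring the TraceManagerClient method added in tracer.py. A usage sketch; how the client resolves its API key and organization id is an assumption (e.g. from environment variables), not shown in this diff:

from judgeval.judgment_client import JudgmentClient

client = JudgmentClient()            # assumption: credentials come from the environment
client.delete_project("my_project")  # DELETE {ROOT_API}/projects/delete/ — removes the
                                     # project along with its evaluations and traces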
judgeval/run_evaluation.py
CHANGED
@@ -1,12 +1,17 @@
 import asyncio
 import requests
-
+import time
+import sys
+import itertools
+import threading
+from typing import List, Dict, Any
 from datetime import datetime
 from rich import print as rprint
 
 from judgeval.data import (
     ScorerData,
-    ScoringResult
+    ScoringResult,
+    Example
 )
 from judgeval.scorers import (
     JudgevalScorer,
@@ -14,7 +19,6 @@ from judgeval.scorers import (
     ClassifierScorer
 )
 from judgeval.scorers.score import a_execute_scoring
-
 from judgeval.constants import (
     ROOT_API,
     JUDGMENT_EVAL_API_URL,
@@ -185,7 +189,7 @@ def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_k
         raise JudgmentAPIError(f"Failed to check if eval run name exists: {str(e)}")
 
 
-def log_evaluation_results(merged_results: List[ScoringResult], evaluation_run: EvaluationRun) ->
+def log_evaluation_results(merged_results: List[ScoringResult], evaluation_run: EvaluationRun) -> str:
     """
     Logs evaluation results to the Judgment API database.
 
@@ -220,7 +224,9 @@ def log_evaluation_results(merged_results: List[ScoringResult], evaluation_run:
             raise JudgmentAPIError(error_message)
 
         if "ui_results_url" in res.json():
-
+            url = res.json()['ui_results_url']
+            pretty_str = f"\n🔍 You can view your evaluation results here: [rgb(106,0,255)][link={url}]View Results[/link]\n"
+            return pretty_str
 
     except requests.exceptions.RequestException as e:
         error(f"Request failed while saving evaluation results to DB: {str(e)}")
@@ -229,6 +235,51 @@ def log_evaluation_results(merged_results: List[ScoringResult], evaluation_run:
         error(f"Failed to save evaluation results to DB: {str(e)}")
         raise ValueError(f"Failed to save evaluation results to DB: {str(e)}")
 
+def run_with_spinner(message: str, func, *args, **kwargs) -> Any:
+    """Run a function with a spinner in the terminal."""
+    spinner = itertools.cycle(['|', '/', '-', '\\'])
+
+    def display_spinner():
+        while not stop_spinner_event.is_set():
+            sys.stdout.write(f'\r{message}{next(spinner)}')
+            sys.stdout.flush()
+            time.sleep(0.1)
+
+    stop_spinner_event = threading.Event()
+    spinner_thread = threading.Thread(target=display_spinner)
+    spinner_thread.start()
+
+    try:
+        result = func(*args, **kwargs)
+    except Exception as e:
+        error(f"An error occurred: {str(e)}")
+        stop_spinner_event.set()
+        spinner_thread.join()
+        raise e
+    finally:
+        stop_spinner_event.set()
+        spinner_thread.join()
+
+        sys.stdout.write('\r' + ' ' * (len(message) + 1) + '\r')
+        sys.stdout.flush()
+
+    return result
+
+def check_examples(examples: List[Example], scorers: List[APIJudgmentScorer]) -> None:
+    """
+    Checks if the example contains the necessary parameters for the scorer.
+    """
+    for scorer in scorers:
+        if isinstance(scorer, APIJudgmentScorer):
+            for example in examples:
+                missing_params = []
+                for param in scorer.required_params:
+                    if getattr(example, param.value) is None:
+                        missing_params.append(f"'{param.value}'")
+                if missing_params:
+                    # We do this because we want to inform users that an example is missing parameters for a scorer
+                    # Example ID (usually random UUID) does not provide any helpful information for the user but printing the entire example is overdoing it
+                    print(f"WARNING: Example {example.example_id} is missing the following parameters: {missing_params} for scorer {scorer.score_type.value}")
 
 
 def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[ScoringResult]:
@@ -253,7 +304,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
     Returns:
         List[ScoringResult]: The results of the evaluation. Each result is a dictionary containing the fields of a `ScoringResult` object.
     """
-
+
     # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
     if not override and evaluation_run.log_results:
         check_eval_run_name_exists(
@@ -306,6 +357,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
 
     # Execute evaluation using Judgment API
     if judgment_scorers:
+        check_examples(evaluation_run.examples, evaluation_run.scorers)
         info("Starting API evaluation")
         debug(f"Creating API evaluation run with {len(judgment_scorers)} scorers")
         try: # execute an EvaluationRun with just JudgmentScorers
@@ -323,7 +375,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
                 rules=evaluation_run.rules
             )
             debug("Sending request to Judgment API")
-            response_data: List[Dict] =
+            response_data: List[Dict] = run_with_spinner("Running Evaluation: ", execute_api_eval, api_evaluation_run)
             info(f"Received {len(response_data['results'])} results from API")
         except JudgmentAPIError as e:
             error(f"An error occurred while executing the Judgment API request: {str(e)}")
@@ -352,6 +404,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
             api_results.append(ScoringResult(**filtered_result))
     # Run local evals
     if local_scorers: # List[JudgevalScorer]
+        # We should be removing local scorers soon
         info("Starting local evaluation")
         for example in evaluation_run.examples:
             with example_logging_context(example.timestamp, example.example_id):
@@ -389,7 +442,8 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
     # )
 
     if evaluation_run.log_results:
-        log_evaluation_results
+        pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, merged_results, evaluation_run)
+        rprint(pretty_str)
 
     for i, result in enumerate(merged_results):
         if not result.scorers_data: # none of the scorers could be executed on this example
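
run_with_spinner is a generic wrapper, so it can be exercised on its own. A tiny sketch, assuming the import path matches the module shown above:

import time
from judgeval.run_evaluation import run_with_spinner

def slow_add(a, b):
    time.sleep(1.0)   # stand-in for a long-running API call
    return a + b

total = run_with_spinner("Running Evaluation: ", slow_add, 2, 3)
print(total)  # 5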
judgeval/scorers/__init__.py
CHANGED
@@ -2,7 +2,7 @@ from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.scorers.judgeval_scorer import JudgevalScorer
 from judgeval.scorers.prompt_scorer import PromptScorer, ClassifierScorer
 from judgeval.scorers.judgeval_scorers import (
-
+    ExecutionOrderScorer,
     JSONCorrectnessScorer,
     SummarizationScorer,
     HallucinationScorer,
@@ -24,7 +24,7 @@ __all__ = [
     "JudgevalScorer",
     "PromptScorer",
     "ClassifierScorer",
-    "
+    "ExecutionOrderScorer",
     "JSONCorrectnessScorer",
     "SummarizationScorer",
     "HallucinationScorer",
judgeval/scorers/api_scorer.py
CHANGED
@@ -5,8 +5,9 @@ Scores `Example`s using ready-made Judgment evaluators.
 """
 
 from pydantic import BaseModel, field_validator
+from typing import List
 from judgeval.common.logger import debug, info, warning, error
-
+from judgeval.data import ExampleParams
 from judgeval.constants import APIScorer, UNBOUNDED_SCORERS
 
 
@@ -20,6 +21,7 @@ class APIJudgmentScorer(BaseModel):
     """
     score_type: APIScorer
     threshold: float
+    required_params: List[ExampleParams] = []  # List of the required parameters on examples for the scorer
 
     @field_validator('threshold')
     def validate_threshold(cls, v, info):
judgeval/scorers/judgeval_scorers/__init__.py
CHANGED
@@ -2,7 +2,7 @@ from typing import Type, Optional, Any
 
 # Import implementations
 from judgeval.scorers.judgeval_scorers.api_scorers import (
-
+    ExecutionOrderScorer as APIExecutionOrderScorer,
     JSONCorrectnessScorer as APIJSONCorrectnessScorer,
     SummarizationScorer as APISummarizationScorer,
     HallucinationScorer as APIHallucinationScorer,
@@ -24,7 +24,7 @@ from judgeval.scorers.judgeval_scorers.local_implementations import (
     ContextualRelevancyScorer as LocalContextualRelevancyScorer,
     FaithfulnessScorer as LocalFaithfulnessScorer,
     JsonCorrectnessScorer as LocalJsonCorrectnessScorer,
-
+    ExecutionOrderScorer as LocalExecutionOrderScorer,
     HallucinationScorer as LocalHallucinationScorer,
     SummarizationScorer as LocalSummarizationScorer,
     AnswerCorrectnessScorer as LocalAnswerCorrectnessScorer,
@@ -98,9 +98,9 @@ AnswerRelevancyScorer = ScorerWrapper(
     local_implementation=LocalAnswerRelevancyScorer
 )
 
-
-    api_implementation=
-    local_implementation=
+ExecutionOrderScorer = ScorerWrapper(
+    api_implementation=APIExecutionOrderScorer,
+    local_implementation=LocalExecutionOrderScorer
 )
 
 JSONCorrectnessScorer = ScorerWrapper(
@@ -154,7 +154,7 @@ GroundednessScorer = ScorerWrapper(
 )
 
 __all__ = [
-    "
+    "ExecutionOrderScorer",
     "JSONCorrectnessScorer",
     "SummarizationScorer",
     "HallucinationScorer",
judgeval/scorers/judgeval_scorers/api_scorers/__init__.py
CHANGED
@@ -1,4 +1,4 @@
-from judgeval.scorers.judgeval_scorers.api_scorers.
+from judgeval.scorers.judgeval_scorers.api_scorers.execution_order import ExecutionOrderScorer
 from judgeval.scorers.judgeval_scorers.api_scorers.json_correctness import JSONCorrectnessScorer
 from judgeval.scorers.judgeval_scorers.api_scorers.summarization import SummarizationScorer
 from judgeval.scorers.judgeval_scorers.api_scorers.hallucination import HallucinationScorer
@@ -13,7 +13,7 @@ from judgeval.scorers.judgeval_scorers.api_scorers.instruction_adherence import
 from judgeval.scorers.judgeval_scorers.api_scorers.groundedness import GroundednessScorer
 
 __all__ = [
-    "
+    "ExecutionOrderScorer",
     "JSONCorrectnessScorer",
     "SummarizationScorer",
     "HallucinationScorer",
judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py
CHANGED
@@ -8,11 +8,19 @@ TODO add link to docs page for this scorer
 # Internal imports
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
-
+from judgeval.data import ExampleParams
 
 class AnswerCorrectnessScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
-        super().__init__(
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.ANSWER_CORRECTNESS,
+            required_params=[
+                ExampleParams.INPUT,
+                ExampleParams.ACTUAL_OUTPUT,
+                ExampleParams.EXPECTED_OUTPUT,
+            ]
+        )
 
     @property
     def __name__(self):
judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py
CHANGED
@@ -8,11 +8,18 @@ TODO add link to docs page for this scorer
 # Internal imports
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
-
+from judgeval.data import ExampleParams
 
 class AnswerRelevancyScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
-        super().__init__(
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.ANSWER_RELEVANCY,
+            required_params=[
+                ExampleParams.INPUT,
+                ExampleParams.ACTUAL_OUTPUT,
+            ]
+        )
 
     @property
     def __name__(self):
judgeval/scorers/judgeval_scorers/api_scorers/comparison.py
CHANGED
@@ -9,12 +9,20 @@ TODO add link to docs page for this scorer
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
 from typing import Optional, Dict
-
+from judgeval.data import ExampleParams
 class ComparisonScorer(APIJudgmentScorer):
     kwargs: Optional[Dict] = None
 
     def __init__(self, threshold: float, criteria: str, description: str):
-        super().__init__(
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.COMPARISON,
+            required_params=[
+                ExampleParams.INPUT,
+                ExampleParams.ACTUAL_OUTPUT,
+                ExampleParams.EXPECTED_OUTPUT,
+            ]
+        )
         self.kwargs = {"criteria": criteria, "description": description}
 
     @property
judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py
CHANGED
@@ -8,11 +8,20 @@ TODO add link to docs page for this scorer
 # Internal imports
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
-
+from judgeval.data import ExampleParams
 
 class ContextualPrecisionScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
-        super().__init__(
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.CONTEXTUAL_PRECISION,
+            required_params=[
+                ExampleParams.INPUT,
+                ExampleParams.ACTUAL_OUTPUT,
+                ExampleParams.RETRIEVAL_CONTEXT,
+                ExampleParams.EXPECTED_OUTPUT,
+            ]
+        )
 
     @property
     def __name__(self):
judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py
CHANGED
@@ -8,12 +8,21 @@ TODO add link to docs page for this scorer
 # Internal imports
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
+from judgeval.data import ExampleParams
 
 
 class ContextualRecallScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
-        super().__init__(
-
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.CONTEXTUAL_RECALL,
+            required_params=[
+                ExampleParams.INPUT,
+                ExampleParams.ACTUAL_OUTPUT,
+                ExampleParams.EXPECTED_OUTPUT,
+                ExampleParams.RETRIEVAL_CONTEXT,
+            ]
+        )
     @property
     def __name__(self):
         return "Contextual Recall"
judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py
CHANGED
@@ -8,15 +8,22 @@ TODO add link to docs page for this scorer
 # Internal imports
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
-
+from judgeval.data import ExampleParams
 
 class ContextualRelevancyScorer(APIJudgmentScorer):
     """
     Scorer that checks if the output of a model is relevant to the retrieval context
     """
     def __init__(self, threshold: float):
-        super().__init__(
-
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.CONTEXTUAL_RELEVANCY,
+            required_params=[
+                ExampleParams.INPUT,
+                ExampleParams.ACTUAL_OUTPUT,
+                ExampleParams.RETRIEVAL_CONTEXT,
+            ]
+        )
     @property
     def __name__(self):
         return "Contextual Relevancy"
judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py
ADDED
@@ -0,0 +1,43 @@
+"""
+`judgeval` tool correctness scorer
+
+TODO add link to docs page for this scorer
+
+"""
+
+# Internal imports
+from judgeval.scorers.api_scorer import APIJudgmentScorer
+from judgeval.constants import APIScorer
+from typing import Optional, Dict, List
+from judgeval.data import ExampleParams
+
+class ExecutionOrderScorer(APIJudgmentScorer):
+    kwargs: Optional[Dict] = None
+
+    def __init__(self, threshold: float, should_exact_match: bool = False, should_consider_ordering: bool = False):
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.EXECUTION_ORDER,
+            required_params=[
+                ExampleParams.ACTUAL_OUTPUT,
+                ExampleParams.EXPECTED_OUTPUT,
+            ]
+        )
+        self.kwargs = {"should_exact_match": should_exact_match, "should_consider_ordering": should_consider_ordering}
+
+    @property
+    def __name__(self):
+        return "Execution Order"
+
+    def to_dict(self) -> dict:
+        """
+        Converts the scorer configuration to a dictionary format.
+
+        Returns:
+            dict: A dictionary containing the scorer's configuration
+        """
+        return {
+            "score_type": self.score_type,
+            "threshold": self.threshold,
+            "kwargs": self.kwargs
+        }
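
A construction sketch for the new API-side ExecutionOrderScorer; the threshold and flag values are illustrative only:

from judgeval.scorers.judgeval_scorers.api_scorers.execution_order import ExecutionOrderScorer

scorer = ExecutionOrderScorer(
    threshold=1.0,
    should_exact_match=True,         # require the exact tool sequence
    should_consider_ordering=True,   # penalize out-of-order calls
)
print(scorer.to_dict())  # {'score_type': ..., 'threshold': 1.0, 'kwargs': {...}}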
judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py
CHANGED
@@ -8,11 +8,19 @@ TODO add link to docs page for this scorer
 # Internal imports
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
-
+from judgeval.data import ExampleParams
 
 class FaithfulnessScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
-        super().__init__(
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.FAITHFULNESS,
+            required_params=[
+                ExampleParams.INPUT,
+                ExampleParams.ACTUAL_OUTPUT,
+                ExampleParams.RETRIEVAL_CONTEXT,
+            ]
+        )
 
     @property
     def __name__(self):
judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py
CHANGED
@@ -8,11 +8,19 @@ TODO add link to docs page for this scorer
 # Internal imports
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
-
+from judgeval.data import ExampleParams
 
 class GroundednessScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
-        super().__init__(
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.GROUNDEDNESS,
+            required_params=[
+                ExampleParams.INPUT,
+                ExampleParams.ACTUAL_OUTPUT,
+                ExampleParams.RETRIEVAL_CONTEXT,
+            ]
+        )
 
     @property
     def __name__(self):
judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py
CHANGED
@@ -8,11 +8,19 @@ TODO add link to docs page for this scorer
 # Internal imports
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
-
+from judgeval.data import ExampleParams
 
 class HallucinationScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
-        super().__init__(
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.HALLUCINATION,
+            required_params=[
+                ExampleParams.INPUT,
+                ExampleParams.ACTUAL_OUTPUT,
+                ExampleParams.CONTEXT,
+            ]
+        )
 
     @property
     def __name__(self):
judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py
CHANGED
@@ -8,11 +8,18 @@ TODO add link to docs page for this scorer
 # Internal imports
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
-
+from judgeval.data import ExampleParams
 
 class InstructionAdherenceScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
-        super().__init__(
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.INSTRUCTION_ADHERENCE,
+            required_params=[
+                ExampleParams.INPUT,
+                ExampleParams.ACTUAL_OUTPUT,
+            ]
+        )
 
     @property
     def __name__(self):
judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py
CHANGED
@@ -11,13 +11,20 @@ from pydantic import BaseModel, Field
 # Internal imports
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
-
+from judgeval.data import ExampleParams
 
 class JSONCorrectnessScorer(APIJudgmentScorer):
     json_schema: BaseModel = Field(None, exclude=True)
 
     def __init__(self, threshold: float, json_schema: BaseModel):
-        super().__init__(
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.JSON_CORRECTNESS,
+            required_params=[
+                ExampleParams.INPUT,
+                ExampleParams.ACTUAL_OUTPUT,
+            ]
+        )
         object.__setattr__(self, 'json_schema', json_schema)
 
     def to_dict(self):
judgeval/scorers/judgeval_scorers/api_scorers/summarization.py
CHANGED
@@ -7,12 +7,19 @@ TODO add link to docs page for this scorer
 
 # Internal imports
 from judgeval.scorers.api_scorer import APIJudgmentScorer
-from judgeval.constants import APIScorer
-
+from judgeval.constants import APIScorer
+from judgeval.data import ExampleParams
 
 class SummarizationScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
-        super().__init__(
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.SUMMARIZATION,
+            required_params=[
+                ExampleParams.INPUT,
+                ExampleParams.ACTUAL_OUTPUT,
+            ]
+        )
 
     @property
     def __name__(self):
judgeval/scorers/judgeval_scorers/local_implementations/__init__.py
CHANGED
@@ -4,7 +4,7 @@ from judgeval.scorers.judgeval_scorers.local_implementations.contextual_recall.c
 from judgeval.scorers.judgeval_scorers.local_implementations.contextual_relevancy.contextual_relevancy_scorer import ContextualRelevancyScorer
 from judgeval.scorers.judgeval_scorers.local_implementations.faithfulness.faithfulness_scorer import FaithfulnessScorer
 from judgeval.scorers.judgeval_scorers.local_implementations.json_correctness.json_correctness_scorer import JsonCorrectnessScorer
-from judgeval.scorers.judgeval_scorers.local_implementations.
+from judgeval.scorers.judgeval_scorers.local_implementations.execution_order.execution_order import ExecutionOrderScorer
 from judgeval.scorers.judgeval_scorers.local_implementations.hallucination.hallucination_scorer import HallucinationScorer
 from judgeval.scorers.judgeval_scorers.local_implementations.summarization.summarization_scorer import SummarizationScorer
 from judgeval.scorers.judgeval_scorers.local_implementations.answer_correctness.answer_correctness_scorer import AnswerCorrectnessScorer
@@ -20,7 +20,7 @@ __all__ = [
     "ContextualRelevancyScorer",
     "FaithfulnessScorer",
     "JsonCorrectnessScorer",
-    "
+    "ExecutionOrderScorer",
     "HallucinationScorer",
     "SummarizationScorer",
     "InstructionAdherenceScorer",
judgeval/scorers/judgeval_scorers/local_implementations/{tool_correctness/tool_correctness_scorer.py → execution_order/execution_order.py}
RENAMED
@@ -45,7 +45,7 @@ def get_lcs(seq1, seq2):
     return lcs[::-1]
 
 
-class ToolCorrectnessScorer(JudgevalScorer):
+class ExecutionOrderScorer(JudgevalScorer):
     def __init__(
         self,
         threshold: float = 0.5,
@@ -56,7 +56,7 @@ class ToolCorrectnessScorer(JudgevalScorer):
         should_consider_ordering: bool = False,
     ):
         super().__init__(
-            score_type=APIScorer.TOOL_CORRECTNESS,
+            score_type=APIScorer.EXECUTION_ORDER,
             threshold=1 if strict_mode else threshold,
             evaluation_model=None,
             include_reason=include_reason,
@@ -152,5 +152,5 @@ class ToolCorrectnessScorer(JudgevalScorer):
 
     @property
     def __name__(self):
-        return "Tool Correctness"
+        return "Execution Order"
 
{judgeval-0.0.19.dist-info → judgeval-0.0.21.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.0.19
+Version: 0.0.21
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -12,9 +12,15 @@ Classifier: Programming Language :: Python :: 3
 Requires-Python: >=3.11
 Requires-Dist: anthropic
 Requires-Dist: fastapi
+Requires-Dist: langchain
+Requires-Dist: langchain-anthropic
+Requires-Dist: langchain-core
+Requires-Dist: langchain-huggingface
+Requires-Dist: langchain-openai
 Requires-Dist: litellm
 Requires-Dist: nest-asyncio
 Requires-Dist: openai
+Requires-Dist: openpyxl
 Requires-Dist: pandas
 Requires-Dist: pika
 Requires-Dist: python-dotenv==1.0.1
@@ -23,8 +29,6 @@ Requires-Dist: supabase
 Requires-Dist: together
 Requires-Dist: uvicorn
 Provides-Extra: dev
-Requires-Dist: langfuse==2.50.3; extra == 'dev'
-Requires-Dist: patronus; extra == 'dev'
 Requires-Dist: pytest-asyncio>=0.25.0; extra == 'dev'
 Requires-Dist: pytest-mock>=3.14.0; extra == 'dev'
 Requires-Dist: pytest>=8.3.4; extra == 'dev'
{judgeval-0.0.19.dist-info → judgeval-0.0.21.dist-info}/RECORD
CHANGED
@@ -1,23 +1,23 @@
 judgeval/__init__.py,sha256=dtXxsCmI4eEsZdGSUMy8P_pA0bc2-OSGAgb2C__yJoA,252
 judgeval/clients.py,sha256=6VQmEqmfCngUdS2MuPBIpHvtDFqOENm8-_BmMvjLyRQ,944
-judgeval/constants.py,sha256=
+judgeval/constants.py,sha256=VhJppAECTUDQwzC_FpzJw2wPlkYoogsadHxaJIY_J8U,5073
 judgeval/evaluation_run.py,sha256=RgJD60lJsunNQzObjo7iXnAzXWgubCLOAAuuamAAuoI,6354
-judgeval/judgment_client.py,sha256=
+judgeval/judgment_client.py,sha256=5lqp9X67qPzBUu7kQYETslsc3L5JjxrDVgVLslF07A0,24173
 judgeval/rules.py,sha256=ebsiDEBVAnYTQxwVNvh_RpmKeWBnjQXgHs8KofTjcAs,15526
-judgeval/run_evaluation.py,sha256=
+judgeval/run_evaluation.py,sha256=YOzkyeWl-r3vaz0jB5nM-1VULi7ALmJ9_f58ENqexXk,23827
 judgeval/common/__init__.py,sha256=7d24BRxtncpMj3AAJCj8RS7TqgjXmW777HVZH6-3sBs,289
 judgeval/common/exceptions.py,sha256=U-TxHLn7oVMezsMuoYouNDb2XuS8RCggfntYf5_6u4E,565
 judgeval/common/logger.py,sha256=KO75wWXCxhUHUMvLaTU31ZzOk6tkZBa7heQ7y0f-zFE,6062
-judgeval/common/tracer.py,sha256=
+judgeval/common/tracer.py,sha256=WFjFNf3NZ2BN8UAu2MG0F3Om9LgJNma3m_GrxyXgJqE,46655
 judgeval/common/utils.py,sha256=LUQV5JfDr6wj7xHAJoNq-gofNZ6mjXbeKrGKzBME1KM,33533
 judgeval/data/__init__.py,sha256=QykVE22Qf-I2f1g-jC9-iQyLNXgDmX1-vHbCgZg8Ra8,558
-judgeval/data/api_example.py,sha256=
-judgeval/data/example.py,sha256=
+judgeval/data/api_example.py,sha256=dzkrQ0xno08y6qNfqL2djXbapUyc2B2aQ5iANn0o4CY,3667
+judgeval/data/example.py,sha256=BhGBhamFWgH6wtvrRYM8dGtDfXh-cDxDhtNL5Gbdz_M,5892
 judgeval/data/ground_truth.py,sha256=OTBs3VZe-Wp0vEXEsq14GPZHYtpWT16bhGQTycIvkKc,2057
-judgeval/data/result.py,sha256=
+judgeval/data/result.py,sha256=4fgjKtUmT3br7K6fkRiNIxTGKUuwMeGyRLqzkpxwXKE,4436
 judgeval/data/scorer_data.py,sha256=JVlaTx1EP2jw2gh3Vgx1CSEsvIFABAN26IquKyxwiJQ,3273
 judgeval/data/datasets/__init__.py,sha256=IdNKhQv9yYZ_op0rdBacrFaFVmiiYQ3JTzXzxOTsEVQ,176
-judgeval/data/datasets/dataset.py,sha256=
+judgeval/data/datasets/dataset.py,sha256=LrBK8y3y1R9_BKmXxTzdXMMIQvXlq7tf7TM-u7jgSxE,16839
 judgeval/data/datasets/eval_dataset_client.py,sha256=QsfHyFC4WePV7uJGYUVjiIwtk1Ie_VpWUrnd2Q4kKdU,11479
 judgeval/data/datasets/utils.py,sha256=6DpGCPmGFNOKIGNcVCOSjTOdWemrpAuYnlo778sGG7g,2455
 judgeval/judges/__init__.py,sha256=6X7VSwrwsdxGBNxCyapVRWGghhKOy3MVxFNMQ62kCXM,308
@@ -26,33 +26,33 @@ judgeval/judges/litellm_judge.py,sha256=EIL58Teptv8DzZUO3yP2RDQCDq-aoBB6HPZzPdK6
 judgeval/judges/mixture_of_judges.py,sha256=IJoi4Twk8ze1CJWVEp69k6TSqTCTGrmVYQ0qdffer60,15549
 judgeval/judges/together_judge.py,sha256=l00hhPerAZXg3oYBd8cyMtWsOTNt_0FIqoxhKJKQe3k,2302
 judgeval/judges/utils.py,sha256=9lvUxziGV86ISvVFxYBWc09TWFyAQgUTyPf_a9mD5Rs,2686
-judgeval/scorers/__init__.py,sha256=
-judgeval/scorers/api_scorer.py,sha256=
+judgeval/scorers/__init__.py,sha256=gkeKJvjXhswCnkEyjijrVvGVM3Om86egrZ-PUOGvNvI,1158
+judgeval/scorers/api_scorer.py,sha256=NQ_CrrUPhSUk1k2Q8rKpCG_TU2FT32sFEqvb-Yi54B0,2688
 judgeval/scorers/base_scorer.py,sha256=xdUlY3CnLdCQ1Z5iUeY22Bim5v-OQruZmaVF_4Y1mC0,2183
 judgeval/scorers/exceptions.py,sha256=eGW5CuJgZ5YJBFrE4FHDSF651PO1dKAZ379mJ8gOsfo,178
 judgeval/scorers/judgeval_scorer.py,sha256=oIkfoGXA09wL_vcK1DRibzQSA-MFNa-hmw1IhGBErf8,6592
 judgeval/scorers/prompt_scorer.py,sha256=PaAs2qRolw1P3_I061Xvk9qzvF4O-JR8g_39RqXnHcM,17728
 judgeval/scorers/score.py,sha256=GALVmeApP1Cyih2vY93zRaU6RShtW4jJDG47Pm6yfnw,18657
 judgeval/scorers/utils.py,sha256=iHQVTlIANbmCTXz9kTeSdOytgUZ_T74Re61ajqsk_WQ,6827
-judgeval/scorers/judgeval_scorers/__init__.py,sha256
-judgeval/scorers/judgeval_scorers/api_scorers/__init__.py,sha256=
-judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py,sha256=
-judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py,sha256=
-judgeval/scorers/judgeval_scorers/api_scorers/comparison.py,sha256=
-judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py,sha256=
-judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py,sha256=
-judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py,sha256=
-judgeval/scorers/judgeval_scorers/api_scorers/
-judgeval/scorers/judgeval_scorers/api_scorers/
-judgeval/scorers/judgeval_scorers/api_scorers/
-judgeval/scorers/judgeval_scorers/api_scorers/
-judgeval/scorers/judgeval_scorers/api_scorers/
-judgeval/scorers/judgeval_scorers/api_scorers/
-judgeval/scorers/judgeval_scorers/api_scorers/
+judgeval/scorers/judgeval_scorers/__init__.py,sha256=xFRb62sp4JmBUSeuAB_pC_7kEGp-lGdqCRIu9--Bbdg,5992
+judgeval/scorers/judgeval_scorers/api_scorers/__init__.py,sha256=mZ6b_5Dl04k3PaG24ICBajB_j43ody1II1OJhO1DkXo,1648
+judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py,sha256=Fnd9CVIOZ73sWEWymsU5eBrrZqPFjMZ0BKpeW-PDyTg,711
+judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py,sha256=oETeN9K0HSIRdL2SDqn82Vskpwh5SlKnZvs5VDm2OBU,658
+judgeval/scorers/judgeval_scorers/api_scorers/comparison.py,sha256=kuzf9OWvpY38yYSwlBgneLkUZwJNM4FQqvbS66keA90,1249
+judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py,sha256=tpSuzFAaW8X9xqA0aLLKwh7qmBK0Pc_bJZMIe_q412U,770
+judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py,sha256=pFVhk4pLtQ-FnNlbI-dFF-SIh69Jza7erHqiPkFWoBo,758
+judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py,sha256=RQ6DZwEhChfecd89Ey-T7ke--7qTaXZlRsNxwH8gaME,823
+judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py,sha256=Pb3CiNF2Ca826B92wJCVAi_68lJjLhqqCKwQKaflSUg,1294
+judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py,sha256=-BwOapqjryYNKNydtdkUiKIij76dY0O1jBmdc6dKazQ,692
+judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py,sha256=ntEEeTANEOsGlcbiTAF_3r6BeSJEaVDns8po8T0L6Vg,692
+judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py,sha256=k5gDOki-8KXrZXydvdSqDt3NZqQ28hXoOCHQf6jNxr4,686
+judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py,sha256=XnSGEkQfwVqaqnHEGMCsxNiHVzrsrej48uDbLoWc8CQ,678
+judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py,sha256=mMKEuR87_yanEuZJ5YSGFMHDD_oLVZ6-rQuciFaDOMA,1095
+judgeval/scorers/judgeval_scorers/api_scorers/summarization.py,sha256=QmWB8bVbDYHY5FcF0rYZE_3c2XXgMLRmR6aXJWfdMC4,655
 judgeval/scorers/judgeval_scorers/classifiers/__init__.py,sha256=Qt81W5ZCwMvBAne0LfQDb8xvg5iOG1vEYP7WizgwAZo,67
 judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py,sha256=8iTzMvou1Dr8pybul6lZHKjc9Ye2-0_racRGYkhEdTY,74
 judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py,sha256=ly72Z7s_c8NID6-nQnuW8qEGEW2MqdvpJ-5WfXzbAQg,2579
-judgeval/scorers/judgeval_scorers/local_implementations/__init__.py,sha256=
+judgeval/scorers/judgeval_scorers/local_implementations/__init__.py,sha256=k_t-THIAtsk7lNvm9faj0u24dPZjn7qRbZ8YGjQ21xs,1926
 judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py,sha256=cxxUEspgoIdSzJbwIIioamC0-xDqhYVfYAWxaYF-D_Y,177
 judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py,sha256=3Dpm8BIIe0Th2p0ccO5bb-le93lywjOLSo712HwEIUE,10196
 judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py,sha256=hBUqEd8Hy3g8peOVjpSmRb31fPtpodDzdRUonhKRl30,6686
@@ -71,6 +71,8 @@ judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompt
 judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py,sha256=JPCvrekKLbl_xdD49evhtiFIVocuegCpCBkn1auzTSE,184
 judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py,sha256=BtVgE7z-9PHfFRcvn96aEG5mXVcWBweVyty934hZdiU,8915
 judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py,sha256=uO-8Uo7VrXu4xWpxjIx6_UI3aw5KuJxubSHb71Nzm6Q,4574
+judgeval/scorers/judgeval_scorers/local_implementations/execution_order/__init__.py,sha256=DpOHbjYEhVmP-RiaTEa5PZHpoPvduNXG5p6k9lR0AS0,157
+judgeval/scorers/judgeval_scorers/local_implementations/execution_order/execution_order.py,sha256=y-Ag8YuzEvExUIj4qU7y53INVLH9L_TUTJLIxCIdAQo,5458
 judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py,sha256=NbkSqPwxgF4T8KsvuIWhVyRwdOlo7mNHMFuRStTFnvk,154
 judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py,sha256=LPVTGHBBJSpE6TrgzZQS2_vw4P9HiUYmykrwo6UMdws,11251
 judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py,sha256=vNLjF4NKZJSV4VNenHzoAUB2xVZz6tt_5AzryKmOVrI,11690
@@ -84,11 +86,9 @@ judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_co
 judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py,sha256=mv6-XeLSV5yj1H98YYV2iTYVd88zKftZJP42Lgl6R80,89
 judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py,sha256=6GnRz2h-6Fwt4sl__0RgQOyo3n3iDO4MNuHWxdu-rrM,10242
 judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py,sha256=Qk7lwHgRPYeGoxTOyclAh1VfGItfvHJ6l1t7Nk3SWFM,20927
-judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py,sha256=JUB3TMqS1OHr6PqpIGqkyiBNbyfUaw7lZuUATjU3_ek,168
-judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py,sha256=8ucE8UrA44Mr-wHgVsFNU9gKunkPxe87VPYrFVi949g,5461
 judgeval/tracer/__init__.py,sha256=wy3DYpH8U_z0GO_K_gOSkK0tTTD-u5eLDo0T5xIBoAc,147
 judgeval/utils/alerts.py,sha256=RgW5R9Dn3Jtim0OyAYDbNzjoX2s6SA4Mw16GyyaikjI,1424
-judgeval-0.0.
-judgeval-0.0.
-judgeval-0.0.
-judgeval-0.0.
+judgeval-0.0.21.dist-info/METADATA,sha256=jQW4w6jGNaHvPWTcqX3ZGr_SKeCpNl7DsNr-cwrYHsA,1378
+judgeval-0.0.21.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+judgeval-0.0.21.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
+judgeval-0.0.21.dist-info/RECORD,,
judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py
DELETED
@@ -1,19 +0,0 @@
-"""
-`judgeval` tool correctness scorer
-
-TODO add link to docs page for this scorer
-
-"""
-
-# Internal imports
-from judgeval.scorers.api_scorer import APIJudgmentScorer
-from judgeval.constants import APIScorer
-
-
-class ToolCorrectnessScorer(APIJudgmentScorer):
-    def __init__(self, threshold: float):
-        super().__init__(threshold=threshold, score_type=APIScorer.TOOL_CORRECTNESS)
-
-    @property
-    def __name__(self):
-        return "Tool Correctness"
{judgeval-0.0.19.dist-info → judgeval-0.0.21.dist-info}/WHEEL
File without changes

{judgeval-0.0.19.dist-info → judgeval-0.0.21.dist-info}/licenses/LICENSE.md
File without changes