judgeval 0.0.18__py3-none-any.whl → 0.0.20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/clients.py +0 -1
- judgeval/common/tracer.py +24 -0
- judgeval/constants.py +1 -1
- judgeval/data/api_example.py +3 -16
- judgeval/data/datasets/dataset.py +114 -2
- judgeval/data/example.py +17 -16
- judgeval/data/result.py +3 -3
- judgeval/scorers/__init__.py +2 -2
- judgeval/scorers/judgeval_scorers/__init__.py +6 -6
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +35 -0
- judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +2 -2
- judgeval/scorers/judgeval_scorers/local_implementations/execution_order/__init__.py +3 -0
- judgeval/scorers/judgeval_scorers/local_implementations/{tool_correctness/tool_correctness_scorer.py → execution_order/execution_order.py} +3 -3
- {judgeval-0.0.18.dist-info → judgeval-0.0.20.dist-info}/METADATA +1 -1
- {judgeval-0.0.18.dist-info → judgeval-0.0.20.dist-info}/RECORD +18 -18
- judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py +0 -19
- judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py +0 -3
- {judgeval-0.0.18.dist-info → judgeval-0.0.20.dist-info}/WHEEL +0 -0
- {judgeval-0.0.18.dist-info → judgeval-0.0.20.dist-info}/licenses/LICENSE.md +0 -0
judgeval/clients.py
CHANGED
judgeval/common/tracer.py
CHANGED
@@ -11,6 +11,7 @@ import time
 import uuid
 import warnings
 from contextlib import contextmanager
+from collections import defaultdict
 from dataclasses import dataclass, field
 from datetime import datetime
 from http import HTTPStatus
@@ -962,6 +963,10 @@ def _format_output_data(client: ApiClient, response: Any) -> dict:
 class JudgevalCallbackHandler(BaseCallbackHandler):
     def __init__(self, trace_client: TraceClient):
         self.trace_client = trace_client
+        self.previous_node = "__start__"
+        self.executed_node_tools = []
+        self.executed_nodes = []
+        self.executed_tools = []
         self.openai_count = 1
 
     def start_span(self, name: str, span_type: SpanType = "span"):
@@ -1049,6 +1054,23 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         # End the retriever span
         self.end_span(self.trace_client._current_span, span_type="retriever")
 
+    def on_chain_start(
+        self,
+        serialized: Dict[str, Any],
+        inputs: Dict[str, Any],
+        *,
+        run_id: UUID,
+        parent_run_id: Optional[UUID] = None,
+        tags: Optional[List[str]] = None,
+        metadata: Optional[Dict[str, Any]] = None,
+        **kwargs: Any
+    ) -> None:
+        node = metadata.get("langgraph_node")
+        if node != None and node != "__start__" and node != self.previous_node:
+            self.executed_node_tools.append(node)
+            self.executed_nodes.append(node)
+        self.previous_node = node
+
     def on_tool_start(
         self,
         serialized: Optional[dict[str, Any]],
@@ -1060,6 +1082,8 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
     ):
         name = serialized["name"]
         self.start_span(name, span_type="tool")
+        self.executed_node_tools.append(f"{self.previous_node}:{name}")
+        self.executed_tools.append(name)
        self.trace_client.record_input({
            'args': input_str,
            'kwargs': kwargs
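Note: taken together, these tracer changes make the callback handler record LangGraph execution order: on_chain_start appends each newly entered graph node, and on_tool_start appends both the bare tool name and a "node:tool" pair. A minimal usage sketch follows; the graph and trace_client objects are assumed placeholders and are not part of this diff:

# Hedged sketch: wiring the handler into a LangGraph run and reading the
# execution-order fields introduced in 0.0.20. Only the handler attributes
# come from this diff; `trace_client` and `graph` are assumed to exist.
from judgeval.common.tracer import JudgevalCallbackHandler

handler = JudgevalCallbackHandler(trace_client)
graph.invoke({"input": "..."}, config={"callbacks": [handler]})

print(handler.executed_nodes)       # e.g. ["plan", "search", "answer"]
print(handler.executed_tools)       # e.g. ["web_search"]
print(handler.executed_node_tools)  # e.g. ["plan", "search", "search:web_search", "answer"]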
judgeval/constants.py
CHANGED
@@ -22,7 +22,7 @@ class APIScorer(str, Enum):
     CONTEXTUAL_RELEVANCY = "contextual_relevancy"
     CONTEXTUAL_PRECISION = "contextual_precision"
     INSTRUCTION_ADHERENCE = "instruction_adherence"
-    TOOL_CORRECTNESS = "tool_correctness"
+    EXECUTION_ORDER = "execution_order"
     JSON_CORRECTNESS = "json_correctness"
     COMPARISON = "comparison"
     GROUNDEDNESS = "groundedness"
judgeval/data/api_example.py
CHANGED
@@ -1,4 +1,4 @@
-from typing import List, Optional, Dict, Any
+from typing import List, Optional, Dict, Any, Union
 from pydantic import BaseModel, ConfigDict, model_validator
 
 from judgeval.data.example import Example
@@ -13,8 +13,8 @@ class ProcessExample(BaseModel):
     """
     name: str
     input: Optional[str] = None
-    actual_output: Optional[str] = None
-    expected_output: Optional[str] = None
+    actual_output: Optional[Union[str, List[str]]] = None
+    expected_output: Optional[Union[str, List[str]]] = None
     context: Optional[list] = None
     retrieval_context: Optional[list] = None
     tools_called: Optional[list] = None
@@ -57,19 +57,6 @@ class ProcessExample(BaseModel):
 
     def update_run_duration(self, run_duration: float):
         self.run_duration = run_duration
-
-    @model_validator(mode="before")
-    def check_input(cls, values: Dict[str, Any]):
-        input = values.get("input")
-        actual_output = values.get("actual_output")
-
-        if (input is None or actual_output is None):
-            error(f"Validation error: Required fields missing. input={input}, actual_output={actual_output}")
-            raise ValueError(
-                "'input' and 'actual_output' must be provided."
-            )
-
-        return values
 
 
 def create_process_example(
judgeval/data/datasets/dataset.py
CHANGED
@@ -3,6 +3,7 @@ import csv
 import datetime
 import json
 import os
+import yaml
 from dataclasses import dataclass, field
 from typing import List, Union, Literal
 
@@ -190,6 +191,76 @@ class EvalDataset:
         for g in ground_truths:
             self.add_ground_truth(g)
 
+    def add_from_yaml(self, file_path: str) -> None:
+        debug(f"Loading dataset from YAML file: {file_path}")
+        """
+        Adds examples and ground truths from a YAML file.
+
+        The format of the YAML file is expected to be a dictionary with two keys: "examples" and "ground_truths".
+        The value of each key is a list of dictionaries, where each dictionary represents an example or ground truth.
+
+        The YAML file is expected to have the following format:
+        ground_truths:
+          - input: "test input"
+            actual_output: null
+            expected_output: "expected output"
+            context:
+              - "context1"
+            retrieval_context:
+              - "retrieval1"
+            additional_metadata:
+              key: "value"
+            comments: "test comment"
+            tools_called:
+              - "tool1"
+            expected_tools:
+              - "tool1"
+            source_file: "test.py"
+            trace_id: "094121"
+        examples:
+          - input: "test input"
+            actual_output: "test output"
+            expected_output: "expected output"
+            context:
+              - "context1"
+              - "context2"
+            retrieval_context:
+              - "retrieval1"
+            additional_metadata:
+              key: "value"
+            tools_called:
+              - "tool1"
+            expected_tools:
+              - "tool1"
+              - "tool2"
+            name: "test example"
+            example_id: null
+            timestamp: "20241230_160117"
+            trace_id: "123"
+        """
+        try:
+            with open(file_path, "r") as file:
+                payload = yaml.safe_load(file)
+                if payload is None:
+                    raise ValueError("The YAML file is empty.")
+                examples = payload.get("examples", [])
+                ground_truths = payload.get("ground_truths", [])
+        except FileNotFoundError:
+            error(f"YAML file not found: {file_path}")
+            raise FileNotFoundError(f"The file {file_path} was not found.")
+        except yaml.YAMLError:
+            error(f"Invalid YAML file: {file_path}")
+            raise ValueError(f"The file {file_path} is not a valid YAML file.")
+
+        info(f"Added {len(examples)} examples and {len(ground_truths)} ground truths from YAML")
+        new_examples = [Example(**e) for e in examples]
+        for e in new_examples:
+            self.add_example(e)
+
+        new_ground_truths = [GroundTruthExample(**g) for g in ground_truths]
+        for g in new_ground_truths:
+            self.add_ground_truth(g)
+
     def add_example(self, e: Example) -> None:
         self.examples = self.examples + [e]
         # TODO if we need to add rank, then we need to do it here
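Note: a minimal sketch of the new loading path, assuming EvalDataset can be constructed without arguments and that my_dataset.yaml follows the format documented in the method above (the file name is illustrative, not from the diff):

# Hedged sketch of EvalDataset.add_from_yaml as added in 0.0.20.
# "my_dataset.yaml" is an assumed file in the documented format.
from judgeval.data.datasets.dataset import EvalDataset

dataset = EvalDataset()
dataset.add_from_yaml("my_dataset.yaml")  # fills dataset.examples and dataset.ground_truths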
@@ -197,7 +268,7 @@ class EvalDataset:
     def add_ground_truth(self, g: GroundTruthExample) -> None:
         self.ground_truths = self.ground_truths + [g]
 
-    def save_as(self, file_type: Literal["json", "csv"], dir_path: str, save_name: str = None) -> None:
+    def save_as(self, file_type: Literal["json", "csv", "yaml"], dir_path: str, save_name: str = None) -> None:
         """
         Saves the dataset as a file. Save both the ground truths and examples.
 
@@ -266,8 +337,49 @@ class EvalDataset:
                         g.trace_id
                     ]
                 )
+        elif file_type == "yaml":
+            with open(complete_path, "w") as file:
+                yaml_data = {
+                    "examples": [
+                        {
+                            "input": e.input,
+                            "actual_output": e.actual_output,
+                            "expected_output": e.expected_output,
+                            "context": e.context,
+                            "retrieval_context": e.retrieval_context,
+                            "additional_metadata": e.additional_metadata,
+                            "tools_called": e.tools_called,
+                            "expected_tools": e.expected_tools,
+                            "name": e.name,
+                            "comments": None,  # Example does not have comments
+                            "source_file": None,  # Example does not have source file
+                            "example": True,  # Adding an Example
+                            "trace_id": e.trace_id
+                        }
+                        for e in self.examples
+                    ],
+                    "ground_truths": [
+                        {
+                            "input": g.input,
+                            "actual_output": g.actual_output,
+                            "expected_output": g.expected_output,
+                            "context": g.context,
+                            "retrieval_context": g.retrieval_context,
+                            "additional_metadata": g.additional_metadata,
+                            "tools_called": g.tools_called,
+                            "expected_tools": g.expected_tools,
+                            "name": None,  # GroundTruthExample does not have name
+                            "comments": g.comments,
+                            "source_file": g.source_file,
+                            "example": False,  # Adding a GroundTruthExample, not an Example
+                            "trace_id": g.trace_id
+                        }
+                        for g in self.ground_truths
+                    ]
+                }
+                yaml.dump(yaml_data, file, default_flow_style=False)
         else:
-            ACCEPTABLE_FILE_TYPES = ["json", "csv"]
+            ACCEPTABLE_FILE_TYPES = ["json", "csv", "yaml"]
             raise TypeError(f"Invalid file type: {file_type}. Please choose from {ACCEPTABLE_FILE_TYPES}")
 
     def __iter__(self):
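Note: save_as gains a matching "yaml" branch, so a dataset can now round-trip through YAML. A sketch under the same assumptions as above; the paths are illustrative, and the "save_name + .yaml" file naming is an assumption about how complete_path is built:

# Hedged sketch: persisting with the new "yaml" file type, then reloading.
dataset.save_as("yaml", dir_path="./datasets", save_name="eval_run")
restored = EvalDataset()
restored.add_from_yaml("./datasets/eval_run.yaml")  # assumed output file name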
judgeval/data/example.py
CHANGED
@@ -2,11 +2,13 @@
 Classes for representing examples in a dataset.
 """
 
-from typing import Optional, Any, Dict, List
+
+from typing import Optional, Any, Dict, List, Union
 from uuid import uuid4
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, field_validator
 from enum import Enum
 from datetime import datetime
+import time
 
 
 class ExampleParams(Enum):
@@ -22,9 +24,9 @@ class ExampleParams(Enum):
 
 
 class Example(BaseModel):
-    input: str
-    actual_output: str
-    expected_output: Optional[str] = None
+    input: Optional[str] = None
+    actual_output: Optional[Union[str, List[str]]] = None
+    expected_output: Optional[Union[str, List[str]]] = None
     context: Optional[List[str]] = None
     retrieval_context: Optional[List[str]] = None
     additional_metadata: Optional[Dict[str, Any]] = None
|
|
37
39
|
trace_id: Optional[str] = None
|
38
40
|
|
39
41
|
def __init__(self, **data):
|
40
|
-
# Check that required fields are provided
|
41
|
-
if 'input' not in data:
|
42
|
-
raise ValueError("Example must be initialized with 'input' field.")
|
43
|
-
if 'actual_output' not in data:
|
44
|
-
raise ValueError("Example must be initialized with 'actual_output' field.")
|
45
|
-
|
46
42
|
if 'example_id' not in data:
|
47
43
|
data['example_id'] = str(uuid4())
|
48
44
|
# Set timestamp if not provided
|
@@ -53,22 +49,27 @@ class Example(BaseModel):
     @field_validator('input', mode='before')
     @classmethod
     def validate_input(cls, v):
-        if not v or not isinstance(v, str):
+        if v is not None and (not v or not isinstance(v, str)):
             raise ValueError(f"Input must be a non-empty string but got '{v}' of type {type(v)}")
         return v
 
     @field_validator('actual_output', mode='before')
     @classmethod
     def validate_actual_output(cls, v):
-        if not v or not isinstance(v, str):
-            raise ValueError(…)
+        if v is not None:
+            if not isinstance(v, (str, list)):
+                raise ValueError(f"Actual output must be a string or a list of strings but got {v} of type {type(v)}")
+            if isinstance(v, list) and not all(isinstance(item, str) for item in v):
+                raise ValueError(f"All items in actual_output must be strings but got {v}")
         return v
 
     @field_validator('expected_output', mode='before')
     @classmethod
     def validate_expected_output(cls, v):
-        if v is not None and not isinstance(v, str):
-            raise ValueError(f"Expected output must be a string or None but got {v} of type {type(v)}")
+        if v is not None and not isinstance(v, (str, list)):
+            raise ValueError(f"Expected output must be a string, a list of strings, or None but got {v} of type {type(v)}")
+        if isinstance(v, list) and not all(isinstance(item, str) for item in v):
+            raise ValueError(f"All items in expected_output must be strings but got {v}")
         return v
 
     @field_validator('context', 'retrieval_context', 'tools_called', 'expected_tools', mode='before')
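Note: with the __init__ checks removed and the validators relaxed, Example now accepts a missing input and list-valued outputs, which is the shape an execution-order evaluation consumes. An illustrative sketch; the field values are made up:

# Hedged sketch: shapes that validate after this diff but were rejected in
# 0.0.18, where input and actual_output were required strings.
from judgeval.data.example import Example

ex = Example(
    actual_output=["search", "summarize"],    # ordered list of executed steps
    expected_output=["search", "summarize"],  # expected execution order
)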
judgeval/data/result.py
CHANGED
@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import List, Union, Optional, Dict, Any
+from typing import List, Union, Optional, Dict, Any, Union
 
 from judgeval.data import ScorerData, ProcessExample
 
@@ -30,8 +30,8 @@ class ScoringResult:
 
     # Inputs from the original example
     input: Optional[str] = None
-    actual_output: Optional[str] = None
-    expected_output: Optional[str] = None
+    actual_output: Optional[Union[str, List[str]]] = None
+    expected_output: Optional[Union[str, List[str]]] = None
     context: Optional[List[str]] = None
     retrieval_context: Optional[List[str]] = None
     additional_metadata: Optional[Dict[str, Any]] = None
judgeval/scorers/__init__.py
CHANGED
@@ -2,7 +2,7 @@ from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.scorers.judgeval_scorer import JudgevalScorer
 from judgeval.scorers.prompt_scorer import PromptScorer, ClassifierScorer
 from judgeval.scorers.judgeval_scorers import (
-    ToolCorrectnessScorer,
+    ExecutionOrderScorer,
     JSONCorrectnessScorer,
     SummarizationScorer,
     HallucinationScorer,
@@ -24,7 +24,7 @@ __all__ = [
     "JudgevalScorer",
     "PromptScorer",
     "ClassifierScorer",
-    "ToolCorrectnessScorer",
+    "ExecutionOrderScorer",
     "JSONCorrectnessScorer",
     "SummarizationScorer",
     "HallucinationScorer",
judgeval/scorers/judgeval_scorers/__init__.py
CHANGED
@@ -2,7 +2,7 @@ from typing import Type, Optional, Any
 
 # Import implementations
 from judgeval.scorers.judgeval_scorers.api_scorers import (
-    ToolCorrectnessScorer as APIToolCorrectnessScorer,
+    ExecutionOrderScorer as APIExecutionOrderScorer,
     JSONCorrectnessScorer as APIJSONCorrectnessScorer,
     SummarizationScorer as APISummarizationScorer,
     HallucinationScorer as APIHallucinationScorer,
@@ -24,7 +24,7 @@ from judgeval.scorers.judgeval_scorers.local_implementations import (
     ContextualRelevancyScorer as LocalContextualRelevancyScorer,
     FaithfulnessScorer as LocalFaithfulnessScorer,
     JsonCorrectnessScorer as LocalJsonCorrectnessScorer,
-    ToolCorrectnessScorer as LocalToolCorrectnessScorer,
+    ExecutionOrderScorer as LocalExecutionOrderScorer,
     HallucinationScorer as LocalHallucinationScorer,
     SummarizationScorer as LocalSummarizationScorer,
     AnswerCorrectnessScorer as LocalAnswerCorrectnessScorer,
@@ -98,9 +98,9 @@ AnswerRelevancyScorer = ScorerWrapper(
     local_implementation=LocalAnswerRelevancyScorer
 )
 
-ToolCorrectnessScorer = ScorerWrapper(
-    api_implementation=APIToolCorrectnessScorer,
-    local_implementation=LocalToolCorrectnessScorer
+ExecutionOrderScorer = ScorerWrapper(
+    api_implementation=APIExecutionOrderScorer,
+    local_implementation=LocalExecutionOrderScorer
 )
 
 JSONCorrectnessScorer = ScorerWrapper(
@@ -154,7 +154,7 @@ GroundednessScorer = ScorerWrapper(
 )
 
 __all__ = [
-    "ToolCorrectnessScorer",
+    "ExecutionOrderScorer",
     "JSONCorrectnessScorer",
     "SummarizationScorer",
     "HallucinationScorer",
judgeval/scorers/judgeval_scorers/api_scorers/__init__.py
CHANGED
@@ -1,4 +1,4 @@
-from judgeval.scorers.judgeval_scorers.api_scorers.tool_correctness import ToolCorrectnessScorer
+from judgeval.scorers.judgeval_scorers.api_scorers.execution_order import ExecutionOrderScorer
 from judgeval.scorers.judgeval_scorers.api_scorers.json_correctness import JSONCorrectnessScorer
 from judgeval.scorers.judgeval_scorers.api_scorers.summarization import SummarizationScorer
 from judgeval.scorers.judgeval_scorers.api_scorers.hallucination import HallucinationScorer
@@ -13,7 +13,7 @@ from judgeval.scorers.judgeval_scorers.api_scorers.instruction_adherence import
 from judgeval.scorers.judgeval_scorers.api_scorers.groundedness import GroundednessScorer
 
 __all__ = [
-    "ToolCorrectnessScorer",
+    "ExecutionOrderScorer",
     "JSONCorrectnessScorer",
     "SummarizationScorer",
    "HallucinationScorer",
|
@@ -0,0 +1,35 @@
|
|
1
|
+
"""
|
2
|
+
`judgeval` tool correctness scorer
|
3
|
+
|
4
|
+
TODO add link to docs page for this scorer
|
5
|
+
|
6
|
+
"""
|
7
|
+
|
8
|
+
# Internal imports
|
9
|
+
from judgeval.scorers.api_scorer import APIJudgmentScorer
|
10
|
+
from judgeval.constants import APIScorer
|
11
|
+
from typing import Optional, Dict
|
12
|
+
|
13
|
+
class ExecutionOrderScorer(APIJudgmentScorer):
|
14
|
+
kwargs: Optional[Dict] = None
|
15
|
+
|
16
|
+
def __init__(self, threshold: float, should_exact_match: bool = False, should_consider_ordering: bool = False):
|
17
|
+
super().__init__(threshold=threshold, score_type=APIScorer.EXECUTION_ORDER)
|
18
|
+
self.kwargs = {"should_exact_match": should_exact_match, "should_consider_ordering": should_consider_ordering}
|
19
|
+
|
20
|
+
@property
|
21
|
+
def __name__(self):
|
22
|
+
return "Execution Order"
|
23
|
+
|
24
|
+
def to_dict(self) -> dict:
|
25
|
+
"""
|
26
|
+
Converts the scorer configuration to a dictionary format.
|
27
|
+
|
28
|
+
Returns:
|
29
|
+
dict: A dictionary containing the scorer's configuration
|
30
|
+
"""
|
31
|
+
return {
|
32
|
+
"score_type": self.score_type,
|
33
|
+
"threshold": self.threshold,
|
34
|
+
"kwargs": self.kwargs
|
35
|
+
}
|
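Note: a minimal construction sketch based only on the signature defined in the file above; the printed shape follows to_dict(), though the exact rendering of the enum value may differ:

# Hedged sketch: constructing the new API scorer from this file.
from judgeval.scorers.judgeval_scorers.api_scorers.execution_order import ExecutionOrderScorer

scorer = ExecutionOrderScorer(threshold=1.0, should_exact_match=True)
print(scorer.to_dict())
# roughly: {"score_type": "execution_order", "threshold": 1.0,
#           "kwargs": {"should_exact_match": True, "should_consider_ordering": False}}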
judgeval/scorers/judgeval_scorers/local_implementations/__init__.py
CHANGED
@@ -4,7 +4,7 @@ from judgeval.scorers.judgeval_scorers.local_implementations.contextual_recall.c
 from judgeval.scorers.judgeval_scorers.local_implementations.contextual_relevancy.contextual_relevancy_scorer import ContextualRelevancyScorer
 from judgeval.scorers.judgeval_scorers.local_implementations.faithfulness.faithfulness_scorer import FaithfulnessScorer
 from judgeval.scorers.judgeval_scorers.local_implementations.json_correctness.json_correctness_scorer import JsonCorrectnessScorer
-from judgeval.scorers.judgeval_scorers.local_implementations.tool_correctness.tool_correctness_scorer import ToolCorrectnessScorer
+from judgeval.scorers.judgeval_scorers.local_implementations.execution_order.execution_order import ExecutionOrderScorer
 from judgeval.scorers.judgeval_scorers.local_implementations.hallucination.hallucination_scorer import HallucinationScorer
 from judgeval.scorers.judgeval_scorers.local_implementations.summarization.summarization_scorer import SummarizationScorer
 from judgeval.scorers.judgeval_scorers.local_implementations.answer_correctness.answer_correctness_scorer import AnswerCorrectnessScorer
@@ -20,7 +20,7 @@ __all__ = [
     "ContextualRelevancyScorer",
     "FaithfulnessScorer",
     "JsonCorrectnessScorer",
-    "ToolCorrectnessScorer",
+    "ExecutionOrderScorer",
     "HallucinationScorer",
     "SummarizationScorer",
     "InstructionAdherenceScorer",
judgeval/scorers/judgeval_scorers/local_implementations/{tool_correctness/tool_correctness_scorer.py → execution_order/execution_order.py}
RENAMED
@@ -45,7 +45,7 @@ def get_lcs(seq1, seq2):
     return lcs[::-1]
 
 
-class ToolCorrectnessScorer(JudgevalScorer):
+class ExecutionOrderScorer(JudgevalScorer):
     def __init__(
         self,
         threshold: float = 0.5,
@@ -56,7 +56,7 @@ class ToolCorrectnessScorer(JudgevalScorer):
         should_consider_ordering: bool = False,
     ):
         super().__init__(
-            score_type=APIScorer.TOOL_CORRECTNESS,
+            score_type=APIScorer.EXECUTION_ORDER,
             threshold=1 if strict_mode else threshold,
             evaluation_model=None,
             include_reason=include_reason,
@@ -152,5 +152,5 @@ class ToolCorrectnessScorer(JudgevalScorer):
 
     @property
     def __name__(self):
-        return "Tool Correctness"
+        return "Execution Order"
 
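Note: because ToolCorrectnessScorer is renamed rather than kept as an alias, downstream imports need a one-line change when upgrading from 0.0.18 to 0.0.20:

# Before (0.0.18):
# from judgeval.scorers import ToolCorrectnessScorer
# After (0.0.20):
from judgeval.scorers import ExecutionOrderScorer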
{judgeval-0.0.18.dist-info → judgeval-0.0.20.dist-info}/RECORD
CHANGED
@@ -1,6 +1,6 @@
 judgeval/__init__.py,sha256=dtXxsCmI4eEsZdGSUMy8P_pA0bc2-OSGAgb2C__yJoA,252
-judgeval/clients.py,sha256=…
-judgeval/constants.py,sha256=…
+judgeval/clients.py,sha256=6VQmEqmfCngUdS2MuPBIpHvtDFqOENm8-_BmMvjLyRQ,944
+judgeval/constants.py,sha256=i8JIDUyo38Vt0R1n0GRA4FaakkBC5F2o4hQa0ncSF2E,5008
 judgeval/evaluation_run.py,sha256=RgJD60lJsunNQzObjo7iXnAzXWgubCLOAAuuamAAuoI,6354
 judgeval/judgment_client.py,sha256=evlvcrYO9pF-oCgcvlGE59iODN0C6GJtn7bySFU_88k,23384
 judgeval/rules.py,sha256=ebsiDEBVAnYTQxwVNvh_RpmKeWBnjQXgHs8KofTjcAs,15526
@@ -8,16 +8,16 @@ judgeval/run_evaluation.py,sha256=yLW24kFcw0xzXHvnDclYqtujTww6SDwvut6HM1x7SXk,21
 judgeval/common/__init__.py,sha256=7d24BRxtncpMj3AAJCj8RS7TqgjXmW777HVZH6-3sBs,289
 judgeval/common/exceptions.py,sha256=U-TxHLn7oVMezsMuoYouNDb2XuS8RCggfntYf5_6u4E,565
 judgeval/common/logger.py,sha256=KO75wWXCxhUHUMvLaTU31ZzOk6tkZBa7heQ7y0f-zFE,6062
-judgeval/common/tracer.py,sha256=…
+judgeval/common/tracer.py,sha256=FYrAuav6OiiawHLQ2e154MLvCBMdh-z_ucU2h7XK08M,45295
 judgeval/common/utils.py,sha256=LUQV5JfDr6wj7xHAJoNq-gofNZ6mjXbeKrGKzBME1KM,33533
 judgeval/data/__init__.py,sha256=QykVE22Qf-I2f1g-jC9-iQyLNXgDmX1-vHbCgZg8Ra8,558
-judgeval/data/api_example.py,sha256=…
-judgeval/data/example.py,sha256=…
+judgeval/data/api_example.py,sha256=dzkrQ0xno08y6qNfqL2djXbapUyc2B2aQ5iANn0o4CY,3667
+judgeval/data/example.py,sha256=BhGBhamFWgH6wtvrRYM8dGtDfXh-cDxDhtNL5Gbdz_M,5892
 judgeval/data/ground_truth.py,sha256=OTBs3VZe-Wp0vEXEsq14GPZHYtpWT16bhGQTycIvkKc,2057
-judgeval/data/result.py,sha256=…
+judgeval/data/result.py,sha256=4fgjKtUmT3br7K6fkRiNIxTGKUuwMeGyRLqzkpxwXKE,4436
 judgeval/data/scorer_data.py,sha256=JVlaTx1EP2jw2gh3Vgx1CSEsvIFABAN26IquKyxwiJQ,3273
 judgeval/data/datasets/__init__.py,sha256=IdNKhQv9yYZ_op0rdBacrFaFVmiiYQ3JTzXzxOTsEVQ,176
-judgeval/data/datasets/dataset.py,sha256=…
+judgeval/data/datasets/dataset.py,sha256=LrBK8y3y1R9_BKmXxTzdXMMIQvXlq7tf7TM-u7jgSxE,16839
 judgeval/data/datasets/eval_dataset_client.py,sha256=QsfHyFC4WePV7uJGYUVjiIwtk1Ie_VpWUrnd2Q4kKdU,11479
 judgeval/data/datasets/utils.py,sha256=6DpGCPmGFNOKIGNcVCOSjTOdWemrpAuYnlo778sGG7g,2455
 judgeval/judges/__init__.py,sha256=6X7VSwrwsdxGBNxCyapVRWGghhKOy3MVxFNMQ62kCXM,308
@@ -26,7 +26,7 @@ judgeval/judges/litellm_judge.py,sha256=EIL58Teptv8DzZUO3yP2RDQCDq-aoBB6HPZzPdK6
 judgeval/judges/mixture_of_judges.py,sha256=IJoi4Twk8ze1CJWVEp69k6TSqTCTGrmVYQ0qdffer60,15549
 judgeval/judges/together_judge.py,sha256=l00hhPerAZXg3oYBd8cyMtWsOTNt_0FIqoxhKJKQe3k,2302
 judgeval/judges/utils.py,sha256=9lvUxziGV86ISvVFxYBWc09TWFyAQgUTyPf_a9mD5Rs,2686
-judgeval/scorers/__init__.py,sha256=…
+judgeval/scorers/__init__.py,sha256=gkeKJvjXhswCnkEyjijrVvGVM3Om86egrZ-PUOGvNvI,1158
 judgeval/scorers/api_scorer.py,sha256=wGqTQCbUE7uE-PzaKcCmexAqutdTunjFR0zVA6bUxdE,2518
 judgeval/scorers/base_scorer.py,sha256=xdUlY3CnLdCQ1Z5iUeY22Bim5v-OQruZmaVF_4Y1mC0,2183
 judgeval/scorers/exceptions.py,sha256=eGW5CuJgZ5YJBFrE4FHDSF651PO1dKAZ379mJ8gOsfo,178
@@ -34,25 +34,25 @@ judgeval/scorers/judgeval_scorer.py,sha256=oIkfoGXA09wL_vcK1DRibzQSA-MFNa-hmw1Ih
 judgeval/scorers/prompt_scorer.py,sha256=PaAs2qRolw1P3_I061Xvk9qzvF4O-JR8g_39RqXnHcM,17728
 judgeval/scorers/score.py,sha256=GALVmeApP1Cyih2vY93zRaU6RShtW4jJDG47Pm6yfnw,18657
 judgeval/scorers/utils.py,sha256=iHQVTlIANbmCTXz9kTeSdOytgUZ_T74Re61ajqsk_WQ,6827
-judgeval/scorers/judgeval_scorers/__init__.py,sha256=…
-judgeval/scorers/judgeval_scorers/api_scorers/__init__.py,sha256=…
+judgeval/scorers/judgeval_scorers/__init__.py,sha256=xFRb62sp4JmBUSeuAB_pC_7kEGp-lGdqCRIu9--Bbdg,5992
+judgeval/scorers/judgeval_scorers/api_scorers/__init__.py,sha256=mZ6b_5Dl04k3PaG24ICBajB_j43ody1II1OJhO1DkXo,1648
 judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py,sha256=690G5askjE8dcbKPGvCF6JxAEM9QJUqb-3K-D6lI6oM,463
 judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py,sha256=CqvvjV7AZqPlXh-PZaPKYPILHr15u4bIYiKBFjlk5i0,457
 judgeval/scorers/judgeval_scorers/api_scorers/comparison.py,sha256=6Q1qbsANOoZ3PM8n_gtZLIMbTBB9879L3acRelNJ6Uk,1001
 judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py,sha256=2zBrm_EEc143bmPA4HVcf8XtQeuc_BexczGx-SHlwRY,473
 judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py,sha256=NyojBWy_lRYx8diREulSK8s9dfYdZav4eZjg3TwUm0M,461
 judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py,sha256=wROMWOliCnB39ftX9TdeZmG9y0vrnxIGVby65tLOQRU,574
+judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py,sha256=qxnvEDeKRlyzxX3EX53sW4oXxAM8Fj_q6ibdTxJNTAc,1076
 judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py,sha256=gNf_i5c0jjpz2zCGhe7TtDMLKxc1PdOExJMFB5X7hSg,442
 judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py,sha256=esO76hEp0NzeBUdoSICPLdx5AeA5zWSt_2zpcSgvGis,442
 judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py,sha256=ffYwH3CexPkKgo1rCALMivypROQjG5WWEsKXEFZxe2k,446
 judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py,sha256=t1lWYOF0Pxvw5-NrI1Dt9FojaOncOCRlZc4a2SA20h4,477
 judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py,sha256=CAZBQKwNSqpqAoOgStYfr-yP1Brug_6VRimRIQY-zdg,894
 judgeval/scorers/judgeval_scorers/api_scorers/summarization.py,sha256=-E3oxYbI0D_0q-_fGWh2jQHW9O4Pu7I7xvLWsHU6cn8,450
-judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py,sha256=17ppPXm962ew67GU5m0npzbPu3CuhgdKY_KmfPvKfu4,457
 judgeval/scorers/judgeval_scorers/classifiers/__init__.py,sha256=Qt81W5ZCwMvBAne0LfQDb8xvg5iOG1vEYP7WizgwAZo,67
 judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py,sha256=8iTzMvou1Dr8pybul6lZHKjc9Ye2-0_racRGYkhEdTY,74
 judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py,sha256=ly72Z7s_c8NID6-nQnuW8qEGEW2MqdvpJ-5WfXzbAQg,2579
-judgeval/scorers/judgeval_scorers/local_implementations/__init__.py,sha256=…
+judgeval/scorers/judgeval_scorers/local_implementations/__init__.py,sha256=k_t-THIAtsk7lNvm9faj0u24dPZjn7qRbZ8YGjQ21xs,1926
 judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py,sha256=cxxUEspgoIdSzJbwIIioamC0-xDqhYVfYAWxaYF-D_Y,177
 judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py,sha256=3Dpm8BIIe0Th2p0ccO5bb-le93lywjOLSo712HwEIUE,10196
 judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py,sha256=hBUqEd8Hy3g8peOVjpSmRb31fPtpodDzdRUonhKRl30,6686
@@ -71,6 +71,8 @@ judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompt
 judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py,sha256=JPCvrekKLbl_xdD49evhtiFIVocuegCpCBkn1auzTSE,184
 judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py,sha256=BtVgE7z-9PHfFRcvn96aEG5mXVcWBweVyty934hZdiU,8915
 judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py,sha256=uO-8Uo7VrXu4xWpxjIx6_UI3aw5KuJxubSHb71Nzm6Q,4574
+judgeval/scorers/judgeval_scorers/local_implementations/execution_order/__init__.py,sha256=DpOHbjYEhVmP-RiaTEa5PZHpoPvduNXG5p6k9lR0AS0,157
+judgeval/scorers/judgeval_scorers/local_implementations/execution_order/execution_order.py,sha256=y-Ag8YuzEvExUIj4qU7y53INVLH9L_TUTJLIxCIdAQo,5458
 judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py,sha256=NbkSqPwxgF4T8KsvuIWhVyRwdOlo7mNHMFuRStTFnvk,154
 judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py,sha256=LPVTGHBBJSpE6TrgzZQS2_vw4P9HiUYmykrwo6UMdws,11251
 judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py,sha256=vNLjF4NKZJSV4VNenHzoAUB2xVZz6tt_5AzryKmOVrI,11690
@@ -84,11 +86,9 @@ judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_co
 judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py,sha256=mv6-XeLSV5yj1H98YYV2iTYVd88zKftZJP42Lgl6R80,89
 judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py,sha256=6GnRz2h-6Fwt4sl__0RgQOyo3n3iDO4MNuHWxdu-rrM,10242
 judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py,sha256=Qk7lwHgRPYeGoxTOyclAh1VfGItfvHJ6l1t7Nk3SWFM,20927
-judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py,sha256=JUB3TMqS1OHr6PqpIGqkyiBNbyfUaw7lZuUATjU3_ek,168
-judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py,sha256=8ucE8UrA44Mr-wHgVsFNU9gKunkPxe87VPYrFVi949g,5461
 judgeval/tracer/__init__.py,sha256=wy3DYpH8U_z0GO_K_gOSkK0tTTD-u5eLDo0T5xIBoAc,147
 judgeval/utils/alerts.py,sha256=RgW5R9Dn3Jtim0OyAYDbNzjoX2s6SA4Mw16GyyaikjI,1424
-judgeval-0.0.18.dist-info/METADATA,sha256=…
-judgeval-0.0.18.dist-info/WHEEL,sha256=…
-judgeval-0.0.18.dist-info/licenses/LICENSE.md,sha256=…
-judgeval-0.0.18.dist-info/RECORD,,
+judgeval-0.0.20.dist-info/METADATA,sha256=cz7uKUuHAc1rdANc8IJ5klQhlmrqOu_K1y6wwEIAdFU,1283
+judgeval-0.0.20.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+judgeval-0.0.20.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
+judgeval-0.0.20.dist-info/RECORD,,
judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py
DELETED
@@ -1,19 +0,0 @@
-"""
-`judgeval` tool correctness scorer
-
-TODO add link to docs page for this scorer
-
-"""
-
-# Internal imports
-from judgeval.scorers.api_scorer import APIJudgmentScorer
-from judgeval.constants import APIScorer
-
-
-class ToolCorrectnessScorer(APIJudgmentScorer):
-    def __init__(self, threshold: float):
-        super().__init__(threshold=threshold, score_type=APIScorer.TOOL_CORRECTNESS)
-
-    @property
-    def __name__(self):
-        return "Tool Correctness"
{judgeval-0.0.18.dist-info → judgeval-0.0.20.dist-info}/WHEEL
File without changes
{judgeval-0.0.18.dist-info → judgeval-0.0.20.dist-info}/licenses/LICENSE.md
File without changes