judgeval 0.0.19__py3-none-any.whl → 0.0.21__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. judgeval/common/tracer.py +65 -2
  2. judgeval/constants.py +2 -1
  3. judgeval/data/api_example.py +3 -16
  4. judgeval/data/datasets/dataset.py +114 -2
  5. judgeval/data/example.py +16 -15
  6. judgeval/data/result.py +3 -3
  7. judgeval/judgment_client.py +20 -3
  8. judgeval/run_evaluation.py +62 -8
  9. judgeval/scorers/__init__.py +2 -2
  10. judgeval/scorers/api_scorer.py +3 -1
  11. judgeval/scorers/judgeval_scorers/__init__.py +6 -6
  12. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -2
  13. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +10 -2
  14. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +9 -2
  15. judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +10 -2
  16. judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +11 -2
  17. judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +11 -2
  18. judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +10 -3
  19. judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +43 -0
  20. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +10 -2
  21. judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +10 -2
  22. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +10 -2
  23. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +9 -2
  24. judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +9 -2
  25. judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +10 -3
  26. judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +2 -2
  27. judgeval/scorers/judgeval_scorers/local_implementations/execution_order/__init__.py +3 -0
  28. judgeval/scorers/judgeval_scorers/local_implementations/{tool_correctness/tool_correctness_scorer.py → execution_order/execution_order.py} +3 -3
  29. {judgeval-0.0.19.dist-info → judgeval-0.0.21.dist-info}/METADATA +7 -3
  30. {judgeval-0.0.19.dist-info → judgeval-0.0.21.dist-info}/RECORD +32 -32
  31. judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py +0 -19
  32. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py +0 -3
  33. {judgeval-0.0.19.dist-info → judgeval-0.0.21.dist-info}/WHEEL +0 -0
  34. {judgeval-0.0.19.dist-info → judgeval-0.0.21.dist-info}/licenses/LICENSE.md +0 -0
judgeval/common/tracer.py CHANGED
@@ -10,7 +10,9 @@ import os
  import time
  import uuid
  import warnings
+ from contextvars import ContextVar
  from contextlib import contextmanager
+ from collections import defaultdict
  from dataclasses import dataclass, field
  from datetime import datetime
  from http import HTTPStatus
@@ -36,6 +38,7 @@ from judgeval.constants import (
      RABBITMQ_PORT,
      RABBITMQ_QUEUE,
      JUDGMENT_TRACES_DELETE_API_URL,
+     JUDGMENT_PROJECT_DELETE_API_URL,
      JUDGMENT_TRACES_ADD_TO_EVAL_QUEUE_API_URL
  )
  from judgeval.judgment_client import JudgmentClient
@@ -53,7 +56,7 @@ from langchain_core.utils.function_calling import convert_to_openai_tool
  from langchain_core.callbacks import CallbackManager, BaseCallbackHandler
  from langchain_core.agents import AgentAction, AgentFinish
  from langchain_core.outputs import LLMResult
-
+ from langchain_core.tracers.context import register_configure_hook
  from langchain_core.messages.ai import AIMessage
  from langchain_core.messages.tool import ToolMessage
  from langchain_core.messages.base import BaseMessage
@@ -250,7 +253,8 @@ class TraceManagerClient:
              raise ValueError(f"Failed to save trace data: {response.text}")

          if not empty_save and "ui_results_url" in response.json():
-             rprint(f"\n🔍 You can view your trace data here: [rgb(106,0,255)]{response.json()['ui_results_url']}[/]\n")
+             pretty_str = f"\n🔍 You can view your trace data here: [rgb(106,0,255)][link={response.json()['ui_results_url']}]View Trace[/link]\n"
+             rprint(pretty_str)

      def delete_trace(self, trace_id: str):
          """
@@ -293,6 +297,27 @@ class TraceManagerClient:
              raise ValueError(f"Failed to delete trace: {response.text}")

          return response.json()
+
+     def delete_project(self, project_name: str):
+         """
+         Deletes a project from the server. Which also deletes all evaluations and traces associated with the project.
+         """
+         response = requests.delete(
+             JUDGMENT_PROJECT_DELETE_API_URL,
+             json={
+                 "project_name": project_name,
+             },
+             headers={
+                 "Content-Type": "application/json",
+                 "Authorization": f"Bearer {self.judgment_api_key}",
+                 "X-Organization-Id": self.organization_id
+             }
+         )
+
+         if response.status_code != HTTPStatus.OK:
+             raise ValueError(f"Failed to delete traces: {response.text}")
+
+         return response.json()


  class TraceClient:
@@ -962,6 +987,10 @@ def _format_output_data(client: ApiClient, response: Any) -> dict:
  class JudgevalCallbackHandler(BaseCallbackHandler):
      def __init__(self, trace_client: TraceClient):
          self.trace_client = trace_client
+         self.previous_node = "__start__"
+         self.executed_node_tools = []
+         self.executed_nodes = []
+         self.executed_tools = []
          self.openai_count = 1

      def start_span(self, name: str, span_type: SpanType = "span"):
@@ -1049,6 +1078,23 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
          # End the retriever span
          self.end_span(self.trace_client._current_span, span_type="retriever")

+     def on_chain_start(
+         self,
+         serialized: Dict[str, Any],
+         inputs: Dict[str, Any],
+         *,
+         run_id: UUID,
+         parent_run_id: Optional[UUID] = None,
+         tags: Optional[List[str]] = None,
+         metadata: Optional[Dict[str, Any]] = None,
+         **kwargs: Any
+     ) -> None:
+         node = metadata.get("langgraph_node")
+         if node != None and node != "__start__" and node != self.previous_node:
+             self.executed_node_tools.append(node)
+             self.executed_nodes.append(node)
+             self.previous_node = node
+
      def on_tool_start(
          self,
          serialized: Optional[dict[str, Any]],
@@ -1060,6 +1106,8 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
      ):
          name = serialized["name"]
          self.start_span(name, span_type="tool")
+         self.executed_node_tools.append(f"{self.previous_node}:{name}")
+         self.executed_tools.append(name)
          self.trace_client.record_input({
              'args': input_str,
              'kwargs': kwargs
@@ -1128,3 +1176,18 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
              'args': str(messages),
              'kwargs': kwargs
          })
+
+ judgeval_callback_handler_var: ContextVar[Optional[JudgevalCallbackHandler]] = ContextVar(
+     "judgeval_callback_handler", default=None
+ )
+
+ def set_global_handler(handler: JudgevalCallbackHandler):
+     judgeval_callback_handler_var.set(handler)
+
+ def clear_global_handler():
+     judgeval_callback_handler_var.set(None)
+
+ register_configure_hook(
+     context_var=judgeval_callback_handler_var,
+     inheritable=True,
+ )
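Usage sketch (not part of the diff): register_configure_hook wires judgeval_callback_handler_var into LangChain's callback configuration, so a handler set on that context variable is picked up by runs in the same context without passing callbacks=[...] explicitly. The trace_client and graph objects below are assumptions standing in for however you normally construct them.

from judgeval.common.tracer import (
    JudgevalCallbackHandler,
    set_global_handler,
    clear_global_handler,
)

handler = JudgevalCallbackHandler(trace_client)  # trace_client: an existing TraceClient (assumed)
set_global_handler(handler)                      # LangChain/LangGraph runs in this context now report to judgeval
try:
    graph.invoke({"messages": [("user", "What's the weather in SF?")]})  # hypothetical LangGraph app
    print(handler.executed_nodes)       # nodes recorded by on_chain_start
    print(handler.executed_tools)       # tools recorded by on_tool_start
    print(handler.executed_node_tools)  # "node:tool" pairs, usable with execution-order scoring
finally:
    clear_global_handler()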
judgeval/constants.py CHANGED
@@ -22,7 +22,7 @@ class APIScorer(str, Enum):
      CONTEXTUAL_RELEVANCY = "contextual_relevancy"
      CONTEXTUAL_PRECISION = "contextual_precision"
      INSTRUCTION_ADHERENCE = "instruction_adherence"
-     TOOL_CORRECTNESS = "tool_correctness"
+     EXECUTION_ORDER = "execution_order"
      JSON_CORRECTNESS = "json_correctness"
      COMPARISON = "comparison"
      GROUNDEDNESS = "groundedness"
@@ -48,6 +48,7 @@ JUDGMENT_EVAL_LOG_API_URL = f"{ROOT_API}/log_eval_results/"
  JUDGMENT_EVAL_FETCH_API_URL = f"{ROOT_API}/fetch_eval_results/"
  JUDGMENT_EVAL_DELETE_API_URL = f"{ROOT_API}/delete_eval_results_by_project_and_run_name/"
  JUDGMENT_EVAL_DELETE_PROJECT_API_URL = f"{ROOT_API}/delete_eval_results_by_project/"
+ JUDGMENT_PROJECT_DELETE_API_URL = f"{ROOT_API}/projects/delete/"
  JUDGMENT_TRACES_FETCH_API_URL = f"{ROOT_API}/traces/fetch/"
  JUDGMENT_TRACES_SAVE_API_URL = f"{ROOT_API}/traces/save/"
  JUDGMENT_TRACES_DELETE_API_URL = f"{ROOT_API}/traces/delete/"
judgeval/data/api_example.py CHANGED
@@ -1,4 +1,4 @@
- from typing import List, Optional, Dict, Any
+ from typing import List, Optional, Dict, Any, Union
  from pydantic import BaseModel, ConfigDict, model_validator

  from judgeval.data.example import Example
@@ -13,8 +13,8 @@ class ProcessExample(BaseModel):
      """
      name: str
      input: Optional[str] = None
-     actual_output: Optional[str] = None
-     expected_output: Optional[str] = None
+     actual_output: Optional[Union[str, List[str]]] = None
+     expected_output: Optional[Union[str, List[str]]] = None
      context: Optional[list] = None
      retrieval_context: Optional[list] = None
      tools_called: Optional[list] = None
@@ -57,19 +57,6 @@ class ProcessExample(BaseModel):

      def update_run_duration(self, run_duration: float):
          self.run_duration = run_duration
-
-     @model_validator(mode="before")
-     def check_input(cls, values: Dict[str, Any]):
-         input = values.get("input")
-         actual_output = values.get("actual_output")
-
-         if (input is None or actual_output is None):
-             error(f"Validation error: Required fields missing. input={input}, actual_output={actual_output}")
-             raise ValueError(
-                 "'input' and 'actual_output' must be provided."
-             )
-
-         return values


  def create_process_example(
judgeval/data/datasets/dataset.py CHANGED
@@ -3,6 +3,7 @@ import csv
  import datetime
  import json
  import os
+ import yaml
  from dataclasses import dataclass, field
  from typing import List, Union, Literal

@@ -190,6 +191,76 @@
          for g in ground_truths:
              self.add_ground_truth(g)

+     def add_from_yaml(self, file_path: str) -> None:
+         debug(f"Loading dataset from YAML file: {file_path}")
+         """
+         Adds examples and ground truths from a YAML file.
+
+         The format of the YAML file is expected to be a dictionary with two keys: "examples" and "ground_truths".
+         The value of each key is a list of dictionaries, where each dictionary represents an example or ground truth.
+
+         The YAML file is expected to have the following format:
+         ground_truths:
+         - input: "test input"
+           actual_output: null
+           expected_output: "expected output"
+           context:
+             - "context1"
+           retrieval_context:
+             - "retrieval1"
+           additional_metadata:
+             key: "value"
+           comments: "test comment"
+           tools_called:
+             - "tool1"
+           expected_tools:
+             - "tool1"
+           source_file: "test.py"
+           trace_id: "094121"
+         examples:
+         - input: "test input"
+           actual_output: "test output"
+           expected_output: "expected output"
+           context:
+             - "context1"
+             - "context2"
+           retrieval_context:
+             - "retrieval1"
+           additional_metadata:
+             key: "value"
+           tools_called:
+             - "tool1"
+           expected_tools:
+             - "tool1"
+             - "tool2"
+           name: "test example"
+           example_id: null
+           timestamp: "20241230_160117"
+           trace_id: "123"
+         """
+         try:
+             with open(file_path, "r") as file:
+                 payload = yaml.safe_load(file)
+                 if payload is None:
+                     raise ValueError("The YAML file is empty.")
+                 examples = payload.get("examples", [])
+                 ground_truths = payload.get("ground_truths", [])
+         except FileNotFoundError:
+             error(f"YAML file not found: {file_path}")
+             raise FileNotFoundError(f"The file {file_path} was not found.")
+         except yaml.YAMLError:
+             error(f"Invalid YAML file: {file_path}")
+             raise ValueError(f"The file {file_path} is not a valid YAML file.")
+
+         info(f"Added {len(examples)} examples and {len(ground_truths)} ground truths from YAML")
+         new_examples = [Example(**e) for e in examples]
+         for e in new_examples:
+             self.add_example(e)
+
+         new_ground_truths = [GroundTruthExample(**g) for g in ground_truths]
+         for g in new_ground_truths:
+             self.add_ground_truth(g)
+
      def add_example(self, e: Example) -> None:
          self.examples = self.examples + [e]
          # TODO if we need to add rank, then we need to do it here
@@ -197,7 +268,7 @@
      def add_ground_truth(self, g: GroundTruthExample) -> None:
          self.ground_truths = self.ground_truths + [g]

-     def save_as(self, file_type: Literal["json", "csv"], dir_path: str, save_name: str = None) -> None:
+     def save_as(self, file_type: Literal["json", "csv", "yaml"], dir_path: str, save_name: str = None) -> None:
          """
          Saves the dataset as a file. Save both the ground truths and examples.

@@ -266,8 +337,49 @@
                          g.trace_id
                      ]
                  )
+         elif file_type == "yaml":
+             with open(complete_path, "w") as file:
+                 yaml_data = {
+                     "examples": [
+                         {
+                             "input": e.input,
+                             "actual_output": e.actual_output,
+                             "expected_output": e.expected_output,
+                             "context": e.context,
+                             "retrieval_context": e.retrieval_context,
+                             "additional_metadata": e.additional_metadata,
+                             "tools_called": e.tools_called,
+                             "expected_tools": e.expected_tools,
+                             "name": e.name,
+                             "comments": None,  # Example does not have comments
+                             "source_file": None,  # Example does not have source file
+                             "example": True,  # Adding an Example
+                             "trace_id": e.trace_id
+                         }
+                         for e in self.examples
+                     ],
+                     "ground_truths": [
+                         {
+                             "input": g.input,
+                             "actual_output": g.actual_output,
+                             "expected_output": g.expected_output,
+                             "context": g.context,
+                             "retrieval_context": g.retrieval_context,
+                             "additional_metadata": g.additional_metadata,
+                             "tools_called": g.tools_called,
+                             "expected_tools": g.expected_tools,
+                             "name": None,  # GroundTruthExample does not have name
+                             "comments": g.comments,
+                             "source_file": g.source_file,
+                             "example": False,  # Adding a GroundTruthExample, not an Example
+                             "trace_id": g.trace_id
+                         }
+                         for g in self.ground_truths
+                     ]
+                 }
+                 yaml.dump(yaml_data, file, default_flow_style=False)
          else:
-             ACCEPTABLE_FILE_TYPES = ["json", "csv"]
+             ACCEPTABLE_FILE_TYPES = ["json", "csv", "yaml"]
              raise TypeError(f"Invalid file type: {file_type}. Please choose from {ACCEPTABLE_FILE_TYPES}")

      def __iter__(self):
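Usage sketch (not part of the diff): round-tripping a dataset through the new YAML support. The file and directory names are hypothetical, and EvalDataset is assumed to be constructible empty.

from judgeval.data.datasets.dataset import EvalDataset

dataset = EvalDataset()
dataset.add_from_yaml("eval_data.yaml")  # expects top-level "examples" and "ground_truths" keys
dataset.save_as("yaml", dir_path="./exports", save_name="eval_data_copy")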
judgeval/data/example.py CHANGED
@@ -2,11 +2,13 @@
  Classes for representing examples in a dataset.
  """

- from typing import Optional, Any, Dict, List
+
+ from typing import Optional, Any, Dict, List, Union
  from uuid import uuid4
  from pydantic import BaseModel, Field, field_validator
  from enum import Enum
  from datetime import datetime
+ import time


  class ExampleParams(Enum):
@@ -22,9 +24,9 @@ class ExampleParams(Enum):


  class Example(BaseModel):
-     input: str
-     actual_output: str
-     expected_output: Optional[str] = None
+     input: Optional[str] = None
+     actual_output: Optional[Union[str, List[str]]] = None
+     expected_output: Optional[Union[str, List[str]]] = None
      context: Optional[List[str]] = None
      retrieval_context: Optional[List[str]] = None
      additional_metadata: Optional[Dict[str, Any]] = None
@@ -37,12 +37,6 @@ class Example(BaseModel):
      trace_id: Optional[str] = None

      def __init__(self, **data):
-         # Check that required fields are provided
-         if 'input' not in data:
-             raise ValueError("Example must be initialized with 'input' field.")
-         if 'actual_output' not in data:
-             raise ValueError("Example must be initialized with 'actual_output' field.")
-
          if 'example_id' not in data:
              data['example_id'] = str(uuid4())
          # Set timestamp if not provided
@@ -53,22 +49,27 @@ class Example(BaseModel):
      @field_validator('input', mode='before')
      @classmethod
      def validate_input(cls, v):
-         if not v or not isinstance(v, str):
+         if v is not None and (not v or not isinstance(v, str)):
              raise ValueError(f"Input must be a non-empty string but got '{v}' of type {type(v)}")
          return v

      @field_validator('actual_output', mode='before')
      @classmethod
      def validate_actual_output(cls, v):
-         if not isinstance(v, str):
-             raise ValueError(f"Actual output must be a string but got '{v}' of type {type(v)}")
+         if v is not None:
+             if not isinstance(v, (str, list)):
+                 raise ValueError(f"Actual output must be a string or a list of strings but got {v} of type {type(v)}")
+             if isinstance(v, list) and not all(isinstance(item, str) for item in v):
+                 raise ValueError(f"All items in actual_output must be strings but got {v}")
          return v

      @field_validator('expected_output', mode='before')
      @classmethod
      def validate_expected_output(cls, v):
-         if v is not None and not isinstance(v, str):
-             raise ValueError(f"Expected output must be a string or None but got {v} of type {type(v)}")
+         if v is not None and not isinstance(v, (str, list)):
+             raise ValueError(f"Expected output must be a string, a list of strings, or None but got {v} of type {type(v)}")
+         if isinstance(v, list) and not all(isinstance(item, str) for item in v):
+             raise ValueError(f"All items in expected_output must be strings but got {v}")
          return v

      @field_validator('context', 'retrieval_context', 'tools_called', 'expected_tools', mode='before')
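Illustrative sketch (not part of the diff): with the relaxed validators, an Example no longer requires input/actual_output, and both outputs may be lists of strings — the shape that execution-order scoring consumes.

from judgeval.data.example import Example

example = Example(
    actual_output=["search_tool", "weather_tool"],    # e.g. tools or nodes that actually ran
    expected_output=["search_tool", "weather_tool"],  # the order you expected
)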
judgeval/data/result.py CHANGED
@@ -1,5 +1,5 @@
  from dataclasses import dataclass
- from typing import List, Union, Optional, Dict, Any
+ from typing import List, Union, Optional, Dict, Any, Union

  from judgeval.data import ScorerData, ProcessExample

@@ -30,8 +30,8 @@ class ScoringResult:

      # Inputs from the original example
      input: Optional[str] = None
-     actual_output: Optional[str] = None
-     expected_output: Optional[str] = None
+     actual_output: Optional[Union[str, List[str]]] = None
+     expected_output: Optional[Union[str, List[str]]] = None
      context: Optional[List[str]] = None
      retrieval_context: Optional[List[str]] = None
      additional_metadata: Optional[Dict[str, Any]] = None
judgeval/judgment_client.py CHANGED
@@ -27,7 +27,8 @@ from judgeval.judges import JudgevalJudge
  from judgeval.constants import (
      JUDGMENT_EVAL_FETCH_API_URL,
      JUDGMENT_EVAL_DELETE_API_URL,
-     JUDGMENT_EVAL_DELETE_PROJECT_API_URL
+     JUDGMENT_EVAL_DELETE_PROJECT_API_URL,
+     JUDGMENT_PROJECT_DELETE_API_URL
  )
  from judgeval.common.exceptions import JudgmentAPIError
  from pydantic import BaseModel
@@ -156,7 +157,7 @@
          metadata: Optional[Dict[str, Any]] = None,
          project_name: str = "",
          eval_run_name: str = "",
-         log_results: bool = False,
+         log_results: bool = True,
          use_judgment: bool = True,
          rules: Optional[List[Rule]] = None
      ) -> List[ScoringResult]:
@@ -362,7 +363,6 @@
          response = requests.delete(JUDGMENT_EVAL_DELETE_PROJECT_API_URL,
              json={
                  "project_name": project_name,
-                 "judgment_api_key": self.judgment_api_key,
              },
              headers={
                  "Content-Type": "application/json",
@@ -372,6 +372,23 @@
          if response.status_code != requests.codes.ok:
              raise ValueError(f"Error deleting eval results: {response.json()}")
          return response.json()
+
+     def delete_project(self, project_name: str) -> bool:
+         """
+         Deletes a project from the server. Which also deletes all evaluations and traces associated with the project.
+         """
+         response = requests.delete(JUDGMENT_PROJECT_DELETE_API_URL,
+             json={
+                 "project_name": project_name,
+             },
+             headers={
+                 "Content-Type": "application/json",
+                 "Authorization": f"Bearer {self.judgment_api_key}",
+                 "X-Organization-Id": self.organization_id
+             })
+         if response.status_code != requests.codes.ok:
+             raise ValueError(f"Error deleting project: {response.json()}")
+         return response.json()

      def _validate_api_key(self):
          """
judgeval/run_evaluation.py CHANGED
@@ -1,12 +1,17 @@
  import asyncio
  import requests
- from typing import List, Dict
+ import time
+ import sys
+ import itertools
+ import threading
+ from typing import List, Dict, Any
  from datetime import datetime
  from rich import print as rprint

  from judgeval.data import (
      ScorerData,
-     ScoringResult
+     ScoringResult,
+     Example
  )
  from judgeval.scorers import (
      JudgevalScorer,
@@ -14,7 +19,6 @@ from judgeval.scorers import (
      ClassifierScorer
  )
  from judgeval.scorers.score import a_execute_scoring
-
  from judgeval.constants import (
      ROOT_API,
      JUDGMENT_EVAL_API_URL,
@@ -185,7 +189,7 @@ def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_k
          raise JudgmentAPIError(f"Failed to check if eval run name exists: {str(e)}")


- def log_evaluation_results(merged_results: List[ScoringResult], evaluation_run: EvaluationRun) -> None:
+ def log_evaluation_results(merged_results: List[ScoringResult], evaluation_run: EvaluationRun) -> str:
      """
      Logs evaluation results to the Judgment API database.

@@ -220,7 +224,9 @@ def log_evaluation_results(merged_results: List[ScoringResult], evaluation_run:
              raise JudgmentAPIError(error_message)

          if "ui_results_url" in res.json():
-             rprint(f"\n🔍 You can view your evaluation results here: [rgb(106,0,255)]{res.json()['ui_results_url']}[/]\n")
+             url = res.json()['ui_results_url']
+             pretty_str = f"\n🔍 You can view your evaluation results here: [rgb(106,0,255)][link={url}]View Results[/link]\n"
+             return pretty_str

      except requests.exceptions.RequestException as e:
          error(f"Request failed while saving evaluation results to DB: {str(e)}")
@@ -229,6 +235,51 @@ def log_evaluation_results(merged_results: List[ScoringResult], evaluation_run:
          error(f"Failed to save evaluation results to DB: {str(e)}")
          raise ValueError(f"Failed to save evaluation results to DB: {str(e)}")

+ def run_with_spinner(message: str, func, *args, **kwargs) -> Any:
+     """Run a function with a spinner in the terminal."""
+     spinner = itertools.cycle(['|', '/', '-', '\\'])
+
+     def display_spinner():
+         while not stop_spinner_event.is_set():
+             sys.stdout.write(f'\r{message}{next(spinner)}')
+             sys.stdout.flush()
+             time.sleep(0.1)
+
+     stop_spinner_event = threading.Event()
+     spinner_thread = threading.Thread(target=display_spinner)
+     spinner_thread.start()
+
+     try:
+         result = func(*args, **kwargs)
+     except Exception as e:
+         error(f"An error occurred: {str(e)}")
+         stop_spinner_event.set()
+         spinner_thread.join()
+         raise e
+     finally:
+         stop_spinner_event.set()
+         spinner_thread.join()
+
+     sys.stdout.write('\r' + ' ' * (len(message) + 1) + '\r')
+     sys.stdout.flush()
+
+     return result
+
+ def check_examples(examples: List[Example], scorers: List[APIJudgmentScorer]) -> None:
+     """
+     Checks if the example contains the necessary parameters for the scorer.
+     """
+     for scorer in scorers:
+         if isinstance(scorer, APIJudgmentScorer):
+             for example in examples:
+                 missing_params = []
+                 for param in scorer.required_params:
+                     if getattr(example, param.value) is None:
+                         missing_params.append(f"'{param.value}'")
+                 if missing_params:
+                     # We do this because we want to inform users that an example is missing parameters for a scorer
+                     # Example ID (usually random UUID) does not provide any helpful information for the user but printing the entire example is overdoing it
+                     print(f"WARNING: Example {example.example_id} is missing the following parameters: {missing_params} for scorer {scorer.score_type.value}")


  def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[ScoringResult]:
@@ -253,7 +304,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
      Returns:
          List[ScoringResult]: The results of the evaluation. Each result is a dictionary containing the fields of a `ScoringResult` object.
      """
-
+
      # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
      if not override and evaluation_run.log_results:
          check_eval_run_name_exists(
@@ -306,6 +357,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor

      # Execute evaluation using Judgment API
      if judgment_scorers:
+         check_examples(evaluation_run.examples, evaluation_run.scorers)
          info("Starting API evaluation")
          debug(f"Creating API evaluation run with {len(judgment_scorers)} scorers")
          try: # execute an EvaluationRun with just JudgmentScorers
@@ -323,7 +375,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
                  rules=evaluation_run.rules
              )
              debug("Sending request to Judgment API")
-             response_data: List[Dict] = execute_api_eval(api_evaluation_run) # Dicts are `ScoringResult` objs
+             response_data: List[Dict] = run_with_spinner("Running Evaluation: ", execute_api_eval, api_evaluation_run)
              info(f"Received {len(response_data['results'])} results from API")
          except JudgmentAPIError as e:
              error(f"An error occurred while executing the Judgment API request: {str(e)}")
@@ -352,6 +404,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
          api_results.append(ScoringResult(**filtered_result))
      # Run local evals
      if local_scorers: # List[JudgevalScorer]
+         # We should be removing local scorers soon
          info("Starting local evaluation")
          for example in evaluation_run.examples:
              with example_logging_context(example.timestamp, example.example_id):
@@ -389,7 +442,8 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
      # )

      if evaluation_run.log_results:
-         log_evaluation_results(merged_results, evaluation_run)
+         pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, merged_results, evaluation_run)
+         rprint(pretty_str)

      for i, result in enumerate(merged_results):
          if not result.scorers_data: # none of the scorers could be executed on this example
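Illustrative sketch (not part of the diff): run_with_spinner is a generic wrapper — run_eval now uses it around the API request and result logging, but it can wrap any blocking callable.

import time
from judgeval.run_evaluation import run_with_spinner

value = run_with_spinner("Working: ", time.sleep, 2)  # spinner animates while time.sleep(2) runs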
judgeval/scorers/__init__.py CHANGED
@@ -2,7 +2,7 @@ from judgeval.scorers.api_scorer import APIJudgmentScorer
  from judgeval.scorers.judgeval_scorer import JudgevalScorer
  from judgeval.scorers.prompt_scorer import PromptScorer, ClassifierScorer
  from judgeval.scorers.judgeval_scorers import (
-     ToolCorrectnessScorer,
+     ExecutionOrderScorer,
      JSONCorrectnessScorer,
      SummarizationScorer,
      HallucinationScorer,
@@ -24,7 +24,7 @@ __all__ = [
      "JudgevalScorer",
      "PromptScorer",
      "ClassifierScorer",
-     "ToolCorrectnessScorer",
+     "ExecutionOrderScorer",
      "JSONCorrectnessScorer",
      "SummarizationScorer",
      "HallucinationScorer",
judgeval/scorers/api_scorer.py CHANGED
@@ -5,8 +5,9 @@ Scores `Example`s using ready-made Judgment evaluators.
  """

  from pydantic import BaseModel, field_validator
+ from typing import List
  from judgeval.common.logger import debug, info, warning, error
-
+ from judgeval.data import ExampleParams
  from judgeval.constants import APIScorer, UNBOUNDED_SCORERS


@@ -20,6 +21,7 @@ class APIJudgmentScorer(BaseModel):
      """
      score_type: APIScorer
      threshold: float
+     required_params: List[ExampleParams] = []  # List of the required parameters on examples for the scorer

      @field_validator('threshold')
      def validate_threshold(cls, v, info):
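Illustrative sketch (not part of the diff): the new required_params field lets each API scorer declare which Example fields it needs, and check_examples() in run_evaluation.py warns when an example is missing one of them. MyScorer is a hypothetical subclass that reuses an existing score type.

from judgeval.scorers.api_scorer import APIJudgmentScorer
from judgeval.constants import APIScorer
from judgeval.data import ExampleParams

class MyScorer(APIJudgmentScorer):
    def __init__(self, threshold: float):
        super().__init__(
            threshold=threshold,
            score_type=APIScorer.ANSWER_RELEVANCY,  # reusing an existing enum value for illustration
            required_params=[ExampleParams.INPUT, ExampleParams.ACTUAL_OUTPUT],
        )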
judgeval/scorers/judgeval_scorers/__init__.py CHANGED
@@ -2,7 +2,7 @@ from typing import Type, Optional, Any

  # Import implementations
  from judgeval.scorers.judgeval_scorers.api_scorers import (
-     ToolCorrectnessScorer as APIToolCorrectnessScorer,
+     ExecutionOrderScorer as APIExecutionOrderScorer,
      JSONCorrectnessScorer as APIJSONCorrectnessScorer,
      SummarizationScorer as APISummarizationScorer,
      HallucinationScorer as APIHallucinationScorer,
@@ -24,7 +24,7 @@ from judgeval.scorers.judgeval_scorers.local_implementations import (
      ContextualRelevancyScorer as LocalContextualRelevancyScorer,
      FaithfulnessScorer as LocalFaithfulnessScorer,
      JsonCorrectnessScorer as LocalJsonCorrectnessScorer,
-     ToolCorrectnessScorer as LocalToolCorrectnessScorer,
+     ExecutionOrderScorer as LocalExecutionOrderScorer,
      HallucinationScorer as LocalHallucinationScorer,
      SummarizationScorer as LocalSummarizationScorer,
      AnswerCorrectnessScorer as LocalAnswerCorrectnessScorer,
@@ -98,9 +98,9 @@ AnswerRelevancyScorer = ScorerWrapper(
      local_implementation=LocalAnswerRelevancyScorer
  )

- ToolCorrectnessScorer = ScorerWrapper(
-     api_implementation=APIToolCorrectnessScorer,
-     local_implementation=LocalToolCorrectnessScorer
+ ExecutionOrderScorer = ScorerWrapper(
+     api_implementation=APIExecutionOrderScorer,
+     local_implementation=LocalExecutionOrderScorer
  )

  JSONCorrectnessScorer = ScorerWrapper(
@@ -154,7 +154,7 @@ GroundednessScorer = ScorerWrapper(
  )

  __all__ = [
-     "ToolCorrectnessScorer",
+     "ExecutionOrderScorer",
      "JSONCorrectnessScorer",
      "SummarizationScorer",
      "HallucinationScorer",
judgeval/scorers/judgeval_scorers/api_scorers/__init__.py CHANGED
@@ -1,4 +1,4 @@
- from judgeval.scorers.judgeval_scorers.api_scorers.tool_correctness import ToolCorrectnessScorer
+ from judgeval.scorers.judgeval_scorers.api_scorers.execution_order import ExecutionOrderScorer
  from judgeval.scorers.judgeval_scorers.api_scorers.json_correctness import JSONCorrectnessScorer
  from judgeval.scorers.judgeval_scorers.api_scorers.summarization import SummarizationScorer
  from judgeval.scorers.judgeval_scorers.api_scorers.hallucination import HallucinationScorer
@@ -13,7 +13,7 @@ from judgeval.scorers.judgeval_scorers.api_scorers.instruction_adherence import
  from judgeval.scorers.judgeval_scorers.api_scorers.groundedness import GroundednessScorer

  __all__ = [
-     "ToolCorrectnessScorer",
+     "ExecutionOrderScorer",
      "JSONCorrectnessScorer",
      "SummarizationScorer",
      "HallucinationScorer",
judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py CHANGED
@@ -8,11 +8,19 @@ TODO add link to docs page for this scorer
  # Internal imports
  from judgeval.scorers.api_scorer import APIJudgmentScorer
  from judgeval.constants import APIScorer
-
+ from judgeval.data import ExampleParams

  class AnswerCorrectnessScorer(APIJudgmentScorer):
      def __init__(self, threshold: float):
-         super().__init__(threshold=threshold, score_type=APIScorer.ANSWER_CORRECTNESS)
+         super().__init__(
+             threshold=threshold,
+             score_type=APIScorer.ANSWER_CORRECTNESS,
+             required_params=[
+                 ExampleParams.INPUT,
+                 ExampleParams.ACTUAL_OUTPUT,
+                 ExampleParams.EXPECTED_OUTPUT,
+             ]
+         )

      @property
      def __name__(self):
judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py CHANGED
@@ -8,11 +8,18 @@ TODO add link to docs page for this scorer
  # Internal imports
  from judgeval.scorers.api_scorer import APIJudgmentScorer
  from judgeval.constants import APIScorer
-
+ from judgeval.data import ExampleParams

  class AnswerRelevancyScorer(APIJudgmentScorer):
      def __init__(self, threshold: float):
-         super().__init__(threshold=threshold, score_type=APIScorer.ANSWER_RELEVANCY)
+         super().__init__(
+             threshold=threshold,
+             score_type=APIScorer.ANSWER_RELEVANCY,
+             required_params=[
+                 ExampleParams.INPUT,
+                 ExampleParams.ACTUAL_OUTPUT,
+             ]
+         )

      @property
      def __name__(self):
judgeval/scorers/judgeval_scorers/api_scorers/comparison.py CHANGED
@@ -9,12 +9,20 @@ TODO add link to docs page for this scorer
  from judgeval.scorers.api_scorer import APIJudgmentScorer
  from judgeval.constants import APIScorer
  from typing import Optional, Dict
-
+ from judgeval.data import ExampleParams
  class ComparisonScorer(APIJudgmentScorer):
      kwargs: Optional[Dict] = None

      def __init__(self, threshold: float, criteria: str, description: str):
-         super().__init__(threshold=threshold, score_type=APIScorer.COMPARISON)
+         super().__init__(
+             threshold=threshold,
+             score_type=APIScorer.COMPARISON,
+             required_params=[
+                 ExampleParams.INPUT,
+                 ExampleParams.ACTUAL_OUTPUT,
+                 ExampleParams.EXPECTED_OUTPUT,
+             ]
+         )
          self.kwargs = {"criteria": criteria, "description": description}

      @property
judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py CHANGED
@@ -8,11 +8,20 @@ TODO add link to docs page for this scorer
  # Internal imports
  from judgeval.scorers.api_scorer import APIJudgmentScorer
  from judgeval.constants import APIScorer
-
+ from judgeval.data import ExampleParams

  class ContextualPrecisionScorer(APIJudgmentScorer):
      def __init__(self, threshold: float):
-         super().__init__(threshold=threshold, score_type=APIScorer.CONTEXTUAL_PRECISION)
+         super().__init__(
+             threshold=threshold,
+             score_type=APIScorer.CONTEXTUAL_PRECISION,
+             required_params=[
+                 ExampleParams.INPUT,
+                 ExampleParams.ACTUAL_OUTPUT,
+                 ExampleParams.RETRIEVAL_CONTEXT,
+                 ExampleParams.EXPECTED_OUTPUT,
+             ]
+         )

      @property
      def __name__(self):
judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py CHANGED
@@ -8,12 +8,21 @@ TODO add link to docs page for this scorer
  # Internal imports
  from judgeval.scorers.api_scorer import APIJudgmentScorer
  from judgeval.constants import APIScorer
+ from judgeval.data import ExampleParams


  class ContextualRecallScorer(APIJudgmentScorer):
      def __init__(self, threshold: float):
-         super().__init__(threshold=threshold, score_type=APIScorer.CONTEXTUAL_RECALL)
-
+         super().__init__(
+             threshold=threshold,
+             score_type=APIScorer.CONTEXTUAL_RECALL,
+             required_params=[
+                 ExampleParams.INPUT,
+                 ExampleParams.ACTUAL_OUTPUT,
+                 ExampleParams.EXPECTED_OUTPUT,
+                 ExampleParams.RETRIEVAL_CONTEXT,
+             ]
+         )
      @property
      def __name__(self):
          return "Contextual Recall"
judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py CHANGED
@@ -8,15 +8,22 @@ TODO add link to docs page for this scorer
  # Internal imports
  from judgeval.scorers.api_scorer import APIJudgmentScorer
  from judgeval.constants import APIScorer
-
+ from judgeval.data import ExampleParams

  class ContextualRelevancyScorer(APIJudgmentScorer):
      """
      Scorer that checks if the output of a model is relevant to the retrieval context
      """
      def __init__(self, threshold: float):
-         super().__init__(threshold=threshold, score_type=APIScorer.CONTEXTUAL_RELEVANCY)
-
+         super().__init__(
+             threshold=threshold,
+             score_type=APIScorer.CONTEXTUAL_RELEVANCY,
+             required_params=[
+                 ExampleParams.INPUT,
+                 ExampleParams.ACTUAL_OUTPUT,
+                 ExampleParams.RETRIEVAL_CONTEXT,
+             ]
+         )
      @property
      def __name__(self):
          return "Contextual Relevancy"
judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py ADDED
@@ -0,0 +1,43 @@
+ """
+ `judgeval` tool correctness scorer
+
+ TODO add link to docs page for this scorer
+
+ """
+
+ # Internal imports
+ from judgeval.scorers.api_scorer import APIJudgmentScorer
+ from judgeval.constants import APIScorer
+ from typing import Optional, Dict, List
+ from judgeval.data import ExampleParams
+
+ class ExecutionOrderScorer(APIJudgmentScorer):
+     kwargs: Optional[Dict] = None
+
+     def __init__(self, threshold: float, should_exact_match: bool = False, should_consider_ordering: bool = False):
+         super().__init__(
+             threshold=threshold,
+             score_type=APIScorer.EXECUTION_ORDER,
+             required_params=[
+                 ExampleParams.ACTUAL_OUTPUT,
+                 ExampleParams.EXPECTED_OUTPUT,
+             ]
+         )
+         self.kwargs = {"should_exact_match": should_exact_match, "should_consider_ordering": should_consider_ordering}
+
+     @property
+     def __name__(self):
+         return "Execution Order"
+
+     def to_dict(self) -> dict:
+         """
+         Converts the scorer configuration to a dictionary format.
+
+         Returns:
+             dict: A dictionary containing the scorer's configuration
+         """
+         return {
+             "score_type": self.score_type,
+             "threshold": self.threshold,
+             "kwargs": self.kwargs
+         }
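Usage sketch (not part of the diff): ExecutionOrderScorer replaces ToolCorrectnessScorer and compares the actual against the expected execution order, typically fed from the lists the updated JudgevalCallbackHandler collects. The example values are hypothetical; pass the example and scorer to your usual evaluation entry point.

from judgeval.scorers.judgeval_scorers.api_scorers.execution_order import ExecutionOrderScorer
from judgeval.data import Example

scorer = ExecutionOrderScorer(threshold=1.0, should_consider_ordering=True)
example = Example(
    input="What's the weather in SF?",
    actual_output=["search_tool", "weather_tool"],    # e.g. handler.executed_tools from a traced run
    expected_output=["search_tool", "weather_tool"],
)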
judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py CHANGED
@@ -8,11 +8,19 @@ TODO add link to docs page for this scorer
  # Internal imports
  from judgeval.scorers.api_scorer import APIJudgmentScorer
  from judgeval.constants import APIScorer
-
+ from judgeval.data import ExampleParams

  class FaithfulnessScorer(APIJudgmentScorer):
      def __init__(self, threshold: float):
-         super().__init__(threshold=threshold, score_type=APIScorer.FAITHFULNESS)
+         super().__init__(
+             threshold=threshold,
+             score_type=APIScorer.FAITHFULNESS,
+             required_params=[
+                 ExampleParams.INPUT,
+                 ExampleParams.ACTUAL_OUTPUT,
+                 ExampleParams.RETRIEVAL_CONTEXT,
+             ]
+         )

      @property
      def __name__(self):
judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py CHANGED
@@ -8,11 +8,19 @@ TODO add link to docs page for this scorer
  # Internal imports
  from judgeval.scorers.api_scorer import APIJudgmentScorer
  from judgeval.constants import APIScorer
-
+ from judgeval.data import ExampleParams

  class GroundednessScorer(APIJudgmentScorer):
      def __init__(self, threshold: float):
-         super().__init__(threshold=threshold, score_type=APIScorer.GROUNDEDNESS)
+         super().__init__(
+             threshold=threshold,
+             score_type=APIScorer.GROUNDEDNESS,
+             required_params=[
+                 ExampleParams.INPUT,
+                 ExampleParams.ACTUAL_OUTPUT,
+                 ExampleParams.RETRIEVAL_CONTEXT,
+             ]
+         )

      @property
      def __name__(self):
judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py CHANGED
@@ -8,11 +8,19 @@ TODO add link to docs page for this scorer
  # Internal imports
  from judgeval.scorers.api_scorer import APIJudgmentScorer
  from judgeval.constants import APIScorer
-
+ from judgeval.data import ExampleParams

  class HallucinationScorer(APIJudgmentScorer):
      def __init__(self, threshold: float):
-         super().__init__(threshold=threshold, score_type=APIScorer.HALLUCINATION)
+         super().__init__(
+             threshold=threshold,
+             score_type=APIScorer.HALLUCINATION,
+             required_params=[
+                 ExampleParams.INPUT,
+                 ExampleParams.ACTUAL_OUTPUT,
+                 ExampleParams.CONTEXT,
+             ]
+         )

      @property
      def __name__(self):
judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py CHANGED
@@ -8,11 +8,18 @@ TODO add link to docs page for this scorer
  # Internal imports
  from judgeval.scorers.api_scorer import APIJudgmentScorer
  from judgeval.constants import APIScorer
-
+ from judgeval.data import ExampleParams

  class InstructionAdherenceScorer(APIJudgmentScorer):
      def __init__(self, threshold: float):
-         super().__init__(threshold=threshold, score_type=APIScorer.INSTRUCTION_ADHERENCE)
+         super().__init__(
+             threshold=threshold,
+             score_type=APIScorer.INSTRUCTION_ADHERENCE,
+             required_params=[
+                 ExampleParams.INPUT,
+                 ExampleParams.ACTUAL_OUTPUT,
+             ]
+         )

      @property
      def __name__(self):
judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py CHANGED
@@ -11,13 +11,20 @@ from pydantic import BaseModel, Field
  # Internal imports
  from judgeval.scorers.api_scorer import APIJudgmentScorer
  from judgeval.constants import APIScorer
-
+ from judgeval.data import ExampleParams

  class JSONCorrectnessScorer(APIJudgmentScorer):
      json_schema: BaseModel = Field(None, exclude=True)

      def __init__(self, threshold: float, json_schema: BaseModel):
-         super().__init__(threshold=threshold, score_type=APIScorer.JSON_CORRECTNESS)
+         super().__init__(
+             threshold=threshold,
+             score_type=APIScorer.JSON_CORRECTNESS,
+             required_params=[
+                 ExampleParams.INPUT,
+                 ExampleParams.ACTUAL_OUTPUT,
+             ]
+         )
          object.__setattr__(self, 'json_schema', json_schema)

      def to_dict(self):
judgeval/scorers/judgeval_scorers/api_scorers/summarization.py CHANGED
@@ -7,12 +7,19 @@ TODO add link to docs page for this scorer

  # Internal imports
  from judgeval.scorers.api_scorer import APIJudgmentScorer
- from judgeval.constants import APIScorer
-
+ from judgeval.constants import APIScorer
+ from judgeval.data import ExampleParams

  class SummarizationScorer(APIJudgmentScorer):
      def __init__(self, threshold: float):
-         super().__init__(threshold=threshold, score_type=APIScorer.SUMMARIZATION)
+         super().__init__(
+             threshold=threshold,
+             score_type=APIScorer.SUMMARIZATION,
+             required_params=[
+                 ExampleParams.INPUT,
+                 ExampleParams.ACTUAL_OUTPUT,
+             ]
+         )

      @property
      def __name__(self):
judgeval/scorers/judgeval_scorers/local_implementations/__init__.py CHANGED
@@ -4,7 +4,7 @@ from judgeval.scorers.judgeval_scorers.local_implementations.contextual_recall.c
  from judgeval.scorers.judgeval_scorers.local_implementations.contextual_relevancy.contextual_relevancy_scorer import ContextualRelevancyScorer
  from judgeval.scorers.judgeval_scorers.local_implementations.faithfulness.faithfulness_scorer import FaithfulnessScorer
  from judgeval.scorers.judgeval_scorers.local_implementations.json_correctness.json_correctness_scorer import JsonCorrectnessScorer
- from judgeval.scorers.judgeval_scorers.local_implementations.tool_correctness.tool_correctness_scorer import ToolCorrectnessScorer
+ from judgeval.scorers.judgeval_scorers.local_implementations.execution_order.execution_order import ExecutionOrderScorer
  from judgeval.scorers.judgeval_scorers.local_implementations.hallucination.hallucination_scorer import HallucinationScorer
  from judgeval.scorers.judgeval_scorers.local_implementations.summarization.summarization_scorer import SummarizationScorer
  from judgeval.scorers.judgeval_scorers.local_implementations.answer_correctness.answer_correctness_scorer import AnswerCorrectnessScorer
@@ -20,7 +20,7 @@ __all__ = [
      "ContextualRelevancyScorer",
      "FaithfulnessScorer",
      "JsonCorrectnessScorer",
-     "ToolCorrectnessScorer",
+     "ExecutionOrderScorer",
      "HallucinationScorer",
      "SummarizationScorer",
      "InstructionAdherenceScorer",
judgeval/scorers/judgeval_scorers/local_implementations/execution_order/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from judgeval.scorers.judgeval_scorers.local_implementations.execution_order.execution_order import ExecutionOrderScorer
+
+ __all__ = ["ExecutionOrderScorer"]
judgeval/scorers/judgeval_scorers/local_implementations/{tool_correctness/tool_correctness_scorer.py → execution_order/execution_order.py} RENAMED
@@ -45,7 +45,7 @@ def get_lcs(seq1, seq2):
      return lcs[::-1]


- class ToolCorrectnessScorer(JudgevalScorer):
+ class ExecutionOrderScorer(JudgevalScorer):
      def __init__(
          self,
          threshold: float = 0.5,
@@ -56,7 +56,7 @@ class ToolCorrectnessScorer(JudgevalScorer):
          should_consider_ordering: bool = False,
      ):
          super().__init__(
-             score_type=APIScorer.TOOL_CORRECTNESS,
+             score_type=APIScorer.EXECUTION_ORDER,
              threshold=1 if strict_mode else threshold,
              evaluation_model=None,
              include_reason=include_reason,
@@ -152,5 +152,5 @@ class ToolCorrectnessScorer(JudgevalScorer):

      @property
      def __name__(self):
-         return "Tool Correctness"
+         return "Execution Order"

{judgeval-0.0.19.dist-info → judgeval-0.0.21.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: judgeval
- Version: 0.0.19
+ Version: 0.0.21
  Summary: Judgeval Package
  Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
  Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -12,9 +12,15 @@ Classifier: Programming Language :: Python :: 3
  Requires-Python: >=3.11
  Requires-Dist: anthropic
  Requires-Dist: fastapi
+ Requires-Dist: langchain
+ Requires-Dist: langchain-anthropic
+ Requires-Dist: langchain-core
+ Requires-Dist: langchain-huggingface
+ Requires-Dist: langchain-openai
  Requires-Dist: litellm
  Requires-Dist: nest-asyncio
  Requires-Dist: openai
+ Requires-Dist: openpyxl
  Requires-Dist: pandas
  Requires-Dist: pika
  Requires-Dist: python-dotenv==1.0.1
@@ -23,8 +29,6 @@ Requires-Dist: supabase
  Requires-Dist: together
  Requires-Dist: uvicorn
  Provides-Extra: dev
- Requires-Dist: langfuse==2.50.3; extra == 'dev'
- Requires-Dist: patronus; extra == 'dev'
  Requires-Dist: pytest-asyncio>=0.25.0; extra == 'dev'
  Requires-Dist: pytest-mock>=3.14.0; extra == 'dev'
  Requires-Dist: pytest>=8.3.4; extra == 'dev'
{judgeval-0.0.19.dist-info → judgeval-0.0.21.dist-info}/RECORD CHANGED
@@ -1,23 +1,23 @@
  judgeval/__init__.py,sha256=dtXxsCmI4eEsZdGSUMy8P_pA0bc2-OSGAgb2C__yJoA,252
  judgeval/clients.py,sha256=6VQmEqmfCngUdS2MuPBIpHvtDFqOENm8-_BmMvjLyRQ,944
- judgeval/constants.py,sha256=BXTzKBmhDVutiitaCRarfkc_M-0NplRJofIt_QSa5QI,5010
+ judgeval/constants.py,sha256=VhJppAECTUDQwzC_FpzJw2wPlkYoogsadHxaJIY_J8U,5073
  judgeval/evaluation_run.py,sha256=RgJD60lJsunNQzObjo7iXnAzXWgubCLOAAuuamAAuoI,6354
- judgeval/judgment_client.py,sha256=evlvcrYO9pF-oCgcvlGE59iODN0C6GJtn7bySFU_88k,23384
+ judgeval/judgment_client.py,sha256=5lqp9X67qPzBUu7kQYETslsc3L5JjxrDVgVLslF07A0,24173
  judgeval/rules.py,sha256=ebsiDEBVAnYTQxwVNvh_RpmKeWBnjQXgHs8KofTjcAs,15526
- judgeval/run_evaluation.py,sha256=yLW24kFcw0xzXHvnDclYqtujTww6SDwvut6HM1x7SXk,21505
+ judgeval/run_evaluation.py,sha256=YOzkyeWl-r3vaz0jB5nM-1VULi7ALmJ9_f58ENqexXk,23827
  judgeval/common/__init__.py,sha256=7d24BRxtncpMj3AAJCj8RS7TqgjXmW777HVZH6-3sBs,289
  judgeval/common/exceptions.py,sha256=U-TxHLn7oVMezsMuoYouNDb2XuS8RCggfntYf5_6u4E,565
  judgeval/common/logger.py,sha256=KO75wWXCxhUHUMvLaTU31ZzOk6tkZBa7heQ7y0f-zFE,6062
- judgeval/common/tracer.py,sha256=tTG4VZRXJjilm0ltQCeXJvd7TiL9W1PSVaf0LOmw2C4,44430
+ judgeval/common/tracer.py,sha256=WFjFNf3NZ2BN8UAu2MG0F3Om9LgJNma3m_GrxyXgJqE,46655
  judgeval/common/utils.py,sha256=LUQV5JfDr6wj7xHAJoNq-gofNZ6mjXbeKrGKzBME1KM,33533
  judgeval/data/__init__.py,sha256=QykVE22Qf-I2f1g-jC9-iQyLNXgDmX1-vHbCgZg8Ra8,558
- judgeval/data/api_example.py,sha256=NEiJKpf2WIo4FPQ2-vuoCZ_9ixexhdg_wdNYWXPSA2M,4094
- judgeval/data/example.py,sha256=PHqRI8l94ylLgfgjIH4DqcFFHb-t-WBxRkZb9eXKlpI,5648
+ judgeval/data/api_example.py,sha256=dzkrQ0xno08y6qNfqL2djXbapUyc2B2aQ5iANn0o4CY,3667
+ judgeval/data/example.py,sha256=BhGBhamFWgH6wtvrRYM8dGtDfXh-cDxDhtNL5Gbdz_M,5892
  judgeval/data/ground_truth.py,sha256=OTBs3VZe-Wp0vEXEsq14GPZHYtpWT16bhGQTycIvkKc,2057
- judgeval/data/result.py,sha256=8FIO-bFKPegZuByKRjA2_sumjb8oGWQ5ZeQ1RVz5z2w,4393
+ judgeval/data/result.py,sha256=4fgjKtUmT3br7K6fkRiNIxTGKUuwMeGyRLqzkpxwXKE,4436
  judgeval/data/scorer_data.py,sha256=JVlaTx1EP2jw2gh3Vgx1CSEsvIFABAN26IquKyxwiJQ,3273
  judgeval/data/datasets/__init__.py,sha256=IdNKhQv9yYZ_op0rdBacrFaFVmiiYQ3JTzXzxOTsEVQ,176
- judgeval/data/datasets/dataset.py,sha256=0NItb98Yz0P954rg9FF9s09uVQ7cEg9A5J6Xvie9nhw,12022
+ judgeval/data/datasets/dataset.py,sha256=LrBK8y3y1R9_BKmXxTzdXMMIQvXlq7tf7TM-u7jgSxE,16839
  judgeval/data/datasets/eval_dataset_client.py,sha256=QsfHyFC4WePV7uJGYUVjiIwtk1Ie_VpWUrnd2Q4kKdU,11479
  judgeval/data/datasets/utils.py,sha256=6DpGCPmGFNOKIGNcVCOSjTOdWemrpAuYnlo778sGG7g,2455
  judgeval/judges/__init__.py,sha256=6X7VSwrwsdxGBNxCyapVRWGghhKOy3MVxFNMQ62kCXM,308
@@ -26,33 +26,33 @@ judgeval/judges/litellm_judge.py,sha256=EIL58Teptv8DzZUO3yP2RDQCDq-aoBB6HPZzPdK6
  judgeval/judges/mixture_of_judges.py,sha256=IJoi4Twk8ze1CJWVEp69k6TSqTCTGrmVYQ0qdffer60,15549
  judgeval/judges/together_judge.py,sha256=l00hhPerAZXg3oYBd8cyMtWsOTNt_0FIqoxhKJKQe3k,2302
  judgeval/judges/utils.py,sha256=9lvUxziGV86ISvVFxYBWc09TWFyAQgUTyPf_a9mD5Rs,2686
- judgeval/scorers/__init__.py,sha256=_KP6c1dr6O2p95hx_WvRpZXfSGg9r2hNn_PjY9Ch5ds,1160
- judgeval/scorers/api_scorer.py,sha256=wGqTQCbUE7uE-PzaKcCmexAqutdTunjFR0zVA6bUxdE,2518
+ judgeval/scorers/__init__.py,sha256=gkeKJvjXhswCnkEyjijrVvGVM3Om86egrZ-PUOGvNvI,1158
+ judgeval/scorers/api_scorer.py,sha256=NQ_CrrUPhSUk1k2Q8rKpCG_TU2FT32sFEqvb-Yi54B0,2688
  judgeval/scorers/base_scorer.py,sha256=xdUlY3CnLdCQ1Z5iUeY22Bim5v-OQruZmaVF_4Y1mC0,2183
  judgeval/scorers/exceptions.py,sha256=eGW5CuJgZ5YJBFrE4FHDSF651PO1dKAZ379mJ8gOsfo,178
  judgeval/scorers/judgeval_scorer.py,sha256=oIkfoGXA09wL_vcK1DRibzQSA-MFNa-hmw1IhGBErf8,6592
  judgeval/scorers/prompt_scorer.py,sha256=PaAs2qRolw1P3_I061Xvk9qzvF4O-JR8g_39RqXnHcM,17728
  judgeval/scorers/score.py,sha256=GALVmeApP1Cyih2vY93zRaU6RShtW4jJDG47Pm6yfnw,18657
  judgeval/scorers/utils.py,sha256=iHQVTlIANbmCTXz9kTeSdOytgUZ_T74Re61ajqsk_WQ,6827
- judgeval/scorers/judgeval_scorers/__init__.py,sha256=-nnqz-aU5PB_m1cb-2ySpZ18WDxupxmQCr-ws0aSalw,6000
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py,sha256=cJSwTA6hqZXUSaPkTl4yDyl3cUzv0IlcTu592uoTY98,1651
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py,sha256=690G5askjE8dcbKPGvCF6JxAEM9QJUqb-3K-D6lI6oM,463
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py,sha256=CqvvjV7AZqPlXh-PZaPKYPILHr15u4bIYiKBFjlk5i0,457
- judgeval/scorers/judgeval_scorers/api_scorers/comparison.py,sha256=6Q1qbsANOoZ3PM8n_gtZLIMbTBB9879L3acRelNJ6Uk,1001
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py,sha256=2zBrm_EEc143bmPA4HVcf8XtQeuc_BexczGx-SHlwRY,473
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py,sha256=NyojBWy_lRYx8diREulSK8s9dfYdZav4eZjg3TwUm0M,461
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py,sha256=wROMWOliCnB39ftX9TdeZmG9y0vrnxIGVby65tLOQRU,574
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py,sha256=gNf_i5c0jjpz2zCGhe7TtDMLKxc1PdOExJMFB5X7hSg,442
- judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py,sha256=esO76hEp0NzeBUdoSICPLdx5AeA5zWSt_2zpcSgvGis,442
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py,sha256=ffYwH3CexPkKgo1rCALMivypROQjG5WWEsKXEFZxe2k,446
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py,sha256=t1lWYOF0Pxvw5-NrI1Dt9FojaOncOCRlZc4a2SA20h4,477
- judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py,sha256=CAZBQKwNSqpqAoOgStYfr-yP1Brug_6VRimRIQY-zdg,894
- judgeval/scorers/judgeval_scorers/api_scorers/summarization.py,sha256=-E3oxYbI0D_0q-_fGWh2jQHW9O4Pu7I7xvLWsHU6cn8,450
- judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py,sha256=17ppPXm962ew67GU5m0npzbPu3CuhgdKY_KmfPvKfu4,457
+ judgeval/scorers/judgeval_scorers/__init__.py,sha256=xFRb62sp4JmBUSeuAB_pC_7kEGp-lGdqCRIu9--Bbdg,5992
+ judgeval/scorers/judgeval_scorers/api_scorers/__init__.py,sha256=mZ6b_5Dl04k3PaG24ICBajB_j43ody1II1OJhO1DkXo,1648
+ judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py,sha256=Fnd9CVIOZ73sWEWymsU5eBrrZqPFjMZ0BKpeW-PDyTg,711
+ judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py,sha256=oETeN9K0HSIRdL2SDqn82Vskpwh5SlKnZvs5VDm2OBU,658
+ judgeval/scorers/judgeval_scorers/api_scorers/comparison.py,sha256=kuzf9OWvpY38yYSwlBgneLkUZwJNM4FQqvbS66keA90,1249
+ judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py,sha256=tpSuzFAaW8X9xqA0aLLKwh7qmBK0Pc_bJZMIe_q412U,770
+ judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py,sha256=pFVhk4pLtQ-FnNlbI-dFF-SIh69Jza7erHqiPkFWoBo,758
+ judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py,sha256=RQ6DZwEhChfecd89Ey-T7ke--7qTaXZlRsNxwH8gaME,823
+ judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py,sha256=Pb3CiNF2Ca826B92wJCVAi_68lJjLhqqCKwQKaflSUg,1294
+ judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py,sha256=-BwOapqjryYNKNydtdkUiKIij76dY0O1jBmdc6dKazQ,692
+ judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py,sha256=ntEEeTANEOsGlcbiTAF_3r6BeSJEaVDns8po8T0L6Vg,692
+ judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py,sha256=k5gDOki-8KXrZXydvdSqDt3NZqQ28hXoOCHQf6jNxr4,686
+ judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py,sha256=XnSGEkQfwVqaqnHEGMCsxNiHVzrsrej48uDbLoWc8CQ,678
+ judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py,sha256=mMKEuR87_yanEuZJ5YSGFMHDD_oLVZ6-rQuciFaDOMA,1095
+ judgeval/scorers/judgeval_scorers/api_scorers/summarization.py,sha256=QmWB8bVbDYHY5FcF0rYZE_3c2XXgMLRmR6aXJWfdMC4,655
  judgeval/scorers/judgeval_scorers/classifiers/__init__.py,sha256=Qt81W5ZCwMvBAne0LfQDb8xvg5iOG1vEYP7WizgwAZo,67
  judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py,sha256=8iTzMvou1Dr8pybul6lZHKjc9Ye2-0_racRGYkhEdTY,74
  judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py,sha256=ly72Z7s_c8NID6-nQnuW8qEGEW2MqdvpJ-5WfXzbAQg,2579
- judgeval/scorers/judgeval_scorers/local_implementations/__init__.py,sha256=pipWXfS_n4UsnZViwZAF2bPB1FYNfmoJAJUNY7JSq7I,1937
+ judgeval/scorers/judgeval_scorers/local_implementations/__init__.py,sha256=k_t-THIAtsk7lNvm9faj0u24dPZjn7qRbZ8YGjQ21xs,1926
  judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py,sha256=cxxUEspgoIdSzJbwIIioamC0-xDqhYVfYAWxaYF-D_Y,177
  judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py,sha256=3Dpm8BIIe0Th2p0ccO5bb-le93lywjOLSo712HwEIUE,10196
  judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py,sha256=hBUqEd8Hy3g8peOVjpSmRb31fPtpodDzdRUonhKRl30,6686
@@ -71,6 +71,8 @@ judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompt
  judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py,sha256=JPCvrekKLbl_xdD49evhtiFIVocuegCpCBkn1auzTSE,184
  judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py,sha256=BtVgE7z-9PHfFRcvn96aEG5mXVcWBweVyty934hZdiU,8915
  judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py,sha256=uO-8Uo7VrXu4xWpxjIx6_UI3aw5KuJxubSHb71Nzm6Q,4574
+ judgeval/scorers/judgeval_scorers/local_implementations/execution_order/__init__.py,sha256=DpOHbjYEhVmP-RiaTEa5PZHpoPvduNXG5p6k9lR0AS0,157
+ judgeval/scorers/judgeval_scorers/local_implementations/execution_order/execution_order.py,sha256=y-Ag8YuzEvExUIj4qU7y53INVLH9L_TUTJLIxCIdAQo,5458
  judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py,sha256=NbkSqPwxgF4T8KsvuIWhVyRwdOlo7mNHMFuRStTFnvk,154
  judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py,sha256=LPVTGHBBJSpE6TrgzZQS2_vw4P9HiUYmykrwo6UMdws,11251
  judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py,sha256=vNLjF4NKZJSV4VNenHzoAUB2xVZz6tt_5AzryKmOVrI,11690
@@ -84,11 +86,9 @@ judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_co
  judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py,sha256=mv6-XeLSV5yj1H98YYV2iTYVd88zKftZJP42Lgl6R80,89
  judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py,sha256=6GnRz2h-6Fwt4sl__0RgQOyo3n3iDO4MNuHWxdu-rrM,10242
  judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py,sha256=Qk7lwHgRPYeGoxTOyclAh1VfGItfvHJ6l1t7Nk3SWFM,20927
- judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py,sha256=JUB3TMqS1OHr6PqpIGqkyiBNbyfUaw7lZuUATjU3_ek,168
- judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py,sha256=8ucE8UrA44Mr-wHgVsFNU9gKunkPxe87VPYrFVi949g,5461
  judgeval/tracer/__init__.py,sha256=wy3DYpH8U_z0GO_K_gOSkK0tTTD-u5eLDo0T5xIBoAc,147
  judgeval/utils/alerts.py,sha256=RgW5R9Dn3Jtim0OyAYDbNzjoX2s6SA4Mw16GyyaikjI,1424
- judgeval-0.0.19.dist-info/METADATA,sha256=6HqNDRgJ1LI3hleMhMiGId7EULc9xJY0lYXhq4TEZOg,1283
- judgeval-0.0.19.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- judgeval-0.0.19.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
- judgeval-0.0.19.dist-info/RECORD,,
+ judgeval-0.0.21.dist-info/METADATA,sha256=jQW4w6jGNaHvPWTcqX3ZGr_SKeCpNl7DsNr-cwrYHsA,1378
+ judgeval-0.0.21.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ judgeval-0.0.21.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
+ judgeval-0.0.21.dist-info/RECORD,,
judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py DELETED
@@ -1,19 +0,0 @@
- """
- `judgeval` tool correctness scorer
-
- TODO add link to docs page for this scorer
-
- """
-
- # Internal imports
- from judgeval.scorers.api_scorer import APIJudgmentScorer
- from judgeval.constants import APIScorer
-
-
- class ToolCorrectnessScorer(APIJudgmentScorer):
-     def __init__(self, threshold: float):
-         super().__init__(threshold=threshold, score_type=APIScorer.TOOL_CORRECTNESS)
-
-     @property
-     def __name__(self):
-         return "Tool Correctness"
judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py DELETED
@@ -1,3 +0,0 @@
- from judgeval.scorers.judgeval_scorers.local_implementations.tool_correctness.tool_correctness_scorer import ToolCorrectnessScorer
-
- __all__ = ["ToolCorrectnessScorer"]