judgeval 0.0.18__py3-none-any.whl → 0.0.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
judgeval/clients.py CHANGED
@@ -1,7 +1,6 @@
 import os
 from dotenv import load_dotenv
 from openai import OpenAI
-from langfuse import Langfuse
 from typing import Optional
 from together import Together, AsyncTogether
 
judgeval/common/tracer.py CHANGED
@@ -11,6 +11,7 @@ import time
 import uuid
 import warnings
 from contextlib import contextmanager
+from collections import defaultdict
 from dataclasses import dataclass, field
 from datetime import datetime
 from http import HTTPStatus
@@ -962,6 +963,10 @@ def _format_output_data(client: ApiClient, response: Any) -> dict:
 class JudgevalCallbackHandler(BaseCallbackHandler):
     def __init__(self, trace_client: TraceClient):
         self.trace_client = trace_client
+        self.previous_node = "__start__"
+        self.executed_node_tools = []
+        self.executed_nodes = []
+        self.executed_tools = []
         self.openai_count = 1
 
     def start_span(self, name: str, span_type: SpanType = "span"):
@@ -1049,6 +1054,23 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         # End the retriever span
         self.end_span(self.trace_client._current_span, span_type="retriever")
 
+    def on_chain_start(
+        self,
+        serialized: Dict[str, Any],
+        inputs: Dict[str, Any],
+        *,
+        run_id: UUID,
+        parent_run_id: Optional[UUID] = None,
+        tags: Optional[List[str]] = None,
+        metadata: Optional[Dict[str, Any]] = None,
+        **kwargs: Any
+    ) -> None:
+        node = metadata.get("langgraph_node")
+        if node != None and node != "__start__" and node != self.previous_node:
+            self.executed_node_tools.append(node)
+            self.executed_nodes.append(node)
+            self.previous_node = node
+
     def on_tool_start(
         self,
         serialized: Optional[dict[str, Any]],
@@ -1060,6 +1082,8 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
     ):
         name = serialized["name"]
         self.start_span(name, span_type="tool")
+        self.executed_node_tools.append(f"{self.previous_node}:{name}")
+        self.executed_tools.append(name)
         self.trace_client.record_input({
             'args': input_str,
             'kwargs': kwargs
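
The new handler attributes make the executed LangGraph path inspectable after a run. A minimal sketch of reading them back, assuming a compiled LangGraph `graph` and an already-constructed judgeval `trace_client` (both placeholders, not defined in this diff):

    handler = JudgevalCallbackHandler(trace_client)
    graph.invoke({"input": "hello"}, config={"callbacks": [handler]})

    # on_chain_start records each newly entered node; on_tool_start records the
    # tool name plus a "node:tool" entry qualified by the node that invoked it.
    print(handler.executed_nodes)       # e.g. ["retrieve", "generate"]
    print(handler.executed_tools)       # e.g. ["search"]
    print(handler.executed_node_tools)  # e.g. ["retrieve", "retrieve:search", "generate"]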
judgeval/constants.py CHANGED
@@ -22,7 +22,7 @@ class APIScorer(str, Enum):
     CONTEXTUAL_RELEVANCY = "contextual_relevancy"
     CONTEXTUAL_PRECISION = "contextual_precision"
     INSTRUCTION_ADHERENCE = "instruction_adherence"
-    TOOL_CORRECTNESS = "tool_correctness"
+    EXECUTION_ORDER = "execution_order"
     JSON_CORRECTNESS = "json_correctness"
     COMPARISON = "comparison"
     GROUNDEDNESS = "groundedness"
judgeval/data/api_example.py CHANGED
@@ -1,4 +1,4 @@
-from typing import List, Optional, Dict, Any
+from typing import List, Optional, Dict, Any, Union
 from pydantic import BaseModel, ConfigDict, model_validator
 
 from judgeval.data.example import Example
@@ -13,8 +13,8 @@ class ProcessExample(BaseModel):
     """
     name: str
     input: Optional[str] = None
-    actual_output: Optional[str] = None
-    expected_output: Optional[str] = None
+    actual_output: Optional[Union[str, List[str]]] = None
+    expected_output: Optional[Union[str, List[str]]] = None
     context: Optional[list] = None
     retrieval_context: Optional[list] = None
     tools_called: Optional[list] = None
@@ -57,19 +57,6 @@
 
     def update_run_duration(self, run_duration: float):
         self.run_duration = run_duration
-
-    @model_validator(mode="before")
-    def check_input(cls, values: Dict[str, Any]):
-        input = values.get("input")
-        actual_output = values.get("actual_output")
-
-        if (input is None or actual_output is None):
-            error(f"Validation error: Required fields missing. input={input}, actual_output={actual_output}")
-            raise ValueError(
-                "'input' and 'actual_output' must be provided."
-            )
-
-        return values
 
 
 def create_process_example(
judgeval/data/datasets/dataset.py CHANGED
@@ -3,6 +3,7 @@ import csv
 import datetime
 import json
 import os
+import yaml
 from dataclasses import dataclass, field
 from typing import List, Union, Literal
 
@@ -190,6 +191,76 @@ class EvalDataset:
         for g in ground_truths:
             self.add_ground_truth(g)
 
+    def add_from_yaml(self, file_path: str) -> None:
+        debug(f"Loading dataset from YAML file: {file_path}")
+        """
+        Adds examples and ground truths from a YAML file.
+
+        The format of the YAML file is expected to be a dictionary with two keys: "examples" and "ground_truths".
+        The value of each key is a list of dictionaries, where each dictionary represents an example or ground truth.
+
+        The YAML file is expected to have the following format:
+        ground_truths:
+          - input: "test input"
+            actual_output: null
+            expected_output: "expected output"
+            context:
+              - "context1"
+            retrieval_context:
+              - "retrieval1"
+            additional_metadata:
+              key: "value"
+            comments: "test comment"
+            tools_called:
+              - "tool1"
+            expected_tools:
+              - "tool1"
+            source_file: "test.py"
+            trace_id: "094121"
+        examples:
+          - input: "test input"
+            actual_output: "test output"
+            expected_output: "expected output"
+            context:
+              - "context1"
+              - "context2"
+            retrieval_context:
+              - "retrieval1"
+            additional_metadata:
+              key: "value"
+            tools_called:
+              - "tool1"
+            expected_tools:
+              - "tool1"
+              - "tool2"
+            name: "test example"
+            example_id: null
+            timestamp: "20241230_160117"
+            trace_id: "123"
+        """
+        try:
+            with open(file_path, "r") as file:
+                payload = yaml.safe_load(file)
+                if payload is None:
+                    raise ValueError("The YAML file is empty.")
+                examples = payload.get("examples", [])
+                ground_truths = payload.get("ground_truths", [])
+        except FileNotFoundError:
+            error(f"YAML file not found: {file_path}")
+            raise FileNotFoundError(f"The file {file_path} was not found.")
+        except yaml.YAMLError:
+            error(f"Invalid YAML file: {file_path}")
+            raise ValueError(f"The file {file_path} is not a valid YAML file.")
+
+        info(f"Added {len(examples)} examples and {len(ground_truths)} ground truths from YAML")
+        new_examples = [Example(**e) for e in examples]
+        for e in new_examples:
+            self.add_example(e)
+
+        new_ground_truths = [GroundTruthExample(**g) for g in ground_truths]
+        for g in new_ground_truths:
+            self.add_ground_truth(g)
+
     def add_example(self, e: Example) -> None:
         self.examples = self.examples + [e]
         # TODO if we need to add rank, then we need to do it here
@@ -197,7 +268,7 @@ class EvalDataset:
     def add_ground_truth(self, g: GroundTruthExample) -> None:
         self.ground_truths = self.ground_truths + [g]
 
-    def save_as(self, file_type: Literal["json", "csv"], dir_path: str, save_name: str = None) -> None:
+    def save_as(self, file_type: Literal["json", "csv", "yaml"], dir_path: str, save_name: str = None) -> None:
         """
         Saves the dataset as a file. Save both the ground truths and examples.
 
@@ -266,8 +337,49 @@ class EvalDataset:
                         g.trace_id
                    ]
                )
+        elif file_type == "yaml":
+            with open(complete_path, "w") as file:
+                yaml_data = {
+                    "examples": [
+                        {
+                            "input": e.input,
+                            "actual_output": e.actual_output,
+                            "expected_output": e.expected_output,
+                            "context": e.context,
+                            "retrieval_context": e.retrieval_context,
+                            "additional_metadata": e.additional_metadata,
+                            "tools_called": e.tools_called,
+                            "expected_tools": e.expected_tools,
+                            "name": e.name,
+                            "comments": None,  # Example does not have comments
+                            "source_file": None,  # Example does not have source file
+                            "example": True,  # Adding an Example
+                            "trace_id": e.trace_id
+                        }
+                        for e in self.examples
+                    ],
+                    "ground_truths": [
+                        {
+                            "input": g.input,
+                            "actual_output": g.actual_output,
+                            "expected_output": g.expected_output,
+                            "context": g.context,
+                            "retrieval_context": g.retrieval_context,
+                            "additional_metadata": g.additional_metadata,
+                            "tools_called": g.tools_called,
+                            "expected_tools": g.expected_tools,
+                            "name": None,  # GroundTruthExample does not have name
+                            "comments": g.comments,
+                            "source_file": g.source_file,
+                            "example": False,  # Adding a GroundTruthExample, not an Example
+                            "trace_id": g.trace_id
+                        }
+                        for g in self.ground_truths
+                    ]
+                }
+                yaml.dump(yaml_data, file, default_flow_style=False)
         else:
-            ACCEPTABLE_FILE_TYPES = ["json", "csv"]
+            ACCEPTABLE_FILE_TYPES = ["json", "csv", "yaml"]
             raise TypeError(f"Invalid file type: {file_type}. Please choose from {ACCEPTABLE_FILE_TYPES}")
 
     def __iter__(self):
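
Together, `save_as("yaml", ...)` and `add_from_yaml` give the dataset a YAML round-trip. A hedged usage sketch, assuming `dataset` is an EvalDataset that already holds examples and that the output path is assembled from dir_path and save_name (the filename layout is inferred, not shown in this diff):

    dataset.save_as("yaml", "./exports", save_name="eval_v1")  # writes examples and ground truths

    restored = EvalDataset()  # constructor arguments assumed
    restored.add_from_yaml("./exports/eval_v1.yaml")
    # add_from_yaml raises FileNotFoundError for a missing path and
    # ValueError for an empty or unparseable YAML file.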
judgeval/data/example.py CHANGED
@@ -2,11 +2,13 @@
 Classes for representing examples in a dataset.
 """
 
-from typing import Optional, Any, Dict, List
+
+from typing import Optional, Any, Dict, List, Union
 from uuid import uuid4
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, field_validator
 from enum import Enum
 from datetime import datetime
+import time
 
 
 class ExampleParams(Enum):
@@ -22,9 +24,9 @@ class ExampleParams(Enum):
 
 
 class Example(BaseModel):
-    input: str
-    actual_output: str
-    expected_output: Optional[str] = None
+    input: Optional[str] = None
+    actual_output: Optional[Union[str, List[str]]] = None
+    expected_output: Optional[Union[str, List[str]]] = None
     context: Optional[List[str]] = None
     retrieval_context: Optional[List[str]] = None
     additional_metadata: Optional[Dict[str, Any]] = None
@@ -37,12 +39,6 @@
     trace_id: Optional[str] = None
 
     def __init__(self, **data):
-        # Check that required fields are provided
-        if 'input' not in data:
-            raise ValueError("Example must be initialized with 'input' field.")
-        if 'actual_output' not in data:
-            raise ValueError("Example must be initialized with 'actual_output' field.")
-
         if 'example_id' not in data:
             data['example_id'] = str(uuid4())
         # Set timestamp if not provided
@@ -53,22 +49,27 @@
     @field_validator('input', mode='before')
     @classmethod
     def validate_input(cls, v):
-        if not v or not isinstance(v, str):
+        if v is not None and (not v or not isinstance(v, str)):
             raise ValueError(f"Input must be a non-empty string but got '{v}' of type {type(v)}")
         return v
 
     @field_validator('actual_output', mode='before')
     @classmethod
     def validate_actual_output(cls, v):
-        if not isinstance(v, str):
-            raise ValueError(f"Actual output must be a string but got '{v}' of type {type(v)}")
+        if v is not None:
+            if not isinstance(v, (str, list)):
+                raise ValueError(f"Actual output must be a string or a list of strings but got {v} of type {type(v)}")
+            if isinstance(v, list) and not all(isinstance(item, str) for item in v):
+                raise ValueError(f"All items in actual_output must be strings but got {v}")
         return v
 
    @field_validator('expected_output', mode='before')
    @classmethod
    def validate_expected_output(cls, v):
-        if v is not None and not isinstance(v, str):
-            raise ValueError(f"Expected output must be a string or None but got {v} of type {type(v)}")
+        if v is not None and not isinstance(v, (str, list)):
+            raise ValueError(f"Expected output must be a string, a list of strings, or None but got {v} of type {type(v)}")
+        if isinstance(v, list) and not all(isinstance(item, str) for item in v):
+            raise ValueError(f"All items in expected_output must be strings but got {v}")
         return v
 
     @field_validator('context', 'retrieval_context', 'tools_called', 'expected_tools', mode='before')
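
With `input` and `actual_output` no longer required and both output fields widened to Union[str, List[str]], an Example can now carry an ordered sequence of steps rather than a single string, which is what execution-order scoring consumes. An illustrative construction (field values invented):

    from judgeval.data.example import Example

    example = Example(
        input="What's the weather in SF?",
        actual_output=["weather_lookup", "summarize"],    # list[str] now passes validation
        expected_output=["weather_lookup", "summarize"],
    )
    # Mixed-type lists are still rejected:
    # Example(actual_output=["tool", 42]) raises ValueError.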
judgeval/data/result.py CHANGED
@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import List, Union, Optional, Dict, Any
+from typing import List, Union, Optional, Dict, Any, Union
 
 from judgeval.data import ScorerData, ProcessExample
 
@@ -30,8 +30,8 @@ class ScoringResult:
 
     # Inputs from the original example
     input: Optional[str] = None
-    actual_output: Optional[str] = None
-    expected_output: Optional[str] = None
+    actual_output: Optional[Union[str, List[str]]] = None
+    expected_output: Optional[Union[str, List[str]]] = None
     context: Optional[List[str]] = None
     retrieval_context: Optional[List[str]] = None
     additional_metadata: Optional[Dict[str, Any]] = None
judgeval/scorers/__init__.py CHANGED
@@ -2,7 +2,7 @@ from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.scorers.judgeval_scorer import JudgevalScorer
 from judgeval.scorers.prompt_scorer import PromptScorer, ClassifierScorer
 from judgeval.scorers.judgeval_scorers import (
-    ToolCorrectnessScorer,
+    ExecutionOrderScorer,
     JSONCorrectnessScorer,
     SummarizationScorer,
     HallucinationScorer,
@@ -24,7 +24,7 @@ __all__ = [
     "JudgevalScorer",
     "PromptScorer",
     "ClassifierScorer",
-    "ToolCorrectnessScorer",
+    "ExecutionOrderScorer",
     "JSONCorrectnessScorer",
     "SummarizationScorer",
     "HallucinationScorer",
judgeval/scorers/judgeval_scorers/__init__.py CHANGED
@@ -2,7 +2,7 @@ from typing import Type, Optional, Any
 
 # Import implementations
 from judgeval.scorers.judgeval_scorers.api_scorers import (
-    ToolCorrectnessScorer as APIToolCorrectnessScorer,
+    ExecutionOrderScorer as APIExecutionOrderScorer,
     JSONCorrectnessScorer as APIJSONCorrectnessScorer,
     SummarizationScorer as APISummarizationScorer,
     HallucinationScorer as APIHallucinationScorer,
@@ -24,7 +24,7 @@ from judgeval.scorers.judgeval_scorers.local_implementations import (
     ContextualRelevancyScorer as LocalContextualRelevancyScorer,
     FaithfulnessScorer as LocalFaithfulnessScorer,
     JsonCorrectnessScorer as LocalJsonCorrectnessScorer,
-    ToolCorrectnessScorer as LocalToolCorrectnessScorer,
+    ExecutionOrderScorer as LocalExecutionOrderScorer,
     HallucinationScorer as LocalHallucinationScorer,
     SummarizationScorer as LocalSummarizationScorer,
     AnswerCorrectnessScorer as LocalAnswerCorrectnessScorer,
@@ -98,9 +98,9 @@ AnswerRelevancyScorer = ScorerWrapper(
     local_implementation=LocalAnswerRelevancyScorer
 )
 
-ToolCorrectnessScorer = ScorerWrapper(
-    api_implementation=APIToolCorrectnessScorer,
-    local_implementation=LocalToolCorrectnessScorer
+ExecutionOrderScorer = ScorerWrapper(
+    api_implementation=APIExecutionOrderScorer,
+    local_implementation=LocalExecutionOrderScorer
 )
 
 JSONCorrectnessScorer = ScorerWrapper(
@@ -154,7 +154,7 @@ GroundednessScorer = ScorerWrapper(
 )
 
 __all__ = [
-    "ToolCorrectnessScorer",
+    "ExecutionOrderScorer",
     "JSONCorrectnessScorer",
     "SummarizationScorer",
     "HallucinationScorer",
judgeval/scorers/judgeval_scorers/api_scorers/__init__.py CHANGED
@@ -1,4 +1,4 @@
-from judgeval.scorers.judgeval_scorers.api_scorers.tool_correctness import ToolCorrectnessScorer
+from judgeval.scorers.judgeval_scorers.api_scorers.execution_order import ExecutionOrderScorer
 from judgeval.scorers.judgeval_scorers.api_scorers.json_correctness import JSONCorrectnessScorer
 from judgeval.scorers.judgeval_scorers.api_scorers.summarization import SummarizationScorer
 from judgeval.scorers.judgeval_scorers.api_scorers.hallucination import HallucinationScorer
@@ -13,7 +13,7 @@ from judgeval.scorers.judgeval_scorers.api_scorers.instruction_adherence import
 from judgeval.scorers.judgeval_scorers.api_scorers.groundedness import GroundednessScorer
 
 __all__ = [
-    "ToolCorrectnessScorer",
+    "ExecutionOrderScorer",
     "JSONCorrectnessScorer",
     "SummarizationScorer",
     "HallucinationScorer",
judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py ADDED
@@ -0,0 +1,35 @@
+"""
+`judgeval` tool correctness scorer
+
+TODO add link to docs page for this scorer
+
+"""
+
+# Internal imports
+from judgeval.scorers.api_scorer import APIJudgmentScorer
+from judgeval.constants import APIScorer
+from typing import Optional, Dict
+
+class ExecutionOrderScorer(APIJudgmentScorer):
+    kwargs: Optional[Dict] = None
+
+    def __init__(self, threshold: float, should_exact_match: bool = False, should_consider_ordering: bool = False):
+        super().__init__(threshold=threshold, score_type=APIScorer.EXECUTION_ORDER)
+        self.kwargs = {"should_exact_match": should_exact_match, "should_consider_ordering": should_consider_ordering}
+
+    @property
+    def __name__(self):
+        return "Execution Order"
+
+    def to_dict(self) -> dict:
+        """
+        Converts the scorer configuration to a dictionary format.
+
+        Returns:
+            dict: A dictionary containing the scorer's configuration
+        """
+        return {
+            "score_type": self.score_type,
+            "threshold": self.threshold,
+            "kwargs": self.kwargs
+        }
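
Unlike the deleted ToolCorrectnessScorer, the API-side scorer carries its two comparison flags in `kwargs` and serializes them with `to_dict()`. A short sketch using only what the new file defines:

    from judgeval.scorers.judgeval_scorers.api_scorers import ExecutionOrderScorer

    scorer = ExecutionOrderScorer(threshold=1.0, should_exact_match=True)
    print(scorer.to_dict())
    # {"score_type": APIScorer.EXECUTION_ORDER, "threshold": 1.0,
    #  "kwargs": {"should_exact_match": True, "should_consider_ordering": False}}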
judgeval/scorers/judgeval_scorers/local_implementations/__init__.py CHANGED
@@ -4,7 +4,7 @@ from judgeval.scorers.judgeval_scorers.local_implementations.contextual_recall.c
 from judgeval.scorers.judgeval_scorers.local_implementations.contextual_relevancy.contextual_relevancy_scorer import ContextualRelevancyScorer
 from judgeval.scorers.judgeval_scorers.local_implementations.faithfulness.faithfulness_scorer import FaithfulnessScorer
 from judgeval.scorers.judgeval_scorers.local_implementations.json_correctness.json_correctness_scorer import JsonCorrectnessScorer
-from judgeval.scorers.judgeval_scorers.local_implementations.tool_correctness.tool_correctness_scorer import ToolCorrectnessScorer
+from judgeval.scorers.judgeval_scorers.local_implementations.execution_order.execution_order import ExecutionOrderScorer
 from judgeval.scorers.judgeval_scorers.local_implementations.hallucination.hallucination_scorer import HallucinationScorer
 from judgeval.scorers.judgeval_scorers.local_implementations.summarization.summarization_scorer import SummarizationScorer
 from judgeval.scorers.judgeval_scorers.local_implementations.answer_correctness.answer_correctness_scorer import AnswerCorrectnessScorer
@@ -20,7 +20,7 @@ __all__ = [
     "ContextualRelevancyScorer",
     "FaithfulnessScorer",
     "JsonCorrectnessScorer",
-    "ToolCorrectnessScorer",
+    "ExecutionOrderScorer",
     "HallucinationScorer",
     "SummarizationScorer",
     "InstructionAdherenceScorer",
judgeval/scorers/judgeval_scorers/local_implementations/execution_order/__init__.py ADDED
@@ -0,0 +1,3 @@
+from judgeval.scorers.judgeval_scorers.local_implementations.execution_order.execution_order import ExecutionOrderScorer
+
+__all__ = ["ExecutionOrderScorer"]
judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py → judgeval/scorers/judgeval_scorers/local_implementations/execution_order/execution_order.py RENAMED
@@ -45,7 +45,7 @@ def get_lcs(seq1, seq2):
     return lcs[::-1]
 
 
-class ToolCorrectnessScorer(JudgevalScorer):
+class ExecutionOrderScorer(JudgevalScorer):
     def __init__(
         self,
         threshold: float = 0.5,
@@ -56,7 +56,7 @@ class ToolCorrectnessScorer(JudgevalScorer):
         should_consider_ordering: bool = False,
     ):
         super().__init__(
-            score_type=APIScorer.TOOL_CORRECTNESS,
+            score_type=APIScorer.EXECUTION_ORDER,
             threshold=1 if strict_mode else threshold,
             evaluation_model=None,
             include_reason=include_reason,
@@ -152,5 +152,5 @@ class ToolCorrectnessScorer(JudgevalScorer):
 
     @property
     def __name__(self):
-        return "Tool Correctness"
+        return "Execution Order"
 
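
The renamed local implementation keeps the `get_lcs` helper, so ordering-aware scoring presumably compares the called and expected tool sequences through their longest common subsequence (inferred from the helper's name and the `return lcs[::-1]` context above; the scoring formula itself is not shown in this diff):

    from judgeval.scorers.judgeval_scorers.local_implementations import ExecutionOrderScorer

    scorer = ExecutionOrderScorer(threshold=0.5, should_consider_ordering=True)
    # e.g. get_lcs(["search", "fetch", "summarize"], ["search", "summarize", "rank"])
    # would yield ["search", "summarize"]: the tools shared in the same relative order.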
judgeval-0.0.18.dist-info/METADATA → judgeval-0.0.20.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.0.18
+Version: 0.0.20
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
judgeval-0.0.18.dist-info/RECORD → judgeval-0.0.20.dist-info/RECORD CHANGED
@@ -1,6 +1,6 @@
 judgeval/__init__.py,sha256=dtXxsCmI4eEsZdGSUMy8P_pA0bc2-OSGAgb2C__yJoA,252
-judgeval/clients.py,sha256=mG3SeEdzAA4eUrxbNHIpxWVgGykknpvCo3_wtKOa324,974
-judgeval/constants.py,sha256=BXTzKBmhDVutiitaCRarfkc_M-0NplRJofIt_QSa5QI,5010
+judgeval/clients.py,sha256=6VQmEqmfCngUdS2MuPBIpHvtDFqOENm8-_BmMvjLyRQ,944
+judgeval/constants.py,sha256=i8JIDUyo38Vt0R1n0GRA4FaakkBC5F2o4hQa0ncSF2E,5008
 judgeval/evaluation_run.py,sha256=RgJD60lJsunNQzObjo7iXnAzXWgubCLOAAuuamAAuoI,6354
 judgeval/judgment_client.py,sha256=evlvcrYO9pF-oCgcvlGE59iODN0C6GJtn7bySFU_88k,23384
 judgeval/rules.py,sha256=ebsiDEBVAnYTQxwVNvh_RpmKeWBnjQXgHs8KofTjcAs,15526
@@ -8,16 +8,16 @@ judgeval/run_evaluation.py,sha256=yLW24kFcw0xzXHvnDclYqtujTww6SDwvut6HM1x7SXk,21
 judgeval/common/__init__.py,sha256=7d24BRxtncpMj3AAJCj8RS7TqgjXmW777HVZH6-3sBs,289
 judgeval/common/exceptions.py,sha256=U-TxHLn7oVMezsMuoYouNDb2XuS8RCggfntYf5_6u4E,565
 judgeval/common/logger.py,sha256=KO75wWXCxhUHUMvLaTU31ZzOk6tkZBa7heQ7y0f-zFE,6062
-judgeval/common/tracer.py,sha256=tTG4VZRXJjilm0ltQCeXJvd7TiL9W1PSVaf0LOmw2C4,44430
+judgeval/common/tracer.py,sha256=FYrAuav6OiiawHLQ2e154MLvCBMdh-z_ucU2h7XK08M,45295
 judgeval/common/utils.py,sha256=LUQV5JfDr6wj7xHAJoNq-gofNZ6mjXbeKrGKzBME1KM,33533
 judgeval/data/__init__.py,sha256=QykVE22Qf-I2f1g-jC9-iQyLNXgDmX1-vHbCgZg8Ra8,558
-judgeval/data/api_example.py,sha256=NEiJKpf2WIo4FPQ2-vuoCZ_9ixexhdg_wdNYWXPSA2M,4094
-judgeval/data/example.py,sha256=jsKkq91CWUnsvlfPP8qdXTCOg7l5ClFQkCeVoNJCZMc,5631
+judgeval/data/api_example.py,sha256=dzkrQ0xno08y6qNfqL2djXbapUyc2B2aQ5iANn0o4CY,3667
+judgeval/data/example.py,sha256=BhGBhamFWgH6wtvrRYM8dGtDfXh-cDxDhtNL5Gbdz_M,5892
 judgeval/data/ground_truth.py,sha256=OTBs3VZe-Wp0vEXEsq14GPZHYtpWT16bhGQTycIvkKc,2057
-judgeval/data/result.py,sha256=8FIO-bFKPegZuByKRjA2_sumjb8oGWQ5ZeQ1RVz5z2w,4393
+judgeval/data/result.py,sha256=4fgjKtUmT3br7K6fkRiNIxTGKUuwMeGyRLqzkpxwXKE,4436
 judgeval/data/scorer_data.py,sha256=JVlaTx1EP2jw2gh3Vgx1CSEsvIFABAN26IquKyxwiJQ,3273
 judgeval/data/datasets/__init__.py,sha256=IdNKhQv9yYZ_op0rdBacrFaFVmiiYQ3JTzXzxOTsEVQ,176
-judgeval/data/datasets/dataset.py,sha256=0NItb98Yz0P954rg9FF9s09uVQ7cEg9A5J6Xvie9nhw,12022
+judgeval/data/datasets/dataset.py,sha256=LrBK8y3y1R9_BKmXxTzdXMMIQvXlq7tf7TM-u7jgSxE,16839
 judgeval/data/datasets/eval_dataset_client.py,sha256=QsfHyFC4WePV7uJGYUVjiIwtk1Ie_VpWUrnd2Q4kKdU,11479
 judgeval/data/datasets/utils.py,sha256=6DpGCPmGFNOKIGNcVCOSjTOdWemrpAuYnlo778sGG7g,2455
 judgeval/judges/__init__.py,sha256=6X7VSwrwsdxGBNxCyapVRWGghhKOy3MVxFNMQ62kCXM,308
@@ -26,7 +26,7 @@ judgeval/judges/litellm_judge.py,sha256=EIL58Teptv8DzZUO3yP2RDQCDq-aoBB6HPZzPdK6
 judgeval/judges/mixture_of_judges.py,sha256=IJoi4Twk8ze1CJWVEp69k6TSqTCTGrmVYQ0qdffer60,15549
 judgeval/judges/together_judge.py,sha256=l00hhPerAZXg3oYBd8cyMtWsOTNt_0FIqoxhKJKQe3k,2302
 judgeval/judges/utils.py,sha256=9lvUxziGV86ISvVFxYBWc09TWFyAQgUTyPf_a9mD5Rs,2686
-judgeval/scorers/__init__.py,sha256=_KP6c1dr6O2p95hx_WvRpZXfSGg9r2hNn_PjY9Ch5ds,1160
+judgeval/scorers/__init__.py,sha256=gkeKJvjXhswCnkEyjijrVvGVM3Om86egrZ-PUOGvNvI,1158
 judgeval/scorers/api_scorer.py,sha256=wGqTQCbUE7uE-PzaKcCmexAqutdTunjFR0zVA6bUxdE,2518
 judgeval/scorers/base_scorer.py,sha256=xdUlY3CnLdCQ1Z5iUeY22Bim5v-OQruZmaVF_4Y1mC0,2183
 judgeval/scorers/exceptions.py,sha256=eGW5CuJgZ5YJBFrE4FHDSF651PO1dKAZ379mJ8gOsfo,178
@@ -34,25 +34,25 @@ judgeval/scorers/judgeval_scorer.py,sha256=oIkfoGXA09wL_vcK1DRibzQSA-MFNa-hmw1Ih
 judgeval/scorers/prompt_scorer.py,sha256=PaAs2qRolw1P3_I061Xvk9qzvF4O-JR8g_39RqXnHcM,17728
 judgeval/scorers/score.py,sha256=GALVmeApP1Cyih2vY93zRaU6RShtW4jJDG47Pm6yfnw,18657
 judgeval/scorers/utils.py,sha256=iHQVTlIANbmCTXz9kTeSdOytgUZ_T74Re61ajqsk_WQ,6827
-judgeval/scorers/judgeval_scorers/__init__.py,sha256=-nnqz-aU5PB_m1cb-2ySpZ18WDxupxmQCr-ws0aSalw,6000
-judgeval/scorers/judgeval_scorers/api_scorers/__init__.py,sha256=cJSwTA6hqZXUSaPkTl4yDyl3cUzv0IlcTu592uoTY98,1651
+judgeval/scorers/judgeval_scorers/__init__.py,sha256=xFRb62sp4JmBUSeuAB_pC_7kEGp-lGdqCRIu9--Bbdg,5992
+judgeval/scorers/judgeval_scorers/api_scorers/__init__.py,sha256=mZ6b_5Dl04k3PaG24ICBajB_j43ody1II1OJhO1DkXo,1648
 judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py,sha256=690G5askjE8dcbKPGvCF6JxAEM9QJUqb-3K-D6lI6oM,463
 judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py,sha256=CqvvjV7AZqPlXh-PZaPKYPILHr15u4bIYiKBFjlk5i0,457
 judgeval/scorers/judgeval_scorers/api_scorers/comparison.py,sha256=6Q1qbsANOoZ3PM8n_gtZLIMbTBB9879L3acRelNJ6Uk,1001
 judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py,sha256=2zBrm_EEc143bmPA4HVcf8XtQeuc_BexczGx-SHlwRY,473
 judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py,sha256=NyojBWy_lRYx8diREulSK8s9dfYdZav4eZjg3TwUm0M,461
 judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py,sha256=wROMWOliCnB39ftX9TdeZmG9y0vrnxIGVby65tLOQRU,574
+judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py,sha256=qxnvEDeKRlyzxX3EX53sW4oXxAM8Fj_q6ibdTxJNTAc,1076
 judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py,sha256=gNf_i5c0jjpz2zCGhe7TtDMLKxc1PdOExJMFB5X7hSg,442
 judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py,sha256=esO76hEp0NzeBUdoSICPLdx5AeA5zWSt_2zpcSgvGis,442
 judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py,sha256=ffYwH3CexPkKgo1rCALMivypROQjG5WWEsKXEFZxe2k,446
 judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py,sha256=t1lWYOF0Pxvw5-NrI1Dt9FojaOncOCRlZc4a2SA20h4,477
 judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py,sha256=CAZBQKwNSqpqAoOgStYfr-yP1Brug_6VRimRIQY-zdg,894
 judgeval/scorers/judgeval_scorers/api_scorers/summarization.py,sha256=-E3oxYbI0D_0q-_fGWh2jQHW9O4Pu7I7xvLWsHU6cn8,450
-judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py,sha256=17ppPXm962ew67GU5m0npzbPu3CuhgdKY_KmfPvKfu4,457
 judgeval/scorers/judgeval_scorers/classifiers/__init__.py,sha256=Qt81W5ZCwMvBAne0LfQDb8xvg5iOG1vEYP7WizgwAZo,67
 judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py,sha256=8iTzMvou1Dr8pybul6lZHKjc9Ye2-0_racRGYkhEdTY,74
 judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py,sha256=ly72Z7s_c8NID6-nQnuW8qEGEW2MqdvpJ-5WfXzbAQg,2579
-judgeval/scorers/judgeval_scorers/local_implementations/__init__.py,sha256=pipWXfS_n4UsnZViwZAF2bPB1FYNfmoJAJUNY7JSq7I,1937
+judgeval/scorers/judgeval_scorers/local_implementations/__init__.py,sha256=k_t-THIAtsk7lNvm9faj0u24dPZjn7qRbZ8YGjQ21xs,1926
 judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py,sha256=cxxUEspgoIdSzJbwIIioamC0-xDqhYVfYAWxaYF-D_Y,177
 judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py,sha256=3Dpm8BIIe0Th2p0ccO5bb-le93lywjOLSo712HwEIUE,10196
 judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py,sha256=hBUqEd8Hy3g8peOVjpSmRb31fPtpodDzdRUonhKRl30,6686
@@ -71,6 +71,8 @@ judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompt
 judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py,sha256=JPCvrekKLbl_xdD49evhtiFIVocuegCpCBkn1auzTSE,184
 judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py,sha256=BtVgE7z-9PHfFRcvn96aEG5mXVcWBweVyty934hZdiU,8915
 judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py,sha256=uO-8Uo7VrXu4xWpxjIx6_UI3aw5KuJxubSHb71Nzm6Q,4574
+judgeval/scorers/judgeval_scorers/local_implementations/execution_order/__init__.py,sha256=DpOHbjYEhVmP-RiaTEa5PZHpoPvduNXG5p6k9lR0AS0,157
+judgeval/scorers/judgeval_scorers/local_implementations/execution_order/execution_order.py,sha256=y-Ag8YuzEvExUIj4qU7y53INVLH9L_TUTJLIxCIdAQo,5458
 judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py,sha256=NbkSqPwxgF4T8KsvuIWhVyRwdOlo7mNHMFuRStTFnvk,154
 judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py,sha256=LPVTGHBBJSpE6TrgzZQS2_vw4P9HiUYmykrwo6UMdws,11251
 judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py,sha256=vNLjF4NKZJSV4VNenHzoAUB2xVZz6tt_5AzryKmOVrI,11690
@@ -84,11 +86,9 @@ judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_co
 judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py,sha256=mv6-XeLSV5yj1H98YYV2iTYVd88zKftZJP42Lgl6R80,89
 judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py,sha256=6GnRz2h-6Fwt4sl__0RgQOyo3n3iDO4MNuHWxdu-rrM,10242
 judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py,sha256=Qk7lwHgRPYeGoxTOyclAh1VfGItfvHJ6l1t7Nk3SWFM,20927
-judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py,sha256=JUB3TMqS1OHr6PqpIGqkyiBNbyfUaw7lZuUATjU3_ek,168
-judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py,sha256=8ucE8UrA44Mr-wHgVsFNU9gKunkPxe87VPYrFVi949g,5461
 judgeval/tracer/__init__.py,sha256=wy3DYpH8U_z0GO_K_gOSkK0tTTD-u5eLDo0T5xIBoAc,147
 judgeval/utils/alerts.py,sha256=RgW5R9Dn3Jtim0OyAYDbNzjoX2s6SA4Mw16GyyaikjI,1424
-judgeval-0.0.18.dist-info/METADATA,sha256=HgUKRC4MPmKHowspF1WKlP5xbpnJiLzfkbZiC4bYIek,1283
-judgeval-0.0.18.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-judgeval-0.0.18.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
-judgeval-0.0.18.dist-info/RECORD,,
+judgeval-0.0.20.dist-info/METADATA,sha256=cz7uKUuHAc1rdANc8IJ5klQhlmrqOu_K1y6wwEIAdFU,1283
+judgeval-0.0.20.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+judgeval-0.0.20.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
+judgeval-0.0.20.dist-info/RECORD,,
judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py DELETED
@@ -1,19 +0,0 @@
-"""
-`judgeval` tool correctness scorer
-
-TODO add link to docs page for this scorer
-
-"""
-
-# Internal imports
-from judgeval.scorers.api_scorer import APIJudgmentScorer
-from judgeval.constants import APIScorer
-
-
-class ToolCorrectnessScorer(APIJudgmentScorer):
-    def __init__(self, threshold: float):
-        super().__init__(threshold=threshold, score_type=APIScorer.TOOL_CORRECTNESS)
-
-    @property
-    def __name__(self):
-        return "Tool Correctness"
judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py DELETED
@@ -1,3 +0,0 @@
-from judgeval.scorers.judgeval_scorers.local_implementations.tool_correctness.tool_correctness_scorer.py import ToolCorrectnessScorer
-
-__all__ = ["ToolCorrectnessScorer"]