judgeval 0.0.8__py3-none-any.whl → 0.0.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
judgeval/common/tracer.py CHANGED
@@ -2,6 +2,7 @@
 Tracing system for judgeval that allows for function tracing using decorators.
 """
 
+import os
 import time
 import functools
 import requests
@@ -20,6 +21,7 @@ import json
 import warnings
 from pydantic import BaseModel
 from http import HTTPStatus
+from rich import print as rprint
 
 from judgeval.constants import JUDGMENT_TRACES_SAVE_API_URL
 from judgeval.judgment_client import JudgmentClient
@@ -121,8 +123,29 @@ class TraceEntry:
 
         Handles special cases:
         - Pydantic models are converted using model_dump()
+        - We try to serialize into JSON, then string, then the base representation (__repr__)
         - Non-serializable objects return None with a warning
         """
+
+        def safe_stringify(output, function_name):
+            """
+            Safely converts an object to a string or repr, handling serialization issues gracefully.
+            """
+            try:
+                return str(output)
+            except (TypeError, OverflowError, ValueError):
+                pass
+
+            try:
+                return repr(output)
+            except (TypeError, OverflowError, ValueError):
+                pass
+
+            warnings.warn(
+                f"Output for function {function_name} is not JSON serializable and could not be converted to string. Setting to None."
+            )
+            return None
+
         if isinstance(self.output, BaseModel):
            return self.output.model_dump()
 
@@ -131,8 +154,7 @@ class TraceEntry:
            json.dumps(self.output)
            return self.output
        except (TypeError, OverflowError, ValueError):
-            warnings.warn(f"Output for function {self.function} is not JSON serializable. Setting to None.")
-            return None
+            return safe_stringify(self.output, self.function)
 
 class TraceClient:
     """Client for managing a single trace context"""
@@ -361,6 +383,24 @@ class TraceClient:
        raw_entries = [entry.to_dict() for entry in self.entries]
        condensed_entries = self.condense_trace(raw_entries)
 
+        # Calculate total token counts from LLM API calls
+        total_prompt_tokens = 0
+        total_completion_tokens = 0
+        total_tokens = 0
+
+        for entry in condensed_entries:
+            if entry.get("span_type") == "llm" and isinstance(entry.get("output"), dict):
+                usage = entry["output"].get("usage", {})
+                # Handle OpenAI/Together format
+                if "prompt_tokens" in usage:
+                    total_prompt_tokens += usage.get("prompt_tokens", 0)
+                    total_completion_tokens += usage.get("completion_tokens", 0)
+                # Handle Anthropic format
+                elif "input_tokens" in usage:
+                    total_prompt_tokens += usage.get("input_tokens", 0)
+                    total_completion_tokens += usage.get("output_tokens", 0)
+                total_tokens += usage.get("total_tokens", 0)
+
        # Create trace document
        trace_data = {
            "trace_id": self.trace_id,
@@ -370,10 +410,10 @@ class TraceClient:
            "created_at": datetime.fromtimestamp(self.start_time).isoformat(),
            "duration": total_duration,
            "token_counts": {
-                "prompt_tokens": 0,  # Dummy value
-                "completion_tokens": 0,  # Dummy value
-                "total_tokens": 0,  # Dummy value
-            },  # TODO: Add token counts
+                "prompt_tokens": total_prompt_tokens,
+                "completion_tokens": total_completion_tokens,
+                "total_tokens": total_tokens,
+            },
            "entries": condensed_entries,
            "empty_save": empty_save,
            "overwrite": overwrite
@@ -393,6 +433,9 @@ class TraceClient:
        elif response.status_code != HTTPStatus.OK:
            raise ValueError(f"Failed to save trace data: {response.text}")
 
+        if not empty_save and "ui_results_url" in response.json():
+            rprint(f"\n🔍 You can view your trace data here: [rgb(106,0,255)]{response.json()['ui_results_url']}[/]\n")
+
        return self.trace_id, trace_data
 
 class Tracer:
@@ -403,7 +446,7 @@ class Tracer:
            cls._instance = super(Tracer, cls).__new__(cls)
        return cls._instance
 
-    def __init__(self, api_key: str):
+    def __init__(self, api_key: str = os.getenv("JUDGMENT_API_KEY")):
        if not hasattr(self, 'initialized'):
 
            if not api_key:
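Two of the tracer changes above are visible to callers: Tracer() can now be constructed without an explicit key (the default falls back to the JUDGMENT_API_KEY environment variable), and saved traces report token counts aggregated from "llm" spans instead of the previous dummy zeros. A minimal sketch, assuming JUDGMENT_API_KEY is exported before the module is imported (the default argument is evaluated at import time):

    from judgeval.common.tracer import Tracer

    tracer = Tracer()  # api_key defaults to os.getenv("JUDGMENT_API_KEY")

    # Usage shapes the new aggregation recognizes on an "llm" span's output (values are illustrative):
    openai_style = {"usage": {"prompt_tokens": 12, "completion_tokens": 34, "total_tokens": 46}}
    anthropic_style = {"usage": {"input_tokens": 12, "output_tokens": 34}}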
judgeval/constants.py CHANGED
@@ -36,6 +36,7 @@ ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
 JUDGMENT_EVAL_API_URL = f"{ROOT_API}/evaluate/"
 JUDGMENT_DATASETS_PUSH_API_URL = f"{ROOT_API}/datasets/push/"
 JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull/"
+JUDGMENT_DATASETS_PULL_ALL_API_URL = f"{ROOT_API}/datasets/get_all_stats/"
 JUDGMENT_EVAL_LOG_API_URL = f"{ROOT_API}/log_eval_results/"
 JUDGMENT_EVAL_FETCH_API_URL = f"{ROOT_API}/fetch_eval_results/"
 JUDGMENT_TRACES_SAVE_API_URL = f"{ROOT_API}/traces/save/"
@@ -1,4 +1,5 @@
 from judgeval.data.datasets.dataset import EvalDataset
 from judgeval.data.datasets.ground_truth import GroundTruthExample
+from judgeval.data.datasets.eval_dataset_client import EvalDatasetClient
 
-__all__ = ["EvalDataset", "GroundTruthExample"]
+__all__ = ["EvalDataset", "EvalDatasetClient", "GroundTruthExample"]
@@ -2,16 +2,11 @@ import ast
 import csv
 import datetime
 import json
-from rich.console import Console
-from rich.progress import Progress, SpinnerColumn, TextColumn
-import requests
 from dataclasses import dataclass, field
 import os
 from typing import List, Optional, Union, Literal
 
-from judgeval.constants import JUDGMENT_DATASETS_PUSH_API_URL, JUDGMENT_DATASETS_PULL_API_URL
 from judgeval.data.datasets.ground_truth import GroundTruthExample
-from judgeval.data.datasets.utils import ground_truths_to_examples, examples_to_ground_truths
 from judgeval.data import Example
 from judgeval.common.logger import debug, error, warning, info
 
@@ -37,120 +32,6 @@ class EvalDataset:
        self._id = None
        self.judgment_api_key = judgment_api_key
 
-    def push(self, alias: str, overwrite: Optional[bool] = False) -> bool:
-        debug(f"Pushing dataset with alias '{alias}' (overwrite={overwrite})")
-        if overwrite:
-            warning(f"Overwrite enabled for alias '{alias}'")
-        """
-        Pushes the dataset to Judgment platform
-
-        Mock request:
-        {
-            "alias": alias,
-            "ground_truths": [...],
-            "examples": [...],
-            "overwrite": overwrite
-        } ==>
-        {
-            "_alias": alias,
-            "_id": "..."  # ID of the dataset
-        }
-        """
-        with Progress(
-            SpinnerColumn(style="rgb(106,0,255)"),
-            TextColumn("[progress.description]{task.description}"),
-            transient=False,
-        ) as progress:
-            task_id = progress.add_task(
-                f"Pushing [rgb(106,0,255)]'{alias}' to Judgment...",
-                total=100,
-            )
-            content = {
-                "alias": alias,
-                "ground_truths": [g.to_dict() for g in self.ground_truths],
-                "examples": [e.to_dict() for e in self.examples],
-                "overwrite": overwrite,
-                "judgment_api_key": self.judgment_api_key
-            }
-            try:
-                response = requests.post(
-                    JUDGMENT_DATASETS_PUSH_API_URL,
-                    json=content
-                )
-                if response.status_code == 500:
-                    error(f"Server error during push: {content.get('message')}")
-                    return False
-                response.raise_for_status()
-            except requests.exceptions.HTTPError as err:
-                if response.status_code == 422:
-                    error(f"Validation error during push: {err.response.json()}")
-                else:
-                    error(f"HTTP error during push: {err}")
-
-            info(f"Successfully pushed dataset with alias '{alias}'")
-            payload = response.json()
-            self._alias = payload.get("_alias")
-            self._id = payload.get("_id")
-            progress.update(
-                task_id,
-                description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
-            )
-            return True
-
-    def pull(self, alias: str):
-        debug(f"Pulling dataset with alias '{alias}'")
-        """
-        Pulls the dataset from Judgment platform
-
-        Mock request:
-        {
-            "alias": alias,
-            "user_id": user_id
-        }
-        ==>
-        {
-            "ground_truths": [...],
-            "examples": [...],
-            "_alias": alias,
-            "_id": "..."  # ID of the dataset
-        }
-        """
-        # Make a POST request to the Judgment API to get the dataset
-
-        with Progress(
-            SpinnerColumn(style="rgb(106,0,255)"),
-            TextColumn("[progress.description]{task.description}"),
-            transient=False,
-        ) as progress:
-            task_id = progress.add_task(
-                f"Pulling [rgb(106,0,255)]'{alias}'[/rgb(106,0,255)] from Judgment...",
-                total=100,
-            )
-            request_body = {
-                "alias": alias,
-                "judgment_api_key": self.judgment_api_key
-            }
-
-            try:
-                response = requests.post(
-                    JUDGMENT_DATASETS_PULL_API_URL,
-                    json=request_body
-                )
-                response.raise_for_status()
-            except requests.exceptions.RequestException as e:
-                error(f"Error pulling dataset: {str(e)}")
-                raise
-
-            info(f"Successfully pulled dataset with alias '{alias}'")
-            payload = response.json()
-            self.ground_truths = [GroundTruthExample(**g) for g in payload.get("ground_truths", [])]
-            self.examples = [Example(**e) for e in payload.get("examples", [])]
-            self._alias = payload.get("_alias")
-            self._id = payload.get("_id")
-            progress.update(
-                task_id,
-                description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
-            )
 
     def add_from_json(self, file_path: str) -> None:
        debug(f"Loading dataset from JSON file: {file_path}")
@@ -402,6 +283,4 @@ class EvalDataset:
            f"_alias={self._alias}, "
            f"_id={self._id}"
            f")"
-        )
-
-
+        )
@@ -0,0 +1,193 @@
+
+from typing import Optional
+import requests
+from rich.progress import Progress, SpinnerColumn, TextColumn
+
+from judgeval.common.logger import debug, error, warning, info
+from judgeval.constants import (
+    JUDGMENT_DATASETS_PUSH_API_URL,
+    JUDGMENT_DATASETS_PULL_API_URL,
+    JUDGMENT_DATASETS_PULL_ALL_API_URL
+)
+from judgeval.data import Example
+from judgeval.data.datasets import EvalDataset
+from judgeval.data.datasets.ground_truth import GroundTruthExample
+
+
+
+
+class EvalDatasetClient:
+    def __init__(self, judgment_api_key: str):
+        self.judgment_api_key = judgment_api_key
+
+    def create_dataset(self) -> EvalDataset:
+        return EvalDataset(judgment_api_key=self.judgment_api_key)
+
+    def push(self, dataset: EvalDataset, alias: str, overwrite: Optional[bool] = False) -> bool:
+        debug(f"Pushing dataset with alias '{alias}' (overwrite={overwrite})")
+        if overwrite:
+            warning(f"Overwrite enabled for alias '{alias}'")
+        """
+        Pushes the dataset to Judgment platform
+
+        Mock request:
+        dataset = {
+            "alias": alias,
+            "ground_truths": [...],
+            "examples": [...],
+            "overwrite": overwrite
+        } ==>
+        {
+            "_alias": alias,
+            "_id": "..."  # ID of the dataset
+        }
+        """
+        with Progress(
+            SpinnerColumn(style="rgb(106,0,255)"),
+            TextColumn("[progress.description]{task.description}"),
+            transient=False,
+        ) as progress:
+            task_id = progress.add_task(
+                f"Pushing [rgb(106,0,255)]'{alias}' to Judgment...",
+                total=100,
+            )
+            content = {
+                "alias": alias,
+                "ground_truths": [g.to_dict() for g in dataset.ground_truths],
+                "examples": [e.to_dict() for e in dataset.examples],
+                "overwrite": overwrite,
+                "judgment_api_key": dataset.judgment_api_key
+            }
+            try:
+                response = requests.post(
+                    JUDGMENT_DATASETS_PUSH_API_URL,
+                    json=content
+                )
+                if response.status_code == 500:
+                    error(f"Server error during push: {content.get('message')}")
+                    return False
+                response.raise_for_status()
+            except requests.exceptions.HTTPError as err:
+                if response.status_code == 422:
+                    error(f"Validation error during push: {err.response.json()}")
+                else:
+                    error(f"HTTP error during push: {err}")
+
+            info(f"Successfully pushed dataset with alias '{alias}'")
+            payload = response.json()
+            dataset._alias = payload.get("_alias")
+            dataset._id = payload.get("_id")
+            progress.update(
+                task_id,
+                description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
+            )
+            return True
+
+    def pull(self, alias: str) -> EvalDataset:
+        debug(f"Pulling dataset with alias '{alias}'")
+        """
+        Pulls the dataset from Judgment platform
+
+        Mock request:
+        {
+            "alias": alias,
+            "user_id": user_id
+        }
+        ==>
+        {
+            "ground_truths": [...],
+            "examples": [...],
+            "_alias": alias,
+            "_id": "..."  # ID of the dataset
+        }
+        """
+        # Make a POST request to the Judgment API to get the dataset
+        dataset = self.create_dataset()
+
+        with Progress(
+            SpinnerColumn(style="rgb(106,0,255)"),
+            TextColumn("[progress.description]{task.description}"),
+            transient=False,
+        ) as progress:
+            task_id = progress.add_task(
+                f"Pulling [rgb(106,0,255)]'{alias}'[/rgb(106,0,255)] from Judgment...",
+                total=100,
+            )
+            request_body = {
+                "alias": alias,
+                "judgment_api_key": self.judgment_api_key
+            }
+
+            try:
+                response = requests.post(
+                    JUDGMENT_DATASETS_PULL_API_URL,
+                    json=request_body
+                )
+                response.raise_for_status()
+            except requests.exceptions.RequestException as e:
+                error(f"Error pulling dataset: {str(e)}")
+                raise
+
+            info(f"Successfully pulled dataset with alias '{alias}'")
+            payload = response.json()
+            dataset.ground_truths = [GroundTruthExample(**g) for g in payload.get("ground_truths", [])]
+            dataset.examples = [Example(**e) for e in payload.get("examples", [])]
+            dataset._alias = payload.get("_alias")
+            dataset._id = payload.get("_id")
+            progress.update(
+                task_id,
+                description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
+            )
+
+        return dataset
+
+    def pull_all_user_dataset_stats(self) -> dict:
+        debug(f"Pulling user datasets stats for user_id: {self.judgment_api_key}'")
+        """
+        Pulls the user datasets stats from Judgment platform
+
+        Mock request:
+        {
+            "user_id": user_id
+        }
+        ==>
+        {
+            "test_dataset_1": {"examples_count": len(dataset1.examples), "ground_truths_count": len(dataset1.ground_truths)},
+            "test_dataset_2": {"examples_count": len(dataset2.examples), "ground_truths_count": len(dataset2.ground_truths)},
+            ...
+        }
+        """
+        # Make a POST request to the Judgment API to get the dataset
+
+        with Progress(
+            SpinnerColumn(style="rgb(106,0,255)"),
+            TextColumn("[progress.description]{task.description}"),
+            transient=False,
+        ) as progress:
+            task_id = progress.add_task(
+                f"Pulling [rgb(106,0,255)]' datasets'[/rgb(106,0,255)] from Judgment...",
+                total=100,
+            )
+            request_body = {
+                "judgment_api_key": self.judgment_api_key
+            }
+
+            try:
+                response = requests.post(
+                    JUDGMENT_DATASETS_PULL_ALL_API_URL,
+                    json=request_body
+                )
+                response.raise_for_status()
+            except requests.exceptions.RequestException as e:
+                error(f"Error pulling dataset: {str(e)}")
+                raise
+
+            info(f"Successfully pulled datasets for userid: {self.judgment_api_key}'")
+            payload = response.json()
+
+            progress.update(
+                task_id,
+                description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
+            )
+
+        return payload
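The push and pull logic removed from EvalDataset above now lives in this new EvalDatasetClient. A minimal usage sketch, assuming a valid API key (the key and alias values are placeholders):

    from judgeval.data.datasets import EvalDatasetClient

    client = EvalDatasetClient(judgment_api_key="<your-api-key>")
    dataset = client.create_dataset()             # empty EvalDataset bound to this key
    client.push(dataset, "my-dataset")            # returns True on success
    pulled = client.pull("my-dataset")            # returns a populated EvalDataset
    stats = client.pull_all_user_dataset_stats()  # e.g. {"my-dataset": {"examples_count": 3, "ground_truths_count": 1}}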
judgeval/data/result.py CHANGED
@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import List, Union, Optional
+from typing import List, Union, Optional, Dict, Any
 
 from judgeval.data import ScorerData, ProcessExample
 
@@ -18,6 +18,9 @@ class ScoringResult:
        expected_output (Optional[str]): The expected output of the example
        context (Optional[List[str]]): The context of the example
        retrieval_context (Optional[List[str]]): The retrieval context of the example
+        additional_metadata (Optional[Dict[str, Any]]): The additional metadata of the example
+        tools_called (Optional[List[str]]): The tools called by the example
+        expected_tools (Optional[List[str]]): The expected tools of the example
        trace_id (Optional[str]): The trace id of the example
 
    """
@@ -31,6 +34,9 @@ class ScoringResult:
    expected_output: Optional[str] = None
    context: Optional[List[str]] = None
    retrieval_context: Optional[List[str]] = None
+    additional_metadata: Optional[Dict[str, Any]] = None
+    tools_called: Optional[List[str]] = None
+    expected_tools: Optional[List[str]] = None
    trace_id: Optional[str] = None
 
    example_id: Optional[str] = None
@@ -46,6 +52,9 @@ class ScoringResult:
            "expected_output": self.expected_output,
            "context": self.context,
            "retrieval_context": self.retrieval_context,
+            "additional_metadata": self.additional_metadata,
+            "tools_called": self.tools_called,
+            "expected_tools": self.expected_tools,
            "trace_id": self.trace_id,
            "example_id": self.example_id
        }
@@ -59,6 +68,9 @@ class ScoringResult:
            expected_output={self.expected_output}, \
            context={self.context}, \
            retrieval_context={self.retrieval_context}, \
+            additional_metadata={self.additional_metadata}, \
+            tools_called={self.tools_called}, \
+            expected_tools={self.expected_tools}, \
            trace_id={self.trace_id})"
 
 
@@ -79,5 +91,8 @@ def generate_scoring_result(
        expected_output=process_example.expected_output,
        context=process_example.context,
        retrieval_context=process_example.retrieval_context,
+        additional_metadata=process_example.additional_metadata,
+        tools_called=process_example.tools_called,
+        expected_tools=process_example.expected_tools,
        trace_id=process_example.trace_id
    )
@@ -6,7 +6,7 @@ from typing import Optional, List, Dict, Any, Union
 import requests
 
 from judgeval.constants import ROOT_API
-from judgeval.data.datasets import EvalDataset
+from judgeval.data.datasets import EvalDataset, EvalDatasetClient
 from judgeval.data import (
    ScoringResult,
    Example
@@ -36,6 +36,7 @@ class EvalRunRequestBody(BaseModel):
 class JudgmentClient:
    def __init__(self, judgment_api_key: str = os.getenv("JUDGMENT_API_KEY")):
        self.judgment_api_key = judgment_api_key
+        self.eval_dataset_client = EvalDatasetClient(judgment_api_key)
 
        # Verify API key is valid
        result, response = self._validate_api_key()
@@ -121,7 +122,7 @@ class JudgmentClient:
            raise ValueError(f"Please check your EvaluationRun object, one or more fields are invalid: \n{str(e)}")
 
    def create_dataset(self) -> EvalDataset:
-        return EvalDataset(judgment_api_key=self.judgment_api_key)
+        return self.eval_dataset_client.create_dataset()
 
    def push_dataset(self, alias: str, dataset: EvalDataset, overwrite: Optional[bool] = False) -> bool:
        """
@@ -137,7 +138,7 @@ class JudgmentClient:
        """
        # Set judgment_api_key just in case it was not set
        dataset.judgment_api_key = self.judgment_api_key
-        return dataset.push(alias, overwrite)
+        return self.eval_dataset_client.push(dataset, alias, overwrite)
 
    def pull_dataset(self, alias: str) -> EvalDataset:
        """
@@ -149,9 +150,20 @@ class JudgmentClient:
        Returns:
            EvalDataset: The retrieved dataset
        """
-        dataset = EvalDataset(judgment_api_key=self.judgment_api_key)
-        dataset.pull(alias)
-        return dataset
+        return self.eval_dataset_client.pull(alias)
+
+    def pull_all_user_dataset_stats(self) -> dict:
+        """
+        Retrieves all dataset stats from the Judgment platform for the user.
+
+        Args:
+            alias (str): The name of the dataset to retrieve
+
+        Returns:
+            EvalDataset: The retrieved dataset
+        """
+        return self.eval_dataset_client.pull_all_user_dataset_stats()
+
 
    # Maybe add option where you can pass in the EvaluationRun object and it will pull the eval results from the backend
    def pull_eval(self, project_name: str, eval_run_name: str) -> List[Dict[str, Union[str, List[ScoringResult]]]]:
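In practice these dataset operations are reached through JudgmentClient, which now delegates to an internal EvalDatasetClient. A sketch of the same flow, assuming JUDGMENT_API_KEY is set so the constructor default resolves:

    from judgeval.judgment_client import JudgmentClient

    client = JudgmentClient()
    dataset = client.create_dataset()
    client.push_dataset("my-dataset", dataset)
    same_dataset = client.pull_dataset("my-dataset")
    all_stats = client.pull_all_user_dataset_stats()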
@@ -97,6 +97,13 @@ def merge_results(api_results: List[ScoringResult], local_results: List[ScoringR
            raise ValueError("The API and local results are not aligned.")
        if api_result.retrieval_context != local_result.retrieval_context:
            raise ValueError("The API and local results are not aligned.")
+        if api_result.additional_metadata != local_result.additional_metadata:
+            raise ValueError("The API and local results are not aligned.")
+        if api_result.tools_called != local_result.tools_called:
+            raise ValueError("The API and local results are not aligned.")
+        if api_result.expected_tools != local_result.expected_tools:
+            raise ValueError("The API and local results are not aligned.")
+
 
        # Merge ScorerData from the API and local scorers together
        api_scorer_data = api_result.scorers_data
@@ -254,6 +261,12 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
            debug(f"Context: {example.context}")
        if example.retrieval_context:
            debug(f"Retrieval context: {example.retrieval_context}")
+        if example.additional_metadata:
+            debug(f"Additional metadata: {example.additional_metadata}")
+        if example.tools_called:
+            debug(f"Tools called: {example.tools_called}")
+        if example.expected_tools:
+            debug(f"Expected tools: {example.expected_tools}")
 
    debug(f"Starting evaluation run with {len(evaluation_run.examples)} examples")
 
@@ -379,6 +392,9 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:
                'expected_output': result.expected_output,
                'context': result.context,
                'retrieval_context': result.retrieval_context,
+                'additional_metadata': result.additional_metadata,
+                'tools_called': result.tools_called,
+                'expected_tools': result.expected_tools,
                'eval_run_name': result.eval_run_name,
                'failed_scorers': []
            }
@@ -397,6 +413,9 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:
            error_msg += f"Expected Output: {fail_case['expected_output']}\n"
            error_msg += f"Context: {fail_case['context']}\n"
            error_msg += f"Retrieval Context: {fail_case['retrieval_context']}\n"
+            error_msg += f"Additional Metadata: {fail_case['additional_metadata']}\n"
+            error_msg += f"Tools Called: {fail_case['tools_called']}\n"
+            error_msg += f"Expected Tools: {fail_case['expected_tools']}\n"
            error_msg += f"Eval Run Name: {fail_case['eval_run_name']}\n"
 
            for fail_scorer in fail_case['failed_scorers']:
@@ -13,6 +13,7 @@ from judgeval.scorers.judgeval_scorers import (
    AnswerRelevancyScorer,
    ScorerWrapper,
    AnswerCorrectnessScorer,
+    Text2SQLScorer,
 )
 
 __all__ = [
@@ -31,4 +32,5 @@ __all__ = [
    "AnswerRelevancyScorer",
    "ScorerWrapper",
    "AnswerCorrectnessScorer",
+    "Text2SQLScorer",
 ]
@@ -28,6 +28,9 @@ from judgeval.scorers.judgeval_scorers.local_implementations import (
    AnswerCorrectnessScorer as LocalAnswerCorrectnessScorer
 )
 
+from judgeval.scorers.judgeval_scorers.classifiers import Text2SQLScorer
+
+
 class ScorerWrapper:
    """
    Wrapper class that can dynamically load either API or local implementation of a scorer.
@@ -141,4 +144,5 @@ __all__ = [
    "ContextualPrecisionScorer",
    "ContextualRecallScorer",
    "AnswerRelevancyScorer",
+    "Text2SQLScorer",
 ]
@@ -0,0 +1,3 @@
+from .text2sql import Text2SQLScorer
+
+__all__ = ["Text2SQLScorer"]
@@ -0,0 +1,3 @@
+from .text2sql_scorer import Text2SQLScorer
+
+__all__ = ["Text2SQLScorer"]
@@ -0,0 +1,54 @@
+"""
+ClassifierScorer implementation for basic Text-to-SQL evaluation.
+
+Takes a natural language query, a corresponding LLM-generated SQL query, and a table schema + (optional) metadata.
+Determines if the LLM-generated SQL query is valid and works for the natural language query.
+"""
+from judgeval.scorers import ClassifierScorer
+
+Text2SQLScorer = ClassifierScorer(
+    "Text to SQL",
+    slug="text2sql-1010101010",
+    threshold=1.0,
+    conversation=[{
+        "role": "system",
+        "content": """You will be given a natural language query, a corresponding LLM-generated SQL query, and a table schema + (optional) metadata.
+
+** TASK INSTRUCTIONS **
+Your task is to decide whether the LLM generated SQL query properly filters for what the natural language query is asking, based on the table schema + (optional) metadata.
+Additionally, you should check if the SQL query is valid based on the table schema (checking for syntax errors, false column names, etc.)
+
+** TIPS **
+- Look for correct references to the table schema for column names, table names, etc.
+- Check that the SQL query can be executed; make sure JOINs, GROUP BYs, ORDER BYs, etc. are valid with respect to the table schema.
+- Check that aggregation functions (COUNT, SUM, AVG, etc.) are used appropriately with GROUP BY clauses
+- Verify that WHERE conditions use the correct operators and data types for comparisons
+- Ensure LIMIT and OFFSET clauses make sense for the query's purpose
+- Check that JOINs use the correct keys and maintain referential integrity
+- Verify that ORDER BY clauses use valid column names and sort directions
+- Check for proper handling of NULL values where relevant
+- Ensure subqueries are properly constructed and correlated when needed
+- EVEN IF THE QUERY IS VALID, IF IT DOESN'T WORK FOR THE NATURAL LANGUAGE QUERY, YOU SHOULD CHOOSE "N" AS THE ANSWER.
+
+** FORMATTING YOUR ANSWER **
+If the SQL query is valid and works for the natural language query, choose option "Y" and otherwise "N". Provide a justification for your decision; if you choose "N", explain what about the LLM-generated SQL query is incorrect, or explain why it doesn't address the natural language query.
+IF YOUR JUSTIFICATION SHOWS THAT THE SQL QUERY IS VALID AND WORKS FOR THE NATURAL LANGUAGE QUERY, YOU SHOULD CHOOSE "Y" AS THE ANSWER.
+IF THE SQL QUERY IS INVALID, YOU SHOULD CHOOSE "N" AS THE ANSWER.
+
+** YOUR TURN **
+Natural language query:
+{{input}}
+
+LLM generated SQL query:
+{{actual_output}}
+
+Table schema:
+{{context}}
+"""
+    }],
+    options={
+        "Y": 1.0,
+        "N": 0.0
+    }
+)
+
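Text2SQLScorer is a ClassifierScorer instance with threshold 1.0, so only a "Y" verdict passes; its prompt fills {{input}} with the natural language question, {{actual_output}} with the generated SQL, and {{context}} with the schema. A hypothetical sketch of the data it expects (it assumes Example exposes input, actual_output, and context fields matching those placeholders; this diff does not show Example's fields):

    from judgeval.data import Example
    from judgeval.scorers import Text2SQLScorer

    example = Example(
        input="How many users signed up in the last 7 days?",
        actual_output="SELECT COUNT(*) FROM users WHERE signup_date >= DATE('now', '-7 day');",
        context=["CREATE TABLE users (id INTEGER PRIMARY KEY, signup_date TEXT);"],
    )
    # run this example through an evaluation with Text2SQLScorer as the scorer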
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.0.8
+Version: 0.0.10
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -1,21 +1,22 @@
 judgeval/__init__.py,sha256=xiiG4CkeaOtey4fusCd9CBz0BVqzTIbV-K2EFIU0rUM,283
 judgeval/clients.py,sha256=Ns5ljrgPPXUMo7fSPJxO12H64lcPyKeQPIVG_RMi2cM,1162
-judgeval/constants.py,sha256=5O1jWvxMCRyMSWhmkrvPqfBctx42c7kMtgTS7ORVcFw,1965
+judgeval/constants.py,sha256=qwWc3EOpXSn9SHq5rylkHhnzH5WldedqSMCToa7vgZk,2040
 judgeval/evaluation_run.py,sha256=KcIS7mDR_9XEdqYrJXFcrLz5IDMof34HcD5VtjZgV8w,5884
-judgeval/judgment_client.py,sha256=lVVVDxRQ750nd0wT827dca94YzThNjuFWWJ-BTFW7lg,11367
-judgeval/run_evaluation.py,sha256=A9jjtWPH2_5W43a1f98R8u-8PuVczoJZNCZIyCoRqi8,18918
+judgeval/judgment_client.py,sha256=jMeayUI-Z-GX4mVMVC9t5f7ENKLQ8dOepScYu5Yytf0,11777
+judgeval/run_evaluation.py,sha256=YOQ6s9RuUrXPTgoYexf7r6Hl1QKIMSTdvHl9kw-ZMzw,20103
 judgeval/common/__init__.py,sha256=7d24BRxtncpMj3AAJCj8RS7TqgjXmW777HVZH6-3sBs,289
 judgeval/common/exceptions.py,sha256=U-TxHLn7oVMezsMuoYouNDb2XuS8RCggfntYf5_6u4E,565
 judgeval/common/logger.py,sha256=QXN3UMymmKu2iMEMEgATLBnMDjGr_pE2iOSEFoICgg8,6092
-judgeval/common/tracer.py,sha256=JWUmsjxs2N6Cu5nol7vRbwWKFRLHJlwCnHWgg3W17GM,23812
+judgeval/common/tracer.py,sha256=k5g9ZLeM-fLdV_q9NpodN8gW4nLTIXsbxeTaXVjm9jk,25658
 judgeval/common/utils.py,sha256=3WRyyX0tvnnj_VAVlEdtZrfzyWj6zfX04xdpCtE1m5Y,33736
 judgeval/data/__init__.py,sha256=YferxwmUqoBi18hrdgro0BD0h4pt20LAqISeUzGMcVU,474
 judgeval/data/api_example.py,sha256=vwWFbI6eJr5VgURCRbuSiMtEXLUbTCih_BcaqEBy-pg,4108
 judgeval/data/example.py,sha256=lymGZ3jG818-r2vyFunt6OLFrhESOyJnbhao_ljTjlA,2471
-judgeval/data/result.py,sha256=CVp_mZrBbKjIH9rPB6rg7T2jY1jUy7JVyI7_kUbRC7w,3490
+judgeval/data/result.py,sha256=8FIO-bFKPegZuByKRjA2_sumjb8oGWQ5ZeQ1RVz5z2w,4393
 judgeval/data/scorer_data.py,sha256=pYljblCPZrlMIv5Eg7R-clnmsqzUBAwokKjZpwa0DXE,3280
-judgeval/data/datasets/__init__.py,sha256=Xh6TSsCcEsJeYjjubfeGa3WU8YQfuwKXH3jR9EeDFgg,171
-judgeval/data/datasets/dataset.py,sha256=9GGspdKDhMw2dJAS7ZvOZHSoNGwMzCtgnFYDe6y4yog,16484
+judgeval/data/datasets/__init__.py,sha256=eO6ayeM_bTGwIt0eDSlTBIIBvXvIWRWWSfYZrZROPiQ,265
+judgeval/data/datasets/dataset.py,sha256=AGdU21vZ4iVjqbjQ7JY-u29FzJrdDFTgdvhzvYVJNyo,11833
+judgeval/data/datasets/eval_dataset_client.py,sha256=TaCDzymGFNFjGRrieEdQB8dT8xqNPpsEi2XLGFyrJno,7113
 judgeval/data/datasets/ground_truth.py,sha256=OTBs3VZe-Wp0vEXEsq14GPZHYtpWT16bhGQTycIvkKc,2057
 judgeval/data/datasets/utils.py,sha256=lQxyl7mevct7JcDSyIrU_8QOzT-EYPWEvoUiAeOdeek,2502
 judgeval/judges/__init__.py,sha256=tyQ5KY88Kp1Ctfw2IJxnVEpy8DnFCtmy04JdPOpp-As,339
@@ -24,7 +25,7 @@ judgeval/judges/litellm_judge.py,sha256=EIL58Teptv8DzZUO3yP2RDQCDq-aoBB6HPZzPdK6
 judgeval/judges/mixture_of_judges.py,sha256=OuGWCuXyqe7s_Y74ij90TJFRfHU-VAFyJVVrwBM0RO0,15532
 judgeval/judges/together_judge.py,sha256=x3jf-tq77QPXHeeoF739f69hE_0VceXD9FHLrVFdGVA,2275
 judgeval/judges/utils.py,sha256=YUvivcGV1OKLPMJ9N6aTvhA0r_zzJ2NXriPguiiaVaY,2110
-judgeval/scorers/__init__.py,sha256=3rq2VtszrJk9gZ3oAMVd7EGlSugr8aRlHWprMDgQPaQ,956
+judgeval/scorers/__init__.py,sha256=XcDdLn_s16rSQob0896oj4JXTA8-Xfl271TUEBj6Oew,998
 judgeval/scorers/api_scorer.py,sha256=88kCWr6IetLFn3ziTPG-lwDWvMhFUC6xfINU1MJBoho,2125
 judgeval/scorers/base_scorer.py,sha256=mbOReG88fWaqCnC8F0u5QepRlzgVkuOz89KEKYxrmMc,1794
 judgeval/scorers/exceptions.py,sha256=eGW5CuJgZ5YJBFrE4FHDSF651PO1dKAZ379mJ8gOsfo,178
@@ -32,7 +33,7 @@ judgeval/scorers/judgeval_scorer.py,sha256=14SZ3sBZtGNM3BCegKebkNad9LTs5Tyhs0kD6
 judgeval/scorers/prompt_scorer.py,sha256=bUv8eZNy1XGVM1gNMt33dgIVX6zj63bGAV6O0o0c7yg,17821
 judgeval/scorers/score.py,sha256=zJKG21h9Njyj2vS36CAFK2wlbOcHSKgrLgHV5_25KKw,18630
 judgeval/scorers/utils.py,sha256=dtueaJm8e3Ph3wj1vC-srzadgK_CoIlOefdvMQ-cwK8,6826
-judgeval/scorers/judgeval_scorers/__init__.py,sha256=077QnuBfw9Sy9RP2TF2oKCtt5PbaqBZLyiP-gczKShk,5092
+judgeval/scorers/judgeval_scorers/__init__.py,sha256=D12jJAKTcfmz8fDBkYeOmdzZMZsURuODIJ5p7Nk1lWE,5189
 judgeval/scorers/judgeval_scorers/api_scorers/__init__.py,sha256=zFwH2TC5AFlpDRfVKc6GN4YTtnmeyALl-JRLoZD_Jco,1284
 judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py,sha256=690G5askjE8dcbKPGvCF6JxAEM9QJUqb-3K-D6lI6oM,463
 judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py,sha256=CqvvjV7AZqPlXh-PZaPKYPILHr15u4bIYiKBFjlk5i0,457
@@ -44,6 +45,9 @@ judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py,sha256=ffYwH3CexP
 judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py,sha256=CAZBQKwNSqpqAoOgStYfr-yP1Brug_6VRimRIQY-zdg,894
 judgeval/scorers/judgeval_scorers/api_scorers/summarization.py,sha256=-E3oxYbI0D_0q-_fGWh2jQHW9O4Pu7I7xvLWsHU6cn8,450
 judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py,sha256=17ppPXm962ew67GU5m0npzbPu3CuhgdKY_KmfPvKfu4,457
+judgeval/scorers/judgeval_scorers/classifiers/__init__.py,sha256=Qt81W5ZCwMvBAne0LfQDb8xvg5iOG1vEYP7WizgwAZo,67
+judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py,sha256=8iTzMvou1Dr8pybul6lZHKjc9Ye2-0_racRGYkhEdTY,74
+judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py,sha256=ly72Z7s_c8NID6-nQnuW8qEGEW2MqdvpJ-5WfXzbAQg,2579
 judgeval/scorers/judgeval_scorers/local_implementations/__init__.py,sha256=ZDbmYHwIbPD75Gj9JKtEWnpBdSVGGRmbn1_IOR6GR-c,1627
 judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py,sha256=cxxUEspgoIdSzJbwIIioamC0-xDqhYVfYAWxaYF-D_Y,177
 judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py,sha256=PDThn6SzqxgMXT7BpQs2TEBOsgfD5fi6fnKk31qaCTo,10227
@@ -73,7 +77,7 @@ judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py
 judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py,sha256=CBuE6oCxMzTdJoXFt_YPWBte88kedEQ9t3g52ZRztGY,21086
 judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py,sha256=JUB3TMqS1OHr6PqpIGqkyiBNbyfUaw7lZuUATjU3_ek,168
 judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py,sha256=CYGRJY5EuyICYzHrmFdLykwXakX8AC7G3Bhj7p6szfY,5493
-judgeval-0.0.8.dist-info/METADATA,sha256=91SMIPO60Q_Ab7yTjL2sKmPgmfl6Bji6_QAzkjaOHlk,1204
-judgeval-0.0.8.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-judgeval-0.0.8.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
-judgeval-0.0.8.dist-info/RECORD,,
+judgeval-0.0.10.dist-info/METADATA,sha256=i9jeAPs3jY5hAHAdE_rlen4qJdEk0eAqQ0BOzMie97I,1205
+judgeval-0.0.10.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+judgeval-0.0.10.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
+judgeval-0.0.10.dist-info/RECORD,,