judgeval 0.5.0__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff compares publicly available package versions released to a supported registry; it is provided for informational purposes only and reflects the packages as they appear in their public registry. Between 0.5.0 and 0.7.0, the JudgmentClient drops trace-based evaluation (run_trace_evaluation, assert_trace_test) along with pull_eval and the override/append flags, moves EvaluationRun to judgeval.data.evaluation_run, adds a show_url flag to run_evaluation, introduces upload_custom_scorer with scorer-name auto-detection, and switches the LocalEvaluationQueue to EvaluationRun.custom_scorers.
@@ -4,35 +4,31 @@ Implements the JudgmentClient to interact with the Judgment API.
 
 from __future__ import annotations
 import os
+import importlib.util
+from pathlib import Path
 from uuid import uuid4
-from typing import Optional, List, Dict, Any, Union, Callable, TYPE_CHECKING
+from typing import Optional, List, Dict, Union
 
 from judgeval.data import (
     ScoringResult,
     Example,
-    Trace,
 )
 from judgeval.scorers import (
     APIScorerConfig,
     BaseScorer,
 )
-from judgeval.evaluation_run import EvaluationRun
+from judgeval.data.evaluation_run import EvaluationRun
 from judgeval.run_evaluation import (
     run_eval,
     assert_test,
-    run_trace_eval,
 )
-from judgeval.data.trace_run import TraceRun
 from judgeval.common.api import JudgmentApiClient
 from judgeval.common.exceptions import JudgmentAPIError
-from judgeval.common.tracer import Tracer
 from judgeval.common.utils import validate_api_key
 from pydantic import BaseModel
 from judgeval.common.logger import judgeval_logger
 
 
-if TYPE_CHECKING:
-    from judgeval.integrations.langgraph import JudgevalCallbackHandler
 from judgeval.constants import DEFAULT_GPT_MODEL
 
 
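
For downstream code that imported EvaluationRun directly, the hunk above implies a one-line migration. A minimal sketch (the 0.5.0 path is shown only for comparison):

    # 0.5.0 import path (removed in this release):
    # from judgeval.evaluation_run import EvaluationRun
    # 0.7.0 import path, as added in the hunk above:
    from judgeval.data.evaluation_run import EvaluationRun
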
@@ -84,50 +80,6 @@ class JudgmentClient(metaclass=SingletonMeta):
         else:
             judgeval_logger.info("Successfully initialized JudgmentClient!")
 
-    def run_trace_evaluation(
-        self,
-        scorers: List[Union[APIScorerConfig, BaseScorer]],
-        examples: Optional[List[Example]] = None,
-        function: Optional[Callable] = None,
-        tracer: Optional[Union[Tracer, JudgevalCallbackHandler]] = None,
-        traces: Optional[List[Trace]] = None,
-        tools: Optional[List[Dict[str, Any]]] = None,
-        project_name: str = "default_project",
-        eval_run_name: str = "default_eval_trace",
-        model: Optional[str] = DEFAULT_GPT_MODEL,
-        append: bool = False,
-        override: bool = False,
-    ) -> List[ScoringResult]:
-        try:
-            if examples and not function:
-                raise ValueError("Cannot pass in examples without a function")
-
-            if traces and function:
-                raise ValueError("Cannot pass in traces and function")
-
-            if examples and traces:
-                raise ValueError("Cannot pass in both examples and traces")
-
-            trace_run = TraceRun(
-                project_name=project_name,
-                eval_name=eval_run_name,
-                traces=traces,
-                scorers=scorers,
-                model=model,
-                append=append,
-                organization_id=self.organization_id,
-                tools=tools,
-            )
-            return run_trace_eval(
-                trace_run, self.judgment_api_key, override, function, tracer, examples
-            )
-        except ValueError as e:
-            raise ValueError(
-                f"Please check your TraceRun object, one or more fields are invalid: \n{str(e)}"
-            )
-        except Exception as e:
-            raise Exception(f"An unexpected error occurred during evaluation: {str(e)}")
-
     def run_evaluation(
         self,
         examples: List[Example],
@@ -135,8 +87,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         model: Optional[str] = DEFAULT_GPT_MODEL,
         project_name: str = "default_project",
         eval_run_name: str = "default_eval_run",
-        override: bool = False,
-        append: bool = False,
+        show_url: bool = True,
     ) -> List[ScoringResult]:
         """
         Executes an evaluation of `Example`s using one or more `Scorer`s
@@ -147,21 +98,13 @@ class JudgmentClient(metaclass=SingletonMeta):
             model (str): The model used as a judge when using LLM as a Judge
             project_name (str): The name of the project the evaluation results belong to
             eval_run_name (str): A name for this evaluation run
-            override (bool): Whether to override an existing evaluation run with the same name
-            append (bool): Whether to append to an existing evaluation run with the same name
 
         Returns:
             List[ScoringResult]: The results of the evaluation
         """
-        if override and append:
-            raise ValueError(
-                "Cannot set both override and append to True. Please choose one."
-            )
 
         try:
             eval = EvaluationRun(
-                append=append,
-                override=override,
                 project_name=project_name,
                 eval_name=eval_run_name,
                 examples=examples,
@@ -172,7 +115,7 @@ class JudgmentClient(metaclass=SingletonMeta):
             return run_eval(
                 eval,
                 self.judgment_api_key,
-                override,
+                show_url=show_url,
             )
         except ValueError as e:
             raise ValueError(
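
The three hunks above change run_evaluation's public surface: the override and append flags are gone, and a show_url flag (default True) is forwarded to run_eval. A minimal usage sketch under the new signature; the scorers parameter name, the FaithfulnessScorer class with its threshold argument, and the Example and ScoringResult field names are assumptions, while examples, project_name, eval_run_name, and show_url appear in the hunks above:

    from judgeval import JudgmentClient
    from judgeval.data import Example
    from judgeval.scorers import FaithfulnessScorer  # assumed built-in scorer

    client = JudgmentClient()
    results = client.run_evaluation(
        examples=[Example(input="What is 2 + 2?", actual_output="4")],  # field names assumed
        scorers=[FaithfulnessScorer(threshold=0.9)],  # parameter name assumed
        project_name="default_project",
        eval_run_name="signature_check",
        show_url=False,  # new in 0.7.0; replaces the removed override/append flags
    )
    print([r.success for r in results])  # ScoringResult.success assumed
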
@@ -181,22 +124,6 @@ class JudgmentClient(metaclass=SingletonMeta):
         except Exception as e:
             raise Exception(f"An unexpected error occurred during evaluation: {str(e)}")
 
-    def pull_eval(
-        self, project_name: str, eval_run_name: str
-    ) -> List[Dict[str, Union[str, List[ScoringResult]]]]:
-        """Pull evaluation results from the server.
-
-        Args:
-            project_name (str): Name of the project
-            eval_run_name (str): Name of the evaluation run
-
-        Returns:
-            Dict[str, Union[str, List[ScoringResult]]]: Dictionary containing:
-                - id (str): The evaluation run ID
-                - results (List[ScoringResult]): List of scoring results
-        """
-        return self.api_client.fetch_evaluation_results(project_name, eval_run_name)
-
     def create_project(self, project_name: str) -> bool:
         """
         Creates a project on the server.
@@ -222,8 +149,6 @@ class JudgmentClient(metaclass=SingletonMeta):
         model: Optional[str] = DEFAULT_GPT_MODEL,
         project_name: str = "default_test",
        eval_run_name: str = str(uuid4()),
-        override: bool = False,
-        append: bool = False,
     ) -> None:
         """
         Asserts a test by running the evaluation and checking the results for success
@@ -234,9 +159,6 @@ class JudgmentClient(metaclass=SingletonMeta):
             model (str): The model used as a judge when using LLM as a Judge
             project_name (str): The name of the project the evaluation results belong to
             eval_run_name (str): A name for this evaluation run
-            override (bool): Whether to override an existing evaluation run with the same name
-            append (bool): Whether to append to an existing evaluation run with the same name
-            async_execution (bool): Whether to run the evaluation asynchronously
         """
 
         results: List[ScoringResult]
@@ -247,66 +169,99 @@ class JudgmentClient(metaclass=SingletonMeta):
             model=model,
             project_name=project_name,
             eval_run_name=eval_run_name,
-            override=override,
-            append=append,
         )
         assert_test(results)
 
-    def assert_trace_test(
+    def _extract_scorer_name(self, scorer_file_path: str) -> str:
+        """Extract scorer name from the scorer file by importing it."""
+        try:
+            spec = importlib.util.spec_from_file_location(
+                "scorer_module", scorer_file_path
+            )
+            if spec is None or spec.loader is None:
+                raise ImportError(f"Could not load spec from {scorer_file_path}")
+
+            module = importlib.util.module_from_spec(spec)
+            spec.loader.exec_module(module)
+
+            for attr_name in dir(module):
+                attr = getattr(module, attr_name)
+                if (
+                    isinstance(attr, type)
+                    and any("Scorer" in str(base) for base in attr.__mro__)
+                    and attr.__module__ == "scorer_module"
+                ):
+                    try:
+                        # Instantiate the scorer and get its name
+                        scorer_instance = attr()
+                        if hasattr(scorer_instance, "name"):
+                            return scorer_instance.name
+                    except Exception:
+                        # Skip if instantiation fails
+                        continue
+
+            raise AttributeError("No scorer class found or could be instantiated")
+        except Exception as e:
+            judgeval_logger.warning(f"Could not extract scorer name: {e}")
+            return Path(scorer_file_path).stem
+
+    def upload_custom_scorer(
         self,
-        scorers: List[Union[APIScorerConfig, BaseScorer]],
-        examples: Optional[List[Example]] = None,
-        function: Optional[Callable] = None,
-        tracer: Optional[Union[Tracer, JudgevalCallbackHandler]] = None,
-        traces: Optional[List[Trace]] = None,
-        tools: Optional[List[Dict[str, Any]]] = None,
-        model: Optional[str] = DEFAULT_GPT_MODEL,
-        project_name: str = "default_test",
-        eval_run_name: str = str(uuid4()),
-        override: bool = False,
-        append: bool = False,
-        async_execution: bool = False,
-    ) -> None:
+        scorer_file_path: str,
+        requirements_file_path: Optional[str] = None,
+        unique_name: Optional[str] = None,
+    ) -> bool:
         """
-        Asserts a test by running the evaluation and checking the results for success
+        Upload custom ExampleScorer from files to backend.
 
         Args:
-            examples (List[Example]): The examples to evaluate.
-            scorers (List[Union[APIScorerConfig, BaseScorer]]): A list of scorers to use for evaluation
-            model (str): The model used as a judge when using LLM as a Judge
-            project_name (str): The name of the project the evaluation results belong to
-            eval_run_name (str): A name for this evaluation run
-            override (bool): Whether to override an existing evaluation run with the same name
-            append (bool): Whether to append to an existing evaluation run with the same name
-            function (Optional[Callable]): A function to use for evaluation
-            tracer (Optional[Union[Tracer, BaseCallbackHandler]]): A tracer to use for evaluation
-            tools (Optional[List[Dict[str, Any]]]): A list of tools to use for evaluation
-            async_execution (bool): Whether to run the evaluation asynchronously
+            scorer_file_path: Path to Python file containing CustomScorer class
+            requirements_file_path: Optional path to requirements.txt
+            unique_name: Optional unique identifier (auto-detected from scorer.name if not provided)
+
+        Returns:
+            bool: True if upload successful
+
+        Raises:
+            ValueError: If scorer file is invalid
+            FileNotFoundError: If scorer file doesn't exist
         """
+        import os
 
-        # Check for enable_param_checking and tools
-        for scorer in scorers:
-            if hasattr(scorer, "kwargs") and scorer.kwargs is not None:
-                if scorer.kwargs.get("enable_param_checking") is True:
-                    if not tools:
-                        raise ValueError(
-                            f"You must provide the 'tools' argument to assert_test when using a scorer with enable_param_checking=True. If you do not want to do param checking, explicitly set enable_param_checking=False for the {scorer.__name__} scorer."
-                        )
+        if not os.path.exists(scorer_file_path):
+            raise FileNotFoundError(f"Scorer file not found: {scorer_file_path}")
 
-        results: List[ScoringResult]
+        # Auto-detect scorer name if not provided
+        if unique_name is None:
+            unique_name = self._extract_scorer_name(scorer_file_path)
+            judgeval_logger.info(f"Auto-detected scorer name: '{unique_name}'")
 
-        results = self.run_trace_evaluation(
-            examples=examples,
-            traces=traces,
-            scorers=scorers,
-            model=model,
-            project_name=project_name,
-            eval_run_name=eval_run_name,
-            override=override,
-            append=append,
-            function=function,
-            tracer=tracer,
-            tools=tools,
-        )
+        # Read scorer code
+        with open(scorer_file_path, "r") as f:
+            scorer_code = f.read()
 
-        assert_test(results)
+        # Read requirements (optional)
+        requirements_text = ""
+        if requirements_file_path and os.path.exists(requirements_file_path):
+            with open(requirements_file_path, "r") as f:
+                requirements_text = f.read()
+
+        try:
+            response = self.api_client.upload_custom_scorer(
+                scorer_name=unique_name,
+                scorer_code=scorer_code,
+                requirements_text=requirements_text,
+            )
+
+            if response.get("status") == "success":
+                judgeval_logger.info(
+                    f"Successfully uploaded custom scorer: {unique_name}"
+                )
+                return True
+            else:
+                judgeval_logger.error(f"Failed to upload custom scorer: {unique_name}")
+                return False
+
+        except Exception as e:
+            judgeval_logger.error(f"Error uploading custom scorer: {e}")
+            raise
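
The hunk above replaces assert_trace_test with two additions: _extract_scorer_name, which imports the scorer file and returns the name of the first *Scorer class it can instantiate (falling back to the file stem), and the public upload_custom_scorer method. A minimal calling sketch; the file paths and the contents of my_scorer.py are illustrative assumptions, while the parameter names and the bool return value come from the new method:

    from judgeval import JudgmentClient

    client = JudgmentClient()
    ok = client.upload_custom_scorer(
        scorer_file_path="my_scorer.py",            # must define a Scorer subclass exposing .name
        requirements_file_path="requirements.txt",  # optional; skipped if the file does not exist
        # unique_name omitted: _extract_scorer_name() auto-detects it from scorer.name
    )
    print("uploaded" if ok else "upload failed")
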
@@ -13,9 +13,8 @@ import time
 from judgeval.common.logger import judgeval_logger
 from judgeval.constants import MAX_CONCURRENT_EVALUATIONS
 from judgeval.data import ScoringResult
-from judgeval.evaluation_run import EvaluationRun
+from judgeval.data.evaluation_run import EvaluationRun
 from judgeval.utils.async_utils import safe_run_async
-from judgeval.scorers import BaseScorer
 from judgeval.scorers.score import a_execute_scoring
 
 
@@ -43,9 +42,8 @@ class LocalEvaluationQueue:
 
     def _process_run(self, evaluation_run: EvaluationRun) -> List[ScoringResult]:
        """Execute evaluation run locally and return results."""
-        local_scorers = [s for s in evaluation_run.scorers if isinstance(s, BaseScorer)]
 
-        if not local_scorers:
+        if not evaluation_run.custom_scorers:
             raise ValueError(
                 "LocalEvaluationQueue only supports runs with local scorers (BaseScorer). "
                 "Found only APIScorerConfig instances."
@@ -54,7 +52,7 @@ class LocalEvaluationQueue:
         return safe_run_async(
             a_execute_scoring(
                 evaluation_run.examples,
-                local_scorers,
+                evaluation_run.custom_scorers,
                 model=evaluation_run.model,
                 throttle_value=0,
                 max_concurrent=self._max_concurrent // self._num_workers,
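
The last two hunks move scorer filtering out of the queue: in 0.5.0 _process_run separated BaseScorer instances from a mixed scorer list itself, while in 0.7.0 it relies on the run's custom_scorers attribute to provide that split. A small sketch of the filtering the queue no longer performs, using only the types shown in this diff; treating custom_scorers as the BaseScorer subset of the run's scorers is an assumption based on the replaced line:

    from typing import List, Union
    from judgeval.scorers import APIScorerConfig, BaseScorer

    def split_local_scorers(
        scorers: List[Union[APIScorerConfig, BaseScorer]],
    ) -> List[BaseScorer]:
        # Equivalent of the list comprehension removed from _process_run in 0.5.0:
        # keep only locally executable scorers, dropping API-backed configs.
        return [s for s in scorers if isinstance(s, BaseScorer)]
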