judgeval 0.4.0__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
judgeval/judges/utils.py CHANGED
@@ -7,6 +7,7 @@ from typing import Optional, Union, Tuple, List
 
 from judgeval.common.exceptions import InvalidJudgeModelError
 from judgeval.judges import JudgevalJudge, LiteLLMJudge, TogetherJudge, MixtureOfJudges
+from judgeval.constants import DEFAULT_GPT_MODEL
 from judgeval.constants import (
     TOGETHER_SUPPORTED_MODELS,
     JUDGMENT_SUPPORTED_MODELS,
@@ -30,7 +31,7 @@ def create_judge(
     If no model is provided, uses GPT4o as the default judge.
     """
     if model is None:  # default option
-        return LiteLLMJudge(model="gpt-4.1"), True
+        return LiteLLMJudge(model=DEFAULT_GPT_MODEL), True
     if not isinstance(model, (str, list, JudgevalJudge)):
        raise InvalidJudgeModelError(
            f"Model must be a string, list of strings, or a judgeval judge object. Got: {type(model)} instead."
judgeval/judgment_client.py CHANGED
@@ -2,9 +2,12 @@
 Implements the JudgmentClient to interact with the Judgment API.
 """
 
+from __future__ import annotations
 import os
+import importlib.util
+from pathlib import Path
 from uuid import uuid4
-from typing import Optional, List, Dict, Any, Union, Callable
+from typing import Optional, List, Dict, Any, Union, Callable, TYPE_CHECKING
 
 from judgeval.data import (
     ScoringResult,
@@ -15,7 +18,7 @@ from judgeval.scorers import (
     APIScorerConfig,
     BaseScorer,
 )
-from judgeval.evaluation_run import EvaluationRun
+from judgeval.data.evaluation_run import EvaluationRun
 from judgeval.run_evaluation import (
     run_eval,
     assert_test,
@@ -28,7 +31,11 @@ from judgeval.common.tracer import Tracer
 from judgeval.common.utils import validate_api_key
 from pydantic import BaseModel
 from judgeval.common.logger import judgeval_logger
-from judgeval.integrations.langgraph import JudgevalCallbackHandler
+
+
+if TYPE_CHECKING:
+    from judgeval.integrations.langgraph import JudgevalCallbackHandler
+from judgeval.constants import DEFAULT_GPT_MODEL
 
 
 class EvalRunRequestBody(BaseModel):
@@ -89,9 +96,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         tools: Optional[List[Dict[str, Any]]] = None,
         project_name: str = "default_project",
         eval_run_name: str = "default_eval_trace",
-        model: Optional[str] = "gpt-4.1",
-        append: bool = False,
-        override: bool = False,
+        model: Optional[str] = DEFAULT_GPT_MODEL,
     ) -> List[ScoringResult]:
         try:
             if examples and not function:
@@ -109,12 +114,11 @@ class JudgmentClient(metaclass=SingletonMeta):
                 traces=traces,
                 scorers=scorers,
                 model=model,
-                append=append,
                 organization_id=self.organization_id,
                 tools=tools,
             )
             return run_trace_eval(
-                trace_run, self.judgment_api_key, override, function, tracer, examples
+                trace_run, self.judgment_api_key, function, tracer, examples
             )
         except ValueError as e:
             raise ValueError(
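As the hunk above shows, run_trace_evaluation() drops append/override and run_trace_eval() is now called without the override argument. A hedged sketch of a 0.6.0-style call; client, scorers, agent_fn, and tracer are assumed to be set up elsewhere and are not taken from this diff:

    # 0.4.0 accepted append=/override= here; passing them in 0.6.0 raises TypeError.
    results = client.run_trace_evaluation(
        scorers=scorers,          # APIScorerConfig / BaseScorer instances
        function=agent_fn,        # callable whose trace will be evaluated
        tracer=tracer,            # Tracer or JudgevalCallbackHandler
        project_name="default_project",
        eval_run_name="default_eval_trace",
    )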
@@ -127,11 +131,9 @@ class JudgmentClient(metaclass=SingletonMeta):
         self,
         examples: List[Example],
         scorers: List[Union[APIScorerConfig, BaseScorer]],
-        model: Optional[str] = "gpt-4.1",
+        model: Optional[str] = DEFAULT_GPT_MODEL,
         project_name: str = "default_project",
         eval_run_name: str = "default_eval_run",
-        override: bool = False,
-        append: bool = False,
     ) -> List[ScoringResult]:
         """
         Executes an evaluation of `Example`s using one or more `Scorer`s
@@ -142,21 +144,13 @@ class JudgmentClient(metaclass=SingletonMeta):
             model (str): The model used as a judge when using LLM as a Judge
             project_name (str): The name of the project the evaluation results belong to
             eval_run_name (str): A name for this evaluation run
-            override (bool): Whether to override an existing evaluation run with the same name
-            append (bool): Whether to append to an existing evaluation run with the same name
 
         Returns:
             List[ScoringResult]: The results of the evaluation
         """
-        if override and append:
-            raise ValueError(
-                "Cannot set both override and append to True. Please choose one."
-            )
 
         try:
             eval = EvaluationRun(
-                append=append,
-                override=override,
                 project_name=project_name,
                 eval_name=eval_run_name,
                 examples=examples,
@@ -167,7 +161,6 @@ class JudgmentClient(metaclass=SingletonMeta):
             return run_eval(
                 eval,
                 self.judgment_api_key,
-                override,
             )
         except ValueError as e:
             raise ValueError(
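With append/override (and the override argument to run_eval) removed, a 0.6.0 evaluation call reduces to examples, scorers, model, and the project/run names. A hedged sketch; the Example field names and FaithfulnessScorer are written from memory of judgeval's public API rather than taken from this diff:

    from judgeval import JudgmentClient           # assumed top-level export
    from judgeval.data import Example
    from judgeval.scorers import FaithfulnessScorer

    client = JudgmentClient()
    example = Example(input="What is the capital of France?", actual_output="Paris.")
    results = client.run_evaluation(
        examples=[example],
        scorers=[FaithfulnessScorer()],
        project_name="default_project",
        eval_run_name="quickstart_run",           # model omitted -> DEFAULT_GPT_MODEL
    )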
@@ -176,22 +169,6 @@ class JudgmentClient(metaclass=SingletonMeta):
         except Exception as e:
             raise Exception(f"An unexpected error occurred during evaluation: {str(e)}")
 
-    def pull_eval(
-        self, project_name: str, eval_run_name: str
-    ) -> List[Dict[str, Union[str, List[ScoringResult]]]]:
-        """Pull evaluation results from the server.
-
-        Args:
-            project_name (str): Name of the project
-            eval_run_name (str): Name of the evaluation run
-
-        Returns:
-            Dict[str, Union[str, List[ScoringResult]]]: Dictionary containing:
-                - id (str): The evaluation run ID
-                - results (List[ScoringResult]): List of scoring results
-        """
-        return self.api_client.fetch_evaluation_results(project_name, eval_run_name)
-
     def create_project(self, project_name: str) -> bool:
         """
         Creates a project on the server.
@@ -214,11 +191,9 @@ class JudgmentClient(metaclass=SingletonMeta):
         self,
         examples: List[Example],
         scorers: List[Union[APIScorerConfig, BaseScorer]],
-        model: Optional[str] = "gpt-4.1",
+        model: Optional[str] = DEFAULT_GPT_MODEL,
         project_name: str = "default_test",
         eval_run_name: str = str(uuid4()),
-        override: bool = False,
-        append: bool = False,
     ) -> None:
         """
         Asserts a test by running the evaluation and checking the results for success
@@ -229,9 +204,6 @@ class JudgmentClient(metaclass=SingletonMeta):
             model (str): The model used as a judge when using LLM as a Judge
             project_name (str): The name of the project the evaluation results belong to
             eval_run_name (str): A name for this evaluation run
-            override (bool): Whether to override an existing evaluation run with the same name
-            append (bool): Whether to append to an existing evaluation run with the same name
-            async_execution (bool): Whether to run the evaluation asynchronously
         """
 
         results: List[ScoringResult]
@@ -242,8 +214,6 @@ class JudgmentClient(metaclass=SingletonMeta):
             model=model,
             project_name=project_name,
             eval_run_name=eval_run_name,
-            override=override,
-            append=append,
         )
         assert_test(results)
 
@@ -255,12 +225,9 @@ class JudgmentClient(metaclass=SingletonMeta):
         tracer: Optional[Union[Tracer, JudgevalCallbackHandler]] = None,
         traces: Optional[List[Trace]] = None,
         tools: Optional[List[Dict[str, Any]]] = None,
-        model: Optional[str] = "gpt-4.1",
+        model: Optional[str] = DEFAULT_GPT_MODEL,
         project_name: str = "default_test",
         eval_run_name: str = str(uuid4()),
-        override: bool = False,
-        append: bool = False,
-        async_execution: bool = False,
     ) -> None:
         """
         Asserts a test by running the evaluation and checking the results for success
@@ -271,12 +238,9 @@ class JudgmentClient(metaclass=SingletonMeta):
             model (str): The model used as a judge when using LLM as a Judge
             project_name (str): The name of the project the evaluation results belong to
             eval_run_name (str): A name for this evaluation run
-            override (bool): Whether to override an existing evaluation run with the same name
-            append (bool): Whether to append to an existing evaluation run with the same name
             function (Optional[Callable]): A function to use for evaluation
             tracer (Optional[Union[Tracer, BaseCallbackHandler]]): A tracer to use for evaluation
             tools (Optional[List[Dict[str, Any]]]): A list of tools to use for evaluation
-            async_execution (bool): Whether to run the evaluation asynchronously
         """
 
         # Check for enable_param_checking and tools
@@ -297,11 +261,107 @@ class JudgmentClient(metaclass=SingletonMeta):
             model=model,
             project_name=project_name,
             eval_run_name=eval_run_name,
-            override=override,
-            append=append,
             function=function,
             tracer=tracer,
             tools=tools,
         )
 
         assert_test(results)
+
+    def _extract_scorer_name(self, scorer_file_path: str) -> str:
+        """Extract scorer name from the scorer file by importing it."""
+        try:
+            spec = importlib.util.spec_from_file_location(
+                "scorer_module", scorer_file_path
+            )
+            if spec is None or spec.loader is None:
+                raise ImportError(f"Could not load spec from {scorer_file_path}")
+
+            module = importlib.util.module_from_spec(spec)
+            spec.loader.exec_module(module)
+
+            for attr_name in dir(module):
+                attr = getattr(module, attr_name)
+                if (
+                    isinstance(attr, type)
+                    and any("Scorer" in str(base) for base in attr.__mro__)
+                    and attr.__module__ == "scorer_module"
+                ):
+                    try:
+                        # Instantiate the scorer and get its name
+                        scorer_instance = attr()
+                        if hasattr(scorer_instance, "name"):
+                            return scorer_instance.name
+                    except Exception:
+                        # Skip if instantiation fails
+                        continue
+
+            raise AttributeError("No scorer class found or could be instantiated")
+        except Exception as e:
+            judgeval_logger.warning(f"Could not extract scorer name: {e}")
+            return Path(scorer_file_path).stem
+
+    def save_custom_scorer(
+        self,
+        scorer_file_path: str,
+        requirements_file_path: Optional[str] = None,
+        unique_name: Optional[str] = None,
+    ) -> bool:
+        """
+        Upload custom ExampleScorer from files to backend.
+
+        Args:
+            scorer_file_path: Path to Python file containing CustomScorer class
+            requirements_file_path: Optional path to requirements.txt
+            unique_name: Optional unique identifier (auto-detected from scorer.name if not provided)
+
+        Returns:
+            bool: True if upload successful
+
+        Raises:
+            ValueError: If scorer file is invalid
+            FileNotFoundError: If scorer file doesn't exist
+        """
+        import os
+
+        if not os.path.exists(scorer_file_path):
+            raise FileNotFoundError(f"Scorer file not found: {scorer_file_path}")
+
+        # Auto-detect scorer name if not provided
+        if unique_name is None:
+            unique_name = self._extract_scorer_name(scorer_file_path)
+            judgeval_logger.info(f"Auto-detected scorer name: '{unique_name}'")
+
+        # Read scorer code
+        with open(scorer_file_path, "r") as f:
+            scorer_code = f.read()
+
+        # Read requirements (optional)
+        requirements_text = ""
+        if requirements_file_path and os.path.exists(requirements_file_path):
+            with open(requirements_file_path, "r") as f:
+                requirements_text = f.read()
+
+        # Upload to backend
+        judgeval_logger.info(
+            f"Uploading custom scorer: {unique_name}, this can take a couple of minutes..."
+        )
+        try:
+            response = self.api_client.upload_custom_scorer(
+                scorer_name=unique_name,
+                scorer_code=scorer_code,
+                requirements_text=requirements_text,
+            )
+
+            if response.get("status") == "success":
+                judgeval_logger.info(
+                    f"Successfully uploaded custom scorer: {unique_name}"
+                )
+                return True
+            else:
+                judgeval_logger.error(f"Failed to upload custom scorer: {unique_name}")
+                return False
+
+        except Exception as e:
+            judgeval_logger.error(f"Error uploading custom scorer: {e}")
+            raise
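The new save_custom_scorer() method uploads a scorer defined in a standalone Python file; when unique_name is omitted, _extract_scorer_name() imports the file and reads the scorer's name attribute. A hedged usage sketch; the file paths are illustrative and my_scorer.py is assumed to define an instantiable BaseScorer subclass:

    from judgeval import JudgmentClient               # assumed top-level export

    client = JudgmentClient()
    uploaded = client.save_custom_scorer(
        scorer_file_path="my_scorer.py",              # defines the custom scorer class
        requirements_file_path="requirements.txt",    # optional extra dependencies
        # unique_name omitted -> auto-detected from the scorer's .name
    )
    if not uploaded:
        raise RuntimeError("custom scorer upload was rejected by the backend")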
@@ -0,0 +1,190 @@
+"""Local evaluation queue for batching custom scorer evaluations.
+
+This module provides a simple in-memory queue for EvaluationRun objects that contain
+only local (BaseScorer) scorers. Useful for batching evaluations and processing them
+either synchronously or in a background thread.
+"""
+
+import queue
+import threading
+from typing import Callable, List, Optional
+import time
+
+from judgeval.common.logger import judgeval_logger
+from judgeval.constants import MAX_CONCURRENT_EVALUATIONS
+from judgeval.data import ScoringResult
+from judgeval.data.evaluation_run import EvaluationRun
+from judgeval.utils.async_utils import safe_run_async
+from judgeval.scorers.score import a_execute_scoring
+
+
+class LocalEvaluationQueue:
+    """Lightweight in-memory queue for local evaluation runs.
+
+    Only supports EvaluationRuns with local scorers (BaseScorer instances).
+    API scorers (APIScorerConfig) are not supported as they have their own queue.
+    """
+
+    def __init__(
+        self, max_concurrent: int = MAX_CONCURRENT_EVALUATIONS, num_workers: int = 4
+    ):
+        if num_workers <= 0:
+            raise ValueError("num_workers must be a positive integer.")
+        self._queue: queue.Queue[Optional[EvaluationRun]] = queue.Queue()
+        self._max_concurrent = max_concurrent
+        self._num_workers = num_workers  # Number of worker threads
+        self._worker_threads: List[threading.Thread] = []
+        self._shutdown_event = threading.Event()
+
+    def enqueue(self, evaluation_run: EvaluationRun) -> None:
+        """Add evaluation run to the queue."""
+        self._queue.put(evaluation_run)
+
+    def _process_run(self, evaluation_run: EvaluationRun) -> List[ScoringResult]:
+        """Execute evaluation run locally and return results."""
+
+        if not evaluation_run.custom_scorers:
+            raise ValueError(
+                "LocalEvaluationQueue only supports runs with local scorers (BaseScorer). "
+                "Found only APIScorerConfig instances."
+            )
+
+        return safe_run_async(
+            a_execute_scoring(
+                evaluation_run.examples,
+                evaluation_run.custom_scorers,
+                model=evaluation_run.model,
+                throttle_value=0,
+                max_concurrent=self._max_concurrent // self._num_workers,
+                show_progress=False,
+            )
+        )
+
+    def run_all(
+        self,
+        callback: Optional[Callable[[EvaluationRun, List[ScoringResult]], None]] = None,
+    ) -> None:
+        """Process all queued runs synchronously.
+
+        Args:
+            callback: Optional function called after each run with (run, results).
+        """
+        while not self._queue.empty():
+            run = self._queue.get()
+            if run is None:  # Sentinel for worker shutdown
+                self._queue.put(None)
+                break
+            results = self._process_run(run)
+            if callback:
+                callback(run, results)
+            self._queue.task_done()
+
+    def start_workers(
+        self,
+        callback: Optional[Callable[[EvaluationRun, List[ScoringResult]], None]] = None,
+    ) -> List[threading.Thread]:
+        """Start multiple background threads to process runs in parallel.
+
+        Args:
+            callback: Optional function called after each run with (run, results).
+
+        Returns:
+            List of started worker threads.
+        """
+
+        def _worker(worker_id: int) -> None:
+            while not self._shutdown_event.is_set():
+                try:
+                    # Use timeout so workers can check shutdown event periodically
+                    run = self._queue.get(timeout=1.0)
+                    if run is None:  # Sentinel to stop worker
+                        # Put sentinel back for other workers
+                        self._queue.put(None)
+                        self._queue.task_done()
+                        break
+
+                    try:
+                        results = self._process_run(run)
+                        if callback:
+                            callback(run, results)
+                    except Exception as exc:
+                        judgeval_logger.error(
+                            f"Worker {worker_id} error processing {run.eval_name}: {exc}"
+                        )
+                        # Continue processing other runs instead of shutting down all workers
+                    finally:
+                        self._queue.task_done()
+
+                except queue.Empty:
+                    # Timeout - check shutdown event and continue
+                    continue
+
+        # Start worker threads
+        for i in range(self._num_workers):
+            thread = threading.Thread(target=_worker, args=(i,), daemon=True)
+            thread.start()
+            self._worker_threads.append(thread)
+
+        return self._worker_threads
+
+    def start_worker(
+        self,
+        callback: Optional[Callable[[EvaluationRun, List[ScoringResult]], None]] = None,
+    ) -> Optional[threading.Thread]:
+        """Start a single background thread to process runs (backward compatibility).
+
+        Args:
+            callback: Optional function called after each run with (run, results).
+
+        Returns:
+            The started thread, or None if no threads were started.
+        """
+        threads = self.start_workers(callback)
+        return threads[0] if threads else None
+
+    def wait_for_completion(self, timeout: Optional[float] = None) -> bool:
+        """Wait for all queued tasks to complete.
+
+        Args:
+            timeout: Maximum time to wait in seconds. None means wait indefinitely.
+
+        Returns:
+            True if all tasks completed, False if timeout occurred.
+        """
+        try:
+            if timeout is None:
+                self._queue.join()
+                return True
+            else:
+                start_time = time.time()
+                while not self._queue.empty() or self._queue.unfinished_tasks > 0:
+                    if time.time() - start_time > timeout:
+                        return False
+                    time.sleep(0.1)
+                return True
+        except Exception:
+            return False
+
+    def stop_workers(self) -> None:
+        """Signal all background workers to stop after current tasks complete."""
+        if not self._worker_threads:
+            return
+
+        # Signal shutdown
+        self._shutdown_event.set()
+
+        # Send sentinel to wake up any blocking workers
+        for _ in range(self._num_workers):
+            self._queue.put(None)
+
+        # Wait for all workers to finish with timeout
+        for thread in self._worker_threads:
+            if thread.is_alive():
+                thread.join(timeout=5.0)
+                if thread.is_alive():
+                    judgeval_logger.warning(
+                        f"Worker thread {thread.name} did not shut down gracefully"
+                    )
+
+        self._worker_threads.clear()
+        self._shutdown_event.clear()
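This new module (its path is not shown in this diff) adds an in-memory queue for batching EvaluationRuns that use only local BaseScorer scorers. A hedged sketch of the intended flow; the import location, my_runs, and the use of ScoringResult.success are assumptions:

    # from judgeval... import LocalEvaluationQueue   # module path not visible in this diff

    def on_done(run, results):
        print(run.eval_name, [r.success for r in results])

    local_queue = LocalEvaluationQueue(num_workers=2)
    for run in my_runs:                           # EvaluationRun objects with custom_scorers
        local_queue.enqueue(run)

    local_queue.start_workers(callback=on_done)   # background threads...
    local_queue.wait_for_completion(timeout=300)  # ...or local_queue.run_all(on_done) synchronously
    local_queue.stop_workers()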