judgeval 0.4.0__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +2 -0
- judgeval/cli.py +65 -0
- judgeval/clients.py +2 -1
- judgeval/common/api/api.py +46 -54
- judgeval/common/api/constants.py +18 -5
- judgeval/common/api/json_encoder.py +241 -0
- judgeval/common/tracer/core.py +772 -467
- judgeval/common/tracer/otel_span_processor.py +1 -1
- judgeval/common/tracer/providers.py +119 -0
- judgeval/common/tracer/span_processor.py +1 -1
- judgeval/common/tracer/span_transformer.py +16 -26
- judgeval/constants.py +1 -0
- judgeval/data/evaluation_run.py +104 -0
- judgeval/data/judgment_types.py +38 -8
- judgeval/data/trace.py +6 -122
- judgeval/data/trace_run.py +2 -3
- judgeval/dataset.py +2 -0
- judgeval/integrations/langgraph.py +2 -1
- judgeval/judges/litellm_judge.py +2 -1
- judgeval/judges/mixture_of_judges.py +2 -1
- judgeval/judges/utils.py +2 -1
- judgeval/judgment_client.py +113 -53
- judgeval/local_eval_queue.py +190 -0
- judgeval/run_evaluation.py +43 -197
- judgeval/scorers/base_scorer.py +9 -10
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +17 -3
- judgeval/scorers/score.py +33 -11
- judgeval/utils/async_utils.py +36 -0
- {judgeval-0.4.0.dist-info → judgeval-0.6.0.dist-info}/METADATA +11 -12
- {judgeval-0.4.0.dist-info → judgeval-0.6.0.dist-info}/RECORD +33 -27
- judgeval-0.6.0.dist-info/entry_points.txt +2 -0
- judgeval/evaluation_run.py +0 -76
- {judgeval-0.4.0.dist-info → judgeval-0.6.0.dist-info}/WHEEL +0 -0
- {judgeval-0.4.0.dist-info → judgeval-0.6.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/judges/utils.py
CHANGED
@@ -7,6 +7,7 @@ from typing import Optional, Union, Tuple, List
 
 from judgeval.common.exceptions import InvalidJudgeModelError
 from judgeval.judges import JudgevalJudge, LiteLLMJudge, TogetherJudge, MixtureOfJudges
+from judgeval.constants import DEFAULT_GPT_MODEL
 from judgeval.constants import (
     TOGETHER_SUPPORTED_MODELS,
     JUDGMENT_SUPPORTED_MODELS,
@@ -30,7 +31,7 @@ def create_judge(
     If no model is provided, uses GPT4o as the default judge.
     """
     if model is None:  # default option
-        return LiteLLMJudge(model=
+        return LiteLLMJudge(model=DEFAULT_GPT_MODEL), True
     if not isinstance(model, (str, list, JudgevalJudge)):
        raise InvalidJudgeModelError(
            f"Model must be a string, list of strings, or a judgeval judge object. Got: {type(model)} instead."
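With this change the fallback judge model comes from the shared DEFAULT_GPT_MODEL constant instead of a hard-coded literal. A minimal sketch of what that means for callers, assuming only what the hunk shows (the parameter is named model and a (judge, bool) tuple is returned):

from judgeval.constants import DEFAULT_GPT_MODEL
from judgeval.judges.utils import create_judge

# With no model specified, the default judge is a LiteLLMJudge on DEFAULT_GPT_MODEL.
judge, _ = create_judge(model=None)
print(type(judge).__name__, DEFAULT_GPT_MODEL)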
judgeval/judgment_client.py
CHANGED
@@ -2,9 +2,12 @@
 Implements the JudgmentClient to interact with the Judgment API.
 """
 
+from __future__ import annotations
 import os
+import importlib.util
+from pathlib import Path
 from uuid import uuid4
-from typing import Optional, List, Dict, Any, Union, Callable
+from typing import Optional, List, Dict, Any, Union, Callable, TYPE_CHECKING
 
 from judgeval.data import (
     ScoringResult,
@@ -15,7 +18,7 @@ from judgeval.scorers import (
     APIScorerConfig,
     BaseScorer,
 )
-from judgeval.evaluation_run import EvaluationRun
+from judgeval.data.evaluation_run import EvaluationRun
 from judgeval.run_evaluation import (
     run_eval,
     assert_test,
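EvaluationRun now lives under judgeval.data; the old top-level module is removed in this release (judgeval/evaluation_run.py is deleted and judgeval/data/evaluation_run.py is added, per the file list above). A one-line migration sketch for downstream imports:

# 0.4.0: from judgeval.evaluation_run import EvaluationRun
from judgeval.data.evaluation_run import EvaluationRun  # 0.6.0 import path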
@@ -28,7 +31,11 @@ from judgeval.common.tracer import Tracer
 from judgeval.common.utils import validate_api_key
 from pydantic import BaseModel
 from judgeval.common.logger import judgeval_logger
-
+
+
+if TYPE_CHECKING:
+    from judgeval.integrations.langgraph import JudgevalCallbackHandler
+from judgeval.constants import DEFAULT_GPT_MODEL
 
 
 class EvalRunRequestBody(BaseModel):
@@ -89,9 +96,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         tools: Optional[List[Dict[str, Any]]] = None,
         project_name: str = "default_project",
         eval_run_name: str = "default_eval_trace",
-        model: Optional[str] =
-        append: bool = False,
-        override: bool = False,
+        model: Optional[str] = DEFAULT_GPT_MODEL,
     ) -> List[ScoringResult]:
         try:
             if examples and not function:
@@ -109,12 +114,11 @@ class JudgmentClient(metaclass=SingletonMeta):
                 traces=traces,
                 scorers=scorers,
                 model=model,
-                append=append,
                 organization_id=self.organization_id,
                 tools=tools,
             )
             return run_trace_eval(
-                trace_run, self.judgment_api_key,
+                trace_run, self.judgment_api_key, function, tracer, examples
             )
         except ValueError as e:
             raise ValueError(
@@ -127,11 +131,9 @@ class JudgmentClient(metaclass=SingletonMeta):
         self,
         examples: List[Example],
         scorers: List[Union[APIScorerConfig, BaseScorer]],
-        model: Optional[str] =
+        model: Optional[str] = DEFAULT_GPT_MODEL,
         project_name: str = "default_project",
         eval_run_name: str = "default_eval_run",
-        override: bool = False,
-        append: bool = False,
     ) -> List[ScoringResult]:
         """
         Executes an evaluation of `Example`s using one or more `Scorer`s
@@ -142,21 +144,13 @@ class JudgmentClient(metaclass=SingletonMeta):
             model (str): The model used as a judge when using LLM as a Judge
             project_name (str): The name of the project the evaluation results belong to
             eval_run_name (str): A name for this evaluation run
-            override (bool): Whether to override an existing evaluation run with the same name
-            append (bool): Whether to append to an existing evaluation run with the same name
 
         Returns:
             List[ScoringResult]: The results of the evaluation
         """
-        if override and append:
-            raise ValueError(
-                "Cannot set both override and append to True. Please choose one."
-            )
 
         try:
             eval = EvaluationRun(
-                append=append,
-                override=override,
                 project_name=project_name,
                 eval_name=eval_run_name,
                 examples=examples,
@@ -167,7 +161,6 @@ class JudgmentClient(metaclass=SingletonMeta):
             return run_eval(
                 eval,
                 self.judgment_api_key,
-                override,
             )
         except ValueError as e:
             raise ValueError(
@@ -176,22 +169,6 @@ class JudgmentClient(metaclass=SingletonMeta):
         except Exception as e:
             raise Exception(f"An unexpected error occurred during evaluation: {str(e)}")
 
-    def pull_eval(
-        self, project_name: str, eval_run_name: str
-    ) -> List[Dict[str, Union[str, List[ScoringResult]]]]:
-        """Pull evaluation results from the server.
-
-        Args:
-            project_name (str): Name of the project
-            eval_run_name (str): Name of the evaluation run
-
-        Returns:
-            Dict[str, Union[str, List[ScoringResult]]]: Dictionary containing:
-                - id (str): The evaluation run ID
-                - results (List[ScoringResult]): List of scoring results
-        """
-        return self.api_client.fetch_evaluation_results(project_name, eval_run_name)
-
     def create_project(self, project_name: str) -> bool:
         """
         Creates a project on the server.
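Taken together, these hunks drop the override/append flags and the pull_eval helper from JudgmentClient.run_evaluation and default model to DEFAULT_GPT_MODEL. A hedged sketch of a 0.6.0-style call; the Example fields and the AnswerRelevancyScorer used here are illustrative rather than taken from this diff:

from judgeval import JudgmentClient
from judgeval.data import Example
from judgeval.scorers import AnswerRelevancyScorer  # illustrative scorer choice

client = JudgmentClient()
results = client.run_evaluation(
    examples=[Example(input="What is the capital of France?", actual_output="Paris")],
    scorers=[AnswerRelevancyScorer(threshold=0.7)],
    project_name="default_project",
    eval_run_name="my_eval_run",
    # override=... / append=... were removed in 0.6.0 and would now raise a TypeError.
)
print(results)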
@@ -214,11 +191,9 @@ class JudgmentClient(metaclass=SingletonMeta):
         self,
         examples: List[Example],
         scorers: List[Union[APIScorerConfig, BaseScorer]],
-        model: Optional[str] =
+        model: Optional[str] = DEFAULT_GPT_MODEL,
         project_name: str = "default_test",
         eval_run_name: str = str(uuid4()),
-        override: bool = False,
-        append: bool = False,
     ) -> None:
         """
         Asserts a test by running the evaluation and checking the results for success
@@ -229,9 +204,6 @@ class JudgmentClient(metaclass=SingletonMeta):
            model (str): The model used as a judge when using LLM as a Judge
            project_name (str): The name of the project the evaluation results belong to
            eval_run_name (str): A name for this evaluation run
-           override (bool): Whether to override an existing evaluation run with the same name
-           append (bool): Whether to append to an existing evaluation run with the same name
-           async_execution (bool): Whether to run the evaluation asynchronously
        """
 
        results: List[ScoringResult]
@@ -242,8 +214,6 @@ class JudgmentClient(metaclass=SingletonMeta):
            model=model,
            project_name=project_name,
            eval_run_name=eval_run_name,
-           override=override,
-           append=append,
        )
        assert_test(results)
 
@@ -255,12 +225,9 @@ class JudgmentClient(metaclass=SingletonMeta):
         tracer: Optional[Union[Tracer, JudgevalCallbackHandler]] = None,
         traces: Optional[List[Trace]] = None,
         tools: Optional[List[Dict[str, Any]]] = None,
-        model: Optional[str] =
+        model: Optional[str] = DEFAULT_GPT_MODEL,
         project_name: str = "default_test",
         eval_run_name: str = str(uuid4()),
-        override: bool = False,
-        append: bool = False,
-        async_execution: bool = False,
     ) -> None:
         """
         Asserts a test by running the evaluation and checking the results for success
@@ -271,12 +238,9 @@ class JudgmentClient(metaclass=SingletonMeta):
            model (str): The model used as a judge when using LLM as a Judge
            project_name (str): The name of the project the evaluation results belong to
            eval_run_name (str): A name for this evaluation run
-           override (bool): Whether to override an existing evaluation run with the same name
-           append (bool): Whether to append to an existing evaluation run with the same name
            function (Optional[Callable]): A function to use for evaluation
            tracer (Optional[Union[Tracer, BaseCallbackHandler]]): A tracer to use for evaluation
            tools (Optional[List[Dict[str, Any]]]): A list of tools to use for evaluation
-           async_execution (bool): Whether to run the evaluation asynchronously
        """
 
        # Check for enable_param_checking and tools
@@ -297,11 +261,107 @@ class JudgmentClient(metaclass=SingletonMeta):
            model=model,
            project_name=project_name,
            eval_run_name=eval_run_name,
-           override=override,
-           append=append,
            function=function,
            tracer=tracer,
            tools=tools,
        )
 
        assert_test(results)
+
+    def _extract_scorer_name(self, scorer_file_path: str) -> str:
+        """Extract scorer name from the scorer file by importing it."""
+        try:
+            spec = importlib.util.spec_from_file_location(
+                "scorer_module", scorer_file_path
+            )
+            if spec is None or spec.loader is None:
+                raise ImportError(f"Could not load spec from {scorer_file_path}")
+
+            module = importlib.util.module_from_spec(spec)
+            spec.loader.exec_module(module)
+
+            for attr_name in dir(module):
+                attr = getattr(module, attr_name)
+                if (
+                    isinstance(attr, type)
+                    and any("Scorer" in str(base) for base in attr.__mro__)
+                    and attr.__module__ == "scorer_module"
+                ):
+                    try:
+                        # Instantiate the scorer and get its name
+                        scorer_instance = attr()
+                        if hasattr(scorer_instance, "name"):
+                            return scorer_instance.name
+                    except Exception:
+                        # Skip if instantiation fails
+                        continue
+
+            raise AttributeError("No scorer class found or could be instantiated")
+        except Exception as e:
+            judgeval_logger.warning(f"Could not extract scorer name: {e}")
+            return Path(scorer_file_path).stem
+
+    def save_custom_scorer(
+        self,
+        scorer_file_path: str,
+        requirements_file_path: Optional[str] = None,
+        unique_name: Optional[str] = None,
+    ) -> bool:
+        """
+        Upload custom ExampleScorer from files to backend.
+
+        Args:
+            scorer_file_path: Path to Python file containing CustomScorer class
+            requirements_file_path: Optional path to requirements.txt
+            unique_name: Optional unique identifier (auto-detected from scorer.name if not provided)
+
+        Returns:
+            bool: True if upload successful
+
+        Raises:
+            ValueError: If scorer file is invalid
+            FileNotFoundError: If scorer file doesn't exist
+        """
+        import os
+
+        if not os.path.exists(scorer_file_path):
+            raise FileNotFoundError(f"Scorer file not found: {scorer_file_path}")
+
+        # Auto-detect scorer name if not provided
+        if unique_name is None:
+            unique_name = self._extract_scorer_name(scorer_file_path)
+            judgeval_logger.info(f"Auto-detected scorer name: '{unique_name}'")
+
+        # Read scorer code
+        with open(scorer_file_path, "r") as f:
+            scorer_code = f.read()
+
+        # Read requirements (optional)
+        requirements_text = ""
+        if requirements_file_path and os.path.exists(requirements_file_path):
+            with open(requirements_file_path, "r") as f:
+                requirements_text = f.read()
+
+        # Upload to backend
+        judgeval_logger.info(
+            f"Uploading custom scorer: {unique_name}, this can take a couple of minutes..."
+        )
+        try:
+            response = self.api_client.upload_custom_scorer(
+                scorer_name=unique_name,
+                scorer_code=scorer_code,
+                requirements_text=requirements_text,
+            )
+
+            if response.get("status") == "success":
+                judgeval_logger.info(
+                    f"Successfully uploaded custom scorer: {unique_name}"
+                )
+                return True
+            else:
+                judgeval_logger.error(f"Failed to upload custom scorer: {unique_name}")
+                return False
+
+        except Exception as e:
+            judgeval_logger.error(f"Error uploading custom scorer: {e}")
+            raise
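save_custom_scorer is new in 0.6.0: it imports the scorer file to auto-detect a name, then uploads the source (plus optional requirements) through api_client.upload_custom_scorer. A hedged usage sketch with illustrative file paths:

from judgeval import JudgmentClient

client = JudgmentClient()

# unique_name is optional; when omitted it is auto-detected from the scorer's .name
# (falling back to the file stem if the scorer cannot be imported or instantiated).
uploaded = client.save_custom_scorer(
    scorer_file_path="scorers/my_scorer.py",            # illustrative path
    requirements_file_path="scorers/requirements.txt",  # optional
)
print(uploaded)  # True on success, False if the backend does not report "status": "success"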
judgeval/local_eval_queue.py
ADDED
@@ -0,0 +1,190 @@
+"""Local evaluation queue for batching custom scorer evaluations.
+
+This module provides a simple in-memory queue for EvaluationRun objects that contain
+only local (BaseScorer) scorers. Useful for batching evaluations and processing them
+either synchronously or in a background thread.
+"""
+
+import queue
+import threading
+from typing import Callable, List, Optional
+import time
+
+from judgeval.common.logger import judgeval_logger
+from judgeval.constants import MAX_CONCURRENT_EVALUATIONS
+from judgeval.data import ScoringResult
+from judgeval.data.evaluation_run import EvaluationRun
+from judgeval.utils.async_utils import safe_run_async
+from judgeval.scorers.score import a_execute_scoring
+
+
+class LocalEvaluationQueue:
+    """Lightweight in-memory queue for local evaluation runs.
+
+    Only supports EvaluationRuns with local scorers (BaseScorer instances).
+    API scorers (APIScorerConfig) are not supported as they have their own queue.
+    """
+
+    def __init__(
+        self, max_concurrent: int = MAX_CONCURRENT_EVALUATIONS, num_workers: int = 4
+    ):
+        if num_workers <= 0:
+            raise ValueError("num_workers must be a positive integer.")
+        self._queue: queue.Queue[Optional[EvaluationRun]] = queue.Queue()
+        self._max_concurrent = max_concurrent
+        self._num_workers = num_workers  # Number of worker threads
+        self._worker_threads: List[threading.Thread] = []
+        self._shutdown_event = threading.Event()
+
+    def enqueue(self, evaluation_run: EvaluationRun) -> None:
+        """Add evaluation run to the queue."""
+        self._queue.put(evaluation_run)
+
+    def _process_run(self, evaluation_run: EvaluationRun) -> List[ScoringResult]:
+        """Execute evaluation run locally and return results."""
+
+        if not evaluation_run.custom_scorers:
+            raise ValueError(
+                "LocalEvaluationQueue only supports runs with local scorers (BaseScorer). "
+                "Found only APIScorerConfig instances."
+            )
+
+        return safe_run_async(
+            a_execute_scoring(
+                evaluation_run.examples,
+                evaluation_run.custom_scorers,
+                model=evaluation_run.model,
+                throttle_value=0,
+                max_concurrent=self._max_concurrent // self._num_workers,
+                show_progress=False,
+            )
+        )
+
+    def run_all(
+        self,
+        callback: Optional[Callable[[EvaluationRun, List[ScoringResult]], None]] = None,
+    ) -> None:
+        """Process all queued runs synchronously.
+
+        Args:
+            callback: Optional function called after each run with (run, results).
+        """
+        while not self._queue.empty():
+            run = self._queue.get()
+            if run is None:  # Sentinel for worker shutdown
+                self._queue.put(None)
+                break
+            results = self._process_run(run)
+            if callback:
+                callback(run, results)
+            self._queue.task_done()
+
+    def start_workers(
+        self,
+        callback: Optional[Callable[[EvaluationRun, List[ScoringResult]], None]] = None,
+    ) -> List[threading.Thread]:
+        """Start multiple background threads to process runs in parallel.
+
+        Args:
+            callback: Optional function called after each run with (run, results).
+
+        Returns:
+            List of started worker threads.
+        """
+
+        def _worker(worker_id: int) -> None:
+            while not self._shutdown_event.is_set():
+                try:
+                    # Use timeout so workers can check shutdown event periodically
+                    run = self._queue.get(timeout=1.0)
+                    if run is None:  # Sentinel to stop worker
+                        # Put sentinel back for other workers
+                        self._queue.put(None)
+                        self._queue.task_done()
+                        break
+
+                    try:
+                        results = self._process_run(run)
+                        if callback:
+                            callback(run, results)
+                    except Exception as exc:
+                        judgeval_logger.error(
+                            f"Worker {worker_id} error processing {run.eval_name}: {exc}"
+                        )
+                        # Continue processing other runs instead of shutting down all workers
+                    finally:
+                        self._queue.task_done()
+
+                except queue.Empty:
+                    # Timeout - check shutdown event and continue
+                    continue
+
+        # Start worker threads
+        for i in range(self._num_workers):
+            thread = threading.Thread(target=_worker, args=(i,), daemon=True)
+            thread.start()
+            self._worker_threads.append(thread)
+
+        return self._worker_threads
+
+    def start_worker(
+        self,
+        callback: Optional[Callable[[EvaluationRun, List[ScoringResult]], None]] = None,
+    ) -> Optional[threading.Thread]:
+        """Start a single background thread to process runs (backward compatibility).
+
+        Args:
+            callback: Optional function called after each run with (run, results).
+
+        Returns:
+            The started thread, or None if no threads were started.
+        """
+        threads = self.start_workers(callback)
+        return threads[0] if threads else None
+
+    def wait_for_completion(self, timeout: Optional[float] = None) -> bool:
+        """Wait for all queued tasks to complete.
+
+        Args:
+            timeout: Maximum time to wait in seconds. None means wait indefinitely.
+
+        Returns:
+            True if all tasks completed, False if timeout occurred.
+        """
+        try:
+            if timeout is None:
+                self._queue.join()
+                return True
+            else:
+                start_time = time.time()
+                while not self._queue.empty() or self._queue.unfinished_tasks > 0:
+                    if time.time() - start_time > timeout:
+                        return False
+                    time.sleep(0.1)
+                return True
+        except Exception:
+            return False
+
+    def stop_workers(self) -> None:
+        """Signal all background workers to stop after current tasks complete."""
+        if not self._worker_threads:
+            return
+
+        # Signal shutdown
+        self._shutdown_event.set()
+
+        # Send sentinel to wake up any blocking workers
+        for _ in range(self._num_workers):
+            self._queue.put(None)
+
+        # Wait for all workers to finish with timeout
+        for thread in self._worker_threads:
+            if thread.is_alive():
+                thread.join(timeout=5.0)
+                if thread.is_alive():
+                    judgeval_logger.warning(
+                        f"Worker thread {thread.name} did not shut down gracefully"
+                    )
+
+        self._worker_threads.clear()
+        self._shutdown_event.clear()
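The new LocalEvaluationQueue batches EvaluationRun objects whose scorers are all local BaseScorer subclasses and processes them on daemon worker threads. A hedged usage sketch built only from the API shown above; constructing the EvaluationRun objects themselves is left out because the new judgeval/data/evaluation_run.py is not shown in this diff:

from typing import List

from judgeval.data import ScoringResult
from judgeval.data.evaluation_run import EvaluationRun
from judgeval.local_eval_queue import LocalEvaluationQueue

def on_done(run: EvaluationRun, results: List[ScoringResult]) -> None:
    # Invoked by a worker thread after each run completes.
    print(f"{run.eval_name}: {len(results)} results")

def process_runs(runs: List[EvaluationRun]) -> None:
    # Each run is assumed to contain only local BaseScorer scorers; API scorers are rejected.
    local_queue = LocalEvaluationQueue(num_workers=2)
    for run in runs:
        local_queue.enqueue(run)
    local_queue.start_workers(callback=on_done)
    local_queue.wait_for_completion(timeout=300.0)  # False means the timeout elapsed first
    local_queue.stop_workers()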