judgeval 0.16.7__py3-none-any.whl → 0.16.9__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of judgeval might be problematic.
Files changed (43):
  1. judgeval/api/api_types.py +1 -2
  2. judgeval/data/judgment_types.py +1 -2
  3. judgeval/tracer/__init__.py +7 -52
  4. judgeval/tracer/llm/config.py +12 -44
  5. judgeval/tracer/llm/constants.py +0 -1
  6. judgeval/tracer/llm/llm_anthropic/config.py +3 -17
  7. judgeval/tracer/llm/llm_anthropic/messages.py +440 -0
  8. judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
  9. judgeval/tracer/llm/llm_anthropic/wrapper.py +40 -621
  10. judgeval/tracer/llm/llm_google/__init__.py +3 -0
  11. judgeval/tracer/llm/llm_google/config.py +3 -21
  12. judgeval/tracer/llm/llm_google/generate_content.py +125 -0
  13. judgeval/tracer/llm/llm_google/wrapper.py +19 -454
  14. judgeval/tracer/llm/llm_openai/beta_chat_completions.py +192 -0
  15. judgeval/tracer/llm/llm_openai/chat_completions.py +437 -0
  16. judgeval/tracer/llm/llm_openai/config.py +3 -29
  17. judgeval/tracer/llm/llm_openai/responses.py +444 -0
  18. judgeval/tracer/llm/llm_openai/wrapper.py +43 -641
  19. judgeval/tracer/llm/llm_together/__init__.py +3 -0
  20. judgeval/tracer/llm/llm_together/chat_completions.py +398 -0
  21. judgeval/tracer/llm/llm_together/config.py +3 -20
  22. judgeval/tracer/llm/llm_together/wrapper.py +34 -485
  23. judgeval/tracer/llm/providers.py +4 -48
  24. judgeval/utils/decorators/dont_throw.py +30 -14
  25. judgeval/utils/wrappers/README.md +3 -0
  26. judgeval/utils/wrappers/__init__.py +15 -0
  27. judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
  28. judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
  29. judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
  30. judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
  31. judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
  32. judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
  33. judgeval/utils/wrappers/utils.py +35 -0
  34. judgeval/version.py +1 -1
  35. {judgeval-0.16.7.dist-info → judgeval-0.16.9.dist-info}/METADATA +1 -1
  36. {judgeval-0.16.7.dist-info → judgeval-0.16.9.dist-info}/RECORD +40 -27
  37. judgeval/tracer/llm/llm_groq/config.py +0 -23
  38. judgeval/tracer/llm/llm_groq/wrapper.py +0 -498
  39. judgeval/tracer/local_eval_queue.py +0 -199
  40. /judgeval/{tracer/llm/llm_groq/__init__.py → utils/wrappers/py.typed} +0 -0
  41. {judgeval-0.16.7.dist-info → judgeval-0.16.9.dist-info}/WHEEL +0 -0
  42. {judgeval-0.16.7.dist-info → judgeval-0.16.9.dist-info}/entry_points.txt +0 -0
  43. {judgeval-0.16.7.dist-info → judgeval-0.16.9.dist-info}/licenses/LICENSE.md +0 -0
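
The changes fall into three groups: the per-provider LLM wrappers (Anthropic, Google, OpenAI, Together) are split out of monolithic wrapper.py files into per-endpoint modules (messages.py, chat_completions.py, responses.py, generate_content.py); a new judgeval/utils/wrappers package adds generic sync/async wrapper utilities; and Groq tracing support plus the local evaluation queue are removed outright. The full diffs of the two deleted modules follow.

The contents of the new judgeval/utils/wrappers helpers are not shown in this diff, so the sketch below is only a guess at what an "immutable" wrap utility might look like, based on the file names (immutable_wrap_sync.py, mutable_wrap_sync.py, and so on). The names, signatures, and hook semantics are assumptions, not the package's actual API.

# Hypothetical sketch only: the real judgeval.utils.wrappers API is not shown
# in this diff. Names and signatures are guesses inferred from the file names.
import functools
from typing import Any, Callable, Optional, TypeVar

F = TypeVar("F", bound=Callable[..., Any])


def immutable_wrap_sync(
    func: F,
    before: Optional[Callable[..., None]] = None,
    after: Optional[Callable[[Any], None]] = None,
) -> F:
    """Return a new callable that runs observation hooks around func.

    "Immutable" here taken to mean the original function object is left
    untouched; callers get a fresh wrapper rather than a patched original.
    """

    @functools.wraps(func)
    def wrapper(*args: Any, **kwargs: Any) -> Any:
        if before is not None:
            before(*args, **kwargs)  # observe the arguments, never alter them
        result = func(*args, **kwargs)
        if after is not None:
            after(result)  # observe the result, never alter it
        return result

    return wrapper  # type: ignore[return-value]

If the file names mean what they suggest, the immutable/mutable split would distinguish wrappers that return a new callable like this from ones that patch attributes on an existing object, as the deleted Groq wrapper below does with setattr.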
--- judgeval/tracer/llm/llm_groq/wrapper.py
+++ /dev/null
@@ -1,498 +0,0 @@
-from __future__ import annotations
-import functools
-from typing import (
-    TYPE_CHECKING,
-    Any,
-    Callable,
-    Optional,
-    Protocol,
-    Tuple,
-    Union,
-    Iterator,
-    AsyncIterator,
-    Sequence,
-    runtime_checkable,
-)
-
-from judgeval.tracer.llm.llm_groq.config import (
-    groq_Groq,
-    groq_AsyncGroq,
-)
-from judgeval.tracer.managers import sync_span_context, async_span_context
-from judgeval.logger import judgeval_logger
-from judgeval.tracer.keys import AttributeKeys
-from judgeval.tracer.utils import set_span_attribute
-from judgeval.utils.serialize import safe_serialize
-
-if TYPE_CHECKING:
-    from judgeval.tracer import Tracer
-    from opentelemetry.trace import Span
-
-# Keep the original client type for runtime compatibility
-GroqClientType = Union[groq_Groq, groq_AsyncGroq]
-
-
-# Usage protocols
-@runtime_checkable
-class GroqPromptTokensDetails(Protocol):
-    cached_tokens: Optional[int]
-
-
-@runtime_checkable
-class GroqUsage(Protocol):
-    prompt_tokens: Optional[int]
-    completion_tokens: Optional[int]
-    total_tokens: Optional[int]
-    prompt_tokens_details: Optional[GroqPromptTokensDetails]
-
-
-# Message protocols
-@runtime_checkable
-class GroqMessage(Protocol):
-    content: Optional[str]
-    role: str
-
-
-@runtime_checkable
-class GroqChoice(Protocol):
-    index: int
-    message: GroqMessage
-    finish_reason: Optional[str]
-
-
-@runtime_checkable
-class GroqChatCompletion(Protocol):
-    id: str
-    object: str
-    created: int
-    model: str
-    choices: Sequence[GroqChoice]
-    usage: Optional[GroqUsage]
-
-
-# Stream protocols
-@runtime_checkable
-class GroqStreamDelta(Protocol):
-    content: Optional[str]
-
-
-@runtime_checkable
-class GroqStreamChoice(Protocol):
-    index: int
-    delta: GroqStreamDelta
-
-
-@runtime_checkable
-class GroqStreamChunk(Protocol):
-    choices: Sequence[GroqStreamChoice]
-    usage: Optional[GroqUsage]
-
-
-# Client protocols
-@runtime_checkable
-class GroqClient(Protocol):
-    pass
-
-
-@runtime_checkable
-class GroqAsyncClient(Protocol):
-    pass
-
-
-# Union types
-GroqResponseType = GroqChatCompletion
-GroqStreamType = Union[Iterator[GroqStreamChunk], AsyncIterator[GroqStreamChunk]]
-
-
-def _extract_groq_content(chunk: GroqStreamChunk) -> str:
-    if chunk.choices and len(chunk.choices) > 0:
-        delta_content = chunk.choices[0].delta.content
-        if delta_content:
-            return delta_content
-    return ""
-
-
-def _extract_groq_tokens(usage_data: GroqUsage) -> Tuple[int, int, int, int]:
-    prompt_tokens = usage_data.prompt_tokens or 0
-    completion_tokens = usage_data.completion_tokens or 0
-    cache_read_input_tokens = 0
-    if (
-        hasattr(usage_data, "prompt_tokens_details")
-        and usage_data.prompt_tokens_details
-        and hasattr(usage_data.prompt_tokens_details, "cached_tokens")
-        and usage_data.prompt_tokens_details.cached_tokens is not None
-    ):
-        cache_read_input_tokens = usage_data.prompt_tokens_details.cached_tokens
-    cache_creation_input_tokens = 0  # Groq doesn't have cache creation tokens
-    return (
-        prompt_tokens,
-        completion_tokens,
-        cache_read_input_tokens,
-        cache_creation_input_tokens,
-    )
-
-
-def _format_groq_output(
-    response: GroqChatCompletion,
-) -> Tuple[Optional[Union[str, list[dict[str, Any]]]], Optional[GroqUsage]]:
-    message_content: Optional[Union[str, list[dict[str, Any]]]] = None
-    usage_data: Optional[GroqUsage] = None
-
-    try:
-        if isinstance(response, GroqChatCompletion):
-            usage_data = response.usage
-            if response.choices and len(response.choices) > 0:
-                content = response.choices[0].message.content
-                if content:
-                    # Return structured data for consistency with other providers
-                    message_content = [{"type": "text", "text": str(content)}]
-    except (AttributeError, IndexError, TypeError):
-        pass
-
-    return message_content, usage_data
-
-
-class TracedGroqGenerator:
-    def __init__(
-        self,
-        tracer: Tracer,
-        generator: Iterator[GroqStreamChunk],
-        client: GroqClientType,
-        span: Span,
-        model_name: str,
-    ):
-        self.tracer = tracer
-        self.generator = generator
-        self.client = client
-        self.span = span
-        self.model_name = model_name
-        self.accumulated_content = ""
-
-    def __iter__(self) -> Iterator[GroqStreamChunk]:
-        return self
-
-    def __next__(self) -> GroqStreamChunk:
-        try:
-            chunk = next(self.generator)
-            content = _extract_groq_content(chunk)
-            if content:
-                self.accumulated_content += content
-            if chunk.usage:
-                prompt_tokens, completion_tokens, cache_read, cache_creation = (
-                    _extract_groq_tokens(chunk.usage)
-                )
-                set_span_attribute(
-                    self.span, AttributeKeys.GEN_AI_USAGE_INPUT_TOKENS, prompt_tokens
-                )
-                set_span_attribute(
-                    self.span,
-                    AttributeKeys.GEN_AI_USAGE_OUTPUT_TOKENS,
-                    completion_tokens,
-                )
-                set_span_attribute(
-                    self.span,
-                    AttributeKeys.GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS,
-                    cache_read,
-                )
-                set_span_attribute(
-                    self.span,
-                    AttributeKeys.JUDGMENT_USAGE_METADATA,
-                    safe_serialize(chunk.usage),
-                )
-            return chunk
-        except StopIteration:
-            set_span_attribute(
-                self.span, AttributeKeys.GEN_AI_COMPLETION, self.accumulated_content
-            )
-            self.span.end()
-            raise
-        except Exception as e:
-            if self.span:
-                self.span.record_exception(e)
-                self.span.end()
-            raise
-
-
-class TracedGroqAsyncGenerator:
-    def __init__(
-        self,
-        tracer: Tracer,
-        async_generator: AsyncIterator[GroqStreamChunk],
-        client: GroqClientType,
-        span: Span,
-        model_name: str,
-    ):
-        self.tracer = tracer
-        self.async_generator = async_generator
-        self.client = client
-        self.span = span
-        self.model_name = model_name
-        self.accumulated_content = ""
-
-    def __aiter__(self) -> AsyncIterator[GroqStreamChunk]:
-        return self
-
-    async def __anext__(self) -> GroqStreamChunk:
-        try:
-            chunk = await self.async_generator.__anext__()
-            content = _extract_groq_content(chunk)
-            if content:
-                self.accumulated_content += content
-            if chunk.usage:
-                prompt_tokens, completion_tokens, cache_read, cache_creation = (
-                    _extract_groq_tokens(chunk.usage)
-                )
-                set_span_attribute(
-                    self.span, AttributeKeys.GEN_AI_USAGE_INPUT_TOKENS, prompt_tokens
-                )
-                set_span_attribute(
-                    self.span,
-                    AttributeKeys.GEN_AI_USAGE_OUTPUT_TOKENS,
-                    completion_tokens,
-                )
-                set_span_attribute(
-                    self.span,
-                    AttributeKeys.GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS,
-                    cache_read,
-                )
-                set_span_attribute(
-                    self.span,
-                    AttributeKeys.JUDGMENT_USAGE_METADATA,
-                    safe_serialize(chunk.usage),
-                )
-            return chunk
-        except StopAsyncIteration:
-            set_span_attribute(
-                self.span, AttributeKeys.GEN_AI_COMPLETION, self.accumulated_content
-            )
-            self.span.end()
-            raise
-        except Exception as e:
-            if self.span:
-                self.span.record_exception(e)
-                self.span.end()
-            raise
-
-
-def wrap_groq_client(tracer: Tracer, client: GroqClientType) -> GroqClientType:
-    def wrapped(function: Callable, span_name: str):
-        @functools.wraps(function)
-        def wrapper(*args, **kwargs):
-            if kwargs.get("stream", False):
-                span = tracer.get_tracer().start_span(
-                    span_name, attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
-                )
-                tracer.add_agent_attributes_to_span(span)
-                set_span_attribute(
-                    span, AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
-                )
-                model_name = kwargs.get("model", "")
-                set_span_attribute(span, AttributeKeys.GEN_AI_REQUEST_MODEL, model_name)
-                stream_response = function(*args, **kwargs)
-                return TracedGroqGenerator(
-                    tracer, stream_response, client, span, model_name
-                )
-            else:
-                with sync_span_context(
-                    tracer, span_name, {AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
-                ) as span:
-                    try:
-                        tracer.add_agent_attributes_to_span(span)
-                        set_span_attribute(
-                            span, AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
-                        )
-                        model_name = kwargs.get("model", "")
-                        # Add groq/ prefix for server-side cost calculation
-                        prefixed_model_name = f"groq/{model_name}" if model_name else ""
-                        set_span_attribute(
-                            span,
-                            AttributeKeys.GEN_AI_REQUEST_MODEL,
-                            prefixed_model_name,
-                        )
-                    except Exception as e:
-                        judgeval_logger.error(
-                            f"[groq wrapped] Error adding span metadata: {e}"
-                        )
-
-                    response = function(*args, **kwargs)
-
-                    try:
-                        if isinstance(response, GroqChatCompletion):
-                            output, usage_data = _format_groq_output(response)
-                            # Serialize structured data to JSON for span attribute
-                            if output:
-                                if isinstance(output, list):
-                                    output_str = safe_serialize(output)
-                                else:
-                                    output_str = str(output)
-                                set_span_attribute(
-                                    span, AttributeKeys.GEN_AI_COMPLETION, output_str
-                                )
-                            if usage_data:
-                                (
-                                    prompt_tokens,
-                                    completion_tokens,
-                                    cache_read,
-                                    cache_creation,
-                                ) = _extract_groq_tokens(usage_data)
-                                set_span_attribute(
-                                    span,
-                                    AttributeKeys.GEN_AI_USAGE_INPUT_TOKENS,
-                                    prompt_tokens,
-                                )
-                                set_span_attribute(
-                                    span,
-                                    AttributeKeys.GEN_AI_USAGE_OUTPUT_TOKENS,
-                                    completion_tokens,
-                                )
-                                set_span_attribute(
-                                    span,
-                                    AttributeKeys.GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS,
-                                    cache_read,
-                                )
-                                set_span_attribute(
-                                    span,
-                                    AttributeKeys.JUDGMENT_USAGE_METADATA,
-                                    safe_serialize(usage_data),
-                                )
-                            # Add groq/ prefix to response model for server-side cost calculation
-                            response_model = getattr(response, "model", model_name)
-                            prefixed_response_model = (
-                                f"groq/{response_model}" if response_model else ""
-                            )
-                            set_span_attribute(
-                                span,
-                                AttributeKeys.GEN_AI_RESPONSE_MODEL,
-                                prefixed_response_model,
-                            )
-                    except Exception as e:
-                        judgeval_logger.error(
-                            f"[groq wrapped] Error adding span metadata: {e}"
-                        )
-                    finally:
-                        return response
-
-        return wrapper
-
-    def wrapped_async(function: Callable, span_name: str):
-        @functools.wraps(function)
-        async def wrapper(*args, **kwargs):
-            if kwargs.get("stream", False):
-                span = tracer.get_tracer().start_span(
-                    span_name, attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
-                )
-                tracer.add_agent_attributes_to_span(span)
-                set_span_attribute(
-                    span, AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
-                )
-                model_name = kwargs.get("model", "")
-                # Add groq/ prefix for server-side cost calculation
-                prefixed_model_name = f"groq/{model_name}" if model_name else ""
-                set_span_attribute(
-                    span, AttributeKeys.GEN_AI_REQUEST_MODEL, prefixed_model_name
-                )
-                stream_response = await function(*args, **kwargs)
-                return TracedGroqAsyncGenerator(
-                    tracer, stream_response, client, span, model_name
-                )
-            else:
-                async with async_span_context(
-                    tracer, span_name, {AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
-                ) as span:
-                    try:
-                        tracer.add_agent_attributes_to_span(span)
-                        set_span_attribute(
-                            span, AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
-                        )
-                        model_name = kwargs.get("model", "")
-                        # Add groq/ prefix for server-side cost calculation
-                        prefixed_model_name = f"groq/{model_name}" if model_name else ""
-                        set_span_attribute(
-                            span,
-                            AttributeKeys.GEN_AI_REQUEST_MODEL,
-                            prefixed_model_name,
-                        )
-                    except Exception as e:
-                        judgeval_logger.error(
-                            f"[groq wrapped_async] Error adding span metadata: {e}"
-                        )
-
-                    response = await function(*args, **kwargs)
-
-                    try:
-                        if isinstance(response, GroqChatCompletion):
-                            output, usage_data = _format_groq_output(response)
-                            # Serialize structured data to JSON for span attribute
-                            if output:
-                                if isinstance(output, list):
-                                    output_str = safe_serialize(output)
-                                else:
-                                    output_str = str(output)
-                                set_span_attribute(
-                                    span, AttributeKeys.GEN_AI_COMPLETION, output_str
-                                )
-                            if usage_data:
-                                (
-                                    prompt_tokens,
-                                    completion_tokens,
-                                    cache_read,
-                                    cache_creation,
-                                ) = _extract_groq_tokens(usage_data)
-                                set_span_attribute(
-                                    span,
-                                    AttributeKeys.GEN_AI_USAGE_INPUT_TOKENS,
-                                    prompt_tokens,
-                                )
-                                set_span_attribute(
-                                    span,
-                                    AttributeKeys.GEN_AI_USAGE_OUTPUT_TOKENS,
-                                    completion_tokens,
-                                )
-                                set_span_attribute(
-                                    span,
-                                    AttributeKeys.GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS,
-                                    cache_read,
-                                )
-                                set_span_attribute(
-                                    span,
-                                    AttributeKeys.JUDGMENT_USAGE_METADATA,
-                                    safe_serialize(usage_data),
-                                )
-                            # Add groq/ prefix to response model for server-side cost calculation
-                            response_model = getattr(response, "model", model_name)
-                            prefixed_response_model = (
-                                f"groq/{response_model}" if response_model else ""
-                            )
-                            set_span_attribute(
-                                span,
-                                AttributeKeys.GEN_AI_RESPONSE_MODEL,
-                                prefixed_response_model,
-                            )
-                    except Exception as e:
-                        judgeval_logger.error(
-                            f"[groq wrapped_async] Error adding span metadata: {e}"
-                        )
-                    finally:
-                        return response
-
-        return wrapper
-
-    span_name = "GROQ_API_CALL"
-    if groq_Groq is not None and isinstance(client, groq_Groq):
-        # Type narrowing for mypy
-        groq_client = client  # type: ignore[assignment]
-        setattr(
-            groq_client.chat.completions,
-            "create",
-            wrapped(groq_client.chat.completions.create, span_name),
-        )
-    elif groq_AsyncGroq is not None and isinstance(client, groq_AsyncGroq):
-        # Type narrowing for mypy
-        async_groq_client = client  # type: ignore[assignment]
-        setattr(
-            async_groq_client.chat.completions,
-            "create",
-            wrapped_async(async_groq_client.chat.completions.create, span_name),
-        )
-
-    return client
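
The removed wrapper was applied by patching chat.completions.create in place and returning the same client object, so existing call sites needed no changes. A minimal usage sketch for reference; the Tracer() construction is illustrative, since its real arguments depend on your judgeval configuration:

# Illustrative only: how the now-removed wrap_groq_client was applied in 0.16.7.
from groq import Groq

from judgeval.tracer import Tracer
from judgeval.tracer.llm.llm_groq.wrapper import wrap_groq_client  # removed in 0.16.9

tracer = Tracer()  # assumed: a Tracer configured for your project
client = wrap_groq_client(tracer, Groq())  # patches chat.completions.create in place

# Calls are then traced under a "GROQ_API_CALL" span, recording the prompt,
# completion, token usage, and a "groq/"-prefixed model name as attributes.
response = client.chat.completions.create(
    model="llama-3.1-8b-instant",
    messages=[{"role": "user", "content": "Hello"}],
)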
--- judgeval/tracer/local_eval_queue.py
+++ /dev/null
@@ -1,199 +0,0 @@
-"""Local evaluation queue for batching custom scorer evaluations.
-
-This module provides a simple in-memory queue for EvaluationRun objects that contain
-only local (BaseScorer) scorers. Useful for batching evaluations and processing them
-either synchronously or in a background thread.
-"""
-
-import queue
-import threading
-from typing import Callable, List, Optional
-import time
-
-from judgeval.logger import judgeval_logger
-from judgeval.env import JUDGMENT_MAX_CONCURRENT_EVALUATIONS
-from judgeval.data import ScoringResult
-from judgeval.data.evaluation_run import ExampleEvaluationRun
-from judgeval.utils.async_utils import safe_run_async
-from judgeval.scorers.score import a_execute_scoring
-from judgeval.api import JudgmentSyncClient
-from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_ORG_ID
-
-
-class LocalEvaluationQueue:
-    """Lightweight in-memory queue for local evaluation runs.
-
-    Only supports EvaluationRuns with local scorers (BaseScorer instances).
-    API scorers (ExampleAPIScorerConfig) are not supported as they have their own queue.
-    """
-
-    def __init__(
-        self,
-        max_concurrent: int = JUDGMENT_MAX_CONCURRENT_EVALUATIONS,
-        num_workers: int = 4,
-    ):
-        if num_workers <= 0:
-            raise ValueError("num_workers must be a positive integer.")
-        self._queue: queue.Queue[Optional[ExampleEvaluationRun]] = queue.Queue()
-        self._max_concurrent = max_concurrent
-        self._num_workers = num_workers  # Number of worker threads
-        self._worker_threads: List[threading.Thread] = []
-        self._shutdown_event = threading.Event()
-        self._api_client = JudgmentSyncClient(
-            api_key=JUDGMENT_API_KEY,
-            organization_id=JUDGMENT_ORG_ID,
-        )
-
-    def enqueue(self, evaluation_run: ExampleEvaluationRun) -> None:
-        """Add evaluation run to the queue."""
-        self._queue.put(evaluation_run)
-
-    def _process_run(self, evaluation_run: ExampleEvaluationRun) -> List[ScoringResult]:
-        """Execute evaluation run locally and return results."""
-
-        if not evaluation_run.custom_scorers:
-            raise ValueError(
-                "LocalEvaluationQueue only supports runs with local scorers (BaseScorer). "
-                "Found only ExampleAPIScorerConfig instances."
-            )
-
-        return safe_run_async(
-            a_execute_scoring(
-                evaluation_run.examples,
-                evaluation_run.custom_scorers,
-                model=evaluation_run.model,
-                throttle_value=0,
-                max_concurrent=self._max_concurrent // self._num_workers,
-                show_progress=False,
-            )
-        )
-
-    def run_all(
-        self,
-        callback: Optional[
-            Callable[[ExampleEvaluationRun, List[ScoringResult]], None]
-        ] = None,
-    ) -> None:
-        """Process all queued runs synchronously.
-
-        Args:
-            callback: Optional function called after each run with (run, results).
-        """
-        while not self._queue.empty():
-            run = self._queue.get()
-            if run is None:  # Sentinel for worker shutdown
-                self._queue.put(None)
-                break
-            results = self._process_run(run)
-            if callback:
-                callback(run, results)
-            self._queue.task_done()
-
-    def start_workers(
-        self,
-    ) -> List[threading.Thread]:
-        """Start multiple background threads to process runs in parallel.
-        Returns:
-            List of started worker threads.
-        """
-
-        def _worker(worker_id: int) -> None:
-            while not self._shutdown_event.is_set():
-                try:
-                    # Use timeout so workers can check shutdown event periodically
-                    run = self._queue.get(timeout=1.0)
-                    if run is None:  # Sentinel to stop worker
-                        # Put sentinel back for other workers
-                        self._queue.put(None)
-                        self._queue.task_done()
-                        break
-
-                    try:
-                        results = self._process_run(run)
-                        results_dict = [result.model_dump() for result in results]
-                        self._api_client.log_eval_results(
-                            payload={"results": results_dict, "run": run.model_dump()}
-                        )
-                    except Exception as exc:
-                        judgeval_logger.error(
-                            f"Worker {worker_id} error processing {run.eval_name}: {exc}"
-                        )
-                        # Continue processing other runs instead of shutting down all workers
-                    finally:
-                        self._queue.task_done()
-
-                except queue.Empty:
-                    # Timeout - check shutdown event and continue
-                    continue
-
-        # Start worker threads
-        for i in range(self._num_workers):
-            thread = threading.Thread(target=_worker, args=(i,), daemon=True)
-            thread.start()
-            self._worker_threads.append(thread)
-
-        return self._worker_threads
-
-    def start_worker(
-        self,
-        callback: Optional[
-            Callable[[ExampleEvaluationRun, List[ScoringResult]], None]
-        ] = None,
-    ) -> Optional[threading.Thread]:
-        """Start a single background thread to process runs (backward compatibility).
-
-        Args:
-            callback: Optional function called after each run with (run, results).
-
-        Returns:
-            The started thread, or None if no threads were started.
-        """
-        threads = self.start_workers()
-        return threads[0] if threads else None
-
-    def wait_for_completion(self, timeout: Optional[float] = None) -> bool:
-        """Wait for all queued tasks to complete.
-
-        Args:
-            timeout: Maximum time to wait in seconds. None means wait indefinitely.
-
-        Returns:
-            True if all tasks completed, False if timeout occurred.
-        """
-        try:
-            if timeout is None:
-                self._queue.join()
-                return True
-            else:
-                start_time = time.time()
-                while not self._queue.empty() or self._queue.unfinished_tasks > 0:
-                    if time.time() - start_time > timeout:
-                        return False
-                    time.sleep(0.1)
-                return True
-        except Exception:
-            return False
-
-    def stop_workers(self) -> None:
-        """Signal all background workers to stop after current tasks complete."""
-        if not self._worker_threads:
-            return
-
-        # Signal shutdown
-        self._shutdown_event.set()
-
-        # Send sentinel to wake up any blocking workers
-        for _ in range(self._num_workers):
-            self._queue.put(None)
-
-        # Wait for all workers to finish with timeout
-        for thread in self._worker_threads:
-            if thread.is_alive():
-                thread.join(timeout=5.0)
-                if thread.is_alive():
-                    judgeval_logger.warning(
-                        f"Worker thread {thread.name} did not shut down gracefully"
-                    )
-
-        self._worker_threads.clear()
-        self._shutdown_event.clear()
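
For reference, the removed queue was driven roughly as below; my_run is a placeholder for an ExampleEvaluationRun built with examples and local BaseScorer scorers (API-scorer-only runs raised ValueError in _process_run):

# Illustrative only: driving the now-removed LocalEvaluationQueue from 0.16.7.
# `my_run` is a placeholder; constructing an ExampleEvaluationRun requires
# examples and BaseScorer instances not shown here.
from judgeval.tracer.local_eval_queue import LocalEvaluationQueue  # removed in 0.16.9

local_queue = LocalEvaluationQueue(num_workers=2)
local_queue.enqueue(my_run)

local_queue.start_workers()  # daemon threads score runs and upload results
if not local_queue.wait_for_completion(timeout=60.0):
    print("evaluations still pending after 60s")
local_queue.stop_workers()  # sets the shutdown event, sends sentinels, joins threads

# Alternatively, everything can run on the calling thread:
# local_queue.run_all(callback=lambda run, results: print(run.eval_name, results))

Note the asymmetry in the removed code: the background worker path logged results through JudgmentSyncClient.log_eval_results, while run_all instead handed results to the optional callback on the calling thread.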