judgeval 0.16.7__py3-none-any.whl → 0.16.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of judgeval might be problematic.
- judgeval/api/api_types.py +1 -2
- judgeval/data/judgment_types.py +1 -2
- judgeval/tracer/__init__.py +7 -52
- judgeval/tracer/llm/config.py +12 -44
- judgeval/tracer/llm/constants.py +0 -1
- judgeval/tracer/llm/llm_anthropic/config.py +3 -17
- judgeval/tracer/llm/llm_anthropic/messages.py +440 -0
- judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
- judgeval/tracer/llm/llm_anthropic/wrapper.py +40 -621
- judgeval/tracer/llm/llm_google/__init__.py +3 -0
- judgeval/tracer/llm/llm_google/config.py +3 -21
- judgeval/tracer/llm/llm_google/generate_content.py +125 -0
- judgeval/tracer/llm/llm_google/wrapper.py +19 -454
- judgeval/tracer/llm/llm_openai/beta_chat_completions.py +192 -0
- judgeval/tracer/llm/llm_openai/chat_completions.py +437 -0
- judgeval/tracer/llm/llm_openai/config.py +3 -29
- judgeval/tracer/llm/llm_openai/responses.py +444 -0
- judgeval/tracer/llm/llm_openai/wrapper.py +43 -641
- judgeval/tracer/llm/llm_together/__init__.py +3 -0
- judgeval/tracer/llm/llm_together/chat_completions.py +398 -0
- judgeval/tracer/llm/llm_together/config.py +3 -20
- judgeval/tracer/llm/llm_together/wrapper.py +34 -485
- judgeval/tracer/llm/providers.py +4 -48
- judgeval/utils/decorators/dont_throw.py +30 -14
- judgeval/utils/wrappers/README.md +3 -0
- judgeval/utils/wrappers/__init__.py +15 -0
- judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
- judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
- judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
- judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
- judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
- judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
- judgeval/utils/wrappers/utils.py +35 -0
- judgeval/version.py +1 -1
- {judgeval-0.16.7.dist-info → judgeval-0.16.8.dist-info}/METADATA +1 -1
- {judgeval-0.16.7.dist-info → judgeval-0.16.8.dist-info}/RECORD +40 -27
- judgeval/tracer/llm/llm_groq/config.py +0 -23
- judgeval/tracer/llm/llm_groq/wrapper.py +0 -498
- judgeval/tracer/local_eval_queue.py +0 -199
- /judgeval/{tracer/llm/llm_groq/__init__.py → utils/wrappers/py.typed} +0 -0
- {judgeval-0.16.7.dist-info → judgeval-0.16.8.dist-info}/WHEEL +0 -0
- {judgeval-0.16.7.dist-info → judgeval-0.16.8.dist-info}/entry_points.txt +0 -0
- {judgeval-0.16.7.dist-info → judgeval-0.16.8.dist-info}/licenses/LICENSE.md +0 -0
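
The headline change in 0.16.8 is the new judgeval/utils/wrappers module, whose immutable_wrap_* helpers appear to replace the ad-hoc setattr patching done inside each provider wrapper (the removed Groq wrapper below shows the old pattern). judgeval's actual signatures are not visible in this diff, so the sketch below is only an illustration of the immutable-wrap idea: build a new callable around the original instead of mutating the client.

# Illustrative sketch only -- the hook names and signatures here are
# assumptions, not judgeval's real utils/wrappers API.
import functools
from typing import Any, Callable, Dict, Optional, Tuple

def immutable_wrap_sync(
    func: Callable[..., Any],
    before: Optional[Callable[[Tuple[Any, ...], Dict[str, Any]], None]] = None,
    after: Optional[Callable[[Any], None]] = None,
) -> Callable[..., Any]:
    """Return a new callable running hooks around func; func itself is untouched."""

    @functools.wraps(func)
    def wrapper(*args: Any, **kwargs: Any) -> Any:
        if before:
            before(args, kwargs)  # e.g. open a span and record the prompt
        result = func(*args, **kwargs)
        if after:
            after(result)  # e.g. record token usage and close the span
        return result

    return wrapper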
judgeval/tracer/llm/llm_groq/wrapper.py
@@ -1,498 +0,0 @@
-from __future__ import annotations
-import functools
-from typing import (
-    TYPE_CHECKING,
-    Any,
-    Callable,
-    Optional,
-    Protocol,
-    Tuple,
-    Union,
-    Iterator,
-    AsyncIterator,
-    Sequence,
-    runtime_checkable,
-)
-
-from judgeval.tracer.llm.llm_groq.config import (
-    groq_Groq,
-    groq_AsyncGroq,
-)
-from judgeval.tracer.managers import sync_span_context, async_span_context
-from judgeval.logger import judgeval_logger
-from judgeval.tracer.keys import AttributeKeys
-from judgeval.tracer.utils import set_span_attribute
-from judgeval.utils.serialize import safe_serialize
-
-if TYPE_CHECKING:
-    from judgeval.tracer import Tracer
-    from opentelemetry.trace import Span
-
-# Keep the original client type for runtime compatibility
-GroqClientType = Union[groq_Groq, groq_AsyncGroq]
-
-
-# Usage protocols
-@runtime_checkable
-class GroqPromptTokensDetails(Protocol):
-    cached_tokens: Optional[int]
-
-
-@runtime_checkable
-class GroqUsage(Protocol):
-    prompt_tokens: Optional[int]
-    completion_tokens: Optional[int]
-    total_tokens: Optional[int]
-    prompt_tokens_details: Optional[GroqPromptTokensDetails]
-
-
-# Message protocols
-@runtime_checkable
-class GroqMessage(Protocol):
-    content: Optional[str]
-    role: str
-
-
-@runtime_checkable
-class GroqChoice(Protocol):
-    index: int
-    message: GroqMessage
-    finish_reason: Optional[str]
-
-
-@runtime_checkable
-class GroqChatCompletion(Protocol):
-    id: str
-    object: str
-    created: int
-    model: str
-    choices: Sequence[GroqChoice]
-    usage: Optional[GroqUsage]
-
-
-# Stream protocols
-@runtime_checkable
-class GroqStreamDelta(Protocol):
-    content: Optional[str]
-
-
-@runtime_checkable
-class GroqStreamChoice(Protocol):
-    index: int
-    delta: GroqStreamDelta
-
-
-@runtime_checkable
-class GroqStreamChunk(Protocol):
-    choices: Sequence[GroqStreamChoice]
-    usage: Optional[GroqUsage]
-
-
-# Client protocols
-@runtime_checkable
-class GroqClient(Protocol):
-    pass
-
-
-@runtime_checkable
-class GroqAsyncClient(Protocol):
-    pass
-
-
-# Union types
-GroqResponseType = GroqChatCompletion
-GroqStreamType = Union[Iterator[GroqStreamChunk], AsyncIterator[GroqStreamChunk]]
-
-
-def _extract_groq_content(chunk: GroqStreamChunk) -> str:
-    if chunk.choices and len(chunk.choices) > 0:
-        delta_content = chunk.choices[0].delta.content
-        if delta_content:
-            return delta_content
-    return ""
-
-
-def _extract_groq_tokens(usage_data: GroqUsage) -> Tuple[int, int, int, int]:
-    prompt_tokens = usage_data.prompt_tokens or 0
-    completion_tokens = usage_data.completion_tokens or 0
-    cache_read_input_tokens = 0
-    if (
-        hasattr(usage_data, "prompt_tokens_details")
-        and usage_data.prompt_tokens_details
-        and hasattr(usage_data.prompt_tokens_details, "cached_tokens")
-        and usage_data.prompt_tokens_details.cached_tokens is not None
-    ):
-        cache_read_input_tokens = usage_data.prompt_tokens_details.cached_tokens
-    cache_creation_input_tokens = 0  # Groq doesn't have cache creation tokens
-    return (
-        prompt_tokens,
-        completion_tokens,
-        cache_read_input_tokens,
-        cache_creation_input_tokens,
-    )
-
-
-def _format_groq_output(
-    response: GroqChatCompletion,
-) -> Tuple[Optional[Union[str, list[dict[str, Any]]]], Optional[GroqUsage]]:
-    message_content: Optional[Union[str, list[dict[str, Any]]]] = None
-    usage_data: Optional[GroqUsage] = None
-
-    try:
-        if isinstance(response, GroqChatCompletion):
-            usage_data = response.usage
-            if response.choices and len(response.choices) > 0:
-                content = response.choices[0].message.content
-                if content:
-                    # Return structured data for consistency with other providers
-                    message_content = [{"type": "text", "text": str(content)}]
-    except (AttributeError, IndexError, TypeError):
-        pass
-
-    return message_content, usage_data
-
-
-class TracedGroqGenerator:
-    def __init__(
-        self,
-        tracer: Tracer,
-        generator: Iterator[GroqStreamChunk],
-        client: GroqClientType,
-        span: Span,
-        model_name: str,
-    ):
-        self.tracer = tracer
-        self.generator = generator
-        self.client = client
-        self.span = span
-        self.model_name = model_name
-        self.accumulated_content = ""
-
-    def __iter__(self) -> Iterator[GroqStreamChunk]:
-        return self
-
-    def __next__(self) -> GroqStreamChunk:
-        try:
-            chunk = next(self.generator)
-            content = _extract_groq_content(chunk)
-            if content:
-                self.accumulated_content += content
-            if chunk.usage:
-                prompt_tokens, completion_tokens, cache_read, cache_creation = (
-                    _extract_groq_tokens(chunk.usage)
-                )
-                set_span_attribute(
-                    self.span, AttributeKeys.GEN_AI_USAGE_INPUT_TOKENS, prompt_tokens
-                )
-                set_span_attribute(
-                    self.span,
-                    AttributeKeys.GEN_AI_USAGE_OUTPUT_TOKENS,
-                    completion_tokens,
-                )
-                set_span_attribute(
-                    self.span,
-                    AttributeKeys.GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS,
-                    cache_read,
-                )
-                set_span_attribute(
-                    self.span,
-                    AttributeKeys.JUDGMENT_USAGE_METADATA,
-                    safe_serialize(chunk.usage),
-                )
-            return chunk
-        except StopIteration:
-            set_span_attribute(
-                self.span, AttributeKeys.GEN_AI_COMPLETION, self.accumulated_content
-            )
-            self.span.end()
-            raise
-        except Exception as e:
-            if self.span:
-                self.span.record_exception(e)
-                self.span.end()
-            raise
-
-
-class TracedGroqAsyncGenerator:
-    def __init__(
-        self,
-        tracer: Tracer,
-        async_generator: AsyncIterator[GroqStreamChunk],
-        client: GroqClientType,
-        span: Span,
-        model_name: str,
-    ):
-        self.tracer = tracer
-        self.async_generator = async_generator
-        self.client = client
-        self.span = span
-        self.model_name = model_name
-        self.accumulated_content = ""
-
-    def __aiter__(self) -> AsyncIterator[GroqStreamChunk]:
-        return self
-
-    async def __anext__(self) -> GroqStreamChunk:
-        try:
-            chunk = await self.async_generator.__anext__()
-            content = _extract_groq_content(chunk)
-            if content:
-                self.accumulated_content += content
-            if chunk.usage:
-                prompt_tokens, completion_tokens, cache_read, cache_creation = (
-                    _extract_groq_tokens(chunk.usage)
-                )
-                set_span_attribute(
-                    self.span, AttributeKeys.GEN_AI_USAGE_INPUT_TOKENS, prompt_tokens
-                )
-                set_span_attribute(
-                    self.span,
-                    AttributeKeys.GEN_AI_USAGE_OUTPUT_TOKENS,
-                    completion_tokens,
-                )
-                set_span_attribute(
-                    self.span,
-                    AttributeKeys.GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS,
-                    cache_read,
-                )
-                set_span_attribute(
-                    self.span,
-                    AttributeKeys.JUDGMENT_USAGE_METADATA,
-                    safe_serialize(chunk.usage),
-                )
-            return chunk
-        except StopAsyncIteration:
-            set_span_attribute(
-                self.span, AttributeKeys.GEN_AI_COMPLETION, self.accumulated_content
-            )
-            self.span.end()
-            raise
-        except Exception as e:
-            if self.span:
-                self.span.record_exception(e)
-                self.span.end()
-            raise
-
-
-def wrap_groq_client(tracer: Tracer, client: GroqClientType) -> GroqClientType:
-    def wrapped(function: Callable, span_name: str):
-        @functools.wraps(function)
-        def wrapper(*args, **kwargs):
-            if kwargs.get("stream", False):
-                span = tracer.get_tracer().start_span(
-                    span_name, attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
-                )
-                tracer.add_agent_attributes_to_span(span)
-                set_span_attribute(
-                    span, AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
-                )
-                model_name = kwargs.get("model", "")
-                set_span_attribute(span, AttributeKeys.GEN_AI_REQUEST_MODEL, model_name)
-                stream_response = function(*args, **kwargs)
-                return TracedGroqGenerator(
-                    tracer, stream_response, client, span, model_name
-                )
-            else:
-                with sync_span_context(
-                    tracer, span_name, {AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
-                ) as span:
-                    try:
-                        tracer.add_agent_attributes_to_span(span)
-                        set_span_attribute(
-                            span, AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
-                        )
-                        model_name = kwargs.get("model", "")
-                        # Add groq/ prefix for server-side cost calculation
-                        prefixed_model_name = f"groq/{model_name}" if model_name else ""
-                        set_span_attribute(
-                            span,
-                            AttributeKeys.GEN_AI_REQUEST_MODEL,
-                            prefixed_model_name,
-                        )
-                    except Exception as e:
-                        judgeval_logger.error(
-                            f"[groq wrapped] Error adding span metadata: {e}"
-                        )
-
-                    response = function(*args, **kwargs)
-
-                    try:
-                        if isinstance(response, GroqChatCompletion):
-                            output, usage_data = _format_groq_output(response)
-                            # Serialize structured data to JSON for span attribute
-                            if output:
-                                if isinstance(output, list):
-                                    output_str = safe_serialize(output)
-                                else:
-                                    output_str = str(output)
-                                set_span_attribute(
-                                    span, AttributeKeys.GEN_AI_COMPLETION, output_str
-                                )
-                            if usage_data:
-                                (
-                                    prompt_tokens,
-                                    completion_tokens,
-                                    cache_read,
-                                    cache_creation,
-                                ) = _extract_groq_tokens(usage_data)
-                                set_span_attribute(
-                                    span,
-                                    AttributeKeys.GEN_AI_USAGE_INPUT_TOKENS,
-                                    prompt_tokens,
-                                )
-                                set_span_attribute(
-                                    span,
-                                    AttributeKeys.GEN_AI_USAGE_OUTPUT_TOKENS,
-                                    completion_tokens,
-                                )
-                                set_span_attribute(
-                                    span,
-                                    AttributeKeys.GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS,
-                                    cache_read,
-                                )
-                                set_span_attribute(
-                                    span,
-                                    AttributeKeys.JUDGMENT_USAGE_METADATA,
-                                    safe_serialize(usage_data),
-                                )
-                            # Add groq/ prefix to response model for server-side cost calculation
-                            response_model = getattr(response, "model", model_name)
-                            prefixed_response_model = (
-                                f"groq/{response_model}" if response_model else ""
-                            )
-                            set_span_attribute(
-                                span,
-                                AttributeKeys.GEN_AI_RESPONSE_MODEL,
-                                prefixed_response_model,
-                            )
-                    except Exception as e:
-                        judgeval_logger.error(
-                            f"[groq wrapped] Error adding span metadata: {e}"
-                        )
-                    finally:
-                        return response
-
-        return wrapper
-
-    def wrapped_async(function: Callable, span_name: str):
-        @functools.wraps(function)
-        async def wrapper(*args, **kwargs):
-            if kwargs.get("stream", False):
-                span = tracer.get_tracer().start_span(
-                    span_name, attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
-                )
-                tracer.add_agent_attributes_to_span(span)
-                set_span_attribute(
-                    span, AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
-                )
-                model_name = kwargs.get("model", "")
-                # Add groq/ prefix for server-side cost calculation
-                prefixed_model_name = f"groq/{model_name}" if model_name else ""
-                set_span_attribute(
-                    span, AttributeKeys.GEN_AI_REQUEST_MODEL, prefixed_model_name
-                )
-                stream_response = await function(*args, **kwargs)
-                return TracedGroqAsyncGenerator(
-                    tracer, stream_response, client, span, model_name
-                )
-            else:
-                async with async_span_context(
-                    tracer, span_name, {AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
-                ) as span:
-                    try:
-                        tracer.add_agent_attributes_to_span(span)
-                        set_span_attribute(
-                            span, AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
-                        )
-                        model_name = kwargs.get("model", "")
-                        # Add groq/ prefix for server-side cost calculation
-                        prefixed_model_name = f"groq/{model_name}" if model_name else ""
-                        set_span_attribute(
-                            span,
-                            AttributeKeys.GEN_AI_REQUEST_MODEL,
-                            prefixed_model_name,
-                        )
-                    except Exception as e:
-                        judgeval_logger.error(
-                            f"[groq wrapped_async] Error adding span metadata: {e}"
-                        )
-
-                    response = await function(*args, **kwargs)
-
-                    try:
-                        if isinstance(response, GroqChatCompletion):
-                            output, usage_data = _format_groq_output(response)
-                            # Serialize structured data to JSON for span attribute
-                            if output:
-                                if isinstance(output, list):
-                                    output_str = safe_serialize(output)
-                                else:
-                                    output_str = str(output)
-                                set_span_attribute(
-                                    span, AttributeKeys.GEN_AI_COMPLETION, output_str
-                                )
-                            if usage_data:
-                                (
-                                    prompt_tokens,
-                                    completion_tokens,
-                                    cache_read,
-                                    cache_creation,
-                                ) = _extract_groq_tokens(usage_data)
-                                set_span_attribute(
-                                    span,
-                                    AttributeKeys.GEN_AI_USAGE_INPUT_TOKENS,
-                                    prompt_tokens,
-                                )
-                                set_span_attribute(
-                                    span,
-                                    AttributeKeys.GEN_AI_USAGE_OUTPUT_TOKENS,
-                                    completion_tokens,
-                                )
-                                set_span_attribute(
-                                    span,
-                                    AttributeKeys.GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS,
-                                    cache_read,
-                                )
-                                set_span_attribute(
-                                    span,
-                                    AttributeKeys.JUDGMENT_USAGE_METADATA,
-                                    safe_serialize(usage_data),
-                                )
-                            # Add groq/ prefix to response model for server-side cost calculation
-                            response_model = getattr(response, "model", model_name)
-                            prefixed_response_model = (
-                                f"groq/{response_model}" if response_model else ""
-                            )
-                            set_span_attribute(
-                                span,
-                                AttributeKeys.GEN_AI_RESPONSE_MODEL,
-                                prefixed_response_model,
-                            )
-                    except Exception as e:
-                        judgeval_logger.error(
-                            f"[groq wrapped_async] Error adding span metadata: {e}"
-                        )
-                    finally:
-                        return response
-
-        return wrapper
-
-    span_name = "GROQ_API_CALL"
-    if groq_Groq is not None and isinstance(client, groq_Groq):
-        # Type narrowing for mypy
-        groq_client = client  # type: ignore[assignment]
-        setattr(
-            groq_client.chat.completions,
-            "create",
-            wrapped(groq_client.chat.completions.create, span_name),
-        )
-    elif groq_AsyncGroq is not None and isinstance(client, groq_AsyncGroq):
-        # Type narrowing for mypy
-        async_groq_client = client  # type: ignore[assignment]
-        setattr(
-            async_groq_client.chat.completions,
-            "create",
-            wrapped_async(async_groq_client.chat.completions.create, span_name),
-        )

-    return client
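
For reference, the removed module's entry point was wrap_groq_client, which patched chat.completions.create in place on a Groq or AsyncGroq client. A minimal usage sketch under the 0.16.7 API; the Tracer construction is an assumption, since its configuration is not part of this diff:

from groq import Groq
from judgeval.tracer import Tracer
from judgeval.tracer.llm.llm_groq.wrapper import wrap_groq_client  # removed in 0.16.8

tracer = Tracer()  # assumed default construction for illustration
client = wrap_groq_client(tracer, Groq())

# Non-streaming calls now run inside a "GROQ_API_CALL" span; streaming calls
# return a TracedGroqGenerator that ends the span when the stream is exhausted.
response = client.chat.completions.create(
    model="llama-3.1-8b-instant",  # assumed model id for illustration
    messages=[{"role": "user", "content": "Hello"}],
)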
judgeval/tracer/local_eval_queue.py
@@ -1,199 +0,0 @@
-"""Local evaluation queue for batching custom scorer evaluations.
-
-This module provides a simple in-memory queue for EvaluationRun objects that contain
-only local (BaseScorer) scorers. Useful for batching evaluations and processing them
-either synchronously or in a background thread.
-"""
-
-import queue
-import threading
-from typing import Callable, List, Optional
-import time
-
-from judgeval.logger import judgeval_logger
-from judgeval.env import JUDGMENT_MAX_CONCURRENT_EVALUATIONS
-from judgeval.data import ScoringResult
-from judgeval.data.evaluation_run import ExampleEvaluationRun
-from judgeval.utils.async_utils import safe_run_async
-from judgeval.scorers.score import a_execute_scoring
-from judgeval.api import JudgmentSyncClient
-from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_ORG_ID
-
-
-class LocalEvaluationQueue:
-    """Lightweight in-memory queue for local evaluation runs.
-
-    Only supports EvaluationRuns with local scorers (BaseScorer instances).
-    API scorers (ExampleAPIScorerConfig) are not supported as they have their own queue.
-    """
-
-    def __init__(
-        self,
-        max_concurrent: int = JUDGMENT_MAX_CONCURRENT_EVALUATIONS,
-        num_workers: int = 4,
-    ):
-        if num_workers <= 0:
-            raise ValueError("num_workers must be a positive integer.")
-        self._queue: queue.Queue[Optional[ExampleEvaluationRun]] = queue.Queue()
-        self._max_concurrent = max_concurrent
-        self._num_workers = num_workers  # Number of worker threads
-        self._worker_threads: List[threading.Thread] = []
-        self._shutdown_event = threading.Event()
-        self._api_client = JudgmentSyncClient(
-            api_key=JUDGMENT_API_KEY,
-            organization_id=JUDGMENT_ORG_ID,
-        )
-
-    def enqueue(self, evaluation_run: ExampleEvaluationRun) -> None:
-        """Add evaluation run to the queue."""
-        self._queue.put(evaluation_run)
-
-    def _process_run(self, evaluation_run: ExampleEvaluationRun) -> List[ScoringResult]:
-        """Execute evaluation run locally and return results."""
-
-        if not evaluation_run.custom_scorers:
-            raise ValueError(
-                "LocalEvaluationQueue only supports runs with local scorers (BaseScorer). "
-                "Found only ExampleAPIScorerConfig instances."
-            )
-
-        return safe_run_async(
-            a_execute_scoring(
-                evaluation_run.examples,
-                evaluation_run.custom_scorers,
-                model=evaluation_run.model,
-                throttle_value=0,
-                max_concurrent=self._max_concurrent // self._num_workers,
-                show_progress=False,
-            )
-        )
-
-    def run_all(
-        self,
-        callback: Optional[
-            Callable[[ExampleEvaluationRun, List[ScoringResult]], None]
-        ] = None,
-    ) -> None:
-        """Process all queued runs synchronously.
-
-        Args:
-            callback: Optional function called after each run with (run, results).
-        """
-        while not self._queue.empty():
-            run = self._queue.get()
-            if run is None:  # Sentinel for worker shutdown
-                self._queue.put(None)
-                break
-            results = self._process_run(run)
-            if callback:
-                callback(run, results)
-            self._queue.task_done()
-
-    def start_workers(
-        self,
-    ) -> List[threading.Thread]:
-        """Start multiple background threads to process runs in parallel.
-        Returns:
-            List of started worker threads.
-        """
-
-        def _worker(worker_id: int) -> None:
-            while not self._shutdown_event.is_set():
-                try:
-                    # Use timeout so workers can check shutdown event periodically
-                    run = self._queue.get(timeout=1.0)
-                    if run is None:  # Sentinel to stop worker
-                        # Put sentinel back for other workers
-                        self._queue.put(None)
-                        self._queue.task_done()
-                        break
-
-                    try:
-                        results = self._process_run(run)
-                        results_dict = [result.model_dump() for result in results]
-                        self._api_client.log_eval_results(
-                            payload={"results": results_dict, "run": run.model_dump()}
-                        )
-                    except Exception as exc:
-                        judgeval_logger.error(
-                            f"Worker {worker_id} error processing {run.eval_name}: {exc}"
-                        )
-                        # Continue processing other runs instead of shutting down all workers
-                    finally:
-                        self._queue.task_done()
-
-                except queue.Empty:
-                    # Timeout - check shutdown event and continue
-                    continue
-
-        # Start worker threads
-        for i in range(self._num_workers):
-            thread = threading.Thread(target=_worker, args=(i,), daemon=True)
-            thread.start()
-            self._worker_threads.append(thread)
-
-        return self._worker_threads
-
-    def start_worker(
-        self,
-        callback: Optional[
-            Callable[[ExampleEvaluationRun, List[ScoringResult]], None]
-        ] = None,
-    ) -> Optional[threading.Thread]:
-        """Start a single background thread to process runs (backward compatibility).
-
-        Args:
-            callback: Optional function called after each run with (run, results).
-
-        Returns:
-            The started thread, or None if no threads were started.
-        """
-        threads = self.start_workers()
-        return threads[0] if threads else None
-
-    def wait_for_completion(self, timeout: Optional[float] = None) -> bool:
-        """Wait for all queued tasks to complete.
-
-        Args:
-            timeout: Maximum time to wait in seconds. None means wait indefinitely.
-
-        Returns:
-            True if all tasks completed, False if timeout occurred.
-        """
-        try:
-            if timeout is None:
-                self._queue.join()
-                return True
-            else:
-                start_time = time.time()
-                while not self._queue.empty() or self._queue.unfinished_tasks > 0:
-                    if time.time() - start_time > timeout:
-                        return False
-                    time.sleep(0.1)
-                return True
-        except Exception:
-            return False
-
-    def stop_workers(self) -> None:
-        """Signal all background workers to stop after current tasks complete."""
-        if not self._worker_threads:
-            return
-
-        # Signal shutdown
-        self._shutdown_event.set()
-
-        # Send sentinel to wake up any blocking workers
-        for _ in range(self._num_workers):
-            self._queue.put(None)
-
-        # Wait for all workers to finish with timeout
-        for thread in self._worker_threads:
-            if thread.is_alive():
-                thread.join(timeout=5.0)
-                if thread.is_alive():
-                    judgeval_logger.warning(
-                        f"Worker thread {thread.name} did not shut down gracefully"
-                    )
-
-        self._worker_threads.clear()
-        self._shutdown_event.clear()
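
The removed queue exposed a small lifecycle API: enqueue, run_all or start_workers, wait_for_completion, and stop_workers. A minimal usage sketch built only from the methods shown above; constructing the ExampleEvaluationRun objects is elided:

from judgeval.tracer.local_eval_queue import LocalEvaluationQueue  # removed in 0.16.8

local_queue = LocalEvaluationQueue(num_workers=2)
for run in runs:  # assumed: ExampleEvaluationRun instances with custom_scorers set
    local_queue.enqueue(run)

local_queue.start_workers()  # daemon threads score runs and upload results
done = local_queue.wait_for_completion(timeout=60.0)  # False if the timeout elapsed
local_queue.stop_workers()  # sentinel None values unblock and stop the workers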