judgeval 0.14.1__py3-none-any.whl → 0.16.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/api/__init__.py +4 -40
- judgeval/api/api_types.py +34 -27
- judgeval/data/judgment_types.py +39 -35
- judgeval/dataset/__init__.py +1 -1
- judgeval/evaluation/__init__.py +9 -21
- judgeval/integrations/openlit/__init__.py +50 -0
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +25 -2
- judgeval/tracer/__init__.py +4 -0
- judgeval/tracer/llm/__init__.py +36 -4
- judgeval/version.py +1 -1
- judgeval-0.16.0.dist-info/METADATA +266 -0
- {judgeval-0.14.1.dist-info → judgeval-0.16.0.dist-info}/RECORD +15 -14
- judgeval-0.14.1.dist-info/METADATA +0 -158
- {judgeval-0.14.1.dist-info → judgeval-0.16.0.dist-info}/WHEEL +0 -0
- {judgeval-0.14.1.dist-info → judgeval-0.16.0.dist-info}/entry_points.txt +0 -0
- {judgeval-0.14.1.dist-info → judgeval-0.16.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/api/__init__.py
CHANGED
@@ -73,7 +73,7 @@ class JudgmentSyncClient:

    def evaluate_examples(
        self, payload: ExampleEvaluationRun, stream: Optional[str] = None
-    ) ->
+    ) -> EvaluateResponse:
        query_params = {}
        if stream is not None:
            query_params["stream"] = stream

@@ -86,7 +86,7 @@ class JudgmentSyncClient:

    def evaluate_traces(
        self, payload: TraceEvaluationRun, stream: Optional[str] = None
-    ) ->
+    ) -> EvaluateResponse:
        query_params = {}
        if stream is not None:
            query_params["stream"] = stream

@@ -111,16 +111,6 @@ class JudgmentSyncClient:
            payload,
        )

-    def get_evaluation_status(self, experiment_run_id: str, project_name: str) -> Any:
-        query_params = {}
-        query_params["experiment_run_id"] = experiment_run_id
-        query_params["project_name"] = project_name
-        return self._request(
-            "GET",
-            url_for("/get_evaluation_status/"),
-            query_params,
-        )
-
    def datasets_insert_examples_for_judgeval(
        self, payload: DatasetInsertExamples
    ) -> Any:

@@ -222,13 +212,6 @@ class JudgmentSyncClient:
            payload,
        )

-    def e2e_fetch_trace_scorer_span_score(self, payload: SpanScoreRequest) -> Any:
-        return self._request(
-            "POST",
-            url_for("/e2e_fetch_trace_scorer_span_score/"),
-            payload,
-        )
-

class JudgmentAsyncClient:
    __slots__ = ("api_key", "organization_id", "client")

@@ -280,7 +263,7 @@ class JudgmentAsyncClient:

    async def evaluate_examples(
        self, payload: ExampleEvaluationRun, stream: Optional[str] = None
-    ) ->
+    ) -> EvaluateResponse:
        query_params = {}
        if stream is not None:
            query_params["stream"] = stream

@@ -293,7 +276,7 @@ class JudgmentAsyncClient:

    async def evaluate_traces(
        self, payload: TraceEvaluationRun, stream: Optional[str] = None
-    ) ->
+    ) -> EvaluateResponse:
        query_params = {}
        if stream is not None:
            query_params["stream"] = stream

@@ -318,18 +301,6 @@ class JudgmentAsyncClient:
            payload,
        )

-    async def get_evaluation_status(
-        self, experiment_run_id: str, project_name: str
-    ) -> Any:
-        query_params = {}
-        query_params["experiment_run_id"] = experiment_run_id
-        query_params["project_name"] = project_name
-        return await self._request(
-            "GET",
-            url_for("/get_evaluation_status/"),
-            query_params,
-        )
-
    async def datasets_insert_examples_for_judgeval(
        self, payload: DatasetInsertExamples
    ) -> Any:

@@ -433,13 +404,6 @@ class JudgmentAsyncClient:
            payload,
        )

-    async def e2e_fetch_trace_scorer_span_score(self, payload: SpanScoreRequest) -> Any:
-        return await self._request(
-            "POST",
-            url_for("/e2e_fetch_trace_scorer_span_score/"),
-            payload,
-        )
-

__all__ = [
    "JudgmentSyncClient",
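With this change, `evaluate_examples` and `evaluate_traces` return a typed `EvaluateResponse` on both clients, and the `get_evaluation_status` / `e2e_fetch_trace_scorer_span_score` endpoints are removed. A minimal sketch of consuming the typed response; the payload construction is left to the caller, and the field accesses follow the `EvaluateResponse` and `UsageInfo` TypedDicts added in `api_types.py` below:

```python
from judgeval.api import JudgmentSyncClient
from judgeval.api.api_types import EvaluateResponse, ExampleEvaluationRun


def run_and_report(client: JudgmentSyncClient, payload: ExampleEvaluationRun) -> None:
    # In 0.16.0 the return value is typed as EvaluateResponse rather than left untyped.
    response: EvaluateResponse = client.evaluate_examples(payload)
    print(response["status"])
    for result in response["results"]:        # List[ScoringResult]
        print(result)
    usage = response.get("resource_usage")    # Optional[UsageInfo]; may be absent
    if usage is not None:
        print(usage["remaining_regular"], "judgees remaining")
```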
judgeval/api/api_types.py
CHANGED
@@ -1,6 +1,6 @@
# generated by datamodel-codegen:
# filename: .openapi.json
-# timestamp: 2025-
+# timestamp: 2025-10-07T20:43:52+00:00

from __future__ import annotations
from typing import Any, Dict, List, Literal, Optional, TypedDict, Union

@@ -24,6 +24,15 @@ class DatasetsFetch(TypedDict):
    project_name: str


+class DatasetsTableRow(TypedDict):
+    dataset_id: str
+    name: str
+    created_at: str
+    kind: Literal["trace", "example"]
+    entries: int
+    creator: str
+
+
class ProjectAdd(TypedDict):
    project_name: str


@@ -137,6 +146,14 @@ class ValidationError(TypedDict):
    type: str


+class UsageInfo(TypedDict):
+    total_judgees: int
+    regular_use: int
+    pay_as_you_go_use: int
+    remaining_regular: int
+    remaining_after: int
+
+
DatasetKind = Literal["trace", "example"]


@@ -180,18 +197,10 @@ class OtelTraceSpan(TypedDict):
    resource_attributes: NotRequired[Optional[Dict[str, Any]]]
    span_attributes: NotRequired[Optional[Dict[str, Any]]]
    duration: NotRequired[Optional[int]]
-    status_code: NotRequired[Optional[
+    status_code: NotRequired[Optional[int]]
    status_message: NotRequired[Optional[str]]
    events: NotRequired[Optional[List[Dict[str, Any]]]]
    links: NotRequired[Optional[List[Dict[str, Any]]]]
-    legacy_span_id: NotRequired[Optional[str]]
-    inputs: NotRequired[Optional[Dict[str, Any]]]
-    output: Any
-    error: NotRequired[Optional[Dict[str, Any]]]
-    agent_id: NotRequired[Optional[str]]
-    cumulative_llm_cost: NotRequired[Optional[float]]
-    state_after: NotRequired[Optional[Dict[str, Any]]]
-    state_before: NotRequired[Optional[Dict[str, Any]]]


class OtelSpanListItemScores(TypedDict):

@@ -206,7 +215,7 @@ class OtelSpanDetailScores(TypedDict):
    score: float
    reason: NotRequired[Optional[str]]
    name: str
-
+    example_id: NotRequired[Optional[str]]


class ExampleEvaluationRun(TypedDict):

@@ -244,15 +253,6 @@ class DatasetInsertExamples(TypedDict):
    project_name: str


-class DatasetInfo(TypedDict):
-    dataset_id: str
-    name: str
-    created_at: str
-    dataset_kind: DatasetKind
-    entries: int
-    creator: str
-
-
class DatasetCreate(TypedDict):
    name: str
    dataset_kind: DatasetKind

@@ -279,16 +279,17 @@ class OtelTraceListItem(TypedDict):
    organization_id: str
    project_id: str
    trace_id: str
-
+    created_at: str
    duration: NotRequired[Optional[int]]
-    has_notification: NotRequired[Optional[bool]]
    tags: NotRequired[Optional[List[str]]]
    experiment_run_id: NotRequired[Optional[str]]
    span_name: NotRequired[Optional[str]]
-
-    error: NotRequired[
+    llm_cost: NotRequired[Optional[float]]
+    error: NotRequired[str]
    scores: NotRequired[List[OtelSpanListItemScores]]
    customer_id: NotRequired[Optional[str]]
+    input: NotRequired[Optional[str]]
+    output: NotRequired[Optional[str]]
    input_preview: NotRequired[Optional[str]]
    output_preview: NotRequired[Optional[str]]
    annotation_count: NotRequired[int]

@@ -310,9 +311,9 @@ class OtelSpanDetail(TypedDict):
    resource_attributes: NotRequired[Optional[Dict[str, Any]]]
    span_attributes: NotRequired[Optional[Dict[str, Any]]]
    duration: NotRequired[Optional[int]]
-    status_code: NotRequired[Optional[
+    status_code: NotRequired[Optional[int]]
    status_message: NotRequired[Optional[str]]
-    events: NotRequired[Optional[
+    events: NotRequired[Optional[List[Dict[str, Any]]]]
    links: NotRequired[Optional[Union[List[Dict[str, Any]], Dict[str, Any]]]]
    llm_cost: NotRequired[Optional[float]]
    prompt_tokens: NotRequired[Optional[int]]

@@ -320,6 +321,12 @@ class OtelSpanDetail(TypedDict):
    scores: NotRequired[Optional[List[OtelSpanDetailScores]]]


+class EvaluateResponse(TypedDict):
+    status: str
+    results: List[ScoringResult]
+    resource_usage: NotRequired[Optional[UsageInfo]]
+
+
class EvalResults(TypedDict):
    results: List[ScoringResult]
    run: Union[ExampleEvaluationRun, TraceEvaluationRun]

@@ -335,5 +342,5 @@ class DatasetReturn(TypedDict):
    name: str
    project_name: str
    dataset_kind: DatasetKind
-    examples: NotRequired[
+    examples: NotRequired[List[Example]]
    traces: NotRequired[Optional[List[DatasetTraceWithSpans]]]
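The new `DatasetsTableRow` and `UsageInfo` TypedDicts are typing-only constructs, so they can annotate data without adding runtime dependencies. A small illustrative sketch; the sample row values below are made up:

```python
from typing import List

from judgeval.api.api_types import DatasetsTableRow


def summarize_datasets(rows: List[DatasetsTableRow]) -> str:
    # One line per dataset: name, kind, entry count, and creation timestamp.
    return "\n".join(
        f"{row['name']} ({row['kind']}): {row['entries']} entries, created {row['created_at']}"
        for row in rows
    )


sample_row: DatasetsTableRow = {
    "dataset_id": "ds_123",                  # made-up values for illustration
    "name": "support-conversations",
    "created_at": "2025-10-07T00:00:00Z",
    "kind": "example",
    "entries": 42,
    "creator": "alice@example.com",
}
print(summarize_datasets([sample_row]))
```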
judgeval/data/judgment_types.py
CHANGED
@@ -1,6 +1,6 @@
# generated by datamodel-codegen:
# filename: .openapi.json
-# timestamp: 2025-
+# timestamp: 2025-10-07T20:43:51+00:00

from __future__ import annotations
from typing import Annotated, Any, Dict, List, Optional, Union

@@ -26,6 +26,20 @@ class DatasetsFetch(BaseModel):
    project_name: Annotated[str, Field(title="Project Name")]


+class Kind(Enum):
+    trace = "trace"
+    example = "example"
+
+
+class DatasetsTableRow(BaseModel):
+    dataset_id: Annotated[str, Field(title="Dataset Id")]
+    name: Annotated[str, Field(title="Name")]
+    created_at: Annotated[str, Field(title="Created At")]
+    kind: Annotated[Kind, Field(title="Kind")]
+    entries: Annotated[int, Field(title="Entries")]
+    creator: Annotated[str, Field(title="Creator")]
+
+
class ProjectAdd(BaseModel):
    project_name: Annotated[str, Field(title="Project Name")]


@@ -148,6 +162,14 @@ class ValidationError(BaseModel):
    type: Annotated[str, Field(title="Error Type")]


+class UsageInfo(BaseModel):
+    total_judgees: Annotated[int, Field(title="Total Judgees")]
+    regular_use: Annotated[int, Field(title="Regular Use")]
+    pay_as_you_go_use: Annotated[int, Field(title="Pay As You Go Use")]
+    remaining_regular: Annotated[int, Field(title="Remaining Regular")]
+    remaining_after: Annotated[int, Field(title="Remaining After")]
+
+
class DatasetKind(Enum):
    trace = "trace"
    example = "example"

@@ -199,22 +221,10 @@ class OtelTraceSpan(BaseModel):
        Optional[Dict[str, Any]], Field(title="Span Attributes")
    ] = None
    duration: Annotated[Optional[int], Field(title="Duration")] = None
-    status_code: Annotated[Optional[
+    status_code: Annotated[Optional[int], Field(title="Status Code")] = None
    status_message: Annotated[Optional[str], Field(title="Status Message")] = None
    events: Annotated[Optional[List[Dict[str, Any]]], Field(title="Events")] = None
    links: Annotated[Optional[List[Dict[str, Any]]], Field(title="Links")] = None
-    legacy_span_id: Annotated[Optional[str], Field(title="Legacy Span Id")] = None
-    inputs: Annotated[Optional[Dict[str, Any]], Field(title="Inputs")] = None
-    output: Annotated[Any, Field(title="Output")]
-    error: Annotated[Optional[Dict[str, Any]], Field(title="Error")] = None
-    agent_id: Annotated[Optional[str], Field(title="Agent Id")] = None
-    cumulative_llm_cost: Annotated[
-        Optional[float], Field(title="Cumulative Llm Cost")
-    ] = None
-    state_after: Annotated[Optional[Dict[str, Any]], Field(title="State After")] = None
-    state_before: Annotated[Optional[Dict[str, Any]], Field(title="State Before")] = (
-        None
-    )


class OtelSpanListItemScores(BaseModel):

@@ -229,7 +239,7 @@ class OtelSpanDetailScores(BaseModel):
    score: Annotated[float, Field(title="Score")]
    reason: Annotated[Optional[str], Field(title="Reason")] = None
    name: Annotated[str, Field(title="Name")]
-
+    example_id: Annotated[Optional[str], Field(title="Example Id")] = None


class ExampleEvaluationRun(BaseModel):

@@ -277,15 +287,6 @@ class DatasetInsertExamples(BaseModel):
    project_name: Annotated[str, Field(title="Project Name")]


-class DatasetInfo(BaseModel):
-    dataset_id: Annotated[str, Field(title="Dataset Id")]
-    name: Annotated[str, Field(title="Name")]
-    created_at: Annotated[str, Field(title="Created At")]
-    dataset_kind: DatasetKind
-    entries: Annotated[int, Field(title="Entries")]
-    creator: Annotated[str, Field(title="Creator")]
-
-
class DatasetCreate(BaseModel):
    name: Annotated[str, Field(title="Name")]
    dataset_kind: DatasetKind

@@ -314,20 +315,19 @@ class OtelTraceListItem(BaseModel):
    organization_id: Annotated[str, Field(title="Organization Id")]
    project_id: Annotated[str, Field(title="Project Id")]
    trace_id: Annotated[str, Field(title="Trace Id")]
-
+    created_at: Annotated[AwareDatetime, Field(title="Created At")]
    duration: Annotated[Optional[int], Field(title="Duration")] = None
-    has_notification: Annotated[Optional[bool], Field(title="Has Notification")] = None
    tags: Annotated[Optional[List[str]], Field(title="Tags")] = None
    experiment_run_id: Annotated[Optional[str], Field(title="Experiment Run Id")] = None
    span_name: Annotated[Optional[str], Field(title="Span Name")] = None
-
-
-    ] = None
-    error: Annotated[Optional[Dict[str, Any]], Field(title="Error")] = None
+    llm_cost: Annotated[Optional[float], Field(title="Llm Cost")] = None
+    error: Annotated[Optional[str], Field(title="Error")] = ""
    scores: Annotated[
        Optional[List[OtelSpanListItemScores]], Field(title="Scores")
    ] = []
    customer_id: Annotated[Optional[str], Field(title="Customer Id")] = None
+    input: Annotated[Optional[str], Field(title="Input")] = None
+    output: Annotated[Optional[str], Field(title="Output")] = None
    input_preview: Annotated[Optional[str], Field(title="Input Preview")] = None
    output_preview: Annotated[Optional[str], Field(title="Output Preview")] = None
    annotation_count: Annotated[Optional[int], Field(title="Annotation Count")] = 0

@@ -338,7 +338,7 @@ class OtelTraceListItem(BaseModel):
class OtelSpanDetail(BaseModel):
    organization_id: Annotated[str, Field(title="Organization Id")]
    project_id: Annotated[str, Field(title="Project Id")]
-    timestamp: Annotated[
+    timestamp: Annotated[AwareDatetime, Field(title="Timestamp")]
    trace_id: Annotated[str, Field(title="Trace Id")]
    span_id: Annotated[str, Field(title="Span Id")]
    parent_span_id: Annotated[Optional[str], Field(title="Parent Span Id")] = None

@@ -353,11 +353,9 @@ class OtelSpanDetail(BaseModel):
        Optional[Dict[str, Any]], Field(title="Span Attributes")
    ] = None
    duration: Annotated[Optional[int], Field(title="Duration")] = None
-    status_code: Annotated[Optional[
+    status_code: Annotated[Optional[int], Field(title="Status Code")] = None
    status_message: Annotated[Optional[str], Field(title="Status Message")] = None
-    events: Annotated[
-        Optional[Union[List[Dict[str, Any]], Dict[str, Any]]], Field(title="Events")
-    ] = None
+    events: Annotated[Optional[List[Dict[str, Any]]], Field(title="Events")] = None
    links: Annotated[
        Optional[Union[List[Dict[str, Any]], Dict[str, Any]]], Field(title="Links")
    ] = None

@@ -369,6 +367,12 @@ class OtelSpanDetail(BaseModel):
    )


+class EvaluateResponse(BaseModel):
+    status: Annotated[str, Field(title="Status")]
+    results: Annotated[List[ScoringResult], Field(title="Results")]
+    resource_usage: Optional[UsageInfo] = None
+
+
class EvalResults(BaseModel):
    results: Annotated[List[ScoringResult], Field(title="Results")]
    run: Annotated[Union[ExampleEvaluationRun, TraceEvaluationRun], Field(title="Run")]
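The Pydantic models in `judgment_types.py` mirror the TypedDicts above, so response payloads can be validated rather than only annotated. A brief sketch, assuming a dictionary shaped like the new `EvaluateResponse` model; the sample values are illustrative only:

```python
from judgeval.data.judgment_types import EvaluateResponse

raw = {
    "status": "completed",
    "results": [],  # ScoringResult entries would appear here
    "resource_usage": {
        "total_judgees": 10,
        "regular_use": 4,
        "pay_as_you_go_use": 0,
        "remaining_regular": 6,
        "remaining_after": 6,
    },
}

# Raises pydantic.ValidationError if the payload does not match the schema.
parsed = EvaluateResponse.model_validate(raw)
print(parsed.status, parsed.resource_usage)
```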
judgeval/dataset/__init__.py
CHANGED
judgeval/evaluation/__init__.py
CHANGED
@@ -84,7 +84,7 @@ def log_evaluation_results(


def _poll_evaluation_until_complete(
    evaluation_run: ExampleEvaluationRun,
-
+    expected_examples_count: int,
    poll_interval_seconds: float = 5,
    max_failures: int = 5,
    max_poll_count: int = 60,  # This should be equivalent to 5 minutes

@@ -117,29 +117,22 @@ def _poll_evaluation_until_complete(
        poll_count += 1
        try:
            # Check status
-            status_response = api_client.get_evaluation_status(
-                experiment_run_id, project_name
-            )
-
-            if status_response.get("status") != "completed":
-                time.sleep(poll_interval_seconds)
-                continue
-
-            example_scorer_pairings = status_response.get("results", [])
-            if len(example_scorer_pairings) != expected_scorer_data_count:
-                time.sleep(poll_interval_seconds)
-                continue
-
            results_response = api_client.fetch_experiment_run(
                {
                    "experiment_run_id": experiment_run_id,
                    "project_name": project_name,
                }
            )
+
+            example_scorer_pairings = results_response.get("results", [])
+            if len(example_scorer_pairings) != expected_examples_count:
+                time.sleep(poll_interval_seconds)
+                continue
+
            url = results_response.get("ui_results_url")

            scoring_result_list = []
-            for res in
+            for res in example_scorer_pairings:
                example = res.get("data", {}).copy()
                example["example_id"] = res.get("example_id")
                scoring_result = ScoringResult(

@@ -241,14 +234,9 @@ def run_eval(
        )
        raise JudgmentRuntimeError(error_message)

-        num_scorers = (
-            len(evaluation_run.judgment_scorers)
-            if evaluation_run.judgment_scorers
-            else sum(1 for cs in evaluation_run.custom_scorers if cs.server_hosted)
-        )
        results, url = _poll_evaluation_until_complete(
            evaluation_run=evaluation_run,
-
+            expected_examples_count=len(evaluation_run.examples),
        )
    finally:
        stop_event.set()
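The poller no longer calls a separate status endpoint; it treats the run as complete once `fetch_experiment_run` returns one scored result per submitted example. A condensed sketch of that loop, simplified from the code above with the API client call passed in as a parameter:

```python
import time
from typing import Any, Callable, Dict


def wait_for_results(
    fetch_experiment_run: Callable[[Dict[str, str]], Dict[str, Any]],
    experiment_run_id: str,
    project_name: str,
    expected_examples_count: int,
    poll_interval_seconds: float = 5.0,
):
    while True:
        results_response = fetch_experiment_run(
            {"experiment_run_id": experiment_run_id, "project_name": project_name}
        )
        pairings = results_response.get("results", [])
        # Complete once every submitted example has a result attached.
        if len(pairings) == expected_examples_count:
            return pairings, results_response.get("ui_results_url")
        time.sleep(poll_interval_seconds)
```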
judgeval/integrations/openlit/__init__.py
ADDED
@@ -0,0 +1,50 @@
from abc import ABC
from judgeval.tracer import Tracer
from judgeval.logger import judgeval_logger
from judgeval.utils.url import url_for


try:
    import openlit  # type: ignore
except ImportError:
    raise ImportError(
        "Openlit is not installed and required for the openlit integration. Please install it with `pip install openlit`."
    )


class Openlit(ABC):
    @staticmethod
    def initialize(
        **kwargs,
    ):
        tracer = Tracer.get_instance()
        if not tracer or not tracer._initialized:
            raise ValueError(
                "Openlit must be initialized after the tracer has been initialized. Please create the Tracer instance first before initializing Openlit."
            )

        api_key = tracer.api_key
        organization_id = tracer.organization_id
        project_name = tracer.project_name

        project_id = Tracer._resolve_project_id(project_name, api_key, organization_id)
        if not project_id:
            judgeval_logger.warning(
                f"Project {project_name} not found. Please create it first at https://app.judgmentlabs.ai/org/{organization_id}/projects."
            )
            return

        openlit.init(
            service_name=project_name,
            otlp_endpoint=url_for("/otel"),
            otlp_headers={
                "Authorization": f"Bearer {api_key}",
                "X-Organization-Id": organization_id,
                "X-Project-Id": project_id,
            },
            tracer=tracer.get_tracer(),
            **kwargs,
        )


__all__ = ["Openlit"]
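The new integration expects an already-constructed `Tracer` singleton and points OpenLIT's OTLP export at the Judgment backend using that tracer's credentials. A minimal usage sketch; the project name is a placeholder and `openlit` must be installed separately:

```python
from judgeval.tracer import Tracer
from judgeval.integrations.openlit import Openlit

# Create the Tracer first: Openlit.initialize() reads the API key, organization id,
# and project name from this singleton and raises if it is not initialized.
judgment = Tracer(project_name="default_project")

# Extra keyword arguments are forwarded to openlit.init(...).
Openlit.initialize()
```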
judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py
CHANGED
@@ -20,6 +20,7 @@ def push_prompt_scorer(
    threshold: float,
    options: Optional[Dict[str, float]] = None,
    model: str = JUDGMENT_DEFAULT_GPT_MODEL,
+    description: Optional[str] = None,
    judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
    organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
    is_trace: bool = False,

@@ -33,6 +34,7 @@ def push_prompt_scorer(
            "threshold": threshold,
            "options": options,
            "model": model,
+            "description": description,
            "is_trace": is_trace,
        }
    )

@@ -102,6 +104,7 @@ class BasePromptScorer(ABC, APIScorerConfig):
    score_type: APIScorerType
    prompt: str
    options: Optional[Dict[str, float]] = None
+    description: Optional[str] = None
    judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or ""
    organization_id: str = os.getenv("JUDGMENT_ORG_ID") or ""

@@ -130,6 +133,7 @@ class BasePromptScorer(ABC, APIScorerConfig):
            threshold=scorer_config["threshold"],
            options=scorer_config.get("options"),
            model=scorer_config.get("model"),
+            description=scorer_config.get("description"),
            judgment_api_key=judgment_api_key,
            organization_id=organization_id,
        )

@@ -142,6 +146,7 @@ class BasePromptScorer(ABC, APIScorerConfig):
        threshold: float = 0.5,
        options: Optional[Dict[str, float]] = None,
        model: str = JUDGMENT_DEFAULT_GPT_MODEL,
+        description: Optional[str] = None,
        judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
        organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
    ):

@@ -158,6 +163,7 @@ class BasePromptScorer(ABC, APIScorerConfig):
            threshold,
            options,
            model,
+            description,
            judgment_api_key,
            organization_id,
            is_trace,

@@ -170,6 +176,7 @@ class BasePromptScorer(ABC, APIScorerConfig):
            threshold=threshold,
            options=options,
            model=model,
+            description=description,
            judgment_api_key=judgment_api_key,
            organization_id=organization_id,
        )

@@ -215,6 +222,14 @@ class BasePromptScorer(ABC, APIScorerConfig):
        self.push_prompt_scorer()
        judgeval_logger.info(f"Successfully updated options for {self.name}")

+    def set_description(self, description: Optional[str]):
+        """
+        Updates the description of the scorer.
+        """
+        self.description = description
+        self.push_prompt_scorer()
+        judgeval_logger.info(f"Successfully updated description for {self.name}")
+
    def append_to_prompt(self, prompt_addition: str):
        """
        Appends a string to the prompt.

@@ -248,7 +263,13 @@ class BasePromptScorer(ABC, APIScorerConfig):
        """
        return copy(self.options) if self.options is not None else None

-    def
+    def get_description(self) -> str | None:
+        """
+        Returns the description of the scorer.
+        """
+        return self.description
+
+    def get_name(self) -> str:
        """
        Returns the name of the scorer.
        """

@@ -264,6 +285,7 @@ class BasePromptScorer(ABC, APIScorerConfig):
            "prompt": self.prompt,
            "threshold": self.threshold,
            "options": self.options,
+            "description": self.description,
        }

    def push_prompt_scorer(self):

@@ -276,13 +298,14 @@ class BasePromptScorer(ABC, APIScorerConfig):
            self.threshold,
            self.options,
            self.model,
+            self.description,
            self.judgment_api_key,
            self.organization_id,
            isinstance(self, TracePromptScorer),
        )

    def __str__(self):
-        return f"PromptScorer(name={self.name}, model={self.model}, prompt={self.prompt}, threshold={self.threshold}, options={self.options})"
+        return f"PromptScorer(name={self.name}, model={self.model}, prompt={self.prompt}, threshold={self.threshold}, options={self.options}, description={self.description})"

    def model_dump(self, *args, **kwargs) -> Dict[str, Any]:
        base = super().model_dump(*args, **kwargs)
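Description support threads through the scorer's constructor, fetch, serialization, and push paths, so an existing scorer can be annotated in place. A short sketch that works on any `BasePromptScorer` instance; how the instance is created or fetched is outside this hunk:

```python
def annotate_scorer(scorer) -> None:
    # set_description updates the field and pushes the scorer config to the backend.
    scorer.set_description("Checks that answers cite at least one source.")
    print(scorer.get_description())
    print(scorer)  # __str__ now includes description=...
```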
judgeval/tracer/__init__.py
CHANGED
@@ -255,6 +255,10 @@ class Tracer(metaclass=SingletonMeta):
    def get_current_agent_context(self):
        return self.agent_context

+    def get_span_processor(self) -> JudgmentSpanProcessor:
+        """Get the internal span processor of this tracer instance."""
+        return self.judgment_processor
+
    def set_customer_id(self, customer_id: str) -> None:
        span = self.get_current_span()
        if span and span.is_recording():
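A small sketch of the new accessor, useful when other instrumentation needs a handle on the tracer's internal span processor. Assumes a Tracer initialized as in the package README:

```python
from judgeval.tracer import Tracer

judgment = Tracer(project_name="default_project")

# New in 0.16.0: exposes the JudgmentSpanProcessor backing this tracer instance.
processor = judgment.get_span_processor()
print(type(processor).__name__)
```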
judgeval/tracer/llm/__init__.py
CHANGED
@@ -137,9 +137,23 @@ def _extract_openai_content(chunk) -> str:

def _extract_anthropic_content(chunk) -> str:
    """Extract content from Anthropic streaming chunk."""
-    if hasattr(chunk, "type")
-    if
-
+    if hasattr(chunk, "type"):
+        if chunk.type == "content_block_delta":
+            if hasattr(chunk, "delta"):
+                if hasattr(chunk.delta, "text"):
+                    return chunk.delta.text or ""
+                elif hasattr(chunk.delta, "partial_json"):
+                    # Tool use input streaming - return raw JSON to accumulate properly
+                    return chunk.delta.partial_json or ""
+        elif chunk.type == "content_block_start":
+            if hasattr(chunk, "content_block") and hasattr(chunk.content_block, "type"):
+                if chunk.content_block.type == "tool_use":
+                    tool_info = {
+                        "type": "tool_use",
+                        "id": getattr(chunk.content_block, "id", None),
+                        "name": getattr(chunk.content_block, "name", None),
+                    }
+                    return f"[TOOL_USE_START: {tool_info}]"
    elif hasattr(chunk, "delta") and hasattr(chunk.delta, "text"):
        return chunk.delta.text or ""
    elif hasattr(chunk, "text"):

@@ -409,7 +423,25 @@ def _format_anthropic_output(
        and usage.cache_creation_input_tokens is not None
        else 0
    )
-
+    # Extract content from Anthropic response, handling both text and tool use blocks
+    message_content = None
+    if hasattr(response, "content") and response.content:
+        content_parts = []
+        for content_block in response.content:
+            block_type = getattr(content_block, "type", None)
+            if block_type == "text":
+                # Text content block
+                content_parts.append(getattr(content_block, "text", ""))
+            elif block_type == "tool_use":
+                # Tool use block - serialize the tool call information
+                tool_info = {
+                    "type": "tool_use",
+                    "id": getattr(content_block, "id", None),
+                    "name": getattr(content_block, "name", None),
+                    "input": getattr(content_block, "input", None),
+                }
+                content_parts.append(f"[TOOL_USE: {tool_info}]")
+        message_content = "\n".join(content_parts) if content_parts else None

    if model_name:
        return message_content, _create_usage(
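The streaming extractor now recognizes Anthropic tool-use events as well as plain text deltas. A self-contained illustration of the branch logic using stand-in chunk objects; `_extract_anthropic_content` is a private helper imported here purely for demonstration, and importing `judgeval.tracer.llm` assumes its optional provider dependencies are available:

```python
from types import SimpleNamespace

from judgeval.tracer.llm import _extract_anthropic_content

# Text delta chunk: contributes the streamed text.
text_chunk = SimpleNamespace(
    type="content_block_delta",
    delta=SimpleNamespace(text="Hello"),
)

# Tool-use start chunk: contributes a "[TOOL_USE_START: ...]" marker instead.
tool_chunk = SimpleNamespace(
    type="content_block_start",
    content_block=SimpleNamespace(type="tool_use", id="toolu_1", name="get_weather"),
)

print(_extract_anthropic_content(text_chunk))  # -> "Hello"
print(_extract_anthropic_content(tool_chunk))  # -> "[TOOL_USE_START: {...}]"
```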
judgeval/version.py
CHANGED
judgeval-0.16.0.dist-info/METADATA
ADDED
@@ -0,0 +1,266 @@
Metadata-Version: 2.4
Name: judgeval
Version: 0.16.0
Summary: Judgeval Package
Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
Author-email: Andrew Li <andrew@judgmentlabs.ai>, Alex Shan <alex@judgmentlabs.ai>, Joseph Camyre <joseph@judgmentlabs.ai>
License-Expression: Apache-2.0
License-File: LICENSE.md
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python :: 3
Requires-Python: >=3.10
Requires-Dist: boto3>=1.40.11
Requires-Dist: click<8.2.0
Requires-Dist: dotenv
Requires-Dist: httpx>=0.28.1
Requires-Dist: litellm<1.75.0
Requires-Dist: opentelemetry-exporter-otlp>=1.36.0
Requires-Dist: opentelemetry-sdk>=1.36.0
Requires-Dist: orjson>=3.9.0
Requires-Dist: typer>=0.9.0
Provides-Extra: s3
Requires-Dist: boto3>=1.40.11; extra == 's3'
Provides-Extra: trainer
Requires-Dist: fireworks-ai>=0.19.18; extra == 'trainer'
Description-Content-Type: text/markdown

<div align="center">

<a href="https://judgmentlabs.ai/">
  <picture>
    <source media="(prefers-color-scheme: dark)" srcset="assets/logo_darkmode.svg">
    <img src="assets/logo_lightmode.svg" alt="Judgment Logo" width="400" />
  </picture>
</a>

<br>

## Agent Behavior Monitoring (ABM)

Track and judge any agent behavior in online and offline setups. Set up Sentry-style alerts and analyze agent behaviors / topic patterns at scale!

[](https://docs.judgmentlabs.ai/documentation)
[](https://app.judgmentlabs.ai/register)
[](https://docs.judgmentlabs.ai/documentation/self-hosting/get-started)


[](https://x.com/JudgmentLabs)
[](https://www.linkedin.com/company/judgmentlabs)

</div>


</table>

## [NEW] 🎆 Agent Reinforcement Learning

Train your agents with multi-turn reinforcement learning using judgeval and [Fireworks AI](https://fireworks.ai/)! Judgeval's ABM now integrates with Fireworks' Reinforcement Fine-Tuning (RFT) endpoint, supporting gpt-oss, qwen3, Kimi2, DeepSeek, and more.

Judgeval's agent monitoring infra provides a simple harness for integrating GRPO into any Python agent, giving builders a quick method to **try RL with minimal code changes** to their existing agents!

```python
await trainer.train(
    agent_function=your_agent_function, # entry point to your agent
    scorers=[RewardScorer()], # Custom scorer you define based on task criteria, acts as reward
    prompts=training_prompts, # Tasks
    rft_provider="fireworks"
)
```

**That's it!** Judgeval automatically manages trajectory collection and reward tagging - your agent can learn from production data with minimal code changes.

👉 Check out the [Wikipedia Racer notebook](https://colab.research.google.com/github/JudgmentLabs/judgment-cookbook/blob/main/rl/WikiRacingAgent_RL.ipynb), where an agent learns to navigate Wikipedia using RL, to see Judgeval in action.


You can view and monitor training progress for free via the [Judgment Dashboard](https://app.judgmentlabs.ai/).


## Judgeval Overview

Judgeval is an open-source framework for agent behavior monitoring. Judgeval offers a toolkit to track and judge agent behavior in online and offline setups, enabling you to convert interaction data from production/test environments into improved agents. To get started, try running one of the notebooks below or dive deeper in our [docs](https://docs.judgmentlabs.ai/documentation).

Our mission is to unlock the power of production data for agent development, enabling teams to improve their apps by catching real-time failures and optimizing over their users' preferences.

## 📚 Cookbooks

| Try Out | Notebook | Description |
|:---------|:-----|:------------|
| RL | [Wikipedia Racer](https://colab.research.google.com/github/JudgmentLabs/judgment-cookbook/blob/main/rl/WikiRacingAgent_RL.ipynb) | Train agents with reinforcement learning |
| Online ABM | [Research Agent](https://colab.research.google.com/github/JudgmentLabs/judgment-cookbook/blob/main/monitoring/Research_Agent_Online_Monitoring.ipynb) | Monitor agent behavior in production |
| Custom Scorers | [HumanEval](https://colab.research.google.com/github/JudgmentLabs/judgment-cookbook/blob/main/custom_scorers/HumanEval_Custom_Scorer.ipynb) | Build custom evaluators for your agents |
| Offline Testing | [Get Started For Free] | Compare how different prompts, models, or agent configs affect performance across ANY metric |

You can access our [repo of cookbooks](https://github.com/JudgmentLabs/judgment-cookbook).

You can find a list of [video tutorials for Judgeval use cases](https://www.youtube.com/@Alexshander-JL).

## Why Judgeval?

🤖 **Simple to run multi-turn RL**: Optimize your agents with multi-turn RL without managing compute infrastructure or data pipelines. Just add a few lines of code to your existing agent code and train!

⚙️ **Custom Evaluators**: No restriction to only monitoring with prefab scorers. Judgeval provides simple abstractions for custom Python scorers, supporting any LLM-as-a-judge rubrics/models and code-based scorers that integrate to our live agent-tracking infrastructure. [Learn more](https://docs.judgmentlabs.ai/documentation/evaluation/custom-scorers)

🚨 **Production Monitoring**: Run any custom scorer in a hosted, virtualized secure container to flag agent behaviors online in production. Get Slack alerts for failures and add custom hooks to address regressions before they impact users. [Learn more](https://docs.judgmentlabs.ai/documentation/performance/online-evals)

📊 **Behavior/Topic Grouping**: Group agent runs by behavior type or topic for deeper analysis. Drill down into subsets of users, agents, or use cases to reveal patterns of agent behavior.
<!-- Add link to Bucketing docs once we have it -->
<!--
TODO: Once we have trainer code docs, plug in here
-->

🧪 **Run experiments on your agents**: Compare test different prompts, models, or agent configs across customer segments. Measure which changes improve agent performance and decrease bad agent behaviors.

<!--
Use this once we have AI PM features:

**Run experiments on your agents**: A/B test different prompts, models, or agent configs across customer segments. Measure which changes improve agent performance and decrease bad agent behaviors. [Learn more]

-->

## 🛠️ Quickstart

Get started with Judgeval by installing our SDK using pip:

```bash
pip install judgeval
```

Ensure you have your `JUDGMENT_API_KEY` and `JUDGMENT_ORG_ID` environment variables set to connect to the [Judgment Platform](https://app.judgmentlabs.ai/).

```bash
export JUDGMENT_API_KEY=...
export JUDGMENT_ORG_ID=...
```

**If you don't have keys, [create an account for free](https://app.judgmentlabs.ai/register) on the platform!**

### Start monitoring with Judgeval

```python
from judgeval.tracer import Tracer, wrap
from judgeval.data import Example
from judgeval.scorers import AnswerRelevancyScorer
from openai import OpenAI


judgment = Tracer(project_name="default_project")
client = wrap(OpenAI())  # tracks all LLM calls

@judgment.observe(span_type="tool")
def format_question(question: str) -> str:
    # dummy tool
    return f"Question : {question}"

@judgment.observe(span_type="function")
def run_agent(prompt: str) -> str:
    task = format_question(prompt)
    response = client.chat.completions.create(
        model="gpt-5-mini",
        messages=[{"role": "user", "content": task}]
    )

    judgment.async_evaluate(  # trigger online monitoring
        scorer=AnswerRelevancyScorer(threshold=0.5),  # swap with any scorer
        example=Example(input=task, actual_output=response),  # customize to your data
        model="gpt-5",
    )
    return response.choices[0].message.content

run_agent("What is the capital of the United States?")
```

Running this code will deliver monitoring results to your [free platform account](https://app.judgmentlabs.ai/register) and should look like this:




### Customizable Scorers Over Agent Behavior

Judgeval's strongest suit is the full customization over the types of scorers you can run online monitoring with. No restrictions to only single-prompt LLM judges or prefab scorers - if you can express your scorer
in python code, judgeval can monitor it! Under the hood, judgeval hosts your scorer in a virtualized secure container, enabling online monitoring for any scorer.


First, create a behavior scorer in a file called `helpfulness_scorer.py`:

```python
from judgeval.data import Example
from judgeval.scorers.example_scorer import ExampleScorer

# Define custom example class
class QuestionAnswer(Example):
    question: str
    answer: str

# Define a server-hosted custom scorer
class HelpfulnessScorer(ExampleScorer):
    name: str = "Helpfulness Scorer"
    server_hosted: bool = True  # Enable server hosting
    async def a_score_example(self, example: QuestionAnswer):
        # Custom scoring logic for agent behavior
        # Can be an arbitrary combination of code and LLM calls
        if len(example.answer) > 10 and "?" not in example.answer:
            self.reason = "Answer is detailed and provides helpful information"
            return 1.0
        else:
            self.reason = "Answer is too brief or unclear"
            return 0.0
```

Then deploy your scorer to Judgment's infrastructure:

```bash
echo "pydantic" > requirements.txt
uv run judgeval upload_scorer helpfulness_scorer.py requirements.txt
```

Now you can instrument your agent with monitoring and online evaluation:

```python
from judgeval.tracer import Tracer, wrap
from helpfulness_scorer import HelpfulnessScorer, QuestionAnswer
from openai import OpenAI

judgment = Tracer(project_name="default_project")
client = wrap(OpenAI())  # tracks all LLM calls

@judgment.observe(span_type="tool")
def format_task(question: str) -> str:  # replace with your prompt engineering
    return f"Please answer the following question: {question}"

@judgment.observe(span_type="tool")
def answer_question(prompt: str) -> str:  # replace with your LLM system calls
    response = client.chat.completions.create(
        model="gpt-5-mini",
        messages=[{"role": "user", "content": prompt}]
    )
    return response.choices[0].message.content

@judgment.observe(span_type="function")
def run_agent(question: str) -> str:
    task = format_task(question)
    answer = answer_question(task)

    # Add online evaluation with server-hosted scorer
    judgment.async_evaluate(
        scorer=HelpfulnessScorer(),
        example=QuestionAnswer(question=question, answer=answer),
        sampling_rate=0.9  # Evaluate 90% of agent runs
    )

    return answer

if __name__ == "__main__":
    result = run_agent("What is the capital of the United States?")
    print(result)
```

Congratulations! Your online eval result should look like this:



You can now run any online scorer in a secure Firecracker microVMs with no latency impact on your applications.

---

Judgeval is created and maintained by [Judgment Labs](https://judgmentlabs.ai/).
{judgeval-0.14.1.dist-info → judgeval-0.16.0.dist-info}/RECORD
CHANGED
@@ -4,22 +4,23 @@ judgeval/constants.py,sha256=JZZJ1MqzZZDVk-5PRPRbmLnM8mXI-RDL5vxa1JFuscs,3408
judgeval/env.py,sha256=37Mn4g0OkpFxXCZGlO_CLqKJnyX-jx_R24tC28XJzig,2112
judgeval/exceptions.py,sha256=tTbfe4yoOtPXmn22UQz9-6a-5PT9uOko85xaRRwr0Sw,621
judgeval/logger.py,sha256=ZWbp0QfT1CJnQIjV-Zle4n489nFCKEmD2-ukx--iiow,1553
-judgeval/version.py,sha256=
+judgeval/version.py,sha256=UCd6S0KuM6h0ZUz8pm-Ty1EDHaJNSUYM_7PrDz0ov-E,74
judgeval/warnings.py,sha256=LbGte14ppiFjrkp-JJYueZ40NWFvMkWRvPXr6r-fUWw,73
-judgeval/api/__init__.py,sha256=
-judgeval/api/api_types.py,sha256=
+judgeval/api/__init__.py,sha256=ho8L4wC9y-STYEpk5zHwc2mZJhC4ezW8jiGgOIERBVY,12058
+judgeval/api/api_types.py,sha256=6wrjvO8XsYbfPxjQ_sHS9EOjqexbn3XDFclWqb4CgZ4,8874
judgeval/data/__init__.py,sha256=1tU0EN0ThIfQ1fad5I3dKxAfTcZ5U8cvTLcQ6qLVLU0,407
judgeval/data/evaluation_run.py,sha256=O41p99wNAuCAf6lsLNKzkZ6W-kL9LlzCYxVls7IcKkA,4727
judgeval/data/example.py,sha256=eGJpF-lyUH734Cg90B7WtU9f8iKoS3VFGeV6R-GVCCc,1039
-judgeval/data/judgment_types.py,sha256=
+judgeval/data/judgment_types.py,sha256=uI4wUiXeA6k8o2ONia506eaZcydHKQKrK1LzccTK-xc,16577
judgeval/data/result.py,sha256=XufFGSAkBDfevPUmzSgsR9HEqytISkM0U5HkhJmsjpY,2102
judgeval/data/scorer_data.py,sha256=HeP15ZgftFTJCF8JmDJCLWXRnZJIaGDJCzl7Hg6gWwE,2006
judgeval/data/trace.py,sha256=zSiR3o6xt8Z46XA3M9fJBtViF0BsPO6yKp9jxdscOSc,3881
judgeval/data/scripts/fix_default_factory.py,sha256=lvp2JwYZqz-XpD9LZNa3mANZVP-jJSZoNzolI6JWERM,591
judgeval/data/scripts/openapi_transform.py,sha256=Sm04JClzyP1ga8KA3gkIdsae8Hlx-XU7-x0gHCQYOhg,3877
-judgeval/dataset/__init__.py,sha256=
-judgeval/evaluation/__init__.py,sha256=
+judgeval/dataset/__init__.py,sha256=kL0_tIMP3qV6t4W17HQU91ybdXMZ5iDZzyUKzyfRdyY,8269
+judgeval/evaluation/__init__.py,sha256=WcqOgQdwgtc_BwEwDz6RDlF2RczyLrNjjIevQp-_NKE,12788
judgeval/integrations/langgraph/__init__.py,sha256=HwXmtDxaO75Kn4KPErnMb6Ne6FcpRxV_SCYVuwFsve0,332
+judgeval/integrations/openlit/__init__.py,sha256=-8D4D6-fGsWPwoOojw82OaE9X5sUbmb16x1bF-WfOmg,1571
judgeval/judges/__init__.py,sha256=e7JnTc1TG_SwqydDHTXHIP0EBazQxt-ydMQG7ghSU5A,228
judgeval/judges/base_judge.py,sha256=_dz0qWsKRxzXxpRY9l6mrxTRYPSF2FE4ZXkrzhZ4gbY,986
judgeval/judges/litellm_judge.py,sha256=5vEF0IUo7HVWnOF2ww-DMke8Xkarnz32B_qbgKjc0-I,4182

@@ -39,8 +40,8 @@ judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py,sha256=WUeFy
judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py,sha256=ciiFBQQC4UDsk9qou9OiKbAR31s82eRUY1ZTt1gdM-0,407
judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py,sha256=ucYOI6ztAjfoYmcgTDzN8u5RrehlVqrkeLEfss9b1fk,441
judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py,sha256=V3RdrWhnR_vLBrtWw7QbgN9K_A-Och7-v9I2fN4z8gY,506
-judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py,sha256=
-judgeval/tracer/__init__.py,sha256=
+judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py,sha256=zJ0n3HyZ1FFBnMnTYxBi37m_3Er7ENd4HpqLjNi5Eag,10902
+judgeval/tracer/__init__.py,sha256=uIOx-2P_FVwBKhwVkkIOyEQCv3gouCZ2I8-eApocnKU,36165
judgeval/tracer/constants.py,sha256=ae8tivAW97awJQxdRB9OMqX50wOLX3zqChT_AGkPBu0,85
judgeval/tracer/keys.py,sha256=ho4-_w4ngTVejdSKUH80sG6vtYt4c7FEKrYpFrDfPLs,2105
judgeval/tracer/local_eval_queue.py,sha256=KZKvSSli7B-EVzdHa4-CmXUpv0uOjGLLRa2KTPg8lRc,7320

@@ -50,7 +51,7 @@ judgeval/tracer/exporters/__init__.py,sha256=3WDXC28iY5gYMM5s7ejmy7P-DVDQ_iIuzwo
judgeval/tracer/exporters/s3.py,sha256=N9gmw17cnR0VkfAQQkLsNj5BksgNRETThR5qYhWRjP4,4360
judgeval/tracer/exporters/store.py,sha256=KQV3cyqteesByQjR-9VdPXT9OlUZ-6F08ogqj837_c0,1012
judgeval/tracer/exporters/utils.py,sha256=JRcoSQuEHxMDJbXfyrUIfA2SHBVkZM82h4bTbYGxkNw,1154
-judgeval/tracer/llm/__init__.py,sha256=
+judgeval/tracer/llm/__init__.py,sha256=b7toFMVyZU4Pv8jximfneP5gyohUB4DwJDvy8b2_IMw,44217
judgeval/tracer/llm/providers.py,sha256=UU8xrh2n9p3xZwnlWMUcZoFpog2-F9-YfcV0c2aUNqQ,1432
judgeval/tracer/llm/anthropic/__init__.py,sha256=DUTkYjMejWLI8inFJ_Ih7vf7_aJFAiCyi1Oxls-ACGo,439
judgeval/tracer/llm/google/__init__.py,sha256=7j96SPUl61yVl3jCQ-JuPpgVU9GhmcsBzY2vj5wJAVo,506

@@ -72,8 +73,8 @@ judgeval/utils/serialize.py,sha256=QXR-8Nj5rqOrI9zLx0oRLdk6DW6Bc7j8eyF4zQ7PLxA,6
judgeval/utils/testing.py,sha256=m5Nexv65tmfSj1XvAPK5Ear7aJ7w5xjDtZN0tLZ_RBk,2939
judgeval/utils/url.py,sha256=Shf0v3XcbaWpL0m1eGJEEO_z4TsQCnDB2Rl25OTUmiI,195
judgeval/utils/version_check.py,sha256=ylZQSqV7kLzEOChxvav9SCHUU4OnaCp36tXHLjdzmw0,1072
-judgeval-0.
-judgeval-0.
-judgeval-0.
-judgeval-0.
-judgeval-0.
+judgeval-0.16.0.dist-info/METADATA,sha256=kojyijzNE_2gKKvMGrs7E0zHHv3GtOXRjfmIOUQujTY,11512
+judgeval-0.16.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+judgeval-0.16.0.dist-info/entry_points.txt,sha256=-eoeD-oDLn4A7MSgeBS9Akwanf3_0r0cgEleBcIOjg0,46
+judgeval-0.16.0.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
+judgeval-0.16.0.dist-info/RECORD,,
judgeval-0.14.1.dist-info/METADATA
REMOVED
@@ -1,158 +0,0 @@
Metadata-Version: 2.4
Name: judgeval
Version: 0.14.1
Summary: Judgeval Package
Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
Author-email: Andrew Li <andrew@judgmentlabs.ai>, Alex Shan <alex@judgmentlabs.ai>, Joseph Camyre <joseph@judgmentlabs.ai>
License-Expression: Apache-2.0
License-File: LICENSE.md
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python :: 3
Requires-Python: >=3.10
Requires-Dist: boto3>=1.40.11
Requires-Dist: click<8.2.0
Requires-Dist: dotenv
Requires-Dist: httpx>=0.28.1
Requires-Dist: litellm<1.75.0
Requires-Dist: opentelemetry-exporter-otlp>=1.36.0
Requires-Dist: opentelemetry-sdk>=1.36.0
Requires-Dist: orjson>=3.9.0
Requires-Dist: typer>=0.9.0
Provides-Extra: s3
Requires-Dist: boto3>=1.40.11; extra == 's3'
Provides-Extra: trainer
Requires-Dist: fireworks-ai>=0.19.18; extra == 'trainer'
Description-Content-Type: text/markdown

<div align="center">

<img src="assets/new_lightmode.svg#gh-light-mode-only" alt="Judgment Logo" width="400" />
<img src="assets/new_darkmode.svg#gh-dark-mode-only" alt="Judgment Logo" width="400" />

<br>
<div style="font-size: 1.5em;">
Enable self-learning agents with environment data and evals.
</div>

## [Docs](https://docs.judgmentlabs.ai/) • [Judgment Cloud](https://app.judgmentlabs.ai/register) • [Self-Host](https://docs.judgmentlabs.ai/documentation/self-hosting/get-started) • [Landing Page](https://judgmentlabs.ai/)

[Demo](https://www.youtube.com/watch?v=1S4LixpVbcc) • [Bug Reports](https://github.com/JudgmentLabs/judgeval/issues) • [Changelog](https://docs.judgmentlabs.ai/changelog/2025-04-21)

We're hiring! Join us in our mission to enable self-learning agents by providing the data and signals needed for monitoring and post-training.

[](https://x.com/JudgmentLabs)
[](https://www.linkedin.com/company/judgmentlabs)
[](https://discord.gg/tGVFf8UBUY)

<img src="assets/product_shot.png" alt="Judgment Platform" width="800" />

</div>

Judgeval offers **open-source tooling** for evaluating autonomous, stateful agents. It **provides runtime data from agent-environment interactions** for continuous learning and self-improvement.

## 🎬 See Judgeval in Action

**[Multi-Agent System](https://github.com/JudgmentLabs/judgment-cookbook/tree/main/cookbooks/agents/multi-agent) with complete observability:** (1) A multi-agent system spawns agents to research topics on the internet. (2) With just **3 lines of code**, Judgeval captures all environment responses across all agent tool calls for monitoring. (3) After completion, (4) export all interaction data to enable further environment-specific learning and optimization.

<table style="width: 100%; max-width: 800px; table-layout: fixed;">
  <tr>
    <td align="center" style="padding: 8px; width: 50%;">
      <img src="assets/agent.gif" alt="Agent Demo" style="width: 100%; max-width: 350px; height: auto;" />
      <br><strong>🤖 Agents Running</strong>
    </td>
    <td align="center" style="padding: 8px; width: 50%;">
      <img src="assets/trace.gif" alt="Capturing Environment Data Demo" style="width: 100%; max-width: 350px; height: auto;" />
      <br><strong>📊 Capturing Environment Data </strong>
    </td>
  </tr>
  <tr>
    <td align="center" style="padding: 8px; width: 50%;">
      <img src="assets/document.gif" alt="Agent Completed Demo" style="width: 100%; max-width: 350px; height: auto;" />
      <br><strong>✅ Agents Completed Running</strong>
    </td>
    <td align="center" style="padding: 8px; width: 50%;">
      <img src="assets/data.gif" alt="Data Export Demo" style="width: 100%; max-width: 350px; height: auto;" />
      <br><strong>📤 Exporting Agent Environment Data</strong>
    </td>
  </tr>

</table>

## 📋 Table of Contents
- [🛠️ Installation](#️-installation)
- [🏁 Quickstarts](#-quickstarts)
- [✨ Features](#-features)
- [🏢 Self-Hosting](#-self-hosting)
- [📚 Cookbooks](#-cookbooks)
- [💻 Development with Cursor](#-development-with-cursor)

## 🛠️ Installation

Get started with Judgeval by installing our SDK using pip:

```bash
pip install judgeval
```

Ensure you have your `JUDGMENT_API_KEY` and `JUDGMENT_ORG_ID` environment variables set to connect to the [Judgment Platform](https://app.judgmentlabs.ai/).

```bash
export JUDGMENT_API_KEY=...
export JUDGMENT_ORG_ID=...
```

**If you don't have keys, [create an account](https://app.judgmentlabs.ai/register) on the platform!**


## ✨ Features

| | |
|:---|:---:|
| <h3>🧪 Evals</h3>Build custom evaluators on top of your agents. Judgeval supports LLM-as-a-judge, manual labeling, and code-based evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 A/B testing <br>• 🛡️ Online guardrails | <p align="center"><img src="assets/test.png" alt="Evaluation metrics" width="800"/></p> |
| <h3>📡 Monitoring</h3>Get Slack alerts for agent failures in production. Add custom hooks to address production regressions.<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across agent versions and time | <p align="center"><img src="assets/errors.png" alt="Monitoring Dashboard" width="1200"/></p> |
| <h3>📊 Datasets</h3>Export environment interactions and test cases to datasets for scaled analysis and optimization. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations, enabling continuous learning from production interactions. <br><br> **Useful for:**<br>• 🗃️ Agent environment interaction data for optimization<br>• 🔄 Scaled analysis for A/B tests | <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |

## 🏢 Self-Hosting

Run Judgment on your own infrastructure: we provide comprehensive self-hosting capabilities that give you full control over the backend and data plane that Judgeval interfaces with.

### Key Features
* Deploy Judgment on your own AWS account
* Store data in your own Supabase instance
* Access Judgment through your own custom domain

### Getting Started
1. Check out our [self-hosting documentation](https://docs.judgmentlabs.ai/documentation/self-hosting/get-started) for detailed setup instructions, along with how your self-hosted instance can be accessed
2. Use the [Judgment CLI](https://docs.judgmentlabs.ai/documentation/developer-tools/judgment-cli/installation) to deploy your self-hosted environment
3. After your self-hosted instance is setup, make sure the `JUDGMENT_API_URL` environmental variable is set to your self-hosted backend endpoint

## 📚 Cookbooks

Have your own? We're happy to feature it if you create a PR or message us on [Discord](https://discord.gg/tGVFf8UBUY).

You can access our repo of cookbooks [here](https://github.com/JudgmentLabs/judgment-cookbook).

## 💻 Development with Cursor
Building agents and LLM workflows in Cursor works best when your coding assistant has the proper context about Judgment integration. The Cursor rules file contains the key information needed for your assistant to implement Judgment features effectively.

Refer to the official [documentation](https://docs.judgmentlabs.ai/documentation/developer-tools/cursor/cursor-rules) for access to the rules file and more information on integrating this rules file with your codebase.

## ⭐ Star Us on GitHub

If you find Judgeval useful, please consider giving us a star on GitHub! Your support helps us grow our community and continue improving the repository.

## ❤️ Contributors

There are many ways to contribute to Judgeval:

- Submit [bug reports](https://github.com/JudgmentLabs/judgeval/issues) and [feature requests](https://github.com/JudgmentLabs/judgeval/issues)
- Review the documentation and submit [Pull Requests](https://github.com/JudgmentLabs/judgeval/pulls) to improve it
- Speaking or writing about Judgment and letting us know!

<!-- Contributors collage -->
[](https://github.com/JudgmentLabs/judgeval/graphs/contributors)

---

Judgeval is created and maintained by [Judgment Labs](https://judgmentlabs.ai/).
{judgeval-0.14.1.dist-info → judgeval-0.16.0.dist-info}/WHEEL
File without changes
{judgeval-0.14.1.dist-info → judgeval-0.16.0.dist-info}/entry_points.txt
File without changes
{judgeval-0.14.1.dist-info → judgeval-0.16.0.dist-info}/licenses/LICENSE.md
File without changes