judgeval 0.5.0__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/cli.py +65 -0
- judgeval/common/api/api.py +44 -38
- judgeval/common/api/constants.py +18 -5
- judgeval/common/api/json_encoder.py +8 -9
- judgeval/common/tracer/core.py +448 -256
- judgeval/common/tracer/otel_span_processor.py +1 -1
- judgeval/common/tracer/span_processor.py +1 -1
- judgeval/common/tracer/span_transformer.py +2 -1
- judgeval/common/tracer/trace_manager.py +6 -1
- judgeval/common/trainer/__init__.py +5 -0
- judgeval/common/trainer/config.py +125 -0
- judgeval/common/trainer/console.py +151 -0
- judgeval/common/trainer/trainable_model.py +238 -0
- judgeval/common/trainer/trainer.py +301 -0
- judgeval/data/evaluation_run.py +104 -0
- judgeval/data/judgment_types.py +37 -8
- judgeval/data/trace.py +1 -0
- judgeval/data/trace_run.py +0 -2
- judgeval/integrations/langgraph.py +2 -1
- judgeval/judgment_client.py +90 -135
- judgeval/local_eval_queue.py +3 -5
- judgeval/run_evaluation.py +43 -299
- judgeval/scorers/base_scorer.py +9 -10
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +17 -3
- {judgeval-0.5.0.dist-info → judgeval-0.7.0.dist-info}/METADATA +10 -47
- {judgeval-0.5.0.dist-info → judgeval-0.7.0.dist-info}/RECORD +29 -22
- judgeval-0.7.0.dist-info/entry_points.txt +2 -0
- judgeval/evaluation_run.py +0 -80
- {judgeval-0.5.0.dist-info → judgeval-0.7.0.dist-info}/WHEEL +0 -0
- {judgeval-0.5.0.dist-info → judgeval-0.7.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/run_evaluation.py
CHANGED
```diff
@@ -6,10 +6,10 @@ import time
 import orjson
 import sys
 import threading
-from typing import List, Dict, Union,
+from typing import List, Dict, Union, Tuple, Any, TYPE_CHECKING
 from rich import print as rprint

-from judgeval.data import ScorerData, ScoringResult, Example
+from judgeval.data import ScorerData, ScoringResult, Example
 from judgeval.scorers import BaseScorer, APIScorerConfig
 from judgeval.scorers.score import a_execute_scoring
 from judgeval.common.api import JudgmentApiClient
@@ -22,10 +22,7 @@ from judgeval.common.logger import judgeval_logger


 if TYPE_CHECKING:
-    from judgeval.
-    from judgeval.data.trace_run import TraceRun
-    from judgeval.evaluation_run import EvaluationRun
-    from judgeval.integrations.langgraph import JudgevalCallbackHandler
+    from judgeval.data.evaluation_run import EvaluationRun


 def safe_run_async(coro):
@@ -99,29 +96,6 @@ def execute_api_eval(evaluation_run: EvaluationRun) -> Dict:
     )


-def execute_api_trace_eval(trace_run: TraceRun, judgment_api_key: str) -> Dict:
-    """
-    Executes an evaluation of a list of `Trace`s using one or more `JudgmentScorer`s via the Judgment API.
-    """
-
-    try:
-        # submit API request to execute evals
-        if not judgment_api_key or not trace_run.organization_id:
-            raise ValueError("API key and organization ID are required")
-        api_client = JudgmentApiClient(judgment_api_key, trace_run.organization_id)
-        return api_client.run_trace_evaluation(trace_run.model_dump(warnings=False))
-    except Exception as e:
-        judgeval_logger.error(f"Error: {e}")
-
-        details = "An unknown error occurred."
-        if isinstance(e, JudgmentAPIException):
-            details = e.response_json.get("detail", "An unknown error occurred.")
-
-        raise JudgmentAPIError(
-            "An error occurred while executing the Judgment API request: " + details
-        )
-
-
 def check_missing_scorer_data(results: List[ScoringResult]) -> List[ScoringResult]:
     """
     Checks if any `ScoringResult` objects are missing `scorers_data`.
@@ -140,83 +114,9 @@ def check_missing_scorer_data(results: List[ScoringResult]) -> List[ScoringResul
     return results


-def check_experiment_type(
-    eval_name: str,
-    project_name: str,
-    judgment_api_key: str,
-    organization_id: str,
-    is_trace: bool,
-) -> None:
-    """
-    Checks if the current experiment, if one exists, has the same type (examples of traces)
-    """
-    api_client = JudgmentApiClient(judgment_api_key, organization_id)
-
-    try:
-        api_client.check_experiment_type(eval_name, project_name, is_trace)
-    except JudgmentAPIException as e:
-        if e.response.status_code == 422:
-            judgeval_logger.error(f"{e.response_json}")
-            raise ValueError(f"{e.response_json}")
-        else:
-            raise e
-    except Exception as e:
-        judgeval_logger.error(f"Failed to check if experiment type exists: {str(e)}")
-        raise JudgmentAPIError(f"Failed to check if experiment type exists: {str(e)}")
-
-
-def check_eval_run_name_exists(
-    eval_name: str, project_name: str, judgment_api_key: str, organization_id: str
-) -> None:
-    """
-    Checks if an evaluation run name already exists for a given project.
-
-    Args:
-        eval_name (str): Name of the evaluation run
-        project_name (str): Name of the project
-        judgment_api_key (str): API key for authentication
-
-    Raises:
-        ValueError: If the evaluation run name already exists
-        JudgmentAPIError: If there's an API error during the check
-    """
-    api_client = JudgmentApiClient(judgment_api_key, organization_id)
-    try:
-        api_client.check_eval_run_name_exists(eval_name, project_name)
-    except JudgmentAPIException as e:
-        if e.response.status_code == 409:
-            error_str = f"Eval run name '{eval_name}' already exists for this project. Please choose a different name, set the `override` flag to true, or set the `append` flag to true. See https://docs.judgmentlabs.ai/sdk-reference/judgment-client#override for more information."
-            judgeval_logger.error(error_str)
-            raise ValueError(error_str)
-        else:
-            raise e
-
-    except Exception as e:
-        judgeval_logger.error(f"Failed to check if eval run name exists: {str(e)}")
-        raise JudgmentAPIError(f"Failed to check if eval run name exists: {str(e)}")
-
-
-def check_example_keys(
-    keys: List[str],
-    eval_name: str,
-    project_name: str,
-    judgment_api_key: str,
-    organization_id: str,
-) -> None:
-    """
-    Checks if the current experiment (if one exists) has the same keys for example
-    """
-    api_client = JudgmentApiClient(judgment_api_key, organization_id)
-    try:
-        api_client.check_example_keys(keys, eval_name, project_name)
-    except Exception as e:
-        judgeval_logger.error(f"Failed to check if example keys match: {str(e)}")
-        raise JudgmentAPIError(f"Failed to check if example keys match: {str(e)}")
-
-
 def log_evaluation_results(
     scoring_results: List[ScoringResult],
-    run:
+    run: EvaluationRun,
     judgment_api_key: str,
 ) -> str:
     """
@@ -282,137 +182,8 @@ def check_examples(
         rprint("[green]Continuing...[/green]")


-def run_trace_eval(
-    trace_run: TraceRun,
-    judgment_api_key: str,
-    override: bool = False,
-    function: Optional[Callable] = None,
-    tracer: Optional[Union[Tracer, "JudgevalCallbackHandler"]] = None,
-    examples: Optional[List[Example]] = None,
-) -> List[ScoringResult]:
-    # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
-    if not override and not trace_run.append:
-        check_eval_run_name_exists(
-            trace_run.eval_name,
-            trace_run.project_name,
-            judgment_api_key,
-            trace_run.organization_id,
-        )
-
-    if trace_run.append:
-        # Check that the current experiment, if one exists, has the same type (examples or traces)
-        check_experiment_type(
-            trace_run.eval_name,
-            trace_run.project_name,
-            judgment_api_key,
-            trace_run.organization_id,
-            True,
-        )
-    if function and tracer and examples is not None:
-        new_traces: List[Trace] = []
-
-        # Handle case where tracer is actually a callback handler
-        actual_tracer = tracer
-        if hasattr(tracer, "tracer") and hasattr(tracer.tracer, "traces"):
-            # This is a callback handler, get the underlying tracer
-            actual_tracer = tracer.tracer
-
-        if trace_run.project_name != actual_tracer.project_name:
-            raise ValueError(
-                f"Project name mismatch between run_trace_eval and tracer. "
-                f"Trace run: {trace_run.project_name}, "
-                f"Tracer: {actual_tracer.project_name}"
-            )
-
-        actual_tracer.offline_mode = True
-        actual_tracer.traces = []
-        judgeval_logger.info("Running agent function: ")
-        for example in examples:
-            if example.input:
-                if isinstance(example.input, str):
-                    function(example.input)
-                elif isinstance(example.input, dict):
-                    function(**example.input)
-                else:
-                    raise ValueError(
-                        f"Input must be string or dict, got {type(example.input)}"
-                    )
-            else:
-                function()
-
-        for i, trace in enumerate(actual_tracer.traces):
-            # We set the root-level trace span with the expected tools of the Trace
-            trace = Trace(**trace)
-            trace.trace_spans[0].expected_tools = examples[i].expected_tools
-            new_traces.append(trace)
-        trace_run.traces = new_traces
-        actual_tracer.traces = []
-
-    # Execute evaluation using Judgment API
-    try:  # execute an EvaluationRun with just JudgmentScorers
-        judgeval_logger.info("Executing Trace Evaluation... ")
-        response_data: Dict = execute_api_trace_eval(trace_run, judgment_api_key)
-        scoring_results = [
-            ScoringResult(**result) for result in response_data["results"]
-        ]
-    except JudgmentAPIError as e:
-        raise JudgmentAPIError(
-            f"An error occurred while executing the Judgment API request: {str(e)}"
-        )
-    except ValueError as e:
-        raise ValueError(
-            f"Please check your TraceRun object, one or more fields are invalid: {str(e)}"
-        )
-
-    # Convert the response data to `ScoringResult` objects
-    # TODO: allow for custom scorer on traces
-
-    url = log_evaluation_results(
-        response_data["agent_results"], trace_run, judgment_api_key
-    )
-    rprint(
-        f"\n🔍 You can view your evaluation results here: [rgb(106,0,255)][link={url}]View Results[/link]\n"
-    )
-    return scoring_results
-
-
-async def get_evaluation_status(
-    eval_name: str, project_name: str, judgment_api_key: str, organization_id: str
-) -> Dict:
-    """
-    Gets the status of an async evaluation run.
-
-    Args:
-        eval_name (str): Name of the evaluation run
-        project_name (str): Name of the project
-        judgment_api_key (str): API key for authentication
-        organization_id (str): Organization ID for the evaluation
-
-    Returns:
-        Dict: Status information including:
-            - status: 'pending', 'running', 'completed', or 'failed'
-            - results: List of ScoringResult objects if completed
-            - error: Error message if failed
-    """
-    api_client = JudgmentApiClient(judgment_api_key, organization_id)
-    try:
-        return api_client.get_evaluation_status(eval_name, project_name)
-    except Exception as e:
-        raise JudgmentAPIError(
-            f"An error occurred while checking evaluation status: {str(e)}"
-        )
-
-
-def retrieve_counts(result: Dict):
-    scorer_data_count = 0
-    for example in result.get("examples", []):
-        for scorer in example.get("scorer_data", []):
-            scorer_data_count += 1
-    return scorer_data_count
-
-
 def _poll_evaluation_until_complete(
-
+    experiment_run_id: str,
     project_name: str,
     judgment_api_key: str,
     organization_id: str,
@@ -443,14 +214,16 @@ def _poll_evaluation_until_complete(
         poll_count += 1
         try:
             # Check status
-            status_response = api_client.get_evaluation_status(
+            status_response = api_client.get_evaluation_status(
+                experiment_run_id, project_name
+            )

             if status_response.get("status") != "completed":
                 time.sleep(poll_interval_seconds)
                 continue

             results_response = api_client.fetch_evaluation_results(
-
+                experiment_run_id, project_name
             )
             url = results_response.get("ui_results_url")

@@ -513,14 +286,15 @@ def progress_logger(stop_event, msg="Working...", interval=5):
 def run_eval(
     evaluation_run: EvaluationRun,
     judgment_api_key: str,
-
+    show_url: bool = True,
 ) -> List[ScoringResult]:
     """
     Executes an evaluation of `Example`s using one or more `Scorer`s

     Args:
         evaluation_run (EvaluationRun): Stores example and evaluation together for running
-
+        judgment_api_key (str): API key for authentication
+        show_url (bool): Whether to display the evaluation results URL. Defaults to True.

     Returns:
         List[ScoringResult]: A list of ScoringResult objects
@@ -534,52 +308,31 @@ def run_eval(
                 f"All examples must have the same keys: {current_keys} != {keys}"
             )

-    # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
-    if not override and not evaluation_run.append:
-        check_eval_run_name_exists(
-            evaluation_run.eval_name,
-            evaluation_run.project_name,
-            judgment_api_key,
-            evaluation_run.organization_id,
-        )
-
-    if evaluation_run.append:
-        # Check that the current experiment, if one exists, has the same type (examples of traces)
-        check_experiment_type(
-            evaluation_run.eval_name,
-            evaluation_run.project_name,
-            judgment_api_key,
-            evaluation_run.organization_id,
-            False,
-        )
-
-    # Ensure that current experiment (if one exists) has the same keys for example
-    check_example_keys(
-        keys=list(keys),
-        eval_name=evaluation_run.eval_name,
-        project_name=evaluation_run.project_name,
-        judgment_api_key=judgment_api_key,
-        organization_id=evaluation_run.organization_id,
-    )
-
-    judgment_scorers: List[APIScorerConfig] = []
-    local_scorers: List[BaseScorer] = []
-    for scorer in evaluation_run.scorers:
-        if isinstance(scorer, APIScorerConfig):
-            judgment_scorers.append(scorer)
-        else:
-            local_scorers.append(scorer)
-
     results: List[ScoringResult] = []
     url = ""

-    if
+    if (
+        len(evaluation_run.custom_scorers) > 0
+        and len(evaluation_run.judgment_scorers) > 0
+    ):
         error_msg = "We currently do not support running both local and Judgment API scorers at the same time. Please run your evaluation with either local scorers or Judgment API scorers, but not both."
         judgeval_logger.error(error_msg)
         raise ValueError(error_msg)

-
-
+    e2b_scorers = [cs for cs in evaluation_run.custom_scorers if cs.server_hosted]
+
+    if evaluation_run.judgment_scorers or e2b_scorers:
+        if evaluation_run.judgment_scorers and e2b_scorers:
+            error_msg = "We currently do not support running both hosted custom scorers and Judgment API scorers at the same time. Please run your evaluation with one or the other, but not both."
+            judgeval_logger.error(error_msg)
+            raise ValueError(error_msg)
+
+        if len(e2b_scorers) > 1:
+            error_msg = "We currently do not support running multiple hosted custom scorers at the same time."
+            judgeval_logger.error(error_msg)
+            raise ValueError(error_msg)
+
+        check_examples(evaluation_run.examples, evaluation_run.judgment_scorers)
        stop_event = threading.Event()
        t = threading.Thread(
            target=progress_logger, args=(stop_event, "Running evaluation...")
@@ -600,36 +353,26 @@ def run_eval(
            )
            raise JudgmentAPIError(error_message)

-
-
-
-
-
-                )
-                old_scorer_data_count = retrieve_counts(results_response)
-            except Exception:
-                # This usually means the user did append = True but the eval run name doesn't exist yet
-                pass
-
+            num_scorers = (
+                len(evaluation_run.judgment_scorers)
+                if evaluation_run.judgment_scorers
+                else sum(1 for cs in evaluation_run.custom_scorers if cs.server_hosted)
+            )
             results, url = _poll_evaluation_until_complete(
-
+                experiment_run_id=evaluation_run.id,
                 project_name=evaluation_run.project_name,
                 judgment_api_key=judgment_api_key,
                 organization_id=evaluation_run.organization_id,
-                expected_scorer_data_count=(
-                    len(evaluation_run.scorers) * len(evaluation_run.examples)
-                )
-                + old_scorer_data_count,
+                expected_scorer_data_count=(num_scorers * len(evaluation_run.examples)),
             )
         finally:
             stop_event.set()
             t.join()
-
-    if len(local_scorers) > 0:
+    else:
         results = safe_run_async(
             a_execute_scoring(
                 evaluation_run.examples,
-
+                evaluation_run.custom_scorers,
                 model=evaluation_run.model,
                 throttle_value=0,
                 max_concurrent=MAX_CONCURRENT_EVALUATIONS,
@@ -640,9 +383,10 @@ def run_eval(
             scoring_result.model_dump(warnings=False) for scoring_result in results
         ]
         url = log_evaluation_results(send_results, evaluation_run, judgment_api_key)
-
-
-
+    if show_url:
+        rprint(
+            f"\n🔍 You can view your evaluation results here: [rgb(106,0,255)][link={url}]View Results[/link]\n"
+        )
     return results

```
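Taken together, these hunks make `run_eval` the single entry point in `run_evaluation.py`: the trace-eval path and the pre-flight `check_*` helpers are removed, `EvaluationRun` now lives in `judgeval.data.evaluation_run`, polling is keyed by `experiment_run_id`, scorers are read from `evaluation_run.custom_scorers` / `evaluation_run.judgment_scorers`, and a new `show_url` flag controls whether the results link is printed. A minimal call-site sketch under those assumptions (the `EvaluationRun` construction itself is defined in `judgeval/data/evaluation_run.py`, which is not shown in this diff, so the run object is assumed to be built elsewhere):

```python
import os

from judgeval.data.evaluation_run import EvaluationRun  # new import path in 0.7.0
from judgeval.run_evaluation import run_eval


def score(run: EvaluationRun):
    # `run` is assumed to be constructed elsewhere; this only illustrates the
    # updated run_eval signature shown in the hunks above.
    return run_eval(
        evaluation_run=run,
        judgment_api_key=os.environ["JUDGMENT_API_KEY"],
        show_url=False,  # new in 0.7.0; defaults to True
    )
```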
judgeval/scorers/base_scorer.py
CHANGED
```diff
@@ -26,6 +26,7 @@ class BaseScorer(BaseModel):
     name: Optional[str] = (
         None  # name of your scorer (Faithfulness, PromptScorer-randomslug)
     )
+    class_name: Optional[str] = None  # The name of the class of the scorer
     score: Optional[float] = None  # The float score of the scorer run on the test case
     score_breakdown: Optional[Dict] = None
     reason: Optional[str] = ""
@@ -39,24 +40,22 @@ class BaseScorer(BaseModel):
     error: Optional[str] = None  # The error message if the scorer failed
     additional_metadata: Optional[Dict] = None  # Additional metadata for the scorer
     user: Optional[str] = None  # The user ID of the scorer
+    server_hosted: bool = False  # Whether the scorer is enabled for e2b

-    @model_validator(mode="
+    @model_validator(mode="after")
     @classmethod
-    def enforce_strict_threshold(cls, data:
-        if data.
-            data
+    def enforce_strict_threshold(cls, data: "BaseScorer"):
+        if data.strict_mode:
+            data.threshold = 1.0
         return data

     @model_validator(mode="after")
     @classmethod
     def default_name(cls, m: "BaseScorer") -> "BaseScorer":
+        # Always set class_name to the string name of the class
+        m.class_name = m.__class__.__name__
         if not m.name:
-
-            class_name = getattr(m, "__class__", None)
-            if class_name and getattr(m.__class__, "__name__", None):
-                m.name = m.__class__.__name__
-            else:
-                m.name = m.score_type
+            m.name = m.class_name
         return m

     def _add_model(self, model: str):
```
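`BaseScorer` gains `class_name` and `server_hosted` fields, and both model validators now run in Pydantic's `after` mode on the constructed instance: `strict_mode` pins `threshold` to 1.0, and `class_name`/`name` are filled from the subclass name. A self-contained sketch of that pattern (simplified to plain instance-method validators; the field set is trimmed to what the hunks show, and everything else about the real class is omitted):

```python
from typing import Optional

from pydantic import BaseModel, model_validator


class ScorerSketch(BaseModel):
    name: Optional[str] = None
    class_name: Optional[str] = None
    threshold: float = 0.5
    strict_mode: bool = False
    server_hosted: bool = False  # new flag: scorer runs hosted (e2b) rather than locally

    @model_validator(mode="after")
    def _enforce_strict_threshold(self) -> "ScorerSketch":
        # mirrors enforce_strict_threshold: strict mode forces a perfect threshold
        if self.strict_mode:
            self.threshold = 1.0
        return self

    @model_validator(mode="after")
    def _default_name(self) -> "ScorerSketch":
        # mirrors default_name: class_name always reflects the subclass,
        # and name falls back to it when not provided
        self.class_name = self.__class__.__name__
        if not self.name:
            self.name = self.class_name
        return self


class AnswerQualityScorer(ScorerSketch):
    pass


s = AnswerQualityScorer(strict_mode=True)
print(s.name, s.class_name, s.threshold)  # AnswerQualityScorer AnswerQualityScorer 1.0
```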
judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py
CHANGED
```diff
@@ -11,13 +11,14 @@ from judgeval.common.logger import judgeval_logger
 def push_prompt_scorer(
     name: str,
     prompt: str,
+    threshold: float,
     options: Optional[Dict[str, float]] = None,
     judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
     organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
 ) -> str:
     client = JudgmentApiClient(judgment_api_key, organization_id)
     try:
-        r = client.save_scorer(name, prompt, options)
+        r = client.save_scorer(name, prompt, threshold, options)
     except JudgmentAPIException as e:
         if e.status_code == 500:
             raise JudgmentAPIError(
@@ -90,6 +91,7 @@ class PromptScorer(APIScorerConfig):
         return cls(
             name=name,
             prompt=scorer_config["prompt"],
+            threshold=scorer_config["threshold"],
             options=scorer_config.get("options"),
             judgment_api_key=judgment_api_key,
             organization_id=organization_id,
@@ -100,16 +102,20 @@ class PromptScorer(APIScorerConfig):
         cls,
         name: str,
         prompt: str,
+        threshold: Optional[float] = 0.5,
         options: Optional[Dict[str, float]] = None,
         judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
         organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
     ):
         if not scorer_exists(name, judgment_api_key, organization_id):
-            push_prompt_scorer(
+            push_prompt_scorer(
+                name, prompt, threshold, options, judgment_api_key, organization_id
+            )
             judgeval_logger.info(f"Successfully created PromptScorer: {name}")
         return cls(
             name=name,
             prompt=prompt,
+            threshold=threshold,
             options=options,
             judgment_api_key=judgment_api_key,
             organization_id=organization_id,
@@ -158,6 +164,12 @@ class PromptScorer(APIScorerConfig):
         judgeval_logger.info(f"Successfully appended to prompt for {self.name}")

     # Getters
+    def get_threshold(self) -> float | None:
+        """
+        Returns the threshold of the scorer.
+        """
+        return self.threshold
+
     def get_prompt(self) -> str | None:
         """
         Returns the prompt of the scorer.
@@ -183,6 +195,7 @@ class PromptScorer(APIScorerConfig):
         return {
             "name": self.name,
             "prompt": self.prompt,
+            "threshold": self.threshold,
             "options": self.options,
         }

@@ -193,13 +206,14 @@ class PromptScorer(APIScorerConfig):
         push_prompt_scorer(
             self.name,
             self.prompt,
+            self.threshold,
             self.options,
             self.judgment_api_key,
             self.organization_id,
         )

     def __str__(self):
-        return f"PromptScorer(name={self.name}, prompt={self.prompt}, options={self.options})"
+        return f"PromptScorer(name={self.name}, prompt={self.prompt}, threshold={self.threshold}, options={self.options})"

     def model_dump(self, *args, **kwargs) -> Dict[str, Any]:
         base = super().model_dump(*args, **kwargs)
```
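`threshold` is now threaded through `push_prompt_scorer`, the scorer's config dict, and its `__str__` representation. A minimal sketch of pushing a scorer with the new required argument, based on the signature in the first hunk above (the name, prompt, and options values here are made up; the API key and org ID default to the `JUDGMENT_API_KEY` / `JUDGMENT_ORG_ID` environment variables):

```python
from judgeval.scorers.judgeval_scorers.api_scorers.prompt_scorer import push_prompt_scorer

# Hypothetical scorer definition; only the keyword names come from the diff.
result = push_prompt_scorer(
    name="helpfulness-judge",
    prompt="Rate how helpful the assistant's response is to the user's question.",
    threshold=0.7,  # new required parameter in 0.7.0, stored alongside the scorer
    options={"helpful": 1.0, "unhelpful": 0.0},
)
```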
{judgeval-0.5.0.dist-info → judgeval-0.7.0.dist-info}/METADATA
CHANGED
````diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.
+Version: 0.7.0
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -11,6 +11,8 @@ Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
 Requires-Python: >=3.11
 Requires-Dist: boto3
+Requires-Dist: click<8.2.0
+Requires-Dist: fireworks-ai>=0.19.18
 Requires-Dist: langchain-anthropic
 Requires-Dist: langchain-core
 Requires-Dist: langchain-huggingface
@@ -23,6 +25,7 @@ Requires-Dist: orjson>=3.9.0
 Requires-Dist: python-dotenv
 Requires-Dist: requests
 Requires-Dist: rich
+Requires-Dist: typer>=0.9.0
 Provides-Extra: langchain
 Requires-Dist: langchain-anthropic; extra == 'langchain'
 Requires-Dist: langchain-core; extra == 'langchain'
@@ -37,7 +40,7 @@ Description-Content-Type: text/markdown

 <br>
 <div style="font-size: 1.5em;">
-    Enable self-learning agents with
+    Enable self-learning agents with environment data and evals.
 </div>

 ## [Docs](https://docs.judgmentlabs.ai/) • [Judgment Cloud](https://app.judgmentlabs.ai/register) • [Self-Host](https://docs.judgmentlabs.ai/documentation/self-hosting/get-started) • [Landing Page](https://judgmentlabs.ai/)
@@ -54,11 +57,11 @@ We're hiring! Join us in our mission to enable self-learning agents by providing

 </div>

-Judgeval offers **open-source tooling** for
+Judgeval offers **open-source tooling** for evaluating autonomous, stateful agents. It **provides runtime data from agent-environment interactions** for continuous learning and self-improvement.

 ## 🎬 See Judgeval in Action

-**[Multi-Agent System](https://github.com/JudgmentLabs/judgment-cookbook/tree/main/cookbooks/agents/multi-agent) with complete observability:** (1) A multi-agent system spawns agents to research topics on the internet. (2) With just **3 lines of code**, Judgeval
+**[Multi-Agent System](https://github.com/JudgmentLabs/judgment-cookbook/tree/main/cookbooks/agents/multi-agent) with complete observability:** (1) A multi-agent system spawns agents to research topics on the internet. (2) With just **3 lines of code**, Judgeval captures all environment responses across all agent tool calls for monitoring. (3) After completion, (4) export all interaction data to enable further environment-specific learning and optimization.

 <table style="width: 100%; max-width: 800px; table-layout: fixed;">
 <tr>
@@ -67,8 +70,8 @@ Judgeval offers **open-source tooling** for tracing and evaluating autonomous, s
     <br><strong>🤖 Agents Running</strong>
   </td>
   <td align="center" style="padding: 8px; width: 50%;">
-    <img src="assets/trace.gif" alt="
-    <br><strong>📊
+    <img src="assets/trace.gif" alt="Capturing Environment Data Demo" style="width: 100%; max-width: 350px; height: auto;" />
+    <br><strong>📊 Capturing Environment Data </strong>
   </td>
 </tr>
 <tr>
@@ -109,54 +112,14 @@ export JUDGMENT_ORG_ID=...

 **If you don't have keys, [create an account](https://app.judgmentlabs.ai/register) on the platform!**

-## 🏁 Quickstarts
-
-### 🛰️ Tracing
-
-Create a file named `agent.py` with the following code:
-
-```python
-from judgeval.tracer import Tracer, wrap
-from openai import OpenAI
-
-client = wrap(OpenAI())  # tracks all LLM calls
-judgment = Tracer(project_name="my_project")
-
-@judgment.observe(span_type="tool")
-def format_question(question: str) -> str:
-    # dummy tool
-    return f"Question : {question}"
-
-@judgment.observe(span_type="function")
-def run_agent(prompt: str) -> str:
-    task = format_question(prompt)
-    response = client.chat.completions.create(
-        model="gpt-4.1",
-        messages=[{"role": "user", "content": task}]
-    )
-    return response.choices[0].message.content
-
-run_agent("What is the capital of the United States?")
-```
-You'll see your trace exported to the Judgment Platform:
-
-<p align="center"><img src="assets/online_eval.png" alt="Judgment Platform Trace Example" width="1500" /></p>
-
-
-[Click here](https://docs.judgmentlabs.ai/documentation/tracing/introduction) for a more detailed explanation.
-
-
-<!-- Created by https://github.com/ekalinin/github-markdown-toc -->
-

 ## ✨ Features

 | | |
 |:---|:---:|
-| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic). **Tracks inputs/outputs, agent tool calls, latency, cost, and custom metadata** at every step.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 📋 Collecting agent environment data <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/agent_trace_example.png" alt="Tracing visualization" width="1200"/></p> |
 | <h3>🧪 Evals</h3>Build custom evaluators on top of your agents. Judgeval supports LLM-as-a-judge, manual labeling, and code-based evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 A/B testing <br>• 🛡️ Online guardrails | <p align="center"><img src="assets/test.png" alt="Evaluation metrics" width="800"/></p> |
 | <h3>📡 Monitoring</h3>Get Slack alerts for agent failures in production. Add custom hooks to address production regressions.<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across agent versions and time | <p align="center"><img src="assets/errors.png" alt="Monitoring Dashboard" width="1200"/></p> |
-| <h3>📊 Datasets</h3>Export
+| <h3>📊 Datasets</h3>Export environment interactions and test cases to datasets for scaled analysis and optimization. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations, enabling continuous learning from production interactions. <br><br> **Useful for:**<br>• 🗃️ Agent environment interaction data for optimization<br>• 🔄 Scaled analysis for A/B tests | <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |

 ## 🏢 Self-Hosting

````