judgeval 0.4.0__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +2 -0
- judgeval/cli.py +65 -0
- judgeval/clients.py +2 -1
- judgeval/common/api/api.py +46 -54
- judgeval/common/api/constants.py +18 -5
- judgeval/common/api/json_encoder.py +241 -0
- judgeval/common/tracer/core.py +772 -467
- judgeval/common/tracer/otel_span_processor.py +1 -1
- judgeval/common/tracer/providers.py +119 -0
- judgeval/common/tracer/span_processor.py +1 -1
- judgeval/common/tracer/span_transformer.py +16 -26
- judgeval/constants.py +1 -0
- judgeval/data/evaluation_run.py +104 -0
- judgeval/data/judgment_types.py +38 -8
- judgeval/data/trace.py +6 -122
- judgeval/data/trace_run.py +2 -3
- judgeval/dataset.py +2 -0
- judgeval/integrations/langgraph.py +2 -1
- judgeval/judges/litellm_judge.py +2 -1
- judgeval/judges/mixture_of_judges.py +2 -1
- judgeval/judges/utils.py +2 -1
- judgeval/judgment_client.py +113 -53
- judgeval/local_eval_queue.py +190 -0
- judgeval/run_evaluation.py +43 -197
- judgeval/scorers/base_scorer.py +9 -10
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +17 -3
- judgeval/scorers/score.py +33 -11
- judgeval/utils/async_utils.py +36 -0
- {judgeval-0.4.0.dist-info → judgeval-0.6.0.dist-info}/METADATA +11 -12
- {judgeval-0.4.0.dist-info → judgeval-0.6.0.dist-info}/RECORD +33 -27
- judgeval-0.6.0.dist-info/entry_points.txt +2 -0
- judgeval/evaluation_run.py +0 -76
- {judgeval-0.4.0.dist-info → judgeval-0.6.0.dist-info}/WHEEL +0 -0
- {judgeval-0.4.0.dist-info → judgeval-0.6.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/run_evaluation.py
CHANGED
@@ -1,10 +1,12 @@
+from __future__ import annotations
+
 import asyncio
 import concurrent.futures
 import time
 import orjson
 import sys
 import threading
-from typing import List, Dict, Union, Optional, Callable, Tuple, Any
+from typing import List, Dict, Union, Optional, Callable, Tuple, Any, TYPE_CHECKING
 from rich import print as rprint

 from judgeval.data import ScorerData, ScoringResult, Example, Trace
@@ -17,10 +19,13 @@ from judgeval.constants import (
 from judgeval.common.exceptions import JudgmentAPIError
 from judgeval.common.api.api import JudgmentAPIException
 from judgeval.common.logger import judgeval_logger
-
-
-
-from judgeval.
+
+
+if TYPE_CHECKING:
+    from judgeval.common.tracer import Tracer
+    from judgeval.data.trace_run import TraceRun
+    from judgeval.data.evaluation_run import EvaluationRun
+    from judgeval.integrations.langgraph import JudgevalCallbackHandler


 def safe_run_async(coro):
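The new `TYPE_CHECKING` block, together with `from __future__ import annotations`, lets run_evaluation.py reference `Tracer`, `TraceRun`, `EvaluationRun`, and `JudgevalCallbackHandler` in annotations without importing them at runtime, which avoids import cycles and startup cost. A minimal sketch of the same pattern — the `mypkg.tracer` module and `run_with_tracer` function are invented for illustration:

```python
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Imported only for static type checkers; never executed at runtime,
    # so circular or heavy imports are avoided.
    from mypkg.tracer import Tracer  # hypothetical module


def run_with_tracer(tracer: Tracer) -> None:
    # With postponed annotation evaluation (PEP 563), Tracer does not need to
    # exist at runtime unless a type checker inspects this signature.
    print(f"using tracer: {tracer!r}")
```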
@@ -135,80 +140,6 @@ def check_missing_scorer_data(results: List[ScoringResult]) -> List[ScoringResult]:
     return results


-def check_experiment_type(
-    eval_name: str,
-    project_name: str,
-    judgment_api_key: str,
-    organization_id: str,
-    is_trace: bool,
-) -> None:
-    """
-    Checks if the current experiment, if one exists, has the same type (examples of traces)
-    """
-    api_client = JudgmentApiClient(judgment_api_key, organization_id)
-
-    try:
-        api_client.check_experiment_type(eval_name, project_name, is_trace)
-    except JudgmentAPIException as e:
-        if e.response.status_code == 422:
-            judgeval_logger.error(f"{e.response_json}")
-            raise ValueError(f"{e.response_json}")
-        else:
-            raise e
-    except Exception as e:
-        judgeval_logger.error(f"Failed to check if experiment type exists: {str(e)}")
-        raise JudgmentAPIError(f"Failed to check if experiment type exists: {str(e)}")
-
-
-def check_eval_run_name_exists(
-    eval_name: str, project_name: str, judgment_api_key: str, organization_id: str
-) -> None:
-    """
-    Checks if an evaluation run name already exists for a given project.
-
-    Args:
-        eval_name (str): Name of the evaluation run
-        project_name (str): Name of the project
-        judgment_api_key (str): API key for authentication
-
-    Raises:
-        ValueError: If the evaluation run name already exists
-        JudgmentAPIError: If there's an API error during the check
-    """
-    api_client = JudgmentApiClient(judgment_api_key, organization_id)
-    try:
-        api_client.check_eval_run_name_exists(eval_name, project_name)
-    except JudgmentAPIException as e:
-        if e.response.status_code == 409:
-            error_str = f"Eval run name '{eval_name}' already exists for this project. Please choose a different name, set the `override` flag to true, or set the `append` flag to true. See https://docs.judgmentlabs.ai/sdk-reference/judgment-client#override for more information."
-            judgeval_logger.error(error_str)
-            raise ValueError(error_str)
-        else:
-            raise e
-
-    except Exception as e:
-        judgeval_logger.error(f"Failed to check if eval run name exists: {str(e)}")
-        raise JudgmentAPIError(f"Failed to check if eval run name exists: {str(e)}")
-
-
-def check_example_keys(
-    keys: List[str],
-    eval_name: str,
-    project_name: str,
-    judgment_api_key: str,
-    organization_id: str,
-) -> None:
-    """
-    Checks if the current experiment (if one exists) has the same keys for example
-    """
-    api_client = JudgmentApiClient(judgment_api_key, organization_id)
-    try:
-        api_client.check_example_keys(keys, eval_name, project_name)
-    except Exception as e:
-        judgeval_logger.error(f"Failed to check if example keys match: {str(e)}")
-        raise JudgmentAPIError(f"Failed to check if example keys match: {str(e)}")
-
-
 def log_evaluation_results(
     scoring_results: List[ScoringResult],
     run: Union[EvaluationRun, TraceRun],
@@ -280,29 +211,10 @@ def check_examples(
 def run_trace_eval(
     trace_run: TraceRun,
     judgment_api_key: str,
-    override: bool = False,
     function: Optional[Callable] = None,
-    tracer: Optional[Union[Tracer, JudgevalCallbackHandler]] = None,
+    tracer: Optional[Union[Tracer, "JudgevalCallbackHandler"]] = None,
     examples: Optional[List[Example]] = None,
 ) -> List[ScoringResult]:
-    # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
-    if not override and not trace_run.append:
-        check_eval_run_name_exists(
-            trace_run.eval_name,
-            trace_run.project_name,
-            judgment_api_key,
-            trace_run.organization_id,
-        )
-
-    if trace_run.append:
-        # Check that the current experiment, if one exists, has the same type (examples or traces)
-        check_experiment_type(
-            trace_run.eval_name,
-            trace_run.project_name,
-            judgment_api_key,
-            trace_run.organization_id,
-            True,
-        )
     if function and tracer and examples is not None:
         new_traces: List[Trace] = []

@@ -371,43 +283,8 @@ def run_trace_eval(
     return scoring_results


-async def get_evaluation_status(
-    eval_name: str, project_name: str, judgment_api_key: str, organization_id: str
-) -> Dict:
-    """
-    Gets the status of an async evaluation run.
-
-    Args:
-        eval_name (str): Name of the evaluation run
-        project_name (str): Name of the project
-        judgment_api_key (str): API key for authentication
-        organization_id (str): Organization ID for the evaluation
-
-    Returns:
-        Dict: Status information including:
-            - status: 'pending', 'running', 'completed', or 'failed'
-            - results: List of ScoringResult objects if completed
-            - error: Error message if failed
-    """
-    api_client = JudgmentApiClient(judgment_api_key, organization_id)
-    try:
-        return api_client.get_evaluation_status(eval_name, project_name)
-    except Exception as e:
-        raise JudgmentAPIError(
-            f"An error occurred while checking evaluation status: {str(e)}"
-        )
-
-
-def retrieve_counts(result: Dict):
-    scorer_data_count = 0
-    for example in result.get("examples", []):
-        for scorer in example.get("scorer_data", []):
-            scorer_data_count += 1
-    return scorer_data_count
-
-
 def _poll_evaluation_until_complete(
-
+    experiment_run_id: str,
     project_name: str,
     judgment_api_key: str,
     organization_id: str,
@@ -438,14 +315,16 @@ def _poll_evaluation_until_complete(
         poll_count += 1
         try:
            # Check status
-            status_response = api_client.get_evaluation_status(
+            status_response = api_client.get_evaluation_status(
+                experiment_run_id, project_name
+            )

            if status_response.get("status") != "completed":
                time.sleep(poll_interval_seconds)
                continue

            results_response = api_client.fetch_evaluation_results(
-
+                experiment_run_id, project_name
            )
            url = results_response.get("ui_results_url")

@@ -508,14 +387,12 @@ def progress_logger(stop_event, msg="Working...", interval=5):
 def run_eval(
     evaluation_run: EvaluationRun,
     judgment_api_key: str,
-    override: bool = False,
 ) -> List[ScoringResult]:
     """
     Executes an evaluation of `Example`s using one or more `Scorer`s

     Args:
         evaluation_run (EvaluationRun): Stores example and evaluation together for running
-        override (bool, optional): Whether to override existing evaluation run with same name. Defaults to False.

     Returns:
         List[ScoringResult]: A list of ScoringResult objects
@@ -529,52 +406,31 @@ def run_eval(
                 f"All examples must have the same keys: {current_keys} != {keys}"
             )

-    # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
-    if not override and not evaluation_run.append:
-        check_eval_run_name_exists(
-            evaluation_run.eval_name,
-            evaluation_run.project_name,
-            judgment_api_key,
-            evaluation_run.organization_id,
-        )
-
-    if evaluation_run.append:
-        # Check that the current experiment, if one exists, has the same type (examples of traces)
-        check_experiment_type(
-            evaluation_run.eval_name,
-            evaluation_run.project_name,
-            judgment_api_key,
-            evaluation_run.organization_id,
-            False,
-        )
-
-        # Ensure that current experiment (if one exists) has the same keys for example
-        check_example_keys(
-            keys=list(keys),
-            eval_name=evaluation_run.eval_name,
-            project_name=evaluation_run.project_name,
-            judgment_api_key=judgment_api_key,
-            organization_id=evaluation_run.organization_id,
-        )
-
-    judgment_scorers: List[APIScorerConfig] = []
-    local_scorers: List[BaseScorer] = []
-    for scorer in evaluation_run.scorers:
-        if isinstance(scorer, APIScorerConfig):
-            judgment_scorers.append(scorer)
-        else:
-            local_scorers.append(scorer)
-
     results: List[ScoringResult] = []
     url = ""

-    if
+    if (
+        len(evaluation_run.custom_scorers) > 0
+        and len(evaluation_run.judgment_scorers) > 0
+    ):
         error_msg = "We currently do not support running both local and Judgment API scorers at the same time. Please run your evaluation with either local scorers or Judgment API scorers, but not both."
         judgeval_logger.error(error_msg)
         raise ValueError(error_msg)

-
-
+    e2b_scorers = [cs for cs in evaluation_run.custom_scorers if cs.server_hosted]
+
+    if evaluation_run.judgment_scorers or e2b_scorers:
+        if evaluation_run.judgment_scorers and e2b_scorers:
+            error_msg = "We currently do not support running both hosted custom scorers and Judgment API scorers at the same time. Please run your evaluation with one or the other, but not both."
+            judgeval_logger.error(error_msg)
+            raise ValueError(error_msg)
+
+        if len(e2b_scorers) > 1:
+            error_msg = "We currently do not support running multiple hosted custom scorers at the same time."
+            judgeval_logger.error(error_msg)
+            raise ValueError(error_msg)
+
+        check_examples(evaluation_run.examples, evaluation_run.judgment_scorers)
         stop_event = threading.Event()
         t = threading.Thread(
             target=progress_logger, args=(stop_event, "Running evaluation...")
@@ -595,36 +451,26 @@ def run_eval(
                )
                raise JudgmentAPIError(error_message)

-
-
-
-
-
-                )
-                old_scorer_data_count = retrieve_counts(results_response)
-            except Exception:
-                # This usually means the user did append = True but the eval run name doesn't exist yet
-                pass
-
+            num_scorers = (
+                len(evaluation_run.judgment_scorers)
+                if evaluation_run.judgment_scorers
+                else sum(1 for cs in evaluation_run.custom_scorers if cs.server_hosted)
+            )
            results, url = _poll_evaluation_until_complete(
-
+                experiment_run_id=evaluation_run.id,
                project_name=evaluation_run.project_name,
                judgment_api_key=judgment_api_key,
                organization_id=evaluation_run.organization_id,
-                expected_scorer_data_count=(
-                    len(evaluation_run.scorers) * len(evaluation_run.examples)
-                )
-                + old_scorer_data_count,
+                expected_scorer_data_count=(num_scorers * len(evaluation_run.examples)),
            )
        finally:
            stop_event.set()
            t.join()
-
-    if len(local_scorers) > 0:
+    else:
        results = safe_run_async(
            a_execute_scoring(
                evaluation_run.examples,
-
+                evaluation_run.custom_scorers,
                model=evaluation_run.model,
                throttle_value=0,
                max_concurrent=MAX_CONCURRENT_EVALUATIONS,
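With the pre-flight name/type checks gone, `run_eval` decides how to execute purely from the run object: mixed local and Judgment API scorers are rejected, at most one hosted (`server_hosted`) custom scorer is allowed, server-side runs are polled by `experiment_run_id`, and everything else falls through to local scoring. A self-contained sketch that mirrors that decision logic with plain dataclasses — these are stand-ins, not the judgeval types:

```python
from dataclasses import dataclass, field
from typing import List


@dataclass
class FakeScorer:
    server_hosted: bool = False


@dataclass
class FakeRun:
    custom_scorers: List[FakeScorer] = field(default_factory=list)
    judgment_scorers: List[object] = field(default_factory=list)


def choose_path(run: FakeRun) -> str:
    # Mirrors the checks added in run_eval above.
    if run.custom_scorers and run.judgment_scorers:
        raise ValueError("local and Judgment API scorers cannot be mixed")
    hosted = [s for s in run.custom_scorers if s.server_hosted]
    if len(hosted) > 1:
        raise ValueError("only one hosted custom scorer per run")
    if run.judgment_scorers or hosted:
        return "server-side run, polled by experiment_run_id"
    return "local run via a_execute_scoring"


print(choose_path(FakeRun(custom_scorers=[FakeScorer()])))                    # local
print(choose_path(FakeRun(custom_scorers=[FakeScorer(server_hosted=True)])))  # server-side
```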
judgeval/scorers/base_scorer.py
CHANGED
@@ -26,6 +26,7 @@ class BaseScorer(BaseModel):
     name: Optional[str] = (
         None  # name of your scorer (Faithfulness, PromptScorer-randomslug)
     )
+    class_name: Optional[str] = None  # The name of the class of the scorer
     score: Optional[float] = None  # The float score of the scorer run on the test case
     score_breakdown: Optional[Dict] = None
     reason: Optional[str] = ""
@@ -39,24 +40,22 @@ class BaseScorer(BaseModel):
     error: Optional[str] = None  # The error message if the scorer failed
     additional_metadata: Optional[Dict] = None  # Additional metadata for the scorer
     user: Optional[str] = None  # The user ID of the scorer
+    server_hosted: bool = False  # Whether the scorer is enabled for e2b

-    @model_validator(mode="
+    @model_validator(mode="after")
     @classmethod
-    def enforce_strict_threshold(cls, data:
-        if data.
-            data
+    def enforce_strict_threshold(cls, data: "BaseScorer"):
+        if data.strict_mode:
+            data.threshold = 1.0
         return data

     @model_validator(mode="after")
     @classmethod
     def default_name(cls, m: "BaseScorer") -> "BaseScorer":
+        # Always set class_name to the string name of the class
+        m.class_name = m.__class__.__name__
         if not m.name:
-
-            class_name = getattr(m, "__class__", None)
-            if class_name and getattr(m.__class__, "__name__", None):
-                m.name = m.__class__.__name__
-            else:
-                m.name = m.score_type
+            m.name = m.class_name
         return m

     def _add_model(self, model: str):
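The reworked validators mean every `BaseScorer` instance records its class name, uses it as the fallback display name, and defaults to running locally (`server_hosted=False`), while `strict_mode` pins the threshold to 1.0. A quick illustration with a hypothetical subclass — the field defaults below are assumptions for the sketch, and a real custom scorer would also implement its scoring hook:

```python
from judgeval.scorers import BaseScorer


class MyCustomScorer(BaseScorer):
    # Illustrative defaults only; real scorers define their own score_type/threshold.
    score_type: str = "my_custom"
    threshold: float = 0.5


scorer = MyCustomScorer()
print(scorer.class_name)     # "MyCustomScorer" - set by the default_name validator
print(scorer.name)           # "MyCustomScorer" - falls back to class_name when unset
print(scorer.server_hosted)  # False - local by default; hosted scorers flip this to True

strict = MyCustomScorer(strict_mode=True)
print(strict.threshold)      # 1.0 - enforce_strict_threshold pins it in strict mode
```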
judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py
CHANGED
@@ -11,13 +11,14 @@ from judgeval.common.logger import judgeval_logger
 def push_prompt_scorer(
     name: str,
     prompt: str,
+    threshold: float,
     options: Optional[Dict[str, float]] = None,
     judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
     organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
 ) -> str:
     client = JudgmentApiClient(judgment_api_key, organization_id)
     try:
-        r = client.save_scorer(name, prompt, options)
+        r = client.save_scorer(name, prompt, threshold, options)
     except JudgmentAPIException as e:
         if e.status_code == 500:
             raise JudgmentAPIError(
@@ -90,6 +91,7 @@ class PromptScorer(APIScorerConfig):
         return cls(
             name=name,
             prompt=scorer_config["prompt"],
+            threshold=scorer_config["threshold"],
             options=scorer_config.get("options"),
             judgment_api_key=judgment_api_key,
             organization_id=organization_id,
@@ -100,16 +102,20 @@ class PromptScorer(APIScorerConfig):
         cls,
         name: str,
         prompt: str,
+        threshold: Optional[float] = 0.5,
         options: Optional[Dict[str, float]] = None,
         judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
         organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
     ):
         if not scorer_exists(name, judgment_api_key, organization_id):
-            push_prompt_scorer(
+            push_prompt_scorer(
+                name, prompt, threshold, options, judgment_api_key, organization_id
+            )
             judgeval_logger.info(f"Successfully created PromptScorer: {name}")
         return cls(
             name=name,
             prompt=prompt,
+            threshold=threshold,
             options=options,
             judgment_api_key=judgment_api_key,
             organization_id=organization_id,
@@ -158,6 +164,12 @@ class PromptScorer(APIScorerConfig):
         judgeval_logger.info(f"Successfully appended to prompt for {self.name}")

     # Getters
+    def get_threshold(self) -> float | None:
+        """
+        Returns the threshold of the scorer.
+        """
+        return self.threshold
+
     def get_prompt(self) -> str | None:
         """
         Returns the prompt of the scorer.
@@ -183,6 +195,7 @@ class PromptScorer(APIScorerConfig):
         return {
             "name": self.name,
             "prompt": self.prompt,
+            "threshold": self.threshold,
             "options": self.options,
         }

@@ -193,13 +206,14 @@ class PromptScorer(APIScorerConfig):
         push_prompt_scorer(
             self.name,
             self.prompt,
+            self.threshold,
             self.options,
             self.judgment_api_key,
             self.organization_id,
         )

     def __str__(self):
-        return f"PromptScorer(name={self.name}, prompt={self.prompt}, options={self.options})"
+        return f"PromptScorer(name={self.name}, prompt={self.prompt}, threshold={self.threshold}, options={self.options})"

     def model_dump(self, *args, **kwargs) -> Dict[str, Any]:
         base = super().model_dump(*args, **kwargs)
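`threshold` now travels with a prompt scorer end to end: it is pushed to the server, read back from the fetched config, exposed via `get_threshold()`, and included in `__str__` and `model_dump`. A hedged sketch of registering a scorer with the new required argument — the scorer name, prompt, and options are invented for the example, and in practice you would normally go through the class factory shown in the hunk above:

```python
import os

from judgeval.scorers.judgeval_scorers.api_scorers.prompt_scorer import push_prompt_scorer

# threshold is now a required third argument, stored alongside the prompt and options.
result = push_prompt_scorer(
    name="helpfulness-judge",
    prompt="Rate the response for helpfulness.",
    threshold=0.7,
    options={"helpful": 1.0, "unhelpful": 0.0},
    judgment_api_key=os.getenv("JUDGMENT_API_KEY", ""),
    organization_id=os.getenv("JUDGMENT_ORG_ID", ""),
)
print(result)  # push_prompt_scorer is annotated to return a str
```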
judgeval/scorers/score.py
CHANGED
@@ -17,6 +17,7 @@ from judgeval.scorers import BaseScorer
 from judgeval.scorers.utils import clone_scorers
 from judgeval.common.logger import judgeval_logger
 from judgeval.judges import JudgevalJudge
+from judgeval.constants import DEFAULT_GPT_MODEL


 async def safe_a_score_example(
@@ -55,10 +56,11 @@ async def safe_a_score_example(
 async def a_execute_scoring(
     examples: List[Example],
     scorers: List[BaseScorer],
-    model: Optional[Union[str, List[str], JudgevalJudge]] =
+    model: Optional[Union[str, List[str], JudgevalJudge]] = DEFAULT_GPT_MODEL,
     ignore_errors: bool = False,
     throttle_value: int = 0,
     max_concurrent: int = 100,
+    show_progress: bool = True,
 ) -> List[ScoringResult]:
     """
     Executes evaluations of `Example`s asynchronously using one or more `BaseScorer`s.
@@ -71,8 +73,7 @@ async def a_execute_scoring(
         ignore_errors (bool): Whether to ignore errors during evaluation.
         throttle_value (int): The amount of time to wait between starting each task.
         max_concurrent (int): The maximum number of concurrent tasks.
-
-        _use_bar_indicator (bool): Whether to use a progress bar indicator.
+        show_progress (bool): Whether to show the progress bar indicator.

     Returns:
         List[ScoringResult]: A list of `ScoringResult` objects containing the evaluation results.
@@ -101,16 +102,37 @@ async def a_execute_scoring(
     tasks = []
     cloned_scorers: List[BaseScorer]

-
-
-
-
-
-
+    if show_progress:
+        with tqdm_asyncio(
+            desc=f"Evaluating {len(examples)} example(s) in parallel",
+            unit="Example",
+            total=len(examples),
+            bar_format="{desc}: |{bar}|{percentage:3.0f}% ({n_fmt}/{total_fmt}) [Time Taken: {elapsed}, {rate_fmt}{postfix}]",
+        ) as pbar:
+            for i, ex in enumerate(examples):
+                if isinstance(ex, Example):
+                    if len(scorers) == 0:
+                        pbar.update(1)
+                        continue
+
+                    cloned_scorers = clone_scorers(scorers)
+                    task = execute_with_semaphore(
+                        func=a_eval_examples_helper,
+                        scorers=cloned_scorers,
+                        example=ex,
+                        scoring_results=scoring_results,
+                        score_index=i,
+                        ignore_errors=ignore_errors,
+                        pbar=pbar,
+                    )
+                    tasks.append(asyncio.create_task(task))
+
+                await asyncio.sleep(throttle_value)
+            await asyncio.gather(*tasks)
+    else:
         for i, ex in enumerate(examples):
             if isinstance(ex, Example):
                 if len(scorers) == 0:
-                    pbar.update(1)
                     continue

                 cloned_scorers = clone_scorers(scorers)
@@ -121,7 +143,7 @@ async def a_execute_scoring(
                     scoring_results=scoring_results,
                     score_index=i,
                     ignore_errors=ignore_errors,
-                    pbar=
+                    pbar=None,
                 )
                 tasks.append(asyncio.create_task(task))

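`a_execute_scoring` now defaults its model to `DEFAULT_GPT_MODEL` and makes the tqdm progress bar optional via `show_progress`. A minimal sketch of calling it with the bar suppressed — the `Example` field names are assumptions, and an empty scorer list is used so the sketch stays runnable without committing to a concrete scorer implementation:

```python
import asyncio

from judgeval.data import Example
from judgeval.scorers.score import a_execute_scoring


async def main() -> None:
    # Field names on Example (input/actual_output) are assumed for illustration.
    examples = [Example(input="2+2?", actual_output="4")]
    # With scorers=[], each example is simply skipped, exercising the new
    # show_progress=False code path without a tqdm bar (useful in scripts/CI).
    results = await a_execute_scoring(
        examples,
        scorers=[],
        show_progress=False,
    )
    print(results)


asyncio.run(main())
```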
judgeval/utils/async_utils.py
ADDED
@@ -0,0 +1,36 @@
+"""Async utilities for judgeval."""
+
+import asyncio
+import concurrent.futures
+from typing import Awaitable, TypeVar
+
+
+# Generic type variable for coroutine return type
+T = TypeVar("T")
+
+
+def safe_run_async(coro: Awaitable[T]) -> T:  # type: ignore[type-var]
+    """Safely execute an async *coro* from synchronous code.
+
+    This helper handles two common situations:
+
+    1. **No running event loop** – Simply delegates to ``asyncio.run``.
+    2. **Existing running loop** – Executes the coroutine in a separate
+       thread so that we don't attempt to nest event loops (which would raise
+       ``RuntimeError``).
+
+    Args:
+        coro: The coroutine to execute.
+
+    Returns:
+        The result returned by *coro*.
+    """
+
+    try:
+        asyncio.get_running_loop()
+    except RuntimeError:
+        return asyncio.run(coro)
+
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        future = executor.submit(lambda: asyncio.run(coro))
+        return future.result()
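The new module factors the run-sync-or-thread pattern into a reusable `safe_run_async`, giving synchronous call sites one entry point whether or not an event loop is already running. A small usage sketch based on the implementation above:

```python
import asyncio

from judgeval.utils.async_utils import safe_run_async


async def fetch_answer() -> str:
    await asyncio.sleep(0.1)  # stand-in for real async work
    return "42"


# Plain synchronous context: delegates to asyncio.run().
print(safe_run_async(fetch_answer()))


async def already_in_a_loop() -> None:
    # Inside a running loop (e.g. a notebook or web handler), the coroutine is
    # shipped to a worker thread with its own asyncio.run(), avoiding the
    # "asyncio.run() cannot be called from a running event loop" error.
    print(safe_run_async(fetch_answer()))


asyncio.run(already_in_a_loop())
```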
{judgeval-0.4.0.dist-info → judgeval-0.6.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.4.0
+Version: 0.6.0
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -10,27 +10,26 @@ License-File: LICENSE.md
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
 Requires-Python: >=3.11
-Requires-Dist: anthropic
 Requires-Dist: boto3
-Requires-Dist:
-Requires-Dist: google-genai
-Requires-Dist: groq>=0.30.0
+Requires-Dist: click<8.2.0
 Requires-Dist: langchain-anthropic
 Requires-Dist: langchain-core
 Requires-Dist: langchain-huggingface
 Requires-Dist: langchain-openai
 Requires-Dist: litellm>=1.61.15
-Requires-Dist:
-Requires-Dist: nest-asyncio
-Requires-Dist: openai
+Requires-Dist: nest-asyncio>=1.6.0
 Requires-Dist: opentelemetry-api>=1.34.1
 Requires-Dist: opentelemetry-sdk>=1.34.1
 Requires-Dist: orjson>=3.9.0
-Requires-Dist:
-Requires-Dist: python-dotenv==1.0.1
-Requires-Dist: python-slugify>=8.0.4
+Requires-Dist: python-dotenv
 Requires-Dist: requests
-Requires-Dist:
+Requires-Dist: rich
+Requires-Dist: typer>=0.9.0
+Provides-Extra: langchain
+Requires-Dist: langchain-anthropic; extra == 'langchain'
+Requires-Dist: langchain-core; extra == 'langchain'
+Requires-Dist: langchain-huggingface; extra == 'langchain'
+Requires-Dist: langchain-openai; extra == 'langchain'
 Description-Content-Type: text/markdown

 <div align="center">
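After upgrading, the new metadata (version 0.6.0, the `click`/`typer`/`rich` requirements, and the `langchain` extra) can be checked against the installed distribution from the standard library; a hedged sketch, assuming judgeval 0.6.0 is installed in the current environment:

```python
from importlib.metadata import metadata, version

# The installed distribution should report the version and dependency set
# shown in the METADATA diff above.
print(version("judgeval"))                      # e.g. "0.6.0"
meta = metadata("judgeval")
print(meta["Requires-Python"])                  # ">=3.11"
print([r for r in meta.get_all("Requires-Dist") if "typer" in r or "click" in r])
```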