judgeval 0.10.1__py3-none-any.whl → 0.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +4 -4
- judgeval/api/__init__.py +17 -9
- judgeval/api/api_types.py +20 -18
- judgeval/data/evaluation_run.py +10 -11
- judgeval/data/judgment_types.py +25 -14
- judgeval/data/result.py +1 -0
- judgeval/data/scorer_data.py +1 -26
- judgeval/dataset/__init__.py +17 -16
- judgeval/env.py +11 -2
- judgeval/evaluation/__init__.py +20 -63
- judgeval/integrations/langgraph/__init__.py +2 -1
- judgeval/scorers/__init__.py +0 -4
- judgeval/scorers/agent_scorer.py +15 -15
- judgeval/scorers/api_scorer.py +0 -8
- judgeval/scorers/base_scorer.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +3 -5
- judgeval/scorers/score.py +1 -1
- judgeval/tracer/__init__.py +7 -10
- judgeval/tracer/local_eval_queue.py +11 -7
- judgeval/tracer/utils.py +2 -2
- judgeval/trainer/config.py +1 -1
- judgeval/trainer/trainable_model.py +1 -1
- judgeval/trainer/trainer.py +8 -6
- judgeval/utils/async_utils.py +7 -3
- judgeval/utils/testing.py +0 -4
- {judgeval-0.10.1.dist-info → judgeval-0.11.0.dist-info}/METADATA +1 -1
- {judgeval-0.10.1.dist-info → judgeval-0.11.0.dist-info}/RECORD +34 -35
- judgeval/data/tool.py +0 -5
- {judgeval-0.10.1.dist-info → judgeval-0.11.0.dist-info}/WHEEL +0 -0
- {judgeval-0.10.1.dist-info → judgeval-0.11.0.dist-info}/entry_points.txt +0 -0
- {judgeval-0.10.1.dist-info → judgeval-0.11.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/evaluation/__init__.py
CHANGED
@@ -3,14 +3,11 @@ from __future__ import annotations
 import asyncio
 import concurrent.futures
 import time
-import orjson
-import sys
 import threading
-from typing import List,
+from typing import List, Tuple, TYPE_CHECKING
 from rich import print as rprint
 
-from judgeval.data import ScorerData, ScoringResult
-from judgeval.scorers import BaseScorer, ExampleAPIScorerConfig
+from judgeval.data import ScorerData, ScoringResult
 from judgeval.scorers.score import a_execute_scoring
 from judgeval.api import JudgmentSyncClient
 from judgeval.env import (
@@ -19,9 +16,10 @@ from judgeval.env import (
 from judgeval.exceptions import JudgmentAPIError, JudgmentRuntimeError
 from judgeval.logger import judgeval_logger
 
+from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_ORG_ID
 
 if TYPE_CHECKING:
-    from judgeval.data.evaluation_run import
+    from judgeval.data.evaluation_run import ExampleEvaluationRun
 
 
 def safe_run_async(coro):
@@ -49,8 +47,7 @@ def safe_run_async(coro):
 
 def log_evaluation_results(
     scoring_results: List[ScoringResult],
-    run:
-    judgment_api_key: str,
+    run: ExampleEvaluationRun,
 ) -> str:
     """
     Logs evaluation results to the Judgment API database.
@@ -65,10 +62,10 @@
         ValueError: If there's a validation error with the results
     """
     try:
-        if not
+        if not JUDGMENT_API_KEY or not JUDGMENT_ORG_ID:
             raise ValueError("API key and organization ID are required")
 
-        api_client = JudgmentSyncClient(
+        api_client = JudgmentSyncClient(JUDGMENT_API_KEY, JUDGMENT_ORG_ID)
         response = api_client.log_eval_results(
             {
                 "results": scoring_results, # type: ignore
@@ -85,41 +82,8 @@
         )
 
 
-def check_examples(
-    examples: List[Example], scorers: List[Union[ExampleAPIScorerConfig, BaseScorer]]
-) -> None:
-    """
-    Checks if the example contains the necessary parameters for the scorer.
-    """
-    prompt_user = False
-    for scorer in scorers:
-        for example in examples:
-            missing_params = []
-            for param in scorer.required_params:
-                if getattr(example, param.value) is None:
-                    missing_params.append(f"{param.value}")
-            if missing_params:
-                rprint(
-                    f"[yellow]⚠️ WARNING:[/yellow] Example is missing required parameters for scorer [bold]{scorer.score_type.value}[/bold]"
-                )
-                rprint(f"Missing parameters: {', '.join(missing_params)}")
-                rprint(
-                    f"Example: {orjson.dumps(example.model_dump(), option=orjson.OPT_INDENT_2).decode('utf-8')}"
-                )
-                rprint("-" * 40)
-                prompt_user = True
-
-    if prompt_user:
-        user_input = input("Do you want to continue? (y/n)")
-        if user_input.lower() != "y":
-            sys.exit(0)
-        else:
-            rprint("[green]Continuing...[/green]")
-
-
 def _poll_evaluation_until_complete(
-    evaluation_run:
-    judgment_api_key: str,
+    evaluation_run: ExampleEvaluationRun,
     expected_scorer_data_count: int,
     poll_interval_seconds: float = 5,
     max_failures: int = 5,
@@ -140,13 +104,15 @@ def _poll_evaluation_until_complete(
     Returns:
         List[ScoringResult]: The evaluation results
     """
-    organization_id = evaluation_run.organization_id
     project_name = evaluation_run.project_name
    experiment_run_id = evaluation_run.id
 
+    if not project_name or not experiment_run_id:
+        raise ValueError("Project name and experiment run ID are required")
+
     poll_count = 0
     exception_count = 0
-    api_client = JudgmentSyncClient(
+    api_client = JudgmentSyncClient(JUDGMENT_API_KEY, JUDGMENT_ORG_ID)
     while poll_count < max_poll_count:
         poll_count += 1
         try:
@@ -213,14 +179,13 @@ def progress_logger(stop_event, msg="Working...", interval=5):
 
 
 def run_eval(
-    evaluation_run:
-    judgment_api_key: str,
+    evaluation_run: ExampleEvaluationRun,
 ) -> List[ScoringResult]:
     """
     Executes an evaluation of `Example`s using one or more `Scorer`s
 
     Args:
-        evaluation_run (
+        evaluation_run (ExampleEvaluationRun): Stores example and evaluation together for running
 
     Returns:
         List[ScoringResult]: A list of ScoringResult objects
@@ -258,16 +223,13 @@
         judgeval_logger.error(error_msg)
         raise ValueError(error_msg)
 
-    check_examples(evaluation_run.examples, evaluation_run.judgment_scorers)
     stop_event = threading.Event()
     t = threading.Thread(
         target=progress_logger, args=(stop_event, "Running evaluation...")
     )
     t.start()
     try:
-        api_client = JudgmentSyncClient(
-            judgment_api_key, evaluation_run.organization_id
-        )
+        api_client = JudgmentSyncClient(JUDGMENT_API_KEY, JUDGMENT_ORG_ID)
         response = api_client.add_to_run_eval_queue_examples(
             evaluation_run.model_dump(warnings=False) # type: ignore
         )
@@ -286,7 +248,6 @@
         )
         results, url = _poll_evaluation_until_complete(
             evaluation_run=evaluation_run,
-            judgment_api_key=judgment_api_key,
             expected_scorer_data_count=(num_scorers * len(evaluation_run.examples)),
         )
     finally:
@@ -306,7 +267,7 @@
     send_results = [
         scoring_result.model_dump(warnings=False) for scoring_result in results
     ]
-    url = log_evaluation_results(send_results, evaluation_run
+    url = log_evaluation_results(send_results, evaluation_run)
     rprint(
         f"\n🔍 You can view your evaluation results here: [rgb(106,0,255)][link={url}]View Results[/link]\n"
     )
@@ -323,27 +284,23 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:
     Returns:
         None. Raises exceptions for any failed test cases.
     """
-    failed_cases: List[ScorerData] = []
+    failed_cases: List[List[ScorerData]] = []
 
     for result in scoring_results:
         if not result.success:
             # Create a test case context with all relevant fields
-            test_case:
+            test_case: List[ScorerData] = []
             if result.scorers_data:
                 # If the result was not successful, check each scorer_data
                 for scorer_data in result.scorers_data:
                     if not scorer_data.success:
-
-                        # Remove threshold, evaluation model for Tool Order scorer
-                        scorer_data.threshold = None
-                        scorer_data.evaluation_model = None
-                        test_case["failed_scorers"].append(scorer_data)
+                        test_case.append(scorer_data)
             failed_cases.append(test_case)
 
     if failed_cases:
         error_msg = "The following test cases failed: \n"
         for fail_case in failed_cases:
-            for fail_scorer in fail_case
+            for fail_scorer in fail_case:
                 error_msg += (
                     f"\nScorer Name: {fail_scorer.name}\n"
                     f"Threshold: {fail_scorer.threshold}\n"
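Net effect for callers of this module: run_eval now takes only an ExampleEvaluationRun, credentials are read from the JUDGMENT_API_KEY / JUDGMENT_ORG_ID environment variables instead of being passed in, and the interactive check_examples prompt is gone. A minimal sketch under those assumptions follows; the Example field names and the judgment_scorers keyword are inferred from other hunks in this diff rather than from verified 0.11.0 documentation.

# Hedged sketch of calling the 0.11.0 run_eval entry point.
# Assumes JUDGMENT_API_KEY and JUDGMENT_ORG_ID are exported in the environment.
from judgeval.data import Example
from judgeval.data.evaluation_run import ExampleEvaluationRun
from judgeval.evaluation import run_eval, assert_test
from judgeval.scorers.judgeval_scorers.api_scorers.faithfulness import FaithfulnessScorer

run = ExampleEvaluationRun(
    project_name="my-project",
    eval_name="smoke-test",
    examples=[Example(input="What is 2 + 2?", actual_output="4")],  # field names assumed
    judgment_scorers=[FaithfulnessScorer(threshold=0.7)],  # keyword inferred from evaluation_run.judgment_scorers above
)
results = run_eval(run)  # credentials come from the environment, not a parameter
assert_test(results)     # raises if any scorer reported a failure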
judgeval/integrations/langgraph/__init__.py
CHANGED
@@ -507,6 +507,7 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
             )
 
             # Extract response content
+            output: Any
             if response.generations:
                 last_generation = response.generations[-1][-1]
                 if (
@@ -547,7 +548,7 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
             for key, value in usage_attrs.items():
                 span.set_attribute(key, value)
 
-            self._end_span(run_id=run_id, outputs=output, **usage_attrs)
+            self._end_span(run_id=run_id, outputs=output, **usage_attrs) # type: ignore
 
         except Exception as e:
             judgeval_logger.exception(f"Error in on_llm_end: {e}")
judgeval/scorers/__init__.py
CHANGED
@@ -1,7 +1,5 @@
 from judgeval.scorers.api_scorer import (
     APIScorerConfig,
-    ExampleAPIScorerConfig,
-    TraceAPIScorerConfig,
 )
 from judgeval.scorers.base_scorer import BaseScorer
 from judgeval.scorers.judgeval_scorers.api_scorers import (
@@ -15,8 +13,6 @@ from judgeval.scorers.judgeval_scorers.api_scorers import (
 
 __all__ = [
     "APIScorerConfig",
-    "ExampleAPIScorerConfig",
-    "TraceAPIScorerConfig",
     "BaseScorer",
     "TracePromptScorer",
     "PromptScorer",
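For downstream imports, the practical effect of this hunk is that ExampleAPIScorerConfig and TraceAPIScorerConfig can no longer be imported from judgeval.scorers (both classes are deleted outright in api_scorer.py below). A hedged sketch of the adjustment:

# 0.10.1 style, no longer valid in 0.11.0:
#   from judgeval.scorers import ExampleAPIScorerConfig, TraceAPIScorerConfig

# 0.11.0 style: the single base config class remains exported.
from judgeval.scorers import APIScorerConfig


def is_api_scorer(scorer: object) -> bool:
    # Example- and trace-style API scorers now share the one APIScorerConfig type.
    return isinstance(scorer, APIScorerConfig)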
judgeval/scorers/agent_scorer.py
CHANGED
@@ -1,17 +1,17 @@
-from judgeval.scorers.base_scorer import BaseScorer
-from judgeval.data.judgment_types import Trace as JudgmentTrace
-from typing import List, Optional
-from abc import abstractmethod
+# from judgeval.scorers.base_scorer import BaseScorer
+# from judgeval.data.judgment_types import Trace as JudgmentTrace
+# from typing import List, Optional
+# from abc import abstractmethod
 
 
-class TraceScorer(BaseScorer):
-    @abstractmethod
-    async def a_score_trace(
-        self, trace: JudgmentTrace, tools: Optional[List] = None, *args, **kwargs
-    ) -> float:
-        """
-        Asynchronously measures the score on a trace
-        """
-        raise NotImplementedError(
-            "You must implement the `a_score_trace` method in your custom scorer"
-        )
+# class TraceScorer(BaseScorer):
+#     @abstractmethod
+#     async def a_score_trace(
+#         self, trace: JudgmentTrace, tools: Optional[List] = None, *args, **kwargs
+#     ) -> float:
+#         """
+#         Asynchronously measures the score on a trace
+#         """
+#         raise NotImplementedError(
+#             "You must implement the `a_score_trace` method in your custom scorer"
+#         )
judgeval/scorers/api_scorer.py
CHANGED
@@ -63,11 +63,3 @@ class APIScorerConfig(BaseModel):
 
     def __str__(self):
         return f"JudgmentScorer(score_type={self.score_type.value}, threshold={self.threshold})"
-
-
-class ExampleAPIScorerConfig(APIScorerConfig):
-    pass
-
-
-class TraceAPIScorerConfig(APIScorerConfig):
-    pass
judgeval/scorers/base_scorer.py
CHANGED
@@ -27,7 +27,7 @@ class BaseScorer(BaseModel):
     threshold: float = 0.5
 
     # name of your scorer (Faithfulness, PromptScorer-randomslug)
-    name:
+    name: str = ""
 
     # The name of the class of the scorer
     class_name: Optional[str] = None
@@ -42,7 +42,7 @@ class BaseScorer(BaseModel):
     using_native_model: Optional[bool] = None
 
     # Whether the test case passed or failed
-    success:
+    success: bool = False
 
     # The name of the model used to evaluate the test case
     model: Optional[str] = None
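The two BaseScorer hunks give name and success concrete defaults ("" and False). A minimal sketch of what that permits, assuming the remaining BaseScorer fields (not shown in these hunks) also carry defaults:

from judgeval.scorers import BaseScorer


class LengthScorer(BaseScorer):
    """Toy subclass: relies on the new defaults, so name/success need not be declared."""
    threshold: float = 0.5


scorer = LengthScorer()
print(scorer.name)      # "" under the new default
print(scorer.success)   # False under the new default
scorer.name = "Length"  # both remain plain mutable pydantic fields
scorer.success = True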
judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py
CHANGED
@@ -6,13 +6,13 @@ TODO add link to docs page for this scorer
 """
 
 # Internal imports
-from judgeval.scorers.api_scorer import
+from judgeval.scorers.api_scorer import APIScorerConfig
 from judgeval.constants import APIScorerType
 from judgeval.data import ExampleParams
 from typing import List
 
 
-class AnswerCorrectnessScorer(
+class AnswerCorrectnessScorer(APIScorerConfig):
     score_type: APIScorerType = APIScorerType.ANSWER_CORRECTNESS
     required_params: List[ExampleParams] = [
         ExampleParams.INPUT,

judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py
CHANGED
@@ -1,10 +1,10 @@
-from judgeval.scorers.api_scorer import
+from judgeval.scorers.api_scorer import APIScorerConfig
 from judgeval.constants import APIScorerType
 from judgeval.data import ExampleParams
 from typing import List
 
 
-class AnswerRelevancyScorer(
+class AnswerRelevancyScorer(APIScorerConfig):
     score_type: APIScorerType = APIScorerType.ANSWER_RELEVANCY
     required_params: List[ExampleParams] = [
         ExampleParams.INPUT,

judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py
CHANGED
@@ -6,13 +6,13 @@ TODO add link to docs page for this scorer
 """
 
 # Internal imports
-from judgeval.scorers.api_scorer import
+from judgeval.scorers.api_scorer import APIScorerConfig
 from judgeval.constants import APIScorerType
 from judgeval.data import ExampleParams
 from typing import List
 
 
-class FaithfulnessScorer(
+class FaithfulnessScorer(APIScorerConfig):
     score_type: APIScorerType = APIScorerType.FAITHFULNESS
     required_params: List[ExampleParams] = [
         ExampleParams.INPUT,

judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py
CHANGED
@@ -6,12 +6,12 @@ TODO add link to docs page for this scorer
 """
 
 # Internal imports
-from judgeval.scorers.api_scorer import
+from judgeval.scorers.api_scorer import APIScorerConfig
 from judgeval.constants import APIScorerType
 from judgeval.data import ExampleParams
 
 
-class InstructionAdherenceScorer(
+class InstructionAdherenceScorer(APIScorerConfig):
     def __init__(self, threshold: float):
         super().__init__(
             threshold=threshold,

judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py
CHANGED
@@ -1,7 +1,5 @@
 from judgeval.scorers.api_scorer import (
     APIScorerConfig,
-    ExampleAPIScorerConfig,
-    TraceAPIScorerConfig,
 )
 from judgeval.constants import APIScorerType
 from typing import Dict, Any, Optional
@@ -55,7 +53,7 @@ def fetch_prompt_scorer(
 ):
     client = JudgmentSyncClient(judgment_api_key, organization_id)
     try:
-        scorer_config = client.
+        scorer_config = client.fetch_scorers({"names": [name]})["scorers"][0]
         scorer_config.pop("created_at")
         scorer_config.pop("updated_at")
         return scorer_config
@@ -284,9 +282,9 @@ class BasePromptScorer(ABC, APIScorerConfig):
         return base
 
 
-class PromptScorer(BasePromptScorer,
+class PromptScorer(BasePromptScorer, APIScorerConfig):
     pass
 
 
-class TracePromptScorer(BasePromptScorer,
+class TracePromptScorer(BasePromptScorer, APIScorerConfig):
     pass
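The fetch_prompt_scorer hunk pins down the client call shape used in 0.11.0: fetch_scorers takes a {"names": [...]} payload and returns a mapping with a "scorers" list whose entries carry created_at / updated_at timestamps that get stripped before use. A standalone sketch of that call; the key and organization values here are placeholders, and valid credentials are assumed:

from judgeval.api import JudgmentSyncClient

client = JudgmentSyncClient("my-api-key", "my-org-id")  # positional (api key, org id), as in this diff
payload = client.fetch_scorers({"names": ["my-prompt-scorer"]})
scorer_config = payload["scorers"][0]
scorer_config.pop("created_at")   # timestamps are dropped before the config is used, as above
scorer_config.pop("updated_at")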
judgeval/scorers/score.py
CHANGED
judgeval/tracer/__init__.py
CHANGED
@@ -43,8 +43,8 @@ from judgeval.env import (
     JUDGMENT_ORG_ID,
 )
 from judgeval.logger import judgeval_logger
-from judgeval.scorers.api_scorer import
-from judgeval.scorers.
+from judgeval.scorers.api_scorer import APIScorerConfig
+from judgeval.scorers.example_scorer import ExampleScorer
 from judgeval.tracer.constants import JUDGEVAL_TRACER_INSTRUMENTING_MODULE_NAME
 from judgeval.tracer.managers import (
     sync_span_context,
@@ -328,7 +328,7 @@ class Tracer:
         run_condition = scorer_config.run_condition
         sampling_rate = scorer_config.sampling_rate
 
-        if not isinstance(scorer, (
+        if not isinstance(scorer, (APIScorerConfig)):
             judgeval_logger.error(
                 "Scorer must be an instance of TraceAPIScorerConfig, got %s, skipping evaluation."
                 % type(scorer)
@@ -358,7 +358,6 @@ class Tracer:
         eval_run_name = f"async_trace_evaluate_{span_id}"
 
         eval_run = TraceEvaluationRun(
-            organization_id=self.organization_id,
             project_name=self.project_name,
             eval_name=eval_run_name,
             scorers=[scorer],
@@ -862,7 +861,7 @@ class Tracer:
         self,
         /,
         *,
-        scorer: Union[
+        scorer: Union[APIScorerConfig, ExampleScorer],
         example: Example,
         model: str = JUDGMENT_DEFAULT_GPT_MODEL,
         sampling_rate: float = 1.0,
@@ -871,7 +870,7 @@ class Tracer:
             judgeval_logger.info("Evaluation is not enabled, skipping evaluation")
             return
 
-        if not isinstance(scorer, (
+        if not isinstance(scorer, (APIScorerConfig, ExampleScorer)):
             judgeval_logger.error(
                 "Scorer must be an instance of ExampleAPIScorerConfig or BaseScorer, got %s, skipping evaluation."
                 % type(scorer)
@@ -902,13 +901,12 @@ class Tracer:
         span_context = self.get_current_span().get_span_context()
         trace_id = format(span_context.trace_id, "032x")
         span_id = format(span_context.span_id, "016x")
-        hosted_scoring = isinstance(scorer,
-            isinstance(scorer,
+        hosted_scoring = isinstance(scorer, APIScorerConfig) or (
+            isinstance(scorer, ExampleScorer) and scorer.server_hosted
         )
         eval_run_name = f"async_evaluate_{span_id}" # note this name doesnt matter because we don't save the experiment only the example and scorer_data
         if hosted_scoring:
             eval_run = ExampleEvaluationRun(
-                organization_id=self.organization_id,
                 project_name=self.project_name,
                 eval_name=eval_run_name,
                 examples=[example],
@@ -923,7 +921,6 @@ class Tracer:
         else:
             # Handle custom scorers using local evaluation queue
             eval_run = ExampleEvaluationRun(
-                organization_id=self.organization_id,
                 project_name=self.project_name,
                 eval_name=eval_run_name,
                 examples=[example],
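The hosted_scoring hunk is the routing rule this release introduces: API scorer configs, plus custom ExampleScorers flagged as server hosted, take the hosted evaluation path, while anything else falls back to the local evaluation queue. A standalone restatement of that predicate, using only names that appear in the hunks above:

from judgeval.scorers.api_scorer import APIScorerConfig
from judgeval.scorers.example_scorer import ExampleScorer


def is_hosted(scorer: object) -> bool:
    # Mirrors the 0.11.0 Tracer logic: hosted if it is an API scorer config,
    # or a custom ExampleScorer explicitly marked server_hosted.
    return isinstance(scorer, APIScorerConfig) or (
        isinstance(scorer, ExampleScorer) and scorer.server_hosted
    )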
judgeval/tracer/local_eval_queue.py
CHANGED
@@ -13,7 +13,7 @@ import time
 from judgeval.logger import judgeval_logger
 from judgeval.env import JUDGMENT_MAX_CONCURRENT_EVALUATIONS
 from judgeval.data import ScoringResult
-from judgeval.data.evaluation_run import
+from judgeval.data.evaluation_run import ExampleEvaluationRun
 from judgeval.utils.async_utils import safe_run_async
 from judgeval.scorers.score import a_execute_scoring
 from judgeval.api import JudgmentSyncClient
@@ -34,7 +34,7 @@ class LocalEvaluationQueue:
     ):
         if num_workers <= 0:
             raise ValueError("num_workers must be a positive integer.")
-        self._queue: queue.Queue[Optional[
+        self._queue: queue.Queue[Optional[ExampleEvaluationRun]] = queue.Queue()
         self._max_concurrent = max_concurrent
         self._num_workers = num_workers # Number of worker threads
         self._worker_threads: List[threading.Thread] = []
@@ -44,11 +44,11 @@ class LocalEvaluationQueue:
             organization_id=JUDGMENT_ORG_ID,
         )
 
-    def enqueue(self, evaluation_run:
+    def enqueue(self, evaluation_run: ExampleEvaluationRun) -> None:
         """Add evaluation run to the queue."""
         self._queue.put(evaluation_run)
 
-    def _process_run(self, evaluation_run:
+    def _process_run(self, evaluation_run: ExampleEvaluationRun) -> List[ScoringResult]:
         """Execute evaluation run locally and return results."""
 
         if not evaluation_run.custom_scorers:
@@ -70,7 +70,9 @@ class LocalEvaluationQueue:
 
     def run_all(
         self,
-        callback: Optional[
+        callback: Optional[
+            Callable[[ExampleEvaluationRun, List[ScoringResult]], None]
+        ] = None,
     ) -> None:
         """Process all queued runs synchronously.
 
@@ -134,7 +136,9 @@ class LocalEvaluationQueue:
 
     def start_worker(
         self,
-        callback: Optional[
+        callback: Optional[
+            Callable[[ExampleEvaluationRun, List[ScoringResult]], None]
+        ] = None,
     ) -> Optional[threading.Thread]:
         """Start a single background thread to process runs (backward compatibility).
 
@@ -144,7 +148,7 @@ class LocalEvaluationQueue:
         Returns:
             The started thread, or None if no threads were started.
         """
-        threads = self.start_workers(
+        threads = self.start_workers()
         return threads[0] if threads else None
 
     def wait_for_completion(self, timeout: Optional[float] = None) -> bool:
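A hedged usage sketch of the queue with its newly typed API; the method names, the num_workers check, and the callback signature come from the hunks above, while the Example field names and the custom_scorers keyword are assumptions carried over from the earlier sketches:

from typing import List

from judgeval.data import Example, ScoringResult
from judgeval.data.evaluation_run import ExampleEvaluationRun
from judgeval.tracer.local_eval_queue import LocalEvaluationQueue


def on_done(run: ExampleEvaluationRun, results: List[ScoringResult]) -> None:
    # The callback now receives a typed ExampleEvaluationRun in 0.11.0.
    print(run.eval_name, [r.success for r in results])


run = ExampleEvaluationRun(
    project_name="my-project",
    eval_name="local-run",
    examples=[Example(input="ping", actual_output="pong")],  # field names assumed
    custom_scorers=[],  # keyword taken from _process_run above; your BaseScorer subclasses go here
)

queue = LocalEvaluationQueue(num_workers=2)  # num_workers must be positive, per the check above
queue.enqueue(run)
queue.start_worker(callback=on_done)  # or start_workers() for the full worker pool
queue.wait_for_completion(timeout=60.0)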
judgeval/tracer/utils.py
CHANGED
@@ -2,7 +2,7 @@ from typing import Any
 from opentelemetry.trace import Span
 from pydantic import BaseModel
 from typing import Callable, Optional
-from judgeval.scorers.api_scorer import
+from judgeval.scorers.api_scorer import APIScorerConfig
 from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL
 
 
@@ -14,7 +14,7 @@ def set_span_attribute(span: Span, name: str, value: Any):
 
 
 class TraceScorerConfig(BaseModel):
-    scorer:
+    scorer: APIScorerConfig
     model: str = JUDGMENT_DEFAULT_GPT_MODEL
     sampling_rate: float = 1.0
     run_condition: Optional[Callable[..., bool]] = None
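With this change the scorer field of TraceScorerConfig is typed as APIScorerConfig, so only API scorer configs fit. A short hedged sketch; only the field names are confirmed by the hunk, and the FaithfulnessScorer construction is an assumption:

from judgeval.scorers.judgeval_scorers.api_scorers.faithfulness import FaithfulnessScorer
from judgeval.tracer.utils import TraceScorerConfig

config = TraceScorerConfig(
    scorer=FaithfulnessScorer(threshold=0.7),  # must be an APIScorerConfig instance
    sampling_rate=0.25,  # model and run_condition keep their defaults
)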
judgeval/trainer/config.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional, Dict, Any, TYPE_CHECKING
 import json
 
 if TYPE_CHECKING:
-    from fireworks.llm.llm_reinforcement_step import ReinforcementAcceleratorTypeLiteral
+    from fireworks.llm.llm_reinforcement_step import ReinforcementAcceleratorTypeLiteral # type: ignore[import-not-found]
 
 
 @dataclass
judgeval/trainer/trainer.py
CHANGED
@@ -2,7 +2,7 @@ import asyncio
 import json
 import time
 from typing import Optional, Callable, Any, List, Union, Dict
-from fireworks import Dataset
+from fireworks import Dataset # type: ignore[import-not-found]
 from .config import TrainerConfig, ModelConfig
 from .trainable_model import TrainableModel
 from judgeval.tracer import Tracer
@@ -10,7 +10,7 @@ from judgeval.tracer.exporters.store import SpanStore
 from judgeval.tracer.exporters import InMemorySpanExporter
 from judgeval.tracer.keys import AttributeKeys
 from judgeval import JudgmentClient
-from judgeval.scorers import BaseScorer,
+from judgeval.scorers import BaseScorer, APIScorerConfig
 from judgeval.data import Example
 from .console import _spinner_progress, _print_progress, _print_progress_update
 from judgeval.exceptions import JudgmentRuntimeError
@@ -85,7 +85,9 @@ class JudgmentTrainer:
                 if not first_found and span_attributes.get(
                     AttributeKeys.JUDGMENT_INPUT
                 ):
-                    input_data = span_attributes.get(
+                    input_data: Any = span_attributes.get(
+                        AttributeKeys.JUDGMENT_INPUT, {}
+                    )
                     if isinstance(input_data, dict) and "messages" in input_data:
                         input_messages = input_data["messages"]
                         if input_messages:
@@ -154,7 +156,7 @@ class JudgmentTrainer:
     async def generate_rollouts_and_rewards(
         self,
         agent_function: Callable[[Any], Any],
-        scorers: List[Union[
+        scorers: List[Union[APIScorerConfig, BaseScorer]],
         prompts: List[Any],
         num_prompts_per_step: Optional[int] = None,
         num_generations_per_prompt: Optional[int] = None,
@@ -264,7 +266,7 @@ class JudgmentTrainer:
     async def run_reinforcement_learning(
         self,
         agent_function: Callable[[Any], Any],
-        scorers: List[Union[
+        scorers: List[Union[APIScorerConfig, BaseScorer]],
         prompts: List[Any],
     ) -> ModelConfig:
         """
@@ -370,7 +372,7 @@ class JudgmentTrainer:
     async def train(
         self,
         agent_function: Callable[[Any], Any],
-        scorers: List[Union[
+        scorers: List[Union[APIScorerConfig, BaseScorer]],
         prompts: List[Any],
         rft_provider: Optional[str] = None,
     ) -> ModelConfig:
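The trainer's public coroutines now annotate scorers as List[Union[APIScorerConfig, BaseScorer]]. A small, heavily hedged sketch of building arguments that satisfy those annotations; the AnswerRelevancyScorer construction is an assumption, and the JudgmentTrainer constructor is not part of this diff, so no trainer is instantiated here:

from typing import Any, Callable, List, Union

from judgeval.scorers import APIScorerConfig, BaseScorer
from judgeval.scorers.judgeval_scorers.api_scorers.answer_relevancy import AnswerRelevancyScorer

# The scorer lists passed to generate_rollouts_and_rewards / run_reinforcement_learning / train
# are now typed like this:
Scorers = List[Union[APIScorerConfig, BaseScorer]]

scorers: Scorers = [AnswerRelevancyScorer(threshold=0.5)]
agent_function: Callable[[Any], Any] = lambda prompt: f"echo: {prompt}"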
judgeval/utils/async_utils.py
CHANGED
@@ -2,13 +2,13 @@
 
 import asyncio
 import concurrent.futures
-from typing import Awaitable, TypeVar
+from typing import Awaitable, TypeVar, Coroutine
 
 
 T = TypeVar("T")
 
 
-def safe_run_async(coro: Awaitable[T]) -> T:
+def safe_run_async(coro: Awaitable[T]) -> T:
     """Safely execute an async *coro* from synchronous code.
 
     This helper handles two common situations:
@@ -24,6 +24,8 @@ def safe_run_async(coro: Awaitable[T]) -> T: # type: ignore[type-var]
     Returns:
         The result returned by *coro*.
     """
+    if not isinstance(coro, Coroutine):
+        raise TypeError("The provided awaitable must be a coroutine.")
 
     try:
         asyncio.get_running_loop()
@@ -31,5 +33,7 @@ def safe_run_async(coro: Awaitable[T]) -> T: # type: ignore[type-var]
         return asyncio.run(coro)
 
     with concurrent.futures.ThreadPoolExecutor() as executor:
-        future = executor.submit(
+        future: concurrent.futures.Future[T] = executor.submit(
+            lambda: asyncio.run(coro)
+        )
         return future.result()
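safe_run_async now rejects awaitables that are not coroutine objects before doing any scheduling, and the thread-pool fallback wraps the coroutine in its own asyncio.run call. A small sketch of the observable behavior; the error message text is taken from the hunk above:

import asyncio

from judgeval.utils.async_utils import safe_run_async


async def add(a: int, b: int) -> int:
    await asyncio.sleep(0)
    return a + b


class NotACoroutine:
    """An awaitable that is not a coroutine object."""

    def __await__(self):
        yield


print(safe_run_async(add(2, 3)))  # 5, whether or not an event loop is already running

try:
    safe_run_async(NotACoroutine())
except TypeError as exc:
    print(exc)  # "The provided awaitable must be a coroutine."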
judgeval/utils/testing.py
CHANGED
@@ -26,10 +26,6 @@ def assert_test_results(scoring_results: List[ScoringResult]) -> None:
                 # If the result was not successful, check each scorer_data
                 for scorer_data in result.scorers_data:
                     if not scorer_data.success:
-                        if scorer_data.name == "Tool Order":
-                            # Remove threshold, evaluation model for Tool Order scorer
-                            scorer_data.threshold = None
-                            scorer_data.evaluation_model = None
                         test_case.append(scorer_data)
             failed_cases.append(test_case)
 