judgeval 0.0.37__py3-none-any.whl → 0.0.39__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/tracer.py +132 -281
- judgeval/common/utils.py +1 -1
- judgeval/constants.py +2 -3
- judgeval/data/__init__.py +0 -2
- judgeval/data/datasets/dataset.py +2 -9
- judgeval/data/datasets/eval_dataset_client.py +1 -62
- judgeval/data/example.py +7 -7
- judgeval/data/result.py +3 -3
- judgeval/data/tool.py +19 -0
- judgeval/data/trace.py +5 -1
- judgeval/data/{sequence_run.py → trace_run.py} +4 -4
- judgeval/evaluation_run.py +1 -1
- judgeval/integrations/langgraph.py +187 -1768
- judgeval/judges/litellm_judge.py +1 -1
- judgeval/judges/mixture_of_judges.py +1 -1
- judgeval/judges/utils.py +1 -1
- judgeval/judgment_client.py +21 -25
- judgeval/run_evaluation.py +381 -107
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +4 -2
- judgeval-0.0.39.dist-info/METADATA +247 -0
- {judgeval-0.0.37.dist-info → judgeval-0.0.39.dist-info}/RECORD +23 -23
- judgeval/data/sequence.py +0 -50
- judgeval-0.0.37.dist-info/METADATA +0 -214
- {judgeval-0.0.37.dist-info → judgeval-0.0.39.dist-info}/WHEEL +0 -0
- {judgeval-0.0.37.dist-info → judgeval-0.0.39.dist-info}/licenses/LICENSE.md +0 -0
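The rename of sequence_run.py to trace_run.py and the deletion of sequence.py point to a Sequence-to-Trace migration across the package. A minimal sketch of the resulting import change for downstream code, assuming the old module exposed a SequenceRun class (the old import is truncated in the judgment_client.py hunk below, so that name is an assumption; the new import is confirmed by the diff):

    # judgeval 0.0.37 (assumed): from judgeval.data.sequence_run import SequenceRun
    # judgeval 0.0.39 (confirmed below):
    from judgeval.data.trace_run import TraceRun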
judgeval/judges/litellm_judge.py
CHANGED
@@ -12,7 +12,7 @@ BASE_CONVERSATION = [
 
 
 class LiteLLMJudge(JudgevalJudge):
-    def __init__(self, model: str = "gpt-
+    def __init__(self, model: str = "gpt-4.1-mini", **kwargs):
         debug(f"Initializing LiteLLMJudge with model={model}")
         self.model = model
         self.kwargs = kwargs
judgeval/judges/mixture_of_judges.py
CHANGED
@@ -136,7 +136,7 @@ class MixtureOfJudges(JudgevalJudge):
     """
     def __init__(self,
                  models: List[str] = ['QWEN', 'LLAMA3_70B_INSTRUCT_TURBO', 'MISTRAL_8x22B_INSTRUCT'],
-                 aggregator: str = 'gpt-
+                 aggregator: str = 'gpt-4.1',
                  **kwargs):
         """
         `models` are the individual judge models to be used for generating responses.
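Both hunks move the default judge models onto the gpt-4.1 family. A hedged sketch of how the new defaults surface, importing each class from the module path shown in the file headers above (whether these are also re-exported from judgeval.judges is not confirmed by this diff):

    from judgeval.judges.litellm_judge import LiteLLMJudge
    from judgeval.judges.mixture_of_judges import MixtureOfJudges

    judge = LiteLLMJudge()       # model now defaults to "gpt-4.1-mini"
    panel = MixtureOfJudges()    # aggregator now defaults to 'gpt-4.1'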
judgeval/judges/utils.py
CHANGED
@@ -23,7 +23,7 @@ def create_judge(
     If no model is provided, uses GPT4o as the default judge.
     """
     if model is None:  # default option
-        return LiteLLMJudge(model="gpt-
+        return LiteLLMJudge(model="gpt-4.1"), True
     if not isinstance(model, (str, list, JudgevalJudge)):
         raise InvalidJudgeModelError(f"Model must be a string, list of strings, or a judgeval judge object. Got: {type(model)} instead.")
     # If model is already a valid judge type, return it and mark native
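The default branch now returns a two-element tuple, consistent with the trailing comment about marking a model native. A hedged call-site sketch; the unpacking name is_native is an assumption, while the (judge, True) return shape comes straight from the diff:

    from judgeval.judges.utils import create_judge

    judge, is_native = create_judge(model=None)  # -> (LiteLLMJudge(model="gpt-4.1"), True)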
judgeval/judgment_client.py
CHANGED
@@ -12,7 +12,7 @@ from judgeval.data import (
     ScoringResult,
     Example,
     CustomExample,
-
+    Trace,
 )
 from judgeval.scorers import (
     APIJudgmentScorer,
@@ -23,9 +23,9 @@ from judgeval.evaluation_run import EvaluationRun
 from judgeval.run_evaluation import (
     run_eval,
     assert_test,
-
+    run_trace_eval
 )
-from judgeval.data.
+from judgeval.data.trace_run import TraceRun
 from judgeval.judges import JudgevalJudge
 from judgeval.constants import (
     JUDGMENT_EVAL_FETCH_API_URL,
@@ -105,16 +105,16 @@ class JudgmentClient(metaclass=SingletonMeta):
             rules=rules
         )
 
-    def
+    def run_trace_evaluation(
         self,
         scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
        model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
-
+        traces: Optional[List[Trace]] = None,
         examples: Optional[List[Example]] = None,
         test_file: Optional[str] = None,
         aggregator: Optional[str] = None,
         project_name: str = "default_project",
-        eval_run_name: str = "
+        eval_run_name: str = "default_eval_trace",
         log_results: bool = True,
         append: bool = False,
         override: bool = False,
@@ -134,16 +134,16 @@ class JudgmentClient(metaclass=SingletonMeta):
             if examples and not function:
                 raise ValueError("Cannot pass in examples without a function")
 
-            if
-                raise ValueError("Cannot pass in
+            if traces and function:
+                raise ValueError("Cannot pass in traces and function")
 
-            if examples and
-                raise ValueError("Cannot pass in both examples and
+            if examples and traces:
+                raise ValueError("Cannot pass in both examples and traces")
 
-
+            trace_run = TraceRun(
                 project_name=project_name,
                 eval_name=eval_run_name,
-
+                traces=traces,
                 scorers=scorers,
                 model=model,
                 aggregator=aggregator,
@@ -152,9 +152,9 @@ class JudgmentClient(metaclass=SingletonMeta):
                 judgment_api_key=self.judgment_api_key,
                 organization_id=self.organization_id,
             )
-            return
+            return run_trace_eval(trace_run, override, ignore_errors, function, tracer, examples)
         except ValueError as e:
-            raise ValueError(f"Please check your
+            raise ValueError(f"Please check your TraceRun object, one or more fields are invalid: \n{str(e)}")
         except Exception as e:
             raise Exception(f"An unexpected error occurred during evaluation: {str(e)}")
 
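Putting the pieces together, the new trace path validates its inputs, wraps them in a TraceRun, and hands off to run_trace_eval. A hedged usage sketch based only on the signature and validation checks above; the scorer and trace values are placeholders:

    client = JudgmentClient()
    results = client.run_trace_evaluation(
        scorers=[...],                  # APIJudgmentScorer / JudgevalScorer instances
        traces=my_traces,               # List[Trace]; cannot be combined with examples
        project_name="default_project",
        eval_run_name="default_eval_trace",
    )
    # Alternatively, pass examples= together with function= (and optionally tracer=);
    # per the checks above, examples without a function raise a ValueError.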
@@ -245,12 +245,6 @@ class JudgmentClient(metaclass=SingletonMeta):
         """
         return self.eval_dataset_client.append_examples(alias, examples, project_name)
 
-    def append_sequence_dataset(self, alias: str, sequences: List[Sequence], project_name: str) -> bool:
-        """
-        Appends a `Sequence` to the Judgment platform for storage.
-        """
-        return self.eval_dataset_client.append_sequences(alias, sequences, project_name)
-
     def pull_dataset(self, alias: str, project_name: str) -> EvalDataset:
         """
         Retrieves a saved `EvalDataset` from the Judgment platform.
@@ -486,7 +480,7 @@ class JudgmentClient(metaclass=SingletonMeta):
 
         return response.json()["slug"]
 
-    def assert_test(
+    async def assert_test(
         self,
         scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
         examples: Optional[List[Example]] = None,
@@ -500,7 +494,8 @@ class JudgmentClient(metaclass=SingletonMeta):
         override: bool = False,
         rules: Optional[List[Rule]] = None,
         function: Optional[Callable] = None,
-        tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None
+        tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None,
+        async_execution: bool = False
     ) -> None:
         """
         Asserts a test by running the evaluation and checking the results for success
@@ -523,7 +518,7 @@ class JudgmentClient(metaclass=SingletonMeta):
             raise ValueError("Exactly one of 'examples' or 'test_file' must be provided, but not both")
 
         if function:
-            results = self.
+            results = self.run_trace_evaluation(
                 examples=examples,
                 scorers=scorers,
                 model=model,
@@ -538,7 +533,7 @@ class JudgmentClient(metaclass=SingletonMeta):
                 test_file=test_file
             )
         else:
-            results = self.run_evaluation(
+            results = await self.run_evaluation(
                 examples=examples,
                 scorers=scorers,
                 model=model,
@@ -548,7 +543,8 @@ class JudgmentClient(metaclass=SingletonMeta):
                 project_name=project_name,
                 eval_run_name=eval_run_name,
                 override=override,
-                rules=rules
+                rules=rules,
+                async_execution=async_execution
             )
 
         assert_test(results)
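Because assert_test is now a coroutine, callers must await it. A minimal sketch, assuming an async-aware test runner such as pytest-asyncio; the scorer and example values are placeholders:

    import pytest

    @pytest.mark.asyncio
    async def test_agent_quality():
        client = JudgmentClient()
        await client.assert_test(
            scorers=[...],            # placeholder scorer list
            examples=my_examples,     # exactly one of examples / test_file
            async_execution=True,     # new flag, forwarded to run_evaluation
        )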