judgeval 0.0.37__py3-none-any.whl → 0.0.39__py3-none-any.whl

This diff shows the contents of publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between those versions exactly as they appear in the public registry.
@@ -12,7 +12,7 @@ BASE_CONVERSATION = [
 
 
 class LiteLLMJudge(JudgevalJudge):
-    def __init__(self, model: str = "gpt-4o-mini", **kwargs):
+    def __init__(self, model: str = "gpt-4.1-mini", **kwargs):
         debug(f"Initializing LiteLLMJudge with model={model}")
         self.model = model
         self.kwargs = kwargs
@@ -136,7 +136,7 @@ class MixtureOfJudges(JudgevalJudge):
     """
     def __init__(self,
                  models: List[str] = ['QWEN', 'LLAMA3_70B_INSTRUCT_TURBO', 'MISTRAL_8x22B_INSTRUCT'],
-                 aggregator: str = 'gpt-4o',
+                 aggregator: str = 'gpt-4.1',
                  **kwargs):
         """
         `models` are the individual judge models to be used for generating responses.
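Both judge classes above move their default models from the gpt-4o family to the gpt-4.1 family. A minimal sketch of the new defaults follows; the module paths for LiteLLMJudge and MixtureOfJudges are assumptions inferred from the judgeval/judges/ layout shown in this diff, not something the diff itself states.

# Sketch only: the module paths below are assumed, not confirmed by this diff.
from judgeval.judges.litellm_judge import LiteLLMJudge          # assumed path
from judgeval.judges.mixture_of_judges import MixtureOfJudges   # assumed path

single_judge = LiteLLMJudge()   # model now defaults to "gpt-4.1-mini" (was "gpt-4o-mini")
mixture = MixtureOfJudges()     # aggregator now defaults to "gpt-4.1" (was "gpt-4o")

# Pinning the model explicitly preserves the 0.0.37 behavior across the upgrade.
legacy_judge = LiteLLMJudge(model="gpt-4o-mini")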
judgeval/judges/utils.py CHANGED
@@ -23,7 +23,7 @@ def create_judge(
     If no model is provided, uses GPT4o as the default judge.
     """
     if model is None:  # default option
-        return LiteLLMJudge(model="gpt-4o"), True
+        return LiteLLMJudge(model="gpt-4.1"), True
     if not isinstance(model, (str, list, JudgevalJudge)):
         raise InvalidJudgeModelError(f"Model must be a string, list of strings, or a judgeval judge object. Got: {type(model)} instead.")
     # If model is already a valid judge type, return it and mark native
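The same default shift applies to create_judge: called without a model it now returns a LiteLLMJudge aimed at gpt-4.1. A hedged sketch, assuming create_judge is importable from judgeval.judges.utils as the file header above suggests:

from judgeval.judges.utils import create_judge   # path assumed from the file header above

judge, is_native = create_judge(model=None)
# In 0.0.39 this returns LiteLLMJudge(model="gpt-4.1") instead of "gpt-4o";
# the boolean marks the judge as a native judgeval type.
print(judge.model, is_native)   # expected: gpt-4.1 True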
@@ -12,7 +12,7 @@ from judgeval.data import (
     ScoringResult,
     Example,
     CustomExample,
-    Sequence,
+    Trace,
 )
 from judgeval.scorers import (
     APIJudgmentScorer,
@@ -23,9 +23,9 @@ from judgeval.evaluation_run import EvaluationRun
 from judgeval.run_evaluation import (
     run_eval,
     assert_test,
-    run_sequence_eval
+    run_trace_eval
 )
-from judgeval.data.sequence_run import SequenceRun
+from judgeval.data.trace_run import TraceRun
 from judgeval.judges import JudgevalJudge
 from judgeval.constants import (
     JUDGMENT_EVAL_FETCH_API_URL,
@@ -105,16 +105,16 @@ class JudgmentClient(metaclass=SingletonMeta):
             rules=rules
         )
 
-    def run_sequence_evaluation(
+    def run_trace_evaluation(
         self,
         scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
         model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
-        sequences: Optional[List[Sequence]] = None,
+        traces: Optional[List[Trace]] = None,
         examples: Optional[List[Example]] = None,
         test_file: Optional[str] = None,
         aggregator: Optional[str] = None,
         project_name: str = "default_project",
-        eval_run_name: str = "default_eval_sequence",
+        eval_run_name: str = "default_eval_trace",
         log_results: bool = True,
         append: bool = False,
         override: bool = False,
@@ -134,16 +134,16 @@ class JudgmentClient(metaclass=SingletonMeta):
             if examples and not function:
                 raise ValueError("Cannot pass in examples without a function")
 
-            if sequences and function:
-                raise ValueError("Cannot pass in sequences and function")
+            if traces and function:
+                raise ValueError("Cannot pass in traces and function")
 
-            if examples and sequences:
-                raise ValueError("Cannot pass in both examples and sequences")
+            if examples and traces:
+                raise ValueError("Cannot pass in both examples and traces")
 
-            sequence_run = SequenceRun(
+            trace_run = TraceRun(
                 project_name=project_name,
                 eval_name=eval_run_name,
-                sequences=sequences,
+                traces=traces,
                 scorers=scorers,
                 model=model,
                 aggregator=aggregator,
@@ -152,9 +152,9 @@ class JudgmentClient(metaclass=SingletonMeta):
                 judgment_api_key=self.judgment_api_key,
                 organization_id=self.organization_id,
             )
-            return run_sequence_eval(sequence_run, override, ignore_errors, function, tracer, examples)
+            return run_trace_eval(trace_run, override, ignore_errors, function, tracer, examples)
         except ValueError as e:
-            raise ValueError(f"Please check your SequenceRun object, one or more fields are invalid: \n{str(e)}")
+            raise ValueError(f"Please check your TraceRun object, one or more fields are invalid: \n{str(e)}")
         except Exception as e:
             raise Exception(f"An unexpected error occurred during evaluation: {str(e)}")
 
@@ -245,12 +245,6 @@ class JudgmentClient(metaclass=SingletonMeta):
         """
         return self.eval_dataset_client.append_examples(alias, examples, project_name)
 
-    def append_sequence_dataset(self, alias: str, sequences: List[Sequence], project_name: str) -> bool:
-        """
-        Appends a `Sequence` to the Judgment platform for storage.
-        """
-        return self.eval_dataset_client.append_sequences(alias, sequences, project_name)
-
 
     def pull_dataset(self, alias: str, project_name: str) -> EvalDataset:
         """
         Retrieves a saved `EvalDataset` from the Judgment platform.
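The hunks above rename the sequence-based evaluation path to a trace-based one: Sequence becomes Trace, SequenceRun becomes TraceRun, run_sequence_evaluation becomes run_trace_evaluation with a traces= keyword, and the sequence-only append_sequence_dataset helper is removed. A hedged calling sketch follows; the JudgmentClient import path, the scorer, and the load_traces placeholder are illustrative assumptions, not part of this diff.

from judgeval import JudgmentClient                  # assumed import path
from judgeval.data import Trace                      # shown in the import hunk above
from judgeval.scorers import AnswerRelevancyScorer   # hypothetical scorer, for illustration

def load_traces() -> list[Trace]:
    ...  # hypothetical helper: build or fetch the List[Trace] to evaluate

client = JudgmentClient()

# 0.0.39 takes traces= where 0.0.37 took sequences=; mixing traces with
# examples, or traces with a function, raises ValueError as shown above.
results = client.run_trace_evaluation(
    scorers=[AnswerRelevancyScorer(threshold=0.7)],
    traces=load_traces(),
    model="gpt-4.1",
    project_name="default_project",
    eval_run_name="default_eval_trace",   # new default run name
)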
@@ -486,7 +480,7 @@ class JudgmentClient(metaclass=SingletonMeta):
 
         return response.json()["slug"]
 
-    def assert_test(
+    async def assert_test(
         self,
         scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
         examples: Optional[List[Example]] = None,
@@ -500,7 +494,8 @@ class JudgmentClient(metaclass=SingletonMeta):
         override: bool = False,
         rules: Optional[List[Rule]] = None,
         function: Optional[Callable] = None,
-        tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None
+        tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None,
+        async_execution: bool = False
     ) -> None:
         """
         Asserts a test by running the evaluation and checking the results for success
@@ -523,7 +518,7 @@ class JudgmentClient(metaclass=SingletonMeta):
             raise ValueError("Exactly one of 'examples' or 'test_file' must be provided, but not both")
 
         if function:
-            results = self.run_sequence_evaluation(
+            results = self.run_trace_evaluation(
                 examples=examples,
                 scorers=scorers,
                 model=model,
@@ -538,7 +533,7 @@ class JudgmentClient(metaclass=SingletonMeta):
                 test_file=test_file
             )
         else:
-            results = self.run_evaluation(
+            results = await self.run_evaluation(
                 examples=examples,
                 scorers=scorers,
                 model=model,
@@ -548,7 +543,8 @@ class JudgmentClient(metaclass=SingletonMeta):
                 project_name=project_name,
                 eval_run_name=eval_run_name,
                 override=override,
-                rules=rules
+                rules=rules,
+                async_execution=async_execution
             )
 
         assert_test(results)
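Finally, assert_test becomes a coroutine and gains an async_execution flag that is forwarded to run_evaluation on the non-function path, while the function path now goes through run_trace_evaluation. A hedged calling sketch, assuming JudgmentClient is importable from the package root, that Example accepts input/actual_output keywords, and that FaithfulnessScorer exists under that name:

import asyncio

from judgeval import JudgmentClient              # assumed import path
from judgeval.data import Example                # shown in the import hunk above
from judgeval.scorers import FaithfulnessScorer  # assumed scorer name

async def main() -> None:
    client = JudgmentClient()
    # assert_test is async in 0.0.39, so callers must await it; async_execution
    # is passed through to run_evaluation when no function is supplied.
    await client.assert_test(
        scorers=[FaithfulnessScorer(threshold=0.8)],
        examples=[Example(input="What is 2 + 2?", actual_output="4")],  # assumed fields
        model="gpt-4.1",
        project_name="default_project",
        eval_run_name="smoke_test",
        async_execution=True,
    )

asyncio.run(main())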