judgeval 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -24,7 +24,7 @@ from judgeval.common.logger import judgeval_logger
 if TYPE_CHECKING:
     from judgeval.common.tracer import Tracer
     from judgeval.data.trace_run import TraceRun
-    from judgeval.evaluation_run import EvaluationRun
+    from judgeval.data.evaluation_run import EvaluationRun
     from judgeval.integrations.langgraph import JudgevalCallbackHandler
 
 
@@ -140,80 +140,6 @@ def check_missing_scorer_data(results: List[ScoringResult]) -> List[ScoringResul
     return results
 
 
-def check_experiment_type(
-    eval_name: str,
-    project_name: str,
-    judgment_api_key: str,
-    organization_id: str,
-    is_trace: bool,
-) -> None:
-    """
-    Checks if the current experiment, if one exists, has the same type (examples of traces)
-    """
-    api_client = JudgmentApiClient(judgment_api_key, organization_id)
-
-    try:
-        api_client.check_experiment_type(eval_name, project_name, is_trace)
-    except JudgmentAPIException as e:
-        if e.response.status_code == 422:
-            judgeval_logger.error(f"{e.response_json}")
-            raise ValueError(f"{e.response_json}")
-        else:
-            raise e
-    except Exception as e:
-        judgeval_logger.error(f"Failed to check if experiment type exists: {str(e)}")
-        raise JudgmentAPIError(f"Failed to check if experiment type exists: {str(e)}")
-
-
-def check_eval_run_name_exists(
-    eval_name: str, project_name: str, judgment_api_key: str, organization_id: str
-) -> None:
-    """
-    Checks if an evaluation run name already exists for a given project.
-
-    Args:
-        eval_name (str): Name of the evaluation run
-        project_name (str): Name of the project
-        judgment_api_key (str): API key for authentication
-
-    Raises:
-        ValueError: If the evaluation run name already exists
-        JudgmentAPIError: If there's an API error during the check
-    """
-    api_client = JudgmentApiClient(judgment_api_key, organization_id)
-    try:
-        api_client.check_eval_run_name_exists(eval_name, project_name)
-    except JudgmentAPIException as e:
-        if e.response.status_code == 409:
-            error_str = f"Eval run name '{eval_name}' already exists for this project. Please choose a different name, set the `override` flag to true, or set the `append` flag to true. See https://docs.judgmentlabs.ai/sdk-reference/judgment-client#override for more information."
-            judgeval_logger.error(error_str)
-            raise ValueError(error_str)
-        else:
-            raise e
-
-    except Exception as e:
-        judgeval_logger.error(f"Failed to check if eval run name exists: {str(e)}")
-        raise JudgmentAPIError(f"Failed to check if eval run name exists: {str(e)}")
-
-
-def check_example_keys(
-    keys: List[str],
-    eval_name: str,
-    project_name: str,
-    judgment_api_key: str,
-    organization_id: str,
-) -> None:
-    """
-    Checks if the current experiment (if one exists) has the same keys for example
-    """
-    api_client = JudgmentApiClient(judgment_api_key, organization_id)
-    try:
-        api_client.check_example_keys(keys, eval_name, project_name)
-    except Exception as e:
-        judgeval_logger.error(f"Failed to check if example keys match: {str(e)}")
-        raise JudgmentAPIError(f"Failed to check if example keys match: {str(e)}")
-
-
 def log_evaluation_results(
     scoring_results: List[ScoringResult],
     run: Union[EvaluationRun, TraceRun],
@@ -285,29 +211,10 @@ def check_examples(
 def run_trace_eval(
     trace_run: TraceRun,
     judgment_api_key: str,
-    override: bool = False,
     function: Optional[Callable] = None,
     tracer: Optional[Union[Tracer, "JudgevalCallbackHandler"]] = None,
     examples: Optional[List[Example]] = None,
 ) -> List[ScoringResult]:
-    # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
-    if not override and not trace_run.append:
-        check_eval_run_name_exists(
-            trace_run.eval_name,
-            trace_run.project_name,
-            judgment_api_key,
-            trace_run.organization_id,
-        )
-
-    if trace_run.append:
-        # Check that the current experiment, if one exists, has the same type (examples or traces)
-        check_experiment_type(
-            trace_run.eval_name,
-            trace_run.project_name,
-            judgment_api_key,
-            trace_run.organization_id,
-            True,
-        )
     if function and tracer and examples is not None:
         new_traces: List[Trace] = []
 
@@ -376,43 +283,8 @@ def run_trace_eval(
     return scoring_results
 
 
-async def get_evaluation_status(
-    eval_name: str, project_name: str, judgment_api_key: str, organization_id: str
-) -> Dict:
-    """
-    Gets the status of an async evaluation run.
-
-    Args:
-        eval_name (str): Name of the evaluation run
-        project_name (str): Name of the project
-        judgment_api_key (str): API key for authentication
-        organization_id (str): Organization ID for the evaluation
-
-    Returns:
-        Dict: Status information including:
-            - status: 'pending', 'running', 'completed', or 'failed'
-            - results: List of ScoringResult objects if completed
-            - error: Error message if failed
-    """
-    api_client = JudgmentApiClient(judgment_api_key, organization_id)
-    try:
-        return api_client.get_evaluation_status(eval_name, project_name)
-    except Exception as e:
-        raise JudgmentAPIError(
-            f"An error occurred while checking evaluation status: {str(e)}"
-        )
-
-
-def retrieve_counts(result: Dict):
-    scorer_data_count = 0
-    for example in result.get("examples", []):
-        for scorer in example.get("scorer_data", []):
-            scorer_data_count += 1
-    return scorer_data_count
-
-
 def _poll_evaluation_until_complete(
-    eval_name: str,
+    experiment_run_id: str,
     project_name: str,
     judgment_api_key: str,
     organization_id: str,
@@ -443,14 +315,16 @@ def _poll_evaluation_until_complete(
         poll_count += 1
         try:
             # Check status
-            status_response = api_client.get_evaluation_status(eval_name, project_name)
+            status_response = api_client.get_evaluation_status(
+                experiment_run_id, project_name
+            )
 
             if status_response.get("status") != "completed":
                 time.sleep(poll_interval_seconds)
                 continue
 
             results_response = api_client.fetch_evaluation_results(
-                project_name, eval_name
+                experiment_run_id, project_name
             )
             url = results_response.get("ui_results_url")
 
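Status polling and result fetching are now keyed by the experiment run ID instead of the evaluation name. Stripped of the retry and timeout handling in _poll_evaluation_until_complete, the new call shape reduces to the sketch below (a sketch only; api_client is assumed to already be a JudgmentApiClient built from the API key and organization ID):

    # Sketch of the 0.6.0 polling flow; error handling and sleep/backoff omitted.
    status_response = api_client.get_evaluation_status(experiment_run_id, project_name)
    if status_response.get("status") == "completed":
        results_response = api_client.fetch_evaluation_results(experiment_run_id, project_name)
        url = results_response.get("ui_results_url")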
@@ -513,14 +387,12 @@ def progress_logger(stop_event, msg="Working...", interval=5):
 def run_eval(
     evaluation_run: EvaluationRun,
     judgment_api_key: str,
-    override: bool = False,
 ) -> List[ScoringResult]:
     """
     Executes an evaluation of `Example`s using one or more `Scorer`s
 
     Args:
         evaluation_run (EvaluationRun): Stores example and evaluation together for running
-        override (bool, optional): Whether to override existing evaluation run with same name. Defaults to False.
 
     Returns:
         List[ScoringResult]: A list of ScoringResult objects
@@ -534,52 +406,31 @@ def run_eval(
                 f"All examples must have the same keys: {current_keys} != {keys}"
             )
 
-    # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
-    if not override and not evaluation_run.append:
-        check_eval_run_name_exists(
-            evaluation_run.eval_name,
-            evaluation_run.project_name,
-            judgment_api_key,
-            evaluation_run.organization_id,
-        )
-
-    if evaluation_run.append:
-        # Check that the current experiment, if one exists, has the same type (examples of traces)
-        check_experiment_type(
-            evaluation_run.eval_name,
-            evaluation_run.project_name,
-            judgment_api_key,
-            evaluation_run.organization_id,
-            False,
-        )
-
-        # Ensure that current experiment (if one exists) has the same keys for example
-        check_example_keys(
-            keys=list(keys),
-            eval_name=evaluation_run.eval_name,
-            project_name=evaluation_run.project_name,
-            judgment_api_key=judgment_api_key,
-            organization_id=evaluation_run.organization_id,
-        )
-
-    judgment_scorers: List[APIScorerConfig] = []
-    local_scorers: List[BaseScorer] = []
-    for scorer in evaluation_run.scorers:
-        if isinstance(scorer, APIScorerConfig):
-            judgment_scorers.append(scorer)
-        else:
-            local_scorers.append(scorer)
-
     results: List[ScoringResult] = []
     url = ""
 
-    if len(local_scorers) > 0 and len(judgment_scorers) > 0:
+    if (
+        len(evaluation_run.custom_scorers) > 0
+        and len(evaluation_run.judgment_scorers) > 0
+    ):
         error_msg = "We currently do not support running both local and Judgment API scorers at the same time. Please run your evaluation with either local scorers or Judgment API scorers, but not both."
         judgeval_logger.error(error_msg)
         raise ValueError(error_msg)
 
-    if len(judgment_scorers) > 0:
-        check_examples(evaluation_run.examples, judgment_scorers)
+    e2b_scorers = [cs for cs in evaluation_run.custom_scorers if cs.server_hosted]
+
+    if evaluation_run.judgment_scorers or e2b_scorers:
+        if evaluation_run.judgment_scorers and e2b_scorers:
+            error_msg = "We currently do not support running both hosted custom scorers and Judgment API scorers at the same time. Please run your evaluation with one or the other, but not both."
+            judgeval_logger.error(error_msg)
+            raise ValueError(error_msg)
+
+        if len(e2b_scorers) > 1:
+            error_msg = "We currently do not support running multiple hosted custom scorers at the same time."
+            judgeval_logger.error(error_msg)
+            raise ValueError(error_msg)
+
+        check_examples(evaluation_run.examples, evaluation_run.judgment_scorers)
         stop_event = threading.Event()
         t = threading.Thread(
             target=progress_logger, args=(stop_event, "Running evaluation...")
@@ -600,36 +451,26 @@ def run_eval(
                 )
                 raise JudgmentAPIError(error_message)
 
-            old_scorer_data_count = 0
-            if evaluation_run.append:
-                try:
-                    results_response = api_client.fetch_evaluation_results(
-                        evaluation_run.project_name, evaluation_run.eval_name
-                    )
-                    old_scorer_data_count = retrieve_counts(results_response)
-                except Exception:
-                    # This usually means the user did append = True but the eval run name doesn't exist yet
-                    pass
-
+            num_scorers = (
+                len(evaluation_run.judgment_scorers)
+                if evaluation_run.judgment_scorers
+                else sum(1 for cs in evaluation_run.custom_scorers if cs.server_hosted)
+            )
             results, url = _poll_evaluation_until_complete(
-                eval_name=evaluation_run.eval_name,
+                experiment_run_id=evaluation_run.id,
                 project_name=evaluation_run.project_name,
                 judgment_api_key=judgment_api_key,
                 organization_id=evaluation_run.organization_id,
-                expected_scorer_data_count=(
-                    len(evaluation_run.scorers) * len(evaluation_run.examples)
-                )
-                + old_scorer_data_count,
+                expected_scorer_data_count=(num_scorers * len(evaluation_run.examples)),
             )
         finally:
             stop_event.set()
             t.join()
-
-    if len(local_scorers) > 0:
+    else:
         results = safe_run_async(
             a_execute_scoring(
                 evaluation_run.examples,
-                local_scorers,
+                evaluation_run.custom_scorers,
                 model=evaluation_run.model,
                 throttle_value=0,
                 max_concurrent=MAX_CONCURRENT_EVALUATIONS,
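Taken together with the earlier hunks, run_eval no longer accepts an override flag or reconciles appended runs by name; it branches purely on the scorers attached to the EvaluationRun. The routing rules can be restated as a standalone sketch (attribute names follow this diff; the helper function itself is illustrative and not part of the package):

    # Illustrative restatement of the scorer-routing rules in the new run_eval.
    def choose_execution_path(evaluation_run) -> str:
        custom = evaluation_run.custom_scorers        # locally defined BaseScorer instances
        judgment = evaluation_run.judgment_scorers    # APIScorerConfig instances
        hosted = [cs for cs in custom if cs.server_hosted]

        if custom and judgment:
            raise ValueError("Local and Judgment API scorers cannot be mixed in one run.")
        if len(hosted) > 1:
            raise ValueError("Only one hosted custom scorer is supported per run.")

        # Judgment API scorers and hosted custom scorers go through the backend queue and
        # are polled by experiment run ID; purely local custom scorers run in-process.
        return "backend" if (judgment or hosted) else "local"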
@@ -26,6 +26,7 @@ class BaseScorer(BaseModel):
     name: Optional[str] = (
         None  # name of your scorer (Faithfulness, PromptScorer-randomslug)
     )
+    class_name: Optional[str] = None  # The name of the class of the scorer
     score: Optional[float] = None  # The float score of the scorer run on the test case
     score_breakdown: Optional[Dict] = None
     reason: Optional[str] = ""
@@ -39,24 +40,22 @@ class BaseScorer(BaseModel):
     error: Optional[str] = None  # The error message if the scorer failed
     additional_metadata: Optional[Dict] = None  # Additional metadata for the scorer
     user: Optional[str] = None  # The user ID of the scorer
+    server_hosted: bool = False  # Whether the scorer is enabled for e2b
 
-    @model_validator(mode="before")
+    @model_validator(mode="after")
     @classmethod
-    def enforce_strict_threshold(cls, data: dict):
-        if data.get("strict_mode"):
-            data["threshold"] = 1.0
+    def enforce_strict_threshold(cls, data: "BaseScorer"):
+        if data.strict_mode:
+            data.threshold = 1.0
         return data
 
     @model_validator(mode="after")
     @classmethod
     def default_name(cls, m: "BaseScorer") -> "BaseScorer":
+        # Always set class_name to the string name of the class
+        m.class_name = m.__class__.__name__
         if not m.name:
-            # Try to use the class name if it exists and is not empty
-            class_name = getattr(m, "__class__", None)
-            if class_name and getattr(m.__class__, "__name__", None):
-                m.name = m.__class__.__name__
-            else:
-                m.name = m.score_type
+            m.name = m.class_name
         return m
 
     def _add_model(self, model: str):
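The BaseScorer changes move strict-mode enforcement to an after-validator on the constructed model, always record the concrete class in class_name, fall back to class_name when no explicit name is given, and add a server_hosted flag for scorers run on the hosted (e2b) runner. A small sketch of what a subclass would observe, assuming BaseScorer exposes the score_type, threshold, and strict_mode fields implied by the validators above (the AnswerLengthScorer class is hypothetical):

    from judgeval.scorers import BaseScorer

    class AnswerLengthScorer(BaseScorer):  # hypothetical subclass for illustration
        score_type: str = "answer_length"
        threshold: float = 0.5

    scorer = AnswerLengthScorer(strict_mode=True)
    assert scorer.class_name == "AnswerLengthScorer"  # always set by the "after" validator
    assert scorer.name == "AnswerLengthScorer"        # name falls back to class_name
    assert scorer.threshold == 1.0                    # strict_mode forces the threshold to 1.0
    assert scorer.server_hosted is False              # new flag, off by default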
@@ -11,13 +11,14 @@ from judgeval.common.logger import judgeval_logger
 def push_prompt_scorer(
     name: str,
     prompt: str,
+    threshold: float,
     options: Optional[Dict[str, float]] = None,
     judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
     organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
 ) -> str:
     client = JudgmentApiClient(judgment_api_key, organization_id)
     try:
-        r = client.save_scorer(name, prompt, options)
+        r = client.save_scorer(name, prompt, threshold, options)
     except JudgmentAPIException as e:
         if e.status_code == 500:
             raise JudgmentAPIError(
@@ -90,6 +91,7 @@ class PromptScorer(APIScorerConfig):
         return cls(
             name=name,
             prompt=scorer_config["prompt"],
+            threshold=scorer_config["threshold"],
             options=scorer_config.get("options"),
             judgment_api_key=judgment_api_key,
             organization_id=organization_id,
@@ -100,16 +102,20 @@ class PromptScorer(APIScorerConfig):
         cls,
         name: str,
         prompt: str,
+        threshold: Optional[float] = 0.5,
         options: Optional[Dict[str, float]] = None,
         judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
         organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
     ):
         if not scorer_exists(name, judgment_api_key, organization_id):
-            push_prompt_scorer(name, prompt, options, judgment_api_key, organization_id)
+            push_prompt_scorer(
+                name, prompt, threshold, options, judgment_api_key, organization_id
+            )
             judgeval_logger.info(f"Successfully created PromptScorer: {name}")
         return cls(
             name=name,
             prompt=prompt,
+            threshold=threshold,
             options=options,
             judgment_api_key=judgment_api_key,
             organization_id=organization_id,
@@ -158,6 +164,12 @@ class PromptScorer(APIScorerConfig):
         judgeval_logger.info(f"Successfully appended to prompt for {self.name}")
 
     # Getters
+    def get_threshold(self) -> float | None:
+        """
+        Returns the threshold of the scorer.
+        """
+        return self.threshold
+
     def get_prompt(self) -> str | None:
         """
         Returns the prompt of the scorer.
@@ -183,6 +195,7 @@ class PromptScorer(APIScorerConfig):
         return {
             "name": self.name,
             "prompt": self.prompt,
+            "threshold": self.threshold,
             "options": self.options,
         }
 
@@ -193,13 +206,14 @@ class PromptScorer(APIScorerConfig):
         push_prompt_scorer(
             self.name,
             self.prompt,
+            self.threshold,
             self.options,
             self.judgment_api_key,
             self.organization_id,
         )
 
     def __str__(self):
-        return f"PromptScorer(name={self.name}, prompt={self.prompt}, options={self.options})"
+        return f"PromptScorer(name={self.name}, prompt={self.prompt}, threshold={self.threshold}, options={self.options})"
 
     def model_dump(self, *args, **kwargs) -> Dict[str, Any]:
         base = super().model_dump(*args, **kwargs)
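PromptScorer now carries a threshold end to end: push_prompt_scorer sends it to the backend, the fetched config restores it, and it appears in the getters, the config dict, and __str__. A hedged usage sketch of the updated helper (argument names per the signatures in this diff; the scorer name and prompt are made-up examples, and the import path is inferred from the RECORD entries elsewhere in this diff):

    import os
    from judgeval.scorers.judgeval_scorers.api_scorers.prompt_scorer import push_prompt_scorer

    # Registers a prompt scorer with an explicit threshold alongside its prompt and options.
    push_prompt_scorer(
        name="helpfulness-judge",  # made-up example name
        prompt="Rate how helpful the answer is on a scale of 0 to 1.",
        threshold=0.7,
        options=None,
        judgment_api_key=os.getenv("JUDGMENT_API_KEY") or "",
        organization_id=os.getenv("JUDGMENT_ORG_ID") or "",
    )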
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.5.0
+Version: 0.6.0
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -11,6 +11,7 @@ Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
 Requires-Python: >=3.11
 Requires-Dist: boto3
+Requires-Dist: click<8.2.0
 Requires-Dist: langchain-anthropic
 Requires-Dist: langchain-core
 Requires-Dist: langchain-huggingface
@@ -23,6 +24,7 @@ Requires-Dist: orjson>=3.9.0
 Requires-Dist: python-dotenv
 Requires-Dist: requests
 Requires-Dist: rich
+Requires-Dist: typer>=0.9.0
 Provides-Extra: langchain
 Requires-Dist: langchain-anthropic; extra == 'langchain'
 Requires-Dist: langchain-core; extra == 'langchain'
@@ -1,43 +1,44 @@
 judgeval/__init__.py,sha256=5Lm1JMYFREJGN_8X-Wpruu_ovwGLJ08gCzNAt-u-pQE,419
+judgeval/cli.py,sha256=IcL4_bGr9CtEeea1-AFqM_TEV_VomDlArlxh4IomiSQ,1754
 judgeval/clients.py,sha256=HHul68PV1om0dxsVZZu90TtCiy5zaqAwph16jXTQzQo,989
 judgeval/constants.py,sha256=UNoTLHgbpZHRInPM2ZaI3m0XokPkee5ILlg20reqhzo,4180
 judgeval/dataset.py,sha256=vOrDKam2I-K1WcVF5IBkQruCDvXTc8PRaFm4-dV0lXs,6220
-judgeval/evaluation_run.py,sha256=FJpnc1sGncmAOAnEUO0n2vNXjlycljGqBdV99qPT5og,3087
-judgeval/judgment_client.py,sha256=tGhENRb2YVIe2WUlcssC8DuEijeUC7Ajj_rh_Dh7bzA,11878
-judgeval/local_eval_queue.py,sha256=fAI0_OlvCr-WOCQWw18C4JIRJHKYzlyGzsGUm8LcsYE,7076
+judgeval/judgment_client.py,sha256=-7xcBFowzXKedMINwfZCOL4FKucECWPNEY9QVMo_cys,13644
+judgeval/local_eval_queue.py,sha256=GmlXeZt7bfAJe1hPUjDg_irth4RkNqL2Zdi7VzboBzI,6984
 judgeval/rules.py,sha256=CoQjqmP8daEXewMkplmA-7urubDtweOr5O6z8klVwLI,20031
-judgeval/run_evaluation.py,sha256=4kcaw3R_akhxqutGFGTaBS2pqD-3d0ET7zMDL1_7HK4,27741
+judgeval/run_evaluation.py,sha256=gs-_v_i95LKlJj95G2RmQXvIyBfoldnd1pWCNO4UqsM,21985
 judgeval/version_check.py,sha256=FoLEtpCjDw2HuDQdpw5yT29UtwumSc6ZZN6AV_c9Mnw,1057
 judgeval/common/__init__.py,sha256=KH-QJyWtQ60R6yFIBDYS3WGRiNpEu1guynpxivZvpBQ,309
 judgeval/common/exceptions.py,sha256=OkgDznu2wpBQZMXiZarLJYNk1HIcC8qYW7VypDC3Ook,556
 judgeval/common/logger.py,sha256=514eFLYWS_UL8VY-zAR2ePUlpQe4rbYlleLASFllLE4,1511
 judgeval/common/utils.py,sha256=oxGDRVWOICKWeyGgsoc36_yAyHSYF4XtH842Mkznwis,34739
 judgeval/common/api/__init__.py,sha256=-E7lpZz1fG8puR_aYUMfPmQ-Vyhd0bgzoaU5EhIuFjQ,114
-judgeval/common/api/api.py,sha256=uuLH6veC0LewfZ1IFiiUi5_OV7zTa7xTIK9LRlLoufc,13743
-judgeval/common/api/constants.py,sha256=DXej0m8HEhb871SdiR8t_o4fzeMoQjHYqb_X0Plj8wY,4577
-judgeval/common/api/json_encoder.py,sha256=XsScZe9hZP56yuxQ-3Ox6K8DcbjWxc2Yq7FcLF9qkUE,5852
+judgeval/common/api/api.py,sha256=fWtMNln0o1wOhJ9wangWpyY_j3WF7P3at_LYPJEicP0,13670
+judgeval/common/api/constants.py,sha256=y0BDcQqHBZ7MwLd4gT5hLUF8UMs_GVwsJGC-ibfxCAw,4698
+judgeval/common/api/json_encoder.py,sha256=QQgCe2FBmW1uWKx8yvuhr4U7_b4D0sG97GZtXHKnBdk,5881
 judgeval/common/storage/__init__.py,sha256=a-PI7OL-ydyzugGUKmJKRBASnK-Q-gs82L9K9rSyJP8,90
 judgeval/common/storage/s3_storage.py,sha256=0-bNKheqJJyBZ92KGrzQtd1zocIRWBlfn_58L4a-Ay0,3719
 judgeval/common/tracer/__init__.py,sha256=tJCJsmVmrL89Phv88gNCJ-j0ITPez6lh8vhMAAlLNSc,795
 judgeval/common/tracer/constants.py,sha256=yu5y8gMe5yb1AaBkPtAH-BNwIaAR3NwYCRoSf45wp5U,621
-judgeval/common/tracer/core.py,sha256=rI7P0CaarP5FLQZmOGWpOJkjdf6WUgSds6i_QF04J3M,85071
+judgeval/common/tracer/core.py,sha256=TQ80NODaJx7gzmntevDLA3evVJ3m2Zy2s0Pwd7APG9Y,84867
 judgeval/common/tracer/otel_exporter.py,sha256=kZLlOQ6afQE4dmb9H1wgU4P3H5PG1D_zKyvnpWcT5Ak,3899
-judgeval/common/tracer/otel_span_processor.py,sha256=W7SM62KnxJ48vC9WllIHRKaLlvxkCwqYoT4KqZLfGNs,6497
+judgeval/common/tracer/otel_span_processor.py,sha256=BD-FKXaZft5_3zqy1Qe_tpkudVOLop9AGhBjZUgp-Z8,6502
 judgeval/common/tracer/providers.py,sha256=3c3YOtKuoBjlTL0rc2HAGnUpppqvsyzrN5H6EKCqEi0,2733
-judgeval/common/tracer/span_processor.py,sha256=eFjTgSWSkM6BWE94CrvgafDg_WkxLsFL_MafwBG-p9M,1145
-judgeval/common/tracer/span_transformer.py,sha256=mUmfUYjEekUEOXAZMmH0WEF94ge05EBi5ftSc-T91zQ,7314
+judgeval/common/tracer/span_processor.py,sha256=1NQxNSVWcb8qCFLmslSVMnaWdkOZmiFJnxeeN0i6vnU,1150
+judgeval/common/tracer/span_transformer.py,sha256=cfzz6RpTCOG9Io9knNlwtAW34p3wyK-u8jSNMu24p1w,7382
 judgeval/common/tracer/trace_manager.py,sha256=ltiXcWC-68DRc8uSa28qHiWRSIBf6NpYOPkZYooR8tg,3086
 judgeval/data/__init__.py,sha256=1QagDcSQtfnJ632t9Dnq8d7XjAqhmY4mInOWt8qH9tM,455
+judgeval/data/evaluation_run.py,sha256=IirmYZ1_9N99eep7DDuoyshwjmpNK9bQCxCWXnnhhuI,4053
 judgeval/data/example.py,sha256=kRskIgsjwcvv2Y8jaPwV-PND7zlmMbFsvRVQ_b7SZY0,914
-judgeval/data/judgment_types.py,sha256=1DTpCnIdDM93Rozu9Dr812Q5K3lZfawMcWbPG2ofbxM,8407
+judgeval/data/judgment_types.py,sha256=3nGCUZ1YJhXajhFlAQvax0SOJ8eLuORtquwwjMreJFw,9826
 judgeval/data/result.py,sha256=OtSnBUrdQpjyAqxXRLTW3wC9v9lOm_GqzL14ccRQxrg,2124
 judgeval/data/scorer_data.py,sha256=5QBHtvOIWOq0Rn9_uPJzAMRYMlWxMB-rXnG_6kV4Z4Y,2955
 judgeval/data/tool.py,sha256=iWQSdy5uNbIeACu3gQy1DC2oGYxRVYNfkkczWdQMAiA,99
-judgeval/data/trace.py,sha256=LG-IZksynC1VgfUBuBfIIfR1DT9Bn-sY4vIj6Rc9K6Q,2791
-judgeval/data/trace_run.py,sha256=ZCAzktgOSUPD0p1XQj8qGcF-DdsdQFNZM2dtY0aKGbE,1657
+judgeval/data/trace.py,sha256=S781vVU1BvQ_kTS3s7UGYdmYVVxVGjDzWJHZpHedyf0,2834
+judgeval/data/trace_run.py,sha256=Oo1vDrJYX_itt4tt7PJf7fNKd0HE3fnBJxuIkRY8Wrg,1585
 judgeval/data/scripts/fix_default_factory.py,sha256=lvp2JwYZqz-XpD9LZNa3mANZVP-jJSZoNzolI6JWERM,591
 judgeval/data/scripts/openapi_transform.py,sha256=Sm04JClzyP1ga8KA3gkIdsae8Hlx-XU7-x0gHCQYOhg,3877
-judgeval/integrations/langgraph.py,sha256=kJXLsgBY7DgsUTZyVQ47deDgHm887brFHfyIbuyerGw,29986
+judgeval/integrations/langgraph.py,sha256=XsTNpKvXZmSf4TJBtRKSd5AB7S-Td9GTG5wZW9Npj6k,30062
 judgeval/judges/__init__.py,sha256=6X7VSwrwsdxGBNxCyapVRWGghhKOy3MVxFNMQ62kCXM,308
 judgeval/judges/base_judge.py,sha256=_dz0qWsKRxzXxpRY9l6mrxTRYPSF2FE4ZXkrzhZ4gbY,986
 judgeval/judges/litellm_judge.py,sha256=K9yCGOmozt7sYO0u8CHWyZNi8mXnSR3pPkP8yVsvuRc,2561
@@ -47,7 +48,7 @@ judgeval/judges/utils.py,sha256=_t6oYN9q63wyP7D4jI8X0bNmvVw7OfaE7uMTYDVS14E,2782
 judgeval/scorers/__init__.py,sha256=4H_cinTQ4EogZv59YEV-3U9EOTLppNwgAPTi1-jI9Fw,746
 judgeval/scorers/agent_scorer.py,sha256=TjwD_YglSywr3EowEojiCyg5qDgCRa5LRGc5nFdmIBc,703
 judgeval/scorers/api_scorer.py,sha256=xlhqkeMUBFxl8daSXOTWOYwZjBAz7o6b4sVD5f8cIHw,2523
-judgeval/scorers/base_scorer.py,sha256=eDfQk8N8TQfM1ayJDWr0NTdSQxcbk9-VZHd0Igb9EbI,2878
+judgeval/scorers/base_scorer.py,sha256=hKrLLh2DaxTgAfze8p_IapvsrogRCevYgfaNCDeOJzc,2869
 judgeval/scorers/example_scorer.py,sha256=2n45y3LMV1Q-ARyXLHqvVWETlnY1DqS7OLzPu9IBGz8,716
 judgeval/scorers/exceptions.py,sha256=ACDHK5-TWiF3NTk-wycaedpbrdobm-CvvC1JA_iP-Mk,179
 judgeval/scorers/score.py,sha256=SWyoqOOvyLpLy39tLyb_Q94sdh9r_IuDv6YNREw52lg,7546
@@ -61,7 +62,7 @@ judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py,sha256=NABO_iBd
 judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py,sha256=ps51bTgQsD9xGYsk1v9bx0WxQMqywSllCE9_xlJkLd8,531
 judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py,sha256=SnFLvU4FGsMeUVUp0SGHSy_6wgfwr_vHPGnZx5YJl_Q,691
 judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py,sha256=aQzu-TiGqG74JDQ927evv5yGmnZw2AOolyHvlIhiUbI,683
-judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py,sha256=nx73DeoVkSqJTP1hYxMsJobG9HVWgMDN5-xFOXt_8Ts,7348
+judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py,sha256=1FsUGjQu3oa2rF-oqt32j-yA2YM33_trGTJ0HgagFJ0,7793
 judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py,sha256=Mcp1CjMNyOax9UkvoRdSyUYdO2Os1-Nko43y89m2Luo,594
 judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py,sha256=Z2FLGBC7m_CLx-CMgXVuTvYvN0vY5yOcWA0ImBkeBfY,787
 judgeval/tracer/__init__.py,sha256=wkuXtOGDCrwgPPXlh_sSJmvGuWaAMHyNzk1TzB5f9aI,148
@@ -69,7 +70,8 @@ judgeval/utils/alerts.py,sha256=3w_AjQrgfmOZvfqCridW8WAnHVxHHXokX9jNzVFyGjA,3297
 judgeval/utils/async_utils.py,sha256=uNx1SopEc0quSjc8GBQqyba0SmCMAzv2NKIq6xYwttc,989
 judgeval/utils/file_utils.py,sha256=PWHRs8dUr8iDwpglSSk4Yjd7C6ZhDzUaO-jV3m7riHM,1987
 judgeval/utils/requests.py,sha256=K3gUKrwL6TvwYKVYO5OeLWdUHn9NiUPmnIXhZEiEaHU,1534
-judgeval-0.5.0.dist-info/METADATA,sha256=wwnunL-UcNKbB7D5t-UnOM_x3DVghU2BBPAVxa0tNfo,10348
-judgeval-0.5.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-judgeval-0.5.0.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
-judgeval-0.5.0.dist-info/RECORD,,
+judgeval-0.6.0.dist-info/METADATA,sha256=CulXMs0v5YrHjR3ntVX8xWKcZyxwEpo_nOYs_hkaeN8,10403
+judgeval-0.6.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+judgeval-0.6.0.dist-info/entry_points.txt,sha256=-eoeD-oDLn4A7MSgeBS9Akwanf3_0r0cgEleBcIOjg0,46
+judgeval-0.6.0.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
+judgeval-0.6.0.dist-info/RECORD,,
@@ -0,0 +1,2 @@
+[console_scripts]
+judgeval = judgeval.cli:app
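The new entry_points.txt means installing the wheel now exposes a judgeval console command backed by judgeval.cli:app; combined with the added click<8.2.0 and typer>=0.9.0 requirements in METADATA, this points to a Typer-based CLI. The contents of judgeval/cli.py are not part of this diff, so the following is only a generic sketch of how such an entry point is typically wired, not the actual module:

    # Hypothetical shape of a module exposed via "judgeval = judgeval.cli:app".
    import typer

    app = typer.Typer()

    @app.command()
    def version() -> None:
        """Print the installed package version (illustrative command only)."""
        typer.echo("judgeval 0.6.0")

    if __name__ == "__main__":
        app()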
@@ -1,80 +0,0 @@
-from typing import List, Optional, Union
-from pydantic import BaseModel, field_validator, Field
-
-from judgeval.data import Example
-from judgeval.scorers import BaseScorer, APIScorerConfig
-from judgeval.constants import ACCEPTABLE_MODELS, DEFAULT_GPT_MODEL
-
-
-class EvaluationRun(BaseModel):
-    """
-    Stores example and evaluation scorers together for running an eval task
-
-    Args:
-        project_name (str): The name of the project the evaluation results belong to
-        eval_name (str): A name for this evaluation run
-        examples (List[Example]): The examples to evaluate
-        scorers (List[Union[JudgmentScorer, BaseScorer]]): A list of scorers to use for evaluation
-        model (str): The model used as a judge when using LLM as a Judge
-        metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
-    """
-
-    organization_id: Optional[str] = None
-    project_name: Optional[str] = Field(default=None, validate_default=True)
-    eval_name: Optional[str] = Field(default=None, validate_default=True)
-    examples: List[Example]
-    scorers: List[Union[APIScorerConfig, BaseScorer]]
-    model: Optional[str] = DEFAULT_GPT_MODEL
-    trace_span_id: Optional[str] = None
-    trace_id: Optional[str] = None
-    # API Key will be "" until user calls client.run_eval(), then API Key will be set
-    override: Optional[bool] = False
-    append: Optional[bool] = False
-
-    def model_dump(self, **kwargs):
-        data = super().model_dump(**kwargs)
-
-        data["scorers"] = [
-            scorer.model_dump() for scorer in self.scorers
-        ]  # Pydantic has problems with properly calling model_dump() on the scorers, so we need to do it manually
-        data["examples"] = [example.model_dump() for example in self.examples]
-
-        return data
-
-    @field_validator("examples")
-    def validate_examples(cls, v):
-        if not v:
-            raise ValueError("Examples cannot be empty.")
-        for item in v:
-            if not isinstance(item, Example):
-                raise ValueError(f"Item of type {type(item)} is not a Example")
-        return v
-
-    @field_validator("scorers", mode="before")
-    def validate_scorers(cls, v):
-        if not v:
-            raise ValueError("Scorers cannot be empty.")
-        if not all(
-            isinstance(scorer, BaseScorer) or isinstance(scorer, APIScorerConfig)
-            for scorer in v
-        ):
-            raise ValueError(
-                "All scorers must be of type BaseScorer or APIScorerConfig."
-            )
-        return v
-
-    @field_validator("model")
-    def validate_model(cls, v, values):
-        if not v:
-            raise ValueError("Model cannot be empty.")
-
-        # Check if model is string or list of strings
-        if isinstance(v, str):
-            if v not in ACCEPTABLE_MODELS:
-                raise ValueError(
-                    f"Model name {v} not recognized. Please select a valid model name.)"
-                )
-        return v
-
-    class Config:
-        arbitrary_types_allowed = True
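The removal of judgeval/evaluation_run.py closes the loop on the import change at the top of this diff: the model now lives at judgeval/data/evaluation_run.py (see the RECORD hunk), and the new run_eval relies on it exposing an id plus separate custom_scorers and judgment_scorers views rather than a single scorers list with override and append flags. A hedged sketch of the attribute surface that this diff actually demonstrates (anything beyond these attributes is not established here):

    from judgeval.data.evaluation_run import EvaluationRun

    def describe_run(run: EvaluationRun) -> dict:
        # Only attributes referenced by the new run_eval / _poll_evaluation_until_complete code.
        return {
            "experiment_run_id": run.id,
            "project": run.project_name,
            "organization": run.organization_id,
            "num_examples": len(run.examples),
            "num_judgment_scorers": len(run.judgment_scorers),
            "num_custom_scorers": len(run.custom_scorers),
            "model": run.model,
        }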