judgeval 0.0.30__py3-none-any.whl → 0.0.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,14 +4,15 @@ import time
  import sys
  import itertools
  import threading
- from typing import List, Dict, Any
+ from typing import List, Dict, Any, Union
  from datetime import datetime
  from rich import print as rprint

  from judgeval.data import (
      ScorerData,
      ScoringResult,
-     Example
+     Example,
+     CustomExample
  )
  from judgeval.scorers import (
      JudgevalScorer,
@@ -22,6 +23,7 @@ from judgeval.scorers.score import a_execute_scoring
  from judgeval.constants import (
      ROOT_API,
      JUDGMENT_EVAL_API_URL,
+     JUDGMENT_SEQUENCE_EVAL_API_URL,
      JUDGMENT_EVAL_LOG_API_URL,
      MAX_CONCURRENT_EVALUATIONS,
      JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL
@@ -34,7 +36,7 @@ from judgeval.common.logger import (
      example_logging_context
  )
  from judgeval.evaluation_run import EvaluationRun
-
+ from judgeval.data.sequence_run import SequenceRun

  def send_to_rabbitmq(evaluation_run: EvaluationRun) -> None:
      """
@@ -91,6 +93,36 @@ def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
          raise JudgmentAPIError(error_message)
      return response_data

+ def execute_api_sequence_eval(sequence_run: SequenceRun) -> List[Dict]:
+     """
+     Executes an evaluation of a list of `Example`s using one or more `JudgmentScorer`s via the Judgment API.
+     """
+
+     try:
+         # submit API request to execute evals
+         payload = sequence_run.model_dump(warnings=False)
+         response = requests.post(
+             JUDGMENT_SEQUENCE_EVAL_API_URL,
+             headers={
+                 "Content-Type": "application/json",
+                 "Authorization": f"Bearer {sequence_run.judgment_api_key}",
+                 "X-Organization-Id": sequence_run.organization_id
+             },
+             json=payload,
+             verify=True
+         )
+         response_data = response.json()
+     except Exception as e:
+         error(f"Error: {e}")
+         details = response.json().get("detail", "No details provided")
+         raise JudgmentAPIError("An error occurred while executing the Judgment API request: " + details)
+     # Check if the response status code is not 2XX
+     # Add check for the duplicate eval run name
+     if not response.ok:
+         error_message = response_data.get('detail', 'An unknown error occurred.')
+         error(f"Error: {error_message=}")
+         raise JudgmentAPIError(error_message)
+     return response_data

  def merge_results(api_results: List[ScoringResult], local_results: List[ScoringResult]) -> List[ScoringResult]:
      """
@@ -197,8 +229,8 @@ def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_k
      )

      if response.status_code == 409:
-         error(f"Eval run name '{eval_name}' already exists for this project. Please choose a different name or set the `override` flag to true.")
-         raise ValueError(f"Eval run name '{eval_name}' already exists for this project. Please choose a different name or set the `override` flag to true.")
+         error(f"Eval run name '{eval_name}' already exists for this project. Please choose a different name, set the `override` flag to true, or set the `append` flag to true.")
+         raise ValueError(f"Eval run name '{eval_name}' already exists for this project. Please choose a different name, set the `override` flag to true, or set the `append` flag to true.")

      if not response.ok:
          response_data = response.json()
@@ -211,7 +243,7 @@ def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_k
          raise JudgmentAPIError(f"Failed to check if eval run name exists: {str(e)}")


- def log_evaluation_results(merged_results: List[ScoringResult], evaluation_run: EvaluationRun) -> str:
+ def log_evaluation_results(merged_results: List[ScoringResult], run: Union[EvaluationRun, SequenceRun]) -> str:
      """
      Logs evaluation results to the Judgment API database.

@@ -228,13 +260,12 @@ def log_evaluation_results(merged_results: List[ScoringResult], evaluation_run:
              JUDGMENT_EVAL_LOG_API_URL,
              headers={
                  "Content-Type": "application/json",
-                 "Authorization": f"Bearer {evaluation_run.judgment_api_key}",
-                 "X-Organization-Id": evaluation_run.organization_id
+                 "Authorization": f"Bearer {run.judgment_api_key}",
+                 "X-Organization-Id": run.organization_id
              },
              json={
-                 "results": [result.to_dict() for result in merged_results],
-                 "project_name": evaluation_run.project_name,
-                 "eval_name": evaluation_run.eval_name,
+                 "results": [result.model_dump(warnings=False) for result in merged_results],
+                 "run": run.model_dump(warnings=False)
              },
              verify=True
          )
@@ -303,6 +334,42 @@ def check_examples(examples: List[Example], scorers: List[APIJudgmentScorer]) ->
                  # Example ID (usually random UUID) does not provide any helpful information for the user but printing the entire example is overdoing it
                  print(f"WARNING: Example {example.example_id} is missing the following parameters: {missing_params} for scorer {scorer.score_type.value}")

+ def run_sequence_eval(sequence_run: SequenceRun, override: bool = False, ignore_errors: bool = True, async_execution: bool = False) -> List[ScoringResult]:
+     # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
+     if not override and sequence_run.log_results and not sequence_run.append:
+         check_eval_run_name_exists(
+             sequence_run.eval_name,
+             sequence_run.project_name,
+             sequence_run.judgment_api_key,
+             sequence_run.organization_id
+         )
+
+     # Execute evaluation using Judgment API
+     info("Starting API evaluation")
+     try:  # execute an EvaluationRun with just JudgmentScorers
+         debug("Sending request to Judgment API")
+         response_data: List[Dict] = run_with_spinner("Running Sequence Evaluation: ", execute_api_sequence_eval, sequence_run)
+
+         info(f"Received {len(response_data['results'])} results from API")
+     except JudgmentAPIError as e:
+         error(f"An error occurred while executing the Judgment API request: {str(e)}")
+         raise JudgmentAPIError(f"An error occurred while executing the Judgment API request: {str(e)}")
+     except ValueError as e:
+         raise ValueError(f"Please check your SequenceRun object, one or more fields are invalid: {str(e)}")
+
+     # Convert the response data to `ScoringResult` objects
+     debug("Processing API results")
+     api_results = []
+     for result in response_data["results"]:
+         api_results.append(ScoringResult(**result))
+
+     # TODO: allow for custom scorer on sequences
+     if sequence_run.log_results:
+         pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, api_results, sequence_run)
+         rprint(pretty_str)
+
+
  def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_errors: bool = True, async_execution: bool = False) -> List[ScoringResult]:
      """
      Executes an evaluation of `Example`s using one or more `Scorer`s
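
The new `run_sequence_eval` path mirrors `run_eval` but sends a `SequenceRun` to the sequence endpoint. Below is a minimal sketch of driving it, assuming `SequenceRun` accepts as constructor keywords the same fields the hunk reads as attributes (`eval_name`, `project_name`, `judgment_api_key`, `organization_id`, `log_results`, `append`); the `sequences`/`scorers` fields and the `judgeval.run_evaluation` module path are inferred rather than shown in this diff.

```python
# Illustrative sketch only; see the hedges in the paragraph above.
import os
from judgeval.data.sequence_run import SequenceRun      # module path confirmed by the RECORD below
from judgeval.run_evaluation import run_sequence_eval   # module path inferred, not shown in this diff

sequence_run = SequenceRun(
    eval_name="checkout-flow-v1",
    project_name="my-project",
    judgment_api_key=os.environ["JUDGMENT_API_KEY"],
    organization_id=os.environ["JUDGMENT_ORG_ID"],
    log_results=True,
    append=False,   # new flag: append to an existing run instead of raising on a name clash
    # sequences=[...], scorers=[...]   # assumed fields; see judgeval/data/sequence.py for the schema
)

run_sequence_eval(sequence_run)  # logs results to the Judgment platform when log_results=True
```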
@@ -329,7 +396,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
      """

      # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
-     if not override and evaluation_run.log_results:
+     if not override and evaluation_run.log_results and not evaluation_run.append:
          check_eval_run_name_exists(
              evaluation_run.eval_name,
              evaluation_run.project_name,
@@ -373,12 +440,20 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
              local_scorers.append(scorer)
              debug(f"Added local scorer: {type(scorer).__name__}")

+     custom_example_check = [scorer.custom_example for scorer in local_scorers]
+     if any(custom_example_check) and not all(custom_example_check):
+         error("All scorers must be custom scorers if using custom examples")
+         raise ValueError("All scorers must be custom scorers if using custom examples")
+
      debug(f"Found {len(judgment_scorers)} judgment scorers and {len(local_scorers)} local scorers")

      api_results: List[ScoringResult] = []
      local_results: List[ScoringResult] = []

      if async_execution:
+         if len(local_scorers) > 0:
+             error("Local scorers are not supported in async execution")
+
          check_examples(evaluation_run.examples, evaluation_run.scorers)
          info("Starting async evaluation")
          payload = evaluation_run.model_dump(warnings=False)
@@ -396,7 +471,6 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
      else:
          if judgment_scorers:
              # Execute evaluation using Judgment API
-             check_examples(evaluation_run.examples, evaluation_run.scorers)
              info("Starting API evaluation")
              debug(f"Creating API evaluation run with {len(judgment_scorers)} scorers")
              try:  # execute an EvaluationRun with just JudgmentScorers
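
Both `run_eval` and `run_sequence_eval` now skip the duplicate-name check when the run's new `append` flag is set, matching the updated 409 error message above. A rough sketch of reusing an existing eval run name follows, assuming `EvaluationRun` accepts these fields as keyword arguments (the diff only shows them being read as attributes) and that `AnswerRelevancyScorer` is exported from `judgeval.scorers` as in prior releases.

```python
# Sketch under the stated assumptions; not taken verbatim from the package.
import os
from judgeval.data import Example
from judgeval.evaluation_run import EvaluationRun
from judgeval.run_evaluation import run_eval          # module path inferred from the RECORD below
from judgeval.scorers import AnswerRelevancyScorer

run = EvaluationRun(
    eval_name="nightly-regression",                    # a run name that already exists in the project
    project_name="my-project",
    examples=[Example(input="What is 2 + 2?", actual_output="4")],
    scorers=[AnswerRelevancyScorer(threshold=0.7)],
    log_results=True,
    append=True,                                       # add results to the existing run instead of erroring
    judgment_api_key=os.environ["JUDGMENT_API_KEY"],
    organization_id=os.environ["JUDGMENT_ORG_ID"],
)

results = run_eval(run)
```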
@@ -17,6 +17,7 @@ from judgeval.scorers.judgeval_scorers import (
      ComparisonScorer,
      InstructionAdherenceScorer,
      GroundednessScorer,
+     DerailmentScorer,
  )

  __all__ = [
@@ -39,4 +40,5 @@ __all__ = [
      "ComparisonScorer",
      "InstructionAdherenceScorer",
      "GroundednessScorer",
+     "DerailmentScorer",
  ]
@@ -34,6 +34,7 @@ class JudgevalScorer:
      async_mode: bool = True  # Whether to run the scorer in async mode
      verbose_mode: bool = True  # Whether to run the scorer in verbose mode
      include_reason: bool = False  # Whether to include the reason in the output
+     custom_example: bool = False  # Whether the scorer corresponds to CustomExamples
      error: Optional[str] = None  # The error message if the scorer failed
      evaluation_cost: Optional[float] = None  # The cost of running the scorer
      verbose_logs: Optional[str] = None  # The verbose logs of the scorer
@@ -52,6 +53,7 @@ class JudgevalScorer:
          async_mode: bool = True,
          verbose_mode: bool = True,
          include_reason: bool = False,
+         custom_example: bool = False,
          error: Optional[str] = None,
          evaluation_cost: Optional[float] = None,
          verbose_logs: Optional[str] = None,
@@ -78,6 +80,7 @@ class JudgevalScorer:
          self.async_mode = async_mode
          self.verbose_mode = verbose_mode
          self.include_reason = include_reason
+         self.custom_example = custom_example
          self.error = error
          self.evaluation_cost = evaluation_cost
          self.verbose_logs = verbose_logs
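
`JudgevalScorer` gains a `custom_example` flag, and the `run_eval` change above enforces that local scorers are all-or-nothing about it when `CustomExample`s are in play. Here is a minimal sketch of a local scorer opting in; the constructor keywords other than `custom_example`, and the `a_score_example`/`_success_check` override hooks, are assumed from JudgevalScorer's usual interface rather than shown in this diff, and the `actual_output` field on `CustomExample` is a placeholder (check judgeval/data/custom_example.py for the real schema).

```python
# Minimal sketch under the stated assumptions.
from judgeval.data import CustomExample
from judgeval.scorers import JudgevalScorer

class KeywordScorer(JudgevalScorer):
    """Toy local scorer that only makes sense for CustomExamples."""

    def __init__(self, threshold: float = 0.5):
        super().__init__(
            score_type="Keyword Match",
            threshold=threshold,
            custom_example=True,   # new in 0.0.32: marks this scorer as CustomExample-aware
        )

    async def a_score_example(self, example: CustomExample, *args, **kwargs) -> float:
        # Toy logic: reward outputs that mention a refund.
        self.score = 1.0 if "refund" in (example.actual_output or "") else 0.0
        self.success = self.score >= self.threshold
        return self.score

    def _success_check(self) -> bool:
        return bool(self.success)
```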
@@ -15,6 +15,7 @@ from judgeval.scorers.judgeval_scorers.api_scorers import (
      ComparisonScorer as APIComparisonScorer,
      InstructionAdherenceScorer as APIInstructionAdherenceScorer,
      GroundednessScorer as APIGroundednessScorer,
+     DerailmentScorer as APIDerailmentScorer,
  )

  from judgeval.scorers.judgeval_scorers.local_implementations import (
@@ -153,6 +154,11 @@ GroundednessScorer = ScorerWrapper(
      api_implementation=APIGroundednessScorer,
  )

+ DerailmentScorer = ScorerWrapper(
+     api_implementation=APIDerailmentScorer,
+     local_implementation=LocalInstructionAdherenceScorer  # TODO: add local implementation
+ )
+
  __all__ = [
      "ExecutionOrderScorer",
      "JSONCorrectnessScorer",
@@ -166,4 +172,5 @@ __all__ = [
      "Text2SQLScorer",
      "ComparisonScorer",
      "GroundednessScorer",
+     "DerailmentScorer",
  ]
@@ -11,7 +11,7 @@ from judgeval.scorers.judgeval_scorers.api_scorers.answer_correctness import Ans
  from judgeval.scorers.judgeval_scorers.api_scorers.comparison import ComparisonScorer
  from judgeval.scorers.judgeval_scorers.api_scorers.instruction_adherence import InstructionAdherenceScorer
  from judgeval.scorers.judgeval_scorers.api_scorers.groundedness import GroundednessScorer
-
+ from judgeval.scorers.judgeval_scorers.api_scorers.derailment_scorer import DerailmentScorer
  __all__ = [
      "ExecutionOrderScorer",
      "JSONCorrectnessScorer",
@@ -26,4 +26,5 @@ __all__ = [
      "ComparisonScorer",
      "InstructionAdherenceScorer",
      "GroundednessScorer",
+     "DerailmentScorer",
  ]
@@ -0,0 +1,21 @@
+ """
+ `judgeval` answer relevancy scorer
+
+ TODO add link to docs page for this scorer
+
+ """
+
+ # Internal imports
+ from judgeval.scorers.api_scorer import APIJudgmentScorer
+ from judgeval.constants import APIScorer
+
+ class DerailmentScorer(APIJudgmentScorer):
+     def __init__(self, threshold: float):
+         super().__init__(
+             threshold=threshold,
+             score_type=APIScorer.DERAILMENT,
+         )
+
+     @property
+     def __name__(self):
+         return "Derailment"
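
The new `DerailmentScorer` is a hosted (API-side) scorer like the others in this package. A hedged sketch of exercising it through the usual `JudgmentClient` flow follows; the client call and its keyword arguments reflect judgeval's existing `run_evaluation` pattern and are not part of this diff, so treat them as assumptions.

```python
# Hedged usage sketch; the client-side API shown here is assumed, not taken from this diff.
from judgeval import JudgmentClient
from judgeval.data import Example
from judgeval.scorers import DerailmentScorer

client = JudgmentClient()   # typically reads JUDGMENT_API_KEY / org settings from the environment

results = client.run_evaluation(
    examples=[
        Example(
            input="Help me reset my password.",
            actual_output="Sure! Also, have you considered our premium crypto newsletter?",
        )
    ],
    scorers=[DerailmentScorer(threshold=0.5)],
    model="gpt-4o",
    project_name="my-project",
    eval_run_name="derailment-smoke-test",
)
```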
judgeval/scorers/score.py CHANGED
@@ -11,6 +11,7 @@ from rich.progress import Progress, SpinnerColumn, TextColumn

  from judgeval.data import (
      Example,
+     CustomExample,
      ScoringResult,
      generate_scoring_result,
      create_scorer_data,
@@ -240,7 +241,7 @@ async def score_with_indicator(


  async def a_execute_scoring(
-     examples: List[Example],
+     examples: Union[List[Example], List[CustomExample]],
      scorers: List[JudgevalScorer],
      model: Optional[Union[str, List[str], JudgevalJudge]] = None,
      ignore_errors: bool = True,
@@ -256,7 +257,7 @@
      Each `Example` will be evaluated by all of the `JudgevalScorer`s in the `scorers` list.

      Args:
-         examples (List[Example]): A list of `Example` objects to be evaluated.
+         examples (Union[List[Example], List[CustomExample]]): A list of `Example` objects to be evaluated.
          scorers (List[JudgevalScorer]): A list of `JudgevalScorer` objects to evaluate the examples.
          model (Union[str, List[str], JudgevalJudge]): The model to use for evaluation.
          ignore_errors (bool): Whether to ignore errors during evaluation.
@@ -313,7 +314,7 @@
                      debug(f"Scorer threshold: {scorer.threshold}")
                      if hasattr(scorer, 'model'):
                          debug(f"Scorer model: {type(scorer.model).__name__}")
-                 if isinstance(ex, Example):
+                 if isinstance(ex, Example) or isinstance(ex, CustomExample):
                      if len(scorers) == 0:
                          pbar.update(1)
                          continue
@@ -339,7 +340,7 @@
              await asyncio.gather(*tasks)
          else:
              for i, ex in enumerate(examples):
-                 if isinstance(ex, Example):
+                 if isinstance(ex, Example) or isinstance(ex, CustomExample):
                      if len(scorers) == 0:
                          continue

@@ -366,7 +367,7 @@

  async def a_eval_examples_helper(
      scorers: List[JudgevalScorer],
-     example: Example,
+     example: Union[Example, CustomExample],
      scoring_results: List[ScoringResult],
      score_index: int,
      ignore_errors: bool,
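
With these changes, `a_execute_scoring` accepts `CustomExample` lists alongside `Example` lists. A small sketch of calling it directly with a CustomExample-aware local scorer (such as the hypothetical `KeywordScorer` sketched earlier in these notes); `CustomExample`'s constructor fields are assumptions here.

```python
# Sketch only; field names on CustomExample are assumed, and KeywordScorer is the
# hypothetical scorer sketched earlier, not a class shipped with judgeval.
import asyncio
from judgeval.data import CustomExample
from judgeval.scorers.score import a_execute_scoring

examples = [
    CustomExample(
        input="Where is my order?",
        actual_output="It ships tomorrow; a refund is available on request.",
    )
]

results = asyncio.run(
    a_execute_scoring(examples, scorers=[KeywordScorer(threshold=0.5)], model="gpt-4o")
)
```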
@@ -0,0 +1,22 @@
+ import importlib.metadata
+ import requests
+ import threading
+
+ def check_latest_version(package_name: str = "judgeval"):
+     def _check():
+         try:
+             current_version = importlib.metadata.version(package_name)
+             response = requests.get(f"https://pypi.org/pypi/{package_name}/json", timeout=2)
+             latest_version = response.json()["info"]["version"]
+
+             if current_version != latest_version:
+                 print(
+                     f"\033[93mUPDATE AVAILABLE:\033[0m You are using '{package_name}=={current_version}', "
+                     f"but the latest version is '{latest_version}'. While this version is still supported, "
+                     f"we recommend upgrading to avoid potential issues or missing features: "
+                     f"`pip install --upgrade {package_name}`"
+                 )
+         except Exception:
+             pass
+
+     threading.Thread(target=_check, daemon=True).start()
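
The new judgeval/version_check.py spawns a daemon thread that compares the installed version against PyPI and prints an upgrade hint, swallowing any network or metadata errors. The actual call site is not shown in this diff (the grown judgeval/__init__.py in the RECORD below suggests it runs on import), but wiring it up would look roughly like this:

```python
# Hypothetical call site; the real hook-up is not part of this diff.
from judgeval.version_check import check_latest_version

check_latest_version()   # non-blocking; the check runs on a daemon thread and fails silently offline
```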
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: judgeval
- Version: 0.0.30
+ Version: 0.0.32
  Summary: Judgeval Package
  Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
  Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -1,46 +1,50 @@
- judgeval/__init__.py,sha256=dtXxsCmI4eEsZdGSUMy8P_pA0bc2-OSGAgb2C__yJoA,252
+ judgeval/__init__.py,sha256=x9HWt4waJwJMAqTuJSg2MezF9Zg-macEjeU-ajbly-8,330
  judgeval/clients.py,sha256=6VQmEqmfCngUdS2MuPBIpHvtDFqOENm8-_BmMvjLyRQ,944
- judgeval/constants.py,sha256=ksAXhAXovzJKH0uHOdQtREs168uCJRG79PooHNmEbYQ,5313
- judgeval/evaluation_run.py,sha256=6Kft3wZDWkdBDZoMwOhWf7zSAOF4naI7Pcg_YlZaZY4,6394
- judgeval/judgment_client.py,sha256=uf0V1-eu3qnFTwrQ_Ckcv8IiWRVv7dbvou4P4KjU6hM,26794
+ judgeval/constants.py,sha256=_XmVAkebMyGrDvvanAVlMgVd4p6MLHdEVsTQFI0kz1k,5411
+ judgeval/evaluation_run.py,sha256=WGzx-Ug2qhSmunFo8NrmSstBRsOUc5KpKq0Lc51rqsM,6739
+ judgeval/judgment_client.py,sha256=k0q2s5A0RkhF9ElD9o-KWN10H36t3Of2PrvNF-silf8,26141
  judgeval/rules.py,sha256=B0ZL0pn72D4Jnlr0zMQ6CPHi7D8AQQRariXCVsiCMiI,20542
- judgeval/run_evaluation.py,sha256=N2ppmEE5WoSReChKjr_n0NcdAUlUR6Nua7M1C_3zHQ8,24949
+ judgeval/run_evaluation.py,sha256=hnEY8QckEviXYNJutf-6tLFq2DWCzqWV1EVyPvrVXyA,28512
+ judgeval/version_check.py,sha256=bvJEidB7rAeXozoUbN9Yb97QOR_s2hgvpvj74jJ5HlY,943
  judgeval/common/__init__.py,sha256=7d24BRxtncpMj3AAJCj8RS7TqgjXmW777HVZH6-3sBs,289
  judgeval/common/exceptions.py,sha256=U-TxHLn7oVMezsMuoYouNDb2XuS8RCggfntYf5_6u4E,565
  judgeval/common/logger.py,sha256=KO75wWXCxhUHUMvLaTU31ZzOk6tkZBa7heQ7y0f-zFE,6062
- judgeval/common/tracer.py,sha256=b-eQyC_MPwMAVQJS6wtVW0_7hzk8tC9EV6NZZoNjWos,58188
+ judgeval/common/tracer.py,sha256=owRRfIZXPUOVCCn0macygnf18mcp8am1eULGnZXD0Kk,68876
  judgeval/common/utils.py,sha256=LUQV5JfDr6wj7xHAJoNq-gofNZ6mjXbeKrGKzBME1KM,33533
- judgeval/data/__init__.py,sha256=dG5ytBOeOWCTd5o0KP7IblqtW4G1EBaGreLWepM3jas,345
- judgeval/data/custom_api_example.py,sha256=uW_ZBzkDLWumtudmfRHAJQkVYpm2qWgcDf7vBNLpS-o,3444
- judgeval/data/example.py,sha256=BhGBhamFWgH6wtvrRYM8dGtDfXh-cDxDhtNL5Gbdz_M,5892
- judgeval/data/result.py,sha256=BT4f2FF5EFuiRjOmS4vuIXsrEwSlG16Vw3QaWi6PZzc,3122
+ judgeval/data/__init__.py,sha256=xuKx_KCVHGp6CXvQuVmKl3v7pJp-qDaz0NccKxwjtO0,481
+ judgeval/data/custom_example.py,sha256=QRBqiRiZS8UgVeTRHY0r1Jzm6yAYsyg6zmHxQGxdiQs,739
+ judgeval/data/example.py,sha256=cJrmPGLel_P2sy1UaRvuVSAi35EnA9XMR11Lhp4aDLo,5930
+ judgeval/data/result.py,sha256=Gb9tiSDsk1amXgh0cFG6JmlW_BMKxS2kuTwNA0rrHjA,3184
  judgeval/data/scorer_data.py,sha256=JVlaTx1EP2jw2gh3Vgx1CSEsvIFABAN26IquKyxwiJQ,3273
+ judgeval/data/sequence.py,sha256=Fkk2HJGnPboH-Fvwgxub_ryG0eUXa3cbsj7ZD0qkeBo,2204
+ judgeval/data/sequence_run.py,sha256=RmYjfWKMWg-pcF5PLeiWfrhuDkjDZi5VEmAIEXN3Ib0,2104
  judgeval/data/datasets/__init__.py,sha256=IdNKhQv9yYZ_op0rdBacrFaFVmiiYQ3JTzXzxOTsEVQ,176
- judgeval/data/datasets/dataset.py,sha256=AFYjksV_wXx5CqFYJsl3aN8yZ6hC50O1myRuOJ8s8_E,12867
- judgeval/data/datasets/eval_dataset_client.py,sha256=P9fEmcNrjPPaiYbbLiEiBziZrIexA39HN9qzClt6uPE,12691
- judgeval/integrations/langgraph.py,sha256=fGDZOTlVbxTO4ErC-m9OSg3h-RkOIIWXCfhjgkKRh4E,11187
+ judgeval/data/datasets/dataset.py,sha256=dhLo30hvpmmOK2R6O5wDs_neawUJ4lS8bb4S42SufNQ,13034
+ judgeval/data/datasets/eval_dataset_client.py,sha256=xjj66BO9Es9IxXqzQe1RT_e0kpeKlt7OrhRoSuj4KHM,15085
+ judgeval/integrations/langgraph.py,sha256=J-cQfFP52TjJewdSTe-fcsUC4HDvjNbXoxmbmF0SgiE,11743
  judgeval/judges/__init__.py,sha256=6X7VSwrwsdxGBNxCyapVRWGghhKOy3MVxFNMQ62kCXM,308
  judgeval/judges/base_judge.py,sha256=ch_S7uBB7lyv44Lf1d7mIGFpveOO58zOkkpImKgd9_4,994
  judgeval/judges/litellm_judge.py,sha256=EIL58Teptv8DzZUO3yP2RDQCDq-aoBB6HPZzPdK6KTg,2424
  judgeval/judges/mixture_of_judges.py,sha256=IJoi4Twk8ze1CJWVEp69k6TSqTCTGrmVYQ0qdffer60,15549
  judgeval/judges/together_judge.py,sha256=l00hhPerAZXg3oYBd8cyMtWsOTNt_0FIqoxhKJKQe3k,2302
  judgeval/judges/utils.py,sha256=9lvUxziGV86ISvVFxYBWc09TWFyAQgUTyPf_a9mD5Rs,2686
- judgeval/scorers/__init__.py,sha256=gkeKJvjXhswCnkEyjijrVvGVM3Om86egrZ-PUOGvNvI,1158
+ judgeval/scorers/__init__.py,sha256=Z_88Sr45gLFAIbMHzG1BF24TUQGCDiuP9QpmVFvSYJM,1204
  judgeval/scorers/api_scorer.py,sha256=NQ_CrrUPhSUk1k2Q8rKpCG_TU2FT32sFEqvb-Yi54B0,2688
  judgeval/scorers/base_scorer.py,sha256=xdUlY3CnLdCQ1Z5iUeY22Bim5v-OQruZmaVF_4Y1mC0,2183
  judgeval/scorers/exceptions.py,sha256=eGW5CuJgZ5YJBFrE4FHDSF651PO1dKAZ379mJ8gOsfo,178
- judgeval/scorers/judgeval_scorer.py,sha256=jq_rzfTG0XBTuLCaa6TlaK4YcT-LlgsO1LEm6hpOYdg,6601
+ judgeval/scorers/judgeval_scorer.py,sha256=79-JJurqHP-qTaWNWInx4SjvQYwXc9lvfPPNgwsh2yA,6773
  judgeval/scorers/prompt_scorer.py,sha256=PaAs2qRolw1P3_I061Xvk9qzvF4O-JR8g_39RqXnHcM,17728
- judgeval/scorers/score.py,sha256=ObFAlMbNRcGrfBpH4WW_6OA3CjrneC539xSWhGH60GQ,18578
+ judgeval/scorers/score.py,sha256=r9QiT4-LIvivcJ6XxByrbswKSO8eQTtAD1UlXT_lcmo,18741
  judgeval/scorers/utils.py,sha256=iHQVTlIANbmCTXz9kTeSdOytgUZ_T74Re61ajqsk_WQ,6827
- judgeval/scorers/judgeval_scorers/__init__.py,sha256=xFRb62sp4JmBUSeuAB_pC_7kEGp-lGdqCRIu9--Bbdg,5992
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py,sha256=mZ6b_5Dl04k3PaG24ICBajB_j43ody1II1OJhO1DkXo,1648
+ judgeval/scorers/judgeval_scorers/__init__.py,sha256=kSmQWKeBvLeZMfLYNQSc2qbJYo1MFIQnf3P-D4ltuSM,6232
+ judgeval/scorers/judgeval_scorers/api_scorers/__init__.py,sha256=_sDUBxSG536KGqXNi6dFpaYKghjEAadxBxaaxV9HuuE,1764
  judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py,sha256=Fnd9CVIOZ73sWEWymsU5eBrrZqPFjMZ0BKpeW-PDyTg,711
  judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py,sha256=oETeN9K0HSIRdL2SDqn82Vskpwh5SlKnZvs5VDm2OBU,658
  judgeval/scorers/judgeval_scorers/api_scorers/comparison.py,sha256=kuzf9OWvpY38yYSwlBgneLkUZwJNM4FQqvbS66keA90,1249
  judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py,sha256=tpSuzFAaW8X9xqA0aLLKwh7qmBK0Pc_bJZMIe_q412U,770
  judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py,sha256=pFVhk4pLtQ-FnNlbI-dFF-SIh69Jza7erHqiPkFWoBo,758
  judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py,sha256=RQ6DZwEhChfecd89Ey-T7ke--7qTaXZlRsNxwH8gaME,823
+ judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py,sha256=V9WPuwNMm097V7IknKs8UkmAk0yjnBXTcJha_BHXxTA,475
  judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py,sha256=Pb3CiNF2Ca826B92wJCVAi_68lJjLhqqCKwQKaflSUg,1294
  judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py,sha256=-BwOapqjryYNKNydtdkUiKIij76dY0O1jBmdc6dKazQ,692
  judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py,sha256=ntEEeTANEOsGlcbiTAF_3r6BeSJEaVDns8po8T0L6Vg,692
@@ -87,7 +91,7 @@ judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py
  judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py,sha256=Qk7lwHgRPYeGoxTOyclAh1VfGItfvHJ6l1t7Nk3SWFM,20927
  judgeval/tracer/__init__.py,sha256=wy3DYpH8U_z0GO_K_gOSkK0tTTD-u5eLDo0T5xIBoAc,147
  judgeval/utils/alerts.py,sha256=O19Xj7DA0YVjl8PWiuH4zfdZeu3yiLVvHfY8ah2wG0g,2759
- judgeval-0.0.30.dist-info/METADATA,sha256=A1AbKJ1CqCjM4ankQWZ8AVKZdxLZBUUWehKHdgT43l0,5418
- judgeval-0.0.30.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- judgeval-0.0.30.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
- judgeval-0.0.30.dist-info/RECORD,,
+ judgeval-0.0.32.dist-info/METADATA,sha256=RJzqlHJwfYiOXEcyEEO5WQBM0DC1zQDuoN-Plix6U38,5418
+ judgeval-0.0.32.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ judgeval-0.0.32.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
+ judgeval-0.0.32.dist-info/RECORD,,
@@ -1,91 +0,0 @@
- from typing import List, Optional, Dict, Any, Union
- from pydantic import BaseModel, ConfigDict, model_validator
-
- from judgeval.data.example import Example
- from judgeval.data.custom_example import CustomExample
- from judgeval.data.scorer_data import ScorerData
- from judgeval.common.logger import debug, error
-
- class ProcessExample(BaseModel):
-     """
-     ProcessExample is an `Example` object that contains intermediate information
-     about an undergoing evaluation on the original `Example`. It is used purely for
-     internal operations and keeping track of the evaluation process.
-     """
-     name: str
-     # input: Optional[str] = None
-     # actual_output: Optional[Union[str, List[str]]] = None
-     # expected_output: Optional[Union[str, List[str]]] = None
-     # context: Optional[list] = None
-     # retrieval_context: Optional[list] = None
-     # tools_called: Optional[list] = None
-     # expected_tools: Optional[list] = None
-
-     # make these optional, not all test cases in a conversation will be evaluated
-     success: Optional[bool] = None
-     scorers_data: Optional[List[ScorerData]] = None
-     run_duration: Optional[float] = None
-     evaluation_cost: Optional[float] = None
-
-     order: Optional[int] = None
-     # These should map 1 to 1 from golden
-     additional_metadata: Optional[Dict] = None
-     comments: Optional[str] = None
-     trace_id: Optional[str] = None
-     model_config = ConfigDict(arbitrary_types_allowed=True)
-
-     def update_scorer_data(self, scorer_data: ScorerData):
-         """
-         Updates scorer data field of test case after the scorers have been
-         evaluated on this test case.
-         """
-         debug(f"Updating scorer data for example '{self.name}' with scorer: {scorer_data}")
-         # self.scorers_data is a list of ScorerData objects that contain the
-         # evaluation results of each scorer on this test case
-         if self.scorers_data is None:
-             self.scorers_data = [scorer_data]
-         else:
-             self.scorers_data.append(scorer_data)
-
-         if self.success is None:
-             # self.success will be None when it is a message
-             # in that case we will be setting success for the first time
-             self.success = scorer_data.success
-         else:
-             if scorer_data.success is False:
-                 debug(f"Example '{self.name}' marked as failed due to scorer: {scorer_data}")
-                 self.success = False
-
-     def update_run_duration(self, run_duration: float):
-         self.run_duration = run_duration
-
-
- def create_process_custom_example(
-     example: CustomExample,
- ) -> ProcessExample:
-     """
-     When an LLM Test Case is executed, we track its progress using an ProcessExample.
-
-     This will track things like the success of the test case, as well as the metadata (such as verdicts and claims in Faithfulness).
-     """
-     success = True
-     if example.name is not None:
-         name = example.name
-     else:
-         name = "Test Case Placeholder"
-         debug(f"No name provided for example, using default name: {name}")
-     order = None
-     scorers_data = []
-
-     debug(f"Creating ProcessExample for: {name}")
-     process_ex = ProcessExample(
-         name=name,
-         success=success,
-         scorers_data=scorers_data,
-         run_duration=None,
-         evaluation_cost=None,
-         order=order,
-         additional_metadata=example.additional_metadata,
-         trace_id=example.trace_id
-     )
-     return process_ex