judgeval 0.0.14__py3-none-any.whl → 0.0.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -20,6 +20,7 @@ from judgeval.constants import (
     ROOT_API,
     JUDGMENT_EVAL_API_URL,
     JUDGMENT_EVAL_LOG_API_URL,
+    MAX_CONCURRENT_EVALUATIONS
 )
 from judgeval.common.exceptions import JudgmentAPIError
 from judgeval.evaluation_run import EvaluationRun
@@ -30,6 +31,7 @@ from judgeval.common.logger import (
     error,
     example_logging_context
 )
+from judgeval.rules import RulesEngine, Rule, AlertResult, AlertStatus
 
 
 def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
@@ -228,6 +230,7 @@ def log_evaluation_results(merged_results: List[ScoringResult], evaluation_run:
         raise ValueError(f"Failed to save evaluation results to DB: {str(e)}")
 
 
+
 def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[ScoringResult]:
     """
     Executes an evaluation of `Example`s using one or more `Scorer`s
@@ -245,7 +248,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
         metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
         judgment_api_key (Optional[str]): The API key for running evaluations on the Judgment API
         log_results (bool): Whether to log the results to the Judgment API
-
+        rules (Optional[List[Rule]]): Rules to evaluate against scoring results
 
     Returns:
         List[ScoringResult]: The results of the evaluation. Each result is a dictionary containing the fields of a `ScoringResult` object.
@@ -316,7 +319,8 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
             metadata=evaluation_run.metadata,
             judgment_api_key=evaluation_run.judgment_api_key,
             organization_id=evaluation_run.organization_id,
-            log_results=evaluation_run.log_results
+            log_results=evaluation_run.log_results,
+            rules=evaluation_run.rules
         )
         debug("Sending request to Judgment API")
         response_data: List[Dict] = execute_api_eval(api_evaluation_run) # Dicts are `ScoringResult` objs
@@ -346,7 +350,6 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
            ]
 
            api_results.append(ScoringResult(**filtered_result))
-
    # Run local evals
    if local_scorers: # List[JudgevalScorer]
        info("Starting local evaluation")
@@ -364,12 +367,11 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
                show_indicator=True,
                _use_bar_indicator=True,
                throttle_value=0,
-               max_concurrent=100,
+               max_concurrent=MAX_CONCURRENT_EVALUATIONS,
            )
        )
        local_results = results
        info(f"Local evaluation complete with {len(local_results)} results")
-
    # Aggregate the ScorerData from the API and local evaluations
    debug("Merging API and local results")
    merged_results: List[ScoringResult] = merge_results(api_results, local_results)
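
The hunk above replaces the hardcoded max_concurrent=100 with the MAX_CONCURRENT_EVALUATIONS constant now imported from judgeval.constants. As a minimal sketch of what such a cap usually amounts to, the snippet below bounds in-flight scorer calls with an asyncio.Semaphore; the constant's actual value and the internals of judgeval's scoring helper are not visible in this diff, so everything here apart from the constant's name is illustrative.

import asyncio

# Stand-in value; the real MAX_CONCURRENT_EVALUATIONS lives in judgeval.constants
# and its value is not shown in this diff.
MAX_CONCURRENT_EVALUATIONS = 50

async def score_example(example: str) -> str:
    # Placeholder for a single (potentially slow) scorer call.
    await asyncio.sleep(0.1)
    return f"scored:{example}"

async def score_all(examples: list[str]) -> list[str]:
    # Cap concurrent scorer calls, mirroring the max_concurrent= argument above.
    semaphore = asyncio.Semaphore(MAX_CONCURRENT_EVALUATIONS)

    async def bounded(example: str) -> str:
        async with semaphore:
            return await score_example(example)

    return await asyncio.gather(*(bounded(e) for e in examples))

if __name__ == "__main__":
    print(asyncio.run(score_all([f"ex{i}" for i in range(10)])))
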
@@ -377,6 +379,15 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
 
    info(f"Successfully merged {len(merged_results)} results")
 
+    # Evaluate rules against local scoring results if rules exist (this cant be done just yet)
+    # if evaluation_run.rules and merged_results:
+    #     run_rules(
+    #         local_results=merged_results,
+    #         rules=evaluation_run.rules,
+    #         judgment_api_key=evaluation_run.judgment_api_key,
+    #         organization_id=evaluation_run.organization_id
+    #     )
+
    if evaluation_run.log_results:
        log_evaluation_results(merged_results, evaluation_run)
 
@@ -34,22 +34,22 @@ class APIJudgmentScorer(BaseModel):
     @field_validator('score_type')
     def convert_to_enum_value(cls, v):
         """
-        Validates that the `score_type` is a valid `JudgmentMetric` enum value.
-        Converts string values to `JudgmentMetric` enum values.
+        Validates that the `score_type` is a valid `APIScorer` enum value.
+        Converts string values to `APIScorer` enum values.
         """
         debug(f"Attempting to convert score_type value: {v}")
         if isinstance(v, APIScorer):
-            info(f"Using existing JudgmentMetric: {v.value}")
-            return v.value
+            info(f"Using existing APIScorer: {v}")
+            return v
         elif isinstance(v, str):
-            debug(f"Converting string value to JudgmentMetric enum: {v}")
-            return APIScorer[v.upper()].value
+            debug(f"Converting string value to APIScorer enum: {v}")
+            return APIScorer[v.upper()]
         error(f"Invalid score_type value: {v}")
         raise ValueError(f"Invalid value for score_type: {v}")
-
+
     def __str__(self):
-        return f"JudgmentScorer(score_type={self.score_type}, threshold={self.threshold})"
-
+        return f"JudgmentScorer(score_type={self.score_type.value}, threshold={self.threshold})"
+
     def to_dict(self) -> dict:
         """
         Converts the scorer configuration to a dictionary format.
@@ -58,7 +58,6 @@ class APIJudgmentScorer(BaseModel):
             dict: A dictionary containing the scorer's configuration
         """
         return {
-            "score_type": self.score_type,
+            "score_type": str(self.score_type.value), # Convert enum to string for serialization
             "threshold": self.threshold
-        }
-
+        }
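
Together with the validator change above, score_type is now stored as an APIScorer enum member rather than its string value, while __str__ and to_dict() read .value explicitly wherever a plain string is needed; the next hunk shows the same __str__ change again. Below is a minimal, self-contained sketch of that pattern using a hypothetical two-member enum standing in for the real APIScorer (which lives in judgeval.constants and is not shown here); mode="before" is used so the string branch of the validator is actually exercised.

from enum import Enum
from pydantic import BaseModel, field_validator

class APIScorer(str, Enum):
    # Hypothetical members for illustration only.
    ANSWER_RELEVANCY = "answer_relevancy"
    FAITHFULNESS = "faithfulness"

class ScorerSketch(BaseModel):
    # Minimal stand-in for APIJudgmentScorer, mirroring the hunks above.
    score_type: APIScorer
    threshold: float

    @field_validator("score_type", mode="before")
    def convert_to_enum_value(cls, v):
        # Keep enum members as-is; map strings to members by name.
        if isinstance(v, APIScorer):
            return v
        elif isinstance(v, str):
            return APIScorer[v.upper()]
        raise ValueError(f"Invalid value for score_type: {v}")

    def __str__(self):
        return f"JudgmentScorer(score_type={self.score_type.value}, threshold={self.threshold})"

    def to_dict(self) -> dict:
        # The enum is converted to its string value only at serialization time.
        return {"score_type": str(self.score_type.value), "threshold": self.threshold}

scorer = ScorerSketch(score_type="answer_relevancy", threshold=0.7)
print(scorer)            # JudgmentScorer(score_type=answer_relevancy, threshold=0.7)
print(scorer.to_dict())  # {'score_type': 'answer_relevancy', 'threshold': 0.7}
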
@@ -48,5 +48,5 @@ class APIJudgmentScorer(BaseModel):
         raise ValueError(f"Invalid value for score_type: {v}")
 
     def __str__(self):
-        return f"JudgmentScorer(score_type={self.score_type}, threshold={self.threshold})"
+        return f"JudgmentScorer(score_type={self.score_type.value}, threshold={self.threshold})"
 
@@ -46,7 +46,6 @@ class AnswerRelevancyScorer(JudgevalScorer):
         )
         self.model, self.using_native_model = create_judge(model)
         self.evaluation_model = self.model.get_model_name()
-        print(self.model)
 
     def score_example(
         self,
@@ -0,0 +1,43 @@
+"""
+Handling alerts in Judgeval.
+"""
+from enum import Enum
+from typing import Dict, Any, List, Optional
+from pydantic import BaseModel
+
+class AlertStatus(str, Enum):
+    """Status of an alert evaluation."""
+    TRIGGERED = "triggered"
+    NOT_TRIGGERED = "not_triggered"
+
+class AlertResult(BaseModel):
+    """
+    Result of a rule evaluation.
+
+    Attributes:
+        rule_name: Name of the rule that was evaluated
+        rule_id: Unique identifier of the rule
+        status: Status of the alert (triggered or not)
+        conditions_result: List of condition evaluation results
+        metadata: Dictionary containing example_id, timestamp, and other metadata
+    """
+    rule_name: str
+    rule_id: Optional[str] = None  # The unique identifier of the rule
+    status: AlertStatus
+    conditions_result: List[Dict[str, Any]] = []
+    metadata: Dict[str, Any] = {}
+
+    @property
+    def example_id(self) -> Optional[str]:
+        """Get example_id from metadata for backward compatibility"""
+        return self.metadata.get("example_id")
+
+    @property
+    def timestamp(self) -> Optional[str]:
+        """Get timestamp from metadata for backward compatibility"""
+        return self.metadata.get("timestamp")
+
+    @property
+    def conditions_results(self) -> List[Dict[str, Any]]:
+        """Backwards compatibility property for the conditions_result field"""
+        return self.conditions_result
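
The new judgeval/utils/alerts.py module above introduces AlertStatus and AlertResult, with properties that keep the older accessors (example_id, timestamp, conditions_results) working on top of the new metadata and conditions_result fields. A short usage sketch based only on the definitions shown; the import path is inferred from the RECORD entry for judgeval/utils/alerts.py, and the field values are made up for illustration.

from judgeval.utils.alerts import AlertResult, AlertStatus

result = AlertResult(
    rule_name="faithfulness_floor",  # hypothetical rule name
    rule_id="rule-123",              # hypothetical identifier
    status=AlertStatus.TRIGGERED,
    conditions_result=[{"metric": "faithfulness", "passed": False}],
    metadata={"example_id": "ex-42", "timestamp": "2025-01-01T00:00:00Z"},
)

# The backward-compatibility properties read through to metadata / conditions_result.
print(result.example_id)           # "ex-42"
print(result.timestamp)            # "2025-01-01T00:00:00Z"
print(result.conditions_results)   # same list as result.conditions_result
print(result.status == AlertStatus.TRIGGERED)  # True
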
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.0.14
+Version: 0.0.16
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -1,33 +1,34 @@
 judgeval/__init__.py,sha256=xiiG4CkeaOtey4fusCd9CBz0BVqzTIbV-K2EFIU0rUM,283
 judgeval/clients.py,sha256=Ns5ljrgPPXUMo7fSPJxO12H64lcPyKeQPIVG_RMi2cM,1162
-judgeval/constants.py,sha256=43hGesvBbX1uzc4KXvjLCVdd6cyZRMSnEJp11oA7h74,2794
-judgeval/evaluation_run.py,sha256=59lG8AUFTKqbY_JVEEA0I093-Pmiy0ERYDK5BuXuEGg,5965
-judgeval/judgment_client.py,sha256=ryGT3A9-Him6oco3WvuHbjB-FVvAR3wCiiGz03eO_Q4,15409
-judgeval/run_evaluation.py,sha256=Cc7BS07WyqsNpQ38HdMdRI782N3DANjM8UcIq9AwaGA,20769
+judgeval/constants.py,sha256=9ndMvpjXEUPF3RlZZXyTXGDVWE3roH9MH6CZ9h38lrc,4700
+judgeval/evaluation_run.py,sha256=qGNafAPLW9tq6KV3eDolIASodLxcSBWQO0mQ0Aq9Cig,6285
+judgeval/judgment_client.py,sha256=fg45YySB-WbusPw3JmsQEvMIiqBzKcZiXl5I8ewRft0,23252
+judgeval/rules.py,sha256=6RT3DdMvu-LpjqiSDMAd4d2T8S8pOfc_IBEvTgf3GS4,15665
+judgeval/run_evaluation.py,sha256=1BRIaDnu622lzgEgbNzD5T4_hbNfMaJZqiicBbEBGQg,21416
 judgeval/common/__init__.py,sha256=7d24BRxtncpMj3AAJCj8RS7TqgjXmW777HVZH6-3sBs,289
 judgeval/common/exceptions.py,sha256=U-TxHLn7oVMezsMuoYouNDb2XuS8RCggfntYf5_6u4E,565
 judgeval/common/logger.py,sha256=QXN3UMymmKu2iMEMEgATLBnMDjGr_pE2iOSEFoICgg8,6092
-judgeval/common/tracer.py,sha256=qam2suh-0_Cu_B7AWg3AMfEo2TisRZVY1SnAfqhiFQo,33211
-judgeval/common/utils.py,sha256=3WRyyX0tvnnj_VAVlEdtZrfzyWj6zfX04xdpCtE1m5Y,33736
+judgeval/common/tracer.py,sha256=pggEJTAH690jg7LPTfVBdGG3IMjkqpbsUBIoSnGtWes,36477
+judgeval/common/utils.py,sha256=T1lpObopcH868NIgOTzNViTB33OGadcVWxWcfh2pm3E,33439
 judgeval/data/__init__.py,sha256=YferxwmUqoBi18hrdgro0BD0h4pt20LAqISeUzGMcVU,474
 judgeval/data/api_example.py,sha256=vwWFbI6eJr5VgURCRbuSiMtEXLUbTCih_BcaqEBy-pg,4108
 judgeval/data/example.py,sha256=Rd-eDEM-giYfkfsGh_PBS2wwl15QlQPzbMV-J64Yj5E,2991
 judgeval/data/result.py,sha256=8FIO-bFKPegZuByKRjA2_sumjb8oGWQ5ZeQ1RVz5z2w,4393
 judgeval/data/scorer_data.py,sha256=pYljblCPZrlMIv5Eg7R-clnmsqzUBAwokKjZpwa0DXE,3280
 judgeval/data/datasets/__init__.py,sha256=eO6ayeM_bTGwIt0eDSlTBIIBvXvIWRWWSfYZrZROPiQ,265
-judgeval/data/datasets/dataset.py,sha256=KdAY0KRUB2jxcGmc1XXXheFFcPsGFOIGY-kTwBNQS_Y,12080
-judgeval/data/datasets/eval_dataset_client.py,sha256=DzxWQIiHlbpg6FpmWY6brcSP_h_rGcztk2A_6tQNFys,11411
+judgeval/data/datasets/dataset.py,sha256=FRl2efBQZEpyK_ZTM7FMQQ7wjmtvcHCMFBq8L7O2Wus,12080
+judgeval/data/datasets/eval_dataset_client.py,sha256=UDzWOYlJqMDEYLyp9IyNBXWrXG2TyZ2L2JJnzly2mgc,11353
 judgeval/data/datasets/ground_truth.py,sha256=OTBs3VZe-Wp0vEXEsq14GPZHYtpWT16bhGQTycIvkKc,2057
 judgeval/data/datasets/utils.py,sha256=lQxyl7mevct7JcDSyIrU_8QOzT-EYPWEvoUiAeOdeek,2502
 judgeval/judges/__init__.py,sha256=tyQ5KY88Kp1Ctfw2IJxnVEpy8DnFCtmy04JdPOpp-As,339
 judgeval/judges/base_judge.py,sha256=qhYSFxE21WajYNaT4X-qwWGtpo_tqzBzdqbszSheSD8,1000
 judgeval/judges/litellm_judge.py,sha256=EIL58Teptv8DzZUO3yP2RDQCDq-aoBB6HPZzPdK6KTg,2424
 judgeval/judges/mixture_of_judges.py,sha256=OuGWCuXyqe7s_Y74ij90TJFRfHU-VAFyJVVrwBM0RO0,15532
-judgeval/judges/together_judge.py,sha256=x3jf-tq77QPXHeeoF739f69hE_0VceXD9FHLrVFdGVA,2275
-judgeval/judges/utils.py,sha256=sYxSJq5cI9LtyJaxurcW9IwngALC9Ty8F_Mb8gz81nE,2732
+judgeval/judges/together_judge.py,sha256=l00hhPerAZXg3oYBd8cyMtWsOTNt_0FIqoxhKJKQe3k,2302
+judgeval/judges/utils.py,sha256=9lvUxziGV86ISvVFxYBWc09TWFyAQgUTyPf_a9mD5Rs,2686
 judgeval/scorers/__init__.py,sha256=XcDdLn_s16rSQob0896oj4JXTA8-Xfl271TUEBj6Oew,998
-judgeval/scorers/api_scorer.py,sha256=88kCWr6IetLFn3ziTPG-lwDWvMhFUC6xfINU1MJBoho,2125
-judgeval/scorers/base_scorer.py,sha256=mbOReG88fWaqCnC8F0u5QepRlzgVkuOz89KEKYxrmMc,1794
+judgeval/scorers/api_scorer.py,sha256=PPpropMg_vFyUZULWqRPhtz_h2-NVydBMNnGtRpGk4E,2135
+judgeval/scorers/base_scorer.py,sha256=lz3QWPQQIbtsA-TWUjXYYRfQ96uCaAzqxt7Dn4TJa4s,1800
 judgeval/scorers/exceptions.py,sha256=eGW5CuJgZ5YJBFrE4FHDSF651PO1dKAZ379mJ8gOsfo,178
 judgeval/scorers/judgeval_scorer.py,sha256=T9fkJwFVYMzW88TFr-RWg-Fqmp-cdrA8bLFymqMzOa8,6291
 judgeval/scorers/prompt_scorer.py,sha256=UHkOUts1aIQCoYFcr-sKyucmvv_8ONFE5LZO01aObd0,17825
@@ -53,7 +54,7 @@ judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__ini
 judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py,sha256=PDThn6SzqxgMXT7BpQs2TEBOsgfD5fi6fnKk31qaCTo,10227
 judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py,sha256=5B_G7PPEsfLq6cwWkKWcLuy2k_5RgoOzsW3wOZLIeMk,6703
 judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py,sha256=r6yae5iaWtlBL_cP8I-1SuhS9dulsy1e7W9Rcz82v6E,169
-judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py,sha256=QG-oxa6-c74VzTuni17RQ9aeT0t1lCuxQXDMznqX8rc,10714
+judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py,sha256=ipER9zyJLq0NqcmxYwfDhavp4rUDYIaDbghR1R0YpaU,10688
 judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py,sha256=GfbKv595s1a0dB1No_kDsap6gfcr6dYRGiXx0PDb89k,6557
 judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py,sha256=J6tc-T60AVOEaNVuoVU0XIG6dvQri99Q0tnX_Tm-0vc,108
 judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py,sha256=tRgRyjGpc4Pe3nQ1c-5NeNYFvbulL7YEnoRa9zLp1gc,9649
@@ -78,7 +79,8 @@ judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarizat
 judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py,sha256=JUB3TMqS1OHr6PqpIGqkyiBNbyfUaw7lZuUATjU3_ek,168
 judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py,sha256=CYGRJY5EuyICYzHrmFdLykwXakX8AC7G3Bhj7p6szfY,5493
 judgeval/tracer/__init__.py,sha256=wy3DYpH8U_z0GO_K_gOSkK0tTTD-u5eLDo0T5xIBoAc,147
-judgeval-0.0.14.dist-info/METADATA,sha256=ZmCAECDNWwzpuES1slYKWcY_U-SMOsjaOdtSoj6wu0I,1283
-judgeval-0.0.14.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-judgeval-0.0.14.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
-judgeval-0.0.14.dist-info/RECORD,,
+judgeval/utils/alerts.py,sha256=RgW5R9Dn3Jtim0OyAYDbNzjoX2s6SA4Mw16GyyaikjI,1424
+judgeval-0.0.16.dist-info/METADATA,sha256=EQHgzBDEctzLCN0SpLC1m53Z8xVlKZvE0sVMPYkK7yc,1283
+judgeval-0.0.16.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+judgeval-0.0.16.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
+judgeval-0.0.16.dist-info/RECORD,,