judgeval 0.0.14__py3-none-any.whl → 0.0.16__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- judgeval/common/tracer.py +104 -28
- judgeval/common/utils.py +12 -13
- judgeval/constants.py +61 -10
- judgeval/data/datasets/dataset.py +1 -1
- judgeval/data/datasets/eval_dataset_client.py +0 -1
- judgeval/evaluation_run.py +8 -0
- judgeval/judges/together_judge.py +1 -1
- judgeval/judges/utils.py +1 -1
- judgeval/judgment_client.py +139 -14
- judgeval/rules.py +384 -0
- judgeval/run_evaluation.py +16 -5
- judgeval/scorers/api_scorer.py +11 -12
- judgeval/scorers/base_scorer.py +1 -1
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -1
- judgeval/utils/alerts.py +43 -0
- {judgeval-0.0.14.dist-info → judgeval-0.0.16.dist-info}/METADATA +1 -1
- {judgeval-0.0.14.dist-info → judgeval-0.0.16.dist-info}/RECORD +19 -17
- {judgeval-0.0.14.dist-info → judgeval-0.0.16.dist-info}/WHEEL +0 -0
- {judgeval-0.0.14.dist-info → judgeval-0.0.16.dist-info}/licenses/LICENSE.md +0 -0
judgeval/run_evaluation.py
CHANGED
@@ -20,6 +20,7 @@ from judgeval.constants import (
|
|
20
20
|
ROOT_API,
|
21
21
|
JUDGMENT_EVAL_API_URL,
|
22
22
|
JUDGMENT_EVAL_LOG_API_URL,
|
23
|
+
MAX_CONCURRENT_EVALUATIONS
|
23
24
|
)
|
24
25
|
from judgeval.common.exceptions import JudgmentAPIError
|
25
26
|
from judgeval.evaluation_run import EvaluationRun
|
@@ -30,6 +31,7 @@ from judgeval.common.logger import (
|
|
30
31
|
error,
|
31
32
|
example_logging_context
|
32
33
|
)
|
34
|
+
from judgeval.rules import RulesEngine, Rule, AlertResult, AlertStatus
|
33
35
|
|
34
36
|
|
35
37
|
def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
|
@@ -228,6 +230,7 @@ def log_evaluation_results(merged_results: List[ScoringResult], evaluation_run:
|
|
228
230
|
raise ValueError(f"Failed to save evaluation results to DB: {str(e)}")
|
229
231
|
|
230
232
|
|
233
|
+
|
231
234
|
def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[ScoringResult]:
|
232
235
|
"""
|
233
236
|
Executes an evaluation of `Example`s using one or more `Scorer`s
|
@@ -245,7 +248,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
|
|
245
248
|
metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
|
246
249
|
judgment_api_key (Optional[str]): The API key for running evaluations on the Judgment API
|
247
250
|
log_results (bool): Whether to log the results to the Judgment API
|
248
|
-
|
251
|
+
rules (Optional[List[Rule]]): Rules to evaluate against scoring results
|
249
252
|
|
250
253
|
Returns:
|
251
254
|
List[ScoringResult]: The results of the evaluation. Each result is a dictionary containing the fields of a `ScoringResult` object.
|
@@ -316,7 +319,8 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
|
|
316
319
|
metadata=evaluation_run.metadata,
|
317
320
|
judgment_api_key=evaluation_run.judgment_api_key,
|
318
321
|
organization_id=evaluation_run.organization_id,
|
319
|
-
log_results=evaluation_run.log_results
|
322
|
+
log_results=evaluation_run.log_results,
|
323
|
+
rules=evaluation_run.rules
|
320
324
|
)
|
321
325
|
debug("Sending request to Judgment API")
|
322
326
|
response_data: List[Dict] = execute_api_eval(api_evaluation_run) # Dicts are `ScoringResult` objs
|
@@ -346,7 +350,6 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
|
|
346
350
|
]
|
347
351
|
|
348
352
|
api_results.append(ScoringResult(**filtered_result))
|
349
|
-
|
350
353
|
# Run local evals
|
351
354
|
if local_scorers: # List[JudgevalScorer]
|
352
355
|
info("Starting local evaluation")
|
@@ -364,12 +367,11 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
|
|
364
367
|
show_indicator=True,
|
365
368
|
_use_bar_indicator=True,
|
366
369
|
throttle_value=0,
|
367
|
-
max_concurrent=
|
370
|
+
max_concurrent=MAX_CONCURRENT_EVALUATIONS,
|
368
371
|
)
|
369
372
|
)
|
370
373
|
local_results = results
|
371
374
|
info(f"Local evaluation complete with {len(local_results)} results")
|
372
|
-
|
373
375
|
# Aggregate the ScorerData from the API and local evaluations
|
374
376
|
debug("Merging API and local results")
|
375
377
|
merged_results: List[ScoringResult] = merge_results(api_results, local_results)
|
@@ -377,6 +379,15 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
|
|
377
379
|
|
378
380
|
info(f"Successfully merged {len(merged_results)} results")
|
379
381
|
|
382
|
+
# Evaluate rules against local scoring results if rules exist (this cant be done just yet)
|
383
|
+
# if evaluation_run.rules and merged_results:
|
384
|
+
# run_rules(
|
385
|
+
# local_results=merged_results,
|
386
|
+
# rules=evaluation_run.rules,
|
387
|
+
# judgment_api_key=evaluation_run.judgment_api_key,
|
388
|
+
# organization_id=evaluation_run.organization_id
|
389
|
+
# )
|
390
|
+
|
380
391
|
if evaluation_run.log_results:
|
381
392
|
log_evaluation_results(merged_results, evaluation_run)
|
382
393
|
|
judgeval/scorers/api_scorer.py
CHANGED
@@ -34,22 +34,22 @@ class APIJudgmentScorer(BaseModel):
|
|
34
34
|
@field_validator('score_type')
|
35
35
|
def convert_to_enum_value(cls, v):
|
36
36
|
"""
|
37
|
-
Validates that the `score_type` is a valid `
|
38
|
-
Converts string values to `
|
37
|
+
Validates that the `score_type` is a valid `APIScorer` enum value.
|
38
|
+
Converts string values to `APIScorer` enum values.
|
39
39
|
"""
|
40
40
|
debug(f"Attempting to convert score_type value: {v}")
|
41
41
|
if isinstance(v, APIScorer):
|
42
|
-
info(f"Using existing
|
43
|
-
return v
|
42
|
+
info(f"Using existing APIScorer: {v}")
|
43
|
+
return v
|
44
44
|
elif isinstance(v, str):
|
45
|
-
debug(f"Converting string value to
|
46
|
-
return APIScorer[v.upper()]
|
45
|
+
debug(f"Converting string value to APIScorer enum: {v}")
|
46
|
+
return APIScorer[v.upper()]
|
47
47
|
error(f"Invalid score_type value: {v}")
|
48
48
|
raise ValueError(f"Invalid value for score_type: {v}")
|
49
|
-
|
49
|
+
|
50
50
|
def __str__(self):
|
51
|
-
return f"JudgmentScorer(score_type={self.score_type}, threshold={self.threshold})"
|
52
|
-
|
51
|
+
return f"JudgmentScorer(score_type={self.score_type.value}, threshold={self.threshold})"
|
52
|
+
|
53
53
|
def to_dict(self) -> dict:
|
54
54
|
"""
|
55
55
|
Converts the scorer configuration to a dictionary format.
|
@@ -58,7 +58,6 @@ class APIJudgmentScorer(BaseModel):
|
|
58
58
|
dict: A dictionary containing the scorer's configuration
|
59
59
|
"""
|
60
60
|
return {
|
61
|
-
"score_type": self.score_type,
|
61
|
+
"score_type": str(self.score_type.value), # Convert enum to string for serialization
|
62
62
|
"threshold": self.threshold
|
63
|
-
}
|
64
|
-
|
63
|
+
}
|
judgeval/scorers/base_scorer.py
CHANGED
@@ -48,5 +48,5 @@ class APIJudgmentScorer(BaseModel):
|
|
48
48
|
raise ValueError(f"Invalid value for score_type: {v}")
|
49
49
|
|
50
50
|
def __str__(self):
|
51
|
-
return f"JudgmentScorer(score_type={self.score_type}, threshold={self.threshold})"
|
51
|
+
return f"JudgmentScorer(score_type={self.score_type.value}, threshold={self.threshold})"
|
52
52
|
|
judgeval/utils/alerts.py
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
"""
|
2
|
+
Handling alerts in Judgeval.
|
3
|
+
"""
|
4
|
+
from enum import Enum
|
5
|
+
from typing import Dict, Any, List, Optional
|
6
|
+
from pydantic import BaseModel
|
7
|
+
|
8
|
+
class AlertStatus(str, Enum):
|
9
|
+
"""Status of an alert evaluation."""
|
10
|
+
TRIGGERED = "triggered"
|
11
|
+
NOT_TRIGGERED = "not_triggered"
|
12
|
+
|
13
|
+
class AlertResult(BaseModel):
|
14
|
+
"""
|
15
|
+
Result of a rule evaluation.
|
16
|
+
|
17
|
+
Attributes:
|
18
|
+
rule_name: Name of the rule that was evaluated
|
19
|
+
rule_id: Unique identifier of the rule
|
20
|
+
status: Status of the alert (triggered or not)
|
21
|
+
conditions_result: List of condition evaluation results
|
22
|
+
metadata: Dictionary containing example_id, timestamp, and other metadata
|
23
|
+
"""
|
24
|
+
rule_name: str
|
25
|
+
rule_id: Optional[str] = None # The unique identifier of the rule
|
26
|
+
status: AlertStatus
|
27
|
+
conditions_result: List[Dict[str, Any]] = []
|
28
|
+
metadata: Dict[str, Any] = {}
|
29
|
+
|
30
|
+
@property
|
31
|
+
def example_id(self) -> Optional[str]:
|
32
|
+
"""Get example_id from metadata for backward compatibility"""
|
33
|
+
return self.metadata.get("example_id")
|
34
|
+
|
35
|
+
@property
|
36
|
+
def timestamp(self) -> Optional[str]:
|
37
|
+
"""Get timestamp from metadata for backward compatibility"""
|
38
|
+
return self.metadata.get("timestamp")
|
39
|
+
|
40
|
+
@property
|
41
|
+
def conditions_results(self) -> List[Dict[str, Any]]:
|
42
|
+
"""Backwards compatibility property for the conditions_result field"""
|
43
|
+
return self.conditions_result
|
{judgeval-0.0.14.dist-info → judgeval-0.0.16.dist-info}/RECORD
@@ -1,33 +1,34 @@
|
|
1
1
|
judgeval/__init__.py,sha256=xiiG4CkeaOtey4fusCd9CBz0BVqzTIbV-K2EFIU0rUM,283
|
2
2
|
judgeval/clients.py,sha256=Ns5ljrgPPXUMo7fSPJxO12H64lcPyKeQPIVG_RMi2cM,1162
|
3
|
-
judgeval/constants.py,sha256=
|
4
|
-
judgeval/evaluation_run.py,sha256=
|
5
|
-
judgeval/judgment_client.py,sha256=
|
6
|
-
judgeval/
|
3
|
+
judgeval/constants.py,sha256=9ndMvpjXEUPF3RlZZXyTXGDVWE3roH9MH6CZ9h38lrc,4700
|
4
|
+
judgeval/evaluation_run.py,sha256=qGNafAPLW9tq6KV3eDolIASodLxcSBWQO0mQ0Aq9Cig,6285
|
5
|
+
judgeval/judgment_client.py,sha256=fg45YySB-WbusPw3JmsQEvMIiqBzKcZiXl5I8ewRft0,23252
|
6
|
+
judgeval/rules.py,sha256=6RT3DdMvu-LpjqiSDMAd4d2T8S8pOfc_IBEvTgf3GS4,15665
|
7
|
+
judgeval/run_evaluation.py,sha256=1BRIaDnu622lzgEgbNzD5T4_hbNfMaJZqiicBbEBGQg,21416
|
7
8
|
judgeval/common/__init__.py,sha256=7d24BRxtncpMj3AAJCj8RS7TqgjXmW777HVZH6-3sBs,289
|
8
9
|
judgeval/common/exceptions.py,sha256=U-TxHLn7oVMezsMuoYouNDb2XuS8RCggfntYf5_6u4E,565
|
9
10
|
judgeval/common/logger.py,sha256=QXN3UMymmKu2iMEMEgATLBnMDjGr_pE2iOSEFoICgg8,6092
|
10
|
-
judgeval/common/tracer.py,sha256=
|
11
|
-
judgeval/common/utils.py,sha256=
|
11
|
+
judgeval/common/tracer.py,sha256=pggEJTAH690jg7LPTfVBdGG3IMjkqpbsUBIoSnGtWes,36477
|
12
|
+
judgeval/common/utils.py,sha256=T1lpObopcH868NIgOTzNViTB33OGadcVWxWcfh2pm3E,33439
|
12
13
|
judgeval/data/__init__.py,sha256=YferxwmUqoBi18hrdgro0BD0h4pt20LAqISeUzGMcVU,474
|
13
14
|
judgeval/data/api_example.py,sha256=vwWFbI6eJr5VgURCRbuSiMtEXLUbTCih_BcaqEBy-pg,4108
|
14
15
|
judgeval/data/example.py,sha256=Rd-eDEM-giYfkfsGh_PBS2wwl15QlQPzbMV-J64Yj5E,2991
|
15
16
|
judgeval/data/result.py,sha256=8FIO-bFKPegZuByKRjA2_sumjb8oGWQ5ZeQ1RVz5z2w,4393
|
16
17
|
judgeval/data/scorer_data.py,sha256=pYljblCPZrlMIv5Eg7R-clnmsqzUBAwokKjZpwa0DXE,3280
|
17
18
|
judgeval/data/datasets/__init__.py,sha256=eO6ayeM_bTGwIt0eDSlTBIIBvXvIWRWWSfYZrZROPiQ,265
|
18
|
-
judgeval/data/datasets/dataset.py,sha256=
|
19
|
-
judgeval/data/datasets/eval_dataset_client.py,sha256=
|
19
|
+
judgeval/data/datasets/dataset.py,sha256=FRl2efBQZEpyK_ZTM7FMQQ7wjmtvcHCMFBq8L7O2Wus,12080
|
20
|
+
judgeval/data/datasets/eval_dataset_client.py,sha256=UDzWOYlJqMDEYLyp9IyNBXWrXG2TyZ2L2JJnzly2mgc,11353
|
20
21
|
judgeval/data/datasets/ground_truth.py,sha256=OTBs3VZe-Wp0vEXEsq14GPZHYtpWT16bhGQTycIvkKc,2057
|
21
22
|
judgeval/data/datasets/utils.py,sha256=lQxyl7mevct7JcDSyIrU_8QOzT-EYPWEvoUiAeOdeek,2502
|
22
23
|
judgeval/judges/__init__.py,sha256=tyQ5KY88Kp1Ctfw2IJxnVEpy8DnFCtmy04JdPOpp-As,339
|
23
24
|
judgeval/judges/base_judge.py,sha256=qhYSFxE21WajYNaT4X-qwWGtpo_tqzBzdqbszSheSD8,1000
|
24
25
|
judgeval/judges/litellm_judge.py,sha256=EIL58Teptv8DzZUO3yP2RDQCDq-aoBB6HPZzPdK6KTg,2424
|
25
26
|
judgeval/judges/mixture_of_judges.py,sha256=OuGWCuXyqe7s_Y74ij90TJFRfHU-VAFyJVVrwBM0RO0,15532
|
26
|
-
judgeval/judges/together_judge.py,sha256=
|
27
|
-
judgeval/judges/utils.py,sha256=
|
27
|
+
judgeval/judges/together_judge.py,sha256=l00hhPerAZXg3oYBd8cyMtWsOTNt_0FIqoxhKJKQe3k,2302
|
28
|
+
judgeval/judges/utils.py,sha256=9lvUxziGV86ISvVFxYBWc09TWFyAQgUTyPf_a9mD5Rs,2686
|
28
29
|
judgeval/scorers/__init__.py,sha256=XcDdLn_s16rSQob0896oj4JXTA8-Xfl271TUEBj6Oew,998
|
29
|
-
judgeval/scorers/api_scorer.py,sha256=
|
30
|
-
judgeval/scorers/base_scorer.py,sha256=
|
30
|
+
judgeval/scorers/api_scorer.py,sha256=PPpropMg_vFyUZULWqRPhtz_h2-NVydBMNnGtRpGk4E,2135
|
31
|
+
judgeval/scorers/base_scorer.py,sha256=lz3QWPQQIbtsA-TWUjXYYRfQ96uCaAzqxt7Dn4TJa4s,1800
|
31
32
|
judgeval/scorers/exceptions.py,sha256=eGW5CuJgZ5YJBFrE4FHDSF651PO1dKAZ379mJ8gOsfo,178
|
32
33
|
judgeval/scorers/judgeval_scorer.py,sha256=T9fkJwFVYMzW88TFr-RWg-Fqmp-cdrA8bLFymqMzOa8,6291
|
33
34
|
judgeval/scorers/prompt_scorer.py,sha256=UHkOUts1aIQCoYFcr-sKyucmvv_8ONFE5LZO01aObd0,17825
|
@@ -53,7 +54,7 @@ judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__ini
|
|
53
54
|
judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py,sha256=PDThn6SzqxgMXT7BpQs2TEBOsgfD5fi6fnKk31qaCTo,10227
|
54
55
|
judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py,sha256=5B_G7PPEsfLq6cwWkKWcLuy2k_5RgoOzsW3wOZLIeMk,6703
|
55
56
|
judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py,sha256=r6yae5iaWtlBL_cP8I-1SuhS9dulsy1e7W9Rcz82v6E,169
|
56
|
-
judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py,sha256=
|
57
|
+
judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py,sha256=ipER9zyJLq0NqcmxYwfDhavp4rUDYIaDbghR1R0YpaU,10688
|
57
58
|
judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py,sha256=GfbKv595s1a0dB1No_kDsap6gfcr6dYRGiXx0PDb89k,6557
|
58
59
|
judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py,sha256=J6tc-T60AVOEaNVuoVU0XIG6dvQri99Q0tnX_Tm-0vc,108
|
59
60
|
judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py,sha256=tRgRyjGpc4Pe3nQ1c-5NeNYFvbulL7YEnoRa9zLp1gc,9649
|
@@ -78,7 +79,8 @@ judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarizat
|
|
78
79
|
judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py,sha256=JUB3TMqS1OHr6PqpIGqkyiBNbyfUaw7lZuUATjU3_ek,168
|
79
80
|
judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py,sha256=CYGRJY5EuyICYzHrmFdLykwXakX8AC7G3Bhj7p6szfY,5493
|
80
81
|
judgeval/tracer/__init__.py,sha256=wy3DYpH8U_z0GO_K_gOSkK0tTTD-u5eLDo0T5xIBoAc,147
|
81
|
-
judgeval
|
82
|
-
judgeval-0.0.
|
83
|
-
judgeval-0.0.
|
84
|
-
judgeval-0.0.
|
82
|
+
judgeval/utils/alerts.py,sha256=RgW5R9Dn3Jtim0OyAYDbNzjoX2s6SA4Mw16GyyaikjI,1424
|
83
|
+
judgeval-0.0.16.dist-info/METADATA,sha256=EQHgzBDEctzLCN0SpLC1m53Z8xVlKZvE0sVMPYkK7yc,1283
|
84
|
+
judgeval-0.0.16.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
85
|
+
judgeval-0.0.16.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
|
86
|
+
judgeval-0.0.16.dist-info/RECORD,,
|
File without changes
|
File without changes
|