judgeval 0.0.14__py3-none-any.whl → 0.0.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/tracer.py +108 -30
- judgeval/common/utils.py +12 -13
- judgeval/constants.py +61 -10
- judgeval/data/datasets/dataset.py +1 -1
- judgeval/data/datasets/eval_dataset_client.py +10 -6
- judgeval/evaluation_run.py +8 -0
- judgeval/judges/together_judge.py +1 -1
- judgeval/judges/utils.py +1 -1
- judgeval/judgment_client.py +147 -18
- judgeval/rules.py +384 -0
- judgeval/run_evaluation.py +22 -8
- judgeval/scorers/api_scorer.py +11 -12
- judgeval/scorers/base_scorer.py +1 -1
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -1
- judgeval/utils/alerts.py +43 -0
- {judgeval-0.0.14.dist-info → judgeval-0.0.15.dist-info}/METADATA +1 -1
- {judgeval-0.0.14.dist-info → judgeval-0.0.15.dist-info}/RECORD +19 -17
- {judgeval-0.0.14.dist-info → judgeval-0.0.15.dist-info}/WHEEL +0 -0
- {judgeval-0.0.14.dist-info → judgeval-0.0.15.dist-info}/licenses/LICENSE.md +0 -0
judgeval/run_evaluation.py
CHANGED
@@ -20,6 +20,7 @@ from judgeval.constants import (
     ROOT_API,
     JUDGMENT_EVAL_API_URL,
     JUDGMENT_EVAL_LOG_API_URL,
+    MAX_CONCURRENT_EVALUATIONS
 )
 from judgeval.common.exceptions import JudgmentAPIError
 from judgeval.evaluation_run import EvaluationRun
@@ -30,6 +31,7 @@ from judgeval.common.logger import (
     error,
     example_logging_context
 )
+from judgeval.rules import RulesEngine, Rule, AlertResult, AlertStatus
 
 
 def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
@@ -53,7 +55,8 @@ def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
                 "Authorization": f"Bearer {evaluation_run.judgment_api_key}",
                 "X-Organization-Id": evaluation_run.organization_id
             },
-            json=payload
+            json=payload,
+            verify=False)
         response_data = response.json()
     except Exception as e:
         error(f"Error: {e}")
@@ -166,7 +169,8 @@ def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_k
             "eval_name": eval_name,
             "project_name": project_name,
             "judgment_api_key": judgment_api_key,
-        }
+        },
+        verify=False
     )
 
     if response.status_code == 409:
@@ -208,7 +212,8 @@ def log_evaluation_results(merged_results: List[ScoringResult], evaluation_run:
             "results": [result.to_dict() for result in merged_results],
             "project_name": evaluation_run.project_name,
             "eval_name": evaluation_run.eval_name,
-        }
+        },
+        verify=False
    )
 
     if not res.ok:
@@ -228,6 +233,7 @@ def log_evaluation_results(merged_results: List[ScoringResult], evaluation_run:
         raise ValueError(f"Failed to save evaluation results to DB: {str(e)}")
 
 
+
 def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[ScoringResult]:
     """
     Executes an evaluation of `Example`s using one or more `Scorer`s
@@ -245,7 +251,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
         metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
         judgment_api_key (Optional[str]): The API key for running evaluations on the Judgment API
         log_results (bool): Whether to log the results to the Judgment API
-
+        rules (Optional[List[Rule]]): Rules to evaluate against scoring results
 
     Returns:
         List[ScoringResult]: The results of the evaluation. Each result is a dictionary containing the fields of a `ScoringResult` object.
@@ -316,7 +322,8 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
             metadata=evaluation_run.metadata,
             judgment_api_key=evaluation_run.judgment_api_key,
             organization_id=evaluation_run.organization_id,
-            log_results=evaluation_run.log_results
+            log_results=evaluation_run.log_results,
+            rules=evaluation_run.rules
         )
         debug("Sending request to Judgment API")
         response_data: List[Dict] = execute_api_eval(api_evaluation_run)  # Dicts are `ScoringResult` objs
@@ -346,7 +353,6 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
             ]
 
             api_results.append(ScoringResult(**filtered_result))
-
     # Run local evals
     if local_scorers:  # List[JudgevalScorer]
         info("Starting local evaluation")
@@ -364,12 +370,11 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
                 show_indicator=True,
                 _use_bar_indicator=True,
                 throttle_value=0,
-                max_concurrent=
+                max_concurrent=MAX_CONCURRENT_EVALUATIONS,
             )
         )
         local_results = results
         info(f"Local evaluation complete with {len(local_results)} results")
-
     # Aggregate the ScorerData from the API and local evaluations
     debug("Merging API and local results")
     merged_results: List[ScoringResult] = merge_results(api_results, local_results)
@@ -377,6 +382,15 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
 
     info(f"Successfully merged {len(merged_results)} results")
 
+    # Evaluate rules against local scoring results if rules exist (this cant be done just yet)
+    # if evaluation_run.rules and merged_results:
+    #     run_rules(
+    #         local_results=merged_results,
+    #         rules=evaluation_run.rules,
+    #         judgment_api_key=evaluation_run.judgment_api_key,
+    #         organization_id=evaluation_run.organization_id
+    #     )
+
     if evaluation_run.log_results:
         log_evaluation_results(merged_results, evaluation_run)
 
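Taken together, these hunks thread an optional `rules` list from the caller through `run_eval` into the `EvaluationRun` payload sent to the Judgment API, alongside the new `MAX_CONCURRENT_EVALUATIONS` cap on local scoring. Below is a minimal sketch of how that wiring might be exercised, assuming only what the hunks above show; `Rule` construction lives in the new `judgeval/rules.py`, which is not part of this diff, and any field names beyond those visible here are placeholders:

```python
# Sketch only: exercises the `rules` pass-through added in 0.0.15.
from judgeval.evaluation_run import EvaluationRun
from judgeval.run_evaluation import run_eval
from judgeval.rules import Rule  # imported by run_evaluation.py as of this release

my_rules: list[Rule] = []  # placeholder; real Rule instances come from judgeval/rules.py

evaluation = EvaluationRun(
    project_name="demo-project",   # hypothetical values for illustration
    eval_name="demo-eval",
    log_results=True,
    rules=my_rules,                # new field, forwarded as rules=evaluation_run.rules
    # ...plus the examples/scorers fields an EvaluationRun normally carries
)

results = run_eval(evaluation)     # returns List[ScoringResult]; logs results if log_results=True
```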
judgeval/scorers/api_scorer.py
CHANGED
@@ -34,22 +34,22 @@ class APIJudgmentScorer(BaseModel):
     @field_validator('score_type')
     def convert_to_enum_value(cls, v):
         """
-        Validates that the `score_type` is a valid `
-        Converts string values to `
+        Validates that the `score_type` is a valid `APIScorer` enum value.
+        Converts string values to `APIScorer` enum values.
         """
         debug(f"Attempting to convert score_type value: {v}")
         if isinstance(v, APIScorer):
-            info(f"Using existing 
-            return v
+            info(f"Using existing APIScorer: {v}")
+            return v
         elif isinstance(v, str):
-            debug(f"Converting string value to 
-            return APIScorer[v.upper()]
+            debug(f"Converting string value to APIScorer enum: {v}")
+            return APIScorer[v.upper()]
         error(f"Invalid score_type value: {v}")
         raise ValueError(f"Invalid value for score_type: {v}")
-
+
     def __str__(self):
-        return f"JudgmentScorer(score_type={self.score_type}, threshold={self.threshold})"
-
+        return f"JudgmentScorer(score_type={self.score_type.value}, threshold={self.threshold})"
+
     def to_dict(self) -> dict:
         """
         Converts the scorer configuration to a dictionary format.
@@ -58,7 +58,6 @@ class APIJudgmentScorer(BaseModel):
             dict: A dictionary containing the scorer's configuration
         """
         return {
-            "score_type": self.score_type,
+            "score_type": str(self.score_type.value),  # Convert enum to string for serialization
             "threshold": self.threshold
-        }
-
+        }
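The validator and `to_dict` changes above follow a common pydantic pattern: accept either an enum member or its name as a string, normalize to the enum, and serialize back to the plain string value. A self-contained illustration of that pattern, using a stand-in `ScoreKind` enum rather than the real `APIScorer` members (which are defined in `judgeval/constants.py` and not shown in this diff):

```python
from enum import Enum
from pydantic import BaseModel, field_validator


class ScoreKind(str, Enum):  # stand-in for judgeval.constants.APIScorer
    ANSWER_RELEVANCY = "answer_relevancy"


class DemoScorer(BaseModel):
    score_type: ScoreKind
    threshold: float = 0.5

    @field_validator("score_type", mode="before")
    @classmethod
    def convert_to_enum_value(cls, v):
        # Accept either a ScoreKind member or its name as a string.
        if isinstance(v, ScoreKind):
            return v
        if isinstance(v, str):
            return ScoreKind[v.upper()]  # same name lookup as APIScorer[v.upper()]
        raise ValueError(f"Invalid value for score_type: {v}")

    def to_dict(self) -> dict:
        # Serialize the enum back to its plain string value, as in 0.0.15.
        return {"score_type": str(self.score_type.value), "threshold": self.threshold}


print(DemoScorer(score_type="answer_relevancy").to_dict())
# {'score_type': 'answer_relevancy', 'threshold': 0.5}
```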
judgeval/scorers/base_scorer.py
CHANGED
@@ -48,5 +48,5 @@ class APIJudgmentScorer(BaseModel):
         raise ValueError(f"Invalid value for score_type: {v}")
 
     def __str__(self):
-        return f"JudgmentScorer(score_type={self.score_type}, threshold={self.threshold})"
+        return f"JudgmentScorer(score_type={self.score_type.value}, threshold={self.threshold})"
 
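Both `__str__` edits (here and in `api_scorer.py`) swap the enum member for its `.value` inside the f-string. With a plain `Enum`, formatting the member yields `ClassName.MEMBER`, while `.value` yields the underlying string, which is usually what you want in a log line. A quick stand-in illustration (not the real `APIScorer`):

```python
from enum import Enum


class Kind(Enum):  # stand-in enum for illustration only
    FAITHFULNESS = "faithfulness"


print(f"score_type={Kind.FAITHFULNESS}")        # score_type=Kind.FAITHFULNESS
print(f"score_type={Kind.FAITHFULNESS.value}")  # score_type=faithfulness
```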
judgeval/utils/alerts.py
ADDED
@@ -0,0 +1,43 @@
+"""
+Handling alerts in Judgeval.
+"""
+from enum import Enum
+from typing import Dict, Any, List, Optional
+from pydantic import BaseModel
+
+class AlertStatus(str, Enum):
+    """Status of an alert evaluation."""
+    TRIGGERED = "triggered"
+    NOT_TRIGGERED = "not_triggered"
+
+class AlertResult(BaseModel):
+    """
+    Result of a rule evaluation.
+
+    Attributes:
+        rule_name: Name of the rule that was evaluated
+        rule_id: Unique identifier of the rule
+        status: Status of the alert (triggered or not)
+        conditions_result: List of condition evaluation results
+        metadata: Dictionary containing example_id, timestamp, and other metadata
+    """
+    rule_name: str
+    rule_id: Optional[str] = None  # The unique identifier of the rule
+    status: AlertStatus
+    conditions_result: List[Dict[str, Any]] = []
+    metadata: Dict[str, Any] = {}
+
+    @property
+    def example_id(self) -> Optional[str]:
+        """Get example_id from metadata for backward compatibility"""
+        return self.metadata.get("example_id")
+
+    @property
+    def timestamp(self) -> Optional[str]:
+        """Get timestamp from metadata for backward compatibility"""
+        return self.metadata.get("timestamp")
+
+    @property
+    def conditions_results(self) -> List[Dict[str, Any]]:
+        """Backwards compatibility property for the conditions_result field"""
+        return self.conditions_result
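Since `AlertResult` is a plain pydantic model, the new module can be exercised directly. A minimal usage sketch of the class added above; the rule name, id, and condition payload shown here are made up for illustration:

```python
from judgeval.utils.alerts import AlertResult, AlertStatus

result = AlertResult(
    rule_name="latency-threshold",   # hypothetical rule name
    rule_id="rule-123",              # hypothetical id
    status=AlertStatus.TRIGGERED,
    conditions_result=[{"metric": "latency", "passed": False}],  # hypothetical condition shape
    metadata={"example_id": "ex-1", "timestamp": "2025-01-01T00:00:00Z"},
)

# Backward-compatibility properties read through to metadata / conditions_result.
assert result.example_id == "ex-1"
assert result.timestamp == "2025-01-01T00:00:00Z"
assert result.conditions_results == result.conditions_result
```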
{judgeval-0.0.14.dist-info → judgeval-0.0.15.dist-info}/RECORD
CHANGED
@@ -1,33 +1,34 @@
 judgeval/__init__.py,sha256=xiiG4CkeaOtey4fusCd9CBz0BVqzTIbV-K2EFIU0rUM,283
 judgeval/clients.py,sha256=Ns5ljrgPPXUMo7fSPJxO12H64lcPyKeQPIVG_RMi2cM,1162
-judgeval/constants.py,sha256=
-judgeval/evaluation_run.py,sha256=
-judgeval/judgment_client.py,sha256=
-judgeval/
+judgeval/constants.py,sha256=9ndMvpjXEUPF3RlZZXyTXGDVWE3roH9MH6CZ9h38lrc,4700
+judgeval/evaluation_run.py,sha256=qGNafAPLW9tq6KV3eDolIASodLxcSBWQO0mQ0Aq9Cig,6285
+judgeval/judgment_client.py,sha256=HfNcpppOQH2QPHMsGpt5E2YuEgGfp31dTT4ZexhtN7o,23377
+judgeval/rules.py,sha256=6RT3DdMvu-LpjqiSDMAd4d2T8S8pOfc_IBEvTgf3GS4,15665
+judgeval/run_evaluation.py,sha256=4GnezPm_gGjPppJ_9thJqMJ56uHBIuNkZYc8JoYN3Fw,21490
 judgeval/common/__init__.py,sha256=7d24BRxtncpMj3AAJCj8RS7TqgjXmW777HVZH6-3sBs,289
 judgeval/common/exceptions.py,sha256=U-TxHLn7oVMezsMuoYouNDb2XuS8RCggfntYf5_6u4E,565
 judgeval/common/logger.py,sha256=QXN3UMymmKu2iMEMEgATLBnMDjGr_pE2iOSEFoICgg8,6092
-judgeval/common/tracer.py,sha256=
-judgeval/common/utils.py,sha256=
+judgeval/common/tracer.py,sha256=cq-8welgUQSKxHZ1z3rbnSnEir_j19z4bK9XYrVmvEo,36529
+judgeval/common/utils.py,sha256=T1lpObopcH868NIgOTzNViTB33OGadcVWxWcfh2pm3E,33439
 judgeval/data/__init__.py,sha256=YferxwmUqoBi18hrdgro0BD0h4pt20LAqISeUzGMcVU,474
 judgeval/data/api_example.py,sha256=vwWFbI6eJr5VgURCRbuSiMtEXLUbTCih_BcaqEBy-pg,4108
 judgeval/data/example.py,sha256=Rd-eDEM-giYfkfsGh_PBS2wwl15QlQPzbMV-J64Yj5E,2991
 judgeval/data/result.py,sha256=8FIO-bFKPegZuByKRjA2_sumjb8oGWQ5ZeQ1RVz5z2w,4393
 judgeval/data/scorer_data.py,sha256=pYljblCPZrlMIv5Eg7R-clnmsqzUBAwokKjZpwa0DXE,3280
 judgeval/data/datasets/__init__.py,sha256=eO6ayeM_bTGwIt0eDSlTBIIBvXvIWRWWSfYZrZROPiQ,265
-judgeval/data/datasets/dataset.py,sha256=
-judgeval/data/datasets/eval_dataset_client.py,sha256=
+judgeval/data/datasets/dataset.py,sha256=FRl2efBQZEpyK_ZTM7FMQQ7wjmtvcHCMFBq8L7O2Wus,12080
+judgeval/data/datasets/eval_dataset_client.py,sha256=azdv6rnyoX5O-d8g0AjNW620yS-7zm4tV1kmyQ63TlE,11531
 judgeval/data/datasets/ground_truth.py,sha256=OTBs3VZe-Wp0vEXEsq14GPZHYtpWT16bhGQTycIvkKc,2057
 judgeval/data/datasets/utils.py,sha256=lQxyl7mevct7JcDSyIrU_8QOzT-EYPWEvoUiAeOdeek,2502
 judgeval/judges/__init__.py,sha256=tyQ5KY88Kp1Ctfw2IJxnVEpy8DnFCtmy04JdPOpp-As,339
 judgeval/judges/base_judge.py,sha256=qhYSFxE21WajYNaT4X-qwWGtpo_tqzBzdqbszSheSD8,1000
 judgeval/judges/litellm_judge.py,sha256=EIL58Teptv8DzZUO3yP2RDQCDq-aoBB6HPZzPdK6KTg,2424
 judgeval/judges/mixture_of_judges.py,sha256=OuGWCuXyqe7s_Y74ij90TJFRfHU-VAFyJVVrwBM0RO0,15532
-judgeval/judges/together_judge.py,sha256=
-judgeval/judges/utils.py,sha256=
+judgeval/judges/together_judge.py,sha256=l00hhPerAZXg3oYBd8cyMtWsOTNt_0FIqoxhKJKQe3k,2302
+judgeval/judges/utils.py,sha256=9lvUxziGV86ISvVFxYBWc09TWFyAQgUTyPf_a9mD5Rs,2686
 judgeval/scorers/__init__.py,sha256=XcDdLn_s16rSQob0896oj4JXTA8-Xfl271TUEBj6Oew,998
-judgeval/scorers/api_scorer.py,sha256=
-judgeval/scorers/base_scorer.py,sha256=
+judgeval/scorers/api_scorer.py,sha256=PPpropMg_vFyUZULWqRPhtz_h2-NVydBMNnGtRpGk4E,2135
+judgeval/scorers/base_scorer.py,sha256=lz3QWPQQIbtsA-TWUjXYYRfQ96uCaAzqxt7Dn4TJa4s,1800
 judgeval/scorers/exceptions.py,sha256=eGW5CuJgZ5YJBFrE4FHDSF651PO1dKAZ379mJ8gOsfo,178
 judgeval/scorers/judgeval_scorer.py,sha256=T9fkJwFVYMzW88TFr-RWg-Fqmp-cdrA8bLFymqMzOa8,6291
 judgeval/scorers/prompt_scorer.py,sha256=UHkOUts1aIQCoYFcr-sKyucmvv_8ONFE5LZO01aObd0,17825
@@ -53,7 +54,7 @@ judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__ini
 judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py,sha256=PDThn6SzqxgMXT7BpQs2TEBOsgfD5fi6fnKk31qaCTo,10227
 judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py,sha256=5B_G7PPEsfLq6cwWkKWcLuy2k_5RgoOzsW3wOZLIeMk,6703
 judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py,sha256=r6yae5iaWtlBL_cP8I-1SuhS9dulsy1e7W9Rcz82v6E,169
-judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py,sha256=
+judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py,sha256=ipER9zyJLq0NqcmxYwfDhavp4rUDYIaDbghR1R0YpaU,10688
 judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py,sha256=GfbKv595s1a0dB1No_kDsap6gfcr6dYRGiXx0PDb89k,6557
 judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py,sha256=J6tc-T60AVOEaNVuoVU0XIG6dvQri99Q0tnX_Tm-0vc,108
 judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py,sha256=tRgRyjGpc4Pe3nQ1c-5NeNYFvbulL7YEnoRa9zLp1gc,9649
@@ -78,7 +79,8 @@ judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarizat
 judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py,sha256=JUB3TMqS1OHr6PqpIGqkyiBNbyfUaw7lZuUATjU3_ek,168
 judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py,sha256=CYGRJY5EuyICYzHrmFdLykwXakX8AC7G3Bhj7p6szfY,5493
 judgeval/tracer/__init__.py,sha256=wy3DYpH8U_z0GO_K_gOSkK0tTTD-u5eLDo0T5xIBoAc,147
-judgeval
-judgeval-0.0.
-judgeval-0.0.
-judgeval-0.0.
+judgeval/utils/alerts.py,sha256=RgW5R9Dn3Jtim0OyAYDbNzjoX2s6SA4Mw16GyyaikjI,1424
+judgeval-0.0.15.dist-info/METADATA,sha256=ZYYDsxPClqUtpV8_l-9lUSHPnx9eWCj06oR6jeen17g,1283
+judgeval-0.0.15.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+judgeval-0.0.15.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
+judgeval-0.0.15.dist-info/RECORD,,
{judgeval-0.0.14.dist-info → judgeval-0.0.15.dist-info}/WHEEL
File without changes
{judgeval-0.0.14.dist-info → judgeval-0.0.15.dist-info}/licenses/LICENSE.md
File without changes