judgeval 0.0.44__py3-none-any.whl → 0.0.46__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +5 -4
- judgeval/clients.py +6 -6
- judgeval/common/__init__.py +7 -2
- judgeval/common/exceptions.py +2 -3
- judgeval/common/logger.py +74 -49
- judgeval/common/s3_storage.py +30 -23
- judgeval/common/tracer.py +1273 -939
- judgeval/common/utils.py +416 -244
- judgeval/constants.py +73 -61
- judgeval/data/__init__.py +1 -1
- judgeval/data/custom_example.py +3 -2
- judgeval/data/datasets/dataset.py +80 -54
- judgeval/data/datasets/eval_dataset_client.py +131 -181
- judgeval/data/example.py +67 -43
- judgeval/data/result.py +11 -9
- judgeval/data/scorer_data.py +4 -2
- judgeval/data/tool.py +25 -16
- judgeval/data/trace.py +57 -29
- judgeval/data/trace_run.py +5 -11
- judgeval/evaluation_run.py +22 -82
- judgeval/integrations/langgraph.py +546 -184
- judgeval/judges/base_judge.py +1 -2
- judgeval/judges/litellm_judge.py +33 -11
- judgeval/judges/mixture_of_judges.py +128 -78
- judgeval/judges/together_judge.py +22 -9
- judgeval/judges/utils.py +14 -5
- judgeval/judgment_client.py +259 -271
- judgeval/rules.py +169 -142
- judgeval/run_evaluation.py +462 -305
- judgeval/scorers/api_scorer.py +20 -11
- judgeval/scorers/exceptions.py +1 -0
- judgeval/scorers/judgeval_scorer.py +77 -58
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +46 -15
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +12 -11
- judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +7 -5
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +5 -2
- judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +2 -1
- judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +17 -8
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +8 -9
- judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +4 -4
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +5 -5
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +5 -2
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +9 -10
- judgeval/scorers/prompt_scorer.py +48 -37
- judgeval/scorers/score.py +86 -53
- judgeval/scorers/utils.py +11 -7
- judgeval/tracer/__init__.py +1 -1
- judgeval/utils/alerts.py +23 -12
- judgeval/utils/{data_utils.py → file_utils.py} +5 -9
- judgeval/utils/requests.py +29 -0
- judgeval/version_check.py +5 -2
- {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/METADATA +79 -135
- judgeval-0.0.46.dist-info/RECORD +69 -0
- judgeval-0.0.44.dist-info/RECORD +0 -68
- {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/WHEEL +0 -0
- {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/licenses/LICENSE.md +0 -0
judgeval/scorers/score.py
CHANGED
@@ -2,15 +2,14 @@
 Infrastructure for executing evaluations of `Example`s using one or more `JudgevalScorer`s.
 """
 
-
 import asyncio
-import time
+import time
 from tqdm.asyncio import tqdm_asyncio
 from typing import List, Union, Optional, Callable
 from rich.progress import Progress, SpinnerColumn, TextColumn
 
 from judgeval.data import (
-    Example,
+    Example,
     CustomExample,
     ScoringResult,
     generate_scoring_result,
@@ -22,6 +21,7 @@ from judgeval.common.exceptions import MissingTestCaseParamsError
 from judgeval.common.logger import example_logging_context, debug, error, warning, info
 from judgeval.judges import JudgevalJudge
 
+
 async def safe_a_score_example(
     scorer: JudgevalScorer,
     example: Example,
@@ -35,32 +35,42 @@ async def safe_a_score_example(
     Args:
         scorer (JudgevalScorer): The `JudgevalScorer` to use for scoring the example.
         example (Example): The `Example` to be scored.
-
-        ignore_errors (bool): Whether to ignore errors during the evaluation.
+
+        ignore_errors (bool): Whether to ignore errors during the evaluation.
             If set to false, any error will be raised and stop the evaluation.
             If set to true, the error will be stored in the `error` attribute of the `JudgevalScorer` and the `success` attribute will be set to False.
-
-        skip_on_missing_params (bool): Whether to skip the test case if required parameters are missing.
+
+        skip_on_missing_params (bool): Whether to skip the test case if required parameters are missing.
     """
     debug(f"Starting safe_a_score_example for example {example.example_id}")
     try:
         await scorer.a_score_example(example, _show_indicator=False)
         info(f"Successfully scored example {example.example_id}")
     except MissingTestCaseParamsError as e:
-        if skip_on_missing_params: # Skip the example if the scorer requires parameters that are missing
+        if (
+            skip_on_missing_params
+        ): # Skip the example if the scorer requires parameters that are missing
             with example_logging_context(example.created_at, example.example_id):
-                warning(f"Skipping example {example.example_id} due to missing parameters")
+                warning(
+                    f"Skipping example {example.example_id} due to missing parameters"
+                )
             scorer.skipped = True
             return
         else:
-            if ignore_errors: # Gracefully handle the error, does not stop the evaluation
+            if (
+                ignore_errors
+            ): # Gracefully handle the error, does not stop the evaluation
                 scorer.error = str(e)
                 scorer.success = False
                 with example_logging_context(example.created_at, example.example_id):
-                    warning(f"Ignoring errors for example {example.example_id}: {str(e)} due to missing parameters")
+                    warning(
+                        f"Ignoring errors for example {example.example_id}: {str(e)} due to missing parameters"
+                    )
             else: # Raise the error and stop the evaluation
                 with example_logging_context(example.created_at, example.example_id):
-                    error(f"Stopping example {example.example_id}: {str(e)} due to missing parameters")
+                    error(
+                        f"Stopping example {example.example_id}: {str(e)} due to missing parameters"
+                    )
                 raise
     except TypeError: # in case a_score_example does not accept _show_indicator
         try:
@@ -69,17 +79,27 @@ async def safe_a_score_example(
             if skip_on_missing_params:
                 scorer.skipped = True
                 with example_logging_context(example.created_at, example.example_id):
-                    warning(f"Skipping example {example.example_id} due to missing parameters")
+                    warning(
+                        f"Skipping example {example.example_id} due to missing parameters"
+                    )
                 return
             else:
                 if ignore_errors:
                     scorer.error = str(e)
-                    scorer.success = False
-                    with example_logging_context(example.created_at, example.example_id):
-                        warning(f"Ignoring errors for example {example.example_id}: {str(e)} due to missing parameters")
+                    scorer.success = False
+                    with example_logging_context(
+                        example.created_at, example.example_id
+                    ):
+                        warning(
+                            f"Ignoring errors for example {example.example_id}: {str(e)} due to missing parameters"
+                        )
                 else:
-                    with example_logging_context(example.created_at, example.example_id):
-                        error(f"Stopping example {example.example_id}: {str(e)} due to missing parameters")
+                    with example_logging_context(
+                        example.created_at, example.example_id
+                    ):
+                        error(
+                            f"Stopping example {example.example_id}: {str(e)} due to missing parameters"
+                        )
                     raise
     except Exception as e:
         if ignore_errors:
@@ -121,7 +141,7 @@ async def score_task(
     """
     while not progress.finished:
         start_time = time.perf_counter()
-
+
         try:
             await scorer.a_score_example(example, _show_indicator=False)
             finish_text = "Completed"
@@ -129,7 +149,9 @@ async def score_task(
            if skip_on_missing_params:
                scorer.skipped = True
                with example_logging_context(example.created_at, example.example_id):
-                    debug(f"Skipping example {example.example_id} due to missing parameters")
+                    debug(
+                        f"Skipping example {example.example_id} due to missing parameters"
+                    )
                return
            else:
                if ignore_errors:
@@ -137,8 +159,12 @@ async def score_task(
                    scorer.success = False # Override success
                    finish_text = "Failed"
                else:
-                    with example_logging_context(example.created_at, example.example_id):
-                        error(f"Stopping example {example.example_id}: {str(e)} due to missing parameters")
+                    with example_logging_context(
+                        example.created_at, example.example_id
+                    ):
+                        error(
+                            f"Stopping example {example.example_id}: {str(e)} due to missing parameters"
+                        )
                    raise
        except TypeError:
            try:
@@ -147,8 +173,12 @@ async def score_task(
            except MissingTestCaseParamsError as e:
                if skip_on_missing_params:
                    scorer.skipped = True
-                    with example_logging_context(example.created_at, example.example_id):
-                        debug(f"Skipping example {example.example_id} due to missing parameters")
+                    with example_logging_context(
+                        example.created_at, example.example_id
+                    ):
+                        debug(
+                            f"Skipping example {example.example_id} due to missing parameters"
+                        )
                    return
                else:
                    if ignore_errors:
@@ -156,8 +186,12 @@ async def score_task(
                        scorer.success = False # Override success
                        finish_text = "Failed"
                    else:
-                        with example_logging_context(example.created_at, example.example_id):
-                            error(f"Stopping example {example.example_id}: {str(e)} due to missing parameters")
+                        with example_logging_context(
+                            example.created_at, example.example_id
+                        ):
+                            error(
+                                f"Stopping example {example.example_id}: {str(e)} due to missing parameters"
+                            )
                        raise
        except Exception as e:
            if ignore_errors:
@@ -165,7 +199,9 @@ async def score_task(
                scorer.success = False # Override success
                finish_text = "Failed"
                with example_logging_context(example.created_at, example.example_id):
-                    warning(f"Ignoring errors for example {example.example_id}: {str(e)}")
+                    warning(
+                        f"Ignoring errors for example {example.example_id}: {str(e)}"
+                    )
            else:
                with example_logging_context(example.created_at, example.example_id):
                    error(f"Stopping example {example.example_id}: {str(e)}")
@@ -213,9 +249,7 @@ async def score_with_indicator(
    tasks = []
    for scorer in scorers:
        task_id = progress.add_task(
-            description=scorer_console_msg(
-                scorer, async_mode=True
-            ),
+            description=scorer_console_msg(scorer, async_mode=True),
            total=100,
        ) # Add task to progress bar
        tasks.append(
@@ -231,9 +265,7 @@ async def score_with_indicator(
        await asyncio.gather(*tasks)
    else:
        tasks = [
-            safe_a_score_example(
-                scorer, example, ignore_errors, skip_on_missing_params
-            )
+            safe_a_score_example(scorer, example, ignore_errors, skip_on_missing_params)
            for scorer in scorers
        ]
 
@@ -280,7 +312,7 @@ async def a_execute_scoring(
                return await func(*args, **kwargs)
            except Exception as e:
                print(f"Error executing function: {e}")
-                if kwargs.get('ignore_errors', False):
+                if kwargs.get("ignore_errors", False):
                    # Simply return None when ignoring errors, as expected by the test
                    return None
                # If we're not ignoring errors, propagate the exception
@@ -290,12 +322,13 @@ async def a_execute_scoring(
    for scorer in scorers:
        scorer.verbose_mode = verbose_mode
 
-    # Add model to scorers
+    # Add model to scorers
    for scorer in scorers:
        scorer._add_model(model)
 
    scoring_results: List[ScoringResult] = [None for _ in examples]
    tasks = []
+    cloned_scorers: List[JudgevalScorer]
 
    if show_indicator and _use_bar_indicator:
        with tqdm_asyncio(
@@ -311,18 +344,16 @@ async def a_execute_scoring(
                debug(f"Using {len(scorers)} scorers")
                for scorer in scorers:
                    debug(f"Using scorer: {type(scorer).__name__}")
-                    if hasattr(scorer, 'threshold'):
+                    if hasattr(scorer, "threshold"):
                        debug(f"Scorer threshold: {scorer.threshold}")
-                    if hasattr(scorer, 'model'):
+                    if hasattr(scorer, "model"):
                        debug(f"Scorer model: {type(scorer.model).__name__}")
                if isinstance(ex, Example) or isinstance(ex, CustomExample):
                    if len(scorers) == 0:
                        pbar.update(1)
                        continue
-
-                    cloned_scorers = clone_scorers(
-                        scorers
-                    )
+
+                    cloned_scorers = clone_scorers(scorers)
                    task = execute_with_semaphore(
                        func=a_eval_examples_helper,
                        scorers=cloned_scorers,
@@ -345,9 +376,7 @@ async def a_execute_scoring(
            if len(scorers) == 0:
                continue
 
-            cloned_scorers = clone_scorers(
-                scorers
-            )
+            cloned_scorers = clone_scorers(scorers)
            task = execute_with_semaphore(
                func=a_eval_examples_helper,
                scorers=cloned_scorers,
@@ -376,10 +405,10 @@ async def a_eval_examples_helper(
    show_indicator: bool,
    _use_bar_indicator: bool,
    pbar: Optional[tqdm_asyncio] = None,
-) -> None:
+) -> None:
    """
    Evaluate a single example asynchronously using a list of scorers.
-
+
    Args:
        scorers (List[JudgevalScorer]): List of JudgevalScorer objects to evaluate the example.
        example (Example): The example to be evaluated.
@@ -410,23 +439,27 @@ async def a_eval_examples_helper(
        show_indicator=show_metrics_indicator,
    ) # execute the scoring functions of each scorer on the example
 
-    # Now that all the scoring functions of each scorer have executed, we collect
+    # Now that all the scoring functions of each scorer have executed, we collect
    # the results and update the ScoringResult with the scorer data
    success = True
    scorer_data_list = []
    for scorer in scorers:
        # At this point, the scorer has been executed and already contains data.
-        if getattr(scorer, 'skipped', False):
+        if getattr(scorer, "skipped", False):
            continue
-        scorer_data = create_scorer_data(scorer) # Fetch scorer data from completed scorer evaluation
+        scorer_data = create_scorer_data(
+            scorer
+        ) # Fetch scorer data from completed scorer evaluation
        success = success and scorer_data.success
        scorer_data_list.append(scorer_data)
-
+
    scoring_end_time = time.perf_counter()
    run_duration = scoring_end_time - scoring_start_time
-
-    scoring_result = generate_scoring_result(example, scorer_data_list, run_duration, success)
+
+    scoring_result = generate_scoring_result(
+        example, scorer_data_list, run_duration, success
+    )
    scoring_results[score_index] = scoring_result
-
+
    if pbar is not None:
        pbar.update(1)
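Most of the score.py hunks above are formatter rewraps; the behavior they touch is easier to see outside diff form. The sketch below illustrates the general pattern that a_execute_scoring and safe_a_score_example follow: bound concurrency with an asyncio.Semaphore, give each example its own copies of the scorers, and either record or re-raise per-scorer errors depending on an ignore_errors flag. It is a minimal illustration only; ToyScorer, run_scorer, and score_all are hypothetical names, not judgeval APIs.

import asyncio
import copy
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class ToyScorer:  # hypothetical stand-in for a JudgevalScorer
    name: str
    error: Optional[str] = None
    success: bool = True


async def run_scorer(scorer: ToyScorer, example: str, ignore_errors: bool) -> None:
    try:
        await asyncio.sleep(0.01)  # pretend to score the example
        if example == "bad":
            raise ValueError("missing params")
    except Exception as exc:
        if ignore_errors:
            scorer.error = str(exc)  # record the failure and keep going
            scorer.success = False
        else:
            raise  # stop the whole evaluation run


async def score_all(
    examples: List[str],
    scorers: List[ToyScorer],
    max_concurrent: int = 10,
    ignore_errors: bool = True,
):
    semaphore = asyncio.Semaphore(max_concurrent)

    async def bounded(example: str, cloned: List[ToyScorer]):
        async with semaphore:  # cap how many examples are scored at once
            await asyncio.gather(
                *(run_scorer(s, example, ignore_errors) for s in cloned)
            )
            return example, cloned

    # Each example gets its own scorer copies so per-example state never collides.
    tasks = [bounded(ex, copy.deepcopy(scorers)) for ex in examples]
    return await asyncio.gather(*tasks)


if __name__ == "__main__":
    results = asyncio.run(score_all(["good", "bad"], [ToyScorer("faithfulness")]))
    for example, cloned in results:
        print(example, [(s.name, s.success, s.error) for s in cloned])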
judgeval/scorers/utils.py
CHANGED
@@ -83,7 +83,9 @@ def scorer_progress_meter(
        yield
 
 
-def parse_response_json(llm_response: str, scorer: Optional[JudgevalScorer] = None) -> dict:
+def parse_response_json(
+    llm_response: str, scorer: Optional[JudgevalScorer] = None
+) -> dict:
    """
    Extracts JSON output from an LLM response and returns it as a dictionary.
 
@@ -100,8 +102,12 @@ def parse_response_json(llm_response: str, scorer: Optional[JudgevalScorer] = None) -> dict:
        llm_response = llm_response + "}"
        end = len(llm_response)
 
-    json_str = llm_response[start:end] if start != -1 and end != 0 else "" # extract the JSON string
-    json_str = re.sub(r",\s*([\]}])", r"\1", json_str) # Remove trailing comma if present
+    json_str = (
+        llm_response[start:end] if start != -1 and end != 0 else ""
+    ) # extract the JSON string
+    json_str = re.sub(
+        r",\s*([\]}])", r"\1", json_str
+    ) # Remove trailing comma if present
 
    try:
        return json.loads(json_str)
@@ -131,7 +137,7 @@ def create_verbose_logs(metric: JudgevalScorer, steps: List[str]) -> str:
    Args:
        metric (JudgevalScorer): The scorer object.
        steps (List[str]): The steps to be included in the verbose logs.
-
+
    Returns:
        str: The verbose logs (Concatenated steps).
    """
@@ -157,7 +163,7 @@ def get_or_create_event_loop() -> asyncio.AbstractEventLoop:
 
    Returns:
        asyncio.AbstractEventLoop: The current or newly created event loop.
-
+
    Raises:
        RuntimeError: If the event loop is closed.
    """
@@ -205,5 +211,3 @@ def check_example_params(
        error_str = f"{missing_params_str} fields in example cannot be None for the '{scorer.__name__}' scorer"
        scorer.error = error_str
        raise MissingExampleParamsError(error_str)
-
-
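The parse_response_json hunk above only rewraps two statements, but the recovery step they implement is easy to miss in diff form: slice the text between the outermost braces, strip any trailing comma before a closing bracket, then call json.loads. The standalone sketch below reproduces that step under one assumption: the hunk does not show how start and end are computed in the helper, so the find/rfind calls here are a guess.

import json
import re


def extract_json(llm_response: str) -> dict:
    """Best-effort JSON recovery, mirroring the steps visible in the diff above."""
    start = llm_response.find("{")
    end = llm_response.rfind("}") + 1
    json_str = llm_response[start:end] if start != -1 and end != 0 else ""
    # Remove a trailing comma before ] or } (same regex as in parse_response_json).
    json_str = re.sub(r",\s*([\]}])", r"\1", json_str)
    return json.loads(json_str)


print(extract_json('Here you go: {"score": 0.9, "reasons": ["ok",],}'))
# {'score': 0.9, 'reasons': ['ok']}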
judgeval/tracer/__init__.py
CHANGED
judgeval/utils/alerts.py
CHANGED
@@ -1,19 +1,23 @@
 """
 Handling alerts in Judgeval.
 """
+
 from enum import Enum
 from typing import Dict, Any, List, Optional
 from pydantic import BaseModel
 
+
 class AlertStatus(str, Enum):
     """Status of an alert evaluation."""
+
     TRIGGERED = "triggered"
     NOT_TRIGGERED = "not_triggered"
 
+
 class AlertResult(BaseModel):
     """
     Result of a rule evaluation.
-
+
     Attributes:
         rule_name: Name of the rule that was evaluated
         rule_id: Unique identifier of the rule
@@ -25,26 +29,29 @@ class AlertResult(BaseModel):
        project_id: Optional project identifier
        trace_span_id: Optional trace span identifier
    """
+
    rule_name: str
    rule_id: Optional[str] = None # The unique identifier of the rule
    status: AlertStatus
    conditions_result: List[Dict[str, Any]] = []
    metadata: Dict[str, Any] = {}
-    notification: Optional[Any] = None # NotificationConfig when triggered, None otherwise
+    notification: Optional[Any] = (
+        None # NotificationConfig when triggered, None otherwise
+    )
    combine_type: Optional[str] = None # "all" or "any"
    project_id: Optional[str] = None # Project identifier
    trace_span_id: Optional[str] = None # Trace span identifier
-
+
    @property
    def example_id(self) -> Optional[str]:
        """Get example_id from metadata for backward compatibility"""
        return self.metadata.get("example_id")
-
+
    @property
    def timestamp(self) -> Optional[str]:
        """Get timestamp from metadata for backward compatibility"""
        return self.metadata.get("timestamp")
-
+
    @property
    def conditions_results(self) -> List[Dict[str, Any]]:
        """Backwards compatibility property for the conditions_result field"""
@@ -53,15 +60,19 @@ class AlertResult(BaseModel):
    def model_dump(self, **kwargs):
        """
        Convert the AlertResult to a dictionary for JSON serialization.
-
+
        Args:
            **kwargs: Additional arguments to pass to Pydantic's model_dump
-
+
        Returns:
            dict: Dictionary representation of the AlertResult
        """
-        data = super().model_dump(**kwargs) if hasattr(super(), "model_dump") else super().dict(**kwargs)
-
+        data = (
+            super().model_dump(**kwargs)
+            if hasattr(super(), "model_dump")
+            else super().dict(**kwargs)
+        )
+
        # Handle the NotificationConfig object if it exists
        if hasattr(self, "notification") and self.notification is not None:
            if hasattr(self.notification, "model_dump"):
@@ -76,7 +87,7 @@ class AlertResult(BaseModel):
                    "communication_methods": notif.communication_methods,
                    "email_addresses": notif.email_addresses,
                    "slack_channels": getattr(notif, "slack_channels", []),
-                    "send_at": notif.send_at
+                    "send_at": notif.send_at,
                }
-
-        return data
+
+        return data
judgeval/utils/{data_utils.py → file_utils.py}
RENAMED
@@ -1,15 +1,11 @@
 import yaml
-from judgeval.common.logger import (
-    debug,
-    info,
-    error,
-    example_logging_context
-)
+from typing import List
+from judgeval.common.logger import debug, info, error
 
 from judgeval.data import Example
 
 
-def add_from_yaml(file_path: str) -> None:
+def get_examples_from_yaml(file_path: str) -> List[Example] | None:
    debug(f"Loading dataset from YAML file: {file_path}")
    """
    Adds examples from a YAML file.
@@ -51,7 +47,7 @@ def add_from_yaml(file_path: str) -> None:
    except yaml.YAMLError:
        error(f"Invalid YAML file: {file_path}")
        raise ValueError(f"The file {file_path} is not a valid YAML file.")
-
+
    info(f"Added {len(examples)} examples from YAML")
    new_examples = [Example(**e) for e in examples]
-    return new_examples
+    return new_examples
judgeval/utils/requests.py
ADDED
@@ -0,0 +1,29 @@
+import requests as requests_original
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
+from http import HTTPStatus
+
+
+class RetrySession(requests_original.Session):
+    def __init__(
+        self,
+        retries=3,
+        backoff_factor=0.5,
+        status_forcelist=[HTTPStatus.BAD_GATEWAY, HTTPStatus.SERVICE_UNAVAILABLE],
+    ):
+        super().__init__()
+
+        retry_strategy = Retry(
+            total=retries,
+            read=retries,
+            connect=retries,
+            backoff_factor=backoff_factor,
+            status_forcelist=status_forcelist,
+        )
+
+        adapter = HTTPAdapter(max_retries=retry_strategy)
+        self.mount("http://", adapter)
+        self.mount("https://", adapter)
+
+
+requests = RetrySession()
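The hunk above adds judgeval/utils/requests.py: a requests.Session subclass that mounts an HTTPAdapter configured with urllib3 Retry, plus a module-level requests instance that other modules import in place of the stock library (as version_check.py does below). A brief usage sketch, with a placeholder URL and retry settings that simply restate the defaults visible in the diff:

from judgeval.utils.requests import requests  # module-level RetrySession instance

# Drop-in for `import requests`: same .get/.post API, but connection/read errors
# and 502/503 responses are retried up to 3 times with a 0.5 backoff factor.
response = requests.get("https://api.example.com/health", timeout=2)
response.raise_for_status()
print(response.status_code)

Because RetrySession subclasses requests_original.Session, existing call sites keep the same API and only need the one-line import swap shown in the version_check.py hunk that follows.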
judgeval/version_check.py
CHANGED
@@ -1,12 +1,15 @@
 import importlib.metadata
-import requests
+from judgeval.utils.requests import requests
 import threading
 
+
 def check_latest_version(package_name: str = "judgeval"):
     def _check():
         try:
             current_version = importlib.metadata.version(package_name)
-            response = requests.get(f"https://pypi.org/pypi/{package_name}/json", timeout=2)
+            response = requests.get(
+                f"https://pypi.org/pypi/{package_name}/json", timeout=2
+            )
             latest_version = response.json()["info"]["version"]
 
             if current_version != latest_version:
|