judgeval 0.0.39__py3-none-any.whl → 0.0.41__py3-none-any.whl
This diff compares the contents of publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registry.
- judgeval/clients.py +6 -4
- judgeval/common/tracer.py +504 -257
- judgeval/common/utils.py +5 -1
- judgeval/constants.py +2 -0
- judgeval/data/__init__.py +2 -1
- judgeval/data/datasets/dataset.py +12 -6
- judgeval/data/datasets/eval_dataset_client.py +3 -1
- judgeval/data/example.py +7 -7
- judgeval/data/tool.py +29 -1
- judgeval/data/trace.py +31 -39
- judgeval/data/trace_run.py +2 -1
- judgeval/evaluation_run.py +4 -7
- judgeval/judgment_client.py +34 -7
- judgeval/run_evaluation.py +67 -19
- judgeval/scorers/__init__.py +4 -1
- judgeval/scorers/judgeval_scorer.py +12 -1
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +4 -0
- judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +124 -0
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +20 -0
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +1 -1
- judgeval/scorers/prompt_scorer.py +8 -164
- judgeval/scorers/score.py +15 -15
- judgeval-0.0.41.dist-info/METADATA +1450 -0
- {judgeval-0.0.39.dist-info → judgeval-0.0.41.dist-info}/RECORD +26 -24
- judgeval-0.0.39.dist-info/METADATA +0 -247
- {judgeval-0.0.39.dist-info → judgeval-0.0.41.dist-info}/WHEEL +0 -0
- {judgeval-0.0.39.dist-info → judgeval-0.0.41.dist-info}/licenses/LICENSE.md +0 -0
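The updated example_logging_context call sites shown below ship only in the newer releases, so it can be worth confirming which judgeval build an environment actually has before relying on them. A minimal, generic check using the standard library (not part of judgeval itself):

import importlib.metadata

print(importlib.metadata.version("judgeval"))  # expect "0.0.41" after upgrading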
judgeval/scorers/score.py
CHANGED
@@ -48,7 +48,7 @@ async def safe_a_score_example(
  info(f"Successfully scored example {example.example_id}")
  except MissingTestCaseParamsError as e:
  if skip_on_missing_params: # Skip the example if the scorer requires parameters that are missing
- with example_logging_context(example.
+ with example_logging_context(example.created_at, example.example_id):
  warning(f"Skipping example {example.example_id} due to missing parameters")
  scorer.skipped = True
  return
@@ -56,10 +56,10 @@ async def safe_a_score_example(
  if ignore_errors: # Gracefully handle the error, does not stop the evaluation
  scorer.error = str(e)
  scorer.success = False
- with example_logging_context(example.
+ with example_logging_context(example.created_at, example.example_id):
  warning(f"Ignoring errors for example {example.example_id}: {str(e)} due to missing parameters")
  else: # Raise the error and stop the evaluation
- with example_logging_context(example.
+ with example_logging_context(example.created_at, example.example_id):
  error(f"Stopping example {example.example_id}: {str(e)} due to missing parameters")
  raise
  except TypeError: # in case a_score_example does not accept _show_indicator
@@ -68,27 +68,27 @@ async def safe_a_score_example(
  except MissingTestCaseParamsError as e:
  if skip_on_missing_params:
  scorer.skipped = True
- with example_logging_context(example.
+ with example_logging_context(example.created_at, example.example_id):
  warning(f"Skipping example {example.example_id} due to missing parameters")
  return
  else:
  if ignore_errors:
  scorer.error = str(e)
  scorer.success = False
- with example_logging_context(example.
+ with example_logging_context(example.created_at, example.example_id):
  warning(f"Ignoring errors for example {example.example_id}: {str(e)} due to missing parameters")
  else:
- with example_logging_context(example.
+ with example_logging_context(example.created_at, example.example_id):
  error(f"Stopping example {example.example_id}: {str(e)} due to missing parameters")
  raise
  except Exception as e:
  if ignore_errors:
  scorer.error = str(e)
  scorer.success = False # Assuming you want to set success to False
- with example_logging_context(example.
+ with example_logging_context(example.created_at, example.example_id):
  warning(f"Ignoring errors for example {example.example_id}: {str(e)}")
  else:
- with example_logging_context(example.
+ with example_logging_context(example.created_at, example.example_id):
  error(f"Stopping example {example.example_id}: {str(e)}")
  raise

@@ -128,7 +128,7 @@ async def score_task(
  except MissingTestCaseParamsError as e:
  if skip_on_missing_params:
  scorer.skipped = True
- with example_logging_context(example.
+ with example_logging_context(example.created_at, example.example_id):
  debug(f"Skipping example {example.example_id} due to missing parameters")
  return
  else:
@@ -137,7 +137,7 @@ async def score_task(
  scorer.success = False # Override success
  finish_text = "Failed"
  else:
- with example_logging_context(example.
+ with example_logging_context(example.created_at, example.example_id):
  error(f"Stopping example {example.example_id}: {str(e)} due to missing parameters")
  raise
  except TypeError:
@@ -147,7 +147,7 @@ async def score_task(
  except MissingTestCaseParamsError as e:
  if skip_on_missing_params:
  scorer.skipped = True
- with example_logging_context(example.
+ with example_logging_context(example.created_at, example.example_id):
  debug(f"Skipping example {example.example_id} due to missing parameters")
  return
  else:
@@ -156,7 +156,7 @@ async def score_task(
  scorer.success = False # Override success
  finish_text = "Failed"
  else:
- with example_logging_context(example.
+ with example_logging_context(example.created_at, example.example_id):
  error(f"Stopping example {example.example_id}: {str(e)} due to missing parameters")
  raise
  except Exception as e:
@@ -164,10 +164,10 @@ async def score_task(
  scorer.error = str(e)
  scorer.success = False # Override success
  finish_text = "Failed"
- with example_logging_context(example.
+ with example_logging_context(example.created_at, example.example_id):
  warning(f"Ignoring errors for example {example.example_id}: {str(e)}")
  else:
- with example_logging_context(example.
+ with example_logging_context(example.created_at, example.example_id):
  error(f"Stopping example {example.example_id}: {str(e)}")
  raise

@@ -305,7 +305,7 @@ async def a_execute_scoring(
  bar_format="{desc}: |{bar}|{percentage:3.0f}% ({n_fmt}/{total_fmt}) [Time Taken: {elapsed}, {rate_fmt}{postfix}]",
  ) as pbar:
  for i, ex in enumerate(examples):
- with example_logging_context(ex.
+ with example_logging_context(ex.created_at, ex.example_id):
  debug(f"Starting scoring for example {ex.example_id}")
  debug(f"Input: {ex.input}")
  debug(f"Using {len(scorers)} scorers")