eval-framework 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- eval_framework/__init__.py +7 -0
- eval_framework/base_config.py +36 -0
- eval_framework/context/__init__.py +0 -0
- eval_framework/context/determined.py +170 -0
- eval_framework/context/eval.py +114 -0
- eval_framework/context/local.py +52 -0
- eval_framework/evaluation_generator.py +231 -0
- eval_framework/exceptions.py +2 -0
- eval_framework/external/ifeval_impl/README.md +5 -0
- eval_framework/external/ifeval_impl/instructions.py +1523 -0
- eval_framework/external/ifeval_impl/instructions_registry.py +161 -0
- eval_framework/external/ifeval_impl/instructions_util.py +1689 -0
- eval_framework/external/ifeval_impl/utils.py +135 -0
- eval_framework/llm/__init__.py +0 -0
- eval_framework/llm/aleph_alpha.py +323 -0
- eval_framework/llm/base.py +58 -0
- eval_framework/llm/huggingface.py +332 -0
- eval_framework/llm/mistral.py +73 -0
- eval_framework/llm/models.py +16 -0
- eval_framework/llm/openai.py +205 -0
- eval_framework/llm/vllm.py +438 -0
- eval_framework/logger.py +3 -0
- eval_framework/main.py +187 -0
- eval_framework/metrics/__init__.py +0 -0
- eval_framework/metrics/base.py +40 -0
- eval_framework/metrics/completion/__init__.py +1 -0
- eval_framework/metrics/completion/accuracy_completion.py +16 -0
- eval_framework/metrics/completion/bleu.py +76 -0
- eval_framework/metrics/completion/chrf.py +62 -0
- eval_framework/metrics/completion/code_assertion.py +44 -0
- eval_framework/metrics/completion/code_execution_pass_at_one.py +126 -0
- eval_framework/metrics/completion/comet.py +56 -0
- eval_framework/metrics/completion/concordance_index.py +38 -0
- eval_framework/metrics/completion/csv_format.py +102 -0
- eval_framework/metrics/completion/cwe_accuracy.py +49 -0
- eval_framework/metrics/completion/exponential_similarity.py +65 -0
- eval_framework/metrics/completion/f1.py +42 -0
- eval_framework/metrics/completion/format_checker.py +56 -0
- eval_framework/metrics/completion/grid_difference.py +77 -0
- eval_framework/metrics/completion/ifeval.py +73 -0
- eval_framework/metrics/completion/json_format.py +171 -0
- eval_framework/metrics/completion/language_checker.py +74 -0
- eval_framework/metrics/completion/length_control.py +83 -0
- eval_framework/metrics/completion/math_reasoning_completion.py +303 -0
- eval_framework/metrics/completion/niah_accuracy.py +163 -0
- eval_framework/metrics/completion/placeholder_checker.py +27 -0
- eval_framework/metrics/completion/repetition.py +88 -0
- eval_framework/metrics/completion/rouge_1.py +35 -0
- eval_framework/metrics/completion/rouge_2.py +45 -0
- eval_framework/metrics/completion/rouge_geometric_mean.py +36 -0
- eval_framework/metrics/completion/rouge_l.py +52 -0
- eval_framework/metrics/completion/struct_eval_metrics.py +248 -0
- eval_framework/metrics/completion/ter.py +67 -0
- eval_framework/metrics/completion/text_counter.py +182 -0
- eval_framework/metrics/efficiency/__init__.py +0 -0
- eval_framework/metrics/efficiency/bytes_per_sequence_position.py +48 -0
- eval_framework/metrics/llm/__init__.py +0 -0
- eval_framework/metrics/llm/base.py +8 -0
- eval_framework/metrics/llm/graders/chatbot_style_grader.py +92 -0
- eval_framework/metrics/llm/graders/comparison_grader.py +146 -0
- eval_framework/metrics/llm/graders/conciseness_grader.py +93 -0
- eval_framework/metrics/llm/graders/contains_names_grader.py +71 -0
- eval_framework/metrics/llm/graders/format_correctness_grader.py +109 -0
- eval_framework/metrics/llm/graders/instruction_grader.py +177 -0
- eval_framework/metrics/llm/graders/language.py +56 -0
- eval_framework/metrics/llm/graders/long_context_grader.py +72 -0
- eval_framework/metrics/llm/graders/models.py +74 -0
- eval_framework/metrics/llm/graders/refusal_grader.py +57 -0
- eval_framework/metrics/llm/graders/sql_quality_grader.py +145 -0
- eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +103 -0
- eval_framework/metrics/llm/llm_judge_chatbot_style.py +36 -0
- eval_framework/metrics/llm/llm_judge_completion_accuracy.py +39 -0
- eval_framework/metrics/llm/llm_judge_conciseness.py +37 -0
- eval_framework/metrics/llm/llm_judge_contains_names.py +36 -0
- eval_framework/metrics/llm/llm_judge_format_correctness.py +43 -0
- eval_framework/metrics/llm/llm_judge_instruction.py +58 -0
- eval_framework/metrics/llm/llm_judge_mtbench_pair.py +205 -0
- eval_framework/metrics/llm/llm_judge_mtbench_single.py +188 -0
- eval_framework/metrics/llm/llm_judge_refusal.py +35 -0
- eval_framework/metrics/llm/llm_judge_sql.py +394 -0
- eval_framework/metrics/llm/llm_judge_world_knowledge.py +37 -0
- eval_framework/metrics/loglikelihood/__init__.py +0 -0
- eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +51 -0
- eval_framework/metrics/loglikelihood/probability_mass.py +56 -0
- eval_framework/py.typed +0 -0
- eval_framework/response_generator.py +416 -0
- eval_framework/result_processors/__init__.py +0 -0
- eval_framework/result_processors/base.py +74 -0
- eval_framework/result_processors/hf_processor.py +87 -0
- eval_framework/result_processors/result_processor.py +129 -0
- eval_framework/run.py +314 -0
- eval_framework/run_direct.py +42 -0
- eval_framework/shared/types.py +227 -0
- eval_framework/tasks/__init__.py +6 -0
- eval_framework/tasks/base.py +314 -0
- eval_framework/tasks/benchmarks/__init__.py +0 -0
- eval_framework/tasks/benchmarks/arc.py +46 -0
- eval_framework/tasks/benchmarks/arc_de.py +46 -0
- eval_framework/tasks/benchmarks/arc_fi.py +46 -0
- eval_framework/tasks/benchmarks/belebele.py +60 -0
- eval_framework/tasks/benchmarks/bigcodebench.py +155 -0
- eval_framework/tasks/benchmarks/casehold.py +47 -0
- eval_framework/tasks/benchmarks/chembench.py +85 -0
- eval_framework/tasks/benchmarks/copa.py +39 -0
- eval_framework/tasks/benchmarks/duc.py +91 -0
- eval_framework/tasks/benchmarks/flores200.py +62 -0
- eval_framework/tasks/benchmarks/flores_plus.py +84 -0
- eval_framework/tasks/benchmarks/gpqa.py +177 -0
- eval_framework/tasks/benchmarks/gsm8k.py +148 -0
- eval_framework/tasks/benchmarks/hellaswag.py +44 -0
- eval_framework/tasks/benchmarks/hellaswag_de.py +52 -0
- eval_framework/tasks/benchmarks/humaneval.py +97 -0
- eval_framework/tasks/benchmarks/ifeval.py +78 -0
- eval_framework/tasks/benchmarks/include.py +119 -0
- eval_framework/tasks/benchmarks/infinitebench.py +302 -0
- eval_framework/tasks/benchmarks/math_reasoning.py +569 -0
- eval_framework/tasks/benchmarks/mbpp.py +192 -0
- eval_framework/tasks/benchmarks/mmlu.py +190 -0
- eval_framework/tasks/benchmarks/mmlu_de.py +109 -0
- eval_framework/tasks/benchmarks/mmlu_pro.py +139 -0
- eval_framework/tasks/benchmarks/mmmlu.py +529 -0
- eval_framework/tasks/benchmarks/openbookqa.py +37 -0
- eval_framework/tasks/benchmarks/opengptx_eu20.py +363 -0
- eval_framework/tasks/benchmarks/pawsx.py +65 -0
- eval_framework/tasks/benchmarks/piqa.py +39 -0
- eval_framework/tasks/benchmarks/quality.py +56 -0
- eval_framework/tasks/benchmarks/sciq.py +44 -0
- eval_framework/tasks/benchmarks/sphyr.py +75 -0
- eval_framework/tasks/benchmarks/squad.py +89 -0
- eval_framework/tasks/benchmarks/struct_eval.py +110 -0
- eval_framework/tasks/benchmarks/tablebench.py +117 -0
- eval_framework/tasks/benchmarks/triviaqa.py +42 -0
- eval_framework/tasks/benchmarks/truthfulqa.py +95 -0
- eval_framework/tasks/benchmarks/winogender.py +39 -0
- eval_framework/tasks/benchmarks/winogrande.py +44 -0
- eval_framework/tasks/benchmarks/winox.py +57 -0
- eval_framework/tasks/benchmarks/wmt.py +160 -0
- eval_framework/tasks/benchmarks/zero_scrolls.py +197 -0
- eval_framework/tasks/eval_config.py +112 -0
- eval_framework/tasks/perturbation.py +83 -0
- eval_framework/tasks/registry.py +186 -0
- eval_framework/tasks/task_loader.py +80 -0
- eval_framework/tasks/task_names.py +138 -0
- eval_framework/tasks/utils.py +578 -0
- eval_framework/utils/constants.py +9 -0
- eval_framework/utils/generate_task_docs.py +229 -0
- eval_framework/utils/helpers.py +3 -0
- eval_framework/utils/logging.py +50 -0
- eval_framework/utils/packaging.py +52 -0
- eval_framework-0.2.0.dist-info/METADATA +514 -0
- eval_framework-0.2.0.dist-info/RECORD +161 -0
- eval_framework-0.2.0.dist-info/WHEEL +4 -0
- eval_framework-0.2.0.dist-info/entry_points.txt +3 -0
- template_formatting/README.md +83 -0
- template_formatting/__init__.py +0 -0
- template_formatting/formatter.py +536 -0
- template_formatting/mistral_formatter.py +159 -0
- template_formatting/py.typed +0 -0
- template_formatting/tests/test_formatter_eval.py +408 -0
- template_formatting/tests/test_formatter_scaling.py +253 -0
- template_formatting/tests/test_mistral_formatter.py +136 -0
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import re
|
|
3
|
+
|
|
4
|
+
from eval_framework.metrics.base import BaseMetric, MetricResult
|
|
5
|
+
from eval_framework.shared.types import Completion
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class CheckJsonFormat(BaseMetric[Completion]):
    """Metric that reports whether a completion parses as JSON.

    The completion is stripped of surrounding whitespace and of a Markdown
    code fence before it is handed to ``json.loads``. The metric value is
    1.0 when parsing succeeds and 0.0 when it raises ``ValueError``.
    """

    NAME = "JSON Format"

    def _preprocess(self, completion: str) -> str:
        """Return ``completion`` without outer whitespace and code-fence markers."""
        text = completion.strip()
        # Tagged fences are tried before the bare fence so that the language
        # tag is removed together with the backticks.
        for fence in ("```json", "```Json", "```JSON", "```"):
            text = text.removeprefix(fence)
        return text.removesuffix("```").strip()

    def calculate(self, response: Completion) -> list[MetricResult]:
        """Return a single result: 1.0 for parseable JSON, 0.0 otherwise.

        When the response carries an error, the result has ``value=None``
        and forwards that error instead.
        """
        if response.error is not None:
            return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=response.error)]

        candidate = self._preprocess(response.completion)
        try:
            json.loads(candidate)
        except ValueError:
            parses = False
        else:
            parses = True

        outcome = MetricResult(
            metric_name=self.NAME,
            value=float(parses),
            higher_is_better=True,
            error=response.error,
        )
        return [outcome]
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class CheckPostScriptFormat(BaseMetric[Completion]):
    """Metric that checks whether the completion contains a postscript marker.

    The value is 1.0 when the literal text ``P.S.`` or ``P.P.S.`` occurs
    anywhere in the completion and 0.0 otherwise. Matching is case-sensitive
    and does not allow spaces inside the marker. Only the presence of the
    marker is tested; the postscript itself is not parsed.

    Per the original author's note, this is a weak check modelled on the
    IFEval postscript instruction, which also only looks for the marker.
    """

    NAME = "Postscript Format"

    def calculate(self, response: Completion) -> list[MetricResult]:
        """Return a single result: 1.0 if a postscript marker is present, else 0.0.

        When the response carries an error, the result has ``value=None``
        and forwards that error instead.
        """
        if response.error is not None:
            return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=response.error)]

        marker_found = re.search(r"\s*(P\.S\.|P\.P\.S\.)", response.completion, flags=re.MULTILINE) is not None
        score = 1.0 if marker_found else 0.0
        return [MetricResult(metric_name=self.NAME, value=score, higher_is_better=True, error=response.error)]
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
import re
|
|
2
|
+
|
|
3
|
+
from eval_framework.metrics.base import BaseMetric, MetricResult
|
|
4
|
+
from eval_framework.shared.types import Completion
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class GridDifference(BaseMetric[Completion]):
    """Score how closely a completed grid matches the ground-truth grid.

    Grids are split on whitespace and compared token by token. Three results
    are produced per response:

    * ``grid_difference_exact_match`` - 1.0 when the completion has no
      differing token, else 0.0.
    * ``grid_difference_score`` - ``1 - output_differences / input_differences``;
      1.0 means every token that differed in the input grid is now correct,
      0.0 means no net improvement, negative means the completion is further
      from the ground truth than the input grid was.
    * ``grid_difference_normalized_score`` - the score clamped to be >= 0.
    """

    NAME = "grid_difference"

    def count_differences(self, character_list_1: list[str], character_list_2: list[str]) -> int:
        """Return the number of positions at which the two token lists differ.

        Positions that exist in only one of the lists count as differences.
        ``zip`` stops at the shorter list, so without the length term an empty
        or truncated completion would compare as identical to the ground truth.
        """
        mismatches = sum(1 for left, right in zip(character_list_1, character_list_2) if left != right)
        return mismatches + abs(len(character_list_1) - len(character_list_2))

    def calculate_score(
        self, output_ground_truth_difference_count: int, input_ground_truth_difference_count: int
    ) -> float:
        """Return ``1 - output_differences / input_differences``.

        A completion without differences always scores 1.0. When the input
        grid already equals the ground truth (zero input differences) but the
        completion does not, the denominator is treated as 1 so that the
        result is a finite negative score instead of a ZeroDivisionError.
        """
        if output_ground_truth_difference_count == 0:
            return 1.0
        denominator = max(input_ground_truth_difference_count, 1)
        return 1.0 - float(output_ground_truth_difference_count) / float(denominator)

    def extract_grid_from_prompt(self, prompt: str) -> str:
        """Return the stripped text between the two grid markers of ``prompt``.

        Returns an empty string when the markers are not found.
        """
        start_marker = "Below is the input grid with masked regions:"
        end_marker = "Please output the completed grid"

        # DOTALL lets the non-greedy group span the newlines inside the grid.
        match = re.search(re.escape(start_marker) + r"(.*?)" + re.escape(end_marker), prompt, re.DOTALL)
        return match.group(1).strip() if match else ""

    def calculate(self, response: Completion) -> list[MetricResult]:
        """Compare the completion with the first ground truth and return the three results.

        When the response carries an error, a single result named ``NAME``
        with ``value=None`` is returned instead.

        Raises:
            AssertionError: if the ground-truth list is missing, empty, or
                its first entry is empty.
        """
        if response.error is not None:
            return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=response.error)]

        input_grid = self.extract_grid_from_prompt(prompt=response.last_user_instruction).split()
        output_grid = response.completion.split()

        # Check the list itself first so that an empty list produces the
        # intended message rather than an IndexError.
        assert response.ground_truth_list and response.ground_truth_list[0], (
            "Ground truth list is empty or not provided in the response."
        )
        ground_truth_grid = response.ground_truth_list[0].split()

        input_ground_truth_differences_count = self.count_differences(input_grid, ground_truth_grid)
        output_ground_truth_differences_count = self.count_differences(output_grid, ground_truth_grid)

        exact_match = output_ground_truth_differences_count == 0
        if exact_match:
            score = 1.0
        else:
            score = self.calculate_score(
                output_ground_truth_differences_count,
                input_ground_truth_differences_count,
            )
        normalized_score = max(score, 0.0)

        return [
            MetricResult(
                metric_name=f"{self.NAME}_exact_match",
                value=float(exact_match),
                higher_is_better=True,
                error=response.error,
            ),
            MetricResult(metric_name=f"{self.NAME}_score", value=score, higher_is_better=True, error=response.error),
            MetricResult(
                metric_name=f"{self.NAME}_normalized_score",
                value=normalized_score,
                higher_is_better=True,
                error=response.error,
            ),
        ]
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
from typing import Any
|
|
2
|
+
|
|
3
|
+
from eval_framework.external.ifeval_impl.utils import process_results
|
|
4
|
+
from eval_framework.metrics.base import BaseMetric, MetricResult
|
|
5
|
+
from eval_framework.shared.types import BaseMetricContext, Completion, extract_context_metric
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class IFEvalMetricContext(BaseMetricContext):
    # Per-sample context for IFEvalMetric. The metric extracts it from the
    # response and passes the whole object to process_results().
    # NOTE(review): the field meanings are not visible in this file; confirm
    # them against eval_framework.external.ifeval_impl.utils.
    key: int
    instruction_id_list: list[str]
    prompt: str
    additional_kwargs: list[dict[str, Any]]
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class IFEvalMetric(BaseMetric[Completion]):
    """Metric that grades a completion with the bundled IFEval implementation.

    Two prompt-level results (strict and loose accuracy) are always returned.
    For a successful response, one additional result per graded instruction
    is appended for each of the strict and loose instruction-level
    accuracies.
    """

    NAME = "IFEval"

    def calculate(self, response: Completion) -> list[MetricResult]:
        """Return prompt-level results followed by per-instruction results.

        When the response carries an error, only the two prompt-level results
        are returned, each with ``value=None`` and the forwarded error.
        """
        context = extract_context_metric(response, IFEvalMetricContext)

        def make_result(suffix: str, value: float | None) -> MetricResult:
            return MetricResult(
                metric_name=f"{self.NAME}/{suffix}",
                value=value,
                higher_is_better=True,
                error=response.error,
            )

        prompt_level_keys = ("prompt_level_strict_acc", "prompt_level_loose_acc")
        if response.error is not None:
            return [make_result(key, None) for key in prompt_level_keys]

        grading = process_results(context, [response.completion])
        results = [make_result(key, float(grading[key])) for key in prompt_level_keys]

        # The framework has no custom aggregation step (see agg_inst_level_acc()),
        # so every instruction outcome is emitted as its own MetricResult.
        for key in ("inst_level_strict_acc", "inst_level_loose_acc"):
            results.extend(make_result(key, float(outcome)) for outcome in grading[key])
        return results
|
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from collections.abc import Mapping
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
import jsonschema # type: ignore
|
|
6
|
+
from pydantic import BaseModel
|
|
7
|
+
|
|
8
|
+
from eval_framework.metrics.base import BaseMetric, MetricResult
|
|
9
|
+
from eval_framework.shared.types import Completion
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class JsonFormatEvaluation(BaseModel):
    # Grading record filled in by JsonFormat while it inspects one completion.

    # True when the completion, after stripping backticks and "//" comments,
    # parsed as JSON as a whole.
    is_just_json: bool = False
    # True when JSON was parsed either from the whole completion or from a
    # JSON object/array extracted out of it.
    is_valid_json: bool = False
    # Result of the schema check; stays None when no check was performed.
    fulfills_schema: bool | None = None
    # Class name of the exception raised by the last failed parsing attempt.
    json_parsing_error: str | None = None
    # Class name of the jsonschema exception raised during validation.
    schema_validation_error: str | None = None
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class JsonFormat(BaseMetric[Completion]):
    """Grade whether a completion is JSON and whether it satisfies a schema.

    Three results are produced per response, named ``JSON Format/<key>``:

    * ``is_just_json`` - the completion as a whole parsed as JSON.
    * ``is_valid_json`` - JSON could be parsed from the completion, possibly
      after extracting an object/array embedded in other text.
    * ``fulfills_schema`` - the parsed value validates against the
      ``json_schema`` entry of the JSON-encoded ground truth; ``None`` when
      that entry is empty.
    """

    NAME = "JSON Format"

    def calculate(self, response: Completion) -> list[MetricResult]:
        """Return the three JSON-format results for ``response``.

        An errored response yields ``value=None`` for every key and an empty
        completion yields 0.0 for every key.
        """
        keys = [
            "is_just_json",
            "is_valid_json",
            "fulfills_schema",
        ]

        if response.error is not None:
            return [
                MetricResult(metric_name=f"{self.NAME}/{k}", value=None, higher_is_better=True, error=response.error)
                for k in keys
            ]

        if response.completion == "":
            return [
                MetricResult(metric_name=f"{self.NAME}/{k}", value=0.0, higher_is_better=True, error=response.error)
                for k in keys
            ]

        json_dict, grading = self._extract_and_parse_json(response.completion)
        schema = json.loads(str(response.ground_truth))["json_schema"]
        if schema:
            # Branch on the parse outcome rather than on ``json_dict is None``:
            # the JSON literal ``null`` parses successfully to None and must
            # still be validated against the schema.
            if grading.is_valid_json:
                grading = self._validate_json_against_schema(json_dict, schema, grading)
            else:
                grading.fulfills_schema = False

        results = []
        for key in keys:
            graded = getattr(grading, key)
            result = MetricResult(
                metric_name=f"{self.NAME}/{key}",
                value=float(graded) if graded is not None else None,
                higher_is_better=True,
                error=response.error,
                code_execution_trace=(grading.json_parsing_error or "") + (grading.schema_validation_error or ""),
            )
            results.append(result)
        return results

    @staticmethod
    def _validate_json_against_schema(
        json_obj: object, schema: Mapping[str, Any], evaluation_result: JsonFormatEvaluation
    ) -> JsonFormatEvaluation:
        """Return a copy of ``evaluation_result`` updated with the schema check.

        A ``ValidationError`` marks the schema as not fulfilled. A
        ``SchemaError`` (the schema itself is invalid) only records the error
        name and leaves ``fulfills_schema`` unchanged.
        """
        evaluation_result = evaluation_result.model_copy(deep=True)
        try:
            jsonschema.validate(json_obj, schema)
            evaluation_result.fulfills_schema = True
        except jsonschema.exceptions.ValidationError as e:
            evaluation_result.fulfills_schema = False
            evaluation_result.schema_validation_error = type(e).__name__
        except jsonschema.exceptions.SchemaError as e:
            evaluation_result.schema_validation_error = type(e).__name__
        return evaluation_result

    @staticmethod
    def _extract_and_parse_json(completion: str) -> tuple[object, JsonFormatEvaluation]:
        """Parse JSON from ``completion`` and report how it was obtained.

        The whole completion (backticks and ``//`` comments removed) is tried
        first; on failure, a JSON object/array embedded in the text is
        extracted and parsed. Returns the parsed value (``None`` when nothing
        parsed) together with the grading record.
        """
        evaluation_result = JsonFormatEvaluation()
        json_dict = None
        try:
            json_dict = json.loads(remove_comments(completion.strip("`")))
            evaluation_result.is_just_json = True
            evaluation_result.is_valid_json = True
        except Exception:
            # Best effort by design: any failure falls back to extraction.
            try:
                json_string = remove_comments(get_json_object(completion))
                json_dict = json.loads(json_string)
                evaluation_result.is_valid_json = True
            except Exception as e:
                evaluation_result.json_parsing_error = type(e).__name__
        return json_dict, evaluation_result
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def get_json_object(text: str) -> str:
|
|
95
|
+
"""
|
|
96
|
+
Extract the first valid JSON object or array from text.
|
|
97
|
+
|
|
98
|
+
This function handles nested brackets properly by using a bracket counting
|
|
99
|
+
approach to find complete JSON structures, rather than using regex which
|
|
100
|
+
can incorrectly match outer brackets containing non-JSON content.
|
|
101
|
+
"""
|
|
102
|
+
|
|
103
|
+
def find_json_at_position(text: str, start_pos: int, open_char: str, close_char: str) -> str | None:
|
|
104
|
+
"""Find a complete JSON object/array starting at the given position."""
|
|
105
|
+
if start_pos >= len(text) or text[start_pos] != open_char:
|
|
106
|
+
return None
|
|
107
|
+
|
|
108
|
+
bracket_count = 0
|
|
109
|
+
in_string = False
|
|
110
|
+
escaped = False
|
|
111
|
+
|
|
112
|
+
for i in range(start_pos, len(text)):
|
|
113
|
+
char = text[i]
|
|
114
|
+
|
|
115
|
+
if escaped:
|
|
116
|
+
escaped = False
|
|
117
|
+
continue
|
|
118
|
+
|
|
119
|
+
if char == "\\" and in_string:
|
|
120
|
+
escaped = True
|
|
121
|
+
continue
|
|
122
|
+
|
|
123
|
+
if char == '"' and not escaped:
|
|
124
|
+
in_string = not in_string
|
|
125
|
+
continue
|
|
126
|
+
|
|
127
|
+
if not in_string:
|
|
128
|
+
if char == open_char:
|
|
129
|
+
bracket_count += 1
|
|
130
|
+
elif char == close_char:
|
|
131
|
+
bracket_count -= 1
|
|
132
|
+
if bracket_count == 0:
|
|
133
|
+
# Found complete JSON structure
|
|
134
|
+
candidate = text[start_pos : i + 1]
|
|
135
|
+
# Test if it's valid JSON
|
|
136
|
+
try:
|
|
137
|
+
json.loads(candidate)
|
|
138
|
+
return candidate
|
|
139
|
+
except json.JSONDecodeError:
|
|
140
|
+
return None
|
|
141
|
+
|
|
142
|
+
return None
|
|
143
|
+
|
|
144
|
+
# Look for JSON objects {} and arrays []
|
|
145
|
+
json_candidates = []
|
|
146
|
+
|
|
147
|
+
# Search for objects starting with {
|
|
148
|
+
for i in range(len(text)):
|
|
149
|
+
if text[i] == "{":
|
|
150
|
+
candidate = find_json_at_position(text, i, "{", "}")
|
|
151
|
+
if candidate:
|
|
152
|
+
json_candidates.append(candidate)
|
|
153
|
+
|
|
154
|
+
# Search for arrays starting with [
|
|
155
|
+
for i in range(len(text)):
|
|
156
|
+
if text[i] == "[":
|
|
157
|
+
candidate = find_json_at_position(text, i, "[", "]")
|
|
158
|
+
if candidate:
|
|
159
|
+
json_candidates.append(candidate)
|
|
160
|
+
|
|
161
|
+
if not json_candidates:
|
|
162
|
+
raise RuntimeError(f"No valid JSON object found in {text}.")
|
|
163
|
+
|
|
164
|
+
# Return the longest valid JSON (most likely to be the main content)
|
|
165
|
+
return max(json_candidates, key=len)
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def remove_comments(text: str, comment_indicator: str = "//") -> str:
    """Strip trailing line comments from ``text`` and drop blank lines.

    On each line, everything from the first ``comment_indicator`` that lies
    outside a double-quoted string is removed. Indicators inside string
    literals (for example the ``//`` of a URL) are left untouched, so valid
    JSON is never truncated. Lines that are empty or whitespace-only after
    stripping are omitted and the rest are joined with ``"\\n"``.

    Raises:
        ValueError: if ``comment_indicator`` is empty.
    """
    if not comment_indicator:
        # Same error the previous str.split-based implementation produced.
        raise ValueError("empty separator")

    def strip_comment(line: str) -> str:
        in_string = False
        escaped = False
        for index, char in enumerate(line):
            if escaped:
                escaped = False
            elif in_string:
                if char == "\\":
                    escaped = True
                elif char == '"':
                    in_string = False
            elif char == '"':
                in_string = True
            elif line.startswith(comment_indicator, index):
                return line[:index]
        return line

    stripped_lines = (strip_comment(line) for line in text.splitlines())
    return "\n".join(line for line in stripped_lines if line.strip())
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
from eval_framework.exceptions import LogicError
|
|
2
|
+
from eval_framework.metrics.base import BaseMetric, MetricResult
|
|
3
|
+
from eval_framework.metrics.llm.graders.language import AVAILABLE_LANGUAGES
|
|
4
|
+
from eval_framework.shared.types import Completion
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class LanguageChecker(BaseMetric[Completion]):
    """Metric that checks the completion language against the ground truth.

    The ground truth must be one of ``AVAILABLE_LANGUAGES``. The value is 1.0
    when the language reported by ``get_completion_language()`` equals it,
    else 0.0.
    """

    NAME = "Language Check"

    def calculate(self, response: Completion) -> list[MetricResult]:
        """Return a single result comparing the completion language to the ground truth.

        Raises:
            LogicError: if the ground truth is missing or is not an available
                language.
        """
        if response.error is not None:
            return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=response.error)]

        expected_language = response.ground_truth
        if expected_language is None:
            raise LogicError("Language detection needs ground_truth.")
        if expected_language not in AVAILABLE_LANGUAGES:
            raise LogicError("Checking for unknown or unavailable language.")

        detected_language = response.get_completion_language()
        is_match = detected_language == expected_language
        return [MetricResult(metric_name=self.NAME, value=float(is_match), higher_is_better=True, error=response.error)]
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class GermanCompletionChecker(BaseMetric[Completion]):
    """Metric that is 1.0 when the raw completion language is ``"de"``, else 0.0."""

    NAME = "German Completion Check"

    def calculate(self, response: Completion) -> list[MetricResult]:
        """Return a single result; ``value=None`` with the error for errored responses."""
        if response.error is not None:
            return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=response.error)]

        is_german = response.get_raw_completion_language() == "de"
        return [
            MetricResult(
                metric_name=self.NAME,
                value=float(is_german),
                higher_is_better=True,
                error=response.error,
            )
        ]
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class LanguageConsistencyChecker(BaseMetric[Completion]):
    """Metric that checks the completion language against the instruction language.

    The value is 1.0 when ``get_completion_language()`` and
    ``get_instruction_language()`` agree, else 0.0. No result is produced
    when both are the empty string.
    """

    NAME = "Language Consistency"

    def calculate(self, response: Completion) -> list[MetricResult]:
        """Return zero or one result for ``response``."""
        if response.error is not None:
            return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=response.error)]

        completion_language = response.get_completion_language()
        target_language = response.get_instruction_language()

        # Both empty: no language information could be determined.
        if completion_language == target_language == "":
            return []

        is_consistent = completion_language == target_language
        return [
            MetricResult(
                metric_name=self.NAME,
                value=float(is_consistent),
                higher_is_better=True,
                error=response.error,
            )
        ]
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class LanguageRawConsistencyChecker(BaseMetric[Completion]):
    """Metric that checks the raw completion language against the instruction language.

    The value is 1.0 when ``get_raw_completion_language()`` and
    ``get_instruction_language()`` agree, else 0.0. No result is produced
    when both are the empty string.
    """

    NAME = "Language Consistency Raw"

    def calculate(self, response: Completion) -> list[MetricResult]:
        """Return zero or one result for ``response``."""
        if response.error is not None:
            return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=response.error)]

        raw_completion_language = response.get_raw_completion_language()
        target_language = response.get_instruction_language()

        # Both empty: no language information could be determined.
        if raw_completion_language == target_language == "":
            return []

        is_consistent = raw_completion_language == target_language
        return [MetricResult(metric_name=self.NAME, value=float(is_consistent), higher_is_better=True, error=response.error)]
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from enum import Enum
|
|
3
|
+
|
|
4
|
+
from eval_framework.metrics.base import BaseMetric, MetricResult
|
|
5
|
+
from eval_framework.metrics.completion.text_counter import ParagraphCounter, SentenceCounter, WordCounter
|
|
6
|
+
from eval_framework.shared.types import Completion
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class LengthRequirementUnit(Enum):
    # Unit in which a length requirement is counted. LengthControl builds a
    # member from the "unit" entry of the JSON-decoded ground truth, so the
    # values below are the accepted strings.
    WORDS = "words"
    SENTENCES = "sentences"
    PARAGRAPHS = "paragraphs"
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class LengthRequirementType(Enum):
    # Kind of length requirement. LengthControl builds a member from the
    # "type" entry of the JSON-decoded ground truth, so the values below are
    # the accepted strings.
    MIN = "minimum"  # fulfilled when count >= expected count
    MAX = "maximum"  # fulfilled when count <= expected count
    TARGET = "target"  # fulfilled when the relative deviation is within the tolerance
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class LengthControl(BaseMetric[Completion]):
    """Metric that checks a completion against a length requirement.

    The requirement is read from the JSON-encoded ground truth with the keys
    ``"unit"`` (see ``LengthRequirementUnit``), ``"count"`` and ``"type"``
    (see ``LengthRequirementType``). Three results are produced: the signed
    and the absolute relative distance of the measured count from the
    expected count, and whether the requirement is fulfilled.
    """

    NAME = "length_control"

    def __init__(self, tolerance: float = 1 / 6) -> None:
        """Store ``tolerance``, the largest relative deviation accepted for a target requirement."""
        super().__init__()
        self.tolerance = tolerance

    def calculate(self, response: Completion) -> list[MetricResult]:
        """Return the length-control results for ``response``.

        An errored response yields only the ``fulfills_length_requirement``
        result, with ``value=None`` and the forwarded error.
        """
        if response.error is not None:
            errored = MetricResult(
                metric_name=f"{self.NAME}/fulfills_length_requirement",
                value=None,
                higher_is_better=True,
                error=response.error,
            )
            return [errored]

        expectations = json.loads(str(response.ground_truth))
        text = response.completion.strip()

        unit = LengthRequirementUnit(expectations["unit"])
        if unit is LengthRequirementUnit.WORDS:
            count = WordCounter._count_words(text)
        elif unit is LengthRequirementUnit.SENTENCES:
            count = SentenceCounter._count_sentences(text)
        elif unit is LengthRequirementUnit.PARAGRAPHS:
            count = ParagraphCounter._count_paragraphs(text)
        else:
            raise NotImplementedError(f"LengthRequirementUnit {expectations['unit']} is not supported.")

        expected_count = int(expectations["count"])
        signed_distance = (count - expected_count) / float(expected_count)
        absolute_distance = abs(signed_distance)

        requirement = LengthRequirementType(expectations["type"])
        if requirement is LengthRequirementType.TARGET:
            fulfilled = absolute_distance <= self.tolerance
        elif requirement is LengthRequirementType.MIN:
            fulfilled = count >= expected_count
        elif requirement is LengthRequirementType.MAX:
            fulfilled = count <= expected_count
        else:
            raise NotImplementedError(f"LengthRequirementType {expectations['type']} is not supported.")

        signed_result = MetricResult(
            metric_name=f"{self.NAME}/normalized_distance_to_target",
            value=float(signed_distance),
            higher_is_better=False,
        )
        absolute_result = MetricResult(
            metric_name=f"{self.NAME}/absolute_normalized_distance_to_target",
            value=float(absolute_distance),
            higher_is_better=False,
        )
        fulfilled_result = MetricResult(
            metric_name=f"{self.NAME}/fulfills_length_requirement",
            value=float(fulfilled),
            higher_is_better=True,
            error=response.error,
        )
        return [signed_result, absolute_result, fulfilled_result]
|