eval-framework 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- eval_framework/__init__.py +7 -0
- eval_framework/base_config.py +36 -0
- eval_framework/context/__init__.py +0 -0
- eval_framework/context/determined.py +170 -0
- eval_framework/context/eval.py +114 -0
- eval_framework/context/local.py +52 -0
- eval_framework/evaluation_generator.py +231 -0
- eval_framework/exceptions.py +2 -0
- eval_framework/external/ifeval_impl/README.md +5 -0
- eval_framework/external/ifeval_impl/instructions.py +1523 -0
- eval_framework/external/ifeval_impl/instructions_registry.py +161 -0
- eval_framework/external/ifeval_impl/instructions_util.py +1689 -0
- eval_framework/external/ifeval_impl/utils.py +135 -0
- eval_framework/llm/__init__.py +0 -0
- eval_framework/llm/aleph_alpha.py +323 -0
- eval_framework/llm/base.py +58 -0
- eval_framework/llm/huggingface.py +332 -0
- eval_framework/llm/mistral.py +73 -0
- eval_framework/llm/models.py +16 -0
- eval_framework/llm/openai.py +205 -0
- eval_framework/llm/vllm.py +438 -0
- eval_framework/logger.py +3 -0
- eval_framework/main.py +187 -0
- eval_framework/metrics/__init__.py +0 -0
- eval_framework/metrics/base.py +40 -0
- eval_framework/metrics/completion/__init__.py +1 -0
- eval_framework/metrics/completion/accuracy_completion.py +16 -0
- eval_framework/metrics/completion/bleu.py +76 -0
- eval_framework/metrics/completion/chrf.py +62 -0
- eval_framework/metrics/completion/code_assertion.py +44 -0
- eval_framework/metrics/completion/code_execution_pass_at_one.py +126 -0
- eval_framework/metrics/completion/comet.py +56 -0
- eval_framework/metrics/completion/concordance_index.py +38 -0
- eval_framework/metrics/completion/csv_format.py +102 -0
- eval_framework/metrics/completion/cwe_accuracy.py +49 -0
- eval_framework/metrics/completion/exponential_similarity.py +65 -0
- eval_framework/metrics/completion/f1.py +42 -0
- eval_framework/metrics/completion/format_checker.py +56 -0
- eval_framework/metrics/completion/grid_difference.py +77 -0
- eval_framework/metrics/completion/ifeval.py +73 -0
- eval_framework/metrics/completion/json_format.py +171 -0
- eval_framework/metrics/completion/language_checker.py +74 -0
- eval_framework/metrics/completion/length_control.py +83 -0
- eval_framework/metrics/completion/math_reasoning_completion.py +303 -0
- eval_framework/metrics/completion/niah_accuracy.py +163 -0
- eval_framework/metrics/completion/placeholder_checker.py +27 -0
- eval_framework/metrics/completion/repetition.py +88 -0
- eval_framework/metrics/completion/rouge_1.py +35 -0
- eval_framework/metrics/completion/rouge_2.py +45 -0
- eval_framework/metrics/completion/rouge_geometric_mean.py +36 -0
- eval_framework/metrics/completion/rouge_l.py +52 -0
- eval_framework/metrics/completion/struct_eval_metrics.py +248 -0
- eval_framework/metrics/completion/ter.py +67 -0
- eval_framework/metrics/completion/text_counter.py +182 -0
- eval_framework/metrics/efficiency/__init__.py +0 -0
- eval_framework/metrics/efficiency/bytes_per_sequence_position.py +48 -0
- eval_framework/metrics/llm/__init__.py +0 -0
- eval_framework/metrics/llm/base.py +8 -0
- eval_framework/metrics/llm/graders/chatbot_style_grader.py +92 -0
- eval_framework/metrics/llm/graders/comparison_grader.py +146 -0
- eval_framework/metrics/llm/graders/conciseness_grader.py +93 -0
- eval_framework/metrics/llm/graders/contains_names_grader.py +71 -0
- eval_framework/metrics/llm/graders/format_correctness_grader.py +109 -0
- eval_framework/metrics/llm/graders/instruction_grader.py +177 -0
- eval_framework/metrics/llm/graders/language.py +56 -0
- eval_framework/metrics/llm/graders/long_context_grader.py +72 -0
- eval_framework/metrics/llm/graders/models.py +74 -0
- eval_framework/metrics/llm/graders/refusal_grader.py +57 -0
- eval_framework/metrics/llm/graders/sql_quality_grader.py +145 -0
- eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +103 -0
- eval_framework/metrics/llm/llm_judge_chatbot_style.py +36 -0
- eval_framework/metrics/llm/llm_judge_completion_accuracy.py +39 -0
- eval_framework/metrics/llm/llm_judge_conciseness.py +37 -0
- eval_framework/metrics/llm/llm_judge_contains_names.py +36 -0
- eval_framework/metrics/llm/llm_judge_format_correctness.py +43 -0
- eval_framework/metrics/llm/llm_judge_instruction.py +58 -0
- eval_framework/metrics/llm/llm_judge_mtbench_pair.py +205 -0
- eval_framework/metrics/llm/llm_judge_mtbench_single.py +188 -0
- eval_framework/metrics/llm/llm_judge_refusal.py +35 -0
- eval_framework/metrics/llm/llm_judge_sql.py +394 -0
- eval_framework/metrics/llm/llm_judge_world_knowledge.py +37 -0
- eval_framework/metrics/loglikelihood/__init__.py +0 -0
- eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +51 -0
- eval_framework/metrics/loglikelihood/probability_mass.py +56 -0
- eval_framework/py.typed +0 -0
- eval_framework/response_generator.py +416 -0
- eval_framework/result_processors/__init__.py +0 -0
- eval_framework/result_processors/base.py +74 -0
- eval_framework/result_processors/hf_processor.py +87 -0
- eval_framework/result_processors/result_processor.py +129 -0
- eval_framework/run.py +314 -0
- eval_framework/run_direct.py +42 -0
- eval_framework/shared/types.py +227 -0
- eval_framework/tasks/__init__.py +6 -0
- eval_framework/tasks/base.py +314 -0
- eval_framework/tasks/benchmarks/__init__.py +0 -0
- eval_framework/tasks/benchmarks/arc.py +46 -0
- eval_framework/tasks/benchmarks/arc_de.py +46 -0
- eval_framework/tasks/benchmarks/arc_fi.py +46 -0
- eval_framework/tasks/benchmarks/belebele.py +60 -0
- eval_framework/tasks/benchmarks/bigcodebench.py +155 -0
- eval_framework/tasks/benchmarks/casehold.py +47 -0
- eval_framework/tasks/benchmarks/chembench.py +85 -0
- eval_framework/tasks/benchmarks/copa.py +39 -0
- eval_framework/tasks/benchmarks/duc.py +91 -0
- eval_framework/tasks/benchmarks/flores200.py +62 -0
- eval_framework/tasks/benchmarks/flores_plus.py +84 -0
- eval_framework/tasks/benchmarks/gpqa.py +177 -0
- eval_framework/tasks/benchmarks/gsm8k.py +148 -0
- eval_framework/tasks/benchmarks/hellaswag.py +44 -0
- eval_framework/tasks/benchmarks/hellaswag_de.py +52 -0
- eval_framework/tasks/benchmarks/humaneval.py +97 -0
- eval_framework/tasks/benchmarks/ifeval.py +78 -0
- eval_framework/tasks/benchmarks/include.py +119 -0
- eval_framework/tasks/benchmarks/infinitebench.py +302 -0
- eval_framework/tasks/benchmarks/math_reasoning.py +569 -0
- eval_framework/tasks/benchmarks/mbpp.py +192 -0
- eval_framework/tasks/benchmarks/mmlu.py +190 -0
- eval_framework/tasks/benchmarks/mmlu_de.py +109 -0
- eval_framework/tasks/benchmarks/mmlu_pro.py +139 -0
- eval_framework/tasks/benchmarks/mmmlu.py +529 -0
- eval_framework/tasks/benchmarks/openbookqa.py +37 -0
- eval_framework/tasks/benchmarks/opengptx_eu20.py +363 -0
- eval_framework/tasks/benchmarks/pawsx.py +65 -0
- eval_framework/tasks/benchmarks/piqa.py +39 -0
- eval_framework/tasks/benchmarks/quality.py +56 -0
- eval_framework/tasks/benchmarks/sciq.py +44 -0
- eval_framework/tasks/benchmarks/sphyr.py +75 -0
- eval_framework/tasks/benchmarks/squad.py +89 -0
- eval_framework/tasks/benchmarks/struct_eval.py +110 -0
- eval_framework/tasks/benchmarks/tablebench.py +117 -0
- eval_framework/tasks/benchmarks/triviaqa.py +42 -0
- eval_framework/tasks/benchmarks/truthfulqa.py +95 -0
- eval_framework/tasks/benchmarks/winogender.py +39 -0
- eval_framework/tasks/benchmarks/winogrande.py +44 -0
- eval_framework/tasks/benchmarks/winox.py +57 -0
- eval_framework/tasks/benchmarks/wmt.py +160 -0
- eval_framework/tasks/benchmarks/zero_scrolls.py +197 -0
- eval_framework/tasks/eval_config.py +112 -0
- eval_framework/tasks/perturbation.py +83 -0
- eval_framework/tasks/registry.py +186 -0
- eval_framework/tasks/task_loader.py +80 -0
- eval_framework/tasks/task_names.py +138 -0
- eval_framework/tasks/utils.py +578 -0
- eval_framework/utils/constants.py +9 -0
- eval_framework/utils/generate_task_docs.py +229 -0
- eval_framework/utils/helpers.py +3 -0
- eval_framework/utils/logging.py +50 -0
- eval_framework/utils/packaging.py +52 -0
- eval_framework-0.2.0.dist-info/METADATA +514 -0
- eval_framework-0.2.0.dist-info/RECORD +161 -0
- eval_framework-0.2.0.dist-info/WHEEL +4 -0
- eval_framework-0.2.0.dist-info/entry_points.txt +3 -0
- template_formatting/README.md +83 -0
- template_formatting/__init__.py +0 -0
- template_formatting/formatter.py +536 -0
- template_formatting/mistral_formatter.py +159 -0
- template_formatting/py.typed +0 -0
- template_formatting/tests/test_formatter_eval.py +408 -0
- template_formatting/tests/test_formatter_scaling.py +253 -0
- template_formatting/tests/test_mistral_formatter.py +136 -0
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
from eval_framework.exceptions import LogicError
|
|
2
|
+
from eval_framework.metrics.base import BaseMetric, MetricResult
|
|
3
|
+
from eval_framework.metrics.completion.rouge_1 import ROUGE_1
|
|
4
|
+
from eval_framework.metrics.completion.rouge_2 import ROUGE_2
|
|
5
|
+
from eval_framework.metrics.completion.rouge_l import ROUGE_L
|
|
6
|
+
from eval_framework.shared.types import Completion
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class ROUGE_GEOMETRIC_MEAN(BaseMetric[Completion]):
    """ROUGE Geometric Mean: cube root of the product of ROUGE-1, ROUGE-2 and ROUGE-L."""

    NAME = "ROUGE-Geometric-Mean"

    def calculate(self, response: Completion) -> list[MetricResult]:
        """Combine the three ROUGE variants for one completion.

        Returns a single-element list. The value is ``None`` when the response
        carries an error or any component metric yields ``None``, and ``0.0``
        for an empty completion.

        Raises:
            LogicError: if any entry of ``response.ground_truth_list`` is ``None``.
        """

        def as_result(value: float | None) -> list[MetricResult]:
            # Every exit path reports the same name, direction and error field.
            return [MetricResult(metric_name=self.NAME, value=value, higher_is_better=True, error=response.error)]

        if response.error is not None:
            return as_result(None)
        if response.completion == "":
            return as_result(0.0)
        for ground_truth in response.ground_truth_list:
            if ground_truth is None:
                raise LogicError("When calculating ROUGE Geometric Mean ground_truth cannot be None.")

        # Evaluate all three component metrics before inspecting any of them.
        component_scores = [metric().calculate(response)[0].value for metric in (ROUGE_1, ROUGE_2, ROUGE_L)]
        if any(score is None for score in component_scores):
            return as_result(None)

        rouge_1, rouge_2, rouge_l = component_scores
        return as_result(float((rouge_1 * rouge_2 * rouge_l) ** (1 / 3)))
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
from eval_framework.exceptions import LogicError
|
|
2
|
+
from eval_framework.metrics.base import BaseMetric, MetricResult
|
|
3
|
+
from eval_framework.shared.types import Completion
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class ROUGE_L(BaseMetric[Completion]):
    """ROUGE-L: best LCS-based F1 score of the completion over all ground truths."""

    NAME = "ROUGE-L"

    def calculate(self, response: Completion) -> list[MetricResult]:
        """Score one completion against every ground truth and keep the maximum.

        Returns a single-element list. The value is ``None`` when the response
        carries an error and ``0.0`` for an empty completion.

        Raises:
            LogicError: if ``response.ground_truth_list`` contains ``None``.
        """

        def as_result(value: float | None) -> list[MetricResult]:
            return [MetricResult(metric_name=self.NAME, value=value, higher_is_better=True, error=response.error)]

        if response.error is not None:
            return as_result(None)
        if response.completion == "":
            return as_result(0.0)
        if None in response.ground_truth_list:
            raise LogicError("When calculating ROUGE-L ground_truth cannot be None.")

        # ROUGE-L is an F1 score computed from the Longest Common Subsequence (LCS)
        # shared by the candidate text and a reference text.
        per_reference = [_calculate_rouge_l(response.completion, gt) for gt in response.ground_truth_list]  # type: ignore[arg-type]
        return as_result(float(max(per_reference)))
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _longest_common_subsequence_length(candidate_tokens: list[str], reference_tokens: list[str]) -> int:
    """Return the length of the longest common subsequence of two token lists.

    Uses a dynamic-programming table in which cell ``[row][col]`` holds the LCS
    length of the first ``row`` candidate tokens and the first ``col``
    reference tokens.
    """
    row_count = len(candidate_tokens) + 1
    col_count = len(reference_tokens) + 1
    # Row 0 and column 0 stay at zero: an empty prefix has nothing in common.
    table = [[0] * col_count for _ in range(row_count)]

    for row, candidate_token in enumerate(candidate_tokens, start=1):
        above, current = table[row - 1], table[row]
        for col, reference_token in enumerate(reference_tokens, start=1):
            if candidate_token == reference_token:
                # Matching tokens extend the subsequence found for both shorter prefixes.
                current[col] = above[col - 1] + 1
            else:
                current[col] = max(above[col], current[col - 1])

    return table[-1][-1]
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _calculate_rouge_l(completion: str, ground_truth: str) -> float:
    """Return the LCS-based F1 score between two whitespace-tokenised texts.

    Precision is the LCS length divided by the number of completion tokens,
    recall is the LCS length divided by the number of ground-truth tokens.
    The result is ``0.0`` when the two texts share no token.
    """
    candidate_tokens = completion.split()
    reference_tokens = ground_truth.split()

    shared = _longest_common_subsequence_length(candidate_tokens, reference_tokens)
    if shared == 0:
        return 0.0

    # shared > 0 implies both token lists are non-empty, so precision and
    # recall are strictly positive and their sum cannot be zero.
    precision = shared / len(candidate_tokens)
    recall = shared / len(reference_tokens)
    return (2 * precision * recall) / (precision + recall)
|
|
@@ -0,0 +1,248 @@
|
|
|
1
|
+
import csv
|
|
2
|
+
import io
|
|
3
|
+
import json
|
|
4
|
+
import tomllib
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
import xmltodict
|
|
8
|
+
import yaml
|
|
9
|
+
from lxml import etree
|
|
10
|
+
|
|
11
|
+
from eval_framework.metrics.base import BaseMetric, MetricResult
|
|
12
|
+
from eval_framework.shared.types import BaseMetricContext, Completion, extract_context_metric
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class StructMetricContext(BaseMetricContext):
    """Per-sample context read by ``StructMetric``."""

    # Name of the structured format the completion is parsed as; StructMetric
    # lower-cases it before comparing.
    output_type: str
    # Paths that StructMetric looks up in the parsed completion.
    paths: list[str]
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class StructMetric(BaseMetric[Completion]):
    """Check that a completion parses as the requested format and contains the required paths.

    Emits two results: ``<NAME>/valid_format`` (1.0 if the completion parsed,
    else 0.0) and ``<NAME>/has_keywords`` (fraction of context paths found in
    the parsed data; 0.0 when parsing failed, 1.0 when no paths are given).
    """

    NAME = "StructMetric"

    def calculate(self, response: Completion) -> list[MetricResult]:
        """Parse ``response.completion`` according to the context's ``output_type`` and score it.

        When the response carries an error, a single result named ``NAME`` with
        value ``None`` is returned instead of the two sub-metrics.
        """
        if response.error is not None:
            return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=response.error)]

        context = extract_context_metric(response, StructMetricContext)
        output_type = context.output_type

        try:
            kind = output_type.lower()
            if kind == "json":
                parsed = json.loads(response.completion)
            elif kind == "yaml":
                documents = list(yaml.safe_load_all(response.completion))
                # Exactly one YAML document is accepted.
                if len(documents) != 1:
                    raise yaml.YAMLError("Multiple documents found in YAML")
                parsed = documents[0]
            elif kind == "toml":
                parsed = tomllib.loads(response.completion)
            elif kind == "xml":
                parsed = xmltodict.parse(response.completion)
            elif kind == "csv":
                reader = csv.DictReader(io.StringIO(response.completion))
                # An odd number of double quotes is treated as an unclosed quote.
                if response.completion.count('"') % 2 != 0:
                    raise csv.Error("Unclosed quote in CSV")
                if not reader.fieldnames:
                    raise csv.Error("CSV has no headers")
                parsed = {"csv_headers": reader.fieldnames, "csv_rows": list(reader)}
            else:
                raise ValueError(f"Unsupported format: {output_type}")
        except Exception:
            # Any failure, including an unsupported format name, counts as an
            # invalid format rather than propagating.
            valid_format = 0.0
        else:
            valid_format = 1.0

        has_required_fields = 0.0
        if valid_format == 1:
            assert context.paths is not None, "Paths must be provided in context"
            paths = context.paths
            assert isinstance(paths, list), "Paths must be a list of strings"
            if paths:
                found = sum(1 for candidate in paths if path_exists(parsed, candidate))
                has_required_fields = found / len(paths)
            else:
                has_required_fields = 1.0

        return [
            MetricResult(metric_name=f"{self.NAME}/valid_format", value=valid_format, higher_is_better=True),
            MetricResult(metric_name=f"{self.NAME}/has_keywords", value=has_required_fields, higher_is_better=True),
        ]
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def is_valid_html(html: str) -> bool:
    """Return True when a non-recovering lxml HTML parser accepts ``html`` without logging errors.

    A raised ``etree.XMLSyntaxError`` yields False; otherwise the result is
    True only if the parser's error log is empty.
    """
    strict_parser = etree.HTMLParser(recover=False)
    try:
        etree.fromstring(html.encode("utf-8"), strict_parser)
    except etree.XMLSyntaxError:
        return False
    else:
        logged_errors = len(strict_parser.error_log)
        return logged_errors == 0
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
class RenderableStructMetricContext(BaseMetricContext):
    """Per-sample context read by ``RenderableStructMetric``."""

    # Name of the renderable format; RenderableStructMetric accepts only
    # "html" (compared case-insensitively).
    output_type: str
    # Keywords searched for, case-insensitively, in the raw completion text.
    keywords: list[str]
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
class RenderableStructMetric(StructMetric):
    """Check that an HTML completion is well-formed and mentions the expected keywords.

    Emits ``<NAME>/valid_format`` (1.0 if ``is_valid_html`` accepts the
    completion, else 0.0) and ``<NAME>/has_keywords`` (fraction of context
    keywords contained in the completion, case-insensitively; 1.0 when no
    keywords are given).
    """

    NAME = "RenderableStructMetric"

    def calculate(self, response: Completion) -> list[MetricResult]:
        """Validate and score one completion.

        When the response carries an error, a single result named ``NAME`` with
        value ``None`` is returned instead of the two sub-metrics.

        Raises:
            ValueError: if the context's ``output_type`` is anything other than "html".
        """
        if response.error is not None:
            return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=response.error)]

        context = extract_context_metric(response, RenderableStructMetricContext)
        output_type = context.output_type

        if output_type.lower() != "html":
            raise ValueError(f"Unsupported format for RenderableStructMetric: {output_type}")
        valid_format = float(is_valid_html(response.completion))

        assert context.keywords is not None, "Keywords must be provided in context"
        keywords = context.keywords
        assert isinstance(keywords, list), "Keywords must be a list of strings"

        has_keywords = 1.0
        if keywords:
            # Lower-case the completion once and reuse it for every keyword.
            haystack = response.completion.lower()
            hits = sum(1 for keyword in keywords if keyword.lower() in haystack)
            has_keywords = hits / len(keywords)

        return [
            MetricResult(metric_name=f"{self.NAME}/valid_format", value=valid_format, higher_is_better=True),
            MetricResult(metric_name=f"{self.NAME}/has_keywords", value=has_keywords, higher_is_better=True),
        ]
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
# adapted from: https://github.com/TIGER-AI-Lab/StructEval/blob/main/structeval/eval_engine/eval_utils.py
|
|
141
|
+
def tokenize_path(path: str) -> list[str]:
    """
    Split a path expression into its tokens.

    Dots separate tokens, a bracketed part such as "[0]" becomes its own token
    (brackets included), and back-ticks quote text so that dots and brackets
    inside them are taken literally. Empty segments are dropped.

    Args:
        path: The path string (e.g. "users.name" or "users[0].name")

    Returns:
        List of path tokens

    Raises:
        ValueError: if an unquoted "[" has no closing "]".
    """
    # CSV header paths ("csv::<header>") are kept whole as a single token.
    if path.startswith("csv::"):
        return [path]

    parts: list[str] = []
    pending: list[str] = []  # characters of the token currently being built
    quoted = False
    pos, end = 0, len(path)

    def flush() -> None:
        # Emit the token under construction, if any, and start a new one.
        if pending:
            parts.append("".join(pending))
            pending.clear()

    while pos < end:
        char = path[pos]
        if char == "`":
            # Back-ticks only toggle quoting; they never appear in a token.
            quoted = not quoted
            pos += 1
        elif quoted:
            pending.append(char)
            pos += 1
        elif char == ".":
            flush()
            pos += 1
        elif char == "[":
            flush()
            closing = path.find("]", pos)
            if closing == -1:
                raise ValueError(f"Unclosed '[' in path: {path}")
            parts.append(path[pos : closing + 1])  # e.g. "[0]"
            pos = closing + 1
        else:
            pending.append(char)
            pos += 1

    flush()
    return parts
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
# adapted from: https://github.com/TIGER-AI-Lab/StructEval/blob/main/structeval/eval_engine/eval_utils.py
|
|
197
|
+
def path_exists(data: Any, path: str) -> bool:
    """
    Report whether ``path`` can be resolved inside ``data``.

    The path is tokenised with ``tokenize_path`` and followed token by token:
    a "csv::<header>" token is looked up in a dict's "csv_headers" entry and
    must be the last token, "*" matches any element of a list, "[n]" selects a
    list index, and any other token is a dict key (a leading "@" is also tried
    with the "@" removed).

    Args:
        data: The structured data to check
        path: The path to check (dot notation)

    Returns:
        True if path exists, False otherwise
    """
    segments = tokenize_path(path)
    total = len(segments)

    def resolve(current: Any, depth: int) -> bool:
        # All tokens consumed: the path has been resolved.
        if depth == total:
            return True
        segment = segments[depth]
        following = depth + 1
        is_mapping = isinstance(current, dict)

        # CSV header rule: the header must be listed and be the final token.
        if is_mapping and "csv_headers" in current and segment.startswith("csv::"):
            return segment[5:] in current["csv_headers"] and following == total

        # Wildcard: only lists can be fanned out over.
        if segment == "*":
            return isinstance(current, list) and any(resolve(item, following) for item in current)

        # Fixed index [n]
        if segment.startswith("[") and segment.endswith("]"):
            try:
                index = int(segment[1:-1])
            except ValueError:
                return False
            if not isinstance(current, list) or not 0 <= index < len(current):
                return False
            return resolve(current[index], following)

        # Dict key handling (JSON/YAML/TOML/XML): the literal key first, then
        # the XML-attribute fallback "@id" -> "id".
        if is_mapping:
            candidate_keys = [segment]
            if segment.startswith("@"):
                candidate_keys.append(segment[1:])
            for key in candidate_keys:
                if key in current:
                    return resolve(current[key], following)

        return False

    return resolve(data, 0)
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
import sacrebleu
|
|
2
|
+
|
|
3
|
+
from eval_framework.exceptions import LogicError
|
|
4
|
+
from eval_framework.metrics.base import BaseMetric, MetricResult
|
|
5
|
+
from eval_framework.shared.types import Completion
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class TER(BaseMetric[Completion]):
    """Translation Error Rate: an error metric for machine translation that
    measures the number of edits required to change a system output into one
    of the references.

    The completion is scored against each ground truth separately and the
    lowest score is reported.

    Source: http://www.cs.umd.edu/~snover/tercom/
    Paper: http://mt-archive.info/AMTA-2006-Snover.pdf
    """

    NAME = "TER"

    def calculate(self, response: Completion) -> list[MetricResult]:
        """Return the minimum TER of the completion over all ground truths.

        The value is ``None`` when the response carries an error.

        Raises:
            LogicError: if a ground truth is ``None`` or the empty string.
        """
        if response.error is not None:
            return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=False, error=response.error)]

        def score_against(reference: str | None) -> float:
            if reference is None or reference == "":
                raise LogicError("When calculating TER we need a ground truth.")
            # sacrebleu takes a list of hypotheses and a list of reference streams.
            return sacrebleu.corpus_ter([response.completion], [[reference]]).score

        best = min(score_against(ground_truth) for ground_truth in response.ground_truth_list)
        return [MetricResult(metric_name=self.NAME, value=float(best), higher_is_better=False, error=response.error)]
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class LINEWISE_TER(BaseMetric[Completion]):
    """Minimum Line-level TER (Translation Edit Rate) score."""

    NAME = "Linewise TER"

    def calculate(self, response: Completion) -> list[MetricResult]:
        """Score every non-empty completion line against every ground truth and keep the minimum.

        The value is ``None`` when the response carries an error and ``100``
        when no line was scored at all.

        Raises:
            LogicError: if a ground truth is ``None`` or the empty string while
                there is at least one non-empty line to score.
        """
        if response.error is not None:
            return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=False, error=response.error)]

        line_scores: list[float] = []
        for reference in response.ground_truth_list:
            candidate_lines = [line for line in response.completion.split("\n") if line != ""]
            # The ground truth is only validated when there is something to score.
            if candidate_lines and (reference is None or reference == ""):
                raise LogicError("When calculating TER we need a ground truth.")
            line_scores.extend(sacrebleu.corpus_ter([line], [[reference]]).score for line in candidate_lines)

        best = min(line_scores, default=100)
        return [MetricResult(metric_name=self.NAME, value=float(best), higher_is_better=False, error=response.error)]
|
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
import re
|
|
2
|
+
|
|
3
|
+
import nltk
|
|
4
|
+
|
|
5
|
+
from eval_framework.metrics.base import (
|
|
6
|
+
BaseMetric,
|
|
7
|
+
MetricResult,
|
|
8
|
+
)
|
|
9
|
+
from eval_framework.shared.types import BaseMetricContext, Completion, extract_context_metric
|
|
10
|
+
|
|
11
|
+
# Regex fragments combined by SentenceCounter._count_sentences.
ALPHABETS = r"([A-Za-z])"  # one ASCII letter, captured
PREFIXES = r"(Mr|St|Mrs|Ms|Dr|www)[.]"  # captured word followed by a literal dot
SUFFIXES = r"(Inc|Ltd|Jr|Sr|Co)"
STARTERS = (
    r"(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt"
    r"|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
)
ACRONYMS = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"  # two or three dotted capitals, e.g. "U.S." or "U.S.A."
WEBSITES = r"[.](com|net|org|io|gov|edu|me)"
DIGITS = r"([0-9])"
MULTIPLE_DOTS = r"\.{2,}"  # a run of two or more dots
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class WordCounterMetricContext(BaseMetricContext):
    """Per-sample context read by ``WordCounter``."""

    # Comparison mode; WordCounter accepts "less than" or "at least".
    comparison: str
    # Threshold the completion's word count is compared against.
    word_count: int
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class WordCounter(BaseMetric[Completion]):
    """Check whether the completion's word count satisfies the bound given in the context."""

    NAME = "Word Count"

    @staticmethod
    def _count_words(text: str) -> int:
        """Count the ``\\w+`` tokens found in ``text``."""
        word_tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")
        return len(word_tokenizer.tokenize(text))

    def calculate(self, response: Completion) -> list[MetricResult]:
        """Return 1.0 if the word count meets the context's comparison, else 0.0.

        The value is ``None`` when the response carries an error.
        """
        if response.error is not None:
            return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=response.error)]

        context = extract_context_metric(response, WordCounterMetricContext)

        assert context.comparison in ["less than", "at least"], f"'comparison' is not valid: {context.comparison}"

        counted = self._count_words(response.completion)
        if context.comparison == "less than":
            within_bound = counted < context.word_count
        elif context.comparison == "at least":
            within_bound = counted >= context.word_count

        return [
            MetricResult(metric_name=self.NAME, value=float(within_bound), higher_is_better=True, error=response.error)
        ]
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class SentenceCounterMetricContext(BaseMetricContext):
    """Per-sample context read by ``SentenceCounter``."""

    # Comparison mode; SentenceCounter accepts "less than" or "at least".
    comparison: str
    # Threshold the completion's sentence count is compared against.
    sentence_count: int
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class SentenceCounter(BaseMetric[Completion]):
    """Check whether the completion's sentence count satisfies the bound given in the context."""

    NAME = "Sentence Count"

    @staticmethod
    def _count_sentences(text: str) -> int:
        """Count sentences with a rule-based splitter.

        Dots that do not end a sentence are temporarily rewritten to ``<prd>``,
        sentence ends are marked with ``<stop>``, and the text is then split on
        ``<stop>``.
        """
        # nltk.tokenize.sent_tokenize would be a straightforward alternative, but it is not ideal either.
        # Example:
        #
        # "Mr. Jones gave me $10,000.00... And then he left. Numbers 5...10. Numbers 5..10. Review: bad food,
        # bad service,..., so I'd miss it."
        #
        # this: ['Mr. Jones gave me $10,000.00...', 'And then he left.', 'Numbers 5...', '10.', 'Numbers 5..', '10.',
        #        'Review: bad food, bad service,...', ", so I'd miss it."].
        # nltk: ['Mr. Jones gave me $10,000.00... And then he left.', 'Numbers 5...10.',
        #        "Numbers 5..10. Review: bad food, bad service,..., so I'd miss it."]

        def mark_dot_run(match: re.Match) -> str:
            # A run of dots keeps its length and also ends a sentence.
            return "<prd>" * len(match.group(0)) + "<stop>"

        marked = f" {text} ".replace("\n", " ")

        # Order matters throughout: later rules see the output of earlier ones.
        early_rules = [
            (PREFIXES, r"\1<prd>"),
            (WEBSITES, r"<prd>\1"),
            (f"{DIGITS}[.]{DIGITS}", r"\1<prd>\2"),
            (MULTIPLE_DOTS, mark_dot_run),
        ]
        for pattern, replacement in early_rules:
            marked = re.sub(pattern, replacement, marked)

        marked = marked.replace("Ph.D.", "Ph<prd>D<prd>")

        late_rules = [
            (rf"\s{ALPHABETS}[.] ", r" \1<prd> "),
            (f"{ACRONYMS} {STARTERS}", r"\1<stop> \2"),
            (f"{ALPHABETS}[.]{ALPHABETS}[.]{ALPHABETS}[.]", r"\1<prd>\2<prd>\3<prd>"),
            (f"{ALPHABETS}[.]{ALPHABETS}[.]", r"\1<prd>\2<prd>"),
            (f" {SUFFIXES}[.] {STARTERS}", r" \1<stop> \2"),
            (f" {SUFFIXES}[.]", r" \1<prd>"),
            (f" {ALPHABETS}[.]", r" \1<prd>"),
        ]
        for pattern, replacement in late_rules:
            marked = re.sub(pattern, replacement, marked)

        # Move terminal punctuation outside a closing quote so the quote stays with its sentence.
        marked = marked.replace(".”", "”.")
        for mark in (".", "!", "?"):
            marked = marked.replace(f'{mark}"', f'"{mark}')

        # Every remaining terminator ends a sentence; protected dots are then restored.
        for mark in (".", "?", "!"):
            marked = marked.replace(mark, f"{mark}<stop>")
        marked = marked.replace("<prd>", ".")

        pieces = [piece.strip() for piece in marked.split("<stop>")]
        # Text ending in a terminator leaves an empty trailing piece.
        if pieces and not pieces[-1]:
            pieces.pop()
        return len(pieces)

    def calculate(self, response: Completion) -> list[MetricResult]:
        """Return 1.0 if the sentence count meets the context's comparison, else 0.0.

        The value is ``None`` when the response carries an error.
        """
        if response.error is not None:
            return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=response.error)]

        context = extract_context_metric(response, SentenceCounterMetricContext)

        assert context.comparison in ["less than", "at least"], f"'comparison' is not valid: {context.comparison}"

        counted = self._count_sentences(response.completion)
        if context.comparison == "less than":
            within_bound = counted < context.sentence_count
        elif context.comparison == "at least":
            within_bound = counted >= context.sentence_count

        return [
            MetricResult(metric_name=self.NAME, value=float(within_bound), higher_is_better=True, error=response.error)
        ]
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
class ParagraphCounterMetricContext(BaseMetricContext):
    """Per-sample context read by ``ParagraphCounter``."""

    # Comparison mode; ParagraphCounter accepts "less than" or "at least".
    comparison: str
    # Threshold the completion's paragraph count is compared against.
    paragraph_count: int
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
class ParagraphCounter(BaseMetric[Completion]):
    """Check whether the completion's paragraph count satisfies the bound given in the context."""

    NAME = "Paragraph Count"

    @staticmethod
    def _count_paragraphs(text: str) -> int:
        """Count the pieces obtained by splitting ``text`` on blank lines.

        The separator is two consecutive newlines, optionally with one
        whitespace character on either side.
        """
        return len(re.split(r"\s?\n\n\s?", text))

    def calculate(self, response: Completion) -> list[MetricResult]:
        """Return 1.0 if the paragraph count meets the context's comparison, else 0.0.

        The value is ``None`` when the response carries an error.
        """
        if response.error is not None:
            return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=response.error)]

        context = extract_context_metric(response, ParagraphCounterMetricContext)

        assert context.comparison in ["less than", "at least"], f"'comparison' is not valid: {context.comparison}"

        counted = self._count_paragraphs(response.completion)
        if context.comparison == "less than":
            within_bound = counted < context.paragraph_count
        elif context.comparison == "at least":
            within_bound = counted >= context.paragraph_count

        return [
            MetricResult(metric_name=self.NAME, value=float(within_bound), higher_is_better=True, error=response.error)
        ]
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
class ResponseToOriginalLengthRatio(BaseMetric[Completion]):
    """Ratio of the completion's length to the length of the last user instruction."""

    NAME = "Response to Original Length Ratio"

    def calculate(self, response: Completion) -> list[MetricResult]:
        """Compute ``len(completion) / len(last_user_instruction)``.

        Args:
            response: Completion providing ``completion`` and
                ``last_user_instruction``.

        Returns:
            A single-element list holding the ratio, reported with
            ``higher_is_better=False``. If ``response.error`` is set, the value is
            ``None`` and the error is passed through. If the instruction is empty
            the ratio is undefined and an empty list is returned.
        """
        if response.error is not None:
            # Report the same direction as the success path so that every result
            # emitted under this metric name agrees on `higher_is_better`.
            return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=False, error=response.error)]

        len_original = len(response.last_user_instruction)
        if len_original == 0:
            # Avoid dividing by zero; no result is emitted for this response.
            return []

        score = len(response.completion) / len_original
        return [MetricResult(metric_name=self.NAME, value=score, higher_is_better=False, error=response.error)]
|
|
File without changes
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
from eval_framework.metrics.base import BaseMetric, MetricResult
|
|
2
|
+
from eval_framework.shared.types import Completion, Loglikelihood
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class BytesLoglikelihood(BaseMetric[Loglikelihood]):
    """Report ``concat_compression.num_bytes`` of a loglikelihood response."""

    NAME = "Bytes"

    def calculate(self, response: Loglikelihood) -> list[MetricResult]:
        """Return the byte count, or ``None`` when it is unavailable.

        The value is ``None`` if the response carries a truthy ``error`` or has no
        ``concat_compression``; ``response.error`` is always forwarded.
        """
        usable = not response.error and response.concat_compression is not None
        num_bytes = response.concat_compression.num_bytes if usable else None
        return [MetricResult(metric_name=self.NAME, value=num_bytes, higher_is_better=True, error=response.error)]
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class SequencePositionsLoglikelihood(BaseMetric[Loglikelihood]):
    """Report ``concat_compression.num_tokens`` of a loglikelihood response."""

    NAME = "SequencePositions"

    def calculate(self, response: Loglikelihood) -> list[MetricResult]:
        """Return the token count, or ``None`` when it is unavailable.

        The value is ``None`` if the response carries a truthy ``error`` or has no
        ``concat_compression``; ``response.error`` is always forwarded.
        """
        usable = not response.error and response.concat_compression is not None
        num_tokens = response.concat_compression.num_tokens if usable else None
        return [MetricResult(metric_name=self.NAME, value=num_tokens, higher_is_better=True, error=response.error)]
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class BytesCompletion(BaseMetric[Completion]):
    """Report ``concat_compression.num_bytes`` of a completion response."""

    NAME = "Bytes"

    def calculate(self, response: Completion) -> list[MetricResult]:
        """Return the byte count, or ``None`` when it is unavailable.

        The value is ``None`` if the response carries a truthy ``error`` or has no
        ``concat_compression``; ``response.error`` is always forwarded.
        """
        usable = not response.error and response.concat_compression is not None
        num_bytes = response.concat_compression.num_bytes if usable else None
        return [MetricResult(metric_name=self.NAME, value=num_bytes, higher_is_better=True, error=response.error)]
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class SequencePositionsCompletion(BaseMetric[Completion]):
    """Report ``concat_compression.num_tokens`` of a completion response."""

    NAME = "SequencePositions"

    def calculate(self, response: Completion) -> list[MetricResult]:
        """Return the token count, or ``None`` when it is unavailable.

        The value is ``None`` if the response carries a truthy ``error`` or has no
        ``concat_compression``; ``response.error`` is always forwarded.
        """
        usable = not response.error and response.concat_compression is not None
        num_tokens = response.concat_compression.num_tokens if usable else None
        return [MetricResult(metric_name=self.NAME, value=num_tokens, higher_is_better=True, error=response.error)]
|
|
File without changes
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
from eval_framework.llm.base import BaseLLM
|
|
2
|
+
from eval_framework.metrics.base import BaseMetric
|
|
3
|
+
from eval_framework.shared.types import Completion
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class BaseLLMJudgeMetric(BaseMetric[Completion]):
    """Base class for completion metrics that hold an LLM judge."""

    def __init__(self, llm_judge: BaseLLM) -> None:
        """Store the judge model.

        Args:
            llm_judge: Model instance kept on ``self._llm_judge``.
        """
        self._llm_judge = llm_judge
|