evalscope 0.15.1__py3-none-any.whl → 0.16.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic. Click here for more details.
- evalscope/app/__init__.py +28 -0
- evalscope/{report → app}/app.py +67 -59
- evalscope/app/constants.py +21 -0
- evalscope/arguments.py +12 -1
- evalscope/backend/opencompass/backend_manager.py +2 -1
- evalscope/backend/rag_eval/cmteb/arguments.py +4 -1
- evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- evalscope/backend/rag_eval/utils/embedding.py +75 -35
- evalscope/backend/rag_eval/utils/llm.py +1 -1
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +0 -6
- evalscope/benchmarks/benchmark.py +1 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +1 -0
- evalscope/benchmarks/data_adapter.py +101 -18
- evalscope/benchmarks/docmath/__init__.py +0 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +84 -0
- evalscope/benchmarks/docmath/utils.py +220 -0
- evalscope/benchmarks/drop/__init__.py +0 -0
- evalscope/benchmarks/drop/drop_adapter.py +133 -0
- evalscope/benchmarks/drop/utils.py +59 -0
- evalscope/benchmarks/frames/__init__.py +0 -0
- evalscope/benchmarks/frames/frames_adapter.py +90 -0
- evalscope/benchmarks/frames/utils.py +37 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +5 -1
- evalscope/benchmarks/needle_haystack/__init__.py +0 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +341 -0
- evalscope/benchmarks/needle_haystack/utils.py +79 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -0
- evalscope/benchmarks/tool_bench/__init__.py +0 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +70 -0
- evalscope/benchmarks/tool_bench/utils.py +203 -0
- evalscope/benchmarks/utils.py +28 -2
- evalscope/benchmarks/winogrande/__init__.py +0 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +57 -0
- evalscope/cli/start_app.py +2 -2
- evalscope/collections/__init__.py +35 -3
- evalscope/collections/evaluator.py +94 -32
- evalscope/config.py +54 -17
- evalscope/evaluator/evaluator.py +80 -41
- evalscope/metrics/__init__.py +3 -1
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
- evalscope/metrics/llm_judge.py +15 -8
- evalscope/metrics/math_parser.py +1 -1
- evalscope/metrics/rouge_metric.py +11 -13
- evalscope/models/adapters/chat_adapter.py +51 -34
- evalscope/models/adapters/server_adapter.py +17 -25
- evalscope/perf/arguments.py +16 -7
- evalscope/perf/benchmark.py +0 -15
- evalscope/perf/main.py +72 -15
- evalscope/perf/plugin/datasets/custom.py +15 -0
- evalscope/perf/utils/benchmark_util.py +34 -16
- evalscope/perf/utils/db_util.py +25 -15
- evalscope/perf/utils/local_server.py +1 -0
- evalscope/perf/utils/log_utils.py +12 -5
- evalscope/perf/utils/rich_display.py +186 -0
- evalscope/report/__init__.py +36 -4
- evalscope/report/combinator.py +8 -0
- evalscope/report/generator.py +33 -9
- evalscope/report/utils.py +61 -4
- evalscope/run.py +12 -0
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/deprecation_utils.py +42 -0
- evalscope/utils/logger.py +1 -1
- evalscope/utils/utils.py +12 -0
- evalscope/version.py +2 -2
- {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/METADATA +57 -31
- {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/RECORD +78 -57
- tests/aigc/test_t2i.py +40 -3
- tests/cli/test_all.py +39 -32
- tests/cli/test_collection.py +8 -6
- tests/cli/test_run.py +43 -17
- tests/perf/test_perf.py +23 -0
- tests/rag/test_mteb.py +5 -5
- /evalscope/{report/app_arguments.py → app/arguments.py} +0 -0
- {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/LICENSE +0 -0
- {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/WHEEL +0 -0
- {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
import re
|
|
2
|
+
import string
|
|
3
|
+
|
|
4
|
+
# Pre-compiled pattern matching the English articles "a", "an" and "the" as whole words;
# _remove_articles() substitutes every match with a single space.
_ARTICLES = re.compile(r'\b(a|an|the)\b', re.UNICODE)
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def _answer_to_bags(answer):
    """Normalize one answer (or a list/tuple of answer spans) and build token bags.

    Args:
        answer: A single answer string, or a list/tuple of answer strings.

    Returns:
        A pair ``(normalized_spans, token_bags)`` where ``normalized_spans`` is the
        list of normalized span strings and ``token_bags`` holds, for each span,
        the set of its whitespace-separated tokens.
    """
    spans = answer if isinstance(answer, (list, tuple)) else [answer]
    normalized_spans = [_normalize(span) for span in spans]
    token_bags = [set(text.split()) for text in normalized_spans]
    return normalized_spans, token_bags
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _is_number(text):
|
|
22
|
+
try:
|
|
23
|
+
float(text)
|
|
24
|
+
return True
|
|
25
|
+
except ValueError:
|
|
26
|
+
return False
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _remove_articles(text):
    """Replace every whole-word "a", "an" or "the" in ``text`` with a single space."""
    without_articles = re.sub(_ARTICLES, ' ', text)
    return without_articles
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _white_space_fix(text):
|
|
34
|
+
return ' '.join(text.split())
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _remove_punc(text):
    """Strip ASCII punctuation from ``text`` unless the text parses as a number.

    Numeric text is returned untouched so that characters such as the decimal
    point or a leading minus sign survive.
    """
    if _is_number(text):
        return text
    punctuation = set(string.punctuation)
    kept_chars = [ch for ch in text if ch not in punctuation]
    return ''.join(kept_chars)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _fix_number(text):
    """Rewrite numeric text in canonical ``str(float(...))`` form; leave other text as is."""
    if not _is_number(text):
        return text
    return str(float(text))
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _tokenize(text):
|
|
50
|
+
return re.split(' |-', text)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _normalize(answer):
    """Normalize an answer string token by token.

    Each token produced by ``_tokenize`` is lower-cased, stripped of punctuation,
    number-canonicalized, stripped of articles and whitespace-collapsed, in that
    order. Tokens that end up blank are dropped and the rest are joined with
    single spaces.
    """
    cleaned_tokens = []
    for raw_token in _tokenize(answer):
        token = _remove_punc(raw_token.lower())
        token = _fix_number(token)
        token = _remove_articles(token)
        token = _white_space_fix(token)
        if token.strip():
            cleaned_tokens.append(token)
    return ' '.join(cleaned_tokens).strip()
|
|
File without changes
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
from evalscope.benchmarks import Benchmark, DataAdapter
|
|
2
|
+
from evalscope.constants import EvalType, OutputType
|
|
3
|
+
from evalscope.metrics import LLMJudge, exact_match
|
|
4
|
+
|
|
5
|
+
TEMPLATE_0SHOT = """Please read the following text and answer the question below.
|
|
6
|
+
|
|
7
|
+
<text>
|
|
8
|
+
{context}
|
|
9
|
+
</text>
|
|
10
|
+
|
|
11
|
+
{question}
|
|
12
|
+
|
|
13
|
+
Format your response as follows: "Therefore, the answer is (insert answer here)"."""
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@Benchmark.register(
    name='frames',
    pretty_name='FRAMES',
    description=
    'FRAMES is a comprehensive evaluation dataset designed to test the capabilities of Retrieval-Augmented Generation (RAG) systems across factuality, retrieval accuracy, and reasoning.',  # noqa: E501
    dataset_id='iic/frames',
    model_adapter=OutputType.GENERATION,
    output_types=[OutputType.GENERATION],
    metric_list=['AverageAccuracy'],
    few_shot_num=0,
    train_split=None,
    eval_split='test',
    prompt_template=TEMPLATE_0SHOT,
)
class FramesAdapter(DataAdapter):
    """Data adapter for the FRAMES benchmark.

    Builds a prompt from the sample's wiki items and question, extracts the
    text following "the answer is" from the model output, and scores it either
    by normalized exact match or with an LLM judge.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def load(self, **kwargs):
        """Load the dataset from a snapshot, reading ``test.jsonl`` for the ``default`` subset."""
        kwargs['file_structure'] = {'default': ['test.jsonl']}
        data_dict = super().load_with_snapshot(**kwargs)
        return data_dict

    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
        """
        Generate model prompt from input data.

        The context is every wiki item rendered as "<title>\\n<text>", joined by
        newlines; the question is taken from the sample's ``Prompt`` field.
        """
        context = '\n'.join([f"{i['title']}\n{i['text']}" for i in input_d['wiki_items']])
        question = input_d['Prompt']
        prompt = self.prompt_template.format(context=context, question=question)
        return self.gen_prompt_data(prompt)

    def get_gold_answer(self, input_d: dict) -> str:
        """
        Parse the raw input labels (gold).
        """
        return input_d['Answer']

    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
        """
        Parse the predicted result and extract proper answer.

        Returns the text after the last "the answer is" marker with surrounding
        whitespace and periods removed, or an empty string when the marker is
        absent. The marker is matched case-insensitively so that a sentence
        starting with "The answer is ..." is still recognised.
        """
        import re

        # Markdown emphasis asterisks are dropped before searching for the marker.
        response = result.replace('*', '')

        marker_hits = list(re.finditer(r'the answer is', response, flags=re.IGNORECASE))
        if not marker_hits:
            return ''

        # Keep only what follows the LAST occurrence of the marker.
        tail = response[marker_hits[-1].end():]
        return tail.strip().strip('.').strip()

    def match(self, gold: str, pred: str) -> float:
        """
        Match the gold answer and the predicted answer.

        Both strings are normalized with ``normalize_answer`` before exact match.
        """
        from .utils import normalize_answer
        gold = normalize_answer(gold)
        pred = normalize_answer(pred)
        return exact_match(gold=gold, pred=pred)

    @staticmethod
    def _judge_says_yes(orm_response) -> bool:
        """Decide whether the judge response carries a positive verdict.

        The judge is instructed to end with ``[[YES]]`` or ``[[NO]]``. When at
        least one bracketed verdict is present, the last one wins, so a "YES"
        mentioned in the explanation cannot override a final ``[[NO]]``. When no
        bracketed verdict is present, fall back to a plain "YES" substring test.
        """
        if not orm_response:
            # Judge returned nothing (e.g. request failed): treat as not equivalent.
            return False
        last_yes = orm_response.rfind('[[YES]]')
        last_no = orm_response.rfind('[[NO]]')
        if last_yes != -1 or last_no != -1:
            return last_yes > last_no
        return 'YES' in orm_response

    def llm_match(self, gold: str, pred: str, judge: LLMJudge, **kwargs) -> float:
        """
        Use an LLM judge to decide whether ``pred`` is equivalent to ``gold``.

        Args:
            gold: Reference answer.
            pred: Model answer.
            judge: Callable judge invoked with ``prompt`` and ``system_prompt``.
            **kwargs: May contain ``raw_input``, the sample dict holding ``Prompt``.

        Returns:
            1.0 when the judge's verdict is YES, otherwise 0.0.
        """
        from .utils import GENERAL_ORM_PROMPT, ORM_USER_TEMPLATE

        # `raw_input` may be absent; fall back to an empty question instead of crashing.
        raw_input = kwargs.get('raw_input', None) or {}
        question = raw_input.get('Prompt', '')
        # get grading response
        prompt = ORM_USER_TEMPLATE.format(problem=question, answer_1=gold, answer_2=pred)
        orm_response = judge(prompt=prompt, system_prompt=GENERAL_ORM_PROMPT)
        # parse grading response
        return 1.0 if self._judge_says_yes(orm_response) else 0.0
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import re
|
|
2
|
+
import string
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def normalize_answer(s):
    """Normalize an answer string for exact-match comparison.

    Steps, in order: lower-case, drop ASCII punctuation, replace the whole-word
    articles "a"/"an"/"the" with a space, then collapse whitespace.
    """
    lowered = s.lower()
    punctuation = set(string.punctuation)
    without_punc = ''.join(ch for ch in lowered if ch not in punctuation)
    without_articles = re.sub(r'\b(a|an|the)\b', ' ', without_punc)
    return ' '.join(without_articles.split())
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# System prompt for the LLM judge: it asks for an explanation followed by a
# final verdict written as [[YES]] or [[NO]].
GENERAL_ORM_PROMPT = """You are an expert in verifying if two answers are the same.
Your input is a problem and two answers, Answer 1 and Answer 2. You need to check if they are equivalent.
Your task is to determine if two answers are equivalent, without attempting to solve the original problem.
Compare the answers to verify they represent identical values or meaning, even when written in different forms or notations.

Your output must follow the following format:
1) Provide an explanation for why the answers are equivalent or not.
2) Then provide your final answer in the form of: [[YES]] or [[NO]]
"""  # noqa: E501

# User-message template for the judge; filled via str.format with
# `problem`, `answer_1` and `answer_2`.
ORM_USER_TEMPLATE = """
Problem: {problem}
Answer 1: {answer_1}
Answer 2: {answer_2}
"""
|
|
@@ -4,7 +4,7 @@ from collections import defaultdict
|
|
|
4
4
|
from typing import List, Optional, Union
|
|
5
5
|
|
|
6
6
|
from evalscope.benchmarks import Benchmark, DataAdapter
|
|
7
|
-
from evalscope.metrics import
|
|
7
|
+
from evalscope.metrics import mean
|
|
8
8
|
from evalscope.utils.io_utils import jsonl_to_list
|
|
9
9
|
from evalscope.utils.logger import get_logger
|
|
10
10
|
|
|
@@ -112,9 +112,13 @@ class GeneralQAAdapter(DataAdapter):
|
|
|
112
112
|
"""
|
|
113
113
|
res = dict()
|
|
114
114
|
if 'AverageRouge' in self.metric_list:
|
|
115
|
+
from evalscope.metrics.rouge_metric import compute_rouge_score_one_sample_zh
|
|
116
|
+
|
|
115
117
|
rouge_dict = compute_rouge_score_one_sample_zh([pred], [gold])
|
|
116
118
|
res.update(rouge_dict)
|
|
117
119
|
if 'AverageBLEU' in self.metric_list:
|
|
120
|
+
from evalscope.metrics import bleu_ngram_one_sample
|
|
121
|
+
|
|
118
122
|
bleu_dict = bleu_ngram_one_sample(pred, gold)
|
|
119
123
|
res.update(bleu_dict)
|
|
120
124
|
return res
|
|
File without changes
|
|
@@ -0,0 +1,341 @@
|
|
|
1
|
+
from itertools import product
|
|
2
|
+
from tqdm import tqdm
|
|
3
|
+
from typing import TYPE_CHECKING, List, Union
|
|
4
|
+
|
|
5
|
+
from evalscope.benchmarks import Benchmark, DataAdapter
|
|
6
|
+
from evalscope.constants import AnswerKeys, EvalType
|
|
7
|
+
from evalscope.metrics import LLMJudge, exact_match
|
|
8
|
+
from evalscope.metrics.metrics import mean
|
|
9
|
+
from evalscope.utils import get_logger
|
|
10
|
+
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
from evalscope.report import Report
|
|
13
|
+
|
|
14
|
+
# Module-level logger used for debug and error messages in this adapter.
logger = get_logger()

# Prompt registered as the benchmark's `prompt_template`; filled via str.format
# with `context` (the haystack containing the needles) and `question`.
PROMPT_TEMPLATE = """Please read the following text and answer the question below.

<text>
{context}
</text>

<question>
{question}
</question>

Don't give information outside the document or repeat your findings."""
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@Benchmark.register(
    name='needle_haystack',
    pretty_name='Needle in a Haystack',
    # Adjacent literals are used instead of a backslash continuation so that the
    # source indentation of the second line does not leak into the description.
    description='Needle in a Haystack is a benchmark focused on information retrieval tasks. '
    'It requires the model to find specific information within a large corpus of text.',
    dataset_id='AI-ModelScope/Needle-in-a-Haystack-Corpus',
    metric_list=['AverageAccuracy'],
    subset_list=['english', 'chinese'],
    few_shot_num=0,
    train_split=None,
    eval_split='test',
    system_prompt='You are a helpful AI bot that answers questions for a user. Keep your response short and direct',
    prompt_template=PROMPT_TEMPLATE,
    extra_params={
        'retrieval_question': 'What is the best thing to do in San Francisco?',
        'needles':
        ['\nThe best thing to do in San Francisco is eat a sandwich and sit in Dolores Park on a sunny day.\n'],
        'context_lengths_min': 1000,
        'context_lengths_max': 32000,
        'context_lengths_num_intervals': 10,
        'document_depth_percent_min': 0,
        'document_depth_percent_max': 100,
        'document_depth_percent_intervals': 10,
        'tokenizer_path': 'Qwen/Qwen3-0.6B',
    })
class NeedleHaystackAdapter(DataAdapter):
    """Data adapter for the Needle-in-a-Haystack benchmark.

    For every combination of context length and document depth it inserts the
    configured needles into the corpus text, asks the retrieval question, and
    scores the answer with an LLM judge. Scores are reported per
    "Context#<length> Depth#<percent>" cell and rendered as a heatmap.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        self.llm_as_a_judge = True
        # set extra params (tolerate an explicit `extra_params=None`)
        extra_params = kwargs.get('extra_params') or {}
        self.retrieval_question = extra_params.get('retrieval_question',
                                                   'What is the best thing to do in San Francisco?')
        self.needles = extra_params.get(
            'needles',
            ['\nThe best thing to do in San Francisco is eat a sandwich and sit in Dolores Park on a sunny day.\n'])
        self.context_lengths_min = extra_params.get('context_lengths_min', 1000)
        self.context_lengths_max = extra_params.get('context_lengths_max', 32000)
        self.context_lengths_num_intervals = extra_params.get('context_lengths_num_intervals', 10)
        self.document_depth_percent_min = extra_params.get('document_depth_percent_min', 0)
        self.document_depth_percent_max = extra_params.get('document_depth_percent_max', 100)
        self.document_depth_percent_intervals = extra_params.get('document_depth_percent_intervals', 10)
        self.tokenizer_path = extra_params.get('tokenizer_path', 'Qwen/Qwen3-0.6B')

        self.__init_tokenizer()
        self.__init_length()

    def __init_length(self):
        """ Initialize context lengths and document depth percentages based on the provided parameters."""
        import numpy as np

        # Evenly spaced integer grid from min to max (both included).
        self.context_lengths = np.round(
            np.linspace(
                self.context_lengths_min,
                self.context_lengths_max,
                num=self.context_lengths_num_intervals,
                endpoint=True)).astype(int)

        self.document_depth_percents = np.round(
            np.linspace(
                self.document_depth_percent_min,
                self.document_depth_percent_max,
                num=self.document_depth_percent_intervals,
                endpoint=True)).astype(int)

    def __init_tokenizer(self):
        """ Initialize the tokenizer based on the provided tokenizer path."""
        from modelscope import AutoTokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_path)

    def load(self, **kwargs):
        """Load the corpus from a snapshot: one text file per subset."""
        kwargs['file_structure'] = {'english': ['PaulGraham_Essays.txt'], 'chinese': ['Journey_to_the_West.txt']}
        data_dict = super().load_with_snapshot(**kwargs)
        return data_dict

    def gen_prompts(self, data_dict: dict) -> dict:
        """
        Generate dataset prompts from raw input, unify the prompt format for different datasets.

        Args:
            data_dict: {'english': {'test': [sample_d_1, sample_d_2, ...]},
                        'chinese': {'test': [sample_d_1, sample_d_2, ...]}}

        Returns:
            {'subset_name': [prompt_d_1, prompt_d_2, ...]}
            prompt_d_i (dict): refer to the output of gen_prompt method.

        One prompt is produced per (context length, depth percent) pair for each sample.
        """
        res_dict: dict = {}
        # `product` has no length, so give tqdm the total explicitly.
        num_combinations = len(self.context_lengths) * len(self.document_depth_percents)

        for sub_name, sub_data_dict in data_dict.items():
            res_dict[sub_name] = []
            for sample_d in sub_data_dict[self.eval_split]:
                # Tokenize the corpus once per sample; every combination slices from it.
                tokens_context = self._get_context_tokens(sample_d['text'])
                for context_length, depth_percent in tqdm(
                        product(self.context_lengths, self.document_depth_percents),
                        total=num_combinations,
                        desc=f'Generating {sub_name} prompts'):
                    # Insert needles into the context at the specified depth percentage
                    context = self._insert_needles(tokens_context, depth_percent, context_length)
                    # Build the input dictionary for the prompt
                    input_d = {
                        'context_length': int(context_length),
                        'depth_percent': int(depth_percent),
                        'question': self.retrieval_question,
                        'answer': '\n'.join(self.needles),
                        'context': context,
                    }
                    prompt_d = self.gen_prompt(input_d=input_d)
                    prompt_d[AnswerKeys.RAW_INPUT] = input_d
                    res_dict[sub_name].append(prompt_d)

        return res_dict

    def _get_context_tokens(self, input_context: str) -> list:
        """
        Encodes the context string into tokens using the tokenizer, ensuring the tokenized context
        is at least as long as the maximum context length required.

        Args:
            input_context (str): The context string to be tokenized.

        Returns:
            List[int]: A list of token IDs representing the context.

        Raises:
            ValueError: If the context encodes to zero tokens, because repeating it
                could never reach the required length.
        """
        max_context_length = max(self.context_lengths)
        context = input_context
        tokens_context = self.tokenizer.encode(context, add_special_tokens=False)
        if not tokens_context and max_context_length > 0:
            raise ValueError('The haystack text is empty; cannot build a context of the required length.')
        # Repeat the context until reaching the required length
        while len(tokens_context) < max_context_length:
            context += '\n' + input_context
            tokens_context = self.tokenizer.encode(context, add_special_tokens=False)
        return tokens_context

    def _insert_needles(self, tokens_context, depth_percent, context_length):
        """
        Insert every needle into the haystack tokens and return the decoded context.

        The haystack is first truncated so that haystack plus needles fit into
        ``context_length`` minus a 150-token buffer. The first needle goes at
        ``depth_percent``; each following needle is placed further down by an
        equal share of the remaining depth. Each insertion point is moved
        backwards to the nearest sentence-ending period (English or Chinese).
        A depth of 100 appends the needle at the very end.

        Args:
            tokens_context (List[int]): The original context tokens.
            depth_percent (float): The depth percent at which to insert the needles.
            context_length (int): The total length of the context in tokens, adjusted for final buffer.

        Returns:
            str: The new context with needles inserted.
        """
        # Reserve a buffer so the final prompt stays below the requested length.
        context_length -= 150

        # Encode each needle once. Special tokens are disabled, matching how the
        # haystack is encoded, so no BOS/EOS ids end up inside the context.
        needle_token_lists = [self.tokenizer.encode(needle, add_special_tokens=False) for needle in self.needles]
        total_needles_length = sum(len(tokens) for tokens in needle_token_lists)

        # Ensure context length accounts for needles; never use a negative bound,
        # which would slice from the end of the list instead of truncating.
        max_haystack_length = max(context_length - total_needles_length, 0)
        if len(tokens_context) > max_haystack_length:
            tokens_context = tokens_context[:max_haystack_length]

        # To evenly distribute the needles, we calculate the intervals they need to be inserted.
        depth_percent_interval = (100 - depth_percent) / len(self.needles)

        # Reset the insertion percentages list for the current context
        self.insertion_percentages = []

        # Token ids of sentence-ending periods (English and Chinese); loop-invariant.
        period_tokens = set(
            self.tokenizer.encode('.', add_special_tokens=False)
            + self.tokenizer.encode('。', add_special_tokens=False))

        # Insert needles at calculated points
        for needle, tokens_needle in zip(self.needles, needle_token_lists):

            if depth_percent == 100:
                # If your depth percent is 100 (which means your needle is the last thing in the doc),
                # throw it at the end
                tokens_context = tokens_context + tokens_needle
            else:
                # NOTE(review): the statements below are placed inside this branch because
                # `insertion_point` only exists here — confirm against the upstream file.
                # Go get the position (in terms of tokens) to insert your needle
                insertion_point = int(len(tokens_context) * (depth_percent / 100))

                # Walk backwards until the token just before the insertion point is a
                # period, so the needle is placed at a sentence break (or at position 0).
                while insertion_point > 0 and tokens_context[insertion_point - 1] not in period_tokens:
                    insertion_point -= 1

                # Insert the needle into the context at the found position
                tokens_context = tokens_context[:insertion_point] + tokens_needle + tokens_context[insertion_point:]

                # Log
                insertion_percentage = (insertion_point / len(tokens_context)) * 100 if tokens_context else 0.0
                self.insertion_percentages.append(insertion_percentage)
                logger.debug(f"Inserted '{needle}' at {insertion_percentage:.2f}% of the context, "
                             f'total length now: {len(tokens_context)} tokens')

                # Adjust depth for next needle
                depth_percent += depth_percent_interval

        new_context = self.tokenizer.decode(tokens_context)
        return new_context

    def gen_prompt(self, input_d: dict, **kwargs) -> dict:
        """
        Generate the prompt for each sample in the dataset.
        Args:
            input_d: A dictionary containing the input data for the prompt.
                It should contain 'context' and optionally 'question'.
        Returns:
            A dictionary containing the prompt data
        """
        context = input_d.get('context')
        question = input_d.get('question')

        prompt = self.prompt_template.format(context=context, question=question)

        return self.gen_prompt_data(prompt, system_prompt=self.system_prompt)

    def get_gold_answer(self, input_d: dict) -> str:
        """
        Parse the raw input labels (gold).
        """
        return input_d.get('answer', '').strip()

    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
        """
        Parse the predicted result and extract proper answer.

        The raw model output is used unchanged.
        """
        return result

    def match(self, gold: str, pred: str) -> float:
        """
        Match the gold answer and the predicted answer.
        """
        from .utils import normalize_answer
        norm_gold = normalize_answer(gold)
        norm_pred = normalize_answer(pred)
        # Use exact match for Needle in a Haystack
        return exact_match(gold=norm_gold, pred=norm_pred)

    def llm_match(self, gold: str, pred: str, judge: LLMJudge, **kwargs) -> dict:
        """
        Use LLM as a judge to evaluate the predicted answer against the gold answer.

        Returns:
            A single-entry dict mapping "Context#<length> Depth#<percent>" to the
            judge score (0.0 when the judge returns nothing).
        """
        from .utils import GENERAL_ORM_PROMPT, ORM_USER_TEMPLATE, parse_score

        # `raw_input` may be absent; use an empty dict instead of crashing on None.
        raw_input = kwargs.get('raw_input', None) or {}
        question = raw_input.get('question')
        context_length = raw_input.get('context_length')
        depth_percent = raw_input.get('depth_percent')

        # get grading response
        prompt = ORM_USER_TEMPLATE.format(question=question, gold=gold, pred=pred)
        orm_response = judge(prompt=prompt, system_prompt=GENERAL_ORM_PROMPT)

        # parse grading score with regex, [[score]]
        score = parse_score(orm_response) if orm_response else 0.0
        return {f'Context#{context_length} Depth#{depth_percent}': score}

    def compute_metric(self, review_res_list: Union[List[dict], List[List[dict]]], **kwargs) -> List[dict]:
        """
        Compute the mean judge score for every "Context#.. Depth#.." key.

        Args:
            review_res_list: per-sample dicts as returned by ``llm_match``.

        Returns:
            avg_res: List[dict], one entry per key with ``metric_name``, ``score`` and ``num``.

        """
        items = super().compute_dict_metric(review_res_list, **kwargs)
        return [{'metric_name': k, 'score': mean(v), 'num': len(v)} for k, v in items.items()]

    def post_process_report(self, report: 'Report', **kwargs):
        """Draw one depth-by-context heatmap per subset into ``report_path``.

        Chart generation is best-effort: any failure is logged and swallowed so
        that it cannot break the evaluation run.
        """
        try:
            import os

            from .utils import draw_score_chat

            report_path = kwargs.get('report_path')
            data_frame = report.to_dataframe()
            # split `Metric` to `Context` and `Depth`
            data_frame[['Context', 'Depth']] = data_frame['Metric'].str.split(' ', n=1, expand=True)
            data_frame['Depth'] = data_frame['Depth'].str.replace('Depth#', '').astype(float)
            data_frame['Context'] = data_frame['Context'].str.replace('Context#', '').astype(int)
            # split by `Subset` to multi sub data frame
            for subset in data_frame['Subset'].unique():
                sub_df = data_frame[data_frame['Subset'] == subset]
                # draw charts for each subset
                pivot_table = sub_df.pivot_table(
                    values='Score', index=['Depth', 'Context'], aggfunc='mean').reset_index()
                pivot_table = pivot_table.pivot(index='Depth', columns='Context', values='Score')
                draw_score_chat(pivot_table, outpath=os.path.join(report_path, f'needle_haystack_heatmap_{subset}.png'))

        except Exception as e:
            logger.error(f'Error generating charts: {e}')
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
import matplotlib.pyplot as plt
|
|
2
|
+
import os
|
|
3
|
+
import re
|
|
4
|
+
import seaborn as sns
|
|
5
|
+
import string
|
|
6
|
+
from matplotlib.colors import LinearSegmentedColormap
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def normalize_answer(s):
    """Lower-case ``s``, drop ASCII punctuation and English articles, collapse whitespace."""
    # One translate pass deletes every character listed in string.punctuation.
    drop_punctuation = str.maketrans('', '', string.punctuation)
    text = s.lower().translate(drop_punctuation)
    text = re.sub(r'\b(a|an|the)\b', ' ', text)
    return ' '.join(text.split())
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def parse_score(score_str: str) -> float:
    """
    Parse a judge response and return its score scaled to the range 0.0-1.0.

    The score is expected as an integer wrapped in double brackets, e.g.
    ``[[7]]``; whitespace inside the brackets is tolerated. The first such
    match is used and divided by 10.

    Args:
        score_str: Raw judge response text.

    Returns:
        The score divided by 10 and capped at 1.0, or 0.0 when the input is
        empty or contains no ``[[score]]`` marker.
    """
    if not score_str:
        return 0.0
    score_match = re.search(r'\[\[\s*(\d+)\s*\]\]', score_str)
    if score_match is None:
        return 0.0
    score = int(score_match.group(1))
    # The rubric tops out at 10; cap anything larger so the result stays within 0-1.
    return min(score / 10.0, 1.0)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def draw_score_chat(pivot_table, outpath):
    """Render ``pivot_table`` as an annotated heatmap and save it to ``outpath``.

    Args:
        pivot_table: Table of scores passed straight to ``seaborn.heatmap``;
            colours are scaled over the fixed range 0.0-1.0.
        outpath: Destination image path. Its parent directory is created when missing.
    """
    # Create a custom colormap. Go to https://coolors.co/ and pick cool colors
    cmap = LinearSegmentedColormap.from_list('custom_cmap', ['#F0496E', '#EBB839', '#0CD79F'])

    # Make sure the target directory exists before saving.
    out_dir = os.path.dirname(outpath)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Create the heatmap with better aesthetics
    fig = plt.figure(figsize=(17.5, 8))  # Can adjust these dimensions as needed
    try:
        sns.heatmap(pivot_table, vmin=0.0, vmax=1.0, annot=True, fmt='.1f', cmap=cmap, cbar_kws={'label': 'Score'})

        # More aesthetics
        plt.title('Fact Retrieval Across Context Lengths ("Needle In A HayStack")')  # Adds a title
        plt.xlabel('Token Limit')  # X-axis label
        plt.ylabel('Depth Percent')  # Y-axis label
        plt.xticks(rotation=45)  # Rotates the x-axis labels to prevent overlap
        plt.yticks(rotation=0)  # Ensures the y-axis labels are horizontal
        plt.tight_layout()  # Fits everything neatly into the figure area

        # save the figure
        plt.savefig(outpath, dpi=300, bbox_inches='tight')
    finally:
        # pyplot keeps every figure alive until it is closed; this function is called
        # once per subset, so release the figure even when drawing or saving fails.
        plt.close(fig)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
# System prompt for the LLM judge: it asks for a 1-10 score written as [[score]],
# which parse_score() extracts and divides by 10.
# NOTE(review): the last sentence of the prompt contains a typo ("numberical score
# with formatted as"). It is left untouched here because the text is sent to the
# judge model at runtime; confirm before rewording.
GENERAL_ORM_PROMPT = """You are an expert in verifying if the model answer is correct based on the reference answer.
Your input is a question, a reference answer, and a model answer. You need to check if the model answer is correct based on the reference answer.
You should focus on the correctness of the model answer compared to the reference answer, without attempting to solve the original question.
You must provide your final score in the form of a number from 1 to 10, where:

Score 1: The answer is completely unrelated to the reference.
Score 3: The answer has minor relevance but does not align with the reference.
Score 5: The answer has moderate relevance but contains inaccuracies.
Score 7: The answer aligns with the reference but has minor omissions.
Score 10: The answer is completely accurate and aligns perfectly with the reference.

Only respond with a numberical score with formatted as [[score]]."""  # noqa: E501

# User-message template for the judge; filled via str.format with
# `question`, `gold` and `pred`.
ORM_USER_TEMPLATE = """
Question: {question}

Reference Answer: {gold}

Model Answer: {pred}
"""
|
|
@@ -148,6 +148,7 @@ class SimpleQAAdapter(DataAdapter):
|
|
|
148
148
|
'is_correct': 1 if res == 'A' else 0,
|
|
149
149
|
'is_incorrect': 1 if res == 'B' else 0,
|
|
150
150
|
'is_not_attempted': 1 if res == 'C' else 0,
|
|
151
|
+
'judge_response': grading_response,
|
|
151
152
|
}
|
|
152
153
|
|
|
153
154
|
def compute_metric(self, review_res_list: List[dict], **kwargs) -> List[dict]:
|
|
File without changes
|