evalscope 0.10.1__py3-none-any.whl → 0.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/arguments.py +3 -0
- evalscope/benchmarks/aime/__init__.py +0 -0
- evalscope/benchmarks/aime/aime24_adapter.py +49 -0
- evalscope/benchmarks/aime/aime25_adapter.py +49 -0
- evalscope/benchmarks/arc/arc_adapter.py +5 -7
- evalscope/benchmarks/bbh/bbh_adapter.py +17 -14
- evalscope/benchmarks/benchmark.py +5 -3
- evalscope/benchmarks/ceval/ceval_adapter.py +9 -9
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -11
- evalscope/benchmarks/competition_math/competition_math_adapter.py +21 -24
- evalscope/benchmarks/data_adapter.py +88 -29
- evalscope/benchmarks/data_collection/__init__.py +0 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +71 -0
- evalscope/benchmarks/general_mcq/__init__.py +0 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +125 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +10 -11
- evalscope/benchmarks/gpqa/gpqa_adapter.py +27 -9
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +9 -14
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -7
- evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -6
- evalscope/benchmarks/ifeval/ifeval_adapter.py +15 -14
- evalscope/benchmarks/iquiz/iquiz_adapter.py +5 -5
- evalscope/benchmarks/math_500/__init__.py +0 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +58 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +7 -11
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +32 -36
- evalscope/benchmarks/musr/__init__.py +0 -0
- evalscope/benchmarks/musr/musr_adapter.py +68 -0
- evalscope/benchmarks/process_bench/__init__.py +0 -0
- evalscope/benchmarks/process_bench/critique_template.txt +13 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +96 -0
- evalscope/benchmarks/race/race_adapter.py +3 -3
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +1 -2
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +9 -9
- evalscope/cli/start_app.py +4 -1
- evalscope/cli/start_eval.py +4 -3
- evalscope/cli/start_perf.py +4 -2
- evalscope/collections/evaluator.py +109 -39
- evalscope/collections/sampler.py +2 -1
- evalscope/collections/schema.py +1 -2
- evalscope/config.py +4 -1
- evalscope/evaluator/evaluator.py +81 -65
- evalscope/metrics/__init__.py +2 -1
- evalscope/metrics/math_parser.py +526 -0
- evalscope/metrics/metrics.py +39 -3
- evalscope/metrics/named_metrics.py +31 -7
- evalscope/models/base_adapter.py +7 -1
- evalscope/models/chat_adapter.py +69 -49
- evalscope/models/choice_adapter.py +52 -45
- evalscope/models/custom_adapter.py +2 -2
- evalscope/models/local_model.py +7 -2
- evalscope/models/server_adapter.py +106 -61
- evalscope/perf/__init__.py +0 -1
- evalscope/perf/arguments.py +5 -1
- evalscope/perf/http_client.py +2 -2
- evalscope/perf/plugin/api/openai_api.py +11 -1
- evalscope/perf/utils/benchmark_util.py +6 -2
- evalscope/report/app.py +42 -23
- evalscope/run.py +11 -8
- evalscope/third_party/thinkbench/__init__.py +3 -0
- evalscope/third_party/thinkbench/eval.py +264 -0
- evalscope/third_party/thinkbench/infer.py +100 -0
- evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
- evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
- evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- evalscope/third_party/thinkbench/tools/llm.py +47 -0
- evalscope/third_party/thinkbench/tools/utils.py +13 -0
- evalscope/utils/chat_service.py +2 -2
- evalscope/utils/io_utils.py +1 -1
- evalscope/utils/model_utils.py +17 -1
- evalscope/utils/utils.py +45 -45
- evalscope/version.py +2 -2
- {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/METADATA +22 -8
- {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/RECORD +79 -58
- tests/cli/test_run.py +108 -19
- evalscope/benchmarks/ceval/samples.jsonl +0 -1
- evalscope/metrics/math_accuracy.py +0 -200
- {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/LICENSE +0 -0
- {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/WHEEL +0 -0
- {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/top_level.txt +0 -0
evalscope/perf/utils/benchmark_util.py
CHANGED

@@ -23,6 +23,7 @@ class BenchmarkData:
     n_chunks: int = 0
     n_chunks_time: float = 0.0
     max_gpu_memory_cost = 0
+    time_per_output_token: float = 0.0

     prompt_tokens = None
     completion_tokens = None
@@ -37,6 +38,7 @@ class BenchmarkData:
         self.first_chunk_latency = self.query_latency
         self.n_chunks = 1
         self.n_chunks_time = self.query_latency
+        self.time_per_output_token = self.query_latency / self.completion_tokens

     def _calculate_tokens(self, api_plugin):
         self.prompt_tokens, self.completion_tokens = \
@@ -63,6 +65,7 @@ class BenchmarkMetrics:
     start_time: Optional[float] = None
     total_time: float = 1.0
     n_total_queries: int = 0
+    n_time_per_output_token: float = 0.0

     avg_first_chunk_latency: float = -1
     avg_latency: float = -1
@@ -92,6 +95,7 @@ class BenchmarkMetrics:
             self.total_first_chunk_latency += benchmark_data.first_chunk_latency
             self.n_total_chunks += benchmark_data.n_chunks
             self.total_chunks_time += benchmark_data.n_chunks_time
+            self.n_time_per_output_token += benchmark_data.time_per_output_token
         else:
             self.n_failed_queries += 1

@@ -108,7 +112,7 @@ class BenchmarkMetrics:
             self.avg_prompt_tokens = self.n_total_prompt_tokens / self.n_succeed_queries
             self.avg_completion_tokens = self.n_total_completion_tokens / self.n_succeed_queries
             self.avg_token_per_seconds = self.n_total_completion_tokens / self.total_time
-            self.avg_time_per_token = self.
+            self.avg_time_per_token = self.n_time_per_output_token / self.n_succeed_queries
             self.qps = self.n_succeed_queries / self.total_time
         except ZeroDivisionError as e:
             logger.exception(e)
@@ -125,7 +129,7 @@ class BenchmarkMetrics:
             'Average QPS': round(self.qps, default_ndigits),
             'Average latency (s)': round(self.avg_latency, default_ndigits),
             'Average time to first token (s)': round(self.avg_first_chunk_latency, default_ndigits),
-            'Average time per output token (s)': round(self.avg_time_per_token,
+            'Average time per output token (s)': round(self.avg_time_per_token, default_ndigits),
             'Average input tokens per request': round(self.avg_prompt_tokens, default_ndigits),
             'Average output tokens per request': round(self.avg_completion_tokens, default_ndigits),
             'Average package latency (s)': round(self.avg_chunk_time, default_ndigits),
evalscope/report/app.py
CHANGED
@@ -6,6 +6,7 @@ import os
 import pandas as pd
 import plotly.express as px
 import plotly.graph_objects as go
+import re
 from dataclasses import dataclass
 from typing import Any, List, Union

@@ -18,6 +19,9 @@ from evalscope.version import __version__
 logger = get_logger()

 PLOTLY_THEME = 'plotly_dark'
+REPORT_TOKEN = '@@'
+MODEL_TOKEN = '::'
+DATASET_TOKEN = ', '


 def scan_for_report_folders(root_path):
@@ -41,8 +45,9 @@ def scan_for_report_folders(root_path):
         datasets = []
         for dataset_item in glob.glob(os.path.join(model_item, '*.json')):
             datasets.append(os.path.basename(dataset_item).split('.')[0])
-        datasets =
-        reports.append(
+        datasets = DATASET_TOKEN.join(datasets)
+        reports.append(
+            f'{os.path.basename(folder)}{REPORT_TOKEN}{os.path.basename(model_item)}{MODEL_TOKEN}{datasets}')

     reports = sorted(reports, reverse=True)
     logger.debug(f'reports: {reports}')
@@ -50,9 +55,9 @@ def scan_for_report_folders(root_path):


 def process_report_name(report_name: str):
-    prefix, report_name = report_name.split(
-    model_name, datasets = report_name.split(
-    datasets = datasets.split(
+    prefix, report_name = report_name.split(REPORT_TOKEN)
+    model_name, datasets = report_name.split(MODEL_TOKEN)
+    datasets = datasets.split(DATASET_TOKEN)
     return prefix, model_name, datasets


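A standalone sketch of how the new token constants compose and parse a report entry, mirroring scan_for_report_folders and process_report_name above; the folder, model, and dataset names are hypothetical.

# Mirrors the report-name composition/parsing from the hunks above; made-up names.
REPORT_TOKEN = '@@'
MODEL_TOKEN = '::'
DATASET_TOKEN = ', '

folder, model, datasets = '20250218_180219', 'Qwen2.5-7B-Instruct', ['gsm8k', 'math_500']
report_name = f'{folder}{REPORT_TOKEN}{model}{MODEL_TOKEN}{DATASET_TOKEN.join(datasets)}'
# -> '20250218_180219@@Qwen2.5-7B-Instruct::gsm8k, math_500'

prefix, rest = report_name.split(REPORT_TOKEN)
model_name, dataset_str = rest.split(MODEL_TOKEN)
assert (prefix, model_name, dataset_str.split(DATASET_TOKEN)) == (folder, model, datasets)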
@@ -170,7 +175,7 @@ def plot_single_dataset_scores(df: pd.DataFrame):
                 text=df[ReportKey.score],
                 barmode='group')

-    width = 0.2 if len(df[ReportKey.subset_name]) <=
+    width = 0.2 if len(df[ReportKey.subset_name]) <= 3 else None
     plot.update_traces(width=width, texttemplate='%{text:.2f}', textposition='outside')
     plot.update_layout(uniformtext_minsize=12, uniformtext_mode='hide', yaxis=dict(range=[0, 1]), template=PLOTLY_THEME)
     return plot
@@ -218,7 +223,16 @@ def dict_to_markdown(data) -> str:
     return '\n\n'.join(markdown_lines)


+def convert_html_tags(text):
+    # match begin label
+    text = re.sub(r'<(\w+)>', r'[\1]', text)
+    # match end label
+    text = re.sub(r'</(\w+)>', r'[/\1]', text)
+    return text
+
+
 def process_string(string: str, max_length: int = 2048) -> str:
+    string = convert_html_tags(string)  # for display labels e.g. `<think>`
     if len(string) > max_length:
         return f'{string[:max_length // 2]}......{string[-max_length // 2:]}'
     return string
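The new convert_html_tags pass rewrites angle-bracket tags into square brackets before display; per the inline comment this is for labels such as `<think>` in model output (presumably so the report UI does not treat them as HTML, which is my reading rather than something stated in the diff). A standalone illustration using the same substitutions:

import re

# Same two substitutions as convert_html_tags in the hunk above.
def convert_html_tags(text: str) -> str:
    text = re.sub(r'<(\w+)>', r'[\1]', text)    # opening tags: <think> -> [think]
    return re.sub(r'</(\w+)>', r'[/\1]', text)  # closing tags: </think> -> [/think]

print(convert_html_tags('<think>try a direct proof...</think> The answer is 42.'))
# [think]try a direct proof...[/think] The answer is 42.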
@@ -226,9 +240,11 @@ def process_string(string: str, max_length: int = 2048) -> str:

 def process_model_prediction(item: Any):
     if isinstance(item, dict):
-
+        res = dict_to_markdown(item)
+        return process_string(res)
     elif isinstance(item, list):
-
+        res = '\n'.join([process_model_prediction(item) for item in item])
+        return process_string(res)
     else:
         return process_string(str(item))

@@ -257,19 +273,20 @@ def get_model_prediction(work_dir: str, model_name: str, dataset_name: str, subs
     ds = []
     for i, item in origin_df.iterrows():
         raw_input = item['raw_input']
-
-
-
-
-
-
-
-
-
-
-
-
-
+        for choice in item['choices']:
+            raw_pred_answer = choice['message']['content']
+            parsed_gold_answer = choice['review']['gold']
+            parsed_pred_answer = choice['review']['pred']
+            score = choice['review']['result']
+            raw_d = {
+                'Input': raw_input,
+                'Generated': raw_pred_answer,
+                'Gold': parsed_gold_answer if parsed_gold_answer != raw_input else '*Same as Input*',
+                'Pred': parsed_pred_answer if parsed_pred_answer != raw_pred_answer else '*Same as Generated*',
+                'Score': score,
+                'NScore': normalize_score(score)
+            }
+            ds.append(raw_d)

     df_subset = pd.DataFrame(ds)
     return df_subset
@@ -284,6 +301,8 @@ def get_table_data(data_review_df: pd.DataFrame, page: int = 1, rows_per_page: i
     end = start + rows_per_page
     df_subset = data_review_df.iloc[start:end].copy()
     df_subset['Input'] = df_subset['Input'].map(process_model_prediction).astype(str)
+    df_subset['Generated'] = df_subset['Generated'].map(process_model_prediction).astype(str)
+    df_subset['Pred'] = df_subset['Pred'].map(process_model_prediction).astype(str)
     df_subset['Score'] = df_subset['Score'].map(process_model_prediction).astype(str)
     styler = style_df(df_subset, columns=['NScore'])
     return df_subset, styler
@@ -504,8 +523,8 @@ def create_single_model_tab(sidebar: SidebarComponents, lang: str):
         outputs=[report_list, task_config, dataset_radio, work_dir, model_name])
     def update_single_report_data(root_path, report_name):
         report_list, datasets, task_cfg = load_single_report(root_path, report_name)
-        work_dir = os.path.join(root_path, report_name.split(
-        model_name = report_name.split(
+        work_dir = os.path.join(root_path, report_name.split(REPORT_TOKEN)[0])
+        model_name = report_name.split(REPORT_TOKEN)[1].split(MODEL_TOKEN)[0]
         return (report_list, task_cfg, gr.update(choices=datasets, value=datasets[0]), work_dir, model_name)

     @report_list.change(inputs=[report_list], outputs=[score_plot, score_table, sunburst_plot])
evalscope/run.py
CHANGED
@@ -46,11 +46,13 @@ def run_single_task(task_cfg: TaskConfig, run_time: str) -> dict:

 def setup_work_directory(task_cfg: TaskConfig, run_time: str):
     """Set the working directory for the task."""
+    # use cache
     if task_cfg.use_cache:
         task_cfg.work_dir = task_cfg.use_cache
         logger.info(f'Set resume from {task_cfg.work_dir}')
     # elif are_paths_same(task_cfg.work_dir, DEFAULT_WORK_DIR):
-
+    else:
+        task_cfg.work_dir = os.path.join(task_cfg.work_dir, run_time)

     outputs = OutputsStructure(outputs_dir=task_cfg.work_dir)

@@ -112,8 +114,8 @@ def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
     logger.info(task_cfg)

     for evaluator in evaluators:
-        res_dict = evaluator.eval(
-        eval_results[dataset_name] = res_dict
+        res_dict = evaluator.eval()
+        eval_results[evaluator.dataset_name] = res_dict

     return eval_results

@@ -124,21 +126,22 @@ def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsSt
     from evalscope.evaluator import Evaluator
     from evalscope.models import initialize_model_adapter

+    benchmark: BenchmarkMeta = Benchmark.get(dataset_name)
+    # Initialize data adapter
+    data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {}))
+
     if dataset_name == DataCollection.NAME:
         # EvaluatorCollection is a collection of evaluators
         from evalscope.collections import EvaluatorCollection
-        return EvaluatorCollection(task_cfg, outputs)
+        return EvaluatorCollection(task_cfg, data_adapter, outputs)

-
-
-    data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {}))
+    # Initialize model adapter
     model_adapter = initialize_model_adapter(task_cfg, benchmark.model_adapter, base_model)

     # update task_cfg.dataset_args
     task_cfg.dataset_args[dataset_name] = benchmark.to_string_dict()

     return Evaluator(
-        dataset_name_or_path=benchmark.dataset_id,
         data_adapter=data_adapter,
         model_adapter=model_adapter,
         outputs=outputs,
evalscope/third_party/thinkbench/eval.py
ADDED

@@ -0,0 +1,264 @@
+import json
+import os
+import pandas as pd
+import plotly.graph_objects as go
+import re
+from collections import defaultdict
+from functools import lru_cache
+from modelscope import AutoTokenizer
+from plotly.subplots import make_subplots
+from tqdm.contrib.concurrent import thread_map
+from typing import List
+
+from evalscope.third_party.thinkbench.tools.llm import request_url
+from evalscope.third_party.thinkbench.tools.utils import extract_answer
+from evalscope.utils.io_utils import dump_jsonl_data
+
+cur_path = os.path.dirname(os.path.abspath(__file__))
+
+class EvalThink:
+    def __init__(self, report_path, tokenizer_path, model_name, dataset_name, subsets, split_strategies='llm', judge_config=None):
+        self.report_path = report_path
+        self.reformat_template = open(os.path.join(cur_path, 'resources/reformat_template.txt'), 'r').read()
+        self.critique_template = open(os.path.join(cur_path, 'resources/critique_template.txt'), 'r').read()
+        self.switch_tokens = ['alternatively', 'but wait', 'let me reconsider', 'another way', 'another approach', 'another method', 'another angle']
+        self.subset_dict = defaultdict(lambda: defaultdict(list))
+        self.think_end_token = '</think>'
+        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
+        self.model_name = model_name
+        self.dataset_name = dataset_name
+        self.subsets = subsets
+        self.metrics = ['token_efficiency', 'completion_len', 'thought_num', 'accuracy']
+        self.split_strategies = split_strategies  # split by llm, keywords, separator
+        self.judge_config = judge_config
+
+    @lru_cache(maxsize=None)
+    def get_think_part(self, text):
+        last_think_end = text.rfind(self.think_end_token)
+        return text[:last_think_end].lower()
+
+    @lru_cache(maxsize=None)
+    def cal_tokens(self, text: str):
+        return len(self.tokenizer.encode(text, add_special_tokens=False))
+
+    def process_choice(self, choice, problem):
+        think_part = self.get_think_part(choice['message']['content'])
+        answer = choice['review']['gold']
+        tokens = self.cal_tokens(think_part)
+        switch_count = sum(think_part.count(token) for token in self.switch_tokens)
+        useful_tokens = self.cal_tokens(self.get_first_correct(think_part, problem, answer))
+        score = choice['review']['result']
+        return tokens, switch_count, useful_tokens, score
+
+    def process_item(self, item):
+        problem = item['raw_input'].get('question') or item['raw_input'].get('problem') or ''
+        results = []
+        for choice in item['choices']:
+            results.append(self.process_choice(choice, problem))
+            break  # only process the first choice
+
+        tokens, switch_counts, useful_tokens, scores = zip(*results)
+
+        avg_tokens = sum(tokens) / len(tokens)
+        avg_thought_num = sum(switch_counts) / len(switch_counts)
+        avg_token_efficiency = sum(useful_tokens) / sum(tokens)
+        avg_accuracy = sum(scores) / len(scores)
+
+        return avg_tokens, avg_thought_num, avg_token_efficiency, avg_accuracy
+
+    def split_by_llm(self, response, problem) -> List[str]:
+        response = response.replace('\n', ' ')  # remove newline characters
+        prompt = self.reformat_template.format(problem=problem, response=response)
+        llm_response = request_url(self.judge_config, prompt)
+        return llm_response.split('\n\n')
+
+    def split_by_keywords(self, text) -> List[str]:
+        pattern = r'(?=\b(?:{})\b)'.format('|'.join(map(re.escape, self.switch_tokens)))
+        segments = re.split(pattern, text)
+        # remove empty segments
+        segments = [segment.strip() for segment in segments if segment.strip()]
+
+        return segments if segments else [text]
+
+    def split_by_separator(self, text) -> List[str]:
+        return text.split('\n\n')
+
+    def get_answer_index(self, response: List[str], problem: str, answer: str) -> int:
+        tagged_response = ''
+        for sdx, step in enumerate(response):
+            tagged_response += f'<paragraph_{sdx}>\n{step}\n</paragraph_{sdx}>\n\n'
+        tagged_response = tagged_response.strip()
+
+        prompt = self.critique_template.format(problem=problem, answer=answer, tagged_response=tagged_response)
+        llm_response = request_url(self.judge_config, prompt)
+        answer_index = extract_answer(llm_response)
+
+        dump_jsonl_data({'prompt': prompt, 'response': llm_response, 'answer_index': answer_index},
+                        os.path.join(self.report_path, 'answer_index.jsonl'),
+                        dump_mode='append')
+        try:
+            answer_index = int(answer_index)
+        except Exception:
+            answer_index = -1
+        return answer_index
+
+    def get_first_correct(self, response: str, problem: str, answer: str) -> str:
+        if self.split_strategies == 'llm':
+            text_list = self.split_by_llm(response, problem)
+        elif self.split_strategies == 'keywords':
+            text_list = self.split_by_keywords(response)
+        else:
+            text_list = self.split_by_separator(response)
+
+        answer_index = self.get_answer_index(text_list, problem, answer)
+
+        if answer_index == -1:  # no correct answer found
+            first_correct = ''
+        else:
+            first_correct = '\n\n'.join(text_list[: answer_index])
+        return first_correct
+
+    def plot_metrics(self, results, output_dir):
+        fig = make_subplots(rows=1, cols=len(self.metrics),
+                            subplot_titles=('Token Efficiency', 'Completion Length', 'Thought Num', 'Accuracy'),
+                            shared_xaxes=True, x_title='Subsets')
+
+
+        for i, metric in enumerate(self.metrics, start=1):
+            y_values = [results[metric][subset] for subset in self.subsets]
+            fig.add_trace(
+                go.Scatter(x=list(range(len(self.subsets))), y=y_values,
+                           mode='lines+markers',
+                           name=metric.replace('_', ' ').title()),
+                row=1, col=i
+            )
+            # Add annotations for each data point
+            for j, y in enumerate(y_values):
+                fig.add_annotation(
+                    x=j,
+                    y=y,
+                    text=f'{y:.2f}',
+                    showarrow=False,
+                    yshift=10,
+                    row=1,
+                    col=i
+                )
+
+        fig.update_layout(
+            height=500,
+            width=1500,
+            title_text=f'Evaluation Metrics for {self.model_name} on {self.dataset_name}',
+            legend=dict(orientation='h', yanchor='bottom', y=1.02, xanchor='right', x=1)
+        )
+
+        for i in range(1, len(self.metrics) + 1):
+            fig.update_xaxes(
+                ticktext=self.subsets,
+                tickvals=list(range(len(self.subsets))),
+                row=1, col=i
+            )
+            fig.update_yaxes(title_text=self.metrics[i-1].replace('_', ' ').title(), row=1, col=i)
+        # Update y-axis ranges
+        fig.update_yaxes(range=[0, 1], row=1, col=1)  # Token Efficiency
+        fig.update_yaxes(range=[0, 13], row=1, col=3)  # Switch Frequency
+        fig.update_yaxes(range=[0, 1], row=1, col=4)  # Accuracy
+
+        os.makedirs(output_dir, exist_ok=True)
+        output_path = os.path.join(output_dir, f'{self.model_name}_{self.dataset_name}_metrics.png')
+        fig.write_image(output_path)
+        print(f'save figure to: {output_path}')
+
+
+
+    def filter_df(self, df, response_len: int = 8000, count: int=10):
+        def is_valid_row(row):
+            return all(self.cal_tokens(choice['message']['content']) <= response_len for choice in row['choices'])
+
+        bools = df.apply(is_valid_row, axis=1)
+
+        return df[bools].head(count)
+
+
+    def evaluate(self, output_dir, max_tokens=8000, count=50):
+        for subset in self.subsets:
+            review_path = os.path.join(self.report_path, 'reviews', self.model_name, f'{self.dataset_name}_{subset}.jsonl')
+            review_df = pd.read_json(review_path, lines=True)
+
+            review_df = self.filter_df(review_df, response_len=max_tokens, count=count)
+
+            results = thread_map(
+                self.process_item,
+                (item for _, item in review_df.iterrows()),
+                desc=f'Evaluating {subset}',
+                total=len(review_df),
+                max_workers=16
+            )
+
+            avg_tokens, avg_thought_num, avg_token_efficiency, avg_accuracy = zip(*results)
+
+            self.subset_dict[subset]['completion_len'] = sum(avg_tokens) / len(avg_tokens)
+            self.subset_dict[subset]['thought_num'] = sum(avg_thought_num) / len(avg_thought_num)
+            self.subset_dict[subset]['token_efficiency'] = sum(avg_token_efficiency) / len(avg_token_efficiency)
+            self.subset_dict[subset]['accuracy'] = sum(avg_accuracy) / len(avg_accuracy)
+
+
+        results = {metric: {subset: self.subset_dict[subset][metric] for subset in self.subsets}
+                   for metric in self.metrics}
+
+        self.plot_metrics(results, output_dir)
+
+        return results
+
+def run_task(config, output_dir='outputs', max_tokens=8000, count=50):
+    evaluator = EvalThink(**config,)
+    results = evaluator.evaluate(output_dir, max_tokens, count)
+    print(results)
+
+judge_config = dict(
+    api_key='EMPTY',
+    base_url='http://0.0.0.0:8801/v1',
+    model_name='Qwen2.5-72B-Instruct',
+)
+
+distill_qwen_config = dict(
+    report_path = '/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250218_180219',
+    model_name = 'DeepSeek-R1-Distill-Qwen-7B',
+    tokenizer_path = 'deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',
+    dataset_name = 'math_500',
+    subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
+    split_strategies='separator',
+    judge_config=judge_config
+)
+
+math_qwen_config = dict(
+    report_path = '/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250219_202358',
+    model_name = 'Qwen2.5-Math-7B-Instruct',
+    tokenizer_path = 'Qwen/Qwen2.5-Math-7B-Instruct',
+    dataset_name = 'math_500',
+    subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
+    split_strategies='separator'
+)
+
+r1_config = dict(
+    report_path = '/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250221_104202',
+    model_name = 'deepseek-r1',
+    tokenizer_path = 'deepseek-ai/DeepSeek-R1',
+    dataset_name = 'math_500',
+    subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
+    split_strategies='separator'
+)
+
+qwq_config = dict(
+    report_path = '/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250221_105911',
+    model_name = 'qwq-32b-preview',
+    tokenizer_path = 'Qwen/QwQ-32B-Preview',
+    dataset_name = 'math_500',
+    subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
+    split_strategies='separator'
+)
+
+if __name__ == '__main__':
+    run_task(distill_qwen_config)
+    # run_task(math_qwen_config)
+    # run_task(r1_config)
+    # run_task(qwq_config)
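To make the 'keywords' split strategy above concrete, here is a standalone illustration of the same zero-width-lookahead pattern that split_by_keywords builds from switch_tokens; the sample reasoning text and the shortened token list are made up for the example.

import re

# Split a reasoning trace right before each "switch" marker, as split_by_keywords does.
switch_tokens = ['alternatively', 'but wait', 'let me reconsider']
pattern = r'(?=\b(?:{})\b)'.format('|'.join(map(re.escape, switch_tokens)))

text = 'try x = 2. but wait, that fails the second equation. alternatively, use substitution.'
segments = [s.strip() for s in re.split(pattern, text) if s.strip()]
print(segments)
# ['try x = 2.', 'but wait, that fails the second equation.', 'alternatively, use substitution.']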
evalscope/third_party/thinkbench/infer.py
ADDED

@@ -0,0 +1,100 @@
+import os
+
+from evalscope import TaskConfig, run_task
+
+
+def eval_distill_qwen():
+    model_name = 'DeepSeek-R1-Distill-Qwen-7B'
+    dataset_name = 'math_500'
+    subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5']
+
+    task_config = TaskConfig(
+        api_url='http://0.0.0.0:8801/v1/chat/completions',
+        model=model_name,
+        eval_type='service',
+        datasets=[dataset_name],
+        dataset_args={dataset_name: {'few_shot_num': 0, 'subset_list': subsets}},
+        eval_batch_size=32,
+        generation_config={
+            'max_tokens': 20000,  # avoid exceed max length
+            'temperature': 0.6,
+            'top_p': 0.95,
+            'n': 1,
+        },
+    )
+    run_task(task_config)
+
+
+def eval_math_qwen():
+    model_name = 'Qwen2.5-Math-7B-Instruct'
+    dataset_name = 'math_500'
+    subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5']
+
+    task_config = TaskConfig(
+        api_url='http://0.0.0.0:8801/v1/chat/completions',
+        model=model_name,
+        eval_type='service',
+        datasets=[dataset_name],
+        dataset_args={dataset_name: {'few_shot_num': 0, 'subset_list': subsets}},
+        eval_batch_size=32,
+        generation_config={
+            'max_tokens': 3000,  # avoid exceed max length
+            'temperature': 0.6,
+            'top_p': 0.95,
+            'n': 3,
+        },
+    )
+    run_task(task_config)
+
+def eval_r1():
+    model_name = 'deepseek-r1'
+    dataset_name = 'math_500'
+    subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5']
+
+    task_config = TaskConfig(
+        api_url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
+        api_key=os.environ['DASHSCOPE_API_KEY'],
+        model=model_name,
+        eval_type='service',
+        datasets=[dataset_name],
+        dataset_args={dataset_name: {'few_shot_num': 0, 'subset_list': subsets}},
+        eval_batch_size=3,
+        generation_config={
+            'max_tokens': 12000,  # avoid exceed max length
+            'temperature': 0.6,
+            'top_p': 0.95,
+            'n': 1,
+        },
+        limit=50,
+        use_cache='/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250221_104202'
+    )
+    run_task(task_config)
+
+def eval_qwq():
+    model_name = 'qwq-32b-preview'
+    dataset_name = 'math_500'
+    subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5']
+
+    task_config = TaskConfig(
+        api_url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
+        api_key=os.environ['DASHSCOPE_API_KEY'],
+        model=model_name,
+        eval_type='service',
+        datasets=[dataset_name],
+        dataset_args={dataset_name: {'few_shot_num': 0, 'subset_list': subsets}},
+        eval_batch_size=32,
+        generation_config={
+            'max_tokens': 8000,  # avoid exceed max length
+            'temperature': 0.6,
+            'top_p': 0.95,
+            'n': 1,
+        },
+        use_cache='/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250221_105911'
+    )
+    run_task(task_config)
+
+if __name__ == '__main__':
+    # eval_distill_qwen()
+    # eval_math_qwen()
+    # eval_r1()
+    eval_qwq()
evalscope/third_party/thinkbench/resources/critique_template.txt
ADDED

@@ -0,0 +1,17 @@
+The following is a math problem and a solution (split into paragraphs, enclosed with tags and indexed from 0):
+
+[Math Problem]
+
+{problem}
+
+[Correct Answer]
+
+{answer}
+
+[Solution]
+
+{tagged_response}
+
+Your task is to review and critique the solution paragraph by paragraph. Once you identify an correct answer in a paragraph, return the index of the paragraph where the earliest correct answer occurs. Otherwise, return the index of -1 (which typically denotes "not found").
+
+Please put your final answer (i.e., the index) in \boxed{{}}.
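For orientation (not part of the package): eval.py renders this template with str.format, which is why the final \boxed{{}} uses doubled braces. A minimal sketch with a made-up problem and a shortened inline template whose placeholder names match the file above:

# Hypothetical fill-in of the critique template; only the placeholder names are real.
template = (
    '[Math Problem]\n\n{problem}\n\n'
    '[Correct Answer]\n\n{answer}\n\n'
    '[Solution]\n\n{tagged_response}\n\n'
    'Please put your final answer (i.e., the index) in \\boxed{{}}.'
)
prompt = template.format(
    problem='What is 2 + 2?',
    answer='4',
    tagged_response='<paragraph_0>\n2 + 2 = 4.\n</paragraph_0>',
)
print('\\boxed{}' in prompt)  # True: the doubled braces survive format as literal {}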
evalscope/third_party/thinkbench/resources/reformat_template.txt
ADDED

@@ -0,0 +1,31 @@
+I will present you with a solution to a math problem. Unfortunately, the solution lacks proper paragraphing, making it hard to read. Your task is to improve readability by reformatting the solution into well-structured paragraphs. Follow these specific guidelines:
+
+* Insert \n\n for paragraph breaks within the original solution. Do **NOT** alter any content of the original solution (the only exception is for itemized lists; see below).
+
+- Each paragraph should represent a distinct, concise reasoning step that logically advances the solution.
+
+- Reasoning steps can include case discussions, formula simplifications, or formula derivations. Each of these should be treated as an individual reasoning step and paragraphed accordingly.
+
+- If an introductory analysis exists in the original solution, treat it as an initial reasoning step and place it as the first paragraph.
+
+- Do **NOT** place any mathematical formulas in their own separate paragraphs; instead, include them within the same paragraph as the preceding text to form a cohesive reasoning step.
+
+* For any itemized lists (ordered or unordered), convert them into a written format, such as "First/Second/Third." This is the **ONLY** content modification allowed.
+
+* Avoid making paragraphs too lengthy, as long paragraphs might contain multiple reasoning steps that should be paragraphed separately.
+
+* Disregard the accuracy of the solution content. Do **NOT** alter any of the original solution's content; focus solely on structuring it into logical, readable paragraphs.
+
+* Reply with the reformatted solution directly.
+
+--------------------------------------------------
+
+Here is the math problem, and the solution that needs to be reformatted:
+
+[Math Problem]
+
+{problem}
+
+[Solution]
+
+{response}