evalscope 0.12.0__py3-none-any.whl → 0.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of evalscope might be problematic.
- evalscope/arguments.py +6 -1
- evalscope/benchmarks/aime/aime24_adapter.py +3 -3
- evalscope/benchmarks/aime/aime25_adapter.py +3 -3
- evalscope/benchmarks/arc/arc_adapter.py +15 -18
- evalscope/benchmarks/bbh/bbh_adapter.py +6 -6
- evalscope/benchmarks/benchmark.py +12 -11
- evalscope/benchmarks/ceval/ceval_adapter.py +12 -16
- evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +168 -0
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +13 -17
- evalscope/benchmarks/competition_math/competition_math_adapter.py +3 -3
- evalscope/benchmarks/data_adapter.py +59 -21
- evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -1
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +9 -12
- evalscope/benchmarks/general_qa/general_qa_adapter.py +30 -15
- evalscope/benchmarks/gpqa/gpqa_adapter.py +12 -7
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +2 -3
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +23 -31
- evalscope/benchmarks/humaneval/humaneval_adapter.py +10 -7
- evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -3
- evalscope/benchmarks/iquiz/iquiz_adapter.py +9 -5
- evalscope/benchmarks/live_code_bench/__init__.py +0 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +193 -0
- evalscope/benchmarks/live_code_bench/execute_utils.py +267 -0
- evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +90 -0
- evalscope/benchmarks/live_code_bench/load_utils.py +71 -0
- evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
- evalscope/benchmarks/live_code_bench/prompts.py +207 -0
- evalscope/benchmarks/live_code_bench/testing_util.py +721 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +2 -6
- evalscope/benchmarks/mmlu/mmlu_adapter.py +13 -17
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +9 -5
- evalscope/benchmarks/musr/musr_adapter.py +8 -5
- evalscope/benchmarks/process_bench/process_bench_adapter.py +8 -5
- evalscope/benchmarks/race/race_adapter.py +12 -16
- evalscope/benchmarks/simple_qa/__init__.py +0 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +167 -0
- evalscope/benchmarks/super_gpqa/__init__.py +0 -0
- evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +89 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +191 -0
- evalscope/benchmarks/super_gpqa/utils.py +85 -0
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +3 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -4
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +6 -13
- evalscope/benchmarks/utils.py +43 -0
- evalscope/collections/evaluator.py +14 -5
- evalscope/config.py +15 -2
- evalscope/constants.py +14 -0
- evalscope/evaluator/evaluator.py +51 -13
- evalscope/metrics/llm_judge.py +104 -0
- evalscope/metrics/named_metrics.py +1 -0
- evalscope/models/__init__.py +2 -1
- evalscope/models/base_adapter.py +25 -5
- evalscope/models/chat_adapter.py +3 -0
- evalscope/models/choice_adapter.py +4 -0
- evalscope/models/custom_adapter.py +2 -0
- evalscope/models/register.py +28 -0
- evalscope/models/server_adapter.py +35 -8
- evalscope/perf/arguments.py +13 -7
- evalscope/perf/benchmark.py +5 -0
- evalscope/perf/http_client.py +15 -5
- evalscope/perf/main.py +1 -0
- evalscope/perf/utils/analysis_result.py +1 -1
- evalscope/report/app.py +3 -0
- evalscope/report/combinator.py +2 -2
- evalscope/run.py +6 -5
- evalscope/third_party/longbench_write/infer.py +1 -1
- evalscope/third_party/thinkbench/eval.py +220 -55
- evalscope/third_party/thinkbench/infer.py +37 -7
- evalscope/third_party/thinkbench/tools/llm.py +1 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +50 -20
- evalscope/utils/chat_service.py +1 -0
- evalscope/utils/filters.py +59 -0
- evalscope/utils/logger.py +3 -3
- evalscope/version.py +2 -2
- {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/METADATA +31 -12
- {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/RECORD +85 -62
- tests/cli/test_all.py +144 -0
- tests/cli/test_collection.py +28 -2
- tests/cli/test_run.py +201 -32
- {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/LICENSE +0 -0
- {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/WHEEL +0 -0
- {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/top_level.txt +0 -0
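Among the new files above are adapters for SuperGPQA, LiveCodeBench, SimpleQA and Chinese SimpleQA, plus an LLM-judge metric. The sketch below is only an illustration of how one of these benchmarks could be run through the service-evaluation path that the thinkbench scripts later in this diff also use; the endpoint, API key, model name, and the assumption that the adapter registers under the name 'super_gpqa' are placeholders, not values taken from the release.

# Hypothetical usage sketch; TaskConfig fields mirror those used elsewhere in this diff.
from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    api_url='http://127.0.0.1:8801/v1/chat/completions',  # placeholder endpoint
    api_key='EMPTY',                                      # placeholder key
    model='my-model',                                     # placeholder served model name
    eval_type='service',
    datasets=['super_gpqa'],                              # assumed registry name for the new adapter
    dataset_args={'super_gpqa': {'few_shot_num': 0}},
    generation_config={'max_tokens': 2048, 'temperature': 0.6},
)
run_task(task_cfg)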
evalscope/third_party/thinkbench/eval.py CHANGED
@@ -12,7 +12,7 @@ from typing import List
 
 from evalscope.third_party.thinkbench.tools.llm import request_url
 from evalscope.third_party.thinkbench.tools.utils import extract_answer
-from evalscope.utils.io_utils import dump_jsonl_data
+from evalscope.utils.io_utils import dict_to_json, dump_jsonl_data, json_to_dict, jsonl_to_list
 
 cur_path = os.path.dirname(os.path.abspath(__file__))
 
@@ -28,27 +28,42 @@ class EvalThink:
         self.model_name = model_name
         self.dataset_name = dataset_name
         self.subsets = subsets
-        self.metrics = ['
+        self.metrics = ['reasoning_tokens', 'first_correct_tokens', 'reflection_tokens','token_efficiency', 'thought_num', 'accuracy']
         self.split_strategies = split_strategies # split by llm, keywords, separator
         self.judge_config = judge_config
+        self.model_parse_file_path = os.path.join(self.report_path, 'answer_index.jsonl')
+        self.model_parse_dict = self.__init_parse_file()
 
-
-
-
-
+    def __init_parse_file(self):
+        if not os.path.exists(self.model_parse_file_path):
+            return {}
+        else:
+            list_file = jsonl_to_list(self.model_parse_file_path)
+            # convert to dict prompt as key, answer_index as value
+            return {item['prompt']: item['answer_index'] for item in list_file}
+
+    def get_think_part(self, message: dict) -> str:
+        if 'reasoning_content' in message and message['reasoning_content']:
+            return message['reasoning_content']
+        else:
+            text = message['content']
+            last_think_end = text.rfind(self.think_end_token)
+            return text[:last_think_end]
 
     @lru_cache(maxsize=None)
     def cal_tokens(self, text: str):
         return len(self.tokenizer.encode(text, add_special_tokens=False))
 
     def process_choice(self, choice, problem):
-        think_part = self.get_think_part(choice['message']
+        think_part = self.get_think_part(choice['message'])
         answer = choice['review']['gold']
         tokens = self.cal_tokens(think_part)
-        switch_count = sum(think_part.count(token) for token in self.switch_tokens)
+        switch_count = sum(think_part.lower().count(token) for token in self.switch_tokens)
         useful_tokens = self.cal_tokens(self.get_first_correct(think_part, problem, answer))
-
-
+        reflection_tokens = tokens - useful_tokens
+        # score = choice['review']['result']
+        score = 0 if useful_tokens == 0 else 1
+        return tokens, switch_count, useful_tokens, reflection_tokens, score
 
     def process_item(self, item):
         problem = item['raw_input'].get('question') or item['raw_input'].get('problem') or ''
@@ -57,14 +72,15 @@ class EvalThink:
             results.append(self.process_choice(choice, problem))
             break # only process the first choice
 
-
+        total_tokens, switch_counts, useful_tokens, reflection_tokens, scores = zip(*results)
 
-        avg_tokens = sum(
+        avg_tokens = sum(total_tokens) / len(total_tokens)
         avg_thought_num = sum(switch_counts) / len(switch_counts)
-        avg_token_efficiency = sum(useful_tokens) / sum(
+        avg_token_efficiency = sum(useful_tokens) / sum(total_tokens)
         avg_accuracy = sum(scores) / len(scores)
-
-
+        avg_useful_tokens = sum(useful_tokens) / len(useful_tokens)
+        avg_reflection_tokens = sum(reflection_tokens) / len(reflection_tokens)
+        return avg_tokens, avg_thought_num, avg_token_efficiency, avg_accuracy, avg_useful_tokens, avg_reflection_tokens
 
     def split_by_llm(self, response, problem) -> List[str]:
         response = response.replace('\n', ' ') # remove newline characters
@@ -90,12 +106,17 @@ class EvalThink:
         tagged_response = tagged_response.strip()
 
         prompt = self.critique_template.format(problem=problem, answer=answer, tagged_response=tagged_response)
-
-
-
-
-
-
+        if prompt in self.model_parse_dict:
+            answer_index = self.model_parse_dict[prompt]
+        else:
+            llm_response = request_url(self.judge_config, prompt)
+            if not llm_response:
+                answer_index = -1
+            else:
+                answer_index = extract_answer(llm_response)
+
+            dump_jsonl_data({'prompt': prompt, 'response': llm_response, 'answer_index': answer_index},
+                            self.model_parse_file_path, dump_mode='append')
         try:
             answer_index = int(answer_index)
         except Exception:
@@ -119,18 +140,27 @@ class EvalThink:
         return first_correct
 
     def plot_metrics(self, results, output_dir):
-
-
-
-
-
-
+        # Change layout to 2x3
+        fig = make_subplots(rows=2, cols=3,
+                            subplot_titles=('Reasoning Tokens', 'First Correct Tokens', 'Reflection Tokens',
+                                            'Token Efficiency', 'Thought Num', 'Accuracy'),
+                            shared_xaxes=True, x_title='Subsets',
+                            vertical_spacing=0.1, # Decrease vertical spacing between subplots
+                            horizontal_spacing=0.1) # Decrease horizontal spacing between subplots
+
+        metrics_order = ['reasoning_tokens', 'first_correct_tokens', 'reflection_tokens',
+                         'token_efficiency', 'thought_num', 'accuracy']
+
+        for i, metric in enumerate(metrics_order, start=1):
             y_values = [results[metric][subset] for subset in self.subsets]
+            # Determine row and column for 2x3 layout
+            row = (i - 1) // 3 + 1
+            col = (i - 1) % 3 + 1
             fig.add_trace(
                 go.Scatter(x=list(range(len(self.subsets))), y=y_values,
                            mode='lines+markers',
                            name=metric.replace('_', ' ').title()),
-                row=
+                row=row, col=col
             )
             # Add annotations for each data point
            for j, y in enumerate(y_values):
@@ -140,28 +170,34 @@ class EvalThink:
                    text=f'{y:.2f}',
                    showarrow=False,
                    yshift=10,
-                    row=
-                    col=
+                    row=row,
+                    col=col
                )
 
        fig.update_layout(
-            height=
-            width=
+            height=800, # Adjust height for 2x3 layout
+            width=1200, # Adjust width for 2x3 layout
            title_text=f'Evaluation Metrics for {self.model_name} on {self.dataset_name}',
            legend=dict(orientation='h', yanchor='bottom', y=1.02, xanchor='right', x=1)
        )
 
-        for i in range(1, len(
+        for i in range(1, len(metrics_order) + 1):
+            row = (i - 1) // 3 + 1
+            col = (i - 1) % 3 + 1
            fig.update_xaxes(
                ticktext=self.subsets,
                tickvals=list(range(len(self.subsets))),
-                row=
+                row=row, col=col
            )
-            fig.update_yaxes(title_text=
+            fig.update_yaxes(title_text=metrics_order[i-1].replace('_', ' ').title(), row=row, col=col)
+
        # Update y-axis ranges
-        fig.update_yaxes(range=[
-        fig.update_yaxes(range=[0,
-        fig.update_yaxes(range=[0,
+        fig.update_yaxes(range=[500, 5000], row=1, col=1) # Reasoning Tokens
+        fig.update_yaxes(range=[0, 3000], row=1, col=2) # First Correct Tokens
+        fig.update_yaxes(range=[0, 3000], row=1, col=3) # Reflection Tokens
+        fig.update_yaxes(range=[0, 1], row=2, col=1) # Token Efficiency
+        fig.update_yaxes(range=[0, 13], row=2, col=2) # Thought Num
+        fig.update_yaxes(range=[0, 1], row=2, col=3) # Accuracy
 
        os.makedirs(output_dir, exist_ok=True)
        output_path = os.path.join(output_dir, f'{self.model_name}_{self.dataset_name}_metrics.png')
@@ -179,7 +215,7 @@ class EvalThink:
         return df[bools].head(count)
 
 
-    def evaluate(self, output_dir, max_tokens=8000, count=50):
+    def evaluate(self, output_dir, max_tokens=8000, count=50, workers=128):
         for subset in self.subsets:
             review_path = os.path.join(self.report_path, 'reviews', self.model_name, f'{self.dataset_name}_{subset}.jsonl')
             review_df = pd.read_json(review_path, lines=True)
@@ -191,15 +227,17 @@ class EvalThink:
                 (item for _, item in review_df.iterrows()),
                 desc=f'Evaluating {subset}',
                 total=len(review_df),
-                max_workers=
+                max_workers=workers
             )
 
-            avg_tokens, avg_thought_num, avg_token_efficiency, avg_accuracy = zip(*results)
+            avg_tokens, avg_thought_num, avg_token_efficiency, avg_accuracy, avg_useful_tokens, avg_reflection_tokens = zip(*results)
 
-            self.subset_dict[subset]['
+            self.subset_dict[subset]['reasoning_tokens'] = sum(avg_tokens) / len(avg_tokens)
             self.subset_dict[subset]['thought_num'] = sum(avg_thought_num) / len(avg_thought_num)
             self.subset_dict[subset]['token_efficiency'] = sum(avg_token_efficiency) / len(avg_token_efficiency)
             self.subset_dict[subset]['accuracy'] = sum(avg_accuracy) / len(avg_accuracy)
+            self.subset_dict[subset]['first_correct_tokens'] = sum(avg_useful_tokens) / len(avg_useful_tokens)
+            self.subset_dict[subset]['reflection_tokens'] = sum(avg_reflection_tokens) / len(avg_reflection_tokens)
 
 
         results = {metric: {subset: self.subset_dict[subset][metric] for subset in self.subsets}
@@ -207,13 +245,111 @@ class EvalThink:
 
         self.plot_metrics(results, output_dir)
 
+        # save results to json
+        dict_to_json(results, os.path.join(self.report_path, f'think_eval_results.json'))
         return results
 
-def run_task(config, output_dir='outputs', max_tokens=8000, count=50):
+def run_task(config, output_dir='outputs', max_tokens=8000, count=50, workers=128):
     evaluator = EvalThink(**config,)
-    results = evaluator.evaluate(output_dir, max_tokens, count)
+    results = evaluator.evaluate(output_dir, max_tokens, count, workers)
     print(results)
 
+def combine_results(configs: List[dict], output_path: str):
+    """
+    Combine evaluation results from multiple model configs into one plot.
+    All models' results for the same metric will be shown in the same subplot for easy comparison.
+
+    Args:
+        configs: List of model config dicts containing model_name and report_path
+    """
+    # Combine results from different runs
+    combined_results = defaultdict(lambda: defaultdict(dict))
+    for config in configs:
+        model_name = config['model_name']
+        report_path = config['report_path']
+        # Results is a dict with metric as key and subset as value
+        results = json_to_dict(os.path.join(report_path, f'think_eval_results.json'))
+        combined_results[model_name] = results
+
+    # Create a 2x3 subplot layout, one subplot per metric
+    fig = make_subplots(rows=2, cols=3,
+                        subplot_titles=('Reasoning Tokens', 'First Correct Tokens', 'Reflection Tokens',
+                                        'Token Efficiency', 'Thought Num', 'Accuracy'),
+                        shared_xaxes=True, x_title='Subsets',
+                        vertical_spacing=0.08, # Reduce vertical spacing
+                        horizontal_spacing=0.05) # Reduce horizontal spacing
+
+    metrics_order = ['reasoning_tokens', 'first_correct_tokens', 'reflection_tokens',
+                     'token_efficiency', 'thought_num', 'accuracy']
+
+    # Assign different colors for each model
+    colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b']
+
+    # Plot each metric in a separate subplot
+    for i, metric in enumerate(metrics_order, start=1):
+        row = (i - 1) // 3 + 1
+        col = (i - 1) % 3 + 1
+
+        # Get subsets from first model (assuming all models have same subsets)
+        subsets = list(next(iter(combined_results.values()))[metric].keys())
+
+        # Add all models' data for this metric to the same subplot
+        for j, (model_name, results) in enumerate(combined_results.items()):
+            y_values = [results[metric][subset] for subset in subsets]
+
+            fig.add_trace(
+                go.Scatter(x=subsets, y=y_values,
+                           mode='lines+markers',
+                           name=model_name, # Just model name since metrics are shown in subplot titles
+                           line=dict(color=colors[j % len(colors)]),
+                           showlegend=(i == 1)), # Only show legend for first metric
+                row=row, col=col
+            )
+
+            # Add value annotations
+            for k, y in enumerate(y_values):
+                fig.add_annotation(
+                    x=subsets[k],
+                    y=y,
+                    text=f'{y:.2f}',
+                    showarrow=False,
+                    yshift=10,
+                    font=dict(size=12, color=colors[j % len(colors)]),
+                    row=row, col=col
+                )
+
+        # Update axis ranges and labels based on metric type
+        # if metric == 'token_efficiency':
+        #     fig.update_yaxes(range=[0.2, 0.7], row=row, col=col)
+        # elif metric == 'accuracy':
+        #     fig.update_yaxes(range=[0.8, 1], row=row, col=col)
+
+        fig.update_yaxes(title_text=metric.replace('_', ' ').title(), row=row, col=col)
+
+    # Update layout
+    fig.update_layout(
+        height=1000, # Increase height
+        width=1500, # Increase width
+        title_text=f'Model Comparison Across Evaluation Metrics on MATH-500',
+        title=dict(font=dict(size=22)), # Larger title font
+        font=dict(size=14), # Larger overall font
+        legend=dict(
+            orientation='h',
+            yanchor='bottom',
+            y=1.02,
+            xanchor='right',
+            x=1,
+            font=dict(size=14) # Larger legend font
+        )
+    )
+
+    # Save plot
+    os.makedirs('outputs', exist_ok=True)
+    fig.write_image(output_path)
+    print(f'Model comparison plot saved to {output_path}')
+
+    return combined_results
+
 judge_config = dict(
     api_key='EMPTY',
     base_url='http://0.0.0.0:8801/v1',
@@ -221,7 +357,7 @@ judge_config = dict(
 )
 
 distill_qwen_config = dict(
-    report_path = '
+    report_path = './outputs/20250218_180219',
     model_name = 'DeepSeek-R1-Distill-Qwen-7B',
     tokenizer_path = 'deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',
     dataset_name = 'math_500',
@@ -231,34 +367,63 @@ distill_qwen_config = dict(
 )
 
 math_qwen_config = dict(
-    report_path = '
+    report_path = './outputs/20250219_202358',
     model_name = 'Qwen2.5-Math-7B-Instruct',
     tokenizer_path = 'Qwen/Qwen2.5-Math-7B-Instruct',
     dataset_name = 'math_500',
     subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
-    split_strategies='separator'
+    split_strategies='separator',
+    judge_config=judge_config
 )
 
 r1_config = dict(
-    report_path = '
+    report_path = './outputs/20250307_000404',
     model_name = 'deepseek-r1',
     tokenizer_path = 'deepseek-ai/DeepSeek-R1',
     dataset_name = 'math_500',
     subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
-    split_strategies='separator'
+    split_strategies='separator',
+    judge_config=judge_config
 )
 
-
-    report_path = '
+qwq_preview_config = dict(
+    report_path = './outputs/20250221_105911',
     model_name = 'qwq-32b-preview',
     tokenizer_path = 'Qwen/QwQ-32B-Preview',
     dataset_name = 'math_500',
     subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
-    split_strategies='separator'
+    split_strategies='separator',
+    judge_config=judge_config
+)
+
+qwq_config = dict(
+    report_path = './outputs/20250306_181550',
+    model_name = 'QwQ-32B',
+    tokenizer_path = 'Qwen/QwQ-32B',
+    dataset_name = 'math_500',
+    subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
+    split_strategies='separator',
+    judge_config=judge_config
+)
+
+distill_qwen_32b = dict(
+    report_path = './outputs/20250306_235951',
+    model_name = 'deepseek-r1-distill-qwen-32b',
+    tokenizer_path = 'deepseek-ai/DeepSeek-R1-Distill-Qwen-32B',
+    dataset_name = 'math_500',
+    subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
+    split_strategies='separator',
+    judge_config=judge_config
 )
 
 if __name__ == '__main__':
-    run_task(distill_qwen_config)
+    # run_task(distill_qwen_config, count=80)
     # run_task(math_qwen_config)
-    # run_task(
-    # run_task(
+    # run_task(qwq_preview_config, max_tokens=20000, count=200, workers=128)
+    # run_task(r1_config, max_tokens=20000, count=200, workers=128)
+    # run_task(qwq_config, max_tokens=20000, count=200, workers=128)
+    # run_task(distill_qwen_32b, max_tokens=20000, count=200, workers=128)
+
+    # combine_results([qwq_config, r1_config, qwq_preview_config, distill_qwen_32b], output_path='outputs/model_comparison_metrics.png')
+    # combine_results([qwq_config, r1_config, distill_qwen_32b], output_path='outputs/model_comparison_metrics_3models.png')
+    combine_results([distill_qwen_config, math_qwen_config, qwq_config, r1_config, qwq_preview_config, distill_qwen_32b], output_path='outputs/model_comparison_metrics_6models.png')
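For orientation, the six metrics reported by the updated EvalThink reduce to simple ratios over per-choice token counts. Below is a minimal standalone sketch of that aggregation with illustrative names, not the adapter's own code; `per_choice` is a hypothetical list of (reasoning_tokens, thought_num, first_correct_tokens) tuples.

# Illustrative aggregation mirroring process_choice()/process_item() above.
def aggregate(per_choice):
    reasoning, thoughts, first_correct = zip(*per_choice)
    reflection = [r - f for r, f in zip(reasoning, first_correct)]
    n = len(per_choice)
    return {
        'reasoning_tokens': sum(reasoning) / n,
        'first_correct_tokens': sum(first_correct) / n,
        'reflection_tokens': sum(reflection) / n,
        'token_efficiency': sum(first_correct) / sum(reasoning),  # useful tokens / total think tokens
        'thought_num': sum(thoughts) / n,
        'accuracy': sum(1 for f in first_correct if f > 0) / n,   # score = 0 when no correct prefix found
    }

print(aggregate([(1200, 5, 400), (900, 3, 0)]))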
evalscope/third_party/thinkbench/infer.py CHANGED
@@ -2,6 +2,7 @@ import os
 
 from evalscope import TaskConfig, run_task
 
+DASHSCOPE_API_KEY = 'sk-723135c241x'
 
 def eval_distill_qwen():
     model_name = 'DeepSeek-R1-Distill-Qwen-7B'
@@ -53,20 +54,48 @@ def eval_r1():
 
     task_config = TaskConfig(
         api_url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
-        api_key=
+        api_key=DASHSCOPE_API_KEY,
         model=model_name,
         eval_type='service',
         datasets=[dataset_name],
         dataset_args={dataset_name: {'few_shot_num': 0, 'subset_list': subsets}},
-        eval_batch_size=
+        eval_batch_size=8,
+        generation_config={
+            'max_tokens': 20000, # avoid exceed max length
+            'temperature': 0.6,
+            'top_p': 0.95,
+            'n': 1,
+        },
+        use_cache='./outputs/20250307_000404',
+        timeout=36000,
+        stream=True
+    )
+    run_task(task_config)
+
+
+def eval_distill_32b():
+    model_name = 'deepseek-r1-distill-qwen-32b'
+    dataset_name = 'math_500'
+    subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5']
+
+    task_config = TaskConfig(
+        api_url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
+        api_key=DASHSCOPE_API_KEY,
+        model=model_name,
+        eval_type='service',
+        datasets=[dataset_name],
+        dataset_args={dataset_name: {'few_shot_num': 0, 'subset_list': subsets}},
+        eval_batch_size=5,
         generation_config={
             'max_tokens': 12000, # avoid exceed max length
             'temperature': 0.6,
             'top_p': 0.95,
             'n': 1,
         },
-
-
+        use_cache='./outputs/20250306_235951',
+        timeout=32000,
+        stream=True
+
     )
     run_task(task_config)
 
@@ -89,12 +118,13 @@ def eval_qwq():
             'top_p': 0.95,
             'n': 1,
         },
-        use_cache='
+        use_cache='./outputs/20250221_105911'
     )
     run_task(task_config)
 
 if __name__ == '__main__':
     # eval_distill_qwen()
     # eval_math_qwen()
-
-    eval_qwq()
+    eval_r1()
+    # eval_qwq()
+    # eval_distill_32b()
evalscope/third_party/toolbench_static/llm/swift_infer.py CHANGED
@@ -1,37 +1,67 @@
-
+import os
 from dataclasses import dataclass
-from swift.llm import
-from swift.utils import seed_everything
-
-# TODO: Support custom model for swift infer
+from swift.llm import InferEngine, InferRequest, PtEngine, RequestConfig, get_template
 
+# set the GPU environment variable
+os.environ['CUDA_VISIBLE_DEVICES'] = '0'
 
 @dataclass
 class SwiftInferArgs:
     model_id_or_path: str
     model_type: str
+    infer_backend: str = 'vllm' # one of 'pt', 'vllm', 'lmdeploy'
     max_new_tokens: int = 2048
-
+    temperature: float = 0.1
+    max_batch_size: int = 16
 
 class SwiftInfer:
 
     def __init__(self, args: SwiftInferArgs):
-
-
-
-
-
-
+        # initialize the model for the chosen infer backend
+        if args.infer_backend == 'pt':
+            self.engine: InferEngine = PtEngine(args.model_id_or_path, max_batch_size=args.max_batch_size)
+        elif args.infer_backend == 'vllm':
+            from swift.llm import VllmEngine
+            self.engine: InferEngine = VllmEngine(args.model_id_or_path, max_model_len=8192)
+        elif args.infer_backend == 'lmdeploy':
+            from swift.llm import LmdeployEngine
+            self.engine: InferEngine = LmdeployEngine(args.model_id_or_path)
+        else:
+            raise ValueError(f'Unsupported infer_backend: {args.infer_backend}')
 
-
-
-
-
-
-
+        # basic request configuration (optional)
+        self.request_config = RequestConfig(
+            max_tokens=args.max_new_tokens,
+            temperature=args.temperature,
+            stream=False # can be set to True for streaming inference
+        )
 
     def predict(self, system: str, query: str, history: list):
+        # In the Swift 3.0 standard interface, messages are passed as:
+        # messages: [{"role": "system", "content": "<SYSTEM_PROMPT>"},
+        #            {"role": "user", "content": "<user question>"},
+        #            {"role": "assistant", "content": "<assistant answer>"}, ...]
+
+        messages = []
+        if system.strip():
+            messages.append({'role': 'system', 'content': system})
+
+        # append the conversation history to the messages
+        for qa_pair in history:
+            # assumes each element of history looks like ("user input", "model response"); adjust to your data format
+            user_answer, model_response = qa_pair
+            messages.append({'role': 'user', 'content': user_answer})
+            messages.append({'role': 'assistant', 'content': model_response})
+
+        # add the current user question
+        messages.append({'role': 'user', 'content': query})
+
+        infer_request = InferRequest(messages=messages)
+
+        # run inference
+        response = self.engine.infer([infer_request], self.request_config)
 
-
+        # extract the returned text (assumes non-stream mode)
+        result_text = response[0].choices[0].message.content.strip()
 
-        return
+        return result_text
evalscope/utils/chat_service.py CHANGED
evalscope/utils/filters.py ADDED
@@ -0,0 +1,59 @@
+import re
+from typing import Any, Callable, Dict
+
+
+class Filter:
+    """
+    A base Filter class that implements the registry pattern
+    """
+    _registry: Dict[str, Callable[[str, Any], str]] = {}
+
+    @classmethod
+    def register(cls, name: str) -> Callable:
+        """
+        Decorator to register a new filter function
+        """
+
+        def decorator(func: Callable[[str, Any], str]) -> Callable[[str, Any], str]:
+            cls._registry[name] = func
+            return func
+
+        return decorator
+
+    @classmethod
+    def get_filter(cls, name: str) -> Callable:
+        """
+        Get a registered filter by name
+        """
+        return cls._registry.get(name)
+
+    @classmethod
+    def apply(cls, name: str, value: str, *args, **kwargs) -> str:
+        """
+        Apply a registered filter to a value
+        """
+        filter_func = cls.get_filter(name)
+        if filter_func is None:
+            raise ValueError(f'Filter {name} not found')
+        return filter_func(value, *args, **kwargs)
+
+
+@Filter.register('remove_until')
+def remove_until(value: str, marker: str) -> str:
+    """
+    Remove everything before the last occurrence of marker
+    """
+    if marker not in value:
+        return value
+    return value[value.rindex(marker) + len(marker):]
+
+
+@Filter.register('extract')
+def extract(value: str, pattern: str) -> str:
+    """
+    Extract content from string using regex pattern
+    """
+    match = re.search(pattern, value)
+    if match:
+        return match.group(0)
+    return ''