evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic.

Files changed (302)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +3 -0
  3. evalscope/api/benchmark/adapters/__init__.py +5 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
  8. evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
  9. evalscope/api/benchmark/benchmark.py +356 -0
  10. evalscope/api/benchmark/meta.py +121 -0
  11. evalscope/api/dataset/__init__.py +2 -0
  12. evalscope/api/dataset/dataset.py +349 -0
  13. evalscope/api/dataset/loader.py +262 -0
  14. evalscope/api/dataset/utils.py +143 -0
  15. evalscope/api/evaluator/__init__.py +3 -0
  16. evalscope/api/evaluator/cache.py +378 -0
  17. evalscope/api/evaluator/evaluator.py +56 -0
  18. evalscope/api/evaluator/state.py +275 -0
  19. evalscope/api/filter/__init__.py +1 -0
  20. evalscope/api/filter/filter.py +72 -0
  21. evalscope/api/messages/__init__.py +12 -0
  22. evalscope/api/messages/chat_message.py +243 -0
  23. evalscope/api/messages/content.py +102 -0
  24. evalscope/api/messages/utils.py +35 -0
  25. evalscope/api/metric/__init__.py +2 -0
  26. evalscope/api/metric/metric.py +55 -0
  27. evalscope/api/metric/scorer.py +113 -0
  28. evalscope/api/mixin/__init__.py +1 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +155 -0
  32. evalscope/api/model/model.py +386 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/app.py +3 -0
  40. evalscope/app/ui/app_ui.py +2 -1
  41. evalscope/app/ui/multi_model.py +50 -25
  42. evalscope/app/ui/single_model.py +26 -14
  43. evalscope/app/utils/data_utils.py +43 -27
  44. evalscope/app/utils/env_utils.py +12 -0
  45. evalscope/app/utils/text_utils.py +14 -14
  46. evalscope/app/utils/visualization.py +9 -4
  47. evalscope/arguments.py +7 -10
  48. evalscope/backend/opencompass/api_meta_template.py +2 -1
  49. evalscope/backend/opencompass/backend_manager.py +6 -5
  50. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  51. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  52. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  53. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  55. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  56. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  57. evalscope/backend/rag_eval/utils/embedding.py +10 -1
  58. evalscope/backend/rag_eval/utils/llm.py +13 -12
  59. evalscope/benchmarks/__init__.py +0 -2
  60. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  61. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  62. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  63. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  64. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  65. evalscope/benchmarks/arena_hard/utils.py +37 -1
  66. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  67. evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
  68. evalscope/benchmarks/bfcl/generation.py +222 -0
  69. evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
  70. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  71. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  72. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  73. evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
  74. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  75. evalscope/benchmarks/docmath/utils.py +4 -5
  76. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  77. evalscope/benchmarks/frames/frames_adapter.py +136 -52
  78. evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
  79. evalscope/benchmarks/general_arena/utils.py +23 -27
  80. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  81. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  82. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  83. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  84. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  85. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  86. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  87. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  88. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  89. evalscope/benchmarks/ifeval/instructions.py +109 -64
  90. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  91. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  92. evalscope/benchmarks/ifeval/utils.py +6 -7
  93. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  94. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  95. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  96. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/math_vista/__init__.py +0 -0
  105. evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
  106. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  107. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  108. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  109. evalscope/benchmarks/mmmu/__init__.py +0 -0
  110. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  111. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  112. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
  113. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  114. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
  115. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  116. evalscope/benchmarks/race/race_adapter.py +33 -119
  117. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  118. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  119. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  120. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  121. evalscope/benchmarks/tau_bench/generation.py +147 -0
  122. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
  123. evalscope/benchmarks/text2image/__init__.py +0 -0
  124. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  125. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  126. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  127. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  128. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  129. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  131. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
  132. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  133. evalscope/cli/cli.py +2 -0
  134. evalscope/cli/start_app.py +7 -1
  135. evalscope/cli/start_perf.py +7 -1
  136. evalscope/cli/start_server.py +6 -3
  137. evalscope/collections/__init__.py +2 -10
  138. evalscope/collections/sampler.py +10 -10
  139. evalscope/collections/schema.py +13 -11
  140. evalscope/config.py +157 -57
  141. evalscope/constants.py +37 -61
  142. evalscope/evaluator/__init__.py +1 -1
  143. evalscope/evaluator/evaluator.py +275 -419
  144. evalscope/filters/__init__.py +2 -0
  145. evalscope/filters/extraction.py +126 -0
  146. evalscope/filters/selection.py +57 -0
  147. evalscope/metrics/__init__.py +13 -13
  148. evalscope/metrics/llm_judge.py +47 -33
  149. evalscope/metrics/math_parser.py +27 -22
  150. evalscope/metrics/metric.py +307 -0
  151. evalscope/metrics/metrics.py +22 -18
  152. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  153. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  154. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  155. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  156. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  157. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  158. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  159. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  160. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  162. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  163. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  184. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  185. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  186. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  187. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  188. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  189. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  190. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  191. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  192. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  193. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  194. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  195. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  196. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  197. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  198. evalscope/models/__init__.py +6 -29
  199. evalscope/models/image_edit_model.py +125 -0
  200. evalscope/models/mockllm.py +65 -0
  201. evalscope/models/model_apis.py +67 -0
  202. evalscope/models/modelscope.py +455 -0
  203. evalscope/models/openai_compatible.py +126 -0
  204. evalscope/models/text2image_model.py +124 -0
  205. evalscope/models/utils/openai.py +701 -0
  206. evalscope/perf/benchmark.py +4 -1
  207. evalscope/perf/http_client.py +4 -2
  208. evalscope/perf/plugin/api/custom_api.py +5 -4
  209. evalscope/perf/plugin/api/openai_api.py +11 -9
  210. evalscope/perf/plugin/datasets/custom.py +2 -1
  211. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  212. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  213. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  214. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  215. evalscope/perf/plugin/datasets/openqa.py +4 -2
  216. evalscope/perf/utils/benchmark_util.py +15 -10
  217. evalscope/perf/utils/db_util.py +9 -6
  218. evalscope/perf/utils/local_server.py +11 -3
  219. evalscope/perf/utils/rich_display.py +16 -10
  220. evalscope/report/__init__.py +2 -3
  221. evalscope/report/combinator.py +18 -12
  222. evalscope/report/generator.py +51 -35
  223. evalscope/report/{utils.py → report.py} +8 -6
  224. evalscope/run.py +33 -47
  225. evalscope/summarizer.py +1 -1
  226. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  227. evalscope/utils/__init__.py +21 -2
  228. evalscope/utils/chat_service.py +3 -2
  229. evalscope/utils/deprecation_utils.py +12 -1
  230. evalscope/utils/function_utils.py +29 -0
  231. evalscope/utils/import_utils.py +23 -1
  232. evalscope/utils/io_utils.py +142 -6
  233. evalscope/utils/json_schema.py +208 -0
  234. evalscope/utils/logger.py +51 -12
  235. evalscope/utils/model_utils.py +11 -7
  236. evalscope/utils/multi_choices.py +288 -0
  237. evalscope/utils/url_utils.py +65 -0
  238. evalscope/version.py +2 -2
  239. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
  240. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
  241. tests/benchmark/test_eval.py +385 -0
  242. tests/benchmark/test_image_edit.py +65 -0
  243. tests/{aigc → benchmark}/test_t2i.py +22 -4
  244. tests/benchmark/test_vlm.py +80 -0
  245. tests/cli/test_all.py +85 -47
  246. tests/cli/test_collection.py +20 -8
  247. tests/cli/test_custom.py +22 -15
  248. tests/cli/test_reasoning.py +81 -0
  249. tests/common.py +73 -0
  250. tests/perf/test_perf.py +4 -2
  251. tests/rag/test_clip_benchmark.py +0 -2
  252. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  253. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
  254. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
  255. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
  256. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
  257. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
  258. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  259. evalscope/benchmarks/benchmark.py +0 -81
  260. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  261. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  262. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  263. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  264. evalscope/benchmarks/data_adapter.py +0 -528
  265. evalscope/benchmarks/filters.py +0 -59
  266. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  267. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  268. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  269. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  270. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  271. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  272. evalscope/benchmarks/race/race.py +0 -104
  273. evalscope/benchmarks/race/samples.jsonl +0 -5
  274. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  275. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  276. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  277. evalscope/benchmarks/utils.py +0 -60
  278. evalscope/collections/evaluator.py +0 -375
  279. evalscope/metrics/completion_parsers.py +0 -227
  280. evalscope/metrics/named_metrics.py +0 -55
  281. evalscope/models/adapters/__init__.py +0 -14
  282. evalscope/models/adapters/base_adapter.py +0 -84
  283. evalscope/models/adapters/bfcl_adapter.py +0 -246
  284. evalscope/models/adapters/chat_adapter.py +0 -207
  285. evalscope/models/adapters/choice_adapter.py +0 -222
  286. evalscope/models/adapters/custom_adapter.py +0 -71
  287. evalscope/models/adapters/server_adapter.py +0 -236
  288. evalscope/models/adapters/t2i_adapter.py +0 -79
  289. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  290. evalscope/models/custom/__init__.py +0 -4
  291. evalscope/models/custom/custom_model.py +0 -50
  292. evalscope/models/custom/dummy_model.py +0 -99
  293. evalscope/models/local_model.py +0 -128
  294. evalscope/models/register.py +0 -41
  295. tests/cli/test_run.py +0 -489
  296. /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
  297. /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
  298. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
  299. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
  300. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
  301. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
  302. /tests/{aigc → benchmark}/__init__.py +0 -0
evalscope/benchmarks/bfcl/generation.py (new file)
@@ -0,0 +1,222 @@
+ import json
+ import time
+ from typing import Any
+
+ from evalscope.api.dataset import Sample
+ from evalscope.api.messages import dict_to_chat_message
+ from evalscope.api.model import ChatCompletionChoice, Model, ModelOutput, ModelUsage
+ from evalscope.api.tool.tool_info import ToolInfo
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+
+ def predict(model: Model, sample: Sample) -> ModelOutput:
+     """Main prediction function for BFCL using the new API framework."""
+     # Extract the row data from sample metadata
+     row = sample.metadata
+     is_fc_model = row.get('is_fc_model', False)
+
+     if is_fc_model:
+         response, model_usage = generate_turn_with_tools(model, row)
+     else:
+         response, model_usage = generate_turn(model, row)
+
+     sample.metadata['generation'] = response
+     # wrap response with openai types
+     return ModelOutput(
+         model=model.name,
+         choices=[ChatCompletionChoice.from_content(json.dumps(response, ensure_ascii=False, indent=2))],
+         model_usage=model_usage,
+         time=time.time()
+     )
+
+
+ def generate_turn(model: Model, row: dict[str, Any]):
+     from bfcl_eval.constants.default_prompts import (
+         DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_PROMPTING,
+         MAXIMUM_STEP_LIMIT,
+     )
+     from bfcl_eval.eval_checker.multi_turn_eval.multi_turn_utils import execute_multi_turn_func_call
+     from bfcl_eval.model_handler.utils import default_decode_execute_prompting
+
+     all_model_responses = []
+     current_messages = []
+     turns = row['turns']
+     model_usage = ModelUsage()
+
+     for turn_idx, messages in enumerate(turns):
+         n_steps = 0
+         current_responses = []
+         current_messages += messages.copy()
+
+         if str(turn_idx) in row['missing_functions']:
+             assert len(messages) == 0, 'Holdout turn should not have user message.'
+             new_turn = [{
+                 'role':
+                 'user',
+                 'content':
+                 DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_PROMPTING.format(
+                     functions=row['missing_functions'][str(turn_idx)]
+                 ),
+             }]
+             current_messages += new_turn
+
+         while True:
+             # Create a sample for the current messages
+             from evalscope.api.messages.chat_message import dict_to_chat_message
+             chat_messages = [dict_to_chat_message(msg) for msg in current_messages]
+
+             # Get model response using generate method
+             model_output = model.generate(chat_messages)
+
+             # Handle the response based on the model output structure
+             message = model_output.message
+             model_usage += model_output.usage
+
+             current_messages.append(message)
+             if isinstance(message, str):
+                 result = message
+             else:
+                 result = message.text
+
+             logger.debug(f'Turn:{turn_idx} Step:{n_steps} Result: {result}')
+             current_responses.append(result)
+
+             execute_tools = row.get('should_execute_tool_calls', False)
+             if execute_tools:
+                 try:
+                     tool_calls = default_decode_execute_prompting(result)
+                 except Exception:
+                     tool_calls = None
+
+                 if tool_calls is None:
+                     break
+
+                 tool_outputs, _ = execute_multi_turn_func_call(
+                     tool_calls,
+                     initial_config=row['initial_config'],
+                     involved_classes=row['involved_classes'],
+                     model_name='evaluator_loop',
+                     test_entry_id=row['id'],
+                     long_context=('long_context' in row['test_category'] or 'composite' in row['test_category']),
+                     is_evaL_run=False,
+                 )
+                 # Append tool outputs to the current messages
+                 tool_results = []
+                 for tool_output, tool_call in zip(tool_outputs, tool_calls):
+                     tool_results.append({'role': 'tool', 'name': tool_call, 'content': tool_output})
+                 current_messages.append({
+                     'role': 'user',
+                     'content': repr(tool_results),
+                 })
+             else:
+                 break
+
+             n_steps += 1
+             if n_steps > MAXIMUM_STEP_LIMIT:
+                 logger.error(f'INFERENCE_ERROR: Exceeded max inference steps ({MAXIMUM_STEP_LIMIT})')
+                 break
+
+         all_model_responses.append(current_responses)
+
+     return all_model_responses, model_usage
+
+
+ def generate_turn_with_tools(model: Model, row: dict[str, Any]):
+     from bfcl_eval.constants.default_prompts import DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_FC, MAXIMUM_STEP_LIMIT
+     from bfcl_eval.eval_checker.multi_turn_eval.multi_turn_utils import execute_multi_turn_func_call
+     from bfcl_eval.model_handler.utils import convert_to_function_call
+
+     all_model_responses = []
+     current_messages = []
+     turns = row['turns']
+     model_usage = ModelUsage()
+
+     for turn_idx, messages in enumerate(turns):
+         n_steps = 0
+         current_responses = []
+         current_messages += messages.copy()
+         tools = row['tools']
+
+         if str(turn_idx) in row['missing_functions']:
+             assert len(messages) == 0, 'Holdout turn should not have user message.'
+             # inject new functions on the fly
+             new_tools = row['missing_functions'][str(turn_idx)]
+             for new_tool in new_tools:
+                 cur_tool = new_tool[0]
+                 # change type to object
+                 if cur_tool['parameters']['type'] != 'object':
+                     cur_tool['parameters']['type'] = 'object'
+                 tools.append({
+                     'type': 'function',
+                     'function': cur_tool,
+                 })
+             new_turn = [{
+                 'role': 'user',
+                 'content': DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_FC,
+             }]
+             current_messages += new_turn
+
+         while True:
+             # Create a sample for the current messages with tools
+             chat_messages = [dict_to_chat_message(msg) for msg in current_messages]
+             current_sample = Sample(
+                 input=chat_messages,
+                 target='',
+                 tools=[ToolInfo.model_validate(tool['function']) for tool in tools],
+             )
+
+             # Get model response
+             model_output = model.generate(current_sample.input, tools=current_sample.tools)
+
+             # Handle the response based on the model output structure
+             message = model_output.message
+             model_usage += model_output.usage
+
+             current_messages.append(message)
+             if isinstance(message, str):
+                 model_responses = [message]
+                 tool_call_strs = None
+             elif message.tool_calls:
+                 model_responses = [{tc.function.name: tc.function.arguments} for tc in message.tool_calls]
+                 try:
+                     tool_call_strs = convert_to_function_call(model_responses)
+                 except Exception as e:
+                     logger.error(f'Error converting tool calls to function call strings: {e}')
+                     tool_call_strs = None
+             else:
+                 model_responses = [message.text]
+                 tool_call_strs = None
+
+             current_responses.extend(model_responses)
+
+             execute_tools = row.get('should_execute_tool_calls', False)
+             if execute_tools and tool_call_strs is not None:
+                 tool_outputs, _ = execute_multi_turn_func_call(
+                     tool_call_strs,
+                     initial_config=row['initial_config'],
+                     involved_classes=row['involved_classes'],
+                     model_name='evaluator_loop',
+                     test_entry_id=row['id'],
+                     long_context=('long_context' in row['test_category'] or 'composite' in row['test_category']),
+                     is_evaL_run=False,
+                 )
+
+                 for tc, tool_output in zip(message.tool_calls, tool_outputs, strict=False):
+                     current_messages.append({
+                         'role': 'tool',
+                         'tool_call_id': tc.id,
+                         'content': json.dumps({'response': tool_output}),
+                     })
+             else:
+                 break
+
+             n_steps += 1
+             if n_steps > MAXIMUM_STEP_LIMIT:
+                 logger.error(f'INFERENCE_ERROR: Exceeded max inference steps ({MAXIMUM_STEP_LIMIT})')
+                 break
+
+         all_model_responses.append(current_responses)
+
+     return all_model_responses, model_usage
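
The new generation.py drives BFCL's multi-turn loop: predict() reads the dataset row from Sample.metadata and dispatches to generate_turn (prompting-style models) or generate_turn_with_tools (native function-calling models), each of which keeps querying the model until no further tool calls are produced or MAXIMUM_STEP_LIMIT is exceeded. Below is a minimal sketch (not part of the release) of exercising this entry point with a duck-typed stub model; it assumes the Sample(metadata=...) keyword, requires the bfcl_eval package that the module imports, and the EchoModel class and row values are illustrative placeholders.

# Sketch only: drive `predict` with a stub model standing in for evalscope.api.model.Model.
from types import SimpleNamespace

from evalscope.api.dataset import Sample
from evalscope.api.model import ModelUsage
from evalscope.benchmarks.bfcl.generation import predict


class EchoModel:
    """Duck-typed stand-in for evalscope.api.model.Model (hypothetical, for illustration)."""
    name = 'echo-model'

    def generate(self, messages, tools=None):
        # generate_turn only reads `.message` and `.usage` from the returned object
        return SimpleNamespace(message='The flight has been booked.', usage=ModelUsage())


row = {
    'is_fc_model': False,                # prompting path -> generate_turn
    'turns': [[{'role': 'user', 'content': 'Book a flight to Beijing.'}]],
    'missing_functions': {},             # no held-out functions in this sketch
    'should_execute_tool_calls': False,  # skip real BFCL tool execution
}

output = predict(EchoModel(), Sample(input='', target='', metadata=row))
print(output.choices[0])  # JSON-encoded list of per-turn responses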
evalscope/benchmarks/ceval/ceval_adapter.py
@@ -1,73 +1,14 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
- import csv
- import os
- from collections import defaultdict
-
- from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.constants import EvalType, OutputType
- from evalscope.metrics import exact_match
- from evalscope.metrics.completion_parsers import ResponseParser
- from evalscope.utils.io_utils import csv_to_list
- from evalscope.utils.logger import get_logger

- # flake8: noqa
+ from typing import Any, Dict

- logger = get_logger()
+ from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.logger import get_logger

- SUBSET_LIST = [
-     'computer_network',
-     'operating_system',
-     'computer_architecture',
-     'college_programming',
-     'college_physics',
-     'college_chemistry',
-     'advanced_mathematics',
-     'probability_and_statistics',
-     'discrete_mathematics',
-     'electrical_engineer',
-     'metrology_engineer',
-     'high_school_mathematics',
-     'high_school_physics',
-     'high_school_chemistry',
-     'high_school_biology',
-     'middle_school_mathematics',
-     'middle_school_biology',
-     'middle_school_physics',
-     'middle_school_chemistry',
-     'veterinary_medicine',
-     'college_economics',
-     'business_administration',
-     'marxism',
-     'mao_zedong_thought',
-     'education_science',
-     'teacher_qualification',
-     'high_school_politics',
-     'high_school_geography',
-     'middle_school_politics',
-     'middle_school_geography',
-     'modern_chinese_history',
-     'ideological_and_moral_cultivation',
-     'logic',
-     'law',
-     'chinese_language_and_literature',
-     'art_studies',
-     'professional_tour_guide',
-     'legal_professional',
-     'high_school_chinese',
-     'high_school_history',
-     'middle_school_history',
-     'civil_servant',
-     'sports_science',
-     'plant_protection',
-     'basic_medicine',
-     'clinical_medicine',
-     'urban_and_rural_planner',
-     'accountant',
-     'fire_engineer',
-     'environmental_impact_assessment_engineer',
-     'tax_accountant',
-     'physician',
- ]
+ logger = get_logger()

  SUBJECT_MAPPING = {
      'computer_network': ['Computer Network', '计算机网络', 'STEM'],
@@ -124,115 +65,105 @@ SUBJECT_MAPPING = {
      'physician': ['Physician', '医师资格', 'Other']
  }

-
- @Benchmark.register(
-     name='ceval',
-     pretty_name='C-Eval',
-     tags=['Knowledge', 'MCQ', 'Chinese'],
-     description=
-     'C-Eval is a benchmark designed to evaluate the performance of AI models on Chinese exams across various subjects, including STEM, social sciences, and humanities. It consists of multiple-choice questions that test knowledge and reasoning abilities in these areas.',  # noqa: E501
-     dataset_id='modelscope/ceval-exam',
-     model_adapter=OutputType.GENERATION,
-     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
-     subset_list=SUBSET_LIST,
-     metric_list=['AverageAccuracy'],
-     few_shot_num=0,
-     train_split='dev',
-     eval_split='val',
-     prompt_template=
-     '以下是中国关于{subset_name}考试的单项选择题,请选出其中的正确答案。你的回答的最后一行应该是这样的格式:“答案是:LETTER”(不带引号),其中 LETTER 是 A、B、C、D 中的一个。\n{query}',
+ # Based on the prompt template for Chinese evaluation
+ USER_PROMPT_TEMPLATE = """以下是中国关于{subject}的单项选择题,请选出其中的正确答案。你的回答的最后一行应该是这样的格式:"答案:LETTER"(不带引号),其中 LETTER 是 A、B、C、D 中的一个。
+
+ 问题:{question}
+ 选项:
+ {choices}
+ """.lstrip()  # noqa: E501
+
+ FEWSHOT_TEMPLATE = """以下是一些示例问题:
+
+ {fewshot}
+
+ """.lstrip()
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='ceval',
+         pretty_name='C-Eval',
+         tags=[Tags.KNOWLEDGE, Tags.MULTIPLE_CHOICE, Tags.CHINESE],
+         description=
+         'C-Eval is a benchmark designed to evaluate the performance of AI models on Chinese exams across various subjects, including STEM, social sciences, and humanities. It consists of multiple-choice questions that test knowledge and reasoning abilities in these areas.',  # noqa: E501
+         dataset_id='evalscope/ceval',
+         subset_list=list(SUBJECT_MAPPING.keys()),
+         metric_list=['acc'],
+         few_shot_num=5,
+         train_split='dev',
+         eval_split='val',
+         prompt_template=USER_PROMPT_TEMPLATE,
+         few_shot_prompt_template=FEWSHOT_TEMPLATE,
+     )
  )
- class CEVALAdapter(DataAdapter):
+ class CEVALAdapter(MultiChoiceAdapter):

      def __init__(self, **kwargs):

-         few_shot_num = kwargs.get('few_shot_num', 0)
-         if few_shot_num > 5:
-             logger.warning(f'few_shot_num <= 5 for C-Eval, but got {few_shot_num}. Use 5-shot by default.')
-             kwargs['few_shot_num'] = 5
          super().__init__(**kwargs)

          self.category_map = {k: v[-1] for k, v in SUBJECT_MAPPING.items()}
-         self.choices = ['A', 'B', 'C', 'D']
-
-     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
-         data_dict = defaultdict(dict)
-         for subset_name in subset_list:
-             for split_name in [self.train_split, self.eval_split]:
-                 if os.path.exists(dataset_name_or_path):
-                     file_path = os.path.join(dataset_name_or_path, f'{subset_name}_{split_name}.csv')
-                 else:
-                     file_path = os.path.join(work_dir, dataset_name_or_path, f'{subset_name}_{split_name}.csv')
-                 if os.path.exists(file_path):
-                     data_dict[subset_name][split_name] = csv_to_list(file_path)
-
-         return data_dict
-
-     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
-         """
-         Generate model prompt from raw input, unify the prompt format for C-Eval benchmark.
-
-         Args:
-             input_d (dict): The raw input. A single data format of the C-Eval:
-
-             {'id': 0,
-             'question': '下列关于税法基本原则的表述中,不正确的是____。',
-             'A': '税收法定原则包括税收要件法定原则和税务合法性原则',
-             'B': '税收公平原则源于法律上的平等性原则',
-             'C': '税收效率原则包含经济效率和行政效率两个方面',
-             'D': '税务机关按法定程序依法征税,可以自由做出减征、停征或免征税款的决定',
-             'answer': 'D',
-             'explanation': ''}
-
-         Returns:
-             {'data': ['prompt ...']}
-         """
-
-         few_shot_prompts = [self._format_example(input_d=sample, include_answer=True) for sample in few_shot_list]
-
-         if len(few_shot_prompts) > 0:
-             context: str = '\n'.join(few_shot_prompts) + '\n'
-         else:
-             context = ''
-
-         query: str = context.strip() + self._format_example(input_d=input_d, include_answer=False)
-
-         subject_name: str = SUBJECT_MAPPING.get(subset_name)[1] if SUBJECT_MAPPING.get(subset_name) else subset_name
-         full_prompt = self.prompt_template.format(subset_name=subject_name, query=query)

-         return self.gen_prompt_data(full_prompt)
-
-     def get_gold_answer(self, input_d: dict) -> str:
-         # Get the gold choice
-         return input_d.get('answer', '')
-
-     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+         # Build choices list from A, B, C, D fields
+         choices = [record['A'], record['B'], record['C'], record['D']]
+         subset = self.current_subset_name
+
+         return Sample(
+             input=record['question'],
+             choices=choices,
+             target=record['answer'],
+             metadata={
+                 'id': record.get('id', ''),
+                 'explanation': record.get('explanation', ''),
+                 'subject': subset
+             },
+         )
+
+     def sample_to_fewshot(self, sample: Sample) -> str:
+         q_str = f"""问题:{sample.input}"""
+         choices = sample.choices if sample.choices is not None else []
+         opt_str_list = []
+         for i, choice in enumerate(choices):
+             opt_str_list.append(f"""{chr(65 + i)}. {choice}""")
+         opt_str = '\n'.join(opt_str_list)
+         opt_str = f"""选项:\n{opt_str}"""
+         exp_str = f"""解析:{sample.metadata.get('explanation', '')}"""
+         ans_str = f"""答案:{sample.target}"""
+         final_str = '\n'.join([q_str, opt_str, exp_str, ans_str])
+
+         return final_str
+
+     def format_fewshot_template(self, fewshot, sample):
+         fewshot_str = FEWSHOT_TEMPLATE.format(fewshot=fewshot)
+         prompt_str = self.format_prompt_template(sample)
+         return fewshot_str + '\n' + prompt_str
+
+     def format_prompt_template(self, sample):
+         subject_name = SUBJECT_MAPPING.get(sample.metadata['subject'])[1]
+         choices = sample.choices if sample.choices is not None else []
+         choices_str = '\n'.join([f'{chr(65 + i)}. {choice}' for i, choice in enumerate(choices)])
+
+         return USER_PROMPT_TEMPLATE.format(subject=subject_name, question=sample.input, choices=choices_str)
+
+     def extract_answer(self, prediction, task_state) -> str:
          """
-         Parse the model output to get the answer. Could be the best choice index.
+         Extract the answer from the prediction based on the task state.

          Args:
-             result: Predicted answer from the model. Usually a string for chat.
-             raw_input_d (dict): The raw input. Depending on the dataset.
-             eval_type: `checkpoint` or `service` or `custom`. Default is `checkpoint`.
+             prediction (str): The model's prediction string
+             task_state (dict): The current task state containing metadata

          Returns:
-             The parsed answer. Depending on the dataset. Usually a string for chat.
+             str: The extracted answer from the prediction
          """
-         if self.model_adapter == OutputType.MULTIPLE_CHOICE:
-             return result
-         else:
-             return ResponseParser.parse_first_option_with_choices(text=result, options=self.choices)
-
-     def match(self, gold: str, pred: str) -> float:
-         return exact_match(gold=gold, pred=pred)
-
-     def _format_example(self, input_d: dict, include_answer=True):
-         example = '问题:' + input_d['question']
-         for choice in self.choices:
-             example += f'\n{choice}. {input_d[f"{choice}"]}'
+         import re

-         if include_answer:
-             example += '\n答案: ' + input_d['answer'] + '\n\n'
+         # Use regex to find the answer in the format "答案:LETTER"
+         match = re.search(r'答案:([A-D])', prediction)
+         if match:
+             return match.group(1)
          else:
-             example += '\n答案: '
-         return example
+             logger.warning(f'No valid answer found in prediction: {prediction}')
+             return ''
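
With this rewrite, the C-Eval adapter no longer loads CSVs or parses options itself: it declares its metadata through BenchmarkMeta, inherits the shared multiple-choice behaviour from MultiChoiceAdapter, and only maps records to Samples, renders the Chinese prompt, and pulls the final "答案:LETTER" line out of the completion. Below is a hedged sketch of invoking the migrated benchmark through evalscope's task runner; run_task and TaskConfig are evalscope's documented entry points, while the model id, dataset_args keys and limit are placeholders assumed to carry over to 1.0.x.

# Sketch only: smoke-test the migrated 'ceval' benchmark end to end.
from evalscope import TaskConfig, run_task

task = TaskConfig(
    model='Qwen/Qwen2.5-0.5B-Instruct',   # placeholder model id
    datasets=['ceval'],                   # name registered via @register_benchmark above
    dataset_args={'ceval': {
        'subset_list': ['computer_network', 'logic'],  # two of the SUBJECT_MAPPING keys
        'few_shot_num': 0,                             # override the 5-shot default
    }},
    limit=5,                              # a handful of samples per subset
)

run_task(task_cfg=task)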