evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of evalscope has been flagged as potentially problematic.

Files changed (302)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +3 -0
  3. evalscope/api/benchmark/adapters/__init__.py +5 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
  8. evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
  9. evalscope/api/benchmark/benchmark.py +356 -0
  10. evalscope/api/benchmark/meta.py +121 -0
  11. evalscope/api/dataset/__init__.py +2 -0
  12. evalscope/api/dataset/dataset.py +349 -0
  13. evalscope/api/dataset/loader.py +262 -0
  14. evalscope/api/dataset/utils.py +143 -0
  15. evalscope/api/evaluator/__init__.py +3 -0
  16. evalscope/api/evaluator/cache.py +378 -0
  17. evalscope/api/evaluator/evaluator.py +56 -0
  18. evalscope/api/evaluator/state.py +275 -0
  19. evalscope/api/filter/__init__.py +1 -0
  20. evalscope/api/filter/filter.py +72 -0
  21. evalscope/api/messages/__init__.py +12 -0
  22. evalscope/api/messages/chat_message.py +243 -0
  23. evalscope/api/messages/content.py +102 -0
  24. evalscope/api/messages/utils.py +35 -0
  25. evalscope/api/metric/__init__.py +2 -0
  26. evalscope/api/metric/metric.py +55 -0
  27. evalscope/api/metric/scorer.py +113 -0
  28. evalscope/api/mixin/__init__.py +1 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +155 -0
  32. evalscope/api/model/model.py +386 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/app.py +3 -0
  40. evalscope/app/ui/app_ui.py +2 -1
  41. evalscope/app/ui/multi_model.py +50 -25
  42. evalscope/app/ui/single_model.py +26 -14
  43. evalscope/app/utils/data_utils.py +43 -27
  44. evalscope/app/utils/env_utils.py +12 -0
  45. evalscope/app/utils/text_utils.py +14 -14
  46. evalscope/app/utils/visualization.py +9 -4
  47. evalscope/arguments.py +7 -10
  48. evalscope/backend/opencompass/api_meta_template.py +2 -1
  49. evalscope/backend/opencompass/backend_manager.py +6 -5
  50. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  51. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  52. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  53. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  55. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  56. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  57. evalscope/backend/rag_eval/utils/embedding.py +10 -1
  58. evalscope/backend/rag_eval/utils/llm.py +13 -12
  59. evalscope/benchmarks/__init__.py +0 -2
  60. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  61. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  62. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  63. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  64. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  65. evalscope/benchmarks/arena_hard/utils.py +37 -1
  66. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  67. evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
  68. evalscope/benchmarks/bfcl/generation.py +222 -0
  69. evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
  70. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  71. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  72. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  73. evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
  74. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  75. evalscope/benchmarks/docmath/utils.py +4 -5
  76. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  77. evalscope/benchmarks/frames/frames_adapter.py +136 -52
  78. evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
  79. evalscope/benchmarks/general_arena/utils.py +23 -27
  80. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  81. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  82. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  83. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  84. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  85. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  86. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  87. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  88. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  89. evalscope/benchmarks/ifeval/instructions.py +109 -64
  90. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  91. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  92. evalscope/benchmarks/ifeval/utils.py +6 -7
  93. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  94. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  95. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  96. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/math_vista/__init__.py +0 -0
  105. evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
  106. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  107. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  108. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  109. evalscope/benchmarks/mmmu/__init__.py +0 -0
  110. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  111. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  112. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
  113. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  114. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
  115. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  116. evalscope/benchmarks/race/race_adapter.py +33 -119
  117. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  118. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  119. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  120. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  121. evalscope/benchmarks/tau_bench/generation.py +147 -0
  122. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
  123. evalscope/benchmarks/text2image/__init__.py +0 -0
  124. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  125. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  126. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  127. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  128. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  129. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  131. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
  132. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  133. evalscope/cli/cli.py +2 -0
  134. evalscope/cli/start_app.py +7 -1
  135. evalscope/cli/start_perf.py +7 -1
  136. evalscope/cli/start_server.py +6 -3
  137. evalscope/collections/__init__.py +2 -10
  138. evalscope/collections/sampler.py +10 -10
  139. evalscope/collections/schema.py +13 -11
  140. evalscope/config.py +157 -57
  141. evalscope/constants.py +37 -61
  142. evalscope/evaluator/__init__.py +1 -1
  143. evalscope/evaluator/evaluator.py +275 -419
  144. evalscope/filters/__init__.py +2 -0
  145. evalscope/filters/extraction.py +126 -0
  146. evalscope/filters/selection.py +57 -0
  147. evalscope/metrics/__init__.py +13 -13
  148. evalscope/metrics/llm_judge.py +47 -33
  149. evalscope/metrics/math_parser.py +27 -22
  150. evalscope/metrics/metric.py +307 -0
  151. evalscope/metrics/metrics.py +22 -18
  152. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  153. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  154. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  155. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  156. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  157. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  158. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  159. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  160. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  162. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  163. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  184. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  185. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  186. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  187. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  188. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  189. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  190. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  191. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  192. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  193. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  194. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  195. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  196. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  197. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  198. evalscope/models/__init__.py +6 -29
  199. evalscope/models/image_edit_model.py +125 -0
  200. evalscope/models/mockllm.py +65 -0
  201. evalscope/models/model_apis.py +67 -0
  202. evalscope/models/modelscope.py +455 -0
  203. evalscope/models/openai_compatible.py +126 -0
  204. evalscope/models/text2image_model.py +124 -0
  205. evalscope/models/utils/openai.py +701 -0
  206. evalscope/perf/benchmark.py +4 -1
  207. evalscope/perf/http_client.py +4 -2
  208. evalscope/perf/plugin/api/custom_api.py +5 -4
  209. evalscope/perf/plugin/api/openai_api.py +11 -9
  210. evalscope/perf/plugin/datasets/custom.py +2 -1
  211. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  212. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  213. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  214. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  215. evalscope/perf/plugin/datasets/openqa.py +4 -2
  216. evalscope/perf/utils/benchmark_util.py +15 -10
  217. evalscope/perf/utils/db_util.py +9 -6
  218. evalscope/perf/utils/local_server.py +11 -3
  219. evalscope/perf/utils/rich_display.py +16 -10
  220. evalscope/report/__init__.py +2 -3
  221. evalscope/report/combinator.py +18 -12
  222. evalscope/report/generator.py +51 -35
  223. evalscope/report/{utils.py → report.py} +8 -6
  224. evalscope/run.py +33 -47
  225. evalscope/summarizer.py +1 -1
  226. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  227. evalscope/utils/__init__.py +21 -2
  228. evalscope/utils/chat_service.py +3 -2
  229. evalscope/utils/deprecation_utils.py +12 -1
  230. evalscope/utils/function_utils.py +29 -0
  231. evalscope/utils/import_utils.py +23 -1
  232. evalscope/utils/io_utils.py +142 -6
  233. evalscope/utils/json_schema.py +208 -0
  234. evalscope/utils/logger.py +51 -12
  235. evalscope/utils/model_utils.py +11 -7
  236. evalscope/utils/multi_choices.py +288 -0
  237. evalscope/utils/url_utils.py +65 -0
  238. evalscope/version.py +2 -2
  239. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
  240. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
  241. tests/benchmark/test_eval.py +385 -0
  242. tests/benchmark/test_image_edit.py +65 -0
  243. tests/{aigc → benchmark}/test_t2i.py +22 -4
  244. tests/benchmark/test_vlm.py +80 -0
  245. tests/cli/test_all.py +85 -47
  246. tests/cli/test_collection.py +20 -8
  247. tests/cli/test_custom.py +22 -15
  248. tests/cli/test_reasoning.py +81 -0
  249. tests/common.py +73 -0
  250. tests/perf/test_perf.py +4 -2
  251. tests/rag/test_clip_benchmark.py +0 -2
  252. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  253. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
  254. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
  255. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
  256. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
  257. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
  258. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  259. evalscope/benchmarks/benchmark.py +0 -81
  260. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  261. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  262. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  263. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  264. evalscope/benchmarks/data_adapter.py +0 -528
  265. evalscope/benchmarks/filters.py +0 -59
  266. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  267. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  268. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  269. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  270. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  271. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  272. evalscope/benchmarks/race/race.py +0 -104
  273. evalscope/benchmarks/race/samples.jsonl +0 -5
  274. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  275. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  276. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  277. evalscope/benchmarks/utils.py +0 -60
  278. evalscope/collections/evaluator.py +0 -375
  279. evalscope/metrics/completion_parsers.py +0 -227
  280. evalscope/metrics/named_metrics.py +0 -55
  281. evalscope/models/adapters/__init__.py +0 -14
  282. evalscope/models/adapters/base_adapter.py +0 -84
  283. evalscope/models/adapters/bfcl_adapter.py +0 -246
  284. evalscope/models/adapters/chat_adapter.py +0 -207
  285. evalscope/models/adapters/choice_adapter.py +0 -222
  286. evalscope/models/adapters/custom_adapter.py +0 -71
  287. evalscope/models/adapters/server_adapter.py +0 -236
  288. evalscope/models/adapters/t2i_adapter.py +0 -79
  289. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  290. evalscope/models/custom/__init__.py +0 -4
  291. evalscope/models/custom/custom_model.py +0 -50
  292. evalscope/models/custom/dummy_model.py +0 -99
  293. evalscope/models/local_model.py +0 -128
  294. evalscope/models/register.py +0 -41
  295. tests/cli/test_run.py +0 -489
  296. /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
  297. /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
  298. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
  299. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
  300. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
  301. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
  302. /tests/{aigc → benchmark}/__init__.py +0 -0
evalscope/models/adapters/server_adapter.py (deleted)
@@ -1,236 +0,0 @@
- import copy
- import openai
- from collections import defaultdict
- from openai.types.chat import ChatCompletion, ChatCompletionChunk
- from openai.types.chat.chat_completion import ChatCompletionMessage, Choice
- from typing import List, Optional, Union
-
- from evalscope.utils.argument_utils import get_supported_params
- from evalscope.utils.logger import get_logger
- from ..register import register_model_adapter
- from .base_adapter import BaseModelAdapter
-
- logger = get_logger()
-
-
- @register_model_adapter(name='server')
- class ServerModelAdapter(BaseModelAdapter):
-     """
-     Server model adapter to request remote API model and generate results.
-     """
-
-     def __init__(self, api_url: str, model_id: str, api_key: str = 'EMPTY', **kwargs):
-         """
-         Args:
-             api_url: The URL of the remote API model.
-             model_id: The ID of the remote API model.
-             api_key: The API key of the remote API model.
-         """
-         self.api_url = api_url.rstrip('/').rsplit('/chat/completions', 1)[0]
-         self.model_id = model_id
-         self.api_key = api_key
-
-         self.client = openai.OpenAI(
-             api_key=self.api_key,
-             base_url=self.api_url,
-         )
-         self.supported_params = get_supported_params(self.client.chat.completions.create)
-
-         self.seed = kwargs.get('seed', None)
-         self.timeout = kwargs.get('timeout', 60)
-         self.stream = kwargs.get('stream', False)
-         self.model_cfg = {'api_url': api_url, 'model_id': model_id, 'api_key': api_key}
-         super().__init__(model=None, model_cfg=self.model_cfg, **kwargs)
-
-     def predict(self, inputs: List[dict], infer_cfg: Optional[dict] = None) -> List[dict]:
-         """
-         Model prediction func.
-
-         Args:
-             inputs (List[dict]): The input data.
-             infer_cfg (dict): Inference configuration.
-
-         Returns:
-             res (List[dict]): The model prediction results.
-         """
-         infer_cfg = infer_cfg or {}
-         results = []
-
-         for input_item in inputs:
-             response = self.process_single_input(input_item, infer_cfg)
-             results.append(response)
-
-         return results
-
-     def process_single_input(self, input_item: dict, infer_cfg: dict) -> dict:
-         """Process a single input item."""
-         request_json = self.make_request(input_item, infer_cfg)
-         response = self.send_request(request_json)
-         return response
-
-     def make_request_messages(self, input_item: dict) -> list:
-         """
-         Make request messages for OpenAI API.
-         """
-         if input_item.get('messages', None):
-             return input_item['messages']
-
-         data: list = input_item['data']
-         if isinstance(data[0], tuple):  # for truthful_qa and hellaswag
-             query = '\n'.join(''.join(item) for item in data)
-             system_prompt = input_item.get('system_prompt', None)
-         else:
-             query = data[0]
-             system_prompt = input_item.get('system_prompt', None)
-
-         messages = []
-         if system_prompt:
-             messages.append({'role': 'system', 'content': system_prompt})
-
-         messages.append({'role': 'user', 'content': query})
-
-         return messages
-
-     def make_request(self, input_item: dict, infer_cfg: dict) -> dict:
-         """Make request to remote API."""
-         messages = self.make_request_messages(input_item)
-         # Format request JSON according to OpenAI API format
-         request_json = {'model': self.model_id, 'messages': messages, **infer_cfg}
-
-         if self.timeout:
-             request_json['timeout'] = self.timeout
-
-         request_json['stream'] = self.stream
-         if self.stream:
-             request_json['stream_options'] = {'include_usage': True}
-
-         if input_item.get('tools', None):
-             tools_copy = copy.deepcopy(input_item.get('tools'))
-             # Remove the "responses" from the functions, as that doesn't
-             # need to go to the model
-             for tool in tools_copy:
-                 if 'function' in tool and 'response' in tool['function']:
-                     del tool['function']['response']
-             request_json['tools'] = tools_copy
-
-         logger.debug(f'Request to remote API: {request_json}')
-
-         return request_json
-
-     def _parse_extra_params(self, request_json):
-         api_params = {}
-         extra_body = {}
-         for key, value in request_json.items():
-             if key in self.supported_params:
-                 api_params[key] = value
-             else:
-                 extra_body[key] = value
-
-         if extra_body:
-             api_params['extra_body'] = extra_body
-         return api_params
-
-     def send_request(self, request_json: dict) -> dict:
-         try:
-             parsed_request = self._parse_extra_params(request_json)
-             response = self.client.chat.completions.create(**parsed_request)
-
-             if response and self.stream:
-                 response = self._collect_stream_response(response)
-
-             return response.model_dump(exclude_unset=True)
-         except Exception as e:
-             logger.error(f'Error when calling remote API: {str(e)}')
-             raise e
-
-     def _collect_stream_response(self, response_stream: List[ChatCompletionChunk]) -> ChatCompletion:
-         collected_chunks = []
-         collected_messages = defaultdict(list)
-         collected_reasoning = defaultdict(list)
-         collected_tool_calls = defaultdict(dict)
-
-         for chunk in response_stream:
-             collected_chunks.append(chunk)
-             for choice in chunk.choices:
-                 # Handle reasoning content
-                 if hasattr(choice.delta, 'reasoning_content') and choice.delta.reasoning_content is not None:
-                     collected_reasoning[choice.index].append(choice.delta.reasoning_content)
-
-                 # Handle regular content
-                 if choice.delta.content is not None:
-                     collected_messages[choice.index].append(choice.delta.content)
-
-                 # Handle tool calls
-                 if hasattr(choice.delta, 'tool_calls') and choice.delta.tool_calls:
-                     for tool_call in choice.delta.tool_calls:
-                         tool_id = tool_call.index
-
-                         # Initialize tool call if not present
-                         if tool_id not in collected_tool_calls[choice.index]:
-                             collected_tool_calls[choice.index][tool_id] = {
-                                 'id': tool_call.id if hasattr(tool_call, 'id') and tool_call.id else None,
-                                 'type': tool_call.type if hasattr(tool_call, 'type') and tool_call.type else None,
-                                 'function': {
-                                     'name': '',
-                                     'arguments': ''
-                                 }
-                             }
-
-                         # Update tool call with new chunks
-                         if hasattr(tool_call, 'function'):
-                             if hasattr(tool_call.function, 'name') and tool_call.function.name:
-                                 collected_tool_calls[
-                                     choice.index][tool_id]['function']['name'] = tool_call.function.name
-
-                             if hasattr(tool_call.function, 'arguments') and tool_call.function.arguments:
-                                 collected_tool_calls[
-                                     choice.index][tool_id]['function']['arguments'] += tool_call.function.arguments
-
-                         # Update ID if it was received later
-                         if hasattr(tool_call, 'id') and tool_call.id:
-                             collected_tool_calls[choice.index][tool_id]['id'] = tool_call.id
-
-         # Get all unique choice indices from all collections
-         all_indices = set(collected_messages.keys()) | set(collected_reasoning.keys()) | set(
-             collected_tool_calls.keys())
-
-         choices = []
-         for index in all_indices:
-             full_reply_content = ''.join(collected_messages.get(index, []))
-             reasoning = ''.join(collected_reasoning.get(index, []))
-
-             # Process tool_calls for this choice if any exists
-             tool_calls_list = None
-             if index in collected_tool_calls and collected_tool_calls[index]:
-                 tool_calls_list = list(collected_tool_calls[index].values())
-                 # Filter out any tool calls with None id (incomplete tool calls)
-                 tool_calls_list = [tc for tc in tool_calls_list if tc['id'] is not None]
-
-             # use the finish_reason from the last chunk that generated this choice
-             finish_reason = None
-             for chunk in reversed(collected_chunks):
-                 if chunk.choices and chunk.choices[0].index == index:
-                     finish_reason = chunk.choices[0].finish_reason
-                     break
-
-             message_kwargs = {'role': 'assistant', 'content': full_reply_content}
-
-             if reasoning:
-                 message_kwargs['reasoning_content'] = reasoning
-
-             if tool_calls_list:
-                 message_kwargs['tool_calls'] = tool_calls_list
-
-             choice = Choice(
-                 finish_reason=finish_reason or 'stop', index=index, message=ChatCompletionMessage(**message_kwargs))
-             choices.append(choice)
-
-         # build the final completion object
-         return ChatCompletion(
-             id=collected_chunks[0].id,
-             choices=choices,
-             created=collected_chunks[0].created,
-             model=collected_chunks[0].model,
-             object='chat.completion',
-             usage=collected_chunks[-1].usage  # use the usage from the last chunk
-         )
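
For orientation, the removed ServerModelAdapter was the 0.17.x entry point for querying OpenAI-compatible endpoints. A minimal usage sketch against that old interface is shown below; the import path, endpoint URL, model name, and prompt are placeholders assumed for illustration rather than values taken from this diff:

```python
# Hypothetical 0.17.x usage of the removed ServerModelAdapter.
from evalscope.models.adapters.server_adapter import ServerModelAdapter

adapter = ServerModelAdapter(
    api_url='http://localhost:8000/v1/chat/completions',  # trailing '/chat/completions' is stripped in __init__
    model_id='my-served-model',
    api_key='EMPTY',
    stream=False,
)

# Each input dict carries either prebuilt 'messages' or raw 'data' plus an optional 'system_prompt'.
results = adapter.predict(
    inputs=[{'data': ['What is 2 + 2?'], 'system_prompt': 'You are a helpful assistant.'}],
    infer_cfg={'temperature': 0.0, 'max_tokens': 64},
)
print(results[0]['choices'][0]['message']['content'])
```
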
evalscope/models/adapters/t2i_adapter.py (deleted)
@@ -1,79 +0,0 @@
- import os
- import time
- import torch
- from typing import Any, Dict, List, Optional, Tuple, Union
-
- from evalscope.constants import OutputType
- from evalscope.utils.chat_service import ChatCompletionResponse, ChatCompletionResponseChoice, ChatMessage
- from evalscope.utils.io_utils import OutputsStructure
- from evalscope.utils.logger import get_logger
- from ..local_model import LocalModel
- from ..register import register_model_adapter
- from .base_adapter import BaseModelAdapter
-
- logger = get_logger()
-
-
- @register_model_adapter(name=OutputType.IMAGE_GENERATION)
- class T2IModelAdapter(BaseModelAdapter):
-     """
-     Text to image model adapter.
-     """
-
-     def __init__(self, model: LocalModel, **kwargs):
-         super().__init__(model)
-
-         self.task_config = kwargs.get('task_cfg', None)
-         assert self.task_config is not None, 'Task config is required for T2I model adapter.'
-
-         self.save_path = os.path.join(self.task_config.work_dir, OutputsStructure.PREDICTIONS_DIR,
-                                       self.task_config.model_id, 'images')
-         os.makedirs(self.save_path, exist_ok=True)
-
-     def _model_generate(self, prompt, infer_cfg=None) -> List:
-         """
-         Generate images from the model.
-         Args:
-             prompt: The input prompt.
-             infer_cfg: The inference configuration.
-         Returns:
-             The generated images.
-         """
-         infer_cfg = infer_cfg or {}
-
-         sample = self.model(prompt=prompt, **infer_cfg).images
-         return sample
-
-     @torch.no_grad()
-     def predict(self, inputs: List[dict], infer_cfg: Optional[dict] = None) -> List[dict]:
-         """
-         Args:
-             inputs: The input data.
-             infer_cfg: The inference configuration.
-         Returns:
-             The prediction results.
-         """
-         results = []
-         for input_item in inputs:
-             prompt = input_item['data'][0]
-             image_id = input_item.get('id') or input_item.get('index')
-
-             samples = self._model_generate(prompt, infer_cfg)
-
-             choices_list = []
-             for index, sample in enumerate(samples):
-                 image_file_path = os.path.join(self.save_path, f'{image_id}_{index}.jpeg')
-                 sample.save(image_file_path)
-                 logger.debug(f'Saved image to {image_file_path}')
-
-                 choice = ChatCompletionResponseChoice(
-                     index=index, message=ChatMessage(content=image_file_path, role='assistant'), finish_reason='stop')
-                 choices_list.append(choice)
-
-             res_d = ChatCompletionResponse(
-                 model=self.model_id, choices=choices_list, object='images.generations',
-                 created=int(time.time())).model_dump(exclude_unset=True)
-
-             results.append(res_d)
-
-         return results
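
The removed T2IModelAdapter read the prompt from data[0], named saved images after the item's id (or index), and wrapped the resulting file paths in an OpenAI-style completion. A rough sketch of the input and output shapes, with illustrative values only and the output directory shown only approximately:

```python
# Illustrative shapes for the removed T2IModelAdapter; values are made up, not taken from the diff.
input_item = {
    'id': 'sample-0',  # used to name the saved image files
    'data': ['A watercolor painting of a lighthouse at dusk'],
}

# adapter.predict([input_item]) returned one dict per input, roughly of this form:
expected_shape = {
    'model': '<model_id>',
    'object': 'images.generations',
    'choices': [{
        'index': 0,
        'finish_reason': 'stop',
        'message': {
            'role': 'assistant',
            # content is the saved image path under <work_dir>/<predictions_dir>/<model_id>/images/
            'content': '<work_dir>/predictions/<model_id>/images/sample-0_0.jpeg',
        },
    }],
}
```
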
evalscope/models/adapters/tau_bench_adapter.py (deleted)
@@ -1,189 +0,0 @@
- import json
- import time
- from typing import Any, Dict, List, Optional, Union
-
- from evalscope.utils.logger import get_logger
- from ..register import register_model_adapter
- from .server_adapter import ServerModelAdapter
-
- logger = get_logger()
-
-
- @register_model_adapter(name='tau_bench_server')
- class TauBenchAdapter(ServerModelAdapter):
-     """
-     TauBench model adapter to request remote API model and generate results for TauBench evaluation.
-     Support multi-turn and single-turn function calling tasks.
-     """
-
-     def __init__(self, api_url: str, model_id: str, api_key: str = 'EMPTY', **kwargs):
-         """
-         Args:
-             api_url: The URL of the remote API model.
-             model_id: The ID of the remote API model.
-             api_key: The API key of the remote API model.
-         """
-         super().__init__(api_url=api_url, model_id=model_id, api_key=api_key, **kwargs)
-
-         self._patch_agent_solve()
-
-     def predict(self, inputs: List[dict], infer_cfg: Optional[dict] = None) -> List[dict]:
-         """
-         Model prediction func. For multi-turn evals, we pass a list[list[message]] to the model
-         where each list is a follow up turn in the conversation
-         each turn is a List[List[Message]]
-
-         Args:
-             inputs (List[dict]): The input data.
-             infer_cfg (dict): Inference configuration.
-
-         Returns:
-             res (List[dict]): The model prediction results.
-         """
-         infer_cfg = infer_cfg or {}
-         results = []
-
-         for input_item in inputs:
-             raw_input = input_item.get('raw_input')
-
-             res_d = self.solve(env_name=raw_input['env_name'], task_index=raw_input['task_index'], infer_cfg=infer_cfg)
-
-             wrapper_res = {
-                 'choices': [{
-                     'index': 0,
-                     'message': {
-                         'content': json.dumps(res_d, ensure_ascii=False),
-                         'role': 'assistant'
-                     }
-                 }],
-                 'created':
-                 time.time(),
-                 'model':
-                 self.model_id,
-                 'object':
-                 'chat.completion',
-                 'usage': {
-                     'completion_tokens': 0,
-                     'prompt_tokens': 0,
-                     'total_tokens': 0
-                 }
-             }
-
-             results.append(wrapper_res)
-
-         return results
-
-     def _patch_agent_solve(self):
-         """Patch ToolCallingAgent.solve method to use custom model configuration"""
-         from tau_bench.agents.tool_calling_agent import ToolCallingAgent, message_to_action
-         from tau_bench.envs.base import Env
-         from tau_bench.types import RESPOND_ACTION_NAME, SolveResult
-         from typing import List, Optional
-
-         def patched_solve(self,
-                           env: Env,
-                           task_index: Optional[int] = None,
-                           max_num_steps: int = 30,
-                           infer_cfg: Optional[dict] = {}) -> SolveResult:
-             env_reset_res = env.reset(task_index=task_index)
-             obs = env_reset_res.observation
-             info = env_reset_res.info.model_dump()
-             reward = 0.0
-             messages: List[Dict[str, Any]] = [
-                 {
-                     'role': 'system',
-                     'content': self.wiki
-                 },
-                 {
-                     'role': 'user',
-                     'content': obs
-                 },
-             ]
-
-             for step_index in range(max_num_steps):
-                 # Use adapter's model configuration instead of agent's
-                 request_json = adapter_instance.make_request(
-                     input_item={
-                         'messages': messages,
-                         'tools': self.tools_info
-                     }, infer_cfg=infer_cfg)
-                 res = adapter_instance.send_request(request_json)
-
-                 next_message = res['choices'][0]['message']
-                 action = message_to_action(next_message)
-                 env_response = env.step(action)
-                 reward = env_response.reward
-                 info = {**info, **env_response.info.model_dump()}
-
-                 if action.name != RESPOND_ACTION_NAME:
-                     next_message['tool_calls'] = next_message['tool_calls'][:1]
-                     messages.extend([
-                         next_message,
-                         {
-                             'role': 'tool',
-                             'tool_call_id': next_message['tool_calls'][0]['id'],
-                             'name': next_message['tool_calls'][0]['function']['name'],
-                             'content': env_response.observation,
-                         },
-                     ])
-                 else:
-                     messages.extend([
-                         next_message,
-                         {
-                             'role': 'user',
-                             'content': env_response.observation
-                         },
-                     ])
-                 logger.debug(f'Task: {task_index} Step: {step_index} finished')
-
-                 if env_response.done:
-                     break
-
-             return SolveResult(
-                 reward=reward,
-                 info=info,
-                 messages=messages,
-                 total_cost=0,
-             )
-
-         adapter_instance = self
-
-         ToolCallingAgent.solve = patched_solve
-
-         return 'ToolCallingAgent.solve patched successfully'
-
-     def solve(self, env_name, task_index, infer_cfg, **kwargs):
-         """
-         Solve a specific task in the TauBench environment.
-
-         Args:
-             env_name (str): The name of the TauBench environment.
-             task_index (int): The index of the task to solve.
-             **kwargs: Additional arguments for the task.
-
-         Returns:
-             dict: The result of the task.
-         """
-         from tau_bench.agents.tool_calling_agent import ToolCallingAgent
-         from tau_bench.envs import get_env
-
-         # This method can be implemented to solve specific tasks in the TauBench environment
-         isolated_env = get_env(
-             env_name=env_name,
-             user_strategy='llm',
-             user_model='dummy',  # Use dummy model to prevent errors
-             user_provider='openai',  # Use dummy provider to prevent errors
-             task_split='test',
-             task_index=task_index,
-         )
-         agent = ToolCallingAgent(
-             tools_info=isolated_env.tools_info,
-             wiki=isolated_env.wiki,
-             model='dummy',  # Use dummy model to prevent errors
-             provider='dummy',  # Use dummy provider to prevent errors
-             temperature=0,  # dummy temperature to prevent errors
-         )
-
-         res = agent.solve(env=isolated_env, task_index=task_index, infer_cfg=infer_cfg)
-
-         return res.model_dump()
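
The removed TauBenchAdapter ran a full tau-bench episode per sample, monkey-patching ToolCallingAgent.solve so that every model call went through the adapter's OpenAI-compatible endpoint instead of tau-bench's own LLM client. A hedged invocation sketch, assuming tau-bench is installed; the import path, endpoint, model name, and environment name are placeholders:

```python
# Illustrative only: drive a single tau-bench task through the removed adapter.
from evalscope.models.adapters.tau_bench_adapter import TauBenchAdapter

adapter = TauBenchAdapter(
    api_url='http://localhost:8000/v1',  # placeholder OpenAI-compatible endpoint
    model_id='my-served-model',          # placeholder model name
)

# 'retail' is assumed to be an available tau-bench environment; task_index selects a test task.
result = adapter.solve(env_name='retail', task_index=0, infer_cfg={'temperature': 0.0})
print(result['reward'], len(result['messages']))
```
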
evalscope/models/custom/__init__.py (deleted)
@@ -1,4 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
-
- from .custom_model import CustomModel
- from .dummy_model import DummyCustomModel
evalscope/models/custom/custom_model.py (deleted)
@@ -1,50 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
- import torch
- from abc import ABC, abstractmethod
- from typing import Any, Dict, List, Union
-
-
- class CustomModel(ABC):
-
-     def __init__(self, config: dict, **kwargs):
-         self.config = config
-         self.kwargs = kwargs
-
-     @abstractmethod
-     @torch.no_grad()
-     def predict(self, prompts: List[str], **kwargs) -> List[Dict[str, Any]]:
-         """
-         Model prediction function for batch inputs.
-
-         Args:
-             prompts (str): The input batch of prompts to predict.
-
-             **kwargs: kwargs
-
-         Returns:
-             res (dict): The model prediction results (batch). Format:
-             [
-                 {
-                     'choices': [
-                         {
-                             'index': 0,
-                             'message': {
-                                 'content': 'xxx',
-                                 'role': 'assistant'
-                             }
-                         }
-                     ],
-                     'created': 1677664795,
-                     'model': 'gpt-3.5-turbo-0613',  # should be model_id
-                     'object': 'chat.completion',
-                     'usage': {
-                         'completion_tokens': 17,
-                         'prompt_tokens': 57,
-                         'total_tokens': 74
-                     }
-                 }
-                 ,
-                 ...
-             ]
-         """
-         raise NotImplementedError
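
The docstring above fixes the 0.17.x contract for custom models: predict receives a batch of prompts and must return OpenAI-style completion dicts. A minimal sketch of a conforming subclass follows; the class name and echo behaviour are illustrative and not part of the package (the DummyCustomModel removed below was the in-repo reference implementation):

```python
# Minimal sketch of a 0.17.x-style custom model; the echo behaviour is illustrative.
import time
from typing import Any, Dict, List

from evalscope.models import CustomModel


class EchoModel(CustomModel):

    def predict(self, prompts: List[str], **kwargs) -> List[Dict[str, Any]]:
        responses = []
        for prompt in prompts:
            responses.append({
                'choices': [{
                    'index': 0,
                    'message': {'content': f'Echo: {prompt}', 'role': 'assistant'},
                }],
                'created': int(time.time()),
                'model': self.config.get('model_id', 'echo-model'),
                'object': 'chat.completion',
                'usage': {'completion_tokens': 0, 'prompt_tokens': 0, 'total_tokens': 0},
            })
        return responses


# Usage: EchoModel(config={'model_id': 'echo-model'}) can then be passed as `model` in a TaskConfig
# with eval_type='custom', as in the DummyCustomModel example below.
```
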
evalscope/models/custom/dummy_model.py (deleted)
@@ -1,99 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
- import time
- from typing import List
-
- from evalscope.models import CustomModel
- from evalscope.utils.logger import get_logger
-
- logger = get_logger()
-
-
- class DummyCustomModel(CustomModel):
-
-     def __init__(self, config: dict = {}, **kwargs):
-         super(DummyCustomModel, self).__init__(config=config, **kwargs)
-
-     def make_request_messages(self, input_item: dict) -> list:
-         """
-         Make request messages for OpenAI API.
-         """
-         if input_item.get('messages', None):
-             return input_item['messages']
-
-         data: list = input_item['data']
-         if isinstance(data[0], tuple):  # for truthful_qa and hellaswag
-             query = '\n'.join(''.join(item) for item in data)
-             system_prompt = input_item.get('system_prompt', None)
-         else:
-             query = data[0]
-             system_prompt = input_item.get('system_prompt', None)
-
-         messages = []
-         if system_prompt:
-             messages.append({'role': 'system', 'content': system_prompt})
-
-         messages.append({'role': 'user', 'content': query})
-
-         return messages
-
-     def predict(self, prompts: List[dict], **kwargs):
-         original_inputs = kwargs.get('origin_inputs', None)
-         infer_cfg = kwargs.get('infer_cfg', None)
-
-         logger.debug(f'** Prompts: {prompts}')
-         if original_inputs is not None:
-             logger.debug(f'** Original inputs: {original_inputs}')
-         if infer_cfg is not None:
-             logger.debug(f'** Inference config: {infer_cfg}')
-
-         # Simulate a response based on the prompts
-         # Must return a list of dicts with the same format as the OpenAI API.
-         responses = []
-         for input_item in original_inputs:
-             # message = self.make_request_messages(input_item)
-             # response = f'Dummy response for prompt: {message}'
-
-             res_d = {
-                 'choices': [{
-                     'index': 0,
-                     'message': {
-                         'content': '*PlaceHolder*',
-                         'role': 'assistant'
-                     }
-                 }],
-                 'created': time.time(),
-                 'model': self.config.get('model_id'),
-                 'object': 'chat.completion',
-                 'usage': {
-                     'completion_tokens': 0,
-                     'prompt_tokens': 0,
-                     'total_tokens': 0
-                 }
-             }
-
-             responses.append(res_d)
-
-         return responses
-
-
- if __name__ == '__main__':
-     from evalscope import TaskConfig, run_task
-
-     dummy_model = DummyCustomModel()
-     task_config = TaskConfig(
-         model=dummy_model,
-         model_id='evalscope-model-dummy',
-         datasets=['gsm8k'],
-         eval_type='custom',  # must be custom for custom model evaluation
-         generation_config={
-             'max_new_tokens': 100,
-             'temperature': 0.0,
-             'top_p': 1.0,
-             'top_k': 50,
-             'repetition_penalty': 1.0
-         },
-         debug=True,
-         limit=5,
-     )
-
-     eval_results = run_task(task_cfg=task_config)