evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (302)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +3 -0
  3. evalscope/api/benchmark/adapters/__init__.py +5 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
  8. evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
  9. evalscope/api/benchmark/benchmark.py +356 -0
  10. evalscope/api/benchmark/meta.py +121 -0
  11. evalscope/api/dataset/__init__.py +2 -0
  12. evalscope/api/dataset/dataset.py +349 -0
  13. evalscope/api/dataset/loader.py +262 -0
  14. evalscope/api/dataset/utils.py +143 -0
  15. evalscope/api/evaluator/__init__.py +3 -0
  16. evalscope/api/evaluator/cache.py +378 -0
  17. evalscope/api/evaluator/evaluator.py +56 -0
  18. evalscope/api/evaluator/state.py +275 -0
  19. evalscope/api/filter/__init__.py +1 -0
  20. evalscope/api/filter/filter.py +72 -0
  21. evalscope/api/messages/__init__.py +12 -0
  22. evalscope/api/messages/chat_message.py +243 -0
  23. evalscope/api/messages/content.py +102 -0
  24. evalscope/api/messages/utils.py +35 -0
  25. evalscope/api/metric/__init__.py +2 -0
  26. evalscope/api/metric/metric.py +55 -0
  27. evalscope/api/metric/scorer.py +113 -0
  28. evalscope/api/mixin/__init__.py +1 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +155 -0
  32. evalscope/api/model/model.py +386 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/app.py +3 -0
  40. evalscope/app/ui/app_ui.py +2 -1
  41. evalscope/app/ui/multi_model.py +50 -25
  42. evalscope/app/ui/single_model.py +26 -14
  43. evalscope/app/utils/data_utils.py +43 -27
  44. evalscope/app/utils/env_utils.py +12 -0
  45. evalscope/app/utils/text_utils.py +14 -14
  46. evalscope/app/utils/visualization.py +9 -4
  47. evalscope/arguments.py +7 -10
  48. evalscope/backend/opencompass/api_meta_template.py +2 -1
  49. evalscope/backend/opencompass/backend_manager.py +6 -5
  50. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  51. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  52. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  53. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  55. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  56. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  57. evalscope/backend/rag_eval/utils/embedding.py +10 -1
  58. evalscope/backend/rag_eval/utils/llm.py +13 -12
  59. evalscope/benchmarks/__init__.py +0 -2
  60. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  61. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  62. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  63. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  64. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  65. evalscope/benchmarks/arena_hard/utils.py +37 -1
  66. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  67. evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
  68. evalscope/benchmarks/bfcl/generation.py +222 -0
  69. evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
  70. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  71. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  72. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  73. evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
  74. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  75. evalscope/benchmarks/docmath/utils.py +4 -5
  76. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  77. evalscope/benchmarks/frames/frames_adapter.py +136 -52
  78. evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
  79. evalscope/benchmarks/general_arena/utils.py +23 -27
  80. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  81. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  82. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  83. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  84. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  85. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  86. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  87. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  88. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  89. evalscope/benchmarks/ifeval/instructions.py +109 -64
  90. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  91. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  92. evalscope/benchmarks/ifeval/utils.py +6 -7
  93. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  94. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  95. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  96. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/math_vista/__init__.py +0 -0
  105. evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
  106. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  107. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  108. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  109. evalscope/benchmarks/mmmu/__init__.py +0 -0
  110. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  111. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  112. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
  113. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  114. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
  115. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  116. evalscope/benchmarks/race/race_adapter.py +33 -119
  117. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  118. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  119. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  120. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  121. evalscope/benchmarks/tau_bench/generation.py +147 -0
  122. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
  123. evalscope/benchmarks/text2image/__init__.py +0 -0
  124. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  125. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  126. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  127. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  128. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  129. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  131. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
  132. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  133. evalscope/cli/cli.py +2 -0
  134. evalscope/cli/start_app.py +7 -1
  135. evalscope/cli/start_perf.py +7 -1
  136. evalscope/cli/start_server.py +6 -3
  137. evalscope/collections/__init__.py +2 -10
  138. evalscope/collections/sampler.py +10 -10
  139. evalscope/collections/schema.py +13 -11
  140. evalscope/config.py +157 -57
  141. evalscope/constants.py +37 -61
  142. evalscope/evaluator/__init__.py +1 -1
  143. evalscope/evaluator/evaluator.py +275 -419
  144. evalscope/filters/__init__.py +2 -0
  145. evalscope/filters/extraction.py +126 -0
  146. evalscope/filters/selection.py +57 -0
  147. evalscope/metrics/__init__.py +13 -13
  148. evalscope/metrics/llm_judge.py +47 -33
  149. evalscope/metrics/math_parser.py +27 -22
  150. evalscope/metrics/metric.py +307 -0
  151. evalscope/metrics/metrics.py +22 -18
  152. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  153. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  154. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  155. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  156. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  157. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  158. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  159. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  160. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  162. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  163. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  184. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  185. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  186. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  187. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  188. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  189. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  190. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  191. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  192. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  193. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  194. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  195. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  196. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  197. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  198. evalscope/models/__init__.py +6 -29
  199. evalscope/models/image_edit_model.py +125 -0
  200. evalscope/models/mockllm.py +65 -0
  201. evalscope/models/model_apis.py +67 -0
  202. evalscope/models/modelscope.py +455 -0
  203. evalscope/models/openai_compatible.py +126 -0
  204. evalscope/models/text2image_model.py +124 -0
  205. evalscope/models/utils/openai.py +701 -0
  206. evalscope/perf/benchmark.py +4 -1
  207. evalscope/perf/http_client.py +4 -2
  208. evalscope/perf/plugin/api/custom_api.py +5 -4
  209. evalscope/perf/plugin/api/openai_api.py +11 -9
  210. evalscope/perf/plugin/datasets/custom.py +2 -1
  211. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  212. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  213. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  214. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  215. evalscope/perf/plugin/datasets/openqa.py +4 -2
  216. evalscope/perf/utils/benchmark_util.py +15 -10
  217. evalscope/perf/utils/db_util.py +9 -6
  218. evalscope/perf/utils/local_server.py +11 -3
  219. evalscope/perf/utils/rich_display.py +16 -10
  220. evalscope/report/__init__.py +2 -3
  221. evalscope/report/combinator.py +18 -12
  222. evalscope/report/generator.py +51 -35
  223. evalscope/report/{utils.py → report.py} +8 -6
  224. evalscope/run.py +33 -47
  225. evalscope/summarizer.py +1 -1
  226. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  227. evalscope/utils/__init__.py +21 -2
  228. evalscope/utils/chat_service.py +3 -2
  229. evalscope/utils/deprecation_utils.py +12 -1
  230. evalscope/utils/function_utils.py +29 -0
  231. evalscope/utils/import_utils.py +23 -1
  232. evalscope/utils/io_utils.py +142 -6
  233. evalscope/utils/json_schema.py +208 -0
  234. evalscope/utils/logger.py +51 -12
  235. evalscope/utils/model_utils.py +11 -7
  236. evalscope/utils/multi_choices.py +288 -0
  237. evalscope/utils/url_utils.py +65 -0
  238. evalscope/version.py +2 -2
  239. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
  240. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
  241. tests/benchmark/test_eval.py +385 -0
  242. tests/benchmark/test_image_edit.py +65 -0
  243. tests/{aigc → benchmark}/test_t2i.py +22 -4
  244. tests/benchmark/test_vlm.py +80 -0
  245. tests/cli/test_all.py +85 -47
  246. tests/cli/test_collection.py +20 -8
  247. tests/cli/test_custom.py +22 -15
  248. tests/cli/test_reasoning.py +81 -0
  249. tests/common.py +73 -0
  250. tests/perf/test_perf.py +4 -2
  251. tests/rag/test_clip_benchmark.py +0 -2
  252. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  253. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
  254. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
  255. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
  256. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
  257. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
  258. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  259. evalscope/benchmarks/benchmark.py +0 -81
  260. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  261. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  262. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  263. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  264. evalscope/benchmarks/data_adapter.py +0 -528
  265. evalscope/benchmarks/filters.py +0 -59
  266. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  267. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  268. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  269. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  270. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  271. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  272. evalscope/benchmarks/race/race.py +0 -104
  273. evalscope/benchmarks/race/samples.jsonl +0 -5
  274. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  275. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  276. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  277. evalscope/benchmarks/utils.py +0 -60
  278. evalscope/collections/evaluator.py +0 -375
  279. evalscope/metrics/completion_parsers.py +0 -227
  280. evalscope/metrics/named_metrics.py +0 -55
  281. evalscope/models/adapters/__init__.py +0 -14
  282. evalscope/models/adapters/base_adapter.py +0 -84
  283. evalscope/models/adapters/bfcl_adapter.py +0 -246
  284. evalscope/models/adapters/chat_adapter.py +0 -207
  285. evalscope/models/adapters/choice_adapter.py +0 -222
  286. evalscope/models/adapters/custom_adapter.py +0 -71
  287. evalscope/models/adapters/server_adapter.py +0 -236
  288. evalscope/models/adapters/t2i_adapter.py +0 -79
  289. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  290. evalscope/models/custom/__init__.py +0 -4
  291. evalscope/models/custom/custom_model.py +0 -50
  292. evalscope/models/custom/dummy_model.py +0 -99
  293. evalscope/models/local_model.py +0 -128
  294. evalscope/models/register.py +0 -41
  295. tests/cli/test_run.py +0 -489
  296. /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
  297. /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
  298. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
  299. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
  300. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
  301. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
  302. /tests/{aigc → benchmark}/__init__.py +0 -0
evalscope/api/model/model.py (new file)
@@ -0,0 +1,386 @@
+ import abc
+ from pydantic_core import to_jsonable_python
+ from typing import TYPE_CHECKING, Any, Dict, Generator, List, Literal, Optional, Sequence, Union
+
+ from evalscope.api.messages import ChatMessage, ChatMessageAssistant, ChatMessageSystem, ChatMessageUser
+ from evalscope.api.registry import get_model_api
+ from evalscope.api.tool import ToolChoice, ToolFunction, ToolInfo
+ from evalscope.utils import get_logger
+ from evalscope.utils.function_utils import thread_safe
+ from .generate_config import GenerateConfig
+ from .model_output import ModelOutput
+
+ if TYPE_CHECKING:
+     from evalscope.config import TaskConfig
+
+ logger = get_logger()
+
+
+ class ModelAPI(abc.ABC):
+     """Model API provider."""
+
+     def __init__(
+         self,
+         model_name: str,
+         base_url: Optional[str] = None,
+         api_key: Optional[str] = None,
+         config: GenerateConfig = GenerateConfig(),
+         **kwargs
+     ) -> None:
+         """Create a model API provider.
+
+         Args:
+             model_name (str): Model name.
+             base_url (str | None): Alternate base URL for model.
+             api_key (str | None): API key for model.
+             config (GenerateConfig): Model configuration.
+         """
+         self.model_name = model_name
+         self.base_url = base_url
+         self.api_key = api_key
+         self.config = config
+
+     @abc.abstractmethod
+     def generate(
+         self,
+         input: List[ChatMessage],
+         tools: List[ToolInfo],
+         tool_choice: ToolChoice,
+         config: GenerateConfig,
+     ) -> ModelOutput:
+         """Generate output from the model.
+
+         Args:
+             input (list[ChatMessage]): Chat message input.
+             tools (list[ToolInfo]): Tools available for the
+                 model to call.
+             tool_choice (ToolChoice): Directives to the model
+                 as to which tools to prefer.
+             config (GenerateConfig): Model configuration.
+
+         Returns:
+             ModelOutput
+         """
+         ...
+
+     def batch_generate(
+         self,
+         inputs: List[List[ChatMessage]],
+         tools: List[List[ToolInfo]],
+         tool_choices: List[ToolChoice],
+         configs: List[GenerateConfig],
+     ) -> Generator[ModelOutput, None, None]:
+         """Default batch implementation using individual generate calls.
+
+         ModelAPI implementations can override this for optimized batch processing.
+
+         Args:
+             inputs: List of preprocessed chat message inputs.
+             tools: List of tools for each input.
+             tool_choices: List of tool choices for each input.
+             configs: List of configs for each input.
+
+         Returns:
+             Generator yielding ModelOutput for each input.
+         """
+         from concurrent.futures import ThreadPoolExecutor
+
+         def single_generate(args):
+             input_msgs, input_tools, tool_choice, config = args
+             return self.generate(input_msgs, input_tools, tool_choice, config)
+
+         with ThreadPoolExecutor(max_workers=self.config.batch_size) as executor:
+             futures = []
+             for input_msgs, input_tools, tool_choice, config in zip(inputs, tools, tool_choices, configs):
+                 future = executor.submit(single_generate, (input_msgs, input_tools, tool_choice, config))
+                 futures.append(future)
+
+             for future in futures:
+                 yield future.result()
+
+     def supports_batch(self) -> bool:
+         """Whether this ModelAPI supports optimized batch processing."""
+         return False
+
+     def max_tokens(self) -> Optional[int]:
+         """Default max_tokens."""
+         return None
+
+     def max_tokens_for_config(self, config: GenerateConfig) -> Optional[int]:
+         """Default max_tokens for a given config.
+
+         Args:
+             config: Generation config.
+
+         Returns:
+             Default maximum tokens for specified configuration.
+         """
+         return None
+
+     def tools_required(self) -> bool:
+         """Any tool use in a message stream means that tools must be passed."""
+         return False
+
+     def tool_result_images(self) -> bool:
+         """Tool results can contain images"""
+         return False
+
+
+ class Model:
+     """Model interface.
+
+     Use `get_model()` to get an instance of a model.
+     """
+
+     api: ModelAPI
+     """Model API."""
+
+     config: GenerateConfig
+     """Generation config."""
+
+     def __init__(self, api: ModelAPI, config: GenerateConfig, model_args: Dict[str, Any] = {}) -> None:
+         """Create a model.
+
+         Args:
+             api: Model API provider.
+             config: Model configuration.
+             model_args: Optional model args
+         """
+         self.api = api
+         self.config = config
+         self.model_args = model_args
+
+     @property
+     def name(self) -> str:
+         """Model name or path to model."""
+         return self.api.model_name
+
+     @property
+     def role(self) -> Optional[str]:
+         """Model role."""
+         return getattr(self, '_role', None)
+
+     @role.setter
+     def role(self, role: str) -> None:
+         self._role = role
+
+     def __str__(self) -> str:
+         return f'Model(name={self.name}, role={self.role})'
+
+     def generate(
+         self,
+         input: Union[str, List[ChatMessage]],
+         tools: Optional[Sequence[ToolInfo]] = None,
+         tool_choice: Optional[ToolChoice] = None,
+         config: Optional[GenerateConfig] = None,
+     ) -> ModelOutput:
+         """Generate output from the model.
+
+         Args:
+             input: Chat message input (if a `str` is passed it is converted
+                 to a `ChatMessageUser`).
+             tools: Tools available for the model to call.
+             tool_choice: Directives to the model as to which tools to prefer.
+             config: Model configuration.
+
+         Returns:
+             ModelOutput
+         """
+         processed_input, processed_tools, processed_tool_choice, processed_config = self._preprocess_input(
+             input, tools, tool_choice, config
+         )
+
+         # Call the model's generate method
+         output = self.api.generate(
+             input=processed_input,
+             tools=processed_tools,
+             tool_choice=processed_tool_choice,
+             config=processed_config,
+         )
+
+         return output
+
+     def batch_generate(
+         self,
+         inputs: List[List[ChatMessage]],
+         tools: List[List[ToolInfo]],
+         tool_choices: List[ToolChoice],
+         configs: List[GenerateConfig],
+     ) -> Generator[ModelOutput, None, None]:
+         """Generate output from the model for a batch of inputs.
+
+         Args:
+             inputs (List[List[ChatMessage]]): Batch of chat message inputs.
+             tools (List[List[ToolInfo]]): Batch of tools for each input.
+             tool_choices (List[ToolChoice]): Batch of tool choices for each input.
+             configs (List[GenerateConfig]): Batch of configs for each input.
+         """
+         preprocessed_data = []
+
+         for input_item, input_tools, input_tool_choice, input_config in zip(inputs, tools, tool_choices, configs):
+             processed_input, processed_tools, processed_tool_choice, processed_config = self._preprocess_input(
+                 input=input_item, tools=input_tools, tool_choice=input_tool_choice, config=input_config
+             )
+             preprocessed_data.append((processed_input, processed_tools, processed_tool_choice, processed_config))
+
+         # check if ModelAPI supports batch processing
+         if self.api.supports_batch() and len(preprocessed_data) > 1:
+             # use the batch_generate method of the ModelAPI
+             inputs, tools, tool_choices, configs = zip(*preprocessed_data)
+             batch_results = self.api.batch_generate(
+                 inputs=list(inputs), tools=list(tools), tool_choices=list(tool_choices), configs=list(configs)
+             )
+             for result in batch_results:
+                 yield result
+         else:
+             # fall back to processing each input individually
+             for input_msgs, input_tools, tool_choice, config in preprocessed_data:
+                 result = self.api.generate(input_msgs, input_tools, tool_choice, config)
+                 yield result
+
+     def _preprocess_input(
+         self,
+         input: Union[str, List[ChatMessage]],
+         tools: Optional[Sequence[ToolInfo]] = None,
+         tool_choice: Optional[ToolChoice] = None,
+         config: Optional[GenerateConfig] = None,
+     ) -> tuple[List[ChatMessage], List[ToolInfo], ToolChoice, GenerateConfig]:
+         """Preprocess input for generate."""
+
+         # merge passed config
+         if config is not None:
+             config = self.config.merge(config)
+         else:
+             config = self.config.model_copy(deep=True)
+
+         # provide max_tokens from the model api if required
+         if config.max_tokens is None:
+             config.max_tokens = self.api.max_tokens_for_config(config)
+             if config.max_tokens is None:
+                 config.max_tokens = self.api.max_tokens()
+
+         # normalize input to chat
+         if isinstance(input, str):
+             input = [ChatMessageUser(content=input)]
+
+         # handle tools and tool_choice
+         tool_choice = tool_choice if tool_choice is not None else 'auto'
+         tools_info = list(tools) if tools is not None else []
+
+         if isinstance(tool_choice, ToolFunction):
+             tools_info = [tool for tool in tools_info if tool.name == tool_choice.name]
+
+         if tool_choice == 'none' or len(tools_info) == 0:
+             if not self.api.tools_required():
+                 tools_info = []
+             tool_choice = 'none'
+
+         return input, tools_info, tool_choice, config
+
+
+ class ModelCache:
+     _models: Dict[str, 'Model'] = {}
+
+     @classmethod
+     def get(cls, key: str) -> Optional['Model']:
+         return cls._models.get(key, None)
+
+     @classmethod
+     def set(cls, key: str, model: 'Model') -> None:
+         cls._models[key] = model
+
+
+ def get_model_with_task_config(task_config: 'TaskConfig') -> Model:
+     """Get an instance of a model with the specified task configuration.
+
+     Args:
+         task_config (TaskConfig): Task configuration.
+
+     Returns:
+         Model: An instance of the model.
+     """
+     model = task_config.model
+     eval_type = task_config.eval_type
+     base_url = task_config.api_url
+     api_key = task_config.api_key
+     config = task_config.generation_config
+     model_args = task_config.model_args or {}
+
+     return get_model(
+         model=model, eval_type=eval_type, base_url=base_url, api_key=api_key, config=config, model_args=model_args
+     )
+
+
+ @thread_safe
+ def get_model(
+     model: Union[str, Model, ModelAPI],
+     eval_type: str,
+     base_url: Optional[str] = None,
+     api_key: Optional[str] = None,
+     config: GenerateConfig = GenerateConfig(),
+     model_args: dict = {},
+     role: Optional[str] = None,
+     memoize: bool = True,
+ ) -> Model:
+     """Get an instance of a model.
+
+     Calls to get_model() are memoized (i.e. a call with the same arguments
+     will return an existing instance of the model rather than creating a
+     new one). You can disable this with `memoize=False`.
+
+     Args:
+         model (str | Model | ModelAPI): Model name, or an existing
+             Model / ModelAPI instance to return/wrap directly.
+         eval_type (str): Registered model API type used to resolve the provider.
+         base_url (str | None): Alternate base URL for the model.
+         api_key (str | None): API key for the model.
+         config (GenerateConfig): Model configuration.
+         model_args (dict): Additional args passed to the model API constructor.
+         role (str | None): Optional role to assign to the model.
+         memoize (bool): Whether to memoize the model instance.
+
+     Returns:
+         Model instance.
+
+     """
+
+     # start with seeing if a model was passed
+     if isinstance(model, Model):
+         return model
+
+     if isinstance(model, ModelAPI):
+         return Model(model, config, model_args)
+
+     # see if we can return a memoized model instance
+     # (exclude mockllm since custom_outputs is an infinite generator)
+     model_cache_key: str = ''
+     if eval_type.startswith('mock_llm'):
+         memoize = False
+     if memoize:
+         model_cache_key = (
+             model + str(role) + config.model_dump_json(exclude_none=True) + str(base_url) + str(api_key)
+             + str(to_jsonable_python(model_args, fallback=lambda _: None))
+         )
+         cached = ModelCache.get(model_cache_key)
+         if cached is not None:
+             return cached
+
+     logger.info(
+         f'Creating model {model} with eval_type={eval_type} '
+         f'base_url={base_url}, api_key={api_key}, config={config}, model_args={model_args}'
+     )
+
+     # find a matching model type
+     modelapi_type = get_model_api(eval_type)
+
+     modelapi_instance = modelapi_type(
+         model_name=model,
+         base_url=base_url,
+         api_key=api_key,
+         config=config,
+         **model_args,
+     )
+     m = Model(modelapi_instance, config, model_args)
+     if role is not None:
+         m.role = role
+     if memoize:
+         ModelCache.set(model_cache_key, m)
+     return m
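
The hunk above is the new model abstraction layer: `ModelAPI` is the provider interface, `Model` wraps a provider with config merging and input normalization, and `get_model()` resolves and memoizes instances. A minimal sketch of how the pieces fit together, using only names visible in this diff (the `EchoModelAPI` class and the `'echo'`/`'unused'` strings are illustrative, not part of the release):

```python
from typing import List

from evalscope.api.messages import ChatMessage
from evalscope.api.model.generate_config import GenerateConfig
from evalscope.api.model.model import ModelAPI, get_model
from evalscope.api.model.model_output import ModelOutput
from evalscope.api.tool import ToolChoice, ToolInfo


class EchoModelAPI(ModelAPI):
    """Illustrative provider: echoes the last chat message back."""

    def generate(
        self,
        input: List[ChatMessage],
        tools: List[ToolInfo],
        tool_choice: ToolChoice,
        config: GenerateConfig,
    ) -> ModelOutput:
        # Return the content of the final message as the completion text.
        return ModelOutput.from_content(model=self.model_name, content=str(input[-1].content))


# A ModelAPI instance is wrapped directly; eval_type is only consulted for string names.
model = get_model(model=EchoModelAPI(model_name='echo'), eval_type='unused')
output = model.generate('hello')  # str input is normalized to a ChatMessageUser
print(output.completion)          # -> 'hello'
```

Since `get_model()` returns as soon as it sees a `ModelAPI` instance, no registry lookup happens here; string model names instead resolve through `get_model_api(eval_type)`. Note also that the default `batch_generate()` fans out over a `ThreadPoolExecutor` sized by `config.batch_size`, so a provider only needs to override it (and `supports_batch()`) when its backend has a true batch endpoint.
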
evalscope/api/model/model_output.py (new file)
@@ -0,0 +1,285 @@
+ import uuid
+ from pydantic import BaseModel, Field, JsonValue, model_validator
+ from typing import Any, Dict, List, Literal, Optional, Type, Union
+
+ from evalscope.api.messages import ChatMessageAssistant, Content
+ from evalscope.api.tool import ToolCall, ToolFunction
+
+
+ class ModelUsage(BaseModel):
+     """Token usage for completion."""
+
+     input_tokens: int = Field(default=0)
+     """Total input tokens used."""
+
+     output_tokens: int = Field(default=0)
+     """Total output tokens used."""
+
+     total_tokens: int = Field(default=0)
+     """Total tokens used."""
+
+     input_tokens_cache_write: Optional[int] = Field(default=None)
+     """Number of tokens written to the cache."""
+
+     input_tokens_cache_read: Optional[int] = Field(default=None)
+     """Number of tokens retrieved from the cache."""
+
+     reasoning_tokens: Optional[int] = Field(default=None)
+     """Number of tokens used for reasoning."""
+
+     def __add__(self, other: 'ModelUsage') -> 'ModelUsage':
+
+         def optional_sum(a: Optional[int], b: Optional[int]) -> Optional[int]:
+             if a is not None and b is not None:
+                 return a + b
+             if a is not None:
+                 return a
+             if b is not None:
+                 return b
+             return None
+
+         return ModelUsage(
+             input_tokens=self.input_tokens + other.input_tokens,
+             output_tokens=self.output_tokens + other.output_tokens,
+             total_tokens=self.total_tokens + other.total_tokens,
+             input_tokens_cache_write=optional_sum(self.input_tokens_cache_write, other.input_tokens_cache_write),
+             input_tokens_cache_read=optional_sum(self.input_tokens_cache_read, other.input_tokens_cache_read),
+             reasoning_tokens=optional_sum(self.reasoning_tokens, other.reasoning_tokens),
+         )
+
+
+ StopReason = Literal[
+     'stop',
+     'max_tokens',
+     'model_length',
+     'tool_calls',
+     'content_filter',
+     'unknown',
+ ]
+ """Reason that the model stopped or failed to generate."""
+
+
+ class TopLogprob(BaseModel):
+     """List of the most likely tokens and their log probability, at this token position."""
+
+     token: str
+     """The top-kth token represented as a string."""
+
+     logprob: float
+     """The log probability value of the model for the top-kth token."""
+
+     bytes: Optional[List[int]] = Field(default=None)
+     """The top-kth token represented as a byte array (a list of integers)."""
+
+
+ class Logprob(BaseModel):
+     """Log probability for a token."""
+
+     token: str
+     """The predicted token represented as a string."""
+
+     logprob: float
+     """The log probability value of the model for the predicted token."""
+
+     bytes: Optional[List[int]] = Field(default=None)
+     """The predicted token represented as a byte array (a list of integers)."""
+
+     top_logprobs: Optional[List[TopLogprob]] = Field(default=None)
+     """If the `top_logprobs` argument is greater than 0, this will contain an ordered list of the top K most likely tokens and their log probabilities."""  # noqa: E501
+
+
+ class Logprobs(BaseModel):
+     """Log probability information for a completion choice."""
+
+     content: List[Logprob]
+     """a (num_generated_tokens,) length list containing the individual log probabilities for each generated token."""
+
+
+ class ChatCompletionChoice(BaseModel):
+     """Choice generated for completion."""
+
+     message: ChatMessageAssistant
+     """Assistant message."""
+
+     stop_reason: StopReason = Field(default='unknown')
+     """Reason that the model stopped generating."""
+
+     logprobs: Optional[Logprobs] = Field(default=None)
+     """Logprobs."""
+
+     @model_validator(mode='before')
+     @classmethod
+     def migrate_stop_reason(cls: Type['ChatCompletionChoice'], values: Dict[str, Any]) -> Dict[str, Any]:
+         if 'stop_reason' in values:
+             stop_reason = values['stop_reason']
+             if stop_reason == 'length':
+                 values['stop_reason'] = 'max_tokens'
+
+         return values
+
+     @classmethod
+     def from_content(cls, content: Union[str, List[Content]]) -> 'ChatCompletionChoice':
+         """Create a ChatCompletionChoice from content string."""
+         return cls(
+             message=ChatMessageAssistant(content=content),
+             stop_reason='stop',
+         )
+
+
+ class ModelOutput(BaseModel):
+     """Output from model generation."""
+
+     model: str = Field(default_factory=str)
+     """Model used for generation."""
+
+     choices: List[ChatCompletionChoice] = Field(default=[])
+     """Completion choices."""
+
+     usage: Optional[ModelUsage] = Field(default=None)
+     """Model token usage"""
+
+     time: Optional[float] = Field(default=None)
+     """Time elapsed (in seconds) for call to generate."""
+
+     metadata: Optional[Dict[str, Any]] = Field(default=None)
+     """Additional metadata associated with model output."""
+
+     error: Optional[str] = Field(default=None)
+     """Error message in the case of content moderation refusals."""
+
+     @property
+     def empty(self) -> bool:
+         return len(self.choices) == 0
+
+     @property
+     def stop_reason(self) -> StopReason:
+         """First message stop reason."""
+         return self.choices[0].stop_reason
+
+     @property
+     def message(self) -> ChatMessageAssistant:
+         """First message choice."""
+         return self.choices[0].message
+
+     @property
+     def completion(self) -> str:
+         """Text of the first message choice."""
+         if len(self.choices) > 0:
+             return self.choices[0].message.text
+         else:
+             return ''
+
+     @completion.setter
+     def completion(self, completion: str) -> None:
+         """Set the text of the first message choice.
+
+         Args:
+             completion (str): Text for first message.
+         """
+         if len(self.choices) > 0:
+             self.choices[0].message.text = completion
+         else:
+             self.choices.append(
+                 ChatCompletionChoice(
+                     message=ChatMessageAssistant(content=completion, model=self.model),
+                     stop_reason='stop',
+                 )
+             )
+
+     @property
+     def completions(self) -> List[str]:
+         """List of all message choices text."""
+         return [choice.message.text for choice in self.choices]
+
+     @staticmethod
+     def from_content(
+         model: str,
+         content: Union[str, List[Content]],
+         stop_reason: StopReason = 'stop',
+         error: Optional[str] = None,
+     ) -> 'ModelOutput':
+         """Create ModelOutput from simple text content.
+
+         Args:
+             model: Model name.
+             content: Text content from generation.
+             stop_reason: Stop reason for generation.
+             error: Error message.
+         """
+         return ModelOutput(
+             model=model,
+             choices=[
+                 ChatCompletionChoice(
+                     message=ChatMessageAssistant(content=content, model=model, source='generate'),
+                     stop_reason=stop_reason,
+                 )
+             ],
+             error=error,
+         )
+
+     @staticmethod
+     def for_tool_call(
+         model: str,
+         tool_name: str,
+         tool_arguments: Dict[str, Any],
+         internal: Optional[JsonValue] = None,
+         tool_call_id: Optional[str] = None,
+         content: Optional[str] = None,
+     ) -> 'ModelOutput':
+         """
+         Returns a ModelOutput for requesting a tool call.
+
+         Args:
+             model: Model name.
+             tool_name: The name of the tool.
+             tool_arguments: The arguments passed to the tool.
+             internal: The model's internal info for the tool (if any).
+             tool_call_id: Optional ID for the tool call. Defaults to a random UUID.
+             content: Optional content to include in the message. Defaults to "tool call for tool {tool_name}".
+
+         Returns:
+             A ModelOutput corresponding to the tool call
+         """
+         if content is None:
+             content = f'tool call for tool {tool_name}'
+
+         if tool_call_id is None:
+             tool_call_id = f'for_tool_call_{uuid.uuid4()}'
+
+         return ModelOutput(
+             model=model,
+             choices=[
+                 ChatCompletionChoice(
+                     message=ChatMessageAssistant(
+                         content=content,
+                         model=model,
+                         source='generate',
+                         tool_calls=[
+                             ToolCall(
+                                 id=tool_call_id,
+                                 internal=internal,
+                                 function=ToolFunction(
+                                     name=tool_name,
+                                     arguments=tool_arguments,
+                                 )
+                             )
+                         ],
+                     ),
+                     stop_reason='tool_calls',
+                 )
+             ],
+         )
+
+
+ def as_stop_reason(reason: Optional[str]) -> StopReason:
+     """Encode common reason strings into standard StopReason."""
+     if reason in ['stop', 'eos']:
+         return 'stop'
+     elif reason == 'length':
+         return 'max_tokens'
+     elif reason in ['tool_calls', 'function_call']:
+         return 'tool_calls'
+     elif reason in ['content_filter', 'model_length', 'max_tokens']:
+         return reason
+     else:
+         return 'unknown'
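
To round out the picture, a short sketch exercising the `ModelOutput` helpers defined above; the model name and tool arguments are made up for illustration:

```python
from evalscope.api.model.model_output import ModelOutput, ModelUsage, as_stop_reason

# Plain-text output; stop_reason defaults to 'stop'.
out = ModelOutput.from_content(model='demo-model', content='The answer is 42.')
assert out.completion == 'The answer is 42.'
assert out.stop_reason == 'stop'

# Output that requests a tool call; stop_reason becomes 'tool_calls'.
call = ModelOutput.for_tool_call(
    model='demo-model',
    tool_name='search',
    tool_arguments={'query': 'evalscope'},
)
assert call.stop_reason == 'tool_calls'
assert call.message.tool_calls[0].function.name == 'search'

# ModelUsage supports +; optional fields survive when only one side sets them.
total = (
    ModelUsage(input_tokens=10, output_tokens=5, total_tokens=15)
    + ModelUsage(input_tokens=3, output_tokens=2, total_tokens=5, reasoning_tokens=2)
)
assert (total.total_tokens, total.reasoning_tokens) == (20, 2)

# Provider-specific finish reasons normalize onto the StopReason literals.
assert as_stop_reason('length') == 'max_tokens'
assert as_stop_reason('eos') == 'stop'
```

These helpers are what the adapters in this release build on: `from_content` covers ordinary text completions, `for_tool_call` covers tool-use turns, and `as_stop_reason` maps provider finish reasons onto the shared `StopReason` vocabulary.
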