evalscope 0.17.1__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
Files changed (273)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/__init__.py +0 -0
  3. evalscope/api/benchmark/__init__.py +3 -0
  4. evalscope/api/benchmark/adapters/__init__.py +3 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
  8. evalscope/api/benchmark/benchmark.py +321 -0
  9. evalscope/api/benchmark/meta.py +115 -0
  10. evalscope/api/dataset/__init__.py +2 -0
  11. evalscope/api/dataset/dataset.py +349 -0
  12. evalscope/api/dataset/loader.py +261 -0
  13. evalscope/api/dataset/utils.py +143 -0
  14. evalscope/api/evaluator/__init__.py +3 -0
  15. evalscope/api/evaluator/cache.py +355 -0
  16. evalscope/api/evaluator/evaluator.py +56 -0
  17. evalscope/api/evaluator/state.py +264 -0
  18. evalscope/api/filter/__init__.py +1 -0
  19. evalscope/api/filter/filter.py +72 -0
  20. evalscope/api/messages/__init__.py +11 -0
  21. evalscope/api/messages/chat_message.py +198 -0
  22. evalscope/api/messages/content.py +102 -0
  23. evalscope/api/messages/utils.py +35 -0
  24. evalscope/api/metric/__init__.py +2 -0
  25. evalscope/api/metric/metric.py +55 -0
  26. evalscope/api/metric/scorer.py +105 -0
  27. evalscope/api/mixin/__init__.py +2 -0
  28. evalscope/api/mixin/dataset_mixin.py +105 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +157 -0
  32. evalscope/api/model/model.py +383 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/ui/app_ui.py +2 -1
  40. evalscope/app/ui/multi_model.py +50 -25
  41. evalscope/app/ui/single_model.py +23 -11
  42. evalscope/app/utils/data_utils.py +42 -26
  43. evalscope/app/utils/text_utils.py +0 -2
  44. evalscope/app/utils/visualization.py +9 -4
  45. evalscope/arguments.py +6 -7
  46. evalscope/backend/opencompass/api_meta_template.py +2 -1
  47. evalscope/backend/opencompass/backend_manager.py +6 -3
  48. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  49. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  50. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  51. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  52. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  53. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  55. evalscope/backend/rag_eval/utils/embedding.py +2 -1
  56. evalscope/backend/rag_eval/utils/llm.py +13 -12
  57. evalscope/benchmarks/__init__.py +0 -2
  58. evalscope/benchmarks/aigc/i2i/__init__.py +0 -0
  59. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +44 -0
  60. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +53 -55
  61. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +41 -46
  62. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +29 -45
  63. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +34 -44
  64. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +16 -27
  65. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  66. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  67. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  68. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  69. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  70. evalscope/benchmarks/arena_hard/utils.py +37 -1
  71. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  72. evalscope/benchmarks/bfcl/bfcl_adapter.py +181 -160
  73. evalscope/benchmarks/bfcl/generation.py +222 -0
  74. evalscope/benchmarks/ceval/ceval_adapter.py +94 -162
  75. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  76. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  77. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  78. evalscope/benchmarks/data_collection/data_collection_adapter.py +183 -45
  79. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  80. evalscope/benchmarks/docmath/utils.py +4 -5
  81. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  82. evalscope/benchmarks/frames/frames_adapter.py +135 -52
  83. evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
  84. evalscope/benchmarks/general_arena/utils.py +23 -27
  85. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  86. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  87. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  88. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  89. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  90. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  91. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  92. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  93. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  94. evalscope/benchmarks/ifeval/instructions.py +109 -64
  95. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  96. evalscope/benchmarks/ifeval/utils.py +6 -7
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  105. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  106. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  107. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  108. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +192 -152
  109. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  110. evalscope/benchmarks/race/race_adapter.py +33 -119
  111. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  112. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  113. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  114. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  115. evalscope/benchmarks/tau_bench/generation.py +147 -0
  116. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +112 -54
  117. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  118. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  119. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -265
  120. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  121. evalscope/cli/cli.py +2 -0
  122. evalscope/cli/start_server.py +6 -3
  123. evalscope/collections/__init__.py +2 -10
  124. evalscope/collections/sampler.py +10 -10
  125. evalscope/collections/schema.py +13 -11
  126. evalscope/config.py +95 -54
  127. evalscope/constants.py +29 -61
  128. evalscope/evaluator/__init__.py +1 -1
  129. evalscope/evaluator/evaluator.py +277 -423
  130. evalscope/filters/__init__.py +2 -0
  131. evalscope/filters/extraction.py +126 -0
  132. evalscope/filters/selection.py +57 -0
  133. evalscope/metrics/__init__.py +13 -13
  134. evalscope/metrics/llm_judge.py +32 -30
  135. evalscope/metrics/math_parser.py +27 -22
  136. evalscope/metrics/metric.py +307 -0
  137. evalscope/metrics/metrics.py +22 -18
  138. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  139. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  140. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  141. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  142. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  143. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  144. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  145. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  146. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  147. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  148. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  149. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  150. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  151. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  152. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  153. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  154. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  155. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  156. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  157. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  158. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  159. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  160. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  162. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  163. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  184. evalscope/models/__init__.py +6 -29
  185. evalscope/models/mockllm.py +65 -0
  186. evalscope/models/model_apis.py +47 -0
  187. evalscope/models/modelscope.py +455 -0
  188. evalscope/models/openai_compatible.py +123 -0
  189. evalscope/models/text2image_model.py +124 -0
  190. evalscope/models/utils/openai.py +698 -0
  191. evalscope/perf/benchmark.py +2 -1
  192. evalscope/perf/http_client.py +4 -2
  193. evalscope/perf/plugin/api/custom_api.py +5 -4
  194. evalscope/perf/plugin/api/openai_api.py +11 -9
  195. evalscope/perf/plugin/datasets/custom.py +2 -1
  196. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  197. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  198. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  199. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  200. evalscope/perf/plugin/datasets/openqa.py +4 -2
  201. evalscope/perf/utils/benchmark_util.py +7 -5
  202. evalscope/perf/utils/db_util.py +9 -6
  203. evalscope/perf/utils/local_server.py +8 -3
  204. evalscope/perf/utils/rich_display.py +16 -10
  205. evalscope/report/__init__.py +2 -2
  206. evalscope/report/combinator.py +18 -12
  207. evalscope/report/generator.py +101 -6
  208. evalscope/report/{utils.py → report.py} +8 -6
  209. evalscope/run.py +26 -44
  210. evalscope/summarizer.py +1 -1
  211. evalscope/utils/__init__.py +21 -2
  212. evalscope/utils/chat_service.py +2 -1
  213. evalscope/utils/deprecation_utils.py +12 -1
  214. evalscope/utils/function_utils.py +29 -0
  215. evalscope/utils/io_utils.py +100 -5
  216. evalscope/utils/json_schema.py +208 -0
  217. evalscope/utils/logger.py +51 -12
  218. evalscope/utils/model_utils.py +10 -7
  219. evalscope/utils/multi_choices.py +271 -0
  220. evalscope/utils/url_utils.py +65 -0
  221. evalscope/version.py +2 -2
  222. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA +98 -49
  223. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/RECORD +234 -216
  224. tests/aigc/test_t2i.py +22 -4
  225. tests/benchmark/__init__.py +1 -0
  226. tests/benchmark/test_eval.py +386 -0
  227. tests/cli/test_all.py +3 -5
  228. tests/cli/test_collection.py +13 -4
  229. tests/cli/test_custom.py +22 -15
  230. tests/rag/test_clip_benchmark.py +1 -0
  231. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  232. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  233. evalscope/benchmarks/benchmark.py +0 -81
  234. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  235. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  236. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  237. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  238. evalscope/benchmarks/data_adapter.py +0 -528
  239. evalscope/benchmarks/filters.py +0 -59
  240. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  241. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  242. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  243. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  244. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  245. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  246. evalscope/benchmarks/race/race.py +0 -104
  247. evalscope/benchmarks/race/samples.jsonl +0 -5
  248. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  249. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  250. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  251. evalscope/benchmarks/utils.py +0 -60
  252. evalscope/collections/evaluator.py +0 -375
  253. evalscope/metrics/completion_parsers.py +0 -227
  254. evalscope/metrics/named_metrics.py +0 -55
  255. evalscope/models/adapters/__init__.py +0 -14
  256. evalscope/models/adapters/base_adapter.py +0 -84
  257. evalscope/models/adapters/bfcl_adapter.py +0 -246
  258. evalscope/models/adapters/chat_adapter.py +0 -207
  259. evalscope/models/adapters/choice_adapter.py +0 -222
  260. evalscope/models/adapters/custom_adapter.py +0 -71
  261. evalscope/models/adapters/server_adapter.py +0 -236
  262. evalscope/models/adapters/t2i_adapter.py +0 -79
  263. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  264. evalscope/models/custom/__init__.py +0 -4
  265. evalscope/models/custom/custom_model.py +0 -50
  266. evalscope/models/custom/dummy_model.py +0 -99
  267. evalscope/models/local_model.py +0 -128
  268. evalscope/models/register.py +0 -41
  269. tests/cli/test_run.py +0 -489
  270. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/LICENSE +0 -0
  271. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/WHEEL +0 -0
  272. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/entry_points.txt +0 -0
  273. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/top_level.txt +0 -0
@@ -61,17 +61,18 @@ def t5_tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_IN
 
 
 def load_pretrained_model(
-        model_cls,
-        model_args,
-        model_path=None,
-        tokenizer_path=None,
-        model_max_length=None,
-        padding_side=None,
-        image_aspect_ratio='pad',  # or 'square'
-        mmprojector_repo=None,
-        mmprojector_name=None,
-        device='cuda',
-        cache_dir=CACHE_DIR):
+    model_cls,
+    model_args,
+    model_path=None,
+    tokenizer_path=None,
+    model_max_length=None,
+    padding_side=None,
+    image_aspect_ratio='pad',  # or 'square'
+    mmprojector_repo=None,
+    mmprojector_name=None,
+    device='cuda',
+    cache_dir=CACHE_DIR
+):
     tokenizer_dict = {}
     if model_max_length:
         tokenizer_dict['model_max_length'] = model_max_length
@@ -80,7 +81,7 @@ def load_pretrained_model(
 
     from ..utils import download_file
 
-    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, use_fast=False, **tokenizer_dict)
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, **tokenizer_dict)
     # tokenizer.pad_token = tokenizer.unk_token # could be redundant
 
     model_path = download_file(model_path, cache_dir=cache_dir)
@@ -106,7 +107,8 @@ def load_pretrained_model(
         model_args.pretrain_mm_mlp_adapter = pretrain_mm_mlp_adapter  # important to set to correct path
 
         model.get_model().initialize_vision_modules(
-            model_args)  # This will load the CLIP vision encoder and MLP projector
+            model_args
+        )  # This will load the CLIP vision encoder and MLP projector
     else:
         model.resize_token_embeddings(len(tokenizer))  # perhaps not needed
 
@@ -8,8 +8,9 @@ from ..model import ScoreModel
 class VQAScoreModel(ScoreModel):
 
     @abstractmethod
-    def forward(self, images: List[str], texts: List[str], question_template: str,
-                answer_template: str) -> torch.Tensor:
+    def forward(
+        self, images: List[str], texts: List[str], question_template: str, answer_template: str
+    ) -> torch.Tensor:
         """Forward pass of the model to return n scores for n (image, text) pairs (in PyTorch Tensor)
         question_template: a string with optional {} to be replaced with the 'text'
         answer_template: a string with optional {} to be replaced with the 'text'
@@ -4,38 +4,15 @@ from typing import TYPE_CHECKING
 from evalscope.utils.import_utils import _LazyModule
 
 if TYPE_CHECKING:
-    from .adapters import (BaseModelAdapter, BFCLAdapter, ChatGenerationModelAdapter, ContinuationLogitsModelAdapter,
-                           CustomModelAdapter, MultiChoiceModelAdapter, ServerModelAdapter, T2IModelAdapter,
-                           TauBenchAdapter, initialize_model_adapter)
-    from .custom import CustomModel, DummyCustomModel
-    from .local_model import LocalModel, get_local_model
-    from .register import get_model_adapter
+    from .model_apis import llm_ckpt, mockllm, openai_api
 
 else:
     _import_structure = {
-        'adapters': [
-            'BaseModelAdapter',
-            'initialize_model_adapter',
-            'ChatGenerationModelAdapter',
-            'ContinuationLogitsModelAdapter',
-            'MultiChoiceModelAdapter',
-            'CustomModelAdapter',
-            'ServerModelAdapter',
-            'T2IModelAdapter',
-            'TauBenchAdapter',
-            'BFCLAdapter',
-        ],
-        'custom': [
-            'CustomModel',
-            'DummyCustomModel',
-        ],
-        'local_model': [
-            'LocalModel',
-            'get_local_model',
-        ],
-        'register': [
-            'get_model_adapter',
-        ],
+        'model_apis': [
+            'openai_api',
+            'mockllm',
+            'llm_ckpt',
+        ]
     }
 
     import sys
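
The old adapter, custom-model and local-model entry points disappear from `evalscope.models`; the lazy module now exposes only the three model-API factories defined in `model_apis.py` below. A minimal sketch of the new import surface (example code, not part of the diff):

```python
# Sketch only: the lazy module re-exports the registered factory functions;
# calling a factory returns the corresponding ModelAPI subclass.
from evalscope.models import llm_ckpt, mockllm, openai_api

MockLLM = mockllm()                  # -> evalscope.models.mockllm.MockLLM
OpenAICompatibleAPI = openai_api()   # -> evalscope.models.openai_compatible.OpenAICompatibleAPI
ModelScopeAPI = llm_ckpt()           # -> evalscope.models.modelscope.ModelScopeAPI
```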
@@ -0,0 +1,65 @@
+from typing import Any, Dict, Generator, Iterable, Iterator, List, Optional, Union
+
+from evalscope.api.dataset import Dataset
+from evalscope.api.messages import ChatMessage
+from evalscope.api.model import GenerateConfig, ModelAPI, ModelOutput
+from evalscope.api.tool import ToolChoice, ToolInfo
+from evalscope.utils.function_utils import thread_safe
+
+
+class MockLLM(ModelAPI):
+    """A mock implementation of the ModelAPI class for testing purposes.
+
+    Always returns default_output, unless you pass in a model_args
+    key "custom_outputs" with a value of an Iterable[ModelOutput]
+    """
+
+    default_output = 'Default output from mockllm/model'
+
+    outputs: Iterator[ModelOutput]
+
+    def __init__(
+        self,
+        model_name: str,
+        base_url: Optional[str] = None,
+        api_key: Optional[str] = None,
+        config: GenerateConfig = GenerateConfig(),
+        custom_outputs: Iterable[ModelOutput] = None,
+        **model_args: Dict[str, Any],
+    ) -> None:
+        super().__init__(model_name, base_url, api_key, config)
+        self.model_args = model_args
+        if custom_outputs is not None:
+            # We cannot rely on the user of this model giving custom_outputs
+            # the correct type since they do not call this constructor
+            # Hence this type check and the one in generate.
+            if not isinstance(custom_outputs, (Iterable, Generator)):
+                raise ValueError(
+                    f"model_args['custom_outputs'] must be an Iterable or a Generator, got {custom_outputs}"
+                )
+            self.outputs = iter(custom_outputs)
+        else:
+            self.outputs = iter((
+                ModelOutput.from_content(model='mockllm', content=self.default_output)
+                for _ in iter(int, 1)  # produce an infinite iterator
+            ))
+
+    @thread_safe
+    def generate(
+        self,
+        input: List[ChatMessage],
+        tools: List[ToolInfo],
+        tool_choice: ToolChoice,
+        config: GenerateConfig,
+    ) -> ModelOutput:
+        try:
+            output = next(self.outputs)
+        except StopIteration:
+            raise ValueError('custom_outputs ran out of values')
+
+        if not isinstance(output, ModelOutput):
+            raise ValueError(f'output must be an instance of ModelOutput; got {type(output)}; content: {repr(output)}')
+        return output
+
+    def batch_generate(inputs: Dataset, config: GenerateConfig) -> List[ModelOutput]:
+        return super().batch_generate(inputs, config)
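
A short usage sketch for the new mock backend (example code, not part of the diff): `custom_outputs` supplies a finite sequence of `ModelOutput`s and each `generate` call pops the next one; since MockLLM ignores its message and tool arguments, placeholders stand in for them here.

```python
from evalscope.api.model import GenerateConfig, ModelOutput
from evalscope.models.mockllm import MockLLM

canned = [
    ModelOutput.from_content(model='mockllm', content='first reply'),
    ModelOutput.from_content(model='mockllm', content='second reply'),
]
mock = MockLLM(model_name='mockllm/model', custom_outputs=canned)

# generate() returns the next canned output regardless of its inputs,
# so empty/None placeholders are enough for this sketch.
first = mock.generate(input=[], tools=[], tool_choice=None, config=GenerateConfig())
second = mock.generate(input=[], tools=[], tool_choice=None, config=GenerateConfig())
# A third call would raise ValueError('custom_outputs ran out of values').
```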
@@ -0,0 +1,47 @@
+from evalscope.api.model import ModelAPI
+from evalscope.api.registry import register_model_api
+from evalscope.utils.deprecation_utils import deprecated
+
+
+@register_model_api(name='mock_llm')
+def mockllm() -> type[ModelAPI]:
+    from .mockllm import MockLLM
+
+    return MockLLM
+
+
+@register_model_api(name='openai_api')
+def openai_api() -> type[ModelAPI]:
+    from .openai_compatible import OpenAICompatibleAPI
+
+    return OpenAICompatibleAPI
+
+
+@register_model_api(name='server')
+@deprecated(since='1.0.0', remove_in='1.1.0', alternative='openai_api')
+def server() -> type[ModelAPI]:
+    from .openai_compatible import OpenAICompatibleAPI
+
+    return OpenAICompatibleAPI
+
+
+@register_model_api(name='llm_ckpt')
+def llm_ckpt() -> type[ModelAPI]:
+    from .modelscope import ModelScopeAPI
+
+    return ModelScopeAPI
+
+
+@register_model_api(name='checkpoint')
+@deprecated(since='1.0.0', remove_in='1.1.0', alternative='llm_ckpt')
+def checkpoint() -> type[ModelAPI]:
+    from .modelscope import ModelScopeAPI
+
+    return ModelScopeAPI
+
+
+@register_model_api(name='text2image')
+def text2image() -> type[ModelAPI]:
+    from .text2image_model import Text2ImageAPI
+
+    return Text2ImageAPI
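
The registry hook is usable outside the package as well; below is a hypothetical registration that mirrors the factory pattern above (`my_package.my_model.MyCustomAPI` is a stand-in, not part of evalscope):

```python
from evalscope.api.model import ModelAPI
from evalscope.api.registry import register_model_api


@register_model_api(name='my_custom_api')
def my_custom_api() -> type[ModelAPI]:
    # Import lazily inside the factory, as the built-in registrations do.
    from my_package.my_model import MyCustomAPI  # hypothetical import

    return MyCustomAPI
```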
@@ -0,0 +1,455 @@
+from __future__ import annotations
+
+import copy
+import functools
+import json
+import time
+import torch  # type: ignore
+from concurrent.futures import Future
+from dataclasses import dataclass
+from logging import getLogger
+from modelscope import AutoModelForCausalLM, AutoTokenizer
+from queue import Empty, Queue
+from threading import Thread
+from torch import Tensor  # type: ignore
+from typing import Any, Dict, List, Literal, Optional, Protocol, Tuple, Union, cast
+from typing_extensions import override
+
+from evalscope.api.messages import (
+    ChatMessage,
+    ChatMessageAssistant,
+    ContentAudio,
+    ContentImage,
+    ContentText,
+    ContentVideo,
+)
+from evalscope.api.model import (
+    ChatCompletionChoice,
+    GenerateConfig,
+    Logprob,
+    Logprobs,
+    ModelAPI,
+    ModelOutput,
+    ModelUsage,
+    TopLogprob,
+)
+from evalscope.api.tool import ToolChoice, ToolInfo
+from evalscope.utils.model_utils import get_device
+
+logger = getLogger()
+
+
+class ModelScopeAPI(ModelAPI):
+
+    def __init__(
+        self,
+        model_name: str,
+        base_url: Optional[str] = None,
+        api_key: Optional[str] = None,
+        config: GenerateConfig = GenerateConfig(),
+        **model_args: Any,
+    ):
+        super().__init__(
+            model_name=model_name,
+            base_url=base_url,
+            api_key=api_key,
+            config=config,
+        )
+
+        # collect known model_args (then delete them so we can pass the rest on)
+        def collect_model_arg(name: str) -> Optional[Any]:
+            nonlocal model_args
+            value = model_args.get(name, None)
+            if value is not None:
+                model_args.pop(name)
+            return value
+
+        model_path = collect_model_arg('model_path')
+        device_map = collect_model_arg('device_map')
+        torch_dtype = collect_model_arg('precision')
+        tokenizer_path = collect_model_arg('tokenizer_path')
+        self.chat_template = collect_model_arg('chat_template')
+        self.tokenizer_call_args = collect_model_arg('tokenizer_call_args')
+        self.enable_thinking = collect_model_arg('enable_thinking')
+        if self.tokenizer_call_args is None:
+            self.tokenizer_call_args = {}
+
+        # device
+        self.device = device_map or get_device()
+
+        # torch dtype
+        DTYPE_MAP = {'float16': torch.float16, 'float32': torch.float32, 'bfloat16': torch.bfloat16, 'auto': 'auto'}
+
+        if isinstance(torch_dtype, str) and torch_dtype != 'auto':
+            torch_dtype = DTYPE_MAP.get(torch_dtype, torch.float32)
+        self.torch_dtype = torch_dtype
+
+        # model
+        model_name_or_path = model_path or model_name
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_name_or_path,
+            device_map=self.device,
+            token=self.api_key,
+            torch_dtype=self.torch_dtype,
+            trust_remote_code=True,
+            **model_args
+        )
+
+        # tokenizer
+        tokenizer_name_or_path = tokenizer_path or model_name_or_path
+        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, trust_remote_code=True)
+        # LLMs generally don't have a pad token and we need one for batching
+        if self.tokenizer.pad_token is None:
+            if self.tokenizer.eos_token is not None:
+                self.tokenizer.pad_token = self.tokenizer.eos_token
+            else:
+                # add a pad token
+                self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
+        # set padding side to left for LLMs
+        self.tokenizer.padding_side = 'left'
+        # set chat template if provided
+        if self.chat_template:
+            self.tokenizer.chat_template = self.chat_template
+            logger.info(f'Using custom chat template: {self.chat_template}')
+
+    def generate(
+        self,
+        input: List[ChatMessage],
+        tools: List[ToolInfo],
+        tool_choice: ToolChoice,
+        config: GenerateConfig,
+    ) -> ModelOutput:
+
+        # create chat
+        chat = self.ms_chat(input, tools)
+
+        assert isinstance(self.tokenizer_call_args, dict)
+        # prepare tokenizer
+        tokenizer = functools.partial(
+            self.tokenizer,
+            return_tensors='pt',
+            padding=True,
+            **self.tokenizer_call_args,
+        )
+
+        # prepare generator
+        kwargs: Dict[str, Any] = {}
+        if config.do_sample is not None:
+            kwargs['do_sample'] = config.do_sample
+        if config.n is not None:
+            if config.n > 1:
+                assert config.do_sample, 'n > 1 requires do_sample=True in GenerateConfig'
+            kwargs['num_return_sequences'] = config.n
+        if config.max_tokens is not None:
+            kwargs['max_new_tokens'] = config.max_tokens
+        if config.temperature is not None:
+            kwargs['temperature'] = config.temperature
+        if config.top_p is not None:
+            kwargs['top_p'] = config.top_p
+        if config.top_k is not None:
+            kwargs['top_k'] = config.top_k
+        if config.logprobs is not None:
+            kwargs['output_logits'] = config.logprobs
+        if 'return_dict_in_generate' in kwargs:
+            assert kwargs['return_dict_in_generate']
+        if config.stop_seqs is not None:
+            from transformers.generation import StopStringCriteria  # type: ignore
+
+            stopping_criteria = [StopStringCriteria(self.tokenizer, config.stop_seqs)]
+            kwargs['stopping_criteria'] = stopping_criteria
+
+        kwargs['return_dict_in_generate'] = True
+        generator = functools.partial(self.model.generate, **kwargs)
+
+        # prepare decoder
+        decoder = functools.partial(
+            self.tokenizer.batch_decode,
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False,
+        )
+
+        # generate
+        responses = batched_generate(
+            GenerateInput(
+                input=chat,
+                device=self.model.device,
+                tokenizer=tokenizer,
+                generator=generator,
+                decoder=decoder,
+                batch_size=config.batch_size or self.max_connections(),
+            )
+        )
+
+        choices: List[ChatCompletionChoice] = []
+        for response in responses:
+            # gather logprobs
+            final_logprobs = None
+            if config.logprobs is not None:
+                final_logprobs = extract_logprobs(
+                    response=response,
+                    top=config.top_logprobs,
+                    tokenizer=self.tokenizer,
+                )
+
+            # construct choice
+            # TODO: Handle tool calls
+            choice = ChatCompletionChoice(
+                message=ChatMessageAssistant(content=response.output, model=self.model_name, source='generate'),
+                logprobs=(Logprobs(content=final_logprobs) if final_logprobs is not None else None),
+            )
+            choices.append(choice)
+
+        # return output
+        return ModelOutput(
+            model=self.model_name,
+            choices=choices,
+            usage=ModelUsage(
+                input_tokens=response.input_tokens,
+                output_tokens=response.output_tokens,
+                total_tokens=response.total_tokens,
+            ),
+            time=response.time,
+        )
+
+    @override
+    def max_tokens(self) -> Optional[int]:
+        """Default is 2048, bump it up to a value suitable for evals."""
+        return 2048
+
+    @override
+    def max_connections(self) -> int:
+        """Effectively the batch size."""
+        return 8
+
+    def ms_chat(self, messages: List[ChatMessage], tools: List[ToolInfo]) -> str:
+        # convert to ms format
+        tools_list = []
+        ms_messages = copy.deepcopy(messages)
+        if len(tools) > 0:
+            tools_list = [json.loads(tool.model_dump_json(exclude_none=True, indent=2)) for tool in tools]
+
+        ms_messages = message_content_to_string(ms_messages)
+        # apply chat template
+        if self.tokenizer.chat_template is not None:
+            chat = self.tokenizer.apply_chat_template(
+                ms_messages,
+                add_generation_prompt=True,
+                tokenize=False,
+                tools=tools_list if len(tools_list) > 0 else None,
+                enable_thinking=self.enable_thinking,  # not all models use this, check if it is supported
+            )
+        else:
+            chat = ''
+            for message in ms_messages:
+                chat += f'{message.role}: {message.content}\n'
+        # return
+        return cast(str, chat)
+
+
+def message_content_to_string(messages: List[ChatMessage]) -> List[ChatMessage]:
+    """Convert list of content in `ChatMessageAssistant`, `ChatMessageUser` or `ChatMessageSystem` to a string."""
+    for message in messages:
+        if isinstance(message.content, list):
+            is_multimodal = any(
+                isinstance(item, (ContentAudio, ContentImage, ContentVideo)) for item in message.content
+            )
+            if is_multimodal:
+                raise NotImplementedError(
+                    'Transformer model does not support multimodal content, please provide text inputs only.'
+                )
+            message.content = message.text
+    return messages
+
+
+# return value from generate as a result of specifying return_dict_in_generate
+class ModelGenerateOutput:
+    sequences: Tensor
+    logits: tuple[Tensor]
+
+
+class Tokenizer(Protocol):
+
+    def __call__(self, input: List[str]) -> Dict[Literal['input_ids', 'attention_mask'], Tensor]:
+        ...
+
+
+class Generator(Protocol):
+
+    def __call__(self, input_ids: Tensor, attention_mask: Tensor) -> Tensor:
+        ...
+
+
+class Decoder(Protocol):
+
+    def __call__(self, sequences: Tensor) -> list[str]:
+        ...
+
+
+@dataclass
+class GenerateInput:
+    input: str
+    device: str
+    tokenizer: Tokenizer
+    generator: Generator
+    decoder: Decoder
+    batch_size: int
+
+
+@dataclass
+class GenerateOutput:
+    output: str
+    input_tokens: int
+    output_tokens: int
+    total_tokens: int
+    logprobs: Optional[torch.Tensor]
+    time: float
+
+
+@dataclass
+class _QueueItem:
+    input: GenerateInput
+    future: Future[GenerateOutput]
+
+
+batch_thread: Optional[Thread] = None
+
+batch_queue: 'Queue[_QueueItem]' = Queue()
+
+
+def batched_generate(input: GenerateInput) -> List[GenerateOutput]:
+    # start the background thread if necessary
+    global batch_thread
+    if batch_thread is None:
+        batch_thread = Thread(target=process_batches, daemon=True)
+        batch_thread.start()
+
+    # enqueue the job
+    future = Future[GenerateOutput]()
+    batch_queue.put(_QueueItem(input=input, future=future))
+
+    return future.result()
+
+
+def process_batches() -> None:
+    while True:
+        # drain the queue (wait until no new messages have shown up for 2 seconds)
+        inputs: List[Tuple[GenerateInput, Future[GenerateOutput]]] = []
+        while True:
+            try:
+                input = batch_queue.get(timeout=2)
+                inputs.append((input.input, input.future))
+                if len(inputs) == input.input.batch_size:
+                    # max batch size reached
+                    break
+            except Empty:
+                # we have exhausted the queue
+                break
+
+        # see if we have any work to do
+        if len(inputs) == 0:
+            continue
+
+        try:
+            # capture the generator and decoder functions
+            start_time = time.monotonic()
+            first_input = inputs[0][0]
+            device = first_input.device
+            tokenizer = first_input.tokenizer
+            generator = first_input.generator
+            decoder = first_input.decoder
+            num_return_sequences = generator.keywords.get('num_return_sequences', 1)
+
+            # tokenize and move to device
+            tokenized_inputs = tokenizer([item[0].input for item in inputs])
+            input_ids = tokenized_inputs['input_ids']
+            attention_mask = tokenized_inputs['attention_mask']
+            input_ids = input_ids.to(device)
+            attention_mask = attention_mask.to(device)
+
+            # generate
+            with torch.inference_mode():
+                generation_outputs = cast(
+                    ModelGenerateOutput,
+                    generator(input_ids=input_ids, attention_mask=attention_mask),
+                )
+                generate_ids = generation_outputs.sequences
+                logits = generation_outputs.logits
+
+            # get logprobs from logits
+            logprobs = None
+            if logits is not None:
+                stacked_logits = torch.stack(logits).transpose(0, 1)
+                logprobs = torch.nn.functional.log_softmax(stacked_logits, dim=-1)
+
+            # decode
+            generated_tokens = generate_ids[:, input_ids.size(dim=1):]
+            if logprobs is not None:
+                assert logprobs.shape[1] == generated_tokens.shape[1]
+            outputs = decoder(sequences=generated_tokens)
+
+            # call back futures
+            total_time = time.monotonic() - start_time
+            for input_index in range(len(inputs)):
+                choices: List[GenerateOutput] = []
+                # handle input
+                future = inputs[input_index][1]
+                input_tokens = input_ids[input_index].shape[-1]
+                # handle choices
+                for choice_index in range(num_return_sequences):
+                    output_index = input_index * num_return_sequences + choice_index
+                    # handle out of
+                    output = outputs[output_index]
+                    output_tokens = generate_ids[output_index].shape[-1] - input_tokens
+                    logprobs_tensor = logprobs[output_index] if logprobs is not None else None
+                    # create the output
+                    choices.append(
+                        GenerateOutput(
+                            output=output,
+                            input_tokens=input_tokens,
+                            output_tokens=output_tokens,
+                            total_tokens=input_tokens + output_tokens,
+                            logprobs=logprobs_tensor,
+                            time=total_time,
+                        )
+                    )
+
+                # asyncio futures are not thread safe, so we need to pass the event loop
+                # down to this point, so we can mark the future as done in a thread safe manner.
+                # see: https://docs.python.org/3/library/asyncio-dev.html#concurrency-and-multithreading
+                future.set_result(choices)
+
+        except Exception as ex:
+            for inp in inputs:
+                future = inp[1]
+                future.set_exception(ex)
+
+
+def extract_logprobs(
+    response: GenerateOutput,
+    top: Optional[int],
+    tokenizer,
+) -> List[Logprob]:
+    assert response.logprobs is not None
+    k = top or 1
+    topk_values, topk_inds = response.logprobs.topk(k=k, dim=-1)
+    final_logprobs = []
+    for toks, vals in zip(topk_inds, topk_values):
+        top_logprobs: List[TopLogprob] = []
+        for tok, val in zip(toks, vals):
+            # TODO: you get byte artifacts converting single ids to tokens like this...
+            # but `tokenizer.decode` strips spaces. There must be a better way to do this.
+            token_str = tokenizer.convert_ids_to_tokens(tok.item())
+            top_logprobs.append(TopLogprob(
+                token=token_str,
+                logprob=val,
+                bytes=list(map(ord, token_str)),
+            ))
+        final_logprobs.append(
+            Logprob(
+                token=top_logprobs[0].token,
+                logprob=top_logprobs[0].logprob,
+                bytes=top_logprobs[0].bytes,
+                top_logprobs=top_logprobs,
+            )
+        )
+    return final_logprobs
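
For reference, a self-contained sketch (random tensors, assumed shapes; not code from the diff) of the logprob handling in the new backend: per-step logits are stacked into `(batch, steps, vocab)`, converted with `log_softmax`, and `topk` selects the candidate tokens that `extract_logprobs` wraps into `Logprob`/`TopLogprob` entries.

```python
import torch

# Toy shapes: batch of 2 prompts, 5 generated steps, vocabulary of 1000 tokens.
step_logits = tuple(torch.randn(2, 1000) for _ in range(5))

stacked = torch.stack(step_logits).transpose(0, 1)           # (batch, steps, vocab)
logprobs = torch.nn.functional.log_softmax(stacked, dim=-1)  # log-probabilities

# For one sample, take the top-3 tokens at every generated position,
# as extract_logprobs does with k = top_logprobs (defaulting to 1).
topk_values, topk_ids = logprobs[0].topk(k=3, dim=-1)
print(topk_ids.shape)  # torch.Size([5, 3])
```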