evalscope 0.6.0rc0__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114)
  1. evalscope/backend/opencompass/tasks/eval_datasets.py +1 -1
  2. evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +230 -0
  3. evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt +43 -0
  4. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +87 -0
  5. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +36 -0
  6. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +26 -0
  7. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +41 -0
  8. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +60 -0
  9. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +36 -0
  10. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +22 -0
  11. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +35 -0
  12. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +7 -0
  13. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +39 -0
  14. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +7 -0
  15. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +39 -0
  16. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +34 -0
  17. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +36 -0
  18. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +25 -0
  19. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +7 -0
  20. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +39 -0
  21. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +16 -0
  22. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +24 -0
  23. evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +18 -0
  24. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +120 -100
  25. evalscope/backend/rag_eval/utils/clip.py +149 -0
  26. evalscope/backend/rag_eval/utils/embedding.py +183 -0
  27. evalscope/backend/rag_eval/utils/llm.py +72 -0
  28. evalscope/backend/rag_eval/utils/tools.py +63 -0
  29. evalscope/backend/vlm_eval_kit/backend_manager.py +23 -21
  30. evalscope/benchmarks/ceval/samples.jsonl +1 -0
  31. evalscope/benchmarks/cmmlu/samples.jsonl +5 -0
  32. evalscope/benchmarks/mmlu/samples.jsonl +5 -0
  33. evalscope/benchmarks/race/samples.jsonl +5 -0
  34. evalscope/benchmarks/trivia_qa/samples.jsonl +5 -0
  35. evalscope/cli/start_perf.py +8 -11
  36. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
  37. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +58485 -0
  38. evalscope/metrics/resources/gpt2-zhcn3-v4.json +1 -0
  39. evalscope/metrics/rouge_metric.py +30 -15
  40. evalscope/perf/arguments.py +179 -0
  41. evalscope/perf/benchmark.py +245 -0
  42. evalscope/perf/http_client.py +127 -711
  43. evalscope/perf/main.py +35 -0
  44. evalscope/perf/plugin/__init__.py +2 -0
  45. evalscope/perf/plugin/api/__init__.py +3 -0
  46. evalscope/perf/{api_plugin_base.py → plugin/api/base.py} +17 -18
  47. evalscope/perf/{custom_api.py → plugin/api/custom_api.py} +25 -19
  48. evalscope/perf/{dashscope_api.py → plugin/api/dashscope_api.py} +28 -14
  49. evalscope/perf/{openai_api.py → plugin/api/openai_api.py} +51 -27
  50. evalscope/perf/plugin/datasets/__init__.py +6 -0
  51. evalscope/perf/{dataset_plugin_base.py → plugin/datasets/base.py} +13 -10
  52. evalscope/perf/plugin/datasets/custom.py +21 -0
  53. evalscope/perf/plugin/datasets/flickr8k.py +51 -0
  54. evalscope/perf/{datasets → plugin/datasets}/line_by_line.py +9 -5
  55. evalscope/perf/plugin/datasets/longalpaca.py +28 -0
  56. evalscope/perf/plugin/datasets/openqa.py +38 -0
  57. evalscope/perf/plugin/datasets/speed_benchmark.py +50 -0
  58. evalscope/perf/plugin/registry.py +54 -0
  59. evalscope/perf/{how_to_analysis_result.py → utils/analysis_result.py} +11 -5
  60. evalscope/perf/utils/benchmark_util.py +135 -0
  61. evalscope/perf/utils/chat_service.py +252 -0
  62. evalscope/perf/utils/db_util.py +200 -0
  63. evalscope/perf/utils/handler.py +46 -0
  64. evalscope/perf/utils/local_server.py +139 -0
  65. evalscope/registry/config/cfg_arena.yaml +77 -0
  66. evalscope/registry/config/cfg_arena_zhihu.yaml +63 -0
  67. evalscope/registry/config/cfg_pairwise_baseline.yaml +83 -0
  68. evalscope/registry/config/cfg_single.yaml +78 -0
  69. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +8 -0
  70. evalscope/registry/data/prompt_template/prompt_templates.jsonl +8 -0
  71. evalscope/registry/data/qa_browser/battle.jsonl +634 -0
  72. evalscope/registry/data/qa_browser/category_mapping.yaml +10 -0
  73. evalscope/registry/data/question.jsonl +80 -0
  74. evalscope/third_party/longbench_write/README.md +118 -0
  75. evalscope/third_party/longbench_write/default_task.json +27 -0
  76. evalscope/third_party/longbench_write/default_task.yaml +24 -0
  77. evalscope/third_party/toolbench_static/README.md +118 -0
  78. evalscope/third_party/toolbench_static/config_default.json +15 -0
  79. evalscope/third_party/toolbench_static/config_default.yaml +12 -0
  80. evalscope/third_party/toolbench_static/requirements.txt +2 -0
  81. evalscope/utils/logger.py +18 -20
  82. evalscope/utils/utils.py +41 -42
  83. evalscope/version.py +2 -2
  84. evalscope-0.7.0.dist-info/LICENSE +203 -0
  85. {evalscope-0.6.0rc0.dist-info → evalscope-0.7.0.dist-info}/METADATA +162 -103
  86. {evalscope-0.6.0rc0.dist-info → evalscope-0.7.0.dist-info}/RECORD +107 -32
  87. {evalscope-0.6.0rc0.dist-info → evalscope-0.7.0.dist-info}/WHEEL +1 -1
  88. {evalscope-0.6.0rc0.dist-info → evalscope-0.7.0.dist-info}/top_level.txt +1 -0
  89. tests/cli/__init__.py +1 -0
  90. tests/cli/test_run.py +76 -0
  91. tests/perf/__init__.py +1 -0
  92. tests/perf/test_perf.py +96 -0
  93. tests/rag/__init__.py +0 -0
  94. tests/rag/test_clip_benchmark.py +85 -0
  95. tests/rag/test_mteb.py +136 -0
  96. tests/rag/test_ragas.py +120 -0
  97. tests/swift/__init__.py +1 -0
  98. tests/swift/test_run_swift_eval.py +146 -0
  99. tests/swift/test_run_swift_vlm_eval.py +128 -0
  100. tests/swift/test_run_swift_vlm_jugde_eval.py +157 -0
  101. tests/test_run_all.py +12 -0
  102. tests/vlm/__init__.py +1 -0
  103. tests/vlm/test_vlmeval.py +59 -0
  104. evalscope/perf/_logging.py +0 -32
  105. evalscope/perf/datasets/longalpaca_12k.py +0 -20
  106. evalscope/perf/datasets/openqa.py +0 -22
  107. evalscope/perf/plugin_registry.py +0 -35
  108. evalscope/perf/query_parameters.py +0 -42
  109. evalscope/perf/server_sent_event.py +0 -43
  110. evalscope/preprocess/tokenizers/gpt2_tokenizer.py +0 -221
  111. /evalscope/{perf/datasets → backend/rag_eval/utils}/__init__.py +0 -0
  112. /evalscope/{preprocess/tokenizers → perf/utils}/__init__.py +0 -0
  113. {evalscope-0.6.0rc0.dist-info → evalscope-0.7.0.dist-info}/entry_points.txt +0 -0
  114. {evalscope/preprocess → tests}/__init__.py +0 -0
@@ -0,0 +1,149 @@
+ import os
+ import torch
+ import torch.nn.functional as F
+ from typing import List
+ from PIL import Image
+ from evalscope.backend.rag_eval.utils.tools import download_model, PIL_to_base64
+ from transformers import AutoModel, AutoProcessor
+ from langchain_core.embeddings import Embeddings
+
+
+ class VisionModel:
+     @staticmethod
+     def load(**kw):
+         api_base = kw.get("api_base", None)
+         if api_base:
+
+             return VLMAPI(
+                 model_name=kw.get("model_name", ""),
+                 openai_api_base=api_base,
+                 openai_api_key=kw.get("api_key", "EMPTY"),
+                 prompt=kw.get("prompt", None),
+             )
+         else:
+             return CLIPModel(**kw)
+
+
+ class VLMAPI:
+     def __init__(self, model_name, openai_api_base, openai_api_key, prompt=None):
+         from langchain_openai import ChatOpenAI
+         from langchain_core.prompts import ChatPromptTemplate
+
+         self.model_name = model_name
+         self.model = ChatOpenAI(
+             model_name=model_name,
+             openai_api_base=openai_api_base,
+             openai_api_key=openai_api_key,
+         )
+         self.default_prompt = "Please describe this image in general. Directly provide the description, do not include prefix like 'This image depicts'"
+         self.prompt = ChatPromptTemplate.from_messages(
+             [
+                 ("system", prompt if prompt else self.default_prompt),
+                 (
+                     "user",
+                     [
+                         {
+                             "type": "image_url",
+                             "image_url": {"url": "data:image/jpeg;base64,{image_data}"},
+                         }
+                     ]
+                 ),
+             ]
+         )
+         self.chain = self.prompt | self.model
+         self.transform = PIL_to_base64
+
+     def encode_image(self, images):
+         captions = []
+         for image in images:
+             response = self.chain.invoke({"image_data": image})
+             captions.append(response.content)
+         return captions
+
+
+ class CLIPModel(Embeddings):
+     def __init__(
+         self,
+         model_name: str,
+         revision: str = "master",
+         hub="modelscope",
+         device="cpu",
+     ):
+         self.device = device
+         self.model_name = model_name
+         self.revision = revision
+
+         # Download the model if it doesn't exist locally
+         if not os.path.exists(model_name) and hub == "modelscope":
+             model_name = download_model(self.model_name, self.revision)
+
+         # Load the model and processor
+         self.model = AutoModel.from_pretrained(model_name).to(self.device)
+         self.processor = AutoProcessor.from_pretrained(model_name)
+         self.transform = self.processor.image_processor
+         self.tokenizer = self.processor.tokenizer
+
+     def encode_text(self, batch_texts: List[str] | List[List[str]]):
+         if isinstance(batch_texts[0], list):
+             batch_texts = [
+                 text for _, texts in enumerate(batch_texts) for text in texts
+             ]
+         # Ensure that the input texts are within the token limit
+         max_length = self.tokenizer.model_max_length
+         if not max_length or max_length > 0xFFFFFF:
+             max_length = 512
+         encoded_inputs = self.tokenizer(
+             text=batch_texts,
+             max_length=max_length,
+             padding=True,
+             truncation=True,
+             return_tensors="pt",
+         )
+
+         inputs = {k: v.to(self.device) for k, v in encoded_inputs.items()}
+
+         with torch.no_grad():
+             text_features = self.model.get_text_features(**inputs)
+             text_features = F.normalize(text_features, p=2, dim=-1)
+         return text_features
+
+     def encode_image(self, image):
+         batch_images = torch.stack([d["pixel_values"][0] for d in image])
+         batch_images = batch_images.to(self.device)
+         with torch.no_grad():
+             image_features = self.model.get_image_features(batch_images)
+             image_features = F.normalize(image_features, p=2, dim=-1)
+         return image_features
+
+     def embed_documents(self, texts):
+         text_features = self.encode_text(texts)
+         return text_features.cpu().numpy().tolist()
+
+     def embed_query(self, text):
+         text_features = self.encode_text([text])
+         return text_features.cpu().numpy().tolist()[0]
+
+     def embed_image(self, uris: List[str]):
+         # read image and transform
+         images = [Image.open(image_path) for image_path in uris]
+         transformed_images = [
+             self.transform(
+                 image,
+                 return_tensors="pt",
+             )
+             for image in images
+         ]
+         image_features = self.encode_image(transformed_images)
+         return image_features.cpu().numpy().tolist()
+
+
+ if __name__ == "__main__":
+     model = CLIPModel("AI-ModelScope/chinese-clip-vit-large-patch14-336px")
+     model.embed_image(
+         [
+             "custom_eval/multimodal/images/AMNH.jpg",
+             "custom_eval/multimodal/images/AMNH.jpg",
+         ]
+     )
+     model.encode_text(["我喜欢吃饭" * 1000])
+     print("done")
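The new file above (its +149 line count lines up with entry 25 in the file list, evalscope/backend/rag_eval/utils/clip.py) dispatches on api_base: with an endpoint it returns the OpenAI-compatible VLMAPI captioner, otherwise a local CLIP embedder. A minimal usage sketch, assuming transformers, modelscope, and langchain-openai are installed; the "my-vlm" name and endpoint are hypothetical placeholders, only the CLIP model ID comes from the diff's __main__ block:

    from evalscope.backend.rag_eval.utils.clip import VisionModel

    # Local CLIP checkpoint; resolved via ModelScope when the path does not exist.
    clip = VisionModel.load(model_name="AI-ModelScope/chinese-clip-vit-large-patch14-336px")
    doc_vecs = clip.embed_documents(["a photo of a cat", "a photo of a dog"])
    query_vec = clip.embed_query("cat")

    # With api_base set, load() returns the VLMAPI captioning client instead.
    vlm = VisionModel.load(model_name="my-vlm", api_base="http://localhost:8000/v1")  # placeholder endpoint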
@@ -0,0 +1,183 @@
+ import os
+ import torch
+ from typing import List, Optional, Union, Dict
+ from sentence_transformers import models
+ from sentence_transformers.SentenceTransformer import SentenceTransformer
+ from sentence_transformers.cross_encoder import CrossEncoder
+ from torch import Tensor
+ from evalscope.backend.rag_eval.utils.tools import download_model
+ from evalscope.utils.logger import get_logger
+ from langchain_core.embeddings import Embeddings
+
+ logger = get_logger()
+
+
+ class BaseModel(Embeddings):
+     def __init__(
+         self,
+         model_name_or_path: str,
+         max_seq_length: int = 512,
+         prompt: str = '',
+         revision: Optional[str] = None,
+         **kwargs,
+     ):
+         self.model_name_or_path = model_name_or_path
+         self.max_seq_length = max_seq_length
+         self.model_kwargs = kwargs.pop('model_kwargs', {})
+         self.model_kwargs['trust_remote_code'] = True
+
+         self.config_kwargs = kwargs.pop('config_kwargs', {})
+         self.config_kwargs['trust_remote_code'] = True
+
+         self.encode_kwargs = kwargs.pop('encode_kwargs', {})
+         self.encode_kwargs['convert_to_tensor'] = True
+
+         self.prompt = prompt
+         self.revision = revision
+
+     @property
+     def mteb_model_meta(self):
+         """Model metadata for MTEB (Massive Text Embedding Benchmark)"""
+         from mteb import ModelMeta
+
+         return ModelMeta(
+             name=os.path.basename(self.model_name_or_path),
+             revision=self.revision,
+             languages=None,
+             release_date=None,
+         )
+
+     def embed_documents(self, texts: List[str]) -> List[List[float]]:
+         """Embed search docs. Compatible with langchain.
+
+         Args:
+             texts: List of text to embed.
+
+         Returns:
+             List of embeddings.
+         """
+         return self.encode_corpus(texts).tolist()
+
+     def embed_query(self, text: str) -> List[float]:
+         """Embed query text. Compatible with langchain.
+
+         Args:
+             text: Text to embed.
+
+         Returns:
+             Embedding.
+         """
+         return self.encode_queries(text).tolist()
+
+     def encode(self, texts: Union[str, List[str]], **kwargs) -> List[List[float]]:
+         """Embed text."""
+         raise NotImplementedError
+
+     def encode_queries(self, queries: List[str], **kwargs) -> list[torch.Tensor]:
+         """Embed query text. Compatible with mteb."""
+         raise NotImplementedError
+
+     def encode_corpus(self, corpus: List[str] | List[Dict[str, str]], **kwargs) -> list[torch.Tensor]:
+         """Embed search docs. Compatible with mteb."""
+         raise NotImplementedError
+
+
+ class SentenceTransformerModel(BaseModel):
+     def __init__(
+         self, model_name_or_path: str, pooling_mode: Optional[str] = None, **kwargs
+     ):
+         super().__init__(model_name_or_path, **kwargs)
+
+         if not pooling_mode:
+             self.model = SentenceTransformer(
+                 self.model_name_or_path,
+                 config_kwargs=self.config_kwargs,
+                 model_kwargs=self.model_kwargs,
+             )
+         else:
+             word_embedding_model = models.Transformer(
+                 self.model_name_or_path,
+                 config_args=self.config_kwargs,
+                 model_args=self.model_kwargs,
+             )
+             pooling_model = models.Pooling(
+                 word_embedding_model.get_word_embedding_dimension(),
+                 pooling_mode=pooling_mode,
+             )
+             self.model = SentenceTransformer(
+                 modules=[word_embedding_model, pooling_model],
+             )
+
+         self.model.max_seq_length = self.max_seq_length
+
+     def encode(self, texts: Union[str, List[str]], prompt=None, **kwargs) -> List[torch.Tensor]:
+         kwargs.pop('prompt_name', '')  # remove prompt name, use prompt
+         self.encode_kwargs.update(kwargs)
+
+         embeddings = self.model.encode(texts, prompt=prompt, **self.encode_kwargs)
+         assert isinstance(embeddings, Tensor)
+         return embeddings.cpu().detach()
+
+     def encode_queries(self, queries, **kwargs):
+         return self.encode(queries, prompt=self.prompt)
+
+     def encode_corpus(self, corpus, **kwargs):
+         if isinstance(corpus[0], dict):
+             input_texts = ['{} {}'.format(doc.get('title', ''), doc['text']).strip() for doc in corpus]
+         else:
+             input_texts = corpus
+         return self.encode(input_texts)
+
+
+ class CrossEncoderModel(BaseModel):
+     def __init__(self, model_name_or_path: str, **kwargs):
+         super().__init__(model_name_or_path, **kwargs)
+         self.model = CrossEncoder(
+             self.model_name_or_path,
+             trust_remote_code=True,
+             max_length=self.max_seq_length,
+         )
+
+     def predict(self, sentences: List[List[str]], **kwargs) -> List[List[float]]:
+         self.encode_kwargs.update(kwargs)
+
+         if len(sentences[0]) == 3:  # Note: For mteb retrieval task
+             processed_sentences = []
+             for query, docs, instruction in sentences:
+                 if isinstance(docs, dict):
+                     docs = docs['text']
+                 processed_sentences.append((self.prompt + query, docs))
+             sentences = processed_sentences
+         embeddings = self.model.predict(sentences, **self.encode_kwargs)
+         assert isinstance(embeddings, Tensor)
+         return embeddings
+
+
+ class EmbeddingModel:
+     """Custom embeddings"""
+
+     @staticmethod
+     def load(
+         model_name_or_path: str = '',
+         is_cross_encoder: bool = False,
+         hub: str = 'modelscope',
+         revision: Optional[str] = 'master',
+         **kwargs,
+     ):
+         # If the model path does not exist locally and hub is 'modelscope', download the model
+         if not os.path.exists(model_name_or_path) and hub == 'modelscope':
+             model_name_or_path = download_model(model_name_or_path, revision)
+
+         # Return different model instances based on whether it is a cross-encoder
+         if is_cross_encoder:
+             return CrossEncoderModel(
+                 model_name_or_path,
+                 revision=revision,
+                 **kwargs,
+             )
+         else:
+             return SentenceTransformerModel(
+                 model_name_or_path,
+                 revision=revision,
+                 **kwargs,
+             )
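EmbeddingModel.load above (+183 lines, matching entry 26, evalscope/backend/rag_eval/utils/embedding.py) is the single entry point for both bi-encoders and rerankers. A minimal sketch, assuming sentence-transformers and modelscope are installed; both model IDs below are hypothetical placeholders:

    from evalscope.backend.rag_eval.utils.embedding import EmbeddingModel

    # Bi-encoder: exposes the LangChain Embeddings interface via encode_corpus/encode_queries.
    embedder = EmbeddingModel.load(model_name_or_path="AI-ModelScope/bge-large-zh")  # placeholder ID
    vectors = embedder.embed_documents(["first passage", "second passage"])

    # Cross-encoder reranker: predict() scores (query, document) pairs directly.
    reranker = EmbeddingModel.load(model_name_or_path="my-reranker", is_cross_encoder=True)  # placeholder ID
    scores = reranker.predict([["what is the capital of France", "Paris is the capital of France"]])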
@@ -0,0 +1,72 @@
+ import os
+ from typing import Any, Dict, Iterator, List, Mapping, Optional
+ from modelscope.utils.hf_util import GenerationConfig
+ from langchain_core.callbacks.manager import CallbackManagerForLLMRun
+ from langchain_core.language_models.llms import LLM as BaseLLM
+ from evalscope.models.model_adapter import ChatGenerationModelAdapter
+ from langchain_openai import ChatOpenAI
+
+
+ class LLM:
+     @staticmethod
+     def load(**kw):
+         api_base = kw.get('api_base', None)
+         if api_base:
+             return ChatOpenAI(
+                 model_name=kw.get('model_name', ''),
+                 openai_api_base=api_base,
+                 openai_api_key=kw.get('api_key', 'EMPTY'),
+             )
+         else:
+             return LocalLLM(**kw)
+
+
+ class LocalLLM(BaseLLM):
+     """A custom LLM that loads a model from a given path and performs inference."""
+
+     model_name_or_path: str
+     model_revision: str = 'master'
+     template_type: str = 'default'
+     model_name: Optional[str]
+     model: Optional[ChatGenerationModelAdapter]
+     generation_config: Optional[Dict]
+
+     def __init__(self, **kw):
+         super().__init__(**kw)
+         self.model_name = os.path.basename(self.model_name_or_path)
+         self.model = ChatGenerationModelAdapter(
+             model_id=self.model_name_or_path,
+             model_revision=self.model_revision,
+             template_type=self.template_type,
+             generation_config=GenerationConfig(**self.generation_config) if self.generation_config else None,
+         )
+
+     def _call(
+         self,
+         prompt: str,
+         stop: Optional[List[str]] = None,
+         run_manager: Optional[CallbackManagerForLLMRun] = None,
+         **kwargs: Any,
+     ) -> str:
+         """Run the LLM on the given input."""
+         infer_cfg = {'stop': stop}
+
+         response = self.model._model_generate(prompt, infer_cfg)
+         return response
+
+     @property
+     def _identifying_params(self) -> Dict[str, Any]:
+         """Return a dictionary of identifying parameters."""
+         return {
+             # The model name allows users to specify custom token counting
+             # rules in LLM monitoring applications (e.g., in LangSmith users
+             # can provide per token pricing for their model and monitor
+             # costs for the given LLM.)
+             'model_name': self.model_name,
+             'revision': self.model_revision,
+         }
+
+     @property
+     def _llm_type(self) -> str:
+         """Get the type of language model used by this chat model. Used for logging purposes only."""
+         return self.model_name
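LLM.load (+72 lines, matching entry 27, evalscope/backend/rag_eval/utils/llm.py) follows the same dispatch as VisionModel.load: an api_base yields a ChatOpenAI client, otherwise LocalLLM wraps evalscope's ChatGenerationModelAdapter behind the LangChain LLM interface. A sketch; the endpoint, model name, and checkpoint path are placeholders:

    from evalscope.backend.rag_eval.utils.llm import LLM

    # Remote: any OpenAI-compatible endpoint.
    remote = LLM.load(model_name="my-chat-model", api_base="http://localhost:8000/v1")  # placeholders

    # Local: checkpoint loaded through ChatGenerationModelAdapter.
    local = LLM.load(model_name_or_path="/path/to/checkpoint")  # placeholder path
    print(local.invoke("Hello"))  # standard LangChain invocation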
@@ -0,0 +1,63 @@
+ import io
+ import os
+ import base64
+ from modelscope import snapshot_download
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+
+ def PIL_to_bytes(image_format, **kwargs):
+     OPTIONS = {
+         "webp": dict(format="webp", lossless=True),
+         "png": dict(format="png"),
+         "jpg": dict(format="jpeg"),
+     }
+
+     def transform(image):
+         bytestream = io.BytesIO()
+         image.save(bytestream, **OPTIONS[image_format])
+         return bytestream.getvalue()
+
+     return transform
+
+
+ def PIL_to_base64(image, **kwargs):
+     bytestream = io.BytesIO()
+     image.save(bytestream, format="jpeg")
+     return base64.b64encode(bytestream.getvalue()).decode("utf-8")
+
+
+ def path_to_bytes(filepath):
+     with open(filepath, "rb") as fp:
+         return fp.read()
+
+
+ def path_to_base64(filepath):
+     file_content = path_to_bytes(filepath)
+     return base64.b64encode(file_content).decode("utf-8")
+
+
+ def ensure_dir(file_path):
+     os.makedirs(os.path.dirname(file_path), exist_ok=True)
+
+
+ def save_to_jsonl(df, file_path):
+     ensure_dir(file_path)
+     df.to_json(file_path, orient="records", lines=True, force_ascii=False)
+
+
+ def save_to_tsv(df, file_path):
+     ensure_dir(file_path)
+     df.to_csv(file_path, sep="\t", index=False)
+
+
+ def download_model(model_id: str, revision: str):
+     """
+     default base dir: '~/.cache/modelscope/hub/model_id'
+     """
+     logger.info(f"Loading model {model_id} from modelscope")
+
+     model_path = snapshot_download(model_id=model_id, revision=revision)
+
+     return model_path
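The helpers above (+63 lines, matching entry 28, evalscope/backend/rag_eval/utils/tools.py) are the I/O glue used by the new clip.py: PIL_to_bytes returns a per-format transform closure, and download_model resolves a ModelScope snapshot. A usage sketch; the image path and model ID are placeholders:

    from PIL import Image
    from evalscope.backend.rag_eval.utils.tools import PIL_to_bytes, PIL_to_base64, download_model

    image = Image.open("example.jpg")      # placeholder path
    to_png = PIL_to_bytes("png")           # returns a transform closure, not bytes
    png_bytes = to_png(image)
    b64 = PIL_to_base64(image)             # JPEG-encoded base64 string for data URLs
    model_path = download_model("AI-ModelScope/some-model", "master")  # placeholder ID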
@@ -1,10 +1,11 @@
+ import copy
+ import subprocess
+ from functools import partial
  from typing import Optional, Union
- from evalscope.utils import is_module_installed, get_valid_list
+
  from evalscope.backend.base import BackendManager
+ from evalscope.utils import get_valid_list, is_module_installed
  from evalscope.utils.logger import get_logger
- from functools import partial
- import subprocess
- import copy

  logger = get_logger()

@@ -19,6 +20,7 @@ class ExecutionMode:


  class VLMEvalKitBackendManager(BackendManager):
+
      def __init__(self, config: Union[str, dict], **kwargs):
          """BackendManager for VLM Evaluation Kit

@@ -36,7 +38,6 @@ class VLMEvalKitBackendManager(BackendManager):

          self._check_valid()

-
      def _check_valid(self):
          # Ensure not both model and datasets are empty
          if not self.args.data or not self.args.model:
@@ -45,15 +46,15 @@ class VLMEvalKitBackendManager(BackendManager):
          # Check datasets
          valid_datasets, invalid_datasets = get_valid_list(self.args.data, self.valid_datasets)
          if len(invalid_datasets) != 0:
-             logger.warning(f"Using custom dataset: {invalid_datasets}, ")
-
+             logger.warning(f'Using custom dataset: {invalid_datasets}, ')
+
          # Check model
          if isinstance(self.args.model[0], dict):
              model_names = [model['name'] for model in self.args.model]
              valid_model_names, invalid_model_names = get_valid_list(model_names, self.valid_model_names)
              assert len(invalid_model_names) == 0, f'Invalid models: {invalid_model_names}, ' \
                  f'refer to the following list to get proper model name: {self.valid_model_names}'
-
+
              # set model_cfg
              new_model_names = []
              for model_cfg in self.args.model:
@@ -62,19 +63,15 @@ class VLMEvalKitBackendManager(BackendManager):
                  if model_name == 'CustomAPIModel':
                      model_type = model_cfg['type']
                      remain_cfg = copy.deepcopy(model_cfg)
-                     del remain_cfg['name'] # remove not used args
-                     del remain_cfg['type'] # remove not used args
-
-                     self.valid_models.update({
-                         model_type: partial(model_class,
-                                             model=model_type,
-                                             **remain_cfg)
-                     })
+                     del remain_cfg['name']  # remove not used args
+                     del remain_cfg['type']  # remove not used args
+
+                     self.valid_models.update({model_type: partial(model_class, model=model_type, **remain_cfg)})
                      new_model_names.append(model_type)
                  else:
                      remain_cfg = copy.deepcopy(model_cfg)
-                     del remain_cfg['name'] # remove not used args
-
+                     del remain_cfg['name']  # remove not used args
+
                      self.valid_models[model_name] = partial(model_class, **remain_cfg)
                      new_model_names.append(model_name)

@@ -83,7 +80,7 @@ class VLMEvalKitBackendManager(BackendManager):
          elif isinstance(self.args.model[0], str):
              valid_model_names, invalid_model_names = get_valid_list(self.args.model, self.valid_model_names)
              if len(invalid_datasets) != 0:
-                 logger.warning(f"Using custom dataset: {invalid_datasets}, ")
+                 logger.warning(f'Using custom dataset: {invalid_datasets}, ')

      @property
      def cmd(self):
@@ -127,7 +124,7 @@ class VLMEvalKitBackendManager(BackendManager):
              f'--data {" ".join(self.args.data)} ' \
              f'{self.get_restore_arg("verbose", self.args.verbose)} ' \
              f'{self.get_restore_arg("ignore", self.args.ignore)} ' \
-             f'{self.get_restore_arg("rerun", self.args.rerun)} ' \
+             f'{self.get_restore_arg("reuse", self.args.reuse)} ' \
              f'{self.get_arg_with_default("work-dir", self.args.work_dir)} ' \
              f'{self.get_arg_with_default("limit", self.args.limit)} ' \
              f'{self.get_arg_with_default("mode", self.args.mode)} ' \
@@ -141,7 +138,12 @@ class VLMEvalKitBackendManager(BackendManager):
          if run_mode == ExecutionMode.CMD:
              logger.info(f'** Run command: {self.cmd}')
              try:
-                 subprocess.run(self.cmd, check=True, ext=True, shell=True,)
+                 subprocess.run(
+                     self.cmd,
+                     check=True,
+                     ext=True,
+                     shell=True,
+                 )
              except subprocess.CalledProcessError as e:
                  logger.error(f'** Run command failed: {e.stderr}')
                  raise
@@ -0,0 +1 @@
+ {'id': 0, 'question': '下列关于税法基本原则的表述中,不正确的是____。', 'A': '税收法定原则包括税收要件法定原则和税务合法性原则', 'B': '税收公平原则源于法律上的平等性原则', 'C': '税收效率原则包含经济效率和行政效率两个方面', 'D': '税务机关按法定程序依法征税,可以自由做出减征、停征或免征税款的决定', 'answer': 'D', 'explanation': ''}
@@ -0,0 +1,5 @@
+ {'input': '毛毛骑在牛背上过河,他共有甲、乙、丙、丁4头牛,甲过河要20分钟,乙过河要30分钟,丙过河要40分钟,丁过河要50分钟。毛毛每次只能赶2头牛过河,要把4头牛都赶到对岸去,最少要多少分钟?', 'A': '190', 'B': '180', 'C': '170', 'D': '160', 'target': 'D'}
+ {'input': '下列关于重力的说法正确的是', 'A': '在地球周围的物体都要受到重力作用,与其运动状态无关', 'B': '对某一物体而言,重力的大小是一个恒量,不随物体的地理位置而改变', 'C': '重力就是地球对物体的吸引力,重力的方向总是竖直向下', 'D': '在地球表面各处的重力方向都是相同的', 'target': 'A'}
+ {'input': '心脏的静脉血回心的主要途径是', 'A': '心小静脉', 'B': '冠状窦', 'C': '心中静脉', 'D': '心前静脉', 'target': 'B'}
+ {'input': "以西蒙为代表的决策理论学派提出的决策准则是", 'A': '最优化', 'B': '公平', 'C': '民主化', 'D': '满意', 'target': 'D'}
+ {'input': '20世纪初,英国首相阿斯奎斯说:"我们现在有一个牢固确立了两百年的传统,即归根到底,王位的占有者接受其大臣的建议并据此行事。"这一传统的确立,使一个以小农业和手工业生产为主的国家变成了一个典型的资本主义国家,成为欧洲各国效仿的对象。各国效仿的理由是', 'A': '英国"光荣革命"宣告了欧洲新社会政治制度的诞生', 'B': '殖民主义深刻影响了英国"世界工厂"的地位', 'C': '英国经济上的成就得益于其制度设计', 'D': '英国启蒙思想奠定了资产阶级民主主义政治的理论基础', 'target': 'C'}
@@ -0,0 +1,5 @@
+ {'input': 'A "dished face" profile is often associated with', 'A': 'a protruding mandible due to reactivation of the condylar cartilage by acromegaly.', 'B': 'a recessive maxilla due to failure of elongation of the cranial base.', 'C': 'an enlarged frontal bone due to hydrocephaly.', 'D': 'defective development of the maxillary air sinus.', 'target': 'B'}
+ {'input': '___________ is based on the idea that customer expectations of the service they will receive shape their perception of the actual service encounter.', 'A': 'Service quality.', 'B': 'Service action.', 'C': 'Service recovery.', 'D': 'Service satisfaction.', 'target': 'A'}
+ {'input': ' Information collected for the first time specifically for a marketing research study is called:', 'A': 'Secondary research.', 'B': 'Primary research.', 'C': 'Soft research.', 'D': 'Experimental research.', 'target': 'B'}
+ {'input': "This includes advertisements that contain 'call-to-response' mechanisms such as telephone numbers, website addresses, email and postal addresses:", 'A': 'Direct response advertising.', 'B': 'Sales promotions.', 'C': 'Mass media advertising.', 'D': 'Public relations.', 'target': 'A'}
+ {'input': 'Which of the following is not part of the external marketing environment?', 'A': 'Political.', 'B': 'Legal.', 'C': 'Product.', 'D': 'Socio-cultural.', 'target': 'C'}
@@ -0,0 +1,5 @@
+ {'example_id': 'middle4227.txt', 'article': 'There are many kinds...ealthy.\n,.', 'answer': 'D', 'question': 'We may read this pas... in _ .', 'options': ['a letter', 'a story', 'a newspaper', 'a health magazine']}
+ {'example_id': 'middle3329.txt', 'article': 'Do you know why diff...ng at all.', 'answer': 'B', 'question': 'Those pests with dif...of danger.', 'options': ['change their colours', 'hide in the day time...r at night', 'move quietly', 'hide at night and ap...e day time']}
+ {'example_id': 'middle3614.txt', 'article': 'The seahorse is a ve...o the sea.', 'answer': 'B', 'question': 'A seahorse eats _ .', 'options': ['sea weed', 'small fish', 'water', 'nothing']}
+ {'example_id': 'middle6632.txt', 'article': 'Kids have unbelievab...h at her."', 'answer': 'D', 'question': 'Which is NOT mention...e passage?', 'options': ['Robots keep secrets.', 'Robots give suggestions.', 'Robots do chores.', 'Robots make movies.']}
+ {'example_id': 'middle3503.txt', 'article': 'Have you ever heard ...eir lives.', 'answer': 'B', 'question': 'Which of the followi...lue moon"?', 'options': ['Simon often tells jo...blue moon.', 'Tom rarely remembers...blue moon.', 'Mary likes to go sho...blue moon.', 'Cindy hates to stay ...blue moon.']}
@@ -0,0 +1,5 @@
+ {"input": [{"role": "system", "content": "Follow the given examples and answer the question."}, {"role": "user", "content": "Who was the man behind The Chipmunks?"}], "ideal": ["David Seville", "david seville"]}
+ {"input": [{"role": "system", "content": "Follow the given examples and answer the question."}, {"role": "user", "content": "Which Lloyd Webber musical premiered in the US on 10th December 1993?"}], "ideal": ["Sunset Blvd", "West Sunset Boulevard", "Sunset Boulevard", "Sunset Bulevard", "Sunset Blvd.", "sunset boulevard", "sunset bulevard", "west sunset boulevard", "sunset blvd"]}
+ {"input": [{"role": "system", "content": "Follow the given examples and answer the question."}, {"role": "user", "content": "Who was the next British Prime Minister after Arthur Balfour?"}], "ideal": ["Sir Henry Campbell-Bannerman", "Campbell-Bannerman", "Campbell Bannerman", "Sir Henry Campbell Bannerman", "Henry Campbell Bannerman", "Henry Campbell-Bannerman", "henry campbell bannerman", "sir henry campbell bannerman", "campbell bannerman"]}
+ {"input": [{"role": "system", "content": "Follow the given examples and answer the question."}, {"role": "user", "content": "Who had a 70s No 1 hit with Kiss You All Over?"}], "ideal": ["Internal exile", "Exiles", "Transported for life", "Exile (politics and government)", "Voluntary exile", "Sent into exile", "Exile and Banishment", "Self-exile", "Forced exile", "Exile", "Exile in Greek tragedy", "Banish", "Banishment", "exiles", "voluntary exile", "forced exile", "banish", "self exile", "exile politics and government", "exile in greek tragedy", "sent into exile", "banishment", "transported for life", "exile", "internal exile", "exile and banishment"]}
+ {"input": [{"role": "system", "content": "Follow the given examples and answer the question."}, {"role": "user", "content": "What claimed the life of singer Kathleen Ferrier?"}], "ideal": ["Cancer pathology", "Deaths by cancer", "Anti-cancer", "Cancer (disease)", "Cancerophobia", "Malignant lesion", "Cancer medication", "Malignant tumors", "Cancer signs", "Malignant neoplasm", "Invasive (cancer)", "Malignant Neoplasms", "Malignant growth", "Sporadic cancer", "Malignant cancer", "Tumour virus", "Cancer en cuirasse", "Microtumor", "Malignant neoplasms", "Malignant tumour", "Carcinophobia", "Malignacy", "Cancer patient", "Epithelial cancers", "Solid cancer", "Cancers", "Tumor medication", "Malignant neoplastic disease", "AIDS-related cancer", "Invasive cancer", "Cancer therapy", "Cancerous tumor", "Cancer", "Financial toxicity", "Cancer diagnosis", "Cancer (medicine)", "Malignant tumor", "Cancerous", "Borderline (cancer)", "Signs of cancer", "Malignancies", "Cancer aromatase", "aids related cancer", "sporadic cancer", "cancer disease", "malignant tumors", "cancers", "carcinophobia", "cancer", "cancer diagnosis", "malignant neoplastic disease", "malignant neoplasm", "tumour virus", "cancer medicine", "deaths by cancer", "malignant tumour", "epithelial cancers", "solid cancer", "cancerous", "borderline cancer", "invasive cancer", "anti cancer", "cancer pathology", "cancer signs", "cancer aromatase", "cancer therapy", "financial toxicity", "cancerophobia", "cancer en cuirasse", "cancer patient", "cancerous tumor", "malignant cancer", "malignant neoplasms", "tumor medication", "signs of cancer", "malignacy", "malignant tumor", "cancer medication", "microtumor", "malignancies", "malignant lesion", "malignant growth"]}
@@ -1,20 +1,21 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
- from abc import abstractmethod
- import os, sys, time
+ import os
  from argparse import ArgumentParser
- import subprocess
-

  from evalscope.cli.base import CLICommand
- from evalscope.perf.http_client import add_argument, run_perf_benchmark
+ from evalscope.perf.arguments import add_argument
+ from evalscope.perf.main import run_perf_benchmark

  current_path = os.path.dirname(os.path.abspath(__file__))
  root_path = os.path.dirname(current_path)
+
+
  def subparser_func(args):
      """ Function which will be called for a specific sub parser.
      """
      return PerfBenchCMD(args)
-
+
+
  class PerfBenchCMD(CLICommand):
      name = 'perf'

@@ -28,10 +29,6 @@ class PerfBenchCMD(CLICommand):
          parser = parsers.add_parser(PerfBenchCMD.name)
          add_argument(parser)
          parser.set_defaults(func=subparser_func)
-
+
      def execute(self):
          run_perf_benchmark(self.args)
-
-
-
-
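This hunk tracks the perf refactor (entries 40-43 in the file list): add_argument now lives in evalscope.perf.arguments and run_perf_benchmark in evalscope.perf.main, so the CLI only wires the subparser. A minimal programmatic sketch built from those two imports; everything else is standard argparse:

    from argparse import ArgumentParser
    from evalscope.perf.arguments import add_argument
    from evalscope.perf.main import run_perf_benchmark

    parser = ArgumentParser("evalscope perf")
    add_argument(parser)        # registers the perf benchmark flags
    args = parser.parse_args()
    run_perf_benchmark(args)    # same call PerfBenchCMD.execute() makes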
@@ -51,7 +51,7 @@ try:
      punkt_tab_url = 'https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/open_data/nltk_data/punkt_tab.zip'

      if not os.path.exists(punkt_path):
-         os.system(f'wget -P {nltk_dir} {punkt_tab_url}')
+         os.system(f'wget --timeout=10 --tries=3 -P {nltk_dir} {punkt_tab_url}')
          os.system(f'unzip {punkt_path} -d {nltk_dir}')
      else:
          logger.info(f'{punkt_path} already exists, skipping download')