evalscope 0.13.2__py3-none-any.whl → 0.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (214)
  1. evalscope/arguments.py +2 -1
  2. evalscope/backend/rag_eval/__init__.py +1 -1
  3. evalscope/backend/rag_eval/backend_manager.py +21 -5
  4. evalscope/backend/rag_eval/cmteb/arguments.py +10 -0
  5. evalscope/backend/rag_eval/ragas/arguments.py +0 -1
  6. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +7 -2
  7. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -5
  8. evalscope/backend/rag_eval/utils/embedding.py +49 -3
  9. evalscope/backend/rag_eval/utils/llm.py +4 -4
  10. evalscope/backend/vlm_eval_kit/backend_manager.py +4 -2
  11. evalscope/benchmarks/__init__.py +2 -2
  12. evalscope/benchmarks/aigc/__init__.py +0 -0
  13. evalscope/benchmarks/aigc/t2i/__init__.py +0 -0
  14. evalscope/benchmarks/aigc/t2i/base.py +56 -0
  15. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +77 -0
  16. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +58 -0
  17. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +58 -0
  18. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +57 -0
  19. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +37 -0
  20. evalscope/benchmarks/aime/aime24_adapter.py +1 -1
  21. evalscope/benchmarks/aime/aime25_adapter.py +4 -4
  22. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -2
  23. evalscope/benchmarks/arc/arc_adapter.py +2 -2
  24. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -3
  25. evalscope/benchmarks/ceval/ceval_adapter.py +2 -2
  26. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +1 -3
  27. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +1 -1
  28. evalscope/benchmarks/competition_math/competition_math_adapter.py +1 -2
  29. evalscope/benchmarks/data_adapter.py +21 -10
  30. evalscope/benchmarks/data_collection/data_collection_adapter.py +6 -4
  31. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +2 -2
  32. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  33. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
  34. evalscope/benchmarks/live_code_bench/evaluate_utils.py +16 -21
  35. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +5 -4
  36. evalscope/benchmarks/live_code_bench/testing_util.py +369 -550
  37. evalscope/benchmarks/maritime_bench/__init__.py +0 -0
  38. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +79 -0
  39. evalscope/benchmarks/math_500/math_500_adapter.py +1 -1
  40. evalscope/benchmarks/mmlu/mmlu_adapter.py +8 -8
  41. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  42. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
  43. evalscope/benchmarks/musr/musr_adapter.py +1 -1
  44. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -2
  45. evalscope/benchmarks/utils.py +7 -16
  46. evalscope/cli/start_app.py +1 -1
  47. evalscope/collections/evaluator.py +20 -6
  48. evalscope/config.py +8 -4
  49. evalscope/constants.py +11 -0
  50. evalscope/evaluator/evaluator.py +2 -2
  51. evalscope/evaluator/reviewer/auto_reviewer.py +1 -1
  52. evalscope/metrics/__init__.py +49 -4
  53. evalscope/metrics/llm_judge.py +1 -1
  54. evalscope/metrics/named_metrics.py +13 -0
  55. evalscope/metrics/t2v_metrics/__init__.py +66 -0
  56. evalscope/metrics/t2v_metrics/clipscore.py +14 -0
  57. evalscope/metrics/t2v_metrics/constants.py +12 -0
  58. evalscope/metrics/t2v_metrics/itmscore.py +14 -0
  59. evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
  60. evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
  61. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
  62. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
  63. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +132 -0
  64. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +286 -0
  65. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +114 -0
  66. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +86 -0
  67. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +85 -0
  68. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
  69. evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
  70. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +84 -0
  71. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +97 -0
  72. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +171 -0
  73. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
  74. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +80 -0
  75. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +73 -0
  76. evalscope/metrics/t2v_metrics/models/model.py +45 -0
  77. evalscope/metrics/t2v_metrics/models/utils.py +25 -0
  78. evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
  79. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
  80. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
  81. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +300 -0
  82. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
  83. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +82 -0
  84. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
  85. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +218 -0
  86. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +150 -0
  87. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
  88. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
  89. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
  90. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +22 -0
  91. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +188 -0
  92. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +106 -0
  93. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +307 -0
  94. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
  95. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
  96. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +191 -0
  97. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +318 -0
  98. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
  99. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
  100. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
  101. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
  102. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
  103. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
  104. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
  105. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
  106. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
  107. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
  108. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
  109. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
  110. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
  111. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
  112. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
  113. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
  114. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
  115. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
  116. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
  117. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
  118. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
  119. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
  120. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
  121. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +208 -0
  122. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
  123. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1093 -0
  124. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
  125. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
  126. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
  127. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +452 -0
  128. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +364 -0
  129. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +755 -0
  130. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +273 -0
  131. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +880 -0
  132. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1844 -0
  133. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +81 -0
  134. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +56 -0
  135. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
  136. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
  137. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
  138. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +185 -0
  139. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +178 -0
  140. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +112 -0
  141. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
  142. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +344 -0
  143. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +858 -0
  144. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +271 -0
  145. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +503 -0
  146. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1270 -0
  147. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +473 -0
  148. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +31 -0
  149. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
  150. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
  151. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +392 -0
  152. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +127 -0
  153. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +17 -0
  154. evalscope/metrics/t2v_metrics/score.py +78 -0
  155. evalscope/metrics/t2v_metrics/vqascore.py +14 -0
  156. evalscope/models/__init__.py +50 -14
  157. evalscope/models/adapters/__init__.py +17 -0
  158. evalscope/models/{base_adapter.py → adapters/base_adapter.py} +17 -17
  159. evalscope/models/{chat_adapter.py → adapters/chat_adapter.py} +10 -7
  160. evalscope/models/{choice_adapter.py → adapters/choice_adapter.py} +2 -6
  161. evalscope/models/{custom_adapter.py → adapters/custom_adapter.py} +2 -4
  162. evalscope/models/{server_adapter.py → adapters/server_adapter.py} +1 -3
  163. evalscope/models/adapters/t2i_adapter.py +76 -0
  164. evalscope/models/custom/__init__.py +2 -1
  165. evalscope/models/custom/dummy_model.py +11 -13
  166. evalscope/models/local_model.py +82 -33
  167. evalscope/models/model.py +2 -42
  168. evalscope/models/register.py +26 -0
  169. evalscope/perf/arguments.py +24 -5
  170. evalscope/perf/benchmark.py +28 -42
  171. evalscope/perf/http_client.py +2 -3
  172. evalscope/perf/plugin/api/custom_api.py +1 -1
  173. evalscope/perf/plugin/api/openai_api.py +2 -2
  174. evalscope/perf/plugin/datasets/custom.py +4 -1
  175. evalscope/perf/plugin/datasets/flickr8k.py +2 -1
  176. evalscope/perf/plugin/datasets/line_by_line.py +4 -1
  177. evalscope/perf/plugin/datasets/longalpaca.py +4 -1
  178. evalscope/perf/plugin/datasets/openqa.py +4 -1
  179. evalscope/perf/plugin/datasets/random_dataset.py +13 -6
  180. evalscope/perf/utils/benchmark_util.py +14 -8
  181. evalscope/perf/utils/db_util.py +9 -3
  182. evalscope/perf/utils/log_utils.py +41 -0
  183. evalscope/report/__init__.py +1 -0
  184. evalscope/report/app.py +128 -78
  185. evalscope/report/app_arguments.py +11 -0
  186. evalscope/report/generator.py +1 -1
  187. evalscope/run.py +10 -3
  188. evalscope/summarizer.py +2 -1
  189. evalscope/third_party/thinkbench/eval.py +19 -7
  190. evalscope/utils/chat_service.py +2 -2
  191. evalscope/utils/import_utils.py +66 -0
  192. evalscope/utils/utils.py +48 -29
  193. evalscope/version.py +2 -2
  194. {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/METADATA +37 -15
  195. {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/RECORD +209 -96
  196. tests/aigc/__init__.py +1 -0
  197. tests/aigc/test_t2i.py +87 -0
  198. tests/cli/test_all.py +4 -4
  199. tests/cli/test_collection.py +2 -1
  200. tests/cli/test_run.py +19 -12
  201. tests/perf/test_perf.py +3 -3
  202. tests/rag/test_clip_benchmark.py +0 -1
  203. tests/rag/test_mteb.py +37 -8
  204. tests/rag/test_ragas.py +29 -26
  205. tests/vlm/test_vlmeval.py +37 -1
  206. evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
  207. evalscope/benchmarks/live_code_bench/execute_utils.py +0 -267
  208. evalscope/metrics/code_metric.py +0 -98
  209. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
  210. evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
  211. {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/LICENSE +0 -0
  212. {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/WHEEL +0 -0
  213. {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/entry_points.txt +0 -0
  214. {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/top_level.txt +0 -0
evalscope/utils/import_utils.py ADDED
@@ -0,0 +1,66 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# Copyright 2023-present the HuggingFace Inc. team.
+
+import importlib
+import os
+from itertools import chain
+from types import ModuleType
+from typing import Any
+
+from .logger import get_logger
+
+logger = get_logger()  # pylint: disable=invalid-name
+
+
+class _LazyModule(ModuleType):
+    """
+    Module class that surfaces all objects but only performs associated imports when the objects are requested.
+    """
+
+    # Very heavily inspired by optuna.integration._IntegrationModule
+    # https://github.com/optuna/optuna/blob/master/optuna/integration/__init__.py
+    def __init__(self, name, module_file, import_structure, module_spec=None, extra_objects=None):
+        super().__init__(name)
+        self._modules = set(import_structure.keys())
+        self._class_to_module = {}
+        for key, values in import_structure.items():
+            for value in values:
+                self._class_to_module[value] = key
+        # Needed for autocompletion in an IDE
+        self.__all__ = list(import_structure.keys()) + list(chain(*import_structure.values()))
+        self.__file__ = module_file
+        self.__spec__ = module_spec
+        self.__path__ = [os.path.dirname(module_file)]
+        self._objects = {} if extra_objects is None else extra_objects
+        self._name = name
+        self._import_structure = import_structure
+
+    # Needed for autocompletion in an IDE
+    def __dir__(self):
+        result = super().__dir__()
+        # The elements of self.__all__ that are submodules may or may not be in the dir already, depending on whether
+        # they have been accessed or not. So we only add the elements of self.__all__ that are not already in the dir.
+        for attr in self.__all__:
+            if attr not in result:
+                result.append(attr)
+        return result
+
+    def __getattr__(self, name: str) -> Any:
+        if name in self._objects:
+            return self._objects[name]
+        if name in self._modules:
+            value = self._get_module(name)
+        elif name in self._class_to_module.keys():
+            module = self._get_module(self._class_to_module[name])
+            value = getattr(module, name)
+        else:
+            raise AttributeError(f'module {self.__name__} has no attribute {name}')
+
+        setattr(self, name, value)
+        return value
+
+    def _get_module(self, module_name: str):
+        return importlib.import_module('.' + module_name, self.__name__)
+
+    def __reduce__(self):
+        return self.__class__, (self._name, self.__file__, self._import_structure)
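The `_LazyModule` class above follows the HuggingFace/optuna lazy-import pattern: submodules are imported only when one of their attributes is first accessed on the package. For context, a package `__init__.py` typically wires it up roughly as follows (a minimal sketch; the `import_structure` mapping and the names inside it are illustrative, not taken from the evalscope source):

```python
# Hypothetical __init__.py using _LazyModule (illustrative names).
import sys

from evalscope.utils.import_utils import _LazyModule

# Maps each submodule to the public names it provides; nothing is
# imported until one of these attributes is first accessed.
import_structure = {
    'metrics': ['exact_match', 'weighted_mean'],
    'models': ['LocalModel'],
}

sys.modules[__name__] = _LazyModule(__name__, __file__, import_structure, module_spec=__spec__)
```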
evalscope/utils/utils.py CHANGED
@@ -76,21 +76,21 @@ def dict_torch_dtype_to_str(d: Dict[str, Any]) -> dict:
 class ResponseParser:
 
     @staticmethod
-    def parse_first_capital(text: str) -> str:
+    def parse_first_capital(text: str, options: list[str]) -> str:
         for t in text:
-            if t.isupper():
+            if t.isupper() and (t in options):
                 return t
         return ''
 
     @staticmethod
-    def parse_last_capital(text: str) -> str:
+    def parse_last_capital(text: str, options: list[str]) -> str:
         for t in text[::-1]:
-            if t.isupper():
+            if t.isupper() and (t in options):
                 return t
         return ''
 
     @staticmethod
-    def parse_first_option_with_choices(text: str, options: list) -> str:
+    def parse_first_option_with_choices(text: str, options: list[str]) -> str:
         """
         Find first valid option for text.
 
@@ -98,7 +98,7 @@ class ResponseParser:
             text: The text to parse.
             options: The options to find. e.g. ['A', 'B', 'C', 'D']
         """
-        options_concat = '|'.join([str(i) for i in options])
+        options_concat = ResponseParser.process_options(options)
 
         patterns = [
             rf'答案是?\s?([{options_concat}])',
@@ -155,48 +155,61 @@ class ResponseParser:
         for i in options:
             if i in outputs:
                 return i
-        return ''
+        # If no match found, try to find the last capital letter in the text
+        last_capital = ResponseParser.parse_last_capital(text, options)
+        if last_capital:
+            return last_capital
+        return 'No valid option found'
 
     @staticmethod
-    def parse_first_option(text: str) -> str:
+    def parse_first_option(text: str, options: list[str]) -> str:
         """
         Find first valid option for text.
 
         Args:
             text: The text to parse.
         """
+        options_pattern = ResponseParser.process_options(options)
+
         patterns = [
-            r'answer is \(?(\w+)\)?',
-            r'[Aa]nswer:\s*(\w+)',
-            r'[Tt]he correct answer is:\s*(\w+)',
-            r'[Tt]he correct answer is:\n\s*(\w+)',
-            r'[Tt]he correct answer is:\n\n-\s*(\w+)',
-            r'[Tt]he answer might be:\n\n-\s*(\w+)',
-            r'[Tt]he answer is \s*(\w+)',
+            rf'[Aa]nswer:\s*({options_pattern})',
+            rf'ANSWER:\s*({options_pattern})',
+            rf'answer is \(?({options_pattern})\)?',
+            rf'[Tt]he correct answer is:\s*({options_pattern})',
+            rf'[Tt]he correct answer is:\n\s*({options_pattern})',
+            rf'[Tt]he correct answer is:\n\n-\s*({options_pattern})',
+            rf'[Tt]he answer might be:\n\n-\s*({options_pattern})',
+            rf'[Tt]he answer is \s*({options_pattern})',
         ]
 
         regexes = [re.compile(pattern) for pattern in patterns]
         for regex in regexes:
-            match = regex.search(text)
-            if match:
-                return match.group(1)
-        return ''
+            matches = regex.search(text)
+            if matches:
+                return matches.group(1)
+        # If no match found, try to find the last capital letter in the text
+        last_capital = ResponseParser.parse_last_capital(text, options)
+        if last_capital:
+            return last_capital
+        return 'No valid option found'
+
 
     @staticmethod
-    def parse_first_capital_multi(text: str) -> str:
-        match = re.search(r'([A-D]+)', text)
+    def parse_bracketed_answer(text: str, options: list[str]) -> str:
+        options = ResponseParser.process_options(options)
+        # Match the first occurrence of the options in angle brackets
+        match = re.search(rf'<({options})>', text)
         if match:
             return match.group(1)
-        return ''
+        return 'No valid option found'
 
     @staticmethod
-    def parse_last_option(text: str, options: str) -> str:
-        match = re.findall(rf'([{options}])', text)
-        if match:
-            return match[-1]
-        return ''
-
-
+    def process_options(options: list[str]) -> str:
+        # Escape each option to ensure special characters in options are treated literally
+        escaped_options = [re.escape(option) for option in options]
+        # Join options into a regex pattern separated by '|', to match any of the options
+        options_pattern = '|'.join(escaped_options)
+        return options_pattern
 
 def normalize_score(score: Union[float, dict], keep_num: int = 4) -> Union[float, dict]:
     """
@@ -299,3 +312,9 @@ def seed_everything(seed: int):
     torch.cuda.manual_seed_all(seed)
     torch.backends.cudnn.deterministic = True
     torch.backends.cudnn.benchmark = False
+
+if __name__ == '__main__':
+    options = ['A', 'B', 'C', 'D']
+    answers = ['Context .... ANSWER: A', 'answer: A']
+    for answer in answers:
+        print(ResponseParser.parse_first_option(answer, options))
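The net effect of the `ResponseParser` changes is that every parser is now anchored to the benchmark's actual option set: `process_options` escapes each option with `re.escape` and joins them into an alternation, so options containing regex metacharacters are matched literally, and unmatched responses fall back to `parse_last_capital` instead of silently returning `''`. A quick illustrative check of the escaping behavior (a sketch, assuming the module is importable as `evalscope.utils.utils`):

```python
import re

from evalscope.utils.utils import ResponseParser

# Options with regex metacharacters are escaped before being joined,
# so the resulting alternation matches them literally.
pattern = ResponseParser.process_options(['A)', 'B.', 'C'])
assert pattern == r'A\)|B\.|C'
assert re.search(rf'({pattern})', 'The answer is B.').group(1) == 'B.'
```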
evalscope/version.py CHANGED
@@ -1,4 +1,4 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-__version__ = '0.13.2'
-__release_datetime__ = '2025-04-01 20:00:00'
+__version__ = '0.15.0'
+__release_datetime__ = '2025-04-29 00:00:00'
{evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.13.2
+Version: 0.15.0
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team
@@ -26,8 +26,10 @@ Requires-Dist: latex2sympy2
 Requires-Dist: matplotlib
 Requires-Dist: modelscope[framework]
 Requires-Dist: nltk>=3.9
+Requires-Dist: omegaconf
 Requires-Dist: openai
 Requires-Dist: pandas
+Requires-Dist: pillow
 Requires-Dist: pyarrow
 Requires-Dist: pyyaml
 Requires-Dist: requests
@@ -39,9 +41,15 @@ Requires-Dist: seaborn
 Requires-Dist: sympy
 Requires-Dist: tabulate
 Requires-Dist: torch
+Requires-Dist: torchvision
 Requires-Dist: tqdm
 Requires-Dist: transformers>=4.33
 Requires-Dist: word2number
+Provides-Extra: aigc
+Requires-Dist: diffusers; extra == "aigc"
+Requires-Dist: iopath; extra == "aigc"
+Requires-Dist: open-clip-torch; extra == "aigc"
+Requires-Dist: opencv-python; extra == "aigc"
 Provides-Extra: all
 Requires-Dist: accelerate; extra == "all"
 Requires-Dist: datasets<=3.2.0,>=3.0.0; extra == "all"
@@ -53,8 +61,10 @@ Requires-Dist: latex2sympy2; extra == "all"
 Requires-Dist: matplotlib; extra == "all"
 Requires-Dist: modelscope[framework]; extra == "all"
 Requires-Dist: nltk>=3.9; extra == "all"
+Requires-Dist: omegaconf; extra == "all"
 Requires-Dist: openai; extra == "all"
 Requires-Dist: pandas; extra == "all"
+Requires-Dist: pillow; extra == "all"
 Requires-Dist: pyarrow; extra == "all"
 Requires-Dist: pyyaml; extra == "all"
 Requires-Dist: requests; extra == "all"
@@ -66,17 +76,18 @@ Requires-Dist: seaborn; extra == "all"
 Requires-Dist: sympy; extra == "all"
 Requires-Dist: tabulate; extra == "all"
 Requires-Dist: torch; extra == "all"
+Requires-Dist: torchvision; extra == "all"
 Requires-Dist: tqdm; extra == "all"
 Requires-Dist: transformers>=4.33; extra == "all"
 Requires-Dist: word2number; extra == "all"
 Requires-Dist: ms-opencompass>=0.1.4; extra == "all"
 Requires-Dist: ms-vlmeval>=0.0.9; extra == "all"
-Requires-Dist: langchain<0.3.0; extra == "all"
-Requires-Dist: langchain-community<0.3.0; extra == "all"
-Requires-Dist: langchain-core<0.3.0; extra == "all"
-Requires-Dist: langchain-openai<0.3.0; extra == "all"
+Requires-Dist: langchain<0.4.0,>=0.3.0; extra == "all"
+Requires-Dist: langchain-community<0.4.0,>=0.3.0; extra == "all"
+Requires-Dist: langchain-core<0.4.0,>=0.3.0; extra == "all"
+Requires-Dist: langchain-openai<0.4.0,>=0.3.0; extra == "all"
 Requires-Dist: mteb==1.19.4; extra == "all"
-Requires-Dist: ragas==0.2.9; extra == "all"
+Requires-Dist: ragas==0.2.14; extra == "all"
 Requires-Dist: webdataset>0.2.0; extra == "all"
 Requires-Dist: aiohttp; extra == "all"
 Requires-Dist: fastapi; extra == "all"
@@ -86,6 +97,10 @@ Requires-Dist: transformers; extra == "all"
 Requires-Dist: unicorn; extra == "all"
 Requires-Dist: gradio==5.4.0; extra == "all"
 Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "all"
+Requires-Dist: diffusers; extra == "all"
+Requires-Dist: iopath; extra == "all"
+Requires-Dist: open-clip-torch; extra == "all"
+Requires-Dist: opencv-python; extra == "all"
 Provides-Extra: app
 Requires-Dist: gradio==5.4.0; extra == "app"
 Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "app"
@@ -99,12 +114,12 @@ Requires-Dist: sse-starlette; extra == "perf"
 Requires-Dist: transformers; extra == "perf"
 Requires-Dist: unicorn; extra == "perf"
 Provides-Extra: rag
-Requires-Dist: langchain<0.3.0; extra == "rag"
-Requires-Dist: langchain-community<0.3.0; extra == "rag"
-Requires-Dist: langchain-core<0.3.0; extra == "rag"
-Requires-Dist: langchain-openai<0.3.0; extra == "rag"
+Requires-Dist: langchain<0.4.0,>=0.3.0; extra == "rag"
+Requires-Dist: langchain-community<0.4.0,>=0.3.0; extra == "rag"
+Requires-Dist: langchain-core<0.4.0,>=0.3.0; extra == "rag"
+Requires-Dist: langchain-openai<0.4.0,>=0.3.0; extra == "rag"
 Requires-Dist: mteb==1.19.4; extra == "rag"
-Requires-Dist: ragas==0.2.9; extra == "rag"
+Requires-Dist: ragas==0.2.14; extra == "rag"
 Requires-Dist: webdataset>0.2.0; extra == "rag"
 Provides-Extra: vlmeval
 Requires-Dist: ms-vlmeval>=0.0.9; extra == "vlmeval"
@@ -121,7 +136,7 @@ Requires-Dist: ms-vlmeval>=0.0.9; extra == "vlmeval"
 </p>
 
 <p align="center">
-<img src="https://img.shields.io/badge/python-%E2%89%A53.8-5be.svg">
+<img src="https://img.shields.io/badge/python-%E2%89%A53.9-5be.svg">
 <a href="https://badge.fury.io/py/evalscope"><img src="https://badge.fury.io/py/evalscope.svg" alt="PyPI version" height="18"></a>
 <a href="https://pypi.org/project/evalscope"><img alt="PyPI - Downloads" src="https://static.pepy.tech/badge/evalscope"></a>
 <a href="https://github.com/modelscope/evalscope/pulls"><img src="https://img.shields.io/badge/PR-welcome-55EB99.svg"></a>
@@ -199,6 +214,10 @@ Please scan the QR code below to join our community groups:
 
 ## 🎉 News
 
+- 🔥 **[2025.04.29]** Added Qwen3 Evaluation Best Practices, [welcome to read 📖](https://evalscope.readthedocs.io/en/latest/best_practice/qwen3.html)
+- 🔥 **[2025.04.27]** Support for text-to-image evaluation: Supports 8 metrics including MPS, HPSv2.1Score, etc., and evaluation benchmarks such as EvalMuse, GenAI-Bench. Refer to the [user documentation](https://evalscope.readthedocs.io/en/latest/user_guides/aigc/t2i.html) for more details.
+- 🔥 **[2025.04.10]** Model service stress testing tool now supports the `/v1/completions` endpoint (the default endpoint for vLLM benchmarking)
+- 🔥 **[2025.04.08]** Support for evaluating embedding model services compatible with the OpenAI API has been added. For more details, check the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html#configure-evaluation-parameters).
 - 🔥 **[2025.03.27]** Added support for [AlpacaEval](https://www.modelscope.cn/datasets/AI-ModelScope/alpaca_eval/dataPeview) and [ArenaHard](https://modelscope.cn/datasets/AI-ModelScope/arena-hard-auto-v0.1/summary) evaluation benchmarks. For usage notes, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html)
 - 🔥 **[2025.03.20]** The model inference service stress testing now supports generating prompts of specified length using random values. Refer to the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#using-the-random-dataset) for more details.
 - 🔥 **[2025.03.13]** Added support for the [LiveCodeBench](https://www.modelscope.cn/datasets/AI-ModelScope/code_generation_lite/summary) code evaluation benchmark, which can be used by specifying `live_code_bench`. Supports evaluating QwQ-32B on LiveCodeBench, refer to the [best practices](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html).
@@ -212,15 +231,14 @@ Please scan the QR code below to join our community groups:
 - 🔥 **[2025.02.13]** Added support for evaluating DeepSeek distilled models, including AIME24, MATH-500, and GPQA-Diamond datasets,refer to [best practice](https://evalscope.readthedocs.io/en/latest/best_practice/deepseek_r1_distill.html); Added support for specifying the `eval_batch_size` parameter to accelerate model evaluation.
 - 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
 - 🔥 **[2025.01.07]** Native backend: Support for model API evaluation is now available. Refer to the [📖 Model API Evaluation Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#api) for more details. Additionally, support for the `ifeval` evaluation benchmark has been added.
+<details><summary>More</summary>
+
 - 🔥🔥 **[2024.12.31]** Support for adding benchmark evaluations, refer to the [📖 Benchmark Evaluation Addition Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html); support for custom mixed dataset evaluations, allowing for more comprehensive model evaluations with less data, refer to the [📖 Mixed Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html).
 - 🔥 **[2024.12.13]** Model evaluation optimization: no need to pass the `--template-type` parameter anymore; supports starting evaluation with `evalscope eval --args`. Refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html) for more details.
 - 🔥 **[2024.11.26]** The model inference service performance evaluator has been completely refactored: it now supports local inference service startup and Speed Benchmark; asynchronous call error handling has been optimized. For more details, refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html).
 - 🔥 **[2024.10.31]** The best practice for evaluating Multimodal-RAG has been updated, please check the [📖 Blog](https://evalscope.readthedocs.io/zh-cn/latest/blog/RAG/multimodal_RAG.html#multimodal-rag) for more details.
 - 🔥 **[2024.10.23]** Supports multimodal RAG evaluation, including the assessment of image-text retrieval using [CLIP_Benchmark](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/clip_benchmark.html), and extends [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html) to support end-to-end multimodal metrics evaluation.
 - 🔥 **[2024.10.8]** Support for RAG evaluation, including independent evaluation of embedding models and rerankers using [MTEB/CMTEB](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html), as well as end-to-end evaluation using [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html).
-
-<details><summary>More</summary>
-
 - 🔥 **[2024.09.18]** Our documentation has been updated to include a blog module, featuring some technical research and discussions related to evaluations. We invite you to [📖 read it](https://evalscope.readthedocs.io/en/refact_readme/blog/index.html).
 - 🔥 **[2024.09.12]** Support for LongWriter evaluation, which supports 10,000+ word generation. You can use the benchmark [LongBench-Write](evalscope/third_party/longbench_write/README.md) to measure the long output quality as well as the output length.
 - 🔥 **[2024.08.30]** Support for custom dataset evaluations, including text datasets and multimodal image-text datasets.
@@ -503,6 +521,10 @@ Reference: Performance Testing [📖 User Guide](https://evalscope.readthedocs.i
 
 ![wandb sample](https://modelscope.oss-cn-beijing.aliyuncs.com/resource/wandb_sample.png)
 
+**Supports swanlab for recording results**
+
+![swanlab sample](https://sail-moe.oss-cn-hangzhou.aliyuncs.com/yunlin/images/evalscope/swanlab.png)
+
 **Supports Speed Benchmark**
 
 It supports speed testing and provides speed benchmarks similar to those found in the [official Qwen](https://qwen.readthedocs.io/en/latest/benchmark/speed_benchmark.html) reports: