evalscope 1.0.1__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic. Click here for more details.

Files changed (155) hide show
  1. evalscope/api/benchmark/adapters/default_data_adapter.py +18 -4
  2. evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
  3. evalscope/api/benchmark/adapters/text2image_adapter.py +5 -4
  4. evalscope/api/benchmark/adapters/vision_language_adapter.py +3 -1
  5. evalscope/api/benchmark/benchmark.py +27 -2
  6. evalscope/api/benchmark/meta.py +3 -0
  7. evalscope/api/evaluator/evaluator.py +5 -0
  8. evalscope/api/evaluator/state.py +5 -0
  9. evalscope/api/messages/chat_message.py +6 -1
  10. evalscope/api/mixin/__init__.py +1 -0
  11. evalscope/api/mixin/llm_judge_mixin.py +2 -0
  12. evalscope/api/mixin/sandbox_mixin.py +204 -0
  13. evalscope/api/model/generate_config.py +0 -3
  14. evalscope/api/model/model.py +1 -1
  15. evalscope/api/tool/tool_info.py +1 -1
  16. evalscope/app/ui/multi_model.py +6 -1
  17. evalscope/app/ui/single_model.py +8 -2
  18. evalscope/app/utils/data_utils.py +3 -2
  19. evalscope/app/utils/visualization.py +2 -2
  20. evalscope/arguments.py +6 -0
  21. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  22. evalscope/benchmarks/amc/__init__.py +0 -0
  23. evalscope/benchmarks/amc/amc_adapter.py +46 -0
  24. evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
  25. evalscope/benchmarks/bfcl/bfcl_adapter.py +106 -2
  26. evalscope/benchmarks/bfcl/generation.py +7 -7
  27. evalscope/benchmarks/blink/__init__.py +0 -0
  28. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  29. evalscope/benchmarks/chartqa/__init__.py +0 -0
  30. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  31. evalscope/benchmarks/chartqa/utils.py +38 -0
  32. evalscope/benchmarks/docvqa/__init__.py +0 -0
  33. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  34. evalscope/benchmarks/drop/drop_adapter.py +1 -1
  35. evalscope/benchmarks/general_arena/utils.py +2 -1
  36. evalscope/benchmarks/healthbench/__init__.py +0 -0
  37. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  38. evalscope/benchmarks/healthbench/utils.py +102 -0
  39. evalscope/benchmarks/hle/hle_adapter.py +3 -2
  40. evalscope/benchmarks/humaneval/humaneval_adapter.py +19 -35
  41. evalscope/benchmarks/humaneval/utils.py +235 -0
  42. evalscope/benchmarks/infovqa/__init__.py +0 -0
  43. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  44. evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
  45. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +60 -37
  46. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  47. evalscope/benchmarks/math_500/math_500_adapter.py +0 -1
  48. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  49. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +48 -0
  50. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  51. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  52. evalscope/benchmarks/mm_star/__init__.py +0 -0
  53. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  54. evalscope/benchmarks/mmmu/mmmu_adapter.py +1 -1
  55. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +4 -9
  56. evalscope/benchmarks/multi_if/__init__.py +0 -0
  57. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  58. evalscope/benchmarks/multi_if/metrics.py +120 -0
  59. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  60. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +1 -4
  61. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  62. evalscope/benchmarks/ocr_bench/ocr_bench_adapter.py +101 -0
  63. evalscope/benchmarks/ocr_bench_v2/IoUscore_metric.py +87 -0
  64. evalscope/benchmarks/ocr_bench_v2/TEDS_metric.py +963 -0
  65. evalscope/benchmarks/ocr_bench_v2/__init__.py +0 -0
  66. evalscope/benchmarks/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  67. evalscope/benchmarks/ocr_bench_v2/page_ocr_metric.py +50 -0
  68. evalscope/benchmarks/ocr_bench_v2/parallel.py +46 -0
  69. evalscope/benchmarks/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  70. evalscope/benchmarks/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  71. evalscope/benchmarks/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  72. evalscope/benchmarks/ocr_bench_v2/spotting_eval/script.py +481 -0
  73. evalscope/benchmarks/ocr_bench_v2/spotting_metric.py +179 -0
  74. evalscope/benchmarks/ocr_bench_v2/utils.py +432 -0
  75. evalscope/benchmarks/ocr_bench_v2/vqa_metric.py +254 -0
  76. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  77. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  78. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  79. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  80. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  81. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  82. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  83. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +6 -1
  84. evalscope/config.py +24 -1
  85. evalscope/constants.py +3 -0
  86. evalscope/evaluator/evaluator.py +25 -7
  87. evalscope/metrics/metric.py +78 -2
  88. evalscope/metrics/metrics.py +16 -0
  89. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
  90. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
  91. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
  92. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
  93. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
  94. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
  95. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
  96. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
  97. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
  98. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
  99. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
  100. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
  101. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
  102. evalscope/models/model_apis.py +10 -8
  103. evalscope/models/utils/openai.py +1 -2
  104. evalscope/perf/arguments.py +2 -0
  105. evalscope/perf/plugin/api/base.py +2 -2
  106. evalscope/perf/plugin/api/default_api.py +7 -7
  107. evalscope/perf/plugin/api/openai_api.py +83 -19
  108. evalscope/perf/plugin/datasets/flickr8k.py +2 -2
  109. evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
  110. evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
  111. evalscope/perf/utils/benchmark_util.py +1 -2
  112. evalscope/report/__init__.py +9 -1
  113. evalscope/report/combinator.py +45 -20
  114. evalscope/report/report.py +8 -4
  115. evalscope/run.py +1 -1
  116. evalscope/utils/function_utils.py +41 -0
  117. evalscope/utils/import_utils.py +63 -13
  118. evalscope/utils/io_utils.py +19 -11
  119. evalscope/utils/json_schema.py +25 -2
  120. evalscope/utils/logger.py +19 -0
  121. evalscope/utils/model_utils.py +1 -1
  122. evalscope/utils/multi_choices.py +16 -1
  123. evalscope/version.py +2 -2
  124. {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/METADATA +10 -40
  125. {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/RECORD +120 -95
  126. {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/top_level.txt +0 -1
  127. tests/__init__.py +0 -1
  128. tests/benchmark/__init__.py +0 -1
  129. tests/benchmark/test_eval.py +0 -385
  130. tests/benchmark/test_image_edit.py +0 -65
  131. tests/benchmark/test_t2i.py +0 -142
  132. tests/benchmark/test_vlm.py +0 -80
  133. tests/cli/__init__.py +0 -1
  134. tests/cli/test_all.py +0 -269
  135. tests/cli/test_collection.py +0 -99
  136. tests/cli/test_custom.py +0 -268
  137. tests/cli/test_reasoning.py +0 -81
  138. tests/common.py +0 -73
  139. tests/perf/__init__.py +0 -1
  140. tests/perf/test_perf.py +0 -178
  141. tests/rag/test_clip_benchmark.py +0 -87
  142. tests/rag/test_mteb.py +0 -213
  143. tests/rag/test_ragas.py +0 -128
  144. tests/swift/__init__.py +0 -1
  145. tests/swift/test_run_swift_eval.py +0 -146
  146. tests/swift/test_run_swift_vlm_eval.py +0 -128
  147. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
  148. tests/test_run_all.py +0 -12
  149. tests/utils.py +0 -13
  150. tests/vlm/__init__.py +0 -1
  151. tests/vlm/test_vlmeval.py +0 -102
  152. {tests/rag → evalscope/benchmarks/ai2d}/__init__.py +0 -0
  153. {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/LICENSE +0 -0
  154. {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/WHEEL +0 -0
  155. {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/entry_points.txt +0 -0
@@ -70,7 +70,7 @@ def seed_everything(seed: int):
70
70
  random.seed(seed)
71
71
  np.random.seed(seed)
72
72
 
73
- if check_import('torch'):
73
+ if check_import('torch', raise_warning=False):
74
74
  import torch
75
75
 
76
76
  torch.manual_seed(seed)
@@ -81,12 +81,27 @@ def answer_options(choices: Choices) -> str:
81
81
  return '\n'.join([f'{answer_character(i)}) {choices[j].value}' for i, j in enumerate(indexes)])
82
82
 
83
83
 
84
+ def format_letter_choices(choices: Union[Choices, List[str]]) -> str:
85
+ """
86
+ Returns the `choices` formatted as a letter list, e.g.:
87
+
88
+ ["choice 1", "choice 2", "choice 3"] ->
89
+ "A,B,C"
90
+ """
91
+ if isinstance(choices, list):
92
+ choices = Choices(choices)
93
+
94
+ indexes = list(range(len(choices)))
95
+
96
+ return ','.join([f'{answer_character(i)}' for i in indexes])
97
+
98
+
84
99
  def prompt(question: str, choices: Union[Choices, List[str]], template: str, fewshot: Optional[str] = None) -> str:
85
100
  if isinstance(choices, list):
86
101
  choices = Choices(choices)
87
102
 
88
103
  choices_text = answer_options(choices)
89
- letters = ','.join(answer_character(i) for i in range(len(choices)))
104
+ letters = format_letter_choices(choices)
90
105
  if not fewshot:
91
106
  return template.format(
92
107
  choices=choices_text,
evalscope/version.py CHANGED
@@ -1,4 +1,4 @@
1
1
  # Copyright (c) Alibaba, Inc. and its affiliates.
2
2
 
3
- __version__ = '1.0.1'
4
- __release_datetime__ = '2025-09-05 14:00:00'
3
+ __version__ = '1.1.0'
4
+ __release_datetime__ = '2025-10-14 14:00:00'
@@ -1,11 +1,11 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: evalscope
3
- Version: 1.0.1
3
+ Version: 1.1.0
4
4
  Summary: EvalScope: Lightweight LLMs Evaluation Framework
5
- Home-page: https://github.com/modelscope/evalscope
6
5
  Author: ModelScope team
7
6
  Author-email: contact@modelscope.cn
8
7
  License: Apache License 2.0
8
+ Project-URL: Homepage, https://github.com/modelscope/evalscope
9
9
  Keywords: python,llm,evaluation
10
10
  Classifier: Development Status :: 4 - Beta
11
11
  Classifier: Operating System :: OS Independent
@@ -14,6 +14,7 @@ Classifier: Programming Language :: Python :: 3.9
14
14
  Classifier: Programming Language :: Python :: 3.10
15
15
  Classifier: Programming Language :: Python :: 3.11
16
16
  Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: License :: OSI Approved :: Apache Software License
17
18
  Requires-Python: >=3.9
18
19
  Description-Content-Type: text/markdown
19
20
  License-File: LICENSE
@@ -24,7 +25,7 @@ Requires-Dist: dotenv
24
25
  Requires-Dist: jieba
25
26
  Requires-Dist: jsonlines
26
27
  Requires-Dist: langdetect
27
- Requires-Dist: latex2sympy2-extended
28
+ Requires-Dist: latex2sympy2-extended[antlr4_9_3]
28
29
  Requires-Dist: matplotlib
29
30
  Requires-Dist: modelscope[framework]>=1.27
30
31
  Requires-Dist: nltk>=3.9
@@ -56,35 +57,6 @@ Requires-Dist: peft>=0.17; extra == "aigc"
56
57
  Requires-Dist: torch; extra == "aigc"
57
58
  Requires-Dist: torchvision; extra == "aigc"
58
59
  Provides-Extra: all
59
- Requires-Dist: colorlog; extra == "all"
60
- Requires-Dist: datasets==3.6.0; extra == "all"
61
- Requires-Dist: docstring-parser; extra == "all"
62
- Requires-Dist: dotenv; extra == "all"
63
- Requires-Dist: jieba; extra == "all"
64
- Requires-Dist: jsonlines; extra == "all"
65
- Requires-Dist: langdetect; extra == "all"
66
- Requires-Dist: latex2sympy2-extended; extra == "all"
67
- Requires-Dist: matplotlib; extra == "all"
68
- Requires-Dist: modelscope[framework]>=1.27; extra == "all"
69
- Requires-Dist: nltk>=3.9; extra == "all"
70
- Requires-Dist: openai; extra == "all"
71
- Requires-Dist: overrides; extra == "all"
72
- Requires-Dist: pandas; extra == "all"
73
- Requires-Dist: pillow; extra == "all"
74
- Requires-Dist: pydantic; extra == "all"
75
- Requires-Dist: pyyaml>=5.1; extra == "all"
76
- Requires-Dist: requests; extra == "all"
77
- Requires-Dist: rich; extra == "all"
78
- Requires-Dist: rouge-chinese; extra == "all"
79
- Requires-Dist: rouge-score>=0.1.0; extra == "all"
80
- Requires-Dist: sacrebleu; extra == "all"
81
- Requires-Dist: scikit-learn; extra == "all"
82
- Requires-Dist: seaborn; extra == "all"
83
- Requires-Dist: sympy; extra == "all"
84
- Requires-Dist: tabulate; extra == "all"
85
- Requires-Dist: tqdm; extra == "all"
86
- Requires-Dist: transformers>=4.33; extra == "all"
87
- Requires-Dist: word2number; extra == "all"
88
60
  Requires-Dist: ms-opencompass>=0.1.6; extra == "all"
89
61
  Requires-Dist: ms-vlmeval>=0.0.17; extra == "all"
90
62
  Requires-Dist: langchain<0.4.0,>=0.3.0; extra == "all"
@@ -97,7 +69,9 @@ Requires-Dist: torch; extra == "all"
97
69
  Requires-Dist: webdataset>0.2.0; extra == "all"
98
70
  Requires-Dist: aiohttp; extra == "all"
99
71
  Requires-Dist: fastapi; extra == "all"
72
+ Requires-Dist: jinja2; extra == "all"
100
73
  Requires-Dist: numpy; extra == "all"
74
+ Requires-Dist: rich; extra == "all"
101
75
  Requires-Dist: sse-starlette; extra == "all"
102
76
  Requires-Dist: transformers; extra == "all"
103
77
  Requires-Dist: uvicorn; extra == "all"
@@ -110,17 +84,10 @@ Requires-Dist: open-clip-torch; extra == "all"
110
84
  Requires-Dist: opencv-python; extra == "all"
111
85
  Requires-Dist: peft>=0.17; extra == "all"
112
86
  Requires-Dist: torchvision; extra == "all"
113
- Requires-Dist: bfcl-eval==2025.6.16; extra == "all"
114
- Requires-Dist: human-eval; extra == "all"
115
- Requires-Dist: pytest; extra == "all"
116
- Requires-Dist: pytest-cov; extra == "all"
117
- Requires-Dist: python-dotenv; extra == "all"
118
87
  Provides-Extra: app
119
88
  Requires-Dist: gradio==5.4.0; extra == "app"
120
89
  Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "app"
121
90
  Provides-Extra: dev
122
- Requires-Dist: bfcl-eval==2025.6.16; extra == "dev"
123
- Requires-Dist: human-eval; extra == "dev"
124
91
  Requires-Dist: pytest; extra == "dev"
125
92
  Requires-Dist: pytest-cov; extra == "dev"
126
93
  Requires-Dist: python-dotenv; extra == "dev"
@@ -136,6 +103,7 @@ Requires-Dist: ms-opencompass>=0.1.6; extra == "opencompass"
136
103
  Provides-Extra: perf
137
104
  Requires-Dist: aiohttp; extra == "perf"
138
105
  Requires-Dist: fastapi; extra == "perf"
106
+ Requires-Dist: jinja2; extra == "perf"
139
107
  Requires-Dist: numpy; extra == "perf"
140
108
  Requires-Dist: rich; extra == "perf"
141
109
  Requires-Dist: sse-starlette; extra == "perf"
@@ -271,7 +239,9 @@ Please scan the QR code below to join our community groups:
271
239
  > **Version 1.0 Refactoring**
272
240
  >
273
241
  > Version 1.0 introduces a major overhaul of the evaluation framework, establishing a new, more modular and extensible API layer under `evalscope/api`. Key improvements include standardized data models for benchmarks, samples, and results; a registry-based design for components such as benchmarks and metrics; and a rewritten core evaluator that orchestrates the new architecture. Existing benchmark adapters have been migrated to this API, resulting in cleaner, more consistent, and easier-to-maintain implementations.
274
-
242
+ - 🔥 **[2025.10.14]** Added support for OCRBench, OCRBench-v2, DocVQA, InfoVQA, ChartQA, and BLINK multimodal image-text evaluation benchmarks.
243
+ - 🔥 **[2025.09.22]** Code evaluation benchmarks (HumanEval, LiveCodeBench) now support running in a sandbox environment. To use this feature, please install [ms-enclave](https://github.com/modelscope/ms-enclave) first.
244
+ - 🔥 **[2025.09.19]** Added support for multimodal image-text evaluation benchmarks including RealWorldQA, AI2D, MMStar, MMBench, and OmniBench, as well as pure text evaluation benchmarks such as Multi-IF, HealthBench, and AMC.
275
245
  - 🔥 **[2025.09.05]** Added support for vision-language multimodal model evaluation tasks, such as MathVista and MMMU. For more supported datasets, please [refer to the documentation](https://evalscope.readthedocs.io/zh-cn/latest/get_started/supported_dataset/vlm.html).
276
246
  - 🔥 **[2025.09.04]** Added support for image editing task evaluation, including the [GEdit-Bench](https://modelscope.cn/datasets/stepfun-ai/GEdit-Bench) benchmark. For usage instructions, refer to the [documentation](https://evalscope.readthedocs.io/en/latest/user_guides/aigc/image_edit.html).
277
247
  - 🔥 **[2025.08.22]** Version 1.0 Refactoring. Break changes, please [refer to](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#switching-to-version-v1-0).
@@ -1,47 +1,48 @@
1
1
  evalscope/__init__.py,sha256=oivLvqwNw2JlB-h-Z8_525IpfKcYEkS51F59tEfpy5w,445
2
- evalscope/arguments.py,sha256=OthHwNhG9VrP7_CYocmjZ4iVyG5LJbzO0FhseoLBalk,5663
3
- evalscope/config.py,sha256=NVFXbU0kVof2V8Bnjs-O2FEPdlXx3rZuoHcttm1THbM,10564
4
- evalscope/constants.py,sha256=cbkKHmEcJHF9T0m4yREx08__tulj6MV59im2RW-pR3c,3433
5
- evalscope/run.py,sha256=1JjqSky3Fm3v1tOE9pgR7alODoSNWa4ZdoLTWFLgjRE,6510
2
+ evalscope/arguments.py,sha256=jKAF47PsqXRioU21gRHw9hxJnfR31z_X7c__glRY5ns,6257
3
+ evalscope/config.py,sha256=S2N11-AxQkT7lVffpjXdtpT4QpnSP6th-c8I-501mwM,11507
4
+ evalscope/constants.py,sha256=W3E4Jp-x6qxvPOYtU9bNlzlERFvSAA_3F007apIwUlU,3601
5
+ evalscope/run.py,sha256=A9_7pR3FiA-It46A3Mqk7ce6fQy548p0ux2QUugj2hI,6531
6
6
  evalscope/summarizer.py,sha256=HUDJ1zKi22uNst3AUfX67Z0sHzeZy-4S8sYyvxJnBzc,5901
7
- evalscope/version.py,sha256=5Jk88EAyvBpPzsQaFYKGjukIwF3tVCXIrarT94bYsCQ,118
7
+ evalscope/version.py,sha256=hqGJMtjd3F6yPJucqhuYtXuGYSumthFmroHsUTY761Y,118
8
8
  evalscope/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
9
9
  evalscope/api/registry.py,sha256=Qk0KMGDbt-iI0-OfoJZbOtxt76qreAVWh36HOoQAKM4,5448
10
10
  evalscope/api/benchmark/__init__.py,sha256=9xcTxpcQ6HhZ0QDwEIZhAT5IjybzaJ60VGLcmaFE5dU,188
11
- evalscope/api/benchmark/benchmark.py,sha256=q5hmEH845DfmvEB1NvlHM1b-oCCMpatIamT-2ubudbM,10088
12
- evalscope/api/benchmark/meta.py,sha256=G6Q5E1JwO-CpEwsjhMrXHExlVRUF1Ah5Nz21vkP8IV0,4218
11
+ evalscope/api/benchmark/benchmark.py,sha256=gqAM81SeGb_Q0rA6Q-LFpnNkOUiwOj43aRWECtCxAOE,10832
12
+ evalscope/api/benchmark/meta.py,sha256=N4u8NQjkjIw-xaf6KFnb6C8JDKB0DLbsXyXblDqIpvE,4304
13
13
  evalscope/api/benchmark/adapters/__init__.py,sha256=uLt_GiU4s-_6Rjgmr4OUTtE7dvEX-ZIQ403fd6oNuxA,264
14
- evalscope/api/benchmark/adapters/default_data_adapter.py,sha256=Y8wzOxq3qpbE2lgLZyXHxoLUxjlmbS-N6ByObrBwOvc,27977
14
+ evalscope/api/benchmark/adapters/default_data_adapter.py,sha256=RWDweSmXKGv5hPPjeV4VF76gbKqYJEsab_lQYGUM2PA,28785
15
15
  evalscope/api/benchmark/adapters/image_edit_adapter.py,sha256=06V-_A8RKuMNYMt7-vaXn2qBa9LIZgfFO_6PUuhAkh0,3052
16
- evalscope/api/benchmark/adapters/multi_choice_adapter.py,sha256=wp_6Kws3GoBk_mSzQP8Nr40osFf3iPJpntkANYAuIcc,2979
17
- evalscope/api/benchmark/adapters/text2image_adapter.py,sha256=4mccYHKB-9iyOZ0uwkTi2TgC76KIJpcu_4hnfbU5NMc,6434
18
- evalscope/api/benchmark/adapters/vision_language_adapter.py,sha256=N9LPh5tTGkvRYzp4giI0La0u4xzrHcJGhdTY9jiNCxY,219
16
+ evalscope/api/benchmark/adapters/multi_choice_adapter.py,sha256=auqLNvF50Or9bo3LOmQLXHfFaTTCTqvQzZog3glInng,3062
17
+ evalscope/api/benchmark/adapters/text2image_adapter.py,sha256=jO64hwjQexIv-MTyHH0Ffp_6p--9TKufOmX_U39mAnE,6385
18
+ evalscope/api/benchmark/adapters/vision_language_adapter.py,sha256=5d7ITkeosikb7u0ag0WkMaZ0SAYGkR_wKM9NP495GKk,280
19
19
  evalscope/api/dataset/__init__.py,sha256=RHFMzwfONEqmmn3vRtxyN3r29mipDUUUSEDhuwm0YpQ,147
20
20
  evalscope/api/dataset/dataset.py,sha256=9bwSx89zgOOBRQkRPVv-B5Yi30A6J1MLtekQSqwsy9g,11328
21
21
  evalscope/api/dataset/loader.py,sha256=t7KLH5ltLUumhiPIyYJzk6zn2iKLx-D2gIIoMhKdnhc,9714
22
22
  evalscope/api/dataset/utils.py,sha256=3E0ikqr6QWV_lX0d3Z4F4xFuVTcwbeDPgCvJY7v83Bc,4935
23
23
  evalscope/api/evaluator/__init__.py,sha256=-Ure6X4GlE7VYSNWSZ_DpjbUBGa5irVTymLENEHTYqY,138
24
24
  evalscope/api/evaluator/cache.py,sha256=a_M2ouUjtkMr5m3wRbmsE8ETP_aacxbm0d38yY5RljM,13244
25
- evalscope/api/evaluator/evaluator.py,sha256=SGW4RIKc79IlUP5FisrEycJlqORcaYxyIP5eabaSfeU,1600
26
- evalscope/api/evaluator/state.py,sha256=OyZUtQw9Wd6X8MA2mtmTGn74ReBq1x-JfWwV_TT99UY,8892
25
+ evalscope/api/evaluator/evaluator.py,sha256=xMF4w2qiQ7NNgOhSKs9Vd4VZ33SCDwTTJ82lDhaj1FQ,1734
26
+ evalscope/api/evaluator/state.py,sha256=Elz2cmbvOOqvOaEOAMatxgk4BdjqDZB3XKTaL4iqJLI,9039
27
27
  evalscope/api/filter/__init__.py,sha256=5eWKjT-dAiz8nE0S6WnU6plqjXZHYn7CJOgFiHSoovM,66
28
28
  evalscope/api/filter/filter.py,sha256=fsPddaHE5wwFIXgUWITFqlYXqdh6vx3QqcEf3rSXKVI,2068
29
29
  evalscope/api/messages/__init__.py,sha256=UKZ9VVCt7NPrcZXv_1e8MZ8mOWu0eLRvMIXykpJPZ9I,378
30
- evalscope/api/messages/chat_message.py,sha256=LZ3Yv_Ts5ASCfrq2y_zecpY3IN5lzHsRbaxz8WRQgD8,9698
30
+ evalscope/api/messages/chat_message.py,sha256=D88TklSAWOaG21EBDVDoRPwzVCqzEGbVW4sA8Af4axc,10053
31
31
  evalscope/api/messages/content.py,sha256=gUBUeK60BUhkwoulyzKL6q0iMt3VLlah9onLG1XVrWY,2772
32
32
  evalscope/api/messages/utils.py,sha256=uqlEbYEoUKpXLW8tQtP-cY5Miq7W0Xl6a98j55u6m6E,1266
33
33
  evalscope/api/metric/__init__.py,sha256=Cj2F8eiVny5uNtfPXKwQDq2owlHVKNzfr-COLYMEox4,106
34
34
  evalscope/api/metric/metric.py,sha256=XkjBqpZbFYynhTIH8WawfPmItbDQ6jWufE_ox9zDPCU,1568
35
35
  evalscope/api/metric/scorer.py,sha256=dczSQwkRmPk1uvNCMGT5G6nYbwWTcpwsZtyYXWkrJII,3749
36
- evalscope/api/mixin/__init__.py,sha256=DpHdR7t9d-HUzBXxwsW3t5MxM4kgoThQ4WF8s8EuSBY,43
37
- evalscope/api/mixin/llm_judge_mixin.py,sha256=KPNH41IL7md5XEYqC2ZbmnYm4tIrV-MgxpfKOWbYsMc,5624
36
+ evalscope/api/mixin/__init__.py,sha256=xBuoTuao5o_EFThgeeeWI87x64Q12aJttsaZc8gak_c,83
37
+ evalscope/api/mixin/llm_judge_mixin.py,sha256=ECVDfxCeAEkymFssD7xKhIDcct2qgQTqGnbijXk9leE,5675
38
+ evalscope/api/mixin/sandbox_mixin.py,sha256=uKqBtTtttKwrUArY-CTMDdFHjRBOR7Kl1sxaGHe-S2Q,7653
38
39
  evalscope/api/model/__init__.py,sha256=YxKdz1IKUt6eYoC7nx81yD2BtyiWQDvaoTcc8O9lvoE,286
39
- evalscope/api/model/generate_config.py,sha256=SyUNlZhcoBpLlMK8esu1XQs61SSPN_D5QN8TRUcnroI,7760
40
- evalscope/api/model/model.py,sha256=HecfGqaaB201n7I1pZ5Q4_aVC-xLA93uxdGgoreRYFw,12771
40
+ evalscope/api/model/generate_config.py,sha256=wQeDknXb49yBKSRL9rlIyerPobGXqU-A4hL1vySNGPo,7656
41
+ evalscope/api/model/model.py,sha256=c7YVbYYk47MHWwPjoB66xWjgmHdUGTOSOdtIsLcJfyc,12782
41
42
  evalscope/api/model/model_output.py,sha256=NeN6bLtAvg_3fTirewWfdP-_x4SJXa9pGuRpyXJY3B8,9333
42
43
  evalscope/api/tool/__init__.py,sha256=bEaW5ryY-erLcl2zMoDJNgiaBqlSPAL0jQ5daUHvvrw,272
43
44
  evalscope/api/tool/tool_call.py,sha256=WqMnw69L_yhQWycENZ7azPRhxRidhmrMcYAy7UTIqvg,2836
44
- evalscope/api/tool/tool_info.py,sha256=aqquWQRWWx7fPItIwiubiz2VRe2TLl_Jmn1ArIlngbw,5716
45
+ evalscope/api/tool/tool_info.py,sha256=FQOBqxKZ6Qb4f40iRH1mLg64cEhu1_-9Rn-f5iUrD2w,5733
45
46
  evalscope/api/tool/utils.py,sha256=IWFzM6WspzBmNPicXn6b7KS6Y-1I-ErsK9fua4cb53Y,2324
46
47
  evalscope/app/__init__.py,sha256=HWLXld_JXcBDsdL4L_4E8JsKyuBwwPUSwlejKnZ3HKc,579
47
48
  evalscope/app/app.py,sha256=EaBWorA87ZmyIHovIE3styHWEVFsu_F70pTmP4-5zTQ,836
@@ -49,15 +50,15 @@ evalscope/app/arguments.py,sha256=1wHTLeFx1G94cKXYOeOVe_wTiOY2D929UctIRGOtRaQ,69
49
50
  evalscope/app/constants.py,sha256=oG6tZ618zJcCnwZ5THnYL0gWTPDb5XKrnmdrWxY3Z4Q,385
50
51
  evalscope/app/ui/__init__.py,sha256=IBxyQ2H-kSHoHJmXWDR8QMermvsMbiu673PQbXP_FnE,616
51
52
  evalscope/app/ui/app_ui.py,sha256=wLrQ4VM7BnzvaYmPAk8NH9t5BaWooHFJcgmAOOd2I1w,2032
52
- evalscope/app/ui/multi_model.py,sha256=fO8z-ZFucWtgaKmuQ50AkUp4BoYOFqOkxeTBUUAK0bM,15122
53
+ evalscope/app/ui/multi_model.py,sha256=mvMgpgiJGRrNRtReFcD_PiLatq-81zp65Vb3JYUP3PE,15356
53
54
  evalscope/app/ui/sidebar.py,sha256=JA0QbG2iPStK-lFy6x_AjOHlQdesmgXoS0OYJUJ_Wyg,1339
54
- evalscope/app/ui/single_model.py,sha256=1rgYrJOO75fJG2pa74tzEocO_91jXOAKFQAUViBcYFk,9459
55
+ evalscope/app/ui/single_model.py,sha256=zFt1uDYrcgNJ7e_YLigrs6IXT3jyGMVn-7rv4CHAZvE,9741
55
56
  evalscope/app/ui/visualization.py,sha256=jXFX_-7woQkcAiQkPAIRwVv1kdRdXonn9IvmB8yzPDU,1102
56
- evalscope/app/utils/data_utils.py,sha256=m7Z0Us_josUFseI8VJpIp8QaYeLnu91E2HCZ8WSB07E,7396
57
+ evalscope/app/utils/data_utils.py,sha256=GYOfkh0NoueeX3od-L852Q9C9SSkEFlW_40wjPa5b9w,7470
57
58
  evalscope/app/utils/env_utils.py,sha256=2pmz4uNun-XNP6TqM6Oe576XopweEClhBaIdWO--kd0,382
58
59
  evalscope/app/utils/localization.py,sha256=rWEviBmcnhIpAA-cG8djbbUA6p1Y358c0dxge5Pqi1U,6131
59
60
  evalscope/app/utils/text_utils.py,sha256=-K-hRPMZ29Yqjhzd-391gPaD4B4wUuIg71PfbLnGJ38,3754
60
- evalscope/app/utils/visualization.py,sha256=dwEXbGfY7vFysnL0HmrHS2BEWaJkg-dZ9ayDlRhdvv4,3559
61
+ evalscope/app/utils/visualization.py,sha256=lycwcr-kFT2FKVw6iWMh3iD_n4dqpWVzhXMLDnkN8QY,3563
61
62
  evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
62
63
  evalscope/backend/base.py,sha256=qYu8Shokrtrx-N6T_BAJk_6OCpovUBYuN0p3wngt-dw,1030
63
64
  evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
@@ -107,18 +108,22 @@ evalscope/backend/rag_eval/utils/tools.py,sha256=FU7tNu-8y8V_o_kArFVTTLM_GzL12KB
107
108
  evalscope/backend/vlm_eval_kit/__init__.py,sha256=R-GuBm8dAwvDF73XHaGpPSjlt7Y4tycyy-FJgzLdjeY,84
108
109
  evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=jlwM13Ty-Ax6AeMsNlo9xIBupNFgnceYuXtCmh0hNTQ,6160
109
110
  evalscope/benchmarks/__init__.py,sha256=WHR4ej9Tqa2N9CyIaUWXS8EnHZtcujaNeg9hf8GT31Y,1182
111
+ evalscope/benchmarks/ai2d/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
112
+ evalscope/benchmarks/ai2d/ai2d_adapter.py,sha256=qnQT2E0ZG8g4noOafu-QvBOKm-zEJ5X08QHw3ekNa4w,2473
110
113
  evalscope/benchmarks/aime/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
111
114
  evalscope/benchmarks/aime/aime24_adapter.py,sha256=HTlriHoHzlm1Rf3KAiGRLs8sx6Gyf6s7RGtOjk_hGS4,1767
112
115
  evalscope/benchmarks/aime/aime25_adapter.py,sha256=ZOE_6Zhg1MatWJSu2Zq372nKUODYtNFZimS1MJRFz5A,1591
113
116
  evalscope/benchmarks/alpaca_eval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
114
117
  evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py,sha256=sjaWcK8WH1XY0kzm5eHsq_7J62EJocAf4gRV_UB8ZBE,4971
118
+ evalscope/benchmarks/amc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
119
+ evalscope/benchmarks/amc/amc_adapter.py,sha256=NzLPOmj3fJhPw6gVrB8KtxEbqwUqQ923vXHnLWEfdiU,1418
115
120
  evalscope/benchmarks/arc/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
116
121
  evalscope/benchmarks/arc/arc_adapter.py,sha256=GASZmoJ-PpzBG70cBdABZA5uVqoyosjV-jf9WShK7L8,1622
117
122
  evalscope/benchmarks/arena_hard/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
118
123
  evalscope/benchmarks/arena_hard/arena_hard_adapter.py,sha256=Ddn_hVO1PvNQ_kNknXfdJCz1AVnXZEdGWq4gX1_Qqow,7275
119
124
  evalscope/benchmarks/arena_hard/utils.py,sha256=23xCd7_ksrM4xMJBp7N2ZwpUpq1zpoQFjLm1oBcdgQY,5559
120
125
  evalscope/benchmarks/bbh/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
121
- evalscope/benchmarks/bbh/bbh_adapter.py,sha256=GcvgwBhIw7OG-ljWQ_urVOoWlrFjrBy1LAZ-Atm02Dw,5570
126
+ evalscope/benchmarks/bbh/bbh_adapter.py,sha256=lRI-DfdFkyg4ylW4d-6CUfiNqlF7K_IoTjzJz3jYTUs,6346
122
127
  evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt,sha256=xnzlaIRyeGlogG49v8nt4vpJO40J06ev4yc8cv0VSRY,1781
123
128
  evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt,sha256=sfo-2iOeVzB0OGgd7NSQFELTGDTsr2DQ3u-g0ivI-sM,3653
124
129
  evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt,sha256=UJBsc3Mwz8TZngdWH_NFlhhNbLhNHK6FvW9FHcS8H5g,1167
@@ -147,10 +152,15 @@ evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt
147
152
  evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt,sha256=s_x6u5MLeKpuAHZj3GNQqY1I8vWqQIfJasOp9XcM7Ck,2945
148
153
  evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt,sha256=qfTZafCzNiz9ULBaDlfy_LISL617NyH5Nc0-nO0K0LE,2164
149
154
  evalscope/benchmarks/bfcl/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
150
- evalscope/benchmarks/bfcl/bfcl_adapter.py,sha256=N_AVgdfI4DXph0n3U1bChP9AQLx3_-ogAInFE-4EGig,10972
151
- evalscope/benchmarks/bfcl/generation.py,sha256=gOYzwTNEi2G0zykKdsx42Pc0Ql8iPD6RoX3MRbUhMJo,8698
155
+ evalscope/benchmarks/bfcl/bfcl_adapter.py,sha256=ZmwGylqXCAcpJ8glQmj7HkDa8OqE9KODiHvWelTGLIo,17033
156
+ evalscope/benchmarks/bfcl/generation.py,sha256=c6lNjo-VTSUrVg-pqyPSucrbCKBOdBSyN0aR5AAtE4A,8701
157
+ evalscope/benchmarks/blink/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
158
+ evalscope/benchmarks/blink/blink_adapter.py,sha256=ocQKsDGwnUAg2si2p7tqIGeH3PKPqTSByjbt7ceraRo,2642
152
159
  evalscope/benchmarks/ceval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
153
160
  evalscope/benchmarks/ceval/ceval_adapter.py,sha256=4FLPgY-UtqINafnNxfOsE9AwS6GFXFCUGOBI-4EZUGk,8503
161
+ evalscope/benchmarks/chartqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
162
+ evalscope/benchmarks/chartqa/chartqa_adapter.py,sha256=DA1kthMUvn4_GUfdRfuR-au3RkhE3WKPnR_f8nlhd4c,2813
163
+ evalscope/benchmarks/chartqa/utils.py,sha256=Ta9ZUMpIqzrAszju7_WOMBAlilH1Tx6TCheVpjrZJJI,1672
154
164
  evalscope/benchmarks/chinese_simple_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
155
165
  evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py,sha256=OWzRlSGswV24V-heLqqo7GQzpJp01TZ0DhFHq0iUP9A,8238
156
166
  evalscope/benchmarks/cmmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
@@ -162,15 +172,17 @@ evalscope/benchmarks/data_collection/data_collection_adapter.py,sha256=eetF21dN5
162
172
  evalscope/benchmarks/docmath/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
163
173
  evalscope/benchmarks/docmath/docmath_adapter.py,sha256=-mel6hA-x_e7fV0uOHdX5BpoQEVyQ5VqwIwEqSNDpnc,4623
164
174
  evalscope/benchmarks/docmath/utils.py,sha256=d6Yjoa5q91kjr1SdVPVBndzDaUzMlO_GfEqMtUXXr0s,7707
175
+ evalscope/benchmarks/docvqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
176
+ evalscope/benchmarks/docvqa/docvqa_adapter.py,sha256=xGaayycILYoLd8r6wLLppDbU6Z1FdafbYFyjLHaftAA,2882
165
177
  evalscope/benchmarks/drop/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
166
- evalscope/benchmarks/drop/drop_adapter.py,sha256=PyvZ1WOdHQ0u0_JpuP97_yQsCUbzGcYsJf3bWKbakzg,9968
178
+ evalscope/benchmarks/drop/drop_adapter.py,sha256=Jbbr5O_Y5LI_vT_RskRQVKxGkiIraX_uXP7fYaZ5eZs,9995
167
179
  evalscope/benchmarks/drop/utils.py,sha256=Z9PHrNnRfGqFHCLONg5SWKARp1eTJlHFc_bU46t_YrM,1344
168
180
  evalscope/benchmarks/frames/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
169
181
  evalscope/benchmarks/frames/frames_adapter.py,sha256=w1kRya7w5omt95HHE6AzbzYVhyTT5r521676d_xJ6Vg,5514
170
182
  evalscope/benchmarks/frames/utils.py,sha256=gULWM6Rwv5bTSSWcDYp-iSIoWj8r5VtbQakhRzHJq8A,1172
171
183
  evalscope/benchmarks/general_arena/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
172
184
  evalscope/benchmarks/general_arena/general_arena_adapter.py,sha256=DzJaokqZwR2L8HDiahss8EbQ3vcsMXkzkMghxU-uAOo,21639
173
- evalscope/benchmarks/general_arena/utils.py,sha256=zS4l1RKwvl0Z9Mk7kth9WVQGHTgE_aNDZa_XNy9tGyM,6874
185
+ evalscope/benchmarks/general_arena/utils.py,sha256=p6pZfvdNCMOU_vWHm_DYU57Sa2WTDdFOkVBubblCRN4,6912
174
186
  evalscope/benchmarks/general_mcq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
175
187
  evalscope/benchmarks/general_mcq/general_mcq_adapter.py,sha256=7VKg_EzXkRvoWpR7h8qB4sVVb1eZHCGcPk-X_NMS5tE,2062
176
188
  evalscope/benchmarks/general_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
@@ -180,12 +192,16 @@ evalscope/benchmarks/gpqa/gpqa_adapter.py,sha256=zWK2hhyKw5n8K30YvMjSm6XMwyrireO
180
192
  evalscope/benchmarks/gpqa/prompt.py,sha256=b1Gw2D5dEdhvLYymPfcvGKJdHrIzpiZkOwURKSxiQJg,5576
181
193
  evalscope/benchmarks/gsm8k/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
182
194
  evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=W4vTXsC7iHN1AgvpaCf1Rj7y2O8QczIluucnpSC5aYo,2636
195
+ evalscope/benchmarks/healthbench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
196
+ evalscope/benchmarks/healthbench/healthbench_adapter.py,sha256=1sL7i9yhORH4xiFWB9puPKWNZZFJGZFAlKdlzHp-fiw,13228
197
+ evalscope/benchmarks/healthbench/utils.py,sha256=M8SnOEhlqXWm03CFE6CAtbMiu6MqdGgVczAv-LPjA7Y,3683
183
198
  evalscope/benchmarks/hellaswag/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
184
199
  evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=tAe63NfV5ljUm1f4RTSFxWOVKBUhk3Cc0EGzF5uYLK4,2041
185
200
  evalscope/benchmarks/hle/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
186
- evalscope/benchmarks/hle/hle_adapter.py,sha256=4YVmETL9mEiLxF4vWRjePLyFaxelax6nOaqoAH5ZxmU,6389
201
+ evalscope/benchmarks/hle/hle_adapter.py,sha256=kJP7bzIDbr82GKi0FTy2zf_j1UWNBfuXYzokYJ-S9WE,6410
187
202
  evalscope/benchmarks/humaneval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
188
- evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=5x2pnkbI9ZPPOyrRBsJ5ZcOCGJr8OR7qXLgVlY6eJxs,5825
203
+ evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=uLs3UHSALS3YHt0qzBismrIqdEUgbEalQbjC0CU7ym4,4085
204
+ evalscope/benchmarks/humaneval/utils.py,sha256=rPnc_JuSjNg9aV7UMUwsLrDlm-ufj64GNIBCWBeuRcM,6517
189
205
  evalscope/benchmarks/ifeval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
190
206
  evalscope/benchmarks/ifeval/ifeval_adapter.py,sha256=55FQwJ0_eDijppkVVlM5XCXzgRFmjH1SvGMItGsvn6o,2769
191
207
  evalscope/benchmarks/ifeval/instructions.py,sha256=HXnn1JgU3dpYltqIovFAn02DxkYOGw337kLMlOfJxJE,56048
@@ -197,22 +213,31 @@ evalscope/benchmarks/image_edit/gedit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCe
197
213
  evalscope/benchmarks/image_edit/gedit/gedit_adapter.py,sha256=a6hhRbnGCvMEMsbnSbczjXd4vHfMVEnFfP459FCF_Mc,5250
198
214
  evalscope/benchmarks/image_edit/gedit/utils.py,sha256=UN0z9Dafs8d8lEXqxin321d8smiS3H9p3gyLkZFPFNg,14735
199
215
  evalscope/benchmarks/image_edit/gedit/vie_prompts.py,sha256=qVXWQyVUwZxEasDjVmYBk30_JI4gnvHacMOmMsA4wcI,22056
216
+ evalscope/benchmarks/infovqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
217
+ evalscope/benchmarks/infovqa/infovqa_adapter.py,sha256=3m_EvfRZ5ItHkz-3mVlsF_NnPS7NH1-EXwUW-s4VMxA,2617
200
218
  evalscope/benchmarks/iquiz/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
201
219
  evalscope/benchmarks/iquiz/iquiz_adapter.py,sha256=mNHA_Fuj_gAdOEoR7oChnGmErf1czqwnk8Zk-jRhBys,1304
202
220
  evalscope/benchmarks/live_code_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
203
- evalscope/benchmarks/live_code_bench/evaluate_utils.py,sha256=wgx8RDbkXi2Mlt-aK_6o4VcoPb7I3eL8z8h8JW4SnEo,6510
221
+ evalscope/benchmarks/live_code_bench/evaluate_utils.py,sha256=maN8qHmDHJpexPeB0qwZoXJ5zrqPbJDYVRptqvXI9d4,6827
204
222
  evalscope/benchmarks/live_code_bench/extract_utils.py,sha256=ZcQ8y741uawPo6I_1_XglR3eqJFDNrqc8fILKZupVRs,2375
205
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py,sha256=tl7nGLDUgmNtyR4faE0aoW11OgLhsx7ZdKmONGDlQnQ,5203
223
+ evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py,sha256=qnprJTv7zWA9aq6Lw4aDoall---kiivR0oDo3uSO2mI,6399
206
224
  evalscope/benchmarks/live_code_bench/load_utils.py,sha256=fEzWz_fUGwi5Ncum5PNVF9jFcuDwGgs7Vt_10YKBE2Q,2087
207
225
  evalscope/benchmarks/live_code_bench/pass_k_utils.py,sha256=Ktrp_lXdfFzoHtQNQNdGfIl26ySjaPCHm4Zv-dFvRqM,2024
208
226
  evalscope/benchmarks/live_code_bench/prompts.py,sha256=P4KILIAIDT1MKDck0xHYV_6v9820wDZRhxVMazmlL-g,12600
227
+ evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py,sha256=7DDx46EwtoR776vWjofJl1zaYCLdmeq8cF3fhDGdZgA,7424
209
228
  evalscope/benchmarks/live_code_bench/testing_util.py,sha256=TuoOTciC-hz3FTeDzsQB_THH3Be9UOP2XMrax-4sXkM,17282
210
229
  evalscope/benchmarks/maritime_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
211
230
  evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py,sha256=Rx7iZ5JaEo73YwIzhm78gMDQ6gqcErbnWWXHxXM6BcU,2379
212
231
  evalscope/benchmarks/math_500/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
213
- evalscope/benchmarks/math_500/math_500_adapter.py,sha256=uuxjmqftY_r-hJBCjfBgYUELrBaB86MG8dIu2wTikgI,1848
232
+ evalscope/benchmarks/math_500/math_500_adapter.py,sha256=hn7SQhoIHKuH-2A_nGUhQPRw2gl2G-kZldc9ueY0G3A,1802
214
233
  evalscope/benchmarks/math_vista/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
215
234
  evalscope/benchmarks/math_vista/math_vista_adapter.py,sha256=Mu9BpH0rDNM0yMrGws4SEOnXy2NTSIKwyLs5t4nAP-s,5842
235
+ evalscope/benchmarks/minerva_math/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
236
+ evalscope/benchmarks/minerva_math/minerva_math_adapter.py,sha256=jyT9_D4w8PTtLBN3Kn10_CnssH_mPuRNnn9rek_zUEs,1655
237
+ evalscope/benchmarks/mm_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
238
+ evalscope/benchmarks/mm_bench/mm_bench_adapter.py,sha256=ht2DVt_zEBJp4jvGy3myHHgdUUP9eff2O5BpIc9Fv74,4376
239
+ evalscope/benchmarks/mm_star/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
240
+ evalscope/benchmarks/mm_star/mm_star_adapter.py,sha256=oamLv6U2-JAK5mdVLkUgYxkOahxQkQYMRKAyu_xPAUE,2818
216
241
  evalscope/benchmarks/mmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
217
242
  evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=2NT3QbfPzajUTFZ0tBCl6PRrtFtAr5jPZNQRW2Idlno,5947
218
243
  evalscope/benchmarks/mmlu_pro/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -220,18 +245,44 @@ evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=GtIyUubUg6Q6Ydh1Adj0-32
220
245
  evalscope/benchmarks/mmlu_redux/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
221
246
  evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py,sha256=m_37OIFrJB4ZIvtbDJ_m9P9mA2QtrNjGfbbVo15awJg,7402
222
247
  evalscope/benchmarks/mmmu/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
223
- evalscope/benchmarks/mmmu/mmmu_adapter.py,sha256=C7UM6HvomcA_Srf7771S0CaUvifBX63i161XaacraGQ,6038
248
+ evalscope/benchmarks/mmmu/mmmu_adapter.py,sha256=WrykWq8n61CVrQ4XQhI3iEySgErHdZyng3udOL-Pddk,6054
224
249
  evalscope/benchmarks/mmmu_pro/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
225
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py,sha256=a7rZV4WMPxeBdfwanmUjsB8yG1rwNXCsWCoqzOq-dd4,4901
250
+ evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py,sha256=banPS1nDt9bQ95urKbSZnR-hBTw23eL9MSrHt_0ZLp0,4725
251
+ evalscope/benchmarks/multi_if/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
252
+ evalscope/benchmarks/multi_if/ifeval.py,sha256=7y2rnJ4q1_DVA7I9mUnF7TBpu7Kez0X_Xhl-AJInzWk,87949
253
+ evalscope/benchmarks/multi_if/metrics.py,sha256=LWnhQw25cRNMReJ_xJ7Fx7WYHcT9i2FG1FUjYOuQDrI,4291
254
+ evalscope/benchmarks/multi_if/multi_if_adapter.py,sha256=I3_YPPUuRbrs9Gt3Qjhx9RM5Vu2gDFnheDcGu-oe840,5924
226
255
  evalscope/benchmarks/musr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
227
256
  evalscope/benchmarks/musr/musr_adapter.py,sha256=kx6bckj7Nijl4Wysuj-mKYdy0hIRDJho8yVTup403Hc,1473
228
257
  evalscope/benchmarks/needle_haystack/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
229
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py,sha256=mO8zhdCpoWPtlBk9GSzgcP25vEoQLYGwUM1QfcQ4iSE,17151
258
+ evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py,sha256=GYaswWPwYI3aV5HSpcuBTgW9-HDtf2xzNZg0WrsI0Yo,17033
230
259
  evalscope/benchmarks/needle_haystack/utils.py,sha256=k8WDigqt5LgzHw6DtaYsLtb3BJL0FTZS9JOyJCpoPq8,2935
260
+ evalscope/benchmarks/ocr_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
261
+ evalscope/benchmarks/ocr_bench/ocr_bench_adapter.py,sha256=gkQb7g0-Lf5Sjemqs5kqogCLGFJI6YQv8-vGI1EbyLE,4392
262
+ evalscope/benchmarks/ocr_bench_v2/IoUscore_metric.py,sha256=cBpRDJvI9f6vKRD4wTPv-8ThGddR3EhVobgjQQUAYlE,2606
263
+ evalscope/benchmarks/ocr_bench_v2/TEDS_metric.py,sha256=31bL0V32Fq7prF1WoVjXmrmMdhg0qNcoiOaKykKOrZM,36528
264
+ evalscope/benchmarks/ocr_bench_v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
265
+ evalscope/benchmarks/ocr_bench_v2/ocr_bench_v2_adapter.py,sha256=QGY4R75UxDafIwSaOEPPuCaX3Z8BGoZVvcc6OWbeO9w,7976
266
+ evalscope/benchmarks/ocr_bench_v2/page_ocr_metric.py,sha256=d1nU7LNwubBd_1rIe7i67hOVcJx5IUXkqVeqt1CQzak,1624
267
+ evalscope/benchmarks/ocr_bench_v2/parallel.py,sha256=Q54wFSSRBp-kG2MhW4eOoXE1W9g-SDVhN8JuphDERsE,2029
268
+ evalscope/benchmarks/ocr_bench_v2/spotting_metric.py,sha256=nftLaTOKEmqvSWr-c20f9hyyvNnd-Hg3E46KwqmkjLc,6149
269
+ evalscope/benchmarks/ocr_bench_v2/utils.py,sha256=z9DSh2m1yvM3vsvxvqdHuPgRFxgdmEnzuNIuO7PAV3s,15914
270
+ evalscope/benchmarks/ocr_bench_v2/vqa_metric.py,sha256=XkAiXk1uE7lsWQQXvjnHXZMsga8B9FVyq5qG8ghePK4,8980
271
+ evalscope/benchmarks/ocr_bench_v2/spotting_eval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
272
+ evalscope/benchmarks/ocr_bench_v2/spotting_eval/readme.txt,sha256=QO0K9z1ethy_lgs9vaxGN1u5DnPFsssp8z62Cni24iw,1424
273
+ evalscope/benchmarks/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py,sha256=qCuqDtsCfxAiQHYLNdHU7BQ9kLIZ9iyfmRxtIrGOBck,20349
274
+ evalscope/benchmarks/ocr_bench_v2/spotting_eval/script.py,sha256=7HzM1PEw8wNOhmQOsZe582Y2rr4u66Q3JKVvvMasntE,19565
275
+ evalscope/benchmarks/olympiad_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
276
+ evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py,sha256=zePVmGjmyuwCWVb4h1PIQKAIFqBehwRwO2WOD0KX_ik,6565
277
+ evalscope/benchmarks/olympiad_bench/utils.py,sha256=w7vEZcT3vCVq8_DSMgAjZPpVFVHStJPJYsPkrs-yOFM,21412
278
+ evalscope/benchmarks/omni_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
279
+ evalscope/benchmarks/omni_bench/omni_bench_adapter.py,sha256=IJkRSokQC6MF_pN46Yofr_NaZaNt1XZFX1PUBmX4-qA,3651
231
280
  evalscope/benchmarks/process_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
232
281
  evalscope/benchmarks/process_bench/process_bench_adapter.py,sha256=XN3F6NH7mF4ibwGX5nI01sqEHz05UQFnBAyfAe14QYE,6174
233
282
  evalscope/benchmarks/race/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
234
283
  evalscope/benchmarks/race/race_adapter.py,sha256=KibT9gHpIOZhTcWihG0dUDAX4gAHa2g1WdGPOcEP9OY,1705
284
+ evalscope/benchmarks/real_world_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
285
+ evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py,sha256=J2u0J9d31uvkoz9nBI9tCMqG27hmYwdLQPPef9jx_pg,2788
235
286
  evalscope/benchmarks/simple_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
236
287
  evalscope/benchmarks/simple_qa/simple_qa_adapter.py,sha256=_duveAliSaPUqVSLQ2TtSv5sfwvFFy7t-MgIIokQ24s,9017
237
288
  evalscope/benchmarks/super_gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -240,7 +291,7 @@ evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py,sha256=SPqpBebiHj_oyEqU94p
240
291
  evalscope/benchmarks/super_gpqa/utils.py,sha256=OK_oT-DnWNssITEwu_Zc3Ty5v21n0IaJQYftK2cpwmQ,3401
241
292
  evalscope/benchmarks/tau_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
242
293
  evalscope/benchmarks/tau_bench/generation.py,sha256=d7J5xrxEI-0BYxdSuxdDavcR7f1ipBdpQsKZzwyzGds,5190
243
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py,sha256=1Dj5r9zMuLJ59wHusEcHVTszBE8BVhAK8lNZzBBzKT8,6375
294
+ evalscope/benchmarks/tau_bench/tau_bench_adapter.py,sha256=47wA0ia6gezA3nqvUpd4Pb8f5alCrBKEt7GOxJFupow,6464
244
295
  evalscope/benchmarks/text2image/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
245
296
  evalscope/benchmarks/text2image/evalmuse_adapter.py,sha256=g-Wc1qTg-xWLTjiZPo8zmQud75ac-8mBpYRxOHfiO0g,3024
246
297
  evalscope/benchmarks/text2image/genai_bench_adapter.py,sha256=1GDB3gS9zwrfb9C83LQdQyN7bvvqeYuu5ulJ9Igmi2k,1876
@@ -268,15 +319,15 @@ evalscope/collections/__init__.py,sha256=x05hFLrjGsdtuHtc6PyQXHNuucVdYaBN9ZrM8gB
268
319
  evalscope/collections/sampler.py,sha256=086pzXQO4CO_QYCd10z149Sjh6sBpRBeIHf5OTLOVu8,4896
269
320
  evalscope/collections/schema.py,sha256=yzAlnH0O7iiWB4UnkFXI_Dvxcsq9hDgl0aGK2OpyBY8,4158
270
321
  evalscope/evaluator/__init__.py,sha256=KzYmVTfU-1pdX7va7l3B1-5QKWG07hj1B7rYkMmxitY,91
271
- evalscope/evaluator/evaluator.py,sha256=mkq85ieBRSc5X2FFxijomb2jD3YDKR6UelKFVP6WT8Y,13592
322
+ evalscope/evaluator/evaluator.py,sha256=o99m1CF7xuc3Qn2M25AhWulothZxICwZgZiWxSbynTc,14435
272
323
  evalscope/filters/__init__.py,sha256=AsXwKYDjGhFsJvtj036PRjMOPsHGt-CRicnHTtM_qA4,51
273
324
  evalscope/filters/extraction.py,sha256=KLFr_3XYsrv0PTvmXy0ugj2sqv2ZOWJFV7G_MmGjTHk,4146
274
325
  evalscope/filters/selection.py,sha256=yiJu2JjXDH_lgfEtB9umkGcA3zpo3zvnyoq2mKrXbnw,1609
275
326
  evalscope/metrics/__init__.py,sha256=1giVHESSjn98uBiAvYm5uLsmRQwmf9NHPSt7OT_QJss,1615
276
327
  evalscope/metrics/llm_judge.py,sha256=XukhH9PQtIZAcbjJlOmOD9ye3ngRv_IGKKJE9jhheOE,8653
277
328
  evalscope/metrics/math_parser.py,sha256=BMfautQtNNiF9f2DIEfO6SXSn_GYhzaddAjGWG10MJA,17257
278
- evalscope/metrics/metric.py,sha256=6la8Nq2E_brArDcNwkbRX3ECef0AAE3IrBCfUVE7UKc,10176
279
- evalscope/metrics/metrics.py,sha256=VxAggzEfaLKxWcXyuve8QbEBwV2W71udVyt0gynzGec,14134
329
+ evalscope/metrics/metric.py,sha256=KNp_DNi9Ntq4my5G7La7AlP2Vj1p6hIgOheAh-4go5Q,12861
330
+ evalscope/metrics/metrics.py,sha256=Y7TQ6MYaGE32EntTz-18CmQqYMpo1rQSvUiSwzBgpaQ,14599
280
331
  evalscope/metrics/rouge_metric.py,sha256=bqvSotuDdC0MEKmt8v6y6tBTBx0S3Ma-tfF-cMCckA4,4645
281
332
  evalscope/metrics/bundled_rouge_score/__init__.py,sha256=PwbTdk8168FwDJe_l8XIqDuBgZQooDsP31vj7di05Fs,650
282
333
  evalscope/metrics/bundled_rouge_score/rouge_scorer.py,sha256=T91PgJfi1As7BR7I-Hq6rLlvHAtMB9JpBw9gMTH8VlE,12114
@@ -354,9 +405,9 @@ evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py,sh
354
405
  evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py,sha256=OOr1JD9kTlUGXZNG5b3kvkUaNz7QTmhaGoHhIKL69qo,7613
355
406
  evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py,sha256=Ns7oM4KpKxWZTo8Lefe4EDFw-jzp5633zAArcWjoVZA,9772
356
407
  evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py,sha256=KIF5tsiE7a5dbDfa-IKwzuzMUpuEAQPrm1nWFFtAeoI,20032
357
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py,sha256=uhaehowhTqRhQtq_dVCgF-9Iu4yU19AMxx2sJimYwlA,52711
408
+ evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py,sha256=85ZvV2gKSnsbP5941PeJ-JJ4t8_lOYQe1EOxrHlIbNI,52728
358
409
  evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py,sha256=o5ykt3Q_WQlNmyxjQaS2-KPLGq1xqLZixNYam_Bs6NA,18701
359
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py,sha256=aBKdQQS7cHMPgYqIknCdHCZ7j2_QLACPn_jU_njiMIs,46840
410
+ evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py,sha256=NPDpIRxjiroafZk5Z2uA9bC8Bi-yXY7um5HXxThF7N0,46857
360
411
  evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
361
412
  evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py,sha256=s7EkhtrIJ0LPUuLBArws8N23R1MoIoNaYUjwsbUqRkY,7994
362
413
  evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py,sha256=FnUyxxazEVaP69pAq9cig3j-mcX37BX-unPj0SVKUJI,3805
@@ -376,7 +427,7 @@ evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/bl
376
427
  evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py,sha256=TOAI-KaUrtKjR1GNU_WwNXNpb9gGT-KX2FYe3muv_e0,4275
377
428
  evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py,sha256=-DprR09KYuwNEzEbhPvFRI3MR4_VdPMUGLPN6sL9Ym8,14625
378
429
  evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py,sha256=S68U0DxWYGDmreRbH5yLDHBNN9PsczY9H0Uik0hO-ds,13872
379
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py,sha256=i1XlJe_PTSmiPkZKIhUXC_lc0-z2ewNYo4W1DvZQxjY,36678
430
+ evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py,sha256=zv_WyHi67hvgHQ4DkZ8a4UoPcgrADKayqVtiIq-p3V4,36695
380
431
  evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py,sha256=p67DDiFS-676z0z8jPj6NwXwNjEsqTXaXCh3g2UiDno,840
381
432
  evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py,sha256=LqMHlUTy2LEzoVwjALtrAw0UYmzIuHnFjQiVmn5nv-I,605
382
433
  evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py,sha256=d4HInkL_Phk0Bgg2cWaOvhsPa6lkqDeovFW86PL0I18,6371
@@ -384,47 +435,47 @@ evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugmen
384
435
  evalscope/models/__init__.py,sha256=RmW2S31BHBhMN49_VVF_5PJAk-TsuZQkuF2ALShbhAw,556
385
436
  evalscope/models/image_edit_model.py,sha256=oVjGgebnFu3ZXBJLNn62rJ65fcJR7DlG4qEVxisPJ2Y,4104
386
437
  evalscope/models/mockllm.py,sha256=t1fFAHkEb1n_atOCfnGteCX3DWp774lnWcHzi5lBjwM,2511
387
- evalscope/models/model_apis.py,sha256=qzoksjHJHE8CLoNT0UlnFVkmeS7ufguiAtaxZSC5Djc,1957
438
+ evalscope/models/model_apis.py,sha256=ZkZ_nfbeAFJnCndRvRIRLcbmJFTMhGRBi-WfMu0uZKE,1922
388
439
  evalscope/models/modelscope.py,sha256=jSFkho_Ir2py54y_Bwj9jpCoY2mMKkZ8ORzne-ldAIE,15806
389
440
  evalscope/models/openai_compatible.py,sha256=2uK78nDhWwgph7hcIiMc3NHRbIwvswRDM9o9ENahj4k,4659
390
441
  evalscope/models/text2image_model.py,sha256=Sdiyw6vewjVTiXK8RFEh1pohOhDge80EoIWYpnLjr5Y,3929
391
- evalscope/models/utils/openai.py,sha256=xnnpPKWAsqqEscOQr0WJjr7gHUa9POs55Bs1Zv6MXNQ,28182
442
+ evalscope/models/utils/openai.py,sha256=0DzuvTQYFEqcTp6sVtB2VZY7xeyWcOS0I6votqWegUg,28130
392
443
  evalscope/perf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
393
- evalscope/perf/arguments.py,sha256=lG2IOOzxg29pdnF6IobzPcqEcYqopulFpVU2QzRaEJA,11429
444
+ evalscope/perf/arguments.py,sha256=FmwVE4gC09B8nLd0sdczeEA9b5ztv4kwhOvLuby4wI8,11695
394
445
  evalscope/perf/benchmark.py,sha256=nSJr8lQvHDYiG33tNhkYaVOYONjhJ2wUb1x5RlUiXRY,7968
395
446
  evalscope/perf/http_client.py,sha256=4Ov1Cwi7gMgO05ZmazwyfYjUGAQNGWn7nbfl1ljRNh4,4610
396
447
  evalscope/perf/main.py,sha256=WZbBgFhIj9KqxzC7_NZxDlou019_EXatsHRt5vqDhFg,3439
397
448
  evalscope/perf/plugin/__init__.py,sha256=Ztj4h1_JYJqbbWkeuDTj5aTRyGQf5Woc4xEIyjcokVU,94
398
449
  evalscope/perf/plugin/registry.py,sha256=GhLe-h1rGzya2bgIUaV5VymQIaHqI7h5SG_i4PoGAm8,1967
399
450
  evalscope/perf/plugin/api/__init__.py,sha256=7RsGdYTSfnW6iVpveEzNu8v4x8Yc8H-Kk39DqOHMrd4,152
400
- evalscope/perf/plugin/api/base.py,sha256=9cX4xwTzy5ycnWqmQqRGMLasTEX6jVlobtADkh1KwXE,2782
451
+ evalscope/perf/plugin/api/base.py,sha256=RRZVk9MFuBwb9PFPTklFhQ_RTihg_E7W_LR26-ldPIA,2782
401
452
  evalscope/perf/plugin/api/custom_api.py,sha256=VYJO2lUt9EKdWz6zeYCfvdI0MqfcsIgcKvxqvY5C-3k,10376
402
453
  evalscope/perf/plugin/api/dashscope_api.py,sha256=Miv2pzMa6sxZyYYJhCzcbOI_QHuZx7tazKpb6Not7ck,3627
403
- evalscope/perf/plugin/api/default_api.py,sha256=kjuHQ-zRHe5WU4ofSzWBpWbIxBQBOh_ucu1z2g62gWg,4315
404
- evalscope/perf/plugin/api/openai_api.py,sha256=oewwOPhv0BLdC7n3BUngpVrDYst5wMrBEPhN8oGMKNU,7703
454
+ evalscope/perf/plugin/api/default_api.py,sha256=qvMIjbe_rM13cDHcFCwjtCsjc11qE80Yg7LypaSNTXc,4251
455
+ evalscope/perf/plugin/api/openai_api.py,sha256=a6w4C_voza61trHskHaWNPFr2x2zhRVwIXdiNnMH81E,10570
405
456
  evalscope/perf/plugin/datasets/__init__.py,sha256=qzeQ9BrJhiJJm1wHaFeOQkvXXdSd15Ucspbn5zjs-6Q,495
406
457
  evalscope/perf/plugin/datasets/base.py,sha256=-3Ihnp2hYvZyPnP8Gh2Pu8ovlLNFHyZnNgRu3WHG4d0,2714
407
458
  evalscope/perf/plugin/datasets/custom.py,sha256=yoRHTvTGAglaZ-mmRkPjYNMG7uZYuT1_KrBxnl2i0qg,1385
408
- evalscope/perf/plugin/datasets/flickr8k.py,sha256=M-w1UjOMkA6Uh9v-SURDrm1YCL-m1Cn1u1cIcEJFDpY,1044
409
- evalscope/perf/plugin/datasets/kontext_bench.py,sha256=-KsoXS7nAd6hzN4oCe85zcLkZQT-1IGWQFThuuvE7vo,1092
459
+ evalscope/perf/plugin/datasets/flickr8k.py,sha256=nhHiGNhXX-2c17NQ5q5Q7FgV2hB8XVeeAP8dKkboyHE,1033
460
+ evalscope/perf/plugin/datasets/kontext_bench.py,sha256=cN70hiBX1940IWvNWZG9YGE4vO1yj41Bo7bqmOWusoQ,1081
410
461
  evalscope/perf/plugin/datasets/line_by_line.py,sha256=F4ppdjKKLzFNf_16h6S-6nAU4lOfOFI2-tPgIeZDTMA,996
411
462
  evalscope/perf/plugin/datasets/longalpaca.py,sha256=JjPGYP8NdPmP48wff2fL5IZQfajXL5qhZBvKmZxtfW4,1336
412
463
  evalscope/perf/plugin/datasets/openqa.py,sha256=5PqqiIvNTLlRrPb8PWqMGQyWRb6LuIqipYn67-xd-dY,1519
413
464
  evalscope/perf/plugin/datasets/random_dataset.py,sha256=NNAXvgFPkLDOSpYNex1DyE4X-ELtQRm13_oBooO30j8,3514
414
- evalscope/perf/plugin/datasets/random_vl_dataset.py,sha256=F3yA9Ih3YO895lZKCo3i85LeKTzjvGcvhzc8UNN-gUI,3240
465
+ evalscope/perf/plugin/datasets/random_vl_dataset.py,sha256=e6exWQnupWkTDNwt2MmEK-hccuxEDmWLJRMM70onKi0,3230
415
466
  evalscope/perf/plugin/datasets/speed_benchmark.py,sha256=J6q7AF_Re5eHLVejXEw9c1jlk1T1PPmist0yO9UFTPE,2432
416
467
  evalscope/perf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
417
468
  evalscope/perf/utils/analysis_result.py,sha256=aoT7JD2zAzBeuZUfncKhJ2odX_7KnymwOmNB1Upam2c,935
418
- evalscope/perf/utils/benchmark_util.py,sha256=V91JwpiR66tOz3N5RPp3Es29M9BghdCHj_Czb0FBekI,7274
469
+ evalscope/perf/utils/benchmark_util.py,sha256=A5d--rCElabDOl6Aaxqnu0fNR5c763YZwKIHBSeTK00,7294
419
470
  evalscope/perf/utils/db_util.py,sha256=HAISq6M7xCD2gjUEqqfbK3FjBxA-tvr_n-751tU9ypo,11634
420
471
  evalscope/perf/utils/handler.py,sha256=HyKIxbzC0XCyQanlbb7UEY7yaeqjJTePNea8kMV3Sdc,1192
421
472
  evalscope/perf/utils/local_server.py,sha256=_lSPlNEnOmPA_DtREgPS_vj2w_7D8PPSpypXbb0YfJM,4880
422
473
  evalscope/perf/utils/log_utils.py,sha256=NWSK_ITG4yoVx5GMLbIRGDoXSs90s7X3mftdm37Os2U,1666
423
474
  evalscope/perf/utils/rich_display.py,sha256=AQmXv1EuA1-IGgco-Jy1NLOmTKv4eBFH2K4QS8OoGVo,8206
424
- evalscope/report/__init__.py,sha256=DTigCg9fkU_zGNDqIaZy3CWYbrlvODvCxCTVqSx6ano,875
425
- evalscope/report/combinator.py,sha256=MAiOCj_q5mXm8-3lARvCSG12jUVEdJ8VcoEHJapoWzo,4134
475
+ evalscope/report/__init__.py,sha256=xS6eeTgsPdIlIOhzUn-ND77uV34vMVug4PmXHmYAxwM,1080
476
+ evalscope/report/combinator.py,sha256=F7KOClXVh56-XEw3Sb5uxwA6L8ZlH_P4-MOlm3Yp_Cg,5020
426
477
  evalscope/report/generator.py,sha256=t2R3WGa4SowTRUPOgITtyTR4QDiJ6i3FH__byDKZU8Y,4959
427
- evalscope/report/report.py,sha256=KxboijAVNENxYHjiwyyqW_aQZ0F2CyJ6MbqUJTRHJMs,8273
478
+ evalscope/report/report.py,sha256=lEBD_E_RJiydFTaGFNLIMTFxNrqv8QcLZb_iuUg5HB0,8479
428
479
  evalscope/third_party/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
429
480
  evalscope/third_party/longbench_write/README.md,sha256=1yLKeSVIcihpoc4KXr8NpK86JbcLssCPx76aOKdPbYI,5431
430
481
  evalscope/third_party/longbench_write/__init__.py,sha256=GNbBDc7HAh_V2Hfy5HhND_u7z6OI79czoBlP8lX4PVo,126
@@ -464,43 +515,17 @@ evalscope/utils/__init__.py,sha256=5OH8cOoX3YKMKUu0dMRvwzckXligIbUV-1jjJNXlpGI,2
464
515
  evalscope/utils/argument_utils.py,sha256=D7qOH85wf7LKh_cJ2X51OEaL7CMaddydmHZkfoYpvLk,1952
465
516
  evalscope/utils/chat_service.py,sha256=sSki2pKGQP3UjcIf_lbO06afI-vsaUAqglwX__wUDEw,8766
466
517
  evalscope/utils/deprecation_utils.py,sha256=aDv3HFNcJFZ7rxNgALQP0-ITO8L23HC_RX-C_m2i34Y,1610
467
- evalscope/utils/function_utils.py,sha256=a752Z4Xb1rznnLJU9g5Pxqd3r_XzfLzAkdcjSX0kOVc,650
468
- evalscope/utils/import_utils.py,sha256=b6N2x5kB_TMCkSKBlBZ5kL-x-eo_B_DWRQKtsxYL-WM,3808
469
- evalscope/utils/io_utils.py,sha256=q26SU80VvLi1e--KDbMmIjuw3ex_WEWzkgLkmsK9n1g,11191
470
- evalscope/utils/json_schema.py,sha256=MLCS8cSLXF83UPebBaVWDfXJnf0qXsXnr-bIRG88cI4,7485
471
- evalscope/utils/logger.py,sha256=SPhhXo9gyZtWDYDLumII2CEmwHsaW8Bu1IjK5UqWrKQ,5273
472
- evalscope/utils/model_utils.py,sha256=rzEnlwWgupkH1vmmv-tL9-udpwHuiQlZhbX9fXPEcZg,2434
473
- evalscope/utils/multi_choices.py,sha256=OxBER7amWpoRY0Z-o39rDmCNK6wpr1HQm9mMHpWLgp0,9524
518
+ evalscope/utils/function_utils.py,sha256=E-AIzx_PKrZDGl1cBvlvqNvMa8yM2WUJ2wh73PNBXrQ,1887
519
+ evalscope/utils/import_utils.py,sha256=S0WQ3gt4zpwJHjGcyC-604pWWExg3JV7f3wzoOH-tuo,5794
520
+ evalscope/utils/io_utils.py,sha256=79F0p7dFxA84tIVSL_C4piJgeQQtVUfb2R_Xcd8v_cE,11615
521
+ evalscope/utils/json_schema.py,sha256=GVP1m6g4mBrsFmOWOOVnmvl2joOz8gTlGEytLv5qy7s,8451
522
+ evalscope/utils/logger.py,sha256=roFk4Su4aJwsF0s-uYc5-tABnghwYPX3gpkA5QUGzK8,5675
523
+ evalscope/utils/model_utils.py,sha256=mdtYoHhUdfpxUtnS52XZjNdO3uSK4yeIBHT3aDU7s-A,2455
524
+ evalscope/utils/multi_choices.py,sha256=0UJbgr5eXNgitPC79JLcyUU-OXg9BlM-mVk-fWtUSno,9881
474
525
  evalscope/utils/url_utils.py,sha256=9HcFt9uZNbOJR3ADUFQ_dBFKziHV6H66Df7HYs1M4Po,1757
475
- tests/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
476
- tests/common.py,sha256=BB136KcGaEfdWqMwApa48K0CTSGmOCUZ0FYDqpfYnAA,2423
477
- tests/test_run_all.py,sha256=YcMTlWoFpvWY8jevWyIf2G_tz8hgDD1cAwSvmyZt96M,429
478
- tests/utils.py,sha256=Fgm0CU6ilZjCGOfOMJH-Trxy0UIAGbhvy0Ijy_zDGUk,323
479
- tests/benchmark/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
480
- tests/benchmark/test_eval.py,sha256=Grms3aMWQONexSsSvOSxkoURHLJ2Z0SqBjrcVWDoMRs,12455
481
- tests/benchmark/test_image_edit.py,sha256=z3z7psMRFynpVgUAFoH--ieeGXzb9cHkrq3tT_sCZo8,2165
482
- tests/benchmark/test_t2i.py,sha256=fciaGsOrkOpT4WQlsnmjrqw6qolCzI0DGyWQAJkM-Es,4513
483
- tests/benchmark/test_vlm.py,sha256=k2DC0zWO2TtVSf-MP-n-wGwfk9MWKKd6hZzkC4nlUO0,2541
484
- tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
485
- tests/cli/test_all.py,sha256=1omOXC1lBphBLm0hTf5HNstlF_bwi16dYyr00gvaCTM,7301
486
- tests/cli/test_collection.py,sha256=lGz3YUS_0gM6_HjQLe26OfBAkHOPOEDWMO-UyP58GN8,4455
487
- tests/cli/test_custom.py,sha256=9z_N7Re712xI62TqVSTBdzB_iFFEUb55wcWIcGvJb84,9254
488
- tests/cli/test_reasoning.py,sha256=rU181LLoKbFCpNPFCIZULxEgsJ2PYswel2pP2EsjEmo,2696
489
- tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
490
- tests/perf/test_perf.py,sha256=yqm3abB5ZdNPKaJkvzMvfcz-Cz_o2RxUZ3ZnqgRb-tQ,5937
491
- tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
492
- tests/rag/test_clip_benchmark.py,sha256=qpSLgmHMGcYTnxP7AI__y-ii5_tu_fCSht6p3TBetkA,2650
493
- tests/rag/test_mteb.py,sha256=fdNQIyUEzE7puPCKw5QhCHTEu7hz-ieHeq1xCWGh6IM,7246
494
- tests/rag/test_ragas.py,sha256=5qozXvPFIb67T-igJv87ijlOgkPnqgkkBVXu6Ht4D0A,4554
495
- tests/swift/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
496
- tests/swift/test_run_swift_eval.py,sha256=YbIhYNoI4kAB-ox-OXAKUifLIXTFqP-xGZicrAgK_V0,5784
497
- tests/swift/test_run_swift_vlm_eval.py,sha256=RwrKkc1WHEZxetM11cGL81G4faKCn7SYn4VlwL03atI,4934
498
- tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=UAUtOCQ72xbm8s-sov3cBEpYVDy189wpB-qOL3KoU7M,6053
499
- tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
500
- tests/vlm/test_vlmeval.py,sha256=EDQRkYfSyOICUwo_tm3p-puaE_xdFmqOPkrt5etxsqM,3307
501
- evalscope-1.0.1.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
502
- evalscope-1.0.1.dist-info/METADATA,sha256=2XzuX9tVYzONuLHVq2WsQ_uaWImGVwiY2IPAJhpNEOA,40287
503
- evalscope-1.0.1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
504
- evalscope-1.0.1.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
505
- evalscope-1.0.1.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
506
- evalscope-1.0.1.dist-info/RECORD,,
526
+ evalscope-1.1.0.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
527
+ evalscope-1.1.0.dist-info/METADATA,sha256=pap4NeCTqw7bec2KqYboFj25zabm1m5rwoiqukX8EO4,39544
528
+ evalscope-1.1.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
529
+ evalscope-1.1.0.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
530
+ evalscope-1.1.0.dist-info/top_level.txt,sha256=jNR-HMn3TR8Atolq7_4rW8IWVX6GhvYV5_1Y_KbJKlY,10
531
+ evalscope-1.1.0.dist-info/RECORD,,
tests/__init__.py DELETED
@@ -1 +0,0 @@
1
- # Copyright (c) Alibaba, Inc. and its affiliates.
@@ -1 +0,0 @@
1
- # Copyright (c) Alibaba, Inc. and its affiliates.