evalscope 0.17.1__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (273)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/__init__.py +0 -0
  3. evalscope/api/benchmark/__init__.py +3 -0
  4. evalscope/api/benchmark/adapters/__init__.py +3 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
  8. evalscope/api/benchmark/benchmark.py +321 -0
  9. evalscope/api/benchmark/meta.py +115 -0
  10. evalscope/api/dataset/__init__.py +2 -0
  11. evalscope/api/dataset/dataset.py +349 -0
  12. evalscope/api/dataset/loader.py +261 -0
  13. evalscope/api/dataset/utils.py +143 -0
  14. evalscope/api/evaluator/__init__.py +3 -0
  15. evalscope/api/evaluator/cache.py +355 -0
  16. evalscope/api/evaluator/evaluator.py +56 -0
  17. evalscope/api/evaluator/state.py +264 -0
  18. evalscope/api/filter/__init__.py +1 -0
  19. evalscope/api/filter/filter.py +72 -0
  20. evalscope/api/messages/__init__.py +11 -0
  21. evalscope/api/messages/chat_message.py +198 -0
  22. evalscope/api/messages/content.py +102 -0
  23. evalscope/api/messages/utils.py +35 -0
  24. evalscope/api/metric/__init__.py +2 -0
  25. evalscope/api/metric/metric.py +55 -0
  26. evalscope/api/metric/scorer.py +105 -0
  27. evalscope/api/mixin/__init__.py +2 -0
  28. evalscope/api/mixin/dataset_mixin.py +105 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +157 -0
  32. evalscope/api/model/model.py +383 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/ui/app_ui.py +2 -1
  40. evalscope/app/ui/multi_model.py +50 -25
  41. evalscope/app/ui/single_model.py +23 -11
  42. evalscope/app/utils/data_utils.py +42 -26
  43. evalscope/app/utils/text_utils.py +0 -2
  44. evalscope/app/utils/visualization.py +9 -4
  45. evalscope/arguments.py +6 -7
  46. evalscope/backend/opencompass/api_meta_template.py +2 -1
  47. evalscope/backend/opencompass/backend_manager.py +6 -3
  48. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  49. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  50. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  51. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  52. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  53. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  55. evalscope/backend/rag_eval/utils/embedding.py +2 -1
  56. evalscope/backend/rag_eval/utils/llm.py +13 -12
  57. evalscope/benchmarks/__init__.py +0 -2
  58. evalscope/benchmarks/aigc/i2i/__init__.py +0 -0
  59. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +44 -0
  60. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +53 -55
  61. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +41 -46
  62. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +29 -45
  63. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +34 -44
  64. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +16 -27
  65. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  66. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  67. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  68. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  69. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  70. evalscope/benchmarks/arena_hard/utils.py +37 -1
  71. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  72. evalscope/benchmarks/bfcl/bfcl_adapter.py +181 -160
  73. evalscope/benchmarks/bfcl/generation.py +222 -0
  74. evalscope/benchmarks/ceval/ceval_adapter.py +94 -162
  75. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  76. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  77. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  78. evalscope/benchmarks/data_collection/data_collection_adapter.py +183 -45
  79. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  80. evalscope/benchmarks/docmath/utils.py +4 -5
  81. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  82. evalscope/benchmarks/frames/frames_adapter.py +135 -52
  83. evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
  84. evalscope/benchmarks/general_arena/utils.py +23 -27
  85. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  86. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  87. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  88. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  89. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  90. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  91. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  92. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  93. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  94. evalscope/benchmarks/ifeval/instructions.py +109 -64
  95. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  96. evalscope/benchmarks/ifeval/utils.py +6 -7
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  105. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  106. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  107. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  108. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +192 -152
  109. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  110. evalscope/benchmarks/race/race_adapter.py +33 -119
  111. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  112. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  113. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  114. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  115. evalscope/benchmarks/tau_bench/generation.py +147 -0
  116. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +112 -54
  117. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  118. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  119. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -265
  120. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  121. evalscope/cli/cli.py +2 -0
  122. evalscope/cli/start_server.py +6 -3
  123. evalscope/collections/__init__.py +2 -10
  124. evalscope/collections/sampler.py +10 -10
  125. evalscope/collections/schema.py +13 -11
  126. evalscope/config.py +95 -54
  127. evalscope/constants.py +29 -61
  128. evalscope/evaluator/__init__.py +1 -1
  129. evalscope/evaluator/evaluator.py +277 -423
  130. evalscope/filters/__init__.py +2 -0
  131. evalscope/filters/extraction.py +126 -0
  132. evalscope/filters/selection.py +57 -0
  133. evalscope/metrics/__init__.py +13 -13
  134. evalscope/metrics/llm_judge.py +32 -30
  135. evalscope/metrics/math_parser.py +27 -22
  136. evalscope/metrics/metric.py +307 -0
  137. evalscope/metrics/metrics.py +22 -18
  138. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  139. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  140. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  141. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  142. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  143. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  144. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  145. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  146. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  147. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  148. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  149. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  150. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  151. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  152. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  153. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  154. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  155. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  156. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  157. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  158. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  159. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  160. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  162. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  163. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  184. evalscope/models/__init__.py +6 -29
  185. evalscope/models/mockllm.py +65 -0
  186. evalscope/models/model_apis.py +47 -0
  187. evalscope/models/modelscope.py +455 -0
  188. evalscope/models/openai_compatible.py +123 -0
  189. evalscope/models/text2image_model.py +124 -0
  190. evalscope/models/utils/openai.py +698 -0
  191. evalscope/perf/benchmark.py +2 -1
  192. evalscope/perf/http_client.py +4 -2
  193. evalscope/perf/plugin/api/custom_api.py +5 -4
  194. evalscope/perf/plugin/api/openai_api.py +11 -9
  195. evalscope/perf/plugin/datasets/custom.py +2 -1
  196. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  197. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  198. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  199. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  200. evalscope/perf/plugin/datasets/openqa.py +4 -2
  201. evalscope/perf/utils/benchmark_util.py +7 -5
  202. evalscope/perf/utils/db_util.py +9 -6
  203. evalscope/perf/utils/local_server.py +8 -3
  204. evalscope/perf/utils/rich_display.py +16 -10
  205. evalscope/report/__init__.py +2 -2
  206. evalscope/report/combinator.py +18 -12
  207. evalscope/report/generator.py +101 -6
  208. evalscope/report/{utils.py → report.py} +8 -6
  209. evalscope/run.py +26 -44
  210. evalscope/summarizer.py +1 -1
  211. evalscope/utils/__init__.py +21 -2
  212. evalscope/utils/chat_service.py +2 -1
  213. evalscope/utils/deprecation_utils.py +12 -1
  214. evalscope/utils/function_utils.py +29 -0
  215. evalscope/utils/io_utils.py +100 -5
  216. evalscope/utils/json_schema.py +208 -0
  217. evalscope/utils/logger.py +51 -12
  218. evalscope/utils/model_utils.py +10 -7
  219. evalscope/utils/multi_choices.py +271 -0
  220. evalscope/utils/url_utils.py +65 -0
  221. evalscope/version.py +2 -2
  222. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA +98 -49
  223. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/RECORD +234 -216
  224. tests/aigc/test_t2i.py +22 -4
  225. tests/benchmark/__init__.py +1 -0
  226. tests/benchmark/test_eval.py +386 -0
  227. tests/cli/test_all.py +3 -5
  228. tests/cli/test_collection.py +13 -4
  229. tests/cli/test_custom.py +22 -15
  230. tests/rag/test_clip_benchmark.py +1 -0
  231. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  232. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  233. evalscope/benchmarks/benchmark.py +0 -81
  234. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  235. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  236. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  237. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  238. evalscope/benchmarks/data_adapter.py +0 -528
  239. evalscope/benchmarks/filters.py +0 -59
  240. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  241. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  242. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  243. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  244. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  245. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  246. evalscope/benchmarks/race/race.py +0 -104
  247. evalscope/benchmarks/race/samples.jsonl +0 -5
  248. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  249. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  250. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  251. evalscope/benchmarks/utils.py +0 -60
  252. evalscope/collections/evaluator.py +0 -375
  253. evalscope/metrics/completion_parsers.py +0 -227
  254. evalscope/metrics/named_metrics.py +0 -55
  255. evalscope/models/adapters/__init__.py +0 -14
  256. evalscope/models/adapters/base_adapter.py +0 -84
  257. evalscope/models/adapters/bfcl_adapter.py +0 -246
  258. evalscope/models/adapters/chat_adapter.py +0 -207
  259. evalscope/models/adapters/choice_adapter.py +0 -222
  260. evalscope/models/adapters/custom_adapter.py +0 -71
  261. evalscope/models/adapters/server_adapter.py +0 -236
  262. evalscope/models/adapters/t2i_adapter.py +0 -79
  263. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  264. evalscope/models/custom/__init__.py +0 -4
  265. evalscope/models/custom/custom_model.py +0 -50
  266. evalscope/models/custom/dummy_model.py +0 -99
  267. evalscope/models/local_model.py +0 -128
  268. evalscope/models/register.py +0 -41
  269. tests/cli/test_run.py +0 -489
  270. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/LICENSE +0 -0
  271. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/WHEEL +0 -0
  272. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/entry_points.txt +0 -0
  273. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/top_level.txt +0 -0
evalscope/app/ui/multi_model.py CHANGED
@@ -10,8 +10,14 @@ from typing import TYPE_CHECKING
  from evalscope.report import ReportKey, get_data_frame
  from evalscope.utils.logger import get_logger
  from ..constants import LATEX_DELIMITERS, MODEL_TOKEN, REPORT_TOKEN
- from ..utils.data_utils import (get_acc_report_df, get_compare_report_df, get_model_prediction, get_single_dataset_df,
- load_multi_report, load_single_report)
+ from ..utils.data_utils import (
+ get_acc_report_df,
+ get_compare_report_df,
+ get_model_prediction,
+ get_single_dataset_df,
+ load_multi_report,
+ load_single_report,
+ )
  from ..utils.localization import get_multi_model_locale
  from ..utils.text_utils import convert_markdown_image, process_model_prediction
  from ..utils.visualization import plot_multi_report_radar
@@ -62,7 +68,8 @@ def create_multi_model_tab(sidebar: 'SidebarComponents', lang: str):
  label=locale_dict.get('answer_mode'),
  choices=['All', 'Pass A & B', 'Fail A & B', 'Pass A, Fail B', 'Fail A, Pass B'],
  value='All',
- interactive=True)
+ interactive=True
+ )
  score_threshold = gr.Number(value=0.99, label=locale_dict['score_threshold'], interactive=True)

  data_comparison_df = gr.State(None)
@@ -75,7 +82,8 @@ def create_multi_model_tab(sidebar: 'SidebarComponents', lang: str):
  comparison_counts = gr.Markdown('')
  with gr.Column():
  page_number = gr.Number(
- value=1, label=locale_dict['page'], minimum=1, maximum=1, step=1, interactive=True)
+ value=1, label=locale_dict['page'], minimum=1, maximum=1, step=1, interactive=True
+ )

  # Input and Gold answer sections remain at the top
  with gr.Row(variant='panel'):
@@ -133,7 +141,8 @@ def create_multi_model_tab(sidebar: 'SidebarComponents', lang: str):

  @multi_report_name.change(
  inputs=[sidebar.root_path, multi_report_name],
- outputs=[report_list, radar_plot, score_table, model_a_select, model_b_select])
+ outputs=[report_list, radar_plot, score_table, model_a_select, model_b_select]
+ )
  def update_multi_report_data(root_path, multi_report_names):
  if not multi_report_names:
  return gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip()
@@ -147,13 +156,14 @@ def create_multi_model_tab(sidebar: 'SidebarComponents', lang: str):
  model_choices = multi_report_names

  return report_list, report_radar_plot, styler, gr.update(
- choices=model_choices, value=model_choices[0]), gr.update(
- choices=model_choices, value=model_choices[1] if len(model_choices) > 1 else None)
+ choices=model_choices, value=model_choices[0]
+ ), gr.update(choices=model_choices, value=model_choices[1] if len(model_choices) > 1 else None)

  @gr.on(
  triggers=[model_a_select.change, model_b_select.change],
  inputs=[sidebar.root_path, model_a_select, model_b_select],
- outputs=[model_a_report, model_b_report, model_a_dir, model_b_dir, model_a_name, model_b_name, dataset_radio])
+ outputs=[model_a_report, model_b_report, model_a_dir, model_b_dir, model_a_name, model_b_name, dataset_radio]
+ )
  def update_selected_models(root_path, model_a, model_b):
  if not model_a or not model_b:
  return gr.skip()
@@ -172,13 +182,16 @@ def create_multi_model_tab(sidebar: 'SidebarComponents', lang: str):
  model_a_name = model_a.split(REPORT_TOKEN)[1].split(MODEL_TOKEN)[0]
  model_b_name = model_b.split(REPORT_TOKEN)[1].split(MODEL_TOKEN)[0]

- return (model_a_reports, model_b_reports, model_a_dir, model_b_dir, model_a_name, model_b_name,
- gr.update(choices=common_datasets, value=common_datasets[0] if common_datasets else None))
+ return (
+ model_a_reports, model_b_reports, model_a_dir, model_b_dir, model_a_name, model_b_name,
+ gr.update(choices=common_datasets, value=common_datasets[0] if common_datasets else None)
+ )

  @gr.on(
  triggers=[dataset_radio.change],
  inputs=[dataset_radio, model_a_report, model_b_report],
- outputs=[subset_select, data_comparison_df])
+ outputs=[subset_select, data_comparison_df]
+ )
  def update_dataset_comparison(dataset_name, model_a_report, model_b_report):
  if not dataset_name or model_a_report is None or model_b_report is None:
  return gr.skip()
@@ -198,7 +211,8 @@ def create_multi_model_tab(sidebar: 'SidebarComponents', lang: str):
  @gr.on(
  triggers=[subset_select.change],
  inputs=[model_a_dir, model_b_dir, model_a_name, model_b_name, dataset_radio, subset_select],
- outputs=[data_comparison_df, page_number])
+ outputs=[data_comparison_df, page_number]
+ )
  def update_comparison_data(model_a_dir, model_b_dir, model_a_name, model_b_name, dataset_name, subset_name):
  if not subset_name or not dataset_name:
  return gr.skip()
@@ -230,7 +244,8 @@ def create_multi_model_tab(sidebar: 'SidebarComponents', lang: str):
  @gr.on(
  triggers=[data_comparison_df.change, answer_mode_radio.change, score_threshold.change],
  inputs=[data_comparison_df, answer_mode_radio, score_threshold],
- outputs=[filtered_comparison_df, page_number, comparison_counts])
+ outputs=[filtered_comparison_df, page_number, comparison_counts]
+ )
  def filter_comparison_data(comparison_df, answer_mode, score_threshold):
  if comparison_df is None:
  return None, gr.update(value=1, maximum=1), ''
@@ -256,13 +271,19 @@ def create_multi_model_tab(sidebar: 'SidebarComponents', lang: str):
  # Count statistics
  pass_a_count = len(comparison_df[comparison_df['A_NScore'] >= score_threshold])
  pass_b_count = len(comparison_df[comparison_df['B_NScore'] >= score_threshold])
- pass_both_count = len(comparison_df[(comparison_df['A_NScore'] >= score_threshold)
- & (comparison_df['B_NScore'] >= score_threshold)])
- fail_both_count = len(comparison_df[(comparison_df['A_NScore'] < score_threshold)
- & (comparison_df['B_NScore'] < score_threshold)])
-
- counts_text = (f'### All: {all_count} | Pass A: {pass_a_count} | Pass B: {pass_b_count} | '
- f'Pass Both: {pass_both_count} | Fail Both: {fail_both_count}')
+ pass_both_count = len(
+ comparison_df[(comparison_df['A_NScore'] >= score_threshold)
+ & (comparison_df['B_NScore'] >= score_threshold)]
+ )
+ fail_both_count = len(
+ comparison_df[(comparison_df['A_NScore'] < score_threshold)
+ & (comparison_df['B_NScore'] < score_threshold)]
+ )
+
+ counts_text = (
+ f'### All: {all_count} | Pass A: {pass_a_count} | Pass B: {pass_b_count} | '
+ f'Pass Both: {pass_both_count} | Fail Both: {fail_both_count}'
+ )

  max_page = max(1, len(filtered_df))

@@ -277,9 +298,11 @@ def create_multi_model_tab(sidebar: 'SidebarComponents', lang: str):
  outputs=[
  input_text, gold_text, model_a_generated, model_a_pred, model_a_score, model_a_nscore, model_b_generated,
  model_b_pred, model_b_score, model_b_nscore
- ])
- def update_comparison_display(filtered_df, page_number, score_threshold, model_a_select, model_b_select,
- model_a_name_val, model_b_name_val):
+ ]
+ )
+ def update_comparison_display(
+ filtered_df, page_number, score_threshold, model_a_select, model_b_select, model_a_name_val, model_b_name_val
+ ):
  if filtered_df is None or len(filtered_df) == 0:
  return '', '', '', '', '', '', '', '', '', ''

@@ -317,7 +340,9 @@ def create_multi_model_tab(sidebar: 'SidebarComponents', lang: str):
  else:
  b_nscore_html = f"<div style='background-color:rgb(151, 31, 44); padding:10px;'>{b_nscore_val}</div>"

- return (input_md, gold_md, a_generated_md, a_pred_md, a_score_md, a_nscore_html, b_generated_md, b_pred_md,
- b_score_md, b_nscore_html)
+ return (
+ input_md, gold_md, a_generated_md, a_pred_md, a_score_md, a_nscore_html, b_generated_md, b_pred_md,
+ b_score_md, b_nscore_html
+ )

  return MultiModelComponents(multi_report_name=multi_report_name)
evalscope/app/ui/single_model.py CHANGED
@@ -10,8 +10,13 @@ from typing import TYPE_CHECKING
  from evalscope.report import Report, ReportKey, get_data_frame
  from evalscope.utils.logger import get_logger
  from ..constants import DATASET_TOKEN, LATEX_DELIMITERS, MODEL_TOKEN, REPORT_TOKEN
- from ..utils.data_utils import (get_acc_report_df, get_model_prediction, get_report_analysis, get_single_dataset_df,
- load_single_report)
+ from ..utils.data_utils import (
+ get_acc_report_df,
+ get_model_prediction,
+ get_report_analysis,
+ get_single_dataset_df,
+ load_single_report,
+ )
  from ..utils.localization import get_single_model_locale
  from ..utils.text_utils import convert_markdown_image, process_json_content, process_model_prediction
  from ..utils.visualization import plot_single_dataset_scores, plot_single_report_scores, plot_single_report_sunburst
@@ -63,7 +68,8 @@ def create_single_model_tab(sidebar: 'SidebarComponents', lang: str):

  with gr.Row():
  answer_mode_radio = gr.Radio(
- label=locale_dict['answer_mode'], choices=['All', 'Pass', 'Fail'], value='All', interactive=True)
+ label=locale_dict['answer_mode'], choices=['All', 'Pass', 'Fail'], value='All', interactive=True
+ )
  score_threshold = gr.Number(value=0.99, label=locale_dict['score_threshold'], interactive=True)

  data_review_df = gr.State(None)
@@ -76,7 +82,8 @@ def create_single_model_tab(sidebar: 'SidebarComponents', lang: str):
  answer_mode_counts = gr.Markdown('')
  with gr.Column():
  page_number = gr.Number(
- value=1, label=locale_dict['page'], minimum=1, maximum=1, step=1, interactive=True)
+ value=1, label=locale_dict['page'], minimum=1, maximum=1, step=1, interactive=True
+ )

  # show data review table
  with gr.Row(variant='panel'):
@@ -98,14 +105,15 @@ def create_single_model_tab(sidebar: 'SidebarComponents', lang: str):
  with gr.Row(variant='panel'):
  with gr.Column():
  gr.Markdown('### *Input*')
- input_text = gr.Code('', elem_id='input_text', language='json', wrap_lines=False)
+ input_text = gr.Markdown('', elem_id='input_text', latex_delimiters=LATEX_DELIMITERS)
  with gr.Column():
  gr.Markdown('### *Generated*')
  generated_text = gr.Markdown('', elem_id='generated_text', latex_delimiters=LATEX_DELIMITERS)

  @report_name.change(
  inputs=[sidebar.root_path, report_name],
- outputs=[report_list, task_config, dataset_radio, work_dir, model_name])
+ outputs=[report_list, task_config, dataset_radio, work_dir, model_name]
+ )
  def update_single_report_data(root_path, report_name):
  report_list, datasets, task_cfg = load_single_report(root_path, report_name)
  work_dir = os.path.join(root_path, report_name.split(REPORT_TOKEN)[0])
@@ -122,7 +130,8 @@ def create_single_model_tab(sidebar: 'SidebarComponents', lang: str):
  @gr.on(
  triggers=[dataset_radio.change, report_list.change],
  inputs=[dataset_radio, report_list],
- outputs=[dataset_plot, dataset_table, subset_select, data_review_df, report_analysis])
+ outputs=[dataset_plot, dataset_table, subset_select, data_review_df, report_analysis]
+ )
  def update_single_report_dataset(dataset_name, report_list):
  logger.debug(f'Updating single report dataset: {dataset_name}')
  report_df = get_data_frame(report_list=report_list)
@@ -136,7 +145,8 @@ def create_single_model_tab(sidebar: 'SidebarComponents', lang: str):
  @gr.on(
  triggers=[subset_select.change],
  inputs=[work_dir, model_name, dataset_radio, subset_select],
- outputs=[data_review_df, page_number])
+ outputs=[data_review_df, page_number]
+ )
  def update_single_report_subset(work_dir, model_name, dataset_name, subset_name):
  if not subset_name:
  return gr.skip()
@@ -146,7 +156,8 @@ def create_single_model_tab(sidebar: 'SidebarComponents', lang: str):
  @gr.on(
  triggers=[data_review_df.change, answer_mode_radio.change, score_threshold.change],
  inputs=[data_review_df, answer_mode_radio, score_threshold],
- outputs=[filtered_review_df, page_number, answer_mode_counts])
+ outputs=[filtered_review_df, page_number, answer_mode_counts]
+ )
  def filter_data(data_review_df, answer_mode, score_threshold):
  if data_review_df is None:
  return None, gr.update(value=1, maximum=1), ''
@@ -172,7 +183,8 @@ def create_single_model_tab(sidebar: 'SidebarComponents', lang: str):
  @gr.on(
  triggers=[filtered_review_df.change, page_number.change],
  inputs=[filtered_review_df, page_number, score_threshold],
- outputs=[input_text, generated_text, gold_text, pred_text, score_text, nscore])
+ outputs=[input_text, generated_text, gold_text, pred_text, score_text, nscore]
+ )
  def update_table_components(filtered_df, page_number, score_threshold):
  if filtered_df is None or len(filtered_df) == 0:
  return '', '', '', '', '', ''
@@ -185,7 +197,7 @@ def create_single_model_tab(sidebar: 'SidebarComponents', lang: str):
  row = filtered_df.iloc[start]

  # Process the data for display
- input_md = process_json_content(row['Input'])
+ input_md = row['Input'] + '\n\n' + process_model_prediction(row['Metadata'])
  generated_md = process_model_prediction(row['Generated'])
  gold_md = process_model_prediction(row['Gold'])
  pred_md = convert_markdown_image(process_model_prediction(row['Pred']))
evalscope/app/utils/data_utils.py CHANGED
@@ -7,9 +7,10 @@ import os
  import pandas as pd
  from typing import Any, Dict, List, Union

+ from evalscope.api.evaluator import CacheManager, ReviewResult
  from evalscope.constants import DataCollection
  from evalscope.report import Report, ReportKey, get_data_frame, get_report_list
- from evalscope.utils.io_utils import OutputsStructure, yaml_to_dict
+ from evalscope.utils.io_utils import OutputsStructure, jsonl_to_list, yaml_to_dict
  from evalscope.utils.logger import get_logger
  from ..constants import DATASET_TOKEN, MODEL_TOKEN, REPORT_TOKEN

@@ -39,7 +40,8 @@ def scan_for_report_folders(root_path):
  datasets.append(os.path.splitext(os.path.basename(dataset_item))[0])
  datasets = DATASET_TOKEN.join(datasets)
  reports.append(
- f'{os.path.basename(folder)}{REPORT_TOKEN}{os.path.basename(model_item)}{MODEL_TOKEN}{datasets}')
+ f'{os.path.basename(folder)}{REPORT_TOKEN}{os.path.basename(model_item)}{MODEL_TOKEN}{datasets}'
+ )

  reports = sorted(reports, reverse=True)
  logger.debug(f'reports: {reports}')
@@ -61,7 +63,8 @@ def load_single_report(root_path: str, report_name: str):
  config_files = glob.glob(os.path.join(root_path, prefix, OutputsStructure.CONFIGS_DIR, '*.yaml'))
  if not config_files:
  raise FileNotFoundError(
- f'No configuration files found in {os.path.join(root_path, prefix, OutputsStructure.CONFIGS_DIR)}')
+ f'No configuration files found in {os.path.join(root_path, prefix, OutputsStructure.CONFIGS_DIR)}'
+ )
  task_cfg_path = config_files[0]
  task_cfg = yaml_to_dict(task_cfg_path)
  return report_list, datasets, task_cfg
@@ -134,31 +137,44 @@ def get_report_analysis(report_list: List[Report], dataset_name: str) -> str:


  def get_model_prediction(work_dir: str, model_name: str, dataset_name: str, subset_name: str):
- data_path = os.path.join(work_dir, OutputsStructure.REVIEWS_DIR, model_name)
- subset_name = subset_name.replace('/', '_') # for collection report
- review_path = os.path.join(data_path, f'{dataset_name}_{subset_name}.jsonl')
- logger.debug(f'review_path: {review_path}')
- origin_df = pd.read_json(review_path, lines=True)
+ # Load review cache
+ outputs = OutputsStructure(work_dir, is_make=False)
+ cache_manager = CacheManager(outputs, model_name, dataset_name)
+ if dataset_name == DataCollection.NAME:
+ review_cache_path = cache_manager.get_review_cache_path('default')
+ else:
+ review_cache_path = cache_manager.get_review_cache_path(subset_name)
+ logger.debug(f'review_path: {review_cache_path}')
+ review_caches = jsonl_to_list(review_cache_path)

  ds = []
- for i, item in origin_df.iterrows():
- raw_input = item['raw_input']
- sample_index = item['index']
- for choice_index, choice in enumerate(item['choices']):
- raw_pred_answer = choice['message']['content']
- parsed_gold_answer = choice['review']['gold']
- parsed_pred_answer = choice['review']['pred']
- score = choice['review']['result']
- raw_d = {
- 'Index': f'{sample_index}_{choice_index}',
- 'Input': raw_input,
- 'Generated': raw_pred_answer if raw_pred_answer != parsed_pred_answer else '*Same as Pred*',
- 'Gold': parsed_gold_answer if parsed_gold_answer != raw_input else '*Same as Input*',
- 'Pred': parsed_pred_answer,
- 'Score': score,
- 'NScore': normalize_score(score)
- }
- ds.append(raw_d)
+ for cache in review_caches:
+ review_result = ReviewResult.model_validate(cache)
+ sample_score = review_result.sample_score
+
+ if dataset_name == DataCollection.NAME:
+ # Filter subset name
+ collection_info = sample_score.sample_metadata[DataCollection.INFO]
+ sample_dataset_name = collection_info.get('dataset_name', 'default')
+ sample_subset_name = collection_info.get('subset_name', 'default')
+ if f'{sample_dataset_name}/{sample_subset_name}' != subset_name:
+ continue
+
+ prediction = sample_score.score.prediction
+ target = review_result.target
+ extracted_prediction = sample_score.score.extracted_prediction
+ score = sample_score.score
+ raw_d = {
+ 'Index': str(review_result.index),
+ 'Input': review_result.input.replace('\n', '\n\n'), # for markdown
+ 'Metadata': sample_score.sample_metadata,
+ 'Generated': prediction if prediction != extracted_prediction else '*Same as Pred*',
+ 'Gold': target,
+ 'Pred': extracted_prediction,
+ 'Score': score.model_dump(exclude_none=True),
+ 'NScore': normalize_score(score.main_value)
+ }
+ ds.append(raw_d)

  df_subset = pd.DataFrame(ds)
  return df_subset
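The hunk above replaces direct JSONL parsing of review files with the new review-cache API. For orientation, a minimal sketch of reading one review cache outside the UI, reusing only the calls that appear in this hunk; the output directory, model, dataset, and subset names below are placeholders, not values taken from this release:

```python
# Hedged sketch based on the calls visible in the get_model_prediction hunk above.
from evalscope.api.evaluator import CacheManager, ReviewResult
from evalscope.utils.io_utils import OutputsStructure, jsonl_to_list

outputs = OutputsStructure('./outputs/20240101_000000', is_make=False)  # existing run dir (placeholder)
cache_manager = CacheManager(outputs, 'my-model', 'gsm8k')              # model / dataset names are examples
review_cache_path = cache_manager.get_review_cache_path('main')         # subset name is an example

for cache in jsonl_to_list(review_cache_path):
    review = ReviewResult.model_validate(cache)
    score = review.sample_score.score
    print(review.index, score.extracted_prediction, score.main_value)
```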
evalscope/app/utils/text_utils.py CHANGED
@@ -109,8 +109,6 @@ def process_json_content(content: Any) -> str:
  Returns:
  str: The processed content formatted for markdown display.
  """
- if isinstance(content, (np.bool_, np.int_, np.float_)):
- content = str(content)

  if isinstance(content, str):
  content = {'content': content}
evalscope/app/utils/visualization.py CHANGED
@@ -47,7 +47,8 @@ def plot_single_report_sunburst(report_list: List[Report]):
  color_continuous_scale='RdYlGn', # see https://plotly.com/python/builtin-colorscales/
  color_continuous_midpoint=np.average(df[ReportKey.score], weights=df[ReportKey.num]),
  template=PLOTLY_THEME,
- maxdepth=4)
+ maxdepth=4
+ )
  plot.update_traces(insidetextorientation='radial')
  plot.update_layout(margin=dict(t=10, l=10, r=10, b=10), coloraxis=dict(cmin=0, cmax=1), height=600)
  return plot
@@ -61,7 +62,8 @@ def plot_single_dataset_scores(df: pd.DataFrame):
  y=df[ReportKey.score],
  color=df[ReportKey.subset_name],
  text=df[ReportKey.score],
- barmode='group')
+ barmode='group'
+ )

  width = 0.2 if len(df[ReportKey.subset_name]) <= 3 else None
  plot.update_traces(width=width, texttemplate='%{text:.2f}', textposition='outside')
@@ -82,10 +84,13 @@ def plot_multi_report_radar(df: pd.DataFrame):
  r=common_group[ReportKey.score],
  theta=common_group[ReportKey.dataset_name],
  name=model_name,
- fill='toself'))
+ fill='toself'
+ )
+ )

  fig.update_layout(
  template=PLOTLY_THEME,
  polar=dict(radialaxis=dict(visible=True, range=[0, 1])),
- margin=dict(t=20, l=20, r=20, b=20))
+ margin=dict(t=20, l=20, r=20, b=20)
+ )
  return fig
evalscope/arguments.py CHANGED
@@ -1,7 +1,8 @@
+ # flake8: noqa: E501
  import argparse
  import json

- from evalscope.constants import EvalBackend, EvalStage, EvalType, JudgeStrategy, ModelTask, OutputType
+ from evalscope.constants import EvalBackend, EvalType, JudgeStrategy, ModelTask, OutputType


  class ParseStrArgsAction(argparse.Action):
@@ -47,7 +48,6 @@ def add_argument(parser: argparse.ArgumentParser):
  parser.add_argument('--model-task', type=str, default=ModelTask.TEXT_GENERATION, choices=[ModelTask.TEXT_GENERATION, ModelTask.IMAGE_GENERATION], help='The model task for model id.') # noqa: E501

  # Template-related arguments
- parser.add_argument('--template-type', type=str, required=False, help='Deprecated, will be removed in v1.0.0.')
  parser.add_argument('--chat-template', type=str, required=False, help='The custom jinja template for chat generation.') # noqa: E501

  # Dataset-related arguments
@@ -65,14 +65,13 @@ def add_argument(parser: argparse.ArgumentParser):
  parser.add_argument('--eval-backend', type=str, help='The evaluation backend to use.',
  choices=[EvalBackend.NATIVE, EvalBackend.OPEN_COMPASS, EvalBackend.VLM_EVAL_KIT, EvalBackend.RAG_EVAL]) # noqa: E501
  parser.add_argument('--eval-config', type=str, required=False, help='The eval task config file path for evaluation backend.') # noqa: E501
- parser.add_argument('--stage', type=str, default='all', help='The stage of evaluation pipeline.',
- choices=[EvalStage.ALL, EvalStage.INFER, EvalStage.REVIEW])
- parser.add_argument('--limit', type=float, default=None, help='Max evaluation samples num for each subset.')
  parser.add_argument('--eval-batch-size', type=int, default=1, help='The batch size for evaluation.')
+ parser.add_argument('--limit', type=float, default=None, help='Max evaluation samples num for each subset.')
+ parser.add_argument('--repeats', type=int, default=1, help='Number of times to repeat the dataset items for k-metrics.') # noqa: E501

  # Cache and working directory arguments
- parser.add_argument('--mem-cache', action='store_true', default=False, help='Deprecated, will be removed in v1.0.0.') # noqa: E501
  parser.add_argument('--use-cache', type=str, help='Path to reuse the cached results.')
+ parser.add_argument('--rerun-review', action='store_true', default=False, help='Rerun the review process when use_cache.')
  parser.add_argument('--work-dir', type=str, help='The root cache dir.')

  # Debug and runtime mode arguments
@@ -83,7 +82,7 @@ def add_argument(parser: argparse.ArgumentParser):
  parser.add_argument('--api-key', type=str, default='EMPTY', help='The API key for the remote API model.')
  parser.add_argument('--api-url', type=str, default=None, help='The API url for the remote API model.')
  parser.add_argument('--timeout', type=float, default=None, help='The timeout for the remote API model.')
- parser.add_argument('--stream', action='store_true', default=False, help='Stream mode.') # noqa: E501
+ parser.add_argument('--stream', action='store_true', default=None, help='Stream mode.') # noqa: E501

  # LLMJudge arguments
  parser.add_argument('--judge-strategy', type=str, default=JudgeStrategy.AUTO, help='The judge strategy.')
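Net effect of the arguments.py hunks: `--template-type`, `--mem-cache`, and `--stage` are removed, `--stream` now defaults to None, and `--repeats` plus `--rerun-review` are added. Below is a hedged example of invoking the changed flags; only the option names come from this diff, while the `eval` subcommand, model id, and dataset name are assumptions:

```python
# Hypothetical CLI invocation; flag names are taken from the hunks above,
# everything else (subcommand, model, dataset, paths) is an assumption.
import subprocess

subprocess.run([
    'evalscope', 'eval',
    '--model', 'Qwen/Qwen2.5-7B-Instruct',  # placeholder model id
    '--datasets', 'gsm8k',                  # placeholder dataset
    '--eval-batch-size', '8',
    '--limit', '100',
    '--repeats', '4',                       # new: repeat dataset items for k-metrics
    '--use-cache', './outputs/20240101_000000',
    '--rerun-review',                       # new: redo the review step when reusing cached results
], check=True)
```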
evalscope/backend/opencompass/api_meta_template.py CHANGED
@@ -49,7 +49,8 @@ register_template(
  reserved_roles=[
  dict(role='SYSTEM', api_role='SYSTEM'),
  ],
- ))
+ )
+ )

  if __name__ == '__main__':
  res = MetaTemplateType.get_template_name_list()
evalscope/backend/opencompass/backend_manager.py CHANGED
@@ -182,8 +182,10 @@ class OpenCompassBackendManager(BackendManager):
  else:
  valid_dataset_names, invalid_dataset_names = get_valid_list(dataset_names, dataset_names_all)
  if len(invalid_dataset_names) > 0:
- logger.error(f'Invalid datasets: {invalid_dataset_names}, '
- f'refer to the following list to get proper dataset name: {dataset_names_all}')
+ logger.error(
+ f'Invalid datasets: {invalid_dataset_names}, '
+ f'refer to the following list to get proper dataset name: {dataset_names_all}'
+ )
  assert len(valid_dataset_names) > 0, f'No valid datasets. ' \
  f'To get the valid datasets, please refer to {dataset_names_all}'

@@ -252,7 +254,8 @@ if __name__ == '__main__':
  'openai_api_base': 'http://127.0.0.1:8000/v1/chat/completions'
  }],
  'limit': 5
- })
+ }
+ )
  all_datasets = OpenCompassBackendManager.list_datasets()
  print(f'all_datasets: {all_datasets}')
  oc_backend_manager.run()
evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py CHANGED
@@ -100,16 +100,16 @@ class DatasetWrapper(TorchDataset):

  def get_dataset_default_task(dataset):
  if dataset in (
- 'custom',
- 'muge',
- 'flickr30k',
- 'flickr8k',
- 'mscoco_captions',
- 'mscoco_captions2017',
- 'multilingual_mscoco_captions',
- 'flickr30k-200',
- 'crossmodal3600',
- 'xtd200',
+ 'custom',
+ 'muge',
+ 'flickr30k',
+ 'flickr8k',
+ 'mscoco_captions',
+ 'mscoco_captions2017',
+ 'multilingual_mscoco_captions',
+ 'flickr30k-200',
+ 'crossmodal3600',
+ 'xtd200',
  ):
  return 'zeroshot_retrieval'
  else:
evalscope/backend/rag_eval/clip_benchmark/task_template.py CHANGED
@@ -4,8 +4,11 @@ import torch
  from itertools import product

  from evalscope.backend.rag_eval.clip_benchmark.arguments import Arguments
- from evalscope.backend.rag_eval.clip_benchmark.dataset_builder import (build_dataset, get_dataloader,
- get_dataset_default_task)
+ from evalscope.backend.rag_eval.clip_benchmark.dataset_builder import (
+ build_dataset,
+ get_dataloader,
+ get_dataset_default_task,
+ )
  from evalscope.backend.rag_eval.clip_benchmark.tasks import image_caption, zeroshot_classification, zeroshot_retrieval
  from evalscope.backend.rag_eval.utils.clip import VisionModel
  from evalscope.utils.logger import get_logger
@@ -66,8 +69,9 @@ def evaluate(args: Arguments):
  if verbose:
  logger.info(f'Zero-shot templates: {zeroshot_templates}')
  classnames = dataset.classes if hasattr(dataset, 'classes') else None
- assert (zeroshot_templates is not None
- and classnames is not None), 'Dataset does not support classification'
+ assert (
+ zeroshot_templates is not None and classnames is not None
+ ), 'Dataset does not support classification'
  metrics = zeroshot_classification.evaluate(
  model,
  dataloader,
evalscope/backend/rag_eval/ragas/task_template.py CHANGED
@@ -34,7 +34,8 @@ def rag_eval(args: EvaluationArguments, ) -> None:
  target_lang=args.language,
  llm=LangchainLLMWrapper(llm),
  adapt_instruction=True,
- ))
+ )
+ )
  # load dataset
  dataset = Dataset.from_json(args.testset_file)

evalscope/backend/rag_eval/ragas/tasks/build_distribution.py CHANGED
@@ -27,7 +27,8 @@ def default_query_distribution(llm: BaseRagasLLM, kg: KnowledgeGraph, language:
  target_lang=language,
  llm=llm,
  adapt_instruction=True,
- ))
+ )
+ )

  default_queries = [
  single_hop,
evalscope/backend/rag_eval/ragas/tasks/build_transform.py CHANGED
@@ -44,8 +44,9 @@ def default_transforms(
  return bins

  def filter_doc_with_num_tokens(node, min_num_tokens=500):
- return (node.type == NodeType.DOCUMENT
- and num_tokens_from_string(node.properties['page_content']) > min_num_tokens)
+ return (
+ node.type == NodeType.DOCUMENT and num_tokens_from_string(node.properties['page_content']) > min_num_tokens
+ )

  def filter_docs(node):
  return node.type == NodeType.DOCUMENT
@@ -90,7 +91,8 @@ def default_transforms(
  target_lang=language,
  llm=llm,
  adapt_instruction=True,
- ))
+ )
+ )

  transforms = [
  headline_extractor,
@@ -121,7 +123,8 @@ def default_transforms(
  target_lang=language,
  llm=llm,
  adapt_instruction=True,
- ))
+ )
+ )

  transforms = [
  summary_extractor,
evalscope/backend/rag_eval/ragas/tasks/testset_generation.py CHANGED
@@ -113,7 +113,8 @@ def generate_testset(args: TestsetGenerationArguments) -> None:

  # generate testset
  generator = TestsetGenerator(
- llm=wrapped_llm, embedding_model=wrapped_embeddings, knowledge_graph=knowledge_graph, persona_list=persona_list)
+ llm=wrapped_llm, embedding_model=wrapped_embeddings, knowledge_graph=knowledge_graph, persona_list=persona_list
+ )

  testset = generator.generate(
  testset_size=args.test_size,
evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py CHANGED
@@ -34,7 +34,8 @@ async def translate_prompt(

  logger.info(f'Translating prompts to {target_lang}')
  adapted_prompts = await prompt_user.adapt_prompts(
- language=target_lang, llm=llm, adapt_instruction=adapt_instruction)
+ language=target_lang, llm=llm, adapt_instruction=adapt_instruction
+ )
  prompt_user.set_prompts(**adapted_prompts)
  try:
  prompt_user.save_prompts(prompt_dir)
evalscope/backend/rag_eval/utils/embedding.py CHANGED
@@ -196,7 +196,8 @@ class APIEmbeddingModel(BaseModel):
  openai_api_base=self.openai_api_base,
  openai_api_key=self.openai_api_key,
  dimensions=self.dimensions,
- check_embedding_ctx_length=False)
+ check_embedding_ctx_length=False
+ )

  super().__init__(model_name_or_path=self.model_name, **kwargs)