evalscope 1.0.1__tar.gz → 1.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (578)
  1. evalscope-1.1.0/MANIFEST.in +10 -0
  2. {evalscope-1.0.1/evalscope.egg-info → evalscope-1.1.0}/PKG-INFO +6 -3
  3. {evalscope-1.0.1 → evalscope-1.1.0}/README.md +3 -1
  4. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/api/benchmark/adapters/default_data_adapter.py +18 -4
  5. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
  6. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/api/benchmark/adapters/text2image_adapter.py +5 -4
  7. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/api/benchmark/adapters/vision_language_adapter.py +3 -1
  8. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/api/benchmark/benchmark.py +27 -2
  9. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/api/benchmark/meta.py +3 -0
  10. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/api/evaluator/evaluator.py +5 -0
  11. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/api/evaluator/state.py +5 -0
  12. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/api/messages/chat_message.py +6 -1
  13. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/api/mixin/__init__.py +1 -0
  14. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/api/mixin/llm_judge_mixin.py +2 -0
  15. evalscope-1.1.0/evalscope/api/mixin/sandbox_mixin.py +204 -0
  16. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/api/model/generate_config.py +0 -3
  17. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/api/model/model.py +1 -1
  18. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/api/tool/tool_info.py +1 -1
  19. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/app/ui/multi_model.py +6 -1
  20. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/app/ui/single_model.py +8 -2
  21. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/app/utils/data_utils.py +3 -2
  22. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/app/utils/visualization.py +2 -2
  23. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/arguments.py +6 -0
  24. evalscope-1.1.0/evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  25. evalscope-1.1.0/evalscope/benchmarks/amc/amc_adapter.py +46 -0
  26. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
  27. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/bfcl/bfcl_adapter.py +106 -2
  28. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/bfcl/generation.py +7 -7
  29. evalscope-1.1.0/evalscope/benchmarks/blink/blink_adapter.py +61 -0
  30. evalscope-1.1.0/evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  31. evalscope-1.1.0/evalscope/benchmarks/chartqa/utils.py +38 -0
  32. evalscope-1.1.0/evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  33. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/drop/drop_adapter.py +1 -1
  34. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/general_arena/utils.py +2 -1
  35. evalscope-1.1.0/evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  36. evalscope-1.1.0/evalscope/benchmarks/healthbench/utils.py +102 -0
  37. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/hle/hle_adapter.py +3 -2
  38. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/humaneval/humaneval_adapter.py +19 -35
  39. evalscope-1.1.0/evalscope/benchmarks/humaneval/utils.py +235 -0
  40. evalscope-1.1.0/evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  41. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
  42. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +60 -37
  43. evalscope-1.1.0/evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  44. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/math_500/math_500_adapter.py +0 -1
  45. evalscope-1.1.0/evalscope/benchmarks/minerva_math/minerva_math_adapter.py +48 -0
  46. evalscope-1.1.0/evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  47. evalscope-1.1.0/evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  48. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/mmmu/mmmu_adapter.py +1 -1
  49. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +4 -9
  50. evalscope-1.1.0/evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  51. evalscope-1.1.0/evalscope/benchmarks/multi_if/metrics.py +120 -0
  52. evalscope-1.1.0/evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  53. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +1 -4
  54. evalscope-1.1.0/evalscope/benchmarks/ocr_bench/ocr_bench_adapter.py +101 -0
  55. evalscope-1.1.0/evalscope/benchmarks/ocr_bench_v2/IoUscore_metric.py +87 -0
  56. evalscope-1.1.0/evalscope/benchmarks/ocr_bench_v2/TEDS_metric.py +963 -0
  57. {evalscope-1.0.1/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models → evalscope-1.1.0/evalscope/benchmarks/ocr_bench_v2}/__init__.py +0 -0
  58. evalscope-1.1.0/evalscope/benchmarks/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  59. evalscope-1.1.0/evalscope/benchmarks/ocr_bench_v2/page_ocr_metric.py +50 -0
  60. evalscope-1.1.0/evalscope/benchmarks/ocr_bench_v2/parallel.py +46 -0
  61. evalscope-1.1.0/evalscope/benchmarks/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  62. evalscope-1.1.0/evalscope/benchmarks/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  63. evalscope-1.1.0/evalscope/benchmarks/ocr_bench_v2/spotting_eval/script.py +481 -0
  64. evalscope-1.1.0/evalscope/benchmarks/ocr_bench_v2/spotting_metric.py +179 -0
  65. evalscope-1.1.0/evalscope/benchmarks/ocr_bench_v2/utils.py +432 -0
  66. evalscope-1.1.0/evalscope/benchmarks/ocr_bench_v2/vqa_metric.py +254 -0
  67. evalscope-1.1.0/evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  68. evalscope-1.1.0/evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  69. evalscope-1.1.0/evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  70. evalscope-1.1.0/evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  71. evalscope-1.1.0/evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  72. evalscope-1.1.0/evalscope/benchmarks/simple_qa/__init__.py +0 -0
  73. evalscope-1.1.0/evalscope/benchmarks/super_gpqa/__init__.py +0 -0
  74. evalscope-1.1.0/evalscope/benchmarks/tau_bench/__init__.py +0 -0
  75. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/tau_bench/tau_bench_adapter.py +6 -1
  76. evalscope-1.1.0/evalscope/benchmarks/text2image/__init__.py +0 -0
  77. evalscope-1.1.0/evalscope/benchmarks/tool_bench/__init__.py +0 -0
  78. evalscope-1.1.0/evalscope/benchmarks/winogrande/__init__.py +0 -0
  79. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/config.py +24 -1
  80. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/constants.py +3 -0
  81. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/evaluator/evaluator.py +25 -7
  82. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/metric.py +78 -2
  83. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/metrics.py +16 -0
  84. evalscope-1.1.0/evalscope/metrics/t2v_metrics/__init__.py +0 -0
  85. evalscope-1.1.0/evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
  86. evalscope-1.1.0/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
  87. evalscope-1.1.0/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
  88. evalscope-1.1.0/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
  89. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
  90. evalscope-1.1.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
  91. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
  92. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
  93. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/models/model_apis.py +10 -8
  94. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/models/utils/openai.py +1 -2
  95. evalscope-1.1.0/evalscope/perf/__init__.py +0 -0
  96. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/perf/arguments.py +2 -0
  97. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/perf/plugin/api/base.py +2 -2
  98. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/perf/plugin/api/default_api.py +7 -7
  99. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/perf/plugin/api/openai_api.py +83 -19
  100. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/perf/plugin/datasets/flickr8k.py +2 -2
  101. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
  102. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
  103. evalscope-1.1.0/evalscope/perf/utils/__init__.py +0 -0
  104. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/perf/utils/benchmark_util.py +1 -2
  105. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/report/__init__.py +9 -1
  106. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/report/combinator.py +45 -20
  107. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/report/report.py +8 -4
  108. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/run.py +1 -1
  109. evalscope-1.1.0/evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  110. evalscope-1.1.0/evalscope/utils/function_utils.py +70 -0
  111. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/utils/import_utils.py +63 -13
  112. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/utils/io_utils.py +19 -11
  113. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/utils/json_schema.py +25 -2
  114. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/utils/logger.py +19 -0
  115. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/utils/model_utils.py +1 -1
  116. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/utils/multi_choices.py +16 -1
  117. evalscope-1.1.0/evalscope/version.py +4 -0
  118. {evalscope-1.0.1 → evalscope-1.1.0/evalscope.egg-info}/PKG-INFO +6 -3
  119. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope.egg-info/SOURCES.txt +53 -38
  120. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope.egg-info/requires.txt +4 -37
  121. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope.egg-info/top_level.txt +0 -1
  122. evalscope-1.1.0/pyproject.toml +61 -0
  123. evalscope-1.1.0/setup.py +4 -0
  124. evalscope-1.0.1/MANIFEST.in +0 -4
  125. evalscope-1.0.1/evalscope/utils/function_utils.py +0 -29
  126. evalscope-1.0.1/evalscope/version.py +0 -4
  127. evalscope-1.0.1/evalscope.egg-info/not-zip-safe +0 -1
  128. evalscope-1.0.1/requirements/aigc.txt +0 -8
  129. evalscope-1.0.1/requirements/app.txt +0 -2
  130. evalscope-1.0.1/requirements/dev.txt +0 -5
  131. evalscope-1.0.1/requirements/docs.txt +0 -6
  132. evalscope-1.0.1/requirements/framework.txt +0 -29
  133. evalscope-1.0.1/requirements/opencompass.txt +0 -1
  134. evalscope-1.0.1/requirements/perf.txt +0 -10
  135. evalscope-1.0.1/requirements/rag.txt +0 -8
  136. evalscope-1.0.1/requirements/vlmeval.txt +0 -1
  137. evalscope-1.0.1/requirements.txt +0 -1
  138. evalscope-1.0.1/setup.py +0 -196
  139. evalscope-1.0.1/tests/__init__.py +0 -1
  140. evalscope-1.0.1/tests/benchmark/__init__.py +0 -1
  141. evalscope-1.0.1/tests/benchmark/test_eval.py +0 -385
  142. evalscope-1.0.1/tests/benchmark/test_image_edit.py +0 -65
  143. evalscope-1.0.1/tests/benchmark/test_t2i.py +0 -142
  144. evalscope-1.0.1/tests/benchmark/test_vlm.py +0 -80
  145. evalscope-1.0.1/tests/cli/__init__.py +0 -1
  146. evalscope-1.0.1/tests/cli/test_all.py +0 -269
  147. evalscope-1.0.1/tests/cli/test_collection.py +0 -99
  148. evalscope-1.0.1/tests/cli/test_custom.py +0 -268
  149. evalscope-1.0.1/tests/cli/test_reasoning.py +0 -81
  150. evalscope-1.0.1/tests/common.py +0 -73
  151. evalscope-1.0.1/tests/perf/__init__.py +0 -1
  152. evalscope-1.0.1/tests/perf/test_perf.py +0 -178
  153. evalscope-1.0.1/tests/rag/test_clip_benchmark.py +0 -87
  154. evalscope-1.0.1/tests/rag/test_mteb.py +0 -213
  155. evalscope-1.0.1/tests/rag/test_ragas.py +0 -128
  156. evalscope-1.0.1/tests/swift/__init__.py +0 -1
  157. evalscope-1.0.1/tests/swift/test_run_swift_eval.py +0 -146
  158. evalscope-1.0.1/tests/swift/test_run_swift_vlm_eval.py +0 -128
  159. evalscope-1.0.1/tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
  160. evalscope-1.0.1/tests/test_run_all.py +0 -12
  161. evalscope-1.0.1/tests/utils.py +0 -13
  162. evalscope-1.0.1/tests/vlm/__init__.py +0 -1
  163. evalscope-1.0.1/tests/vlm/test_vlmeval.py +0 -102
  164. {evalscope-1.0.1 → evalscope-1.1.0}/LICENSE +0 -0
  165. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/__init__.py +0 -0
  166. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/api/__init__.py +0 -0
  167. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/api/benchmark/__init__.py +0 -0
  168. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/api/benchmark/adapters/__init__.py +0 -0
  169. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/api/benchmark/adapters/image_edit_adapter.py +0 -0
  170. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/api/dataset/__init__.py +0 -0
  171. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/api/dataset/dataset.py +0 -0
  172. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/api/dataset/loader.py +0 -0
  173. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/api/dataset/utils.py +0 -0
  174. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/api/evaluator/__init__.py +0 -0
  175. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/api/evaluator/cache.py +0 -0
  176. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/api/filter/__init__.py +0 -0
  177. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/api/filter/filter.py +0 -0
  178. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/api/messages/__init__.py +0 -0
  179. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/api/messages/content.py +0 -0
  180. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/api/messages/utils.py +0 -0
  181. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/api/metric/__init__.py +0 -0
  182. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/api/metric/metric.py +0 -0
  183. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/api/metric/scorer.py +0 -0
  184. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/api/model/__init__.py +0 -0
  185. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/api/model/model_output.py +0 -0
  186. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/api/registry.py +0 -0
  187. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/api/tool/__init__.py +0 -0
  188. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/api/tool/tool_call.py +0 -0
  189. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/api/tool/utils.py +0 -0
  190. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/app/__init__.py +0 -0
  191. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/app/app.py +0 -0
  192. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/app/arguments.py +0 -0
  193. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/app/constants.py +0 -0
  194. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/app/ui/__init__.py +0 -0
  195. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/app/ui/app_ui.py +0 -0
  196. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/app/ui/sidebar.py +0 -0
  197. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/app/ui/visualization.py +0 -0
  198. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/app/utils/env_utils.py +0 -0
  199. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/app/utils/localization.py +0 -0
  200. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/app/utils/text_utils.py +0 -0
  201. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/backend/__init__.py +0 -0
  202. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/backend/base.py +0 -0
  203. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/backend/opencompass/__init__.py +0 -0
  204. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/backend/opencompass/api_meta_template.py +0 -0
  205. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/backend/opencompass/backend_manager.py +0 -0
  206. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/backend/opencompass/tasks/__init__.py +0 -0
  207. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/backend/opencompass/tasks/eval_api.py +0 -0
  208. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/backend/opencompass/tasks/eval_datasets.py +0 -0
  209. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/backend/rag_eval/__init__.py +0 -0
  210. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/backend/rag_eval/backend_manager.py +0 -0
  211. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/backend/rag_eval/clip_benchmark/__init__.py +0 -0
  212. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/backend/rag_eval/clip_benchmark/arguments.py +0 -0
  213. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +0 -0
  214. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/backend/rag_eval/clip_benchmark/task_template.py +0 -0
  215. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/__init__.py +0 -0
  216. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +0 -0
  217. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -0
  218. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +0 -0
  219. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +0 -0
  220. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt +0 -0
  221. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/backend/rag_eval/cmteb/__init__.py +0 -0
  222. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/backend/rag_eval/cmteb/arguments.py +0 -0
  223. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/backend/rag_eval/cmteb/base.py +0 -0
  224. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/backend/rag_eval/cmteb/task_template.py +0 -0
  225. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/backend/rag_eval/cmteb/tasks/Classification.py +0 -0
  226. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +0 -0
  227. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +0 -0
  228. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +0 -0
  229. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +0 -0
  230. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +0 -0
  231. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/backend/rag_eval/cmteb/tasks/STS.py +0 -0
  232. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/backend/rag_eval/cmteb/tasks/__init__.py +0 -0
  233. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/backend/rag_eval/ragas/__init__.py +0 -0
  234. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/backend/rag_eval/ragas/arguments.py +0 -0
  235. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -0
  236. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/backend/rag_eval/ragas/task_template.py +0 -0
  237. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/backend/rag_eval/ragas/tasks/__init__.py +0 -0
  238. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +0 -0
  239. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/backend/rag_eval/ragas/tasks/build_transform.py +0 -0
  240. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +0 -0
  241. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -0
  242. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/backend/rag_eval/utils/__init__.py +0 -0
  243. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/backend/rag_eval/utils/clip.py +0 -0
  244. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/backend/rag_eval/utils/embedding.py +0 -0
  245. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/backend/rag_eval/utils/llm.py +0 -0
  246. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/backend/rag_eval/utils/tools.py +0 -0
  247. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/backend/vlm_eval_kit/__init__.py +0 -0
  248. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/backend/vlm_eval_kit/backend_manager.py +0 -0
  249. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/__init__.py +0 -0
  250. {evalscope-1.0.1/evalscope/benchmarks/aime → evalscope-1.1.0/evalscope/benchmarks/ai2d}/__init__.py +0 -0
  251. {evalscope-1.0.1/evalscope/benchmarks/alpaca_eval → evalscope-1.1.0/evalscope/benchmarks/aime}/__init__.py +0 -0
  252. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/aime/aime24_adapter.py +0 -0
  253. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/aime/aime25_adapter.py +0 -0
  254. {evalscope-1.0.1/evalscope/benchmarks/arena_hard → evalscope-1.1.0/evalscope/benchmarks/alpaca_eval}/__init__.py +0 -0
  255. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +0 -0
  256. {evalscope-1.0.1/evalscope/benchmarks/bfcl → evalscope-1.1.0/evalscope/benchmarks/amc}/__init__.py +0 -0
  257. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/arc/__init__.py +0 -0
  258. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/arc/arc_adapter.py +0 -0
  259. {evalscope-1.0.1/evalscope/benchmarks/chinese_simple_qa → evalscope-1.1.0/evalscope/benchmarks/arena_hard}/__init__.py +0 -0
  260. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/arena_hard/arena_hard_adapter.py +0 -0
  261. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/arena_hard/utils.py +0 -0
  262. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/bbh/__init__.py +0 -0
  263. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +0 -0
  264. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +0 -0
  265. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +0 -0
  266. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +0 -0
  267. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +0 -0
  268. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +0 -0
  269. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +0 -0
  270. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +0 -0
  271. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +0 -0
  272. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +0 -0
  273. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +0 -0
  274. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +0 -0
  275. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +0 -0
  276. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/bbh/cot_prompts/navigate.txt +0 -0
  277. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +0 -0
  278. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +0 -0
  279. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +0 -0
  280. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +0 -0
  281. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +0 -0
  282. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/bbh/cot_prompts/snarks.txt +0 -0
  283. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +0 -0
  284. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +0 -0
  285. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
  286. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
  287. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
  288. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +0 -0
  289. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +0 -0
  290. {evalscope-1.0.1/evalscope/benchmarks/data_collection → evalscope-1.1.0/evalscope/benchmarks/bfcl}/__init__.py +0 -0
  291. {evalscope-1.0.1/evalscope/benchmarks/docmath → evalscope-1.1.0/evalscope/benchmarks/blink}/__init__.py +0 -0
  292. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/ceval/__init__.py +0 -0
  293. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/ceval/ceval_adapter.py +0 -0
  294. {evalscope-1.0.1/evalscope/benchmarks/drop → evalscope-1.1.0/evalscope/benchmarks/chartqa}/__init__.py +0 -0
  295. {evalscope-1.0.1/evalscope/benchmarks/frames → evalscope-1.1.0/evalscope/benchmarks/chinese_simple_qa}/__init__.py +0 -0
  296. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +0 -0
  297. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/cmmlu/__init__.py +0 -0
  298. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/cmmlu/cmmlu_adapter.py +0 -0
  299. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/competition_math/__init__.py +0 -0
  300. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -0
  301. {evalscope-1.0.1/evalscope/benchmarks/general_arena → evalscope-1.1.0/evalscope/benchmarks/data_collection}/__init__.py +0 -0
  302. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -0
  303. {evalscope-1.0.1/evalscope/benchmarks/general_mcq → evalscope-1.1.0/evalscope/benchmarks/docmath}/__init__.py +0 -0
  304. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/docmath/docmath_adapter.py +0 -0
  305. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/docmath/utils.py +0 -0
  306. {evalscope-1.0.1/evalscope/benchmarks/gpqa → evalscope-1.1.0/evalscope/benchmarks/docvqa}/__init__.py +0 -0
  307. {evalscope-1.0.1/evalscope/benchmarks/hle → evalscope-1.1.0/evalscope/benchmarks/drop}/__init__.py +0 -0
  308. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/drop/utils.py +0 -0
  309. {evalscope-1.0.1/evalscope/benchmarks/ifeval → evalscope-1.1.0/evalscope/benchmarks/frames}/__init__.py +0 -0
  310. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/frames/frames_adapter.py +0 -0
  311. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/frames/utils.py +0 -0
  312. {evalscope-1.0.1/evalscope/benchmarks/image_edit → evalscope-1.1.0/evalscope/benchmarks/general_arena}/__init__.py +0 -0
  313. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/general_arena/general_arena_adapter.py +0 -0
  314. {evalscope-1.0.1/evalscope/benchmarks/image_edit/gedit → evalscope-1.1.0/evalscope/benchmarks/general_mcq}/__init__.py +0 -0
  315. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/general_mcq/general_mcq_adapter.py +0 -0
  316. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/general_qa/__init__.py +0 -0
  317. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/general_qa/general_qa_adapter.py +0 -0
  318. {evalscope-1.0.1/evalscope/benchmarks/iquiz → evalscope-1.1.0/evalscope/benchmarks/gpqa}/__init__.py +0 -0
  319. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/gpqa/gpqa_adapter.py +0 -0
  320. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/gpqa/prompt.py +0 -0
  321. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/gsm8k/__init__.py +0 -0
  322. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/gsm8k/gsm8k_adapter.py +0 -0
  323. {evalscope-1.0.1/evalscope/benchmarks/live_code_bench → evalscope-1.1.0/evalscope/benchmarks/healthbench}/__init__.py +0 -0
  324. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/hellaswag/__init__.py +0 -0
  325. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/hellaswag/hellaswag_adapter.py +0 -0
  326. {evalscope-1.0.1/evalscope/benchmarks/maritime_bench → evalscope-1.1.0/evalscope/benchmarks/hle}/__init__.py +0 -0
  327. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/humaneval/__init__.py +0 -0
  328. {evalscope-1.0.1/evalscope/benchmarks/math_500 → evalscope-1.1.0/evalscope/benchmarks/ifeval}/__init__.py +0 -0
  329. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/ifeval/ifeval_adapter.py +0 -0
  330. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/ifeval/instructions.py +0 -0
  331. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/ifeval/instructions_registry.py +0 -0
  332. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/ifeval/instructions_util.py +0 -0
  333. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/ifeval/utils.py +0 -0
  334. {evalscope-1.0.1/evalscope/benchmarks/math_vista → evalscope-1.1.0/evalscope/benchmarks/image_edit}/__init__.py +0 -0
  335. {evalscope-1.0.1/evalscope/benchmarks/mmlu_pro → evalscope-1.1.0/evalscope/benchmarks/image_edit/gedit}/__init__.py +0 -0
  336. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +0 -0
  337. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/image_edit/gedit/utils.py +0 -0
  338. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/image_edit/gedit/vie_prompts.py +0 -0
  339. {evalscope-1.0.1/evalscope/benchmarks/mmlu_redux → evalscope-1.1.0/evalscope/benchmarks/infovqa}/__init__.py +0 -0
  340. {evalscope-1.0.1/evalscope/benchmarks/mmmu → evalscope-1.1.0/evalscope/benchmarks/iquiz}/__init__.py +0 -0
  341. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/iquiz/iquiz_adapter.py +0 -0
  342. {evalscope-1.0.1/evalscope/benchmarks/mmmu_pro → evalscope-1.1.0/evalscope/benchmarks/live_code_bench}/__init__.py +0 -0
  343. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/live_code_bench/extract_utils.py +0 -0
  344. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/live_code_bench/load_utils.py +0 -0
  345. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/live_code_bench/pass_k_utils.py +0 -0
  346. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/live_code_bench/prompts.py +0 -0
  347. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/live_code_bench/testing_util.py +0 -0
  348. {evalscope-1.0.1/evalscope/benchmarks/musr → evalscope-1.1.0/evalscope/benchmarks/maritime_bench}/__init__.py +0 -0
  349. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +0 -0
  350. {evalscope-1.0.1/evalscope/benchmarks/needle_haystack → evalscope-1.1.0/evalscope/benchmarks/math_500}/__init__.py +0 -0
  351. {evalscope-1.0.1/evalscope/benchmarks/process_bench → evalscope-1.1.0/evalscope/benchmarks/math_vista}/__init__.py +0 -0
  352. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/math_vista/math_vista_adapter.py +0 -0
  353. {evalscope-1.0.1/evalscope/benchmarks/simple_qa → evalscope-1.1.0/evalscope/benchmarks/minerva_math}/__init__.py +0 -0
  354. {evalscope-1.0.1/evalscope/benchmarks/super_gpqa → evalscope-1.1.0/evalscope/benchmarks/mm_bench}/__init__.py +0 -0
  355. {evalscope-1.0.1/evalscope/benchmarks/tau_bench → evalscope-1.1.0/evalscope/benchmarks/mm_star}/__init__.py +0 -0
  356. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/mmlu/__init__.py +0 -0
  357. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/mmlu/mmlu_adapter.py +0 -0
  358. {evalscope-1.0.1/evalscope/benchmarks/text2image → evalscope-1.1.0/evalscope/benchmarks/mmlu_pro}/__init__.py +0 -0
  359. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +0 -0
  360. {evalscope-1.0.1/evalscope/benchmarks/tool_bench → evalscope-1.1.0/evalscope/benchmarks/mmlu_redux}/__init__.py +0 -0
  361. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +0 -0
  362. {evalscope-1.0.1/evalscope/benchmarks/winogrande → evalscope-1.1.0/evalscope/benchmarks/mmmu}/__init__.py +0 -0
  363. {evalscope-1.0.1/evalscope/metrics/t2v_metrics → evalscope-1.1.0/evalscope/benchmarks/mmmu_pro}/__init__.py +0 -0
  364. {evalscope-1.0.1/evalscope/metrics/t2v_metrics/models → evalscope-1.1.0/evalscope/benchmarks/multi_if}/__init__.py +0 -0
  365. {evalscope-1.0.1/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model → evalscope-1.1.0/evalscope/benchmarks/musr}/__init__.py +0 -0
  366. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/musr/musr_adapter.py +0 -0
  367. {evalscope-1.0.1/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward → evalscope-1.1.0/evalscope/benchmarks/needle_haystack}/__init__.py +0 -0
  368. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/needle_haystack/utils.py +0 -0
  369. {evalscope-1.0.1/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5 → evalscope-1.1.0/evalscope/benchmarks/ocr_bench}/__init__.py +0 -0
  370. {evalscope-1.0.1/evalscope/perf → evalscope-1.1.0/evalscope/benchmarks/ocr_bench_v2/spotting_eval}/__init__.py +0 -0
  371. {evalscope-1.0.1/evalscope/perf/utils → evalscope-1.1.0/evalscope/benchmarks/olympiad_bench}/__init__.py +0 -0
  372. {evalscope-1.0.1/evalscope/third_party/thinkbench/tools → evalscope-1.1.0/evalscope/benchmarks/omni_bench}/__init__.py +0 -0
  373. {evalscope-1.0.1/tests/rag → evalscope-1.1.0/evalscope/benchmarks/process_bench}/__init__.py +0 -0
  374. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/process_bench/process_bench_adapter.py +0 -0
  375. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/race/__init__.py +0 -0
  376. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/race/race_adapter.py +0 -0
  377. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/simple_qa/simple_qa_adapter.py +0 -0
  378. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/super_gpqa/prompt.py +0 -0
  379. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +0 -0
  380. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/super_gpqa/utils.py +0 -0
  381. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/tau_bench/generation.py +0 -0
  382. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/text2image/evalmuse_adapter.py +0 -0
  383. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/text2image/genai_bench_adapter.py +0 -0
  384. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/text2image/general_t2i_adapter.py +0 -0
  385. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/text2image/hpdv2_adapter.py +0 -0
  386. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/text2image/tifa_adapter.py +0 -0
  387. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/tool_bench/tool_bench_adapter.py +0 -0
  388. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/tool_bench/utils.py +0 -0
  389. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/trivia_qa/__init__.py +0 -0
  390. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/trivia_qa/samples.jsonl +0 -0
  391. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +0 -0
  392. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/truthful_qa/__init__.py +0 -0
  393. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +0 -0
  394. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/benchmarks/winogrande/winogrande_adapter.py +0 -0
  395. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/cli/__init__.py +0 -0
  396. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/cli/base.py +0 -0
  397. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/cli/cli.py +0 -0
  398. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/cli/start_app.py +0 -0
  399. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/cli/start_eval.py +0 -0
  400. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/cli/start_perf.py +0 -0
  401. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/cli/start_server.py +0 -0
  402. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/collections/__init__.py +0 -0
  403. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/collections/sampler.py +0 -0
  404. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/collections/schema.py +0 -0
  405. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/evaluator/__init__.py +0 -0
  406. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/filters/__init__.py +0 -0
  407. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/filters/extraction.py +0 -0
  408. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/filters/selection.py +0 -0
  409. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/__init__.py +0 -0
  410. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/bundled_rouge_score/__init__.py +0 -0
  411. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/bundled_rouge_score/rouge_scorer.py +0 -0
  412. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/llm_judge.py +0 -0
  413. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/math_parser.py +0 -0
  414. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/rouge_metric.py +0 -0
  415. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/clipscore.py +0 -0
  416. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/constants.py +0 -0
  417. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/itmscore.py +0 -0
  418. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +0 -0
  419. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +0 -0
  420. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +0 -0
  421. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +0 -0
  422. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +0 -0
  423. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +0 -0
  424. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +0 -0
  425. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +0 -0
  426. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +0 -0
  427. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +0 -0
  428. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +0 -0
  429. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +0 -0
  430. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +0 -0
  431. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +0 -0
  432. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/model.py +0 -0
  433. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/utils.py +0 -0
  434. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +0 -0
  435. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +0 -0
  436. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +0 -0
  437. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +0 -0
  438. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +0 -0
  439. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +0 -0
  440. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +0 -0
  441. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +0 -0
  442. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +0 -0
  443. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
  444. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
  445. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
  446. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
  447. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
  448. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
  449. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
  450. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
  451. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
  452. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
  453. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +0 -0
  454. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +0 -0
  455. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +0 -0
  456. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +0 -0
  457. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +0 -0
  458. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +0 -0
  459. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +0 -0
  460. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +0 -0
  461. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +0 -0
  462. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +0 -0
  463. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +0 -0
  464. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +0 -0
  465. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +0 -0
  466. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +0 -0
  467. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +0 -0
  468. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +0 -0
  469. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +0 -0
  470. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +0 -0
  471. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +0 -0
  472. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +0 -0
  473. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +0 -0
  474. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +0 -0
  475. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +0 -0
  476. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +0 -0
  477. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +0 -0
  478. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +0 -0
  479. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +0 -0
  480. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +0 -0
  481. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +0 -0
  482. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +0 -0
  483. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +0 -0
  484. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +0 -0
  485. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +0 -0
  486. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +0 -0
  487. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +0 -0
  488. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +0 -0
  489. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +0 -0
  490. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +0 -0
  491. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +0 -0
  492. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +0 -0
  493. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +0 -0
  494. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +0 -0
  495. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +0 -0
  496. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +0 -0
  497. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +0 -0
  498. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +0 -0
  499. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +0 -0
  500. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +0 -0
  501. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +0 -0
  502. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +0 -0
  503. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +0 -0
  504. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +0 -0
  505. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/score.py +0 -0
  506. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/metrics/t2v_metrics/vqascore.py +0 -0
  507. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/models/__init__.py +0 -0
  508. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/models/image_edit_model.py +0 -0
  509. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/models/mockllm.py +0 -0
  510. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/models/modelscope.py +0 -0
  511. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/models/openai_compatible.py +0 -0
  512. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/models/text2image_model.py +0 -0
  513. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/perf/benchmark.py +0 -0
  514. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/perf/http_client.py +0 -0
  515. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/perf/main.py +0 -0
  516. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/perf/plugin/__init__.py +0 -0
  517. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/perf/plugin/api/__init__.py +0 -0
  518. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/perf/plugin/api/custom_api.py +0 -0
  519. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/perf/plugin/api/dashscope_api.py +0 -0
  520. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/perf/plugin/datasets/__init__.py +0 -0
  521. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/perf/plugin/datasets/base.py +0 -0
  522. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/perf/plugin/datasets/custom.py +0 -0
  523. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/perf/plugin/datasets/line_by_line.py +0 -0
  524. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/perf/plugin/datasets/longalpaca.py +0 -0
  525. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/perf/plugin/datasets/openqa.py +0 -0
  526. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/perf/plugin/datasets/random_dataset.py +0 -0
  527. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/perf/plugin/datasets/speed_benchmark.py +0 -0
  528. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/perf/plugin/registry.py +0 -0
  529. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/perf/utils/analysis_result.py +0 -0
  530. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/perf/utils/db_util.py +0 -0
  531. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/perf/utils/handler.py +0 -0
  532. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/perf/utils/local_server.py +0 -0
  533. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/perf/utils/log_utils.py +0 -0
  534. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/perf/utils/rich_display.py +0 -0
  535. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/report/generator.py +0 -0
  536. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/summarizer.py +0 -0
  537. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/third_party/__init__.py +0 -0
  538. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/third_party/longbench_write/README.md +0 -0
  539. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/third_party/longbench_write/__init__.py +0 -0
  540. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/third_party/longbench_write/default_task.json +0 -0
  541. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/third_party/longbench_write/default_task.yaml +0 -0
  542. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/third_party/longbench_write/eval.py +0 -0
  543. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/third_party/longbench_write/infer.py +0 -0
  544. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/third_party/longbench_write/longbench_write.py +0 -0
  545. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/third_party/longbench_write/resources/__init__.py +0 -0
  546. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/third_party/longbench_write/resources/judge.txt +0 -0
  547. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/third_party/longbench_write/resources/longbench_write.jsonl +0 -0
  548. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl +0 -0
  549. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl +0 -0
  550. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/third_party/longbench_write/tools/__init__.py +0 -0
  551. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/third_party/longbench_write/tools/data_etl.py +0 -0
  552. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/third_party/longbench_write/tools/openai_api.py +0 -0
  553. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/third_party/longbench_write/utils.py +0 -0
  554. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/third_party/thinkbench/__init__.py +0 -0
  555. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/third_party/thinkbench/eval.py +0 -0
  556. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/third_party/thinkbench/infer.py +0 -0
  557. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/third_party/thinkbench/resources/critique_template.txt +0 -0
  558. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/third_party/thinkbench/resources/reformat_template.txt +0 -0
  559. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/third_party/thinkbench/tools/llm.py +0 -0
  560. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/third_party/thinkbench/tools/utils.py +0 -0
  561. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/third_party/toolbench_static/README.md +0 -0
  562. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/third_party/toolbench_static/__init__.py +0 -0
  563. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/third_party/toolbench_static/config_default.json +0 -0
  564. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/third_party/toolbench_static/config_default.yaml +0 -0
  565. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/third_party/toolbench_static/eval.py +0 -0
  566. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/third_party/toolbench_static/infer.py +0 -0
  567. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/third_party/toolbench_static/llm/__init__.py +0 -0
  568. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -0
  569. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/third_party/toolbench_static/requirements.txt +0 -0
  570. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/third_party/toolbench_static/toolbench_static.py +0 -0
  571. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/utils/__init__.py +0 -0
  572. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/utils/argument_utils.py +0 -0
  573. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/utils/chat_service.py +0 -0
  574. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/utils/deprecation_utils.py +0 -0
  575. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope/utils/url_utils.py +0 -0
  576. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope.egg-info/dependency_links.txt +0 -0
  577. {evalscope-1.0.1 → evalscope-1.1.0}/evalscope.egg-info/entry_points.txt +0 -0
  578. {evalscope-1.0.1 → evalscope-1.1.0}/setup.cfg +0 -0
@@ -0,0 +1,10 @@
+ include README.md
+
+ # Include all resources (code + other files) inside the package
+ recursive-include evalscope *
+
+ # Exclude cache/compiled artifacts
+ global-exclude *.py[cod] __pycache__ *.so *.dylib
+
+ # If there are models/large files, you can add prune/exclude as needed
+ # Example: prune evalscope/models
@@ -1,11 +1,11 @@
  Metadata-Version: 2.1
  Name: evalscope
- Version: 1.0.1
+ Version: 1.1.0
  Summary: EvalScope: Lightweight LLMs Evaluation Framework
- Home-page: https://github.com/modelscope/evalscope
  Author: ModelScope team
  Author-email: contact@modelscope.cn
  License: Apache License 2.0
+ Project-URL: Homepage, https://github.com/modelscope/evalscope
  Keywords: python,llm,evaluation
  Classifier: Development Status :: 4 - Beta
  Classifier: Operating System :: OS Independent
@@ -14,6 +14,7 @@ Classifier: Programming Language :: Python :: 3.9
  Classifier: Programming Language :: Python :: 3.10
  Classifier: Programming Language :: Python :: 3.11
  Classifier: Programming Language :: Python :: 3.12
+ Classifier: License :: OSI Approved :: Apache Software License
  Requires-Python: >=3.9
  Description-Content-Type: text/markdown
  Provides-Extra: opencompass
@@ -145,7 +146,9 @@ Please scan the QR code below to join our community groups:
  > **Version 1.0 Refactoring**
  >
  > Version 1.0 introduces a major overhaul of the evaluation framework, establishing a new, more modular and extensible API layer under `evalscope/api`. Key improvements include standardized data models for benchmarks, samples, and results; a registry-based design for components such as benchmarks and metrics; and a rewritten core evaluator that orchestrates the new architecture. Existing benchmark adapters have been migrated to this API, resulting in cleaner, more consistent, and easier-to-maintain implementations.
-
+ - 🔥 **[2025.10.14]** Added support for OCRBench, OCRBench-v2, DocVQA, InfoVQA, ChartQA, and BLINK multimodal image-text evaluation benchmarks.
+ - 🔥 **[2025.09.22]** Code evaluation benchmarks (HumanEval, LiveCodeBench) now support running in a sandbox environment. To use this feature, please install [ms-enclave](https://github.com/modelscope/ms-enclave) first.
+ - 🔥 **[2025.09.19]** Added support for multimodal image-text evaluation benchmarks including RealWorldQA, AI2D, MMStar, MMBench, and OmniBench, as well as pure text evaluation benchmarks such as Multi-IF, HealthBench, and AMC.
  - 🔥 **[2025.09.05]** Added support for vision-language multimodal model evaluation tasks, such as MathVista and MMMU. For more supported datasets, please [refer to the documentation](https://evalscope.readthedocs.io/zh-cn/latest/get_started/supported_dataset/vlm.html).
  - 🔥 **[2025.09.04]** Added support for image editing task evaluation, including the [GEdit-Bench](https://modelscope.cn/datasets/stepfun-ai/GEdit-Bench) benchmark. For usage instructions, refer to the [documentation](https://evalscope.readthedocs.io/en/latest/user_guides/aigc/image_edit.html).
  - 🔥 **[2025.08.22]** Version 1.0 Refactoring. Break changes, please [refer to](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#switching-to-version-v1-0).
@@ -116,7 +116,9 @@ Please scan the QR code below to join our community groups:
  > **Version 1.0 Refactoring**
  >
  > Version 1.0 introduces a major overhaul of the evaluation framework, establishing a new, more modular and extensible API layer under `evalscope/api`. Key improvements include standardized data models for benchmarks, samples, and results; a registry-based design for components such as benchmarks and metrics; and a rewritten core evaluator that orchestrates the new architecture. Existing benchmark adapters have been migrated to this API, resulting in cleaner, more consistent, and easier-to-maintain implementations.
-
+ - 🔥 **[2025.10.14]** Added support for OCRBench, OCRBench-v2, DocVQA, InfoVQA, ChartQA, and BLINK multimodal image-text evaluation benchmarks.
+ - 🔥 **[2025.09.22]** Code evaluation benchmarks (HumanEval, LiveCodeBench) now support running in a sandbox environment. To use this feature, please install [ms-enclave](https://github.com/modelscope/ms-enclave) first.
+ - 🔥 **[2025.09.19]** Added support for multimodal image-text evaluation benchmarks including RealWorldQA, AI2D, MMStar, MMBench, and OmniBench, as well as pure text evaluation benchmarks such as Multi-IF, HealthBench, and AMC.
  - 🔥 **[2025.09.05]** Added support for vision-language multimodal model evaluation tasks, such as MathVista and MMMU. For more supported datasets, please [refer to the documentation](https://evalscope.readthedocs.io/zh-cn/latest/get_started/supported_dataset/vlm.html).
  - 🔥 **[2025.09.04]** Added support for image editing task evaluation, including the [GEdit-Bench](https://modelscope.cn/datasets/stepfun-ai/GEdit-Bench) benchmark. For usage instructions, refer to the [documentation](https://evalscope.readthedocs.io/en/latest/user_guides/aigc/image_edit.html).
  - 🔥 **[2025.08.22]** Version 1.0 Refactoring. Break changes, please [refer to](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#switching-to-version-v1-0).
@@ -128,6 +128,9 @@ class DefaultDataAdapter(DataAdapter):
  for sample in self.test_dataset[subset]:
  if isinstance(sample.input, str):
  sample.input = self.process_sample_str_input(sample, subset)
+ elif isinstance(sample.input, list):
+ # Handle list[ChatMessage] and add system prompt if needed
+ sample.input = self.process_sample_messages_input(sample, subset)

  def process_sample_str_input(self, sample: Sample, subset: str) -> List[ChatMessage]:
  """
@@ -142,6 +145,15 @@ class DefaultDataAdapter(DataAdapter):
  input_messages.insert(0, ChatMessageSystem(content=self.system_prompt))
  return input_messages

+ def process_sample_messages_input(self, sample: Sample, subset: str) -> List[ChatMessage]:
+ """
+ Normalize a sample's existing List[ChatMessage] input and ensure system prompt is set once.
+ """
+ messages = list(sample.input) # shallow copy to avoid in-place mutations
+ if self.system_prompt and not any(isinstance(m, ChatMessageSystem) for m in messages):
+ messages = [ChatMessageSystem(content=self.system_prompt)] + messages
+ return messages
+
  def process_sample_input(self, sample: Sample, subset: str) -> str:
  """
  Process a single sample's input by applying prompt templates and few-shot formatting.
@@ -642,9 +654,7 @@ class DefaultDataAdapter(DataAdapter):
  """
  pass

- def _on_generate_report(
- self, scores: Dict[str, List[AggScore]], model_name: str, add_aggregation_name: bool = True
- ) -> Report:
+ def _on_generate_report(self, scores: Dict[str, List[AggScore]], model_name: str) -> Report:
  """
  Hook method called during report generation.

@@ -660,7 +670,7 @@ class DefaultDataAdapter(DataAdapter):
  Report: The generated evaluation report
  """
  return ReportGenerator.generate_report(
- score_dict=scores, model_name=model_name, data_adapter=self, add_aggregation_name=add_aggregation_name
+ score_dict=scores, model_name=model_name, data_adapter=self, add_aggregation_name=self.add_aggregation_name
  )

  @override
@@ -682,3 +692,7 @@ class DefaultDataAdapter(DataAdapter):
  report = self._on_generate_report(scores, model_name=model_name)
  self._on_generate_report_end(report, output_dir, **kwargs)
  return report
+
+ def finalize(self, *args, **kwargs):
+ # Finalize the evaluation process
+ self.sandbox_finalize(*args, **kwargs)
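
For orientation, a minimal standalone sketch of the list-input normalization introduced above. The stand-in message classes are illustrative (not the real `evalscope.api.messages` types); only the prepend-once logic mirrors `process_sample_messages_input`.

```python
from dataclasses import dataclass
from typing import List


@dataclass
class ChatMessageSystem:  # illustrative stand-in for evalscope's ChatMessageSystem
    content: str


@dataclass
class ChatMessageUser:  # illustrative stand-in for a user message
    content: str


def ensure_system_prompt(messages: List[object], system_prompt: str) -> List[object]:
    """Prepend the system prompt once, mirroring process_sample_messages_input."""
    messages = list(messages)  # shallow copy so the sample's original list is not mutated
    if system_prompt and not any(isinstance(m, ChatMessageSystem) for m in messages):
        messages = [ChatMessageSystem(content=system_prompt)] + messages
    return messages


print(ensure_system_prompt([ChatMessageUser(content='What is 2 + 2?')], 'You are a helpful assistant.'))
```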
@@ -18,8 +18,11 @@ class MultiChoiceAdapter(DefaultDataAdapter):
  This adapter formats the input for multi-choice questions and handles few-shot examples.
  """

- multiple_correct: bool = False
- """Whether the benchmark allows multiple correct answers."""
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+
+ self.multiple_correct: bool = False
+ """Whether the benchmark allows multiple correct answers."""

  def format_prompt_template(self, sample: Sample) -> str:
  """
@@ -19,6 +19,11 @@ logger = get_logger()
  class Text2ImageAdapter(DefaultDataAdapter):
  """Text to Image Adapter for benchmarks."""

+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+
+ self.add_aggregation_name = False # Do not add aggregation name in the report by default
+
  def load_from_disk(self, **kwargs):
  return super().load_from_disk(use_local_loader=True)

@@ -150,7 +155,3 @@ class Text2ImageAdapter(DefaultDataAdapter):
  score.metadata[metric_name] = f'error: {str(e)}'

  return score
-
- def _on_generate_report(self, scores, model_name, add_aggregation_name=True):
- # Don't add aggregation name for needle haystack adapter
- return super()._on_generate_report(scores, model_name, False)
@@ -3,4 +3,6 @@ from .default_data_adapter import DefaultDataAdapter

  class VisionLanguageAdapter(DefaultDataAdapter):
  """Adapter for vision-language benchmarks. e.g., image captioning, visual question answering, etc."""
- pass
+
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
@@ -9,7 +9,7 @@ from evalscope.api.dataset import DatasetDict, Sample
  from evalscope.api.evaluator import TaskState
  from evalscope.api.filter import FilterEnsemble, build_filter_ensemble
  from evalscope.api.metric import AggScore, SampleScore
- from evalscope.api.mixin import LLMJudgeMixin
+ from evalscope.api.mixin import LLMJudgeMixin, SandboxMixin
  from evalscope.api.model import Model
  from evalscope.report import Report
  from evalscope.utils.logger import get_logger
@@ -21,7 +21,7 @@ if TYPE_CHECKING:
  logger = get_logger()


- class DataAdapter(LLMJudgeMixin, ABC):
+ class DataAdapter(LLMJudgeMixin, SandboxMixin, ABC):
  """
  Data Adapter for the benchmark.
  """
@@ -43,6 +43,12 @@ class DataAdapter(LLMJudgeMixin, ABC):
  self.save_metadata = True
  """Whether to save metadata in the review result"""

+ self.add_aggregation_name = True
+ """Whether to add aggregation name in the report"""
+
+ self.add_overall_metric = True
+ """Whether to add overall metric in the report"""
+
  self.category_map = {}
  """Category map for the benchmark"""

@@ -86,6 +92,11 @@ class DataAdapter(LLMJudgeMixin, ABC):
  """
  pass

+ @abstractmethod
+ def finalize(self, *args, **kwargs) -> None:
+ """Finalize the evaluation process."""
+ pass
+
  @property
  def name(self) -> str:
  """
@@ -334,6 +345,20 @@ class DataAdapter(LLMJudgeMixin, ABC):
  """
  self._benchmark_meta.shuffle_choices = value

+ @property
+ def review_timeout(self) -> Optional[float]:
+ """
+ Return the timeout for the review process.
+ """
+ return self._benchmark_meta.review_timeout
+
+ @review_timeout.setter
+ def review_timeout(self, value: float):
+ """
+ Set the timeout for the review process.
+ """
+ self._benchmark_meta.review_timeout = value
+
  @contextlib.contextmanager
  def _temporary_attribute(self, attr_name: str, new_value):
  """
@@ -79,6 +79,9 @@ class BenchmarkMeta:
  shuffle_choices: bool = False
  """Whether to shuffle the choices in multiple-choice datasets."""

+ review_timeout: Optional[float] = None
+ """ Timeout for review in seconds."""
+
  extra_params: Dict = field(default_factory=dict)
  """ Additional parameters for the benchmark."""

@@ -54,3 +54,8 @@ class Evaluator(abc.ABC):
  def get_report(self, *args, **kwargs) -> Report:
  """Get the evaluation report."""
  pass
+
+ @abc.abstractmethod
+ def finalize(self, *args, **kwargs) -> None:
+ """Finalize the evaluation process."""
+ pass
@@ -273,3 +273,8 @@ class TaskState:
  def target(self) -> str:
  """The scoring target for this `Sample`."""
  return self._target.text
+
+ @target.setter
+ def target(self, text: str) -> None:
+ """Set the target for review purposes."""
+ self._target = Target(text)
@@ -3,7 +3,7 @@ from pydantic import BaseModel, Field, JsonValue, model_validator
  from typing import Any, Dict, List, Literal, Optional, Type, Union

  from evalscope.api.tool import ToolCall, ToolCallError
- from .content import Content, ContentImage, ContentReasoning, ContentText
+ from .content import Content, ContentAudio, ContentImage, ContentReasoning, ContentText
  from .utils import parse_content_with_reasoning


@@ -225,6 +225,11 @@ def messages_to_markdown(messages: List[ChatMessage], max_length: Optional[int]
  if max_length and len(image_base64) > max_length:
  image_base64 = image_base64[:max_length]
  content_parts.append(f'![image]({image_base64})')
+ elif isinstance(content_item, ContentAudio):
+ audio_base64 = content_item.audio
+ if max_length and len(audio_base64) > max_length:
+ audio_base64 = audio_base64[:max_length]
+ content_parts.append(f"<audio controls src='{audio_base64}'></audio>")
  elif isinstance(content_item, ContentReasoning):
  content_parts.append(f'**Reasoning:** {content_item.reasoning}')

@@ -1 +1,2 @@
  from .llm_judge_mixin import LLMJudgeMixin
+ from .sandbox_mixin import SandboxMixin
@@ -24,6 +24,8 @@ class LLMJudgeMixin:

  self._llm_judge: Optional[LLMJudge] = None

+ super().__init__(task_config=task_config)
+
  @property
  def llm_judge(self) -> Optional[LLMJudge]:
  """Get LLM judge instance with lazy initialization."""
@@ -0,0 +1,204 @@
+ import asyncio
+ import threading
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional
+
+ from evalscope.utils.logger import get_logger
+
+ if TYPE_CHECKING:
+ from ms_enclave.sandbox.manager import SandboxManager
+
+ from evalscope.config import TaskConfig
+
+ logger = get_logger()
+
+
+ class SandboxMixin:
+ """Sandbox mixin for sandboxed code execution."""
+
+ def __init__(self, task_config: 'TaskConfig'):
+ self._task_config = task_config
+
+ self._manager: Optional['SandboxManager'] = None
+ """Sandbox manager instance."""
+
+ self._sandbox_id: Optional[str] = None
+ """Sandbox ID."""
+
+ self._loop: Optional[asyncio.AbstractEventLoop] = None
+ """Event loop for async operations."""
+
+ # Initialize sandbox synchronously by running async methods
+ if self.use_sandbox:
+ self._loop = asyncio.new_event_loop()
+
+ # Start the loop in a separate thread
+ def run_loop():
+ asyncio.set_event_loop(self._loop)
+ self._loop.run_forever()
+
+ self._loop_thread = threading.Thread(target=run_loop, daemon=True)
+ self._loop_thread.start()
+
+ # Wait for initialization
+ future = asyncio.run_coroutine_threadsafe(self._async_init(), self._loop)
+ future.result()
+
+ super().__init__()
+
+ async def _async_init(self):
+ """Async initialization helper."""
+ await self.init_sandbox_manager_async()
+ await self.init_sandbox_async()
+
+ @property
+ def use_sandbox(self) -> bool:
+ """
+ Return whether to use sandbox for the benchmark.
+ """
+ if not self._task_config:
+ return False
+ else:
+ return self._task_config.use_sandbox
+
+ @property
+ def sandbox_manager(self) -> Optional['SandboxManager']:
+ """Get the sandbox manager instance."""
+ return self._manager
+
+ @property
+ def sandbox_id(self) -> Optional[str]:
+ """Get the sandbox ID."""
+ return self._sandbox_id
+
+ async def init_sandbox_manager_async(self) -> Optional['SandboxManager']:
+ """Initialize the sandbox manager asynchronously."""
+ if self._manager is not None:
+ return self._manager
+
+ if not self.use_sandbox:
+ return None
+
+ from ms_enclave.sandbox.manager import HttpSandboxManager, LocalSandboxManager
+
+ manager_config = self._task_config.sandbox_manager_config or {}
+ if manager_config.get('base_url'):
+ # Remote manager
+ self._manager = HttpSandboxManager(**manager_config)
+ else:
+ # Local manager
+ self._manager = LocalSandboxManager(**manager_config)
+
+ await self._manager.start()
+ logger.info('Sandbox manager initialized.')
+ return self._manager
+
+ def init_sandbox_manager(self) -> Optional['SandboxManager']:
+ """Initialize the sandbox manager."""
+ if self._manager is not None:
+ return self._manager
+
+ if not self.use_sandbox:
+ return None
+
+ # Use the dedicated loop if available
+ if self._loop and not self._loop.is_closed():
+ future = asyncio.run_coroutine_threadsafe(self.init_sandbox_manager_async(), self._loop)
+ return future.result()
+ else:
+ # Fallback for cases where no loop is available
+ return asyncio.run(self.init_sandbox_manager_async())
+
+ async def init_sandbox_async(self) -> Optional[str]:
+ """Initialize the sandbox instance asynchronously."""
+ if self._sandbox_id is not None:
+ return self._sandbox_id
+
+ if not self.use_sandbox:
+ return None
+
+ from ms_enclave.sandbox.model import DockerSandboxConfig, SandboxType
+
+ sandbox_config = self._task_config.sandbox_config or DockerSandboxConfig(
+ image='python:3.11-slim', tools_config={
+ 'shell_executor': {},
+ 'python_executor': {}
+ }
+ )
+ sandbox_type = self._task_config.sandbox_type or SandboxType.DOCKER
+
+ self._sandbox_id = await self._manager.create_sandbox(sandbox_type=sandbox_type, config=sandbox_config)
+
+ sandbox_info = await self._manager.get_sandbox_info(self._sandbox_id)
+
+ logger.info(f'Sandbox of type {sandbox_type} initialized. Info: {sandbox_info.model_dump(exclude_none=True)}')
+ return self._sandbox_id
+
+ def init_sandbox(self) -> Optional[str]:
+ """Initialize the sandbox instance."""
+ if self._sandbox_id is not None:
+ return self._sandbox_id
+
+ if not self.use_sandbox:
+ return None
+
+ # Use the dedicated loop if available
+ if self._loop and not self._loop.is_closed():
+ future = asyncio.run_coroutine_threadsafe(self.init_sandbox_async(), self._loop)
+ return future.result()
+ else:
+ # Fallback for cases where no loop is available
+ return asyncio.run(self.init_sandbox_async())
+
+ def execute_code_in_sandbox(self, code: str, timeout: int = 60, language: str = 'python') -> Dict[str, Any]:
+ """Execute code in the sandbox."""
+ if not self._sandbox_id or not self._manager:
+ logger.warning('Sandbox is not initialized.')
+ return {'error': 'Sandbox is not initialized.'}
+
+ from ms_enclave.sandbox.model import ExecutionStatus, ToolResult
+
+ async def _execute_async():
+ if language.lower() == 'python':
+ tool_name = 'python_executor'
+ parameters = {'code': code, 'timeout': timeout}
+ result = await self._manager.execute_tool(self._sandbox_id, tool_name, parameters)
+ elif language.lower() == 'shell':
+ tool_name = 'shell_executor'
+ parameters = {'command': code, 'timeout': timeout}
+ result = await self._manager.execute_tool(self._sandbox_id, tool_name, parameters)
+ else:
+ logger.warning(f"Unsupported language: {language}. Supported languages are 'python' and 'shell'.")
+ result = ToolResult(
+ status=ExecutionStatus.ERROR,
+ tool_name='code_executor',
+ output=f"Unsupported language: {language}. Supported languages are 'python' and 'shell'."
+ )
+ return result
+
+ # Use the dedicated loop if available
+ if self._loop and not self._loop.is_closed():
+ future = asyncio.run_coroutine_threadsafe(_execute_async(), self._loop)
+ result = future.result(timeout + 10) # Add some buffer to the timeout
+ else:
+ # Fallback for cases where no loop is available
+ result = asyncio.run(_execute_async())
+
+ return result.model_dump(exclude_none=True)
+
+ def sandbox_finalize(self, *args, **kwargs):
+ """Finalize the sandbox manager."""
+ if self._manager:
+ try:
+ if self._loop and not self._loop.is_closed():
+ # Stop the manager using the dedicated loop
+ future = asyncio.run_coroutine_threadsafe(self._manager.stop(), self._loop)
+ future.result(timeout=30)
+
+ # Stop the event loop
+ self._loop.call_soon_threadsafe(self._loop.stop)
+ if hasattr(self, '_loop_thread'):
+ self._loop_thread.join(timeout=5)
+
+ logger.info('Sandbox manager finalized.')
+ except Exception as e:
+ logger.warning(f'Error finalizing sandbox manager: {e}')
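
A hedged sketch of how a benchmark adapter might drive this mixin: only `execute_code_in_sandbox` and its return shape (a `ToolResult.model_dump()` dict) are taken from the code above; the helper name and exact result fields are assumptions.

```python
def run_candidate_with_tests(adapter, completion: str, test_code: str) -> bool:
    """Hypothetical helper: run a model completion plus its unit tests via SandboxMixin."""
    program = f'{completion}\n\n{test_code}'
    result = adapter.execute_code_in_sandbox(program, timeout=60, language='python')
    if 'error' in result:
        # Sandbox was never initialized (the mixin returns {'error': ...} in that case).
        return False
    # The dict mirrors ToolResult.model_dump(); ExecutionStatus.ERROR is the failure status
    # shown above, so treat anything that is not an error status as a pass. Field names
    # may differ in ms-enclave and should be checked against that library.
    return 'error' not in str(result.get('status', '')).lower()
```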
@@ -36,9 +36,6 @@ class GenerateConfig(BaseModel):
  stream: Optional[bool] = Field(default=None)
  """Whether to stream the response (default is model specific)."""

- system_message: Optional[str] = Field(default=None)
- """Override the default system message."""
-
  max_tokens: Optional[int] = Field(default=None)
  """The maximum number of tokens that can be generated in the completion (default is model specific)."""

@@ -365,7 +365,7 @@ def get_model(

  logger.info(
  f'Creating model {model} with eval_type={eval_type} '
- f'base_url={base_url}, api_key={api_key}, config={config}, model_args={model_args}'
+ f'base_url={base_url}, config={config.model_dump(exclude_none=True)}, model_args={model_args}'
  )

  # find a matching model type
@@ -1,7 +1,7 @@
  import inspect
  from dataclasses import dataclass
  from docstring_parser import Docstring, parse
- from pydantic import BaseModel, Field
+ from pydantic import BaseModel, Field, field_validator
  from typing import Any, Callable, Dict, List, Literal, Optional, TypeAlias, Union, get_args, get_type_hints

  from evalscope.utils.json_schema import JSONSchema, JSONType, json_schema, python_type_to_json_type
@@ -204,7 +204,12 @@ def create_multi_model_tab(sidebar: 'SidebarComponents', lang: str):
  data_score_df_b, _ = get_single_dataset_df(report_df_b, dataset_name)

  # Get subset choices - should be same for both models
- subsets = data_score_df_a[ReportKey.subset_name].unique().tolist()
+ # Only select the subsets that Cat.0 is not '-'
+ df_for_subsets = data_score_df_a.copy()
+ subsets = sorted(
+ df_for_subsets.loc[df_for_subsets[f'{ReportKey.category_prefix}0'].ne('-'),
+ ReportKey.subset_name].dropna().unique().tolist()
+ )

  return gr.update(choices=subsets, value=None), None

@@ -134,11 +134,17 @@ def create_single_model_tab(sidebar: 'SidebarComponents', lang: str):
  )
  def update_single_report_dataset(dataset_name, report_list):
  logger.debug(f'Updating single report dataset: {dataset_name}')
- report_df = get_data_frame(report_list=report_list)
+ report_df = get_data_frame(report_list=report_list, flatten_metrics=True, flatten_categories=True)
  analysis = get_report_analysis(report_list, dataset_name)
  data_score_df, styler = get_single_dataset_df(report_df, dataset_name)
  data_score_plot = plot_single_dataset_scores(data_score_df)
- subsets = data_score_df[ReportKey.subset_name].unique().tolist()
+ # Only select the subsets that Cat.0 is not '-'
+ df_for_subsets = data_score_df.copy()
+ subsets = sorted(
+ df_for_subsets.loc[df_for_subsets[f'{ReportKey.category_prefix}0'].ne('-'),
+ ReportKey.subset_name].dropna().unique().tolist()
+ )
+
  logger.debug(f'subsets: {subsets}')
  return data_score_plot, styler, gr.update(choices=subsets, value=None), None, analysis
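
The subset-filtering idiom added in the two UI hunks above (keep only rows whose top-level category is set), shown in isolation with made-up column names in place of the `ReportKey` constants:

```python
import pandas as pd

# 'Cat.0' and 'Subset' stand in for f'{ReportKey.category_prefix}0' and ReportKey.subset_name.
df = pd.DataFrame({
    'Cat.0': ['math', '-', 'code', '-'],
    'Subset': ['gsm8k', 'overall', 'humaneval', None],
})

# Keep rows with a real category, drop missing subset names, de-duplicate, and sort
# so the dropdown choices stay stable.
subsets = sorted(df.loc[df['Cat.0'].ne('-'), 'Subset'].dropna().unique().tolist())
print(subsets)  # ['gsm8k', 'humaneval']
```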
@@ -168,9 +168,10 @@ def get_model_prediction(work_dir: str, model_name: str, dataset_name: str, subs
  'Index': str(review_result.index),
  'Input': review_result.input.replace('\n', '\n\n'), # for markdown
  'Metadata': metadata,
- 'Generated': prediction,
+ 'Generated': prediction or '', # Ensure no None value
  'Gold': target,
- 'Pred': extracted_prediction if extracted_prediction != prediction else '*Same as Generated*',
+ 'Pred': (extracted_prediction if extracted_prediction != prediction else '*Same as Generated*')
+ or '', # Ensure no None value
  'Score': score.model_dump(exclude_none=True),
  'NScore': normalize_score(score.main_value)
  }
@@ -18,7 +18,7 @@ logger = get_logger()
  def plot_single_report_scores(df: pd.DataFrame):
  if df is None:
  return None
- logger.debug(f'df: {df}')
+ logger.debug(f'df: \n{df}')
  plot = px.bar(df, x=df[ReportKey.dataset_name], y=df[ReportKey.score], text=df[ReportKey.score])

  width = DEFAULT_BAR_WIDTH if len(df[ReportKey.dataset_name]) <= 5 else None
@@ -36,7 +36,7 @@ def plot_single_report_sunburst(report_list: List[Report]):
  df = get_data_frame(report_list=report_list, flatten_metrics=False)
  categories = sorted([i for i in df.columns if i.startswith(ReportKey.category_prefix)])
  path = [ReportKey.dataset_name] + categories + [ReportKey.subset_name]
- logger.debug(f'df: {df}')
+ logger.debug(f'df: \n{df}')
  df[categories] = df[categories].fillna('default') # NOTE: fillna for empty categories

  plot = px.sunburst(
@@ -87,6 +87,12 @@ def add_argument(parser: argparse.ArgumentParser):
  parser.add_argument('--judge-model-args', type=json.loads, default='{}', help='The judge model args, should be a json string.') # noqa: E501
  parser.add_argument('--judge-worker-num', type=int, default=1, help='The number of workers for the judge model.')
  parser.add_argument('--analysis-report', action='store_true', default=False, help='Generate analysis report for the evaluation results using judge model.') # noqa: E501
+
+ # Sandbox-related arguments
+ parser.add_argument('--use-sandbox', action='store_true', default=False, help='Whether to use sandbox for model evaluation.') # noqa: E501
+ parser.add_argument('--sandbox-type', type=str, default='docker', help='The sandbox type to use.') # noqa: E501
+ parser.add_argument('--sandbox-config', type=json.loads, default='{}', help='The sandbox config, should be a json string.') # noqa: E501
+ parser.add_argument('--sandbox-manager-config', type=json.loads, default='{}', help='The sandbox manager config, should be a json string.') # noqa: E501
  # yapf: enable

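
These new CLI flags map onto the `TaskConfig` fields read by `SandboxMixin` (`use_sandbox`, `sandbox_type`, `sandbox_config`, `sandbox_manager_config`). A rough Python-side equivalent, assuming evalscope's usual `TaskConfig` entry point and with the model id and dataset chosen purely for illustration:

```python
from evalscope.config import TaskConfig

# Sketch only: field names follow the new flags above; model/datasets values are illustrative,
# and the exact TaskConfig signature should be checked against the evalscope documentation.
task = TaskConfig(
    model='Qwen/Qwen2.5-Coder-7B-Instruct',
    datasets=['humaneval'],
    use_sandbox=True,                              # --use-sandbox
    sandbox_type='docker',                         # --sandbox-type
    sandbox_config={'image': 'python:3.11-slim'},  # --sandbox-config '{"image": "python:3.11-slim"}'
)
# The config would then be passed to the normal evaluation entry point (e.g. run_task).
```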