evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (302)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +3 -0
  3. evalscope/api/benchmark/adapters/__init__.py +5 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
  8. evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
  9. evalscope/api/benchmark/benchmark.py +356 -0
  10. evalscope/api/benchmark/meta.py +121 -0
  11. evalscope/api/dataset/__init__.py +2 -0
  12. evalscope/api/dataset/dataset.py +349 -0
  13. evalscope/api/dataset/loader.py +262 -0
  14. evalscope/api/dataset/utils.py +143 -0
  15. evalscope/api/evaluator/__init__.py +3 -0
  16. evalscope/api/evaluator/cache.py +378 -0
  17. evalscope/api/evaluator/evaluator.py +56 -0
  18. evalscope/api/evaluator/state.py +275 -0
  19. evalscope/api/filter/__init__.py +1 -0
  20. evalscope/api/filter/filter.py +72 -0
  21. evalscope/api/messages/__init__.py +12 -0
  22. evalscope/api/messages/chat_message.py +243 -0
  23. evalscope/api/messages/content.py +102 -0
  24. evalscope/api/messages/utils.py +35 -0
  25. evalscope/api/metric/__init__.py +2 -0
  26. evalscope/api/metric/metric.py +55 -0
  27. evalscope/api/metric/scorer.py +113 -0
  28. evalscope/api/mixin/__init__.py +1 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +155 -0
  32. evalscope/api/model/model.py +386 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/app.py +3 -0
  40. evalscope/app/ui/app_ui.py +2 -1
  41. evalscope/app/ui/multi_model.py +50 -25
  42. evalscope/app/ui/single_model.py +26 -14
  43. evalscope/app/utils/data_utils.py +43 -27
  44. evalscope/app/utils/env_utils.py +12 -0
  45. evalscope/app/utils/text_utils.py +14 -14
  46. evalscope/app/utils/visualization.py +9 -4
  47. evalscope/arguments.py +7 -10
  48. evalscope/backend/opencompass/api_meta_template.py +2 -1
  49. evalscope/backend/opencompass/backend_manager.py +6 -5
  50. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  51. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  52. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  53. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  55. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  56. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  57. evalscope/backend/rag_eval/utils/embedding.py +10 -1
  58. evalscope/backend/rag_eval/utils/llm.py +13 -12
  59. evalscope/benchmarks/__init__.py +0 -2
  60. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  61. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  62. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  63. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  64. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  65. evalscope/benchmarks/arena_hard/utils.py +37 -1
  66. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  67. evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
  68. evalscope/benchmarks/bfcl/generation.py +222 -0
  69. evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
  70. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  71. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  72. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  73. evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
  74. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  75. evalscope/benchmarks/docmath/utils.py +4 -5
  76. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  77. evalscope/benchmarks/frames/frames_adapter.py +136 -52
  78. evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
  79. evalscope/benchmarks/general_arena/utils.py +23 -27
  80. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  81. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  82. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  83. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  84. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  85. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  86. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  87. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  88. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  89. evalscope/benchmarks/ifeval/instructions.py +109 -64
  90. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  91. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  92. evalscope/benchmarks/ifeval/utils.py +6 -7
  93. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  94. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  95. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  96. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/math_vista/__init__.py +0 -0
  105. evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
  106. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  107. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  108. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  109. evalscope/benchmarks/mmmu/__init__.py +0 -0
  110. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  111. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  112. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
  113. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  114. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
  115. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  116. evalscope/benchmarks/race/race_adapter.py +33 -119
  117. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  118. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  119. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  120. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  121. evalscope/benchmarks/tau_bench/generation.py +147 -0
  122. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
  123. evalscope/benchmarks/text2image/__init__.py +0 -0
  124. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  125. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  126. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  127. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  128. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  129. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  131. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
  132. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  133. evalscope/cli/cli.py +2 -0
  134. evalscope/cli/start_app.py +7 -1
  135. evalscope/cli/start_perf.py +7 -1
  136. evalscope/cli/start_server.py +6 -3
  137. evalscope/collections/__init__.py +2 -10
  138. evalscope/collections/sampler.py +10 -10
  139. evalscope/collections/schema.py +13 -11
  140. evalscope/config.py +157 -57
  141. evalscope/constants.py +37 -61
  142. evalscope/evaluator/__init__.py +1 -1
  143. evalscope/evaluator/evaluator.py +275 -419
  144. evalscope/filters/__init__.py +2 -0
  145. evalscope/filters/extraction.py +126 -0
  146. evalscope/filters/selection.py +57 -0
  147. evalscope/metrics/__init__.py +13 -13
  148. evalscope/metrics/llm_judge.py +47 -33
  149. evalscope/metrics/math_parser.py +27 -22
  150. evalscope/metrics/metric.py +307 -0
  151. evalscope/metrics/metrics.py +22 -18
  152. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  153. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  154. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  155. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  156. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  157. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  158. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  159. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  160. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  162. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  163. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  184. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  185. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  186. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  187. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  188. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  189. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  190. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  191. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  192. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  193. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  194. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  195. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  196. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  197. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  198. evalscope/models/__init__.py +6 -29
  199. evalscope/models/image_edit_model.py +125 -0
  200. evalscope/models/mockllm.py +65 -0
  201. evalscope/models/model_apis.py +67 -0
  202. evalscope/models/modelscope.py +455 -0
  203. evalscope/models/openai_compatible.py +126 -0
  204. evalscope/models/text2image_model.py +124 -0
  205. evalscope/models/utils/openai.py +701 -0
  206. evalscope/perf/benchmark.py +4 -1
  207. evalscope/perf/http_client.py +4 -2
  208. evalscope/perf/plugin/api/custom_api.py +5 -4
  209. evalscope/perf/plugin/api/openai_api.py +11 -9
  210. evalscope/perf/plugin/datasets/custom.py +2 -1
  211. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  212. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  213. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  214. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  215. evalscope/perf/plugin/datasets/openqa.py +4 -2
  216. evalscope/perf/utils/benchmark_util.py +15 -10
  217. evalscope/perf/utils/db_util.py +9 -6
  218. evalscope/perf/utils/local_server.py +11 -3
  219. evalscope/perf/utils/rich_display.py +16 -10
  220. evalscope/report/__init__.py +2 -3
  221. evalscope/report/combinator.py +18 -12
  222. evalscope/report/generator.py +51 -35
  223. evalscope/report/{utils.py → report.py} +8 -6
  224. evalscope/run.py +33 -47
  225. evalscope/summarizer.py +1 -1
  226. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  227. evalscope/utils/__init__.py +21 -2
  228. evalscope/utils/chat_service.py +3 -2
  229. evalscope/utils/deprecation_utils.py +12 -1
  230. evalscope/utils/function_utils.py +29 -0
  231. evalscope/utils/import_utils.py +23 -1
  232. evalscope/utils/io_utils.py +142 -6
  233. evalscope/utils/json_schema.py +208 -0
  234. evalscope/utils/logger.py +51 -12
  235. evalscope/utils/model_utils.py +11 -7
  236. evalscope/utils/multi_choices.py +288 -0
  237. evalscope/utils/url_utils.py +65 -0
  238. evalscope/version.py +2 -2
  239. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
  240. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
  241. tests/benchmark/test_eval.py +385 -0
  242. tests/benchmark/test_image_edit.py +65 -0
  243. tests/{aigc → benchmark}/test_t2i.py +22 -4
  244. tests/benchmark/test_vlm.py +80 -0
  245. tests/cli/test_all.py +85 -47
  246. tests/cli/test_collection.py +20 -8
  247. tests/cli/test_custom.py +22 -15
  248. tests/cli/test_reasoning.py +81 -0
  249. tests/common.py +73 -0
  250. tests/perf/test_perf.py +4 -2
  251. tests/rag/test_clip_benchmark.py +0 -2
  252. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  253. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
  254. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
  255. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
  256. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
  257. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
  258. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  259. evalscope/benchmarks/benchmark.py +0 -81
  260. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  261. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  262. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  263. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  264. evalscope/benchmarks/data_adapter.py +0 -528
  265. evalscope/benchmarks/filters.py +0 -59
  266. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  267. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  268. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  269. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  270. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  271. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  272. evalscope/benchmarks/race/race.py +0 -104
  273. evalscope/benchmarks/race/samples.jsonl +0 -5
  274. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  275. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  276. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  277. evalscope/benchmarks/utils.py +0 -60
  278. evalscope/collections/evaluator.py +0 -375
  279. evalscope/metrics/completion_parsers.py +0 -227
  280. evalscope/metrics/named_metrics.py +0 -55
  281. evalscope/models/adapters/__init__.py +0 -14
  282. evalscope/models/adapters/base_adapter.py +0 -84
  283. evalscope/models/adapters/bfcl_adapter.py +0 -246
  284. evalscope/models/adapters/chat_adapter.py +0 -207
  285. evalscope/models/adapters/choice_adapter.py +0 -222
  286. evalscope/models/adapters/custom_adapter.py +0 -71
  287. evalscope/models/adapters/server_adapter.py +0 -236
  288. evalscope/models/adapters/t2i_adapter.py +0 -79
  289. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  290. evalscope/models/custom/__init__.py +0 -4
  291. evalscope/models/custom/custom_model.py +0 -50
  292. evalscope/models/custom/dummy_model.py +0 -99
  293. evalscope/models/local_model.py +0 -128
  294. evalscope/models/register.py +0 -41
  295. tests/cli/test_run.py +0 -489
  296. /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
  297. /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
  298. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
  299. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
  300. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
  301. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
  302. /tests/{aigc → benchmark}/__init__.py +0 -0

evalscope/benchmarks/trivia_qa/trivia_qa.py (deleted)
@@ -1,89 +0,0 @@
- # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- import datasets
- import json
- import os
- import pandas as pd
-
- _CITATION = """\
- @article{2017arXivtriviaqa,
-     author = {{Joshi}, Mandar and {Choi}, Eunsol and {Weld},
-     Daniel and {Zettlemoyer}, Luke},
-     title = "{triviaqa: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension}",
-     journal = {arXiv e-prints},
-     year = 2017,
-     eid = {arXiv:1705.03551},
-     pages = {arXiv:1705.03551},
-     archivePrefix = {arXiv},
-     eprint = {1705.03551},
- }
- """
-
- _DESCRIPTION = """\
- TriviaqQA is a reading comprehension dataset containing over 650K question-answer-evidence triples.
- """
-
- _HOMEPAGE = 'https://modelscope.cn/datasets/modelscope/trivia_qa/summary'
-
- _URL = 'https://modelscope.cn/api/v1/datasets/modelscope/trivia_qa/repo?Revision=master&FilePath=trivia_qa.zip'
-
- task_list = ['default']
-
-
- class TriviaQAConfig(datasets.BuilderConfig):
-
-     def __init__(self, **kwargs):
-         super().__init__(version=datasets.Version('1.0.0'), **kwargs)
-
-
- class TriviaQA(datasets.GeneratorBasedBuilder):
-     BUILDER_CONFIGS = [TriviaQAConfig(name=task_name, ) for task_name in task_list]
-
-     def _info(self):
-         features = datasets.Features({
-             'input': [{
-                 'role': datasets.features.Value('string'),
-                 'content': datasets.features.Value('string'),
-             }],
-             'ideal': [datasets.Value('string')],
-         })
-         return datasets.DatasetInfo(
-             description=_DESCRIPTION,
-             features=features,
-             homepage=_HOMEPAGE,
-             citation=_CITATION,
-         )
-
-     def _split_generators(self, dl_manager):
-         data_dir = dl_manager.download_and_extract(_URL)
-         return [
-             datasets.SplitGenerator(
-                 name=datasets.Split.TEST,
-                 gen_kwargs={
-                     'filepath': os.path.join(data_dir, 'trivia_qa/test.jsonl'),
-                 },
-             ),
-             datasets.SplitGenerator(
-                 name=datasets.Split('dev'),
-                 gen_kwargs={
-                     'filepath': os.path.join(data_dir, 'trivia_qa/dev.jsonl'),
-                 },
-             ),
-         ]
-
-     def _generate_examples(self, filepath):
-         with open(filepath, encoding='utf-8') as f:
-             contents = [json.loads(line) for line in f.readlines()]
-             for i, instance in enumerate(contents):
-                 yield i, instance

evalscope/benchmarks/truthful_qa/truthful_qa.py (deleted)
@@ -1,163 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
- # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- """TruthfulQA dataset."""
- # flake8: noqa
-
- import csv
- import datasets
- import json
-
- _CITATION = """\
- @misc{lin2021truthfulqa,
-     title={TruthfulQA: Measuring How Models Mimic Human Falsehoods},
-     author={Stephanie Lin and Jacob Hilton and Owain Evans},
-     year={2021},
-     eprint={2109.07958},
-     archivePrefix={arXiv},
-     primaryClass={cs.CL}
- }
- """
-
- _DESCRIPTION = """\
- TruthfulQA is a benchmark to measure whether a language model is truthful in
- generating answers to questions. The benchmark comprises 817 questions that
- span 38 categories, including health, law, finance and politics. Questions are
- crafted so that some humans would answer falsely due to a false belief or
- misconception. To perform well, models must avoid generating false answers
- learned from imitating human texts.
- """
-
- _HOMEPAGE = 'https://github.com/sylinrl/TruthfulQA'
-
- _LICENSE = 'Apache License 2.0'
-
-
- class TruthfulQaConfig(datasets.BuilderConfig):
-     """BuilderConfig for TruthfulQA."""
-
-     def __init__(self, url, features, **kwargs):
-         """BuilderConfig for TruthfulQA.
-         Args:
-             url: *string*, the url to the configuration's data.
-             features: *list[string]*, list of features that'll appear in the feature dict.
-             **kwargs: keyword arguments forwarded to super.
-         """
-         super().__init__(version=datasets.Version('1.1.0'), **kwargs)
-         self.url = url
-         self.features = features
-
-
- class TruthfulQa(datasets.GeneratorBasedBuilder):
-     """TruthfulQA is a benchmark to measure whether a language model is truthful in generating answers to questions."""
-
-     BUILDER_CONFIGS = [
-         TruthfulQaConfig(
-             name='generation',
-             # url="https://raw.githubusercontent.com/sylinrl/TruthfulQA/013686a06be7a7bde5bf8223943e106c7250123c/TruthfulQA.csv",
-             url='https://modelscope.oss-cn-beijing.aliyuncs.com/open_data/truthful_qa/TruthfulQA.csv',
-             features=datasets.Features({
-                 'type': datasets.Value('string'),
-                 'category': datasets.Value('string'),
-                 'question': datasets.Value('string'),
-                 'best_answer': datasets.Value('string'),
-                 'correct_answers': datasets.features.Sequence(datasets.Value('string')),
-                 'incorrect_answers': datasets.features.Sequence(datasets.Value('string')),
-                 'source': datasets.Value('string'),
-             }),
-             description=
-             "The Generation TruthfulQA (main) task tests a model's ability to generate 1-2 sentence answers for a given question truthfully.",
-         ),
-         TruthfulQaConfig(
-             name='multiple_choice',
-             # url="https://raw.githubusercontent.com/sylinrl/TruthfulQA/013686a06be7a7bde5bf8223943e106c7250123c/data/mc_task.json",
-             url='https://modelscope.oss-cn-beijing.aliyuncs.com/open_data/truthful_qa/mc_task.json',
-             features=datasets.Features({
-                 'question': datasets.Value('string'),
-                 'mc1_targets': {
-                     'choices': datasets.features.Sequence(datasets.Value('string')),
-                     'labels': datasets.features.Sequence(datasets.Value('int32')),
-                 },
-                 'mc2_targets': {
-                     'choices': datasets.features.Sequence(datasets.Value('string')),
-                     'labels': datasets.features.Sequence(datasets.Value('int32')),
-                 },
-             }),
-             description=
-             "The Multiple-Choice TruthfulQA task provides a multiple-choice option to test a model's ability to identify true statements.",
-         ),
-     ]
-
-     def _info(self):
-         return datasets.DatasetInfo(
-             description=_DESCRIPTION,
-             features=self.config.features,
-             homepage=_HOMEPAGE,
-             license=_LICENSE,
-             citation=_CITATION,
-         )
-
-     def _split_generators(self, dl_manager):
-         data_dir = dl_manager.download(self.config.url)
-         return [
-             datasets.SplitGenerator(
-                 name=datasets.Split.VALIDATION,
-                 gen_kwargs={
-                     'filepath': data_dir,
-                 },
-             ),
-         ]
-
-     def _split_csv_list(self, csv_list: str, delimiter: str = ';') -> str:
-         """
-         Splits a csv list field, delimited by `delimiter` (';'), into a list
-         of strings.
-         """
-         csv_list = csv_list.strip().split(delimiter)
-         return [item.strip() for item in csv_list]
-
-     def _generate_examples(self, filepath):
-         if self.config.name == 'multiple_choice':
-             # Multiple choice data is in a `JSON` file.
-             with open(filepath, encoding='utf-8') as f:
-                 contents = json.load(f)
-                 for key, row in enumerate(contents):
-                     yield key, {
-                         'question': row['question'],
-                         'mc1_targets': {
-                             'choices': list(row['mc1_targets'].keys()),
-                             'labels': list(row['mc1_targets'].values()),
-                         },
-                         'mc2_targets': {
-                             'choices': list(row['mc2_targets'].keys()),
-                             'labels': list(row['mc2_targets'].values()),
-                         },
-                     }
-         else:
-             # Generation data is in a `CSV` file.
-             with open(filepath, newline='', encoding='utf-8-sig') as f:
-                 contents = csv.DictReader(f)
-                 for key, row in enumerate(contents):
-                     # Ensure that references exist.
-                     if not row['Correct Answers'] or not row['Incorrect Answers']:
-                         continue
-                     yield key, {
-                         'type': row['Type'],
-                         'category': row['Category'],
-                         'question': row['Question'],
-                         'best_answer': row['Best Answer'],
-                         'correct_answers': self._split_csv_list(row['Correct Answers']),
-                         'incorrect_answers': self._split_csv_list(row['Incorrect Answers']),
-                         'source': row['Source'],
-                     }

evalscope/benchmarks/utils.py (deleted)
@@ -1,60 +0,0 @@
- from dataclasses import asdict, dataclass
- from functools import wraps
- from typing import Dict, List, Optional, Union
-
- from .filters import Filter
-
-
- @dataclass
- class PromptData:
-     data: List[str]
-     index: Optional[Union[int, str]] = 0
-     system_prompt: Optional[str] = None
-     multi_choices: Optional[List[str]] = None
-     id: Optional[str] = None
-     messages: Optional[List[dict]] = None
-     extra_data: Optional[Dict] = None
-
-     def to_dict(self) -> Dict:
-         return {k: v for k, v in asdict(self).items() if v is not None}
-
-
- def preprocess_decorator(func):
-
-     @wraps(func)
-     def wrapper(self, result: str, raw_input_d: dict = None, **kwargs):
-         if result is None:
-             result = ''
-         filters = self.config_kwargs.get('filters', None)
-         if filters:
-             # Apply filters to the result
-             for filter_name, filter_value in filters.items():
-                 result = Filter.apply(filter_name, result, filter_value)
-         return func(self, result, raw_input_d, **kwargs)
-
-     return wrapper
-
-
- def load_file_with_extension(file_path: Union[str, List[str]]) -> List[dict]:
-     """
-     Load a file with a specific extension and return its content as a list of dictionaries.
-     """
-     import json
-     import os
-
-     if isinstance(file_path, str):
-         file_path = [file_path]
-
-     data = []
-     for path in file_path:
-         if not os.path.exists(path):
-             raise FileNotFoundError(f'The file {path} does not exist.')
-
-         with open(path, 'r', encoding='utf-8') as f:
-             if path.endswith('.json'):
-                 data.extend(json.load(f))
-             elif path.endswith('.jsonl'):
-                 data.extend([json.loads(line) for line in f])
-             elif path.endswith('.txt'):
-                 data.extend([{'text': f.read()}])
-     return data

evalscope/collections/evaluator.py (deleted)
@@ -1,375 +0,0 @@
- import json
- import os
- import pandas as pd
- import random
- from collections import defaultdict
- from concurrent.futures import ThreadPoolExecutor, as_completed
- from copy import deepcopy
- from tabulate import tabulate
- from tqdm import tqdm
- from typing import Any, Dict, List
-
- from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.collections.sampler import DatasetEntry
- from evalscope.config import TaskConfig
- from evalscope.constants import AnswerKeys, DataCollection, DumpMode, EvalType
- from evalscope.evaluator import Evaluator
- from evalscope.models import initialize_model_adapter
- from evalscope.report import ReportGenerator
- from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, jsonl_to_list
- from evalscope.utils.logger import get_logger
-
- logger = get_logger()
-
-
- class SimpleEvaluator(Evaluator):
-
-     def __init__(self, dataset_name, data_adapter, model_adapter, task_cfg, outputs):
-         super().__init__(
-             dataset_name_or_path=dataset_name,
-             data_adapter=data_adapter,
-             model_adapter=model_adapter,
-             task_cfg=task_cfg,
-             outputs=outputs)
-
-     def get_answer(self, samples: List[DatasetEntry], infer_cfg: dict) -> List[dict]:
-         input_prompts = [sample.prompt for sample in samples]
-         subset_name = samples[0].subset_name
-         try:
-             # get answer from model
-             answer_ds: List[dict] = self.model_adapter.predict(inputs=input_prompts, infer_cfg=infer_cfg)
-         except Exception as e:
-             logger.error(f'Failed to get answer for {input_prompts}, due to {e}')
-             # if ignore_errors is True, continue to next input
-             if self.task_cfg.ignore_errors:
-                 logger.warning('`ignore_errors` is set to True. Dropping this prompt and continuing with evaluation.')
-                 return [None] * len(samples), samples
-             else:
-                 raise e
-         # process answers
-         answers_list = []
-         for answer_d, input_prompt in zip(answer_ds, input_prompts):
-             answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
-             processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
-             answers_list.append(processed_answer)
-         return answers_list, samples
-
-     def get_review(self, answer_d) -> dict:
-         review_id, reviewer_spec = self._generate_review_id(answer_d)
-         review_d = self._get_review(answer_d=answer_d, review_id=review_id, reviewer_spec=reviewer_spec)
-         return review_d
-
-     def get_score(self, review_d) -> float:
-         metric_score: List[dict] = self.compute_metrics(reviews_list=[review_d])
-         return metric_score
-
-
- class EvaluatorCollection:
-
-     def __init__(self, task_cfg: TaskConfig, data_adapter: DataAdapter, outputs: OutputsStructure, base_model):
-         self.task_cfg = task_cfg
-         self.data_adapter = data_adapter
-         self.outputs = outputs
-         self.model = base_model
-
-         self.dataset, self.dataset_name = self.load()
-         self.dataset_name_map = EvaluatorCollection._init_name_map(self.dataset)
-         self.dataset_id_map = EvaluatorCollection._init_id_map(self.dataset)
-         self.evaluators = self._initialize_evaluators()
-
-     def load(self) -> tuple[List[DatasetEntry], str]:
-         dataset_name = os.path.splitext(os.path.basename(self.data_adapter.dataset_id))[0]
-         raw_dataset = self.data_adapter.load()
-         # random limit the dataset
-         limit = len(raw_dataset)
-         if self.task_cfg.limit is not None:
-             if isinstance(self.task_cfg.limit, int):
-                 limit = self.task_cfg.limit
-             elif isinstance(self.task_cfg.limit, float):
-                 limit = int(len(raw_dataset) * self.task_cfg.limit)
-         raw_dataset = random.sample(raw_dataset, min(limit, len(raw_dataset)))
-         # index dataset
-         datasets = []
-         for sample in raw_dataset:
-             sample['prompt'].update({'index': sample['index']})
-             datasets.append(DatasetEntry(**sample))
-
-         return datasets, dataset_name
-
-     @staticmethod
-     def _init_name_map(dataset: List[DatasetEntry]) -> Dict[str, Dict[str, List[int]]]:
-         dataset_name_map = defaultdict(lambda: defaultdict(list))
-         for sample in dataset:
-             dataset_name, subset_name = sample.dataset_name, sample.subset_name
-             dataset_name_map[dataset_name][subset_name].append(sample.index)
-         return dataset_name_map
-
-     @staticmethod
-     def _init_id_map(dataset: List[DatasetEntry]) -> Dict[int, DatasetEntry]:
-         dataset_id_map = {}
-         for sample in dataset:
-             dataset_id_map[sample.index] = sample
-         return dataset_id_map
-
-     def _initialize_evaluators(self) -> Dict[str, SimpleEvaluator]:
-         evaluators = {}
-         # load dataset args
-         dataset_args = deepcopy(self.task_cfg.dataset_args)
-         common_args = dataset_args.get(DataCollection.NAME, {})
-         for dataset_name in self.dataset_name_map.keys():
-             benchmark = Benchmark.get(dataset_name)
-             model_adapter = initialize_model_adapter(self.task_cfg, benchmark, self.model)
-             # update dataset args
-             cur_dataset_args = dataset_args.get(dataset_name, {})
-             cur_dataset_args.update(common_args)
-             # get data adapter
-             data_adapter = benchmark.get_data_adapter(cur_dataset_args)
-             evaluators[dataset_name] = SimpleEvaluator(dataset_name, data_adapter, model_adapter, self.task_cfg,
-                                                        self.outputs)
-         return evaluators
-
-     def get_report(self, scores):
-         if not scores:
-             return
-
-         def get_dataframe(scores):
-             data = []
-             for dataset_name, data_map in self.dataset_name_map.items():
-                 for subset_name, ids in data_map.items():
-                     for _id in ids:
-                         row_data: DatasetEntry = self.dataset_id_map[_id]
-                         for metric in scores[_id]:
-                             data.append(
-                                 dict(
-                                     task_type=row_data.task_type,
-                                     categories=tuple(row_data.categories),
-                                     dataset_name=dataset_name,
-                                     subset_name=subset_name,
-                                     tags=row_data.tags,
-                                     metric=metric['metric_name'],
-                                     score=metric['score']))
-             return pd.DataFrame(data)
-
-         def aggregate_and_sort(df, group_by_cols):
-             # aggregate by group_by_cols, and calculate average_score and count
-             report_df = df.groupby(group_by_cols) \
-                 .agg(average_score=('score', 'mean'), count=('score', 'size')) \
-                 .reset_index()
-             report_df['average_score'] = report_df['average_score'].round(4)
-             report_df = report_df.sort_values(by='count', ascending=False) \
-                 .to_dict(orient='records')
-             return report_df
-
-         df = get_dataframe(scores)
-
-         # multi-level aggregation
-         subset_report_df = aggregate_and_sort(df, ['task_type', 'metric', 'dataset_name', 'subset_name'])
-         dataset_report_df = aggregate_and_sort(df, ['task_type', 'metric', 'dataset_name'])
-         task_report_df = aggregate_and_sort(df, ['task_type', 'metric'])
-
-         # explode tags to multiple rows
-         df_exploded_tags = df.explode('tags')
-         tag_report_df = aggregate_and_sort(df_exploded_tags, ['tags', 'metric'])
-
-         # process multi-level categories
-         df_categories = df.copy()
-         # multi-level aggregation for categories
-         max_depth = df_categories['categories'].apply(len).max()
-         for level in range(max_depth):
-             df_categories[f'category{level}'] = df_categories['categories'].apply(lambda x: x[level]
-                                                                                   if len(x) > level else '')
-         category_report_df = aggregate_and_sort(df_categories,
-                                                 [f'category{level}' for level in range(max_depth)] + ['metric'])
-
-         # convert to dict format
-         report_dict = {
-             'subset_level': subset_report_df,
-             'dataset_level': dataset_report_df,
-             'task_level': task_report_df,
-             'tag_level': tag_report_df,
-             'category_level': category_report_df,
-         }
-
-         # record report
-         for level, data in report_dict.items():
-             table = tabulate(data, headers='keys', tablefmt='pretty', showindex=False)
-             logger.info(f'{level} Report:\n{table}')
-
-         report = ReportGenerator.gen_collection_report(df, self.dataset_name, self.task_cfg.model_id)
-         # Make report analysis
-         if self.task_cfg.analysis_report:
-             logger.info('Generating report analysis, please wait ...')
-             analysis = report.generate_analysis(self.task_cfg.judge_model_args)
-             logger.info('Report analysis:\n%s', analysis)
-         else:
-             logger.info('Skipping report analysis (`analysis_report=False`).')
-
-         # save report to JSON file
-         report_file_path = os.path.join(self.outputs.reports_dir, self.task_cfg.model_id, f'{self.dataset_name}.json')
-         report.to_json(report_file_path)
-
-         logger.info(f'Report saved to {report_file_path}')
-         return report
-
-     def _filter_answer(self, pred_file_path):
-         answer_dict = defaultdict(dict)
-         if self.task_cfg.use_cache and os.path.exists(pred_file_path):
-             answers_list = jsonl_to_list(pred_file_path)
-             # Create a set of sample indices for which we have answers
-             indices = set()
-             for answer in answers_list:
-                 index = answer.get(AnswerKeys.INDEX)
-                 answer_dict[index] = answer
-                 indices.add(index)
-
-             # Filter dataset to only include samples that don't have answers
-             data = [sample for sample in self.dataset if sample.index not in indices]
-
-             # Initialize name map for the filtered dataset
-             data_map = self._init_name_map(data)
-
-             logger.info(f'Reuse from {pred_file_path}. Loaded {len(indices)} samples, remain {len(data)} samples.')
-             return answer_dict, data, data_map
-         else:
-             # If cache isn't enabled or file doesn't exist, return the full dataset
-             return answer_dict, self.dataset, self.dataset_name_map
-
-     def get_answers(self):
-         pred_file_path = os.path.join(self.outputs.predictions_dir, self.task_cfg.model_id,
-                                       f'{self.dataset_name}.jsonl')
-         os.makedirs(os.path.dirname(pred_file_path), exist_ok=True)
-
-         answers, dataset, dataset_name_map = self._filter_answer(pred_file_path)
-
-         eval_batch_size = self.task_cfg.eval_batch_size
-         # Process samples and get answers
-         with tqdm(total=len(dataset), desc='Getting answers') as pbar:
-             if self.task_cfg.eval_type == EvalType.SERVICE:
-                 # Create a thread pool for parallel processing
-                 with ThreadPoolExecutor(max_workers=eval_batch_size) as executor:
-                     futures = []
-                     for sample in dataset:
-                         evaluator = self.evaluators[sample.dataset_name]
-                         futures.append(executor.submit(evaluator.get_answer, [sample], self.task_cfg.generation_config))
-                     # Process completed tasks
-                     for future in as_completed(futures):
-                         answer_list, samples = future.result()
-                         for answer_d, sample in zip(answer_list, samples):
-                             if answer_d is None:
-                                 continue
-                             answers[sample.index] = answer_d
-                             dump_jsonl_data([answer_d], pred_file_path, dump_mode=DumpMode.APPEND)
-                         pbar.update(1)
-             else:
-                 for dataset_name, data_map in dataset_name_map.items():
-                     # get evaluator for the dataset
-                     evaluator = self.evaluators[dataset_name]
-                     for subset_name, ids in data_map.items():
-                         for i in range(0, len(ids), eval_batch_size):
-                             # get batch samples
-                             batch_ids = ids[i:i + eval_batch_size]
-                             batch_samples = [self.dataset_id_map[_id] for _id in batch_ids]
-                             answer_list, samples = evaluator.get_answer(batch_samples, self.task_cfg.generation_config)
-                             # update answers
-                             for answer_d, sample in zip(answer_list, samples):
-                                 if answer_d is None:
-                                     continue
-                                 answers[sample.index] = answer_d
-                                 dump_jsonl_data([answer_d], pred_file_path, dump_mode=DumpMode.APPEND)
-                             pbar.update(1)
-         return answers
-
-     def get_reviews(self, answers: Dict[int, Any]) -> Dict[int, Any]:
-         """
-         Retrieve or generate reviews for given answers.
-
-         Args:
-             answers: Dictionary of answers indexed by sample index.
-
-         Returns:
-             Dictionary of reviews indexed by sample index.
-         """
-         # Set up the review file path
-         review_file_path = os.path.join(self.outputs.reviews_dir, self.task_cfg.model_id)
-         os.makedirs(review_file_path, exist_ok=True)
-
-         review_history_map = defaultdict(dict)
-
-         # Handle caching logic
-         if os.path.exists(review_file_path):
-             if not self.task_cfg.use_cache:
-                 # Clear existing reviews if not using cache
-                 self._clear_review_files(review_file_path)
-             else:
-                 # Load existing reviews if using cache
-                 self._load_existing_reviews(review_file_path, review_history_map)
-
-         reviews = {}
-         for sample in tqdm(self.dataset, desc='Getting reviews'):
-             try:
-                 file_name = f'{self.dataset_name}_{sample.dataset_name}_{sample.subset_name}.jsonl'
-
-                 if self.task_cfg.use_cache and sample.index in review_history_map.get(file_name, {}):
-                     # Use cached review if available
-                     review_d = review_history_map[file_name][sample.index]
-                 else:
-                     # Generate new review
-                     evaluator = self.evaluators[sample.dataset_name]
-                     review_d = evaluator.get_review(answers[sample.index])
-                     # Only save the review if it's not in the cache
-                     self._save_review(review_file_path, file_name, review_d)
-
-                 reviews[sample.index] = review_d
-             except Exception as e:
-                 logger.error(f'Error getting review for sample index {sample.index}: {e}. Skipping this sample.')
-
-         return reviews
-
-     def _clear_review_files(self, review_file_path: str) -> None:
-         """Clear existing review files."""
-         if os.path.isdir(review_file_path):
-             for filename in os.listdir(review_file_path):
-                 file_path = os.path.join(review_file_path, filename)
-                 try:
-                     if os.path.isfile(file_path):
-                         os.remove(file_path)
-                 except Exception as e:
-                     logger.error(f'Error deleting file {file_path}: {e}')
-         else:
-             os.remove(review_file_path)
-
-     def _load_existing_reviews(self, review_file_path: str, review_history_map: Dict[str, Dict[int, Any]]) -> None:
-         """Load existing reviews from files."""
-         logger.info(f'use_cache={self.task_cfg.use_cache}, reloading the review file: {review_file_path}')
-         if os.path.isdir(review_file_path):
-             for filename in os.listdir(review_file_path):
-                 if '.ipynb_checkpoints' in filename:
-                     continue
-                 file_path = os.path.join(review_file_path, filename)
-                 with open(file_path, 'r') as f:
-                     review_history = [json.loads(line.strip()) for line in f]
-                 review_history_map[filename] = {item['index']: item for item in review_history}
-
-     def _save_review(self, review_file_path: str, file_name: str, review_d: Dict[str, Any]) -> None:
-         """Save a single review to file."""
-         file_path = os.path.join(review_file_path, file_name)
-         dump_jsonl_data(review_d, file_path, dump_mode=DumpMode.APPEND)
-
-     def get_scores(self, reviews) -> float:
-         scores = defaultdict(dict)
-         for sample in tqdm(self.dataset, desc='Getting scores'):
-             evaluator = self.evaluators[sample.dataset_name]
-             if sample.index not in reviews:
-                 continue
-             review_d = reviews[sample.index]
-             score = evaluator.get_score(review_d)
-             scores[sample.index] = score
-
-         return scores
-
-     def eval(self, **kwargs):
-         answers = self.get_answers()
-         reviews = self.get_reviews(answers)
-         scores = self.get_scores(reviews)
-         report = self.get_report(scores)
-         return report