evalscope 0.5.5__tar.gz → 0.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic.

Files changed (219)
  1. {evalscope-0.5.5 → evalscope-0.6.0}/PKG-INFO +15 -13
  2. {evalscope-0.5.5 → evalscope-0.6.0}/README.md +8 -8
  3. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/backend/opencompass/tasks/eval_datasets.py +1 -1
  4. evalscope-0.6.0/evalscope/backend/rag_eval/__init__.py +4 -0
  5. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/backend/rag_eval/backend_manager.py +28 -16
  6. evalscope-0.6.0/evalscope/backend/rag_eval/clip_benchmark/__init__.py +2 -0
  7. evalscope-0.6.0/evalscope/backend/rag_eval/clip_benchmark/arguments.py +34 -0
  8. evalscope-0.6.0/evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +277 -0
  9. evalscope-0.6.0/evalscope/backend/rag_eval/clip_benchmark/task_template.py +119 -0
  10. evalscope-0.6.0/evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +83 -0
  11. evalscope-0.6.0/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +247 -0
  12. evalscope-0.6.0/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +170 -0
  13. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/backend/rag_eval/cmteb/arguments.py +5 -3
  14. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/backend/rag_eval/cmteb/base.py +9 -7
  15. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/backend/rag_eval/cmteb/task_template.py +4 -2
  16. evalscope-0.6.0/evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +61 -0
  17. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +1 -3
  18. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/backend/rag_eval/cmteb/tasks/__init__.py +6 -0
  19. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/backend/rag_eval/ragas/__init__.py +1 -1
  20. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/backend/rag_eval/ragas/arguments.py +15 -5
  21. evalscope-0.6.0/evalscope/backend/rag_eval/ragas/metrics/__init__.py +2 -0
  22. evalscope-0.6.0/evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +91 -0
  23. evalscope-0.6.0/evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +99 -0
  24. evalscope-0.6.0/evalscope/backend/rag_eval/ragas/task_template.py +61 -0
  25. evalscope-0.6.0/evalscope/backend/rag_eval/ragas/tasks/__init__.py +2 -0
  26. evalscope-0.6.0/evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +263 -0
  27. evalscope-0.6.0/evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +72 -0
  28. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/models/api/openai_api.py +2 -2
  29. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/perf/http_client.py +1 -1
  30. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/perf/openai_api.py +2 -0
  31. evalscope-0.6.0/evalscope/preprocess/tokenizers/__init__.py +0 -0
  32. evalscope-0.6.0/evalscope/version.py +4 -0
  33. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope.egg-info/PKG-INFO +15 -13
  34. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope.egg-info/SOURCES.txt +15 -0
  35. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope.egg-info/requires.txt +6 -4
  36. evalscope-0.5.5/evalscope/backend/rag_eval/__init__.py +0 -3
  37. evalscope-0.5.5/evalscope/backend/rag_eval/ragas/task_template.py +0 -117
  38. evalscope-0.5.5/evalscope/version.py +0 -4
  39. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/__init__.py +0 -0
  40. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/backend/__init__.py +0 -0
  41. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/backend/base.py +0 -0
  42. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/backend/opencompass/__init__.py +0 -0
  43. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/backend/opencompass/api_meta_template.py +0 -0
  44. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/backend/opencompass/backend_manager.py +0 -0
  45. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/backend/opencompass/tasks/__init__.py +0 -0
  46. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/backend/opencompass/tasks/eval_api.py +0 -0
  47. {evalscope-0.5.5/evalscope/perf → evalscope-0.6.0/evalscope/backend/rag_eval/clip_benchmark/tasks}/__init__.py +0 -0
  48. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/backend/rag_eval/cmteb/__init__.py +0 -0
  49. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/backend/rag_eval/cmteb/tasks/Classification.py +0 -0
  50. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +0 -0
  51. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +0 -0
  52. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +0 -0
  53. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/backend/rag_eval/cmteb/tasks/STS.py +0 -0
  54. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/backend/vlm_eval_kit/__init__.py +0 -0
  55. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/backend/vlm_eval_kit/backend_manager.py +0 -0
  56. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -0
  57. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/__init__.py +0 -0
  58. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/arc/__init__.py +0 -0
  59. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/arc/ai2_arc.py +0 -0
  60. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/arc/arc_adapter.py +0 -0
  61. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/bbh/__init__.py +0 -0
  62. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/bbh/bbh_adapter.py +0 -0
  63. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +0 -0
  64. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +0 -0
  65. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +0 -0
  66. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +0 -0
  67. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +0 -0
  68. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +0 -0
  69. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +0 -0
  70. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +0 -0
  71. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +0 -0
  72. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +0 -0
  73. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +0 -0
  74. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +0 -0
  75. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +0 -0
  76. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/bbh/cot_prompts/navigate.txt +0 -0
  77. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +0 -0
  78. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +0 -0
  79. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +0 -0
  80. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +0 -0
  81. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +0 -0
  82. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/bbh/cot_prompts/snarks.txt +0 -0
  83. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +0 -0
  84. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +0 -0
  85. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
  86. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
  87. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
  88. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +0 -0
  89. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +0 -0
  90. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/benchmark.py +0 -0
  91. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/ceval/__init__.py +0 -0
  92. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/ceval/ceval_adapter.py +0 -0
  93. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/ceval/ceval_exam.py +0 -0
  94. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/cmmlu/__init__.py +0 -0
  95. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/cmmlu/cmmlu.py +0 -0
  96. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/cmmlu/cmmlu_adapter.py +0 -0
  97. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/competition_math/__init__.py +0 -0
  98. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/competition_math/competition_math.py +0 -0
  99. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -0
  100. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/data_adapter.py +0 -0
  101. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/general_qa/__init__.py +0 -0
  102. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/general_qa/general_qa_adapter.py +0 -0
  103. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/gsm8k/__init__.py +0 -0
  104. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/gsm8k/gsm8k.py +0 -0
  105. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/gsm8k/gsm8k_adapter.py +0 -0
  106. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/hellaswag/__init__.py +0 -0
  107. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/hellaswag/hellaswag.py +0 -0
  108. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/hellaswag/hellaswag_adapter.py +0 -0
  109. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/humaneval/__init__.py +0 -0
  110. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/humaneval/humaneval.py +0 -0
  111. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -0
  112. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/mmlu/__init__.py +0 -0
  113. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/mmlu/mmlu.py +0 -0
  114. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/mmlu/mmlu_adapter.py +0 -0
  115. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/race/__init__.py +0 -0
  116. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/race/race.py +0 -0
  117. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/race/race_adapter.py +0 -0
  118. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/trivia_qa/__init__.py +0 -0
  119. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -0
  120. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +0 -0
  121. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/truthful_qa/__init__.py +0 -0
  122. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -0
  123. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +0 -0
  124. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/cache.py +0 -0
  125. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/cli/__init__.py +0 -0
  126. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/cli/base.py +0 -0
  127. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/cli/cli.py +0 -0
  128. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/cli/start_perf.py +0 -0
  129. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/cli/start_server.py +0 -0
  130. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/config.py +0 -0
  131. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/constants.py +0 -0
  132. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/evaluator/__init__.py +0 -0
  133. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/evaluator/evaluator.py +0 -0
  134. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/evaluator/rating_eval.py +0 -0
  135. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/evaluator/reviewer/__init__.py +0 -0
  136. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/evaluator/reviewer/auto_reviewer.py +0 -0
  137. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/metrics/__init__.py +0 -0
  138. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/metrics/bundled_rouge_score/__init__.py +0 -0
  139. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/metrics/bundled_rouge_score/rouge_scorer.py +0 -0
  140. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/metrics/code_metric.py +0 -0
  141. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/metrics/math_accuracy.py +0 -0
  142. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/metrics/metrics.py +0 -0
  143. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/metrics/rouge_metric.py +0 -0
  144. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/models/__init__.py +0 -0
  145. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/models/api/__init__.py +0 -0
  146. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/models/custom/__init__.py +0 -0
  147. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/models/custom/custom_model.py +0 -0
  148. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/models/dummy_chat_model.py +0 -0
  149. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/models/model.py +0 -0
  150. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/models/model_adapter.py +0 -0
  151. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/models/openai_model.py +0 -0
  152. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/models/template.py +0 -0
  153. {evalscope-0.5.5/evalscope/perf/datasets → evalscope-0.6.0/evalscope/perf}/__init__.py +0 -0
  154. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/perf/_logging.py +0 -0
  155. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/perf/api_plugin_base.py +0 -0
  156. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/perf/custom_api.py +0 -0
  157. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/perf/dashscope_api.py +0 -0
  158. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/perf/dataset_plugin_base.py +0 -0
  159. {evalscope-0.5.5/evalscope/preprocess/tokenizers → evalscope-0.6.0/evalscope/perf/datasets}/__init__.py +0 -0
  160. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/perf/datasets/line_by_line.py +0 -0
  161. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/perf/datasets/longalpaca_12k.py +0 -0
  162. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/perf/datasets/openqa.py +0 -0
  163. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/perf/how_to_analysis_result.py +0 -0
  164. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/perf/plugin_registry.py +0 -0
  165. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/perf/query_parameters.py +0 -0
  166. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/perf/server_sent_event.py +0 -0
  167. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/preprocess/__init__.py +0 -0
  168. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/preprocess/tokenizers/gpt2_tokenizer.py +0 -0
  169. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/registry/__init__.py +0 -0
  170. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/registry/tasks/arc.yaml +0 -0
  171. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/registry/tasks/bbh.yaml +0 -0
  172. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/registry/tasks/bbh_mini.yaml +0 -0
  173. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/registry/tasks/ceval.yaml +0 -0
  174. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/registry/tasks/ceval_mini.yaml +0 -0
  175. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/registry/tasks/cmmlu.yaml +0 -0
  176. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -0
  177. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/registry/tasks/general_qa.yaml +0 -0
  178. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/registry/tasks/gsm8k.yaml +0 -0
  179. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/registry/tasks/mmlu.yaml +0 -0
  180. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/registry/tasks/mmlu_mini.yaml +0 -0
  181. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/run.py +0 -0
  182. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/run_arena.py +0 -0
  183. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/run_ms.py +0 -0
  184. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/summarizer.py +0 -0
  185. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/third_party/__init__.py +0 -0
  186. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/third_party/longbench_write/__init__.py +0 -0
  187. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/third_party/longbench_write/eval.py +0 -0
  188. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/third_party/longbench_write/infer.py +0 -0
  189. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/third_party/longbench_write/longbench_write.py +0 -0
  190. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/third_party/longbench_write/resources/__init__.py +0 -0
  191. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/third_party/longbench_write/resources/judge.txt +0 -0
  192. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/third_party/longbench_write/resources/longbench_write.jsonl +0 -0
  193. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl +0 -0
  194. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl +0 -0
  195. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/third_party/longbench_write/tools/__init__.py +0 -0
  196. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/third_party/longbench_write/tools/data_etl.py +0 -0
  197. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/third_party/longbench_write/utils.py +0 -0
  198. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/third_party/toolbench_static/__init__.py +0 -0
  199. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/third_party/toolbench_static/eval.py +0 -0
  200. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/third_party/toolbench_static/infer.py +0 -0
  201. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/third_party/toolbench_static/llm/__init__.py +0 -0
  202. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -0
  203. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/third_party/toolbench_static/toolbench_static.py +0 -0
  204. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/tools/__init__.py +0 -0
  205. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/tools/combine_reports.py +0 -0
  206. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/tools/gen_mmlu_subject_mapping.py +0 -0
  207. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/tools/rewrite_eval_results.py +0 -0
  208. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/utils/__init__.py +0 -0
  209. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/utils/arena_utils.py +0 -0
  210. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/utils/completion_parsers.py +0 -0
  211. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/utils/logger.py +0 -0
  212. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/utils/task_cfg_parser.py +0 -0
  213. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/utils/task_utils.py +0 -0
  214. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope/utils/utils.py +0 -0
  215. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope.egg-info/dependency_links.txt +0 -0
  216. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope.egg-info/entry_points.txt +0 -0
  217. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope.egg-info/not-zip-safe +0 -0
  218. {evalscope-0.5.5 → evalscope-0.6.0}/evalscope.egg-info/top_level.txt +0 -0
  219. {evalscope-0.5.5 → evalscope-0.6.0}/setup.cfg +0 -0
{evalscope-0.5.5 → evalscope-0.6.0}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: evalscope
- Version: 0.5.5
+ Version: 0.6.0
  Summary: EvalScope: Lightweight LLMs Evaluation Framework
  Home-page: https://github.com/modelscope/evalscope
  Author: ModelScope team
@@ -19,7 +19,7 @@ Requires-Dist: torch
  Requires-Dist: absl-py
  Requires-Dist: accelerate
  Requires-Dist: cachetools
- Requires-Dist: datasets<3.0.0,>=2.18.0
+ Requires-Dist: datasets<=3.0.1,>=3.0.0
  Requires-Dist: editdistance
  Requires-Dist: jsonlines
  Requires-Dist: matplotlib
@@ -52,8 +52,9 @@ Requires-Dist: ms-opencompass>=0.1.1; extra == "opencompass"
  Provides-Extra: vlmeval
  Requires-Dist: ms-vlmeval>=0.0.5; extra == "vlmeval"
  Provides-Extra: rag
- Requires-Dist: ragas; extra == "rag"
  Requires-Dist: mteb>=0.14.16; extra == "rag"
+ Requires-Dist: ragas<0.3,>=0.2.3; extra == "rag"
+ Requires-Dist: webdataset>0.2.0; extra == "rag"
  Provides-Extra: inner
  Requires-Dist: absl-py; extra == "inner"
  Requires-Dist: accelerate; extra == "inner"
@@ -86,7 +87,7 @@ Requires-Dist: torch; extra == "all"
  Requires-Dist: absl-py; extra == "all"
  Requires-Dist: accelerate; extra == "all"
  Requires-Dist: cachetools; extra == "all"
- Requires-Dist: datasets<3.0.0,>=2.18.0; extra == "all"
+ Requires-Dist: datasets<=3.0.1,>=3.0.0; extra == "all"
  Requires-Dist: editdistance; extra == "all"
  Requires-Dist: jsonlines; extra == "all"
  Requires-Dist: matplotlib; extra == "all"
@@ -116,14 +117,18 @@ Requires-Dist: jieba; extra == "all"
  Requires-Dist: rouge-chinese; extra == "all"
  Requires-Dist: ms-opencompass>=0.1.1; extra == "all"
  Requires-Dist: ms-vlmeval>=0.0.5; extra == "all"
- Requires-Dist: ragas; extra == "all"
  Requires-Dist: mteb>=0.14.16; extra == "all"
+ Requires-Dist: ragas<0.3,>=0.2.3; extra == "all"
+ Requires-Dist: webdataset>0.2.0; extra == "all"

- English | [简体中文](README_zh.md)


  ![](docs/en/_static/images/evalscope_logo.png)

+ <p align="center">
+ English | <a href="README_zh.md">简体中文</a>
+ </p>
+
  <p align="center">
  <a href="https://badge.fury.io/py/evalscope"><img src="https://badge.fury.io/py/evalscope.svg" alt="PyPI version" height="18"></a>
  <a href="https://pypi.org/project/evalscope"><img alt="PyPI - Downloads" src="https://static.pepy.tech/badge/evalscope">
@@ -132,7 +137,7 @@ English | [简体中文](README_zh.md)
  <img src='https://readthedocs.org/projects/evalscope-en/badge/?version=latest' alt='Documentation Status' />
  </a>
  <br>
- <a href="https://evalscope.readthedocs.io/en/latest/"><span style="font-size: 16px;">📖 Documents</span></a> &nbsp | &nbsp<a href="https://evalscope.readthedocs.io/zh-cn/latest/"><span style="font-size: 16px;"> 📖 中文文档</span></a>
+ <a href="https://evalscope.readthedocs.io/en/latest/">📖 Documents</a>
  <p>


@@ -146,7 +151,7 @@ English | [简体中文](README_zh.md)
  - [Offline Evaluation](#offline-evaluation)
  - [Arena Mode](#arena-mode)
  - [Model Serving Performance Evaluation](#Model-Serving-Performance-Evaluation)
- - [Leaderboard](#leaderboard)
+

  ## 📝 Introduction

@@ -172,6 +177,8 @@ The architecture includes the following modules:


  ## 🎉 News
+ - 🔥 **[2024.10.31]** The best practice for evaluating Multimodal-RAG has been updated, please check the [📖 Blog](https://evalscope.readthedocs.io/zh-cn/latest/blog/RAG/multimodal_RAG.html#multimodal-rag) for more details.
+ - 🔥 **[2024.10.23]** Supports multimodal RAG evaluation, including the assessment of image-text retrieval using [CLIP_Benchmark](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/clip_benchmark.html), and extends [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html) to support end-to-end multimodal metrics evaluation.
  - 🔥 **[2024.10.8]** Support for RAG evaluation, including independent evaluation of embedding models and rerankers using [MTEB/CMTEB](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html), as well as end-to-end evaluation using [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html).
  - 🔥 **[2024.09.18]** Our documentation has been updated to include a blog module, featuring some technical research and discussions related to evaluations. We invite you to [📖 read it](https://evalscope.readthedocs.io/en/refact_readme/blog/index.html).
  - 🔥 **[2024.09.12]** Support for LongWriter evaluation, which supports 10,000+ word generation. You can use the benchmark [LongBench-Write](evalscope/third_party/longbench_write/README.md) to measure the long output quality as well as the output length.
@@ -368,11 +375,6 @@ A stress testing tool that focuses on large language models and can be customize
  Refer to : Model Serving Performance Evaluation [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test.html)


- ## Leaderboard
- The LLM Leaderboard aims to provide an objective and comprehensive evaluation standard and platform to help researchers and developers understand and compare the performance of models on various tasks on ModelScope.
-
- Refer to : [Leaderboard](https://modelscope.cn/leaderboard/58/ranking?type=free)
-

  ## TO-DO List
  - [x] RAG evaluation
{evalscope-0.5.5 → evalscope-0.6.0}/README.md

@@ -1,8 +1,11 @@
- English | [简体中文](README_zh.md)


  ![](docs/en/_static/images/evalscope_logo.png)

+ <p align="center">
+ English | <a href="README_zh.md">简体中文</a>
+ </p>
+
  <p align="center">
  <a href="https://badge.fury.io/py/evalscope"><img src="https://badge.fury.io/py/evalscope.svg" alt="PyPI version" height="18"></a>
  <a href="https://pypi.org/project/evalscope"><img alt="PyPI - Downloads" src="https://static.pepy.tech/badge/evalscope">
@@ -11,7 +14,7 @@ English | [简体中文](README_zh.md)
  <img src='https://readthedocs.org/projects/evalscope-en/badge/?version=latest' alt='Documentation Status' />
  </a>
  <br>
- <a href="https://evalscope.readthedocs.io/en/latest/"><span style="font-size: 16px;">📖 Documents</span></a> &nbsp | &nbsp<a href="https://evalscope.readthedocs.io/zh-cn/latest/"><span style="font-size: 16px;"> 📖 中文文档</span></a>
+ <a href="https://evalscope.readthedocs.io/en/latest/">📖 Documents</a>
  <p>


@@ -25,7 +28,7 @@ English | [简体中文](README_zh.md)
  - [Offline Evaluation](#offline-evaluation)
  - [Arena Mode](#arena-mode)
  - [Model Serving Performance Evaluation](#Model-Serving-Performance-Evaluation)
- - [Leaderboard](#leaderboard)
+

  ## 📝 Introduction

@@ -51,6 +54,8 @@ The architecture includes the following modules:


  ## 🎉 News
+ - 🔥 **[2024.10.31]** The best practice for evaluating Multimodal-RAG has been updated, please check the [📖 Blog](https://evalscope.readthedocs.io/zh-cn/latest/blog/RAG/multimodal_RAG.html#multimodal-rag) for more details.
+ - 🔥 **[2024.10.23]** Supports multimodal RAG evaluation, including the assessment of image-text retrieval using [CLIP_Benchmark](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/clip_benchmark.html), and extends [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html) to support end-to-end multimodal metrics evaluation.
  - 🔥 **[2024.10.8]** Support for RAG evaluation, including independent evaluation of embedding models and rerankers using [MTEB/CMTEB](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html), as well as end-to-end evaluation using [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html).
  - 🔥 **[2024.09.18]** Our documentation has been updated to include a blog module, featuring some technical research and discussions related to evaluations. We invite you to [📖 read it](https://evalscope.readthedocs.io/en/refact_readme/blog/index.html).
  - 🔥 **[2024.09.12]** Support for LongWriter evaluation, which supports 10,000+ word generation. You can use the benchmark [LongBench-Write](evalscope/third_party/longbench_write/README.md) to measure the long output quality as well as the output length.
@@ -247,11 +252,6 @@ A stress testing tool that focuses on large language models and can be customize
  Refer to : Model Serving Performance Evaluation [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test.html)


- ## Leaderboard
- The LLM Leaderboard aims to provide an objective and comprehensive evaluation standard and platform to help researchers and developers understand and compare the performance of models on various tasks on ModelScope.
-
- Refer to : [Leaderboard](https://modelscope.cn/leaderboard/58/ranking?type=free)
-

  ## TO-DO List
  - [x] RAG evaluation
{evalscope-0.5.5 → evalscope-0.6.0}/evalscope/backend/opencompass/tasks/eval_datasets.py

@@ -50,12 +50,12 @@ with read_base():
      from opencompass.configs.datasets.nq.nq_gen_c788f6 import nq_datasets
      from opencompass.configs.datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
      from opencompass.configs.datasets.cmb.cmb_gen_dfb5c4 import cmb_datasets
-     from opencompass.configs.datasets.bbh.bbh_gen_5b92b0 import bbh_datasets

      # Note: to be supported
      # from opencompass.configs.datasets.flores.flores_gen_806ede import flores_datasets
      # from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import TheoremQA_datasets
      # from opencompass.configs.datasets.commonsenseqa.commonsenseqa_gen_c946f2 import commonsenseqa_datasets
+     # from opencompass.configs.datasets.bbh.bbh_gen_5b92b0 import bbh_datasets


  datasets = []
evalscope-0.6.0/evalscope/backend/rag_eval/__init__.py (new file)

@@ -0,0 +1,4 @@
+ from evalscope.backend.rag_eval.utils.embedding import EmbeddingModel
+ from evalscope.backend.rag_eval.utils.llm import LLM, LocalLLM, ChatOpenAI
+ from evalscope.backend.rag_eval.utils.clip import VisionModel
+ from evalscope.backend.rag_eval.backend_manager import RAGEvalBackendManager
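With the reorganized `__init__.py` above, the RAG-eval utilities become importable from the package root. A minimal usage sketch (illustrative, not taken from the package docs):

    from evalscope.backend.rag_eval import RAGEvalBackendManager, VisionModel, EmbeddingModel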
{evalscope-0.5.5 → evalscope-0.6.0}/evalscope/backend/rag_eval/backend_manager.py

@@ -24,45 +24,57 @@ class RAGEvalBackendManager(BackendManager):
          else:
              logger.error(f"Please install `{module_name}` first")

-     def run_mteb(self):
+     @staticmethod
+     def run_mteb(model_args, eval_args):
          from evalscope.backend.rag_eval.cmteb import ModelArguments, EvalArguments
          from evalscope.backend.rag_eval.cmteb import one_stage_eval, two_stage_eval

-         if len(self.model_args) > 2:
+         if len(model_args) > 2:
              raise ValueError("Not support multiple models yet")

          # Convert arguments to dictionary
-         model_args_list = [ModelArguments(**args).to_dict() for args in self.model_args]
-         eval_args = EvalArguments(**self.eval_args).to_dict()
+         model_args_list = [ModelArguments(**args).to_dict() for args in model_args]
+         eval_args = EvalArguments(**eval_args).to_dict()

          if len(model_args_list) == 1:
              one_stage_eval(model_args_list[0], eval_args)
          else:  # len(model_args_list) == 2
              two_stage_eval(model_args_list[0], model_args_list[1], eval_args)

-     def run_ragas(self):
-         from evalscope.backend.rag_eval.ragas import rag_eval, testset_generation
+     @staticmethod
+     def run_ragas(testset_args, eval_args):
+         from evalscope.backend.rag_eval.ragas import rag_eval
+         from evalscope.backend.rag_eval.ragas.tasks import generate_testset
          from evalscope.backend.rag_eval.ragas import (
              TestsetGenerationArguments,
              EvaluationArguments,
          )

-         if self.testset_args is not None:
-             testset_generation(TestsetGenerationArguments(**self.testset_args))
-         if self.eval_args is not None:
-             rag_eval(EvaluationArguments(**self.eval_args))
+         if testset_args is not None:
+             generate_testset(TestsetGenerationArguments(**testset_args))
+         if eval_args is not None:
+             rag_eval(EvaluationArguments(**eval_args))
+
+     @staticmethod
+     def run_clip_benchmark(args):
+         from evalscope.backend.rag_eval.clip_benchmark import Arguments, evaluate
+
+         evaluate(Arguments(**args))

      def run(self, *args, **kwargs):
          tool = self.config_d.pop("tool")
          if tool.lower() == "mteb":
              self._check_env("mteb")
-             self.model_args = self.config_d["model"]
-             self.eval_args = self.config_d["eval"]
-             self.run_mteb()
+             model_args = self.config_d["model"]
+             eval_args = self.config_d["eval"]
+             self.run_mteb(model_args, eval_args)
          elif tool.lower() == "ragas":
              self._check_env("ragas")
-             self.testset_args = self.config_d.get("testset_generation", None)
-             self.eval_args = self.config_d.get("eval", None)
-             self.run_ragas()
+             testset_args = self.config_d.get("testset_generation", None)
+             eval_args = self.config_d.get("eval", None)
+             self.run_ragas(testset_args, eval_args)
+         elif tool.lower() == "clip_benchmark":
+             self._check_env("webdataset")
+             self.run_clip_benchmark(self.config_d["eval"])
          else:
              raise ValueError(f"Unknown tool: {tool}")
evalscope-0.6.0/evalscope/backend/rag_eval/clip_benchmark/__init__.py (new file)

@@ -0,0 +1,2 @@
+ from evalscope.backend.rag_eval.clip_benchmark.task_template import evaluate
+ from evalscope.backend.rag_eval.clip_benchmark.arguments import Arguments
evalscope-0.6.0/evalscope/backend/rag_eval/clip_benchmark/arguments.py (new file)

@@ -0,0 +1,34 @@
+ from dataclasses import dataclass, field
+ from typing import List, Dict
+
+
+ @dataclass
+ class Arguments:
+     # fmt: off
+     """
+     A dataclass to store and manage the arguments for the model configuration and data processing.
+     """
+     """
+     For CLIP model support, you can use the following fields:
+         model_name: str
+         revision: str = "master"
+         hub: str = "modelscope"
+
+     For API VLM model support, you can use the following fields, (image caption only):
+         model_name="gpt-4o-mini"
+         api_base: str = "",
+         api_key: Optional[str] = None
+         prompt: str = None
+     """
+     models: List[Dict] = field(default_factory=dict)  # List of paths to the pre-trained models or model identifiers
+     dataset_name: List[str] = field(default_factory=list)  # List of dataset names to be used
+     data_dir: str = None  # Root directory where the datasets are stored
+     split: str = "test"  # Split of the dataset to be used (e.g., 'train', 'validation', 'test')
+     task: str = None
+     batch_size: int = 128  # Batch size for data loading
+     num_workers: int = 1  # Number of workers for data loading
+     verbose: bool = True  # Flag to enable verbose logging
+     output_dir: str = "outputs"  # Directory where the outputs (e.g., predictions, logs) will be saved
+     cache_dir: str = "cache"  # Directory where the dataset cache will be stored
+     skip_existing: bool = False  # Flag to skip processing if outputs already exist
+     limit: int = None  # Limit the number of samples to be processed
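The docstring above sketches two model configurations: a CLIP-style model loaded locally or from a hub, and an API vision-language model used only for image captioning. A hedged construction example for the API-captioning case (field names come from the dataclass; the endpoint, key, and data directory are placeholders):

    from evalscope.backend.rag_eval.clip_benchmark import Arguments

    # Illustrative only; concrete values are placeholders.
    caption_args = Arguments(
        models=[{
            "model_name": "gpt-4o-mini",           # API VLM variant from the docstring
            "api_base": "https://example.com/v1",  # placeholder endpoint
            "api_key": "sk-xxxx",                  # placeholder key
        }],
        dataset_name=["custom"],
        data_dir="data/my_retrieval_set",          # assumed local folder with image_queries.jsonl
        task="image_caption",
        batch_size=32,
        limit=100,
    )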
evalscope-0.6.0/evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py (new file)

@@ -0,0 +1,277 @@
+ import os
+ import torch
+ from torch.utils.data import DataLoader, Dataset as TorchDataset
+ from evalscope.utils.logger import get_logger
+
+
+ logger = get_logger()
+
+
+ def build_dataset(
+     dataset_name,
+     root=None,
+     transform=None,
+     split="test",
+     wds_cache_dir=None,
+     **kwargs,
+ ):
+     """
+     Main function to use in order to build a dataset instance,
+
+     dataset_name: str
+         name of the dataset
+
+     root: str
+         root folder where the dataset is downloaded and stored. can be shared among datasets.
+
+     transform: torchvision transform applied to images
+
+     split: str
+         split to use, depending on the dataset can have different options.
+         In general, `train` and `test` are available.
+         For specific splits, please look at the corresponding dataset.
+
+     custom_classname_file: str or None
+         Custom classname file where keys are dataset names and values are list of classnames.
+
+     custom_template_file: str or None
+         Custom template file where keys are dataset names and values are list of prompts, or dicts
+         where keys are classnames and values are class-specific prompts.
+
+     """
+
+     if dataset_name == "dummy":
+         ds = Dummy()
+     elif dataset_name == "custom":
+         ds = build_custom_dataset(dataset_name, data_dir=root, transform=transform)
+     else:
+         # WebDataset support using `webdataset` library
+         ds = build_wds_dataset(
+             dataset_name,
+             transform=transform,
+             split=split,
+             data_dir=root,
+             cache_dir=wds_cache_dir,
+         )
+
+     return ds
+
+
+ class Dummy:
+
+     def __init__(self):
+         self.classes = ["blank image", "noisy image"]
+
+     def __getitem__(self, i):
+         return torch.zeros(3, 224, 224), 0
+
+     def __len__(self):
+         return 1
+
+
+ class DatasetWrapper(TorchDataset):
+     def __init__(self, dataset, transform=None, image_key="image", text_key="query"):
+         self.dataset = dataset
+         self.transform = transform
+         self.image_key = image_key
+         self.text_key = text_key
+
+     def __len__(self):
+         return len(self.dataset)
+
+     def __getitem__(self, idx):
+         item = self.dataset[idx]
+
+         # Load the image
+         image = item[self.image_key]
+         if self.transform is not None:
+             image = self.transform(image, return_tensors="pt")
+
+         # Get the list of queries
+         query = item[self.text_key]
+         if isinstance(query, str):
+             query = [query]
+
+         return image, query
+
+
+ def get_dataset_default_task(dataset):
+     if dataset in (
+         "custom",
+         "muge",
+         "flickr30k",
+         "flickr8k",
+         "mscoco_captions",
+         "mscoco_captions2017",
+         "multilingual_mscoco_captions",
+         "flickr30k-200",
+         "crossmodal3600",
+         "xtd200",
+     ):
+         return "zeroshot_retrieval"
+     else:
+         return "zeroshot_classification"
+
+
+ def get_dataloader(dataset_name, dataset, batch_size, num_workers):
+     if dataset_name == "custom":
+         dataloader = DataLoader(
+             dataset,
+             batch_size=batch_size,
+             shuffle=False,
+             num_workers=num_workers,
+             collate_fn=image_captions_collate_fn,
+         )
+     else:
+         dataloader = DataLoader(
+             dataset.batched(batch_size),
+             batch_size=None,
+             shuffle=False,
+             num_workers=num_workers,
+         )
+     return dataloader
+
+
+ def image_captions_collate_fn(batch):
+     transposed = list(zip(*batch))
+     imgs = transposed[0]
+     texts = transposed[1]
+     return imgs, texts
+
+
+ def build_custom_dataset(dataset_name, data_dir, transform=None):
+     from datasets import load_dataset, Features, Image, Sequence, Value
+
+     qrels_ds = load_dataset(
+         "json",
+         data_files=os.path.join(data_dir, "image_queries.jsonl"),
+         features=Features(
+             {"image_path": Image(decode=True), "query": Sequence(Value("string"))}
+         ),
+         split="train",
+     )
+
+     dataset = DatasetWrapper(
+         qrels_ds, transform, image_key="image_path", text_key="query"
+     )
+     return dataset
+
+
+ def build_wds_dataset(
+     dataset_name, transform, split="test", data_dir="root", cache_dir=None
+ ):
+     """
+     Load a dataset in WebDataset format. Either local paths or HTTP URLs can be specified.
+     Expected file structure is:
+     ```
+     data_dir/
+         train/
+             nshards.txt
+             0.tar
+             1.tar
+             ...
+         test/
+             nshards.txt
+             0.tar
+             1.tar
+             ...
+         classnames.txt
+         zeroshot_classification_templates.txt
+         dataset_type.txt
+     ```
+     Classnames and templates are required for zeroshot classification, while dataset type
+     (equal to "retrieval") is required for zeroshot retrieval datasets.
+
+     You can use the `clip_benchmark_export_wds` or corresponding API
+     (`clip_benchmark.webdataset_builder.convert_dataset`) to convert datasets to this format.
+
+     Set `cache_dir` to a path to cache the dataset, otherwise, no caching will occur.
+     """
+     import webdataset as wds
+
+     def read_txt(fname):
+         if "://" in fname:
+             stream = os.popen("curl -L -s --fail '%s'" % fname, "r")
+             value = stream.read()
+             if stream.close():
+                 raise FileNotFoundError("Failed to retreive data")
+         else:
+             with open(fname, "r") as file:
+                 value = file.read()
+         return value
+
+     if not data_dir:
+         data_dir = f"https://modelscope.cn/datasets/clip-benchmark/wds_{dataset_name}/resolve/master"
+
+     # Git LFS files have a different file path to access the raw data than other files
+     if data_dir.startswith("https://modelscope.cn/datasets"):
+         *split_url_head, _, url_path = data_dir.split("/", 7)
+         url_head = "/".join(split_url_head)
+         metadata_dir = "/".join([url_head, "resolve", url_path])
+         tardata_dir = "/".join([url_head, "resolve", url_path])
+     else:
+         metadata_dir = tardata_dir = data_dir
+     # Get number of shards
+     nshards_fname = os.path.join(metadata_dir, split, "nshards.txt")
+     nshards = int(
+         read_txt(nshards_fname)
+     )  # Do not catch FileNotFound, nshards.txt should be mandatory
+
+     # Get dataset type (classification or retrieval)
+     type_fname = os.path.join(metadata_dir, "dataset_type.txt")
+     try:
+         dataset_type = read_txt(type_fname).strip().lower()
+     except FileNotFoundError:
+         dataset_type = "classification"
+
+     filepattern = os.path.join(tardata_dir, split, "{0..%d}.tar" % (nshards - 1))
+     # Load webdataset (support WEBP, PNG, and JPG for now)
+     if not cache_dir or not isinstance(cache_dir, str):
+         cache_dir = None
+     else:
+         os.makedirs(cache_dir, exist_ok=True)
+     dataset = wds.WebDataset(
+         filepattern,
+         cache_dir=cache_dir,
+         nodesplitter=lambda src: src,
+         shardshuffle=False,
+         verbose=True,
+     ).decode(
+         wds.autodecode.ImageHandler("pil", extensions=["webp", "png", "jpg", "jpeg"])
+     )
+
+     # Load based on classification or retrieval task
+     if dataset_type == "retrieval":
+         dataset = dataset.to_tuple(["webp", "png", "jpg", "jpeg"], "txt").map_tuple(
+             transform, str.splitlines
+         )
+         dataset.classes = dataset.templates = None
+     else:
+         label_type = (
+             "npy" if dataset_type == "multilabel" else "cls"
+         )  # Special case for multilabel
+         dataset = dataset.to_tuple(
+             ["webp", "png", "jpg", "jpeg"], label_type
+         ).map_tuple(transform, None)
+         # Get class names if present
+         classnames_fname = os.path.join(metadata_dir, "classnames.txt")
+         try:
+             dataset.classes = [
+                 line.strip() for line in read_txt(classnames_fname).splitlines()
+             ]
+         except FileNotFoundError:
+             logger.warning("WARNING: classnames.txt not found")
+             dataset.classes = None
+         # Get zeroshot classification templates if present
+         templates_fname = os.path.join(
+             metadata_dir, "zeroshot_classification_templates.txt"
+         )
+         try:
+             dataset.templates = [
+                 line.strip() for line in read_txt(templates_fname).splitlines()
+             ]
+         except FileNotFoundError:
+             logger.warning("WARNING: zeroshot_classification_templates.txt not found")
+             dataset.templates = None
+
+     return dataset
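For the `custom` path above, `build_custom_dataset` expects an `image_queries.jsonl` file under `data_dir`, with an `image_path` string and a list of `query` strings per record. A small sketch of producing such a file (paths and captions are placeholders):

    import json

    records = [
        {"image_path": "images/cat.jpg", "query": ["a cat sitting on a sofa"]},
        {"image_path": "images/dog.jpg", "query": ["a dog running", "a brown dog outdoors"]},
    ]
    with open("data/my_retrieval_set/image_queries.jsonl", "w", encoding="utf-8") as f:
        for rec in records:
            f.write(json.dumps(rec, ensure_ascii=False) + "\n")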
evalscope-0.6.0/evalscope/backend/rag_eval/clip_benchmark/task_template.py (new file)

@@ -0,0 +1,119 @@
+ import os
+ import torch
+ import json
+ from itertools import product
+
+ from evalscope.backend.rag_eval.clip_benchmark.dataset_builder import (
+     build_dataset,
+     get_dataset_default_task,
+     get_dataloader,
+ )
+ from evalscope.backend.rag_eval.clip_benchmark.tasks import (
+     zeroshot_classification,
+     zeroshot_retrieval,
+     image_caption,
+ )
+ from evalscope.backend.rag_eval.clip_benchmark.arguments import Arguments
+ from evalscope.backend.rag_eval.utils.clip import VisionModel
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+
+ def evaluate(args: Arguments):
+     models = args.models
+     dataset_names = args.dataset_name
+     data_dir = args.data_dir
+     split = args.split
+     batch_size = args.batch_size
+     num_workers = args.num_workers
+     verbose = args.verbose
+     input_task = args.task
+     output_dir = args.output_dir
+     cache_dir = args.cache_dir
+     skip_existing = args.skip_existing
+     limit = args.limit
+
+     # Iterate over model and dataset combinations
+     for model_cfg, dataset_name in product(models, dataset_names):
+         task = input_task or get_dataset_default_task(dataset_name)
+         model_name = os.path.basename(model_cfg["model_name"])
+
+         output_path = os.path.join(output_dir, model_name)
+         os.makedirs(output_path, exist_ok=True)
+         output_file = os.path.join(output_path, f"{dataset_name}_{task}.json")
+
+         # Skip evaluation if the result already exists and skip_existing is True
+         if os.path.exists(output_file) and skip_existing:
+             if verbose:
+                 logger.info(f"Skip {output_dir}, exists already.")
+             return
+
+         # Determine device (CPU or GPU)
+         device = "cuda" if torch.cuda.is_available() else "cpu"
+         model_cfg["device"] = device
+         # Initialize the model
+         model = VisionModel.load(**model_cfg)
+
+         # Build the dataset
+         dataset = build_dataset(
+             dataset_name=dataset_name,
+             root=data_dir,
+             transform=model.transform,
+             split=split,
+             wds_cache_dir=f"{cache_dir}/{dataset_name}",
+         )
+
+         # Create the dataloader
+         dataloader = get_dataloader(dataset_name, dataset, batch_size, num_workers)
+
+         # Evaluate based on the task
+         if task == "zeroshot_classification":
+             zeroshot_templates = (
+                 dataset.templates if hasattr(dataset, "templates") else None
+             )
+             if verbose:
+                 logger.info(f"Zero-shot templates: {zeroshot_templates}")
+             classnames = dataset.classes if hasattr(dataset, "classes") else None
+             assert (
+                 zeroshot_templates is not None and classnames is not None
+             ), "Dataset does not support classification"
+             metrics = zeroshot_classification.evaluate(
+                 model,
+                 dataloader,
+                 classnames,
+                 zeroshot_templates,
+                 device=device,
+                 verbose=verbose,
+                 limit=limit,
+             )
+         elif task == "zeroshot_retrieval":
+             metrics = zeroshot_retrieval.evaluate(
+                 model, dataloader, recall_k_list=[5], device=device, limit=limit
+             )
+         elif task == "image_caption":
+             output_path = os.path.join(output_path, dataset_name, "retrieval_data")
+             metrics = image_caption.evaluate(
+                 model, dataloader, limit=limit, output_path=output_path
+             )
+
+         # Prepare dump data
+         dump = {
+             "dataset": dataset_name,
+             "model": model_name,
+             "task": task,
+             "metrics": metrics,
+         }
+
+         if verbose:
+             logger.info(f"Evaluation results: {dump}")
+
+         # Write the results to output file
+         if verbose:
+             logger.info(f"Dump results to: {output_file}")
+         with open(output_file, "w") as f:
+             json.dump(dump, f)
+
+
+ if __name__ == "__main__":
+     evaluate()
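Putting the pieces together, the `evaluate` entry point above iterates over model/dataset pairs, builds the dataset and dataloader, runs the selected task, and writes a JSON report under `output_dir/<model_name>/`. A hedged end-to-end sketch (assumes the `rag` extras such as `webdataset` are installed; the model identifier is a placeholder):

    from evalscope.backend.rag_eval.clip_benchmark import Arguments, evaluate

    evaluate(Arguments(
        models=[{"model_name": "path/or/modelscope-id-of-a-clip-model"}],  # placeholder
        dataset_name=["flickr8k"],  # resolves to the zeroshot_retrieval task by default
        split="test",
        batch_size=128,
        output_dir="outputs",
        cache_dir="cache",
        limit=1000,
    ))
    # Expected result file: outputs/<model_name>/flickr8k_zeroshot_retrieval.json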