evalscope 0.5.0__tar.gz → 0.5.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of evalscope might be problematic.

Files changed (171)
  1. {evalscope-0.5.0 → evalscope-0.5.2}/PKG-INFO +130 -26
  2. {evalscope-0.5.0 → evalscope-0.5.2}/README.md +35 -19
  3. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/backend/opencompass/backend_manager.py +1 -3
  4. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/backend/opencompass/tasks/eval_api.py +1 -0
  5. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/backend/vlm_eval_kit/backend_manager.py +3 -5
  6. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/utils/task_utils.py +1 -1
  7. evalscope-0.5.2/evalscope/version.py +4 -0
  8. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope.egg-info/PKG-INFO +130 -26
  9. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope.egg-info/entry_points.txt +0 -1
  10. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope.egg-info/requires.txt +4 -5
  11. evalscope-0.5.0/evalscope/version.py +0 -4
  12. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/__init__.py +0 -0
  13. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/backend/__init__.py +0 -0
  14. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/backend/base.py +0 -0
  15. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/backend/opencompass/__init__.py +0 -0
  16. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/backend/opencompass/api_meta_template.py +0 -0
  17. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/backend/opencompass/tasks/__init__.py +0 -0
  18. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/backend/opencompass/tasks/eval_datasets.py +0 -0
  19. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/backend/vlm_eval_kit/__init__.py +0 -0
  20. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/__init__.py +0 -0
  21. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/arc/__init__.py +0 -0
  22. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/arc/ai2_arc.py +0 -0
  23. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/arc/arc_adapter.py +0 -0
  24. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/bbh/__init__.py +0 -0
  25. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/bbh/bbh_adapter.py +0 -0
  26. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +0 -0
  27. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +0 -0
  28. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +0 -0
  29. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +0 -0
  30. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +0 -0
  31. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +0 -0
  32. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +0 -0
  33. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +0 -0
  34. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +0 -0
  35. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +0 -0
  36. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +0 -0
  37. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +0 -0
  38. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +0 -0
  39. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/bbh/cot_prompts/navigate.txt +0 -0
  40. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +0 -0
  41. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +0 -0
  42. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +0 -0
  43. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +0 -0
  44. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +0 -0
  45. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/bbh/cot_prompts/snarks.txt +0 -0
  46. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +0 -0
  47. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +0 -0
  48. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
  49. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
  50. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
  51. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +0 -0
  52. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +0 -0
  53. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/benchmark.py +0 -0
  54. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/ceval/__init__.py +0 -0
  55. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/ceval/ceval_adapter.py +0 -0
  56. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/ceval/ceval_exam.py +0 -0
  57. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/cmmlu/__init__.py +0 -0
  58. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/cmmlu/cmmlu.py +0 -0
  59. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/cmmlu/cmmlu_adapter.py +0 -0
  60. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/competition_math/__init__.py +0 -0
  61. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/competition_math/competition_math.py +0 -0
  62. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -0
  63. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/data_adapter.py +0 -0
  64. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/general_qa/__init__.py +0 -0
  65. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/general_qa/general_qa_adapter.py +0 -0
  66. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/gsm8k/__init__.py +0 -0
  67. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/gsm8k/gsm8k.py +0 -0
  68. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/gsm8k/gsm8k_adapter.py +0 -0
  69. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/hellaswag/__init__.py +0 -0
  70. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/hellaswag/hellaswag.py +0 -0
  71. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/hellaswag/hellaswag_adapter.py +0 -0
  72. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/humaneval/__init__.py +0 -0
  73. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/humaneval/humaneval.py +0 -0
  74. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -0
  75. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/mmlu/__init__.py +0 -0
  76. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/mmlu/mmlu.py +0 -0
  77. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/mmlu/mmlu_adapter.py +0 -0
  78. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/race/__init__.py +0 -0
  79. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/race/race.py +0 -0
  80. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/race/race_adapter.py +0 -0
  81. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/trivia_qa/__init__.py +0 -0
  82. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -0
  83. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +0 -0
  84. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/truthful_qa/__init__.py +0 -0
  85. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -0
  86. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +0 -0
  87. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/cache.py +0 -0
  88. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/cli/__init__.py +0 -0
  89. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/cli/base.py +0 -0
  90. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/cli/cli.py +0 -0
  91. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/cli/start_perf.py +0 -0
  92. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/cli/start_server.py +0 -0
  93. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/config.py +0 -0
  94. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/constants.py +0 -0
  95. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/evaluator/__init__.py +0 -0
  96. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/evaluator/evaluator.py +0 -0
  97. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/evaluator/rating_eval.py +0 -0
  98. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/evaluator/reviewer/__init__.py +0 -0
  99. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/evaluator/reviewer/auto_reviewer.py +0 -0
  100. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/metrics/__init__.py +0 -0
  101. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/metrics/bundled_rouge_score/__init__.py +0 -0
  102. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/metrics/bundled_rouge_score/rouge_scorer.py +0 -0
  103. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/metrics/code_metric.py +0 -0
  104. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/metrics/math_accuracy.py +0 -0
  105. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/metrics/metrics.py +0 -0
  106. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/metrics/rouge_metric.py +0 -0
  107. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/models/__init__.py +0 -0
  108. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/models/custom/__init__.py +0 -0
  109. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/models/custom/custom_model.py +0 -0
  110. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/models/dummy_chat_model.py +0 -0
  111. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/models/model.py +0 -0
  112. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/models/model_adapter.py +0 -0
  113. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/models/openai_model.py +0 -0
  114. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/models/template.py +0 -0
  115. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/perf/__init__.py +0 -0
  116. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/perf/_logging.py +0 -0
  117. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/perf/api_plugin_base.py +0 -0
  118. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/perf/custom_api.py +0 -0
  119. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/perf/dashscope_api.py +0 -0
  120. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/perf/dataset_plugin_base.py +0 -0
  121. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/perf/datasets/__init__.py +0 -0
  122. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/perf/datasets/line_by_line.py +0 -0
  123. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/perf/datasets/longalpaca_12k.py +0 -0
  124. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/perf/datasets/openqa.py +0 -0
  125. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/perf/how_to_analysis_result.py +0 -0
  126. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/perf/http_client.py +0 -0
  127. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/perf/openai_api.py +0 -0
  128. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/perf/plugin_registry.py +0 -0
  129. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/perf/query_parameters.py +0 -0
  130. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/perf/server_sent_event.py +0 -0
  131. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/preprocess/__init__.py +0 -0
  132. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/preprocess/tokenizers/__init__.py +0 -0
  133. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/preprocess/tokenizers/gpt2_tokenizer.py +0 -0
  134. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/registry/__init__.py +0 -0
  135. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/registry/tasks/arc.yaml +0 -0
  136. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/registry/tasks/bbh.yaml +0 -0
  137. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/registry/tasks/bbh_mini.yaml +0 -0
  138. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/registry/tasks/ceval.yaml +0 -0
  139. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/registry/tasks/ceval_mini.yaml +0 -0
  140. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/registry/tasks/cmmlu.yaml +0 -0
  141. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -0
  142. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/registry/tasks/general_qa.yaml +0 -0
  143. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/registry/tasks/gsm8k.yaml +0 -0
  144. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/registry/tasks/mmlu.yaml +0 -0
  145. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/registry/tasks/mmlu_mini.yaml +0 -0
  146. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/run.py +0 -0
  147. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/run_arena.py +0 -0
  148. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/run_ms.py +0 -0
  149. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/summarizer.py +0 -0
  150. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/third_party/__init__.py +0 -0
  151. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/third_party/toolbench_static/__init__.py +0 -0
  152. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/third_party/toolbench_static/eval.py +0 -0
  153. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/third_party/toolbench_static/infer.py +0 -0
  154. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/third_party/toolbench_static/llm/__init__.py +0 -0
  155. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -0
  156. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/third_party/toolbench_static/toolbench_static.py +0 -0
  157. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/tools/__init__.py +0 -0
  158. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/tools/combine_reports.py +0 -0
  159. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/tools/gen_mmlu_subject_mapping.py +0 -0
  160. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/tools/rewrite_eval_results.py +0 -0
  161. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/utils/__init__.py +0 -0
  162. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/utils/arena_utils.py +0 -0
  163. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/utils/completion_parsers.py +0 -0
  164. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/utils/logger.py +0 -0
  165. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/utils/task_cfg_parser.py +0 -0
  166. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/utils/utils.py +0 -0
  167. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope.egg-info/SOURCES.txt +0 -0
  168. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope.egg-info/dependency_links.txt +0 -0
  169. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope.egg-info/not-zip-safe +0 -0
  170. {evalscope-0.5.0 → evalscope-0.5.2}/evalscope.egg-info/top_level.txt +0 -0
  171. {evalscope-0.5.0 → evalscope-0.5.2}/setup.cfg +0 -0
{evalscope-0.5.0 → evalscope-0.5.2}/PKG-INFO

@@ -1,13 +1,11 @@
  Metadata-Version: 2.1
  Name: evalscope
- Version: 0.5.0
- Summary: Eval-Scope: Lightweight LLMs Evaluation Framework
- Home-page: https://github.com/modelscope/eval-scope
+ Version: 0.5.2
+ Summary: EvalScope: Lightweight LLMs Evaluation Framework
+ Home-page: https://github.com/modelscope/evalscope
  Author: ModelScope team
  Author-email: contact@modelscope.cn
- License: UNKNOWN
  Keywords: python,llm,evaluation
- Platform: UNKNOWN
  Classifier: Development Status :: 4 - Beta
  Classifier: License :: OSI Approved :: Apache Software License
  Classifier: Operating System :: OS Independent
@@ -17,17 +15,109 @@ Classifier: Programming Language :: Python :: 3.9
  Classifier: Programming Language :: Python :: 3.10
  Requires-Python: >=3.8
  Description-Content-Type: text/markdown
+ Requires-Dist: torch
+ Requires-Dist: absl-py
+ Requires-Dist: accelerate
+ Requires-Dist: cachetools
+ Requires-Dist: editdistance
+ Requires-Dist: jsonlines
+ Requires-Dist: matplotlib
+ Requires-Dist: modelscope[framework]
+ Requires-Dist: nltk
+ Requires-Dist: openai
+ Requires-Dist: pandas
+ Requires-Dist: plotly
+ Requires-Dist: pyarrow
+ Requires-Dist: pympler
+ Requires-Dist: pyyaml
+ Requires-Dist: regex
+ Requires-Dist: requests
+ Requires-Dist: requests-toolbelt
+ Requires-Dist: rouge-score
+ Requires-Dist: sacrebleu
+ Requires-Dist: scikit-learn
+ Requires-Dist: seaborn
+ Requires-Dist: sentencepiece
+ Requires-Dist: simple-ddl-parser
+ Requires-Dist: tabulate
+ Requires-Dist: tiktoken
+ Requires-Dist: tqdm
+ Requires-Dist: transformers<4.43,>=4.33
+ Requires-Dist: transformers_stream_generator
+ Requires-Dist: jieba
+ Requires-Dist: rouge-chinese
  Provides-Extra: opencompass
+ Requires-Dist: ms-opencompass>=0.0.5; extra == "opencompass"
  Provides-Extra: vlmeval
+ Requires-Dist: ms-vlmeval>=0.0.5; extra == "vlmeval"
  Provides-Extra: inner
+ Requires-Dist: absl-py; extra == "inner"
+ Requires-Dist: accelerate; extra == "inner"
+ Requires-Dist: alibaba_itag_sdk; extra == "inner"
+ Requires-Dist: dashscope; extra == "inner"
+ Requires-Dist: editdistance; extra == "inner"
+ Requires-Dist: jsonlines; extra == "inner"
+ Requires-Dist: jsonlines; extra == "inner"
+ Requires-Dist: nltk; extra == "inner"
+ Requires-Dist: openai; extra == "inner"
+ Requires-Dist: pandas==1.5.3; extra == "inner"
+ Requires-Dist: plotly; extra == "inner"
+ Requires-Dist: pyarrow; extra == "inner"
+ Requires-Dist: pyodps; extra == "inner"
+ Requires-Dist: pyyaml; extra == "inner"
+ Requires-Dist: regex; extra == "inner"
+ Requires-Dist: requests==2.28.1; extra == "inner"
+ Requires-Dist: requests-toolbelt==0.10.1; extra == "inner"
+ Requires-Dist: rouge-score; extra == "inner"
+ Requires-Dist: sacrebleu; extra == "inner"
+ Requires-Dist: scikit-learn; extra == "inner"
+ Requires-Dist: seaborn; extra == "inner"
+ Requires-Dist: simple-ddl-parser; extra == "inner"
+ Requires-Dist: streamlit; extra == "inner"
+ Requires-Dist: tqdm; extra == "inner"
+ Requires-Dist: transformers<4.43,>=4.33; extra == "inner"
+ Requires-Dist: transformers_stream_generator; extra == "inner"
  Provides-Extra: all
+ Requires-Dist: torch; extra == "all"
+ Requires-Dist: absl-py; extra == "all"
+ Requires-Dist: accelerate; extra == "all"
+ Requires-Dist: cachetools; extra == "all"
+ Requires-Dist: editdistance; extra == "all"
+ Requires-Dist: jsonlines; extra == "all"
+ Requires-Dist: matplotlib; extra == "all"
+ Requires-Dist: modelscope[framework]; extra == "all"
+ Requires-Dist: nltk; extra == "all"
+ Requires-Dist: openai; extra == "all"
+ Requires-Dist: pandas; extra == "all"
+ Requires-Dist: plotly; extra == "all"
+ Requires-Dist: pyarrow; extra == "all"
+ Requires-Dist: pympler; extra == "all"
+ Requires-Dist: pyyaml; extra == "all"
+ Requires-Dist: regex; extra == "all"
+ Requires-Dist: requests; extra == "all"
+ Requires-Dist: requests-toolbelt; extra == "all"
+ Requires-Dist: rouge-score; extra == "all"
+ Requires-Dist: sacrebleu; extra == "all"
+ Requires-Dist: scikit-learn; extra == "all"
+ Requires-Dist: seaborn; extra == "all"
+ Requires-Dist: sentencepiece; extra == "all"
+ Requires-Dist: simple-ddl-parser; extra == "all"
+ Requires-Dist: tabulate; extra == "all"
+ Requires-Dist: tiktoken; extra == "all"
+ Requires-Dist: tqdm; extra == "all"
+ Requires-Dist: transformers<4.43,>=4.33; extra == "all"
+ Requires-Dist: transformers_stream_generator; extra == "all"
+ Requires-Dist: jieba; extra == "all"
+ Requires-Dist: rouge-chinese; extra == "all"
+ Requires-Dist: ms-opencompass>=0.0.5; extra == "all"
+ Requires-Dist: ms-vlmeval>=0.0.5; extra == "all"

  English | [简体中文](README_zh.md)

  <p align="center">
  <a href="https://pypi.org/project/evalscope"><img alt="PyPI - Downloads" src="https://img.shields.io/pypi/dm/evalscope">
  </a>
- <a href="https://github.com/modelscope/eval-scope/pulls"><img src="https://img.shields.io/badge/PR-welcome-55EB99.svg"></a>
+ <a href="https://github.com/modelscope/evalscope/pulls"><img src="https://img.shields.io/badge/PR-welcome-55EB99.svg"></a>
  <p>

  ## 📖 Table of Content
@@ -42,7 +132,7 @@ English | [简体中文](README_zh.md)

  ## 📝 Introduction

- Large Language Model (LLMs) evaluation has become a critical process for assessing and improving LLMs. To better support the evaluation of large models, we propose the Eval-Scope framework, which includes the following components and features:
+ Large Language Model (LLMs) evaluation has become a critical process for assessing and improving LLMs. To better support the evaluation of large models, we propose the EvalScope framework, which includes the following components and features:

  - Pre-configured common benchmark datasets, including: MMLU, CMMLU, C-Eval, GSM8K, ARC, HellaSwag, TruthfulQA, MATH, HumanEval, etc.
  - Implementation of common evaluation metrics
@@ -55,7 +145,7 @@ Large Language Model (LLMs) evaluation has become a critical process for assessi
  - Visualization tools
  - Model Inference Performance Evaluation [Tutorial](evalscope/perf/README.md)
  - Support for OpenCompass as an Evaluation Backend, featuring advanced encapsulation and task simplification to easily submit tasks to OpenCompass for evaluation.
- - Supports VLMEvalKit as the evaluation backend. It initiates VLMEvalKit's multimodal evaluation tasks through Eval-Scope, supporting various multimodal models and datasets.
+ - Supports VLMEvalKit as the evaluation backend. It initiates VLMEvalKit's multimodal evaluation tasks through EvalScope, supporting various multimodal models and datasets.
  - Full pipeline support: Seamlessly integrate with SWIFT to easily train and deploy model services, initiate evaluation tasks, view evaluation reports, and achieve an end-to-end large model development process.


@@ -76,33 +166,48 @@ Large Language Model (LLMs) evaluation has become a critical process for assessi
  - **[2024.07.31]** Breaking change: The sdk name has been changed from `llmuses` to `evalscope`, please update the sdk name in your code.
  - **[2024.07.26]** Supports **VLMEvalKit** as a third-party evaluation framework, initiating multimodal model evaluation tasks. [User Guide](#vlmevalkit-evaluation-backend) 🔥🔥🔥
  - **[2024.06.29]** Supports **OpenCompass** as a third-party evaluation framework. We have provided a high-level wrapper, supporting installation via pip and simplifying the evaluation task configuration. [User Guide](#opencompass-evaluation-backend) 🔥🔥🔥
- - **[2024.06.13]** Eval-Scope has been updated to version 0.3.x, which supports the ModelScope SWIFT framework for LLMs evaluation. 🚀🚀🚀
+ - **[2024.06.13]** EvalScope has been updated to version 0.3.x, which supports the ModelScope SWIFT framework for LLMs evaluation. 🚀🚀🚀
  - **[2024.06.13]** We have supported the ToolBench as a third-party evaluation backend for Agents evaluation. 🚀🚀🚀



  ## 🛠️ Installation
  ### Install with pip
- 1. create conda environment
+ 1. create conda environment [Optional]
  ```shell
- conda create -n eval-scope python=3.10
- conda activate eval-scope
+ conda create -n evalscope python=3.10
+ conda activate evalscope
  ```

- 2. Install Eval-Scope
+ 2. Install EvalScope
  ```shell
- pip install evalscope
+ pip install evalscope # Installation with Native backend (by default)
+
+ pip install evalscope[opencompass] # Installation with OpenCompass backend
+ pip install evalscope[vlmeval] # Installation with VLMEvalKit backend
+ pip install evalscope[all] # Installation with all backends (Native, OpenCompass, VLMEvalKit)
  ```

+ DEPRECATION WARNING: For 0.4.3 or older versions, please use the following command to install:
+ ```shell
+ pip install llmuses<=0.4.3
+
+ # Usage:
+ from llmuses.run import run_task
+ ...
+
+ ```
+
+
  ### Install from source code
  1. Download source code
  ```shell
- git clone https://github.com/modelscope/eval-scope.git
+ git clone https://github.com/modelscope/evalscope.git
  ```

  2. Install dependencies
  ```shell
- cd eval-scope/
+ cd evalscope/
  pip install -e .
  ```

@@ -146,15 +251,15 @@ print(TemplateType.get_template_name_list())
  ```

  ### Evaluation Backend
- Eval-Scope supports using third-party evaluation frameworks to initiate evaluation tasks, which we call Evaluation Backend. Currently supported Evaluation Backend includes:
- - **Native**: Eval-Scope's own **default evaluation framework**, supporting various evaluation modes including single model evaluation, arena mode, and baseline model comparison mode.
- - [OpenCompass](https://github.com/open-compass/opencompass): Initiate OpenCompass evaluation tasks through Eval-Scope. Lightweight, easy to customize, supports seamless integration with the LLM fine-tuning framework [ModelScope Swift](https://github.com/modelscope/swift).
- - [VLMEvalKit](https://github.com/open-compass/VLMEvalKit): Initiate VLMEvalKit multimodal evaluation tasks through Eval-Scope. Supports various multimodal models and datasets, and offers seamless integration with the LLM fine-tuning framework [ModelScope Swift](https://github.com/modelscope/swift).
- - **ThirdParty**: The third-party task, e.g. [ToolBench](evalscope/thirdparty/toolbench/README.md), you can contribute your own evaluation task to Eval-Scope as third-party backend.
+ EvalScope supports using third-party evaluation frameworks to initiate evaluation tasks, which we call Evaluation Backend. Currently supported Evaluation Backend includes:
+ - **Native**: EvalScope's own **default evaluation framework**, supporting various evaluation modes including single model evaluation, arena mode, and baseline model comparison mode.
+ - [OpenCompass](https://github.com/open-compass/opencompass): Initiate OpenCompass evaluation tasks through EvalScope. Lightweight, easy to customize, supports seamless integration with the LLM fine-tuning framework [ModelScope Swift](https://github.com/modelscope/swift).
+ - [VLMEvalKit](https://github.com/open-compass/VLMEvalKit): Initiate VLMEvalKit multimodal evaluation tasks through EvalScope. Supports various multimodal models and datasets, and offers seamless integration with the LLM fine-tuning framework [ModelScope Swift](https://github.com/modelscope/swift).
+ - **ThirdParty**: The third-party task, e.g. [ToolBench](evalscope/thirdparty/toolbench/README.md), you can contribute your own evaluation task to EvalScope as third-party backend.

  #### OpenCompass Eval-Backend

- To facilitate the use of the OpenCompass evaluation backend, we have customized the OpenCompass source code and named it `ms-opencompass`. This version includes optimizations for evaluation task configuration and execution based on the original version, and it supports installation via PyPI. This allows users to initiate lightweight OpenCompass evaluation tasks through Eval-Scope. Additionally, we have initially opened up API-based evaluation tasks in the OpenAI API format. You can deploy model services using [ModelScope Swift](https://github.com/modelscope/swift), where [swift deploy](https://swift.readthedocs.io/en/latest/LLM/VLLM-inference-acceleration-and-deployment.html) supports using vLLM to launch model inference services.
+ To facilitate the use of the OpenCompass evaluation backend, we have customized the OpenCompass source code and named it `ms-opencompass`. This version includes optimizations for evaluation task configuration and execution based on the original version, and it supports installation via PyPI. This allows users to initiate lightweight OpenCompass evaluation tasks through EvalScope. Additionally, we have initially opened up API-based evaluation tasks in the OpenAI API format. You can deploy model services using [ModelScope Swift](https://github.com/modelscope/swift), where [swift deploy](https://swift.readthedocs.io/en/latest/LLM/VLLM-inference-acceleration-and-deployment.html) supports using vLLM to launch model inference services.


  ##### Installation
@@ -210,7 +315,7 @@ python examples/example_eval_swift_openai_api.py

  #### VLMEvalKit Evaluation Backend

- To facilitate the use of the VLMEvalKit evaluation backend, we have customized the VLMEvalKit source code and named it `ms-vlmeval`. This version encapsulates the configuration and execution of evaluation tasks based on the original version and supports installation via PyPI, allowing users to initiate lightweight VLMEvalKit evaluation tasks through Eval-Scope. Additionally, we support API-based evaluation tasks in the OpenAI API format. You can deploy multimodal model services using ModelScope [swift](https://github.com/modelscope/swift).
+ To facilitate the use of the VLMEvalKit evaluation backend, we have customized the VLMEvalKit source code and named it `ms-vlmeval`. This version encapsulates the configuration and execution of evaluation tasks based on the original version and supports installation via PyPI, allowing users to initiate lightweight VLMEvalKit evaluation tasks through EvalScope. Additionally, we support API-based evaluation tasks in the OpenAI API format. You can deploy multimodal model services using ModelScope [swift](https://github.com/modelscope/swift).

  ##### Installation
  ```shell
@@ -228,7 +333,8 @@ For detailed information about the datasets, please refer to [VLMEvalKit Support
  You can use the following to view the list of dataset names:
  ```python
  from evalscope.backend.vlm_eval_kit import VLMEvalKitBackendManager
- print(f'** All models from VLMEvalKit backend: {VLMEvalKitBackendManager.list(list_supported_VLMs().keys())}')
+ print(f'** All models from VLMEvalKit backend: {VLMEvalKitBackendManager.list_supported_models().keys()}')
+
  ```
  If the dataset file does not exist locally when loading the dataset, it will be automatically downloaded to the `~/LMUData/` directory.

@@ -471,5 +577,3 @@ The LLM Leaderboard aims to provide an objective and comprehensive evaluation st
  - [ ] Auto-reviewer
  - [ ] Qwen-max

-
-
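The PKG-INFO hunks above show that 0.5.2 now declares its full runtime requirement set (plus the `opencompass`, `vlmeval`, `inner`, and `all` extras) directly in the package metadata, including the `transformers<4.43,>=4.33` pin. A minimal sketch for checking what an installed copy actually records, using only the standard library (`importlib.metadata`); it assumes evalscope 0.5.2 is installed in the current environment:

```python
# Verification sketch (not part of the package): read back the Requires-Dist
# entries recorded by the 0.5.2 metadata shown in the diff above.
from importlib.metadata import requires, version

print(version("evalscope"))              # expected: 0.5.2
for req in requires("evalscope") or []:
    if req.startswith("transformers"):
        print(req)                       # expected: transformers<4.43,>=4.33 (plus the extras variants)
```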
{evalscope-0.5.0 → evalscope-0.5.2}/README.md

@@ -3,7 +3,7 @@ English | [简体中文](README_zh.md)
  <p align="center">
  <a href="https://pypi.org/project/evalscope"><img alt="PyPI - Downloads" src="https://img.shields.io/pypi/dm/evalscope">
  </a>
- <a href="https://github.com/modelscope/eval-scope/pulls"><img src="https://img.shields.io/badge/PR-welcome-55EB99.svg"></a>
+ <a href="https://github.com/modelscope/evalscope/pulls"><img src="https://img.shields.io/badge/PR-welcome-55EB99.svg"></a>
  <p>

  ## 📖 Table of Content
@@ -18,7 +18,7 @@ English | [简体中文](README_zh.md)

  ## 📝 Introduction

- Large Language Model (LLMs) evaluation has become a critical process for assessing and improving LLMs. To better support the evaluation of large models, we propose the Eval-Scope framework, which includes the following components and features:
+ Large Language Model (LLMs) evaluation has become a critical process for assessing and improving LLMs. To better support the evaluation of large models, we propose the EvalScope framework, which includes the following components and features:

  - Pre-configured common benchmark datasets, including: MMLU, CMMLU, C-Eval, GSM8K, ARC, HellaSwag, TruthfulQA, MATH, HumanEval, etc.
  - Implementation of common evaluation metrics
@@ -31,7 +31,7 @@ Large Language Model (LLMs) evaluation has become a critical process for assessi
  - Visualization tools
  - Model Inference Performance Evaluation [Tutorial](evalscope/perf/README.md)
  - Support for OpenCompass as an Evaluation Backend, featuring advanced encapsulation and task simplification to easily submit tasks to OpenCompass for evaluation.
- - Supports VLMEvalKit as the evaluation backend. It initiates VLMEvalKit's multimodal evaluation tasks through Eval-Scope, supporting various multimodal models and datasets.
+ - Supports VLMEvalKit as the evaluation backend. It initiates VLMEvalKit's multimodal evaluation tasks through EvalScope, supporting various multimodal models and datasets.
  - Full pipeline support: Seamlessly integrate with SWIFT to easily train and deploy model services, initiate evaluation tasks, view evaluation reports, and achieve an end-to-end large model development process.


@@ -52,33 +52,48 @@ Large Language Model (LLMs) evaluation has become a critical process for assessi
  - **[2024.07.31]** Breaking change: The sdk name has been changed from `llmuses` to `evalscope`, please update the sdk name in your code.
  - **[2024.07.26]** Supports **VLMEvalKit** as a third-party evaluation framework, initiating multimodal model evaluation tasks. [User Guide](#vlmevalkit-evaluation-backend) 🔥🔥🔥
  - **[2024.06.29]** Supports **OpenCompass** as a third-party evaluation framework. We have provided a high-level wrapper, supporting installation via pip and simplifying the evaluation task configuration. [User Guide](#opencompass-evaluation-backend) 🔥🔥🔥
- - **[2024.06.13]** Eval-Scope has been updated to version 0.3.x, which supports the ModelScope SWIFT framework for LLMs evaluation. 🚀🚀🚀
+ - **[2024.06.13]** EvalScope has been updated to version 0.3.x, which supports the ModelScope SWIFT framework for LLMs evaluation. 🚀🚀🚀
  - **[2024.06.13]** We have supported the ToolBench as a third-party evaluation backend for Agents evaluation. 🚀🚀🚀



  ## 🛠️ Installation
  ### Install with pip
- 1. create conda environment
+ 1. create conda environment [Optional]
  ```shell
- conda create -n eval-scope python=3.10
- conda activate eval-scope
+ conda create -n evalscope python=3.10
+ conda activate evalscope
  ```

- 2. Install Eval-Scope
+ 2. Install EvalScope
  ```shell
- pip install evalscope
+ pip install evalscope # Installation with Native backend (by default)
+
+ pip install evalscope[opencompass] # Installation with OpenCompass backend
+ pip install evalscope[vlmeval] # Installation with VLMEvalKit backend
+ pip install evalscope[all] # Installation with all backends (Native, OpenCompass, VLMEvalKit)
  ```

+ DEPRECATION WARNING: For 0.4.3 or older versions, please use the following command to install:
+ ```shell
+ pip install llmuses<=0.4.3
+
+ # Usage:
+ from llmuses.run import run_task
+ ...
+
+ ```
+
+
  ### Install from source code
  1. Download source code
  ```shell
- git clone https://github.com/modelscope/eval-scope.git
+ git clone https://github.com/modelscope/evalscope.git
  ```

  2. Install dependencies
  ```shell
- cd eval-scope/
+ cd evalscope/
  pip install -e .
  ```

@@ -122,15 +137,15 @@ print(TemplateType.get_template_name_list())
  ```

  ### Evaluation Backend
- Eval-Scope supports using third-party evaluation frameworks to initiate evaluation tasks, which we call Evaluation Backend. Currently supported Evaluation Backend includes:
- - **Native**: Eval-Scope's own **default evaluation framework**, supporting various evaluation modes including single model evaluation, arena mode, and baseline model comparison mode.
- - [OpenCompass](https://github.com/open-compass/opencompass): Initiate OpenCompass evaluation tasks through Eval-Scope. Lightweight, easy to customize, supports seamless integration with the LLM fine-tuning framework [ModelScope Swift](https://github.com/modelscope/swift).
- - [VLMEvalKit](https://github.com/open-compass/VLMEvalKit): Initiate VLMEvalKit multimodal evaluation tasks through Eval-Scope. Supports various multimodal models and datasets, and offers seamless integration with the LLM fine-tuning framework [ModelScope Swift](https://github.com/modelscope/swift).
- - **ThirdParty**: The third-party task, e.g. [ToolBench](evalscope/thirdparty/toolbench/README.md), you can contribute your own evaluation task to Eval-Scope as third-party backend.
+ EvalScope supports using third-party evaluation frameworks to initiate evaluation tasks, which we call Evaluation Backend. Currently supported Evaluation Backend includes:
+ - **Native**: EvalScope's own **default evaluation framework**, supporting various evaluation modes including single model evaluation, arena mode, and baseline model comparison mode.
+ - [OpenCompass](https://github.com/open-compass/opencompass): Initiate OpenCompass evaluation tasks through EvalScope. Lightweight, easy to customize, supports seamless integration with the LLM fine-tuning framework [ModelScope Swift](https://github.com/modelscope/swift).
+ - [VLMEvalKit](https://github.com/open-compass/VLMEvalKit): Initiate VLMEvalKit multimodal evaluation tasks through EvalScope. Supports various multimodal models and datasets, and offers seamless integration with the LLM fine-tuning framework [ModelScope Swift](https://github.com/modelscope/swift).
+ - **ThirdParty**: The third-party task, e.g. [ToolBench](evalscope/thirdparty/toolbench/README.md), you can contribute your own evaluation task to EvalScope as third-party backend.

  #### OpenCompass Eval-Backend

- To facilitate the use of the OpenCompass evaluation backend, we have customized the OpenCompass source code and named it `ms-opencompass`. This version includes optimizations for evaluation task configuration and execution based on the original version, and it supports installation via PyPI. This allows users to initiate lightweight OpenCompass evaluation tasks through Eval-Scope. Additionally, we have initially opened up API-based evaluation tasks in the OpenAI API format. You can deploy model services using [ModelScope Swift](https://github.com/modelscope/swift), where [swift deploy](https://swift.readthedocs.io/en/latest/LLM/VLLM-inference-acceleration-and-deployment.html) supports using vLLM to launch model inference services.
+ To facilitate the use of the OpenCompass evaluation backend, we have customized the OpenCompass source code and named it `ms-opencompass`. This version includes optimizations for evaluation task configuration and execution based on the original version, and it supports installation via PyPI. This allows users to initiate lightweight OpenCompass evaluation tasks through EvalScope. Additionally, we have initially opened up API-based evaluation tasks in the OpenAI API format. You can deploy model services using [ModelScope Swift](https://github.com/modelscope/swift), where [swift deploy](https://swift.readthedocs.io/en/latest/LLM/VLLM-inference-acceleration-and-deployment.html) supports using vLLM to launch model inference services.


  ##### Installation
@@ -186,7 +201,7 @@ python examples/example_eval_swift_openai_api.py

  #### VLMEvalKit Evaluation Backend

- To facilitate the use of the VLMEvalKit evaluation backend, we have customized the VLMEvalKit source code and named it `ms-vlmeval`. This version encapsulates the configuration and execution of evaluation tasks based on the original version and supports installation via PyPI, allowing users to initiate lightweight VLMEvalKit evaluation tasks through Eval-Scope. Additionally, we support API-based evaluation tasks in the OpenAI API format. You can deploy multimodal model services using ModelScope [swift](https://github.com/modelscope/swift).
+ To facilitate the use of the VLMEvalKit evaluation backend, we have customized the VLMEvalKit source code and named it `ms-vlmeval`. This version encapsulates the configuration and execution of evaluation tasks based on the original version and supports installation via PyPI, allowing users to initiate lightweight VLMEvalKit evaluation tasks through EvalScope. Additionally, we support API-based evaluation tasks in the OpenAI API format. You can deploy multimodal model services using ModelScope [swift](https://github.com/modelscope/swift).

  ##### Installation
  ```shell
@@ -204,7 +219,8 @@ For detailed information about the datasets, please refer to [VLMEvalKit Support
  You can use the following to view the list of dataset names:
  ```python
  from evalscope.backend.vlm_eval_kit import VLMEvalKitBackendManager
- print(f'** All models from VLMEvalKit backend: {VLMEvalKitBackendManager.list(list_supported_VLMs().keys())}')
+ print(f'** All models from VLMEvalKit backend: {VLMEvalKitBackendManager.list_supported_models().keys()}')
+
  ```
  If the dataset file does not exist locally when loading the dataset, it will be automatically downloaded to the `~/LMUData/` directory.

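Both the PKG-INFO and README hunks repeat the `llmuses` → `evalscope` rename notice and the deprecation note for 0.4.3 and earlier. A hedged migration sketch follows; it assumes `run_task` keeps the same entry point under the new package name (`evalscope/run.py` appears in the unchanged file list above, but only the old `llmuses.run` import is shown explicitly in the README):

```python
# Hedged sketch of handling the SDK rename; the new-style import is an assumption
# based on the unchanged evalscope/run.py module, not something shown in this diff.
try:
    from evalscope.run import run_task   # 0.5.x and later (assumed)
except ImportError:
    from llmuses.run import run_task     # <= 0.4.3, now deprecated

# run_task(task_cfg=...)  # the task configuration itself is unaffected by the rename (assumption)
```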
{evalscope-0.5.0 → evalscope-0.5.2}/evalscope/backend/opencompass/backend_manager.py

@@ -76,9 +76,7 @@ class OpenCompassBackendManager(BackendManager):
      @staticmethod
      def _check_env():
          if is_module_installed('opencompass'):
-             logger.info('Please make sure you have installed the `ms-opencompass`: `pip install ms-opencompass`')
-         else:
-             raise ModuleNotFoundError('Please install the `ms-opencompass` first: `pip install ms-opencompass`')
+             logger.info('Check the OpenCompass environment: OK')

      @staticmethod
      def get_restore_arg(arg_name: str, arg_val: bool):
{evalscope-0.5.0 → evalscope-0.5.2}/evalscope/backend/opencompass/tasks/eval_api.py

@@ -6,6 +6,7 @@ from opencompass.tasks import OpenICLInferTask


  with read_base():
+     from opencompass.configs.summarizers.medium import summarizer
      from evalscope.backend.opencompass.tasks.eval_datasets import datasets

  # 1. Get datasets
{evalscope-0.5.0 → evalscope-0.5.2}/evalscope/backend/vlm_eval_kit/backend_manager.py

@@ -31,7 +31,7 @@ class VLMEvalKitBackendManager(BackendManager):
          from vlmeval.utils.arguments import Arguments as VLMEvalArguments
          self.args = VLMEvalArguments(**self.config_d)

-         self.valid_models = self.list_supported_VLMs()
+         self.valid_models = self.list_supported_models()
          self.valid_model_names = list(self.valid_models.keys())
          self.valid_datasets = self.list_supported_datasets()

@@ -86,7 +86,7 @@ class VLMEvalKitBackendManager(BackendManager):
          return self.get_cmd()

      @staticmethod
-     def list_supported_VLMs():
+     def list_supported_models():
          from vlmeval.config import supported_VLM
          return supported_VLM

@@ -98,9 +98,7 @@ class VLMEvalKitBackendManager(BackendManager):
      @staticmethod
      def _check_env():
          if is_module_installed('vlmeval'):
-             logger.info('Please make sure you have installed the `ms-vlmeval`: `pip install ms-vlmeval`')
-         else:
-             raise ModuleNotFoundError('Please install the `ms-vlmeval` first: `pip install ms-vlmeval`')
+             logger.info('Check VLM Evaluation Kit: Installed')

      @staticmethod
      def get_restore_arg(arg_name: str, arg_val: bool):
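The rename from `list_supported_VLMs()` to `list_supported_models()` matches the corrected README snippet above. A short usage sketch, assuming `ms-vlmeval` is installed so that `vlmeval.config.supported_VLM` can be imported:

```python
# Sketch: enumerate the VLMEvalKit models exposed through the renamed helper.
from evalscope.backend.vlm_eval_kit import VLMEvalKitBackendManager

models = VLMEvalKitBackendManager.list_supported_models()  # formerly list_supported_VLMs()
print(f'** All models from VLMEvalKit backend: {sorted(models.keys())}')
```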
{evalscope-0.5.0 → evalscope-0.5.2}/evalscope/utils/task_utils.py

@@ -3,7 +3,7 @@ from enum import Enum


  class EvalBackend(Enum):
-     # Use native evaluation pipeline of Eval-Scope
+     # Use native evaluation pipeline of EvalScope
      NATIVE = 'Native'

      # Use OpenCompass framework as the evaluation backend
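Only the comment wording changes in `task_utils.py`; `EvalBackend` itself is the selector that the Evaluation Backend section of the README refers to. A tiny sketch, assuming the enum is importable from `evalscope.utils.task_utils` as the changed file path suggests:

```python
# Sketch: the enum whose comment was updated in this release.
from evalscope.utils.task_utils import EvalBackend

print(EvalBackend.NATIVE.value)  # 'Native' -- EvalScope's own evaluation pipeline
```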
evalscope-0.5.2/evalscope/version.py (new file)

@@ -0,0 +1,4 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+
+ __version__ = '0.5.2'
+ __release_datetime__ = '2024-08-06 08:00:00'
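The new `evalscope/version.py` replaces the 0.5.0 copy that is removed in this release (file 11 in the change list). A quick check, assuming evalscope 0.5.2 is installed:

```python
# Sketch: read the version module added in 0.5.2.
from evalscope.version import __version__, __release_datetime__

print(__version__)           # 0.5.2
print(__release_datetime__)  # 2024-08-06 08:00:00
```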