evalscope 0.13.0__tar.gz → 0.13.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (352)
  1. {evalscope-0.13.0/evalscope.egg-info → evalscope-0.13.1}/PKG-INFO +33 -30
  2. {evalscope-0.13.0 → evalscope-0.13.1}/README.md +32 -29
  3. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/collections/evaluator.py +1 -1
  4. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/config.py +5 -2
  5. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/constants.py +1 -0
  6. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/evaluator/evaluator.py +5 -4
  7. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/models/custom_adapter.py +1 -1
  8. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/perf/arguments.py +11 -40
  9. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/perf/benchmark.py +34 -28
  10. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/perf/main.py +1 -1
  11. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/perf/plugin/datasets/__init__.py +1 -0
  12. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/perf/plugin/datasets/openqa.py +6 -11
  13. evalscope-0.13.1/evalscope/perf/plugin/datasets/random_dataset.py +51 -0
  14. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/perf/utils/db_util.py +3 -0
  15. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/run.py +14 -2
  16. evalscope-0.13.1/evalscope/version.py +4 -0
  17. {evalscope-0.13.0 → evalscope-0.13.1/evalscope.egg-info}/PKG-INFO +33 -30
  18. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope.egg-info/SOURCES.txt +1 -0
  19. {evalscope-0.13.0 → evalscope-0.13.1}/tests/cli/test_run.py +41 -11
  20. {evalscope-0.13.0 → evalscope-0.13.1}/tests/perf/test_perf.py +23 -0
  21. evalscope-0.13.0/evalscope/version.py +0 -4
  22. {evalscope-0.13.0 → evalscope-0.13.1}/LICENSE +0 -0
  23. {evalscope-0.13.0 → evalscope-0.13.1}/MANIFEST.in +0 -0
  24. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/__init__.py +0 -0
  25. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/arguments.py +0 -0
  26. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/__init__.py +0 -0
  27. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/base.py +0 -0
  28. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/opencompass/__init__.py +0 -0
  29. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/opencompass/api_meta_template.py +0 -0
  30. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/opencompass/backend_manager.py +0 -0
  31. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/opencompass/tasks/__init__.py +0 -0
  32. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/opencompass/tasks/eval_api.py +0 -0
  33. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/opencompass/tasks/eval_datasets.py +0 -0
  34. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/__init__.py +0 -0
  35. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/backend_manager.py +0 -0
  36. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/clip_benchmark/__init__.py +0 -0
  37. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/clip_benchmark/arguments.py +0 -0
  38. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +0 -0
  39. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/clip_benchmark/task_template.py +0 -0
  40. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/__init__.py +0 -0
  41. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +0 -0
  42. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -0
  43. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +0 -0
  44. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +0 -0
  45. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt +0 -0
  46. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/cmteb/__init__.py +0 -0
  47. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/cmteb/arguments.py +0 -0
  48. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/cmteb/base.py +0 -0
  49. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/cmteb/task_template.py +0 -0
  50. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/cmteb/tasks/Classification.py +0 -0
  51. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +0 -0
  52. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +0 -0
  53. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +0 -0
  54. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +0 -0
  55. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +0 -0
  56. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/cmteb/tasks/STS.py +0 -0
  57. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/cmteb/tasks/__init__.py +0 -0
  58. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/ragas/__init__.py +0 -0
  59. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/ragas/arguments.py +0 -0
  60. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -0
  61. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/ragas/task_template.py +0 -0
  62. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/ragas/tasks/__init__.py +0 -0
  63. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +0 -0
  64. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/ragas/tasks/build_transform.py +0 -0
  65. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +0 -0
  66. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -0
  67. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/utils/__init__.py +0 -0
  68. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/utils/clip.py +0 -0
  69. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/utils/embedding.py +0 -0
  70. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/utils/llm.py +0 -0
  71. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/utils/tools.py +0 -0
  72. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/vlm_eval_kit/__init__.py +0 -0
  73. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/vlm_eval_kit/backend_manager.py +0 -0
  74. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -0
  75. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/__init__.py +0 -0
  76. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/aime/__init__.py +0 -0
  77. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/aime/aime24_adapter.py +0 -0
  78. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/aime/aime25_adapter.py +0 -0
  79. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/arc/__init__.py +0 -0
  80. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/arc/ai2_arc.py +0 -0
  81. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/arc/arc_adapter.py +0 -0
  82. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/bbh/__init__.py +0 -0
  83. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/bbh/bbh_adapter.py +0 -0
  84. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +0 -0
  85. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +0 -0
  86. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +0 -0
  87. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +0 -0
  88. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +0 -0
  89. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +0 -0
  90. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +0 -0
  91. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +0 -0
  92. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +0 -0
  93. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +0 -0
  94. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +0 -0
  95. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +0 -0
  96. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +0 -0
  97. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/bbh/cot_prompts/navigate.txt +0 -0
  98. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +0 -0
  99. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +0 -0
  100. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +0 -0
  101. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +0 -0
  102. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +0 -0
  103. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/bbh/cot_prompts/snarks.txt +0 -0
  104. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +0 -0
  105. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +0 -0
  106. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
  107. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
  108. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
  109. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +0 -0
  110. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +0 -0
  111. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/benchmark.py +0 -0
  112. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/ceval/__init__.py +0 -0
  113. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/ceval/ceval_adapter.py +0 -0
  114. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/ceval/ceval_exam.py +0 -0
  115. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
  116. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +0 -0
  117. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/cmmlu/__init__.py +0 -0
  118. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/cmmlu/cmmlu.py +0 -0
  119. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/cmmlu/cmmlu_adapter.py +0 -0
  120. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/cmmlu/samples.jsonl +0 -0
  121. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/competition_math/__init__.py +0 -0
  122. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/competition_math/competition_math.py +0 -0
  123. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -0
  124. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/data_adapter.py +0 -0
  125. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/data_collection/__init__.py +0 -0
  126. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -0
  127. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/general_mcq/__init__.py +0 -0
  128. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/general_mcq/general_mcq_adapter.py +0 -0
  129. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/general_qa/__init__.py +0 -0
  130. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/general_qa/general_qa_adapter.py +0 -0
  131. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/gpqa/__init__.py +0 -0
  132. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/gpqa/chain_of_thought.txt +0 -0
  133. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/gpqa/gpqa_adapter.py +0 -0
  134. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/gsm8k/__init__.py +0 -0
  135. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/gsm8k/gsm8k.py +0 -0
  136. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/gsm8k/gsm8k_adapter.py +0 -0
  137. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/hellaswag/__init__.py +0 -0
  138. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/hellaswag/hellaswag.py +0 -0
  139. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/hellaswag/hellaswag_adapter.py +0 -0
  140. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/humaneval/__init__.py +0 -0
  141. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/humaneval/humaneval.py +0 -0
  142. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -0
  143. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/ifeval/__init__.py +0 -0
  144. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/ifeval/ifeval_adapter.py +0 -0
  145. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/ifeval/instructions.py +0 -0
  146. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/ifeval/instructions_registry.py +0 -0
  147. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/ifeval/instructions_util.py +0 -0
  148. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/ifeval/utils.py +0 -0
  149. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/iquiz/__init__.py +0 -0
  150. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/iquiz/iquiz_adapter.py +0 -0
  151. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/live_code_bench/__init__.py +0 -0
  152. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/live_code_bench/evaluate_utils.py +0 -0
  153. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/live_code_bench/execute_utils.py +0 -0
  154. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/live_code_bench/extract_utils.py +0 -0
  155. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -0
  156. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/live_code_bench/load_utils.py +0 -0
  157. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/live_code_bench/pass_k_utils.py +0 -0
  158. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/live_code_bench/prompts.py +0 -0
  159. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/live_code_bench/testing_util.py +0 -0
  160. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/math_500/__init__.py +0 -0
  161. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/math_500/math_500_adapter.py +0 -0
  162. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/mmlu/__init__.py +0 -0
  163. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/mmlu/mmlu.py +0 -0
  164. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/mmlu/mmlu_adapter.py +0 -0
  165. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/mmlu/samples.jsonl +0 -0
  166. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
  167. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +0 -0
  168. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/musr/__init__.py +0 -0
  169. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/musr/musr_adapter.py +0 -0
  170. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/process_bench/__init__.py +0 -0
  171. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/process_bench/critique_template.txt +0 -0
  172. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/process_bench/process_bench_adapter.py +0 -0
  173. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/race/__init__.py +0 -0
  174. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/race/race.py +0 -0
  175. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/race/race_adapter.py +0 -0
  176. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/race/samples.jsonl +0 -0
  177. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/simple_qa/__init__.py +0 -0
  178. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/simple_qa/simple_qa_adapter.py +0 -0
  179. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/super_gpqa/__init__.py +0 -0
  180. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +0 -0
  181. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +0 -0
  182. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/super_gpqa/utils.py +0 -0
  183. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -0
  184. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/trivia_qa/__init__.py +0 -0
  185. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/trivia_qa/samples.jsonl +0 -0
  186. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -0
  187. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +0 -0
  188. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/truthful_qa/__init__.py +0 -0
  189. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -0
  190. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +0 -0
  191. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/utils.py +0 -0
  192. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/cli/__init__.py +0 -0
  193. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/cli/base.py +0 -0
  194. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/cli/cli.py +0 -0
  195. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/cli/start_app.py +0 -0
  196. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/cli/start_eval.py +0 -0
  197. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/cli/start_perf.py +0 -0
  198. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/cli/start_server.py +0 -0
  199. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/collections/__init__.py +0 -0
  200. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/collections/sampler.py +0 -0
  201. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/collections/schema.py +0 -0
  202. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/evaluator/__init__.py +0 -0
  203. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/evaluator/rating_eval.py +0 -0
  204. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/evaluator/reviewer/__init__.py +0 -0
  205. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/evaluator/reviewer/auto_reviewer.py +0 -0
  206. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/metrics/__init__.py +0 -0
  207. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/metrics/bundled_rouge_score/__init__.py +0 -0
  208. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/metrics/bundled_rouge_score/rouge_scorer.py +0 -0
  209. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/metrics/code_metric.py +0 -0
  210. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/metrics/llm_judge.py +0 -0
  211. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/metrics/math_parser.py +0 -0
  212. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/metrics/metrics.py +0 -0
  213. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/metrics/named_metrics.py +0 -0
  214. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -0
  215. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -0
  216. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/metrics/rouge_metric.py +0 -0
  217. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/models/__init__.py +0 -0
  218. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/models/base_adapter.py +0 -0
  219. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/models/chat_adapter.py +0 -0
  220. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/models/choice_adapter.py +0 -0
  221. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/models/custom/__init__.py +0 -0
  222. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/models/custom/custom_model.py +0 -0
  223. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/models/custom/dummy_model.py +0 -0
  224. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/models/local_model.py +0 -0
  225. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/models/model.py +0 -0
  226. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/models/register.py +0 -0
  227. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/models/server_adapter.py +0 -0
  228. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/perf/__init__.py +0 -0
  229. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/perf/http_client.py +0 -0
  230. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/perf/plugin/__init__.py +0 -0
  231. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/perf/plugin/api/__init__.py +0 -0
  232. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/perf/plugin/api/base.py +0 -0
  233. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/perf/plugin/api/custom_api.py +0 -0
  234. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/perf/plugin/api/dashscope_api.py +0 -0
  235. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/perf/plugin/api/openai_api.py +0 -0
  236. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/perf/plugin/datasets/base.py +0 -0
  237. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/perf/plugin/datasets/custom.py +0 -0
  238. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/perf/plugin/datasets/flickr8k.py +0 -0
  239. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/perf/plugin/datasets/line_by_line.py +0 -0
  240. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/perf/plugin/datasets/longalpaca.py +0 -0
  241. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/perf/plugin/datasets/speed_benchmark.py +0 -0
  242. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/perf/plugin/registry.py +0 -0
  243. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/perf/utils/__init__.py +0 -0
  244. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/perf/utils/analysis_result.py +0 -0
  245. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/perf/utils/benchmark_util.py +0 -0
  246. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/perf/utils/handler.py +0 -0
  247. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/perf/utils/local_server.py +0 -0
  248. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/registry/__init__.py +0 -0
  249. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/registry/config/cfg_arena.yaml +0 -0
  250. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/registry/config/cfg_arena_zhihu.yaml +0 -0
  251. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -0
  252. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/registry/config/cfg_single.yaml +0 -0
  253. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -0
  254. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -0
  255. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/registry/data/qa_browser/battle.jsonl +0 -0
  256. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/registry/data/qa_browser/category_mapping.yaml +0 -0
  257. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/registry/data/question.jsonl +0 -0
  258. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/registry/tasks/arc.yaml +0 -0
  259. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/registry/tasks/bbh.yaml +0 -0
  260. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/registry/tasks/bbh_mini.yaml +0 -0
  261. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/registry/tasks/ceval.yaml +0 -0
  262. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/registry/tasks/ceval_mini.yaml +0 -0
  263. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/registry/tasks/cmmlu.yaml +0 -0
  264. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -0
  265. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/registry/tasks/general_qa.yaml +0 -0
  266. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/registry/tasks/gsm8k.yaml +0 -0
  267. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/registry/tasks/mmlu.yaml +0 -0
  268. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/registry/tasks/mmlu_mini.yaml +0 -0
  269. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/report/__init__.py +0 -0
  270. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/report/app.py +0 -0
  271. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/report/combinator.py +0 -0
  272. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/report/generator.py +0 -0
  273. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/report/utils.py +0 -0
  274. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/run_arena.py +0 -0
  275. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/summarizer.py +0 -0
  276. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/__init__.py +0 -0
  277. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/longbench_write/README.md +0 -0
  278. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/longbench_write/__init__.py +0 -0
  279. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/longbench_write/default_task.json +0 -0
  280. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/longbench_write/default_task.yaml +0 -0
  281. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/longbench_write/eval.py +0 -0
  282. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/longbench_write/infer.py +0 -0
  283. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/longbench_write/longbench_write.py +0 -0
  284. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/longbench_write/resources/__init__.py +0 -0
  285. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/longbench_write/resources/judge.txt +0 -0
  286. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/longbench_write/resources/longbench_write.jsonl +0 -0
  287. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl +0 -0
  288. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl +0 -0
  289. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/longbench_write/tools/__init__.py +0 -0
  290. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/longbench_write/tools/data_etl.py +0 -0
  291. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/longbench_write/tools/openai_api.py +0 -0
  292. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/longbench_write/utils.py +0 -0
  293. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/thinkbench/__init__.py +0 -0
  294. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/thinkbench/eval.py +0 -0
  295. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/thinkbench/infer.py +0 -0
  296. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/thinkbench/resources/critique_template.txt +0 -0
  297. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/thinkbench/resources/reformat_template.txt +0 -0
  298. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  299. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/thinkbench/tools/llm.py +0 -0
  300. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/thinkbench/tools/utils.py +0 -0
  301. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/toolbench_static/README.md +0 -0
  302. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/toolbench_static/__init__.py +0 -0
  303. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/toolbench_static/config_default.json +0 -0
  304. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/toolbench_static/config_default.yaml +0 -0
  305. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/toolbench_static/eval.py +0 -0
  306. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/toolbench_static/infer.py +0 -0
  307. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/toolbench_static/llm/__init__.py +0 -0
  308. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -0
  309. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/toolbench_static/requirements.txt +0 -0
  310. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/toolbench_static/toolbench_static.py +0 -0
  311. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/utils/__init__.py +0 -0
  312. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/utils/arena_utils.py +0 -0
  313. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/utils/chat_service.py +0 -0
  314. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/utils/completion_parsers.py +0 -0
  315. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/utils/filters.py +0 -0
  316. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/utils/io_utils.py +0 -0
  317. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/utils/logger.py +0 -0
  318. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/utils/model_utils.py +0 -0
  319. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/utils/utils.py +0 -0
  320. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope.egg-info/dependency_links.txt +0 -0
  321. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope.egg-info/entry_points.txt +0 -0
  322. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope.egg-info/not-zip-safe +0 -0
  323. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope.egg-info/requires.txt +0 -0
  324. {evalscope-0.13.0 → evalscope-0.13.1}/evalscope.egg-info/top_level.txt +0 -0
  325. {evalscope-0.13.0 → evalscope-0.13.1}/requirements/app.txt +0 -0
  326. {evalscope-0.13.0 → evalscope-0.13.1}/requirements/docs.txt +0 -0
  327. {evalscope-0.13.0 → evalscope-0.13.1}/requirements/framework.txt +0 -0
  328. {evalscope-0.13.0 → evalscope-0.13.1}/requirements/inner.txt +0 -0
  329. {evalscope-0.13.0 → evalscope-0.13.1}/requirements/opencompass.txt +0 -0
  330. {evalscope-0.13.0 → evalscope-0.13.1}/requirements/perf.txt +0 -0
  331. {evalscope-0.13.0 → evalscope-0.13.1}/requirements/rag.txt +0 -0
  332. {evalscope-0.13.0 → evalscope-0.13.1}/requirements/tests.txt +0 -0
  333. {evalscope-0.13.0 → evalscope-0.13.1}/requirements/vlmeval.txt +0 -0
  334. {evalscope-0.13.0 → evalscope-0.13.1}/requirements.txt +0 -0
  335. {evalscope-0.13.0 → evalscope-0.13.1}/setup.cfg +0 -0
  336. {evalscope-0.13.0 → evalscope-0.13.1}/setup.py +0 -0
  337. {evalscope-0.13.0 → evalscope-0.13.1}/tests/__init__.py +0 -0
  338. {evalscope-0.13.0 → evalscope-0.13.1}/tests/cli/__init__.py +0 -0
  339. {evalscope-0.13.0 → evalscope-0.13.1}/tests/cli/test_all.py +0 -0
  340. {evalscope-0.13.0 → evalscope-0.13.1}/tests/cli/test_collection.py +0 -0
  341. {evalscope-0.13.0 → evalscope-0.13.1}/tests/perf/__init__.py +0 -0
  342. {evalscope-0.13.0 → evalscope-0.13.1}/tests/rag/__init__.py +0 -0
  343. {evalscope-0.13.0 → evalscope-0.13.1}/tests/rag/test_clip_benchmark.py +0 -0
  344. {evalscope-0.13.0 → evalscope-0.13.1}/tests/rag/test_mteb.py +0 -0
  345. {evalscope-0.13.0 → evalscope-0.13.1}/tests/rag/test_ragas.py +0 -0
  346. {evalscope-0.13.0 → evalscope-0.13.1}/tests/swift/__init__.py +0 -0
  347. {evalscope-0.13.0 → evalscope-0.13.1}/tests/swift/test_run_swift_eval.py +0 -0
  348. {evalscope-0.13.0 → evalscope-0.13.1}/tests/swift/test_run_swift_vlm_eval.py +0 -0
  349. {evalscope-0.13.0 → evalscope-0.13.1}/tests/swift/test_run_swift_vlm_jugde_eval.py +0 -0
  350. {evalscope-0.13.0 → evalscope-0.13.1}/tests/test_run_all.py +0 -0
  351. {evalscope-0.13.0 → evalscope-0.13.1}/tests/vlm/__init__.py +0 -0
  352. {evalscope-0.13.0 → evalscope-0.13.1}/tests/vlm/test_vlmeval.py +0 -0

{evalscope-0.13.0/evalscope.egg-info → evalscope-0.13.1}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: evalscope
- Version: 0.13.0
+ Version: 0.13.1
  Summary: EvalScope: Lightweight LLMs Evaluation Framework
  Home-page: https://github.com/modelscope/evalscope
  Author: ModelScope team
@@ -239,7 +239,8 @@ Please scan the QR code below to join our community groups:

  ## 🎉 News

- - 🔥 **[2025.03.13]** Added support for the [LiveCodeBench](https://www.modelscope.cn/datasets/AI-ModelScope/code_generation_lite/summary) code evaluation benchmark. You can use it by specifying `live_code_bench`.
+ - 🔥 **[2025.03.20]** The model inference service stress testing now supports generating prompts of specified length using random values. Refer to the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#using-the-random-dataset) for more details.
+ - 🔥 **[2025.03.13]** Added support for the [LiveCodeBench](https://www.modelscope.cn/datasets/AI-ModelScope/code_generation_lite/summary) code evaluation benchmark, which can be used by specifying `live_code_bench`. Supports evaluating QwQ-32B on LiveCodeBench, refer to the [best practices](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html).
  - 🔥 **[2025.03.11]** Added support for the [SimpleQA](https://modelscope.cn/datasets/AI-ModelScope/SimpleQA/summary) and [Chinese SimpleQA](https://modelscope.cn/datasets/AI-ModelScope/Chinese-SimpleQA/summary) evaluation benchmarks. These are used to assess the factual accuracy of models, and you can specify `simple_qa` and `chinese_simpleqa` for use. Support for specifying a judge model is also available. For more details, refer to the [relevant parameter documentation](https://evalscope.readthedocs.io/en/latest/get_started/parameters.html).
  - 🔥 **[2025.03.07]** Added support for the [QwQ-32B](https://modelscope.cn/models/Qwen/QwQ-32B/summary) model, evaluate the model's reasoning ability and reasoning efficiency, refer to [📖 Best Practices for QwQ-32B Evaluation](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html) for more details.
  - 🔥 **[2025.03.04]** Added support for the [SuperGPQA](https://modelscope.cn/datasets/m-a-p/SuperGPQA/summary) dataset, which covers 13 categories, 72 first-level disciplines, and 285 second-level disciplines, totaling 26,529 questions. You can use it by specifying `super_gpqa`.
@@ -277,23 +278,24 @@ Please scan the QR code below to join our community groups:
  We recommend using conda to manage your environment and installing dependencies with pip:

  1. Create a conda environment (optional)
- ```shell
- # It is recommended to use Python 3.10
- conda create -n evalscope python=3.10
- # Activate the conda environment
- conda activate evalscope
- ```
+ ```shell
+ # It is recommended to use Python 3.10
+ conda create -n evalscope python=3.10
+ # Activate the conda environment
+ conda activate evalscope
+ ```

  2. Install dependencies using pip
- ```shell
- pip install evalscope # Install Native backend (default)
- # Additional options
- pip install evalscope[opencompass] # Install OpenCompass backend
- pip install evalscope[vlmeval] # Install VLMEvalKit backend
- pip install evalscope[rag] # Install RAGEval backend
- pip install evalscope[perf] # Install Perf dependencies
- pip install evalscope[all] # Install all backends (Native, OpenCompass, VLMEvalKit, RAGEval)
- ```
+ ```shell
+ pip install evalscope # Install Native backend (default)
+ # Additional options
+ pip install 'evalscope[opencompass]' # Install OpenCompass backend
+ pip install 'evalscope[vlmeval]' # Install VLMEvalKit backend
+ pip install 'evalscope[rag]' # Install RAGEval backend
+ pip install 'evalscope[perf]' # Install dependencies for the model performance testing module
+ pip install 'evalscope[app]' # Install dependencies for visualization
+ pip install 'evalscope[all]' # Install all backends (Native, OpenCompass, VLMEvalKit, RAGEval)
+ ```

  > [!WARNING]
  > As the project has been renamed to `evalscope`, for versions `v0.4.3` or earlier, you can install using the following command:
@@ -307,21 +309,22 @@ We recommend using conda to manage your environment and installing dependencies

  ### Method 2: Install from Source
  1. Download the source code
- ```shell
- git clone https://github.com/modelscope/evalscope.git
- ```
+ ```shell
+ git clone https://github.com/modelscope/evalscope.git
+ ```

  2. Install dependencies
- ```shell
- cd evalscope/
- pip install -e . # Install Native backend
- # Additional options
- pip install -e '.[opencompass]' # Install OpenCompass backend
- pip install -e '.[vlmeval]' # Install VLMEvalKit backend
- pip install -e '.[rag]' # Install RAGEval backend
- pip install -e '.[perf]' # Install Perf dependencies
- pip install -e '.[all]' # Install all backends (Native, OpenCompass, VLMEvalKit, RAGEval)
- ```
+ ```shell
+ cd evalscope/
+ pip install -e . # Install Native backend
+ # Additional options
+ pip install -e '.[opencompass]' # Install OpenCompass backend
+ pip install -e '.[vlmeval]' # Install VLMEvalKit backend
+ pip install -e '.[rag]' # Install RAGEval backend
+ pip install -e '.[perf]' # Install Perf dependencies
+ pip install -e '.[app]' # Install visualization dependencies
+ pip install -e '.[all]' # Install all backends (Native, OpenCompass, VLMEvalKit, RAGEval)
+ ```


  ## 🚀 Quick Start

{evalscope-0.13.0 → evalscope-0.13.1}/README.md
@@ -88,7 +88,8 @@ Please scan the QR code below to join our community groups:

  ## 🎉 News

- - 🔥 **[2025.03.13]** Added support for the [LiveCodeBench](https://www.modelscope.cn/datasets/AI-ModelScope/code_generation_lite/summary) code evaluation benchmark. You can use it by specifying `live_code_bench`.
+ - 🔥 **[2025.03.20]** The model inference service stress testing now supports generating prompts of specified length using random values. Refer to the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#using-the-random-dataset) for more details.
+ - 🔥 **[2025.03.13]** Added support for the [LiveCodeBench](https://www.modelscope.cn/datasets/AI-ModelScope/code_generation_lite/summary) code evaluation benchmark, which can be used by specifying `live_code_bench`. Supports evaluating QwQ-32B on LiveCodeBench, refer to the [best practices](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html).
  - 🔥 **[2025.03.11]** Added support for the [SimpleQA](https://modelscope.cn/datasets/AI-ModelScope/SimpleQA/summary) and [Chinese SimpleQA](https://modelscope.cn/datasets/AI-ModelScope/Chinese-SimpleQA/summary) evaluation benchmarks. These are used to assess the factual accuracy of models, and you can specify `simple_qa` and `chinese_simpleqa` for use. Support for specifying a judge model is also available. For more details, refer to the [relevant parameter documentation](https://evalscope.readthedocs.io/en/latest/get_started/parameters.html).
  - 🔥 **[2025.03.07]** Added support for the [QwQ-32B](https://modelscope.cn/models/Qwen/QwQ-32B/summary) model, evaluate the model's reasoning ability and reasoning efficiency, refer to [📖 Best Practices for QwQ-32B Evaluation](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html) for more details.
  - 🔥 **[2025.03.04]** Added support for the [SuperGPQA](https://modelscope.cn/datasets/m-a-p/SuperGPQA/summary) dataset, which covers 13 categories, 72 first-level disciplines, and 285 second-level disciplines, totaling 26,529 questions. You can use it by specifying `super_gpqa`.
@@ -126,23 +127,24 @@ Please scan the QR code below to join our community groups:
  We recommend using conda to manage your environment and installing dependencies with pip:

  1. Create a conda environment (optional)
- ```shell
- # It is recommended to use Python 3.10
- conda create -n evalscope python=3.10
- # Activate the conda environment
- conda activate evalscope
- ```
+ ```shell
+ # It is recommended to use Python 3.10
+ conda create -n evalscope python=3.10
+ # Activate the conda environment
+ conda activate evalscope
+ ```

  2. Install dependencies using pip
- ```shell
- pip install evalscope # Install Native backend (default)
- # Additional options
- pip install evalscope[opencompass] # Install OpenCompass backend
- pip install evalscope[vlmeval] # Install VLMEvalKit backend
- pip install evalscope[rag] # Install RAGEval backend
- pip install evalscope[perf] # Install Perf dependencies
- pip install evalscope[all] # Install all backends (Native, OpenCompass, VLMEvalKit, RAGEval)
- ```
+ ```shell
+ pip install evalscope # Install Native backend (default)
+ # Additional options
+ pip install 'evalscope[opencompass]' # Install OpenCompass backend
+ pip install 'evalscope[vlmeval]' # Install VLMEvalKit backend
+ pip install 'evalscope[rag]' # Install RAGEval backend
+ pip install 'evalscope[perf]' # Install dependencies for the model performance testing module
+ pip install 'evalscope[app]' # Install dependencies for visualization
+ pip install 'evalscope[all]' # Install all backends (Native, OpenCompass, VLMEvalKit, RAGEval)
+ ```

  > [!WARNING]
  > As the project has been renamed to `evalscope`, for versions `v0.4.3` or earlier, you can install using the following command:
@@ -156,21 +158,22 @@ We recommend using conda to manage your environment and installing dependencies

  ### Method 2: Install from Source
  1. Download the source code
- ```shell
- git clone https://github.com/modelscope/evalscope.git
- ```
+ ```shell
+ git clone https://github.com/modelscope/evalscope.git
+ ```

  2. Install dependencies
- ```shell
- cd evalscope/
- pip install -e . # Install Native backend
- # Additional options
- pip install -e '.[opencompass]' # Install OpenCompass backend
- pip install -e '.[vlmeval]' # Install VLMEvalKit backend
- pip install -e '.[rag]' # Install RAGEval backend
- pip install -e '.[perf]' # Install Perf dependencies
- pip install -e '.[all]' # Install all backends (Native, OpenCompass, VLMEvalKit, RAGEval)
- ```
+ ```shell
+ cd evalscope/
+ pip install -e . # Install Native backend
+ # Additional options
+ pip install -e '.[opencompass]' # Install OpenCompass backend
+ pip install -e '.[vlmeval]' # Install VLMEvalKit backend
+ pip install -e '.[rag]' # Install RAGEval backend
+ pip install -e '.[perf]' # Install Perf dependencies
+ pip install -e '.[app]' # Install visualization dependencies
+ pip install -e '.[all]' # Install all backends (Native, OpenCompass, VLMEvalKit, RAGEval)
+ ```


  ## 🚀 Quick Start

{evalscope-0.13.0 → evalscope-0.13.1}/evalscope/collections/evaluator.py
@@ -181,7 +181,7 @@ class EvaluatorCollection:
  answers_list = jsonl_to_list(pred_file_path)
  indices = set()
  for answer in answers_list:
- index = answer[AnswerKeys.ORIGIN_PROMPT].get('index')
+ index = answer.get(AnswerKeys.INDEX)
  answer_dict[index] = answer
  indices.add(index)
  data = []

{evalscope-0.13.0 → evalscope-0.13.1}/evalscope/config.py
@@ -81,7 +81,7 @@ class TaskConfig:
  def __post_init__(self):
  if (not self.model_id) and self.model:
  if isinstance(self.model, CustomModel):
- self.model_id = type(self.model).__name__
+ self.model_id = self.model.config.get('model_id', 'custom_model')
  else:
  self.model_id = os.path.basename(self.model).rstrip(os.sep)
  # fix path error, see http://github.com/modelscope/evalscope/issues/377
@@ -92,7 +92,10 @@ class TaskConfig:
  self.eval_batch_size = 8 if self.eval_type == EvalType.SERVICE else 1

  def to_dict(self):
- return self.__dict__
+ result = self.__dict__.copy()
+ if isinstance(self.model, CustomModel):
+ result['model'] = self.model.__class__.__name__
+ return result

  def __str__(self):
  return json.dumps(self.to_dict(), indent=4, default=str, ensure_ascii=False)

{evalscope-0.13.0 → evalscope-0.13.1}/evalscope/constants.py
@@ -77,6 +77,7 @@ class ArenaMode:


  class AnswerKeys:
+ INDEX = 'index'
  ANSWER_ID = 'answer_id'
  RAW_INPUT = 'raw_input'
  ORIGIN_PROMPT = 'origin_prompt'

{evalscope-0.13.0 → evalscope-0.13.1}/evalscope/evaluator/evaluator.py
@@ -81,7 +81,7 @@ class Evaluator(object):
  for subset_name, prompts_list in prompts.items():
  limit = self.task_cfg.limit or len(prompts_list)
  for index, prompt in enumerate(prompts_list[:limit]):
- prompt['index'] = index
+ prompt[AnswerKeys.INDEX] = index
  limited_prompts[subset_name].append(prompt)

  return limited_prompts
@@ -97,7 +97,8 @@ class Evaluator(object):
  answer_d[AnswerKeys.ANSWER_ID] = answer_id
  answer_d[AnswerKeys.SUBSET_NAME] = subset_name
  answer_d[AnswerKeys.RAW_INPUT] = input_d[AnswerKeys.RAW_INPUT]
- answer_d[AnswerKeys.ORIGIN_PROMPT] = input_d
+ # answer_d[AnswerKeys.ORIGIN_PROMPT] = input_d
+ answer_d[AnswerKeys.INDEX] = input_d[AnswerKeys.INDEX]
  return answer_d

  def _get_answer(self, input_prompts, subset_name, infer_cfg) -> List[dict]:
@@ -117,7 +118,7 @@ class Evaluator(object):
  return answers_list, prompts_list

  def get_answered_indices(answers_list: List[Dict]) -> List[int]:
- indices = [answer[AnswerKeys.ORIGIN_PROMPT].get('index') for answer in answers_list]
+ indices = [answer.get(AnswerKeys.INDEX) for answer in answers_list]

  if all(index is None for index in indices):
  return list(range(len(answers_list)))
@@ -238,7 +239,7 @@ class Evaluator(object):
  pred = pred_content

  choice[ReviewKeys.REVIEW] = {
- ReviewKeys.GOLD: gold_content,
+ ReviewKeys.GOLD: gold_content if gold_content != raw_input_d else '*Same as Input*',
  ReviewKeys.PRED: pred,
  ReviewKeys.RESULT: review_result
  }

{evalscope-0.13.0 → evalscope-0.13.1}/evalscope/models/custom_adapter.py
@@ -66,4 +66,4 @@ class CustomModelAdapter(BaseModelAdapter):
  else:
  raise TypeError(f'Unsupported inputs type: {type(input_prompt)}')

- return self.custom_model.predict(prompts=in_prompts, **kwargs)
+ return self.custom_model.predict(prompts=in_prompts, origin_inputs=inputs, **kwargs)
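
The adapter now also forwards the raw inputs as `origin_inputs`, so a user-supplied model's `predict` must accept that keyword. A minimal compatibility sketch, assuming `CustomModel` is constructed with a `config` dict (as the `config.py` change above implies); the class and module names other than `CustomModel` are hypothetical and the return payload is purely illustrative:

```python
from evalscope.models.custom.custom_model import CustomModel  # module path from the files-changed list


class EchoModel(CustomModel):
    """Hypothetical custom model, shown only to illustrate the 0.13.1 predict() signature."""

    def __init__(self):
        # 'model_id' is the key that TaskConfig.__post_init__ now reads from the config dict.
        super().__init__(config={'model_id': 'echo-model'})

    def predict(self, prompts, origin_inputs=None, **kwargs):
        # origin_inputs: the unprocessed inputs CustomModelAdapter forwards since 0.13.1;
        # accept the keyword even if unused so the adapter call does not break.
        return [{'choices': [{'message': {'content': str(p)}}]} for p in prompts]
```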

{evalscope-0.13.0 → evalscope-0.13.1}/evalscope/perf/arguments.py
@@ -24,6 +24,7 @@ class Arguments:
  connect_timeout: int = 600 # Connection timeout in seconds
  read_timeout: int = 600 # Read timeout in seconds
  api_key: Optional[str] = None
+ no_test_connection: bool = False # Test the connection before starting the benchmark

  # Performance and parallelism
  number: Optional[int] = None # Number of requests to be made
@@ -40,8 +41,9 @@ class Arguments:
  outputs_dir: str = DEFAULT_WORK_DIR

  # Prompt settings
- max_prompt_length: int = sys.maxsize # Maximum length of the prompt
+ max_prompt_length: int = 131072 # Maximum length of the prompt
  min_prompt_length: int = 0 # Minimum length of the prompt
+ prefix_length: int = 0 # Length of the prefix, only for random dataset
  prompt: Optional[str] = None # The prompt text
  query_template: Optional[str] = None # Template for the query

@@ -65,44 +67,12 @@ class Arguments:

  @staticmethod
  def from_args(args):
- return Arguments(
- model=args.model,
- attn_implementation=args.attn_implementation,
- url=args.url,
- port=args.port,
- api_key=args.api_key,
- connect_timeout=args.connect_timeout,
- read_timeout=args.read_timeout,
- number=args.number,
- parallel=args.parallel,
- rate=args.rate,
- log_every_n_query=args.log_every_n_query,
- headers=args.headers,
- wandb_api_key=args.wandb_api_key,
- name=args.name,
- outputs_dir=args.outputs_dir,
- debug=args.debug,
- tokenizer_path=args.tokenizer_path,
- api=args.api,
- max_prompt_length=args.max_prompt_length,
- min_prompt_length=args.min_prompt_length,
- prompt=args.prompt,
- query_template=args.query_template,
- dataset=args.dataset,
- dataset_path=args.dataset_path,
- frequency_penalty=args.frequency_penalty,
- logprobs=args.logprobs,
- max_tokens=args.max_tokens,
- min_tokens=args.min_tokens,
- n_choices=args.n_choices,
- seed=args.seed,
- stop=args.stop,
- stop_token_ids=args.stop_token_ids,
- stream=args.stream,
- temperature=args.temperature,
- top_p=args.top_p,
- top_k=args.top_k,
- )
+ # Convert Namespace to a dictionary and filter out None values
+ args_dict = {k: v for k, v in vars(args).items() if v is not None}
+
+ if 'func' in args_dict:
+ del args_dict['func'] # Note: compat CLI arguments
+ return Arguments(**args_dict)

  def __post_init__(self):
  self.headers = self.headers or {} # Default to empty dictionary
@@ -153,6 +123,7 @@ def add_argument(parser: argparse.ArgumentParser):
  parser.add_argument('--api-key', type=str, required=False, default=None, help='The API key for authentication')
  parser.add_argument('--connect-timeout', type=int, default=600, help='The network connection timeout')
  parser.add_argument('--read-timeout', type=int, default=600, help='The network read timeout')
+ parser.add_argument('--no-test-connection', action='store_false', default=False, help='Do not test the connection before starting the benchmark') # noqa: E501

  # Performance and parallelism
  parser.add_argument('-n', '--number', type=int, default=None, help='How many requests to be made')
@@ -168,6 +139,7 @@ def add_argument(parser: argparse.ArgumentParser):
  # Prompt settings
  parser.add_argument('--max-prompt-length', type=int, default=sys.maxsize, help='Maximum input prompt length')
  parser.add_argument('--min-prompt-length', type=int, default=0, help='Minimum input prompt length')
+ parser.add_argument('--prefix-length', type=int, default=0, help='The prefix length')
  parser.add_argument('--prompt', type=str, required=False, default=None, help='Specified the request prompt')
  parser.add_argument('--query-template', type=str, default=None, help='Specify the query template')

@@ -193,7 +165,6 @@ def add_argument(parser: argparse.ArgumentParser):
  parser.add_argument('--temperature', type=float, help='The sample temperature', default=None)
  parser.add_argument('--top-p', type=float, help='Sampling top p', default=None)
  parser.add_argument('--top-k', type=int, help='Sampling top k', default=None)
-
  # yapf: enable


{evalscope-0.13.0 → evalscope-0.13.1}/evalscope/perf/benchmark.py
@@ -150,39 +150,45 @@ async def statistic_benchmark_metric_worker(benchmark_data_queue: asyncio.Queue,
  name = args.name if args.name else f'{args.model_id}_{current_time}'
  wandb.init(project='perf_benchmark', name=name, config=args.to_dict())

- with sqlite3.connect(result_db_path) as con:
- cursor = con.cursor()
- create_result_table(cursor)
- with tqdm(desc='Processing') as pbar:
- while not (data_process_completed_event.is_set() and benchmark_data_queue.empty()):
- try:
- # Attempt to get benchmark data from the queue with a timeout
- benchmark_data = await asyncio.wait_for(benchmark_data_queue.get(), timeout=0.01)
- benchmark_data_queue.task_done()
- except asyncio.TimeoutError:
- # If timeout, continue to the next iteration
- continue
+ collected_benchmark_data = []
+
+ with tqdm(desc='Processing') as pbar:
+ while not (data_process_completed_event.is_set() and benchmark_data_queue.empty()):
+ try:
+ # Attempt to get benchmark data from the queue with a timeout
+ benchmark_data = await asyncio.wait_for(benchmark_data_queue.get(), timeout=0.01)
+ benchmark_data_queue.task_done()
+ except asyncio.TimeoutError:
+ # If timeout, continue to the next iteration
+ continue
+
+ # Update metrics based on the benchmark data
+ metrics.update_metrics(benchmark_data, api_plugin)

- # Update metrics based on the benchmark data
- metrics.update_metrics(benchmark_data, api_plugin)
+ # Collect benchmark data for later database insertion
+ collected_benchmark_data.append(benchmark_data)

- # Insert benchmark data into the database and commit the transaction
- insert_benchmark_data(cursor, benchmark_data)
- con.commit()
+ # Create a message with the updated metrics
+ message = metrics.create_message()

- # Create a message with the updated metrics
- message = metrics.create_message()
+ # Log the message to wandb if the api key is provided
+ if args.wandb_api_key:
+ wandb.log(message)

- # Log the message to wandb if the api key is provided
- if args.wandb_api_key:
- wandb.log(message)
+ # Log the message to the logger every n queries
+ if int(metrics.n_total_queries) % args.log_every_n_query == 0:
+ msg = json.dumps(message, ensure_ascii=False, indent=2)
+ logger.info(msg)

- # Log the message to the logger every n queries
- if int(metrics.n_total_queries) % args.log_every_n_query == 0:
- msg = json.dumps(message, ensure_ascii=False, indent=2)
- logger.info(msg)
+ pbar.update(1) # Update the progress bar

- pbar.update(1) # Update the progress bar
+ # Now perform database operations after all benchmark data has been processed
+ with sqlite3.connect(result_db_path) as con:
+ cursor = con.cursor()
+ create_result_table(cursor)
+ for benchmark_data in collected_benchmark_data:
+ insert_benchmark_data(cursor, benchmark_data)
+ con.commit()

  return metrics, result_db_path

@@ -199,7 +205,7 @@ async def start_server(args: Arguments) -> bool:
  else:
  args.url = f'http://127.0.0.1:{args.port}/v1/chat/completions'

- if not await test_connection(args):
+ if (not args.no_test_connection) and (not await test_connection(args)):
  raise TimeoutError('Test connection failed')


{evalscope-0.13.0 → evalscope-0.13.1}/evalscope/perf/main.py
@@ -32,7 +32,7 @@ def run_perf_benchmark(args):
  if platform.system() == 'Windows':
  asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

- loop = asyncio.get_event_loop()
+ loop = asyncio.new_event_loop()
  if platform.system() != 'Windows':
  add_signal_handlers(loop)


{evalscope-0.13.0 → evalscope-0.13.1}/evalscope/perf/plugin/datasets/__init__.py
@@ -3,4 +3,5 @@ from evalscope.perf.plugin.datasets.flickr8k import FlickrDatasetPlugin
  from evalscope.perf.plugin.datasets.line_by_line import LineByLineDatasetPlugin
  from evalscope.perf.plugin.datasets.longalpaca import LongAlpacaDatasetPlugin
  from evalscope.perf.plugin.datasets.openqa import OpenqaDatasetPlugin
+ from evalscope.perf.plugin.datasets.random_dataset import RandomDatasetPlugin
  from evalscope.perf.plugin.datasets.speed_benchmark import SpeedBenchmarkDatasetPlugin, SpeedBenchmarkLongDatasetPlugin

{evalscope-0.13.0 → evalscope-0.13.1}/evalscope/perf/plugin/datasets/openqa.py
@@ -1,5 +1,5 @@
  import json
- import subprocess
+ import os
  from typing import Any, Dict, Iterator, List

  from evalscope.perf.arguments import Arguments
@@ -18,16 +18,11 @@ class OpenqaDatasetPlugin(DatasetPluginBase):

  def build_messages(self) -> Iterator[List[Dict]]:
  if not self.query_parameters.dataset_path:
- subprocess.call([
- 'modelscope',
- 'download',
- '--dataset',
- 'AI-ModelScope/HC3-Chinese',
- 'open_qa.jsonl',
- '--local_dir',
- './data',
- ])
- self.query_parameters.dataset_path = './data/open_qa.jsonl'
+ from modelscope import dataset_snapshot_download
+
+ file_name = 'open_qa.jsonl'
+ local_path = dataset_snapshot_download('AI-ModelScope/HC3-Chinese', allow_patterns=[file_name])
+ self.query_parameters.dataset_path = os.path.join(local_path, file_name)

  for item in self.dataset_line_by_line(self.query_parameters.dataset_path):
  item = json.loads(item)

evalscope-0.13.1/evalscope/perf/plugin/datasets/random_dataset.py
@@ -0,0 +1,51 @@
+ import numpy as np
+ from typing import Dict, Iterator, List
+
+ from evalscope.perf.arguments import Arguments
+ from evalscope.perf.plugin.datasets.base import DatasetPluginBase
+ from evalscope.perf.plugin.registry import register_dataset
+
+
+ @register_dataset('random')
+ class RandomDatasetPlugin(DatasetPluginBase):
+ """Read dataset and return prompt.
+ """
+
+ def __init__(self, query_parameters: Arguments):
+ super().__init__(query_parameters)
+ assert self.query_parameters.tokenizer_path, 'Tokenizer path is required for random data generation, please provide it with `--tokenizer_path`.' # noqa: E501
+
+ from modelscope import AutoTokenizer
+ self.tokenizer = AutoTokenizer.from_pretrained(self.query_parameters.tokenizer_path, trust_remote_code=True)
+ self.prefix_length = self.query_parameters.prefix_length
+ self.prefix_ids = self.get_random_inputs(self.prefix_length)
+ self.template_len = self.get_template_len()
+ self.number = self.query_parameters.number or 1
+
+ def build_messages(self) -> Iterator[List[Dict]]:
+ min_prompt_length = self.query_parameters.min_prompt_length - self.template_len
+ max_prompt_length = self.query_parameters.max_prompt_length - self.template_len + 1
+
+ assert min_prompt_length >= 0, f'min_prompt_length should be greater than or equal to the template length {self.template_len}.' # noqa: E501
+ assert max_prompt_length >= min_prompt_length, 'max_prompt_length should be greater than or equal to min_prompt_length.' # noqa: E501
+
+ # refer to https://github.com/vllm-project/vllm/blob/ed6e9075d31e32c8548b480a47d1ffb77da1f54c/benchmarks/benchmark_serving.py#L366C1-L399C1 # noqa: E501
+ input_lens = np.random.randint(min_prompt_length, max_prompt_length, size=self.number)
+ offsets = np.random.randint(0, self.tokenizer.vocab_size, size=self.number)
+
+ for i in range(self.number):
+ prompt_ids = (offsets[i] + i + np.arange(input_lens[i])) % self.tokenizer.vocab_size
+ prompt = self.tokenizer.decode(
+ self.prefix_ids + prompt_ids.tolist(), skip_special_tokens=False, clean_up_tokenization_spaces=False)
+ yield [{'role': 'user', 'content': prompt}]
+
+ def get_random_inputs(self, length: int) -> List[int]:
+ if length <= 0:
+ return []
+ input_ids = np.random.randint(0, self.tokenizer.vocab_size, size=length).tolist()
+ return input_ids
+
+ def get_template_len(self):
+ empty_message = [{'role': 'user', 'content': ''}]
+ template = self.tokenizer.apply_chat_template(empty_message, tokenize=True, add_generation_prompt=True)
+ return len(template)
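
For orientation, a minimal sketch of driving the new `random` dataset through the perf entry point that appears later in this diff (`run_perf_benchmark` in `evalscope/perf/main.py`), assuming it accepts an `Arguments` instance directly; the model name, URL, and tokenizer path are placeholders, while the field names come from the `Arguments` dataclass changes above:

```python
from evalscope.perf.arguments import Arguments
from evalscope.perf.main import run_perf_benchmark

# Placeholder endpoint, model, and tokenizer values; the field names match the
# Arguments dataclass shown earlier in this diff (prefix_length is new in 0.13.1).
args = Arguments(
    model='my-served-model',
    url='http://127.0.0.1:8000/v1/chat/completions',
    api='openai',
    dataset='random',                     # handled by the RandomDatasetPlugin registered above
    tokenizer_path='/path/to/tokenizer',  # required: prompts are built from random token ids
    prefix_length=32,
    min_prompt_length=128,
    max_prompt_length=1024,
    number=20,                            # total requests
    parallel=2,                           # concurrent requests
)
run_perf_benchmark(args)
```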

{evalscope-0.13.0 → evalscope-0.13.1}/evalscope/perf/utils/db_util.py
@@ -2,6 +2,7 @@ import base64
  import json
  import os
  import pickle
+ import re
  import sqlite3
  import sys
  from datetime import datetime
@@ -91,6 +92,8 @@ def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData)
  def get_output_path(args: Arguments) -> str:
  current_time = datetime.now().strftime('%Y%m%d_%H%M%S')
  output_path = os.path.join(args.outputs_dir, current_time, f'{args.name or args.model_id}')
+ # Filter illegal characters
+ output_path = re.sub(r'[<>:"|?*]', '_', output_path)
  if not os.path.exists(output_path):
  os.makedirs(output_path, exist_ok=True)
  logger.info(f'Save the result to: {output_path}')

{evalscope-0.13.0 → evalscope-0.13.1}/evalscope/run.py
@@ -39,9 +39,11 @@ def run_single_task(task_cfg: TaskConfig, run_time: str) -> dict:
  configure_logging(task_cfg.debug, os.path.join(outputs.logs_dir, 'eval_log.log'))

  if task_cfg.eval_backend != EvalBackend.NATIVE:
- return run_non_native_backend(task_cfg, outputs)
+ result = run_non_native_backend(task_cfg, outputs)
  else:
- return evaluate_model(task_cfg, outputs)
+ result = evaluate_model(task_cfg, outputs)
+
+ return result


  def setup_work_directory(task_cfg: TaskConfig, run_time: str):
@@ -117,6 +119,16 @@ def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
  res_dict = evaluator.eval()
  eval_results[evaluator.dataset_name] = res_dict

+ # Clean up
+ if base_model is not None:
+ import gc
+ import torch
+
+ del base_model
+ del evaluators
+ torch.cuda.empty_cache()
+ gc.collect()
+
  return eval_results


evalscope-0.13.1/evalscope/version.py
@@ -0,0 +1,4 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+
+ __version__ = '0.13.1'
+ __release_datetime__ = '2025-03-24 18:00:00'