evalscope 0.8.0__tar.gz → 0.8.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (304)
  1. {evalscope-0.8.0/evalscope.egg-info → evalscope-0.8.2}/PKG-INFO +15 -3
  2. {evalscope-0.8.0 → evalscope-0.8.2}/README.md +12 -0
  3. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/base.py +1 -1
  4. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/utils/clip.py +2 -2
  5. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/utils/embedding.py +1 -1
  6. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  7. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/gsm8k/gsm8k_adapter.py +2 -1
  8. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/hellaswag/hellaswag_adapter.py +2 -1
  9. evalscope-0.8.2/evalscope/benchmarks/humaneval/humaneval_adapter.py +206 -0
  10. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/race/race_adapter.py +2 -1
  11. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/config.py +38 -2
  12. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/constants.py +24 -38
  13. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/evaluator/__init__.py +0 -1
  14. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/evaluator/evaluator.py +6 -4
  15. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/evaluator/rating_eval.py +1 -1
  16. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/evaluator/reviewer/auto_reviewer.py +2 -1
  17. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/models/model_adapter.py +1 -1
  18. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/perf/arguments.py +3 -1
  19. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/perf/benchmark.py +3 -3
  20. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/perf/main.py +5 -6
  21. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/perf/plugin/api/openai_api.py +53 -49
  22. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/perf/plugin/registry.py +3 -3
  23. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/perf/utils/benchmark_util.py +4 -4
  24. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/perf/utils/db_util.py +66 -22
  25. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/perf/utils/local_server.py +4 -1
  26. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/run.py +45 -82
  27. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/run_arena.py +2 -1
  28. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/summarizer.py +14 -26
  29. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/third_party/longbench_write/eval.py +2 -1
  30. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/third_party/longbench_write/longbench_write.py +2 -1
  31. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/third_party/longbench_write/tools/data_etl.py +1 -1
  32. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  33. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/tools/combine_reports.py +2 -4
  34. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/tools/rewrite_eval_results.py +1 -1
  35. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/utils/__init__.py +1 -0
  36. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/utils/chat_service.py +1 -1
  37. evalscope-0.8.2/evalscope/utils/io_utils.py +162 -0
  38. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/utils/logger.py +8 -0
  39. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/utils/utils.py +0 -175
  40. evalscope-0.8.2/evalscope/version.py +4 -0
  41. {evalscope-0.8.0 → evalscope-0.8.2/evalscope.egg-info}/PKG-INFO +15 -3
  42. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope.egg-info/SOURCES.txt +1 -21
  43. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope.egg-info/requires.txt +2 -2
  44. {evalscope-0.8.0 → evalscope-0.8.2}/requirements/rag.txt +1 -1
  45. {evalscope-0.8.0 → evalscope-0.8.2}/tests/cli/test_run.py +11 -12
  46. {evalscope-0.8.0 → evalscope-0.8.2}/tests/perf/test_perf.py +3 -2
  47. {evalscope-0.8.0 → evalscope-0.8.2}/tests/vlm/test_vlmeval.py +3 -2
  48. evalscope-0.8.0/evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +0 -87
  49. evalscope-0.8.0/evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +0 -36
  50. evalscope-0.8.0/evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +0 -26
  51. evalscope-0.8.0/evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +0 -41
  52. evalscope-0.8.0/evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +0 -7
  53. evalscope-0.8.0/evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +0 -60
  54. evalscope-0.8.0/evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +0 -36
  55. evalscope-0.8.0/evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +0 -24
  56. evalscope-0.8.0/evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +0 -35
  57. evalscope-0.8.0/evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
  58. evalscope-0.8.0/evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
  59. evalscope-0.8.0/evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
  60. evalscope-0.8.0/evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
  61. evalscope-0.8.0/evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +0 -34
  62. evalscope-0.8.0/evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +0 -36
  63. evalscope-0.8.0/evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +0 -25
  64. evalscope-0.8.0/evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -24
  65. evalscope-0.8.0/evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
  66. evalscope-0.8.0/evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +0 -16
  67. evalscope-0.8.0/evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +0 -24
  68. evalscope-0.8.0/evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -20
  69. evalscope-0.8.0/evalscope/evaluator/humaneval_evaluator.py +0 -158
  70. evalscope-0.8.0/evalscope/version.py +0 -4
  71. {evalscope-0.8.0 → evalscope-0.8.2}/LICENSE +0 -0
  72. {evalscope-0.8.0 → evalscope-0.8.2}/MANIFEST.in +0 -0
  73. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/__init__.py +0 -0
  74. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/arguments.py +0 -0
  75. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/__init__.py +0 -0
  76. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/opencompass/__init__.py +0 -0
  77. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/opencompass/api_meta_template.py +0 -0
  78. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/opencompass/backend_manager.py +0 -0
  79. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/opencompass/tasks/__init__.py +0 -0
  80. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/opencompass/tasks/eval_api.py +0 -0
  81. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/opencompass/tasks/eval_datasets.py +0 -0
  82. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/__init__.py +0 -0
  83. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/backend_manager.py +0 -0
  84. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/clip_benchmark/__init__.py +0 -0
  85. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/clip_benchmark/arguments.py +0 -0
  86. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +0 -0
  87. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/clip_benchmark/task_template.py +0 -0
  88. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/clip_benchmark/tasks/__init__.py +0 -0
  89. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +0 -0
  90. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -0
  91. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +0 -0
  92. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +0 -0
  93. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt +0 -0
  94. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/cmteb/__init__.py +0 -0
  95. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/cmteb/arguments.py +0 -0
  96. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/cmteb/base.py +0 -0
  97. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/cmteb/task_template.py +0 -0
  98. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/cmteb/tasks/Classification.py +0 -0
  99. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +0 -0
  100. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +0 -0
  101. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +0 -0
  102. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +0 -0
  103. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +0 -0
  104. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/cmteb/tasks/STS.py +0 -0
  105. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/cmteb/tasks/__init__.py +0 -0
  106. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/ragas/__init__.py +0 -0
  107. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/ragas/arguments.py +0 -0
  108. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -0
  109. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/ragas/task_template.py +0 -0
  110. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/ragas/tasks/__init__.py +0 -0
  111. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +0 -0
  112. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/ragas/tasks/build_transform.py +0 -0
  113. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +0 -0
  114. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -0
  115. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/utils/__init__.py +0 -0
  116. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/utils/llm.py +0 -0
  117. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/utils/tools.py +0 -0
  118. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/vlm_eval_kit/__init__.py +0 -0
  119. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/vlm_eval_kit/backend_manager.py +0 -0
  120. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -0
  121. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/__init__.py +0 -0
  122. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/arc/__init__.py +0 -0
  123. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/arc/ai2_arc.py +0 -0
  124. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/arc/arc_adapter.py +0 -0
  125. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/bbh/__init__.py +0 -0
  126. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/bbh/bbh_adapter.py +0 -0
  127. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +0 -0
  128. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +0 -0
  129. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +0 -0
  130. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +0 -0
  131. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +0 -0
  132. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +0 -0
  133. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +0 -0
  134. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +0 -0
  135. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +0 -0
  136. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +0 -0
  137. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +0 -0
  138. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +0 -0
  139. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +0 -0
  140. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/bbh/cot_prompts/navigate.txt +0 -0
  141. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +0 -0
  142. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +0 -0
  143. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +0 -0
  144. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +0 -0
  145. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +0 -0
  146. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/bbh/cot_prompts/snarks.txt +0 -0
  147. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +0 -0
  148. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +0 -0
  149. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
  150. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
  151. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
  152. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +0 -0
  153. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +0 -0
  154. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/benchmark.py +0 -0
  155. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/ceval/__init__.py +0 -0
  156. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/ceval/ceval_adapter.py +0 -0
  157. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/ceval/ceval_exam.py +0 -0
  158. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/ceval/samples.jsonl +0 -0
  159. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/cmmlu/__init__.py +0 -0
  160. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/cmmlu/cmmlu.py +0 -0
  161. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/cmmlu/cmmlu_adapter.py +0 -0
  162. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/cmmlu/samples.jsonl +0 -0
  163. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/competition_math/__init__.py +0 -0
  164. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/competition_math/competition_math.py +0 -0
  165. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -0
  166. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/data_adapter.py +0 -0
  167. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/general_qa/__init__.py +0 -0
  168. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/gsm8k/__init__.py +0 -0
  169. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/gsm8k/gsm8k.py +0 -0
  170. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/hellaswag/__init__.py +0 -0
  171. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/hellaswag/hellaswag.py +0 -0
  172. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/humaneval/__init__.py +0 -0
  173. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/humaneval/humaneval.py +0 -0
  174. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/mmlu/__init__.py +0 -0
  175. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/mmlu/mmlu.py +0 -0
  176. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/mmlu/mmlu_adapter.py +0 -0
  177. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/mmlu/samples.jsonl +0 -0
  178. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/race/__init__.py +0 -0
  179. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/race/race.py +0 -0
  180. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/race/samples.jsonl +0 -0
  181. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/trivia_qa/__init__.py +0 -0
  182. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/trivia_qa/samples.jsonl +0 -0
  183. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -0
  184. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +0 -0
  185. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/truthful_qa/__init__.py +0 -0
  186. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -0
  187. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +0 -0
  188. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/cli/__init__.py +0 -0
  189. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/cli/base.py +0 -0
  190. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/cli/cli.py +0 -0
  191. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/cli/start_eval.py +0 -0
  192. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/cli/start_perf.py +0 -0
  193. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/cli/start_server.py +0 -0
  194. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/evaluator/reviewer/__init__.py +0 -0
  195. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/metrics/__init__.py +0 -0
  196. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/metrics/bundled_rouge_score/__init__.py +0 -0
  197. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/metrics/bundled_rouge_score/rouge_scorer.py +0 -0
  198. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/metrics/code_metric.py +0 -0
  199. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/metrics/math_accuracy.py +0 -0
  200. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/metrics/metrics.py +0 -0
  201. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -0
  202. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -0
  203. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/metrics/rouge_metric.py +0 -0
  204. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/models/__init__.py +0 -0
  205. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/models/api/__init__.py +0 -0
  206. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/models/api/openai_api.py +0 -0
  207. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/models/custom/__init__.py +0 -0
  208. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/models/custom/custom_model.py +0 -0
  209. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/models/dummy_chat_model.py +0 -0
  210. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/models/model.py +0 -0
  211. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/models/openai_model.py +0 -0
  212. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/perf/__init__.py +0 -0
  213. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/perf/http_client.py +0 -0
  214. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/perf/plugin/__init__.py +0 -0
  215. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/perf/plugin/api/__init__.py +0 -0
  216. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/perf/plugin/api/base.py +0 -0
  217. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/perf/plugin/api/custom_api.py +0 -0
  218. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/perf/plugin/api/dashscope_api.py +0 -0
  219. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/perf/plugin/datasets/__init__.py +0 -0
  220. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/perf/plugin/datasets/base.py +0 -0
  221. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/perf/plugin/datasets/custom.py +0 -0
  222. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/perf/plugin/datasets/flickr8k.py +0 -0
  223. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/perf/plugin/datasets/line_by_line.py +0 -0
  224. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/perf/plugin/datasets/longalpaca.py +0 -0
  225. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/perf/plugin/datasets/openqa.py +0 -0
  226. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/perf/plugin/datasets/speed_benchmark.py +0 -0
  227. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/perf/utils/__init__.py +0 -0
  228. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/perf/utils/analysis_result.py +0 -0
  229. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/perf/utils/handler.py +0 -0
  230. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/registry/__init__.py +0 -0
  231. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/registry/config/cfg_arena.yaml +0 -0
  232. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/registry/config/cfg_arena_zhihu.yaml +0 -0
  233. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -0
  234. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/registry/config/cfg_single.yaml +0 -0
  235. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -0
  236. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -0
  237. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/registry/data/qa_browser/battle.jsonl +0 -0
  238. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/registry/data/qa_browser/category_mapping.yaml +0 -0
  239. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/registry/data/question.jsonl +0 -0
  240. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/registry/tasks/arc.yaml +0 -0
  241. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/registry/tasks/bbh.yaml +0 -0
  242. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/registry/tasks/bbh_mini.yaml +0 -0
  243. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/registry/tasks/ceval.yaml +0 -0
  244. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/registry/tasks/ceval_mini.yaml +0 -0
  245. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/registry/tasks/cmmlu.yaml +0 -0
  246. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -0
  247. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/registry/tasks/general_qa.yaml +0 -0
  248. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/registry/tasks/gsm8k.yaml +0 -0
  249. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/registry/tasks/mmlu.yaml +0 -0
  250. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/registry/tasks/mmlu_mini.yaml +0 -0
  251. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/third_party/__init__.py +0 -0
  252. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/third_party/longbench_write/README.md +0 -0
  253. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/third_party/longbench_write/__init__.py +0 -0
  254. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/third_party/longbench_write/default_task.json +0 -0
  255. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/third_party/longbench_write/default_task.yaml +0 -0
  256. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/third_party/longbench_write/infer.py +0 -0
  257. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/third_party/longbench_write/resources/__init__.py +0 -0
  258. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/third_party/longbench_write/resources/judge.txt +0 -0
  259. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/third_party/longbench_write/resources/longbench_write.jsonl +0 -0
  260. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl +0 -0
  261. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl +0 -0
  262. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/third_party/longbench_write/tools/__init__.py +0 -0
  263. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/third_party/longbench_write/utils.py +0 -0
  264. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/third_party/toolbench_static/README.md +0 -0
  265. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/third_party/toolbench_static/__init__.py +0 -0
  266. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/third_party/toolbench_static/config_default.json +0 -0
  267. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/third_party/toolbench_static/config_default.yaml +0 -0
  268. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/third_party/toolbench_static/eval.py +0 -0
  269. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/third_party/toolbench_static/infer.py +0 -0
  270. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/third_party/toolbench_static/llm/__init__.py +0 -0
  271. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -0
  272. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/third_party/toolbench_static/requirements.txt +0 -0
  273. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/tools/__init__.py +0 -0
  274. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/tools/gen_mmlu_subject_mapping.py +0 -0
  275. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/utils/arena_utils.py +0 -0
  276. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/utils/completion_parsers.py +0 -0
  277. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/utils/model_utils.py +0 -0
  278. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope.egg-info/dependency_links.txt +0 -0
  279. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope.egg-info/entry_points.txt +0 -0
  280. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope.egg-info/not-zip-safe +0 -0
  281. {evalscope-0.8.0 → evalscope-0.8.2}/evalscope.egg-info/top_level.txt +0 -0
  282. {evalscope-0.8.0 → evalscope-0.8.2}/requirements/docs.txt +0 -0
  283. {evalscope-0.8.0 → evalscope-0.8.2}/requirements/framework.txt +0 -0
  284. {evalscope-0.8.0 → evalscope-0.8.2}/requirements/inner.txt +0 -0
  285. {evalscope-0.8.0 → evalscope-0.8.2}/requirements/opencompass.txt +0 -0
  286. {evalscope-0.8.0 → evalscope-0.8.2}/requirements/perf.txt +0 -0
  287. {evalscope-0.8.0 → evalscope-0.8.2}/requirements/tests.txt +0 -0
  288. {evalscope-0.8.0 → evalscope-0.8.2}/requirements/vlmeval.txt +0 -0
  289. {evalscope-0.8.0 → evalscope-0.8.2}/requirements.txt +0 -0
  290. {evalscope-0.8.0 → evalscope-0.8.2}/setup.cfg +0 -0
  291. {evalscope-0.8.0 → evalscope-0.8.2}/setup.py +0 -0
  292. {evalscope-0.8.0 → evalscope-0.8.2}/tests/__init__.py +0 -0
  293. {evalscope-0.8.0 → evalscope-0.8.2}/tests/cli/__init__.py +0 -0
  294. {evalscope-0.8.0 → evalscope-0.8.2}/tests/perf/__init__.py +0 -0
  295. {evalscope-0.8.0 → evalscope-0.8.2}/tests/rag/__init__.py +0 -0
  296. {evalscope-0.8.0 → evalscope-0.8.2}/tests/rag/test_clip_benchmark.py +0 -0
  297. {evalscope-0.8.0 → evalscope-0.8.2}/tests/rag/test_mteb.py +0 -0
  298. {evalscope-0.8.0 → evalscope-0.8.2}/tests/rag/test_ragas.py +0 -0
  299. {evalscope-0.8.0 → evalscope-0.8.2}/tests/swift/__init__.py +0 -0
  300. {evalscope-0.8.0 → evalscope-0.8.2}/tests/swift/test_run_swift_eval.py +0 -0
  301. {evalscope-0.8.0 → evalscope-0.8.2}/tests/swift/test_run_swift_vlm_eval.py +0 -0
  302. {evalscope-0.8.0 → evalscope-0.8.2}/tests/swift/test_run_swift_vlm_jugde_eval.py +0 -0
  303. {evalscope-0.8.0 → evalscope-0.8.2}/tests/test_run_all.py +0 -0
  304. {evalscope-0.8.0 → evalscope-0.8.2}/tests/vlm/__init__.py +0 -0
{evalscope-0.8.0/evalscope.egg-info → evalscope-0.8.2}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: evalscope
- Version: 0.8.0
+ Version: 0.8.2
  Summary: EvalScope: Lightweight LLMs Evaluation Framework
  Home-page: https://github.com/modelscope/evalscope
  Author: ModelScope team
@@ -54,7 +54,7 @@ Provides-Extra: vlmeval
  Requires-Dist: ms-vlmeval>=0.0.9; extra == "vlmeval"
  Provides-Extra: rag
  Requires-Dist: mteb==1.19.4; extra == "rag"
- Requires-Dist: ragas==0.2.7; extra == "rag"
+ Requires-Dist: ragas==0.2.9; extra == "rag"
  Requires-Dist: webdataset>0.2.0; extra == "rag"
  Provides-Extra: perf
  Requires-Dist: aiohttp; extra == "perf"
@@ -125,7 +125,7 @@ Requires-Dist: transformers_stream_generator; extra == "all"
  Requires-Dist: ms-opencompass>=0.1.4; extra == "all"
  Requires-Dist: ms-vlmeval>=0.0.9; extra == "all"
  Requires-Dist: mteb==1.19.4; extra == "all"
- Requires-Dist: ragas==0.2.7; extra == "all"
+ Requires-Dist: ragas==0.2.9; extra == "all"
  Requires-Dist: webdataset>0.2.0; extra == "all"
  Requires-Dist: aiohttp; extra == "all"
  Requires-Dist: fastapi; extra == "all"
@@ -181,6 +181,8 @@ The framework accommodates multiple evaluation scenarios such as end-to-end RAG
  <br>EvalScope Framework.
  </p>

+ <details><summary>Framework Description</summary>
+
  The architecture includes the following modules:
  1. **Model Adapter**: The model adapter is used to convert the outputs of specific models into the format required by the framework, supporting both API call models and locally run models.
  2. **Data Adapter**: The data adapter is responsible for converting and processing input data to meet various evaluation needs and formats.
@@ -194,6 +196,16 @@ The architecture includes the following modules:
  5. **Evaluation Report**: The final generated evaluation report summarizes the model's performance, which can be used for decision-making and further model optimization.
  6. **Visualization**: Visualization results help users intuitively understand evaluation results, facilitating analysis and comparison of different model performances.

+ </details>
+
+ ## ☎ User Groups
+
+ Please scan the QR code below to join our community groups:
+
+ [Discord Group](https://discord.com/invite/D27yfEFVz5) | WeChat Group | DingTalk Group
+ :-------------------------:|:-------------------------:|:-------------------------:
+ <img src="docs/asset/discord_qr.jpg" width="160" height="160"> | <img src="docs/asset/wechat.png" width="160" height="160"> | <img src="docs/asset/dingding.png" width="160" height="160">
+

  ## 🎉 News
  - 🔥 **[2024.12.13]** Model evaluation optimization: no need to pass the `--template-type` parameter anymore; supports starting evaluation with `evalscope eval --args`. Refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html) for more details.

{evalscope-0.8.0 → evalscope-0.8.2}/README.md
@@ -45,6 +45,8 @@ The framework accommodates multiple evaluation scenarios such as end-to-end RAG
  <br>EvalScope Framework.
  </p>

+ <details><summary>Framework Description</summary>
+
  The architecture includes the following modules:
  1. **Model Adapter**: The model adapter is used to convert the outputs of specific models into the format required by the framework, supporting both API call models and locally run models.
  2. **Data Adapter**: The data adapter is responsible for converting and processing input data to meet various evaluation needs and formats.
@@ -58,6 +60,16 @@ The architecture includes the following modules:
  5. **Evaluation Report**: The final generated evaluation report summarizes the model's performance, which can be used for decision-making and further model optimization.
  6. **Visualization**: Visualization results help users intuitively understand evaluation results, facilitating analysis and comparison of different model performances.

+ </details>
+
+ ## ☎ User Groups
+
+ Please scan the QR code below to join our community groups:
+
+ [Discord Group](https://discord.com/invite/D27yfEFVz5) | WeChat Group | DingTalk Group
+ :-------------------------:|:-------------------------:|:-------------------------:
+ <img src="docs/asset/discord_qr.jpg" width="160" height="160"> | <img src="docs/asset/wechat.png" width="160" height="160"> | <img src="docs/asset/dingding.png" width="160" height="160">
+

  ## 🎉 News
  - 🔥 **[2024.12.13]** Model evaluation optimization: no need to pass the `--template-type` parameter anymore; supports starting evaluation with `evalscope eval --args`. Refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html) for more details.

{evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/base.py
@@ -2,7 +2,7 @@
  from typing import Union

  from evalscope.config import TaskConfig
- from evalscope.utils import yaml_to_dict
+ from evalscope.utils.io_utils import yaml_to_dict


  class BackendManager:

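
The import move above is the first of many: the I/O helpers (`yaml_to_dict`, `json_to_dict`, `jsonl_to_list`, `dump_jsonl_data`, `dict_to_yaml`, `OutputsStructure`) now live in the new `evalscope/utils/io_utils.py` (entry 37 in the file list) instead of `evalscope.utils`. A minimal, hypothetical compatibility import for downstream code that has to run against both 0.8.0 and 0.8.2 could look like this; the try/except fallback is an assumption, not something shipped in the package:

```python
# Hypothetical shim for code that supports both evalscope 0.8.0 and 0.8.2:
# the I/O helpers moved from evalscope.utils to evalscope.utils.io_utils in 0.8.2.
try:
    from evalscope.utils.io_utils import jsonl_to_list, yaml_to_dict  # 0.8.2+
except ImportError:  # pragma: no cover - 0.8.0 and earlier
    from evalscope.utils import jsonl_to_list, yaml_to_dict
```
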
{evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/utils/clip.py
@@ -4,7 +4,7 @@ import torch.nn.functional as F
  from langchain_core.embeddings import Embeddings
  from PIL import Image
  from transformers import AutoModel, AutoProcessor
- from typing import List
+ from typing import List, Union

  from evalscope.backend.rag_eval.utils.tools import PIL_to_base64, download_model
  from evalscope.constants import HubType
@@ -86,7 +86,7 @@ class CLIPModel(Embeddings):
          self.transform = self.processor.image_processor
          self.tokenizer = self.processor.tokenizer

-     def encode_text(self, batch_texts: List[str] | List[List[str]]):
+     def encode_text(self, batch_texts: Union[List[str], List[List[str]]]):
          if isinstance(batch_texts[0], list):
              batch_texts = [text for _, texts in enumerate(batch_texts) for text in texts]
          # Ensure that the input texts are within the token limit

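
This annotation rewrite, and the matching one in embedding.py just below, is most likely a Python-version fix rather than a behavioural change: PEP 604 unions written with `|` between `typing` generics only evaluate on Python 3.10+, so spelling them as `typing.Union` keeps the modules importable on 3.8/3.9. A small illustration (the function name here is made up):

```python
from typing import Dict, List, Union

# On Python 3.9, `List[str] | List[Dict[str, str]]` raises
# "TypeError: unsupported operand type(s) for |"; the Union spelling works everywhere.
def encode_corpus_signature(corpus: Union[List[str], List[Dict[str, str]]]) -> None:
    ...
```
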
{evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/utils/embedding.py
@@ -80,7 +80,7 @@ class BaseModel(Embeddings):
          """Embed query text. Compact mteb."""
          raise NotImplementedError

-     def encode_corpus(self, corpus: List[str] | List[Dict[str, str]], **kwargs) -> list[torch.Tensor]:
+     def encode_corpus(self, corpus: Union[List[str], List[Dict[str, str]]], **kwargs) -> list[torch.Tensor]:
          """Embed search docs . Compact mteb."""
          raise NotImplementedError

{evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/general_qa/general_qa_adapter.py
@@ -8,7 +8,7 @@ from typing import Any, Optional
  from evalscope.benchmarks.data_adapter import DataAdapter
  from evalscope.metrics.metrics import bleu_ngram_one_sample, weighted_mean
  from evalscope.metrics.rouge_metric import compute_rouge_score_one_sample_zh
- from evalscope.utils import jsonl_to_list
+ from evalscope.utils.io_utils import jsonl_to_list
  from evalscope.utils.logger import get_logger

  logger = get_logger()

{evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/gsm8k/gsm8k_adapter.py
@@ -6,7 +6,8 @@ import re

  from evalscope.benchmarks import DataAdapter
  from evalscope.metrics.metrics import exact_match, weighted_mean
- from evalscope.utils import jsonl_to_list, normalize_score
+ from evalscope.utils import normalize_score
+ from evalscope.utils.io_utils import jsonl_to_list
  from evalscope.utils.logger import get_logger

  # flake8: noqa

{evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/hellaswag/hellaswag_adapter.py
@@ -5,7 +5,8 @@ import re

  from evalscope.benchmarks.data_adapter import DataAdapter
  from evalscope.metrics.metrics import exact_match, weighted_mean
- from evalscope.utils import jsonl_to_list, normalize_score
+ from evalscope.utils import normalize_score
+ from evalscope.utils.io_utils import jsonl_to_list
  from evalscope.utils.logger import get_logger

  # flake8: noqa

evalscope-0.8.2/evalscope/benchmarks/humaneval/humaneval_adapter.py
@@ -0,0 +1,206 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ import json
+ import os
+ import re
+ from tqdm import tqdm
+ from typing import List
+
+ from evalscope.benchmarks.data_adapter import DataAdapter
+ from evalscope.metrics.metrics import weighted_mean
+ from evalscope.tools.combine_reports import gen_table
+ from evalscope.utils import normalize_score
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+ DATASET_ID = 'modelscope/humaneval'
+ SUBSET_LIST = ['openai_humaneval']
+
+ # Example:
+ # {"task_id": "HumanEval/0", "prompt": "from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\n", "entry_point": "has_close_elements", "canonical_solution": " for idx, elem in enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx != idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False\n assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False\n\n"} # noqa
+
+
+ class HumanevalAdapter(DataAdapter):
+     """
+     A placeholder for humaneval adapter, see HumanevalEvaluator for implementation.
+     """
+
+     def __init__(self,
+                  subset_list: list = None,
+                  metric_list: list = None,
+                  few_shot_num: int = None,
+                  train_split: str = None,
+                  eval_split: str = 'test',
+                  prompt_template: str = 'Complete the following python code:\n',
+                  **kwargs):
+         try:
+             from human_eval.data import stream_jsonl, write_jsonl
+             from human_eval.evaluation import check_correctness
+         except ImportError:
+             raise ImportError('Please install human_eval:'
+                               'https://github.com/openai/human-eval/tree/master#installation , '
+                               'Note that you need to enable the execution code in the human_eval/execution.py first.')
+
+         if subset_list is None:
+             subset_list = SUBSET_LIST
+
+         if metric_list is None:
+             metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
+
+         self.k = [1]
+         self.num_workers = 4
+         self.timeout = 4.0
+         self.outputs = kwargs.get('outputs', None)
+
+         self.read_problems_func = stream_jsonl
+         self.write_jsonl_func = write_jsonl
+         self.eval_func = check_correctness
+
+         super().__init__(
+             subset_list=subset_list,
+             metric_list=metric_list,
+             few_shot_num=few_shot_num,
+             train_split=train_split,
+             eval_split=eval_split,
+             prompt_template=prompt_template,
+             **kwargs)
+
+     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
+         data_dict = {}
+         for subset_name in subset_list:
+             data_dict[subset_name] = {}
+             # [{'task_id': '', 'prompt': '', 'entry_point': '', 'canonical_solution': '', 'test': ''}, ...]
+             data_dict[subset_name][self.eval_split] = [task for task in self.read_problems_func(dataset_name_or_path)]
+
+         return data_dict
+
+     def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
+         """
+         Generate prompt for the model.
+
+         Args:
+             input_d (dict): The raw input. A single data format of the Humaneval:
+             {'task_id': '', 'prompt': '', 'entry_point': '', 'canonical_solution': '', 'test': ''}
+         """
+         full_prompt = input_d['prompt']
+         full_prompt = f'{self.prompt_template}\n{full_prompt}' if self.prompt_template else full_prompt
+
+         return {'data': [full_prompt]}
+
+     def get_answers(self, infer_cfg: dict) -> List[dict]:
+         ans_list: list = []
+         system_prompt: str = ''
+         for task_id, data_d in tqdm(self.problems.items(), total=len(self.problems), desc='Predicting(problems)'):
+             prompt: str = system_prompt + data_d['prompt']
+             inputs: dict = {'data': [prompt]}
+
+             pred_res: dict = self.model_adapter.predict(inputs=inputs, infer_cfg=infer_cfg)
+
+             pred_ans: str = pred_res['choices'][0]['message']['content']
+             pred_ans = self._postprocess(pred_ans)
+
+             ans_list.append({'task_id': task_id, 'completion': pred_ans})
+
+         return ans_list
+
+     def eval(self, infer_cfg: dict, **kwargs):
+
+         # predict
+         ans_list: list = self.get_answers(infer_cfg)
+         ans_out_file: str = os.path.join(self.outputs_structure.predictions_dir, 'human_eval_predictions.jsonl')
+
+         self.write_jsonl_func(filename=ans_out_file, data=ans_list)
+         # logger.info(f'** Dump predictions to {ans_out_file} successfully.')
+         logger.info('** Dump predictions successfully.')
+
+         # evaluate results: e.g. {'pass@1': 0.333, 'pass@10': 0.111}
+         results = self.eval_func(
+             sample_file=ans_out_file,
+             k=self.k,
+             n_workers=self.num_workers,
+             timeout=self.timeout,
+             problem_file=self.problem_file)
+
+         # output: report
+         report_map: dict = self.gen_report(results=results)
+         report_dir: str = self.outputs_structure.reports_dir
+         report_file: str = os.path.join(report_dir, 'human_eval_report.json')
+
+         with open(report_file, 'w') as f:
+             f.write(json.dumps(report_map, ensure_ascii=False, indent=4))
+         # logger.info(f'** Dump report to {report_file} \n')
+         logger.info('** Dump report \n')
+
+         try:
+             # Make table
+             report_table: str = gen_table([report_dir])
+             logger.info(f'** Report table: \n {report_table} \n')
+         except Exception:
+             logger.error('Failed to generate report table.')
+
+     def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
+         total_num: int = sum([num for _, num in subset_score_map.values()])
+         weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
+         weighted_avg_acc = normalize_score(score=weighted_avg_acc)
+         cate_avg_list = [{
+             'name': subset_name,
+             'score': normalize_score(score=score)
+         } for subset_name, (score, _) in subset_score_map.items()]
+
+         category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
+
+         res_map = dict(
+             name=report_name or 'HumanEval',
+             metric='pass@1',
+             score=weighted_avg_acc,
+             category=[category_d],
+             total_num=total_num)
+
+         return res_map
+
+     @classmethod
+     def _postprocess(cls, text: str) -> str:
+         if '```' in text:
+             blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
+             if len(blocks) == 0:
+                 text = text.split('```')[1]  # fall back to default strategy
+             else:
+                 text = blocks[0]  # fetch the first code block
+                 if not text.startswith('\n'):  # in case starting with ```python
+                     text = text[max(text.find('\n') + 1, 0):]
+         if text.strip().startswith('from') or text.strip().startswith('import'):
+             def_idx = text.find('def')
+             if def_idx != -1:
+                 text = text[max(text.find('\n', def_idx) + 1, 0):]
+         text = text.split('\n\n')[0]
+         if text.strip().startswith('def'):
+             text = '\n'.join(text.split('\n')[1:])
+         if not text.startswith('    '):
+             if text.startswith(' '):
+                 text = '    ' + text.lstrip()
+             else:
+                 text = '\n'.join(['    ' + line for line in text.split('\n')])
+         return text
+
+     def compute_metric(self, review_res_list: list) -> float:
+         """
+         Compute evaluation result by specific metric.
+
+         Args:
+             review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
+
+         Returns:
+             The metric score.
+         """
+         items = [(score, 1.0) for score in review_res_list]
+         return weighted_mean(items)
+
+     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
+         return self._postprocess(result)
+
+     def get_gold_answer(self, input_d: dict) -> str:
+         return input_d
+
+     def match(self, gold: str, pred: str) -> float:
+         res = self.eval_func(gold, pred, self.timeout)
+         return float(res['passed'])

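
Illustrative only: the new adapter's `_postprocess` classmethod shown above trims a fenced chat response down to the function body that gets appended to the HumanEval prompt. The response text in this sketch is made up; the fence string is built programmatically just to avoid nesting literal backticks here.

```python
from evalscope.benchmarks.humaneval.humaneval_adapter import HumanevalAdapter

fence = '`' * 3  # "```", built indirectly to keep this snippet readable
response = (
    f"{fence}python\n"
    "def has_close_elements(numbers, threshold):\n"
    "    return any(abs(a - b) < threshold\n"
    "               for i, a in enumerate(numbers)\n"
    "               for b in numbers[i + 1:])\n"
    f"{fence}"
)
body = HumanevalAdapter._postprocess(response)
print(body)  # indented body only; the `def` line comes from the dataset prompt itself
```
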
{evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/race/race_adapter.py
@@ -5,7 +5,8 @@ import os

  from evalscope.benchmarks.data_adapter import DataAdapter
  from evalscope.metrics.metrics import exact_match, weighted_mean
- from evalscope.utils import jsonl_to_list, normalize_score
+ from evalscope.utils import normalize_score
+ from evalscope.utils.io_utils import jsonl_to_list
  from evalscope.utils.logger import get_logger

  # flake8: noqa

{evalscope-0.8.0 → evalscope-0.8.2}/evalscope/config.py
@@ -9,7 +9,8 @@ from typing import Dict, List, Optional, Union

  from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, DEFAULT_WORK_DIR, EvalBackend, EvalStage, EvalType, HubType
  from evalscope.models.custom import CustomModel
- from evalscope.utils import dict_to_yaml, gen_hash, json_to_dict, yaml_to_dict
+ from evalscope.utils import gen_hash
+ from evalscope.utils.io_utils import dict_to_yaml, json_to_dict, yaml_to_dict
  from evalscope.utils.logger import get_logger

  logger = get_logger()
@@ -31,6 +32,7 @@ DEFAULT_GENERATION_CONFIG = {
  class TaskConfig:
      # Model-related arguments
      model: Union[str, CustomModel, None] = None
+     model_id: Optional[str] = None
      model_args: Optional[Dict] = field(default_factory=lambda: DEFAULT_MODEL_ARGS | {})

      # Template-related arguments
@@ -64,6 +66,13 @@ class TaskConfig:
      dry_run: bool = False
      seed: int = 42

+     def __post_init__(self):
+         if (not self.model_id) and self.model:
+             if isinstance(self.model, CustomModel):
+                 self.model_id = type(self.model).__name__
+             else:
+                 self.model_id = os.path.basename(self.model).rstrip(os.sep)
+
      def to_dict(self):
          # Note: to avoid serialization error for some model instance
          return self.__dict__
@@ -105,7 +114,9 @@ class TaskConfig:
      def from_args(args: Namespace):
          # Convert Namespace to a dictionary and filter out None values
          args_dict = {k: v for k, v in vars(args).items() if v is not None}
-         del args_dict['func']  # Note: compat CLI arguments
+
+         if 'func' in args_dict:
+             del args_dict['func']  # Note: compat CLI arguments

          return TaskConfig.from_dict(args_dict)

@@ -119,6 +130,7 @@ class TaskConfig:
                  continue

              task.model = custom_model
+             task.model_id = type(custom_model).__name__
              res_list.append(task)

          return res_list
@@ -168,6 +180,30 @@ tasks = ['arc', 'gsm8k', 'mmlu', 'cmmlu', 'ceval', 'bbh', 'general_qa']
  registry_tasks = {task: TaskConfig.from_yaml(os.path.join(cur_path, f'registry/tasks/{task}.yaml')) for task in tasks}


+ def parse_task_config(task_cfg) -> TaskConfig:
+     """Parse task configuration from various formats into a TaskConfig object."""
+     if isinstance(task_cfg, TaskConfig):
+         logger.info('Args: Task config is provided with TaskConfig type.')
+     elif isinstance(task_cfg, dict):
+         logger.info('Args: Task config is provided with dictionary type.')
+         task_cfg = TaskConfig.from_dict(task_cfg)
+     elif isinstance(task_cfg, Namespace):
+         logger.info('Args: Task config is provided with CommandLine type.')
+         task_cfg = TaskConfig.from_args(task_cfg)
+     elif isinstance(task_cfg, str):
+         extension = task_cfg.split('.')[-1]
+         logger.info(f'Args: Task config is provided with {extension} file type.')
+         if extension in ['yaml', 'yml']:
+             task_cfg = TaskConfig.from_yaml(task_cfg)
+         elif extension == 'json':
+             task_cfg = TaskConfig.from_json(task_cfg)
+         else:
+             raise ValueError('Args: Unsupported file extension.')
+     else:
+         raise ValueError('Args: Please provide a valid task config.')
+     return task_cfg
+
+
  class TempModel(CustomModel):

      def __init__(self, config: dict):

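
Two behavioural additions stand out in this file: `TaskConfig` gains a `model_id` field that `__post_init__` derives from the model path (or from the class name of a `CustomModel`), and the new module-level `parse_task_config` accepts a `TaskConfig`, dict, argparse `Namespace`, or a YAML/JSON path and normalizes it. A rough sketch of the intended usage, assuming `TaskConfig.from_dict` (not shown in this diff) accepts a partial dict of the fields above; the model path is a placeholder:

```python
from evalscope.config import TaskConfig, parse_task_config

# Dict input is normalized into a TaskConfig; unsupported inputs raise ValueError.
cfg = parse_task_config({'model': '/models/Qwen2-7B-Instruct'})
assert isinstance(cfg, TaskConfig)

# __post_init__ fills model_id from the model path when it is not set explicitly.
print(cfg.model_id)  # expected: 'Qwen2-7B-Instruct'
```
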
{evalscope-0.8.0 → evalscope-0.8.2}/evalscope/constants.py
@@ -1,5 +1,4 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
- import os
  from modelscope.utils.constant import DEFAULT_REPOSITORY_REVISION
  from modelscope.utils.file_utils import get_dataset_cache_root, get_model_cache_root

@@ -7,6 +6,7 @@ DEFAULT_WORK_DIR = './outputs'
  DEFAULT_MODEL_REVISION = DEFAULT_REPOSITORY_REVISION  # master
  DEFAULT_MODEL_CACHE_DIR = get_model_cache_root()  # ~/.cache/modelscope/hub
  DEFAULT_DATASET_CACHE_DIR = get_dataset_cache_root()  # ~/.cache/modelscope/datasets
+ DEFAULT_ROOT_CACHE_DIR = DEFAULT_DATASET_CACHE_DIR  # compatible with old version


  class HubType:
@@ -76,33 +76,6 @@ class ArenaMode:
      PAIRWISE_BASELINE = 'pairwise_baseline'


- class OutputsStructure:
-     LOGS_DIR = 'logs'
-     PREDICTIONS_DIR = 'predictions'
-     REVIEWS_DIR = 'reviews'
-     REPORTS_DIR = 'reports'
-     CONFIGS_DIR = 'configs'
-
-     def __init__(self, outputs_dir: str, is_make: bool = True):
-         self.outputs_dir = outputs_dir
-         self.logs_dir = os.path.join(outputs_dir, OutputsStructure.LOGS_DIR)
-         self.predictions_dir = os.path.join(outputs_dir, OutputsStructure.PREDICTIONS_DIR)
-         self.reviews_dir = os.path.join(outputs_dir, OutputsStructure.REVIEWS_DIR)
-         self.reports_dir = os.path.join(outputs_dir, OutputsStructure.REPORTS_DIR)
-         self.configs_dir = os.path.join(outputs_dir, OutputsStructure.CONFIGS_DIR)
-
-         if is_make:
-             self.create_directories()
-
-     def create_directories(self):
-         os.makedirs(self.outputs_dir, exist_ok=True)
-         os.makedirs(self.logs_dir, exist_ok=True)
-         os.makedirs(self.predictions_dir, exist_ok=True)
-         os.makedirs(self.reviews_dir, exist_ok=True)
-         os.makedirs(self.reports_dir, exist_ok=True)
-         os.makedirs(self.configs_dir, exist_ok=True)
-
-
  class AnswerKeys:
      ANSWER_ID = 'answer_id'
      RAW_INPUT = 'raw_input'
@@ -166,17 +139,30 @@ class EvalType:


  class EvalBackend:
-     # Use native evaluation pipeline of EvalScope
-     NATIVE = 'Native'

-     # Use OpenCompass framework as the evaluation backend
-     OPEN_COMPASS = 'OpenCompass'
+     class _Backend:
+         # compatible with old version, set 'value'
+
+         def __init__(self, value):
+             self._value = value
+
+         @property
+         def value(self):
+             return self._value
+
+         def __str__(self):
+             return self._value

-     # Use VLM Eval Kit as the multi-modal model evaluation backend
-     VLM_EVAL_KIT = 'VLMEvalKit'
+         def __repr__(self):
+             return f"'{self._value}'"

-     # Use RAGEval as the RAG evaluation backend
-     RAG_EVAL = 'RAGEval'
+         def __eq__(self, other):
+             if isinstance(other, str):
+                 return self._value == other
+             return NotImplemented

-     # Use third-party evaluation backend/modules
-     THIRD_PARTY = 'ThirdParty'
+     NATIVE = _Backend('Native')
+     OPEN_COMPASS = _Backend('OpenCompass')
+     VLM_EVAL_KIT = _Backend('VLMEvalKit')
+     RAG_EVAL = _Backend('RAGEval')
+     THIRD_PARTY = _Backend('ThirdParty')

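
The `_Backend` wrapper keeps old string-based configuration working while giving the constants an enum-like `.value` attribute. Based only on the methods shown above, both access styles behave like this:

```python
from evalscope.constants import EvalBackend

# Old style: plain string comparison still holds because of _Backend.__eq__.
assert EvalBackend.NATIVE == 'Native'

# New style: enum-like access via .value, plus readable str()/repr().
assert EvalBackend.OPEN_COMPASS.value == 'OpenCompass'
print(str(EvalBackend.RAG_EVAL))  # RAGEval
print(repr(EvalBackend.NATIVE))   # 'Native'
```
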
{evalscope-0.8.0 → evalscope-0.8.2}/evalscope/evaluator/__init__.py
@@ -1,4 +1,3 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.

  from evalscope.evaluator.evaluator import Evaluator
- from evalscope.evaluator.humaneval_evaluator import HumanevalEvaluator

{evalscope-0.8.0 → evalscope-0.8.2}/evalscope/evaluator/evaluator.py
@@ -11,10 +11,11 @@ from typing import Any, Dict, List, Optional, Union
  from evalscope.benchmarks import DataAdapter
  from evalscope.config import TaskConfig
  from evalscope.constants import (DEFAULT_DATASET_CACHE_DIR, AnswerKeys, DumpMode, EvalStage, EvalType, HubType,
-                                  OutputsStructure, ReviewKeys)
+                                  ReviewKeys)
  from evalscope.models.model_adapter import BaseModelAdapter, CustomModelAdapter
  from evalscope.tools.combine_reports import gen_table
- from evalscope.utils import dict_torch_dtype_to_str, dump_jsonl_data, gen_hash, jsonl_to_list
+ from evalscope.utils import dict_torch_dtype_to_str, gen_hash
+ from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, jsonl_to_list
  from evalscope.utils.logger import get_logger

  logger = get_logger()
@@ -56,8 +57,8 @@ class Evaluator(object):
                   **kwargs):

          self.dataset_name_or_path = os.path.expanduser(dataset_name_or_path)
-         self.dataset_name = os.path.basename(self.dataset_name_or_path.rstrip(os.sep))
-         self.model_name = os.path.basename(str(overall_task_cfg.model).rstrip(os.sep))
+         self.dataset_name = os.path.basename(self.dataset_name_or_path.rstrip(os.sep)).split('.')[0]
+         self.model_name = overall_task_cfg.model_id
          self.custom_task_name = f'{self.model_name}_{self.dataset_name}'

          self.datasets_dir = os.path.expanduser(datasets_dir)
@@ -85,6 +86,7 @@ class Evaluator(object):
              **kwargs)

          # Get prompts from dataset
+         # TODO: support sampler
          self.prompts = self.data_adapter.gen_prompts(data_dict=self.dataset)
          del self.dataset
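
Two small but user-visible consequences of the hunk above: a local dataset file now produces a cleaner name (the file extension is stripped), and the reported model name comes from `TaskConfig.model_id` rather than the raw model argument. A quick illustration of the new name derivation (the path is made up):

```python
import os

path = '/data/gsm8k.jsonl'
# 0.8.0 yielded 'gsm8k.jsonl'; 0.8.2 yields 'gsm8k'.
dataset_name = os.path.basename(path.rstrip(os.sep)).split('.')[0]
print(dataset_name)  # gsm8k
```
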
{evalscope-0.8.0 → evalscope-0.8.2}/evalscope/evaluator/rating_eval.py
@@ -5,8 +5,8 @@ import pyarrow as pa
  from typing import List, Union

  from evalscope.constants import MetricMembers
- from evalscope.utils import jsonl_to_list
  from evalscope.utils.arena_utils import compute_elo
+ from evalscope.utils.io_utils import jsonl_to_list
  from evalscope.utils.logger import get_logger

  logger = get_logger()

{evalscope-0.8.0 → evalscope-0.8.2}/evalscope/evaluator/reviewer/auto_reviewer.py
@@ -12,8 +12,9 @@ from typing import Any, List

  from evalscope.constants import ArenaMode, EvalConfigKeys, FnCompletionParser, PositionBiasMitigation
  from evalscope.models.openai_model import OpenAIModel
- from evalscope.utils import completion_parsers, dump_jsonl_data, jsonl_to_list, random_seeded_choice
+ from evalscope.utils import completion_parsers, random_seeded_choice
  from evalscope.utils.arena_utils import get_battle_pairs, merge_ques_ans, shuffle_pairwise_preferences
+ from evalscope.utils.io_utils import dump_jsonl_data, jsonl_to_list
  from evalscope.utils.logger import get_logger

  logger = get_logger()

{evalscope-0.8.0 → evalscope-0.8.2}/evalscope/models/model_adapter.py
@@ -429,7 +429,7 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
          fix_do_sample_warning(self.generation_config)

          # Run inference
-         output_ids = self.model.generate(**inputs, generation_config=self.generation_config)
+         output_ids = self.model.generate(input_ids, generation_config=self.generation_config)

          response = self.tokenizer.decode(output_ids[0, len(input_ids[0]):], skip_special_tokens=True)
          return response

{evalscope-0.8.0 → evalscope-0.8.2}/evalscope/perf/arguments.py
@@ -16,7 +16,7 @@ class Arguments:
      attn_implementation: Optional[str] = None  # Attention implementaion, only for local inference
      api: str = 'openai'  # API to be used (default: 'openai')
      tokenizer_path: Optional[str] = None  # Path to the tokenizer
-     port: str = '8877'  # Port number for the local API server
+     port: int = 8877  # Port number for the local API server

      # Connection settings
      url: str = 'http://127.0.0.1:8877/v1/chat/completions'  # URL for the API connection
@@ -68,6 +68,7 @@ class Arguments:
              model=args.model,
              attn_implementation=args.attn_implementation,
              url=args.url,
+             port=args.port,
              api_key=args.api_key,
              connect_timeout=args.connect_timeout,
              read_timeout=args.read_timeout,
@@ -138,6 +139,7 @@ def add_argument(parser: argparse.ArgumentParser):

      # Connection settings
      parser.add_argument('--url', type=str, default='http://127.0.0.1:8877/v1/chat/completions')
+     parser.add_argument('--port', type=int, default=8877, help='The port for local inference')
      parser.add_argument('--headers', nargs='+', dest='headers', action=ParseKVAction, help='Extra HTTP headers')
      parser.add_argument('--api-key', type=str, required=False, default='EMPTY', help='The API key for authentication')
      parser.add_argument('--connect-timeout', type=int, default=120, help='The network connection timeout')

{evalscope-0.8.0 → evalscope-0.8.2}/evalscope/perf/benchmark.py
@@ -157,7 +157,7 @@ async def statistic_benchmark_metric_worker(benchmark_data_queue: asyncio.Queue,
      while not (data_process_completed_event.is_set() and benchmark_data_queue.empty()):
          try:
              # Attempt to get benchmark data from the queue with a timeout
-             benchmark_data = await asyncio.wait_for(benchmark_data_queue.get(), timeout=1)
+             benchmark_data = await asyncio.wait_for(benchmark_data_queue.get(), timeout=0.01)
              benchmark_data_queue.task_done()
          except asyncio.TimeoutError:
              # If timeout, continue to the next iteration
@@ -195,9 +195,9 @@ async def start_server(args: Arguments) -> bool:
      server.start()

      if args.dataset.startswith('speed_benchmark'):
-         args.url = 'http://127.0.0.1:8877/v1/completions'
+         args.url = f'http://127.0.0.1:{args.port}/v1/completions'
      else:
-         args.url = 'http://127.0.0.1:8877/v1/chat/completions'
+         args.url = f'http://127.0.0.1:{args.port}/v1/chat/completions'

      if not await test_connection(args):
          raise TimeoutError('Test connection failed')
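
With `--port` now parsed as an int and forwarded into `Arguments`, the locally started server URL follows the chosen port instead of the previously hard-coded 8877. A minimal sketch of the selection logic shown above (the dataset and port values are placeholders):

```python
def pick_url(dataset: str, port: int) -> str:
    # Mirrors the start_server() branch above: speed benchmarks hit the plain
    # completions endpoint, everything else the chat completions endpoint.
    if dataset.startswith('speed_benchmark'):
        return f'http://127.0.0.1:{port}/v1/completions'
    return f'http://127.0.0.1:{port}/v1/chat/completions'


print(pick_url('openqa', 9000))  # http://127.0.0.1:9000/v1/chat/completions
```
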