evalscope 0.13.1__tar.gz → 0.14.0__tar.gz

This diff shows the contents of publicly released versions of the package as they appear in their respective public registries; it is provided for informational purposes only.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (364)
  1. {evalscope-0.13.1/evalscope.egg-info → evalscope-0.14.0}/PKG-INFO +21 -55
  2. {evalscope-0.13.1 → evalscope-0.14.0}/README.md +10 -4
  3. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/arguments.py +1 -1
  4. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/__init__.py +1 -1
  5. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/backend_manager.py +21 -5
  6. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/cmteb/arguments.py +10 -0
  7. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/ragas/arguments.py +0 -1
  8. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +7 -2
  9. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -5
  10. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/utils/embedding.py +49 -3
  11. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/utils/llm.py +8 -9
  12. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/vlm_eval_kit/backend_manager.py +4 -2
  13. evalscope-0.14.0/evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +109 -0
  14. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/arc/arc_adapter.py +1 -1
  15. evalscope-0.14.0/evalscope/benchmarks/arena_hard/arena_hard_adapter.py +120 -0
  16. evalscope-0.14.0/evalscope/benchmarks/arena_hard/utils.py +162 -0
  17. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +2 -5
  18. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -1
  19. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/data_adapter.py +30 -2
  20. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -1
  21. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/general_qa/general_qa_adapter.py +6 -12
  22. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
  23. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -5
  24. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +1 -3
  25. evalscope-0.14.0/evalscope/benchmarks/live_code_bench/testing_util.py +537 -0
  26. evalscope-0.14.0/evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +79 -0
  27. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/mmlu/mmlu_adapter.py +5 -7
  28. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  29. evalscope-0.14.0/evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +182 -0
  30. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/musr/musr_adapter.py +1 -1
  31. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/simple_qa/simple_qa_adapter.py +2 -5
  32. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/collections/evaluator.py +4 -2
  33. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/config.py +2 -2
  34. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/metrics/llm_judge.py +1 -1
  35. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/models/chat_adapter.py +32 -11
  36. evalscope-0.14.0/evalscope/perf/__init__.py +0 -0
  37. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/perf/arguments.py +30 -9
  38. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/perf/benchmark.py +57 -103
  39. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/perf/http_client.py +2 -3
  40. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/perf/plugin/api/custom_api.py +1 -1
  41. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/perf/plugin/api/openai_api.py +4 -2
  42. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/perf/plugin/datasets/custom.py +4 -1
  43. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/perf/plugin/datasets/line_by_line.py +4 -1
  44. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/perf/plugin/datasets/longalpaca.py +4 -1
  45. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/perf/plugin/datasets/openqa.py +4 -1
  46. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/perf/plugin/datasets/random_dataset.py +13 -6
  47. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
  48. evalscope-0.14.0/evalscope/perf/utils/__init__.py +0 -0
  49. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/perf/utils/benchmark_util.py +12 -6
  50. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/perf/utils/db_util.py +3 -3
  51. evalscope-0.14.0/evalscope/perf/utils/log_utils.py +41 -0
  52. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/report/app.py +11 -11
  53. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/run.py +7 -0
  54. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/summarizer.py +2 -1
  55. evalscope-0.14.0/evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  56. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/utils/utils.py +36 -25
  57. evalscope-0.14.0/evalscope/version.py +4 -0
  58. {evalscope-0.13.1 → evalscope-0.14.0/evalscope.egg-info}/PKG-INFO +21 -55
  59. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope.egg-info/SOURCES.txt +10 -4
  60. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope.egg-info/requires.txt +10 -51
  61. {evalscope-0.13.1 → evalscope-0.14.0}/requirements/framework.txt +0 -12
  62. evalscope-0.14.0/requirements/rag.txt +7 -0
  63. {evalscope-0.13.1 → evalscope-0.14.0}/setup.py +0 -1
  64. {evalscope-0.13.1 → evalscope-0.14.0}/tests/cli/test_all.py +36 -27
  65. {evalscope-0.13.1 → evalscope-0.14.0}/tests/cli/test_collection.py +2 -1
  66. {evalscope-0.13.1 → evalscope-0.14.0}/tests/cli/test_run.py +38 -20
  67. {evalscope-0.13.1 → evalscope-0.14.0}/tests/perf/test_perf.py +1 -2
  68. evalscope-0.14.0/tests/rag/__init__.py +0 -0
  69. {evalscope-0.13.1 → evalscope-0.14.0}/tests/rag/test_clip_benchmark.py +0 -1
  70. {evalscope-0.13.1 → evalscope-0.14.0}/tests/rag/test_mteb.py +37 -8
  71. {evalscope-0.13.1 → evalscope-0.14.0}/tests/rag/test_ragas.py +33 -27
  72. {evalscope-0.13.1 → evalscope-0.14.0}/tests/vlm/test_vlmeval.py +37 -1
  73. evalscope-0.13.1/evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
  74. evalscope-0.13.1/evalscope/benchmarks/live_code_bench/execute_utils.py +0 -267
  75. evalscope-0.13.1/evalscope/benchmarks/live_code_bench/testing_util.py +0 -721
  76. evalscope-0.13.1/evalscope/version.py +0 -4
  77. evalscope-0.13.1/requirements/inner.txt +0 -25
  78. evalscope-0.13.1/requirements/rag.txt +0 -3
  79. evalscope-0.13.1/requirements/tests.txt +0 -5
  80. {evalscope-0.13.1 → evalscope-0.14.0}/LICENSE +0 -0
  81. {evalscope-0.13.1 → evalscope-0.14.0}/MANIFEST.in +0 -0
  82. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/__init__.py +0 -0
  83. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/__init__.py +0 -0
  84. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/base.py +0 -0
  85. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/opencompass/__init__.py +0 -0
  86. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/opencompass/api_meta_template.py +0 -0
  87. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/opencompass/backend_manager.py +0 -0
  88. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/opencompass/tasks/__init__.py +0 -0
  89. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/opencompass/tasks/eval_api.py +0 -0
  90. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/opencompass/tasks/eval_datasets.py +0 -0
  91. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/clip_benchmark/__init__.py +0 -0
  92. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/clip_benchmark/arguments.py +0 -0
  93. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +0 -0
  94. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/clip_benchmark/task_template.py +0 -0
  95. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/__init__.py +0 -0
  96. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +0 -0
  97. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -0
  98. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +0 -0
  99. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +0 -0
  100. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt +0 -0
  101. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/cmteb/__init__.py +0 -0
  102. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/cmteb/base.py +0 -0
  103. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/cmteb/task_template.py +0 -0
  104. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/cmteb/tasks/Classification.py +0 -0
  105. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +0 -0
  106. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +0 -0
  107. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +0 -0
  108. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +0 -0
  109. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +0 -0
  110. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/cmteb/tasks/STS.py +0 -0
  111. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/cmteb/tasks/__init__.py +0 -0
  112. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/ragas/__init__.py +0 -0
  113. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -0
  114. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/ragas/task_template.py +0 -0
  115. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/ragas/tasks/__init__.py +0 -0
  116. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +0 -0
  117. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/ragas/tasks/build_transform.py +0 -0
  118. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/utils/__init__.py +0 -0
  119. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/utils/clip.py +0 -0
  120. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/utils/tools.py +0 -0
  121. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/vlm_eval_kit/__init__.py +0 -0
  122. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/__init__.py +0 -0
  123. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/aime/__init__.py +0 -0
  124. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/aime/aime24_adapter.py +0 -0
  125. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/aime/aime25_adapter.py +0 -0
  126. {evalscope-0.13.1/evalscope/benchmarks/chinese_simple_qa → evalscope-0.14.0/evalscope/benchmarks/alpaca_eval}/__init__.py +0 -0
  127. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/arc/__init__.py +0 -0
  128. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/arc/ai2_arc.py +0 -0
  129. {evalscope-0.13.1/evalscope/benchmarks/data_collection → evalscope-0.14.0/evalscope/benchmarks/arena_hard}/__init__.py +0 -0
  130. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/bbh/__init__.py +0 -0
  131. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/bbh/bbh_adapter.py +0 -0
  132. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +0 -0
  133. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +0 -0
  134. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +0 -0
  135. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +0 -0
  136. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +0 -0
  137. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +0 -0
  138. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +0 -0
  139. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +0 -0
  140. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +0 -0
  141. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +0 -0
  142. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +0 -0
  143. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +0 -0
  144. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +0 -0
  145. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/bbh/cot_prompts/navigate.txt +0 -0
  146. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +0 -0
  147. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +0 -0
  148. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +0 -0
  149. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +0 -0
  150. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +0 -0
  151. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/bbh/cot_prompts/snarks.txt +0 -0
  152. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +0 -0
  153. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +0 -0
  154. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
  155. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
  156. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
  157. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +0 -0
  158. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +0 -0
  159. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/benchmark.py +0 -0
  160. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/ceval/__init__.py +0 -0
  161. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/ceval/ceval_adapter.py +0 -0
  162. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/ceval/ceval_exam.py +0 -0
  163. {evalscope-0.13.1/evalscope/benchmarks/general_mcq → evalscope-0.14.0/evalscope/benchmarks/chinese_simple_qa}/__init__.py +0 -0
  164. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/cmmlu/__init__.py +0 -0
  165. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/cmmlu/cmmlu.py +0 -0
  166. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/cmmlu/cmmlu_adapter.py +0 -0
  167. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/cmmlu/samples.jsonl +0 -0
  168. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/competition_math/__init__.py +0 -0
  169. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/competition_math/competition_math.py +0 -0
  170. {evalscope-0.13.1/evalscope/benchmarks/gpqa → evalscope-0.14.0/evalscope/benchmarks/data_collection}/__init__.py +0 -0
  171. {evalscope-0.13.1/evalscope/benchmarks/ifeval → evalscope-0.14.0/evalscope/benchmarks/general_mcq}/__init__.py +0 -0
  172. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/general_mcq/general_mcq_adapter.py +0 -0
  173. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/general_qa/__init__.py +0 -0
  174. {evalscope-0.13.1/evalscope/benchmarks/iquiz → evalscope-0.14.0/evalscope/benchmarks/gpqa}/__init__.py +0 -0
  175. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/gpqa/chain_of_thought.txt +0 -0
  176. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/gpqa/gpqa_adapter.py +0 -0
  177. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/gsm8k/__init__.py +0 -0
  178. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/gsm8k/gsm8k.py +0 -0
  179. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/gsm8k/gsm8k_adapter.py +0 -0
  180. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/hellaswag/__init__.py +0 -0
  181. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/hellaswag/hellaswag.py +0 -0
  182. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/humaneval/__init__.py +0 -0
  183. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/humaneval/humaneval.py +0 -0
  184. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -0
  185. {evalscope-0.13.1/evalscope/benchmarks/live_code_bench → evalscope-0.14.0/evalscope/benchmarks/ifeval}/__init__.py +0 -0
  186. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/ifeval/instructions.py +0 -0
  187. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/ifeval/instructions_registry.py +0 -0
  188. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/ifeval/instructions_util.py +0 -0
  189. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/ifeval/utils.py +0 -0
  190. {evalscope-0.13.1/evalscope/benchmarks/math_500 → evalscope-0.14.0/evalscope/benchmarks/iquiz}/__init__.py +0 -0
  191. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/iquiz/iquiz_adapter.py +0 -0
  192. {evalscope-0.13.1/evalscope/benchmarks/mmlu_pro → evalscope-0.14.0/evalscope/benchmarks/live_code_bench}/__init__.py +0 -0
  193. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/live_code_bench/evaluate_utils.py +0 -0
  194. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/live_code_bench/extract_utils.py +0 -0
  195. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/live_code_bench/load_utils.py +0 -0
  196. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/live_code_bench/pass_k_utils.py +0 -0
  197. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/live_code_bench/prompts.py +0 -0
  198. {evalscope-0.13.1/evalscope/benchmarks/musr → evalscope-0.14.0/evalscope/benchmarks/maritime_bench}/__init__.py +0 -0
  199. {evalscope-0.13.1/evalscope/benchmarks/process_bench → evalscope-0.14.0/evalscope/benchmarks/math_500}/__init__.py +0 -0
  200. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/math_500/math_500_adapter.py +0 -0
  201. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/mmlu/__init__.py +0 -0
  202. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/mmlu/mmlu.py +0 -0
  203. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/mmlu/samples.jsonl +0 -0
  204. {evalscope-0.13.1/evalscope/benchmarks/simple_qa → evalscope-0.14.0/evalscope/benchmarks/mmlu_pro}/__init__.py +0 -0
  205. {evalscope-0.13.1/evalscope/benchmarks/super_gpqa → evalscope-0.14.0/evalscope/benchmarks/mmlu_redux}/__init__.py +0 -0
  206. {evalscope-0.13.1/evalscope/perf → evalscope-0.14.0/evalscope/benchmarks/musr}/__init__.py +0 -0
  207. {evalscope-0.13.1/evalscope/perf/utils → evalscope-0.14.0/evalscope/benchmarks/process_bench}/__init__.py +0 -0
  208. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/process_bench/critique_template.txt +0 -0
  209. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/process_bench/process_bench_adapter.py +0 -0
  210. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/race/__init__.py +0 -0
  211. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/race/race.py +0 -0
  212. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/race/race_adapter.py +0 -0
  213. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/race/samples.jsonl +0 -0
  214. {evalscope-0.13.1/evalscope/third_party/thinkbench/tools → evalscope-0.14.0/evalscope/benchmarks/simple_qa}/__init__.py +0 -0
  215. {evalscope-0.13.1/tests/rag → evalscope-0.14.0/evalscope/benchmarks/super_gpqa}/__init__.py +0 -0
  216. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +0 -0
  217. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +0 -0
  218. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/super_gpqa/utils.py +0 -0
  219. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -0
  220. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/trivia_qa/__init__.py +0 -0
  221. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/trivia_qa/samples.jsonl +0 -0
  222. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -0
  223. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +0 -0
  224. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/truthful_qa/__init__.py +0 -0
  225. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -0
  226. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +0 -0
  227. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/utils.py +0 -0
  228. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/cli/__init__.py +0 -0
  229. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/cli/base.py +0 -0
  230. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/cli/cli.py +0 -0
  231. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/cli/start_app.py +0 -0
  232. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/cli/start_eval.py +0 -0
  233. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/cli/start_perf.py +0 -0
  234. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/cli/start_server.py +0 -0
  235. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/collections/__init__.py +0 -0
  236. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/collections/sampler.py +0 -0
  237. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/collections/schema.py +0 -0
  238. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/constants.py +0 -0
  239. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/evaluator/__init__.py +0 -0
  240. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/evaluator/evaluator.py +0 -0
  241. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/evaluator/rating_eval.py +0 -0
  242. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/evaluator/reviewer/__init__.py +0 -0
  243. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/evaluator/reviewer/auto_reviewer.py +0 -0
  244. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/metrics/__init__.py +0 -0
  245. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/metrics/bundled_rouge_score/__init__.py +0 -0
  246. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/metrics/bundled_rouge_score/rouge_scorer.py +0 -0
  247. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/metrics/code_metric.py +0 -0
  248. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/metrics/math_parser.py +0 -0
  249. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/metrics/metrics.py +0 -0
  250. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/metrics/named_metrics.py +0 -0
  251. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -0
  252. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -0
  253. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/metrics/rouge_metric.py +0 -0
  254. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/models/__init__.py +0 -0
  255. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/models/base_adapter.py +0 -0
  256. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/models/choice_adapter.py +0 -0
  257. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/models/custom/__init__.py +0 -0
  258. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/models/custom/custom_model.py +0 -0
  259. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/models/custom/dummy_model.py +0 -0
  260. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/models/custom_adapter.py +0 -0
  261. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/models/local_model.py +0 -0
  262. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/models/model.py +0 -0
  263. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/models/register.py +0 -0
  264. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/models/server_adapter.py +0 -0
  265. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/perf/main.py +0 -0
  266. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/perf/plugin/__init__.py +0 -0
  267. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/perf/plugin/api/__init__.py +0 -0
  268. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/perf/plugin/api/base.py +0 -0
  269. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/perf/plugin/api/dashscope_api.py +0 -0
  270. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/perf/plugin/datasets/__init__.py +0 -0
  271. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/perf/plugin/datasets/base.py +0 -0
  272. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/perf/plugin/datasets/flickr8k.py +0 -0
  273. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/perf/plugin/registry.py +0 -0
  274. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/perf/utils/analysis_result.py +0 -0
  275. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/perf/utils/handler.py +0 -0
  276. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/perf/utils/local_server.py +0 -0
  277. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/registry/__init__.py +0 -0
  278. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/registry/config/cfg_arena.yaml +0 -0
  279. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/registry/config/cfg_arena_zhihu.yaml +0 -0
  280. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -0
  281. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/registry/config/cfg_single.yaml +0 -0
  282. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -0
  283. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -0
  284. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/registry/data/qa_browser/battle.jsonl +0 -0
  285. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/registry/data/qa_browser/category_mapping.yaml +0 -0
  286. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/registry/data/question.jsonl +0 -0
  287. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/registry/tasks/arc.yaml +0 -0
  288. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/registry/tasks/bbh.yaml +0 -0
  289. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/registry/tasks/bbh_mini.yaml +0 -0
  290. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/registry/tasks/ceval.yaml +0 -0
  291. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/registry/tasks/ceval_mini.yaml +0 -0
  292. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/registry/tasks/cmmlu.yaml +0 -0
  293. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -0
  294. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/registry/tasks/general_qa.yaml +0 -0
  295. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/registry/tasks/gsm8k.yaml +0 -0
  296. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/registry/tasks/mmlu.yaml +0 -0
  297. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/registry/tasks/mmlu_mini.yaml +0 -0
  298. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/report/__init__.py +0 -0
  299. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/report/combinator.py +0 -0
  300. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/report/generator.py +0 -0
  301. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/report/utils.py +0 -0
  302. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/run_arena.py +0 -0
  303. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/__init__.py +0 -0
  304. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/longbench_write/README.md +0 -0
  305. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/longbench_write/__init__.py +0 -0
  306. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/longbench_write/default_task.json +0 -0
  307. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/longbench_write/default_task.yaml +0 -0
  308. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/longbench_write/eval.py +0 -0
  309. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/longbench_write/infer.py +0 -0
  310. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/longbench_write/longbench_write.py +0 -0
  311. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/longbench_write/resources/__init__.py +0 -0
  312. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/longbench_write/resources/judge.txt +0 -0
  313. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/longbench_write/resources/longbench_write.jsonl +0 -0
  314. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl +0 -0
  315. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl +0 -0
  316. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/longbench_write/tools/__init__.py +0 -0
  317. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/longbench_write/tools/data_etl.py +0 -0
  318. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/longbench_write/tools/openai_api.py +0 -0
  319. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/longbench_write/utils.py +0 -0
  320. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/thinkbench/__init__.py +0 -0
  321. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/thinkbench/eval.py +0 -0
  322. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/thinkbench/infer.py +0 -0
  323. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/thinkbench/resources/critique_template.txt +0 -0
  324. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/thinkbench/resources/reformat_template.txt +0 -0
  325. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/thinkbench/tools/llm.py +0 -0
  326. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/thinkbench/tools/utils.py +0 -0
  327. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/toolbench_static/README.md +0 -0
  328. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/toolbench_static/__init__.py +0 -0
  329. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/toolbench_static/config_default.json +0 -0
  330. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/toolbench_static/config_default.yaml +0 -0
  331. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/toolbench_static/eval.py +0 -0
  332. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/toolbench_static/infer.py +0 -0
  333. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/toolbench_static/llm/__init__.py +0 -0
  334. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -0
  335. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/toolbench_static/requirements.txt +0 -0
  336. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/toolbench_static/toolbench_static.py +0 -0
  337. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/utils/__init__.py +0 -0
  338. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/utils/arena_utils.py +0 -0
  339. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/utils/chat_service.py +0 -0
  340. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/utils/completion_parsers.py +0 -0
  341. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/utils/filters.py +0 -0
  342. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/utils/io_utils.py +0 -0
  343. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/utils/logger.py +0 -0
  344. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/utils/model_utils.py +0 -0
  345. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope.egg-info/dependency_links.txt +0 -0
  346. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope.egg-info/entry_points.txt +0 -0
  347. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope.egg-info/not-zip-safe +0 -0
  348. {evalscope-0.13.1 → evalscope-0.14.0}/evalscope.egg-info/top_level.txt +0 -0
  349. {evalscope-0.13.1 → evalscope-0.14.0}/requirements/app.txt +0 -0
  350. {evalscope-0.13.1 → evalscope-0.14.0}/requirements/docs.txt +0 -0
  351. {evalscope-0.13.1 → evalscope-0.14.0}/requirements/opencompass.txt +0 -0
  352. {evalscope-0.13.1 → evalscope-0.14.0}/requirements/perf.txt +0 -0
  353. {evalscope-0.13.1 → evalscope-0.14.0}/requirements/vlmeval.txt +0 -0
  354. {evalscope-0.13.1 → evalscope-0.14.0}/requirements.txt +0 -0
  355. {evalscope-0.13.1 → evalscope-0.14.0}/setup.cfg +0 -0
  356. {evalscope-0.13.1 → evalscope-0.14.0}/tests/__init__.py +0 -0
  357. {evalscope-0.13.1 → evalscope-0.14.0}/tests/cli/__init__.py +0 -0
  358. {evalscope-0.13.1 → evalscope-0.14.0}/tests/perf/__init__.py +0 -0
  359. {evalscope-0.13.1 → evalscope-0.14.0}/tests/swift/__init__.py +0 -0
  360. {evalscope-0.13.1 → evalscope-0.14.0}/tests/swift/test_run_swift_eval.py +0 -0
  361. {evalscope-0.13.1 → evalscope-0.14.0}/tests/swift/test_run_swift_vlm_eval.py +0 -0
  362. {evalscope-0.13.1 → evalscope-0.14.0}/tests/swift/test_run_swift_vlm_jugde_eval.py +0 -0
  363. {evalscope-0.13.1 → evalscope-0.14.0}/tests/test_run_all.py +0 -0
  364. {evalscope-0.13.1 → evalscope-0.14.0}/tests/vlm/__init__.py +0 -0
PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.13.1
+Version: 0.14.0
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team
@@ -16,11 +16,8 @@ Classifier: Programming Language :: Python :: 3.10
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: absl-py
 Requires-Dist: accelerate
-Requires-Dist: cachetools
 Requires-Dist: datasets<=3.2.0,>=3.0.0
-Requires-Dist: editdistance
 Requires-Dist: immutabledict
 Requires-Dist: jieba
 Requires-Dist: jsonlines
@@ -31,35 +28,31 @@ Requires-Dist: modelscope[framework]
 Requires-Dist: nltk>=3.9
 Requires-Dist: openai
 Requires-Dist: pandas
-Requires-Dist: plotly
 Requires-Dist: pyarrow
-Requires-Dist: pympler
 Requires-Dist: pyyaml
-Requires-Dist: regex
 Requires-Dist: requests
-Requires-Dist: requests-toolbelt
 Requires-Dist: rouge-chinese
 Requires-Dist: rouge-score>=0.1.0
 Requires-Dist: sacrebleu
 Requires-Dist: scikit-learn
 Requires-Dist: seaborn
-Requires-Dist: sentencepiece
-Requires-Dist: simple-ddl-parser
 Requires-Dist: sympy
 Requires-Dist: tabulate
-Requires-Dist: tiktoken
 Requires-Dist: torch
 Requires-Dist: tqdm
 Requires-Dist: transformers>=4.33
-Requires-Dist: transformers_stream_generator
 Requires-Dist: word2number
 Provides-Extra: opencompass
 Requires-Dist: ms-opencompass>=0.1.4; extra == "opencompass"
 Provides-Extra: vlmeval
 Requires-Dist: ms-vlmeval>=0.0.9; extra == "vlmeval"
 Provides-Extra: rag
+Requires-Dist: langchain<0.4.0,>=0.3.0; extra == "rag"
+Requires-Dist: langchain-community<0.4.0,>=0.3.0; extra == "rag"
+Requires-Dist: langchain-core<0.4.0,>=0.3.0; extra == "rag"
+Requires-Dist: langchain-openai<0.4.0,>=0.3.0; extra == "rag"
 Requires-Dist: mteb==1.19.4; extra == "rag"
-Requires-Dist: ragas==0.2.9; extra == "rag"
+Requires-Dist: ragas==0.2.14; extra == "rag"
 Requires-Dist: webdataset>0.2.0; extra == "rag"
 Provides-Extra: perf
 Requires-Dist: aiohttp; extra == "perf"
@@ -71,38 +64,9 @@ Requires-Dist: unicorn; extra == "perf"
 Provides-Extra: app
 Requires-Dist: gradio==5.4.0; extra == "app"
 Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "app"
-Provides-Extra: inner
-Requires-Dist: absl-py; extra == "inner"
-Requires-Dist: accelerate; extra == "inner"
-Requires-Dist: alibaba_itag_sdk; extra == "inner"
-Requires-Dist: dashscope; extra == "inner"
-Requires-Dist: editdistance; extra == "inner"
-Requires-Dist: jsonlines; extra == "inner"
-Requires-Dist: nltk; extra == "inner"
-Requires-Dist: openai; extra == "inner"
-Requires-Dist: pandas==1.5.3; extra == "inner"
-Requires-Dist: plotly; extra == "inner"
-Requires-Dist: pyarrow; extra == "inner"
-Requires-Dist: pyodps; extra == "inner"
-Requires-Dist: pyyaml; extra == "inner"
-Requires-Dist: regex; extra == "inner"
-Requires-Dist: requests==2.28.1; extra == "inner"
-Requires-Dist: requests-toolbelt==0.10.1; extra == "inner"
-Requires-Dist: rouge-score; extra == "inner"
-Requires-Dist: sacrebleu; extra == "inner"
-Requires-Dist: scikit-learn; extra == "inner"
-Requires-Dist: seaborn; extra == "inner"
-Requires-Dist: simple-ddl-parser; extra == "inner"
-Requires-Dist: streamlit; extra == "inner"
-Requires-Dist: tqdm; extra == "inner"
-Requires-Dist: transformers<4.43,>=4.33; extra == "inner"
-Requires-Dist: transformers_stream_generator; extra == "inner"
 Provides-Extra: all
-Requires-Dist: absl-py; extra == "all"
 Requires-Dist: accelerate; extra == "all"
-Requires-Dist: cachetools; extra == "all"
 Requires-Dist: datasets<=3.2.0,>=3.0.0; extra == "all"
-Requires-Dist: editdistance; extra == "all"
 Requires-Dist: immutabledict; extra == "all"
 Requires-Dist: jieba; extra == "all"
 Requires-Dist: jsonlines; extra == "all"
@@ -113,32 +77,28 @@ Requires-Dist: modelscope[framework]; extra == "all"
 Requires-Dist: nltk>=3.9; extra == "all"
 Requires-Dist: openai; extra == "all"
 Requires-Dist: pandas; extra == "all"
-Requires-Dist: plotly; extra == "all"
 Requires-Dist: pyarrow; extra == "all"
-Requires-Dist: pympler; extra == "all"
 Requires-Dist: pyyaml; extra == "all"
-Requires-Dist: regex; extra == "all"
 Requires-Dist: requests; extra == "all"
-Requires-Dist: requests-toolbelt; extra == "all"
 Requires-Dist: rouge-chinese; extra == "all"
 Requires-Dist: rouge-score>=0.1.0; extra == "all"
 Requires-Dist: sacrebleu; extra == "all"
 Requires-Dist: scikit-learn; extra == "all"
 Requires-Dist: seaborn; extra == "all"
-Requires-Dist: sentencepiece; extra == "all"
-Requires-Dist: simple-ddl-parser; extra == "all"
 Requires-Dist: sympy; extra == "all"
 Requires-Dist: tabulate; extra == "all"
-Requires-Dist: tiktoken; extra == "all"
 Requires-Dist: torch; extra == "all"
 Requires-Dist: tqdm; extra == "all"
 Requires-Dist: transformers>=4.33; extra == "all"
-Requires-Dist: transformers_stream_generator; extra == "all"
 Requires-Dist: word2number; extra == "all"
 Requires-Dist: ms-opencompass>=0.1.4; extra == "all"
 Requires-Dist: ms-vlmeval>=0.0.9; extra == "all"
+Requires-Dist: langchain<0.4.0,>=0.3.0; extra == "all"
+Requires-Dist: langchain-community<0.4.0,>=0.3.0; extra == "all"
+Requires-Dist: langchain-core<0.4.0,>=0.3.0; extra == "all"
+Requires-Dist: langchain-openai<0.4.0,>=0.3.0; extra == "all"
 Requires-Dist: mteb==1.19.4; extra == "all"
-Requires-Dist: ragas==0.2.9; extra == "all"
+Requires-Dist: ragas==0.2.14; extra == "all"
 Requires-Dist: webdataset>0.2.0; extra == "all"
 Requires-Dist: aiohttp; extra == "all"
 Requires-Dist: fastapi; extra == "all"
@@ -161,7 +121,7 @@ Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "all"
 </p>

 <p align="center">
-<img src="https://img.shields.io/badge/python-%E2%89%A53.8-5be.svg">
+<img src="https://img.shields.io/badge/python-%E2%89%A53.9-5be.svg">
 <a href="https://badge.fury.io/py/evalscope"><img src="https://badge.fury.io/py/evalscope.svg" alt="PyPI version" height="18"></a>
 <a href="https://pypi.org/project/evalscope"><img alt="PyPI - Downloads" src="https://static.pepy.tech/badge/evalscope"></a>
 <a href="https://github.com/modelscope/evalscope/pulls"><img src="https://img.shields.io/badge/PR-welcome-55EB99.svg"></a>
@@ -239,6 +199,9 @@ Please scan the QR code below to join our community groups:

 ## 🎉 News

+- 🔥 **[2025.04.10]** Model service stress testing tool now supports the `/v1/completions` endpoint (the default endpoint for vLLM benchmarking)
+- 🔥 **[2025.04.08]** Support for evaluating embedding model services compatible with the OpenAI API has been added. For more details, check the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html#configure-evaluation-parameters).
+- 🔥 **[2025.03.27]** Added support for [AlpacaEval](https://www.modelscope.cn/datasets/AI-ModelScope/alpaca_eval/dataPeview) and [ArenaHard](https://modelscope.cn/datasets/AI-ModelScope/arena-hard-auto-v0.1/summary) evaluation benchmarks. For usage notes, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html)
 - 🔥 **[2025.03.20]** The model inference service stress testing now supports generating prompts of specified length using random values. Refer to the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#using-the-random-dataset) for more details.
 - 🔥 **[2025.03.13]** Added support for the [LiveCodeBench](https://www.modelscope.cn/datasets/AI-ModelScope/code_generation_lite/summary) code evaluation benchmark, which can be used by specifying `live_code_bench`. Supports evaluating QwQ-32B on LiveCodeBench, refer to the [best practices](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html).
 - 🔥 **[2025.03.11]** Added support for the [SimpleQA](https://modelscope.cn/datasets/AI-ModelScope/SimpleQA/summary) and [Chinese SimpleQA](https://modelscope.cn/datasets/AI-ModelScope/Chinese-SimpleQA/summary) evaluation benchmarks. These are used to assess the factual accuracy of models, and you can specify `simple_qa` and `chinese_simpleqa` for use. Support for specifying a judge model is also available. For more details, refer to the [relevant parameter documentation](https://evalscope.readthedocs.io/en/latest/get_started/parameters.html).
@@ -251,15 +214,14 @@ Please scan the QR code below to join our community groups:
 - 🔥 **[2025.02.13]** Added support for evaluating DeepSeek distilled models, including AIME24, MATH-500, and GPQA-Diamond datasets,refer to [best practice](https://evalscope.readthedocs.io/en/latest/best_practice/deepseek_r1_distill.html); Added support for specifying the `eval_batch_size` parameter to accelerate model evaluation.
 - 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
 - 🔥 **[2025.01.07]** Native backend: Support for model API evaluation is now available. Refer to the [📖 Model API Evaluation Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#api) for more details. Additionally, support for the `ifeval` evaluation benchmark has been added.
+<details><summary>More</summary>
+
 - 🔥🔥 **[2024.12.31]** Support for adding benchmark evaluations, refer to the [📖 Benchmark Evaluation Addition Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html); support for custom mixed dataset evaluations, allowing for more comprehensive model evaluations with less data, refer to the [📖 Mixed Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html).
 - 🔥 **[2024.12.13]** Model evaluation optimization: no need to pass the `--template-type` parameter anymore; supports starting evaluation with `evalscope eval --args`. Refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html) for more details.
 - 🔥 **[2024.11.26]** The model inference service performance evaluator has been completely refactored: it now supports local inference service startup and Speed Benchmark; asynchronous call error handling has been optimized. For more details, refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html).
 - 🔥 **[2024.10.31]** The best practice for evaluating Multimodal-RAG has been updated, please check the [📖 Blog](https://evalscope.readthedocs.io/zh-cn/latest/blog/RAG/multimodal_RAG.html#multimodal-rag) for more details.
 - 🔥 **[2024.10.23]** Supports multimodal RAG evaluation, including the assessment of image-text retrieval using [CLIP_Benchmark](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/clip_benchmark.html), and extends [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html) to support end-to-end multimodal metrics evaluation.
 - 🔥 **[2024.10.8]** Support for RAG evaluation, including independent evaluation of embedding models and rerankers using [MTEB/CMTEB](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html), as well as end-to-end evaluation using [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html).
-
-<details><summary>More</summary>
-
 - 🔥 **[2024.09.18]** Our documentation has been updated to include a blog module, featuring some technical research and discussions related to evaluations. We invite you to [📖 read it](https://evalscope.readthedocs.io/en/refact_readme/blog/index.html).
 - 🔥 **[2024.09.12]** Support for LongWriter evaluation, which supports 10,000+ word generation. You can use the benchmark [LongBench-Write](evalscope/third_party/longbench_write/README.md) to measure the long output quality as well as the output length.
 - 🔥 **[2024.08.30]** Support for custom dataset evaluations, including text datasets and multimodal image-text datasets.
@@ -542,6 +504,10 @@ Reference: Performance Testing [📖 User Guide](https://evalscope.readthedocs.i

 ![wandb sample](https://modelscope.oss-cn-beijing.aliyuncs.com/resource/wandb_sample.png)

+**Supports swanlab for recording results**
+
+![swanlab sample](https://sail-moe.oss-cn-hangzhou.aliyuncs.com/yunlin/images/evalscope/swanlab.png)
+
 **Supports Speed Benchmark**

 It supports speed testing and provides speed benchmarks similar to those found in the [official Qwen](https://qwen.readthedocs.io/en/latest/benchmark/speed_benchmark.html) reports:

README.md

@@ -10,7 +10,7 @@
 </p>

 <p align="center">
-<img src="https://img.shields.io/badge/python-%E2%89%A53.8-5be.svg">
+<img src="https://img.shields.io/badge/python-%E2%89%A53.9-5be.svg">
 <a href="https://badge.fury.io/py/evalscope"><img src="https://badge.fury.io/py/evalscope.svg" alt="PyPI version" height="18"></a>
 <a href="https://pypi.org/project/evalscope"><img alt="PyPI - Downloads" src="https://static.pepy.tech/badge/evalscope"></a>
 <a href="https://github.com/modelscope/evalscope/pulls"><img src="https://img.shields.io/badge/PR-welcome-55EB99.svg"></a>
@@ -88,6 +88,9 @@ Please scan the QR code below to join our community groups:

 ## 🎉 News

+- 🔥 **[2025.04.10]** Model service stress testing tool now supports the `/v1/completions` endpoint (the default endpoint for vLLM benchmarking)
+- 🔥 **[2025.04.08]** Support for evaluating embedding model services compatible with the OpenAI API has been added. For more details, check the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html#configure-evaluation-parameters).
+- 🔥 **[2025.03.27]** Added support for [AlpacaEval](https://www.modelscope.cn/datasets/AI-ModelScope/alpaca_eval/dataPeview) and [ArenaHard](https://modelscope.cn/datasets/AI-ModelScope/arena-hard-auto-v0.1/summary) evaluation benchmarks. For usage notes, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html)
 - 🔥 **[2025.03.20]** The model inference service stress testing now supports generating prompts of specified length using random values. Refer to the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#using-the-random-dataset) for more details.
 - 🔥 **[2025.03.13]** Added support for the [LiveCodeBench](https://www.modelscope.cn/datasets/AI-ModelScope/code_generation_lite/summary) code evaluation benchmark, which can be used by specifying `live_code_bench`. Supports evaluating QwQ-32B on LiveCodeBench, refer to the [best practices](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html).
 - 🔥 **[2025.03.11]** Added support for the [SimpleQA](https://modelscope.cn/datasets/AI-ModelScope/SimpleQA/summary) and [Chinese SimpleQA](https://modelscope.cn/datasets/AI-ModelScope/Chinese-SimpleQA/summary) evaluation benchmarks. These are used to assess the factual accuracy of models, and you can specify `simple_qa` and `chinese_simpleqa` for use. Support for specifying a judge model is also available. For more details, refer to the [relevant parameter documentation](https://evalscope.readthedocs.io/en/latest/get_started/parameters.html).
@@ -100,15 +103,14 @@ Please scan the QR code below to join our community groups:
 - 🔥 **[2025.02.13]** Added support for evaluating DeepSeek distilled models, including AIME24, MATH-500, and GPQA-Diamond datasets,refer to [best practice](https://evalscope.readthedocs.io/en/latest/best_practice/deepseek_r1_distill.html); Added support for specifying the `eval_batch_size` parameter to accelerate model evaluation.
 - 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
 - 🔥 **[2025.01.07]** Native backend: Support for model API evaluation is now available. Refer to the [📖 Model API Evaluation Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#api) for more details. Additionally, support for the `ifeval` evaluation benchmark has been added.
+<details><summary>More</summary>
+
 - 🔥🔥 **[2024.12.31]** Support for adding benchmark evaluations, refer to the [📖 Benchmark Evaluation Addition Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html); support for custom mixed dataset evaluations, allowing for more comprehensive model evaluations with less data, refer to the [📖 Mixed Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html).
 - 🔥 **[2024.12.13]** Model evaluation optimization: no need to pass the `--template-type` parameter anymore; supports starting evaluation with `evalscope eval --args`. Refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html) for more details.
 - 🔥 **[2024.11.26]** The model inference service performance evaluator has been completely refactored: it now supports local inference service startup and Speed Benchmark; asynchronous call error handling has been optimized. For more details, refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html).
 - 🔥 **[2024.10.31]** The best practice for evaluating Multimodal-RAG has been updated, please check the [📖 Blog](https://evalscope.readthedocs.io/zh-cn/latest/blog/RAG/multimodal_RAG.html#multimodal-rag) for more details.
 - 🔥 **[2024.10.23]** Supports multimodal RAG evaluation, including the assessment of image-text retrieval using [CLIP_Benchmark](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/clip_benchmark.html), and extends [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html) to support end-to-end multimodal metrics evaluation.
 - 🔥 **[2024.10.8]** Support for RAG evaluation, including independent evaluation of embedding models and rerankers using [MTEB/CMTEB](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html), as well as end-to-end evaluation using [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html).
-
-<details><summary>More</summary>
-
 - 🔥 **[2024.09.18]** Our documentation has been updated to include a blog module, featuring some technical research and discussions related to evaluations. We invite you to [📖 read it](https://evalscope.readthedocs.io/en/refact_readme/blog/index.html).
 - 🔥 **[2024.09.12]** Support for LongWriter evaluation, which supports 10,000+ word generation. You can use the benchmark [LongBench-Write](evalscope/third_party/longbench_write/README.md) to measure the long output quality as well as the output length.
 - 🔥 **[2024.08.30]** Support for custom dataset evaluations, including text datasets and multimodal image-text datasets.
@@ -391,6 +393,10 @@ Reference: Performance Testing [📖 User Guide](https://evalscope.readthedocs.i

 ![wandb sample](https://modelscope.oss-cn-beijing.aliyuncs.com/resource/wandb_sample.png)

+**Supports swanlab for recording results**
+
+![swanlab sample](https://sail-moe.oss-cn-hangzhou.aliyuncs.com/yunlin/images/evalscope/swanlab.png)
+
 **Supports Speed Benchmark**

 It supports speed testing and provides speed benchmarks similar to those found in the [official Qwen](https://qwen.readthedocs.io/en/latest/benchmark/speed_benchmark.html) reports:

evalscope/arguments.py

@@ -77,7 +77,7 @@ def add_argument(parser: argparse.ArgumentParser):
     # LLMJudge arguments
     parser.add_argument('--judge-strategy', type=str, default=JudgeStrategy.AUTO, help='The judge strategy.')
     parser.add_argument('--judge-model-args', type=json.loads, default='{}', help='The judge model args, should be a json string.') # noqa: E501
-    parser.add_argument('--judge-worker-num', type=int, default=8, help='The number of workers for the judge model.')
+    parser.add_argument('--judge-worker-num', type=int, default=1, help='The number of workers for the judge model.')
     # yapf: enable


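
The hunk above drops the default judge concurrency from 8 workers to 1. A minimal sketch of opting back into parallel judging from Python, assuming the CLI flag maps onto a `judge_worker_num` field of the `TaskConfig`/`run_task` entry points; the model id and dataset below are placeholders:

```python
# Hedged sketch: field names mirror the CLI flags shown above and are assumed,
# not verified against the 0.14.0 TaskConfig definition.
from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='qwen2.5-7b-instruct',  # placeholder model identifier
    datasets=['simple_qa'],       # a judge-based benchmark shipped with evalscope
    judge_worker_num=8,           # new default is 1; raise it explicitly for parallel judging
)
# run_task(task_cfg)              # uncomment to actually launch the evaluation
```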

evalscope/backend/rag_eval/__init__.py

@@ -1,4 +1,4 @@
-from evalscope.backend.rag_eval.backend_manager import RAGEvalBackendManager
+from evalscope.backend.rag_eval.backend_manager import RAGEvalBackendManager, Tools
 from evalscope.backend.rag_eval.utils.clip import VisionModel
 from evalscope.backend.rag_eval.utils.embedding import EmbeddingModel
 from evalscope.backend.rag_eval.utils.llm import LLM, ChatOpenAI, LocalLLM
@@ -8,6 +8,12 @@ from evalscope.utils.logger import get_logger
  logger = get_logger()
 
 
+ class Tools:
+ MTEB = 'mteb'
+ RAGAS = 'ragas'
+ CLIP_BENCHMARK = 'clip_benchmark'
+
+
  class RAGEvalBackendManager(BackendManager):
 
  def __init__(self, config: Union[str, dict], **kwargs):
@@ -47,9 +53,19 @@ class RAGEvalBackendManager(BackendManager):
  from evalscope.backend.rag_eval.ragas.tasks import generate_testset
 
  if testset_args is not None:
- generate_testset(TestsetGenerationArguments(**testset_args))
+ if isinstance(testset_args, dict):
+ generate_testset(TestsetGenerationArguments(**testset_args))
+ elif isinstance(testset_args, TestsetGenerationArguments):
+ generate_testset(testset_args)
+ else:
+ raise ValueError('Please provide the testset generation arguments.')
  if eval_args is not None:
- rag_eval(EvaluationArguments(**eval_args))
+ if isinstance(eval_args, dict):
+ rag_eval(EvaluationArguments(**eval_args))
+ elif isinstance(eval_args, EvaluationArguments):
+ rag_eval(eval_args)
+ else:
+ raise ValueError('Please provide the evaluation arguments.')
 
  @staticmethod
  def run_clip_benchmark(args):
@@ -59,17 +75,17 @@ class RAGEvalBackendManager(BackendManager):
 
  def run(self, *args, **kwargs):
  tool = self.config_d.pop('tool')
- if tool.lower() == 'mteb':
+ if tool.lower() == Tools.MTEB:
  self._check_env('mteb')
  model_args = self.config_d['model']
  eval_args = self.config_d['eval']
  self.run_mteb(model_args, eval_args)
- elif tool.lower() == 'ragas':
+ elif tool.lower() == Tools.RAGAS:
  self._check_env('ragas')
  testset_args = self.config_d.get('testset_generation', None)
  eval_args = self.config_d.get('eval', None)
  self.run_ragas(testset_args, eval_args)
- elif tool.lower() == 'clip_benchmark':
+ elif tool.lower() == Tools.CLIP_BENCHMARK:
  self._check_env('webdataset')
  self.run_clip_benchmark(self.config_d['eval'])
  else:
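For orientation, a minimal sketch of a config that exercises this dispatcher via the new `Tools` constants; the manager is instantiated directly here for illustration, and the nested `model`/`eval` payloads are placeholders rather than a confirmed schema (only the top-level `tool`/`model`/`eval` keys appear in the code above):

```python
# Hedged sketch: drive the RAG-eval backend through the new Tools constants.
from evalscope.backend.rag_eval.backend_manager import RAGEvalBackendManager, Tools

config = {
    'tool': Tools.MTEB,  # or Tools.RAGAS / Tools.CLIP_BENCHMARK
    'model': [{'model_name_or_path': 'BAAI/bge-small-zh-v1.5'}],  # placeholder model args
    'eval': {'tasks': ['TNews']},                                 # placeholder eval args
}

RAGEvalBackendManager(config=config).run()
```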
@@ -20,6 +20,12 @@ class ModelArguments:
  encode_kwargs: dict = field(default_factory=lambda: {'show_progress_bar': True, 'batch_size': 32})
  hub: str = 'modelscope' # modelscope or huggingface
 
+ # for API embedding model
+ model_name: Optional[str] = None
+ api_base: Optional[str] = None
+ api_key: Optional[str] = None
+ dimensions: Optional[int] = None
+
  def to_dict(self) -> Dict[str, Any]:
  return {
  'model_name_or_path': self.model_name_or_path,
@@ -31,6 +37,10 @@ class ModelArguments:
  'config_kwargs': self.config_kwargs,
  'encode_kwargs': self.encode_kwargs,
  'hub': self.hub,
+ 'model_name': self.model_name,
+ 'api_base': self.api_base,
+ 'api_key': self.api_key,
+ 'dimensions': self.dimensions,
  }
 
 
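A hedged sketch of the new API-embedding fields filled in; only the field names come from this hunk, while the endpoint and model name are placeholders for any OpenAI-compatible embedding service:

```python
# Hedged sketch: ModelArguments-style settings for an OpenAI-compatible embedding endpoint.
api_embedding_args = {
    'model_name': 'text-embedding-v3',       # served embedding model (placeholder)
    'api_base': 'http://127.0.0.1:8000/v1',  # OpenAI-compatible base URL (placeholder)
    'api_key': 'EMPTY',
    'dimensions': 1024,                      # optional output dimensionality
}
```

With `model_name` set, the embedding factory shown further down in this diff routes to the new `APIEmbeddingModel` instead of loading a local SentenceTransformer.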
@@ -21,7 +21,6 @@ class TestsetGenerationArguments:
  """
  generator_llm: Dict = field(default_factory=dict)
  embeddings: Dict = field(default_factory=dict)
- distribution: str = field(default_factory=lambda: {'simple': 0.5, 'multi_context': 0.4, 'reasoning': 0.1})
  # For LLM based evaluation
  # available: ['english', 'hindi', 'marathi', 'chinese', 'spanish', 'amharic', 'arabic',
  # 'armenian', 'bulgarian', 'urdu', 'russian', 'polish', 'persian', 'dutch', 'danish',
@@ -67,9 +67,14 @@ def get_persona(llm, kg, language):
 
 
  def load_data(file_path):
- from langchain_community.document_loaders import UnstructuredFileLoader
+ import nltk
+ from langchain_unstructured import UnstructuredLoader
 
- loader = UnstructuredFileLoader(file_path, mode='single')
+ if nltk.data.find('taggers/averaged_perceptron_tagger_eng') is False:
+ # need to download nltk data for the first time
+ nltk.download('averaged_perceptron_tagger_eng')
+
+ loader = UnstructuredLoader(file_path)
  data = loader.load()
  return data
 
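One caveat about the first-run guard above: `nltk.data.find` raises `LookupError` when a resource is missing rather than returning `False`. A more defensive variant of the same idea, shown here only as a hedged sketch and not as the shipped code, wraps the lookup in try/except:

```python
# Hedged sketch of the same first-run download guard; nltk.data.find()
# raises LookupError for a missing resource instead of returning False.
import nltk

def ensure_nltk_tagger(resource: str = 'taggers/averaged_perceptron_tagger_eng') -> None:
    try:
        nltk.data.find(resource)
    except LookupError:
        # download the tagger data on first use
        nltk.download('averaged_perceptron_tagger_eng')
```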
@@ -2,7 +2,6 @@ import asyncio
  import os
  from ragas.llms import BaseRagasLLM
  from ragas.prompt import PromptMixin, PydanticPrompt
- from ragas.utils import RAGAS_SUPPORTED_LANGUAGE_CODES
  from typing import List
 
  from evalscope.utils.logger import get_logger
@@ -16,10 +15,6 @@ async def translate_prompt(
  llm: BaseRagasLLM,
  adapt_instruction: bool = False,
  ):
- if target_lang not in RAGAS_SUPPORTED_LANGUAGE_CODES:
- logger.warning(f'{target_lang} is not in supported language: {list(RAGAS_SUPPORTED_LANGUAGE_CODES)}')
- return
-
  if not issubclass(type(prompt_user), PromptMixin):
  logger.info(f"{prompt_user} is not a PromptMixin, don't translate it")
  return
@@ -1,10 +1,12 @@
  import os
  import torch
  from langchain_core.embeddings import Embeddings
+ from langchain_openai.embeddings import OpenAIEmbeddings
  from sentence_transformers import models
  from sentence_transformers.cross_encoder import CrossEncoder
  from sentence_transformers.SentenceTransformer import SentenceTransformer
  from torch import Tensor
+ from tqdm import tqdm
  from typing import Dict, List, Optional, Union
 
  from evalscope.backend.rag_eval.utils.tools import download_model
@@ -18,10 +20,10 @@ class BaseModel(Embeddings):
 
  def __init__(
  self,
- model_name_or_path: str,
+ model_name_or_path: str = '',
  max_seq_length: int = 512,
  prompt: str = '',
- revision: Optional[str] = None,
+ revision: Optional[str] = 'master',
  **kwargs,
  ):
  self.model_name_or_path = model_name_or_path
@@ -139,7 +141,7 @@ class CrossEncoderModel(BaseModel):
  max_length=self.max_seq_length,
  )
 
- def predict(self, sentences: List[List[str]], **kwargs) -> List[List[float]]:
+ def predict(self, sentences: List[List[str]], **kwargs) -> Tensor:
  self.encode_kwargs.update(kwargs)
 
  if len(sentences[0]) == 3: # Note: For mteb retrieval task
@@ -154,6 +156,46 @@ class CrossEncoderModel(BaseModel):
  return embeddings
 
 
+ class APIEmbeddingModel(BaseModel):
+
+ def __init__(self, **kwargs):
+ self.model_name = kwargs.get('model_name')
+ self.openai_api_base = kwargs.get('api_base')
+ self.openai_api_key = kwargs.get('api_key')
+ self.dimensions = kwargs.get('dimensions')
+
+ self.model = OpenAIEmbeddings(
+ model=self.model_name,
+ openai_api_base=self.openai_api_base,
+ openai_api_key=self.openai_api_key,
+ dimensions=self.dimensions,
+ check_embedding_ctx_length=False)
+
+ super().__init__(model_name_or_path=self.model_name, **kwargs)
+
+ self.batch_size = self.encode_kwargs.get('batch_size', 10)
+
+ def encode(self, texts: Union[str, List[str]], **kwargs) -> Tensor:
+ if isinstance(texts, str):
+ texts = [texts]
+
+ embeddings: List[List[float]] = []
+ for i in tqdm(range(0, len(texts), self.batch_size)):
+ response = self.model.embed_documents(texts[i:i + self.batch_size], chunk_size=self.batch_size)
+ embeddings.extend(response)
+ return torch.tensor(embeddings)
+
+ def encode_queries(self, queries, **kwargs):
+ return self.encode(queries, **kwargs)
+
+ def encode_corpus(self, corpus, **kwargs):
+ if isinstance(corpus[0], dict):
+ input_texts = ['{} {}'.format(doc.get('title', ''), doc['text']).strip() for doc in corpus]
+ else:
+ input_texts = corpus
+ return self.encode(input_texts, **kwargs)
+
+
  class EmbeddingModel:
  """Custom embeddings"""
 
@@ -165,6 +207,10 @@ class EmbeddingModel:
  revision: Optional[str] = 'master',
  **kwargs,
  ):
+ if kwargs.get('model_name'):
+ # If model_name is provided, use OpenAIEmbeddings
+ return APIEmbeddingModel(**kwargs)
+
  # If model path does not exist and hub is 'modelscope', download the model
  if not os.path.exists(model_name_or_path) and hub == HubType.MODELSCOPE:
  model_name_or_path = download_model(model_name_or_path, revision)
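A rough usage sketch of the API path added above; the endpoint, key, and model name are placeholders for any OpenAI-compatible embedding service, and the import path simply mirrors the file this hunk modifies:

```python
# Hedged sketch: exercise the APIEmbeddingModel added above with placeholder settings.
from evalscope.backend.rag_eval.utils.embedding import APIEmbeddingModel

embedder = APIEmbeddingModel(
    model_name='text-embedding-v3',
    api_base='http://127.0.0.1:8000/v1',
    api_key='EMPTY',
    dimensions=1024,
)
vectors = embedder.encode(['What is RAG evaluation?', 'EvalScope supports MTEB/CMTEB.'])
print(vectors.shape)  # expected (2, 1024) with the placeholder dimensions above
```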
@@ -2,11 +2,11 @@ import os
  from langchain_core.callbacks.manager import CallbackManagerForLLMRun
  from langchain_core.language_models.llms import LLM as BaseLLM
  from langchain_openai import ChatOpenAI
- from modelscope.utils.hf_util import GenerationConfig
+ from transformers.generation.configuration_utils import GenerationConfig
  from typing import Any, Dict, Iterator, List, Mapping, Optional
 
  from evalscope.constants import DEFAULT_MODEL_REVISION
- from evalscope.models import ChatGenerationModelAdapter
+ from evalscope.models import ChatGenerationModelAdapter, LocalModel
 
 
  class LLM:
@@ -16,9 +16,9 @@ class LLM:
  api_base = kw.get('api_base', None)
  if api_base:
  return ChatOpenAI(
- model_name=kw.get('model_name', ''),
- openai_api_base=api_base,
- openai_api_key=kw.get('api_key', 'EMPTY'),
+ model=kw.get('model_name', ''),
+ base_url=api_base,
+ api_key=kw.get('api_key', 'EMPTY'),
  )
  else:
  return LocalLLM(**kw)
@@ -38,8 +38,7 @@ class LocalLLM(BaseLLM):
  super().__init__(**kw)
  self.model_name = os.path.basename(self.model_name_or_path)
  self.model = ChatGenerationModelAdapter(
- model_id=self.model_name_or_path,
- model_revision=self.model_revision,
+ model=LocalModel(model_id=self.model_name_or_path, model_revision=self.model_revision),
  generation_config=GenerationConfig(**self.generation_config) if self.generation_config else None,
  )
 
@@ -53,8 +52,8 @@ class LocalLLM(BaseLLM):
  """Run the LLM on the given input."""
  infer_cfg = {'stop': stop}
 
- response = self.model._model_generate(prompt, infer_cfg)
- return response
+ response, _ = self.model._model_generate([prompt], infer_cfg=infer_cfg)
+ return response[0][0]
 
  @property
  def _identifying_params(self) -> Dict[str, Any]:
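A hedged sketch of driving the factory above from config: the method name `load` is an assumption (it is not visible in this hunk) and the endpoint values are placeholders. With `api_base` set, the factory returns a langchain `ChatOpenAI` client built with the renamed `model`/`base_url`/`api_key` parameters; otherwise it falls back to `LocalLLM`:

```python
# Hedged sketch: the factory method name `load` is assumed; endpoint values are placeholders.
from evalscope.backend.rag_eval.utils.llm import LLM

judge_llm = LLM.load(
    model_name='qwen2.5-72b-instruct',    # served model name (placeholder)
    api_base='http://127.0.0.1:8801/v1',  # OpenAI-compatible endpoint -> ChatOpenAI branch
    api_key='EMPTY',
)
print(judge_llm.invoke('Say hello in one word.'))
```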
@@ -1,4 +1,5 @@
  import copy
+ import os
  import subprocess
  from functools import partial
  from typing import Optional, Union
@@ -66,8 +67,9 @@ class VLMEvalKitBackendManager(BackendManager):
  del remain_cfg['name'] # remove not used args
  del remain_cfg['type'] # remove not used args
 
- self.valid_models.update({model_type: partial(model_class, model=model_type, **remain_cfg)})
- new_model_names.append(model_type)
+ norm_model_type = os.path.basename(model_type).replace(':', '-').replace('.', '_')
+ self.valid_models.update({norm_model_type: partial(model_class, model=model_type, **remain_cfg)})
+ new_model_names.append(norm_model_type)
  else:
  remain_cfg = copy.deepcopy(model_cfg)
  del remain_cfg['name'] # remove not used args