evalscope 0.5.5__tar.gz → 0.5.5rc1__tar.gz
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
Potentially problematic release: this version of evalscope has been flagged as potentially problematic.
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/PKG-INFO +30 -24
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/README.md +25 -14
- evalscope-0.5.5rc1/evalscope/backend/opencompass/__init__.py +3 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/backend/vlm_eval_kit/backend_manager.py +1 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/backend/vlm_eval_kit/custom_dataset.py +1 -1
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/evaluator/evaluator.py +0 -1
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/run.py +0 -4
- evalscope-0.5.5rc1/evalscope/utils/logger.py +64 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/utils/task_utils.py +0 -3
- evalscope-0.5.5rc1/evalscope/version.py +4 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope.egg-info/PKG-INFO +30 -24
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope.egg-info/SOURCES.txt +0 -16
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope.egg-info/requires.txt +4 -10
- evalscope-0.5.5/evalscope/backend/rag_eval/__init__.py +0 -3
- evalscope-0.5.5/evalscope/backend/rag_eval/backend_manager.py +0 -68
- evalscope-0.5.5/evalscope/backend/rag_eval/cmteb/__init__.py +0 -4
- evalscope-0.5.5/evalscope/backend/rag_eval/cmteb/arguments.py +0 -59
- evalscope-0.5.5/evalscope/backend/rag_eval/cmteb/base.py +0 -89
- evalscope-0.5.5/evalscope/backend/rag_eval/cmteb/task_template.py +0 -83
- evalscope-0.5.5/evalscope/backend/rag_eval/cmteb/tasks/Classification.py +0 -302
- evalscope-0.5.5/evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +0 -252
- evalscope-0.5.5/evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +0 -113
- evalscope-0.5.5/evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +0 -153
- evalscope-0.5.5/evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +0 -345
- evalscope-0.5.5/evalscope/backend/rag_eval/cmteb/tasks/STS.py +0 -302
- evalscope-0.5.5/evalscope/backend/rag_eval/cmteb/tasks/__init__.py +0 -64
- evalscope-0.5.5/evalscope/backend/rag_eval/ragas/__init__.py +0 -2
- evalscope-0.5.5/evalscope/backend/rag_eval/ragas/arguments.py +0 -37
- evalscope-0.5.5/evalscope/backend/rag_eval/ragas/task_template.py +0 -117
- evalscope-0.5.5/evalscope/preprocess/tokenizers/__init__.py +0 -0
- evalscope-0.5.5/evalscope/utils/logger.py +0 -94
- evalscope-0.5.5/evalscope/version.py +0 -4
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/__init__.py +0 -0
- {evalscope-0.5.5/evalscope/backend/opencompass → evalscope-0.5.5rc1/evalscope/backend}/__init__.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/backend/base.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/backend/opencompass/api_meta_template.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/backend/opencompass/backend_manager.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/backend/opencompass/tasks/__init__.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/backend/opencompass/tasks/eval_api.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/backend/opencompass/tasks/eval_datasets.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/backend/vlm_eval_kit/__init__.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/__init__.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/arc/__init__.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/arc/ai2_arc.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/arc/arc_adapter.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/bbh/__init__.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/bbh/bbh_adapter.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/bbh/cot_prompts/navigate.txt +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/bbh/cot_prompts/snarks.txt +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/benchmark.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/ceval/__init__.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/ceval/ceval_adapter.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/ceval/ceval_exam.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/cmmlu/__init__.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/cmmlu/cmmlu.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/cmmlu/cmmlu_adapter.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/competition_math/__init__.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/competition_math/competition_math.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/data_adapter.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/general_qa/__init__.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/general_qa/general_qa_adapter.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/gsm8k/__init__.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/gsm8k/gsm8k.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/gsm8k/gsm8k_adapter.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/hellaswag/__init__.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/hellaswag/hellaswag.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/hellaswag/hellaswag_adapter.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/humaneval/__init__.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/humaneval/humaneval.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/mmlu/__init__.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/mmlu/mmlu.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/mmlu/mmlu_adapter.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/race/__init__.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/race/race.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/race/race_adapter.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/trivia_qa/__init__.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/truthful_qa/__init__.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/cache.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/cli/__init__.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/cli/base.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/cli/cli.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/cli/start_perf.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/cli/start_server.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/config.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/constants.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/evaluator/__init__.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/evaluator/rating_eval.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/evaluator/reviewer/__init__.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/evaluator/reviewer/auto_reviewer.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/metrics/__init__.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/metrics/bundled_rouge_score/__init__.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/metrics/bundled_rouge_score/rouge_scorer.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/metrics/code_metric.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/metrics/math_accuracy.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/metrics/metrics.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/metrics/rouge_metric.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/models/__init__.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/models/api/__init__.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/models/api/openai_api.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/models/custom/__init__.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/models/custom/custom_model.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/models/dummy_chat_model.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/models/model.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/models/model_adapter.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/models/openai_model.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/models/template.py +0 -0
- {evalscope-0.5.5/evalscope/backend → evalscope-0.5.5rc1/evalscope/perf}/__init__.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/perf/_logging.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/perf/api_plugin_base.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/perf/custom_api.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/perf/dashscope_api.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/perf/dataset_plugin_base.py +0 -0
- {evalscope-0.5.5/evalscope/perf → evalscope-0.5.5rc1/evalscope/perf/datasets}/__init__.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/perf/datasets/line_by_line.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/perf/datasets/longalpaca_12k.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/perf/datasets/openqa.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/perf/how_to_analysis_result.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/perf/http_client.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/perf/openai_api.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/perf/plugin_registry.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/perf/query_parameters.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/perf/server_sent_event.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/preprocess/__init__.py +0 -0
- {evalscope-0.5.5/evalscope/perf/datasets → evalscope-0.5.5rc1/evalscope/preprocess/tokenizers}/__init__.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/preprocess/tokenizers/gpt2_tokenizer.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/registry/__init__.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/registry/tasks/arc.yaml +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/registry/tasks/bbh.yaml +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/registry/tasks/bbh_mini.yaml +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/registry/tasks/ceval.yaml +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/registry/tasks/ceval_mini.yaml +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/registry/tasks/cmmlu.yaml +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/registry/tasks/general_qa.yaml +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/registry/tasks/gsm8k.yaml +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/registry/tasks/mmlu.yaml +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/registry/tasks/mmlu_mini.yaml +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/run_arena.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/run_ms.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/summarizer.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/third_party/__init__.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/third_party/longbench_write/__init__.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/third_party/longbench_write/eval.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/third_party/longbench_write/infer.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/third_party/longbench_write/longbench_write.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/third_party/longbench_write/resources/__init__.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/third_party/longbench_write/resources/judge.txt +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/third_party/longbench_write/resources/longbench_write.jsonl +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/third_party/longbench_write/tools/__init__.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/third_party/longbench_write/tools/data_etl.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/third_party/longbench_write/utils.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/third_party/toolbench_static/__init__.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/third_party/toolbench_static/eval.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/third_party/toolbench_static/infer.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/third_party/toolbench_static/llm/__init__.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/third_party/toolbench_static/toolbench_static.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/tools/__init__.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/tools/combine_reports.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/tools/gen_mmlu_subject_mapping.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/tools/rewrite_eval_results.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/utils/__init__.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/utils/arena_utils.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/utils/completion_parsers.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/utils/task_cfg_parser.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/utils/utils.py +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope.egg-info/dependency_links.txt +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope.egg-info/entry_points.txt +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope.egg-info/not-zip-safe +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope.egg-info/top_level.txt +0 -0
- {evalscope-0.5.5 → evalscope-0.5.5rc1}/setup.cfg +0 -0

{evalscope-0.5.5 → evalscope-0.5.5rc1}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.5.5
+Version: 0.5.5rc1
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team
@@ -24,7 +24,7 @@ Requires-Dist: editdistance
 Requires-Dist: jsonlines
 Requires-Dist: matplotlib
 Requires-Dist: modelscope[framework]
-Requires-Dist: nltk
+Requires-Dist: nltk
 Requires-Dist: openai
 Requires-Dist: pandas
 Requires-Dist: plotly
@@ -34,7 +34,7 @@ Requires-Dist: pyyaml
 Requires-Dist: regex
 Requires-Dist: requests
 Requires-Dist: requests-toolbelt
-Requires-Dist: rouge-score
+Requires-Dist: rouge-score
 Requires-Dist: sacrebleu
 Requires-Dist: scikit-learn
 Requires-Dist: seaborn
@@ -51,9 +51,6 @@ Provides-Extra: opencompass
 Requires-Dist: ms-opencompass>=0.1.1; extra == "opencompass"
 Provides-Extra: vlmeval
 Requires-Dist: ms-vlmeval>=0.0.5; extra == "vlmeval"
-Provides-Extra: rag
-Requires-Dist: ragas; extra == "rag"
-Requires-Dist: mteb>=0.14.16; extra == "rag"
 Provides-Extra: inner
 Requires-Dist: absl-py; extra == "inner"
 Requires-Dist: accelerate; extra == "inner"
@@ -91,7 +88,7 @@ Requires-Dist: editdistance; extra == "all"
 Requires-Dist: jsonlines; extra == "all"
 Requires-Dist: matplotlib; extra == "all"
 Requires-Dist: modelscope[framework]; extra == "all"
-Requires-Dist: nltk
+Requires-Dist: nltk; extra == "all"
 Requires-Dist: openai; extra == "all"
 Requires-Dist: pandas; extra == "all"
 Requires-Dist: plotly; extra == "all"
@@ -101,7 +98,7 @@ Requires-Dist: pyyaml; extra == "all"
 Requires-Dist: regex; extra == "all"
 Requires-Dist: requests; extra == "all"
 Requires-Dist: requests-toolbelt; extra == "all"
-Requires-Dist: rouge-score
+Requires-Dist: rouge-score; extra == "all"
 Requires-Dist: sacrebleu; extra == "all"
 Requires-Dist: scikit-learn; extra == "all"
 Requires-Dist: seaborn; extra == "all"
@@ -116,8 +113,6 @@ Requires-Dist: jieba; extra == "all"
 Requires-Dist: rouge-chinese; extra == "all"
 Requires-Dist: ms-opencompass>=0.1.1; extra == "all"
 Requires-Dist: ms-vlmeval>=0.0.5; extra == "all"
-Requires-Dist: ragas; extra == "all"
-Requires-Dist: mteb>=0.14.16; extra == "all"
 
 English | [简体中文](README_zh.md)
 
@@ -150,11 +145,30 @@ English | [简体中文](README_zh.md)
 
 ## 📝 Introduction
 
-
+Large Model (including Large Language Models, Multi-modal Large Language Models) evaluation has become a critical process for assessing and improving LLMs. To better support the evaluation of large models, we propose the EvalScope framework.
+
+### Framework Features
+- **Benchmark Datasets**: Preloaded with several commonly used test benchmarks, including MMLU, CMMLU, C-Eval, GSM8K, ARC, HellaSwag, TruthfulQA, MATH, HumanEval, etc.
+- **Evaluation Metrics**: Implements various commonly used evaluation metrics.
+- **Model Access**: A unified model access mechanism that is compatible with the Generate and Chat interfaces of multiple model families.
+- **Automated Evaluation**: Includes automatic evaluation of objective questions and complex task evaluation using expert models.
+- **Evaluation Reports**: Automatically generates evaluation reports.
+- **Arena Mode**: Used for comparisons between models and objective evaluation of models, supporting various evaluation modes, including:
+  - **Single mode**: Scoring a single model.
+  - **Pairwise-baseline mode**: Comparing against a baseline model.
+  - **Pairwise (all) mode**: Pairwise comparison among all models.
+- **Visualization Tools**: Provides intuitive displays of evaluation results.
+- **Model Performance Evaluation**: Offers a performance testing tool for model inference services and detailed statistics, see [Model Performance Evaluation Documentation](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test.html).
+- **OpenCompass Integration**: Supports OpenCompass as the evaluation backend, providing advanced encapsulation and task simplification, allowing for easier task submission for evaluation.
+- **VLMEvalKit Integration**: Supports VLMEvalKit as the evaluation backend, facilitating the initiation of multi-modal evaluation tasks, supporting various multi-modal models and datasets.
+- **Full-Link Support**: Through seamless integration with the [ms-swift](https://github.com/modelscope/ms-swift) training framework, provides a one-stop development process for model training, model deployment, model evaluation, and report viewing, enhancing user development efficiency.
+
+
+<details><summary>Overall Architecture</summary>
 
 <p align="center">
 <img src="docs/en/_static/images/evalscope_framework.png" width="70%">
-<br>EvalScope Framework.
+<br>Fig 1. EvalScope Framework.
 </p>
 
 The architecture includes the following modules:
@@ -164,15 +178,14 @@ The architecture includes the following modules:
 - **Native**: EvalScope’s own **default evaluation framework**, supporting various evaluation modes, including single model evaluation, arena mode, baseline model comparison mode, etc.
 - **OpenCompass**: Supports [OpenCompass](https://github.com/open-compass/opencompass) as the evaluation backend, providing advanced encapsulation and task simplification, allowing you to submit tasks for evaluation more easily.
 - **VLMEvalKit**: Supports [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) as the evaluation backend, enabling easy initiation of multi-modal evaluation tasks, supporting various multi-modal models and datasets.
-- **RAGEval**: Supports RAG evaluation, supporting independent evaluation of embedding models and rerankers using [MTEB/CMTEB](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html), as well as end-to-end evaluation using [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html).
 - **ThirdParty**: Other third-party evaluation tasks, such as ToolBench.
 4. **Performance Evaluator**: Model performance evaluation, responsible for measuring model inference service performance, including performance testing, stress testing, performance report generation, and visualization.
 5. **Evaluation Report**: The final generated evaluation report summarizes the model's performance, which can be used for decision-making and further model optimization.
 6. **Visualization**: Visualization results help users intuitively understand evaluation results, facilitating analysis and comparison of different model performances.
+</details>
 
 
 ## 🎉 News
-- 🔥 **[2024.10.8]** Support for RAG evaluation, including independent evaluation of embedding models and rerankers using [MTEB/CMTEB](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html), as well as end-to-end evaluation using [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html).
 - 🔥 **[2024.09.18]** Our documentation has been updated to include a blog module, featuring some technical research and discussions related to evaluations. We invite you to [📖 read it](https://evalscope.readthedocs.io/en/refact_readme/blog/index.html).
 - 🔥 **[2024.09.12]** Support for LongWriter evaluation, which supports 10,000+ word generation. You can use the benchmark [LongBench-Write](evalscope/third_party/longbench_write/README.md) to measure the long output quality as well as the output length.
 - 🔥 **[2024.08.30]** Support for custom dataset evaluations, including text datasets and multimodal image-text datasets.
@@ -343,10 +356,9 @@ run_task(task_cfg=your_task_cfg)
 ## Evaluation Backend
 EvalScope supports using third-party evaluation frameworks to initiate evaluation tasks, which we call Evaluation Backend. Currently supported Evaluation Backend includes:
 - **Native**: EvalScope's own **default evaluation framework**, supporting various evaluation modes including single model evaluation, arena mode, and baseline model comparison mode.
-- [OpenCompass](https://github.com/open-compass/opencompass): Initiate OpenCompass evaluation tasks through EvalScope. Lightweight, easy to customize, supports seamless integration with the LLM fine-tuning framework ms-swift. [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/
-- [VLMEvalKit](https://github.com/open-compass/VLMEvalKit): Initiate VLMEvalKit multimodal evaluation tasks through EvalScope. Supports various multimodal models and datasets, and offers seamless integration with the LLM fine-tuning framework ms-swift. [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/
-- **
-- **ThirdParty**: Third-party evaluation tasks, such as [ToolBench](https://evalscope.readthedocs.io/en/latest/third_party/toolbench.html) and [LongBench-Write](https://evalscope.readthedocs.io/en/latest/third_party/longwriter.html).
+- [OpenCompass](https://github.com/open-compass/opencompass): Initiate OpenCompass evaluation tasks through EvalScope. Lightweight, easy to customize, supports seamless integration with the LLM fine-tuning framework ms-swift. [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/opencompass_backend.html)
+- [VLMEvalKit](https://github.com/open-compass/VLMEvalKit): Initiate VLMEvalKit multimodal evaluation tasks through EvalScope. Supports various multimodal models and datasets, and offers seamless integration with the LLM fine-tuning framework ms-swift. [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/vlmevalkit_backend.html)
+- **ThirdParty**: The third-party task, e.g. [ToolBench](https://evalscope.readthedocs.io/en/latest/third_party/toolbench.html), you can contribute your own evaluation task to EvalScope as third-party backend.
 
 ## Custom Dataset Evaluation
 EvalScope supports custom dataset evaluation. For detailed information, please refer to the Custom Dataset Evaluation [📖User Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset.html)
@@ -375,8 +387,6 @@ Refer to : [Leaderboard](https://modelscope.cn/leaderboard/58/ranking?type=free)
 
 
 ## TO-DO List
-- [x] RAG evaluation
-- [x] VLM evaluation
 - [x] Agents evaluation
 - [x] vLLM
 - [ ] Distributed evaluating
@@ -388,7 +398,3 @@ Refer to : [Leaderboard](https://modelscope.cn/leaderboard/58/ranking?type=free)
 - [ ] Auto-reviewer
 - [ ] Qwen-max
 
-
-## Star History
-
-[](https://star-history.com/#modelscope/evalscope&Date)

{evalscope-0.5.5 → evalscope-0.5.5rc1}/README.md

@@ -29,11 +29,30 @@ English | [简体中文](README_zh.md)
 
 ## 📝 Introduction
 
-
+Large Model (including Large Language Models, Multi-modal Large Language Models) evaluation has become a critical process for assessing and improving LLMs. To better support the evaluation of large models, we propose the EvalScope framework.
+
+### Framework Features
+- **Benchmark Datasets**: Preloaded with several commonly used test benchmarks, including MMLU, CMMLU, C-Eval, GSM8K, ARC, HellaSwag, TruthfulQA, MATH, HumanEval, etc.
+- **Evaluation Metrics**: Implements various commonly used evaluation metrics.
+- **Model Access**: A unified model access mechanism that is compatible with the Generate and Chat interfaces of multiple model families.
+- **Automated Evaluation**: Includes automatic evaluation of objective questions and complex task evaluation using expert models.
+- **Evaluation Reports**: Automatically generates evaluation reports.
+- **Arena Mode**: Used for comparisons between models and objective evaluation of models, supporting various evaluation modes, including:
+  - **Single mode**: Scoring a single model.
+  - **Pairwise-baseline mode**: Comparing against a baseline model.
+  - **Pairwise (all) mode**: Pairwise comparison among all models.
+- **Visualization Tools**: Provides intuitive displays of evaluation results.
+- **Model Performance Evaluation**: Offers a performance testing tool for model inference services and detailed statistics, see [Model Performance Evaluation Documentation](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test.html).
+- **OpenCompass Integration**: Supports OpenCompass as the evaluation backend, providing advanced encapsulation and task simplification, allowing for easier task submission for evaluation.
+- **VLMEvalKit Integration**: Supports VLMEvalKit as the evaluation backend, facilitating the initiation of multi-modal evaluation tasks, supporting various multi-modal models and datasets.
+- **Full-Link Support**: Through seamless integration with the [ms-swift](https://github.com/modelscope/ms-swift) training framework, provides a one-stop development process for model training, model deployment, model evaluation, and report viewing, enhancing user development efficiency.
+
+
+<details><summary>Overall Architecture</summary>
 
 <p align="center">
 <img src="docs/en/_static/images/evalscope_framework.png" width="70%">
-<br>EvalScope Framework.
+<br>Fig 1. EvalScope Framework.
 </p>
 
 The architecture includes the following modules:
@@ -43,15 +62,14 @@ The architecture includes the following modules:
 - **Native**: EvalScope’s own **default evaluation framework**, supporting various evaluation modes, including single model evaluation, arena mode, baseline model comparison mode, etc.
 - **OpenCompass**: Supports [OpenCompass](https://github.com/open-compass/opencompass) as the evaluation backend, providing advanced encapsulation and task simplification, allowing you to submit tasks for evaluation more easily.
 - **VLMEvalKit**: Supports [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) as the evaluation backend, enabling easy initiation of multi-modal evaluation tasks, supporting various multi-modal models and datasets.
-- **RAGEval**: Supports RAG evaluation, supporting independent evaluation of embedding models and rerankers using [MTEB/CMTEB](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html), as well as end-to-end evaluation using [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html).
 - **ThirdParty**: Other third-party evaluation tasks, such as ToolBench.
 4. **Performance Evaluator**: Model performance evaluation, responsible for measuring model inference service performance, including performance testing, stress testing, performance report generation, and visualization.
 5. **Evaluation Report**: The final generated evaluation report summarizes the model's performance, which can be used for decision-making and further model optimization.
 6. **Visualization**: Visualization results help users intuitively understand evaluation results, facilitating analysis and comparison of different model performances.
+</details>
 
 
 ## 🎉 News
-- 🔥 **[2024.10.8]** Support for RAG evaluation, including independent evaluation of embedding models and rerankers using [MTEB/CMTEB](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html), as well as end-to-end evaluation using [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html).
 - 🔥 **[2024.09.18]** Our documentation has been updated to include a blog module, featuring some technical research and discussions related to evaluations. We invite you to [📖 read it](https://evalscope.readthedocs.io/en/refact_readme/blog/index.html).
 - 🔥 **[2024.09.12]** Support for LongWriter evaluation, which supports 10,000+ word generation. You can use the benchmark [LongBench-Write](evalscope/third_party/longbench_write/README.md) to measure the long output quality as well as the output length.
 - 🔥 **[2024.08.30]** Support for custom dataset evaluations, including text datasets and multimodal image-text datasets.
@@ -222,10 +240,9 @@ run_task(task_cfg=your_task_cfg)
 ## Evaluation Backend
 EvalScope supports using third-party evaluation frameworks to initiate evaluation tasks, which we call Evaluation Backend. Currently supported Evaluation Backend includes:
 - **Native**: EvalScope's own **default evaluation framework**, supporting various evaluation modes including single model evaluation, arena mode, and baseline model comparison mode.
-- [OpenCompass](https://github.com/open-compass/opencompass): Initiate OpenCompass evaluation tasks through EvalScope. Lightweight, easy to customize, supports seamless integration with the LLM fine-tuning framework ms-swift. [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/
-- [VLMEvalKit](https://github.com/open-compass/VLMEvalKit): Initiate VLMEvalKit multimodal evaluation tasks through EvalScope. Supports various multimodal models and datasets, and offers seamless integration with the LLM fine-tuning framework ms-swift. [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/
-- **
-- **ThirdParty**: Third-party evaluation tasks, such as [ToolBench](https://evalscope.readthedocs.io/en/latest/third_party/toolbench.html) and [LongBench-Write](https://evalscope.readthedocs.io/en/latest/third_party/longwriter.html).
+- [OpenCompass](https://github.com/open-compass/opencompass): Initiate OpenCompass evaluation tasks through EvalScope. Lightweight, easy to customize, supports seamless integration with the LLM fine-tuning framework ms-swift. [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/opencompass_backend.html)
+- [VLMEvalKit](https://github.com/open-compass/VLMEvalKit): Initiate VLMEvalKit multimodal evaluation tasks through EvalScope. Supports various multimodal models and datasets, and offers seamless integration with the LLM fine-tuning framework ms-swift. [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/vlmevalkit_backend.html)
+- **ThirdParty**: The third-party task, e.g. [ToolBench](https://evalscope.readthedocs.io/en/latest/third_party/toolbench.html), you can contribute your own evaluation task to EvalScope as third-party backend.
 
 ## Custom Dataset Evaluation
 EvalScope supports custom dataset evaluation. For detailed information, please refer to the Custom Dataset Evaluation [📖User Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset.html)
@@ -254,8 +271,6 @@ Refer to : [Leaderboard](https://modelscope.cn/leaderboard/58/ranking?type=free)
 
 
 ## TO-DO List
-- [x] RAG evaluation
-- [x] VLM evaluation
 - [x] Agents evaluation
 - [x] vLLM
 - [ ] Distributed evaluating
@@ -267,7 +282,3 @@ Refer to : [Leaderboard](https://modelscope.cn/leaderboard/58/ranking?type=free)
 - [ ] Auto-reviewer
 - [ ] Qwen-max
 
-
-## Star History
-
-[](https://star-history.com/#modelscope/evalscope&Date)
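
For orientation on the "Evaluation Backend" section changed above, here is a minimal sketch of how such a backend task might be launched through EvalScope's run_task (defined in evalscope/run.py, whose diff appears further below). The config keys (eval_backend, eval_config) and the dataset/model fields are assumptions for illustration only and are not taken from this diff:

    from evalscope.run import run_task

    # Hypothetical task config: dispatch the evaluation to the OpenCompass backend.
    # Key names and field values below are assumed for illustration, not confirmed by this diff.
    task_cfg = {
        'eval_backend': 'OpenCompass',   # or 'VLMEvalKit', 'ThirdParty', ...
        'eval_config': {
            'datasets': ['gsm8k'],                           # assumed dataset identifier
            'models': [{'path': 'qwen/Qwen2-7B-Instruct'}],  # assumed model spec
        },
    }
    run_task(task_cfg=task_cfg)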

{evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/backend/vlm_eval_kit/custom_dataset.py

@@ -8,7 +8,7 @@ class CustomDataset:
 
     def load_data(self, dataset):
        # customize the loading of the dataset
-        data_path = os.path.join(
+        data_path = os.path.join("~/LMUData", f'{dataset}.tsv')
         return load(data_path)
 
 
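
A small note on the path hard-coded above: os.path.join only concatenates the "~/LMUData" prefix with the dataset name, so the tilde is passed through literally unless the downstream loader expands it. A quick illustration with a hypothetical dataset name:

    import os

    dataset = 'my_custom_vqa'  # hypothetical dataset name
    data_path = os.path.join('~/LMUData', f'{dataset}.tsv')
    print(data_path)                      # ~/LMUData/my_custom_vqa.tsv ('~' is not expanded here)
    print(os.path.expanduser(data_path))  # e.g. /home/<user>/LMUData/my_custom_vqa.tsv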

{evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/evaluator/evaluator.py

@@ -174,7 +174,6 @@ class Evaluator(object):
         """
         assert self.data_adapter is not None, 'data_adapter must be provided when calling func get_answers() !'
         assert self.model_adapter is not None, 'model must be provided when calling func get_answers() !'
-        assert len(prompts_list) > 0, 'prompts_list must not be empty when calling func get_answers() !'
 
         answers_list = []
         pred_dir: str = self.outputs_structure.get(OutputsStructure.PREDICTIONS_DIR)

{evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/run.py

@@ -207,10 +207,6 @@ def run_task(task_cfg: Union[str, dict, TaskConfig, List[TaskConfig]]) -> Union[
         from evalscope.backend.vlm_eval_kit import VLMEvalKitBackendManager
         vlm_eval_kit_backend_manager = VLMEvalKitBackendManager(config=eval_config)
         vlm_eval_kit_backend_manager.run()
-    elif eval_backend == EvalBackend.RAG_EVAL.value:
-        from evalscope.backend.rag_eval import RAGEvalBackendManager
-        rag_eval_backend_manager = RAGEvalBackendManager(config=eval_config)
-        rag_eval_backend_manager.run()
     # TODO: Add other evaluation backends
     elif eval_backend == EvalBackend.THIRD_PARTY.value:
         raise NotImplementedError(f'Not implemented for evaluation backend {eval_backend}')

evalscope-0.5.5rc1/evalscope/utils/logger.py

@@ -0,0 +1,64 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import logging
+from typing import Optional
+
+init_loggers = {}
+
+formatter = logging.Formatter(
+    '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+
+def get_logger(log_file: Optional[str] = None,
+               log_level: int = logging.INFO,
+               file_mode: str = 'w'):
+    """ Get logging logger
+
+    Args:
+        log_file: Log filename, if specified, file handler will be added to
+            logger
+        log_level: Logging level.
+        file_mode: Specifies the mode to open the file, if filename is
+            specified (if filemode is unspecified, it defaults to 'w').
+    """
+
+    logger_name = __name__.split('.')[0]
+    logger = logging.getLogger(logger_name)
+
+    if logger_name in init_loggers:
+        add_file_handler_if_needed(logger, log_file, file_mode, log_level)
+        return logger
+
+    for handler in logger.root.handlers:
+        if type(handler) is logging.StreamHandler:
+            handler.setLevel(logging.ERROR)
+
+    stream_handler = logging.StreamHandler()
+    handlers = [stream_handler]
+
+    if log_file is not None:
+        file_handler = logging.FileHandler(log_file, file_mode)
+        handlers.append(file_handler)
+
+    for handler in handlers:
+        handler.setFormatter(formatter)
+        handler.setLevel(log_level)
+        logger.addHandler(handler)
+
+    logger.setLevel(log_level)
+
+    init_loggers[logger_name] = True
+
+    return logger
+
+
+def add_file_handler_if_needed(logger, log_file, file_mode, log_level):
+    for handler in logger.handlers:
+        if isinstance(handler, logging.FileHandler):
+            return
+
+    if log_file is not None:
+        file_handler = logging.FileHandler(log_file, file_mode)
+        file_handler.setFormatter(formatter)
+        file_handler.setLevel(log_level)
+        logger.addHandler(file_handler)
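
The new logger module above keeps a single package-level logger (keyed by the top-level package name) and only attaches a file handler on later calls if one is missing. A minimal usage sketch, assuming the module is importable as evalscope.utils.logger (the log-file path is illustrative):

    import logging
    from evalscope.utils.logger import get_logger

    # First call configures the package logger: a stream handler plus an optional file handler.
    logger = get_logger(log_file='/tmp/evalscope.log', log_level=logging.INFO)
    logger.info('evaluation started')

    # Later calls return the same logger object; a file handler is added only if none exists yet.
    assert get_logger() is logger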

{evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/utils/task_utils.py

@@ -11,9 +11,6 @@ class EvalBackend(Enum):
 
     # Use VLM Eval Kit as the multi-modal model evaluation backend
     VLM_EVAL_KIT = 'VLMEvalKit'
-
-    # Use RAGEval as the RAG evaluation backend
-    RAG_EVAL = 'RAGEval'
 
     # Use third-party evaluation backend/modules
     THIRD_PARTY = 'ThirdParty'
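
After this removal, the backend enum in evalscope/utils/task_utils.py looks roughly as follows; the members outside the hunk (such as native and OpenCompass entries) are assumed from context and are not shown in this diff:

    from enum import Enum

    class EvalBackend(Enum):
        # Assumed members, not visible in the hunk above:
        NATIVE = 'Native'
        OPEN_COMPASS = 'OpenCompass'

        # Use VLM Eval Kit as the multi-modal model evaluation backend
        VLM_EVAL_KIT = 'VLMEvalKit'

        # Use third-party evaluation backend/modules
        THIRD_PARTY = 'ThirdParty'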

{evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope.egg-info/PKG-INFO

The changes to this file are identical to the PKG-INFO diff shown above (same hunks: version bump to 0.5.5rc1, removal of the rag extra and its ragas/mteb requirements, and the same README text changes).

{evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope.egg-info/SOURCES.txt

@@ -23,22 +23,6 @@ evalscope/backend/opencompass/backend_manager.py
 evalscope/backend/opencompass/tasks/__init__.py
 evalscope/backend/opencompass/tasks/eval_api.py
 evalscope/backend/opencompass/tasks/eval_datasets.py
-evalscope/backend/rag_eval/__init__.py
-evalscope/backend/rag_eval/backend_manager.py
-evalscope/backend/rag_eval/cmteb/__init__.py
-evalscope/backend/rag_eval/cmteb/arguments.py
-evalscope/backend/rag_eval/cmteb/base.py
-evalscope/backend/rag_eval/cmteb/task_template.py
-evalscope/backend/rag_eval/cmteb/tasks/Classification.py
-evalscope/backend/rag_eval/cmteb/tasks/Clustering.py
-evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py
-evalscope/backend/rag_eval/cmteb/tasks/Reranking.py
-evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py
-evalscope/backend/rag_eval/cmteb/tasks/STS.py
-evalscope/backend/rag_eval/cmteb/tasks/__init__.py
-evalscope/backend/rag_eval/ragas/__init__.py
-evalscope/backend/rag_eval/ragas/arguments.py
-evalscope/backend/rag_eval/ragas/task_template.py
 evalscope/backend/vlm_eval_kit/__init__.py
 evalscope/backend/vlm_eval_kit/backend_manager.py
 evalscope/backend/vlm_eval_kit/custom_dataset.py

{evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope.egg-info/requires.txt

@@ -7,7 +7,7 @@ editdistance
 jsonlines
 matplotlib
 modelscope[framework]
-nltk
+nltk
 openai
 pandas
 plotly
@@ -17,7 +17,7 @@ pyyaml
 regex
 requests
 requests-toolbelt
-rouge-score
+rouge-score
 sacrebleu
 scikit-learn
 seaborn
@@ -41,7 +41,7 @@ editdistance
 jsonlines
 matplotlib
 modelscope[framework]
-nltk
+nltk
 openai
 pandas
 plotly
@@ -51,7 +51,7 @@ pyyaml
 regex
 requests
 requests-toolbelt
-rouge-score
+rouge-score
 sacrebleu
 scikit-learn
 seaborn
@@ -66,8 +66,6 @@ jieba
 rouge-chinese
 ms-opencompass>=0.1.1
 ms-vlmeval>=0.0.5
-ragas
-mteb>=0.14.16
 
 [inner]
 absl-py
@@ -99,9 +97,5 @@ transformers_stream_generator
 [opencompass]
 ms-opencompass>=0.1.1
 
-[rag]
-ragas
-mteb>=0.14.16
-
 [vlmeval]
 ms-vlmeval>=0.0.5