evalscope 0.5.0__tar.gz → 0.5.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {evalscope-0.5.0 → evalscope-0.5.2}/PKG-INFO +130 -26
- {evalscope-0.5.0 → evalscope-0.5.2}/README.md +35 -19
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/backend/opencompass/backend_manager.py +1 -3
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/backend/opencompass/tasks/eval_api.py +1 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/backend/vlm_eval_kit/backend_manager.py +3 -5
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/utils/task_utils.py +1 -1
- evalscope-0.5.2/evalscope/version.py +4 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope.egg-info/PKG-INFO +130 -26
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope.egg-info/entry_points.txt +0 -1
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope.egg-info/requires.txt +4 -5
- evalscope-0.5.0/evalscope/version.py +0 -4
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/__init__.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/backend/__init__.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/backend/base.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/backend/opencompass/__init__.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/backend/opencompass/api_meta_template.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/backend/opencompass/tasks/__init__.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/backend/opencompass/tasks/eval_datasets.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/backend/vlm_eval_kit/__init__.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/__init__.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/arc/__init__.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/arc/ai2_arc.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/arc/arc_adapter.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/bbh/__init__.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/bbh/bbh_adapter.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/bbh/cot_prompts/navigate.txt +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/bbh/cot_prompts/snarks.txt +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/benchmark.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/ceval/__init__.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/ceval/ceval_adapter.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/ceval/ceval_exam.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/cmmlu/__init__.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/cmmlu/cmmlu.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/cmmlu/cmmlu_adapter.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/competition_math/__init__.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/competition_math/competition_math.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/data_adapter.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/general_qa/__init__.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/general_qa/general_qa_adapter.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/gsm8k/__init__.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/gsm8k/gsm8k.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/gsm8k/gsm8k_adapter.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/hellaswag/__init__.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/hellaswag/hellaswag.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/hellaswag/hellaswag_adapter.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/humaneval/__init__.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/humaneval/humaneval.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/mmlu/__init__.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/mmlu/mmlu.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/mmlu/mmlu_adapter.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/race/__init__.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/race/race.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/race/race_adapter.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/trivia_qa/__init__.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/truthful_qa/__init__.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/cache.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/cli/__init__.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/cli/base.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/cli/cli.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/cli/start_perf.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/cli/start_server.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/config.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/constants.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/evaluator/__init__.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/evaluator/evaluator.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/evaluator/rating_eval.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/evaluator/reviewer/__init__.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/evaluator/reviewer/auto_reviewer.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/metrics/__init__.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/metrics/bundled_rouge_score/__init__.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/metrics/bundled_rouge_score/rouge_scorer.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/metrics/code_metric.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/metrics/math_accuracy.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/metrics/metrics.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/metrics/rouge_metric.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/models/__init__.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/models/custom/__init__.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/models/custom/custom_model.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/models/dummy_chat_model.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/models/model.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/models/model_adapter.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/models/openai_model.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/models/template.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/perf/__init__.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/perf/_logging.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/perf/api_plugin_base.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/perf/custom_api.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/perf/dashscope_api.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/perf/dataset_plugin_base.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/perf/datasets/__init__.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/perf/datasets/line_by_line.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/perf/datasets/longalpaca_12k.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/perf/datasets/openqa.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/perf/how_to_analysis_result.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/perf/http_client.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/perf/openai_api.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/perf/plugin_registry.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/perf/query_parameters.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/perf/server_sent_event.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/preprocess/__init__.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/preprocess/tokenizers/__init__.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/preprocess/tokenizers/gpt2_tokenizer.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/registry/__init__.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/registry/tasks/arc.yaml +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/registry/tasks/bbh.yaml +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/registry/tasks/bbh_mini.yaml +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/registry/tasks/ceval.yaml +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/registry/tasks/ceval_mini.yaml +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/registry/tasks/cmmlu.yaml +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/registry/tasks/general_qa.yaml +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/registry/tasks/gsm8k.yaml +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/registry/tasks/mmlu.yaml +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/registry/tasks/mmlu_mini.yaml +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/run.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/run_arena.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/run_ms.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/summarizer.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/third_party/__init__.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/third_party/toolbench_static/__init__.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/third_party/toolbench_static/eval.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/third_party/toolbench_static/infer.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/third_party/toolbench_static/llm/__init__.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/third_party/toolbench_static/toolbench_static.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/tools/__init__.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/tools/combine_reports.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/tools/gen_mmlu_subject_mapping.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/tools/rewrite_eval_results.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/utils/__init__.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/utils/arena_utils.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/utils/completion_parsers.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/utils/logger.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/utils/task_cfg_parser.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope/utils/utils.py +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope.egg-info/SOURCES.txt +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope.egg-info/dependency_links.txt +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope.egg-info/not-zip-safe +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/evalscope.egg-info/top_level.txt +0 -0
- {evalscope-0.5.0 → evalscope-0.5.2}/setup.cfg +0 -0
{evalscope-0.5.0 → evalscope-0.5.2}/PKG-INFO

@@ -1,13 +1,11 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.5.0
-Summary: 
-Home-page: https://github.com/modelscope/
+Version: 0.5.2
+Summary: EvalScope: Lightweight LLMs Evaluation Framework
+Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team
 Author-email: contact@modelscope.cn
-License: UNKNOWN
 Keywords: python,llm,evaluation
-Platform: UNKNOWN
 Classifier: Development Status :: 4 - Beta
 Classifier: License :: OSI Approved :: Apache Software License
 Classifier: Operating System :: OS Independent
@@ -17,17 +15,109 @@ Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
+Requires-Dist: torch
+Requires-Dist: absl-py
+Requires-Dist: accelerate
+Requires-Dist: cachetools
+Requires-Dist: editdistance
+Requires-Dist: jsonlines
+Requires-Dist: matplotlib
+Requires-Dist: modelscope[framework]
+Requires-Dist: nltk
+Requires-Dist: openai
+Requires-Dist: pandas
+Requires-Dist: plotly
+Requires-Dist: pyarrow
+Requires-Dist: pympler
+Requires-Dist: pyyaml
+Requires-Dist: regex
+Requires-Dist: requests
+Requires-Dist: requests-toolbelt
+Requires-Dist: rouge-score
+Requires-Dist: sacrebleu
+Requires-Dist: scikit-learn
+Requires-Dist: seaborn
+Requires-Dist: sentencepiece
+Requires-Dist: simple-ddl-parser
+Requires-Dist: tabulate
+Requires-Dist: tiktoken
+Requires-Dist: tqdm
+Requires-Dist: transformers<4.43,>=4.33
+Requires-Dist: transformers_stream_generator
+Requires-Dist: jieba
+Requires-Dist: rouge-chinese
 Provides-Extra: opencompass
+Requires-Dist: ms-opencompass>=0.0.5; extra == "opencompass"
 Provides-Extra: vlmeval
+Requires-Dist: ms-vlmeval>=0.0.5; extra == "vlmeval"
 Provides-Extra: inner
+Requires-Dist: absl-py; extra == "inner"
+Requires-Dist: accelerate; extra == "inner"
+Requires-Dist: alibaba_itag_sdk; extra == "inner"
+Requires-Dist: dashscope; extra == "inner"
+Requires-Dist: editdistance; extra == "inner"
+Requires-Dist: jsonlines; extra == "inner"
+Requires-Dist: jsonlines; extra == "inner"
+Requires-Dist: nltk; extra == "inner"
+Requires-Dist: openai; extra == "inner"
+Requires-Dist: pandas==1.5.3; extra == "inner"
+Requires-Dist: plotly; extra == "inner"
+Requires-Dist: pyarrow; extra == "inner"
+Requires-Dist: pyodps; extra == "inner"
+Requires-Dist: pyyaml; extra == "inner"
+Requires-Dist: regex; extra == "inner"
+Requires-Dist: requests==2.28.1; extra == "inner"
+Requires-Dist: requests-toolbelt==0.10.1; extra == "inner"
+Requires-Dist: rouge-score; extra == "inner"
+Requires-Dist: sacrebleu; extra == "inner"
+Requires-Dist: scikit-learn; extra == "inner"
+Requires-Dist: seaborn; extra == "inner"
+Requires-Dist: simple-ddl-parser; extra == "inner"
+Requires-Dist: streamlit; extra == "inner"
+Requires-Dist: tqdm; extra == "inner"
+Requires-Dist: transformers<4.43,>=4.33; extra == "inner"
+Requires-Dist: transformers_stream_generator; extra == "inner"
 Provides-Extra: all
+Requires-Dist: torch; extra == "all"
+Requires-Dist: absl-py; extra == "all"
+Requires-Dist: accelerate; extra == "all"
+Requires-Dist: cachetools; extra == "all"
+Requires-Dist: editdistance; extra == "all"
+Requires-Dist: jsonlines; extra == "all"
+Requires-Dist: matplotlib; extra == "all"
+Requires-Dist: modelscope[framework]; extra == "all"
+Requires-Dist: nltk; extra == "all"
+Requires-Dist: openai; extra == "all"
+Requires-Dist: pandas; extra == "all"
+Requires-Dist: plotly; extra == "all"
+Requires-Dist: pyarrow; extra == "all"
+Requires-Dist: pympler; extra == "all"
+Requires-Dist: pyyaml; extra == "all"
+Requires-Dist: regex; extra == "all"
+Requires-Dist: requests; extra == "all"
+Requires-Dist: requests-toolbelt; extra == "all"
+Requires-Dist: rouge-score; extra == "all"
+Requires-Dist: sacrebleu; extra == "all"
+Requires-Dist: scikit-learn; extra == "all"
+Requires-Dist: seaborn; extra == "all"
+Requires-Dist: sentencepiece; extra == "all"
+Requires-Dist: simple-ddl-parser; extra == "all"
+Requires-Dist: tabulate; extra == "all"
+Requires-Dist: tiktoken; extra == "all"
+Requires-Dist: tqdm; extra == "all"
+Requires-Dist: transformers<4.43,>=4.33; extra == "all"
+Requires-Dist: transformers_stream_generator; extra == "all"
+Requires-Dist: jieba; extra == "all"
+Requires-Dist: rouge-chinese; extra == "all"
+Requires-Dist: ms-opencompass>=0.0.5; extra == "all"
+Requires-Dist: ms-vlmeval>=0.0.5; extra == "all"
 
 English | [简体中文](README_zh.md)
 
 <p align="center">
     <a href="https://pypi.org/project/evalscope"><img alt="PyPI - Downloads" src="https://img.shields.io/pypi/dm/evalscope">
     </a>
-    <a href="https://github.com/modelscope/
+    <a href="https://github.com/modelscope/evalscope/pulls"><img src="https://img.shields.io/badge/PR-welcome-55EB99.svg"></a>
 <p>
 
 ## 📖 Table of Content
@@ -42,7 +132,7 @@ English | [简体中文](README_zh.md)
 
 ## 📝 Introduction
 
-Large Language Model (LLMs) evaluation has become a critical process for assessing and improving LLMs. To better support the evaluation of large models, we propose the 
+Large Language Model (LLMs) evaluation has become a critical process for assessing and improving LLMs. To better support the evaluation of large models, we propose the EvalScope framework, which includes the following components and features:
 
 - Pre-configured common benchmark datasets, including: MMLU, CMMLU, C-Eval, GSM8K, ARC, HellaSwag, TruthfulQA, MATH, HumanEval, etc.
 - Implementation of common evaluation metrics
@@ -55,7 +145,7 @@ Large Language Model (LLMs) evaluation has become a critical process for assessi
 - Visualization tools
 - Model Inference Performance Evaluation [Tutorial](evalscope/perf/README.md)
 - Support for OpenCompass as an Evaluation Backend, featuring advanced encapsulation and task simplification to easily submit tasks to OpenCompass for evaluation.
-- Supports VLMEvalKit as the evaluation backend. It initiates VLMEvalKit's multimodal evaluation tasks through 
+- Supports VLMEvalKit as the evaluation backend. It initiates VLMEvalKit's multimodal evaluation tasks through EvalScope, supporting various multimodal models and datasets.
 - Full pipeline support: Seamlessly integrate with SWIFT to easily train and deploy model services, initiate evaluation tasks, view evaluation reports, and achieve an end-to-end large model development process.
 
 
@@ -76,33 +166,48 @@ Large Language Model (LLMs) evaluation has become a critical process for assessi
 - **[2024.07.31]** Breaking change: The sdk name has been changed from `llmuses` to `evalscope`, please update the sdk name in your code.
 - **[2024.07.26]** Supports **VLMEvalKit** as a third-party evaluation framework, initiating multimodal model evaluation tasks. [User Guide](#vlmevalkit-evaluation-backend) 🔥🔥🔥
 - **[2024.06.29]** Supports **OpenCompass** as a third-party evaluation framework. We have provided a high-level wrapper, supporting installation via pip and simplifying the evaluation task configuration. [User Guide](#opencompass-evaluation-backend) 🔥🔥🔥
-- **[2024.06.13]** 
+- **[2024.06.13]** EvalScope has been updated to version 0.3.x, which supports the ModelScope SWIFT framework for LLMs evaluation. 🚀🚀🚀
 - **[2024.06.13]** We have supported the ToolBench as a third-party evaluation backend for Agents evaluation. 🚀🚀🚀
 
 
 
 ## 🛠️ Installation
 ### Install with pip
-1. create conda environment
+1. create conda environment [Optional]
 ```shell
-conda create -n 
-conda activate 
+conda create -n evalscope python=3.10
+conda activate evalscope
 ```
 
-2. Install 
+2. Install EvalScope
 ```shell
-pip install evalscope
+pip install evalscope                # Installation with Native backend (by default)
+
+pip install evalscope[opencompass]   # Installation with OpenCompass backend
+pip install evalscope[vlmeval]       # Installation with VLMEvalKit backend
+pip install evalscope[all]           # Installation with all backends (Native, OpenCompass, VLMEvalKit)
 ```
 
+DEPRECATION WARNING: For 0.4.3 or older versions, please use the following command to install:
+```shell
+pip install llmuses<=0.4.3
+
+# Usage:
+from llmuses.run import run_task
+...
+
+```
+
+
 ### Install from source code
 1. Download source code
 ```shell
-git clone https://github.com/modelscope/
+git clone https://github.com/modelscope/evalscope.git
 ```
 
 2. Install dependencies
 ```shell
-cd 
+cd evalscope/
 pip install -e .
 ```
 
@@ -146,15 +251,15 @@ print(TemplateType.get_template_name_list())
 ```
 
 ### Evaluation Backend
-
-- **Native**: 
-- [OpenCompass](https://github.com/open-compass/opencompass): Initiate OpenCompass evaluation tasks through 
-- [VLMEvalKit](https://github.com/open-compass/VLMEvalKit): Initiate VLMEvalKit multimodal evaluation tasks through 
-- **ThirdParty**: The third-party task, e.g. [ToolBench](evalscope/thirdparty/toolbench/README.md), you can contribute your own evaluation task to 
+EvalScope supports using third-party evaluation frameworks to initiate evaluation tasks, which we call Evaluation Backend. Currently supported Evaluation Backend includes:
+- **Native**: EvalScope's own **default evaluation framework**, supporting various evaluation modes including single model evaluation, arena mode, and baseline model comparison mode.
+- [OpenCompass](https://github.com/open-compass/opencompass): Initiate OpenCompass evaluation tasks through EvalScope. Lightweight, easy to customize, supports seamless integration with the LLM fine-tuning framework [ModelScope Swift](https://github.com/modelscope/swift).
+- [VLMEvalKit](https://github.com/open-compass/VLMEvalKit): Initiate VLMEvalKit multimodal evaluation tasks through EvalScope. Supports various multimodal models and datasets, and offers seamless integration with the LLM fine-tuning framework [ModelScope Swift](https://github.com/modelscope/swift).
+- **ThirdParty**: The third-party task, e.g. [ToolBench](evalscope/thirdparty/toolbench/README.md), you can contribute your own evaluation task to EvalScope as third-party backend.
 
 #### OpenCompass Eval-Backend
 
-To facilitate the use of the OpenCompass evaluation backend, we have customized the OpenCompass source code and named it `ms-opencompass`. This version includes optimizations for evaluation task configuration and execution based on the original version, and it supports installation via PyPI. This allows users to initiate lightweight OpenCompass evaluation tasks through 
+To facilitate the use of the OpenCompass evaluation backend, we have customized the OpenCompass source code and named it `ms-opencompass`. This version includes optimizations for evaluation task configuration and execution based on the original version, and it supports installation via PyPI. This allows users to initiate lightweight OpenCompass evaluation tasks through EvalScope. Additionally, we have initially opened up API-based evaluation tasks in the OpenAI API format. You can deploy model services using [ModelScope Swift](https://github.com/modelscope/swift), where [swift deploy](https://swift.readthedocs.io/en/latest/LLM/VLLM-inference-acceleration-and-deployment.html) supports using vLLM to launch model inference services.
 
 
 ##### Installation
@@ -210,7 +315,7 @@ python examples/example_eval_swift_openai_api.py
 
 #### VLMEvalKit Evaluation Backend
 
-To facilitate the use of the VLMEvalKit evaluation backend, we have customized the VLMEvalKit source code and named it `ms-vlmeval`. This version encapsulates the configuration and execution of evaluation tasks based on the original version and supports installation via PyPI, allowing users to initiate lightweight VLMEvalKit evaluation tasks through 
+To facilitate the use of the VLMEvalKit evaluation backend, we have customized the VLMEvalKit source code and named it `ms-vlmeval`. This version encapsulates the configuration and execution of evaluation tasks based on the original version and supports installation via PyPI, allowing users to initiate lightweight VLMEvalKit evaluation tasks through EvalScope. Additionally, we support API-based evaluation tasks in the OpenAI API format. You can deploy multimodal model services using ModelScope [swift](https://github.com/modelscope/swift).
 
 ##### Installation
 ```shell
@@ -228,7 +333,8 @@ For detailed information about the datasets, please refer to [VLMEvalKit Support
 You can use the following to view the list of dataset names:
 ```python
 from evalscope.backend.vlm_eval_kit import VLMEvalKitBackendManager
-print(f'** All models from VLMEvalKit backend: {VLMEvalKitBackendManager.
+print(f'** All models from VLMEvalKit backend: {VLMEvalKitBackendManager.list_supported_models().keys()}')
+
 ```
 If the dataset file does not exist locally when loading the dataset, it will be automatically downloaded to the `~/LMUData/` directory.
 
@@ -471,5 +577,3 @@ The LLM Leaderboard aims to provide an objective and comprehensive evaluation st
 - [ ] Auto-reviewer
 - [ ] Qwen-max
 
-
-
{evalscope-0.5.0 → evalscope-0.5.2}/README.md

@@ -3,7 +3,7 @@ English | [简体中文](README_zh.md)
 <p align="center">
     <a href="https://pypi.org/project/evalscope"><img alt="PyPI - Downloads" src="https://img.shields.io/pypi/dm/evalscope">
     </a>
-    <a href="https://github.com/modelscope/
+    <a href="https://github.com/modelscope/evalscope/pulls"><img src="https://img.shields.io/badge/PR-welcome-55EB99.svg"></a>
 <p>
 
 ## 📖 Table of Content
@@ -18,7 +18,7 @@ English | [简体中文](README_zh.md)
 
 ## 📝 Introduction
 
-Large Language Model (LLMs) evaluation has become a critical process for assessing and improving LLMs. To better support the evaluation of large models, we propose the 
+Large Language Model (LLMs) evaluation has become a critical process for assessing and improving LLMs. To better support the evaluation of large models, we propose the EvalScope framework, which includes the following components and features:
 
 - Pre-configured common benchmark datasets, including: MMLU, CMMLU, C-Eval, GSM8K, ARC, HellaSwag, TruthfulQA, MATH, HumanEval, etc.
 - Implementation of common evaluation metrics
@@ -31,7 +31,7 @@ Large Language Model (LLMs) evaluation has become a critical process for assessi
 - Visualization tools
 - Model Inference Performance Evaluation [Tutorial](evalscope/perf/README.md)
 - Support for OpenCompass as an Evaluation Backend, featuring advanced encapsulation and task simplification to easily submit tasks to OpenCompass for evaluation.
-- Supports VLMEvalKit as the evaluation backend. It initiates VLMEvalKit's multimodal evaluation tasks through 
+- Supports VLMEvalKit as the evaluation backend. It initiates VLMEvalKit's multimodal evaluation tasks through EvalScope, supporting various multimodal models and datasets.
 - Full pipeline support: Seamlessly integrate with SWIFT to easily train and deploy model services, initiate evaluation tasks, view evaluation reports, and achieve an end-to-end large model development process.
 
 
@@ -52,33 +52,48 @@ Large Language Model (LLMs) evaluation has become a critical process for assessi
 - **[2024.07.31]** Breaking change: The sdk name has been changed from `llmuses` to `evalscope`, please update the sdk name in your code.
 - **[2024.07.26]** Supports **VLMEvalKit** as a third-party evaluation framework, initiating multimodal model evaluation tasks. [User Guide](#vlmevalkit-evaluation-backend) 🔥🔥🔥
 - **[2024.06.29]** Supports **OpenCompass** as a third-party evaluation framework. We have provided a high-level wrapper, supporting installation via pip and simplifying the evaluation task configuration. [User Guide](#opencompass-evaluation-backend) 🔥🔥🔥
-- **[2024.06.13]** 
+- **[2024.06.13]** EvalScope has been updated to version 0.3.x, which supports the ModelScope SWIFT framework for LLMs evaluation. 🚀🚀🚀
 - **[2024.06.13]** We have supported the ToolBench as a third-party evaluation backend for Agents evaluation. 🚀🚀🚀
 
 
 
 ## 🛠️ Installation
 ### Install with pip
-1. create conda environment
+1. create conda environment [Optional]
 ```shell
-conda create -n 
-conda activate 
+conda create -n evalscope python=3.10
+conda activate evalscope
 ```
 
-2. Install 
+2. Install EvalScope
 ```shell
-pip install evalscope
+pip install evalscope                # Installation with Native backend (by default)
+
+pip install evalscope[opencompass]   # Installation with OpenCompass backend
+pip install evalscope[vlmeval]       # Installation with VLMEvalKit backend
+pip install evalscope[all]           # Installation with all backends (Native, OpenCompass, VLMEvalKit)
 ```
 
+DEPRECATION WARNING: For 0.4.3 or older versions, please use the following command to install:
+```shell
+pip install llmuses<=0.4.3
+
+# Usage:
+from llmuses.run import run_task
+...
+
+```
+
+
 ### Install from source code
 1. Download source code
 ```shell
-git clone https://github.com/modelscope/
+git clone https://github.com/modelscope/evalscope.git
 ```
 
 2. Install dependencies
 ```shell
-cd 
+cd evalscope/
 pip install -e .
 ```
 
@@ -122,15 +137,15 @@ print(TemplateType.get_template_name_list())
 ```
 
 ### Evaluation Backend
-
-- **Native**: 
-- [OpenCompass](https://github.com/open-compass/opencompass): Initiate OpenCompass evaluation tasks through 
-- [VLMEvalKit](https://github.com/open-compass/VLMEvalKit): Initiate VLMEvalKit multimodal evaluation tasks through 
-- **ThirdParty**: The third-party task, e.g. [ToolBench](evalscope/thirdparty/toolbench/README.md), you can contribute your own evaluation task to 
+EvalScope supports using third-party evaluation frameworks to initiate evaluation tasks, which we call Evaluation Backend. Currently supported Evaluation Backend includes:
+- **Native**: EvalScope's own **default evaluation framework**, supporting various evaluation modes including single model evaluation, arena mode, and baseline model comparison mode.
+- [OpenCompass](https://github.com/open-compass/opencompass): Initiate OpenCompass evaluation tasks through EvalScope. Lightweight, easy to customize, supports seamless integration with the LLM fine-tuning framework [ModelScope Swift](https://github.com/modelscope/swift).
+- [VLMEvalKit](https://github.com/open-compass/VLMEvalKit): Initiate VLMEvalKit multimodal evaluation tasks through EvalScope. Supports various multimodal models and datasets, and offers seamless integration with the LLM fine-tuning framework [ModelScope Swift](https://github.com/modelscope/swift).
+- **ThirdParty**: The third-party task, e.g. [ToolBench](evalscope/thirdparty/toolbench/README.md), you can contribute your own evaluation task to EvalScope as third-party backend.
 
 #### OpenCompass Eval-Backend
 
-To facilitate the use of the OpenCompass evaluation backend, we have customized the OpenCompass source code and named it `ms-opencompass`. This version includes optimizations for evaluation task configuration and execution based on the original version, and it supports installation via PyPI. This allows users to initiate lightweight OpenCompass evaluation tasks through 
+To facilitate the use of the OpenCompass evaluation backend, we have customized the OpenCompass source code and named it `ms-opencompass`. This version includes optimizations for evaluation task configuration and execution based on the original version, and it supports installation via PyPI. This allows users to initiate lightweight OpenCompass evaluation tasks through EvalScope. Additionally, we have initially opened up API-based evaluation tasks in the OpenAI API format. You can deploy model services using [ModelScope Swift](https://github.com/modelscope/swift), where [swift deploy](https://swift.readthedocs.io/en/latest/LLM/VLLM-inference-acceleration-and-deployment.html) supports using vLLM to launch model inference services.
 
 
 ##### Installation
@@ -186,7 +201,7 @@ python examples/example_eval_swift_openai_api.py
 
 #### VLMEvalKit Evaluation Backend
 
-To facilitate the use of the VLMEvalKit evaluation backend, we have customized the VLMEvalKit source code and named it `ms-vlmeval`. This version encapsulates the configuration and execution of evaluation tasks based on the original version and supports installation via PyPI, allowing users to initiate lightweight VLMEvalKit evaluation tasks through 
+To facilitate the use of the VLMEvalKit evaluation backend, we have customized the VLMEvalKit source code and named it `ms-vlmeval`. This version encapsulates the configuration and execution of evaluation tasks based on the original version and supports installation via PyPI, allowing users to initiate lightweight VLMEvalKit evaluation tasks through EvalScope. Additionally, we support API-based evaluation tasks in the OpenAI API format. You can deploy multimodal model services using ModelScope [swift](https://github.com/modelscope/swift).
 
 ##### Installation
 ```shell
@@ -204,7 +219,8 @@ For detailed information about the datasets, please refer to [VLMEvalKit Support
 You can use the following to view the list of dataset names:
 ```python
 from evalscope.backend.vlm_eval_kit import VLMEvalKitBackendManager
-print(f'** All models from VLMEvalKit backend: {VLMEvalKitBackendManager.
+print(f'** All models from VLMEvalKit backend: {VLMEvalKitBackendManager.list_supported_models().keys()}')
+
 ```
 If the dataset file does not exist locally when loading the dataset, it will be automatically downloaded to the `~/LMUData/` directory.
 
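The README hunks above note the `llmuses` → `evalscope` SDK rename (breaking change of 2024.07.31) and show the deprecated `from llmuses.run import run_task` usage for 0.4.3 and older. Below is a minimal, hedged sketch of the corresponding import update; `task_cfg` is a placeholder for an existing task configuration, and the `task_cfg=` keyword mirrors the old snippet rather than a verified 0.5.2 signature.

```python
# Old (llmuses <= 0.4.3):
# from llmuses.run import run_task

# New (evalscope >= 0.5.x) -- only the package name changes:
from evalscope.run import run_task

task_cfg = {}  # placeholder: reuse your existing task configuration unchanged
run_task(task_cfg=task_cfg)
```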
{evalscope-0.5.0 → evalscope-0.5.2}/evalscope/backend/opencompass/backend_manager.py

@@ -76,9 +76,7 @@ class OpenCompassBackendManager(BackendManager):
     @staticmethod
     def _check_env():
         if is_module_installed('opencompass'):
-            logger.info('
-        else:
-            raise ModuleNotFoundError('Please install the `ms-opencompass` first: `pip install ms-opencompass`')
+            logger.info('Check the OpenCompass environment: OK')
 
     @staticmethod
     def get_restore_arg(arg_name: str, arg_val: bool):
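With the hunk above, `OpenCompassBackendManager._check_env` only logs when `opencompass` is importable and no longer raises `ModuleNotFoundError` when it is missing. A small sketch of how a caller could keep the old fail-fast behaviour; it uses only the standard library so as not to assume evalscope internals, and the error message mirrors the removed line.

```python
import importlib.util

# Fail fast if the OpenCompass backend is requested but `opencompass` (from ms-opencompass) is absent.
if importlib.util.find_spec('opencompass') is None:
    raise ModuleNotFoundError('Please install the `ms-opencompass` first: `pip install ms-opencompass`')
```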
{evalscope-0.5.0 → evalscope-0.5.2}/evalscope/backend/vlm_eval_kit/backend_manager.py

@@ -31,7 +31,7 @@ class VLMEvalKitBackendManager(BackendManager):
         from vlmeval.utils.arguments import Arguments as VLMEvalArguments
         self.args = VLMEvalArguments(**self.config_d)
 
-        self.valid_models = self.
+        self.valid_models = self.list_supported_models()
         self.valid_model_names = list(self.valid_models.keys())
         self.valid_datasets = self.list_supported_datasets()
 
@@ -86,7 +86,7 @@ class VLMEvalKitBackendManager(BackendManager):
         return self.get_cmd()
 
     @staticmethod
-    def 
+    def list_supported_models():
         from vlmeval.config import supported_VLM
         return supported_VLM
 
@@ -98,9 +98,7 @@ class VLMEvalKitBackendManager(BackendManager):
     @staticmethod
     def _check_env():
         if is_module_installed('vlmeval'):
-            logger.info('
-        else:
-            raise ModuleNotFoundError('Please install the `ms-vlmeval` first: `pip install ms-vlmeval`')
+            logger.info('Check VLM Evaluation Kit: Installed')
 
     @staticmethod
     def get_restore_arg(arg_name: str, arg_val: bool):
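The VLMEvalKit hunks rename the model-listing helper to `list_supported_models()`, matching the README snippet in the diffs above. A quick hedged usage check, assuming `ms-vlmeval` is installed so that `vlmeval` is importable:

```python
from evalscope.backend.vlm_eval_kit import VLMEvalKitBackendManager

# Static method per the hunk above; returns vlmeval.config.supported_VLM (a dict of model constructors).
supported_models = VLMEvalKitBackendManager.list_supported_models()
print(f'** All models from VLMEvalKit backend: {list(supported_models.keys())}')
```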