evalscope 0.5.3__tar.gz → 0.5.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- {evalscope-0.5.3 → evalscope-0.5.4}/PKG-INFO +24 -32
- {evalscope-0.5.3 → evalscope-0.5.4}/README.md +19 -29
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/backend/opencompass/backend_manager.py +2 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/backend/vlm_eval_kit/backend_manager.py +1 -1
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/benchmark.py +1 -1
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/evaluator/evaluator.py +3 -3
- evalscope-0.5.4/evalscope/models/api/__init__.py +3 -0
- evalscope-0.5.4/evalscope/models/api/openai_api.py +228 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/perf/http_client.py +5 -5
- evalscope-0.5.4/evalscope/third_party/longbench_write/__init__.py +3 -0
- evalscope-0.5.4/evalscope/third_party/longbench_write/eval.py +284 -0
- evalscope-0.5.4/evalscope/third_party/longbench_write/infer.py +217 -0
- evalscope-0.5.4/evalscope/third_party/longbench_write/longbench_write.py +88 -0
- evalscope-0.5.4/evalscope/third_party/longbench_write/resources/judge.txt +31 -0
- evalscope-0.5.4/evalscope/third_party/longbench_write/resources/longbench_write.jsonl +120 -0
- evalscope-0.5.4/evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl +60 -0
- evalscope-0.5.4/evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl +48 -0
- evalscope-0.5.4/evalscope/third_party/longbench_write/tools/data_etl.py +155 -0
- evalscope-0.5.4/evalscope/third_party/longbench_write/utils.py +37 -0
- evalscope-0.5.4/evalscope/third_party/toolbench_static/llm/__init__.py +1 -0
- evalscope-0.5.4/evalscope/tools/__init__.py +1 -0
- evalscope-0.5.4/evalscope/version.py +4 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope.egg-info/PKG-INFO +24 -32
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope.egg-info/SOURCES.txt +14 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope.egg-info/requires.txt +4 -2
- evalscope-0.5.3/evalscope/version.py +0 -4
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/__init__.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/backend/__init__.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/backend/base.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/backend/opencompass/__init__.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/backend/opencompass/api_meta_template.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/backend/opencompass/tasks/__init__.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/backend/opencompass/tasks/eval_api.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/backend/opencompass/tasks/eval_datasets.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/backend/vlm_eval_kit/__init__.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/__init__.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/arc/__init__.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/arc/ai2_arc.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/arc/arc_adapter.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/bbh/__init__.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/bbh/bbh_adapter.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/bbh/cot_prompts/navigate.txt +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/bbh/cot_prompts/snarks.txt +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/ceval/__init__.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/ceval/ceval_adapter.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/ceval/ceval_exam.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/cmmlu/__init__.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/cmmlu/cmmlu.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/cmmlu/cmmlu_adapter.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/competition_math/__init__.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/competition_math/competition_math.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/data_adapter.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/general_qa/__init__.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/general_qa/general_qa_adapter.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/gsm8k/__init__.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/gsm8k/gsm8k.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/gsm8k/gsm8k_adapter.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/hellaswag/__init__.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/hellaswag/hellaswag.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/hellaswag/hellaswag_adapter.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/humaneval/__init__.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/humaneval/humaneval.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/mmlu/__init__.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/mmlu/mmlu.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/mmlu/mmlu_adapter.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/race/__init__.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/race/race.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/race/race_adapter.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/trivia_qa/__init__.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/truthful_qa/__init__.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/cache.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/cli/__init__.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/cli/base.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/cli/cli.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/cli/start_perf.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/cli/start_server.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/config.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/constants.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/evaluator/__init__.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/evaluator/rating_eval.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/evaluator/reviewer/__init__.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/evaluator/reviewer/auto_reviewer.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/metrics/__init__.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/metrics/bundled_rouge_score/__init__.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/metrics/bundled_rouge_score/rouge_scorer.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/metrics/code_metric.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/metrics/math_accuracy.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/metrics/metrics.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/metrics/rouge_metric.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/models/__init__.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/models/custom/__init__.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/models/custom/custom_model.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/models/dummy_chat_model.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/models/model.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/models/model_adapter.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/models/openai_model.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/models/template.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/perf/__init__.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/perf/_logging.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/perf/api_plugin_base.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/perf/custom_api.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/perf/dashscope_api.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/perf/dataset_plugin_base.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/perf/datasets/__init__.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/perf/datasets/line_by_line.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/perf/datasets/longalpaca_12k.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/perf/datasets/openqa.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/perf/how_to_analysis_result.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/perf/openai_api.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/perf/plugin_registry.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/perf/query_parameters.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/perf/server_sent_event.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/preprocess/__init__.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/preprocess/tokenizers/__init__.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/preprocess/tokenizers/gpt2_tokenizer.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/registry/__init__.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/registry/tasks/arc.yaml +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/registry/tasks/bbh.yaml +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/registry/tasks/bbh_mini.yaml +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/registry/tasks/ceval.yaml +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/registry/tasks/ceval_mini.yaml +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/registry/tasks/cmmlu.yaml +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/registry/tasks/general_qa.yaml +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/registry/tasks/gsm8k.yaml +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/registry/tasks/mmlu.yaml +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/registry/tasks/mmlu_mini.yaml +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/run.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/run_arena.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/run_ms.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/summarizer.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/third_party/__init__.py +0 -0
- {evalscope-0.5.3/evalscope/third_party/toolbench_static/llm → evalscope-0.5.4/evalscope/third_party/longbench_write/resources}/__init__.py +0 -0
- {evalscope-0.5.3/evalscope → evalscope-0.5.4/evalscope/third_party/longbench_write}/tools/__init__.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/third_party/toolbench_static/__init__.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/third_party/toolbench_static/eval.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/third_party/toolbench_static/infer.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/third_party/toolbench_static/toolbench_static.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/tools/combine_reports.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/tools/gen_mmlu_subject_mapping.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/tools/rewrite_eval_results.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/utils/__init__.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/utils/arena_utils.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/utils/completion_parsers.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/utils/logger.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/utils/task_cfg_parser.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/utils/task_utils.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope/utils/utils.py +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope.egg-info/dependency_links.txt +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope.egg-info/entry_points.txt +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope.egg-info/not-zip-safe +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/evalscope.egg-info/top_level.txt +0 -0
- {evalscope-0.5.3 → evalscope-0.5.4}/setup.cfg +0 -0
{evalscope-0.5.3 → evalscope-0.5.4}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.5.3
+Version: 0.5.4
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team
@@ -19,6 +19,7 @@ Requires-Dist: torch
 Requires-Dist: absl-py
 Requires-Dist: accelerate
 Requires-Dist: cachetools
+Requires-Dist: datasets<3.0.0,>=2.18.0
 Requires-Dist: editdistance
 Requires-Dist: jsonlines
 Requires-Dist: matplotlib
@@ -42,7 +43,7 @@ Requires-Dist: simple-ddl-parser
 Requires-Dist: tabulate
 Requires-Dist: tiktoken
 Requires-Dist: tqdm
-Requires-Dist: transformers
+Requires-Dist: transformers>=4.33
 Requires-Dist: transformers_stream_generator
 Requires-Dist: jieba
 Requires-Dist: rouge-chinese
@@ -82,6 +83,7 @@ Requires-Dist: torch; extra == "all"
 Requires-Dist: absl-py; extra == "all"
 Requires-Dist: accelerate; extra == "all"
 Requires-Dist: cachetools; extra == "all"
+Requires-Dist: datasets<3.0.0,>=2.18.0; extra == "all"
 Requires-Dist: editdistance; extra == "all"
 Requires-Dist: jsonlines; extra == "all"
 Requires-Dist: matplotlib; extra == "all"
@@ -105,7 +107,7 @@ Requires-Dist: simple-ddl-parser; extra == "all"
 Requires-Dist: tabulate; extra == "all"
 Requires-Dist: tiktoken; extra == "all"
 Requires-Dist: tqdm; extra == "all"
-Requires-Dist: transformers; extra == "all"
+Requires-Dist: transformers>=4.33; extra == "all"
 Requires-Dist: transformers_stream_generator; extra == "all"
 Requires-Dist: jieba; extra == "all"
 Requires-Dist: rouge-chinese; extra == "all"
@@ -161,7 +163,9 @@ Large Model (including Large Language Models, Multi-modal Large Language Models)
 - **VLMEvalKit Integration**: Supports VLMEvalKit as the evaluation backend, facilitating the initiation of multi-modal evaluation tasks, supporting various multi-modal models and datasets.
 - **Full-Link Support**: Through seamless integration with the [ms-swift](https://github.com/modelscope/ms-swift) training framework, provides a one-stop development process for model training, model deployment, model evaluation, and report viewing, enhancing user development efficiency.
 
-
+
+<details><summary>Overall Architecture</summary>
+
 <p align="center">
 <img src="docs/en/_static/images/evalscope_framework.png" width="70%">
 <br>Fig 1. EvalScope Framework.
@@ -178,14 +182,20 @@ The architecture includes the following modules:
 4. **Performance Evaluator**: Model performance evaluation, responsible for measuring model inference service performance, including performance testing, stress testing, performance report generation, and visualization.
 5. **Evaluation Report**: The final generated evaluation report summarizes the model's performance, which can be used for decision-making and further model optimization.
 6. **Visualization**: Visualization results help users intuitively understand evaluation results, facilitating analysis and comparison of different model performances.
+</details>
+
 
 ## 🎉 News
-- **[2024.
-- **[2024.
-- **[2024.
-- **[2024.
-- **[2024.
-- **[2024.
+- 🔥 **[2024.09.18]** Our documentation has been updated to include a blog module, featuring some technical research and discussions related to evaluations. We invite you to [📖 read it](https://evalscope.readthedocs.io/en/refact_readme/blog/index.html).
+- 🔥 **[2024.09.12]** Support for LongWriter evaluation, which supports 10,000+ word generation. You can use the benchmark [LongBench-Write](evalscope/third_party/longbench_write/README.md) to measure the long output quality as well as the output length.
+- 🔥 **[2024.08.30]** Support for custom dataset evaluations, including text datasets and multimodal image-text datasets.
+- 🔥 **[2024.08.20]** Updated the official documentation, including getting started guides, best practices, and FAQs. Feel free to [📖read it here](https://evalscope.readthedocs.io/en/latest/)!
+- 🔥 **[2024.08.09]** Simplified the installation process, allowing for pypi installation of vlmeval dependencies; optimized the multimodal model evaluation experience, achieving up to 10x acceleration based on the OpenAI API evaluation chain.
+- 🔥 **[2024.07.31]** Important change: The package name `llmuses` has been changed to `evalscope`. Please update your code accordingly.
+- 🔥 **[2024.07.26]** Support for **VLMEvalKit** as a third-party evaluation framework to initiate multimodal model evaluation tasks.
+- 🔥 **[2024.06.29]** Support for **OpenCompass** as a third-party evaluation framework, which we have encapsulated at a higher level, supporting pip installation and simplifying evaluation task configuration.
+- 🔥 **[2024.06.13]** EvalScope seamlessly integrates with the fine-tuning framework SWIFT, providing full-chain support from LLM training to evaluation.
+- 🔥 **[2024.06.13]** Integrated the Agent evaluation dataset ToolBench.
 
 
 
@@ -265,8 +275,8 @@ If prompted with `Do you wish to run the custom code? [y/N]`, please type `y`.
 
 #### Basic Parameter Descriptions
 - `--model`: Specifies the `model_id` of the model on [ModelScope](https://modelscope.cn/), allowing automatic download. For example, see the [Qwen2-0.5B-Instruct model link](https://modelscope.cn/models/qwen/Qwen2-0.5B-Instruct/summary); you can also use a local path, such as `/path/to/model`.
-- `--template-type`: Specifies the template type corresponding to the model. Refer to the `Default Template` field in the [template table](https://swift.readthedocs.io/en/latest/
-- `--datasets`: The dataset name, allowing multiple datasets to be specified, separated by spaces; these datasets will be automatically downloaded. Refer to the [supported datasets list](
+- `--template-type`: Specifies the template type corresponding to the model. Refer to the `Default Template` field in the [template table](https://swift.readthedocs.io/en/latest/Instruction/Supported-models-datasets.html#llm) for filling in this field.
+- `--datasets`: The dataset name, allowing multiple datasets to be specified, separated by spaces; these datasets will be automatically downloaded. Refer to the [supported datasets list](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html) for available options.
 
 ### 2. Parameterized Evaluation
 If you wish to conduct a more customized evaluation, such as modifying model parameters or dataset parameters, you can use the following commands:
@@ -276,8 +286,8 @@ If you wish to conduct a more customized evaluation, such as modifying model par
 python evalscope/run.py \
 --model qwen/Qwen2-0.5B-Instruct \
 --template-type qwen \
---model-args revision=
---datasets
+--model-args revision=master,precision=torch.float16,device_map=auto \
+--datasets gsm8k ceval \
 --use-cache true \
 --limit 10
 ```
@@ -342,24 +352,6 @@ from evalscope.run import run_task
 run_task(task_cfg=your_task_cfg)
 ```
 
-### Supported Datasets List
-> [!NOTE]
-> The framework currently supports the following datasets. If the dataset you need is not in the list, please submit an issue, or use the [OpenCompass backend](https://evalscope.readthedocs.io/en/latest/user_guides/opencompass_backend.html) for evaluation, or use the [VLMEvalKit backend](https://evalscope.readthedocs.io/en/latest/user_guides/vlmevalkit_backend.html) for multi-modal model evaluation.
-
-| Dataset Name       | Link                                                                                    | Status | Note |
-|--------------------|----------------------------------------------------------------------------------------|--------|------|
-| `mmlu`             | [mmlu](https://modelscope.cn/datasets/modelscope/mmlu/summary)                          | Active |      |
-| `ceval`            | [ceval](https://modelscope.cn/datasets/modelscope/ceval-exam/summary)                   | Active |      |
-| `gsm8k`            | [gsm8k](https://modelscope.cn/datasets/modelscope/gsm8k/summary)                        | Active |      |
-| `arc`              | [arc](https://modelscope.cn/datasets/modelscope/ai2_arc/summary)                        | Active |      |
-| `hellaswag`        | [hellaswag](https://modelscope.cn/datasets/modelscope/hellaswag/summary)                | Active |      |
-| `truthful_qa`      | [truthful_qa](https://modelscope.cn/datasets/modelscope/truthful_qa/summary)            | Active |      |
-| `competition_math` | [competition_math](https://modelscope.cn/datasets/modelscope/competition_math/summary)  | Active |      |
-| `humaneval`        | [humaneval](https://modelscope.cn/datasets/modelscope/humaneval/summary)                | Active |      |
-| `bbh`              | [bbh](https://modelscope.cn/datasets/modelscope/bbh/summary)                            | Active |      |
-| `race`             | [race](https://modelscope.cn/datasets/modelscope/race/summary)                          | Active |      |
-| `trivia_qa`        | [trivia_qa](https://modelscope.cn/datasets/modelscope/trivia_qa/summary)                | To be integrated | |
-
 
 ## Evaluation Backend
 EvalScope supports using third-party evaluation frameworks to initiate evaluation tasks, which we call Evaluation Backend. Currently supported Evaluation Backend includes:
{evalscope-0.5.3 → evalscope-0.5.4}/README.md

@@ -47,7 +47,9 @@ Large Model (including Large Language Models, Multi-modal Large Language Models)
 - **VLMEvalKit Integration**: Supports VLMEvalKit as the evaluation backend, facilitating the initiation of multi-modal evaluation tasks, supporting various multi-modal models and datasets.
 - **Full-Link Support**: Through seamless integration with the [ms-swift](https://github.com/modelscope/ms-swift) training framework, provides a one-stop development process for model training, model deployment, model evaluation, and report viewing, enhancing user development efficiency.
 
-
+
+<details><summary>Overall Architecture</summary>
+
 <p align="center">
 <img src="docs/en/_static/images/evalscope_framework.png" width="70%">
 <br>Fig 1. EvalScope Framework.
@@ -64,14 +66,20 @@ The architecture includes the following modules:
 4. **Performance Evaluator**: Model performance evaluation, responsible for measuring model inference service performance, including performance testing, stress testing, performance report generation, and visualization.
 5. **Evaluation Report**: The final generated evaluation report summarizes the model's performance, which can be used for decision-making and further model optimization.
 6. **Visualization**: Visualization results help users intuitively understand evaluation results, facilitating analysis and comparison of different model performances.
+</details>
+
 
 ## 🎉 News
-- **[2024.
-- **[2024.
-- **[2024.
-- **[2024.
-- **[2024.
-- **[2024.
+- 🔥 **[2024.09.18]** Our documentation has been updated to include a blog module, featuring some technical research and discussions related to evaluations. We invite you to [📖 read it](https://evalscope.readthedocs.io/en/refact_readme/blog/index.html).
+- 🔥 **[2024.09.12]** Support for LongWriter evaluation, which supports 10,000+ word generation. You can use the benchmark [LongBench-Write](evalscope/third_party/longbench_write/README.md) to measure the long output quality as well as the output length.
+- 🔥 **[2024.08.30]** Support for custom dataset evaluations, including text datasets and multimodal image-text datasets.
+- 🔥 **[2024.08.20]** Updated the official documentation, including getting started guides, best practices, and FAQs. Feel free to [📖read it here](https://evalscope.readthedocs.io/en/latest/)!
+- 🔥 **[2024.08.09]** Simplified the installation process, allowing for pypi installation of vlmeval dependencies; optimized the multimodal model evaluation experience, achieving up to 10x acceleration based on the OpenAI API evaluation chain.
+- 🔥 **[2024.07.31]** Important change: The package name `llmuses` has been changed to `evalscope`. Please update your code accordingly.
+- 🔥 **[2024.07.26]** Support for **VLMEvalKit** as a third-party evaluation framework to initiate multimodal model evaluation tasks.
+- 🔥 **[2024.06.29]** Support for **OpenCompass** as a third-party evaluation framework, which we have encapsulated at a higher level, supporting pip installation and simplifying evaluation task configuration.
+- 🔥 **[2024.06.13]** EvalScope seamlessly integrates with the fine-tuning framework SWIFT, providing full-chain support from LLM training to evaluation.
+- 🔥 **[2024.06.13]** Integrated the Agent evaluation dataset ToolBench.
 
 
 
@@ -151,8 +159,8 @@ If prompted with `Do you wish to run the custom code? [y/N]`, please type `y`.
 
 #### Basic Parameter Descriptions
 - `--model`: Specifies the `model_id` of the model on [ModelScope](https://modelscope.cn/), allowing automatic download. For example, see the [Qwen2-0.5B-Instruct model link](https://modelscope.cn/models/qwen/Qwen2-0.5B-Instruct/summary); you can also use a local path, such as `/path/to/model`.
-- `--template-type`: Specifies the template type corresponding to the model. Refer to the `Default Template` field in the [template table](https://swift.readthedocs.io/en/latest/
-- `--datasets`: The dataset name, allowing multiple datasets to be specified, separated by spaces; these datasets will be automatically downloaded. Refer to the [supported datasets list](
+- `--template-type`: Specifies the template type corresponding to the model. Refer to the `Default Template` field in the [template table](https://swift.readthedocs.io/en/latest/Instruction/Supported-models-datasets.html#llm) for filling in this field.
+- `--datasets`: The dataset name, allowing multiple datasets to be specified, separated by spaces; these datasets will be automatically downloaded. Refer to the [supported datasets list](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html) for available options.
 
 ### 2. Parameterized Evaluation
 If you wish to conduct a more customized evaluation, such as modifying model parameters or dataset parameters, you can use the following commands:
@@ -162,8 +170,8 @@ If you wish to conduct a more customized evaluation, such as modifying model par
 python evalscope/run.py \
 --model qwen/Qwen2-0.5B-Instruct \
 --template-type qwen \
---model-args revision=
---datasets
+--model-args revision=master,precision=torch.float16,device_map=auto \
+--datasets gsm8k ceval \
 --use-cache true \
 --limit 10
 ```
@@ -228,24 +236,6 @@ from evalscope.run import run_task
 run_task(task_cfg=your_task_cfg)
 ```
 
-### Supported Datasets List
-> [!NOTE]
-> The framework currently supports the following datasets. If the dataset you need is not in the list, please submit an issue, or use the [OpenCompass backend](https://evalscope.readthedocs.io/en/latest/user_guides/opencompass_backend.html) for evaluation, or use the [VLMEvalKit backend](https://evalscope.readthedocs.io/en/latest/user_guides/vlmevalkit_backend.html) for multi-modal model evaluation.
-
-| Dataset Name       | Link                                                                                    | Status | Note |
-|--------------------|----------------------------------------------------------------------------------------|--------|------|
-| `mmlu`             | [mmlu](https://modelscope.cn/datasets/modelscope/mmlu/summary)                          | Active |      |
-| `ceval`            | [ceval](https://modelscope.cn/datasets/modelscope/ceval-exam/summary)                   | Active |      |
-| `gsm8k`            | [gsm8k](https://modelscope.cn/datasets/modelscope/gsm8k/summary)                        | Active |      |
-| `arc`              | [arc](https://modelscope.cn/datasets/modelscope/ai2_arc/summary)                        | Active |      |
-| `hellaswag`        | [hellaswag](https://modelscope.cn/datasets/modelscope/hellaswag/summary)                | Active |      |
-| `truthful_qa`      | [truthful_qa](https://modelscope.cn/datasets/modelscope/truthful_qa/summary)            | Active |      |
-| `competition_math` | [competition_math](https://modelscope.cn/datasets/modelscope/competition_math/summary)  | Active |      |
-| `humaneval`        | [humaneval](https://modelscope.cn/datasets/modelscope/humaneval/summary)                | Active |      |
-| `bbh`              | [bbh](https://modelscope.cn/datasets/modelscope/bbh/summary)                            | Active |      |
-| `race`             | [race](https://modelscope.cn/datasets/modelscope/race/summary)                          | Active |      |
-| `trivia_qa`        | [trivia_qa](https://modelscope.cn/datasets/modelscope/trivia_qa/summary)                | To be integrated | |
-
 
 ## Evaluation Backend
 EvalScope supports using third-party evaluation frameworks to initiate evaluation tasks, which we call Evaluation Backend. Currently supported Evaluation Backend includes:
{evalscope-0.5.3 → evalscope-0.5.4}/evalscope/backend/vlm_eval_kit/backend_manager.py

@@ -1,5 +1,5 @@
 from typing import Optional, Union
-from evalscope.utils import is_module_installed,
+from evalscope.utils import is_module_installed, get_valid_list
 from evalscope.backend.base import BackendManager
 from evalscope.utils.logger import get_logger
 from functools import partial
{evalscope-0.5.3 → evalscope-0.5.4}/evalscope/benchmarks/benchmark.py

@@ -46,7 +46,7 @@ class Benchmark(object):
 
     dataset.dataset_name = dataset_name.split('/')[-1]
     dataset.subset_name = subset
-    dataset.split = split
+    # dataset.split = split
     return dataset
 elif hub == 'HuggingFace':
     # TODO: implement this by xingjun.wxj@alibaba-inc.com
{evalscope-0.5.3 → evalscope-0.5.4}/evalscope/evaluator/evaluator.py

@@ -244,8 +244,8 @@ class Evaluator(object):
 answer_d[AnswerKeys.ORIGIN_PROMPT] = input_prompt
 
 if debug:
-    logger.
-    logger.
+    logger.info(f'**input_prompt: {json.dumps(input_prompt, ensure_ascii=False)} \n')
+    logger.info(f'**predicted ans: {json.dumps(answer_d, ensure_ascii=False)} \n')
 
 answers_list.append(answer_d)
 
@@ -349,7 +349,7 @@ class Evaluator(object):
 review_d = self._get_review(answer_d=answer_d, review_id=review_id, reviewer_spec=reviewer_spec)
 
 if debug:
-    logger.
+    logger.info(review_d)
 
 reviews_list.append(review_d)
 
evalscope-0.5.4/evalscope/models/api/openai_api.py (new file)

@@ -0,0 +1,228 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import json
+import threading
+import time
+from asyncio import Queue
+
+import requests
+from typing import Union, List, Optional, Dict
+from concurrent.futures import ThreadPoolExecutor
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+class OpenaiApi:
+
+    def __init__(self,
+                 model: str,
+                 openai_api_key,
+                 openai_api_base,
+                 logprobs: Optional[bool] = False,
+                 top_logprobs: Optional[int] = None,
+                 max_new_tokens: int = 4096,
+                 temperature: Optional[float] = 0.0,
+                 repetition_penalty: Optional[float] = 1.0,
+                 is_chat: bool = True,
+                 verbose: bool = True,
+                 retry: int = 3,
+                 query_per_second: int = 10,  # TODO
+                 **kwargs):
+
+        self.temperature = temperature
+        self.repetition_penalty = repetition_penalty
+        self.max_tokens = max_new_tokens
+        self.logprobs = logprobs
+        self.top_logprobs = top_logprobs
+
+        self.openai_api_key = openai_api_key
+        self.url = openai_api_base
+        self.model = model
+        self.is_chat = is_chat
+        self.retry = retry
+        self.verbose = verbose
+
+        self.token_bucket = TokenBucket(query_per_second, verbose)
+
+    def generate_simple(self, inputs: Union[List[str]]):
+
+        def process_one(in_data: str):
+
+            if self.is_chat:
+                data = dict(
+                    model=self.model,
+                    messages=[{'role': 'user', 'content': in_data}],
+                    max_tokens=self.max_tokens,
+                    n=1,
+                    logprobs=self.logprobs,
+                    top_logprobs=self.top_logprobs,
+                    stop=None,
+                    temperature=self.temperature,
+                    repetition_penalty=self.repetition_penalty,
+                )
+            else:
+                data = dict(
+                    model=self.model,
+                    prompt=in_data,
+                    max_tokens=self.max_tokens,
+                    temperature=self.temperature,
+                    repetition_penalty=self.repetition_penalty,
+                )
+
+            # todo
+            openai_api_key = self.openai_api_key or ''
+            header = {'Authorization': f'Bearer ', 'content-type': 'application/json', }
+            data = json.dumps(data, ensure_ascii=False)
+
+            if self.verbose:
+                print(f'>>data in generate_simple: {data}')
+
+            resp = requests.post(self.url, headers=header, data=data)
+            resp = resp.json()
+            if self.verbose:
+                print(f'>>resp in generate_simple: {resp}')
+
+            if self.logprobs:
+                return resp['choices']
+            else:
+                if self.is_chat:
+                    return resp['choices'][0]['message']['content'].strip()
+                else:
+                    return resp['choices'][0]['text'].strip()
+
+        with ThreadPoolExecutor() as executor:
+            results = list(executor.map(process_one, inputs))
+
+        return results
+
+    def generate(self,
+                 inputs: Union[List[str], List[List]],
+                 **kwargs) -> List[str]:
+        """
+        Generate responses from OpenAI API.
+
+        Args:
+            inputs: The input messages for the model. It can be a string or a list of messages.
+                e.g. ['who are you ?', 'what is your name ?']
+                e.g. [[{'role': 'user', 'content': 'who are you ?'}], ...]
+            kwargs: The optional arguments for the model.
+        """
+        results = []
+        # with ThreadPoolExecutor() as executor:
+        #     results = list(executor.map(self._generate, inputs))
+
+        for input in inputs:
+            results.append(self._generate(input))
+
+        return results
+
+    def _generate(self, messages: Union[str, List[Dict]]) -> str:
+
+        if isinstance(messages, str):
+            messages = [{'role': 'user', 'content': messages}]
+
+        max_num_retries = 0
+        while max_num_retries < self.retry:
+            # self.wait()
+
+            header = {
+                'Authorization': f'Bearer {self.openai_api_key}',
+                'content-type': 'application/json',
+            }
+
+            try:
+                if self.is_chat:
+                    data = dict(
+                        model=self.model,
+                        messages=messages,
+                        max_tokens=self.max_tokens,
+                        n=1,
+                        logprobs=self.logprobs,
+                        top_logprobs=self.top_logprobs,
+                        stop=None,
+                        temperature=self.temperature,
+                        repetition_penalty=self.repetition_penalty,
+                    )
+                else:
+                    # TODO: This is a temporary solution for non-chat models.
+                    input_prompts = []
+                    for msg in messages:
+                        input_prompts.append(msg['content'])
+
+                    data = dict(
+                        model=self.model,
+                        prompt='\n'.join(input_prompts),
+                        max_tokens=self.max_tokens,
+                        temperature=self.temperature,
+                        repetition_penalty=self.repetition_penalty,
+                    )
+
+                def remove_none_val(input_d: dict):
+                    return {k: v for k, v in input_d.items() if v is not None}
+                data = remove_none_val(data)
+
+                if self.verbose:
+                    logger.info(f'>> Post data: {json.dumps(data, ensure_ascii=False)}')
+                raw_response = requests.post(self.url,
+                                             headers=header,
+                                             data=json.dumps(data, ensure_ascii=False))
+
+                response = raw_response.json()
+                if self.verbose:
+                    logger.info(f'>> response: {response}')
+
+                if self.logprobs:
+                    return response['choices']
+                else:
+                    if self.is_chat:
+                        return response['choices'][0]['message']['content'].strip()
+                    else:
+                        return response['choices'][0]['text'].strip()
+
+            except Exception as e:
+                logger.error(f'Error occurs: {str(e)}')
+                max_num_retries += 1
+                continue
+
+    def wait(self):
+        return self.token_bucket.get_token()
+
+
+class TokenBucket:
+    """A token bucket for rate limiting.
+
+    Args:
+        query_per_second (float): The rate of the token bucket.
+    """
+
+    def __init__(self, rate, verbose=False):
+        self._rate = rate
+        self._tokens = threading.Semaphore(0)
+        self.started = False
+        self._request_queue = Queue()
+        self.logger = get_logger()
+        self.verbose = verbose
+
+    def _add_tokens(self):
+        """Add tokens to the bucket."""
+        while True:
+            if self._tokens._value < self._rate:
+                self._tokens.release()
+            time.sleep(1 / self._rate)
+
+    def get_token(self):
+        """Get a token from the bucket."""
+        if not self.started:
+            self.started = True
+            threading.Thread(target=self._add_tokens, daemon=True).start()
+        self._tokens.acquire()
+        if self.verbose:
+            cur_time = time.time()
+            while not self._request_queue.empty():
+                if cur_time - self._request_queue.queue[0] > 60:
+                    self._request_queue.get()
+                else:
+                    break
+            self._request_queue.put(cur_time)
+            self.logger.info(f'Current RPM {self._request_queue.qsize()}.')
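For orientation, below is a minimal, hypothetical usage sketch of the `OpenaiApi` wrapper added above. It assumes only what the new file shows: the class is defined in `evalscope/models/api/openai_api.py`, and `openai_api_base` is posted to directly by `requests.post`, so it must be the full endpoint URL. The model name, key, and URL in the sketch are placeholders, not values shipped with the package.

```python
# Hypothetical usage sketch of the OpenaiApi wrapper introduced in 0.5.4.
# The base URL, API key, and model name are placeholders.
from evalscope.models.api.openai_api import OpenaiApi

client = OpenaiApi(
    model='qwen2-7b-instruct',      # placeholder model name
    openai_api_key='EMPTY',         # placeholder key
    # Full endpoint URL: the class posts to this URL as-is.
    openai_api_base='http://127.0.0.1:8000/v1/chat/completions',
    max_new_tokens=512,
    temperature=0.0,
    is_chat=True,
    verbose=False,
)

# generate() accepts plain strings or OpenAI-style message lists and
# returns one decoded string per input.
outputs = client.generate([
    'Who are you?',
    [{'role': 'user', 'content': 'What is your name?'}],
])
for out in outputs:
    print(out)
```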
{evalscope-0.5.3 → evalscope-0.5.4}/evalscope/perf/http_client.py

@@ -51,15 +51,15 @@ UNLIMITED_RATE = -1
 
 
 async def on_request_start(session, context, params):
-    logger.
+    logger.info(f'Starting request: <{params}>')
 
 
 async def on_request_chunk_sent(session, context, params):
-    logger.
+    logger.info(f'Request body: {params}')
 
 
 async def on_response_chunk_received(session, context, params):
-    logger.
+    logger.info(f'Response info: <{params}>')
 
 
 class AioHttpClient:
@@ -116,7 +116,7 @@ class AioHttpClient:
 line = line.decode("utf8")
 line = line.rstrip("\n").rstrip("\r")
 if self.debug:
-    logger.
+    logger.info(line)
 sse_msg = ServerSentEvent.decode(line)
 if not sse_msg:
     continue
@@ -567,7 +567,7 @@ async def send_requests_worker(task_id, request_queue: asyncio.Queue, benchmark_
 else:
     if response_data:
         collected_messages.append(response_data)  # save the message
-        logger.
+        logger.info(response_data)
 benchmark_data["chunk_times"].append(time.perf_counter())
 
 benchmark_data["response_messages"] = collected_messages
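The `on_request_start`, `on_request_chunk_sent`, and `on_response_chunk_received` functions touched above have the shape of aiohttp client tracing callbacks. As a point of reference only (this snippet is not from evalscope), a minimal sketch of how hooks of this shape are typically attached via aiohttp's `TraceConfig`:

```python
# Illustrative only: wiring an aiohttp trace hook with the same signature
# as the callbacks changed in http_client.py. The URL is a placeholder.
import asyncio
import aiohttp


async def on_request_start(session, context, params):
    print(f'Starting request: <{params}>')


async def main():
    trace_config = aiohttp.TraceConfig()
    trace_config.on_request_start.append(on_request_start)
    async with aiohttp.ClientSession(trace_configs=[trace_config]) as session:
        async with session.get('https://example.org') as resp:
            await resp.text()


asyncio.run(main())
```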