evalscope 0.8.0__tar.gz → 0.8.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of evalscope might be problematic.
- {evalscope-0.8.0/evalscope.egg-info → evalscope-0.8.2}/PKG-INFO +15 -3
- {evalscope-0.8.0 → evalscope-0.8.2}/README.md +12 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/base.py +1 -1
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/utils/clip.py +2 -2
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/utils/embedding.py +1 -1
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/gsm8k/gsm8k_adapter.py +2 -1
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/hellaswag/hellaswag_adapter.py +2 -1
- evalscope-0.8.2/evalscope/benchmarks/humaneval/humaneval_adapter.py +206 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/race/race_adapter.py +2 -1
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/config.py +38 -2
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/constants.py +24 -38
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/evaluator/__init__.py +0 -1
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/evaluator/evaluator.py +6 -4
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/evaluator/rating_eval.py +1 -1
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/evaluator/reviewer/auto_reviewer.py +2 -1
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/models/model_adapter.py +1 -1
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/perf/arguments.py +3 -1
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/perf/benchmark.py +3 -3
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/perf/main.py +5 -6
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/perf/plugin/api/openai_api.py +53 -49
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/perf/plugin/registry.py +3 -3
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/perf/utils/benchmark_util.py +4 -4
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/perf/utils/db_util.py +66 -22
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/perf/utils/local_server.py +4 -1
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/run.py +45 -82
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/run_arena.py +2 -1
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/summarizer.py +14 -26
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/third_party/longbench_write/eval.py +2 -1
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/third_party/longbench_write/longbench_write.py +2 -1
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/third_party/longbench_write/tools/data_etl.py +1 -1
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/tools/combine_reports.py +2 -4
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/tools/rewrite_eval_results.py +1 -1
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/utils/__init__.py +1 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/utils/chat_service.py +1 -1
- evalscope-0.8.2/evalscope/utils/io_utils.py +162 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/utils/logger.py +8 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/utils/utils.py +0 -175
- evalscope-0.8.2/evalscope/version.py +4 -0
- {evalscope-0.8.0 → evalscope-0.8.2/evalscope.egg-info}/PKG-INFO +15 -3
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope.egg-info/SOURCES.txt +1 -21
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope.egg-info/requires.txt +2 -2
- {evalscope-0.8.0 → evalscope-0.8.2}/requirements/rag.txt +1 -1
- {evalscope-0.8.0 → evalscope-0.8.2}/tests/cli/test_run.py +11 -12
- {evalscope-0.8.0 → evalscope-0.8.2}/tests/perf/test_perf.py +3 -2
- {evalscope-0.8.0 → evalscope-0.8.2}/tests/vlm/test_vlmeval.py +3 -2
- evalscope-0.8.0/evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +0 -87
- evalscope-0.8.0/evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +0 -36
- evalscope-0.8.0/evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +0 -26
- evalscope-0.8.0/evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +0 -41
- evalscope-0.8.0/evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +0 -7
- evalscope-0.8.0/evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +0 -60
- evalscope-0.8.0/evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +0 -36
- evalscope-0.8.0/evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +0 -24
- evalscope-0.8.0/evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +0 -35
- evalscope-0.8.0/evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
- evalscope-0.8.0/evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
- evalscope-0.8.0/evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
- evalscope-0.8.0/evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
- evalscope-0.8.0/evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +0 -34
- evalscope-0.8.0/evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +0 -36
- evalscope-0.8.0/evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +0 -25
- evalscope-0.8.0/evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -24
- evalscope-0.8.0/evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
- evalscope-0.8.0/evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +0 -16
- evalscope-0.8.0/evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +0 -24
- evalscope-0.8.0/evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -20
- evalscope-0.8.0/evalscope/evaluator/humaneval_evaluator.py +0 -158
- evalscope-0.8.0/evalscope/version.py +0 -4
- {evalscope-0.8.0 → evalscope-0.8.2}/LICENSE +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/MANIFEST.in +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/__init__.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/arguments.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/__init__.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/opencompass/__init__.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/opencompass/api_meta_template.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/opencompass/backend_manager.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/opencompass/tasks/__init__.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/opencompass/tasks/eval_api.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/opencompass/tasks/eval_datasets.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/__init__.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/backend_manager.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/clip_benchmark/__init__.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/clip_benchmark/arguments.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/clip_benchmark/task_template.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/clip_benchmark/tasks/__init__.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/cmteb/__init__.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/cmteb/arguments.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/cmteb/base.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/cmteb/task_template.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/cmteb/tasks/Classification.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/cmteb/tasks/STS.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/cmteb/tasks/__init__.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/ragas/__init__.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/ragas/arguments.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/ragas/task_template.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/ragas/tasks/__init__.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/ragas/tasks/build_transform.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/utils/__init__.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/utils/llm.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/utils/tools.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/vlm_eval_kit/__init__.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/vlm_eval_kit/backend_manager.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/__init__.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/arc/__init__.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/arc/ai2_arc.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/arc/arc_adapter.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/bbh/__init__.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/bbh/bbh_adapter.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/bbh/cot_prompts/navigate.txt +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/bbh/cot_prompts/snarks.txt +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/benchmark.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/ceval/__init__.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/ceval/ceval_adapter.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/ceval/ceval_exam.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/ceval/samples.jsonl +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/cmmlu/__init__.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/cmmlu/cmmlu.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/cmmlu/cmmlu_adapter.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/cmmlu/samples.jsonl +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/competition_math/__init__.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/competition_math/competition_math.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/data_adapter.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/general_qa/__init__.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/gsm8k/__init__.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/gsm8k/gsm8k.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/hellaswag/__init__.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/hellaswag/hellaswag.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/humaneval/__init__.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/humaneval/humaneval.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/mmlu/__init__.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/mmlu/mmlu.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/mmlu/mmlu_adapter.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/mmlu/samples.jsonl +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/race/__init__.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/race/race.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/race/samples.jsonl +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/trivia_qa/__init__.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/trivia_qa/samples.jsonl +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/truthful_qa/__init__.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/cli/__init__.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/cli/base.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/cli/cli.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/cli/start_eval.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/cli/start_perf.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/cli/start_server.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/evaluator/reviewer/__init__.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/metrics/__init__.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/metrics/bundled_rouge_score/__init__.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/metrics/bundled_rouge_score/rouge_scorer.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/metrics/code_metric.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/metrics/math_accuracy.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/metrics/metrics.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/metrics/rouge_metric.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/models/__init__.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/models/api/__init__.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/models/api/openai_api.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/models/custom/__init__.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/models/custom/custom_model.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/models/dummy_chat_model.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/models/model.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/models/openai_model.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/perf/__init__.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/perf/http_client.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/perf/plugin/__init__.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/perf/plugin/api/__init__.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/perf/plugin/api/base.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/perf/plugin/api/custom_api.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/perf/plugin/api/dashscope_api.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/perf/plugin/datasets/__init__.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/perf/plugin/datasets/base.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/perf/plugin/datasets/custom.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/perf/plugin/datasets/flickr8k.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/perf/plugin/datasets/line_by_line.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/perf/plugin/datasets/longalpaca.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/perf/plugin/datasets/openqa.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/perf/plugin/datasets/speed_benchmark.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/perf/utils/__init__.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/perf/utils/analysis_result.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/perf/utils/handler.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/registry/__init__.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/registry/config/cfg_arena.yaml +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/registry/config/cfg_arena_zhihu.yaml +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/registry/config/cfg_single.yaml +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/registry/data/qa_browser/battle.jsonl +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/registry/data/qa_browser/category_mapping.yaml +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/registry/data/question.jsonl +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/registry/tasks/arc.yaml +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/registry/tasks/bbh.yaml +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/registry/tasks/bbh_mini.yaml +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/registry/tasks/ceval.yaml +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/registry/tasks/ceval_mini.yaml +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/registry/tasks/cmmlu.yaml +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/registry/tasks/general_qa.yaml +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/registry/tasks/gsm8k.yaml +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/registry/tasks/mmlu.yaml +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/registry/tasks/mmlu_mini.yaml +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/third_party/__init__.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/third_party/longbench_write/README.md +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/third_party/longbench_write/__init__.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/third_party/longbench_write/default_task.json +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/third_party/longbench_write/default_task.yaml +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/third_party/longbench_write/infer.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/third_party/longbench_write/resources/__init__.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/third_party/longbench_write/resources/judge.txt +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/third_party/longbench_write/resources/longbench_write.jsonl +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/third_party/longbench_write/tools/__init__.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/third_party/longbench_write/utils.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/third_party/toolbench_static/README.md +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/third_party/toolbench_static/__init__.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/third_party/toolbench_static/config_default.json +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/third_party/toolbench_static/config_default.yaml +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/third_party/toolbench_static/eval.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/third_party/toolbench_static/infer.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/third_party/toolbench_static/llm/__init__.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/third_party/toolbench_static/requirements.txt +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/tools/__init__.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/tools/gen_mmlu_subject_mapping.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/utils/arena_utils.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/utils/completion_parsers.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope/utils/model_utils.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope.egg-info/dependency_links.txt +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope.egg-info/entry_points.txt +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope.egg-info/not-zip-safe +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/evalscope.egg-info/top_level.txt +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/requirements/docs.txt +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/requirements/framework.txt +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/requirements/inner.txt +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/requirements/opencompass.txt +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/requirements/perf.txt +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/requirements/tests.txt +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/requirements/vlmeval.txt +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/requirements.txt +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/setup.cfg +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/setup.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/tests/__init__.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/tests/cli/__init__.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/tests/perf/__init__.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/tests/rag/__init__.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/tests/rag/test_clip_benchmark.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/tests/rag/test_mteb.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/tests/rag/test_ragas.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/tests/swift/__init__.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/tests/swift/test_run_swift_eval.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/tests/swift/test_run_swift_vlm_eval.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/tests/swift/test_run_swift_vlm_jugde_eval.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/tests/test_run_all.py +0 -0
- {evalscope-0.8.0 → evalscope-0.8.2}/tests/vlm/__init__.py +0 -0
{evalscope-0.8.0/evalscope.egg-info → evalscope-0.8.2}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.8.0
+Version: 0.8.2
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team

@@ -54,7 +54,7 @@ Provides-Extra: vlmeval
 Requires-Dist: ms-vlmeval>=0.0.9; extra == "vlmeval"
 Provides-Extra: rag
 Requires-Dist: mteb==1.19.4; extra == "rag"
-Requires-Dist: ragas==0.2.
+Requires-Dist: ragas==0.2.9; extra == "rag"
 Requires-Dist: webdataset>0.2.0; extra == "rag"
 Provides-Extra: perf
 Requires-Dist: aiohttp; extra == "perf"

@@ -125,7 +125,7 @@ Requires-Dist: transformers_stream_generator; extra == "all"
 Requires-Dist: ms-opencompass>=0.1.4; extra == "all"
 Requires-Dist: ms-vlmeval>=0.0.9; extra == "all"
 Requires-Dist: mteb==1.19.4; extra == "all"
-Requires-Dist: ragas==0.2.
+Requires-Dist: ragas==0.2.9; extra == "all"
 Requires-Dist: webdataset>0.2.0; extra == "all"
 Requires-Dist: aiohttp; extra == "all"
 Requires-Dist: fastapi; extra == "all"

@@ -181,6 +181,8 @@ The framework accommodates multiple evaluation scenarios such as end-to-end RAG
 <br>EvalScope Framework.
 </p>
 
+<details><summary>Framework Description</summary>
+
 The architecture includes the following modules:
 1. **Model Adapter**: The model adapter is used to convert the outputs of specific models into the format required by the framework, supporting both API call models and locally run models.
 2. **Data Adapter**: The data adapter is responsible for converting and processing input data to meet various evaluation needs and formats.

@@ -194,6 +196,16 @@ The architecture includes the following modules:
 5. **Evaluation Report**: The final generated evaluation report summarizes the model's performance, which can be used for decision-making and further model optimization.
 6. **Visualization**: Visualization results help users intuitively understand evaluation results, facilitating analysis and comparison of different model performances.
 
+</details>
+
+## ☎ User Groups
+
+Please scan the QR code below to join our community groups:
+
+[Discord Group](https://discord.com/invite/D27yfEFVz5) | WeChat Group | DingTalk Group
+:-------------------------:|:-------------------------:|:-------------------------:
+<img src="docs/asset/discord_qr.jpg" width="160" height="160"> | <img src="docs/asset/wechat.png" width="160" height="160"> | <img src="docs/asset/dingding.png" width="160" height="160">
+
 
 ## 🎉 News
 - 🔥 **[2024.12.13]** Model evaluation optimization: no need to pass the `--template-type` parameter anymore; supports starting evaluation with `evalscope eval --args`. Refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html) for more details.
{evalscope-0.8.0 → evalscope-0.8.2}/README.md

@@ -45,6 +45,8 @@ The framework accommodates multiple evaluation scenarios such as end-to-end RAG
 <br>EvalScope Framework.
 </p>
 
+<details><summary>Framework Description</summary>
+
 The architecture includes the following modules:
 1. **Model Adapter**: The model adapter is used to convert the outputs of specific models into the format required by the framework, supporting both API call models and locally run models.
 2. **Data Adapter**: The data adapter is responsible for converting and processing input data to meet various evaluation needs and formats.

@@ -58,6 +60,16 @@ The architecture includes the following modules:
 5. **Evaluation Report**: The final generated evaluation report summarizes the model's performance, which can be used for decision-making and further model optimization.
 6. **Visualization**: Visualization results help users intuitively understand evaluation results, facilitating analysis and comparison of different model performances.
 
+</details>
+
+## ☎ User Groups
+
+Please scan the QR code below to join our community groups:
+
+[Discord Group](https://discord.com/invite/D27yfEFVz5) | WeChat Group | DingTalk Group
+:-------------------------:|:-------------------------:|:-------------------------:
+<img src="docs/asset/discord_qr.jpg" width="160" height="160"> | <img src="docs/asset/wechat.png" width="160" height="160"> | <img src="docs/asset/dingding.png" width="160" height="160">
+
 
 ## 🎉 News
 - 🔥 **[2024.12.13]** Model evaluation optimization: no need to pass the `--template-type` parameter anymore; supports starting evaluation with `evalscope eval --args`. Refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html) for more details.
{evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/utils/clip.py

@@ -4,7 +4,7 @@ import torch.nn.functional as F
 from langchain_core.embeddings import Embeddings
 from PIL import Image
 from transformers import AutoModel, AutoProcessor
-from typing import List
+from typing import List, Union
 
 from evalscope.backend.rag_eval.utils.tools import PIL_to_base64, download_model
 from evalscope.constants import HubType

@@ -86,7 +86,7 @@ class CLIPModel(Embeddings):
         self.transform = self.processor.image_processor
         self.tokenizer = self.processor.tokenizer
 
-    def encode_text(self, batch_texts: List[str]
+    def encode_text(self, batch_texts: Union[List[str], List[List[str]]]):
         if isinstance(batch_texts[0], list):
             batch_texts = [text for _, texts in enumerate(batch_texts) for text in texts]
         # Ensure that the input texts are within the token limit
{evalscope-0.8.0 → evalscope-0.8.2}/evalscope/backend/rag_eval/utils/embedding.py

@@ -80,7 +80,7 @@ class BaseModel(Embeddings):
         """Embed query text. Compact mteb."""
         raise NotImplementedError
 
-    def encode_corpus(self, corpus: List[str]
+    def encode_corpus(self, corpus: Union[List[str], List[Dict[str, str]]], **kwargs) -> list[torch.Tensor]:
         """Embed search docs . Compact mteb."""
         raise NotImplementedError
 
{evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/general_qa/general_qa_adapter.py

@@ -8,7 +8,7 @@ from typing import Any, Optional
 from evalscope.benchmarks.data_adapter import DataAdapter
 from evalscope.metrics.metrics import bleu_ngram_one_sample, weighted_mean
 from evalscope.metrics.rouge_metric import compute_rouge_score_one_sample_zh
-from evalscope.utils import jsonl_to_list
+from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
{evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/gsm8k/gsm8k_adapter.py

@@ -6,7 +6,8 @@ import re
 
 from evalscope.benchmarks import DataAdapter
 from evalscope.metrics.metrics import exact_match, weighted_mean
-from evalscope.utils import
+from evalscope.utils import normalize_score
+from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
 
 # flake8: noqa
{evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/hellaswag/hellaswag_adapter.py

@@ -5,7 +5,8 @@ import re
 
 from evalscope.benchmarks.data_adapter import DataAdapter
 from evalscope.metrics.metrics import exact_match, weighted_mean
-from evalscope.utils import
+from evalscope.utils import normalize_score
+from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
 
 # flake8: noqa
evalscope-0.8.2/evalscope/benchmarks/humaneval/humaneval_adapter.py

@@ -0,0 +1,206 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import json
+import os
+import re
+from tqdm import tqdm
+from typing import List
+
+from evalscope.benchmarks.data_adapter import DataAdapter
+from evalscope.metrics.metrics import weighted_mean
+from evalscope.tools.combine_reports import gen_table
+from evalscope.utils import normalize_score
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+DATASET_ID = 'modelscope/humaneval'
+SUBSET_LIST = ['openai_humaneval']
+
+# Example:
+# {"task_id": "HumanEval/0", "prompt": "from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\n", "entry_point": "has_close_elements", "canonical_solution": " for idx, elem in enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx != idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False\n assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False\n\n"} # noqa
+
+
+class HumanevalAdapter(DataAdapter):
+    """
+    A placeholder for humaneval adapter, see HumanevalEvaluator for implementation.
+    """
+
+    def __init__(self,
+                 subset_list: list = None,
+                 metric_list: list = None,
+                 few_shot_num: int = None,
+                 train_split: str = None,
+                 eval_split: str = 'test',
+                 prompt_template: str = 'Complete the following python code:\n',
+                 **kwargs):
+        try:
+            from human_eval.data import stream_jsonl, write_jsonl
+            from human_eval.evaluation import check_correctness
+        except ImportError:
+            raise ImportError('Please install human_eval:'
+                              'https://github.com/openai/human-eval/tree/master#installation , '
+                              'Note that you need to enable the execution code in the human_eval/execution.py first.')
+
+        if subset_list is None:
+            subset_list = SUBSET_LIST
+
+        if metric_list is None:
+            metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
+
+        self.k = [1]
+        self.num_workers = 4
+        self.timeout = 4.0
+        self.outputs = kwargs.get('outputs', None)
+
+        self.read_problems_func = stream_jsonl
+        self.write_jsonl_func = write_jsonl
+        self.eval_func = check_correctness
+
+        super().__init__(
+            subset_list=subset_list,
+            metric_list=metric_list,
+            few_shot_num=few_shot_num,
+            train_split=train_split,
+            eval_split=eval_split,
+            prompt_template=prompt_template,
+            **kwargs)
+
+    def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
+        data_dict = {}
+        for subset_name in subset_list:
+            data_dict[subset_name] = {}
+            # [{'task_id': '', 'prompt': '', 'entry_point': '', 'canonical_solution': '', 'test': ''}, ...]
+            data_dict[subset_name][self.eval_split] = [task for task in self.read_problems_func(dataset_name_or_path)]
+
+        return data_dict
+
+    def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
+        """
+        Generate prompt for the model.
+
+        Args:
+            input_d (dict): The raw input. A single data format of the Humaneval:
+            {'task_id': '', 'prompt': '', 'entry_point': '', 'canonical_solution': '', 'test': ''}
+        """
+        full_prompt = input_d['prompt']
+        full_prompt = f'{self.prompt_template}\n{full_prompt}' if self.prompt_template else full_prompt
+
+        return {'data': [full_prompt]}
+
+    def get_answers(self, infer_cfg: dict) -> List[dict]:
+        ans_list: list = []
+        system_prompt: str = ''
+        for task_id, data_d in tqdm(self.problems.items(), total=len(self.problems), desc='Predicting(problems)'):
+            prompt: str = system_prompt + data_d['prompt']
+            inputs: dict = {'data': [prompt]}
+
+            pred_res: dict = self.model_adapter.predict(inputs=inputs, infer_cfg=infer_cfg)
+
+            pred_ans: str = pred_res['choices'][0]['message']['content']
+            pred_ans = self._postprocess(pred_ans)
+
+            ans_list.append({'task_id': task_id, 'completion': pred_ans})
+
+        return ans_list
+
+    def eval(self, infer_cfg: dict, **kwargs):
+
+        # predict
+        ans_list: list = self.get_answers(infer_cfg)
+        ans_out_file: str = os.path.join(self.outputs_structure.predictions_dir, 'human_eval_predictions.jsonl')
+
+        self.write_jsonl_func(filename=ans_out_file, data=ans_list)
+        # logger.info(f'** Dump predictions to {ans_out_file} successfully.')
+        logger.info('** Dump predictions successfully.')
+
+        # evaluate results: e.g. {'pass@1': 0.333, 'pass@10': 0.111}
+        results = self.eval_func(
+            sample_file=ans_out_file,
+            k=self.k,
+            n_workers=self.num_workers,
+            timeout=self.timeout,
+            problem_file=self.problem_file)
+
+        # output: report
+        report_map: dict = self.gen_report(results=results)
+        report_dir: str = self.outputs_structure.reports_dir
+        report_file: str = os.path.join(report_dir, 'human_eval_report.json')
+
+        with open(report_file, 'w') as f:
+            f.write(json.dumps(report_map, ensure_ascii=False, indent=4))
+        # logger.info(f'** Dump report to {report_file} \n')
+        logger.info('** Dump report \n')
+
+        try:
+            # Make table
+            report_table: str = gen_table([report_dir])
+            logger.info(f'** Report table: \n {report_table} \n')
+        except Exception:
+            logger.error('Failed to generate report table.')
+
+    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
+        total_num: int = sum([num for _, num in subset_score_map.values()])
+        weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
+        weighted_avg_acc = normalize_score(score=weighted_avg_acc)
+        cate_avg_list = [{
+            'name': subset_name,
+            'score': normalize_score(score=score)
+        } for subset_name, (score, _) in subset_score_map.items()]
+
+        category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
+
+        res_map = dict(
+            name=report_name or 'HumanEval',
+            metric='pass@1',
+            score=weighted_avg_acc,
+            category=[category_d],
+            total_num=total_num)
+
+        return res_map
+
+    @classmethod
+    def _postprocess(cls, text: str) -> str:
+        if '```' in text:
+            blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
+            if len(blocks) == 0:
+                text = text.split('```')[1]  # fall back to default strategy
+            else:
+                text = blocks[0]  # fetch the first code block
+                if not text.startswith('\n'):  # in case starting with ```python
+                    text = text[max(text.find('\n') + 1, 0):]
+        if text.strip().startswith('from') or text.strip().startswith('import'):
+            def_idx = text.find('def')
+            if def_idx != -1:
+                text = text[max(text.find('\n', def_idx) + 1, 0):]
+        text = text.split('\n\n')[0]
+        if text.strip().startswith('def'):
+            text = '\n'.join(text.split('\n')[1:])
+        if not text.startswith('    '):
+            if text.startswith(' '):
+                text = '    ' + text.lstrip()
+            else:
+                text = '\n'.join(['    ' + line for line in text.split('\n')])
+        return text
+
+    def compute_metric(self, review_res_list: list) -> float:
+        """
+        Compute evaluation result by specific metric.
+
+        Args:
+            review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
+
+        Returns:
+            The metric score.
+        """
+        items = [(score, 1.0) for score in review_res_list]
+        return weighted_mean(items)
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
+        return self._postprocess(result)
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        return input_d
+
+    def match(self, gold: str, pred: str) -> float:
+        res = self.eval_func(gold, pred, self.timeout)
+        return float(res['passed'])
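The `_postprocess` helper in the new adapter strips markdown fences and a repeated function signature from a model completion before it is handed to the HumanEval checker. A minimal sketch of its behaviour, assuming evalscope 0.8.2 is installed (the classmethod can be called without instantiating the adapter, so the optional `human_eval` dependency is not needed for this snippet):

```python
from evalscope.benchmarks.humaneval.humaneval_adapter import HumanevalAdapter

# A typical chat-style completion: fenced code that repeats the signature.
raw_completion = (
    '```python\n'
    'def add(a, b):\n'
    '    return a + b\n'
    '```'
)

# _postprocess keeps only the indented function body, which is what the
# HumanEval harness appends to the original prompt.
body = HumanevalAdapter._postprocess(raw_completion)
print(repr(body))  # expected, per the logic above: '    return a + b\n'
```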
{evalscope-0.8.0 → evalscope-0.8.2}/evalscope/benchmarks/race/race_adapter.py

@@ -5,7 +5,8 @@ import os
 
 from evalscope.benchmarks.data_adapter import DataAdapter
 from evalscope.metrics.metrics import exact_match, weighted_mean
-from evalscope.utils import
+from evalscope.utils import normalize_score
+from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
 
 # flake8: noqa
{evalscope-0.8.0 → evalscope-0.8.2}/evalscope/config.py

@@ -9,7 +9,8 @@ from typing import Dict, List, Optional, Union
 
 from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, DEFAULT_WORK_DIR, EvalBackend, EvalStage, EvalType, HubType
 from evalscope.models.custom import CustomModel
-from evalscope.utils import
+from evalscope.utils import gen_hash
+from evalscope.utils.io_utils import dict_to_yaml, json_to_dict, yaml_to_dict
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()

@@ -31,6 +32,7 @@ DEFAULT_GENERATION_CONFIG = {
 class TaskConfig:
     # Model-related arguments
     model: Union[str, CustomModel, None] = None
+    model_id: Optional[str] = None
     model_args: Optional[Dict] = field(default_factory=lambda: DEFAULT_MODEL_ARGS | {})
 
     # Template-related arguments

@@ -64,6 +66,13 @@ class TaskConfig:
     dry_run: bool = False
     seed: int = 42
 
+    def __post_init__(self):
+        if (not self.model_id) and self.model:
+            if isinstance(self.model, CustomModel):
+                self.model_id = type(self.model).__name__
+            else:
+                self.model_id = os.path.basename(self.model).rstrip(os.sep)
+
     def to_dict(self):
         # Note: to avoid serialization error for some model instance
         return self.__dict__

@@ -105,7 +114,9 @@
     def from_args(args: Namespace):
         # Convert Namespace to a dictionary and filter out None values
         args_dict = {k: v for k, v in vars(args).items() if v is not None}
-
+
+        if 'func' in args_dict:
+            del args_dict['func']  # Note: compat CLI arguments
 
         return TaskConfig.from_dict(args_dict)
 

@@ -119,6 +130,7 @@
                 continue
 
             task.model = custom_model
+            task.model_id = type(custom_model).__name__
             res_list.append(task)
 
         return res_list

@@ -168,6 +180,30 @@ tasks = ['arc', 'gsm8k', 'mmlu', 'cmmlu', 'ceval', 'bbh', 'general_qa']
 registry_tasks = {task: TaskConfig.from_yaml(os.path.join(cur_path, f'registry/tasks/{task}.yaml')) for task in tasks}
 
 
+def parse_task_config(task_cfg) -> TaskConfig:
+    """Parse task configuration from various formats into a TaskConfig object."""
+    if isinstance(task_cfg, TaskConfig):
+        logger.info('Args: Task config is provided with TaskConfig type.')
+    elif isinstance(task_cfg, dict):
+        logger.info('Args: Task config is provided with dictionary type.')
+        task_cfg = TaskConfig.from_dict(task_cfg)
+    elif isinstance(task_cfg, Namespace):
+        logger.info('Args: Task config is provided with CommandLine type.')
+        task_cfg = TaskConfig.from_args(task_cfg)
+    elif isinstance(task_cfg, str):
+        extension = task_cfg.split('.')[-1]
+        logger.info(f'Args: Task config is provided with {extension} file type.')
+        if extension in ['yaml', 'yml']:
+            task_cfg = TaskConfig.from_yaml(task_cfg)
+        elif extension == 'json':
+            task_cfg = TaskConfig.from_json(task_cfg)
+        else:
+            raise ValueError('Args: Unsupported file extension.')
+    else:
+        raise ValueError('Args: Please provide a valid task config.')
+    return task_cfg
+
+
 class TempModel(CustomModel):
 
     def __init__(self, config: dict):
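The new `parse_task_config` helper accepts a `TaskConfig`, a plain dict, an argparse `Namespace`, or a path to a YAML/JSON file and normalizes all of them to a `TaskConfig`; the new `__post_init__` derives `model_id` from the model path (or the `CustomModel` class name) when it is not given. A rough usage sketch — the model, dataset and file names below are illustrative placeholders, not values taken from this release:

```python
from argparse import Namespace
from evalscope.config import TaskConfig, parse_task_config

# From a plain dictionary.
cfg = parse_task_config({'model': 'qwen/Qwen2-0.5B-Instruct', 'datasets': ['gsm8k']})
assert isinstance(cfg, TaskConfig)

# From CLI-style arguments; the 'func' entry injected by argparse sub-commands
# is dropped by TaskConfig.from_args.
cfg = parse_task_config(Namespace(model='qwen/Qwen2-0.5B-Instruct', datasets=['gsm8k']))

# From a config file; the extension ('yaml'/'yml'/'json') selects the parser.
cfg = parse_task_config('my_task.yaml')  # hypothetical file
```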
{evalscope-0.8.0 → evalscope-0.8.2}/evalscope/constants.py

@@ -1,5 +1,4 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import os
 from modelscope.utils.constant import DEFAULT_REPOSITORY_REVISION
 from modelscope.utils.file_utils import get_dataset_cache_root, get_model_cache_root
 

@@ -7,6 +6,7 @@ DEFAULT_WORK_DIR = './outputs'
 DEFAULT_MODEL_REVISION = DEFAULT_REPOSITORY_REVISION  # master
 DEFAULT_MODEL_CACHE_DIR = get_model_cache_root()  # ~/.cache/modelscope/hub
 DEFAULT_DATASET_CACHE_DIR = get_dataset_cache_root()  # ~/.cache/modelscope/datasets
+DEFAULT_ROOT_CACHE_DIR = DEFAULT_DATASET_CACHE_DIR  # compatible with old version
 
 
 class HubType:

@@ -76,33 +76,6 @@ class ArenaMode:
     PAIRWISE_BASELINE = 'pairwise_baseline'
 
 
-class OutputsStructure:
-    LOGS_DIR = 'logs'
-    PREDICTIONS_DIR = 'predictions'
-    REVIEWS_DIR = 'reviews'
-    REPORTS_DIR = 'reports'
-    CONFIGS_DIR = 'configs'
-
-    def __init__(self, outputs_dir: str, is_make: bool = True):
-        self.outputs_dir = outputs_dir
-        self.logs_dir = os.path.join(outputs_dir, OutputsStructure.LOGS_DIR)
-        self.predictions_dir = os.path.join(outputs_dir, OutputsStructure.PREDICTIONS_DIR)
-        self.reviews_dir = os.path.join(outputs_dir, OutputsStructure.REVIEWS_DIR)
-        self.reports_dir = os.path.join(outputs_dir, OutputsStructure.REPORTS_DIR)
-        self.configs_dir = os.path.join(outputs_dir, OutputsStructure.CONFIGS_DIR)
-
-        if is_make:
-            self.create_directories()
-
-    def create_directories(self):
-        os.makedirs(self.outputs_dir, exist_ok=True)
-        os.makedirs(self.logs_dir, exist_ok=True)
-        os.makedirs(self.predictions_dir, exist_ok=True)
-        os.makedirs(self.reviews_dir, exist_ok=True)
-        os.makedirs(self.reports_dir, exist_ok=True)
-        os.makedirs(self.configs_dir, exist_ok=True)
-
-
 class AnswerKeys:
     ANSWER_ID = 'answer_id'
     RAW_INPUT = 'raw_input'

@@ -166,17 +139,30 @@ class EvalType:
 
 
 class EvalBackend:
-    # Use native evaluation pipeline of EvalScope
-    NATIVE = 'Native'
 
-
-
+    class _Backend:
+        # compatible with old version, set 'value'
+
+        def __init__(self, value):
+            self._value = value
+
+        @property
+        def value(self):
+            return self._value
+
+        def __str__(self):
+            return self._value
 
-
-
+        def __repr__(self):
+            return f"'{self._value}'"
 
-
-
+        def __eq__(self, other):
+            if isinstance(other, str):
+                return self._value == other
+            return NotImplemented
 
-
-
+    NATIVE = _Backend('Native')
+    OPEN_COMPASS = _Backend('OpenCompass')
+    VLM_EVAL_KIT = _Backend('VLMEvalKit')
+    RAG_EVAL = _Backend('RAGEval')
+    THIRD_PARTY = _Backend('ThirdParty')
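The `_Backend` wrapper keeps the old enum-style access pattern working while letting the backend constants compare directly against plain strings; a quick illustration based only on the methods shown in the hunk above:

```python
from evalscope.constants import EvalBackend

assert EvalBackend.NATIVE == 'Native'            # __eq__ accepts plain strings
assert EvalBackend.NATIVE.value == 'Native'      # old `.value` access still works
assert str(EvalBackend.OPEN_COMPASS) == 'OpenCompass'
print(repr(EvalBackend.RAG_EVAL))                # "'RAGEval'", via __repr__
```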
{evalscope-0.8.0 → evalscope-0.8.2}/evalscope/evaluator/evaluator.py

@@ -11,10 +11,11 @@ from typing import Any, Dict, List, Optional, Union
 from evalscope.benchmarks import DataAdapter
 from evalscope.config import TaskConfig
 from evalscope.constants import (DEFAULT_DATASET_CACHE_DIR, AnswerKeys, DumpMode, EvalStage, EvalType, HubType,
-
+                                 ReviewKeys)
 from evalscope.models.model_adapter import BaseModelAdapter, CustomModelAdapter
 from evalscope.tools.combine_reports import gen_table
-from evalscope.utils import dict_torch_dtype_to_str,
+from evalscope.utils import dict_torch_dtype_to_str, gen_hash
+from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, jsonl_to_list
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()

@@ -56,8 +57,8 @@ class Evaluator(object):
                  **kwargs):
 
         self.dataset_name_or_path = os.path.expanduser(dataset_name_or_path)
-        self.dataset_name = os.path.basename(self.dataset_name_or_path.rstrip(os.sep))
-        self.model_name =
+        self.dataset_name = os.path.basename(self.dataset_name_or_path.rstrip(os.sep)).split('.')[0]
+        self.model_name = overall_task_cfg.model_id
         self.custom_task_name = f'{self.model_name}_{self.dataset_name}'
 
         self.datasets_dir = os.path.expanduser(datasets_dir)

@@ -85,6 +86,7 @@ class Evaluator(object):
             **kwargs)
 
         # Get prompts from dataset
+        # TODO: support sampler
         self.prompts = self.data_adapter.gen_prompts(data_dict=self.dataset)
         del self.dataset
 
{evalscope-0.8.0 → evalscope-0.8.2}/evalscope/evaluator/rating_eval.py

@@ -5,8 +5,8 @@ import pyarrow as pa
 from typing import List, Union
 
 from evalscope.constants import MetricMembers
-from evalscope.utils import jsonl_to_list
 from evalscope.utils.arena_utils import compute_elo
+from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
{evalscope-0.8.0 → evalscope-0.8.2}/evalscope/evaluator/reviewer/auto_reviewer.py

@@ -12,8 +12,9 @@ from typing import Any, List
 
 from evalscope.constants import ArenaMode, EvalConfigKeys, FnCompletionParser, PositionBiasMitigation
 from evalscope.models.openai_model import OpenAIModel
-from evalscope.utils import completion_parsers,
+from evalscope.utils import completion_parsers, random_seeded_choice
 from evalscope.utils.arena_utils import get_battle_pairs, merge_ques_ans, shuffle_pairwise_preferences
+from evalscope.utils.io_utils import dump_jsonl_data, jsonl_to_list
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
{evalscope-0.8.0 → evalscope-0.8.2}/evalscope/models/model_adapter.py

@@ -429,7 +429,7 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
         fix_do_sample_warning(self.generation_config)
 
         # Run inference
-        output_ids = self.model.generate(
+        output_ids = self.model.generate(input_ids, generation_config=self.generation_config)
 
         response = self.tokenizer.decode(output_ids[0, len(input_ids[0]):], skip_special_tokens=True)
         return response
{evalscope-0.8.0 → evalscope-0.8.2}/evalscope/perf/arguments.py

@@ -16,7 +16,7 @@ class Arguments:
     attn_implementation: Optional[str] = None  # Attention implementaion, only for local inference
     api: str = 'openai'  # API to be used (default: 'openai')
     tokenizer_path: Optional[str] = None  # Path to the tokenizer
-    port:
+    port: int = 8877  # Port number for the local API server
 
     # Connection settings
     url: str = 'http://127.0.0.1:8877/v1/chat/completions'  # URL for the API connection

@@ -68,6 +68,7 @@
             model=args.model,
             attn_implementation=args.attn_implementation,
             url=args.url,
+            port=args.port,
             api_key=args.api_key,
             connect_timeout=args.connect_timeout,
             read_timeout=args.read_timeout,

@@ -138,6 +139,7 @@ def add_argument(parser: argparse.ArgumentParser):
 
     # Connection settings
     parser.add_argument('--url', type=str, default='http://127.0.0.1:8877/v1/chat/completions')
+    parser.add_argument('--port', type=int, default=8877, help='The port for local inference')
     parser.add_argument('--headers', nargs='+', dest='headers', action=ParseKVAction, help='Extra HTTP headers')
     parser.add_argument('--api-key', type=str, required=False, default='EMPTY', help='The API key for authentication')
     parser.add_argument('--connect-timeout', type=int, default=120, help='The network connection timeout')
{evalscope-0.8.0 → evalscope-0.8.2}/evalscope/perf/benchmark.py

@@ -157,7 +157,7 @@ async def statistic_benchmark_metric_worker(benchmark_data_queue: asyncio.Queue,
     while not (data_process_completed_event.is_set() and benchmark_data_queue.empty()):
         try:
             # Attempt to get benchmark data from the queue with a timeout
-            benchmark_data = await asyncio.wait_for(benchmark_data_queue.get(), timeout=
+            benchmark_data = await asyncio.wait_for(benchmark_data_queue.get(), timeout=0.01)
             benchmark_data_queue.task_done()
         except asyncio.TimeoutError:
             # If timeout, continue to the next iteration
{evalscope-0.8.0 → evalscope-0.8.2}/evalscope/perf/main.py

@@ -195,9 +195,9 @@ async def start_server(args: Arguments) -> bool:
     server.start()
 
     if args.dataset.startswith('speed_benchmark'):
-        args.url = 'http://127.0.0.1:
+        args.url = f'http://127.0.0.1:{args.port}/v1/completions'
     else:
-        args.url = 'http://127.0.0.1:
+        args.url = f'http://127.0.0.1:{args.port}/v1/chat/completions'
 
     if not await test_connection(args):
         raise TimeoutError('Test connection failed')
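With the new `--port` option, the local-server path in `perf/main.py` derives the request URL from the configured port instead of the previously hard-coded 8877. A rough Python-side sketch of the same idea — field values are placeholders, and `Arguments` may require additional fields depending on the run mode:

```python
from evalscope.perf.arguments import Arguments

# Placeholders: any locally served model and dataset plugin would do.
args = Arguments(model='qwen/Qwen2-0.5B-Instruct', dataset='openqa', port=8006)

# Mirrors the URL construction added in perf/main.py for the non-speed-benchmark case.
url = f'http://127.0.0.1:{args.port}/v1/chat/completions'
print(url)  # http://127.0.0.1:8006/v1/chat/completions
```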