evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +11 -0
- evalscope/api/benchmark/adapters/__init__.py +7 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +404 -0
- evalscope/api/benchmark/meta.py +124 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +370 -0
- evalscope/api/dataset/loader.py +266 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +382 -0
- evalscope/api/evaluator/evaluator.py +61 -0
- evalscope/api/evaluator/state.py +280 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +248 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +60 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/llm_judge_mixin.py +170 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +161 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/__init__.py +28 -0
- evalscope/app/app.py +38 -0
- evalscope/app/arguments.py +11 -0
- evalscope/app/constants.py +22 -0
- evalscope/app/ui/__init__.py +20 -0
- evalscope/app/ui/app_ui.py +53 -0
- evalscope/app/ui/multi_model.py +353 -0
- evalscope/app/ui/sidebar.py +42 -0
- evalscope/app/ui/single_model.py +220 -0
- evalscope/app/ui/visualization.py +36 -0
- evalscope/app/utils/data_utils.py +195 -0
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/localization.py +221 -0
- evalscope/app/utils/text_utils.py +119 -0
- evalscope/app/utils/visualization.py +96 -0
- evalscope/arguments.py +32 -9
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +10 -7
- evalscope/backend/rag_eval/__init__.py +1 -1
- evalscope/backend/rag_eval/backend_manager.py +23 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
- evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- evalscope/backend/rag_eval/ragas/arguments.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
- evalscope/backend/rag_eval/utils/embedding.py +125 -32
- evalscope/backend/rag_eval/utils/llm.py +16 -16
- evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
- evalscope/benchmarks/__init__.py +17 -5
- evalscope/benchmarks/aa_lcr/__init__.py +0 -0
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/__init__.py +0 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/__init__.py +0 -0
- evalscope/benchmarks/aime/aime24_adapter.py +55 -0
- evalscope/benchmarks/aime/aime25_adapter.py +181 -0
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
- evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
- evalscope/benchmarks/amc/__init__.py +0 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arc/arc_adapter.py +34 -149
- evalscope/benchmarks/arena_hard/__init__.py +0 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
- evalscope/benchmarks/arena_hard/utils.py +186 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
- evalscope/benchmarks/bfcl/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
- evalscope/benchmarks/bfcl/v3/generation.py +222 -0
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
- evalscope/benchmarks/data_collection/__init__.py +0 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
- evalscope/benchmarks/docmath/__init__.py +0 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
- evalscope/benchmarks/docmath/utils.py +219 -0
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/__init__.py +0 -0
- evalscope/benchmarks/drop/drop_adapter.py +155 -0
- evalscope/benchmarks/drop/utils.py +156 -0
- evalscope/benchmarks/frames/__init__.py +0 -0
- evalscope/benchmarks/frames/frames_adapter.py +175 -0
- evalscope/benchmarks/frames/utils.py +37 -0
- evalscope/benchmarks/general_arena/__init__.py +0 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
- evalscope/benchmarks/general_arena/utils.py +223 -0
- evalscope/benchmarks/general_mcq/__init__.py +0 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
- evalscope/benchmarks/gpqa/__init__.py +0 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
- evalscope/benchmarks/gpqa/prompt.py +88 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
- evalscope/benchmarks/hle/__init__.py +0 -0
- evalscope/benchmarks/hle/hle_adapter.py +153 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
- evalscope/benchmarks/ifeval/instructions.py +112 -68
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
- evalscope/benchmarks/live_code_bench/__init__.py +0 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
- evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
- evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
- evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
- evalscope/benchmarks/live_code_bench/prompts.py +207 -0
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/maritime_bench/__init__.py +0 -0
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
- evalscope/benchmarks/math_500/__init__.py +0 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
- evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/musr/__init__.py +0 -0
- evalscope/benchmarks/musr/musr_adapter.py +43 -0
- evalscope/benchmarks/needle_haystack/__init__.py +0 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
- evalscope/benchmarks/needle_haystack/utils.py +79 -0
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/__init__.py +0 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/race/race_adapter.py +33 -120
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/__init__.py +0 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/super_gpqa/__init__.py +0 -0
- evalscope/benchmarks/super_gpqa/prompt.py +88 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
- evalscope/benchmarks/super_gpqa/utils.py +86 -0
- evalscope/benchmarks/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/__init__.py +0 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
- evalscope/benchmarks/tool_bench/utils.py +203 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/winogrande/__init__.py +0 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +12 -2
- evalscope/cli/start_eval.py +4 -3
- evalscope/cli/start_perf.py +10 -2
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +27 -3
- evalscope/collections/sampler.py +12 -11
- evalscope/collections/schema.py +13 -12
- evalscope/config.py +218 -147
- evalscope/constants.py +78 -82
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +334 -318
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +59 -3
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
- evalscope/metrics/llm_judge.py +211 -0
- evalscope/metrics/math_parser.py +545 -0
- evalscope/metrics/metric.py +611 -0
- evalscope/metrics/metrics.py +112 -23
- evalscope/metrics/rouge_metric.py +11 -13
- evalscope/metrics/t2v_metrics/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/clipscore.py +14 -0
- evalscope/metrics/t2v_metrics/constants.py +12 -0
- evalscope/metrics/t2v_metrics/itmscore.py +14 -0
- evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
- evalscope/metrics/t2v_metrics/models/model.py +45 -0
- evalscope/metrics/t2v_metrics/models/utils.py +25 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
- evalscope/metrics/t2v_metrics/score.py +78 -0
- evalscope/metrics/t2v_metrics/vqascore.py +14 -0
- evalscope/models/__init__.py +23 -13
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +69 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +144 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +708 -0
- evalscope/perf/__init__.py +0 -1
- evalscope/perf/arguments.py +103 -69
- evalscope/perf/benchmark.py +114 -163
- evalscope/perf/http_client.py +59 -89
- evalscope/perf/main.py +91 -18
- evalscope/perf/plugin/__init__.py +3 -2
- evalscope/perf/plugin/api/__init__.py +4 -3
- evalscope/perf/plugin/api/base.py +27 -7
- evalscope/perf/plugin/api/custom_api.py +170 -57
- evalscope/perf/plugin/api/dashscope_api.py +4 -10
- evalscope/perf/plugin/api/default_api.py +214 -0
- evalscope/perf/plugin/api/openai_api.py +120 -41
- evalscope/perf/plugin/datasets/__init__.py +10 -6
- evalscope/perf/plugin/datasets/base.py +43 -1
- evalscope/perf/plugin/datasets/custom.py +22 -3
- evalscope/perf/plugin/datasets/flickr8k.py +5 -27
- evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
- evalscope/perf/plugin/datasets/line_by_line.py +7 -3
- evalscope/perf/plugin/datasets/longalpaca.py +7 -3
- evalscope/perf/plugin/datasets/openqa.py +13 -14
- evalscope/perf/plugin/datasets/random_dataset.py +67 -0
- evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
- evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
- evalscope/perf/plugin/registry.py +36 -16
- evalscope/perf/utils/analysis_result.py +24 -23
- evalscope/perf/utils/benchmark_util.py +95 -55
- evalscope/perf/utils/db_util.py +115 -78
- evalscope/perf/utils/local_server.py +12 -47
- evalscope/perf/utils/log_utils.py +63 -0
- evalscope/perf/utils/rich_display.py +192 -0
- evalscope/report/__init__.py +46 -3
- evalscope/report/combinator.py +143 -32
- evalscope/report/generator.py +74 -34
- evalscope/report/report.py +238 -0
- evalscope/run.py +71 -46
- evalscope/summarizer.py +5 -5
- evalscope/third_party/longbench_write/infer.py +1 -1
- evalscope/third_party/thinkbench/__init__.py +3 -0
- evalscope/third_party/thinkbench/eval.py +441 -0
- evalscope/third_party/thinkbench/infer.py +130 -0
- evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
- evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
- evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- evalscope/third_party/thinkbench/tools/llm.py +48 -0
- evalscope/third_party/thinkbench/tools/utils.py +13 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/__init__.py +82 -2
- evalscope/utils/argument_utils.py +64 -0
- evalscope/utils/chat_service.py +8 -6
- evalscope/utils/deprecation_utils.py +53 -0
- evalscope/utils/function_utils.py +266 -0
- evalscope/utils/import_utils.py +154 -0
- evalscope/utils/io_utils.py +336 -8
- evalscope/utils/json_schema.py +231 -0
- evalscope/utils/logger.py +121 -31
- evalscope/utils/model_utils.py +57 -1
- evalscope/utils/multi_choices.py +303 -0
- evalscope/utils/ner.py +377 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- evalscope-1.2.0.dist-info/METADATA +553 -0
- evalscope-1.2.0.dist-info/RECORD +628 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -76
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/ceval/samples.jsonl +0 -1
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -291
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/collections/evaluator.py +0 -198
- evalscope/evaluator/rating_eval.py +0 -157
- evalscope/evaluator/reviewer/__init__.py +0 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope/metrics/code_metric.py +0 -98
- evalscope/metrics/named_metrics.py +0 -17
- evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
- evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
- evalscope/models/base_adapter.py +0 -52
- evalscope/models/chat_adapter.py +0 -138
- evalscope/models/choice_adapter.py +0 -211
- evalscope/models/custom/__init__.py +0 -3
- evalscope/models/custom/custom_model.py +0 -53
- evalscope/models/custom/dummy_model.py +0 -63
- evalscope/models/custom_adapter.py +0 -67
- evalscope/models/local_model.py +0 -74
- evalscope/models/model.py +0 -229
- evalscope/models/server_adapter.py +0 -111
- evalscope/registry/__init__.py +0 -1
- evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope/registry/data/question.jsonl +0 -80
- evalscope/registry/tasks/arc.yaml +0 -28
- evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope/report/app.py +0 -506
- evalscope/report/utils.py +0 -133
- evalscope/run_arena.py +0 -202
- evalscope/utils/arena_utils.py +0 -217
- evalscope/utils/completion_parsers.py +0 -82
- evalscope/utils/utils.py +0 -301
- evalscope-0.10.0.dist-info/METADATA +0 -565
- evalscope-0.10.0.dist-info/RECORD +0 -286
- tests/__init__.py +0 -1
- tests/cli/__init__.py +0 -1
- tests/cli/test_collection.py +0 -57
- tests/cli/test_run.py +0 -165
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -101
- tests/rag/test_clip_benchmark.py +0 -85
- tests/rag/test_mteb.py +0 -138
- tests/rag/test_ragas.py +0 -120
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -145
- tests/swift/test_run_swift_vlm_eval.py +0 -127
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
- tests/test_run_all.py +0 -12
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -60
- {tests/rag → evalscope/api}/__init__.py +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/third_party/thinkbench/infer.py
ADDED
@@ -0,0 +1,130 @@
+import os
+
+from evalscope import TaskConfig, run_task
+
+DASHSCOPE_API_KEY = 'sk-723135c241x'
+
+def eval_distill_qwen():
+    model_name = 'DeepSeek-R1-Distill-Qwen-7B'
+    dataset_name = 'math_500'
+    subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5']
+
+    task_config = TaskConfig(
+        api_url='http://0.0.0.0:8801/v1/chat/completions',
+        model=model_name,
+        eval_type='service',
+        datasets=[dataset_name],
+        dataset_args={dataset_name: {'few_shot_num': 0, 'subset_list': subsets}},
+        eval_batch_size=32,
+        generation_config={
+            'max_tokens': 20000,  # avoid exceed max length
+            'temperature': 0.6,
+            'top_p': 0.95,
+            'n': 1,
+        },
+    )
+    run_task(task_config)
+
+
+def eval_math_qwen():
+    model_name = 'Qwen2.5-Math-7B-Instruct'
+    dataset_name = 'math_500'
+    subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5']
+
+    task_config = TaskConfig(
+        api_url='http://0.0.0.0:8801/v1/chat/completions',
+        model=model_name,
+        eval_type='service',
+        datasets=[dataset_name],
+        dataset_args={dataset_name: {'few_shot_num': 0, 'subset_list': subsets}},
+        eval_batch_size=32,
+        generation_config={
+            'max_tokens': 3000,  # avoid exceed max length
+            'temperature': 0.6,
+            'top_p': 0.95,
+            'n': 3,
+        },
+    )
+    run_task(task_config)
+
+def eval_r1():
+    model_name = 'deepseek-r1'
+    dataset_name = 'math_500'
+    subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5']
+
+    task_config = TaskConfig(
+        api_url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
+        api_key=DASHSCOPE_API_KEY,
+        model=model_name,
+        eval_type='service',
+        datasets=[dataset_name],
+        dataset_args={dataset_name: {'few_shot_num': 0, 'subset_list': subsets}},
+        eval_batch_size=8,
+        generation_config={
+            'max_tokens': 20000,  # avoid exceed max length
+            'temperature': 0.6,
+            'top_p': 0.95,
+            'n': 1,
+        },
+        use_cache='./outputs/20250307_000404',
+        timeout=36000,
+        stream=True
+    )
+    run_task(task_config)
+
+
+def eval_distill_32b():
+    model_name = 'deepseek-r1-distill-qwen-32b'
+    dataset_name = 'math_500'
+    subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5']
+
+    task_config = TaskConfig(
+        api_url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
+        api_key=DASHSCOPE_API_KEY,
+        model=model_name,
+        eval_type='service',
+        datasets=[dataset_name],
+        dataset_args={dataset_name: {'few_shot_num': 0, 'subset_list': subsets}},
+        eval_batch_size=5,
+        generation_config={
+            'max_tokens': 12000,  # avoid exceed max length
+            'temperature': 0.6,
+            'top_p': 0.95,
+            'n': 1,
+        },
+        use_cache='./outputs/20250306_235951',
+        timeout=32000,
+        stream=True
+
+    )
+    run_task(task_config)
+
+def eval_qwq():
+    model_name = 'qwq-32b-preview'
+    dataset_name = 'math_500'
+    subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5']
+
+    task_config = TaskConfig(
+        api_url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
+        api_key=os.environ['DASHSCOPE_API_KEY'],
+        model=model_name,
+        eval_type='service',
+        datasets=[dataset_name],
+        dataset_args={dataset_name: {'few_shot_num': 0, 'subset_list': subsets}},
+        eval_batch_size=32,
+        generation_config={
+            'max_tokens': 8000,  # avoid exceed max length
+            'temperature': 0.6,
+            'top_p': 0.95,
+            'n': 1,
+        },
+        use_cache='./outputs/20250221_105911'
+    )
+    run_task(task_config)
+
+if __name__ == '__main__':
+    # eval_distill_qwen()
+    # eval_math_qwen()
+    eval_r1()
+    # eval_qwq()
+    # eval_distill_32b()
evalscope/third_party/thinkbench/resources/critique_template.txt
ADDED
@@ -0,0 +1,17 @@
+The following is a math problem and a solution (split into paragraphs, enclosed with tags and indexed from 0):
+
+[Math Problem]
+
+{problem}
+
+[Correct Answer]
+
+{answer}
+
+[Solution]
+
+{tagged_response}
+
+Your task is to review and critique the solution paragraph by paragraph. Once you identify an correct answer in a paragraph, return the index of the paragraph where the earliest correct answer occurs. Otherwise, return the index of -1 (which typically denotes "not found").
+
+Please put your final answer (i.e., the index) in \boxed{{}}.
evalscope/third_party/thinkbench/resources/reformat_template.txt
ADDED
@@ -0,0 +1,31 @@
+I will present you with a solution to a math problem. Unfortunately, the solution lacks proper paragraphing, making it hard to read. Your task is to improve readability by reformatting the solution into well-structured paragraphs. Follow these specific guidelines:
+
+* Insert \n\n for paragraph breaks within the original solution. Do **NOT** alter any content of the original solution (the only exception is for itemized lists; see below).
+
+- Each paragraph should represent a distinct, concise reasoning step that logically advances the solution.
+
+- Reasoning steps can include case discussions, formula simplifications, or formula derivations. Each of these should be treated as an individual reasoning step and paragraphed accordingly.
+
+- If an introductory analysis exists in the original solution, treat it as an initial reasoning step and place it as the first paragraph.
+
+- Do **NOT** place any mathematical formulas in their own separate paragraphs; instead, include them within the same paragraph as the preceding text to form a cohesive reasoning step.
+
+* For any itemized lists (ordered or unordered), convert them into a written format, such as "First/Second/Third." This is the **ONLY** content modification allowed.
+
+* Avoid making paragraphs too lengthy, as long paragraphs might contain multiple reasoning steps that should be paragraphed separately.
+
+* Disregard the accuracy of the solution content. Do **NOT** alter any of the original solution's content; focus solely on structuring it into logical, readable paragraphs.
+
+* Reply with the reformatted solution directly.
+
+--------------------------------------------------
+
+Here is the math problem, and the solution that needs to be reformatted:
+
+[Math Problem]
+
+{problem}
+
+[Solution]
+
+{response}
evalscope/third_party/thinkbench/tools/__init__.py
ADDED
File without changes
evalscope/third_party/thinkbench/tools/llm.py
ADDED
@@ -0,0 +1,48 @@
+import os
+from openai import OpenAI
+
+
+def request_url(llm_config, content):
+    try:
+        client = OpenAI(
+            api_key=llm_config['api_key'],
+            base_url=llm_config['base_url'],
+        )
+        completion = client.chat.completions.create(
+            model=llm_config['model_name'],
+            messages=[{'role': 'user', 'content': content}]
+        )
+        return completion.choices[0].message.content
+    except Exception as e:
+        print(e)
+        return None
+
+def request_qwen(content):
+    try:
+        client = OpenAI(
+            api_key=os.getenv('DASHSCOPE_API_KEY'),
+            base_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
+        )
+
+        completion = client.chat.completions.create(
+            model='qwen-max',
+            messages=[{'role': 'user', 'content': content}]
+        )
+        return completion.choices[0].message.content
+    except Exception as e:
+        print(e)
+
+
+def request_local(content):
+    try:
+        client = OpenAI(
+            api_key='EMPTY',
+            base_url='http://0.0.0.0:8801/v1',
+        )
+        completion = client.chat.completions.create(
+            model='Qwen2.5-72B-Instruct',
+            messages=[{'role': 'user', 'content': content}]
+        )
+        return completion.choices[0].message.content
+    except Exception as e:
+        print(e)
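The three helpers above all funnel through the same OpenAI-compatible client and swallow errors by printing them. A minimal usage sketch, assuming the same local endpoint and model name used in `request_local` (values here are illustrative, not part of the diff):

    # Hypothetical call into request_url; mirrors the defaults used above.
    llm_config = {
        'api_key': 'EMPTY',
        'base_url': 'http://0.0.0.0:8801/v1',
        'model_name': 'Qwen2.5-72B-Instruct',
    }
    reply = request_url(llm_config, 'Summarize the solution in one sentence.')
    print(reply)  # None on failure, since exceptions are printed and not re-raised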
evalscope/third_party/thinkbench/tools/utils.py
ADDED
@@ -0,0 +1,13 @@
+import re
+
+
+def extract_answer(solution_text: str):
+    boxed_pattern = r'\\boxed\{([^}]*)\}'
+    matches = re.findall(boxed_pattern, solution_text)
+    if matches:
+        last_boxed_content = matches[-1]
+        number_pattern = r'-?\d+'
+        number_matches = re.findall(number_pattern, last_boxed_content)
+        if number_matches:
+            return number_matches[-1].strip()
+    return None
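`extract_answer` pulls the last `\boxed{...}` span from a model response and returns the final integer inside it as a string, which matches the `\boxed{{}}` instruction in the critique template above. A quick sanity check (the sample strings are illustrative):

    sample = r'The earliest correct answer appears in paragraph \boxed{3}.'
    assert extract_answer(sample) == '3'
    assert extract_answer(r'the index is \boxed{-1}') == '-1'   # negative indices are kept
    assert extract_answer('no boxed answer here') is None       # no \boxed{} span at all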
evalscope/third_party/toolbench_static/llm/swift_infer.py
CHANGED
@@ -1,37 +1,63 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
 from dataclasses import dataclass
-from swift.llm import get_default_template_type, get_model_tokenizer, get_template, inference
-from swift.utils import seed_everything
-
-# TODO: Support custom model for swift infer
 
 
 @dataclass
 class SwiftInferArgs:
     model_id_or_path: str
     model_type: str
+    infer_backend: str = 'vllm'  # one of 'pt', 'vllm', 'lmdeploy'
    max_new_tokens: int = 2048
-
+    temperature: float = 0.1
+    max_batch_size: int = 16
 
 class SwiftInfer:
 
     def __init__(self, args: SwiftInferArgs):
-
-
-
-
-
-
+        # Initialize the model for the selected inference backend
+        if args.infer_backend == 'pt':
+            self.engine: InferEngine = PtEngine(args.model_id_or_path, max_batch_size=args.max_batch_size)
+        elif args.infer_backend == 'vllm':
+            from swift.llm import VllmEngine
+            self.engine: InferEngine = VllmEngine(args.model_id_or_path, max_model_len=8192)
+        elif args.infer_backend == 'lmdeploy':
+            from swift.llm import LmdeployEngine
+            self.engine: InferEngine = LmdeployEngine(args.model_id_or_path)
+        else:
+            raise ValueError(f'Unsupported infer_backend: {args.infer_backend}')
+
+        # Basic request configuration (optional)
+        self.request_config = RequestConfig(
+            max_tokens=args.max_new_tokens,
+            temperature=args.temperature,
+            stream=False  # can be switched to True for streaming inference
+        )
 
-
-
+    def predict(self, system: str, query: str, history: list):
+        # In the Swift 3.0 standard interface, messages are passed in the format:
+        # messages: [{"role": "system", "content": "<SYSTEM_PROMPT>"},
+        #            {"role": "user", "content": "user question content"},
+        #            {"role": "assistant", "content": "assistant answer content"}, ...]
 
-
-
-
+        messages = []
+        if system.strip():
+            messages.append({'role': 'system', 'content': system})
 
-
+        # Splice the conversation history into the messages
+        for qa_pair in history:
+            # Assumes each element of history looks like ("user input", "model response"); adjust to your data format.
+            user_answer, model_response = qa_pair
+            messages.append({'role': 'user', 'content': user_answer})
+            messages.append({'role': 'assistant', 'content': model_response})
+
+        # Add the current user question
+        messages.append({'role': 'user', 'content': query})
+
+        infer_request = InferRequest(messages=messages)
+
+        # Run inference
+        response = self.engine.infer([infer_request], self.request_config)
 
-
+        # Extract the text returned by the model (assumes non-stream mode)
+        result_text = response[0].choices[0].message.content.strip()
 
-        return
+        return result_text
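For orientation, a hypothetical call into the rewritten class might look like the sketch below; it assumes `InferEngine`, `PtEngine`, `RequestConfig`, and `InferRequest` are imported from `swift.llm` elsewhere in the file (not shown in this hunk), and the checkpoint name is illustrative:

    # Hypothetical usage of the rewritten SwiftInfer (not part of the diff).
    args = SwiftInferArgs(
        model_id_or_path='Qwen/Qwen2.5-7B-Instruct',  # illustrative checkpoint
        model_type='qwen2_5',
        infer_backend='pt',
    )
    infer = SwiftInfer(args)
    answer = infer.predict(
        system='You are a helpful assistant.',
        query='What is 2 + 2?',
        history=[('Hi', 'Hello! How can I help you?')],  # (user, assistant) pairs
    )
    print(answer)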
evalscope/third_party/toolbench_static/toolbench_static.py
CHANGED
@@ -6,11 +6,12 @@ from typing import Union
 from evalscope.third_party.toolbench_static.eval import EvalArgs, run_eval
 from evalscope.third_party.toolbench_static.infer import InferArgs, run_infer
 from evalscope.utils import get_logger
+from evalscope.utils.deprecation_utils import deprecated
 from evalscope.utils.io_utils import json_to_dict, yaml_to_dict
 
 logger = get_logger()
 
-
+@deprecated(since='0.15.1', remove_in='0.18.0', alternative='Native implementation of ToolBench')
 def run_task(task_cfg: Union[str, dict]):
 
     if isinstance(task_cfg, str):
evalscope/utils/__init__.py
CHANGED
@@ -1,4 +1,84 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-from
-
+from typing import TYPE_CHECKING
+
+from .import_utils import _LazyModule
+
+if TYPE_CHECKING:
+    from .argument_utils import BaseArgument, get_supported_params, parse_int_or_float
+    from .deprecation_utils import deprecated
+    from .function_utils import run_once, thread_safe
+    from .import_utils import get_module_path, is_module_installed
+    from .io_utils import (
+        OutputsStructure,
+        csv_to_jsonl,
+        csv_to_list,
+        dict_to_yaml,
+        gen_hash,
+        get_latest_folder_path,
+        get_valid_list,
+        json_to_dict,
+        jsonl_to_csv,
+        jsonl_to_list,
+        safe_filename,
+        yaml_to_dict,
+    )
+    from .logger import configure_logging, get_logger
+    from .model_utils import EvalBackend, dict_torch_dtype_to_str, fix_do_sample_warning, get_device, seed_everything
+
+else:
+    _import_structure = {
+        'argument_utils': [
+            'BaseArgument',
+            'parse_int_or_float',
+            'get_supported_params',
+        ],
+        'model_utils': [
+            'EvalBackend',
+            'get_device',
+            'seed_everything',
+            'dict_torch_dtype_to_str',
+            'fix_do_sample_warning',
+        ],
+        'import_utils': [
+            'is_module_installed',
+            'get_module_path',
+        ],
+        'function_utils': [
+            'thread_safe',
+            'run_once',
+        ],
+        'io_utils': [
+            'OutputsStructure',
+            'csv_to_list',
+            'json_to_dict',
+            'yaml_to_dict',
+            'get_latest_folder_path',
+            'gen_hash',
+            'dict_to_yaml',
+            'csv_to_jsonl',
+            'jsonl_to_csv',
+            'jsonl_to_list',
+            'gen_hash',
+            'get_valid_list',
+            'safe_filename',
+            'thread_safe',
+        ],
+        'deprecation_utils': [
+            'deprecated',
+        ],
+        'logger': [
+            'get_logger',
+            'configure_logging',
+        ],
+    }
+
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
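The net effect of this rewrite is that `evalscope.utils` no longer imports its submodules eagerly: `_import_structure` maps each submodule to its exported names, and `_LazyModule` resolves an attribute to its submodule only on first access. A simplified sketch of the pattern, assuming a minimal stand-in rather than the actual `_LazyModule` implementation in `import_utils.py`:

    import importlib
    import types

    class LazyModuleSketch(types.ModuleType):
        """Simplified stand-in: resolve exported names to their submodules on first access."""

        def __init__(self, name, import_structure):
            super().__init__(name)
            # Invert {submodule: [symbols]} into {symbol: submodule}.
            self._symbol_to_module = {
                sym: mod for mod, syms in import_structure.items() for sym in syms
            }

        def __getattr__(self, attr):
            module_name = self._symbol_to_module.get(attr)
            if module_name is None:
                raise AttributeError(f'module {self.__name__!r} has no attribute {attr!r}')
            module = importlib.import_module(f'.{module_name}', self.__name__)
            value = getattr(module, attr)
            setattr(self, attr, value)  # cache so __getattr__ runs once per name
            return value

This keeps `import evalscope.utils` cheap while still letting `from evalscope.utils import get_logger` work as before.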
evalscope/utils/argument_utils.py
ADDED
@@ -0,0 +1,64 @@
+import json
+from argparse import Namespace
+from inspect import signature
+
+from evalscope.utils.io_utils import json_to_dict, yaml_to_dict
+
+
+class BaseArgument:
+    """
+    BaseArgument is a base class designed to facilitate the creation and manipulation
+    of argument classes in the evalscope framework. It provides utility methods for
+    instantiating objects from various data formats and converting objects back into
+    dictionary representations.
+    """
+
+    @classmethod
+    def from_dict(cls, d: dict):
+        """Instantiate the class from a dictionary."""
+        return cls(**d)
+
+    @classmethod
+    def from_json(cls, json_file: str):
+        """Instantiate the class from a JSON file."""
+        return cls.from_dict(json_to_dict(json_file))
+
+    @classmethod
+    def from_yaml(cls, yaml_file: str):
+        """Instantiate the class from a YAML file."""
+        return cls.from_dict(yaml_to_dict(yaml_file))
+
+    @classmethod
+    def from_args(cls, args: Namespace):
+        """
+        Instantiate the class from an argparse.Namespace object.
+        Filters out None values and removes 'func' if present.
+        """
+        args_dict = {k: v for k, v in vars(args).items() if v is not None}
+
+        if 'func' in args_dict:
+            del args_dict['func']  # Note: compat CLI arguments
+
+        return cls.from_dict(args_dict)
+
+    def to_dict(self):
+        """Convert the instance to a dictionary."""
+        result = self.__dict__.copy()
+        return result
+
+    def __str__(self):
+        """Return a JSON-formatted string representation of the instance."""
+        return json.dumps(self.to_dict(), indent=4, default=str, ensure_ascii=False)
+
+
+def parse_int_or_float(num):
+    number = float(num)
+    if number.is_integer():
+        return int(number)
+    return number
+
+
+def get_supported_params(func):
+    """Get the supported parameters of a function."""
+    sig = signature(func)
+    return set(sig.parameters.keys())
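A hypothetical subclass shows how these helpers compose; the dataclass and its fields below are illustrative, not taken from the package:

    from dataclasses import dataclass

    @dataclass
    class DemoArgs(BaseArgument):
        model: str
        eval_batch_size: int = 1

    args = DemoArgs.from_dict({'model': 'qwen-max', 'eval_batch_size': 8})
    print(args)  # JSON-formatted via __str__ / to_dict

    assert parse_int_or_float('8.0') == 8      # integral float collapses to int
    assert parse_int_or_float('0.95') == 0.95  # otherwise stays a float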
evalscope/utils/chat_service.py
CHANGED
@@ -1,6 +1,5 @@
 import os
 import time
-import torch
 from contextlib import contextmanager
 from functools import partial
 from pydantic import BaseModel, Field
@@ -32,6 +31,7 @@ class ModelList(BaseModel):
 class ChatMessage(BaseModel):
     role: Literal['user', 'assistant', 'system']
     content: str
+    reasoning_content: Optional[str] = None
 
 
 class DeltaMessage(BaseModel):
@@ -63,10 +63,10 @@ class ChatCompletionResponseStreamChoice(BaseModel):
 
 class ChatCompletionResponse(BaseModel):
     model: str
-    object: Literal['chat.completion', 'chat.completion.chunk']
+    object: Literal['chat.completion', 'chat.completion.chunk', 'images.generations']
     choices: List[Union[ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice, Any]]
     created: Optional[int] = Field(default_factory=lambda: int(time.time()))
-    usage: Optional[Usage]
+    usage: Optional[Usage] = None
 
 
 class TextCompletionRequest(BaseModel):
@@ -94,6 +94,7 @@ class TextCompletionResponse(BaseModel):
 class ChatService:
 
     def __init__(self, model_path, attn_implementation):
+        import torch
         from modelscope import AutoModelForCausalLM, AutoTokenizer
         from transformers import TextIteratorStreamer
 
@@ -174,7 +175,7 @@ class ChatService:
     )
 
     def _prepare_text_inputs(self, request: TextCompletionRequest):
-        inputs = self.tokenizer(request.prompt, return_tensors='pt', padding=
+        inputs = self.tokenizer(request.prompt, return_tensors='pt', padding=False).to(self.device)
         prompt_tokens = len(inputs['input_ids'][0])
         return inputs, prompt_tokens
 
@@ -203,8 +204,9 @@ class ChatService:
 
     def _prepare_chat_inputs(self, request: ChatCompletionRequest):
         formatted_prompt = self.tokenizer.apply_chat_template(
-            request.messages, tokenize=False, add_generation_prompt=True
-
+            request.messages, tokenize=False, add_generation_prompt=True
+        )
+        inputs = self.tokenizer(formatted_prompt, return_tensors='pt', padding=False).to(self.device)
         prompt_tokens = len(inputs['input_ids'][0])
         return formatted_prompt, inputs, prompt_tokens
 
evalscope/utils/deprecation_utils.py
ADDED
@@ -0,0 +1,53 @@
+import functools
+import inspect
+import os
+from typing import Callable, Optional
+
+from .logger import get_logger
+
+logger = get_logger()
+
+
+def deprecated(since: str, remove_in: Optional[str] = None, alternative: Optional[str] = None) -> Callable:
+    """
+    Decorator to mark functions as deprecated.
+
+    :param since: String indicating the version since deprecation
+    :param remove_in: Optional string indicating the version when it will be removed
+    :param alternative: Optional string suggesting an alternative
+    :return: Decorated function
+    """
+
+    def decorator(func: Callable) -> Callable:
+
+        @functools.wraps(func)
+        def wrapper(*args, **kwargs):
+            # Get the file name where the function is defined
+            file_name = os.path.basename(inspect.getfile(func))
+
+            # Construct the warning message
+            warning_parts = [
+                f'{func.__name__} in {file_name} has been deprecated since version {since}',
+                f'and will be removed in version {remove_in}' if remove_in else None,
+                f'Use {alternative} instead' if alternative else None
+            ]
+            warning_message = '. '.join(filter(None, warning_parts))
+
+            # Log the warning
+            logger.warning(warning_message)
+
+            return func(*args, **kwargs)
+
+        return wrapper
+
+    return decorator
+
+
+def deprecated_warning(logger, message: str):
+    """
+    Log a deprecation warning.
+
+    :param logger: Logger instance to log the warning
+    :param message: Warning message to log
+    """
+    logger.warning(f'Deprecated: {message}')
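This is the decorator applied to `run_task` in the toolbench_static diff above. A quick illustration on a hypothetical function; the message is assembled from the non-empty parts joined with '. ' and emitted through the module logger on every call:

    @deprecated(since='0.15.1', remove_in='0.18.0', alternative='new_entry_point')
    def old_entry_point():
        return 'ok'

    old_entry_point()
    # logs roughly: "old_entry_point in <file>.py has been deprecated since version 0.15.1.
    #                and will be removed in version 0.18.0. Use new_entry_point instead"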