evalscope 0.7.2__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope has been flagged as possibly problematic.

Files changed (234)
  1. evalscope/__init__.py +1 -1
  2. evalscope/arguments.py +73 -0
  3. evalscope/backend/base.py +6 -2
  4. evalscope/backend/opencompass/api_meta_template.py +8 -14
  5. evalscope/backend/opencompass/backend_manager.py +24 -15
  6. evalscope/backend/opencompass/tasks/eval_api.py +1 -6
  7. evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
  8. evalscope/backend/rag_eval/__init__.py +3 -3
  9. evalscope/backend/rag_eval/backend_manager.py +21 -25
  10. evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
  11. evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
  12. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
  13. evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
  14. evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
  15. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
  16. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
  17. evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
  18. evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
  19. evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
  20. evalscope/backend/rag_eval/cmteb/base.py +22 -23
  21. evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
  22. evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
  23. evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
  24. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
  25. evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
  26. evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
  27. evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
  28. evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
  29. evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
  30. evalscope/backend/rag_eval/ragas/__init__.py +2 -2
  31. evalscope/backend/rag_eval/ragas/arguments.py +3 -8
  32. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
  33. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
  34. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
  35. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
  36. evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
  37. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
  38. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
  39. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
  40. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
  41. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  42. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  43. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  44. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  45. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
  46. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
  47. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
  48. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
  49. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
  50. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
  51. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
  52. evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
  53. evalscope/backend/rag_eval/ragas/task_template.py +10 -15
  54. evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
  55. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
  56. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
  57. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
  58. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
  59. evalscope/backend/rag_eval/utils/clip.py +47 -51
  60. evalscope/backend/rag_eval/utils/embedding.py +13 -12
  61. evalscope/backend/rag_eval/utils/llm.py +8 -6
  62. evalscope/backend/rag_eval/utils/tools.py +12 -11
  63. evalscope/backend/vlm_eval_kit/__init__.py +1 -1
  64. evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
  65. evalscope/benchmarks/arc/__init__.py +3 -2
  66. evalscope/benchmarks/arc/ai2_arc.py +19 -16
  67. evalscope/benchmarks/arc/arc_adapter.py +32 -24
  68. evalscope/benchmarks/bbh/__init__.py +1 -2
  69. evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
  70. evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
  71. evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
  72. evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
  73. evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
  74. evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
  75. evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
  76. evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
  77. evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
  78. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
  79. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
  80. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
  81. evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
  82. evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
  83. evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
  84. evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
  85. evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
  86. evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
  87. evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
  88. evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
  89. evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
  90. evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
  91. evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
  92. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
  93. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
  94. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
  95. evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
  96. evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
  97. evalscope/benchmarks/benchmark.py +16 -16
  98. evalscope/benchmarks/ceval/__init__.py +3 -2
  99. evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
  100. evalscope/benchmarks/ceval/ceval_exam.py +18 -31
  101. evalscope/benchmarks/cmmlu/__init__.py +3 -2
  102. evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
  103. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
  104. evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
  105. evalscope/benchmarks/competition_math/__init__.py +3 -2
  106. evalscope/benchmarks/competition_math/competition_math.py +7 -16
  107. evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
  108. evalscope/benchmarks/data_adapter.py +24 -24
  109. evalscope/benchmarks/general_qa/__init__.py +3 -2
  110. evalscope/benchmarks/general_qa/general_qa_adapter.py +35 -39
  111. evalscope/benchmarks/gsm8k/__init__.py +1 -1
  112. evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
  113. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +27 -24
  114. evalscope/benchmarks/hellaswag/__init__.py +3 -2
  115. evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
  116. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +28 -23
  117. evalscope/benchmarks/humaneval/__init__.py +1 -1
  118. evalscope/benchmarks/humaneval/humaneval.py +15 -18
  119. evalscope/benchmarks/humaneval/humaneval_adapter.py +192 -7
  120. evalscope/benchmarks/mmlu/__init__.py +3 -2
  121. evalscope/benchmarks/mmlu/mmlu.py +15 -29
  122. evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
  123. evalscope/benchmarks/race/__init__.py +3 -2
  124. evalscope/benchmarks/race/race.py +21 -35
  125. evalscope/benchmarks/race/race_adapter.py +33 -29
  126. evalscope/benchmarks/race/samples.jsonl +1 -1
  127. evalscope/benchmarks/trivia_qa/__init__.py +3 -2
  128. evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
  129. evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
  131. evalscope/benchmarks/truthful_qa/__init__.py +3 -2
  132. evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
  133. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
  134. evalscope/cli/cli.py +6 -5
  135. evalscope/cli/start_eval.py +31 -0
  136. evalscope/cli/start_perf.py +0 -3
  137. evalscope/cli/start_server.py +27 -41
  138. evalscope/config.py +154 -96
  139. evalscope/constants.py +50 -32
  140. evalscope/evaluator/evaluator.py +97 -377
  141. evalscope/evaluator/rating_eval.py +12 -33
  142. evalscope/evaluator/reviewer/auto_reviewer.py +48 -76
  143. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
  144. evalscope/metrics/code_metric.py +3 -9
  145. evalscope/metrics/math_accuracy.py +3 -6
  146. evalscope/metrics/metrics.py +21 -21
  147. evalscope/metrics/rouge_metric.py +11 -25
  148. evalscope/models/__init__.py +1 -2
  149. evalscope/models/api/openai_api.py +40 -29
  150. evalscope/models/custom/__init__.py +0 -1
  151. evalscope/models/custom/custom_model.py +3 -3
  152. evalscope/models/dummy_chat_model.py +7 -8
  153. evalscope/models/model_adapter.py +89 -156
  154. evalscope/models/openai_model.py +20 -20
  155. evalscope/perf/arguments.py +16 -3
  156. evalscope/perf/benchmark.py +9 -11
  157. evalscope/perf/http_client.py +3 -8
  158. evalscope/perf/main.py +8 -1
  159. evalscope/perf/plugin/api/custom_api.py +1 -2
  160. evalscope/perf/plugin/api/dashscope_api.py +1 -2
  161. evalscope/perf/plugin/api/openai_api.py +3 -4
  162. evalscope/perf/plugin/datasets/base.py +1 -2
  163. evalscope/perf/plugin/datasets/flickr8k.py +1 -2
  164. evalscope/perf/plugin/datasets/longalpaca.py +1 -2
  165. evalscope/perf/plugin/datasets/openqa.py +1 -2
  166. evalscope/perf/plugin/registry.py +3 -3
  167. evalscope/perf/utils/analysis_result.py +1 -2
  168. evalscope/perf/utils/benchmark_util.py +5 -6
  169. evalscope/perf/utils/db_util.py +77 -30
  170. evalscope/perf/utils/local_server.py +21 -13
  171. evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
  172. evalscope/registry/tasks/arc.yaml +2 -3
  173. evalscope/registry/tasks/bbh.yaml +3 -4
  174. evalscope/registry/tasks/bbh_mini.yaml +3 -4
  175. evalscope/registry/tasks/ceval.yaml +3 -3
  176. evalscope/registry/tasks/ceval_mini.yaml +3 -4
  177. evalscope/registry/tasks/cmmlu.yaml +3 -3
  178. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
  179. evalscope/registry/tasks/general_qa.yaml +1 -1
  180. evalscope/registry/tasks/gsm8k.yaml +2 -2
  181. evalscope/registry/tasks/mmlu.yaml +3 -3
  182. evalscope/registry/tasks/mmlu_mini.yaml +3 -3
  183. evalscope/run.py +153 -381
  184. evalscope/run_arena.py +21 -25
  185. evalscope/summarizer.py +27 -40
  186. evalscope/third_party/longbench_write/README.md +99 -42
  187. evalscope/third_party/longbench_write/default_task.json +1 -1
  188. evalscope/third_party/longbench_write/default_task.yaml +8 -7
  189. evalscope/third_party/longbench_write/eval.py +29 -27
  190. evalscope/third_party/longbench_write/infer.py +16 -104
  191. evalscope/third_party/longbench_write/longbench_write.py +5 -4
  192. evalscope/third_party/longbench_write/resources/judge.txt +1 -1
  193. evalscope/third_party/longbench_write/tools/data_etl.py +5 -6
  194. evalscope/third_party/longbench_write/utils.py +0 -1
  195. evalscope/third_party/toolbench_static/eval.py +14 -15
  196. evalscope/third_party/toolbench_static/infer.py +48 -69
  197. evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
  198. evalscope/third_party/toolbench_static/requirements.txt +1 -1
  199. evalscope/third_party/toolbench_static/toolbench_static.py +4 -3
  200. evalscope/tools/combine_reports.py +27 -34
  201. evalscope/tools/rewrite_eval_results.py +15 -47
  202. evalscope/utils/__init__.py +1 -1
  203. evalscope/utils/arena_utils.py +18 -48
  204. evalscope/{perf/utils → utils}/chat_service.py +4 -5
  205. evalscope/utils/completion_parsers.py +3 -8
  206. evalscope/utils/io_utils.py +162 -0
  207. evalscope/utils/logger.py +17 -7
  208. evalscope/utils/model_utils.py +11 -0
  209. evalscope/utils/utils.py +5 -306
  210. evalscope/version.py +2 -2
  211. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/METADATA +123 -118
  212. evalscope-0.8.1.dist-info/RECORD +285 -0
  213. tests/cli/test_run.py +53 -15
  214. tests/perf/test_perf.py +6 -1
  215. tests/rag/test_clip_benchmark.py +38 -38
  216. tests/rag/test_mteb.py +3 -2
  217. tests/rag/test_ragas.py +5 -5
  218. tests/swift/test_run_swift_eval.py +2 -3
  219. tests/swift/test_run_swift_vlm_eval.py +2 -3
  220. tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
  221. tests/vlm/test_vlmeval.py +3 -2
  222. evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
  223. evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
  224. evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
  225. evalscope/cache.py +0 -98
  226. evalscope/models/template.py +0 -1446
  227. evalscope/run_ms.py +0 -140
  228. evalscope/utils/task_cfg_parser.py +0 -10
  229. evalscope/utils/task_utils.py +0 -22
  230. evalscope-0.7.2.dist-info/RECORD +0 -286
  231. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/LICENSE +0 -0
  232. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/WHEEL +0 -0
  233. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/entry_points.txt +0 -0
  234. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/cmmlu/cmmlu_adapter.py
@@ -1,11 +1,13 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-import os
 import csv
+import os
+
 from evalscope.benchmarks.data_adapter import DataAdapter
 from evalscope.metrics.metrics import exact_match, weighted_mean
 from evalscope.utils import ResponseParser, normalize_score
 from evalscope.utils.logger import get_logger
+
 # flake8: noqa
 
 logger = get_logger()
@@ -13,143 +15,89 @@ logger = get_logger()
 DATASET_ID = 'modelscope/cmmlu'
 
 SUBSET_LIST = [
-    "agronomy",
-    "anatomy",
-    "ancient_chinese",
-    "arts",
-    "astronomy",
-    "business_ethics",
-    "chinese_civil_service_exam",
-    "chinese_driving_rule",
-    "chinese_food_culture",
-    "chinese_foreign_policy",
-    "chinese_history",
-    "chinese_literature",
-    "chinese_teacher_qualification",
-    "college_actuarial_science",
-    "college_education",
-    "college_engineering_hydrology",
-    "college_law",
-    "college_mathematics",
-    "college_medical_statistics",
-    "clinical_knowledge",
-    "college_medicine",
-    "computer_science",
-    "computer_security",
-    "conceptual_physics",
-    "construction_project_management",
-    "economics",
-    "education",
-    "elementary_chinese",
-    "elementary_commonsense",
-    "elementary_information_and_technology",
-    "electrical_engineering",
-    "elementary_mathematics",
-    "ethnology",
-    "food_science",
-    "genetics",
-    "global_facts",
-    "high_school_biology",
-    "high_school_chemistry",
-    "high_school_geography",
-    "high_school_mathematics",
-    "high_school_physics",
-    "high_school_politics",
-    "human_sexuality",
-    "international_law",
-    "journalism",
-    "jurisprudence",
-    "legal_and_moral_basis",
-    "logical",
-    "machine_learning",
-    "management",
-    "marketing",
-    "marxist_theory",
-    "modern_chinese",
-    "nutrition",
-    "philosophy",
-    "professional_accounting",
-    "professional_law",
-    "professional_medicine",
-    "professional_psychology",
-    "public_relations",
-    "security_study",
-    "sociology",
-    "sports_science",
-    "traditional_chinese_medicine",
-    "virology",
-    "world_history",
-    "world_religions"
+    'agronomy', 'anatomy', 'ancient_chinese', 'arts', 'astronomy', 'business_ethics', 'chinese_civil_service_exam',
+    'chinese_driving_rule', 'chinese_food_culture', 'chinese_foreign_policy', 'chinese_history', 'chinese_literature',
+    'chinese_teacher_qualification', 'college_actuarial_science', 'college_education', 'college_engineering_hydrology',
+    'college_law', 'college_mathematics', 'college_medical_statistics', 'clinical_knowledge', 'college_medicine',
+    'computer_science', 'computer_security', 'conceptual_physics', 'construction_project_management', 'economics',
+    'education', 'elementary_chinese', 'elementary_commonsense', 'elementary_information_and_technology',
+    'electrical_engineering', 'elementary_mathematics', 'ethnology', 'food_science', 'genetics', 'global_facts',
+    'high_school_biology', 'high_school_chemistry', 'high_school_geography', 'high_school_mathematics',
+    'high_school_physics', 'high_school_politics', 'human_sexuality', 'international_law', 'journalism',
+    'jurisprudence', 'legal_and_moral_basis', 'logical', 'machine_learning', 'management', 'marketing',
+    'marxist_theory', 'modern_chinese', 'nutrition', 'philosophy', 'professional_accounting', 'professional_law',
+    'professional_medicine', 'professional_psychology', 'public_relations', 'security_study', 'sociology',
+    'sports_science', 'traditional_chinese_medicine', 'virology', 'world_history', 'world_religions'
 ]
 
-
-SUBJECT_MAPPING = {"agronomy": ["other", "Other"],
-                   "anatomy": ["biology", "STEM"],
-                   "ancient_chinese": ["china specific", "China specific"],
-                   "arts": ["arts", "Humanities"],
-                   "astronomy": ["physics", "STEM"],
-                   "business_ethics": ["business", "Social Science"],
-                   "chinese_civil_service_exam": ["china specific", "China specific"],
-                   "chinese_driving_rule": ["china specific", "China specific"],
-                   "chinese_food_culture": ["china specific", "China specific"],
-                   "chinese_foreign_policy": ["china specific", "China specific"],
-                   "chinese_history": ["china specific", "China specific"],
-                   "chinese_literature": ["china specific", "China specific"],
-                   "chinese_teacher_qualification": ["china specific", "China specific"],
-                   "college_actuarial_science": ["math", "STEM"],
-                   "college_education": ["education", "Social Science"],
-                   "college_engineering_hydrology": ["engineering", "STEM"],
-                   "college_law": ["law", "Humanities"],
-                   "college_mathematics": ["math", "STEM"],
-                   "college_medical_statistics": ["statistics", "STEM"],
-                   "clinical_knowledge": ["other", "Other"],
-                   "college_medicine": ["other", "Other"],
-                   "computer_science": ["computer science", "STEM"],
-                   "computer_security": ["other", "Other"],
-                   "conceptual_physics": ["physics", "STEM"],
-                   "construction_project_management": ["china specific", "China specific"],
-                   "economics": ["economics", "Social Science"],
-                   "education": ["education", "Social Science"],
-                   "elementary_chinese": ["china specific", "China specific"],
-                   "elementary_commonsense": ["china specific", "China specific"],
-                   "elementary_information_and_technology": ["other", "Other"],
-                   "electrical_engineering": ["engineering", "STEM"],
-                   "elementary_mathematics": ["math", "STEM"],
-                   "ethnology": ["china specific", "China specific"],
-                   "food_science": ["other", "Other"],
-                   "genetics": ["biology", "STEM"],
-                   "global_facts": ["global", "Humanities"],
-                   "high_school_biology": ["biology", "STEM"],
-                   "high_school_chemistry": ["chemistry", "STEM"],
-                   "high_school_geography": ["geography", "Social Science"],
-                   "high_school_mathematics": ["math", "STEM"],
-                   "high_school_physics": ["physics", "STEM"],
-                   "high_school_politics": ["china specific", "China specific"],
-                   "human_sexuality": ["other", "Other"],
-                   "international_law": ["law", "Humanities"],
-                   "journalism": ["sociology", "Social Science"],
-                   "jurisprudence": ["law", "Humanities"],
-                   "legal_and_moral_basis": ["other", "Other"],
-                   "logical": ["philosophy", "Humanities"],
-                   "machine_learning": ["computer science", "STEM"],
-                   "management": ["business", "Social Science"],
-                   "marketing": ["business", "Social Science"],
-                   "marxist_theory": ["philosophy", "Humanities"],
-                   "modern_chinese": ["china specific", "China specific"],
-                   "nutrition": ["other", "Other"],
-                   "philosophy": ["philosophy", "Humanities"],
-                   "professional_accounting": ["business", "Social Science"],
-                   "professional_law": ["law", "Humanities"],
-                   "professional_medicine": ["other", "Other"],
-                   "professional_psychology": ["psychology", "Social Science"],
-                   "public_relations": ["politics", "Social Science"],
-                   "security_study": ["politics", "Social Science"],
-                   "sociology": ["culture", "Social Science"],
-                   "sports_science": ["other", "Other"],
-                   "traditional_chinese_medicine": ["china specific", "China specific"],
-                   "virology": ["biology", "STEM"],
-                   "world_history": ["history", "Humanities"],
-                   "world_religions": ["global", "Humanities"]
+SUBJECT_MAPPING = {
+    'agronomy': ['other', 'Other'],
+    'anatomy': ['biology', 'STEM'],
+    'ancient_chinese': ['china specific', 'China specific'],
+    'arts': ['arts', 'Humanities'],
+    'astronomy': ['physics', 'STEM'],
+    'business_ethics': ['business', 'Social Science'],
+    'chinese_civil_service_exam': ['china specific', 'China specific'],
+    'chinese_driving_rule': ['china specific', 'China specific'],
+    'chinese_food_culture': ['china specific', 'China specific'],
+    'chinese_foreign_policy': ['china specific', 'China specific'],
+    'chinese_history': ['china specific', 'China specific'],
+    'chinese_literature': ['china specific', 'China specific'],
+    'chinese_teacher_qualification': ['china specific', 'China specific'],
+    'college_actuarial_science': ['math', 'STEM'],
+    'college_education': ['education', 'Social Science'],
+    'college_engineering_hydrology': ['engineering', 'STEM'],
+    'college_law': ['law', 'Humanities'],
+    'college_mathematics': ['math', 'STEM'],
+    'college_medical_statistics': ['statistics', 'STEM'],
+    'clinical_knowledge': ['other', 'Other'],
+    'college_medicine': ['other', 'Other'],
+    'computer_science': ['computer science', 'STEM'],
+    'computer_security': ['other', 'Other'],
+    'conceptual_physics': ['physics', 'STEM'],
+    'construction_project_management': ['china specific', 'China specific'],
+    'economics': ['economics', 'Social Science'],
+    'education': ['education', 'Social Science'],
+    'elementary_chinese': ['china specific', 'China specific'],
+    'elementary_commonsense': ['china specific', 'China specific'],
+    'elementary_information_and_technology': ['other', 'Other'],
+    'electrical_engineering': ['engineering', 'STEM'],
+    'elementary_mathematics': ['math', 'STEM'],
+    'ethnology': ['china specific', 'China specific'],
+    'food_science': ['other', 'Other'],
+    'genetics': ['biology', 'STEM'],
+    'global_facts': ['global', 'Humanities'],
+    'high_school_biology': ['biology', 'STEM'],
+    'high_school_chemistry': ['chemistry', 'STEM'],
+    'high_school_geography': ['geography', 'Social Science'],
+    'high_school_mathematics': ['math', 'STEM'],
+    'high_school_physics': ['physics', 'STEM'],
+    'high_school_politics': ['china specific', 'China specific'],
+    'human_sexuality': ['other', 'Other'],
+    'international_law': ['law', 'Humanities'],
+    'journalism': ['sociology', 'Social Science'],
+    'jurisprudence': ['law', 'Humanities'],
+    'legal_and_moral_basis': ['other', 'Other'],
+    'logical': ['philosophy', 'Humanities'],
+    'machine_learning': ['computer science', 'STEM'],
+    'management': ['business', 'Social Science'],
+    'marketing': ['business', 'Social Science'],
+    'marxist_theory': ['philosophy', 'Humanities'],
+    'modern_chinese': ['china specific', 'China specific'],
+    'nutrition': ['other', 'Other'],
+    'philosophy': ['philosophy', 'Humanities'],
+    'professional_accounting': ['business', 'Social Science'],
+    'professional_law': ['law', 'Humanities'],
+    'professional_medicine': ['other', 'Other'],
+    'professional_psychology': ['psychology', 'Social Science'],
+    'public_relations': ['politics', 'Social Science'],
+    'security_study': ['politics', 'Social Science'],
+    'sociology': ['culture', 'Social Science'],
+    'sports_science': ['other', 'Other'],
+    'traditional_chinese_medicine': ['china specific', 'China specific'],
+    'virology': ['biology', 'STEM'],
+    'world_history': ['history', 'Humanities'],
+    'world_religions': ['global', 'Humanities']
 }
 
 
@@ -171,12 +119,13 @@ class CMMLUAdapter(DataAdapter):
         if metric_list is None:
             metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
 
-        super().__init__(subset_list=subset_list,
-                         metric_list=metric_list,
-                         few_shot_num=few_shot_num,
-                         train_split=train_split,
-                         eval_split=eval_split,
-                         **kwargs)
+        super().__init__(
+            subset_list=subset_list,
+            metric_list=metric_list,
+            few_shot_num=few_shot_num,
+            train_split=train_split,
+            eval_split=eval_split,
+            **kwargs)
 
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}
@@ -223,9 +172,7 @@ class CMMLUAdapter(DataAdapter):
             {'data': [(context, continuation), ...]}
 
         """
-        prompt = '以下是关于{}的单项选择题。\n\n'.format(
-            self._format_subject(subset_name)
-        )
+        prompt = '以下是关于{}的单项选择题。\n\n'.format(self._format_subject(subset_name))
         few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
 
         context: str = '\n'.join(few_shot_prompts) + '\n'
@@ -331,17 +278,24 @@ class CMMLUAdapter(DataAdapter):
             domain_weighted_avg_acc = sum([score * num for _, score, num in domain_res_list]) / \
                 sum([num for _, _, num in domain_res_list])
             domain_weighted_avg_acc = normalize_score(score=domain_weighted_avg_acc)
-            category_list.append({'name': domain_name,
-                                  'score': domain_weighted_avg_acc,
-                                  'subset': [{'name': subset_name, 'score': normalize_score(subset_score)}
-                                             for subset_name, subset_score, _ in domain_res_list]})
+            category_list.append({
+                'name':
+                domain_name,
+                'score':
+                domain_weighted_avg_acc,
+                'subset': [{
+                    'name': subset_name,
+                    'score': normalize_score(subset_score)
+                } for subset_name, subset_score, _ in domain_res_list]
+            })
 
         # Get final dict of report
-        res_map = dict(name=report_name or 'cmmlu',
-                       metric=self.metric_list[0]['name'],
-                       score=weighted_avg_acc,
-                       category=category_list,
-                       total_num=total_num)
+        res_map = dict(
+            name=report_name or 'cmmlu',
+            metric=self.metric_list[0]['name'],
+            score=weighted_avg_acc,
+            category=category_list,
+            total_num=total_num)
 
         return res_map
 
@@ -366,4 +320,4 @@ class CMMLUAdapter(DataAdapter):
         s = ''
         for entry in l:
             s += ' ' + entry
-        return s
+        return s
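
Note on the gen_report hunk above: the change is layout-only (yapf style), and the computation it reformats is an example-count-weighted mean over the subsets of each domain. A minimal standalone sketch of that computation follows; the subset names and counts are hypothetical, not taken from the package:

    # Each entry is (subset_name, score, num_examples); the domain score is the
    # example-weighted mean, as in the hunk above. The data here is made up.
    domain_res_list = [
        ('college_mathematics', 0.42, 100),
        ('elementary_mathematics', 0.61, 230),
    ]

    total = sum(num for _, _, num in domain_res_list)
    domain_weighted_avg_acc = sum(score * num for _, score, num in domain_res_list) / total
    print(round(domain_weighted_avg_acc, 4))  # (0.42*100 + 0.61*230) / 330 = 0.5524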
evalscope/benchmarks/cmmlu/samples.jsonl
@@ -2,4 +2,4 @@
 {'input': '下列关于重力的说法正确的是', 'A': '在地球周围的物体都要受到重力作用,与其运动状态无关', 'B': '对某一物体而言,重力的大小是一个恒量,不随物体的地理位置而改变', 'C': '重力就是地球对物体的吸引力,重力的方向总是竖直向下', 'D': '在地球表面各处的重力方向都是相同的', 'target': 'A'}
 {'input': '心脏的静脉血回心的主要途径是', 'A': '心小静脉', 'B': '冠状窦', 'C': '心中静脉', 'D': '心前静脉', 'target': 'B'}
 {'input': "以西蒙为代表的决策理论学派提出的决策准则是", 'A': '最优化', 'B': '公平', 'C': '民主化', 'D': '满意', 'target': 'D'}
-{'input': '20世纪初,英国首相阿斯奎斯说:“我们现在有一个牢固确立了两百年的传统,即归根到底,王位的占有者接受其大臣的建议并据此行事。”这一传统的确立,使一个以小农业和手工业生产为主的国家变成了一个典型的资本主义国家,成为欧洲各国效仿的对象。各国效仿的理由是', 'A': '英国“光荣革命”宣告了欧洲新社会政治制度的诞生', 'B': '殖民主义深刻影响了英国“世界工厂”的地位', 'C': '英国经济上的成就得益于其制度设计', 'D': '英国启蒙思想奠定了资产阶级民主主义政治的理论基础', 'target': 'C'}
+{'input': '20世纪初,英国首相阿斯奎斯说:“我们现在有一个牢固确立了两百年的传统,即归根到底,王位的占有者接受其大臣的建议并据此行事。”这一传统的确立,使一个以小农业和手工业生产为主的国家变成了一个典型的资本主义国家,成为欧洲各国效仿的对象。各国效仿的理由是', 'A': '英国“光荣革命”宣告了欧洲新社会政治制度的诞生', 'B': '殖民主义深刻影响了英国“世界工厂”的地位', 'C': '英国经济上的成就得益于其制度设计', 'D': '英国启蒙思想奠定了资产阶级民主主义政治的理论基础', 'target': 'C'}
evalscope/benchmarks/competition_math/__init__.py
@@ -1,5 +1,6 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-from evalscope.benchmarks.competition_math.competition_math_adapter import CompetitionMathAdapter, DATASET_ID, SUBSET_LIST
+from evalscope.benchmarks.competition_math.competition_math_adapter import DATASET_ID, SUBSET_LIST
+from evalscope.benchmarks.competition_math.competition_math_adapter import CompetitionMathAdapter
 from evalscope.benchmarks.competition_math.competition_math_adapter import CompetitionMathAdapter as DataAdapterClass
-from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass # noqa
+from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass  # noqa
evalscope/benchmarks/competition_math/competition_math.py
@@ -1,13 +1,10 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-
 """Mathematics Aptitude Test of Heuristics (MATH) dataset."""
 
+import datasets
 import json
 import os
 
-import datasets
-
-
 _CITATION = """\
 @article{hendrycksmath2021,
   title={Measuring Mathematical Problem Solving With the MATH Dataset},
@@ -24,7 +21,6 @@ _CITATION = """\
 }
 """
 
-
 _DESCRIPTION = """\
 The Mathematics Aptitude Test of Heuristics (MATH) dataset consists of problems
 from mathematics competitions, including the AMC 10, AMC 12, AIME, and more.
@@ -32,13 +28,10 @@ Each problem in MATH has a full step-by-step solution, which can be used to teach
 models to generate answer derivations and explanations.
 """
 
-
 _HOMEPAGE = 'https://github.com/hendrycks/math'
 
-
 _LICENSE = 'https://github.com/hendrycks/math/blob/main/LICENSE'
 
-
 # Original data URL: "https://people.eecs.berkeley.edu/~hendrycks/MATH.tar"
 _URL = 'https://sail-moe.oss-cn-hangzhou.aliyuncs.com/open_data/math/MATH.zip'
 
@@ -49,14 +42,12 @@ class CompetitionMathDataset(datasets.GeneratorBasedBuilder):
     VERSION = datasets.Version('1.0.0')
 
     def _info(self):
-        features = datasets.Features(
-            {
-                'problem': datasets.Value('string'),
-                'level': datasets.Value('string'),
-                'type': datasets.Value('string'),
-                'solution': datasets.Value('string'),
-            }
-        )
+        features = datasets.Features({
+            'problem': datasets.Value('string'),
+            'level': datasets.Value('string'),
+            'type': datasets.Value('string'),
+            'solution': datasets.Value('string'),
+        })
         return datasets.DatasetInfo(
             description=_DESCRIPTION,
             features=features,
evalscope/benchmarks/competition_math/competition_math_adapter.py
@@ -8,6 +8,7 @@ from evalscope.benchmarks import DataAdapter
 from evalscope.metrics.metrics import weighted_mean
 from evalscope.utils import normalize_score
 from evalscope.utils.logger import get_logger
+
 # flake8: noqa
 
 logger = get_logger()
@@ -43,12 +44,13 @@ class CompetitionMathAdapter(DataAdapter):
                            f'but got {self.few_shot_num}. Use 4-shot by default.')
             few_shot_num = 4
 
-        super().__init__(subset_list=subset_list,
-                         metric_list=metric_list,
-                         few_shot_num=few_shot_num,
-                         train_split=train_split,
-                         eval_split=eval_split,
-                         **kwargs)
+        super().__init__(
+            subset_list=subset_list,
+            metric_list=metric_list,
+            few_shot_num=few_shot_num,
+            train_split=train_split,
+            eval_split=eval_split,
+            **kwargs)
 
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict: dict = {}
@@ -161,17 +163,19 @@ class CompetitionMathAdapter(DataAdapter):
         total_num: int = sum([num for _, num in subset_score_map.values()])
         weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
         weighted_avg_acc = normalize_score(score=weighted_avg_acc)
-        cate_avg_list = [{'name': subset_name, 'score': normalize_score(score=score)} for subset_name, (score, _) in subset_score_map.items()]
+        cate_avg_list = [{
+            'name': subset_name,
+            'score': normalize_score(score=score)
+        } for subset_name, (score, _) in subset_score_map.items()]
 
-        category_d = dict(name='DEFAULT',
-                          score=weighted_avg_acc,
-                          subset=cate_avg_list)
+        category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
 
-        res_map = dict(name=report_name or 'competition_math',
-                       metric=self.metric_list[0]['name'],
-                       score=weighted_avg_acc,
-                       category=[category_d],
-                       total_num=total_num)
+        res_map = dict(
+            name=report_name or 'competition_math',
+            metric=self.metric_list[0]['name'],
+            score=weighted_avg_acc,
+            category=[category_d],
+            total_num=total_num)
 
         return res_map
 
@@ -186,8 +190,7 @@ class CompetitionMathAdapter(DataAdapter):
                 'Problem:\nIf $\det \mathbf{{A}} = 2$ and $\det \mathbf{{B}} = 12,$ then find $\det (\mathbf{{A}} \mathbf{{B}}).$\nSolution:\nWe have that $\det (\mathbf{{A}} \mathbf{{B}}) = (\det \mathbf{{A}})(\det \mathbf{{B}}) = (2)(12) = \\boxed{{24}}.$\nFinal Answer: The final answer is $24$. I hope it is correct.\n'
                 'Problem:\nTerrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?\nSolution:\nIf Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\cdot 12\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\cdot15\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \\begin{{align*}} 30n&=480\\\\ \Rightarrow\qquad n&=480/30=\\boxed{{16}} \end{{align*}}\nFinal Answer: The final answer is $16$. I hope it is correct.\n'
                 'Problem:\nIf the system of equations: \\begin{{align*}} 6x-4y&=a,\\\\ 6y-9x &=b. \end{{align*}}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{{a}}{{b}},$ assuming $b$ is nonzero.\nSolution:\nIf we multiply the first equation by $-\\frac{{3}}{{2}}$, we obtain $$6y-9x=-\\frac{{3}}{{2}}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{{3}}{{2}}a=b\Rightarrow\\frac{{a}}{{b}}=\\boxed{{-\\frac{{2}}{{3}}}}.$$\nFinal Answer: The final answer is $-\\frac{{2}}{{3}}$. I hope it is correct.\n'
-                f'Problem:\n{problem}\nSolution:\n'
-            )
+                f'Problem:\n{problem}\nSolution:\n')
         else:
             context = 'Problem:\n' + problem + '\nSolution:\n'
         return context
@@ -212,15 +215,15 @@ class CompetitionMathAdapter(DataAdapter):
 
         if '\\boxed ' in s:
             left = '\\boxed '
-            assert s[: len(left)] == left
+            assert s[:len(left)] == left
             return s[len(left):]
 
         left = '\\boxed{'
 
-        assert s[: len(left)] == left
+        assert s[:len(left)] == left
         assert s[-1] == '}'
 
-        return s[len(left): -1]
+        return s[len(left):-1]
 
     @classmethod
     def _last_boxed_only_string(cls, string):
@@ -249,7 +252,7 @@ class CompetitionMathAdapter(DataAdapter):
         if right_brace_idx is None:
             retval = None
         else:
-            retval = string[idx: right_brace_idx + 1]
+            retval = string[idx:right_brace_idx + 1]
 
         return retval
 
@@ -409,18 +412,14 @@ class CompetitionMathAdapter(DataAdapter):
 
     @classmethod
     def _math_postprocess(cls, text: str) -> str:
-        SUBSTITUTIONS = [('an ', ''), ('a ', ''), ('.$', '$'), ('\\$', ''),
-                         (r'\ ', ''), (' ', ''), ('mbox', 'text'),
-                         (',\\text{and}', ','), ('\\text{and}', ','),
-                         ('\\text{m}', '\\text{}'), ('\\le', '<')]
+        SUBSTITUTIONS = [('an ', ''), ('a ', ''), ('.$', '$'), ('\\$', ''), (r'\ ', ''), (' ', ''), ('mbox', 'text'),
+                         (',\\text{and}', ','), ('\\text{and}', ','), ('\\text{m}', '\\text{}'), ('\\le', '<')]
         REMOVED_EXPRESSIONS = [
-            'square', 'ways', 'integers', 'dollars', 'mph', 'inches', 'ft',
-            'hours', 'km', 'units', '\\ldots', 'sue', 'points', 'feet', 'minutes',
-            'digits', 'cents', 'degrees', 'cm', 'gm', 'pounds', 'meters', 'meals',
-            'edges', 'students', 'childrentickets', 'multiples', '\\text{s}',
-            '\\text{.}', '\\text{\ns}', '\\text{}^2', '\\text{}^3', '\\text{\n}',
-            '\\text{}', r'\mathrm{th}', r'^\circ', r'^{\circ}', r'\;', r',\!',
-            '{,}', '"', '\\dots', '\n', '\r', '\f'
+            'square', 'ways', 'integers', 'dollars', 'mph', 'inches', 'ft', 'hours', 'km', 'units', '\\ldots', 'sue',
+            'points', 'feet', 'minutes', 'digits', 'cents', 'degrees', 'cm', 'gm', 'pounds', 'meters', 'meals', 'edges',
+            'students', 'childrentickets', 'multiples', '\\text{s}', '\\text{.}', '\\text{\ns}', '\\text{}^2',
+            '\\text{}^3', '\\text{\n}', '\\text{}', r'\mathrm{th}', r'^\circ', r'^{\circ}', r'\;', r',\!', '{,}', '"',
+            '\\dots', '\n', '\r', '\f'
        ]
         import re
 
@@ -453,8 +452,7 @@ class CompetitionMathAdapter(DataAdapter):
         if 'rac' in final_answer and '\\frac' not in final_answer:
             final_answer = final_answer.replace('rac', '\\frac')
 
-        final_answer = re.sub(r'(frac)([^{])(.)', 'frac{\\2}{\\3}',
-                              final_answer)
+        final_answer = re.sub(r'(frac)([^{])(.)', 'frac{\\2}{\\3}', final_answer)
         final_answer = re.sub(r'(sqrt)([^{])', 'sqrt{\\2}', final_answer)
         final_answer = final_answer.replace('$', '')
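
The slicing cleanups above (s[: len(left)] -> s[:len(left)] and the like) sit inside the adapter's \boxed{...} answer extraction for MATH solutions. As an illustration of what the _last_boxed_only_string/_remove_boxed pair does, here is a simplified, self-contained sketch; it is an approximation for readability, not the package's exact code:

    from typing import Optional

    def last_boxed_only_string(s: str) -> Optional[str]:
        # Find the last '\boxed{' and walk forward, tracking brace depth,
        # until the matching closing brace.
        idx = s.rfind('\\boxed{')
        if idx < 0:
            return None
        depth = 0
        for i in range(idx, len(s)):
            if s[i] == '{':
                depth += 1
            elif s[i] == '}':
                depth -= 1
                if depth == 0:
                    return s[idx:i + 1]  # e.g. '\boxed{24}'
        return None  # unbalanced braces

    def remove_boxed(s: str) -> str:
        # Strip the '\boxed{' prefix and the trailing '}' -- these are the
        # slices the diff tidies.
        left = '\\boxed{'
        assert s[:len(left)] == left and s[-1] == '}'
        return s[len(left):-1]

    solution = 'We have $\\det(AB) = (2)(12) = \\boxed{24}.$'
    print(remove_boxed(last_boxed_only_string(solution)))  # -> 24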