evalscope 0.7.2__py3-none-any.whl → 0.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic. Click here for more details.
- evalscope/__init__.py +1 -1
- evalscope/arguments.py +73 -0
- evalscope/backend/base.py +6 -2
- evalscope/backend/opencompass/api_meta_template.py +8 -14
- evalscope/backend/opencompass/backend_manager.py +24 -15
- evalscope/backend/opencompass/tasks/eval_api.py +1 -6
- evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
- evalscope/backend/rag_eval/__init__.py +3 -3
- evalscope/backend/rag_eval/backend_manager.py +21 -25
- evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
- evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
- evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
- evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
- evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
- evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
- evalscope/backend/rag_eval/cmteb/base.py +22 -23
- evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
- evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
- evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
- evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
- evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
- evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
- evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
- evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
- evalscope/backend/rag_eval/ragas/__init__.py +2 -2
- evalscope/backend/rag_eval/ragas/arguments.py +3 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +10 -15
- evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
- evalscope/backend/rag_eval/utils/clip.py +47 -51
- evalscope/backend/rag_eval/utils/embedding.py +13 -12
- evalscope/backend/rag_eval/utils/llm.py +8 -6
- evalscope/backend/rag_eval/utils/tools.py +12 -11
- evalscope/backend/vlm_eval_kit/__init__.py +1 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
- evalscope/benchmarks/arc/__init__.py +3 -2
- evalscope/benchmarks/arc/ai2_arc.py +19 -16
- evalscope/benchmarks/arc/arc_adapter.py +32 -24
- evalscope/benchmarks/bbh/__init__.py +1 -2
- evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
- evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
- evalscope/benchmarks/benchmark.py +16 -16
- evalscope/benchmarks/ceval/__init__.py +3 -2
- evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
- evalscope/benchmarks/ceval/ceval_exam.py +18 -31
- evalscope/benchmarks/cmmlu/__init__.py +3 -2
- evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
- evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
- evalscope/benchmarks/competition_math/__init__.py +3 -2
- evalscope/benchmarks/competition_math/competition_math.py +7 -16
- evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
- evalscope/benchmarks/data_adapter.py +24 -24
- evalscope/benchmarks/general_qa/__init__.py +3 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +35 -39
- evalscope/benchmarks/gsm8k/__init__.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +27 -24
- evalscope/benchmarks/hellaswag/__init__.py +3 -2
- evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +28 -23
- evalscope/benchmarks/humaneval/__init__.py +1 -1
- evalscope/benchmarks/humaneval/humaneval.py +15 -18
- evalscope/benchmarks/humaneval/humaneval_adapter.py +192 -7
- evalscope/benchmarks/mmlu/__init__.py +3 -2
- evalscope/benchmarks/mmlu/mmlu.py +15 -29
- evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
- evalscope/benchmarks/race/__init__.py +3 -2
- evalscope/benchmarks/race/race.py +21 -35
- evalscope/benchmarks/race/race_adapter.py +33 -29
- evalscope/benchmarks/race/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/__init__.py +3 -2
- evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
- evalscope/benchmarks/truthful_qa/__init__.py +3 -2
- evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
- evalscope/cli/cli.py +6 -5
- evalscope/cli/start_eval.py +31 -0
- evalscope/cli/start_perf.py +0 -3
- evalscope/cli/start_server.py +27 -41
- evalscope/config.py +154 -96
- evalscope/constants.py +50 -32
- evalscope/evaluator/evaluator.py +97 -377
- evalscope/evaluator/rating_eval.py +12 -33
- evalscope/evaluator/reviewer/auto_reviewer.py +48 -76
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
- evalscope/metrics/code_metric.py +3 -9
- evalscope/metrics/math_accuracy.py +3 -6
- evalscope/metrics/metrics.py +21 -21
- evalscope/metrics/rouge_metric.py +11 -25
- evalscope/models/__init__.py +1 -2
- evalscope/models/api/openai_api.py +40 -29
- evalscope/models/custom/__init__.py +0 -1
- evalscope/models/custom/custom_model.py +3 -3
- evalscope/models/dummy_chat_model.py +7 -8
- evalscope/models/model_adapter.py +89 -156
- evalscope/models/openai_model.py +20 -20
- evalscope/perf/arguments.py +16 -3
- evalscope/perf/benchmark.py +9 -11
- evalscope/perf/http_client.py +3 -8
- evalscope/perf/main.py +8 -1
- evalscope/perf/plugin/api/custom_api.py +1 -2
- evalscope/perf/plugin/api/dashscope_api.py +1 -2
- evalscope/perf/plugin/api/openai_api.py +3 -4
- evalscope/perf/plugin/datasets/base.py +1 -2
- evalscope/perf/plugin/datasets/flickr8k.py +1 -2
- evalscope/perf/plugin/datasets/longalpaca.py +1 -2
- evalscope/perf/plugin/datasets/openqa.py +1 -2
- evalscope/perf/plugin/registry.py +3 -3
- evalscope/perf/utils/analysis_result.py +1 -2
- evalscope/perf/utils/benchmark_util.py +5 -6
- evalscope/perf/utils/db_util.py +77 -30
- evalscope/perf/utils/local_server.py +21 -13
- evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
- evalscope/registry/tasks/arc.yaml +2 -3
- evalscope/registry/tasks/bbh.yaml +3 -4
- evalscope/registry/tasks/bbh_mini.yaml +3 -4
- evalscope/registry/tasks/ceval.yaml +3 -3
- evalscope/registry/tasks/ceval_mini.yaml +3 -4
- evalscope/registry/tasks/cmmlu.yaml +3 -3
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
- evalscope/registry/tasks/general_qa.yaml +1 -1
- evalscope/registry/tasks/gsm8k.yaml +2 -2
- evalscope/registry/tasks/mmlu.yaml +3 -3
- evalscope/registry/tasks/mmlu_mini.yaml +3 -3
- evalscope/run.py +153 -381
- evalscope/run_arena.py +21 -25
- evalscope/summarizer.py +27 -40
- evalscope/third_party/longbench_write/README.md +99 -42
- evalscope/third_party/longbench_write/default_task.json +1 -1
- evalscope/third_party/longbench_write/default_task.yaml +8 -7
- evalscope/third_party/longbench_write/eval.py +29 -27
- evalscope/third_party/longbench_write/infer.py +16 -104
- evalscope/third_party/longbench_write/longbench_write.py +5 -4
- evalscope/third_party/longbench_write/resources/judge.txt +1 -1
- evalscope/third_party/longbench_write/tools/data_etl.py +5 -6
- evalscope/third_party/longbench_write/utils.py +0 -1
- evalscope/third_party/toolbench_static/eval.py +14 -15
- evalscope/third_party/toolbench_static/infer.py +48 -69
- evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
- evalscope/third_party/toolbench_static/requirements.txt +1 -1
- evalscope/third_party/toolbench_static/toolbench_static.py +4 -3
- evalscope/tools/combine_reports.py +27 -34
- evalscope/tools/rewrite_eval_results.py +15 -47
- evalscope/utils/__init__.py +1 -1
- evalscope/utils/arena_utils.py +18 -48
- evalscope/{perf/utils → utils}/chat_service.py +4 -5
- evalscope/utils/completion_parsers.py +3 -8
- evalscope/utils/io_utils.py +162 -0
- evalscope/utils/logger.py +17 -7
- evalscope/utils/model_utils.py +11 -0
- evalscope/utils/utils.py +5 -306
- evalscope/version.py +2 -2
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/METADATA +123 -118
- evalscope-0.8.1.dist-info/RECORD +285 -0
- tests/cli/test_run.py +53 -15
- tests/perf/test_perf.py +6 -1
- tests/rag/test_clip_benchmark.py +38 -38
- tests/rag/test_mteb.py +3 -2
- tests/rag/test_ragas.py +5 -5
- tests/swift/test_run_swift_eval.py +2 -3
- tests/swift/test_run_swift_vlm_eval.py +2 -3
- tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
- tests/vlm/test_vlmeval.py +3 -2
- evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
- evalscope/cache.py +0 -98
- evalscope/models/template.py +0 -1446
- evalscope/run_ms.py +0 -140
- evalscope/utils/task_cfg_parser.py +0 -10
- evalscope/utils/task_utils.py +0 -22
- evalscope-0.7.2.dist-info/RECORD +0 -286
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/LICENSE +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/WHEEL +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/top_level.txt +0 -0
|
@@ -1,11 +1,13 @@
|
|
|
1
1
|
# Copyright (c) Alibaba, Inc. and its affiliates.
|
|
2
2
|
|
|
3
|
-
import os
|
|
4
3
|
import csv
|
|
4
|
+
import os
|
|
5
|
+
|
|
5
6
|
from evalscope.benchmarks.data_adapter import DataAdapter
|
|
6
7
|
from evalscope.metrics.metrics import exact_match, weighted_mean
|
|
7
8
|
from evalscope.utils import ResponseParser, normalize_score
|
|
8
9
|
from evalscope.utils.logger import get_logger
|
|
10
|
+
|
|
9
11
|
# flake8: noqa
|
|
10
12
|
|
|
11
13
|
logger = get_logger()
|
|
@@ -13,143 +15,89 @@ logger = get_logger()
|
|
|
13
15
|
DATASET_ID = 'modelscope/cmmlu'
|
|
14
16
|
|
|
15
17
|
SUBSET_LIST = [
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
"college_actuarial_science",
|
|
30
|
-
"college_education",
|
|
31
|
-
"college_engineering_hydrology",
|
|
32
|
-
"college_law",
|
|
33
|
-
"college_mathematics",
|
|
34
|
-
"college_medical_statistics",
|
|
35
|
-
"clinical_knowledge",
|
|
36
|
-
"college_medicine",
|
|
37
|
-
"computer_science",
|
|
38
|
-
"computer_security",
|
|
39
|
-
"conceptual_physics",
|
|
40
|
-
"construction_project_management",
|
|
41
|
-
"economics",
|
|
42
|
-
"education",
|
|
43
|
-
"elementary_chinese",
|
|
44
|
-
"elementary_commonsense",
|
|
45
|
-
"elementary_information_and_technology",
|
|
46
|
-
"electrical_engineering",
|
|
47
|
-
"elementary_mathematics",
|
|
48
|
-
"ethnology",
|
|
49
|
-
"food_science",
|
|
50
|
-
"genetics",
|
|
51
|
-
"global_facts",
|
|
52
|
-
"high_school_biology",
|
|
53
|
-
"high_school_chemistry",
|
|
54
|
-
"high_school_geography",
|
|
55
|
-
"high_school_mathematics",
|
|
56
|
-
"high_school_physics",
|
|
57
|
-
"high_school_politics",
|
|
58
|
-
"human_sexuality",
|
|
59
|
-
"international_law",
|
|
60
|
-
"journalism",
|
|
61
|
-
"jurisprudence",
|
|
62
|
-
"legal_and_moral_basis",
|
|
63
|
-
"logical",
|
|
64
|
-
"machine_learning",
|
|
65
|
-
"management",
|
|
66
|
-
"marketing",
|
|
67
|
-
"marxist_theory",
|
|
68
|
-
"modern_chinese",
|
|
69
|
-
"nutrition",
|
|
70
|
-
"philosophy",
|
|
71
|
-
"professional_accounting",
|
|
72
|
-
"professional_law",
|
|
73
|
-
"professional_medicine",
|
|
74
|
-
"professional_psychology",
|
|
75
|
-
"public_relations",
|
|
76
|
-
"security_study",
|
|
77
|
-
"sociology",
|
|
78
|
-
"sports_science",
|
|
79
|
-
"traditional_chinese_medicine",
|
|
80
|
-
"virology",
|
|
81
|
-
"world_history",
|
|
82
|
-
"world_religions"
|
|
18
|
+
'agronomy', 'anatomy', 'ancient_chinese', 'arts', 'astronomy', 'business_ethics', 'chinese_civil_service_exam',
|
|
19
|
+
'chinese_driving_rule', 'chinese_food_culture', 'chinese_foreign_policy', 'chinese_history', 'chinese_literature',
|
|
20
|
+
'chinese_teacher_qualification', 'college_actuarial_science', 'college_education', 'college_engineering_hydrology',
|
|
21
|
+
'college_law', 'college_mathematics', 'college_medical_statistics', 'clinical_knowledge', 'college_medicine',
|
|
22
|
+
'computer_science', 'computer_security', 'conceptual_physics', 'construction_project_management', 'economics',
|
|
23
|
+
'education', 'elementary_chinese', 'elementary_commonsense', 'elementary_information_and_technology',
|
|
24
|
+
'electrical_engineering', 'elementary_mathematics', 'ethnology', 'food_science', 'genetics', 'global_facts',
|
|
25
|
+
'high_school_biology', 'high_school_chemistry', 'high_school_geography', 'high_school_mathematics',
|
|
26
|
+
'high_school_physics', 'high_school_politics', 'human_sexuality', 'international_law', 'journalism',
|
|
27
|
+
'jurisprudence', 'legal_and_moral_basis', 'logical', 'machine_learning', 'management', 'marketing',
|
|
28
|
+
'marxist_theory', 'modern_chinese', 'nutrition', 'philosophy', 'professional_accounting', 'professional_law',
|
|
29
|
+
'professional_medicine', 'professional_psychology', 'public_relations', 'security_study', 'sociology',
|
|
30
|
+
'sports_science', 'traditional_chinese_medicine', 'virology', 'world_history', 'world_religions'
|
|
83
31
|
]
|
|
84
32
|
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
33
|
+
SUBJECT_MAPPING = {
|
|
34
|
+
'agronomy': ['other', 'Other'],
|
|
35
|
+
'anatomy': ['biology', 'STEM'],
|
|
36
|
+
'ancient_chinese': ['china specific', 'China specific'],
|
|
37
|
+
'arts': ['arts', 'Humanities'],
|
|
38
|
+
'astronomy': ['physics', 'STEM'],
|
|
39
|
+
'business_ethics': ['business', 'Social Science'],
|
|
40
|
+
'chinese_civil_service_exam': ['china specific', 'China specific'],
|
|
41
|
+
'chinese_driving_rule': ['china specific', 'China specific'],
|
|
42
|
+
'chinese_food_culture': ['china specific', 'China specific'],
|
|
43
|
+
'chinese_foreign_policy': ['china specific', 'China specific'],
|
|
44
|
+
'chinese_history': ['china specific', 'China specific'],
|
|
45
|
+
'chinese_literature': ['china specific', 'China specific'],
|
|
46
|
+
'chinese_teacher_qualification': ['china specific', 'China specific'],
|
|
47
|
+
'college_actuarial_science': ['math', 'STEM'],
|
|
48
|
+
'college_education': ['education', 'Social Science'],
|
|
49
|
+
'college_engineering_hydrology': ['engineering', 'STEM'],
|
|
50
|
+
'college_law': ['law', 'Humanities'],
|
|
51
|
+
'college_mathematics': ['math', 'STEM'],
|
|
52
|
+
'college_medical_statistics': ['statistics', 'STEM'],
|
|
53
|
+
'clinical_knowledge': ['other', 'Other'],
|
|
54
|
+
'college_medicine': ['other', 'Other'],
|
|
55
|
+
'computer_science': ['computer science', 'STEM'],
|
|
56
|
+
'computer_security': ['other', 'Other'],
|
|
57
|
+
'conceptual_physics': ['physics', 'STEM'],
|
|
58
|
+
'construction_project_management': ['china specific', 'China specific'],
|
|
59
|
+
'economics': ['economics', 'Social Science'],
|
|
60
|
+
'education': ['education', 'Social Science'],
|
|
61
|
+
'elementary_chinese': ['china specific', 'China specific'],
|
|
62
|
+
'elementary_commonsense': ['china specific', 'China specific'],
|
|
63
|
+
'elementary_information_and_technology': ['other', 'Other'],
|
|
64
|
+
'electrical_engineering': ['engineering', 'STEM'],
|
|
65
|
+
'elementary_mathematics': ['math', 'STEM'],
|
|
66
|
+
'ethnology': ['china specific', 'China specific'],
|
|
67
|
+
'food_science': ['other', 'Other'],
|
|
68
|
+
'genetics': ['biology', 'STEM'],
|
|
69
|
+
'global_facts': ['global', 'Humanities'],
|
|
70
|
+
'high_school_biology': ['biology', 'STEM'],
|
|
71
|
+
'high_school_chemistry': ['chemistry', 'STEM'],
|
|
72
|
+
'high_school_geography': ['geography', 'Social Science'],
|
|
73
|
+
'high_school_mathematics': ['math', 'STEM'],
|
|
74
|
+
'high_school_physics': ['physics', 'STEM'],
|
|
75
|
+
'high_school_politics': ['china specific', 'China specific'],
|
|
76
|
+
'human_sexuality': ['other', 'Other'],
|
|
77
|
+
'international_law': ['law', 'Humanities'],
|
|
78
|
+
'journalism': ['sociology', 'Social Science'],
|
|
79
|
+
'jurisprudence': ['law', 'Humanities'],
|
|
80
|
+
'legal_and_moral_basis': ['other', 'Other'],
|
|
81
|
+
'logical': ['philosophy', 'Humanities'],
|
|
82
|
+
'machine_learning': ['computer science', 'STEM'],
|
|
83
|
+
'management': ['business', 'Social Science'],
|
|
84
|
+
'marketing': ['business', 'Social Science'],
|
|
85
|
+
'marxist_theory': ['philosophy', 'Humanities'],
|
|
86
|
+
'modern_chinese': ['china specific', 'China specific'],
|
|
87
|
+
'nutrition': ['other', 'Other'],
|
|
88
|
+
'philosophy': ['philosophy', 'Humanities'],
|
|
89
|
+
'professional_accounting': ['business', 'Social Science'],
|
|
90
|
+
'professional_law': ['law', 'Humanities'],
|
|
91
|
+
'professional_medicine': ['other', 'Other'],
|
|
92
|
+
'professional_psychology': ['psychology', 'Social Science'],
|
|
93
|
+
'public_relations': ['politics', 'Social Science'],
|
|
94
|
+
'security_study': ['politics', 'Social Science'],
|
|
95
|
+
'sociology': ['culture', 'Social Science'],
|
|
96
|
+
'sports_science': ['other', 'Other'],
|
|
97
|
+
'traditional_chinese_medicine': ['china specific', 'China specific'],
|
|
98
|
+
'virology': ['biology', 'STEM'],
|
|
99
|
+
'world_history': ['history', 'Humanities'],
|
|
100
|
+
'world_religions': ['global', 'Humanities']
|
|
153
101
|
}
|
|
154
102
|
|
|
155
103
|
|
|
@@ -171,12 +119,13 @@ class CMMLUAdapter(DataAdapter):
|
|
|
171
119
|
if metric_list is None:
|
|
172
120
|
metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
|
|
173
121
|
|
|
174
|
-
super().__init__(
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
122
|
+
super().__init__(
|
|
123
|
+
subset_list=subset_list,
|
|
124
|
+
metric_list=metric_list,
|
|
125
|
+
few_shot_num=few_shot_num,
|
|
126
|
+
train_split=train_split,
|
|
127
|
+
eval_split=eval_split,
|
|
128
|
+
**kwargs)
|
|
180
129
|
|
|
181
130
|
def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
|
|
182
131
|
data_dict = {}
|
|
@@ -223,9 +172,7 @@ class CMMLUAdapter(DataAdapter):
|
|
|
223
172
|
{'data': [(context, continuation), ...]}
|
|
224
173
|
|
|
225
174
|
"""
|
|
226
|
-
prompt = '以下是关于{}的单项选择题。\n\n'.format(
|
|
227
|
-
self._format_subject(subset_name)
|
|
228
|
-
)
|
|
175
|
+
prompt = '以下是关于{}的单项选择题。\n\n'.format(self._format_subject(subset_name))
|
|
229
176
|
few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
|
|
230
177
|
|
|
231
178
|
context: str = '\n'.join(few_shot_prompts) + '\n'
|
|
@@ -331,17 +278,24 @@ class CMMLUAdapter(DataAdapter):
|
|
|
331
278
|
domain_weighted_avg_acc = sum([score * num for _, score, num in domain_res_list]) / \
|
|
332
279
|
sum([num for _, _, num in domain_res_list])
|
|
333
280
|
domain_weighted_avg_acc = normalize_score(score=domain_weighted_avg_acc)
|
|
334
|
-
category_list.append({
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
281
|
+
category_list.append({
|
|
282
|
+
'name':
|
|
283
|
+
domain_name,
|
|
284
|
+
'score':
|
|
285
|
+
domain_weighted_avg_acc,
|
|
286
|
+
'subset': [{
|
|
287
|
+
'name': subset_name,
|
|
288
|
+
'score': normalize_score(subset_score)
|
|
289
|
+
} for subset_name, subset_score, _ in domain_res_list]
|
|
290
|
+
})
|
|
338
291
|
|
|
339
292
|
# Get final dict of report
|
|
340
|
-
res_map = dict(
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
293
|
+
res_map = dict(
|
|
294
|
+
name=report_name or 'cmmlu',
|
|
295
|
+
metric=self.metric_list[0]['name'],
|
|
296
|
+
score=weighted_avg_acc,
|
|
297
|
+
category=category_list,
|
|
298
|
+
total_num=total_num)
|
|
345
299
|
|
|
346
300
|
return res_map
|
|
347
301
|
|
|
@@ -366,4 +320,4 @@ class CMMLUAdapter(DataAdapter):
|
|
|
366
320
|
s = ''
|
|
367
321
|
for entry in l:
|
|
368
322
|
s += ' ' + entry
|
|
369
|
-
return s
|
|
323
|
+
return s
|
|
@@ -2,4 +2,4 @@
|
|
|
2
2
|
{'input': '下列关于重力的说法正确的是', 'A': '在地球周围的物体都要受到重力作用,与其运动状态无关', 'B': '对某一物体而言,重力的大小是一个恒量,不随物体的地理位置而改变', 'C': '重力就是地球对物体的吸引力,重力的方向总是竖直向下', 'D': '在地球表面各处的重力方向都是相同的', 'target': 'A'}
|
|
3
3
|
{'input': '心脏的静脉血回心的主要途径是', 'A': '心小静脉', 'B': '冠状窦', 'C': '心中静脉', 'D': '心前静脉', 'target': 'B'}
|
|
4
4
|
{'input': "以西蒙为代表的决策理论学派提出的决策准则是", 'A': '最优化', 'B': '公平', 'C': '民主化', 'D': '满意', 'target': 'D'}
|
|
5
|
-
{'input': '20世纪初,英国首相阿斯奎斯说:“我们现在有一个牢固确立了两百年的传统,即归根到底,王位的占有者接受其大臣的建议并据此行事。”这一传统的确立,使一个以小农业和手工业生产为主的国家变成了一个典型的资本主义国家,成为欧洲各国效仿的对象。各国效仿的理由是', 'A': '英国“光荣革命”宣告了欧洲新社会政治制度的诞生', 'B': '殖民主义深刻影响了英国“世界工厂”的地位', 'C': '英国经济上的成就得益于其制度设计', 'D': '英国启蒙思想奠定了资产阶级民主主义政治的理论基础', 'target': 'C'}
|
|
5
|
+
{'input': '20世纪初,英国首相阿斯奎斯说:“我们现在有一个牢固确立了两百年的传统,即归根到底,王位的占有者接受其大臣的建议并据此行事。”这一传统的确立,使一个以小农业和手工业生产为主的国家变成了一个典型的资本主义国家,成为欧洲各国效仿的对象。各国效仿的理由是', 'A': '英国“光荣革命”宣告了欧洲新社会政治制度的诞生', 'B': '殖民主义深刻影响了英国“世界工厂”的地位', 'C': '英国经济上的成就得益于其制度设计', 'D': '英国启蒙思想奠定了资产阶级民主主义政治的理论基础', 'target': 'C'}
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
# Copyright (c) Alibaba, Inc. and its affiliates.
|
|
2
2
|
|
|
3
|
-
from evalscope.benchmarks.competition_math.competition_math_adapter import
|
|
3
|
+
from evalscope.benchmarks.competition_math.competition_math_adapter import DATASET_ID, SUBSET_LIST
|
|
4
|
+
from evalscope.benchmarks.competition_math.competition_math_adapter import CompetitionMathAdapter
|
|
4
5
|
from evalscope.benchmarks.competition_math.competition_math_adapter import CompetitionMathAdapter as DataAdapterClass
|
|
5
|
-
from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass
|
|
6
|
+
from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass # noqa
|
|
@@ -1,13 +1,10 @@
|
|
|
1
1
|
# Copyright (c) Alibaba, Inc. and its affiliates.
|
|
2
|
-
|
|
3
2
|
"""Mathematics Aptitude Test of Heuristics (MATH) dataset."""
|
|
4
3
|
|
|
4
|
+
import datasets
|
|
5
5
|
import json
|
|
6
6
|
import os
|
|
7
7
|
|
|
8
|
-
import datasets
|
|
9
|
-
|
|
10
|
-
|
|
11
8
|
_CITATION = """\
|
|
12
9
|
@article{hendrycksmath2021,
|
|
13
10
|
title={Measuring Mathematical Problem Solving With the MATH Dataset},
|
|
@@ -24,7 +21,6 @@ _CITATION = """\
|
|
|
24
21
|
}
|
|
25
22
|
"""
|
|
26
23
|
|
|
27
|
-
|
|
28
24
|
_DESCRIPTION = """\
|
|
29
25
|
The Mathematics Aptitude Test of Heuristics (MATH) dataset consists of problems
|
|
30
26
|
from mathematics competitions, including the AMC 10, AMC 12, AIME, and more.
|
|
@@ -32,13 +28,10 @@ Each problem in MATH has a full step-by-step solution, which can be used to teac
|
|
|
32
28
|
models to generate answer derivations and explanations.
|
|
33
29
|
"""
|
|
34
30
|
|
|
35
|
-
|
|
36
31
|
_HOMEPAGE = 'https://github.com/hendrycks/math'
|
|
37
32
|
|
|
38
|
-
|
|
39
33
|
_LICENSE = 'https://github.com/hendrycks/math/blob/main/LICENSE'
|
|
40
34
|
|
|
41
|
-
|
|
42
35
|
# Original data URL: "https://people.eecs.berkeley.edu/~hendrycks/MATH.tar"
|
|
43
36
|
_URL = 'https://sail-moe.oss-cn-hangzhou.aliyuncs.com/open_data/math/MATH.zip'
|
|
44
37
|
|
|
@@ -49,14 +42,12 @@ class CompetitionMathDataset(datasets.GeneratorBasedBuilder):
|
|
|
49
42
|
VERSION = datasets.Version('1.0.0')
|
|
50
43
|
|
|
51
44
|
def _info(self):
|
|
52
|
-
features = datasets.Features(
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
}
|
|
59
|
-
)
|
|
45
|
+
features = datasets.Features({
|
|
46
|
+
'problem': datasets.Value('string'),
|
|
47
|
+
'level': datasets.Value('string'),
|
|
48
|
+
'type': datasets.Value('string'),
|
|
49
|
+
'solution': datasets.Value('string'),
|
|
50
|
+
})
|
|
60
51
|
return datasets.DatasetInfo(
|
|
61
52
|
description=_DESCRIPTION,
|
|
62
53
|
features=features,
|
|
@@ -8,6 +8,7 @@ from evalscope.benchmarks import DataAdapter
|
|
|
8
8
|
from evalscope.metrics.metrics import weighted_mean
|
|
9
9
|
from evalscope.utils import normalize_score
|
|
10
10
|
from evalscope.utils.logger import get_logger
|
|
11
|
+
|
|
11
12
|
# flake8: noqa
|
|
12
13
|
|
|
13
14
|
logger = get_logger()
|
|
@@ -43,12 +44,13 @@ class CompetitionMathAdapter(DataAdapter):
|
|
|
43
44
|
f'but got {self.few_shot_num}. Use 4-shot by default.')
|
|
44
45
|
few_shot_num = 4
|
|
45
46
|
|
|
46
|
-
super().__init__(
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
47
|
+
super().__init__(
|
|
48
|
+
subset_list=subset_list,
|
|
49
|
+
metric_list=metric_list,
|
|
50
|
+
few_shot_num=few_shot_num,
|
|
51
|
+
train_split=train_split,
|
|
52
|
+
eval_split=eval_split,
|
|
53
|
+
**kwargs)
|
|
52
54
|
|
|
53
55
|
def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
|
|
54
56
|
data_dict: dict = {}
|
|
@@ -161,17 +163,19 @@ class CompetitionMathAdapter(DataAdapter):
|
|
|
161
163
|
total_num: int = sum([num for _, num in subset_score_map.values()])
|
|
162
164
|
weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
|
|
163
165
|
weighted_avg_acc = normalize_score(score=weighted_avg_acc)
|
|
164
|
-
cate_avg_list = [{
|
|
166
|
+
cate_avg_list = [{
|
|
167
|
+
'name': subset_name,
|
|
168
|
+
'score': normalize_score(score=score)
|
|
169
|
+
} for subset_name, (score, _) in subset_score_map.items()]
|
|
165
170
|
|
|
166
|
-
category_d = dict(name='DEFAULT',
|
|
167
|
-
score=weighted_avg_acc,
|
|
168
|
-
subset=cate_avg_list)
|
|
171
|
+
category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
|
|
169
172
|
|
|
170
|
-
res_map = dict(
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
173
|
+
res_map = dict(
|
|
174
|
+
name=report_name or 'competition_math',
|
|
175
|
+
metric=self.metric_list[0]['name'],
|
|
176
|
+
score=weighted_avg_acc,
|
|
177
|
+
category=[category_d],
|
|
178
|
+
total_num=total_num)
|
|
175
179
|
|
|
176
180
|
return res_map
|
|
177
181
|
|
|
@@ -186,8 +190,7 @@ class CompetitionMathAdapter(DataAdapter):
|
|
|
186
190
|
'Problem:\nIf $\det \mathbf{{A}} = 2$ and $\det \mathbf{{B}} = 12,$ then find $\det (\mathbf{{A}} \mathbf{{B}}).$\nSolution:\nWe have that $\det (\mathbf{{A}} \mathbf{{B}}) = (\det \mathbf{{A}})(\det \mathbf{{B}}) = (2)(12) = \\boxed{{24}}.$\nFinal Answer: The final answer is $24$. I hope it is correct.\n'
|
|
187
191
|
'Problem:\nTerrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?\nSolution:\nIf Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\cdot 12\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\cdot15\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \\begin{{align*}} 30n&=480\\\\ \Rightarrow\qquad n&=480/30=\\boxed{{16}} \end{{align*}}\nFinal Answer: The final answer is $16$. I hope it is correct.\n'
|
|
188
192
|
'Problem:\nIf the system of equations: \\begin{{align*}} 6x-4y&=a,\\\\ 6y-9x &=b. \end{{align*}}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{{a}}{{b}},$ assuming $b$ is nonzero.\nSolution:\nIf we multiply the first equation by $-\\frac{{3}}{{2}}$, we obtain $$6y-9x=-\\frac{{3}}{{2}}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{{3}}{{2}}a=b\Rightarrow\\frac{{a}}{{b}}=\\boxed{{-\\frac{{2}}{{3}}}}.$$\nFinal Answer: The final answer is $-\\frac{{2}}{{3}}$. I hope it is correct.\n'
|
|
189
|
-
f'Problem:\n{problem}\nSolution:\n'
|
|
190
|
-
)
|
|
193
|
+
f'Problem:\n{problem}\nSolution:\n')
|
|
191
194
|
else:
|
|
192
195
|
context = 'Problem:\n' + problem + '\nSolution:\n'
|
|
193
196
|
return context
|
|
@@ -212,15 +215,15 @@ class CompetitionMathAdapter(DataAdapter):
|
|
|
212
215
|
|
|
213
216
|
if '\\boxed ' in s:
|
|
214
217
|
left = '\\boxed '
|
|
215
|
-
assert s[:
|
|
218
|
+
assert s[:len(left)] == left
|
|
216
219
|
return s[len(left):]
|
|
217
220
|
|
|
218
221
|
left = '\\boxed{'
|
|
219
222
|
|
|
220
|
-
assert s[:
|
|
223
|
+
assert s[:len(left)] == left
|
|
221
224
|
assert s[-1] == '}'
|
|
222
225
|
|
|
223
|
-
return s[len(left)
|
|
226
|
+
return s[len(left):-1]
|
|
224
227
|
|
|
225
228
|
@classmethod
|
|
226
229
|
def _last_boxed_only_string(cls, string):
|
|
@@ -249,7 +252,7 @@ class CompetitionMathAdapter(DataAdapter):
|
|
|
249
252
|
if right_brace_idx is None:
|
|
250
253
|
retval = None
|
|
251
254
|
else:
|
|
252
|
-
retval = string[idx:
|
|
255
|
+
retval = string[idx:right_brace_idx + 1]
|
|
253
256
|
|
|
254
257
|
return retval
|
|
255
258
|
|
|
@@ -409,18 +412,14 @@ class CompetitionMathAdapter(DataAdapter):
|
|
|
409
412
|
|
|
410
413
|
@classmethod
|
|
411
414
|
def _math_postprocess(cls, text: str) -> str:
|
|
412
|
-
SUBSTITUTIONS = [('an ', ''), ('a ', ''), ('.$', '$'), ('\\$', ''),
|
|
413
|
-
(
|
|
414
|
-
(',\\text{and}', ','), ('\\text{and}', ','),
|
|
415
|
-
('\\text{m}', '\\text{}'), ('\\le', '<')]
|
|
415
|
+
SUBSTITUTIONS = [('an ', ''), ('a ', ''), ('.$', '$'), ('\\$', ''), (r'\ ', ''), (' ', ''), ('mbox', 'text'),
|
|
416
|
+
(',\\text{and}', ','), ('\\text{and}', ','), ('\\text{m}', '\\text{}'), ('\\le', '<')]
|
|
416
417
|
REMOVED_EXPRESSIONS = [
|
|
417
|
-
'square', 'ways', 'integers', 'dollars', 'mph', 'inches', 'ft',
|
|
418
|
-
'
|
|
419
|
-
'
|
|
420
|
-
'
|
|
421
|
-
'\\
|
|
422
|
-
'\\text{}', r'\mathrm{th}', r'^\circ', r'^{\circ}', r'\;', r',\!',
|
|
423
|
-
'{,}', '"', '\\dots', '\n', '\r', '\f'
|
|
418
|
+
'square', 'ways', 'integers', 'dollars', 'mph', 'inches', 'ft', 'hours', 'km', 'units', '\\ldots', 'sue',
|
|
419
|
+
'points', 'feet', 'minutes', 'digits', 'cents', 'degrees', 'cm', 'gm', 'pounds', 'meters', 'meals', 'edges',
|
|
420
|
+
'students', 'childrentickets', 'multiples', '\\text{s}', '\\text{.}', '\\text{\ns}', '\\text{}^2',
|
|
421
|
+
'\\text{}^3', '\\text{\n}', '\\text{}', r'\mathrm{th}', r'^\circ', r'^{\circ}', r'\;', r',\!', '{,}', '"',
|
|
422
|
+
'\\dots', '\n', '\r', '\f'
|
|
424
423
|
]
|
|
425
424
|
import re
|
|
426
425
|
|
|
@@ -453,8 +452,7 @@ class CompetitionMathAdapter(DataAdapter):
|
|
|
453
452
|
if 'rac' in final_answer and '\\frac' not in final_answer:
|
|
454
453
|
final_answer = final_answer.replace('rac', '\\frac')
|
|
455
454
|
|
|
456
|
-
final_answer = re.sub(r'(frac)([^{])(.)', 'frac{\\2}{\\3}',
|
|
457
|
-
final_answer)
|
|
455
|
+
final_answer = re.sub(r'(frac)([^{])(.)', 'frac{\\2}{\\3}', final_answer)
|
|
458
456
|
final_answer = re.sub(r'(sqrt)([^{])', 'sqrt{\\2}', final_answer)
|
|
459
457
|
final_answer = final_answer.replace('$', '')
|
|
460
458
|
|