PyPI - evalscope - Versions diffs - 0.16.2__py3-none-any.whl → 0.17.0__py3-none-any.whl - Mend

evalscope 0.16.2-py3-none-any.whl → 0.17.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of evalscope might be problematic. Click here for more details.

Files changed (117)

evalscope/app/app.py +9 -762
evalscope/app/constants.py +1 -0
evalscope/app/ui/__init__.py +20 -0
evalscope/app/ui/app_ui.py +52 -0
evalscope/app/ui/multi_model.py +323 -0
evalscope/app/ui/sidebar.py +42 -0
evalscope/app/ui/single_model.py +202 -0
evalscope/app/ui/visualization.py +36 -0
evalscope/app/utils/data_utils.py +178 -0
evalscope/app/utils/localization.py +221 -0
evalscope/app/utils/text_utils.py +119 -0
evalscope/app/utils/visualization.py +91 -0
evalscope/backend/opencompass/backend_manager.py +2 -1
evalscope/backend/rag_eval/backend_manager.py +2 -1
evalscope/backend/rag_eval/utils/embedding.py +1 -1
evalscope/backend/vlm_eval_kit/backend_manager.py +4 -1
evalscope/benchmarks/__init__.py +15 -1
evalscope/benchmarks/aime/aime24_adapter.py +2 -1
evalscope/benchmarks/aime/aime25_adapter.py +2 -1
evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -1
evalscope/benchmarks/arc/arc_adapter.py +1 -1
evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -1
evalscope/benchmarks/arena_hard/utils.py +0 -12
evalscope/benchmarks/ceval/ceval_adapter.py +5 -16
evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -21
evalscope/benchmarks/competition_math/competition_math_adapter.py +2 -1
evalscope/benchmarks/data_adapter.py +20 -5
evalscope/benchmarks/general_arena/__init__.py +0 -0
evalscope/benchmarks/general_arena/general_arena_adapter.py +411 -0
evalscope/benchmarks/general_arena/utils.py +226 -0
evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
evalscope/benchmarks/general_qa/general_qa_adapter.py +42 -29
evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -4
evalscope/benchmarks/iquiz/iquiz_adapter.py +1 -1
evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -6
evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +1 -1
evalscope/benchmarks/math_500/math_500_adapter.py +2 -1
evalscope/benchmarks/mmlu/mmlu_adapter.py +1 -1
evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
evalscope/benchmarks/musr/musr_adapter.py +1 -1
evalscope/benchmarks/race/race_adapter.py +1 -1
evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +9 -4
evalscope/benchmarks/utils.py +1 -2
evalscope/benchmarks/winogrande/winogrande_adapter.py +1 -1
evalscope/config.py +8 -123
evalscope/evaluator/evaluator.py +15 -12
evalscope/metrics/__init__.py +6 -0
evalscope/{utils/utils.py → metrics/completion_parsers.py} +68 -180
evalscope/metrics/llm_judge.py +105 -20
evalscope/metrics/metrics.py +1 -1
evalscope/models/adapters/base_adapter.py +0 -2
evalscope/models/adapters/server_adapter.py +2 -2
evalscope/models/custom/dummy_model.py +3 -3
evalscope/perf/arguments.py +2 -16
evalscope/perf/main.py +1 -1
evalscope/perf/utils/analysis_result.py +24 -23
evalscope/perf/utils/benchmark_util.py +1 -1
evalscope/report/__init__.py +1 -1
evalscope/report/utils.py +34 -15
evalscope/run.py +1 -1
evalscope/summarizer.py +1 -2
evalscope/utils/__init__.py +63 -2
evalscope/utils/argument_utils.py +64 -0
evalscope/utils/import_utils.py +16 -0
evalscope/utils/io_utils.py +45 -4
evalscope/utils/model_utils.py +37 -1
evalscope/version.py +2 -2
{evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/METADATA +55 -26
{evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/RECORD +90 -101
tests/aigc/test_t2i.py +1 -1
tests/cli/test_all.py +50 -2
tests/cli/test_collection.py +1 -1
tests/cli/test_custom.py +261 -0
tests/cli/test_run.py +13 -37
tests/perf/test_perf.py +2 -2
tests/rag/test_clip_benchmark.py +2 -1
tests/rag/test_mteb.py +3 -1
tests/rag/test_ragas.py +3 -1
tests/swift/test_run_swift_eval.py +2 -1
tests/swift/test_run_swift_vlm_eval.py +2 -1
tests/swift/test_run_swift_vlm_jugde_eval.py +2 -1
tests/utils.py +13 -0
tests/vlm/test_vlmeval.py +8 -2
evalscope/evaluator/rating_eval.py +0 -157
evalscope/evaluator/reviewer/__init__.py +0 -1
evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
evalscope/registry/__init__.py +0 -1
evalscope/registry/config/cfg_arena.yaml +0 -77
evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
evalscope/registry/config/cfg_single.yaml +0 -78
evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
evalscope/registry/data/qa_browser/battle.jsonl +0 -634
evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
evalscope/registry/data/question.jsonl +0 -80
evalscope/registry/tasks/arc.yaml +0 -28
evalscope/registry/tasks/bbh.yaml +0 -26
evalscope/registry/tasks/bbh_mini.yaml +0 -26
evalscope/registry/tasks/ceval.yaml +0 -27
evalscope/registry/tasks/ceval_mini.yaml +0 -26
evalscope/registry/tasks/cmmlu.yaml +0 -27
evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
evalscope/registry/tasks/general_qa.yaml +0 -27
evalscope/registry/tasks/gsm8k.yaml +0 -29
evalscope/registry/tasks/mmlu.yaml +0 -29
evalscope/registry/tasks/mmlu_mini.yaml +0 -27
evalscope/run_arena.py +0 -202
evalscope/utils/arena_utils.py +0 -217
evalscope/utils/completion_parsers.py +0 -82
/evalscope/{utils → benchmarks}/filters.py +0 -0
{evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/LICENSE +0 -0
{evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/WHEEL +0 -0
{evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/entry_points.txt +0 -0
{evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/top_level.txt +0 -0

evalscope/registry/data/qa_browser/category_mapping.yaml DELETED Viewed

@@ -1,10 +0,0 @@
-文本生成: ['写作', '续写生成', '通用写作', '应用文写作', '头脑风暴', '开放对话', '角色扮演']
-文本理解:
-  ['文本点评', '文本摘要', '润色/纠错', '文本分类', '情感分析', '信息抽取', '文本聚类', '序列处理']
-知识问答: ['百科知识问答']
-数学解题: ['数学解题']
-逻辑推理: ['逻辑推理', '阅读理解']
-CODING: ['CODING', '表格处理']
-翻译: ['翻译']
-安全风险: ['有伤害性']
-其他: ['*']

evalscope/registry/data/question.jsonl DELETED Viewed

@@ -1,80 +0,0 @@
-{"question_id": 1, "text": "How can I improve my time management skills?", "category": "generic"}
-{"question_id": 2, "text": "What are the most effective ways to deal with stress?", "category": "generic"}
-{"question_id": 3, "text": "What are the main differences between Python and JavaScript programming languages?", "category": "generic"}
-{"question_id": 4, "text": "How can I increase my productivity while working from home?", "category": "generic"}
-{"question_id": 5, "text": "Can you explain the basics of quantum computing?", "category": "generic"}
-{"question_id": 6, "text": "What are the differences between plant-based and animal-based protein sources?", "category": "generic"}
-{"question_id": 7, "text": "How can I develop my critical thinking skills?", "category": "generic"}
-{"question_id": 8, "text": "What are the major challenges faced by the education sector today?", "category": "generic"}
-{"question_id": 9, "text": "What are the primary factors that influence consumer behavior?", "category": "generic"}
-{"question_id": 10, "text": "What are the most effective strategies for conflict resolution in the workplace?", "category": "generic"}
-{"question_id": 11, "text": "What are some potential implications of using a single-use plastic bottle versus a reusable bottle on both the environment and human health?", "category": "knowledge"}
-{"question_id": 12, "text": "What factors would you consider when designing an inclusive and accessible public transportation system?", "category": "knowledge"}
-{"question_id": 13, "text": "How can governments utilize fiscal and monetary policies to combat economic recessions?", "category": "knowledge"}
-{"question_id": 14, "text": "How do language and cultural barriers affect the way people communicate and form relationships in multicultural societies?", "category": "knowledge"}
-{"question_id": 15, "text": "Describe a scenario where artificial intelligence could be used to improve the quality and efficiency of healthcare delivery.", "category": "knowledge"}
-{"question_id": 16, "text": "Explain the process of gene editing using CRISPR-Cas9 technology, and discuss its potential applications and ethical implications.", "category": "knowledge"}
-{"question_id": 17, "text": "How do vaccinations work to protect individuals and communities from infectious diseases, and what is herd immunity?", "category": "knowledge"}
-{"question_id": 18, "text": "How do social media platforms influence the way people consume and share news, and what are the potential implications for the spread of misinformation?", "category": "knowledge"}
-{"question_id": 19, "text": "How do cultural, social, and economic factors influence people's food choices, and how can this knowledge be used to promote healthier diets?", "category": "knowledge"}
-{"question_id": 20, "text": "Explain the process of natural selection and how it contributes to the evolution and adaptation of species.", "category": "knowledge"}
-{"question_id": 21, "text": "How would you introduce yourself as a medieval knight at a royal banquet?", "category": "roleplay"}
-{"question_id": 22, "text": "As a pirate captain, what would you say to your crew to motivate them to search for hidden treasure?", "category": "roleplay"}
-{"question_id": 23, "text": "If you were a Shakespearean character, how would you declare your love for someone in a soliloquy?", "category": "roleplay"}
-{"question_id": 24, "text": "As a superhero, how would you explain your origin story to a curious child?", "category": "roleplay"}
-{"question_id": 25, "text": "Imagine you are a time traveler from the year 3000. What technological advancements would you tell people about?", "category": "roleplay"}
-{"question_id": 26, "text": "As a sports commentator, describe the winning play in the final seconds of a championship game.", "category": "roleplay"}
-{"question_id": 27, "text": "Pretend to be a world-famous chef. How would you describe your signature dish to a panel of judges?", "category": "roleplay"}
-{"question_id": 28, "text": "You are a mountain climber reaching the summit of Mount Everest. Describe your emotions and the view from the top.", "category": "roleplay"}
-{"question_id": 29, "text": "As a space colonist on Mars, describe your daily life and the challenges you face living on another planet.", "category": "roleplay"}
-{"question_id": 30, "text": "Pretend to be a character in a post-apocalyptic world. Describe how you survive and the allies you encounter.", "category": "roleplay"}
-{"question_id": 31, "text": "How can you determine if a restaurant is popular among locals or mainly attracts tourists, and why might this information be useful?", "category": "common-sense"}
-{"question_id": 32, "text": "What are some subtle clues that suggest someone is pretending to understand a topic or conversation when they are actually confused or uninformed?", "category": "common-sense"}
-{"question_id": 33, "text": "Why might someone choose to use a paper map or ask for directions instead of relying on a GPS device or smartphone app?", "category": "common-sense"}
-{"question_id": 34, "text": "How can you determine if a person is genuinely interested in a conversation or simply being polite?", "category": "common-sense"}
-{"question_id": 35, "text": "Why might someone prefer to shop at a small, locally-owned business instead of a large chain store, even if the prices are higher?", "category": "common-sense"}
-{"question_id": 36, "text": "How can you assess the credibility of a source of information, such as a news article or blog post, without relying solely on the reputation of the author or publisher?", "category": "common-sense"}
-{"question_id": 37, "text": "Why do some people enjoy the sensation of being scared, such as by watching horror movies or going on roller coasters, while others avoid these experiences?", "category": "common-sense"}
-{"question_id": 38, "text": "How can observing the behavior of other people in a social situation provide clues about cultural norms and expectations?", "category": "common-sense"}
-{"question_id": 39, "text": "Do we have a moral obligation to explore space, or should we focus on solving Earth's problems first?", "category": "common-sense"}
-{"question_id": 40, "text": "In a world where automation is becoming increasingly prevalent, is it more important to prioritize job creation or technological progress?", "category": "common-sense"}
-{"question_id": 41, "text": "How many times does the average human blink in a lifetime? Try to explain your answer. Your explanation should take the reader through your reasoning step-by-step.", "category": "fermi"}
-{"question_id": 42, "text": "How many atoms are in a grain of salt? Try to explain your answer. Your explanation should take the reader through your reasoning step-by-step.", "category": "fermi"}
-{"question_id": 43, "text": "How many lightning strikes occur on Earth each day? Try to explain your answer. Your explanation should take the reader through your reasoning step-by-step.", "category": "fermi"}
-{"question_id": 44, "text": "How many balloons would it take to lift a house like in the movie \"Up\"? Try to explain your answer. Your explanation should take the reader through your reasoning step-by-step.", "category": "fermi"}
-{"question_id": 45, "text": "How many text messages are sent globally in a minute? Try to explain your answer. Your explanation should take the reader through your reasoning step-by-step.", "category": "fermi"}
-{"question_id": 46, "text": "How many words are spoken daily on Earth? Try to explain your answer. Your explanation should take the reader through your reasoning step-by-step.", "category": "fermi"}
-{"question_id": 47, "text": "How many snowflakes fall during a typical winter? Try to explain your answer. Your explanation should take the reader through your reasoning step-by-step.", "category": "fermi"}
-{"question_id": 48, "text": "How many pages are in all the books ever written? Try to explain your answer. Your explanation should take the reader through your reasoning step-by-step.", "category": "fermi"}
-{"question_id": 49, "text": "How many times has the Earth orbited the Sun since the beginning of life? Try to explain your answer. Your explanation should take the reader through your reasoning step-by-step.", "category": "fermi"}
-{"question_id": 50, "text": "How many songs have been recorded throughout history? Try to explain your answer. Your explanation should take the reader through your reasoning step-by-step.", "category": "fermi"}
-{"question_id": 51, "text": "What if the Internet had been invented during the Renaissance period?", "category": "counterfactual"}
-{"question_id": 52, "text": "What if the Aztecs had successfully repelled the Spanish conquistadors?", "category": "counterfactual"}
-{"question_id": 53, "text": "What if the Black Death had not occurred in the 14th century?", "category": "counterfactual"}
-{"question_id": 54, "text": "What if Isaac Newton had focused on biology instead of physics?", "category": "counterfactual"}
-{"question_id": 55, "text": "What if the Beatles had never formed as a band?", "category": "counterfactual"}
-{"question_id": 56, "text": "What if Alan Turing had not cracked the Enigma code during World War II?", "category": "counterfactual"}
-{"question_id": 57, "text": "What if the Suez Canal had never been constructed?", "category": "counterfactual"}
-{"question_id": 58, "text": "What if the Maya civilization had never mysteriously collapsed?", "category": "counterfactual"}
-{"question_id": 59, "text": "What if Christopher Columbus had not discovered the Americas?", "category": "counterfactual"}
-{"question_id": 60, "text": "What if Vincent van Gogh had been a successful artist during his lifetime?", "category": "counterfactual"}
-{"question_id": 61, "text": "Develop a C++ program that reads a text file line by line and counts the number of occurrences of a specific word in the file.", "category": "coding"}
-{"question_id": 62, "text": "Implement a Python function to find the longest common subsequence of two input strings using dynamic programming.", "category": "coding"}
-{"question_id": 63, "text": "Implement a regular expression in Python to validate an email address.", "category": "coding"}
-{"question_id": 64, "text": "Write a program to find the nth Fibonacci number using dynamic programming.", "category": "coding"}
-{"question_id": 65, "text": "Implement a binary search algorithm to find a specific element in a sorted array.", "category": "coding"}
-{"question_id": 66, "text": "Implement a queue data structure using two stacks in Python.", "category": "coding"}
-{"question_id": 67, "text": "Implement a program to find the common elements in two arrays without using any extra data structures.", "category": "coding"}
-{"question_id": 68, "text": "Given that f(x) = 5x^3 - 2x + 3, find the value of f(2).", "category": "math"}
-{"question_id": 69, "text": "Solve for x in the equation 3x + 10 = 5(x - 2).", "category": "math"}
-{"question_id": 70, "text": "If the endpoints of a line segment are (2, -2) and (10, 4), what is the length of the segment?", "category": "math"}
-{"question_id": 71, "text": "Can you help me write a formal email to a potential business partner proposing a joint venture?", "category": "writing"}
-{"question_id": 72, "text": "Can you help me write a resignation letter to my current employer, while leaving on good terms and expressing gratitude for the opportunities provided?", "category": "writing"}
-{"question_id": 73, "text": "Use an appropriate format to structure a formal letter of recommendation for a student applying to a prestigious graduate program in computer science.", "category": "writing"}
-{"question_id": 74, "text": "Write a compelling product launch announcement email to inform our customers of our new software solution.", "category": "writing"}
-{"question_id": 75, "text": "Draft an apology email to a customer who experienced a delay in their order, and provide reassurance that the issue has been resolved.", "category": "writing"}
-{"question_id": 76, "text": "Write a script for a YouTube video exploring the history and cultural significance of jazz.", "category": "writing"}
-{"question_id": 77, "text": "Compose an engaging travel blog post about a recent trip to Hawaii, highlighting cultural experiences and must-see attractions.", "category": "writing"}
-{"question_id": 78, "text": "Write a captivating movie review for a recently released science fiction film, discussing its plot, characters, and special effects.", "category": "writing"}
-{"question_id": 79, "text": "Structure a podcast script for an episode discussing the influence of streaming platforms on the music industry.", "category": "writing"}
-{"question_id": 80, "text": "Write a symphony concert review, discussing the orchestra's performance and overall audience experience.", "category": "writing"}

evalscope/registry/tasks/arc.yaml DELETED Viewed

@@ -1,28 +0,0 @@
-model_args:    # model args should be followed by benchmark requirements
-  revision: master
-  precision: torch.float16
-  device_map: auto
-#  model_name_or_path: qwen/qwen-7b-chat
-generation_config:
-  temperature: 0.3
-  max_length: 2048
-  max_new_tokens: 512
-  top_k: 50
-  top_p: 0.85
-  do_sample: false
-  num_beams: 1
-  repetition_penalty: 1.0
-#  eos_token_id: null
-#  pad_token_id: null
-dataset_args:
-  arc:
-    prompt_template: 'The following are multiple choice questions, please output correct answer in the form of A or B or C or D, do not output explanation:'
-dry_run: false
-model: null   # Note: to be implemented as CustomModel
-eval_type: custom
-datasets:
-  - arc
-use_cache: false
-stage: all
-dataset_hub: modelscope    # `Local` or `ModelScope`
-limit: null

evalscope/registry/tasks/bbh.yaml DELETED Viewed

@@ -1,26 +0,0 @@
-model_args:    # model args should be followed by benchmark requirements
-  revision: master
-  precision: torch.float16
-  device_map: auto
-#  model_name_or_path: qwen/qwen-7b-chat
-generation_config:
-  temperature: 0.3
-  max_length: 2048
-  max_new_tokens: 512
-  top_k: 50
-  top_p: 0.85
-  do_sample: false
-  num_beams: 1
-  repetition_penalty: 1.0
-#  eos_token_id: null
-#  pad_token_id: null
-dataset_args: {}
-dry_run: false
-model: null   # Note: to be implemented as CustomModel
-eval_type: custom
-datasets:
-  - bbh
-use_cache: false
-stage: all
-dataset_hub: modelscope    # `Local` or `ModelScope`
-limit: null

evalscope/registry/tasks/bbh_mini.yaml DELETED Viewed

@@ -1,26 +0,0 @@
-model_args:    # model args should be followed by benchmark requirements
-  revision: master
-  precision: torch.float16
-  device_map: auto
-#  model_name_or_path: qwen/qwen-7b-chat
-generation_config:
-  temperature: 0.3
-  max_length: 2048
-  max_new_tokens: 512
-  top_k: 50
-  top_p: 0.85
-  do_sample: false
-  num_beams: 1
-  repetition_penalty: 1.0
-#  eos_token_id: null
-#  pad_token_id: null
-dataset_args: {'bbh': {'subset_list': ['temporal_sequences', 'multistep_arithmetic_two']}}
-dry_run: false
-model: null   # Note: to be implemented as CustomModel
-eval_type: custom
-datasets:
-  - bbh
-use_cache: false
-stage: all
-dataset_hub: modelscope    # `Local` or `ModelScope`
-limit: null

evalscope/registry/tasks/ceval.yaml DELETED Viewed

@@ -1,27 +0,0 @@
-model_args:    # model args should be followed by benchmark requirements
-  revision: master
-  precision: torch.float16
-  device_map: auto
-#  model_name_or_path: qwen/qwen-7b-chat
-generation_config:
-  temperature: 0.3
-  max_length: 2048
-  max_new_tokens: 512
-  top_k: 50
-  top_p: 0.85
-  do_sample: false
-  num_beams: 1
-  repetition_penalty: 1.0
-#  eos_token_id: null
-#  pad_token_id: null
-dataset_args: {}
-dry_run: false
-model: null   # Note: to be implemented as CustomModel
-eval_type: custom
-datasets:
-  - ceval
-outputs: null    # structure: configs, logs, predictions, reviews, reports # TODO: need to parse
-use_cache: false
-stage: all
-dataset_hub: modelscope    # `Local` or `ModelScope`
-limit: null

evalscope/registry/tasks/ceval_mini.yaml DELETED Viewed

@@ -1,26 +0,0 @@
-model_args:    # model args should be followed by benchmark requirements
-  revision: master
-  precision: torch.float16
-  device_map: auto
-#  model_name_or_path: qwen/qwen-7b-chat
-generation_config:
-  temperature: 0.3
-  max_length: 2048
-  max_new_tokens: 512
-  top_k: 50
-  top_p: 0.85
-  do_sample: false
-  num_beams: 1
-  repetition_penalty: 1.0
-#  eos_token_id: null
-#  pad_token_id: null
-dataset_args: {'ceval': {'subset_list': ['computer_network', 'operating_system']}}
-dry_run: false
-model: null   # Note: to be implemented as CustomModel
-eval_type: custom
-datasets:
-  - ceval
-use_cache: false
-stage: all
-dataset_hub: modelscope    # `Local` or `ModelScope`
-limit: null

evalscope/registry/tasks/cmmlu.yaml DELETED Viewed

@@ -1,27 +0,0 @@
-model_args:    # model args should be followed by benchmark requirements
-  revision: master
-  precision: torch.float16
-  device_map: auto
-#  model_name_or_path: qwen/qwen-7b-chat
-generation_config:
-  temperature: 0.3
-  max_length: 2048
-  max_new_tokens: 512
-  top_k: 50
-  top_p: 0.85
-  do_sample: false
-  num_beams: 1
-  repetition_penalty: 1.0
-#  eos_token_id: null
-#  pad_token_id: null
-dataset_args: {}
-dry_run: false
-model: null   # Note: to be implemented as CustomModel
-eval_type: custom
-datasets:
-  - cmmlu
-outputs: null    # structure: configs, logs, predictions, reviews, reports # TODO: need to parse
-use_cache: false
-stage: all
-dataset_hub: modelscope    # `Local` or `ModelScope`
-limit: null

evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml DELETED Viewed

@@ -1,28 +0,0 @@
-model_args:    # model args should be followed by benchmark requirements
-  revision: v1.0.0
-  precision: torch.float16
-  device_map: auto
-#  model_name_or_path: qwen/qwen-7b-chat
-generation_config:
-  temperature: 0.3
-  max_length: 2048
-  max_new_tokens: 512
-  top_k: 50
-  top_p: 0.85
-  do_sample: false
-  num_beams: 1
-  repetition_penalty: 1.0
-#  eos_token_id: null
-#  pad_token_id: null
-dataset_args: {}
-dry_run: false
-model: null   # Note: to be implemented as CustomModel
-eval_type: custom
-datasets:
-  - arc
-  - gsm8k
-outputs: ./outputs/eval_qwen-7b-chat_v100    # Directory to save the outputs, structure: logs, predictions, reviews, reports
-use_cache: false
-stage: all
-dataset_hub: modelscope    # `Local` or `ModelScope`
-limit: 10

evalscope/registry/tasks/general_qa.yaml DELETED Viewed

@@ -1,27 +0,0 @@
-model_args:    # model args should be followed by benchmark requirements
-  revision: master
-  precision: torch.float16
-  device_map: auto
-#  model_name_or_path: qwen/qwen-7b-chat
-generation_config:
-  temperature: 0.3
-  max_length: 2048
-  max_new_tokens: 512
-  top_k: 50
-  top_p: 0.85
-  do_sample: true
-  num_beams: 1
-  repetition_penalty: 1.0
-#  eos_token_id: null
-#  pad_token_id: null
-dataset_args: {}
-dry_run: false
-model: null   # Note: to be implemented as CustomModel
-eval_type: custom
-datasets:
-  - general_qa
-outputs: null    # structure: configs, logs, predictions, reviews, reports # TODO: need to parse
-use_cache: false
-stage: all
-dataset_hub: Local    # `Local` or `ModelScope`
-limit: null

evalscope/registry/tasks/gsm8k.yaml DELETED Viewed

@@ -1,29 +0,0 @@
-model_args:    # model args should be followed by benchmark requirements
-  revision: master
-  precision: torch.float16
-  device_map: auto
-#  model_name_or_path: qwen/qwen-7b-chat
-generation_config:
-  temperature: 0.3
-  max_length: 2048
-  max_new_tokens: 512
-  top_k: 50
-  top_p: 0.85
-  do_sample: false
-  num_beams: 1
-  repetition_penalty: 1.0
-#  eos_token_id: null
-#  pad_token_id: null
-dataset_args:
-  gsm8k:
-    few_shot_num: 0
-dry_run: false
-model: null   # Note: to be implemented as CustomModel
-eval_type: custom
-datasets:
-  - gsm8k
-outputs: null    # structure: configs, logs, predictions, reviews, reports # TODO: need to parse
-use_cache: false
-stage: all
-dataset_hub: modelscope    # `Local` or `ModelScope`
-limit: null

evalscope/registry/tasks/mmlu.yaml DELETED Viewed

@@ -1,29 +0,0 @@
-model_args:    # model args should be followed by benchmark requirements
-  revision: master
-  precision: torch.float16
-  device_map: auto
-#  model_name_or_path: qwen/qwen-7b-chat
-generation_config:
-  temperature: 0.3
-  max_length: 2048
-  max_new_tokens: 512
-  top_k: 50
-  top_p: 0.85
-  do_sample: false
-  num_beams: 1
-  repetition_penalty: 1.0
-#  eos_token_id: null
-#  pad_token_id: null
-dataset_args:
-  mmlu:
-    few_shot_num: 0
-dry_run: false
-model: null   # Note: to be implemented as CustomModel
-eval_type: custom
-datasets:
-  - mmlu
-outputs: null    # structure: configs, logs, predictions, reviews, reports # TODO: need to parse
-use_cache: true
-stage: all
-dataset_hub: modelscope    # `Local` or `ModelScope`
-limit: null

evalscope/registry/tasks/mmlu_mini.yaml DELETED Viewed

@@ -1,27 +0,0 @@
-model_args:    # model args should be followed by benchmark requirements
-  revision: master
-  precision: torch.float16
-  device_map: auto
-#  model_name_or_path: qwen/qwen-7b-chat
-generation_config:
-  temperature: 0.3
-  max_length: 2048
-  max_new_tokens: 512
-  top_k: 50
-  top_p: 0.85
-  do_sample: false
-  num_beams: 1
-  repetition_penalty: 1.0
-#  eos_token_id: null
-#  pad_token_id: null
-dataset_args: {'mmlu': {'subset_list': ['high_school_european_history', 'business_ethics']}}
-dry_run: false
-model: null   # Note: to be implemented as CustomModel
-eval_type: custom
-datasets:
-  - mmlu
-outputs: null    # structure: configs, logs, predictions, reviews, reports # TODO: need to parse
-use_cache: false
-stage: all
-dataset_hub: modelscope    # `Local` or `ModelScope`
-limit: null

evalscope/run_arena.py DELETED Viewed

@@ -1,202 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-# flake8: noqa
-import argparse
-import os
-import torch
-from modelscope.utils.hf_util import GenerationConfig
-from pathlib import Path
-from tqdm import tqdm
-from evalscope.constants import EvalConfigKeys
-from evalscope.evaluator.rating_eval import RatingEvaluate
-from evalscope.models import ChatGenerationModelAdapter
-from evalscope.utils import get_obj_from_cfg
-from evalscope.utils.io_utils import dump_jsonl_data, jsonl_to_list, yaml_to_dict
-from evalscope.utils.logger import get_logger
-logger = get_logger()  # module-level logger from evalscope.utils.logger
-WORK_DIR = Path(__file__).absolute().parent  # directory containing this file; config paths below are joined onto it
-class ArenaWorkflow:
-    def __init__(self, cfg_file: str, **kwargs):
-        self.cfg_dict = yaml_to_dict(os.path.join(WORK_DIR, cfg_file))
-        logger.info(f'**Arena Config: {self.cfg_dict}')
-        self.question_file: str = os.path.join(WORK_DIR, self.cfg_dict.get('question_file'))
-        self.answers_gen: dict = self.cfg_dict.get('answers_gen', {})
-        self.reviews_gen: dict = self.cfg_dict.get('reviews_gen', {})
-        self.reviewer_cfg: dict = ArenaWorkflow._get_obj_from_cfg(self.reviews_gen.get('reviewer', {}))
-        self.prompt_file = os.path.join(WORK_DIR, self.reviews_gen.get('prompt_file'))
-        self.review_file = os.path.join(WORK_DIR, self.reviews_gen.get('review_file'))
-        self.rating_gen: dict = self.cfg_dict.get('rating_gen', {})
-        self.report_file: str = os.path.join(WORK_DIR, self.rating_gen.get('report_file'))
-    @staticmethod
-    def _get_obj_from_cfg(obj_cfg: dict):
-        cls_ref = obj_cfg.get(EvalConfigKeys.CLASS_REF, None)
-        if not cls_ref:
-            logger.warning(f'Class reference is not specified in config: {obj_cfg}')
-            return obj_cfg
-        cls = get_obj_from_cfg(cls_ref)
-        obj_cfg[EvalConfigKeys.CLASS_REF] = cls
-        return obj_cfg
-    def _predict_answers(self, model_id_or_path: str, model_revision: str, precision: torch.dtype,
-                         generation_config: GenerationConfig, template_type: str) -> list:
-        # TODO: multi-task to be supported
-        model_adapter = ChatGenerationModelAdapter(
-            model_id=model_id_or_path,
-            model_revision=model_revision,
-            torch_dtype=precision,
-            generation_config=generation_config,
-            template_type=template_type)
-        res_list = []
-        questions_list = jsonl_to_list(self.question_file)
-        for data_d in tqdm(questions_list, total=len(questions_list), desc=f'Predicting(answers):'):
-            # {"question_id": 1, "text": "How can I improve my time management skills?", "category": "generic"}
-            text = data_d.get('text', None)
-            if not text:
-                logger.warning(f'Invalid question: {data_d}')
-                continue
-            prompt = f'Question: {text}\n\nAnswer:'
-            inputs = {'data': [prompt]}
-            res_d: dict = model_adapter.predict(inputs=inputs)
-            ans_text: str = res_d['choices'][0]['message']['content']
-            ans = {
-                'question_id': data_d['question_id'],
-                'text': data_d['text'],
-                'category': data_d['category'],
-                'model_id': model_id_or_path,
-                'metadata': {},
-                'answer': ans_text,
-            }
-            res_list.append(ans)
-        return res_list
-    def get_answers(self):
-        for model_name, cfg_d in self.answers_gen.items():
-            enable = cfg_d.get(EvalConfigKeys.ENABLE, True)
-            if not enable:
-                logger.warning(f'Skip model {model_name} because it is not enabled.')
-                continue
-            model_id_or_path = cfg_d.get(EvalConfigKeys.MODEL_ID_OR_PATH)
-            model_revision = cfg_d.get(EvalConfigKeys.MODEL_REVISION, None)
-            precision = cfg_d.get(EvalConfigKeys.PRECISION, torch.float16)
-            precision = eval(precision) if isinstance(precision, str) else precision
-            custom_generation_config = cfg_d.get(EvalConfigKeys.GENERATION_CONFIG, {})
-            custom_generation_config = GenerationConfig(**custom_generation_config)
-            ans_output_file = os.path.join(WORK_DIR, cfg_d.get(EvalConfigKeys.OUTPUT_FILE))
-            template_type = cfg_d.get(EvalConfigKeys.TEMPLATE_TYPE)
-            answers_list = self._predict_answers(
-                model_id_or_path=model_id_or_path,
-                model_revision=model_revision,
-                precision=precision,
-                generation_config=custom_generation_config,
-                template_type=template_type)
-            os.makedirs(os.path.dirname(ans_output_file), exist_ok=True)
-            dump_jsonl_data(answers_list, ans_output_file)
-            logger.info(f'Answers generated by model {model_name} and saved to {ans_output_file}')
-    def get_reviews(self, dry_run: bool = False):
-        enable = self.reviews_gen.get(EvalConfigKeys.ENABLE, True)
-        if enable:
-            reviewer_cls = self.reviewer_cfg.get(EvalConfigKeys.CLASS_REF)
-            if not reviewer_cls:
-                logger.warning('Skip reviews generation because class reference is not specified.')
-                return
-            reviewer_args = self.reviewer_cfg.get(EvalConfigKeys.CLASS_ARGS, {})
-            target_answers = self.reviews_gen.get('target_answers')
-            if target_answers is None:
-                # Get all answers from answers_gen config if target_answers is None
-                target_answers = [item[EvalConfigKeys.OUTPUT_FILE] for item in self.answers_gen.values()]
-            target_answers = [os.path.join(WORK_DIR, item) for item in target_answers]
-            target_answers = [file_path for file_path in target_answers if os.path.exists(file_path)]
-            baseline_file = self.reviews_gen.get('baseline_file', None)
-            if baseline_file:
-                baseline_file = os.path.join(WORK_DIR, baseline_file)
-            reference_file = self.reviews_gen.get('reference_file', None)
-            if reference_file:
-                reference_file = os.path.join(WORK_DIR, reference_file)
-            cache_file = self.reviews_gen.get('cache_file', None)
-            if cache_file:
-                cache_file = os.path.join(WORK_DIR, cache_file)
-            input_kwargs = dict(
-                prompt_file=self.prompt_file,
-                answer_file_list=target_answers,
-                review_result_file=self.review_file,
-                baseline_file=baseline_file,
-                reference_file=reference_file,
-                reviewer_args=reviewer_args,
-                cache_file=cache_file)
-            reviewer_obj = reviewer_cls(**input_kwargs)
-            reviewer_obj.run(dry_run=dry_run)
-            logger.info(f'Reviews with generated by reviewer and saved to {self.review_file}')
-        else:
-            logger.warning('Skip reviews generation because it is not enabled.')
-    def get_rating_results(self):
-        enable = self.rating_gen.get(EvalConfigKeys.ENABLE, True)
-        if enable:
-            report_file = os.path.join(WORK_DIR, self.rating_gen.get('report_file'))
-            metrics = self.rating_gen.get('metrics', ['elo'])
-            baseline_model = self.rating_gen.get('baseline_model') if metrics[0] == 'pairwise' else None
-            ae = RatingEvaluate(metrics=metrics, baseline_model=baseline_model)
-            res_list = ae.run(self.review_file)
-            rating_df = res_list[0]
-            logger.info(f'Rating results:\n{rating_df.to_csv()}')
-            os.makedirs(os.path.dirname(report_file), exist_ok=True)
-            rating_df.to_csv(report_file, index=True)
-            logger.info(f'Rating results are saved to {report_file}')
-        else:
-            logger.warning('Skip rating because it is not enabled.')
-    def run(self, dry_run: bool = False):
-        # Get all answers
-        self.get_answers()
-        # Get all reviews
-        self.get_reviews(dry_run=dry_run)
-        # Get rating results
-        self.get_rating_results()
-        logger.info('*** Arena workflow is finished. ***')
-def main():
-    # Usage: python evalscope/run_arena.py -c /path/to/xxx_cfg_arena.yaml
-    parser = argparse.ArgumentParser(description='LLMs evaluations with arena mode.')
-    parser.add_argument('-c', '--cfg-file', required=True)
-    parser.add_argument('--dry-run', action='store_true', default=False)
-    args = parser.parse_args()
-    arena_workflow = ArenaWorkflow(cfg_file=args.cfg_file)
-    arena_workflow.run(dry_run=args.dry_run)
-if __name__ == '__main__':
-    main()

evalscope 0.16.2__py3-none-any.whl → 0.17.0__py3-none-any.whl

Potentially problematic release.

evalscope 0.16.2py3-none-any.whl → 0.17.0py3-none-any.whl