evalscope 0.6.1__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +230 -0
  2. evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt +43 -0
  3. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +87 -0
  4. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +36 -0
  5. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +26 -0
  6. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +41 -0
  7. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +60 -0
  8. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +36 -0
  9. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +22 -0
  10. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +35 -0
  11. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +7 -0
  12. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +39 -0
  13. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +7 -0
  14. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +39 -0
  15. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +34 -0
  16. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +36 -0
  17. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +25 -0
  18. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +7 -0
  19. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +39 -0
  20. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +16 -0
  21. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +24 -0
  22. evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +18 -0
  23. evalscope/backend/vlm_eval_kit/backend_manager.py +23 -21
  24. evalscope/benchmarks/ceval/samples.jsonl +1 -0
  25. evalscope/benchmarks/cmmlu/samples.jsonl +5 -0
  26. evalscope/benchmarks/mmlu/samples.jsonl +5 -0
  27. evalscope/benchmarks/race/samples.jsonl +5 -0
  28. evalscope/benchmarks/trivia_qa/samples.jsonl +5 -0
  29. evalscope/cli/start_perf.py +8 -11
  30. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +58485 -0
  31. evalscope/metrics/resources/gpt2-zhcn3-v4.json +1 -0
  32. evalscope/metrics/rouge_metric.py +30 -15
  33. evalscope/perf/arguments.py +179 -0
  34. evalscope/perf/benchmark.py +245 -0
  35. evalscope/perf/http_client.py +127 -711
  36. evalscope/perf/main.py +35 -0
  37. evalscope/perf/plugin/__init__.py +2 -0
  38. evalscope/perf/plugin/api/__init__.py +3 -0
  39. evalscope/perf/{api_plugin_base.py → plugin/api/base.py} +17 -18
  40. evalscope/perf/{custom_api.py → plugin/api/custom_api.py} +25 -19
  41. evalscope/perf/{dashscope_api.py → plugin/api/dashscope_api.py} +28 -14
  42. evalscope/perf/{openai_api.py → plugin/api/openai_api.py} +51 -27
  43. evalscope/perf/plugin/datasets/__init__.py +6 -0
  44. evalscope/perf/{dataset_plugin_base.py → plugin/datasets/base.py} +13 -10
  45. evalscope/perf/plugin/datasets/custom.py +21 -0
  46. evalscope/perf/plugin/datasets/flickr8k.py +51 -0
  47. evalscope/perf/{datasets → plugin/datasets}/line_by_line.py +9 -5
  48. evalscope/perf/plugin/datasets/longalpaca.py +28 -0
  49. evalscope/perf/plugin/datasets/openqa.py +38 -0
  50. evalscope/perf/plugin/datasets/speed_benchmark.py +50 -0
  51. evalscope/perf/plugin/registry.py +54 -0
  52. evalscope/perf/{how_to_analysis_result.py → utils/analysis_result.py} +11 -5
  53. evalscope/perf/utils/benchmark_util.py +135 -0
  54. evalscope/perf/utils/chat_service.py +252 -0
  55. evalscope/perf/utils/db_util.py +200 -0
  56. evalscope/perf/utils/handler.py +46 -0
  57. evalscope/perf/utils/local_server.py +139 -0
  58. evalscope/registry/config/cfg_arena.yaml +77 -0
  59. evalscope/registry/config/cfg_arena_zhihu.yaml +63 -0
  60. evalscope/registry/config/cfg_pairwise_baseline.yaml +83 -0
  61. evalscope/registry/config/cfg_single.yaml +78 -0
  62. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +8 -0
  63. evalscope/registry/data/prompt_template/prompt_templates.jsonl +8 -0
  64. evalscope/registry/data/qa_browser/battle.jsonl +634 -0
  65. evalscope/registry/data/qa_browser/category_mapping.yaml +10 -0
  66. evalscope/registry/data/question.jsonl +80 -0
  67. evalscope/third_party/longbench_write/README.md +118 -0
  68. evalscope/third_party/longbench_write/default_task.json +27 -0
  69. evalscope/third_party/longbench_write/default_task.yaml +24 -0
  70. evalscope/third_party/toolbench_static/README.md +118 -0
  71. evalscope/third_party/toolbench_static/config_default.json +15 -0
  72. evalscope/third_party/toolbench_static/config_default.yaml +12 -0
  73. evalscope/third_party/toolbench_static/requirements.txt +2 -0
  74. evalscope/utils/logger.py +18 -20
  75. evalscope/utils/utils.py +41 -42
  76. evalscope/version.py +2 -2
  77. evalscope-0.7.0.dist-info/LICENSE +203 -0
  78. {evalscope-0.6.1.dist-info → evalscope-0.7.0.dist-info}/METADATA +91 -33
  79. {evalscope-0.6.1.dist-info → evalscope-0.7.0.dist-info}/RECORD +99 -29
  80. {evalscope-0.6.1.dist-info → evalscope-0.7.0.dist-info}/WHEEL +1 -1
  81. {evalscope-0.6.1.dist-info → evalscope-0.7.0.dist-info}/top_level.txt +1 -0
  82. tests/cli/__init__.py +1 -0
  83. tests/cli/test_run.py +76 -0
  84. tests/perf/__init__.py +1 -0
  85. tests/perf/test_perf.py +96 -0
  86. tests/rag/test_clip_benchmark.py +85 -0
  87. tests/rag/test_mteb.py +136 -0
  88. tests/rag/test_ragas.py +120 -0
  89. tests/swift/__init__.py +1 -0
  90. tests/swift/test_run_swift_eval.py +146 -0
  91. tests/swift/test_run_swift_vlm_eval.py +128 -0
  92. tests/swift/test_run_swift_vlm_jugde_eval.py +157 -0
  93. tests/test_run_all.py +12 -0
  94. tests/vlm/__init__.py +1 -0
  95. tests/vlm/test_vlmeval.py +59 -0
  96. evalscope/perf/_logging.py +0 -32
  97. evalscope/perf/datasets/longalpaca_12k.py +0 -20
  98. evalscope/perf/datasets/openqa.py +0 -22
  99. evalscope/perf/plugin_registry.py +0 -35
  100. evalscope/perf/query_parameters.py +0 -42
  101. evalscope/perf/server_sent_event.py +0 -43
  102. evalscope/preprocess/tokenizers/gpt2_tokenizer.py +0 -221
  103. /evalscope/perf/{datasets → utils}/__init__.py +0 -0
  104. {evalscope-0.6.1.dist-info → evalscope-0.7.0.dist-info}/entry_points.txt +0 -0
  105. {evalscope/preprocess → tests}/__init__.py +0 -0
  106. {evalscope/preprocess/tokenizers → tests/rag}/__init__.py +0 -0
@@ -0,0 +1,10 @@
+ 文本生成: ['写作', '续写生成', '通用写作', '应用文写作', '头脑风暴', '开放对话', '角色扮演']
+ 文本理解:
+ ['文本点评', '文本摘要', '润色/纠错', '文本分类', '情感分析', '信息抽取', '文本聚类', '序列处理']
+ 知识问答: ['百科知识问答']
+ 数学解题: ['数学解题']
+ 逻辑推理: ['逻辑推理', '阅读理解']
+ CODING: ['CODING', '表格处理']
+ 翻译: ['翻译']
+ 安全风险: ['有伤害性']
+ 其他: ['*']
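For orientation, a minimal sketch of how a mapping like `category_mapping.yaml` could be consumed, inverting it so a fine-grained tag resolves to its top-level category. The loader below is illustrative only and is not part of the package:

```python
import yaml  # assumes PyYAML is installed

with open('category_mapping.yaml', encoding='utf-8') as f:
    mapping = yaml.safe_load(f)  # {top-level category: [fine-grained tags]}

# Invert the mapping: fine-grained tag -> top-level category.
tag_to_category = {tag: cat for cat, tags in mapping.items() for tag in tags}

def categorize(tag: str) -> str:
    # '其他' ("other") carries the catch-all '*' entry in the file above.
    return tag_to_category.get(tag, '其他')
```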
@@ -0,0 +1,80 @@
+ {"question_id": 1, "text": "How can I improve my time management skills?", "category": "generic"}
+ {"question_id": 2, "text": "What are the most effective ways to deal with stress?", "category": "generic"}
+ {"question_id": 3, "text": "What are the main differences between Python and JavaScript programming languages?", "category": "generic"}
+ {"question_id": 4, "text": "How can I increase my productivity while working from home?", "category": "generic"}
+ {"question_id": 5, "text": "Can you explain the basics of quantum computing?", "category": "generic"}
+ {"question_id": 6, "text": "What are the differences between plant-based and animal-based protein sources?", "category": "generic"}
+ {"question_id": 7, "text": "How can I develop my critical thinking skills?", "category": "generic"}
+ {"question_id": 8, "text": "What are the major challenges faced by the education sector today?", "category": "generic"}
+ {"question_id": 9, "text": "What are the primary factors that influence consumer behavior?", "category": "generic"}
+ {"question_id": 10, "text": "What are the most effective strategies for conflict resolution in the workplace?", "category": "generic"}
+ {"question_id": 11, "text": "What are some potential implications of using a single-use plastic bottle versus a reusable bottle on both the environment and human health?", "category": "knowledge"}
+ {"question_id": 12, "text": "What factors would you consider when designing an inclusive and accessible public transportation system?", "category": "knowledge"}
+ {"question_id": 13, "text": "How can governments utilize fiscal and monetary policies to combat economic recessions?", "category": "knowledge"}
+ {"question_id": 14, "text": "How do language and cultural barriers affect the way people communicate and form relationships in multicultural societies?", "category": "knowledge"}
+ {"question_id": 15, "text": "Describe a scenario where artificial intelligence could be used to improve the quality and efficiency of healthcare delivery.", "category": "knowledge"}
+ {"question_id": 16, "text": "Explain the process of gene editing using CRISPR-Cas9 technology, and discuss its potential applications and ethical implications.", "category": "knowledge"}
+ {"question_id": 17, "text": "How do vaccinations work to protect individuals and communities from infectious diseases, and what is herd immunity?", "category": "knowledge"}
+ {"question_id": 18, "text": "How do social media platforms influence the way people consume and share news, and what are the potential implications for the spread of misinformation?", "category": "knowledge"}
+ {"question_id": 19, "text": "How do cultural, social, and economic factors influence people's food choices, and how can this knowledge be used to promote healthier diets?", "category": "knowledge"}
+ {"question_id": 20, "text": "Explain the process of natural selection and how it contributes to the evolution and adaptation of species.", "category": "knowledge"}
+ {"question_id": 21, "text": "How would you introduce yourself as a medieval knight at a royal banquet?", "category": "roleplay"}
+ {"question_id": 22, "text": "As a pirate captain, what would you say to your crew to motivate them to search for hidden treasure?", "category": "roleplay"}
+ {"question_id": 23, "text": "If you were a Shakespearean character, how would you declare your love for someone in a soliloquy?", "category": "roleplay"}
+ {"question_id": 24, "text": "As a superhero, how would you explain your origin story to a curious child?", "category": "roleplay"}
+ {"question_id": 25, "text": "Imagine you are a time traveler from the year 3000. What technological advancements would you tell people about?", "category": "roleplay"}
+ {"question_id": 26, "text": "As a sports commentator, describe the winning play in the final seconds of a championship game.", "category": "roleplay"}
+ {"question_id": 27, "text": "Pretend to be a world-famous chef. How would you describe your signature dish to a panel of judges?", "category": "roleplay"}
+ {"question_id": 28, "text": "You are a mountain climber reaching the summit of Mount Everest. Describe your emotions and the view from the top.", "category": "roleplay"}
+ {"question_id": 29, "text": "As a space colonist on Mars, describe your daily life and the challenges you face living on another planet.", "category": "roleplay"}
+ {"question_id": 30, "text": "Pretend to be a character in a post-apocalyptic world. Describe how you survive and the allies you encounter.", "category": "roleplay"}
+ {"question_id": 31, "text": "How can you determine if a restaurant is popular among locals or mainly attracts tourists, and why might this information be useful?", "category": "common-sense"}
+ {"question_id": 32, "text": "What are some subtle clues that suggest someone is pretending to understand a topic or conversation when they are actually confused or uninformed?", "category": "common-sense"}
+ {"question_id": 33, "text": "Why might someone choose to use a paper map or ask for directions instead of relying on a GPS device or smartphone app?", "category": "common-sense"}
+ {"question_id": 34, "text": "How can you determine if a person is genuinely interested in a conversation or simply being polite?", "category": "common-sense"}
+ {"question_id": 35, "text": "Why might someone prefer to shop at a small, locally-owned business instead of a large chain store, even if the prices are higher?", "category": "common-sense"}
+ {"question_id": 36, "text": "How can you assess the credibility of a source of information, such as a news article or blog post, without relying solely on the reputation of the author or publisher?", "category": "common-sense"}
+ {"question_id": 37, "text": "Why do some people enjoy the sensation of being scared, such as by watching horror movies or going on roller coasters, while others avoid these experiences?", "category": "common-sense"}
+ {"question_id": 38, "text": "How can observing the behavior of other people in a social situation provide clues about cultural norms and expectations?", "category": "common-sense"}
+ {"question_id": 39, "text": "Do we have a moral obligation to explore space, or should we focus on solving Earth's problems first?", "category": "common-sense"}
+ {"question_id": 40, "text": "In a world where automation is becoming increasingly prevalent, is it more important to prioritize job creation or technological progress?", "category": "common-sense"}
+ {"question_id": 41, "text": "How many times does the average human blink in a lifetime? Try to explain your answer. Your explanation should take the reader through your reasoning step-by-step.", "category": "fermi"}
+ {"question_id": 42, "text": "How many atoms are in a grain of salt? Try to explain your answer. Your explanation should take the reader through your reasoning step-by-step.", "category": "fermi"}
+ {"question_id": 43, "text": "How many lightning strikes occur on Earth each day? Try to explain your answer. Your explanation should take the reader through your reasoning step-by-step.", "category": "fermi"}
+ {"question_id": 44, "text": "How many balloons would it take to lift a house like in the movie \"Up\"? Try to explain your answer. Your explanation should take the reader through your reasoning step-by-step.", "category": "fermi"}
+ {"question_id": 45, "text": "How many text messages are sent globally in a minute? Try to explain your answer. Your explanation should take the reader through your reasoning step-by-step.", "category": "fermi"}
+ {"question_id": 46, "text": "How many words are spoken daily on Earth? Try to explain your answer. Your explanation should take the reader through your reasoning step-by-step.", "category": "fermi"}
+ {"question_id": 47, "text": "How many snowflakes fall during a typical winter? Try to explain your answer. Your explanation should take the reader through your reasoning step-by-step.", "category": "fermi"}
+ {"question_id": 48, "text": "How many pages are in all the books ever written? Try to explain your answer. Your explanation should take the reader through your reasoning step-by-step.", "category": "fermi"}
+ {"question_id": 49, "text": "How many times has the Earth orbited the Sun since the beginning of life? Try to explain your answer. Your explanation should take the reader through your reasoning step-by-step.", "category": "fermi"}
+ {"question_id": 50, "text": "How many songs have been recorded throughout history? Try to explain your answer. Your explanation should take the reader through your reasoning step-by-step.", "category": "fermi"}
+ {"question_id": 51, "text": "What if the Internet had been invented during the Renaissance period?", "category": "counterfactual"}
+ {"question_id": 52, "text": "What if the Aztecs had successfully repelled the Spanish conquistadors?", "category": "counterfactual"}
+ {"question_id": 53, "text": "What if the Black Death had not occurred in the 14th century?", "category": "counterfactual"}
+ {"question_id": 54, "text": "What if Isaac Newton had focused on biology instead of physics?", "category": "counterfactual"}
+ {"question_id": 55, "text": "What if the Beatles had never formed as a band?", "category": "counterfactual"}
+ {"question_id": 56, "text": "What if Alan Turing had not cracked the Enigma code during World War II?", "category": "counterfactual"}
+ {"question_id": 57, "text": "What if the Suez Canal had never been constructed?", "category": "counterfactual"}
+ {"question_id": 58, "text": "What if the Maya civilization had never mysteriously collapsed?", "category": "counterfactual"}
+ {"question_id": 59, "text": "What if Christopher Columbus had not discovered the Americas?", "category": "counterfactual"}
+ {"question_id": 60, "text": "What if Vincent van Gogh had been a successful artist during his lifetime?", "category": "counterfactual"}
+ {"question_id": 61, "text": "Develop a C++ program that reads a text file line by line and counts the number of occurrences of a specific word in the file.", "category": "coding"}
+ {"question_id": 62, "text": "Implement a Python function to find the longest common subsequence of two input strings using dynamic programming.", "category": "coding"}
+ {"question_id": 63, "text": "Implement a regular expression in Python to validate an email address.", "category": "coding"}
+ {"question_id": 64, "text": "Write a program to find the nth Fibonacci number using dynamic programming.", "category": "coding"}
+ {"question_id": 65, "text": "Implement a binary search algorithm to find a specific element in a sorted array.", "category": "coding"}
+ {"question_id": 66, "text": "Implement a queue data structure using two stacks in Python.", "category": "coding"}
+ {"question_id": 67, "text": "Implement a program to find the common elements in two arrays without using any extra data structures.", "category": "coding"}
+ {"question_id": 68, "text": "Given that f(x) = 5x^3 - 2x + 3, find the value of f(2).", "category": "math"}
+ {"question_id": 69, "text": "Solve for x in the equation 3x + 10 = 5(x - 2).", "category": "math"}
+ {"question_id": 70, "text": "If the endpoints of a line segment are (2, -2) and (10, 4), what is the length of the segment?", "category": "math"}
+ {"question_id": 71, "text": "Can you help me write a formal email to a potential business partner proposing a joint venture?", "category": "writing"}
+ {"question_id": 72, "text": "Can you help me write a resignation letter to my current employer, while leaving on good terms and expressing gratitude for the opportunities provided?", "category": "writing"}
+ {"question_id": 73, "text": "Use an appropriate format to structure a formal letter of recommendation for a student applying to a prestigious graduate program in computer science.", "category": "writing"}
+ {"question_id": 74, "text": "Write a compelling product launch announcement email to inform our customers of our new software solution.", "category": "writing"}
+ {"question_id": 75, "text": "Draft an apology email to a customer who experienced a delay in their order, and provide reassurance that the issue has been resolved.", "category": "writing"}
+ {"question_id": 76, "text": "Write a script for a YouTube video exploring the history and cultural significance of jazz.", "category": "writing"}
+ {"question_id": 77, "text": "Compose an engaging travel blog post about a recent trip to Hawaii, highlighting cultural experiences and must-see attractions.", "category": "writing"}
+ {"question_id": 78, "text": "Write a captivating movie review for a recently released science fiction film, discussing its plot, characters, and special effects.", "category": "writing"}
+ {"question_id": 79, "text": "Structure a podcast script for an episode discussing the influence of streaming platforms on the music industry.", "category": "writing"}
+ {"question_id": 80, "text": "Write a symphony concert review, discussing the orchestra's performance and overall audience experience.", "category": "writing"}
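A minimal sketch of reading this question file and grouping the prompts by category; the loader below is illustrative and not part of the diff:

```python
import json
from collections import defaultdict

by_category = defaultdict(list)
with open('question.jsonl', encoding='utf-8') as f:
    for line in f:
        record = json.loads(line)  # {"question_id": ..., "text": ..., "category": ...}
        by_category[record['category']].append(record['text'])

# e.g. {'generic': 10, 'knowledge': 10, ..., 'coding': 7, 'math': 3, 'writing': 10}
print({cat: len(texts) for cat, texts in by_category.items()})
```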
@@ -0,0 +1,118 @@
+
+ ## Description
+ LongWriter supports generating 10,000+ words from long-context LLMs.
+ The LongBench-Write benchmark focuses on measuring both the quality and the length of long generated output.
+
+ Refer to https://github.com/THUDM/LongWriter
+
+ ## Usage
+
+ ### Installation
+
+ ```bash
+ pip install evalscope[framework]
+ ```
+
+ ### Task configuration
+
+ There are three ways to configure the task: dict, JSON, and YAML.
+
+ 1. Configuration with dict:
+
+ ```python
+ task_cfg = dict(stage=['infer', 'eval_l', 'eval_q'],
+                 model='ZhipuAI/LongWriter-glm4-9b',
+                 input_data_path=None,
+                 output_dir='./outputs',
+                 openai_api_key=None,
+                 openai_gpt_model='gpt-4o-2024-05-13',
+                 infer_generation_kwargs={
+                     'max_new_tokens': 32768,
+                     'temperature': 0.5
+                 },
+                 eval_generation_kwargs={
+                     'max_new_tokens': 1024,
+                     'temperature': 0.5,
+                     'stop': None
+                 },
+                 proc_num=8)
+
+ ```
+ - Arguments:
+   - `stage`: The stages to run: `infer` runs inference, `eval_l` evaluates output length, and `eval_q` evaluates output quality.
+   - `model`: Model ID on the ModelScope hub, or a local model directory.
+   - `input_data_path`: Input data path. Defaults to `None`, which uses [longbench_write](resources/longbench_write.jsonl).
+   - `output_dir`: Output root directory.
+   - `openai_api_key`: OpenAI API key, required when the `eval_q` stage uses `Model-as-Judge`. Defaults to `None` if not needed.
+   - `openai_gpt_model`: Judge model name from OpenAI. Defaults to `gpt-4o-2024-05-13`.
+   - `infer_generation_kwargs`: The generation kwargs for the models being evaluated.
+   - `eval_generation_kwargs`: The generation kwargs for the judge models.
+   - `proc_num`: Number of worker processes.
+
+
+ 2. Configuration with JSON (Optional):
+
+ ```json
+ {
+     "stage": ["infer", "eval_l", "eval_q"],
+     "model": "ZhipuAI/LongWriter-glm4-9b",
+     "input_data_path": null,
+     "output_dir": "./outputs",
+     "openai_api_key": null,
+     "openai_gpt_model": "gpt-4o-2024-05-13",
+     "infer_generation_kwargs": {
+         "max_new_tokens": 32768,
+         "temperature": 0.5
+     },
+     "eval_generation_kwargs": {
+         "max_new_tokens": 1024,
+         "temperature": 0.5,
+         "stop": null
+     },
+     "proc_num": 8
+ }
+ ```
+ Refer to [default_task.json](default_task.json) for more details.
+
+
+ 3. Configuration with YAML (Optional):
+
+ ```yaml
+ stage:
+   - infer
+   - eval_l
+   - eval_q
+ model: ZhipuAI/LongWriter-glm4-9b
+ input_data_path: null
+ output_dir: ./outputs
+ openai_api_key: null
+ openai_gpt_model: gpt-4o-2024-05-13
+ infer_generation_kwargs:
+   max_new_tokens: 32768
+   temperature: 0.5
+ eval_generation_kwargs:
+   max_new_tokens: 1024
+   temperature: 0.5
+   stop: null
+ proc_num: 8
+
+ ```
+ Refer to [default_task.yaml](default_task.yaml) for more details.
+
+
+
+ ### Run the task
+
+ ```python
+ from evalscope.third_party.longbench_write import run_task
+
+ run_task(task_cfg=task_cfg)
+ ```
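If the configuration was saved as JSON or YAML instead of being built as a dict, passing the file path to `run_task` may work the same way it does for the ToolBench-Static task later in this diff; this is an assumption by analogy, so check the task's entry point before relying on it:

```python
# Assumption: run_task accepts a config file path, mirroring the
# ToolBench-Static usage shown further down in this diff.
run_task(task_cfg='default_task.yaml')
```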
+
+
+ ### Results and metrics
+ See `eval_length.jsonl` and `eval_quality.jsonl` in the output directory.
+
+ - Metrics (a small averaging sketch follows this list):
+   - `score_l`: The average score of the length evaluation.
+   - `score_q`: The average score of the quality evaluation.
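As a sketch of what averaging these per-sample files could look like: the snippet below assumes each JSONL line carries a numeric `score` field, which is a hypothetical name; inspect the actual output files for the real schema:

```python
import json

def average_score(path: str, field: str = 'score') -> float:
    # 'score' is an assumed field name, not a documented schema.
    with open(path, encoding='utf-8') as f:
        scores = [float(json.loads(line)[field]) for line in f if line.strip()]
    return sum(scores) / len(scores)

# e.g. score_l = average_score('./outputs/eval_length.jsonl')
```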
@@ -0,0 +1,27 @@
+ {
+     "stage": ["infer", "eval_l", "eval_q"],
+     "model": "ZhipuAI/LongWriter-glm4-9b",
+     "input_data_path": null,
+     "output_dir": "./outputs",
+     "infer_config": {
+         "openai_api_base": "http://127.0.0.1:8000/v1/chat/completions",
+         "is_chat": true,
+         "verbose": false,
+         "generation_kwargs": {
+             "max_new_tokens": 32768,
+             "temperature": 0.5,
+             "repetition_penalty": 1.0
+         }
+     },
+     "eval_config": {
+         "openai_api_key": "YOUR_OPENAI_API_KEY",
+         "openai_api_base": "https://api.openai.com/v1/chat/completions",
+         "openai_gpt_model": "gpt-4o-2024-05-13",
+         "generation_kwargs": {
+             "max_new_tokens": 1024,
+             "temperature": 0.5,
+             "stop": null
+         },
+         "proc_num": 16
+     }
+ }
@@ -0,0 +1,24 @@
+ stage:
+   - infer
+   - eval_l
+   - eval_q
+ model: ZhipuAI/LongWriter-glm4-9b
+ input_data_path: null
+ output_dir: './outputs'
+ infer_config:
+   openai_api_base: 'http://127.0.0.1:8000/v1/chat/completions'
+   is_chat: true
+   verbose: false
+   generation_kwargs:
+     max_new_tokens: 32768
+     temperature: 0.5
+     repetition_penalty: 1.0
+ eval_config:
+   openai_api_key: 'YOUR_OPENAI_API_KEY'
+   openai_api_base: 'https://api.openai.com/v1/chat/completions'
+   openai_gpt_model: 'gpt-4o-2024-05-13'
+   generation_kwargs:
+     max_new_tokens: 1024
+     temperature: 0.5
+     stop: null
+   proc_num: 16
@@ -0,0 +1,118 @@
+
+ ## Description
+ We evaluate tool-learning effectiveness on the [ToolBench](https://arxiv.org/pdf/2307.16789) benchmark (Qin et al., 2023b). The tasks involve integrating API calls to accomplish user requests, where the agent must accurately select the appropriate API and compose the necessary API requests.
+
+ Moreover, we partition the test set of ToolBench into in-domain and out-of-domain subsets, based on whether the tools used in the test instances were seen during training.
+
+ This division allows us to evaluate performance in both in-distribution and out-of-distribution scenarios. We call this dataset `ToolBench-Static`.
+
+ For more details, please refer to: [Small LLMs Are Weak Tool Learners: A Multi-LLM Agent](https://arxiv.org/abs/2401.07324)
+
+ ## Dataset
+
+ - Dataset statistics:
+   - Number of in_domain samples: 1588
+   - Number of out_domain samples: 781
+
+ ## Usage
+
+ ### Installation
+
+ ```bash
+ pip install evalscope -U
+ pip install ms-swift -U
+ pip install rouge -U
+ ```
+
+
+ ### Download the dataset
+
+ ```bash
+ wget https://modelscope.oss-cn-beijing.aliyuncs.com/open_data/toolbench-static/data.zip
+ ```
+
+
+ ### Unzip the dataset
+
+ ```bash
+ unzip data.zip
+ # The dataset will be unzipped to the `/path/to/data/toolbench_static` folder
+ ```
+
+
+ ### Task configuration
+
+ There are two ways to configure the task: dict and YAML.
+
+ 1. Configuration with dict:
+
+ ```python
+ your_task_config = {
+     'infer_args': {
+         'model_name_or_path': '/path/to/model_dir',
+         'model_type': 'qwen2-7b-instruct',
+         'data_path': 'data/toolbench_static',
+         'output_dir': 'output_res',
+         'deploy_type': 'swift',
+         'max_new_tokens': 2048,
+         'num_infer_samples': None
+     },
+     'eval_args': {
+         'input_path': 'output_res',
+         'output_path': 'output_res'
+     }
+ }
+ ```
+ - Arguments:
+   - `model_name_or_path`: The path to the local model directory.
+   - `model_type`: The model type; refer to the [list of supported models](https://github.com/modelscope/swift/blob/main/docs/source/LLM/%E6%94%AF%E6%8C%81%E7%9A%84%E6%A8%A1%E5%9E%8B%E5%92%8C%E6%95%B0%E6%8D%AE%E9%9B%86.md).
+   - `data_path`: The path to the dataset directory containing the `in_domain.json` and `out_of_domain.json` files.
+   - `output_dir`: The path to the output directory. Defaults to `output_res`.
+   - `deploy_type`: The deployment type. Defaults to `swift`.
+   - `max_new_tokens`: The maximum number of tokens to generate.
+   - `num_infer_samples`: The number of samples to infer. Defaults to `None`, which means all samples are inferred.
+   - `input_path`: The input directory for evaluation; should match the `output_dir` of `infer_args`.
+   - `output_path`: The output directory for evaluation.
+
+
+ 2. Configuration with YAML:
+
+ ```yaml
+ infer_args:
+   model_name_or_path: /path/to/model_dir  # absolute path is recommended
+   model_type: qwen2-7b-instruct
+   data_path: /path/to/data/toolbench_static  # absolute path is recommended
+   deploy_type: swift
+   max_new_tokens: 2048
+   num_infer_samples: null
+   output_dir: output_res
+ eval_args:
+   input_path: output_res
+   output_path: output_res
+ ```
+ Refer to [config_default.yaml](config_default.yaml) for more details.
+
+
+ ### Run the task
+
+ ```python
+ from evalscope.third_party.toolbench_static import run_task
+
+ # Run the task with dict configuration
+ run_task(task_cfg=your_task_config)
+
+ # Run the task with yaml configuration
+ run_task(task_cfg='/path/to/your_task_config.yaml')
+ ```
+
+
+ ### Results and metrics
+
+ - Metrics (a small exact-match sketch follows this list):
+   - `Plan.EM`: Exact-match score of the agent's per-step planning decision: invoking a tool, generating an answer, or giving up.
+   - `Act.EM`: Exact-match score of the action, including the tool name and its arguments.
+   - `HalluRate` (lower is better): The rate at which the agent hallucinates in its answers at each step.
+   - `Avg.F1`: The average F1 score of the agent's tool calls at each step.
+   - `R-L`: The Rouge-L score of the agent's answers at each step.
+
+ Generally, we focus on the `Act.EM`, `HalluRate`, and `Avg.F1` metrics.
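To make the exact-match idea behind `Act.EM` concrete, here is a minimal sketch. The key names (`tool_name`, `arguments`) are hypothetical, and this is not the package's actual scorer:

```python
def action_exact_match(pred: dict, gold: dict) -> bool:
    """Exact match on both the tool name and its arguments for one step."""
    # 'tool_name' and 'arguments' are assumed key names, not the real schema.
    return (pred.get('tool_name') == gold.get('tool_name')
            and pred.get('arguments') == gold.get('arguments'))

def act_em(preds: list, golds: list) -> float:
    """Fraction of steps whose predicted action exactly matches the gold action."""
    if not golds:
        return 0.0
    return sum(action_exact_match(p, g) for p, g in zip(preds, golds)) / len(golds)
```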
@@ -0,0 +1,15 @@
+ {
+     "infer_args": {
+         "model_name_or_path": "/path/to/model_dir",
+         "model_type": "qwen2-7b-instruct",
+         "data_path": "/path/to/data/toolbench_static",
+         "deploy_type": "swift",
+         "max_new_tokens": 2048,
+         "num_infer_samples": null,
+         "output_dir": "output_res"
+     },
+     "eval_args": {
+         "input_path": "output_res",
+         "output_path": "output_res"
+     }
+ }
@@ -0,0 +1,12 @@
+
+ infer_args:
+   model_name_or_path: /path/to/model_dir  # Note: replace with your model dir; absolute path is recommended
+   model_type: qwen2-7b-instruct  # Note: replace with your model type
+   data_path: /path/to/data/toolbench_static  # absolute path is recommended
+   deploy_type: swift
+   max_new_tokens: 2048
+   num_infer_samples: null
+   output_dir: output_res
+ eval_args:
+   input_path: output_res
+   output_path: output_res
@@ -0,0 +1,2 @@
+ ms-swift>=2.1.0
+ rouge
evalscope/utils/logger.py CHANGED
@@ -1,19 +1,19 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
-
  import importlib.util as iutil
  import logging
  from typing import Optional

  init_loggers = {}
- format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
- formatter = logging.Formatter(format)

- logging.basicConfig(format=format, level=logging.INFO)
+ detailed_format = '%(asctime)s - %(name)s - %(filename)s - %(funcName)s - %(lineno)d - %(levelname)s - %(message)s'
+ simple_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+
+ detailed_formatter = logging.Formatter(detailed_format)
+ simple_formatter = logging.Formatter(simple_format)
+
+ logging.basicConfig(format=simple_format, level=logging.INFO)


- def get_logger(
-     log_file: Optional[str] = None, log_level: int = logging.INFO, file_mode: str = "w"
- ):
+ def get_logger(log_file: Optional[str] = None, log_level: int = logging.INFO, file_mode: str = 'w'):
      """Get logging logger

      Args:
@@ -24,25 +24,23 @@ def get_logger(
          specified (if filemode is unspecified, it defaults to 'w').
      """

-     logger_name = __name__.split(".")[0]
+     logger_name = __name__.split('.')[0]
      logger = logging.getLogger(logger_name)
      logger.propagate = False
+
      if logger_name in init_loggers:
-         add_file_handler_if_needed(logger, log_file, file_mode, log_level)
          if logger.level != log_level:
              logger.setLevel(log_level)
+         add_file_handler_if_needed(logger, log_file, file_mode, log_level)
+         for handler in logger.handlers:
+             handler.setLevel(log_level)
+             handler.setFormatter(detailed_formatter if log_level == logging.DEBUG else simple_formatter)
          return logger

      # handle duplicate logs to the console
-     # Starting in 1.8.0, PyTorch DDP attaches a StreamHandler <stderr> (NOTSET)
-     # to the root logger. As logger.propagate is True by default, this root
-     # level handler causes logging messages from rank>0 processes to
-     # unexpectedly show up on the console, creating much unwanted clutter.
-     # To fix this issue, we set the root logger's StreamHandler, if any, to log
-     # at the ERROR level.
      torch_dist = False
      is_worker0 = True
-     if iutil.find_spec("torch") is not None:
+     if iutil.find_spec('torch') is not None:
          from modelscope.utils.torch_utils import is_dist, is_master

          torch_dist = is_dist()
@@ -61,7 +59,7 @@ def get_logger(
          handlers.append(file_handler)

      for handler in handlers:
-         handler.setFormatter(formatter)
+         handler.setFormatter(detailed_formatter if log_level == logging.DEBUG else simple_formatter)
          handler.setLevel(log_level)
          logger.addHandler(handler)

@@ -80,7 +78,7 @@ def add_file_handler_if_needed(logger, log_file, file_mode, log_level):
          if isinstance(handler, logging.FileHandler):
              return

-     if iutil.find_spec("torch") is not None:
+     if iutil.find_spec('torch') is not None:
          from modelscope.utils.torch_utils import is_master

          is_worker0 = is_master()
@@ -89,6 +87,6 @@ def add_file_handler_if_needed(logger, log_file, file_mode, log_level):

      if is_worker0 and log_file is not None:
          file_handler = logging.FileHandler(log_file, file_mode)
-         file_handler.setFormatter(formatter)
+         file_handler.setFormatter(detailed_formatter if log_level == logging.DEBUG else simple_formatter)
          file_handler.setLevel(log_level)
          logger.addHandler(file_handler)
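Per the change above, `get_logger` now switches every handler to the detailed formatter (filename, function name, line number) when the level is DEBUG, and to the simple formatter otherwise. A quick usage sketch based on the signature shown in the diff:

```python
import logging
from evalscope.utils.logger import get_logger

# DEBUG: handlers use the detailed formatter (filename, funcName, lineno).
logger = get_logger(log_file='eval.log', log_level=logging.DEBUG)
logger.debug('formatted with file, function and line info')

# INFO (the default): handlers use the simple asctime/name/levelname format.
logger = get_logger()
logger.info('formatted with the simple format')
```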