evalscope 0.8.0__py3-none-any.whl → 0.8.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic. Click here for more details.

Files changed (68) hide show
  1. evalscope/backend/base.py +1 -1
  2. evalscope/backend/rag_eval/utils/clip.py +2 -2
  3. evalscope/backend/rag_eval/utils/embedding.py +1 -1
  4. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  5. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +2 -1
  6. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +2 -1
  7. evalscope/benchmarks/humaneval/humaneval_adapter.py +193 -7
  8. evalscope/benchmarks/race/race_adapter.py +2 -1
  9. evalscope/config.py +38 -2
  10. evalscope/constants.py +24 -38
  11. evalscope/evaluator/__init__.py +0 -1
  12. evalscope/evaluator/evaluator.py +6 -4
  13. evalscope/evaluator/rating_eval.py +1 -1
  14. evalscope/evaluator/reviewer/auto_reviewer.py +2 -1
  15. evalscope/models/model_adapter.py +1 -1
  16. evalscope/perf/arguments.py +3 -1
  17. evalscope/perf/benchmark.py +3 -3
  18. evalscope/perf/main.py +5 -6
  19. evalscope/perf/plugin/api/openai_api.py +53 -49
  20. evalscope/perf/plugin/registry.py +3 -3
  21. evalscope/perf/utils/benchmark_util.py +4 -4
  22. evalscope/perf/utils/db_util.py +66 -22
  23. evalscope/perf/utils/local_server.py +4 -1
  24. evalscope/run.py +45 -82
  25. evalscope/run_arena.py +2 -1
  26. evalscope/summarizer.py +14 -26
  27. evalscope/third_party/longbench_write/eval.py +2 -1
  28. evalscope/third_party/longbench_write/longbench_write.py +2 -1
  29. evalscope/third_party/longbench_write/tools/data_etl.py +1 -1
  30. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  31. evalscope/tools/combine_reports.py +2 -4
  32. evalscope/tools/rewrite_eval_results.py +1 -1
  33. evalscope/utils/__init__.py +1 -0
  34. evalscope/utils/chat_service.py +1 -1
  35. evalscope/utils/io_utils.py +162 -0
  36. evalscope/utils/logger.py +8 -0
  37. evalscope/utils/utils.py +0 -175
  38. evalscope/version.py +2 -2
  39. {evalscope-0.8.0.dist-info → evalscope-0.8.2.dist-info}/METADATA +15 -3
  40. {evalscope-0.8.0.dist-info → evalscope-0.8.2.dist-info}/RECORD +47 -67
  41. tests/cli/test_run.py +11 -12
  42. tests/perf/test_perf.py +3 -2
  43. tests/vlm/test_vlmeval.py +3 -2
  44. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +0 -87
  45. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +0 -36
  46. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +0 -26
  47. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +0 -41
  48. evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +0 -7
  49. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +0 -60
  50. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +0 -36
  51. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +0 -24
  52. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +0 -35
  53. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
  54. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
  55. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
  56. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
  57. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +0 -34
  58. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +0 -36
  59. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +0 -25
  60. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -24
  61. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
  62. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +0 -16
  63. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +0 -24
  64. evalscope/evaluator/humaneval_evaluator.py +0 -158
  65. {evalscope-0.8.0.dist-info → evalscope-0.8.2.dist-info}/LICENSE +0 -0
  66. {evalscope-0.8.0.dist-info → evalscope-0.8.2.dist-info}/WHEEL +0 -0
  67. {evalscope-0.8.0.dist-info → evalscope-0.8.2.dist-info}/entry_points.txt +0 -0
  68. {evalscope-0.8.0.dist-info → evalscope-0.8.2.dist-info}/top_level.txt +0 -0
@@ -1,24 +0,0 @@
1
- {
2
- "ragas_version": "0.2.7",
3
- "original_hash": 7972318980248949928,
4
- "language": "chinese",
5
- "instruction": "从给定文本中提取最重要的max_num标题,这些标题可用于将文本分成独立的部分。重点关注第2级和第3级标题。",
6
- "examples": [
7
- {
8
- "input": {
9
- "text": " 介绍\n 主题概述...\n\n 主要概念\n 核心思想的解释...\n\n 详细分析\n 分析的技术和方法...\n\n 小节:专业技术\n 专业技术的进一步细节...\n\n 未来方向\n 对即将到来的趋势的见解...\n\n 小节:研究的下一步\n 新研究领域的讨论...\n\n 结论\n 最后的评论和总结。\n ",
10
- "max_num": 6
11
- },
12
- "output": {
13
- "headlines": [
14
- "介绍",
15
- "主要概念",
16
- "详细分析",
17
- "小节:专业技术",
18
- "未来方向",
19
- "结论"
20
- ]
21
- }
22
- }
23
- ]
24
- }
@@ -1,35 +0,0 @@
1
- {
2
- "ragas_version": "0.2.7",
3
- "original_hash": 5035835898922847346,
4
- "language": "chinese",
5
- "instruction": "通过将来自至少两个不同列表的概念配对来形成组合。\n**说明:**\n- 查看每个节点的概念。\n- 确定可以逻辑连接或对比的概念。\n- 形成涉及来自不同节点的概念的组合。\n- 每个组合应至少包括来自两个或多个节点的一个概念。\n- 清晰简洁地列出组合。\n- 不要重复相同的组合。",
6
- "examples": [
7
- {
8
- "input": {
9
- "lists_of_concepts": [
10
- [
11
- "人工智能",
12
- "自动化"
13
- ],
14
- [
15
- "医疗保健",
16
- "数据隐私"
17
- ]
18
- ],
19
- "max_combinations": 2
20
- },
21
- "output": {
22
- "combinations": [
23
- [
24
- "人工智能",
25
- "医疗保健"
26
- ],
27
- [
28
- "自动化",
29
- "数据隐私"
30
- ]
31
- ]
32
- }
33
- }
34
- ]
35
- }
@@ -1,30 +0,0 @@
1
- {
2
- "ragas_version": "0.2.7",
3
- "original_hash": 5691378570114822729,
4
- "language": "chinese",
5
- "instruction": "根据指定的条件(角色、主题、风格、长度)和提供的上下文生成一个多跳查询和答案。主题代表从上下文中提取或生成的一组短语,这些短语突出显示了所选上下文在创建多跳查询时的适用性。确保查询明确包含这些主题。### 指导:\n1. **生成多跳查询**:使用提供的上下文片段和主题形成一个需要结合多个片段信息的查询(例如,`<1-hop>` 和 `<2-hop>`)。确保查询明确包含一个或多个主题,并反映其与上下文的相关性。\n2. **生成答案**:仅使用提供的上下文中的内容来创建查询的详细和忠实的答案。避免添加不直接存在或无法从给定上下文推断的信息。\n3. **多跳上下文标签**:\n - 每个上下文片段标记为 `<1-hop>`、`<2-hop>` 等。\n - 确保查询使用至少两个片段的信息并有意义地连接它们。",
6
- "examples": [
7
- {
8
- "input": {
9
- "persona": {
10
- "name": "历史学家",
11
- "role_description": "专注于主要科学里程碑及其全球影响。"
12
- },
13
- "themes": [
14
- "相对论",
15
- "实验验证"
16
- ],
17
- "query_style": "正式",
18
- "query_length": "中等",
19
- "context": [
20
- "<1-hop> 阿尔伯特·爱因斯坦发展了相对论,引入了时空的概念。",
21
- "<2-hop> 在1919年的日食期间,光线被重力弯曲的现象得到了证实,支持了爱因斯坦的理论。"
22
- ]
23
- },
24
- "output": {
25
- "query": "在1919年日食期间,相对论的实验验证是如何实现的?",
26
- "answer": "在1919年日食期间,通过确认光线被重力弯曲,实现了相对论的实验验证,这支持了爱因斯坦在理论中提出的时空概念。"
27
- }
28
- }
29
- ]
30
- }
@@ -1,39 +0,0 @@
1
- {
2
- "ragas_version": "0.2.7",
3
- "original_hash": 4608101540215877909,
4
- "language": "chinese",
5
- "instruction": "给定一个主题和角色列表,根据角色描述将每个角色与相关主题关联起来。",
6
- "examples": [
7
- {
8
- "input": {
9
- "themes": [
10
- "同理心",
11
- "包容性",
12
- "远程工作"
13
- ],
14
- "personas": [
15
- {
16
- "name": "人力资源经理",
17
- "role_description": "专注于包容性和员工支持。"
18
- },
19
- {
20
- "name": "远程团队负责人",
21
- "role_description": "管理远程团队沟通。"
22
- }
23
- ]
24
- },
25
- "output": {
26
- "mapping": {
27
- "HR Manager": [
28
- "包容性",
29
- "同理心"
30
- ],
31
- "Remote Team Lead": [
32
- "远程工作",
33
- "同理心"
34
- ]
35
- }
36
- }
37
- }
38
- ]
39
- }
@@ -1,30 +0,0 @@
1
- {
2
- "ragas_version": "0.2.7",
3
- "original_hash": 5691378570114822729,
4
- "language": "chinese",
5
- "instruction": "根据指定的条件(角色、主题、风格、长度)和提供的上下文生成一个多跳查询和答案。主题代表从上下文中提取或生成的一组短语,这些短语突出显示了所选上下文适合多跳查询创建的适用性。确保查询明确包含这些主题。### 指导:\n1. **生成多跳查询**:使用提供的上下文片段和主题形成一个需要结合多个片段信息的查询(例如,`<1-hop>` 和 `<2-hop>`)。确保查询明确包含一个或多个主题,并反映其与上下文的相关性。\n2. **生成答案**:仅使用提供的上下文中的内容来创建对查询的详细和忠实的答案。避免添加不直接存在或无法从给定上下文推断的信息。\n3. **多跳上下文标签**:\n - 每个上下文片段标记为 `<1-hop>`、`<2-hop>` 等。\n - 确保查询使用至少两个片段的信息并有意义地连接它们。",
6
- "examples": [
7
- {
8
- "input": {
9
- "persona": {
10
- "name": "历史学家",
11
- "role_description": "专注于重大的科学里程碑及其全球影响。"
12
- },
13
- "themes": [
14
- "相对论",
15
- "实验验证"
16
- ],
17
- "query_style": "正式",
18
- "query_length": "中等",
19
- "context": [
20
- "<1-hop> 阿尔伯特·爱因斯坦发展了相对论,引入了时空的概念。",
21
- "<2-hop> 在1919年的日全食期间,光线被重力弯曲的现象得到了证实,支持了爱因斯坦的理论。"
22
- ]
23
- },
24
- "output": {
25
- "query": "在1919年的日全食期间,相对论的实验验证是如何实现的?",
26
- "answer": "在1919年的日全食期间,通过确认光线被重力弯曲的现象,实现了相对论的实验验证,这支持了爱因斯坦在理论中提出的时空概念。"
27
- }
28
- }
29
- ]
30
- }
@@ -1,39 +0,0 @@
1
- {
2
- "ragas_version": "0.2.7",
3
- "original_hash": 4608101540215877909,
4
- "language": "chinese",
5
- "instruction": "给定一个主题和角色列表,根据角色描述将每个角色与相关主题关联起来。",
6
- "examples": [
7
- {
8
- "input": {
9
- "themes": [
10
- "同理心",
11
- "包容性",
12
- "远程工作"
13
- ],
14
- "personas": [
15
- {
16
- "name": "人力资源经理",
17
- "role_description": "专注于包容性和员工支持。"
18
- },
19
- {
20
- "name": "远程团队负责人",
21
- "role_description": "管理远程团队沟通。"
22
- }
23
- ]
24
- },
25
- "output": {
26
- "mapping": {
27
- "HR Manager": [
28
- "包容性",
29
- "同理心"
30
- ],
31
- "Remote Team Lead": [
32
- "远程工作",
33
- "同理心"
34
- ]
35
- }
36
- }
37
- }
38
- ]
39
- }
@@ -1,34 +0,0 @@
1
- {
2
- "ragas_version": "0.2.7",
3
- "original_hash": -1903496084584659501,
4
- "language": "chinese",
5
- "instruction": "请说明给定的信息是否得到视觉和文本上下文信息的支持。您需要回答“是”或“否”。如果任何图像和文本上下文支持该信息,请回答“是”。",
6
- "examples": [
7
- {
8
- "input": {
9
- "response": "苹果派通常是双层皮的。",
10
- "retrieved_contexts": [
11
- "苹果派是一种水果派,其主要馅料成分是苹果。",
12
- "苹果派通常与奶油、冰淇淋(“苹果派 à la mode”)、蛋奶沙司或切达干酪一起食用。",
13
- "它通常是双层皮的,上下都有糕点;上层皮可以是实心的或格状的(交叉条纹编织而成)。"
14
- ]
15
- },
16
- "output": {
17
- "faithful": true
18
- }
19
- },
20
- {
21
- "input": {
22
- "response": "苹果派味道不好。",
23
- "retrieved_contexts": [
24
- "苹果派是一种水果派,其主要馅料成分是苹果。",
25
- "苹果派通常与奶油、冰淇淋(“苹果派 à la mode”)、蛋奶沙司或切达干酪一起食用。",
26
- "它通常是双层皮的,上下都有糕点;上层皮可以是实心的或格状的(交叉条纹编织而成)。"
27
- ]
28
- },
29
- "output": {
30
- "faithful": false
31
- }
32
- }
33
- ]
34
- }
@@ -1,36 +0,0 @@
1
- {
2
- "ragas_version": "0.2.7",
3
- "original_hash": -2067734205359291291,
4
- "language": "chinese",
5
- "instruction": "\n您的任务是评估查询的响应是否与提供的图像和文本上下文信息一致。\n您有两个选项可以回答。要么是 True / False。\n如果查询的响应与上下文信息一致,则回答 - True,否则为 False。\n",
6
- "examples": [
7
- {
8
- "input": {
9
- "user_input": "传统玛格丽塔披萨的主要成分是什么?",
10
- "response": "玛格丽塔披萨的主要成分是番茄、马苏里拉奶酪和新鲜罗勒。",
11
- "retrieved_contexts": [
12
- "传统的玛格丽塔披萨由薄饼皮组成。",
13
- "主要的配料包括番茄、马苏里拉奶酪、新鲜罗勒、盐和橄榄油。",
14
- "它是最简单和最经典的披萨类型之一。"
15
- ]
16
- },
17
- "output": {
18
- "relevance": true
19
- }
20
- },
21
- {
22
- "input": {
23
- "user_input": "谁在2021年获得了奥斯卡最佳男演员奖?",
24
- "response": "2021年的最佳男演员奖由莱昂纳多·迪卡普里奥获得。",
25
- "retrieved_contexts": [
26
- "第93届奥斯卡颁奖典礼于2021年举行。",
27
- "安东尼·霍普金斯凭借在《困在时间里的父亲》中的角色获得了最佳男演员奖。",
28
- "由于COVID-19的限制,这次活动具有独特性。"
29
- ]
30
- },
31
- "output": {
32
- "relevance": false
33
- }
34
- }
35
- ]
36
- }
@@ -1,25 +0,0 @@
1
- {
2
- "ragas_version": "0.2.7",
3
- "original_hash": -677862064343016555,
4
- "language": "chinese",
5
- "instruction": "从给定文本中提取命名实体,限制输出为顶级实体。确保实体数量不超过指定的最大值。",
6
- "examples": [
7
- {
8
- "input": {
9
- "text": "特斯拉和SpaceX的首席执行官埃隆·马斯克宣布计划将业务扩展到欧洲和亚洲的新地点。\n 此次扩展预计将创造数千个就业机会,特别是在柏林和上海等城市。",
10
- "max_num": 10
11
- },
12
- "output": {
13
- "entities": [
14
- "埃隆·马斯克",
15
- "特斯拉",
16
- "SpaceX",
17
- "欧洲",
18
- "亚洲",
19
- "柏林",
20
- "上海"
21
- ]
22
- }
23
- }
24
- ]
25
- }
@@ -1,24 +0,0 @@
1
- {
2
- "ragas_version": "0.2.7",
3
- "original_hash": 3079700511467088808,
4
- "language": "chinese",
5
- "instruction": "根据指定的条件(角色、术语、风格、长度)和提供的上下文生成一个单跳查询和答案。确保答案完全忠实于上下文,仅使用提供的上下文中的信息。### 指导:\n1. **生成查询**:根据上下文、角色、术语、风格和长度,创建一个与角色视角一致并包含术语的问题。\n2. **生成答案**:仅使用提供的上下文中的内容,构建对查询的详细答案。不要添加上下文中未包含或无法推断的信息。\n",
6
- "examples": [
7
- {
8
- "input": {
9
- "persona": {
10
- "name": "软件工程师",
11
- "role_description": "专注于编码最佳实践和系统设计。"
12
- },
13
- "term": "微服务",
14
- "query_style": "正式",
15
- "query_length": "中等",
16
- "context": "微服务是一种架构风格,其中应用程序被构建为一组松散耦合的服务。每个服务都是细粒度的,并专注于单一功能。"
17
- },
18
- "output": {
19
- "query": "微服务在软件架构中的目的是什么?",
20
- "answer": "微服务旨在将应用程序结构化为一组松散耦合的服务,每个服务专注于单一功能。"
21
- }
22
- }
23
- ]
24
- }
@@ -1,39 +0,0 @@
1
- {
2
- "ragas_version": "0.2.7",
3
- "original_hash": 4608101540215877909,
4
- "language": "chinese",
5
- "instruction": "给定一个主题和角色列表,根据角色描述将每个角色与相关主题关联起来。",
6
- "examples": [
7
- {
8
- "input": {
9
- "themes": [
10
- "同理心",
11
- "包容性",
12
- "远程工作"
13
- ],
14
- "personas": [
15
- {
16
- "name": "人力资源经理",
17
- "role_description": "专注于包容性和员工支持。"
18
- },
19
- {
20
- "name": "远程团队负责人",
21
- "role_description": "管理远程团队沟通。"
22
- }
23
- ]
24
- },
25
- "output": {
26
- "mapping": {
27
- "HR Manager": [
28
- "包容性",
29
- "同理心"
30
- ],
31
- "Remote Team Lead": [
32
- "远程工作",
33
- "同理心"
34
- ]
35
- }
36
- }
37
- }
38
- ]
39
- }
@@ -1,16 +0,0 @@
1
- {
2
- "ragas_version": "0.2.7",
3
- "original_hash": -2203889341293275650,
4
- "language": "chinese",
5
- "instruction": "将给定文本总结为不超过10个句子。",
6
- "examples": [
7
- {
8
- "input": {
9
- "text": "人工智能\n\n人工智能正在通过自动化以前需要人类智能的任务来改变各个行业。从医疗保健到金融,人工智能正在被用来快速准确地分析大量数据。这项技术还推动了自动驾驶汽车和个性化推荐等领域的创新。"
10
- },
11
- "output": {
12
- "text": "人工智能通过自动化任务、分析数据和推动自动驾驶汽车和个性化推荐等创新,正在革新各个行业。"
13
- }
14
- }
15
- ]
16
- }
@@ -1,24 +0,0 @@
1
- {
2
- "ragas_version": "0.2.7",
3
- "original_hash": -7344189172470926110,
4
- "language": "chinese",
5
- "instruction": "从给定的文本中提取主要主题和概念。",
6
- "examples": [
7
- {
8
- "input": {
9
- "text": "人工智能通过自动化需要人类智能的任务来改变行业。人工智能快速准确地分析大量数据,推动了自动驾驶汽车和个性化推荐等创新。",
10
- "max_num": 10
11
- },
12
- "output": {
13
- "output": [
14
- "人工智能",
15
- "自动化",
16
- "数据分析",
17
- "创新",
18
- "自动驾驶汽车",
19
- "个性化推荐"
20
- ]
21
- }
22
- }
23
- ]
24
- }
@@ -1,158 +0,0 @@
1
- import json
2
- import os
3
- import re
4
- from tqdm import tqdm
5
- from typing import List, Optional
6
-
7
- from evalscope.constants import OutputsStructure
8
- from evalscope.evaluator.evaluator import logger
9
- from evalscope.models.model_adapter import BaseModelAdapter
10
- from evalscope.tools.combine_reports import gen_table
11
- from evalscope.utils import normalize_score
12
-
13
-
14
- class HumanevalEvaluator(object):
15
-
16
- def __init__(
17
- self,
18
- problem_file: str,
19
- model_id: str,
20
- model_revision: str,
21
- model_adapter: BaseModelAdapter,
22
- outputs: Optional[OutputsStructure] = None,
23
- k: List[int] = [1, 10, 100],
24
- n_workers: int = 4,
25
- timeout: float = 3.0,
26
- ):
27
- try:
28
- from human_eval.data import read_problems, write_jsonl
29
- from human_eval.evaluation import evaluate_functional_correctness
30
- except ImportError:
31
- raise ImportError('Please install human_eval:'
32
- 'https://github.com/openai/human-eval/tree/master#installation , '
33
- 'Note that you need to enable the execution code in the human_eval/execution.py first.')
34
-
35
- self.problem_file = problem_file
36
- self.k = k
37
- self.num_workers = n_workers
38
- self.timeout = timeout
39
- self.model_adapter = model_adapter
40
-
41
- self.read_problems_func = read_problems
42
- self.write_jsonl_func = write_jsonl
43
- self.eval_func = evaluate_functional_correctness
44
-
45
- # {'task_id': {'task_id': '', 'prompt': '', 'entry_point': '', 'canonical_solution': '', 'test': ''}, ...}
46
- self.problems = self.read_problems_func(self.problem_file)
47
-
48
- # Deal with the output paths
49
- self.outputs_structure = OutputsStructure(outputs)
50
-
51
- def get_answers(self, infer_cfg: dict) -> List[dict]:
52
- ans_list: list = []
53
- system_prompt: str = 'Complete the following python code:\n'
54
- for task_id, data_d in tqdm(self.problems.items(), total=len(self.problems), desc='Predicting(problems)'):
55
- prompt: str = system_prompt + data_d['prompt']
56
- inputs: dict = {'data': [prompt]}
57
- # pred_res: dict = self.model_adapter.predict(inputs)
58
-
59
- pred_res: dict = self.model_adapter.predict(inputs=inputs, infer_cfg=infer_cfg)
60
-
61
- pred_ans: str = pred_res['choices'][0]['message']['content']
62
- pred_ans = self._postprocess(pred_ans)
63
-
64
- ans_list.append({'task_id': task_id, 'completion': pred_ans})
65
-
66
- return ans_list
67
-
68
- def eval(self, infer_cfg: dict, **kwargs):
69
-
70
- # predict
71
- ans_list: list = self.get_answers(infer_cfg)
72
- ans_out_file: str = os.path.join(self.outputs_structure.predictions_dir, 'human_eval_predictions.jsonl')
73
-
74
- self.write_jsonl_func(filename=ans_out_file, data=ans_list)
75
- # logger.info(f'** Dump predictions to {ans_out_file} successfully.')
76
- logger.info('** Dump predictions successfully.')
77
-
78
- # evaluate results: e.g. {'pass@1': 0.333, 'pass@10': 0.111}
79
- results = self.eval_func(
80
- sample_file=ans_out_file,
81
- k=self.k,
82
- n_workers=self.num_workers,
83
- timeout=self.timeout,
84
- problem_file=self.problem_file)
85
-
86
- # output: report
87
- report_map: dict = self.gen_report(results=results)
88
- report_dir: str = self.outputs_structure.reports_dir
89
- report_file: str = os.path.join(report_dir, 'human_eval_report.json')
90
-
91
- with open(report_file, 'w') as f:
92
- f.write(json.dumps(report_map, ensure_ascii=False, indent=4))
93
- # logger.info(f'** Dump report to {report_file} \n')
94
- logger.info('** Dump report \n')
95
-
96
- try:
97
- # Make table
98
- report_table: str = gen_table([report_dir])
99
- logger.info(f'** Report table: \n {report_table} \n')
100
- except Exception:
101
- logger.error('Failed to generate report table.')
102
-
103
- def gen_report(self, results: dict) -> dict:
104
- """
105
- Generate report from evaluation results.
106
-
107
- Returns:
108
- {
109
- "name":"ARC-Challenge",
110
- "metric":"WeightedAverageAccuracy",
111
- "score":0.3389,
112
- "category":[
113
- {
114
- "name":"DEFAULT",
115
- "score":0.3389,
116
- "subset":[
117
- {
118
- "name":"ARC-Challenge",
119
- "score":0.3389
120
- },
121
- ]
122
- }
123
- ],
124
- "total_num":100
125
- }
126
- """
127
- results = {k: normalize_score(score=v) for k, v in results.items()}
128
-
129
- category_d = dict(name='DEFAULT', score=results, subset=[])
130
-
131
- res_map = dict(
132
- name='HumanEval', metric='pass@k', score=results, category=[category_d], total_num=len(self.problems))
133
-
134
- return res_map
135
-
136
- @classmethod
137
- def _postprocess(cls, text: str) -> str:
138
- if '```' in text:
139
- blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
140
- if len(blocks) == 0:
141
- text = text.split('```')[1] # fall back to default strategy
142
- else:
143
- text = blocks[0] # fetch the first code block
144
- if not text.startswith('\n'): # in case starting with ```python
145
- text = text[max(text.find('\n') + 1, 0):]
146
- if text.strip().startswith('from') or text.strip().startswith('import'):
147
- def_idx = text.find('def')
148
- if def_idx != -1:
149
- text = text[max(text.find('\n', def_idx) + 1, 0):]
150
- text = text.split('\n\n')[0]
151
- if text.strip().startswith('def'):
152
- text = '\n'.join(text.split('\n')[1:])
153
- if not text.startswith(' '):
154
- if text.startswith(' '):
155
- text = ' ' + text.lstrip()
156
- else:
157
- text = '\n'.join([' ' + line for line in text.split('\n')])
158
- return text